Example #1
import argparse
import logging
import os

logging.basicConfig(level=logging.INFO)

import pydoop
import pydoop.hadut as hadut
import pydoop.test_support as pts

CONF = {
    "mapreduce.job.maps": "2",
    "mapreduce.job.reduces": "2",
    # [TODO] replace student_id with your id, e.g. 2011-12345
    "mapreduce.job.name": "nsf_2016-19762",
}
HADOOP_CONF_DIR = pydoop.hadoop_conf()
PREFIX = os.getenv("PREFIX", pts.get_wd_prefix())


def update_conf(args):
    if args.D:
        for kv_pair in args.D:
            k, v = [s.strip() for s in kv_pair.split("=", 1)]  # split on the first "=" only, so values may contain "="
            CONF[k] = v


def make_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("pipes_exe",
                        metavar="PIPES_EXE",
                        help="python script to be run by pipes")
    parser.add_argument("local_input",
Example #2
File: run.py Project: crs4/pydoop
import os
import optparse

import pydoop.hadut as hadut
import pydoop.test_support as pts

DEFAULT_INPUT = "input"  # placeholder: the real default is not shown in this listing

MR_HOME_DIR = "mapreduce.admin.user.home.dir"
PIPES_JAVA_RR = "hadoop.pipes.java.recordreader"
PIPES_JAVA_RW = "hadoop.pipes.java.recordwriter"
# legacy "mapred.*" property names (Example #1 uses the newer "mapreduce.*" names)
MR_OUT_COMPRESS_TYPE = "mapred.output.compression.type"
MR_REDUCE_TASKS = "mapred.reduce.tasks"
MR_IN_CLASS = "mapred.input.format.class"
MR_OUT_CLASS = "mapred.output.format.class"
MRLIB = "org.apache.hadoop.mapred"

BASE_MR_OPTIONS = {
    PIPES_JAVA_RR: "true",  # let Java supply the record reader
    PIPES_JAVA_RW: "true",  # let Java supply the record writer
    MR_HOME_DIR: os.path.expanduser("~"),
}

PREFIX = os.getenv("PREFIX", pts.get_wd_prefix())


def make_parser():
    parser = optparse.OptionParser(usage="%prog [OPTIONS]")
    parser.add_option("-i", dest="input", metavar="STRING",
                      help="input dir/file ['%default']",
                      default=DEFAULT_INPUT)
    parser.add_option("-t", type="int", dest="threshold", metavar="INT",
                      help="min word occurrence [%default]", default=10)
    return parser


def run_wc(opt):
    runner = hadut.PipesRunner(prefix=PREFIX)
    options = BASE_MR_OPTIONS.copy()
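    # The listing is truncated here. A hedged sketch of a plausible
    # continuation (not the project's actual code): PipesRunner's set_input,
    # set_exe, run, collect_output and clean methods exist in pydoop.hadut,
    # and pts.add_sys_path comes from pydoop.test_support; the "wc_script.py"
    # path, the "wordcount.min.occurrence" property name and the reduce-task
    # count are assumptions for illustration.
    options[MR_REDUCE_TASKS] = "2"  # assumed value
    options["wordcount.min.occurrence"] = str(opt.threshold)  # hypothetical property name
    with open("wc_script.py") as f:
        pipes_code = pts.add_sys_path(f.read())  # make pydoop importable on cluster nodes
    runner.set_input(opt.input, put=True)  # copy the local input up to HDFS
    runner.set_exe(pipes_code)
    runner.run(properties=options)  # submit the pipes job and wait for completion
    output = runner.collect_output()  # concatenated contents of the part-* files
    runner.clean()  # remove the runner's HDFS working directory
    return output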