예제 #1
0
def mapreduce(job_class):
    """Run every step of a job by launching one ``mrdomino.step`` command each.

    An instance of ``job_class`` is created and its ``_steps`` and
    ``_settings`` attributes drive the run:

    * one temporary working directory per step is created under
      ``_settings.tmp_dir``;
    * step 0 reads ``_settings.input_files``; step ``i + 1`` reads the
      ``reduce.out.<n>`` files expected in step ``i``'s working directory
      (one per reducer of step ``i``);
    * ``_settings.output_dir`` is created if needed and passed to every step.

    Steps are launched one after another through ``util.wait_cmd``
    (presumably blocking until the command finishes -- confirm in ``util``).

    :param job_class: class of the job to run; called with no arguments.
    """
    job = job_class()
    settings = job._settings
    steps = job._steps
    step_count = len(steps)

    # Make sure the temporary root exists, then carve one work dir per step.
    tmp_root = settings.tmp_dir
    _ensure_dir(tmp_root)
    tmp_dirs = [
        mkdtemp(dir=tmp_root, prefix="step%d." % i) for i in range(step_count)
    ]

    # input_file_lists[i] is the input of step i; the extra last entry lists
    # the reduce outputs of the final step.
    input_file_lists = [settings.input_files]
    for step, out_dir in zip(steps, tmp_dirs):
        # Format the shard name *before* joining so that a '%' character in
        # the directory path cannot be misread as a format directive.
        input_file_lists.append(
            [os.path.join(out_dir, "reduce.out.%d" % n)
             for n in range(step.n_reducers)]
        )

    logger.info("Input files: %s", input_file_lists)

    # Make sure the output directory exists.
    output_dir = settings.output_dir
    _ensure_dir(output_dir)

    for i in range(step_count):
        cmd_opts = [
            EXEC_SCRIPT,
            "mrdomino.step",
            "--step_idx",
            i,
            "--total_steps",
            step_count,
            "--input_files",
            " ".join(input_file_lists[i]),
            "--work_dir",
            tmp_dirs[i],
            "--output_dir",
            output_dir,
            "--job_module",
            sys.modules[job.__module__].__file__,
            "--job_class",
            job.__class__.__name__,
            "--use_domino",
            int(settings.use_domino),
            "--n_concurrent_machines",
            settings.n_concurrent_machines,
            "--n_shards_per_machine",
            settings.n_shards_per_machine,
        ]

        cmd = util.create_cmd(cmd_opts)
        logger.info("Starting step %d with command: %s", i, cmd)
        util.wait_cmd(cmd, logger, "Step %d" % i)
    logger.info("All done.")


def _ensure_dir(path):
    """Create directory ``path`` (with parents) unless it already exists."""
    try:
        os.makedirs(path)
    except OSError:
        # Attempt-then-check avoids the window between an existence test and
        # the creation.  Only swallow the error when the directory is really
        # there; anything else (permissions, a file in the way) propagates.
        if not os.path.isdir(path):
            raise
예제 #2
0
파일: step.py 프로젝트: knighton/mapreduce
def main():
    """Run one map/shuffle/reduce step as described by the parsed arguments.

    The sequence is:

    1. schedule ``step.n_mappers`` map shards via ``schedule_machines``;
    2. combine counters, then run the ``mrdomino.shuffle`` command that
       turns ``map.out`` files into ``reduce.in`` files;
    3. schedule ``step.n_reducers`` reduce shards;
    4. combine counters again and log them;
    5. if this is the last step (``step_idx == total_steps - 1``), join the
       per-shard ``reduce.out.<n>`` files into ``<output_dir>/reduce.out``.

    :raises RuntimeError: on the last step, if the internal protocol is
        ``JSONValueProtocol`` while the output protocol is ``JSONProtocol``.
    :raises ValueError: on the last step, for any other unsupported
        protocol combination.
    """
    args = parse_args()
    logger.info('Mapreduce step: %s', args)

    logger.info('%d input files.', len(args.input_files))

    work_dir = args.work_dir
    logger.info('Working directory: %s', work_dir)

    job = get_instance(args)
    step = job.get_step(args.step_idx)
    logger.info('Starting %d mappers.', step.n_mappers)

    # Map phase.  '--shards' is left as a literal '%s' placeholder in the
    # command (presumably filled in by schedule_machines -- confirm there).
    map_cmd = create_cmd([
        'mrdomino.map_one_machine',
        '--step_idx', args.step_idx,
        '--shards', '%s',
        '--input_files', ' '.join(args.input_files),
        '--job_module', args.job_module,
        '--job_class', args.job_class,
        '--work_dir', work_dir
    ])
    schedule_machines(
        args,
        command=map_cmd,
        done_file_pattern=os.path.join(work_dir, 'map.done.%d'),
        n_shards=step.n_mappers)

    # The returned counter is not used at this point; the call is kept
    # because it may have side effects in the work directory (not visible
    # from here).
    combine_counters(work_dir, step.n_mappers, step.n_reducers)

    # Shuffle mapper outputs to reducer inputs.
    shuffle_cmd = create_cmd([EXEC_SCRIPT, 'mrdomino.shuffle',
                              '--work_dir', work_dir,
                              '--input_prefix', 'map.out',
                              '--output_prefix', 'reduce.in',
                              '--job_module', args.job_module,
                              '--job_class', args.job_class,
                              '--step_idx', args.step_idx])
    wait_cmd(shuffle_cmd, logger, "Shuffling")

    # Reduce phase.
    logger.info('Starting %d reducers.', step.n_reducers)
    reduce_cmd = create_cmd(['mrdomino.reduce_one_machine',
                             '--step_idx', args.step_idx,
                             '--shards', '%s',
                             '--job_module', args.job_module,
                             '--job_class', args.job_class,
                             '--input_prefix', 'reduce.in',
                             '--work_dir', work_dir])
    schedule_machines(
        args,
        command=reduce_cmd,
        done_file_pattern=os.path.join(work_dir, 'reduce.done.%d'),
        n_shards=step.n_reducers)

    counter = combine_counters(
        work_dir, step.n_mappers, step.n_reducers)
    logger.info('Step %d counters:\n%s', args.step_idx, counter.show())

    if args.step_idx == args.total_steps - 1:
        logger.info('Joining reduce outputs')
        _join_reduce_outputs(job, work_dir, args.output_dir)

    # done.
    logger.info('Mapreduce step done.')


def _needs_tuple_unpacking(job):
    """Return True when output lines must be reduced from (key, value) to value.

    Compares ``job.INTERNAL_PROTOCOL`` and ``job.OUTPUT_PROTOCOL`` against
    ``protocol.JSONProtocol`` and ``protocol.JSONValueProtocol``; raises for
    combinations that cannot be produced.
    """
    internal = job.INTERNAL_PROTOCOL
    output = job.OUTPUT_PROTOCOL
    key_value = protocol.JSONProtocol
    value_only = protocol.JSONValueProtocol

    if internal == key_value and output == value_only:
        return True
    if internal == value_only and output == key_value:
        raise RuntimeError("if internal protocol is value-based, "
                           "output protocol must also be so")
    if internal == key_value and output == key_value:
        return False
    if internal == value_only and output == value_only:
        return False
    raise ValueError("unsupported output protocol: {}"
                     .format(job.OUTPUT_PROTOCOL))


def _sorted_shard_files(work_dir, prefix):
    """Return the ``<prefix>.<n>`` files in ``work_dir`` ordered by integer n."""
    candidates = glob(path_join(work_dir, prefix + '.[0-9]*'))
    # The glob also accepts names such as '<prefix>.1abc'; the regex keeps
    # only purely numeric suffixes.  The prefix is escaped so that its '.'
    # matches a literal dot rather than any character.
    shard_re = re.compile(r'.*\b' + re.escape(prefix) + r'\.(\d+)$')
    numbered = []
    for filename in candidates:
        match = shard_re.match(filename)
        if match is not None:
            numbered.append((int(match.group(1)), filename))
    # Sort numerically so that shard 10 comes after shard 9, not after 1.
    return [filename for _, filename in sorted(numbered)]


def _join_reduce_outputs(job, work_dir, output_dir):
    """Concatenate all reduce shard outputs into ``<output_dir>/reduce.out``."""
    unpack_tuple = _needs_tuple_unpacking(job)
    filenames = _sorted_shard_files(work_dir, 'reduce.out')
    out_f = path_join(output_dir, 'reduce.out')
    with open(out_f, 'w') as out_fh:
        for kv in read_lines(filenames):
            if unpack_tuple:
                # Each line is a JSON [key, value] pair; emit the value only.
                _, v = json.loads(kv)
                v = json.dumps(v) + "\n"
            else:
                v = kv
            out_fh.write(v)