import json
import os
import re
import sys
from glob import glob
from os.path import join as path_join
from tempfile import mkdtemp

# NOTE: names such as `logger`, `util`, `EXEC_SCRIPT`, `protocol`,
# `parse_args`, `get_instance`, `create_cmd`, `wait_cmd`,
# `schedule_machines`, `combine_counters`, and `read_lines` are assumed to
# be provided by the surrounding mrdomino package.


def mapreduce(job_class):
    job = job_class()
    step_count = len(job._steps)

    # If the temporary directory root does not exist, create it, then make
    # one scratch directory per step.
    tmp_root = job._settings.tmp_dir
    if not os.path.exists(tmp_root):
        os.makedirs(tmp_root)
    tmp_dirs = [mkdtemp(dir=tmp_root, prefix="step%d." % i)
                for i in range(step_count)]

    # Each step's input is the previous step's reducer output; the first
    # step reads the job's configured input files.
    input_file_lists = [job._settings.input_files]
    for step, out_dir in zip(job._steps, tmp_dirs):
        n_reducers = step.n_reducers
        reduce_format = os.path.join(out_dir, "reduce.out.%d")
        ff = [reduce_format % n for n in range(n_reducers)]
        input_file_lists.append(ff)
    logger.info("Input files: {}".format(input_file_lists))

    # If the output directory root does not exist, create it.
    output_dir = job._settings.output_dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Run the steps sequentially, each as a subprocess.
    for i, step in enumerate(job._steps):
        cmd_opts = [
            EXEC_SCRIPT, "mrdomino.step",
            "--step_idx", i,
            "--total_steps", step_count,
            "--input_files", " ".join(input_file_lists[i]),
            "--work_dir", tmp_dirs[i],
            "--output_dir", output_dir,
            "--job_module", sys.modules[job.__module__].__file__,
            "--job_class", job.__class__.__name__,
            "--use_domino", int(job._settings.use_domino),
            "--n_concurrent_machines", job._settings.n_concurrent_machines,
            "--n_shards_per_machine", job._settings.n_shards_per_machine,
        ]
        cmd = util.create_cmd(cmd_opts)
        logger.info("Starting step %d with command: %s" % (i, cmd))
        util.wait_cmd(cmd, logger, "Step %d" % i)

    logger.info("All done.")
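# Illustrative usage sketch (not part of the original module): `mapreduce`
# takes a job *class*, instantiates it, and drives each configured step as a
# subprocess. `WordCountJob` below is a hypothetical user-defined job class
# exposing the `_steps` and `_settings` attributes read above.
#
#     from wordcount import WordCountJob  # hypothetical job class
#     mapreduce(WordCountJob)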
def main():
    args = parse_args()
    logger.info('Mapreduce step: %s', args)
    logger.info('%d input files.', len(args.input_files))

    work_dir = args.work_dir
    logger.info('Working directory: %s', work_dir)

    job = get_instance(args)
    step = job.get_step(args.step_idx)

    # Map phase: fan the input shards out over machines.
    logger.info('Starting %d mappers.', step.n_mappers)
    cmd_opts = [
        'mrdomino.map_one_machine',
        '--step_idx', args.step_idx,
        '--shards', '%s',
        '--input_files', ' '.join(args.input_files),
        '--job_module', args.job_module,
        '--job_class', args.job_class,
        '--work_dir', work_dir
    ]
    cmd = create_cmd(cmd_opts)
    schedule_machines(
        args,
        command=cmd,
        done_file_pattern=os.path.join(work_dir, 'map.done.%d'),
        n_shards=step.n_mappers)
    # Merge mapper-side counter shards (folded in again after reduce).
    combine_counters(work_dir, step.n_mappers, step.n_reducers)

    # Shuffle mapper outputs to reducer inputs.
    cmd = create_cmd([EXEC_SCRIPT, 'mrdomino.shuffle',
                      '--work_dir', work_dir,
                      '--input_prefix', 'map.out',
                      '--output_prefix', 'reduce.in',
                      '--job_module', args.job_module,
                      '--job_class', args.job_class,
                      '--step_idx', args.step_idx])
    wait_cmd(cmd, logger, "Shuffling")

    # Reduce phase.
    logger.info('Starting %d reducers.', step.n_reducers)
    cmd = create_cmd(['mrdomino.reduce_one_machine',
                      '--step_idx', args.step_idx,
                      '--shards', '%s',
                      '--job_module', args.job_module,
                      '--job_class', args.job_class,
                      '--input_prefix', 'reduce.in',
                      '--work_dir', work_dir])
    schedule_machines(
        args,
        command=cmd,
        done_file_pattern=os.path.join(work_dir, 'reduce.done.%d'),
        n_shards=step.n_reducers)
    counter = combine_counters(
        work_dir, step.n_mappers, step.n_reducers)
    logger.info(('Step %d counters:\n' % args.step_idx) + counter.show())

    # On the final step, join the sharded reducer outputs into one file.
    if args.step_idx == args.total_steps - 1:
        logger.info('Joining reduce outputs')

        # Decide whether each output line is a (key, value) JSON tuple that
        # must be unpacked down to its value, based on the protocol pair,
        # e.g. internal JSONProtocol line:    ["some-key", {"count": 3}]
        #      output JSONValueProtocol line: {"count": 3}
        if job.INTERNAL_PROTOCOL == protocol.JSONProtocol and \
                job.OUTPUT_PROTOCOL == protocol.JSONValueProtocol:
            unpack_tuple = True
        elif job.INTERNAL_PROTOCOL == protocol.JSONValueProtocol and \
                job.OUTPUT_PROTOCOL == protocol.JSONProtocol:
            raise RuntimeError("if internal protocol is value-based, "
                               "output protocol must also be so")
        elif job.INTERNAL_PROTOCOL == protocol.JSONProtocol and \
                job.OUTPUT_PROTOCOL == protocol.JSONProtocol:
            unpack_tuple = False
        elif job.INTERNAL_PROTOCOL == protocol.JSONValueProtocol and \
                job.OUTPUT_PROTOCOL == protocol.JSONValueProtocol:
            unpack_tuple = False
        else:
            raise ValueError("unsupported output protocol: {}"
                             .format(job.OUTPUT_PROTOCOL))

        # Make sure that files are concatenated in shard-number order.
        glob_prefix = 'reduce.out'
        filenames = glob(path_join(work_dir, glob_prefix + '.[0-9]*'))
        prefix_match = re.compile(
            r'.*\b' + re.escape(glob_prefix) + r'\.(\d+)$')
        presorted = []
        for filename in filenames:
            match = prefix_match.match(filename)
            if match is not None:
                presorted.append((int(match.group(1)), filename))
        filenames = [filename for _, filename in sorted(presorted)]

        out_f = path_join(args.output_dir, 'reduce.out')
        with open(out_f, 'w') as out_fh:
            for kv in read_lines(filenames):
                if unpack_tuple:
                    _, v = json.loads(kv)
                    v = json.dumps(v) + "\n"
                else:
                    v = kv
                out_fh.write(v)

    # Done.
    logger.info('Mapreduce step done.')
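# This module is executed as a script by the driver above (EXEC_SCRIPT
# "mrdomino.step"), so a standard entry-point guard is assumed; shown here
# for completeness.
if __name__ == '__main__':
    main()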