def combine_counters(work_dir, n_map_shards, n_reduce_shards):
    """Merge the counter files written by every map/combine/reduce shard.

    Args:
        work_dir: directory containing the per-shard ``*.counters`` files.
        n_map_shards: number of map shards (combine shards match this count).
        n_reduce_shards: number of reduce shards.

    Returns:
        A single MRCounter summing every counter file that actually exists;
        shards that wrote no counter file are silently skipped.
    """
    # The original built these lists with Python-2-only tuple-unpacking
    # lambdas over zip([work_dir] * n, range(n)); plain comprehensions do
    # the same job and also keep `filenames +=` working on Python 3, where
    # map() returns an iterator rather than a list.
    filenames = [
        os.path.join(work_dir, 'map-%d.counters' % shard)
        for shard in range(n_map_shards)
    ]
    filenames += [
        os.path.join(work_dir, 'combine-%d.counters' % shard)
        for shard in range(n_map_shards)
    ]
    filenames += [
        os.path.join(work_dir, 'reduce-%d.counters' % shard)
        for shard in range(n_reduce_shards)
    ]
    return MRCounter.sum(
        imap(MRCounter.deserialize,
             read_files(filter(os.path.exists, filenames))))
def main():
    """Shuffle sorted mapper output into one sorted file per reducer.

    Reads the ``*_count.N`` sidecar files to learn the total number of
    input lines, merge-sorts all ``input_prefix.N`` files, and splits the
    merged stream into ``step.n_reducers`` output files, never letting a
    single key group straddle two output shards.

    Raises:
        RuntimeError: if no ``*_count.[0-9]*`` files match in work_dir.
    """
    args = parse_args()

    # Count exactly how many input lines we have so we can balance work.
    glob_pattern = path_join(args.work_dir,
                             args.input_prefix + '_count.[0-9]*')
    count_ff = glob(glob_pattern)
    if not count_ff:
        # Fixed message typo: "not input files" -> "no input files".
        raise RuntimeError("Step {} shuffler: no input files found matching "
                           "pattern {}".format(args.step_idx, glob_pattern))
    logger.info("Step {} shuffler: counting entries from {}"
                .format(args.step_idx, count_ff))
    num_entries = sum(imap(int, read_files(count_ff)))

    in_ff = sorted(glob(path_join(args.work_dir,
                                  args.input_prefix + '.[0-9]*')))
    sources = [open(f, 'r') for f in in_ff]

    step = get_step(args)
    n_output_files = step.n_reducers
    out_format = path_join(args.work_dir, args.output_prefix + '.%d')
    outputs = [open(out_format % i, 'w') for i in range(n_output_files)]

    # To cleanly separate reducer outputs by key groups we need to unpack
    # values on shuffling and compare keys. Every index change has to be
    # accompanied by a key change, otherwise index change is postponed.
    old_key = None
    old_index = 0
    lines_written = 0
    # try/finally so the open handles are closed even if a merge/parse
    # step raises mid-stream (the original leaked them on error).
    try:
        for count, line in enumerate(heapq.merge(*sources)):
            # Each line is a JSON array; element 0 is the sort key.
            key = json.loads(line)[0]
            # Floor division: `/` only floored under Python 2; on Python 3
            # it yields a float and would break outputs[old_index] below.
            index = count * n_output_files // num_entries
            if old_index != index and old_key != key:
                old_index = index
            outputs[old_index].write(line)
            lines_written += 1
            old_key = key
    finally:
        for source in sources:
            source.close()
        for output in outputs:
            output.close()

    logger.info("Step {} shuffler: lines written: {}"
                .format(args.step_idx, lines_written))
def run_shuffle(args):
    """Shuffle sorted (possibly gzipped) mapper output into reducer shards.

    Like the count-and-split shuffle: reads the ``.count`` sidecar files to
    learn the total line count, merge-sorts every matching input file, and
    splits the merged stream into ``args.n_reducers`` output files without
    letting one key group straddle two shards. Writes a ``shuffle-N.done``
    marker file per output shard when finished.

    Args:
        args: parsed options; uses work_dir, input_prefix (a ``%s`` format
            string), output_prefix (a ``%s`` format string), n_reducers,
            and step_idx.

    Raises:
        RuntimeError: if no count files match in args.work_dir.
    """
    # Count exactly how many input lines we have so we can balance work.
    glob_pattern = path_join(args.work_dir,
                             (args.input_prefix % '[0-9]*') + '.count')
    count_ff = glob(glob_pattern)
    if not count_ff:
        # Fixed message typo: "not input files" -> "no input files".
        raise RuntimeError("Step {} shuffler: no input files found matching "
                           "pattern {}".format(args.step_idx, glob_pattern))
    logger.info("Step {} shuffler: counting entries from {}"
                .format(args.step_idx, count_ff))
    num_entries = sum(imap(int, read_files(count_ff)))

    in_pattern = path_join(args.work_dir, args.input_prefix % '[0-9]*')
    in_pattern_re = re.compile(in_pattern)
    logger.info('Looking for files that match %s', in_pattern)
    # Since Python glob() does not do extended globs, filter out bad
    # matches with a regex built from the same pattern.
    in_ff = sorted([f for f in glob(in_pattern)
                    if in_pattern_re.match(f) is not None])
    logger.info('Found files: {}'.format(in_ff))
    sources = [open_gz(f, 'r') for f in in_ff]

    n_output_files = args.n_reducers
    out_format = path_join(args.work_dir, args.output_prefix % '%d')
    outputs = [open_gz(out_format % i, 'w') for i in range(n_output_files)]

    # To cleanly separate reducer outputs by key groups we need to unpack
    # values on shuffling and compare keys. Every index change has to be
    # accompanied by a key change, otherwise index change is postponed.
    old_key = None
    old_index = 0
    lines_written = 0
    # try/finally so the open handles are closed even if a merge/parse
    # step raises mid-stream (the original leaked them on error).
    try:
        for count, line in enumerate(heapq.merge(*sources)):
            # Each line is a JSON array; element 0 is the sort key.
            key = json.loads(line)[0]
            # Floor division: `/` only floored under Python 2; on Python 3
            # it yields a float and would break outputs[old_index] below.
            index = count * n_output_files // num_entries
            if old_index != index and old_key != key:
                old_index = index
            outputs[old_index].write(line)
            lines_written += 1
            old_key = key
    finally:
        for source in sources:
            source.close()
        for output in outputs:
            output.close()

    # Touch an empty done-marker per reducer shard so downstream steps can
    # tell the shuffle completed.
    done_pattern = path_join(args.work_dir, "shuffle-%d.done")
    done_names = [done_pattern % i for i in range(n_output_files)]
    for name in done_names:
        with open(name, 'w') as fhandle:
            fhandle.write('')

    logger.info("Step {} shuffler: lines written: {}"
                .format(args.step_idx, lines_written))