def op_reduce(job):
    global job_name
    job_inputs = this_inputs()
    msg("Received a new reduce job!")

    do_sort = int(job['sort'])
    mem_sort_limit = int(job['mem_sort_limit'])
    required_modules = job['required_modules'].split()

    if 'ext_reduce' in job:
        # External (non-Python) reduce: hand parameters to the stub and
        # graft its code onto the placeholder.
        if "ext_params" in job:
            red_params = job['ext_params']
        else:
            red_params = "0\n"
        external.prepare(job['ext_reduce'], red_params, EXT_REDUCE % job_name)
        fun_reduce.func_code = external.ext_reduce.func_code
    else:
        # Native reduce: unmarshal the user's code object and parameters.
        fun_reduce.func_code = marshal.loads(job['reduce'])
        red_params = cPickle.loads(job['params'])

    # Make each required module visible to the grafted code object.
    for m in required_modules:
        fun_reduce.func_globals.setdefault(m, __import__(m))

    red_in = ReduceReader(job_inputs, do_sort, mem_sort_limit)
    red_out = ReduceOutput()

    msg("Starting reduce")
    fun_reduce(red_in.iter(), red_out, red_params)
    msg("Reduce done")

    red_out.close()
    external.close_ext()
    msg("%d %s" % (this_partition(), red_out.disco_address()), "OUT")
def op_map(job):
    global job_name
    job_input = this_inputs()
    msg("Received a new map job!")

    if len(job_input) != 1:
        err("Map can only handle one input. Got: %s" % " ".join(job_input))

    nr_reduces = int(job['nr_reduces'])
    required_modules = job['required_modules'].split()

    # Graft the serialized reader, writer, and partition code objects
    # onto their placeholders.
    fun_map_reader.func_code = marshal.loads(job['map_reader'])
    fun_map_writer.func_code = marshal.loads(job['map_writer'])
    fun_partition.func_code = marshal.loads(job['partition'])

    for m in required_modules:
        fun_map_reader.func_globals.setdefault(m, __import__(m))
        fun_partition.func_globals.setdefault(m, __import__(m))

    if 'ext_map' in job:
        if 'ext_params' in job:
            map_params = job['ext_params']
        else:
            map_params = "0\n"
        external.prepare(job['ext_map'], map_params, EXT_MAP % job_name)
        fun_map.func_code = external.ext_map.func_code
    else:
        map_params = cPickle.loads(job['params'])
        fun_map.func_code = marshal.loads(job['map'])
        for m in required_modules:
            fun_map.func_globals.setdefault(m, __import__(m))

    if 'map_init' in job:
        fun_init.func_code = marshal.loads(job['map_init'])

    # One output per reduce partition; route writes through the combiner
    # when one is given.
    if 'combiner' in job:
        fun_combiner.func_code = marshal.loads(job['combiner'])
        for m in required_modules:
            fun_combiner.func_globals.setdefault(m, __import__(m))
        partitions = [MapOutput(i, map_params, fun_combiner)
                      for i in range(nr_reduces)]
    else:
        partitions = [MapOutput(i, map_params) for i in range(nr_reduces)]

    run_map(job_input[0], partitions, map_params)
    for p in partitions:
        p.close()

    if 'chunked' in job:
        merge_chunks(partitions)
        out = "chunk://%s/%s/map-chunk-%d" % \
              (this_host(), job_name, this_partition())
    else:
        out = partitions[0].disco_address()

    external.close_ext()
    msg("%d %s" % (this_partition(), out), "OUT")
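# Both versions above rely on the same Python 2 idiom: the master
# marshals the user's code objects and the worker grafts them onto
# placeholder functions by assigning func_code. A minimal, self-contained
# sketch of that round trip; _func_code_demo, user_fun, and placeholder
# are illustrative names, not part of the worker:
def _func_code_demo():
    import marshal

    def user_fun(x):             # what the master would serialize
        return x * 2

    def placeholder(*args):      # empty stub on the worker side
        pass

    payload = marshal.dumps(user_fun.func_code)      # wire format
    placeholder.func_code = marshal.loads(payload)   # graft the code object
    assert placeholder(21) == 42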
def op_reduce(job):
    job_inputs = this_inputs()
    msg("Received a new reduce job!")

    do_sort = int(job['sort'])
    mem_sort_limit = int(job['mem_sort_limit'])
    req_mod = job['required_modules'].split()

    if 'reduce_init' in job:
        fun_init.func_code = marshal.loads(job['reduce_init'])

    fun_reduce_reader.func_code = marshal.loads(job['reduce_reader'])
    fun_reduce_writer.func_code = marshal.loads(job['reduce_writer'])

    if 'required_files' in job:
        # Unpack user-supplied files and make them importable.
        write_files(marshal.loads(job['required_files']), REQ_FILES)
        sys.path.insert(0, REQ_FILES)

    import_modules(req_mod, [fun_reduce_reader, fun_reduce_writer,
                             fun_reduce, fun_init])

    if 'ext_reduce' in job:
        if "ext_params" in job:
            red_params = job['ext_params']
        else:
            red_params = "0\n"
        external.prepare(job['ext_reduce'], red_params, EXT_REDUCE)
        fun_reduce.func_code = external.ext_reduce.func_code
    else:
        fun_reduce.func_code = marshal.loads(job['reduce'])
        red_params = cPickle.loads(job['params'])

    red_in = ReduceReader(job_inputs, do_sort, mem_sort_limit).iter()
    red_out = ReduceOutput(red_params)

    msg("Starting reduce")
    # reduce_init sees the same input iterator before reduce proper.
    fun_init(red_in, red_params)
    fun_reduce(red_in, red_out, red_params)
    msg("Reduce done")

    red_out.close()
    external.close_ext()

    # Record this partition's output file in the shared reduce index.
    index = cStringIO.StringIO(os.path.basename(red_out.fname) + "\n")
    safe_append(index, REDUCE_INDEX)
    msg("dir://%s/%sreduce-index.txt" % (this_host(), JOB_HOME), "OUT")
def op_map(job):
    job_input = this_inputs()
    msg("Received a new map job!")

    if len(job_input) != 1:
        err("Map can only handle one input. Got: %s" % " ".join(job_input))

    nr_reduces = int(job['nr_reduces'])
    nr_part = max(1, nr_reduces)

    fun_map_reader.func_code = marshal.loads(job['map_reader'])
    fun_map_writer.func_code = marshal.loads(job['map_writer'])
    fun_partition.func_code = marshal.loads(job['partition'])

    if 'map_init' in job:
        fun_init.func_code = marshal.loads(job['map_init'])

    if 'required_files' in job:
        write_files(marshal.loads(job['required_files']), REQ_FILES)
        sys.path.insert(0, REQ_FILES)

    req_mod = job['required_modules'].split()
    import_modules(req_mod, [fun_map_reader, fun_map_writer, fun_partition,
                             fun_map, fun_combiner, fun_init])

    if 'ext_map' in job:
        if 'ext_params' in job:
            map_params = job['ext_params']
        else:
            map_params = "0\n"
        external.prepare(job['ext_map'], map_params, EXT_MAP)
        fun_map.func_code = external.ext_map.func_code
    else:
        map_params = cPickle.loads(job['params'])
        fun_map.func_code = marshal.loads(job['map'])

    if 'combiner' in job:
        fun_combiner.func_code = marshal.loads(job['combiner'])
        partitions = [MapOutput(i, map_params, fun_combiner)
                      for i in range(nr_part)]
    else:
        partitions = [MapOutput(i, map_params) for i in range(nr_part)]

    run_map(job_input[0], partitions, map_params)
    external.close_ext()
    for p in partitions:
        p.close()

    if nr_reduces:
        # Partitioned output: merge and announce a single indexed result.
        merge_partitions(partitions)
        n = os.path.basename(PART_OUTPUT % 0)
        msg("dir://%s/%s%s:%d" % (this_host(), JOB_HOME,
                                  n, len(partitions) - 1), "OUT")
    else:
        # Unpartitioned output: append each file to the shared map index.
        res = [os.path.basename(p.fname) for p in partitions]
        index = cStringIO.StringIO("\n".join(res) + "\n")
        safe_append(index, MAP_INDEX)
        msg("dir://%s/%smap-index.txt" % (this_host(), JOB_HOME), "OUT")
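# The newer op_reduce/op_map above delegate module injection to an
# import_modules helper whose definition is not part of this section.
# A minimal sketch, assuming it factors out the func_globals.setdefault
# loops written inline in the older versions; the real helper may differ:
def import_modules(modules, funs):
    for m in modules:
        mod = __import__(m)
        for fun in funs:
            # Make the module resolvable from each grafted code object.
            fun.func_globals.setdefault(m, mod)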