def _run(self):
    # Open the output stream, then wrap all inputs in a single ReduceReader.
    red_out, out_url, fd_list = self.connect_output()
    red_in = iter(ReduceReader(self))
    params = self.params

    if self.ext_reduce:
        # Prepare the external reduce program and rebind self.reduce to the
        # wrapper that drives it.
        path = self.path('EXT_REDUCE')
        external.prepare(self.reduce, self.ext_params, path)
        self.reduce = FunctionType(external.ext_reduce.func_code,
                                   globals=external.__dict__)
        self.insert_globals([self.reduce])

    Message("Starting reduce")
    self.init(red_in, params)
    self.reduce(red_in, red_out, params)
    Message("Reduce done")

    self.close_output(fd_list)
    external.close_ext()

    if self.save:
        # Push results straight to DDFS instead of publishing an index.
        OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
        Message("Results pushed to DDFS")
    else:
        index, index_url = self.reduce_index
        safe_update(index, {'%d %s' % (self.id, out_url): True})
        OutputURL(index_url)
def _run(self):
    entries = self.track_status(self, "%s entries reduced")
    red_out, out_url, fd_list = self.connect_output()
    params = self.params

    if self.ext_reduce:
        external.prepare(self.reduce, self.ext_params, self.path('ext.reduce'))
        self.reduce = FunctionType(external.ext_reduce.func_code,
                                   globals=external.__dict__)
        self.insert_globals([self.reduce])

    # connected_inputs yields (fd, size, url) tuples; report the total input size.
    total_size = sum(size for fd, size, url in self.connected_inputs)
    Message("Input is %s" % (util.format_size(total_size)))

    self.init(entries, params)
    self.reduce(entries, red_out, params)

    self.close_output(fd_list)
    external.close_ext()

    if self.save:
        OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
        Message("Results pushed to DDFS")
    else:
        index, index_url = self.reduce_index
        safe_update(index, ['%d %s' % (self.id, out_url)])
        OutputURL(index_url)
def op_map(job):
    msg("Received a new map job!")

    if len(Task.inputs) != 1:
        err("Map can only handle one input. Got: %s" % " ".join(Task.inputs))

    global fun_reader, fun_writer, fun_partition
    fun_reader = util.unpack(job['map_reader'], globals=globals())
    fun_writer = util.unpack(job['map_writer'], globals=globals())
    fun_partition = util.unpack(job['partition'], globals=globals())

    global fun_init
    if 'map_init' in job:
        fun_init = util.unpack(job['map_init'], globals=globals())

    global fun_map
    if 'ext_map' in job:
        if 'ext_params' in job:
            map_params = job['ext_params']
        else:
            map_params = "0\n"
        path = Task.path("EXT_MAP")
        external.prepare(job['ext_map'], map_params, path)
        fun_map = external.ext_map
    else:
        map_params = util.unpack(job['params'], globals=globals())
        fun_map = util.unpack(job['map'], globals=globals())

    global fun_combiner
    if 'combiner' in job:
        fun_combiner = util.unpack(job['combiner'], globals=globals())

    init_common(job)

    nr_part = max(1, Task.num_partitions)

    if 'combiner' in job:
        partitions = [MapOutput(i, map_params, fun_combiner)
                      for i in range(nr_part)]
    else:
        partitions = [MapOutput(i, map_params) for i in range(nr_part)]

    run_map(Task.inputs[0], partitions, map_params)
    external.close_ext()

    urls = {}
    for i, p in enumerate(partitions):
        p.close()
        urls["%d %s" % (i, p.url())] = True

    index, index_url = Task.map_index
    safe_update(index, urls)
    OutputURL(index_url)
def op_reduce(job):
    msg("Received a new reduce job!")

    do_sort = int(job['sort'])
    mem_sort_limit = int(job['mem_sort_limit'])

    global fun_init
    if 'reduce_init' in job:
        fun_init = util.unpack(job['reduce_init'], globals=globals())

    global fun_reader, fun_writer
    fun_reader = util.unpack(job['reduce_reader'], globals=globals())
    fun_writer = util.unpack(job['reduce_writer'], globals=globals())

    global fun_reduce
    if 'ext_reduce' in job:
        if "ext_params" in job:
            red_params = job['ext_params']
        else:
            red_params = "0\n"
        path = Task.path("EXT_MAP")
        external.prepare(job['ext_reduce'], red_params, path)
        fun_reduce = external.ext_reduce
    else:
        fun_reduce = util.unpack(job['reduce'], globals=globals())
        red_params = util.unpack(job['params'], globals=globals())

    init_common(job)

    red_in = ReduceReader(Task.inputs, do_sort,
                          mem_sort_limit, red_params).iter()
    red_out = ReduceOutput(red_params)

    msg("Starting reduce")
    fun_init(red_in, red_params)
    fun_reduce(red_in, red_out, red_params)
    msg("Reduce done")

    red_out.close()
    external.close_ext()

    index, index_url = Task.reduce_index
    safe_update(index, {"%d %s" % (Task.id, red_out.url()): True})
    OutputURL(index_url)
def _run(self): if len(self.inputs) != 1: TaskFailed("Map can only handle one input. Got: %s" % ' '.join(self.inputs)) if self.ext_map: external.prepare(self.map, self.ext_params, self.path('EXT_MAP')) self.map = FunctionType(external.ext_map.func_code, globals=external.__dict__) self.insert_globals([self.map]) partitions = [MapOutput(self, i) for i in xrange(self.num_partitions)] reader, sze, url = self.connect_input(self.inputs[0]) params = self.params self.init(reader, params) entries = (self.map(entry, params) for entry in reader) for kvs in self.track_status(entries, "%s entries mapped"): for k, v in kvs: p = self.partition(k, self.num_partitions, params) partitions[p].add(k, v) external.close_ext() urls = {} for i, partition in enumerate(partitions): partition.close() urls['%d %s' % (i, partition.url)] = True index, index_url = self.map_index safe_update(index, urls) if self.save and not self.reduce: if self.ispartitioned: TaskFailed("Storing partitioned outputs in DDFS is not yet supported") else: OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master)) Message("Results pushed to DDFS") else: OutputURL(index_url)
def _run(self): if len(self.inputs) != 1: TaskFailed("Map takes 1 input, got: %s" % ' '.join(self.inputs)) if self.save and not self.reduce and self.ispartitioned: TaskFailed("Storing partitioned outputs in DDFS is not yet supported") if self.ext_map: external.prepare(self.map, self.ext_params, self.path('ext.map')) self.map = FunctionType(external.ext_map.func_code, globals=external.__dict__) self.insert_globals([self.map]) entries = self.track_status(self, "%s entries mapped") params = self.params outputs = [MapOutput(self, i) for i in xrange(max(1, int(self.jobdict['partitions'])))] self.init(entries, params) for entry in entries: for k, v in self.map(entry, params): outputs[self.partition(k, len(outputs), params)].add(k, v) external.close_ext() index, index_url = self.map_index for output in outputs: output.close() safe_update(index, ['%d %s' % (i, output.url) for i, output in enumerate(outputs)]) if self.save and not self.reduce: OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master)) Message("Results pushed to DDFS") else: OutputURL(index_url)