def _run(self):
    """Run the reduce phase for this task and publish where its output lives.

    Steps, in order:
      1. Wrap this task in a status-tracking iterator and connect the output.
      2. If ``self.ext_reduce`` is set, prepare the external reducer and
         replace ``self.reduce`` with a function built from
         ``external.ext_reduce``'s code object.
      3. Report the summed size of ``self.connected_inputs``.
      4. Call ``self.init`` and then ``self.reduce``.
      5. Close the output and the external interface.
      6. Announce the result through ``OutputURL``: either the value returned
         by ``util.ddfs_save`` (when ``self.save`` is true) or the URL of a
         freshly written index file.
    """
    entries = self.track_status(self, "%s entries reduced")
    red_out, out_url, fd_list = self.connect_output()
    params = self.params

    if self.ext_reduce:
        external.prepare(self.reduce, self.ext_params, self.path('ext.reduce'))
        # Rebuild the reducer so that it resolves its globals in the
        # `external` module's namespace.
        self.reduce = FunctionType(external.ext_reduce.func_code,
                                   globals=external.__dict__)
        self.insert_globals([self.reduce])

    total_size = sum(size for fd, size, url in self.connected_inputs)
    Status("Input is %s" % (util.format_size(total_size)))

    self.init(entries, params)
    if util.argcount(self.reduce) < 3:
        # Reducers taking fewer than three arguments are iterated for
        # (key, value) pairs, which are added to the output here.
        for k, v in self.reduce(entries, params):
            red_out.add(k, v)
    else:
        # Reducers taking three or more arguments receive the output object
        # directly; their return value is ignored.
        self.reduce(entries, red_out, params)

    self.close_output(fd_list)
    external.close_ext()

    if self.save:
        OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
        Status("Results pushed to DDFS")
    else:
        index, index_url = self.reduce_index
        f = file(index, 'w')
        try:
            # One "<task id> <output url>" line per index file.
            print >> f, '%d %s' % (self.id, out_url)
            sync(f)
        finally:
            # Release the handle even when the write or the sync fails.
            f.close()
        OutputURL(index_url)
def reduce(self, task, params):
    """Run the configured reduce function over the task's reduce input.

    The input from ``self.reduce_input`` is wrapped by ``self.status_iter``;
    the output object is the last entry of ``fds`` on the file opened through
    ``self.output``.  ``self['reduce_init']`` is called first.  A reduce
    function taking fewer than three arguments is iterated and every record
    it produces is unpacked into ``output.add``; otherwise the reduce function
    is handed the output object itself.
    """
    reduce_input = self.reduce_input(task, params)
    entries = self.status_iter(reduce_input, "%s entries reduced")

    opener = self.opener('reduce', 'out', params)
    task_output = self.output(task, None, open=opener)
    output = task_output.file.fds[-1]

    self['reduce_init'](entries, params)

    if util.argcount(self['reduce']) >= 3:
        # The reduce function receives the output object directly.
        self['reduce'](entries, output, params)
        return

    # The reduce function yields records; add each one to the output.
    for record in self['reduce'](entries, params):
        output.add(*record)
def __init__(self, url, streams, params, fd=None, size=None):
    """Thread ``fd``/``size``/``url`` through every callable in ``streams``.

    Each stream is called with ``(fd, size, url)``, plus ``params`` when
    ``util.argcount`` reports that it takes four arguments.  A stream may
    return a bare fd, an ``(fd, size, url)`` triple, or an ``(fd, url)`` pair;
    the updated values are passed to the next stream.  The fd produced at
    every stage is appended to ``self.fds``.
    """
    self.fds = []
    for stream in streams:
        if util.argcount(stream) == 4:
            result = stream(fd, size, url, params)
        else:
            result = stream(fd, size, url)

        if not isinstance(result, tuple):
            fd = result
        elif len(result) == 3:
            fd, size, url = result
        else:
            fd, url = result

        self.fds.append(fd)
def __init__(self, url, streams, params, fd=None, size=None):
    """Build the list of fds produced by running ``streams`` in sequence.

    Every stream is invoked with the current ``(fd, size, url)``; ``params``
    is added as a fourth argument only when ``util.argcount`` says the stream
    takes four arguments.  The return value may be a bare fd, a three-item
    tuple ``(fd, size, url)`` or a two-item tuple ``(fd, url)``.  Whatever fd
    results from each stage is stored in ``self.fds`` in order.
    """
    self.fds = []
    for stream in streams:
        if util.argcount(stream) == 4:
            produced = stream(fd, size, url, params)
        else:
            produced = stream(fd, size, url)

        if not isinstance(produced, tuple):
            fd = produced
        elif len(produced) == 3:
            fd, size, url = produced
        else:
            fd, url = produced

        self.fds.append(fd)
def connect_input(self, url, fd=None, size=None):
    """Pass an input through ``self.input_stream`` and the optional reader.

    Each callable in ``self.input_stream`` is invoked with
    ``(fd, size, url, self.params)``.  A tuple result is taken as the new
    ``(fd, size, url)``; any other result is treated as the new fd, with
    ``size`` and ``url`` carried over unchanged.  When ``self.reader`` is
    truthy its result is normalized the same way and returned; otherwise the
    current ``(fd, size, url)`` is returned.
    """
    def as_triple(result, *fallback):
        # Tuples are returned untouched; a bare value is padded with fallback.
        if not isinstance(result, tuple):
            return (result,) + fallback
        return result

    for input_stream in self.input_stream:
        streamed = input_stream(fd, size, url, self.params)
        fd, size, url = as_triple(streamed, size, url)

    if not self.reader:
        return fd, size, url

    # backwards compatibility for readers
    if util.argcount(self.reader) == 3:
        read = self.reader(fd, size, url)
    else:
        read = self.reader(fd, size, url, self.params)
    return as_triple(read, size, url)