def connected_inputs(self):
    shuffled = list(self.inputs)
    random.shuffle(shuffled)
    inputs = [url for input in shuffled
              for url in util.urllist(input, partid=self.partid)]
    for input in inputs:
        yield self.connect_input(input)
def __init__(self, *args, **kwargs):
    super(JobDict, self).__init__(*args, **kwargs)

    # -- backwards compatibility --
    if 'reduce_writer' in kwargs or 'map_writer' in kwargs:
        warn("Writers are deprecated - use output_stream.add() instead",
             DeprecationWarning)

    # -- required modules and files --
    if self['required_modules'] is None:
        functions = util.flatten(util.iterify(self[f])
                                 for f in chain(self.functions, self.stacks))
        self['required_modules'] = find_modules([f for f in functions
                                                 if callable(f)])

    # -- external flags --
    if isinstance(self['map'], dict):
        self['ext_map'] = True
    if isinstance(self['reduce'], dict):
        self['ext_reduce'] = True

    # -- input --
    ddfs = self.pop('ddfs', None)
    self['input'] = [list(util.iterify(url))
                     for i in self['input']
                     for url in util.urllist(i, listdirs=bool(self['map']),
                                             ddfs=ddfs)]

    # partitions must be an integer internally
    self['partitions'] = self['partitions'] or 0

    # set nr_reduces: ignored if there is not actually a reduce specified
    if self['map']:
        # partitioned map has N reduces; non-partitioned map has 1 reduce
        self['nr_reduces'] = self['partitions'] or 1
    elif self.input_is_partitioned:
        # Only reduce, with partitions: len(dir://) specifies nr_reduces
        self['nr_reduces'] = 1 + max(id for dir in self['input']
                                     for id, url in util.read_index(dir[0]))
    else:
        # Only reduce, without partitions can only have 1 reduce
        self['nr_reduces'] = 1

    # merge_partitions iff the inputs to reduce are partitioned
    if self['merge_partitions']:
        if self['partitions'] or self.input_is_partitioned:
            self['nr_reduces'] = 1
        else:
            raise DiscoError("Can't merge partitions without partitions")

    # -- scheduler --
    scheduler = self.__class__.defaults['scheduler'].copy()
    scheduler.update(self['scheduler'])
    if int(scheduler['max_cores']) < 1:
        raise DiscoError("max_cores must be >= 1")
    self['scheduler'] = scheduler

    # -- sanity checks --
    for key in self:
        if key not in self.defaults:
            raise DiscoError("Unknown job argument: %s" % key)
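The nr_reduces branching above is the crux of this constructor. A minimal standalone sketch of the same rule follows; all names here are illustrative, not part of the Disco API:

def nr_reduces(has_map, partitions, input_is_partitioned, max_part_id=0):
    # Mirrors the branch logic in JobDict.__init__ above.
    if has_map:
        return partitions or 1    # partitioned map: N reduces, else 1
    if input_is_partitioned:
        return 1 + max_part_id    # reduce-only: dir:// index sets the count
    return 1                      # reduce-only, unpartitioned: single reduce

For example, a map job with partitions=8 runs 8 reduces, while a reduce-only job over unpartitioned input always runs one; the merge_partitions flag then collapses the count back to 1 when set.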
def result_iterator(results, notifier=None,
                    reader=func.chain_reader,
                    input_stream=(func.map_input_stream, ),
                    params=None, ddfs=None, tempdir=None):
    """
    Iterates the key-value pairs in job results. *results* is a list of
    results, as returned by :meth:`Disco.wait`.

    :param notifier: a function called when the iterator moves to the
                     next result file::

                        def notifier(url):
                            ...

                     *url* may be a list if results are replicated.

    :param reader: a custom reader function. Specify this to match with
                   a custom *map_writer* or *reduce_writer*. By default,
                   *reader* is :func:`disco.func.chain_reader`.

    :param tempdir: if results are replicated, *result_iterator* ensures
                    that only valid replicas are used. By default, this is
                    done by downloading and parsing results first to a
                    temporary file. If the temporary file was created
                    successfully, the results are returned; otherwise an
                    alternative replica is used.

                    If *tempdir=None* (default), the system default
                    temporary directory is used (typically ``/tmp``). An
                    alternative path can be set with *tempdir="path"*.
                    Temporary files can be disabled with *tempdir=False*,
                    in which case results are read in memory.
    """
    from disco.task import Task
    task = Task()
    task.params = params
    task.input_stream = list(input_stream)
    if reader:
        task.input_stream.append(func.reader_wrapper(reader))
    task.insert_globals(task.input_stream)
    for result in results:
        for url in util.urllist(result, ddfs=ddfs):
            if notifier:
                notifier(url)
            if isinstance(url, list):
                entries = process_url_safe(url, tempdir, task)
            else:
                entries, size, url = task.connect_input(url)
            for entry in entries:
                yield entry
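A minimal usage sketch for this result_iterator, assuming a reachable Disco master at localhost:8989 and an already-finished job (both are placeholders):

from disco.core import Disco, result_iterator

def notifier(url):
    # url may be a list of replica urls when results are replicated
    print("moving to result file: %s" % url)

results = Disco("http://localhost:8989").wait("wordcount_job")  # hypothetical job name
for key, value in result_iterator(results, notifier=notifier):
    print("%s: %s" % (key, value))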
def result_iterator(results, notifier=None,
                    reader=func.netstr_reader,
                    input_stream=[func.map_input_stream],
                    params=None):
    task = Task(result_iterator=True)
    for fun in input_stream:
        fun.func_globals.setdefault("Task", task)
    res = [url for r in results for url in util.urllist(r)]
    for url in res:
        fd = sze = None
        for fun in input_stream:
            fd, sze, url = fun(fd, sze, url, params)
        if notifier:
            notifier(url)
        for x in reader(fd, sze, url):
            yield x
def __init__(self, input_files, do_sort, mem_sort_limit, params):
    self.inputs = [url for input in input_files
                   for url in util.urllist(input, partid=Task.id)]
    random.shuffle(self.inputs)
    self.line_count = 0
    if do_sort:
        total_size = 0
        for input in self.inputs:
            fd, sze, url = connect_input(input, params)
            total_size += sze
        msg("Reduce[%d] input is %.2fMB" %
            (Task.id, total_size / 1024.0**2))
        if total_size > mem_sort_limit:
            self.iterator = self.download_and_sort(params)
        else:
            msg("Sorting in memory")
            m = list(self.multi_file_iterator(self.inputs, False))
            m.sort(num_cmp)
            self.iterator = self.list_iterator(m)
    else:
        self.iterator = self.multi_file_iterator(self.inputs, params)
def __init__(self, task):
    self.task = task
    self.inputs = [url for input in task.inputs
                   for url in util.urllist(input, partid=self.partid,
                                           numpartitions=task.jobdict['nr_reduces'])]
    random.shuffle(self.inputs)
def __iter__(self):
    for urls in self.urls:
        for replicas in util.urllist(urls, ddfs=self.ddfs):
            self.notifier(replicas)
            for entry in self.try_replicas(list(util.iterify(replicas))):
                yield entry
def __iter__(self):
    for result in self.results:
        for urls in util.urllist(result, ddfs=self.ddfs):
            self.notifier(urls)
            for entry in self.try_replicas(list(util.iterify(urls))):
                yield entry
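Both __iter__ variants above delegate replica fallback to try_replicas. A naive sketch of the behavior that method implies is below; read_entries is a hypothetical helper, not part of Disco. Note that a version like this can yield partial data before failing mid-stream, which is why the result_iterator docstring earlier in this listing describes validating each replica through a temporary file first:

def try_replicas(urls):
    # Try each replica url in turn and yield entries from the first one
    # that can be read end-to-end; fall through to the next url on failure.
    # read_entries is a hypothetical reader helper, not part of Disco.
    last_error = None
    for url in urls:
        try:
            for entry in read_entries(url):
                yield entry
            return
        except IOError as e:
            last_error = e
    raise IOError("no valid replica among %s (%s)" % (urls, last_error))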
def connected_inputs(self):
    inputs = [url for input in self.inputs
              for url in util.urllist(input, partid=self.partid)]
    random.shuffle(inputs)
    for input in inputs:
        yield self.connect_input(input)
def inputlist(input):
    if hasattr(input, "__iter__"):
        return ["\n".join(reversed(list(input)))]
    return util.urllist(input)
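The helper above special-cases in-memory iterables: they are joined, reversed, into a single newline-separated pseudo-input, while anything else is resolved through util.urllist. A quick illustration of the iterable branch:

>>> inputlist(["line1", "line2", "line3"])
['line3\nline2\nline1']
>>> # a plain url string has no __iter__ in Python 2, so it falls
>>> # through to util.urllist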
def _run(self, **kwargs):
    jobargs = util.DefaultDict(self.defaults.__getitem__, kwargs)

    # -- check parameters --

    # Backwards compatibility
    # (fun_map == map, input_files == input)
    if "fun_map" in kwargs:
        kwargs["map"] = kwargs["fun_map"]
    if "input_files" in kwargs:
        kwargs["input"] = kwargs["input_files"]
    if "chunked" in kwargs:
        raise DeprecationWarning("Argument 'chunked' is deprecated")
    if "nr_maps" in kwargs:
        sys.stderr.write("Warning: nr_maps is deprecated. "
                         "Use scheduler = {'max_cores': N} instead.\n")
        sched = jobargs["scheduler"].copy()
        if "max_cores" not in sched:
            sched["max_cores"] = int(jobargs["nr_maps"])
        jobargs["scheduler"] = sched

    if "input" not in kwargs:
        raise DiscoError("Argument input is required")
    if not ("map" in kwargs or "reduce" in kwargs):
        raise DiscoError("Specify map and/or reduce")

    for p in kwargs:
        if p not in Job.defaults:
            raise DiscoError("Unknown argument: %s" % p)

    input = kwargs["input"]

    # -- initialize request --
    request = {"prefix": self.name,
               "version": ".".join(map(str, sys.version_info[:2])),
               "params": cPickle.dumps(jobargs["params"],
                                       cPickle.HIGHEST_PROTOCOL),
               "sort": str(int(jobargs["sort"])),
               "mem_sort_limit": str(jobargs["mem_sort_limit"]),
               "status_interval": str(jobargs["status_interval"]),
               "profile": str(int(jobargs["profile"]))}

    # -- required modules --
    if "required_modules" in kwargs:
        rm = kwargs["required_modules"]
    else:
        functions = util.flatten(util.iterify(jobargs[f])
                                 for f in self.mapreduce_functions)
        rm = modutil.find_modules([f for f in functions if callable(f)])

    send_mod = []
    imp_mod = []
    for mod in rm:
        if isinstance(mod, tuple):
            send_mod.append(mod[1])
            mod = mod[0]
        imp_mod.append(mod)

    request["required_modules"] = " ".join(imp_mod)
    rf = util.pack_files(send_mod)

    # -- input & output streams --
    for stream in ["map_input_stream", "map_output_stream",
                   "reduce_input_stream", "reduce_output_stream"]:
        self.pack_stack(kwargs, request, stream)

    # -- required files --
    if "required_files" in kwargs:
        if isinstance(kwargs["required_files"], dict):
            rf.update(kwargs["required_files"])
        else:
            rf.update(util.pack_files(kwargs["required_files"]))
    if rf:
        request["required_files"] = util.pack(rf)

    # -- scheduler --
    sched = jobargs["scheduler"]
    sched_keys = ["max_cores", "force_local", "force_remote"]
    if "max_cores" not in sched:
        sched["max_cores"] = 2 ** 31
    elif sched["max_cores"] < 1:
        raise DiscoError("max_cores must be >= 1")
    for k in sched_keys:
        if k in sched:
            request["sched_" + k] = str(sched[k])

    # -- map --
    if "map" in kwargs:
        k = "ext_map" if isinstance(kwargs["map"], dict) else "map"
        request[k] = util.pack(kwargs["map"])

        for function_name in ("map_init", "map_reader", "map_writer",
                              "partition", "combiner"):
            function = jobargs[function_name]
            if function:
                request[function_name] = util.pack(function)

        def inputlist(input):
            if hasattr(input, "__iter__"):
                return ["\n".join(reversed(list(input)))]
            return util.urllist(input)

        input = [e for i in input for e in inputlist(i)]
    # -- only reduce --
    else:
        # XXX: Check for redundant inputs, external &
        # partitioned inputs
        input = [url for i in input for url in util.urllist(i)]

    request["input"] = " ".join(input)

    if "ext_params" in kwargs:
        e = kwargs["ext_params"]
        request["ext_params"] = encode_netstring_fd(e) if isinstance(e, dict) else e

    # -- reduce --
    nr_reduces = jobargs["nr_reduces"]
    if "reduce" in kwargs:
        k = "ext_reduce" if isinstance(kwargs["reduce"], dict) else "reduce"
        request[k] = util.pack(kwargs["reduce"])

        for function_name in ("reduce_reader", "reduce_writer", "reduce_init"):
            function = jobargs[function_name]
            if function:
                request[function_name] = util.pack(function)

    request["nr_reduces"] = str(nr_reduces)

    # -- encode and send the request --
    reply = self.master.request("/disco/job/new",
                                encode_netstring_fd(request))
    if not reply.startswith("job started:"):
        raise DiscoError("Failed to start a job. Server replied: " + reply)
    self.name = reply.split(":", 1)[1]
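For context, a minimal submission that exercises this request path, modeled on the classic Disco word-count tutorial (the master URL and input are placeholders):

from disco.core import Disco, result_iterator

def fun_map(e, params):
    # classic Disco tutorial map: emit (word, "1") for every word in a line
    return [(w, "1") for w in e.split()]

def fun_reduce(iter, out, params):
    # sum the counts per word and emit the totals
    stats = {}
    for word, count in iter:
        stats[word] = stats.get(word, 0) + int(count)
    for word, total in stats.items():
        out.add(word, total)

results = Disco("http://localhost:8989").new_job(
    name="wordcount",
    input=["http://example.com/text.txt"],
    map=fun_map,
    reduce=fun_reduce).wait()

for word, total in result_iterator(results):
    print("%s: %s" % (word, total))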