def result_iterator(results, notifier = None,
                    proxy = None, reader = func.netstr_reader):
    # expand dir:// urls into the individual result urls they list
    res = []
    for dir_url in results:
        if dir_url.startswith("dir://"):
            res += util.parse_dir(dir_url, proxy)
        else:
            res.append(dir_url)
    _, _, root = util.load_conf()  # only the data root is needed here
    for url in res:
        if url.startswith("file://"):
            fname = url[7:]
            fd = file(fname)
            sze = os.stat(fname).st_size
        elif url.startswith("disco://"):
            host, fname = url[8:].split("/", 1)
            url = util.proxy_url(proxy, fname, host)
            if util.resultfs_enabled:
                # results are visible on the local filesystem
                f = "%s/data/%s" % (root, fname)
                fd = file(f)
                sze = os.stat(f).st_size
            else:
                sze, fd = comm.open_remote(url)
        else:
            raise JobException("Invalid result url: %s" % url)
        if notifier:
            notifier(url)
        for x in reader(fd, sze, fname):
            yield x
def __init__(self, input_files, do_sort, mem_sort_limit):
    # expand dir:// urls into the individual input urls they list
    self.inputs = []
    for inp in input_files:
        if inp.startswith("dir://"):
            self.inputs += parse_dir(inp)
        else:
            self.inputs.append(inp)
    self.line_count = 0
    if do_sort:
        # probe every input once to find the total amount of data
        total_size = 0
        for inp in self.inputs:
            sze, fd = connect_input(inp)
            total_size += sze
        msg("Reduce[%d] input is %.2fMB" %
            (this_partition(), total_size / 1024.0**2))
        if total_size > mem_sort_limit:
            # too much data for memory: sort on disk
            self.iterator = self.download_and_sort()
        else:
            msg("Sorting in memory")
            m = list(self.multi_file_iterator(self.inputs, False))
            m.sort(num_cmp)
            self.iterator = self.list_iterator(m)
    else:
        self.iterator = self.multi_file_iterator(self.inputs)
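# The sort branch above boils down to one size check. A self-contained
# restatement, as a sketch only: the function name and string labels are
# invented here and are not part of the class above.
def choose_sort_strategy(total_size, mem_sort_limit):
    # spill to disk only when the combined input exceeds the in-memory limit
    if total_size > mem_sort_limit:
        return "external-sort"   # the download_and_sort() path
    return "in-memory"           # the list(...)/sort() path

assert choose_sort_strategy(10 * 1024**2, 256 * 1024**2) == "in-memory"
assert choose_sort_strategy(512 * 1024**2, 256 * 1024**2) == "external-sort"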
def deref(program, *files):
    """Usage: [file ...]

    Dereference the dir:// urls in file[s] or stdin
    and print them to stdout.
    """
    import fileinput
    from disco.util import parse_dir
    for line in fileinput.input(files):
        for url in parse_dir(line.strip()):
            print url
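# For context, a sketch of what "dereferencing" means here: a dir://
# address names a file that lists the actual result urls, one per
# partition, and parse_dir fetches and returns that list. The mock below
# only imitates this behaviour; the addresses are invented for
# illustration and do not point anywhere.
def fake_parse_dir(dir_url):
    # pretend the dir file on node1 listed two partition files
    return ["disco://node1/part-0", "disco://node1/part-1"]

for url in fake_parse_dir("dir://node1/job/results"):
    print url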
def result_iterator(results, notifier = None,
                    proxy = None, reader = func.netstr_reader):
    # resolve the proxy address, falling back to the environment
    if not proxy:
        proxy = os.environ.get("DISCO_PROXY", None)
    if proxy:
        if proxy.startswith("disco://"):
            proxy = "%s:%s" % (proxy[8:], util.MASTER_PORT)
        elif proxy.startswith("http://"):
            proxy = proxy[7:]
    # expand dir:// urls into the individual result urls they list
    res = []
    for dir_url in results:
        if dir_url.startswith("dir://"):
            res += util.parse_dir(dir_url, proxy)
        else:
            res.append(dir_url)
    for url in res:
        if url.startswith("file://"):
            fname = url[7:]
            fd = file(fname)
            sze = os.stat(fname).st_size
            http = None
        else:
            host, fname = url[8:].split("/", 1)
            if proxy:
                ext_host = proxy
                fname = "/disco/node/%s/%s" % (host, fname)
            else:
                ext_host = host + ":" + util.HTTP_PORT
            ext_file = "/" + fname
            http = httplib.HTTPConnection(ext_host)
            http.request("GET", ext_file, "")
            fd = http.getresponse()
            if fd.status != 200:
                raise Exception("HTTP error %d" % fd.status)
            sze = int(fd.getheader("content-length"))
        if notifier:
            notifier(url)
        for x in reader(fd, sze, fname):
            yield x
        if http:
            http.close()
        else:
            fd.close()
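# A minimal usage sketch for the iterator above, assuming a running
# Disco master and a finished job. The client calls (Disco, new_job,
# wait) follow the tutorial-era disco.core API; the master address, job
# name and input url are placeholders, and fun_map is the classic
# word-count map from the Disco tutorial.
from disco.core import Disco, result_iterator

def fun_map(e, params):
    return [(w, 1) for w in e.split()]

results = Disco("disco://localhost").new_job(
        name="wordcount",
        input=["http://example.com/data.txt"],
        map=fun_map).wait()

# with the default netstring reader each item is a (key, value) pair
for key, value in result_iterator(results):
    print key, value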
def __init__(self, *args, **kwargs):
    super(JobDict, self).__init__(*args, **kwargs)

    # -- backwards compatibility --
    if 'fun_map' in self and 'map' not in self:
        self['map'] = self.pop('fun_map')
    if 'input_files' in self and 'input' not in self:
        self['input'] = self.pop('input_files')
    if 'reduce_writer' in self or 'map_writer' in self:
        warn("Writers are deprecated - use output_stream.add() instead",
             DeprecationWarning)

    # -- required modules and files --
    if self['required_modules'] is None:
        functions = util.flatten(util.iterify(self[f])
                                 for f in chain(self.functions, self.stacks))
        self['required_modules'] = find_modules([f for f in functions
                                                 if callable(f)])

    # -- external flags --
    if isinstance(self['map'], dict):
        self['ext_map'] = True
    if isinstance(self['reduce'], dict):
        self['ext_reduce'] = True

    # -- input --
    ddfs = self.pop('ddfs', None)
    self['input'] = [list(util.iterify(url))
                     for i in self['input']
                     for url in util.urllist(i, listdirs=bool(self['map']),
                                             ddfs=ddfs)]

    # partitions must be an integer internally
    self['partitions'] = self['partitions'] or 0

    # set nr_reduces: ignored if there is not actually a reduce specified
    if self['map']:
        # partitioned map has N reduces; non-partitioned map has 1 reduce
        self['nr_reduces'] = self['partitions'] or 1
    elif self.input_is_partitioned:
        # only reduce, with partitions: len(dir://) specifies nr_reduces
        self['nr_reduces'] = len(util.parse_dir(self['input'][0][0]))
    else:
        # only reduce, without partitions can only have 1 reduce
        self['nr_reduces'] = 1

    # merge_partitions iff the inputs to reduce are partitioned
    if self['merge_partitions']:
        if self['partitions'] or self.input_is_partitioned:
            self['nr_reduces'] = 1
        else:
            raise DiscoError("Can't merge partitions without partitions")

    # -- scheduler --
    scheduler = self.__class__.defaults['scheduler'].copy()
    scheduler.update(self['scheduler'])
    if int(scheduler['max_cores']) < 1:
        raise DiscoError("max_cores must be >= 1")
    self['scheduler'] = scheduler

    # -- sanity checks --
    if not self['map'] and not self['reduce']:
        raise DiscoError("Must specify map and/or reduce")
    for key in self:
        if key not in self.defaults:
            raise DiscoError("Unknown job argument: %s" % key)
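# The nr_reduces rules above form a small decision table. A standalone
# restatement for clarity; the function name and arguments are
# illustrative only, not part of JobDict.
def infer_nr_reduces(has_map, partitions, input_is_partitioned,
                     partitions_in_input=None):
    if has_map:
        # partitioned map has N reduces; non-partitioned map has 1 reduce
        return partitions or 1
    if input_is_partitioned:
        # reduce-only with partitioned input: one reduce per partition
        return partitions_in_input
    return 1  # reduce-only, non-partitioned input

assert infer_nr_reduces(True, 8, False) == 8
assert infer_nr_reduces(True, 0, False) == 1
assert infer_nr_reduces(False, 0, True, partitions_in_input=16) == 16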
def _run(self, **kw):
    d = lambda x: kw.get(x, Job.defaults[x])
    # backwards compatibility (fun_map == map, input_files == input)
    if "fun_map" in kw:
        kw["map"] = kw["fun_map"]
    if "input_files" in kw:
        kw["input"] = kw["input_files"]
    if not ("map" in kw and "input" in kw):
        raise Exception("Arguments 'map' and 'input' are required")
    if len(kw["input"]) < 1:
        raise Exception("Must have at least one input file")
    # expand dir:// urls into the individual input urls they list
    inputs = []
    for inp in kw["input"]:
        if inp.startswith("dir://"):
            inputs += util.parse_dir(inp)
        else:
            inputs.append(inp)
    req = {"name": self.name,
           "input": " ".join(inputs),
           "version": ".".join(map(str, sys.version_info[:2])),
           "map_reader": marshal.dumps(d("map_reader").func_code),
           "partition": marshal.dumps(d("partition").func_code),
           "params": cPickle.dumps(d("params")),
           "sort": str(int(d("sort"))),
           "mem_sort_limit": str(d("mem_sort_limit"))}
    if type(kw["map"]) == dict:
        req["ext_map"] = marshal.dumps(kw["map"])
    else:
        req["map"] = marshal.dumps(kw["map"].func_code)
    if "ext_params" in kw:
        if type(kw["ext_params"]) == dict:
            req["ext_params"] = encode_netstring_fd(kw["ext_params"])
        else:
            req["ext_params"] = kw["ext_params"]
    # never start more map tasks than there are inputs
    if "nr_maps" not in kw or kw["nr_maps"] > len(inputs):
        nr_maps = len(inputs)
    else:
        nr_maps = kw["nr_maps"]
    req["nr_maps"] = str(nr_maps)
    nr_reduces = d("nr_reduces")
    if "reduce" in kw:
        if type(kw["reduce"]) == dict:
            req["ext_reduce"] = marshal.dumps(kw["reduce"])
            req["reduce"] = ""
        else:
            req["reduce"] = marshal.dumps(kw["reduce"].func_code)
        nr_reduces = nr_reduces or max(nr_maps / 2, 1)
        req["chunked"] = "True"
    else:
        nr_reduces = nr_reduces or 1
    req["nr_reduces"] = str(nr_reduces)
    if d("chunked") is not None:
        if d("chunked"):
            req["chunked"] = "True"
        elif "chunked" in req:
            del req["chunked"]
    if "combiner" in kw:
        req["combiner"] = marshal.dumps(kw["combiner"].func_code)
    self.msg = encode_netstring_fd(req)
    reply = self.master.request("/disco/job/new", self.msg)
    if reply != "job started":
        raise Exception("Failed to start a job. Server replied: " + reply)
def _run(self, **kw):
    d = lambda x: kw.get(x, Job.defaults[x])
    # Backwards compatibility
    # (fun_map == map, input_files == input)
    if "fun_map" in kw:
        kw["map"] = kw["fun_map"]
    if "input_files" in kw:
        kw["input"] = kw["input_files"]
    if "input" not in kw:
        raise Exception("input is required")
    if not ("map" in kw or "reduce" in kw):
        raise Exception("Specify map and/or reduce")
    for p in kw:
        if p not in Job.defaults:
            raise Exception("Unknown argument: %s" % p)
    inputs = kw["input"]
    req = {"name": self.name,
           "version": ".".join(map(str, sys.version_info[:2])),
           "params": cPickle.dumps(d("params")),
           "sort": str(int(d("sort"))),
           "mem_sort_limit": str(d("mem_sort_limit")),
           "status_interval": str(d("status_interval")),
           "required_modules": " ".join(d("required_modules")),
           "profile": str(int(d("profile")))}
    if "map" in kw:
        if type(kw["map"]) == dict:
            req["ext_map"] = marshal.dumps(kw["map"])
        else:
            req["map"] = marshal.dumps(kw["map"].func_code)
        if "nr_maps" not in kw or kw["nr_maps"] > len(inputs):
            nr_maps = len(inputs)
        else:
            nr_maps = kw["nr_maps"]
        if "map_init" in kw:
            req["map_init"] = marshal.dumps(kw["map_init"].func_code)
        req["map_reader"] = marshal.dumps(d("map_reader").func_code)
        req["map_writer"] = marshal.dumps(d("map_writer").func_code)
        req["partition"] = marshal.dumps(d("partition").func_code)
        parsed_inputs = []
        for inp in inputs:
            if inp.startswith("dir://"):
                parsed_inputs += util.parse_dir(inp)
            else:
                parsed_inputs.append(inp)
        inputs = parsed_inputs
    else:
        addr = [x for x in inputs if not x.startswith("dir://")]
        if d("nr_reduces") is None and not addr:
            raise Exception("nr_reduces must match to "
                            "the number of partitions in the "
                            "input data")
        if d("nr_reduces") != 1 and addr:
            raise Exception("nr_reduces must be 1 when "
                            "using external inputs without "
                            "the map phase")
        nr_maps = 0
    req["input"] = " ".join(inputs)
    req["nr_maps"] = str(nr_maps)
    if "ext_params" in kw:
        if type(kw["ext_params"]) == dict:
            req["ext_params"] = encode_netstring_fd(kw["ext_params"])
        else:
            req["ext_params"] = kw["ext_params"]
    nr_reduces = d("nr_reduces")
    if "reduce" in kw:
        if type(kw["reduce"]) == dict:
            req["ext_reduce"] = marshal.dumps(kw["reduce"])
            req["reduce"] = ""
        else:
            req["reduce"] = marshal.dumps(kw["reduce"].func_code)
        nr_reduces = nr_reduces or max(nr_maps / 2, 1)
        req["chunked"] = "True"
        req["reduce_reader"] = marshal.dumps(d("reduce_reader").func_code)
        req["reduce_writer"] = marshal.dumps(d("reduce_writer").func_code)
        if "reduce_init" in kw:
            req["reduce_init"] = marshal.dumps(kw["reduce_init"].func_code)
    else:
        nr_reduces = nr_reduces or 1
    req["nr_reduces"] = str(nr_reduces)
    if d("chunked") is not None:
        if d("chunked"):
            req["chunked"] = "True"
        elif "chunked" in req:
            del req["chunked"]
    if "combiner" in kw:
        req["combiner"] = marshal.dumps(kw["combiner"].func_code)
    self.msg = encode_netstring_fd(req)
    reply = self.master.request("/disco/job/new", self.msg)
    if reply != "job started":
        raise Exception("Failed to start a job. Server replied: " + reply)
def _run(self, **kw):
    d = lambda x: kw.get(x, Job.defaults[x])
    # -- check parameters --
    # Backwards compatibility
    # (fun_map == map, input_files == input)
    if "fun_map" in kw:
        kw["map"] = kw["fun_map"]
    if "input_files" in kw:
        kw["input"] = kw["input_files"]
    if "chunked" in kw:
        raise DiscoError("Argument 'chunked' is deprecated")
    if "input" not in kw:
        raise DiscoError("input is required")
    if not ("map" in kw or "reduce" in kw):
        raise DiscoError("Specify map and/or reduce")
    for p in kw:
        if p not in Job.defaults:
            raise DiscoError("Unknown argument: %s" % p)
    inputs = kw["input"]

    # -- initialize request --
    req = {"name": self.name,
           "version": ".".join(map(str, sys.version_info[:2])),
           "params": cPickle.dumps(d("params"), cPickle.HIGHEST_PROTOCOL),
           "sort": str(int(d("sort"))),
           "mem_sort_limit": str(d("mem_sort_limit")),
           "status_interval": str(d("status_interval")),
           "profile": str(int(d("profile")))}

    # -- required modules --
    if "required_modules" in kw:
        rm = kw["required_modules"]
    else:
        # collect every user-supplied function and find its dependencies
        funlist = []
        for f in Job.funs:
            df = d(f)
            if type(df) == types.FunctionType:
                funlist.append(df)
            elif type(df) == list:
                funlist += df
        rm = modutil.find_modules(funlist)
    send_mod = []
    imp_mod = []
    for mod in rm:
        if type(mod) == tuple:
            send_mod.append(mod[1])
            mod = mod[0]
        imp_mod.append(mod)
    req["required_modules"] = " ".join(imp_mod)
    rf = util.pack_files(send_mod)

    # -- required files --
    if "required_files" in kw:
        if type(kw["required_files"]) == dict:
            rf.update(kw["required_files"])
        else:
            rf.update(util.pack_files(kw["required_files"]))
    if rf:
        req["required_files"] = marshal.dumps(rf)

    # -- map --
    if "map" in kw:
        if type(kw["map"]) == dict:
            req["ext_map"] = marshal.dumps(kw["map"])
        else:
            req["map"] = marshal.dumps(kw["map"].func_code)
        if "map_init" in kw:
            req["map_init"] = marshal.dumps(kw["map_init"].func_code)
        req["map_reader"] = marshal.dumps(d("map_reader").func_code)
        req["map_writer"] = marshal.dumps(d("map_writer").func_code)
        req["partition"] = marshal.dumps(d("partition").func_code)
        if "combiner" in kw:
            req["combiner"] = marshal.dumps(kw["combiner"].func_code)
        # expand dir:// urls into the individual input urls they list
        parsed_inputs = []
        for inp in inputs:
            if type(inp) == list:
                parsed_inputs.append("\n".join(reversed(inp)))
            elif inp.startswith("dir://"):
                parsed_inputs += util.parse_dir(inp)
            else:
                parsed_inputs.append(inp)
        inputs = parsed_inputs
        if "nr_maps" not in kw or kw["nr_maps"] > len(inputs):
            nr_maps = len(inputs)
        else:
            nr_maps = kw["nr_maps"]
    # -- only reduce --
    else:
        nr_maps = 0
        ext_inputs = []
        red_inputs = []
        for inp in inputs:
            if type(inp) == list:
                raise DiscoError("Reduce doesn't accept redundant inputs")
            elif inp.startswith("dir://"):
                if inp.endswith(".txt"):
                    ext_inputs.append(inp)
                else:
                    red_inputs.append(inp)
            else:
                ext_inputs.append(inp)
        if ext_inputs and red_inputs:
            raise DiscoError("Can't mix partitioned "
                             "inputs with other inputs")
        elif red_inputs:
            # the partition count is the last :N suffix plus one
            q = lambda x: int(x.split(":")[-1]) + 1
            nr_red = q(red_inputs[0])
            for x in red_inputs:
                if q(x) != nr_red:
                    raise DiscoError("Number of partitions must "
                                     "match in all inputs")
            n = d("nr_reduces") or nr_red
            if n != nr_red:
                raise DiscoError("Specified nr_reduces = %d but "
                                 "number of partitions in the input "
                                 "is %d" % (n, nr_red))
            kw["nr_reduces"] = nr_red
            inputs = red_inputs
        elif d("nr_reduces") != 1:
            raise DiscoError("nr_reduces must be 1 when "
                             "using non-partitioned inputs "
                             "without the map phase")
        else:
            inputs = ext_inputs

    # Shuffle fixes a pathological case in the fifo scheduler:
    # if inputs for a node are consecutive, data locality will be
    # lost after K inputs, where K is the number of cores.
    # Randomizing the order of inputs makes this pathological case
    # unlikely. This issue will be fixed in the new scheduler.
    random.shuffle(inputs)

    req["input"] = " ".join(inputs)
    req["nr_maps"] = str(nr_maps)
    if "ext_params" in kw:
        if type(kw["ext_params"]) == dict:
            req["ext_params"] = encode_netstring_fd(kw["ext_params"])
        else:
            req["ext_params"] = kw["ext_params"]

    # -- reduce --
    nr_reduces = d("nr_reduces")
    if "reduce" in kw:
        if type(kw["reduce"]) == dict:
            req["ext_reduce"] = marshal.dumps(kw["reduce"])
            req["reduce"] = ""
        else:
            req["reduce"] = marshal.dumps(kw["reduce"].func_code)
        nr_reduces = nr_reduces or min(max(nr_maps / 2, 1), 100)
        req["reduce_reader"] = marshal.dumps(d("reduce_reader").func_code)
        req["reduce_writer"] = marshal.dumps(d("reduce_writer").func_code)
        if "reduce_init" in kw:
            req["reduce_init"] = marshal.dumps(kw["reduce_init"].func_code)
    else:
        nr_reduces = nr_reduces or 0
    req["nr_reduces"] = str(nr_reduces)

    # -- encode and send the request --
    self.msg = encode_netstring_fd(req)
    reply = self.master.request("/disco/job/new", self.msg)
    if reply != "job started":
        raise DiscoError("Failed to start a job. Server replied: " + reply)
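# The partition check above hinges on the :N suffix of partitioned
# inputs: the last colon-separated field is the highest partition index,
# so the partition count is that index plus one. The address below is
# invented for illustration.
q = lambda x: int(x.split(":")[-1]) + 1

assert q("dir://node1/job/map-results:7") == 8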