def _run(self, **kwargs):
    jobargs = util.DefaultDict(self.defaults.__getitem__, kwargs)

    # -- check parameters --

    # Backwards compatibility
    # (fun_map == map, input_files == input)
    if "fun_map" in kwargs:
        kwargs["map"] = kwargs["fun_map"]
    if "input_files" in kwargs:
        kwargs["input"] = kwargs["input_files"]

    if "chunked" in kwargs:
        raise DeprecationWarning("Argument 'chunked' is deprecated")

    if "nr_maps" in kwargs:
        sys.stderr.write("Warning: nr_maps is deprecated. "
                         "Use scheduler = {'max_cores': N} instead.\n")
        sched = jobargs["scheduler"].copy()
        if "max_cores" not in sched:
            sched["max_cores"] = int(jobargs["nr_maps"])
        jobargs["scheduler"] = sched

    if "input" not in kwargs:
        raise DiscoError("Argument input is required")
    if not ("map" in kwargs or "reduce" in kwargs):
        raise DiscoError("Specify map and/or reduce")

    for p in kwargs:
        if p not in Job.defaults:
            raise DiscoError("Unknown argument: %s" % p)

    input = kwargs["input"]

    # -- initialize request --

    request = {
        "prefix": self.name,
        "version": ".".join(map(str, sys.version_info[:2])),
        "params": cPickle.dumps(jobargs["params"], cPickle.HIGHEST_PROTOCOL),
        "sort": str(int(jobargs["sort"])),
        "mem_sort_limit": str(jobargs["mem_sort_limit"]),
        "status_interval": str(jobargs["status_interval"]),
        "profile": str(int(jobargs["profile"])),
    }

    # -- required modules --

    if "required_modules" in kwargs:
        rm = kwargs["required_modules"]
    else:
        functions = util.flatten(util.iterify(jobargs[f])
                                 for f in self.mapreduce_functions)
        rm = modutil.find_modules([f for f in functions if callable(f)])

    send_mod = []
    imp_mod = []
    for mod in rm:
        if type(mod) == tuple:
            send_mod.append(mod[1])
            mod = mod[0]
        imp_mod.append(mod)

    request["required_modules"] = " ".join(imp_mod)
    rf = util.pack_files(send_mod)

    # -- input & output streams --

    for stream in ["map_input_stream", "map_output_stream",
                   "reduce_input_stream", "reduce_output_stream"]:
        self.pack_stack(kwargs, request, stream)

    # -- required files --

    if "required_files" in kwargs:
        if isinstance(kwargs["required_files"], dict):
            rf.update(kwargs["required_files"])
        else:
            rf.update(util.pack_files(kwargs["required_files"]))
    if rf:
        request["required_files"] = util.pack(rf)

    # -- scheduler --

    sched = jobargs["scheduler"]
    sched_keys = ["max_cores", "force_local", "force_remote"]

    if "max_cores" not in sched:
        sched["max_cores"] = 2 ** 31
    elif sched["max_cores"] < 1:
        raise DiscoError("max_cores must be >= 1")

    for k in sched_keys:
        if k in sched:
            request["sched_" + k] = str(sched[k])

    # -- map --

    if "map" in kwargs:
        k = "ext_map" if isinstance(kwargs["map"], dict) else "map"
        request[k] = util.pack(kwargs["map"])

        for function_name in ("map_init", "map_reader", "map_writer",
                              "partition", "combiner"):
            function = jobargs[function_name]
            if function:
                request[function_name] = util.pack(function)

        def inputlist(input):
            if hasattr(input, "__iter__"):
                return ["\n".join(reversed(list(input)))]
            return util.urllist(input)

        input = [e for i in input for e in inputlist(i)]

    # -- only reduce --
    else:
        # XXX: Check for redundant inputs, external &
        # partitioned inputs
        input = [url for i in input for url in util.urllist(i)]

    request["input"] = " ".join(input)

    if "ext_params" in kwargs:
        e = kwargs["ext_params"]
        request["ext_params"] = encode_netstring_fd(e) if isinstance(e, dict) else e

    # -- reduce --

    nr_reduces = jobargs["nr_reduces"]
    if "reduce" in kwargs:
        k = "ext_reduce" if isinstance(kwargs["reduce"], dict) else "reduce"
        request[k] = util.pack(kwargs["reduce"])

        for function_name in ("reduce_reader", "reduce_writer", "reduce_init"):
            function = jobargs[function_name]
            if function:
                request[function_name] = util.pack(function)

    request["nr_reduces"] = str(nr_reduces)

    # -- encode and send the request --

    reply = self.master.request("/disco/job/new", encode_netstring_fd(request))
    if not reply.startswith("job started:"):
        raise DiscoError("Failed to start a job. Server replied: " + reply)
    self.name = reply.split(":", 1)[1]
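# Usage sketch (hypothetical caller, not part of this module): with the
# scheduler handling above, a call such as
#
#     job._run(input=inputs, map=fun_map,
#              scheduler={"max_cores": 16, "force_local": True})
#
# would be expected to add "sched_max_cores" = "16" and
# "sched_force_local" = "True" to the request, while an unspecified
# max_cores defaults to 2 ** 31. The deprecation shim above likewise folds
# the legacy nr_maps argument into scheduler["max_cores"].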
def _run(self, **kw):
    d = lambda x: kw.get(x, Job.defaults[x])

    # -- check parameters --

    # Backwards compatibility
    # (fun_map == map, input_files == input)
    if "fun_map" in kw:
        kw["map"] = kw["fun_map"]
    if "input_files" in kw:
        kw["input"] = kw["input_files"]

    if "chunked" in kw:
        raise DiscoError("Argument 'chunked' is deprecated")

    if "input" not in kw:
        raise DiscoError("input is required")
    if not ("map" in kw or "reduce" in kw):
        raise DiscoError("Specify map and/or reduce")

    for p in kw:
        if p not in Job.defaults:
            raise DiscoError("Unknown argument: %s" % p)

    inputs = kw["input"]

    # -- initialize request --

    req = {"name": self.name,
           "version": ".".join(map(str, sys.version_info[:2])),
           "params": cPickle.dumps(d("params"), cPickle.HIGHEST_PROTOCOL),
           "sort": str(int(d("sort"))),
           "mem_sort_limit": str(d("mem_sort_limit")),
           "status_interval": str(d("status_interval")),
           "profile": str(int(d("profile")))}

    # -- required modules --

    if "required_modules" in kw:
        rm = kw["required_modules"]
    else:
        funlist = []
        for f in Job.funs:
            df = d(f)
            if type(df) == types.FunctionType:
                funlist.append(df)
            elif type(df) == list:
                funlist += df
        rm = modutil.find_modules(funlist)

    send_mod = []
    imp_mod = []
    for mod in rm:
        if type(mod) == tuple:
            send_mod.append(mod[1])
            mod = mod[0]
        imp_mod.append(mod)

    req["required_modules"] = " ".join(imp_mod)
    rf = util.pack_files(send_mod)

    # -- required files --

    if "required_files" in kw:
        if type(kw["required_files"]) == dict:
            rf.update(kw["required_files"])
        else:
            rf.update(util.pack_files(kw["required_files"]))
    if rf:
        req["required_files"] = marshal.dumps(rf)

    # -- map --

    if "map" in kw:
        if type(kw["map"]) == dict:
            req["ext_map"] = marshal.dumps(kw["map"])
        else:
            req["map"] = marshal.dumps(kw["map"].func_code)

        if "map_init" in kw:
            req["map_init"] = marshal.dumps(kw["map_init"].func_code)

        req["map_reader"] = marshal.dumps(d("map_reader").func_code)
        req["map_writer"] = marshal.dumps(d("map_writer").func_code)
        req["partition"] = marshal.dumps(d("partition").func_code)

        if "combiner" in kw:
            req["combiner"] = marshal.dumps(kw["combiner"].func_code)

        parsed_inputs = []
        for inp in inputs:
            if type(inp) == list:
                parsed_inputs.append("\n".join(reversed(inp)))
            elif inp.startswith("dir://"):
                parsed_inputs += util.parse_dir(inp)
            else:
                parsed_inputs.append(inp)
        inputs = parsed_inputs

        if "nr_maps" not in kw or kw["nr_maps"] > len(inputs):
            nr_maps = len(inputs)
        else:
            nr_maps = kw["nr_maps"]

    # -- only reduce --
    else:
        nr_maps = 0
        ext_inputs = []
        red_inputs = []
        for inp in inputs:
            if type(inp) == list:
                raise DiscoError("Reduce doesn't accept redundant inputs")
            elif inp.startswith("dir://"):
                if inp.endswith(".txt"):
                    ext_inputs.append(inp)
                else:
                    red_inputs.append(inp)
            else:
                ext_inputs.append(inp)

        if ext_inputs and red_inputs:
            raise DiscoError("Can't mix partitioned inputs with other inputs")
        elif red_inputs:
            q = lambda x: int(x.split(":")[-1]) + 1
            nr_red = q(red_inputs[0])
            for x in red_inputs:
                if q(x) != nr_red:
                    raise DiscoError("Number of partitions must "
                                     "match in all inputs")
            n = d("nr_reduces") or nr_red
            if n != nr_red:
                raise DiscoError("Specified nr_reduces = %d but "
                                 "number of partitions in the input "
                                 "is %d" % (n, nr_red))
            kw["nr_reduces"] = nr_red
            inputs = red_inputs
        elif d("nr_reduces") != 1:
            raise DiscoError("nr_reduces must be 1 when "
                             "using non-partitioned inputs "
                             "without the map phase")
        else:
            inputs = ext_inputs

    # Shuffling fixes a pathological case in the fifo scheduler:
    # if inputs for a node are consecutive, data locality will be
    # lost after K inputs where K is the number of cores.
    # Randomizing the order of inputs makes this pathological case
    # unlikely. This issue will be fixed in the new scheduler.
    random.shuffle(inputs)

    req["input"] = " ".join(inputs)
    req["nr_maps"] = str(nr_maps)

    if "ext_params" in kw:
        if type(kw["ext_params"]) == dict:
            req["ext_params"] = encode_netstring_fd(kw["ext_params"])
        else:
            req["ext_params"] = kw["ext_params"]

    # -- reduce --

    nr_reduces = d("nr_reduces")
    if "reduce" in kw:
        if type(kw["reduce"]) == dict:
            req["ext_reduce"] = marshal.dumps(kw["reduce"])
            req["reduce"] = ""
        else:
            req["reduce"] = marshal.dumps(kw["reduce"].func_code)

        nr_reduces = nr_reduces or min(max(nr_maps / 2, 1), 100)

        req["reduce_reader"] = marshal.dumps(d("reduce_reader").func_code)
        req["reduce_writer"] = marshal.dumps(d("reduce_writer").func_code)

        if "reduce_init" in kw:
            req["reduce_init"] = marshal.dumps(kw["reduce_init"].func_code)
    else:
        nr_reduces = nr_reduces or 0

    req["nr_reduces"] = str(nr_reduces)

    # -- encode and send the request --

    self.msg = encode_netstring_fd(req)
    reply = self.master.request("/disco/job/new", self.msg)
    if reply != "job started":
        raise DiscoError("Failed to start a job. Server replied: " + reply)
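# Sketch (illustrative URLs, not from the source): in the reduce-only branch
# above, partitioned "dir://" inputs are assumed to end with a partition
# index after the last colon; the partition count is derived as index + 1
# and must agree across all inputs before it is used as nr_reduces.
#
#     q = lambda x: int(x.split(":")[-1]) + 1
#     q("dir://node1/job/map-results:3")   # -> 4 partitions
#     q("dir://node2/job/map-results:3")   # -> 4, consistent, so nr_reduces = 4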