def run(self, task, job, **jobargs):
    """Entry point into the executing pipeline worker task.

    Resolves every entry of this worker through ``self.getitem``, checks
    that the recorded ``'version'`` matches the running interpreter,
    prepares the globals used by the job's objects, configures the stage
    named by ``task.stage`` and finally calls ``self.run_stage``.

    Raises:
        AssertionError: if ``self['version']`` differs from the running
            interpreter's ``major.minor`` version.
        KeyError: if ``task.stage`` names no stage in ``self['pipeline']``.
    """
    for setting in self:
        self[setting] = self.getitem(setting, job, jobargs)

    running_version = '{0[0]}.{0[1]}'.format(sys.version_info[:2])
    assert self['version'] == running_version, "Python version mismatch"

    # Set up the task environment: every required module is imported and
    # bound under its last dotted component, then each value held by this
    # worker is passed to util.globalize together with those globals.
    task_globals = globals().copy()
    for entry in self['required_modules']:
        if util.iskv(entry):
            modname = entry[0]
        else:
            modname = entry
        imported = __import__(modname, fromlist=[modname])
        task_globals[modname.split('.')[-1]] = imported
    for obj in util.flatten(self.values()):
        util.globalize(obj, task_globals)

    # Set up the stage: index the pipeline by stage name so the stage for
    # this task and its position can be found in one lookup.
    params = self.getitem('params', job, jobargs, worker.Params())
    stages_by_name = {}
    for position, (_group, candidate) in enumerate(self['pipeline']):
        stages_by_name[candidate.name] = (position, candidate)
    pipe_idx, stage = stages_by_name[task.stage]

    stage.taskinfo = TaskInfo(jobname=task.jobname,
                              host=task.host,
                              stage=task.stage,
                              group=task.group,
                              label=task.group_label)
    # Fall back to the default chains when the stage defines none.
    if not stage.input_chain:
        stage.input_chain = Stage.default_input_chain(pipe_idx)
    if not stage.output_chain:
        stage.output_chain = Stage.default_output_chain

    # And now run it.
    self.run_stage(task, stage, params)
def run(self, task, job, **jobargs):
    """Entry point into the executing pipeline worker task.

    Records the task as ``worker.active_task``, resolves every entry of
    this worker through ``self.getitem``, verifies the Python version,
    builds the task globals, configures the stage named by ``task.stage``
    and hands it to ``self.run_stage``.

    Raises:
        AssertionError: if ``self['version']`` differs from the running
            interpreter's ``major.minor`` version.
        KeyError: if ``task.stage`` names no stage in ``self['pipeline']``.
    """
    worker.active_task = task

    for option in self:
        self[option] = self.getitem(option, job, jobargs)

    interpreter_version = '{0[0]}.{0[1]}'.format(sys.version_info[:2])
    assert self['version'] == interpreter_version, "Python version mismatch"

    # Set up the task environment.
    env = globals().copy()
    for required in self['required_modules']:
        # A key/value entry carries the module name in its first slot.
        module_name = required[0] if util.iskv(required) else required
        loaded = __import__(module_name, fromlist=[module_name])
        env[module_name.split('.')[-1]] = loaded
    for item in util.flatten(self.values()):
        util.globalize(item, env)

    # Set up the stage.
    params = self.getitem('params', job, jobargs, worker.Params())
    by_name = dict((st.name, (index, st))
                   for index, (_grouping, st) in enumerate(self['pipeline']))
    pipe_idx, stage = by_name[task.stage]
    stage.taskinfo = TaskInfo(
        jobname=task.jobname,
        host=task.host,
        stage=task.stage,
        group=task.group,
        label=task.group_label,
    )
    # Stages without explicit chains get the defaults.
    if not stage.input_chain:
        stage.input_chain = Stage.default_input_chain(pipe_idx)
    if not stage.output_chain:
        stage.output_chain = Stage.default_output_chain

    # And now run it.
    self.run_stage(task, stage, params)
def __init__(self, *args, **kwargs):
    """Create the job dictionary and derive its dependent entries.

    After the base initialiser has run, this fills in
    ``'required_modules'`` when it is ``None``, sets the ``'ext_map'`` /
    ``'ext_reduce'`` flags for dict-valued ``'map'`` / ``'reduce'``,
    expands ``'input'`` into a list of lists, normalises
    ``'partitions'``, computes ``'nr_reduces'``, merges the scheduler
    settings over the class defaults and rejects unknown keys.

    Raises:
        DiscoError: if ``'merge_partitions'`` is set without partitions,
            if the scheduler's ``max_cores`` is below 1, or if a key is
            not present in ``self.defaults``.
    """
    super(JobDict, self).__init__(*args, **kwargs)

    # -- backwards compatibility --
    writer_given = 'reduce_writer' in kwargs or 'map_writer' in kwargs
    if writer_given:
        warn("Writers are deprecated - use output_stream.add() instead",
             DeprecationWarning)

    # -- required modules and files --
    if self['required_modules'] is None:
        candidates = util.flatten(
            util.iterify(self[name])
            for name in chain(self.functions, self.stacks))
        callables = [candidate for candidate in candidates
                     if callable(candidate)]
        self['required_modules'] = find_modules(callables)

    # -- external flags --
    if isinstance(self['map'], dict):
        self['ext_map'] = True
    if isinstance(self['reduce'], dict):
        self['ext_reduce'] = True

    # -- input --
    ddfs = self.pop('ddfs', None)
    expanded = []
    for spec in self['input']:
        urls = util.urllist(spec, listdirs=bool(self['map']), ddfs=ddfs)
        for url in urls:
            expanded.append(list(util.iterify(url)))
    self['input'] = expanded

    # partitions must be an integer internally
    self['partitions'] = self['partitions'] or 0

    # set nr_reduces: ignored if there is not actually a reduce specified
    if self['map']:
        # partitioned map has N reduces; non-partitioned map has 1 reduce
        nr_reduces = self['partitions'] or 1
    elif self.input_is_partitioned:
        # Only reduce, with partitions: len(dir://) specifies nr_reduces
        highest_id = max(part_id
                         for entry in self['input']
                         for part_id, _url in util.read_index(entry[0]))
        nr_reduces = 1 + highest_id
    else:
        # Only reduce, without partitions can only have 1 reduce
        nr_reduces = 1
    self['nr_reduces'] = nr_reduces

    # merge_partitions iff the inputs to reduce are partitioned
    if self['merge_partitions']:
        if not (self['partitions'] or self.input_is_partitioned):
            raise DiscoError("Can't merge partitions without partitions")
        self['nr_reduces'] = 1

    # -- scheduler --
    # Start from a copy of the class defaults so they are never modified.
    merged = self.__class__.defaults['scheduler'].copy()
    merged.update(self['scheduler'])
    if int(merged['max_cores']) < 1:
        raise DiscoError("max_cores must be >= 1")
    self['scheduler'] = merged

    # -- sanity checks --
    for name in self:
        if name not in self.defaults:
            raise DiscoError("Unknown job argument: %s" % name)
def setUp(self):
    """Prepare inputs and blacklist every node except the first.

    Builds ``N`` input URLs per node (in sorted node order), blacklists
    each node that follows another in that order, and records in
    ``self.whitelist`` for each node the pair ``(N, next node)``.
    """
    host, port = self.test_server_address

    # assumption: scheduler starts scheduling tasks in the
    # order specified by self.input
    self.blacklisted = sorted(self.nodes.keys())

    urls_per_node = []
    for node in self.blacklisted:
        urls_per_node.append(N * ['http://%s/%s:%d' % (node, host, port)])
    self.input = flatten(urls_per_node)

    self.whitelist = {}
    # Walk consecutive (node, successor) pairs of the sorted node list.
    for node, successor in zip(self.blacklisted, self.blacklisted[1:]):
        self.disco.blacklist(successor)
        self.whitelist[node] = (N, successor)

    super(BlacklistTestCase, self).setUp()
def run(self, task, job, **jobargs):
    """Run the method of this worker named by ``task.mode``.

    Publishes ``task`` as the module-global ``Task``, resolves every
    entry of this worker through ``self.getitem``, verifies the Python
    version, prepares the external program when ``self[task.mode]`` is a
    dict, builds the task globals and then calls
    ``getattr(self, task.mode)(task, params)``.

    ``external.close()`` is always called once setup has started, even
    when the task raises, so a failing task does not skip the cleanup.

    Raises:
        AssertionError: if ``self['version']`` differs from the running
            interpreter's ``major.minor`` version.
    """
    global Task
    Task = task

    for key in self:
        self[key] = self.getitem(key, job, jobargs)
    assert self['version'] == '%s.%s' % sys.version_info[:2], "Python version mismatch"

    try:
        params = self['params']
        if isinstance(self[task.mode], dict):
            # A dict-valued mode entry is handed to external.prepare,
            # which receives ext_params instead of the regular params.
            params = self['ext_params']
            self[task.mode] = external.prepare(params, task.mode)

        # Import each required module under its last dotted component
        # and pass every value of this worker to util.globalize.
        globals_ = globals().copy()
        for module in self['required_modules']:
            name = module[0] if util.iskv(module) else module
            globals_[name.split('.')[-1]] = __import__(name, fromlist=[name])
        for obj in util.flatten(self.values()):
            util.globalize(obj, globals_)

        getattr(self, task.mode)(task, params)
    finally:
        # Previously skipped whenever anything above raised. It was
        # already called unconditionally on success, so it is assumed
        # safe to call when external.prepare was never invoked.
        external.close()
def run(self, task, job, **jobargs):
    """Run the method of this worker named by ``task.stage``.

    Publishes ``task`` as the module-global ``Task``, resolves every
    entry of this worker through ``self.getitem``, verifies the Python
    version, prepares the external program when ``self[task.stage]`` is
    a dict, builds the task globals and then calls
    ``getattr(self, task.stage)(task, params)``.

    ``external.close()`` is always called once setup has started, even
    when the task raises, so a failing task does not skip the cleanup.

    Raises:
        AssertionError: if ``self['version']`` differs from the running
            interpreter's ``major.minor`` version.
    """
    global Task
    Task = task

    for key in self:
        self[key] = self.getitem(key, job, jobargs)
    assert self['version'] == '{0[0]}.{0[1]}'.format(sys.version_info[:2]), "Python version mismatch"

    try:
        params = self['params']
        if isinstance(self[task.stage], dict):
            # A dict-valued stage entry is handed to external.prepare,
            # which receives ext_params instead of the regular params.
            params = self['ext_params']
            self[task.stage] = external.prepare(params, task.stage)

        # Import each required module under its last dotted component
        # and pass every value of this worker to util.globalize.
        globals_ = globals().copy()
        for module in self['required_modules']:
            name = module[0] if util.iskv(module) else module
            globals_[name.split('.')[-1]] = __import__(name, fromlist=[name])
        for obj in util.flatten(self.values()):
            util.globalize(obj, globals_)

        getattr(self, task.stage)(task, params)
    finally:
        # Previously skipped whenever anything above raised. It was
        # already called unconditionally on success, so it is assumed
        # safe to call when external.prepare was never invoked.
        external.close()
def test_flatten(self):
    """``flatten(sequence)`` must yield the integers 0 through 6 in order."""
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(list(range(7)), list(flatten(sequence)))
def runTest(self):
    """Check that the sorted result keys are ``N`` copies of each node.

    The expected list holds every entry of ``self.blacklisted`` repeated
    ``N`` times; the observed list is the sorted first element of each
    pair in ``self.results``.
    """
    expected = list(flatten(N * [node] for node in self.blacklisted))
    observed = sorted(name for name, _value in self.results)
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(expected, observed)
def test_rapply(self):
    """Each flattened ``rapply(sequence, function)`` item equals ``function(i)``.

    Items are paired with the integers 0 through 6; because ``zip`` stops
    at the shorter input, only as many items as ``flatten`` yields (at
    most seven) are checked.
    """
    # range instead of the Python-2-only xrange (the sibling flatten test
    # already uses range); assertEqual instead of the deprecated
    # assertEquals alias removed in Python 3.12.
    for x, y in zip(range(7), flatten(rapply(sequence, function))):
        self.assertEqual(function(x), y)
def __init__(self, *args, **kwargs):
    """Initialise the job dictionary and normalise its entries.

    Steps, in order: warn about deprecated writer arguments; discover
    ``'required_modules'`` when unset; flag external (dict-valued)
    ``'map'`` / ``'reduce'``; expand ``'input'``; coerce
    ``'partitions'``; compute ``'nr_reduces'``; merge scheduler options
    over the class defaults; reject keys missing from ``self.defaults``.

    Raises:
        DiscoError: if partitions are to be merged but none exist, if
            the scheduler's ``max_cores`` is below 1, or if an unknown
            job argument is present.
    """
    super(JobDict, self).__init__(*args, **kwargs)

    # -- backwards compatibility --
    if 'reduce_writer' in kwargs or 'map_writer' in kwargs:
        deprecation_text = "Writers are deprecated - use output_stream.add() instead"
        warn(deprecation_text, DeprecationWarning)

    # -- required modules and files --
    if self['required_modules'] is None:
        job_objects = util.flatten(
            util.iterify(self[field])
            for field in chain(self.functions, self.stacks))
        self['required_modules'] = find_modules(
            [thing for thing in job_objects if callable(thing)])

    # -- external flags --
    map_is_external = isinstance(self['map'], dict)
    if map_is_external:
        self['ext_map'] = True
    reduce_is_external = isinstance(self['reduce'], dict)
    if reduce_is_external:
        self['ext_reduce'] = True

    # -- input --
    ddfs = self.pop('ddfs', None)
    resolved_inputs = []
    for source in self['input']:
        for location in util.urllist(source,
                                     listdirs=bool(self['map']),
                                     ddfs=ddfs):
            resolved_inputs.append(list(util.iterify(location)))
    self['input'] = resolved_inputs

    # partitions must be an integer internally
    self['partitions'] = self['partitions'] or 0

    # set nr_reduces: ignored if there is not actually a reduce specified
    if self['map']:
        # partitioned map has N reduces; non-partitioned map has 1 reduce
        self['nr_reduces'] = self['partitions'] or 1
    elif self.input_is_partitioned:
        # Only reduce, with partitions: len(dir://) specifies nr_reduces
        partition_ids = (
            index_id
            for replicas in self['input']
            for index_id, _location in util.read_index(replicas[0]))
        self['nr_reduces'] = 1 + max(partition_ids)
    else:
        # Only reduce, without partitions can only have 1 reduce
        self['nr_reduces'] = 1

    # merge_partitions iff the inputs to reduce are partitioned
    if self['merge_partitions']:
        has_partitions = self['partitions'] or self.input_is_partitioned
        if not has_partitions:
            raise DiscoError("Can't merge partitions without partitions")
        self['nr_reduces'] = 1

    # -- scheduler --
    # Copy the class-level defaults before overlaying this job's values.
    effective = self.__class__.defaults['scheduler'].copy()
    effective.update(self['scheduler'])
    if int(effective['max_cores']) < 1:
        raise DiscoError("max_cores must be >= 1")
    self['scheduler'] = effective

    # -- sanity checks --
    for argument in self:
        if argument not in self.defaults:
            raise DiscoError("Unknown job argument: %s" % argument)
def _run(self, **kwargs):
    """Validate the job arguments, build the job request and submit it.

    The request is a flat dict of strings: job prefix, Python version,
    pickled params, sort/profile settings, required modules and files,
    packed stream stacks, scheduler options, packed map/reduce functions
    and the input list. It is netstring-encoded and sent to the master
    at ``/disco/job/new``; on success the job name from the reply is
    stored in ``self.name``.

    The scheduler options are processed on a copy, so neither the
    caller's ``scheduler`` dict nor the shared defaults are modified.

    Raises:
        DeprecationWarning: if the ``chunked`` argument is given.
        DiscoError: if ``input`` is missing, neither ``map`` nor
            ``reduce`` is given, an unknown argument is passed,
            ``max_cores`` is below 1, or the master does not reply with
            ``job started:``.
    """
    jobargs = util.DefaultDict(self.defaults.__getitem__, kwargs)

    # -- check parameters --

    # Backwards compatibility
    # (fun_map == map, input_files == input)
    # NOTE(review): jobargs was built from kwargs before these aliases
    # are added; whether jobargs sees them depends on util.DefaultDict
    # copying its initial mapping - confirm.
    if "fun_map" in kwargs:
        kwargs["map"] = kwargs["fun_map"]

    if "input_files" in kwargs:
        kwargs["input"] = kwargs["input_files"]

    if "chunked" in kwargs:
        raise DeprecationWarning("Argument 'chunked' is deprecated")

    if "nr_maps" in kwargs:
        sys.stderr.write("Warning: nr_maps is deprecated. "
                         "Use scheduler = {'max_cores': N} instead.\n")
        sched = jobargs["scheduler"].copy()
        if "max_cores" not in sched:
            sched["max_cores"] = int(jobargs["nr_maps"])
        jobargs["scheduler"] = sched

    if "input" not in kwargs:
        raise DiscoError("Argument input is required")

    if not ("map" in kwargs or "reduce" in kwargs):
        raise DiscoError("Specify map and/or reduce")

    for p in kwargs:
        if p not in Job.defaults:
            raise DiscoError("Unknown argument: %s" % p)

    inputs = kwargs["input"]

    # -- initialize request --

    request = {
        "prefix": self.name,
        "version": ".".join(map(str, sys.version_info[:2])),
        "params": cPickle.dumps(jobargs["params"], cPickle.HIGHEST_PROTOCOL),
        "sort": str(int(jobargs["sort"])),
        "mem_sort_limit": str(jobargs["mem_sort_limit"]),
        "status_interval": str(jobargs["status_interval"]),
        "profile": str(int(jobargs["profile"])),
    }

    # -- required modules --

    if "required_modules" in kwargs:
        rm = kwargs["required_modules"]
    else:
        functions = util.flatten(util.iterify(jobargs[f])
                                 for f in self.mapreduce_functions)
        rm = modutil.find_modules([f for f in functions if callable(f)])

    # A tuple entry is (module name, file to send); a plain entry is
    # only imported by name.
    send_mod = []
    imp_mod = []
    for mod in rm:
        if isinstance(mod, tuple):
            send_mod.append(mod[1])
            mod = mod[0]
        imp_mod.append(mod)

    request["required_modules"] = " ".join(imp_mod)
    rf = util.pack_files(send_mod)

    # -- input & output streams --

    for stream in ["map_input_stream", "map_output_stream",
                   "reduce_input_stream", "reduce_output_stream"]:
        self.pack_stack(kwargs, request, stream)

    # -- required files --

    if "required_files" in kwargs:
        if isinstance(kwargs["required_files"], dict):
            rf.update(kwargs["required_files"])
        else:
            rf.update(util.pack_files(kwargs["required_files"]))
    if rf:
        request["required_files"] = util.pack(rf)

    # -- scheduler --

    # Work on a copy: jobargs["scheduler"] may be the caller's own dict
    # or, through the DefaultDict fallback, the shared entry of
    # self.defaults. The max_cores fill-in below must not leak into
    # either of them.
    sched = dict(jobargs["scheduler"])
    sched_keys = ["max_cores", "force_local", "force_remote"]

    if "max_cores" not in sched:
        sched["max_cores"] = 2 ** 31
    elif sched["max_cores"] < 1:
        raise DiscoError("max_cores must be >= 1")

    for k in sched_keys:
        if k in sched:
            request["sched_" + k] = str(sched[k])

    # -- map --

    if "map" in kwargs:
        k = "ext_map" if isinstance(kwargs["map"], dict) else "map"
        request[k] = util.pack(kwargs["map"])

        for function_name in ("map_init", "map_reader", "map_writer",
                              "partition", "combiner"):
            function = jobargs[function_name]
            if function:
                request[function_name] = util.pack(function)

        def inputlist(entry):
            # An entry exposing __iter__ is collapsed into a single
            # newline-joined string in reversed order; anything else is
            # expanded by util.urllist.
            if hasattr(entry, "__iter__"):
                return ["\n".join(reversed(list(entry)))]
            return util.urllist(entry)
        inputs = [e for i in inputs for e in inputlist(i)]

    # -- only reduce --

    else:
        # XXX: Check for redundant inputs, external &
        # partitioned inputs
        inputs = [url for i in inputs for url in util.urllist(i)]

    request["input"] = " ".join(inputs)

    if "ext_params" in kwargs:
        e = kwargs["ext_params"]
        request["ext_params"] = encode_netstring_fd(e) if isinstance(e, dict) else e

    # -- reduce --

    nr_reduces = jobargs["nr_reduces"]
    if "reduce" in kwargs:
        k = "ext_reduce" if isinstance(kwargs["reduce"], dict) else "reduce"
        request[k] = util.pack(kwargs["reduce"])

        for function_name in ("reduce_reader", "reduce_writer", "reduce_init"):
            function = jobargs[function_name]
            if function:
                request[function_name] = util.pack(function)

    request["nr_reduces"] = str(nr_reduces)

    # -- encode and send the request --

    reply = self.master.request("/disco/job/new", encode_netstring_fd(request))

    if not reply.startswith("job started:"):
        raise DiscoError("Failed to start a job. Server replied: " + reply)
    self.name = reply.split(":", 1)[1]