def __init__(self, rule, settings, urls=None):
    """Set up a job run for *rule*: resolve the disco/ddfs handles, build the
    merged params, pick a worker implementation, create the disco Job, and
    notify listeners that the job has started.

    :param rule: rule object carrying name, params and an optional worker.
    :param settings: dict-like settings; values override rule params on conflict.
    :param urls: optional input urls, stored on the instance as-is.
    """
    self.job_options = JobOptions(rule, settings)
    self.rule = rule
    self.settings = settings
    # copy so updates below don't mutate the rule's own params object
    rule_params = dict(rule.params.__dict__)
    # rule-level 'server' wins over the settings-level one
    self.disco, self.ddfs = get_disco_handle(rule_params.get('server', settings.get('server')))
    rule_params.update(settings)
    self.params = Params(**rule_params)
    self.urls = urls
    try:
        # attempt to allow for overriden worker class from settings file or rule
        if rule.worker:
            worker = rule.worker
        else:
            # settings value is a dotted path: "package.module.WorkerClass"
            worker_mod, dot, worker_class = settings.get('worker').rpartition('.')
            mod = __import__(worker_mod, {}, {}, worker_mod)
            worker = getattr(mod, worker_class)()
        self.job = Job(name=rule.name, master=self.disco.master, worker=worker)
    except Exception as e:
        # deliberate broad catch: fall back to disco's default worker
        # rather than failing the whole run on a bad worker setting
        log.warn("Error instantiating worker: %s %s - loading default worker" % (settings.get('worker'), e))
        self.job = Job(name=rule.name, master=self.disco.master)
    self.full_job_id = None
    self.jobinfo = None
    self._notify(JOB_START)
def check_results(self, jobname, start_time, timeout, poll_interval):
    """Poll once for the results of *jobname*.

    :param jobname: name of the job to check.
    :param start_time: epoch seconds when the wait started, for the timeout check.
    :param timeout: seconds before giving up, or a falsy value for no timeout.
    :param poll_interval: passed through to :meth:`results` as its timeout.
    :returns: the result urls when the job status is ``'ready'``.
    :raises JobError: when the job reports a failed status, or when *timeout*
        seconds have elapsed since *start_time*.
    :raises Continue: when the job is still active and the caller should
        poll again.
    """
    try:
        status, results = self.results(jobname, timeout=poll_interval)
    except CommError:
        # transient master communication error: treat the job as still
        # running and let the caller retry (was `except CommError as e`
        # with `e` never used)
        status = 'active'
    if status == 'ready':
        return results
    if status != 'active':
        raise JobError(Job(name=jobname, master=self), "Status {0}".format(status))
    if timeout and time.time() - start_time > timeout:
        raise JobError(Job(name=jobname, master=self), "Timeout")
    raise Continue()
def new_job(self, name, **jobargs):
    """
    Submits a new job request to the master using :class:`disco.job.Job`::

            return Job(name=name, master=self.master).run(**jobargs)
    """
    job = Job(name=name, master=self.master)
    return job.run(**jobargs)
def profile_stats(self, jobname, mode='', stream=sys.stdout):
    """
    Returns results of job profiling.
    :ref:`jobdict` must have had the ``profile`` flag enabled.

    :type  mode: 'map' or 'reduce' or ''
    :param mode: restricts results to the map or reduce phase, or not.

    :type  stream: file-like object
    :param stream: alternate output stream.
                   See the `pstats.Stats constructor
                   <http://docs.python.org/library/profile.html#pstats.Stats>`_.

    The function returns a `pstats.Stats object
    <http://docs.python.org/library/profile.html#the-stats-class>`_.
    For instance, you can print out results as follows::

            job.profile_stats().sort_stats('cumulative').print_stats()

    .. versionadded:: 0.2.1
    """
    # oob keys for profile data look like 'profile-<mode>...'; an empty
    # mode prefix matches both map and reduce entries
    prefix = 'profile-%s' % mode
    f = [s for s in self.oob_list(jobname) if s.startswith(prefix)]
    if not f:
        raise JobError(Job(name=jobname, master=self), "No profile data")
    import pstats
    # seed a pstats.Stats with the first blob, then merge the rest in;
    # NOTE(review): Stats here is presumably a project wrapper that
    # deserializes an oob blob into something pstats accepts — confirm
    stats = pstats.Stats(Stats(self.oob_get(jobname, f[0])), stream=stream)
    for s in f[1:]:
        stats.add(Stats(self.oob_get(jobname, s)))
    stats.strip_dirs()
    stats.sort_stats('cumulative')
    return stats
def get(program, key, jobname):
    """Usage: key jobname

    Print the oob value for the given key and jobname.
    """
    from disco.job import Job
    resolved_name = program.job_history(jobname)
    job = Job(name=resolved_name, master=program.disco)
    print(job.oob_get(key))
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Predict the closest clusters for the datapoints in input.

    :param dataset: dataset object whose ``params`` provide ``input_chain``
        and ``data_tag``.
    :param fitmodel_url: dict with a ``"kmeans_fitmodel"`` key pointing at
        the fitted cluster centers.
    :param save_results: passed through to the disco ``Worker``.
    :param show: enables console event output in ``job.wait``.
    :returns: result urls of the prediction job.
    :raises Exception: if *fitmodel_url* does not look like a kmeans fit model.
    """
    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator
    if "kmeans_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")
    job = Job(worker=Worker(save_results=save_results))
    # merge dataset params with the shared center definitions via
    # copy + update: dict(a.items() + b.items()) only works on Python 2,
    # where items() returns concatenable lists
    params = dict(dataset.params.items())
    params.update(mean_point_center)
    job.params = params
    job.params["centers"] = [
        (i, c) for i, c in result_iterator(fitmodel_url["kmeans_fitmodel"])
    ]
    job.pipeline = [("split",
                     Stage("kmeans_predict",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=predict_map))]
    job.run(input=dataset.params["data_tag"], name="kmeans_predict")
    return job.wait(show=show)
def oob(program, jobname):
    """Usage: jobname

    Print the oob keys for the named job.
    """
    from disco.job import Job
    job = Job(name=jobname, master=program.disco)
    for oob_key in job.oob_list():
        print(oob_key)
def wait(self, jobname, poll_interval=2, timeout=None, clean=False, show=None):
    """
    Block until the job has finished.
    Returns a list of the result urls.

    :type  poll_interval: int
    :param poll_interval: the number of seconds between job status requests.

    :type  timeout: int or None
    :param timeout: if specified, the number of seconds before returning or
                    raising a :class:`disco.JobError`.

    :type  clean: bool
    :param clean: if `True`, call :meth:`Disco.clean` when the job has finished.

                  .. deprecated:: 0.4

    :type  show: bool or string
    :param show: enables console output of job events.
                 The default is provided by :envvar:`DISCO_EVENTS`.

                 .. versionadded:: 0.2.3
    """
    if show is None:
        show = self.settings['DISCO_EVENTS']
    event_monitor = EventMonitor(Job(name=jobname, master=self.master),
                                 format=show, poll_interval=poll_interval)
    start_time = time.time()
    try:
        while True:
            event_monitor.refresh()
            try:
                # NOTE(review): poll_interval is in seconds here but is
                # multiplied by 1000 for check_results — presumably the
                # master request timeout is milliseconds; confirm units
                return self.check_results(jobname, start_time, timeout, poll_interval * 1000)
            except Continue:
                # job still active: loop around and poll again
                continue
            finally:
                # runs on every exit from check_results (return, raise,
                # or Continue): optionally clean, then flush events
                if clean:
                    self.clean(jobname)
                event_monitor.refresh()
    finally:
        event_monitor.cleanup()
return others, active def jobinfo(self, jobname): """Returns a dict containing information about the job.""" return json.loads(self.request('/disco/ctrl/jobinfo?name=%s' % jobname)) def check_results(self, jobname, start_time, timeout, poll_interval): try: status, results = self.results(jobname, timeout=poll_interval) except CommError, e: status = 'active' if status == 'ready': return results if status != 'active': raise JobError(Job(name=jobname, master=self), "Status %s" % status) if timeout and time.time() - start_time > timeout: raise JobError(Job(name=jobname, master=self), "Timeout") raise Continue() def wait(self, jobname, poll_interval=2, timeout=None, clean=False, show=None): """ Block until the job has finished. Returns a list of the result urls.
others.append((jobname, (status, result))) return others, active def jobinfo(self, jobname): """Returns a dict containing information about the job.""" return json.loads(self.request('/disco/ctrl/jobinfo?name=%s' % jobname)) def check_results(self, jobname, start_time, timeout, poll_interval): try: status, results = self.results(jobname, timeout=poll_interval) except CommError, e: status = 'active' if status == 'ready': return results if status != 'active': raise JobError(Job(name=jobname, master=self), "Status %s" % status) if timeout and time.time() - start_time > timeout: raise JobError(Job(name=jobname, master=self), "Timeout") raise Continue() def wait(self, jobname, poll_interval=2, timeout=None, clean=False, show=None): """ Block until the job has finished. Returns a list of the result urls. :type poll_interval: int :param poll_interval: the number of seconds between job status requests. :type timeout: int or None :param timeout: if specified, the number of seconds before returning or raising a :class:`disco.JobError`.
def fit(dataset, n_clusters=5, max_iterations=10, random_state=None, save_results=True, show=False):
    """
    Optimize k-clustering for `iterations` iterations with cluster
    center definitions as given in `center`.

    :param dataset: dataset object whose ``params`` provide ``input_chain``
        and ``data_tag``.
    :param n_clusters: number of clusters (k); must be at least 2.
    :param max_iterations: number of refinement iterations; must be at least 1,
        which also guarantees ``fitmodel_url`` is bound before the return.
    :param random_state: seed for the random initial centers.
    :param save_results: passed through to each disco ``Worker``.
    :param show: enables console event output in ``job.wait``.
    :returns: dict ``{"kmeans_fitmodel": <result urls>}``.
    :raises Exception: on non-numerical or out-of-range parameters.
    """
    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator
    try:
        n_clusters = int(n_clusters)
        max_iterations = int(max_iterations)
        if n_clusters < 2:
            raise Exception("Parameter n_clusters should be greater than 1.")
        if max_iterations < 1:
            raise Exception(
                "Parameter max_iterations should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    def _base_params():
        # copy + update instead of dict(a.items() + b.items()): the latter
        # only works on Python 2, where items() returns concatenable lists
        params = dict(dataset.params.items())
        params.update(mean_point_center)
        return params

    # stage 1: pick k random initial centers from the data
    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("kmeans_init_map",
                           input_chain=dataset.params["input_chain"],
                           init=map_init,
                           process=random_init_map)),
                    ('group_label',
                     Stage("kmeans_init_reduce",
                           process=estimate_reduce,
                           init=simple_init,
                           combine=True))]
    job.params = _base_params()
    job.params['seed'] = random_state
    job.params['k'] = n_clusters
    job.run(input=dataset.params["data_tag"], name="kmeans_init")
    init = job.wait(show=show)
    centers = [(i, c) for i, c in result_iterator(init)]

    # stage 2: Lloyd iterations — assign points to the nearest center,
    # then re-estimate the centers, max_iterations times
    for j in range(max_iterations):
        job = Job(worker=Worker(save_results=save_results))
        job.params = _base_params()
        job.params['k'] = n_clusters
        job.params['centers'] = centers
        job.pipeline = [('split',
                         Stage("kmeans_map_iter_%s" % (j + 1, ),
                               input_chain=dataset.params["input_chain"],
                               process=estimate_map,
                               init=simple_init)),
                        ('group_label',
                         Stage("kmeans_reduce_iter_%s" % (j + 1, ),
                               process=estimate_reduce,
                               init=simple_init,
                               combine=True))]
        job.run(input=dataset.params["data_tag"], name='kmeans_iter_%d' % (j + 1, ))
        fitmodel_url = job.wait(show=show)
        centers = [(i, c) for i, c in result_iterator(fitmodel_url)]

    return {"kmeans_fitmodel": fitmodel_url}  # return results url