Example #1
File: job.py Project: pooya/inferno
    def __init__(self, rule, settings, urls=None):
        self.job_options = JobOptions(rule, settings)
        self.rule = rule
        self.settings = settings
        rule_params = dict(rule.params.__dict__)
        self.disco, self.ddfs = get_disco_handle(rule_params.get('server', settings.get('server')))
        rule_params.update(settings)
        self.params = Params(**rule_params)
        self.urls = urls

        try:
            # allow the worker class to be overridden by the rule or the settings file
            if rule.worker:
                worker = rule.worker
            else:
                worker_mod, dot, worker_class = settings.get('worker').rpartition('.')
                mod = __import__(worker_mod, {}, {}, worker_mod)
                worker = getattr(mod, worker_class)()
            self.job = Job(name=rule.name,
                           master=self.disco.master,
                           worker=worker)
        except Exception as e:
            log.warn("Error instantiating worker: %s %s - loading default worker"
                     % (settings.get('worker'), e))
            self.job = Job(name=rule.name,
                           master=self.disco.master)
        self.full_job_id = None
        self.jobinfo = None
        self._notify(JOB_START)
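
The interesting move above is resolving a worker class from a dotted path in the settings file. A minimal, self-contained sketch of the same lookup using importlib (the function name and the stdlib class in the demo are illustrative, not part of inferno):

import importlib

def load_class(dotted_path):
    """Resolve a 'package.module.ClassName' string to the class object."""
    module_name, _, class_name = dotted_path.rpartition('.')
    return getattr(importlib.import_module(module_name), class_name)

# Stdlib demonstration; inferno would pass a settings value such as
# 'disco.worker.classic.worker.Worker' instead.
OrderedDict = load_class('collections.OrderedDict')
print(OrderedDict(a=1))  # OrderedDict([('a', 1)])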
Example #2
    def check_results(self, jobname, start_time, timeout, poll_interval):
        try:
            status, results = self.results(jobname, timeout=poll_interval)
        except CommError as e:
            status = 'active'
        if status == 'ready':
            return results
        if status != 'active':
            raise JobError(Job(name=jobname, master=self),
                           "Status {0}".format(status))
        if timeout and time.time() - start_time > timeout:
            raise JobError(Job(name=jobname, master=self), "Timeout")
        raise Continue()
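
check_results implements a three-way contract: 'ready' returns the results, any terminal status other than 'active' raises JobError, and everything else raises Continue to tell the caller to poll again (see wait() in Example #8). A hedged sketch of the driver side of that contract, with Continue stubbed in locally since in disco it comes from the library's own error module:

import time

class Continue(Exception):
    """Local stand-in for disco's 'poll again' signal."""

def poll(check, poll_interval=2):
    # Call `check` until it returns results or raises a real error;
    # Continue means "not done yet, ask again".
    while True:
        try:
            return check()
        except Continue:
            time.sleep(poll_interval)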
Example #3
    def new_job(self, name, **jobargs):
        """
        Submits a new job request to the master using :class:`disco.job.Job`::

                return Job(name=name, master=self.master).run(**jobargs)
        """
        return Job(name=name, master=self.master).run(**jobargs)
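
A hypothetical submission through new_job, assuming a reachable master and a classic-worker map function; the master URL, input URL, and job name are placeholders:

from disco.core import Disco

def my_map(line, params):
    # Classic-worker map function: emit (word, 1) for every word.
    for word in line.split():
        yield word, 1

disco = Disco('http://localhost:8989')       # assumed master URL
job = disco.new_job(name='word_count',
                    input=['http://example.com/data.txt'],
                    map=my_map)
results = job.wait(show=True)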
Example #4
    def profile_stats(self, jobname, mode='', stream=sys.stdout):
        """
        Returns results of job profiling.
        :ref:`jobdict` must have had the ``profile`` flag enabled.

        :type  mode: 'map' or 'reduce' or ''
        :param mode: restricts the results to the map or reduce phase;
                     the default (empty string) includes both phases.

        :type  stream: file-like object
        :param stream: alternate output stream.
                       See the `pstats.Stats constructor <http://docs.python.org/library/profile.html#pstats.Stats>`_.

        The function returns a `pstats.Stats object <http://docs.python.org/library/profile.html#the-stats-class>`_.
        For instance, you can print out results as follows::

                job.profile_stats().sort_stats('cumulative').print_stats()

        .. versionadded:: 0.2.1
        """
        prefix = 'profile-%s' % mode
        f = [s for s in self.oob_list(jobname) if s.startswith(prefix)]
        if not f:
            raise JobError(Job(name=jobname, master=self), "No profile data")

        import pstats
        stats = pstats.Stats(Stats(self.oob_get(jobname, f[0])), stream=stream)
        for s in f[1:]:
            stats.add(Stats(self.oob_get(jobname, s)))
        stats.strip_dirs()
        stats.sort_stats('cumulative')
        return stats
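
The returned object is a plain pstats.Stats, already stripped and sorted by cumulative time, so the usual pstats calls apply. A hedged usage sketch, assuming `disco` is a connected Disco instance and `jobname` names a job run with the profile flag enabled:

stats = disco.profile_stats(jobname)          # already sorted by 'cumulative'
stats.print_stats(10)                         # ten most expensive calls
stats.sort_stats('tottime').print_callers(5)  # re-sort and inspect callers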
Example #5
def get(program, key, jobname):
    """Usage: key jobname

    Print the oob value for the given key and jobname.
    """
    from disco.job import Job
    print(Job(name=program.job_history(jobname), master=program.disco).oob_get(key))
Example #6
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Predict the closest cluster for each datapoint in the input.
    """

    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator

    if "kmeans_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.params = dict(dataset.params.items() + mean_point_center.items())
    job.params["centers"] = [
        (i, c) for i, c in result_iterator(fitmodel_url["kmeans_fitmodel"])
    ]

    job.pipeline = [("split",
                     Stage("kmeans_predict",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=predict_map))]

    job.run(input=dataset.params["data_tag"], name="kmeans_predict")

    return job.wait(show=show)
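
predict() returns the result urls from job.wait(), so the predictions are read back with result_iterator. A hedged follow-up, assuming `ds` is the dataset object these functions expect, `model` is the dict returned by fit() in Example #11, and each emitted pair is (datapoint id, predicted cluster):

from disco.core import result_iterator

urls = predict(ds, model)          # ds and model built elsewhere
for point_id, cluster in result_iterator(urls):
    print(point_id, cluster)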
Example #7
def oob(program, jobname):
    """Usage: jobname

    Print the oob keys for the named job.
    """
    from disco.job import Job
    for key in Job(name=jobname, master=program.disco).oob_list():
        print(key)
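
Examples #5 and #7 are thin CLI wrappers around two OOB calls on Job. The same round trip through the API directly, hedged (the helper name and master URL are illustrative):

from disco.core import Disco
from disco.job import Job

def dump_oob(jobname, master_url='http://localhost:8989'):
    # Print every out-of-band key/value pair the named job wrote.
    job = Job(name=jobname, master=Disco(master_url))
    for key in job.oob_list():
        print(key, job.oob_get(key))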
Example #8
    def wait(self,
             jobname,
             poll_interval=2,
             timeout=None,
             clean=False,
             show=None):
        """
        Block until the job has finished.
        Returns a list of the result urls.

        :type  poll_interval: int
        :param poll_interval: the number of seconds between job status requests.

        :type  timeout: int or None
        :param timeout: if specified, the number of seconds before returning or
                        raising a :class:`disco.JobError`.

        :type  clean: bool
        :param clean: if `True`,
                      call :meth:`Disco.clean` when the job has finished.

                      .. deprecated:: 0.4

        :type  show: bool or string
        :param show: enables console output of job events.
                     The default is provided by :envvar:`DISCO_EVENTS`.

                     .. versionadded:: 0.2.3
        """
        if show is None:
            show = self.settings['DISCO_EVENTS']
        event_monitor = EventMonitor(Job(name=jobname, master=self.master),
                                     format=show,
                                     poll_interval=poll_interval)
        start_time = time.time()
        try:
            while True:
                event_monitor.refresh()
                try:
                    return self.check_results(jobname, start_time, timeout,
                                              poll_interval * 1000)
                except Continue:
                    continue
                finally:
                    if clean:
                        self.clean(jobname)
                    event_monitor.refresh()
        finally:
            event_monitor.cleanup()
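
A typical call, hedged: `disco` is a connected Disco instance and `jobname` comes from an earlier submission; the returned result urls are then consumed with result_iterator:

from disco.core import result_iterator

results = disco.wait(jobname, timeout=600, show=True)  # block at most 10 min
for key, value in result_iterator(results):
    print(key, value)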
Example #9
        return others, active

    def jobinfo(self, jobname):
        """Returns a dict containing information about the job."""
        return json.loads(self.request('/disco/ctrl/jobinfo?name=%s' %
                                       jobname))

    def check_results(self, jobname, start_time, timeout, poll_interval):
        try:
            status, results = self.results(jobname, timeout=poll_interval)
        except CommError as e:
            status = 'active'
        if status == 'ready':
            return results
        if status != 'active':
            raise JobError(Job(name=jobname, master=self),
                           "Status %s" % status)
        if timeout and time.time() - start_time > timeout:
            raise JobError(Job(name=jobname, master=self), "Timeout")
        raise Continue()

    def wait(self,
             jobname,
             poll_interval=2,
             timeout=None,
             clean=False,
             show=None):
        """
        Block until the job has finished.
        Returns a list of the result urls.
Example #10
                others.append((jobname, (status, result)))
        return others, active

    def jobinfo(self, jobname):
        """Returns a dict containing information about the job."""
        return json.loads(self.request('/disco/ctrl/jobinfo?name=%s' % jobname))

    def check_results(self, jobname, start_time, timeout, poll_interval):
        try:
            status, results = self.results(jobname, timeout=poll_interval)
        except CommError as e:
            status = 'active'
        if status == 'ready':
            return results
        if status != 'active':
            raise JobError(Job(name=jobname, master=self), "Status %s" % status)
        if timeout and time.time() - start_time > timeout:
            raise JobError(Job(name=jobname, master=self), "Timeout")
        raise Continue()

    def wait(self, jobname, poll_interval=2, timeout=None, clean=False, show=None):
        """
        Block until the job has finished.
        Returns a list of the result urls.

        :type  poll_interval: int
        :param poll_interval: the number of seconds between job status requests.

        :type  timeout: int or None
        :param timeout: if specified, the number of seconds before returning or
                        raising a :class:`disco.JobError`.
Example #11
def fit(dataset,
        n_clusters=5,
        max_iterations=10,
        random_state=None,
        save_results=True,
        show=False):
    """
    Optimize the k-means clustering of `dataset` for at most `max_iterations`
    iterations, starting from `n_clusters` randomly initialized centers.
    """
    from disco.job import Job
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import result_iterator

    try:
        n_clusters = int(n_clusters)
        max_iterations = int(max_iterations)
        if n_clusters < 2:
            raise Exception("Parameter n_clusters should be greater than 1.")
        if max_iterations < 1:
            raise Exception(
                "Parameter max_iterations should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("kmeans_init_map",
                           input_chain=dataset.params["input_chain"],
                           init=map_init,
                           process=random_init_map)),
                    ('group_label',
                     Stage("kmeans_init_reduce",
                           process=estimate_reduce,
                           init=simple_init,
                           combine=True))]
    job.params = dict(dataset.params.items() + mean_point_center.items())
    job.params['seed'] = random_state
    job.params['k'] = n_clusters

    job.run(input=dataset.params["data_tag"], name="kmeans_init")
    init = job.wait(show=show)
    centers = [(i, c) for i, c in result_iterator(init)]

    for j in range(max_iterations):
        job = Job(worker=Worker(save_results=save_results))
        job.params = dict(dataset.params.items() + mean_point_center.items())
        job.params['k'] = n_clusters
        job.params['centers'] = centers

        job.pipeline = [('split',
                         Stage("kmeans_map_iter_%s" % (j + 1, ),
                               input_chain=dataset.params["input_chain"],
                               process=estimate_map,
                               init=simple_init)),
                        ('group_label',
                         Stage("kmeans_reduce_iter_%s" % (j + 1, ),
                               process=estimate_reduce,
                               init=simple_init,
                               combine=True))]

        job.run(input=dataset.params["data_tag"],
                name='kmeans_iter_%d' % (j + 1, ))
        fitmodel_url = job.wait(show=show)
        centers = [(i, c) for i, c in result_iterator(fitmodel_url)]

    return {"kmeans_fitmodel": fitmodel_url}  # return results url