Example #1
    def serialize(self, value):
        """"""
        if not value:
            value = 0

        value_seconds = parse_duration(value, input_unit=self.unit, unit="s")
        return human_duration(seconds=value_seconds, colon_format=True)
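
A minimal round-trip sketch of the two helpers used above, assuming they behave as in law.util and that "m" denotes minutes as an input unit (the printed value is illustrative, not verified):

from law.util import parse_duration, human_duration

# 90 minutes expressed in seconds
seconds = parse_duration(90, input_unit="m", unit="s")  # 5400.0
# render as a colon-separated string, e.g. "1:30:00"
print(human_duration(seconds=seconds, colon_format=True))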
Example #2
    def send(error, transports, t0):
        # do nothing when there are no transports
        if not transports:
            return

        # do nothing on KeyboardInterrupt, or when on_success / on_failure do not match the status
        success = error is None
        if isinstance(error, KeyboardInterrupt):
            return
        elif success and not opts["on_success"]:
            return
        elif not success and not opts["on_failure"]:
            return

        # prepare message content
        duration = human_duration(seconds=round(time.time() - t0, 1))
        status_string = "succeeded" if success else "failed"
        title = "Task {} {}!".format(_task.get_task_family(), status_string)
        parts = collections.OrderedDict([
            ("Host", socket.gethostname()),
            ("Duration", duration),
            ("Last message", "-"
             if not len(_task._message_cache) else _task._message_cache[-1]),
            ("Task", str(_task)),
        ])
        if not success:
            parts["Traceback"] = traceback.format_exc()
        message = "\n".join("{}: {}".format(*tpl) for tpl in parts.items())

        # dispatch via all transports
        for transport in transports:
            fn = transport["func"]
            raw = transport.get("raw", False)
            colored = transport.get("colored", False)

            # remove color commands if necessary
            if not colored:
                _title = uncolored(title)
                if raw:
                    _content = {
                        k: (uncolored(v)
                            if isinstance(v, six.string_types) else v)
                        for k, v in parts.items()
                    }
                else:
                    _content = uncolored(message)
            else:
                _title = title
                _content = parts.copy() if raw else message

            # invoke the function
            try:
                fn(success, _title, _content, **opts)
            except Exception as e:
                t = traceback.format_exc()
                logger.warning(
                    "notification via transport '{}' failed: {}\n{}".format(
                        fn, e, t))
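
The transports iterated above are plain dicts with a "func" key and optional "raw" and "colored" flags. A hypothetical transport conforming to the call signature fn(success, title, content, **opts) could look like this:

# hypothetical stdout transport; "content" is the formatted message string
# when raw=False, or the parts dict when raw=True
def print_transport(success, title, content, **opts):
    print(title)
    print(content)

transports = [{"func": print_transport, "raw": False, "colored": False}]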
Example #3
    def log_duration(t0):
        duration = human_duration(seconds=round(time.time() - t0, 1))

        # log
        timeit_logger = logger.getChild("timeit")
        timeit_logger.info("runtime of {}: {}".format(task.task_id, duration))

        # optionally publish a task message to the scheduler
        if opts["publish_message"] and callable(getattr(task, "publish_message", None)):
            task.publish_message("runtime: {}".format(duration))
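
Since log_duration only needs a start timestamp, a call site (with task, opts and logger resolved by the enclosing scope) would be:

import time

t0 = time.time()
# ... run the task body ...
log_duration(t0)  # logs e.g. 'runtime of <task_id>: 4.2 seconds' via the "timeit" child logger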
Example #4
    def publish_step(self, msg, success_message="done", fail_message="failed", runtime=True,
            scheduler=True):
        self.publish_message(msg, scheduler=scheduler)
        success = False
        t0 = time.time()
        try:
            yield
            success = True
        finally:
            msg = success_message if success else fail_message
            if runtime:
                diff = time.time() - t0
                msg = "{} (took {})".format(msg, human_duration(seconds=diff))
            self.publish_message(msg, scheduler=scheduler)
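
The single yield suggests this method is wrapped as a context manager (presumably via contextlib.contextmanager in the original source); usage would then be:

# presumed usage; do_work() is a hypothetical workload
with self.publish_step("processing inputs", success_message="done", fail_message="failed"):
    do_work()
# on exit, "done (took ...)" or "failed (took ...)" is published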
Example #5
File: util.py Project: riga/law
def renew_voms_proxy(password="", vo=None, lifetime="8 days", proxy_file=None):
    """
    Renews the voms proxy using a password *password*, an optional virtual organization name *vo*,
    and a default *lifetime* of 8 days, which is internally parsed by
    :py:func:`law.util.parse_duration` where the default input unit is hours. To ensure that the
    *password* is not visible in any process listing, it is written to a temporary file first and
    piped into the ``voms-proxy-init`` command. When *proxy_file* is *None*, it defaults to the
    result of :py:func:`get_voms_proxy_file`.
    """
    # parse and format the lifetime
    lifetime_seconds = max(parse_duration(lifetime, input_unit="h", unit="s"),
                           60.0)
    lifetime = human_duration(seconds=lifetime_seconds, colon_format="h")
    # pad the colon-formatted value to at least "hh:mm:ss", then cut the seconds part
    normalized = ":".join((2 - lifetime.count(":")) * ["00"] + [""]) + lifetime
    lifetime = ":".join(normalized.rsplit(":", 3)[-3:-1])

    # when proxy_file is None, get the default
    # when empty string, don't add a --out argument
    if proxy_file is None:
        proxy_file = get_voms_proxy_file()

    with tmp_file() as (_, tmp):
        with open(tmp, "w") as f:
            f.write(password)

        cmd = "cat '{}' | voms-proxy-init --valid '{}'".format(tmp, lifetime)
        if vo:
            cmd += " -voms '{}'".format(vo)
        if proxy_file:
            proxy_file = os.path.expandvars(os.path.expanduser(proxy_file))
            cmd += " --out '{}'".format(proxy_file)

        code, out, _ = interruptable_popen(cmd,
                                           shell=True,
                                           executable="/bin/bash",
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.STDOUT)

        if code != 0:
            raise Exception("voms-proxy-init failed: {}".format(out))
Example #6
    def log_duration(t0):
        duration = human_duration(seconds=round(perf_counter() - t0, 1))
        task.logger.info("runtime: {}".format(duration))
Example #7
    def poll(self):
        """
        Initiates the job status polling loop.
        """
        task = self.task

        # total job count
        n_jobs = len(self.submission_data)

        # track finished and failed jobs in dicts holding status data
        finished_jobs = OrderedDict()
        failed_jobs = OrderedDict()

        # track number of consecutive polling failures and the start time
        n_poll_fails = 0
        start_time = time.time()

        # get job kwargs for status querying
        query_kwargs = self._get_job_kwargs("query")

        # start the poll loop
        i = -1
        while True:
            i += 1

            # sleep after the first iteration
            if i > 0:
                time.sleep(task.poll_interval * 60)

            # handle scheduler messages, which could change some task parameters
            task._handle_scheduler_messages()

            # walltime exceeded?
            if task.walltime != NO_FLOAT and (time.time() - start_time) > task.walltime * 3600:
                raise Exception("exceeded walltime: {}".format(human_duration(hours=task.walltime)))

            # update variable attributes for polling
            self.poll_data.n_finished_min = task.acceptance * (1 if task.acceptance > 1 else n_jobs)
            self.poll_data.n_failed_max = task.tolerance * (1 if task.tolerance > 1 else n_jobs)

            # determine the currently active jobs, i.e., the jobs whose states should be checked,
            # and also store jobs whose ids are unknown
            active_jobs = OrderedDict()
            unknown_jobs = OrderedDict()
            for job_num, data in six.iteritems(self.submission_data.jobs):
                if job_num in finished_jobs or job_num in failed_jobs:
                    continue
                elif self._can_skip_job(job_num, data["branches"]):
                    finished_jobs[job_num] = self.status_data_cls.job_data(
                        status=self.job_manager.FINISHED, code=0)
                else:
                    data = data.copy()
                    if data["job_id"] in (None, self.status_data_cls.dummy_job_id):
                        data["job_id"] = self.status_data_cls.dummy_job_id
                        unknown_jobs[job_num] = data
                    else:
                        active_jobs[job_num] = data
            self.poll_data.n_active = len(active_jobs) + len(unknown_jobs)

            # query job states
            job_ids = [data["job_id"] for data in six.itervalues(active_jobs)]  # noqa: F812
            query_data = self.job_manager.query_batch(job_ids, **query_kwargs)

            # separate into actual states and errors that might have occurred during the query
            states_by_id = {}
            errors = []
            for job_id, state_or_error in six.iteritems(query_data):
                if isinstance(state_or_error, Exception):
                    errors.append(state_or_error)
                else:
                    states_by_id[job_id] = state_or_error

            # print the first show_errors errors
            if errors:
                print("{} error(s) occurred during job status query of task {}:".format(
                    len(errors), task.task_id))
                tmpl = "    {}"
                # count with j to avoid clobbering the poll loop counter i
                for j, err in enumerate(errors):
                    print(tmpl.format(err))
                    if j + 1 >= self.show_errors:
                        remaining = len(errors) - self.show_errors
                        if remaining > 0:
                            print("    ... and {} more".format(remaining))
                        break

                n_poll_fails += 1
                if task.poll_fails > 0 and n_poll_fails > task.poll_fails:
                    raise Exception("poll_fails exceeded")
                else:
                    continue
            else:
                n_poll_fails = 0

            # query results are keyed by job_id, so map the states back to job_num keys
            # using active_jobs (which defined the list of job ids queried above)
            states_by_num = OrderedDict()
            for job_num, data in six.iteritems(active_jobs):
                job_id = data["job_id"]
                states_by_num[job_num] = self.status_data_cls.job_data(**states_by_id[job_id])

            # consider jobs with unknown ids as retry jobs
            for job_num, data in six.iteritems(unknown_jobs):
                states_by_num[job_num] = self.status_data_cls.job_data(
                    status=self.job_manager.RETRY, error="unknown job id")

            # store jobs per status and take further actions depending on the status
            pending_jobs = OrderedDict()
            running_jobs = OrderedDict()
            newly_failed_jobs = OrderedDict()
            retry_jobs = OrderedDict()
            for job_num, data in six.iteritems(states_by_num):
                if data["status"] == self.job_manager.PENDING:
                    pending_jobs[job_num] = data
                    task.forward_dashboard_event(self.dashboard, data, "status.pending", job_num)

                elif data["status"] == self.job_manager.RUNNING:
                    running_jobs[job_num] = data
                    task.forward_dashboard_event(self.dashboard, data, "status.running", job_num)

                elif data["status"] == self.job_manager.FINISHED:
                    finished_jobs[job_num] = data
                    self.poll_data.n_active -= 1
                    self.submission_data.jobs[job_num]["job_id"] = self.submission_data.dummy_job_id
                    task.forward_dashboard_event(self.dashboard, data, "status.finished", job_num)

                elif data["status"] in (self.job_manager.FAILED, self.job_manager.RETRY):
                    newly_failed_jobs[job_num] = data
                    self.poll_data.n_active -= 1

                    # retry or ultimately failed?
                    if self.job_retries[job_num] < task.retries:
                        self.job_retries[job_num] += 1
                        self.submission_data.attempts.setdefault(job_num, 0)
                        self.submission_data.attempts[job_num] += 1
                        data["status"] = self.job_manager.RETRY
                        retry_jobs[job_num] = self.submission_data.jobs[job_num]["branches"]
                        task.forward_dashboard_event(self.dashboard, data, "status.retry", job_num)
                    else:
                        failed_jobs[job_num] = data
                        task.forward_dashboard_event(self.dashboard, data, "status.failed", job_num)

                else:
                    raise Exception("unknown job status '{}'".format(data["status"]))

            # gather some counts
            n_pending = len(pending_jobs)
            n_running = len(running_jobs)
            n_finished = len(finished_jobs)
            n_retry = len(retry_jobs)
            n_failed = len(failed_jobs)
            n_unsubmitted = len(self.submission_data.unsubmitted_jobs)

            # log the status line
            counts = (n_pending, n_running, n_finished, n_retry, n_failed)
            if self.poll_data.n_parallel != self.n_parallel_max:
                counts = (n_unsubmitted,) + counts
            status_line = self.job_manager.status_line(counts, last_counts=True, sum_counts=n_jobs,
                color=True, align=task.align_polling_status_line)
            status_line = task.modify_polling_status_line(status_line)
            task.publish_message(status_line)
            self.last_status_counts = counts

            # inform the scheduler about the progress
            task.publish_progress(100. * n_finished / n_jobs)

            # log newly failed jobs
            if newly_failed_jobs:
                print("{} failed job(s) in task {}:".format(len(newly_failed_jobs), task.task_id))
                tmpl = "    job: {job_num}, branch(es): {branches}, id: {job_id}, " \
                    "status: {status}, code: {code}, error: {error}{ext}"

                for j, (job_num, data) in enumerate(six.iteritems(newly_failed_jobs)):
                    branches = self.submission_data.jobs[job_num]["branches"]
                    log_file = self.submission_data.jobs[job_num]["log_file"]
                    ext = ""
                    if data["code"] in self.job_error_messages:
                        law_err = self.job_error_messages[data["code"]]
                        ext += ", job script error: {}".format(law_err)
                    if log_file:
                        ext += ", log: {}".format(log_file)

                    print(tmpl.format(job_num=job_num, branches=",".join(str(b) for b in branches),
                        ext=ext, **data))

                    if j + 1 >= self.show_errors:
                        remaining = len(newly_failed_jobs) - self.show_errors
                        if remaining > 0:
                            print("    ... and {} more".format(remaining))
                        break

            # infer the overall status
            reached_end = n_jobs == n_finished + n_failed
            finished = n_finished >= self.poll_data.n_finished_min
            failed = n_failed > self.poll_data.n_failed_max
            unreachable = n_jobs - n_failed < self.poll_data.n_finished_min
            if finished:
                # write status output
                if "status" in self._outputs:
                    status_data = self.status_data_cls()
                    status_data.jobs.update(finished_jobs)
                    status_data.jobs.update(states_by_num)
                    self._outputs["status"].dump(status_data, formatter="json", indent=4)
                break
            elif failed:
                failed_nums = [job_num for job_num in failed_jobs if job_num not in retry_jobs]
                raise Exception("tolerance exceeded for jobs {}".format(failed_nums))
            elif unreachable:
                err = None
                if reached_end:
                    err = "acceptance of {} not reached, total jobs: {}, failed jobs: {}"
                elif task.check_unreachable_acceptance:
                    err = "acceptance of {} unreachable, total jobs: {}, failed jobs: {}"
                if err:
                    raise Exception(err.format(self.poll_data.n_finished_min, n_jobs, n_failed))

            # configurable poll callback
            task.poll_callback(self.poll_data)

            # trigger automatic resubmission and submission of unsubmitted jobs if necessary
            if retry_jobs or self.poll_data.n_active < self.poll_data.n_parallel:
                self.submit(retry_jobs)

            # break when no polling is desired
            # we can get to this point when there was already a submission and the no_poll
            # parameter was set so that only failed jobs are resubmitted once
            if task.no_poll:
                break

        duration = round(time.time() - start_time)
        task.publish_message("polling took {}".format(human_duration(seconds=duration)))
Example #8
    def handle_scheduler_message(self, msg, _attr_value=None):
        """ handle_scheduler_message(msg)
        Hook that is called when a scheduler message *msg* is received. Returns *True* when the
        message was handled, and *False* otherwise.

        Handled messages in addition to those defined in
        :py:meth:`law.workflow.base.BaseWorkflow.handle_scheduler_message`:

            - ``parallel_jobs = <int>``
            - ``walltime = <str/int/float>``
            - ``poll_fails = <int>``
            - ``poll_interval = <str/int/float>``
            - ``retries = <int>``
        """
        attr, value = _attr_value or (None, None)

        # handle "parallel_jobs"
        if attr is None:
            m = re.match(r"^\s*(parallel\_jobs)\s*(\=|\:)\s*(.*)\s*$", str(msg))
            if m:
                attr = "parallel_jobs"
                # the workflow proxy must be set here
                if not getattr(self, "workflow_proxy", None):
                    value = Exception("workflow_proxy not set yet")
                else:
                    try:
                        n = self.workflow_proxy._set_parallel_jobs(int(m.group(3)))
                        value = "unlimited" if n == self.workflow_proxy.n_parallel_max else str(n)
                    except ValueError as e:
                        value = e

        # handle "walltime"
        if attr is None:
            m = re.match(r"^\s*(walltime)\s*(\=|\:)\s*(.*)\s*$", str(msg))
            if m:
                attr = "walltime"
                try:
                    self.walltime = self.__class__.walltime.parse(m.group(3))
                    value = human_duration(hours=self.walltime, colon_format=True)
                except ValueError as e:
                    value = e

        # handle "poll_fails"
        if attr is None:
            m = re.match(r"^\s*(poll\_fails)\s*(\=|\:)\s*(.*)\s*$", str(msg))
            if m:
                attr = "poll_fails"
                try:
                    self.poll_fails = int(m.group(3))
                    value = self.poll_fails
                except ValueError as e:
                    value = e

        # handle "poll_interval"
        if attr is None:
            m = re.match(r"^\s*(poll\_interval)\s*(\=|\:)\s*(.*)\s*$", str(msg))
            if m:
                attr = "poll_interval"
                try:
                    self.poll_interval = self.__class__.poll_interval.parse(m.group(3))
                    value = human_duration(minutes=self.poll_interval, colon_format=True)
                except ValueError as e:
                    value = e

        # handle "retries"
        if attr is None:
            m = re.match(r"^\s*(retries)\s*(\=|\:)\s*(.*)\s*$", str(msg))
            if m:
                attr = "retries"
                try:
                    self.retries = int(m.group(3))
                    value = self.retries
                except ValueError as e:
                    value = e

        return super(BaseRemoteWorkflow, self).handle_scheduler_message(msg, (attr, value))
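
Each handler matches messages of the form "attr = value" or "attr: value"; a quick demo of the pattern used for walltime:

import re

for msg in ("walltime = 12", "walltime: 0.5", "poll_interval = 2"):
    m = re.match(r"^\s*(walltime)\s*(\=|\:)\s*(.*)\s*$", msg)
    print(msg, "->", m.group(3) if m else "no match")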