Example No. 1
def reattach(j: Jobs, rules: Rules,
             pending_jobs: List[Execution]) -> List[DelegateExecution]:
    executing = []
    for e in pending_jobs:
        if e.exec_xref is not None:
            rule = rules.get_rule(e.transform)
            client = rules.get_client(rule.executor)
            ee = client.reattach(e.exec_xref)
            executing.append(ee)
            log.warn("Reattaching existing job {}: {}".format(
                e.transform, e.exec_xref))
        else:
            log.warn("Canceling {}".format(e.id))
            j.cancel_execution(e.id)
    return executing
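
The function above re-adopts any pending execution that still carries an external reference (exec_xref) and cancels the rest. The following is a minimal, self-contained sketch of that reattach-or-cancel pattern, assuming hypothetical stand-ins (PendingExecution, FakeClient, FakeJobs) rather than conseq's real Execution, executor client, and Jobs types.

# Minimal sketch of the reattach-or-cancel pattern from Example No. 1.
# PendingExecution, FakeClient, and FakeJobs are hypothetical stand-ins,
# not the real conseq API.
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class PendingExecution:
    id: int
    transform: str
    exec_xref: Optional[str]  # external handle left by a previous run, if any


class FakeClient:
    def reattach(self, xref: str) -> str:
        # the real client would return a DelegateExecution; a string suffices here
        return f"reattached:{xref}"


class FakeJobs:
    def cancel_execution(self, job_id: int) -> None:
        print(f"canceled execution {job_id}")


def reattach_or_cancel(j: FakeJobs, client: FakeClient,
                       pending: List[PendingExecution]) -> List[str]:
    executing = []
    for e in pending:
        if e.exec_xref is not None:
            # an external reference survived the restart, so adopt the running job
            executing.append(client.reattach(e.exec_xref))
        else:
            # nothing to reattach to: drop the stale pending record
            j.cancel_execution(e.id)
    return executing


if __name__ == "__main__":
    pending = [PendingExecution(1, "align", "slurm-123"),
               PendingExecution(2, "count", None)]
    print(reattach_or_cancel(FakeJobs(), FakeClient(), pending))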
Example No. 2
def main_loop(jinja2_env: Environment,
              j: Jobs,
              new_object_listener: Callable,
              rules: Rules,
              state_dir: str,
              executing: List[DelegateExecution],
              capture_output: bool,
              req_confirm: bool,
              maxfail: int,
              maxstart: Optional[int],
              properties_to_add: Optional[list] = None) -> int:
    if properties_to_add is None:
        # avoid the mutable-default-argument pitfall
        properties_to_add = []

    from conseq.exec_client import create_publish_exec_client
    _client_for_publishing = Lazy(
        lambda: create_publish_exec_client(rules.get_vars()))

    resources_per_client = {
        name: client.resources
        for name, client in rules.exec_clients.items()
    }
    timings = TimelineLog(state_dir + "/timeline.log")
    active_job_ids = {e.id for e in executing}

    resolver = xref.Resolver(state_dir, rules.vars)

    prev_msg = None
    abort = False
    success_count = 0
    failures = []
    start_count = 0
    job_ids_to_ignore = set()
    skip_remaining = False

    def get_pending():
        pending_jobs = j.get_pending()
        if skip_remaining:
            # record the ids of the jobs being skipped before clearing the list;
            # updating from an already-emptied list would add nothing
            job_ids_to_ignore.update([pj.id for pj in pending_jobs])
            pending_jobs = []
        else:
            pending_jobs = [
                pj for pj in pending_jobs if pj.id not in job_ids_to_ignore
            ]

        return pending_jobs

    with ui.capture_sigint() as was_interrupted_fn:
        while not abort:
            interrupted = was_interrupted_fn()
            if interrupted:
                break

            if len(failures) >= maxfail:
                we_should_stop = True
                if len(executing) > 0:
                    # if other tasks are still running, ask the user whether we really want to abort now.
                    we_should_stop, maxfail = ui.user_says_we_should_stop(
                        len(failures), executing)
                if we_should_stop:
                    break

            pending_jobs = get_pending()

            summary = get_execution_summary(executing)

            msg = "%d processes running (%s), %d executions pending, %d skipped" % (
                len(executing), summary, len(pending_jobs),
                len(job_ids_to_ignore))
            if prev_msg != msg:
                log.info(msg)
                if len(pending_jobs) + len(executing) > 0:
                    long_summary = get_long_execution_summary(
                        executing, pending_jobs)
                    log.info("Summary of queue:\n%s\n", long_summary)

            prev_msg = msg
            cannot_start_more = (maxstart is not None
                                 and start_count >= maxstart) or skip_remaining
            if len(executing) == 0 and (cannot_start_more
                                        or len(pending_jobs) == 0):
                # now that we've completed everything, check for deferred jobs by marking them as ready.  If we have any, loop again
                j.enable_deferred()
                deferred_jobs = len(get_pending())
                if deferred_jobs > 0 and not cannot_start_more:
                    log.info("Marked deferred %d executions as ready",
                             deferred_jobs)
                    continue
                break

            did_useful_work = False

            # might be worth checking to see if the inputs are identical to previous call
            # to avoid wasting CPU time checking to schedule over and over when resources are exhausted.

            # also, the current design has an issue when rerunning part of the execution tree.  Imagine
            # rule "A" produces "a1", "b1", and "c1", rule "T" transforms "a1" to "a2", "b1" to "b2", and "c1" to "c2".
            # Lastly rule F takes in a2, b2, and c2 and produces "f".
            # Now, everything is great if starting from a clean slate.  But we've run once, in the artifact db we have
            # a1, a2, b1, b2, c1, c2, f.   If we then rerun T, then we'll get the following executions:  (new objects denoted with
            # "*", old objects from previous run have no star.)
            # T(a1) -> a2*
            # F(a2*, b2, c2) -> f*
            # T(b1) -> b2*
            # F(a2*, b2*, c2) -> f*
            # T(c1) -> c2*
            # F(a2*, b2*, c2*) -> f*
            #
            # So in the end the right thing would get done.  However, we've run F three times as often as necessary.  If we
            # had a priority queue for work, then we could just set each rule execution's priority to max(input.id).
            # That would force a breadth-first execution of the graph.  However, since jobs can execute in parallel,
            # prioritizing is not enough.  (And we can't block based on priority or there'd be no parallelism!)
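            # (hypothetical illustration of that priority idea, not code that exists here:
            #      priority = max(i.id for i in job.inputs)
            #      heapq.heappush(work_queue, (priority, job))
            #  where work_queue would be a new structure; this is only a sketch.)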
            #
            # ultimately, I don't think there's a shortcut, and we may need to check the DAG from the previous execution to see
            # whether an ancestor node is being re-executed; if so, prune that pending rule execution from the pending list until that
            # task is done.
            ready_jobs = get_satisfiable_jobs(rules, resources_per_client,
                                              pending_jobs, executing)
            for job in ready_jobs:
                assert isinstance(job, dep.RuleExecution)

                if maxstart is not None and start_count >= maxstart:
                    break

                active_job_ids.add(job.id)
                did_useful_work = True

                rule = rules.get_rule(job.transform)

                timings.log(job.id, "preprocess_xrefs")
                # process xrefs which might require rewriting an artifact
                xrefs_resolved = exec_client.preprocess_xref_inputs(
                    j, resolver, job.inputs)
                if xrefs_resolved:
                    log.info(
                        "Resolved xrefs on rule, new version will be executed next pass"
                    )
                    timings.log(job.id, "resolved_xrefs")
                    continue

                timings.log(job.id, "preprocess_inputs")
                if rule.is_publish_rule:
                    client = _client_for_publishing()
                else:
                    # localize paths that will be used in scripts
                    client = rules.get_client(rule.executor)
                inputs, resolver_state = client.preprocess_inputs(
                    resolver, bind_inputs(rule, job.inputs))
                debug_log.log_input_preprocess(job.id, job.inputs, inputs)

                # if confirmation from the user is required, do it before we continue
                if req_confirm:
                    answer = ui.confirm_execution(job.transform, inputs)
                    if answer == "a":
                        req_confirm = False
                    elif answer == "q":
                        abort = True
                        break
                    elif answer == "s":
                        job_ids_to_ignore.add(job.id)
                        continue
                    elif answer == "S":
                        skip_remaining = True
                        break

                if rule.is_publish_rule:
                    publish(jinja2_env, rule.publish_location,
                            rules.get_vars(), inputs)

                # maybe record_started and update_exec_xref should be merged so anything started
                # always has an xref
                exec_id = j.record_started(job.id)
                timings.log(job.id, "start")

                job_dir = get_job_dir(state_dir, exec_id)
                if not os.path.exists(job_dir):
                    os.makedirs(job_dir)

                e = execute(job.transform, resolver,
                            jinja2_env, exec_id, job_dir, inputs, rule,
                            rules.get_vars(), capture_output, resolver_state,
                            client)
                executing.append(e)
                j.update_exec_xref(e.id, e.get_external_id(), job_dir)
                start_count += 1

            # now poll the jobs which are running and look for which have completed
            for i, e in reversed(list(enumerate(executing))):
                failure, completion = e.get_completion()

                if failure is None and completion is None:
                    continue

                del executing[i]
                timestamp = datetime.datetime.now().isoformat()

                if completion is not None:
                    rule = rules.get_rule(e.transform)
                    if not rule.has_for_all_input():
                        # only do this check if no inputs are marked as "for all"
                        # because we can have cases where a new artifact appears and we _do_ want
                        # to re-run the rule and clobber the output of the previous run.
                        # If we wanted to be very conservative, we could handle for-all by
                        # looking up which rule created the previous artifact and confirm that it was
                        # from a rule with the same inputs, only verifying the "all" parameters have
                        # changed. However, just ignoring clobbers from rules with "for all" is a cheap
                        # approximation.
                        _failures = []
                        for artifact in completion:
                            if j.get_existing_id(None, artifact) is not None:
                                # j.gc()
                                _failure = f"Rule {e.transform} ({e.job_dir} generated an output which already exists: {artifact}"
                                _failures.append(_failure)
                                log.error(_failure)
                        if len(_failures) > 0:
                            failure = ", ".join(_failures)

                if failure is not None:
                    job_id = j.record_completed(timestamp, e.id,
                                                dep.STATUS_FAILED, {})
                    failures.append((e.transform, e.job_dir))
                    debug_log.log_completed(job_id, dep.STATUS_FAILED,
                                            completion)
                    timings.log(job_id, "fail")
                elif completion is not None:
                    amended_outputs = _amend_outputs(completion,
                                                     properties_to_add)

                    job_id = j.record_completed(timestamp, e.id,
                                                dep.STATUS_COMPLETED,
                                                amended_outputs)
                    debug_log.log_completed(job_id, dep.STATUS_COMPLETED,
                                            completion)
                    success_count += 1
                    timings.log(job_id, "complete")

                did_useful_work = True

            j.refresh_rules()

            if not did_useful_work:
                time.sleep(0.5)

    if len(executing) > 0:
        ui.ask_user_to_cancel(j, executing)

    log.info("%d jobs successfully executed", success_count)
    if len(failures) > 0:
        # maybe also show summary of which jobs failed?
        log.warning(
            "%d jobs failed: %s", len(failures), ", ".join([
                "{} ({})".format(job_dir, transform)
                for transform, job_dir in failures
            ]))
        return -1

    return 0
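
Example No. 2 polls running executions by walking the list in reverse index order so finished entries can be deleted in place, and sleeps briefly when nothing changed. The following is a minimal, self-contained sketch of that polling pattern, assuming a hypothetical Running class in place of DelegateExecution; the (failure, completion) contract of get_completion is inferred from the code above.

# Minimal sketch of the completion-polling loop from Example No. 2.
# Running is a hypothetical stand-in for DelegateExecution.
import random
import time
from typing import List, Optional, Tuple


class Running:
    def __init__(self, name: str):
        self.name = name

    def get_completion(self) -> Tuple[Optional[str], Optional[dict]]:
        # (failure, completion); both None means "still running"
        if random.random() < 0.3:
            return None, {"name": self.name}
        return None, None


def poll_until_done(executing: List[Running]) -> None:
    while executing:
        did_useful_work = False
        # reverse index order so `del executing[i]` does not shift unvisited items
        for i, e in reversed(list(enumerate(executing))):
            failure, completion = e.get_completion()
            if failure is None and completion is None:
                continue  # still running
            del executing[i]
            did_useful_work = True
            print("finished:", e.name, failure or completion)
        if not did_useful_work:
            time.sleep(0.5)  # avoid a busy loop while everything is still running


if __name__ == "__main__":
    poll_until_done([Running("a"), Running("b"), Running("c")])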