def set_cancelled_flag_for_actions(job_request_id, actions):
    # It's important that we modify the Jobs in-place in the database rather than retrieving, updating and re-writing
    # them. If we did the latter then we would risk dirty writes if the run thread modified a Job while we were
    # working.
    update_where(
        Job,
        {"cancelled": True},
        job_request_id=job_request_id,
        action__in=actions,
    )
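
For context, a minimal sketch of what an update_where helper along these lines might do under the hood. This is an illustrative assumption (a bare sqlite3 table named "job"), not the job-runner implementation; it shows why issuing an in-place UPDATE avoids the read-modify-write race described in the comment above.

import sqlite3


def update_where_sketch(conn: sqlite3.Connection, update: dict, **where):
    # Build "SET col = ?" fragments from the update dict
    set_clause = ", ".join(f"{column} = ?" for column in update)
    conditions, params = [], list(update.values())
    for key, value in where.items():
        if key.endswith("__in"):
            # A "field__in" keyword becomes "field IN (?, ?, ...)"
            placeholders = ", ".join("?" * len(value))
            conditions.append(f"{key[:-4]} IN ({placeholders})")
            params.extend(value)
        else:
            conditions.append(f"{key} = ?")
            params.append(value)
    sql = f"UPDATE job SET {set_clause} WHERE {' AND '.join(conditions)}"
    # The database applies the change atomically, so a concurrent writer can't
    # be clobbered by a stale copy of the row held in Python.
    conn.execute(sql, params)
    conn.commit()

With that shape, the call above becomes a single "UPDATE job SET cancelled = ? WHERE job_request_id = ? AND action IN (...)" statement.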
Example #2
def test_existing_cancelled_jobs_are_ignored_when_checking_dependencies(
    tmp_work_dir,
):
    create_jobs_with_project_file(make_job_request(action="generate_cohort"),
                                  TEST_PROJECT)
    cancelled_generate_job = find_one(Job, action="generate_cohort")
    update_where(Job, {"cancelled": True}, id=cancelled_generate_job.id)

    # Now schedule a job which has the above job as a dependency
    create_jobs_with_project_file(make_job_request(action="prepare_data_1"),
                                  TEST_PROJECT)

    # Check that it's spawned a new instance of the cancelled job and wired up the dependencies correctly
    prepare_job = find_one(Job, action="prepare_data_1")
    new_generate_job = find_one(Job, action="generate_cohort", cancelled=0)
    assert new_generate_job.id != cancelled_generate_job.id

    assert len(prepare_job.wait_for_job_ids) == 1
    assert prepare_job.wait_for_job_ids[0] == new_generate_job.id
Example #3
def main():
    print(
        "== DANGER ZONE ==\n"
        "\n"
        "This will kill all running jobs and reset them to the PENDING state, ready\n"
        "to be restarted following a reboot.\n"
        "\n"
        "It should only be run when the job-runner service has been stopped."
        "\n"
    )
    confirm = input("Are you sure you want to continue? (y/N) ")
    assert confirm.strip().lower() == "y"
    # Reset all running jobs to pending
    update_where(Job, {"state": State.PENDING, "started_at": None}, state=State.RUNNING)
    # Make sure all containers and volumes are removed ready to freshly restart the jobs
    # after the reboot
    for job in find_where(Job, state=State.PENDING):
        docker.kill(container_name(job))
        docker.delete_container(container_name(job))
        docker.delete_volume(volume_name(job))
Example #4
def test_run_all_ignores_failed_actions_that_have_been_removed(tmp_work_dir):
    # Long ago there was a useless action that failed and then was rightly expunged from the study pipeline
    obsolete_action_def = """
  obsolete_action:
    run: python:latest -c pass
    outputs:
      moderately_sensitive:
        name: path
    """
    create_jobs_with_project_file(make_job_request(action="obsolete_action"),
                                  TEST_PROJECT + obsolete_action_def)
    update_where(Job, {"state": State.FAILED}, action="obsolete_action")

    # Since then all the healthy, vigorous actions have been successfully run individually
    request = make_job_request(actions=[
        "generate_cohort", "prepare_data_1", "prepare_data_2", "analyse_data"
    ])
    create_jobs_with_project_file(request, TEST_PROJECT)
    update_where(Job, {"state": State.SUCCEEDED}, job_request_id=request.id)

    with pytest.raises(NothingToDoError):
        # Now this should be a no-op because all the actions that are still part of the study have succeeded
        create_jobs_with_project_file(make_job_request(action="run_all"),
                                      TEST_PROJECT)
Example #5
def create_and_run_jobs(
    project_dir,
    actions,
    force_run_dependencies,
    continue_on_error,
    temp_dir,
    docker_label,
    clean_up_docker_objects=True,
    log_format=LOCAL_RUN_FORMAT,
    format_output_for_github=False,
):
    # Fiddle with the configuration to suit what we need for running local jobs
    docker.LABEL = docker_label
    # It's more helpful in this context to have job ordering be deterministic
    config.RANDOMISE_JOB_ORDER = False
    config.HIGH_PRIVACY_WORKSPACES_DIR = project_dir.parent
    config.DATABASE_FILE = project_dir / "metadata" / "db.sqlite"
    config.TMP_DIR = temp_dir
    config.JOB_LOG_DIR = temp_dir / "logs"
    config.BACKEND = "expectations"
    config.USING_DUMMY_DATA_BACKEND = True
    config.CLEAN_UP_DOCKER_OBJECTS = clean_up_docker_objects

    # We want to fetch any reusable actions code directly from Github so as to
    # avoid pushing unnecessary traffic through the proxy
    config.GIT_PROXY_DOMAIN = "github.com"
    # Rather than using the throwaway `temp_dir` to store git repos in we use a
    # consistent directory within the system tempdir. This means we don't have
    # to keep refetching commits and also avoids the complexity of deleting
    # git's read-only directories on Windows. We use the current username as a
    # crude means of scoping the directory to the user in order to avoid
    # potential permissions issues if multiple users share the same directory.
    config.GIT_REPO_DIR = Path(
        tempfile.gettempdir()).joinpath(f"opensafely_{getuser()}")

    # None of the below should be used when running locally
    config.WORKDIR = None
    config.HIGH_PRIVACY_STORAGE_BASE = None
    config.MEDIUM_PRIVACY_STORAGE_BASE = None
    config.MEDIUM_PRIVACY_WORKSPACES_DIR = None

    configure_logging(
        fmt=log_format,
        # All the other output we produce goes to stdout and it's a bit
        # confusing if the log messages end up on a separate stream
        stream=sys.stdout,
        # Filter out log messages in the local run context
        extra_filter=filter_log_messages,
    )

    # Any jobs that are running or pending must be left over from a previous run that was aborted either by an
    # unexpected and unhandled exception or by the researcher abruptly terminating the process. We can't reasonably
    # recover them (and the researcher may not want to -- maybe that's why they terminated), so we mark them as
    # cancelled. This causes the rest of the system to effectively ignore them.
    #
    # We do this here at the beginning rather than trying to catch these cases when the process exits because the
    # latter couldn't ever completely guarantee to catch every possible termination case correctly.
    database.update_where(
        Job,
        {
            "cancelled": True,
            "state": State.FAILED
        },
        state__in=[State.RUNNING, State.PENDING],
    )

    try:
        job_request, jobs = create_job_request_and_jobs(
            project_dir, actions, force_run_dependencies)
    except NothingToDoError:
        print("=> All actions already completed successfully")
        print("   Use -f option to force everything to re-run")
        return True
    except (ProjectValidationError, ReusableActionError, JobRequestError) as e:
        print(f"=> {type(e).__name__}")
        print(textwrap.indent(str(e), "   "))
        if hasattr(e, "valid_actions"):
            print("\n   Valid action names are:")
            for action in e.valid_actions:
                if action != RUN_ALL_COMMAND:
                    print(f"     {action}")
                else:
                    print(f"     {action} (runs all actions in project)")
        return False

    docker_images = get_docker_images(jobs)

    uses_stata = any(
        i.startswith(f"{config.DOCKER_REGISTRY}/stata-mp:")
        for i in docker_images)
    if uses_stata and config.STATA_LICENSE is None:
        config.STATA_LICENSE = get_stata_license()
        if config.STATA_LICENSE is None:
            print(
                "The docker image 'stata-mp' requires a license to function.\n"
                "\n"
                "If you are a member of OpenSAFELY we should have been able to fetch\n"
                "the license automatically, so something has gone wrong. Please open\n"
                "a new discussion here so we can help:\n"
                "  https://github.com/opensafely/documentation/discussions\n"
                "\n"
                "If you are not a member of OpenSAFELY you will have to provide your\n"
                "own license. See the dicussion here for pointers:\n"
                " https://github.com/opensafely/documentation/discussions/299")
            return False

    for image in docker_images:
        if not docker.image_exists_locally(image):
            print(f"Fetching missing docker image: docker pull {image}")
            try:
                # We want to be chatty when running in the console so users can
                # see progress and quiet in CI so we don't spam the logs with
                # layer download noise
                docker.pull(image, quiet=not sys.stdout.isatty())
            except docker.DockerPullError as e:
                print("Failed with error:")
                print(e)
                return False

    action_names = [job.action for job in jobs]
    print(f"\nRunning actions: {', '.join(action_names)}\n")

    # Wrap all the log output inside an expandable block when running inside
    # Github Actions
    if format_output_for_github:
        print(
            f"::group::Job Runner Logs {ANSI.Grey}(click to view){ANSI.Reset}")

    # Run everything
    exit_condition = (no_jobs_remaining
                      if continue_on_error else job_failed_or_none_remaining)
    try:
        run_main(exit_callback=exit_condition)
    except KeyboardInterrupt:
        pass
    finally:
        if format_output_for_github:
            print("::endgroup::")

    final_jobs = find_where(Job,
                            state__in=[State.FAILED, State.SUCCEEDED],
                            job_request_id=job_request.id)
    # Always show failed jobs last, otherwise show in order run
    final_jobs.sort(key=lambda job: (
        1 if job.state == State.FAILED else 0,
        job.started_at or 0,
    ))

    # Pretty print details of each action
    print()
    if not final_jobs:
        print("=> No jobs completed")
    for job in final_jobs:
        log_file = f"{METADATA_DIR}/{job.action}.log"
        # If a job fails we don't want to clutter the output with its failed
        # dependants.
        if (job.state == State.FAILED
                and job.status_code == StatusCode.DEPENDENCY_FAILED):
            continue
        if format_output_for_github:
            print(f"{ANSI.Bold}=> {job.action}{ANSI.Reset}")
        else:
            print(f"=> {job.action}")
        print(textwrap.indent(job.status_message, "   "))
        # Where a job failed because expected outputs weren't found we show a
        # list of other outputs which were generated
        if job.unmatched_outputs:
            print(
                "\n   Did you mean to match one of these files instead?\n    - ",
                end="")
            print("\n    - ".join(job.unmatched_outputs))
        print()
        # Output the entire log file inside an expandable block when running
        # inside Github Actions
        if format_output_for_github:
            print(
                f"::group:: log file: {log_file} {ANSI.Grey}(click to view){ANSI.Reset}"
            )
            long_grey_line = ANSI.Grey + ("\u2015" * 80) + ANSI.Reset
            print(long_grey_line)
            print((project_dir / log_file).read_text())
            print(long_grey_line)
            print("::endgroup::")
        else:
            print(f"   log file: {log_file}")
        # Display matched outputs
        print("   outputs:")
        outputs = sorted(job.outputs.items()) if job.outputs else []
        print(
            tabulate(outputs, separator="  - ", indent=5,
                     empty="(no outputs)"))
        # If a job exited with an error code then try to display the end of the
        # log output in case that makes the problem immediately obvious
        if job.status_code == StatusCode.NONZERO_EXIT:
            logs, truncated = get_log_file_snippet(project_dir / log_file,
                                                   max_lines=32)
            if logs:
                print(f"\n   logs{' (truncated)' if truncated else ''}:\n")
                print(textwrap.indent(logs, "     "))
        print()

    success_flag = all(job.state == State.SUCCEEDED for job in final_jobs)
    return success_flag
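
A hypothetical invocation of create_and_run_jobs, purely to illustrate the signature above. The paths, docker label and action name are made-up example values; in practice the real command-line entry point supplies these arguments.

import tempfile
from pathlib import Path

success = create_and_run_jobs(
    project_dir=Path.cwd(),           # assumed to contain the study's project.yaml
    actions=["generate_cohort"],      # example action name
    force_run_dependencies=False,
    continue_on_error=False,
    temp_dir=Path(tempfile.mkdtemp()),
    docker_label="job-runner-local",  # example label
)
print("All jobs succeeded" if success else "Some jobs failed")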