def set_cancelled_flag_for_actions(job_request_id, actions):
    # It's important that we modify the Jobs in-place in the database rather
    # than retrieving, updating and re-writing them. If we did the latter then
    # we would risk dirty writes if the run thread modified a Job while we
    # were working.
    update_where(
        Job,
        {"cancelled": True},
        job_request_id=job_request_id,
        action__in=actions,
    )
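
# A minimal sketch (an assumption, not the real `update_where` implementation)
# of how an update like the one above can be pushed down into the database as
# a single SQL statement. Because the UPDATE runs entirely inside the database
# there is no read-modify-write window in which the run thread could overwrite
# our change. The table and column names below are illustrative only.
import sqlite3


def _illustrative_update_where(conn: sqlite3.Connection, job_request_id, actions):
    # Build one parameterised UPDATE covering all the requested actions
    placeholders = ", ".join("?" for _ in actions)
    conn.execute(
        f"UPDATE job SET cancelled = 1 "
        f"WHERE job_request_id = ? AND action IN ({placeholders})",
        [job_request_id, *actions],
    )
    conn.commit()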
def test_existing_cancelled_jobs_are_ignored_when_checking_dependencies(
    tmp_work_dir,
):
    create_jobs_with_project_file(
        make_job_request(action="generate_cohort"), TEST_PROJECT
    )
    cancelled_generate_job = find_one(Job, action="generate_cohort")
    update_where(Job, {"cancelled": True}, id=cancelled_generate_job.id)
    # Now schedule a job which has the above job as a dependency
    create_jobs_with_project_file(
        make_job_request(action="prepare_data_1"), TEST_PROJECT
    )
    # Check that it's spawned a new instance of the cancelled job and wired up
    # the dependencies correctly
    prepare_job = find_one(Job, action="prepare_data_1")
    new_generate_job = find_one(Job, action="generate_cohort", cancelled=0)
    assert new_generate_job.id != cancelled_generate_job.id
    assert len(prepare_job.wait_for_job_ids) == 1
    assert prepare_job.wait_for_job_ids[0] == new_generate_job.id
def main():
    print(
        "== DANGER ZONE ==\n"
        "\n"
        "This will kill all running jobs and reset them to the PENDING state, ready\n"
        "to be restarted following a reboot.\n"
        "\n"
        "It should only be run when the job-runner service has been stopped."
        "\n"
    )
    confirm = input("Are you sure you want to continue? (y/N)")
    assert confirm.strip().lower() == "y"
    # Reset all running jobs to pending
    update_where(
        Job, {"state": State.PENDING, "started_at": None}, state=State.RUNNING
    )
    # Make sure all containers and volumes are removed, ready to freshly
    # restart the jobs after the reboot
    for job in find_where(Job, state=State.PENDING):
        docker.kill(container_name(job))
        docker.delete_container(container_name(job))
        docker.delete_volume(volume_name(job))
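
# The `container_name` and `volume_name` helpers used above are assumed to map
# a job to deterministically named Docker objects, so that a cleanup pass can
# find them again after a crash or reboot without any extra bookkeeping. A
# rough sketch of that idea; the exact naming scheme is an assumption for
# illustration only, not necessarily what the real helpers produce.
def _illustrative_container_name(job):
    return f"os-job-{job.id}"


def _illustrative_volume_name(job):
    return f"os-volume-{job.id}"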
def test_run_all_ignores_failed_actions_that_have_been_removed(tmp_work_dir):
    # Long ago there was a useless action that failed and then was rightly
    # expunged from the study pipeline
    obsolete_action_def = """
  obsolete_action:
    run: python:latest -c pass
    outputs:
      moderately_sensitive:
        name: path
"""
    create_jobs_with_project_file(
        make_job_request(action="obsolete_action"), TEST_PROJECT + obsolete_action_def
    )
    update_where(Job, {"state": State.FAILED}, action="obsolete_action")
    # Since then all the healthy, vigorous actions have been successfully run
    # individually
    request = make_job_request(
        actions=[
            "generate_cohort",
            "prepare_data_1",
            "prepare_data_2",
            "analyse_data",
        ]
    )
    create_jobs_with_project_file(request, TEST_PROJECT)
    update_where(Job, {"state": State.SUCCEEDED}, job_request_id=request.id)
    with pytest.raises(NothingToDoError):
        # Now this should be a no-op because all the actions that are still
        # part of the study have succeeded
        create_jobs_with_project_file(make_job_request(action="run_all"), TEST_PROJECT)
def create_and_run_jobs(
    project_dir,
    actions,
    force_run_dependencies,
    continue_on_error,
    temp_dir,
    docker_label,
    clean_up_docker_objects=True,
    log_format=LOCAL_RUN_FORMAT,
    format_output_for_github=False,
):
    # Fiddle with the configuration to suit what we need for running local jobs
    docker.LABEL = docker_label
    # It's more helpful in this context to have things consistent
    config.RANDOMISE_JOB_ORDER = False
    config.HIGH_PRIVACY_WORKSPACES_DIR = project_dir.parent
    config.DATABASE_FILE = project_dir / "metadata" / "db.sqlite"
    config.TMP_DIR = temp_dir
    config.JOB_LOG_DIR = temp_dir / "logs"
    config.BACKEND = "expectations"
    config.USING_DUMMY_DATA_BACKEND = True
    config.CLEAN_UP_DOCKER_OBJECTS = clean_up_docker_objects
    # We want to fetch any reusable actions code directly from Github so as to
    # avoid pushing unnecessary traffic through the proxy
    config.GIT_PROXY_DOMAIN = "github.com"
    # Rather than using the throwaway `temp_dir` to store git repos in we use a
    # consistent directory within the system tempdir. This means we don't have
    # to keep refetching commits and also avoids the complexity of deleting
    # git's read-only directories on Windows. We use the current username as a
    # crude means of scoping the directory to the user in order to avoid
    # potential permissions issues if multiple users share the same directory.
    config.GIT_REPO_DIR = Path(tempfile.gettempdir()).joinpath(
        f"opensafely_{getuser()}"
    )
    # None of the below should be used when running locally
    config.WORKDIR = None
    config.HIGH_PRIVACY_STORAGE_BASE = None
    config.MEDIUM_PRIVACY_STORAGE_BASE = None
    config.MEDIUM_PRIVACY_WORKSPACES_DIR = None
    configure_logging(
        fmt=log_format,
        # All the other output we produce goes to stdout and it's a bit
        # confusing if the log messages end up on a separate stream
        stream=sys.stdout,
        # Filter out log messages in the local run context
        extra_filter=filter_log_messages,
    )
    # Any jobs that are running or pending must be left over from a previous
    # run that was aborted either by an unexpected and unhandled exception or
    # by the researcher abruptly terminating the process. We can't reasonably
    # recover them (and the researcher may not want to -- maybe that's why
    # they terminated), so we mark them as cancelled. This causes the rest of
    # the system to effectively ignore them.
    #
    # We do this here at the beginning rather than trying to catch these cases
    # when the process exits because the latter couldn't ever completely
    # guarantee to catch every possible termination case correctly.
    database.update_where(
        Job,
        {"cancelled": True, "state": State.FAILED},
        state__in=[State.RUNNING, State.PENDING],
    )

    try:
        job_request, jobs = create_job_request_and_jobs(
            project_dir, actions, force_run_dependencies
        )
    except NothingToDoError:
        print("=> All actions already completed successfully")
        print("   Use -f option to force everything to re-run")
        return True
    except (ProjectValidationError, ReusableActionError, JobRequestError) as e:
        print(f"=> {type(e).__name__}")
        print(textwrap.indent(str(e), "   "))
        if hasattr(e, "valid_actions"):
            print("\n   Valid action names are:")
            for action in e.valid_actions:
                if action != RUN_ALL_COMMAND:
                    print(f"     {action}")
                else:
                    print(f"     {action} (runs all actions in project)")
        return False

    docker_images = get_docker_images(jobs)

    uses_stata = any(
        i.startswith(f"{config.DOCKER_REGISTRY}/stata-mp:") for i in docker_images
    )
    if uses_stata and config.STATA_LICENSE is None:
        config.STATA_LICENSE = get_stata_license()
        if config.STATA_LICENSE is None:
            print(
                "The docker image 'stata-mp' requires a license to function.\n"
                "\n"
                "If you are a member of OpenSAFELY we should have been able to fetch\n"
                "the license automatically, so something has gone wrong. Please open\n"
                "a new discussion here so we can help:\n"
                "  https://github.com/opensafely/documentation/discussions\n"
                "\n"
                "If you are not a member of OpenSAFELY you will have to provide your\n"
                "own license. See the discussion here for pointers:\n"
                "  https://github.com/opensafely/documentation/discussions/299"
            )
            return False

    for image in docker_images:
        if not docker.image_exists_locally(image):
            print(f"Fetching missing docker image: docker pull {image}")
            try:
                # We want to be chatty when running in the console so users can
                # see progress and quiet in CI so we don't spam the logs with
                # layer download noise
                docker.pull(image, quiet=not sys.stdout.isatty())
            except docker.DockerPullError as e:
                print("Failed with error:")
                print(e)
                return False

    action_names = [job.action for job in jobs]
    print(f"\nRunning actions: {', '.join(action_names)}\n")

    # Wrap all the log output inside an expandable block when running inside
    # Github Actions
    if format_output_for_github:
        print(f"::group::Job Runner Logs {ANSI.Grey}(click to view){ANSI.Reset}")

    # Run everything
    exit_condition = (
        no_jobs_remaining if continue_on_error else job_failed_or_none_remaining
    )
    try:
        run_main(exit_callback=exit_condition)
    except KeyboardInterrupt:
        pass
    finally:
        if format_output_for_github:
            print("::endgroup::")

    final_jobs = find_where(
        Job, state__in=[State.FAILED, State.SUCCEEDED], job_request_id=job_request.id
    )
    # Always show failed jobs last, otherwise show in order run
    final_jobs.sort(
        key=lambda job: (
            1 if job.state == State.FAILED else 0,
            job.started_at or 0,
        )
    )

    # Pretty print details of each action
    print()
    if not final_jobs:
        print("=> No jobs completed")
    for job in final_jobs:
        log_file = f"{METADATA_DIR}/{job.action}.log"
        # If a job fails we don't want to clutter the output with its failed
        # dependants.
        if (
            job.state == State.FAILED
            and job.status_code == StatusCode.DEPENDENCY_FAILED
        ):
            continue
        if format_output_for_github:
            print(f"{ANSI.Bold}=> {job.action}{ANSI.Reset}")
        else:
            print(f"=> {job.action}")
        print(textwrap.indent(job.status_message, "   "))
        # Where a job failed because expected outputs weren't found we show a
        # list of other outputs which were generated
        if job.unmatched_outputs:
            print(
                "\n   Did you mean to match one of these files instead?\n   - ",
                end="",
            )
            print("\n   - ".join(job.unmatched_outputs))
        print()
        # Output the entire log file inside an expandable block when running
        # inside Github Actions
        if format_output_for_github:
            print(
                f"::group:: log file: {log_file} {ANSI.Grey}(click to view){ANSI.Reset}"
            )
            long_grey_line = ANSI.Grey + ("\u2015" * 80) + ANSI.Reset
            print(long_grey_line)
            print((project_dir / log_file).read_text())
            print(long_grey_line)
            print("::endgroup::")
        else:
            print(f"   log file: {log_file}")
        # Display matched outputs
        print("   outputs:")
        outputs = sorted(job.outputs.items()) if job.outputs else []
        print(tabulate(outputs, separator=" - ", indent=5, empty="(no outputs)"))
        # If a job exited with an error code then try to display the end of the
        # log output in case that makes the problem immediately obvious
        if job.status_code == StatusCode.NONZERO_EXIT:
            logs, truncated = get_log_file_snippet(project_dir / log_file, max_lines=32)
            if logs:
                print(f"\n   logs{' (truncated)' if truncated else ''}:\n")
                print(textwrap.indent(logs, "      "))
        print()

    success_flag = all(job.state == State.SUCCEEDED for job in final_jobs)
    return success_flag
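
# A rough sketch, under assumptions, of what a helper like `get_log_file_snippet`
# (used above) might do: return up to `max_lines` lines from the end of the log
# plus a flag saying whether anything was cut off. This is illustrative only and
# not necessarily how the real helper behaves.
def _illustrative_log_snippet(log_path, max_lines):
    if not log_path.exists():
        return "", False
    lines = log_path.read_text(errors="replace").strip().splitlines()
    truncated = len(lines) > max_lines
    # Keep only the tail of the log so obvious errors near the end are visible
    return "\n".join(lines[-max_lines:]), truncated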