def persist_outputs(job, outputs, container_metadata):
    """Copy logs and generated outputs to persistent storage.

    Writes a metadata/log bundle to the job's log directory, extracts output
    files from the job's docker volume into the high-privacy workspace, and
    mirrors logs plus moderately-sensitive outputs into the medium-privacy
    workspace (when one is configured).
    """
    # job_metadata is a big dict capturing everything we know about the state
    # of the job
    job.completed_at = int(time.time())
    job_metadata = dict()
    job_metadata["job_id"] = job.id
    job_metadata["job_request_id"] = job.job_request_id
    job_metadata["created_at"] = job.created_at
    # Reuse the timestamp set above rather than calling time.time() again, so
    # the job object and the metadata file can never disagree
    job_metadata["completed_at"] = job.completed_at
    job_metadata["docker_image_id"] = container_metadata["Image"]
    # convert exit code to str so 0 exit codes get logged
    job_metadata["exit_code"] = str(container_metadata["State"]["ExitCode"])
    job_metadata["container_metadata"] = container_metadata
    job_metadata["outputs"] = outputs
    job_metadata["commit"] = job.study.commit
    job_metadata["local_run"] = True

    # Dump useful info in log directory
    log_dir = get_log_dir(job)
    ensure_overwritable(log_dir / "logs.txt", log_dir / "metadata.json")
    write_log_file(job, job_metadata, log_dir / "logs.txt")
    with open(log_dir / "metadata.json", "w") as f:
        json.dump(job_metadata, f, indent=2)

    # Copy logs to workspace
    workspace_dir = get_high_privacy_workspace(job.workspace)
    metadata_log_file = workspace_dir / METADATA_DIR / f"{job.action}.log"
    copy_file(log_dir / "logs.txt", metadata_log_file)
    log.info(f"Logs written to: {metadata_log_file}")

    # Extract outputs to workspace
    ensure_overwritable(*[workspace_dir / f for f in outputs.keys()])
    volume = volume_name(job)
    for filename in outputs.keys():
        # Bug fix: the log message previously interpolated nothing, so the
        # filename being extracted was never recorded
        log.info(f"Extracting output file: {filename}")
        docker.copy_from_volume(volume, filename, workspace_dir / filename)

    # Copy out logs and medium privacy files
    medium_privacy_dir = get_medium_privacy_workspace(job.workspace)
    if medium_privacy_dir:
        copy_file(
            workspace_dir / METADATA_DIR / f"{job.action}.log",
            medium_privacy_dir / METADATA_DIR / f"{job.action}.log",
        )
        for filename, privacy_level in outputs.items():
            if privacy_level == "moderately_sensitive":
                copy_file(workspace_dir / filename, medium_privacy_dir / filename)

        # this can be removed once osrelease is dead
        write_manifest_file(
            medium_privacy_dir,
            {"repo": job.study.git_repo_url, "workspace": job.workspace},
        )
def main(partial_job_ids, cleanup=False):
    """Kill the jobs matching the given partial ids.

    With cleanup=True, also remove each job's container and volume.
    """
    for job in get_jobs(partial_job_ids):
        # If the job has been previously killed we don't want to overwrite
        # the timestamps here
        if job.state in (State.PENDING, State.RUNNING):
            mark_job_as_failed(job, "Killed by admin")
        # All these docker commands are idempotent
        name = container_name(job)
        docker.kill(name)
        if cleanup:
            docker.delete_container(name)
            docker.delete_volume(volume_name(job))
def find_matching_outputs(job):
    """
    Returns a dict mapping output filenames to their privacy level, plus a
    list of any patterns that had no matches at all
    """
    all_matches = docker.glob_volume_files(volume_name(job), job.output_spec.keys())
    outputs = {}
    unmatched_patterns = []
    for pattern, privacy_level in job.output_spec.items():
        matches = all_matches[pattern]
        if matches:
            # Every file matched by this pattern inherits its privacy level
            outputs.update((name, privacy_level) for name in matches)
        else:
            unmatched_patterns.append(pattern)
    return outputs, unmatched_patterns
def get_status(self, job):
    """Derive the executor state of *job* from its container and volume.

    No container + volume present -> PREPARED; no container, no volume ->
    UNKNOWN; container running -> EXECUTING; container stopped ->
    FINALIZED if results were recorded, otherwise EXECUTED.
    """
    running = docker.container_inspect(
        container_name(job), "State.Running", none_if_not_exists=True
    )
    if running is None:
        # no container for this job found; a surviving volume means the job
        # was prepared but never started
        if docker.volume_exists(volume_name(job)):
            return JobStatus(ExecutorState.PREPARED)
        return JobStatus(ExecutorState.UNKNOWN)
    if running:
        return JobStatus(ExecutorState.EXECUTING)
    if job.id in RESULTS:
        return JobStatus(ExecutorState.FINALIZED)
    # container present but not running, i.e. finished
    return JobStatus(ExecutorState.EXECUTED)
def execute(self, job):
    """Start the docker container for a job that has been prepared.

    Returns the current status unchanged unless the job is PREPARED; on a
    failed container start, returns an ERROR status instead of raising.
    """
    status = self.get_status(job)
    if status.state != ExecutorState.PREPARED:
        # Not ready to run (or already running/finished) — report as-is
        return status

    try:
        docker.run(
            container_name(job),
            [job.image] + job.args,
            volume=(volume_name(job), "/workspace"),
            env=job.env,
            allow_network_access=job.allow_database_access,
            label=LABEL,
            labels=get_job_labels(job),
        )
    except Exception as err:
        return JobStatus(
            ExecutorState.ERROR, f"Failed to start docker container: {err}"
        )

    return JobStatus(ExecutorState.EXECUTING)
def prepare_job(job):
    """Creates a volume and populates it with the repo and input files.

    Returns the volume name. Raises LocalDockerError if the study commit
    cannot be checked out or a requested input file is missing from the
    workspace.
    """
    workspace_dir = get_high_privacy_workspace(job.workspace)
    volume = volume_name(job)
    docker.create_volume(volume, get_job_labels(job))

    # `docker cp` can't create parent directories for us so we make sure all
    # these directories get created when we copy in the code
    extra_dirs = set(Path(filename).parent for filename in job.inputs)

    try:
        if job.study.git_repo_url and job.study.commit:
            copy_git_commit_to_volume(
                volume, job.study.git_repo_url, job.study.commit, extra_dirs
            )
        else:
            # We only encounter jobs without a repo or commit when using the
            # "local_run" command to execute uncommitted local code
            copy_local_workspace_to_volume(volume, workspace_dir, extra_dirs)
    except subprocess.CalledProcessError as exc:
        # Chain the original error so the git/docker output isn't lost
        raise LocalDockerError(
            f"Could not checkout commit {job.study.commit} from {job.study.git_repo_url}"
        ) from exc

    for filename in job.inputs:
        # Bug fix: both messages below previously omitted the filename
        # (f-strings with no placeholder), making failures hard to diagnose
        log.info(f"Copying input file: {filename}")
        if not (workspace_dir / filename).exists():
            raise LocalDockerError(
                f"The file {filename} doesn't exist in workspace {job.workspace} as requested for job {job.id}"
            )
        docker.copy_to_volume(volume, workspace_dir / filename, filename)

    # Hack: see `get_unmatched_outputs`. For some reason this requires a
    # non-empty file so copying `os.devnull` didn't work.
    some_non_empty_file = Path(__file__)
    docker.copy_to_volume(volume, some_non_empty_file, TIMESTAMP_REFERENCE_FILE)
    return volume
def main():
    """Reset all running jobs to PENDING, after operator confirmation.

    Intended to be run only while the job-runner service is stopped, e.g.
    just before a host reboot. Kills and removes the containers/volumes of
    the affected jobs so they restart cleanly.
    """
    print(
        "== DANGER ZONE ==\n"
        "\n"
        "This will kill all running jobs and reset them to the PENDING state, ready\n"
        "to be restarted following a reboot.\n"
        "\n"
        "It should only be run when the job-runner service has been stopped."
        "\n"
    )
    confirm = input("Are you sure you want to continue? (y/N)")
    # Bug fix: `assert` is stripped when Python runs with -O, which would
    # silently skip the confirmation — use an explicit check instead
    if confirm.strip().lower() != "y":
        raise SystemExit("Aborted")

    # Reset all running jobs to pending
    update_where(
        Job, {"state": State.PENDING, "started_at": None}, state=State.RUNNING
    )

    # Make sure all containers and volumes are removed ready to freshly
    # restart the jobs after the reboot
    for job in find_where(Job, state=State.PENDING):
        docker.kill(container_name(job))
        docker.delete_container(container_name(job))
        docker.delete_volume(volume_name(job))