def test_create_or_update_jobs(tmp_work_dir):
    """A JobRequest creates a fully-populated pending Job, and re-submitting
    the identical request is idempotent (no duplicate jobs are created)."""
    repo_url = str(Path(__file__).parent.resolve() / "fixtures/git-repo")
    job_request = JobRequest(
        id="123",
        repo_url=repo_url,
        # GIT_DIR=tests/fixtures/git-repo git rev-parse v1
        commit="d1e88b31cbe8f67c58f938adb5ee500d54a69764",
        branch="v1",
        requested_actions=["generate_cohort"],
        cancelled_actions=[],
        workspace="1",
        database_name="dummy",
        original={},
    )
    create_or_update_jobs(job_request)
    created_job = find_one(Job)

    # Every field should have been copied/derived from the request
    assert created_job.job_request_id == "123"
    assert created_job.state == State.PENDING
    assert created_job.repo_url == repo_url
    assert created_job.commit == "d1e88b31cbe8f67c58f938adb5ee500d54a69764"
    assert created_job.workspace == "1"
    assert created_job.action == "generate_cohort"
    assert created_job.wait_for_job_ids == []
    assert created_job.requires_outputs_from == []
    assert created_job.run_command == (
        "cohortextractor:latest generate_cohort --expectations-population=1000"
        " --output-dir=."
    )
    assert created_job.output_spec == {"highly_sensitive": {"cohort": "input.csv"}}
    assert created_job.status_message is None

    # Check no new jobs created from same JobRequest
    create_or_update_jobs(job_request)
    resubmitted_job = find_one(Job)
    assert created_job == resubmitted_job
def test_adding_job_creates_dependencies(tmp_work_dir):
    """Requesting a single action also creates jobs for its transitive
    dependencies, wired together via ``wait_for_job_ids``."""
    create_jobs_with_project_file(make_job_request(action="analyse_data"), TEST_PROJECT)

    analyse = find_one(Job, action="analyse_data")
    prepare_1 = find_one(Job, action="prepare_data_1")
    prepare_2 = find_one(Job, action="prepare_data_2")
    generate = find_one(Job, action="generate_cohort")

    # analyse waits on both prepare steps; each prepare waits on generate;
    # generate has no upstream dependencies.
    assert set(analyse.wait_for_job_ids) == {prepare_1.id, prepare_2.id}
    assert prepare_1.wait_for_job_ids == [generate.id]
    assert prepare_2.wait_for_job_ids == [generate.id]
    assert generate.wait_for_job_ids == []
def test_local_run_copes_with_detritus_of_earlier_interrupted_run(
    extraction_tool, tmp_path
):
    # This test simulates the case where an earlier run has been interrupted (for example by the user pressing ctrl-c).
    # In particular we put a couple of jobs in unfinished states, which they could never be left in under normal
    # operation. The correct behaviour of the local run, which this tests for, is for such unfinished jobs to be marked
    # as cancelled on the next run.
    project_dir = tmp_path / "project"
    shutil.copytree(str(FIXTURE_DIR / "full_project"), project_dir)
    config.DATABASE_FILE = project_dir / "metadata" / "db.sqlite"
    project = load_pipeline(project_dir / "project.yaml")
    database.insert(SavedJobRequest(id="previous-request", original={}))

    def job(job_id, action, state):
        """Build a Job row for `action` in the given `state`, with its
        run command and outputs derived from the project definition."""
        spec = get_action_specification(
            project,
            action,
            using_dummy_data_backend=config.USING_DUMMY_DATA_BACKEND,
        )
        return Job(
            id=job_id,
            job_request_id="previous-request",
            state=state,
            status_message="",
            repo_url=str(project_dir),
            workspace=project_dir.name,
            database_name="a-database",
            action=action,
            wait_for_job_ids=[],
            requires_outputs_from=spec.needs,
            run_command=spec.run,
            output_spec=spec.outputs,
            created_at=int(time.time()),
            updated_at=int(time.time()),
            outputs={},
        )

    # FIXME: consolidate these when databuilder supports more columns in dummy data
    if extraction_tool == "cohortextractor":
        actions = ["generate_cohort", "prepare_data_m_cohortextractor"]
    else:
        actions = ["generate_dataset", "analyse_data_databuilder"]

    database.insert(job(job_id="123", action=actions[0], state=State.RUNNING))
    database.insert(job(job_id="456", action=actions[1], state=State.PENDING))

    assert local_run.main(project_dir=project_dir, actions=[actions[1]])

    # Fetch each job once rather than re-querying the database per assertion:
    # fewer redundant lookups and clearer failure output.
    running_job = database.find_one(Job, id="123")
    pending_job = database.find_one(Job, id="456")
    # Both leftover unfinished jobs must have been flagged as cancelled and failed.
    assert running_job.cancelled
    assert running_job.state == State.FAILED
    assert pending_job.cancelled
    assert pending_job.state == State.FAILED
def test_cancelled_jobs_are_flagged(tmp_work_dir):
    """Only the actions listed in ``cancelled_actions`` get their
    ``cancelled`` flag set when the request is re-processed."""
    job_request = make_job_request(action="analyse_data")
    create_jobs_with_project_file(job_request, TEST_PROJECT)

    job_request.cancelled_actions = ["prepare_data_1", "prepare_data_2"]
    create_or_update_jobs(job_request)

    # Table of action -> expected cancelled flag
    expected_flags = {
        "analyse_data": 0,
        "prepare_data_1": 1,
        "prepare_data_2": 1,
        "generate_cohort": 0,
    }
    for action, flag in expected_flags.items():
        assert find_one(Job, action=action).cancelled == flag
def test_create_or_update_jobs_with_git_error(tmp_work_dir):
    """An unfetchable commit produces a single FAILED job carrying the
    GitError message, with all run-related fields left unset."""
    repo_url = str(Path(__file__).parent.resolve() / "fixtures/git-repo")
    bad_commit = "0" * 40
    job_request = JobRequest(
        id="123",
        repo_url=repo_url,
        commit=bad_commit,
        branch="v1",
        requested_actions=["generate_cohort"],
        cancelled_actions=[],
        workspace="1",
        database_name="dummy",
        original={},
    )
    create_or_update_jobs(job_request)

    failed_job = find_one(Job)
    # Request metadata is still recorded on the failed job
    assert failed_job.job_request_id == "123"
    assert failed_job.state == State.FAILED
    assert failed_job.repo_url == repo_url
    assert failed_job.commit == bad_commit
    assert failed_job.workspace == "1"
    # Nothing run-related could be derived, so these stay None
    assert failed_job.wait_for_job_ids is None
    assert failed_job.requires_outputs_from is None
    assert failed_job.run_command is None
    assert failed_job.output_spec is None
    expected_message = f"GitError: Error fetching commit {bad_commit} from {repo_url}"
    assert failed_job.status_message == expected_message
def test_existing_active_jobs_are_picked_up_when_checking_dependencies(
    tmp_work_dir,
):
    """A later request reuses still-active jobs as dependencies instead of
    scheduling fresh copies of them."""
    create_jobs_with_project_file(
        make_job_request(action="prepare_data_1"), TEST_PROJECT
    )
    prepare_1 = find_one(Job, action="prepare_data_1")
    generate = find_one(Job, action="generate_cohort")
    assert prepare_1.wait_for_job_ids == [generate.id]

    # Now schedule a job which has the above jobs as dependencies
    create_jobs_with_project_file(make_job_request(action="analyse_data"), TEST_PROJECT)

    # Check that it's waiting on the existing jobs
    analyse = find_one(Job, action="analyse_data")
    prepare_2 = find_one(Job, action="prepare_data_2")
    assert set(analyse.wait_for_job_ids) == {prepare_1.id, prepare_2.id}
    assert prepare_2.wait_for_job_ids == [generate.id]
def test_existing_cancelled_jobs_are_ignored_up_when_checking_dependencies(
    tmp_work_dir,
):
    """A cancelled job must not satisfy a dependency: a replacement job is
    spawned and the new dependent waits on the replacement."""
    create_jobs_with_project_file(
        make_job_request(action="generate_cohort"), TEST_PROJECT
    )
    cancelled_generate_job = find_one(Job, action="generate_cohort")
    update_where(Job, {"cancelled": True}, id=cancelled_generate_job.id)

    # Now schedule a job which has the above job as a dependency
    create_jobs_with_project_file(
        make_job_request(action="prepare_data_1"), TEST_PROJECT
    )

    # Check that it's spawned a new instance of the cancelled job and wired up the dependencies correctly
    prepare_job = find_one(Job, action="prepare_data_1")
    replacement_generate_job = find_one(Job, action="generate_cohort", cancelled=0)
    assert replacement_generate_job.id != cancelled_generate_job.id
    assert prepare_job.wait_for_job_ids == [replacement_generate_job.id]
def test_update_excluding_a_field(tmp_work_dir):
    """Fields named in ``exclude_fields`` keep their stored value even when
    the in-memory object has been mutated."""
    job = Job(id="foo123", action="foo", commit="commit-of-glory")
    insert(job)

    # Mutate both fields, but only persist `action`
    job.action = "bar"
    job.commit = "commit-of-doom"
    update(job, exclude_fields=["commit"])

    stored = find_one(Job, id="foo123")
    assert stored.action == "bar"
    assert stored.commit == "commit-of-glory"
def test_basic_roundtrip(tmp_work_dir):
    """Inserting a Job and querying with an ``__in`` filter returns an
    equivalent object, including JSON-serialised fields."""
    job = Job(
        id="foo123",
        job_request_id="bar123",
        state=State.RUNNING,
        output_spec={"hello": [1, 2, 3]},
    )
    insert(job)
    fetched = find_one(Job, job_request_id__in=["bar123", "baz123"])
    assert fetched.id == job.id
    assert fetched.output_spec == job.output_spec
def test_find_one_fails_if_there_is_more_than_one_result(tmp_work_dir):
    """``find_one`` refuses to guess when the filter matches multiple rows."""
    for job_id in ("foo123", "foo456"):
        insert(Job(id=job_id, workspace="the-workspace"))
    with pytest.raises(ValueError):
        find_one(Job, workspace="the-workspace")
def test_find_one_fails_if_there_are_no_results(tmp_work_dir):
    """``find_one`` raises rather than returning None for an empty result."""
    with pytest.raises(ValueError):
        find_one(Job, id="foo123")
def test_find_one_returns_a_single_value(tmp_work_dir):
    """With exactly one matching row, ``find_one`` returns that object."""
    insert(Job(id="foo123", workspace="the-workspace"))
    result = find_one(Job, id="foo123")
    assert result.workspace == "the-workspace"
def test_update(tmp_work_dir):
    """``update`` persists in-memory mutations back to the database."""
    job = Job(id="foo123", action="foo")
    insert(job)
    job.action = "bar"
    update(job)
    stored = find_one(Job, id="foo123")
    assert stored.action == "bar"
def get_flag(name):
    """Return the Flag row stored in the db under ``name``."""
    flag = find_one(Flag, id=name)
    return flag