def fail_job(self, run_id, error_message, lsf_log_location=None, input_json_location=None):
    """Handle a failed run: resubmit it when it can be restarted, otherwise mark it failed.

    All work happens while holding the ``run_lock_<run_id>`` lock obtained
    through ``memcache_task_lock``. If the lock is not acquired, a warning is
    logged and nothing else is done.

    Args:
        self: Object exposing ``app.oid`` (presumably the bound task
            instance -- confirm against the decorator, which is not visible here).
        run_id: Identifier handed to ``RunObjectFactory.from_db``.
        error_message: Value passed to ``run.fail``.
        lsf_log_location: Forwarded to ``_job_finished_notify``.
        input_json_location: Forwarded to ``_job_finished_notify``.
    """
    with memcache_task_lock("run_lock_%s" % run_id, self.app.oid) as acquired:
        if not acquired:
            logger.warning("Run %s is processing by another worker" % run_id)
            return

        run = RunObjectFactory.from_db(run_id)
        if run.run_obj.is_failed:
            # The failure was already recorded; do not notify twice.
            logger.info(format_log("Run Fail already processed", obj=run.run_obj))
            return

        restart_args = run.run_obj.set_for_restart()
        if restart_args:
            # A truthy result is a (run_id, output_directory, execution_id) triple
            # describing the run to resubmit.
            restart_id, output_directory, execution_id = restart_args
            submit_job.delay(restart_id, output_directory, execution_id)
            return

        # No restart possible: persist the failure and send out notifications.
        run.fail(error_message)
        run.to_db()

        notifier = run.job_group_notifier
        if notifier:
            notifier_id = str(notifier.id)
        else:
            notifier_id = None
        send_notification.delay(SetCIReviewEvent(notifier_id).to_dict())

        _upload_qc_report(run.run_obj)
        _job_finished_notify(run, lsf_log_location, input_json_location)
def create_run_task(run_id, inputs, output_directory=None):
    """Build a run from its definition, persist it and queue it for submission.

    Args:
        run_id: Identifier passed to ``RunObjectFactory.from_definition`` and
            to ``submit_job.delay``.
        inputs: Input values passed to ``RunObjectFactory.from_definition``.
        output_directory: Optional directory forwarded to ``submit_job.delay``.
    """
    start_message = format_log("Creating and validating run", obj_id=run_id)
    logger.info(start_message)

    new_run = RunObjectFactory.from_definition(run_id, inputs)
    new_run.ready()
    new_run.to_db()

    # Submission is queued only after the run has been written to the database.
    submit_job.delay(run_id, output_directory)

    ready_message = format_log("Run is ready", obj=new_run)
    logger.info(ready_message)
def submit_job(run_id, output_directory=None, execution_id=None):
    """Post a run to the Ridgeback jobs endpoint, resuming an earlier execution when possible.

    Args:
        run_id: Primary key of the ``Run`` to submit.
        output_directory: Target directory; defaults to
            ``<run.app.output_directory>/<run_id>`` when falsy.
        execution_id: When truthy, the execution to resume; it takes
            precedence over the execution id derived from ``run.resume``.

    Raises:
        Exception: If the run does not exist, or if the POST does not answer
            with status code 201.
    """
    try:
        run = Run.objects.get(id=run_id)
    except Run.DoesNotExist:
        raise Exception("Failed to submit a run")

    current = RunObjectFactory.from_db(run_id)

    resume = None
    if run.resume:
        previous = RunObjectFactory.from_db(run.resume)
        if current.equal(previous):
            logger.info(format_log("Resuming run with execution id %s" % previous.run_obj.execution_id, obj=run))
            resume = str(previous.run_obj.execution_id)
        else:
            logger.info(
                format_log("Failed to resume runs as run is not equal to the following run: %s" % str(previous), obj=run)
            )
    if execution_id:
        # An explicitly supplied execution id wins over the derived one.
        resume = execution_id

    if not output_directory:
        output_directory = os.path.join(run.app.output_directory, str(run_id))

    job = current.dump_job(output_directory=output_directory)
    logger.info(format_log("Job ready for submitting", obj=run))

    if resume:
        resume_path = "/v0/jobs/{id}/resume/".format(id=resume)
        url = urljoin(settings.RIDGEBACK_URL, resume_path)
        # A resume request carries only the root directory, not the dumped job.
        job = {"root_dir": output_directory}
    else:
        url = settings.RIDGEBACK_URL + "/v0/jobs/"

    # Optional per-application limits are attached only when set.
    for limit_name in ("walltime", "memlimit"):
        limit_value = getattr(run.app, limit_name)
        if limit_value:
            job[limit_name] = limit_value

    response = requests.post(url, json=job)
    if response.status_code != 201:
        raise Exception("Failed to submit job %s" % run_id)

    run.execution_id = response.json()["id"]
    logger.info(format_log("Job successfully submitted", obj=run))
    run.save()
def test_run_fail_job(self, mock_get_pipeline, memcache_task_lock, send_notification, set_for_restart):
    """Failing a run bumps the operator run's failed counter and stores the error message."""
    with open("runner/tests/run/pair-workflow.cwl", "r") as cwl_file:
        pipeline_definition = json.load(cwl_file)
    with open("runner/tests/run/inputs.json", "r") as inputs_file:
        run_inputs = json.load(inputs_file)

    # Mock configuration: no restart is offered, the lock is always granted.
    mock_get_pipeline.return_value = pipeline_definition
    set_for_restart.return_value = None
    memcache_task_lock.return_value = True
    send_notification.return_value = False

    run = RunObjectFactory.from_definition(str(self.run.id), run_inputs)
    run.to_db()

    operator_run = OperatorRun.objects.first()
    operator_run.runs.add(run.run_obj)
    failed_before = operator_run.num_failed_runs

    fail_job(run.run_id, {"details": "Error has happened"})

    operator_run.refresh_from_db()
    self.assertEqual(operator_run.num_failed_runs, failed_before + 1)

    reloaded_run = RunObjectFactory.from_db(run.run_id)
    self.assertEqual(reloaded_run.message, {"details": "Error has happened"})
def test_run_to_db(self, mock_get_pipeline):
    """A run built from a definition is retrievable by id after ``to_db()``."""
    with open("runner/tests/run/pair-workflow.cwl", "r") as f:
        app = json.load(f)
    with open("runner/tests/run/inputs.json", "r") as f:
        inputs = json.load(f)
    mock_get_pipeline.return_value = app

    run = RunObjectFactory.from_definition(str(self.run.id), inputs)
    run.to_db()

    # A missing row must fail the test with a clear message rather than be
    # swallowed and resurface as an unbound-variable error below.
    try:
        run_obj = Run.objects.get(id=run.run_id)
    except Run.DoesNotExist:
        self.fail("Run %s was not found in the database after to_db()" % run.run_id)

    self.assertEqual(str(run_obj.id), run.run_id)
def test_run_complete_job(
    self, mock_populate_job_group_notifier, mock_get_pipeline, memcache_task_lock, send_notification
):
    """Completing a run bumps the completed counter and registers its output files."""
    with open("runner/tests/run/pair-workflow.cwl", "r") as f:
        app = json.load(f)
    with open("runner/tests/run/inputs.json", "r") as f:
        inputs = json.load(f)

    mock_populate_job_group_notifier.return_value = None
    mock_get_pipeline.return_value = app
    memcache_task_lock.return_value = True
    send_notification.return_value = False

    run = RunObjectFactory.from_definition(str(self.run.id), inputs)
    run.to_db()
    operator_run = OperatorRun.objects.first()
    operator_run.runs.add(run.run_obj)
    num_completed_runs = operator_run.num_completed_runs

    complete_job(run.run_id, self.outputs)

    operator_run.refresh_from_db()
    self.assertEqual(operator_run.num_completed_runs, num_completed_runs + 1)

    run_obj = RunObjectFactory.from_db(run.run_id)
    maf_location = self.outputs["maf"]["location"]
    file_obj = File.objects.filter(path=maf_location.replace("file://", "")).first()
    run_obj.to_db()
    for out in run_obj.outputs:
        if out.name == "maf":
            self.assertEqual(out.value["location"], maf_location)
            self.assertEqual(FileProcessor.get_bid_from_file(file_obj), out.db_value["location"])

    port = Port.objects.filter(run_id=run_obj.run_id, name="bams").first()
    expected_result = (
        "/output/argos_pair_workflow/425194f6-a974-4c2f-995f-f27d7ba54ddc/outputs/test_1.rg.md.abra.printreads.bam",
        "/output/argos_pair_workflow/425194f6-a974-4c2f-995f-f27d7ba54ddc/outputs/test_1.rg.md.abra.printreads.bai",
        "/output/argos_pair_workflow/425194f6-a974-4c2f-995f-f27d7ba54ddc/outputs/test_2.rg.md.abra.printreads.bam",
        "/output/argos_pair_workflow/425194f6-a974-4c2f-995f-f27d7ba54ddc/outputs/test_2.rg.md.abra.printreads.bai",
    )
    # Fetch the files once: indexing an unordered queryset position by
    # position issues one query per index and does not guarantee that each
    # index yields a different row.
    bam_files = list(port.files.all())
    self.assertEqual(len(bam_files), 4)
    for bam_file in bam_files:
        self.assertIn(bam_file.path, expected_result)
def test_restart_run(self, submit_job_task):
    """Restarting an operator run creates a new run that resumes from the failed one."""
    fg = FileGroup.objects.create(name="test", slug="test")
    pipeline = Pipeline.objects.create(name="pipeline", output_directory="/tmp", output_file_group=fg)
    operator_run = OperatorRun.objects.create(num_total_runs=1, num_completed_runs=0, num_failed_runs=1)
    failed_run = Run.objects.create(
        name="failed_run",
        operator_run=operator_run,
        output_directory="/test",
        status=RunStatus.FAILED,
        notify_for_outputs=[],
        app=pipeline,
    )
    # The ports only need to exist on the failed run; the created objects are
    # not referenced again.
    Port.objects.create(run=failed_run, port_type=PortType.INPUT)
    Port.objects.create(run=failed_run, port_type=PortType.OUTPUT)

    response = self.client.post("/v0/run/restart/", {"operator_run_id": operator_run.id}, format="json")
    self.assertEqual(response.status_code, status.HTTP_201_CREATED)

    # Restarted run should have a new ID.
    # Compare as strings so the check stays meaningful whether the submit
    # task received a UUID object or its string form.
    restart_run_id = submit_job_task.call_args[0][0]
    self.assertNotEqual(str(failed_run.id), str(restart_run_id))

    # Restarted run should have resume directory set to original run
    restarted_run = Run.objects.get(id=restart_run_id)
    self.assertEqual(str(failed_run.id), str(restarted_run.resume))

    # Both runs should have same input ports
    restart_run_object = RunObjectFactory.from_db(restart_run_id)
    original_run_object = RunObjectFactory.from_db(failed_run.id)
    self.assertTrue(original_run_object.equal(restart_run_object))
def test_run_creation_from_cwl(self, mock_get_pipeline):
    """The ``pair`` input resolves each R1/R2 entry to the matching fixture file.

    Only the input named ``pair`` is inspected; every other input is skipped.
    """
    with open("runner/tests/run/pair-workflow.cwl", "r") as cwl_file:
        pipeline_definition = json.load(cwl_file)
    with open("runner/tests/run/inputs.json", "r") as inputs_file:
        run_inputs = json.load(inputs_file)
    mock_get_pipeline.return_value = pipeline_definition

    run = RunObjectFactory.from_definition(str(self.run.id), run_inputs)
    run.ready()

    # (index into the pair list, read key, name of the fixture attribute on self)
    expectations = (
        (0, "R1", "file1"),
        (0, "R2", "file2"),
        (1, "R1", "file3"),
        (1, "R2", "file4"),
    )
    for inp in run.inputs:
        if inp.name != "pair":
            continue
        for pair_index, read_key, fixture_name in expectations:
            fixture = getattr(self, fixture_name)
            self.assertEqual(
                inp.db_value[pair_index][read_key][0]["location"],
                "bid://%s" % str(fixture.id),
            )
            self.assertEqual(inp.value[pair_index][read_key][0]["path"], fixture.path)
def complete_job(self, run_id, outputs, lsf_log_location=None, inputs_json_location=None):
    """Mark a run as completed, notify, and queue jobs for individual-run triggers.

    All work happens while holding the ``run_lock_<run_id>`` lock obtained
    through ``memcache_task_lock``. If the lock is not acquired, a warning is
    logged and nothing else is done. If ``run.complete`` raises, the run is
    handed to ``fail_job`` with the exception text instead.

    Args:
        self: Object exposing ``app.oid`` (presumably the bound task
            instance -- confirm against the decorator, which is not visible here).
        run_id: Identifier handed to ``RunObjectFactory.from_db``.
        outputs: Value passed to ``run.complete``.
        lsf_log_location: Forwarded to ``_job_finished_notify``.
        inputs_json_location: Forwarded to ``_job_finished_notify``.
    """
    lock_id = "run_lock_%s" % run_id
    with memcache_task_lock(lock_id, self.app.oid) as acquired:
        if acquired:
            run = RunObjectFactory.from_db(run_id)
            if run.run_obj.is_completed:
                logger.info(format_log("Run Complete already processed", obj=run.run_obj))
                return
            logger.info(format_log("Completing Run", obj=run.run_obj))
            try:
                run.complete(outputs)
            except Exception as e:
                # NOTE(review): fail_job takes the same run_lock_<run_id> lock
                # that is still held here; confirm memcache_task_lock allows
                # the nested acquisition, otherwise the failure is only logged
                # as "processing by another worker".
                fail_job(run_id, str(e))
                return
            run.to_db()
            job_group = run.job_group
            job_group_id = str(job_group.id) if job_group else None
            _job_finished_notify(run, lsf_log_location, inputs_json_location)

            operator_run = run.run_obj.operator_run
            if operator_run is None:
                # Without an operator run there is no operator whose triggers
                # could be followed, so there is nothing to chain.
                return
            parent = str(operator_run.id)
            for trigger in operator_run.operator.from_triggers.filter(run_type=TriggerRunType.INDIVIDUAL):
                create_jobs_from_chaining.delay(
                    trigger.to_operator_id,
                    trigger.from_operator_id,
                    [run_id],
                    job_group_id=job_group_id,
                    parent=parent,
                )
        else:
            logger.warning("Run %s is processing by another worker" % run_id)