class K8sJobActions(BaseJobActions):
    """
    Used by K8sLando to handle messages at a job specific context.
    """
    def __init__(self, settings):
        super(K8sJobActions, self).__init__(settings)
        self.cluster_api = settings.get_cluster_api()
        self.bespin_job = self.job_api.get_job()
        self.manager = JobManager(self.cluster_api, settings.config, self.bespin_job)

    def _set_job_state(self, state):
        # Keep cached bespin_job state up to date
        super(K8sJobActions, self)._set_job_state(state)
        self.bespin_job.state = state

    def _set_job_step(self, step):
        # Keep cached bespin_job step up to date
        super(K8sJobActions, self)._set_job_step(step)
        self.bespin_job.step = step

    def job_is_at_state_and_step(self, state, step):
        return self.bespin_job.state == state and self.bespin_job.step == step

    def start_job(self, payload):
        """
        Request from user to start running a job. This starts a job to stage user input data into a volume.
        :param payload: StartJobPayload contains job_id we should start
        """
        self._set_job_state(JobStates.RUNNING)
        self._set_job_step(JobSteps.CREATE_VM)
        input_files = self.job_api.get_input_files()
        input_files_size_in_g = self._calculate_input_data_size_in_g(input_files)
        # The stage data volume contains the workflow, job order, file metadata, and the user's input files.
        stage_data_volume_size_in_g = self.config.base_stage_data_volume_size_in_g + input_files_size_in_g
        self._show_status("Creating stage data persistent volumes")
        self.manager.create_stage_data_persistent_volumes(stage_data_volume_size_in_g)
        self.perform_staging_step(input_files)

    @staticmethod
    def _calculate_input_data_size_in_g(input_files):
        total_bytes = 0
        for dds_file in input_files.dds_files:
            total_bytes += dds_file.size
        for url_file in input_files.url_files:
            total_bytes += url_file.size
        return math.ceil(float(total_bytes) / (1024.0 * 1024.0 * 1024.0))

    def perform_staging_step(self, input_files):
        self._set_job_step(JobSteps.STAGING)
        self._show_status("Creating Stage data job")
        job = self.manager.create_stage_data_job(input_files)
        self._show_status("Launched stage data job: {}".format(job.metadata.name))

    def stage_job_complete(self, payload):
        """
        Message from worker that the staging job step is complete and successful.
        Cleans up the stage data job, creates the run workflow volumes, and launches the run workflow job.
        :param payload: JobStepCompletePayload: contains job id and vm_instance_name (unused)
        """
        if not self.job_is_at_state_and_step(JobStates.RUNNING, JobSteps.STAGING):
            # ignore request to perform incompatible step
            logging.info("Ignoring request to run job:{} wrong step/state".format(self.job_id))
            return
        self._set_job_step(JobSteps.RUNNING)
        self._show_status("Cleaning up after stage data")
        self.manager.cleanup_stage_data_job()
        self._show_status("Creating volumes for running workflow")
        self.manager.create_run_workflow_persistent_volumes()
        self.run_workflow_job()

    def run_workflow_job(self):
        self._show_status("Creating run workflow job")
        job = self.manager.create_run_workflow_job()
        self._show_status("Launched run workflow job: {}".format(job.metadata.name))

    def run_job_complete(self, payload):
        """
        Message from worker that the run workflow job step is complete and successful.
        Cleans up the run workflow job and moves on to organizing the output project.
        :param payload: JobStepCompletePayload: contains job id and vm_instance_name (unused)
        """
        if not self.job_is_at_state_and_step(JobStates.RUNNING, JobSteps.RUNNING):
            # ignore request to perform incompatible step
            logging.info("Ignoring request to store output for job:{} wrong step/state".format(self.job_id))
            return
        self.manager.cleanup_run_workflow_job()
        self.organize_output_project()

    def organize_output_project(self):
        self._set_job_step(JobSteps.ORGANIZE_OUTPUT_PROJECT)
        self._show_status("Creating organize output project job")
        methods_document = self.job_api.get_workflow_methods_document(self.bespin_job.workflow.methods_document)
        methods_content = None
        if methods_document:
            methods_content = methods_document.content
        job = self.manager.create_organize_output_project_job(methods_content)
        self._show_status("Launched organize output project job: {}".format(job.metadata.name))

    def organize_output_complete(self, payload):
        if not self.job_is_at_state_and_step(JobStates.RUNNING, JobSteps.ORGANIZE_OUTPUT_PROJECT):
            # ignore request to perform incompatible step
            logging.info("Ignoring request to organize output project for job:{} wrong step/state".format(self.job_id))
            return
        self.manager.cleanup_organize_output_project_job()
        self.save_output()

    def save_output(self):
        store_output_data = self.job_api.get_store_output_job_data()
        self._set_job_step(JobSteps.STORING_JOB_OUTPUT)
        self._show_status("Creating store output job")
        job = self.manager.create_save_output_job(store_output_data.share_dds_ids)
        self._show_status("Launched save output job: {}".format(job.metadata.name))

    def store_job_output_complete(self, payload):
        """
        Message from worker that the store output job step is complete and successful.
        Cleans up the store output job and launches a job to record the resulting output project details.
        :param payload: JobStepCompletePayload: contains job id and vm_instance_name (unused)
        """
        if not self.job_is_at_state_and_step(JobStates.RUNNING, JobSteps.STORING_JOB_OUTPUT):
            # ignore request to perform incompatible step
            logging.info("Ignoring request to cleanup for job:{} wrong step/state".format(self.job_id))
            return
        self.manager.cleanup_save_output_job()
        self._set_job_step(JobSteps.RECORD_OUTPUT_PROJECT)
        self._show_status("Creating record output project job")
        job = self.manager.create_record_output_project_job()
        self._show_status("Launched record output project job: {}".format(job.metadata.name))

    def record_output_project_complete(self, payload):
        """
        Records the output project id and readme file id based on the store output pod logs.
        """
        if not self.job_is_at_state_and_step(JobStates.RUNNING, JobSteps.RECORD_OUTPUT_PROJECT):
            # ignore request to perform incompatible step
            logging.info("Ignoring request to cleanup for job:{} wrong step/state".format(self.job_id))
            return
        project_id, readme_file_id = self.manager.read_record_output_project_details()
        self._show_status("Saving project id {} and readme id {}.".format(project_id, readme_file_id))
        self.job_api.save_project_details(project_id, readme_file_id)
        self.manager.cleanup_record_output_project_job()
        self._show_status("Marking job finished")
        self._set_job_step(JobSteps.NONE)
        self._set_job_state(JobStates.FINISHED)

    def restart_job(self, payload):
        """
        Request from user to resume running a job. It will resume based on the value of bespin_job.step
        returned from the job api.
        Canceled jobs will always restart from the beginning.
        :param payload: RestartJobPayload contains job_id we should restart
        """
        full_restart = False
        if self.bespin_job.state != JobStates.CANCELED:
            self.manager.cleanup_jobs_and_config_maps()
            if self.bespin_job.step == JobSteps.STAGING:
                self._set_job_state(JobStates.RUNNING)
                input_files = self.job_api.get_input_files()
                self.perform_staging_step(input_files)
            elif self.bespin_job.step == JobSteps.RUNNING:
                self._set_job_state(JobStates.RUNNING)
                self.run_workflow_job()
            elif self.bespin_job.step == JobSteps.ORGANIZE_OUTPUT_PROJECT:
                self._set_job_state(JobStates.RUNNING)
                self.organize_output_project()
            elif self.bespin_job.step == JobSteps.STORING_JOB_OUTPUT:
                self._set_job_state(JobStates.RUNNING)
                self.save_output()
            elif self.bespin_job.step == JobSteps.RECORD_OUTPUT_PROJECT:
                self.cannot_restart_step_error(step_name="record output project")
            else:
                full_restart = True
        else:
            full_restart = True
        if full_restart:
            self.manager.cleanup_all()
            self.start_job(None)

    def cancel_job(self, payload):
        """
        Request from user to cancel a running job.
        Sets status to canceled and terminates the associated jobs, config maps, and PVCs.
        :param payload: CancelJobPayload: contains job id we should cancel
        """
        self._set_job_step(JobSteps.NONE)
        self._set_job_state(JobStates.CANCELED)
        self._show_status("Canceling job")
        self.manager.cleanup_all()

    def stage_job_error(self, payload):
        """
        Message from watcher that the staging job had an error.
        :param payload: JobStepErrorPayload: info about error
        """
        self._job_step_failed("Staging job failed", payload)

    def run_job_error(self, payload):
        """
        Message from watcher that the run workflow job had an error.
        :param payload: JobStepErrorPayload: info about error
        """
        self._job_step_failed("Running job failed", payload)

    def organize_output_error(self, payload):
        """
        Message from watcher that the organize output project job had an error.
        :param payload: JobStepErrorPayload: info about error
        """
        self._job_step_failed("Organize output job failed", payload)

    def store_job_output_error(self, payload):
        """
        Message from watcher that the store output project job had an error.
        :param payload: JobStepErrorPayload: info about error
        """
        self._job_step_failed("Storing job output failed", payload)

    def record_output_project_error(self, payload):
        self._job_step_failed("Recording output project failed", payload)

    def _job_step_failed(self, message, payload):
        self._set_job_state(JobStates.ERRORED)
        self._show_status(message)
        self._log_error(message=payload.message)
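
# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the class above): shows how the stage
# data volume size requested by start_job() is derived from the user's input
# files. The SimpleNamespace stand-ins and the base size value below are
# assumptions for demonstration; real values come from the bespin job API and
# the lando config (config.base_stage_data_volume_size_in_g).
# ---------------------------------------------------------------------------
def _example_stage_data_volume_sizing():
    from types import SimpleNamespace
    # Fake input file listing: one 3 GiB DDS file and one 0.5 GiB URL file.
    input_files = SimpleNamespace(
        dds_files=[SimpleNamespace(size=3 * 1024 ** 3)],
        url_files=[SimpleNamespace(size=512 * 1024 ** 2)],
    )
    # The static helper rounds the total bytes up to whole gigabytes: ceil(3.5) == 4.
    input_size_in_g = K8sJobActions._calculate_input_data_size_in_g(input_files)
    base_size_in_g = 2  # hypothetical base_stage_data_volume_size_in_g config value
    # start_job() requests a PVC of base size plus input size: 2 + 4 == 6 GiB here.
    return base_size_in_g + input_size_in_g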
def test_create_run_workflow_job(self):
    mock_cluster_api = Mock()
    mock_config = Mock(storage_class_name='nfs')
    manager = JobManager(cluster_api=mock_cluster_api, config=mock_config, job=self.mock_job)
    manager.create_run_workflow_job()

    # it should have created a job to run the workflow with several volumes mounted
    args, kwargs = mock_cluster_api.create_job.call_args
    name, batch_spec = args
    self.assertEqual(name, 'run-workflow-51-jpb')  # job name
    self.assertEqual(batch_spec.name, 'run-workflow-51-jpb')  # job spec name
    self.assertEqual(batch_spec.labels['bespin-job-id'], '51')  # Bespin job id stored in a label
    self.assertEqual(batch_spec.labels['bespin-job-step'], 'run_workflow')  # store the job step in a label

    job_container = batch_spec.container
    self.assertEqual(job_container.name, 'run-workflow-51-jpb')  # container name
    self.assertEqual(job_container.image_name, self.mock_job.k8s_settings.run_workflow.image_name,
                     'run workflow image name is based on job settings')
    expected_bash_command = 'cwltool --cachedir /bespin/output-data/tmpout/ ' \
                            '--outdir /bespin/output-data/results/ ' \
                            '--max-ram 1G --max-cores 2 ' \
                            '--usage-report /bespin/output-data/job-51-jpb-resource-usage.json ' \
                            '--stdout /bespin/output-data/bespin-workflow-output.json ' \
                            '--stderr /bespin/output-data/bespin-workflow-output.log ' \
                            '/bespin/job-data/workflow/someurl.cwl#main ' \
                            '/bespin/job-data/job-order.json'.split(' ')
    self.assertEqual(job_container.command, expected_bash_command,
                     'run workflow command combines job settings and staged files')
    self.assertEqual(job_container.env_dict['CALRISSIAN_POD_NAME'].field_path, 'metadata.name',
                     'We should store the pod name in a CALRISSIAN_POD_NAME environment variable')
    self.assertEqual(job_container.requested_cpu, self.mock_job.k8s_settings.run_workflow.cpus,
                     'run workflow requested cpu is based on a job setting')
    self.assertEqual(job_container.requested_memory, self.mock_job.k8s_settings.run_workflow.memory,
                     'run workflow requested memory is based on a job setting')

    self.assertEqual(len(job_container.volumes), 3)
    job_data_volume = job_container.volumes[0]
    self.assertEqual(job_data_volume.name, 'job-data-51-jpb')
    self.assertEqual(job_data_volume.mount_path, '/bespin/job-data')
    self.assertEqual(job_data_volume.volume_claim_name, 'job-data-51-jpb')
    self.assertEqual(job_data_volume.read_only, True, 'job data should be a read only volume')

    output_data_volume = job_container.volumes[1]
    self.assertEqual(output_data_volume.name, 'output-data-51-jpb')
    self.assertEqual(output_data_volume.mount_path, '/bespin/output-data')
    self.assertEqual(output_data_volume.volume_claim_name, 'output-data-51-jpb')
    self.assertEqual(output_data_volume.read_only, False)

    system_data_volume = job_container.volumes[2]
    self.assertEqual(system_data_volume.name, 'system-data-51-jpb')
    self.assertEqual(system_data_volume.mount_path,
                     mock_config.run_workflow_settings.system_data_volume.mount_path,
                     'mount path for the system volume is based on a config setting')
    self.assertEqual(system_data_volume.volume_claim_name,
                     mock_config.run_workflow_settings.system_data_volume.volume_claim_name,
                     'pvc name for the system volume is based on a config setting')
    self.assertEqual(system_data_volume.read_only, True,
                     'system data should be read only for running workflow')
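
# A possible refactor, sketched only (this helper is an assumption and is not
# used by the test above): the repeated per-volume assertions could be collapsed
# into one helper covering the attributes checked in test_create_run_workflow_job.
def assert_volume_mounted(self, volume, name, mount_path, claim_name, read_only, msg=None):
    self.assertEqual(volume.name, name)
    self.assertEqual(volume.mount_path, mount_path)
    self.assertEqual(volume.volume_claim_name, claim_name)
    self.assertEqual(volume.read_only, read_only, msg)

# Example use (would replace one block of volume assertions above):
#   self.assert_volume_mounted(job_container.volumes[0], 'job-data-51-jpb',
#                              '/bespin/job-data', 'job-data-51-jpb', True,
#                              'job data should be a read only volume')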