def check_plugin_instance_app_exec_status(self):
    """
    Check a plugin instance's app execution status.

    If the job has been running longer than the maximum configured for the
    remote compute environment it is cancelled. Otherwise the current
    execution status is fetched from the remote pfcon service, persisted on
    the plugin instance, and — when the job reached a terminal state — the
    matching handler is invoked (downloading/registering outputs on success,
    recording the error otherwise).

    Returns the plugin instance's (possibly updated) DB status.
    """
    # Nothing to do unless the job is known to be running.
    if self.c_plugin_inst.status != 'started':
        return self.c_plugin_inst.status

    job_id = self.str_job_id
    max_exec_seconds = self.c_plugin_inst.compute_resource.max_job_exec_seconds
    delta_seconds = (timezone.now() - self.c_plugin_inst.start_date).total_seconds()
    if delta_seconds > max_exec_seconds:
        # Job overran its allowance: record the error code and cancel it.
        logger.error(
            f'[CODE13,{job_id}]: Error, job exceeded maximum execution '
            f'time ({max_exec_seconds} seconds)')
        self.c_plugin_inst.error_code = 'CODE13'
        self.cancel_plugin_instance_app_exec()
        return self.c_plugin_inst.status

    pfcon_url = self.pfcon_client.url
    logger.info(
        f'Sending job status request to pfcon url -->{pfcon_url}<-- for '
        f'job {job_id}')
    try:
        d_resp = self.pfcon_client.get_job_status(job_id, timeout=200)
    except PfconRequestException as e:
        logger.error(
            f'[CODE02,{job_id}]: Error getting job status at pfcon '
            f'url -->{pfcon_url}<--, detail: {str(e)}')
        return self.c_plugin_inst.status  # return, CUBE will retry later

    logger.info(
        f'Successful job status response from pfcon url -->{pfcon_url}<--'
        f' for job {job_id}: {json.dumps(d_resp, indent=4)}')
    status = d_resp['compute']['status']
    logger.info(f'Current job {job_id} remote status = {status}')
    logger.info(
        f'Current job {job_id} DB status = {self.c_plugin_inst.status}'
    )

    summary = self.get_job_status_summary(d_resp)
    raw = json_zip2str(d_resp)
    self.c_plugin_inst.summary = summary
    self.c_plugin_inst.raw = raw
    # only update (atomically) if status='started' to avoid concurrency problems
    PluginInstance.objects.filter(
        id=self.c_plugin_inst.id, status='started'
    ).update(summary=summary, raw=raw)

    # Dispatch on the terminal remote states; any other status is left alone.
    terminal_handlers = {
        'finishedSuccessfully': self._handle_finished_successfully_status,
        'finishedWithError': self._handle_finished_with_error_status,
        'undefined': self._handle_undefined_status,
    }
    handler = terminal_handlers.get(status)
    if handler is not None:
        handler()
    return self.c_plugin_inst.status
def check_plugin_instance_app_exec_status(self):
    """
    Check a plugin instance's app execution status.

    Connects to the remote pfcon service to determine the job's status and
    persists it on the plugin instance. When the job reached a terminal state
    (finished successfully, finished with error, or undefined) the matching
    handler runs — on success this downloads and unpacks the job's zip file
    and registers output files with the DB — and a request is then sent to
    delete the job's data from the remote environment.

    Returns the plugin instance's (possibly updated) DB status.
    """
    # Only a running job needs a remote status check.
    if self.c_plugin_inst.status != 'started':
        return self.c_plugin_inst.status

    pfcon_url = self.pfcon_client.url
    job_id = self.str_job_id
    logger.info(f'Sending job status request to pfcon url -->{pfcon_url}<-- for '
                f'job {job_id}')
    try:
        d_resp = self.pfcon_client.get_job_status(job_id, timeout=200)
    except PfconRequestException as e:
        logger.error(f'[CODE02,{job_id}]: Error getting job status at pfcon '
                     f'url -->{pfcon_url}<--, detail: {str(e)}')
        return self.c_plugin_inst.status  # return, CUBE will retry later

    logger.info(f'Successful job status response from pfcon url -->{pfcon_url}<--'
                f' for job {job_id}: {json.dumps(d_resp, indent=4)}')
    status = d_resp['compute']['status']
    logger.info(f'Current job {job_id} remote status = {status}')
    logger.info(f'Current job {job_id} DB status = {self.c_plugin_inst.status}')

    summary = self.get_job_status_summary(d_resp)
    raw = json_zip2str(d_resp)
    self.c_plugin_inst.summary = summary
    self.c_plugin_inst.raw = raw
    # only update (atomically) if status='started' to avoid concurrency problems
    PluginInstance.objects.filter(
        id=self.c_plugin_inst.id, status='started'
    ).update(summary=summary, raw=raw)

    # Dispatch on the terminal remote states; anything else means still running.
    terminal_handlers = {
        'finishedSuccessfully': self._handle_finished_successfully_status,
        'finishedWithError': self._handle_finished_with_error_status,
        'undefined': self._handle_undefined_status,
    }
    handler = terminal_handlers.get(status)
    if handler is not None:
        handler()
        # The job is done either way; clean up its data on the remote.
        logger.info(f'Deleting job {job_id} data from pfcon at url '
                    f'-->{pfcon_url}<--')
        try:
            self.pfcon_client.delete_job_data(job_id, timeout=500)
        except PfconRequestException as e:
            logger.error(f'[CODE12,{job_id}]: Error deleting job data from '
                         f'pfcon at url -->{pfcon_url}<--, detail: {str(e)}')
        else:
            logger.info(f'Successfully deleted job {job_id} data from pfcon at '
                        f'url -->{pfcon_url}<--')
    return self.c_plugin_inst.status
def run_plugin_instance_app(self):
    """
    Run the plugin instance's app via a call to a remote pfcon service.

    Collects the app's input directories (previous plugin output for 'ds'
    plugins, an empty managed input dir otherwise) plus any 'path'-type
    parameter values, packs them into a zip file, builds the job description
    and submits the job to pfcon. On success the instance's status is set to
    'started' and its initial summary/raw response saved; on any failure the
    instance is cancelled and its final status saved.
    """
    if self.c_plugin_inst.status == 'cancelled':
        return
    plugin = self.c_plugin_inst.plugin
    plugin_type = plugin.meta.type
    inputdirs = []
    try:
        if plugin_type == 'ds':
            inputdirs.append(self.get_previous_output_path())
        else:
            inputdirs.append(self.manage_plugin_instance_app_empty_inputdir())
    except Exception:
        self.c_plugin_inst.status = 'cancelled'  # giving up
        self.save_plugin_instance_final_status()
        return
    d_unextpath_params, d_path_params = self.get_plugin_instance_path_parameters()
    # Fix: iterate the values view directly and extend in place instead of
    # copying via a redundant list comprehension and list re-binding.
    for path_param_value in d_path_params.values():
        # the value of each parameter of type 'path' is a string
        # representing a comma-separated list of paths in obj storage
        inputdirs.extend(path_param_value.split(','))
    # create data file to transmit
    try:
        zip_file = self.create_zip_file(inputdirs)
    except Exception:
        self.c_plugin_inst.status = 'cancelled'  # giving up
        self.save_plugin_instance_final_status()
        return
    # create job description dictionary
    cmd_args = self.get_plugin_instance_app_cmd_args()
    cmd_path_flags = list(d_unextpath_params.keys()) + list(d_path_params.keys())
    job_descriptors = {
        'cmd_args': ' '.join(cmd_args),
        'cmd_path_flags': ','.join(cmd_path_flags),
        'auid': self.c_plugin_inst.owner.username,
        'number_of_workers': str(self.c_plugin_inst.number_of_workers),
        'cpu_limit': str(self.c_plugin_inst.cpu_limit),
        'memory_limit': str(self.c_plugin_inst.memory_limit),
        'gpu_limit': str(self.c_plugin_inst.gpu_limit),
        'image': plugin.dock_image,
        'selfexec': plugin.selfexec,
        'selfpath': plugin.selfpath,
        'execshell': plugin.execshell,
        'type': plugin_type
    }
    pfcon_url = self.pfcon_client.url
    job_id = self.str_job_id
    logger.info(f'Submitting job {job_id} to pfcon url -->{pfcon_url}<--, '
                f'description: {json.dumps(job_descriptors, indent=4)}')
    try:
        d_resp = self.pfcon_client.submit_job(job_id, job_descriptors,
                                              zip_file.getvalue(), timeout=9000)
    except PfconRequestException as e:
        logger.error(f'[CODE01,{job_id}]: Error submitting job to pfcon url '
                     f'-->{pfcon_url}<--, detail: {str(e)}')
        self.c_plugin_inst.error_code = 'CODE01'
        self.c_plugin_inst.status = 'cancelled'  # giving up
        self.save_plugin_instance_final_status()
    else:
        logger.info(f'Successfully submitted job {job_id} to pfcon url '
                    f'-->{pfcon_url}<--, response: {json.dumps(d_resp, indent=4)}')
        # update the job status and summary
        self.c_plugin_inst.status = 'started'
        self.c_plugin_inst.summary = self.get_job_status_summary()  # initial status
        self.c_plugin_inst.raw = json_zip2str(d_resp)
        self.c_plugin_inst.save()