Example #1
    def check_plugin_instance_app_exec_status(self):
        """
        Check a plugin instance's app execution status. If the associated job's
        execution time exceeds the maximum set for the remote compute environment then
        the job is cancelled. Otherwise the job's execution status is fetched from the
        remote and if finished without error then the job's zip file is downloaded and
        unpacked and the output files registered with the DB. Finally a delete request
        is made to remove the job from the remote environment.
        """
        if self.c_plugin_inst.status == 'started':
            job_id = self.str_job_id

            delta_exec_time = timezone.now() - self.c_plugin_inst.start_date
            delta_seconds = delta_exec_time.total_seconds()
            max_exec_seconds = self.c_plugin_inst.compute_resource.max_job_exec_seconds
            if delta_seconds > max_exec_seconds:
                logger.error(
                    f'[CODE13,{job_id}]: Error, job exceeded maximum execution '
                    f'time ({max_exec_seconds} seconds)')
                self.c_plugin_inst.error_code = 'CODE13'
                self.cancel_plugin_instance_app_exec()
                return self.c_plugin_inst.status

            pfcon_url = self.pfcon_client.url
            logger.info(
                f'Sending job status request to pfcon url -->{pfcon_url}<-- for '
                f'job {job_id}')
            try:
                d_resp = self.pfcon_client.get_job_status(job_id, timeout=200)
            except PfconRequestException as e:
                logger.error(
                    f'[CODE02,{job_id}]: Error getting job status at pfcon '
                    f'url -->{pfcon_url}<--, detail: {str(e)}')
                return self.c_plugin_inst.status  # return, CUBE will retry later

            logger.info(
                f'Successful job status response from pfcon url -->{pfcon_url}<--'
                f' for job {job_id}: {json.dumps(d_resp, indent=4)}')
            status = d_resp['compute']['status']
            logger.info(f'Current job {job_id} remote status = {status}')
            logger.info(f'Current job {job_id} DB status = {self.c_plugin_inst.status}')

            summary = self.get_job_status_summary(d_resp)
            self.c_plugin_inst.summary = summary
            raw = json_zip2str(d_resp)
            self.c_plugin_inst.raw = raw
            # only update (atomically) if status='started' to avoid concurrency problems
            PluginInstance.objects.filter(id=self.c_plugin_inst.id,
                                          status='started').update(
                                              summary=summary, raw=raw)

            if status == 'finishedSuccessfully':
                self._handle_finished_successfully_status()
            elif status == 'finishedWithError':
                self._handle_finished_with_error_status()
            elif status == 'undefined':
                self._handle_undefined_status()
        return self.c_plugin_inst.status
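
A quick aside on the atomic update above: the snippet below is a minimal standalone sketch of the same idiom, where `inst`, `summary` and `raw` stand in for the values computed inside the method (illustrative names, not from the source). The row is written only while it is still in the 'started' state, so a concurrent cancellation or final status cannot be overwritten.

# Illustrative sketch, assuming a configured Django environment and the
# PluginInstance model from the examples; `inst`, `summary` and `raw` are stand-ins.
rows_updated = PluginInstance.objects.filter(
    id=inst.id, status='started'        # match the row only if it is still 'started'
).update(summary=summary, raw=raw)      # issues a single UPDATE ... WHERE in the DB
if rows_updated == 0:
    # another worker already moved the instance to a final status; the polled
    # data is dropped rather than clobbering that final state
    pass
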
Example #2
    def check_plugin_instance_app_exec_status(self):
        """
        Check a plugin instance's app execution status. It connects to the remote pfcon
        service to determine job status and if finished without error then downloads
        and unpacks job's zip file and registers output files with the DB. Finally it
        sends a request to delete the job's data from the remote environment.
        """
        if self.c_plugin_inst.status == 'started':
            pfcon_url = self.pfcon_client.url
            job_id = self.str_job_id
            logger.info(f'Sending job status request to pfcon url -->{pfcon_url}<-- for '
                        f'job {job_id}')
            try:
                d_resp = self.pfcon_client.get_job_status(job_id, timeout=200)
            except PfconRequestException as e:
                logger.error(f'[CODE02,{job_id}]: Error getting job status at pfcon '
                             f'url -->{pfcon_url}<--, detail: {str(e)}')
                return self.c_plugin_inst.status  # return, CUBE will retry later

            logger.info(f'Successful job status response from pfcon url -->{pfcon_url}<--'
                        f' for job {job_id}: {json.dumps(d_resp, indent=4)}')
            status = d_resp['compute']['status']
            logger.info(f'Current job {job_id} remote status = {status}')
            logger.info(f'Current job {job_id} DB status = {self.c_plugin_inst.status}')

            summary = self.get_job_status_summary(d_resp)
            self.c_plugin_inst.summary = summary
            raw = json_zip2str(d_resp)
            self.c_plugin_inst.raw = raw
            # only update (atomically) if status='started' to avoid concurrency problems
            PluginInstance.objects.filter(
                id=self.c_plugin_inst.id,
                status='started').update(summary=summary, raw=raw)

            if status in ('finishedSuccessfully', 'finishedWithError', 'undefined'):
                if status == 'finishedSuccessfully':
                    self._handle_finished_successfully_status()
                elif status == 'finishedWithError':
                    self._handle_finished_with_error_status()
                else:
                    self._handle_undefined_status()

                logger.info(f'Deleting job {job_id} data from pfcon at url '
                            f'-->{pfcon_url}<--')
                try:
                    self.pfcon_client.delete_job_data(job_id, timeout=500)
                except PfconRequestException as e:
                    logger.error(f'[CODE12,{job_id}]: Error deleting job data from '
                                 f'pfcon at url -->{pfcon_url}<--, detail: {str(e)}')
                else:
                    logger.info(f'Successfully deleted job {job_id} data from pfcon at '
                                f'url -->{pfcon_url}<--')
        return self.c_plugin_inst.status
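
Before the next example, a brief sketch of the pfcon client surface these status checks rely on. The method names and the shape of the status response mirror the calls above, but the Protocol itself is only an illustrative duck type, not the real pfcon client API.

from typing import Any, Dict, Protocol

class PfconClientLike(Protocol):
    """Illustrative duck type covering only the calls made in the examples above."""

    url: str  # base URL of the pfcon service (appears in the log messages)

    def get_job_status(self, job_id: str, timeout: int) -> Dict[str, Any]:
        """Return the remote job's status document, e.g. {'compute': {'status': ...}}."""
        ...

    def delete_job_data(self, job_id: str, timeout: int) -> None:
        """Remove the job's data from the remote compute environment."""
        ...
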
Example #3
    def run_plugin_instance_app(self):
        """
        Run the plugin instance's app via a call to a remote pfcon service.
        """
        if self.c_plugin_inst.status == 'cancelled':
            return

        plugin = self.c_plugin_inst.plugin
        plugin_type = plugin.meta.type
        inputdirs = []
        try:
            if plugin_type == 'ds':
                inputdirs.append(self.get_previous_output_path())
            else:
                inputdirs.append(self.manage_plugin_instance_app_empty_inputdir())
        except Exception:
            self.c_plugin_inst.status = 'cancelled'  # giving up
            self.save_plugin_instance_final_status()
            return

        d_unextpath_params, d_path_params = self.get_plugin_instance_path_parameters()
        for path_param_value in d_path_params.values():
            # the value of each parameter of type 'path' is a string
            # representing a comma-separated list of paths in obj storage
            inputdirs = inputdirs + path_param_value.split(',')

        # create data file to transmit
        try:
            zip_file = self.create_zip_file(inputdirs)
        except Exception:
            self.c_plugin_inst.status = 'cancelled'  # giving up
            self.save_plugin_instance_final_status()
            return

        # create job description dictionary
        cmd_args = self.get_plugin_instance_app_cmd_args()
        cmd_path_flags = list(d_unextpath_params.keys()) + list(d_path_params.keys())
        job_descriptors = {
            'cmd_args': ' '.join(cmd_args),
            'cmd_path_flags': ','.join(cmd_path_flags),
            'auid': self.c_plugin_inst.owner.username,
            'number_of_workers': str(self.c_plugin_inst.number_of_workers),
            'cpu_limit': str(self.c_plugin_inst.cpu_limit),
            'memory_limit': str(self.c_plugin_inst.memory_limit),
            'gpu_limit': str(self.c_plugin_inst.gpu_limit),
            'image': plugin.dock_image,
            'selfexec': plugin.selfexec,
            'selfpath': plugin.selfpath,
            'execshell': plugin.execshell,
            'type': plugin_type
        }
        pfcon_url = self.pfcon_client.url
        job_id = self.str_job_id
        logger.info(f'Submitting job {job_id} to pfcon url -->{pfcon_url}<--, '
                    f'description: {json.dumps(job_descriptors, indent=4)}')
        try:
            d_resp = self.pfcon_client.submit_job(job_id, job_descriptors,
                                                  zip_file.getvalue(), timeout=9000)
        except PfconRequestException as e:
            logger.error(f'[CODE01,{job_id}]: Error submitting job to pfcon url '
                         f'-->{pfcon_url}<--, detail: {str(e)}')
            self.c_plugin_inst.error_code = 'CODE01'
            self.c_plugin_inst.status = 'cancelled'  # giving up
            self.save_plugin_instance_final_status()
        else:
            logger.info(f'Successfully submitted job {job_id} to pfcon url '
                        f'-->{pfcon_url}<--, response: {json.dumps(d_resp, indent=4)}')
            # update the job status and summary
            self.c_plugin_inst.status = 'started'
            self.c_plugin_inst.summary = self.get_job_status_summary()  # initial status
            self.c_plugin_inst.raw = json_zip2str(d_resp)
            self.c_plugin_inst.save()
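
Taken together with the first two examples, the snippet below sketches the submit-then-poll lifecycle. Only `run_plugin_instance_app`, `check_plugin_instance_app_exec_status` and the 'started' status come from the source; the `PluginInstanceManager` construction, `plugin_inst` and the blocking polling loop are hypothetical wiring for illustration (a real deployment would poll from a periodic task).

import time

# Hypothetical wiring: `manager` wraps a single plugin instance and exposes the
# two methods shown in the examples; the constructor name is an assumption.
manager = PluginInstanceManager(plugin_inst)

manager.run_plugin_instance_app()  # submit to pfcon; sets status to 'started' on success

while manager.check_plugin_instance_app_exec_status() == 'started':
    time.sleep(10)  # job still running remotely; poll again later

# any status other than 'started' is treated as final by the examples above
print(plugin_inst.status)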