def status(self, job_ids):
    ''' Get the status of a list of jobs identified by their ids.

    Args:
        - job_ids (List of ids) : List of identifiers for the jobs

    Returns:
        - List of JobStatus objects corresponding to each job_id.
    '''
    logger.debug("Checking status of: {0}".format(job_ids))
    for job_id in self.resources:
        # `ps -p <pid>` exits 0 only while the process is alive; the echoed
        # "STATUS:$?" line carries that exit code back through the channel.
        retcode, stdout, stderr = self.channel.execute_wait(
            'ps -p {} > /dev/null 2> /dev/null; echo "STATUS:$?" '.format(
                self.resources[job_id]['remote_pid']),
            self.cmd_timeout)
        for line in stdout.split('\n'):
            if line.startswith("STATUS:"):
                status = line.split("STATUS:")[1].strip()
                if status == "0":
                    self.resources[job_id]['status'] = JobStatus(JobState.RUNNING)
                else:
                    self.resources[job_id]['status'] = JobStatus(JobState.FAILED)

    return [self.resources[jid]['status'] for jid in job_ids]
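# A minimal standalone sketch of the liveness check above, run locally with
# subprocess instead of a parsl Channel. `is_pid_running` is an illustrative
# name, not part of the provider; `ps -p` exits 0 only while the PID exists.
import subprocess

def is_pid_running(pid):
    proc = subprocess.run(
        'ps -p {} > /dev/null 2> /dev/null; echo "STATUS:$?"'.format(pid),
        shell=True, capture_output=True, text=True)
    for line in proc.stdout.split('\n'):
        if line.startswith("STATUS:"):
            return line.split("STATUS:")[1].strip() == "0"
    return False

print(is_pid_running(1))       # init/launchd is alive -> True
print(is_pid_running(999999))  # almost certainly dead -> False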
def _status(self):
    '''Internal: Do not call. Returns the status list for a list of job_ids.

    Args:
        self

    Returns:
        [status...] : Status list of all jobs
    '''
    job_id_list = ','.join(self.resources.keys())
    cmd = "squeue --job {0}".format(job_id_list)
    logger.debug("Executing squeue")
    retcode, stdout, stderr = self.execute_wait(cmd)
    logger.debug("squeue returned")

    # execute_wait failed; do not update any job status.
    if retcode != 0:
        logger.warning("squeue failed with non-zero exit code {} - see "
                       "https://github.com/Parsl/parsl/issues/1588".format(retcode))
        return

    jobs_missing = list(self.resources.keys())
    for line in stdout.split('\n'):
        parts = line.split()
        if parts and parts[0] != 'JOBID':
            job_id = parts[0]
            status = translate_table.get(parts[4], JobState.UNKNOWN)
            logger.debug("Updating job {} with slurm status {} to parsl status {}".format(
                job_id, parts[4], status))
            self.resources[job_id]['status'] = JobStatus(status)
            jobs_missing.remove(job_id)

    # squeue does not report on jobs that are not running, so fill in the
    # blanks for missing jobs; we might lose some information about why
    # those jobs failed.
    for missing_job in jobs_missing:
        logger.debug("Updating missing job {} to completed status".format(missing_job))
        self.resources[missing_job]['status'] = JobStatus(JobState.COMPLETED)
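# Illustrative only: how the loop above slices squeue's default table. The
# sample output and translate entries here are assumptions for the demo, not
# parsl's real translate_table; column index 4 is the "ST" state code.
sample_squeue = """\
JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
12345     debug parsl.sub    alice  R       1:02      1 node001
12346     debug parsl.sub    alice PD       0:00      1 (Priority)
"""
sample_translate = {'R': 'RUNNING', 'PD': 'PENDING'}

for line in sample_squeue.split('\n'):
    parts = line.split()
    if parts and parts[0] != 'JOBID':
        print(parts[0], sample_translate.get(parts[4], 'UNKNOWN'))
# -> 12345 RUNNING
# -> 12346 PENDING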
def _least_loaded(self):
    """ Find channels that are not in use.

    This is a generator: each call to next() re-counts running jobs per
    channel and yields an idle channel, or None when every channel is busy.

    Yields
    ------
    channel : Channel object
    None : When there are no more available channels
    """
    while True:
        channel_counts = {channel: 0 for channel in self.channels}
        for job_id in self.resources:
            channel = self.resources[job_id]['channel']
            if self.resources[job_id]['status'].state == JobState.RUNNING:
                channel_counts[channel] = channel_counts.get(channel, 0) + 1
            else:
                channel_counts[channel] = channel_counts.get(channel, 0)

        logger.debug("Channel_counts : {}".format(channel_counts))
        if 0 not in channel_counts.values():
            yield None

        for channel in channel_counts:
            if channel_counts[channel] == 0:
                yield channel
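# Standalone demo of the scan-and-yield pattern above (names are illustrative,
# not parsl's). The generator is held open and consumed with next(); because
# it re-reads `loads` on every pass, updates between calls change its answers.
def least_loaded_demo(loads):
    while True:
        idle = [worker for worker, count in loads.items() if count == 0]
        if not idle:
            yield None
        for worker in idle:
            yield worker

loads = {'host-a': 1, 'host-b': 0}
picker = least_loaded_demo(loads)
print(next(picker))   # -> host-b
loads['host-b'] = 1   # mark it busy
print(next(picker))   # -> None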
def _fail_job_async(self, job_id: Any, message: str):
    """Mark a job that failed to start (and so would not otherwise be
    included in status()) as failed, and report it in status().
    """
    if job_id is None:
        # No real job id exists when submission itself failed, so mint one.
        job_id = "failed-block-{}".format(self._generated_job_id_counter)
        self._generated_job_id_counter += 1
    self._simulated_status[job_id] = JobStatus(JobState.FAILED, message)
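# Minimal runnable sketch of the fallback-id bookkeeping above, with a plain
# tuple standing in for parsl's JobStatus (class and attribute names here are
# illustrative):
class _FailDemo:
    def __init__(self):
        self._generated_job_id_counter = 0
        self._simulated_status = {}

    def fail_job(self, job_id, message):
        if job_id is None:
            job_id = "failed-block-{}".format(self._generated_job_id_counter)
            self._generated_job_id_counter += 1
        self._simulated_status[job_id] = ("FAILED", message)

demo = _FailDemo()
demo.fail_job(None, "submit timed out")
demo.fail_job(None, "bad submit script")
print(sorted(demo._simulated_status))  # -> ['failed-block-0', 'failed-block-1']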
def submit(self, command, tasks_per_node, job_name="parsl.localprovider"):
    ''' Submits the command onto a Local Resource Manager job.
    Submit returns an ID that corresponds to the task that was just submitted.

    If tasks_per_node < 1 : 1/tasks_per_node is provisioned
    If tasks_per_node == 1: A single node is provisioned
    If tasks_per_node > 1 : tasks_per_node nodes are provisioned.

    Args:
        - command : (String) Commandline invocation to be made on the remote side.
        - tasks_per_node (int) : command invocations to be launched per node

    Kwargs:
        - job_name (String): Name for job, must be unique

    Returns:
        - None: At capacity, cannot provision more
        - job_id: (string) Identifier for the job
    '''
    job_name = "{0}.{1}".format(job_name, time.time())

    # Set script path
    script_path = "{0}/{1}.sh".format(self.script_dir, job_name)
    script_path = os.path.abspath(script_path)

    wrap_command = self.worker_init + '\n' + self.launcher(command, tasks_per_node, self.nodes_per_block)

    self._write_submit_script(wrap_command, script_path)

    job_id = None
    remote_pid = None
    if (self.move_files is None and not isinstance(self.channel, LocalChannel)) or (self.move_files):
        logger.debug("Pushing start script")
        script_path = self.channel.push_file(script_path, self.channel.script_dir)

    logger.debug("Launching in remote mode")
    # Bash would not return until the streams are closed, so we redirect them
    # to an .out file, background the script, and echo its PID.
    cmd = 'bash {0} > {0}.out 2>&1 & \n echo "PID:$!" '.format(script_path)
    retcode, stdout, stderr = self.channel.execute_wait(cmd, self.cmd_timeout)
    for line in stdout.split('\n'):
        if line.startswith("PID:"):
            remote_pid = line.split("PID:")[1].strip()
            job_id = remote_pid
    if job_id is None:
        logger.warning("Channel failed to start remote command/retrieve PID")

    self.resources[job_id] = {'job_id': job_id,
                              'status': JobStatus(JobState.RUNNING),
                              'remote_pid': remote_pid}

    return job_id
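# The detach-and-report trick above, shown standalone. Assumes a POSIX shell;
# /tmp/script.sh is a placeholder path. '&' backgrounds the script, the
# redirects keep bash from waiting on open streams, and "PID:$!" echoes the
# background process id for the caller to parse.
import subprocess

cmd = 'bash /tmp/script.sh > /tmp/script.sh.out 2>&1 & \n echo "PID:$!"'
proc = subprocess.run(cmd, shell=True, capture_output=True, text=True)
for line in proc.stdout.split('\n'):
    if line.startswith("PID:"):
        print("detached pid:", line.split("PID:")[1].strip())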
def status(self, job_ids):
    """Get the status of a list of jobs identified by their ids.

    Parameters
    ----------
    job_ids : list of str
        Identifiers for the jobs.

    Returns
    -------
    list of JobStatus
        The statuses of the requested jobs.
    """
    statuses = []
    logger.info('Listing VMs in resource group')
    for job_id in job_ids:
        try:
            vm = self.compute_client.virtual_machines.get(
                self.group_name, job_id, expand='instanceView')
            status = vm.instance_view.statuses[1].display_status
            statuses.append(JobStatus(translate_table.get(status, JobState.UNKNOWN)))
        # This only happens when the VM is in ProvisionState/Pending
        except IndexError:
            statuses.append(JobStatus(JobState.PENDING))
    return statuses
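# Illustrative only: the kind of translate_table the Azure status() above
# depends on. Azure's instance view exposes display strings such as
# "VM running"; the exact keys and target states below are assumptions, not
# the provider's real table.
sample_translate = {
    'VM running': 'RUNNING',
    'VM stopped': 'CANCELLED',
    'VM deallocated': 'COMPLETED',
}
print(sample_translate.get('VM running', 'UNKNOWN'))   # -> RUNNING
print(sample_translate.get('VM starting', 'UNKNOWN'))  # -> UNKNOWN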