def getJobs(self, jobs=None, user=None, as_dict=False):
    """
    Return the job list, reporting requested-but-missing jobs as DONE.

    The parent implementation is queried first; every id in ``jobs`` that
    it did not return gets a placeholder JobInfo whose state is DONE.

    :param jobs: the job ids explicitly asked for (may be None or empty)
    :param user: forwarded unchanged to the parent implementation
    :param as_dict: if True the result maps job_id -> JobInfo, otherwise
        it is a list of JobInfo
    :return: the parent's result, extended in place with the placeholders
    """
    job_stats = super(DirectScheduler, self).getJobs(
        jobs=jobs, user=user, as_dict=as_dict)

    # Ids the parent call already reported
    if as_dict:
        known_ids = set(job_stats.keys())
    else:
        known_ids = set(info.job_id for info in job_stats)

    # Requested ids that the parent call did not report
    missing_ids = (set(jobs) - known_ids) if jobs else set()

    for missing_id in missing_ids:
        placeholder = JobInfo()
        placeholder.job_id = missing_id
        placeholder.job_state = job_states.DONE
        # Owner and wallclock time are unknown for these jobs
        if as_dict:
            job_stats[missing_id] = placeholder
        else:
            job_stats.append(placeholder)

    return job_stats
def update_job_calc_from_detailed_job_info(calc, detailed_job_info):
    """
    Store the scheduler's detailed job info on a JobCalculation.

    If the calculation has no job info stored yet, a new JobInfo is created
    with the calculation's job id and the DONE state before the detailed
    info is attached.

    :param calc: The job calculation
    :param detailed_job_info: the detailed information as returned by the
        scheduler for this job
    """
    from aiida.scheduler.datastructures import JobInfo

    stored_info = calc._get_last_jobinfo()
    if stored_info is not None:
        job_info = stored_info
    else:
        # Nothing stored so far: start from a minimal record marked as DONE
        job_info = JobInfo()
        job_info.job_id = calc.get_job_id()
        job_info.job_state = JOB_STATES.DONE

    job_info.detailedJobinfo = detailed_job_info
    calc._set_last_jobinfo(job_info)
def _parse_joblist_output(self, retval, stdout, stderr): """ Parse the queue output string, as returned by executing the command returned by _get_joblist_command command (qstat -f). Return a list of JobInfo objects, one of each job, each relevant parameters implemented. Note: depending on the scheduler configuration, finished jobs may either appear here, or not. This function will only return one element for each job find in the qstat output; missing jobs (for whatever reason) simply will not appear here. """ # I don't raise because if I pass a list of jobs, I get a non-zero status # if one of the job is not in the list anymore # retval should be zero #if retval != 0: #self.logger.warning("Error in _parse_joblist_output: retval={}; " # "stdout={}; stderr={}".format(retval, stdout, stderr)) # issue a warning if there is any stderr output # but I strip lines containing "Unknown Job Id", that happens # also when I ask for a calculation that has finished # # I also strip for "Job has finished" because this happens for # those schedulers configured to leave the job in the output # of qstat for some time after job completion. 
filtered_stderr = '\n'.join( l for l in stderr.split('\n') if "Unknown Job Id" not in l and "Job has finished" not in l) if filtered_stderr.strip(): self.logger.warning("Warning in _parse_joblist_output, non-empty " "(filtered) stderr='{}'".format(filtered_stderr)) if retval != 0: raise SchedulerError( "Error during qstat parsing (_parse_joblist_output function)") jobdata_raw = [] # will contain raw data parsed from qstat output # Get raw data and split in lines for line_num, l in enumerate(stdout.split('\n'), start=1): # Each new job stanza starts with the string 'Job Id:': I # create a new item in the jobdata_raw list if l.startswith('Job Id:'): jobdata_raw.append( {'id': l.split(':', 1)[1].strip(), 'lines': [], 'warning_lines_idx': []}) # warning_lines_idx: lines that do not start either with # tab or space else: if l.strip(): # This is a non-empty line, therefore it is an attribute # of the last job found if not jobdata_raw: # The list is still empty! (This means that I found a # non-empty line, before finding the first 'Job Id:' # string: it is an error. However this may happen # only before the first job. raise SchedulerParsingError("I did not find the header for the first job") #self.logger.warning("I found some text before the " #"first job: {}".format(l)) else: if l.startswith(' '): # If it starts with a space, it is a new field jobdata_raw[-1]['lines'].append(l) elif l.startswith('\t'): # If a line starts with a TAB, # I append to the previous string # stripping the TAB if not jobdata_raw[-1]['lines']: raise SchedulerParsingError( "Line {} is the first line of the job, but it " "starts with a TAB! ({})".format(line_num, l)) jobdata_raw[-1]['lines'][-1] += l[1:] else: #raise SchedulerParsingError( # "Wrong starting character at line {}! ({})" # "".format(line_num, l)) ## For some reasons, the output of 'comment' and ## 'Variable_List', for instance, can have ## newlines if they are included... 
# I do a ## workaround jobdata_raw[-1]['lines'][-1] += "\n{}".format(l) jobdata_raw[-1]['warning_lines_idx'].append( len(jobdata_raw[-1]['lines']) - 1) # Create dictionary and parse specific fields job_list = [] for job in jobdata_raw: this_job = JobInfo() this_job.job_id = job['id'] lines_without_equals_sign = [i for i in job['lines'] if '=' not in i] # There are lines without equals sign: this is bad if lines_without_equals_sign: # Should I only warn? self.logger.error("There are lines without equals sign! {}" "".format(lines_without_equals_sign)) raise (SchedulerParsingError("There are lines without equals " "sign.")) raw_data = {i.split('=', 1)[0].strip().lower(): i.split('=', 1)[1].lstrip() for i in job['lines'] if '=' in i} ## I ignore the errors for the time being - this seems to be ## a problem if there are \n in the content of some variables? ## I consider this a workaround... #for line_with_warning in set(job['warning_lines_idx']): # if job['lines'][line_with_warning].split( # '=',1)[0].strip().lower() != "comment": # raise SchedulerParsingError( # "Wrong starting character in one of the lines " # "of job {}, and it's not a comment! ({})" # "".format(this_job.job_id, # job['lines'][line_with_warning])) problematic_fields = [] for line_with_warning in set(job['warning_lines_idx']): problematic_fields.append(job['lines'][line_with_warning].split( '=', 1)[0].strip().lower()) if problematic_fields: # These are the fields that contain unexpected newlines raw_data['warning_fields_with_newlines'] = problematic_fields # I believe that exit_status and terminating_signal cannot be # retrieved from the qstat -f output. # I wrap calls in try-except clauses to avoid errors if a field # is missing try: this_job.title = raw_data['job_name'] except KeyError: self.logger.debug("No 'job_name' field for job id " "{}".format(this_job.job_id)) try: this_job.annotation = raw_data['comment'] except KeyError: # Many jobs do not have a comment; I do not complain about it. 
pass #self.logger.debug("No 'comment' field for job id {}".format( # this_job.job_id)) try: job_state_string = raw_data['job_state'] try: this_job.job_state = self._map_status[job_state_string] except KeyError: self.logger.warning("Unrecognized job_state '{}' for job " "id {}".format(job_state_string, this_job.job_id)) this_job.job_state = job_states.UNDETERMINED except KeyError: self.logger.debug("No 'job_state' field for job id {}".format( this_job.job_id)) this_job.job_state = job_states.UNDETERMINED try: this_job.job_substate = raw_data['substate'] except KeyError: self.logger.debug("No 'substate' field for job id {}".format( this_job.job_id)) try: exec_hosts = raw_data['exec_host'].split('+') except KeyError: # No exec_host information found (it may be ok, if the job # is not running) pass else: # parse each host; syntax, from the man page: # hosta/J1+hostb/J2*P+... # where J1 and J2 are an index of the job # on the named host and P is the number of # processors allocated from that host to this job. # P does not appear if it is 1. try: exec_host_list = [] for exec_host in exec_hosts: node = MachineInfo() node.name, data = exec_host.split('/') data = data.split('*') if len(data) == 1: node.jobIndex = int(data[0]) node.num_cpus = 1 elif len(data) == 2: node.jobIndex = int(data[0]) node.num_cpus = int(data[1]) else: raise ValueError("Wrong number of pieces: {} " "instead of 1 or 2 in exec_hosts: " "{}".format(len(data), exec_hosts)) exec_host_list.append(node) this_job.allocated_machines = exec_host_list except Exception as e: self.logger.debug("Problem parsing the node names, I " "got Exception {} with message {}; " "exec_hosts was {}".format( str(type(e)), e.message, exec_hosts)) try: # I strip the part after the @: is this always ok? 
this_job.job_owner = raw_data['job_owner'].split('@')[0] except KeyError: self.logger.debug("No 'job_owner' field for job id {}".format( this_job.job_id)) try: this_job.num_cpus = int(raw_data['resource_list.ncpus']) # TODO: understand if this is the correct field also for # multithreaded (OpenMP) jobs. except KeyError: self.logger.debug("No 'resource_list.ncpus' field for job id " "{}".format(this_job.job_id)) except ValueError: self.logger.warning("'resource_list.ncpus' is not an integer " "({}) for job id {}!".format( raw_data['resource_list.ncpus'], this_job.job_id)) try: this_job.num_mpiprocs = int(raw_data['resource_list.mpiprocs']) # TODO: understand if this is the correct field also for # multithreaded (OpenMP) jobs. except KeyError: self.logger.debug("No 'resource_list.mpiprocs' field for job id " "{}".format(this_job.job_id)) except ValueError: self.logger.warning("'resource_list.mpiprocs' is not an integer " "({}) for job id {}!".format( raw_data['resource_list.mpiprocs'], this_job.job_id)) try: this_job.num_machines = int(raw_data['resource_list.nodect']) except KeyError: self.logger.debug("No 'resource_list.nodect' field for job id " "{}".format(this_job.job_id)) except ValueError: self.logger.warning("'resource_list.nodect' is not an integer " "({}) for job id {}!".format( raw_data['resource_list.nodect'], this_job.job_id)) # Double check of redundant info if (this_job.allocated_machines is not None and this_job.num_machines is not None): if len(this_job.allocated_machines) != this_job.num_machines: self.logger.error("The length of the list of allocated " "nodes ({}) is different from the " "expected number of nodes ({})!".format( len(this_job.allocated_machines), this_job.num_machines)) try: this_job.queue_name = raw_data['queue'] except KeyError: self.logger.debug("No 'queue' field for job id " "{}".format(this_job.job_id)) try: this_job.RequestedWallclockTime = (self._convert_time( raw_data['resource_list.walltime'])) except KeyError: 
self.logger.debug("No 'resource_list.walltime' field for " "job id {}".format(this_job.job_id)) except ValueError: self.logger.warning("Error parsing 'resource_list.walltime' " "for job id {}".format(this_job.job_id)) try: this_job.wallclock_time_seconds = (self._convert_time( raw_data['resources_used.walltime'])) except KeyError: # May not have started yet pass except ValueError: self.logger.warning("Error parsing 'resources_used.walltime' " "for job id {}".format(this_job.job_id)) try: this_job.cpu_time = (self._convert_time( raw_data['resources_used.cput'])) except KeyError: # May not have started yet pass except ValueError: self.logger.warning("Error parsing 'resources_used.cput' " "for job id {}".format(this_job.job_id)) # # ctime: The time that the job was created # mtime: The time that the job was last modified, changed state, # or changed locations. # qtime: The time that the job entered the current queue # stime: The time when the job started execution. # etime: The time that the job became eligible to run, i.e. in a # queued state while residing in an execution queue. try: this_job.submission_time = self._parse_time_string( raw_data['ctime']) except KeyError: self.logger.debug("No 'ctime' field for job id " "{}".format(this_job.job_id)) except ValueError: self.logger.warning("Error parsing 'ctime' for job id " "{}".format(this_job.job_id)) try: this_job.dispatch_time = self._parse_time_string( raw_data['stime']) except KeyError: # The job may not have been started yet pass except ValueError: self.logger.warning("Error parsing 'stime' for job id " "{}".format(this_job.job_id)) # TODO: see if we want to set also finish_time for finished jobs, # if there are any # Everything goes here anyway for debugging purposes this_job.raw_data = raw_data # I append to the list of jobs to return job_list.append(this_job) return job_list
def _parse_joblist_output(self, retval, stdout, stderr): """ Parse the queue output string, as returned by executing the command returned by _get_joblist_command command (qstat -f). Return a list of JobInfo objects, one of each job, each relevant parameters implemented. .. note:: depending on the scheduler configuration, finished jobs may either appear here, or not. This function will only return one element for each job find in the qstat output; missing jobs (for whatever reason) simply will not appear here. """ import re filtered_stderr = '\n'.join(l for l in stderr.split('\n')) if filtered_stderr.strip(): self.logger.warning( "Warning in _parse_joblist_output, non-empty " "(filtered) stderr='{}'".format(filtered_stderr)) if retval != 0: raise SchedulerError( "Error during direct execution parsing (_parse_joblist_output function)" ) # Create dictionary and parse specific fields job_list = [] for line in stdout.split('\n'): if re.search('^\s*PID', line) or line == '': # Skip the header if present continue line = re.sub('^\s+', '', line) job = re.split('\s+', line) this_job = JobInfo() this_job.job_id = job[0] try: job_state_string = job[1] try: if job_state_string[0] == 'S': this_job.job_state = job_states.SUSPENDED else: this_job.job_state = \ _map_status_ps[job_state_string] except KeyError: self.logger.warning("Unrecognized job_state '{}' for job " "id {}".format(job_state_string, this_job.job_id)) this_job.job_state = job_states.UNDETERMINED except KeyError: self.logger.debug("No 'job_state' field for job id {}".format( this_job.job_id)) this_job.job_state = job_states.UNDETERMINED try: # I strip the part after the @: is this always ok? 
this_job.job_owner = job[2] except KeyError: self.logger.debug("No 'job_owner' field for job id {}".format( this_job.job_id)) try: this_job.wallclock_time_seconds = self._convert_time(job[3]) except KeyError: # May not have started yet pass except ValueError: self.logger.warning("Error parsing 'resources_used.walltime' " "for job id {}".format(this_job.job_id)) # I append to the list of jobs to return job_list.append(this_job) return job_list
def _parse_joblist_output(self, retval, stdout, stderr): import xml.dom.minidom if retval != 0: self.logger.error("Error in _parse_joblist_output: retval={}; " "stdout={}; stderr={}".format( retval, stdout, stderr)) raise SchedulerError("Error during joblist retrieval, retval={}".\ format(retval)) if stderr.strip(): self.logger.warning("in _parse_joblist_output for {}: " "there was some text in stderr: {}".format( str(self.transport), stderr)) if stdout: try: xmldata = xml.dom.minidom.parseString(stdout) except xml.parsers.expat.ExpatError: self.logger.error("in sge._parse_joblist_output: " "xml parsing of stdout failed:" "{}".format(stdout)) raise SchedulerParsingError("Error during joblist retrieval," "xml parsing of stdout failed") else: self.logger.error("Error in sge._parse_joblist_output: retval={}; " "stdout={}; stderr={}".format( retval, stdout, stderr)) raise SchedulerError("Error during joblist retrieval," "no stdout produced") try: first_child = xmldata.firstChild second_childs = first_child.childNodes tag_names_sec = [elem.tagName for elem in second_childs \ if elem.nodeType == 1] if not 'queue_info' in tag_names_sec: self.logger.error("Error in sge._parse_joblist_output: " "no queue_info: {}".\ format(stdout)) raise SchedulerError if not 'job_info' in tag_names_sec: self.logger.error("Error in sge._parse_joblist_output: " "no job_info: {}".\ format(stdout)) raise SchedulerError except SchedulerError: self.logger.error("Error in sge._parse_joblist_output: stdout={}"\ .format(stdout)) raise SchedulerError("Error during xml processing, of stdout:" "There is no 'job_info' or no 'queue_info'" "element or there are no jobs!") #If something weird happens while firstChild, pop, etc: except Exception: self.logger.error("Error in sge._parse_joblist_output: stdout={}"\ .format(stdout)) raise SchedulerError("Error during xml processing, of stdout") jobs = [i for i in first_child.getElementsByTagName('job_list')] #jobs = [i for i in 
jobinfo.getElementsByTagName('job_list')] #print [i[0].childNodes[0].data for i in job_numbers if i] joblist = [] for job in jobs: this_job = JobInfo() #In case the user needs more information the xml-data for #each job is stored: this_job.raw_data = job.toxml() try: job_element = job.getElementsByTagName('JB_job_number').pop(0) element_child = job_element.childNodes.pop(0) this_job.job_id = str(element_child.data).strip() if not this_job.job_id: raise SchedulerError except SchedulerError: self.logger.error("Error in sge._parse_joblist_output:" "no job id is given, stdout={}"\ .format(stdout)) raise SchedulerError("Error in sge._parse_joblist_output:" "no job id is given") except IndexError: self.logger.error("No 'job_number' given for job index {} in " "job list, stdout={}".format(jobs.index(job)\ ,stdout)) raise IndexError("Error in sge._parse_joblist_output:" "no job id is given") try: job_element = job.getElementsByTagName('state').pop(0) element_child = job_element.childNodes.pop(0) job_state_string = str(element_child.data).strip() try: this_job.job_state = _map_status_sge[job_state_string] except KeyError: self.logger.warning("Unrecognized job_state '{}' for job " "id {}".format(job_state_string, this_job.job_id)) this_job.job_state = job_states.UNDETERMINED except IndexError: self.logger.warning("No 'job_state' field for job id {} in" "stdout={}".format(this_job.job_id, stdout)) this_job.job_state = job_states.UNDETERMINED try: job_element = job.getElementsByTagName('JB_owner').pop(0) element_child = job_element.childNodes.pop(0) this_job.job_owner = str(element_child.data).strip() except IndexError: self.logger.warning("No 'job_owner' field for job " "id {}".format(this_job.job_id)) try: job_element = job.getElementsByTagName('JB_name').pop(0) element_child = job_element.childNodes.pop(0) this_job.title = str(element_child.data).strip() except IndexError: self.logger.warning("No 'title' field for job " "id {}".format(this_job.job_id)) try: job_element = 
job.getElementsByTagName('queue_name').pop(0) element_child = job_element.childNodes.pop(0) this_job.queue_name = str(element_child.data).strip() except IndexError: if this_job.job_state == job_states.RUNNING: self.logger.warning("No 'queue_name' field for job " "id {}".format(this_job.job_id)) try: job_element = job.getElementsByTagName( 'JB_submission_time').pop(0) element_child = job_element.childNodes.pop(0) time_string = str(element_child.data).strip() try: this_job.submission_time = self._parse_time_string( time_string) except ValueError: self.logger.warning("Error parsing 'JB_submission_time' " "for job id {} ('{}')".format( this_job.job_id, time_string)) except IndexError: try: job_element = job.getElementsByTagName( 'JAT_start_time').pop(0) element_child = job_element.childNodes.pop(0) time_string = str(element_child.data).strip() try: this_job.dispatch_time = self._parse_time_string( time_string) except ValueError: self.logger.warning("Error parsing 'JAT_start_time'" "for job id {} ('{}')".format( this_job.job_id, time_string)) except IndexError: self.logger.warning("No 'JB_submission_time' and no " "'JAT_start_time' field for job " "id {}".format(this_job.job_id)) #There is also cpu_usage, mem_usage, io_usage information available: if this_job.job_state == job_states.RUNNING: try: job_element = job.getElementsByTagName('slots').pop(0) element_child = job_element.childNodes.pop(0) this_job.num_mpiprocs = str(element_child.data).strip() except IndexError: self.logger.warning("No 'slots' field for job " "id {}".format(this_job.job_id)) joblist.append(this_job) #self.logger.debug("joblist final: {}".format(joblist)) return joblist
def _parse_joblist_output(self, retval, stdout, stderr): """ Parse the queue output string, as returned by executing the command returned by _get_joblist_command command, that is here implemented as a list of lines, one for each job, with _field_separator as separator. The order is described in the _get_joblist_command function. Return a list of JobInfo objects, one of each job, each relevant parameters implemented. Note: depending on the scheduler configuration, finished jobs may either appear here, or not. This function will only return one element for each job find in the qstat output; missing jobs (for whatever reason) simply will not appear here. """ num_fields = len(self.fields) # I don't raise because if I pass a list of jobs, # I get a non-zero status # if one of the job is not in the list anymore # retval should be zero #if retval != 0: #self.logger.warning("Error in _parse_joblist_output: retval={}; " # "stdout={}; stderr={}".format(retval, stdout, stderr)) # issue a warning if there is any stderr output and # there is no line containing "Invalid job id specified", that happens # when I ask for specific calculations, and they are all finished if stderr.strip() and "Invalid job id specified" not in stderr: self.logger.warning("Warning in _parse_joblist_output, non-empty " "stderr='{}'".format(stderr.strip())) if retval != 0: raise SchedulerError( "Error during squeue parsing (_parse_joblist_output function)" ) # will contain raw data parsed from output: only lines with the # separator, and already split in fields # I put num_fields, because in this way # if the symbol _field_separator appears in the title (that is # the last field), I don't split the title. # This assumes that _field_separator never # appears in any previous field. 
jobdata_raw = [ l.split(_field_separator, num_fields) for l in stdout.splitlines() if _field_separator in l ] # Create dictionary and parse specific fields job_list = [] for job in jobdata_raw: thisjob_dict = {k[1]: v for k, v in zip(self.fields, job)} this_job = JobInfo() try: this_job.job_id = thisjob_dict['job_id'] this_job.annotation = thisjob_dict['annotation'] job_state_raw = thisjob_dict['state_raw'] except KeyError: # I skip this calculation if I couldn't find this basic info # (I don't append anything to job_list before continuing) self.logger.error("Wrong line length in squeue output! '{}'" "".format(job)) continue try: job_state_string = _map_status_slurm[job_state_raw] except KeyError: self.logger.warning("Unrecognized job_state '{}' for job " "id {}".format(job_state_raw, this_job.job_id)) job_state_string = job_states.UNDETERMINED # QUEUED_HELD states are not specific states in SLURM; # they are instead set with state QUEUED, and then the # annotation tells if the job is held. # I check for 'Dependency', 'JobHeldUser', # 'JobHeldAdmin', 'BeginTime'. # Other states should not bring the job in QUEUED_HELD, I believe # (the man page of slurm seems to be incomplete, for instance # JobHeld* are not reported there; I also checked at the source code # of slurm 2.6 on github (https://github.com/SchedMD/slurm), # file slurm/src/common/slurm_protocol_defs.c, # and these seem all the states to be taken into account for the # QUEUED_HELD status). # There are actually a few others, like possible # failures, or partition-related reasons, but for the moment I # leave them in the QUEUED state. if (job_state_string == job_states.QUEUED and this_job.annotation in [ 'Dependency', 'JobHeldUser', 'JobHeldAdmin', 'BeginTime' ]): job_state_string = job_states.QUEUED_HELD this_job.job_state = job_state_string #### # Up to here, I just made sure that there were at least three # fields, to set the most important fields for a job. 
# I now check if the length is equal to the number of fields if len(job) < num_fields: # I store this job only with the information # gathered up to now, and continue to the next job # Also print a warning self.logger.warning("Wrong line length in squeue output!" "Skipping optional fields. Line: '{}'" "".format(jobdata_raw)) # I append this job before continuing job_list.append(this_job) continue # TODO: store executing_host? this_job.job_owner = thisjob_dict['username'] try: this_job.num_machines = int(thisjob_dict['number_nodes']) except ValueError: self.logger.warning("The number of allocated nodes is not " "an integer ({}) for job id {}!".format( thisjob_dict['number_nodes'], this_job.job_id)) try: this_job.num_mpiprocs = int(thisjob_dict['number_cpus']) except ValueError: self.logger.warning("The number of allocated cores is not " "an integer ({}) for job id {}!".format( thisjob_dict['number_cpus'], this_job.job_id)) # ALLOCATED NODES HERE # string may be in the format # nid00[684-685,722-723,748-749,958-959] # therefore it requires some parsing, that is unnecessary now. 
# I just store is as a raw string for the moment, and I leave # this_job.allocated_machines undefined if this_job.job_state == job_states.RUNNING: this_job.allocated_machines_raw = thisjob_dict[ 'allocated_machines'] this_job.queue_name = thisjob_dict['partition'] try: this_job.requested_wallclock_time_seconds = ( self._convert_time(thisjob_dict['time_limit'])) except ValueError: self.logger.warning("Error parsing the time limit " "for job id {}".format(this_job.job_id)) # Only if it is RUNNING; otherwise it is not meaningful, # and may be not set (in my test, it is set to zero) if this_job.job_state == job_states.RUNNING: try: this_job.wallclock_time_seconds = (self._convert_time( thisjob_dict['time_used'])) except ValueError: self.logger.warning("Error parsing time_used " "for job id {}".format( this_job.job_id)) try: this_job.dispatch_time = self._parse_time_string( thisjob_dict['dispatch_time']) except ValueError: self.logger.warning("Error parsing dispatch_time for job " "id {}".format(this_job.job_id)) try: this_job.submission_time = self._parse_time_string( thisjob_dict['submission_time']) except ValueError: self.logger.warning("Error parsing submission_time for job " "id {}".format(this_job.job_id)) this_job.title = thisjob_dict['job_name'] # Everything goes here anyway for debugging purposes this_job.raw_data = job # Double check of redundant info # Not really useful now, allocated_machines in this # version of the plugin is never set if (this_job.allocated_machines is not None and this_job.num_machines is not None): if len(this_job.allocated_machines) != this_job.num_machines: self.logger.error("The length of the list of allocated " "nodes ({}) is different from the " "expected number of nodes ({})!".format( len(this_job.allocated_machines), this_job.num_machines)) # I append to the list of jobs to return job_list.append(this_job) return job_list
def update_running_calcs_status(authinfo):
    """
    Update the states of calculations in WITHSCHEDULER status belonging
    to user and machine as defined in the 'dbauthinfo' table.

    The scheduler is queried once for the jobs of all such calculations.
    A calculation whose job is reported as DONE, or is not reported at
    all, is considered computed: its detailed job info is retrieved and
    stored, and its state is set to COMPUTED.

    :param authinfo: the entry providing the user (``aiidauser``), the
        computer (``dbcomputer``), the ``enabled`` flag and the transport
    :return: the list of calculations found to be computed, or None if
        ``authinfo`` is not enabled
    """
    from aiida.orm import JobCalculation, Computer
    from aiida.scheduler.datastructures import JobInfo
    from aiida.utils.logger import get_dblogger_extra

    if not authinfo.enabled:
        return

    execlogger.debug("Updating running calc status for user {} "
                     "and machine {}".format(authinfo.aiidauser.email,
                                             authinfo.dbcomputer.name))

    # The iterator over aiida JobCalculation objects is materialised,
    # because the calculations are walked more than once
    calcs_to_inquire = list(
        JobCalculation._get_all_with_state(state=calc_states.WITHSCHEDULER,
                                           computer=authinfo.dbcomputer,
                                           user=authinfo.aiidauser))

    # NOTE: no further check is done that machine and
    # aiidauser are correct for each calc in calcs
    s = Computer(dbcomputer=authinfo.dbcomputer).get_scheduler()
    t = authinfo.get_transport()

    computed = []

    # The connection is not opened if there are no calcs with state
    # WITHSCHEDULER
    if calcs_to_inquire:
        jobids_to_inquire = [str(c.get_job_id()) for c in calcs_to_inquire]

        # Open connection
        with t:
            s.set_transport(t)
            # TODO: Check if we are ok with filtering by job (to make this
            # work, the check on the retval for getJobs had to be removed,
            # because if the job has computed and is not in the output of
            # qstat, it gives a nonzero retval)

            # TODO: catch SchedulerError exception and do something
            # sensible (at least, skip this computer but continue with
            # following ones, and set a counter; set calculations to
            # UNKNOWN after a while?
            # NOTE(review): the user literal below looks like a redacted
            # placeholder - confirm the intended value.
            if s.get_feature('can_query_by_user'):
                found_jobs = s.getJobs(user="******", as_dict=True)
            else:
                found_jobs = s.getJobs(jobs=jobids_to_inquire, as_dict=True)

            # Update the status of the jobs
            for c in calcs_to_inquire:
                # Bound before the try so that the exception handler can
                # log even when get_dblogger_extra itself fails (and never
                # reuses the value of a previous calculation)
                logger_extra = None
                try:
                    logger_extra = get_dblogger_extra(c)
                    t._set_logger_extra(logger_extra)

                    jobid = c.get_job_id()
                    if jobid is None:
                        execlogger.error("JobCalculation {} is WITHSCHEDULER "
                                         "but no job id was found!".format(
                                             c.pk), extra=logger_extra)
                        continue

                    if jobid in found_jobs:
                        # jobinfo: the information returned by the
                        # scheduler for this job
                        jobinfo = found_jobs[jobid]
                        execlogger.debug("Inquirying calculation {} (jobid "
                                         "{}): it has job_state={}".format(
                                             c.pk, jobid, jobinfo.job_state),
                                         extra=logger_extra)
                        # For the moment, FAILED is not defined
                        if jobinfo.job_state in [job_states.DONE]:
                            computed.append(c)
                            try:
                                c._set_state(calc_states.COMPUTED)
                            except ModificationNotAllowed:
                                # Someone already set it, just skip
                                pass

                        # The WITHSCHEDULER state is not set again here:
                        # this would raise a ModificationNotAllowed
                        c._set_scheduler_state(jobinfo.job_state)

                        c._set_last_jobinfo(jobinfo)
                    else:
                        # The calculation is not in the scheduler output
                        execlogger.debug("Inquirying calculation {} (jobid "
                                         "{}): not found, assuming "
                                         "job_state={}".format(
                                             c.pk, jobid, job_states.DONE),
                                         extra=logger_extra)
                        computed.append(c)
                        c._set_scheduler_state(job_states.DONE)
                except Exception as e:
                    # Deliberately broad: a failure on one calculation must
                    # not stop the update of the others.
                    # TODO: implement a counter, after N retrials
                    # set it to a status that
                    # requires the user intervention
                    execlogger.warning("There was an exception for "
                                       "calculation {} ({}): {}".format(
                                           c.pk, e.__class__.__name__, e),
                                       extra=logger_extra)
                    continue

            for c in computed:
                # See above: bound before the try for the handler's sake
                logger_extra = None
                try:
                    logger_extra = get_dblogger_extra(c)
                    try:
                        detailed_jobinfo = s.get_detailed_jobinfo(
                            jobid=c.get_job_id())
                    except NotImplementedError:
                        detailed_jobinfo = (
                            u"AiiDA MESSAGE: This scheduler does not implement "
                            u"the routine get_detailed_jobinfo to retrieve "
                            u"the information on "
                            u"a job after it has finished.")
                    last_jobinfo = c._get_last_jobinfo()
                    if last_jobinfo is None:
                        last_jobinfo = JobInfo()
                        last_jobinfo.job_id = c.get_job_id()
                        last_jobinfo.job_state = job_states.DONE
                    last_jobinfo.detailedJobinfo = detailed_jobinfo
                    c._set_last_jobinfo(last_jobinfo)
                except Exception as e:
                    execlogger.warning("There was an exception while "
                                       "retrieving the detailed jobinfo "
                                       "for calculation {} ({}): {}".format(
                                           c.pk, e.__class__.__name__, e),
                                       extra=logger_extra)
                    continue
                finally:
                    # Set the state to COMPUTED as the very last thing
                    # of this routine; no further change should be done
                    # after this, so that in general the retriever can
                    # just poll for this state, if we want to.
                    try:
                        c._set_state(calc_states.COMPUTED)
                    except ModificationNotAllowed:
                        # Someone already set it, just skip
                        pass

    return computed
def _parse_joblist_output(self, retval, stdout, stderr):
    """
    Parse the queue output string, as returned by executing the
    command returned by _get_joblist_command command, that is here
    implemented as a list of lines, one for each job, with
    _field_separator as separator. The order is described in the
    _get_joblist_command function.

    Return a list of JobInfo objects, one of each job,
    each relevant parameters implemented.

    Note: depending on the scheduler configuration, finished jobs
        may either appear here, or not.
        This function will only return one element for each job
        found in the joblist output; missing jobs (for whatever
        reason) simply will not appear here.

    :param retval: exit status of the joblist command
    :param stdout: standard output of the joblist command
    :param stderr: standard error of the joblist command (only used
        in the log/exception messages)
    :return: a list of JobInfo objects, one per successfully parsed line
    :raise SchedulerError: if ``retval`` is non-zero
    """
    import datetime

    num_fields = len(self._joblist_fields)

    if retval != 0:
        self.logger.warning("Error in _parse_joblist_output: retval={}; "
                            "stdout={}; stderr={}".format(
                                retval, stdout, stderr))
        raise SchedulerError("Error during parsing joblist output, "
                             "retval={}\n"
                             "stdout={}\nstderr={}".format(
                                 retval, stdout, stderr))

    # Raw data parsed from the output: only lines containing the
    # separator, already split in fields.
    # At most (num_fields - 1) splits are performed, so that each line
    # yields at most num_fields items: if _field_separator appears in
    # the title (that is the last field), the title is not split.
    # This assumes that _field_separator never appears in any
    # previous field.
    jobdata_raw = [
        line.split(_field_separator, num_fields - 1)
        for line in stdout.splitlines()
        if _field_separator in line
    ]

    # Create the JobInfo objects and parse specific fields
    job_list = []
    for job in jobdata_raw:

        # Each job should have all fields.
        if len(job) != num_fields:
            # I skip this calculation
            # (I don't append anything to job_list before continuing)
            self.logger.error("Wrong line length in squeue output! '{}'"
                              "".format(job))
            continue

        this_job = JobInfo()
        this_job.job_id = job[0]
        this_job.annotation = job[2]
        job_state_raw = job[1]

        try:
            job_state_string = _map_status_lsf[job_state_raw]
        except KeyError:
            self.logger.warning("Unrecognized job_state '{}' for job "
                                "id {}".format(job_state_raw,
                                               this_job.job_id))
            job_state_string = job_states.UNDETERMINED

        this_job.job_state = job_state_string

        # I get the remaining fields; the first three were already
        # obtained above. The length is exactly num_fields thanks to
        # the check at the beginning of the loop.
        (_, _, _, executing_host, username, number_nodes, number_cpus,
         allocated_machines, partition, finish_time, start_time,
         percent_complete, submission_time, job_name) = job

        this_job.job_owner = username

        try:
            this_job.num_machines = int(number_nodes)
        except ValueError:
            self.logger.warning("The number of allocated nodes is not "
                                "an integer ({}) for job id {}!".format(
                                    number_nodes, this_job.job_id))

        try:
            this_job.num_mpiprocs = int(number_cpus)
        except ValueError:
            self.logger.warning("The number of allocated cores is not "
                                "an integer ({}) for job id {}!".format(
                                    number_cpus, this_job.job_id))

        # ALLOCATED NODES HERE
        # string may be in the format
        # nid00[684-685,722-723,748-749,958-959]
        # therefore it requires some parsing, that is unnecessary now.
        # I just store is as a raw string for the moment, and I leave
        # this_job.allocated_machines undefined
        if this_job.job_state == job_states.RUNNING:
            this_job.allocated_machines_raw = allocated_machines

        this_job.queue_name = partition

        psd_finish_time = self._parse_time_string(finish_time,
                                                  fmt='%b %d %H:%M')
        psd_start_time = self._parse_time_string(start_time,
                                                 fmt='%b %d %H:%M')
        psd_submission_time = self._parse_time_string(submission_time,
                                                      fmt='%b %d %H:%M')

        # Now get the time in seconds which has been used
        # Only if it is RUNNING; otherwise it is not meaningful,
        # and may be not set (in my test, it is set to zero)
        if this_job.job_state == job_states.RUNNING:
            # Reset for every job: if the walltime cannot be computed
            # for this job we must neither hit an unbound local nor
            # reuse the value computed for a previous job.
            requested_walltime = None
            try:
                walltime = psd_finish_time - psd_start_time
                # The time strings do not carry the year, so a job
                # started in December and finishing in January would
                # produce a negative time difference: in that case
                # move the finish time to the following year.
                # Note: we assume that no job will last more than
                # 1 year...
                if walltime.total_seconds() < 0:
                    psd_finish_time = datetime.datetime(
                        year=psd_start_time.year + 1,
                        month=psd_finish_time.month,
                        day=psd_finish_time.day,
                        hour=psd_finish_time.hour,
                        minute=psd_finish_time.minute)
                    walltime = psd_finish_time - psd_start_time
                requested_walltime = walltime
                this_job.requested_wallclock_time_seconds = (
                    requested_walltime.total_seconds())
            except (TypeError, ValueError):
                self.logger.warning("Error parsing the time limit "
                                    "for job id {}".format(
                                        this_job.job_id))

            # The used time is derived from the requested walltime, so
            # it can only be computed when the latter is known (the
            # failure was already logged above otherwise).
            if requested_walltime is not None:
                try:
                    psd_percent_complete = float(
                        percent_complete.strip(' L').strip("%"))
                    this_job.wallclock_time_seconds = (
                        requested_walltime.total_seconds() *
                        psd_percent_complete / 100.)
                except ValueError:
                    self.logger.warning("Error parsing the time used "
                                        "for job id {}".format(
                                            this_job.job_id))

        try:
            this_job.submission_time = psd_submission_time
        except ValueError:
            self.logger.warning("Error parsing submission time for job "
                                "id {}".format(this_job.job_id))

        this_job.title = job_name

        # Everything goes here anyway for debugging purposes
        this_job.raw_data = job

        # Double check of redundant info
        # Not really useful now, allocated_machines in this
        # version of the plugin is never set
        if (this_job.allocated_machines is not None and
                this_job.num_machines is not None):
            if len(this_job.allocated_machines) != this_job.num_machines:
                self.logger.error("The length of the list of allocated "
                                  "nodes ({}) is different from the "
                                  "expected number of nodes ({})!".format(
                                      len(this_job.allocated_machines),
                                      this_job.num_machines))

        # I append to the list of jobs to return
        job_list.append(this_job)

    return job_list