def do_sacct(job_ids):
    """Collect SLURM accounting data for the given job ids via ``sacct``.

    Note: there's a lag between when a job finishes and when sacct data
    are available. :(

    :param job_ids: iterable of SLURM job id strings.
    :return: dict mapping job id -> dict of sacct fields (State, CPUTime,
        MaxRSS, ...).  ``.batch`` step records, which carry better
        information, overwrite the parent record under the bare job id.
    """
    cmd = 'sacct --format=' \
          '"State,JobID,CPUTime,MaxRSS,AveRSS,AveCPU,CPUTimeRAW,AveVMSize,MaxVMSize,Elapsed,ExitCode,Start,End" ' \
          '-j %s -P' % ','.join(job_ids)
    out, err = check_output_and_stderr(cmd, preexec_fn=exit_process_group, shell=True)

    parts = out.strip().split("\n")
    # job_id_to_job_info_dict
    all_jobs = dict()
    # first line is the header
    keys = parts[0].split('|')
    # BUG FIX: with `-P` (parseable) output, sacct prints no dashes separator
    # row after the header, so unconditionally skipping the second line
    # (`parts[2:]`) dropped the first data row.  Skip only rows that really
    # are separators (all dashes/spaces) or blank, which is correct for both
    # the parseable and the fixed-width output formats.
    for line in parts[1:]:
        if not line or set(line) <= set('- '):
            continue
        values = line.split('|')
        job_dict = dict(zip(keys, values))
        if 'batch' in job_dict['JobID']:
            # slurm prints these .batch versions of jobids which have better information, overwrite
            job_dict['JobID'] = job_dict['JobID'].replace('.batch', '')
        all_jobs[job_dict['JobID']] = job_dict
    return all_jobs
def do_sacct(job_ids):
    """Query ``sacct`` for accounting info on the given SLURM job ids.

    Note: there's a lag between when a job finishes and when sacct data
    are available. :(

    :param job_ids: iterable of SLURM job id strings.
    :return: dict mapping job id -> dict of sacct fields; ``.batch`` step
        records (which have better information) overwrite the parent record.
    """
    cmd = 'sacct --format=' \
          '"State,JobID,CPUTime,MaxRSS,AveRSS,AveCPU,CPUTimeRAW,AveVMSize,MaxVMSize,Elapsed,ExitCode,Start,End" ' \
          '-j %s -P' % ','.join(job_ids)
    out, err = check_output_and_stderr(cmd,
                                       preexec_fn=exit_process_group,
                                       shell=True)

    lines = out.strip().split("\n")
    keys = lines[0].split('|')  # first line is the header
    all_jobs = dict()  # job_id -> job_info_dict
    # BUG FIX: `sacct -P` emits no dashes separator row, so blindly skipping
    # the second line dropped the first job's record.  Only skip rows that
    # really are separators (all dashes/spaces) or blank.
    for line in lines[1:]:
        if not line or set(line) <= set('- '):
            continue
        job_dict = dict(zip(keys, line.split('|')))
        if 'batch' in job_dict['JobID']:
            # slurm prints these .batch versions of jobids which have better information, overwrite
            job_dict['JobID'] = job_dict['JobID'].replace('.batch', '')
        all_jobs[job_dict['JobID']] = job_dict
    return all_jobs
def sbatch(task):
    """Submit *task* to SLURM via ``sbatch`` and return the new job id as a str.

    stdout/stderr are redirected to the task's output paths; any DRM native
    specification is spliced into the command line verbatim.
    """
    native_spec = task.drm_native_specification if task.drm_native_specification else ''

    cmd = ['sbatch',
           '-o', os.path.abspath(task.output_stdout_path),
           '-e', os.path.abspath(task.output_stderr_path)]
    cmd.extend(native_spec.split())
    cmd.append(task.output_command_script_path)

    out, err = check_output_and_stderr(cmd,
                                       env=os.environ,
                                       preexec_fn=exit_process_group)

    # sbatch prints e.g. "Submitted batch job 12345"; pull out the job id.
    return str(re.search(r'job (\d+)', out).group(1))
def sbatch(task):
    """Submit *task* via ``sbatch``.

    Builds the sbatch command line (stdout/stderr redirection, native
    specification flags, then the command script) and returns the SLURM
    job id parsed from sbatch's output, as a str.
    """
    extra_args = (task.drm_native_specification
                  if task.drm_native_specification else '').split()

    argv = (
        ['sbatch',
         '-o', os.path.abspath(task.output_stdout_path),
         '-e', os.path.abspath(task.output_stderr_path)]
        + extra_args
        + [task.output_command_script_path]
    )

    out, err = check_output_and_stderr(argv, env=os.environ,
                                       preexec_fn=exit_process_group)

    # Output looks like "Submitted batch job <id>".
    match = re.search(r'job (\d+)', out)
    return str(match.group(1))
def _qacct_raw(task, timeout=600, quantum=15):
    """
    Parse qacct output into key/value pairs.

    If qacct reports results in multiple blocks (separated by a row of
    ===='s), the most recently-generated block with valid data is returned.
    If no such block exists, then return the most recently-generated block
    of corrupt data.

    :param task: a task object; only ``drm_jobID`` and ``workflow.log``
        are used here.
    :param timeout: total seconds to keep retrying qacct before giving up.
    :param quantum: seconds to sleep between retries.
    :return: an OrderedDict of qacct field name -> string value.
    :raises ValueError: if no non-empty qacct output appears within
        ``timeout`` seconds.
    :raises EnvironmentError: if a qacct output line cannot be split into
        a key/value pair.
    """
    start = time.time()
    curr_qacct_dict = None
    good_qacct_dict = None

    # qacct data lag behind job completion, so poll up to timeout/quantum times.
    num_retries = int(timeout / quantum)
    for i in xrange(num_retries):
        qacct_returncode = 0
        try:
            qacct_stdout_str, qacct_stderr_str = check_output_and_stderr(
                ['qacct', '-j', unicode(task.drm_jobID)],
                preexec_fn=exit_process_group)
            if qacct_stdout_str.strip():
                # Got non-empty output: stop polling and go parse it.
                break
        except CosmosCalledProcessError as err:
            qacct_stdout_str = err.output.strip()
            qacct_stderr_str = err.stderr.strip()
            qacct_returncode = err.returncode

        if qacct_stderr_str and re.match(r'error: job id \d+ not found', qacct_stderr_str):
            # "not found" is expected while qacct catches up; only worth
            # mentioning after the first attempt.
            if i > 0:
                task.workflow.log.info(
                    '%s SGE (qacct -j %s) reports "not found"; this may mean '
                    'qacct is merely slow, or %s died in the \'qw\' state',
                    task, task.drm_jobID, task.drm_jobID)
        else:
            # Any other failure mode: log the return code and whatever the
            # command printed.
            task.workflow.log.error(
                '%s SGE (qacct -j %s) returned error code %d',
                task, task.drm_jobID, qacct_returncode)
            if qacct_stdout_str or qacct_stderr_str:
                task.workflow.log.error(
                    '%s SGE (qacct -j %s) printed the following',
                    task, task.drm_jobID)
                if qacct_stdout_str:
                    task.workflow.log.error('stdout: "%s"', qacct_stdout_str)
                if qacct_stderr_str:
                    task.workflow.log.error('stderr: "%s"', qacct_stderr_str)

        if i > 0:
            task.workflow.log.info(
                '%s SGE (qacct -j %s) attempt %d failed %d sec after first attempt%s',
                task, task.drm_jobID, i + 1, time.time() - start,
                '. Will recheck job status after %d sec' % quantum if i + 1 < num_retries else '')
        if i + 1 < num_retries:
            sleep_through_signals(timeout=quantum)
    else:
        # fallthrough: all retries failed
        raise ValueError('No valid `qacct -j %s` output after %d tries and %d sec' %
                         (task.drm_jobID, i, time.time() - start))

    # Parse the output: blocks are delimited by rows of ='s; within a block,
    # each line is "<key><whitespace><value>".
    for line in qacct_stdout_str.strip().split('\n'):
        if line.startswith('='):
            if curr_qacct_dict and not _is_corrupt(curr_qacct_dict):
                #
                # Cache this non-corrupt block of qacct data just
                # in case all the more recent blocks are corrupt.
                #
                good_qacct_dict = curr_qacct_dict
            curr_qacct_dict = OrderedDict()
            continue
        try:
            k, v = re.split(r'\s+', line, maxsplit=1)
        except ValueError:
            raise EnvironmentError('%s with drm_jobID=%s has unparseable qacct output:\n%s' %
                                   (task, task.drm_jobID, qacct_stdout_str))
        curr_qacct_dict[k] = v.strip()

    # if the last block of qacct data looks good, promote it
    if curr_qacct_dict and not _is_corrupt(curr_qacct_dict):
        good_qacct_dict = curr_qacct_dict
    return good_qacct_dict if good_qacct_dict else curr_qacct_dict
def qacct(job_id, num_retries=10, quantum=30, logger=None, log_prefix=""):
    """
    Parse qacct output into key/value pairs.

    If qacct reports results in multiple blocks (separated by a row of
    ===='s), the most recently-generated block with valid data is returned.
    If no block with valid data exists, then return the most
    recently-generated block of corrupt data.  Call ``is_corrupt()`` on the
    output of this method to see if the data are suitable for use.

    :param job_id: the SGE job id to look up.
    :param num_retries: how many times to poll qacct before giving up.
    :param quantum: seconds to sleep between polls.
    :param logger: optional logger; defaults to a null logger.
    :param log_prefix: string prepended to every log message.
    :return: an OrderedDict of qacct field name -> string value.
    :raises QacctJobNotFoundError: if no non-empty qacct output appears
        after ``num_retries`` attempts.
    :raises EnvironmentError: if a qacct output line cannot be split into
        a key/value pair.
    """
    if not logger:
        logger = _get_null_logger()

    start = time.time()
    curr_qacct_dict = None
    good_qacct_dict = None

    # qacct data lag behind job completion, so poll until output appears.
    for i in xrange(num_retries):
        qacct_returncode = 0
        try:
            qacct_stdout_str, qacct_stderr_str = check_output_and_stderr(
                ['qacct', '-j', unicode(job_id)],
                preexec_fn=exit_process_group)
            if qacct_stdout_str.strip():
                # Got non-empty output: stop polling and go parse it.
                break
        except DetailedCalledProcessError as err:
            qacct_stdout_str = err.output.strip()
            qacct_stderr_str = err.stderr.strip()
            qacct_returncode = err.returncode

        if qacct_stderr_str and re.match(r'error: job id \d+ not found', qacct_stderr_str):
            # "not found" is expected while qacct catches up; only worth
            # mentioning after the first attempt.
            if i > 0:
                logger.info(
                    '%s SGE (qacct -j %s) reports "not found"; this may mean '
                    'qacct is merely slow, or %s died in the \'qw\' state',
                    log_prefix, job_id, job_id)
        else:
            # Any other failure mode: log the return code and whatever the
            # command printed.
            logger.error('%s SGE (qacct -j %s) returned error code %d',
                         log_prefix, job_id, qacct_returncode)
            if qacct_stdout_str or qacct_stderr_str:
                logger.error('%s SGE (qacct -j %s) printed the following',
                             log_prefix, job_id)
                if qacct_stdout_str:
                    logger.error('stdout: "%s"', qacct_stdout_str)
                if qacct_stderr_str:
                    logger.error('stderr: "%s"', qacct_stderr_str)

        if i > 0:
            logger.info(
                '%s SGE (qacct -j %s) attempt %d failed %d sec after first attempt%s',
                log_prefix, job_id, i + 1, time.time() - start,
                '. Will recheck job status after %d sec' % quantum if i + 1 < num_retries else '')
        if i + 1 < num_retries:
            sleep_through_signals(timeout=quantum)
    else:
        # fallthrough: all retries failed
        raise QacctJobNotFoundError(
            '%s No valid SGE (qacct -j %s) output after %d tries over %d sec' %
            (log_prefix, job_id, i, time.time() - start))

    # Parse the output: blocks are delimited by rows of ='s; within a block,
    # each line is "<key><whitespace><value>".
    for line in qacct_stdout_str.strip().split('\n'):
        if line.startswith('='):
            if curr_qacct_dict and not is_corrupt(curr_qacct_dict):
                #
                # Cache this non-corrupt block of qacct data just
                # in case all the more recent blocks are corrupt.
                #
                good_qacct_dict = curr_qacct_dict
            curr_qacct_dict = OrderedDict()
            continue
        try:
            k, v = re.split(r'\s+', line, maxsplit=1)
        except ValueError:
            raise EnvironmentError('%s SGE (qacct -j %s) output is unparseable:\n%s' %
                                   (log_prefix, job_id, qacct_stdout_str))
        curr_qacct_dict[k] = v.strip()

    # if the last block of qacct data looks good, promote it
    if curr_qacct_dict and not is_corrupt(curr_qacct_dict):
        good_qacct_dict = curr_qacct_dict
    return good_qacct_dict if good_qacct_dict else curr_qacct_dict
def _scontrol_raw(task, timeout=600, quantum=15):
    """
    Parse "scontrol show jobid" output into key/value pairs.

    :param task: a task object; only ``drm_jobID`` and ``workflow.log``
        are used here.
    :param timeout: total seconds to keep retrying scontrol before giving up.
    :param quantum: seconds to sleep between retries.
    :return: a dict of scontrol field name -> string value.  If SLURM has
        already forgotten the job id, returns just ``{'JobId': <jobid>}``.
    :raises ValueError: if no non-empty scontrol output appears within
        ``timeout`` seconds.
    :raises EnvironmentError: if the output cannot be parsed as
        whitespace-separated key=value pairs.
    """
    start = time.time()

    # Poll scontrol up to timeout/quantum times.
    num_retries = int(timeout / quantum)
    for i in xrange(num_retries):
        qacct_returncode = 0
        try:
            qacct_stdout_str, qacct_stderr_str = check_output_and_stderr(
                [
                    'scontrol',
                    'show',
                    'jobid',
                    '-d',
                    '-o',
                    unicode(task.drm_jobID)
                ],
                preexec_fn=exit_process_group)
            if qacct_stdout_str.strip():
                # Got non-empty output: stop polling and go parse it.
                break
        except CosmosCalledProcessError as err:
            qacct_stdout_str = err.output.strip()
            qacct_stderr_str = err.stderr.strip()
            qacct_returncode = err.returncode

        if qacct_stderr_str == 'slurm_load_jobs error: Invalid job id specified':
            # too many jobs were scheduled since it finished and the job id was forgotten
            return dict(JobId=task.drm_jobID)
        else:
            # NOTE(review): this branch also runs when the command succeeded
            # but printed empty stdout (returncode 0) — confirm that logging
            # an error here is intended in that case.
            task.workflow.log.error(
                '%s Slurm (scontrol show jobid -d -o %s) returned error code %d',
                task, task.drm_jobID, qacct_returncode)
            if qacct_stdout_str or qacct_stderr_str:
                task.workflow.log.error(
                    '%s Slurm (scontrol show jobid -d -o %s) printed the following',
                    task, task.drm_jobID)
                if qacct_stdout_str:
                    task.workflow.log.error('stdout: "%s"', qacct_stdout_str)
                if qacct_stderr_str:
                    task.workflow.log.error('stderr: "%s"', qacct_stderr_str)

        if i > 0:
            task.workflow.log.info(
                '%s Slurm (scontrol show jobid -d -o %s) attempt %d failed %d sec after first attempt%s',
                task, task.drm_jobID, i + 1, time.time() - start,
                '. Will recheck job status after %d sec' % quantum if i + 1 < num_retries else '')
        if i + 1 < num_retries:
            sleep_through_signals(timeout=quantum)
    else:
        # fallthrough: all retries failed
        raise ValueError(
            'No valid `scontrol show jobid -d -o %s` output after %d tries and %d sec' %
            (task.drm_jobID, i, time.time() - start))

    # scontrol -o prints one long line of space-separated key=value tokens.
    acct_dict = {}
    k, v = None, None
    for kv in qacct_stdout_str.strip().split():
        eq_pos = kv.find('=')
        if eq_pos == -1:
            # add the string to previous value - most likely the previous value contained a white space
            if k is not None:
                acct_dict[k] += (" " + kv)
                continue
            else:
                raise EnvironmentError(
                    '%s with drm_jobID=%s has unparseable "scontrol show jobid -d -o" output:\n%s\n'
                    'Could not find "=" in "%s"' %
                    (task, task.drm_jobID, qacct_stdout_str, kv))
        k, v = kv[:eq_pos], kv[(eq_pos + 1):]
        acct_dict[k] = v
    return acct_dict