def qacct(job_id, num_retries=10, quantum=30, logger=None, log_prefix=""):
    """
    Parse qacct output into key/value pairs.

    ``qacct -j <job_id>`` is invoked through ``run_cli_cmd()`` up to
    ``num_retries`` times, sleeping ``quantum`` seconds between attempts, until
    it exits with code 0 and prints something on stdout.

    If qacct reports results in multiple blocks (separated by a row of ===='s),
    the most recently-generated block with valid data is returned. If no block
    with valid data exists, then return the most recently-generated block of
    corrupt data. Call ``is_corrupt()`` on the output of this method to see if
    the data are suitable for use.

    :param job_id: job id handed to ``qacct -j`` (converted with ``str()``).
    :param num_retries: maximum number of qacct invocations.
    :param quantum: seconds to sleep between two invocations.
    :param logger: logger for diagnostics; when falsy, the logger returned by
        ``_get_null_logger()`` is used instead.
    :param log_prefix: text placed at the start of most log and error messages.
    :returns: ``OrderedDict`` mapping qacct field names to stripped values.
    :raises QacctJobNotFoundError: when no invocation produced usable output.
    :raises EnvironmentError: when the output cannot be parsed.
    """
    if not logger:
        logger = _get_null_logger()

    start = time.time()
    attempts_made = 0

    for i in range(num_retries):
        attempts_made = i + 1
        qacct_stdout_str, qacct_stderr_str, qacct_returncode = run_cli_cmd(
            ["qacct", "-j", str(job_id)], logger=logger
        )
        if qacct_returncode == 0 and qacct_stdout_str.strip():
            # qacct returned actual output w/no error code. we're good
            break

        if qacct_stderr_str and re.match(r"error: job id \d+ not found", qacct_stderr_str):
            # "not found" on the very first attempt is not worth a log line
            if i > 0:
                logger.info(
                    '%s SGE (qacct -j %s) reports "not found"; this may mean '
                    "qacct is merely slow, or %s died in the 'qw' state",
                    log_prefix,
                    job_id,
                    job_id,
                )
        else:
            # %s rather than %d: run_cli_cmd() reports a timeout as the
            # string "TIMEOUT", which %d cannot format.
            logger.error(
                "%s SGE (qacct -j %s) returned error code %s",
                log_prefix,
                job_id,
                qacct_returncode,
            )
            if qacct_stdout_str or qacct_stderr_str:
                logger.error("%s SGE (qacct -j %s) printed the following", log_prefix, job_id)
                if qacct_stdout_str:
                    logger.error('stdout: "%s"', qacct_stdout_str)
                if qacct_stderr_str:
                    logger.error('stderr: "%s"', qacct_stderr_str)

        will_retry = i + 1 < num_retries
        if i > 0:
            logger.info(
                "%s SGE (qacct -j %s) attempt %d failed %d sec after first attempt%s",
                log_prefix,
                job_id,
                i + 1,
                time.time() - start,
                (". Will recheck job status after %d sec" % quantum) if will_retry else "",
            )
        if will_retry:
            logger.info(
                "%s Will wait %d sec before calling qacct on %s again",
                log_prefix,
                quantum,
                job_id,
            )
            sleep_through_signals(timeout=quantum)
    else:
        # fallthrough: all retries failed (or none were allowed). Report the
        # number of attempts actually made; the loop index is one short of
        # that and is unbound when the loop body never ran.
        raise QacctJobNotFoundError(
            "%s No valid SGE (qacct -j %s) output after %d tries over %d sec"
            % (log_prefix, job_id, attempts_made, time.time() - start)
        )

    curr_qacct_dict = None
    good_qacct_dict = None

    for line in qacct_stdout_str.strip().split("\n"):
        if line.startswith("="):
            if curr_qacct_dict and not is_corrupt(curr_qacct_dict):
                #
                # Cache this non-corrupt block of qacct data just
                # in case all the more recent blocks are corrupt.
                #
                good_qacct_dict = curr_qacct_dict
            curr_qacct_dict = OrderedDict()
            continue

        fields = re.split(r"\s+", line, maxsplit=1)
        # A data line needs both a key and a value, and must follow a
        # "====" separator (otherwise there is no block to store it in).
        if curr_qacct_dict is None or len(fields) != 2:
            raise EnvironmentError(
                "%s SGE (qacct -j %s) output is unparseable:\n%s"
                % (log_prefix, job_id, qacct_stdout_str)
            )

        k, v = fields
        curr_qacct_dict[k] = v.strip()

    # if the last block of qacct data looks good, promote it
    if curr_qacct_dict and not is_corrupt(curr_qacct_dict):
        good_qacct_dict = curr_qacct_dict

    return good_qacct_dict if good_qacct_dict else curr_qacct_dict
def _qacct_raw(task, timeout=600, quantum=15):
    """
    Parse qacct output into key/value pairs.

    ``qacct -j <task.drm_jobID>`` is run up to ``int(timeout / quantum)``
    times, sleeping ``quantum`` seconds between attempts, until it prints
    something on stdout.

    If qacct reports results in multiple blocks (separated by a row of ===='s),
    the most recently-generated block with valid data is returned. If no such
    block exists, then return the most recently-generated block of corrupt data.

    :param task: object providing ``drm_jobID`` and, for diagnostics,
        ``workflow.log``.
    :param timeout: overall time budget in seconds; only used to derive the
        number of attempts.
    :param quantum: seconds to sleep between two attempts.
    :returns: ``OrderedDict`` mapping qacct field names to stripped values.
    :raises ValueError: when no attempt produced output (including the case
        where ``timeout < quantum`` allows no attempt at all).
    :raises EnvironmentError: when the output cannot be parsed.
    """
    start = time.time()
    job_id = task.drm_jobID
    num_retries = int(timeout / quantum)
    attempts_made = 0

    for i in range(num_retries):
        attempts_made = i + 1
        qacct_returncode = 0
        try:
            qacct_stdout_str, qacct_stderr_str = check_output_and_stderr(
                ['qacct', '-j', str(job_id)],
                preexec_fn=exit_process_group)
            if qacct_stdout_str.strip():
                break
        except CosmosCalledProcessError as err:
            qacct_stdout_str = err.output.strip()
            qacct_stderr_str = err.stderr.strip()
            qacct_returncode = err.returncode

        # Only reached when this attempt failed, so the logger is looked up
        # lazily, exactly when it is needed.
        log = task.workflow.log

        if qacct_stderr_str and re.match(r'error: job id \d+ not found', qacct_stderr_str):
            # "not found" on the very first attempt is not worth a log line
            if i > 0:
                log.info(
                    '%s SGE (qacct -j %s) reports "not found"; this may mean '
                    'qacct is merely slow, or %s died in the \'qw\' state',
                    task, job_id, job_id)
        else:
            log.error(
                '%s SGE (qacct -j %s) returned error code %d',
                task, job_id, qacct_returncode)
            if qacct_stdout_str or qacct_stderr_str:
                log.error(
                    '%s SGE (qacct -j %s) printed the following',
                    task, job_id)
                if qacct_stdout_str:
                    log.error('stdout: "%s"', qacct_stdout_str)
                if qacct_stderr_str:
                    log.error('stderr: "%s"', qacct_stderr_str)

        will_retry = i + 1 < num_retries
        if i > 0:
            log.info(
                '%s SGE (qacct -j %s) attempt %d failed %d sec after first attempt%s',
                task, job_id, i + 1, time.time() - start,
                ('. Will recheck job status after %d sec' % quantum) if will_retry else '')
        if will_retry:
            sleep_through_signals(timeout=quantum)
    else:
        # fallthrough: all retries failed (or none were allowed). Report the
        # number of attempts actually made; the loop index is one short of
        # that and is unbound when the loop body never ran.
        raise ValueError(
            'No valid `qacct -j %s` output after %d tries and %d sec' %
            (job_id, attempts_made, time.time() - start))

    curr_qacct_dict = None
    good_qacct_dict = None

    for line in qacct_stdout_str.strip().split('\n'):
        if line.startswith('='):
            if curr_qacct_dict and not _is_corrupt(curr_qacct_dict):
                #
                # Cache this non-corrupt block of qacct data just
                # in case all the more recent blocks are corrupt.
                #
                good_qacct_dict = curr_qacct_dict
            curr_qacct_dict = OrderedDict()
            continue

        fields = re.split(r'\s+', line, maxsplit=1)
        # A data line needs both a key and a value, and must follow a
        # "====" separator (otherwise there is no block to store it in).
        if curr_qacct_dict is None or len(fields) != 2:
            raise EnvironmentError(
                '%s with drm_jobID=%s has unparseable qacct output:\n%s' %
                (task, job_id, qacct_stdout_str))

        k, v = fields
        curr_qacct_dict[k] = v.strip()

    # if the last block of qacct data looks good, promote it
    if curr_qacct_dict and not _is_corrupt(curr_qacct_dict):
        good_qacct_dict = curr_qacct_dict

    return good_qacct_dict if good_qacct_dict else curr_qacct_dict
def run_cli_cmd(
    args,
    attempts=1,
    interval=15,
    logger=None,
    preexec_fn=exit_process_group,
    timeout=30,
    trust_exit_code=True,
    **kwargs
):
    """
    Run the supplied cmd, optionally retrying some number of times if it fails or times out.

    You can pass through arbitrary arguments to this command. They eventually
    wind up as constructor arguments to subprocess.Popen().

    :param args: command to run, as accepted by ``subprocess.run()``.
    :param attempts: maximum number of times the command is run (at least 1).
    :param interval: seconds to sleep between two attempts.
    :param logger: optional logger; when given, the outcome of every attempt
        is logged (debug level for a final successful attempt, error level
        otherwise).
    :param preexec_fn: forwarded to ``subprocess.run()`` as ``preexec_fn``.
    :param timeout: per-attempt timeout in seconds, forwarded to
        ``subprocess.run()``.
    :param trust_exit_code: when true, a zero exit code ends the retry loop;
        when false, the command must also have printed something on stdout.
    :param kwargs: extra keyword arguments forwarded to ``subprocess.run()``.
    :returns: ``(stdout, stderr, returncode)`` of the last attempt. ``stdout``
        and ``stderr`` are always text; ``returncode`` is the string
        ``"TIMEOUT"`` when the last attempt timed out.
    :raises ValueError: if ``attempts`` is smaller than 1.
    """

    def as_text(stream):
        # TimeoutExpired carries None (nothing captured) or raw bytes even in
        # text mode; normalise so that callers and log lines always see str.
        if stream is None:
            return ""
        if isinstance(stream, bytes):
            return stream.decode(errors="replace")
        return stream

    if attempts < 1:
        # Without at least one attempt there is no result to report.
        raise ValueError("attempts must be at least 1, got %r" % (attempts,))

    while attempts > 0:
        attempts -= 1
        try:
            result = subprocess.run(
                args,
                check=True,
                preexec_fn=preexec_fn,
                stderr=subprocess.PIPE,
                stdout=subprocess.PIPE,
                timeout=timeout,
                universal_newlines=True,
                **kwargs
            )
            # check=True guarantees a zero exit code at this point
            if trust_exit_code or result.stdout:
                attempts = 0
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as exc:
            result = exc

        timed_out = isinstance(result, subprocess.TimeoutExpired)
        stdout = as_text(result.stdout)
        stderr = as_text(result.stderr)

        if logger is not None:
            log_func = logger.error
            details = ": stdout='%s', stderr='%s'" % (stdout.strip(), stderr.strip())
            if timed_out:
                effect = "exceeded %s-sec timeout" % result.timeout
            else:
                effect = "had exit code %s" % result.returncode
                if result.returncode == 0 and attempts == 0:
                    # final, successful attempt: nothing to complain about
                    log_func = logger.debug
                    details = ""

            plan = "will retry in %s sec" % interval if attempts else "final attempt"
            log_func(
                "Call to %s %s (%s)%s",
                args.split()[0] if isinstance(args, str) else args[0],
                effect,
                plan,
                details,
            )

        if attempts:
            sleep_through_signals(timeout=interval)

    returncode = "TIMEOUT" if timed_out else result.returncode
    return stdout, stderr, returncode
def qacct(job_id, num_retries=10, quantum=30, logger=None, log_prefix=""):
    """
    Parse qacct output into key/value pairs.

    ``qacct -j <job_id>`` is run up to ``num_retries`` times, sleeping
    ``quantum`` seconds between attempts, until it prints something on stdout.

    If qacct reports results in multiple blocks (separated by a row of ===='s),
    the most recently-generated block with valid data is returned. If no block
    with valid data exists, then return the most recently-generated block of
    corrupt data. Call ``is_corrupt()`` on the output of this method to see if
    the data are suitable for use.

    :param job_id: job id handed to ``qacct -j`` (converted with ``str()``).
    :param num_retries: maximum number of qacct invocations.
    :param quantum: seconds to sleep between two invocations.
    :param logger: logger for diagnostics; when falsy, the logger returned by
        ``_get_null_logger()`` is used instead.
    :param log_prefix: text placed at the start of most log and error messages.
    :returns: ``OrderedDict`` mapping qacct field names to stripped values.
    :raises QacctJobNotFoundError: when no invocation produced usable output.
    :raises EnvironmentError: when the output cannot be parsed.
    """
    if not logger:
        logger = _get_null_logger()

    start = time.time()
    attempts_made = 0

    for i in range(num_retries):
        attempts_made = i + 1
        qacct_returncode = 0
        try:
            qacct_stdout_str, qacct_stderr_str = check_output_and_stderr(
                ['qacct', '-j', str(job_id)],
                preexec_fn=exit_process_group)
            if qacct_stdout_str.strip():
                break
        except DetailedCalledProcessError as err:
            qacct_stdout_str = err.output.strip()
            qacct_stderr_str = err.stderr.strip()
            qacct_returncode = err.returncode

        if qacct_stderr_str and re.match(r'error: job id \d+ not found', qacct_stderr_str):
            # "not found" on the very first attempt is not worth a log line
            if i > 0:
                logger.info(
                    '%s SGE (qacct -j %s) reports "not found"; this may mean '
                    'qacct is merely slow, or %s died in the \'qw\' state',
                    log_prefix, job_id, job_id)
        else:
            logger.error('%s SGE (qacct -j %s) returned error code %d',
                         log_prefix, job_id, qacct_returncode)
            if qacct_stdout_str or qacct_stderr_str:
                logger.error('%s SGE (qacct -j %s) printed the following',
                             log_prefix, job_id)
                if qacct_stdout_str:
                    logger.error('stdout: "%s"', qacct_stdout_str)
                if qacct_stderr_str:
                    logger.error('stderr: "%s"', qacct_stderr_str)

        will_retry = i + 1 < num_retries
        if i > 0:
            logger.info(
                '%s SGE (qacct -j %s) attempt %d failed %d sec after first attempt%s',
                log_prefix, job_id, i + 1, time.time() - start,
                ('. Will recheck job status after %d sec' % quantum) if will_retry else '')
        if will_retry:
            sleep_through_signals(timeout=quantum)
    else:
        # fallthrough: all retries failed (or none were allowed). Report the
        # number of attempts actually made; the loop index is one short of
        # that and is unbound when the loop body never ran.
        raise QacctJobNotFoundError(
            '%s No valid SGE (qacct -j %s) output after %d tries over %d sec' %
            (log_prefix, job_id, attempts_made, time.time() - start))

    curr_qacct_dict = None
    good_qacct_dict = None

    for line in qacct_stdout_str.strip().split('\n'):
        if line.startswith('='):
            if curr_qacct_dict and not is_corrupt(curr_qacct_dict):
                #
                # Cache this non-corrupt block of qacct data just
                # in case all the more recent blocks are corrupt.
                #
                good_qacct_dict = curr_qacct_dict
            curr_qacct_dict = OrderedDict()
            continue

        fields = re.split(r'\s+', line, maxsplit=1)
        # A data line needs both a key and a value, and must follow a
        # "====" separator (otherwise there is no block to store it in).
        if curr_qacct_dict is None or len(fields) != 2:
            raise EnvironmentError(
                '%s SGE (qacct -j %s) output is unparseable:\n%s' %
                (log_prefix, job_id, qacct_stdout_str))

        k, v = fields
        curr_qacct_dict[k] = v.strip()

    # if the last block of qacct data looks good, promote it
    if curr_qacct_dict and not is_corrupt(curr_qacct_dict):
        good_qacct_dict = curr_qacct_dict

    return good_qacct_dict if good_qacct_dict else curr_qacct_dict
def _scontrol_raw(task, timeout=600, quantum=15):
    """
    Parse "scontrol show jobid" output into key/value pairs.

    ``scontrol show jobid -d -o <task.drm_jobID>`` is run up to
    ``int(timeout / quantum)`` times, sleeping ``quantum`` seconds between
    attempts, until it prints something on stdout. The whitespace-separated
    ``key=value`` tokens of that output are then collected into a dict; a
    token without ``=`` is appended (after a single space) to the value of
    the preceding key.

    :param task: object providing ``drm_jobID`` and, for diagnostics,
        ``workflow.log``.
    :param timeout: overall time budget in seconds; only used to derive the
        number of attempts.
    :param quantum: seconds to sleep between two attempts.
    :returns: dict of the parsed fields, or ``{'JobId': task.drm_jobID}`` when
        scontrol reports the job id as invalid.
    :raises ValueError: when no attempt produced output (including the case
        where ``timeout < quantum`` allows no attempt at all).
    :raises EnvironmentError: when the very first token has no ``=``.
    """
    start = time.time()
    job_id = task.drm_jobID
    num_retries = int(timeout / quantum)
    attempts_made = 0

    for i in range(num_retries):
        attempts_made = i + 1
        returncode = 0
        try:
            stdout_str, stderr_str = check_output_and_stderr(
                ['scontrol', 'show', 'jobid', '-d', '-o', str(job_id)],
                preexec_fn=exit_process_group)
            if stdout_str.strip():
                break
        except CosmosCalledProcessError as err:
            stdout_str = err.output.strip()
            stderr_str = err.stderr.strip()
            returncode = err.returncode

        # Compare the stripped text so that a trailing newline on the
        # success path cannot hide the message.
        if (stderr_str or '').strip() == 'slurm_load_jobs error: Invalid job id specified':
            # too many jobs were scheduled since it finished and the job id was forgotten
            return dict(JobId=job_id)

        # Only reached when this attempt failed, so the logger is looked up
        # lazily, exactly when it is needed.
        log = task.workflow.log
        log.error(
            '%s Slurm (scontrol show jobid -d -o %s) returned error code %d',
            task, job_id, returncode)
        if stdout_str or stderr_str:
            log.error(
                '%s Slurm (scontrol show jobid -d -o %s) printed the following',
                task, job_id)
            if stdout_str:
                log.error('stdout: "%s"', stdout_str)
            if stderr_str:
                log.error('stderr: "%s"', stderr_str)

        will_retry = i + 1 < num_retries
        if i > 0:
            log.info(
                '%s Slurm (scontrol show jobid -d -o %s) attempt %d failed %d sec after first attempt%s',
                task, job_id, i + 1, time.time() - start,
                ('. Will recheck job status after %d sec' % quantum) if will_retry else '')
        if will_retry:
            sleep_through_signals(timeout=quantum)
    else:
        # fallthrough: all retries failed (or none were allowed). Report the
        # number of attempts actually made; the loop index is one short of
        # that and is unbound when the loop body never ran.
        raise ValueError(
            'No valid `scontrol show jobid -d -o %s` output after %d tries and %d sec' %
            (job_id, attempts_made, time.time() - start))

    acct_dict = {}
    key = None
    for token in stdout_str.strip().split():
        name, sep, value = token.partition('=')
        if not sep:
            # add the string to previous value - most likely the previous value contained a white space
            if key is None:
                raise EnvironmentError(
                    '%s with drm_jobID=%s has unparseable "scontrol show jobid -d -o" output:\n%s\n'
                    'Could not find "=" in "%s"' %
                    (task, job_id, stdout_str, token))
            acct_dict[key] += " " + token
            continue

        key = name
        acct_dict[key] = value

    return acct_dict