Exemplo n.º 1
0
def do_sacct(job_ids):
    """
    Query slurm's accounting database for information about the given jobs.

    :param job_ids: iterable of slurm job id strings.
    :return: dict mapping job id -> dict of sacct fields
        (State, CPUTime, MaxRSS, ...) parsed from ``sacct -P`` output.
    """
    # there's a lag between when a job finishes and when sacct is available :(
    cmd = 'sacct --format=' \
          '"State,JobID,CPUTime,MaxRSS,AveRSS,AveCPU,CPUTimeRAW,AveVMSize,MaxVMSize,Elapsed,ExitCode,Start,End" ' \
          '-j %s -P' % ','.join(job_ids)

    out, err = check_output_and_stderr(cmd,
                                       preexec_fn=exit_process_group,
                                       shell=True)

    parts = out.strip().split("\n")
    # job_id_to_job_info_dict
    all_jobs = dict()
    # first line is the header; with -P, fields are '|'-delimited
    keys = parts[0].split('|')
    for line in parts[1:]:
        # In parsable (-P) mode sacct emits no all-dashes separator row, so
        # data starts on the second line.  Skip blank or dashes-only rows
        # defensively rather than unconditionally dropping a line, which
        # would lose the first job's record.
        if not line or set(line) <= set('-|'):
            continue
        values = line.split('|')
        job_dict = dict(zip(keys, values))

        if 'batch' in job_dict['JobID']:
            # slurm prints these .batch versions of jobids which have better information, overwrite
            job_dict['JobID'] = job_dict['JobID'].replace('.batch', '')

        all_jobs[job_dict['JobID']] = job_dict

    return all_jobs
Exemplo n.º 2
0
def do_sacct(job_ids):
    """Run ``sacct`` for *job_ids* and return ``{job_id: field_dict}``."""
    # NOTE: sacct's accounting data can lag behind job completion.
    cmd = ('sacct --format='
           '"State,JobID,CPUTime,MaxRSS,AveRSS,AveCPU,CPUTimeRAW,AveVMSize,MaxVMSize,Elapsed,ExitCode,Start,End" '
           '-j %s -P') % ','.join(job_ids)

    out, err = check_output_and_stderr(
        cmd, preexec_fn=exit_process_group, shell=True)

    lines = out.strip().split("\n")
    # the first line names the '|'-delimited columns
    header = lines[0].split('|')

    # maps job id -> parsed sacct fields
    job_infos = {}
    # the second line is treated as a separator row; data starts at line 3
    for row in lines[2:]:
        info = dict(zip(header, row.split('|')))

        jid = info['JobID']
        if 'batch' in jid:
            # the ".batch" record carries richer stats; file it under the parent id
            info['JobID'] = jid.replace('.batch', '')

        job_infos[info['JobID']] = info

    return job_infos
Exemplo n.º 3
0
def sbatch(task):
    """Submit *task* via ``sbatch`` and return the new slurm job id as a str."""
    native_spec = task.drm_native_specification if task.drm_native_specification else ''

    cmd = ['sbatch',
           '-o', os.path.abspath(task.output_stdout_path),
           '-e', os.path.abspath(task.output_stderr_path)]
    cmd.extend(native_spec.split())
    cmd.append(task.output_command_script_path)

    out, err = check_output_and_stderr(cmd, env=os.environ,
                                       preexec_fn=exit_process_group)

    # sbatch's stdout contains e.g. "Submitted batch job <id>"
    match = re.search(r'job (\d+)', out)
    return str(match.group(1))
Exemplo n.º 4
0
def sbatch(task):
    """
    Submit *task* to slurm with ``sbatch``.

    :param task: object carrying ``output_stdout_path``/``output_stderr_path``,
        an optional ``drm_native_specification`` string of extra sbatch flags,
        and ``output_command_script_path`` (the script to run).
    :return: the slurm job id parsed from sbatch's stdout, as a str.
    :raises RuntimeError: if no job id can be found in sbatch's output.
    """
    ns = task.drm_native_specification if task.drm_native_specification else ''

    cmd = ([
        'sbatch', '-o',
        os.path.abspath(task.output_stdout_path), '-e',
        os.path.abspath(task.output_stderr_path)
    ] + ns.split() + [task.output_command_script_path])

    out, err = check_output_and_stderr(cmd,
                                       env=os.environ,
                                       preexec_fn=exit_process_group)

    # sbatch normally prints "Submitted batch job <id>".  Fail loudly with
    # context instead of the opaque AttributeError that re.search(...) ->
    # None would otherwise produce.
    match = re.search(r'job (\d+)', out)
    if match is None:
        raise RuntimeError(
            'could not parse a job id from sbatch output.\n'
            'stdout: %s\nstderr: %s' % (out, err))
    return str(match.group(1))
Exemplo n.º 5
0
def _qacct_raw(task, timeout=600, quantum=15):
    """
    Parse qacct output into key/value pairs.

    If qacct reports results in multiple blocks (separated by a row of ===='s),
    the most recently-generated block with valid data is returned. If no such
    block exists, then return the most recently-generated block of corrupt data.

    :param task: task whose ``drm_jobID`` is queried; progress and errors are
        written to ``task.workflow.log``.
    :param timeout: total seconds to keep retrying qacct before giving up.
    :param quantum: seconds slept between retries.
    :raises ValueError: if no non-empty qacct output appears within *timeout*.
    :raises EnvironmentError: if a qacct output line cannot be split into a
        key/value pair.
    """
    start = time.time()
    curr_qacct_dict = None
    good_qacct_dict = None
    num_retries = int(timeout / quantum)

    for i in xrange(num_retries):
        qacct_returncode = 0
        try:
            qacct_stdout_str, qacct_stderr_str = check_output_and_stderr(
                ['qacct', '-j', unicode(task.drm_jobID)],
                preexec_fn=exit_process_group)
            if qacct_stdout_str.strip():
                # got non-empty accounting output -- stop retrying
                break
        except CosmosCalledProcessError as err:
            qacct_stdout_str = err.output.strip()
            qacct_stderr_str = err.stderr.strip()
            qacct_returncode = err.returncode

        # "job id not found" is expected for a while after a job ends,
        # because qacct's accounting records lag behind the scheduler
        if qacct_stderr_str and re.match(r'error: job id \d+ not found',
                                         qacct_stderr_str):
            if i > 0:
                task.workflow.log.info(
                    '%s SGE (qacct -j %s) reports "not found"; this may mean '
                    'qacct is merely slow, or %s died in the \'qw\' state',
                    task, task.drm_jobID, task.drm_jobID)
        else:
            # any other failure mode is unexpected -- log whatever we captured
            task.workflow.log.error(
                '%s SGE (qacct -j %s) returned error code %d', task,
                task.drm_jobID, qacct_returncode)
            if qacct_stdout_str or qacct_stderr_str:
                task.workflow.log.error(
                    '%s SGE (qacct -j %s) printed the following', task,
                    task.drm_jobID)
                if qacct_stdout_str:
                    task.workflow.log.error('stdout: "%s"', qacct_stdout_str)
                if qacct_stderr_str:
                    task.workflow.log.error('stderr: "%s"', qacct_stderr_str)

        if i > 0:
            task.workflow.log.info(
                '%s SGE (qacct -j %s) attempt %d failed %d sec after first attempt%s',
                task, task.drm_jobID, i + 1,
                time.time() - start, '. Will recheck job status after %d sec' %
                quantum if i + 1 < num_retries else '')
        if i + 1 < num_retries:
            sleep_through_signals(timeout=quantum)
    else:
        # fallthrough: all retries failed
        raise ValueError(
            'No valid `qacct -j %s` output after %d tries and %d sec' %
            (task.drm_jobID, i, time.time() - start))

    for line in qacct_stdout_str.strip().split('\n'):
        # a line of ='s marks the start of a new block of qacct data
        if line.startswith('='):
            if curr_qacct_dict and not _is_corrupt(curr_qacct_dict):
                #
                # Cache this non-corrupt block of qacct data just
                # in case all the more recent blocks are corrupt.
                #
                good_qacct_dict = curr_qacct_dict

            curr_qacct_dict = OrderedDict()
            continue

        try:
            # each data line is "<key><whitespace><value>"
            k, v = re.split(r'\s+', line, maxsplit=1)
        except ValueError:
            raise EnvironmentError(
                '%s with drm_jobID=%s has unparseable qacct output:\n%s' %
                (task, task.drm_jobID, qacct_stdout_str))

        curr_qacct_dict[k] = v.strip()

    # if the last block of qacct data looks good, promote it
    if curr_qacct_dict and not _is_corrupt(curr_qacct_dict):
        good_qacct_dict = curr_qacct_dict

    return good_qacct_dict if good_qacct_dict else curr_qacct_dict
Exemplo n.º 6
0
def qacct(job_id, num_retries=10, quantum=30, logger=None, log_prefix=""):
    """
    Parse qacct output into key/value pairs.

    If qacct reports results in multiple blocks (separated by a row of ===='s),
    the most recently-generated block with valid data is returned. If no block
    with valid data exists, then return the most recently-generated block of
    corrupt data. Call ``is_corrupt()`` on the output of this method to see if
    the data are suitable for use.

    :param job_id: SGE job id to look up.
    :param num_retries: how many times to invoke qacct before giving up.
    :param quantum: seconds slept between retries.
    :param logger: logger for progress/error messages; a null logger is used
        when omitted.
    :param log_prefix: string prepended to every log message.
    :raises QacctJobNotFoundError: if no non-empty qacct output appears after
        *num_retries* attempts.
    :raises EnvironmentError: if a qacct output line cannot be split into a
        key/value pair.
    """
    if not logger:
        logger = _get_null_logger()

    start = time.time()
    curr_qacct_dict = None
    good_qacct_dict = None

    for i in xrange(num_retries):
        qacct_returncode = 0
        try:
            qacct_stdout_str, qacct_stderr_str = check_output_and_stderr(
                ['qacct', '-j', unicode(job_id)],
                preexec_fn=exit_process_group)
            if qacct_stdout_str.strip():
                # got non-empty accounting output -- stop retrying
                break
        except DetailedCalledProcessError as err:
            qacct_stdout_str = err.output.strip()
            qacct_stderr_str = err.stderr.strip()
            qacct_returncode = err.returncode

        # "job id not found" is expected for a while after a job ends,
        # because qacct's accounting records lag behind the scheduler
        if qacct_stderr_str and re.match(r'error: job id \d+ not found',
                                         qacct_stderr_str):
            if i > 0:
                logger.info(
                    '%s SGE (qacct -j %s) reports "not found"; this may mean '
                    'qacct is merely slow, or %s died in the \'qw\' state',
                    log_prefix, job_id, job_id)
        else:
            # any other failure mode is unexpected -- log whatever we captured
            logger.error('%s SGE (qacct -j %s) returned error code %d',
                         log_prefix, job_id, qacct_returncode)
            if qacct_stdout_str or qacct_stderr_str:
                logger.error('%s SGE (qacct -j %s) printed the following',
                             log_prefix, job_id)
                if qacct_stdout_str:
                    logger.error('stdout: "%s"', qacct_stdout_str)
                if qacct_stderr_str:
                    logger.error('stderr: "%s"', qacct_stderr_str)

        if i > 0:
            logger.info(
                '%s SGE (qacct -j %s) attempt %d failed %d sec after first attempt%s',
                log_prefix, job_id, i + 1,
                time.time() - start, '. Will recheck job status after %d sec' %
                quantum if i + 1 < num_retries else '')
        if i + 1 < num_retries:
            sleep_through_signals(timeout=quantum)
    else:
        # fallthrough: all retries failed
        raise QacctJobNotFoundError(
            '%s No valid SGE (qacct -j %s) output after %d tries over %d sec' %
            (log_prefix, job_id, i, time.time() - start))

    for line in qacct_stdout_str.strip().split('\n'):
        # a line of ='s marks the start of a new block of qacct data
        if line.startswith('='):
            if curr_qacct_dict and not is_corrupt(curr_qacct_dict):
                #
                # Cache this non-corrupt block of qacct data just
                # in case all the more recent blocks are corrupt.
                #
                good_qacct_dict = curr_qacct_dict

            curr_qacct_dict = OrderedDict()
            continue

        try:
            # each data line is "<key><whitespace><value>"
            k, v = re.split(r'\s+', line, maxsplit=1)
        except ValueError:
            raise EnvironmentError(
                '%s SGE (qacct -j %s) output is unparseable:\n%s' %
                (log_prefix, job_id, qacct_stdout_str))

        curr_qacct_dict[k] = v.strip()

    # if the last block of qacct data looks good, promote it
    if curr_qacct_dict and not is_corrupt(curr_qacct_dict):
        good_qacct_dict = curr_qacct_dict

    return good_qacct_dict if good_qacct_dict else curr_qacct_dict
Exemplo n.º 7
0
def _scontrol_raw(task, timeout=600, quantum=15):
    """
    Parse "scontrol show jobid" output into key/value pairs.

    :param task: task whose ``drm_jobID`` is queried; errors are written to
        ``task.workflow.log``.
    :param timeout: total seconds to keep retrying scontrol before giving up.
    :param quantum: seconds slept between retries.
    :return: dict of Key=Value pairs parsed from scontrol's one-line (-o)
        output, or ``dict(JobId=task.drm_jobID)`` if slurm has already
        forgotten the job id.
    :raises ValueError: if no non-empty scontrol output appears within
        *timeout*.
    :raises EnvironmentError: if a token without "=" appears before any
        key/value pair has been parsed.
    """
    start = time.time()
    num_retries = int(timeout / quantum)

    for i in xrange(num_retries):
        qacct_returncode = 0
        try:
            qacct_stdout_str, qacct_stderr_str = check_output_and_stderr(
                [
                    'scontrol', 'show', 'jobid', '-d', '-o',
                    unicode(task.drm_jobID)
                ],
                preexec_fn=exit_process_group)
            if qacct_stdout_str.strip():
                # got non-empty output -- stop retrying
                break
        except CosmosCalledProcessError as err:
            qacct_stdout_str = err.output.strip()
            qacct_stderr_str = err.stderr.strip()
            qacct_returncode = err.returncode

            if qacct_stderr_str == 'slurm_load_jobs error: Invalid job id specified':
                # too many jobs were scheduled since it finished and the job id was forgotten
                return dict(JobId=task.drm_jobID)
            else:
                task.workflow.log.error(
                    '%s Slurm (scontrol show jobid -d -o %s) returned error code %d',
                    task, task.drm_jobID, qacct_returncode)
                if qacct_stdout_str or qacct_stderr_str:
                    task.workflow.log.error(
                        '%s Slurm (scontrol show jobid -d -o %s) printed the following',
                        task, task.drm_jobID)
                    if qacct_stdout_str:
                        task.workflow.log.error('stdout: "%s"',
                                                qacct_stdout_str)
                    if qacct_stderr_str:
                        task.workflow.log.error('stderr: "%s"',
                                                qacct_stderr_str)

        if i > 0:
            task.workflow.log.info(
                '%s Slurm (scontrol show jobid -d -o %s) attempt %d failed %d sec after first attempt%s',
                task, task.drm_jobID, i + 1,
                time.time() - start, '. Will recheck job status after %d sec' %
                quantum if i + 1 < num_retries else '')
        if i + 1 < num_retries:
            sleep_through_signals(timeout=quantum)
    else:
        # fallthrough: all retries failed
        raise ValueError(
            'No valid `scontrol show jobid -d -o %s` output after %d tries and %d sec'
            % (task.drm_jobID, i, time.time() - start))

    # scontrol -o emits one line of whitespace-separated Key=Value tokens;
    # a value may itself contain spaces, in which case the follow-on tokens
    # have no "=" and are glued back onto the previous value below
    acct_dict = {}
    k, v = None, None
    for kv in qacct_stdout_str.strip().split():
        eq_pos = kv.find('=')
        if eq_pos == -1:
            # add the string to previous value - most likely the previous value contained a white space
            if k is not None:
                acct_dict[k] += (" " + kv)
                continue
            else:
                raise EnvironmentError(
                    '%s with drm_jobID=%s has unparseable "scontrol show jobid -d -o" output:\n%s\n'
                    'Could not find "=" in "%s"' %
                    (task, task.drm_jobID, qacct_stdout_str, kv))
        k, v = kv[:eq_pos], kv[(eq_pos + 1):]
        acct_dict[k] = v

    return acct_dict