Exemplo n.º 1
0
def parse_sacct(job_info, log=None):
    """
    Build a copy of one sacct record with normalized job metrics added.

    The returned dict is a shallow copy of ``job_info`` plus the keys
    ``exit_status``, ``cpu_time``, ``wall_time``, ``percent_cpu``,
    ``avg_rss_mem``, ``max_rss_mem``, ``avg_vms_mem`` and ``max_vms_mem``.

    :param job_info: mapping with the sacct fields ``State``, ``ExitCode``,
        ``CPUTimeRAW``, ``Elapsed``, ``AveRSS``, ``MaxRSS``, ``AveVMSize``
        and ``MaxVMSize``.
    :param log: optional logger-like object; when truthy, its ``info``
        method receives the offending record before the error propagates.
    :returns: the augmented copy of ``job_info``.
    :raises Exception: any error hit while parsing is re-raised unchanged.
    """
    # Bound before the try block so the error handler can always refer to it,
    # even when job_info.copy() itself is the call that fails.
    job_info2 = None
    try:
        job_info2 = job_info.copy()

        # Jobs in a failed or pending state get no exit status; otherwise it
        # is the integer before the ':' in ExitCode.
        if job_info2['State'] in FAILED_STATES + PENDING_STATES:
            job_info2['exit_status'] = None
        else:
            job_info2['exit_status'] = int(job_info2['ExitCode'].split(":")[0])

        job_info2['cpu_time'] = int(job_info2['CPUTimeRAW'])
        job_info2['wall_time'] = parse_slurm_time(job_info2['Elapsed'])
        job_info2['percent_cpu'] = div(float(job_info2['cpu_time']),
                                       float(job_info2['wall_time']))

        # An empty sacct memory field maps to None instead of being converted.
        for metric_key, sacct_key in (('avg_rss_mem', 'AveRSS'),
                                      ('max_rss_mem', 'MaxRSS'),
                                      ('avg_vms_mem', 'AveVMSize'),
                                      ('max_vms_mem', 'MaxVMSize')):
            raw_size = job_info2[sacct_key]
            if raw_size != '':
                job_info2[metric_key] = convert_size_to_kb(raw_size)
            else:
                job_info2[metric_key] = None
    except Exception:
        if log:
            # Fall back to the caller's input when the copy was never made.
            record = job_info if job_info2 is None else job_info2
            log.info('Error Parsing: %s' % pformat(record))
        # Bare raise keeps the original exception and traceback intact.
        raise

    return job_info2
Exemplo n.º 2
0
def parse_sacct(job_info, log=None):
    """
    Return a copy of a sacct record enriched with parsed job metrics.

    Keys added to the copy: ``exit_status``, ``cpu_time``, ``wall_time``,
    ``percent_cpu``, ``avg_rss_mem``, ``max_rss_mem``, ``avg_vms_mem`` and
    ``max_vms_mem``.

    :param job_info: mapping with the sacct fields ``State``, ``ExitCode``,
        ``CPUTimeRAW``, ``Elapsed``, ``AveRSS``, ``MaxRSS``, ``AveVMSize``
        and ``MaxVMSize``.
    :param log: optional logger-like object; if truthy, ``log.info`` is
        called with a dump of the record when parsing fails.
    :returns: the enriched copy of ``job_info``.
    :raises Exception: the original parsing error, re-raised after logging.
    """
    # Start out pointing at the caller's record so the except clause always
    # has something to log, even if the .copy() call below is what fails.
    job_info2 = job_info
    try:
        job_info2 = job_info.copy()

        # No exit status is reported for failed or pending jobs.
        if job_info2["State"] in FAILED_STATES + PENDING_STATES:
            job_info2["exit_status"] = None
        else:
            # Only the part of ExitCode before the ':' is used.
            job_info2["exit_status"] = int(job_info2["ExitCode"].split(":")[0])

        job_info2["cpu_time"] = int(job_info2["CPUTimeRAW"])
        job_info2["wall_time"] = parse_slurm_time(job_info2["Elapsed"])
        job_info2["percent_cpu"] = div(float(job_info2["cpu_time"]),
                                       float(job_info2["wall_time"]))

        # Empty memory fields become None; anything else is converted.
        memory_fields = (
            ("avg_rss_mem", "AveRSS"),
            ("max_rss_mem", "MaxRSS"),
            ("avg_vms_mem", "AveVMSize"),
            ("max_vms_mem", "MaxVMSize"),
        )
        for out_key, in_key in memory_fields:
            value = job_info2[in_key]
            job_info2[out_key] = (convert_size_to_kb(value)
                                  if value != "" else None)
    except Exception:
        if log:
            log.info("Error Parsing: %s" % pformat(job_info2))
        # Re-raise the active exception without altering its traceback.
        raise

    return job_info2
Exemplo n.º 3
0
def parse_drmaa_jobinfo(drmaa_jobinfo):
    """
    Translate a DRMAA job-info mapping into a Cosmos job metadata dict.

    :param drmaa_jobinfo: mapping that holds ``'resourceUsage'`` (a mapping
        of usage fields), ``'hasSignal'``, ``'wasAborted'``, ``'hasExited'``
        and, optionally, ``'exitStatus'``.
    :returns: dict of job metrics with a ``'successful'`` boolean; for any
        job that shows signs of failure, ``'exit_status'`` is non-zero.
    :raises KeyError: if ``'resourceUsage'``, ``'hasSignal'``,
        ``'wasAborted'`` or ``'hasExited'`` is missing when looked up.
    """
    d = drmaa_jobinfo['resourceUsage']

    # Parsed once and reused for both the stored values and percent_cpu.
    cpu_time = float(d.get('cpu', 0))
    wall_time = float(d.get('ru_wallclock', 0))

    cosmos_jobinfo = dict(
        exit_status=int(drmaa_jobinfo.get('exitStatus', os.EX_UNAVAILABLE)),
        percent_cpu=div(cpu_time, wall_time),
        wall_time=wall_time,
        cpu_time=cpu_time,
        user_time=float(d.get('ru_utime', 0)),
        system_time=float(d.get('ru_stime', 0)),

        # TODO should we be calling convert_size_to_kb() for avg_rss_mem?
        avg_rss_mem=d.get('ru_ixrss', "0"),
        max_rss_mem_kb=convert_size_to_kb(d.get('ru_maxrss', "0")),
        avg_vms_mem_kb=None,
        max_vms_mem_kb=convert_size_to_kb(d.get('maxvmem', "0")),
        io_read_count=int(float(d.get('ru_inblock', 0))),
        io_write_count=int(float(d.get('ru_oublock', 0))),
        io_wait=float(d.get('iow', 0)),
        # NOTE(review): read and write sizes both come from the single 'io'
        # field - confirm this is intended.
        io_read_kb=float(d.get('io', 0)),
        io_write_kb=float(d.get('io', 0)),
        ctx_switch_voluntary=int(float(d.get('ru_nvcsw', 0))),
        ctx_switch_involuntary=int(float(d.get('ru_nivcsw', 0))),
        avg_num_threads=None,
        max_num_threads=None,
        avg_num_fds=None,
        max_num_fds=None,
        memory=float(d.get('mem', 0)),
    )

    #
    # Wait, what? drmaa has two exit status fields? Of course, they don't always
    # agree when an error occurs. Worse, sometimes drmaa doesn't set exit_status
    # when a job is killed. We may not be able to get the exact exit code, but
    # at least we can guarantee it will be non-zero for any job that shows signs
    # of terminating in error.
    #
    # The exit status parsed above is reused here instead of indexing
    # drmaa_jobinfo['exitStatus'] again, so a record without that key is
    # reported as failed (os.EX_UNAVAILABLE) rather than raising KeyError.
    #
    terminated_in_error = (cosmos_jobinfo['exit_status'] != 0 or
                           drmaa_jobinfo['hasSignal'] or
                           drmaa_jobinfo['wasAborted'] or
                           not drmaa_jobinfo['hasExited'])

    if terminated_in_error:
        if cosmos_jobinfo['exit_status'] == 0:
            # Fall back to the exit status recorded in the resource usage.
            try:
                cosmos_jobinfo['exit_status'] = int(float(d['exit_status']))
            except KeyError:
                cosmos_jobinfo['exit_status'] = os.EX_UNAVAILABLE

        if cosmos_jobinfo['exit_status'] == 0:
            # Both fields claim success despite the failure signs above.
            cosmos_jobinfo['exit_status'] = os.EX_SOFTWARE

        cosmos_jobinfo['successful'] = False
    else:
        cosmos_jobinfo['successful'] = True

    return cosmos_jobinfo
Exemplo n.º 4
0
    def _get_task_return_data(self, task):
        """
        Translate the raw qacct record of ``task`` into Cosmos's portable
        job-metadata layout.

        The result is a 2-tuple:
        [0] dict with the converted job metadata,
        [1] bool telling whether that metadata is affected by an SGE bug
            that occasionally makes qacct return corrupt results.
        """
        acct = self.task_qacct(task)

        # The 'failed' field starts with '0' when the job did not fail.
        job_failed = acct['failed'][0] != '0'
        data_are_corrupt = is_corrupt(acct)

        if job_failed or data_are_corrupt:
            if data_are_corrupt:
                problem = 'corrupt data'
            else:
                problem = 'job failure'
            emit_warning = task.workflow.log.warn
            emit_warning('%s SGE (qacct -j %s) reports %s:\n%s' % (
                task,
                task.drm_jobID,
                problem,
                json.dumps(acct, indent=4, sort_keys=True),
            ))

        # A failed job takes its exit status from the leading digits of the
        # 'failed' field; otherwise the 'exit_status' field is used.
        if job_failed:
            exit_status = int(re.search(r'^(\d+)', acct['failed']).group(1))
        else:
            exit_status = int(acct['exit_status'])

        processed_data = {
            'exit_status': exit_status,

            'percent_cpu': div(float(acct['cpu']),
                               float(acct['ru_wallclock'])),
            'wall_time': float(acct['ru_wallclock']),

            'cpu_time': float(acct['cpu']),
            'user_time': float(acct['ru_utime']),
            'system_time': float(acct['ru_stime']),

            'avg_rss_mem': acct['ru_ixrss'],
            'max_rss_mem_kb': convert_size_to_kb(acct['maxrss']),
            'avg_vms_mem_kb': None,
            'max_vms_mem_kb': convert_size_to_kb(acct['maxvmem']),

            'io_read_count': int(acct['ru_inblock']),
            'io_write_count': int(acct['ru_oublock']),
            'io_wait': float(acct['iow']),
            # Both I/O sizes come from the single 'io' field, formatted with
            # a "G" suffix before conversion.
            'io_read_kb': convert_size_to_kb("%fG" % float(acct['io'])),
            'io_write_kb': convert_size_to_kb("%fG" % float(acct['io'])),

            'ctx_switch_voluntary': int(acct['ru_nvcsw']),
            'ctx_switch_involuntary': int(acct['ru_nivcsw']),

            'avg_num_threads': None,
            'max_num_threads': None,

            'avg_num_fds': None,
            'max_num_fds': None,

            'memory': float(acct['mem']),
        }

        return processed_data, data_are_corrupt
Exemplo n.º 5
0
def parse_sacct(job_info, log=None):
    """
    Parse one sacct record into a copy carrying derived job metrics.

    The copy gains ``exit_status``, ``cpu_time``, ``wall_time``,
    ``percent_cpu``, ``avg_rss_mem``, ``max_rss_mem``, ``avg_vms_mem`` and
    ``max_vms_mem``.

    :param job_info: mapping with the sacct fields ``State``, ``ExitCode``,
        ``CPUTimeRAW``, ``Elapsed``, ``AveRSS``, ``MaxRSS``, ``AveVMSize``
        and ``MaxVMSize``.
    :param log: optional logger-like object; when truthy, the record being
        parsed is reported through ``log.info`` if an error occurs.
    :returns: the copy of ``job_info`` with the derived keys added.
    :raises Exception: whatever error parsing raised, re-raised as-is.
    """
    # Keep a reference that is valid from the start: the except clause logs
    # it, and it must exist even if job_info.copy() is the failing call.
    job_info2 = job_info
    try:
        job_info2 = job_info.copy()

        # Failed and pending jobs have no exit status to report.
        in_failed_or_pending = job_info2['State'] in FAILED_STATES + PENDING_STATES
        if in_failed_or_pending:
            job_info2['exit_status'] = None
        else:
            # ExitCode is split on ':' and only the first part is kept.
            job_info2['exit_status'] = int(job_info2['ExitCode'].split(":")[0])

        job_info2['cpu_time'] = int(job_info2['CPUTimeRAW'])
        job_info2['wall_time'] = parse_slurm_time(job_info2['Elapsed'])
        job_info2['percent_cpu'] = div(float(job_info2['cpu_time']), float(job_info2['wall_time']))

        # Each memory metric is None when sacct left the source field empty.
        memory_sources = [
            ('avg_rss_mem', 'AveRSS'),
            ('max_rss_mem', 'MaxRSS'),
            ('avg_vms_mem', 'AveVMSize'),
            ('max_vms_mem', 'MaxVMSize'),
        ]
        for derived_key, source_key in memory_sources:
            source_value = job_info2[source_key]
            job_info2[derived_key] = convert_size_to_kb(source_value) if source_value != '' else None
    except Exception:
        if log:
            log.info('Error Parsing: %s' % pformat(job_info2))
        # Bare raise propagates the original exception with its traceback.
        raise

    return job_info2