예제 #1
0
def create_monitor(vgrid_name):
    """Write monitor HTML file for vgrid_name"""

    html_file = os.path.join(configuration.vgrid_home, vgrid_name,
                             '%s.html' % configuration.vgrid_monitor)

    print 'collecting statistics for VGrid %s' % vgrid_name
    sleep_secs = configuration.sleep_secs
    slackperiod = configuration.slackperiod
    now = time.asctime(time.localtime())

    html_vars = {
        'sleep_secs': sleep_secs,
        'vgrid_name': vgrid_name,
        'logo_url': '/images/logo.jpg',
        'now': now,
        'short_title': configuration.short_title,
    }

    monitor_meta = '''<meta http-equiv="refresh" content="%(sleep_secs)s" />
''' % html_vars
    add_import = '''
<script type="text/javascript" src="/images/js/jquery.tablesorter.js"></script>
    '''
    add_init = ''
    add_ready = '''
          // table initially sorted by col. 1 (name)
          var sortOrder = [[1,0]];

          // use image path for sorting if there is any inside
          var imgTitle = function(contents) {
              var key = $(contents).find("a").attr("class");
              if (key == null) {
                  key = $(contents).html();
              }
              return key;
          }
          $("table.monitor").tablesorter({widgets: ["zebra"],
                                          textExtraction: imgTitle,
                                         });
          $("table.monitor").each(function () {
              try {
                  $(this).trigger("sorton", [sortOrder]);
              } catch(err) {
                  /* tablesorter chokes on empty tables - just continue */
              }
          });
    '''
    monitor_js = '''
%s

<script type="text/javascript" >

%s

$(document).ready(function() {
%s
          }
);
</script>
''' % (add_import, add_init, add_ready)

    # User default site style
    style_helpers = themed_styles(configuration)
    script_helpers = themed_scripts(configuration)
    script_helpers['advanced'] += add_import
    script_helpers['init'] += add_init
    script_helpers['ready'] += add_ready
    html = get_xgi_html_header(
        configuration,
        '%(short_title)s Monitor, VGrid %(vgrid_name)s' % html_vars,
        '',
        html=True,
        meta=monitor_meta,
        style_map=style_helpers,
        script_map=script_helpers,
        frame=False,
        menu=False,
        widgets=False,
        userstyle=False,
    )
    html += \
        '''
<!-- end of raw header: this line is used by showvgridmonitor -->
<h1>Statistics/monitor for the %(vgrid_name)s VGrid</h1>
<div class="generatornote smallcontent">
This page was generated %(now)s (automatic refresh every %(sleep_secs)s secs).
</div>
'''\
         % html_vars

    # loop and get totals

    parse_count = 0
    queued_count = 0
    frozen_count = 0
    executing_count = 0
    finished_count = 0
    failed_count = 0
    retry_count = 0
    canceled_count = 0

    cpucount_requested = 0
    cpucount_done = 0
    nodecount_requested = 0
    nodecount_done = 0
    cputime_requested = 0
    cputime_done = 0
    used_walltime = 0
    disk_requested = 0
    disk_done = 0
    memory_requested = 0
    memory_done = 0
    runtimeenv_dict = {'': 0}
    runtimeenv_requested = 0
    runtimeenv_done = 0

    number_of_jobs = 0
    up_count = 0
    down_count = 0
    slack_count = 0

    job_assigned = 0
    job_assigned_cpus = 0

    gstat = GridStat(configuration, logger)

    runtimeenv_dict = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                      'RUNTIMEENVIRONMENT', {})

    parse_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'PARSE')
    queued_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'QUEUED')
    frozen_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'FROZEN')
    executing_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                      'EXECUTING')
    failed_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'FAILED')
    retry_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'RETRY')
    canceled_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                     'CANCELED')
    expired_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'EXPIRED')
    finished_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                     'FINISHED')

    nodecount_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                          'NODECOUNT_REQ')
    nodecount_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                     'NODECOUNT_DONE')
    cputime_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                        'CPUTIME_REQ')
    cputime_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                   'CPUTIME_DONE')

    used_walltime = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                    'USED_WALLTIME')

    if (used_walltime == 0):
        used_walltime = datetime.timedelta(0)

    used_walltime = format_timedelta(used_walltime)

    disk_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                     'DISK_REQ')
    disk_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'DISK_DONE')
    memory_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                       'MEMORY_REQ')
    memory_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                  'MEMORY_DONE')
    cpucount_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                         'CPUCOUNT_REQ')
    cpucount_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                    'CPUCOUNT_DONE')
    runtimeenv_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                           'RUNTIMEENVIRONMENT_REQ')
    runtimeenv_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                      'RUNTIMEENVIRONMENT_DONE')

    number_of_jobs = parse_count
    number_of_jobs += queued_count
    number_of_jobs += frozen_count
    number_of_jobs += expired_count
    number_of_jobs += canceled_count
    number_of_jobs += failed_count
    number_of_jobs += executing_count
    number_of_jobs += finished_count
    number_of_jobs += retry_count

    html_vars = {
        'parse_count': parse_count,
        'queued_count': queued_count,
        'frozen_count': frozen_count,
        'executing_count': executing_count,
        'failed_count': failed_count,
        'retry_count': retry_count,
        'canceled_count': canceled_count,
        'expired_count': expired_count,
        'finished_count': finished_count,
        'number_of_jobs': number_of_jobs,
        'cpucount_requested': cpucount_requested,
        'cpucount_done': cpucount_done,
        'nodecount_requested': nodecount_requested,
        'nodecount_done': nodecount_done,
        'cputime_requested': cputime_requested,
        'cputime_done': cputime_done,
        'used_walltime': used_walltime,
        'disk_requested': disk_requested,
        'disk_done': disk_done,
        'memory_requested': memory_requested,
        'memory_done': memory_done,
        'runtimeenv_requested': runtimeenv_requested,
        'runtimeenv_done': runtimeenv_done,
    }

    html += \
        """<h2>Job Stats</h2><table class=monitorstats><tr><td>
<table class=monitorjobs><tr class=title><td>Job State</td><td>Number of jobs</td></tr>
<tr><td>Parse</td><td>%(parse_count)s</td></tr>
<tr><td>Queued</td><td>%(queued_count)s</td></tr>
<tr><td>Frozen</td><td>%(frozen_count)s</td></tr>
<tr><td>Executing</td><td>%(executing_count)s</td></tr>
<tr><td>Failed</td><td>%(failed_count)s</td></tr>
<tr><td>Retry</td><td>%(retry_count)s</td></tr>
<tr><td>Canceled</td><td>%(canceled_count)s</td></tr>
<tr><td>Expired</td><td>%(expired_count)s</td></tr>
<tr><td>Finished</td><td>%(finished_count)s</td></tr>
<tr><td>Total</td><td>%(number_of_jobs)s</td></tr>
</table>
</td><td>
<table class=monitorresreq>
<tr class=title><td>Requirement</td><td>Requested</td><td>Done</td></tr>
<tr><td>Cpucount</td><td>%(cpucount_requested)s</td><td>%(cpucount_done)s</td></tr>
<tr><td>Nodecount</td><td>%(nodecount_requested)s</td><td>%(nodecount_done)s</td></tr>
<tr><td>Cputime</td><td>%(cputime_requested)s</td><td>%(cputime_done)s</td></tr>
<tr><td>GB Disk</td><td>%(disk_requested)s</td><td>%(disk_done)s</td></tr>
<tr><td>MB Memory</td><td>%(memory_requested)s</td><td>%(memory_done)s</td></tr>
<tr><td>Runtime Envs</td><td>%(runtimeenv_requested)s</td><td>%(runtimeenv_done)s</td></tr>
<tr><td>Used Walltime</td><td colspan='2'>%(used_walltime)s</td></tr>
</table><br />
</td><td>
<div class=monitorruntimeenvdetails>
<table class=monitorruntimeenvdone>
<tr class=title><td>Runtime Envs Done</td><td></td></tr>
"""\
         % html_vars

    if len(runtimeenv_dict.keys()) < 1:

        # No runtimeenv requests

        html += '<tr><td></td><td>-</td></tr>\n'
    else:
        for entry in runtimeenv_dict.keys():
            if not entry == '':
                html += '<tr><td>' + entry + '</td><td>'\
                    + str(runtimeenv_dict[entry]) + '</td></tr>\n'

    total_number_of_exe_resources, total_number_of_store_resources = 0, 0
    total_number_of_exe_cpus, total_number_of_store_gigs = 0, 0

    vgrid_name_list = vgrid_name.split('/')
    current_dir = ''

    exes, stores = '', ''
    for vgrid_name_part in vgrid_name_list:
        current_dir = os.path.join(current_dir, vgrid_name_part)
        abs_mon_dir = os.path.join(configuration.vgrid_home, current_dir)
        # print 'dir: %s' % abs_mon_dir
        # Potential race - just ignore if it disappeared
        try:
            sorted_names = os.listdir(abs_mon_dir)
        except OSError:
            continue
        sorted_names.sort()
        for filename in sorted_names:
            # print filename
            if filename.startswith('monitor_last_request_'):

                # read last request helper file

                mon_file_name = os.path.join(abs_mon_dir, filename)
                print 'found ' + mon_file_name
                last_request_dict = unpickle(mon_file_name, logger)
                if not last_request_dict:
                    print 'could not open and unpickle: '\
                        + mon_file_name
                    continue
                if not last_request_dict.has_key('CREATED_TIME'):
                    print 'skip broken last request dict: '\
                        + mon_file_name
                    continue

                difference = datetime.datetime.now()\
                    - last_request_dict['CREATED_TIME']
                days = str(difference.days)
                hours = str(difference.seconds / 3600)
                minutes = str((difference.seconds % 3600) / 60)
                seconds = str((difference.seconds % 60) % 60)

                last_timetuple = last_request_dict['CREATED_TIME'].timetuple()

                if last_request_dict.has_key('CPUTIME'):
                    cputime = last_request_dict['CPUTIME']
                elif last_request_dict.has_key('cputime'):
                    cputime = last_request_dict['cputime']
                else:
                    print 'ERROR: last request does not contain cputime field!: %s'\
                        % last_request_dict
                    continue

                try:
                    cpusec = int(cputime)
                except ValueError:
                    try:
                        cpusec = int(float(cputime))
                    except ValueError, verr:
                        print 'ERROR: failed to parse cputime %s: %s'\
                            % (cputime, verr)

                # Include execution delay guesstimate for strict fill
                # LRMS resources

                try:
                    delay = int(last_request_dict['EXECUTION_DELAY'])
                except KeyError:
                    delay = 0
                except ValueError:
                    delay = 0

                time_remaining = (last_request_dict['CREATED_TIME']
                                  + datetime.timedelta(seconds=cpusec)
                                  + datetime.timedelta(seconds=delay))\
                    - datetime.datetime.now()
                days_rem = str(time_remaining.days)
                hours_rem = str(time_remaining.seconds / 3600)
                minutes_rem = str((time_remaining.seconds % 3600) / 60)
                seconds_rem = str((time_remaining.seconds % 60) % 60)

                if time_remaining.days < -7:
                    try:
                        print 'removing: %s as we havent seen him for %s days.'\
                            % (mon_file_name, abs(time_remaining).days)
                        os.remove(mon_file_name)
                    except Exception, err:
                        print "could not remove: '%s' Error: %s"\
                            % (mon_file_name, str(err))
                    pass
                else:
                    unique_res_name_and_exe_list = \
                        filename.split('monitor_last_request_', 1)
                    if cpusec == 0:
                        resource_status = 'unavailable'
                    elif time_remaining.days < 0:

                        # time_remaining.days < 0 means that we have passed the specified time

                        time_rem_abs = abs(time_remaining)
                        if time_rem_abs.days == 0\
                                and int(time_rem_abs.seconds)\
                                < int(slackperiod):
                            resource_status = 'slack'
                            slack_count = slack_count + 1
                        else:
                            resource_status = 'offline'
                            down_count = down_count + 1
                    else:
                        resource_status = 'online'
                        up_count = up_count + 1

                    exes += '<tr>'
                    exes += \
                        '<td><img src=/images/status-icons/%s.png /></td>'\
                        % resource_status
                    public_id = unique_res_name_and_exe_list[1]
                    if last_request_dict['RESOURCE_CONFIG'].get(
                            'ANONYMOUS', True):
                        public_id = anon_resource_id(public_id)
                    public_name = last_request_dict['RESOURCE_CONFIG'].get(
                        'PUBLICNAME', '')
                    resource_parts = public_id.split('_', 2)
                    resource_name = "<a href='viewres.py?unique_resource_name=%s'>%s</a>" % \
                                    (resource_parts[0], resource_parts[0])
                    if public_name:
                        resource_name += "<br />(alias %s)" % public_name
                    else:
                        resource_name += "<br />(no alias)"
                    resource_name += "<br />%s" % resource_parts[1]
                    exes += '<td>%s</td>' % resource_name

                    last_asctime = time.asctime(last_timetuple)
                    last_epoch = time.mktime(last_timetuple)
                    exes += '<td><div class="sortkey">%s</div>%s<br />' %  \
                            (last_epoch, last_asctime)
                    exes += '(%sd %sh %sm %ss ago)</td>' % (days, hours,
                                                            minutes, seconds)
                    exes += '<td>' + vgrid_name + '</td>'
                    runtime_envs = last_request_dict['RESOURCE_CONFIG'][
                        'RUNTIMEENVIRONMENT']
                    runtime_envs.sort()
                    re_list_text = ', '.join([i[0] for i in runtime_envs])
                    exes += '<td title="%s">' % re_list_text \
                        + str(len(runtime_envs)) + '</td>'
                    exes += '<td>'\
                        + str(last_request_dict['RESOURCE_CONFIG'
                                                ]['CPUTIME']) + '</td><td>'\
                        + str(last_request_dict['RESOURCE_CONFIG'
                                                ]['NODECOUNT']) + '</td><td>'\
                        + str(last_request_dict['RESOURCE_CONFIG'
                                                ]['CPUCOUNT']) + '</td><td>'\
                        + str(last_request_dict['RESOURCE_CONFIG'
                                                ]['DISK']) + '</td><td>'\
                        + str(last_request_dict['RESOURCE_CONFIG'
                                                ]['MEMORY']) + '</td><td>'\
                        + str(last_request_dict['RESOURCE_CONFIG'
                                                ]['ARCHITECTURE']) + '</td>'
                    exes += '<td>' + last_request_dict['STATUS']\
                        + '</td><td>' + str(last_request_dict['CPUTIME'
                                                              ]) + '</td>'

                    exes += '<td class=status_%s>' % resource_status
                    if 'unavailable' == resource_status:
                        exes += '-'
                    elif 'slack' == resource_status:
                        exes += 'Within slack period (%s < %s secs)'\
                            % (time_rem_abs.seconds, slackperiod)
                    elif 'offline' == resource_status:
                        exes += 'down?'
                    else:
                        exes += '%sd, %sh, %sm, %ss'\
                            % (days_rem, hours_rem, minutes_rem,
                                seconds_rem)
                    exes += '</td>'

                    exes += '</tr>\n'
                    if last_request_dict['STATUS'] == 'Job assigned':
                        job_assigned = job_assigned + 1
                        job_assigned_cpus = job_assigned_cpus\
                            + int(last_request_dict['RESOURCE_CONFIG'
                                                    ]['NODECOUNT'])\
                            * int(last_request_dict['RESOURCE_CONFIG'
                                                    ]['CPUCOUNT'])

                    total_number_of_exe_resources += 1
                    total_number_of_exe_cpus += int(
                        last_request_dict['RESOURCE_CONFIG']['NODECOUNT']) \
                        * int(last_request_dict['RESOURCE_CONFIG']['CPUCOUNT'])
            elif filename.startswith('monitor_last_status_'):

                # store must be linked to this vgrid, not only parent vgrid:
                # inheritance only covers access, not automatic participation

                if current_dir != vgrid_name:
                    continue

                # read last resource action status file

                mon_file_name = os.path.join(abs_mon_dir, filename)
                print 'found ' + mon_file_name
                last_status_dict = unpickle(mon_file_name, logger)
                if not last_status_dict:
                    print 'could not open and unpickle: '\
                        + mon_file_name
                    continue
                if not last_status_dict.has_key('CREATED_TIME'):
                    print 'skip broken last request dict: '\
                        + mon_file_name
                    continue

                difference = datetime.datetime.now()\
                    - last_status_dict['CREATED_TIME']
                days = str(difference.days)
                hours = str(difference.seconds / 3600)
                minutes = str((difference.seconds % 3600) / 60)
                seconds = str((difference.seconds % 60) % 60)

                if last_status_dict['STATUS'] == 'stopped':
                    time_stopped = datetime.datetime.now() - \
                        last_status_dict['CREATED_TIME']
                    if time_stopped.days > 7:
                        try:
                            print 'removing: %s as we havent seen him for %s days.'\
                                  % (mon_file_name, abs(time_stopped).days)
                            os.remove(mon_file_name)
                        except Exception, err:
                            print "could not remove: '%s' Error: %s"\
                                  % (mon_file_name, str(err))
                        continue
예제 #2
0
                            print 'removing: %s as we havent seen him for %s days.'\
                                  % (mon_file_name, abs(time_stopped).days)
                            os.remove(mon_file_name)
                        except Exception, err:
                            print "could not remove: '%s' Error: %s"\
                                  % (mon_file_name, str(err))
                        continue

                unique_res_name_and_store_list = filename.split(
                    'monitor_last_status_', 1)
                mount_point = last_status_dict.get('MOUNT_POINT', 'UNKNOWN')
                is_live = os.path.ismount(mount_point)

                public_id = unique_res_name_and_store_list[1]
                if last_status_dict['RESOURCE_CONFIG'].get('ANONYMOUS', True):
                    public_id = anon_resource_id(public_id)

                vgrid_link = os.path.join(configuration.vgrid_files_home,
                                          vgrid_name, public_id)
                is_linked = (os.path.realpath(vgrid_link) == mount_point)

                total_disk = last_status_dict['RESOURCE_CONFIG']['DISK']
                free_disk, avail_disk, used_disk, used_percent = 0, 0, 0, 0
                gig_bytes = 1.0 * 2**30

                # Fall back status - show last action unless statvfs succeeds

                last_status = last_status_dict['STATUS']
                last_timetuple = last_status_dict['CREATED_TIME'].timetuple()

                # These disk stats are slightly confusing but match 'df'
예제 #3
0
def validate(configuration, job, vgrid_resource_map, job_cond, errors):
    """Search the resources listed in vgrid_resource_map for the one that
    best matches job and report the resulting readiness condition.

    Each resource for which a configuration can be fetched is run through
    the availability checks and the requested specification checks. The
    job_cond and errors dictionaries are used as per-resource scratch space:
    they are filled for a resource and cleared again before the next one,
    except when the search stops early on a GREEN candidate. The resource
    independent file and notify checks are run once after the search and
    stored in job_cond.

    Returns a (best_job_cond, best_errors) tuple. If no candidate resource
    was found, the returned errors object is the errors argument itself with
    a 'RESOURCE' entry added.
    """

    # Checks sharing the (configuration, job, resource, errors) signature,
    # listed in the order they are applied
    spec_checks = (
        ('ARCHITECTURE', validate_architecture),
        ('CPUCOUNT', validate_cpucount),
        ('CPUTIME', validate_cputime),
        ('DISK', validate_disk),
        ('JOBTYPE', validate_jobtype),
        ('MAXPRICE', validate_maxprice),
        ('MEMORY', validate_memory),
        ('NODECOUNT', validate_nodecount),
        ('PLATFORM', validate_platform),
    )
    # Checks sharing the (configuration, job, errors) signature that do not
    # depend on any particular resource
    independent_checks = (
        ('INPUTFILES', validate_inputfiles),
        ('EXECUTABLES', validate_executables),
        ('VERIFYFILES', validate_verifyfiles),
        ('NOTIFY', validate_notify),
    )

    top_cond = {'JOB_COND': RED}
    top_errors = errors
    have_candidate = False
    independent_errors = {}

    # Fetch the resource map once instead of once per resource

    all_resources = get_resource_map(configuration)

    for (vgrid, resource_ids) in vgrid_resource_map.items():

        # A GREEN candidate ends the search across all vgrids

        if top_cond['JOB_COND'] == GREEN:
            break

        for resource_id in resource_ids:
            resource = get_resource_configuration(configuration, resource_id,
                                                  all_resources)

            # vgrid_resource_map may list e.g. deleted resources

            if not resource:
                continue

            # Resource availability

            job_cond['REGISTERED'] = validate_resource_seen(
                configuration, resource, errors, resource_id)
            job_cond['SEEN_WITHIN_X'] = validate_resource_seen_within_x_hours(
                configuration, resource, errors, resource_id)

            # Requested mRSL specs

            for (field, check) in spec_checks:
                job_cond[field] = check(configuration, job, resource, errors)
            job_cond['RETRIES'] = validate_retries(configuration, job, errors)
            job_cond['RUNTIMEENVIRONMENT'] = validate_runtimeenvironment(
                configuration, job, resource, errors)
            job_cond['SANDBOX'] = validate_sandbox(
                configuration, job, resource, errors)

            job_cond['suggested_vgrid'] = vgrid
            job_cond['suggested_resource'] = anon_resource_id(
                resource_id, False)

            set_pass_level(configuration, job_cond)

            # Keep this candidate if its condition compares at least as high
            # as the best so far and, once a best exists, it carries no more
            # errors than that one.

            candidate_value = threshold_color_to_value(job_cond['JOB_COND'])
            top_value = threshold_color_to_value(top_cond['JOB_COND'])
            if candidate_value >= top_value and \
                    (not have_candidate or len(errors) <= len(top_errors)):
                have_candidate = True
                top_cond = job_cond.copy()
                top_errors = errors.copy()

                # Leave without clearing the scratch dictionaries

                if top_cond['JOB_COND'] == GREEN:
                    break

            job_cond.clear()
            errors.clear()

    # Resource independent checks only need a single run

    for (field, check) in independent_checks:
        job_cond[field] = check(configuration, job, independent_errors)

    if have_candidate:
        top_cond.update(job_cond)
    else:
        top_cond['RESOURCE'] = False
        top_errors['RESOURCE'] = 'No valid/allowed resources available.'

    # Any resource independent error downgrades a GREEN condition to YELLOW

    if independent_errors:
        top_errors.update(independent_errors)
        if top_cond['JOB_COND'] == GREEN:
            top_cond['JOB_COND'] = YELLOW
    return (top_cond, top_errors)
예제 #4
0
def create_monitor(vgrid_name):
    """Write monitor HTML file for vgrid_name"""

    html_file = os.path.join(configuration.vgrid_home, vgrid_name, "%s.html" % configuration.vgrid_monitor)

    print "collecting statistics for VGrid %s" % vgrid_name
    sleep_secs = configuration.sleep_secs
    slackperiod = configuration.slackperiod
    now = time.asctime(time.localtime())

    html_vars = {
        "sleep_secs": sleep_secs,
        "vgrid_name": vgrid_name,
        "logo_url": "/images/logo.jpg",
        "now": now,
        "short_title": configuration.short_title,
    }

    html = get_cgi_html_header(
        configuration,
        "%(short_title)s Monitor, VGrid %(vgrid_name)s" % html_vars,
        "",
        True,
        """<meta http-equiv="refresh" content="%(sleep_secs)s" />
        """
        % html_vars,
        themed_styles(configuration),
        """
<script type="text/javascript" src="/images/js/jquery.js"></script>
<script type="text/javascript" src="/images/js/jquery.tablesorter.js"></script>

<script type="text/javascript" >

$(document).ready(function() {

          // table initially sorted by col. 1 (name)
          var sortOrder = [[1,0]];

          // use image path for sorting if there is any inside
          var imgTitle = function(contents) {
              var key = $(contents).find("a").attr("class");
              if (key == null) {
                  key = $(contents).html();
              }
              return key;
          }
          $("table.monitor").tablesorter({widgets: ["zebra"],
                                          textExtraction: imgTitle,
                                         });
          $("table.monitor").each(function () {
              try {
                  $(this).trigger("sorton", [sortOrder]);
              } catch(err) {
                  /* tablesorter chokes on empty tables - just continue */
              }
          });
     }
);
</script>
        """,
        "",
        False,
    )
    html += (
        """
<!-- end of raw header: this line is used by showvgridmonitor -->
<h1>Statistics/monitor for the %(vgrid_name)s VGrid</h1>
<div class="generatornote smallcontent">
This page was generated %(now)s (automatic refresh every %(sleep_secs)s secs).
</div>
"""
        % html_vars
    )

    # loop and get totals

    parse_count = 0
    queued_count = 0
    frozen_count = 0
    executing_count = 0
    finished_count = 0
    failed_count = 0
    retry_count = 0
    canceled_count = 0

    cpucount_requested = 0
    cpucount_done = 0
    nodecount_requested = 0
    nodecount_done = 0
    cputime_requested = 0
    cputime_done = 0
    used_walltime = 0
    disk_requested = 0
    disk_done = 0
    memory_requested = 0
    memory_done = 0
    runtimeenv_dict = {"": 0}
    runtimeenv_requested = 0
    runtimeenv_done = 0

    number_of_jobs = 0
    up_count = 0
    down_count = 0
    slack_count = 0

    job_assigned = 0
    job_assigned_cpus = 0

    gstat = GridStat(configuration, logger)

    runtimeenv_dict = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "RUNTIMEENVIRONMENT", {})

    parse_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "PARSE")
    queued_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "QUEUED")
    frozen_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "FROZEN")
    executing_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "EXECUTING")
    failed_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "FAILED")
    retry_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "RETRY")
    canceled_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "CANCELED")
    expired_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "EXPIRED")
    finished_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "FINISHED")

    nodecount_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "NODECOUNT_REQ")
    nodecount_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "NODECOUNT_DONE")
    cputime_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "CPUTIME_REQ")
    cputime_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "CPUTIME_DONE")

    used_walltime = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "USED_WALLTIME")

    if used_walltime == 0:
        used_walltime = datetime.timedelta(0)

    used_walltime = format_timedelta(used_walltime)

    disk_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "DISK_REQ")
    disk_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "DISK_DONE")
    memory_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "MEMORY_REQ")
    memory_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "MEMORY_DONE")
    cpucount_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "CPUCOUNT_REQ")
    cpucount_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "CPUCOUNT_DONE")
    runtimeenv_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "RUNTIMEENVIRONMENT_REQ")
    runtimeenv_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "RUNTIMEENVIRONMENT_DONE")

    number_of_jobs = parse_count
    number_of_jobs += queued_count
    number_of_jobs += frozen_count
    number_of_jobs += expired_count
    number_of_jobs += canceled_count
    number_of_jobs += failed_count
    number_of_jobs += executing_count
    number_of_jobs += finished_count
    number_of_jobs += retry_count

    html_vars = {
        "parse_count": parse_count,
        "queued_count": queued_count,
        "frozen_count": frozen_count,
        "executing_count": executing_count,
        "failed_count": failed_count,
        "retry_count": retry_count,
        "canceled_count": canceled_count,
        "expired_count": expired_count,
        "finished_count": finished_count,
        "number_of_jobs": number_of_jobs,
        "cpucount_requested": cpucount_requested,
        "cpucount_done": cpucount_done,
        "nodecount_requested": nodecount_requested,
        "nodecount_done": nodecount_done,
        "cputime_requested": cputime_requested,
        "cputime_done": cputime_done,
        "used_walltime": used_walltime,
        "disk_requested": disk_requested,
        "disk_done": disk_done,
        "memory_requested": memory_requested,
        "memory_done": memory_done,
        "runtimeenv_requested": runtimeenv_requested,
        "runtimeenv_done": runtimeenv_done,
    }

    html += (
        """<h2>Job Stats</h2><table class=monitorstats><tr><td>
<table class=monitorjobs><tr class=title><td>Job State</td><td>Number of jobs</td></tr>
<tr><td>Parse</td><td>%(parse_count)s</td></tr>
<tr><td>Queued</td><td>%(queued_count)s</td></tr>
<tr><td>Frozen</td><td>%(frozen_count)s</td></tr>
<tr><td>Executing</td><td>%(executing_count)s</td></tr>
<tr><td>Failed</td><td>%(failed_count)s</td></tr>
<tr><td>Retry</td><td>%(retry_count)s</td></tr>
<tr><td>Canceled</td><td>%(canceled_count)s</td></tr>
<tr><td>Expired</td><td>%(expired_count)s</td></tr>
<tr><td>Finished</td><td>%(finished_count)s</td></tr>
<tr><td>Total</td><td>%(number_of_jobs)s</td></tr>
</table>
</td><td>
<table class=monitorresreq>
<tr class=title><td>Requirement</td><td>Requested</td><td>Done</td></tr>
<tr><td>Cpucount</td><td>%(cpucount_requested)s</td><td>%(cpucount_done)s</td></tr>
<tr><td>Nodecount</td><td>%(nodecount_requested)s</td><td>%(nodecount_done)s</td></tr>
<tr><td>Cputime</td><td>%(cputime_requested)s</td><td>%(cputime_done)s</td></tr>
<tr><td>GB Disk</td><td>%(disk_requested)s</td><td>%(disk_done)s</td></tr>
<tr><td>MB Memory</td><td>%(memory_requested)s</td><td>%(memory_done)s</td></tr>
<tr><td>Runtime Envs</td><td>%(runtimeenv_requested)s</td><td>%(runtimeenv_done)s</td></tr>
<tr><td>Used Walltime</td><td colspan='2'>%(used_walltime)s</td></tr>
</table><br />
</td><td>
<div class=monitorruntimeenvdetails>
<table class=monitorruntimeenvdone>
<tr class=title><td>Runtime Envs Done</td><td></td></tr>
"""
        % html_vars
    )

    if len(runtimeenv_dict.keys()) < 1:

        # No runtimeenv requests

        html += "<tr><td></td><td>-</td></tr>\n"
    else:
        for entry in runtimeenv_dict.keys():
            if not entry == "":
                html += "<tr><td>" + entry + "</td><td>" + str(runtimeenv_dict[entry]) + "</td></tr>\n"

    total_number_of_exe_resources, total_number_of_store_resources = 0, 0
    total_number_of_exe_cpus, total_number_of_store_gigs = 0, 0

    vgrid_name_list = vgrid_name.split("/")
    current_dir = ""

    exes, stores = "", ""
    for vgrid_name_part in vgrid_name_list:
        current_dir = os.path.join(current_dir, vgrid_name_part)
        abs_mon_dir = os.path.join(configuration.vgrid_home, current_dir)
        # print 'dir: %s' % abs_mon_dir
        # Potential race - just ignore if it disappeared
        try:
            sorted_names = os.listdir(abs_mon_dir)
        except OSError:
            continue
        sorted_names.sort()
        for filename in sorted_names:
            # print filename
            if filename.startswith("monitor_last_request_"):

                # read last request helper file

                mon_file_name = os.path.join(abs_mon_dir, filename)
                print "found " + mon_file_name
                last_request_dict = unpickle(mon_file_name, logger)
                if not last_request_dict:
                    print "could not open and unpickle: " + mon_file_name
                    continue

                difference = datetime.datetime.now() - last_request_dict["CREATED_TIME"]
                days = str(difference.days)
                hours = str(difference.seconds / 3600)
                minutes = str((difference.seconds % 3600) / 60)
                seconds = str((difference.seconds % 60) % 60)

                if last_request_dict.has_key("CPUTIME"):
                    cputime = last_request_dict["CPUTIME"]
                elif last_request_dict.has_key("cputime"):
                    cputime = last_request_dict["cputime"]
                else:
                    print "ERROR: last request does not contain cputime field!: %s" % last_request_dict
                    continue

                try:
                    cpusec = int(cputime)
                except ValueError:
                    try:
                        cpusec = int(float(cputime))
                    except ValueError, verr:
                        print "ERROR: failed to parse cputime %s: %s" % (cputime, verr)

                # Include execution delay guesstimate for strict fill
                # LRMS resources

                try:
                    delay = int(last_request_dict["EXECUTION_DELAY"])
                except KeyError:
                    delay = 0
                except ValueError:
                    delay = 0

                time_remaining = (
                    last_request_dict["CREATED_TIME"]
                    + datetime.timedelta(seconds=cpusec)
                    + datetime.timedelta(seconds=delay)
                ) - datetime.datetime.now()
                days_rem = str(time_remaining.days)
                hours_rem = str(time_remaining.seconds / 3600)
                minutes_rem = str((time_remaining.seconds % 3600) / 60)
                seconds_rem = str((time_remaining.seconds % 60) % 60)

                if time_remaining.days < -7:
                    try:
                        print "removing: %s as we havent seen him for %s days." % (
                            mon_file_name,
                            abs(time_remaining).days,
                        )
                        os.remove(mon_file_name)
                    except Exception, err:
                        print "could not remove: '%s' Error: %s" % (mon_file_name, str(err))
                    pass
                else:
                    unique_res_name_and_exe_list = filename.split("monitor_last_request_", 1)
                    if cpusec == 0:
                        resource_status = "unavailable"
                    elif time_remaining.days < 0:

                        # time_remaining.days < 0 means that we have passed the specified time

                        time_rem_abs = abs(time_remaining)
                        if time_rem_abs.days == 0 and int(time_rem_abs.seconds) < int(slackperiod):
                            resource_status = "slack"
                            slack_count = slack_count + 1
                        else:
                            resource_status = "offline"
                            down_count = down_count + 1
                    else:
                        resource_status = "online"
                        up_count = up_count + 1

                    exes += "<tr>"
                    exes += "<td><img src=/images/status-icons/%s.png /></td>" % resource_status
                    public_id = unique_res_name_and_exe_list[1]
                    if last_request_dict["RESOURCE_CONFIG"].get("ANONYMOUS", True):
                        public_id = anon_resource_id(public_id)
                    public_name = last_request_dict["RESOURCE_CONFIG"].get("PUBLICNAME", "")
                    resource_parts = public_id.split("_", 2)
                    resource_name = "<a href='viewres.py?unique_resource_name=%s'>%s</a>" % (
                        resource_parts[0],
                        resource_parts[0],
                    )
                    if public_name:
                        resource_name += "<br />(alias %s)" % public_name
                    else:
                        resource_name += "<br />(no alias)"
                    resource_name += "<br />%s" % resource_parts[1]
                    exes += "<td>%s</td>" % resource_name

                    exes += "<td>%s<br />(%sd %sh %sm %ss ago)</td>" % (
                        time.asctime(last_request_dict["CREATED_TIME"].timetuple()),
                        days,
                        hours,
                        minutes,
                        seconds,
                    )
                    exes += "<td>" + vgrid_name + "</td>"
                    runtime_envs = last_request_dict["RESOURCE_CONFIG"]["RUNTIMEENVIRONMENT"]
                    re_list_text = ", ".join([i[0] for i in runtime_envs])
                    exes += '<td title="%s">' % re_list_text + str(len(runtime_envs)) + "</td>"
                    exes += (
                        "<td>"
                        + str(last_request_dict["RESOURCE_CONFIG"]["CPUTIME"])
                        + "</td><td>"
                        + str(last_request_dict["RESOURCE_CONFIG"]["NODECOUNT"])
                        + "</td><td>"
                        + str(last_request_dict["RESOURCE_CONFIG"]["CPUCOUNT"])
                        + "</td><td>"
                        + str(last_request_dict["RESOURCE_CONFIG"]["DISK"])
                        + "</td><td>"
                        + str(last_request_dict["RESOURCE_CONFIG"]["MEMORY"])
                        + "</td><td>"
                        + str(last_request_dict["RESOURCE_CONFIG"]["ARCHITECTURE"])
                        + "</td>"
                    )
                    exes += (
                        "<td>" + last_request_dict["STATUS"] + "</td><td>" + str(last_request_dict["CPUTIME"]) + "</td>"
                    )

                    exes += "<td class=status_%s>" % resource_status
                    if "unavailable" == resource_status:
                        exes += "-"
                    elif "slack" == resource_status:
                        exes += "Within slack period (%s < %s secs)" % (time_rem_abs.seconds, slackperiod)
                    elif "offline" == resource_status:
                        exes += "down?"
                    else:
                        exes += "%sd, %sh, %sm, %ss" % (days_rem, hours_rem, minutes_rem, seconds_rem)
                    exes += "</td>"

                    exes += "</tr>\n"
                    if last_request_dict["STATUS"] == "Job assigned":
                        job_assigned = job_assigned + 1
                        job_assigned_cpus = job_assigned_cpus + int(
                            last_request_dict["RESOURCE_CONFIG"]["NODECOUNT"]
                        ) * int(last_request_dict["RESOURCE_CONFIG"]["CPUCOUNT"])

                    total_number_of_exe_resources += 1
                    total_number_of_exe_cpus += int(last_request_dict["RESOURCE_CONFIG"]["NODECOUNT"]) * int(
                        last_request_dict["RESOURCE_CONFIG"]["CPUCOUNT"]
                    )
            elif filename.startswith("monitor_last_status_"):

                # store must be linked to this vgrid, not only parent vgrid:
                # inheritance only covers access, not automatic participation

                if current_dir != vgrid_name:
                    continue

                # read last resource action status file

                mon_file_name = os.path.join(abs_mon_dir, filename)
                print "found " + mon_file_name
                last_status_dict = unpickle(mon_file_name, logger)
                if not last_status_dict:
                    print "could not open and unpickle: " + mon_file_name
                    continue

                difference = datetime.datetime.now() - last_status_dict["CREATED_TIME"]
                days = str(difference.days)
                hours = str(difference.seconds / 3600)
                minutes = str((difference.seconds % 3600) / 60)
                seconds = str((difference.seconds % 60) % 60)

                if last_status_dict["STATUS"] == "stopped":
                    time_stopped = datetime.datetime.now() - last_status_dict["CREATED_TIME"]
                    if time_stopped.days > 7:
                        try:
                            print "removing: %s as we havent seen him for %s days." % (
                                mon_file_name,
                                abs(time_stopped).days,
                            )
                            os.remove(mon_file_name)
                        except Exception, err:
                            print "could not remove: '%s' Error: %s" % (mon_file_name, str(err))
                        continue
예제 #5
0
                            print "removing: %s as we havent seen him for %s days." % (
                                mon_file_name,
                                abs(time_stopped).days,
                            )
                            os.remove(mon_file_name)
                        except Exception, err:
                            print "could not remove: '%s' Error: %s" % (mon_file_name, str(err))
                        continue

                unique_res_name_and_store_list = filename.split("monitor_last_status_", 1)
                mount_point = last_status_dict.get("MOUNT_POINT", "UNKNOWN")
                is_live = os.path.ismount(mount_point)

                public_id = unique_res_name_and_store_list[1]
                if last_status_dict["RESOURCE_CONFIG"].get("ANONYMOUS", True):
                    public_id = anon_resource_id(public_id)

                vgrid_link = os.path.join(configuration.vgrid_files_home, vgrid_name, public_id)
                is_linked = os.path.realpath(vgrid_link) == mount_point

                total_disk = last_status_dict["RESOURCE_CONFIG"]["DISK"]
                free_disk, avail_disk, used_disk, used_percent = 0, 0, 0, 0
                gig_bytes = 1.0 * 2 ** 30

                # Fall back status - show last action unless statvfs succeeds

                store_status = "<td>%s %s<br />(%sd %sh %sm %ss ago)</td>" % (
                    last_status_dict["STATUS"],
                    time.asctime(last_status_dict["CREATED_TIME"].timetuple()),
                    days,
                    hours,
예제 #6
0
def validate(configuration, job, vgrid_resource_map, job_cond, errors):
    """Find the best readiness condition for a submitted job across the
    resources listed in vgrid_resource_map.

    The supplied job_cond and errors dictionaries are used as scratch
    space: they are filled in for each resource and cleared again
    afterwards, except when the search stops early on a GREEN result.
    Returns a (best_job_cond, best_errors) tuple.
    """

    top_cond = {'JOB_COND': RED}

    # Refers to the caller's errors dict until a candidate is copied in

    top_errors = errors
    have_candidate = False
    shared_errors = {}

    # Checks taking (configuration, job, resource, errors) in the order
    # they are run; the RETRIES check sits between the two groups

    checks_before_retries = (
        ('ARCHITECTURE', validate_architecture),
        ('CPUCOUNT', validate_cpucount),
        ('CPUTIME', validate_cputime),
        ('DISK', validate_disk),
        ('JOBTYPE', validate_jobtype),
        ('MAXPRICE', validate_maxprice),
        ('MEMORY', validate_memory),
        ('NODECOUNT', validate_nodecount),
        ('PLATFORM', validate_platform),
    )
    checks_after_retries = (
        ('RUNTIMEENVIRONMENT', validate_runtimeenvironment),
        ('SANDBOX', validate_sandbox),
    )

    # Fetch the resource map once rather than once per resource

    all_resources = get_resource_map(configuration)
    for (vgrid, resources) in vgrid_resource_map.items():

        if top_cond['JOB_COND'] == GREEN:
            break

        for resource_id in resources:

            res_conf = get_resource_configuration(
                configuration, resource_id, all_resources)

            # The map may hold entries without a configuration, e.g.
            # deleted resources

            if not res_conf:
                continue

            # Resource availability

            job_cond['REGISTERED'] = validate_resource_seen(
                configuration, res_conf, errors, resource_id)
            job_cond['SEEN_WITHIN_X'] = validate_resource_seen_within_x_hours(
                configuration, res_conf, errors, resource_id)

            # Requested mRSL specs

            for (field, check) in checks_before_retries:
                job_cond[field] = check(configuration, job, res_conf, errors)
            job_cond['RETRIES'] = validate_retries(configuration, job, errors)
            for (field, check) in checks_after_retries:
                job_cond[field] = check(configuration, job, res_conf, errors)

            job_cond['suggested_vgrid'] = vgrid
            job_cond['suggested_resource'] = anon_resource_id(
                resource_id, False)

            set_pass_level(configuration, job_cond)

            # Keep this resource if it is at least as good as the best one
            # so far and it is the first candidate or has no more errors

            candidate_rank = threshold_color_to_value(job_cond['JOB_COND'])
            top_rank = threshold_color_to_value(top_cond['JOB_COND'])
            if candidate_rank >= top_rank and (
                    not have_candidate or len(errors) <= len(top_errors)):
                have_candidate = True
                top_cond = job_cond.copy()
                top_errors = errors.copy()

                if top_cond['JOB_COND'] == GREEN:
                    break

            job_cond.clear()
            errors.clear()

    # Resource independent checks only need to run once

    for (field, check) in (
        ('INPUTFILES', validate_inputfiles),
        ('EXECUTABLES', validate_executables),
        ('VERIFYFILES', validate_verifyfiles),
        ('NOTIFY', validate_notify),
    ):
        job_cond[field] = check(configuration, job, shared_errors)

    if have_candidate:
        top_cond.update(job_cond)
    else:
        top_cond['RESOURCE'] = False
        top_errors['RESOURCE'] = 'No valid/allowed resources available.'

    # Errors from the resource independent checks cap the result at YELLOW

    if shared_errors:
        top_errors.update(shared_errors)
        if top_cond['JOB_COND'] == GREEN:
            top_cond['JOB_COND'] = YELLOW
    return (top_cond, top_errors)