# Resource list GUI (CGI script): print one "edit" link per entry found in
# configuration.resource_home, followed by a form with a "New" button that
# posts to resource_edit.py.

# Abort unless SSL_CLIENT_S_DN_CN is set in the environment.
cert_name = str(os.getenv('SSL_CLIENT_S_DN_CN'))
cert_no_spaces = cert_name.replace(' ', '_')
if cert_no_spaces == 'None':
    # str(None) == 'None', i.e. the variable was missing
    sys.exit(1)

configuration = get_configuration_object()
logger = configuration.logger
logger.info('Resource list GUI: start')

# The trailing empty line terminates the CGI header section.
# A single-argument print(...) is also a valid Python 2 print statement.
print('Content-type: text/html\n')

# NOTE(review): form is not referenced in the visible part of this script;
# the call is kept so request parsing still happens exactly as before.
form = cgi.FieldStorage()

print(get_cgi_html_header(configuration, 'Grid Resource administration',
                          'Welcome to the Grid resource administration.'))

dir_list = os.listdir(configuration.resource_home)
for entry in dir_list:
    # Entries are split at the LAST dot into hosturl and hostidentifier.
    # An entry without any dot cannot be split; skip it instead of letting
    # an uncaught ValueError abort the page after the header was sent.
    dot_index = entry.rfind('.')
    if dot_index < 0:
        logger.info('Resource list GUI: skipping entry without a dot: %s'
                    % entry)
        continue
    hosturl = entry[:dot_index]
    hostidentifier = entry[dot_index + 1:]

    edit_url = 'resource_edit.py?hosturl=%s&hostidentifier=%s' \
        % (hosturl, hostidentifier)
    # Escape file-system derived names before embedding them in markup.
    # cgi.escape(..., True) also escapes '"', so the attribute is written
    # with double quotes.
    print('<a href="%s">edit</a> <b>%s</b><br />'
          % (cgi.escape(edit_url, True), cgi.escape(entry)))

print("""
<hr>
<form action="./resource_edit.py" method="post">
<input type="hidden" name="new_resource" value="true" />
<input type="submit" name="New" value="New" />
</form>
""")
def create_monitor(vgrid_name): """Write monitor HTML file for vgrid_name""" html_file = os.path.join(configuration.vgrid_home, vgrid_name, "%s.html" % configuration.vgrid_monitor) print "collecting statistics for VGrid %s" % vgrid_name sleep_secs = configuration.sleep_secs slackperiod = configuration.slackperiod now = time.asctime(time.localtime()) html_vars = { "sleep_secs": sleep_secs, "vgrid_name": vgrid_name, "logo_url": "/images/logo.jpg", "now": now, "short_title": configuration.short_title, } html = get_cgi_html_header( configuration, "%(short_title)s Monitor, VGrid %(vgrid_name)s" % html_vars, "", True, """<meta http-equiv="refresh" content="%(sleep_secs)s" /> """ % html_vars, themed_styles(configuration), """ <script type="text/javascript" src="/images/js/jquery.js"></script> <script type="text/javascript" src="/images/js/jquery.tablesorter.js"></script> <script type="text/javascript" > $(document).ready(function() { // table initially sorted by col. 1 (name) var sortOrder = [[1,0]]; // use image path for sorting if there is any inside var imgTitle = function(contents) { var key = $(contents).find("a").attr("class"); if (key == null) { key = $(contents).html(); } return key; } $("table.monitor").tablesorter({widgets: ["zebra"], textExtraction: imgTitle, }); $("table.monitor").each(function () { try { $(this).trigger("sorton", [sortOrder]); } catch(err) { /* tablesorter chokes on empty tables - just continue */ } }); } ); </script> """, "", False, ) html += ( """ <!-- end of raw header: this line is used by showvgridmonitor --> <h1>Statistics/monitor for the %(vgrid_name)s VGrid</h1> <div class="generatornote smallcontent"> This page was generated %(now)s (automatic refresh every %(sleep_secs)s secs). 
</div> """ % html_vars ) # loop and get totals parse_count = 0 queued_count = 0 frozen_count = 0 executing_count = 0 finished_count = 0 failed_count = 0 retry_count = 0 canceled_count = 0 cpucount_requested = 0 cpucount_done = 0 nodecount_requested = 0 nodecount_done = 0 cputime_requested = 0 cputime_done = 0 used_walltime = 0 disk_requested = 0 disk_done = 0 memory_requested = 0 memory_done = 0 runtimeenv_dict = {"": 0} runtimeenv_requested = 0 runtimeenv_done = 0 number_of_jobs = 0 up_count = 0 down_count = 0 slack_count = 0 job_assigned = 0 job_assigned_cpus = 0 gstat = GridStat(configuration, logger) runtimeenv_dict = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "RUNTIMEENVIRONMENT", {}) parse_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "PARSE") queued_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "QUEUED") frozen_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "FROZEN") executing_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "EXECUTING") failed_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "FAILED") retry_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "RETRY") canceled_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "CANCELED") expired_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "EXPIRED") finished_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "FINISHED") nodecount_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "NODECOUNT_REQ") nodecount_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "NODECOUNT_DONE") cputime_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "CPUTIME_REQ") cputime_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "CPUTIME_DONE") used_walltime = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "USED_WALLTIME") if used_walltime == 0: used_walltime = datetime.timedelta(0) used_walltime = format_timedelta(used_walltime) disk_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "DISK_REQ") disk_done = 
gstat.get_value(gstat.VGRID, vgrid_name.upper(), "DISK_DONE") memory_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "MEMORY_REQ") memory_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "MEMORY_DONE") cpucount_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "CPUCOUNT_REQ") cpucount_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "CPUCOUNT_DONE") runtimeenv_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "RUNTIMEENVIRONMENT_REQ") runtimeenv_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "RUNTIMEENVIRONMENT_DONE") number_of_jobs = parse_count number_of_jobs += queued_count number_of_jobs += frozen_count number_of_jobs += expired_count number_of_jobs += canceled_count number_of_jobs += failed_count number_of_jobs += executing_count number_of_jobs += finished_count number_of_jobs += retry_count html_vars = { "parse_count": parse_count, "queued_count": queued_count, "frozen_count": frozen_count, "executing_count": executing_count, "failed_count": failed_count, "retry_count": retry_count, "canceled_count": canceled_count, "expired_count": expired_count, "finished_count": finished_count, "number_of_jobs": number_of_jobs, "cpucount_requested": cpucount_requested, "cpucount_done": cpucount_done, "nodecount_requested": nodecount_requested, "nodecount_done": nodecount_done, "cputime_requested": cputime_requested, "cputime_done": cputime_done, "used_walltime": used_walltime, "disk_requested": disk_requested, "disk_done": disk_done, "memory_requested": memory_requested, "memory_done": memory_done, "runtimeenv_requested": runtimeenv_requested, "runtimeenv_done": runtimeenv_done, } html += ( """<h2>Job Stats</h2><table class=monitorstats><tr><td> <table class=monitorjobs><tr class=title><td>Job State</td><td>Number of jobs</td></tr> <tr><td>Parse</td><td>%(parse_count)s</td></tr> <tr><td>Queued</td><td>%(queued_count)s</td></tr> <tr><td>Frozen</td><td>%(frozen_count)s</td></tr> 
<tr><td>Executing</td><td>%(executing_count)s</td></tr> <tr><td>Failed</td><td>%(failed_count)s</td></tr> <tr><td>Retry</td><td>%(retry_count)s</td></tr> <tr><td>Canceled</td><td>%(canceled_count)s</td></tr> <tr><td>Expired</td><td>%(expired_count)s</td></tr> <tr><td>Finished</td><td>%(finished_count)s</td></tr> <tr><td>Total</td><td>%(number_of_jobs)s</td></tr> </table> </td><td> <table class=monitorresreq> <tr class=title><td>Requirement</td><td>Requested</td><td>Done</td></tr> <tr><td>Cpucount</td><td>%(cpucount_requested)s</td><td>%(cpucount_done)s</td></tr> <tr><td>Nodecount</td><td>%(nodecount_requested)s</td><td>%(nodecount_done)s</td></tr> <tr><td>Cputime</td><td>%(cputime_requested)s</td><td>%(cputime_done)s</td></tr> <tr><td>GB Disk</td><td>%(disk_requested)s</td><td>%(disk_done)s</td></tr> <tr><td>MB Memory</td><td>%(memory_requested)s</td><td>%(memory_done)s</td></tr> <tr><td>Runtime Envs</td><td>%(runtimeenv_requested)s</td><td>%(runtimeenv_done)s</td></tr> <tr><td>Used Walltime</td><td colspan='2'>%(used_walltime)s</td></tr> </table><br /> </td><td> <div class=monitorruntimeenvdetails> <table class=monitorruntimeenvdone> <tr class=title><td>Runtime Envs Done</td><td></td></tr> """ % html_vars ) if len(runtimeenv_dict.keys()) < 1: # No runtimeenv requests html += "<tr><td></td><td>-</td></tr>\n" else: for entry in runtimeenv_dict.keys(): if not entry == "": html += "<tr><td>" + entry + "</td><td>" + str(runtimeenv_dict[entry]) + "</td></tr>\n" total_number_of_exe_resources, total_number_of_store_resources = 0, 0 total_number_of_exe_cpus, total_number_of_store_gigs = 0, 0 vgrid_name_list = vgrid_name.split("/") current_dir = "" exes, stores = "", "" for vgrid_name_part in vgrid_name_list: current_dir = os.path.join(current_dir, vgrid_name_part) abs_mon_dir = os.path.join(configuration.vgrid_home, current_dir) # print 'dir: %s' % abs_mon_dir # Potential race - just ignore if it disappeared try: sorted_names = os.listdir(abs_mon_dir) except OSError: 
continue sorted_names.sort() for filename in sorted_names: # print filename if filename.startswith("monitor_last_request_"): # read last request helper file mon_file_name = os.path.join(abs_mon_dir, filename) print "found " + mon_file_name last_request_dict = unpickle(mon_file_name, logger) if not last_request_dict: print "could not open and unpickle: " + mon_file_name continue difference = datetime.datetime.now() - last_request_dict["CREATED_TIME"] days = str(difference.days) hours = str(difference.seconds / 3600) minutes = str((difference.seconds % 3600) / 60) seconds = str((difference.seconds % 60) % 60) if last_request_dict.has_key("CPUTIME"): cputime = last_request_dict["CPUTIME"] elif last_request_dict.has_key("cputime"): cputime = last_request_dict["cputime"] else: print "ERROR: last request does not contain cputime field!: %s" % last_request_dict continue try: cpusec = int(cputime) except ValueError: try: cpusec = int(float(cputime)) except ValueError, verr: print "ERROR: failed to parse cputime %s: %s" % (cputime, verr) # Include execution delay guesstimate for strict fill # LRMS resources try: delay = int(last_request_dict["EXECUTION_DELAY"]) except KeyError: delay = 0 except ValueError: delay = 0 time_remaining = ( last_request_dict["CREATED_TIME"] + datetime.timedelta(seconds=cpusec) + datetime.timedelta(seconds=delay) ) - datetime.datetime.now() days_rem = str(time_remaining.days) hours_rem = str(time_remaining.seconds / 3600) minutes_rem = str((time_remaining.seconds % 3600) / 60) seconds_rem = str((time_remaining.seconds % 60) % 60) if time_remaining.days < -7: try: print "removing: %s as we havent seen him for %s days." 
% ( mon_file_name, abs(time_remaining).days, ) os.remove(mon_file_name) except Exception, err: print "could not remove: '%s' Error: %s" % (mon_file_name, str(err)) pass else: unique_res_name_and_exe_list = filename.split("monitor_last_request_", 1) if cpusec == 0: resource_status = "unavailable" elif time_remaining.days < 0: # time_remaining.days < 0 means that we have passed the specified time time_rem_abs = abs(time_remaining) if time_rem_abs.days == 0 and int(time_rem_abs.seconds) < int(slackperiod): resource_status = "slack" slack_count = slack_count + 1 else: resource_status = "offline" down_count = down_count + 1 else: resource_status = "online" up_count = up_count + 1 exes += "<tr>" exes += "<td><img src=/images/status-icons/%s.png /></td>" % resource_status public_id = unique_res_name_and_exe_list[1] if last_request_dict["RESOURCE_CONFIG"].get("ANONYMOUS", True): public_id = anon_resource_id(public_id) public_name = last_request_dict["RESOURCE_CONFIG"].get("PUBLICNAME", "") resource_parts = public_id.split("_", 2) resource_name = "<a href='viewres.py?unique_resource_name=%s'>%s</a>" % ( resource_parts[0], resource_parts[0], ) if public_name: resource_name += "<br />(alias %s)" % public_name else: resource_name += "<br />(no alias)" resource_name += "<br />%s" % resource_parts[1] exes += "<td>%s</td>" % resource_name exes += "<td>%s<br />(%sd %sh %sm %ss ago)</td>" % ( time.asctime(last_request_dict["CREATED_TIME"].timetuple()), days, hours, minutes, seconds, ) exes += "<td>" + vgrid_name + "</td>" runtime_envs = last_request_dict["RESOURCE_CONFIG"]["RUNTIMEENVIRONMENT"] re_list_text = ", ".join([i[0] for i in runtime_envs]) exes += '<td title="%s">' % re_list_text + str(len(runtime_envs)) + "</td>" exes += ( "<td>" + str(last_request_dict["RESOURCE_CONFIG"]["CPUTIME"]) + "</td><td>" + str(last_request_dict["RESOURCE_CONFIG"]["NODECOUNT"]) + "</td><td>" + str(last_request_dict["RESOURCE_CONFIG"]["CPUCOUNT"]) + "</td><td>" + 
str(last_request_dict["RESOURCE_CONFIG"]["DISK"]) + "</td><td>" + str(last_request_dict["RESOURCE_CONFIG"]["MEMORY"]) + "</td><td>" + str(last_request_dict["RESOURCE_CONFIG"]["ARCHITECTURE"]) + "</td>" ) exes += ( "<td>" + last_request_dict["STATUS"] + "</td><td>" + str(last_request_dict["CPUTIME"]) + "</td>" ) exes += "<td class=status_%s>" % resource_status if "unavailable" == resource_status: exes += "-" elif "slack" == resource_status: exes += "Within slack period (%s < %s secs)" % (time_rem_abs.seconds, slackperiod) elif "offline" == resource_status: exes += "down?" else: exes += "%sd, %sh, %sm, %ss" % (days_rem, hours_rem, minutes_rem, seconds_rem) exes += "</td>" exes += "</tr>\n" if last_request_dict["STATUS"] == "Job assigned": job_assigned = job_assigned + 1 job_assigned_cpus = job_assigned_cpus + int( last_request_dict["RESOURCE_CONFIG"]["NODECOUNT"] ) * int(last_request_dict["RESOURCE_CONFIG"]["CPUCOUNT"]) total_number_of_exe_resources += 1 total_number_of_exe_cpus += int(last_request_dict["RESOURCE_CONFIG"]["NODECOUNT"]) * int( last_request_dict["RESOURCE_CONFIG"]["CPUCOUNT"] ) elif filename.startswith("monitor_last_status_"): # store must be linked to this vgrid, not only parent vgrid: # inheritance only covers access, not automatic participation if current_dir != vgrid_name: continue # read last resource action status file mon_file_name = os.path.join(abs_mon_dir, filename) print "found " + mon_file_name last_status_dict = unpickle(mon_file_name, logger) if not last_status_dict: print "could not open and unpickle: " + mon_file_name continue difference = datetime.datetime.now() - last_status_dict["CREATED_TIME"] days = str(difference.days) hours = str(difference.seconds / 3600) minutes = str((difference.seconds % 3600) / 60) seconds = str((difference.seconds % 60) % 60) if last_status_dict["STATUS"] == "stopped": time_stopped = datetime.datetime.now() - last_status_dict["CREATED_TIME"] if time_stopped.days > 7: try: print "removing: %s as we havent 
seen him for %s days." % ( mon_file_name, abs(time_stopped).days, ) os.remove(mon_file_name) except Exception, err: print "could not remove: '%s' Error: %s" % (mon_file_name, str(err)) continue