def getCondorStatusConstrained(collector_names, type_constraint, constraint=None, format_list=None, subsystem_name=None):
    """
    Query every collector in collector_names and keep the non-empty results.

    The query constraint is type_constraint, and-ed with constraint when
    one is provided. A collector whose query raises is logged and skipped.

    @param collector_names: iterable of collector (pool) names to query
    @param type_constraint: base constraint expression (sliceable string)
    @param constraint: optional extra constraint, and-ed with the base one
    @param format_list: passed through to CondorStatus.load()
    @param subsystem_name: passed through to the CondorStatus constructor
    @return: dictionary mapping each collector that returned at least one
        stored classad to its loaded condorMonitor.CondorStatus object
    """
    results = {}
    for pool in collector_names:
        query = type_constraint[0:]  # make copy
        if constraint is not None:
            query = "(%s) && (%s)" % (query, constraint)

        try:
            pool_status = condorMonitor.CondorStatus(subsystem_name=subsystem_name,
                                                     pool_name=pool)
            pool_status.load(query, format_list)
        except condorMonitor.QueryError:
            # If collector not found it is equivalent to no classads
            if pool is None:
                logSupport.log.exception("Condor Error. Failed to talk to collector: ")
            else:
                logSupport.log.exception("Condor Error. Failed to talk to collector %s: " % pool)
            continue
        except RuntimeError:
            logSupport.log.exception("Runtime error. Failed to talk to collector: ")
            continue
        except Exception:
            logSupport.log.exception("Unknown error. Failed to talk to collector: ")
            continue

        if len(pool_status.fetchStored()) == 0:
            # Nothing stored for this collector, leave it out of the result
            continue
        results[pool] = pool_status

    return results
def get_globals_classads(factory_collector=glideFactoryInterface.DEFAULT_VAL):
    """
    Load the "glideclientglobal" classads from a collector.

    @param factory_collector: collector to query; when it equals
        glideFactoryInterface.DEFAULT_VAL the collector configured in
        glideFactoryInterface.factoryConfig.factory_collector is used
    @return: the data stored by CondorStatus after loading the classads
        matching GlideinMyType "glideclientglobal"
    """
    collector = factory_collector
    if collector == glideFactoryInterface.DEFAULT_VAL:
        collector = glideFactoryInterface.factoryConfig.factory_collector

    globals_status = condorMonitor.CondorStatus("any", pool_name=collector)
    # important, this dictates what gets submitted
    globals_status.require_integrity(True)
    globals_status.load('(GlideinMyType=?="glideclientglobal")')

    return globals_status.fetchStored()
def getMonitorVMStatus(pool_name, monitorVM):
    """
    Fetch the status attributes of the slot named monitorVM.

    @param pool_name: pool (collector) to query
    @param monitorVM: slot name, matched against the Name attribute
    @return: the fetched entry for monitorVM
    @raise RuntimeError: if the query result has no entry for monitorVM
    """
    wanted_attributes = [
        ('IS_MONITOR_VM', 'b'),
        ('HAS_MONITOR_VM', 'b'),
        ('State', 's'),
        ('Activity', 's'),
        ('vm2_State', 's'),
        ('vm2_Activity', 's'),
        ('GLEXEC_STARTER', 'b'),
        ('USES_MONITOR_STARTD', 'b'),
        ('GLEXEC_JOB', 'b'),
    ]

    status = condorMonitor.CondorStatus(pool_name=pool_name)
    slots = status.fetch(constraint='(Name=="%s")' % monitorVM,
                         format_list=wanted_attributes)

    if monitorVM in slots:
        return slots[monitorVM]
    raise RuntimeError("Monitor slot %s does not exist!" % monitorVM)
def get_production_ress_entries(server, ref_dict_list):
    """
    Return the entry names whose reference is currently in production.

    Loads from the given server the classads that define
    GlueCEInfoContactString and have GlueCEStateStatus "Production",
    then selects the elements of ref_dict_list whose 'ref' value is one
    of the keys of the loaded data.

    @param server: pool (collector) to query
    @param ref_dict_list: list of dictionaries, each with at least the
        'ref' and 'entry_name' keys
    @return: list of the 'entry_name' values of the matching elements,
        in the order they appear in ref_dict_list
    """
    condor_obj = condorMonitor.CondorStatus(pool_name=server)
    condor_obj.load(
        constraint='(GlueCEInfoContactString=!=UNDEFINED)&&(GlueCEStateStatus=?="Production")',
        format_list=[])

    # Test membership against the stored mapping itself: going through
    # .keys() builds a list on Python 2 and makes every lookup a linear scan.
    production_refs = condor_obj.fetchStored()

    return [el['entry_name'] for el in ref_dict_list
            if el['ref'] in production_refs]
def get_ress_data(self):
    """
    Query the ReSS host for the classads matching the VO constraint.

    Logs the host, the supported VOs and the constraint, reports an
    invalid or unreachable host through common.logerr, points condorExe
    at the WMS condor binaries and then loads the classads selected by
    self.glidein.ress_vo_constraint().

    @return: the data stored by CondorStatus after the query; an empty
        dictionary if the query raised and common.logerr returned
    """
    common.logit("ReSS host: %s" % self.glidein.ress_host())
    #-- validate host ---
    if not common.url_is_valid(self.glidein.ress_host()):
        common.logerr("ReSS server (%s) in ress_host option is not valid or inaccssible." % self.glidein.ress_host())

    condor_sbin = "%s/sbin" % self.wms.condor_location()
    condor_bin = "%s/bin" % self.wms.condor_location()
    condorExe.set_path(condor_bin, condor_sbin)

    #-- get gatekeeper data from ReSS --
    common.logit("Supported VOs: %s" % self.glidein.entry_vos())
    constraint = self.glidein.ress_vo_constraint()
    common.logit("Constraints: %s" % constraint)

    condor_obj = condorMonitor.CondorStatus(pool_name=self.glidein.ress_host())
    # Bind the result up front: if the query fails and common.logerr
    # returns instead of aborting, the return below must not hit an
    # unbound local.
    # NOTE(review): whether common.logerr raises is not visible here - confirm.
    condor_data = {}
    try:
        condor_obj.load(constraint=constraint)
        condor_data = condor_obj.fetchStored()
    except Exception as e:
        common.logerr(e)
    return condor_data
def getMonitorVM(pool_name, jobVM):
    """
    Return the name of the monitoring slot published by the slot jobVM.

    @param pool_name: pool (collector) to query
    @param jobVM: slot name, matched against the Name attribute
    @return: the Monitoring_Name attribute of the slot
    @raise RuntimeError: if the slot is not found, lacks the
        HAS_MONITOR_VM / IS_MONITOR_VM attributes, does not have
        HAS_MONITOR_VM equal to True, does not have IS_MONITOR_VM equal
        to False, or does not publish Monitoring_Name
    """
    status = condorMonitor.CondorStatus(pool_name=pool_name)
    slots = status.fetch(constraint='(Name=="%s")' % jobVM,
                         format_list=[('IS_MONITOR_VM', 'b'),
                                      ('HAS_MONITOR_VM', 'b'),
                                      ('Monitoring_Name', 's')])

    if jobVM not in slots:
        raise RuntimeError("Job claims it runs on %s, but cannot find it!" % jobVM)
    slot = slots[jobVM]

    # Both flags must be published before their values can be checked
    for flag in ('HAS_MONITOR_VM', 'IS_MONITOR_VM'):
        if flag not in slot:
            raise RuntimeError("Slot %s does not support monitoring!" % jobVM)

    has_monitor = (slot['HAS_MONITOR_VM'] == True)
    if not has_monitor:
        raise RuntimeError(
            "Slot %s does not support monitoring! HAS_MONITOR_VM not True." % jobVM)

    is_plain_slot = (slot['IS_MONITOR_VM'] == False)
    if not is_plain_slot:
        raise RuntimeError(
            "Slot %s is a monitoring slot itself! Cannot monitor." % jobVM)

    if 'Monitoring_Name' in slot:
        return slot['Monitoring_Name']
    raise RuntimeError("Slot %s does not publish the monitoring slot!" % jobVM)
def findWork(factory_name, glidein_name, entry_name, supported_signtypes, pub_key_obj=None, additional_constraints=None, factory_collector=DEFAULT_VAL):
    """
    Find request classAds that have my (factory, glidein name, entry name)
    and create the dictionary of work request information.

    The collector query is serialized across processes through an
    exclusive lock on the "gfi_status.lock" file in factoryConfig.lock_dir.
    Classads whose key, identity or encrypted parameters cannot be
    decoded, or whose decrypted identity differs from
    AuthenticatedIdentity, are skipped.

    @type factory_name: string
    @param factory_name: name of the factory

    @type glidein_name: string
    @param glidein_name: name of the glidein instance

    @type entry_name: string
    @param entry_name: name of the factory entry

    @type supported_signtypes: list
    @param supported_signtypes: only support one kind of signtype, 'sha1', default is None

    @type pub_key_obj: string
    @param pub_key_obj: only support 'RSA'

    @type additional_constraints: string
    @param additional_constraints: any additional constraints to include for querying the WMS collector, default is None

    @type factory_collector: string or None
    @param factory_collector: the collector to query, special value 'default' will get it from the global config

    @return: dictionary, each key is the name of a frontend. Each value
        is a dictionary with the 'requests', 'web', 'params',
        'params_decrypted', 'monitor' and 'internals' keys, each holding
        a dictionary of classAd attributes.
    """
    global factoryConfig
    logSupport.log.debug("Querying collector for requests")

    if factory_collector == DEFAULT_VAL:
        factory_collector = factoryConfig.factory_collector

    status_constraint = '(GlideinMyType=?="%s") && (ReqGlidein=?="%s@%s@%s")' % (
        factoryConfig.client_id, entry_name, glidein_name, factory_name)

    if supported_signtypes is not None:
        # str.join instead of string.join: same result, and the string
        # module function no longer exists in Python 3
        status_constraint += ' && stringListMember(%s%s,"%s")' % (
            factoryConfig.client_web_prefix,
            factoryConfig.client_web_signtype_suffix,
            ",".join(supported_signtypes))

    if additional_constraints is not None:
        status_constraint = "((%s)&&(%s))" % (status_constraint, additional_constraints)

    status = condorMonitor.CondorStatus(subsystem_name="any", pool_name=factory_collector)
    status.require_integrity(True)  # important, this dictates what gets submitted
    status.glidein_name = glidein_name
    status.entry_name = entry_name

    # serialize access to the Collector across all the processes
    # there is a single Collector anyhow
    lock_fname = os.path.join(factoryConfig.lock_dir, "gfi_status.lock")
    if not os.path.exists(lock_fname):
        # create a lock file if needed
        try:
            fd = open(lock_fname, "w")
            fd.close()
        except (IOError, OSError):
            # could be a race condition; the open below reports real problems
            pass

    fd = open(lock_fname, "r+")
    try:
        fcntl.flock(fd, fcntl.LOCK_EX)
        try:
            status.load(status_constraint)
        finally:
            fcntl.flock(fd, fcntl.LOCK_UN)
    finally:
        fd.close()

    data = status.fetchStored()

    reserved_names = ("ReqName", "ReqGlidein", "ClientName", "FrontendName",
                      "GroupName", "ReqPubKeyID", "ReqEncKeyCode",
                      "ReqEncIdentity", "AuthenticatedIdentity")
    internal_names = ("ClientName", "FrontendName", "GroupName", "ReqName",
                      "LastHeardFrom", "ReqPubKeyID", "AuthenticatedIdentity")

    out = {}

    # copy over requests and parameters
    for k in data.keys():
        kel = data[k]
        el = {"requests": {}, "web": {}, "params": {},
              "params_decrypted": {}, "monitor": {}, "internals": {}}

        for (key, prefix) in (("requests", factoryConfig.client_req_prefix),
                              ("web", factoryConfig.client_web_prefix),
                              ("params", factoryConfig.glidein_param_prefix),
                              ("monitor", factoryConfig.glidein_monitor_prefix)):
            plen = len(prefix)
            for attr in kel.keys():
                if attr in reserved_names:
                    continue  # skip reserved names
                if attr[:plen] == prefix:
                    el[key][attr[plen:]] = kel[attr]

        sym_key_obj = None  # stays None when there is no key to decrypt with
        if (pub_key_obj is not None) and ('ReqPubKeyID' in kel):
            try:
                sym_key_obj = pub_key_obj.extract_sym_key(kel['ReqEncKeyCode'])
            except Exception:
                continue  # bad key, ignore entry

        if sym_key_obj is not None:
            # this is verifying that the identity that the client claims
            # to be is the identity that Condor thinks it is
            try:
                enc_identity = sym_key_obj.decrypt_hex(kel['ReqEncIdentity'])
            except Exception:
                logSupport.log.warning("Client %s provided invalid ReqEncIdentity, could not decode. Skipping for security reasons." % k)
                continue  # corrupted classad
            if enc_identity != kel['AuthenticatedIdentity']:
                logSupport.log.warning("Client %s provided invalid ReqEncIdentity(%s!=%s). Skipping for security reasons." % (k, enc_identity, kel['AuthenticatedIdentity']))
                continue  # uh oh... either the client is misconfigured, or someone is trying to cheat

        invalid_classad = False
        for (key, prefix) in (("params_decrypted", factoryConfig.encrypted_param_prefix),):
            plen = len(prefix)
            for attr in kel.keys():
                if attr in reserved_names:
                    continue  # skip reserved names
                if attr[:plen] == prefix:
                    el[key][attr[plen:]] = None  # define it even if I don't understand the content
                    if sym_key_obj is not None:
                        try:
                            el[key][attr[plen:]] = sym_key_obj.decrypt_hex(kel[attr])
                        except Exception:
                            invalid_classad = True
                            break  # I don't understand it -> invalid
        if invalid_classad:
            # need to go this way as the failure happens in an inner loop
            logSupport.log.warning("At least one of the encrypted parameters for client %s cannot be decoded. Skipping for security reasons." % k)
            continue

        for attr in kel.keys():
            if attr in internal_names:
                el["internals"][attr] = kel[attr]

        out[k] = el

    return out
def findGroupWork(factory_name, glidein_name, entry_names, supported_signtypes, pub_key_obj=None, additional_constraints=None, factory_collector=DEFAULT_VAL):
    """
    Find request classAds that have my (factory, glidein name, entries) and
    create the dictionary of dictionary of work request information.
    Example: work[entry_name][frontend] = {'params':'value', 'requests':'value}

    The collector query is serialized across processes through an
    exclusive lock on the "gfi_status.lock" file in factoryConfig.lock_dir.
    Classads whose key, identity or encrypted parameters cannot be
    decoded, or whose decrypted identity differs from
    AuthenticatedIdentity, are skipped.

    @type factory_name: string
    @param factory_name: name of the factory

    @type glidein_name: string
    @param glidein_name: name of the glidein instance

    @type entry_names: list
    @param entry_names: list of factory entry names

    @type supported_signtypes: list
    @param supported_signtypes: only support one kind of signtype, 'sha1', default is None

    @type pub_key_obj: string
    @param pub_key_obj: only support 'RSA', defaults to None

    @type additional_constraints: string
    @param additional_constraints: any additional constraints to include for querying the WMS collector, default is None

    @type factory_collector: string or None
    @param factory_collector: the collector to query, special value 'default' will get it from the global config

    @rtype: dict
    @return: Dictionary of work to perform, as produced by
        workGroupByEntries() from the per-classad request dictionaries.
        Return format is work[entry_name][frontend] = {'params':'value', 'requests':'value}
    """
    global factoryConfig

    if factory_collector == DEFAULT_VAL:
        factory_collector = factoryConfig.factory_collector

    # Comma separated list of entry@glidein@factory, last entry first.
    # Built with a single join instead of repeated string prepending;
    # the strip keeps the result identical to the prepend-then-strip form.
    glidein_ids = ['%s@%s@%s' % (entry, glidein_name, factory_name)
                   for entry in entry_names]
    glidein_ids.reverse()
    req_glideins = ",".join(glidein_ids).strip(',')

    status_constraint = '(GlideinMyType=?="%s") && (stringListMember(ReqGlidein,"%s")=?=True)' % (
        factoryConfig.client_id, req_glideins)

    if supported_signtypes is not None:
        # str.join instead of string.join: same result, and the string
        # module function no longer exists in Python 3
        status_constraint += ' && stringListMember(%s%s,"%s")' % (
            factoryConfig.client_web_prefix,
            factoryConfig.client_web_signtype_suffix,
            ",".join(supported_signtypes))

    if pub_key_obj is not None:
        # Get only classads that have my key or no key at all
        # Any other key will not work
        status_constraint += ' && (((ReqPubKeyID=?="%s") && (ReqEncKeyCode=!=Undefined) && (ReqEncIdentity=!=Undefined)) || (ReqPubKeyID=?=Undefined))' % pub_key_obj.get_pub_key_id()

    if additional_constraints is not None:
        status_constraint = "(%s)&&(%s)" % (status_constraint, additional_constraints)

    status = condorMonitor.CondorStatus(subsystem_name="any", pool_name=factory_collector)
    # Important, this dictates what gets submitted
    status.require_integrity(True)
    status.glidein_name = glidein_name

    # Serialize access to the Collector across all the processes
    # there is a single Collector anyhow
    lock_fname = os.path.join(factoryConfig.lock_dir, "gfi_status.lock")
    if not os.path.exists(lock_fname):
        # Create a lock file if needed
        try:
            fd = open(lock_fname, "w")
            fd.close()
        except (IOError, OSError):
            # could be a race condition; the open below reports real problems
            pass

    fd = open(lock_fname, "r+")
    try:
        fcntl.flock(fd, fcntl.LOCK_EX)
        try:
            status.load(status_constraint)
        finally:
            fcntl.flock(fd, fcntl.LOCK_UN)
    finally:
        fd.close()

    data = status.fetchStored()

    reserved_names = ("ReqName", "ReqGlidein", "ClientName", "FrontendName",
                      "GroupName", "ReqPubKeyID", "ReqEncKeyCode",
                      "ReqEncIdentity", "AuthenticatedIdentity")
    internal_names = ("ClientName", "FrontendName", "GroupName", "ReqName",
                      "LastHeardFrom", "ReqPubKeyID", "AuthenticatedIdentity")

    # Collected per classad here, regrouped by workGroupByEntries() on return
    out = {}

    # Copy over requests and parameters
    for k in data:
        kel = data[k]
        el = {"requests": {}, "web": {}, "params": {},
              "params_decrypted": {}, "monitor": {}, "internals": {}}

        for (key, prefix) in (("requests", factoryConfig.client_req_prefix),
                              ("web", factoryConfig.client_web_prefix),
                              ("params", factoryConfig.glidein_param_prefix),
                              ("monitor", factoryConfig.glidein_monitor_prefix)):
            plen = len(prefix)
            for attr in kel:
                if attr in reserved_names:
                    # Skip reserved names
                    continue
                if attr[:plen] == prefix:
                    el[key][attr[plen:]] = kel[attr]

        # sym_key_obj stays None if
        # 1) kel does not contain 'ReqPubKeyID'
        # 2) pub_key_obj is None and there is no key to decrypt
        # If extract_sym_key raises, the classad is skipped altogether.
        sym_key_obj = None
        if (pub_key_obj is not None) and ('ReqPubKeyID' in kel):
            try:
                sym_key_obj = pub_key_obj.extract_sym_key(kel['ReqEncKeyCode'])
            except Exception:
                continue

        if sym_key_obj is not None:
            # Verify that the identity the client claims to be is the
            # identity that Condor thinks it is
            try:
                enc_identity = sym_key_obj.decrypt_hex(kel['ReqEncIdentity'])
            except Exception:
                logSupport.log.warning("Client %s provided invalid ReqEncIdentity, could not decode. Skipping for security reasons." % k)
                continue  # Corrupted classad
            if enc_identity != kel['AuthenticatedIdentity']:
                logSupport.log.warning("Client %s provided invalid ReqEncIdentity(%s!=%s). Skipping for security reasons." % (k, enc_identity, kel['AuthenticatedIdentity']))
                # Either the client is misconfigured or someone is cheating
                continue

        invalid_classad = False
        for (key, prefix) in (("params_decrypted", factoryConfig.encrypted_param_prefix),):
            # TODO: useless for, only one element
            plen = len(prefix)
            for attr in kel:
                if attr in reserved_names:
                    # Skip reserved names
                    continue
                if attr[:plen] == prefix:
                    # Define it even if I don't understand the content
                    el[key][attr[plen:]] = None
                    if sym_key_obj is not None:
                        try:
                            el[key][attr[plen:]] = sym_key_obj.decrypt_hex(kel[attr])
                        except Exception:
                            # I don't understand it -> invalid
                            invalid_classad = True
                            break

        # Continue if I have problems in an inner loop
        if invalid_classad:
            logSupport.log.warning("At least one of the encrypted parameters for client %s cannot be decoded. Skipping for security reasons." % k)
            continue

        for attr in kel:
            if attr in internal_names:
                el["internals"][attr] = kel[attr]

        out[k] = el

    return workGroupByEntries(out)
def spawn(sleep_time, advertize_rate, startup_dir, glideinDescript, frontendDescript, entries, restart_attempts, restart_interval):
    """
    Spawn and keep track of the entry processes. Restart them if required.
    Advertise glidefactoryglobal classad every iteration

    Runs until an exception escapes the main loop; on the way out the
    children are terminated and the factory classads are deadvertised.

    @type sleep_time: long
    @param sleep_time: Delay between every iteration
    @type advertize_rate: long
    @param advertize_rate: Rate at which entries advertise their classads
    @type startup_dir: String
    @param startup_dir: Path to glideinsubmit directory
    @type glideinDescript: glideFactoryConfig.GlideinDescript
    @param glideinDescript: Factory config's glidein description object
    @type frontendDescript: glideFactoryConfig.FrontendDescript
    @param frontendDescript: Factory config's frontend description object
    @type entries: list
    @param entries: Sorted list of entry names
    @type restart_interval: long
    @param restart_interval: Allowed restart interval in second
    @type restart_attempts: long
    @param restart_attempts: Number of allowed restart attempts in the interval

    @raise RuntimeError: when an EntryGroup has been crashing too often
    """
    global STARTUP_DIR
    childs = {}

    # Number of glideFactoryEntry processes to spawn and directly relates to
    # number of concurrent condor_status processes
    #
    # NOTE: If number of entries gets too big, we may exceed the shell args
    #       limit. If that becomes an issue, move the logic to identify the
    #       entries to serve to the group itself.
    #
    # Each process will handle multiple entries split as follows
    #   - Sort the entries alphabetically. Already done
    #   - Divide the list into equal chunks as possible
    #   - Last chunk may get fewer entries
    entry_process_count = 1

    starttime = time.time()
    oldkey_gracetime = int(glideinDescript.data['OldPubKeyGraceTime'])
    oldkey_eoltime = starttime + oldkey_gracetime

    childs_uptime = {}

    factory_downtimes = glideFactoryDowntimeLib.DowntimeFile(glideinDescript.data['DowntimesFile'])

    logSupport.log.info("Available Entries: %s" % entries)

    # int() instead of long(): same value, and long does not exist in Python 3
    group_size = int(math.ceil(float(len(entries)) / entry_process_count))
    entry_groups = entry_grouper(group_size, entries)

    def _set_rlimit(soft_l=None, hard_l=None):
        # set new hard and soft open file limits
        # if setting limits fails or no input parameters use inherited limits
        # from parent process
        # nb 1.  it is possible to raise limits
        # up to [hard_l,hard_l] but once lowered they cannot be raised
        # nb 2. it may be better just to omit calling this function at
        # all from subprocess - in which case it inherits limits from
        # parent process
        lim = resource.getrlimit(resource.RLIMIT_NOFILE)
        if soft_l is not None or hard_l is not None:
            if not hard_l:
                hard_l = soft_l
            if not soft_l:
                soft_l = hard_l
            try:
                new_lim = [soft_l, hard_l]
                resource.setrlimit(resource.RLIMIT_NOFILE, new_lim)
            except Exception:
                resource.setrlimit(resource.RLIMIT_NOFILE, lim)

    def _launch_group(group):
        # Start the glideFactoryEntryGroup.py process serving entry_groups[group]
        command_list = [sys.executable,
                        os.path.join(STARTUP_DIR, "glideFactoryEntryGroup.py"),
                        str(os.getpid()),
                        str(sleep_time),
                        str(advertize_rate),
                        startup_dir,
                        ':'.join(entry_groups[group]),
                        str(group)]
        return subprocess.Popen(command_list, shell=False,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                close_fds=True,
                                preexec_fn=_set_rlimit)

    def _set_nonblocking(child):
        # set stdout/stderr in non blocking mode
        # since we will run for a long time, we do not want to block
        for fd in (child.stdout.fileno(), child.stderr.fileno()):
            fl = fcntl.fcntl(fd, fcntl.F_GETFL)
            fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)

    try:
        for group, group_entries in enumerate(entry_groups):
            logSupport.log.info("Starting EntryGroup %s: %s" % (group, group_entries))
            childs[group] = _launch_group(group)

            # Get the startup time. Used to check if the entry is crashing
            # periodically and needs to be restarted.
            childs_uptime[group] = list()
            childs_uptime[group].insert(0, time.time())

        logSupport.log.info("EntryGroup startup times: %s" % childs_uptime)

        for group in childs:
            _set_nonblocking(childs[group])

        # If RemoveOldCredFreq < 0, do not do credential cleanup.
        curr_time = 0  # To ensure curr_time is always initialized
        if int(glideinDescript.data['RemoveOldCredFreq']) > 0:
            # Convert credential removal frequency from hours to seconds
            remove_old_cred_freq = int(glideinDescript.data['RemoveOldCredFreq']) * 60 * 60
            curr_time = time.time()
            update_time = curr_time + remove_old_cred_freq

            # Convert credential removal age from days to seconds
            remove_old_cred_age = int(glideinDescript.data['RemoveOldCredAge']) * 60 * 60 * 24

            # Create cleaners for old credential files
            logSupport.log.info("Adding cleaners for old credentials")
            cred_base_dir = glideinDescript.data['ClientProxiesBaseDir']
            for username in frontendDescript.get_all_usernames():
                cred_base_user = os.path.join(cred_base_dir, "user_%s" % username)
                cred_user_instance_dirname = os.path.join(
                    cred_base_user, "glidein_%s" % glideinDescript.data['GlideinName'])
                cred_cleaner = cleanupSupport.DirCleanupCredentials(
                    cred_user_instance_dirname, "(credential_*)", remove_old_cred_age)
                cleanupSupport.cred_cleaners.add_cleaner(cred_cleaner)

        iteration_basetime = time.time()
        while True:
            # retrieves WebMonitoringURL from glideclient classAd
            iteration_timecheck = time.time()
            iteration_timediff = iteration_timecheck - iteration_basetime

            if iteration_timediff >= 3600:  # every hour
                iteration_basetime = time.time()  # reset the start time
                fronmonpath = os.path.join(startup_dir, "monitor", "frontendmonitorlink.txt")
                fronmonconstraint = '(MyType=="glideclient")'
                fronmonformat_list = [('WebMonitoringURL', 's'), ('FrontendName', 's')]
                fronmonstatus = condorMonitor.CondorStatus(subsystem_name="any")
                fronmondata = fronmonstatus.fetch(constraint=fronmonconstraint,
                                                  format_list=fronmonformat_list)
                fronmon_list_names = fronmondata.keys()
                if fronmon_list_names is not None:
                    urlset = set()
                    if os.path.exists(fronmonpath):
                        os.remove(fronmonpath)
                    for frontend_entry in fronmon_list_names:
                        fronmonelement = fronmondata[frontend_entry]
                        fronmonurl = fronmonelement['WebMonitoringURL'].encode('utf-8')
                        fronmonfrt = fronmonelement['FrontendName'].encode('utf-8')
                        if (fronmonfrt, fronmonurl) not in urlset:
                            urlset.add((fronmonfrt, fronmonurl))
                            # NOTE(review): mode 'w' rewrites the file for every
                            # new pair, so only the last pair is left in it -
                            # confirm this is what the reader of the file expects
                            with open(fronmonpath, 'w') as fronmonf:
                                fronmonf.write("%s, %s" % (fronmonfrt, fronmonurl))

            # Record the iteration start time
            iteration_stime = time.time()

            # THIS IS FOR SECURITY
            # Make sure you delete the old key when its grace is up.
            # If a compromised key is left around and if attacker can somehow
            # trigger FactoryEntry process crash, we do not want the entry
            # to pick up the old key again when factory auto restarts it.
            if time.time() > oldkey_eoltime and glideinDescript.data['OldPubKeyObj'] is not None:
                glideinDescript.data['OldPubKeyObj'] = None
                glideinDescript.data['OldPubKeyType'] = None
                try:
                    glideinDescript.remove_old_key()
                    logSupport.log.info("Removed the old public key after its grace time of %s seconds" % oldkey_gracetime)
                except Exception:
                    # Do not crash if delete fails. Just log it.
                    logSupport.log.warning("Failed to remove the old public key after its grace time")

            # Only removing credentials in the v3+ protocol
            # Affects Corral Frontend which only supports the v3+ protocol.
            # IF freq < zero, do not do cleanup.
            if int(glideinDescript.data['RemoveOldCredFreq']) > 0 and curr_time >= update_time:
                logSupport.log.info("Checking credentials for cleanup")

                # Query queue for glideins. Don't remove proxies in use.
                try:
                    in_use_creds = glideFactoryLib.getCondorQCredentialList()
                    cleanupSupport.cred_cleaners.cleanup(in_use_creds)
                except Exception:
                    logSupport.log.exception("Unable to cleanup old credentials")

                update_time = curr_time + remove_old_cred_freq

            curr_time = time.time()

            logSupport.log.info("Checking for credentials %s" % entries)

            # Read in the frontend globals classad
            # Do this first so that the credentials are immediately
            # available when the Entries startup
            classads = {}
            try:
                classads = glideFactoryCredentials.get_globals_classads()
            except Exception:
                logSupport.log.error("Error occurred retrieving globals classad -- is Condor running?")

            for classad_key in classads:
                classad = classads[classad_key]
                try:
                    glideFactoryCredentials.process_global(classad, glideinDescript, frontendDescript)
                except Exception:
                    logSupport.log.exception("Error occurred processing the globals classads: ")

            logSupport.log.info("Checking EntryGroups %s" % childs.keys())
            # Iterate over a snapshot of the keys: a restart deletes and
            # re-adds childs[group], which must not happen on the mapping
            # that is being iterated
            for group in list(childs.keys()):
                child = childs[group]

                # empty stdout and stderr
                try:
                    tempOut = child.stdout.read()
                    if len(tempOut) != 0:
                        logSupport.log.warning("EntryGroup %s STDOUT: %s" % (group, tempOut))
                except IOError:
                    pass  # ignore
                try:
                    tempErr = child.stderr.read()
                    if len(tempErr) != 0:
                        logSupport.log.warning("EntryGroup %s STDERR: %s" % (group, tempErr))
                except IOError:
                    pass  # ignore

                # look for exited child
                if child.poll():
                    # the child exited
                    logSupport.log.warning("EntryGroup %s exited. Checking if it should be restarted." % (group))
                    tempOut = child.stdout.readlines()
                    tempErr = child.stderr.readlines()

                    if is_crashing_often(childs_uptime[group], restart_interval, restart_attempts):
                        del childs[group]
                        raise RuntimeError("EntryGroup '%s' has been crashing too often, quit the whole factory:\n%s\n%s" % (group, tempOut, tempErr))
                    else:
                        # Restart the entry setting its restart time
                        logSupport.log.warning("Restarting EntryGroup %s." % (group))
                        # Drop the dead child first, so it is not left in
                        # childs should the relaunch fail
                        del childs[group]
                        childs[group] = _launch_group(group)

                        if len(childs_uptime[group]) == restart_attempts:
                            childs_uptime[group].pop(0)
                        childs_uptime[group].append(time.time())
                        _set_nonblocking(childs[group])
                        logSupport.log.warning("EntryGroup startup/restart times: %s" % (childs_uptime,))

            # Aggregate Monitoring data periodically
            logSupport.log.info("Aggregate monitoring data")
            stats = aggregate_stats(factory_downtimes.checkDowntime())
            save_stats(stats, os.path.join(startup_dir, glideFactoryConfig.factoryConfig.aggregated_stats_file))

            # Aggregate job data periodically
            if glideinDescript.data.get('AdvertisePilotAccounting', False) in ['True', '1']:  # data attributes are strings
                logSupport.log.info("Starting updating job classads")
                update_classads()
                logSupport.log.info("Finishing updating job classads")

            # Advertise the global classad with the factory keys and Factory statistics
            try:
                # KEL TODO need to add factory downtime?
                glideFactoryInterface.advertizeGlobal(
                    glideinDescript.data['FactoryName'],
                    glideinDescript.data['GlideinName'],
                    glideFactoryLib.factoryConfig.supported_signtypes,
                    glideinDescript.data['PubKeyObj']
                )
            except Exception as e:
                logSupport.log.exception("Error advertising global classads: %s" % e)

            cleanupSupport.cleaners.cleanup()

            iteration_etime = time.time()
            iteration_sleep_time = sleep_time - (iteration_etime - iteration_stime)
            if iteration_sleep_time < 0:
                iteration_sleep_time = 0
            logSupport.log.info("Sleep %s secs" % iteration_sleep_time)
            time.sleep(iteration_sleep_time)
        # end while True
    finally:
        # cleanup at exit
        logSupport.log.info("Received signal...exit")
        try:
            try:
                clean_exit(childs)
            except:
                # Bare on purpose: whatever interrupts the clean exit,
                # hardkill the rest
                for group in childs:
                    logSupport.log.info("Hard killing EntryGroup %s" % group)
                    try:
                        os.kill(childs[group].pid, signal.SIGKILL)
                    except OSError:
                        pass  # ignore dead clients
        finally:
            logSupport.log.info("Deadvertize myself")
            try:
                glideFactoryInterface.deadvertizeFactory(
                    glideinDescript.data['FactoryName'],
                    glideinDescript.data['GlideinName'])
            except:
                logSupport.log.exception("Factory deadvertize failed!")
            try:
                glideFactoryInterface.deadvertizeFactoryClientMonitoring(
                    glideinDescript.data['FactoryName'],
                    glideinDescript.data['GlideinName'])
            except:
                logSupport.log.exception("Factory Monitoring deadvertize failed!")
        logSupport.log.info("All EntryGroups should be terminated")
def main():
    """
    Print the status of the glidein slots of a pool and per-group totals.

    Reads the options from get_opts(), loads the matching slot classads
    through condorMonitor.CondorStatus, prints one row per slot (unless
    total_only is set) and then a table of counts by state, summarized
    per site or per entry, with an overall 'Total' row.

    The loaded classads are published in the module-level name 'data'.
    """
    # Local import: functools is only needed for the cmp-style sorts below
    import functools

    global data

    opts = get_opts()

    pool_name = opts.pool_name
    constraint = opts.constraint
    want_gk = opts.want_gk
    want_gc = opts.want_gc
    want_monitor = opts.want_monitor
    want_bench = opts.want_bench
    want_glexec = opts.want_glexec
    total_only = opts.total_only

    summarize = 'entry'
    if opts.summarize_site:
        # Must be the same value the 'site' checks below compare against
        summarize = 'site'

    if not want_monitor:
        if constraint is None:
            constraint = 'IS_MONITOR_VM =!= TRUE'
        else:
            constraint = '(%s) && (IS_MONITOR_VM =!= TRUE)' % constraint

    format_list = [('Machine', 's'), ('State', 's'), ('Activity', 's'),
                   ('GLIDEIN_Site', 's'), ('GLIDEIN_Factory', 's'),
                   ('GLIDEIN_Name', 's'), ('GLIDEIN_Entry_Name', 's'),
                   ('EnteredCurrentActivity', 'i')]
    attrs = ['State', 'Activity', 'GLIDEIN_Site', 'GLIDEIN_Factory',
             'GLIDEIN_Name', 'GLIDEIN_Entry_Name', 'EnteredCurrentActivity']

    if want_gk:
        format_list.append(('GLIDEIN_Gatekeeper', 's'))
        format_list.append(('GLIDEIN_GridType', 's'))
        attrs.append('GLIDEIN_Gatekeeper')
        attrs.append('GLIDEIN_GridType')

    if want_gc:
        format_list.append(('GLIDEIN_ClusterId', 'i'))
        format_list.append(('GLIDEIN_ProcId', 'i'))
        format_list.append(('GLIDEIN_Schedd', 's'))
        attrs.append('GLIDEIN_ClusterId')
        attrs.append('GLIDEIN_ProcId')
        attrs.append('GLIDEIN_Schedd')

    if want_glexec:
        format_list.append(('GLEXEC_STARTER', 'b'))
        format_list.append(('GLEXEC_JOB', 'b'))
        attrs.append('GLEXEC_STARTER')
        attrs.append('GLEXEC_JOB')

    if want_bench:
        format_list.append(('KFlops', 'i'))
        format_list.append(('Mips', 'i'))
        attrs.append('KFlops')
        attrs.append('Mips')

    cs = condorMonitor.CondorStatus(pool_name=pool_name)
    cs.load(constraint=constraint, format_list=format_list)

    data = cs.stored_data
    # machine_cmp is a two-argument comparison function; cmp_to_key keeps
    # the same ordering and works on both Python 2.7 and Python 3
    keys = sorted(data.keys(), key=functools.cmp_to_key(machine_cmp))

    counts_header = ('Total', 'Owner', 'Claimed/Busy', 'Claimed/Retiring',
                     'Claimed/Other', 'Unclaimed', 'Matched', 'Other')
    if want_bench:
        counts_header += ('GFlops', '  GIPS')

    print_mask = "%-39s %-9s"
    if want_gk:
        print_mask += " %-5s %-43s"
    print_mask += " %-19s %-19s"
    if want_gc:
        print_mask += " %-39s %-14s"
    if want_glexec:
        print_mask += " %-7s"
    if want_bench:
        print_mask += " %-5s %-5s"
    print_mask += " %-9s %-8s %-10s"

    header = ('Name', 'Site')
    if want_gk:
        header += ('Grid', 'Gatekeeper')
    header += ('Factory', 'Entry')
    if want_gc:
        header += ('GlideSchedd', 'GlideCluster')
    if want_glexec:
        header += ('gLExec', )
    if want_bench:
        header += ('MFlop', 'Mips')
    header += ('State', 'Activity', 'ActvtyTime')

    if not total_only:
        print()
        print(print_mask % header)
        print()

    counts = {'Total': {}}
    for c in counts_header:
        counts['Total'][c] = 0

    for vm_name in keys:
        el = data[vm_name]

        cel = {}  # this will have all the needed attributes (??? if nothing else)
        for a in attrs:
            if a in el:
                cel[a] = el[a]
            else:
                cel[a] = '???'
        if cel['EnteredCurrentActivity'] != '???':
            cel['EnteredCurrentActivity'] = fmt_time(int(cel['EnteredCurrentActivity']))

        state = cel['State']
        activity = cel['Activity']

        # The names bound here are the ones added to the counts below;
        # a slot without benchmark values contributes 0.0
        if 'KFlops' in el:
            gflops = (el['KFlops'] * 1.e-6)
            mflops_str = "%i" % (el['KFlops'] / 1000)
        else:
            gflops = 0.0
            mflops_str = "???"

        if 'Mips' in el:
            gips = el['Mips'] * 1.e-3
            mips_str = el['Mips']
        else:
            gips = 0.0
            mips_str = "???"

        if summarize == 'site':
            sum_str = cel['GLIDEIN_Site']
        else:
            sum_str = "%s@%s@%s" % (cel['GLIDEIN_Entry_Name'],
                                    cel['GLIDEIN_Name'],
                                    cel['GLIDEIN_Factory'])
        if sum_str not in counts:
            counts[sum_str] = {}
            for c in counts_header:
                counts[sum_str][c] = 0

        for t in ('Total', sum_str):
            ct = counts[t]
            ct['Total'] += 1
            if state in ('Owner', 'Unclaimed', 'Matched'):
                ct[state] += 1
            elif state == 'Claimed':
                if activity in ('Busy', 'Retiring'):
                    ct['%s/%s' % (state, activity)] += 1
                else:
                    ct['Claimed/Other'] += 1
            else:
                ct['Other'] += 1
            if want_bench:
                ct['GFlops'] += gflops
                ct['  GIPS'] += gips

        if not total_only:
            print_arr = (vm_name, cel['GLIDEIN_Site'])
            if want_gk:
                print_arr += (cel['GLIDEIN_GridType'], cel['GLIDEIN_Gatekeeper'])
            print_arr += ("%s@%s" % (cel['GLIDEIN_Name'], cel['GLIDEIN_Factory']),
                          cel['GLIDEIN_Entry_Name'])
            if want_gc:
                # %s rather than %i: either id may be the '???' placeholder
                print_arr += (cel['GLIDEIN_Schedd'],
                              "%s.%s" % (cel['GLIDEIN_ClusterId'], cel['GLIDEIN_ProcId']))
            if want_glexec:
                glexec_str = 'None'
                if 'GLEXEC_JOB' in el and el['GLEXEC_JOB']:
                    glexec_str = 'Job'
                elif 'GLEXEC_STARTER' in el and el['GLEXEC_STARTER']:
                    glexec_str = 'Starter'
                print_arr += (glexec_str, )
            if want_bench:
                print_arr += (mflops_str, mips_str)
            print_arr += (state, activity, cel['EnteredCurrentActivity'])

            print(print_mask % print_arr)

    print()

    count_print_mask = "%39s"
    for c in counts_header:
        count_print_mask += " %%%is" % len(c)
    print(count_print_mask % (('', ) + counts_header))

    if summarize == 'site':
        group_cmp = ltotal_cmp
    else:  # default is entry
        group_cmp = entry_cmp
    ckeys = sorted(counts.keys(), key=functools.cmp_to_key(group_cmp))

    if len(ckeys) > 1:
        print()  # put a space before the entry names

    count_print_val = None
    for t in ckeys:
        if t == 'Total':
            print()  # put an empty line before Total
            count_print_val = [t]
        else:
            # NOTE(review): rows other than Total get an empty label, so
            # the group name is never printed - confirm this is intended
            count_print_val = ['']
        for c in counts_header:
            count_print_val.append(int(counts[t][c]))

        print(count_print_mask % tuple(count_print_val))

    print()
def go_request_glideins(self): ilog('Entered go_request_glideins.') from glideinwms.frontend import glideinFrontendInterface from glideinwms.lib import condorMonitor, condorExe, pubCrypto from glideinwms.frontend.glideinFrontendPlugins import proxy_plugins, createCredentialList # query job collector ilog('Checking the condor pool.') try: pool_status = condorMonitor.CondorStatus() pool_status.load( '(IS_MONITOR_VM=!=True)&&(%s)' % self.glidekeeper_constraint, [('State', 's')]) running_glideins = len(pool_status.fetchStored()) del pool_status self.running_glideins = running_glideins ilog('Found %d glideins in the pool.' % running_glideins) except: self.errors.append((time.time(), "condor_status failed")) return # query WMS collector ilog('Checking factory glideins.') glidein_dict = {} for factory_pool in self.factory_pools: factory_pool_node = factory_pool[0] factory_identity = factory_pool[1] try: if self.proxy_data != None: full_constraint = self.factory_constraint + ' && (PubKeyType=?="RSA") && (GlideinAllowx509_Proxy=!=False)' else: full_constraint = self.factory_constraint + ' && (GlideinRequirex509_Proxy=!=True)' ilog( 'Running findGlideins with these params: \n\tpool: %s\n\tident: %s\n\tsigtype: %s\n\tconstraints: %s' % ( str(factory_pool_node), str(None), str(self.signature_type), str(full_constraint) #str(self.proxy_data!=None), #str(True) )) factory_glidein_dict = glideinFrontendInterface.findGlideins( factory_pool_node, None, #factory_identity, #TODO: How do we authenticate with the factory? 
self.signature_type, full_constraint #self.proxy_data!=None, #get_only_matching=True ) except RuntimeError, e: factory_glidein_dict = { } # in case of error, treat as there is nothing there ilog('Error from findGlideins: %s' % str(e)) ilog('Found %d possible in factory_pool %s' % (len(factory_glidein_dict.keys()), dbgp(factory_pool))) for glidename in factory_glidein_dict.keys(): ilog('Now testing glidein with name %s' % glidename) glidein_el = factory_glidein_dict[glidename] ilog('Glidein stats: \n\n %s \n\n' % dbgp(glidein_el)) if not glidein_el['attrs'].has_key( 'PubKeyType'): # no pub key at all, skip ilog('%s has no PubKeyType -- skipping.' % glidename) continue elif glidein_el['attrs'][ 'PubKeyType'] == 'RSA': # only trust RSA for now try: # augment glidein_el['attrs']['PubKeyObj'] = pubCrypto.PubRSAKey( str( re.sub(r"\\+n", r"\n", glidein_el['attrs']['PubKeyValue']))) # and add glidein_dict[(factory_pool_node, glidename)] = glidein_el ilog('Adding %s to glidein_dict' % glidename) except RuntimeError, e: ilog('Hit error when adding %s to glidein_dict:\n%s' % (glidename, str(e))) continue # skip except:
factory_pool_node, self.client_name) except RuntimeError, e: self.errors.append( (time.time(), "Deadvertizing failed: %s" % e)) except: tb = traceback.format_exception(sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2]) self.errors.append( (time.time(), "Deadvertizing failed: %s" % string.join(tb, ''))) # Stop all the glideins I can see ilog('Getting glidein pool status data.') try: pool_status = condorMonitor.CondorStatus() pool_status.load(self.glidekeeper_constraint, [('GLIDEIN_COLLECTOR_NAME', 's'), ('GLIDEIN_MASTER_NAME', 's'), ('MyAddress', 's')]) pool_data = pool_status.fetchStored() except: self.errors.append((time.time(), "condor_status failed")) for k in pool_data.keys(): el = pool_data[k] ilog('Now killing pool with data: (%s -> %s)' % (dbgp(k), dbgp(el))) try: condorExe.exe_cmd("../sbin/condor_off",
def query_ress(ress_source, vo=''):
    """
    Query a RESS source and build one entry per classad it returns.

    ress_source is resolved with socket.gethostbyname and is also passed as
    the pool name to condorMonitor.CondorStatus. When vo is a non-empty
    string, the query is additionally restricted to classads whose
    GlueCEAccessControlBaseRule string list contains "VO:<vo>".

    Returns a dictionary mapping each id returned by the condor query to a
    dictionary describing the entry (a site can be listed multiple times).

    Nothing is caught here: failures of the host lookup or of the condor
    query propagate to the caller.
    """
    # TODO - there are multiple classads for an entry for each cluster/vo/etc. Currently only the common information
    # in all the classads for a site is used (gatekeeper, site and queue names) but if VO specific information is
    # included in the future, this will require more complicated logic for building the entries dictionary
    if vo != '':
        ress_constraint = '(GlueCEInfoContactString=!=UNDEFINED)&&(StringlistMember("VO:%s",GlueCEAccessControlBaseRule))' % vo
    else:
        ress_constraint = '(GlueCEInfoContactString=!=UNDEFINED)'

    # The resolved address is not used afterwards, but the lookup is still performed before
    # querying, so an unresolvable source raises here.
    socket.gethostbyname(ress_source)

    # Get RESS info
    wanted_attrs = [
        ('GlueCEInfoContactString', 's'),
        ('GlueCEName', 's'),
        ('GlueSiteName', 's'),
        ('GlueCEInfoJobManager', 's'),
        ('GlueCEUniqueID', 's'),
        ('GlueCEPolicyMaxObtainableWallClockTime', 'i'),
        ('GlueCEStateStatus', 's'),
    ]
    ress_status = condorMonitor.CondorStatus(pool_name=ress_source)
    classads = ress_status.fetch(constraint=ress_constraint, format_list=wanted_attrs)

    ress_entries = {}
    # Condor id is the value in the Name attribute of the classad. The same entry may have
    # multiple Names and therefore classads but each will have a unique Name/condor_id
    for condor_id, classad in classads.items():
        contact_string = classad['GlueCEInfoContactString'].encode('utf-8')
        queue = classad['GlueCEName'].encode('utf-8')
        site = classad['GlueSiteName'].encode('utf-8')

        # Determine rsl by jobmanager
        # OSG only supports gt2 (gt5 in near future?), do not need to create other rsl strings
        # to support other grid types like cream
        job_manager = classad['GlueCEInfoJobManager'].encode('utf-8')
        rsl = "" if job_manager == "condor" else '(queue=%s)(jobtype=single)' % queue

        unique_id = classad['GlueCEUniqueID'].encode('utf-8')

        # This value is given in minutes: fall back to 36 hours when it is zero,
        # and cap it at 48 hours when it is larger than that.
        max_minutes = int(classad['GlueCEPolicyMaxObtainableWallClockTime'])
        if max_minutes == 0:
            max_minutes = 36 * 60
        elif (max_minutes / 60) > 48:
            max_minutes = 48 * 60

        # TODO what to do with this? New file of disabled entries?
        status = classad['GlueCEStateStatus'].encode('utf-8')

        ress_entries[condor_id] = {
            'site_name': site + '_' + queue,
            # Could not find support for non-gt2 sites so defaulting gridtype to gt2. Even if there are
            # some sites, the overwhelming majority is gt2. May need to check GlueCEInfoGRAMVersion when
            # sites start moving to gram5 (does gwms support gt5 yet?)
            'gridtype': 'gt2',
            'gatekeeper': contact_string,
            'rsl': rsl,
            'wall_clocktime': max_minutes / 60,
            'ref_id': condor_id,
            'ce_status': status,
            # Because RESS is specific to OSG, can default all entries to these values
            'glexec_bin': "OSG",
            'work_dir': 'OSG',
            'source': ress_source,
            'source_type': 'RESS',
            'GlueCEUniqueID': unique_id,
        }

    return ress_entries