def get_downtime_fd(entry_name, cmdname):
    try:
        # New style has config all in the factory file
        #if entry_name=='factory':
        config = glideFactoryConfig.GlideinDescript()
        #else:
        #    config=glideFactoryConfig.JobDescript(entry_name)
    except IOError:
        raise RuntimeError("Failed to load config for %s" % entry_name)

    fd = glideFactoryDowntimeLib.DowntimeFile(config.data['DowntimesFile'])
    return fd
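# Hypothetical usage sketch (not part of the original module): obtain the
# factory-wide downtime file and ask whether the factory is currently down.
# The argument values below are made-up illustration values; checkDowntime()
# is used elsewhere in this code with the entry="factory" keyword.
#
#   fd = get_downtime_fd("factory", "down")
#   factory_is_down = fd.checkDowntime(entry="factory")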
def iterate(parent_pid, sleep_time, advertize_rate, glideinDescript,
            frontendDescript, group_name, my_entries):
    """
    Iterate over the set of tasks until it is time to quit or die.
    The main "worker" function for the Factory Entry Group.

    @todo: More description to come

    @type parent_pid: int
    @param parent_pid: the pid for the Factory daemon

    @type sleep_time: int
    @param sleep_time: The number of seconds to sleep between iterations

    @type advertize_rate: int
    @param advertize_rate: The rate at which advertising should occur

    @type glideinDescript: glideFactoryConfig.GlideinDescript
    @param glideinDescript: glidein.descript object in the Factory root dir

    @type frontendDescript: glideFactoryConfig.FrontendDescript
    @param frontendDescript: frontend.descript object in the Factory root dir

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects keyed on entry name
    """

    is_first = True  # In the first iteration
    count = 0

    # Record the start time so we know when to disable the use of the old pub key
    starttime = time.time()

    # The grace period should be in the factory config. Use it to determine
    # the end of lifetime for the old key object. Hardcoded for now to 30 mins.
    oldkey_gracetime = int(glideinDescript.data['OldPubKeyGraceTime'])
    oldkey_eoltime = starttime + oldkey_gracetime

    factory_downtimes = glideFactoryDowntimeLib.DowntimeFile(
        glideinDescript.data['DowntimesFile'])

    while True:
        # Check if the parent is still active. If not, clean up and die.
        check_parent(parent_pid, glideinDescript, my_entries)

        cleanupSupport.cleaners.start_background_cleanup()

        # Check if it is time to invalidate the factory's old key
        if ((time.time() > oldkey_eoltime) and
                (glideinDescript.data['OldPubKeyObj'] is not None)):
            # Invalidate the use of the factory's old key
            logSupport.log.info("Retiring use of old key.")
            logSupport.log.info(
                "Old key was valid from %s to %s ie grace of ~%s sec" %
                (starttime, oldkey_eoltime, oldkey_gracetime))
            glideinDescript.data['OldPubKeyType'] = None
            glideinDescript.data['OldPubKeyObj'] = None

        # Check if the factory is in downtime. The group is in downtime only if
        # the factory is in downtime. Entry-specific downtime is handled in the entry.
        factory_in_downtime = factory_downtimes.checkDowntime(entry="factory")

        # Record the iteration start time
        iteration_stime = time.time()
        iteration_stime_str = time.ctime()

        if factory_in_downtime:
            logSupport.log.info("Iteration at (in downtime) %s" % iteration_stime_str)
        else:
            logSupport.log.info("Iteration at %s" % iteration_stime_str)

        # PM: Shouldn't this be inside the else statement above?
        #     Why do we want to execute this if we are in downtime?
        #     Or do we want to execute only a few steps here but the code prevents us?
        try:
            done_something = iterate_one(count == 0, factory_in_downtime,
                                         glideinDescript, frontendDescript,
                                         group_name, my_entries)

            logSupport.log.info("Writing stats for all entries")

            try:
                pids = []
                # Generate a list of entries for each CPU
                cpuCount = int(glideinDescript.data['MonitorUpdateThreadCount'])
                logSupport.log.info("Number of parallel writes for stats: %i" % cpuCount)
                entrylists = [my_entries.values()[cpu::cpuCount]
                              for cpu in xrange(cpuCount)]

                # Forks are keyed by CPU number. The actual key is irrelevant.
                pipe_ids = {}
                post_writestats_info = {}

                for cpu in xrange(cpuCount):
                    r, w = os.pipe()
                    unregister_sighandler()
                    pid = os.fork()
                    if pid:
                        # I am the parent
                        register_sighandler()
                        pids.append(pid)
                        os.close(w)
                        pipe_ids[cpu] = {'r': r, 'pid': pid}
                    else:
                        # I am the child
                        os.close(r)
                        logSupport.disable_rotate = True
                        # Return the pickled entry objects in the form of a dict:
                        # return_dict[entry.name] = entry.getState()
                        return_dict = {}
                        for entry in entrylists[cpu]:
                            try:
                                entry.writeStats()
                                return_dict[entry.name] = entry.getState()
                            except:
                                entry.log.warning("Error writing stats for entry '%s'" % (entry.name))
                                entry.log.exception("Error writing stats for entry '%s': " % (entry.name))

                        try:
                            os.write(w, cPickle.dumps(return_dict))
                        except:
                            # Catch and log exceptions, if any, to avoid
                            # runaway processes.
                            entry.log.exception("Error writing pickled state for entry '%s': " % (entry.name))

                        os.close(w)
                        # Exit without triggering the SystemExit exception
                        os._exit(0)

                try:
                    logSupport.log.info("Processing response from children after write stats")
                    post_writestats_info = fetch_fork_result_list(pipe_ids)
                except:
                    logSupport.log.exception("Error processing response from one or more children after write stats")

                logSupport.roll_all_logs()

                for i in post_writestats_info:
                    for ent in post_writestats_info[i]:
                        (my_entries[ent]).setState(post_writestats_info[i][ent])
            except KeyboardInterrupt:
                raise  # this is an exit signal, pass through
            except:
                # never fail for stats reasons!
                logSupport.log.exception("Error writing stats: ")
        except KeyboardInterrupt:
            raise  # this is an exit signal, pass through
        except:
            if is_first:
                raise
            else:
                # If not the first pass, just warn
                logSupport.log.exception(
                    "Exception occurred in the main loop of Factory Group %s: " % group_name)

        cleanupSupport.cleaners.wait_for_cleanup()

        iteration_etime = time.time()
        iteration_sleep_time = sleep_time - (iteration_etime - iteration_stime)
        if iteration_sleep_time < 0:
            iteration_sleep_time = 0
        logSupport.log.info("Sleep %is" % iteration_sleep_time)
        time.sleep(iteration_sleep_time)

        count = (count + 1) % advertize_rate
        is_first = False  # Entering the following iterations
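# The helper fetch_fork_result_list() used above is defined elsewhere in the
# Factory code. The sketch below is only an assumption of its behavior, shown
# to clarify the fork/pipe pattern in iterate(): each child pickles its
# return_dict into its pipe, and the parent reads every pipe to EOF, unpickles
# the payload and reaps the child. Error handling is deliberately omitted.
def _fetch_fork_result_list_sketch(pipe_ids):
    results = {}
    for cpu, info in pipe_ids.items():
        chunks = []
        while True:
            data = os.read(info['r'], 65536)  # returns '' once the child closes its end
            if not data:
                break
            chunks.append(data)
        os.close(info['r'])
        os.waitpid(info['pid'], 0)            # reap the child to avoid zombies
        results[cpu] = cPickle.loads(''.join(chunks))
    return results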
def setUp(self):
    self.file_loc = "/tmp/downtimes.txt"
    self.downtime = glideFactoryDowntimeLib.DowntimeFile(self.file_loc)
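# An illustrative test method (an assumption, not part of the original suite):
# with no downtime periods recorded in the file, checkDowntime() is expected
# to report that the factory is not in downtime.
def test_not_in_downtime_by_default(self):
    self.assertFalse(self.downtime.checkDowntime(entry="factory"))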
def spawn(sleep_time, advertize_rate, startup_dir, glideinDescript,
          frontendDescript, entries, restart_attempts, restart_interval):
    """
    Spawn and keep track of the entry processes. Restart them if required.
    Advertise the glidefactoryglobal classad every iteration.

    @type sleep_time: long
    @param sleep_time: Delay between every iteration

    @type advertize_rate: long
    @param advertize_rate: Rate at which entries advertise their classads

    @type startup_dir: String
    @param startup_dir: Path to the glideinsubmit directory

    @type glideinDescript: glideFactoryConfig.GlideinDescript
    @param glideinDescript: Factory config's glidein description object

    @type frontendDescript: glideFactoryConfig.FrontendDescript
    @param frontendDescript: Factory config's frontend description object

    @type entries: list
    @param entries: Sorted list of entry names

    @type restart_interval: long
    @param restart_interval: Allowed restart interval in seconds

    @type restart_attempts: long
    @param restart_attempts: Number of allowed restart attempts in the interval
    """

    global STARTUP_DIR
    childs = {}

    # Number of glideFactoryEntry processes to spawn; directly related to the
    # number of concurrent condor_status processes
    #
    # NOTE: If the number of entries gets too big, we may exceed the shell args
    #       limit. If that becomes an issue, move the logic to identify the
    #       entries to serve to the group itself.
    #
    # Each process will handle multiple entries split as follows
    #   - Sort the entries alphabetically. Already done
    #   - Divide the list into chunks as equal as possible
    #   - The last chunk may get fewer entries
    # (an illustrative sketch of this grouping appears after spawn() below)
    entry_process_count = 1

    starttime = time.time()
    oldkey_gracetime = int(glideinDescript.data['OldPubKeyGraceTime'])
    oldkey_eoltime = starttime + oldkey_gracetime

    childs_uptime = {}

    factory_downtimes = glideFactoryDowntimeLib.DowntimeFile(glideinDescript.data['DowntimesFile'])

    logSupport.log.info("Available Entries: %s" % entries)

    group_size = long(math.ceil(float(len(entries))/entry_process_count))
    entry_groups = entry_grouper(group_size, entries)

    def _set_rlimit(soft_l=None, hard_l=None):
        # Set new hard and soft open file limits.
        # If setting the limits fails, or no input parameters are given, keep
        # the limits inherited from the parent process.
        # NB 1: it is possible to raise limits up to [hard_l, hard_l], but once
        #       lowered they cannot be raised.
        # NB 2: it may be better just to omit calling this function at all from
        #       the subprocess, in which case it inherits the limits from the
        #       parent process.
        lim = resource.getrlimit(resource.RLIMIT_NOFILE)
        if soft_l is not None or hard_l is not None:
            if not hard_l:
                hard_l = soft_l
            if not soft_l:
                soft_l = hard_l
            try:
                new_lim = [soft_l, hard_l]
                resource.setrlimit(resource.RLIMIT_NOFILE, new_lim)
            except:
                resource.setrlimit(resource.RLIMIT_NOFILE, lim)

    try:
        for group in range(len(entry_groups)):
            entry_names = string.join(entry_groups[group], ':')
            logSupport.log.info("Starting EntryGroup %s: %s" %
                                (group, entry_groups[group]))

            # Converted to using the subprocess module
            command_list = [sys.executable,
                            os.path.join(STARTUP_DIR, "glideFactoryEntryGroup.py"),
                            str(os.getpid()),
                            str(sleep_time),
                            str(advertize_rate),
                            startup_dir,
                            entry_names,
                            str(group)]
            childs[group] = subprocess.Popen(command_list, shell=False,
                                             stdout=subprocess.PIPE,
                                             stderr=subprocess.PIPE,
                                             close_fds=True,
                                             preexec_fn=_set_rlimit)

            # Record the startup time. Used to check if the entry is crashing
            # periodically and needs to be restarted.
            childs_uptime[group] = list()
            childs_uptime[group].insert(0, time.time())

        logSupport.log.info("EntryGroup startup times: %s" % childs_uptime)

        for group in childs:
            # Set the pipes in non-blocking mode: since we will run for a long
            # time, we do not want to block.
            for fd in (childs[group].stdout.fileno(),
                       childs[group].stderr.fileno()):
                fl = fcntl.fcntl(fd, fcntl.F_GETFL)
                fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)

        # If RemoveOldCredFreq < 0, do not do credential cleanup.
        curr_time = 0  # To ensure curr_time is always initialized
        if int(glideinDescript.data['RemoveOldCredFreq']) > 0:
            # Convert credential removal frequency from hours to seconds
            remove_old_cred_freq = int(glideinDescript.data['RemoveOldCredFreq']) * 60 * 60
            curr_time = time.time()
            update_time = curr_time + remove_old_cred_freq

            # Convert credential removal age from days to seconds
            remove_old_cred_age = int(glideinDescript.data['RemoveOldCredAge']) * 60 * 60 * 24

            # Create cleaners for old credential files
            logSupport.log.info("Adding cleaners for old credentials")
            cred_base_dir = glideinDescript.data['ClientProxiesBaseDir']
            for username in frontendDescript.get_all_usernames():
                cred_base_user = os.path.join(cred_base_dir, "user_%s" % username)
                cred_user_instance_dirname = os.path.join(
                    cred_base_user, "glidein_%s" % glideinDescript.data['GlideinName'])
                cred_cleaner = cleanupSupport.DirCleanupCredentials(
                    cred_user_instance_dirname, "(credential_*)", remove_old_cred_age)
                cleanupSupport.cred_cleaners.add_cleaner(cred_cleaner)

        iteration_basetime = time.time()
        while True:
            # Retrieve the WebMonitoringURL from the glideclient classad
            iteration_timecheck = time.time()
            iteration_timediff = iteration_timecheck - iteration_basetime

            if iteration_timediff >= 3600:  # every hour
                iteration_basetime = time.time()  # reset the start time
                fronmonpath = os.path.join(startup_dir, "monitor", "frontendmonitorlink.txt")
                fronmonconstraint = '(MyType=="glideclient")'
                fronmonformat_list = [('WebMonitoringURL', 's'), ('FrontendName', 's')]
                fronmonstatus = condorMonitor.CondorStatus(subsystem_name="any")
                fronmondata = fronmonstatus.fetch(constraint=fronmonconstraint,
                                                  format_list=fronmonformat_list)
                fronmon_list_names = fronmondata.keys()
                if fronmon_list_names is not None:
                    urlset = set()
                    if os.path.exists(fronmonpath):
                        os.remove(fronmonpath)
                    for frontend_entry in fronmon_list_names:
                        fronmonelement = fronmondata[frontend_entry]
                        fronmonurl = fronmonelement['WebMonitoringURL'].encode('utf-8')
                        fronmonfrt = fronmonelement['FrontendName'].encode('utf-8')
                        if (fronmonfrt, fronmonurl) not in urlset:
                            urlset.add((fronmonfrt, fronmonurl))
                            with open(fronmonpath, 'w') as fronmonf:
                                fronmonf.write("%s, %s" % (fronmonfrt, fronmonurl))

            # Record the iteration start time
            iteration_stime = time.time()

            # THIS IS FOR SECURITY
            # Make sure you delete the old key when its grace period is up.
            # If a compromised key is left around and an attacker can somehow
            # trigger a FactoryEntry process crash, we do not want the entry
            # to pick up the old key again when the factory auto-restarts it.
            if (time.time() > oldkey_eoltime and
                    glideinDescript.data['OldPubKeyObj'] is not None):
                glideinDescript.data['OldPubKeyObj'] = None
                glideinDescript.data['OldPubKeyType'] = None
                try:
                    glideinDescript.remove_old_key()
                    logSupport.log.info("Removed the old public key after its grace time of %s seconds" % oldkey_gracetime)
                except:
                    # Do not crash if the delete fails. Just log it.
                    logSupport.log.warning("Failed to remove the old public key after its grace time")

            # Only removing credentials in the v3+ protocol.
            # Affects the Corral Frontend, which only supports the v3+ protocol.
            # If freq < zero, do not do cleanup.
            if (int(glideinDescript.data['RemoveOldCredFreq']) > 0 and
                    curr_time >= update_time):
                logSupport.log.info("Checking credentials for cleanup")

                # Query the queue for glideins. Don't remove proxies in use.
                try:
                    in_use_creds = glideFactoryLib.getCondorQCredentialList()
                    cleanupSupport.cred_cleaners.cleanup(in_use_creds)
                except:
                    logSupport.log.exception("Unable to cleanup old credentials")

                update_time = curr_time + remove_old_cred_freq

            curr_time = time.time()

            logSupport.log.info("Checking for credentials %s" % entries)

            # Read in the frontend globals classads.
            # Do this first so that the credentials are immediately
            # available when the Entries start up.
            classads = {}
            try:
                classads = glideFactoryCredentials.get_globals_classads()
            except Exception:
                logSupport.log.error("Error occurred retrieving globals classad -- is Condor running?")

            for classad_key in classads:
                classad = classads[classad_key]
                try:
                    glideFactoryCredentials.process_global(classad,
                                                           glideinDescript,
                                                           frontendDescript)
                except:
                    logSupport.log.exception("Error occurred processing the globals classads: ")

            logSupport.log.info("Checking EntryGroups %s" % childs.keys())
            for group in childs:
                entry_names = string.join(entry_groups[group], ':')
                child = childs[group]

                # Empty stdout and stderr
                try:
                    tempOut = child.stdout.read()
                    if len(tempOut) != 0:
                        logSupport.log.warning("EntryGroup %s STDOUT: %s" % (group, tempOut))
                except IOError:
                    pass  # ignore
                try:
                    tempErr = child.stderr.read()
                    if len(tempErr) != 0:
                        logSupport.log.warning("EntryGroup %s STDERR: %s" % (group, tempErr))
                except IOError:
                    pass  # ignore

                # Look for an exited child
                if child.poll():
                    # the child exited
                    logSupport.log.warning("EntryGroup %s exited. Checking if it should be restarted." % (group))
                    tempOut = child.stdout.readlines()
                    tempErr = child.stderr.readlines()

                    if is_crashing_often(childs_uptime[group], restart_interval, restart_attempts):
                        del childs[group]
                        raise RuntimeError("EntryGroup '%s' has been crashing too often, quit the whole factory:\n%s\n%s" % (group, tempOut, tempErr))
                    else:
                        # Restart the entry, recording its restart time
                        logSupport.log.warning("Restarting EntryGroup %s." % (group))
                        del childs[group]

                        command_list = [sys.executable,
                                        os.path.join(STARTUP_DIR, "glideFactoryEntryGroup.py"),
                                        str(os.getpid()),
                                        str(sleep_time),
                                        str(advertize_rate),
                                        startup_dir,
                                        entry_names,
                                        str(group)]
                        childs[group] = subprocess.Popen(command_list, shell=False,
                                                         stdout=subprocess.PIPE,
                                                         stderr=subprocess.PIPE,
                                                         close_fds=True,
                                                         preexec_fn=_set_rlimit)

                        if len(childs_uptime[group]) == restart_attempts:
                            childs_uptime[group].pop(0)
                        childs_uptime[group].append(time.time())
                        for fd in (childs[group].stdout.fileno(),
                                   childs[group].stderr.fileno()):
                            fl = fcntl.fcntl(fd, fcntl.F_GETFL)
                            fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
                        logSupport.log.warning("EntryGroup startup/restart times: %s" % (childs_uptime,))

            # Aggregate monitoring data periodically
            logSupport.log.info("Aggregate monitoring data")
            stats = aggregate_stats(factory_downtimes.checkDowntime())
            save_stats(stats, os.path.join(startup_dir,
                                           glideFactoryConfig.factoryConfig.aggregated_stats_file))

            # Aggregate job data periodically
            if glideinDescript.data.get('AdvertisePilotAccounting', False) in ['True', '1']:  # data attributes are strings
                logSupport.log.info("Starting updating job classads")
                update_classads()
                logSupport.log.info("Finishing updating job classads")

            # Advertise the global classad with the factory keys and Factory statistics
            try:
                # KEL TODO need to add factory downtime?
                glideFactoryInterface.advertizeGlobal(
                    glideinDescript.data['FactoryName'],
                    glideinDescript.data['GlideinName'],
                    glideFactoryLib.factoryConfig.supported_signtypes,
                    glideinDescript.data['PubKeyObj'])
            except Exception as e:
                logSupport.log.exception("Error advertising global classads: %s" % e)

            cleanupSupport.cleaners.cleanup()

            iteration_etime = time.time()
            iteration_sleep_time = sleep_time - (iteration_etime - iteration_stime)
            if iteration_sleep_time < 0:
                iteration_sleep_time = 0
            logSupport.log.info("Sleep %s secs" % iteration_sleep_time)
            time.sleep(iteration_sleep_time)
        # end while True
    finally:
        # Cleanup at exit
        logSupport.log.info("Received signal...exit")
        try:
            try:
                clean_exit(childs)
            except:
                # If anything goes wrong, hard kill the rest
                for group in childs:
                    logSupport.log.info("Hard killing EntryGroup %s" % group)
                    try:
                        os.kill(childs[group].pid, signal.SIGKILL)
                    except OSError:
                        pass  # ignore dead clients
        finally:
            logSupport.log.info("Deadvertize myself")
            try:
                glideFactoryInterface.deadvertizeFactory(
                    glideinDescript.data['FactoryName'],
                    glideinDescript.data['GlideinName'])
            except:
                logSupport.log.exception("Factory deadvertize failed!")
            try:
                glideFactoryInterface.deadvertizeFactoryClientMonitoring(
                    glideinDescript.data['FactoryName'],
                    glideinDescript.data['GlideinName'])
            except:
                logSupport.log.exception("Factory Monitoring deadvertize failed!")
    logSupport.log.info("All EntryGroups should be terminated")
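# entry_grouper(), used in spawn() above, is defined elsewhere in this module.
# The sketch below is only an assumption of the grouping it performs, matching
# the comments in spawn(): consecutive chunks of group_size entries, with the
# last chunk possibly shorter.
def _entry_grouper_sketch(size, entries):
    if size <= 0:
        return []
    return [entries[i:i + size] for i in xrange(0, len(entries), size)]

# Example: with entry_process_count = 1, group_size equals the number of
# entries, so everything lands in a single group:
#   _entry_grouper_sketch(5, ['e1', 'e2', 'e3', 'e4', 'e5'])
#   -> [['e1', 'e2', 'e3', 'e4', 'e5']]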