def iterate(parent_pid, sleep_time, advertize_rate, glideinDescript,
            frontendDescript, group_name, my_entries):
    """
    Iterate over the set of tasks until it is time to quit or die. The main
    "worker" function for the Factory Entry Group.
    @todo: More description to come

    @type parent_pid: int
    @param parent_pid: the pid for the Factory daemon

    @type sleep_time: int
    @param sleep_time: The number of seconds to sleep between iterations

    @type advertize_rate: int
    @param advertize_rate: The rate at which advertising should occur

    @type glideinDescript: glideFactoryConfig.GlideinDescript
    @param glideinDescript: glidein.descript object in the Factory root dir

    @type frontendDescript: glideFactoryConfig.FrontendDescript
    @param frontendDescript: frontend.descript object in the Factory root dir

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects keyed on entry name
    """

    is_first = True  # In first iteration
    count = 0

    # Record the start time so we know when to disable the use of the old pub key
    starttime = time.time()

    # The grace period should be in the factory config. Use it to determine
    # the end of lifetime for the old key object. Hardcoded for now to 30 mins.
    oldkey_gracetime = int(glideinDescript.data['OldPubKeyGraceTime'])
    oldkey_eoltime = starttime + oldkey_gracetime

    factory_downtimes = glideFactoryDowntimeLib.DowntimeFile(
        glideinDescript.data['DowntimesFile'])

    while True:
        # Check if the parent is still active. If not, clean up and die.
        check_parent(parent_pid, glideinDescript, my_entries)

        cleanupSupport.cleaners.start_background_cleanup()

        # Check if it is time to invalidate the factory's old key
        if ((time.time() > oldkey_eoltime) and
                (glideinDescript.data['OldPubKeyObj'] is not None)):
            # Invalidate the use of the factory's old key
            logSupport.log.info("Retiring use of old key.")
            logSupport.log.info("Old key was valid from %s to %s ie grace of ~%s sec" % (starttime, oldkey_eoltime, oldkey_gracetime))
            glideinDescript.data['OldPubKeyType'] = None
            glideinDescript.data['OldPubKeyObj'] = None

        # Check if the factory is in downtime. The group is in downtime only if
        # the factory is in downtime. Entry specific downtime is handled in the
        # entry.
        factory_in_downtime = factory_downtimes.checkDowntime(entry="factory")

        # Record the iteration start time
        iteration_stime = time.time()
        iteration_stime_str = time.ctime()

        if factory_in_downtime:
            logSupport.log.info("Iteration at (in downtime) %s" % iteration_stime_str)
        else:
            logSupport.log.info("Iteration at %s" % iteration_stime_str)

        # PM: Shouldn't this be inside the else statement above?
        #     Why do we want to execute this if we are in downtime?
        #     Or do we want to execute only a few steps here but the code prevents us?
        try:
            done_something = iterate_one(count == 0, factory_in_downtime,
                                         glideinDescript, frontendDescript,
                                         group_name, my_entries)

            logSupport.log.info("Writing stats for all entries")

            try:
                pids = []
                # Generate a list of entries for each CPU
                cpuCount = int(glideinDescript.data['MonitorUpdateThreadCount'])
                logSupport.log.info("Number of parallel writes for stats: %i" % cpuCount)

                entrylists = [my_entries.values()[cpu::cpuCount]
                              for cpu in xrange(cpuCount)]

                # Forks are keyed by cpu number. The actual key is irrelevant.
                pipe_ids = {}
                post_writestats_info = {}

                for cpu in xrange(cpuCount):
                    r, w = os.pipe()
                    unregister_sighandler()
                    pid = os.fork()
                    if pid:
                        # I am the parent
                        register_sighandler()
                        pids.append(pid)
                        os.close(w)
                        pipe_ids[cpu] = {'r': r, 'pid': pid}
                    else:
                        # I am the child
                        os.close(r)
                        logSupport.disable_rotate = True
                        # Return the pickled entry object in the form of a dict
                        # return_dict[entry.name][entry.getState()]
                        return_dict = {}
                        for entry in entrylists[cpu]:
                            try:
                                entry.writeStats()
                                return_dict[entry.name] = entry.getState()
                            except:
                                entry.log.warning("Error writing stats for entry '%s'" % (entry.name))
                                entry.log.exception("Error writing stats for entry '%s': " % (entry.name))

                        try:
                            os.write(w, cPickle.dumps(return_dict))
                        except:
                            # Catch and log exceptions, if any, to avoid
                            # runaway processes.
                            entry.log.exception("Error writing pickled state for entry '%s': " % (entry.name))

                        os.close(w)
                        # Exit without triggering the SystemExit exception
                        os._exit(0)

                try:
                    logSupport.log.info("Processing response from children after write stats")
                    post_writestats_info = fetch_fork_result_list(pipe_ids)
                except:
                    logSupport.log.exception("Error processing response from one or more children after write stats")

                logSupport.roll_all_logs()

                for i in post_writestats_info:
                    for ent in post_writestats_info[i]:
                        (my_entries[ent]).setState(post_writestats_info[i][ent])
            except KeyboardInterrupt:
                raise  # this is an exit signal, pass through
            except:
                # Never fail for stats reasons!
                logSupport.log.exception("Error writing stats: ")
        except KeyboardInterrupt:
            raise  # this is an exit signal, pass through
        except:
            if is_first:
                raise
            else:
                # If not the first pass, just warn
                logSupport.log.exception("Exception occurred in the main loop of Factory Group %s: " % group_name)

        cleanupSupport.cleaners.wait_for_cleanup()

        iteration_etime = time.time()
        iteration_sleep_time = sleep_time - (iteration_etime - iteration_stime)
        if iteration_sleep_time < 0:
            iteration_sleep_time = 0
        logSupport.log.info("Sleep %is" % iteration_sleep_time)
        time.sleep(iteration_sleep_time)

        count = (count + 1) % advertize_rate
        is_first = False  # Entering following iterations
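
# --- Illustrative sketch (not used by this module) ---------------------------
# The stats-writing loop in iterate() above relies on a fork/pipe/pickle
# pattern: fork one child per slice of entries, have each child pickle its
# results into a pipe, and collect and unpickle everything in the parent (the
# real code does the collection with fetch_fork_result_list()). The helper
# below is a minimal, hypothetical reduction of that pattern; _example_run_forked
# and worker_fn are illustrative names only, and the error handling and signal
# bookkeeping of the real code are omitted.
def _example_run_forked(worker_fn, item_slices):
    """Fork one child per slice; return {slice_index: unpickled child result}."""
    import os
    import cPickle

    pipes = {}
    for idx, items in enumerate(item_slices):
        r, w = os.pipe()
        pid = os.fork()
        if pid:
            # Parent: keep the read end and remember the child pid
            os.close(w)
            pipes[idx] = (r, pid)
        else:
            # Child: do the work, send the pickled result back, exit hard
            os.close(r)
            try:
                os.write(w, cPickle.dumps(worker_fn(items)))
            finally:
                os.close(w)
                os._exit(0)  # exit without running any cleanup handlers

    results = {}
    for idx in pipes:
        r, pid = pipes[idx]
        data = ""
        while True:
            chunk = os.read(r, 65536)
            if not chunk:
                break
            data += chunk
        os.close(r)
        os.waitpid(pid, 0)  # reap the child
        results[idx] = cPickle.loads(data)
    return results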
def compile_pickle_data(entry, work_done):
    """
    Extract the state of the entry after doing work

    @type entry: Entry
    @param entry: Entry object

    @type work_done: int
    @param work_done: Work done info
    """

    return_dict = entry.getState()
    return_dict['work_done'] = work_done

    return return_dict
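
# --- Illustrative sketch (not used by this module) ---------------------------
# compile_pickle_data() is what a forked child sends back to its parent in
# find_and_perform_work() below: the child pickles the dict into a pipe, and
# the parent can later restore the per-entry state with entry.setState(), as
# the stats path in iterate() does. The function below is a hypothetical,
# in-process reduction of that round trip; it assumes only that
# entry.getState() returns a picklable dict.
def _example_state_roundtrip(entry, work_done):
    """Pickle and unpickle the state dict a child would hand to the parent."""
    import cPickle

    payload = cPickle.dumps(compile_pickle_data(entry, work_done))  # child side
    state = cPickle.loads(payload)                                  # parent side
    assert state['work_done'] == work_done
    return state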
""" Extract the state of the entry after doing work @type entry: Entry @param entry: Entry object @type work_done: int @param work_done: Work done info """ return_dict = entry.getState() return_dict['work_done'] = work_done return return_dict ############################################################ # # S T A R T U P # ############################################################ if __name__ == '__main__': register_sighandler() # Force integrity checks on all condor operations gfl.set_condor_integrity_checks() main(sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), sys.argv[4], sys.argv[5], sys.argv[6])
def find_and_perform_work(factory_in_downtime, glideinDescript,
                          frontendDescript, group_name, my_entries):
    """
    For all entries in this group, find work requests from the WMS collector,
    validate credentials, and request glideins. If an entry is in downtime,
    the number of requested glideins is zero.

    @type factory_in_downtime: boolean
    @param factory_in_downtime: True if factory is in downtime

    @type glideinDescript: dict
    @param glideinDescript: Factory glidein config values

    @type frontendDescript: dict
    @param frontendDescript: Security mappings for frontend identities,
                             security classes, and usernames for privsep

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects keyed on entry name

    @return: Dictionary of work to do keyed on entry name
    @rtype: dict
    """

    # Work done by the group, keyed by entry name. This will be returned back
    groupwork_done = {}

    # Step 1:
    # Find work to perform. Work is a dict work[entry_name][frontend]
    # We may or may not be able to perform all the work but that will be
    # checked later per entry

    work = {}
    work = find_work(factory_in_downtime, glideinDescript, frontendDescript,
                     group_name, my_entries)

    # TODO: If we return here, check if we need to do cleanup of held glideins?
    #       So far only de-advertising is confirmed to trigger, not cleanup
    work_count = get_work_count(work)
    if work_count == 0:
        logSupport.log.info("No work found")
        return groupwork_done

    logSupport.log.info("Found %s total tasks to work on" % work_count)

    # Got the work items grouped by entries.
    # Now fork a process per entry and wait for a certain duration to get back
    # the results. Kill processes if they take too long to get back with results.

    # ids keyed by entry name
    pipe_ids = {}

    # Max number of children to fork at a time
    # Each child currently takes ~50 MB
    # Leaving 3GB for the system, the max number of children to fork is
    # (Memory - 3000)/50 = 100 (RAM: 8GB) & 250 (RAM: 16GB)
    parallel_workers = 0
    try:
        parallel_workers = int(glideinDescript.data['EntryParallelWorkers'])
    except KeyError:
        logSupport.log.debug("EntryParallelWorkers not set -- factory probably needs a reconfig; setting to 0 for dynamic limits.")

    post_work_info = {}
    work_info_read_err = False

    if parallel_workers <= 0:
        logSupport.log.debug("Setting parallel_workers limit dynamically based on the available free memory")
        free_mem = os.sysconf('SC_AVPHYS_PAGES') * os.sysconf('SC_PAGE_SIZE')
        parallel_workers = int(free_mem / float(ENTRY_MEM_REQ_BYTES))
        if parallel_workers < 1:
            parallel_workers = 1

    logSupport.log.debug("Setting parallel_workers limit of %s" % parallel_workers)
    forks_remaining = parallel_workers

    # Only fork off child processes for entries that have corresponding
    # work to do, ie glideclient classads.
    for ent in work:
        # Check if we can fork more
        while forks_remaining == 0:
            # Give some time for the processes to finish the work
            #logSupport.log.debug("Reached parallel_workers limit of %s" % parallel_workers)
            time.sleep(1)

            post_work_info_subset = {}
            # Wait and gather results for work done so far before forking more
            try:
                #post_work_info_subset = fetch_fork_result_list(pipe_ids)
                #logSupport.log.debug("Checking finished workers")
                post_work_info_subset = process_finished_children(pipe_ids)
                post_work_info.update(post_work_info_subset)
            except RuntimeError:
                # Expect all errors logged already
                work_info_read_err = True

            forks_remaining += len(post_work_info_subset)

            for en in post_work_info_subset:
                del pipe_ids[en]

        entry = my_entries[ent]

        r, w = os.pipe()
        unregister_sighandler()
        pid = os.fork()
        forks_remaining -= 1

        if pid != 0:
            register_sighandler()
            # This is the parent process
            os.close(w)
            pipe_ids[entry.name] = {'r': r, 'pid': pid}
        else:
            # This is the child process
            try:
                logSupport.disable_rotate = True
                os.close(r)

                try:
                    work_done = glideFactoryEntry.check_and_perform_work(
                        factory_in_downtime, entry, work[entry.name])
                    # The entry object now has updated info in the child process.
                    # This info is required for monitoring and advertising.
                    # Compile the return info from the updated entry object.
                    # Can't dumps the entry object directly, so need to extract
                    # the info required.
                    return_dict = compile_pickle_data(entry, work_done)
                    os.write(w, cPickle.dumps(return_dict))
                except Exception, ex:
                    tb = traceback.format_exception(sys.exc_info()[0],
                                                    sys.exc_info()[1],
                                                    sys.exc_info()[2])
                    entry.log.exception("Error in check_and_perform_work for entry '%s' " % entry.name)

                os.close(w)
                # Hard kill myself. Don't want any cleanup, since I was created
                # just for doing check and perform work for each entry
            finally:
                os._exit(0)


############################################################
#
# S T A R T U P
#
############################################################

if __name__ == '__main__':
    register_sighandler()

    # Force integrity checks on all condor operations
    gfl.set_condor_integrity_checks()

    main(sys.argv[1], int(sys.argv[2]), int(sys.argv[3]),
         sys.argv[4], sys.argv[5], sys.argv[6])
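
# --- Illustrative sketch (not used by this module) ---------------------------
# find_and_perform_work() above throttles its os.fork() calls with a bounded
# pool: at most parallel_workers children run at once, and when the pool is
# full the parent reaps finished children (process_finished_children) to free
# slots before forking more. The helper below is a hypothetical reduction of
# that bookkeeping; spawn_child and reap_finished stand in for the real
# fork/pipe plumbing and are illustrative names only.
def _example_bounded_fork_pool(items, limit, spawn_child, reap_finished):
    """Run spawn_child(item) for every item, never more than `limit` at once.

    spawn_child(item) must fork a child and return (child_id, bookkeeping);
    reap_finished(children) must return {child_id: result} for finished ones.
    """
    import time

    children = {}   # child_id -> bookkeeping, like pipe_ids above
    results = {}
    slots = limit
    for item in items:
        while slots == 0:
            # Pool is full: give the children a moment, then collect whatever
            # has finished and free up that many slots
            time.sleep(1)
            finished = reap_finished(children)
            results.update(finished)
            slots += len(finished)
            for cid in finished:
                del children[cid]
        cid, info = spawn_child(item)
        children[cid] = info
        slots -= 1
    # Any children still running when the loop ends are left for the caller
    # to collect
    return children, results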