def forked_check_and_perform_work(factory_in_downtime, entry, work): work_done = glideFactoryEntry.check_and_perform_work( factory_in_downtime, entry, work[entry.name]) # entry object now has updated info in the child process # This info is required for monitoring and advertising # Compile the return info from th updated entry object # Can't dumps the entry object directly, so need to extract # the info required. return_dict = compile_pickle_data(entry, work_done) return return_dict
def forked_check_and_perform_work(factory_in_downtime, entry, work):
    """
    Run check_and_perform_work for a single entry inside a forked child.

    @param factory_in_downtime: flag, True if the Factory is in downtime
    @param entry: entry object (glideFactoryEntry.Entry)
    @param work: dictionary of work requests keyed on the entry name

    @return: picklable dictionary with the entry state + work_done
    """
    # Pick out the requests addressed to this entry and process them.
    entry_work = work[entry.name]
    work_done = glideFactoryEntry.check_and_perform_work(
        factory_in_downtime, entry, entry_work)

    # The entry object in this child now carries updated info needed for
    # monitoring and advertising. The entry itself cannot be pickled, so
    # extract the required information into a plain dictionary instead.
    return compile_pickle_data(entry, work_done)
def forked_check_and_perform_work(factory_in_downtime, entry, work):
    """
    Do the work assigned to an entry (glidein requests).

    @param factory_in_downtime: flag, True if the Factory is in downtime
    @param entry: entry object (glideFactoryEntry.Entry)
    @param work: work requests for the entry

    @return: dictionary with entry state + work_done
    """
    # Process the requests; the entry object is updated in place in this
    # child process with info needed for monitoring and advertising.
    outcome = glideFactoryEntry.check_and_perform_work(factory_in_downtime,
                                                       entry, work)

    # The updated entry object cannot be dumped (pickled) directly, so
    # distill only the required information into a plain dictionary.
    pickled_state = compile_pickle_data(entry, outcome)
    return pickled_state
def forked_check_and_perform_work(factory_in_downtime, entry, work):
    """
    Do the work assigned to an entry (glidein requests).

    @param factory_in_downtime: flag, True if the Factory is in downtime
    @param entry: entry object (glideFactoryEntry.Entry)
    @param work: work requests for the entry

    @return: dictionary with entry state + work_done
    """
    work_done = glideFactoryEntry.check_and_perform_work(
        factory_in_downtime, entry, work)

    # check_and_perform_work has updated the entry object in this child
    # process; that info is required for monitoring and advertising.
    # Since the entry object itself cannot be pickled, build a plain
    # dictionary holding just the information the parent needs.
    return compile_pickle_data(entry, work_done)
def find_and_perform_work(factory_in_downtime, glideinDescript, frontendDescript, group_name, my_entries): """ For all entries in this group, find work requests from the WMS collector, validate credentials, and requests glideins. If an entry is in downtime, requested glideins is zero. @type factory_in_downtime: boolean @param factory_in_downtime: True if factory is in downtime @type glideinDescript: dict @param glideinDescript: Factory glidein config values @type frontendDescript: dict @param frontendDescript: Security mappings for frontend identities, security classes, and usernames for privsep @type group_name: string @param group_name: Name of the group @type my_entries: dict @param my_entries: Dictionary of entry objects keyed on entry name @return: Dictionary of work to do keyed on entry name @rtype: dict """ # Work done by group keyed by entry name. This will be returned back groupwork_done = {} # Step 1: # Find work to perform. Work is a dict work[entry_name][frontend] # We may or may not be able to perform all the work but that will be # checked later per entry work = {} work = find_work(factory_in_downtime, glideinDescript, frontendDescript, group_name, my_entries) # TODO: If we return here check if we need to do cleanup of held glideins? # So far only de-advertising is confirmed to trigger not cleanup work_count = get_work_count(work) if (work_count == 0): logSupport.log.info("No work found") return groupwork_done logSupport.log.info("Found %s total tasks to work on" % work_count) # Got the work items grouped by entries # Now fork a process per entry and wait for certain duration to get back # the results. 
Kill processes if they take too long to get back with result # ids keyed by entry name pipe_ids = {} # Max number of children to fork at a time # Each child currently takes ~50 MB # Leaving 3GB for system, max number of children to fork is # (Memory - 3000)/50 = 100 (RAM: 8GB) & 250 (RAM: 16GB) parallel_workers = 0 try: parallel_workers = int(glideinDescript.data['EntryParallelWorkers']) except KeyError: logSupport.log.debug("EntryParallelWorkers not set -- factory probably needs a reconfig; setting to 0 for dynamic limits.") post_work_info = {} work_info_read_err = False if parallel_workers <= 0: logSupport.log.debug("Setting parallel_workers limit dynamically based on the available free memory") free_mem = os.sysconf('SC_AVPHYS_PAGES')*os.sysconf('SC_PAGE_SIZE') parallel_workers = int(free_mem / float(ENTRY_MEM_REQ_BYTES)) if parallel_workers < 1: parallel_workers = 1 logSupport.log.debug("Setting parallel_workers limit of %s" % parallel_workers) forks_remaining = parallel_workers # Only fork of child processes for entries that have corresponding # work todo, ie glideclient classads. 
for ent in work: # Check if we can fork more while (forks_remaining == 0): # Give some time for the processes to finsh the work #ogSupport.log.debug("Reached parallel_workers limit of %s" % parallel_workers) time.sleep(1) post_work_info_subset = {} # Wait and gather results for work done so far before forking more try: #post_work_info_subset = fetch_fork_result_list(pipe_ids) #ogSupport.log.debug("Checking finished workers") post_work_info_subset = process_finished_children(pipe_ids) post_work_info.update(post_work_info_subset) except RuntimeError: # Expect all errors logged already work_info_read_err = True forks_remaining += len(post_work_info_subset) for en in post_work_info_subset: del pipe_ids[en] entry = my_entries[ent] r,w = os.pipe() unregister_sighandler() pid = os.fork() forks_remaining -= 1 if pid != 0: register_sighandler() # This is the parent process os.close(w) pipe_ids[entry.name] = {'r': r, 'pid': pid} else: # This is the child process try: logSupport.disable_rotate = True os.close(r) try: work_done = glideFactoryEntry.check_and_perform_work( factory_in_downtime, entry, work[entry.name]) # entry object now has updated info in the child process # This info is required for monitoring and advertising # Compile the return info from th updated entry object # Can't dumps the entry object directly, so need to extract # the info required. return_dict = compile_pickle_data(entry, work_done) os.write(w,cPickle.dumps(return_dict)) except Exception, ex: tb = traceback.format_exception(sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2]) entry.log.exception("Error in check_and_perform_work for entry '%s' " % entry.name) os.close(w) # Hard kill myself. Don't want any cleanup, since I was created # just for doing check and perform work for each entry finally: