Example #1
def forked_check_and_perform_work(factory_in_downtime, entry, work):
    work_done = glideFactoryEntry.check_and_perform_work(
                    factory_in_downtime, entry, work[entry.name])
    
    # entry object now has updated info in the child process
    # This info is required for monitoring and advertising
    # Compile the return info from the updated entry object.
    # Can't pickle (dumps) the entry object directly, so we need to
    # extract the required info.
    return_dict = compile_pickle_data(entry, work_done)
    return return_dict
Example #2
def forked_check_and_perform_work(factory_in_downtime, entry, work):
    work_done = glideFactoryEntry.check_and_perform_work(
        factory_in_downtime, entry, work[entry.name])

    # entry object now has updated info in the child process
    # This info is required for monitoring and advertising
    # Compile the return info from the updated entry object
    # Can't pickle (dumps) the entry object directly, so we need to
    # extract the required info.
    return_dict = compile_pickle_data(entry, work_done)
    return return_dict
Example #3
def forked_check_and_perform_work(factory_in_downtime, entry, work):
    """
    Do the work assigned to an entry (glidein requests)
    @param factory_in_downtime: flag, True if the Factory is in downtime
    @param entry: entry object (glideFactoryEntry.Entry)
    @param work: work requests for the entry
    @return: dictionary with entry state + work_done
    """
    work_done = glideFactoryEntry.check_and_perform_work(factory_in_downtime, entry, work)
    
    # entry object now has updated info in the child process
    # This info is required for monitoring and advertising
    # Compile the return info from the updated entry object
    # Can't pickle (dumps) the entry object directly, so we need to
    # extract the required info.
    return_dict = compile_pickle_data(entry, work_done)
    return return_dict
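The parent collects each child's pickled dictionary from the corresponding pipe (via process_finished_children / fetch_fork_result_list in find_and_perform_work below, whose implementations are not shown here). A minimal sketch of that kind of parent-side read, using a hypothetical helper name and assuming the child closes its write end of the pipe when it is done, could look like this:

import os
import cPickle

def read_child_result(r, pid):
    # Hypothetical illustration only; the real code goes through
    # process_finished_children. Drain the pipe's read end until EOF
    # (the child closed its write end), reap the child, then unpickle.
    chunks = []
    while True:
        data = os.read(r, 65536)
        if not data:
            break
        chunks.append(data)
    os.close(r)
    os.waitpid(pid, 0)  # avoid leaving a zombie process
    return cPickle.loads(''.join(chunks))

This mirrors the pipe_ids bookkeeping in find_and_perform_work below, where each entry stores the read descriptor 'r' and the child 'pid'.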
def find_and_perform_work(factory_in_downtime, glideinDescript,
                          frontendDescript, group_name, my_entries):
    """
    For all entries in this group, find work requests from the WMS collector,
    validate credentials, and request glideins. If an entry is in downtime,
    the number of requested glideins is zero.

    @type factory_in_downtime: boolean
    @param factory_in_downtime: True if factory is in downtime

    @type glideinDescript: dict
    @param glideinDescript: Factory glidein config values

    @type frontendDescript: dict
    @param frontendDescript: Security mappings for frontend identities, security classes, and usernames for privsep

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects keyed on entry name

    @return: Dictionary of work to do keyed on entry name
    @rtype: dict
    """

    # Work done by group keyed by entry name. This will be returned back
    groupwork_done = {}

    # Step 1:
    # Find work to perform. Work is a dict work[entry_name][frontend]
    # We may or may not be able to perform all the work but that will be
    # checked later per entry

    work = {}
    work = find_work(factory_in_downtime, glideinDescript,
                     frontendDescript, group_name, my_entries)
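    # Illustrative shape of the returned structure (hypothetical entry and
    # frontend names): work = {'entry_A': {'frontend_X': <glideclient request>}}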

    # TODO: If we return here, check whether we need to clean up held glideins.
    #       So far only de-advertising is confirmed to be triggered, not cleanup.
    work_count = get_work_count(work)
    if (work_count == 0):
        logSupport.log.info("No work found")
        return groupwork_done

    logSupport.log.info("Found %s total tasks to work on" % work_count)

    # Got the work items grouped by entries
    # Now fork a process per entry and wait for certain duration to get back
    # the results. Kill processes if they take too long to get back with result

    # ids keyed by entry name
    pipe_ids = {}

    # Max number of children to fork at a time
    # Each child currently takes ~50 MB
    # Leaving 3GB for system, max number of children to fork is
    # (Memory - 3000)/50 = 100 (RAM: 8GB) & 260 (RAM: 16GB)
    parallel_workers = 0
    try:
        parallel_workers = int(glideinDescript.data['EntryParallelWorkers'])
    except KeyError:
        logSupport.log.debug("EntryParallelWorkers not set -- factory probably needs a reconfig; setting to 0 for dynamic limits.")

    post_work_info = {}
    work_info_read_err = False

    if parallel_workers <= 0:
        logSupport.log.debug("Setting parallel_workers limit dynamically based on the available free memory")
        free_mem = os.sysconf('SC_AVPHYS_PAGES')*os.sysconf('SC_PAGE_SIZE')
        parallel_workers = int(free_mem / float(ENTRY_MEM_REQ_BYTES))
        if parallel_workers < 1: parallel_workers = 1
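        # Worked example (hypothetical numbers): with ~2 GiB of free physical
        # memory and ENTRY_MEM_REQ_BYTES at the ~50 MB per child noted above,
        # this yields roughly int(2147483648 / 52428800.0) = 40 workers.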

    logSupport.log.debug("Setting parallel_workers limit of %s" % parallel_workers)
    forks_remaining = parallel_workers

    # Only fork off child processes for entries that have corresponding
    # work to do, i.e. glideclient classads.
    for ent in work:
        # Check if we can fork more
        while (forks_remaining == 0):
            # Give some time for the processes to finish the work
            #logSupport.log.debug("Reached parallel_workers limit of %s" % parallel_workers)
            time.sleep(1)

            post_work_info_subset = {}
            # Wait and gather results for work done so far before forking more
            try:
                #post_work_info_subset = fetch_fork_result_list(pipe_ids)
                #ogSupport.log.debug("Checking finished workers")
                post_work_info_subset = process_finished_children(pipe_ids)
                post_work_info.update(post_work_info_subset)
            except RuntimeError:
                # Expect all errors logged already
                work_info_read_err = True

            forks_remaining += len(post_work_info_subset)

            for en in post_work_info_subset:
                del pipe_ids[en]

        entry = my_entries[ent]
        r,w = os.pipe()
        unregister_sighandler()
        pid = os.fork()
        forks_remaining -= 1

        if pid != 0:
            register_sighandler()
            # This is the parent process
            os.close(w)
            pipe_ids[entry.name] = {'r': r, 'pid': pid}
        else:
            # This is the child process
            try:
                logSupport.disable_rotate = True
                os.close(r)

                try:
                    work_done = glideFactoryEntry.check_and_perform_work(
                                    factory_in_downtime, entry, work[entry.name])
                    # entry object now has updated info in the child process
                    # This info is required for monitoring and advertising
                    # Compile the return info from the updated entry object.
                    # Can't pickle (dumps) the entry object directly, so we
                    # need to extract the required info.
                    return_dict = compile_pickle_data(entry, work_done)
                    os.write(w,cPickle.dumps(return_dict))
                except Exception, ex:
                    tb = traceback.format_exception(sys.exc_info()[0],
                                                    sys.exc_info()[1],
                                                    sys.exc_info()[2])
                    entry.log.exception("Error in check_and_perform_work for entry '%s' " % entry.name)

                os.close(w)
                # Hard kill myself. Don't want any cleanup, since I was created
                # just for doing check and perform work for each entry
            finally: