Example #1
def iterate(parent_pid, sleep_time, advertize_rate, glideinDescript,
            frontendDescript, group_name, my_entries):
    """
    Iterate over the set of tasks until it's time to quit or die. The main
    "worker" function for the Factory Entry Group.
    function for the Factory Entry Group.
    @todo: More description to come

    @type parent_pid: int
    @param parent_pid: the pid for the Factory daemon

    @type sleep_time: int
    @param sleep_time: The number of seconds to sleep between iterations

    @type advertize_rate: int
    @param advertize_rate: The rate at which advertising should occur

    @type glideinDescript: glideFactoryConfig.GlideinDescript
    @param glideinDescript: glidein.descript object in the Factory root dir

    @type frontendDescript: glideFactoryConfig.FrontendDescript
    @param frontendDescript: frontend.descript object in the Factory root dir

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects keyed on entry name
    """

    is_first = True  # In first iteration
    count = 0

    # Record the starttime so we know when to disable the use of old pub key
    starttime = time.time()

    # The grace period is read from the factory config. Use it to determine
    # the end of lifetime for the old key object.
    oldkey_gracetime = int(glideinDescript.data['OldPubKeyGraceTime'])
    oldkey_eoltime = starttime + oldkey_gracetime

    factory_downtimes = glideFactoryDowntimeLib.DowntimeFile(
        glideinDescript.data['DowntimesFile'])

    while True:

        # Check if parent is still active. If not cleanup and die.
        check_parent(parent_pid, glideinDescript, my_entries)

        cleanupSupport.cleaners.start_background_cleanup()

        # Check if it's time to invalidate the factory's old key
        if ((time.time() > oldkey_eoltime)
                and (glideinDescript.data['OldPubKeyObj'] is not None)):
            # Invalidate the use of factory's old key
            logSupport.log.info("Retiring use of old key.")
            logSupport.log.info(
                "Old key was valid from %s to %s ie grace of ~%s sec" %
                (starttime, oldkey_eoltime, oldkey_gracetime))
            glideinDescript.data['OldPubKeyType'] = None
            glideinDescript.data['OldPubKeyObj'] = None

        # Check if the factory is in downtime. Group is in downtime only if the
        # factory is in downtime. Entry specific downtime is handled in entry
        factory_in_downtime = factory_downtimes.checkDowntime(entry="factory")

        # Record the iteration start time
        iteration_stime = time.time()
        iteration_stime_str = time.ctime()

        if factory_in_downtime:
            logSupport.log.info("Iteration at (in downtime) %s" %
                                iteration_stime_str)
        else:
            logSupport.log.info("Iteration at %s" % iteration_stime_str)

        # PM: Shouldn't this be inside the else statement above?
        # Why do we want to execute this if we are in downtime?
        # Or do we want to execute only few steps here but code prevents us?
        try:
            done_something = iterate_one(count == 0, factory_in_downtime,
                                         glideinDescript, frontendDescript,
                                         group_name, my_entries)

            logSupport.log.info("Writing stats for all entries")

            try:
                pids = []
                # generate a list of entries for each CPU
                cpuCount = int(
                    glideinDescript.data['MonitorUpdateThreadCount'])
                logSupport.log.info("Number of parallel writes for stats: %i" %
                                    cpuCount)

                entrylists = [
                    my_entries.values()[cpu::cpuCount]
                    for cpu in xrange(cpuCount)
                ]

                # Forks keyed by cpu number. The actual key is irrelevant
                pipe_ids = {}

                post_writestats_info = {}

                for cpu in xrange(cpuCount):
                    r, w = os.pipe()
                    unregister_sighandler()
                    pid = os.fork()
                    if pid:
                        # I am the parent
                        register_sighandler()
                        pids.append(pid)
                        os.close(w)
                        pipe_ids[cpu] = {'r': r, 'pid': pid}
                    else:
                        # I am the child
                        os.close(r)
                        logSupport.disable_rotate = True
                        # Return the pickled entry state in the form of a dict:
                        # return_dict[entry.name] = entry.getState()
                        return_dict = {}
                        for entry in entrylists[cpu]:
                            try:
                                entry.writeStats()
                                return_dict[entry.name] = entry.getState()
                            except:
                                entry.log.warning(
                                    "Error writing stats for entry '%s'" %
                                    (entry.name))
                                entry.log.exception(
                                    "Error writing stats for entry '%s': " %
                                    (entry.name))

                        try:
                            os.write(w, cPickle.dumps(return_dict))
                        except:
                            # Catch and log exceptions if any to avoid
                            # runaway processes.
                            entry.log.exception(
                                "Error writing pickled state for entry '%s': "
                                % (entry.name))
                        os.close(w)
                        # Exit without triggering SystemExit exception
                        os._exit(0)

                try:
                    logSupport.log.info(
                        "Processing response from children after write stats")
                    post_writestats_info = fetch_fork_result_list(pipe_ids)
                except:
                    logSupport.log.exception(
                        "Error processing response from one or more children after write stats"
                    )

                logSupport.roll_all_logs()

                for i in post_writestats_info:
                    for ent in post_writestats_info[i]:
                        (my_entries[ent]).setState(
                            post_writestats_info[i][ent])
            except KeyboardInterrupt:
                raise  # this is an exit signal, pass through
            except:
                # never fail for stats reasons!
                logSupport.log.exception("Error writing stats: ")
        except KeyboardInterrupt:
            raise  # this is an exit signal, pass through
        except:
            if is_first:
                raise
            else:
                # If not the first pass, just warn
                logSupport.log.exception(
                    "Exception occurred in the main loop of Factory Group %s: "
                    % group_name)

        cleanupSupport.cleaners.wait_for_cleanup()

        iteration_etime = time.time()
        iteration_sleep_time = sleep_time - (iteration_etime - iteration_stime)
        if (iteration_sleep_time < 0):
            iteration_sleep_time = 0
        logSupport.log.info("Sleep %is" % iteration_sleep_time)
        time.sleep(iteration_sleep_time)

        count = (count + 1) % advertize_rate
        is_first = False  # Entering following iterations
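
The stats-writing step above is a fork/pipe/pickle fan-out: the entries are striped across cpuCount child processes, each child pickles its results into a pipe, and the parent collects and merges everything (via fetch_fork_result_list in the listing). Below is a minimal, self-contained sketch of that pattern in the same Python 2 style as the listing; fan_out and the work callable are hypothetical names, not part of GlideinWMS:

import os
import cPickle

def fan_out(items, n_workers, work):
    """Fork n_workers children; each processes a stride of items and
    pickles a result dict back through a pipe. Returns the merged dict."""
    pipes = {}
    for idx in xrange(n_workers):
        r, w = os.pipe()
        pid = os.fork()
        if pid:
            # Parent: close the write end and remember the child
            os.close(w)
            pipes[idx] = (r, pid)
        else:
            # Child: process my stride, pickle the results, exit hard
            os.close(r)
            results = dict((i, work(i)) for i in items[idx::n_workers])
            os.write(w, cPickle.dumps(results))
            os.close(w)
            os._exit(0)  # exit without triggering SystemExit handling

    merged = {}
    for idx in pipes:
        r, pid = pipes[idx]
        chunks = []
        while True:
            data = os.read(r, 65536)
            if not data:
                break  # EOF: child closed its write end
            chunks.append(data)
        os.close(r)
        os.waitpid(pid, 0)  # reap the child to avoid zombies
        merged.update(cPickle.loads(''.join(chunks)))
    return merged

if __name__ == '__main__':
    # Five items striped across two workers
    print fan_out(['e1', 'e2', 'e3', 'e4', 'e5'], 2, len)

The real code adds signal-handler juggling around fork and never fails the iteration on stats errors; this sketch keeps only the process/pipe mechanics.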
Example #2
def iterate(parent_pid, sleep_time, advertize_rate, glideinDescript,
            frontendDescript, group_name, my_entries):
    """
    Iterate over the set of tasks until it's time to quit or die. The main
    "worker" function for the Factory Entry Group.
    function for the Factory Entry Group.
    @todo: More description to come

    @type parent_pid: int
    @param parent_pid: the pid for the Factory daemon

    @type sleep_time: int
    @param sleep_time: The number of seconds to sleep between iterations

    @type advertize_rate: int
    @param advertize_rate: The rate at which advertising should occur

    @type glideinDescript: glideFactoryConfig.GlideinDescript
    @param glideinDescript: glidein.descript object in the Factory root dir

    @type frontendDescript: glideFactoryConfig.FrontendDescript
    @param frontendDescript: frontend.descript object in the Factory root dir

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects keyed on entry name
    """

    is_first = 1
    count = 0

    # Record the starttime so we know when to disable the use of old pub key
    starttime = time.time()

    # The grace period is read from the factory config. Use it to determine
    # the end of lifetime for the old key object.
    oldkey_gracetime = int(glideinDescript.data['OldPubKeyGraceTime'])
    oldkey_eoltime = starttime + oldkey_gracetime

    factory_downtimes = glideFactoryDowntimeLib.DowntimeFile(glideinDescript.data['DowntimesFile'])

    while 1:

        # Check if parent is still active. If not cleanup and die.
        check_parent(parent_pid, glideinDescript, my_entries)

        cleanupSupport.cleaners.start_background_cleanup()

        # Check if it's time to invalidate the factory's old key
        if ( (time.time() > oldkey_eoltime) and
             (glideinDescript.data['OldPubKeyObj'] is not None) ):
            # Invalidate the use of factory's old key
            logSupport.log.info("Retiring use of old key.")
            logSupport.log.info("Old key was valid from %s to %s ie grace of ~%s sec" % (starttime,oldkey_eoltime,oldkey_gracetime))
            glideinDescript.data['OldPubKeyType'] = None
            glideinDescript.data['OldPubKeyObj'] = None

        # Check if the factory is in downtime. Group is in downtime only if the
        # factory is in downtime. Entry specific downtime is handled in entry
        factory_in_downtime = factory_downtimes.checkDowntime(entry="factory")

        # Record the iteration start time
        iteration_stime = time.time()
        iteration_stime_str = time.ctime()

        if factory_in_downtime:
            logSupport.log.info("Iteration at (in downtime) %s" % iteration_stime_str)
        else:
            logSupport.log.info("Iteration at %s" % iteration_stime_str)

        # PM: Shouldn't this be inside the else statement above?
        # Why do we want to execute this if we are in downtime?
        # Or do we want to execute only few steps here but code prevents us?
        try:
            done_something = iterate_one(count==0, factory_in_downtime,
                                         glideinDescript, frontendDescript,
                                         group_name, my_entries)

            logSupport.log.info("Writing stats for all entries")

            try:
                pids = []
                # generate a list of entries for each CPU
                cpuCount = int(glideinDescript.data['MonitorUpdateThreadCount'])
                logSupport.log.info("Number of parallel writes for stats: %i" % cpuCount)

                entrylists = [my_entries.values()[cpu::cpuCount] for cpu in xrange(cpuCount)]

                # Forks keyed by cpu number. The actual key is irrelevant
                pipe_ids = {}

                post_writestats_info = {}

                for cpu in xrange(cpuCount):
                    r,w = os.pipe()
                    unregister_sighandler()
                    pid = os.fork()
                    if pid:
                        # I am the parent
                        register_sighandler()
                        pids.append(pid)
                        os.close(w)
                        pipe_ids[cpu] = {'r': r, 'pid': pid}
                    else:
                        # I am the child
                        os.close(r)
                        logSupport.disable_rotate = True
                        # Return the pickled entry state in the form of a dict:
                        # return_dict[entry.name] = entry.getState()
                        return_dict = {}
                        for entry in entrylists[cpu]:
                            try:
                                entry.writeStats()
                                return_dict[entry.name] = entry.getState()
                            except:
                                entry.log.warning("Error writing stats for entry '%s'" % (entry.name))
                                entry.log.exception("Error writing stats for entry '%s': " % (entry.name))

                        try:
                            os.write(w, cPickle.dumps(return_dict))
                        except:
                            # Catch and log exceptions if any to avoid
                            # runaway processes.
                            entry.log.exception("Error writing pickled state for entry '%s': " % (entry.name))
                        os.close(w)
                        # Exit without triggering SystemExit exception
                        os._exit(0)

                try:
                    logSupport.log.info("Processing response from children after write stats")
                    post_writestats_info = fetch_fork_result_list(pipe_ids)
                except:
                    logSupport.log.exception("Error processing response from one or more children after write stats")

                logSupport.roll_all_logs()

                for i in post_writestats_info:
                    for ent in post_writestats_info[i]:
                        (my_entries[ent]).setState(post_writestats_info[i][ent])
            except KeyboardInterrupt:
                raise # this is an exit signal, pass through
            except:
                # never fail for stats reasons!
                logSupport.log.exception("Error writing stats: ")
        except KeyboardInterrupt:
            raise # this is an exit signal, pass through
        except:
            if is_first:
                raise
            else:
                # if not the first pass, just warn
                logSupport.log.exception("Exception occurred: ")

        cleanupSupport.cleaners.wait_for_cleanup()

        iteration_etime = time.time()
        iteration_sleep_time = sleep_time - (iteration_etime - iteration_stime)
        if (iteration_sleep_time < 0):
            iteration_sleep_time = 0
        logSupport.log.info("Sleep %is" % iteration_sleep_time)
        time.sleep(iteration_sleep_time)

        count = (count+1) % advertize_rate
        is_first = 0
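
Both versions of iterate partition the entries across workers with the extended-slice stride my_entries.values()[cpu::cpuCount] (Python 2, where dict.values() returns a list). A quick illustration of how lst[cpu::cpuCount] stripes items across workers, with hypothetical entry names:

entries = ['e0', 'e1', 'e2', 'e3', 'e4', 'e5', 'e6']
cpuCount = 3
for cpu in xrange(cpuCount):
    print cpu, entries[cpu::cpuCount]
# Prints:
# 0 ['e0', 'e3', 'e6']
# 1 ['e1', 'e4']
# 2 ['e2', 'e5']
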
def find_and_perform_work(factory_in_downtime, glideinDescript,
                          frontendDescript, group_name, my_entries):
    """
    For all entries in this group, find work requests from the WMS collector,
    validate credentials, and request glideins. If an entry is in downtime,
    the number of requested glideins is zero.

    @type factory_in_downtime: boolean
    @param factory_in_downtime: True if factory is in downtime

    @type glideinDescript: dict
    @param glideinDescript: Factory glidein config values

    @type frontendDescript: dict
    @param frontendDescript: Security mappings for frontend identities, security classes, and usernames for privsep

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects keyed on entry name

    @return: Dictionary of work to do keyed on entry name
    @rtype: dict
    """

    # Work done by group keyed by entry name. This will be returned back
    groupwork_done = {}

    # Step 1:
    # Find work to perform. Work is a dict work[entry_name][frontend]
    # We may or may not be able to perform all the work but that will be
    # checked later per entry

    work = find_work(factory_in_downtime, glideinDescript,
                     frontendDescript, group_name, my_entries)

    # TODO: If we return here, check if we need to do cleanup of held glideins?
    #       So far only de-advertising is confirmed to trigger, not cleanup
    work_count = get_work_count(work)
    if (work_count == 0):
        logSupport.log.info("No work found")
        return groupwork_done

    logSupport.log.info("Found %s total tasks to work on" % work_count)

    # Got the work items grouped by entries
    # Now fork a process per entry and wait for certain duration to get back
    # the results. Kill processes if they take too long to get back with result

    # ids keyed by entry name
    pipe_ids = {}

    # Max number of children to fork at a time
    # Each child currently takes ~50 MB
    # Leaving 3 GB for the system, the max number of children to fork is
    # (Memory_MB - 3000)/50: 100 with 8 GB RAM, 260 with 16 GB RAM
    parallel_workers = 0
    try:
        parallel_workers = int(glideinDescript.data['EntryParallelWorkers'])
    except KeyError:
        logSupport.log.debug("EntryParallelWorkers not set -- factory probably needs a reconfig; setting to 0 for dynamic limits.")

    post_work_info = {}
    work_info_read_err = False

    if parallel_workers <= 0:
        logSupport.log.debug("Setting parallel_workers limit dynamically based on the available free memory")
        free_mem = os.sysconf('SC_AVPHYS_PAGES')*os.sysconf('SC_PAGE_SIZE')
        parallel_workers = int(free_mem / float(ENTRY_MEM_REQ_BYTES))
        if parallel_workers < 1:
            parallel_workers = 1
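        # Worked example (hypothetical numbers): with ~4 GB of free memory
        # and ENTRY_MEM_REQ_BYTES assumed to be 50 MB per entry process,
        # parallel_workers = int(4 * 1024**3 / float(50 * 1024**2)) = 81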

    logSupport.log.debug("Setting parallel_workers limit of %s" % parallel_workers)
    forks_remaining = parallel_workers

    # Only fork off child processes for entries that have corresponding
    # work to do, i.e. glideclient classads.
    for ent in work:
        # Check if we can fork more
        while (forks_remaining == 0):
            # Give some time for the processes to finish the work
            # logSupport.log.debug("Reached parallel_workers limit of %s" % parallel_workers)
            time.sleep(1)

            post_work_info_subset = {}
            # Wait and gather results for work done so far before forking more
            try:
                # post_work_info_subset = fetch_fork_result_list(pipe_ids)
                # logSupport.log.debug("Checking finished workers")
                post_work_info_subset = process_finished_children(pipe_ids)
                post_work_info.update(post_work_info_subset)
            except RuntimeError:
                # Expect all errors logged already
                work_info_read_err = True

            forks_remaining += len(post_work_info_subset)

            for en in post_work_info_subset:
                del pipe_ids[en]

        entry = my_entries[ent]
        r,w = os.pipe()
        unregister_sighandler()
        pid = os.fork()
        forks_remaining -= 1

        if pid != 0:
            register_sighandler()
            # This is the parent process
            os.close(w)
            pipe_ids[entry.name] = {'r': r, 'pid': pid}
        else:
            # This is the child process
            try:
                logSupport.disable_rotate = True
                os.close(r)

                try:
                    work_done = glideFactoryEntry.check_and_perform_work(
                                    factory_in_downtime, entry, work[entry.name])
                    # entry object now has updated info in the child process
                    # This info is required for monitoring and advertising
                    # Compile the return info from the updated entry object.
                    # Can't pickle the entry object directly, so we need to
                    # extract the required info.
                    return_dict = compile_pickle_data(entry, work_done)
                    os.write(w,cPickle.dumps(return_dict))
                except Exception, ex:
                    tb = traceback.format_exception(sys.exc_info()[0],
                                                    sys.exc_info()[1],
                                                    sys.exc_info()[2])
                    entry.log.exception("Error in check_and_perform_work for entry '%s' " % entry.name)

                os.close(w)
                # Hard kill myself. Don't want any cleanup, since I was created
                # just for doing check and perform work for each entry
            finally: