Example #1
def get_downtime_fd(entry_name, cmdname):
    try:
        # New style has all the config in the factory file
        config = glideFactoryConfig.GlideinDescript()
    except IOError:
        raise RuntimeError("Failed to load config for %s" % entry_name)

    fd = glideFactoryDowntimeLib.DowntimeFile(config.data['DowntimesFile'])
    return fd
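
A usage sketch under assumptions: the glidein.descript config supplies the DowntimesFile path, and checkDowntime(entry=...) is the query method, as in the examples below; the argument values here are purely illustrative.

# Hypothetical usage -- "factory" / "check" are illustrative arguments only.
try:
    downtime_fd = get_downtime_fd("factory", "check")
    if downtime_fd.checkDowntime(entry="factory"):
        print("Factory is currently in downtime")
except RuntimeError as e:
    print("Could not read downtime info: %s" % e)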
Example #2
def iterate(parent_pid, sleep_time, advertize_rate, glideinDescript,
            frontendDescript, group_name, my_entries):
    """
    Iterate over set of tasks until its time to quit or die. The main "worker"
    function for the Factory Entry Group.
    @todo: More description to come

    @type parent_pid: int
    @param parent_pid: the pid for the Factory daemon

    @type sleep_time: int
    @param sleep_time: The number of seconds to sleep between iterations

    @type advertize_rate: int
    @param advertize_rate: The rate at which advertising should occur

    @type glideinDescript: glideFactoryConfig.GlideinDescript
    @param glideinDescript: glidein.descript object in the Factory root dir

    @type frontendDescript: glideFactoryConfig.FrontendDescript
    @param frontendDescript: frontend.descript object in the Factory root dir

    @type group_name: string
    @param group_name: Name of the group

    @type my_entries: dict
    @param my_entries: Dictionary of entry objects keyed on entry name
    """

    is_first = True  # In first iteration
    count = 0

    # Record the start time so we know when to disable the use of the old pub key
    starttime = time.time()

    # The grace period is read from the factory config. Use it to determine
    # the end of life for the old key object.
    oldkey_gracetime = int(glideinDescript.data['OldPubKeyGraceTime'])
    oldkey_eoltime = starttime + oldkey_gracetime

    factory_downtimes = glideFactoryDowntimeLib.DowntimeFile(
        glideinDescript.data['DowntimesFile'])

    while True:

        # Check if the parent is still active. If not, clean up and die.
        check_parent(parent_pid, glideinDescript, my_entries)

        cleanupSupport.cleaners.start_background_cleanup()

        # Check if it's time to invalidate the factory's old key
        if ((time.time() > oldkey_eoltime)
                and (glideinDescript.data['OldPubKeyObj'] is not None)):
            # Invalidate the use of factory's old key
            logSupport.log.info("Retiring use of old key.")
            logSupport.log.info(
                "Old key was valid from %s to %s, i.e. a grace of ~%s sec" %
                (starttime, oldkey_eoltime, oldkey_gracetime))
            glideinDescript.data['OldPubKeyType'] = None
            glideinDescript.data['OldPubKeyObj'] = None

        # Check if the factory is in downtime. The group is in downtime only
        # if the factory is in downtime; entry-specific downtime is handled
        # in the entry itself.
        factory_in_downtime = factory_downtimes.checkDowntime(entry="factory")

        # Record the iteration start time
        iteration_stime = time.time()
        iteration_stime_str = time.ctime()

        if factory_in_downtime:
            logSupport.log.info("Iteration at (in downtime) %s" %
                                iteration_stime_str)
        else:
            logSupport.log.info("Iteration at %s" % iteration_stime_str)

        # PM: Shouldn't this be inside the else branch above?
        # Why do we want to execute this if we are in downtime?
        # Or do we want to execute only a few steps here, but the code
        # prevents us?
        try:
            done_something = iterate_one(count == 0, factory_in_downtime,
                                         glideinDescript, frontendDescript,
                                         group_name, my_entries)

            logSupport.log.info("Writing stats for all entries")

            try:
                pids = []
                # generate a list of entries for each CPU
                cpuCount = int(
                    glideinDescript.data['MonitorUpdateThreadCount'])
                logSupport.log.info("Number of parallel writes for stats: %i" %
                                    cpuCount)

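                # Stride slicing [cpu::cpuCount] deals the entries out
                # round-robin, one sub-list per parallel writer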
                entrylists = [
                    my_entries.values()[cpu::cpuCount]
                    for cpu in xrange(cpuCount)
                ]

                # Forks keyed by CPU number. The actual key is irrelevant
                pipe_ids = {}

                post_writestats_info = {}

                for cpu in xrange(cpuCount):
                    r, w = os.pipe()
                    unregister_sighandler()
                    pid = os.fork()
                    if pid:
                        # I am the parent
                        register_sighandler()
                        pids.append(pid)
                        os.close(w)
                        pipe_ids[cpu] = {'r': r, 'pid': pid}
                    else:
                        # I am the child
                        os.close(r)
                        logSupport.disable_rotate = True
                        # Return the pickled entry state in the form of a dict:
                        # return_dict[entry.name] = entry.getState()
                        return_dict = {}
                        for entry in entrylists[cpu]:
                            try:
                                entry.writeStats()
                                return_dict[entry.name] = entry.getState()
                            except:
                                entry.log.warning(
                                    "Error writing stats for entry '%s'" %
                                    (entry.name))
                                entry.log.exception(
                                    "Error writing stats for entry '%s': " %
                                    (entry.name))

                        try:
                            os.write(w, cPickle.dumps(return_dict))
                        except:
                            # Catch and log exceptions if any to avoid
                            # runaway processes.
                            entry.log.exception(
                                "Error writing pickled state for entry '%s': "
                                % (entry.name))
                        os.close(w)
                        # Exit without triggering SystemExit exception
                        os._exit(0)

                try:
                    logSupport.log.info(
                        "Processing response from children after write stats")
                    post_writestats_info = fetch_fork_result_list(pipe_ids)
                except:
                    logSupport.log.exception(
                        "Error processing response from one or more children after write stats"
                    )

                logSupport.roll_all_logs()

                for i in post_writestats_info:
                    for ent in post_writestats_info[i]:
                        (my_entries[ent]).setState(
                            post_writestats_info[i][ent])
            except KeyboardInterrupt:
                raise  # this is an exit signal, pass through
            except:
                # never fail for stats reasons!
                logSupport.log.exception("Error writing stats: ")
        except KeyboardInterrupt:
            raise  # this is an exit signal, pass through
        except:
            if is_first:
                raise
            else:
                # If not the first pass, just warn
                logSupport.log.exception(
                    "Exception occurred in the main loop of Factory Group %s: "
                    % group_name)

        cleanupSupport.cleaners.wait_for_cleanup()

        iteration_etime = time.time()
        iteration_sleep_time = sleep_time - (iteration_etime - iteration_stime)
        if iteration_sleep_time < 0:
            iteration_sleep_time = 0
        logSupport.log.info("Sleep %is" % iteration_sleep_time)
        time.sleep(iteration_sleep_time)

        count = (count + 1) % advertize_rate
        is_first = False  # No longer in the first iteration
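
The parallel stats writing above is a classic fork/pipe/pickle fan-out. A minimal self-contained sketch of the same pattern, assuming only the standard library (POSIX fork) and nothing from GlideinWMS:

import os
import pickle

def parallel_map_via_fork(func, chunks):
    # Fan each chunk out to a forked child; collect pickled results via pipes.
    pipes = []
    for chunk in chunks:
        r, w = os.pipe()
        pid = os.fork()
        if pid:
            # Parent: keep the read end, remember the child
            os.close(w)
            pipes.append((pid, r))
        else:
            # Child: compute, pickle, write, exit without cleanup handlers
            os.close(r)
            os.write(w, pickle.dumps([func(item) for item in chunk]))
            os.close(w)
            os._exit(0)

    results = []
    for pid, r in pipes:
        data = b''
        while True:  # drain the pipe until EOF
            buf = os.read(r, 65536)
            if not buf:
                break
            data += buf
        os.close(r)
        os.waitpid(pid, 0)  # reap the child
        results.extend(pickle.loads(data))
    return results

print(parallel_map_via_fork(lambda x: x * x, [[1, 2], [3, 4]]))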
Example #3
def setUp(self):
    self.file_loc = "/tmp/downtimes.txt"
    self.downtime = glideFactoryDowntimeLib.DowntimeFile(self.file_loc)
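
A fuller sketch of how such a fixture might be exercised; checkDowntime(entry=...) is the query method seen in the other examples, while the import path, the tearDown cleanup, and the assumption that a missing downtimes file reports "no downtime" are unverified here.

import os
import unittest

import glideFactoryDowntimeLib  # import path is an assumption

class TestDowntimeFile(unittest.TestCase):
    def setUp(self):
        self.file_loc = "/tmp/downtimes.txt"
        self.downtime = glideFactoryDowntimeLib.DowntimeFile(self.file_loc)

    def tearDown(self):
        # Remove the scratch file so test runs stay independent
        if os.path.exists(self.file_loc):
            os.remove(self.file_loc)

    def test_no_downtime_by_default(self):
        # Assumes a fresh/missing downtimes file reports no downtime
        self.assertFalse(self.downtime.checkDowntime(entry="factory"))

if __name__ == "__main__":
    unittest.main()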
Example #4
def spawn(sleep_time, advertize_rate, startup_dir, glideinDescript,
          frontendDescript, entries, restart_attempts, restart_interval):
    """
    Spawn and keep track of the entry processes; restart them if required.
    Advertise the glidefactoryglobal classad every iteration.

    @type sleep_time: long
    @param sleep_time: Delay between every iteration
    @type advertize_rate: long
    @param advertize_rate: Rate at which entries advertise their classads
    @type startup_dir: String
    @param startup_dir: Path to glideinsubmit directory
    @type glideinDescript: glideFactoryConfig.GlideinDescript
    @param glideinDescript: Factory config's glidein description object
    @type frontendDescript: glideFactoryConfig.FrontendDescript
    @param frontendDescript: Factory config's frontend description object
    @type entries: list
    @param entries: Sorted list of entry names
    @type restart_interval: long
    @param restart_interval: Allowed restart interval in seconds
    @type restart_attempts: long
    @param restart_attempts: Number of allowed restart attempts in the interval
    """

    global STARTUP_DIR
    childs = {}

    # Number of glideFactoryEntry processes to spawn; directly related to
    # the number of concurrent condor_status processes
    #
    # NOTE: If the number of entries gets too big, we may exceed the shell
    #       args limit. If that becomes an issue, move the logic to identify
    #       the entries to serve into the group itself.
    #
    # Each process will handle multiple entries, split as follows
    #   - Sort the entries alphabetically. Already done
    #   - Divide the list into chunks as equal as possible
    #   - The last chunk may get fewer entries
    entry_process_count = 1

    starttime = time.time()
    oldkey_gracetime = int(glideinDescript.data['OldPubKeyGraceTime'])
    oldkey_eoltime = starttime + oldkey_gracetime

    childs_uptime = {}

    factory_downtimes = glideFactoryDowntimeLib.DowntimeFile(
        glideinDescript.data['DowntimesFile'])

    logSupport.log.info("Available Entries: %s" % entries)

    group_size = long(math.ceil(float(len(entries))/entry_process_count))
    entry_groups = entry_grouper(group_size, entries)

    def _set_rlimit(soft_l=None, hard_l=None):
        # Set new hard and soft open-file limits. If setting the limits
        # fails, or no parameters are given, keep the limits inherited
        # from the parent process.
        # NB 1: limits can be raised up to [hard_l, hard_l], but once
        #       lowered they cannot be raised again
        # NB 2: it may be better to omit calling this function from the
        #       subprocess entirely, in which case it inherits the limits
        #       of the parent process
        lim = resource.getrlimit(resource.RLIMIT_NOFILE)
        if soft_l is not None or hard_l is not None:
            if not hard_l:
                hard_l = soft_l
            if not soft_l:
                soft_l = hard_l
            try:
                new_lim = [soft_l, hard_l]
                resource.setrlimit(resource.RLIMIT_NOFILE, new_lim)
            except:
                resource.setrlimit(resource.RLIMIT_NOFILE, lim)

    try:
        for group in range(len(entry_groups)):
            entry_names = string.join(entry_groups[group], ':')
            logSupport.log.info("Starting EntryGroup %s: %s" % \
                (group, entry_groups[group]))

            # Converted to using the subprocess module
            command_list = [sys.executable,
                            os.path.join(STARTUP_DIR,
                                         "glideFactoryEntryGroup.py"),
                            str(os.getpid()),
                            str(sleep_time),
                            str(advertize_rate),
                            startup_dir,
                            entry_names,
                            str(group)]
            childs[group] = subprocess.Popen(command_list, shell=False,
                                             stdout=subprocess.PIPE,
                                             stderr=subprocess.PIPE,
                                             close_fds=True,
                                             preexec_fn=_set_rlimit)

            # Get the startup time. Used to check if the entry is crashing
            # periodically and needs to be restarted.
            childs_uptime[group] = list()
            childs_uptime[group].insert(0, time.time())

        logSupport.log.info("EntryGroup startup times: %s" % childs_uptime)

        for group in childs:
            # Set the pipes to non-blocking mode: we will run for a long
            # time and do not want reads of child output to block
            for fd in (childs[group].stdout.fileno(),
                       childs[group].stderr.fileno()):
                fl = fcntl.fcntl(fd, fcntl.F_GETFL)
                fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
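            # With O_NONBLOCK set, reading these pipes when no data is
            # available raises IOError (EAGAIN); the read loops below
            # catch and ignore it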

        # If RemoveOldCredFreq is not positive, skip credential cleanup.
        curr_time = 0  # Ensure curr_time is always initialized
        if int(glideinDescript.data['RemoveOldCredFreq']) > 0:
            # Convert credential removal frequency from hours to seconds
            remove_old_cred_freq = int(glideinDescript.data['RemoveOldCredFreq']) * 60 * 60
            curr_time = time.time()
            update_time = curr_time + remove_old_cred_freq

            # Convert credential removal age from days to seconds
            remove_old_cred_age = int(glideinDescript.data['RemoveOldCredAge']) * 60 * 60 * 24

            # Create cleaners for old credential files
            logSupport.log.info("Adding cleaners for old credentials")
            cred_base_dir = glideinDescript.data['ClientProxiesBaseDir']
            for username in frontendDescript.get_all_usernames():
                cred_base_user = os.path.join(cred_base_dir, "user_%s" % username)
                cred_user_instance_dirname = os.path.join(cred_base_user, "glidein_%s" % glideinDescript.data['GlideinName'])
                cred_cleaner = cleanupSupport.DirCleanupCredentials(
                    cred_user_instance_dirname,
                    "(credential_*)", remove_old_cred_age)
                cleanupSupport.cred_cleaners.add_cleaner(cred_cleaner)

        iteration_basetime = time.time()
        while True:
            # Once an hour, refresh WebMonitoringURL info from the glideclient classads
            iteration_timecheck = time.time()
            iteration_timediff = iteration_timecheck - iteration_basetime

            if iteration_timediff >= 3600:  # every hour
                iteration_basetime = time.time()  # reset the start time
                fronmonpath = os.path.join(startup_dir, "monitor", "frontendmonitorlink.txt")
                fronmonconstraint = '(MyType=="glideclient")'
                fronmonformat_list = [('WebMonitoringURL', 's'), ('FrontendName', 's')]
                fronmonstatus = condorMonitor.CondorStatus(subsystem_name="any")
                fronmondata = fronmonstatus.fetch(constraint=fronmonconstraint, format_list=fronmonformat_list)
                fronmon_list_names = fronmondata.keys()
                if fronmon_list_names is not None:
                    urlset = set()
                    if os.path.exists(fronmonpath):
                        os.remove(fronmonpath)
                    for frontend_entry in fronmon_list_names:
                        fronmonelement = fronmondata[frontend_entry]
                        fronmonurl = fronmonelement['WebMonitoringURL'].encode('utf-8')
                        fronmonfrt = fronmonelement['FrontendName'].encode('utf-8')
                        if (fronmonfrt, fronmonurl) not in urlset:
                            urlset.add((fronmonfrt, fronmonurl))
                            # Append: opening with 'w' here would truncate the
                            # file each iteration, keeping only the last pair
                            with open(fronmonpath, 'a') as fronmonf:
                                fronmonf.write("%s, %s\n" % (fronmonfrt, fronmonurl))

            # Record the iteration start time
            iteration_stime = time.time()

            # THIS IS FOR SECURITY
            # Make sure you delete the old key when its grace period is up.
            # If a compromised key is left around and an attacker can somehow
            # trigger a FactoryEntry process crash, we do not want the entry
            # to pick up the old key again when the factory auto-restarts it.
            if time.time() > oldkey_eoltime and glideinDescript.data['OldPubKeyObj'] is not None:
                glideinDescript.data['OldPubKeyObj'] = None
                glideinDescript.data['OldPubKeyType'] = None
                try:
                    glideinDescript.remove_old_key()
                    logSupport.log.info("Removed the old public key after its grace time of %s seconds" % oldkey_gracetime)
                except:
                    # Do not crash if delete fails. Just log it.
                    logSupport.log.warning("Failed to remove the old public key after its grace time")

            # Only removing credentials in the v3+ protocol.
            # Affects the Corral Frontend, which only supports the v3+ protocol.
            # If the frequency is not positive, do not do cleanup.
            if int(glideinDescript.data['RemoveOldCredFreq']) > 0 and curr_time >= update_time:
                logSupport.log.info("Checking credentials for cleanup")

                # Query queue for glideins. Don't remove proxies in use.
                try:
                    in_use_creds = glideFactoryLib.getCondorQCredentialList()
                    cleanupSupport.cred_cleaners.cleanup(in_use_creds)
                except:
                    logSupport.log.exception("Unable to cleanup old credentials")

                update_time = curr_time + remove_old_cred_freq

            curr_time = time.time()

            logSupport.log.info("Checking for credentials %s" % entries)

            # Read in the frontend globals classad.
            # Do this first so that the credentials are immediately
            # available when the Entries start up
            classads = {}
            try:
                classads = glideFactoryCredentials.get_globals_classads()
            except Exception:
                logSupport.log.error("Error occurred retrieving globals classad -- is Condor running?")

            for classad_key in classads:
                classad = classads[classad_key]
                try:
                    glideFactoryCredentials.process_global(classad,
                                                           glideinDescript,
                                                           frontendDescript)
                except:
                    logSupport.log.exception("Error occurred processing the globals classads: ")


            logSupport.log.info("Checking EntryGroups %s" % childs.keys())
            for group in childs:
                entry_names = string.join(entry_groups[group], ':')
                child = childs[group]

                # empty stdout and stderr
                try:
                    tempOut = child.stdout.read()
                    if len(tempOut) != 0:
                        logSupport.log.warning("EntryGroup %s STDOUT: %s" % (group, tempOut))
                except IOError:
                    pass # ignore
                try:
                    tempErr = child.stderr.read()
                    if len(tempErr) != 0:
                        logSupport.log.warning("EntryGroup %s STDERR: %s" % (group, tempErr))
                except IOError:
                    pass  # ignore

                # Look for an exited child: poll() returns None while the
                # child is still running, and the exit code (possibly 0)
                # once it has exited
                if child.poll() is not None:
                    # the child exited
                    logSupport.log.warning("EntryGroup %s exited. Checking if it should be restarted." % (group))
                    tempOut = child.stdout.readlines()
                    tempErr = child.stderr.readlines()

                    if is_crashing_often(childs_uptime[group],
                                         restart_interval, restart_attempts):
                        del childs[group]
                        raise RuntimeError("EntryGroup '%s' has been crashing too often, quit the whole factory:\n%s\n%s" % (group, tempOut, tempErr))
                    else:
                        # Restart the entry setting its restart time
                        logSupport.log.warning("Restarting EntryGroup %s." % (group))
                        del childs[group]

                        command_list = [sys.executable,
                                        os.path.join(STARTUP_DIR,
                                                     "glideFactoryEntryGroup.py"),
                                        str(os.getpid()),
                                        str(sleep_time),
                                        str(advertize_rate),
                                        startup_dir,
                                        entry_names,
                                        str(group)]
                        childs[group] = subprocess.Popen(command_list,
                                                         shell=False,
                                                         stdout=subprocess.PIPE,
                                                         stderr=subprocess.PIPE,
                                                         close_fds=True,
                                                         preexec_fn=_set_rlimit)

                        if len(childs_uptime[group]) == restart_attempts:
                            childs_uptime[group].pop(0)
                        childs_uptime[group].append(time.time())
                        for fd in (childs[group].stdout.fileno(),
                                   childs[group].stderr.fileno()):
                            fl = fcntl.fcntl(fd, fcntl.F_GETFL)
                            fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
                        logSupport.log.warning("EntryGroup startup/restart times: %s" % (childs_uptime,))

            # Aggregate Monitoring data periodically
            logSupport.log.info("Aggregate monitoring data")
            stats = aggregate_stats(factory_downtimes.checkDowntime())
            save_stats(stats, os.path.join(startup_dir, glideFactoryConfig.factoryConfig.aggregated_stats_file))

            # Aggregate job data periodically
            if glideinDescript.data.get('AdvertisePilotAccounting', False) in ['True', '1']:   # data attributes are strings
                logSupport.log.info("Starting updating job classads")
                update_classads()
                logSupport.log.info("Finishing updating job classads")

            # Advertise the global classad with the factory keys and Factory statistics
            try:
                # KEL TODO need to add factory downtime?
                glideFactoryInterface.advertizeGlobal(
                    glideinDescript.data['FactoryName'],
                    glideinDescript.data['GlideinName'],
                    glideFactoryLib.factoryConfig.supported_signtypes,
                    glideinDescript.data['PubKeyObj']
                    )
            except Exception as e:
                logSupport.log.exception("Error advertising global classads: %s" % e)

            cleanupSupport.cleaners.cleanup()

            iteration_etime = time.time()
            iteration_sleep_time = sleep_time - (iteration_etime - iteration_stime)
            if iteration_sleep_time < 0:
                iteration_sleep_time = 0
            logSupport.log.info("Sleep %s secs" % iteration_sleep_time)
            time.sleep(iteration_sleep_time)

        # end while True:

    finally:
        # cleanup at exit
        logSupport.log.info("Received signal...exit")
        try:
            try:
                clean_exit(childs)
            except:
                # if anything goes wrong, hardkill the rest
                for group in childs:
                    logSupport.log.info("Hard killing EntryGroup %s" % group)
                    try:
                        os.kill(childs[group].pid, signal.SIGKILL)
                    except OSError:
                        pass # ignore dead clients
        finally:
            logSupport.log.info("Deadvertize myself")
            try:
                glideFactoryInterface.deadvertizeFactory(
                    glideinDescript.data['FactoryName'],
                    glideinDescript.data['GlideinName'])
            except:
                logSupport.log.exception("Factory deadvertize failed!")
            try:
                glideFactoryInterface.deadvertizeFactoryClientMonitoring(
                    glideinDescript.data['FactoryName'],
                    glideinDescript.data['GlideinName'])
            except:
                logSupport.log.exception("Factory Monitoring deadvertize failed!")
        logSupport.log.info("All EntryGroups should be terminated")