def shouldHibernate(frontendDescript, work_dir, ha, mode, groups):
    """
    Check if the frontend is running in HA mode.
    If run in master mode, never hibernate.
    If run in slave mode, hibernate if the master is active.

    @rtype: bool
    @return: True if we should hibernate else False
    """
    servicePerformance.startPerfMetricEvent('frontend', 'ha_check')
    if mode == 'slave':
        master_frontend_name = str(ha.get('ha_frontends')[0].get('frontend_name'))

        for group in groups:
            element = glideinFrontendElement(os.getpid(), work_dir, group, "run")
            # Set environment required to query factory collector
            set_frontend_htcondor_env(work_dir, frontendDescript, element)

            for factory_pool in element.factory_pools:
                try:
                    factory_pool_node = factory_pool[0]
                    master_classads = glideinFrontendInterface.findMasterFrontendClassads(
                        factory_pool_node, master_frontend_name)

                    if master_classads:
                        # Found some classads in one of the collectors
                        # Cleanup the env and return True
                        clean_htcondor_env()
                        servicePerformance.endPerfMetricEvent('frontend', 'ha_check')
                        return True
                except RuntimeError:
                    # Failed to talk
                    if not factory_pool_node:
                        factory_pool_node = ''
                    msg = "Failed to talk to the factory_pool %s to get the status of Master frontend %s" % (
                        factory_pool_node, master_frontend_name)
                    logSupport.log.warn(msg)
                    msg = "Exception talking to the factory_pool %s to get the status of Master frontend %s: " % (
                        factory_pool_node, master_frontend_name)
                    logSupport.log.exception(msg)

            # Cleanup the env
            clean_htcondor_env()

    # NOTE:
    # If we got this far with no errors then we could not find an
    # active master frontend. We should not hibernate as slave.
    # However, if there were errors checking with the factory pool
    # then the master frontend could be down, so it is safe to wake
    # up and start advertising.
    servicePerformance.endPerfMetricEvent('frontend', 'ha_check')
    return False
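# For reference, shouldHibernate() only reads the master frontend's name out of
# the HA settings. A minimal sketch of the dictionary shape the `ha` argument is
# expected to have, inferred from the ha.get('ha_frontends')[0].get('frontend_name')
# access above; the frontend name below is an illustrative assumption and this is
# not the full HA configuration.
ha_example = {
    'ha_frontends': [
        {'frontend_name': 'master-frontend.example.com'},  # assumed, illustrative value
    ],
}
master_frontend_name_example = str(ha_example.get('ha_frontends')[0].get('frontend_name'))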
def test_get_perf_metric(self):
    startPerfMetricEvent(name, event_name, event_begin)
    endPerfMetricEvent(name, event_name, event_end)
    self.assertEqual(event_end_repr, getPerfMetric(name).__repr__())
def test_get_perf_metric_event_lifetime(self):
    startPerfMetricEvent(name, event_name, event_begin)
    endPerfMetricEvent(name, event_name, event_end)
    self.assertEqual(1000, getPerfMetricEventLifetime(name, event_name))
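# For context, a minimal usage sketch of the servicePerformance timing API
# exercised by the tests above and by the frontend code below. The
# glideinwms.lib.servicePerformance import path is an assumption; the
# 'frontend'/'ha_check' metric names come from shouldHibernate() above. The
# tests pass explicit begin/end timestamps as a third argument, while the
# frontend code lets the calls default to the current time, as shown here.
import time

from glideinwms.lib import servicePerformance  # assumed import path


def _timed_ha_check_sketch():
    servicePerformance.startPerfMetricEvent('frontend', 'ha_check')   # record begin time
    time.sleep(0.1)                                                    # stand-in for the work being timed
    servicePerformance.endPerfMetricEvent('frontend', 'ha_check')     # record end time
    # elapsed time between the begin and end events (what the lifetime test asserts)
    return servicePerformance.getPerfMetricEventLifetime('frontend', 'ha_check')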
def spawn(sleep_time, advertize_rate, work_dir, frontendDescript, groups,
          max_parallel_workers, restart_interval, restart_attempts):

    num_groups = len(groups)

    # TODO: Get the ha_check_interval from the config
    ha = glideinFrontendLib.getHASettings(frontendDescript.data)
    ha_check_interval = glideinFrontendLib.getHACheckInterval(frontendDescript.data)
    mode = glideinFrontendLib.getHAMode(frontendDescript.data)
    master_frontend_name = ''
    if mode == 'slave':
        master_frontend_name = ha.get('ha_frontends')[0].get('frontend_name')

    active = (mode == 'master')
    hibernate = shouldHibernate(frontendDescript, work_dir, ha, mode, groups)

    logSupport.log.info('Frontend started with mode = %s' % mode)
    try:
        # Service will exit on signal only.
        # This infinite loop is for the slave to go back into hibernation
        # once the master becomes alive.
        # Master never loops infinitely here, but instead it does in
        # the inner loop while(mode=='master') ...
        while True:

            while hibernate:
                # If I am slave enter hibernation cycle while Master is alive
                logSupport.log.info('Master Frontend %s is online. Hibernating.' % master_frontend_name)
                time.sleep(ha_check_interval)
                hibernate = shouldHibernate(frontendDescript, work_dir, ha, mode, groups)

            # We broke out of hibernation cycle
            # Either Master has disappeared or I am the Master
            if mode == 'slave':
                logSupport.log.info("Master frontend %s is offline. Activating slave frontend." % master_frontend_name)
                active = True

            failure_dict = {}
            for group in groups:
                failure_dict[group] = FailureCounter(group, restart_interval)

            while ((mode == 'master') or ((mode == 'slave') and active)):
                servicePerformance.startPerfMetricEvent('frontend', 'iteration')
                start_time = time.time()
                timings = spawn_iteration(work_dir, frontendDescript, groups,
                                          max_parallel_workers, failure_dict,
                                          restart_attempts, "run")
                servicePerformance.endPerfMetricEvent('frontend', 'iteration')
                end_time = time.time()
                elapsed_time = servicePerformance.getPerfMetricEventLifetime('frontend', 'iteration')
                if elapsed_time < sleep_time:
                    real_sleep_time = sleep_time - elapsed_time
                    logSupport.log.info("Sleep %.1f sec" % real_sleep_time)
                    time.sleep(real_sleep_time)
                else:
                    logSupport.log.info("No sleeping this loop, took %.1f sec > %.1f sec" % (elapsed_time, sleep_time))

                # order the groups by walltime
                # longest walltime first
                timings.sort(lambda x, y: -cmp(x[1], y[1]))
                # recreate the groups list, with new ordering
                groups = [el[0] for el in timings]
                assert num_groups == len(groups), "Something went wrong, number of groups changed"

                if mode == 'slave':
                    # If we are slave, check if master is back and if so
                    # deadvertise my classads and hibernate
                    hibernate = shouldHibernate(frontendDescript, work_dir, ha, mode, groups)

                    if hibernate:
                        active = False
                        logSupport.log.info("Master frontend %s is back online" % master_frontend_name)
                        logSupport.log.info("Deadvertize my ads and enter hibernation cycle")
                        spawn_cleanup(work_dir, frontendDescript, groups,
                                      frontendDescript.data['FrontendName'], mode)
                    else:
                        logSupport.log.info("Master frontend %s is still offline" % master_frontend_name)

    finally:
        # We have been asked to terminate
        logSupport.log.info("Deadvertize my ads")
        spawn_cleanup(work_dir, frontendDescript, groups,
                      frontendDescript.data['FrontendName'], mode)
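# The walltime ordering inside spawn() above uses a Python 2 cmp-style sort.
# A minimal equivalent key-based sketch (valid on both Python 2 and 3), assuming
# `timings` is the list of (group_name, walltime) tuples returned by
# spawn_iteration():
def _order_groups_by_walltime_sketch(timings):
    # longest walltime first
    timings.sort(key=lambda entry: entry[1], reverse=True)
    # recreate the groups list, with the new ordering
    return [group_name for group_name, _walltime in timings]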
def spawn_iteration(work_dir, frontendDescript, groups, max_active,
                    failure_dict, max_failures, action):
    childs = {}

    for group_name in groups:
        childs[group_name] = {'state': 'queued'}

    active_groups = 0
    groups_tofinish = len(groups)

    max_num_failures = 0
    logSupport.log.info("Starting iteration")
    try:
        while groups_tofinish > 0:
            done_something = False
            # check if any group finished by now
            for group_name in groups:
                if childs[group_name]['state'] == 'spawned':
                    group_rc = poll_group_process(group_name, childs[group_name]['data'])
                    if not (group_rc is None):  # None means "still alive"
                        if group_rc == 0:
                            childs[group_name]['state'] = 'finished'
                        else:
                            childs[group_name]['state'] = 'failed'
                            failure_dict[group_name].add_failure()
                            num_failures = failure_dict[group_name].count_failures()
                            max_num_failures = max(max_num_failures, num_failures)
                            logSupport.log.warning("Group %s terminated with exit code %i (%i recent failure)" % (group_name, group_rc, num_failures))
                        childs[group_name]['end_time'] = time.time()
                        servicePerformance.endPerfMetricEvent('frontend', 'group_%s_iteration' % group_name)
                        active_groups -= 1
                        groups_tofinish -= 1
                        done_something = True

            # see if I can spawn more
            for group_name in groups:
                if active_groups < max_active:  # can spawn more
                    if childs[group_name]['state'] == 'queued':
                        childs[group_name]['data'] = spawn_group(work_dir, group_name, action)
                        childs[group_name]['state'] = 'spawned'
                        childs[group_name]['start_time'] = time.time()
                        servicePerformance.startPerfMetricEvent('frontend', 'group_%s_iteration' % group_name)
                        active_groups += 1
                        done_something = True
                else:
                    break

            if done_something:
                logSupport.log.info("Active groups = %i, Groups to finish = %i" % (active_groups, groups_tofinish))
            if groups_tofinish > 0:
                time.sleep(0.01)

        logSupport.log.info("All groups finished")

        logSupport.log.info("Aggregate monitoring data")
        # KEL - can we just call the monitor aggregator method directly? see above
        servicePerformance.startPerfMetricEvent('frontend', 'aggregate_stats')
        stats = aggregate_stats()
        servicePerformance.endPerfMetricEvent('frontend', 'aggregate_stats')
        #logSupport.log.debug(stats)

        # Create the glidefrontendmonitor classad
        fm_advertiser = glideinFrontendInterface.FrontendMonitorClassadAdvertiser(
            multi_support=glideinFrontendInterface.frontendConfig.advertise_use_multi)
        fm_classad = glideinFrontendInterface.FrontendMonitorClassad(
            frontendDescript.data['FrontendName'])
        fm_classad.setFrontendDetails(
            frontendDescript.data['FrontendName'], ','.join(groups),
            glideinFrontendLib.getHAMode(frontendDescript.data))
        try:
            idle_jobs = {
                'Total': stats['total']['Jobs']['Idle'],
                '600': stats['total']['Jobs']['OldIdle'],
                '3600': stats['total']['Jobs']['Idle_3600'],
            }
        except KeyError as err:
            idle_jobs = {'Total': 0, '600': 0, '3600': 0}
            logSupport.log.error("Error in RRD Database. Setting idle_jobs[%s] Failed. Reconfig the frontend with -fix_rrd to fix this error" % (err.message,))

        fm_classad.setIdleJobCount(idle_jobs)
        fm_classad.setPerfMetrics(servicePerformance.getPerfMetric('frontend'))
        # Gather performance stats from history file of each group
        for group_name in groups:
            gname = 'group_%s' % group_name
            try:
                history_obj = glideinFrontendConfig.HistoryFile(work_dir, group_name, True, dict)
                pfm = servicePerformance.getPerfMetric(gname)
                pfm.metric = history_obj['perf_metrics'].metric
                fm_classad.setPerfMetrics(servicePerformance.getPerfMetric(gname))
            except:
                pass  # Do not fail for non-critical actions

        fm_advertiser.addClassad(fm_classad.adParams['Name'], fm_classad)

        # Advertise glidefrontendmonitor classad to user pool
        logSupport.log.info("Advertising %i %s classad(s) to the user pool" % (len(fm_advertiser.classads), fm_advertiser.adType))
        try:
            set_frontend_htcondor_env(work_dir, frontendDescript)
            fm_advertiser.advertiseAllClassads()
            logSupport.log.info("Done advertising %s classad(s) to the user pool" % fm_advertiser.adType)
        except condorExe.ExeError:
            logSupport.log.error("Exception occurred trying to advertise %s classad(s) to the user pool" % fm_advertiser.adType)
        except:
            # Rethrow any other exception including stop signal
            raise
        finally:
            # Cleanup the env
            clean_htcondor_env()

        logSupport.log.info("Cleaning logs")
        cleanupSupport.cleaners.cleanup()

        if max_num_failures > max_failures:
            logSupport.log.info("Too many group failures, aborting")
            logSupport.log.debug("Failed %i times (limit %i), aborting" % (max_num_failures, max_failures))
            raise RuntimeError("Too many group failures, aborting")
    finally:
        # cleanup at exit
        # if anything goes wrong, hardkill the rest
        for group_name in childs:
            if childs[group_name]['state'] == 'spawned':
                logSupport.log.info("Hard killing group %s" % group_name)
                servicePerformance.endPerfMetricEvent('frontend', 'group_%s_iteration' % group_name)
                try:
                    os.kill(childs[group_name]['data'].pid, signal.SIGKILL)
                except OSError:
                    pass  # ignore failed kills of non-existent processes

    # at this point, all groups should have been run
    timings = []
    for group_name in groups:
        timings.append((group_name, childs[group_name]['end_time'] - childs[group_name]['start_time']))

    return timings
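# spawn_iteration() only relies on two FailureCounter methods, add_failure() and
# count_failures(), with the counter built in spawn() as
# FailureCounter(group, restart_interval). A minimal sketch of that interface,
# assuming count_failures() counts only the failures that fall inside the
# restart_interval window; the real FailureCounter class (not shown here) may differ.
import time


class FailureCounterSketch:
    def __init__(self, name, lifetime):
        self.name = name
        self.lifetime = lifetime      # window in seconds (restart_interval)
        self.failure_times = []

    def add_failure(self, when=None):
        # record the time of one failure
        if when is None:
            when = time.time()
        self.failure_times.append(when)

    def count_failures(self, when=None):
        # drop failures older than the window, then count what is left
        if when is None:
            when = time.time()
        self.failure_times = [t for t in self.failure_times if t >= when - self.lifetime]
        return len(self.failure_times)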