def __init__(self, config_file, is_daemon, do_replace, debug, debug_file, profile=''):
    """Initialise the scheduler daemon.

    All daemon options are forwarded to ``BaseSatellite.__init__`` with the
    fixed daemon type ``'scheduler'``.

    :param config_file: daemon configuration file, forwarded to BaseSatellite
    :param is_daemon: daemonize flag, forwarded to BaseSatellite
    :param do_replace: replace-running-daemon flag, forwarded to BaseSatellite
    :param debug: debug flag, forwarded to BaseSatellite
    :param debug_file: debug output file, forwarded to BaseSatellite
    :param profile: unused here; kept for signature compatibility
    """
    BaseSatellite.__init__(self, 'scheduler', config_file, is_daemon,
                           do_replace, debug, debug_file)

    # HTTP interface and the scheduling engine itself
    self.http_interface = SchedulerInterface(self)
    self.sched = Scheduler(self)

    self.must_run = True

    # Interface URIs are not known until the daemon is configured
    self.uri = None
    self.uri2 = None

    # Links towards the other satellites (currently only these three kinds)
    self.pollers = {}
    self.reactionners = {}
    self.brokers = {}
def __init__(self, config_file, is_daemon, do_replace, debug, debug_file,
             port=None, local_log=None, daemon_name=None):
    """Initialise the scheduler daemon.

    :param config_file: daemon configuration file, forwarded to BaseSatellite
    :param is_daemon: daemonize flag, forwarded to BaseSatellite
    :param do_replace: replace-running-daemon flag, forwarded to BaseSatellite
    :param debug: debug flag, forwarded to BaseSatellite
    :param debug_file: debug output file, forwarded to BaseSatellite
    :param port: listening port, forwarded to BaseSatellite
    :param local_log: local log file, forwarded to BaseSatellite
    :param daemon_name: daemon name; defaults to 'scheduler' when not provided
    """
    # Fall back on the generic name when no explicit daemon name was given
    self.daemon_name = daemon_name or 'scheduler'

    BaseSatellite.__init__(self, self.daemon_name, config_file, is_daemon,
                           do_replace, debug, debug_file, port, local_log)

    # HTTP interface and the scheduling engine itself
    self.http_interface = SchedulerInterface(self)
    self.sched = Scheduler(self)

    self.must_run = True

    # Interface URIs are not known until the daemon is configured
    self.uri = None
    self.uri2 = None

    # Statistics counters (mirrored from scheduler.py)
    self.nb_pulled_checks = 0
    self.nb_pulled_actions = 0
    # self.nb_checks_send = 0
    self.nb_pushed_checks = 0
    self.nb_pushed_actions = 0
    self.nb_broks_send = 0
    self.nb_pulled_broks = 0

    # Links towards the other satellites (currently only these three kinds)
    self.pollers = {}
    self.reactionners = {}
    self.brokers = {}
def __init__(self, **kwargs):
    """Scheduler daemon initialisation

    :param kwargs: command line arguments
    """
    # Default the daemon name when the command line did not provide one
    super(Alignak, self).__init__(kwargs.get('daemon_name', 'Default-scheduler'), **kwargs)

    # HTTP interface and the scheduling engine itself
    self.http_interface = SchedulerInterface(self)
    self.sched = Scheduler(self)

    # Statistics counters (mirrored from scheduler.py)
    self.nb_pulled_checks = 0
    self.nb_pulled_actions = 0
    # self.nb_checks_send = 0
    self.nb_pushed_checks = 0
    self.nb_pushed_actions = 0
    self.nb_pulled_broks = 0

    # Links towards the other satellites
    self.brokers = {}
    self.pollers = {}
    self.reactionners = {}
    self.receivers = {}

    # This because it is the Satellite that has these properties and I am a Satellite
    # todo: change this?
    # Broks are stored in each broker link, not locally
    # self.broks = []
    self.broks_lock = threading.RLock()

    # Modules are loaded only once
    self.have_modules = False

    # The very first scheduling still has to happen
    self.first_scheduling = False
class Alignak(BaseSatellite):
    # pylint: disable=too-many-instance-attributes
    """Scheduler daemon class. Referenced as "app" in most Interface.

    This daemon hosts a :class:`Scheduler` instance (``self.sched``) and manages
    its life cycle: receiving the configuration from the arbiter, connecting to
    the other satellites (pollers, reactionners, brokers, receivers) and running
    the scheduling loop.
    """

    properties = BaseSatellite.properties.copy()
    properties.update({
        'type': StringProp(default='scheduler'),
        'port': IntegerProp(default=7768)
    })

    def __init__(self, **kwargs):
        """Scheduler daemon initialisation

        :param kwargs: command line arguments
        """
        super(Alignak, self).__init__(kwargs.get('daemon_name', 'Default-scheduler'), **kwargs)

        self.http_interface = SchedulerInterface(self)
        self.sched = Scheduler(self)

        # stats part
        # --- copied from scheduler.py
        self.nb_pulled_checks = 0
        self.nb_pulled_actions = 0
        # self.nb_checks_send = 0

        self.nb_pushed_checks = 0
        self.nb_pushed_actions = 0

        self.nb_pulled_broks = 0
        # ---

        # And possible links for satellites
        self.brokers = {}
        self.pollers = {}
        self.reactionners = {}
        self.receivers = {}

        # This because it is the Satellite that has these properties and I am a Satellite
        # todo: change this?
        # Broks are stored in each broker link, not locally
        # self.broks = []
        self.broks_lock = threading.RLock()

        # Modules are only loaded one time
        self.have_modules = False

        self.first_scheduling = False

    def get_broks(self, broker_name):
        """Get the broks to be sent to a specific broker.

        Collects the broks of the named broker link that were already provided to
        the external modules, marks them as got and removes them from the link.

        :param broker_name: broker name to send broks
        :type broker_name: str
        :return: list of broks for this broker
        :rtype: list[alignak.brok.Brok]
        """
        logger.debug("Broker %s requests my broks list", broker_name)
        res = []
        if not broker_name:
            return res

        for broker_link in list(self.brokers.values()):
            if broker_name == broker_link.name:
                for brok in sorted(broker_link.broks, key=lambda x: x.creation_time):
                    # Only provide broks that were already sent to our external modules
                    if getattr(brok, 'sent_to_externals', False):
                        res.append(brok)
                        brok.got = True
                # Drop the broks that were just provided
                broker_link.broks = [b for b in broker_link.broks
                                     if not getattr(b, 'got', False)]
                logger.debug("Providing %d broks to %s", len(res), broker_name)
                break
        else:
            # for/else: no broker link matched the requested name
            logger.warning("Got a brok request from an unknown broker: %s", broker_name)
        return res

    def compensate_system_time_change(self, difference):  # pragma: no cover,
        # pylint: disable=too-many-branches
        # not with unit tests
        """Compensate a system time change of difference for all hosts/services/checks/notifs

        :param difference: difference in seconds
        :type difference: int
        :return: None
        """
        super(Alignak, self).compensate_system_time_change(difference)

        # We only need to change some value
        self.program_start = max(0, self.program_start + difference)

        if not hasattr(self.sched, "conf"):
            # Race condition where time change before getting conf
            return

        # Then we compensate all host/services
        for host in self.sched.hosts:
            host.compensate_system_time_change(difference)
        for serv in self.sched.services:
            serv.compensate_system_time_change(difference)

        # Now all checks and actions
        for chk in list(self.sched.checks.values()):
            # Already launched checks should not be touched
            if chk.status == u'scheduled' and chk.t_to_go is not None:
                t_to_go = chk.t_to_go
                ref = self.sched.find_item_by_id(chk.ref)
                new_t = max(0, t_to_go + difference)
                timeperiod = self.sched.timeperiods[ref.check_period]
                if timeperiod is not None:
                    # But it's not so simple, we must match the timeperiod
                    new_t = timeperiod.get_next_valid_time_from_t(new_t)
                # But maybe there is no more new value! Not good :(
                # Flag as error, with error output
                if new_t is None:
                    chk.state = u'waitconsume'
                    chk.exit_status = 2
                    chk.output = '(Error: there is no available check time after time change!)'
                    chk.check_time = time.time()
                    chk.execution_time = 0
                else:
                    chk.t_to_go = new_t
                    ref.next_chk = new_t

        # Now all checks and actions
        for act in list(self.sched.actions.values()):
            # Already launched checks should not be touched
            if act.status == u'scheduled':
                t_to_go = act.t_to_go

                # Event handlers do not have a ref
                ref_id = getattr(act, 'ref', None)
                new_t = max(0, t_to_go + difference)

                # Notification should be checked with notification_period
                if act.is_a == u'notification':
                    ref = self.sched.find_item_by_id(ref_id)
                    if ref.notification_period:
                        # But it's not so simple, we must match the timeperiod
                        notification_period = self.sched.timeperiods[ref.notification_period]
                        new_t = notification_period.get_next_valid_time_from_t(new_t)
                    # And it got a creation_time variable too
                    act.creation_time += difference

                # But maybe there is no more new value! Not good :(
                # Flag as error, with error output
                if new_t is None:
                    act.state = 'waitconsume'
                    act.exit_status = 2
                    act.output = '(Error: there is no available check time after time change!)'
                    act.check_time = time.time()
                    act.execution_time = 0
                else:
                    act.t_to_go = new_t

    def do_before_loop(self):
        """Stop the scheduling process before entering the daemon loop.

        :return: None
        """
        if self.sched:
            self.sched.stop_scheduling()

    def do_loop_turn(self):
        """Scheduler loop turn

        Simply run the Alignak scheduler loop

        This is called when a configuration got received by the scheduler daemon. As of it,
        check if the first scheduling has been done... and manage this.

        :return: None
        """
        if not self.first_scheduling:
            # Ok, now all is initialized, we can make the initial broks
            logger.info("First scheduling launched")
            _t0 = time.time()
            # Program start brok
            self.sched.initial_program_status()
            # First scheduling
            self.sched.schedule()
            statsmgr.timer('first_scheduling', time.time() - _t0)
            logger.info("First scheduling done")

            # Connect to our passive satellites if needed
            for satellite in [s for s in list(self.pollers.values()) if s.passive]:
                if not self.daemon_connection_init(satellite):
                    logger.error("Passive satellite connection failed: %s", satellite)

            for satellite in [s for s in list(self.reactionners.values()) if s.passive]:
                if not self.daemon_connection_init(satellite):
                    logger.error("Passive satellite connection failed: %s", satellite)

            # Ticks are for recurrent function calls like consume, del zombies etc
            self.sched.ticks = 0
            self.first_scheduling = True

        # Each loop turn, execute the daemon specific treatment...
        # only if the daemon has a configuration to manage
        if self.sched.pushed_conf:
            # If scheduling is not yet enabled, enable scheduling
            if not self.sched.must_schedule:
                self.sched.start_scheduling()
            self.sched.before_run()
            self.sched.run()
        else:
            logger.warning("#%d - No monitoring configuration to scheduler...",
                           self.loop_count)

    def get_managed_configurations(self):
        """Get the configurations managed by this scheduler

        The configuration managed by a scheduler is the self configuration got
        by the scheduler during the dispatching.

        :return: a dict of scheduler links with instance_id as key and
        hash, push_flavor and configuration identifier as values
        :rtype: dict
        """
        # for scheduler_link in list(self.schedulers.values()):
        #     res[scheduler_link.instance_id] = {
        #         'hash': scheduler_link.hash,
        #         'push_flavor': scheduler_link.push_flavor,
        #         'managed_conf_id': scheduler_link.managed_conf_id
        #     }
        res = {}
        if self.sched.pushed_conf and self.cur_conf and 'instance_id' in self.cur_conf:
            res[self.cur_conf['instance_id']] = {
                'hash': self.cur_conf['hash'],
                'push_flavor': self.cur_conf['push_flavor'],
                'managed_conf_id': self.cur_conf['managed_conf_id']
            }
        logger.debug("Get managed configuration: %s", res)
        return res

    def setup_new_conf(self):
        # pylint: disable=too-many-statements, too-many-branches, too-many-locals
        """Setup new conf received for scheduler

        :return: None
        """
        # Execute the base class treatment...
        super(Alignak, self).setup_new_conf()

        # ...then our own specific treatment!
        with self.conf_lock:
            # self_conf is our own configuration from the alignak environment
            # self_conf = self.cur_conf['self_conf']
            logger.debug("Got config: %s", self.cur_conf)
            if 'conf_part' not in self.cur_conf:
                self.cur_conf['conf_part'] = None
            conf_part = self.cur_conf['conf_part']

            # Ok now we can save the retention data
            if self.sched.pushed_conf is not None:
                self.sched.update_retention()

            # Get the monitored objects configuration
            t00 = time.time()
            received_conf_part = None
            try:
                received_conf_part = unserialize(conf_part)
                assert received_conf_part is not None
            except AssertionError as exp:
                # This to indicate that no configuration is managed by this scheduler...
                logger.warning("No managed configuration received from arbiter")
            except AlignakClassLookupException as exp:  # pragma: no cover
                # This to indicate that the new configuration is not managed...
                self.new_conf = {
                    "_status": "Cannot un-serialize configuration received from arbiter",
                    "_error": str(exp)
                }
                logger.error(self.new_conf)
                logger.error("Back trace of the error:\n%s", traceback.format_exc())
                return
            except Exception as exp:  # pylint: disable=broad-except
                # This to indicate that the new configuration is not managed...
                self.new_conf = {
                    "_status": "Cannot un-serialize configuration received from arbiter",
                    "_error": str(exp)
                }
                logger.error(self.new_conf)
                self.exit_on_exception(exp, str(self.new_conf))

            # if not received_conf_part:
            #     return

            logger.info("Monitored configuration %s received at %d. Un-serialized in %d secs",
                        received_conf_part, t00, time.time() - t00)
            logger.info("Scheduler received configuration : %s", received_conf_part)

            # Now we create our pollers, reactionners and brokers
            for link_type in ['pollers', 'reactionners', 'brokers']:
                if link_type not in self.cur_conf['satellites']:
                    logger.error("Missing %s in the configuration!", link_type)
                    continue

                my_satellites = getattr(self, link_type, {})
                received_satellites = self.cur_conf['satellites'][link_type]
                for link_uuid in received_satellites:
                    rs_conf = received_satellites[link_uuid]
                    logger.debug("- received %s - %s: %s", rs_conf['instance_id'],
                                 rs_conf['type'], rs_conf['name'])

                    # Must look if we already had a configuration and save our broks
                    already_got = rs_conf['instance_id'] in my_satellites
                    broks = []
                    actions = {}
                    wait_homerun = {}
                    external_commands = {}
                    running_id = 0
                    if already_got:
                        logger.warning("I already got: %s", rs_conf['instance_id'])
                        # Save some information
                        running_id = my_satellites[link_uuid].running_id
                        (broks, actions,
                         wait_homerun, external_commands) = \
                            my_satellites[link_uuid].get_and_clear_context()
                        # Delete the former link
                        del my_satellites[link_uuid]

                    # My new satellite link...
                    new_link = SatelliteLink.get_a_satellite_link(link_type[:-1], rs_conf)
                    my_satellites[new_link.uuid] = new_link
                    logger.info("I got a new %s satellite: %s", link_type[:-1], new_link)

                    new_link.running_id = running_id
                    new_link.external_commands = external_commands
                    new_link.broks = broks
                    new_link.wait_homerun = wait_homerun
                    new_link.actions = actions

                    # Replacing the satellite address and port by those defined in satellite_map
                    if new_link.name in self.cur_conf['override_conf'].get('satellite_map', {}):
                        override_conf = self.cur_conf['override_conf']
                        overriding = override_conf.get('satellite_map')[new_link.name]
                        logger.warning("Do not override the configuration for: %s, with: %s. "
                                       "Please check whether this is necessary!",
                                       new_link.name, overriding)

            # First mix conf and override_conf to have our definitive conf
            for prop in getattr(self.cur_conf, 'override_conf', []):
                logger.debug("Overriden: %s / %s ", prop,
                             getattr(received_conf_part, prop, None))
                logger.debug("Overriding: %s / %s ", prop, self.cur_conf['override_conf'])
                setattr(received_conf_part, prop, self.cur_conf['override_conf'].get(prop, None))

            # Scheduler modules
            if not self.have_modules:
                try:
                    logger.debug("Modules configuration: %s", self.cur_conf['modules'])
                    self.modules = unserialize(self.cur_conf['modules'], no_load=True)
                except AlignakClassLookupException as exp:  # pragma: no cover, simple protection
                    logger.error('Cannot un-serialize modules configuration '
                                 'received from arbiter: %s', exp)
                if self.modules:
                    logger.debug("I received some modules configuration: %s", self.modules)
                    self.have_modules = True

                    self.do_load_modules(self.modules)
                    # and start external modules too
                    self.modules_manager.start_external_instances()
                else:
                    logger.info("I do not have modules")

            if received_conf_part:
                logger.info("Loading configuration...")

                # Propagate the global parameters to the configuration items
                received_conf_part.explode_global_conf()

                # We give the configuration to our scheduler
                self.sched.reset()
                self.sched.load_conf(self.cur_conf['instance_id'],
                                     self.cur_conf['instance_name'],
                                     received_conf_part)

                # Once loaded, the scheduler has an inner pushed_conf object
                logger.info("Loaded: %s", self.sched.pushed_conf)

                # Update the scheduler ticks according to the daemon configuration
                self.sched.update_recurrent_works_tick(self)

                # We must update our pushed configuration macros with correct values
                # from the configuration parameters
                # self.sched.pushed_conf.fill_resource_macros_names_macros()

                # Creating the Macroresolver Class & unique instance
                m_solver = MacroResolver()
                m_solver.init(received_conf_part)

                # Now create the external commands manager
                # We are an applyer: our role is not to dispatch commands, but to apply them
                ecm = ExternalCommandManager(
                    received_conf_part, 'applyer', self.sched,
                    received_conf_part.accept_passive_unknown_check_results,
                    received_conf_part.log_external_commands)

                # Scheduler needs to know about this external command manager to use it
                # if necessary
                self.sched.external_commands_manager = ecm

                # Ok now we can load the retention data
                self.sched.retention_load()

                # Log hosts/services initial states
                self.sched.log_initial_states()

            # Create brok new conf
            brok = Brok({'type': 'new_conf', 'data': {}})
            self.sched.add_brok(brok)

            # Initialize connection with all our satellites
            logger.info("Initializing connection with my satellites:")
            my_satellites = self.get_links_of_type(s_type='')
            for satellite in list(my_satellites.values()):
                logger.info("- : %s/%s", satellite.type, satellite.name)
                if not self.daemon_connection_init(satellite):
                    logger.error("Satellite connection failed: %s", satellite)

        if received_conf_part:
            # Enable the scheduling process
            logger.info("Loaded: %s", self.sched.pushed_conf)
            self.sched.start_scheduling()

        # Now I have a configuration!
        self.have_conf = True

    def clean_previous_run(self):
        """Clean variables from previous configuration

        :return: None
        """
        # Execute the base class treatment...
        super(Alignak, self).clean_previous_run()

        # Clean all lists
        self.pollers.clear()
        self.reactionners.clear()
        self.brokers.clear()

    def get_daemon_stats(self, details=False):
        """Increase the stats provided by the Daemon base class

        :param details: get detailed statistics from the scheduler
        :type details: bool
        :return: stats dictionary
        :rtype: dict
        """
        # Call the base Daemon one
        res = super(Alignak, self).get_daemon_stats(details=details)

        res.update({'name': self.name, 'type': self.type, 'monitored_objects': {}})

        counters = res['counters']

        # Satellites counters
        counters['brokers'] = len(self.brokers)
        counters['pollers'] = len(self.pollers)
        counters['reactionners'] = len(self.reactionners)
        counters['receivers'] = len(self.receivers)

        if not self.sched:
            return res

        # # Hosts/services problems counters
        # m_solver = MacroResolver()
        # counters['hosts_problems'] = m_solver._get_total_host_problems()
        # counters['hosts_unhandled_problems'] = m_solver._get_total_host_problems_unhandled()
        # counters['services_problems'] = m_solver._get_total_service_problems()
        # counters['services_unhandled_problems'] = m_solver._get_total_service_problems_unhandled()

        # Get statistics from the scheduler
        scheduler_stats = self.sched.get_scheduler_stats(details=details)
        res['counters'].update(scheduler_stats['counters'])
        scheduler_stats.pop('counters')
        res.update(scheduler_stats)

        return res

    def get_monitoring_problems(self):
        """Get the current scheduler livesynthesis

        :return: live synthesis and problems dictionary
        :rtype: dict
        """
        res = {}
        if not self.sched:
            return res

        # Get statistics from the scheduler
        scheduler_stats = self.sched.get_scheduler_stats(details=True)
        if 'livesynthesis' in scheduler_stats:
            res['livesynthesis'] = scheduler_stats['livesynthesis']
        if 'problems' in scheduler_stats:
            res['problems'] = scheduler_stats['problems']

        return res

    def main(self):
        """Main function for Scheduler, launch after the init::

        * Init daemon
        * Load module manager
        * Launch main loop
        * Catch any Exception that occurs

        :return: None
        """
        try:
            # Start the daemon mode
            if not self.do_daemon_init_and_start():
                self.exit_on_error(message="Daemon initialization error", exit_code=3)

            # We wait for initial conf
            self.wait_for_initial_conf()
            if self.new_conf:
                # Setup the received configuration
                self.setup_new_conf()

                # Now the main loop
                self.do_main_loop()
                logger.info("Exited from the main loop.")

                # On main loop exit, call the scheduler after run process
                self.sched.after_run()

            self.request_stop()
        except Exception:  # pragma: no cover, this should never happen indeed ;)
            self.exit_on_exception(traceback.format_exc())
            raise
class Alignak(BaseSatellite):
    # pylint: disable=too-many-instance-attributes
    """Scheduler daemon class. Referenced as "app" in most Interface.

    This daemon hosts a :class:`Scheduler` instance (``self.sched``) and manages
    its life cycle: receiving the configuration from the arbiter, connecting to
    the other satellites (pollers, reactionners, brokers, receivers) and running
    the scheduling loop.
    """

    properties = BaseSatellite.properties.copy()
    properties.update({
        'type': StringProp(default='scheduler'),
        'port': IntegerProp(default=7768)
    })

    def __init__(self, **kwargs):
        """Scheduler daemon initialisation

        :param kwargs: command line arguments
        """
        super(Alignak, self).__init__(kwargs.get('daemon_name', 'Default-scheduler'), **kwargs)

        self.http_interface = SchedulerInterface(self)
        self.sched = Scheduler(self)

        # stats part
        # --- copied from scheduler.py
        self.nb_pulled_checks = 0
        self.nb_pulled_actions = 0
        # self.nb_checks_send = 0

        self.nb_pushed_checks = 0
        self.nb_pushed_actions = 0

        self.nb_pulled_broks = 0
        # ---

        # And possible links for satellites
        self.brokers = {}
        self.pollers = {}
        self.reactionners = {}
        self.receivers = {}

        # This because it is the Satellite that has these properties and I am a Satellite
        # todo: change this?
        # Broks are stored in each broker link, not locally
        # self.broks = []
        self.broks_lock = threading.RLock()

        # Modules are only loaded one time
        self.have_modules = False

        self.first_scheduling = False

    def get_broks(self, broker_name):
        """Get the broks to be sent to a specific broker.

        Collects the broks of the named broker link that were already provided to
        the external modules, marks them as got and removes them from the link.

        :param broker_name: broker name to send broks
        :type broker_name: str
        :return: list of broks for this broker
        :rtype: list[alignak.brok.Brok]
        """
        logger.debug("Broker %s requests my broks list", broker_name)
        res = []
        if not broker_name:
            return res

        for broker_link in list(self.brokers.values()):
            if broker_name == broker_link.name:
                for brok in sorted(broker_link.broks, key=lambda x: x.creation_time):
                    # Only provide broks that were already sent to our external modules
                    if getattr(brok, 'sent_to_externals', False):
                        res.append(brok)
                        brok.got = True
                # Drop the broks that were just provided
                broker_link.broks = [b for b in broker_link.broks
                                     if not getattr(b, 'got', False)]
                logger.debug("Providing %d broks to %s", len(res), broker_name)
                break
        else:
            # for/else: no broker link matched the requested name
            logger.warning("Got a brok request from an unknown broker: %s", broker_name)
        return res

    def compensate_system_time_change(self, difference):  # pragma: no cover,
        # pylint: disable=too-many-branches
        # not with unit tests
        """Compensate a system time change of difference for all hosts/services/checks/notifs

        :param difference: difference in seconds
        :type difference: int
        :return: None
        """
        super(Alignak, self).compensate_system_time_change(difference)

        # We only need to change some value
        self.program_start = max(0, self.program_start + difference)

        if not hasattr(self.sched, "conf"):
            # Race condition where time change before getting conf
            return

        # Then we compensate all host/services
        for host in self.sched.hosts:
            host.compensate_system_time_change(difference)
        for serv in self.sched.services:
            serv.compensate_system_time_change(difference)

        # Now all checks and actions
        for chk in list(self.sched.checks.values()):
            # Already launched checks should not be touched
            if chk.status == u'scheduled' and chk.t_to_go is not None:
                t_to_go = chk.t_to_go
                ref = self.sched.find_item_by_id(chk.ref)
                new_t = max(0, t_to_go + difference)
                timeperiod = self.sched.timeperiods[ref.check_period]
                if timeperiod is not None:
                    # But it's not so simple, we must match the timeperiod
                    new_t = timeperiod.get_next_valid_time_from_t(new_t)
                # But maybe there is no more new value! Not good :(
                # Flag as error, with error output
                if new_t is None:
                    chk.state = u'waitconsume'
                    chk.exit_status = 2
                    chk.output = '(Error: there is no available check time after time change!)'
                    chk.check_time = time.time()
                    chk.execution_time = 0
                else:
                    chk.t_to_go = new_t
                    ref.next_chk = new_t

        # Now all checks and actions
        for act in list(self.sched.actions.values()):
            # Already launched checks should not be touched
            if act.status == u'scheduled':
                t_to_go = act.t_to_go

                # Event handlers do not have a ref
                ref_id = getattr(act, 'ref', None)
                new_t = max(0, t_to_go + difference)

                # Notification should be checked with notification_period
                if act.is_a == u'notification':
                    ref = self.sched.find_item_by_id(ref_id)
                    if ref.notification_period:
                        # But it's not so simple, we must match the timeperiod
                        notification_period = self.sched.timeperiods[ref.notification_period]
                        new_t = notification_period.get_next_valid_time_from_t(new_t)
                    # And it got a creation_time variable too
                    act.creation_time += difference

                # But maybe there is no more new value! Not good :(
                # Flag as error, with error output
                if new_t is None:
                    act.state = 'waitconsume'
                    act.exit_status = 2
                    act.output = '(Error: there is no available check time after time change!)'
                    act.check_time = time.time()
                    act.execution_time = 0
                else:
                    act.t_to_go = new_t

    def do_before_loop(self):
        """Stop the scheduling process before entering the daemon loop.

        :return: None
        """
        if self.sched:
            self.sched.stop_scheduling()

    def do_loop_turn(self):
        """Scheduler loop turn

        Simply run the Alignak scheduler loop

        This is called when a configuration got received by the scheduler daemon. As of it,
        check if the first scheduling has been done... and manage this.

        :return: None
        """
        if not self.first_scheduling:
            # Ok, now all is initialized, we can make the initial broks
            logger.info("First scheduling launched")
            _t0 = time.time()
            # Program start brok
            self.sched.initial_program_status()
            # First scheduling
            self.sched.schedule()
            statsmgr.timer('first_scheduling', time.time() - _t0)
            logger.info("First scheduling done")

            # Connect to our passive satellites if needed
            for satellite in [s for s in list(self.pollers.values()) if s.passive]:
                if not self.daemon_connection_init(satellite):
                    logger.error("Passive satellite connection failed: %s", satellite)

            for satellite in [s for s in list(self.reactionners.values()) if s.passive]:
                if not self.daemon_connection_init(satellite):
                    logger.error("Passive satellite connection failed: %s", satellite)

            # Ticks are for recurrent function calls like consume, del zombies etc
            self.sched.ticks = 0
            self.first_scheduling = True

        # Each loop turn, execute the daemon specific treatment...
        # only if the daemon has a configuration to manage
        if self.sched.pushed_conf:
            # If scheduling is not yet enabled, enable scheduling
            if not self.sched.must_schedule:
                self.sched.start_scheduling()
            self.sched.before_run()
            self.sched.run()
        else:
            logger.warning("#%d - No monitoring configuration to scheduler...",
                           self.loop_count)

    def get_managed_configurations(self):
        """Get the configurations managed by this scheduler

        The configuration managed by a scheduler is the self configuration got
        by the scheduler during the dispatching.

        :return: a dict of scheduler links with instance_id as key and
        hash, push_flavor and configuration identifier as values
        :rtype: dict
        """
        # for scheduler_link in list(self.schedulers.values()):
        #     res[scheduler_link.instance_id] = {
        #         'hash': scheduler_link.hash,
        #         'push_flavor': scheduler_link.push_flavor,
        #         'managed_conf_id': scheduler_link.managed_conf_id
        #     }
        res = {}
        if self.sched.pushed_conf and self.cur_conf and 'instance_id' in self.cur_conf:
            res[self.cur_conf['instance_id']] = {
                'hash': self.cur_conf['hash'],
                'push_flavor': self.cur_conf['push_flavor'],
                'managed_conf_id': self.cur_conf['managed_conf_id']
            }
        logger.debug("Get managed configuration: %s", res)
        return res

    def setup_new_conf(self):
        # pylint: disable=too-many-statements, too-many-branches, too-many-locals
        """Setup new conf received for scheduler

        :return: None
        """
        # Execute the base class treatment...
        super(Alignak, self).setup_new_conf()

        # ...then our own specific treatment!
        with self.conf_lock:
            # self_conf is our own configuration from the alignak environment
            # self_conf = self.cur_conf['self_conf']
            logger.debug("Got config: %s", self.cur_conf)
            if 'conf_part' not in self.cur_conf:
                self.cur_conf['conf_part'] = None
            conf_part = self.cur_conf['conf_part']

            # Ok now we can save the retention data
            if self.sched.pushed_conf is not None:
                self.sched.update_retention()

            # Get the monitored objects configuration
            t00 = time.time()
            received_conf_part = None
            try:
                received_conf_part = unserialize(conf_part)
                assert received_conf_part is not None
            except AssertionError as exp:
                # This to indicate that no configuration is managed by this scheduler...
                logger.warning("No managed configuration received from arbiter")
            except AlignakClassLookupException as exp:  # pragma: no cover
                # This to indicate that the new configuration is not managed...
                self.new_conf = {
                    "_status": "Cannot un-serialize configuration received from arbiter",
                    "_error": str(exp)
                }
                logger.error(self.new_conf)
                logger.error("Back trace of the error:\n%s", traceback.format_exc())
                return
            except Exception as exp:  # pylint: disable=broad-except
                # This to indicate that the new configuration is not managed...
                self.new_conf = {
                    "_status": "Cannot un-serialize configuration received from arbiter",
                    "_error": str(exp)
                }
                logger.error(self.new_conf)
                self.exit_on_exception(exp, str(self.new_conf))

            # if not received_conf_part:
            #     return

            logger.info("Monitored configuration %s received at %d. Un-serialized in %d secs",
                        received_conf_part, t00, time.time() - t00)
            logger.info("Scheduler received configuration : %s", received_conf_part)

            # Now we create our pollers, reactionners and brokers
            for link_type in ['pollers', 'reactionners', 'brokers']:
                if link_type not in self.cur_conf['satellites']:
                    logger.error("Missing %s in the configuration!", link_type)
                    continue

                my_satellites = getattr(self, link_type, {})
                received_satellites = self.cur_conf['satellites'][link_type]
                for link_uuid in received_satellites:
                    rs_conf = received_satellites[link_uuid]
                    logger.debug("- received %s - %s: %s", rs_conf['instance_id'],
                                 rs_conf['type'], rs_conf['name'])

                    # Must look if we already had a configuration and save our broks
                    already_got = rs_conf['instance_id'] in my_satellites
                    broks = []
                    actions = {}
                    wait_homerun = {}
                    external_commands = {}
                    running_id = 0
                    if already_got:
                        logger.warning("I already got: %s", rs_conf['instance_id'])
                        # Save some information
                        running_id = my_satellites[link_uuid].running_id
                        (broks, actions,
                         wait_homerun, external_commands) = \
                            my_satellites[link_uuid].get_and_clear_context()
                        # Delete the former link
                        del my_satellites[link_uuid]

                    # My new satellite link...
                    new_link = SatelliteLink.get_a_satellite_link(link_type[:-1], rs_conf)
                    my_satellites[new_link.uuid] = new_link
                    logger.info("I got a new %s satellite: %s", link_type[:-1], new_link)

                    new_link.running_id = running_id
                    new_link.external_commands = external_commands
                    new_link.broks = broks
                    new_link.wait_homerun = wait_homerun
                    new_link.actions = actions

                    # Replacing the satellite address and port by those defined in satellite_map
                    if new_link.name in self.cur_conf['override_conf'].get('satellite_map', {}):
                        override_conf = self.cur_conf['override_conf']
                        overriding = override_conf.get('satellite_map')[new_link.name]
                        logger.warning("Do not override the configuration for: %s, with: %s. "
                                       "Please check whether this is necessary!",
                                       new_link.name, overriding)

            # First mix conf and override_conf to have our definitive conf
            for prop in getattr(self.cur_conf, 'override_conf', []):
                logger.debug("Overriden: %s / %s ", prop,
                             getattr(received_conf_part, prop, None))
                logger.debug("Overriding: %s / %s ", prop, self.cur_conf['override_conf'])
                setattr(received_conf_part, prop, self.cur_conf['override_conf'].get(prop, None))

            # Scheduler modules
            if not self.have_modules:
                try:
                    logger.debug("Modules configuration: %s", self.cur_conf['modules'])
                    self.modules = unserialize(self.cur_conf['modules'], no_load=True)
                except AlignakClassLookupException as exp:  # pragma: no cover, simple protection
                    logger.error('Cannot un-serialize modules configuration '
                                 'received from arbiter: %s', exp)
                if self.modules:
                    logger.debug("I received some modules configuration: %s", self.modules)
                    self.have_modules = True

                    self.do_load_modules(self.modules)
                    # and start external modules too
                    self.modules_manager.start_external_instances()
                else:
                    logger.info("I do not have modules")

            if received_conf_part:
                logger.info("Loading configuration...")

                # Propagate the global parameters to the configuration items
                received_conf_part.explode_global_conf()

                # We give the configuration to our scheduler
                self.sched.reset()
                self.sched.load_conf(self.cur_conf['instance_id'],
                                     self.cur_conf['instance_name'],
                                     received_conf_part)

                # Once loaded, the scheduler has an inner pushed_conf object
                logger.info("Loaded: %s", self.sched.pushed_conf)

                # Update the scheduler ticks according to the daemon configuration
                self.sched.update_recurrent_works_tick(self)

                # We must update our pushed configuration macros with correct values
                # from the configuration parameters
                # self.sched.pushed_conf.fill_resource_macros_names_macros()

                # Creating the Macroresolver Class & unique instance
                m_solver = MacroResolver()
                m_solver.init(received_conf_part)

                # Now create the external commands manager
                # We are an applyer: our role is not to dispatch commands, but to apply them
                ecm = ExternalCommandManager(
                    received_conf_part, 'applyer', self.sched,
                    received_conf_part.accept_passive_unknown_check_results,
                    received_conf_part.log_external_commands)

                # Scheduler needs to know about this external command manager to use it
                # if necessary
                self.sched.external_commands_manager = ecm

                # Ok now we can load the retention data
                self.sched.retention_load()

                # Log hosts/services initial states
                self.sched.log_initial_states()

            # Create brok new conf
            brok = Brok({'type': 'new_conf', 'data': {}})
            self.sched.add_brok(brok)

            # Initialize connection with all our satellites
            logger.info("Initializing connection with my satellites:")
            my_satellites = self.get_links_of_type(s_type='')
            for satellite in list(my_satellites.values()):
                logger.info("- : %s/%s", satellite.type, satellite.name)
                if not self.daemon_connection_init(satellite):
                    logger.error("Satellite connection failed: %s", satellite)

        if received_conf_part:
            # Enable the scheduling process
            logger.info("Loaded: %s", self.sched.pushed_conf)
            self.sched.start_scheduling()

        # Now I have a configuration!
        self.have_conf = True

    def clean_previous_run(self):
        """Clean variables from previous configuration

        :return: None
        """
        # Execute the base class treatment...
        super(Alignak, self).clean_previous_run()

        # Clean all lists
        self.pollers.clear()
        self.reactionners.clear()
        self.brokers.clear()

    def get_daemon_stats(self, details=False):
        """Increase the stats provided by the Daemon base class

        :param details: get detailed statistics from the scheduler
        :type details: bool
        :return: stats dictionary
        :rtype: dict
        """
        # Call the base Daemon one
        res = super(Alignak, self).get_daemon_stats(details=details)

        res.update({'name': self.name, 'type': self.type, 'monitored_objects': {}})

        counters = res['counters']

        # Satellites counters
        counters['brokers'] = len(self.brokers)
        counters['pollers'] = len(self.pollers)
        counters['reactionners'] = len(self.reactionners)
        counters['receivers'] = len(self.receivers)

        if not self.sched:
            return res

        # # Hosts/services problems counters
        # m_solver = MacroResolver()
        # counters['hosts_problems'] = m_solver._get_total_host_problems()
        # counters['hosts_unhandled_problems'] = m_solver._get_total_host_problems_unhandled()
        # counters['services_problems'] = m_solver._get_total_service_problems()
        # counters['services_unhandled_problems'] = m_solver._get_total_service_problems_unhandled()

        # Get statistics from the scheduler
        scheduler_stats = self.sched.get_scheduler_stats(details=details)
        res['counters'].update(scheduler_stats['counters'])
        scheduler_stats.pop('counters')
        res.update(scheduler_stats)

        return res

    def get_monitoring_problems(self):
        """Get the current scheduler livesynthesis

        :return: live synthesis and problems dictionary
        :rtype: dict
        """
        res = {}
        if not self.sched:
            return res

        # Get statistics from the scheduler
        scheduler_stats = self.sched.get_scheduler_stats(details=True)
        if 'livesynthesis' in scheduler_stats:
            res['livesynthesis'] = scheduler_stats['livesynthesis']
        if 'problems' in scheduler_stats:
            res['problems'] = scheduler_stats['problems']

        return res

    def main(self):
        """Main function for Scheduler, launch after the init::

        * Init daemon
        * Load module manager
        * Launch main loop
        * Catch any Exception that occurs

        :return: None
        """
        try:
            # Start the daemon mode
            if not self.do_daemon_init_and_start():
                self.exit_on_error(message="Daemon initialization error", exit_code=3)

            # We wait for initial conf
            self.wait_for_initial_conf()
            if self.new_conf:
                # Setup the received configuration
                self.setup_new_conf()

                # Now the main loop
                self.do_main_loop()
                logger.info("Exited from the main loop.")

                # On main loop exit, call the scheduler after run process
                self.sched.after_run()

            self.request_stop()
        except Exception:  # pragma: no cover, this should never happen indeed ;)
            self.exit_on_exception(traceback.format_exc())
            raise
class Alignak(BaseSatellite):
    """Scheduler daemon class. Referenced as "app" in most Interface.

    Wraps a :class:`Scheduler` instance and handles the daemon life cycle:
    configuration reception from the arbiter, satellite link setup
    (pollers / reactionners / brokers), signal handling and the main loop.
    """

    properties = BaseSatellite.properties.copy()
    properties.update({
        'daemon_type': StringProp(default='scheduler'),
        'pidfile': PathProp(default='schedulerd.pid'),
        'port': IntegerProp(default=7768),
        'local_log': PathProp(default='schedulerd.log'),
    })

    def __init__(self, config_file, is_daemon, do_replace, debug, debug_file,
                 port=None, local_log=None, daemon_name=None):
        """Initialize the scheduler daemon.

        :param daemon_name: daemon instance name, defaults to 'scheduler'
        """
        self.daemon_name = 'scheduler'
        if daemon_name:
            self.daemon_name = daemon_name

        BaseSatellite.__init__(self, self.daemon_name, config_file, is_daemon,
                               do_replace, debug, debug_file, port, local_log)

        self.http_interface = SchedulerInterface(self)
        self.sched = Scheduler(self)

        self.must_run = True

        # Now the interface
        self.uri = None
        self.uri2 = None

        # stats part
        # --- copied from scheduler.py
        self.nb_pulled_checks = 0
        self.nb_pulled_actions = 0
        # self.nb_checks_send = 0

        self.nb_pushed_checks = 0
        self.nb_pushed_actions = 0

        self.nb_broks_send = 0
        self.nb_pulled_broks = 0
        # ---

        # And possible links for satellites
        # from now only pollers
        self.pollers = {}
        self.reactionners = {}
        self.brokers = {}

    def compensate_system_time_change(self, difference, timeperiods):  # pragma: no cover,
        # not with unit tests
        """Compensate a system time change of difference for all hosts/services/checks/notifs

        Re-anchors all scheduled checks and actions after a clock jump,
        re-validating the new times against the items' timeperiods.

        :param difference: difference in seconds (may be negative)
        :type difference: int
        :param timeperiods: timeperiod items, indexed by uuid
        :return: None
        """
        logger.warning("A system time change of %d has been detected. Compensating...",
                       difference)
        # We only need to change some value
        self.program_start = max(0, self.program_start + difference)
        if not hasattr(self.sched, "conf"):
            # Race condition where time changed before getting conf
            return

        # Then we compensate all host/services
        for host in self.sched.hosts:
            host.compensate_system_time_change(difference)
        for serv in self.sched.services:
            serv.compensate_system_time_change(difference)

        # Now all checks
        for chk in self.sched.checks.values():
            # Already launched checks should not be touched
            if chk.status == 'scheduled' and chk.t_to_go is not None:
                t_to_go = chk.t_to_go
                ref = self.sched.find_item_by_id(chk.ref)
                new_t = max(0, t_to_go + difference)
                timeperiod = timeperiods[ref.check_period]
                if timeperiod is not None:
                    # It's not so simple: the new time must match the timeperiod
                    new_t = timeperiod.get_next_valid_time_from_t(new_t)

                # But maybe there is no new valid value! Not good :(
                # Flag the check as an error, with error output
                if new_t is None:
                    chk.state = 'waitconsume'
                    chk.exit_status = 2
                    chk.output = '(Error: there is no available check time after time change!)'
                    chk.check_time = time.time()
                    chk.execution_time = 0
                else:
                    chk.t_to_go = new_t
                    ref.next_chk = new_t

        # Now all actions (notifications, event handlers, ...)
        for act in self.sched.actions.values():
            # Already launched actions should not be touched
            if act.status == 'scheduled':
                t_to_go = act.t_to_go

                # Event handlers do not have a ref
                ref_id = getattr(act, 'ref', None)
                new_t = max(0, t_to_go + difference)

                # Notifications must be re-checked against the notification_period
                if act.is_a == 'notification':
                    ref = self.sched.find_item_by_id(ref_id)
                    if ref.notification_period:
                        # It's not so simple: the new time must match the timeperiod
                        notification_period = self.sched.timeperiods[ref.notification_period]
                        new_t = notification_period.get_next_valid_time_from_t(new_t)

                    # And it got a creation_time variable too
                    act.creation_time += difference

                # But maybe there is no new valid value! Not good :(
                # Flag the action as an error, with error output
                if new_t is None:
                    act.state = 'waitconsume'
                    act.exit_status = 2
                    act.output = '(Error: there is no available check time after time change!)'
                    act.check_time = time.time()
                    act.execution_time = 0
                else:
                    act.t_to_go = new_t

    def manage_signal(self, sig, frame):
        """Manage signals caught by the daemon
        signal.SIGUSR1 : dump_memory
        signal.SIGUSR2 : dump_object (nothing)
        signal.SIGTERM, signal.SIGINT : terminate process

        :param sig: signal caught by daemon
        :type sig: str
        :param frame: current stack frame
        :type frame:
        :return: None
        TODO: Refactor with Daemon one
        """
        logger.info("scheduler process %d received a signal: %s", os.getpid(), str(sig))
        # If we got USR1, just dump memory
        if sig == signal.SIGUSR1:
            self.sched.need_dump_memory = True
        elif sig == signal.SIGUSR2:  # usr2, dump objects
            self.sched.need_objects_dump = True
        else:  # if not, die :)
            logger.info("scheduler process %d is dying...", os.getpid())
            self.sched.die()
            self.must_run = False
        # Let the base Daemon handler run too (termination, ...)
        Daemon.manage_signal(self, sig, frame)

    def do_loop_turn(self):
        """Scheduler loop turn
        Basically wait initial conf and run

        :return: None
        """
        # Ok, now the conf
        self.wait_for_initial_conf()
        if not self.new_conf:
            return
        logger.info("New configuration received")
        self.setup_new_conf()
        logger.info("[%s] New configuration loaded, scheduling for Alignak: %s",
                    self.name, self.sched.alignak_name)
        # Hand control to the scheduler; returns when the scheduler stops
        self.sched.run()

    def setup_new_conf(self):
        # pylint: disable=too-many-statements
        """Setup new conf received for scheduler

        Un-serializes the configuration pushed by the arbiter, registers the
        stats manager, builds the satellite links and loads everything into
        the embedded scheduler.

        :return: None
        """
        with self.conf_lock:
            self.clean_previous_run()
            new_conf = self.new_conf
            logger.info("[%s] Sending us a configuration", self.name)
            conf_raw = new_conf['conf']
            override_conf = new_conf['override_conf']
            modules = new_conf['modules']
            satellites = new_conf['satellites']
            instance_name = new_conf['instance_name']

            # Ok now we can save the retention data
            if hasattr(self.sched, 'conf'):
                self.sched.update_retention_file(forced=True)

            # Hooray, we got a name, we can set it in our stats objects
            statsmgr.register(instance_name, 'scheduler',
                              statsd_host=new_conf['statsd_host'],
                              statsd_port=new_conf['statsd_port'],
                              statsd_prefix=new_conf['statsd_prefix'],
                              statsd_enabled=new_conf['statsd_enabled'])

            t00 = time.time()
            try:
                conf = unserialize(conf_raw)
            except AlignakClassLookupException as exp:  # pragma: no cover, simple protection
                logger.error('Cannot un-serialize configuration received from arbiter: %s', exp)
            logger.debug("Conf received at %d. Un-serialized in %d secs", t00, time.time() - t00)
            self.new_conf = None

            if 'scheduler_name' in new_conf:
                name = new_conf['scheduler_name']
            else:
                name = instance_name
            self.name = name

            # Set my own process title
            self.set_proctitle(self.name)

            logger.info("[%s] Received a new configuration, containing: ", self.name)
            for key in new_conf:
                logger.info("[%s] - %s", self.name, key)
            logger.info("[%s] configuration identifiers: %s (%s)",
                        self.name, new_conf['conf_uuid'], new_conf['push_flavor'])

            # Tag the conf with our data
            self.conf = conf
            self.conf.push_flavor = new_conf['push_flavor']
            self.conf.alignak_name = new_conf['alignak_name']
            self.conf.instance_name = instance_name
            self.conf.skip_initial_broks = new_conf['skip_initial_broks']
            self.conf.accept_passive_unknown_check_results = \
                new_conf['accept_passive_unknown_check_results']

            self.cur_conf = conf
            self.override_conf = override_conf
            self.modules = unserialize(modules, True)
            self.satellites = satellites

            # Now We create our pollers, reactionners and brokers
            for sat_type in ['pollers', 'reactionners', 'brokers']:
                if sat_type not in satellites:
                    continue
                for sat_id in satellites[sat_type]:
                    # Must look if we already have it
                    sats = getattr(self, sat_type)
                    sat = satellites[sat_type][sat_id]

                    sats[sat_id] = sat

                    # The satellitemap may override address/port for this satellite
                    if sat['name'] in override_conf['satellitemap']:
                        sat = dict(sat)  # make a copy
                        sat.update(override_conf['satellitemap'][sat['name']])

                    proto = 'http'
                    if sat['use_ssl']:
                        proto = 'https'
                    uri = '%s://%s:%s/' % (proto, sat['address'], sat['port'])

                    # Fresh connection state for the link
                    sats[sat_id]['uri'] = uri
                    sats[sat_id]['con'] = None
                    sats[sat_id]['running_id'] = 0
                    sats[sat_id]['last_connection'] = 0
                    sats[sat_id]['connection_attempt'] = 0
                    sats[sat_id]['max_failed_connections'] = 3
                    setattr(self, sat_type, sats)
                logger.debug("We have our %s: %s ", sat_type, satellites[sat_type])
                logger.info("We have our %s:", sat_type)
                for daemon in satellites[sat_type].values():
                    logger.info(" - %s ", daemon['name'])

            # First mix conf and override_conf to have our definitive conf
            for prop in self.override_conf:
                val = self.override_conf[prop]
                setattr(self.conf, prop, val)

            if self.conf.use_timezone != '':
                logger.info("Setting our timezone to %s", str(self.conf.use_timezone))
                os.environ['TZ'] = self.conf.use_timezone
                time.tzset()

            self.do_load_modules(self.modules)

            logger.info("Loading configuration.")
            self.conf.explode_global_conf()  # pylint: disable=E1101

            # we give sched it's conf
            self.sched.reset()
            self.sched.load_conf(self.conf)
            self.sched.load_satellites(self.pollers, self.reactionners, self.brokers)

            # We must update our Config dict macro with good value
            # from the config parameters
            self.sched.conf.fill_resource_macros_names_macros()

            # Creating the Macroresolver Class & unique instance
            m_solver = MacroResolver()
            m_solver.init(self.conf)

            # self.conf.dump()
            # self.conf.quick_debug()

            # Now create the external commands manager
            # We are an applyer: our role is not to dispatch commands, but to apply them
            ecm = ExternalCommandManager(self.conf, 'applyer', self.sched)

            # Scheduler needs to know about this external command manager to use it if necessary
            self.sched.set_external_commands_manager(ecm)
            # Update External Commands Manager
            self.sched.external_commands_manager.accept_passive_unknown_check_results = \
                self.sched.conf.accept_passive_unknown_check_results

            # We clear our schedulers managed (it's us :) )
            # and set ourselves in it
            self.schedulers = {self.conf.uuid: self.sched}  # pylint: disable=E1101

            # Ok now we can load the retention data
            self.sched.retention_load()

            # Create brok new conf
            brok = Brok({'type': 'new_conf', 'data': {}})
            self.sched.add_brok(brok)

    def what_i_managed(self):  # pylint: disable=no-member
        """Get my managed dict (instance id and push_flavor)

        :return: dict containing instance_id key and push flavor value
        :rtype: dict
        """
        if hasattr(self, 'conf'):
            return {self.conf.uuid: self.conf.push_flavor}  # pylint: disable=E1101
        return {}

    def clean_previous_run(self):
        """Clean variables from previous configuration

        :return: None
        """
        # Clean all lists
        self.pollers.clear()
        self.reactionners.clear()
        self.brokers.clear()

    def main(self):
        """Main function for Scheduler, launch after the init::

        * Init daemon
        * Load module manager
        * Launch main loop
        * Catch any Exception that occurs

        :return: None
        """
        try:
            self.setup_alignak_logger()

            # Look if we are enabled or not. If ok, start the daemon mode
            self.look_for_early_exit()

            # todo:
            # This function returns False if some problem is detected during initialization
            # (eg. communication port not free)
            # Perhaps we should stop the initialization process and exit?
            if not self.do_daemon_init_and_start():
                return

            self.load_modules_manager(self.name)

            self.uri = self.http_daemon.uri
            logger.info("[Scheduler] General interface is at: %s", self.uri)

            self.do_mainloop()
        except Exception:
            self.print_unrecoverable(traceback.format_exc())
            raise
class Alignak(BaseSatellite):
    """Scheduler daemon class. Referenced as "app" in most Interface.

    Wraps a :class:`Scheduler` instance and handles the daemon life cycle:
    configuration reception from the arbiter, poller/reactionner link setup,
    signal handling and the main loop.
    """

    properties = BaseSatellite.properties.copy()
    properties.update({
        'pidfile': PathProp(default='schedulerd.pid'),
        'port': IntegerProp(default=7768),
        'local_log': PathProp(default='schedulerd.log'),
    })

    def __init__(self, config_file, is_daemon, do_replace, debug, debug_file, profile=''):
        """Initialize the scheduler daemon.

        :param profile: kept for interface compatibility, unused here
        """
        BaseSatellite.__init__(self, 'scheduler', config_file, is_daemon, do_replace,
                               debug, debug_file)

        self.http_interface = SchedulerInterface(self)
        self.sched = Scheduler(self)

        self.must_run = True

        # Now the interface
        self.uri = None
        self.uri2 = None

        # And possible links for satellites
        # from now only pollers
        self.pollers = {}
        self.reactionners = {}
        self.brokers = {}

    def compensate_system_time_change(self, difference):
        """Compensate a system time change of difference for all hosts/services/checks/notifs

        Re-anchors all scheduled checks and actions after a clock jump,
        re-validating the new times against the items' timeperiods.

        :param difference: difference in seconds (may be negative)
        :type difference: int
        :return: None
        """
        logger.warning("A system time change of %d has been detected. Compensating...",
                       difference)
        # We only need to change some value
        self.program_start = max(0, self.program_start + difference)
        if not hasattr(self.sched, "conf"):
            # Race condition where time changed before getting conf
            return

        # Then we compensate all host/services
        for host in self.sched.hosts:
            host.compensate_system_time_change(difference)
        for serv in self.sched.services:
            serv.compensate_system_time_change(difference)

        # Now all checks
        for chk in self.sched.checks.values():
            # Already launched checks should not be touched
            if chk.status == 'scheduled' and chk.t_to_go is not None:
                t_to_go = chk.t_to_go
                ref = chk.ref
                new_t = max(0, t_to_go + difference)
                if ref.check_period is not None:
                    # It's not so simple: the new time must match the timeperiod
                    new_t = ref.check_period.get_next_valid_time_from_t(new_t)

                # But maybe there is no new valid value! Not good :(
                # Flag the check as an error, with error output
                if new_t is None:
                    chk.state = 'waitconsume'
                    chk.exit_status = 2
                    chk.output = '(Error: there is no available check time after time change!)'
                    chk.check_time = time.time()
                    chk.execution_time = 0
                else:
                    chk.t_to_go = new_t
                    ref.next_chk = new_t

        # Now all actions (notifications, event handlers, ...)
        for act in self.sched.actions.values():
            # Already launched actions should not be touched
            if act.status == 'scheduled':
                t_to_go = act.t_to_go

                # Event handlers do not have a ref
                ref = getattr(act, 'ref', None)
                new_t = max(0, t_to_go + difference)

                # Notifications must be re-checked against the notification_period
                if act.is_a == 'notification':
                    if ref.notification_period:
                        # It's not so simple: the new time must match the timeperiod
                        new_t = ref.notification_period.get_next_valid_time_from_t(new_t)

                    # And it got a creation_time variable too
                    act.creation_time += difference

                # But maybe there is no new valid value! Not good :(
                # Flag the action as an error, with error output
                if new_t is None:
                    act.state = 'waitconsume'
                    act.exit_status = 2
                    act.output = '(Error: there is no available check time after time change!)'
                    act.check_time = time.time()
                    act.execution_time = 0
                else:
                    act.t_to_go = new_t

    def manage_signal(self, sig, frame):
        """Manage signals caught by the daemon
        signal.SIGUSR1 : dump_memory
        signal.SIGUSR2 : dump_object (nothing)
        signal.SIGTERM, signal.SIGINT : terminate process

        :param sig: signal caught by daemon
        :type sig: str
        :param frame: current stack frame
        :type frame:
        :return: None
        TODO: Refactor with Daemon one
        """
        logger.warning("%s > Received a SIGNAL %s", process.current_process(), sig)
        # If we got USR1, just dump memory
        if sig == signal.SIGUSR1:
            self.sched.need_dump_memory = True
        elif sig == signal.SIGUSR2:  # usr2, dump objects
            self.sched.need_objects_dump = True
        else:  # if not, die :)
            self.sched.die()
            self.must_run = False
        # Let the base Daemon handler run too (termination, ...)
        Daemon.manage_signal(self, sig, frame)

    def do_loop_turn(self):
        """Scheduler loop turn
        Basically wait initial conf and run

        :return: None
        """
        # Ok, now the conf
        self.wait_for_initial_conf()
        if not self.new_conf:
            return
        logger.info("New configuration received")
        self.setup_new_conf()
        logger.info("New configuration loaded")
        # Hand control to the scheduler; returns when the scheduler stops
        self.sched.run()

    def setup_new_conf(self):
        """Setup new conf received for scheduler

        Un-pickles the configuration pushed by the arbiter, registers the
        stats manager, builds the poller/reactionner links and loads
        everything into the embedded scheduler.

        :return: None
        """
        with self.conf_lock:
            new_c = self.new_conf
            conf_raw = new_c['conf']
            override_conf = new_c['override_conf']
            modules = new_c['modules']
            satellites = new_c['satellites']
            instance_name = new_c['instance_name']
            push_flavor = new_c['push_flavor']
            skip_initial_broks = new_c['skip_initial_broks']
            accept_passive_unknown_chk_res = new_c['accept_passive_unknown_check_results']
            api_key = new_c['api_key']
            secret = new_c['secret']
            http_proxy = new_c['http_proxy']
            statsd_host = new_c['statsd_host']
            statsd_port = new_c['statsd_port']
            statsd_prefix = new_c['statsd_prefix']
            statsd_enabled = new_c['statsd_enabled']

            # Hooray, we got a name, we can set it in our stats objects
            statsmgr.register(self.sched, instance_name, 'scheduler',
                              api_key=api_key, secret=secret, http_proxy=http_proxy,
                              statsd_host=statsd_host, statsd_port=statsd_port,
                              statsd_prefix=statsd_prefix, statsd_enabled=statsd_enabled)

            t00 = time.time()
            # NOTE(review): cPickle.loads on data received over the network is
            # unsafe with untrusted peers; kept as-is because the arbiter link
            # is assumed trusted in this version.
            conf = cPickle.loads(conf_raw)
            logger.debug("Conf received at %d. Unserialized in %d secs", t00, time.time() - t00)
            self.new_conf = None

            # Tag the conf with our data
            self.conf = conf
            self.conf.push_flavor = push_flavor
            self.conf.instance_name = instance_name
            self.conf.skip_initial_broks = skip_initial_broks
            self.conf.accept_passive_unknown_check_results = accept_passive_unknown_chk_res

            self.cur_conf = conf
            self.override_conf = override_conf
            self.modules = modules
            self.satellites = satellites
            # self.pollers = self.app.pollers

            if self.conf.human_timestamp_log:  # pylint: disable=E1101
                logger.set_human_format()

            # Now We create our pollers
            for pol_id in satellites['pollers']:
                poll = satellites['pollers'][pol_id]
                self.pollers[pol_id] = poll

                # The satellitemap may override address/port for this poller
                if poll['name'] in override_conf['satellitemap']:
                    poll = dict(poll)  # make a copy
                    poll.update(override_conf['satellitemap'][poll['name']])

                proto = 'http'
                if poll['use_ssl']:
                    proto = 'https'

                uri = '%s://%s:%s/' % (proto, poll['address'], poll['port'])
                self.pollers[pol_id]['uri'] = uri
                self.pollers[pol_id]['last_connection'] = 0

            # Now We create our reactionners
            for reac_id in satellites['reactionners']:
                reac = satellites['reactionners'][reac_id]
                self.reactionners[reac_id] = reac

                # The satellitemap may override address/port for this reactionner
                if reac['name'] in override_conf['satellitemap']:
                    reac = dict(reac)  # make a copy
                    reac.update(override_conf['satellitemap'][reac['name']])

                proto = 'http'
                # Fixed: was poll['use_ssl'] — a copy/paste leftover from the
                # pollers loop that used the wrong daemon's SSL flag (and a
                # NameError when no poller was configured)
                if reac['use_ssl']:
                    proto = 'https'

                uri = '%s://%s:%s/' % (proto, reac['address'], reac['port'])
                self.reactionners[reac_id]['uri'] = uri
                self.reactionners[reac_id]['last_connection'] = 0

            # First mix conf and override_conf to have our definitive conf
            for prop in self.override_conf:
                val = self.override_conf[prop]
                setattr(self.conf, prop, val)

            if self.conf.use_timezone != '':
                logger.debug("Setting our timezone to %s", str(self.conf.use_timezone))
                os.environ['TZ'] = self.conf.use_timezone
                time.tzset()

            if len(self.modules) != 0:
                logger.debug("I've got %s modules", str(self.modules))

            # TODO: if scheduler had previous modules instanciated it must clean them!
            self.do_load_modules(self.modules)
            logger.info("Loading configuration.")
            self.conf.explode_global_conf()

            # we give sched it's conf
            self.sched.reset()
            self.sched.load_conf(self.conf)
            self.sched.load_satellites(self.pollers, self.reactionners)

            # We must update our Config dict macro with good value
            # from the config parameters
            self.sched.conf.fill_resource_macros_names_macros()
            # print "DBG: got macros", self.sched.conf.macros

            # Creating the Macroresolver Class & unique instance
            m_solver = MacroResolver()
            m_solver.init(self.conf)

            # self.conf.dump()
            # self.conf.quick_debug()

            # Now create the external commander
            # it's an applyer: its role is not to dispatch commands,
            # but to apply them
            ecm = ExternalCommandManager(self.conf, 'applyer')

            # Scheduler needs to know about external commands to
            # activate them if necessary
            self.sched.load_external_command(ecm)

            # External command needs the sched because it can raise checks
            ecm.load_scheduler(self.sched)

            # We clear our schedulers managed (it's us :) )
            # and set ourselves in it
            self.schedulers = {self.conf.instance_id: self.sched}

    def what_i_managed(self):
        """Get my managed dict (instance id and push_flavor)

        :return: dict containing instance_id key and push flavor value
        :rtype: dict
        """
        if hasattr(self, 'conf'):
            return {self.conf.instance_id: self.conf.push_flavor}
        return {}

    def main(self):
        """Main function for Scheduler, launch after the init::

        * Init daemon
        * Load module manager
        * Launch main loop
        * Catch any Exception that occurs

        :return: None
        """
        try:
            self.load_config_file()

            # Setting log level
            logger.setLevel(self.log_level)
            # Force the debug level if the daemon is said to start with such level
            if self.debug:
                logger.setLevel('DEBUG')

            self.look_for_early_exit()
            self.do_daemon_init_and_start()
            self.load_modules_manager()

            self.uri = self.http_daemon.uri
            logger.info("[scheduler] General interface is at: %s", self.uri)
            self.do_mainloop()
        # Fixed: was Python-2-only "except Exception, exp:" (exp was unused);
        # the "as" form is valid on both Python 2.6+ and Python 3
        except Exception:
            self.print_unrecoverable(traceback.format_exc())
            raise
class Alignak(BaseSatellite):
    """Scheduler daemon class. Referenced as "app" in most Interface.

    Wraps a :class:`Scheduler` instance and handles the daemon life cycle:
    configuration reception from the arbiter, poller/reactionner link setup,
    signal handling and the main loop.
    """

    properties = BaseSatellite.properties.copy()
    properties.update({
        'pidfile': PathProp(default='schedulerd.pid'),
        'port': IntegerProp(default=7768),
        'local_log': PathProp(default='schedulerd.log'),
    })

    def __init__(self, config_file, is_daemon, do_replace, debug, debug_file, profile=''):
        """Initialize the scheduler daemon.

        :param profile: kept for interface compatibility, unused here
        """
        BaseSatellite.__init__(self, 'scheduler', config_file, is_daemon, do_replace,
                               debug, debug_file)

        self.http_interface = SchedulerInterface(self)
        self.sched = Scheduler(self)

        self.must_run = True

        # Now the interface
        self.uri = None
        self.uri2 = None

        # And possible links for satellites
        # from now only pollers
        self.pollers = {}
        self.reactionners = {}
        self.brokers = {}

    def compensate_system_time_change(self, difference):
        """Compensate a system time change of difference for all hosts/services/checks/notifs

        Re-anchors all scheduled checks and actions after a clock jump,
        re-validating the new times against the items' timeperiods.

        :param difference: difference in seconds (may be negative)
        :type difference: int
        :return: None
        """
        logger.warning("A system time change of %d has been detected. Compensating...",
                       difference)
        # We only need to change some value
        self.program_start = max(0, self.program_start + difference)
        if not hasattr(self.sched, "conf"):
            # Race condition where time changed before getting conf
            return

        # Then we compensate all host/services
        for host in self.sched.hosts:
            host.compensate_system_time_change(difference)
        for serv in self.sched.services:
            serv.compensate_system_time_change(difference)

        # Now all checks
        for chk in self.sched.checks.values():
            # Already launched checks should not be touched
            if chk.status == 'scheduled' and chk.t_to_go is not None:
                t_to_go = chk.t_to_go
                ref = chk.ref
                new_t = max(0, t_to_go + difference)
                if ref.check_period is not None:
                    # It's not so simple: the new time must match the timeperiod
                    new_t = ref.check_period.get_next_valid_time_from_t(new_t)

                # But maybe there is no new valid value! Not good :(
                # Flag the check as an error, with error output
                if new_t is None:
                    chk.state = 'waitconsume'
                    chk.exit_status = 2
                    chk.output = '(Error: there is no available check time after time change!)'
                    chk.check_time = time.time()
                    chk.execution_time = 0
                else:
                    chk.t_to_go = new_t
                    ref.next_chk = new_t

        # Now all actions (notifications, event handlers, ...)
        for act in self.sched.actions.values():
            # Already launched actions should not be touched
            if act.status == 'scheduled':
                t_to_go = act.t_to_go

                # Event handlers do not have a ref
                ref = getattr(act, 'ref', None)
                new_t = max(0, t_to_go + difference)

                # Notifications must be re-checked against the notification_period
                if act.is_a == 'notification':
                    if ref.notification_period:
                        # It's not so simple: the new time must match the timeperiod
                        new_t = ref.notification_period.get_next_valid_time_from_t(new_t)

                    # And it got a creation_time variable too
                    act.creation_time += difference

                # But maybe there is no new valid value! Not good :(
                # Flag the action as an error, with error output
                if new_t is None:
                    act.state = 'waitconsume'
                    act.exit_status = 2
                    act.output = '(Error: there is no available check time after time change!)'
                    act.check_time = time.time()
                    act.execution_time = 0
                else:
                    act.t_to_go = new_t

    def manage_signal(self, sig, frame):
        """Manage signals caught by the daemon
        signal.SIGUSR1 : dump_memory
        signal.SIGUSR2 : dump_object (nothing)
        signal.SIGTERM, signal.SIGINT : terminate process

        :param sig: signal caught by daemon
        :type sig: str
        :param frame: current stack frame
        :type frame:
        :return: None
        TODO: Refactor with Daemon one
        """
        logger.warning("%s > Received a SIGNAL %s", process.current_process(), sig)
        # If we got USR1, just dump memory
        if sig == signal.SIGUSR1:
            self.sched.need_dump_memory = True
        elif sig == signal.SIGUSR2:  # usr2, dump objects
            self.sched.need_objects_dump = True
        else:  # if not, die :)
            self.sched.die()
            self.must_run = False
        # Let the base Daemon handler run too (termination, ...)
        Daemon.manage_signal(self, sig, frame)

    def do_loop_turn(self):
        """Scheduler loop turn
        Basically wait initial conf and run

        :return: None
        """
        # Ok, now the conf
        self.wait_for_initial_conf()
        if not self.new_conf:
            return
        logger.info("New configuration received")
        self.setup_new_conf()
        logger.info("New configuration loaded")
        # Hand control to the scheduler; returns when the scheduler stops
        self.sched.run()

    def setup_new_conf(self):
        """Setup new conf received for scheduler

        Un-pickles the configuration pushed by the arbiter, registers the
        stats manager, builds the poller/reactionner links and loads
        everything into the embedded scheduler.

        :return: None
        """
        with self.conf_lock:
            new_c = self.new_conf
            conf_raw = new_c['conf']
            override_conf = new_c['override_conf']
            modules = new_c['modules']
            satellites = new_c['satellites']
            instance_name = new_c['instance_name']
            push_flavor = new_c['push_flavor']
            skip_initial_broks = new_c['skip_initial_broks']
            accept_passive_unknown_chk_res = new_c['accept_passive_unknown_check_results']
            api_key = new_c['api_key']
            secret = new_c['secret']
            http_proxy = new_c['http_proxy']
            statsd_host = new_c['statsd_host']
            statsd_port = new_c['statsd_port']
            statsd_prefix = new_c['statsd_prefix']
            statsd_enabled = new_c['statsd_enabled']

            # Hooray, we got a name, we can set it in our stats objects
            statsmgr.register(self.sched, instance_name, 'scheduler',
                              api_key=api_key, secret=secret, http_proxy=http_proxy,
                              statsd_host=statsd_host, statsd_port=statsd_port,
                              statsd_prefix=statsd_prefix, statsd_enabled=statsd_enabled)

            t00 = time.time()
            # NOTE(review): cPickle.loads on data received over the network is
            # unsafe with untrusted peers; kept as-is because the arbiter link
            # is assumed trusted in this version.
            conf = cPickle.loads(conf_raw)
            logger.debug("Conf received at %d. Unserialized in %d secs", t00, time.time() - t00)
            self.new_conf = None

            # Tag the conf with our data
            self.conf = conf
            self.conf.push_flavor = push_flavor
            self.conf.instance_name = instance_name
            self.conf.skip_initial_broks = skip_initial_broks
            self.conf.accept_passive_unknown_check_results = accept_passive_unknown_chk_res

            self.cur_conf = conf
            self.override_conf = override_conf
            self.modules = modules
            self.satellites = satellites
            # self.pollers = self.app.pollers

            if self.conf.human_timestamp_log:
                logger.set_human_format()

            # Now We create our pollers
            for pol_id in satellites['pollers']:
                poll = satellites['pollers'][pol_id]
                self.pollers[pol_id] = poll

                # The satellitemap may override address/port for this poller
                if poll['name'] in override_conf['satellitemap']:
                    poll = dict(poll)  # make a copy
                    poll.update(override_conf['satellitemap'][poll['name']])

                proto = 'http'
                if poll['use_ssl']:
                    proto = 'https'

                uri = '%s://%s:%s/' % (proto, poll['address'], poll['port'])
                self.pollers[pol_id]['uri'] = uri
                self.pollers[pol_id]['last_connection'] = 0

            # Now We create our reactionners
            for reac_id in satellites['reactionners']:
                reac = satellites['reactionners'][reac_id]
                self.reactionners[reac_id] = reac

                # The satellitemap may override address/port for this reactionner
                if reac['name'] in override_conf['satellitemap']:
                    reac = dict(reac)  # make a copy
                    reac.update(override_conf['satellitemap'][reac['name']])

                proto = 'http'
                # Fixed: was poll['use_ssl'] — a copy/paste leftover from the
                # pollers loop that used the wrong daemon's SSL flag (and a
                # NameError when no poller was configured)
                if reac['use_ssl']:
                    proto = 'https'

                uri = '%s://%s:%s/' % (proto, reac['address'], reac['port'])
                self.reactionners[reac_id]['uri'] = uri
                self.reactionners[reac_id]['last_connection'] = 0

            # First mix conf and override_conf to have our definitive conf
            for prop in self.override_conf:
                val = self.override_conf[prop]
                setattr(self.conf, prop, val)

            if self.conf.use_timezone != '':
                logger.debug("Setting our timezone to %s", str(self.conf.use_timezone))
                os.environ['TZ'] = self.conf.use_timezone
                time.tzset()

            if len(self.modules) != 0:
                logger.debug("I've got %s modules", str(self.modules))

            # TODO: if scheduler had previous modules instanciated it must clean them!
            self.modules_manager.set_modules(self.modules)
            self.do_load_modules()
            logger.info("Loading configuration.")
            self.conf.explode_global_conf()

            # we give sched it's conf
            self.sched.reset()
            self.sched.load_conf(self.conf)
            self.sched.load_satellites(self.pollers, self.reactionners)

            # We must update our Config dict macro with good value
            # from the config parameters
            self.sched.conf.fill_resource_macros_names_macros()
            # print "DBG: got macros", self.sched.conf.macros

            # Creating the Macroresolver Class & unique instance
            m_solver = MacroResolver()
            m_solver.init(self.conf)

            # self.conf.dump()
            # self.conf.quick_debug()

            # Now create the external commander
            # it's an applyer: its role is not to dispatch commands,
            # but to apply them
            ecm = ExternalCommandManager(self.conf, 'applyer')

            # Scheduler needs to know about external commands to
            # activate them if necessary
            self.sched.load_external_command(ecm)

            # External command needs the sched because it can raise checks
            ecm.load_scheduler(self.sched)

            # We clear our schedulers managed (it's us :) )
            # and set ourselves in it
            self.schedulers = {self.conf.instance_id: self.sched}

    def what_i_managed(self):
        """Get my managed dict (instance id and push_flavor)

        :return: dict containing instance_id key and push flavor value
        :rtype: dict
        """
        if hasattr(self, 'conf'):
            return {self.conf.instance_id: self.conf.push_flavor}
        return {}

    def main(self):
        """Main function for Scheduler, launch after the init::

        * Init daemon
        * Load module manager
        * Launch main loop
        * Catch any Exception that occurs

        :return: None
        """
        try:
            self.load_config_file()

            # Setting log level
            logger.setLevel(self.log_level)
            # Force the debug level if the daemon is said to start with such level
            if self.debug:
                logger.setLevel('DEBUG')

            self.look_for_early_exit()
            self.do_daemon_init_and_start()
            self.load_modules_manager()

            self.uri = self.http_daemon.uri
            logger.info("[scheduler] General interface is at: %s", self.uri)
            self.do_mainloop()
        # Fixed: was Python-2-only "except Exception, exp:" (exp was unused);
        # the "as" form is valid on both Python 2.6+ and Python 3
        except Exception:
            self.print_unrecoverable(traceback.format_exc())
            raise