def create_pack(self, buf, name):
    """Build a Pack from the JSON content of a pack file and store it in this container

    Nothing is stored (the problem is only logged) when the json library is not
    available, when the content is not valid JSON, or when the decoded document
    is not an object holding a 'name' entry.

    :param buf: buffer
    :type buf: str
    :param name: name of file
    :type name: str
    :return: None
    """
    if not json:
        logger.warning("[Pack] cannot load the pack file '%s': missing json lib", name)
        return
    # Ok, go compile the code
    try:
        json_dump = json.loads(buf)
        # A valid JSON document may also be a list or a scalar: only an object
        # can carry the pack name, so anything else is reported the same way
        if not isinstance(json_dump, dict) or 'name' not in json_dump:
            logger.error("[Pack] no name in the pack '%s'", name)
            return
        pack = Pack({})
        pack.pack_name = json_dump['name']
        pack.description = json_dump.get('description', '')
        pack.macros = json_dump.get('macros', {})
        # Without explicit templates, the pack name is used as the only template
        pack.templates = json_dump.get('templates', [pack.pack_name])
        pack.path = json_dump.get('path', 'various/')
        pack.doc_link = json_dump.get('doc_link', '')
        pack.services = json_dump.get('services', {})
        pack.commands = json_dump.get('commands', [])
        # The path is always stored with a trailing slash
        if not pack.path.endswith('/'):
            pack.path += '/'
        # Ok, add it
        self[pack._id] = pack
    except ValueError as exp:
        logger.error("[Pack] error in loading pack file '%s': '%s'", name, exp)
def is_correct(self):
    """Tell whether this macromodulation is usable

    Every previously recorded configuration error is logged, then each required
    property declared on the class (special properties excepted) must be set on
    the object. A missing modulation_period is created as None, meaning 24x7.

    :return: True if valid, otherwise False
    :rtype: bool
    """
    klass = self.__class__
    is_valid = True

    # Report the errors raised earlier, like unknown commands or timeperiods
    if self.configuration_errors != []:
        is_valid = False
        for message in self.configuration_errors:
            logger.error("[item::%s] %s", self.get_name(), message)

    for prop_name, definition in klass.properties.items():
        if prop_name in klass._special_properties:
            continue
        if hasattr(self, prop_name) or not definition.required:
            continue
        logger.error("[macromodulation::%s] %s property not set", self.get_name(), prop_name)
        is_valid = False

    # No period given: None stands for 24x7
    if not hasattr(self, 'modulation_period'):
        self.modulation_period = None

    return is_valid
def manage_finished_checks(self):
    """Poll the launched checks and hand the finished ones back

    Launched actions whose polling delay has elapsed are asked to update their
    status. Every action found 'done' or in 'timeout' is put on the returns
    queue and then removed from self.checks, so it is returned only once.
    REF: doc/alignak-action-queues.png (5)

    :return: None
    """
    finished = []
    now = time.time()
    for action in self.checks:
        if action.status == 'launched' and action.last_poll < now - action.wait_time:
            action.check_finished(self.max_plugins_output_length)
        # If action done, we can launch a new one
        if action.status in ('done', 'timeout'):
            finished.append(action)
            # We answer to the master
            try:
                self.returns_queue.put(action)
            except IOError as exp:
                # The returns queue is unusable: this worker can only stop
                logger.error("[%d] Exiting: %s", self._id, exp)
                sys.exit(2)

    # Removal is done after the walk, never while iterating over self.checks
    for action in finished:
        self.checks.remove(action)
def work(self, slave_q, returns_queue, control_q): """ Wrapper function for work in order to catch the exception to see the real work, look at do_work :param slave_q: Global Queue Master->Slave :type slave_q: Queue.Queue :param returns_queue: queue managed by manager :type returns_queue: Queue.Queue :param control_q: Control Queue for the worker :type control_q: Queue.Queue :return: None """ try: self.do_work(slave_q, returns_queue, control_q) # Catch any exception, try to print it and exit anyway except Exception, exp: output = cStringIO.StringIO() traceback.print_exc(file=output) logger.error( "Worker '%d' exit with an unmanaged exception : %slave_q", self._id, output.getvalue()) output.close() # Ok I die now raise
def try_instance_init(self, inst, late_start=False):
    """Try to "init" the given module instance.

    A retry is refused when the previous attempt is less than 5 seconds old,
    unless late_start is set. External modules get their queues created or
    updated before init() is called.

    :param inst: instance to init
    :type inst: object
    :param late_start: If late_start, don't look for last_init_try
    :type late_start: bool
    :return: True on successful init. False if the retry came too early or if the
             instance init method raised any Exception.
    :rtype: bool
    """
    try:
        logger.info("Trying to init module: %s", inst.get_name())
        inst.init_try += 1
        # Maybe it's a retry
        if not late_start and inst.init_try > 1:
            # Do not try until 5 sec, or it's too loopy
            if inst.last_init_try > time.time() - 5:
                return False
        inst.last_init_try = time.time()

        # If it's an external, create/update Queues()
        if inst.is_external:
            inst.create_queues(self.manager)

        inst.init()
    except Exception as err:
        logger.error("The instance %s raised an exception %s, I remove it!",
                     inst.get_name(), str(err))
        logger.error("Back trace of this remove: %s", traceback.format_exc())
        return False
    # Callers test the result with "not": success must be an explicit True
    return True
def get_hosts_by_explosion(self, hostgroups):
    """Collect the hosts of this group, recursing into its member hostgroups

    The group is flagged as already exploded, then each known member hostgroup
    is exploded in turn and its hosts are added to this group. The rec_tag flag
    guards the recursion: finding it already set means the hostgroup
    definitions loop, which is logged before the current hosts are returned.

    :param hostgroups: Hostgroup object
    :type hostgroups: alignak.objects.hostgroup.Hostgroups
    :return: list of hosts of this group
    :rtype: list
    """
    # Tag first, so a child calling back does not explode this group again
    self.already_explode = True

    if self.rec_tag:
        # rec_tag is reset before each explosion: set here means a loop
        logger.error("[hostgroup::%s] got a loop in hostgroup definition", self.get_name())
        return self.get_hosts()

    # Not a loop: tag the group and walk its members
    self.rec_tag = True

    for member_name in self.get_hostgroup_members():
        member_group = hostgroups.find_by_name(member_name.strip())
        if member_group is None:
            continue
        member_hosts = member_group.get_hosts_by_explosion(hostgroups)
        if member_hosts is None:
            continue
        self.add_string_member(member_hosts)

    return self.get_hosts()
def linkify_s_by_plug(self):
    """Replace, for each module, the names of its sub-modules by the module objects

    Names are stripped and empty ones ignored. A name matching no module of this
    collection is logged and recorded in the configuration errors of the module
    that referenced it.

    :return: None
    """
    for module in self:
        linked = []
        for raw_name in strip_and_uniq(module.modules):
            wanted = raw_name.strip()
            # don't read void names
            if wanted == '':
                continue

            # We are the modules, we search them :)
            found = self.find_by_name(wanted)
            if found is None:
                err = "[module] unknown %s module from %s" % (wanted, module.get_name())
                logger.error(err)
                module.configuration_errors.append(err)
                continue
            linked.append(found)
        module.modules = linked
def try_instance_init(self, inst, late_start=False):
    """Try to "init" the given module instance.

    A retry is refused when the previous attempt is less than 5 seconds old,
    unless late_start is set. External modules get their queues created or
    updated before init() is called.

    :param inst: instance to init
    :type inst: object
    :param late_start: If late_start, don't look for last_init_try
    :type late_start: bool
    :return: True on successful init. False if the retry came too early or if the
             instance init method raised any Exception.
    :rtype: bool
    """
    try:
        logger.info("Trying to init module: %s", inst.get_name())
        inst.init_try += 1
        # Maybe it's a retry
        if not late_start and inst.init_try > 1:
            # Do not try until 5 sec, or it's too loopy
            if inst.last_init_try > time.time() - 5:
                return False
        inst.last_init_try = time.time()

        # If it's an external, create/update Queues()
        if inst.is_external:
            inst.create_queues(self.manager)

        inst.init()
    except Exception as err:
        logger.error(
            "The instance %s raised an exception %s, I remove it!",
            inst.get_name(), str(err))
        logger.error("Back trace of this remove: %s", traceback.format_exc())
        return False
    # Callers test the result with "not": success must be an explicit True
    return True
def linkify_s_by_plug(self):
    """Replace, for each module, the names of its sub-modules by the module objects

    Names are stripped and empty ones ignored. A name matching no module of this
    collection is logged and recorded in the configuration errors of the module
    that referenced it.

    :return: None
    """
    for module in self:
        linked = []
        for raw_name in strip_and_uniq(module.modules):
            wanted = raw_name.strip()
            # don't read void names
            if wanted == '':
                continue

            # We are the modules, we search them :)
            found = self.find_by_name(wanted)
            if found is None:
                err = "[module] unknown %s module from %s" % (
                    wanted, module.get_name())
                logger.error(err)
                module.configuration_errors.append(err)
                continue
            linked.append(found)
        module.modules = linked
def _thread_run(self):
    """Loop pushing the pending object updates to mongo until a stop is requested

    A connection is opened, and checked with a collection listing, whenever none
    is available. The updates returned by test_and_get_objects_updates() are then
    written with do_updates(). Any failure drops the connection so that the next
    pass opens and checks a new one.

    :return: None
    """
    con = None
    while not self._stop_requested:
        if con is None:
            try:
                con = self._connect_to_mongo()
                db = con[self._db_name]
                db.collection_names()
            except PyMongoError as err:
                logger.error("Could not connect to mongo: %s", err)
                # Forget a connection that failed its check, or the next pass
                # would skip this block and use it as if it was validated
                con = None
                time.sleep(1)
                continue
        objects = self.test_and_get_objects_updates()
        if not objects:
            time.sleep(1)
            continue
        # as we don't use any lock around _objects_updated,
        # this little sleep should ensure that no more threads
        # will be able to use the previous self._objects_updated
        # stored locally here in 'objects'.
        time.sleep(0.1)
        try:
            self.do_updates(db, objects)
        except Exception as err:
            logger.exception("Fatal error updating objects in mongo: %s", err)
            # NOTE(review): the objects of this batch are not retried — confirm
            # this is acceptable for the callers
            con = None
def load_file(self, path):
    """Walk a directory tree and create a pack from every file ending in .pack

    Each pack file is read entirely and given to create_pack() together with its
    file name stripped of the extension. A file that cannot be read is logged
    and skipped.

    :param path: Path where file of pack are
    :type path: str
    :return: None
    """
    # Now walk for it
    for root, _dirs, files in os.walk(path):
        for p_file in files:
            if not re.search(r"\.pack$", p_file):
                continue
            pack_path = os.path.join(root, p_file)
            try:
                # The content is JSON, where every kind of line ending is plain
                # whitespace: the universal newlines mode is not needed
                with open(pack_path, 'r') as file_d:
                    buf = file_d.read()
            except IOError as exp:
                logger.error("Cannot open pack file '%s' for reading: %s", pack_path, exp)
                # ok, skip this one
                continue
            # p_file[:-5] drops the ".pack" suffix
            self.create_pack(buf, p_file[:-5])
def is_correct(self):
    """Tell whether this macromodulation is usable

    Every previously recorded configuration error is logged, then each required
    property declared on the class (special properties excepted) must be set on
    the object. A missing modulation_period is created as None, meaning 24x7.

    :return: True if valid, otherwise False
    :rtype: bool
    """
    klass = self.__class__
    is_valid = True

    # Report the errors raised earlier, like unknown commands or timeperiods
    if self.configuration_errors != []:
        is_valid = False
        for message in self.configuration_errors:
            logger.error("[item::%s] %s", self.get_name(), message)

    for prop_name, definition in klass.properties.items():
        if prop_name in klass._special_properties:
            continue
        if hasattr(self, prop_name) or not definition.required:
            continue
        logger.error("[macromodulation::%s] %s property not set", self.get_name(), prop_name)
        is_valid = False

    # No period given: None stands for 24x7
    if not hasattr(self, 'modulation_period'):
        self.modulation_period = None

    return is_valid
def method_patch(self, endpoint, data_json, headers, stop_inception=False):
    """Update an item with an HTTP PATCH request

    On a 412 answer caused by mismatching etags, the current etag is fetched with
    method_get(), stored in headers['If-Match'] (the given dictionary is modified)
    and the request is sent once more.

    :param endpoint: endpoint (API URL)
    :type endpoint: str
    :param data_json: properties of item to update
    :type data_json: str
    :param headers: headers (example: Content-Type). 'If-Match' required
    :type headers: dict
    :param stop_inception: if false try to get the right etag
    :type stop_inception: bool
    :return: dictionary with response of update fields
    :rtype: dict
    """
    response = requests.patch(endpoint, data_json, headers=headers)
    if response.status_code == 200:
        return response.json()
    elif response.status_code == 412:
        # 412 means Precondition failed
        logger.error(response.content)
        if 'Client and server etags don' in response.content:
            # update etag + retry
            if stop_inception:
                # NOTE(review): a string, not a dict, is returned when the retry
                # also failed — confirm what the callers expect here
                return '{}'
            resp = self.method_get(endpoint)
            headers['If-Match'] = resp['_etag']
            return self.method_patch(endpoint, data_json, headers, True)
        # NOTE(review): any other 412 answer falls through and returns None
    else:
        logger.error("%s: %s for %s", response.status_code, response.content, endpoint)
        return response.json()
def get_instances(self):
    """Create and init the module instances, then return them.

    Previous instances are cleared first. A module whose instance cannot be
    created, or is not a BaseModule, is only logged and skipped. Every created
    instance that is not external is initialized right away; the ones failing
    their init are queued in to_restart.
    Arbiter call this method with start_external=False

    :return: module instances list
    :rtype: list
    """
    self.clear_instances()

    for mod_conf, module in self.modules_assoc:
        mod_conf.properties = module.properties.copy()
        try:
            instance = module.get_instance(mod_conf)
            if not isinstance(instance, BaseModule):
                raise TypeError('Returned instance is not of type BaseModule (%s) !'
                                % type(instance))
        except Exception as err:
            logger.error("The module %s raised an exception %s, I remove it! traceback=%s",
                         mod_conf.get_name(), err, traceback.format_exc())
            continue
        # Give the module the data to which module it is load from
        instance.set_loaded_into(self.modules_type)
        self.instances.append(instance)

    for instance in self.instances:
        # External are not init now, but only when they are started
        if instance.is_external or self.try_instance_init(instance):
            continue
        # If the init failed, we put in in the restart queue
        logger.warning("The module '%s' failed to init, I will try to restart it later",
                       instance.get_name())
        self.to_restart.append(instance)

    return self.instances
def linkify_sd_by_s(self, hosts, services):
    """Replace dependent_service_description and service_description
    in service dependency by the real object

    A dependency whose service cannot be found on either side is removed from
    this collection: a warning is recorded when the host excludes that service,
    an error otherwise. A dependency lacking one of the needed attributes is
    logged and removed as well.

    :param hosts: host list, used to look for a specific one
    :type hosts: alignak.objects.host.Hosts
    :param services: service list to look for a specific one
    :type services: alignak.objects.service.Services
    :return: None
    """
    errors = self.configuration_errors
    warns = self.configuration_warnings
    unusable = []

    def find_service(hst_name, s_name):
        """Return the service object, or None once the reason has been recorded"""
        found = services.find_srv_by_name_and_hostname(hst_name, s_name)
        if found is None:
            host = hosts.find_by_name(hst_name)
            if host and host.is_excluded_for_sdesc(s_name):
                warns.append("Service %s is excluded from host %s ; "
                             "removing this servicedependency as it's unusuable."
                             % (s_name, hst_name))
            else:
                errors.append("Service %s not found for host %s" % (s_name, hst_name))
        return found

    for servicedep in self:
        try:
            # The dependent side is resolved, and replaced, first
            s_name = servicedep.dependent_service_description
            hst_name = servicedep.dependent_host_name
            dependent = find_service(hst_name, s_name)
            if dependent is None:
                unusable.append(servicedep)
                continue
            servicedep.dependent_service_description = dependent

            # Then the service it depends on
            s_name = servicedep.service_description
            hst_name = servicedep.host_name
            parent = find_service(hst_name, s_name)
            if parent is None:
                unusable.append(servicedep)
                continue
            servicedep.service_description = parent
        except AttributeError as err:
            logger.error("[servicedependency] fail to linkify by service %s: %s",
                         servicedep, err)
            unusable.append(servicedep)

    # Removal happens once the walk over the collection is over
    for servicedep in unusable:
        self.remove_item(servicedep)
def linkify_sd_by_s(self, hosts, services):
    """Replace dependent_service_description and service_description
    in service dependency by the real object

    A dependency whose service cannot be found on either side is removed from
    this collection: a warning is recorded when the host excludes that service,
    an error otherwise. A dependency lacking one of the needed attributes is
    logged and removed as well.

    :param hosts: host list, used to look for a specific one
    :type hosts: alignak.objects.host.Hosts
    :param services: service list to look for a specific one
    :type services: alignak.objects.service.Services
    :return: None
    """
    errors = self.configuration_errors
    warns = self.configuration_warnings
    unusable = []

    def find_service(hst_name, s_name):
        """Return the service object, or None once the reason has been recorded"""
        found = services.find_srv_by_name_and_hostname(hst_name, s_name)
        if found is None:
            host = hosts.find_by_name(hst_name)
            if host and host.is_excluded_for_sdesc(s_name):
                warns.append("Service %s is excluded from host %s ; "
                             "removing this servicedependency as it's unusuable."
                             % (s_name, hst_name))
            else:
                errors.append("Service %s not found for host %s" % (s_name, hst_name))
        return found

    for servicedep in self:
        try:
            # The dependent side is resolved, and replaced, first
            s_name = servicedep.dependent_service_description
            hst_name = servicedep.dependent_host_name
            dependent = find_service(hst_name, s_name)
            if dependent is None:
                unusable.append(servicedep)
                continue
            servicedep.dependent_service_description = dependent

            # Then the service it depends on
            s_name = servicedep.service_description
            hst_name = servicedep.host_name
            parent = find_service(hst_name, s_name)
            if parent is None:
                unusable.append(servicedep)
                continue
            servicedep.service_description = parent
        except AttributeError as err:
            logger.error("[servicedependency] fail to linkify by service %s: %s",
                         servicedep, err)
            unusable.append(servicedep)

    # Removal happens once the walk over the collection is over
    for servicedep in unusable:
        self.remove_item(servicedep)
def push_external_commands_to_schedulers(self):
    """Send a HTTP request to the schedulers (POST /run_external_commands)
    with external command list if the receiver is in direct routing.
    If not in direct_routing just move the unprocessed external commands
    to the external_commands list and return

    :return: None
    """
    # If we are not in a direct routing mode, just bailout after
    # faking resolving the commands
    if not self.direct_routing:
        self.external_commands.extend(self.unprocessed_external_commands)
        self.unprocessed_external_commands = []
        return

    commands_to_process = self.unprocessed_external_commands
    self.unprocessed_external_commands = []

    # Now get all external commands and put them into the
    # good schedulers
    for ext_cmd in commands_to_process:
        self.external_command.resolve_command(ext_cmd)

    # Now for all alive schedulers, send the commands
    for sched_id in self.schedulers:
        sched = self.schedulers[sched_id]
        extcmds = sched['external_commands']
        cmds = [extcmd.cmd_line for extcmd in extcmds]
        con = sched.get('con', None)
        if not con:
            logger.warning("The scheduler is not connected %s", sched)
            self.pynag_con_init(sched_id)
            con = sched.get('con', None)

        # If there are commands and the scheduler is alive
        if len(cmds) > 0 and con:
            logger.debug("Sending %d commands to scheduler %s", len(cmds), sched)
            try:
                # NOTE(review): nothing in this function empties
                # sched['external_commands'] once the post succeeded — confirm
                # that it is done elsewhere, or the commands are sent again
                con.post('run_external_commands', {'cmds': cmds})
            # Not connected or sched is gone
            except (HTTPEXCEPTIONS, KeyError) as exp:
                logger.debug('manage_returns exception:: %s,%s ', type(exp), str(exp))
                self.pynag_con_init(sched_id)
                # The remaining schedulers are not served on this call
                return
            except AttributeError as exp:  # the scheduler must not be initialized
                logger.debug('manage_returns exception:: %s,%s ', type(exp), str(exp))
            except Exception as exp:
                logger.error("A satellite raised an unknown exception: %s (%s)",
                             exp, type(exp))
                raise
def reaper(self):
    """Periodically send the collected stats to the metric server

    Every 60 seconds the pending self.stats are taken and replaced by an empty
    dictionary. They are turned into "<type>.<name>.<key>.<avg|min|max|count>
    <value> <timestamp>" lines, added to the "metrics" list of the structure
    given by self.app.get_stats_struct(), dumped as JSON, passed through
    self._encrypt() and PUT on self.con. Nothing is sent when the Crypto library
    cannot be imported. Stats taken while the name, the api key or the secret is
    missing are dropped. This method never returns.

    :return: None
    """
    try:
        from Crypto.Cipher import AES
    except ImportError:
        logger.error("Cannot find python lib crypto: stats export is not available")
        AES = None  # pylint: disable=C0103
    while True:
        now = int(time.time())
        stats = self.stats
        self.stats = {}

        # If we are not in an initializer daemon we skip, we cannot have a real name, it sucks
        # to find the data after this
        if not self.name or not self.api_key or not self.secret:
            time.sleep(60)
            continue

        metrics = []
        for key, elem in stats.items():
            namekey = "%s.%s.%s" % (self.type, self.name, key)
            _min, _max, number, _sum = elem
            # presumably number is never 0 here, a zero count would raise
            # ZeroDivisionError — verify against the code filling self.stats
            _avg = float(_sum) / number
            metrics.append("%s.avg %f %d" % (namekey, _avg, now))
            metrics.append("%s.min %f %d" % (namekey, _min, now))
            metrics.append("%s.max %f %d" % (namekey, _max, now))
            metrics.append("%s.count %f %d" % (namekey, number, now))

        # get the inner data for the daemon
        struct = self.app.get_stats_struct()
        struct["metrics"].extend(metrics)
        j = json.dumps(struct)
        if AES is not None and self.secret != "":
            # The secret is deliberately kept out of the logs
            logger.debug("Stats PUT to kernel.alignak.io/api/v1/put/ with %s", self.api_key)

            # assume a %16 length messagexs
            encrypted_text = self._encrypt(j)
            try:
                self.con.put("/api/v1/put/?api_key=%s" % (self.api_key), encrypted_text)
            except HTTPException as exp:
                logger.error("Stats REAPER cannot put to the metric server %s", exp)
        time.sleep(60)
def load_statsd(self):
    """Resolve the statsd host and create the UDP socket used to reach it

    On success statsd_addr holds the (ip, port) pair and statsd_sock the
    datagram socket. On failure the error is logged and neither attribute
    is assigned.

    :return: None
    """
    try:
        self.statsd_addr = (socket.gethostbyname(self.statsd_host), self.statsd_port)
        self.statsd_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    except (socket.error, socket.gaierror) as exp:
        logger.error("Cannot create statsd socket: %s", exp)
def start_module(self):
    """Wrapper for _main function.
    Log the traceback of any exception occurring in the main function,
    then let the exception go up

    :return: None
    """
    try:
        self._main()
    except Exception:
        logger.error('[%s] %s', self.name, traceback.format_exc())
        # A bare raise re-raises the very same exception with its traceback
        raise
def is_correct(self):
    """Check that the day is a known weekday and that the parent class checks pass

    The parent check is always run, even when the day is already found invalid.

    :return: True if weekdays are valid, False otherwise
    :rtype: bool
    """
    day_is_known = self.day in Daterange.weekdays
    if not day_is_known:
        logger.error("Error: %s is not a valid day", self.day)
    # Check also if Daterange is correct.
    parent_is_correct = super(StandardDaterange, self).is_correct()
    return day_is_known & parent_is_correct
def load_statsd(self):
    """Resolve the statsd host and create the UDP socket used to reach it

    On success statsd_addr holds the (ip, port) pair and statsd_sock the
    datagram socket. On failure the error is logged and neither attribute
    is assigned.

    :return: None
    """
    try:
        self.statsd_addr = (socket.gethostbyname(self.statsd_host), self.statsd_port)
        self.statsd_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    except (socket.error, socket.gaierror) as exp:
        logger.error('Cannot create statsd socket: %s', exp)
def push_external_commands_to_schedulers(self):
    """Send a HTTP request to the schedulers (POST /run_external_commands)
    with external command list if the receiver is in direct routing.
    If not in direct_routing just move the unprocessed external commands
    to the external_commands list and return

    :return: None
    """
    # If we are not in a direct routing mode, just bailout after
    # faking resolving the commands
    if not self.direct_routing:
        self.external_commands.extend(self.unprocessed_external_commands)
        self.unprocessed_external_commands = []
        return

    commands_to_process = self.unprocessed_external_commands
    self.unprocessed_external_commands = []

    # Now get all external commands and put them into the
    # good schedulers
    for ext_cmd in commands_to_process:
        self.external_command.resolve_command(ext_cmd)

    # Now for all alive schedulers, send the commands
    for sched_id in self.schedulers:
        sched = self.schedulers[sched_id]
        extcmds = sched['external_commands']
        cmds = [extcmd.cmd_line for extcmd in extcmds]
        con = sched.get('con', None)
        if not con:
            logger.warning("The scheduler is not connected %s", sched)
            self.pynag_con_init(sched_id)
            con = sched.get('con', None)

        # If there are commands and the scheduler is alive
        if len(cmds) > 0 and con:
            logger.debug("Sending %d commands to scheduler %s", len(cmds), sched)
            try:
                # NOTE(review): nothing in this function empties
                # sched['external_commands'] once the post succeeded — confirm
                # that it is done elsewhere, or the commands are sent again
                con.post('run_external_commands', {'cmds': cmds})
            # Not connected or sched is gone
            except (HTTPEXCEPTIONS, KeyError) as exp:
                logger.debug('manage_returns exception:: %s,%s ', type(exp), str(exp))
                self.pynag_con_init(sched_id)
                # The remaining schedulers are not served on this call
                return
            except AttributeError as exp:  # the scheduler must not be initialized
                logger.debug('manage_returns exception:: %s,%s ', type(exp), str(exp))
            except Exception as exp:
                logger.error("A satellite raised an unknown exception: %s (%s)", exp, type(exp))
                raise
def linkify_hd_by_tp(self, timeperiods):
    """Replace dependency_period by a real object in host dependency

    The period becomes whatever timeperiods.find_by_name() returns for its name.
    A dependency raising AttributeError while being linked is logged and left
    unchanged.

    :param timeperiods: list of timeperiod, used to look for a specific one
    :type timeperiods: alignak.objects.timeperiod.Timeperiods
    :return: None
    """
    for hostdep in self:
        try:
            tp_name = hostdep.dependency_period
            timeperiod = timeperiods.find_by_name(tp_name)
            hostdep.dependency_period = timeperiod
        except AttributeError as exp:
            logger.error("[hostdependency] fail to linkify by timeperiod: %s", exp)
def linkify_sd_by_tp(self, timeperiods):
    """Replace dependency_period by a real object in service dependency

    The period becomes whatever timeperiods.find_by_name() returns for its name.
    A dependency raising AttributeError while being linked is logged and left
    unchanged.

    :param timeperiods: list of timeperiod, used to look for a specific one
    :type timeperiods: alignak.objects.timeperiod.Timeperiods
    :return: None
    """
    for servicedep in self:
        try:
            tp_name = servicedep.dependency_period
            timeperiod = timeperiods.find_by_name(tp_name)
            servicedep.dependency_period = timeperiod
        except AttributeError as exp:
            logger.error("[servicedependency] fail to linkify by timeperiods: %s", exp)
def do_manage_returns(self):
    """Manage the checks and then send a HTTP request to schedulers (POST /put_results)
    REF: doc/alignak-action-queues.png (6)

    Each active scheduler with pending results gets them posted. The results are
    cleared once sent; when the send did not succeed the scheduler connection is
    reset to None so that it is initialized again later.

    :return: None
    """
    # For all schedulers, we check for wait_homerun
    # and we send back results
    for sched_id, sched in self.schedulers.items():
        if not sched['active']:
            continue
        results = sched['wait_homerun']
        # NB: it's **mostly** safe for us to not use some lock around
        # this 'results' / sched['wait_homerun'].
        # Because it can only be modified (for adding new values) by the
        # same thread running this function (that is the main satellite
        # thread), and this occurs exactly in self.manage_action_return().
        # Another possibility is for the sched['wait_homerun'] to be
        # cleared within/by :
        # ISchedulers.get_returns() -> Satelitte.get_return_for_passive()
        # This can so happen in an (http) client thread.
        if not results:
            # Nothing for this scheduler: the next ones may still have results
            continue
        # So, at worst, some results would be received twice on the
        # scheduler level, which shouldn't be a problem given they are
        # indexed by their "action_id".

        send_ok = False
        try:
            con = sched.get('con')
            if con is None:  # None = not initialized
                con = self.pynag_con_init(sched_id)
            if con:
                con.post('put_results', {'results': list(results.values())})
                send_ok = True
        except HTTPEXCEPTIONS as err:
            logger.error('Could not send results to scheduler %s : %s',
                         sched['name'], err)
        except Exception as err:
            logger.exception(
                "Unhandled exception trying to send results "
                "to scheduler %s: %s", sched['name'], err)
            raise
        finally:
            if send_ok:
                results.clear()
            else:
                # if - and only if - send was not ok,
                # then "de-init" the sched connection:
                sched['con'] = None
def add(self, elt):
    """Add elt to this broker

    The element is dispatched according to the my_type of its class:
    'brok' -> tagged with instance_id 0 and stored in self.broks_internal_raised
    'externalcommand' -> stored in self.external_commands
    'message' -> handled here: 'NeedData' resets the connection and the running
    id of the requested scheduler, 'ICrash' logs the name and the traceback of
    the crashed module.

    :param elt: object to add
    :type elt: object
    :return: None
    """
    cls_type = elt.__class__.my_type
    if cls_type == 'brok':
        # For brok, we TAG brok with our instance_id
        elt.instance_id = 0
        self.broks_internal_raised.append(elt)
        return
    elif cls_type == 'externalcommand':
        # Log the content of the command itself
        logger.debug("Enqueuing an external command '%s'", elt.__dict__)
        self.external_commands.append(elt)
    # Maybe we got a Message from the modules, it's way to ask something
    # like from now a full data from a scheduler for example.
    elif cls_type == 'message':
        # We got a message, great!
        logger.debug("%s", elt.__dict__)
        if elt.get_type() == 'NeedData':
            data = elt.get_data()
            # Full instance id means: I got no data for this scheduler
            # so give me all dumbass!
            if 'full_instance_id' in data:
                c_id = data['full_instance_id']
                source = elt.source
                logger.info('The module %s is asking me to get all initial data '
                            'from the scheduler %d',
                            source, c_id)
                # so we just reset the connection and the running_id,
                # it will just get all new things
                try:
                    self.schedulers[c_id]['con'] = None
                    self.schedulers[c_id]['running_id'] = 0
                except KeyError:  # maybe this instance was not known, forget it
                    logger.warning("the module %s ask me a full_instance_id "
                                   "for an unknown ID (%d)!", source, c_id)
        # Maybe a module tells me that it's dead, I must log it's last words...
        if elt.get_type() == 'ICrash':
            data = elt.get_data()
            logger.error('the module %s just crash! Please look at the traceback:',
                         data['name'])
            logger.error(data['trace'])
def check_exclude_rec(self):
    """Tag this timeperiod and walk its excluded timeperiods to detect loops

    A timeperiod found already tagged is part of an exclusion loop: this is
    logged. The result of the check made on the excluded timeperiods is not
    taken into account in the returned value.

    :return: if tagged return false, if not true
    :rtype: bool
    """
    if not self.rec_tag:
        self.rec_tag = True
        for excluded in self.exclude:
            excluded.check_exclude_rec()
        return True

    logger.error("[timeentry::%s] is in a loop in exclude parameter", self.get_name())
    return False
def do_manage_returns(self):
    """Manage the checks and then send a HTTP request to schedulers (POST /put_results)
    REF: doc/alignak-action-queues.png (6)

    Each active scheduler with pending results gets them posted. The results are
    cleared once sent; when the send did not succeed the scheduler connection is
    reset to None so that it is initialized again later.

    :return: None
    """
    # For all schedulers, we check for wait_homerun
    # and we send back results
    for sched_id, sched in self.schedulers.items():
        if not sched['active']:
            continue
        results = sched['wait_homerun']
        # NB: it's **mostly** safe for us to not use some lock around
        # this 'results' / sched['wait_homerun'].
        # Because it can only be modified (for adding new values) by the
        # same thread running this function (that is the main satellite
        # thread), and this occurs exactly in self.manage_action_return().
        # Another possibility is for the sched['wait_homerun'] to be
        # cleared within/by :
        # ISchedulers.get_returns() -> Satelitte.get_return_for_passive()
        # This can so happen in an (http) client thread.
        if not results:
            # Nothing for this scheduler: the next ones may still have results
            continue
        # So, at worst, some results would be received twice on the
        # scheduler level, which shouldn't be a problem given they are
        # indexed by their "action_id".

        send_ok = False
        try:
            con = sched.get('con')
            if con is None:  # None = not initialized
                con = self.pynag_con_init(sched_id)
            if con:
                con.post('put_results', {'results': list(results.values())})
                send_ok = True
        except HTTPEXCEPTIONS as err:
            logger.error('Could not send results to scheduler %s : %s',
                         sched['name'], err)
        except Exception as err:
            logger.exception("Unhandled exception trying to send results "
                             "to scheduler %s: %s", sched['name'], err)
            raise
        finally:
            if send_ok:
                results.clear()
            else:
                # if - and only if - send was not ok,
                # then "de-init" the sched connection:
                sched['con'] = None
def is_correct(self):
    """Check if the Daterange is correct : weekdays are valid

    The start weekday (swday) and the end weekday (ewday) must both be in the
    0..6 range. Each one is checked on its own, so only the value really at
    fault is reported in the log.

    :return: True if weekdays are valid, False otherwise
    :rtype: bool
    """
    valid = True
    for weekday in (self.swday, self.ewday):
        if weekday not in range(7):
            logger.error("Error: %s is not a valid day", weekday)
            valid = False
    return valid
def init(self):
    """Initialize Graphite connection

    A failure to connect is only logged: the socket is closed and self.con
    is set to None.

    :return: None or socket
    """
    logger.info("[Graphite] initializing connection to %s:%d ...",
                str(self.host), self.port)
    sock = None
    try:
        sock = socket()
        sock.connect((self.host, self.port))
        self.con = sock
    except IOError as e:
        logger.error("[Graphite] Graphite Carbon instance connexion failed"
                     " IOError: %s", str(e))
        # do not raise an exception - logging is enough ...
        # but release the socket that could not be connected
        if sock is not None:
            sock.close()
        self.con = None

    return self.con
def launch_new_checks(self):
    """Execute every check waiting in the 'queue' status
    REF: doc/alignak-action-queues.png (4)

    The idle time is reset for each executed check. When an execution reports
    'toomanyopenfiles', the worker is flagged as dying.

    :return: None
    """
    for chk in self.checks:
        if chk.status != 'queue':
            continue
        self._idletime = 0
        outcome = chk.execute()
        # Maybe we got a true big problem in the action launching
        if outcome == 'toomanyopenfiles':
            # We should die as soon as we return all checks
            logger.error("[%d] I am dying Too many open files %s ... ", self._id, chk)
            self.i_am_dying = True
def is_correct(self):
    """Check every daterange of this timeperiod

    Each invalid daterange is logged as an error. The invalid entries are
    logged as warnings only, they do not change the result.

    :return: false if at least one datarange is invalid
    :rtype: bool
    """
    all_valid = True
    for daterange in self.dateranges:
        daterange_ok = daterange.is_correct()
        all_valid &= daterange_ok
        if not daterange_ok:
            logger.error("[timeperiod::%s] invalid daterange ", self.get_name())

    # Warn about non correct entries
    for bad_entry in self.invalid_entries:
        logger.warning("[timeperiod::%s] invalid entry '%s'", self.get_name(), bad_entry)
    return all_valid
def is_correct(self):
    """Check if the CheckModulation definition is correct::

    * Check for required attribute
    * Raise previous configuration errors
    * Check that a valid check_command is defined

    A missing check_period is created as None, meaning 24x7.

    :return: True if the definition is correct, False otherwise
    :rtype: bool
    """
    state = True
    cls = self.__class__

    # Raised all previously saw errors like unknown commands or timeperiods
    if self.configuration_errors != []:
        state = False
        for err in self.configuration_errors:
            logger.error("[item::%s] %s", self.get_name(), err)

    for prop, entry in cls.properties.items():
        if prop not in cls._special_properties:
            if not hasattr(self, prop) and entry.required:
                logger.error("[checkmodulation::%s] %s property not set",
                             self.get_name(), prop)
                state = False  # Bad boy...

    # Ok now we manage special cases...
    # Service part
    if not hasattr(self, 'check_command'):
        logger.error("[checkmodulation::%s] do not have any check_command defined",
                     self.get_name())
        state = False
    elif self.check_command is None:
        logger.error("[checkmodulation::%s] a check_command is missing", self.get_name())
        state = False
    # Validity can only be asked to a command that is really there
    elif not self.check_command.is_valid():
        logger.error("[checkmodulation::%s] a check_command is invalid", self.get_name())
        state = False

    # Ok just put None as check_period, means 24x7
    if not hasattr(self, 'check_period'):
        self.check_period = None

    return state
def stop_process(self):
    """Stop the module process, if any, and forget it

    The process is first terminated and joined for one second. If it is still
    alive it is killed and joined once more; a process surviving the kill is
    only reported in the log.

    :return: None
    """
    if not self.process:
        return

    logger.info("I'm stopping module %r (pid=%s)", self.get_name(), self.process.pid)
    self.process.terminate()
    self.process.join(timeout=1)

    still_running = self.process.is_alive()
    if still_running:
        logger.warning("%r is still alive normal kill, I help it to die", self.get_name())
        self.kill()
        self.process.join(1)
        if self.process.is_alive():
            logger.error("%r still alive after brutal kill, I leave it.", self.get_name())

    self.process = None
def linkify(self, timeperiods):
    """Replace the names found in exclude by the matching timeperiod objects

    Names are stripped before the lookup. A name matching no timeperiod is
    logged and dropped. exclude always ends as a list, empty when nothing
    was excluded.

    :param timeperiods: Timeperiods object
    :type timeperiods:
    :return: None
    """
    resolved = []
    if hasattr(self, 'exclude') and self.exclude != []:
        logger.debug("[timeentry::%s] have excluded %s", self.get_name(), self.exclude)
        for tp_name in self.exclude:
            match = timeperiods.find_by_name(tp_name.strip())
            if match is None:
                logger.error("[timeentry::%s] unknown %s timeperiod", self.get_name(), tp_name)
                continue
            resolved.append(match)
    self.exclude = resolved
def check_alive_instances(self):
    """Check that the module instances are alive and schedule restarts

    An instance that is not already in ``to_restart`` gets its queues cleared
    and is appended to ``to_restart`` when:

    * it is external and its process is not alive anymore, or
    * its ``to_q`` queue holds more than ``max_queue_size`` items
      (this second check is skipped when ``max_queue_size`` is 0).

    :return: None
    """
    for inst in self.instances:
        if inst in self.to_restart:
            continue

        # Only external modules own a process
        if inst.is_external and not inst.process.is_alive():
            logger.error("The external module %s goes down unexpectedly!", inst.get_name())
            logger.info("Setting the module %s to restart", inst.get_name())
            # We clean its queues, they are no more useful
            inst.clear_queues(self.manager)
            self.to_restart.append(inst)
            # Ok, no need to look at queue size now
            continue

        # Now look at the queue size. If above the limit, the module probably
        # got a huge problem, so bail out. It's not a perfect solution, more a
        # watchdog. If max_queue_size is 0, don't check this
        if self.max_queue_size == 0:
            continue

        # Ok, go launch the dog!
        queue_size = 0
        try:
            queue_size = inst.to_q.qsize()
        except Exception as exp:  # pylint: disable=broad-except
            # Best effort only: when the size cannot be read the queue is
            # considered as empty, but the reason is kept in the debug log
            logger.debug("Cannot get the queue size of the module %s: %s",
                         inst.get_name(), exp)

        if queue_size > self.max_queue_size:
            logger.error("The external module %s got a too high brok queue size (%s > %s)!",
                         inst.get_name(), queue_size, self.max_queue_size)
            logger.info("Setting the module %s to restart", inst.get_name())
            # We clean its queues, they are no more useful
            inst.clear_queues(self.manager)
            self.to_restart.append(inst)
def get_services_by_explosion(self, servicegroups):
    """Merge the members of the sub servicegroups into this servicegroup

    Every name returned by ``get_servicegroup_members`` is looked up in
    ``servicegroups``; each group found is exploded recursively and its
    result is added to this group with ``add_string_member``.

    :param servicegroups: servicegroups object, used to find a group by name
    :type servicegroups: object
    :return: the ``members`` attribute, or an empty string when it is not set
    :rtype: str or list
    """
    # Tag this group first, so it is known as exploded if one of its sons
    # refers to it
    self.already_explode = True

    # Finding rec_tag already raised means we came back to a group that is
    # being exploded: the definitions are looping... not GOOD!
    if self.rec_tag:
        logger.error("[servicegroup::%s] got a loop in servicegroup definition",
                     self.get_name())
        return getattr(self, 'members', '')

    # Ok, not a loop: tag the group and go on with the recursion
    self.rec_tag = True

    for child_name in self.get_servicegroup_members():
        child = servicegroups.find_by_name(child_name.strip())
        if child is None:
            continue
        child_members = child.get_services_by_explosion(servicegroups)
        if child_members is not None:
            self.add_string_member(child_members)

    return getattr(self, 'members', '')
def stop_process(self):
    """Request the module process to stop and release it

    The process first receives a terminate request and is joined for one
    second. When it survives, ``self.kill()`` is called, followed by a second
    join of one second. ``self.process`` is then set to None, whether the
    process really died or not. Without any process, nothing happens.

    :return: None
    """
    if not self.process:
        return

    logger.info("I'm stopping module %r (pid=%s)", self.get_name(), self.process.pid)
    self.process.terminate()
    self.process.join(timeout=1)

    survived_terminate = self.process.is_alive()
    if survived_terminate:
        logger.warning(
            "%r is still alive normal kill, I help it to die", self.get_name())
        self.kill()
        self.process.join(1)

        survived_kill = self.process.is_alive()
        if survived_kill:
            logger.error(
                "%r still alive after brutal kill, I leave it.", self.get_name())

    # Release the process, even if it did not die
    self.process = None
def get_contacts_by_explosion(self, contactgroups):
    """Merge the members of the sub contactgroups into this contactgroup

    Every name returned by ``get_contactgroup_members`` is looked up in
    ``contactgroups``; each group found is exploded recursively and its
    result is added to this group with ``add_string_member``.

    :param contactgroups: Contactgroups object, used to look for a specific one
    :type contactgroups: alignak.objects.contactgroup.Contactgroups
    :return: the ``members`` attribute, or an empty string when it is not set
    :rtype: str or list
    """
    # Tag this group first, so it is known as exploded if one of its sons
    # refers to it
    self.already_explode = True

    # Finding rec_tag already raised means we came back to a group that is
    # being exploded: the definitions are looping... not GOOD!
    if self.rec_tag:
        logger.error("[contactgroup::%s] got a loop in contactgroup definition",
                     self.get_name())
        return getattr(self, 'members', '')

    # Ok, not a loop: tag the group and go on with the recursion
    self.rec_tag = True

    for child_name in self.get_contactgroup_members():
        child = contactgroups.find_by_name(child_name.strip())
        if child is None:
            continue
        child_members = child.get_contacts_by_explosion(contactgroups)
        if child_members is not None:
            self.add_string_member(child_members)

    return getattr(self, 'members', '')
def _get_all_objects(self, object_type): for attempt in range(3): try: objects_manager = getattr( self.surveil_client.config, object_type, ) list_kwargs = {} if object_type in ['hosts', 'services']: list_kwargs['templates'] = True return objects_manager.list(**list_kwargs) except Exception as exp: logger.error( "[surveil-config] Could not get %s objects from Surveil - try %s/3", (object_type, attempt) ) logger.error('[surveil-config]' + str(exp)) time.sleep(10) #TODO: The arbiter should stop completely. raise Exception("Could not load config from Surveil")
def is_correct(self):
    """Check if the group is valid

    A group is valid when it has no unknown member and no configuration
    error. Every unknown member and every configuration error is logged.

    :return: True if the group is correct, otherwise False
    :rtype: bool
    """
    unknown = self.unknown_members
    if unknown:
        for member in unknown:
            logger.error("[itemgroup::%s] as %s, got unknown member %s",
                         self.get_name(), self.__class__.my_type, member)

    has_conf_errors = self.configuration_errors != []
    if has_conf_errors:
        for err in self.configuration_errors:
            logger.error("[itemgroup] %s", err)

    return not (unknown or has_conf_errors)
def manage_initial_service_status_brok(self, b):
    """Initialize the cache entry of a service

    The entry is stored in ``services_cache`` under "host_name/service_description".
    It is created empty and only receives the ``_GRAPHITE_POST`` custom
    variable when the brok carries one. Nothing is stored when the host is
    not in ``hosts_cache``.

    :param b: brok holding the initial status of the service in its ``data``
    :type b: object
    :return: None
    """
    data = b.data
    host_name = data['host_name']
    service_id = "/".join((host_name, data['service_description']))
    logger.info("[Graphite] got initial service status: %s", service_id)

    if host_name not in self.hosts_cache:
        logger.error("[Graphite] initial service status, host is unknown: %s.", service_id)
        return

    self.services_cache[service_id] = {}
    customs = b.data['customs']
    if '_GRAPHITE_POST' in customs:
        self.services_cache[service_id]['_GRAPHITE_POST'] = customs['_GRAPHITE_POST']

    logger.debug("[Graphite] initial service status received: %s", service_id)
def get_services_by_explosion(self, servicegroups):
    """Explode this servicegroup: add the members of its sub servicegroups

    The names given by ``get_servicegroup_members`` are searched in
    ``servicegroups``. Each group found is exploded in turn and what it
    returns is handed to ``add_string_member``.

    :param servicegroups: servicegroups object, used to find a group by name
    :type servicegroups: object
    :return: the ``members`` attribute, or an empty string when it is not set
    :rtype: str or list
    """
    # First we tag the group, so it is known as exploded
    # if a son of it refers to it
    self.already_explode = True

    # Now the recursive part: if rec_tag is already True here,
    # we are back on a group being exploded, so there is a loop
    # in the servicegroup calls... not GOOD!
    looping = self.rec_tag
    if looping:
        logger.error("[servicegroup::%s] got a loop in servicegroup definition",
                     self.get_name())
    else:
        # Ok, not a loop, we tag it and continue
        self.rec_tag = True
        for sub_name in self.get_servicegroup_members():
            sub_group = servicegroups.find_by_name(sub_name.strip())
            if sub_group is None:
                continue
            exploded = sub_group.get_services_by_explosion(servicegroups)
            if exploded is not None:
                self.add_string_member(exploded)

    return self.members if hasattr(self, 'members') else ''
def get_new_broks(self, i_type='scheduler'): """Get new broks from daemon defined in type parameter :param i_type: type of object :type i_type: str :return: None """ # Get the good links tab for looping.. links = self.get_links_from_type(i_type) if links is None: logger.debug('Type unknown for connection! %s', i_type) return # We check for new check in each schedulers and put # the result in new_checks for sched_id in links: try: con = links[sched_id]['con'] if con is not None: # None = not initialized t00 = time.time() # Before ask a call that can be long, do a simple ping to be sure it is alive con.get('ping') tmp_broks = con.get('get_broks', {'bname': self.name}, wait='long') try: tmp_broks = cPickle.loads(zlib.decompress(base64.b64decode(tmp_broks))) except (TypeError, zlib.error, cPickle.PickleError), exp: logger.error('Cannot load broks data from %s : %s', links[sched_id]['name'], exp) links[sched_id]['con'] = None continue logger.debug("%s Broks get in %s", len(tmp_broks), time.time() - t00) for brok in tmp_broks.values(): brok.instance_id = links[sched_id]['instance_id'] # Ok, we can add theses broks to our queues self.add_broks_to_queue(tmp_broks.values()) else: # no con? make the connection self.pynag_con_init(sched_id, i_type=i_type)
def get_instances(self):
    """Create, init and then return the list of the module instances

    The previous instances, if any, are cleaned first. A module whose
    instance cannot be created, or whose instance is not a BaseModule, is
    only logged and skipped. The instances that are not external are then
    initialized; the ones failing their init are put in the restart queue
    but stay in the returned list. External instances are not initialized
    here.

    :return: module instances list
    :rtype: list
    """
    self.clear_instances()

    for mod_conf, module in self.modules_assoc:
        mod_conf.properties = module.properties.copy()
        try:
            instance = module.get_instance(mod_conf)
            if not isinstance(instance, BaseModule):
                raise TypeError(
                    'Returned instance is not of type BaseModule (%s) !' % type(instance))
        except Exception as err:
            logger.error("The module %s raised an exception %s, I remove it! traceback=%s",
                         mod_conf.get_name(), err, traceback.format_exc())
            continue

        # Tell the instance which kind of daemon it is loaded into
        instance.set_loaded_into(self.modules_type)
        self.instances.append(instance)

    for instance in self.instances:
        # External are not init now, but only when they are started
        if instance.is_external or self.try_instance_init(instance):
            continue
        # If the init failed, we put it in the restart queue
        logger.warning("The module '%s' failed to init, I will try to restart it later",
                       instance.get_name())
        self.to_restart.append(instance)

    return self.instances
def eval(myself, ctx):
    """Execute the trigger

    The trigger code runs in a dedicated namespace made of every function of
    TRIGGER_FUNCTIONS, plus ``self`` and ``ctx`` (both are the object the
    trigger runs for) and ``myself`` (the trigger). When the code raises, the
    error is reported through ``set_value`` and logged with its traceback:
    it is never raised to the caller.

    :param myself: the trigger; not named self because ``self`` must be
                   the ctx object for the executed code
    :type myself: object
    :param ctx: host or service object
    :type ctx: alignak.objects.schedulingitem.SchedulingItem
    :return: None
    """
    self = ctx

    # Build the namespace of the trigger code explicitly: writing into
    # locals() is not supported by the language and is not guaranteed to be
    # seen afterwards. The objects are set last, so they win over a trigger
    # function that would have the same name.
    namespace = dict(TRIGGER_FUNCTIONS)
    namespace.update({'self': self, 'ctx': ctx, 'myself': myself})

    code = myself.code_bin  # Comment? => compile(myself.code_bin, "<irc>", "exec")
    try:
        # NOTE(review): this executes arbitrary code - the trigger sources
        # must only come from a trusted configuration
        exec(code, namespace)  # pylint: disable=W0122
    except Exception as err:
        # presumably 3 is the UNKNOWN return code - to be confirmed against set_value
        set_value(self, "UNKNOWN: Trigger error: %s" % err, "", 3)
        logger.error('%s Trigger %s failed: %s ; '
                     '%s', self.host_name, myself.trigger_name, err, traceback.format_exc())