Пример #1
0
 def _check_error_dict(self, force=False):
     """Report collected Python errors by mail and forget them afterwards.

     An entry of the error dict is handled when its ``last_update`` stamp
     lies in the future or is more than 60 seconds old; with ``force`` set
     every entry is handled regardless of its age. A mail is only sent
     when the ``log.send.errormails`` setting is true, but handled entries
     are removed from the dict in either case.

     :param force: handle all entries, ignoring their age
     """
     cluster_name = process_tools.get_cluster_name()
     sent_count = 0
     start_time = time.time()
     handled_pids = []
     for pid, err_struct in self.__eg_dict.items():
         age = start_time - err_struct["last_update"]
         if not (force or age < 0 or age > 60):
             # entry is not due yet
             continue
         host_name = process_tools.get_fqdn()[0]
         machine_name = process_tools.get_machine_name()
         cluster_id = clusterid.get_cluster_id() or "N/A"
         subject = "Python error for pid {:d} on {}@{} ({}, {})".format(
             pid, host_name, cluster_name, machine_name, cluster_id)
         error_text = "".join(err_struct["error_str"])
         # first line carries the process info, then the numbered error lines
         body_lines = [
             "Processinfo {}".format(self._get_process_info(err_struct))
         ]
         for line_num, line in enumerate(error_text.split("\n"), 1):
             body_lines.append("{:3d} {}".format(line_num, line))
         msg_body = "\n".join(body_lines)
         if self.CC.CS["log.send.errormails"]:
             self._send_mail(subject, msg_body)
             sent_count += 1
         handled_pids.append(pid)
     # delete only after the iteration so the dict is not modified while
     # it is being walked
     for pid in handled_pids:
         del self.__eg_dict[pid]
     end_time = time.time()
     if not sent_count:
         return
     self.log("Sent {} in {}".format(
         logging_tools.get_plural("mail", sent_count),
         logging_tools.get_diff_time_str(end_time - start_time)))
Пример #2
0
 def _check_error_dict(self, force=False):
     """Mail out the due entries of the error dict and remove them.

     An entry is due when ``force`` is set, or when its ``last_update``
     stamp lies in the future or is more than 60 seconds old. Mails are
     only sent when the ``log.send.errormails`` setting is true; due
     entries are dropped from the dict whether or not a mail was sent.

     :param force: treat every entry as due
     """
     start_time = time.time()
     sent_count = 0
     due_keys = []
     # iterate over a snapshot of the items
     for key, err_struct in list(self.__eg_dict.items()):
         age = start_time - err_struct.last_update
         if not (force or age < 0 or age > 60):
             continue
         host_name = process_tools.get_fqdn()[0]
         safe_id = clusterid.get_safe_cluster_id("N/A")
         safe_name = clusterid.get_safe_cluster_name("N/A")
         subject = "An error occured, PID={:d} on {}@{} ({})".format(
             err_struct.pid, host_name, safe_id, safe_name)
         numbered = []
         for idx, line in enumerate(err_struct.error_str.split("\n")):
             # line numbers in the mail start at 1
             numbered.append("{:3d} {}".format(idx + 1, line))
         header = "Processinfo {}".format(err_struct.get_process_info())
         msg_body = "\n".join([header] + numbered)
         if self.CC.CS["log.send.errormails"]:
             self._send_mail(subject, msg_body)
             sent_count += 1
         due_keys.append(key)
     for key in due_keys:
         del self.__eg_dict[key]
     end_time = time.time()
     if not sent_count:
         return
     self.log("Sent {} in {}".format(
         logging_tools.get_plural("mail", sent_count),
         logging_tools.get_diff_time_str(end_time - start_time)))
Пример #3
0
 def __init__(self):
     """Set up the main process of the meta server.

     Initialises the process pool under the name "main", initialises and
     checks the configuration for ``icswServiceEnum.meta_server``, calls
     the directory, network socket and inotify initialisers, creates the
     mail object stored in ``self.__new_mail`` and finally registers
     ``self._check`` as a timer callback.
     """
     self.__debug = global_config["DEBUG"]
     threading_tools.process_pool.__init__(self, "main")
     self.CC.init(icswServiceEnum.meta_server,
                  global_config,
                  native_logging=True)
     self.CC.check_config()
     # copy the listed config store entries into global_config; the pairs
     # look like (config store key, global_config name) -- TODO confirm
     self.CC.CS.copy_to_global_config(global_config, [
         ("meta.track.icsw.memory", "TRACK_CSW_MEMORY"),
         ("meta.check.time", "MIN_CHECK_TIME"),
         ("meta.check.memory.time", "MIN_MEMCHECK_TIME"),
     ])
     # STATE_DIR is the ".srvstate" directory below the configured
     # meta.maindir
     global_config.add_config_entries([
         ("STATE_DIR",
          configfile.str_c_var(os.path.join(self.CC.CS["meta.maindir"],
                                            ".srvstate"),
                               source="dynamic")),
     ])
     # check for correct rights
     self._check_dirs()
     self._init_network_sockets()
     self._init_inotify()
     # int_error and term_error are both handled by self._sigint
     self.register_exception("int_error", self._sigint)
     self.register_exception("term_error", self._sigint)
     # init stuff for mailing: the sender is "<meta.mail.from.name>@<first
     # element of get_fqdn()>", the recipient is mail.target.address
     self.__new_mail = mail_tools.mail(
         None,
         "{}@{}".format(
             self.CC.CS["meta.mail.from.name"],
             process_tools.get_fqdn()[0],
         ),
         self.CC.CS["mail.target.address"],
     )
     # the same configured server value is passed for both arguments
     self.__new_mail.set_server(self.CC.CS["mail.server"],
                                self.CC.CS["mail.server"])
     # start both timestamps two intervals in the past, presumably so that
     # the first update / memory check is due right away
     self.__last_update_time = time.time(
     ) - 2 * global_config["MIN_CHECK_TIME"]
     self.__last_memcheck_time = time.time(
     ) - 2 * global_config["MIN_MEMCHECK_TIME"]
     self._init_meminfo()
     self.CC.log_config()
     self._init_statemachine()
     self.__next_stop_is_restart = False
     # wait for transactions if necessary
     self.__exit_process = False
     self.__transition_timer = False
     # register self._check with an interval of 30 (presumably seconds)
     # and instant=True
     self.register_timer(self._check, 30, instant=True)
Пример #4
0
 def _check_processes(self, service_list=None, force=False):
     """Check the services and react to the resulting state transitions.

     The check result is fed into the service state object; any reported
     transitions are passed to ``_new_transitions``. A mail describing the
     transitions is sent unless this is the first loop or the call was
     forced. Memory information is shown when a memory check is due and
     the check returned results. Runs longer than one second are logged.

     :param service_list: services to check, ``None`` selects an empty list
     :param force: force the state update (also suppresses the mail)
     """
     self.__loopcount += 1
     start_time = time.time()
     # memory check is due when the minimum interval has passed and memory
     # tracking is enabled
     mem_check_due = (
         start_time >
         self.__last_memcheck_time + global_config["MIN_MEMCHECK_TIME"]
         and global_config["TRACK_CSW_MEMORY"]
     )
     if mem_check_due:
         self.__last_memcheck_time = start_time
     self.def_ns.service = [] if service_list is None else service_list
     check_result = self.container.check_system(
         self.def_ns, self.server_instance)
     # always reset service to the empty list
     self.def_ns.service = []
     transitions = self.service_state.update(
         check_result,
         exclude=["meta-server"],
         throttle=[("uwsgi-init", 5)],
         # the very first call is always forced
         force=(self.__loopcount == 1 or force),
     )
     if transitions:
         self._new_transitions(transitions)
         if self.__loopcount > 1 and not force:
             cluster_id = clusterid.get_cluster_id() or "N/A"
             subject, text = self.service_state.get_mail_text(transitions)
             mail = self.__new_mail
             mail.init_text()
             full_subject = "{} from {} ({})".format(
                 subject,
                 process_tools.get_fqdn()[0],
                 cluster_id,
             )
             mail.set_subject(full_subject)
             mail.append_text(text)
             _send_state, mail_log = mail.send_mail()
             for mail_line in mail_log:
                 self.log(mail_line)
     if mem_check_due and check_result:
         self._show_meminfo(check_result)
     elapsed = time.time() - start_time
     if elapsed > 1:
         self.log("update {:d} took {}".format(
             self.__loopcount,
             logging_tools.get_diff_time_str(elapsed),
         ))
Пример #5
0
 def _send_mail(self, subject, msg_body):
     """Send one mail to the configured target address and log the result.

     The sender address is built from the ``log.mail.from.name`` setting
     and the first element returned by ``process_tools.get_fqdn()``; the
     recipient and the mail server are taken from the
     ``mail.target.address`` and ``mail.server`` settings. Errors raised
     while sending (or while logging the result lines) are logged with
     critical level instead of being propagated.

     :param subject: subject line of the mail
     :param msg_body: text body of the mail
     """
     new_mail = mail_tools.icswMail(
         subject,
         "{}@{}".format(
             self.CC.CS["log.mail.from.name"],
             process_tools.get_fqdn()[0],
         ),
         self.CC.CS["mail.target.address"],
         msg_body,
     )
     # the same configured server value is passed for both arguments
     new_mail.set_server(self.CC.CS["mail.server"],
                         self.CC.CS["mail.server"])
     try:
         send_stat, log_lines = new_mail.send_mail()
         for log_line in log_lines:
             self.log(" - ({:d}) {}".format(send_stat, log_line),
                      logging_tools.LOG_LEVEL_OK)
     except Exception:
         # deliberately broad: mailing is best effort and must not take
         # the caller down. Unlike a bare except this no longer swallows
         # KeyboardInterrupt / SystemExit.
         self.log(
             "error sending mail: {}".format(
                 process_tools.get_except_info()),
             logging_tools.LOG_LEVEL_CRITICAL)
Пример #6
0
    def __init__(self):
        """Set up the main process of the monitoring server.

        Initialises the process pool ("main", with zmq enabled), initialises
        and checks the configuration for ``icswServiceEnum.monitor_server``,
        reads the config variables listed below from the database (the
        values given here are their defaults), registers exception and
        function handlers and, if ``MD_TYPE`` is present in
        ``global_config``, starts the helper processes and the
        redistribute timer. Without ``MD_TYPE`` the ``_int_error`` handler
        is called directly.
        """
        long_host_name, mach_name = process_tools.get_fqdn()
        threading_tools.process_pool.__init__(self, "main", zmq=True)
        self.CC.init(icswServiceEnum.monitor_server, global_config)
        self.CC.check_config()
        db_tools.close_connection()
        # (name, typed config variable carrying the default value)
        self.CC.read_config_from_db([
            ("NETSPEED_WARN_MULT", configfile.float_c_var(0.85)),
            ("NETSPEED_CRITICAL_MULT", configfile.float_c_var(0.95)),
            ("NETSPEED_DEFAULT_VALUE", configfile.int_c_var(10000000)),
            ("CHECK_HOST_ALIVE_PINGS", configfile.int_c_var(5)),
            ("CHECK_HOST_ALIVE_TIMEOUT", configfile.float_c_var(5.0)),
            ("ENABLE_COLLECTD", configfile.bool_c_var(False)),
            ("ENABLE_NAGVIS", configfile.bool_c_var(False)),
            ("ENABLE_FLAP_DETECTION", configfile.bool_c_var(False)),
            ("NAGVIS_DIR", configfile.str_c_var("/opt/nagvis4icinga")),
            ("NAGVIS_URL", configfile.str_c_var("/nagvis")),
            ("NONE_CONTACT_GROUP", configfile.str_c_var("none_group")),
            # FROM_ADDR defaults to the first element of get_fqdn()
            ("FROM_ADDR", configfile.str_c_var(long_host_name)),
            ("LOG_EXTERNAL_COMMANDS", configfile.bool_c_var(False)),
            ("LOG_PASSIVE_CHECKS", configfile.bool_c_var(False)),
            ("BUILD_CONFIG_ON_STARTUP", configfile.bool_c_var(True)),
            ("RELOAD_ON_STARTUP", configfile.bool_c_var(True)),
            ("RETAIN_HOST_STATUS", configfile.bool_c_var(True)),
            ("RETAIN_SERVICE_STATUS", configfile.bool_c_var(True)),
            ("PASSIVE_HOST_CHECKS_ARE_SOFT", configfile.bool_c_var(True)),
            ("RETAIN_PROGRAM_STATE", configfile.bool_c_var(False)),
            ("USE_HOST_DEPENDENCIES", configfile.bool_c_var(False)),
            ("USE_SERVICE_DEPENDENCIES", configfile.bool_c_var(False)),
            ("TRANSLATE_PASSIVE_HOST_CHECKS", configfile.bool_c_var(True)),
            ("USE_ONLY_ALIAS_FOR_ALIAS", configfile.bool_c_var(False)),
            ("HOST_DEPENDENCIES_FROM_TOPOLOGY", configfile.bool_c_var(False)),
            ("CCOLLCLIENT_TIMEOUT", configfile.int_c_var(10)),
            ("CSNMPCLIENT_TIMEOUT", configfile.int_c_var(20)),
            ("MAX_SERVICE_CHECK_SPREAD", configfile.int_c_var(5)),
            ("MAX_HOST_CHECK_SPREAD", configfile.int_c_var(5)),
            ("MAX_CONCURRENT_CHECKS", configfile.int_c_var(500)),
            ("CHECK_SERVICE_FRESHNESS",
             configfile.bool_c_var(
                 True, help_string="enable service freshness checking")),
            ("CHECK_HOST_FRESHNESS",
             configfile.bool_c_var(
                 True, help_string="enable host freshness checking")),
            ("SAFE_CC_NAME", configfile.bool_c_var(False)),
            ("SERVICE_FRESHNESS_CHECK_INTERVAL", configfile.int_c_var(60)),
            ("HOST_FRESHNESS_CHECK_INTERVAL", configfile.int_c_var(60)),
            ("SAFE_NAMES",
             configfile.bool_c_var(
                 False,
                 help_string=
                 "convert all command descriptions to safe names (without spaces), [%(default)s]"
             )),
            ("ENABLE_ICINGA_LOG_PARSING",
             configfile.bool_c_var(
                 True,
                 help_string=
                 "collect icinga logs in the database (required for status history and kpis)"
             )),
        ])
        # copy flags
        self.__verbose = global_config["VERBOSE"]
        # log config
        self.CC.log_config()
        # re-insert config
        self.CC.re_insert_config()
        # init build control
        self.BC = BuildControl(self)
        # int_error and term_error share one handler, hup_error has its own
        self.register_exception("int_error", self._int_error)
        self.register_exception("term_error", self._int_error)
        self.register_exception("hup_error", self._hup_error)
        self._check_notification()
        self._check_special_commands()
        # sync master uuid
        self.__sync_master_uuid = None
        # from mixins
        self.VCM_check_md_version()
        self._init_network_sockets()

        # MD_TYPE is not set in this method; presumably it is added to
        # global_config by VCM_check_md_version() above -- TODO confirm
        if "MD_TYPE" in global_config:
            # handlers callable by name
            self.register_func("register_remote", self._register_remote)
            self.register_func("send_command", self._send_command)
            self.register_func("ocsp_results", self._ocsp_results)
            self.register_func("set_sync_master_uuid",
                               self._set_sync_master_uuid)
            self.register_func("distribution_info", self._distribution_info)
            self.register_func("build_step", self.BC.build_step)

            # helper processes, started immediately
            self.add_process(SyncerProcess("syncer"), start=True)
            self.add_process(DynConfigProcess("dynconfig"), start=True)
            self.add_process(IcingaLogReader("IcingaLogReader"), start=True)
            self.add_process(KpiProcess("KpiProcess"), start=True)
            # wait for the processes to start
            time.sleep(0.5)
            # redistribute check interval: 60 with DEBUG set, otherwise 300
            # (presumably seconds)
            self.register_timer(self._check_for_redistribute,
                                60 if global_config["DEBUG"] else 300)
            # only test code
            # self.send_to_remote_server(
            #    "cluster-server",
            #    unicode(server_command.srv_command(command="statusd")),
            # )
        else:
            # no MD_TYPE: call the interrupt handler directly with a reason
            self._int_error("no MD found")
Пример #7
0
 def _send_vector(self, *args, **kwargs):
     """Build the current vector for one send target and transmit it.

     ``args[0]`` is the send id used to look up the target settings in
     ``self.cs`` and the socket in the socket dict. While the target's
     ``pause_until`` stamp lies in the future nothing is sent. Depending on
     the target's ``format`` the vector is built and sent as XML (default)
     or as JSON. When sending fails the error is logged and the target is
     paused for 120 seconds, unless the exception info mentions
     ``int_error``, in which case the exception is re-raised. The settings
     are written back to ``self.cs`` at the end.
     """
     send_id = args[0]
     target = self.cs[send_id]
     paused_until = target.get("pause_until", 0)
     now = int(time.time())
     if paused_until:
         if paused_until > now:
             # still paused, skip this round
             return
         self.log("clearing pause_until")
         del target["pause_until"]
     sent_count = target["sent"]
     # the counter is tested before it is incremented
     send_full = sent_count % target.get("full_info_every", 10) == 0
     target["sent"] = sent_count + 1
     try:
         fqdn, _short_name = process_tools.get_fqdn()
     except:
         fqdn = process_tools.get_machine_name()
     send_format = target.get("format", "xml")
     is_xml = send_format == "xml"
     if is_xml:
         vector = self.build_xml(E, simple=not send_full)
         vector.attrib["name"] = target.get("send_name", fqdn) or fqdn
         vector.attrib["interval"] = "{:d}".format(
             target.get("send_every"))
         vector.attrib["uuid"] = self.module.main_proc.zeromq_id
     else:
         vector = self.build_json(simple=not send_full)
         # meta information goes into the element at index 1
         header = vector[1]
         header["name"] = target.get("send_name", fqdn) or fqdn
         header["interval"] = target.get("send_every")
         header["uuid"] = self.module.main_proc.zeromq_id
     # send to server
     t_host = target.get("target", "127.0.0.1")
     t_port = target.get("port", 8002)
     try:
         send_unicode = self.__socket_dict[send_id].send_unicode
         if is_xml:
             send_unicode(
                 unicode(etree.tostring(vector)))  # @UndefinedVariable
         else:
             send_unicode(json.dumps(vector))
     except:
         exc_info = process_tools.get_except_info()
         self.log(
             "error sending to ({}, {:d}): {}".format(
                 t_host, t_port, exc_info),
             logging_tools.LOG_LEVEL_ERROR)
         if exc_info.count("int_error"):
             # interrupts must not be swallowed
             raise
         # problem sending, wait 2 minutes
         pause_secs = 120
         resume_at = now + pause_secs
         self.log(
             "setting pause_until to {:d} (+{:d} seconds)".format(
                 resume_at, pause_secs),
             logging_tools.LOG_LEVEL_WARN)
         target["pause_until"] = resume_at
     self.cs[send_id] = target
Пример #8
0
 def __init__(self, options):
     """Set up the main process of the cluster server.

     Two modes are distinguished by ``global_config["COMMAND"]``: when it
     is non-empty (after stripping) the instance runs in "run command"
     mode, otherwise in regular server mode. Both modes initialise and
     check the configuration for ``icswServiceEnum.cluster_server`` and
     read the config variables listed below from the database. Run-command
     mode then only registers the ``_run_command`` timer; server mode
     initialises the network sockets and, unless an exit was requested,
     starts the helper processes and the ``_update`` timer.

     :param options: stored unchanged in ``self.__options``
     """
     threading_tools.process_pool.__init__(self, "main", zmq=True)
     long_host_name, mach_name = process_tools.get_fqdn()
     # True when a non-blank COMMAND was configured
     self.__run_command = True if global_config["COMMAND"].strip() else False
     # rewrite LOG_NAME if necessary
     if self.__run_command:
         # direct command: no msi block, log name gets a command postfix
         self.CC.init(
             icswServiceEnum.cluster_server,
             global_config,
             init_msi_block=False,
             log_name_postfix="direct-{}".format(global_config["COMMAND"]),
         )
     else:
         self.CC.init(
             icswServiceEnum.cluster_server,
             global_config,
         )
     self.CC.check_config()
     if self.__run_command:
         # use the command as memcache prefix
         global_config.mc_prefix = global_config["COMMAND"]
     else:
         # create hardware fingerprint
         self.CC.create_hfp()
     # enable memcache backend
     global_config.enable_mc()
     # close DB connection (daemonize)
     db_tools.close_connection()
     # (name, typed config variable carrying the default value)
     self.CC.read_config_from_db(
         [
             ("IMAGE_SOURCE_DIR", configfile.str_c_var("/opt/cluster/system/images")),
             ("MAILSERVER", configfile.str_c_var("localhost")),
             ("FROM_NAME", configfile.str_c_var("quotawarning")),
             ("FROM_ADDR", configfile.str_c_var(long_host_name)),
             ("VERSION", configfile.str_c_var(VERSION_STRING, database=False)),
             ("QUOTA_ADMINS", configfile.str_c_var("*****@*****.**")),
             ("MONITOR_QUOTA_USAGE", configfile.bool_c_var(False, info="enabled quota usage tracking")),
             ("TRACK_ALL_QUOTAS", configfile.bool_c_var(False, info="also track quotas without limit")),
             ("QUOTA_CHECK_TIME_SECS", configfile.int_c_var(3600)),
             ("USER_MAIL_SEND_TIME", configfile.int_c_var(3600, info="time in seconds between two mails")),
             ("SERVER_FULL_NAME", configfile.str_c_var(long_host_name, database=False)),
             ("SERVER_SHORT_NAME", configfile.str_c_var(mach_name, database=False)),
             ("DATABASE_DUMP_DIR", configfile.str_c_var("/opt/cluster/share/db_backup")),
             ("DATABASE_KEEP_DAYS", configfile.int_c_var(30)),
             ("USER_SCAN_TIMER", configfile.int_c_var(7200, info="time in seconds between two user_scan runs")),
             ("NEED_ALL_NETWORK_BINDS", configfile.bool_c_var(True, info="raise an error if not all bind() calls are successfull")),
         ]
     )
     # the config is only re-inserted in server mode
     if not self.__run_command:
         self.CC.re_insert_config()
     # int_error and term_error share one handler
     self.register_exception("int_error", self._int_error)
     self.register_exception("term_error", self._int_error)
     self.register_func("bg_finished", self._bg_finished)
     self._log_config()
     self._check_uuid()
     self._load_modules()
     self.__options = options
     self._set_next_backup_time(True)
     if self.__run_command:
         # instant=True is passed, presumably so the command runs right
         # away; the interval is 3600 (presumably seconds)
         self.register_timer(self._run_command, 3600, instant=True)
     else:
         self._init_network_sockets()
         # exit_requested is not set in this method; presumably
         # _init_network_sockets() sets it on failure -- TODO confirm
         if not self["exit_requested"]:
             self.init_notify_framework(global_config)
             self.add_process(CapabilityProcess("capability_process"), start=True)
             self.add_process(LicenseChecker("license_checker"), start=True)
             db_tools.close_connection()
             # update interval: 2 with DEBUG set, otherwise 30
             self.register_timer(
                 self._update,
                 2 if global_config["DEBUG"] else 30,
                 instant=True
             )
Пример #9
0
 def __init__(self):
     """Set up the main process of the mother server.

     Initialises the process pool, initialises and checks the
     configuration for ``icswServiceEnum.mother_server``, reads the config
     variables listed below from the database, derives further directory
     entries from them and runs the preparation steps (directories,
     netboot, NFS exports, syslog, DHCP, status entries). If
     ``_init_network_sockets()`` returns a true value the helper processes
     are started and the initial commands are sent; otherwise
     ``_int_error`` is called with "bind problem".
     """
     _long_host_name, mach_name = process_tools.get_fqdn()
     threading_tools.icswProcessPool.__init__(self, "main")
     # int_error and term_error share one handler
     self.register_exception("int_error", self._int_error)
     self.register_exception("term_error", self._int_error)
     self.CC.init(icswServiceEnum.mother_server, global_config)
     self.CC.check_config()
     # close db connection (for daemonizing)
     db_tools.close_connection()
     self.debug = global_config["DEBUG"]
     self.srv_helper = service_tools.ServiceHelper(self.log)
     # port dicts of the hoststatus and host-monitoring services
     self.__hs_port = InstanceXML(quiet=True).get_port_dict(
         icswServiceEnum.hoststatus, command=True)
     self.__hm_port = InstanceXML(quiet=True).get_port_dict(
         icswServiceEnum.host_monitoring, command=True)
     # read config from the database: (name, typed config variable
     # carrying the default value)
     self.CC.read_config_from_db([
         ("TFTP_LINK", configfile.StringConfigVar("/tftpboot")),
         ("TFTP_DIR",
          configfile.StringConfigVar(
              os.path.join(CLUSTER_DIR, "system", "tftpboot"))),
         ("CLUSTER_DIR", configfile.StringConfigVar(CLUSTER_DIR)),
         # in 10th of seconds
         ("NODE_BOOT_DELAY", configfile.IntegerConfigVar(50)),
         ("FANCY_PXE_INFO", configfile.BoolConfigVar(False)),
         ("SERVER_SHORT_NAME", configfile.StringConfigVar(mach_name)),
         ("WRITE_DHCP_CONFIG", configfile.BoolConfigVar(True)),
         ("DHCP_AUTHORITATIVE", configfile.BoolConfigVar(False)),
         ("DHCP_ONLY_BOOT_NETWORKS", configfile.BoolConfigVar(True)),
         ("MODIFY_NFS_CONFIG", configfile.BoolConfigVar(True)),
         ("NEED_ALL_NETWORK_BINDS", configfile.BoolConfigVar(True)),
     ])
     # entries derived from the TFTP_DIR / CLUSTER_DIR values read above
     global_config.add_config_entries([
         ("CONFIG_DIR",
          configfile.StringConfigVar(
              os.path.join(global_config["TFTP_DIR"], "config"))),
         ("ETHERBOOT_DIR",
          configfile.StringConfigVar(
              os.path.join(global_config["TFTP_DIR"], "etherboot"))),
         ("KERNEL_DIR",
          configfile.StringConfigVar(
              os.path.join(global_config["TFTP_DIR"], "kernels"))),
         ("SHARE_DIR",
          configfile.StringConfigVar(
              os.path.join(global_config["CLUSTER_DIR"], "share",
                           "mother"))),
         # primary key of the "node" log source
         ("NODE_SOURCE_IDX",
          configfile.IntegerConfigVar(LogSource.new("node").pk)),
     ])
     # log config and re-insert it
     self.CC.log_config()
     self.CC.re_insert_config()
     # prepare directories
     self._prepare_directories()
     # check netboot functionality
     self._check_netboot_functionality()
     # check nfs exports
     self._check_nfs_exports()
     # modify syslog config
     self._enable_syslog_config()
     # dhcp config
     self.write_dhcp_config()
     # check status entries
     self._check_status_entries()
     self.register_func("contact_hoststatus", self._contact_hoststatus)
     self.register_func("contact_hostmonitor", self._contact_hostmonitor)
     my_uuid = uuid_tools.get_uuid()
     self.log("cluster_device_uuid is '{}'".format(my_uuid.urn))
     # helper processes are only started when the socket setup reports
     # success
     if self._init_network_sockets():
         self.add_process(initat.mother.kernel.KernelSyncProcess("kernel"),
                          start=True)
         self.add_process(
             initat.mother.command.ExternalCommandProcess("command"),
             start=True)
         self.add_process(
             initat.mother.control.NodeControlProcess("control"),
             start=True)
         self.add_process(initat.mother.control.ICMPProcess("icmp"),
                          start=True)
         db_tools.close_connection()
         # the SNMP process only gets the logging-related settings
         conf_dict = {
             key: global_config[key]
             for key in ["LOG_NAME", "LOG_DESTINATION", "VERBOSE"]
         }
         self.add_process(SNMPProcess("snmp_process", conf_dict=conf_dict),
                          start=True)
         # send initial commands
         self.send_to_process(
             "kernel", "srv_command",
             str(
                 server_command.srv_command(command="check_kernel_dir",
                                            insert_all_found="1")))
         # restart hoststatus
         self.send_to_process("command",
                              "delay_command",
                              "/etc/init.d/hoststatus restart",
                              delay_time=5)
         self.send_to_process("control", "refresh", refresh=False)
     else:
         # call the interrupt handler directly with a reason
         self._int_error("bind problem")