def update_nrpe_config():
    """Write the init-service and haproxy NRPE checks for this unit.

    Service names that start with ``snap.`` are reduced to their second
    dot-separated component before the init-service checks are added.
    """
    # check_upstart_job depends on python-dbus
    apt_install('python-dbus')
    nagios_host = nrpe.get_nagios_hostname()
    unit_name = nrpe.get_nagios_unit_name()
    checker = nrpe.NRPE(hostname=nagios_host)
    nrpe.copy_nrpe_checks()
    monitored = [
        svc.split('.')[1] if svc.startswith('snap.') else svc
        for svc in services()
    ]
    nrpe.add_init_service_checks(checker, monitored, unit_name)
    nrpe.add_haproxy_checks(checker, unit_name)
    checker.write()
def remove_nrpe_config(nagios=None):
    """Remove the NRPE checks of the three control-plane snap services.

    :param nagios: not referenced by this function.
    """
    remove_state('nrpe-external-master.initial-config')

    # The nrpe-external-master interface carries very little logic of its
    # own, so the charm-helpers NRPE wrapper is used for now.
    checker = nrpe.NRPE(hostname=nrpe.get_nagios_hostname())

    # systemd services whose checks are removed
    for unit in ('snap.kube-apiserver.daemon',
                 'snap.kube-controller-manager.daemon',
                 'snap.kube-scheduler.daemon'):
        checker.remove_check(shortname=unit)
def render_nrpe(self):
    """Write init-service NRPE checks for ``self.nrpe_check_services``."""
    ch_core.hookenv.log("Rendering NRPE checks.",
                        level=ch_core.hookenv.INFO)
    nagios_host = nrpe.get_nagios_hostname()
    unit_name = nrpe.get_nagios_unit_name()
    # The unit counts as primary only when it is its own principal, i.e.
    # when it is not running as a subordinate.
    is_primary = (
        ch_core.hookenv.principal_unit() == ch_core.hookenv.local_unit())
    checker = nrpe.NRPE(hostname=nagios_host, primary=is_primary)
    nrpe.add_init_service_checks(
        checker, self.nrpe_check_services, unit_name)
    checker.write()
def update_nrpe_config():
    """Install the ceph-osd plugins, the root cron collector and the check."""
    # python3-dbus is used by check_upstart_job; python3-fasteners is added
    # on bionic and later for collect_ceph_osd_services.py.
    packages = ['python3-dbus']
    if CompareHostReleases(lsb_release()['DISTRIB_CODENAME']) >= 'bionic':
        packages.append('python3-fasteners')
    apt_install(packages)

    # Copy the check and collect scripts into the plugins directory, giving
    # each copy the uid/gid that the directory itself carries.
    charm_dir = os.environ.get('CHARM_DIR', '')
    nagios_plugins = '/usr/local/lib/nagios/plugins'
    plugins_stat = os.stat(nagios_plugins)
    owner_uid, owner_gid = plugins_stat.st_uid, plugins_stat.st_gid
    for script in ('collect_ceph_osd_services.py',
                   'check_ceph_osd_services.py'):
        destination = os.path.join(nagios_plugins, script)
        shutil.copy(os.path.join(charm_dir, 'files', 'nagios', script),
                    destination)
        os.chown(destination, owner_uid, owner_gid)

    nagios_host = nrpe.get_nagios_hostname()
    unit_name = nrpe.get_nagios_unit_name()

    # BUG#1810749 - the nagios user cannot read /var/lib/ceph/.., which is
    # deliberate: it keeps ceph secure from Nagios. The status check still
    # needs the contents of the ../osd/ceph-*/whoami files, so a cron.d job
    # running as root performs the checks every minute and writes the
    # results to a temporary file. The nrpe check greps that file and errors
    # out (return 2) if the first 3 characters of a line are not 'OK:'.
    cron_entry = ''.join((
        'MAILTO=""\n',
        '* * * * * root ',
        '/usr/local/lib/nagios/plugins/collect_ceph_osd_services.py',
        ' 2>&1 | logger -t check-osd\n',
    ))
    with open(CRON_CEPH_CHECK_FILE, "wt") as cron_file:
        cron_file.write(cron_entry)

    checker = nrpe.NRPE(hostname=nagios_host)
    checker.add_check(
        shortname='ceph-osd',
        description='process check {%s}' % unit_name,
        check_cmd='/usr/local/lib/nagios/plugins/check_ceph_osd_services.py',
    )
    checker.write()
def update_nrpe_config(unused=None):
    """Set up the etcd service check and the cached etcd-alarm check.

    :param unused: not referenced by this function.
    """
    # systemd services that will be checked
    monitored_units = ("snap.etcd.etcd", )

    # The nrpe-external interface has little logic of its own, so the
    # charm-helpers code is used for now.
    nagios_host = nrpe.get_nagios_hostname()
    unit_name = nrpe.get_nagios_unit_name()
    checker = nrpe.NRPE(hostname=nagios_host, primary=False)

    # First check: alert on service failure.
    nrpe.add_init_service_checks(checker, monitored_units, unit_name)

    # Cron job that fills the cache read by the second check; the output of
    # 'etcdctl alarm list' is cached to minimise overhead.
    with open("templates/check_etcd-alarms.cron") as cron_template:
        write_file(
            path="/etc/cron.d/check_etcd-alarms",
            content=cron_template.read().encode(),
            owner="root",
            perms=0o644,
        )

    # Start with an empty output file for that cron job.
    write_file(
        path="/var/lib/nagios/etcd-alarm-list.txt",
        content="",
        owner="root",
        perms=0o644,
    )

    # Install the NRPE script that reads the cached output.
    plugin_path = "/usr/lib/nagios/plugins/check_etcd-alarms.py"
    with open("templates/check_etcd-alarms.py") as plugin_template:
        write_file(
            path=plugin_path,
            content=plugin_template.read().encode(),
            owner="root",
            perms=0o755,
        )

    # Second check: alert on etcd alarm status.
    checker.add_check(
        "etcd-alarms",
        "Verify etcd has no raised alarms",
        plugin_path,
    )
    checker.write()
    set_state("etcd.nrpe.configured")
def update_nrpe_config():
    """Write the init-service, haproxy and HTTP healthcheck NRPE checks.

    The healthcheck probes ``/healthcheck`` on localhost at the port returned
    by ``determine_api_port`` for the ``bind-port`` option and expects the
    response to contain ``OK``.
    """
    # python-dbus is used by check_upstart_job
    apt_install('python-dbus')
    hostname = nrpe.get_nagios_hostname()
    current_unit = nrpe.get_nagios_unit_name()
    nrpe_setup = nrpe.NRPE(hostname=hostname)
    nrpe.copy_nrpe_checks()
    nrpe.add_init_service_checks(nrpe_setup, services(), current_unit)
    nrpe.add_haproxy_checks(nrpe_setup, current_unit)

    api_port = determine_api_port(config('bind-port'), singlenode_mode=True)
    # Build the command from adjacent literals rather than backslash
    # continuations inside one literal: a continuation inside a string keeps
    # the next line's leading indentation as part of the command text.
    healthcheck_cmd = (
        '/usr/lib/nagios/plugins/check_http '
        '-I localhost -u /healthcheck -p {} '
        '-e "OK"'
    ).format(api_port)
    nrpe_setup.add_check(shortname="swift-proxy-healthcheck",
                         description="Check Swift Proxy Healthcheck",
                         check_cmd=healthcheck_cmd)
    nrpe_setup.write()
def remove_nrpe_config():
    """Remove the docker NRPE check and clear the initial-config state.

    :return: None
    """
    remove_state('nrpe-external-master.docker.initial-config')

    # The nrpe-external-master interface carries very little logic of its
    # own, so the charm-helpers code is used for now.
    checker = nrpe.NRPE(hostname=nrpe.get_nagios_hostname(), primary=False)

    # systemd services whose checks are removed
    for unit in ['docker']:
        checker.remove_check(shortname=unit)
def update_nrpe_config():
    """Write the init-service and haproxy checks plus an optional vhost check.

    The ``vhost`` check is added only when the NRPE config carries a truthy
    ``nagios_check_http_params`` value.
    """
    # check_upstart_job depends on python-dbus
    apt_install('python-dbus')
    nagios_host = nrpe.get_nagios_hostname()
    unit_name = nrpe.get_nagios_unit_name()
    checker = nrpe.NRPE(hostname=nagios_host)
    nrpe.copy_nrpe_checks()
    nrpe.add_init_service_checks(checker, services(), unit_name)
    nrpe.add_haproxy_checks(checker, unit_name)

    http_params = checker.config.get('nagios_check_http_params')
    if http_params:
        checker.add_check(
            shortname='vhost',
            description='Check Virtual Host {%s}' % unit_name,
            check_cmd='check_http %s' % http_params)
    checker.write()
def test_write_restarts_service(self):
    """``write()`` returns None and runs the nagios-nrpe-server restart."""
    self.patched['exists'].return_value = True
    self.patched['config'].return_value = {
        'nagios_context': 'test',
        'nagios_servicegroups': '',
    }

    nrpe_checker = nrpe.NRPE()
    outcome = nrpe_checker.write()
    self.assertEqual(None, outcome)

    # First positional argument of the most recent patched call().
    invoked = self.patched['call'].call_args[0][0]
    self.assertEqual(['service', 'nagios-nrpe-server', 'restart'], invoked)

    self.check_call_counts(config=1, getpwnam=1, getgrnam=1,
                           exists=1, call=1)
def test_write_restarts_service(self):
    """``write()`` returns None and restarts nagios-nrpe-server once."""
    self.patched['exists'].return_value = True
    self.patched['config'].return_value = {
        'nagios_context': 'test',
        'nagios_servicegroups': '',
    }

    nrpe_checker = nrpe.NRPE()
    outcome = nrpe_checker.write()
    self.assertEqual(None, outcome)

    service_mock = self.patched['service']
    service_mock.assert_called_with('restart', 'nagios-nrpe-server')

    self.check_call_counts(config=1, getpwnam=1, getgrnam=1,
                           exists=1, service=1)
def configure_megaraid():
    """Install the megaraid plugin, register its NRPE check, mark it ready."""
    status_set('maintenance', 'configuring megaraid check')

    plugin_source = '/opt/{}/{}'.format(PLUGIN_NAME, PLUGIN_NAME)
    install_nagios_plugin_from_file(source_file_path=plugin_source,
                                    plugin_name=PLUGIN_NAME)

    checker = nrpe.NRPE(hostname=nrpe.get_nagios_hostname(), primary=False)
    # Command line: plugin, storcli path, then the configured extra params.
    command_line = '{plugin_name} -p {storcli_path} {check_params}'.format(
        plugin_name=PLUGIN_NAME,
        storcli_path=config('storcli_path'),
        check_params=config('check_parameters'))
    checker.add_check(shortname=PLUGIN_NAME,
                      description=PLUGIN_NAME,
                      check_cmd=command_line)
    checker.write()

    status_set('active', 'ready')
    set_flag('megaraid.configured')
def update_nrpe_config():
    """Copy the NRPE scripts and sudoers files, then write the HA checks.

    Regular files under ``$CHARM_DIR/files/nrpe`` go to the local Nagios
    plugin directory (created if missing) and those under
    ``$CHARM_DIR/files/sudoers`` go to ``/etc/sudoers.d``. Corosync ring,
    crm status and corosync/pacemakerd process checks are then written.
    """
    charm_files = os.path.join(os.environ["CHARM_DIR"], "files")
    plugins_dir = "/usr/local/lib/nagios/plugins"
    if not os.path.exists(plugins_dir):
        os.makedirs(plugins_dir)

    copy_plan = (
        (os.path.join(charm_files, "nrpe"), plugins_dir),
        (os.path.join(charm_files, "sudoers"), "/etc/sudoers.d"),
    )
    for source_dir, target_dir in copy_plan:
        for path in glob.glob(os.path.join(source_dir, "*")):
            if not os.path.isfile(path):
                continue
            shutil.copy2(path,
                         os.path.join(target_dir, os.path.basename(path)))

    nagios_host = nrpe.get_nagios_hostname()
    unit_name = nrpe.get_nagios_unit_name()
    checker = nrpe.NRPE(hostname=nagios_host)

    apt_install('python-dbus')

    # corosync/crm checks first, then the process checks
    check_specs = (
        ('corosync_rings', 'Check Corosync rings {%s}',
         'check_corosync_rings'),
        ('crm_status', 'Check crm status {%s}', 'check_crm'),
        ('corosync_proc', 'Check Corosync process {%s}',
         'check_procs -c 1:1 -C corosync'),
        ('pacemakerd_proc', 'Check Pacemakerd process {%s}',
         'check_procs -c 1:1 -C pacemakerd'),
    )
    for shortname, description, command in check_specs:
        checker.add_check(shortname=shortname,
                          description=description % unit_name,
                          check_cmd=command)
    checker.write()
def update_nrpe_config():
    """Write a ceph-osd process check fed by the OSD ``whoami`` files.

    The OSD ids are piped through ``xargs`` into ``check_systemd.py`` when
    the init system is systemd and into ``status ceph-osd`` otherwise. The
    command exits 0 on success and 2 on failure.
    """
    # python-dbus is used by check_upstart_job
    apt_install('python3-dbus')
    nagios_host = nrpe.get_nagios_hostname()
    unit_name = nrpe.get_nagios_unit_name()

    # Pick the systemd or the upstart flavour of the per-OSD status command.
    if init_is_systemd():
        per_osd = ('xargs -I_@ /usr/local/lib/nagios/plugins/check_systemd.py'
                   ' ceph-osd@_@')
    else:
        per_osd = 'xargs -I@ status ceph-osd id=@'
    check_command = ''.join((
        '/bin/cat /var/lib/ceph/osd/ceph-*/whoami |',
        per_osd,
        ' && exit 0 || exit 2',
    ))

    checker = nrpe.NRPE(hostname=nagios_host)
    checker.add_check(shortname='ceph-osd',
                      description='process check {%s}' % unit_name,
                      check_cmd=check_command)
    checker.write()
def update_nrpe_config():
    """Write the init-service checks and the network-namespace check.

    A cron.d entry runs ``check_netns.sh`` as root every five minutes and
    redirects its output to ``/var/lib/nagios/netns-check.txt``. The
    ``netns`` NRPE check passes that file to ``check_status_file.py``.
    """
    # python-dbus is used by check_upstart_job
    apt_install('python-dbus')
    hostname = nrpe.get_nagios_hostname()
    current_unit = nrpe.get_nagios_unit_name()
    nrpe_setup = nrpe.NRPE(hostname=hostname)
    nrpe.add_init_service_checks(nrpe_setup, services(), current_unit)

    cronpath = '/etc/cron.d/nagios-netns-check'
    cron_template = ('*/5 * * * * root '
                     '/usr/local/lib/nagios/plugins/check_netns.sh '
                     '> /var/lib/nagios/netns-check.txt\n')
    # The context manager closes the handle even when write() raises.
    with open(cronpath, 'w') as cron_file:
        cron_file.write(cron_template)

    nrpe_setup.add_check(
        shortname="netns",
        description='Network Namespace check {%s}' % current_unit,
        check_cmd='check_status_file.py -f /var/lib/nagios/netns-check.txt')
    nrpe_setup.write()
def nrpe_external_master_relation():
    """Configure the nrpe-external-master relation.

    Installs ``check_cassandra_heap.sh`` when the local plugin directory
    exists. Adds a heap check when both heap thresholds are set, and one
    disk check per database directory when both disk thresholds are set.
    """
    plugins_dir = helpers.local_plugins_dir()
    if os.path.exists(plugins_dir):
        heap_script = os.path.join(hookenv.charm_dir(),
                                   "files", "check_cassandra_heap.sh")
        with open(heap_script, 'rb') as script:
            host.write_file(
                os.path.join(plugins_dir, 'check_cassandra_heap.sh'),
                script.read(), perms=0o555)

    checker = nrpe.NRPE()
    options = hookenv.config()

    heap_warn = options.get('nagios_heapchk_warn_pct')
    heap_crit = options.get('nagios_heapchk_crit_pct')
    if heap_warn and heap_crit:
        heap_cmd = "check_cassandra_heap.sh localhost {} {}".format(
            heap_warn, heap_crit)
        checker.add_check(shortname="cassandra_heap",
                          description="Check Cassandra Heap",
                          check_cmd=heap_cmd)

    disk_warn = options.get('nagios_disk_warn_pct')
    disk_crit = options.get('nagios_disk_crit_pct')
    db_dirs = helpers.get_all_database_directories()
    # A set, so a directory shared by several roles is checked only once.
    monitored_dirs = set(db_dirs['data_file_directories'] +
                         [db_dirs['commitlog_directory'],
                          db_dirs['saved_caches_directory']])
    for directory in monitored_dirs:
        # Anything outside [A-Za-z0-9_] becomes '_' in the check name.
        suffix = re.sub('[^A-Za-z0-9_]', '_', directory)
        if not (disk_warn and disk_crit):
            continue
        shortname = "cassandra_disk{}".format(suffix)
        hookenv.log("Adding disk utilization check {}".format(shortname),
                    DEBUG)
        disk_cmd = "check_disk -u GB -w {}% -c {}% -K 5% -p {}".format(
            disk_warn, disk_crit, directory)
        checker.add_check(
            shortname=shortname,
            description="Check Cassandra Disk {}".format(directory),
            check_cmd=disk_cmd)

    checker.write()
def configure_nrpe_checks():
    """Install ``check_docker`` and register the netbox NRPE checks.

    One HTTP check on localhost port 80 is added, plus one ``check_docker``
    check for each of the netbox, netbox-worker, nginx and redis containers.
    """
    install_nagios_plugin_from_file(
        source_file_path='/opt/netbox-docker/checks/check_docker',
        plugin_name='check_docker')

    checker = nrpe.NRPE(hostname=nrpe.get_nagios_hostname(), primary=True)

    http_cmd = '{check_path} -H localhost -p 80'.format(
        check_path='/usr/lib/nagios/plugins/check_http')
    checker.add_check(shortname='check_http_netbox',
                      description='Check netbox web server',
                      check_cmd=http_cmd)

    for name in ('netbox', 'netbox-worker', 'nginx', 'redis'):
        docker_cmd = '{check_path} --containers {container} {params}'.format(
            check_path='/usr/lib/nagios/plugins/check_docker',
            container=name,
            params=config('check_docker_params'))
        checker.add_check(
            shortname='check_docker_{container}'.format(container=name),
            description='Check netbox {container} container'.format(
                container=name),
            check_cmd=docker_cmd)

    checker.write()
    set_flag('netbox.nrpe.configured')
def update_nrpe_config():
    """Write the ntp service check and the configured ntpmon checks.

    Existing ntpmon checks are removed first. When the
    ``nagios_ntpmon_checks`` option names exactly the full set of checks, a
    single combined ``ntpmon`` check is added. Otherwise one check is added
    per non-empty requested name.
    """
    # check_upstart_job needs python-dbus; check_ntpmon needs python-psutil
    fetch.apt_install(['python-dbus', 'python-psutil'])
    requested = hookenv.config('nagios_ntpmon_checks').split(" ")
    if os.path.isdir(NAGIOS_PLUGINS):
        plugin_source = os.path.join(os.getenv('CHARM_DIR'), 'files',
                                     'nagios', 'check_ntpmon.py')
        host.rsync(plugin_source,
                   os.path.join(NAGIOS_PLUGINS, 'check_ntpmon.py'))

    nagios_host = nrpe.get_nagios_hostname()
    unit_name = nrpe.get_nagios_unit_name()
    checker = nrpe.NRPE(hostname=nagios_host)
    nrpe.add_init_service_checks(checker, ['ntp'], unit_name)

    known_checks = set(['offset', 'peers', 'reachability', 'sync'])

    # Drop whatever ntpmon checks an earlier run created.
    checker.remove_check(shortname="ntpmon")
    for name in known_checks:
        checker.remove_check(shortname="ntpmon_%s" % name)

    if set(requested) != known_checks:
        for name in requested:
            if len(name) > 0:
                checker.add_check(
                    shortname="ntpmon_%s" % name,
                    description='Check NTPmon %s {%s}' % (name, unit_name),
                    check_cmd='check_ntpmon.py --check %s' % name)
    else:
        # Every check was requested: one combined check reduces Nagios noise.
        checker.add_check(
            shortname="ntpmon",
            description='Check NTPmon {}'.format(unit_name),
            check_cmd='check_ntpmon.py')

    checker.write()
def update_nrpe_config():
    """Write the analytics API and contrail-status NRPE checks.

    The API check probes port 8081 on the address that
    ``common_utils.get_ip()`` returns.
    """
    plugins_dir = '/usr/local/lib/nagios/plugins'
    checker = nrpe.NRPE()
    api_address = common_utils.get_ip()
    common_utils.rsync_nrpe_checks(plugins_dir)
    common_utils.add_nagios_to_sudoers()

    checker.add_check(
        shortname='check_analytics_api',
        description='Check Contrail Analytics API',
        check_cmd='check_http -H {} -p 8081'.format(api_address),
    )
    checker.add_check(
        shortname='check_contrail_status_' + MODULE,
        description='Check contrail-status',
        check_cmd=common_utils.contrail_status_cmd(MODULE, plugins_dir),
    )
    checker.write()
def update_nrpe_config():
    """Register the worker NRPE checks and prepare the Nagios kubeconfig.

    Renders and installs the ``check_k8s_worker.py`` plugin, adds a ``node``
    check plus init-service checks for every entry of ``worker_services``,
    and writes the NRPE config. When credentials and API servers are both
    available it creates a kubeconfig at ``nrpe_kubeconfig_path`` and chowns
    the Nagios directories. Finally it requests the ``performance`` CPU
    governor on every ``nrpe-external-master`` relation.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed view of this function -- confirm which statements
    belong to the ``if creds and servers`` branch (in particular the chown
    loop and the remove_state/set_state pair).
    """
    services = ["snap.{}.daemon".format(s) for s in worker_services]
    data = render("nagios_plugin.py", None, {"node_name": get_node_name()})
    plugin_path = install_nagios_plugin_from_text(data, "check_k8s_worker.py")
    hostname = nrpe.get_nagios_hostname()
    current_unit = nrpe.get_nagios_unit_name()
    nrpe_setup = nrpe.NRPE(hostname=hostname)
    nrpe_setup.add_check("node", "Node registered with API Server", str(plugin_path))
    nrpe.add_init_service_checks(nrpe_setup, services, current_unit)
    nrpe_setup.write()

    creds = db.get("credentials")
    servers = get_kube_api_servers()
    if creds and servers:
        # Pick an API server by unit number modulo the number of servers.
        server = servers[get_unit_number() % len(servers)]
        # NOTE(review): the user literal appears masked in this view --
        # confirm the intended value.
        create_kubeconfig(
            nrpe_kubeconfig_path,
            server,
            ca_crt_path,
            token=creds["client_token"],
            user="******",
        )
        # Make sure Nagios dirs are the correct permissions.
        cmd = ["chown", "-R", "nagios:nagios"]
        for p in ["/var/lib/nagios/", os.path.dirname(nrpe_kubeconfig_path)]:
            if os.path.exists(p):
                check_call(cmd + [p])

        remove_state("nrpe-external-master.reconfigure")
        set_state("nrpe-external-master.initial-config")
    # request CPU governor check from nrpe relation to be performance
    rel_settings = {
        "requested_cpu_governor": "performance",
    }
    for rid in hookenv.relation_ids("nrpe-external-master"):
        hookenv.relation_set(relation_id=rid, relation_settings=rel_settings)
def update_nrpe_checks():
    """Sync the RabbitMQ Nagios plugins and register the NRPE checks.

    Steps, in order:

    * rsync the check scripts into ``NAGIOS_PLUGINS`` when that directory
      exists;
    * install or remove the stats cron file depending on the
      ``stats_cron_schedule`` option;
    * create a monitoring user, then create and grant each monitored vhost
      (the unit's own vhost plus any named in ``check-vhosts``);
    * add a non-SSL and/or SSL check per vhost according to the ``ssl``
      option, plus the optional queue and cluster checks;
    * write the NRPE configuration.

    NOTE(review): block nesting was reconstructed from a
    whitespace-collapsed view of this function -- confirm it against the
    real layout before relying on it.
    """
    if os.path.isdir(NAGIOS_PLUGINS):
        rsync(os.path.join(charm_dir(), 'files', 'check_rabbitmq.py'),
              os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq.py'))
        rsync(os.path.join(charm_dir(), 'files', 'check_rabbitmq_queues.py'),
              os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq_queues.py'))
        if config('management_plugin'):
            rsync(
                os.path.join(charm_dir(), 'files', 'check_rabbitmq_cluster.py'),
                os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq_cluster.py'))

    if config('stats_cron_schedule'):
        script = os.path.join(SCRIPTS_DIR, 'collect_rabbitmq_stats.sh')
        cronjob = CRONJOB_CMD.format(schedule=config('stats_cron_schedule'),
                                     timeout=config('cron-timeout'),
                                     command=script)
        rsync(os.path.join(charm_dir(), 'files', 'collect_rabbitmq_stats.sh'),
              script)
        write_file(STATS_CRONFILE, cronjob)
    elif os.path.isfile(STATS_CRONFILE):
        # No schedule configured: drop a cron file left by an earlier run.
        os.remove(STATS_CRONFILE)

    # Find out if nrpe set nagios_hostname
    hostname = nrpe.get_nagios_hostname()
    myunit = nrpe.get_nagios_unit_name()

    # create unique user and vhost for each unit
    current_unit = local_unit().replace('/', '-')
    # NOTE(review): as shown, this literal has no '{}' placeholder, so
    # .format() leaves it unchanged -- it looks masked; confirm the intended
    # user-name template.
    user = '******'.format(current_unit)
    vhosts = [{'vhost': user, 'shortname': rabbit.RABBIT_USER}]
    password = rabbit.get_rabbit_password(user, local=True)

    nrpe_compat = nrpe.NRPE(hostname=hostname)
    rabbit.create_user(user, password, ['monitoring'])

    if config('check-vhosts'):
        # Space-separated list; empty fragments (repeated spaces) are skipped.
        for other_vhost in config('check-vhosts').split(' '):
            if other_vhost:
                item = {
                    'vhost': other_vhost,
                    'shortname': 'rabbit_{}'.format(other_vhost)
                }
                vhosts.append(item)

    for vhost in vhosts:
        rabbit.create_vhost(vhost['vhost'])
        rabbit.grant_permissions(user, vhost['vhost'])
        # ssl == 'on' matches both branches, so it yields both checks.
        if config('ssl') in ['off', 'on']:
            cmd = ('{}/check_rabbitmq.py --user {} --password {} '
                   '--vhost {}'.format(NAGIOS_PLUGINS, user, password, vhost['vhost']))
            log('Adding rabbitmq non-SSL check for {}'.format(vhost['vhost']),
                level=DEBUG)
            description = 'Check RabbitMQ {} {}'.format(myunit, vhost['vhost'])
            nrpe_compat.add_check(shortname=vhost['shortname'],
                                  description=description,
                                  check_cmd=cmd)
        if config('ssl') in ['only', 'on']:
            cmd = ('{}/check_rabbitmq.py --user {} --password {} '
                   '--vhost {} --ssl --ssl-ca {} --port {}'.format(
                       NAGIOS_PLUGINS, user, password, vhost['vhost'], SSL_CA_FILE, int(config('ssl_port'))))
            log('Adding rabbitmq SSL check for {}'.format(vhost['vhost']),
                level=DEBUG)
            description = 'Check RabbitMQ (SSL) {} {}'.format(
                myunit, vhost['vhost'])
            nrpe_compat.add_check(shortname=vhost['shortname'] + "_ssl",
                                  description=description,
                                  check_cmd=cmd)

    if config('queue_thresholds'):
        cmd = ""
        # If value of queue_thresholds is incorrect we want the hook to fail
        for item in yaml.safe_load(config('queue_thresholds')):
            # Each item is unpacked into the four '-c' arguments.
            cmd += ' -c "{}" "{}" {} {}'.format(*item)
        nrpe_compat.add_check(
            shortname=rabbit.RABBIT_USER + '_queue',
            description='Check RabbitMQ Queues',
            check_cmd='{}/check_rabbitmq_queues.py{} {}'.format(
                NAGIOS_PLUGINS, cmd, STATS_DATAFILE))
    if config('management_plugin'):
        # add NRPE check
        _check_cmd = (
            '{}/check_rabbitmq_cluster.py --port {} --user {} --password {}'.
            format(NAGIOS_PLUGINS, rabbit.get_managment_port(), user, password))
        nrpe_compat.add_check(shortname=rabbit.RABBIT_USER + '_cluster',
                              description='Check RabbitMQ Cluster',
                              check_cmd=_check_cmd)

    nrpe_compat.write()
def update_nrpe_config():
    """Validate the alert options, copy helper files, write the HA checks.

    Sets a ``blocked`` status and returns early when
    ``failed_actions_alert_type`` is not one of ignore/warning/critical or
    when ``failed_actions_threshold`` is negative. Otherwise it copies the
    NRPE scripts and sudoers files, builds the ``check_crm`` command line
    and, if the NRPE config directory exists, writes the checks.

    NOTE(review): block nesting was reconstructed from a
    whitespace-collapsed view of this function -- confirm that everything
    from the ring check through ``write()`` belongs under the
    ``does_nrpe_conf_dir_exist()`` guard.
    """
    # Validate options (DEPRECATED)
    accepted_alerts = ['ignore', 'warning', 'critical']
    if config('failed_actions_alert_type').lower() not in accepted_alerts:
        status_set(
            'blocked',
            'The value of option failed_actions_alert_type must be '
            'among {}'.format(accepted_alerts))
        return
    if config('failed_actions_threshold') < 0:
        status_set(
            'blocked',
            'The value of option failed_actions_threshold must be a '
            'positive integer')
        return

    charm_files = os.path.join(os.environ["CHARM_DIR"], "files")
    plugins_dir = "/usr/local/lib/nagios/plugins"
    if not os.path.exists(plugins_dir):
        os.makedirs(plugins_dir)
    copy_plan = (
        (os.path.join(charm_files, "nrpe"), plugins_dir),
        (os.path.join(charm_files, "sudoers"), "/etc/sudoers.d"),
    )
    for source_dir, target_dir in copy_plan:
        for path in glob.glob(os.path.join(source_dir, "*")):
            if not os.path.isfile(path):
                continue
            shutil.copy2(path,
                         os.path.join(target_dir, os.path.basename(path)))

    nagios_host = nrpe.get_nagios_hostname()
    unit_name = nrpe.get_nagios_unit_name()
    checker = nrpe.NRPE(hostname=nagios_host)

    apt_install('python-dbus')

    # Assemble the check_crm arguments, then join them with single spaces.
    crm_args = [
        'check_crm -s',
        '--failedactions={}'.format(
            config('failed_actions_alert_type').lower()),
    ]
    if config('failed_actions_threshold'):
        crm_args.append('--failcount={}'.format(
            config('failed_actions_threshold')))
    for severity in ('warn', 'crit'):
        crm_args.append('--failcount-{}={}'.format(
            severity, config('res_failcount_{}'.format(severity)) or 0))
    check_crm_cmd = ' '.join(crm_args)

    if nrpe.NRPE.does_nrpe_conf_dir_exist():
        # corosync/crm checks
        # LP #1902919 - corosync 2.99 changed the ring status output for
        # udp/udpu so that it always reports 'OK', which leaves the ring
        # check with no value beyond the crm_status check. A package version
        # test would be more ideal, but populating the apt-cache object is
        # expensive on every config-changed hook, so the cheaper release
        # name comparison is used instead.
        ring_check = dict(
            shortname='corosync_rings',
            description='Check Corosync rings {}'.format(unit_name),
            check_cmd='check_corosync_rings',
        )
        if CompareHostReleases(get_distrib_codename()) < 'eoan':
            checker.add_check(**ring_check)
        else:
            checker.remove_check(**ring_check)

        checker.add_check(
            shortname='crm_status',
            description='Check crm status {}'.format(unit_name),
            check_cmd=check_crm_cmd)

        # process checks
        for shortname, label, process in (
                ('corosync_proc', 'Corosync', 'corosync'),
                ('pacemakerd_proc', 'Pacemakerd', 'pacemakerd')):
            checker.add_check(
                shortname=shortname,
                description='Check {} process {}'.format(label, unit_name),
                check_cmd='check_procs -c 1:1 -C {}'.format(process))
        checker.write()
def update_nrpe_checks():
    """Sync the RabbitMQ Nagios plugins and register the NRPE checks.

    Steps, in order:

    * rsync the check scripts into ``NAGIOS_PLUGINS`` when that directory
      exists;
    * install or remove the stats cron file depending on the
      ``stats_cron_schedule`` option;
    * create a per-unit vhost and monitoring user and grant permissions;
    * add the non-SSL and/or SSL check according to the ``ssl`` option,
      plus the optional queue and cluster checks;
    * write the NRPE configuration.

    NOTE(review): block nesting was reconstructed from a
    whitespace-collapsed view of this function -- confirm it against the
    real layout before relying on it.
    """
    if os.path.isdir(NAGIOS_PLUGINS):
        rsync(os.path.join(charm_dir(), 'scripts', 'check_rabbitmq.py'),
              os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq.py'))
        rsync(os.path.join(charm_dir(), 'scripts', 'check_rabbitmq_queues.py'),
              os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq_queues.py'))
        if config('management_plugin'):
            rsync(os.path.join(charm_dir(), 'scripts', 'check_rabbitmq_cluster.py'),
                  os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq_cluster.py'))

    if config('stats_cron_schedule'):
        script = os.path.join(SCRIPTS_DIR, 'collect_rabbitmq_stats.sh')
        cronjob = CRONJOB_CMD.format(schedule=config('stats_cron_schedule'),
                                     timeout=config('cron-timeout'),
                                     command=script)
        rsync(os.path.join(charm_dir(), 'scripts', 'collect_rabbitmq_stats.sh'),
              script)
        write_file(STATS_CRONFILE, cronjob)
    elif os.path.isfile(STATS_CRONFILE):
        # No schedule configured: drop a cron file left by an earlier run.
        os.remove(STATS_CRONFILE)

    # NOTE(review): this repeats the cluster-plugin rsync from the first
    # block -- confirm whether the duplicate is intentional.
    if config('management_plugin'):
        rsync(os.path.join(charm_dir(), 'scripts', 'check_rabbitmq_cluster.py'),
              os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq_cluster.py'))

    # Find out if nrpe set nagios_hostname
    hostname = nrpe.get_nagios_hostname()
    myunit = nrpe.get_nagios_unit_name()

    # create unique user and vhost for each unit
    current_unit = local_unit().replace('/', '-')
    # NOTE(review): as shown, this literal has no '%s' conversion, so the
    # '%' formatting would raise TypeError -- it looks masked; confirm the
    # intended user-name template.
    user = '******' % current_unit
    vhost = 'nagios-%s' % current_unit
    password = rabbit.get_rabbit_password(user, local=True)

    rabbit.create_vhost(vhost)
    rabbit.create_user(user, password, ['monitoring'])
    rabbit.grant_permissions(user, vhost)

    nrpe_compat = nrpe.NRPE(hostname=hostname)
    # ssl == 'on' matches both branches, so it yields both checks.
    if config('ssl') in ['off', 'on']:
        cmd = ('{plugins_dir}/check_rabbitmq.py --user {user} '
               '--password {password} --vhost {vhost}')
        cmd = cmd.format(plugins_dir=NAGIOS_PLUGINS, user=user,
                         password=password, vhost=vhost)
        nrpe_compat.add_check(
            shortname=rabbit.RABBIT_USER,
            description='Check RabbitMQ {%s}' % myunit,
            check_cmd=cmd
        )
    if config('ssl') in ['only', 'on']:
        log('Adding rabbitmq SSL check', level=DEBUG)
        cmd = ('{plugins_dir}/check_rabbitmq.py --user {user} '
               '--password {password} --vhost {vhost} '
               '--ssl --ssl-ca {ssl_ca} --port {port}')
        cmd = cmd.format(plugins_dir=NAGIOS_PLUGINS, user=user,
                         password=password, port=int(config('ssl_port')),
                         vhost=vhost, ssl_ca=SSL_CA_FILE)
        nrpe_compat.add_check(
            shortname=rabbit.RABBIT_USER + "_ssl",
            description='Check RabbitMQ (SSL) {%s}' % myunit,
            check_cmd=cmd
        )

    if config('queue_thresholds'):
        cmd = ""
        # If value of queue_thresholds is incorrect we want the hook to fail
        for item in yaml.safe_load(config('queue_thresholds')):
            # Each item is unpacked into the four '-c' arguments.
            cmd += ' -c "{}" "{}" {} {}'.format(*item)
        nrpe_compat.add_check(
            shortname=rabbit.RABBIT_USER + '_queue',
            description='Check RabbitMQ Queues',
            check_cmd='{}/check_rabbitmq_queues.py{} {}'.format(
                NAGIOS_PLUGINS, cmd, STATS_DATAFILE)
        )
    if config('management_plugin'):
        # add NRPE check
        nrpe_compat.add_check(
            shortname=rabbit.RABBIT_USER + '_cluster',
            description='Check RabbitMQ Cluster',
            check_cmd='{}/check_rabbitmq_cluster.py --port {} --user {} --password {}'.format(
                NAGIOS_PLUGINS,
                rabbit.get_managment_port(),
                user,
                password
            )
        )
    nrpe_compat.write()
def update_nrpe_config():
    """Refresh the swift-storage Nagios plugins and NRPE checks.

    Steps, in order:

    * create ``NAGIOS_PLUGINS`` if missing, then rsync the check scripts
      and the ``swift-storage`` sudoers file from ``$CHARM_DIR/files``;
    * add the ring/replication check with the ``nagios-check-params``
      option;
    * add one ``check_http`` probe of ``/recon/version`` per object,
      container and account server port;
    * add or remove the replicator-log check depending on
      ``nagios-replication-check-params``;
    * add init-service checks for ``SWIFT_SVCS`` and write the config.
    """
    # python-dbus is used by check_upstart_job
    apt_install('python-dbus')

    log('Refreshing nrpe checks')
    if not os.path.exists(NAGIOS_PLUGINS):
        mkpath(NAGIOS_PLUGINS)

    for plugin in ('check_swift_storage.py',
                   'check_timed_logs.pl',
                   'check_swift_replicator_logs.sh',
                   'check_swift_service'):
        rsync(os.path.join(os.getenv('CHARM_DIR'), 'files',
                           'nrpe-external-master', plugin),
              os.path.join(NAGIOS_PLUGINS, plugin))
    rsync(os.path.join(os.getenv('CHARM_DIR'), 'files', 'sudo',
                       'swift-storage'),
          os.path.join(SUDOERS_D, 'swift-storage'))

    # Find out if nrpe set nagios_hostname
    hostname = nrpe.get_nagios_hostname()
    current_unit = nrpe.get_nagios_unit_name()
    nrpe_setup = nrpe.NRPE(hostname=hostname)

    # check the rings and replication
    nrpe_setup.add_check(
        shortname='swift_storage',
        description='Check swift storage ring hashes and replication'
                    ' {%s}' % current_unit,
        check_cmd='check_swift_storage.py {}'.format(
            config('nagios-check-params')))

    # Build the command from adjacent literals rather than backslash
    # continuations inside one literal: a continuation inside a string keeps
    # the next line's leading indentation as part of the command text.
    recon_cmd = (
        '/usr/lib/nagios/plugins/check_http '
        '-I localhost -u /recon/version -p {} '
        '-e "OK"'
    )
    api_checks = (
        ('swift-object-server-api',
         'Check Swift Object Server API availability',
         config('object-server-port')),
        ('swift-container-server-api',
         'Check Swift Container Server API availability',
         config('container-server-port')),
        ('swift-account-server-api',
         'Check Swift Account Server API availability',
         config('account-server-port')),
    )
    for shortname, description, port in api_checks:
        nrpe_setup.add_check(
            shortname=shortname,
            description=description,
            check_cmd=recon_cmd.format(port))

    if config('nagios-replication-check-params'):
        nrpe_setup.add_check(
            shortname='swift_replicator_health',
            description='Check swift object replicator log reporting',
            check_cmd='check_swift_replicator_logs.sh {}'.format(
                config('nagios-replication-check-params')))
    else:
        # Option cleared: remove the check a previous run may have added.
        nrpe_setup.remove_check(shortname='swift_replicator_health')

    nrpe.add_init_service_checks(nrpe_setup, SWIFT_SVCS, current_unit)
    nrpe_setup.write()
def update_nrpe_config():
    """Write the NRPE configuration for this unit with no extra checks."""
    checker = nrpe.NRPE(hostname=nrpe.get_nagios_hostname())
    checker.write()
def test_update_nrpe(self):
    """Check the files and relation data produced by ``NRPE.write()``.

    Adds a single ``myservice`` check, calls ``write()`` and asserts the two
    opened file paths, the chunks written, the ``relation_set`` calls for
    both relations and the patched call counts.
    """
    self.patched['config'].return_value = {
        'nagios_context': 'a',
        'nagios_servicegroups': ''
    }
    self.patched['exists'].return_value = True

    def _rels(rname):
        # One relation id per supported relation name.
        relations = {
            'local-monitors': 'local-monitors:1',
            'nrpe-external-master': 'nrpe-external-master:2',
        }
        return [relations[rname]]
    self.patched['relation_ids'].side_effect = _rels

    checker = nrpe.NRPE()
    checker.add_check(shortname="myservice",
                      description="Check MyService",
                      check_cmd="check_http http://localhost")

    self.assertEqual(None, checker.write())

    self.assertEqual(2, self.patched['open'].call_count)
    filename = 'check_myservice.cfg'
    expected = [
        ('/etc/nagios/nrpe.d/%s' % filename, 'w'),
        ('/var/lib/nagios/export/service__a-testunit_%s' % filename, 'w'),
    ]
    # Positional args of each open() call, in call order.
    actual = [x[0] for x in self.patched['open'].call_args_list]
    self.assertEqual(expected, actual)
    outfile = self.patched['open'].return_value.__enter__.return_value
    # NOTE(review): the line breaks and column alignment inside this literal
    # are not recoverable from the collapsed view this edit was made from --
    # confirm it against the service template before relying on this test.
    service_file_contents = """ #--------------------------------------------------- # This file is Juju managed #--------------------------------------------------- define service { use active-service host_name a-testunit service_description a-testunit[myservice] Check MyService check_command check_nrpe!check_myservice servicegroups a } """
    expected = [
        '# check myservice\n',
        'command[check_myservice]=/usr/lib/nagios/plugins/check_http http://localhost\n',
        service_file_contents,
    ]
    # First positional arg of each write() on the shared file mock.
    actual = [x[0][0] for x in outfile.write.call_args_list]
    self.assertEqual(expected, actual)

    nrpe_monitors = {'myservice': {'command': 'check_myservice'}}
    monitors = yaml.dump({"monitors": {"remote": {"nrpe": nrpe_monitors}}})
    relation_set_calls = [
        call(monitors=monitors, relation_id="local-monitors:1"),
        call(monitors=monitors, relation_id="nrpe-external-master:2"),
    ]
    self.patched['relation_set'].assert_has_calls(relation_set_calls, any_order=True)
    self.check_call_counts(config=1, getpwnam=1, getgrnam=1, exists=3, open=2, listdir=1, relation_ids=2, relation_set=2)
def test_max_check_attmpts(self):
    """Write a check that sets ``max_check_attempts`` and verify the output.

    Asserts that the value passed to ``add_check`` appears both in the
    expected service definition and in the ``monitors`` data set on the
    relations, alongside the files opened and the patched-helper call counts.
    """
    self.patched['config'].return_value = {'nagios_context': 'a',
                                           'nagios_servicegroups': ''}
    self.patched['exists'].return_value = True
    # Every relation_get() call returns this same address data.
    self.patched['relation_get'].return_value = {
        'egress-subnets': '10.66.111.24/32',
        'ingress-address': '10.66.111.24',
        'private-address': '10.66.111.24'
    }

    def _rels(rname):
        # Map a relation name to a one-element list of relation ids; any
        # other relation name raises KeyError.
        relations = {
            'local-monitors': 'local-monitors:1',
            'nrpe-external-master': 'nrpe-external-master:2',
        }
        return [relations[rname]]
    self.patched['relation_ids'].side_effect = _rels

    checker = nrpe.NRPE()
    checker.add_check(shortname="myservice",
                      description="Check MyService",
                      check_cmd="check_http http://localhost",
                      max_check_attempts=8,
                      )

    # write() is expected to return None.
    self.assertEqual(None, checker.write())

    # Exactly two files are opened: the NRPE command file and the exported
    # Nagios service definition.
    self.assertEqual(2, self.patched['open'].call_count)
    filename = 'check_myservice.cfg'
    expected = [
        ('/etc/nagios/nrpe.d/%s' % filename, 'w'),
        ('/var/lib/nagios/export/service__a-testunit_%s' % filename, 'w'),
    ]
    # x[0] is the positional-argument tuple of each recorded open() call.
    actual = [x[0] for x in self.patched['open'].call_args_list]
    self.assertEqual(expected, actual)
    # Every patched open() returns the same mock, so writes to both files
    # are recorded, in order, on this single file-handle mock.
    outfile = self.patched['open'].return_value.__enter__.return_value
    # NOTE(review): this literal is kept exactly as it appears in the file,
    # on one line. A Nagios service definition is normally multi-line, so
    # confirm the whitespace here matches what NRPE.write() really emits.
    service_file_contents = """ #--------------------------------------------------- # This file is Juju managed #--------------------------------------------------- define service { use active-service host_name a-testunit service_description a-testunit[myservice] Check MyService check_command check_nrpe!check_myservice servicegroups a max_check_attempts 8 } """
    # The command file is expected to start with a juju-added header that
    # records the servicegroups, followed by the command definition.
    expected = [
        '# check myservice\n',
        '# The following header was added automatically by juju\n',
        '# Modifying it will affect nagios monitoring and alerting\n',
        '# servicegroups: a\n',
        'command[check_myservice]=/usr/lib/nagios/plugins/check_http http://localhost\n',
        service_file_contents,
    ]
    # x[0][0] is the first positional argument of each write() call.
    actual = [x[0][0] for x in outfile.write.call_args_list]
    self.assertEqual(expected, actual)

    # max_check_attempts must be carried into the monitors relation data.
    nrpe_monitors = {'myservice':
                     {'command': 'check_myservice',
                      'max_check_attempts': 8,
                      }}
    monitors = yaml.dump(
        {"monitors": {"remote": {"nrpe": nrpe_monitors}}})
    relation_set_calls = [
        call(monitors=monitors, relation_id="local-monitors:1"),
        call(monitors=monitors, relation_id="nrpe-external-master:2"),
    ]
    # assert_has_calls only requires these two calls to be present; the
    # call-count check below expects three relation_set calls in total.
    self.patched['relation_set'].assert_has_calls(relation_set_calls,
                                                  any_order=True)
    # Pin the total number of calls made to each patched helper.
    self.check_call_counts(config=1, getpwnam=1, getgrnam=1, exists=4,
                           open=2, listdir=1, relation_get=2,
                           relation_ids=3, relation_set=3)
def test_default_servicegroup(self):
    """Check the servicegroup fallback when the config omits it.

    Only ``nagios_context`` is present in the patched config; the NRPE
    object is expected to report that same value as its
    ``nagios_servicegroups``.
    """
    context = 'testctx'
    self.patched['config'].return_value = dict(nagios_context=context)

    servicegroups = nrpe.NRPE().nagios_servicegroups

    self.assertEqual(servicegroups, context)
def test_add_init_service_checks(self, mock_isdir):
    """Exercise ``nrpe.add_init_service_checks`` with and without systemd.

    Three scenarios are run in sequence:

    1. no systemd, ``/var/lib/nagios`` missing - no subprocess call is made;
    2. no systemd, ``/var/lib/nagios`` present - the haproxy init script
       status is collected through ``check_exit_status.pl``;
    3. systemd - every asserted service is checked with ``check_systemd.py``.

    The third scenario deliberately reuses the NRPE object from the second,
    so its checks are found at indices 2 onwards.
    """
    # Paths the patched os.path.exists reports as present.
    present_paths = (
        '/etc/init/apache2.conf',
        '/usr/lib/nagios/plugins/check_upstart_job',
        '/etc/init.d/haproxy',
        '/usr/lib/nagios/plugins/check_status_file.py',
        '/etc/cron.d/nagios-service-check-haproxy',
        '/var/lib/nagios/service-check-haproxy.txt',
        '/usr/lib/nagios/plugins/check_systemd.py',
    )
    self.patched['exists'].side_effect = (
        lambda init_file: init_file in present_paths)

    # Scenario 1: without systemd and /var/lib/nagios does not exist
    self.patched['init_is_systemd'].return_value = False
    mock_isdir.return_value = False
    checker = nrpe.NRPE()
    nrpe.add_init_service_checks(checker, ['apache2', 'haproxy'], 'testunit')
    mock_isdir.assert_called_with('/var/lib/nagios')
    self.patched['call'].assert_not_called()
    non_systemd_expected = [
        ('apache2', '/usr/lib/nagios/plugins/check_upstart_job apache2'),
        ('haproxy', '/usr/lib/nagios/plugins/check_status_file.py -f '
                    '/var/lib/nagios/service-check-haproxy.txt'),
    ]
    for position, (shortname, command) in enumerate(non_systemd_expected):
        self.assertEqual(checker.checks[position].shortname, shortname)
        self.assertEqual(checker.checks[position].check_cmd, command)

    # Scenario 2: without systemd and /var/lib/nagios does exist
    mock_isdir.return_value = True
    status_file = MagicMock()
    self.patched['open'].return_value = status_file
    checker = nrpe.NRPE()
    nrpe.add_init_service_checks(checker, ['apache2', 'haproxy'], 'testunit')
    mock_isdir.assert_called_with('/var/lib/nagios')
    self.patched['call'].assert_called_with(
        ['/usr/local/lib/nagios/plugins/check_exit_status.pl', '-e', '-s',
         '/etc/init.d/haproxy', 'status'],
        stdout=status_file,
        stderr=subprocess.STDOUT)

    # Scenario 3: regular services and snap services with systemd.
    # The same NRPE object is reused, so the new checks follow the two
    # that scenario 2 already added.
    self.patched['init_is_systemd'].return_value = True
    systemd_services = ['apache2', 'haproxy', 'snap.test.test',
                        'ceph-radosgw@hostname']
    nrpe.add_init_service_checks(checker, systemd_services, 'testunit')
    systemd_expected = [
        ('apache2', '/usr/lib/nagios/plugins/check_systemd.py apache2'),
        ('haproxy', '/usr/lib/nagios/plugins/check_systemd.py haproxy'),
        ('snap.test.test',
         '/usr/lib/nagios/plugins/check_systemd.py snap.test.test'),
    ]
    for position, (shortname, command) in enumerate(systemd_expected, 2):
        self.assertEqual(checker.checks[position].shortname, shortname)
        self.assertEqual(checker.checks[position].check_cmd, command)
def _copy_regular_files(src_dir, dst_dir):
    """Copy each regular file found directly in ``src_dir`` into ``dst_dir``.

    Entries matched by the glob that are not regular files (for example
    sub-directories) are skipped. Files keep their base name in ``dst_dir``.
    """
    for src_path in glob.glob(os.path.join(src_dir, "*")):
        if os.path.isfile(src_path):
            shutil.copy2(src_path,
                         os.path.join(dst_dir, os.path.basename(src_path)))


def update_nrpe_config():
    """Install the NRPE plugins and register the corosync/pacemaker checks.

    The ``failed_actions_alert_type`` and ``failed_actions_threshold`` charm
    options are validated first. If either is invalid the unit status is set
    to ``blocked`` and the function returns without copying files or writing
    any checks.

    Otherwise the function:

    * copies the charm's ``files/nrpe`` scripts into
      ``/usr/local/lib/nagios/plugins`` (creating the directory if needed)
      and the charm's ``files/sudoers`` entries into ``/etc/sudoers.d``;
    * installs the ``python-dbus`` package;
    * registers the ``corosync_rings``, ``crm_status``, ``corosync_proc`` and
      ``pacemakerd_proc`` checks and writes the NRPE configuration.

    Raises:
        KeyError: if the ``CHARM_DIR`` environment variable is not set.
    """
    # Validate options. Each option is read once so that the value that was
    # validated is the same value used to build the check command below.
    valid_alerts = ['ignore', 'warning', 'critical']
    alert_type = config('failed_actions_alert_type').lower()
    if alert_type not in valid_alerts:
        status_set(
            'blocked',
            'The value of option failed_actions_alert_type must be '
            'among {}'.format(valid_alerts))
        return

    threshold = config('failed_actions_threshold')
    if threshold <= 0:
        status_set(
            'blocked',
            'The value of option failed_actions_threshold must be a '
            'positive integer')
        return

    charm_dir = os.environ["CHARM_DIR"]

    # Plugin scripts shipped with the charm.
    scripts_dst = "/usr/local/lib/nagios/plugins"
    if not os.path.exists(scripts_dst):
        os.makedirs(scripts_dst)
    _copy_regular_files(os.path.join(charm_dir, "files", "nrpe"), scripts_dst)

    # Sudoers entries shipped with the charm.
    _copy_regular_files(os.path.join(charm_dir, "files", "sudoers"),
                        "/etc/sudoers.d")

    hostname = nrpe.get_nagios_hostname()
    current_unit = nrpe.get_nagios_unit_name()
    nrpe_setup = nrpe.NRPE(hostname=hostname)

    apt_install('python-dbus')

    # With 'ignore' no fail-count threshold is passed to check_crm at all.
    if alert_type == 'ignore':
        check_crm_cmd = 'check_crm --failedactions=ignore'
    else:
        check_crm_cmd = ('check_crm --failcounts={} --failedactions={}'.format(
            threshold, alert_type))

    # corosync/crm checks
    nrpe_setup.add_check(
        shortname='corosync_rings',
        description='Check Corosync rings {}'.format(current_unit),
        check_cmd='check_corosync_rings')
    nrpe_setup.add_check(
        shortname='crm_status',
        description='Check crm status {}'.format(current_unit),
        check_cmd=check_crm_cmd)

    # process checks
    nrpe_setup.add_check(
        shortname='corosync_proc',
        description='Check Corosync process {}'.format(current_unit),
        check_cmd='check_procs -c 1:1 -C corosync')
    nrpe_setup.add_check(
        shortname='pacemakerd_proc',
        description='Check Pacemakerd process {}'.format(current_unit),
        check_cmd='check_procs -c 1:1 -C pacemakerd')
    nrpe_setup.write()