# Example #1
# 0
def check_contrail_alarms(contrail_vip, token):
    """Check the alarms in Contrail Analytics.

    Fetches ``/analytics/alarms`` on port 8081 of the given VIP, hands the
    decoded JSON to ``parse_contrail_alarms`` and turns the returned message
    into a Nagios result based on its prefix.

    @param str contrail_vip: VIP of Contrail
    @param str token: Token for the authentication
    @returns: None
    @raises nagios_plugin3.CriticalError: the API could not be reached, it
        answered with a status other than 200, or the parsed message starts
        with 'CRITICAL: '
    @raises nagios_plugin3.WarnError: the parsed message starts with
        'WARNING: '
    """
    url = 'http://{}:8081/analytics/alarms'.format(contrail_vip)
    headers = {'X-Auth-Token': token}
    try:
        r = requests.get(url=url, headers=headers)
    except requests.exceptions.ConnectionError as error:
        raise nagios_plugin3.CriticalError(
            'CRITICAL: contrail analytics API error: {}'.format(error))

    if r.status_code != 200:
        # Report the same attribute that was just tested; the response
        # object has no ``code`` attribute, so the old message construction
        # failed with AttributeError instead of raising the alert.
        raise nagios_plugin3.CriticalError(
            'CRITICAL: contrail analytics API return code is {}'.format(
                r.status_code))

    result = r.json()
    msg = parse_contrail_alarms(result)

    if msg.startswith('CRITICAL: '):
        raise nagios_plugin3.CriticalError(msg)
    elif msg.startswith('WARNING: '):
        raise nagios_plugin3.WarnError(msg)
    print('OK: no unacknowledged or sev>0 contrail analytics alarms')
# Example #2
# 0
def check_loadbalancers(connection):
    """Check the status of all load balancers.

    Checks are run in order and the first failing one aborts the run:

    1. every enabled load balancer has provisioning_status ACTIVE
    2. every enabled load balancer has operating_status ONLINE
    3. the VIP port of every enabled load balancer can be fetched
    4. no load balancer is administratively disabled (warning only)

    Prints an OK line when nothing was raised.

    :param connection: object exposing ``load_balancer`` and ``network``
        managers (``load_balancers()`` and ``get_port()`` are called)
    :raises nagios_plugin3.CriticalError: checks 1-3 failed
    :raises nagios_plugin3.WarnError: check 4 failed
    """

    lb_mgr = connection.load_balancer
    # Materialize the listing: it is iterated twice below (enabled and
    # disabled), and a one-shot generator would silently yield nothing on
    # the second pass, hiding the disabled-loadbalancer warning.
    lb_all = list(lb_mgr.load_balancers())

    # only check enabled lbs
    lb_enabled = [lb for lb in lb_all if lb.is_admin_state_up]

    # check provisioning_status is ACTIVE for each lb
    lbs = [lb for lb in lb_enabled if lb.provisioning_status != 'ACTIVE']
    if lbs:
        items = [
            'loadbalancer {} provisioning_status is {}'.format(
                lb.id, lb.provisioning_status) for lb in lbs
        ]
        output = 'CRITICAL: {}'.format(', '.join(items))
        raise nagios_plugin3.CriticalError(output)

    # check operating_status is ONLINE for each lb
    lbs = [lb for lb in lb_enabled if lb.operating_status != 'ONLINE']
    if lbs:
        items = [
            'loadbalancer {} operating_status is {}'.format(
                lb.id, lb.operating_status) for lb in lbs
        ]
        output = 'CRITICAL: {}'.format(', '.join(items))
        raise nagios_plugin3.CriticalError(output)

    net_mgr = connection.network
    # check vip port exists for each lb; collect all missing ports so the
    # alert lists every affected load balancer, not just the first
    lbs = []
    for lb in lb_enabled:
        try:
            net_mgr.get_port(lb.vip_port_id)
        except openstack.exceptions.NotFoundException:
            lbs.append(lb)
    if lbs:
        items = [
            'vip port {} for loadbalancer {} not found'.format(
                lb.vip_port_id, lb.id) for lb in lbs
        ]
        output = 'CRITICAL: {}'.format(', '.join(items))
        raise nagios_plugin3.CriticalError(output)

    # warn about disabled lbs if no critical error found
    lb_disabled = [lb for lb in lb_all if not lb.is_admin_state_up]
    if lb_disabled:
        items = [
            'loadbalancer {} admin_state_up is False'.format(lb.id)
            for lb in lb_disabled
        ]
        output = 'WARNING: {}'.format(', '.join(items))
        raise nagios_plugin3.WarnError(output)

    print('OK: Loadbalancers are happy')
def check_node(node):
    """Evaluate the status conditions of one node and raise on problems.

    Every entry of the check table below is looked up by type in
    ``node['status']['conditions']``. All conditions whose status differs
    from the expected value (compared case-insensitively) are collected so
    that a single alert reports every problem.

    :param node: mapping with a ``status.conditions`` list whose entries
        carry ``type`` and ``status`` keys
    :returns: None when every condition has its expected status
    :raises nagios_plugin3.CriticalError: a condition is missing from the
        node, or a check of type 'error' failed
    :raises nagios_plugin3.WarnError: only checks of type 'warn' failed
    """
    # Note: Keep the Ready check first since all checks will fail when not Ready
    checks = [
        {
            'name': 'Ready',
            'expected': 'True',
            'type': 'error',
            'error': 'Node Not Ready'
        },
        {
            'name': 'MemoryPressure',
            'expected': 'False',
            'type': 'warn',
            'error': 'Memory Pressure'
        },
        {
            'name': 'DiskPressure',
            'expected': 'False',
            'type': 'warn',
            'error': 'Disk Pressure'
        },
        {
            'name': 'PIDPressure',
            'expected': 'False',
            'type': 'warn',
            'error': 'PID Pressure'
        },
    ]
    msg = []
    error = False
    conditions = node['status']['conditions']
    for check in checks:
        # find the status that matches
        for s in conditions:
            if s['type'] != check['name']:
                continue
            # does it match expectations? If not, toss it on the list
            # of errors so we don't show the first issue, but all.
            if s['status'].lower() != check['expected'].lower():
                msg.append(check['error'])
                if check['type'] == 'error':
                    error = True
            # The condition was found, so stop searching whether or not it
            # passed; otherwise the for-else below would misreport a failing
            # condition as a missing one.
            break
        else:
            err_msg = 'Unable to find status for {}'.format(check['error'])
            raise nagios_plugin3.CriticalError(err_msg)

    if msg:
        # join the findings so the alert text is readable rather than the
        # repr of a list
        output = ', '.join(msg)
        if error:
            raise nagios_plugin3.CriticalError(output)
        raise nagios_plugin3.WarnError(output)
def verify_node_registered_and_ready():
    """Verify this node is registered with the API server and healthy.

    Lists all nodes through kubectl, locates the entry whose metadata name
    equals the ``{{node_name}}`` placeholder and passes it to
    ``check_node``.

    :returns: None once the node was found and ``check_node`` did not raise
    :raises nagios_plugin3.CriticalError: kubectl failed, its output could
        not be parsed, or no node with the expected name is listed
    """
    cmd = [
        "/snap/bin/kubectl", "--kubeconfig", "/var/lib/nagios/.kube/config",
        "get", "no", "-o=yaml"
    ]
    try:
        # safe_load instead of yaml.load: calling load without an explicit
        # Loader is unsafe and is refused by recent PyYAML releases, which
        # the broad except below would turn into a permanent false alert.
        y = yaml.safe_load(check_output(cmd))
    except Exception:
        raise nagios_plugin3.CriticalError("Unable to run kubectl "
                                           "and parse output")
    for node in y['items']:
        if node['metadata']['name'] == '{{node_name}}':
            check_node(node)
            return
    # reached only when no listed node carries the expected name
    raise nagios_plugin3.CriticalError("Unable to find "
                                       "node registered on API server")
# Example #5
# 0
def check_amphorae(connection):
    """Check amphorae status.

    Requests ``/v2/octavia/amphorae`` through the load balancer manager.
    A response other than HTTP 200 ends the check silently. Otherwise
    amphorae in ERROR raise a critical alert; failing that, amphorae in
    PENDING_CREATE, PENDING_DELETE or BOOTING raise a warning; failing
    that, an OK line is printed.
    """
    response = connection.load_balancer.get('/v2/octavia/amphorae')
    if response.status_code != 200:
        return

    amphorae = json.loads(response.content).get('amphorae', [])

    def describe(statuses):
        # comma-separated report of every amphora whose status is listed
        return ', '.join(
            'amphroa {} status is {}'.format(amp['id'], amp['status'])
            for amp in amphorae
            if amp['status'] in statuses)

    # raise CRITICAL for ERROR status
    errored = describe(('ERROR', ))
    if errored:
        raise nagios_plugin3.CriticalError('CRITICAL: {}'.format(errored))

    # raise WARNING for transitional statuses
    transitional = describe(('PENDING_CREATE', 'PENDING_DELETE', 'BOOTING'))
    if transitional:
        raise nagios_plugin3.WarnError('WARNING: {}'.format(transitional))

    print('OK: Amphorae are happy')
# Example #6
# 0
def check_pools(connection):
    """Check pools status.

    Only pools with ``is_admin_state_up`` set are considered. The first
    failing rule raises: provisioning_status other than ACTIVE (critical),
    operating_status ERROR (critical), operating_status NO_MONITOR
    (warning). An OK line is printed when no rule fails.
    """
    enabled = [
        candidate for candidate in connection.load_balancer.pools()
        if candidate.is_admin_state_up
    ]

    def operating_report(status):
        # comma-separated report of enabled pools in the given state
        return ', '.join(
            'pool {} operating_status is {}'.format(p.id, p.operating_status)
            for p in enabled
            if p.operating_status == status)

    # check provisioning_status is ACTIVE for each pool
    not_active = ', '.join(
        'pool {} provisioning_status is {}'.format(
            p.id, p.provisioning_status)
        for p in enabled
        if p.provisioning_status != 'ACTIVE')
    if not_active:
        raise nagios_plugin3.CriticalError('CRITICAL: {}'.format(not_active))

    # raise CRITICAL if ERROR
    in_error = operating_report('ERROR')
    if in_error:
        raise nagios_plugin3.CriticalError('CRITICAL: {}'.format(in_error))

    # raise WARNING if NO_MONITOR
    unmonitored = operating_report('NO_MONITOR')
    if unmonitored:
        raise nagios_plugin3.WarnError('WARNING: {}'.format(unmonitored))

    print('OK: Pools are happy')
def verify_node_registered_and_ready():
    """Fetch this node from the API server and run the node checks on it.

    Asks kubectl for the node named by the ``{{node_name}}`` placeholder
    and passes the parsed YAML to ``check_node``. A kubectl failure whose
    stderr mentions "not found" is reported as an unregistered node; any
    other kubectl failure, or empty output, is reported as a kubectl/parse
    problem.
    """
    kubectl_get_node = [
        "/snap/bin/kubectl",
        "--kubeconfig", "/var/lib/nagios/.kube/config",
        "get", "no", "{{node_name}}",
        "-o=yaml",
    ]
    try:
        raw_yaml = check_output(kubectl_get_node, stderr=PIPE)
        node = yaml.safe_load(raw_yaml)
    except CalledProcessError as failure:
        if "not found" in failure.stderr.decode('UTF-8'):
            raise nagios_plugin3.CriticalError("Unable to find "
                                               "node registered on API server")
        # any other kubectl failure is reported by the emptiness check below
        node = None

    if node:
        return check_node(node)
    raise nagios_plugin3.CriticalError("Unable to run kubectl "
                                       "and parse output")
def check_snaps_installed():
    """Raise a critical error for the first snap that `snap list` rejects.

    Each name in ``snap_resources`` is passed to ``snap list``; any
    exception while running the command or decoding its output is treated
    as the snap not being installed.
    """
    for snap_name in snap_resources:
        try:
            listing = check_output(['snap', 'list', snap_name])
            listing.decode('UTF-8')
        except Exception:
            raise nagios_plugin3.CriticalError(
                '{} snap is not installed'.format(snap_name))
# Example #9
# 0
def check_alarms():
    """Raise a critical error when the loaded alarm list has content.

    Every line returned by ``load_alarm_list`` is stripped; the non-empty
    ones are joined with single spaces to form the error message.
    """
    stripped_lines = (raw.strip() for raw in load_alarm_list().splitlines())
    alarms = [entry for entry in stripped_lines if entry]
    if alarms:
        raise nagios_plugin3.CriticalError(' '.join(alarms))
# Example #10
# 0
def check_nova_services(args, nova):
    """Check nova-compute services per host aggregate.

    Every aggregate that is not listed in ``args.skip_aggregates`` is
    passed to ``check_hosts_up`` together with the nova-compute services.
    nova-compute hosts that were not covered by any checked aggregate are
    evaluated as one extra group named '(not-part-of-any-agg)'.

    :param args: options object; ``skip_aggregates`` is a comma-separated
        list of aggregate names to ignore (matched case-insensitively,
        surrounding whitespace ignored)
    :param nova: client whose ``get(path).json()`` returns the decoded
        '/os-aggregates' and '/os-services' responses
    :raises nagios_plugin3.CriticalError: a group reported 'critical'
    :raises nagios_plugin3.WarnError: a group reported 'warning' and none
        reported 'critical'
    """
    aggregates = nova.get('/os-aggregates').json()['aggregates']
    services = nova.get('/os-services').json()['services']
    services_compute = [x for x in services if x['binary'] == 'nova-compute']

    # Built once instead of on every loop iteration. Names are stripped so
    # a list written as "agg1, agg2" matches, and blank entries are dropped.
    raw_skip = getattr(args, 'skip_aggregates', None) or ''
    skipped_aggregates = {
        name.strip().lower() for name in raw_skip.split(',') if name.strip()
    }

    msg = ['nova-compute']
    status = []
    hosts_checked = set()
    for agg in aggregates:
        if agg['name'].lower() in skipped_aggregates:
            continue
        # get a list of hosts, pass to the function
        hosts = agg['hosts']
        hosts_checked.update(hosts)
        status.append(
            check_hosts_up(args, agg['name'], hosts, services_compute))

    # find hosts that haven't been checked already
    # NOTE(review): hosts that only belong to skipped aggregates land here
    # too, as in the original logic - confirm this is intended.
    hosts_not_checked = [
        x['host'] for x in services_compute if x['host'] not in hosts_checked
    ]
    if hosts_not_checked:
        status.append(
            check_hosts_up(args, '(not-part-of-any-agg)', hosts_not_checked,
                           services_compute))

    # each check_hosts_up result is read through these three keys
    status_crit = any(group['critical'] for group in status)
    status_warn = any(group['warning'] for group in status)
    msg.extend(x['msg_text'] for x in status if x['msg_text'] != '')
    if status_crit:
        output = 'CRITICAL: {}'.format(', '.join(msg))
        raise nagios_plugin3.CriticalError(output)
    if status_warn:
        output = 'WARNING: {}'.format(', '.join(msg))
        raise nagios_plugin3.WarnError(output)
    print('OK: Nova-compute services happy')
# Example #11
# 0
def verify_remote_connection_to_apiserver():
    """Check that port 6443 answers on this host's resolved address.

    The local hostname is resolved to an address which is handed to
    ``test_connection``; any exception along the way is reported as a
    critical error.
    """
    try:
        local_address = socket.gethostbyname(socket.gethostname())
        test_connection(local_address, 6443)
    except Exception:
        raise nagios_plugin3.CriticalError("Unable to reach "
                                           "API server on remote port")