def main(): """ Main function to run the check """ args = parse_args() metric_sender = MetricSender(verbose=args.verbose, debug=args.debug) filesys_full_metric = ['filesys.full'] filesys_inode_derived_metrics = { 'filesys.inodes.pused': 'filesys.usedfiles / (filesys.usedfiles + filesys.freefiles) * 100' } discovery_key_fs = 'disc.filesys' item_prototype_macro_fs = '#OSO_FILESYS' item_prototype_key_full = 'disc.filesys.full' item_prototype_key_inode = 'disc.filesys.inodes.pused' # Get the disk space filesys_full_metrics = pminfo.get_metrics(filesys_full_metric) filtered_filesys_metrics = filter_out_docker_filesystems( filesys_full_metrics, 'filesys.full.') if args.filter_pod_pv: filtered_filesys_metrics = filter_out_customer_pv_filesystems( filtered_filesys_metrics) if args.force_send_zeros: filtered_filesys_metrics = zero_mount_percentages( filtered_filesys_metrics) metric_sender.add_dynamic_metric(discovery_key_fs, item_prototype_macro_fs, filtered_filesys_metrics.keys()) for filesys_name, filesys_full in filtered_filesys_metrics.iteritems(): metric_sender.add_metric( {'%s[%s]' % (item_prototype_key_full, filesys_name): filesys_full}) # Get filesytem inode metrics filesys_inode_metrics = pminfo.get_metrics( derived_metrics=filesys_inode_derived_metrics) filtered_filesys_inode_metrics = filter_out_docker_filesystems( filesys_inode_metrics, 'filesys.inodes.pused.') if args.filter_pod_pv: filtered_filesys_inode_metrics = filter_out_customer_pv_filesystems( filtered_filesys_inode_metrics) if args.force_send_zeros: filtered_filesys_inode_metrics = zero_mount_percentages( filtered_filesys_inode_metrics) for filesys_name, filesys_inodes in filtered_filesys_inode_metrics.iteritems( ): metric_sender.add_metric({ '%s[%s]' % (item_prototype_key_inode, filesys_name): filesys_inodes }) metric_sender.send_metrics()
def main(): """ Main function to run the check """ args = parse_args() metric_sender = MetricSender(verbose=args.verbose, debug=args.debug) discovery_key_disk = 'disc.disk' interval = 3 pcp_disk_dev_metrics = ['disk.dev.total', 'disk.dev.avactive'] item_prototype_macro_disk = '#OSO_DISK' item_prototype_key_tps = 'disc.disk.tps' item_prototype_key_putil = 'disc.disk.putil' disk_metrics = pminfo.get_sampled_data(pcp_disk_dev_metrics, interval, 2) pcp_metrics_divided = {} for metric in pcp_disk_dev_metrics: pcp_metrics_divided[metric] = { k: v for k, v in disk_metrics.items() if metric in k } # do TPS checks; use disk.dev.total filtered_disk_totals = clean_up_metric_dict( pcp_metrics_divided[pcp_disk_dev_metrics[0]], pcp_disk_dev_metrics[0] + '.') # Add dynamic items metric_sender.add_dynamic_metric(discovery_key_disk, item_prototype_macro_disk, filtered_disk_totals.keys()) # calculate the TPS and add them to the ZaggSender for disk, totals in filtered_disk_totals.iteritems(): disk_tps = (totals[1] - totals[0]) / interval metric_sender.add_metric( {'%s[%s]' % (item_prototype_key_tps, disk): disk_tps}) # do % Util checks; use disk.dev.avactive filtered_disk_totals = clean_up_metric_dict( pcp_metrics_divided[pcp_disk_dev_metrics[1]], pcp_disk_dev_metrics[1] + '.') # calculate the % Util and add them to the ZaggSender for disk, totals in filtered_disk_totals.iteritems(): total_active = (float)(totals[1] - totals[0]) / 1000.0 putil = 100 * total_active / interval metric_sender.add_metric( {'%s[%s]' % (item_prototype_key_putil, disk): putil}) metric_sender.send_metrics()
def send_metric_data(bucket_list, bucket_stats, args):
    ''' send data to zabbix '''
    discovery_key = "disc.aws"
    discovery_macro = "#S3_BUCKET"
    prototype_s3_size = "disc.aws.size"
    prototype_s3_count = "disc.aws.objects"

    mts = MetricSender(verbose=args.debug)
    mts.add_dynamic_metric(discovery_key, discovery_macro, bucket_list)
    for bucket in bucket_stats.keys():
        zab_key = "{}[{}]".format(prototype_s3_size, bucket)
        mts.add_metric({zab_key: int(round(bucket_stats[bucket]["size"]))})

        zab_key = "{}[{}]".format(prototype_s3_count, bucket)
        mts.add_metric({zab_key: bucket_stats[bucket]["objects"]})
    mts.send_metrics()
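# Minimal usage sketch for send_metric_data(). The shape of bucket_stats
# (per-bucket 'size' and 'objects') is inferred from the lookups above; the
# bucket names and numbers are assumptions, and argparse is assumed imported.
def _send_metric_data_example():
    example_args = argparse.Namespace(debug=False)
    bucket_list = ['registry-bucket', 'backup-bucket']
    bucket_stats = {
        'registry-bucket': {'size': 1073741824.0, 'objects': 5120},
        'backup-bucket': {'size': 536870912.0, 'objects': 87},
    }
    send_metric_data(bucket_list, bucket_stats, example_args)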
def report_to_zabbix(self, disc_key, disc_macro, item_proto_key, value):
    """ Sends the command's exit code to zabbix. """
    mts = MetricSender()

    # Add the dynamic item
    self.verbose_print("Adding the dynamic item to Zabbix - %s, %s, [%s]" %
                       (disc_key, disc_macro, self.args.name))
    mts.add_dynamic_metric(disc_key, disc_macro, [self.args.name])

    # Send the value for the dynamic item
    self.verbose_print("Sending metric to Zabbix - %s[%s]: %s" %
                       (item_proto_key, self.args.name, value))
    mts.add_metric({'%s[%s]' % (item_proto_key, self.args.name): value})

    # Actually send them
    mts.send_metrics()
def main(): """ Main function to run the check """ args = parse_args() metric_sender = MetricSender(verbose=args.verbose, debug=args.debug) discovery_key_network = 'disc.network' pcp_network_dev_metrics = ['network.interface.in.bytes', 'network.interface.out.bytes'] item_proto_macro_network = '#OSO_NET_INTERFACE' item_proto_key_in_bytes = 'disc.network.in.bytes' item_proto_key_out_bytes = 'disc.network.out.bytes' network_metrics = pminfo.get_metrics(pcp_network_dev_metrics) pcp_metrics_divided = {} for metric in pcp_network_dev_metrics: pcp_metrics_divided[metric] = {k: v for k, v in network_metrics.items() if metric in k} # do Network In; use network.interface.in.bytes filtered_network_totals = clean_up_metric_dict(pcp_metrics_divided[pcp_network_dev_metrics[0]], pcp_network_dev_metrics[0] + '.') # Add dynamic items metric_sender.add_dynamic_metric(discovery_key_network, item_proto_macro_network, filtered_network_totals.keys()) # Report Network IN bytes; them to the MetricSender for interface, total in filtered_network_totals.iteritems(): metric_sender.add_metric({'%s[%s]' % (item_proto_key_in_bytes, interface): total}) # Report Network OUT Bytes; use network.interface.out.bytes filtered_network_totals = clean_up_metric_dict(pcp_metrics_divided[pcp_network_dev_metrics[1]], pcp_network_dev_metrics[1] + '.') # calculate the % Util and add them to the MetricSender for interface, total in filtered_network_totals.iteritems(): metric_sender.add_metric({'%s[%s]' % (item_proto_key_out_bytes, interface): total}) metric_sender.send_metrics()
def report_to_zabbix(self, total_snapshottable_vols, total_snapshots_created,
                     total_snapshot_creation_errors):
    """ Sends the snapshotter's results to zabbix. """
    mts = MetricSender(verbose=True)

    # Populate EBS_SNAPSHOTTER_DISC_SCHEDULE_MACRO with the schedule
    mts.add_dynamic_metric(EBS_SNAPSHOTTER_DISC_KEY, EBS_SNAPSHOTTER_DISC_SCHEDULE_MACRO,
                           [self.args.with_schedule])

    # Send total_snapshottable_vols prototype item key and value
    mts.add_metric({'%s[%s]' % (EBS_SNAPSHOTTER_SNAPSHOTTABLE_VOLUMES_KEY,
                                self.args.with_schedule): total_snapshottable_vols})

    # Send total_snapshots_created prototype item key and value
    mts.add_metric({'%s[%s]' % (EBS_SNAPSHOTTER_SNAPSHOTS_CREATED_KEY,
                                self.args.with_schedule): total_snapshots_created})

    # Send total_snapshot_creation_errors prototype item key and value
    mts.add_metric({'%s[%s]' % (EBS_SNAPSHOTTER_SNAPSHOT_CREATION_ERRORS_KEY,
                                self.args.with_schedule): total_snapshot_creation_errors})

    # Actually send them
    mts.send_metrics()
class EBSStuckVolumesCheck(object):
    """ This class houses a check that looks for EBS volumes that are stuck in a
        transition state (attaching, detaching, busy, etc).
    """

    def __init__(self):
        """ initialize EBSStuckVolumesCheck class """
        self.args = None
        self.vol_state_data = None

        self.parse_args()

        # Make sure we're using the profile they've requested.
        if self.args.aws_creds_profile:
            os.environ['AWS_PROFILE'] = self.args.aws_creds_profile

        self.eu = EbsUtil(self.args.region, verbose=self.args.verbose)
        self.mts = MetricSender(verbose=self.args.verbose)

    def parse_args(self):
        ''' Parse arguments passed to the script '''
        parser = argparse.ArgumentParser(description='OpenShift Cluster Metrics Checker')
        parser.add_argument('-v', '--verbose', action='store_true', default=None,
                            help='Verbose output')
        parser.add_argument('--region', required=True,
                            help='AWS EC2 Region to check')
        parser.add_argument('--stuck-after', default=120, type=int,
                            help='Amount of time in seconds after which the volume is '
                                 'determined to be "stuck".')
        parser.add_argument('--aws-creds-profile', required=False,
                            help='The AWS credentials profile to use.')
        self.args = parser.parse_args()

    @staticmethod
    def read_raw_volume_state_data():
        """ Reads in the raw string of volume state data from disk """
        if not os.path.isfile(STATE_DATA_FILE):
            return ""  # Act like the file is blank

        with open(STATE_DATA_FILE, 'r') as stream:
            return stream.read()

    def load_volume_state_data(self):
        """ Loads the volume state data from disk """
        if os.path.isfile(STATE_DATA_FILE):
            with open(STATE_DATA_FILE, 'r') as stream:
                self.vol_state_data = yaml.load(stream)
        else:
            self.vol_state_data = {}

    def save_volume_state_data(self):
        """ Saves the volume state data to disk """
        with open(STATE_DATA_FILE, 'w') as outfile:
            yaml.dump(self.vol_state_data, outfile,
                      default_flow_style=False, allow_unicode=True)

    def add_new_transitioning_volumes(self, trans_vols):
        """ Adds volumes that we haven't seen before that are in a transitioning state. """
        for vol in trans_vols:
            vol_uri = self.eu.generate_volume_uri(vol)

            if vol_uri not in self.vol_state_data.keys():
                # This is the first time we've seen this volume, add it.
                self.vol_state_data[vol_uri] = {}
                self.vol_state_data[vol_uri][STUCK_AFTER_KEY] = datetime.now() + \
                    timedelta(seconds=self.args.stuck_after)
                self.vol_state_data[vol_uri][VOLUME_ID_KEY] = str(vol.id)
                self.vol_state_data[vol_uri][STATE_KEY] = TRANSITION_STATE
                self.vol_state_data[vol_uri][ATTACH_STATUS_KEY] = str(vol.attach_data.status)

    def set_stuck_volumes(self):
        """ Sets volumes to state 'stuck' if they've passed their transition state deadline. """
        for item in self.vol_state_data.itervalues():
            # We don't want to set unstuck volumes back to stuck.
            if item[STATE_KEY] != UNSTUCK_STATE:
                if datetime.now() > item[STUCK_AFTER_KEY]:
                    item[STATE_KEY] = STUCK_STATE

    def set_unstuck_volumes(self, trans_vols):
        """ Change volumes that were in state 'stuck' that are no longer in
            transition, to state 'unstuck'.
        """
        trans_vol_ids = [str(vol.id) for vol in trans_vols]

        for vol_uri, cache_data in self.vol_state_data.iteritems():
            if cache_data[STATE_KEY] == STUCK_STATE and \
               cache_data[VOLUME_ID_KEY] not in trans_vol_ids:
                # This volume was stuck, but isn't any longer
                self.vol_state_data[vol_uri][STATE_KEY] = UNSTUCK_STATE

    def report_stuck_volumes(self):
        """ sends data to monitoring that these volumes are stuck. """
""" for vol_uri, cache_data in self.vol_state_data.iteritems(): if cache_data[STATE_KEY] == STUCK_STATE: self.mts.add_dynamic_metric(EBS_VOLUME_URI_DISC_KEY, EBS_VOLUME_URI_DISC_MACRO, [vol_uri]) item_name = '%s[%s]' % (EBS_VOLUME_ATTACH_STATE_KEY, vol_uri) self.mts.add_metric({item_name: MONITORING_STUCK_VALUE}) # Actually send them self.mts.send_metrics() def report_unstuck_volumes(self): """ sends data to monitoring that these volumes have become unstuck. """ for vol_uri, cache_data in self.vol_state_data.iteritems(): if cache_data[STATE_KEY] == UNSTUCK_STATE: self.mts.add_dynamic_metric(EBS_VOLUME_URI_DISC_KEY, EBS_VOLUME_URI_DISC_MACRO, [vol_uri]) item_name = '%s[%s]' % (EBS_VOLUME_ATTACH_STATE_KEY, vol_uri) self.mts.add_metric({item_name: MONITORING_UNSTUCK_VALUE}) # Actually send them self.mts.send_metrics() def remove_unstuck_volumes_from_state_data(self): """ Removes state 'unstuck' volumes from the state data (no longer need to track them) """ for vol_uri in self.vol_state_data.keys(): cache_data = self.vol_state_data[vol_uri] if cache_data[STATE_KEY] == UNSTUCK_STATE: # This volume was stuck, but isn't any longer del self.vol_state_data[vol_uri] def remove_no_longer_transitioning_volumes(self, trans_vols): """ Remove volumes that were transitioning, but are no longer in the trans_vols list """ trans_vol_ids = [str(vol.id) for vol in trans_vols] for vol_uri in self.vol_state_data.keys(): cache_data = self.vol_state_data[vol_uri] if cache_data[STATE_KEY] == TRANSITION_STATE and \ cache_data[VOLUME_ID_KEY] not in trans_vol_ids: # This volume was transitioning, but isn't any longer del self.vol_state_data[vol_uri] def run(self): """ Run the main logic of this check """ # Load the state machine data self.load_volume_state_data() # Get the volumes that are currently in a transitioning state trans_vols = self.eu.get_trans_attach_status_vols() # Based on that list, weed out the volumes that used to be transitioning, # that are no longer in the transitioning volumes list. This means that # it was a normal volume transition, probably from attaching to attached # or detaching to detached (aka None). self.remove_no_longer_transitioning_volumes(trans_vols) # Check on the volumes that were in the stuck state that are no longer # in the transitioning volumes list. This means that they went from stuck # to unstuck. We need to track these so that we can report that they've become # unstuck to monitoring. self.set_unstuck_volumes(trans_vols) # Add any volumes that are transitioning that we haven't seen before to our data self.add_new_transitioning_volumes(trans_vols) # Change volumes that are still transitioning and have hit their deadline to # finish that transition to a state of "stuck" self.set_stuck_volumes() # Report to monitoring the stuck volumes self.report_stuck_volumes() # Report to monitoring the volumes that were stuck, but are now unstuck (no # longer transitioning) self.report_unstuck_volumes() # Since the unstuck volumes have been reported, they can safeuly be removed from # our tracking now. self.remove_unstuck_volumes_from_state_data() # Make sure we save state for the next run. self.save_volume_state_data() self.eu.verbose_print("\nTracking Volumes") self.eu.verbose_print("----------------\n") # Cat out the state file raw_state_file = self.read_raw_volume_state_data() self.eu.verbose_print(raw_state_file)
class OpenshiftMasterZaggClient(object):
    """ Checks for the Openshift Master """

    def __init__(self):
        self.args = None
        self.metric_sender = None
        self.ora = None
        self.zabbix_api_key = None
        self.zabbix_healthz_key = None

    def run(self):
        """ Main function to run the check """
        self.parse_args()
        self.metric_sender = MetricSender(verbose=self.args.verbose,
                                          debug=self.args.debug)

        if self.args.local:
            self.ora = OpenshiftRestApi()
            self.args.api_ping = True
            self.args.healthz = True
            self.zabbix_api_key = 'openshift.master.local.api.ping'
            self.zabbix_healthz_key = 'openshift.master.local.api.healthz'
        else:
            master_cfg_from_yaml = []
            with open('/etc/origin/master/master-config.yaml', 'r') as yml:
                master_cfg_from_yaml = yaml.load(yml)
            self.ora = OpenshiftRestApi(host=master_cfg_from_yaml['oauthConfig']['masterURL'],
                                        verify_ssl=True)
            self.zabbix_api_key = 'openshift.master.api.ping'
            self.zabbix_healthz_key = 'openshift.master.api.healthz'

        try:
            if self.args.healthz or self.args.all_checks:
                self.healthz_check()
        except Exception as ex:
            print "Problem performing healthz check: %s " % ex.message
            self.metric_sender.add_metric({self.zabbix_healthz_key: 'false'})

        try:
            if self.args.api_ping or self.args.all_checks:
                self.api_ping()

            if self.args.project_count or self.args.all_checks:
                self.project_count()

            if self.args.pod_count or self.args.all_checks:
                self.pod_count()

            if self.args.user_count or self.args.all_checks:
                self.user_count()

            if self.args.pv_info or self.args.all_checks:
                self.pv_info()

            if self.args.node_checks or self.args.all_checks:
                self.nodes_not_schedulable()
                self.nodes_not_ready()
                self.nodes_not_labeled()

        except Exception as ex:
            print "Problem performing Openshift API checks: %s " % ex.message
            self.metric_sender.add_metric({self.zabbix_api_key: 0})  # Openshift API is down

        try:
            if self.args.metrics or self.args.all_checks:
                self.metric_check()

        except Exception as ex:
            print "Problem getting Openshift metrics at /metrics: %s " % ex.message
            self.metric_sender.add_metric({'openshift.master.metric.ping': 0})  # Openshift Metrics are down

        self.metric_sender.send_metrics()

    def parse_args(self):
        """ parse the args from the cli """

        parser = argparse.ArgumentParser(description='Openshift master metric sender')
        parser.add_argument('-v', '--verbose', action='store_true', default=None, help='Verbose?')
        parser.add_argument('--debug', action='store_true', default=None, help='Debug?')
        parser.add_argument('-l', '--local', action='store_true', default=False,
                            help='Run local checks against the local API (https://127.0.0.1)')

        master_check_group = parser.add_argument_group('Different Checks to Perform')
        master_check_group.add_argument('--all-checks', action='store_true', default=None,
                                        help='Do all of the checks')
        master_check_group.add_argument('--api-ping', action='store_true', default=None,
                                        help='Verify the Openshift API is alive')
        master_check_group.add_argument('--healthz', action='store_true', default=None,
                                        help='Query the Openshift Master API /healthz')
        master_check_group.add_argument('--metrics', action='store_true', default=None,
                                        help='Query the Openshift Master Metrics at /metrics')
        master_check_group.add_argument('--project-count', action='store_true', default=None,
                                        help='Query the Openshift Master for Number of Projects')
        master_check_group.add_argument('--pod-count', action='store_true', default=None,
                                        help='Query the Openshift Master for Number of Running Pods')
        master_check_group.add_argument('--user-count', action='store_true', default=None,
                                        help='Query the Openshift Master for Number of Users')
        master_check_group.add_argument('--pv-info', action='store_true', default=None,
                                        help='Query the Openshift Master for Persistent Volumes Info')
        master_check_group.add_argument('--node-checks', action='store_true', default=None,
                                        help='Query the Openshift Master for node checks')

        self.args = parser.parse_args()

    def api_ping(self):
        """ Verify the Openshift API health is responding correctly """

        print "\nPerforming Openshift API ping check..."

        response = self.ora.get('/api/v1/nodes')
        print "\nOpenshift API ping is alive"
        print "Number of nodes in the Openshift cluster: %s" % len(response['items'])

        self.metric_sender.add_metric({self.zabbix_api_key: 1,
                                       'openshift.master.node.count': len(response['items'])})

    def healthz_check(self):
        """ check the /healthz API call """

        print "\nPerforming /healthz check..."

        response = self.ora.get('/healthz', rtype='text')
        print "healthz check returns: %s " % response

        self.metric_sender.add_metric({self.zabbix_healthz_key: str('ok' in response).lower()})

    def metric_check(self):
        """ collect certain metrics from the /metrics API call """

        print "\nPerforming /metrics check..."
        response = self.ora.get('/metrics', rtype='text')

        for metric_type in text_string_to_metric_families(response):

            # Collect the apiserver_request_latencies_summary{resource="pods",verb="LIST",quantile}
            # and apiserver_request_latencies_summary{resource="pods",verb="WATCHLIST",quantile}
            # samples from /metrics
            if metric_type.name == 'apiserver_request_latencies_summary':
                key_str = 'openshift.master.apiserver.latency.summary'
                for sample in metric_type.samples:
                    if (sample[1]['resource'] == 'pods'
                            and 'quantile' in sample[1]
                            and 'LIST' in sample[1]['verb']):
                        # quantile label '0.99' becomes key suffix '99'
                        curr_key_str = key_str + ".pods.quantile.%s.%s" % (sample[1]['verb'],
                                                                           sample[1]['quantile'].split('.')[1])
                        if math.isnan(sample[2]):
                            value = 0
                        else:
                            value = sample[2]
                        # convert microseconds to milliseconds before sending
                        self.metric_sender.add_metric({curr_key_str.lower(): int(value / 1000)})

            # Collect the scheduler_e2e_scheduling_latency_microseconds{quantile}
            # samples from /metrics
            if metric_type.name == 'scheduler_e2e_scheduling_latency_microseconds':
                for sample in metric_type.samples:
                    if 'quantile' in sample[1]:
                        key_str = 'openshift.master.scheduler.e2e.scheduling.latency'
                        curr_key_str = key_str + ".quantile.%s" % sample[1]['quantile'].split('.')[1]
                        if math.isnan(sample[2]):
                            value = 0
                        else:
                            value = sample[2]
                        self.metric_sender.add_metric({curr_key_str.lower(): int(value / 1000)})

        self.metric_sender.add_metric({'openshift.master.metric.ping': 1})

    def project_count(self):
        """ check the number of projects in Openshift """

        print "\nPerforming project count check..."

        excluded_names = ['openshift', 'openshift-infra', 'default', 'ops-monitor']
        response = self.ora.get('/oapi/v1/projects')

        project_names = [project['metadata']['name'] for project in response['items']]
        valid_names = set(project_names) - set(excluded_names)

        print "Project count: %s" % len(valid_names)

        self.metric_sender.add_metric({'openshift.project.count': len(valid_names)})

    def pod_count(self):
        """ check the number of pods in Openshift """

        print "\nPerforming pod count check..."
        response = self.ora.get('/api/v1/pods')

        # Get running pod count
        running_pod_count = 0
        for i in response['items']:
            if 'containerStatuses' in i['status']:
                if 'running' in i['status']['containerStatuses'][0]['state']:
                    running_pod_count += 1

        # Get running pod count on compute only nodes (non-infra)
        running_user_pod_count = 0
        for i in response['items']:
            if 'containerStatuses' in i['status']:
                if 'running' in i['status']['containerStatuses'][0]['state']:
                    if 'nodeSelector' in i['spec']:
                        # logging pods don't have a selector on 'type'
                        if 'type' in i['spec']['nodeSelector'] \
                                and i['spec']['nodeSelector']['type'] == 'compute':
                            running_user_pod_count += 1

        print "Total pod count: %s" % len(response['items'])
        print "Running pod count: %s" % running_pod_count
        print "User Running pod count: %s" % running_user_pod_count

        self.metric_sender.add_metric({'openshift.master.pod.running.count': running_pod_count,
                                       'openshift.master.pod.user.running.count': running_user_pod_count,
                                       'openshift.master.pod.total.count': len(response['items'])})

    def user_count(self):
        """ check the number of users in Openshift """

        print "\nPerforming user count check..."

        response = self.ora.get('/oapi/v1/users')

        print "Total user count: %s" % len(response['items'])
        self.metric_sender.add_metric({'openshift.master.user.count': len(response['items'])})

    @staticmethod
    def convert_to_GiB(value):
        """ take units as 'Gi', 'Ti', etc and return as int GiB """
        if 'G' in value:
            return int(value.strip('GIgi'))
        elif 'Ti' in value:
            # 1 TiB is 1024 GiB
            return 1024 * int(value.replace('Ti', ''))

    def pv_info(self):
        """ Gather info about the persistent volumes in Openshift """

        print "\nPerforming user persistent volume count...\n"

        response = self.ora.get('/api/v1/persistentvolumes')

        pv_capacity_total = 0
        pv_capacity_available = 0
        pv_types = {'Available': 0,
                    'Bound': 0,
                    'Released': 0,
                    'Failed': 0}

        # Dynamic items variables
        discovery_key_pv = 'disc.pv'
        item_prototype_macro_pv = '#OSO_PV'
        item_prototype_key_count = 'disc.pv.count'
        item_prototype_key_available = 'disc.pv.available'
        dynamic_pv_count = defaultdict(int)
        dynamic_pv_available = defaultdict(int)

        for item in response['items']:
            # gather dynamic pv counts
            dynamic_pv_count[item['spec']['capacity']['storage']] += 1

            # get count of each pv type available
            pv_types[item['status']['phase']] += 1

            # get info for the capacity and capacity available
            capacity = item['spec']['capacity']['storage']
            if item['status']['phase'] == 'Available':
                # get total available capacity
                pv_capacity_available = pv_capacity_available + self.convert_to_GiB(capacity)

                # gather dynamic pv available counts
                dynamic_pv_available[item['spec']['capacity']['storage']] += 1

            pv_capacity_total = pv_capacity_total + self.convert_to_GiB(capacity)

        print "Total Persistent Volume Total count: %s" % len(response['items'])
        print 'Total Persistent Volume Capacity: %s' % pv_capacity_total
        print 'Total Persistent Volume Available Capacity: %s' % pv_capacity_available

        self.metric_sender.add_metric(
            {'openshift.master.pv.total.count': len(response['items']),
             'openshift.master.pv.space.total': pv_capacity_total,
             'openshift.master.pv.space.available': pv_capacity_available})

        for key, value in pv_types.iteritems():
            print "Total Persistent Volume %s count: %s" % (key, value)
            self.metric_sender.add_metric(
                {'openshift.master.pv.%s.count' % key.lower(): value})

        # Add dynamic items
        self.metric_sender.add_dynamic_metric(discovery_key_pv, item_prototype_macro_pv,
                                              dynamic_pv_count.keys())

        for size, count in dynamic_pv_count.iteritems():
            print
            print "Total Persistent Volume %s count: %s" % (size, count)
            print "Total Persistent Volume available %s count: %s" % (size, dynamic_pv_available[size])

            self.metric_sender.add_metric(
                {"%s[%s]" % (item_prototype_key_count, size): count,
                 "%s[%s]" % (item_prototype_key_available, size): dynamic_pv_available[size]})

    def nodes_not_schedulable(self):
        """ check the number of nodes in the cluster that are not schedulable """

        print "\nPerforming nodes not schedulable check..."

        response = self.ora.get('/api/v1/nodes')

        nodes_not_schedulable = []

        for n in response['items']:
            if n['metadata']['labels']['type'] == 'master':
                if self.args.verbose:
                    print "Node: %s is a master\n" % n['metadata']['name']
            else:
                if "unschedulable" in n['spec']:
                    nodes_not_schedulable.append(n['metadata']['name'])

        print "Count of nodes not schedulable: %s" % len(nodes_not_schedulable)
        print "Nodes not schedulable: %s\n" % nodes_not_schedulable

        self.metric_sender.add_metric(
            {'openshift.master.nodesnotschedulable.count': len(nodes_not_schedulable)})

    def nodes_not_ready(self):
        """ check the number of nodes in the cluster that are not ready """

        print "\nPerforming nodes not ready check..."

        response = self.ora.get('/api/v1/nodes')

        nodes_not_ready = []

        for n in response['items']:
            has_ready_status = False
            for cond in n['status']['conditions']:
                if self.args.verbose:
                    print "Get ready status of %s" % n['metadata']['name']
                if cond['type'] == "Ready":
                    has_ready_status = True
                    if cond['status'].lower() != "true":
                        if self.args.verbose:
                            print "Non-true ready status of %s : %s" % (n['metadata']['name'], cond['status'])
                        nodes_not_ready.append(n['metadata']['name'])
            if not has_ready_status:
                if self.args.verbose:
                    print "Did not find ready status for %s" % n['metadata']['name']
                nodes_not_ready.append(n['metadata']['name'])

        print "Count of nodes not ready: %s" % len(nodes_not_ready)

        self.metric_sender.add_metric(
            {'openshift.master.nodesnotready.count': len(nodes_not_ready)})

    def nodes_not_labeled(self):
        """ check the nodes in the cluster that are not labeled
            Note: This check only searches for nodes with no label keys set """

        print "\nPerforming nodes not labeled check..."

        response = self.ora.get('/api/v1/nodes')

        nodes_not_labeled = []
        nodes_labeled = []

        for n in response['items']:
            if 'labels' in n['metadata']:
                nodes_labeled.append(n['metadata']['name'])
            else:
                nodes_not_labeled.append(n['metadata']['name'])

        print "Nodes not labeled: %s\nNodes labeled: %s \n" % (nodes_not_labeled, nodes_labeled)
        self.metric_sender.add_metric(
            {'openshift.master.nodesnotlabeled.count': len(nodes_not_labeled)})
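# Illustrative sketch of the dynamic PV keys pv_info() ends up sending to
# Zabbix; the sizes and counts below are assumed values.
def _pv_dynamic_keys_example():
    dynamic_pv_count = {'10Gi': 4, '1Ti': 2}
    dynamic_pv_available = {'10Gi': 1, '1Ti': 0}
    for size, count in dynamic_pv_count.iteritems():
        print "disc.pv.count[%s] = %s" % (size, count)
        print "disc.pv.available[%s] = %s" % (size, dynamic_pv_available[size])
    # -> disc.pv.count[10Gi] = 4, disc.pv.available[10Gi] = 1, ...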
class CertificateReporting(object):
    ''' class with ability to parse through x509 certificates to extract
        and report to zabbix the expiration date associated with the cert
    '''

    def __init__(self):
        ''' constructor '''
        self.args = None
        self.current_date = datetime.datetime.today()
        self.parse_args()
        self.msend = MetricSender(debug=self.args.debug)

    def dprint(self, msg):
        ''' debug printer '''
        if self.args.debug:
            print msg

    def parse_args(self):
        ''' parse command line args '''
        argparser = argparse.ArgumentParser(description='certificate checker')
        argparser.add_argument('--debug', default=False, action='store_true')
        argparser.add_argument('--cert-list', default="/etc/origin", type=str,
                               help='comma-separated list of dirs/certificates')
        self.args = argparser.parse_args()

    def days_to_expiration(self, cert_file):
        ''' return days to expiration for a certificate '''
        crypto = OpenSSL.crypto
        cert = open(cert_file).read()
        certificate = crypto.load_certificate(crypto.FILETYPE_PEM, cert)
        expiration_date_asn1 = certificate.get_notAfter()

        # expiration returned in ASN.1 GENERALIZEDTIME format
        # YYYYMMDDhhmmss with a trailing 'Z'
        expiration_date = parser.parse(expiration_date_asn1).replace(tzinfo=None)
        delta = expiration_date - self.current_date

        return delta.days

    def process_certificates(self):
        ''' check through list of certificates/directories '''
        for cert in self.args.cert_list.split(','):
            if not os.path.exists(cert):
                self.dprint("{} does not exist. skipping.".format(cert))
                continue

            mode = os.stat(cert).st_mode
            if S_ISDIR(mode):
                self.all_certs_in_dir(cert)
            elif S_ISREG(mode):
                days = self.days_to_expiration(cert)
                self.dprint("{} in {} days".format(cert, days))
                self.add_metrics(cert, days)
            else:
                self.dprint("not a file. not a directory. skipping.")

        # now push out all queued up item(s) to metric servers
        self.msend.send_metrics()

    def add_metrics(self, certificate, days_to_expiration):
        ''' queue up item for submission to zabbix '''
        self.msend.add_dynamic_metric(CERT_DISC_KEY, CERT_DISC_MACRO, [certificate])
        zbx_key = "{}[{}]".format(CERT_DISC_KEY, certificate)
        self.msend.add_metric({zbx_key: days_to_expiration})

    def all_certs_in_dir(self, directory):
        ''' recursively go through all *.crt files in 'directory' '''
        for root, _, filenames in os.walk(directory):
            for filename in filenames:
                if filename.endswith('.crt'):
                    full_path = os.path.join(root, filename)
                    days = self.days_to_expiration(full_path)
                    self.dprint("{} in {} days".format(full_path, days))
                    self.add_metrics(full_path, days)
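# A hypothetical entry point for this checker; the excerpt above does not
# show one, so the script name in the comment is an assumption.
if __name__ == '__main__':
    # e.g. ./check-certificate-expirations.py --debug \
    #          --cert-list /etc/origin,/etc/origin/master/ca.crt
    CERT_CHECKER = CertificateReporting()
    CERT_CHECKER.process_certificates()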
class OpsMetricClient(object):
    """ class to send data via MetricSender """

    def __init__(self):
        self.metric_sender = None
        self.args = None
        self.config = None
        self.heartbeat = None

    def run(self):
        """ main function to run the script """
        self.parse_args()
        self.parse_config(self.args.config_file)
        self.config_metric_sender()

        if self.args.send_heartbeat:
            self.add_heartbeat()

        if self.args.key and self.args.value:
            self.add_metric()

        if self.args.discovery_key and self.args.macro_string and self.args.macro_names:
            self.add_dynamic_metric()

        self.metric_sender.send_metrics()

    def parse_args(self):
        """ parse the args from the cli """

        parser = argparse.ArgumentParser(description='metric sender')
        parser.add_argument('--send-heartbeat', action="store_true",
                            help="send heartbeat metric to zagg")

        group = parser.add_mutually_exclusive_group()
        group.add_argument('-s', '--host', help='specify host name as registered in Zabbix')
        group.add_argument('--synthetic', default=False, action='store_true',
                           help='send as cluster-wide synthetic host')

        parser.add_argument('-v', '--verbose', action='store_true', default=None, help='Verbose?')
        parser.add_argument('--debug', action='store_true', default=None, help='Debug?')
        parser.add_argument('-c', '--config-file', help='ops-metric-client config file',
                            default='/etc/openshift_tools/metric_sender.yaml')

        key_value_group = parser.add_argument_group('Sending a Key-Value Pair')
        key_value_group.add_argument('-k', '--key', help='metric key')
        key_value_group.add_argument('-o', '--value', help='metric value')
        key_value_group.add_argument('-t', '--tags', nargs='*',
                                     help='list of space delimited key tags: units=byte ...')

        low_level_discovery_group = parser.add_argument_group('Sending a Low Level Discovery Item')
        low_level_discovery_group.add_argument('--discovery-key', help='discovery key')
        low_level_discovery_group.add_argument('--macro-string', help='macro string')
        low_level_discovery_group.add_argument('--macro-names',
                                               help='comma separated list of macro names')

        self.args = parser.parse_args()

    def parse_config(self, config_file):
        """ parse config file """
        self.config = yaml.load(file(config_file))

    def config_metric_sender(self):
        """ configure the metric_sender """
        if self.args.host:
            host = self.args.host
        elif self.args.synthetic:
            host = self.config['synthetic_clusterwide']['host']['name']
        else:
            host = self.config['host']['name']

        metric_verbose = self.args.verbose
        metric_debug = self.args.debug

        if isinstance(metric_verbose, str):
            metric_verbose = (metric_verbose == 'True')

        if isinstance(metric_debug, str):
            metric_debug = (metric_debug == 'True')

        self.metric_sender = MetricSender(host=host, verbose=metric_verbose,
                                          debug=metric_debug,
                                          config_file=self.args.config_file)

    def add_heartbeat(self):
        """ create a heartbeat metric """
        if self.args.synthetic:
            heartbeat = MetricSenderHeartbeat(
                templates=self.config['synthetic_clusterwide']['heartbeat']['templates'],
                hostgroups=self.config['heartbeat']['hostgroups'])
        else:
            heartbeat = MetricSenderHeartbeat(
                templates=self.config['heartbeat']['templates'],
                hostgroups=self.config['heartbeat']['hostgroups'])
        self.metric_sender.add_heartbeat(heartbeat)

    def add_metric(self):
        """ send key/value pair """
        # Get tags from command line args
        tags = dict([i.split("=")[0], i.split("=")[1]]
                    for i in self.args.tags) if self.args.tags else {}
        self.metric_sender.add_metric({self.args.key: self.args.value}, key_tags=tags)

    def add_dynamic_metric(self):
        """ send zabbix low level discovery item to zagg """
        self.metric_sender.add_dynamic_metric(self.args.discovery_key,
                                              self.args.macro_string,
                                              self.args.macro_names.split(','))
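# Typical invocations of this client; the installed script name
# 'ops-metric-client' is inferred from the --config-file help text above,
# and the key/macro values are illustrative:
#
#   ops-metric-client --send-heartbeat
#   ops-metric-client -k some.metric.key -o 42 -t units=count
#   ops-metric-client --discovery-key disc.filesys \
#       --macro-string '#OSO_FILESYS' --macro-names /,/var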
class DockerContainerUsageCli(object):
    ''' This is the class that actually pulls everything together into a cli script. '''

    def __init__(self, config_file=None):
        if not config_file:
            self.config_file = '/etc/openshift_tools/container_metrics.yml'
        else:
            self.config_file = config_file

        self.config = None
        self.parse_config()

        self.cli = AutoVersionClient(base_url='unix://var/run/docker.sock', timeout=120)
        self.docker_util = DockerUtil(self.cli)
        self.metric_sender = MetricSender(verbose=True)

    def parse_config(self):
        """ parse config file """
        if not self.config:
            if not os.path.exists(self.config_file):
                raise IOError(self.config_file + " does not exist.")

            self.config = yaml.load(file(self.config_file))

    def format_ctr_name(self, ctr_name):
        ''' Takes a container name and, if there's a name_format_regex specified, applies it '''
        for item in self.config['usage_checks']:
            name_match_regex = item['name_match_regex']
            if 'name_format_regex' in item and re.match(name_match_regex, ctr_name):
                try:
                    name_format_regex = item['name_format_regex']
                    new_name = re.sub(name_match_regex, name_format_regex, ctr_name)
                    return new_name
                except sre_constants.error as ex:
                    # Just use the full name (we don't want to die because of name formatting)
                    print "\nError: %s: [%s]. Using full name [%s].\n" % (ex.message,
                                                                          name_format_regex,
                                                                          ctr_name)
                    return ctr_name

        return ctr_name

    def main(self):
        ''' The main entrypoint of the cli '''
        ctr_regexes = [uchk['name_match_regex'] for uchk in self.config['usage_checks']]
        use_cgroups = self.config.get('use_cgroups', False)

        ctrs = self.docker_util.get_ctrs_matching_names(ctr_regexes)

        for ctr_name, ctr in ctrs.iteritems():
            (cpu_stats, mem_stats) = self.docker_util.get_ctr_stats(ctr, use_cgroups=use_cgroups)

            formatted_ctr_name = self.format_ctr_name(ctr_name)

            # Add the container hostnames as macros for the dynamic item.
            self.metric_sender.add_dynamic_metric(ZBX_DOCKER_DISC_KEY, ZBX_DOCKER_DISC_MACRO,
                                                  [formatted_ctr_name])

            data = {
                '%s[%s]' % (ZBX_CTR_CPU_USED_PCT_KEY, formatted_ctr_name): cpu_stats.used_pct,
                '%s[%s]' % (ZBX_CTR_MEM_USED_KEY, formatted_ctr_name): mem_stats.used,
                '%s[%s]' % (ZBX_CTR_MEM_LIMIT_KEY, formatted_ctr_name): mem_stats.limit,
                '%s[%s]' % (ZBX_CTR_MEM_LIMIT_USED_PCT_KEY, formatted_ctr_name): mem_stats.limit_used_pct,
                '%s[%s]' % (ZBX_CTR_MEM_FAILCNT_KEY, formatted_ctr_name): mem_stats.failcnt,
            }

            print "%s:" % formatted_ctr_name
            for k, v in data.iteritems():
                print "  %s: %s" % (k, v)
            print

            self.metric_sender.add_metric(data)

        # Actually send the metrics
        self.metric_sender.send_metrics()
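# Example of the YAML config this class consumes (default path
# /etc/openshift_tools/container_metrics.yml). The keys come from the code
# above; the regex values are illustrative assumptions only.
EXAMPLE_CONTAINER_METRICS_YML = """
use_cgroups: false
usage_checks:
- name_match_regex: '^k8s_.*_([a-z0-9-]+)_.*$'
  name_format_regex: '\\1'
- name_match_regex: '^docker-registry.*'
"""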