class OpenshiftRouterChecks(object): """Checks for the Openshift Router""" def __init__(self): self.args = None self.metrics = None # metric sender self.kubeconfig = None self.parse_args() self.get_kubeconfig() self.ocutil = None def get_kubeconfig(self): """Find kubeconfig to use for OCUtil""" # Default master kubeconfig kubeconfig = '/tmp/admin.kubeconfig' non_master_kube_dir = '/etc/origin/node' if os.path.exists(kubeconfig): # If /tmp/admin.kubeconfig exists, use it! pass elif os.path.isdir(non_master_kube_dir): for my_file in os.listdir(non_master_kube_dir): if my_file.endswith(".kubeconfig"): kubeconfig = os.path.join(non_master_kube_dir, my_file) if self.args.debug: print "Using kubeconfig: {}".format(kubeconfig) self.kubeconfig = kubeconfig def check_all_router_health(self): """ Perform defined router health check on all routers """ discovery_key = "disc.openshift.cluster.router" discovery_macro = "#OS_ROUTER" router_health_item = "disc.openshift.cluster.router.health" router_pods = self.find_router_pods() health_report = {} for router_name, pod_details in router_pods.iteritems(): health = self.router_pod_healthy(pod_details) if self.args.verbose: print "{} healthy: {}\n".format(router_name, health) health_report[router_name] = health # make dynamic items, and queue up the associated data router_names = health_report.keys() self.metrics.add_dynamic_metric(discovery_key, discovery_macro, router_names, synthetic=True) for router_name, health_status in health_report.iteritems(): zbx_key = "{}[{}]".format(router_health_item, router_name) self.metrics.add_metric({zbx_key: int(health_status)}, synthetic=True) def running_pod_count_check(self): """ return hash of deployment configs containing whether the number of running pods matches the definition in the deployment config """ router_pods = self.find_router_pods() # get actual running pod count (per DC) dc_pod_count = {} for _, details in router_pods.iteritems(): dc_name = 
details['metadata']['labels']['deploymentconfig'] dc_pod_count[dc_name] = dc_pod_count.get(dc_name, 0) + 1 if self.args.debug: print "Running pod count: {}".format(dc_pod_count) # get expected pod count as defined in each router DC expected_pod_count = {} for dc_name in dc_pod_count.keys(): expected_pod_count[dc_name] = self.ocutil.get_dc( dc_name)['spec']['replicas'] if self.args.debug: print "Expected pod count: {}".format(expected_pod_count) results = {} for dc_name in dc_pod_count.keys(): results[dc_name] = bool( dc_pod_count[dc_name] == expected_pod_count[dc_name]) if self.args.verbose or self.args.debug: print "DC replica count matching actual counts: {}".format(results) return results def check_router_replica_count(self): """ Check whether the running router replica count is the same as what is defined in the deployment config """ discovery_key = "disc.openshift.cluster.router" discovery_macro = "#ROUTER_DC" dc_status_item = "disc.openshift.cluster.router.expected_pod_count" replica_results = self.running_pod_count_check() # make dynamic items, and queue up the associated data dc_names = replica_results.keys() self.metrics.add_dynamic_metric(discovery_key, discovery_macro, dc_names, synthetic=True) for dc_name, replica_status in replica_results.iteritems(): zbx_key = "{}[{}]".format(dc_status_item, dc_name) self.metrics.add_metric({zbx_key: int(replica_status)}, synthetic=True) def run(self): """Main function to run the check""" self.ocutil = OCUtil(config_file=self.kubeconfig, verbose=self.args.verbose) self.metrics = MetricSender(verbose=self.args.verbose, debug=self.args.debug) self.check_all_router_health() self.check_router_replica_count() if self.args.dry_run: self.metrics.print_unique_metrics_key_value() else: self.metrics.send_metrics() def parse_args(self): """ parse the args from the cli """ parser = argparse.ArgumentParser(description='Openshift Router sender') parser.add_argument('-v', '--verbose', action='store_true', default=None, 
help='Verbose?') parser.add_argument('--debug', action='store_true', default=None, help='Debug?') parser.add_argument('--dry-run', action='store_true', default=False, help='Collect stats, but no report to zabbix') self.args = parser.parse_args() @staticmethod def get_router_health_url(router): """ build router healthcheck URL """ podip = router['status']['podIP'] port = router['spec']['containers'][0]['livenessProbe']['httpGet'][ 'port'] path = router['spec']['containers'][0]['livenessProbe']['httpGet'][ 'path'] url = 'http://{}:{}{}'.format(podip, port, path) return url @staticmethod def router_pod_healthy(router): """ ping the health port for router pod health """ url = OpenshiftRouterChecks.get_router_health_url(router) try: result = urllib2.urlopen(url).getcode() if result == 200: return True else: return False except (urllib2.HTTPError, urllib2.URLError): return False def find_router_pods(self): """ return dict of PODs running haproxy (the router pods) """ router_pods = {} for pod in self.ocutil.get_pods()['items']: try: img = pod['status']['containerStatuses'][0]['image'] if 'ose-haproxy-router' in img: router_pods[pod['metadata']['name']] = pod except KeyError: pass return router_pods
class InfraNodePodStatus(object): ''' This is a check for making sure the internal pods like router and registry running and located on different infra nodes ''' def __init__(self): '''initial for the InfraNodePodStatus''' self.kubeconfig = '/tmp/admin.kubeconfig' self.oc = OCUtil(namespace='default', config_file=self.kubeconfig) def check_pods(self): ''' get all the pod information ''' pods = self.oc.get_pods() pod_report = {} for pod in pods['items']: pod_name = pod['metadata']['name'] pod_report[pod_name] = {} pod_report[pod_name]['hostIP'] = pod['status']['hostIP'] pod_report[pod_name]['status'] = pod['status']['phase'] return pod_report @staticmethod def compare_ip(keyword, pod_info_dict): ''' to compare the pod host ip and check the pod status ''' pod_hostip_status = [ pod_info_dict[i] for i in pod_info_dict.keys() if keyword in i ] # pod_status = [pod_info_dict[i] for i in pod_info_dict.keys() if keyword in i] pod_run_num = 0 for i in pod_hostip_status: if i['status'] == "Running": pod_run_num += 1 if len(pod_hostip_status) == 2: if pod_hostip_status[0]['hostIP'] != pod_hostip_status[1]['hostIP']: # print "ok, you do not need do anything for {} pod".format(keyword) result_code = 1 else: # print "there are something wrong, please check the pod" result_code = 0 else: print "plese check the pod" result_code = 0 # result_code 1 means the two pods are on different nodes # pod_run_num means the running pod number return result_code, pod_run_num def run(self): ''' run the command and send the code to zabbix ''' ms = MetricSender() pod_report = self.check_pods() # the check_value is the value to send to zabbix router_check_value = self.compare_ip('router', pod_report) registry_check_value = self.compare_ip('registry', pod_report) print router_check_value, registry_check_value ms.add_metric({'openshift.router.pod.location': router_check_value[0]}) ms.add_metric({'openshift.router.pod.status': router_check_value[1]}) ms.add_metric( {'openshift.registry.pod.location': 
registry_check_value[0]}) ms.add_metric( {'openshift.registry.pod.status': registry_check_value[1]}) ms.send_metrics()
class InfraNodePodStatus(object):
    '''
    This is a check for making sure the internal pods like router and
    registry are running and located on different infra nodes
    '''

    def __init__(
            self,
            args=None,
    ):
        '''initial for the InfraNodePodStatus'''
        self.args = args
        self.kubeconfig = '/tmp/admin.kubeconfig'
        self.oc = OCUtil(namespace=self.args.namespace,
                         config_file=self.kubeconfig)
        self.all_pods = self.get_all_pods()

    def get_all_pods(self):
        ''' get all the pod information

        Returns a dict keyed by pod name mapping to that pod's hostIP
        and status phase.
        '''
        pods = self.oc.get_pods()
        pod_report = {}
        for pod in pods['items']:
            pod_name = pod['metadata']['name']
            pod_report[pod_name] = {
                'hostIP': pod['status']['hostIP'],
                'status': pod['status']['phase'],
            }
        return pod_report

    def get_expected_replicas(self, deploymentconfig):
        ''' get expected replica count from deploymentconfig '''
        return self.oc.get_dc(deploymentconfig)['spec']['replicas']

    def get_pods_by_name(self, podname):
        """Return the pods whose name starts with "<podname>-"."""
        return [
            self.all_pods[i] for i in self.all_pods.keys()
            if i.startswith(podname + '-')
        ]

    def check_pods(
            self,
            podname,
            keybase="",
            pod_optional=False,
    ):
        ''' to compare the pod host ip and check the pod status

        Checks that the number of pods matches the DC replica count,
        that all pods are Running, and that the pods are spread over
        at least two hosts, then reports the result under ``keybase``.
        '''
        # hoist the logger instead of calling getLogger() per statement
        log = logging.getLogger()
        log.info("Finding pods for: %s", podname)
        result_code = 1

        pods = self.get_pods_by_name(podname)
        log.info("Pods Found: %s", len(pods))

        expected_replicas = 0
        try:
            expected_replicas = self.get_expected_replicas(podname)
        except Exception:
            # .warning: .warn is a deprecated alias
            log.warning("dc not found for pod %s", podname)
            if pod_optional:
                log.warning(
                    "Some clusters don't have pod %s, "
                    "please confirm before trying to fix this", podname)
                return  # nothing we should do, so quit early, don't do more checks

        log.info("Expected Replicas: %s", expected_replicas)
        if len(pods) != expected_replicas:
            result_code = 0
            log.critical("Count Pods and Replicas don't match")

        count_pods_running = len([i for i in pods if i['status'] == "Running"])
        log.info("Pods Running: %s", count_pods_running)
        if len(pods) != count_pods_running:
            result_code = 0
            log.critical("Some pods are not in running state")

        host_ips = {x['hostIP'] for x in pods}
        log.info("Hosts found: %d", len(host_ips))
        if len(host_ips) < 2 or len(pods) < 2:
            result_code = 0
            log.critical("%s has %d pods on %d hosts, not distributed",
                         podname, len(pods), len(host_ips))

        if result_code == 0:
            log.critical("Please check pods are in running "
                         "state, and on unique hosts")
            log.critical("oc get pods -n %s -o wide", self.args.namespace)

        # result_code 1 means the pods are on different nodes
        # count_pods_running means the running pod number
        self.send_metrics(keybase=keybase,
                          location=result_code,
                          status=count_pods_running)

    def send_metrics(self, keybase="", location="", status=""):
        """Queue the location/status values under keybase and send them."""
        ms = MetricSender(verbose=self.args.verbose)
        ms.add_metric({keybase + '.location': location})
        ms.add_metric({keybase + '.status': status})
        ms.send_metrics()