Example #1
    def check_election_status(self, config):
        """
        Retrieves the leader-election annotation from a given object, and
        submits metrics and a service check.

        An integration warning is sent if the object is not retrievable,
        or no record is found. Monitors on the service-check should have
        no-data alerts enabled to account for this.

        The config object requires the following fields:
            namespace (prefix for the metrics and check)
            record_kind (endpoints or configmap)
            record_name
            record_namespace
            tags (optional)

        It reads the following agent configuration:
            kubernetes_kubeconfig_path: default is to use in-cluster config
        """
        try:
            record = self._get_record(config.get("record_kind", ""),
                                      config.get("record_name", ""),
                                      config.get("record_namespace", ""))
            self._report_status(config, record)
        except Exception as e:
            self.warning(
                "Cannot retrieve leader election record {}: {}".format(
                    config.get("record_name", ""), e))
Example #2
    def _report_status(self, config, record):
        # Compute prefix for gauges and service check
        prefix = config.get("namespace") + ".leader_election"

        # Compute tags for gauges and service check
        tags = []
        for n in ["record_kind", "record_name", "record_namespace"]:
            if n in config:
                tags.append("{}:{}".format(n, config[n]))
        tags += config.get("tags", [])

        # Sanity check on the record
        valid, reason = record.validate()
        if not valid:
            self.service_check(prefix + ".status",
                               AgentCheck.CRITICAL,
                               tags=tags,
                               message=reason)
            return  # Stop here

        # Report metrics
        self.monotonic_count(prefix + ".transitions", record.transitions, tags)
        self.gauge(prefix + ".lease_duration", record.lease_duration, tags)

        leader_status = AgentCheck.OK
        if record.seconds_until_renew + record.lease_duration < 0:
            leader_status = AgentCheck.CRITICAL
        self.service_check(prefix + ".status",
                           leader_status,
                           tags=tags,
                           message=record.summary)
Example #3
def main(cfg):
    #parse config
    if os.path.isfile(cfg):
        config = ConfigParser.ConfigParser()
        config.read(cfg)
        test_name = config.get('kraken', 'test_type')
        namespace = config.get('kraken', 'name')
        label = config.get('kraken', 'label')
        master_label = config.get('kraken', 'master_label')
        if (label is None):
            print(
                Fore.YELLOW +
                'label is not provided, assuming you are okay with deleting any of the available nodes except the master\n'
            )
            label = "undefined"
        if test_name == "kill_node":
            node_test(label, master_label)
        elif test_name == "crash_node":
            node_crash(label, master_label)
        elif test_name == "kill_master":
            master_test(label, master_label)
        elif test_name == "kill_etcd":
            etcd_test(label, master_label)
        else:
            print(
                Fore.RED +
                '%s is not a valid scenario, please choose from kill_node, crash_node, kill_etcd, kill_master'
                % (test_name))
            sys.exit(1)
    else:
        help()
        sys.exit(1)
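For reference, the ConfigParser calls above read a [kraken] section from an INI-style file. A hypothetical config consistent with those keys, sketched here with the Python 3 configparser module and illustrative values, could be parsed like this:

import configparser

sample_cfg = """
[kraken]
test_type = kill_node
name = openshift-kube-apiserver
label = node-role.kubernetes.io/worker=
master_label = node_type=master
"""
parser = configparser.ConfigParser()
parser.read_string(sample_cfg)
assert parser.get('kraken', 'test_type') == 'kill_node'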
Example #4
def main(cfg):
    #parse config
    if os.path.isfile(cfg):
        config = ConfigParser.ConfigParser()
        config.read(cfg)
        test_name = config.get('kraken', 'test_type')
        namespace = config.get('kraken','name')
        label = config.get('kraken', 'label')
        master_label = config.get('kraken', 'master_label')
        # wabouham ADDED: master_label is normally "node_type=master" 
        print(Fore.YELLOW + 'label is: %s\n' % (label))
        print(Fore.YELLOW + 'master_label is: %s\n' % (master_label))

        if (label is None):
            print (Fore.YELLOW + 'label is not provided, assuming you are okay with deleting any of the available nodes except the master\n')
            label = "undefined"
        if test_name == "kill_node":
            node_test(label, master_label)
        elif test_name == "crash_node":
            node_crash(label, master_label)
        elif test_name == "kill_master":
            master_test(label, master_label)
        elif test_name == "kill_etcd":
            etcd_test(label, master_label)
        else:
            print(Fore.RED + '%s is not a valid scenario, please choose from kill_node, crash_node, kill_etcd, kill_master' % (test_name))
            sys.exit(1)
    else:
        help()
        sys.exit(1)
Example #5
 def __init__(self, config, trawler):
     # Takes in config object and trawler instance it's behind
     # In k8s or outside
     self.use_kubeconfig = trawler.use_kubeconfig
     # Namespace to find management pods
     self.namespace = config.get('namespace', 'default')
     # Maximum frequency to pull data from APIC
     self.max_frequency = int(config.get('frequency', 600))
     if self.use_kubeconfig:
         logger.error(
             "Analytics metrics currently only available in cluster")
     else:
         self.find_hostname_and_certs()
Example #6
 def __init__(self, config, trawler):
     # Takes in config object and trawler instance it's behind
     # Use kubeconfig or in-cluster config for k8s comms
     self.use_kubeconfig = trawler.use_kubeconfig
     # Namespace to find management pods
     self.namespace = config.get('namespace', 'default')
     # Maximum frequency to pull data from APIC
     self.max_frequency = int(config.get('frequency', 600))
     # Cloud manager username to use for REST calls
     self.username = config.get('username', 'admin')
     # Load password from secret `cloudmanager_password`
     self.password = trawler.read_secret('cloudmanager_password')
     if self.password is None:
         # Use out of box default password
         self.password = '******'
     self.hostname = self.find_hostname()
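The comments above describe the keys this constructor reads from its config section; a minimal sketch with illustrative values might be:

config = {
    'namespace': 'apic-management',   # namespace containing the management pods (hypothetical)
    'frequency': 600,                 # maximum polling interval, in seconds
    'username': 'admin',              # cloud manager user for the REST calls
}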
Example #7
def get_current_context():
    '''
    Read ~/.kube/config to get the current context from the merged kubeconfig --> 'kubectl config current-context'

    :return: current_context
    '''

    config = _config_loader()
    if config:
        return config.get('current-context')
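This and the following kubeasy examples rely on a _config_loader() helper that is not shown in the snippets. A minimal sketch of what it could look like, assuming it simply parses the merged kubeconfig at ~/.kube/config, is:

import os
import yaml

def _config_loader():
    """Hypothetical helper: return ~/.kube/config as a dict, or None if it is missing."""
    path = os.path.expanduser('~/.kube/config')
    if not os.path.isfile(path):
        return None
    with open(path) as f:
        return yaml.safe_load(f)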
Example #8
    def _report_status(self, config, record):
        # Compute prefix for gauges and service check
        prefix = config.get("namespace") + ".leader_election"

        # Compute tags for gauges and service check
        tags = []
        for k, v in {
                "record_kind": record.kind,
                "record_name": config.get("record_name"),
                "record_namespace": config.get("record_namespace"),
        }.items():
            if v is not None:
                tags.append("{}:{}".format(k, v))
        tags += config.get("tags", [])

        # Sanity check on the record
        valid, reason = record.validate()
        if not valid:
            self.service_check(prefix + ".status",
                               AgentCheck.CRITICAL,
                               tags=tags,
                               message=reason)
            return  # Stop here

        # Report metrics
        self.monotonic_count(prefix + ".transitions", record.transitions, tags)
        self.gauge(prefix + ".lease_duration", record.lease_duration, tags)

        leader_status = AgentCheck.OK
        message = record.summary
        if record.seconds_until_renew + record.lease_duration < 0:
            leader_status = AgentCheck.CRITICAL
        if leader_status is AgentCheck.OK:
            message = None
        self.service_check(prefix + ".status",
                           leader_status,
                           tags=tags,
                           message=message)
Example #9
def _isExist(cluster_name):
    '''
    
    Read ~/.kube/config to check if the provided cluster name is already configured under ~/.kube/config

    :param cluster_name: Name of cluster to check against.
    :return: True or False 

    '''
    config = _config_loader()

    if config:

        for cluster in config.get('clusters', []):
            if (cluster['name'] == cluster_name):
                return True

    return False
Example #10
def get_kubeasyList(output=False):

    kubeasyList = {}
    config = _config_loader()

    if config:

        for cluster in config.get('clusters', []):

            if cluster['name'] == get_current_context():
                kubeasyList['** ' +
                            cluster['name']] = cluster['cluster']['server']
            else:
                kubeasyList['   ' +
                            cluster['name']] = cluster['cluster']['server']

    if output and kubeasyList:

        header = ['K8s Cluster', 'Master']
        print(
            '\n List of clusters which are currently ready to use for kubeasy:'
        )
        print(
            colorama.Fore.GREEN +
            '\n - \'kubeasy -d\' to access Kubernetes dashboard for the current context.'
        )
        print(
            colorama.Fore.GREEN +
            ' - \'kubeasy -c <cluster_name>\' to switch to another listed context.\n'
        )
        _print_table(kubeasyList, header)
        print(colorama.Fore.GREEN + '\nNote: ** indicates current context.\n')

    elif output and not kubeasyList:
        print(
            colorama.Fore.YELLOW +
            'Currently there are no clusters configured for kubeasy, Please check \"kubeasy -h\" for how to add new AKS\\GKE clusters.'
        )

    else:

        return kubeasyList
Example #11
def main(cfg):
    # Parse and read the config
    if os.path.isfile(cfg):
        with open(cfg, 'r') as f:
            config = yaml.full_load(f)
        config = config["shutdown"][0]
        cloud_type = config['cloud_type']
        kubeconfig_path = config.get("kubeconfig_path", "~/.kube/config")

        shutdown_master_num = config.get("shutdown_master_num", "all")
        shutdown_worker_num = config.get("shutdown_worker_num", "all")
        shutdown_infra_num = config.get("shutdown_infra_num", "all")

        ssh_file = config.get("ssh_file", "")

        initialize_clients(kubeconfig_path)
        downtime = calc_time(config.get("downtime", "300 s"))

        masters = list_nodes("node-role.kubernetes.io/master")
        backup_etcd(masters[1])

        if shutdown_master_num != "all":
            new_master_list = []
            for i in range(int(shutdown_master_num)):
                new_master_list.append(masters[i])
            masters = new_master_list

        workers = list_nodes(
            "node-role.kubernetes.io/worker=,node-role.kubernetes.io/infra!=")
        if shutdown_worker_num != "all":
            new_worker_list = []
            for i in range(int(shutdown_worker_num)):
                new_worker_list.append(workers[i])
            workers = new_worker_list

        infras = list_nodes("node-role.kubernetes.io/infra")

        if shutdown_infra_num != "all":
            new_infra_list = []
            for i in range(int(shutdown_infra_num)):
                new_infra_list.append(infras[i])
            infras = new_infra_list

        node_list = workers + infras + masters
        logging.info('node list ' + str(node_list))

        if cloud_type == "aws":
            aws = aws_node_scenarios()
            for node in node_list:
                logging.info('stop node ' + str(node))
                aws.node_stop_scenario(node)

        elif cloud_type == "azure" or cloud_type == "az":
            logging.info("azure")
            az_account = run_cmd("az account list -o yaml")
            az = azure_node_scenarios(az_account)
            for node in node_list:
                logging.info('stop node ' + str(node))
                az.node_stop_scenario(node)
        elif cloud_type == "gcp":
            logging.info('gcp')
            project = run_cmd('gcloud config get-value project').split(
                '\n')[0].strip()
            gcp = gcp_node_scenarios(project)

            for node in node_list:
                logging.info('stop node ' + str(node))
                gcp.node_stop_scenario(node)
        else:
            logging.info("Shutting down using ssh")
            shutdown_via_ssh(node_list, ssh_file)

        # wait period
        time.sleep(downtime)

        # restart cluster
        # start nodes based on cloud provider
        if cloud_type == "aws":
            for node in node_list:
                logging.info('start node ' + str(node))
                aws.node_start_scenario(node)

        elif cloud_type == "azure" or cloud_type == "az":

            for node in node_list:
                logging.info('start node ' + str(node))
                az.node_start_scenario(node)

        elif cloud_type == "gcp":
            for node in node_list:
                logging.info('start node ' + str(node))
                gcp.node_start_scenario(node)
        else:
            logging.info("Cloud type " + str(cloud_type) +
                         " is not supported ")
            sys.exit(1)

        wait_for_all_nodes_ready(masters)

        wait_for_all_nodes_ready(workers)

        wait_for_all_nodes_ready(infras)

        cluster_operators = run_cmd("oc get co")

        run_cmd("oc get nodes")
Example #12
def login_with_kubeconfig(**_: Any) -> Optional[credentials.ConnectionInfo]:
    """
    A minimalistic login handler that can get raw data from a kubeconfig file.

    Authentication capabilities can be limited to keep the code short & simple.
    No parsing or sophisticated multi-step token retrieval is performed.

    This login function is intended to make Kopf runnable in trivial cases
    when neither pykube-ng nor the official client library are installed.
    """

    # As per https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/
    kubeconfig = os.environ.get('KUBECONFIG')
    if not kubeconfig and os.path.exists(os.path.expanduser('~/.kube/config')):
        kubeconfig = '~/.kube/config'
    if not kubeconfig:
        return None

    paths = [path.strip() for path in kubeconfig.split(os.pathsep)]
    paths = [os.path.expanduser(path) for path in paths if path]

    # As prescribed: if the file is absent or non-deserialisable, then fail. The first value wins.
    current_context: Optional[str] = None
    contexts: Dict[Any, Any] = {}
    clusters: Dict[Any, Any] = {}
    users: Dict[Any, Any] = {}
    for path in paths:

        with open(path, 'rt', encoding='utf-8') as f:
            config = yaml.safe_load(f.read()) or {}

        if current_context is None:
            current_context = config.get('current-context')
        for item in config.get('contexts', []):
            if item['name'] not in contexts:
                contexts[item['name']] = item.get('context') or {}
        for item in config.get('clusters', []):
            if item['name'] not in clusters:
                clusters[item['name']] = item.get('cluster') or {}
        for item in config.get('users', []):
            if item['name'] not in users:
                users[item['name']] = item.get('user') or {}

    # Once fully parsed, use the current context only.
    if current_context is None:
        raise credentials.LoginError('Current context is not set in kubeconfigs.')
    context = contexts[current_context]
    cluster = clusters[context['cluster']]
    user = users[context['user']]

    # Unlike pykube's login, we do not make a fake API request to refresh the token.
    provider_token = user.get('auth-provider', {}).get('config', {}).get('access-token')

    # Map the retrieved fields into the credentials object.
    return credentials.ConnectionInfo(
        server=cluster.get('server'),
        ca_path=cluster.get('certificate-authority'),
        ca_data=cluster.get('certificate-authority-data'),
        insecure=cluster.get('insecure-skip-tls-verify'),
        certificate_path=user.get('client-certificate'),
        certificate_data=user.get('client-certificate-data'),
        private_key_path=user.get('client-key'),
        private_key_data=user.get('client-key-data'),
        username=user.get('username'),
        password=user.get('password'),
        token=user.get('token') or provider_token,
        default_namespace=context.get('namespace'),
        priority=PRIORITY_OF_KUBECONFIG,
    )
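For reference, a minimal single-context kubeconfig that this handler can map onto a ConnectionInfo might look as follows; the server address and token are illustrative. The handler takes the server and CA fields from the cluster entry, and the token or client-certificate fields from the user entry.

import yaml

minimal_kubeconfig = yaml.safe_load("""
current-context: dev
contexts:
  - name: dev
    context: {cluster: dev-cluster, user: dev-user, namespace: default}
clusters:
  - name: dev-cluster
    cluster: {server: "https://127.0.0.1:6443", insecure-skip-tls-verify: true}
users:
  - name: dev-user
    user: {token: "example-token"}
""")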
Example #13
    def create_deps(self, configList, is_wait_ip=True):

        result = {
            'datas': {
                'node': '',
                'deps': {}
            },
            'error': '',
            'status': True
        }

        # Attributes filtered out of the response and kept in the result
        result_deps_pool = [
            'name', 'deploy_name', 'service_name', 'host_ip', 'rf_port',
            'mysql_port', 'ssh_port', 'pod_ip', 'web_ssh_port', 'port_map',
            'error', 'res'
        ]

        reqData = []

        for config in configList:

            # Convert port values to strings
            for i in range(len(config.get('ports'))):
                config.get('ports')[i] = str(config.get('ports')[i])

            is_resource_occupied = config.get(
                'is_resource_occupied') if config.get(
                    'is_resource_occupied') else 0
            life_days = config.get('life_days') if config.get(
                'life_days') else 0

            is_count = config.get('is_count') if config.get('is_count') else 0
            max_count = config.get('max_count') if config.get(
                'max_count') else 0
            app_info = config.get('app_info') if config.get(
                'app_info') else None
            if app_info:
                if not app_info['project']:
                    app_info['project'] = "未知"
                if not app_info['department']:
                    app_info['department'] = "未知"
                if not app_info['panel']:
                    app_info['panel'] = "未知"
                if not app_info['packet']:
                    app_info['packet'] = "未知"
                if not app_info['component']:
                    app_info['component'] = "未知"
                if not app_info['usage']:
                    app_info['usage'] = "未知"

            reqData.append({
                "id": uuid.uuid4().__str__(),
                "uid": self.uid,
                "image": config.get('image'),
                "label": '',
                "name": 'sdkuser',
                "command": config.get('command'),
                "cpu": config.get('cpu'),
                "memory": config.get('memory'),
                "ephemeral_storage": config.get('ephemeral_storage'),
                "ports": ','.join(config.get('ports')),
                "is_build": 0,
                "is_persistent": 0,
                "is_resource_occupied": is_resource_occupied,
                "p_name": '',
                "p_path": '',
                "p_storage": 0,
                "sub_net_name": [],
                "node_name": '',
                "pic": '',
                "coordinate": [],
                "is_set": -1,
                "node_labels": config.get('node_labels'),
                "life_days": life_days,
                "is_count": is_count,
                "max_count": max_count,
                "app_info": app_info
            })

        headers = {
            "Accept": "*/*",
            "Accept-Encoding": "gzip, deflate",
            "User-Agent": "python-requests/2.9.1",
        }
        url = REQUEST_URL + "/api/multiDeploy"
        response = requests.post(url=url,
                                 data=json.dumps(reqData),
                                 headers=headers,
                                 verify=False)
        response = response.content.decode('UTF-8')
        response = json.loads(response)

        # pprint(response)

        datas = response.get('data')

        status = response.get('status')

        # print(status)
        if status:

            # Did the container count exceed the limit?
            if status.get('overNumLimit') == True:
                result['error'] = '容器创建数量超过上限'
                result['status'] = False
                return result

            # Are resources insufficient?
            if status.get('outOfResource') == True:
                result['error'] = '无可分配资源,请适当释放您的资源或通知资源负责人'
                result['status'] = False
                return result

            del status["outOfResource"]

            for k, v in status.items():
                if v == False:
                    result['error'] = '容器创建过程出错'
                    result['status'] = False
                    break
        # print('===========================')

        # All containers were created successfully
        if result['status'] == True:
            # Adjust the result structure

            id_to_name_map = {}  # mapping from id to name
            podNames = []  # list of container names, used when fetching pod_ip

            result.get("datas")["node"] = datas.get('node')

            # Build the container attribute bodies and populate the maps
            for id, name in datas.get('name').items():
                result.get("datas").get("deps")[name] = {}
                id_to_name_map[id] = name
                podNames.append(name)

            # Fill in the container attributes
            for k, v in datas.items():
                if k in result_deps_pool:
                    for dep_id, dep_val in v.items():
                        dep_name = id_to_name_map[dep_id]
                        if dep_name:
                            result.get("datas").get(
                                "deps")[dep_name][k] = dep_val

            # pprint(result)

            # is_wait_ip is True -> get pod_ip
            if is_wait_ip:
                # Fetch the pod IPs
                url = REQUEST_URL + "/api/getPodIps"
                for i in range(POD_IP_TIMES):
                    time.sleep(POD_IP_DELAY)
                    res = requests.post(url=url,
                                        data=json.dumps(podNames),
                                        headers=headers,
                                        verify=False)
                    res = res.content.decode('UTF-8')
                    res = json.loads(res)
                    # print('get_pod_res====================',res)
                    if res.get('status') == True:
                        pod_ips = res.get('datas')
                        for name, props in result.get("datas").get(
                                "deps").items():
                            result.get("datas").get(
                                "deps")[name]['pod_ip'] = pod_ips[name]
                        break

                # Waiting time exceeded: delete the containers whose pod_ip was never obtained and report an error
                delete_dep_list = []
                for d_name, d_item in result.get("datas").get("deps").items():
                    # If any container failed to get a pod_ip, delete all of them
                    if not d_item.get('pod_ip'):
                        if len(podNames) > 0:
                            for p_name in podNames:
                                delete_dep_list.append({"name": p_name})
                            print(
                                '----------------部分容器未获得pod_ip,需要删除所有容器:-----------------------:'
                            )
                            print(podNames)
                            delete_res = self.delete_deps(delete_dep_list)
                            print('----------------删除结果-------------------:')
                            print(delete_res)
                            # TODO: reset the result data
                            result = {
                                'datas': {
                                    'node': '',
                                    'deps': {}
                                },
                                'error': '',
                                'status': True
                            }
                            result['error'] = '创建失败,容器启动异常导致部分pod_ip未获取到,请确认配置'
                            result['status'] = False
                            # raise Exception("创建失败,容器启动异常导致部分pod_ip未获取到,请确认配置")
            # is_wait_ip is False -> pod_ip = '0.0.0.0'
            else:
                for name, props in result.get("datas").get("deps").items():
                    result.get("datas").get("deps")[name]['pod_ip'] = '0.0.0.0'

            # After obtaining the pod IPs, check the container status
            url_status = REQUEST_URL + "/api/isPodRunning?uid=" + self.uid
            res_status = requests.post(url=url_status,
                                       data=json.dumps(podNames),
                                       headers=headers,
                                       verify=False)
            res_status = res_status.content.decode('UTF-8')
            res_status = json.loads(res_status)
            if res_status.get('status') == False:
                result['error'] = res_status.get('error')
                result['status'] = False

        if result.get('datas').get('deps') == {} and result.get(
                'status') == True:
            result['error'] = '未获取到容器创建数据'
            result['status'] = False

        # pprint(result)
        return result
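A hypothetical configList entry consistent with the keys this method reads (image, command, resources, and ports are illustrative):

config_list = [{
    'image': 'registry.example.com/robotframework:latest',
    'command': 'sleep infinity',
    'cpu': '2',
    'memory': '4Gi',
    'ephemeral_storage': '10Gi',
    'ports': [22, 3306, 8080],          # converted to strings before the request is sent
    'node_labels': '',
    'is_resource_occupied': 0,
    'life_days': 7,
    'is_count': 0,
    'max_count': 0,
    'app_info': {
        'project': 'demo', 'department': 'qa', 'panel': '',
        'packet': '', 'component': '', 'usage': 'test',
    },
}]
# result = client.create_deps(config_list)   # `client` being an instance of the class above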
Example #14
 initimage = None
 proxyimage = None
 imagepullpolicy = IMAGEPULLPOLICY
 debug = DEBUG
 verbosity = VERBOSITY
 namespaces = NAMESPACES
 version = ISTIO_VERSION
 configmap = None
 if configmaps.items:
     found = [c for c in configmaps.items if c.metadata.name == CONFIGMAP]
     if found:
         configmap = found[0]
 if configmap is not None:
     print("Applying settings from configmap")
     config = yaml.safe_load(configmap.data['config'])
     policy = config.get('policy', 'enabled')
     initializername = config.get('initializerName', INITIALIZER)
     namespaces = config.get('namespaces', NAMESPACES)
     params = config.get('params')
     if params is not None:
         initimage = params.get('initImage', initimage)
         proxyimage = params.get('proxyImage', proxyimage)
         imagepullpolicy = params.get('imagePullPolicy', imagepullpolicy)
         debug = params.get('debugMode', debug)
         verbosity = params.get('verbosity', verbosity)
         version = params.get('version', version)
 if proxyimage is None:
     proxyimage = 'docker.io/istio/proxy_debug:%s' % version
 if initimage is None:
     initimage = 'docker.io/istio/proxy_init:%s' % version
 if not debug:
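The configmap branch above reads a YAML document stored under the data key 'config'. A hypothetical payload consistent with those lookups (the initializer name and image tags are illustrative) would be:

import yaml

sample = yaml.safe_load("""
policy: enabled
initializerName: sidecar.initializer.istio.io
namespaces: ["default"]
params:
  initImage: docker.io/istio/proxy_init:0.2.6
  proxyImage: docker.io/istio/proxy_debug:0.2.6
  imagePullPolicy: IfNotPresent
  debugMode: false
  verbosity: 2
  version: 0.2.6
""")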
Example #15
def main(cfg):
    # Parse and read the config
    if os.path.isfile(cfg):
        config = configparser.ConfigParser()
        config.read(cfg)
        watch_nodes = config.get('cerberus', 'watch_nodes')
        cerberus_publish_status = config.get('cerberus',
                                             'cerberus_publish_status')
        watch_etcd = config.get('cerberus', 'watch_etcd')
        etcd_namespace = config.get('cerberus', 'etcd_namespace')
        watch_openshift_apiserver = config.get('cerberus',
                                               'watch_openshift_apiserver')
        openshift_apiserver_namespace = \
            config.get('cerberus', 'openshift_apiserver_namespace')
        watch_kube_apiserver = config.get('cerberus', 'watch_kube_apiserver')
        kube_apiserver_namespace = config.get('cerberus',
                                              'kube_apiserver_namespace')
        watch_monitoring_stack = config.get('cerberus',
                                            'watch_monitoring_stack')
        monitoring_stack_namespace = config.get('cerberus',
                                                'monitoring_stack_namespace')
        watch_kube_controller = config.get('cerberus', 'watch_kube_controller')
        kube_controller_namespace = config.get('cerberus',
                                               'kube_controller_namespace')
        iterations = config.get('tunings', 'iterations')
        sleep_time = config.get('tunings', 'sleep_time')
        daemon_mode = config.get('tunings', 'daemon_mode')

        # Start cerberus
        logging.info("Starting cerberus")

        # Run http server using a separate thread
        # if cerberus is asked to publish the status.
        # It is served by the http server.
        if cerberus_publish_status == "True":
            logging.info("Publishing cerberus status at http://localhost:8086")
            _thread.start_new_thread(start_server, ())

        # Initialize the start iteration to 0
        iteration = 0

        # Set the number of iterations to loop to infinity
        # if daemon mode is enabled
        # or else set it to the provided iterations count in the config
        if daemon_mode == "True":
            logging.info("Daemon mode enabled, cerberus will monitor forever")
            logging.info("Ignoring the iterations set")
            iterations = float('inf')
        else:
            iterations = int(iterations)

        # Loop to run the components status checks starts here
        while (int(iteration) < iterations):
            iteration += 1

            # Monitor nodes status
            if watch_nodes == "True":
                watch_nodes_status = monitor_nodes()
                logging.info("Iteration %s: Node status: \
                              %s" % (iteration, watch_nodes_status))
            else:
                logging.info("Cerberus is not monitoring nodes,\
                              so setting the status to True and \
                              assuming that the nodes are ready")
                watch_nodes_status = True

            # Monitor etcd status
            if watch_etcd == "True":
                watch_etcd_status = monitor_namespace(etcd_namespace)
                logging.info("Iteration %s: \
                              Etcd member pods status: \
                              %s" % (iteration, watch_etcd_status))
            else:
                logging.info("Cerberus is not monitoring ETCD,\
                              so setting the status to True and \
                              assuming that the ETCD member pods are ready")
                watch_etcd_status = True

            # Monitor openshift-apiserver status
            if watch_openshift_apiserver == "True":
                watch_openshift_apiserver_status = \
                    monitor_namespace(openshift_apiserver_namespace)
                logging.info("Iteration %s: \
                              OpenShift apiserver status: \
                              %s" %
                             (iteration, watch_openshift_apiserver_status))
            else:
                logging.info("Cerberus is not monitoring openshift-apiserver,\
                              so setting the status to True \
                              and assuming that the \
                              openshift-apiserver is up and running")
                watch_openshift_apiserver_status = True

            # Monitor kube apiserver status
            if watch_kube_apiserver == "True":
                watch_kube_apiserver_status = \
                    monitor_namespace(kube_apiserver_namespace)
                logging.info("Iteration %s: \
                              Kube ApiServer status: \
                              %s" % (iteration, watch_kube_apiserver_status))
            else:
                logging.info("Cerberus is not monitoring Kube ApiServer, so \
                              setting the status to True and assuming that \
                              the Kube ApiServer is up and running")
                watch_kube_apiserver_status = True

            # Monitor prometheus/monitoring stack
            if watch_monitoring_stack == "True":
                watch_monitoring_stack_status = \
                    monitor_namespace(monitoring_stack_namespace)
                logging.info("Iteration %s: \
                              Monitoring stack status: \
                              %s" % (iteration, watch_monitoring_stack_status))
            else:
                logging.info("Cerberus is not monitoring prometheus/monitoring\
                             , so setting the status to True \
                             and assuming that the monitoring stack is \
                             up and running")
                watch_monitoring_stack_status = True

            # Monitor kube controller
            if watch_kube_controller == "True":
                watch_kube_controller_status = \
                    monitor_namespace(kube_controller_namespace)
                logging.info("Iteration %s: \
                              Kube controller status: \
                              %s" % (iteration, watch_kube_controller_status))
            else:
                logging.info("Cerberus is not monitoring kube controller, so \
                              setting the status to True and assuming that \
                              the kube controller is up and running")
                watch_kube_controller_status = True

            # Sleep for the specified duration
            logging.info("Sleeping for the \
                          specified duration: %s" % (sleep_time))
            time.sleep(float(sleep_time))

            # Set the cerberus status by checking the status of the
            # watched components/resources for the http server to publish it
            if (watch_nodes_status and watch_etcd_status
                    and watch_openshift_apiserver_status
                    and watch_kube_apiserver_status
                    and watch_monitoring_stack_status
                    and watch_kube_controller_status):
                cerberus_status = True
            else:
                cerberus_status = False
            if cerberus_publish_status == "True":
                publish_cerberus_status(cerberus_status)
        else:
            logging.info("Completed watching for the specified number of \
                          iterations: %s" % (iterations))
    else:
        logging.error("Could not find a config at %s, please check" % (cfg))
        sys.exit(1)
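The options pulled from config in this function live in two INI sections, [cerberus] and [tunings]. A hypothetical config consistent with this version of the function (namespace names and tunings are illustrative) could be parsed like this:

import configparser

sample_cfg = """
[cerberus]
watch_nodes = True
cerberus_publish_status = True
watch_etcd = True
etcd_namespace = openshift-etcd
watch_openshift_apiserver = True
openshift_apiserver_namespace = openshift-apiserver
watch_kube_apiserver = True
kube_apiserver_namespace = openshift-kube-apiserver
watch_monitoring_stack = True
monitoring_stack_namespace = openshift-monitoring
watch_kube_controller = True
kube_controller_namespace = openshift-kube-controller-manager

[tunings]
iterations = 5
sleep_time = 60
daemon_mode = False
"""
parser = configparser.ConfigParser()
parser.read_string(sample_cfg)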
Example #16
def deploy(deployment,
           chart,
           environment,
           namespace=None,
           helm_config_overrides_implicit=None,
           helm_config_overrides_string=None,
           version=None,
           timeout=None,
           force=False,
           atomic=False,
           cleanup_on_fail=False):
    """
    Deploy a JupyterHub.

    Expects the following files to exist in current directory

    {chart}/ (Helm deployment chart)
    deployments/
    - {deployment}
        - image/ (optional)
        - secrets/
            - {environment}.yaml
        - config/
          - common.yaml
          - {environment}.yaml

    A docker image from deployments/{deployment}/image is expected to be
    already built and available with imagebuilder.
    `jupyterhub.singleuser.image.tag` will be automatically set to this image
    tag.
    """
    if helm_config_overrides_implicit is None:
        helm_config_overrides_implicit = []
    if helm_config_overrides_string is None:
        helm_config_overrides_string = []

    config = get_config(deployment)

    name = f'{deployment}-{environment}'

    if namespace is None:
        namespace = name
    helm_config_files = [
        f for f in [
            os.path.join('deployments', deployment, 'config', 'common.yaml'),
            os.path.join('deployments', deployment, 'config',
                         f'{environment}.yaml'),
        ] if os.path.exists(f)
    ]

    helm_secret_files = [
        f for f in [
            # Support for secrets in same repo
            os.path.join('deployments', deployment, 'secrets',
                         f'{environment}.yaml'),
            # Support for secrets in a submodule repo
            os.path.join('secrets', 'deployments', deployment, 'secrets',
                         f'{environment}.yaml'),
        ] if os.path.exists(f)
    ]

    if config.get('images'):
        for image in config['images']['images']:
            # We can support other charts that wrap z2jh by allowing various
            # config paths where we set image tags and names.
            # We default to one sublevel, but we can do multiple levels.
            # With the PANGEO chart, this could be set to `pangeo.jupyterhub.singleuser.image`
            helm_config_overrides_string.append(
                f'{image.helm_substitution_path}.tag={image.tag}')
            helm_config_overrides_string.append(
                f'{image.helm_substitution_path}.name={image.name}')

    with ExitStack() as stack:
        decrypted_secret_files = [
            stack.enter_context(decrypt_file(f)) for f in helm_secret_files
        ]

        # Just in time for k8s access, activate the cluster credentials
        stack.enter_context(cluster_auth(deployment))
        helm_upgrade(
            name,
            namespace,
            chart,
            helm_config_files + decrypted_secret_files,
            helm_config_overrides_implicit,
            helm_config_overrides_string,
            version,
            timeout,
            force,
            atomic,
            cleanup_on_fail,
        )
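A hypothetical invocation consistent with the documented layout (the deployment, chart, and environment names are illustrative):

deploy(
    deployment='datahub',
    chart='hub',
    environment='staging',
    version='0.9.0',
    timeout=600,
    atomic=True,
    cleanup_on_fail=True,
)

This would pick up deployments/datahub/config/common.yaml and deployments/datahub/config/staging.yaml (plus the matching secrets files, if present) and hand them to helm_upgrade for the release and namespace 'datahub-staging'.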
Example #17
def main(cfg):
    # Parse and read the config
    if os.path.isfile(cfg):
        config = configparser.ConfigParser()
        config.read(cfg)
        watch_nodes = config.get('cerberus', 'watch_nodes')
        cerberus_publish_status = config.get('cerberus',
                                             'cerberus_publish_status')
        watch_etcd = config.get('cerberus', 'watch_etcd')
        etcd_namespace = config.get('cerberus', 'etcd_namespace')
        watch_openshift_apiserver = config.get('cerberus',
                                               'watch_openshift_apiserver')
        openshift_apiserver_namespace = \
            config.get('cerberus', 'openshift_apiserver_namespace')
        watch_kube_apiserver = config.get('cerberus', 'watch_kube_apiserver')
        kube_apiserver_namespace = config.get('cerberus',
                                              'kube_apiserver_namespace')
        watch_monitoring_stack = config.get('cerberus',
                                            'watch_monitoring_stack')
        monitoring_stack_namespace = config.get('cerberus',
                                                'monitoring_stack_namespace')
        watch_kube_controller = config.get('cerberus', 'watch_kube_controller')
        kube_controller_namespace = config.get('cerberus',
                                               'kube_controller_namespace')
        watch_machine_api = config.get('cerberus',
                                       'watch_machine_api_components')
        machine_api_namespace = config.get('cerberus', 'machine_api_namespace')
        watch_kube_scheduler = config.get('cerberus', 'watch_kube_scheduler')
        kube_scheduler_namespace = config.get('cerberus',
                                              'kube_scheduler_namespace')
        kubeconfig_path = config.get('cerberus', 'kubeconfig_path')
        iterations = config.get('tunings', 'iterations')
        sleep_time = config.get('tunings', 'sleep_time')
        daemon_mode = config.get('tunings', 'daemon_mode')

        # Initialize clients
        if not os.path.isfile(kubeconfig_path):
            kubeconfig_path = None
        initialize_clients(kubeconfig_path)

        # Start cerberus
        logging.info("Starting cerberus")

        # Run http server using a separate thread
        # if cerberus is asked to publish the status.
        # It is served by the http server.
        if cerberus_publish_status == "True":
            logging.info("Publishing cerberus status at http://localhost:8086")
            _thread.start_new_thread(start_server, ())

        # Initialize the start iteration to 0
        iteration = 0

        # Set the number of iterations to loop to infinity
        # if daemon mode is enabled
        # or else set it to the provided iterations count in the config
        if daemon_mode == "True":
            logging.info("Daemon mode enabled, cerberus will monitor forever")
            logging.info("Ignoring the iterations set")
            iterations = float('inf')
        else:
            iterations = int(iterations)

        # Loop to run the components status checks starts here
        while (int(iteration) < iterations):
            iteration += 1
            print("\n")

            # Monitor nodes status
            if watch_nodes == "True":
                watch_nodes_status, failed_nodes = monitor_nodes()
                logging.info("Iteration %s: Node status: %s" %
                             (iteration, watch_nodes_status))
            else:
                logging.info("Cerberus is not monitoring nodes, "
                             "so setting the status to True and "
                             "assuming that the nodes are ready")
                watch_nodes_status = True

            # Monitor etcd status
            if watch_etcd == "True":
                watch_etcd_status, failed_etcd_pods = \
                    monitor_namespace(etcd_namespace)
                logging.info("Iteration %s: Etcd member pods status: %s" %
                             (iteration, watch_etcd_status))
            else:
                logging.info("Cerberus is not monitoring ETCD, "
                             "so setting the status to True and "
                             "assuming that the ETCD member pods are ready")
                watch_etcd_status = True

            # Monitor openshift-apiserver status
            if watch_openshift_apiserver == "True":
                watch_openshift_apiserver_status, failed_ocp_apiserver_pods = \
                    monitor_namespace(openshift_apiserver_namespace)
                logging.info("Iteration %s: OpenShift apiserver status: %s" %
                             (iteration, watch_openshift_apiserver_status))
            else:
                logging.info("Cerberus is not monitoring openshift-apiserver, "
                             "so setting the status to True "
                             "and assuming that the "
                             "openshift-apiserver is up and running")
                watch_openshift_apiserver_status = True

            # Monitor kube apiserver status
            if watch_kube_apiserver == "True":
                watch_kube_apiserver_status, failed_kube_apiserver_pods = \
                    monitor_namespace(kube_apiserver_namespace)
                logging.info("Iteration %s: Kube ApiServer status: %s" %
                             (iteration, watch_kube_apiserver_status))
            else:
                logging.info("Cerberus is not monitoring Kube ApiServer, so "
                             "setting the status to True and assuming that "
                             "the Kube ApiServer is up and running")
                watch_kube_apiserver_status = True

            # Monitor prometheus/monitoring stack
            if watch_monitoring_stack == "True":
                watch_monitoring_stack_status, failed_monitoring_stack = \
                    monitor_namespace(monitoring_stack_namespace)
                logging.info("Iteration %s: Monitoring stack status: %s" %
                             (iteration, watch_monitoring_stack_status))
            else:
                logging.info("Cerberus is not monitoring prometheus stack, "
                             "so setting the status to True "
                             "and assuming that the monitoring stack is "
                             "up and running")
                watch_monitoring_stack_status = True

            # Monitor kube controller
            if watch_kube_controller == "True":
                watch_kube_controller_status, failed_kube_controller_pods = \
                    monitor_namespace(kube_controller_namespace)
                logging.info("Iteration %s: Kube controller status: %s" %
                             (iteration, watch_kube_controller_status))
            else:
                logging.info("Cerberus is not monitoring kube controller, so "
                             "setting the status to True and assuming that "
                             "the kube controller is up and running")
                watch_kube_controller_status = True

            # Monitor machine api components
            # Components includes operator, controller and auto scaler
            if watch_machine_api == "True":
                watch_machine_api_status, failed_machine_api_components = \
                    monitor_namespace(machine_api_namespace)
                logging.info(
                    "Iteration %s: Machine API components status: %s" %
                    (iteration, watch_machine_api_status))
            else:
                logging.info("Cerberus is not monitoring machine api "
                             "components, so setting the status to True and "
                             "assuming that it is up and running")
                watch_machine_api_status = True

            # Monitor kube scheduler
            if watch_kube_scheduler == "True":
                watch_kube_scheduler_status, failed_kube_scheduler_pods = \
                    monitor_namespace(kube_scheduler_namespace)
                logging.info("Iteration %s: Kube scheduler status: %s" %
                             (iteration, watch_kube_scheduler_status))
            else:
                logging.info("Cerberus is not monitoring kube scheduler, so "
                             "setting the status to True and assuming that "
                             "the kube scheduler is up and running")
                watch_kube_scheduler_status = True

            # Sleep for the specified duration
            logging.info("Sleeping for the "
                         "specified duration: %s" % (sleep_time))
            time.sleep(float(sleep_time))

            # Set the cerberus status by checking the status of the
            # watched components/resources for the http server to publish it
            if watch_nodes_status and watch_etcd_status \
                and watch_openshift_apiserver_status \
                and watch_kube_apiserver_status \
                and watch_monitoring_stack_status \
                and watch_kube_controller_status \
                and watch_machine_api_status \
                and watch_kube_scheduler_status:
                cerberus_status = True
            else:
                cerberus_status = False
                logging.info(
                    "Failed nodes: %s\n"
                    "Failed etcd pods: %s\n"
                    "Failed openshift apiserver pods: %s\n"
                    "Failed kube apiserver pods: %s\n"
                    "Failed monitoring stack components: %s\n"
                    "Failed kube controller pods: %s\n"
                    "Failed machine api components: %s "
                    "Failed kube scheduler pods: %s " %
                    (failed_nodes, failed_etcd_pods, failed_ocp_apiserver_pods,
                     failed_kube_apiserver_pods, failed_monitoring_stack,
                     failed_kube_controller_pods,
                     failed_machine_api_components,
                     failed_kube_scheduler_pods))

            if cerberus_publish_status == "True":
                publish_cerberus_status(cerberus_status)
        else:
            logging.info("Completed watching for the specified number of "
                         "iterations: %s" % (iterations))
    else:
        logging.error("Could not find a config at %s, please check" % (cfg))
        sys.exit(1)