def check_pod_status():
    """Background task that refreshes the stored phase of every known pod.

    For each ``KubePod`` row, the pod is read from the Kubernetes API in the
    namespace given by the ``MLBENCH_NAMESPACE`` environment variable, and the
    row is saved only when the reported phase differs from the stored one.

    The work runs under the ``pod_status`` PidFile; if acquiring it raises
    ``PidFileError`` the task returns without doing anything (presumably
    because another instance is already running -- confirm against PidFile).
    """
    from api.models.kubepod import KubePod

    try:
        with PidFile('pod_status') as p:
            print(p.pidname)
            ns = os.environ.get('MLBENCH_NAMESPACE')

            config.load_incluster_config()
            v1 = client.CoreV1Api()

            for pod in KubePod.objects.all():
                ret = v1.read_namespaced_pod(pod.name, ns)
                phase = ret.status.phase

                # Only hit the database when something actually changed.
                if phase != pod.phase:
                    pod.phase = phase
                    pod.save()
    except PidFileError:
        return
def main():
    """Configure Kubernetes access, then run discovery and polling forever.

    A local kube config is tried first; the DNS domain is then derived from
    the cluster name of the active context. If that raises ``IOError`` the
    in-cluster configuration is used instead: the namespace is read from the
    service-account mount and the domain from the ``search`` line of
    ``/etc/resolv.conf``.

    Raises:
        RuntimeError: if no DNS domain could be determined in the in-cluster
            branch (previously this surfaced as an unbound-variable error).
    """
    global_options = {}
    domain = None

    logging.basicConfig(level=logging.DEBUG, format='%(asctime)-15s %(message)s')
    logging.getLogger('kubernetes').setLevel(logging.WARNING)

    try:
        k8s_config.load_kube_config()
        _, context = k8s_config.list_kube_config_contexts()
        region = context['context']['cluster']
        domain = 'cc.{}.cloud.sap'.format(region)
        # The namespace is pinned rather than taken from
        # context['context']['namespace'].
        global_options['own_namespace'] = 'kube-system'
    except IOError:
        from os import environ
        environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default'
        k8s_config.load_incluster_config()
        with open('/var/run/secrets/kubernetes.io/serviceaccount/namespace', 'r') as f:
            global_options['own_namespace'] = f.read()
        with open('/etc/resolv.conf', 'r') as f:
            for line in f:
                # Raw string: same pattern as before, without the invalid
                # "\s" escape in a normal string literal.
                if re.match(r'^search\s+', line):
                    # The last whitespace-separated entry of the search line
                    # is taken as the domain.
                    _, domain = line.rsplit(' ', 1)
                    domain = domain.strip()

    if domain is None:
        raise RuntimeError(
            'Could not determine the DNS domain: no "search" entry found '
            'in /etc/resolv.conf')

    configurator = Configurator(domain, global_options)
    configurator.poll_config()
    discovery = DnsDiscovery(domain, configurator.global_options)
    discovery.register(re.compile(six.b(r'\Avc-[a-z]+-?\d+\Z')), configurator)

    while True:
        discovery.discover()
        configurator.poll()
        sleep(10)
def __init__(self, config):
    """Set up the Kubernetes-backed store from ``config``.

    Reads ``labels``, ``scope`` and ``retry_timeout`` (required) plus the
    optional ``scope_label``, ``namespace``, ``role_label``, ``context``,
    ``use_endpoints``, ``patronictl``, ``pod_ip``, ``ports`` and ``ttl``
    keys. Note that ``config['labels']`` is extended in place with the scope
    label and ``config['namespace']`` is overwritten with ``''`` before the
    parent constructor runs.
    """
    scope_label = config.get('scope_label', 'cluster-name')
    self._labels = config['labels']
    self._labels[scope_label] = config['scope']
    selector_parts = ['{0}={1}'.format(k, v) for k, v in self._labels.items()]
    self._label_selector = ','.join(selector_parts)
    self._namespace = config.get('namespace') or 'default'
    self._role_label = config.get('role_label', 'role')

    # The parent class must see an empty namespace.
    config['namespace'] = ''
    super(Kubernetes, self).__init__(config)

    retriable = (KubernetesRetriableException, HTTPException, HTTPError,
                 socket.error, socket.timeout)
    self._retry = Retry(deadline=config['retry_timeout'], max_delay=1,
                        max_tries=-1, retry_exceptions=retriable)
    self._ttl = None

    # Prefer in-cluster credentials, fall back to a kube config context.
    try:
        k8s_config.load_incluster_config()
    except k8s_config.ConfigException:
        k8s_config.load_kube_config(context=config.get('context', 'local'))

    self.__subsets = None
    use_endpoints = config.get('use_endpoints') and (config.get('patronictl') or 'pod_ip' in config)
    if use_endpoints:
        addresses = [k8s_client.V1EndpointAddress(ip=config['pod_ip'])]
        ports = []
        for port_cfg in config.get('ports', [{}]):
            port_kwargs = {'port': int(port_cfg.get('port', '5432'))}
            # Only forward the optional fields that carry a truthy value.
            for field in ('name', 'protocol'):
                if port_cfg.get(field):
                    port_kwargs[field] = port_cfg[field]
            ports.append(k8s_client.V1EndpointPort(**port_kwargs))
        self.__subsets = [k8s_client.V1EndpointSubset(addresses=addresses, ports=ports)]

    self._api = CoreV1ApiProxy(use_endpoints)
    self.set_retry_timeout(config['retry_timeout'])
    self.set_ttl(config.get('ttl') or 30)

    self._leader_observed_record = {}
    self._leader_observed_time = None
    self._leader_resource_version = None
    self._leader_observed_subsets = []
    self.__do_not_watch = False
def pytest_collection_modifyitems(config, items):
    """Mark collected tests as skipped when the environment cannot run them.

    Args:
        config: pytest config object; only ``getoption`` is used.
        items: collected test items; matching ones receive a skip marker.

    Skips are applied for: the recurring-job option, the volume driver not
    in use (CSI vs. flexvolume), and missing mount propagation on any node.
    """
    def skip_items_with(keyword, reason):
        # Add one shared skip marker to every item tagged with ``keyword``.
        marker = pytest.mark.skip(reason=reason)
        for item in items:
            if keyword in item.keywords:
                item.add_marker(marker)

    c = Configuration()
    c.assert_hostname = False
    Configuration.set_default(c)
    k8sconfig.load_incluster_config()
    core_api = k8sclient.CoreV1Api()

    check_longhorn(core_api)

    if config.getoption(SKIP_RECURRING_JOB_OPT):
        skip_items_with(
            "recurring_job",
            "remove " + SKIP_RECURRING_JOB_OPT + " option to run")

    if check_csi(core_api):
        skip_items_with("flexvolume", "environment is not using flexvolume")
    else:
        skip_items_with("csi", "environment is not using csi")

    all_nodes_support_mount_propagation = True
    for node in get_longhorn_api_client().list_node():
        node = wait_for_node_mountpropagation_condition(
            get_longhorn_api_client(), node["name"])
        if "conditions" not in node.keys():
            all_nodes_support_mount_propagation = False
        else:
            conditions = node["conditions"]
            # .items() works on both Python 2 and 3; .iteritems() is
            # Python 2 only.
            for key, condition in conditions.items():
                if key == NODE_CONDITION_MOUNTPROPAGATION and \
                        condition["status"] != CONDITION_STATUS_TRUE:
                    all_nodes_support_mount_propagation = False
                    break
        # One failing node is enough; stop querying the rest.
        if not all_nodes_support_mount_propagation:
            break

    if not all_nodes_support_mount_propagation:
        skip_upgrade = pytest.mark.skip(
            reason="environment does not support base image")
        skip_node = pytest.mark.skip(
            reason="environment does not support mount disk")
        for item in items:
            # An item carrying both keywords only gets the base-image marker.
            if "baseimage" in item.keywords:
                item.add_marker(skip_upgrade)
            elif "mountdisk" in item.keywords:
                item.add_marker(skip_node)
def _load_kube_config(in_cluster, cluster_context, config_file):
    """Load Kubernetes configuration and return a ``CoreV1Api`` client.

    Raises the stored import error when the kubernetes package is not
    available. With ``in_cluster`` truthy the in-cluster configuration is
    used; otherwise ``config_file`` and ``cluster_context`` are passed to
    ``load_kube_config``.
    """
    if not has_kubernetes:
        raise _import_err

    if not in_cluster:
        config.load_kube_config(config_file=config_file,
                                context=cluster_context)
    else:
        config.load_incluster_config()

    return client.CoreV1Api()
def _load_kube_config(in_cluster):
    """Load Kubernetes configuration and return a ``CoreV1Api`` client.

    Uses the in-cluster configuration when ``in_cluster`` is truthy and the
    default kube config otherwise.
    """
    from kubernetes import config, client

    # Both branches end in the same client construction, so only the loader
    # differs.
    load = config.load_incluster_config if in_cluster else config.load_kube_config
    load()
    return client.CoreV1Api()
def main():
    """Print the IP, namespace and name of every pod in the cluster."""
    config.load_incluster_config()
    core_api = client.CoreV1Api()

    print("Listing pods with their IPs:")
    pod_list = core_api.list_pod_for_all_namespaces(watch=False)
    for pod in pod_list.items:
        row = (pod.status.pod_ip, pod.metadata.namespace, pod.metadata.name)
        print("%s\t%s\t%s" % row)
def main():
    """Print name and IP of each pod in the namespace given as ``argv[1]``.

    All pods in the cluster are listed and then filtered by namespace.
    """
    wanted_namespace = sys.argv[1]

    config.load_incluster_config()
    core_api = client.CoreV1Api()

    every_pod = core_api.list_pod_for_all_namespaces(watch=False)
    for pod in every_pod.items:
        if pod.metadata.namespace != wanted_namespace:
            continue
        print("%s %s" % (pod.metadata.name, pod.status.pod_ip))
def _load_kube_config(in_cluster, cluster_context):
    """Return a ``CoreV1Api`` client for the requested configuration source.

    * ``in_cluster`` truthy: in-cluster configuration.
    * otherwise, ``cluster_context`` is None: default kube config.
    * otherwise: a dedicated API client built for ``cluster_context``,
      leaving the global default configuration untouched.
    """
    from kubernetes import config, client

    if in_cluster:
        config.load_incluster_config()
        return client.CoreV1Api()

    if cluster_context is not None:
        context_client = config.new_client_from_config(context=cluster_context)
        return client.CoreV1Api(api_client=context_client)

    config.load_kube_config()
    return client.CoreV1Api()
def serve(self):
    """Connect to Kubernetes, set up the Rook cluster wrapper and run the
    completion-kicking loop until shutdown is requested.

    Sets ``self._k8s`` and ``self._rook_cluster``, signals
    ``self._initialized``, then every 5 seconds waits on the module-level
    ``all_completions`` and drops those that report ``is_complete``.
    """
    global all_completions

    # For deployed clusters, we should always be running inside
    # a Rook cluster.  For development convenience, also support
    # running outside (reading ~/.kube config)
    if self._in_cluster():
        config.load_incluster_config()
        cluster_name = os.environ['ROOK_CLUSTER_NAME']
    else:
        self.log.warning("DEVELOPMENT ONLY: Reading kube config from ~")
        config.load_kube_config()

        cluster_name = "rook"

        # So that I can do port forwarding from my workstation - jcsp
        from kubernetes.client import configuration
        configuration.verify_ssl = False

    self._k8s = client.CoreV1Api()

    try:
        # XXX mystery hack -- I need to do an API call from
        # this context, or subsequent API usage from handle_command
        # fails with SSLError('bad handshake').  Suspect some kind of
        # thread context setup in SSL lib?
        self._k8s.list_namespaced_pod(cluster_name)
    except ApiException:
        # Ignore here to make self.available() fail with a proper error message
        pass

    self._rook_cluster = RookCluster(
        self._k8s,
        cluster_name)

    # In case Rook isn't already clued in to this ceph
    # cluster's existence, initialize it.
    # self._rook_cluster.init_rook()

    self._initialized.set()

    while not self._shutdown.is_set():
        # XXX hack (or is it?) to kick all completions periodically,
        # in case we had a caller that wait()'ed on them long enough
        # to get persistence but not long enough to get completion
        self.wait(all_completions)
        # Build a real list: on Python 3 filter() returns a one-shot
        # iterator, which would be exhausted by the next wait() and leave
        # nothing to iterate afterwards.
        all_completions = [c for c in all_completions if not c.is_complete]

        self._shutdown.wait(5)
def _load_kube_config(in_cluster, cluster_context, config_file):
    """Load Kubernetes configuration and return a ``CoreV1Api`` client.

    On Python 2 (``PY2`` truthy) a default client ``Configuration`` with
    ``assert_hostname`` disabled is installed before the client is built.
    """
    from kubernetes import config, client

    if not in_cluster:
        config.load_kube_config(config_file=config_file,
                                context=cluster_context)
    else:
        config.load_incluster_config()

    if PY2:
        # For connect_get_namespaced_pod_exec
        from kubernetes.client import Configuration
        py2_defaults = Configuration()
        py2_defaults.assert_hostname = False
        Configuration.set_default(py2_defaults)

    return client.CoreV1Api()
def main():
    """Callback entry point: ``<script> <action> <role> <cluster_name>``.

    ``action`` must be one of ``on_start``, ``on_stop`` or
    ``on_role_change``. When the role is ``master`` and the action is
    ``on_start`` or ``on_role_change``, the master endpoint is patched in the
    namespace named by the ``POD_NAMESPACE`` environment variable.

    Exits with a usage message when the arguments are malformed.
    """
    logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO)

    valid_actions = ('on_start', 'on_stop', 'on_role_change')
    if len(sys.argv) != 4 or sys.argv[1] not in valid_actions:
        # sys.exit() accepts a single argument, so the message has to be
        # formatted before it is passed in.
        sys.exit('Usage: %s <action> <role> <cluster_name>' % sys.argv[0])

    action, role, cluster = sys.argv[1:4]

    k8s_config.load_incluster_config()
    k8s_api = CoreV1Api()

    namespace = os.environ['POD_NAMESPACE']

    if role == 'master' and action in ('on_start', 'on_role_change'):
        patch_master_endpoint(k8s_api, namespace, cluster)
def __init__(self, **kwargs):
    """Store account/namespace settings and load Kubernetes configuration.

    Recognised keyword arguments:
        svcaccount: stored on the instance, defaults to ``'default'``.
        namespace: stored on the instance, defaults to ``'default'``.
        kubeconfig: ``'incluster'`` for the in-cluster configuration, a
            path to load that file, or empty/missing for the default one.

    urllib3 warnings are disabled as a final step.
    """
    self.svcaccount = kwargs.get('svcaccount','default')
    self.namespace = kwargs.get('namespace','default')

    kubeconfig = kwargs.get('kubeconfig')
    if kubeconfig == 'incluster':
        log.info('load incluster config')
        config.load_incluster_config()
    else:
        log.info('load config %s', kubeconfig)
        if kubeconfig:
            config.load_kube_config(kubeconfig)
        else:
            config.load_kube_config()

    import urllib3
    urllib3.disable_warnings()
def __init__(self, api_client=None, config_file=None):
    """Pick the API client this object talks through.

    Args:
        api_client: used as-is when truthy.
        config_file: kube config path; when None or empty the in-cluster
            configuration is loaded instead. Only consulted when a client
            has to be created.

    Without ``api_client``, the client stored on ``client.Configuration()``
    is reused, and created first if that attribute is falsy.
    """
    self._watch = None
    config = client.Configuration()

    if api_client:
        self.api_client = api_client
        return

    if not config.api_client:
        have_config_file = config_file is not None and config_file != ""
        if have_config_file:
            konfig.load_kube_config(config_file=config_file)
        else:
            konfig.load_incluster_config()
        config.api_client = klient.ApiClient()
        # K8S python client doesn't provide any way to configure the
        # client pool size, so we inject the value here
        pool_kw = config.api_client.rest_client.pool_manager.connection_pool_kw
        pool_kw['maxsize'] = 20

    self.api_client = config.api_client
def __init__(self, *args, **kwargs):
    """Load Kubernetes config, build the selectors and start the object.

    The label and field selectors are ``key=value`` pairs joined by commas,
    built from ``self.labels`` and ``self.fields``.
    """
    super().__init__(*args, **kwargs)

    def as_selector(mapping):
        # Render {"a": "b", "c": "d"} as "a=b,c=d".
        return ','.join('{}={}'.format(k, v) for k, v in mapping.items())

    # Load kubernetes config here, since this is a Singleton and
    # so this __init__ will be run way before anything else gets run.
    try:
        config.load_incluster_config()
    except config.ConfigException:
        config.load_kube_config()
    self.api = shared_client(self.api_group_name)

    # FIXME: Protect against malicious labels?
    self.label_selector = as_selector(self.labels)
    self.field_selector = as_selector(self.fields)

    self.first_load_future = Future()
    self._stop_event = threading.Event()

    self.start()
def check_new_pods():
    """Background task that syncs ``KubePod`` rows with the cluster's workers.

    Worker pods are listed in the namespace from ``MLBENCH_NAMESPACE`` using
    the label selector ``component=worker,app=mlbench,release=<release>``,
    where the release comes from ``MLBENCH_KUBE_RELEASENAME``. Pods without a
    row get one; rows whose pod no longer appears in the listing are deleted.

    The work runs under the ``new_pods`` PidFile; if acquiring it raises
    ``PidFileError`` the task returns without doing anything.
    """
    from api.models.kubepod import KubePod

    try:
        with PidFile('new_pods') as p:
            print(p.pidname)
            config.load_incluster_config()
            v1 = client.CoreV1Api()

            release_name = os.environ.get('MLBENCH_KUBE_RELEASENAME')
            ns = os.environ.get('MLBENCH_NAMESPACE')

            ret = v1.list_namespaced_pod(
                ns,
                label_selector="component=worker,app=mlbench,release={}"
                .format(release_name))

            # flat=True yields plain names. Without it values_list() yields
            # 1-tuples, so the membership test below never matched and the
            # "stale pod" list was never reduced.
            all_pods = list(
                KubePod.objects.all().values_list('name', flat=True))

            for i in ret.items:
                if KubePod.objects.filter(name=i.metadata.name).count() == 0:
                    ip = i.status.pod_ip
                    if ip is None:
                        ip = ""

                    pod = KubePod(name=i.metadata.name,
                                  labels=i.metadata.labels,
                                  phase=i.status.phase,
                                  ip=ip,
                                  node_name=i.spec.node_name)
                    pod.save()
                # Anything still running is not stale.
                if i.metadata.name in all_pods:
                    all_pods.remove(i.metadata.name)

            # What is left was known before but is gone from the cluster.
            KubePod.objects.filter(name__in=all_pods).delete()
    except PidFileError:
        return
def init(self):
    """Locate the mlbench master pod and derive the metrics endpoint.

    If the in-cluster configuration cannot be loaded, ``self._incluster`` is
    set to False and the method returns without initializing anything else.
    Otherwise exactly one pod labelled ``component=master,app=mlbench`` is
    expected in the ``default`` namespace; its IP is used to build
    ``self.endpoint`` and ``self._initialized`` is set to True.

    Raises:
        Exception: whatever ``list_namespaced_pod`` raised, after printing
            it. (Previously the error was swallowed and the code then failed
            on an unbound ``api_response``.)
        AssertionError: if the number of matching pods is not exactly one.
    """
    from kubernetes import config, client

    try:
        config.load_incluster_config()
    except Exception:
        # Not running inside a cluster: nothing to initialize.
        self._incluster = False
        return

    configuration = client.Configuration()

    class MyApiClient(client.ApiClient):
        """
        A bug introduced by a fix.

        https://github.com/kubernetes-client/python/issues/411
        https://github.com/swagger-api/swagger-codegen/issues/6392
        """

        def __del__(self):
            pass

    self.api_instance = client.CoreV1Api(MyApiClient(configuration))

    # TODO: remove hardcoded part in the future.
    self.namespace = 'default'
    label_selector = 'component=master,app=mlbench'

    try:
        api_response = self.api_instance.list_namespaced_pod(
            self.namespace, label_selector=label_selector)
    except Exception as e:
        print("Exception when calling CoreV1Api->list_namespaced_pod: %s\n" % e)
        # Without a response there is nothing to work with; surface the
        # real error instead of an unbound-variable failure below.
        raise

    assert len(api_response.items) == 1

    master_pod = api_response.items[0]
    ip = master_pod.status.pod_ip
    self.endpoint = "http://{ip}/api/metrics/".format(ip=ip)
    self._initialized = True
def kube_v1():
    """Return a ``CoreV1Api`` client, or None when no kube config exists.

    Inside a cluster (``KUBERNETES_SERVICE_HOST`` is set) the in-cluster
    configuration is loaded and any failure propagates. Outside, the local
    kube config is tried and a missing file yields None.
    """
    # XXX: is there a better way to check if we are inside a cluster or not?
    if "KUBERNETES_SERVICE_HOST" in os.environ:
        # If this goes horribly wrong and raises an exception (it shouldn't),
        # we'll crash, and Kubernetes will kill the pod. That's probably not an
        # unreasonable response.
        config.load_incluster_config()
        return client.CoreV1Api()

    # Here, we might be running in docker, in which case we'll likely not
    # have any Kube secrets, and that's OK.
    try:
        config.load_kube_config()
        return client.CoreV1Api()
    except FileNotFoundError:
        # Meh, just ride through.
        logger.info("No K8s")

    return None
def kube():
    """Load the in-cluster configuration and return a ``CoreV1Api`` client."""
    config.load_incluster_config()
    core_api = client.CoreV1Api()
    return core_api
def main(argv=None):
    """Launch an experiment from ``experiment.json`` and wait for its result.

    Args:
        argv: command-line arguments to parse; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``. Previously this parameter was
            accepted but ignored.

    Steps: remove stale ``best_rank.txt`` / ``best_model.txt`` under the
    destination, fill the placeholders of the trial template, create the
    experiment, wait until it reports ``Succeeded`` or ``Failed``, print the
    best trial on success, and finally delete the experiment.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--name',
        dest='name',
        type=str,
        help='Experiment name.'
    )
    parser.add_argument(
        '--destination',
        dest='destination',
        type=str,
        help='The file which stores the best trial of the experiment.'
    )
    parser.add_argument(
        '--train_file_path',
        dest='train_file_path',
        type=str,
        help='Location where training data is located.'
    )
    parser.add_argument(
        '--validation_files_path',
        dest='validation_files_path',
        type=str,
        help='Location where validation data is located.'
    )
    parser.add_argument(
        '--validation_train_files_path',
        dest='validation_train_files_path',
        type=str,
        help='Location where validation of training data is located.'
    )
    parser.add_argument(
        '--es_host',
        dest='es_host',
        type=str,
        help='Name host of Elasticsearch.'
    )
    parser.add_argument(
        '--model_name',
        dest='model_name',
        type=str,
        help='Name of feature set saved in Elasticsearch.'
    )
    parser.add_argument(
        '--ranker',
        dest='ranker',
        type=str,
        help='RankLib algorith to use.'
    )

    args = parser.parse_args(argv)

    # Drop results left over from a previous run.
    files = [f'{args.destination}/best_rank.txt', f'{args.destination}/best_model.txt']
    for file_ in files:
        if os.path.isfile(file_):
            os.remove(file_)

    exp_json_file = PATH / 'experiment.json'
    # Use a context manager so the file handle is closed deterministically.
    with open(str(exp_json_file)) as exp_file:
        exp_def = json.loads(exp_file.read())

    # The template is serialized first so the placeholders can be replaced
    # textually, then stored back as a string.
    raw_template = json.dumps(
        exp_def['spec']['trialTemplate']['goTemplate']['rawTemplate']
    )
    raw_template = raw_template\
        .replace('{PROJECT_ID}', os.getenv('PROJECT_ID'))\
        .replace('{train_file_path}', args.train_file_path)\
        .replace('{validation_files_path}', args.validation_files_path)\
        .replace('{validation_train_files_path}', args.validation_train_files_path)\
        .replace('{es_host}', args.es_host)\
        .replace('{destination}', args.destination)\
        .replace('{model_name}', args.model_name)\
        .replace('{ranker}', args.ranker)

    exp_def['spec']['trialTemplate']['goTemplate']['rawTemplate'] = raw_template

    config.load_incluster_config()
    api_client = k8s_client.ApiClient()
    experiment = Experiment(client=api_client)
    # Random suffix, truncated to 33 characters.
    exp_name = f'{args.name}-{uuid.uuid4().hex}'[:33]

    exp_def['spec']['parameters'] = get_ranker_parameters(args.ranker)
    exp_def['metadata']['name'] = exp_name
    print('this is exp_def: ', json.dumps(exp_def))

    create_response = experiment.create(exp_def)
    print('create response: ', create_response)

    expected_conditions = ["Succeeded", "Failed"]
    current_exp = experiment.wait_for_condition('kubeflow', exp_name, expected_conditions)
    print('current_exp: ', json.dumps(current_exp))

    expected, _ = experiment.is_expected_conditions(current_exp, ["Succeeded"])

    if expected:
        best_rank = current_exp["status"]["currentOptimalTrial"]["observation"][
            'metrics'][0]['value']
        print('Best Rank Found: ', best_rank)
        params = current_exp["status"]["currentOptimalTrial"]["parameterAssignments"]
        print(json.dumps(params))
        os.makedirs(os.path.dirname(args.destination), exist_ok=True)
        if os.path.isfile(args.destination):
            os.remove(args.destination)

    experiment.delete(exp_name, 'kubeflow')
def main():
    """Parse the monitor filters, connect to Kubernetes and run the monitor.

    A local kube config is tried first, falling back to the in-cluster
    configuration when the file is missing. The runner is always finished on
    exit, including after Ctrl-C.
    """
    def parse_bool(value):
        # argparse's ``type=bool`` turns every non-empty string -- including
        # "False" -- into True. Recognise the usual false spellings and keep
        # plain truthiness for everything else.
        if value.strip().lower() in ('false', 'f', '0', 'no', 'n', 'off'):
            return False
        return bool(value)

    parser = argparse.ArgumentParser()
    # ``--verbose`` alone also enables verbosity; ``--verbose <value>`` is
    # still accepted as before.
    parser.add_argument('--verbose', type=parse_bool, nargs='?', const=True,
                        default=False)
    parser.add_argument('--node', action='append', default=[],
                        help='Cilium pod names. Can specify multiple.')
    parser.add_argument('--selector', action='append', default=[],
                        help='k8s equality label selectors for pods which '
                        'monitor should listen to. each selector will '
                        'retrieve its own set of pods. '
                        'Format is "label-name=label-value" '
                        'Can specify multiple.')
    parser.add_argument('--pod', action='append', default=[],
                        help='pod names in form of "namespace:pod-name", '
                        'if there is no namespace, default is assumed. '
                        'Can specify multiple.')
    parser.add_argument('--endpoint', action='append', type=int, default=[],
                        help='Cilium endpoint ids. Can specify multiple.')

    parser.add_argument('--to-selector', action='append', default=[],
                        help='k8s equality label selectors for pods which '
                        'monitor should listen to. each selector will '
                        'retrieve its own set of pods. '
                        'Matches events that go to selected pods. '
                        'Format is "label-name=label-value" '
                        'Can specify multiple.')
    parser.add_argument('--to-pod', action='append', default=[],
                        help='pod names in form of "namespace:pod-name", '
                        'if there is no namespace, default is assumed. '
                        'Matches events that go to specified pods. '
                        'Can specify multiple.')
    parser.add_argument('--to-endpoint', action='append', type=int,
                        default=[],
                        help='Cilium endpoint ids. '
                        'Matches events that go to specified endpoints. '
                        'Can specify multiple.')

    parser.add_argument('--from-selector', action='append', default=[],
                        help='k8s equality label selectors for pods which '
                        'monitor should listen to. each selector will '
                        'retrieve its own set of pods. '
                        'Matches events that come from selected pods. '
                        'Format is "label-name=label-value" '
                        'Can specify multiple.')
    parser.add_argument('--from-pod', action='append', default=[],
                        help='pod names in form of "namespace:pod-name", '
                        'if there is no namespace, default is assumed. '
                        'Matches events that come from specified pods. '
                        'Can specify multiple.')
    parser.add_argument('--from-endpoint', action='append', type=int,
                        default=[],
                        help='Cilium endpoint ids. '
                        'Matches events that come from specified endpoints. '
                        'Can specify multiple.')

    args = parser.parse_args()

    try:
        config.load_kube_config()
    except FileNotFoundError:
        config.load_incluster_config()

    c = Configuration()
    c.assert_hostname = False
    Configuration.set_default(c)
    api = core_v1_api.CoreV1Api()
    runner = MonitorRunner('kube-system', api)

    monitor_args = MonitorArgs(args.verbose, args.selector, args.pod,
                               args.endpoint, args.to_selector, args.to_pod,
                               args.to_endpoint, args.from_selector,
                               args.from_pod, args.from_endpoint)

    try:
        runner.run(monitor_args, args.node)
        ui(runner)
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the monitor.
        pass
    finally:
        runner.finish()
def connect():
    """Configure the Kubernetes client from ``RD_*`` environment variables.

    Order of precedence:
      1. ``RD_CONFIG_ENV == 'incluster'``: in-cluster configuration.
      2. A config file (``RD_CONFIG_CONFIG_FILE``, then
         ``RD_NODE_KUBERNETES_CONFIG_FILE``).
      3. An explicit URL (``RD_CONFIG_URL``, then
         ``RD_NODE_KUBERNETES_CLUSTER_URL``) with the optional verify-SSL,
         CA-certificate and token settings, installed as the default client
         configuration.
      4. The default kube config file.

    For every setting the ``RD_CONFIG_*`` variable wins over its
    ``RD_NODE_KUBERNETES_*`` counterpart; empty values count as unset.
    """
    def first_env(*names):
        # First non-empty value among ``names``, or None.
        for name in names:
            value = os.environ.get(name)
            if value:
                return value
        return None

    if os.environ.get('RD_CONFIG_ENV') == 'incluster':
        config.load_incluster_config()
        return

    config_file = first_env('RD_CONFIG_CONFIG_FILE',
                            'RD_NODE_KUBERNETES_CONFIG_FILE')
    url = first_env('RD_CONFIG_URL', 'RD_NODE_KUBERNETES_CLUSTER_URL')
    verify_ssl = first_env('RD_CONFIG_VERIFY_SSL',
                           'RD_NODE_KUBERNETES_VERIFY_SSL')
    ssl_ca_cert = first_env('RD_CONFIG_SSL_CA_CERT',
                            'RD_NODE_KUBERNETES_SSL_CA_CERT')
    token = first_env('RD_CONFIG_TOKEN', 'RD_NODE_KUBERNETES_API_TOKEN')

    log.debug("config file")
    log.debug(config_file)
    log.debug("-------------------")

    if config_file:
        log.debug("getting settings from file %s", config_file)
        config.load_kube_config(config_file=config_file)
        return

    if not url:
        log.debug("getting settings from default config file")
        config.load_kube_config()
        return

    log.debug("getting settings from plugin configuration")
    configuration = Configuration()
    configuration.host = url

    if verify_ssl == 'true':
        configuration.verify_ssl = verify_ssl
    else:
        configuration.verify_ssl = None
        configuration.assert_hostname = False

    if ssl_ca_cert:
        configuration.ssl_ca_cert = ssl_ca_cert

    configuration.api_key['authorization'] = token
    configuration.api_key_prefix['authorization'] = 'Bearer'

    client.Configuration.set_default(configuration)
#print("%s" % (i.metadata.name)) #디버그 용 for j in i.status.conditions: #print("\t%s\t%s" % (j.type, j.status)) #디버그 용 if (j.type == "Ready" and j.status != "True"): if n_name in uk_node: uk_node[n_name] += 1 else: uk_node[n_name] = 0 print("unknown %s count=%d" % (n_name, uk_node[n_name])) # 카운터가 3회 넘어서면 노드를 제거 if uk_node[n_name] > 3: del uk_node[n_name] node_delete(v1, i.metadata.name) # 1번이라도 상태가 돌아오면 카운터를 초기화 if (j.type == "Ready" and j.status == "True"): if n_name in uk_node: del uk_node[n_name] except ApiException as e: print("Exception when calling CoreV1Api->list_node: %s\n" % e) ## 메인 if __name__ == '__main__': signal.signal(signal.SIGTERM, handler) # 시그널 처리 config.load_incluster_config() # 인증 정보 취득 v1 = client.CoreV1Api() # 인스턴스화 # 감시 루프 while True: node_monitor(v1) sleep(5) # 감시 간격
def main():
    """Consume namespace messages from RabbitMQ and create Tekton TaskRuns.

    An exclusive, server-named queue is bound to the ``topic_logs`` topic
    exchange once per namespace in the fixed list below. Every message body
    is decoded and used as the namespace in which an
    ``echo-hello-world`` TaskRun is created. Messages are auto-acknowledged.
    """
    config.load_incluster_config()
    api = client.CustomObjectsApi()

    credentials = pika.PlainCredentials("guest", "guest")
    parameters = pika.ConnectionParameters(
        "rabbitmq-0.rabbitmq.rabbits.svc.cluster.local",
        "5672",
        '/',
        credentials)
    connection = pika.BlockingConnection(parameters)
    channel = connection.channel()

    channel.exchange_declare(exchange='topic_logs', exchange_type='topic')

    declared = channel.queue_declare('', exclusive=True)
    queue_name = declared.method.queue

    namespaces=['rabbits','team-a','team-b']
    for routing_key in namespaces:
        channel.queue_bind(
            exchange='topic_logs', queue=queue_name, routing_key=routing_key)

    print(' [*] Waiting for logs. To exit press CTRL+C')

    def callback(ch, method, properties, body):
        # The message body is the target namespace.
        print(" [x] %r:%r" % (method.routing_key, body))

        task_run = {
            "apiVersion": "tekton.dev/v1beta1",
            "kind": "TaskRun",
            "metadata": {
                "generateName": "echo-hello-world-taskrun-",
                "namespace":body.decode()
            },
            "spec": {
                "serviceAccountName": "rabbitmq",
                "taskRef": {
                    "name":"echo-hello-world"
                }
            },
        }
        api.create_namespaced_custom_object(
            group="tekton.dev",
            version="v1beta1",
            namespace=body.decode(),
            plural="taskruns",
            body=task_run,
        )
        print("Resource created")

    channel.basic_consume(
        queue=queue_name, on_message_callback=callback, auto_ack=True)

    channel.start_consuming()
def __init__(self):
    """Load the in-cluster configuration and create the API clients.

    ``self.kubecoreapi`` is a ``CoreV1Api`` and ``self.kubebatchapi`` a
    ``BatchV1Api``.
    """
    # The configuration must be loaded before any client is constructed.
    config.load_incluster_config()

    core_api = client.CoreV1Api()
    batch_api = client.BatchV1Api()
    self.kubecoreapi = core_api
    self.kubebatchapi = batch_api
def __init__(self,
             namespace=None,
             service_type=None,
             gs_image=None,
             etcd_image=None,
             zookeeper_image=None,
             gie_graph_manager_image=None,
             coordinator_name=None,
             coordinator_service_name=None,
             etcd_cpu=None,
             etcd_mem=None,
             zookeeper_cpu=None,
             zookeeper_mem=None,
             gie_graph_manager_cpu=None,
             gie_graph_manager_mem=None,
             engine_cpu=None,
             engine_mem=None,
             vineyard_cpu=None,
             vineyard_mem=None,
             vineyard_shared_mem=None,
             image_pull_policy=None,
             image_pull_secrets=None,
             volumes=None,
             num_workers=None,
             instance_id=None,
             log_level=None,
             timeout_seconds=None,
             waiting_for_delete=None,
             delete_namespace=None,
             **kwargs):
    """Load Kubernetes configuration and record the launch settings.

    Most arguments are stored unchanged on same-named private attributes.
    The exceptions visible here:

    * ``instance_id`` is appended to the class-level ``*_prefix`` names to
      form the resource names, so it must be a string.
    * ``image_pull_secrets`` is a comma-separated string (or None, giving an
      empty list).
    * ``volumes`` is a JSON document and is parsed with ``json.loads``.
    * ``log_level`` is converted with ``parse_as_glog_level``.

    Extra keyword arguments are accepted and ignored.
    """
    try:
        kube_config.load_incluster_config()
    except Exception:
        # Not running inside a cluster (or its config is unusable): fall
        # back to the local kube config. ``Exception`` rather than a bare
        # except, so Ctrl-C and SystemExit are not swallowed.
        kube_config.load_kube_config()
    self._api_client = kube_client.ApiClient()
    self._core_api = kube_client.CoreV1Api(self._api_client)
    self._app_api = kube_client.AppsV1Api(self._api_client)

    self._instance_id = instance_id

    # random for multiple k8s cluster in the same namespace
    self._engine_name = self._engine_name_prefix + self._instance_id
    self._etcd_name = self._etcd_name_prefix + self._instance_id
    self._etcd_service_name = self._etcd_service_name_prefix + self._instance_id

    self._gie_graph_manager_name = (self._gie_graph_manager_name_prefix +
                                    self._instance_id)
    self._gie_graph_manager_service_name = (
        self._gie_graph_manager_service_name_prefix + self._instance_id)
    self._vineyard_service_name = (self._vineyard_service_name_prefix +
                                   self._instance_id)

    self._namespace = namespace
    self._service_type = service_type
    self._num_workers = num_workers

    self._coordinator_name = coordinator_name
    self._coordinator_service_name = coordinator_service_name

    self._resource_object = []

    # engine container info
    self._gs_image = gs_image
    self._engine_cpu = engine_cpu
    self._engine_mem = engine_mem

    # vineyard container info
    self._vineyard_cpu = vineyard_cpu
    self._vineyard_mem = vineyard_mem
    self._vineyard_shared_mem = vineyard_shared_mem

    # etcd pod info
    self._etcd_image = etcd_image
    self._etcd_cpu = etcd_cpu
    self._etcd_mem = etcd_mem

    # zookeeper pod info
    self._zookeeper_image = zookeeper_image
    self._zookeeper_cpu = zookeeper_cpu
    self._zookeeper_mem = zookeeper_mem

    # interactive engine graph manager info
    self._gie_graph_manager_image = gie_graph_manager_image
    self._gie_graph_manager_cpu = gie_graph_manager_cpu
    self._gie_graph_manager_mem = gie_graph_manager_mem

    self._image_pull_policy = image_pull_policy

    self._etcd_endpoint = None

    # image pull secrets
    if image_pull_secrets is not None:
        self._image_pull_secrets = image_pull_secrets.split(",")
    else:
        self._image_pull_secrets = []

    self._volumes = json.loads(volumes)

    self._host0 = None
    self._pod_name_list = None
    self._pod_ip_list = None
    self._pod_host_ip_list = None

    self._analytical_engine_endpoint = None
    self._vineyard_service_endpoint = None

    self._closed = False
    self._glog_level = parse_as_glog_level(log_level)
    self._timeout_seconds = timeout_seconds
    self._waiting_for_delete = waiting_for_delete
    self._delete_namespace = delete_namespace

    self._analytical_engine_process = None

    # 8000 ~ 9000 is exposed
    self._learning_engine_ports_usage = 8000
    self._graphlearn_services = dict()
    self._learning_instance_processes = {}
def create_k8s_api_client(configuration: Configuration,
                          secrets: Secrets = None) -> client.ApiClient:
    """
    Create a Kubernetes client from:

    1. From a local configuration file if it exists (`~/.kube/config`). You
       can specify which context you want to use as well through the
       `KUBERNETES_CONTEXT` key in the environment or in the `secrets` object.
    2. From the cluster configuration if executed from a Kubernetes pod and
       the CHAOSTOOLKIT_IN_POD is set to `"true"`.
    3. From a mix of the following environment keys:

        * KUBERNETES_HOST: Kubernetes API address

        You can authenticate with a token via:
        * KUBERNETES_API_KEY: the API key to authenticate with
        * KUBERNETES_API_KEY_PREFIX: the key kind, if not set, defaults to
          "Bearer"

        Or via a username/password:
        * KUBERNETES_USERNAME
        * KUBERNETES_PASSWORD

        Or via SSL:
        * KUBERNETES_CERT_FILE
        * KUBERNETES_KEY_FILE

        Finally, you may disable SSL verification against HTTPS endpoints:
        * KUBERNETES_VERIFY_SSL: should we verify the SSL (unset means no)
        * KUBERNETES_CA_CERT_FILE: path to the CA certificate when
          verification is expected

    You may pass a secrets dictionary, in which case, values will be looked
    there before the environ.
    """
    env = os.environ
    secrets = secrets or {}

    def lookup(k: str, d: str = None) -> str:
        # Secrets take precedence over the environment.
        return secrets.get(k, env.get(k, d))

    if has_local_config_file():
        context = lookup("KUBERNETES_CONTEXT")
        logger.debug("Using Kubernetes context: {}".format(
            context or "default"))
        return config.new_client_from_config(context=context)

    elif env.get("CHAOSTOOLKIT_IN_POD") == "true":
        config.load_incluster_config()
        return client.ApiClient()

    else:
        cfg = client.Configuration()
        cfg.debug = True
        cfg.host = lookup("KUBERNETES_HOST", "http://localhost")
        # Any value at all (even "false") enables verification; only an
        # unset key disables it.
        cfg.verify_ssl = lookup(
            "KUBERNETES_VERIFY_SSL", False) is not False
        # The CA bundle belongs in ``ssl_ca_cert``; ``cert_file`` is the
        # client certificate and is only set in the SSL branch below.
        cfg.ssl_ca_cert = lookup("KUBERNETES_CA_CERT_FILE")

        if "KUBERNETES_API_KEY" in env or "KUBERNETES_API_KEY" in secrets:
            cfg.api_key['authorization'] = lookup("KUBERNETES_API_KEY")
            cfg.api_key_prefix['authorization'] = lookup(
                "KUBERNETES_API_KEY_PREFIX", "Bearer")
        elif "KUBERNETES_CERT_FILE" in env or \
                "KUBERNETES_CERT_FILE" in secrets:
            cfg.cert_file = lookup("KUBERNETES_CERT_FILE")
            cfg.key_file = lookup("KUBERNETES_KEY_FILE")
        elif "KUBERNETES_USERNAME" in env or "KUBERNETES_USERNAME" in secrets:
            cfg.username = lookup("KUBERNETES_USERNAME")
            cfg.password = lookup("KUBERNETES_PASSWORD", "")

        return client.ApiClient(cfg)
def main():
    """Run the meta controller: watch the custom objects and reconcile them.

    For every ADDED/MODIFIED object, each of its resources gets a CRD and a
    controller deployment in our namespace (created if missing, image
    updated if stale). For every DELETED object, the matching deployments
    and CRDs are removed. The watch is restarted forever, resuming from the
    last seen ``resourceVersion``.

    Reads ``MY_NAMESPACE``, ``API_IMAGE`` and ``OWNER_NAME`` from the
    environment.
    """
    config.load_incluster_config()

    api_ext = client.ApiextensionsV1beta1Api()
    apps = client.AppsV1beta1Api()
    crds = client.CustomObjectsApi()

    # Create API controllers within our namespace, which we
    # get through the downward API.
    namespace = os.environ["MY_NAMESPACE"]
    api_controller_image = os.environ["API_IMAGE"]
    owner = apps.read_namespaced_deployment(os.environ["OWNER_NAME"], namespace)

    # Define our OwnerReference that we will add to the metadata of
    # objects we create so that they are garbage collected when this
    # controller is deleted.
    controller_ref = {
        "apiVersion": owner.api_version,
        "blockOwnerDeletion": True,
        "controller": True,
        "kind": owner.kind,
        "name": os.environ["OWNER_NAME"],
        "uid": owner.metadata.uid,
    }

    def owner_ref(obj, controller=False):
        # Build an OwnerReference dict pointing at ``obj``. Currently only
        # referenced by the TODO in process_meta.
        return {
            "apiVersion": obj["apiVersion"],
            "blockOwnerDeletion": True,
            "controller": controller,
            "kind": obj["kind"],
            "name": obj["metadata"]["name"],
            "uid": obj["metadata"]["uid"],
        }

    def delete_meta(api, resource):
        # Remove the controller deployment and the CRD of ``resource``.
        logging.error("Deleting deployment: %s", resource.group())
        apps.delete_namespaced_deployment(
            resource.group(), namespace, body=client.V1DeleteOptions(
                propagation_policy='Foreground', grace_period_seconds=5))
        logging.error("Deleting CRD: %s", resource.name())
        api_ext.delete_custom_resource_definition(
            resource.name(),
            body=client.V1DeleteOptions(propagation_policy='Foreground',
                                        grace_period_seconds=5))

    def update_meta(api, resource):
        # Bring the controller deployment's image up to date if needed.
        # TODO(mattmoor): Establish a better way to diff the actual/desired
        # object states and reconcile them.  For now, just check the image.
        controller = apps.read_namespaced_deployment(resource.group(), namespace)
        if controller.spec.template.spec.containers[
                0].image == api_controller_image:
            logging.warning("Image for %s controller is up-to-date!",
                            resource.name())
            return
        logging.warning("Updating image for %s controller", resource.name())
        controller.spec.template.spec.containers[
            0].image = api_controller_image
        apps.replace_namespaced_deployment(resource.group(), namespace, controller)

    def create_meta(api, resource):
        # Create the CRD and its controller deployment, both owned by us.
        api_ext.create_custom_resource_definition(
            resource.definition([controller_ref]))
        apps.create_namespaced_deployment(
            namespace, resource.controller(api_controller_image, [controller_ref]))

    def process_meta(t, api, obj):
        # Dispatch one watch event of type ``t`` for object ``obj``.
        if t == "DELETED":
            logging.error("Delete event: %s", json.dumps(obj, indent=1))
            for resource in api.resources():
                delete_meta(api, resource)
        elif t == "MODIFIED" or t == "ADDED":
            for resource in api.resources():
                controller_namespace = resource.controller_namespace()
                if controller_namespace:
                    if controller_namespace != namespace:
                        # This is being controlled by a controller in another namespace.
                        logging.warning(
                            "Found resourced being managed by another "
                            "meta-controller, this is bound to create "
                            "contention.  Skipping %s", api.name())
                        return
                    else:
                        # This is being controlled by us, make sure it is up to date.
                        update_meta(api, resource)
                        continue

                # TODO(mattmoor): See if we can make the api-controller owned
                # by the CRD that spawned it.  Right now, this seems ineffective.
                # crd_ref = owner_ref(obj)

                # This has not been processed yet.
                create_meta(api, resource)

                # Annotate our object with our resource (and namespace)
                resource.annotate(obj, namespace)
                obj = crds.replace_namespaced_custom_object(
                    DOMAIN, VERSION, namespace, PLURAL,
                    obj["metadata"]["name"], obj)
        else:
            logging.error("Unrecognized type: %s", t)

    resource_version = ""
    while True:
        stream = watch.Watch().stream(crds.list_cluster_custom_object,
                                      DOMAIN, VERSION, PLURAL,
                                      resource_version=resource_version)
        for event in stream:
            try:
                t = event["type"]
                obj = event["object"]
                api = Api(obj)
                process_meta(t, api, obj)

                # Configure where to resume streaming.
                metadata = obj.get("metadata")
                if metadata:
                    resource_version = metadata["resourceVersion"]
            except Exception:
                # Keep the controller alive on a bad event, but let
                # KeyboardInterrupt / SystemExit stop the process.
                logging.exception("Error handling event")
async def k8s_update_dn_info(app):
    """Update dn urls by querying the k8s api, then call each url to get node ids.

    Pods are listed either in the namespace given by the ``k8s_namespace``
    config value or, when that is unset, across all namespaces.  Pods whose
    ``app`` label equals the ``k8s_app_label`` config value and that already
    have an IP are treated as DN containers.

    On success ``app["dn_urls"]`` and ``app["dn_ids"]`` are replaced.  If no
    matching pod is found an error is logged and ``app`` is left untouched.
    """
    log.info("k8s_update_dn_info")
    # TBD - find more elegant way to avoid this warning
    import urllib3
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    # Load the in-cluster config, then install a copy with SSL verification
    # turned off as the default for all clients created afterwards.
    k8s_config.load_incluster_config()
    c = k8s_client.Configuration()
    c.verify_ssl = False
    k8s_client.Configuration.set_default(c)
    v1 = k8s_client.CoreV1Api()

    k8s_namespace = config.get("k8s_namespace")
    if k8s_namespace:
        # get pods for given namespace
        log.info(f"getting pods for namespace: {k8s_namespace}")
        ret = v1.list_namespaced_pod(namespace=k8s_namespace)
    else:
        log.info("getting pods for all namespaces")
        ret = v1.list_pod_for_all_namespaces(watch=False)

    pod_ips = []
    k8s_app_label = config.get("k8s_app_label")
    for i in ret.items:
        pod_ip = i.status.pod_ip
        if not pod_ip:
            # pod has not been assigned an address yet
            continue
        labels = i.metadata.labels
        if labels and "app" in labels and labels["app"] == k8s_app_label:
            log.info(
                f"found hsds pod with app label: {k8s_app_label} - ip: {pod_ip}"
            )
            pod_ips.append(pod_ip)

    if not pod_ips:
        log.error("Expected to find at least one hsds pod")
        return

    pod_ips.sort()  # for assigning node numbers
    dn_port = config.get("dn_port")
    dn_urls = [f"http://{pod_ip}:{dn_port}" for pod_ip in pod_ips]

    # call info on each dn container and get node ids
    # NOTE: iterate the urls discovered above, not app["dn_urls"]; the latter
    # is the previous (possibly missing) list and would leave dn_ids out of
    # step with the dn_urls saved below.
    dn_ids = []
    for dn_url in dn_urls:
        req = dn_url + "/info"
        log.debug(f"about to call: {req}")
        try:
            rsp_json = await http_get(app, req)
            if "node" not in rsp_json:
                log.error("Unexepected response from info (no node key)")
                continue
            node_json = rsp_json["node"]
            if "id" not in node_json:
                log.error("Unexepected response from info (no node/id key)")
                continue
            dn_ids.append(node_json["id"])
        except HTTPServiceUnavailable:
            log.warn("503 error from /info request")
        except Exception as e:
            # best effort: one unreachable node must not abort the refresh
            log.error(f"Exception: {e} from /info request")

    log.info(f"node_info check dn_ids: {dn_ids}")

    # save to global
    app["dn_urls"] = dn_urls
    app["dn_ids"] = dn_ids
def __init__(self):
    """Load the in-cluster Kubernetes configuration and keep a CoreV1 client."""
    config.load_incluster_config()
    core_api = client.CoreV1Api()
    self.k8s = core_api
def loadK8SConfig():
    """Load Kubernetes client configuration for the current environment.

    The in-cluster loader is used when ``KUBERNETES_PORT`` is present in the
    environment; otherwise the kubeconfig loader is used.
    """
    inside_cluster = 'KUBERNETES_PORT' in os.environ
    load = config.load_incluster_config if inside_cluster else config.load_kube_config
    load()
def main():
    """Parse the command line, start monitors on the selected nodes and show output.

    Kubernetes configuration is read from the kubeconfig file, falling back
    to in-cluster configuration when that raises ``FileNotFoundError``.
    ``runner.finish()`` is always called on the way out, including after
    Ctrl-C.
    """
    parser = argparse.ArgumentParser()

    def add_switch(flag, **extra):
        # Boolean option that is off unless present on the command line.
        parser.add_argument(flag, action='store_true', default=False, **extra)

    def add_repeatable(flag, **extra):
        # Option that may be given several times; values accumulate in a list.
        parser.add_argument(flag, action='append', default=[], **extra)

    parser.add_argument('--timeout-monitors', type=int, default=0,
                        help='Will remove monitor output which did '
                             'not update in last `timeout` seconds. '
                             'Will not work on last monitor on screen.')
    add_switch('--verbose')
    add_switch('--hex')

    # taken from github.com/cilium/cilium/cmd/monitor.go
    type_choices = ['drop', 'debug', 'capture', 'trace']
    add_repeatable('--type', choices=type_choices)

    add_repeatable('--node',
                   help='Specify which nodes monitor will be run on. '
                        'Can match either by cilium pod names or k8s node '
                        'names. Can specify multiple.')

    # Filters matching traffic in either direction.
    add_repeatable('--selector',
                   help='k8s equality label selectors for pods which '
                        'monitor should listen to. each selector will '
                        'retrieve its own set of pods. '
                        'Format is "label-name=label-value" '
                        'Can specify multiple.')
    add_repeatable('--pod',
                   help='pod names in form of "namespace:pod-name", '
                        'if there is no namespace, default is assumed. '
                        'Can specify multiple.')
    add_repeatable('--endpoint', type=int,
                   help='Cilium endpoint ids. Can specify multiple.')

    # Filters matching traffic going *to* the selected targets.
    add_repeatable('--to-selector',
                   help='k8s equality label selectors for pods which '
                        'monitor should listen to. each selector will '
                        'retrieve its own set of pods. '
                        'Matches events that go to selected pods. '
                        'Format is "label-name=label-value" '
                        'Can specify multiple.')
    add_repeatable('--to-pod',
                   help='pod names in form of "namespace:pod-name", '
                        'if there is no namespace, default is assumed. '
                        'Matches events that go to specified pods. '
                        'Can specify multiple.')
    add_repeatable('--to-endpoint', type=int,
                   help='Cilium endpoint ids. '
                        'Matches events that go to specified endpoints. '
                        'Can specify multiple.')

    # Filters matching traffic coming *from* the selected targets.
    add_repeatable('--from-selector',
                   help='k8s equality label selectors for pods which '
                        'monitor should listen to. each selector will '
                        'retrieve its own set of pods. '
                        'Matches events that come from selected pods. '
                        'Format is "label-name=label-value" '
                        'Can specify multiple.')
    add_repeatable('--from-pod',
                   help='pod names in form of "namespace:pod-name", '
                        'if there is no namespace, default is assumed. '
                        'Matches events that come from specified pods. '
                        'Can specify multiple.')
    add_repeatable('--from-endpoint', type=int,
                   help='Cilium endpoint ids. '
                        'Matches events that come from specified endpoints. '
                        'Can specify multiple.')

    parser.add_argument('--send-command', type=str, default="",
                        help='Execute command as-provided in argument on '
                             'all specified nodes and show output.')
    parser.add_argument('--cilium-namespace', type=str, default="kube-system",
                        help='Specify namespace in which Cilium pods reside')
    add_switch('--clear-monitors',
               help='Kill all `cilium monitor` on Cilium nodes. '
                    'Helpful for debugging')
    add_switch('--combine',
               help='Prints all output retrieved from nodes to '
                    'stdout. Times out after timeout_monitors.')

    args = parser.parse_args()

    try:
        config.load_kube_config()
    except FileNotFoundError:
        config.load_incluster_config()

    # Install a default client configuration with assert_hostname turned off.
    c = Configuration()
    c.assert_hostname = False
    Configuration.set_default(c)

    api = core_v1_api.CoreV1Api()
    runner = MonitorRunner(args.cilium_namespace, api)
    monitor_args = MonitorArgs(
        args.verbose, args.hex,
        args.selector, args.pod, args.endpoint,
        args.to_selector, args.to_pod, args.to_endpoint,
        args.from_selector, args.from_pod, args.from_endpoint,
        args.type)

    try:
        cmd = "pkill -f \"cilium monitor\"" if args.clear_monitors else args.send_command
        runner.run(monitor_args, args.node, cmd)
        if args.combine:
            batch(runner, args.timeout_monitors)
        elif not args.clear_monitors:
            ui(runner, args.timeout_monitors)
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to leave; fall through to cleanup.
        pass
    finally:
        runner.finish()
from socket import * from .container import ContainerProcessProxy from kubernetes import client, config from ..sessions.kernelsessionmanager import KernelSessionManager import urllib3 urllib3.disable_warnings() # Default logging level of kubernetes produces too much noise - raise to warning only. logging.getLogger('kubernetes').setLevel(os.environ.get('EG_KUBERNETES_LOG_LEVEL', logging.WARNING)) enterprise_gateway_namespace = os.environ.get('EG_NAMESPACE', 'default') default_kernel_service_account_name = os.environ.get('EG_DEFAULT_KERNEL_SERVICE_ACCOUNT_NAME', 'default') kernel_cluster_role = os.environ.get('EG_KERNEL_CLUSTER_ROLE', 'cluster-admin') shared_namespace = bool(os.environ.get('EG_SHARED_NAMESPACE', 'False').lower() == 'true') config.load_incluster_config() class KubernetesProcessProxy(ContainerProcessProxy): def __init__(self, kernel_manager, proxy_config): super(KubernetesProcessProxy, self).__init__(kernel_manager, proxy_config) self.kernel_namespace = None self.delete_kernel_namespace = False def launch_process(self, kernel_cmd, **kw): # Set env before superclass call so we see these in the debug output # Kubernetes relies on many internal env variables. Since EG is running in a k8s pod, we will
def resolve_hostnames(self):
    """Resolve every replica of every service to an IP and record it.

    The orchestrator is chosen by the ``KOLLAPS_ORCHESTRATOR`` environment
    variable (``swarm`` by default).  For each service the method blocks,
    polling every 3 seconds, until the number of resolved addresses equals
    the number of expected hosts.  Addresses are then sorted and assigned to
    the hosts in order, setting ``host.ip`` and ``host.replica_id`` and
    registering the host in ``self.hosts_by_ip``.
    """
    orchestrator = os.getenv('KOLLAPS_ORCHESTRATOR', 'swarm')
    experimentUUID = environ.get('KOLLAPS_UUID', '')

    if orchestrator == 'kubernetes':
        # kubernetes version
        # we are only talking to the kubernetes API
        config.load_incluster_config()
        kubeAPIInstance = client.CoreV1Api()

        for service in self.services:
            hosts = self.services[service]
            pod_prefix = service + "-" + experimentUUID
            ips = []
            while len(ips) != len(hosts):
                need_pods = kubeAPIInstance.list_namespaced_pod('default')
                try:
                    # loop through pods - much less elegant than using a DNS service
                    ips = [
                        str(pod.status.pod_ip)
                        for pod in need_pods.items
                        if pod.metadata.name.startswith(pod_prefix)
                        and pod.status.pod_ip is not None
                    ]
                except Exception:
                    sleep(3)
                    continue
                if len(ips) != len(hosts):
                    # Not every replica has an address yet: back off instead
                    # of hammering the API server in a tight loop.
                    sleep(3)

            ips.sort()  # needed for deterministic behaviour
            for i, (host, ip) in enumerate(zip(hosts, ips)):
                int_ip = ip2int(ip)
                host.ip = int_ip
                host.replica_id = i
                self.hosts_by_ip[int_ip] = host

    else:
        if orchestrator != 'swarm':
            print("Unrecognized orchestrator. Using default docker swarm.")

        # python's built in address resolver looks in /etc/hosts first
        # this is a problem since services with multiple replicas (same hostname)
        # will only have ONE entry in /etc/hosts, so the other hosts will never be found...
        # Solution: forcefully use dns queries that skip /etc/hosts (this pulls the dnspython dependency...)
        # Moreover, in some scenarios the /etc/resolv.conf is broken inside the containers
        # So to get the names to resolve properly we need to force to use dockers internal nameserver
        # 127.0.0.11
        docker_resolver = dns.resolver.Resolver(configure=False)
        docker_resolver.nameservers = ['127.0.0.11']

        for service in self.services:
            hosts = self.services[service]
            ips = []
            while len(ips) != len(hosts):
                try:
                    answers = docker_resolver.query(service + "-" + experimentUUID, 'A')
                    ips = [str(ip) for ip in answers]
                    if len(ips) != len(hosts):
                        sleep(3)
                except Exception:
                    # name not resolvable yet; Exception (not a bare except)
                    # so Ctrl-C / SystemExit still stop the wait
                    sleep(3)

            ips.sort()  # needed for deterministic behaviour
            for i, (host, ip) in enumerate(zip(hosts, ips)):
                int_ip = ip2int(ip)
                host.ip = int_ip
                host.replica_id = i
                self.hosts_by_ip[int_ip] = host
def monitor():
    """Watch all pods and keep the Zeek cluster layout file up to date.

    Every pod event updates the module-level ``monitored_pods`` (pods carrying
    a ``zeek-monitor`` label) and ``zeek_pods`` (pods carrying a ``zeek-node``
    label) maps.  After each event the topology is rebuilt, rendered through
    the layout template and, if the rendered file is new or differs from the
    one on disk, installed and followed by ``sync_zeek()``.

    This function only returns when the watch stream ends.
    """
    global monitored_pods
    global zeek_pods

    print("Running on node %s as %s" % (my_node_name, my_zeek_node_type))
    config.load_incluster_config()
    v1 = client.CoreV1Api()
    w = watch.Watch()

    for event in w.stream(v1.list_pod_for_all_namespaces):
        event_type = event['type']
        event_pod = event['object']
        metadata = event_pod.metadata
        # Pods without any label report labels=None; treat that as "no labels"
        # so the membership tests below cannot raise TypeError.
        labels = metadata.labels or {}
        key = metadata.namespace + '.' + metadata.name

        if event_type in ['ADDED', 'MODIFIED']:
            if 'zeek-monitor' in labels:
                monitored_pods[key] = event_pod
            if 'zeek-node' in labels:
                zeek_pods[key] = event_pod
        if event_type in ['DELETED']:
            monitored_pods.pop(key, None)
            zeek_pods.pop(key, None)

        zeek_topology = []
        zeek_workers = {}  # k8s node name -> zeek worker pod running there

        for pod in zeek_pods.values():
            pod_name = pod.metadata.name
            pod_ip = pod.status.pod_ip
            node_name = pod.spec.node_name
            zeek_node_type = pod.metadata.labels['zeek-node'].upper()

            if zeek_node_type in ['MANAGER']:
                zeek_topology.append({
                    'name': 'manager',
                    'type': zeek_node_type,
                    'ip': pod_ip
                })
            if zeek_node_type in ['PROXY', 'LOGGER']:
                # Name is derived from this pod's own type (the original
                # referenced an undefined ``node_type`` here).
                zeek_node_name = '%s-%s' % (zeek_node_type.lower(), pod_name)
                zeek_topology.append({
                    'name': zeek_node_name,
                    'type': zeek_node_type,
                    'ip': pod_ip,
                    'manager': 'manager'
                })
            if zeek_node_type in ['WORKER']:
                zeek_workers[node_name] = pod

        # One WORKER entry per monitored pod, served by the zeek worker pod
        # scheduled on the same k8s node (if any).
        for pod in monitored_pods.values():
            pod_name = pod.metadata.name
            pod_namespace = pod.metadata.namespace
            node_name = pod.spec.node_name
            if node_name in zeek_workers:
                zeek_worker_ip = zeek_workers[node_name].status.pod_ip
                zeek_node_name = 'worker-%s-%s' % (pod_namespace, pod_name)
                interface_hash = hashlib.sha1(
                    ('%s.%s' % (pod_namespace, pod_name)).encode('utf-8'))
                zeek_interface = '%s%s' % (interface_prefix,
                                           interface_hash.hexdigest()[:11])
                zeek_topology.append({
                    'name': zeek_node_name,
                    'type': 'WORKER',
                    'ip': zeek_worker_ip,
                    'manager': 'manager',
                    'interface': zeek_interface
                })

        # Sort by name so consecutive renders are comparable, then hand out
        # consecutive ports starting at 47761.
        zeek_topology.sort(key=lambda e: e['name'])
        port = 47761
        for element in zeek_topology:
            element['port'] = port
            port += 1

        with open(cluster_layout_template_file) as fi:
            template = Template(fi.read())
        rendered = template.render(zeek_topology=zeek_topology)

        if not os.path.isfile(cluster_layout_file):
            with open(cluster_layout_file, 'w') as fo:
                fo.write(rendered)
            sync_zeek()
        else:
            # Render next to the live file and only swap it in (and resync)
            # when the content actually changed.
            cluster_layout_file_temp = cluster_layout_file + '.tmp'
            with open(cluster_layout_file_temp, 'w') as fo:
                fo.write(rendered)
            equal = filecmp.cmp(cluster_layout_file, cluster_layout_file_temp)
            if not equal:
                os.rename(cluster_layout_file_temp, cluster_layout_file)
                sync_zeek()
def get_conn(self) -> client.ApiClient:
    """Return a kubernetes ``ApiClient`` configured from the hook and connection.

    Each setting is obtained by combining the hook attribute with the
    matching connection extra via ``self._coalesce_param`` (presumably the
    hook attribute wins when set -- confirm against that helper).

    Configuration sources are tried in this order, and the first that applies
    is used:

    1. ``in_cluster`` truthy -> in-cluster configuration
    2. ``kubeconfig_path`` -> kube config file at that path
    3. ``kubeconfig`` -> kube config content stored in the connection,
       written to a temporary file
    4. otherwise ``self._get_default_client``

    :raises AirflowException: if more than one of ``in_cluster``,
        ``kube_config`` and ``kube_config_path`` is set.
    """
    in_cluster = self._coalesce_param(
        self.in_cluster,
        self.conn_extras.get("extra__kubernetes__in_cluster") or None)
    cluster_context = self._coalesce_param(
        self.cluster_context,
        self.conn_extras.get("extra__kubernetes__cluster_context") or None)
    kubeconfig_path = self._coalesce_param(
        self.config_file,
        self.conn_extras.get("extra__kubernetes__kube_config_path") or None)
    # Inline kube config content only comes from the connection extras.
    kubeconfig = self.conn_extras.get(
        "extra__kubernetes__kube_config") or None

    # Count how many of the mutually exclusive sources are truthy.
    num_selected_configuration = len(
        [o for o in [in_cluster, kubeconfig, kubeconfig_path] if o])
    if num_selected_configuration > 1:
        raise AirflowException(
            "Invalid connection configuration. Options kube_config_path, "
            "kube_config, in_cluster are mutually exclusive. "
            "You can only use one option at a time.")

    disable_verify_ssl = self._coalesce_param(
        self.disable_verify_ssl,
        _get_bool(self._get_field("disable_verify_ssl")))
    disable_tcp_keepalive = self._coalesce_param(
        self.disable_tcp_keepalive,
        _get_bool(self._get_field("disable_tcp_keepalive")))

    # BEGIN apply settings from core kubernetes configuration
    # this section should be removed in next major release
    # Each (name, value) pair appended here is reported once via
    # _deprecation_warning_core_param below.
    deprecation_warnings: List[Tuple[str, Any]] = []
    if disable_verify_ssl is None and self._deprecated_core_disable_verify_ssl is True:
        deprecation_warnings.append(('verify_ssl', False))
        disable_verify_ssl = self._deprecated_core_disable_verify_ssl
    # by default, hook will try in_cluster first. so we only need to
    # apply core airflow config and alert when False and in_cluster not otherwise set.
    if in_cluster is None and self._deprecated_core_in_cluster is False:
        deprecation_warnings.append(
            ('in_cluster', self._deprecated_core_in_cluster))
        in_cluster = self._deprecated_core_in_cluster
    if not cluster_context and self._deprecated_core_cluster_context:
        deprecation_warnings.append(
            ('cluster_context', self._deprecated_core_cluster_context))
        cluster_context = self._deprecated_core_cluster_context
    if not kubeconfig_path and self._deprecated_core_config_file:
        deprecation_warnings.append(
            ('config_file', self._deprecated_core_config_file))
        kubeconfig_path = self._deprecated_core_config_file
    if disable_tcp_keepalive is None and self._deprecated_core_disable_tcp_keepalive is True:
        deprecation_warnings.append(('enable_tcp_keepalive', False))
        disable_tcp_keepalive = True
    if deprecation_warnings:
        self._deprecation_warning_core_param(deprecation_warnings)
    # END apply settings from core kubernetes configuration

    if disable_verify_ssl is True:
        _disable_verify_ssl()
    # Keepalive is enabled unless it was explicitly disabled (None counts as
    # "not disabled").
    if disable_tcp_keepalive is not True:
        _enable_tcp_keepalive()

    if in_cluster:
        self.log.debug(
            "loading kube_config from: in_cluster configuration")
        self._is_in_cluster = True
        # Note: neither client_configuration nor cluster_context is passed on
        # this path.
        config.load_incluster_config()
        return client.ApiClient()

    if kubeconfig_path is not None:
        self.log.debug("loading kube_config from: %s", kubeconfig_path)
        self._is_in_cluster = False
        config.load_kube_config(
            config_file=kubeconfig_path,
            client_configuration=self.client_configuration,
            context=cluster_context,
        )
        return client.ApiClient()

    if kubeconfig is not None:
        # The loader is given a file name, so the inline content is written
        # to a temporary file first.
        with tempfile.NamedTemporaryFile() as temp_config:
            self.log.debug(
                "loading kube_config from: connection kube_config")
            temp_config.write(kubeconfig.encode())
            # flush before the file is reopened by name below
            temp_config.flush()
            self._is_in_cluster = False
            config.load_kube_config(
                config_file=temp_config.name,
                client_configuration=self.client_configuration,
                context=cluster_context,
            )
        return client.ApiClient()

    return self._get_default_client(cluster_context=cluster_context)
def main():
    """Entry point of the collector: read settings from the environment and run.

    ``LABEL`` and ``FOLDER`` must be set; if either is missing a message is
    printed and ``-1`` is returned.  Depending on ``METHOD`` the matching
    resources are either listed once (``"LIST"``) or watched for changes.

    Kubernetes configuration is loaded from the default kubeconfig location
    when that file exists, otherwise from the in-cluster service account.
    """
    print(f"{timestamp()} Starting collector")

    folder_annotation = os.getenv(FOLDER_ANNOTATION)
    if folder_annotation is None:
        print(f"{timestamp()} No folder annotation was provided, "
              "defaulting to k8s-sidecar-target-directory")
        folder_annotation = "k8s-sidecar-target-directory"

    label = os.getenv(LABEL)
    if label is None:
        print(
            f"{timestamp()} Should have added {LABEL} as environment variable! Exit"
        )
        return -1

    label_value = os.getenv(LABEL_VALUE)
    if label_value:
        print(f"{timestamp()} Filter labels with value: {label_value}")

    target_folder = os.getenv(FOLDER)
    if target_folder is None:
        print(
            f"{timestamp()} Should have added {FOLDER} as environment variable! Exit"
        )
        return -1

    # "both" expands to secrets and configmaps; anything else is used as-is.
    resources = os.getenv(RESOURCE, "configmap")
    resources = ("secret", "configmap") if resources == "both" else (resources, )
    print(f"{timestamp()} Selected resource type: {resources}")

    method = os.getenv(REQ_METHOD)
    url = os.getenv(REQ_URL)
    payload = os.getenv(REQ_PAYLOAD)
    script = os.getenv(SCRIPT)

    # this is where kube_config is going to look for a config file
    kube_config = os.path.expanduser(KUBE_CONFIG_DEFAULT_LOCATION)
    if os.path.exists(kube_config):
        config.load_kube_config(kube_config)
    else:
        config.load_incluster_config()
    print(f"{timestamp()} Config for cluster api loaded...")

    # Read the pod's own namespace; use a context manager so the file handle
    # is closed deterministically instead of being left to the GC.
    with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as ns_file:
        current_namespace = ns_file.read()

    if os.getenv(SKIP_TLS_VERIFY) == "true":
        configuration = client.Configuration()
        configuration.verify_ssl = False
        configuration.debug = False
        client.Configuration.set_default(configuration)

    unique_filenames = os.getenv(UNIQUE_FILENAMES)
    if unique_filenames is not None and unique_filenames.lower() == "true":
        print(f"{timestamp()} Unique filenames will be enforced.")
        unique_filenames = True
    else:
        print(f"{timestamp()} Unique filenames will not be enforced.")
        unique_filenames = False

    if os.getenv(METHOD) == "LIST":
        for res in resources:
            list_resources(label, label_value, target_folder, url, method,
                           payload, current_namespace, folder_annotation,
                           res, unique_filenames, script)
    else:
        watch_for_changes(os.getenv(METHOD), label, label_value,
                          target_folder, url, method, payload,
                          current_namespace, folder_annotation, resources,
                          unique_filenames, script)
def __init__(self, options):
    """Create the SQS and Kubernetes clients and start both cooldown timers."""
    self.options = options
    self.sqs_client = boto3.client('sqs', region_name=options.aws_region)
    if not self.options.sqs_queue_url:
        # Derive the URL from the queue name
        lookup = self.sqs_client.get_queue_url(QueueName=self.options.sqs_queue_name)
        self.options.sqs_queue_url = lookup['QueueUrl']
    config.load_incluster_config()
    self.apps_v1 = client.AppsV1Api()
    self.last_scale_up_time = time()
    self.last_scale_down_time = time()


def message_count(self):
    """Return the queue's approximate number of messages as an int."""
    attributes = self.sqs_client.get_queue_attributes(
        QueueUrl=self.options.sqs_queue_url,
        AttributeNames=['ApproximateNumberOfMessages']
    )['Attributes']
    return int(attributes['ApproximateNumberOfMessages'])


def poll(self):
    """Run one scaling decision, then sleep for ``options.poll_period``.

    Scale-up and scale-down are evaluated independently; each one is skipped
    while its own cooldown has not elapsed.
    """
    opts = self.options
    pending = self.message_count()
    now = time()

    if pending >= opts.scale_up_messages:
        if now - self.last_scale_up_time > opts.scale_up_cool_down:
            self.scale_up()
            self.last_scale_up_time = now
        else:
            logger.debug("Waiting for scale up cooldown")

    if pending <= opts.scale_down_messages:
        if now - self.last_scale_down_time > opts.scale_down_cool_down:
            self.scale_down()
            self.last_scale_down_time = now
        else:
            logger.debug("Waiting for scale down cooldown")

    # code for scale to use msg_count
    sleep(opts.poll_period)


def scale_up(self):
    """Add one replica, or step down instead when already above ``max_pods``."""
    target = self.deployment()
    replicas = target.spec.replicas
    ceiling = self.options.max_pods
    if replicas < ceiling:
        logger.info("Scaling up")
        target.spec.replicas += 1
        self.update_deployment(target)
        return
    if replicas > ceiling:
        self.scale_down()
        return
    logger.info("Max pods reached")


def scale_down(self):
    """Remove one replica, or step up instead when already below ``min_pods``."""
    target = self.deployment()
    replicas = target.spec.replicas
    floor = self.options.min_pods
    if replicas > floor:
        logger.info("Scaling Down")
        target.spec.replicas -= 1
        self.update_deployment(target)
        return
    if replicas < floor:
        self.scale_up()
        return
    logger.info("Min pods reached")


def deployment(self):
    """Return the first deployment whose ``component`` label matches the configured name."""
    name = self.options.kubernetes_deployment
    namespace = self.options.kubernetes_namespace
    logger.debug("loading deployment: {} from namespace: {}".format(name, namespace))
    selector = "component={}".format(name)
    found = self.apps_v1.list_namespaced_deployment(namespace, label_selector=selector)
    return found.items[0]


def update_deployment(self, deployment):
    """Patch the configured deployment with the given (modified) object."""
    # Update the deployment
    patched = self.apps_v1.patch_namespaced_deployment(
        name=self.options.kubernetes_deployment,
        namespace=self.options.kubernetes_namespace,
        body=deployment)
    logger.debug("Deployment updated. status='%s'" % str(patched.status))


def run(self):
    """Poll forever; each ``poll`` call sleeps for the poll period itself."""
    opts = self.options
    logger.debug("Starting poll for {} every {}s".format(opts.sqs_queue_url, opts.poll_period))
    while True:
        self.poll()
def launch_kubernetes_kernel(connection_file, response_addr, spark_context_init_mode):
    """Launch a containerized kernel as a kubernetes pod.

    The ``kernel-pod.yaml`` template next to this file is stripped of
    comments, filled in from the arguments and from every ``KERNEL_*``
    environment variable, and each resulting document is created through the
    kubernetes API in the ``kernel_namespace`` namespace.

    The process exits (``sys.exit``) if any template parameter is left
    unsubstituted, or if a document has no ``kind`` or an unhandled one.
    """
    config.load_incluster_config()

    # Capture keywords and their values.
    keywords = dict()

    # Factory values...
    # Since jupyter lower cases the kernel directory as the kernel-name, we need to capture its case-sensitive
    # value since this is used to locate the kernel launch script within the image.
    keywords['kernel_name'] = os.path.basename(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    keywords['eg_response_address'] = response_addr
    keywords['kernel_connection_filename'] = connection_file
    keywords['kernel_spark_context_init_mode'] = spark_context_init_mode

    # Walk env variables looking for names prefixed with KERNEL_.  When found, set corresponding keyword value
    # with name in lower case.
    for name, value in os.environ.items():
        if name.startswith('KERNEL_'):
            keywords[name.lower()] = value

    # Read the kernel-pod yaml file, stripping off any commented lines.  This allows instances of the
    # yaml file to comment out substitution parameters since we want to fail the launch if any are left
    # unsubstituted.  Otherwise, commented out parameters could fail the launch if they had no substitutions.
    template_lines = []
    with open(os.path.join(os.path.dirname(__file__), "kernel-pod.yaml")) as f:
        for line in f:
            if '#' not in line:
                template_lines.append(line)
                continue
            # Cutting at '#' also cuts the newline.  Re-terminate what is left
            # and drop comment-only lines entirely, so the remainder (or its
            # leading indentation) is never glued onto the following line.
            content = line.split('#', 1)[0].rstrip()
            if content:
                template_lines.append(content + '\n')
    yaml_template = ''.join(template_lines)

    # Perform substitutions, then verify all parameters have been replaced.  If any
    # parameters still exist, print their names and exit.  If all have been replaced,
    # iterate over each document, issue creation statements.
    k8s_yaml = Template(yaml_template).safe_substitute(keywords)

    # Check for non-substituted parameters - exit if found.
    missing_params = ['${' + param[1] + '}' for param in Formatter().parse(k8s_yaml) if param[1]]
    if missing_params:
        sys.exit("ERROR - The following parameters were not substituted - kernel launch terminating! {}".
                 format(missing_params))

    # For each k8s object (kind), call the appropriate API method.  Too bad there isn't a method
    # that can take a set of objects.
    #
    # Creation for additional kinds of k8s objects can be added below.  Refer to
    # https://github.com/kubernetes-client/python for API signatures.  Other examples can be found in
    # https://github.com/jupyter-incubator/enterprise_gateway/blob/master/enterprise_gateway/services/processproxies/k8s.py
    kernel_namespace = keywords['kernel_namespace']
    core_api = client.CoreV1Api(client.ApiClient())

    # safe_load_all: the manifests are plain data, so the safe loader is
    # sufficient, and unlike yaml.load_all it needs no explicit Loader
    # argument (mandatory since PyYAML 6).
    for k8s_obj in yaml.safe_load_all(k8s_yaml):
        if k8s_obj is None:
            # empty document, e.g. a trailing '---'
            continue
        kind = k8s_obj.get('kind')
        if not kind:
            sys.exit("ERROR - Unknown Kubernetes object '{}' found in yaml file - kernel launch terminating!".
                     format(k8s_obj))
        if kind == 'Pod':
            core_api.create_namespaced_pod(body=k8s_obj, namespace=kernel_namespace)
        elif kind == 'Secret':
            core_api.create_namespaced_secret(body=k8s_obj, namespace=kernel_namespace)
        elif kind == 'PersistentVolumeClaim':
            core_api.create_namespaced_persistent_volume_claim(body=k8s_obj, namespace=kernel_namespace)
        elif kind == 'PersistentVolume':
            core_api.create_persistent_volume(body=k8s_obj)
        else:
            sys.exit("ERROR - Unhandled Kubernetes object kind '{}' found in yaml file - kernel launch terminating!".
                     format(kind))
def __init__(self):
    """Load Kubernetes configuration appropriate for where the process runs.

    In-cluster configuration is used when ``is_running_in_k8s()`` is truthy,
    the kubeconfig file otherwise.
    """
    if not is_running_in_k8s():
        config.load_kube_config()
        return
    config.load_incluster_config()
def main():
    """Run the App controller: mirror App custom resources into CronJobs.

    Watches the cluster-wide custom objects identified by
    ``DOMAIN``/``VERSION``/``PLURAL`` forever.  ADDED/MODIFIED events create
    the CronJob for the App (an already existing one is tolerated); DELETED
    events remove the CronJobs selected by ``app.any_versions()``.  A failure
    while handling one event is logged and does not stop the loop.
    """
    config.load_incluster_config()
    crds = client.CustomObjectsApi()
    v1 = client.CoreV1Api()
    batch = client.BatchV2alpha1Api()

    def create_meta(app):
        # Strip a trailing "/v1" *suffix*.  str.rstrip("/v1") would instead
        # strip any trailing run of the characters '/', 'v' and '1'
        # (e.g. "foo.dev/v1" -> "foo.de").
        api_version = app._apiversion
        if api_version.endswith("/v1"):
            api_version = api_version[:-len("/v1")]
        controller_ref = {
            "apiVersion": api_version,
            "blockOwnerDeletion": True,
            "kind": app._kind,
            "name": app.crd_name(),
            "uid": app._metadata["uid"],
        }
        job = batch.create_namespaced_cron_job(
            namespace="default", body=app.cronjob([controller_ref]))
        logging.warning("Created CronJob for App: %s", job.metadata.name)
        logging.warning("Owner's reference: %s", json.dumps(controller_ref))

    def update_meta(app):
        try:
            create_meta(app)
        except ApiException as e:
            # CONFLICT means the CronJob already exists; anything else is a
            # real error.  Bare raise keeps the original traceback.
            if e.status != httplib.CONFLICT:
                raise
        # TODO: tear down any versions that shouldn't exist.

    def delete_meta(selector):
        # Handle random namespace later...
        namespace = "default"
        for job in batch.list_namespaced_cron_job(
                namespace, label_selector=selector).items:
            batch.delete_namespaced_cron_job(
                job.metadata.name, namespace,
                body=client.V1DeleteOptions(
                    propagation_policy='Foreground',
                    grace_period_seconds=5))
            logging.warning("Deleted the CronJob for: %s", job.metadata.name)

    def process_meta(t, app, obj):
        if t == "DELETED":
            delete_meta(app.any_versions())
            logging.warning("Deleted CRD, check garbage collection")
        elif t in ["MODIFIED", "ADDED"]:
            update_meta(app)
        else:
            logging.error("Unrecognized type: %s", t)

    # hack, using default namespace, default service account to get a token for kubecfg to work
    token = v1.read_namespaced_service_account(
        namespace="default", name="default").secrets[0].name

    resource_version = ""
    while True:
        stream = watch.Watch().stream(crds.list_cluster_custom_object,
                                      DOMAIN, VERSION, PLURAL,
                                      resource_version=resource_version)
        for event in stream:
            try:
                t = event["type"]
                obj = event["object"]
                # single-argument call form works as statement (py2) and function (py3)
                print(obj)
                app = App(obj, token)
                logging.warning("Apps %s, %s" % (app.crd_name(), t))
                process_meta(t, app, obj)

                # Configure where to resume streaming.
                metadata = obj.get("metadata")
                if metadata:
                    resource_version = metadata["resourceVersion"]
            except Exception:
                # Keep the controller alive on per-event failures, but let
                # KeyboardInterrupt / SystemExit terminate it.
                logging.exception("Error handling event")
def _load_config(self):
    """Load kubernetes configuration.

    Uses the in-cluster configuration when ``self.in_cluster`` is truthy,
    otherwise the kubeconfig file with ``self.context`` as the context.
    """
    if not self.in_cluster:
        config.load_kube_config(context=self.context)
        return
    config.load_incluster_config()
async def k8s_register(app):
    """Discover the hsds pods through the k8s API and update this node's state.

    Every pod labelled ``app=hsds`` that has an IP is probed on both the SN
    and DN port.  Pod IPs are sorted, and a pod's position in that order is
    its node number.  If the response for this node (matched by ``app["id"]``)
    shows a changed node number or node count, ``app`` is updated; a DN node
    with dirty caches is put into ``WAITING`` instead of being renumbered
    right away.

    When every probed endpoint reports the expected node number and count,
    ``app["node_state"]`` becomes ``READY`` and the url maps are stored;
    otherwise a ``READY`` node is moved to ``SCALING``.
    """
    log.info("k8s_register")
    # TBD - find more elegant way to avoid this warning
    import urllib3
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    # Load the in-cluster config, then install a copy with SSL verification
    # turned off as the default for all clients created afterwards.
    k8s_config.load_incluster_config()
    c = k8s_client.Configuration()
    c.verify_ssl = False
    k8s_client.Configuration.set_default(c)
    v1 = k8s_client.CoreV1Api()

    # TBD - use the async version
    ret = v1.list_pod_for_all_namespaces(watch=False)
    pod_ips = []
    sn_urls = {}
    dn_urls = {}
    for i in ret.items:
        pod_ip = i.status.pod_ip
        if not pod_ip:
            continue
        labels = i.metadata.labels
        if labels and "app" in labels and labels["app"] == "hsds":
            log.info(f"hsds pod - ip: {pod_ip}")
            pod_ips.append(pod_ip)

    if not pod_ips:
        log.error("Expected to find at least one hsds pod")
        return

    pod_ips.sort()  # for assigning node numbers
    node_count = len(pod_ips)
    ready_count = 0
    this_node_id = app["id"]
    sn_port = config.get("sn_port")
    dn_port = config.get("dn_port")

    for node_number in range(node_count):
        for port in (sn_port, dn_port):
            # send an info request to the node
            pod_ip = pod_ips[node_number]
            url = f"http://{pod_ip}:{port}"
            if port == sn_port:
                sn_urls[node_number] = url
            else:
                dn_urls[node_number] = url

            info_rsp = await get_info(app, url)
            if not info_rsp:
                # timeout or other failure
                continue
            if "node" not in info_rsp:
                log.error("expected to find node key in info resp")
                continue

            node_rsp = info_rsp["node"]
            log.debug(f"got info resp: {node_rsp}")

            # Validate the response shape.  The skip must apply to this
            # endpoint (the port loop): a `continue` placed inside the
            # key-checking loop would only advance that inner loop and the
            # lookups below would raise KeyError.
            missing_keys = [
                key for key in ("type", "id", "node_number", "node_count")
                if key not in node_rsp
            ]
            if missing_keys:
                for key in missing_keys:
                    log.error(
                        f"unexpected node type in node state, expected to find key: {key}"
                    )
                continue

            if node_rsp["type"] not in ("sn", "dn"):
                log.error(
                    f"expected node_type to be sn or dn, type is {node_rsp['type']}"
                )
                continue

            node_id = node_rsp["id"]
            if node_id == this_node_id:
                # set node_number and node_count
                log.debug("got info_rsp for this node")
                if app["node_number"] != node_number:
                    old_number = app["node_number"]
                    log.info(
                        f"node_number has changed - old value was {old_number} new number is {node_number}"
                    )
                    if app["node_type"] == "dn":
                        meta_cache = app["meta_cache"]
                        chunk_cache = app["chunk_cache"]
                        if meta_cache.dirtyCount > 0 or chunk_cache.dirtyCount > 0:
                            # set the node state to waiting till the chunk cache have been flushed
                            if app["node_state"] == "READY":
                                log.info(
                                    "setting node_state to waiting while cache is flushing"
                                )
                                app["node_state"] = "WAITING"
                        else:
                            meta_cache.clearCache()
                            chunk_cache.clearCache()
                            log.info(
                                f"node number was: {old_number} setting to: {node_number}"
                            )
                            app["node_number"] = node_number
                            app['register_time'] = time.time()
                    else:
                        # SN nodes can update node_number immediately
                        log.info(
                            f"node number was: {old_number} setting to: {node_number}"
                        )
                        app["node_number"] = node_number
                        app['register_time'] = time.time()
                if app["node_count"] != node_count:
                    old_count = app["node_count"]
                    log.info(
                        f"node count was: {old_count} setting to: {node_count}"
                    )
                    app["node_count"] = node_count

            if node_number == node_rsp["node_number"] and node_count == node_rsp["node_count"]:
                ready_count += 1
                log.debug(f"incremented ready_count to {ready_count}")
            else:
                log.info(f"differing node_number/node_count for url: {url}")
                log.info(
                    f"expected node_number: {node_number} actual: {node_rsp['node_number']}"
                )
                log.info(
                    f"expected node_count: {node_count} actual: {node_rsp['node_count']}"
                )

    # Two endpoints (sn + dn) are probed per pod, hence node_count * 2.
    if ready_count == node_count * 2:
        if app["node_state"] != "READY":
            log.info("setting node state to READY")
            app["node_state"] = "READY"
        app["node_count"] = node_count
        app["sn_urls"] = sn_urls
        app["dn_urls"] = dn_urls
    else:
        log.info(
            f"not all pods ready - ready_count: {ready_count}/{node_count*2}")
        if app["node_state"] == "READY":
            log.info("setting node state to SCALING")
            app["node_state"] = "SCALING"
def load_kube_config():
    """Load Kubernetes configuration, preferring in-cluster config on EKS.

    The in-cluster loader is used when ``AWS_WEB_IDENTITY_TOKEN_FILE`` is set
    and its value contains ``eks.amazonaws.com``; in every other case the
    kubeconfig loader is used.
    """
    token_file = os.environ.get("AWS_WEB_IDENTITY_TOKEN_FILE")
    on_eks = token_file is not None and "eks.amazonaws.com" in token_file
    if not on_eks:
        k8_config.load_kube_config()
        return
    k8_config.load_incluster_config()
def create_config(in_cluster=False):
    """Load Kubernetes configuration and return whatever the loader returns.

    With a truthy ``in_cluster`` the in-cluster loader is used (and that is
    logged); otherwise the kubeconfig loader is used.
    """
    if not in_cluster:
        return config.load_kube_config()
    logging.info("Loading in-cluster config")
    return config.load_incluster_config()
type='Approved') body.status.conditions = [approval_condition] try: certs_api.replace_certificate_signing_request_approval( csr_name, body) except Exception as e: print( "Hit %s when signing cert %s. This will be retried" % (e, csr_name)) break continue if __name__ == "__main__": if 'KUBERNETES_PORT' in os.environ: config.load_incluster_config() else: config.load_kube_config() configuration = client.Configuration() configuration.assert_hostname = False api_client = client.api_client.ApiClient(configuration=configuration) v1 = client.CoreV1Api() certs_api = client.CertificatesV1beta1Api() try: k8sfile = '/var/run/secrets/kubernetes.io/serviceaccount/namespace' namespace = open(k8sfile, 'r').read() if os.path.exists( k8sfile) else os.environ.get('NAMESPACE', 'default') config_map_name = os.environ.get('CONFIG_MAP', 'autorules') config_map = v1.read_namespaced_config_map(namespace=namespace, name=config_map_name) config_map_data = config_map.to_dict().get('data', {})
def _run_scale(self):
    """Scale the worker machine sets to ``self.scale`` workers and wait for it.

    Steps, in order:
      1. Build a Kubernetes client (in-cluster, explicit kubeconfig, or the
         default kubeconfig) and wrap it in a ``DynamicClient``.
      2. Count worker / workload / master / infra nodes and detect the
         platform from the ``Infrastructure`` resource.
      3. Return early if the worker count already equals the requested scale.
      4. Spread the requested worker count evenly over the worker machine
         sets (the first ``extra`` sets get one more) and patch them, or call
         ``self._rosa_scale`` when ``self.is_rosa`` is set.
      5. Poll until each machine set reports the expected ready replicas and
         no worker node is marked unschedulable.

    Returns:
        tuple: ``(init_workers, worker_count, master_count, infra_count,
        workload_count, platform, action)`` where ``action`` is one of
        ``"scale_nochange"``, ``"scale_up"`` or ``"scale_down"``.

    Exits the process with status 1 when the client cannot be configured,
    node/machine set information cannot be fetched, or no worker machine set
    exists to scale.
    """
    if self.incluster == "true":
        config.load_incluster_config()
        k8s_config = client.Configuration()
        k8s_client = client.api_client.ApiClient(configuration=k8s_config)
    elif self.kubeconfig:
        k8s_client = config.new_client_from_config(self.kubeconfig)
    else:
        k8s_client = config.new_client_from_config()

    try:
        dyn_client = DynamicClient(k8s_client)
    except Exception as err:
        logger.error("Could not configure client, failing the run")
        logger.error(err)
        exit(1)

    if self.is_rosa:
        self.rosa_machinepools = self._rosa_getmachinepools()
        logger.debug("ROSA MachinePools: %s" % self.rosa_machinepools)

    try:
        nodes = dyn_client.resources.get(api_version="v1", kind="Node")
        machinesets = dyn_client.resources.get(kind="MachineSet")
    except Exception as err:
        logger.error("Could not get information on nodes/machinesets, failing the run")
        logger.error(err)
        exit(1)

    def count_nodes(selector):
        # Number of nodes currently matching the given label selector.
        return len(nodes.get(label_selector=selector).attributes.items)

    worker_selector = "node-role.kubernetes.io/worker,!node-role.kubernetes.io/master"

    worker_count = count_nodes(worker_selector)
    workload_count = count_nodes("node-role.kubernetes.io/workload")
    master_count = count_nodes("node-role.kubernetes.io/master")
    infra_count = count_nodes("node-role.kubernetes.io/infra")

    init_workers = worker_count

    infra = dyn_client.resources.get(kind="Infrastructure")

    try:
        platform = infra.get().attributes.items[0].spec.platformSpec.type
    except Exception as err:
        logger.error("Platform type not obtained through spec.platformSpec.type")
        logger.error("Trying to query status.platform")
        logger.error(err)
        try:
            platform = infra.get().attributes.items[0].status.platform
        except Exception as err:
            logger.error("Could not identify platform. Marking as Unknown")
            logger.error(err)
            platform = "Unknown"

    # Machine set name list
    machineset_all_list = machinesets.get(namespace="openshift-machine-api").attributes.items
    machineset_worker_list = [
        machineset
        for machineset in machineset_all_list
        if machineset.spec.template.metadata.labels["machine.openshift.io/cluster-api-machine-role"]
        == "worker"
    ]

    # Normalise once: every comparison below treats the scale as an integer.
    target_workers = int(self.scale)

    # If we are already at the requested scale exit
    # Determine if we are scaling down or up
    action = "scale_nochange"
    if int(worker_count) == target_workers:
        logger.info("Already at requested worker count")
        return init_workers, worker_count, master_count, infra_count, workload_count, platform, action
    elif int(worker_count) > target_workers:
        action = "scale_down"
    else:
        action = "scale_up"

    logger.info("Current Worker count %s" % (worker_count))

    machineset_total = len(machineset_worker_list)
    if machineset_total == 0:
        # Without this guard the split below fails with a bare ZeroDivisionError.
        logger.error("No worker machine sets found, failing the run")
        exit(1)

    # add_per: workers per machine set; extra: leftover workers, handed out
    # one each to the first `extra` machine sets.
    add_per, extra = divmod(target_workers, machineset_total)

    logger.info("Number of machine sets %s" % (machineset_total))

    machineset_workers = [machineset.metadata.name for machineset in machineset_worker_list]
    machine_spread = [add_per + 1 if index < extra else add_per for index in range(machineset_total)]

    logger.info("Machine sets: %s" % (machineset_workers))
    logger.info("New worker per machine set %s" % (machine_spread))

    logger.info("Starting Patching of machine sets")
    # Patch the machinesets
    if not self.is_rosa:
        for name, replicas in zip(machineset_workers, machine_spread):
            body = {"spec": {"replicas": replicas}}
            machinesets.patch(
                body=body,
                namespace="openshift-machine-api",
                name=name,
                content_type="application/merge-patch+json",
            )
    else:
        self._rosa_scale("Default")

    logger.info("Waiting for worker machine set to show the appropiate ready replicas")
    for machineset, expected_replicas in zip(machineset_worker_list, machine_spread):
        new_machine_sets = machinesets.get(
            namespace="openshift-machine-api", name=machineset.metadata.name
        )
        while new_machine_sets.status.readyReplicas != expected_replicas:
            # A machine set scaled to zero reports readyReplicas as None, not 0.
            if new_machine_sets.status.readyReplicas is None and expected_replicas == 0:
                break
            new_machine_sets = machinesets.get(
                namespace="openshift-machine-api", name=machineset.metadata.name
            )
            logger.debug(
                "Number of ready replicas for %s: %s. Waiting %d seconds for next check..."
                % (
                    new_machine_sets.metadata.name,
                    str(new_machine_sets.status.readyReplicas),
                    self.poll_interval,
                )
            )
            time.sleep(self.poll_interval)
    logger.info("Patching of machine sets complete")

    logger.info("Waiting for all workers to be schedulable")
    # Ensure all workers are not listed as unschedulable
    # If we don't do this it will auto-complete a scale-down even though the workers
    # have not been eliminated yet
    new_worker_list = nodes.get(label_selector="node-role.kubernetes.io/worker").attributes.items
    for i in range(len(new_worker_list)):
        # The list is refreshed inside the loop and may shrink, hence the bound check.
        while i < len(new_worker_list) and new_worker_list[i].spec.unschedulable:
            new_worker_list = nodes.get(
                label_selector="node-role.kubernetes.io/worker"
            ).attributes.items
            logger.debug(
                "Number of ready workers: %d. Waiting %d seconds for next check..."
                % (len(new_worker_list), self.poll_interval)
            )
            time.sleep(self.poll_interval)
    logger.info("All workers schedulable")

    worker_count = count_nodes(worker_selector)
    workload_count = count_nodes("node-role.kubernetes.io/workload")
    master_count = count_nodes("node-role.kubernetes.io/master")
    infra_count = count_nodes("node-role.kubernetes.io/infra")

    return init_workers, worker_count, master_count, infra_count, workload_count, platform, action
def main(argv=None):
    """Submit a Kubeflow TFJob for training and wait for it to finish.

    Parses the launcher arguments, writes the tensorboard UI metadata file,
    renders the TFJob template, creates the job through the in-cluster
    Kubernetes configuration, waits for it, logs any failed replica type,
    deletes the job and finally writes ``--job-dir`` to ``/output.txt``.

    Launcher-only arguments (cluster, zone, working dir, workers, pss,
    kfversion, tfjob namespace and timeout) are stripped; every remaining
    non-None argument is forwarded to the training module as ``--key=value``.

    Args:
        argv: optional list of command-line arguments. When None, argparse
            falls back to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description='ML Trainer')
    parser.add_argument(
        '--working-dir',
        help='Training job working directory.',
        required=True)
    parser.add_argument(
        '--train-files-dir',
        help='Path to training data',
        required=True)
    parser.add_argument(
        '--train-files-prefix',
        help='The prefix of the training input files.',
        required=True)
    parser.add_argument(
        '--tf-transform-dir',
        help='Tf-transform directory with model from preprocessing step',
        required=True)
    parser.add_argument(
        '--output-dir',
        help="""\
        Directory under which which the serving model (under /serving_model_dir)\
        and the tf-mode-analysis model (under /eval_model_dir) will be written\
        """,
        required=True)
    parser.add_argument(
        '--eval-files-dir',
        help='Path to evaluation data',
        required=True
    )
    parser.add_argument(
        '--eval-files-prefix',
        help='The prefix of the eval input files.',
        required=True)

    # Training arguments
    parser.add_argument(
        '--job-dir',
        help='GCS location to write checkpoints and export models',
        required=True)

    # Argument to turn on all logging
    parser.add_argument(
        '--verbosity',
        choices=['DEBUG', 'ERROR', 'FATAL', 'INFO', 'WARN'],
        default='INFO',
    )

    # Experiment arguments
    parser.add_argument(
        '--train-steps',
        help='Count of steps to run the training job for',
        required=True,
        type=int)
    parser.add_argument(
        '--eval-steps',
        help='Number of steps to run evalution for at each checkpoint',
        default=100,
        type=int)
    parser.add_argument('--workers', type=int, default=0)
    parser.add_argument('--pss', type=int, default=0)
    parser.add_argument('--cluster', type=str,
                        help='GKE cluster set up for kubeflow. If set, zone must be provided. ' +
                             'If not set, assuming this runs in a GKE container and current ' +
                             'cluster is used.')
    parser.add_argument('--zone', type=str, help='zone of the kubeflow cluster.')
    parser.add_argument('--kfversion', type=str,
                        default='v1beta1',
                        help='The version of the deployed kubeflow. ' +
                             'If not set, the default version is v1beta1')
    parser.add_argument('--tfjob-ns', type=str,
                        default='kubeflow',
                        help='The namespace where the tfjob is submitted' +
                             'If not set, the namespace is kubeflow')
    parser.add_argument('--tfjob-timeout-minutes', type=int,
                        default=20,
                        help='Time in minutes to wait for the TFJob to complete')

    # Honour the argv parameter; parse_args(None) still reads sys.argv[1:].
    args = parser.parse_args(argv)

    logging.getLogger().setLevel(logging.INFO)
    args_dict = vars(args)

    # Always strip the launcher-only cluster/zone arguments so that a lone
    # --cluster or --zone is never forwarded to the training module.
    cluster = args_dict.pop('cluster')
    zone = args_dict.pop('zone')
    if not (cluster and zone):
        # Get cluster name and zone from metadata
        metadata_server = "http://metadata/computeMetadata/v1/instance/"
        metadata_flavor = {'Metadata-Flavor': 'Google'}
        cluster = requests.get(metadata_server + "attributes/cluster-name",
                               headers=metadata_flavor).text
        zone = requests.get(metadata_server + "zone",
                            headers=metadata_flavor).text.split('/')[-1]

    # NOTE: cluster/zone are only consumed by the disabled credentials call below.
    # logging.info('Getting credentials for GKE cluster %s.' % cluster)
    # subprocess.call(['gcloud', 'container', 'clusters', 'get-credentials', cluster,
    #                  '--zone', zone])

    # Create metadata.json file for visualization.
    tb_dir = args_dict.pop('working_dir')  # don't pass this arg to the training module
    metadata = {
        'outputs': [{
            'type': 'tensorboard',
            'source': tb_dir,
        }]
    }
    with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as f:
        json.dump(metadata, f)

    workers = args_dict.pop('workers')
    pss = args_dict.pop('pss')
    kf_version = args_dict.pop('kfversion')
    tfjob_ns = args_dict.pop('tfjob_ns')
    tfjob_timeout_minutes = args_dict.pop('tfjob_timeout_minutes')
    args_list = ['--%s=%s' % (k.replace('_', '-'), v)
                 for k, v in six.iteritems(args_dict) if v is not None]
    logging.info('Generating training template.')
    template_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 'train.template.yaml')
    content_yaml = _generate_train_yaml(template_file, tfjob_ns, workers, pss, args_list)

    logging.info('Start training.')
    # Set up handler for k8s clients
    config.load_incluster_config()
    api_client = k8s_client.ApiClient()
    create_response = tf_job_client.create_tf_job(api_client, content_yaml, version=kf_version)
    job_name = create_response['metadata']['name']

    wait_response = tf_job_client.wait_for_job(
        api_client, tfjob_ns, job_name, kf_version,
        timeout=datetime.timedelta(minutes=tfjob_timeout_minutes))
    succ = True

    # TODO: update this failure checking after tf-operator has the condition checking function.
    replica_statuses = wait_response['status']['replicaStatuses']
    failure_messages = (
        ('Worker', 'Training failed since workers failed.'),
        ('PS', 'Training failed since PSs failed.'),
        ('Master', 'Training failed since Master failed.'),
    )
    for replica_type, message in failure_messages:
        if replica_type in replica_statuses and 'Failed' in replica_statuses[replica_type]:
            logging.error(message)
            succ = False

    # #TODO: remove this after kubeflow fixes the wait_for_job issue
    # # because the wait_for_job returns when the worker finishes but the master might not be complete yet.
    # if 'Master' in wait_response['status']['replicaStatuses'] and 'active' in wait_response['status']['replicaStatuses']['Master']:
    #   master_active = True
    #   while master_active:
    #     # Wait for master to finish
    #     time.sleep(2)
    #     wait_response = tf_job_client.wait_for_job(api_client, tfjob_ns, job_name, kf_version,
    #                                                timeout=datetime.timedelta(minutes=tfjob_timeout_minutes))
    #     if 'active' not in wait_response['status']['tfReplicaStatuses']['Master']:
    #       master_active = False

    if succ:
        logging.info('Training success.')

    tf_job_client.delete_tf_job(api_client, tfjob_ns, job_name, version=kf_version)
    with open('/output.txt', 'w') as f:
        f.write(args.job_dir)
from kubernetes.client import V1Container from kubernetes.client import V1ResourceRequirements from kfserving import KFServingClient from kfserving import constants from kfserving import utils from kfserving import V1alpha2EndpointSpec from kfserving import V1alpha2PredictorSpec from kfserving import V1alpha2InferenceServiceSpec from kfserving import V1alpha2InferenceService from kfserving import V1alpha2CustomSpec from kubernetes import client as k8s_client from kubernetes import config as k8s_config from kubernetes.client.rest import ApiException k8s_config.load_incluster_config() def main(): api_version = constants.KFSERVING_GROUP + '/' + constants.KFSERVING_VERSION default_endpoint_spec = V1alpha2EndpointSpec( predictor=V1alpha2PredictorSpec(custom=V1alpha2CustomSpec( container=V1Container( name="kfserving-container", image=FLAGS.image, env=[{ "name": "STORAGE_URI", "value": "%s" % FLAGS.storage_uri }], resources=V1ResourceRequirements(
def init():
    """Discover the MariaDB deployment under test and prepare the test database.

    Sets these module globals: ``v1`` (CoreV1Api client), ``serverPods``,
    ``maxScalePods``, ``umPods``, ``pmPods`` (pod lists selected by label),
    ``system`` ("columnstore" or "server"), ``topology`` and ``MARIADB_PORT``.

    After the topology is known, waits for the database to become active via
    ``helper_functions`` and then drops and recreates ``DB_NAME``.

    Exits the process via ``sys.exit(666)`` when no valid topology is found or
    the test database cannot be prepared.
    """
    global v1, serverPods, maxScalePods, umPods, pmPods
    global topology, system, MARIADB_PORT

    # Set passed environment variables as global variables
    for variable in ["MARIADB_CLUSTER", "MARIADB_HOST", "MARIADB_USER", "MARIADB_PASSWORD"]:
        checkAndSetEnvironmentVariablesAsGlobalVariables(variable)

    # Load the kubectl config and initialize the API
    config.load_incluster_config()
    v1 = CoreV1Api()

    # Get k8s topology information about the cluster to test
    serverPods = v1.list_namespaced_pod(
        NAMESPACE, watch=False,
        label_selector="mariadb=%s,server.mariadb" % (MARIADB_CLUSTER, ))
    maxScalePods = v1.list_namespaced_pod(
        NAMESPACE, watch=False,
        label_selector="mariadb=%s,maxscale.mariadb" % (MARIADB_CLUSTER, ))
    umPods = v1.list_namespaced_pod(
        NAMESPACE, watch=False,
        label_selector="mariadb=%s,um.mariadb" % (MARIADB_CLUSTER, ))
    pmPods = v1.list_namespaced_pod(
        NAMESPACE, watch=False,
        label_selector="mariadb=%s,pm.mariadb" % (MARIADB_CLUSTER, ))

    if len(umPods.items) > 0 and len(pmPods.items) > 0:
        system = "columnstore"
        MARIADB_PORT = 3306
        if umPods.items[0].metadata.name == "%s-mdb-cs-single-0" % (MARIADB_CLUSTER, ):
            topology = "columnstore-standalone"
        else:
            topology = "columnstore"
    elif len(serverPods.items) > 0 and len(maxScalePods.items) > 0:
        system = "server"
        MARIADB_PORT = 4006
        if serverPods.items[0].metadata.name == "%s-mdb-galera-0" % (MARIADB_CLUSTER, ):
            topology = "galera"
        else:
            topology = "masterslave"
    elif len(serverPods.items) == 1 and len(maxScalePods.items) == 0:
        system = "server"
        topology = "standalone"
        MARIADB_PORT = 3306
    else:
        print(
            "error: no valid topology could be found in namespace %s.\nserver pods found: %d\nmaxscale pods found: %d\ncolumnstore um pods found: %d\ncolumnstore pm pods found: %d"
            % (NAMESPACE, len(serverPods.items), len(maxScalePods.items),
               len(umPods.items), len(pmPods.items)))
        sys.exit(666)

    # Wait for the database to be active
    if system == "columnstore":
        helper_functions.waitForColumnStoreActive(umPods.items[0], v1, MARIADB_CLUSTER,
                                                  COLUMNSTORE_TIMEOUT)
    else:
        helper_functions.waitForServerActive(serverPods.items[0], v1, MARIADB_USER,
                                             MARIADB_PASSWORD, MARIADB_HOST, MARIADB_PORT,
                                             SERVER_TIMEOUT)
    print("")

    # Get a SQL connection, and prepare the test database.
    # Pre-initialise both handles so the cleanup below never touches an
    # unbound name when connect() or cursor() fails.
    conn = None
    cursor = None
    error = False
    try:
        conn = mariadb.connect(user=MARIADB_USER, password=MARIADB_PASSWORD,
                               host=MARIADB_HOST, port=MARIADB_PORT)
        cursor = conn.cursor()
        cursor.execute("DROP DATABASE IF EXISTS %s" % (DB_NAME, ))
        cursor.execute("CREATE DATABASE IF NOT EXISTS %s" % (DB_NAME, ))
    except Exception as e:
        print("error: could not prepare test database '%s'\n%s" % (DB_NAME, e))
        error = True
    finally:
        # Best-effort cleanup: a failure while closing must not mask the result.
        try:
            if cursor:
                cursor.close()
            if conn:
                conn.close()
        except Exception:
            pass

    if error:
        sys.exit(666)
def load_config():
    """Load the Kubernetes configuration that matches the site's mode.

    When the frappe site config has a truthy ``developer_mode`` the local
    kubeconfig is loaded; otherwise the in-cluster configuration is used.
    """
    developer_mode = frappe.get_conf().get("developer_mode")
    if not developer_mode:
        config.load_incluster_config()
        return
    config.load_kube_config()
def main(argv=None):
    """Create a Kubeflow StudyJob, wait for it, and record the best trial.

    Renders the StudyJob template from the command-line arguments, creates
    the job using the in-cluster Kubernetes configuration and waits until it
    reaches the "Completed" or "Failed" condition. On completion the best
    trial's parameter set is written as JSON to ``--outputfile``. The
    StudyJob is deleted afterwards when ``--deleteAfterDone`` is true.

    Args:
        argv: optional list of command-line arguments. When None, argparse
            falls back to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description='Kubeflow StudyJob launcher')
    parser.add_argument('--name', type=str,
                        help='StudyJob name.')
    parser.add_argument('--namespace', type=str,
                        default='kubeflow',
                        help='StudyJob namespace.')
    parser.add_argument(
        '--optimizationtype', type=str,
        default='minimize',
        help='Direction of optimization. minimize or maximize.')
    parser.add_argument('--objectivevaluename', type=str,
                        help='Objective value name which trainer optimizes.')
    parser.add_argument('--optimizationgoal', type=float,
                        help='Stop studying once objectivevaluename value ' +
                             'exceeds optimizationgoal')
    parser.add_argument('--requestcount', type=int,
                        default=1,
                        help='The times asking request to suggestion service.')
    parser.add_argument('--metricsnames', type=strToList,
                        help='StudyJob metrics name list.')
    parser.add_argument('--parameterconfigs', type=yamlOrJsonStr,
                        default={},
                        help='StudyJob parameterconfigs.')
    parser.add_argument('--nasConfig', type=yamlOrJsonStr,
                        default={},
                        help='StudyJob nasConfig.')
    parser.add_argument('--workertemplatepath', type=str,
                        default="",
                        help='StudyJob worker spec.')
    parser.add_argument('--mcollectortemplatepath', type=str,
                        default="",
                        help='StudyJob worker spec.')
    parser.add_argument('--suggestionspec', type=yamlOrJsonStr,
                        default={},
                        help='StudyJob suggestion spec.')
    parser.add_argument(
        '--outputfile', type=str,
        default='/output.txt',
        help='The file which stores the best trial of the studyJob.')
    parser.add_argument(
        '--deleteAfterDone', type=strtobool,
        default=True,
        help='When studyjob done, delete the studyjob automatically if it is True.')
    parser.add_argument(
        '--studyjobtimeoutminutes', type=int,
        default=10,
        help='Time in minutes to wait for the StudyJob to complete')

    # Honour the argv parameter; parse_args(None) still reads sys.argv[1:].
    args = parser.parse_args(argv)

    logging.getLogger().setLevel(logging.INFO)

    logging.info('Generating studyjob template.')
    template_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 'hp.template.yaml')
    content_yaml = _generate_studyjob_yaml(
        template_file, args.name, args.namespace, args.optimizationtype,
        args.objectivevaluename, args.optimizationgoal, args.requestcount,
        args.metricsnames, args.parameterconfigs, args.nasConfig,
        args.workertemplatepath, args.mcollectortemplatepath, args.suggestionspec)

    config.load_incluster_config()
    api_client = k8s_client.ApiClient()
    create_response = study_job_client.create_study_job(api_client, content_yaml)
    job_name = create_response['metadata']['name']
    job_namespace = create_response['metadata']['namespace']

    expected_condition = ["Completed", "Failed"]
    wait_response = study_job_client.wait_for_condition(
        api_client, job_namespace, job_name, expected_condition,
        timeout=datetime.timedelta(minutes=args.studyjobtimeoutminutes))

    succ = False
    if wait_response.get("status", {}).get("condition") == "Completed":
        succ = True
        trial = get_best_trial(wait_response["status"]["bestTrialId"])

        # A bare filename has an empty dirname, and os.makedirs('') raises,
        # so only create the directory when there is one to create.
        output_dir = os.path.dirname(args.outputfile)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)

        ps_dict = {ps.name: ps.value for ps in trial.parameter_set}
        with open(args.outputfile, 'w') as f:
            f.write(json.dumps(ps_dict))

    if succ:
        logging.info('Study success.')

    if args.deleteAfterDone:
        study_job_client.delete_study_job(api_client, job_name, job_namespace)