def check_docker_commit(task, docker_id):  # receiver and id are injected when testing from the web page
    """Celery task: poll the docker-commit pod for up to 30 minutes.

    When the pod named ``docker-commit-<user>-<id>`` reaches *Succeeded*,
    record ``target_image`` as the docker record's ``last_image``.  Any other
    non-Running state triggers an admin alert and stops polling.

    :param task: celery task handle (unused in the body, bound by the framework)
    :param docker_id: primary key of the ``Docker`` row to check
    """
    with session_scope(nullpool=True) as dbsession:
        try:
            docker = dbsession.query(Docker).filter_by(
                id=int(docker_id)).first()
            pod_name = "docker-commit-%s-%s" % (docker.created_by.username,
                                                str(docker.id))
            namespace = conf.get('NOTEBOOK_NAMESPACE')
            k8s_client = K8s(
                conf.get('CLUSTERS').get(
                    conf.get('ENVIRONMENT')).get('KUBECONFIG'))
            begin_time = datetime.datetime.now()
            now_time = datetime.datetime.now()
            # Poll for at most 30 minutes of commit+push time.
            while ((now_time - begin_time).seconds < 1800):
                # BUGFIX: was time.sleep(12000) — a single sleep longer than the
                # whole 30-minute window, so the pod status was effectively
                # checked only once after >3 hours. Poll every 60s instead.
                time.sleep(60)
                # BUGFIX: now_time was never refreshed, so the loop condition
                # could never expire and the loop only ended via break.
                now_time = datetime.datetime.now()
                commit_pods = k8s_client.get_pods(namespace=namespace,
                                                  pod_name=pod_name)
                if commit_pods:
                    commit_pod = commit_pods[0]
                    if commit_pod['status'] == 'Succeeded':
                        # commit+push finished: the target image becomes the
                        # latest known-good image for this docker record
                        docker.last_image = docker.target_image
                        dbsession.commit()
                        break
                    # any other abnormal state: alert the admins directly
                    if commit_pod['status'] != 'Running':
                        push_message(
                            conf.get('ADMIN_USER').split(','),
                            'commit pod %s not running' % commit_pod['name'])
                        break
                else:
                    # pod disappeared (or never started): nothing left to watch
                    break
        except Exception as e:
            print(e)
def listen(self):
    """API endpoint: return the JSON status of the ``venus-<run_id>`` pod.

    Reads ``run_id`` from the request body, locates the corresponding pod in
    the pipeline namespace and returns its description (HTTP 200), or an
    error response (HTTP 400) when the parameter is missing or no pod exists.
    """
    request_data = request.json
    run_id = request_data.get('run_id', '').replace('_', '-')
    if not run_id:
        response = make_response("输入参数不齐全")
        response.status_code = 400
        return response
    from myapp.utils.py.py_k8s import K8s
    k8s = K8s()
    namespace = conf.get('PIPELINE_NAMESPACE')
    # pod names must be lowercase, <=63 chars and not end with '-'
    pod_name = "venus-" + run_id.replace('_', '-')
    pod_name = pod_name.lower()[:60].strip('-')
    pod = k8s.get_pods(namespace=namespace, pod_name=pod_name)
    # print(pod)
    if pod:
        pod = pod[0]
        if isinstance(pod['start_time'], datetime.datetime):
            # BUGFIX: format string was "%Y-%d-%m %H:%M:%S" — month and day
            # were swapped, producing e.g. 2024-25-03 for March 25th.
            pod['start_time'] = pod['start_time'].strftime(
                "%Y-%m-%d %H:%M:%S")
        print(pod)
        response = make_response(json.dumps(pod))
        response.status_code = 200
        return response
    else:
        response = make_response('no pod')
        response.status_code = 400
        return response
def web_log(self, cluster_name, namespace, pod_name):
    """Render a page embedding the k8s dashboard log view for one pod.

    Builds the dashboard URL for the given cluster (falling back to the
    globally configured dashboard when the cluster is unknown), flashes the
    pod's current status if it exists, and renders either the in-cluster or
    the external link template.
    """
    from myapp.utils.py.py_k8s import K8s
    all_clusters = conf.get('CLUSTERS', {})
    if cluster_name in all_clusters:
        kubeconfig = all_clusters[cluster_name].get('KUBECONFIG', '')
        pod_url = all_clusters[cluster_name].get(
            'K8S_DASHBOARD_CLUSTER'
        ) + "#/log/%s/%s/pod?namespace=%s&container=%s" % (
            namespace, pod_name, namespace, pod_name)
    else:
        # unknown cluster: use the default kubeconfig and global dashboard
        kubeconfig = None
        pod_url = conf.get(
            'K8S_DASHBOARD_CLUSTER'
        ) + "#/log/%s/%s/pod?namespace=%s&container=%s" % (
            namespace, pod_name, namespace, pod_name)
    k8s = K8s(kubeconfig)
    pod = k8s.get_pods(namespace=namespace, pod_name=pod_name)
    if pod:
        pod = pod[0]
        flash('当前pod状态:%s' % pod['status'], category='warning')
    # parameters consumed by the iframe-embedding template
    data = {
        "url": pod_url,
        "target": 'div.kd-scroll-container',  # kd-logs-container :nth-of-type(0)
        "delay": 2000,
        "loading": True,
        "currentHeight": 128
    }
    # render the local template for the current environment, otherwise the
    # external-link template (cross-cluster dashboards cannot be iframed)
    if cluster_name == conf.get('ENVIRONMENT'):
        return self.render_template('link.html', data=data)
    else:
        return self.render_template('external_link.html', data=data)
def build_mq_consumer(self, service_pipeline):
    """Deploy the Kafka consumer workload for a service pipeline.

    Creates a configmap holding the pipeline's DAG json and a deployment
    running ``mq-pipeline/cube_kafka.py`` in the service-pipeline namespace.
    Image pull secrets combine the global HUBSECRET list with every hubsecret
    of repositories owned by the current user.
    """
    namespace = conf.get('SERVICE_PIPELINE_NAMESPACE')
    name = service_pipeline.name
    command = service_pipeline.command  # NOTE: currently unused — the deployment command is hard-coded below
    image_secrets = conf.get('HUBSECRET', [])
    user_hubsecrets = db.session.query(Repository.hubsecret).filter(
        Repository.created_by_fk == g.user.id).all()
    if user_hubsecrets:
        for hubsecret in user_hubsecrets:
            if hubsecret[0] not in image_secrets:
                image_secrets.append(hubsecret[0])
    from myapp.utils.py.py_k8s import K8s
    k8s_client = K8s(service_pipeline.project.cluster.get(
        'KUBECONFIG', ''))
    dag_json = service_pipeline.dag_json if service_pipeline.dag_json else '{}'
    # configmap consumed by the service at runtime (mounted as dag.json)
    config_data = {"dag.json": dag_json}
    k8s_client.create_configmap(namespace=namespace,
                                name=name,
                                data=config_data,
                                labels={'app': name})
    # assumes service_pipeline.env is a mutable mapping — TODO confirm
    env = service_pipeline.env
    if conf.get('SERVICE_PIPELINE_JAEGER', ''):
        # wire up tracing when a Jaeger endpoint is configured
        env['JAEGER_HOST'] = conf.get('SERVICE_PIPELINE_JAEGER', '')
        env['SERVICE_NAME'] = name
    k8s_client.create_deployment(
        namespace=namespace,
        name=name,
        replicas=service_pipeline.replicas,
        labels={
            "app": name,
            "username": service_pipeline.created_by.username
        },
        # command=['sh','-c',command] if command else None,
        command=['bash', '-c', "python mq-pipeline/cube_kafka.py"],
        args=None,
        volume_mount=service_pipeline.volume_mount,
        working_dir=service_pipeline.working_dir,
        node_selector=service_pipeline.get_node_selector(),
        resource_memory=service_pipeline.resource_memory,
        resource_cpu=service_pipeline.resource_cpu,
        resource_gpu=service_pipeline.resource_gpu
        if service_pipeline.resource_gpu else '',
        image_pull_policy=conf.get('IMAGE_PULL_POLICY', 'Always'),
        image_pull_secrets=image_secrets,
        image=service_pipeline.images,
        hostAliases=conf.get('HOSTALIASES', ''),
        env=env,
        privileged=False,
        accounts=None,
        username=service_pipeline.created_by.username,
        ports=None)
    pass
def delete_old_service(self, service_name, cluster):
    """Tear down every k8s artifact of a previously deployed service.

    Deletes the deployment, both the internal and the "-external" service,
    and the istio ingress, all in the configured service namespace.
    """
    from myapp.utils.py.py_k8s import K8s
    # the external service name is truncated exactly as it was at creation
    external_name = (service_name + "-external").lower()[:60].strip('-')
    k8s_client = K8s(cluster.get('KUBECONFIG', ''))
    namespace = conf.get('SERVICE_NAMESPACE')
    k8s_client.delete_deployment(namespace=namespace, name=service_name)
    for svc in (service_name, external_name):
        k8s_client.delete_service(namespace=namespace, name=svc)
    k8s_client.delete_istio_ingress(namespace=namespace, name=service_name)
def delete_pod(self, docker_id):
    """Delete the debug pod of a Docker build record, then go back to the list."""
    from myapp.utils.py.py_k8s import K8s
    docker = db.session.query(Docker).filter_by(id=docker_id).first()
    cluster_config = conf.get('CLUSTERS').get(conf.get('ENVIRONMENT'))
    k8s_client = K8s(cluster_config.get('KUBECONFIG', ''))
    namespace = conf.get('NOTEBOOK_NAMESPACE')
    # pod name follows the docker-<user>-<id> convention used at creation
    pod_name = "docker-%s-%s" % (docker.created_by.username, str(docker.id))
    k8s_client.delete_pods(namespace=namespace, pod_name=pod_name)
    flash('清理结束,可重新进行调试', 'success')
    return redirect("/docker_modelview/list/")
def clear(self, service_pipeline_id):
    """Delete the deployment of a service pipeline and return to the list page."""
    from myapp.utils.py.py_k8s import K8s
    service_pipeline = db.session.query(Service_Pipeline).filter_by(
        id=service_pipeline_id).first()
    kubeconfig = service_pipeline.project.cluster.get('KUBECONFIG', '')
    k8s_client = K8s(kubeconfig)
    k8s_client.delete_deployment(
        namespace=conf.get('SERVICE_PIPELINE_NAMESPACE'),
        name=service_pipeline.name)
    flash('服务清理完成', category='warning')
    return redirect('/service_pipeline_modelview/list/')
def log_task(self, nni_id):
    """Jump to the web log of the NNI experiment pod when one is running."""
    from myapp.utils.py.py_k8s import K8s
    nni = db.session.query(NNI).filter_by(id=nni_id).first()
    namespace = conf.get('KATIB_NAMESPACE')
    k8s_client = K8s(nni.project.cluster.get('KUBECONFIG', ''))
    if k8s_client.get_pods(namespace=namespace, pod_name=nni.name):
        return redirect("/myapp/web/log/%s/%s/%s" %
                        (nni.project.cluster['NAME'], namespace, nni.name))
    flash("未检测到当前搜索正在运行的容器", category='success')
    return redirect('/nni_modelview/list/')
def clear_task(self, task_id):
    """Delete the run- and debug- pods of a pipeline task, plus any workflow
    and pods sharing their run-id label, then return to the pipeline page.

    The original body repeated the identical cleanup sequence twice (once for
    the ``run-`` pod, once for the ``debug-`` pod); the duplication is now
    factored into a local helper.
    """
    task = db.session.query(Task).filter_by(id=task_id).first()
    from myapp.utils.py.py_k8s import K8s
    k8s_client = K8s(task.pipeline.project.cluster.get('KUBECONFIG', ''))
    namespace = conf.get('PIPELINE_NAMESPACE')

    def _clear_pod(prefix):
        # Delete the pod named <prefix>-<pipeline>-<task> (truncated/normalized
        # the same way it was at creation) and any workflow/pods that share
        # its run-id label.
        pod_name = prefix + task.pipeline.name.replace(
            '_', '-') + "-" + task.name.replace('_', '-')
        pod_name = pod_name.lower()[:60].strip('-')
        pod = k8s_client.get_pods(namespace=namespace, pod_name=pod_name)
        # print(pod)
        if pod:
            pod = pod[0]
            k8s_client.delete_pods(namespace=namespace, pod_name=pod['name'])
            run_id = pod['labels'].get('run-id', '')
            if run_id:
                k8s_client.delete_workflow(all_crd_info=conf.get(
                    "CRD_INFO", {}), namespace=namespace, run_id=run_id)
                k8s_client.delete_pods(namespace=namespace,
                                       labels={"run-id": run_id})
                # give the API server a moment to propagate the deletions
                time.sleep(2)

    # clear the run-time pod, then the debug pod
    _clear_pod("run-")
    _clear_pod("debug-")

    flash("删除完成", category='success')
    # self.update_redirect()
    return redirect('/pipeline_modelview/web/%s' % str(task.pipeline.id))
def create_experiment(self, id):
    """Deploy the katib Experiment CRD stored on a Hyperparameter_Tuning row."""
    hp = db.session.query(Hyperparameter_Tuning).filter(
        Hyperparameter_Tuning.id == int(id)).first()
    if hp:
        from myapp.utils.py.py_k8s import K8s
        k8s_client = K8s(hp.project.cluster.get('KUBECONFIG', ''))
        namespace = conf.get('KATIB_NAMESPACE')
        experiment_crd = conf.get('CRD_INFO')['experiment']
        print(hp.experiment)
        k8s_client.create_crd(group=experiment_crd['group'],
                              version=experiment_crd['version'],
                              plural=experiment_crd['plural'],
                              namespace=namespace,
                              body=hp.experiment)
        flash('部署完成', 'success')
    # kclient = kc.KatibClient()
    # kclient.create_experiment(hp, namespace=conf.get('KATIB_NAMESPACE'))
    self.update_redirect()
    return redirect(self.get_redirect())
def log_task(self, task_id):
    """Redirect to the running run-pod's web log page for a pipeline task."""
    from myapp.utils.py.py_k8s import K8s
    task = db.session.query(Task).filter_by(id=task_id).first()
    k8s_client = K8s(task.pipeline.project.cluster.get('KUBECONFIG', ''))
    namespace = conf.get('PIPELINE_NAMESPACE')
    # reconstruct the pod name exactly as run_task creates it
    raw_name = "run-" + task.pipeline.name.replace(
        '_', '-') + "-" + task.name.replace('_', '-')
    pod_name = raw_name.lower()[:60].strip('-')
    if k8s_client.get_pods(namespace=namespace, pod_name=pod_name):
        return redirect(
            "/myapp/web/log/%s/%s/%s" %
            (task.pipeline.project.cluster['NAME'], namespace, pod_name))
    flash("未检测到当前task正在运行的容器", category='success')
    return redirect('/pipeline_modelview/web/%s' % str(task.pipeline.id))
def apply_hubsecret(self, hubsecret):
    """Apply an image-pull secret to every configured cluster and namespace.

    The empty-string kubeconfig (in-cluster/default config) is always included,
    and duplicate kubeconfig paths are applied only once.
    """
    from myapp.utils.py.py_k8s import K8s
    all_cluster = conf.get('CLUSTERS', {})
    kubeconfigs = {
        cluster_conf.get('KUBECONFIG', '')
        for cluster_conf in all_cluster.values()
    }
    kubeconfigs.add('')
    for kubeconfig in kubeconfigs:
        k8s = K8s(kubeconfig)
        for namespace in conf.get('HUBSECRET_NAMESPACE'):
            k8s.apply_hubsecret(namespace=namespace,
                                name=hubsecret.hubsecret,
                                user=hubsecret.user,
                                password=hubsecret.password,
                                server=hubsecret.server)
def make_container(service, mykfservice):
    """Build (but do not deploy) a k8s container spec for one sub-service."""
    from myapp.utils.py.py_k8s import K8s
    # no kubeconfig needed: we only construct the container spec here
    k8s = K8s()
    if service.command:
        command = ["sh", "-c", service.command]
    else:
        command = None
    working_dir = service.working_dir or None
    return k8s.make_container(
        name=mykfservice.name + "-" + service.name,
        command=command,
        args=None,
        volume_mount=None,
        image_pull_policy=conf.get('IMAGE_PULL_POLICY', 'Always'),
        image=service.images,
        working_dir=working_dir,
        env=service.env,
        resource_memory=service.resource_memory,
        resource_cpu=service.resource_cpu,
        resource_gpu=service.resource_gpu,
        username=service.created_by.username)
def schedule_node(self, ip):
    """Toggle the ``train`` label of the node with the given IP.

    Looks the IP up in the cached cluster/node map (module-level
    ``resource_used``), refreshes the node from the matching cluster, and
    flips its ``train`` label between "true" and "false" to enable/disable
    scheduling of training workloads on it.
    """
    all_node_json = resource_used['data']
    for cluster_name in all_node_json:
        nodes = all_node_json[cluster_name]
        if ip in nodes:
            clusters = conf.get('CLUSTERS', {})
            cluster = clusters[cluster_name]
            k8s_client = K8s(cluster.get('KUBECONFIG', ''))
            # re-read the node so we toggle based on its current label,
            # not the possibly stale cached value
            nodes = k8s_client.get_node(ip=ip)
            if nodes:
                node = nodes[0]
                # missing label defaults to schedulable ("true")
                enable_train = node['labels'].get('train', 'true')
                k8s_client.label_node([ip], {
                    "train":
                    "false" if enable_train == 'true' else "true"
                })
                break
    return redirect('/myapp/home')
def debug(self, docker_id):
    """Start (or attach to) the interactive debug pod for a Docker build.

    Lifecycle: a *Succeeded* leftover pod is deleted; if no pod is Running or
    Pending, a long-sleeping debug pod is created from the record's base or
    last image; then we wait up to ~10s for it to reach Running before
    redirecting to the in-browser debug page.
    """
    docker = db.session.query(Docker).filter_by(id=docker_id).first()
    from myapp.utils.py.py_k8s import K8s
    k8s_client = K8s(
        conf.get('CLUSTERS').get(conf.get('ENVIRONMENT')).get(
            'KUBECONFIG', ''))
    namespace = conf.get('NOTEBOOK_NAMESPACE')
    pod_name = "docker-%s-%s" % (docker.created_by.username, str(
        docker.id))
    pod = k8s_client.get_pods(namespace=namespace, pod_name=pod_name)
    if pod:
        pod = pod[0]
    # a finished (Succeeded) leftover pod is deleted so a fresh one can start
    # if pod and (pod['status']!='Running' and pod['status']!='Pending'):
    if pod and pod['status'] == 'Succeeded':
        k8s_client.delete_pods(namespace=namespace, pod_name=pod_name)
        time.sleep(2)
        pod = None
    # no pod, or pod in a non-runnable state: create the debug pod
    if not pod or (pod['status'] != 'Running'
                   and pod['status'] != 'Pending'):
        # keep the container alive: sleep 2h, then keep sleeping hourly
        # while the hour is >= 06 (i.e. let it die overnight)
        command = [
            'sh', '-c',
            'sleep 7200 && hour=`date +%H` && while [ $hour -ge 06 ];do sleep 3600;hour=`date +%H`;done'
        ]
        hostAliases = conf.get('HOSTALIASES')
        default_volume_mount = docker.project.volume_mount
        k8s_client.create_debug_pod(
            namespace,
            name=pod_name,
            command=command,
            labels={},
            args=None,
            # per-record overrides live in the json 'expand' column;
            # fall back to project/user defaults when absent
            volume_mount=json.loads(docker.expand).get(
                'volume_mount', default_volume_mount)
            if docker.expand else default_volume_mount,
            working_dir='/mnt/%s' % docker.created_by.username,
            node_selector='%s=true,train=true,org=public' %
            ('gpu' if docker.need_gpu else 'cpu'),
            resource_memory=json.loads(docker.expand).get(
                'resource_memory', '8G') if docker.expand else '8G',
            resource_cpu=json.loads(docker.expand).get(
                'resource_cpu', '4') if docker.expand else '4',
            resource_gpu=json.loads(
                docker.expand if docker.expand else '{}').get(
                    'resource_gpu', '1') if docker.need_gpu else '0',
            image_pull_policy=conf.get('IMAGE_PULL_POLICY', 'Always'),
            image_pull_secrets=conf.get('HUBSECRET', []),
            # reuse the last built image only when consecutive builds are on
            image=docker.last_image
            if docker.last_image and docker.consecutive_build else
            docker.base_image,
            hostAliases=hostAliases,
            env=None,
            privileged=None,
            accounts=None,
            username=docker.created_by.username)
    # wait (5 tries x 2s) for the pod to reach Running
    try_num = 5
    while (try_num > 0):
        pod = k8s_client.get_pods(namespace=namespace, pod_name=pod_name)
        # print(pod)
        if pod:
            pod = pod[0]
            if pod and pod['status'] == 'Running':
                break
        try_num = try_num - 1
        time.sleep(2)
    if try_num == 0:
        flash('启动时间过长,一分钟后重试', 'warning')
        return redirect('/docker_modelview/list/')
    flash('镜像调试只安装环境,请不要运行业务代码。当晚前请注意保存镜像', 'warning')
    return redirect("/docker_modelview/web/debug/%s/%s/%s" %
                    (conf.get('ENVIRONMENT'), namespace, pod_name))
def run_task(self, task_id):
    """Run a single pipeline task as a standalone pod and open its log page.

    Any existing run pod (and the argo workflow sharing its run-id) is deleted
    first, with a 60s wait for the deletion to complete; then a fresh pod is
    created from the task's job template (or the customize-job settings) and
    we wait up to ~10s for it to appear before redirecting.
    """
    task = db.session.query(Task).filter_by(id=task_id).first()
    from myapp.utils.py.py_k8s import K8s
    k8s_client = K8s(task.pipeline.project.cluster.get('KUBECONFIG', ''))
    namespace = conf.get('PIPELINE_NAMESPACE')
    pod_name = "run-" + task.pipeline.name.replace(
        '_', '-') + "-" + task.name.replace('_', '-')
    pod_name = pod_name.lower()[:60].strip('-')
    pod = k8s_client.get_pods(namespace=namespace, pod_name=pod_name)
    # print(pod)
    if pod:
        pod = pod[0]
    # an old pod exists: delete it (and its workflow) before re-running
    if pod:
        run_id = pod['labels'].get("run-id", '')
        if run_id:
            k8s_client.delete_workflow(all_crd_info=conf.get(
                'CRD_INFO', {}), namespace=namespace, run_id=run_id)
        k8s_client.delete_pods(namespace=namespace, pod_name=pod_name)
        # poll until the pod is really gone, giving up after 60s
        delete_time = datetime.datetime.now()
        while pod:
            time.sleep(2)
            pod = k8s_client.get_pods(namespace=namespace,
                                      pod_name=pod_name)
            check_date = datetime.datetime.now()
            if (check_date - delete_time).seconds > 60:
                flash("超时,请稍后重试", category='warning')
                return redirect('/pipeline_modelview/web/%s' %
                                str(task.pipeline.id))
    # no leftover pod: create the run pod
    if not pod:
        # resolve the container command: task override > template entrypoint
        command = None
        if task.job_template.entrypoint:
            command = task.job_template.entrypoint
        if task.command:
            command = task.command
        if command:
            command = command.split(" ")
            command = [com for com in command if com]
        # flatten the task's json args into a CLI argument list
        ops_args = []
        task_args = json.loads(task.args) if task.args else {}
        for task_attr_name in task_args:
            # booleans contribute only the flag name (when true)
            if type(task_args[task_attr_name]) == bool:
                if task_args[task_attr_name]:
                    ops_args.append('%s' % str(task_attr_name))
            # dict/list values are passed as json strings
            elif type(task_args[task_attr_name]) == dict or type(
                    task_args[task_attr_name]) == list:
                ops_args.append('%s' % str(task_attr_name))
                ops_args.append('%s' % json.dumps(
                    task_args[task_attr_name], ensure_ascii=False))
            elif not task_args[task_attr_name]:
                # empty values: neither name nor value is added
                pass
            else:
                ops_args.append('%s' % str(task_attr_name))
                ops_args.append(
                    '%s' % str(task_args[task_attr_name])
                )  # different arg types need different handling, e.g. bool has a flag but no value
        # print(ops_args)
        run_id = "run-" + str(task.pipeline.id) + "-" + str(task.id)
        # customize-job tasks take image/workdir/command from the task args;
        # normal tasks take them from the job template
        self.run_pod(
            task=task,
            k8s_client=k8s_client,
            run_id=run_id,
            namespace=namespace,
            pod_name=pod_name,
            image=json.loads(task.args)['images']
            if task.job_template.name == conf.get('CUSTOMIZE_JOB') else
            task.job_template.images.name,
            working_dir=json.loads(task.args)['workdir']
            if task.job_template.name == conf.get('CUSTOMIZE_JOB') else
            task.job_template.workdir,
            command=['bash', '-c',
                     json.loads(task.args)['command']]
            if task.job_template.name == conf.get('CUSTOMIZE_JOB') else
            command,
            args=None if task.job_template.name == conf.get('CUSTOMIZE_JOB')
            else ops_args)
    # wait (5 tries x 2s) for the new pod to appear
    try_num = 5
    while (try_num > 0):
        pod = k8s_client.get_pods(namespace=namespace, pod_name=pod_name)
        # print(pod)
        if pod:
            break
        try_num = try_num - 1
        time.sleep(2)
    if try_num == 0:
        flash('启动时间过长,一分钟后重试', 'warning')
        return redirect('/pipeline_modelview/web/%s' %
                        str(task.pipeline.id))
    return redirect(
        "/myapp/web/log/%s/%s/%s" %
        (task.pipeline.project.cluster['NAME'], namespace, pod_name))
def debug(self, task_id):
    """Start (or attach to) a debug pod for a pipeline task.

    For non-customize job templates only the template creator or an admin may
    debug. A Succeeded leftover pod is deleted; if no Running pod exists a
    long-sleeping debug pod is created; then we wait up to ~10s for Running
    before redirecting to the in-browser terminal.
    """
    task = db.session.query(Task).filter_by(id=task_id).first()
    if task.job_template.name != conf.get('CUSTOMIZE_JOB'):
        # permission gate: only admins or the template creator may debug
        if not g.user.is_admin(
        ) and task.job_template.created_by.username != g.user.username:
            flash('仅管理员或当前任务模板创建者,可启动debug模式', 'warning')
            return redirect('/pipeline_modelview/web/%s' %
                            str(task.pipeline.id))
    from myapp.utils.py.py_k8s import K8s
    k8s_client = K8s(task.pipeline.project.cluster.get('KUBECONFIG', ''))
    namespace = conf.get('PIPELINE_NAMESPACE')
    pod_name = "debug-" + task.pipeline.name.replace(
        '_', '-') + "-" + task.name.replace('_', '-')
    pod_name = pod_name.lower()[:60].strip('-')
    pod = k8s_client.get_pods(namespace=namespace, pod_name=pod_name)
    # print(pod)
    if pod:
        pod = pod[0]
    # delete a finished (Succeeded) leftover pod so a fresh one can start
    # if pod and (pod['status']!='Running' and pod['status']!='Pending'):
    if pod and pod['status'] == 'Succeeded':
        k8s_client.delete_pods(namespace=namespace, pod_name=pod_name)
        time.sleep(2)
        pod = None
    # no pod, or pod not running: create the debug pod
    if not pod or pod['status'] != 'Running':
        run_id = "debug-" + str(uuid.uuid4().hex)
        # keep the container alive: sleep 2h, then hourly while hour >= 06
        command = [
            'sh', '-c',
            'sleep 7200 && hour=`date +%H` && while [ $hour -ge 06 ];do sleep 3600;hour=`date +%H`;done'
        ]
        self.run_pod(task=task,
                     k8s_client=k8s_client,
                     run_id=run_id,
                     namespace=namespace,
                     pod_name=pod_name,
                     # customize jobs debug with the image from the task args
                     image=json.loads(task.args)['images']
                     if task.job_template.name == conf.get('CUSTOMIZE_JOB')
                     else task.job_template.images.name,
                     working_dir='/mnt',
                     command=command,
                     args=None)
    # wait (5 tries x 2s) for the pod to reach Running
    try_num = 5
    while (try_num > 0):
        pod = k8s_client.get_pods(namespace=namespace, pod_name=pod_name)
        # print(pod)
        if pod:
            pod = pod[0]
            if pod and pod['status'] == 'Running':
                break
        try_num = try_num - 1
        time.sleep(2)
    if try_num == 0:
        flash('启动时间过长,一分钟后重试', 'warning')
        return redirect('/pipeline_modelview/web/%s' %
                        str(task.pipeline.id))
    return redirect("/myapp/web/debug/%s/%s/%s/%s" %
                    (task.pipeline.project.cluster['NAME'], namespace,
                     pod_name, pod_name))
def deploy(self, service_id):
    """Deploy a Service: deployment + service + istio ingress (+ optional
    externalIP service).

    Image-pull secrets combine the global HUBSECRET list with the current
    user's repository hubsecrets. The host for the istio ingress is either
    ``<name>.<SERVICE_DOMAIN>`` or the user-supplied host stripped of scheme,
    path and port. When SERVICE_EXTERNAL_IP is configured (globally or on the
    project), an extra "-external" service exposes each port at
    ``30000 + 10*id + index``.
    """
    image_secrets = conf.get('HUBSECRET', [])
    user_hubsecrets = db.session.query(Repository.hubsecret).filter(
        Repository.created_by_fk == g.user.id).all()
    if user_hubsecrets:
        for hubsecret in user_hubsecrets:
            if hubsecret[0] not in image_secrets:
                image_secrets.append(hubsecret[0])
    service = db.session.query(Service).filter_by(id=service_id).first()
    from myapp.utils.py.py_k8s import K8s
    k8s_client = K8s(service.project.cluster.get('KUBECONFIG', ''))
    namespace = conf.get('SERVICE_NAMESPACE')
    volume_mount = service.volume_mount
    k8s_client.create_deployment(
        namespace=namespace,
        name=service.name,
        replicas=service.replicas,
        labels={"app": service.name, "username": service.created_by.username},
        command=['bash', '-c', service.command] if service.command else None,
        args=None,
        volume_mount=volume_mount,
        working_dir=service.working_dir,
        node_selector=service.get_node_selector(),
        resource_memory=service.resource_memory,
        resource_cpu=service.resource_cpu,
        resource_gpu=service.resource_gpu if service.resource_gpu else '',
        image_pull_policy=conf.get('IMAGE_PULL_POLICY', 'Always'),
        image_pull_secrets=image_secrets,
        image=service.images,
        hostAliases=conf.get('HOSTALIASES', ''),
        env=service.env,
        privileged=False,
        accounts=None,
        username=service.created_by.username,
        ports=[int(port) for port in service.ports.split(',')]
    )
    ports = [int(port) for port in service.ports.split(',')]
    k8s_client.create_service(
        namespace=namespace,
        name=service.name,
        username=service.created_by.username,
        ports=ports
    )
    # when a gateway domain is configured, expose through it
    host = service.name + "." + conf.get('SERVICE_DOMAIN')
    if service.host:
        # normalize the user-supplied host: strip scheme, path and port
        host = service.host.replace('http://', '').replace('https://', '').strip()
        if "/" in host:
            host = host[:host.index("/")]
        if ":" in host:
            host = host[:host.index(":")]
    k8s_client.create_istio_ingress(namespace=namespace,
                                    name=service.name,
                                    host=host,
                                    ports=service.ports.split(',')
                                    )
    # ip-based access goes through proxy ips, so pod ip churn on scale
    # up/down doesn't break clients
    # create the EXTERNAL_IP service
    SERVICE_EXTERNAL_IP = conf.get('SERVICE_EXTERNAL_IP', None)
    if not SERVICE_EXTERNAL_IP and service.project.expand:
        # project-level override stored in the json 'expand' column
        SERVICE_EXTERNAL_IP = json.loads(service.project.expand).get('SERVICE_EXTERNAL_IP', SERVICE_EXTERNAL_IP)
    if type(SERVICE_EXTERNAL_IP) == str:
        SERVICE_EXTERNAL_IP = [SERVICE_EXTERNAL_IP]
    if SERVICE_EXTERNAL_IP:
        # deterministic external port per service/port: 30000 + 10*id + index
        service_ports = [[30000 + 10 * service.id + index, port] for index, port in enumerate(ports)]
        service_external_name = (service.name + "-external").lower()[:60].strip('-')
        k8s_client.create_service(
            namespace=namespace,
            name=service_external_name,
            username=service.created_by.username,
            ports=service_ports,
            selector={"app": service.name, 'user': service.created_by.username},
            externalIPs=SERVICE_EXTERNAL_IP
        )
    # NOTE: a large commented-out example that proxied the service through an
    # istio VirtualService was removed here (dead code).
    flash('服务部署完成', category='warning')
    return redirect('/service_modelview/list/')
def save(self, docker_id):
    """Commit the running debug container to ``target_image`` and push it.

    Finds the node and container id of the user's running debug pod, then
    schedules a privileged helper pod *on the same node* (it mounts the
    node's docker.sock) that runs ``docker commit`` + ``docker push``. An
    async task then watches that pod and updates ``last_image`` on success.
    """
    docker = db.session.query(Docker).filter_by(id=docker_id).first()
    from myapp.utils.py.py_k8s import K8s
    k8s_client = K8s(
        conf.get('CLUSTERS').get(conf.get('ENVIRONMENT')).get(
            'KUBECONFIG', ''))
    namespace = conf.get('NOTEBOOK_NAMESPACE')
    pod_name = "docker-%s-%s" % (docker.created_by.username, str(
        docker.id))
    pod = k8s_client.v1.read_namespaced_pod(name=pod_name,
                                            namespace=namespace)
    node_name = ''
    container_id = ''
    if pod:
        node_name = pod.spec.node_name
        # the debug container is the one named after the pod itself
        containers = [
            container for container in pod.status.container_statuses
            if container.name == pod_name
        ]
        if containers:
            container_id = containers[0].container_id.replace(
                'docker://', '')
    if not node_name or not container_id:
        # NOTE(review): "调试惊险" in this message looks like a typo of "调试镜像"
        flash('没有发现正在运行的调试镜像,请先调试惊险,安装环境后,再保存生成新镜像',
              category='warning')
        return redirect('/docker_modelview/list/')
    # flash('新镜像正在保存推送中,请留意消息通知',category='success')
    # return redirect('/docker_modelview/list/')
    pod_name = "docker-commit-%s-%s" % (docker.created_by.username,
                                        str(docker.id))
    command = [
        'sh', '-c',
        'docker commit %s %s && docker push %s' %
        (container_id, docker.target_image, docker.target_image)
    ]
    hostAliases = conf.get('HOSTALIASES')
    k8s_client.create_debug_pod(
        namespace=namespace,
        name=pod_name,
        command=command,
        labels={},
        args=None,
        # mount the host docker socket so `docker commit` can reach the
        # node's docker daemon
        volume_mount='/var/run/docker.sock(hostpath):/var/run/docker.sock',
        working_dir='/mnt/%s' % docker.created_by.username,
        node_selector=None,
        resource_memory='4G',
        resource_cpu='4',
        resource_gpu='0',
        image_pull_policy=conf.get('IMAGE_PULL_POLICY', 'Always'),
        image_pull_secrets=conf.get('HUBSECRET', []),
        image='ccr.ccs.tencentyun.com/cube-studio/docker',
        hostAliases=hostAliases,
        env=None,
        privileged=None,
        accounts=None,
        username=docker.created_by.username,
        # pin to the same node as the debug pod: the target container
        # only exists in that node's docker daemon
        node_name=node_name)
    from myapp.tasks.async_task import check_docker_commit
    # async task: watch the commit pod and update last_image when it succeeds
    kwargs = {"docker_id": docker.id}
    check_docker_commit.apply_async(kwargs=kwargs)
    return redirect("/myapp/web/log/%s/%s/%s" %
                    (conf.get('ENVIRONMENT'), namespace, pod_name))
from myapp.project import push_admin, push_message from myapp import app, db, security_manager from myapp.models.model_job import (Pipeline, Workflow, Task) from myapp.utils.celery import session_scope conf = app.config prometheus = Prometheus(conf.get('PROMETHEUS', '')) cluster = os.getenv('ENVIRONMENT', '').lower() if not cluster: print('no cluster %s' % cluster) exit(1) else: clusters = conf.get('CLUSTERS', {}) if clusters and cluster in clusters: kubeconfig = clusters[cluster].get('KUBECONFIG', '') k8s_client = K8s(kubeconfig) # k8s_config.kube_config.load_kube_config(config_file=kubeconfig) else: print('no kubeconfig in cluster %s' % cluster) exit(1) # 推送微信消息 # @pysnooper.snoop() def deliver_message(workflow, dbsession): if not workflow: return receivers = workflow.username.split(',') receivers = [receiver.strip() for receiver in receivers]
def upgrade_service(task, service_id, name, namespace):
    """Celery task: once a new inference-service deployment is fully ready,
    take every older online version of the same model/host offline.

    Polls the new deployment's ready-replica count (up to ~10 minutes,
    checking once per minute); on success, deletes the k8s resources of each
    old online InferenceService and marks it offline in the DB. Progress and
    failures are pushed to admins and the service owner.
    """
    # give the new deployment a moment to register before the first check
    time.sleep(10)
    with session_scope(nullpool=True) as dbsession:
        try:
            service = dbsession.query(InferenceService).filter_by(
                id=int(service_id)).first()
            message = '%s 准备进行服务迭代 %s %s' % (
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                service.model_name, service.model_version)
            push_admin(message)
            push_message([service.created_by.username], message)
            k8s_client = K8s(service.project.cluster['KUBECONFIG'])
            begin_time = time.time()
            while (True):
                try:
                    deployment = k8s_client.AppsV1Api.read_namespaced_deployment(
                        name=name, namespace=namespace)
                    if deployment:
                        ready_replicas = deployment.status.ready_replicas
                        replicas = deployment.status.replicas
                        message = '%s 服务 %s %s ready副本数:%s 目标副本数:%s' % (
                            datetime.datetime.now().strftime(
                                '%Y-%m-%d %H:%M:%S'), service.model_name,
                            service.model_version, ready_replicas, replicas)
                        push_admin(message)
                        push_message([service.created_by.username], message)
                        # all replicas of the new deployment are ready
                        if ready_replicas == replicas:
                            break
                    else:
                        # deployment vanished: abort the upgrade watch
                        message = '%s 没有发现 %s %s 的 deployment' % (
                            datetime.datetime.now().strftime(
                                '%Y-%m-%d %H:%M:%S'), service.model_name,
                            service.model_version)
                        push_admin(message)
                        push_message([service.created_by.username], message)
                        return
                except Exception as e:
                    print(e)
                # overall timeout: 10 minutes of polling
                if time.time() - begin_time > 600:
                    message = '%s 新版本运行状态检查超时,请手动检查和清理旧版本%s %s' % (
                        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        service.model_name, service.model_version)
                    push_admin(message)
                    push_message([service.created_by.username], message)
                    return
                time.sleep(60)
            # new version is ready: find the old online versions of the same
            # model on the same host (excluding the new service itself)
            old_services = dbsession.query(InferenceService)\
                .filter(InferenceService.model_status=='online')\
                .filter(InferenceService.model_name==service.model_name)\
                .filter(InferenceService.name!=service.name)\
                .filter(InferenceService.host==service.host).all()
            if old_services:
                for old_service in old_services:
                    if old_service.name != service.name:
                        inference_model_view = InferenceService_ModelView_base(
                        )
                        inference_model_view.delete_old_service(
                            old_service.name, old_service.project.cluster)
                        old_service.model_status = 'offline'
                        # NOTE(review): this reads service.deploy_history (the
                        # new service), not old_service's — verify intended
                        old_service.deploy_history = service.deploy_history + "\n" + "clear: %s %s" % (
                            'admin', datetime.datetime.now().strftime(
                                '%Y-%m-%d %H:%M:%S'))
                        dbsession.commit()
                        message = '%s 新版本服务升级完成,下线旧服务 %s %s' % (
                            datetime.datetime.now().strftime(
                                '%Y-%m-%d %H:%M:%S'), service.model_name,
                            old_service.model_version)
                        push_admin(message)
                        push_message([service.created_by.username], message)
            else:
                message = '%s %s 没有历史在线版本,%s版本升级完成' % (
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    service.model_name, service.model_version)
                push_admin(message)
                push_message([service.created_by.username], message)
        except Exception as e:
            print(e)
            push_admin('部署升级报错 %s %s: %s' %
                       (service.model_name, service.model_version, str(e)))
def deploy1(self, kfservice_id):
    """Deploy a KfService as a KFServing ``InferenceService`` CRD.

    Deletes any CRD object with the same name first, then builds the
    default (and optional canary) predictor spec from the record's
    sub-services and creates the CRD.
    """
    mykfservice = db.session.query(KfService).filter_by(
        id=kfservice_id).first()
    from myapp.utils.py.py_k8s import K8s
    k8s = K8s(mykfservice.project.cluster.get('KUBECONFIG', ''))
    namespace = conf.get('KFSERVING_NAMESPACE')
    crd_info = conf.get('CRD_INFO')['inferenceservice']
    # delete any pre-existing CRD object of the same name before re-creating
    crd_list = k8s.get_crd(group=crd_info['group'],
                           version=crd_info['version'],
                           plural=crd_info['plural'],
                           namespace=namespace)
    for crd_obj in crd_list:
        if crd_obj['name'] == mykfservice.name:
            k8s.delete_crd(group=crd_info['group'],
                           version=crd_info['version'],
                           plural=crd_info['plural'],
                           namespace=namespace,
                           name=mykfservice.name)

    def get_env(env_str):
        # parse "KEY=VALUE" lines into [[key, value], ...]; ignores malformed lines
        if not env_str:
            return []
        envs = re.split('\r|\n', env_str)
        envs = [
            env.split('=') for env in envs
            if env and len(env.split('=')) == 2
        ]
        return envs

    def get_kfjson(service, mykfservice):
        # build the custom-predictor json for one sub-service; None when absent
        if not service:
            return None
        image_secrets = conf.get('HUBSECRET', [])
        user_hubsecrets = db.session.query(Repository.hubsecret).filter(
            Repository.created_by_fk == g.user.id).all()
        if user_hubsecrets:
            for hubsecret in user_hubsecrets:
                if hubsecret[0] not in image_secrets:
                    image_secrets.append(hubsecret[0])
        kfjson = {
            "minReplicas": service.min_replicas,
            "maxReplicas": service.max_replicas,
            "custom": {
                # schedule onto gpu=true or cpu=true nodes depending on the
                # service's gpu request
                "affinity": {
                    "nodeAffinity": {
                        "requiredDuringSchedulingIgnoredDuringExecution": {
                            "nodeSelectorTerms": [{
                                "matchExpressions": [
                                    {
                                        "key": "gpu" if core.get_gpu(
                                            service.resource_gpu)[0] else "cpu",
                                        "operator": "In",
                                        "values": ["true"]
                                    },
                                ]
                            }]
                        }
                    },
                },
                "imagePullSecrets": [{
                    "name": hubsecret
                } for hubsecret in image_secrets],
                "container": {
                    "image": service.images,
                    "imagePullPolicy": conf.get('IMAGE_PULL_POLICY', 'Always'),
                    "name": mykfservice.name + "-" + service.name,
                    "workingDir":
                    service.working_dir if service.working_dir else None,
                    "command": ["sh", "-c", service.command]
                    if service.command else None,
                    "resources": {
                        "requests": {
                            "cpu": service.resource_cpu,
                            "memory": service.resource_memory
                        }
                    },
                    "env": [{
                        "name": env[0],
                        "value": env[1]
                    } for env in get_env(service.env)],
                    # NOTE: commented-out volumeMounts/volumeDevices/volumes
                    # examples were removed here (dead code)
                }
            }
        }
        return kfjson

    crd_json = {
        "apiVersion": "serving.kubeflow.org/v1alpha2",
        "kind": "InferenceService",
        "metadata": {
            "labels": {
                "app": mykfservice.name
            },
            "name": mykfservice.name,
            "namespace": namespace
        },
        "spec": {
            "canaryTrafficPercent": mykfservice.canary_traffic_percent,
            "default": {
                mykfservice.service_type:
                get_kfjson(mykfservice.default_service, mykfservice)
            },
            # canary spec only present when a canary sub-service is configured
            "canary": {
                mykfservice.service_type:
                get_kfjson(mykfservice.canary_service, mykfservice),
            } if mykfservice.canary_service else None,
        }
    }
    import yaml
    # NOTE(review): yaml.load without an explicit Loader is deprecated and
    # unsafe on untrusted input; here the input is locally built json
    ya = yaml.load(json.dumps(crd_json))
    ya_str = yaml.safe_dump(ya, default_flow_style=False)
    logging.info(ya_str)
    crd_objects = k8s.create_crd(group=crd_info['group'],
                                 version=crd_info['version'],
                                 plural=crd_info['plural'],
                                 namespace=namespace,
                                 body=crd_json)
    flash(category='warning', message='部署启动,一分钟后部署完成')
    return redirect('/kfservice_modelview/list/')
def run(self):
    """Launch a one-off "venus" debug pod from a job template.

    Expects a JSON body with either job_template_id or job_template_name,
    plus run_id, args, and optional resource_memory / resource_cpu.
    Any previous pod with the same derived name is deleted first, then a
    fresh pod is created and polled briefly until it appears.

    Returns 200 with a dashboard log URL on success, 400 on bad input,
    missing template, or startup timeout.
    """
    request_data = request.json
    job_template_id = request_data.get('job_template_id', '')
    job_template_name = request_data.get('job_template_name', '')
    # run_id becomes part of a k8s resource name, so '_' is not allowed.
    run_id = request_data.get('run_id', '').replace('_', '-')
    resource_memory = request_data.get('resource_memory', '')
    resource_cpu = request_data.get('resource_cpu', '')
    task_args = request_data.get('args', '')
    if (not job_template_id and not job_template_name) or not run_id or task_args == '':
        response = make_response("输入参数不齐全")
        response.status_code = 400
        return response
    # Resolve the template by id first, falling back to name.
    job_template = None
    if job_template_id:
        job_template = db.session.query(Job_Template).filter_by(
            id=int(job_template_id)).first()
    elif job_template_name:
        job_template = db.session.query(Job_Template).filter_by(
            name=job_template_name).first()
    if not job_template:
        response = make_response("no job template exist")
        response.status_code = 400
        return response

    from myapp.utils.py.py_k8s import K8s
    k8s = K8s()
    namespace = conf.get('PIPELINE_NAMESPACE')
    # k8s names: lowercase, max 63 chars; trim and strip stray '-' edges.
    pod_name = "venus-" + run_id.replace('_', '-')
    pod_name = pod_name.lower()[:60].strip('-')
    pod = k8s.get_pods(namespace=namespace, pod_name=pod_name)
    # print(pod)
    if pod:
        pod = pod[0]
    # A pod from an earlier run exists: delete it and start clean.
    if pod:
        k8s.delete_pods(namespace=namespace, pod_name=pod_name)
        time.sleep(2)
        pod = None

    # No (remaining) pod: create one.
    if not pod:
        # Flatten the user-supplied args into ["--name", "value", ...],
        # JSON-encoding values whose template-declared type is 'json'.
        # NOTE(review): falsy values (0, '', False) are silently skipped —
        # presumably intentional; confirm against template semantics.
        args = []
        job_template_args = json.loads(
            job_template.args) if job_template.args else {}
        for arg_name in task_args:
            arg_type = ''
            for group in job_template_args:
                for template_arg in job_template_args[group]:
                    if template_arg == arg_name:
                        arg_type = job_template_args[group][
                            template_arg].get('type', '')
            arg_value = task_args[arg_name]
            if arg_value:
                args.append(arg_name)
                if arg_type == 'json':
                    args.append(json.dumps(arg_value))
                else:
                    args.append(arg_value)

        # command = ['sh', '-c','sleep 7200']
        volume_mount = 'kubeflow-cfs-workspace(pvc):/mnt,kubeflow-cfs-archives(pvc):/archives'
        # Compose the KFJ_* environment block consumed by the job image.
        env = job_template.env + "\n"
        env += 'KFJ_TASK_ID=0\n'
        env += 'KFJ_TASK_NAME=' + str('venus-' + run_id) + "\n"
        env += 'KFJ_TASK_NODE_SELECTOR=cpu=true,train=true\n'
        env += 'KFJ_TASK_VOLUME_MOUNT=' + str(volume_mount) + "\n"
        env += 'KFJ_TASK_IMAGES=' + str(job_template.images) + "\n"
        env += 'KFJ_TASK_RESOURCE_CPU=' + str(resource_cpu) + "\n"
        env += 'KFJ_TASK_RESOURCE_MEMORY=' + str(resource_memory) + "\n"
        env += 'KFJ_TASK_RESOURCE_GPU=0\n'
        env += 'KFJ_PIPELINE_ID=0\n'
        env += 'KFJ_RUN_ID=' + run_id + "\n"
        env += 'KFJ_CREATOR=' + str(g.user.username) + "\n"
        env += 'KFJ_RUNNER=' + str(g.user.username) + "\n"
        env += 'KFJ_PIPELINE_NAME=venus\n'
        env += 'KFJ_NAMESPACE=pipeline' + "\n"

        def template_str(src_str):
            # Render jinja2 placeholders in GLOBAL_ENV; DebugUndefined leaves
            # unknown placeholders in place instead of raising.
            rtemplate = Environment(
                loader=BaseLoader,
                undefined=DebugUndefined).from_string(src_str)
            des_str = rtemplate.render(
                creator=g.user.username,
                datetime=datetime,
                runner=g.user.username,
                uuid=uuid,
                pipeline_id='0',
                pipeline_name='venus-task',
                cluster_name=conf.get('ENVIRONMENT'))
            return des_str

        global_envs = json.loads(
            template_str(
                json.dumps(conf.get('GLOBAL_ENV', {}),
                           indent=4,
                           ensure_ascii=False)))
        for global_env_key in global_envs:
            env += global_env_key + '=' + global_envs[global_env_key] + "\n"

        hostAliases = job_template.hostAliases + "\n" + conf.get(
            'HOSTALIASES', '')
        k8s.create_debug_pod(
            namespace,
            name=pod_name,
            labels={'run-rtx': g.user.username},
            command=None,
            args=args,
            volume_mount=volume_mount,
            working_dir=None,
            node_selector='cpu=true,train=true',
            resource_cpu=resource_cpu,
            resource_memory=resource_memory,
            resource_gpu=0,
            image_pull_policy=conf.get('IMAGE_PULL_POLICY', 'Always'),
            image_pull_secrets=[job_template.images.repository.hubsecret],
            image=job_template.images.name,
            hostAliases=hostAliases,
            env=env,
            privileged=job_template.privileged,
            accounts=job_template.accounts,
            username=g.user.username)

        # Poll up to 5 times (~10s) for the pod to show up.
        try_num = 5
        while (try_num > 0):
            pod = k8s.get_pods(namespace=namespace, pod_name=pod_name)
            # print(pod)
            if pod:
                break
            try_num = try_num - 1
            time.sleep(2)
        if try_num == 0:
            response = make_response("启动时间过长,一分钟后重试")
            response.status_code = 400
            return response

    # Admins get the cluster dashboard URL; everyone else the pipeline one.
    user_roles = [role.name.lower() for role in list(g.user.roles)]
    if "admin" in user_roles:
        pod_url = conf.get(
            'K8S_DASHBOARD_CLUSTER'
        ) + "#/log/%s/%s/pod?namespace=%s&container=%s" % (
            namespace, pod_name, namespace, pod_name)
    else:
        pod_url = conf.get(
            'K8S_DASHBOARD_PIPELINE'
        ) + "#/log/%s/%s/pod?namespace=%s&container=%s" % (
            namespace, pod_name, namespace, pod_name)
    print(pod_url)
    response = make_response("启动成功,日志地址: %s" % pod_url)
    response.status_code = 200
    return response
def reset_theia(self, notebook):
    """Recreate the full k8s footprint of a notebook (jupyter or theia IDE).

    Creates the debug pod, its ClusterIP service, an Istio VirtualService
    routing /notebook/<ns>/<name>/ to it, and optionally an external-IP
    service when SERVICE_EXTERNAL_IP is configured globally or on the
    project's expand JSON.

    :param notebook: Notebook ORM object (cluster, ide_type, resources, owner)
    :returns: the created VirtualService CRD object
    """
    from myapp.utils.py.py_k8s import K8s
    k8s_client = K8s(notebook.cluster.get('KUBECONFIG', ''))
    namespace = conf.get('NOTEBOOK_NAMESPACE')
    port = 3000
    command = None
    workingDir = None
    volume_mount = notebook.volume_mount
    # Give the pod a tmpfs-backed /dev/shm unless one is already mounted.
    if '/dev/shm' not in volume_mount:
        volume_mount += ',10G(memory):/dev/shm'
    rewrite_url = '/'
    # Fire-and-forget init scripts (image-level and per-user) before the IDE.
    pre_command = '(nohup sh /init.sh > /notebook_init.log 2>&1 &) ; (nohup sh /mnt/%s/init.sh > /init.log 2>&1 &) ; ' % notebook.created_by.username
    if notebook.ide_type == 'jupyter':
        rewrite_url = '/notebook/jupyter/%s/' % notebook.name
        workingDir = '/mnt/%s' % notebook.created_by.username
        command = [
            "sh", "-c", "%s jupyter lab --notebook-dir=/ --ip=0.0.0.0 "
            "--no-browser --allow-root --port=%s "
            "--NotebookApp.token='' --NotebookApp.password='' "
            "--NotebookApp.allow_origin='*' "
            "--NotebookApp.base_url=%s" % (pre_command, port, rewrite_url)
        ]
    elif notebook.ide_type == 'theia':
        command = [
            "bash", '-c',
            '%s node /home/theia/src-gen/backend/main.js /home/project --hostname=0.0.0.0 --port=%s'
            % (pre_command, port)
        ]
        workingDir = '/home/theia'
    print(command)
    print(workingDir)

    # Platform pull secrets plus the notebook owner's registry secrets.
    image_secrets = conf.get('HUBSECRET', [])
    user_hubsecrets = db.session.query(Repository.hubsecret).filter(
        Repository.created_by_fk == notebook.created_by.id).all()
    if user_hubsecrets:
        for hubsecret in user_hubsecrets:
            if hubsecret[0] not in image_secrets:
                image_secrets.append(hubsecret[0])

    k8s_client.create_debug_pod(
        namespace=namespace,
        name=notebook.name,
        labels={
            "app": notebook.name,
            'user': notebook.created_by.username
        },
        command=command,
        args=None,
        volume_mount=volume_mount,
        working_dir=workingDir,
        node_selector=notebook.get_node_selector(),
        # Requests start at 0 and burst up to the configured limits.
        resource_memory="0G~" + notebook.resource_memory,
        resource_cpu="0~" + notebook.resource_cpu,
        resource_gpu=notebook.resource_gpu,
        image_pull_policy=conf.get('IMAGE_PULL_POLICY', 'Always'),
        image_pull_secrets=image_secrets,
        image=notebook.images,
        hostAliases=conf.get('HOSTALIASES', ''),
        env={
            "NO_AUTH": "true",
            "USERNAME": notebook.created_by.username,
            # NOTE(review): assumes resource_memory is always "<n>G" —
            # other units would raise ValueError here.
            "NODE_OPTIONS": "--max-old-space-size=%s" %
                            str(int(notebook.resource_memory.replace("G", '')) * 1024)
        },
        privileged=None,
        accounts=conf.get('JUPYTER_ACCOUNTS'),
        username=notebook.created_by.username)
    k8s_client.create_service(namespace=namespace,
                              name=notebook.name,
                              username=notebook.created_by.username,
                              ports=[
                                  port,
                              ])

    # Replace any existing VirtualService before creating the new one.
    crd_info = conf.get('CRD_INFO', {}).get('virtualservice', {})
    crd_name = "notebook-jupyter-%s" % notebook.name.replace('_', '-')
    vs_obj = k8s_client.get_one_crd(group=crd_info['group'],
                                    version=crd_info['version'],
                                    plural=crd_info['plural'],
                                    namespace=namespace,
                                    name=crd_name)
    if vs_obj:
        k8s_client.delete_crd(group=crd_info['group'],
                              version=crd_info['version'],
                              plural=crd_info['plural'],
                              namespace=namespace,
                              name=crd_name)
        time.sleep(1)
    host = notebook.project.cluster.get('JUPYTER_DOMAIN', request.host)
    if not host:
        host = request.host
    if ':' in host:
        host = host[:host.rindex(':')]  # strip the port if present

    crd_json = {
        "apiVersion": "networking.istio.io/v1alpha3",
        "kind": "VirtualService",
        "metadata": {
            "name": crd_name,
            "namespace": namespace
        },
        "spec": {
            "gateways": ["kubeflow/kubeflow-gateway"],
            # A bare IP cannot be used as an Istio host; match everything.
            "hosts": ["*" if core.checkip(host) else host],
            "http": [{
                "match": [{
                    "uri": {
                        "prefix":
                            "/notebook/%s/%s/" % (namespace, notebook.name)
                    }
                }],
                "rewrite": {
                    "uri": rewrite_url
                },
                "route": [{
                    "destination": {
                        "host":
                            "%s.%s.svc.cluster.local" %
                            (notebook.name, namespace),
                        "port": {
                            "number": port
                        }
                    }
                }],
                "timeout": "300s"
            }]
        }
    }
    # print(crd_json)
    crd = k8s_client.create_crd(group=crd_info['group'],
                                version=crd_info['version'],
                                plural=crd_info['plural'],
                                namespace=namespace,
                                body=crd_json)

    # Optionally expose the notebook on fixed external IPs.
    SERVICE_EXTERNAL_IP = conf.get('SERVICE_EXTERNAL_IP', None)
    if not SERVICE_EXTERNAL_IP and notebook.project.expand:
        SERVICE_EXTERNAL_IP = json.loads(notebook.project.expand).get(
            'SERVICE_EXTERNAL_IP', SERVICE_EXTERNAL_IP)
    # FIX: isinstance() instead of type() == str (idiomatic, handles subclasses).
    if isinstance(SERVICE_EXTERNAL_IP, str):
        SERVICE_EXTERNAL_IP = [SERVICE_EXTERNAL_IP]
    if SERVICE_EXTERNAL_IP:
        # Deterministic external port derived from the notebook id.
        service_ports = [[10000 + 10 * notebook.id + index, port]
                         for index, port in enumerate([port])]
        service_external_name = (notebook.name +
                                 "-external").lower()[:60].strip('-')
        k8s_client.create_service(namespace=namespace,
                                  name=service_external_name,
                                  username=notebook.created_by.username,
                                  ports=service_ports,
                                  selector={
                                      "app": notebook.name,
                                      'user': notebook.created_by.username
                                  },
                                  externalIPs=SERVICE_EXTERNAL_IP)
    return crd
def deploy_nni_service(self, nni, command):
    """Deploy the NNI hyper-parameter-tuning controller for an experiment.

    Tears down any existing deployment of the same name (best-effort),
    creates the controller pod, a service on port 8888, and an Istio
    VirtualService routing /nni/<name>/ to it.

    :param nni: NNI experiment ORM object (project, volume_mount, owner, ...)
    :param command: container command for the controller pod
    """
    # Platform pull secrets plus the current user's registry secrets.
    image_secrets = conf.get('HUBSECRET', [])
    user_hubsecrets = db.session.query(Repository.hubsecret).filter(
        Repository.created_by_fk == g.user.id).all()
    if user_hubsecrets:
        for hubsecret in user_hubsecrets:
            if hubsecret[0] not in image_secrets:
                image_secrets.append(hubsecret[0])

    from myapp.utils.py.py_k8s import K8s
    k8s_client = K8s(nni.project.cluster.get('KUBECONFIG', ''))
    namespace = conf.get('KATIB_NAMESPACE')
    run_id = 'nni-' + nni.name
    # Best-effort cleanup of a previous deployment; a read failure (e.g. the
    # deployment does not exist) is expected and only logged.
    try:
        nni_deploy = k8s_client.AppsV1Api.read_namespaced_deployment(
            name=nni.name, namespace=namespace)
        if nni_deploy:
            print('exist nni deploy')
            k8s_client.AppsV1Api.delete_namespaced_deployment(
                name=nni.name, namespace=namespace)
            # return
    except Exception as e:
        print(e)

    # Mount the host zoneinfo so the container runs in Asia/Shanghai time.
    volume_mount = nni.volume_mount + ",/usr/share/zoneinfo/Asia/Shanghai(hostpath):/etc/localtime"
    labels = {
        "nni": nni.name,
        "username": nni.created_by.username,
        'run-id': run_id
    }
    k8s_client.create_debug_pod(
        namespace=namespace,
        name=nni.name,
        labels=labels,
        command=command,
        args=None,
        volume_mount=volume_mount,
        working_dir='/mnt/%s' % nni.created_by.username,
        node_selector=nni.get_node_selector(),
        resource_memory='2G',
        resource_cpu='2',
        resource_gpu='0',
        image_pull_policy=conf.get('IMAGE_PULL_POLICY', 'Always'),
        image_pull_secrets=image_secrets,
        # Fall back to the worker image from job_json when NNI_IMAGES unset.
        image=conf.get('NNI_IMAGES',
                       json.loads(nni.job_json).get('job_worker_image')),
        hostAliases=conf.get('HOSTALIASES', ''),
        env=None,
        privileged=False,
        accounts='nni',
        username=nni.created_by.username)
    k8s_client.create_service(namespace=namespace,
                              name=nni.name,
                              username=nni.created_by.username,
                              ports=[8888],
                              selector=labels)

    host = nni.project.cluster.get('NNI_DOMAIN', request.host)
    if not host:
        host = request.host
    if ':' in host:
        host = host[:host.rindex(':')]  # strip the port if present
    vs_json = {
        "apiVersion": "networking.istio.io/v1alpha3",
        "kind": "VirtualService",
        "metadata": {
            "name": nni.name,
            "namespace": namespace
        },
        "spec": {
            "gateways": ["kubeflow/kubeflow-gateway"],
            # A bare IP cannot be used as an Istio host; match everything.
            "hosts": ["*" if core.checkip(host) else host],
            "http": [{
                # Match both the double-slash and normal prefix forms.
                "match": [{
                    "uri": {
                        "prefix": "/nni/%s//" % nni.name
                    }
                }, {
                    "uri": {
                        "prefix": "/nni/%s/" % nni.name
                    }
                }],
                "rewrite": {
                    "uri": "/nni/%s/" % nni.name
                },
                "route": [{
                    "destination": {
                        "host":
                            "%s.%s.svc.cluster.local" % (nni.name, namespace),
                        "port": {
                            "number": 8888
                        }
                    }
                }],
                "timeout": "300s"
            }]
        }
    }
    crd_info = conf.get('CRD_INFO')['virtualservice']
    # Replace any existing ingress/VirtualService for this experiment.
    k8s_client.delete_istio_ingress(namespace=namespace, name=nni.name)
    k8s_client.create_crd(group=crd_info['group'],
                          version=crd_info['version'],
                          plural=crd_info['plural'],
                          namespace=namespace,
                          body=vs_json)
def featureCheck(self):
    """Return cluster load info as an HTML popup payload for the home page.

    When the requested url is the home page, (re)collects per-node cpu/mem/gpu
    usage across all clusters into the module-level resource_used cache, then
    renders an HTML table of nodes grouped by org and device type.

    :returns: JSON dict with content/delay/hit/target/title/type keys,
              or an empty JSON object for non-home urls.
    """
    url = request.values.get("url", type=str, default=None)
    if '/myapp/home' in url:
        # NOTE(review): the leading "1 or" forces a refresh on every request,
        # defeating the 10-minute cache below — looks like debug leftover;
        # behavior kept as-is.
        if 1 or not resource_used['check_time'] or resource_used[
                'check_time'] < (datetime.datetime.now() -
                                 datetime.timedelta(minutes=10)):
            clusters = conf.get('CLUSTERS', {})
            for cluster_name in clusters:
                cluster = clusters[cluster_name]
                k8s_client = K8s(cluster.get('KUBECONFIG', ''))
                all_node = k8s_client.get_node()
                all_node_json = {}
                # Index nodes by host ip; only cpu/gpu-labelled nodes count.
                for node in all_node:
                    ip = node['hostip']
                    if 'cpu' in node['labels'] or 'gpu' in node['labels']:
                        all_node_json[ip] = node
                        all_node_json[ip]['used_memory'] = []
                        all_node_json[ip]['used_cpu'] = []
                        all_node_json[ip]['used_gpu'] = []
                        all_node_json[ip]['user'] = []
                # Accumulate per-node resource requests of Running pods.
                for namespace in [
                        'jupyter', 'pipeline', 'katib', 'service'
                ]:
                    all_pods = k8s_client.get_pods(namespace=namespace)
                    for pod in all_pods:
                        if pod['status'] == 'Running' and pod[
                                'host_ip'] in all_node_json:
                            all_node_json[
                                pod['host_ip']]['used_memory'].append(
                                    pod['memory'])
                            all_node_json[pod['host_ip']]['used_cpu'].append(
                                pod['cpu'])
                            all_node_json[pod['host_ip']]['used_gpu'].append(
                                pod['gpu'])
                # Collapse the per-pod lists into integer totals.
                for node in all_node_json:
                    all_node_json[node]['used_memory'] = int(
                        sum(all_node_json[node]['used_memory']))
                    all_node_json[node]['used_cpu'] = int(
                        sum(all_node_json[node]['used_cpu']))
                    all_node_json[node]['used_gpu'] = int(
                        sum(all_node_json[node]['used_gpu']))
                resource_used['data'][cluster_name] = all_node_json
            resource_used['check_time'] = datetime.datetime.now()
        all_node_json = resource_used['data']

        # Build the HTML table. FIX: the cell template previously closed
        # <td> with </th>, and rows ended with <tr> instead of </tr> —
        # both produced malformed HTML.
        message = ''
        td_html = '<td style="border: 1px solid black;padding: 10px">%s</td>'
        message += "<tr>%s %s %s %s %s %s %s</tr>" % (
            td_html % "集群", td_html % "资源组(监控)", td_html % "机器(进出)",
            td_html % "机型", td_html % "cpu占用率", td_html % "内存占用率",
            td_html % "gpu占用率")
        global_cluster_load = {}
        for cluster_name in all_node_json:
            global_cluster_load[cluster_name] = {
                "cpu_req": 0,
                "cpu_all": 0,
                "mem_req": 0,
                "mem_all": 0,
                "gpu_req": 0,
                "gpu_all": 0
            }
            nodes = all_node_json[cluster_name]
            # Group nodes by org and device type so the table rows cluster.
            stored_nodes = {}
            for ip in nodes:
                org = nodes[ip]['labels'].get('org', 'public')
                device = 'gpu/' + nodes[ip]['labels'].get(
                    'gpu-type', '') if 'gpu' in nodes[ip]['labels'] else 'cpu'
                if org not in stored_nodes:
                    stored_nodes[org] = {}
                if device not in stored_nodes[org]:
                    stored_nodes[org][device] = {}
                stored_nodes[org][device][ip] = nodes[ip]
            nodes = {}
            for org in stored_nodes:
                for device in stored_nodes[org]:
                    nodes.update(stored_nodes[org][device])

            cluster_config = conf.get('CLUSTERS', {}).get(cluster_name, {})
            grafana_url = cluster_config.get(
                'GRAFANA_HOST',
                '').strip('/') + conf.get('GRAFANA_CLUSTER_PATH')
            for ip in nodes:
                org = nodes[ip]['labels'].get('org', 'public')
                enable_train = nodes[ip]['labels'].get('train', 'true')
                # Admins get a link to toggle node scheduling; disabled
                # (train!=true) nodes are struck through.
                if g.user.is_admin():
                    if enable_train == 'true':
                        ip_html = '<a href="%s">%s</a>' % (
                            "/myapp/schedule/node/%s" % ip, ip)
                    else:
                        ip_html = '<a href="%s"><strike>%s</strike></a>' % (
                            "/myapp/schedule/node/%s" % ip, ip)
                else:
                    if enable_train == 'true':
                        ip_html = ip
                    else:
                        ip_html = '<strike>%s</strike>' % (ip, )
                share = nodes[ip]['labels'].get('share', 'true')
                # Non-shared nodes get a grey background.
                row_color = "#FFFFFF" if share == 'true' else '#F0F0F0'
                message += '<tr bgcolor="%s">%s %s %s %s %s %s %s</tr>' % (
                    row_color, td_html % cluster_name, td_html %
                    ('<a target="blank" href="%s">%s</a>' %
                     (grafana_url + org, org)), td_html % ip_html, td_html %
                    ('gpu/' + nodes[ip]['labels'].get('gpu-type', '')
                     if 'gpu' in nodes[ip]['labels'] else 'cpu'), td_html %
                    ("cpu:%s/%s" %
                     (nodes[ip]['used_cpu'], nodes[ip]['cpu'])), td_html %
                    ("mem:%s/%s" %
                     (nodes[ip]['used_memory'], nodes[ip]['memory'])),
                    td_html %
                    ("gpu:%s/%s" %
                     (nodes[ip]['used_gpu'], nodes[ip]['gpu'])),
                )
                global_cluster_load[cluster_name]['cpu_req'] += int(
                    nodes[ip]['used_cpu'])
                global_cluster_load[cluster_name]['cpu_all'] += int(
                    nodes[ip]['cpu'])
                global_cluster_load[cluster_name]['mem_req'] += int(
                    nodes[ip]['used_memory'])
                global_cluster_load[cluster_name]['mem_all'] += int(
                    nodes[ip]['memory'])
                global_cluster_load[cluster_name]['gpu_req'] += int(
                    nodes[ip]['used_gpu'])
                global_cluster_load[cluster_name]['gpu_all'] += int(
                    nodes[ip]['gpu'])

        message = Markup('<table>%s</table>' % message)
        cluster_global_info = ''
        # Payload format: delay (ms before auto-hide, 0 = never), hit
        # (whether to show), target (url), type ('html'), title, content.
        data = {
            'content': message,
            'delay': 300000,
            'hit': True,
            'target': url,
            'title': '当前负载(%s)' % cluster_global_info,
            'type': 'html',
        }
        return jsonify(data)
    return jsonify({})