Пример #1
0
def check_docker_commit(task, docker_id):  # receiver and id are supplied manually when testing from the web page
    """Wait for a docker-commit pod to finish and record the resulting image.

    Polls the ``docker-commit-<username>-<id>`` pod in the notebook namespace.
    When the pod reports ``Succeeded``, ``docker.last_image`` is set to
    ``docker.target_image`` and committed.  Any status other than
    ``Running``/``Succeeded`` is reported to the admin users through
    ``push_message``.  Polling stops when the pod can no longer be found or
    once 30 minutes have elapsed.

    Args:
        task: Not used by this function.
        docker_id: Primary key of the ``Docker`` row; converted with ``int()``.

    All exceptions are caught and printed; nothing is raised to the caller.
    """
    # NOTE(review): the original slept 12000 s per poll, far longer than the
    # 30 minute budget; 12 s is assumed to be the intended interval - confirm.
    poll_interval_seconds = 12
    timeout_seconds = 1800  # allow at most 30 minutes for commit + push
    with session_scope(nullpool=True) as dbsession:
        try:
            docker = dbsession.query(Docker).filter_by(
                id=int(docker_id)).first()
            if docker is None:
                print('docker %s not found' % docker_id)
                return
            pod_name = "docker-commit-%s-%s" % (docker.created_by.username,
                                                str(docker.id))
            namespace = conf.get('NOTEBOOK_NAMESPACE')
            k8s_client = K8s(
                conf.get('CLUSTERS').get(
                    conf.get('ENVIRONMENT')).get('KUBECONFIG'))
            begin_time = datetime.datetime.now()
            # The elapsed time is re-evaluated on every iteration so that the
            # timeout can actually trigger.
            while ((datetime.datetime.now() - begin_time).total_seconds() <
                   timeout_seconds):
                time.sleep(poll_interval_seconds)
                commit_pods = k8s_client.get_pods(namespace=namespace,
                                                  pod_name=pod_name)
                if not commit_pods:
                    # The pod is gone; nothing left to watch.
                    break
                commit_pod = commit_pods[0]
                if commit_pod['status'] == 'Succeeded':
                    docker.last_image = docker.target_image
                    dbsession.commit()
                    break
                # Any other abnormal status is reported immediately.
                if commit_pod['status'] != 'Running':
                    push_message(
                        conf.get('ADMIN_USER').split(','),
                        'commit pod %s not running' % commit_pod['name'])
                    break

        except Exception as e:
            print(e)
Пример #2
0
    def listen(self):
        """Return the pod that belongs to a run as a JSON response.

        Reads ``run_id`` from the JSON request body, derives the pod name
        ``venus-<run_id>`` (underscores replaced by dashes, lower-cased,
        truncated to 60 characters, surrounding dashes stripped) and looks it
        up in the pipeline namespace.

        Returns:
            A 200 response with the first matching pod serialised as JSON,
            or a 400 response when ``run_id`` is missing or no pod is found.
        """
        # request.json may be None when the body is not JSON.
        request_data = request.json or {}
        run_id = str(request_data.get('run_id') or '').replace('_', '-')
        if not run_id:
            response = make_response("输入参数不齐全")
            response.status_code = 400
            return response

        from myapp.utils.py.py_k8s import K8s
        k8s = K8s()
        namespace = conf.get('PIPELINE_NAMESPACE')
        pod_name = ("venus-" + run_id).lower()[:60].strip('-')
        pods = k8s.get_pods(namespace=namespace, pod_name=pod_name)
        if not pods:
            response = make_response('no pod')
            response.status_code = 400
            return response

        pod = pods[0]
        start_time = pod.get('start_time')
        if isinstance(start_time, datetime.datetime):
            # Year-month-day; the previous format string swapped day and month.
            pod['start_time'] = start_time.strftime("%Y-%m-%d %H:%M:%S")
        print(pod)
        response = make_response(json.dumps(pod))
        response.status_code = 200
        return response
Пример #3
0
    def web_log(self, cluster_name, namespace, pod_name):
        """Render a page that embeds the dashboard log view of a pod.

        The dashboard base URL and kubeconfig come from the matching entry of
        ``CLUSTERS`` when ``cluster_name`` is configured there; otherwise the
        global ``K8S_DASHBOARD_CLUSTER`` setting and no kubeconfig are used.
        If the pod exists, its current status is flashed to the user.

        Returns:
            ``link.html`` when ``cluster_name`` equals the configured
            ``ENVIRONMENT``, ``external_link.html`` otherwise.
        """
        from myapp.utils.py.py_k8s import K8s
        known_clusters = conf.get('CLUSTERS', {})
        # The log route is the same for both branches; only the base differs.
        log_route = "#/log/%s/%s/pod?namespace=%s&container=%s" % (
            namespace, pod_name, namespace, pod_name)
        if cluster_name in known_clusters:
            cluster_conf = known_clusters[cluster_name]
            kubeconfig = cluster_conf.get('KUBECONFIG', '')
            dashboard_base = cluster_conf.get('K8S_DASHBOARD_CLUSTER')
        else:
            kubeconfig = None
            dashboard_base = conf.get('K8S_DASHBOARD_CLUSTER')
        pod_url = dashboard_base + log_route

        k8s = K8s(kubeconfig)
        matches = k8s.get_pods(namespace=namespace, pod_name=pod_name)
        if matches:
            flash('当前pod状态:%s' % matches[0]['status'], category='warning')

        data = {
            "url": pod_url,
            # Selector of the element shown in the embedded page.
            "target": 'div.kd-scroll-container',
            "delay": 2000,
            "loading": True,
            "currentHeight": 128
        }
        # Pick the template depending on whether this is the local cluster.
        is_local = cluster_name == conf.get('ENVIRONMENT')
        template = 'link.html' if is_local else 'external_link.html'
        return self.render_template(template, data=data)
Пример #4
0
    def build_mq_consumer(self, service_pipeline):
        """Create the configmap and deployment for a service pipeline consumer.

        A configmap named after the pipeline is created holding its DAG as
        ``dag.json``, followed by a deployment that runs
        ``python mq-pipeline/cube_kafka.py``.  The value of
        ``service_pipeline.command`` is not used for the container command.

        Args:
            service_pipeline: The service pipeline to deploy; its name,
                dag_json, env, resources, image and owner are read.
        """
        namespace = conf.get('SERVICE_PIPELINE_NAMESPACE')
        name = service_pipeline.name

        # Work on a copy: appending to the list held in the config would make
        # one user's secrets leak into every later call.
        image_secrets = list(conf.get('HUBSECRET', []) or [])
        user_hubsecrets = db.session.query(Repository.hubsecret).filter(
            Repository.created_by_fk == g.user.id).all()
        for hubsecret in user_hubsecrets:
            if hubsecret[0] not in image_secrets:
                image_secrets.append(hubsecret[0])

        from myapp.utils.py.py_k8s import K8s
        k8s_client = K8s(service_pipeline.project.cluster.get(
            'KUBECONFIG', ''))
        dag_json = service_pipeline.dag_json if service_pipeline.dag_json else '{}'

        # Configmap consumed by the service.
        config_data = {"dag.json": dag_json}
        k8s_client.create_configmap(namespace=namespace,
                                    name=name,
                                    data=config_data,
                                    labels={'app': name})

        # NOTE(review): this writes into the object returned by
        # service_pipeline.env - assumes it is a mutable mapping; confirm.
        env = service_pipeline.env
        jaeger_host = conf.get('SERVICE_PIPELINE_JAEGER', '')
        if jaeger_host:
            env['JAEGER_HOST'] = jaeger_host
            env['SERVICE_NAME'] = name

        k8s_client.create_deployment(
            namespace=namespace,
            name=name,
            replicas=service_pipeline.replicas,
            labels={
                "app": name,
                "username": service_pipeline.created_by.username
            },
            command=['bash', '-c', "python mq-pipeline/cube_kafka.py"],
            args=None,
            volume_mount=service_pipeline.volume_mount,
            working_dir=service_pipeline.working_dir,
            node_selector=service_pipeline.get_node_selector(),
            resource_memory=service_pipeline.resource_memory,
            resource_cpu=service_pipeline.resource_cpu,
            resource_gpu=service_pipeline.resource_gpu
            if service_pipeline.resource_gpu else '',
            image_pull_policy=conf.get('IMAGE_PULL_POLICY', 'Always'),
            image_pull_secrets=image_secrets,
            image=service_pipeline.images,
            hostAliases=conf.get('HOSTALIASES', ''),
            env=env,
            privileged=False,
            accounts=None,
            username=service_pipeline.created_by.username,
            ports=None)
Пример #5
0
 def delete_old_service(self, service_name, cluster):
     """Delete the deployment, both services and the istio ingress of a service.

     Args:
         service_name: Name of the deployment, service and ingress to delete.
         cluster: Mapping whose ``KUBECONFIG`` entry selects the cluster.
     """
     external_name = (service_name + "-external").lower()[:60].strip('-')
     from myapp.utils.py.py_k8s import K8s
     client = K8s(cluster.get('KUBECONFIG', ''))
     ns = conf.get('SERVICE_NAMESPACE')
     client.delete_deployment(namespace=ns, name=service_name)
     # The regular service first, then its "-external" counterpart.
     for svc_name in (service_name, external_name):
         client.delete_service(namespace=ns, name=svc_name)
     client.delete_istio_ingress(namespace=ns, name=service_name)
Пример #6
0
 def delete_pod(self, docker_id):
     """Delete the debug pod of a docker record and return to the list page.

     Args:
         docker_id: Primary key of the ``Docker`` row.

     Returns:
         A redirect to ``/docker_modelview/list/``.
     """
     docker = db.session.query(Docker).filter_by(id=docker_id).first()
     from myapp.utils.py.py_k8s import K8s
     cluster_conf = conf.get('CLUSTERS').get(conf.get('ENVIRONMENT'))
     k8s_client = K8s(cluster_conf.get('KUBECONFIG', ''))
     namespace = conf.get('NOTEBOOK_NAMESPACE')
     owner = docker.created_by.username
     pod_name = "docker-%s-%s" % (owner, str(docker.id))
     k8s_client.delete_pods(namespace=namespace, pod_name=pod_name)
     flash('清理结束,可重新进行调试', 'success')
     return redirect("/docker_modelview/list/")
Пример #7
0
    def clear(self, service_pipeline_id):
        """Delete the deployment of a service pipeline and return to the list.

        Args:
            service_pipeline_id: Primary key of the ``Service_Pipeline`` row.

        Returns:
            A redirect to ``/service_pipeline_modelview/list/``.
        """
        query = db.session.query(Service_Pipeline)
        service_pipeline = query.filter_by(id=service_pipeline_id).first()

        from myapp.utils.py.py_k8s import K8s
        kubeconfig = service_pipeline.project.cluster.get('KUBECONFIG', '')
        k8s_client = K8s(kubeconfig)
        k8s_client.delete_deployment(
            namespace=conf.get('SERVICE_PIPELINE_NAMESPACE'),
            name=service_pipeline.name)

        flash('服务清理完成', category='warning')
        return redirect('/service_pipeline_modelview/list/')
Пример #8
0
    def log_task(self, nni_id):
        """Redirect to the log page of the pod that belongs to an NNI record.

        Args:
            nni_id: Primary key of the ``NNI`` row.

        Returns:
            A redirect to the web log page when a pod named after the record
            exists in the katib namespace; otherwise a redirect to the NNI
            list page, with a flashed notice.
        """
        nni = db.session.query(NNI).filter_by(id=nni_id).first()
        from myapp.utils.py.py_k8s import K8s
        k8s = K8s(nni.project.cluster.get('KUBECONFIG', ''))
        namespace = conf.get('KATIB_NAMESPACE')
        found = k8s.get_pods(namespace=namespace, pod_name=nni.name)
        if not found:
            flash("未检测到当前搜索正在运行的容器", category='success')
            return redirect('/nni_modelview/list/')

        log_target = (nni.project.cluster['NAME'], namespace, nni.name)
        return redirect("/myapp/web/log/%s/%s/%s" % log_target)
Пример #9
0
    def clear_task(self, task_id):
        """Delete the run pod and the debug pod of a task.

        For each of the two pods that exists, the pod itself is deleted; when
        it carries a ``run-id`` label, the matching workflow and every pod
        with that label are deleted as well, followed by a two second pause.

        Args:
            task_id: Primary key of the ``Task`` row.

        Returns:
            A redirect to the web page of the task's pipeline.
        """
        task = db.session.query(Task).filter_by(id=task_id).first()
        from myapp.utils.py.py_k8s import K8s
        k8s_client = K8s(task.pipeline.project.cluster.get('KUBECONFIG', ''))
        namespace = conf.get('PIPELINE_NAMESPACE')

        # The runtime container is handled first, then the debug container.
        for prefix in ("run-", "debug-"):
            full_name = (prefix + task.pipeline.name.replace('_', '-') + "-" +
                         task.name.replace('_', '-'))
            pod_name = full_name.lower()[:60].strip('-')
            found = k8s_client.get_pods(namespace=namespace,
                                        pod_name=pod_name)
            existing = found[0] if found else None
            if not existing:
                continue

            # A pod from an earlier run exists: delete it directly.
            k8s_client.delete_pods(namespace=namespace,
                                   pod_name=existing['name'])
            run_id = existing['labels'].get('run-id', '')
            if not run_id:
                continue
            k8s_client.delete_workflow(
                all_crd_info=conf.get("CRD_INFO", {}),
                namespace=namespace,
                run_id=run_id)
            k8s_client.delete_pods(namespace=namespace,
                                   labels={"run-id": run_id})
            time.sleep(2)

        flash("删除完成", category='success')
        return redirect('/pipeline_modelview/web/%s' % str(task.pipeline.id))
Пример #10
0
    def create_experiment(self, id):
        """Create the experiment custom resource of a hyperparameter tuning record.

        Nothing is created when no record with the given id exists.

        Args:
            id: Primary key of the ``Hyperparameter_Tuning`` row; converted
                with ``int()``.

        Returns:
            A redirect to the location given by ``self.get_redirect()``.
        """
        record_id = int(id)
        hp = db.session.query(Hyperparameter_Tuning).filter(
            Hyperparameter_Tuning.id == record_id).first()
        if hp:
            from myapp.utils.py.py_k8s import K8s
            k8s_client = K8s(hp.project.cluster.get('KUBECONFIG', ''))
            namespace = conf.get('KATIB_NAMESPACE')
            experiment_crd = conf.get('CRD_INFO')['experiment']
            print(hp.experiment)
            k8s_client.create_crd(
                group=experiment_crd['group'],
                version=experiment_crd['version'],
                plural=experiment_crd['plural'],
                namespace=namespace,
                body=hp.experiment)
            flash('部署完成', 'success')

        self.update_redirect()
        return redirect(self.get_redirect())
Пример #11
0
    def log_task(self, task_id):
        """Redirect to the log page of the run pod of a task.

        Args:
            task_id: Primary key of the ``Task`` row.

        Returns:
            A redirect to the web log page when the ``run-`` pod exists in the
            pipeline namespace; otherwise a redirect to the pipeline page,
            with a flashed notice.
        """
        task = db.session.query(Task).filter_by(id=task_id).first()
        from myapp.utils.py.py_k8s import K8s
        cluster = task.pipeline.project.cluster
        k8s = K8s(cluster.get('KUBECONFIG', ''))
        namespace = conf.get('PIPELINE_NAMESPACE')
        name_parts = [
            "run-" + task.pipeline.name.replace('_', '-'),
            task.name.replace('_', '-'),
        ]
        pod_name = "-".join(name_parts).lower()[:60].strip('-')
        found = k8s.get_pods(namespace=namespace, pod_name=pod_name)
        if not found:
            flash("未检测到当前task正在运行的容器", category='success')
            return redirect('/pipeline_modelview/web/%s' %
                            str(task.pipeline.id))

        return redirect("/myapp/web/log/%s/%s/%s" %
                        (task.pipeline.project.cluster['NAME'], namespace,
                         pod_name))
Пример #12
0
 def apply_hubsecret(self, hubsecret):
     """Apply an image-pull secret to every configured cluster and namespace.

     The distinct ``KUBECONFIG`` values of all entries in ``CLUSTERS`` plus
     the empty string are visited; for each, the secret is applied in every
     namespace listed in ``HUBSECRET_NAMESPACE``.

     Args:
         hubsecret: Object providing ``hubsecret`` (the secret name),
             ``user``, ``password`` and ``server``.
     """
     from myapp.utils.py.py_k8s import K8s
     cluster_confs = conf.get('CLUSTERS', {})
     # De-duplicate so that each kubeconfig is handled once.
     distinct_kubeconfigs = {
         cluster_confs[cluster_name].get('KUBECONFIG', '')
         for cluster_name in cluster_confs
     }
     distinct_kubeconfigs.add('')
     for kubeconfig in distinct_kubeconfigs:
         k8s = K8s(kubeconfig)
         for namespace in conf.get('HUBSECRET_NAMESPACE'):
             k8s.apply_hubsecret(namespace=namespace,
                                 name=hubsecret.hubsecret,
                                 user=hubsecret.user,
                                 password=hubsecret.password,
                                 server=hubsecret.server)
Пример #13
0
 def make_container(service, mykfservice):
     """Build the container object of a service through ``K8s.make_container``.

     Args:
         service: Provides name, command, images, working_dir, env, the
             resource settings and the creating user.
         mykfservice: Its ``name`` prefixes the container name.

     Returns:
         Whatever ``K8s.make_container`` returns.
     """
     from myapp.utils.py.py_k8s import K8s
     k8s = K8s()  # nothing is deployed here, so no cluster config is needed
     container_name = mykfservice.name + "-" + service.name
     if service.command:
         command = ["sh", "-c", service.command]
     else:
         command = None
     return k8s.make_container(
         name=container_name,
         command=command,
         args=None,
         volume_mount=None,
         image_pull_policy=conf.get('IMAGE_PULL_POLICY', 'Always'),
         image=service.images,
         working_dir=service.working_dir or None,
         env=service.env,
         resource_memory=service.resource_memory,
         resource_cpu=service.resource_cpu,
         resource_gpu=service.resource_gpu,
         username=service.created_by.username)
Пример #14
0
    def schedule_node(self, ip):
        """Toggle the ``train`` label of the node with the given ip.

        The node is searched in the per-cluster data of ``resource_used``.
        For the first cluster in which the node can also be fetched live, its
        ``train`` label is flipped between ``"true"`` and ``"false"`` (a
        missing label counts as ``"true"``).

        Args:
            ip: Key identifying the node inside each cluster's node data.

        Returns:
            A redirect to ``/myapp/home``.
        """
        nodes_by_cluster = resource_used['data']
        for cluster_name in nodes_by_cluster:
            if ip not in nodes_by_cluster[cluster_name]:
                continue
            cluster_conf = conf.get('CLUSTERS', {})[cluster_name]
            k8s_client = K8s(cluster_conf.get('KUBECONFIG', ''))
            # Fetch the latest node information.
            live_nodes = k8s_client.get_node(ip=ip)
            if not live_nodes:
                continue
            train_enabled = live_nodes[0]['labels'].get('train', 'true')
            if train_enabled == 'true':
                new_value = "false"
            else:
                new_value = "true"
            k8s_client.label_node([ip], {"train": new_value})
            break

        return redirect('/myapp/home')
Пример #15
0
    def debug(self, docker_id):
        """Start (or reuse) the debug pod of a docker record and open it.

        A pod that already finished (``Succeeded``) is deleted first.  A new
        debug pod is created unless one is currently ``Running`` or
        ``Pending``.  The pod is then polled up to five times, two seconds
        apart, until it reports ``Running``.

        Args:
            docker_id: Primary key of the ``Docker`` row.

        Returns:
            A redirect to the web debug page once the pod is running,
            otherwise a redirect to the docker list page with a warning.
        """
        docker = db.session.query(Docker).filter_by(id=docker_id).first()
        from myapp.utils.py.py_k8s import K8s
        cluster_conf = conf.get('CLUSTERS').get(conf.get('ENVIRONMENT'))
        k8s_client = K8s(cluster_conf.get('KUBECONFIG', ''))
        namespace = conf.get('NOTEBOOK_NAMESPACE')
        pod_name = "docker-%s-%s" % (docker.created_by.username,
                                     str(docker.id))
        found = k8s_client.get_pods(namespace=namespace, pod_name=pod_name)
        pod = found[0] if found else None

        # A finished pod from an earlier session is removed straight away.
        if pod and pod['status'] == 'Succeeded':
            k8s_client.delete_pods(namespace=namespace, pod_name=pod_name)
            time.sleep(2)
            pod = None

        # Create a pod when there is none, or it is neither running nor pending.
        if not pod or pod['status'] not in ('Running', 'Pending'):
            command = [
                'sh', '-c',
                'sleep 7200 && hour=`date +%H` && while [ $hour -ge 06 ];do sleep 3600;hour=`date +%H`;done'
            ]
            hostAliases = conf.get('HOSTALIASES')
            default_volume_mount = docker.project.volume_mount
            # Per-record overrides stored as JSON; empty means "no overrides".
            expand = json.loads(docker.expand) if docker.expand else {}
            if docker.need_gpu:
                node_kind = 'gpu'
                resource_gpu = expand.get('resource_gpu', '1')
            else:
                node_kind = 'cpu'
                resource_gpu = '0'
            if docker.last_image and docker.consecutive_build:
                image = docker.last_image
            else:
                image = docker.base_image

            k8s_client.create_debug_pod(
                namespace,
                name=pod_name,
                command=command,
                labels={},
                args=None,
                volume_mount=expand.get('volume_mount', default_volume_mount),
                working_dir='/mnt/%s' % docker.created_by.username,
                node_selector='%s=true,train=true,org=public' % node_kind,
                resource_memory=expand.get('resource_memory', '8G'),
                resource_cpu=expand.get('resource_cpu', '4'),
                resource_gpu=resource_gpu,
                image_pull_policy=conf.get('IMAGE_PULL_POLICY', 'Always'),
                image_pull_secrets=conf.get('HUBSECRET', []),
                image=image,
                hostAliases=hostAliases,
                env=None,
                privileged=None,
                accounts=None,
                username=docker.created_by.username)

        # Wait for the pod to reach the Running state.
        for _ in range(5):
            found = k8s_client.get_pods(namespace=namespace,
                                        pod_name=pod_name)
            current = found[0] if found else None
            if current and current['status'] == 'Running':
                break
            time.sleep(2)
        else:
            flash('启动时间过长,一分钟后重试', 'warning')
            return redirect('/docker_modelview/list/')

        flash('镜像调试只安装环境,请不要运行业务代码。当晚前请注意保存镜像', 'warning')
        return redirect("/docker_modelview/web/debug/%s/%s/%s" %
                        (conf.get('ENVIRONMENT'), namespace, pod_name))
Пример #16
0
    def run_task(self, task_id):
        """Run a single task as a pod and redirect to its log page.

        An existing ``run-`` pod (and the workflow named by its ``run-id``
        label) is deleted first, waiting until it has disappeared; the wait
        is abandoned once more than 60 seconds have passed.  The pod is then
        started through ``self.run_pod`` and polled up to five times, two
        seconds apart, until it shows up.

        Args:
            task_id: Primary key of the ``Task`` row.

        Returns:
            A redirect to the web log page of the pod, or a redirect to the
            pipeline page with a warning when deletion or start-up timed out.
        """
        task = db.session.query(Task).filter_by(id=task_id).first()
        from myapp.utils.py.py_k8s import K8s
        k8s_client = K8s(task.pipeline.project.cluster.get('KUBECONFIG', ''))
        namespace = conf.get('PIPELINE_NAMESPACE')
        full_name = ("run-" + task.pipeline.name.replace('_', '-') + "-" +
                     task.name.replace('_', '-'))
        pod_name = full_name.lower()[:60].strip('-')
        found = k8s_client.get_pods(namespace=namespace, pod_name=pod_name)
        pod = found[0] if found else found

        # A pod from an earlier run exists: delete it and wait until it is gone.
        if pod:
            old_run_id = pod['labels'].get("run-id", '')
            if old_run_id:
                k8s_client.delete_workflow(
                    all_crd_info=conf.get('CRD_INFO', {}),
                    namespace=namespace,
                    run_id=old_run_id)

            k8s_client.delete_pods(namespace=namespace, pod_name=pod_name)
            deleted_at = datetime.datetime.now()
            while pod:
                time.sleep(2)
                pod = k8s_client.get_pods(namespace=namespace,
                                          pod_name=pod_name)
                waited = datetime.datetime.now() - deleted_at
                if waited.seconds > 60:
                    flash("超时,请稍后重试", category='warning')
                    return redirect('/pipeline_modelview/web/%s' %
                                    str(task.pipeline.id))

        # No pod is left: create a fresh one.
        if not pod:
            command = None
            if task.job_template.entrypoint:
                command = task.job_template.entrypoint
            if task.command:
                # The task's own command overrides the template entrypoint.
                command = task.command
            if command:
                command = [piece for piece in command.split(" ") if piece]

            ops_args = []
            task_args = json.loads(task.args) if task.args else {}
            for arg_name in task_args:
                arg_value = task_args[arg_name]
                if type(arg_value) == bool:
                    # Boolean flags contribute only their name, and only when set.
                    if arg_value:
                        ops_args.append(str(arg_name))
                    continue
                if type(arg_value) == dict or type(arg_value) == list:
                    ops_args.append(str(arg_name))
                    ops_args.append(
                        json.dumps(arg_value, ensure_ascii=False))
                    continue
                if not arg_value:
                    # Empty values are left out entirely.
                    continue
                ops_args.append(str(arg_name))
                ops_args.append(str(arg_value))

            run_id = "run-" + str(task.pipeline.id) + "-" + str(task.id)

            # Customize jobs take image, workdir and command from the task args.
            if task.job_template.name == conf.get('CUSTOMIZE_JOB'):
                custom = json.loads(task.args)
                image = custom['images']
                working_dir = custom['workdir']
                pod_command = ['bash', '-c', custom['command']]
                pod_args = None
            else:
                image = task.job_template.images.name
                working_dir = task.job_template.workdir
                pod_command = command
                pod_args = ops_args

            self.run_pod(
                task=task,
                k8s_client=k8s_client,
                run_id=run_id,
                namespace=namespace,
                pod_name=pod_name,
                image=image,
                working_dir=working_dir,
                command=pod_command,
                args=pod_args)

        # Wait for the pod to show up.
        for _ in range(5):
            pod = k8s_client.get_pods(namespace=namespace, pod_name=pod_name)
            if pod:
                break
            time.sleep(2)
        else:
            flash('启动时间过长,一分钟后重试', 'warning')
            return redirect('/pipeline_modelview/web/%s' %
                            str(task.pipeline.id))

        return redirect(
            "/myapp/web/log/%s/%s/%s" %
            (task.pipeline.project.cluster['NAME'], namespace, pod_name))
0
    def debug(self, task_id):
        """Start (or reuse) the debug pod of a task and open it.

        For templates other than the customize job, only admins and the
        template's creator may debug.  A pod that already finished
        (``Succeeded``) is deleted first; a new pod is started through
        ``self.run_pod`` unless one is currently ``Running``.  The pod is
        then polled up to five times, two seconds apart, until it reports
        ``Running``.

        Args:
            task_id: Primary key of the ``Task`` row.

        Returns:
            A redirect to the web debug page once the pod is running,
            otherwise a redirect to the pipeline page with a warning.
        """
        task = db.session.query(Task).filter_by(id=task_id).first()
        if task.job_template.name != conf.get('CUSTOMIZE_JOB'):
            is_owner = (
                task.job_template.created_by.username == g.user.username)
            if not (g.user.is_admin() or is_owner):
                flash('仅管理员或当前任务模板创建者,可启动debug模式', 'warning')
                return redirect('/pipeline_modelview/web/%s' %
                                str(task.pipeline.id))

        from myapp.utils.py.py_k8s import K8s
        k8s_client = K8s(task.pipeline.project.cluster.get('KUBECONFIG', ''))
        namespace = conf.get('PIPELINE_NAMESPACE')
        full_name = ("debug-" + task.pipeline.name.replace('_', '-') + "-" +
                     task.name.replace('_', '-'))
        pod_name = full_name.lower()[:60].strip('-')
        found = k8s_client.get_pods(namespace=namespace, pod_name=pod_name)
        pod = found[0] if found else None

        # A finished pod from an earlier session is removed straight away.
        if pod and pod['status'] == 'Succeeded':
            k8s_client.delete_pods(namespace=namespace, pod_name=pod_name)
            time.sleep(2)
            pod = None

        # Start a pod when there is none, or the existing one is not running.
        if not pod or pod['status'] != 'Running':
            run_id = "debug-" + str(uuid.uuid4().hex)
            command = [
                'sh', '-c',
                'sleep 7200 && hour=`date +%H` && while [ $hour -ge 06 ];do sleep 3600;hour=`date +%H`;done'
            ]
            if task.job_template.name == conf.get('CUSTOMIZE_JOB'):
                image = json.loads(task.args)['images']
            else:
                image = task.job_template.images.name
            self.run_pod(task=task,
                         k8s_client=k8s_client,
                         run_id=run_id,
                         namespace=namespace,
                         pod_name=pod_name,
                         image=image,
                         working_dir='/mnt',
                         command=command,
                         args=None)

        # Wait for the pod to reach the Running state.
        for _ in range(5):
            found = k8s_client.get_pods(namespace=namespace,
                                        pod_name=pod_name)
            current = found[0] if found else None
            if current and current['status'] == 'Running':
                break
            time.sleep(2)
        else:
            flash('启动时间过长,一分钟后重试', 'warning')
            return redirect('/pipeline_modelview/web/%s' %
                            str(task.pipeline.id))

        return redirect("/myapp/web/debug/%s/%s/%s/%s" %
                        (task.pipeline.project.cluster['NAME'], namespace,
                         pod_name, pod_name))
0
    def deploy(self, service_id):
        """Deploy a service: deployment, service, istio ingress, external service.

        Steps, in order:
          1. create the deployment with the configured and the current
             user's image-pull secrets;
          2. create the cluster service for the declared ports;
          3. create the istio ingress for the service host;
          4. when external IPs are configured (globally or in the project's
             ``expand`` JSON), create an additional ``-external`` service.

        Args:
            service_id: Primary key of the ``Service`` row.

        Returns:
            A redirect to ``/service_modelview/list/``.
        """
        # Work on a copy: appending to the list held in the config would make
        # one user's secrets leak into every later call.
        image_secrets = list(conf.get('HUBSECRET', []) or [])
        user_hubsecrets = db.session.query(Repository.hubsecret).filter(
            Repository.created_by_fk == g.user.id).all()
        for hubsecret in user_hubsecrets:
            if hubsecret[0] not in image_secrets:
                image_secrets.append(hubsecret[0])

        service = db.session.query(Service).filter_by(id=service_id).first()
        from myapp.utils.py.py_k8s import K8s
        k8s_client = K8s(service.project.cluster.get('KUBECONFIG', ''))
        namespace = conf.get('SERVICE_NAMESPACE')

        # Parsed once and shared by the deployment and both services.
        ports = [int(port) for port in service.ports.split(',')]

        k8s_client.create_deployment(
            namespace=namespace,
            name=service.name,
            replicas=service.replicas,
            labels={"app": service.name,
                    "username": service.created_by.username},
            command=['bash', '-c', service.command] if service.command else None,
            args=None,
            volume_mount=service.volume_mount,
            working_dir=service.working_dir,
            node_selector=service.get_node_selector(),
            resource_memory=service.resource_memory,
            resource_cpu=service.resource_cpu,
            resource_gpu=service.resource_gpu if service.resource_gpu else '',
            image_pull_policy=conf.get('IMAGE_PULL_POLICY', 'Always'),
            image_pull_secrets=image_secrets,
            image=service.images,
            hostAliases=conf.get('HOSTALIASES', ''),
            env=service.env,
            privileged=False,
            accounts=None,
            username=service.created_by.username,
            ports=ports)

        k8s_client.create_service(
            namespace=namespace,
            name=service.name,
            username=service.created_by.username,
            ports=ports)

        # An explicitly configured host wins over the default
        # "<name>.<SERVICE_DOMAIN>"; scheme, path and port are cut off.
        host = service.name + "." + conf.get('SERVICE_DOMAIN')
        if service.host:
            host = service.host.replace('http://', '').replace('https://', '').strip()
            if "/" in host:
                host = host[:host.index("/")]
            if ":" in host:
                host = host[:host.index(":")]
        k8s_client.create_istio_ingress(namespace=namespace,
                                        name=service.name,
                                        host=host,
                                        ports=service.ports.split(','))

        # Access by IP goes through proxy IPs, so that scaling the serving
        # machines up or down does not change the address.
        external_ips = conf.get('SERVICE_EXTERNAL_IP', None)
        if not external_ips and service.project.expand:
            external_ips = json.loads(service.project.expand).get(
                'SERVICE_EXTERNAL_IP', external_ips)
        # Normalise a single address to a list regardless of where it came
        # from; a blank string means "not configured".
        if isinstance(external_ips, str):
            external_ips = [external_ips] if external_ips.strip() else []

        if external_ips:
            # Each service gets a block of ten ports starting at 30000 + 10 * id.
            service_ports = [[30000 + 10 * service.id + index, port]
                             for index, port in enumerate(ports)]
            service_external_name = (service.name + "-external").lower()[:60].strip('-')
            k8s_client.create_service(
                namespace=namespace,
                name=service_external_name,
                username=service.created_by.username,
                ports=service_ports,
                selector={"app": service.name,
                          'user': service.created_by.username},
                externalIPs=external_ips)

        # NOTE: a disabled, commented-out draft that proxied the service
        # through an istio VirtualService used to follow here.

        flash('服务部署完成', category='warning')
        return redirect('/service_modelview/list/')
Пример #19
0
    def save(self, docker_id):
        """Commit the running debug container to an image and push it.

        Looks up the debug pod of the docker record, determines the node it
        runs on and the id of its container, then starts a
        ``docker-commit-...`` pod on that same node which runs
        ``docker commit`` and ``docker push`` against the host's docker
        socket.  An asynchronous ``check_docker_commit`` task is queued to
        follow up on the commit pod.

        Args:
            docker_id: Primary key of the ``Docker`` row.

        Returns:
            A redirect to the web log page of the commit pod, or a redirect
            to the docker list page with a warning when no debug container
            was found.
        """
        import shlex

        docker = db.session.query(Docker).filter_by(id=docker_id).first()
        from myapp.utils.py.py_k8s import K8s
        k8s_client = K8s(
            conf.get('CLUSTERS').get(conf.get('ENVIRONMENT')).get(
                'KUBECONFIG', ''))
        namespace = conf.get('NOTEBOOK_NAMESPACE')
        pod_name = "docker-%s-%s" % (docker.created_by.username, str(
            docker.id))
        # NOTE(review): presumably this call raises when the pod does not
        # exist instead of returning a falsy value - confirm.
        pod = k8s_client.v1.read_namespaced_pod(name=pod_name,
                                                namespace=namespace)
        node_name = ''
        container_id = ''
        if pod:
            node_name = pod.spec.node_name
            # container_statuses may be None; treat that as "no containers".
            containers = [
                container
                for container in (pod.status.container_statuses or [])
                if container.name == pod_name
            ]
            if containers:
                container_id = containers[0].container_id.replace(
                    'docker://', '')

        if not node_name or not container_id:
            flash('没有发现正在运行的调试镜像,请先调试惊险,安装环境后,再保存生成新镜像', category='warning')
            return redirect('/docker_modelview/list/')

        pod_name = "docker-commit-%s-%s" % (docker.created_by.username,
                                            str(docker.id))
        # The values end up inside an `sh -c` string, so they are quoted to
        # keep a crafted image name from injecting shell commands.
        quoted_image = shlex.quote(docker.target_image)
        command = [
            'sh', '-c',
            'docker commit %s %s && docker push %s' %
            (shlex.quote(container_id), quoted_image, quoted_image)
        ]
        hostAliases = conf.get('HOSTALIASES')
        k8s_client.create_debug_pod(
            namespace=namespace,
            name=pod_name,
            command=command,
            labels={},
            args=None,
            volume_mount='/var/run/docker.sock(hostpath):/var/run/docker.sock',
            working_dir='/mnt/%s' % docker.created_by.username,
            node_selector=None,
            resource_memory='4G',
            resource_cpu='4',
            resource_gpu='0',
            image_pull_policy=conf.get('IMAGE_PULL_POLICY', 'Always'),
            image_pull_secrets=conf.get('HUBSECRET', []),
            image='ccr.ccs.tencentyun.com/cube-studio/docker',
            hostAliases=hostAliases,
            env=None,
            privileged=None,
            accounts=None,
            username=docker.created_by.username,
            node_name=node_name)
        from myapp.tasks.async_task import check_docker_commit
        # Queue an asynchronous check of the commit pod; on completion it
        # updates last_image.
        kwargs = {"docker_id": docker.id}
        check_docker_commit.apply_async(kwargs=kwargs)

        return redirect("/myapp/web/log/%s/%s/%s" %
                        (conf.get('ENVIRONMENT'), namespace, pod_name))
Пример #20
0
from myapp.project import push_admin, push_message
from myapp import app, db, security_manager
from myapp.models.model_job import (Pipeline, Workflow, Task)
from myapp.utils.celery import session_scope
# Application configuration and the Prometheus client shared by this module.
conf = app.config
prometheus = Prometheus(conf.get('PROMETHEUS', ''))

# The target cluster comes from the ENVIRONMENT variable; without it, or
# without a matching entry in CLUSTERS, the process exits with status 1.
cluster = os.getenv('ENVIRONMENT', '').lower()
if not cluster:
    print('no cluster %s' % cluster)
    exit(1)

clusters = conf.get('CLUSTERS', {})
if not (clusters and cluster in clusters):
    print('no kubeconfig in cluster %s' % cluster)
    exit(1)

# Build the Kubernetes client from the kubeconfig registered for the cluster.
kubeconfig = clusters[cluster].get('KUBECONFIG', '')
k8s_client = K8s(kubeconfig)


# Push a WeChat message
def deliver_message(workflow, dbsession):
    """Collect the receivers of a workflow notification.

    Does nothing when ``workflow`` is falsy. Otherwise the comma-separated
    ``workflow.username`` is split into a list of whitespace-stripped names.
    """
    if not workflow:
        return

    receivers = [name.strip() for name in workflow.username.split(',')]
Пример #21
0
def upgrade_service(task, service_id, name, namespace):
    """Take older online versions offline once the new deployment is ready.

    Polls the deployment ``name`` in ``namespace`` once a minute until its
    ready replica count equals its replica count, giving up after roughly
    ten minutes. When the new version is ready, every other online
    ``InferenceService`` with the same model name and host is deleted from
    the cluster and marked ``offline``. Progress and failures are reported
    through ``push_admin`` and ``push_message``.

    Args:
        task: Not used inside this function.
        service_id: Primary key of the ``InferenceService`` being upgraded.
        name: Name of the Kubernetes deployment to watch.
        namespace: Namespace of that deployment.
    """
    # The old version may only be removed after the new one is ready; wait a
    # little before the first poll.
    time.sleep(10)
    with session_scope(nullpool=True) as dbsession:
        # Bound before the try block so the error handler below never touches
        # an unbound name when the lookup itself fails.
        service = None
        try:
            service = dbsession.query(InferenceService).filter_by(
                id=int(service_id)).first()
            if service is None:
                push_admin('部署升级报错 service_id=%s: service not found' %
                           service_id)
                return

            def timestamp():
                return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            def notify(text):
                # Every status line goes to the admins and the service owner.
                push_admin(text)
                push_message([service.created_by.username], text)

            notify('%s 准备进行服务迭代 %s %s' %
                   (timestamp(), service.model_name, service.model_version))
            k8s_client = K8s(service.project.cluster['KUBECONFIG'])
            begin_time = time.time()
            while True:
                try:
                    deployment = k8s_client.AppsV1Api.read_namespaced_deployment(
                        name=name, namespace=namespace)
                    if not deployment:
                        notify('%s 没有发现 %s %s 的 deployment' %
                               (timestamp(), service.model_name,
                                service.model_version))
                        return
                    ready_replicas = deployment.status.ready_replicas
                    replicas = deployment.status.replicas
                    notify('%s 服务 %s %s ready副本数:%s 目标副本数:%s' %
                           (timestamp(), service.model_name,
                            service.model_version, ready_replicas, replicas))
                    # All replicas of the new deployment are ready.
                    if ready_replicas == replicas:
                        break
                except Exception as e:
                    # Best effort: errors while polling are printed and the
                    # poll is retried until the timeout below.
                    print(e)
                if time.time() - begin_time > 600:
                    notify('%s 新版本运行状态检查超时,请手动检查和清理旧版本%s %s' %
                           (timestamp(), service.model_name,
                            service.model_version))
                    return
                time.sleep(60)

            old_services = dbsession.query(InferenceService)\
                .filter(InferenceService.model_status=='online')\
                .filter(InferenceService.model_name==service.model_name)\
                .filter(InferenceService.name!=service.name)\
                .filter(InferenceService.host==service.host).all()

            if old_services:
                for old_service in old_services:
                    if old_service.name == service.name:
                        continue
                    inference_model_view = InferenceService_ModelView_base()
                    inference_model_view.delete_old_service(
                        old_service.name, old_service.project.cluster)
                    old_service.model_status = 'offline'
                    # NOTE(review): the history is built from the NEW
                    # service's deploy_history, not the old one's - confirm
                    # this is intended.
                    old_service.deploy_history = service.deploy_history + "\n" + "clear: %s %s" % (
                        'admin', timestamp())
                    dbsession.commit()
                    notify('%s 新版本服务升级完成,下线旧服务 %s %s' %
                           (timestamp(), service.model_name,
                            old_service.model_version))
            else:
                notify('%s %s 没有历史在线版本,%s版本升级完成' %
                       (timestamp(), service.model_name,
                        service.model_version))
        except Exception as e:
            print(e)
            if service is not None:
                push_admin('部署升级报错 %s %s: %s' %
                           (service.model_name, service.model_version, str(e)))
            else:
                # The lookup itself failed, so only the id can be reported.
                push_admin('部署升级报错 service_id=%s: %s' %
                           (service_id, str(e)))
Пример #22
0
    def deploy1(self, kfservice_id):
        """Deploy a ``KfService`` as a KFServing ``InferenceService`` CRD.

        Any existing CRD object with the same name is deleted first. The CRD
        body is built from the service's default (and optional canary)
        service, logged as YAML and submitted to the cluster.

        Args:
            kfservice_id: Primary key of the ``KfService`` row to deploy.

        Returns:
            A redirect to the KfService list page.
        """
        mykfservice = db.session.query(KfService).filter_by(
            id=kfservice_id).first()
        from myapp.utils.py.py_k8s import K8s
        k8s = K8s(mykfservice.project.cluster.get('KUBECONFIG', ''))
        namespace = conf.get('KFSERVING_NAMESPACE')
        crd_info = conf.get('CRD_INFO')['inferenceservice']
        crd_list = k8s.get_crd(group=crd_info['group'],
                               version=crd_info['version'],
                               plural=crd_info['plural'],
                               namespace=namespace)
        # Remove a previous deployment of the same name before recreating it.
        for crd_obj in crd_list:
            if crd_obj['name'] == mykfservice.name:
                k8s.delete_crd(group=crd_info['group'],
                               version=crd_info['version'],
                               plural=crd_info['plural'],
                               namespace=namespace,
                               name=mykfservice.name)

        def get_env(env_str):
            """Parse newline-separated ``KEY=VALUE`` text into ``[key, value]`` pairs.

            Lines that do not split into exactly two parts on ``=`` are dropped.
            """
            if not env_str:
                return []
            pairs = (line.split('=') for line in re.split('\r|\n', env_str)
                     if line)
            return [pair for pair in pairs if len(pair) == 2]

        def get_kfjson(service, mykfservice):
            """Build the predictor spec for one service, or ``None`` without a service."""
            if not service:
                return None

            # Copy the configured list: appending to the object returned by
            # conf.get() would leak one user's pull secrets into the shared
            # application config for every later request.
            image_secrets = list(conf.get('HUBSECRET', []) or [])
            user_hubsecrets = db.session.query(Repository.hubsecret).filter(
                Repository.created_by_fk == g.user.id).all()
            for hubsecret in user_hubsecrets or []:
                if hubsecret[0] not in image_secrets:
                    image_secrets.append(hubsecret[0])

            # Schedule on nodes labelled gpu=true when a GPU is requested,
            # otherwise on nodes labelled cpu=true.
            node_label = "gpu" if core.get_gpu(
                service.resource_gpu)[0] else "cpu"

            kfjson = {
                "minReplicas": service.min_replicas,
                "maxReplicas": service.max_replicas,
                "custom": {
                    "affinity": {
                        "nodeAffinity": {
                            "requiredDuringSchedulingIgnoredDuringExecution": {
                                "nodeSelectorTerms": [{
                                    "matchExpressions": [
                                        {
                                            "key": node_label,
                                            "operator": "In",
                                            "values": ["true"]
                                        },
                                    ]
                                }]
                            }
                        },
                    },
                    "imagePullSecrets": [{
                        "name": hubsecret
                    } for hubsecret in image_secrets],
                    "container": {
                        "image":
                        service.images,
                        "imagePullPolicy":
                        conf.get('IMAGE_PULL_POLICY', 'Always'),
                        "name":
                        mykfservice.name + "-" + service.name,
                        "workingDir":
                        service.working_dir if service.working_dir else None,
                        "command": ["sh", "-c", service.command]
                        if service.command else None,
                        "resources": {
                            "requests": {
                                "cpu": service.resource_cpu,
                                "memory": service.resource_memory
                            }
                        },
                        "env": [{
                            "name": env[0],
                            "value": env[1]
                        } for env in get_env(service.env)],
                    }
                }
            }
            return kfjson

        crd_json = {
            "apiVersion": "serving.kubeflow.org/v1alpha2",
            "kind": "InferenceService",
            "metadata": {
                "labels": {
                    "app": mykfservice.name
                },
                "name": mykfservice.name,
                "namespace": namespace
            },
            "spec": {
                "canaryTrafficPercent": mykfservice.canary_traffic_percent,
                "default": {
                    mykfservice.service_type:
                    get_kfjson(mykfservice.default_service, mykfservice)
                },
                "canary": {
                    mykfservice.service_type:
                    get_kfjson(mykfservice.canary_service, mykfservice),
                } if mykfservice.canary_service else None,
            }
        }

        import yaml
        # safe_load: yaml.load() without an explicit Loader raises TypeError
        # on PyYAML >= 6, and the safe loader is sufficient for JSON text.
        ya = yaml.safe_load(json.dumps(crd_json))
        ya_str = yaml.safe_dump(ya, default_flow_style=False)
        logging.info(ya_str)
        k8s.create_crd(group=crd_info['group'],
                       version=crd_info['version'],
                       plural=crd_info['plural'],
                       namespace=namespace,
                       body=crd_json)
        flash(category='warning', message='部署启动,一分钟后部署完成')
        return redirect('/kfservice_modelview/list/')
Пример #23
0
    def run(self):
        """Start a one-off "venus" debug pod from a job template.

        Reads ``job_template_id`` or ``job_template_name``, ``run_id``,
        ``args``, ``resource_memory`` and ``resource_cpu`` from the JSON
        request body. An existing pod with the same name is deleted, a new
        one is created from the template, and the method then waits for the
        pod to become visible.

        Returns:
            A response with status 400 when parameters are missing, the
            template does not exist, or the pod does not show up in time;
            otherwise a 200 response containing the dashboard log URL.
        """
        # A request without a JSON body is treated like an empty one so it
        # gets the 400 response below instead of an AttributeError.
        request_data = request.json or {}
        job_template_id = request_data.get('job_template_id', '')
        job_template_name = request_data.get('job_template_name', '')
        run_id = request_data.get('run_id', '').replace('_', '-')
        resource_memory = request_data.get('resource_memory', '')
        resource_cpu = request_data.get('resource_cpu', '')
        task_args = request_data.get('args', '')
        if (not job_template_id
                and not job_template_name) or not run_id or task_args == '':
            response = make_response("输入参数不齐全")
            response.status_code = 400
            return response

        job_template = None
        if job_template_id:
            job_template = db.session.query(Job_Template).filter_by(
                id=int(job_template_id)).first()
        elif job_template_name:
            job_template = db.session.query(Job_Template).filter_by(
                name=job_template_name).first()
        if not job_template:
            response = make_response("no job template exist")
            response.status_code = 400
            return response

        from myapp.utils.py.py_k8s import K8s

        k8s = K8s()
        namespace = conf.get('PIPELINE_NAMESPACE')
        pod_name = ("venus-" + run_id).lower()[:60].strip('-')

        # A previous pod with this name is deleted before a new one is made.
        existing = k8s.get_pods(namespace=namespace, pod_name=pod_name)
        if existing and existing[0]:
            k8s.delete_pods(namespace=namespace, pod_name=pod_name)
            time.sleep(2)

        # Turn the submitted arguments into a flat [name, value, ...] list;
        # values whose template type is "json" are serialised to JSON text.
        args = []
        job_template_args = json.loads(
            job_template.args) if job_template.args else {}
        for arg_name in task_args:
            arg_type = ''
            for group in job_template_args:
                for template_arg in job_template_args[group]:
                    if template_arg == arg_name:
                        arg_type = job_template_args[group][
                            template_arg].get('type', '')
            arg_value = task_args[arg_name]
            if arg_value:
                args.append(arg_name)
                if arg_type == 'json':
                    args.append(json.dumps(arg_value))
                else:
                    args.append(arg_value)

        volume_mount = 'kubeflow-cfs-workspace(pvc):/mnt,kubeflow-cfs-archives(pvc):/archives'
        # The template's env may be unset (None); treat it as empty text.
        env = (job_template.env or '') + "\n"
        env += 'KFJ_TASK_ID=0\n'
        env += 'KFJ_TASK_NAME=' + str('venus-' + run_id) + "\n"
        env += 'KFJ_TASK_NODE_SELECTOR=cpu=true,train=true\n'
        env += 'KFJ_TASK_VOLUME_MOUNT=' + str(volume_mount) + "\n"
        env += 'KFJ_TASK_IMAGES=' + str(job_template.images) + "\n"
        env += 'KFJ_TASK_RESOURCE_CPU=' + str(resource_cpu) + "\n"
        env += 'KFJ_TASK_RESOURCE_MEMORY=' + str(resource_memory) + "\n"
        env += 'KFJ_TASK_RESOURCE_GPU=0\n'
        env += 'KFJ_PIPELINE_ID=0\n'
        env += 'KFJ_RUN_ID=' + run_id + "\n"
        env += 'KFJ_CREATOR=' + str(g.user.username) + "\n"
        env += 'KFJ_RUNNER=' + str(g.user.username) + "\n"
        env += 'KFJ_PIPELINE_NAME=venus\n'
        env += 'KFJ_NAMESPACE=pipeline' + "\n"

        def template_str(src_str):
            """Render ``src_str`` as a Jinja template with the run context."""
            rtemplate = Environment(
                loader=BaseLoader,
                undefined=DebugUndefined).from_string(src_str)
            return rtemplate.render(
                creator=g.user.username,
                datetime=datetime,
                runner=g.user.username,
                uuid=uuid,
                pipeline_id='0',
                pipeline_name='venus-task',
                cluster_name=conf.get('ENVIRONMENT'))

        global_envs = json.loads(
            template_str(
                json.dumps(conf.get('GLOBAL_ENV', {}),
                           indent=4,
                           ensure_ascii=False)))
        for global_env_key in global_envs:
            # str(): configured values may be numbers or booleans.
            env += global_env_key + '=' + str(
                global_envs[global_env_key]) + "\n"

        # Either source of host aliases may be unset (None).
        hostAliases = (job_template.hostAliases or '') + "\n" + (conf.get(
            'HOSTALIASES', '') or '')
        k8s.create_debug_pod(
            namespace,
            name=pod_name,
            labels={'run-rtx': g.user.username},
            command=None,
            args=args,
            volume_mount=volume_mount,
            working_dir=None,
            node_selector='cpu=true,train=true',
            resource_cpu=resource_cpu,
            resource_memory=resource_memory,
            resource_gpu=0,
            image_pull_policy=conf.get('IMAGE_PULL_POLICY', 'Always'),
            image_pull_secrets=[job_template.images.repository.hubsecret],
            image=job_template.images.name,
            hostAliases=hostAliases,
            env=env,
            privileged=job_template.privileged,
            accounts=job_template.accounts,
            username=g.user.username)

        # Wait for the pod to become visible: up to five checks, 2 s apart.
        for _ in range(5):
            if k8s.get_pods(namespace=namespace, pod_name=pod_name):
                break
            time.sleep(2)
        else:
            response = make_response("启动时间过长,一分钟后重试")
            response.status_code = 400
            return response

        # Admins are sent to the cluster dashboard, everyone else to the
        # pipeline dashboard; the log path is the same for both.
        user_roles = [role.name.lower() for role in list(g.user.roles)]
        if "admin" in user_roles:
            dashboard_key = 'K8S_DASHBOARD_CLUSTER'
        else:
            dashboard_key = 'K8S_DASHBOARD_PIPELINE'
        pod_url = conf.get(
            dashboard_key
        ) + "#/log/%s/%s/pod?namespace=%s&container=%s" % (
            namespace, pod_name, namespace, pod_name)
        print(pod_url)
        response = make_response("启动成功,日志地址: %s" % pod_url)
        response.status_code = 200
        return response
Пример #24
0
    def reset_theia(self, notebook):
        """Create the pod, service and Istio VirtualService for a notebook.

        Builds the start command for the notebook's IDE type (``jupyter`` or
        ``theia``), creates the pod and its cluster service, replaces any
        existing VirtualService for the notebook, and, when
        ``SERVICE_EXTERNAL_IP`` is configured, also creates an external-IP
        service.

        Args:
            notebook: The notebook record to (re)deploy.

        Returns:
            Whatever ``k8s_client.create_crd`` returns for the VirtualService.
        """
        from myapp.utils.py.py_k8s import K8s

        k8s_client = K8s(notebook.cluster.get('KUBECONFIG', ''))
        namespace = conf.get('NOTEBOOK_NAMESPACE')
        port = 3000

        command = None
        workingDir = None
        volume_mount = notebook.volume_mount
        # Add a memory-backed /dev/shm mount unless one is already declared.
        if '/dev/shm' not in volume_mount:
            volume_mount += ',10G(memory):/dev/shm'
        rewrite_url = '/'
        # Runs the image-level and the user's init scripts in the background
        # before the IDE process starts.
        pre_command = '(nohup sh /init.sh > /notebook_init.log 2>&1 &) ; (nohup sh /mnt/%s/init.sh > /init.log 2>&1 &) ; ' % notebook.created_by.username
        if notebook.ide_type == 'jupyter':
            rewrite_url = '/notebook/jupyter/%s/' % notebook.name
            workingDir = '/mnt/%s' % notebook.created_by.username
            command = [
                "sh", "-c",
                "%s jupyter lab --notebook-dir=/ --ip=0.0.0.0 "
                "--no-browser --allow-root --port=%s "
                "--NotebookApp.token='' --NotebookApp.password='' "
                "--NotebookApp.allow_origin='*' "
                "--NotebookApp.base_url=%s" % (pre_command, port, rewrite_url)
            ]

        elif notebook.ide_type == 'theia':
            command = [
                "bash", '-c',
                '%s node /home/theia/src-gen/backend/main.js /home/project --hostname=0.0.0.0 --port=%s'
                % (pre_command, port)
            ]
            workingDir = '/home/theia'
        print(command)
        print(workingDir)

        # Copy the configured list: appending to the object returned by
        # conf.get() would leak one user's pull secrets into the shared
        # application config for every later request.
        image_secrets = list(conf.get('HUBSECRET', []) or [])
        user_hubsecrets = db.session.query(Repository.hubsecret).filter(
            Repository.created_by_fk == notebook.created_by.id).all()
        for hubsecret in user_hubsecrets or []:
            if hubsecret[0] not in image_secrets:
                image_secrets.append(hubsecret[0])

        k8s_client.create_debug_pod(
            namespace=namespace,
            name=notebook.name,
            labels={
                "app": notebook.name,
                'user': notebook.created_by.username
            },
            command=command,
            args=None,
            volume_mount=volume_mount,
            working_dir=workingDir,
            node_selector=notebook.get_node_selector(),
            resource_memory="0G~" + notebook.resource_memory,
            resource_cpu="0~" + notebook.resource_cpu,
            resource_gpu=notebook.resource_gpu,
            image_pull_policy=conf.get('IMAGE_PULL_POLICY', 'Always'),
            image_pull_secrets=image_secrets,
            image=notebook.images,
            hostAliases=conf.get('HOSTALIASES', ''),
            env={
                "NO_AUTH":
                "true",
                "USERNAME":
                notebook.created_by.username,
                # resource_memory is text such as "8G"; the option value is
                # that number multiplied by 1024.
                "NODE_OPTIONS":
                "--max-old-space-size=%s" %
                str(int(notebook.resource_memory.replace("G", '')) * 1024)
            },
            privileged=None,
            accounts=conf.get('JUPYTER_ACCOUNTS'),
            username=notebook.created_by.username)
        k8s_client.create_service(namespace=namespace,
                                  name=notebook.name,
                                  username=notebook.created_by.username,
                                  ports=[
                                      port,
                                  ])

        crd_info = conf.get('CRD_INFO', {}).get('virtualservice', {})
        crd_name = "notebook-jupyter-%s" % notebook.name.replace('_', '-')
        vs_obj = k8s_client.get_one_crd(group=crd_info['group'],
                                        version=crd_info['version'],
                                        plural=crd_info['plural'],
                                        namespace=namespace,
                                        name=crd_name)
        # Replace an existing VirtualService rather than updating it.
        if vs_obj:
            k8s_client.delete_crd(group=crd_info['group'],
                                  version=crd_info['version'],
                                  plural=crd_info['plural'],
                                  namespace=namespace,
                                  name=crd_name)
            time.sleep(1)

        host = notebook.project.cluster.get('JUPYTER_DOMAIN', request.host)
        if not host:
            host = request.host
        if ':' in host:
            host = host[:host.rindex(':')]  # drop a trailing port number
        crd_json = {
            "apiVersion": "networking.istio.io/v1alpha3",
            "kind": "VirtualService",
            "metadata": {
                "name": crd_name,
                "namespace": namespace
            },
            "spec": {
                "gateways": ["kubeflow/kubeflow-gateway"],
                "hosts": ["*" if core.checkip(host) else host],
                "http": [{
                    "match": [{
                        "uri": {
                            "prefix":
                            "/notebook/%s/%s/" % (namespace, notebook.name)
                        }
                    }],
                    "rewrite": {
                        "uri": rewrite_url
                    },
                    "route": [{
                        "destination": {
                            "host":
                            "%s.%s.svc.cluster.local" %
                            (notebook.name, namespace),
                            "port": {
                                "number": port
                            }
                        }
                    }],
                    "timeout":
                    "300s"
                }]
            }
        }

        crd = k8s_client.create_crd(group=crd_info['group'],
                                    version=crd_info['version'],
                                    plural=crd_info['plural'],
                                    namespace=namespace,
                                    body=crd_json)

        # Create the service exposed on external IPs. The global setting
        # wins; the project's "expand" JSON is only consulted without one.
        SERVICE_EXTERNAL_IP = conf.get('SERVICE_EXTERNAL_IP', None)
        if not SERVICE_EXTERNAL_IP and notebook.project.expand:
            SERVICE_EXTERNAL_IP = json.loads(notebook.project.expand).get(
                'SERVICE_EXTERNAL_IP', SERVICE_EXTERNAL_IP)
            if isinstance(SERVICE_EXTERNAL_IP, str):
                SERVICE_EXTERNAL_IP = [SERVICE_EXTERNAL_IP]

        if SERVICE_EXTERNAL_IP:
            # External port = 10000 + 10 * notebook id + index of the port.
            service_ports = [[10000 + 10 * notebook.id + index, target_port]
                             for index, target_port in enumerate([port])]
            service_external_name = (notebook.name +
                                     "-external").lower()[:60].strip('-')
            k8s_client.create_service(namespace=namespace,
                                      name=service_external_name,
                                      username=notebook.created_by.username,
                                      ports=service_ports,
                                      selector={
                                          "app": notebook.name,
                                          'user': notebook.created_by.username
                                      },
                                      externalIPs=SERVICE_EXTERNAL_IP)

        return crd
Пример #25
0
    def deploy_nni_service(self, nni, command):
        """Deploy the pod, service and Istio VirtualService for an NNI experiment.

        An existing deployment named after the experiment is deleted first
        (errors while doing so are only printed). Then the pod and its
        service on port 8888 are created and the VirtualService routing
        ``/nni/<name>/`` to that service is recreated.

        Args:
            nni: The NNI experiment record to deploy.
            command: Command passed to the pod as its container command.
        """
        # Copy the configured list: appending to the object returned by
        # conf.get() would leak one user's pull secrets into the shared
        # application config for every later request.
        image_secrets = list(conf.get('HUBSECRET', []) or [])
        user_hubsecrets = db.session.query(Repository.hubsecret).filter(
            Repository.created_by_fk == g.user.id).all()
        for hubsecret in user_hubsecrets or []:
            if hubsecret[0] not in image_secrets:
                image_secrets.append(hubsecret[0])

        from myapp.utils.py.py_k8s import K8s
        k8s_client = K8s(nni.project.cluster.get('KUBECONFIG', ''))
        namespace = conf.get('KATIB_NAMESPACE')
        run_id = 'nni-' + nni.name

        try:
            nni_deploy = k8s_client.AppsV1Api.read_namespaced_deployment(
                name=nni.name, namespace=namespace)
            if nni_deploy:
                print('exist nni deploy')
                k8s_client.AppsV1Api.delete_namespaced_deployment(
                    name=nni.name, namespace=namespace)
        except Exception as e:
            # Best effort: a failed lookup or delete must not block the
            # deployment below.
            print(e)

        volume_mount = nni.volume_mount + ",/usr/share/zoneinfo/Asia/Shanghai(hostpath):/etc/localtime"
        labels = {
            "nni": nni.name,
            "username": nni.created_by.username,
            'run-id': run_id
        }

        k8s_client.create_debug_pod(
            namespace=namespace,
            name=nni.name,
            labels=labels,
            command=command,
            args=None,
            volume_mount=volume_mount,
            working_dir='/mnt/%s' % nni.created_by.username,
            node_selector=nni.get_node_selector(),
            resource_memory='2G',
            resource_cpu='2',
            resource_gpu='0',
            image_pull_policy=conf.get('IMAGE_PULL_POLICY', 'Always'),
            image_pull_secrets=image_secrets,
            # Falls back to the job's worker image when NNI_IMAGES is not
            # configured.
            image=conf.get('NNI_IMAGES',
                           json.loads(nni.job_json).get('job_worker_image')),
            hostAliases=conf.get('HOSTALIASES', ''),
            env=None,
            privileged=False,
            accounts='nni',
            username=nni.created_by.username)

        k8s_client.create_service(namespace=namespace,
                                  name=nni.name,
                                  username=nni.created_by.username,
                                  ports=[8888],
                                  selector=labels)

        host = nni.project.cluster.get('NNI_DOMAIN', request.host)
        if not host:
            host = request.host
        if ':' in host:
            host = host[:host.rindex(':')]  # drop a trailing port number
        vs_json = {
            "apiVersion": "networking.istio.io/v1alpha3",
            "kind": "VirtualService",
            "metadata": {
                "name": nni.name,
                "namespace": namespace
            },
            "spec": {
                "gateways": ["kubeflow/kubeflow-gateway"],
                "hosts": ["*" if core.checkip(host) else host],
                "http": [{
                    "match": [{
                        "uri": {
                            "prefix": "/nni/%s//" % nni.name
                        }
                    }, {
                        "uri": {
                            "prefix": "/nni/%s/" % nni.name
                        }
                    }],
                    "rewrite": {
                        "uri": "/nni/%s/" % nni.name
                    },
                    "route": [{
                        "destination": {
                            "host":
                            "%s.%s.svc.cluster.local" % (nni.name, namespace),
                            "port": {
                                "number": 8888
                            }
                        }
                    }],
                    "timeout":
                    "300s"
                }]
            }
        }
        crd_info = conf.get('CRD_INFO')['virtualservice']
        # Remove the previous route before creating the new one.
        k8s_client.delete_istio_ingress(namespace=namespace, name=nni.name)

        k8s_client.create_crd(group=crd_info['group'],
                              version=crd_info['version'],
                              plural=crd_info['plural'],
                              namespace=namespace,
                              body=vs_json)
Пример #26
0
    def featureCheck(self):
        """Build the cluster-load popup for the home page.

        Reads the ``url`` request value. When it contains ``/myapp/home``,
        the nodes and running pods of every cluster listed in
        ``conf['CLUSTERS']`` are queried through ``K8s``, the per-node usage
        is stored in the ``resource_used`` dict (defined outside this method)
        and rendered as one HTML table row per node.

        Returns:
            A ``jsonify`` response. For the home page it is a dict with:

            * ``delay``   - int, auto-hide delay in milliseconds (0 = never)
            * ``hit``     - bool, whether the feature matched
            * ``target``  - str, the url the popup is shown for
            * ``type``    - str, content type (only ``html`` is supported)
            * ``title``   - str, popup title
            * ``content`` - str, the HTML body

            For any other url, or when ``url`` is missing, the response is an
            empty JSON object.
        """
        url = request.values.get("url", type=str, default=None)
        # ``url`` is None when the parameter is absent; ``in None`` would
        # raise TypeError, so answer like any other non-home page instead.
        if not url or '/myapp/home' not in url:
            return jsonify({})

        # NOTE(review): the leading ``1 or`` short-circuits the condition, so
        # usage is re-collected on every request and the 10 minute freshness
        # check is never evaluated. Kept unchanged because always-fresh data
        # may be intended - confirm before re-enabling the cache.
        if 1 or not resource_used['check_time'] or resource_used[
                'check_time'] < (datetime.datetime.now() -
                                 datetime.timedelta(minutes=10)):
            clusters = conf.get('CLUSTERS', {})
            for cluster_name, cluster in clusters.items():
                k8s_client = K8s(cluster.get('KUBECONFIG', ''))

                # Index the nodes labelled ``cpu`` or ``gpu`` by host ip and
                # give each one empty usage accumulators.
                node_by_ip = {}
                for node in k8s_client.get_node():
                    ip = node['hostip']
                    if 'cpu' in node['labels'] or 'gpu' in node['labels']:
                        node['used_memory'] = []
                        node['used_cpu'] = []
                        node['used_gpu'] = []
                        node['user'] = []
                        node_by_ip[ip] = node

                # Collect the requests of every running pod that sits on one
                # of the indexed nodes.
                for namespace in ['jupyter', 'pipeline', 'katib', 'service']:
                    for pod in k8s_client.get_pods(namespace=namespace):
                        if pod['status'] != 'Running':
                            continue
                        node = node_by_ip.get(pod['host_ip'])
                        if node is None:
                            continue
                        node['used_memory'].append(pod['memory'])
                        node['used_cpu'].append(pod['cpu'])
                        node['used_gpu'].append(pod['gpu'])

                # Collapse the per-pod lists into integer totals.
                for node in node_by_ip.values():
                    for key in ('used_memory', 'used_cpu', 'used_gpu'):
                        node[key] = int(sum(node[key]))

                resource_used['data'][cluster_name] = node_by_ip
            resource_used['check_time'] = datetime.datetime.now()

        # Cells are closed with </td> and rows with </tr>; the previous
        # markup used </th> and a second opening <tr>, which is invalid HTML.
        td_html = '<td style="border: 1px solid black;padding: 10px">%s</td>'
        header_titles = ("集群", "资源组(监控)", "机器(进出)", "机型",
                         "cpu占用率", "内存占用率", "gpu占用率")
        rows = [
            '<tr>%s</tr>' % ' '.join(td_html % title
                                     for title in header_titles)
        ]

        for cluster_name, nodes in resource_used['data'].items():
            # Group by org, then by device type, so that machines of the same
            # group and model end up on adjacent rows (first-seen order).
            grouped = {}
            for ip, node in nodes.items():
                labels = node['labels']
                org = labels.get('org', 'public')
                if 'gpu' in labels:
                    device = 'gpu/' + labels.get('gpu-type', '')
                else:
                    device = 'cpu'
                grouped.setdefault(org, {}).setdefault(device, {})[ip] = node

            cluster_config = conf.get('CLUSTERS', {}).get(cluster_name, {})
            # Default the path to '' so a missing setting cannot break the
            # concatenation.
            grafana_url = cluster_config.get(
                'GRAFANA_HOST',
                '').strip('/') + conf.get('GRAFANA_CLUSTER_PATH', '')

            for org, devices in grouped.items():
                for device, members in devices.items():
                    for ip, node in members.items():
                        labels = node['labels']

                        # Nodes with label train != 'true' are struck
                        # through; admins additionally get a link to the
                        # node scheduling page.
                        if labels.get('train', 'true') == 'true':
                            ip_text = ip
                        else:
                            ip_text = '<strike>%s</strike>' % (ip, )
                        if g.user.is_admin():
                            ip_html = '<a href="%s">%s</a>' % (
                                "/myapp/schedule/node/%s" % ip, ip_text)
                        else:
                            ip_html = ip_text

                        # Nodes with label share != 'true' get a grey row.
                        share = labels.get('share', 'true')
                        color = "#FFFFFF" if share == 'true' else '#F0F0F0'

                        cells = (
                            cluster_name,
                            '<a target="blank" href="%s">%s</a>' %
                            (grafana_url + org, org),
                            ip_html,
                            device,
                            "cpu:%s/%s" % (node['used_cpu'], node['cpu']),
                            "mem:%s/%s" %
                            (node['used_memory'], node['memory']),
                            "gpu:%s/%s" % (node['used_gpu'], node['gpu']),
                        )
                        rows.append('<tr bgcolor="%s">%s</tr>' % (
                            color,
                            ' '.join(td_html % cell for cell in cells),
                        ))

        message = Markup('<table>%s</table>' % ''.join(rows))
        # The per-cluster summary for the title is currently not populated.
        cluster_global_info = ''

        data = {
            'content': message,
            'delay': 300000,
            'hit': True,
            'target': url,
            'title': '当前负载(%s)' % cluster_global_info,
            'type': 'html',
        }
        return jsonify(data)