示例#1
0
文件: views.py 项目: zmoon111/cloud
def create_user_RBAC_permissions(username):
    """Bind the cluster-wide ``admin`` ClusterRole to *username* inside
    the user's own namespace so the user can manage resources there.

    The namespace name is derived from the user name via
    ``utils.email_escape``.  Failures are logged and swallowed
    (best-effort), so callers never see an exception.
    """
    namespace = utils.email_escape(username)
    rbacapi = kubernetes.client.RbacAuthorizationV1beta1Api(
        utils.get_admin_api_client())
    # RoleBinding: grants the "admin" ClusterRole, but scoped to the
    # user's namespace only.
    body = {
        "apiVersion": "rbac.authorization.k8s.io/v1beta1",
        "kind": "RoleBinding",
        "metadata": {
            "name": "%s-admin-binding" % namespace,
            "namespace": namespace
        },
        "roleRef": {
            "apiGroup": "rbac.authorization.k8s.io",
            "kind": "ClusterRole",
            "name": "admin"
        },
        "subjects": [{
            "apiGroup": "rbac.authorization.k8s.io",
            "kind": "User",
            "name": username
        }]
    }
    try:
        rbacapi.create_namespaced_role_binding(namespace, body)
    except Exception as e:  # "except X, e" is Python-2-only; "as" works on 2.6+/3
        logging.error("%s", str(e))
示例#2
0
 def get_jobs(self, username):
     """Return the user's job list as a dict, annotated with a
     ``terminating`` key naming jobs whose pods are still being deleted.

     Pods of a deleted job can linger in "Terminating" state for a
     while, which may make an immediate re-submit fail — callers need
     to know which jobs are in that state.
     """
     namespace = utils.email_escape(username)
     batch_api = client.BatchV1Api(
         api_client=utils.get_user_api_client(username))
     job_list = batch_api.list_namespaced_job(namespace)
     pod_list = client.CoreV1Api(
         api_client=utils.get_user_api_client(username)) \
         .list_namespaced_pod(namespace)
     terminating_jobs = []
     # A pod is tied to a job through exactly one of these labels;
     # check them in the same priority order as before.
     job_label_keys = ("paddle-job", "paddle-job-master",
                       "paddle-job-pserver")
     for pod in pod_list.items:
         labels = pod.metadata.labels
         if not labels:
             continue
         jobname = ""
         for key in job_label_keys:
             if key in labels:
                 jobname = labels[key]
                 break
         if pod.metadata.deletion_timestamp and jobname \
                 and jobname not in terminating_jobs:
             terminating_jobs.append(jobname)
     # NOTE: keep the original top-level dict shape for backward
     # compatibility; "terminating" is an extra key.
     ret_dict = copy.deepcopy(job_list.to_dict())
     ret_dict["terminating"] = terminating_jobs
     return ret_dict
示例#3
0
文件: views.py 项目: zmoon111/cloud
def create_user_namespace(username):
    """Ensure the user's namespace and its required secrets exist.

    Creates, when missing:
      * the namespace itself (name derived from the user name),
      * one Kubernetes Secret per CephFS datacenter (ceph admin key),
      * the docker registry pull secret configured in settings.

    Returns:
        The user's namespace name (str).
    """
    v1api = kubernetes.client.CoreV1Api(utils.get_admin_api_client())
    namespaces = v1api.list_namespace()
    user_namespace = utils.email_escape(username)
    # The job must be committed to the user's own namespace.
    user_namespace_found = any(
        ns.metadata.name == user_namespace for ns in namespaces.items)
    # Create the user's namespace if it does not exist.
    if not user_namespace_found:
        v1api.create_namespace({
            "apiVersion": "v1",
            "kind": "Namespace",
            "metadata": {
                "name": user_namespace
            }
        })
    # BUG FIX: secret_names used to be bound only inside the loop body,
    # so an empty settings.DATACENTERS raised a NameError at the
    # registry-secret check below.
    secret_names = []
    for dc, cfg in settings.DATACENTERS.items():
        # Re-list every iteration so secrets created for earlier
        # datacenters are seen.
        secrets = v1api.list_namespaced_secret(user_namespace)
        secret_names = [item.metadata.name for item in secrets.items]

        # Create a Kubernetes Secret holding the ceph admin key.
        if cfg["fstype"] == "cephfs" and cfg["secret"] not in secret_names:
            with open(cfg["admin_key"], "r") as f:
                key = f.read()
                encoded = base64.b64encode(key)
                v1api.create_namespaced_secret(
                    user_namespace, {
                        "apiVersion": "v1",
                        "kind": "Secret",
                        "metadata": {
                            "name": cfg["secret"]
                        },
                        "data": {
                            "key": encoded
                        }
                    })
    # Create the docker registry pull secret, if configured and missing.
    registry_secret = settings.JOB_DOCKER_IMAGE.get("registry_secret", None)
    if registry_secret and registry_secret not in secret_names:
        docker_config = settings.JOB_DOCKER_IMAGE["docker_config"]
        encode = base64.b64encode(json.dumps(docker_config))
        v1api.create_namespaced_secret(
            user_namespace, {
                "apiVersion": "v1",
                "kind": "Secret",
                "metadata": {
                    "name": registry_secret
                },
                "data": {
                    ".dockerconfigjson": encode
                },
                "type": "kubernetes.io/dockerconfigjson"
            })
    return user_namespace
示例#4
0
    def _valid_and_fill(self, paddlejob, username):
        """Validate *paddlejob* and fill in defaults before submission.

        Mutates paddlejob in place: sets up volumes, defaults the
        registry secret and docker image, and appends the Nvidia lib
        volume for GPU jobs.

        Raises:
            Exception: if the uploaded job package cannot be found on
                cloud storage, or if the request would exceed the
                user's GPU quota.
        """
        namespace = utils.email_escape(username)
        api_client = utils.get_user_api_client(username)
        self.__setup_volumes(paddlejob, username)
        if not paddlejob.registry_secret:
            paddlejob.registry_secret = settings.JOB_DOCKER_IMAGE.get("registry_secret", None)
        if not paddlejob.image:
            # Default image depends on whether the job requests GPUs.
            if paddlejob.gpu > 0:
                paddlejob.image = settings.JOB_DOCKER_IMAGE["image_gpu"]
            else:
                paddlejob.image = settings.JOB_DOCKER_IMAGE["image"]

        # jobPackage validation: startwith /pfs
        # NOTE: job packages are uploaded to /pfs/[dc]/home/[user]/jobs/[jobname]
        package_in_pod = os.path.join("/pfs/%s/home/%s"%(paddlejob.dc, username), "jobs", paddlejob.name)

        logging.info("valid_and_fill: current package: %s", package_in_pod)
        # The package must already be on cloud storage before a job can
        # be submitted; try both storage-path layouts before giving up.
        current_package_path = package_in_pod.replace("/pfs/%s/home"%paddlejob.dc, settings.STORAGE_PATH)
        if not os.path.exists(current_package_path):
            current_package_path = package_in_pod.replace("/pfs/%s/home/%s"%(paddlejob.dc, username), settings.STORAGE_PATH)
            if not os.path.exists(current_package_path):
                raise Exception("package not exist in cloud: %s"%current_package_path)
        logging.info("valid_and_fill: current package in pod: %s", current_package_path)

        # GPU quota management
        # TODO(Yancey1989) We should move this to Kubernetes
        if 'GPU_QUOTA' in dir(settings) and int(paddlejob.gpu) > 0:
            gpu_usage = 0
            pods = client.CoreV1Api(api_client=api_client).list_namespaced_pod(namespace=namespace)
            for pod in pods.items:
                # only count trainer GPU usage; pservers do not use GPU
                if pod.metadata.labels and 'paddle-job' in pod.metadata.labels and \
                    pod.status.phase == 'Running':
                    gpu_usage += int(pod.spec.containers[0].resources.limits.get('alpha.kubernetes.io/nvidia-gpu', '0'))
            if username in settings.GPU_QUOTA:
                gpu_quota = settings.GPU_QUOTA[username]['limit']
            else:
                gpu_quota = settings.GPU_QUOTA['DEFAULT']['limit']
            gpu_available = gpu_quota - gpu_usage
            gpu_request = int(paddlejob.gpu) * int(paddlejob.parallelism)
            logging.info('gpu available: %d, gpu request: %d' % (gpu_available, gpu_request))
            if gpu_available < gpu_request:
                # Fixed message: was "enought" and had no space after the
                # comma, producing "quota,request:".
                raise Exception("You don't have enough GPU quota, " + \
                    "request: %d, usage: %d, limit: %d" % (gpu_request, gpu_usage, gpu_quota))

        # add Nvidia lib volume if training with GPU
        if paddlejob.gpu > 0:
            paddlejob.volumes.append(volume.get_volume_config(
                fstype = settings.FSTYPE_HOSTPATH,
                name = "nvidia-libs",
                mount_path = "/usr/local/nvidia/lib64",
                host_path = settings.NVIDIA_LIB_PATH
            ))
示例#5
0
 def get_workers(self, jobname, username):
     """Return pods in the user's namespace as a dict.

     When *jobname* is given, only pods carrying the label
     ``paddle-job=<jobname>`` are listed; otherwise all pods.
     """
     namespace = utils.email_escape(username)
     core_api = client.CoreV1Api(
         api_client=utils.get_user_api_client(username))
     if jobname:
         pods = core_api.list_namespaced_pod(
             namespace, label_selector="paddle-job=%s" % jobname)
     else:
         pods = core_api.list_namespaced_pod(namespace)
     return pods.to_dict()
示例#6
0
    def _create_traingingjobs(self, body, username):
        """POST *body* as a TrainingJob custom resource in the user's
        namespace and return the decoded JSON response.
        """
        namespace = utils.email_escape(username)
        api_client = utils.get_user_api_client(username)
        resource_path = '/apis/paddlepaddle.org/v1/namespaces/' + namespace + '/trainingjobs'
        # Negotiate JSON in, anything out.
        header_params = {
            'Accept': api_client.select_header_accept(['application/json']),
            'Content-Type': api_client.select_header_content_type(['*/*']),
        }

        resp, code, header = api_client.call_api(
            resource_path, 'POST', {'namespace': namespace}, {},
            header_params, body, [], _preload_content=False)

        return json.loads(resp.data.decode('utf-8'))
示例#7
0
    def submit_job(self, paddlejob, username):
        """Submit *paddlejob* on behalf of *username*.

        The job is validated and its defaults filled first.  When the
        fault-tolerant feature is enabled, a master ReplicaSet is
        created before the rest of the job; an ApiException from that
        step is logged and re-raised.
        """
        self._valid_and_fill(paddlejob, username)

        namespace = utils.email_escape(username)
        api_client = utils.get_user_api_client(username)
        # ========== submit master ReplicaSet if using fault_tolerant feature ==
        # FIXME: alpha features in separate module
        if paddlejob.fault_tolerant:
            try:
                ret = client.ExtensionsV1beta1Api(
                    api_client=api_client).create_namespaced_replica_set(
                        namespace,
                        paddlejob.new_master_job())
            except ApiException as e:  # "except X, e" is Python-2-only syntax
                logging.error("error submitting master job: %s", traceback.format_exc())
                # bare raise preserves the original traceback
                # (``raise e`` resets it on Python 2)
                raise
示例#8
0
 def delete_job(self, jobname, username):
     """Delete the trainer job named *jobname* in the user's namespace.

     Returns an error response when *jobname* is empty; API errors are
     logged and collected into ``delete_status`` instead of aborting.
     """
     namespace = utils.email_escape(username)
     api_client = utils.get_user_api_client(username)
     if not jobname:
         return utils.simple_response(500, "must specify jobname")
     # FIXME: options needed: grace_period_seconds, orphan_dependents, preconditions
     # FIXME: cascade deleting
     delete_status = []
     # delete the trainer job itself
     trainer_name = jobname + "-trainer"
     try:
         u_status = client.BatchV1Api(api_client=api_client)\
             .delete_namespaced_job(trainer_name, namespace, {})
     except ApiException as e:  # "except X, e" is Python-2-only syntax
         logging.error("error deleting job: %s, %s", jobname, str(e))
         delete_status.append(str(e))
示例#9
0
 def get_quotas(self, username):
     """Return the resource quotas of the user's namespace as a dict."""
     namespace = utils.email_escape(username)
     core_api = client.CoreV1Api(
         api_client=utils.get_user_api_client(username))
     quota_list = core_api.list_namespaced_resource_quota(namespace)
     return quota_list.to_dict()
示例#10
0
 def get_pservers(self, username):
     """Return the ReplicaSets (parameter servers) in the user's
     namespace as a dict.
     """
     namespace = utils.email_escape(username)
     ext_api = client.ExtensionsV1beta1Api(
         api_client=utils.get_user_api_client(username))
     replica_sets = ext_api.list_namespaced_replica_set(namespace)
     return replica_sets.to_dict()
示例#11
0
    def get_logs(self, jobname, num_lines, worker, username):
        """Fetch logs for a paddle job.

        When *worker* is given, return that single pod's log.
        Otherwise concatenate the logs of every pod labeled
        ``paddle-job=<jobname>``, each preceded by a banner line with
        the pod name.  *num_lines*, when truthy, limits output to the
        last N lines.  API errors are returned as their string form
        rather than raised.
        """
        def _get_pod_log(api_client, namespace, pod_name, num_lines):
            # Read one pod's log; errors become the returned text.
            try:
                if num_lines:
                    return client.CoreV1Api(api_client=api_client)\
                        .read_namespaced_pod_log(
                            pod_name, namespace, tail_lines=int(num_lines))
                # BUG FIX: this branch used to reference the enclosing
                # scope's loop variable ``i`` instead of pod_name, which
                # is undefined when a specific worker is requested.
                return client.CoreV1Api(api_client=api_client)\
                    .read_namespaced_pod_log(pod_name, namespace)
            except ApiException as e:  # "except X, e" is Python-2-only syntax
                return str(e)

        namespace = utils.email_escape(username)
        api_client = utils.get_user_api_client(username)
        job_pod_list = client.CoreV1Api(api_client=api_client)\
            .list_namespaced_pod(namespace, label_selector="paddle-job=%s"%jobname)
        total_job_log = ""
        if not worker:
            for i in job_pod_list.items:
                total_job_log = "".join((total_job_log,
                    "==========================%s==========================" % i.metadata.name))
                pod_log = _get_pod_log(api_client, namespace, i.metadata.name, num_lines)
                total_job_log = "\n".join((total_job_log, pod_log))
        else:
            total_job_log = _get_pod_log(api_client, namespace, worker, num_lines)
        return total_job_log

    def get_workers(self, jobname, username):