def create_user_RBAC_permissions(username):
    """Bind the ``admin`` ClusterRole to *username* inside the user's namespace.

    Best-effort: any failure (e.g. the RoleBinding already exists on a
    repeat call) is logged and swallowed rather than raised.
    """
    namespace = utils.email_escape(username)
    rbacapi = kubernetes.client.RbacAuthorizationV1beta1Api(
        utils.get_admin_api_client())
    body = {
        "apiVersion": "rbac.authorization.k8s.io/v1beta1",
        "kind": "RoleBinding",
        "metadata": {
            "name": "%s-admin-binding" % namespace,
            "namespace": namespace
        },
        "roleRef": {
            "apiGroup": "rbac.authorization.k8s.io",
            "kind": "ClusterRole",
            "name": "admin"
        },
        "subjects": [{
            "apiGroup": "rbac.authorization.k8s.io",
            "kind": "User",
            "name": username
        }]
    }
    try:
        rbacapi.create_namespaced_role_binding(namespace, body)
    except Exception as e:  # was py2-only "except Exception, e"
        # NOTE(review): deliberately best-effort; only log the error.
        logging.error("%s", str(e))
def get_jobs(self, username):
    """Return the user's Jobs as a dict, plus a ``terminating`` list.

    When a job is deleted its pods can linger in "Terminating" for a
    while, which may make an immediate re-submit fail; collect the names
    of jobs that still own such pods.
    """
    namespace = utils.email_escape(username)
    user_api = utils.get_user_api_client(username)
    job_list = client.BatchV1Api(api_client=user_api)\
        .list_namespaced_job(namespace)
    pod_list = client.CoreV1Api(api_client=user_api)\
        .list_namespaced_pod(namespace)
    label_keys = ("paddle-job", "paddle-job-master", "paddle-job-pserver")
    terminating_jobs = []
    for pod in pod_list.items:
        labels = pod.metadata.labels
        if not labels:
            continue
        # first matching paddle label carries the owning job's name
        jobname = next((labels[k] for k in label_keys if k in labels), "")
        if pod.metadata.deletion_timestamp and jobname \
                and jobname not in terminating_jobs:
            terminating_jobs.append(jobname)
    # NOTE: keep the original dict layout for backward compatibility
    ret_dict = copy.deepcopy(job_list.to_dict())
    ret_dict["terminating"] = terminating_jobs
    return ret_dict
def create_user_namespace(username):
    """Ensure the per-user namespace exists along with the secrets jobs need.

    Creates, when absent: the namespace itself, one ceph admin-key Secret
    per datacenter configured with fstype "cephfs", and the docker
    registry pull Secret. Returns the escaped namespace name.
    """
    v1api = kubernetes.client.CoreV1Api(utils.get_admin_api_client())
    user_namespace = utils.email_escape(username)
    # must commit to the user's own namespace
    existing = [ns.metadata.name for ns in v1api.list_namespace().items]
    if user_namespace not in existing:
        v1api.create_namespace({
            "apiVersion": "v1",
            "kind": "Namespace",
            "metadata": {"name": user_namespace}
        })
    # NOTE(review): in the collapsed original the registry-secret section's
    # nesting is ambiguous; it uses secret_names, so it is kept inside the
    # datacenter loop (idempotent thanks to the existence check).
    for dc, cfg in settings.DATACENTERS.items():
        # create datacenter secret if it does not exist
        secret_names = [
            item.metadata.name
            for item in v1api.list_namespaced_secret(user_namespace).items
        ]
        # Kubernetes Secret holding the ceph admin key
        if cfg["fstype"] == "cephfs" and cfg["secret"] not in secret_names:
            with open(cfg["admin_key"], "r") as f:
                encoded = base64.b64encode(f.read())
            v1api.create_namespaced_secret(user_namespace, {
                "apiVersion": "v1",
                "kind": "Secret",
                "metadata": {"name": cfg["secret"]},
                "data": {"key": encoded}
            })
        # docker registry pull secret
        registry_secret = settings.JOB_DOCKER_IMAGE.get(
            "registry_secret", None)
        if registry_secret and registry_secret not in secret_names:
            docker_config = settings.JOB_DOCKER_IMAGE["docker_config"]
            encoded_cfg = base64.b64encode(json.dumps(docker_config))
            v1api.create_namespaced_secret(user_namespace, {
                "apiVersion": "v1",
                "kind": "Secret",
                "metadata": {"name": registry_secret},
                "data": {".dockerconfigjson": encoded_cfg},
                "type": "kubernetes.io/dockerconfigjson"
            })
    return user_namespace
def _valid_and_fill(self, paddlejob, username):
    """Validate *paddlejob* and fill in defaults before submission.

    Sets up volumes, defaults the registry secret and docker image,
    verifies the job package exists in cloud storage, enforces the
    per-user GPU quota, and mounts the Nvidia libraries for GPU jobs.

    Raises:
        Exception: if the job package is missing from cloud storage or
            the GPU request would exceed the user's quota.
    """
    namespace = utils.email_escape(username)
    api_client = utils.get_user_api_client(username)
    self.__setup_volumes(paddlejob, username)
    if not paddlejob.registry_secret:
        paddlejob.registry_secret = settings.JOB_DOCKER_IMAGE.get(
            "registry_secret", None)
    if not paddlejob.image:
        if paddlejob.gpu > 0:
            paddlejob.image = settings.JOB_DOCKER_IMAGE["image_gpu"]
        else:
            paddlejob.image = settings.JOB_DOCKER_IMAGE["image"]
    # jobPackage validation: startwith /pfs
    # NOTE: job packages are uploaded to /pfs/[dc]/home/[user]/jobs/[jobname]
    package_in_pod = os.path.join(
        "/pfs/%s/home/%s" % (paddlejob.dc, username), "jobs", paddlejob.name)
    logging.info("valid_and_fill: current package: %s", package_in_pod)
    # package must be ready before submitting a job; try both storage layouts
    current_package_path = package_in_pod.replace(
        "/pfs/%s/home" % paddlejob.dc, settings.STORAGE_PATH)
    if not os.path.exists(current_package_path):
        current_package_path = package_in_pod.replace(
            "/pfs/%s/home/%s" % (paddlejob.dc, username),
            settings.STORAGE_PATH)
        if not os.path.exists(current_package_path):
            raise Exception(
                "package not exist in cloud: %s" % current_package_path)
    logging.info("valid_and_fill: current package in pod: %s",
                 current_package_path)
    # GPU quota management
    # TODO(Yancey1989) We should move this to Kubernetes
    if 'GPU_QUOTA' in dir(settings) and int(paddlejob.gpu) > 0:
        gpu_usage = 0
        pods = client.CoreV1Api(api_client=api_client).list_namespaced_pod(
            namespace=namespace)
        for pod in pods.items:
            # only count trainer GPU usage; pservers do not use GPU
            if pod.metadata.labels and 'paddle-job' in pod.metadata.labels and \
                    pod.status.phase == 'Running':
                gpu_usage += int(pod.spec.containers[0].resources.limits.get(
                    'alpha.kubernetes.io/nvidia-gpu', '0'))
        if username in settings.GPU_QUOTA:
            gpu_quota = settings.GPU_QUOTA[username]['limit']
        else:
            gpu_quota = settings.GPU_QUOTA['DEFAULT']['limit']
        gpu_available = gpu_quota - gpu_usage
        gpu_request = int(paddlejob.gpu) * int(paddlejob.parallelism)
        logging.info('gpu available: %d, gpu request: %d' %
                     (gpu_available, gpu_request))
        if gpu_available < gpu_request:
            # FIX: typo "enought" -> "enough" in the user-facing message
            raise Exception("You don't have enough GPU quota," +
                            "request: %d, usage: %d, limit: %d" %
                            (gpu_request, gpu_usage, gpu_quota))
    # add Nvidia lib volume if training with GPU
    if paddlejob.gpu > 0:
        paddlejob.volumes.append(volume.get_volume_config(
            fstype=settings.FSTYPE_HOSTPATH,
            name="nvidia-libs",
            mount_path="/usr/local/nvidia/lib64",
            host_path=settings.NVIDIA_LIB_PATH))
def get_workers(self, jobname, username):
    """Return the user's pods as a dict; when *jobname* is given, only
    the pods labeled ``paddle-job=<jobname>``."""
    namespace = utils.email_escape(username)
    core_api = client.CoreV1Api(
        api_client=utils.get_user_api_client(username))
    if jobname:
        job_pod_list = core_api.list_namespaced_pod(
            namespace, label_selector="paddle-job=%s" % jobname)
    else:
        job_pod_list = core_api.list_namespaced_pod(namespace)
    return job_pod_list.to_dict()
def _create_traingingjobs(self, body, username):
    """POST *body* as a TrainingJob custom resource in the user's namespace.

    Goes through the raw ``call_api`` since TrainingJob is a CRD with no
    generated client method. Returns the decoded JSON response body.
    """
    namespace = utils.email_escape(username)
    api_client = utils.get_user_api_client(username)
    resource_path = (
        '/apis/paddlepaddle.org/v1/namespaces/' + namespace + '/trainingjobs')
    header_params = {
        'Accept': api_client.select_header_accept(['application/json']),
        'Content-Type': api_client.select_header_content_type(['*/*']),
    }
    resp, code, header = api_client.call_api(
        resource_path, 'POST', {'namespace': namespace}, {}, header_params,
        body, [], _preload_content=False)
    return json.loads(resp.data.decode('utf-8'))
def submit_job(self, paddlejob, username):
    """Submit *paddlejob* for *username*.

    Validates the job and fills defaults first; when the fault-tolerant
    feature is on, creates the master ReplicaSet before the rest of the
    job resources.
    """
    self._valid_and_fill(paddlejob, username)
    namespace = utils.email_escape(username)
    api_client = utils.get_user_api_client(username)
    # ========== submit master ReplicaSet if using fault_tolerant feature ==
    # FIXME: alpha features in separate module
    if paddlejob.fault_tolerant:
        try:
            ret = client.ExtensionsV1beta1Api(
                api_client=api_client).create_namespaced_replica_set(
                    namespace, paddlejob.new_master_job())
        except ApiException as e:  # was py2-only "except ApiException, e"
            logging.error("error submitting master job: %s",
                          traceback.format_exc())
            raise e
def delete_job(self, jobname, username):
    """Delete the trainer Job belonging to *jobname*.

    Deletion is best-effort: API errors are logged and accumulated in
    ``delete_status`` rather than raised.
    NOTE(review): this block looks truncated here -- ``delete_status`` is
    collected but not yet returned within the visible code.
    """
    namespace = utils.email_escape(username)
    api_client = utils.get_user_api_client(username)
    if not jobname:
        return utils.simple_response(500, "must specify jobname")
    # FIXME: options needed: grace_period_seconds, orphan_dependents,
    # preconditions
    # FIXME: cascade deleting
    delete_status = []
    # delete job
    trainer_name = jobname + "-trainer"
    try:
        u_status = client.BatchV1Api(api_client=api_client)\
            .delete_namespaced_job(trainer_name, namespace, {})
    except ApiException as e:  # was py2-only "except ApiException, e"
        logging.error("error deleting job: %s, %s", jobname, str(e))
        delete_status.append(str(e))
def get_quotas(self, username):
    """Return the resource quotas in the user's namespace as a dict."""
    namespace = utils.email_escape(username)
    core_api = client.CoreV1Api(
        api_client=utils.get_user_api_client(username))
    return core_api.list_namespaced_resource_quota(namespace).to_dict()
def get_pservers(self, username):
    """Return the ReplicaSets (pserver processes) in the user's
    namespace as a dict."""
    namespace = utils.email_escape(username)
    ext_api = client.ExtensionsV1beta1Api(
        api_client=utils.get_user_api_client(username))
    replica_sets = ext_api.list_namespaced_replica_set(namespace)
    return replica_sets.to_dict()
def get_logs(self, jobname, num_lines, worker, username): def _get_pod_log(api_client, namespace, pod_name, num_lines): try: if num_lines: pod_log = client.CoreV1Api(api_client=api_client)\ .read_namespaced_pod_log( pod_name, namespace, tail_lines=int(num_lines)) else: pod_log = client.CoreV1Api(api_client=api_client)\ .read_namespaced_pod_log(i.metadata.name, namespace) return pod_log except ApiException, e: return str(e) namespace = utils.email_escape(username) api_client = utils.get_user_api_client(username) job_pod_list = client.CoreV1Api(api_client=api_client)\ .list_namespaced_pod(namespace, label_selector="paddle-job=%s"%jobname) total_job_log = "" if not worker: for i in job_pod_list.items: total_job_log = "".join((total_job_log, "==========================%s==========================" % i.metadata.name)) pod_log = _get_pod_log(api_client, namespace, i.metadata.name, num_lines) total_job_log = "\n".join((total_job_log, pod_log)) else: total_job_log = _get_pod_log(api_client, namespace, worker, num_lines) return total_job_log def get_workers(self, jobname, username):