def post(self, request, format=None):
    """ Simple put file.

    Reads the uploaded file object and its destination path from the
    request and validates the path before any write happens.  Returns an
    error response when the path is missing or fails validation.

    NOTE(review): the visible span ends after validation; the actual
    write of `file_obj` to `write_file` presumably follows — confirm
    against the full file.
    """
    file_obj = request.data['file']
    file_path = request.query_params.get("path")
    if not file_path:
        return utils.error_message_response("must specify path")
    try:
        # __validate_path raises when the requested path is not allowed
        # for this user (exact contract defined elsewhere in the class).
        write_file = self.__validate_path(request, file_path)
    except Exception as e:  # FIX: was py2-only "except Exception, e"
        return utils.error_message_response("file path not valid: %s" % str(e))
def get_paddlejob(request):
    """Build a PaddleJob from the JSON body of *request*.

    Validates that a topology or entry point and a datacenter are
    supplied, checks that the job package has already been uploaded to
    cloud storage, then constructs and returns the PaddleJob.

    Returns:
        A configured PaddleJob on success, or an HTTP error response
        (via ``utils``) when validation fails — callers must handle both.
    """
    username = request.user.username
    obj = json.loads(request.body)
    topology = obj.get("topology", "")
    entry = obj.get("entry", "")
    if not topology and not entry:
        return utils.simple_response(500, "no topology or entry specified")
    if not obj.get("datacenter"):
        return utils.simple_response(500, "no datacenter specified")
    dc = obj.get("datacenter")
    # jobPackage validation: startwith /pfs
    # NOTE: job packages are uploaded to /pfs/[dc]/home/[user]/jobs/[jobname]
    job_name = obj.get("name", "paddle-cluster-job")
    package_in_pod = os.path.join("/pfs/%s/home/%s" % (dc, username), "jobs",
                                  job_name)
    logging.info("current package: %s", package_in_pod)
    # package must be ready before submit a job: map the in-pod /pfs path
    # to the server-local storage path and check it exists.  First try the
    # layout that keeps the per-user "home" segment, then fall back to the
    # layout that strips it.
    package_path_4test = package_in_pod.replace("/pfs/%s/home" % dc,
                                                settings.STORAGE_PATH)
    if not os.path.exists(package_path_4test):
        package_path_4test = package_in_pod.replace(
            "/pfs/%s/home/%s" % (dc, username), settings.STORAGE_PATH)
        if not os.path.exists(package_path_4test):
            return utils.error_message_response(
                "package not exist in cloud: %s" % package_path_4test)
    logging.info("current package in pod: %s", package_path_4test)
    envs = {}
    envs.update({"PADDLE_CLOUD_CURRENT_DATACENTER": dc})
    envs.update({"PADDLE_CLOUD_USERNAME": username})
    # ===================== create PaddleJob instance ======================
    paddle_job = PaddleJob(name=job_name,
                           job_package=package_in_pod,
                           parallelism=obj.get("parallelism", 1),
                           cpu=obj.get("cpu", 1),
                           memory=obj.get("memory", "1Gi"),
                           pservers=obj.get("pservers", 1),
                           pscpu=obj.get("pscpu", 1),
                           psmemory=obj.get("psmemory", "1Gi"),
                           topology=topology,
                           entry=entry,
                           gpu=obj.get("gpu", 0),
                           image=obj.get("image", None),
                           passes=obj.get("passes", 1),
                           registry_secret=obj.get("registry", None),
                           volumes=[],
                           envs=envs,
                           fault_tolerant=obj.get("faulttolerant", False),
                           min_instance=obj.get("min_instance", 1),
                           max_instance=obj.get("max_instance", 1),
                           etcd_image=settings.ETCD_IMAGE,
                           dc=dc)
    logging.info("return paddlejob")
    # BUG FIX: the original ended with a bare "return" followed by an
    # unreachable "paddle_job" expression statement, so the function
    # always returned None.  Return the job object explicitly.
    # (Also dropped the unused local "cfgs = {}".)
    return paddle_job
def get(self, request, format=None):
    """ Simple get file.

    Validates the requested path before serving the file.  Returns an
    error response when validation fails.

    NOTE(review): the visible span ends after validation; the code that
    actually streams the file presumably follows — confirm against the
    full file.
    """
    file_path = request.query_params.get("path")
    try:
        write_file = self.__validate_path(request, file_path)
    except Exception as e:  # FIX: was py2-only "except Exception, e"
        return utils.error_message_response("file path not valid: %s" % str(e))
def post(self, request, format=None):
    """ Submit a trainingjobs.

    Builds a PaddleJob from the request body and hands it to the
    Kubernetes provider.  Returns an error response when submission
    raises.
    """
    username = request.user.username
    # NOTE(review): get_paddlejob may return an HTTP error response
    # instead of a job on validation failure; that value is passed
    # straight to the provider here — verify upstream handling.
    job = get_paddlejob(request)
    p = K8sProvider()
    try:
        p.submit_trainingjobs(job, username)
    except Exception as e:  # FIX: was py2-only "except Exception, e"
        return utils.error_message_response(str(e))
def post(self, request, format=None):
    """ Submit the PaddlePaddle job

    Builds a PaddleJob from the request body and submits it via the
    Kubernetes provider.  Returns an error response when submission
    raises.
    """
    # ========== submit master ReplicaSet if using fault_tolerant feature ==
    username = request.user.username
    # NOTE(review): get_paddlejob may return an HTTP error response
    # instead of a job on validation failure; that value is passed
    # straight to the provider here — verify upstream handling.
    job = get_paddlejob(request)
    p = K8sProvider()
    try:
        p.submit_job(job, username)
    except Exception as e:  # FIX: was py2-only "except Exception, e"
        return utils.error_message_response(str(e))
def post(self, request, format=None):
    """ Submit the PaddlePaddle job

    Parses the job spec from the request body, assembles storage volumes
    for the target datacenter, resolves the job image and package path,
    builds a PaddleJob, and (when fault tolerant) submits the master
    ReplicaSet to Kubernetes.

    NOTE(review): the visible span ends inside the fault_tolerant branch;
    submission of trainer/pserver resources presumably follows — confirm
    against the full file.
    """
    # submit parameter server, it's Kubernetes ReplicaSet
    username = request.user.username
    namespace = notebook.utils.email_escape(username)
    obj = json.loads(request.body)
    topology = obj.get("topology", "")
    entry = obj.get("entry", "")
    fault_tolerant = obj.get("faulttolerant", False)
    api_client = notebook.utils.get_user_api_client(username)
    if not topology and not entry:
        return utils.simple_response(500, "no topology or entry specified")
    if not obj.get("datacenter"):
        return utils.simple_response(500, "no datacenter specified")
    dc = obj.get("datacenter")
    # Build volume configs for the requested datacenter plus the shared
    # "public" datacenter; all other entries are skipped.
    volumes = []
    for k, cfg in settings.DATACENTERS.items():
        if k != dc and k != "public":
            continue
        fstype = cfg["fstype"]
        if fstype == settings.FSTYPE_CEPHFS:
            # "public" mounts are shared (path keyed only by dc); user
            # datacenters get per-user mount and cephfs paths.
            if k == "public":
                mount_path = cfg["mount_path"] % dc
                cephfs_path = cfg["cephfs_path"]
            else:
                mount_path = cfg["mount_path"] % (dc, username)
                cephfs_path = cfg["cephfs_path"] % username
            volumes.append(
                volume.get_volume_config(
                    fstype=fstype,
                    name=k.replace("_", "-"),
                    monitors_addr=cfg["monitors_addr"],
                    secret=cfg["secret"],
                    user=cfg["user"],
                    mount_path=mount_path,
                    cephfs_path=cephfs_path,
                    admin_key=cfg["admin_key"],
                    read_only=cfg.get("read_only", False)))
        elif fstype == settings.FSTYPE_HOSTPATH:
            if k == "public":
                mount_path = cfg["mount_path"] % dc
                host_path = cfg["host_path"]
            else:
                mount_path = cfg["mount_path"] % (dc, username)
                host_path = cfg["host_path"]
            volumes.append(
                volume.get_volume_config(fstype=fstype,
                                         name=k.replace("_", "-"),
                                         mount_path=mount_path,
                                         host_path=host_path))
        else:
            # Unknown fstype: silently skipped (original behavior).
            pass
    registry_secret = obj.get("registry", None)
    if not registry_secret:
        registry_secret = settings.JOB_DOCKER_IMAGE.get(
            "registry_secret", None)
    # get user specified image
    job_image = obj.get("image", None)
    gpu_count = obj.get("gpu", 0)
    # jobPackage validation: startwith /pfs
    # NOTE: job packages are uploaded to /pfs/[dc]/home/[user]/jobs/[jobname]
    job_name = obj.get("name", "paddle-cluster-job")
    if settings.STORAGE_MODE == "CEPHFS":
        package_in_pod = os.path.join("/pfs/%s/home/%s" % (dc, username),
                                      "jobs", job_name)
    elif settings.STORAGE_MODE == "HDFS":
        package_in_pod = obj.get("jobPackage")
    else:
        # ROBUSTNESS FIX: previously an unrecognized STORAGE_MODE fell
        # through and raised NameError on package_in_pod below.
        return utils.error_message_response(
            "unsupported storage mode: %s" % settings.STORAGE_MODE)
    logging.info("current package: %s", package_in_pod)
    # package must be ready before submit a job: map the in-pod /pfs path
    # to the server-local storage path; try the user-home layout first,
    # then fall back to the stripped layout.
    current_package_path = package_in_pod.replace("/pfs/%s/home" % dc,
                                                  settings.STORAGE_PATH)
    if not os.path.exists(current_package_path):
        current_package_path = package_in_pod.replace(
            "/pfs/%s/home/%s" % (dc, username), settings.STORAGE_PATH)
        if not os.path.exists(current_package_path):
            return utils.error_message_response(
                "package not exist in cloud: %s" % current_package_path)
    logging.info("current package in pod: %s", current_package_path)
    # use default images
    if not job_image:
        if gpu_count > 0:
            job_image = settings.JOB_DOCKER_IMAGE["image_gpu"]
        else:
            job_image = settings.JOB_DOCKER_IMAGE["image"]
    # add Nvidia lib volume if training with GPU
    if gpu_count > 0:
        volumes.append(
            volume.get_volume_config(fstype=settings.FSTYPE_HOSTPATH,
                                     name="nvidia-libs",
                                     mount_path="/usr/local/nvidia/lib64",
                                     host_path=settings.NVIDIA_LIB_PATH))
    envs = {}
    envs.update({"PADDLE_CLOUD_CURRENT_DATACENTER": dc})
    # ===================== create PaddleJob instance ======================
    # (Dropped the unused local "cfgs = {}" from the original.)
    paddle_job = PaddleJob(name=job_name,
                           job_package=package_in_pod,
                           parallelism=obj.get("parallelism", 1),
                           cpu=obj.get("cpu", 1),
                           memory=obj.get("memory", "1Gi"),
                           pservers=obj.get("pservers", 1),
                           pscpu=obj.get("pscpu", 1),
                           psmemory=obj.get("psmemory", "1Gi"),
                           topology=topology,
                           entry=entry,
                           gpu=obj.get("gpu", 0),
                           image=job_image,
                           passes=obj.get("passes", 1),
                           registry_secret=registry_secret,
                           volumes=volumes,
                           envs=envs,
                           new_pserver=fault_tolerant,
                           etcd_image=settings.ETCD_IMAGE)
    # ========== submit master ReplicaSet if using fault_tolerant feature ==
    # FIXME: alpha features in separate module
    if fault_tolerant:
        try:
            ret = client.ExtensionsV1beta1Api(
                api_client=api_client).create_namespaced_replica_set(
                    namespace, paddle_job.new_master_job())
        except ApiException as e:  # FIX: was py2-only "except ApiException, e"
            logging.error("error submitting master job: %s", e)
            return utils.simple_response(500, str(e))