Example #1
    def post(self, request, format=None):
        """
        Simple file upload.
        """
        file_obj = request.data['file']
        file_path = request.query_params.get("path")
        if not file_path:
            return utils.error_message_response("must specify path")
        try:
            write_file = self.__validate_path(request, file_path)
        except Exception as e:
            return utils.error_message_response("file path not valid: %s" %
                                                str(e))
Example #2
def get_paddlejob(request):
    username = request.user.username
    obj = json.loads(request.body)
    topology = obj.get("topology", "")
    entry = obj.get("entry", "")
    if not topology and not entry:
        return utils.simple_response(500, "no topology or entry specified")
    if not obj.get("datacenter"):
        return utils.simple_response(500, "no datacenter specified")
    cfgs = {}
    dc = obj.get("datacenter")
    # jobPackage validation: must start with /pfs
    # NOTE: job packages are uploaded to /pfs/[dc]/home/[user]/jobs/[jobname]
    job_name = obj.get("name", "paddle-cluster-job")
    package_in_pod = os.path.join("/pfs/%s/home/%s" % (dc, username), "jobs",
                                  job_name)

    logging.info("current package: %s", package_in_pod)
    # the package must be ready before submitting the job
    package_path_4test = package_in_pod.replace("/pfs/%s/home" % dc,
                                                settings.STORAGE_PATH)
    if not os.path.exists(package_path_4test):
        package_path_4test = package_in_pod.replace(
            "/pfs/%s/home/%s" % (dc, username), settings.STORAGE_PATH)
        if not os.path.exists(package_path_4test):
            return utils.error_message_response(
                "package not exist in cloud: %s" % package_path_4test)
    logging.info("current package in pod: %s", package_path_4test)

    envs = {}
    envs.update({"PADDLE_CLOUD_CURRENT_DATACENTER": dc})
    envs.update({"PADDLE_CLOUD_USERNAME": username})
    # ===================== create PaddleJob instance ======================
    paddle_job = PaddleJob(name=job_name,
                           job_package=package_in_pod,
                           parallelism=obj.get("parallelism", 1),
                           cpu=obj.get("cpu", 1),
                           memory=obj.get("memory", "1Gi"),
                           pservers=obj.get("pservers", 1),
                           pscpu=obj.get("pscpu", 1),
                           psmemory=obj.get("psmemory", "1Gi"),
                           topology=topology,
                           entry=entry,
                           gpu=obj.get("gpu", 0),
                           image=obj.get("image", None),
                           passes=obj.get("passes", 1),
                           registry_secret=obj.get("registry", None),
                           volumes=[],
                           envs=envs,
                           fault_tolerant=obj.get("faulttolerant", False),
                           min_instance=obj.get("min_instance", 1),
                           max_instance=obj.get("max_instance", 1),
                           etcd_image=settings.ETCD_IMAGE,
                           dc=dc)

    logging.info("return paddlejob")
    return paddle_job
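
get_paddlejob pulls every job parameter out of the JSON request body with obj.get, so the expected payload can be read straight off the code above. Below is a hypothetical request body covering those fields; the field names come from the snippet, while the concrete values are made up for illustration.

# hypothetical JSON body for get_paddlejob; values are illustrative only
example_body = {
    "name": "paddle-cluster-job",   # also names the package dir under jobs/
    "datacenter": "dc1",            # required, otherwise a 500 is returned
    "topology": "train.py",         # at least one of topology/entry is required
    "entry": "",
    "parallelism": 2,
    "cpu": 1,
    "memory": "1Gi",
    "pservers": 1,
    "pscpu": 1,
    "psmemory": "1Gi",
    "gpu": 0,
    "image": None,                  # optional custom job image
    "passes": 1,
    "registry": None,               # optional registry secret name
    "faulttolerant": False,
    "min_instance": 1,
    "max_instance": 1,
}

With this body and a hypothetical user "alice", the function expects the uploaded package at /pfs/dc1/home/alice/jobs/paddle-cluster-job inside the pod, and verifies its existence on the server by substituting settings.STORAGE_PATH for the /pfs/dc1/home prefix (falling back to replacing /pfs/dc1/home/alice).
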
Example #3
    def get(self, request, format=None):
        """
        Simple file download.
        """
        file_path = request.query_params.get("path")
        try:
            write_file = self.__validate_path(request, file_path)
        except Exception as e:
            return utils.error_message_response("file path not valid: %s" %
                                                str(e))
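
Both file endpoints above (examples #1 and #3) delegate path checking to a private __validate_path helper whose body is not part of these snippets. Purely to illustrate the kind of check the call sites imply, here is a minimal hypothetical sketch; the helper name is taken from the snippets, but the function below and its behavior are assumptions, not the project's actual implementation.

import os


def validate_pfs_path(username, file_path, storage_root="/pfs"):
    """Hypothetical path check: reject anything outside the user's /pfs home.

    Illustrative assumption only; not the project's actual __validate_path.
    """
    if not file_path or not file_path.startswith(storage_root):
        raise ValueError("path must start with %s" % storage_root)
    # collapse ".." segments so a request cannot escape the home directory
    normalized = os.path.normpath(file_path)
    if ("/home/%s/" % username) not in normalized + "/":
        raise ValueError("path is outside the user's home directory")
    return normalized
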
Example #4
    def post(self, request, format=None):
        """
        Submit a training job.
        """
        username = request.user.username
        job = get_paddlejob(request)
        p = K8sProvider()
        try:
            p.submit_trainingjobs(job, username)
        except Exception as e:
            return utils.error_message_response(str(e))
Example #5
    def post(self, request, format=None):
        """
        Submit the PaddlePaddle job
        """
        # ===== submit the master ReplicaSet if using the fault_tolerant feature =====
        username = request.user.username
        job = get_paddlejob(request)
        p = K8sProvider()
        try:
            p.submit_job(job, username)
        except Exception as e:
            return utils.error_message_response(str(e))
Example #6
    def post(self, request, format=None):
        """
        Submit the PaddlePaddle job
        """
        # submit the parameter servers as a Kubernetes ReplicaSet
        username = request.user.username
        namespace = notebook.utils.email_escape(username)
        obj = json.loads(request.body)
        topology = obj.get("topology", "")
        entry = obj.get("entry", "")
        fault_tolerant = obj.get("faulttolerant", False)
        api_client = notebook.utils.get_user_api_client(username)
        if not topology and not entry:
            return utils.simple_response(500, "no topology or entry specified")
        if not obj.get("datacenter"):
            return utils.simple_response(500, "no datacenter specified")
        cfgs = {}
        dc = obj.get("datacenter")

        volumes = []
        for k, cfg in settings.DATACENTERS.items():
            if k != dc and k != "public":
                continue
            fstype = cfg["fstype"]
            if fstype == settings.FSTYPE_CEPHFS:
                if k == "public":
                    mount_path = cfg["mount_path"] % dc
                    cephfs_path = cfg["cephfs_path"]
                else:
                    mount_path = cfg["mount_path"] % (dc, username)
                    cephfs_path = cfg["cephfs_path"] % username
                volumes.append(
                    volume.get_volume_config(
                        fstype=fstype,
                        name=k.replace("_", "-"),
                        monitors_addr=cfg["monitors_addr"],
                        secret=cfg["secret"],
                        user=cfg["user"],
                        mount_path=mount_path,
                        cephfs_path=cephfs_path,
                        admin_key=cfg["admin_key"],
                        read_only=cfg.get("read_only", False)))
            elif fstype == settings.FSTYPE_HOSTPATH:
                if k == "public":
                    mount_path = cfg["mount_path"] % dc
                    host_path = cfg["host_path"]
                else:
                    mount_path = cfg["mount_path"] % (dc, username)
                    host_path = cfg["host_path"]

                volumes.append(
                    volume.get_volume_config(fstype=fstype,
                                             name=k.replace("_", "-"),
                                             mount_path=mount_path,
                                             host_path=host_path))
            else:
                pass
        registry_secret = obj.get("registry", None)
        if not registry_secret:
            registry_secret = settings.JOB_DOCKER_IMAGE.get(
                "registry_secret", None)
        # get user specified image
        job_image = obj.get("image", None)
        gpu_count = obj.get("gpu", 0)
        # jobPackage validation: must start with /pfs
        # NOTE: job packages are uploaded to /pfs/[dc]/home/[user]/jobs/[jobname]
        job_name = obj.get("name", "paddle-cluster-job")
        if settings.STORAGE_MODE == "CEPHFS":
            package_in_pod = os.path.join("/pfs/%s/home/%s" % (dc, username),
                                          "jobs", job_name)
        elif settings.STORAGE_MODE == "HDFS":
            package_in_pod = obj.get("jobPackage")

        logging.info("current package: %s", package_in_pod)
        # the package must be ready before submitting the job
        current_package_path = package_in_pod.replace("/pfs/%s/home" % dc,
                                                      settings.STORAGE_PATH)
        if not os.path.exists(current_package_path):
            current_package_path = package_in_pod.replace(
                "/pfs/%s/home/%s" % (dc, username), settings.STORAGE_PATH)
            if not os.path.exists(current_package_path):
                return utils.error_message_response(
                    "package not exist in cloud: %s" % current_package_path)
        logging.info("current package in pod: %s", current_package_path)

        # use default images
        if not job_image:
            if gpu_count > 0:
                job_image = settings.JOB_DOCKER_IMAGE["image_gpu"]
            else:
                job_image = settings.JOB_DOCKER_IMAGE["image"]

        # add Nvidia lib volume if training with GPU
        if gpu_count > 0:
            volumes.append(
                volume.get_volume_config(fstype=settings.FSTYPE_HOSTPATH,
                                         name="nvidia-libs",
                                         mount_path="/usr/local/nvidia/lib64",
                                         host_path=settings.NVIDIA_LIB_PATH))
        envs = {}
        envs.update({"PADDLE_CLOUD_CURRENT_DATACENTER": dc})
        # ===================== create PaddleJob instance ======================
        paddle_job = PaddleJob(name=job_name,
                               job_package=package_in_pod,
                               parallelism=obj.get("parallelism", 1),
                               cpu=obj.get("cpu", 1),
                               memory=obj.get("memory", "1Gi"),
                               pservers=obj.get("pservers", 1),
                               pscpu=obj.get("pscpu", 1),
                               psmemory=obj.get("psmemory", "1Gi"),
                               topology=topology,
                               entry=entry,
                               gpu=obj.get("gpu", 0),
                               image=job_image,
                               passes=obj.get("passes", 1),
                               registry_secret=registry_secret,
                               volumes=volumes,
                               envs=envs,
                               new_pserver=fault_tolerant,
                               etcd_image=settings.ETCD_IMAGE)
        # ===== submit the master ReplicaSet if using the fault_tolerant feature =====
        # FIXME: alpha features in separate module
        if fault_tolerant:
            try:
                ret = client.ExtensionsV1beta1Api(
                    api_client=api_client).create_namespaced_replica_set(
                        namespace, paddle_job.new_master_job())
            except ApiException as e:
                logging.error("error submitting master job: %s", e)
                return utils.simple_response(500, str(e))
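
The volume loop in example #6 reads only a handful of keys from each settings.DATACENTERS entry. Below is a hypothetical configuration fragment showing those keys; the key names are taken from the cfg lookups above, while the datacenter names and values are placeholders rather than a real deployment.

# hypothetical settings.DATACENTERS fragment; values are placeholders only
DATACENTERS = {
    "dc1": {                               # matched against the requested dc
        "fstype": "cephfs",                # compared with settings.FSTYPE_CEPHFS
        "mount_path": "/pfs/%s/home/%s",   # filled with (dc, username)
        "cephfs_path": "/%s",              # filled with username
        "monitors_addr": ["10.0.0.1:6789"],
        "secret": "ceph-secret",
        "user": "admin",
        "admin_key": "/etc/ceph/admin.secret",
        "read_only": False,
    },
    "public": {                            # mounted for every job in addition to the requested dc
        "fstype": "hostpath",              # compared with settings.FSTYPE_HOSTPATH
        "mount_path": "/pfs/%s/public",    # filled with dc only
        "host_path": "/mnt/public_data",
    },
}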