Exemplo n.º 1
0
class JobsView(APIView):
    """
    List and submit jobs on behalf of the authenticated user.

    Access is restricted to authenticated users via ``permission_classes``.
    """
    permission_classes = (permissions.IsAuthenticated, )

    def get(self, request, format=None):
        """
        Return whatever K8sProvider.get_jobs reports for the requesting user
        """
        username = request.user.username
        p = K8sProvider()
        ret_dict = p.get_jobs(username)
        return Response(ret_dict)

    def post(self, request, format=None):
        """
        Submit the PaddlePaddle job

        The job is built from the request by get_paddlejob and handed to
        K8sProvider.submit_job. Any exception raised while submitting is
        turned into an error message response; on success an empty 200
        simple response is returned.
        """
        username = request.user.username
        job = get_paddlejob(request)
        p = K8sProvider()
        try:
            p.submit_job(job, username)
        # "except X as e" is valid on Python 2.6+ and Python 3, unlike the
        # old comma form.
        except Exception as e:
            return utils.error_message_response(str(e))

        return utils.simple_response(200, "")
Exemplo n.º 2
0
class RegistryView(APIView):
    """
    Manage docker registry secrets for the authenticated user.
    """
    permission_classes = (permissions.IsAuthenticated, )

    def post(self, request):
        """
        Create a registry secret

        The JSON request body is read for the keys "name", "username",
        "password" and "server". A secret of type
        kubernetes.io/dockerconfigjson is created in the namespace derived
        from the requesting user's name. Returns an empty 200 simple
        response on success, or a 500 simple response carrying the
        ApiException text on failure.
        """
        username = request.user.username
        user_namespace = notebook.utils.email_escape(username)
        api_client = notebook.utils.get_user_api_client(username)
        obj = json.loads(request.body)
        name = obj.get("name")
        docker_username = obj.get("username")
        docker_password = obj.get("password")
        docker_server = obj.get("server")
        cfg = docker_cfg(docker_username, docker_password, username,
                         docker_server)
        secret_body = {
            "apiVersion": "v1",
            "kind": "Secret",
            "metadata": {
                "name": name
            },
            "data": {
                # NOTE(review): on Python 3 b64encode needs bytes and returns
                # bytes -- confirm what docker_cfg returns before porting.
                ".dockerconfigjson": base64.b64encode(cfg)
            },
            "type": "kubernetes.io/dockerconfigjson"
        }
        try:
            client.CoreV1Api(
                api_client=api_client).create_namespaced_secret(
                    namespace=user_namespace, body=secret_body)
        except ApiException as e:
            logging.error("Failed when create secret.")
            return utils.simple_response(500, str(e))
        return utils.simple_response(200, "")
Exemplo n.º 3
0
def login(request):
    """Log a user in from the POSTed "login" (email) and "password" fields.

    Responses (all built with simple_response):
      405 -- the request method is not POST
      200 -- request.user.id is not -1 (no credentials are checked in that
             case), or the credentials were accepted
      400 -- a field is missing or the email fails validate_email
      401 -- no user has that email, or user.login(password) was falsy

    On success the user's id is stored in the session under "id".
    """
    if request.method != "POST":
        return simple_response(405)
    if request.user.id != -1:
        return simple_response(200)

    data = request.POST
    try:
        password = data["password"]
        email = data["login"]
        validate_email(email)
        user = User.objects.get(email=email)
        authenticated = user.login(password)
    except (KeyError, ValidationError):
        return simple_response(400)
    except ObjectDoesNotExist:
        return simple_response(401)

    if not authenticated:
        # Previously this path fell off the end of the view and returned
        # None instead of a response.
        return simple_response(401)
    request.session["id"] = user.id
    return simple_response(200)
Exemplo n.º 4
0
_STORAGE_KEY_ALPHABET = frozenset("abcdefghijklmnopqrstuvwxyz_")
_STORAGE_KEY_MAX_LENGTH = 32


def _is_valid_storage_key(key):
    """Return True if key has at most 32 chars, all lowercase a-z or "_".

    NOTE(review): an empty key passes this check, as it did before the
    validation was extracted -- confirm whether that is intended.
    """
    if len(key) > _STORAGE_KEY_MAX_LENGTH:
        return False
    return all(ch in _STORAGE_KEY_ALPHABET for ch in key)


def storage(request):
    """Create, read, update or delete a Storage entry selected by "key".

    Parameters are read from request.POST, falling back to request.GET when
    request.POST is empty. The key is lower-cased before use.

    POST   -- create; 403 if request.user.status > 2, 400 on missing/invalid
              parameters, 422 if the key already exists, otherwise 201.
              "access" defaults to 5.
    GET    -- read; 400 without a key, 404 if unknown, 403 unless
              request.user.status <= the entry's access, otherwise the value.
    PUT    -- update; 403 if request.user.status > 2, 400 on missing/invalid
              parameters, 422 if the key is unknown, otherwise 200.
    DELETE -- delete; 403 if request.user.status > 2, 400 without a key,
              404 if unknown, otherwise 200.
    Any other method is answered with 405.

    Successful writes set request.in_history and request.history.

    NOTE(review): if this is a Django view, request.POST is only populated
    for POST requests, so PUT/DELETE parameters would come from the query
    string -- confirm against the callers.
    """
    data = request.POST or request.GET
    if request.method == "POST":
        if request.user.status > 2:
            return simple_response(403)
        try:
            key = data["key"].lower()
            value = data["value"]
        except KeyError:
            return simple_response(400)
        access = data.get("access", 5)
        if not _is_valid_storage_key(key):
            return simple_response(400)
        try:
            Storage.objects.get(key=key)
        except ObjectDoesNotExist:
            pass
        else:
            # the key is already taken
            return simple_response(422)
        Storage(key=key, value=value, access=access).save()
        request.in_history = True
        request.history = {
            "action": "create",
            "section": "core",
            "object": "storage",
            "params": {
                key: value,
                "access": access
            }
        }
        return simple_response(201)
    elif request.method == "GET":
        try:
            key = data["key"].lower()
            s = Storage.objects.get(key=key)
            if request.user.status <= s.access:
                return simple_response(data={key: s.value})
            return simple_response(403)
        except KeyError:
            return simple_response(400)
        except ObjectDoesNotExist:
            return simple_response(404)
    elif request.method == "PUT":
        if request.user.status > 2:
            return simple_response(403)
        try:
            key = data["key"].lower()
            value = data["value"]
        except KeyError:
            return simple_response(400)
        if not _is_valid_storage_key(key):
            return simple_response(400)
        try:
            s = Storage.objects.get(key=key)
            s.change_value(value)
        except ObjectDoesNotExist:
            return simple_response(422)
        request.in_history = True
        request.history = {
            "action": "update",
            "section": "core",
            "object": "storage",
            "params": {
                key: value,
                "access": s.access
            }
        }
        return simple_response(200)
    elif request.method == "DELETE":
        if request.user.status > 2:
            return simple_response(403)
        try:
            key = data["key"].lower()
            Storage.objects.get(key=key).delete()
        except KeyError:
            return simple_response(400)
        except ObjectDoesNotExist:
            return simple_response(404)
        request.in_history = True
        request.history = {
            "action": "delete",
            "section": "core",
            "object": "storage",
            "params": {
                "key": key
            }
        }
        return simple_response(200)
    # Unsupported methods previously fell through and returned None.
    return simple_response(405)
Exemplo n.º 5
0
def logout(request):
    """Flush the request's session and answer with a 200 simple response."""
    session = request.session
    session.flush()
    return simple_response(200)
Exemplo n.º 6
0
def check(request):
    """Answer 401 when request.user.id is -1, otherwise 200."""
    status_code = 401 if request.user.id == -1 else 200
    return simple_response(status_code)
Exemplo n.º 7
0
        """
        username = username = request.user.username
        user_namespace = notebook.utils.email_escape(username)
        api_client = notebook.utils.get_user_api_client(username)
        obj = json.loads(request.body)
        name = obj.get("name")
        try:
            ret = client.CoreV1Api(
                api_client=api_client).delete_namespaced_secret(
                    name=name,
                    namespace=user_namespace,
                    body=client.V1DeleteOptions())
        except ApiException, e:
            logging.error("Failed when delete secret.")
            return utils.simple_response(500, str(e))
        return utils.simple_response(200, "")

    def get(self, request):
        """
        List the secrets in the requesting user's namespace

        Returns a 200 simple response carrying the secret list converted
        with to_dict(), or a 500 simple response carrying the ApiException
        text when the listing call fails.
        """
        username = request.user.username
        user_namespace = notebook.utils.email_escape(username)
        api_client = notebook.utils.get_user_api_client(username)
        try:
            secret_list = client.CoreV1Api(
                api_client=api_client).list_namespaced_secret(
                    namespace=user_namespace)
        except ApiException as e:
            logging.error("Failed when list secrets.")
            # Without this return the error path yields None instead of a
            # response.
            return utils.simple_response(500, str(e))
        return utils.simple_response(200, secret_list.to_dict())
Exemplo n.º 8
0
    def post(self, request, format=None):
        """
        Submit the PaddlePaddle job

        The JSON request body must provide "datacenter" and at least one of
        "topology" / "entry"; otherwise a 500 simple response is returned.
        Volumes are collected for the requested datacenter and for the
        "public" entry of settings.DATACENTERS, the job package location is
        resolved and checked to exist on disk, and a PaddleJob is built from
        the remaining request fields. When "faulttolerant" is set, the
        master ReplicaSet is created; an ApiException there yields a 500
        simple response.
        """
        username = request.user.username
        namespace = notebook.utils.email_escape(username)
        obj = json.loads(request.body)
        topology = obj.get("topology", "")
        entry = obj.get("entry", "")
        fault_tolerant = obj.get("faulttolerant", False)
        api_client = notebook.utils.get_user_api_client(username)
        if not topology and not entry:
            return utils.simple_response(500, "no topology or entry specified")
        dc = obj.get("datacenter")
        if not dc:
            return utils.simple_response(500, "no datacenter specified")

        volumes = []
        for k, cfg in settings.DATACENTERS.items():
            # Only the requested datacenter and the "public" entry are used.
            if k not in (dc, "public"):
                continue
            is_public = k == "public"
            volume_name = k.replace("_", "-")
            fstype = cfg["fstype"]
            if fstype == settings.FSTYPE_CEPHFS:
                if is_public:
                    mount_path = cfg["mount_path"] % dc
                    cephfs_path = cfg["cephfs_path"]
                else:
                    mount_path = cfg["mount_path"] % (dc, username)
                    cephfs_path = cfg["cephfs_path"] % username
                volumes.append(
                    volume.get_volume_config(
                        fstype=fstype,
                        name=volume_name,
                        monitors_addr=cfg["monitors_addr"],
                        secret=cfg["secret"],
                        user=cfg["user"],
                        mount_path=mount_path,
                        cephfs_path=cephfs_path,
                        admin_key=cfg["admin_key"],
                        read_only=cfg.get("read_only", False)))
            elif fstype == settings.FSTYPE_HOSTPATH:
                if is_public:
                    mount_path = cfg["mount_path"] % dc
                else:
                    mount_path = cfg["mount_path"] % (dc, username)
                # host_path is the same for public and per-user entries
                host_path = cfg["host_path"]
                volumes.append(
                    volume.get_volume_config(fstype=fstype,
                                             name=volume_name,
                                             mount_path=mount_path,
                                             host_path=host_path))
            # any other fstype contributes no volume

        registry_secret = obj.get("registry", None)
        if not registry_secret:
            registry_secret = settings.JOB_DOCKER_IMAGE.get(
                "registry_secret", None)
        # get user specified image
        job_image = obj.get("image", None)
        gpu_count = obj.get("gpu", 0)
        # jobPackage validation: startwith /pfs
        # NOTE: job packages are uploaded to /pfs/[dc]/home/[user]/jobs/[jobname]
        job_name = obj.get("name", "paddle-cluster-job")
        if settings.STORAGE_MODE == "CEPHFS":
            package_in_pod = os.path.join("/pfs/%s/home/%s" % (dc, username),
                                          "jobs", job_name)
        elif settings.STORAGE_MODE == "HDFS":
            package_in_pod = obj.get("jobPackage")
            if not package_in_pod:
                # Without this guard the .replace() below fails on None.
                return utils.error_message_response("no jobPackage specified")
        else:
            # Previously package_in_pod was left unbound for other modes.
            return utils.error_message_response(
                "unsupported storage mode: %s" % settings.STORAGE_MODE)

        logging.info("current package: %s", package_in_pod)
        # package must be ready before submit a job; try the path with the
        # "/pfs/<dc>/home" prefix swapped for STORAGE_PATH first, then with
        # the "/pfs/<dc>/home/<user>" prefix swapped.
        current_package_path = package_in_pod.replace("/pfs/%s/home" % dc,
                                                      settings.STORAGE_PATH)
        if not os.path.exists(current_package_path):
            current_package_path = package_in_pod.replace(
                "/pfs/%s/home/%s" % (dc, username), settings.STORAGE_PATH)
            if not os.path.exists(current_package_path):
                return utils.error_message_response(
                    "package not exist in cloud: %s" % current_package_path)
        logging.info("current package in pod: %s", current_package_path)

        # use default images
        if not job_image:
            if gpu_count > 0:
                job_image = settings.JOB_DOCKER_IMAGE["image_gpu"]
            else:
                job_image = settings.JOB_DOCKER_IMAGE["image"]

        # add Nvidia lib volume if training with GPU
        if gpu_count > 0:
            volumes.append(
                volume.get_volume_config(fstype=settings.FSTYPE_HOSTPATH,
                                         name="nvidia-libs",
                                         mount_path="/usr/local/nvidia/lib64",
                                         host_path=settings.NVIDIA_LIB_PATH))
        envs = {"PADDLE_CLOUD_CURRENT_DATACENTER": dc}
        # ===================== create PaddleJob instance ======================
        paddle_job = PaddleJob(name=job_name,
                               job_package=package_in_pod,
                               parallelism=obj.get("parallelism", 1),
                               cpu=obj.get("cpu", 1),
                               memory=obj.get("memory", "1Gi"),
                               pservers=obj.get("pservers", 1),
                               pscpu=obj.get("pscpu", 1),
                               psmemory=obj.get("psmemory", "1Gi"),
                               topology=topology,
                               entry=entry,
                               gpu=gpu_count,
                               image=job_image,
                               passes=obj.get("passes", 1),
                               registry_secret=registry_secret,
                               volumes=volumes,
                               envs=envs,
                               new_pserver=fault_tolerant,
                               etcd_image=settings.ETCD_IMAGE)
        # ========== submit master ReplicaSet if using fault_tolerant feature ==
        # FIXME: alpha features in separate module
        if fault_tolerant:
            try:
                client.ExtensionsV1beta1Api(
                    api_client=api_client).create_namespaced_replica_set(
                        namespace, paddle_job.new_master_job())
            except ApiException as e:
                logging.error("error submitting master job: %s", e)
                return utils.simple_response(500, str(e))
Exemplo n.º 9
0
        # ========================= submit pserver job =========================
        try:
            ret = client.ExtensionsV1beta1Api(
                api_client=api_client).create_namespaced_replica_set(
                    namespace, paddle_job.new_pserver_job())
        except ApiException, e:
            logging.error("error submitting pserver job: %s ", e)
            return utils.simple_response(500, str(e))
        # ========================= submit trainer job =========================
        try:
            ret = client.BatchV1Api(
                api_client=api_client).create_namespaced_job(
                    namespace, paddle_job.new_trainer_job())
        except ApiException, e:
            logging.error("error submitting trainer job: %s" % e)
            return utils.simple_response(500, str(e))

        # TODO(typhoonzero): stop master and pservers when job finish or fails

        return utils.simple_response(200, "")

    def delete(self, request, format=None):
        """
        Kill a job
        """
        username = request.user.username
        namespace = notebook.utils.email_escape(username)
        obj = json.loads(request.body)
        jobname = obj.get("jobname")
        api_client = notebook.utils.get_user_api_client(username)
        if not jobname: