Exemplo n.º 1
0
    def _create_instance_manager(self, args):
        """Build an ``InstanceManager`` that launches worker and PS pods.

        Assembles the shell command lines for the worker processes and for
        the parameter-server processes (either the Go PS binary or the
        Python PS module, depending on ``args.use_go_ps``), then wraps them
        into an ``InstanceManager``.

        Args:
            args: parsed command-line arguments carrying the job/cluster
                configuration (worker image, resources, PS options, envs...).

        Returns:
            An ``InstanceManager`` instance, or ``None`` when
            ``args.num_workers`` is falsy (no pods to manage).
        """
        instance_manager = None

        # Pods run their payload through bash so that the pipefail prefix
        # and the `source ... &&` chaining in the argument strings work.
        container_command = ["/bin/bash"]
        if args.num_workers:
            assert args.worker_image, "Worker image cannot be empty"

            worker_client_command = (
                BashCommandTemplate.SET_PIPEFAIL
                + " python -m elasticdl.python.worker.main"
            )
            worker_args = [
                "--master_addr",
                self.master_addr,
                "--job_type",
                self.job_type,
            ]
            # Forward every parsed CLI argument except `envs`, which is
            # translated to pod env vars below instead.
            worker_args.extend(
                build_arguments_from_parsed_result(args, filter_args=["envs"])
            )
            worker_args = wrap_python_args_with_string(worker_args)
            # Command must precede its arguments in the final `-c` string.
            worker_args.insert(0, worker_client_command)

            if args.use_go_ps:
                # Go PS binary takes single-dash `-key=value` style flags.
                opt_type, opt_args = get_optimizer_info(self.optimizer)
                ps_command = "elasticdl_ps"
                ps_command_args = [
                    "-job_name=" + args.job_name,
                    "-namespace=" + args.namespace,
                    "-master_addr=" + self.master_addr,
                    "-port=2222",
                    "-use_async=" + ("true" if args.use_async else "false"),
                    "-grads_to_wait=" + str(args.grads_to_wait),
                    "-lr_staleness_modulation="
                    + ("true" if args.lr_staleness_modulation else "false"),
                    "-sync_version_tolerance="
                    + str(args.sync_version_tolerance),
                    "-evaluation_steps=" + str(args.evaluation_steps),
                    "-num_ps_pods=" + str(args.num_ps_pods),
                    "-num_workers=" + str(args.num_workers),
                    "-checkpoint_dir=" + str(args.checkpoint_dir),
                    "-checkpoint_steps=" + str(args.checkpoint_steps),
                    "-keep_checkpoint_max=" + str(args.keep_checkpoint_max),
                    "-checkpoint_dir_for_init="
                    + str(args.checkpoint_dir_for_init),
                    "-opt_type=" + opt_type,
                    "-opt_args=" + opt_args,
                ]
                ps_command_args = wrap_go_args_with_string(ps_command_args)
                # Execute source /root/.bashrc to add the file path
                # of `elasticdl_ps` into the PATH environment variable.
                ps_args = [
                    "source",
                    "/root/.bashrc_elasticdl",
                    "&&",
                    ps_command,
                ]
                ps_args.extend(ps_command_args)
            else:
                # Python PS: same double-dash argument style as the worker.
                ps_command = (
                    BashCommandTemplate.SET_PIPEFAIL
                    + " python -m elasticdl.python.ps.main"
                )
                ps_command_args = [
                    "--grads_to_wait",
                    str(args.grads_to_wait),
                    "--lr_staleness_modulation",
                    str(args.lr_staleness_modulation),
                    "--sync_version_tolerance",
                    str(args.sync_version_tolerance),
                    "--use_async",
                    str(args.use_async),
                    "--model_zoo",
                    args.model_zoo,
                    "--model_def",
                    args.model_def,
                    "--job_name",
                    args.job_name,
                    "--port",
                    "2222",
                    "--master_addr",
                    self.master_addr,
                    "--namespace",
                    args.namespace,
                    "--evaluation_steps",
                    str(args.evaluation_steps),
                    "--checkpoint_dir",
                    str(args.checkpoint_dir),
                    "--checkpoint_steps",
                    str(args.checkpoint_steps),
                    "--keep_checkpoint_max",
                    str(args.keep_checkpoint_max),
                    "--num_ps_pods",
                    str(args.num_ps_pods),
                    "--checkpoint_dir_for_init",
                    str(args.checkpoint_dir_for_init),
                    "--num_workers",
                    str(args.num_workers),
                    "--log_level",
                    str(args.log_level),
                    "--minibatch_size",
                    str(args.minibatch_size),
                    "--num_minibatches_per_task",
                    str(args.num_minibatches_per_task),
                ]
                ps_args = wrap_python_args_with_string(ps_command_args)
                ps_args.insert(0, ps_command)

            # Collapse each arg list to a single `bash -c "<command>"` string.
            worker_args = ["-c", " ".join(worker_args)]
            ps_args = ["-c", " ".join(ps_args)]

            # Translate the `envs` CLI string into kubernetes V1EnvVar objects.
            env_dict = parse_envs(args.envs)
            env = []
            for key in env_dict:
                env.append(V1EnvVar(name=key, value=env_dict[key]))

            kwargs = get_dict_from_params_str(args.aux_params)
            disable_relaunch = kwargs.get("disable_relaunch", False)
            cluster_spec = self._get_image_cluster_spec(args.cluster_spec)

            instance_manager = InstanceManager(
                self.task_d,
                job_name=args.job_name,
                image_name=args.worker_image,
                worker_command=container_command,
                worker_args=worker_args,
                namespace=args.namespace,
                num_workers=args.num_workers,
                worker_resource_request=args.worker_resource_request,
                worker_resource_limit=args.worker_resource_limit,
                worker_pod_priority=args.worker_pod_priority,
                num_ps=args.num_ps_pods,
                ps_command=container_command,
                ps_args=ps_args,
                ps_resource_request=args.ps_resource_request,
                ps_resource_limit=args.ps_resource_limit,
                ps_pod_priority=args.ps_pod_priority,
                volume=args.volume,
                image_pull_policy=args.image_pull_policy,
                restart_policy=args.restart_policy,
                cluster_spec=cluster_spec,
                # Ports only need to be exposed for AllReduce training.
                envs=env,
                expose_ports=self.distribution_strategy
                == DistributionStrategy.ALLREDUCE,
                disable_relaunch=disable_relaunch,
                log_file_path=args.log_file_path,
            )

        return instance_manager
Exemplo n.º 2
0
def test_transformer():
    """End-to-end KServe test for a TorchServe predictor with a transformer.

    Deploys an InferenceService whose transformer image is tagged with the
    current CI commit SHA, waits for it to become ready (dumping pod/service
    state on failure), sends one prediction request, and cleans up.

    Raises:
        KeyError: if the PULL_BASE_SHA environment variable is not set.
        RuntimeError: if the InferenceService never becomes ready.
    """
    service_name = 'isvc-transformer'
    predictor = V1beta1PredictorSpec(
        min_replicas=1,
        pytorch=V1beta1TorchServeSpec(
            storage_uri=
            "gs://kfserving-examples/models/torchserve/image_classifier/v1",
            protocol_version="v1",
            resources=V1ResourceRequirements(
                requests={
                    "cpu": "100m",
                    "memory": "1Gi"
                },
                limits={
                    "cpu": "1",
                    "memory": "1Gi"
                },
            ),
        ),
    )
    transformer = V1beta1TransformerSpec(
        min_replicas=1,
        containers=[
            V1Container(
                # Use a direct lookup: a missing PULL_BASE_SHA then raises a
                # clear KeyError instead of `os.environ.get(...)` returning
                # None and failing string concatenation with a TypeError.
                image=
                '809251082950.dkr.ecr.us-west-2.amazonaws.com/kserve/image-transformer:'
                + os.environ["PULL_BASE_SHA"],
                name='kserve-container',
                resources=V1ResourceRequirements(requests={
                    'cpu': '100m',
                    'memory': '1Gi'
                },
                                                 limits={
                                                     'cpu': '100m',
                                                     'memory': '1Gi'
                                                 }),
                args=["--model_name", "mnist"],
                env=[
                    V1EnvVar(
                        name="STORAGE_URI",
                        value=
                        "gs://kfserving-examples/models/torchserve/image_classifier/v1"
                    )
                ])
        ])

    isvc = V1beta1InferenceService(
        api_version=constants.KSERVE_V1BETA1,
        kind=constants.KSERVE_KIND,
        metadata=client.V1ObjectMeta(name=service_name,
                                     namespace=KSERVE_TEST_NAMESPACE),
        spec=V1beta1InferenceServiceSpec(predictor=predictor,
                                         transformer=transformer))

    kserve_client = KServeClient(
        config_file=os.environ.get("KUBECONFIG", "~/.kube/config"))
    kserve_client.create(isvc)
    try:
        kserve_client.wait_isvc_ready(service_name,
                                      namespace=KSERVE_TEST_NAMESPACE)
    except RuntimeError:
        # Dump the knative service object and all pods of this
        # InferenceService for debugging before re-raising.
        print(
            kserve_client.api_instance.get_namespaced_custom_object(
                "serving.knative.dev", "v1", KSERVE_TEST_NAMESPACE, "services",
                service_name + "-predictor-default"))
        pods = kserve_client.core_api.list_namespaced_pod(
            KSERVE_TEST_NAMESPACE,
            label_selector='serving.kserve.io/inferenceservice={}'.format(
                service_name))
        for pod in pods.items:
            print(pod)
        # Bare `raise` preserves the original traceback exactly.
        raise
    res = predict(service_name, "./data/transformer.json", model_name="mnist")
    assert res.get("predictions")[0] == 2
    kserve_client.delete(service_name, KSERVE_TEST_NAMESPACE)
Exemplo n.º 3
0
 def get_container_env(self) -> Sequence[V1EnvVar]:
     """Return the container env vars: user-defined ones first, then the
     standard kubernetes environment appended after them."""
     env_vars = []
     for var_name, var_value in self.get_env().items():
         env_vars.append(V1EnvVar(name=var_name, value=var_value))
     return env_vars + self.get_kubernetes_environment()
Exemplo n.º 4
0
  def create_simple_job_spec(
      self,
      experiment: Experiment,
      name: str,
      image: str,
      min_cpu: int,
      min_mem: int,
      command: Optional[List[str]] = None,
      env: Optional[Dict[str, str]] = None,
      accelerator: Optional[Accelerator] = None,
      accelerator_count: int = 1,
      namespace: str = k.DEFAULT_NAMESPACE,
      machine_type: Optional[MachineType] = None,
      preemptible: bool = True,
      preemptible_tpu: bool = True,
      tpu_driver: str = k.DEFAULT_TPU_DRIVER) -> Optional[JobSpec]:
    """creates a simple kubernetes job (1 container, 1 pod) JobSpec for this cluster

    Args:
    experiment: experiment whose kwargs/args become the container args
    name: job name
    image: container image url (gcr.io/...)
    min_cpu: minimum cpu needed, in milli-cpu
    min_mem: minimum memory needed, in MB
    command: command to execute, None = container entrypoint
    env: environment vars for container, None = no extra vars
    accelerator: accelerator type, None=cpu only
    accelerator_count: accelerator count
    namespace: kubernetes namespace
    machine_type: machine type, None=default for mode (cpu/gpu)
    preemptible: use preemptible instance
    preemptible_tpu: use preemptible tpus
    tpu_driver: tpu driver to use

    Returns:
    JobSpec on success, None otherwise
    """

    # Fix for the mutable-default-argument pitfall: the old `env={}` default
    # was a single dict shared across all calls.
    if env is None:
      env = {}

    args = conf.experiment_to_args(experiment.kwargs, experiment.args)

    # ------------------------------------------------------------------------
    # container

    # tpu/gpu resources
    container_resources = V1ResourceRequirements(
        requests=Cluster.container_requests(min_cpu, min_mem),
        limits=Cluster.container_limits(
            accelerator,
            accelerator_count,
            preemptible_tpu,
        ),
    )

    container_env = [V1EnvVar(name=k, value=v) for k, v in env.items()]

    # this is a simple 1-container, 1-pod job, so we just name the
    # container the same thing (minus the generated suffix) as the job itself
    container = V1Container(
        name=name,
        image=image,
        command=command,
        args=args,
        resources=container_resources,
        env=container_env,
        image_pull_policy='Always',
    )

    # ------------------------------------------------------------------------
    # template

    # todo: should we support anything other than a 'never' restart policy?
    # see this for discussion
    # https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/#pod-backoff-failure-policy

    tolerations = Cluster.tolerations(preemptible=preemptible)

    # backoff count plus 'OnFailure' may be correct here
    template_spec = V1PodSpec(
        restart_policy='Never',
        containers=[container],
        tolerations=tolerations,
        node_selector=Cluster.node_selector(
            preemptible=preemptible,
            machine_type=machine_type,
            accelerator=accelerator,
        ),
        host_ipc=True,
    )

    template = V1PodTemplateSpec(
        metadata=Cluster.template_metadata(
            accelerator=accelerator,
            tpu_driver=tpu_driver,
        ),
        spec=template_spec,
    )

    # ------------------------------------------------------------------------
    # job
    job_spec = V1JobSpec(template=template, backoff_limit=4)

    return JobSpec.get_or_create(
        experiment=experiment,
        spec=ApiClient().sanitize_for_serialization(job_spec),
        platform=Platform.GKE,
    )
Exemplo n.º 5
0
# Name prefix shared by the flush Jobs and their PersistentVolumeClaims.
JOB_AND_PVC_PREFIX = "flush-"
# Kubernetes API status strings matched when handling create/delete errors.
ALREADY_EXISTS = "AlreadyExists"
CONFLICT = "Conflict"
NOT_FOUND = "Not Found"

# Image tag for flush jobs; overridden by "version" in version.json if
# the file exists and parses.
DEFAULT_IMAGE_VERSION = "latest"
try:
    with open("version.json") as fp:
        DEFAULT_IMAGE_VERSION = str(
            json.load(fp).get("version") or DEFAULT_IMAGE_VERSION)
except (FileNotFoundError, json.decoder.JSONDecodeError):  # pragma: no cover
    pass  # use default

# Forward every config key that is present in this process's environment
# into the flush-job containers.
DEFAULT_ENV = [
    V1EnvVar(name=name, value=os.environ[name]) for name in get_config_dict()
    if name in os.environ
]

# CLI options; --command and --env take JSON literals.
parser = ArgumentParser(description=__doc__)
parser.add_argument(
    "--command",
    default=["python", "-m", "ingestion_edge.flush"],
    type=json.loads,
    help=
    "Docker command for flush jobs; must drain queue until empty or exit non-zero",
)
parser.add_argument(
    "--env",
    default=DEFAULT_ENV,
    type=json.loads,
Exemplo n.º 6
0
def main():
    """ElasticDL master entry point.

    Starts (in order): optional TensorBoard service, the task dispatcher,
    optional checkpoint/evaluation services, an optional embedding service,
    the gRPC master server, and the worker manager. Then blocks until all
    tasks finish (or Ctrl-C) and tears everything down.
    """
    args = parse_args()
    logger = get_logger("master", level=args.log_level.upper())

    # Master addr
    master_ip = os.getenv("MY_POD_IP", "localhost")
    master_addr = "%s:%d" % (master_ip, args.port)

    # Start tensorboard service if required
    if args.tensorboard_log_dir:
        logger.info(
            "Starting tensorboard service with log directory %s",
            args.tensorboard_log_dir,
        )
        tb_service = TensorboardService(args.tensorboard_log_dir, master_ip)
        tb_service.start()
    else:
        tb_service = None

    # Start task queue
    logger.debug(
        "Starting task queue with training data directory %s, "
        "evaluation data directory %s, "
        "and prediction data directory %s",
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
    )
    task_d = _make_task_dispatcher(
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
        args.records_per_task,
        args.num_epochs,
    )
    model_module = load_module(
        get_module_file_path(args.model_zoo, args.model_def)).__dict__
    model_inst = load_model_from_module(args.model_def, model_module,
                                        args.model_params)
    optimizer = model_module[args.optimizer]()

    # Derive the job type from which data directories were supplied.
    if all((
            args.training_data_dir,
            args.evaluation_data_dir,
            args.evaluation_throttle_secs or args.evaluation_steps,
    )):
        job_type = JobType.TRAINING_WITH_EVALUATION
    elif all((
            args.evaluation_data_dir,
            not args.training_data_dir,
            not args.prediction_data_dir,
    )):
        job_type = JobType.EVALUATION_ONLY
    elif all((
            args.prediction_data_dir,
            not args.evaluation_data_dir,
            not args.training_data_dir,
    )):
        job_type = JobType.PREDICTION_ONLY
    else:
        job_type = JobType.TRAINING_ONLY

    # Initialize checkpoint service
    if args.checkpoint_steps or job_type == JobType.TRAINING_WITH_EVALUATION:
        logger.info("Starting checkpoint service")
        checkpoint_service = CheckpointService(
            args.checkpoint_dir,
            args.checkpoint_steps,
            args.keep_checkpoint_max,
            job_type == JobType.TRAINING_WITH_EVALUATION,
        )
    else:
        checkpoint_service = None

    # Initialize evaluation service
    evaluation_service = None
    if (job_type == JobType.TRAINING_WITH_EVALUATION
            or job_type == JobType.EVALUATION_ONLY):
        logger.info(
            "Starting evaluation service with throttle seconds %d "
            " and evaluation steps %d",
            args.evaluation_throttle_secs,
            args.evaluation_steps,
        )
        evaluation_service = EvaluationService(
            checkpoint_service,
            tb_service,
            task_d,
            args.evaluation_start_delay_secs,
            args.evaluation_throttle_secs,
            args.evaluation_steps,
            job_type == JobType.EVALUATION_ONLY,
        )
        evaluation_service.start()
        task_d.set_evaluation_service(evaluation_service)

    embedding_service_endpoint = None
    # Search for embedding layers in the model,
    # if found, initialize embedding service
    layers = find_layer(model_inst, Embedding)
    if layers:
        embedding_service = EmbeddingService()
        embedding_service_endpoint = embedding_service.start_embedding_service(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
            resource_request=args.master_resource_request,
            resource_limit=args.master_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
        )
        logger.info("Embedding service start succeeded. The endpoint is %s." %
                    str(embedding_service_endpoint))

    # The master service
    logger.info("Starting master service")
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=64),
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )
    master_servicer = MasterServicer(
        args.grads_to_wait,
        args.minibatch_size,
        optimizer,
        task_d,
        # Only pass initial variables if the model has already been built.
        init_var=model_inst.trainable_variables if model_inst.built else [],
        checkpoint_filename_for_init=args.checkpoint_filename_for_init,
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        embedding_service_endpoint=embedding_service_endpoint,
    )
    elasticdl_pb2_grpc.add_MasterServicer_to_server(master_servicer, server)
    server.add_insecure_port("[::]:{}".format(args.port))
    server.start()
    logger.info("Server started at port: %d", args.port)

    worker_manager = None
    if args.num_workers:
        assert args.worker_image, "Worker image cannot be empty"

        # Workers run the elasticdl worker module pointed back at this master.
        worker_command = ["python"]
        worker_args = [
            "-m",
            "elasticdl.python.worker.main",
            "--model_zoo",
            args.model_zoo,
            "--master_addr",
            master_addr,
            "--log_level",
            args.log_level,
            "--dataset_fn",
            args.dataset_fn,
            "--loss",
            args.loss,
            "--optimizer",
            args.optimizer,
            "--eval_metrics_fn",
            args.eval_metrics_fn,
            "--model_def",
            args.model_def,
            "--job_type",
            job_type,
            "--minibatch_size",
            str(args.minibatch_size),
            "--embedding_service_endpoint",
            str(embedding_service_endpoint),
        ]

        logger.info(">>> master pod envs argument is %s" % args.envs)
        # Translate the `envs` CLI string into kubernetes V1EnvVar objects.
        env_dict = parse_envs(args.envs)
        env = []
        for key in env_dict:
            env.append(V1EnvVar(name=key, value=env_dict[key]))

        worker_manager = WorkerManager(
            task_d,
            job_name=args.job_name,
            image_name=args.worker_image,
            command=worker_command,
            args=worker_args,
            namespace=args.namespace,
            num_workers=args.num_workers,
            worker_resource_request=args.worker_resource_request,
            worker_resource_limit=args.worker_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
            envs=env,
        )
        worker_manager.update_status(WorkerManagerStatus.PENDING)
        logger.info("Launching %d workers", args.num_workers)
        worker_manager.start_workers()
        worker_manager.update_status(WorkerManagerStatus.RUNNING)

        if tb_service:
            worker_manager.start_tensorboard_service()

    # Poll every 30s until the dispatcher reports all tasks done.
    try:
        while True:
            if task_d.finished():
                if worker_manager:
                    worker_manager.update_status(WorkerManagerStatus.FINISHED)
                if args.output:
                    master_servicer.save_latest_checkpoint(args.output)
                break
            time.sleep(30)
    except KeyboardInterrupt:
        logger.warning("Server stopping")

    if evaluation_service:
        logger.info("Stopping evaluation service")
        evaluation_service.stop()

    logger.info("Stopping RPC server")
    server.stop(0)

    # Keep TensorBoard running when all the tasks are finished
    if tb_service:
        logger.info(
            "All tasks finished. Keeping TensorBoard service running...")
        while True:
            if tb_service.is_active():
                time.sleep(10)
            else:
                logger.warning("Unable to keep TensorBoard running. "
                               "It has already terminated")
                break
    logger.info("Master stopped")
Exemplo n.º 7
0
def ensure_traefik(api_core_v1, api_ext_v1_beta1, api_apps_v1, api_custom,
                   api_rbac_auth_v1_b1, admin_email, domain, static_ip,
                   oauth_client_id, oauth_client_secret, oauth_domain,
                   oauth_secret):
    """Idempotently install Traefik v2 plus Google-OAuth forward-auth.

    Creates (via the various ``ensure_*`` helpers) the Traefik CRDs, RBAC
    role/binding, the LoadBalancer service pinned to ``static_ip``, the
    Traefik deployment with Let's Encrypt TLS for ``admin_email``, the
    traefik-forward-auth deployment/service wired to the given OAuth
    credentials, and the IngressRoute/Middleware custom objects for
    ``auth.<domain>``. Finally delegates to ``ensure_whoami``.
    """
    # --- Traefik CRDs ------------------------------------------------------
    ensure_crd(api=api_ext_v1_beta1,
               name='ingressroutes.traefik.containo.us',
               group='traefik.containo.us',
               kind='IngressRoute',
               plural='ingressroutes',
               singular='ingressroute',
               scope='Namespaced')
    ensure_crd(api=api_ext_v1_beta1,
               name='ingressroutetcps.traefik.containo.us',
               group='traefik.containo.us',
               kind='IngressRouteTCP',
               plural='ingressroutetcps',
               singular='ingressroutetcp',
               scope='Namespaced')
    ensure_crd(api=api_ext_v1_beta1,
               name='middlewares.traefik.containo.us',
               group='traefik.containo.us',
               kind='Middleware',
               plural='middlewares',
               singular='middleware',
               scope='Namespaced')
    ensure_crd(api=api_ext_v1_beta1,
               name='tlsoptions.traefik.containo.us',
               group='traefik.containo.us',
               kind='TLSOption',
               plural='tlsoptions',
               singular='tlsoption',
               scope='Namespaced')
    # --- RBAC: cluster role + binding for the ingress controller -----------
    ensure_role(api=api_rbac_auth_v1_b1,
                role=V1ClusterRole(
                    api_version='rbac.authorization.k8s.io/v1beta1',
                    kind='ClusterRole',
                    metadata=V1ObjectMeta(name='traefik-ingress-controller'),
                    rules=[
                        V1PolicyRule(
                            api_groups=[''],
                            resources=['services', 'endpoints', 'secrets'],
                            verbs=['get', 'list', 'watch']),
                        V1PolicyRule(api_groups=['extensions'],
                                     resources=['ingresses'],
                                     verbs=['get', 'list', 'watch']),
                        V1PolicyRule(api_groups=['extensions'],
                                     resources=['ingresses/status'],
                                     verbs=['update']),
                        V1PolicyRule(api_groups=['traefik.containo.us'],
                                     resources=['middlewares'],
                                     verbs=['get', 'list', 'watch']),
                        V1PolicyRule(api_groups=['traefik.containo.us'],
                                     resources=['ingressroutes'],
                                     verbs=['get', 'list', 'watch']),
                        V1PolicyRule(api_groups=['traefik.containo.us'],
                                     resources=['ingressroutetcps'],
                                     verbs=['get', 'list', 'watch']),
                        V1PolicyRule(api_groups=['traefik.containo.us'],
                                     resources=['tlsoptions'],
                                     verbs=['get', 'list', 'watch'])
                    ]),
                name='traefik-ingress-controller')
    ensure_role_binding(
        api=api_rbac_auth_v1_b1,
        role_binding=V1ClusterRoleBinding(
            api_version='rbac.authorization.k8s.io/v1beta1',
            kind='ClusterRoleBinding',
            metadata=V1ObjectMeta(name='traefik-ingress-controller'),
            role_ref=V1RoleRef(api_group='rbac.authorization.k8s.io',
                               kind='ClusterRole',
                               name='traefik-ingress-controller'),
            subjects=[
                V1Subject(kind='ServiceAccount',
                          name='traefik-ingress-controller',
                          namespace='default')
            ]),
        name='traefik-ingress-controller')
    # --- LoadBalancer service (HTTPS only; plain HTTP left commented out) --
    ensure_service(
        api=api_core_v1,
        service=V1Service(
            api_version="v1",
            metadata=V1ObjectMeta(name='traefik'),
            spec=V1ServiceSpec(
                type='LoadBalancer',
                load_balancer_ip=static_ip,
                ports=[
                    # V1ServicePort(
                    #     protocol='TCP',
                    #     port=80,
                    #     name='web'
                    # ),
                    V1ServicePort(protocol='TCP', port=443, name='websecure'),
                ],
                selector={'app': 'traefik'})),
        name='traefik',
        namespace='default')
    ensure_service_account(
        api=api_core_v1,
        account=V1ServiceAccount(
            api_version="v1",
            metadata=V1ObjectMeta(name='traefik-ingress-controller'),
        ),
        name='traefik-ingress-controller',
        namespace='default')
    # --- Traefik deployment with ACME (Let's Encrypt) TLS -------------------
    ensure_deployment(
        api=api_apps_v1,
        deployment=V1Deployment(
            api_version="apps/v1",
            metadata=V1ObjectMeta(name='traefik', labels={'app': 'traefik'}),
            spec=V1DeploymentSpec(
                replicas=1,
                selector=V1LabelSelector(match_labels={'app': 'traefik'}),
                template=V1PodTemplateSpec(
                    metadata=V1ObjectMeta(name='traefik',
                                          labels={'app': 'traefik'}),
                    spec=V1PodSpec(
                        service_account_name='traefik-ingress-controller',
                        containers=[
                            V1Container(
                                name='traefik',
                                image='traefik:v2.0',
                                args=[
                                    '--api.insecure',
                                    '--accesslog',
                                    '--entrypoints.web.Address=:80',
                                    '--entrypoints.websecure.Address=:443',
                                    '--providers.kubernetescrd',
                                    '--certificatesresolvers.default.acme.tlschallenge',
                                    f'--certificatesresolvers.default.acme.email={admin_email}',
                                    '--certificatesresolvers.default.acme.storage=acme.json',
                                    # '--certificatesresolvers.default.acme.caserver=https://acme-staging-v02.api.letsencrypt.org/directory',
                                ],
                                # NOTE(review): entrypoints above bind :80/:443
                                # but the declared containerPorts are 8000/4443
                                # — confirm this mismatch is intentional.
                                ports=[
                                    V1ContainerPort(name='web',
                                                    container_port=8000),
                                    V1ContainerPort(name='websecure',
                                                    container_port=4443),
                                    V1ContainerPort(name='admin',
                                                    container_port=8080),
                                ])
                        ])))),
        name='traefik',
        namespace='default')
    # --- traefik-forward-auth deployment (Google OAuth) ----------------------
    ensure_deployment(
        api=api_apps_v1,
        deployment=V1Deployment(
            api_version="apps/v1",
            metadata=V1ObjectMeta(name='traefik-forward-auth',
                                  labels={'app': 'traefik-forward-auth'}),
            spec=V1DeploymentSpec(
                replicas=1,
                selector=V1LabelSelector(
                    match_labels={'app': 'traefik-forward-auth'}),
                template=V1PodTemplateSpec(
                    metadata=V1ObjectMeta(
                        name='traefik-forward-auth',
                        labels={'app': 'traefik-forward-auth'}),
                    spec=V1PodSpec(containers=[
                        V1Container(
                            name='traefik-forward-auth',
                            image='thomseddon/traefik-forward-auth:2',
                            ports=[
                                V1ContainerPort(name='auth',
                                                container_port=4181),
                            ],
                            env=[
                                V1EnvVar(name='PROVIDERS_GOOGLE_CLIENT_ID',
                                         value=oauth_client_id),
                                # V1EnvVar(name='LOG_LEVEL', value='trace'),
                                V1EnvVar(name='PROVIDERS_GOOGLE_CLIENT_SECRET',
                                         value=oauth_client_secret),
                                V1EnvVar(name='SECRET', value=oauth_secret),
                                V1EnvVar(name='DOMAIN', value=oauth_domain),
                                V1EnvVar(name='COOKIE_DOMAIN', value=domain),
                                V1EnvVar(name='AUTH_HOST',
                                         value=f'auth.{domain}'),
                            ])
                    ])))),
        name='traefik-forward-auth',
        namespace='default')
    # --- IngressRoute and Middleware custom objects for auth.<domain> -------
    ensure_custom_object(api=api_custom,
                         custom_object={
                             'apiVersion': 'traefik.containo.us/v1alpha1',
                             'kind': 'IngressRoute',
                             'metadata': {
                                 'name': 'traefik-forward-auth',
                             },
                             'spec': {
                                 'entryPoints': ['websecure'],
                                 'routes': [{
                                     'match':
                                     f'Host(`auth.{domain}`)',
                                     'kind':
                                     'Rule',
                                     'services': [{
                                         'name': 'traefik-forward-auth',
                                         'port': 4181
                                     }],
                                     'middlewares': [{
                                         'name':
                                         'traefik-forward-auth'
                                     }]
                                 }],
                                 'tls': {
                                     'certResolver': 'default'
                                 }
                             }
                         },
                         group='traefik.containo.us',
                         plural='ingressroutes',
                         version='v1alpha1',
                         name='traefik-forward-auth',
                         namespace='default')
    ensure_custom_object(api=api_custom,
                         custom_object={
                             'apiVersion': 'traefik.containo.us/v1alpha1',
                             'kind': 'Middleware',
                             'metadata': {
                                 'name': 'traefik-forward-auth',
                             },
                             'spec': {
                                 'forwardAuth': {
                                     'address':
                                     'http://traefik-forward-auth:4181',
                                     'authResponseHeaders':
                                     ['X-Forwarded-User'],
                                 }
                             }
                         },
                         group='traefik.containo.us',
                         plural='middlewares',
                         version='v1alpha1',
                         name='traefik-forward-auth',
                         namespace='default')
    # --- ClusterIP service fronting the forward-auth pods --------------------
    ensure_service(api=api_core_v1,
                   service=V1Service(
                       api_version="v1",
                       metadata=V1ObjectMeta(name='traefik-forward-auth'),
                       spec=V1ServiceSpec(
                           type='ClusterIP',
                           ports=[
                               V1ServicePort(protocol='TCP',
                                             port=4181,
                                             name='auth'),
                           ],
                           selector={'app': 'traefik-forward-auth'})),
                   name='traefik-forward-auth',
                   namespace='default')
    ensure_whoami(api_apps_v1, api_core_v1, api_custom, domain)
Exemplo n.º 8
0
def test_transformer():
    """End-to-end test: deploy a TorchServe MNIST predictor together with a
    custom transformer container as a raw-deployment InferenceService, send
    one prediction request through the ingress, and verify the predicted
    class.

    Requires a live cluster with KFServing installed; cleans up the service
    at the end.
    """
    service_name = 'raw'
    # Predictor: TorchServe serving the MNIST image-classifier model from GCS.
    predictor = V1beta1PredictorSpec(
        min_replicas=1,
        pytorch=V1beta1TorchServeSpec(
            storage_uri=
            'gs://kfserving-examples/models/torchserve/image_classifier',
            resources=V1ResourceRequirements(requests={
                'cpu': '100m',
                'memory': '2Gi'
            },
                                             limits={
                                                 'cpu': '100m',
                                                 'memory': '2Gi'
                                             })),
    )
    # Transformer: pre/post-processing container placed in front of the
    # predictor; STORAGE_URI points it at the same model artifacts.
    transformer = V1beta1TransformerSpec(
        min_replicas=1,
        containers=[
            V1Container(
                image='kfserving/torchserve-image-transformer:latest',
                name='kfserving-container',
                resources=V1ResourceRequirements(requests={
                    'cpu': '100m',
                    'memory': '2Gi'
                },
                                                 limits={
                                                     'cpu': '100m',
                                                     'memory': '2Gi'
                                                 }),
                env=[
                    V1EnvVar(
                        name="STORAGE_URI",
                        value=
                        "gs://kfserving-examples/models/torchserve/image_classifier"
                    )
                ])
        ])

    # Annotations select raw-deployment mode and the istio ingress class.
    annotations = dict()
    annotations['serving.kubeflow.org/raw'] = 'true'
    annotations['kubernetes.io/ingress.class'] = 'istio'
    isvc = V1beta1InferenceService(
        api_version=constants.KFSERVING_V1BETA1,
        kind=constants.KFSERVING_KIND,
        metadata=client.V1ObjectMeta(name=service_name,
                                     namespace=KFSERVING_TEST_NAMESPACE,
                                     annotations=annotations),
        spec=V1beta1InferenceServiceSpec(predictor=predictor,
                                         transformer=transformer))

    KFServing.create(isvc)
    try:
        KFServing.wait_isvc_ready(service_name,
                                  namespace=KFSERVING_TEST_NAMESPACE)
    except RuntimeError as e:
        raise e

    # Extra grace period after readiness before hitting the endpoint.
    time.sleep(30)

    # Re-fetch the service to read its externally visible URL from status.
    isvc = KFServing.get(
        service_name,
        namespace=KFSERVING_TEST_NAMESPACE,
    )

    cluster_ip = get_cluster_ip()
    logging.info("clusterip = %s", cluster_ip)

    # The Host header must carry the service hostname (URL minus scheme)
    # while the request itself goes to the cluster IP.
    host = isvc["status"]["url"]
    host = host[host.rfind('/') + 1:]
    url = 'http://{}/v1/models/mnist:predict'.format(cluster_ip)
    logging.info("url = %s ", url)
    headers = {"Host": host}
    # NOTE(review): the backslash continuations keep each line's leading
    # indentation inside the JSON string, so whitespace ends up embedded in
    # the base64 payload — assumes the server's decoder tolerates it; confirm.
    data_str = '{"instances": [{"data": "iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAAAAABXZoBIAAAAw0lE\
    QVR4nGNgGFggVVj4/y8Q2GOR83n+58/fP0DwcSqmpNN7oOTJw6f+/H2pjUU2JCSEk0EWqN0cl828e/FIxvz9/9cCh1\
        zS5z9/G9mwyzl/+PNnKQ45nyNAr9ThMHQ/UG4tDofuB4bQIhz6fIBenMWJQ+7Vn7+zeLCbKXv6z59NOPQVgsIcW\
            4QA9YFi6wNQLrKwsBebW/68DJ388Nun5XFocrqvIFH59+XhBAxThTfeB0r+vP/QHbuDCgr2JmOXoSsAAKK7b\
                U3vISS4AAAAAElFTkSuQmCC", "target": 0}]}'

    res = requests.post(url, data_str, headers=headers)
    logging.info("res.text = %s", res.text)
    preds = json.loads(res.content.decode("utf-8"))
    # The sample digit image should be classified as 2.
    assert (preds["predictions"] == [2])

    KFServing.delete(service_name, KFSERVING_TEST_NAMESPACE)
  def apply_pod_profile(self, username, pod, profile, default_mount_path, gpu_mode=None):
    """Apply a profile (volumes, env vars, resources, tolerations, node
    affinity and GPU config) to a spawning user's pod and return the pod.

    :param username: JupyterHub user name; stamped on the pod as a label and
        injected into every container as the JupyterHub user env var.
    :param pod: pod object mutated in place.
    :param profile: dict describing what to apply.
    :param default_mount_path: fallback mount path template for volumes that
        do not specify their own mountPath.
    :param gpu_mode: optional GPU scheduling mode forwarded to
        apply_gpu_config.
    :return: the mutated pod.
    """
    api_client = kubernetes.client.ApiClient()

    pod.metadata.labels['jupyterhub.opendatahub.io/user'] = escape(username)

    profile_volumes = profile.get('volumes')

    if profile_volumes:
      for volume in profile_volumes:
        # K8s object names must be DNS-safe; collapse anything else to '-'.
        # (raw string avoids the invalid "\." escape warning)
        volume_name = re.sub(r'[^a-zA-Z0-9\.]', '-', volume['name']).lower()
        read_only = volume['persistentVolumeClaim'].get('readOnly')
        pvc = V1PersistentVolumeClaimVolumeSource(volume['persistentVolumeClaim']['claimName'], read_only=read_only)
        mount_path = self.generate_volume_path(volume.get('mountPath'), default_mount_path, volume_name)
        pod.spec.volumes.append(V1Volume(name=volume_name, persistent_volume_claim=pvc))
        pod.spec.containers[0].volume_mounts.append(V1VolumeMount(name=volume_name, mount_path=mount_path))

    profile_environment = profile.get('env')

    if profile_environment:

      # Kept for backwards compatibility with simplified env var definitions
      if isinstance(profile_environment, dict):
        for k, v in profile_environment.items():
          update = False
          for e in pod.spec.containers[0].env:
            if e.name == k:
              e.value = v
              update = True
              break
          if not update:
            pod.spec.containers[0].env.append(V1EnvVar(k, v))

      elif isinstance(profile_environment, list):
        # Full V1EnvVar definitions: round-trip each dict through the k8s
        # client deserializer so valueFrom etc. are supported.
        for i in profile_environment:
          r = type("Response", (), {})
          r.data = json.dumps(i)
          env_var = api_client.deserialize(r, V1EnvVar)
          pod.spec.containers[0].env.append(env_var)

    # Deserialize the 'resources' section (None when absent) into a
    # V1ResourceRequirements object via a fake response wrapper.
    resource_json = type("Response", (), {})
    resource_json.data = json.dumps(profile.get('resources'))
    resource_var = api_client.deserialize(resource_json, V1ResourceRequirements)

    if resource_var:
      pod.spec.containers[0].resources = resource_var
      # limits may be absent entirely; guard against None before .get().
      mem_limit = (resource_var.limits or {}).get('memory', '')
      if mem_limit:
        pod.spec.containers[0].env.append(V1EnvVar(name='MEM_LIMIT', value=self.get_mem_limit(mem_limit)))

    if profile.get('node_tolerations'):
        pod.spec.tolerations = profile.get('node_tolerations')

    if profile.get('node_affinity'):
        if not pod.spec.affinity:
            pod.spec.affinity = {}
        pod.spec.affinity['nodeAffinity'] = profile.get('node_affinity')

    # Make sure every container reports the actual JupyterHub user; entries
    # may be plain dicts (raw manifests) or V1EnvVar objects.
    for c in pod.spec.containers:
      update = False
      env = c['env'] if isinstance(c, dict) else c.env
      for e in env:
        if isinstance(e, dict):
          if e['name'] == _JUPYTERHUB_USER_NAME_ENV:
            e['value'] = username
            update = True
            break
        elif e.name == _JUPYTERHUB_USER_NAME_ENV:
          e.value = username
          update = True
          break

      if not update:
        env.append(V1EnvVar(_JUPYTERHUB_USER_NAME_ENV, username))

    self.apply_gpu_config(gpu_mode, profile.get('gpu', 0), pod)

    return pod
Exemplo n.º 10
0
    def _get_container(self, command: str, config_map_volume_names: list = None) -> V1Container:
        """Build the V1Container that runs *command* via the minio client.

        The container first exports an MC_HOST_<alias> connection URL (the S3
        credentials come from the configured K8s secret), optionally installs
        CA certificates from the given config-map volumes, then runs the
        command under /bin/bash -c. The data volume is always mounted.

        :param command: The command to run in the container.
        :param config_map_volume_names: Optional config-map volume names
            holding CA certificates to trust.
        :return: The V1Container object definition to run the command.
        """
        scheme = "https" if self.use_https else "http"

        ca_mounts = []
        ca_setup = ""
        if config_map_volume_names:
            # Mount each config map under a numbered subdirectory.
            ca_mounts = [
                V1VolumeMount(name=vol, mount_path=f"{self.config_map_path}/{idx}")
                for idx, vol in enumerate(config_map_volume_names)
            ]
            # The find command doesn't exist by default in the minio container
            ca_setup = (
                "mkdir -p /root/.mc/certs/CAs; "
                + f"cp {self.config_map_path}/*/* /root/.mc/certs/CAs/; "
            )

        alias_setup = (
            f"export MC_HOST_{self.s3_alias}={scheme}://${{S3_ACCESSKEY}}:${{S3_SECRETKEY}}@"
            f"{self.s3_host}:{self.s3_port}; "
        )
        shell_prefix = ca_setup + alias_setup

        def _secret_env(var_name, secret_key):
            # Both S3 credentials are keys in the same K8s secret.
            return V1EnvVar(
                name=var_name,
                value_from=V1EnvVarSource(
                    secret_key_ref=V1SecretKeySelector(
                        name=self.credentials_secret_name,
                        key=secret_key,
                    )
                ),
            )

        return V1Container(
            name="netapp-dataops-s3mc-container",
            image=self.image,
            env=[
                _secret_env("S3_ACCESSKEY", "access_key"),
                _secret_env("S3_SECRETKEY", "secret_key"),
            ],
            command=["/bin/bash"],
            args=["-c", shell_prefix + command],
            volume_mounts=[
                V1VolumeMount(mount_path=self.data_volume_path, name=self.data_volume_name)
            ] + ca_mounts,
            resources=self._get_resource_requirements(),
        )
Exemplo n.º 11
0
    def _create_instance_manager(self, args):
        """Create the InstanceManager that runs worker and PS pods.

        Returns None when no workers were requested; otherwise wires the
        parsed CLI arguments into worker / parameter-server pod command
        lines and hands them to an InstanceManager.
        """
        if not args.num_workers:
            return None

        assert args.worker_image, "Worker image cannot be empty"

        worker_command = ["python"]
        worker_args = [
            "-m",
            "elasticdl.python.worker.main",
            "--master_addr",
            self.master_addr,
            "--job_type",
            self.job_type,
            *build_arguments_from_parsed_result(args),
        ]

        # PS flags as (flag, value) pairs, flattened into argv order below.
        ps_flag_values = [
            ("--grads_to_wait", str(args.grads_to_wait)),
            ("--lr_staleness_modulation", str(args.lr_staleness_modulation)),
            ("--use_async", str(args.use_async)),
            ("--minibatch_size", str(args.minibatch_size)),
            ("--model_zoo", args.model_zoo),
            ("--model_def", args.model_def),
            ("--job_name", args.job_name),
            ("--num_minibatches_per_task", str(args.num_minibatches_per_task)),
            ("--port", "2222"),
            ("--master_addr", self.master_addr),
            ("--namespace", args.namespace),
            ("--evaluation_steps", str(args.evaluation_steps)),
            ("--checkpoint_dir", str(args.checkpoint_dir)),
            ("--checkpoint_steps", str(args.checkpoint_steps)),
            ("--keep_checkpoint_max", str(args.keep_checkpoint_max)),
            ("--num_ps_pods", str(args.num_ps_pods)),
            ("--checkpoint_dir_for_init", str(args.checkpoint_dir_for_init)),
            ("--num_workers", str(args.num_workers)),
            ("--log_level", str(args.log_level)),
        ]
        ps_command = ["python"]
        ps_args = ["-m", "elasticdl.python.ps.main"] + [
            token for pair in ps_flag_values for token in pair
        ]

        # Forward user-supplied environment variables to the pods.
        envs = [
            V1EnvVar(name=name, value=value)
            for name, value in parse_envs(args.envs).items()
        ]

        return InstanceManager(
            self.task_d,
            job_name=args.job_name,
            image_name=args.worker_image,
            worker_command=worker_command,
            worker_args=worker_args,
            namespace=args.namespace,
            num_workers=args.num_workers,
            worker_resource_request=args.worker_resource_request,
            worker_resource_limit=args.worker_resource_limit,
            worker_pod_priority=args.worker_pod_priority,
            num_ps=args.num_ps_pods,
            ps_command=ps_command,
            ps_args=ps_args,
            ps_resource_request=args.ps_resource_request,
            ps_resource_limit=args.ps_resource_limit,
            ps_pod_priority=args.ps_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
            envs=envs,
        )
    def apply_pod_profile(self, spawner, pod, profile):
        """Apply a profile (env vars, resources, tolerations, node affinity
        and GPU config) to a spawning user's pod and return the pod.

        :param spawner: JupyterHub spawner; provides the user name and the
            GPU mode/count forwarded to apply_gpu_config.
        :param pod: pod object mutated in place.
        :param profile: dict describing what to apply.
        :return: the mutated pod.
        """
        api_client = kubernetes.client.ApiClient()

        pod.metadata.labels['jupyterhub.opendatahub.io/user'] = escape(
            spawner.user.name)

        profile_environment = profile.get('env')

        if profile_environment:

            # Kept for backwards compatibility with simplified env var definitions
            if isinstance(profile_environment, dict):
                for k, v in profile_environment.items():
                    update = False
                    for e in pod.spec.containers[0].env:
                        if e.name == k:
                            e.value = v
                            update = True
                            break
                    if not update:
                        pod.spec.containers[0].env.append(V1EnvVar(k, v))

            elif isinstance(profile_environment, list):
                # Full V1EnvVar definitions: round-trip each dict through the
                # k8s client deserializer so valueFrom etc. are supported.
                for i in profile_environment:
                    r = type("Response", (), {})
                    r.data = json.dumps(i)
                    env_var = api_client.deserialize(r, V1EnvVar)
                    pod.spec.containers[0].env.append(env_var)

        # Deserialize the 'resources' section (None when absent) into a
        # V1ResourceRequirements object via a fake response wrapper.
        resource_json = type("Response", (), {})
        resource_json.data = json.dumps(profile.get('resources'))
        resource_var = api_client.deserialize(resource_json,
                                              V1ResourceRequirements)

        if resource_var:
            pod.spec.containers[0].resources = resource_var

        if profile.get('node_tolerations'):
            pod.spec.tolerations = profile.get('node_tolerations')

        if profile.get('node_affinity'):
            if not pod.spec.affinity:
                pod.spec.affinity = {}
            pod.spec.affinity['nodeAffinity'] = profile.get('node_affinity')

        # Make sure every container reports the actual JupyterHub user;
        # entries may be plain dicts (raw manifests) or V1EnvVar objects.
        for c in pod.spec.containers:
            update = False
            env = c['env'] if isinstance(c, dict) else c.env
            for e in env:
                if isinstance(e, dict):
                    if e['name'] == _JUPYTERHUB_USER_NAME_ENV:
                        e['value'] = spawner.user.name
                        update = True
                        break
                elif e.name == _JUPYTERHUB_USER_NAME_ENV:
                    e.value = spawner.user.name
                    update = True
                    break

            if not update:
                env.append(
                    V1EnvVar(_JUPYTERHUB_USER_NAME_ENV, spawner.user.name))

        self.apply_gpu_config(spawner.gpu_mode, spawner.gpu_count, pod)

        return pod
Exemplo n.º 13
0
 def get_reference_object(self) -> V1Deployment:
     """Get deployment object for outpost"""
     # Generate V1ContainerPort objects
     container_ports = [
         V1ContainerPort(
             container_port=port.inner_port or port.port,
             name=port.name,
             protocol=port.protocol.upper(),
         )
         for port in self.controller.deployment_ports
     ]
     version = get_full_version()
     # All outpost settings are read from the secret named after this
     # deployment; map env var name -> secret key.
     secret_env_keys = {
         "AUTHENTIK_HOST": "authentik_host",
         "AUTHENTIK_HOST_BROWSER": "authentik_host_browser",
         "AUTHENTIK_TOKEN": "token",
         "AUTHENTIK_INSECURE": "authentik_host_insecure",
     }
     env_vars = [
         V1EnvVar(
             name=var_name,
             value_from=V1EnvVarSource(
                 secret_key_ref=V1SecretKeySelector(
                     name=self.name,
                     key=secret_key,
                 )
             ),
         )
         for var_name, secret_key in secret_env_keys.items()
     ]
     pod_labels = self.get_pod_meta(
         **{
             # Support istio-specific labels, but also use the standard k8s
             # recommendations
             "app.kubernetes.io/version": version,
             "app": "authentik-outpost",
             "version": version,
         }
     )
     pull_secrets = [
         V1ObjectReference(name=secret)
         for secret in self.outpost.config.kubernetes_image_pull_secrets
     ]
     return V1Deployment(
         metadata=self.get_object_meta(name=self.name),
         spec=V1DeploymentSpec(
             replicas=self.outpost.config.kubernetes_replicas,
             selector=V1LabelSelector(match_labels=self.get_pod_meta()),
             template=V1PodTemplateSpec(
                 metadata=V1ObjectMeta(labels=pod_labels),
                 spec=V1PodSpec(
                     image_pull_secrets=pull_secrets,
                     containers=[
                         V1Container(
                             name=str(self.outpost.type),
                             image=self.controller.get_container_image(),
                             ports=container_ports,
                             env=env_vars,
                         )
                     ],
                 ),
             ),
         ),
     )
Exemplo n.º 14
0
    def __init__(self) -> None:
        """Define the environmentd StatefulSet: one replica with a
        persistent 1Gi data volume, SQL on port 5432, minio as the persist
        blob store, and CockroachDB-backed consensus/stash URLs.
        """
        app_labels = {"app": "environmentd"}
        metadata = V1ObjectMeta(name="environmentd", labels=app_labels)
        selector = V1LabelSelector(match_labels=app_labels)

        # Expose the pod's own name to the process via the downward API.
        pod_name_source = V1EnvVarSource(field_ref=V1ObjectFieldSelector(
            field_path="metadata.name"))

        env_vars = [
            V1EnvVar(name="MZ_POD_NAME", value_from=pod_name_source),
            V1EnvVar(name="AWS_REGION", value="minio"),
            V1EnvVar(name="AWS_ACCESS_KEY_ID", value="minio"),
            V1EnvVar(name="AWS_SECRET_ACCESS_KEY", value="minio123"),
        ]

        # The endpoint is URL-encoded because it is embedded as a query
        # parameter of the persist blob URL.
        s3_endpoint = urllib.parse.quote("http://minio-service.default:9000")

        container = V1Container(
            name="environmentd",
            image=self.image("environmentd"),
            args=[
                "--storaged-image=" + self.image("storaged"),
                "--computed-image=" + self.image("computed"),
                "--availability-zone=kind-worker",
                "--availability-zone=kind-worker2",
                "--availability-zone=kind-worker3",
                f"--persist-blob-url=s3://minio:minio123@persist/persist?endpoint={s3_endpoint}&region=minio",
                "--orchestrator=kubernetes",
                "--orchestrator-kubernetes-image-pull-policy=never",
                "--persist-consensus-url=postgres://[email protected]?options=--search_path=consensus",
                "--adapter-stash-url=postgres://[email protected]?options=--search_path=catalog",
                "--storage-stash-url=postgres://[email protected]?options=--search_path=storage",
                "--unsafe-mode",
            ],
            env=env_vars,
            ports=[V1ContainerPort(container_port=5432, name="sql")],
            volume_mounts=[V1VolumeMount(name="data", mount_path="/data")],
        )

        pod_template = V1PodTemplateSpec(
            metadata=metadata,
            spec=V1PodSpec(containers=[container]),
        )

        data_claim = V1PersistentVolumeClaim(
            metadata=V1ObjectMeta(name="data"),
            spec=V1PersistentVolumeClaimSpec(
                access_modes=["ReadWriteOnce"],
                resources=V1ResourceRequirements(
                    requests={"storage": "1Gi"}),
            ),
        )

        self.stateful_set = V1StatefulSet(
            api_version="apps/v1",
            kind="StatefulSet",
            metadata=metadata,
            spec=V1StatefulSetSpec(
                service_name="environmentd",
                replicas=1,
                pod_management_policy="Parallel",
                selector=selector,
                template=pod_template,
                volume_claim_templates=[data_claim],
            ),
        )