def test_katib_is_ready(record_xml_attribute, namespace):
  """Verify that the Katib components of a Kubeflow deployment are ready.

  Args:
    record_xml_attribute: Pytest fixture used to set the junit test name.
    namespace: The namespace Kubeflow is deployed to.
  """
  set_logging()
  util.set_pytest_junit(record_xml_attribute, "test_katib_is_ready")

  # When running with a service-account key (e.g. in CI) we must activate
  # the account so subsequent gcloud/kubectl calls have the right scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run(["gcloud", "auth", "activate-service-account",
              "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]])

  api_client = deploy_utils.create_k8s_client()
  util.load_kube_config()

  # Each Katib deployment must report ready within 10 minutes.
  for name in ("katib-controller", "katib-mysql",
               "katib-db-manager", "katib-ui"):
    logging.info("Verifying that deployment %s started...", name)
    util.wait_for_deployment(api_client, namespace, name, 10)
def deploy_model(args):
  """Deploy a TF model using the TF serving component."""
  api_client = create_k8s_client(args)
  app_dir = setup_kubeflow_ks_app(args, api_client)

  component = "modelServer"
  logging.info("Deploying tf-serving.")
  util.run(["ks", "generate", "tf-serving", component], cwd=app_dir)

  # args.params is a comma separated list of key=value pairs; later keys
  # override earlier duplicates, same as the original dict build.
  params = dict(pair.split("=", 1) for pair in args.params.split(","))
  if "namespace" not in params:
    raise ValueError("namespace must be supplied via --params.")
  namespace = params["namespace"]

  ks_deploy(app_dir, component, params, env=None, account=None)

  # Confirm the service exists and was assigned a cluster ip before
  # waiting on the deployment itself.
  core_api = k8s_client.CoreV1Api(api_client)
  deploy = core_api.read_namespaced_service(args.deploy_name, args.namespace)
  cluster_ip = deploy.spec.cluster_ip
  if not cluster_ip:
    raise ValueError("inception service wasn't assigned a cluster ip.")

  util.wait_for_deployment(api_client, namespace, args.deploy_name + "-v1",
                           timeout_minutes=10)
  logging.info("Verified TF serving started.")
def setup(args):
  """Test deploying Kubeflow."""
  api_client = create_k8s_client(args)
  app_dir = setup_kubeflow_ks_app(args, api_client)
  namespace = args.namespace

  # TODO(jlewi): We don't need to generate a core component if we are
  # just deploying TFServing. Might be better to refactor this code.
  # Deploy Kubeflow
  util.run(["ks", "generate", "core", "kubeflow-core",
            "--name=kubeflow-core", "--namespace=" + namespace],
           cwd=app_dir)

  # TODO(jlewi): For reasons I don't understand even though we ran
  # configure_kubectl above, if we don't rerun it we get rbac errors
  # when we do ks apply; I think because we aren't using the proper service
  # account. This might have something to do with the way ksonnet gets
  # its credentials; maybe we need to configure credentials after calling
  # ks init?
  if args.cluster:
    util.configure_kubectl(args.project, args.zone, args.cluster)

  util.run(["ks", "apply", "default", "-c", "kubeflow-core"], cwd=app_dir)

  # Verify that the TfJob operator is actually deployed.
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace, "tf-job-operator")

  # Verify that JupyterHub is actually deployed.
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace, "tf-hub")
def deploy_kubeflow(test_case):
  """Deploy Kubeflow.

  Generates and applies the kubeflow-core and pytorch-operator ksonnet
  components, then waits for the TfJob operator, JupyterHub, and the
  PyTorch operator to become ready.

  Args:
    test_case: Test case object; test_case.test_suite.test_dir provides the
      directory in which to create the ksonnet app.
  """
  args = parse_args()
  test_dir = test_case.test_suite.test_dir
  namespace = args.namespace
  api_client = deploy_utils.create_k8s_client()
  app_dir = deploy_utils.setup_kubeflow_ks_app(test_dir, namespace,
                                               args.github_token, api_client)

  # TODO(jlewi): We don't need to generate a core component if we are
  # just deploying TFServing. Might be better to refactor this code.
  # Deploy Kubeflow
  util.run([
      "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
      "--namespace=" + namespace
  ], cwd=app_dir)
  util.run([
      "ks", "generate", "pytorch-operator", "pytorch-operator",
      "--name=pytorch-operator", "--namespace=" + namespace
  ], cwd=app_dir)

  apply_command = [
      "ks", "apply", "default", "-c", "kubeflow-core", "-c",
      "pytorch-operator",
  ]

  if args.as_gcloud_user:
    account = deploy_utils.get_gcp_identity()
    logging.info("Impersonate %s", account)
    # If we don't use --as to impersonate the service account then we
    # observe RBAC errors when doing certain operations. The problem appears
    # to be that we end up using the in cluster config (e.g. pod service account)
    # and not the GCP service account which has more privileges.
    apply_command.append("--as=" + account)
  util.run(apply_command, cwd=app_dir)

  # Verify that the TfJob operator is actually deployed.
  tf_job_deployment_name = "tf-job-operator"
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace, tf_job_deployment_name)

  # Verify that JupyterHub is actually deployed.
  jupyterhub_name = "tf-hub"
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace, jupyterhub_name)

  # Verify that PyTorch Operator actually deployed
  pytorch_operator_deployment_name = "pytorch-operator"
  logging.info("Verifying PyTorchJob controller started.")
  util.wait_for_deployment(api_client, namespace,
                           pytorch_operator_deployment_name)
def test_kf_is_ready(namespace, use_basic_auth):
  """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
    use_basic_auth: Whether the deployment uses basic auth instead of IAP.
  """
  logging.info("Using namespace %s", namespace)

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run(["gcloud", "auth", "activate-service-account",
              "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]])

  api_client = deploy_utils.create_k8s_client()
  util.load_kube_config()

  # Verify that components are actually deployed.
  # TODO(jlewi): We need to parameterize this list based on whether
  # we are using IAP or basic auth.
  deployment_names = [
      "argo-ui",
      "centraldashboard",
      "cert-manager",
      "cloud-endpoints-controller",
      "jupyter-web-app",
      "ml-pipeline",
      "ml-pipeline-scheduledworkflow",
      "ml-pipeline-ui",
      "notebooks-controller",
      "tf-job-operator",
      "profiles",
      "pytorch-operator",
      "studyjob-controller",
      "workflow-controller",
  ]
  # The auth flavor decides which extra deployment must be up.
  deployment_names.append("basic-auth" if use_basic_auth else "iap-enabler")

  stateful_sets = ["backend-updater"]

  # TODO(jlewi): Might want to parallelize this.
  for deployment_name in deployment_names:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, namespace, deployment_name)

  for name in stateful_sets:
    logging.info("Verifying that statefulset %s started...", name)
    util.wait_for_statefulset(api_client, namespace, name)
def test_deploy(record_xml_attribute, deploy_name, namespace, model_dir,
                export_dir):
  """Deploy the mnist model for serving via kustomize and wait for it.

  Args:
    record_xml_attribute: Pytest fixture used to set the junit test name.
    deploy_name: Name to give the serving deployment.
    namespace: Namespace to deploy into.
    model_dir: Base path of the saved model (becomes modelBasePath).
    export_dir: Path used for the exportDir configmap entry.
  """
  util.set_pytest_junit(record_xml_attribute, "test_deploy")

  util.maybe_activate_service_account()

  # The kustomize app for GCS serving lives relative to this test file.
  app_dir = os.path.join(os.path.dirname(__file__), "../serving/GCS")
  app_dir = os.path.abspath(app_dir)
  logging.info("--app_dir not set defaulting to: %s", app_dir)

  # TODO (@jinchihe) Using kustomize 2.0.3 to work around below issue:
  # https://github.com/kubernetes-sigs/kustomize/issues/1295
  kusUrl = 'https://github.com/kubernetes-sigs/kustomize/' \
           'releases/download/v2.0.3/kustomize_2.0.3_linux_amd64'
  util.run(['wget', '-q', '-O', '/usr/local/bin/kustomize', kusUrl],
           cwd=app_dir)
  util.run(['chmod', 'a+x', '/usr/local/bin/kustomize'], cwd=app_dir)

  # TODO (@jinchihe): The kubectl need to be upgraded to 1.14.0 due to below issue.
  # Invalid object doesn't have additional properties ...
  kusUrl = 'https://storage.googleapis.com/kubernetes-release/' \
           'release/v1.14.0/bin/linux/amd64/kubectl'
  util.run(['wget', '-q', '-O', '/usr/local/bin/kubectl', kusUrl],
           cwd=app_dir)
  util.run(['chmod', 'a+x', '/usr/local/bin/kubectl'], cwd=app_dir)

  # Configure custom parameters using kustomize
  configmap = 'mnist-map-serving'
  util.run(['kustomize', 'edit', 'set', 'namespace', namespace], cwd=app_dir)
  util.run([
      'kustomize', 'edit', 'add', 'configmap', configmap,
      '--from-literal=name' + '=' + deploy_name
  ], cwd=app_dir)
  util.run([
      'kustomize', 'edit', 'add', 'configmap', configmap,
      '--from-literal=modelBasePath=' + model_dir
  ], cwd=app_dir)
  util.run([
      'kustomize', 'edit', 'add', 'configmap', configmap,
      '--from-literal=exportDir=' + export_dir
  ], cwd=app_dir)

  # Apply the components
  util.run(['kustomize', 'build', app_dir, '-o', 'generated.yaml'],
           cwd=app_dir)
  util.run(['kubectl', 'apply', '-f', 'generated.yaml'], cwd=app_dir)

  kube_config.load_kube_config()
  api_client = k8s_client.ApiClient()
  util.wait_for_deployment(api_client, namespace, deploy_name,
                           timeout_minutes=4)
def deploy_model(args):
  """Deploy a TF model using the TF serving component.

  Args:
    args: Parsed command line arguments. args.params must be a comma
      separated list of key=value pairs including a "namespace" entry.

  Raises:
    ValueError: If no namespace is supplied via params, or if the
      deployed service was not assigned a cluster ip.
  """
  api_client = create_k8s_client(args)
  app_dir = setup_kubeflow_ks_app(args, api_client)

  logging.info("Deploying tf-serving.")
  params = {}
  # Initialize namespace so a missing "namespace" param raises the
  # intended ValueError below instead of a NameError (the original left
  # the variable unbound in that case).
  namespace = None
  for pair in args.params.split(","):
    k, v = pair.split("=", 1)
    if k != "namespace":
      params[k] = v
    else:
      namespace = v
  # Use "is None" (identity) rather than "== None" per PEP 8.
  if namespace is None:
    raise ValueError("namespace must be supplied in args.")

  # deployment component
  deploy_component = "modelServer"
  util.run(["ks", "generate", "tf-serving-deployment-gcp", deploy_component],
           cwd=app_dir)
  ks_deploy(app_dir, deploy_component, params, env=None, account=None,
            namespace=namespace)

  # service component
  service_component = "modelServer-service"
  util.run(["ks", "generate", "tf-serving-service", service_component],
           cwd=app_dir)
  ks_deploy(app_dir, service_component, params, env=None, account=None,
            namespace=namespace)

  # Fail fast if the service never received a cluster ip; otherwise the
  # wait below times out with a less informative error.
  core_api = k8s_client.CoreV1Api(api_client)
  deploy = core_api.read_namespaced_service(args.deploy_name, args.namespace)
  cluster_ip = deploy.spec.cluster_ip
  if not cluster_ip:
    raise ValueError("inception service wasn't assigned a cluster ip.")
  util.wait_for_deployment(api_client, namespace, args.deploy_name,
                           timeout_minutes=10)
  logging.info("Verified TF serving started.")
def deploy_kubeflow(test_case):
  """Deploy Kubeflow.

  Generates and applies the tf-job-operator and argo ksonnet components,
  waits for both controllers to become ready, then sets up the cluster
  role in the "default" namespace for the nfs volume/server.

  Args:
    test_case: Test case object; test_case.test_suite.test_dir provides the
      directory in which to create the ksonnet app.
  """
  args = parse_args()
  test_dir = test_case.test_suite.test_dir
  src_root_dir = args.src_root_dir
  namespace = args.namespace
  api_client = deploy_utils.create_k8s_client()
  app_dir = deploy_utils.setup_ks_app(test_dir, src_root_dir, namespace,
                                      args.github_token, api_client)

  # Deploy Kubeflow
  util.run(["ks", "generate", "tf-job-operator", "tf-job-operator"],
           cwd=app_dir)
  util.run(
      ["ks", "generate", "argo", "kubeflow-argo", "--name=kubeflow-argo"],
      cwd=app_dir)

  cmd = "ks param set tf-job-operator namespace " + namespace
  util.run(cmd.split(), cwd=app_dir)

  # cmd = "ks param set tf-job-operator tfJobImage \
  # gcr.io/kubeflow-images-public/tf_operator:v20180522-77375baf"
  # util.run(cmd.split(), cwd=app_dir)

  cmd = "ks param set tf-job-operator tfJobVersion v1beta1"
  util.run(cmd.split(), cwd=app_dir)

  cmd = "ks param set kubeflow-argo namespace " + namespace
  util.run(cmd.split(), cwd=app_dir)

  apply_command = [
      "ks", "apply", "default", "-c", "tf-job-operator", "-c", "kubeflow-argo"
  ]

  if args.as_gcloud_user:
    account = deploy_utils.get_gcp_identity()
    logging.info("Impersonate %s", account)
    # If we don't use --as to impersonate the service account then we
    # observe RBAC errors when doing certain operations. The problem appears
    # to be that we end up using the in cluster config (e.g. pod service account)
    # and not the GCP service account which has more privileges.
    apply_command.append("--as=" + account)
  util.run(apply_command, cwd=app_dir)

  # Verify that the TfJob operator is actually deployed.
  tf_job_deployment_name = "tf-job-operator-v1beta1"
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace, tf_job_deployment_name)

  # Verify that the Argo operator is deployed.
  argo_deployment_name = "workflow-controller"
  logging.info("Verifying Argo controller started.")
  util.wait_for_deployment(api_client, namespace, argo_deployment_name)

  # change the namespace to default to set up nfs-volume and nfs-server
  namespace = "default"
  deploy_utils.set_clusterrole(namespace)
def deploy_kubeflow(test_case):  # pylint: disable=unused-argument
  """Deploy Kubeflow.

  Builds the argo and tf-job-operator kustomize manifests from the
  kubeflow/manifests repo, applies them with kubectl, and waits for both
  controllers to become ready.

  Args:
    test_case: Unused; present for the test-harness calling convention.
  """
  args = parse_args()
  src_root_dir = args.src_root_dir
  namespace = args.namespace
  api_client = deploy_utils.create_k8s_client()
  manifest_repo_dir = path.join(src_root_dir, "kubeflow", "manifests")
  argo_manifest_dir = path.join(manifest_repo_dir, "argo", "base")
  tfoperator_manifest_dir = path.join(manifest_repo_dir, "tf-training",
                                      "tf-job-operator", "base")
  deploy_utils.setup_test(api_client, namespace)

  # kubectl reads the manifests from stdin ("-f -") because they are piped
  # in from `kustomize build` below.
  apply_args = "-f -"
  if args.as_gcloud_user:
    account = deploy_utils.get_gcp_identity()
    logging.info("Impersonate %s", account)
    # If we don't use --as to impersonate the service account then we
    # observe RBAC errors when doing certain operations. The problem appears
    # to be that we end up using the in cluster config (e.g. pod service account)
    # and not the GCP service account which has more privileges.
    apply_args = " ".join(["--as=" + account, apply_args])

  # Deploy argo
  logging.info("Deploying argo")
  util.run(["kustomize", "edit", "set", "namespace", namespace],
           cwd=argo_manifest_dir)
  util.run(["sh", "-c", "kustomize build | kubectl apply " + apply_args],
           cwd=argo_manifest_dir)

  # Deploy tf-job-operator
  logging.info("Deploying tf-job-operator")
  util.run(["kustomize", "edit", "set", "namespace", namespace],
           cwd=tfoperator_manifest_dir)
  util.run(["sh", "-c", "kustomize build | kubectl apply " + apply_args],
           cwd=tfoperator_manifest_dir)

  # Verify that the TfJob operator is actually deployed.
  tf_job_deployment_name = "tf-job-operator"
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace, tf_job_deployment_name)

  # Verify that the Argo operator is deployed.
  argo_deployment_name = "workflow-controller"
  logging.info("Verifying Argo controller started.")
  util.wait_for_deployment(api_client, namespace, argo_deployment_name)

  deploy_utils.set_clusterrole(namespace)
def install_kubebench_nfs(api_client, app_dir, namespace):
  """Deploy required kubeflow packages to run benchmark.

  Installs the kubebench-quickstarter package, deploys the NFS service,
  waits for it to come up, then points the quickstarter volume at the NFS
  service's cluster IP and deploys the volume component.

  Args:
    api_client: K8s api client used for readiness checks.
    app_dir: Directory of the ksonnet app.
    namespace: Namespace to deploy into.
  """
  util.run(["ks", "pkg", "install", "kubebench/kubebench-quickstarter"],
           cwd=app_dir)
  util.run([
      "ks", "generate", "kubebench-quickstarter-service",
      "kubebench-quickstarter-service"
  ], cwd=app_dir)
  util.run([
      "ks", "generate", "kubebench-quickstarter-volume",
      "kubebench-quickstarter-volume"
  ], cwd=app_dir)

  util.run([
      "ks", "param", "set", "kubebench-quickstarter-service", "namespace",
      namespace
  ], cwd=app_dir)
  util.run([
      "ks", "param", "set", "kubebench-quickstarter-volume", "namespace",
      namespace
  ], cwd=app_dir)
  apply_command = [
      "ks", "apply", "default", "-c", "kubebench-quickstarter-service"
  ]
  util.run(apply_command, cwd=app_dir)

  kubebench_nfs_deployment_name = "kubebench-nfs-deploy"
  kubebench_nfs_service_name = "kubebench-nfs-svc"
  logging.info("Verifying NFS deployment started")
  util.wait_for_deployment(api_client, namespace,
                           kubebench_nfs_deployment_name)
  service = get_k8s_service(api_client, namespace, kubebench_nfs_service_name)
  # The volume component needs the NFS server's cluster IP, which is only
  # known once the service above has been created.
  util.run([
      "ks", "param", "set", "kubebench-quickstarter-volume", "nfsServiceIP",
      service.spec.cluster_ip
  ], cwd=app_dir)
  apply_command = [
      "ks", "apply", "default", "-c", "kubebench-quickstarter-volume"
  ]
  util.run(apply_command, cwd=app_dir)
def check_deployments_ready(record_xml_attribute, namespace, name,
                            deployments, cluster_name):
  """Test that Kubeflow deployments are successfully deployed.

  Args:
    record_xml_attribute: Pytest fixture used to set the junit test name.
    namespace: The namespace Kubeflow is deployed to.
    name: Junit test name to record.
    deployments: Iterable of deployment names to wait on.
    cluster_name: EKS cluster whose credentials should be loaded.
  """
  set_logging()
  util.set_pytest_junit(record_xml_attribute, name)

  kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

  client = deploy_utils.create_k8s_client()
  for deployment in deployments:
    logging.info("Verifying that deployment %s started...", deployment)
    util.wait_for_deployment(client, namespace, deployment, 10)
def test_wait_for_deployment(self):
  """wait_for_deployment returns once ready_replicas is reported."""
  mock_client = mock.MagicMock(spec=k8s_client.ApiClient)

  # Build a fake deployment whose status already reports one ready replica.
  deployment = k8s_client.ExtensionsV1beta1Deployment()
  deployment.status = k8s_client.ExtensionsV1beta1DeploymentStatus()
  deployment.status.ready_replicas = 1
  mock_client.call_api.return_value = deployment

  self.assertIsNotNone(
      util.wait_for_deployment(mock_client, "some-namespace",
                               "some-deployment"))
def test_serve(self):
  """Apply the serving ksonnet components and wait for the deployment."""
  # We repeat the test multiple times.
  # This ensures that if we delete the job we can create a new job with the
  # same name.
  client = k8s_client.ApiClient()

  # Set up and apply each ksonnet component in turn.
  for comp in ("mnist-deploy-gcp", "mnist-service"):
    ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, comp,
                         self.params)
    util.run([self.ks_cmd, "apply", self.env, "-c", comp], cwd=self.app_dir)

  logging.info("Created deployment %s in namespaces %s", self.name,
               self.namespace)
  util.wait_for_deployment(client, self.namespace, self.name,
                           timeout_minutes=4)
def deploy_kubeflow(_):
  """Deploy Kubeflow."""
  args = parse_args()
  namespace = args.namespace
  api_client = deploy_utils.create_k8s_client()
  util.load_kube_config()

  # Jupyter runs as a stateful set rather than a deployment.
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace, "jupyter")

  # The remaining core controllers all run as deployments.
  for name in ("tf-job-operator-v1beta1", "pytorch-operator",
               "studyjob-controller"):
    logging.info("Verifying that %s started...", name)
    util.wait_for_deployment(api_client, namespace, name)
def deploy_kubeflow(_):
  """Deploy Kubeflow."""
  args = parse_args()
  namespace = args.namespace
  api_client = deploy_utils.create_k8s_client()
  util.load_kube_config()

  # TfJob operator runs as a deployment.
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace, "tf-job-operator-v1beta1")

  # Jupyter runs as a stateful set.
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace, "jupyter")

  # PyTorch operator runs as a deployment.
  logging.info("Verifying PyTorchJob controller started.")
  util.wait_for_deployment(api_client, namespace, "pytorch-operator")
def check_deployments_ready(record_xml_attribute, namespace, name,
                            deployments):
  """Test that Kubeflow deployments are successfully deployed.

  Args:
    record_xml_attribute: Pytest fixture used to set the junit test name.
    namespace: The namespace Kubeflow is deployed to.
    name: Junit test name to record.
    deployments: Iterable of deployment names to wait on.
  """
  set_logging()
  # TODO(jlewi): Should we do this in the calling function)?
  util.set_pytest_junit(record_xml_attribute, name)

  # Need to activate account for scopes.
  key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
  if key_file:
    util.run(["gcloud", "auth", "activate-service-account",
              "--key-file=" + key_file])

  client = deploy_utils.create_k8s_client()
  util.load_kube_config()

  for deployment in deployments:
    logging.info("Verifying that deployment %s started...", deployment)
    util.wait_for_deployment(client, namespace, deployment, 10)
def test_serve(self):
  """Configure the serving manifests with kustomize, apply them, and wait.

  Fixes relative to the original:
    * kustomize invocations passed multi-word strings like
      "kustomize edit set" as a single argv element, which subprocess
      would treat as the executable name and fail to find.
    * `configmap` was referenced but never defined (NameError); it is the
      same 'mnist-map-serving' configmap used by the sibling test.
    * The --from-literal key/value was split across argv elements instead
      of concatenated into one "--from-literal=k=v" flag.
    * The manifests were never built/applied, so the wait below could
      never succeed; build + apply matches the sibling implementation.
  """
  # We repeat the test multiple times.
  # This ensures that if we delete the job we can create a new job with the
  # same name.
  api_client = k8s_client.ApiClient()

  # Configure custom parameters using kustomize.
  configmap = 'mnist-map-serving'
  for pair in self.params.split(","):
    k, v = pair.split("=", 1)
    if k == "namespace":
      util.run(['kustomize', 'edit', 'set', k, v], cwd=self.app_dir)
    else:
      util.run([
          'kustomize', 'edit', 'add', 'configmap', configmap,
          '--from-literal=' + k + '=' + v
      ], cwd=self.app_dir)

  # util.run cannot handle pipes, so use check_call for build | apply.
  subprocess.check_call('kustomize build ' + self.app_dir +
                        '| kubectl apply -f -', shell=True)

  util.wait_for_deployment(api_client, self.namespace, self.name,
                           timeout_minutes=4)
def test_serve(self):
  """Serve the mnist model with kustomize and wait for the deployment."""
  # We repeat the test multiple times.
  # This ensures that if we delete the job we can create a new job with the
  # same name.
  api_client = k8s_client.ApiClient()

  # TODO (jinchihe) beflow code will be removed once new test-worker image
  # is publish in https://github.com/kubeflow/testing/issues/373.
  kusUrl = 'https://github.com/kubernetes-sigs/kustomize/' \
           'releases/download/v2.0.3/kustomize_2.0.3_linux_amd64'
  util.run(['wget', '-O', '/usr/local/bin/kustomize', kusUrl],
           cwd=self.app_dir)
  util.run(['chmod', 'a+x', '/usr/local/bin/kustomize'], cwd=self.app_dir)

  # Apply the components
  configmap = 'mnist-map-serving'
  for pair in self.params.split(","):
    k, v = pair.split("=", 1)
    if k == "namespace":
      util.run(['kustomize', 'edit', 'set', k, v], cwd=self.app_dir)
    else:
      # Everything except the namespace becomes a configmap literal.
      util.run([
          'kustomize', 'edit', 'add', 'configmap', configmap,
          '--from-literal=' + k + '=' + v
      ], cwd=self.app_dir)

  # Seems the util.run cannot handle pipes case, using check_call.
  subCmd = 'kustomize build ' + self.app_dir + '| kubectl apply -f -'
  subprocess.check_call(subCmd, shell=True)

  util.wait_for_deployment(api_client, self.namespace, self.name,
                           timeout_minutes=4)
def wait_for_kubeflow_install(api_client, namespace): """Wait until kubeflow components are up.""" # Verify that the Argo operator is deployed. argo_deployment_name = "workflow-controller" logging.info("Verifying Argo controller started.") util.wait_for_deployment(api_client, namespace, argo_deployment_name) # Verify that the TfJob operator is actually deployed. tf_job_deployment_name = "tf-job-operator" logging.info("Verifying TfJob controller started.") util.wait_for_deployment(api_client, namespace, tf_job_deployment_name) # Verify that the Argo operator is deployed. mpi_job_deployment_name = "mpi-operator" logging.info("Verifying MPIJob controller started.") util.wait_for_deployment(api_client, namespace, mpi_job_deployment_name)
def setup(args):
  """Test deploying Kubeflow.

  Creates a ksonnet app wired to the local Kubeflow source, deploys the
  kubeflow-core component, and verifies that the TfJob operator and
  JupyterHub come up. Optionally deploys a TF serving model as well.

  Args:
    args: Parsed command line arguments.

  Raises:
    ValueError: If the optional TF serving service was not assigned a
      cluster ip.
  """
  api_client = create_k8s_client(args)

  if not os.path.exists(args.test_dir):
    os.makedirs(args.test_dir)

  logging.info("Using test directory: %s", args.test_dir)

  namespace_name = args.namespace

  namespace = _setup_test(api_client, namespace_name)
  logging.info("Using namespace: %s", namespace)
  if args.github_token:
    # Don't log the token itself; it is a credential.
    logging.info("Setting GITHUB_TOKEN.")
    # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = args.github_token

  if not os.getenv("GITHUB_TOKEN"):
    # logging.warn is a deprecated alias; use logging.warning.
    logging.warning("GITHUB_TOKEN not set; you will probably hit Github API "
                    "limits.")

  # Initialize a ksonnet app.
  app_name = "kubeflow-test"
  util.run(["ks", "init", app_name], cwd=args.test_dir)

  app_dir = os.path.join(args.test_dir, app_name)

  kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
  util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
           cwd=app_dir)

  # Install required packages
  packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]
  for p in packages:
    util.run(["ks", "pkg", "install", p], cwd=app_dir)

  # Delete the vendor directory and replace with a symlink to the src
  # so that we use the code at the desired commit.
  target_dir = os.path.join(app_dir, "vendor", "kubeflow")

  logging.info("Deleting %s", target_dir)
  shutil.rmtree(target_dir)

  REPO_ORG = "kubeflow"
  REPO_NAME = "kubeflow"
  REGISTRY_PATH = "kubeflow"
  source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                        REGISTRY_PATH)
  logging.info("Creating link %s -> %s", target_dir, source)
  os.symlink(source, target_dir)

  # Deploy Kubeflow
  util.run([
      "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
      "--namespace=" + namespace.metadata.name
  ], cwd=app_dir)

  # TODO(jlewi): For reasons I don't understand even though we ran
  # configure_kubectl above, if we don't rerun it we get rbac errors
  # when we do ks apply; I think because we aren't using the proper service
  # account. This might have something to do with the way ksonnet gets
  # its credentials; maybe we need to configure credentials after calling
  # ks init?
  if args.cluster:
    util.configure_kubectl(args.project, args.zone, args.cluster)

  apply_command = ["ks", "apply", "default", "-c", "kubeflow-core"]
  util.run(apply_command, cwd=app_dir)

  # Verify that the TfJob operator is actually deployed.
  tf_job_deployment_name = "tf-job-operator"
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace.metadata.name,
                           tf_job_deployment_name)

  # Verify that JupyterHub is actually deployed.
  jupyter_name = "tf-hub"
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)

  if args.deploy_tf_serving:
    logging.info("Deploying tf-serving.")
    util.run([
        "ks", "generate", "tf-serving", "modelServer", "--name=inception",
        "--namespace=" + namespace.metadata.name,
        "--model_path=gs://kubeflow-models/inception",
        "--model_server_image=" + args.model_server_image
    ], cwd=app_dir)

    apply_command = ["ks", "apply", "default", "-c", "modelServer"]
    util.run(apply_command, cwd=app_dir)

    core_api = k8s_client.CoreV1Api(api_client)
    deploy = core_api.read_namespaced_service("inception",
                                              namespace.metadata.name)
    cluster_ip = deploy.spec.cluster_ip
    # The original read cluster_ip but never checked it; fail fast like the
    # other deploy paths if the service has no cluster ip.
    if not cluster_ip:
      raise ValueError("inception service wasn't assigned a cluster ip.")

    util.wait_for_deployment(api_client, namespace.metadata.name, "inception")
    logging.info("Verified TF serving started.")
def test_kf_is_ready(namespace, use_basic_auth, use_istio):
  """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
    use_basic_auth: Whether the deployment uses basic auth (vs IAP).
    use_istio: Whether ingress components live in the istio-system
      namespace instead of the Kubeflow namespace.
  """
  logging.info("Using namespace %s", namespace)

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run([
        "gcloud", "auth", "activate-service-account",
        "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
    ])

  api_client = deploy_utils.create_k8s_client()

  util.load_kube_config()

  # Verify that components are actually deployed.
  # TODO(jlewi): We need to parameterize this list based on whether
  # we are using IAP or basic auth.
  deployment_names = [
      "argo-ui",
      "centraldashboard",
      "cloud-endpoints-controller",
      "jupyter-web-app-deployment",
      "metadata-db",
      "metadata-deployment",
      "metadata-ui",
      "ml-pipeline",
      "ml-pipeline-scheduledworkflow",
      "ml-pipeline-ui",
      "notebook-controller-deployment",
      "tf-job-operator",
      "pytorch-operator",
      "katib-controller",
      "workflow-controller",
  ]

  stateful_set_names = [
      "kfserving-controller-manager",
  ]

  ingress_related_deployments = []
  ingress_related_stateful_sets = []

  if use_basic_auth:
    deployment_names.extend(["basic-auth-login"])
    ingress_related_stateful_sets.extend(["backend-updater"])
  else:
    ingress_related_deployments.extend(["iap-enabler"])
    ingress_related_stateful_sets.extend(["backend-updater"])

  # TODO(jlewi): Might want to parallelize this.
  for deployment_name in deployment_names:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, namespace, deployment_name, 10)

  ingress_namespace = "istio-system" if use_istio else namespace
  for deployment_name in ingress_related_deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, ingress_namespace, deployment_name,
                             10)

  all_stateful_sets = [(namespace, name) for name in stateful_set_names]
  all_stateful_sets.extend([(ingress_namespace, name)
                            for name in ingress_related_stateful_sets])

  for ss_namespace, name in all_stateful_sets:
    logging.info("Verifying that stateful set %s.%s started...", ss_namespace,
                 name)
    # A bare "except:" would also intercept KeyboardInterrupt/SystemExit;
    # catch Exception instead, collect debug info, and re-raise.
    try:
      util.wait_for_statefulset(api_client, ss_namespace, name)
    except Exception:  # pylint: disable=broad-except
      # Collect debug information by running describe
      util.run([
          "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
      ])
      raise

  # TODO(jlewi): We should verify that the ingress is created and healthy.

  knative_namespace = "knative-serving"
  knative_related_deployments = [
      "activator",
      "autoscaler",
      "controller",
  ]
  for deployment_name in knative_related_deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, knative_namespace, deployment_name,
                             10)
def test_kf_is_ready(namespace, use_basic_auth, use_istio, app_path):
  """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
    use_basic_auth: Whether the deployment uses basic auth (vs IAP).
    use_istio: Whether ingress components live in the istio-system
      namespace instead of the Kubeflow namespace.
    app_path: Path of the KfDef app directory; app.yaml is read from there
      to determine the platform.
  """
  set_logging()
  logging.info("Using namespace %s", namespace)

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run([
        "gcloud", "auth", "activate-service-account",
        "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
    ])

  api_client = deploy_utils.create_k8s_client()

  util.load_kube_config()

  # Verify that components are actually deployed.
  # TODO(jlewi): We need to parameterize this list based on whether
  # we are using IAP or basic auth.
  # TODO(yanniszark): This list is incomplete and missing a lot of components.
  deployment_names = [
      "argo-ui",
      "centraldashboard",
      "jupyter-web-app-deployment",
      "minio",
      "ml-pipeline",
      "ml-pipeline-persistenceagent",
      "ml-pipeline-scheduledworkflow",
      "ml-pipeline-ui",
      "ml-pipeline-viewer-controller-deployment",
      "mysql",
      "notebook-controller-deployment",
      "profiles-deployment",
      "pytorch-operator",
      "tf-job-operator",
      "workflow-controller",
  ]

  stateful_set_names = []

  # The platform recorded in the KfDef spec decides which extra
  # components must be up.
  with open(os.path.join(app_path, "app.yaml")) as f:
    kfdef = yaml.safe_load(f)
  platform = kfdef["spec"]["platform"]

  ingress_related_deployments = [
      "istio-citadel",
      "istio-egressgateway",
      "istio-galley",
      "istio-ingressgateway",
      "istio-pilot",
      "istio-policy",
      "istio-sidecar-injector",
      "istio-telemetry",
      "istio-tracing",
      "kiali",
      "prometheus",
  ]
  ingress_related_stateful_sets = []

  knative_namespace = "knative-serving"
  knative_related_deployments = [
      "activator",
      "autoscaler",
      "controller",
  ]

  if platform == "gcp":
    deployment_names.extend(["cloud-endpoints-controller"])
    stateful_set_names.extend(["kfserving-controller-manager"])
    if use_basic_auth:
      deployment_names.extend(["basic-auth-login"])
      ingress_related_stateful_sets.extend(["backend-updater"])
    else:
      ingress_related_deployments.extend(["iap-enabler"])
      ingress_related_stateful_sets.extend(["backend-updater"])
  elif platform == "existing_arrikto":
    deployment_names.extend(["dex"])
    ingress_related_deployments.extend(["authservice"])
    knative_related_deployments = []

  # TODO(jlewi): Might want to parallelize this.
  for deployment_name in deployment_names:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, namespace, deployment_name, 10)

  ingress_namespace = "istio-system" if use_istio else namespace
  for deployment_name in ingress_related_deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, ingress_namespace, deployment_name,
                             10)

  all_stateful_sets = [(namespace, name) for name in stateful_set_names]
  all_stateful_sets.extend([(ingress_namespace, name)
                            for name in ingress_related_stateful_sets])

  for ss_namespace, name in all_stateful_sets:
    logging.info("Verifying that stateful set %s.%s started...", ss_namespace,
                 name)
    # A bare "except:" would also intercept KeyboardInterrupt/SystemExit;
    # catch Exception instead, collect debug info, and re-raise.
    try:
      util.wait_for_statefulset(api_client, ss_namespace, name)
    except Exception:  # pylint: disable=broad-except
      # Collect debug information by running describe
      util.run([
          "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
      ])
      raise

  # TODO(jlewi): We should verify that the ingress is created and healthy.

  for deployment_name in knative_related_deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, knative_namespace, deployment_name,
                             10)
def setup(args):
  """Setup a GKE cluster for TensorFlow jobs.

  Creates the cluster (optionally with GPU accelerators), points kubectl at
  it, deploys the kubeflow-core ksonnet component, and records the result as
  a junit test case uploaded via GCS.

  Args:
    args: Command line arguments that control the setup process.
  """
  gke = discovery.build("container", "v1")

  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  machine_type = "n1-standard-8"

  cluster_request = {
      "cluster": {
          "name": cluster_name,
          "description": "A GKE cluster for TF.",
          "initialNodeCount": 1,
          "nodeConfig": {
              "machineType": machine_type,
              "oauthScopes": [
                  "https://www.googleapis.com/auth/cloud-platform",
              ],
          },
      }
  }

  if args.accelerators:
    # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha
    cluster_request["cluster"]["enableKubernetesAlpha"] = True

    cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
    # Each spec is "<type>=<count>", e.g. "nvidia-tesla-k80=1".
    for accelerator_spec in args.accelerators:
      accelerator_type, accelerator_count = accelerator_spec.split("=", 1)
      cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
          "acceleratorCount": accelerator_count,
          "acceleratorType": accelerator_type,
      })

  util.create_cluster(gke, project, zone, cluster_request)

  util.configure_kubectl(project, zone, cluster_name)

  util.load_kube_config()

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  t = test_util.TestCase()
  try:
    start = time.time()

    params = {
        "tfJobImage": args.image,
        "name": "kubeflow-core",
        "namespace": args.namespace,
        "tfJobVersion": args.tf_job_version,
    }

    component = "core"

    account = util.run_and_output(
        ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
    logging.info("Using GCP account %s", account)

    # NOTE(review): this span was redacted in the checked-in source by a
    # secret scrubber ('"--user="******"v1alpha1"'); reconstructed from the
    # sibling setup_kubeflow flow — confirm against version history.
    util.run([
        "kubectl", "create", "clusterrolebinding", "default-admin",
        "--clusterrole=cluster-admin", "--user=" + account
    ])

    ks_deploy(args.test_app_dir, component, params, account=account)

    # Verify that the TfJob operator is actually deployed.
    if args.tf_job_version == "v1alpha1":
      tf_job_deployment_name = "tf-job-operator"
    elif args.tf_job_version == "v1alpha2":
      tf_job_deployment_name = "tf-job-operator-v1alpha2"
    else:
      raise ValueError(
          "Unrecognized value for tf_job_version %s" % args.tf_job_version)
    logging.info("Verifying TfJob deployment %s started.",
                 tf_job_deployment_name)

    # TODO(jlewi): We should verify the image of the operator is the correct.
    util.wait_for_deployment(api_client, args.namespace,
                             tf_job_deployment_name)

  # Reraise the exception so that the step fails because there's no point
  # continuing the test.
  except subprocess.CalledProcessError as e:
    t.failure = "kubeflow-deploy failed;\n" + (e.output or "")
    raise
  except util.TimeoutError as e:
    t.failure = e.message
    raise
  finally:
    t.time = time.time() - start
    t.name = "kubeflow-deploy"
    t.class_name = "GKE"
    gcs_client = storage.Client(project=args.project)
    test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def run():
  """Deploy kubeflow-core via a fresh ksonnet app and verify it came up."""
  # Allocate the test namespace everything below deploys into.
  test_namespace = _setup_test(api_client, namespace_name)
  logging.info("Using namespace: %s", test_namespace)

  # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
  # see: https://github.com/ksonnet/ksonnet/issues/233
  os.environ["GITHUB_TOKEN"] = args.github_token

  # Initialize a ksonnet app.
  ks_app = "kubeflow-test"
  util.run(["ks", "init", ks_app], cwd=args.test_dir)
  ks_app_dir = os.path.join(args.test_dir, ks_app)

  # Register the Kubeflow registry and install the required packages.
  util.run([
      "ks", "registry", "add", "kubeflow",
      "github.com/kubeflow/kubeflow/tree/master/kubeflow"
  ], cwd=ks_app_dir)
  for pkg in ("kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"):
    util.run(["ks", "pkg", "install", pkg], cwd=ks_app_dir)

  # Swap the vendored registry copy for a symlink into the checked-out
  # source so that we use the code at the desired commit.
  vendored_dir = os.path.join(ks_app_dir, "vendor", "kubeflow")
  logging.info("Deleting %s", vendored_dir)
  shutil.rmtree(vendored_dir)
  # <test_dir>/src/<org>/<repo>/<registry path> == .../kubeflow/kubeflow/kubeflow
  checkout_dir = os.path.join(args.test_dir, "src", "kubeflow", "kubeflow",
                              "kubeflow")
  logging.info("Creating link %s -> %s", vendored_dir, checkout_dir)
  os.symlink(checkout_dir, vendored_dir)

  # Deploy Kubeflow
  util.run([
      "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
      "--namespace=" + test_namespace.metadata.name
  ], cwd=ks_app_dir)

  # TODO(jlewi): For reasons I don't understand even though we ran
  # configure_kubectl above, if we don't rerun it we get rbac errors
  # when we do ks apply; I think because we aren't using the proper service
  # account. This might have something to do with the way ksonnet gets
  # its credentials; maybe we need to configure credentials after calling
  # ks init?
  if args.cluster:
    util.configure_kubectl(args.project, args.zone, args.cluster)

  util.run(["ks", "apply", "default", "-c", "kubeflow-core"], cwd=ks_app_dir)

  # Verify that the TfJob operator is actually deployed.
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, test_namespace.metadata.name,
                           "tf-job-operator")

  # Verify that JupyterHub is actually deployed.
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, test_namespace.metadata.name,
                            "tf-hub")
def test_kf_is_ready(record_xml_attribute, namespace, use_basic_auth,
                     app_path, cluster_name):
  """Test that Kubeflow was successfully deployed.

  Args:
    record_xml_attribute: pytest fixture used to set the junit test name.
    namespace: The namespace Kubeflow is deployed to.
    use_basic_auth: True if the deployment uses basic auth rather than IAP
      (only consulted on the gcp platform).
    app_path: Path to the kfctl app directory; used to determine the platform.
    cluster_name: Name of the EKS cluster to load kubeconfig for.
  """
  set_logging()
  util.set_pytest_junit(record_xml_attribute, "test_kf_is_ready")

  kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

  api_client = deploy_utils.create_k8s_client()

  # Verify that components are actually deployed.
  deployment_names = []
  stateful_set_names = []

  platform, _ = get_platform_app_name(app_path)

  # TODO(PatrickXYS): not sure why istio-galley can't found
  ingress_related_deployments = [
      "cluster-local-gateway",
      "istio-citadel",
      "istio-ingressgateway",
      "istio-pilot",
      "istio-policy",
      "istio-sidecar-injector",
      "istio-telemetry",
      "prometheus",
  ]
  ingress_related_stateful_sets = []

  knative_namespace = "knative-serving"
  knative_related_deployments = [
      "activator",
      "autoscaler",
      "autoscaler-hpa",
      "controller",
      "networking-istio",
      "webhook",
  ]

  if platform == "gcp":
    deployment_names.extend(["cloud-endpoints-controller"])
    stateful_set_names.extend(["kfserving-controller-manager"])
    if use_basic_auth:
      deployment_names.extend(["basic-auth-login"])
      ingress_related_stateful_sets.extend(["backend-updater"])
    else:
      ingress_related_deployments.extend(["iap-enabler"])
      ingress_related_stateful_sets.extend(["backend-updater"])
  elif platform == "existing_arrikto":
    deployment_names.extend(["dex"])
    ingress_related_deployments.extend(["authservice"])
    knative_related_deployments = []
  elif platform == "aws":
    # TODO(PatrickXYS): Extend List with AWS Deployment
    deployment_names.extend(["alb-ingress-controller"])

  # TODO(jlewi): Might want to parallelize this.
  for deployment_name in deployment_names:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, namespace, deployment_name, 10)

  ingress_namespace = "istio-system"
  for deployment_name in ingress_related_deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, ingress_namespace, deployment_name,
                             10)

  all_stateful_sets = [(namespace, name) for name in stateful_set_names]
  all_stateful_sets.extend([(ingress_namespace, name)
                            for name in ingress_related_stateful_sets])

  for ss_namespace, name in all_stateful_sets:
    logging.info("Verifying that stateful set %s.%s started...", ss_namespace,
                 name)
    try:
      util.wait_for_statefulset(api_client, ss_namespace, name)
    # Was a bare `except:`; narrowed to Exception so KeyboardInterrupt /
    # SystemExit are not intercepted by the debug-describe step.
    except Exception:
      # Collect debug information by running describe
      util.run([
          "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
      ])
      raise

  ingress_names = ["istio-ingress"]
  # Check if Ingress is Ready and Healthy
  if platform in ["aws"]:
    for ingress_name in ingress_names:
      logging.info("Verifying that ingress %s started...", ingress_name)
      util.wait_for_ingress(api_client, ingress_namespace, ingress_name, 10)

  for deployment_name in knative_related_deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, knative_namespace, deployment_name,
                             10)

  # Check if Dex is Ready and Healthy
  dex_deployment_names = ["dex"]
  dex_namespace = "auth"
  for dex_deployment_name in dex_deployment_names:
    logging.info("Verifying that deployment %s started...",
                 dex_deployment_name)
    util.wait_for_deployment(api_client, dex_namespace, dex_deployment_name,
                             10)

  # Check if Cert-Manager is Ready and Healthy
  cert_manager_deployment_names = [
      "cert-manager",
      "cert-manager-cainjector",
      "cert-manager-webhook",
  ]
  cert_manager_namespace = "cert-manager"
  for cert_manager_deployment_name in cert_manager_deployment_names:
    logging.info("Verifying that deployment %s started...",
                 cert_manager_deployment_name)
    util.wait_for_deployment(api_client, cert_manager_namespace,
                             cert_manager_deployment_name, 10)
def setup_kubeflow(args):
  """Setup Kubeflow.

  Args:
    args: Command line arguments that control the setup process.
  """
  util.configure_kubectl(args.project, args.zone, args.cluster)
  util.load_kube_config()

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  case = test_util.TestCase()
  try:
    start = time.time()

    deploy_params = {
        "tfJobImage": args.image,
        "name": "kubeflow-core",
        "namespace": args.namespace,
        "tfJobVersion": args.tf_job_version,
    }

    gcp_account = util.run_and_output(
        ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
    logging.info("Using GCP account %s", gcp_account)

    ks_deploy(args.test_app_dir, "core", deploy_params, account=gcp_account)

    # Map the requested TFJob API version to its operator deployment name;
    # anything else is rejected up front.
    operators_by_version = {
        "v1alpha2": "tf-job-operator-v1alpha2",
        "v1beta1": "tf-job-operator-v1beta1",
    }
    if args.tf_job_version not in operators_by_version:
      raise ValueError("Unrecognized value for tf_job_version %s" %
                       args.tf_job_version)
    tf_job_deployment_name = operators_by_version[args.tf_job_version]

    logging.info("Verifying TfJob deployment %s started.",
                 tf_job_deployment_name)

    # TODO(jlewi): We should verify the image of the operator is the correct
    # one.
    try:
      util.wait_for_deployment(api_client, args.namespace,
                               tf_job_deployment_name)
    finally:
      # Run kubectl describe to get useful information about the deployment.
      # This will help troubleshoot any errors.
      util.run([
          "kubectl", "-n", args.namespace, "describe", "deploy",
          tf_job_deployment_name
      ])
      util.run([
          "kubectl", "-n", args.namespace, "describe", "pods", "-l",
          "name=tf-job-operator"
      ])

  # Reraise the exception so that the step fails because there's no point
  # continuing the test.
  except subprocess.CalledProcessError as e:
    case.failure = "kubeflow-deploy failed;\n" + (e.output or "")
    raise
  except util.TimeoutError as e:
    case.failure = e.message
    raise
  finally:
    case.time = time.time() - start
    case.name = "kubeflow-deploy"
    case.class_name = "GKE"
    gcs_client = storage.Client(project=args.project)
    test_util.create_junit_xml_file([case], args.junit_path, gcs_client)
def test_kf_is_ready(record_xml_attribute, namespace, use_basic_auth,
                     use_istio, app_path):
  """Test that Kubeflow was successfully deployed.

  Args:
    record_xml_attribute: pytest fixture used to set the junit test name.
    namespace: The namespace Kubeflow is deployed to.
    use_basic_auth: True if the deployment uses basic auth rather than IAP
      (only consulted on the gcp platform).
    use_istio: True if Istio is deployed; ingress-related deployments are
      then expected in "istio-system" instead of `namespace`.
    app_path: Path to the kfctl app directory; used to determine the platform.
  """
  set_logging()
  util.set_pytest_junit(record_xml_attribute, "test_kf_is_ready")

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run([
        "gcloud", "auth", "activate-service-account",
        "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
    ])

  api_client = deploy_utils.create_k8s_client()

  util.load_kube_config()

  # Verify that components are actually deployed.
  # TODO(jlewi): We need to parameterize this list based on whether
  # we are using IAP or basic auth.
  # TODO(yanniszark): This list is incomplete and missing a lot of components.
  deployment_names = [
      "workflow-controller",
  ]

  stateful_set_names = []

  platform, _ = get_platform_app_name(app_path)

  ingress_related_deployments = [
      "istio-egressgateway",
      "istio-ingressgateway",
      "istio-pilot",
      "istio-policy",
      "istio-sidecar-injector",
      "istio-telemetry",
      "istio-tracing",
      "prometheus",
  ]
  ingress_related_stateful_sets = []

  knative_namespace = "knative-serving"
  knative_related_deployments = [
      "activator",
      "autoscaler",
      "controller",
  ]

  if platform == "gcp":
    deployment_names.extend(["cloud-endpoints-controller"])
    stateful_set_names.extend(["kfserving-controller-manager"])
    if use_basic_auth:
      deployment_names.extend(["basic-auth-login"])
      ingress_related_stateful_sets.extend(["backend-updater"])
    else:
      ingress_related_deployments.extend(["iap-enabler"])
      ingress_related_stateful_sets.extend(["backend-updater"])
  elif platform == "existing_arrikto":
    deployment_names.extend(["dex"])
    ingress_related_deployments.extend(["authservice"])
    knative_related_deployments = []

  # TODO(jlewi): Might want to parallelize this.
  for deployment_name in deployment_names:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, namespace, deployment_name, 10)

  ingress_namespace = "istio-system" if use_istio else namespace
  for deployment_name in ingress_related_deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, ingress_namespace, deployment_name,
                             10)

  all_stateful_sets = [(namespace, name) for name in stateful_set_names]
  all_stateful_sets.extend([(ingress_namespace, name)
                            for name in ingress_related_stateful_sets])

  for ss_namespace, name in all_stateful_sets:
    logging.info("Verifying that stateful set %s.%s started...", ss_namespace,
                 name)
    try:
      util.wait_for_statefulset(api_client, ss_namespace, name)
    # Was a bare `except:`; narrowed to Exception so KeyboardInterrupt /
    # SystemExit are not intercepted by the debug-describe step.
    except Exception:
      # Collect debug information by running describe
      util.run([
          "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
      ])
      raise

  # TODO(jlewi): We should verify that the ingress is created and healthy.

  for deployment_name in knative_related_deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, knative_namespace, deployment_name,
                             10)