def setup(args):
  """Test deploying Kubeflow."""
  client = create_k8s_client(args)
  ks_app_dir = setup_kubeflow_ks_app(args, client)
  ns = args.namespace

  # TODO(jlewi): We don't need to generate a core component if we are
  # just deploying TFServing. Might be better to refactor this code.
  # Deploy Kubeflow
  generate_cmd = [
      "ks", "generate", "core", "kubeflow-core",
      "--name=kubeflow-core",
      "--namespace=" + ns,
  ]
  util.run(generate_cmd, cwd=ks_app_dir)

  # TODO(jlewi): For reasons I don't understand even though we ran
  # configure_kubectl above, if we don't rerun it we get rbac errors
  # when we do ks apply; I think because we aren't using the proper service
  # account. This might have something to do with the way ksonnet gets
  # its credentials; maybe we need to configure credentials after calling
  # ks init?
  if args.cluster:
    util.configure_kubectl(args.project, args.zone, args.cluster)

  util.run(["ks", "apply", "default", "-c", "kubeflow-core"], cwd=ks_app_dir)

  # Verify that the TfJob operator is actually deployed.
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(client, ns, "tf-job-operator")

  # Verify that JupyterHub is actually deployed.
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(client, ns, "tf-hub")
def check_statefulsets_ready(record_xml_attribute, namespace, name,
                             stateful_sets):
  """Test that Kubeflow stateful sets are successfully deployed.

  Args:
    record_xml_attribute: Pytest fixture used to set the junit test name.
    namespace: The namespace to check.
    name: Name to report in the junit XML for this check.
    stateful_sets: Iterable of stateful set names expected to become ready.

  Raises:
    Exception: If any stateful set does not become ready; the original
      wait failure is chained as the cause.
  """
  set_logging()
  # TODO(jlewi): Should we do this in the calling function)?
  util.set_pytest_junit(record_xml_attribute, name)

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run([
        "gcloud", "auth", "activate-service-account",
        "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
    ])

  api_client = deploy_utils.create_k8s_client()
  util.load_kube_config()

  for set_name in stateful_sets:
    logging.info("Verifying that stateful set %s.%s started...", namespace,
                 set_name)
    try:
      util.wait_for_statefulset(api_client, namespace, set_name)
    except Exception as e:
      # Collect debug information by running describe.
      util.run(
          ["kubectl", "-n", namespace, "describe", "statefulsets", set_name])
      # Bug fix: report the stateful set that actually failed (set_name),
      # not the junit test name; chain the original error for debugging.
      raise Exception(
          f"Stateful set {namespace}.{set_name} is not ready") from e
def deploy_kubeflow(test_case):
  """Deploy Kubeflow."""
  args = parse_args()
  test_dir = test_case.test_suite.test_dir
  ns = args.namespace
  client = deploy_utils.create_k8s_client()
  ks_dir = deploy_utils.setup_kubeflow_ks_app(test_dir, ns, args.github_token,
                                              client)

  # TODO(jlewi): We don't need to generate a core component if we are
  # just deploying TFServing. Might be better to refactor this code.
  # Deploy Kubeflow: generate each (prototype, component) pair.
  for prototype, component in (("core", "kubeflow-core"),
                               ("pytorch-operator", "pytorch-operator")):
    util.run([
        "ks", "generate", prototype, component, "--name=" + component,
        "--namespace=" + ns
    ], cwd=ks_dir)

  apply_command = [
      "ks", "apply", "default",
      "-c", "kubeflow-core",
      "-c", "pytorch-operator",
  ]

  if args.as_gcloud_user:
    account = deploy_utils.get_gcp_identity()
    logging.info("Impersonate %s", account)

    # If we don't use --as to impersonate the service account then we
    # observe RBAC errors when doing certain operations. The problem appears
    # to be that we end up using the in cluster config (e.g. pod service account)
    # and not the GCP service account which has more privileges.
    apply_command.append("--as=" + account)
  util.run(apply_command, cwd=ks_dir)

  # Verify that the TfJob operator is actually deployed.
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(client, ns, "tf-job-operator")

  # Verify that JupyterHub is actually deployed.
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(client, ns, "tf-hub")

  # Verify that PyTorch Operator actually deployed
  logging.info("Verifying PyTorchJob controller started.")
  util.wait_for_deployment(client, ns, "pytorch-operator")
def test_kf_is_ready(namespace, use_basic_auth):
  """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """
  logging.info("Using namespace %s", namespace)

  # Need to activate account for scopes.
  creds = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
  if creds:
    util.run(
        ["gcloud", "auth", "activate-service-account", "--key-file=" + creds])

  client = deploy_utils.create_k8s_client()
  util.load_kube_config()

  # Verify that components are actually deployed.
  # TODO(jlewi): We need to parameterize this list based on whether
  # we are using IAP or basic auth.
  deployments = [
      "argo-ui",
      "centraldashboard",
      "cert-manager",
      "cloud-endpoints-controller",
      "jupyter-web-app",
      "ml-pipeline",
      "ml-pipeline-scheduledworkflow",
      "ml-pipeline-ui",
      "notebooks-controller",
      "tf-job-operator",
      "profiles",
      "pytorch-operator",
      "studyjob-controller",
      "workflow-controller",
  ]
  # The auth flavor determines which ingress helper should be running.
  deployments.append("basic-auth" if use_basic_auth else "iap-enabler")

  # TODO(jlewi): Might want to parallelize this.
  for deployment in deployments:
    logging.info("Verifying that deployment %s started...", deployment)
    util.wait_for_deployment(client, namespace, deployment)

  for stateful_set in ["backend-updater"]:
    logging.info("Verifying that statefulset %s started...", stateful_set)
    util.wait_for_statefulset(client, namespace, stateful_set)
def test_wait_for_statefulset(self):
  """wait_for_statefulset returns a non-None result once replicas are ready."""
  fake_client = mock.MagicMock(spec=k8s_client.ApiClient)
  ready_set = k8s_client.V1beta1StatefulSet()
  ready_set.status = k8s_client.V1beta1StatefulSetStatus(
      ready_replicas=1, replicas=1)
  # Every API call the helper makes returns the already-ready stateful set.
  fake_client.call_api.return_value = ready_set

  self.assertIsNotNone(
      util.wait_for_statefulset(fake_client, "some-namespace", "some-set"))
def deploy_kubeflow(_):
  """Deploy Kubeflow."""
  args = parse_args()
  ns = args.namespace
  client = deploy_utils.create_k8s_client()
  util.load_kube_config()

  # Verify that Jupyter is actually deployed.
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(client, ns, "jupyter")

  # Verify that core components are actually deployed.
  controllers = ("tf-job-operator-v1beta1", "pytorch-operator",
                 "studyjob-controller")
  for controller in controllers:
    logging.info("Verifying that %s started...", controller)
    util.wait_for_deployment(client, ns, controller)
def deploy_kubeflow(_):
  """Deploy Kubeflow."""
  args = parse_args()
  ns = args.namespace
  client = deploy_utils.create_k8s_client()
  util.load_kube_config()

  # Verify that the TfJob operator is actually deployed.
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(client, ns, "tf-job-operator-v1beta1")

  # Verify that Jupyter is actually deployed.
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(client, ns, "jupyter")

  # Verify that PyTorch Operator actually deployed
  logging.info("Verifying PyTorchJob controller started.")
  util.wait_for_deployment(client, ns, "pytorch-operator")
def setup(args):
  """Test deploying Kubeflow.

  Creates a test namespace and a ksonnet app, symlinks the vendored
  kubeflow registry to the checked-out source, deploys the kubeflow-core
  component and (optionally) TF Serving, then waits for the resulting
  deployments/stateful sets to become ready.

  Args:
    args: Parsed command line arguments. Uses test_dir, namespace,
      github_token, cluster/project/zone, deploy_tf_serving and
      model_server_image.
  """
  api_client = create_k8s_client(args)

  if not os.path.exists(args.test_dir):
    os.makedirs(args.test_dir)

  logging.info("Using test directory: %s", args.test_dir)

  namespace_name = args.namespace
  namespace = _setup_test(api_client, namespace_name)
  logging.info("Using namespace: %s", namespace)
  if args.github_token:
    logging.info("Setting GITHUB_TOKEN to %s.", args.github_token)
    # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = args.github_token

  if not os.getenv("GITHUB_TOKEN"):
    # logging.warn is a deprecated alias; use logging.warning.
    logging.warning("GITHUB_TOKEN not set; you will probably hit Github API "
                    "limits.")

  # Initialize a ksonnet app.
  app_name = "kubeflow-test"
  util.run([
      "ks",
      "init",
      app_name,
  ], cwd=args.test_dir)

  app_dir = os.path.join(args.test_dir, app_name)

  kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
  util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
           cwd=app_dir)

  # Install required packages
  packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

  for p in packages:
    util.run(["ks", "pkg", "install", p], cwd=app_dir)

  # Delete the vendor directory and replace with a symlink to the src
  # so that we use the code at the desired commit.
  target_dir = os.path.join(app_dir, "vendor", "kubeflow")

  logging.info("Deleting %s", target_dir)
  shutil.rmtree(target_dir)

  REPO_ORG = "kubeflow"
  REPO_NAME = "kubeflow"
  REGISTRY_PATH = "kubeflow"
  source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                        REGISTRY_PATH)
  logging.info("Creating link %s -> %s", target_dir, source)
  os.symlink(source, target_dir)

  # Deploy Kubeflow
  util.run([
      "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
      "--namespace=" + namespace.metadata.name
  ], cwd=app_dir)

  # TODO(jlewi): For reasons I don't understand even though we ran
  # configure_kubectl above, if we don't rerun it we get rbac errors
  # when we do ks apply; I think because we aren't using the proper service
  # account. This might have something to do with the way ksonnet gets
  # its credentials; maybe we need to configure credentials after calling
  # ks init?
  if args.cluster:
    util.configure_kubectl(args.project, args.zone, args.cluster)

  apply_command = [
      "ks",
      "apply",
      "default",
      "-c",
      "kubeflow-core",
  ]

  util.run(apply_command, cwd=app_dir)

  # Verify that the TfJob operator is actually deployed.
  tf_job_deployment_name = "tf-job-operator"
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace.metadata.name,
                           tf_job_deployment_name)

  # Verify that JupyterHub is actually deployed.
  jupyter_name = "tf-hub"
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)

  if args.deploy_tf_serving:
    logging.info("Deploying tf-serving.")
    util.run([
        "ks", "generate", "tf-serving", "modelServer", "--name=inception",
        "--namespace=" + namespace.metadata.name,
        "--model_path=gs://kubeflow-models/inception",
        "--model_server_image=" + args.model_server_image
    ], cwd=app_dir)

    apply_command = [
        "ks",
        "apply",
        "default",
        "-c",
        "modelServer",
    ]
    util.run(apply_command, cwd=app_dir)

    core_api = k8s_client.CoreV1Api(api_client)
    # Reading the service verifies it exists; the previously stored
    # cluster_ip local was never used, so we no longer keep it.
    core_api.read_namespaced_service("inception", namespace.metadata.name)

    util.wait_for_deployment(api_client, namespace.metadata.name, "inception")
    logging.info("Verified TF serving started.")
def test_kf_is_ready(namespace, use_basic_auth, use_istio):
  """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
    use_basic_auth: Whether the deployment uses basic auth (else IAP).
    use_istio: If true, ingress-related workloads are expected in the
      istio-system namespace instead of the Kubeflow namespace.
  """
  logging.info("Using namespace %s", namespace)

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run([
        "gcloud", "auth", "activate-service-account",
        "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
    ])

  api_client = deploy_utils.create_k8s_client()

  util.load_kube_config()

  # Verify that components are actually deployed.
  # TODO(jlewi): We need to parameterize this list based on whether
  # we are using IAP or basic auth.
  deployment_names = [
      "argo-ui",
      "centraldashboard",
      "cloud-endpoints-controller",
      "jupyter-web-app-deployment",
      "metadata-db",
      "metadata-deployment",
      "metadata-ui",
      "ml-pipeline",
      "ml-pipeline-scheduledworkflow",
      "ml-pipeline-ui",
      "notebook-controller-deployment",
      "tf-job-operator",
      "pytorch-operator",
      "katib-controller",
      "workflow-controller",
  ]

  stateful_set_names = [
      "kfserving-controller-manager",
  ]

  ingress_related_deployments = []
  ingress_related_stateful_sets = []

  if use_basic_auth:
    deployment_names.extend(["basic-auth-login"])
    ingress_related_stateful_sets.extend(["backend-updater"])
  else:
    ingress_related_deployments.extend(["iap-enabler"])
    ingress_related_stateful_sets.extend(["backend-updater"])

  # TODO(jlewi): Might want to parallelize this.
  for deployment_name in deployment_names:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, namespace, deployment_name, 10)

  ingress_namespace = "istio-system" if use_istio else namespace
  for deployment_name in ingress_related_deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, ingress_namespace, deployment_name,
                             10)

  all_stateful_sets = [(namespace, name) for name in stateful_set_names]
  all_stateful_sets.extend([(ingress_namespace, name)
                            for name in ingress_related_stateful_sets])

  for ss_namespace, name in all_stateful_sets:
    logging.info("Verifying that stateful set %s.%s started...", ss_namespace,
                 name)
    try:
      util.wait_for_statefulset(api_client, ss_namespace, name)
    except Exception:
      # Narrowed from a bare except so KeyboardInterrupt/SystemExit are not
      # intercepted just to run a debug describe.
      # Collect debug information by running describe.
      util.run(
          ["kubectl", "-n", ss_namespace, "describe", "statefulsets", name])
      raise

  # TODO(jlewi): We should verify that the ingress is created and healthy.

  knative_namespace = "knative-serving"
  knative_related_deployments = [
      "activator",
      "autoscaler",
      "controller",
  ]
  for deployment_name in knative_related_deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, knative_namespace, deployment_name,
                             10)
def test_kf_is_ready(namespace, use_basic_auth, use_istio, app_path):
  """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
    use_basic_auth: Whether the deployment uses basic auth (else IAP).
    use_istio: If true, ingress-related workloads are expected in the
      istio-system namespace instead of the Kubeflow namespace.
    app_path: Path to the KFDef app directory containing app.yaml, used to
      determine the platform.
  """
  set_logging()
  logging.info("Using namespace %s", namespace)

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run([
        "gcloud", "auth", "activate-service-account",
        "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
    ])

  api_client = deploy_utils.create_k8s_client()

  util.load_kube_config()

  # Verify that components are actually deployed.
  # TODO(jlewi): We need to parameterize this list based on whether
  # we are using IAP or basic auth.
  # TODO(yanniszark): This list is incomplete and missing a lot of components.
  deployment_names = [
      "argo-ui",
      "centraldashboard",
      "jupyter-web-app-deployment",
      "minio",
      "ml-pipeline",
      "ml-pipeline-persistenceagent",
      "ml-pipeline-scheduledworkflow",
      "ml-pipeline-ui",
      "ml-pipeline-viewer-controller-deployment",
      "mysql",
      "notebook-controller-deployment",
      "profiles-deployment",
      "pytorch-operator",
      "tf-job-operator",
      "workflow-controller",
  ]

  stateful_set_names = []

  # The platform recorded in app.yaml decides which extra components to check.
  with open(os.path.join(app_path, "app.yaml")) as f:
    kfdef = yaml.safe_load(f)
  platform = kfdef["spec"]["platform"]

  ingress_related_deployments = [
      "istio-citadel",
      "istio-egressgateway",
      "istio-galley",
      "istio-ingressgateway",
      "istio-pilot",
      "istio-policy",
      "istio-sidecar-injector",
      "istio-telemetry",
      "istio-tracing",
      "kiali",
      "prometheus",
  ]
  ingress_related_stateful_sets = []

  knative_namespace = "knative-serving"
  knative_related_deployments = [
      "activator",
      "autoscaler",
      "controller",
  ]

  if platform == "gcp":
    deployment_names.extend(["cloud-endpoints-controller"])
    stateful_set_names.extend(["kfserving-controller-manager"])
    if use_basic_auth:
      deployment_names.extend(["basic-auth-login"])
      ingress_related_stateful_sets.extend(["backend-updater"])
    else:
      ingress_related_deployments.extend(["iap-enabler"])
      ingress_related_stateful_sets.extend(["backend-updater"])
  elif platform == "existing_arrikto":
    deployment_names.extend(["dex"])
    ingress_related_deployments.extend(["authservice"])
    knative_related_deployments = []

  # TODO(jlewi): Might want to parallelize this.
  for deployment_name in deployment_names:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, namespace, deployment_name, 10)

  ingress_namespace = "istio-system" if use_istio else namespace
  for deployment_name in ingress_related_deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, ingress_namespace, deployment_name,
                             10)

  all_stateful_sets = [(namespace, name) for name in stateful_set_names]
  all_stateful_sets.extend([(ingress_namespace, name)
                            for name in ingress_related_stateful_sets])

  for ss_namespace, name in all_stateful_sets:
    logging.info("Verifying that stateful set %s.%s started...", ss_namespace,
                 name)
    try:
      util.wait_for_statefulset(api_client, ss_namespace, name)
    except Exception:
      # Narrowed from a bare except so KeyboardInterrupt/SystemExit are not
      # intercepted just to run a debug describe.
      # Collect debug information by running describe.
      util.run(
          ["kubectl", "-n", ss_namespace, "describe", "statefulsets", name])
      raise

  # TODO(jlewi): We should verify that the ingress is created and healthy.

  for deployment_name in knative_related_deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, knative_namespace, deployment_name,
                             10)
def test_kf_is_ready(record_xml_attribute, namespace, use_basic_auth,
                     use_istio, app_path):
  """Test that Kubeflow was successfully deployed.

  Args:
    record_xml_attribute: Pytest fixture used to set the junit test name.
    namespace: The namespace Kubeflow is deployed to.
    use_basic_auth: Whether the deployment uses basic auth (else IAP).
    use_istio: If true, ingress-related workloads are expected in the
      istio-system namespace instead of the Kubeflow namespace.
    app_path: Path to the KF app directory, used to determine the platform.
  """
  set_logging()
  util.set_pytest_junit(record_xml_attribute, "test_kf_is_ready")

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run([
        "gcloud", "auth", "activate-service-account",
        "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
    ])

  api_client = deploy_utils.create_k8s_client()

  util.load_kube_config()

  # Verify that components are actually deployed.
  # TODO(jlewi): We need to parameterize this list based on whether
  # we are using IAP or basic auth.
  # TODO(yanniszark): This list is incomplete and missing a lot of components.
  deployment_names = [
      "workflow-controller",
  ]

  stateful_set_names = []

  platform, _ = get_platform_app_name(app_path)

  ingress_related_deployments = [
      "istio-egressgateway",
      "istio-ingressgateway",
      "istio-pilot",
      "istio-policy",
      "istio-sidecar-injector",
      "istio-telemetry",
      "istio-tracing",
      "prometheus",
  ]
  ingress_related_stateful_sets = []

  knative_namespace = "knative-serving"
  knative_related_deployments = [
      "activator",
      "autoscaler",
      "controller",
  ]

  if platform == "gcp":
    deployment_names.extend(["cloud-endpoints-controller"])
    stateful_set_names.extend(["kfserving-controller-manager"])
    if use_basic_auth:
      deployment_names.extend(["basic-auth-login"])
      ingress_related_stateful_sets.extend(["backend-updater"])
    else:
      ingress_related_deployments.extend(["iap-enabler"])
      ingress_related_stateful_sets.extend(["backend-updater"])
  elif platform == "existing_arrikto":
    deployment_names.extend(["dex"])
    ingress_related_deployments.extend(["authservice"])
    knative_related_deployments = []

  # TODO(jlewi): Might want to parallelize this.
  for deployment_name in deployment_names:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, namespace, deployment_name, 10)

  ingress_namespace = "istio-system" if use_istio else namespace
  for deployment_name in ingress_related_deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, ingress_namespace, deployment_name,
                             10)

  all_stateful_sets = [(namespace, name) for name in stateful_set_names]
  all_stateful_sets.extend([(ingress_namespace, name)
                            for name in ingress_related_stateful_sets])

  for ss_namespace, name in all_stateful_sets:
    logging.info("Verifying that stateful set %s.%s started...", ss_namespace,
                 name)
    try:
      util.wait_for_statefulset(api_client, ss_namespace, name)
    except Exception:
      # Narrowed from a bare except so KeyboardInterrupt/SystemExit are not
      # intercepted just to run a debug describe.
      # Collect debug information by running describe.
      util.run(
          ["kubectl", "-n", ss_namespace, "describe", "statefulsets", name])
      raise

  # TODO(jlewi): We should verify that the ingress is created and healthy.

  for deployment_name in knative_related_deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, knative_namespace, deployment_name,
                             10)
def run():
  """Create a ksonnet app, deploy kubeflow-core and wait for it to be ready.

  NOTE(review): this closure relies on `api_client`, `namespace_name` and
  `args` from an enclosing scope not visible here -- confirm it is defined
  inside the function that creates them.
  """
  # Create (or reuse) the test namespace.
  namespace = _setup_test(api_client, namespace_name)
  logging.info("Using namespace: %s", namespace)
  # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
  # see: https://github.com/ksonnet/ksonnet/issues/233
  os.environ["GITHUB_TOKEN"] = args.github_token

  # Initialize a ksonnet app.
  app_name = "kubeflow-test"
  util.run(["ks", "init", app_name,], cwd=args.test_dir)

  app_dir = os.path.join(args.test_dir, app_name)

  kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
  util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
           cwd=app_dir)

  # Install required packages
  packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

  for p in packages:
    util.run(["ks", "pkg", "install", p], cwd=app_dir)

  # Delete the vendor directory and replace with a symlink to the src
  # so that we use the code at the desired commit.
  target_dir = os.path.join(app_dir, "vendor", "kubeflow")

  logging.info("Deleting %s", target_dir)
  shutil.rmtree(target_dir)

  REPO_ORG = "kubeflow"
  REPO_NAME = "kubeflow"
  REGISTRY_PATH = "kubeflow"
  source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                        REGISTRY_PATH)
  logging.info("Creating link %s -> %s", target_dir, source)
  os.symlink(source, target_dir)

  # Deploy Kubeflow
  util.run(["ks", "generate", "core", "kubeflow-core",
            "--name=kubeflow-core",
            "--namespace=" + namespace.metadata.name], cwd=app_dir)

  # TODO(jlewi): For reasons I don't understand even though we ran
  # configure_kubectl above, if we don't rerun it we get rbac errors
  # when we do ks apply; I think because we aren't using the proper service
  # account. This might have something to do with the way ksonnet gets
  # its credentials; maybe we need to configure credentials after calling
  # ks init?
  if args.cluster:
    util.configure_kubectl(args.project, args.zone, args.cluster)

  apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",]

  util.run(apply_command, cwd=app_dir)

  # Verify that the TfJob operator is actually deployed.
  tf_job_deployment_name = "tf-job-operator"
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace.metadata.name,
                           tf_job_deployment_name)

  # Verify that JupyterHub is actually deployed.
  jupyter_name = "tf-hub"
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)
def test_kf_is_ready(record_xml_attribute, namespace, use_basic_auth, app_path,
                     cluster_name):
  """Test that Kubeflow was successfully deployed.

  Args:
    record_xml_attribute: Pytest fixture used to set the junit test name.
    namespace: The namespace Kubeflow is deployed to.
    use_basic_auth: Whether the deployment uses basic auth (else IAP).
    app_path: Path to the KF app directory, used to determine the platform.
    cluster_name: Name of the EKS cluster used to load kubeconfig.
  """
  set_logging()
  util.set_pytest_junit(record_xml_attribute, "test_kf_is_ready")

  kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

  api_client = deploy_utils.create_k8s_client()

  # Verify that components are actually deployed.
  deployment_names = []

  stateful_set_names = []

  platform, _ = get_platform_app_name(app_path)

  # TODO(PatrickXYS): not sure why istio-galley can't found
  ingress_related_deployments = [
      "cluster-local-gateway",
      "istio-citadel",
      "istio-ingressgateway",
      "istio-pilot",
      "istio-policy",
      "istio-sidecar-injector",
      "istio-telemetry",
      "prometheus",
  ]
  ingress_related_stateful_sets = []

  knative_namespace = "knative-serving"
  knative_related_deployments = [
      "activator",
      "autoscaler",
      "autoscaler-hpa",
      "controller",
      "networking-istio",
      "webhook",
  ]

  if platform == "gcp":
    deployment_names.extend(["cloud-endpoints-controller"])
    stateful_set_names.extend(["kfserving-controller-manager"])
    if use_basic_auth:
      deployment_names.extend(["basic-auth-login"])
      ingress_related_stateful_sets.extend(["backend-updater"])
    else:
      ingress_related_deployments.extend(["iap-enabler"])
      ingress_related_stateful_sets.extend(["backend-updater"])
  elif platform == "existing_arrikto":
    deployment_names.extend(["dex"])
    ingress_related_deployments.extend(["authservice"])
    knative_related_deployments = []
  elif platform == "aws":
    # TODO(PatrickXYS): Extend List with AWS Deployment
    deployment_names.extend(["alb-ingress-controller"])

  # TODO(jlewi): Might want to parallelize this.
  for deployment_name in deployment_names:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, namespace, deployment_name, 10)

  ingress_namespace = "istio-system"
  for deployment_name in ingress_related_deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, ingress_namespace, deployment_name,
                             10)

  all_stateful_sets = [(namespace, name) for name in stateful_set_names]
  all_stateful_sets.extend([(ingress_namespace, name)
                            for name in ingress_related_stateful_sets])

  for ss_namespace, name in all_stateful_sets:
    logging.info("Verifying that stateful set %s.%s started...", ss_namespace,
                 name)
    try:
      util.wait_for_statefulset(api_client, ss_namespace, name)
    except Exception:
      # Narrowed from a bare except so KeyboardInterrupt/SystemExit are not
      # intercepted just to run a debug describe.
      # Collect debug information by running describe.
      util.run(
          ["kubectl", "-n", ss_namespace, "describe", "statefulsets", name])
      raise

  ingress_names = ["istio-ingress"]
  # Check if Ingress is Ready and Healthy
  if platform in ["aws"]:
    for ingress_name in ingress_names:
      logging.info("Verifying that ingress %s started...", ingress_name)
      util.wait_for_ingress(api_client, ingress_namespace, ingress_name, 10)

  for deployment_name in knative_related_deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, knative_namespace, deployment_name,
                             10)

  # Check if Dex is Ready and Healthy
  dex_deployment_names = ["dex"]
  dex_namespace = "auth"
  for dex_deployment_name in dex_deployment_names:
    logging.info("Verifying that deployment %s started...",
                 dex_deployment_name)
    util.wait_for_deployment(api_client, dex_namespace, dex_deployment_name,
                             10)

  # Check if Cert-Manager is Ready and Healthy
  cert_manager_deployment_names = [
      "cert-manager",
      "cert-manager-cainjector",
      "cert-manager-webhook",
  ]
  cert_manager_namespace = "cert-manager"
  for cert_manager_deployment_name in cert_manager_deployment_names:
    logging.info("Verifying that deployment %s started...",
                 cert_manager_deployment_name)
    util.wait_for_deployment(api_client, cert_manager_namespace,
                             cert_manager_deployment_name, 10)
def run():
  """Create a ksonnet app, deploy kubeflow-core and wait for it to be ready.

  NOTE(review): this closure relies on `api_client`, `namespace_name` and
  `args` from an enclosing scope not visible here -- confirm it is defined
  inside the function that creates them.
  """
  # Create (or reuse) the test namespace.
  namespace = _setup_test(api_client, namespace_name)
  logging.info("Using namespace: %s", namespace)
  # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
  # see: https://github.com/ksonnet/ksonnet/issues/233
  os.environ["GITHUB_TOKEN"] = args.github_token

  # Initialize a ksonnet app.
  app_name = "kubeflow-test"
  util.run(["ks", "init", app_name,], cwd=args.test_dir)

  app_dir = os.path.join(args.test_dir, app_name)

  kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
  util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
           cwd=app_dir)

  # Install required packages
  packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

  for p in packages:
    util.run(["ks", "pkg", "install", p], cwd=app_dir)

  # Delete the vendor directory and replace with a symlink to the src
  # so that we use the code at the desired commit.
  target_dir = os.path.join(app_dir, "vendor", "kubeflow")

  logging.info("Deleting %s", target_dir)
  shutil.rmtree(target_dir)

  REPO_ORG = "kubeflow"
  REPO_NAME = "kubeflow"
  REGISTRY_PATH = "kubeflow"
  source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                        REGISTRY_PATH)
  logging.info("Creating link %s -> %s", target_dir, source)
  os.symlink(source, target_dir)

  # Deploy Kubeflow
  util.run(["ks", "generate", "core", "kubeflow-core",
            "--name=kubeflow-core",
            "--namespace=" + namespace.metadata.name], cwd=app_dir)

  # TODO(jlewi): For reasons I don't understand even though we ran
  # configure_kubectl above, if we don't rerun it we get rbac errors
  # when we do ks apply; I think because we aren't using the proper service
  # account. This might have something to do with the way ksonnet gets
  # its credentials; maybe we need to configure credentials after calling
  # ks init?
  if args.cluster:
    util.configure_kubectl(args.project, args.zone, args.cluster)

  apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",]

  util.run(apply_command, cwd=app_dir)

  # Verify that the TfJob operator is actually deployed.
  tf_job_deployment_name = "tf-job-operator"
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace.metadata.name,
                           tf_job_deployment_name)

  # Verify that JupyterHub is actually deployed.
  jupyter_name = "tf-hub"
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)