def test_kfctl_delete(record_xml_attribute, kfctl_path, app_path, cluster_name):
    """Delete the deployed Kubeflow app with `kfctl delete`, retrying on timeout.

    Args:
      record_xml_attribute: pytest fixture for JUnit XML attributes.
      kfctl_path: Path to the kfctl binary.
      app_path: Path to the Kubeflow app directory containing tmp.yaml.
      cluster_name: Name of the EKS cluster.
    """
    util.set_pytest_junit(record_xml_attribute, "test_kfctl_delete")

    # TODO(PatrickXYS): do we need to load kubeconfig again?
    if not kfctl_path:
        raise ValueError("kfctl_path is required")
    if not app_path:
        raise ValueError("app_path is required")

    logging.info("Using kfctl path %s", kfctl_path)
    logging.info("Using app path %s", app_path)

    kfdef_path = os.path.join(app_path, "tmp.yaml")
    logging.info("Using kfdef file path %s", kfdef_path)

    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

    # We see failures because delete operation will delete cert-manager and
    # knative-serving, and encounter timeout. To deal with this we do retries.
    # This has a potential downside of hiding errors that are fixed by retrying.
    @retry(stop_max_delay=60 * 3 * 1000)
    def run_delete():
        util.run([kfctl_path, "delete", "-V", "-f", kfdef_path], cwd=app_path)

    run_delete()
def test_deploy_pytorchjob(kfctl_repo_path, namespace, cluster_name):
    """Deploy PytorchJob."""
    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)
    logging.info("using kfctl repo: %s" % kfctl_repo_path)

    # Apply the test PyTorchJob manifest from the kfctl repo checkout.
    util.run([
        "kubectl", "apply", "-f",
        os.path.join(
            kfctl_repo_path,
            "py/kubeflow/kfctl/testing/pytests/testdata/pytorch_job.yaml")
    ])

    api_client = k8s_client.ApiClient()
    api = k8s_client.CoreV1Api(api_client)

    # If the call throws exception, let it emit as an error case.
    resp = api.list_namespaced_pod(namespace)

    # The master and worker pods the job is expected to create.
    expected = [
        "pytorch-mnist-ddp-cpu-master-0",
        "pytorch-mnist-ddp-cpu-worker-0",
    ]
    present = {pod.metadata.name for pod in resp.items}

    msg = ["pod %s is not found" % n for n in expected if n not in present]
    if msg:
        raise ValueError("; ".join(msg))
def kfctl_deploy_kubeflow(app_path, config_path, kfctl_path, build_and_apply,
                          cluster_name):
    """Deploy kubeflow.

    Args:
      app_path: The path to the Kubeflow app.
      config_path: Path to the KFDef spec file.
      kfctl_path: Path to the kfctl go binary
      build_and_apply: whether to build and apply or apply
      cluster_name: Name of EKS cluster
    Returns:
      app_path: Path where Kubeflow is installed
    """
    # build_and_apply is a boolean used for testing both the new semantics
    # test case 1: build_and_apply
    # kfctl build -f <config file>
    # kfctl apply
    # test case 2: apply
    # kfctl apply -f <config file>
    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

    if not os.path.exists(kfctl_path):
        msg = "kfctl Go binary not found: {path}".format(path=kfctl_path)
        logging.error(msg)
        raise RuntimeError(msg)

    app_path, parent_dir = get_or_create_app_path_and_parent_dir(app_path)
    logging.info("app path %s", app_path)
    logging.info("kfctl path %s", kfctl_path)

    # Write the resolved KFDef spec next to the app so delete tests can reuse it.
    config_spec = get_config_spec(config_path, app_path, cluster_name)
    with open(os.path.join(app_path, "tmp.yaml"), "w") as f:
        yaml.dump(config_spec, f)

    # build_and_apply
    logging.info("running kfctl with build and apply: %s \n", build_and_apply)
    logging.info("switching working directory to: %s \n", app_path)
    os.chdir(app_path)

    # push newly built kfctl to S3
    push_kfctl_to_s3(kfctl_path)

    # Workaround to fix issue
    # msg="Encountered error applying application bootstrap: (kubeflow.error): Code 500 with message: Apply.Run
    # : error when creating \"/tmp/kout927048001\": namespaces \"kubeflow-test-infra\" not found"
    # filename="kustomize/kustomize.go:266"
    # TODO(PatrickXYS): fix the issue permanentely rather than work-around
    util.run(["kubectl", "create", "namespace", "kubeflow-test-infra"])

    # Do not run with retries since it masks errors
    logging.info("Running kfctl with config:\n%s", yaml.safe_dump(config_spec))
    if build_and_apply:
        build_and_apply_kubeflow(kfctl_path, app_path)
    else:
        apply_kubeflow(kfctl_path, app_path)
    return app_path
def test_kfctl_delete_wrong_cluster(record_xml_attribute, kfctl_path, app_path,
                                    cluster_name):
    """Verify `kfctl delete` refuses to run against a mismatched cluster name.

    Temporarily rewrites the KfDef's clusterName to a dummy value, runs delete,
    and expects kfctl to reject it; the original clusterName is always restored.
    """
    util.set_pytest_junit(record_xml_attribute,
                          "test_kfctl_delete_wrong_cluster")
    if not kfctl_path:
        raise ValueError("kfctl_path is required")
    if not app_path:
        raise ValueError("app_path is required")

    logging.info("Using kfctl path %s", kfctl_path)
    logging.info("Using app path %s", app_path)

    kfdef_path = os.path.join(app_path, "tmp.yaml")
    with open(kfdef_path, "r") as f:
        kfdef = yaml.safe_load(f)

    # Remember the real cluster name so we can restore it afterwards.
    cluster = kfdef.get("metadata", {}).get("clusterName", "")
    if not cluster:
        raise ValueError("cluster is not written to kfdef")

    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

    @retry(stop_max_delay=60 * 3 * 1000)
    def run_delete():
        try:
            # Put an obvious wrong cluster into KfDef
            kfdef["metadata"]["clusterName"] = "dummy"
            with open(kfdef_path, "w") as f:
                yaml.dump(kfdef, f)
            util.run([kfctl_path, "delete", "-V", "-f", kfdef_path],
                     cwd=app_path)
        except subprocess.CalledProcessError as e:
            if "cluster name doesn't match" in e.output:
                # This is the rejection we were hoping for.
                return
            # Re-throw error if it's not expected.
            raise
        finally:
            # Restore the correct host info.
            kfdef["metadata"]["clusterName"] = cluster
            with open(kfdef_path, "w") as f:
                yaml.dump(kfdef, f)

    run_delete()
def check_deployments_ready(record_xml_attribute, namespace, name, deployments,
                            cluster_name):
    """Test that Kubeflow deployments are successfully deployed.

    Args:
      namespace: The namespace Kubeflow is deployed to.
    """
    set_logging()
    util.set_pytest_junit(record_xml_attribute, name)

    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)
    api_client = deploy_utils.create_k8s_client()

    # Block until each deployment reports ready (or wait helper times out).
    for deployment_name in deployments:
        logging.info("Verifying that deployment %s started...", deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name, 10)
def test_kfam(record_xml_attribute, cluster_name):
    """Exercise the KFAM profile API from inside the jupyter-web-app pod."""
    util.set_pytest_junit(record_xml_attribute, "test_kfam_e2e")
    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

    # Find the jupyter-web-app pod; strip the surrounding quotes from output.
    getcmd = "kubectl get pods -n kubeflow -l=app=jupyter-web-app --template '{{range.items}}{{.metadata.name}}{{end}}'"
    jupyterpod = util.run(getcmd.split(' '))[1:-1]

    logging.info("accessing kfam svc from jupyter pod %s" % jupyterpod)
    sleep(10)

    # Profile Creation
    profile_name = "testprofile-%s" % uuid.uuid4().hex[0:7]
    util.run([
        'kubectl', 'exec', jupyterpod, '-n', 'kubeflow', '--', 'curl',
        '--silent', '-X', 'POST', '-d',
        '{"metadata":{"name":"%s"},"spec":{"owner":{"kind":"User","name":"*****@*****.**"}}}'
        % profile_name, 'profiles-kfam.kubeflow:8081/kfam/v1/profiles'
    ])

    assert verify_profile_creation(jupyterpod, profile_name)
def test_jupyter(record_xml_attribute, kfctl_repo_path, namespace, cluster_name):
    """Test the jupyter notebook.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      kfctl_repo_path: path to local kfctl repository.
      namespace: namespace to run in.
    """
    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)
    logging.info("using kfctl repo: %s" % kfctl_repo_path)

    # Apply the notebook test manifest from the kfctl repo checkout.
    util.run([
        "kubectl", "apply", "-f",
        os.path.join(
            kfctl_repo_path,
            "py/kubeflow/kfctl/testing/pytests/testdata/jupyter_test.yaml")
    ])

    api_client = k8s_client.ApiClient()
    api = k8s_client.CoreV1Api(api_client)

    resp = api.list_namespaced_service(namespace)
    names = [service.metadata.name for service in resp.items]
    # Fixed idiom: `x not in names` rather than `not x in names` (PEP 8 E713).
    if "jupyter-test" not in names:
        raise ValueError("not able to find jupyter-test service.")
def test_kf_is_ready(record_xml_attribute, namespace, use_basic_auth, app_path,
                     cluster_name):
    """Test that Kubeflow was successfully deployed.

    Waits for the platform-specific set of deployments, stateful sets,
    ingresses and supporting services (knative, dex, cert-manager) to
    become ready.

    Args:
      namespace: The namespace Kubeflow is deployed to.
    """
    set_logging()
    util.set_pytest_junit(record_xml_attribute, "test_kf_is_ready")

    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

    api_client = deploy_utils.create_k8s_client()

    # Verify that components are actually deployed.
    deployment_names = []
    stateful_set_names = []

    platform, _ = get_platform_app_name(app_path)

    # TODO(PatrickXYS): not sure why istio-galley can't found
    ingress_related_deployments = [
        "cluster-local-gateway",
        "istio-citadel",
        "istio-ingressgateway",
        "istio-pilot",
        "istio-policy",
        "istio-sidecar-injector",
        "istio-telemetry",
        "prometheus",
    ]
    ingress_related_stateful_sets = []

    knative_namespace = "knative-serving"
    knative_related_deployments = [
        "activator",
        "autoscaler",
        "autoscaler-hpa",
        "controller",
        "networking-istio",
        "webhook",
    ]

    if platform == "gcp":
        deployment_names.extend(["cloud-endpoints-controller"])
        stateful_set_names.extend(["kfserving-controller-manager"])
        if use_basic_auth:
            deployment_names.extend(["basic-auth-login"])
            ingress_related_stateful_sets.extend(["backend-updater"])
        else:
            ingress_related_deployments.extend(["iap-enabler"])
            ingress_related_stateful_sets.extend(["backend-updater"])
    elif platform == "existing_arrikto":
        deployment_names.extend(["dex"])
        ingress_related_deployments.extend(["authservice"])
        knative_related_deployments = []
    elif platform == "aws":
        # TODO(PatrickXYS): Extend List with AWS Deployment
        deployment_names.extend(["alb-ingress-controller"])

    # TODO(jlewi): Might want to parallelize this.
    for deployment_name in deployment_names:
        logging.info("Verifying that deployment %s started...", deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name, 10)

    ingress_namespace = "istio-system"
    for deployment_name in ingress_related_deployments:
        logging.info("Verifying that deployment %s started...", deployment_name)
        util.wait_for_deployment(api_client, ingress_namespace, deployment_name,
                                 10)

    all_stateful_sets = [(namespace, name) for name in stateful_set_names]
    all_stateful_sets.extend([(ingress_namespace, name)
                              for name in ingress_related_stateful_sets])

    for ss_namespace, name in all_stateful_sets:
        logging.info("Verifying that stateful set %s.%s started...",
                     ss_namespace, name)
        try:
            util.wait_for_statefulset(api_client, ss_namespace, name)
        # Was a bare `except:` (PEP 8 E722); narrowed so BaseExceptions like
        # KeyboardInterrupt are not intercepted just to run the describe.
        except Exception:
            # Collect debug information by running describe, then re-raise.
            util.run([
                "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
            ])
            raise

    ingress_names = ["istio-ingress"]
    # Check if Ingress is Ready and Healthy
    if platform in ["aws"]:
        for ingress_name in ingress_names:
            logging.info("Verifying that ingress %s started...", ingress_name)
            util.wait_for_ingress(api_client, ingress_namespace, ingress_name,
                                  10)

    for deployment_name in knative_related_deployments:
        logging.info("Verifying that deployment %s started...", deployment_name)
        util.wait_for_deployment(api_client, knative_namespace, deployment_name,
                                 10)

    # Check if Dex is Ready and Healthy
    dex_deployment_names = ["dex"]
    dex_namespace = "auth"
    for dex_deployment_name in dex_deployment_names:
        logging.info("Verifying that deployment %s started...",
                     dex_deployment_name)
        util.wait_for_deployment(api_client, dex_namespace, dex_deployment_name,
                                 10)

    # Check if Cert-Manager is Ready and Healthy
    cert_manager_deployment_names = [
        "cert-manager",
        "cert-manager-cainjector",
        "cert-manager-webhook",
    ]
    cert_manager_namespace = "cert-manager"
    for cert_manager_deployment_name in cert_manager_deployment_names:
        logging.info("Verifying that deployment %s started...",
                     cert_manager_deployment_name)
        util.wait_for_deployment(api_client, cert_manager_namespace,
                                 cert_manager_deployment_name, 10)