Example #1
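All of the snippets below come from the kubeflow/kfctl testing suite and omit their module-level imports. A sketch of what they assume follows; the standard-library, kubernetes, and retrying imports are straightforward, while the exact paths of the project-local helpers (util, kfctl_aws_util, deploy_utils) are assumptions based on that repository's layout. Other undefined names (get_config_spec, get_platform_app_name, verify_profile_creation, and so on) are helpers defined elsewhere in the same modules.

import logging
import os
import subprocess
import uuid
from time import sleep

import yaml
from kubernetes import client as k8s_client
from retrying import retry

# Project-local helpers; these module paths are assumptions.
from kubeflow.testing import util
from kubeflow.kfctl.testing.util import aws_util as kfctl_aws_util
from kubeflow.kfctl.testing.util import deploy_utils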
def test_kfctl_delete(record_xml_attribute, kfctl_path, app_path,
                      cluster_name):
    util.set_pytest_junit(record_xml_attribute, "test_kfctl_delete")

    # TODO(PatrickXYS): do we need to load kubeconfig again?

    if not kfctl_path:
        raise ValueError("kfctl_path is required")

    if not app_path:
        raise ValueError("app_path is required")

    logging.info("Using kfctl path %s", kfctl_path)
    logging.info("Using app path %s", app_path)

    kfdef_path = os.path.join(app_path, "tmp.yaml")
    logging.info("Using kfdef file path %s", kfdef_path)

    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

    # Deletes sometimes time out while removing cert-manager and
    # knative-serving, so we retry the whole operation. The downside is
    # that retrying can mask errors that a second attempt papers over.
    @retry(stop_max_delay=60 * 3 * 1000)
    def run_delete():
        util.run([kfctl_path, "delete", "-V", "-f", kfdef_path], cwd=app_path)

    run_delete()
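The retry decorator here presumably comes from the retrying package, whose stop_max_delay is measured in milliseconds, so the wrapped delete keeps being re-invoked for up to three minutes before the last exception propagates. A minimal self-contained illustration of the same pattern (flaky_operation is purely hypothetical):

import random

from retrying import retry


@retry(stop_max_delay=3 * 60 * 1000)  # milliseconds: retry for up to 3 minutes
def flaky_operation():
    # Any exception raised here triggers another attempt until the time
    # budget is exhausted; after that, the last exception propagates.
    if random.random() < 0.5:
        raise RuntimeError("transient failure")
    return "ok"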
Example #2
def test_deploy_pytorchjob(kfctl_repo_path, namespace, cluster_name):
    """Deploy PytorchJob."""
    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)
    logging.info("using kfctl repo: %s" % kfctl_repo_path)
    util.run([
        "kubectl", "apply", "-f",
        os.path.join(
            kfctl_repo_path,
            "py/kubeflow/kfctl/testing/pytests/testdata/pytorch_job.yaml")
    ])
    api_client = k8s_client.ApiClient()
    api = k8s_client.CoreV1Api(api_client)

    # If the call raises an exception, let it propagate and fail the test.
    resp = api.list_namespaced_pod(namespace)
    names = {
        "pytorch-mnist-ddp-cpu-master-0": False,
        "pytorch-mnist-ddp-cpu-worker-0": False,
    }

    for pod in resp.items:
        name = pod.metadata.name
        if name in names:
            names[name] = True

    msg = []
    for n in names:
        if not names[n]:
            msg.append("pod %s is not found" % n)
    if msg:
        raise ValueError("; ".join(msg))
Example #3
def kfctl_deploy_kubeflow(app_path, config_path, kfctl_path, build_and_apply,
                          cluster_name):
    """Deploy kubeflow.

  Args:
  app_path: The path to the Kubeflow app.
  config_path: Path to the KFDef spec file.
  kfctl_path: Path to the kfctl go binary
  build_and_apply: whether to build and apply or apply
  cluster_name: Name of EKS cluster
  Returns:
  app_path: Path where Kubeflow is installed
  """
    # build_and_apply is a boolean used for testing both the new semantics
    # test case 1: build_and_apply
    # kfctl build -f <config file>
    # kfctl apply
    # test case 2: apply
    # kfctl apply -f <config file>

    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

    if not os.path.exists(kfctl_path):
        msg = "kfctl Go binary not found: {path}".format(path=kfctl_path)
        logging.error(msg)
        raise RuntimeError(msg)

    app_path, parent_dir = get_or_create_app_path_and_parent_dir(app_path)

    logging.info("app path %s", app_path)
    logging.info("kfctl path %s", kfctl_path)

    config_spec = get_config_spec(config_path, app_path, cluster_name)
    with open(os.path.join(app_path, "tmp.yaml"), "w") as f:
        yaml.dump(config_spec, f)

    logging.info("running kfctl with build_and_apply=%s", build_and_apply)

    logging.info("switching working directory to: %s", app_path)
    os.chdir(app_path)

    # push newly built kfctl to S3
    push_kfctl_to_s3(kfctl_path)

    # Workaround for the following error during apply:
    # msg="Encountered error applying application bootstrap:  (kubeflow.error): Code 500 with message: Apply.Run
    # : error when creating \"/tmp/kout927048001\": namespaces \"kubeflow-test-infra\" not found"
    # filename="kustomize/kustomize.go:266"
    # TODO(PatrickXYS): fix the issue permanently rather than work around it
    util.run(["kubectl", "create", "namespace", "kubeflow-test-infra"])

    # Do not run with retries since it masks errors
    logging.info("Running kfctl with config:\n%s", yaml.safe_dump(config_spec))
    if build_and_apply:
        build_and_apply_kubeflow(kfctl_path, app_path)
    else:
        apply_kubeflow(kfctl_path, app_path)
    return app_path
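apply_kubeflow and build_and_apply_kubeflow are helpers defined elsewhere in this module. Based on the comment at the top of the function, they presumably wrap the two invocation styles roughly like this (a sketch, not the repository's actual implementation):

def apply_kubeflow(kfctl_path, app_path):
    # Single step: kfctl apply -f <config file>
    config = os.path.join(app_path, "tmp.yaml")
    util.run([kfctl_path, "apply", "-V", "-f", config], cwd=app_path)


def build_and_apply_kubeflow(kfctl_path, app_path):
    # Two steps: kfctl build -f <config file>, then kfctl apply
    config = os.path.join(app_path, "tmp.yaml")
    util.run([kfctl_path, "build", "-V", "-f", config], cwd=app_path)
    util.run([kfctl_path, "apply", "-V", "-f", config], cwd=app_path)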
Example #4
def test_kfctl_delete_wrong_cluster(record_xml_attribute, kfctl_path, app_path,
                                    cluster_name):
    util.set_pytest_junit(record_xml_attribute,
                          "test_kfctl_delete_wrong_cluster")
    if not kfctl_path:
        raise ValueError("kfctl_path is required")

    if not app_path:
        raise ValueError("app_path is required")

    logging.info("Using kfctl path %s", kfctl_path)
    logging.info("Using app path %s", app_path)

    kfdef_path = os.path.join(app_path, "tmp.yaml")
    kfdef = {}
    with open(kfdef_path, "r") as f:
        kfdef = yaml.safe_load(f)

    # Save the original cluster name so it can be restored after the test
    # (strings are immutable in Python, so no explicit copy is needed).
    cluster = kfdef.get("metadata", {}).get("clusterName", "")
    if not cluster:
        raise ValueError("cluster is not written to kfdef")

    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

    @retry(stop_max_delay=60 * 3 * 1000)
    def run_delete():
        try:
            # Put an obvious wrong cluster into KfDef
            kfdef["metadata"]["clusterName"] = "dummy"
            with open(kfdef_path, "w") as f:
                yaml.dump(kfdef, f)
            util.run([kfctl_path, "delete", "-V", "-f", kfdef_path],
                     cwd=app_path)
        except subprocess.CalledProcessError as e:
            if "cluster name doesn't match" in e.output:
                return
            # Re-raise if the failure is not the expected mismatch.
            raise
        finally:
            # Restore the original cluster name.
            kfdef["metadata"]["clusterName"] = cluster
            with open(kfdef_path, "w") as f:
                yaml.dump(kfdef, f)

    run_delete()
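Note the shape of the guard being tested: kfctl is expected to refuse the delete when the clusterName in the KfDef does not match the cluster the kubeconfig points at, surfacing a "cluster name doesn't match" error. The finally block rewrites the original name after every attempt, so the KfDef on disk is left intact even when an attempt fails for an unrelated reason.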
Example #5
def check_deployments_ready(record_xml_attribute, namespace, name, deployments,
                            cluster_name):
    """Test that Kubeflow deployments are successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """
    set_logging()
    util.set_pytest_junit(record_xml_attribute, name)

    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

    api_client = deploy_utils.create_k8s_client()

    for deployment_name in deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name, 10)
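util.wait_for_deployment is a project-local helper; the trailing 10 is presumably a retry budget. A minimal stand-in built on the official kubernetes client, under the assumption that "ready" means all requested replicas are available:

import time

from kubernetes import client as k8s_client


def wait_for_deployment_ready(api_client, namespace, name, timeout_minutes=10):
    apps = k8s_client.AppsV1Api(api_client)
    deadline = time.time() + timeout_minutes * 60
    while time.time() < deadline:
        dep = apps.read_namespaced_deployment(name, namespace)
        if (dep.status.available_replicas or 0) >= (dep.spec.replicas or 1):
            return dep
        time.sleep(15)
    raise TimeoutError("deployment %s/%s never became ready" % (namespace, name))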
Example #6
def test_kfam(record_xml_attribute, cluster_name):
    util.set_pytest_junit(record_xml_attribute, "test_kfam_e2e")
    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

    getcmd = "kubectl get pods -n kubeflow -l=app=jupyter-web-app --template '{{range.items}}{{.metadata.name}}{{end}}'"
    jupyterpod = util.run(getcmd.split(' '))[1:-1]

    logging.info("accessing kfam svc from jupyter pod %s" % jupyterpod)

    sleep(10)
    # Profile Creation
    profile_name = "testprofile-%s" % uuid.uuid4().hex[0:7]
    util.run([
        'kubectl', 'exec', jupyterpod, '-n', 'kubeflow', '--', 'curl',
        '--silent', '-X', 'POST', '-d',
        '{"metadata":{"name":"%s"},"spec":{"owner":{"kind":"User","name":"*****@*****.**"}}}'
        % profile_name, 'profiles-kfam.kubeflow:8081/kfam/v1/profiles'
    ])

    assert verify_profile_creation(jupyterpod, profile_name)
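verify_profile_creation is not shown in this listing; one plausible implementation polls the kfam service from the same pod until the new profile shows up (the /kfam/v1/bindings path and the polling parameters are assumptions):

def verify_profile_creation(jupyterpod, profile_name, attempts=10):
    """Return True once the kfam service reports the new profile."""
    for _ in range(attempts):
        # Query kfam's bindings endpoint from inside the cluster.
        output = util.run([
            'kubectl', 'exec', jupyterpod, '-n', 'kubeflow', '--', 'curl',
            '--silent', 'profiles-kfam.kubeflow:8081/kfam/v1/bindings'
        ])
        if profile_name in output:
            return True
        sleep(10)
    return False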
Example #7
def test_jupyter(record_xml_attribute, kfctl_repo_path, namespace,
                 cluster_name):
    """Test the jupyter notebook.
  Args:
    record_xml_attribute: Test fixture provided by pytest.
    kfctl_repo_path: path to local kfctl repository.
    namespace: namespace to run in.
  """
    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)
    logging.info("using kfctl repo: %s" % kfctl_repo_path)
    util.run([
        "kubectl", "apply", "-f",
        os.path.join(
            kfctl_repo_path,
            "py/kubeflow/kfctl/testing/pytests/testdata/jupyter_test.yaml")
    ])
    api_client = k8s_client.ApiClient()
    api = k8s_client.CoreV1Api(api_client)

    resp = api.list_namespaced_service(namespace)
    names = [service.metadata.name for service in resp.items]
    if not "jupyter-test" in names:
        raise ValueError("not able to find jupyter-test service.")
Example #8
def test_kf_is_ready(record_xml_attribute, namespace, use_basic_auth, app_path,
                     cluster_name):
    """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """
    set_logging()
    util.set_pytest_junit(record_xml_attribute, "test_kf_is_ready")

    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

    api_client = deploy_utils.create_k8s_client()

    # Verify that components are actually deployed.
    deployment_names = []

    stateful_set_names = []

    platform, _ = get_platform_app_name(app_path)

    # TODO(PatrickXYS): not sure why istio-galley can't be found
    ingress_related_deployments = [
        "cluster-local-gateway",
        "istio-citadel",
        "istio-ingressgateway",
        "istio-pilot",
        "istio-policy",
        "istio-sidecar-injector",
        "istio-telemetry",
        "prometheus",
    ]
    ingress_related_stateful_sets = []

    knative_namespace = "knative-serving"
    knative_related_deployments = [
        "activator",
        "autoscaler",
        "autoscaler-hpa",
        "controller",
        "networking-istio",
        "webhook",
    ]

    if platform == "gcp":
        deployment_names.extend(["cloud-endpoints-controller"])
        stateful_set_names.extend(["kfserving-controller-manager"])
        if use_basic_auth:
            deployment_names.extend(["basic-auth-login"])
            ingress_related_stateful_sets.extend(["backend-updater"])
        else:
            ingress_related_deployments.extend(["iap-enabler"])
            ingress_related_stateful_sets.extend(["backend-updater"])
    elif platform == "existing_arrikto":
        deployment_names.extend(["dex"])
        ingress_related_deployments.extend(["authservice"])
        knative_related_deployments = []
    elif platform == "aws":
        # TODO(PatrickXYS): Extend List with AWS Deployment
        deployment_names.extend(["alb-ingress-controller"])

    # TODO(jlewi): Might want to parallelize this.
    for deployment_name in deployment_names:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name, 10)

    ingress_namespace = "istio-system"
    for deployment_name in ingress_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, ingress_namespace,
                                 deployment_name, 10)

    all_stateful_sets = [(namespace, name) for name in stateful_set_names]
    all_stateful_sets.extend([(ingress_namespace, name)
                              for name in ingress_related_stateful_sets])

    for ss_namespace, name in all_stateful_sets:
        logging.info("Verifying that stateful set %s.%s started...",
                     ss_namespace, name)
        try:
            util.wait_for_statefulset(api_client, ss_namespace, name)
        except Exception:
            # Collect debug information by running describe before re-raising.
            util.run([
                "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
            ])
            raise

    ingress_names = ["istio-ingress"]
    # Check if Ingress is Ready and Healthy
    if platform in ["aws"]:
        for ingress_name in ingress_names:
            logging.info("Verifying that ingress %s started...", ingress_name)
            util.wait_for_ingress(api_client, ingress_namespace, ingress_name,
                                  10)

    for deployment_name in knative_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, knative_namespace,
                                 deployment_name, 10)

    # Check if Dex is Ready and Healthy
    dex_deployment_names = ["dex"]
    dex_namespace = "auth"
    for dex_deployment_name in dex_deployment_names:
        logging.info("Verifying that deployment %s started...",
                     dex_deployment_name)
        util.wait_for_deployment(api_client, dex_namespace,
                                 dex_deployment_name, 10)

    # Check if Cert-Manager is Ready and Healthy
    cert_manager_deployment_names = [
        "cert-manager",
        "cert-manager-cainjector",
        "cert-manager-webhook",
    ]
    cert_manager_namespace = "cert-manager"
    for cert_manager_deployment_name in cert_manager_deployment_names:
        logging.info("Verifying that deployment %s started...",
                     cert_manager_deployment_name)
        util.wait_for_deployment(api_client, cert_manager_namespace,
                                 cert_manager_deployment_name, 10)
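util.wait_for_ingress is likewise project-local; on AWS it presumably blocks until the ALB hostname shows up in the ingress status. A sketch using the official client's NetworkingV1Api (older clusters exposed ingress via extensions/v1beta1, so the API group here is an assumption):

import time

from kubernetes import client as k8s_client


def wait_for_alb_hostname(api_client, namespace, name, timeout_minutes=10):
    networking = k8s_client.NetworkingV1Api(api_client)
    deadline = time.time() + timeout_minutes * 60
    while time.time() < deadline:
        ingress = networking.read_namespaced_ingress(name, namespace)
        lb_entries = ingress.status.load_balancer.ingress or []
        # The ALB is provisioned once a hostname appears in the status.
        if lb_entries and lb_entries[0].hostname:
            return lb_entries[0].hostname
        time.sleep(15)
    raise TimeoutError("ingress %s/%s has no load balancer hostname" %
                       (namespace, name))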