예제 #1
0
def check_statefulsets_ready(record_xml_attribute, namespace, name, stateful_sets):
  """Wait for every listed stateful set to become ready.

  Args:
    record_xml_attribute: Test fixture provided by pytest.
    namespace: The namespace to check.
    name: Name handed to util.set_pytest_junit for the junit report.
    stateful_sets: Iterable of stateful set names to wait for.

  Raises:
    Exception: If waiting for a stateful set fails. The original error is
      chained as the cause.
  """
  set_logging()
  # TODO(jlewi): Should we do this in the calling function)?
  util.set_pytest_junit(record_xml_attribute, name)

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run(["gcloud", "auth", "activate-service-account",
              "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]])

  api_client = deploy_utils.create_k8s_client()

  util.load_kube_config()

  for set_name in stateful_sets:
    logging.info("Verifying that stateful set %s.%s started...", namespace,
                 set_name)
    try:
      util.wait_for_statefulset(api_client, namespace, set_name)
    except Exception as e:
      # Collect debug information by running describe
      util.run(["kubectl", "-n", namespace, "describe", "statefulsets",
                set_name])
      # Name the stateful set that actually failed, not the junit test name.
      raise Exception(
          f"Stateful set {namespace}.{set_name} is not ready") from e
예제 #2
0
def test_build_kfctl_go(record_xml_attribute, app_path, project,
                        use_basic_auth, use_istio, config_path,
                        build_and_apply, kfctl_repo_path, cluster_name,
                        values):
    """Test building and deploying Kubeflow.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      app_path: The path to the Kubeflow app.
      project: The GCP project to use (not read by this function).
      use_basic_auth: Whether to use basic_auth (not read by this function).
      use_istio: Whether to use Istio or not (not read by this function).
      config_path: Path to the KFDef spec file; may contain str.format
        placeholders that are filled from values.
      build_and_apply: whether to build and apply or apply
      kfctl_repo_path: path to the kubeflow/kfctl repo.
      cluster_name: Name of EKS cluster
      values: Comma separated list of key=value variables to substitute
        into config_path. May be empty.
    """
    util.set_pytest_junit(record_xml_attribute, "test_deploy_kubeflow")

    if values:
        pairs = values.split(",")
        path_vars = {}
        for p in pairs:
            k, v = p.split("=")
            path_vars[k] = v

        config_path = config_path.format(**path_vars)
        logging.info("config_path after substitution: %s", config_path)

    # The deployment must run whether or not substitutions were supplied;
    # previously it was nested under `if values:` and silently skipped.
    kfctl_path = os.path.join(kfctl_repo_path, "bin", "kfctl")
    app_path = kfctl_util.kfctl_deploy_kubeflow(app_path, config_path,
                                                kfctl_path,
                                                build_and_apply,
                                                cluster_name)
    logging.info("kubeflow app path: %s", app_path)
예제 #3
0
def test_endpoint_is_ready(record_xml_attribute, project, app_path, app_name,
                           use_basic_auth):
    """Check that the endpoint of the Kubeflow deployment is ready.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      project: The gcp project that we deployed kubeflow
      app_path: Directory containing login.json (read only for basic auth).
      app_name: The name of the kubeflow deployment
      use_basic_auth: When truthy the basic auth check is used, otherwise
        the IAP check.

    Raises:
      Exception: If the selected readiness check reports not ready.
    """
    util.set_pytest_junit(record_xml_attribute, "test_endpoint_is_ready")

    url = "https://{}.endpoints.{}.cloud.goog".format(app_name, project)

    if not use_basic_auth:
        # Owned by project kubeflow-ci-deployment.
        client_id = "29647740582-7meo6c7a9a76jvg54j0g2lv8lrsb4l8g.apps.googleusercontent.com"
        os.environ["CLIENT_ID"] = client_id
        if gcp_util.iap_is_ready(url):
            return
        raise Exception("IAP endpoint is not ready")

    login_file = os.path.join(app_path, "login.json")
    with open(login_file, "r") as f:
        credentials = json.load(f)
    # Let it fail if login info cannot be found.
    username = credentials["KUBEFLOW_USERNAME"]
    password = credentials["KUBEFLOW_PASSWORD"]
    if not gcp_util.basic_auth_is_ready(url, username, password):
        raise Exception("Basic auth endpoint is not ready")
예제 #4
0
def test_run_notebook(record_xml_attribute, namespace, # pylint: disable=too-many-branches,too-many-statements
                      image_file, notebook_path, test_target_name,
                      artifacts_gcs):
  """Run a notebook as a papermill job using the image named in image_file.

  Args:
    record_xml_attribute: Test fixture provided by pytest.
    namespace: Passed through to nb_test_util.run_papermill_job.
    image_file: Path to a YAML file whose "image" field holds the URI of
      the docker image to run the notebook in.
    notebook_path: Path of the notebook; its basename is used to derive the
      junit name and the job name.
    test_target_name: Passed through to util.set_pytest_junit.
    artifacts_gcs: Passed through to nb_test_util.run_papermill_job.

  Raises:
    ValueError: If image_file is not set, or the file does not contain a
      mapping with an "image" field.
  """
  if not image_file:
    raise ValueError("image_file must be provided")

  notebook_name = os.path.basename(
      notebook_path).replace(".ipynb", "").replace("_", "-")
  junit_name = "_".join(["test", notebook_name])
  util.set_pytest_junit(record_xml_attribute, junit_name, test_target_name)

  # Time stamp plus a short random suffix keeps job names unique per run.
  name = "-".join([notebook_name,
                   datetime.datetime.now().strftime("%H%M%S"),
                   uuid.uuid4().hex[0:3]])

  logging.info(f"Reading file {image_file}")
  contents = util.read_file(image_file)
  # safe_load: yaml.load without an explicit Loader is deprecated and is
  # rejected by recent PyYAML releases.
  image_data = yaml.safe_load(contents)

  # An empty file parses to None and a scalar to a str; both are reported
  # as a missing field rather than crashing on the membership test.
  if not isinstance(image_data, dict) or "image" not in image_data:
    raise ValueError(f"File {image_file} is missing field image containing "
                     f"the URI of the docker image to run the notebook in")

  image = image_data["image"]
  logging.info(f"Using image {image}")
  nb_test_util.run_papermill_job(notebook_path, name, namespace, image,
                                 artifacts_gcs)
예제 #5
0
def test_kfctl_delete(record_xml_attribute, kfctl_path, app_path,
                      cluster_name):
    """Delete a Kubeflow deployment with `kfctl delete`, retrying on failure.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      kfctl_path: Path of the kfctl binary to invoke.
      app_path: Kubeflow app directory; tmp.yaml inside it is used as the
        kfdef file and the directory is the working directory of kfctl.
      cluster_name: Passed to kfctl_aws_util.aws_auth_load_kubeconfig.

    Raises:
      ValueError: If kfctl_path or app_path is not set.
    """
    util.set_pytest_junit(record_xml_attribute, "test_kfctl_delete")

    # TODO(PatrickXYS): do we need to load kubeconfig again?

    required = (
        (kfctl_path, "kfctl_path is required"),
        (app_path, "app_path is required"),
    )
    for value, message in required:
        if not value:
            raise ValueError(message)

    logging.info("Using kfctl path %s", kfctl_path)
    logging.info("Using app path %s", app_path)

    kfdef_path = os.path.join(app_path, "tmp.yaml")
    logging.info("Using kfdef file path %s", kfdef_path)

    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

    # We see failures because delete operation will delete cert-manager and
    # knative-serving, and encounter timeout. To deal with this we do retries.
    # This has a potential downside of hiding errors that are fixed by retrying.
    @retry(stop_max_delay=60 * 3 * 1000)
    def run_delete():
        delete_cmd = [kfctl_path, "delete", "-V", "-f", kfdef_path]
        util.run(delete_cmd, cwd=app_path)

    run_delete()
예제 #6
0
def test_katib_is_ready(record_xml_attribute, namespace):
    """Wait for the Katib deployments to start.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      namespace: The namespace Kubeflow is deployed to.
    """
    set_logging()
    util.set_pytest_junit(record_xml_attribute, "test_katib_is_ready")

    # Need to activate account for scopes.
    key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if key_file:
        util.run(["gcloud", "auth", "activate-service-account",
                  "--key-file=" + key_file])

    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    katib_deployments = (
        "katib-controller",
        "katib-mysql",
        "katib-db-manager",
        "katib-ui",
    )
    for deployment in katib_deployments:
        logging.info("Verifying that deployment %s started...", deployment)
        util.wait_for_deployment(api_client, namespace, deployment, 10)
def test_deploy(record_xml_attribute, deploy_name, namespace, model_dir,
                export_dir):
    """Deploy the serving app with kustomize and wait for its deployment.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      deploy_name: Value of the `name` configmap literal; also the name of
        the deployment that is waited for.
      namespace: Namespace set via `kustomize edit set namespace`.
      model_dir: Value of the `modelBasePath` configmap literal.
      export_dir: Value of the `exportDir` configmap literal.
    """
    util.set_pytest_junit(record_xml_attribute, "test_deploy")

    util.maybe_activate_service_account()

    app_dir = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "../serving/GCS"))
    logging.info("--app_dir not set defaulting to: %s", app_dir)

    # TODO (@jinchihe) Using kustomize 2.0.3 to work around below issue:
    # https://github.com/kubernetes-sigs/kustomize/issues/1295
    kustomize_url = ('https://github.com/kubernetes-sigs/kustomize/'
                     'releases/download/v2.0.3/kustomize_2.0.3_linux_amd64')
    # TODO (@jinchihe): The kubectl need to be upgraded to 1.14.0 due to below issue.
    # Invalid object doesn't have additional properties ...
    kubectl_url = ('https://storage.googleapis.com/kubernetes-release/'
                   'release/v1.14.0/bin/linux/amd64/kubectl')

    # Download each pinned binary and mark it executable.
    for target, url in (('/usr/local/bin/kustomize', kustomize_url),
                        ('/usr/local/bin/kubectl', kubectl_url)):
        util.run(['wget', '-q', '-O', target, url], cwd=app_dir)
        util.run(['chmod', 'a+x', target], cwd=app_dir)

    # Configure custom parameters using kustomize
    configmap = 'mnist-map-serving'
    util.run(['kustomize', 'edit', 'set', 'namespace', namespace], cwd=app_dir)
    literals = (
        ('--from-literal=name=', deploy_name),
        ('--from-literal=modelBasePath=', model_dir),
        ('--from-literal=exportDir=', export_dir),
    )
    for prefix, value in literals:
        util.run(['kustomize', 'edit', 'add', 'configmap', configmap,
                  prefix + value],
                 cwd=app_dir)

    # Apply the components
    util.run(['kustomize', 'build', app_dir, '-o', 'generated.yaml'],
             cwd=app_dir)
    util.run(['kubectl', 'apply', '-f', 'generated.yaml'], cwd=app_dir)

    kube_config.load_kube_config()
    client = k8s_client.ApiClient()
    util.wait_for_deployment(client, namespace, deploy_name,
                             timeout_minutes=4)
예제 #8
0
def test_kfctl_delete(record_xml_attribute, kfctl_path, app_path, project,
                      cluster_deletion_script):
    """Delete a Kubeflow deployment and verify its endpoint is gone.

    When cluster_deletion_script is set, only that script is run and the
    function returns. Otherwise the kfdef file (tmp.yaml in app_path) is
    rewritten so every KfGcpPlugin has deleteStorage enabled, `kfctl delete`
    is run with retries, and the endpoint service is checked against
    get_endpoints_list(project).

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      kfctl_path: Path of the kfctl binary to invoke.
      app_path: Kubeflow app directory; its basename is used as the
        deployment name of the endpoint.
      project: Project used to build the endpoint name and list endpoints.
      cluster_deletion_script: Optional shell snippet run via `bash -c`.

    Raises:
      ValueError: If kfctl_path or app_path is not set, or a KfGcpPlugin
        entry has no spec.
      AssertionError: If the endpoint service still exists after deletion.
    """
    util.set_pytest_junit(record_xml_attribute, "test_kfctl_delete")

    # TODO(yanniszark): split this into a separate workflow step
    if cluster_deletion_script:
        logging.info("cluster_deletion_script specified: %s",
                     cluster_deletion_script)
        util.run(["/bin/bash", "-c", cluster_deletion_script])
        return

    if not kfctl_path:
        raise ValueError("kfctl_path is required")

    if not app_path:
        raise ValueError("app_path is required")

    logging.info("Using kfctl path %s", kfctl_path)
    logging.info("Using app path %s", app_path)

    kfdef_path = os.path.join(app_path, "tmp.yaml")
    logging.info("Using kfdef file path %s", kfdef_path)
    with open(kfdef_path) as f:
        # safe_load: yaml.load without an explicit Loader is deprecated and
        # rejected by recent PyYAML releases. An empty file parses to None.
        kfdef = yaml.safe_load(f) or {}
    for plugin in kfdef.get("spec", {}).get("plugins", []):
        if plugin.get("kind", "") == "KfGcpPlugin":
            if "spec" not in plugin:
                # Format the message here; exceptions do not interpolate
                # logging-style arguments.
                raise ValueError("Invalid GCP plugin spec %s" % str(plugin))
            plugin["spec"]["deleteStorage"] = True
    with open(kfdef_path, "w") as f:
        yaml.dump(kfdef, f)

    # We see failures because delete will try to update the IAM policy which only allows
    # 1 update at a time. To deal with this we do retries.
    # This has a potential downside of hiding errors that are fixed by retrying.
    @retry(stop_max_delay=60 * 3 * 1000)
    def run_delete():
        util.run([kfctl_path, "delete", "-V", "-f", kfdef_path],
                 cwd=app_path)

    run_delete()

    # Use services.list instead of services.get because error returned is not
    # 404, it's 403 which is confusing.
    name = os.path.basename(app_path)
    endpoint_name = "{deployment}.endpoints.{project}.cloud.goog".format(
        deployment=name, project=project)
    logging.info("Verify endpoint service is deleted: %s", endpoint_name)
    if endpoint_name in get_endpoints_list(project):
        msg = "Endpoint is not deleted: " + endpoint_name
        logging.error(msg)
        raise AssertionError(msg)
    logging.info("Verified endpoint service is deleted.")
예제 #9
0
def test_gcp_kf_admin_wi(record_xml_attribute, namespace, app_name, platform,
                         project):
  """Test that the kubeflow admin SA has proper workload identity binding.

  Reads the IAM policy of the <app_name>-admin service account and checks
  that roles/iam.workloadIdentityUser lists each expected Kubernetes
  service account.

  Args:
    record_xml_attribute: Test fixture provided by pytest.
    namespace: The namespace Kubeflow is deployed to.
    app_name: Deployment name; prefix of the admin service account.
    platform: The test is skipped unless this equals "gcp".
    project: Project that owns the service account.

  Raises:
    Exception: If the policy has no roles/iam.workloadIdentityUser binding.
    AssertionError: If an expected member is missing from that binding.
  """
  set_logging()
  util.set_pytest_junit(record_xml_attribute, "test_gcp_kf_admin_wi")

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run(["gcloud", "auth", "activate-service-account",
              "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]])

  # The returned client is not used below; the call is kept in case it has
  # side effects this test relies on -- TODO confirm.
  deploy_utils.create_k8s_client()

  if platform != "gcp":
    # pytest.skip raises, so nothing after it runs.
    pytest.skip("Not running on GCP")

  cred = GoogleCredentials.get_application_default()
  # Create the Cloud IAM service object
  service = googleapiclient.discovery.build('iam', 'v1', credentials=cred)

  adminGcpSa = ('projects/%s/serviceAccounts/'
                '%s-admin@%s.iam.gserviceaccount.com') % (
                  project, app_name, project)

  request = service.projects().serviceAccounts().getIamPolicy(
    resource=adminGcpSa)
  response = request.execute()
  # A policy without any binding may omit the "bindings" key; treat it as
  # empty so the missing-role error below is raised instead of a KeyError.
  roleToMembers = {}
  for binding in response.get('bindings', []):
    roleToMembers[binding['role']] = set(binding['members'])

  workloadIdentityRole = 'roles/iam.workloadIdentityUser'
  if workloadIdentityRole not in roleToMembers:
    raise Exception("roles/iam.workloadIdentityUser missing in iam-policy of "
                    "service account %s" % adminGcpSa)

  account_str = "{project}.svc.id.goog[{namespace}/{account}]"

  # Expected workload identity users of the admin service account
  expected_wi_sa = [(namespace, "kf-admin"),
                    (namespace, "profiles-controller-service-account"),
                    ("istio-system", "kf-admin")]

  for sa_namespace, sa_account in expected_wi_sa:
    gcp_sa = account_str.format(project=project, namespace=sa_namespace,
                                account=sa_account)

    error_message = ("GCP SA {0} missing workload identity binding for "
                     "{1}").format(adminGcpSa, gcp_sa)

    member = "serviceAccount:" + gcp_sa
    assert member in roleToMembers[workloadIdentityRole], error_message
예제 #10
0
def test_lint(record_xml_attribute, src_dir, rcfile):  # pylint: disable=redefined-outer-name
    """Run pylint on every *.py file under src_dir and fail on lint errors.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      src_dir: Root directory to walk; also the working directory of pylint.
      rcfile: Path of the pylintrc to use. Defaults to .pylintrc in src_dir
        when not set.

    Raises:
      AssertionError: If pylint failed for at least one file.
    """
    # Override the classname attribute in the junit file.
    # This makes it easy to group related tests in test grid.
    # http://doc.pytest.org/en/latest/usage.html#record-xml-attribute
    util.set_pytest_junit(record_xml_attribute, "test_py_lint")

    logging.info('Running test_lint')
    # Print out the pylint version because different versions can produce
    # different results.
    util.run(["pylint", "--version"])

    abs_src_dir = os.path.abspath(src_dir)

    # kubeflow_testing is imported as a submodule so we should exclude it
    # TODO(jlewi): We should make this an argument.
    dir_excludes = [
        "dashboard/frontend/node_modules",
        "kubeflow_testing",
        "dev-kubeflow-org/ks-app/vendor",
        "release-infra",
    ]
    full_dir_excludes = [os.path.join(abs_src_dir, f) for f in dir_excludes]

    # TODO(jlewi): Use pathlib once we switch to python3.
    includes = ["*.py"]
    failed_files = []
    if not rcfile:
        # Absolute, because pylint runs with cwd=src_dir: a path relative to
        # the current directory would be resolved against src_dir a second
        # time.
        rcfile = os.path.join(abs_src_dir, ".pylintrc")

    for root, dirs, files in os.walk(abs_src_dir, topdown=True):
        # Exclude vendor directories and all sub files.
        if "vendor" in root.split(os.sep):
            continue

        # excludes can be done with fnmatch.filter and complementary set,
        # but it's more annoying to read.
        if should_exclude(root, full_dir_excludes):
            continue

        # Prune vendor directories in place so os.walk does not descend into
        # trees whose files would all be skipped by the check above.
        dirs[:] = [d for d in dirs if d != "vendor"]
        for pat in includes:
            for f in fnmatch.filter(files, pat):
                full_path = os.path.join(root, f)
                try:
                    util.run(["pylint", "--rcfile=" + rcfile, full_path],
                             cwd=src_dir)
                except subprocess.CalledProcessError:
                    # full_path is absolute; slicing by len(src_dir) only
                    # worked when src_dir was itself absolute.
                    failed_files.append(
                        os.path.relpath(full_path, abs_src_dir))
    if failed_files:
        failed_files.sort()
        logging.error("%s files had lint errors:\n%s", len(failed_files),
                      "\n".join(failed_files))
    else:
        logging.info("No lint issues.")

    assert not failed_files
예제 #11
0
def test_jupyter(record_xml_attribute, env, namespace):
  """Deploy the jupyter component and request its lab page via the API server.

  Args:
    record_xml_attribute: Test fixture provided by pytest.
    env: ksonnet environment.
    namespace: namespace to run in.

  Raises:
    RuntimeError: If the proxied request does not return status OK.
  """
  util.set_pytest_junit(record_xml_attribute, "jupyter_test")

  key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
  if key_file:
    logging.info("Activate service account")
    util.run(["gcloud", "auth", "activate-service-account",
              "--key-file=" + key_file])

  # util.load_kube_config appears to hang on python3
  kube_config.load_kube_config()
  api_client = k8s_client.ApiClient()
  host = api_client.configuration.host
  logging.info("Kubernetes master: %s", host)
  # Keep only what follows the last "/" of the host URL.
  master = host.rsplit("/", 1)[-1]

  app_dir = os.path.join(os.path.dirname(__file__), "test_app")
  ks_cmd = ks_util.get_ksonnet_cmd(app_dir)

  resource_name = "jupyter-test"
  service = "jupyter-test"
  component = "jupyter"
  ks_util.setup_ks_app(app_dir, env, namespace, component, "")

  util.run([ks_cmd, "apply", env, "-c", component], cwd=app_dir)
  results = util.wait_for_cr_condition(api_client, GROUP, PLURAL, VERSION,
                                       namespace, resource_name, ["Running"])

  logging.info("Result of CRD:\n%s", results)

  # We proxy the request through the APIServer so that we can connect
  # from outside the cluster.
  url_template = (
      "https://{master}/api/v1/namespaces/{namespace}/services/{service}:80"
      "/proxy/default/jupyter/lab?")
  url = url_template.format(
      master=master, namespace=namespace, service=service)
  logging.info("Request: %s", url)
  response = send_request(url, verify=False)

  if response.status_code == requests.codes.OK:
    return

  msg = "Request to {0} exited with status code: {1} and content: {2}".format(
      url, response.status_code, response.content)
  logging.error(msg)
  raise RuntimeError(msg)
예제 #12
0
def test_kfctl_delete(record_xml_attribute, cluster_deletion_script,
                      cluster_name):
    """Run the cluster deletion script, if one was given.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      cluster_deletion_script: Shell snippet run via `bash -c`; nothing is
        done when it is empty.
      cluster_name: Exported as the CLUSTER_NAME environment variable before
        the script runs.
    """
    util.set_pytest_junit(record_xml_attribute, "test_cluster_delete")

    if not cluster_deletion_script:
        return

    logging.info("cluster_deletion_script specified: %s",
                 cluster_deletion_script)
    os.environ["CLUSTER_NAME"] = cluster_name
    util.run(["/bin/bash", "-c", cluster_deletion_script])
예제 #13
0
def test_gcp_access(record_xml_attribute, namespace, app_path, project):
    """Test the GCP service account setup of a Kubeflow deployment.

    On the gcp platform this checks that the user-gcp-sa secret exists and
    that the IAM policy of the <app_name>-user service account grants
    roles/owner to the admin service account and contains a
    roles/iam.workloadIdentityUser binding. Other platforms are not checked.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      namespace: The namespace Kubeflow is deployed to.
      app_path: Passed to get_platform_app_name to obtain the platform and
        the app name.
      project: Project that owns the service accounts.

    Raises:
      Exception: If one of the expected IAM bindings is missing.
    """
    set_logging()
    util.set_pytest_junit(record_xml_attribute, "test_gcp_access")

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
        ])

    api_client = deploy_utils.create_k8s_client()

    platform, app_name = get_platform_app_name(app_path)
    if platform != "gcp":
        return

    # check secret
    util.check_secret(api_client, namespace, "user-gcp-sa")

    cred = GoogleCredentials.get_application_default()
    # Create the Cloud IAM service object
    service = googleapiclient.discovery.build('iam',
                                              'v1',
                                              credentials=cred)

    userSa = 'projects/%s/serviceAccounts/%s-user@%s.iam.gserviceaccount.com' % (
        project, app_name, project)
    adminSa = 'serviceAccount:%s-admin@%s.iam.gserviceaccount.com' % (
        app_name, project)

    request = service.projects().serviceAccounts().getIamPolicy(
        resource=userSa)
    response = request.execute()
    # A policy without any binding may omit the "bindings" key; treat it as
    # empty so the missing-role errors below are raised instead of KeyError.
    roleToMembers = {}
    for binding in response.get('bindings', []):
        roleToMembers[binding['role']] = set(binding['members'])

    if 'roles/owner' not in roleToMembers:
        raise Exception("roles/owner missing in iam-policy of %s" % userSa)

    if adminSa not in roleToMembers['roles/owner']:
        # %s, not the Go-style %v, which Python rejects with ValueError.
        raise Exception("Admin %s should be owner of user %s" %
                        (adminSa, userSa))

    workloadIdentityRole = 'roles/iam.workloadIdentityUser'
    if workloadIdentityRole not in roleToMembers:
        raise Exception(
            "roles/iam.workloadIdentityUser missing in iam-policy of %s" %
            userSa)
예제 #14
0
def test_mnist_gcp(record_xml_attribute, name, namespace, # pylint: disable=too-many-branches,too-many-statements
                   repos, image):
  """Generate the papermill job for the mpi-operator notebook and submit it.

  Args:
    record_xml_attribute: Test fixture provided by pytest.
    name: Job name; generated from a time stamp and a random suffix when
      empty.
    namespace: Passed through to nb_test_util.run_papermill_job.
    repos: Passed through to nb_test_util.run_papermill_job.
    image: Passed through to nb_test_util.run_papermill_job.
  """
  util.set_pytest_junit(record_xml_attribute, "test_mpioperator")

  if not name:
    timestamp = datetime.datetime.now().strftime("%H%M%S")
    suffix = uuid.uuid4().hex[0:3]
    name = "mpioperator_notebook-" + timestamp + "-" + suffix

  # Second junit registration, issued in the same order as before.
  util.set_pytest_junit(record_xml_attribute, "test_mpioperator_notebook")

  nb_test_util.run_papermill_job(
      "kubeflow/mpi-operator/examples/v1alpha2/mpi_notebook.ipynb",
      name, namespace, repos, image)
예제 #15
0
def test_build_kfctl_go(record_xml_attribute):
    """Build the kfctl go binary and log its path.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
    """
    util.set_pytest_junit(record_xml_attribute, "test_build_kfctl_go")

    # Need to activate account for scopes.
    key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if key_file:
        util.run(["gcloud", "auth", "activate-service-account",
                  "--key-file=" + key_file])

    binary_path = kfctl_util.build_kfctl_go()
    logging.info("kfctl go binary path %s", binary_path)
예제 #16
0
def test_endpoint_is_ready(record_xml_attribute, project, app_name):
  """Check that the endpoint of the Kubeflow deployment becomes ready.

  Args:
    record_xml_attribute: Test fixture provided by pytest.
    project: The gcp project that we deployed kubeflow
    app_name: The name of the kubeflow deployment

  Raises:
    Exception: If gcp_util.endpoint_is_ready reports the endpoint not ready.
  """
  util.set_pytest_junit(record_xml_attribute, "test_endpoint_is_ready")

  # Owned by project kubeflow-ci-deployment.
  client_id = "29647740582-7meo6c7a9a76jvg54j0g2lv8lrsb4l8g.apps.googleusercontent.com"
  os.environ["CLIENT_ID"] = client_id

  url = "https://{}.endpoints.{}.cloud.goog".format(app_name, project)
  ready = gcp_util.endpoint_is_ready(url, wait_min=25)
  if not ready:
    raise Exception("Endpoint not ready")
예제 #17
0
def test_kfctl_delete_wrong_cluster(record_xml_attribute, kfctl_path, app_path,
                                    project, cluster_deletion_script):
    """Run `kfctl delete` against a kfdef that names the wrong cluster.

    The clusterName in tmp.yaml is temporarily replaced with "dummy". A
    CalledProcessError whose output mentions the cluster name mismatch is
    accepted; any other CalledProcessError is re-raised (and retried). The
    original clusterName is written back afterwards in every case.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      kfctl_path: Path of the kfctl binary to invoke.
      app_path: Kubeflow app directory containing tmp.yaml.
      project: Not read by this function.
      cluster_deletion_script: Not read by this function.

    Raises:
      ValueError: If kfctl_path or app_path is not set, or the kfdef has no
        clusterName.
    """
    util.set_pytest_junit(record_xml_attribute,
                          "test_kfctl_delete_wrong_cluster")
    if not kfctl_path:
        raise ValueError("kfctl_path is required")

    if not app_path:
        raise ValueError("app_path is required")

    logging.info("Using kfctl path %s", kfctl_path)
    logging.info("Using app path %s", app_path)

    kfdef_path = os.path.join(app_path, "tmp.yaml")
    with open(kfdef_path, "r") as src:
        kfdef = yaml.safe_load(src)

    # Make sure we copy the correct host instead of string reference.
    metadata = kfdef.get("metadata", {})
    cluster = metadata.get("clusterName", "")[:]
    if not cluster:
        raise ValueError("cluster is not written to kfdef")

    def write_cluster_name(value):
        # Persist the kfdef with the given clusterName.
        kfdef["metadata"]["clusterName"] = value
        with open(kfdef_path, "w") as dst:
            yaml.dump(kfdef, dst)

    @retry(stop_max_delay=60 * 3 * 1000)
    def run_delete():
        try:
            # Put an obvious wrong cluster into KfDef
            write_cluster_name("dummy")
            delete_cmd = [
                kfctl_path, "delete", "--delete_storage", "-V", "-f",
                kfdef_path
            ]
            util.run(delete_cmd, cwd=app_path)
        except subprocess.CalledProcessError as e:
            if e.output.find("cluster name doesn't match") == -1:
                # Re-throw error if it's not expected.
                raise e
        finally:
            # Restore the correct host info.
            write_cluster_name(cluster[:])

    run_delete()
예제 #18
0
def check_deployments_ready(record_xml_attribute, namespace, name, deployments,
                            cluster_name):
    """Wait for each listed deployment to start.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      namespace: The namespace Kubeflow is deployed to.
      name: Name handed to util.set_pytest_junit for the junit report.
      deployments: Iterable of deployment names to wait for.
      cluster_name: Passed to kfctl_aws_util.aws_auth_load_kubeconfig.
    """
    set_logging()
    util.set_pytest_junit(record_xml_attribute, name)

    kfctl_aws_util.aws_auth_load_kubeconfig(cluster_name)

    client = deploy_utils.create_k8s_client()

    for deployment in deployments:
        logging.info("Verifying that deployment %s started...", deployment)
        util.wait_for_deployment(client, namespace, deployment, 10)
예제 #19
0
def test_kfam(record_xml_attribute):
  """Create a profile through the kfam service and verify it exists.

  The request is sent with curl from inside the jupyter-web-app pod.

  Args:
    record_xml_attribute: Test fixture provided by pytest.

  Raises:
    AssertionError: If verify_profile_creation returns a falsy value.
  """
  util.set_pytest_junit(record_xml_attribute, "test_kfam_e2e")
  util.load_kube_config()
  util.load_kube_credentials()

  getcmd = "kubectl get pods -n kubeflow -l=app=jupyter-web-app --template '{{range.items}}{{.metadata.name}}{{end}}'"
  command_output = util.run(getcmd.split(' '))
  # Drop the first and last character of the output (presumably the quotes
  # that surround the template -- TODO confirm).
  jupyterpod = command_output[1:-1]

  logging.info("accessing kfam svc from jupyter pod %s", jupyterpod)

  sleep(10)
  # Profile Creation
  profile_name = "testprofile-%s" % uuid.uuid4().hex[0:7]
  payload = '{"metadata":{"name":"%s"},"spec":{"owner":{"kind":"User","name":"*****@*****.**"}}}' % profile_name
  curl_cmd = [
      'kubectl', 'exec', jupyterpod, '-n', 'kubeflow', '--', 'curl',
      '--silent', '-X', 'POST', '-d', payload,
      'profiles-kfam.kubeflow:8081/kfam/v1/profiles',
  ]
  util.run(curl_cmd)

  assert verify_profile_creation(jupyterpod, profile_name)
예제 #20
0
def test_profiles(record_xml_attribute,
                  profileFile="profile_v1beta1_profile.yaml"):
    """Create a profile, verify what it produces, then delete it again.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      profileFile: Profile YAML file handed to createProfile.
    """
    util.set_pytest_junit(record_xml_attribute, "test_profile_e2e")
    util.maybe_activate_service_account()
    # util.load_kube_config appears to hang on python3
    kube_config.load_kube_config()
    client = k8s_client.ApiClient()

    # Profile creation
    group, version, name = createProfile(client, profileFile)
    verifyProfileCreation(client, group, version, name)
    for verify in (verifyNamespaceCreation, verifyServiceAccounts,
                   verifyRolebindings):
        verify(client, name)

    # Profile deletion
    deleteProfile(client, group, version, name)
    verifyProfileDeletion(client, group, version, name)
예제 #21
0
def test_run_notebook(
        record_xml_attribute,
        namespace,  # pylint: disable=too-many-branches,too-many-statements
        repos,
        image,
        notebook_path):
    """Run the given notebook as a papermill job.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      namespace: Passed through to nb_test_util.run_papermill_job.
      repos: Passed through to nb_test_util.run_papermill_job.
      image: Passed through to nb_test_util.run_papermill_job.
      notebook_path: Path of the notebook; its basename is used to derive
        the junit name and the job name.
    """
    base = os.path.basename(notebook_path)
    notebook_name = base.replace(".ipynb", "").replace("_", "-")
    junit_name = "_".join(["test", notebook_name])
    util.set_pytest_junit(record_xml_attribute, junit_name)

    # Time stamp plus a short random suffix keeps job names unique per run.
    timestamp = datetime.datetime.now().strftime("%H%M%S")
    suffix = uuid.uuid4().hex[0:3]
    name = "-".join([notebook_name, timestamp, suffix])

    # The junit name is registered a second time, exactly as before.
    util.set_pytest_junit(record_xml_attribute, junit_name)
    nb_test_util.run_papermill_job(notebook_path, name, namespace, repos,
                                   image)
예제 #22
0
def test_deploy_kfctl_go(record_xml_attribute, app_path, project,
                         use_basic_auth, use_istio, config_path, kfctl_path):
    """Deploy Kubeflow with kfctl and verify the resulting kubeconfig.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      app_path: The path to the Kubeflow app.
      project: The GCP project to use.
      use_basic_auth: Passed through to kfctl_util.kfctl_deploy_kubeflow.
      use_istio: Passed through to kfctl_util.kfctl_deploy_kubeflow.
      config_path: Passed through to kfctl_util.kfctl_deploy_kubeflow.
      kfctl_path: Passed through to kfctl_util.kfctl_deploy_kubeflow.
    """
    util.set_pytest_junit(record_xml_attribute, "test_deploy_kfctl_go")

    # Need to activate account for scopes.
    key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if key_file:
        util.run(["gcloud", "auth", "activate-service-account",
                  "--key-file=" + key_file])

    deploy_args = (app_path, project, use_basic_auth, use_istio, config_path,
                   kfctl_path)
    kfctl_util.kfctl_deploy_kubeflow(*deploy_args)

    kfctl_util.verify_kubeconfig(app_path)
예제 #23
0
def test_create_cluster(record_xml_attribute, cluster_name, eks_cluster_version, cluster_creation_script, values):
  """Create the EKS cluster used by the E2E test.

  Args:
    record_xml_attribute: Test fixture provided by pytest.
    cluster_name: Name of EKS cluster
    eks_cluster_version: Version of EKS cluster
    cluster_creation_script: script invoked to create a new cluster
    values: Comma separated list of key=value pairs. They are parsed (a
      malformed pair raises ValueError) but the result is not used here.
  """
  util.set_pytest_junit(record_xml_attribute, "test_create_cluster")

  if values:
    # Parsed only for validation; nothing below reads the mapping.
    _path_vars = {
        key: value
        for key, value in (pair.split("=") for pair in values.split(","))
    }

  # Create EKS Cluster
  logging.info("Creating EKS Cluster")
  os.environ["CLUSTER_NAME"] = cluster_name
  os.environ["EKS_CLUSTER_VERSION"] = eks_cluster_version
  util.run(["/bin/bash", "-c", cluster_creation_script])
예제 #24
0
def test_build_kfctl_go(record_xml_attribute, app_path, project,
                        use_basic_auth, use_istio, config_path,
                        build_and_apply, kfctl_repo_path,
                        cluster_creation_script):
    """Build kfctl, optionally create a cluster, and deploy Kubeflow.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      app_path: The path to the Kubeflow app.
      project: The GCP project to use.
      use_basic_auth: Whether to use basic_auth.
      use_istio: Whether to use Istio or not
      config_path: Path to the KFDef spec file.
      build_and_apply: whether to build and apply or apply
      kfctl_repo_path: path to the kubeflow/kfctl repo.
      cluster_creation_script: script invoked to create a new cluster; when
        set, the kubeconfig verification at the end is skipped.
    """
    util.set_pytest_junit(record_xml_attribute, "test_build_kfctl_go")

    # Need to activate account for scopes.
    key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if key_file:
        util.run(["gcloud", "auth", "activate-service-account",
                  "--key-file=" + key_file])

    # TODO(yanniszark): split this into a separate workflow step
    if cluster_creation_script:
        logging.info("Cluster creation script specified: %s",
                     cluster_creation_script)
        util.run(["/bin/bash", "-c", cluster_creation_script])

    logging.info("using kfctl repo: %s", kfctl_repo_path)
    kfctl_path = kfctl_util.build_kfctl_go(kfctl_repo_path)
    deployed_app_path = kfctl_util.kfctl_deploy_kubeflow(
        app_path, project, use_basic_auth, use_istio, config_path,
        kfctl_path, build_and_apply)

    if cluster_creation_script:
        return
    kfctl_util.verify_kubeconfig(deployed_app_path)
예제 #25
0
def check_deployments_ready(record_xml_attribute, namespace, name, deployments):
  """Wait for each listed deployment to start.

  Args:
    record_xml_attribute: Test fixture provided by pytest.
    namespace: The namespace Kubeflow is deployed to.
    name: Name handed to util.set_pytest_junit for the junit report.
    deployments: Iterable of deployment names to wait for.
  """
  set_logging()
  # TODO(jlewi): Should we do this in the calling function)?
  util.set_pytest_junit(record_xml_attribute, name)

  # Need to activate account for scopes.
  key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
  if key_file:
    util.run(["gcloud", "auth", "activate-service-account",
              "--key-file=" + key_file])

  client = deploy_utils.create_k8s_client()

  util.load_kube_config()

  for deployment in deployments:
    logging.info("Verifying that deployment %s started...", deployment)
    util.wait_for_deployment(client, namespace, deployment, 10)
예제 #26
0
def test_build_kfctl_go(record_xml_attribute, config_path, kfctl_repo_path,
                        values):
    """Test building kfctl from the given repo checkout.

    Args:
      record_xml_attribute: Forwarded to util.set_pytest_junit.
      config_path: Path to the KFDef spec file. May contain `{key}`
        placeholders that are filled in from `values`; the substituted
        path is only logged here.
      kfctl_repo_path: path to the kubeflow/kfctl repo.
      values: Comma separated list of `key=value` variables to substitute
        into config_path. Empty/None disables substitution.

    Raises:
      ValueError: if an entry in `values` has no "=".
      KeyError: if config_path references a placeholder missing from `values`.
    """
    util.set_pytest_junit(record_xml_attribute, "test_build_kfctl_go")

    logging.info("using kfctl repo: %s", kfctl_repo_path)

    if values:
        path_vars = {}
        for pair in values.split(","):
            # Split on the first "=" only, so a value may itself contain "="
            # (e.g. a URL with query parameters).
            key, value = pair.split("=", 1)
            path_vars[key] = value

        config_path = config_path.format(**path_vars)
        logging.info("config_path after substitution: %s", config_path)

    kfctl_util.build_kfctl_go(kfctl_repo_path)
예제 #27
0
def test_training(
        record_xml_attribute,
        tfjob_name,
        namespace,
        trainer_image,
        num_ps,  #pylint: disable=too-many-arguments
        num_workers,
        train_steps,
        batch_size,
        learning_rate,
        model_dir,
        export_dir):
    """Deploy the MNIST TFJob with kustomize and fail unless it succeeds.

    The kustomize app in ../training/GCS (relative to this file) is
    customized in place, built into generated.yaml and applied with kubectl.
    The function then waits for the TFJob and inspects its final state.

    Args:
      record_xml_attribute: Forwarded to util.set_pytest_junit.
      tfjob_name: Name of the TFJob; also written to the training configmap.
      namespace: Namespace set in the kustomization and used for lookups.
      trainer_image: Image substituted for `training-image`.
      num_ps: Value passed to `definition.sh --numPs`.
      num_workers: Value passed to `definition.sh --numWorkers`.
      train_steps: Configmap value `trainSteps`.
      batch_size: Configmap value `batchSize`.
      learning_rate: Configmap value `learningRate`.
      model_dir: Configmap value `modelDir`.
      export_dir: Configmap value `exportDir`.

    Raises:
      RuntimeError: if the TFJob finished without succeeding. Pod logs are
        written to the log before the error is raised.
    """
    util.set_pytest_junit(record_xml_attribute, "test_mnist")

    util.maybe_activate_service_account()

    app_dir = os.path.join(os.path.dirname(__file__), "../training/GCS")
    app_dir = os.path.abspath(app_dir)
    logging.info("--app_dir not set defaulting to: %s", app_dir)

    # TODO (@jinchihe) Using kustomize 2.0.3 to work around below issue:
    # https://github.com/kubernetes-sigs/kustomize/issues/1295
    kustomize_url = 'https://github.com/kubernetes-sigs/kustomize/' \
                    'releases/download/v2.0.3/kustomize_2.0.3_linux_amd64'
    util.run(['wget', '-q', '-O', '/usr/local/bin/kustomize', kustomize_url],
             cwd=app_dir)
    util.run(['chmod', 'a+x', '/usr/local/bin/kustomize'], cwd=app_dir)

    # TODO (@jinchihe): The kubectl need to be upgraded to 1.14.0 due to below issue.
    # Invalid object doesn't have additional properties ...
    kubectl_url = 'https://storage.googleapis.com/kubernetes-release/' \
                  'release/v1.14.0/bin/linux/amd64/kubectl'
    util.run(['wget', '-q', '-O', '/usr/local/bin/kubectl', kubectl_url],
             cwd=app_dir)
    util.run(['chmod', 'a+x', '/usr/local/bin/kubectl'], cwd=app_dir)

    # Configure custom parameters using kustomize.
    util.run(['kustomize', 'edit', 'set', 'namespace', namespace], cwd=app_dir)
    util.run([
        'kustomize', 'edit', 'set', 'image', 'training-image=' + trainer_image
    ],
             cwd=app_dir)

    util.run(['../base/definition.sh', '--numPs', num_ps], cwd=app_dir)
    util.run(['../base/definition.sh', '--numWorkers', num_workers],
             cwd=app_dir)

    training_config = {
        "name": tfjob_name,
        "trainSteps": train_steps,
        "batchSize": batch_size,
        "learningRate": learning_rate,
        "modelDir": model_dir,
        "exportDir": export_dir,
    }

    configmap = 'mnist-map-training'
    for key, value in training_config.items():
        # str() keeps the concatenation valid when a caller passes a number
        # (e.g. train_steps=200) instead of a string.
        util.run([
            'kustomize', 'edit', 'add', 'configmap', configmap,
            '--from-literal=' + key + '=' + str(value)
        ],
                 cwd=app_dir)

    # Create the TFJob.
    util.run(['kustomize', 'build', app_dir, '-o', 'generated.yaml'],
             cwd=app_dir)
    util.run(['kubectl', 'apply', '-f', 'generated.yaml'], cwd=app_dir)
    logging.info("Created job %s in namespaces %s", tfjob_name, namespace)

    kube_config.load_kube_config()
    api_client = k8s_client.ApiClient()

    # Wait for the job to complete.
    logging.info("Waiting for job to finish.")
    results = tf_job_client.wait_for_job(
        api_client,
        namespace,
        tfjob_name,
        status_callback=tf_job_client.log_status)
    logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

    # Check for errors creating pods and services. Can potentially
    # help debug failed test runs.
    creation_failures = tf_job_client.get_creation_failures_from_tfjob(
        api_client, namespace, results)
    if creation_failures:
        logging.warning(creation_failures)

    if tf_job_client.job_succeeded(results):
        return

    failure = "Job {0} in namespace {1} in status {2}".format(
        tfjob_name, namespace, results.get("status", {}))
    logging.error(failure)

    # The TFJob failed: print out the pod logs for debugging before failing.
    pod_names = tf_job_client.get_pod_names(api_client, namespace, tfjob_name)
    logging.info("The Pods name:\n %s", pod_names)

    core_api = k8s_client.CoreV1Api(api_client)

    for pod in pod_names:
        logging.info("Getting logs of Pod %s.", pod)
        try:
            pod_logs = core_api.read_namespaced_pod_log(pod, namespace)
            logging.info("The logs of Pod %s log:\n %s", pod, pod_logs)
        except k8s_client.rest.ApiException as e:
            # Log collection is best effort; the job failure below is what
            # must be reported.
            logging.info(
                "Exception when calling CoreV1Api->read_namespaced_pod_log: %s\n",
                e)

    # Previously the function just returned here, so pytest reported a pass
    # even though the TFJob had failed.
    raise RuntimeError(failure)
예제 #28
0
def test_build_kfctl_go(record_xml_attribute, app_name, app_path, project,
                        use_basic_auth, use_istio, config_path,
                        build_and_apply, kfctl_repo_path,
                        cluster_creation_script, self_signed_cert, values):
    """Test building and deploying Kubeflow.

    Args:
      record_xml_attribute: Forwarded to util.set_pytest_junit.
      app_name: kubeflow deployment name.
      app_path: The path to the Kubeflow app.
      project: The GCP project to use.
      use_basic_auth: Whether to use basic_auth.
      use_istio: Whether to use Istio or not
      config_path: Path to the KFDef spec file. May contain `{key}`
        placeholders that are filled in from `values`.
      build_and_apply: whether to build and apply or apply
      kfctl_repo_path: path to the kubeflow/kfctl repo.
      cluster_creation_script: script invoked (via `bash -c`) to create a
        new cluster. When set, the kubeconfig verification is skipped.
      self_signed_cert: whether to use self-signed cert for ingress.
      values: Comma separated list of `key=value` variables to substitute
        into config_path. Empty/None disables substitution.

    Raises:
      ValueError: if an entry in `values` has no "=".
      KeyError: if config_path references a placeholder missing from `values`.
    """
    util.set_pytest_junit(record_xml_attribute, "test_build_kfctl_go")

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
        ])

    # TODO(yanniszark): split this into a separate workflow step
    if cluster_creation_script:
        logging.info("Cluster creation script specified: %s",
                     cluster_creation_script)
        util.run(["/bin/bash", "-c", cluster_creation_script])

    logging.info("using kfctl repo: %s", kfctl_repo_path)

    if values:
        path_vars = {}
        for pair in values.split(","):
            # Split on the first "=" only, so a value may itself contain "="
            # (e.g. a URL with query parameters).
            key, value = pair.split("=", 1)
            path_vars[key] = value

        config_path = config_path.format(**path_vars)
        logging.info("config_path after substitution: %s", config_path)

    kfctl_path = kfctl_util.build_kfctl_go(kfctl_repo_path)
    app_path = kfctl_util.kfctl_deploy_kubeflow(app_path, project,
                                                use_basic_auth, use_istio,
                                                config_path, kfctl_path,
                                                build_and_apply)
    if not cluster_creation_script:
        kfctl_util.verify_kubeconfig(app_path)

    # Use self-signed cert for testing to prevent quota limiting.
    if self_signed_cert:
        logging.info("Configuring self signed certificate")
        util.load_kube_credentials()
        api_client = k8s_client.ApiClient()
        ingress_namespace = "istio-system"
        ingress_name = "envoy-ingress"
        tls_endpoint = "{0}.endpoints.{1}.cloud.goog".format(app_name, project)
        logging.info("Configuring self signed cert for %s", tls_endpoint)
        util.use_self_signed_for_ingress(ingress_namespace, ingress_name,
                                         tls_endpoint, api_client)
예제 #29
0
def test_xgboost_synthetic(
        record_xml_attribute,
        name,
        namespace,  # pylint: disable=too-many-branches,too-many-statements
        repos,
        image,
        notebook_artifacts_dir):
    """Generate the notebook test Job from job.yaml, submit it and wait.

    Args:
      record_xml_attribute: Forwarded to util.set_pytest_junit.
      name: Name for the K8s Job; when empty a name of the form
        `xgboost-test-<HHMMSS>-<3 hex chars>` is generated.
      namespace: Namespace the Job is created in.
      repos: Repos to check out in the init container; when empty they are
        taken from argo_build_util.get_repo_from_prow_env().
        `kubeflow/testing@HEAD` is always appended.
      image: Image set on the Job's first container.
      notebook_artifacts_dir: Local directory (created if missing) that
        receives the rendered `notebook.html`.

    Raises:
      RuntimeError: if the Job has no conditions after waiting, or its last
        condition is not "Complete".
    """
    util.set_pytest_junit(record_xml_attribute, "test_xgboost_synthetic")

    util.maybe_activate_service_account()

    with open("job.yaml") as hf:
        # safe_load: yaml.load() without an explicit Loader can construct
        # arbitrary Python objects and is rejected by PyYAML >= 6.
        job = yaml.safe_load(hf)

    # We need to checkout the correct version of the code
    # in presubmits and postsubmits. We should check the environment variables
    # for the prow environment variables to get the appropriate values.
    # See
    # https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables
    if not repos:
        repos = argo_build_util.get_repo_from_prow_env()

    repos += ",kubeflow/testing@HEAD"
    logging.info("Repos set to %s", repos)
    job["spec"]["template"]["spec"]["initContainers"][0]["command"] = [
        "/usr/local/bin/checkout_repos.sh",
        "--repos=" + repos,
        "--src_dir=/src",
        "--depth=all",
    ]

    nb_bucket = "kubeflow-ci-deployment"
    # NOTE: JOB_TYPE and HOSTNAME must both be set; os.path.join raises
    # TypeError if either os.getenv() returns None.
    nb_path = os.path.join("xgboost_synthetic_testing", os.getenv("JOB_TYPE"),
                           os.getenv("HOSTNAME"), "notebook.html")
    output_gcs = util.to_gcs_uri(nb_bucket, nb_path)
    logging.info("Tested notebook will be outputed to: %s", output_gcs)
    job["spec"]["template"]["spec"]["containers"][0]["env"] = [
        {
            "name": "PYTHONPATH",
            "value": "/src/kubeflow/testing/py"
        },
        {
            "name": "OUTPUT_GCS",
            "value": output_gcs
        },
    ]
    job["spec"]["template"]["spec"]["containers"][0]["image"] = image
    util.load_kube_config(persist_config=False)

    if name:
        job["metadata"]["name"] = name
    else:
        job["metadata"]["name"] = ("xgboost-test-" +
                                   datetime.datetime.now().strftime("%H%M%S") +
                                   "-" + uuid.uuid4().hex[0:3])
        name = job["metadata"]["name"]

    job["metadata"]["namespace"] = namespace

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()
    batch_api = k8s_client.BatchV1Api(api_client)

    logging.info("Creating job:\n%s", yaml.dump(job))
    actual_job = batch_api.create_namespaced_job(job["metadata"]["namespace"],
                                                 job)
    logging.info("Created job %s.%s:\n%s", namespace, name,
                 yaml.safe_dump(actual_job.to_dict()))

    final_job = util.wait_for_job(api_client,
                                  namespace,
                                  name,
                                  timeout=datetime.timedelta(minutes=30))

    logging.info("Final job:\n%s", yaml.safe_dump(final_job.to_dict()))

    if not final_job.status.conditions:
        raise RuntimeError("Job {0}.{1}; did not complete".format(
            namespace, name))

    last_condition = final_job.status.conditions[-1]

    # Download notebook html to artifacts. This happens before the status
    # check so the notebook is available for debugging failed runs too.
    # NOTE(review): presumably get_blob returns None when the object is
    # missing, which would surface as AttributeError here -- confirm.
    notebook_artifacts_path = os.path.join(notebook_artifacts_dir,
                                           "notebook.html")
    logging.info("Writing notebook artifact to: %s", notebook_artifacts_path)
    os.makedirs(notebook_artifacts_dir, exist_ok=True)
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(nb_bucket)
    blob = bucket.get_blob(nb_path)
    blob.download_to_filename(notebook_artifacts_path)

    if last_condition.type not in ["Complete"]:
        logging.error("Job didn't complete successfully")
        raise RuntimeError("Job {0}.{1} failed".format(namespace, name))
예제 #30
0
def test_kf_is_ready(record_xml_attribute, namespace, use_basic_auth,
                     use_istio, app_path):
    """Check that the expected Kubeflow workloads have started.

    Waits, in order, on the core deployments in `namespace`, the
    ingress-related deployments, the stateful sets, and finally the knative
    deployments in the "knative-serving" namespace.

    Args:
      record_xml_attribute: Forwarded to util.set_pytest_junit.
      namespace: The namespace Kubeflow is deployed to.
      use_basic_auth: On the "gcp" platform, selects the basic-auth login
        deployment instead of the IAP enabler.
      use_istio: When truthy, ingress workloads are looked up in
        "istio-system" instead of `namespace`.
      app_path: Passed to get_platform_app_name to determine the platform.
    """
    set_logging()
    util.set_pytest_junit(record_xml_attribute, "test_kf_is_ready")

    # The service account must be activated so that the scopes are available.
    key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if key_file:
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + key_file
        ])

    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    def _await_deployments(target_namespace, names):
        # Wait on each named deployment in target_namespace, in list order.
        for deployment in names:
            logging.info("Verifying that deployment %s started...",
                         deployment)
            util.wait_for_deployment(api_client, target_namespace,
                                     deployment, 10)

    # Verify that components are actually deployed.
    # TODO(jlewi): We need to parameterize this list based on whether
    # we are using IAP or basic auth.
    # TODO(yanniszark): This list is incomplete and missing a lot of components.
    core_deployments = [
        "argo-ui",
        "centraldashboard",
        "jupyter-web-app-deployment",
        "minio",
        "ml-pipeline",
        "ml-pipeline-persistenceagent",
        "ml-pipeline-scheduledworkflow",
        "ml-pipeline-ui",
        "ml-pipeline-viewer-controller-deployment",
        "mysql",
        "notebook-controller-deployment",
        "profiles-deployment",
        "pytorch-operator",
        "tf-job-operator",
        "workflow-controller",
    ]
    core_stateful_sets = []

    platform, _ = get_platform_app_name(app_path)

    ingress_deployments = [
        "istio-egressgateway",
        "istio-ingressgateway",
        "istio-pilot",
        "istio-policy",
        "istio-sidecar-injector",
        "istio-telemetry",
        "istio-tracing",
        "prometheus",
    ]
    ingress_stateful_sets = []

    knative_namespace = "knative-serving"
    knative_deployments = [
        "activator",
        "autoscaler",
        "controller",
    ]

    if platform == "gcp":
        core_deployments.append("cloud-endpoints-controller")
        core_stateful_sets.append("kfserving-controller-manager")
        if use_basic_auth:
            core_deployments.append("basic-auth-login")
        else:
            ingress_deployments.append("iap-enabler")
        # Both auth modes expect the backend-updater stateful set.
        ingress_stateful_sets.append("backend-updater")
    elif platform == "existing_arrikto":
        core_deployments.append("dex")
        ingress_deployments.append("authservice")
        knative_deployments = []

    # TODO(jlewi): Might want to parallelize this.
    _await_deployments(namespace, core_deployments)

    ingress_namespace = namespace
    if use_istio:
        ingress_namespace = "istio-system"
    _await_deployments(ingress_namespace, ingress_deployments)

    stateful_set_targets = [(namespace, ss) for ss in core_stateful_sets]
    stateful_set_targets += [(ingress_namespace, ss)
                             for ss in ingress_stateful_sets]

    for ss_namespace, ss_name in stateful_set_targets:
        logging.info("Verifying that stateful set %s.%s started...",
                     ss_namespace, ss_name)
        try:
            util.wait_for_statefulset(api_client, ss_namespace, ss_name)
        except:  # pylint: disable=bare-except
            # Collect debug information by running describe, then let the
            # original error propagate.
            describe_cmd = ["kubectl", "-n", ss_namespace, "describe",
                            "statefulsets", ss_name]
            util.run(describe_cmd)
            raise

    # TODO(jlewi): We should verify that the ingress is created and healthy.

    _await_deployments(knative_namespace, knative_deployments)