Example #1
def test_katib_is_ready(record_xml_attribute, namespace):
    """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """
    set_logging()
    util.set_pytest_junit(record_xml_attribute, "test_katib_is_ready")

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
        ])

    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    deployment_names = [
        "katib-controller",
        "katib-mysql",
        "katib-db-manager",
        "katib-ui",
    ]
    for deployment_name in deployment_names:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name, 10)
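The util.wait_for_deployment helper used above is not shown in these examples. A minimal sketch of the same polling idea, assuming the official kubernetes Python client; poll_deployment and its timeout handling are illustrative, not the actual helper:

import datetime
import logging
import time

from kubernetes import client as k8s_client


def poll_deployment(api_client, namespace, name, timeout_minutes=10):
  """Poll a deployment until all replicas are ready or the timeout expires."""
  apps = k8s_client.AppsV1Api(api_client)
  end_time = datetime.datetime.now() + datetime.timedelta(
      minutes=timeout_minutes)
  while datetime.datetime.now() < end_time:
    deploy = apps.read_namespaced_deployment(name, namespace)
    expected = deploy.spec.replicas or 1
    if (deploy.status.ready_replicas or 0) >= expected:
      logging.info("Deployment %s.%s is ready", namespace, name)
      return deploy
    time.sleep(10)
  raise TimeoutError("Deployment {0}.{1} not ready after {2} minutes".format(
      namespace, name, timeout_minutes))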
Example #2
def check_statefulsets_ready(record_xml_attribute, namespace, name, stateful_sets):
  """Test that Kubeflow deployments are successfully deployed.

  Args:
    namespace: The namespace to check
  """
  set_logging()
  # TODO(jlewi): Should we do this in the calling function?
  util.set_pytest_junit(record_xml_attribute, name)

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run(["gcloud", "auth", "activate-service-account",
              "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]])

  api_client = deploy_utils.create_k8s_client()

  util.load_kube_config()

  for set_name in stateful_sets:
    logging.info("Verifying that stateful set %s.%s started...", namespace,
                 set_name)
    try:
      util.wait_for_statefulset(api_client, namespace, set_name)
    except Exception:
      # Collect debug information by running describe
      util.run(["kubectl", "-n", namespace, "describe", "statefulsets",
                set_name])
      raise Exception(f"Stateful set {namespace}.{set_name} is not ready")
Example #3
def handle_retriable_exception(exception):
  if isinstance(exception, rest.ApiException):
    # ApiException may store the HTTP status code in .status or in the
    # HTTP response body
    # see: https://github.com/kubernetes-client/python/blob/5e512ff564c244c50cab780d821542ed56aa965a/kubernetes/client/rest.py#L289  # pylint: disable=line-too-long
    code = None
    if exception.body:
      if isinstance(exception.body, six.string_types):
        body = {}
        try:
          logging.info("Parsing ApiException body: %s", exception.body)
          body = json.loads(exception.body)
        except json.JSONDecodeError as e:
          logging.error("Error parsing body: %s", e)
      else:
        body = exception.body
      code = body.get("code")
    else:
      code = exception.status

    # UNAUTHORIZED and FORBIDDEN errors can be an indication we need to
    # refresh credentials
    logging.info("ApiException code=%s", code)
    # TODO(jlewi): In python3 we can switch to using http.HTTPStatus
    if code in [httplib.UNAUTHORIZED, httplib.FORBIDDEN, httplib.GATEWAY_TIMEOUT]:
      # Due to https://github.com/kubernetes-client/python-base/issues/59,
      # we need to reload the kube config (which refreshes the GCP token).
      # TODO(richardsliu): Remove this workaround when the k8s client issue
      # is resolved.
      util.load_kube_config()
      return True
  return not isinstance(exception, util.TimeoutError)
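Handlers like this are typically passed to the retrying package's retry_on_exception hook, so API calls are retried after credentials are refreshed. A sketch of that wiring, assuming the retrying library is available; the decorated function and its backoff parameters are illustrative:

from retrying import retry


@retry(retry_on_exception=handle_retriable_exception,
       wait_exponential_multiplier=1000,  # back off 1s, 2s, 4s, ...
       wait_exponential_max=10000,        # cap the backoff at 10 seconds
       stop_max_delay=5 * 60 * 1000)      # give up after 5 minutes
def list_pods(core_api, namespace):
  # Retried automatically; on 401/403 the handler reloads the kube config
  # before the next attempt.
  return core_api.list_namespaced_pod(namespace)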
Example #4
def test_deploy_pytorchjob(record_xml_attribute, kfctl_repo_path, namespace):
    """Deploy PytorchJob."""
    util.load_kube_config()
    util.load_kube_credentials()
    logging.info("using kfctl repo: %s" % kfctl_repo_path)
    util.run([
        "kubectl", "apply", "-f",
        os.path.join(
            kfctl_repo_path,
            "py/kubeflow/kfctl/testing/pytests/testdata/pytorch_job.yaml")
    ])
    api_client = k8s_client.ApiClient()
    api = k8s_client.CoreV1Api(api_client)

    # If the call throws exception, let it emit as an error case.
    resp = api.list_namespaced_pod(namespace)
    names = {
        "pytorch-mnist-ddp-cpu-master-0": False,
        "pytorch-mnist-ddp-cpu-worker-0": False,
    }

    for pod in resp.items:
        name = pod.metadata.name
        if name in names:
            names[name] = True

    msg = []
    for n in names:
        if not names[n]:
            msg.append("pod %s is not found" % n)
    if msg:
        raise ValueError("; ".join(msg))
Example #5
def run_test(test_case, test_func, args):  # pylint: disable=too-many-branches,too-many-statements
  """Run a test."""
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  # TODO(jlewi): When using GKE we should copy the .kube config and any other
  # files to the test directory. We should then set the environment variable
  # KUBECONFIG to point at that file. This should prevent us from having
  # to rerun util.configure_kubectl on each step. Instead we could run it once
  # as part of GKE cluster creation and store the config in the NFS directory.
  # This would make the handling of credentials
  # and KUBECONFIG more consistent between GKE and minikube and eventually
  # this could be extended to other K8s deployments.
  if cluster_name:
    util.configure_kubectl(project, zone, cluster_name)
  util.load_kube_config()

  start = time.time()

  try:  # pylint: disable=too-many-nested-blocks
    # We repeat the test multiple times.
    # This ensures that if we delete the job we can create a new job with the
    # same name.

    num_trials = args.num_trials
    logging.info("tfjob_version=%s", args.tfjob_version)

    for trial in range(num_trials):
      logging.info("Trial %s", trial)
      test_func()

    # TODO(jlewi):
    #  Here are some validation checks to run:
    #  1. Check that all resources are garbage collected.
    # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
    # TODO(jlewi): Are there other generic validation checks we should
    # run?
  except tf_operator_util.JobTimeoutError as e:
    if e.job:
      spec = "Job:\n" + json.dumps(e.job, indent=2)
    else:
      spec = "JobTimeoutError did not contain job"
    test_case.failure = "Timeout waiting for job to finish: " + spec
    logging.exception(test_case.failure)
  except Exception as e:  # pylint: disable-msg=broad-except
    # TODO(jlewi): I'm observing flakes where the exception has message
    # "status"; in an effort to nail down this exception we print out more
    # information about it.
    logging.exception("There was a problem running the job; Exception %s", e)
    # We want to catch all exceptions because we want to mark the test as failed.
    test_case.failure = ("Exception occurred; type {0} message {1}".format(
      e.__class__, str(e)))
  finally:
    test_case.time = time.time() - start
    if args.artifacts_path:
      test_util.create_junit_xml_file(
        [test_case],
        args.artifacts_path + "/junit_" + test_func.__name__ + ".xml",
        gcs_client)
Example #6
def test_wait_for_deployment(test_case): # pylint: disable=redefined-outer-name
  args = parse_args()
  util.maybe_activate_service_account()
  util.load_kube_config()
  end_time = (datetime.datetime.now() +
              datetime.timedelta(seconds=args.timeout * 60))
  wait_for_resource("crd/tfjobs.kubeflow.org", end_time)
  wait_for_resource("crd/pytorchjobs.kubeflow.org", end_time)
  logging.info("Found all resources successfully")
Example #7
def run_test(args, test_case):  # pylint: disable=too-many-branches,too-many-statements
    """Run a test."""
    util.load_kube_config()

    api_client = k8s_client.ApiClient()

    t = test_util.TestCase()
    t.class_name = "tfjob_test"
    namespace, name, env = test_runner.setup_ks_app(args)
    t.name = os.path.basename(name)

    try:  # pylint: disable=too-many-nested-blocks
        util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

        logging.info("Created job %s in namespaces %s", name, namespace)

        logging.info("Wait for conditions Failed")
        results = tf_job_client.wait_for_condition(
            api_client,
            namespace,
            name, ["Succeeded", "Failed"],
            status_callback=tf_job_client.log_status)

        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        # For v1alpha2 check for non-empty completionTime
        last_condition = results.get("status", {}).get("conditions", [])[-1]
        if last_condition.get("type", "").lower() != "failed":
            message = "Job {0} in namespace {1} did not fail; status {2}".format(
                name, namespace, results.get("status", {}))
            logging.error(message)
            test_case.add_failure_info(message)
            return

        pattern = ".*the spec is invalid.*"
        condition_message = last_condition.get("message", "")
        if not re.match(pattern, condition_message):
            message = "Condition message {0} did not match pattern {1}".format(
                condition_message, pattern)
            logging.error(message)
            test_case.add_failure_info(message)
    except tf_operator_util.JobTimeoutError as e:
        if e.job:
            spec = "Job:\n" + json.dumps(e.job, indent=2)
        else:
            spec = "JobTimeoutError did not contain job"
        message = ("Timeout waiting for {0} in namespace {1} to finish; "
                   ).format(name, namespace) + spec
        logging.exception(message)
        test_case.add_failure_info(message)
    except Exception as e:  # pylint: disable-msg=broad-except
        # TODO(jlewi): I'm observing flakes where the exception has message
        # "status"; in an effort to nail down this exception we print out more
        # information about it.
        message = "There was a problem running the job; Exception {0}".format(
            e)
        logging.exception(message)
        test_case.add_failure_info(message)
Example #8
def test_kf_is_ready(namespace, use_basic_auth):
    """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """

    logging.info("Using namespace %s", namespace)

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
        ])

    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    # Verify that components are actually deployed.
    # TODO(jlewi): We need to parameterize this list based on whether
    # we are using IAP or basic auth.
    deployment_names = [
        "argo-ui",
        "centraldashboard",
        "cert-manager",
        "cloud-endpoints-controller",
        "jupyter-web-app",
        "ml-pipeline",
        "ml-pipeline-scheduledworkflow",
        "ml-pipeline-ui",
        "notebooks-controller",
        "tf-job-operator",
        "profiles",
        "pytorch-operator",
        "studyjob-controller",
        "workflow-controller",
    ]

    stateful_sets = [
        "backend-updater",
    ]

    if use_basic_auth:
        deployment_names.extend(["basic-auth"])
    else:
        deployment_names.extend(["iap-enabler"])

    # TODO(jlewi): Might want to parallelize this.
    for deployment_name in deployment_names:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name)

    for name in stateful_sets:
        logging.info("Verifying that statefulset %s started...", name)
        util.wait_for_statefulset(api_client, namespace, name)
Example #9
def create_k8s_client():
    # We need to load the kube config so that we can have credentials to
    # talk to the APIServer.
    util.load_kube_config(persist_config=False)

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    return api_client
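Typical usage, as a sketch: hand the returned ApiClient to the typed wrappers from the official client. The "kubeflow" namespace here is illustrative:

api_client = create_k8s_client()
core_api = k8s_client.CoreV1Api(api_client)
for pod in core_api.list_namespaced_pod("kubeflow").items:
  print(pod.metadata.name, pod.status.phase)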
Example #10
def cleanup_workflows(args):
    logging.info("Cleanup Argo workflows")
    util.maybe_activate_service_account()

    util.run([
        "gcloud",
        "container",
        "clusters",
        "get-credentials",
        args.testing_cluster,
        "--zone=" + args.testing_zone,
        "--project=" + args.testing_project,
    ])

    # We need to load the kube config so that we can have credentials to
    # talk to the APIServer.
    util.load_kube_config(persist_config=False)

    client = k8s_client.ApiClient()
    crd_api = k8s_client.CustomObjectsApi(client)
    workflows = crd_api.list_namespaced_custom_object(argo_client.GROUP,
                                                      argo_client.VERSION,
                                                      args.namespace,
                                                      argo_client.PLURAL)

    expired = []
    unexpired = []

    for w in workflows["items"]:
        is_expired = False

        start_time = date_parser.parse(w["status"]["startedAt"])
        now = datetime.datetime.now(start_time.tzinfo)

        name = w["metadata"]["name"]
        age = now - start_time
        if age > datetime.timedelta(hours=args.max_wf_age_hours):
            logging.info("Deleting workflow: %s", name)
            is_expired = True
            if not args.dryrun:
                try:
                    crd_api.delete_namespaced_custom_object(
                        argo_client.GROUP, argo_client.VERSION, args.namespace,
                        argo_client.PLURAL, name, k8s_client.V1DeleteOptions())
                except Exception as e:  # pylint: disable=broad-except
                    logging.error(
                        "There was a problem deleting workflow %s.%s; "
                        "error: %s", args.namespace, name, e)
        if is_expired:
            expired.append(name)
        else:
            unexpired.append(name)

    logging.info("Unexpired workflows:\n%s", "\n".join(unexpired))
    logging.info("expired workflows:\n%s", "\n".join(expired))
    logging.info("Finished cleanup of Argo workflows")
Example #11
def handle_retriable_exception(exception):
    if (isinstance(exception, rest.ApiException)
            and (exception.status == 401 or exception.status == 403)):
        # Due to https://github.com/kubernetes-client/python-base/issues/59,
        # we need to reload the kube config (which refreshes the GCP token).
        # TODO(richardsliu): Remove this workaround when the k8s client issue
        # is resolved.
        util.load_kube_config()
        return True
    return not isinstance(exception, util.TimeoutError)
Example #12
def test_katib(test_case):  # pylint: disable=redefined-outer-name
  args = parse_args()
  namespace = NAMESPACE
  name = "katib-studyjob-test"

  util.load_kube_config()
  api_client = k8s_client.ApiClient()
  create_app_and_job(args, namespace, name)
  try:
    wait_for_condition(
        api_client, namespace, name, ["Running"], status_callback=log_status)
    logging.info("StudyJob launched successfully")
  except Exception as e:
    logging.error("Test failed waiting for job; %s", e)
    test_case.add_failure_info(str(e))
Example #13
def run_test(test_case, test_func, args):  # pylint: disable=too-many-branches,too-many-statements
    """Run a test."""
    util.load_kube_config()

    start = time.time()

    try:  # pylint: disable=too-many-nested-blocks
        # We repeat the test multiple times.
        # This ensures that if we delete the job we can create a new job with the
        # same name.

        num_trials = args.num_trials
        logging.info("tfjob_version=%s", args.tfjob_version)

        for trial in range(num_trials):
            logging.info("Trial %s", trial)
            test_func()

        # TODO(jlewi):
        #  Here are some validation checks to run:
        #  1. Check that all resources are garbage collected.
        # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
        # TODO(jlewi): Are there other generic validation checks we should
        # run.
    except tf_operator_util.JobTimeoutError as e:
        if e.job:
            spec = "Job:\n" + json.dumps(e.job, indent=2)
        else:
            spec = "JobTimeoutError did not contain job"
        test_case.failure = "Timeout waiting for job to finish: " + spec
        logging.exception(test_case.failure)
    except Exception as e:  # pylint: disable-msg=broad-except
        # TODO(jlewi): I'm observing flakes where the exception has message
        # "status"; in an effort to nail down this exception we print out more
        # information about it.
        logging.exception("There was a problem running the job; Exception %s",
                          e)
        # We want to catch all exceptions because we want to mark the test as failed.
        test_case.failure = ("Exception occurred; type {0} message {1}".format(
            e.__class__, str(e)))
    finally:
        test_case.time = time.time() - start
        if args.artifacts_path:
            test_util.create_junit_xml_file([test_case],
                                            args.artifacts_path + "/junit_" +
                                            test_func.__name__ + ".xml")
Example #14
def _iter_blueprints(namespace, context=None):
    """Return an iterator over blueprints.

  Args:
    namespace: The namespace to look for blueprints
    context: The kube context to use.
  """
    # We need to load the kube config so that we can have credentials to
    # talk to the APIServer.
    util.load_kube_config(persist_config=False, context=context)

    client = k8s_client.ApiClient()
    crd_api = cnrm_clients.CnrmClientApi(client, "containercluster")

    clusters = crd_api.list_namespaced(namespace)

    for c in clusters.get("items"):
        yield c
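A usage sketch: the generator yields raw custom-resource dictionaries, so callers index into metadata directly. The "blueprints" namespace and context name below are illustrative:

for cluster in _iter_blueprints("blueprints", context="my-kube-context"):
    print(cluster["metadata"]["name"],
          cluster["metadata"].get("creationTimestamp"))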
Example #15
def test_tf_job_simple(test_case):  # pylint: disable=redefined-outer-name
    args = parse_args()
    namespace = "default"
    name = "tf-job-simple"

    util.load_kube_config()
    api_client = k8s_client.ApiClient()
    create_app_and_job(args, namespace, name)
    try:
        tf_job_client.wait_for_condition(
            api_client,
            namespace,
            name, ["Running"],
            status_callback=tf_job_client.log_status)
        logging.info("TFJob launched successfully")
    except Exception as e:
        logging.error("Test failed waiting for job; %s", e)
        test_case.add_failure_info(str(e))
Example #16
def test_kfam(record_xml_attribute):
  util.set_pytest_junit(record_xml_attribute, "test_kfam_e2e")
  util.load_kube_config()
  util.load_kube_credentials()

  getcmd = "kubectl get pods -n kubeflow -l=app=jupyter-web-app --template '{{range.items}}{{.metadata.name}}{{end}}'"
  jupyterpod = util.run(getcmd.split(' '))[1:-1]

  logging.info("accessing kfam svc from jupyter pod %s" % jupyterpod)

  sleep(10)
  # Profile Creation
  profile_name = "testprofile-%s" % uuid.uuid4().hex[0:7]
  util.run(['kubectl', 'exec', jupyterpod, '-n', 'kubeflow', '--', 'curl',
            '--silent', '-X', 'POST', '-d',
            '{"metadata":{"name":"%s"},"spec":{"owner":{"kind":"User","name":"*****@*****.**"}}}' % profile_name,
            'profiles-kfam.kubeflow:8081/kfam/v1/profiles'])

  assert verify_profile_creation(jupyterpod, profile_name)
Example #17
def create_k8s_client(args):
    if args.cluster:
        project = args.project
        cluster_name = args.cluster
        zone = args.zone
        logging.info("Using cluster: %s in project: %s in zone: %s",
                     cluster_name, project, zone)
        # Print out config to help debug issues with accounts and
        # credentials.
        util.run(["gcloud", "config", "list"])
        util.configure_kubectl(project, zone, cluster_name)
        util.load_kube_config()
    else:
        # TODO(jlewi): This is sufficient for API access but it doesn't create
        # a kubeconfig file which ksonnet needs for ks init.
        logging.info("Running inside cluster.")
        incluster_config.load_incluster_config()

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()

    return api_client
Example #18
def test_jupyter(record_xml_attribute, kfctl_repo_path, namespace):
  """Test the jupyter notebook.
  Args:
    record_xml_attribute: Test fixture provided by pytest.
    env: ksonnet environment.
    namespace: namespace to run in.
  """
  util.load_kube_config()
  util.load_kube_credentials()
  logging.info("using kfctl repo: %s" % kfctl_repo_path)
  util.run(["kubectl", "apply", "-f",
            os.path.join(kfctl_repo_path,
                         "py/kubeflow/kfctl/testing/pytests/testdata/jupyter_test.yaml")])
  api_client = k8s_client.ApiClient()
  api = k8s_client.CoreV1Api(api_client)

  resp = api.list_namespaced_service(namespace)
  names = [service.metadata.name for service in resp.items]
  if not "jupyter-test" in names:
    raise ValueError("not able to find jupyter-test service.")
Example #19
def deploy_kubeflow(_):
    """Deploy Kubeflow."""
    args = parse_args()
    namespace = args.namespace
    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    # Verify that Jupyter is actually deployed.
    jupyter_name = "jupyter"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace, jupyter_name)

    # Verify that core components are actually deployed.
    deployment_names = [
        "tf-job-operator-v1beta1", "pytorch-operator", "studyjob-controller"
    ]
    for deployment_name in deployment_names:
        logging.info("Verifying that %s started...", deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name)
Example #20
def deploy_kubeflow(_):
  """Deploy Kubeflow."""
  args = parse_args()
  namespace = args.namespace
  api_client = deploy_utils.create_k8s_client()

  util.load_kube_config()
  # Verify that the TfJob operator is actually deployed.
  tf_job_deployment_name = "tf-job-operator-v1beta1"
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace, tf_job_deployment_name)

  # Verify that Jupyter is actually deployed.
  jupyter_name = "jupyter"
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace, jupyter_name)

  # Verify that the PyTorch operator is actually deployed.
  pytorch_operator_deployment_name = "pytorch-operator"
  logging.info("Verifying PyTorchJob controller started.")
  util.wait_for_deployment(api_client, namespace, pytorch_operator_deployment_name)
Example #21
def check_deployments_ready(record_xml_attribute, namespace, name, deployments):
  """Test that Kubeflow deployments are successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """
  set_logging()
  # TODO(jlewi): Should we do this in the calling function?
  util.set_pytest_junit(record_xml_attribute, name)

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run(["gcloud", "auth", "activate-service-account",
              "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]])

  api_client = deploy_utils.create_k8s_client()

  util.load_kube_config()

  for deployment_name in deployments:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, namespace, deployment_name, 10)
Example #22
def cleanup_workflows(args):
    # We need to load the kube config so that we can have credentials to
    # talk to the APIServer.
    util.load_kube_config(persist_config=False)

    client = k8s_client.ApiClient()
    crd_api = k8s_client.CustomObjectsApi(client)
    workflows = crd_api.list_namespaced_custom_object(argo_client.GROUP,
                                                      argo_client.VERSION,
                                                      args.namespace,
                                                      argo_client.PLURAL)

    expired = []
    unexpired = []

    for w in workflows["items"]:
        is_expired = False

        start_time = date_parser.parse(w["status"]["startedAt"])
        now = datetime.datetime.now(start_time.tzinfo)

        name = w["metadata"]["name"]
        age = now - start_time
        if age > datetime.timedelta(hours=args.max_age_hours):
            logging.info("Deleting workflow: %s", name)
            is_expired = True
            if not args.dryrun:
                crd_api.delete_namespaced_custom_object(
                    argo_client.GROUP, argo_client.VERSION, args.namespace,
                    argo_client.PLURAL, name, k8s_client.V1DeleteOptions())

        if is_expired:
            expired.append(name)
        else:
            unexpired.append(name)

    logging.info("Unexpired workflows:\n%s", "\n".join(unexpired))
    logging.info("expired workflows:\n%s", "\n".join(expired))
Example #23
    def wait_for_jobs(self, namespace, label_filter):
        """Wait for all the jobs with the specified label to finish.

    Args:
      label_filter: A label filter expression e.g. "group=mygroup"
    """
        if not util.is_in_cluster():
            util.load_kube_config(persist_config=False)
        else:
            config.load_incluster_config()

        # Create an API client object to talk to the K8s master.
        api_client = k8s_client.ApiClient()
        jobs = util.wait_for_jobs_with_label(api_client, namespace,
                                             label_filter)

        done = 0
        succeeded = 0
        for job in jobs.items:
            project = job.metadata.labels.get("project", "")
            if not job.status.conditions:
                logging.info("Project %s Job %s.%s missing condition", project,
                             job.metadata.namespace, job.metadata.name)
                continue

            last_condition = job.status.conditions[-1]
            if last_condition.type in ["Failed", "Complete"]:
                logging.info("Project %s Job %s.%s has condition %s", project,
                             job.metadata.namespace, job.metadata.name,
                             last_condition.type)
                done += 1
                if last_condition.type in ["Complete"]:
                    succeeded += 1

        logging.info("%s of %s jobs finished", done, len(jobs.items))
        logging.info("%s of %s jobs finished successfully", succeeded,
                     len(jobs.items))
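util.wait_for_jobs_with_label is not shown here. A minimal sketch of the same idea using the official client's label selectors; the polling interval and timeout are assumptions:

import datetime
import time

from kubernetes import client as k8s_client


def wait_for_jobs_with_label(api_client, namespace, label_filter,
                             timeout_minutes=60):
  """Poll until every job matching the label filter reaches a terminal condition."""
  batch_api = k8s_client.BatchV1Api(api_client)
  end_time = datetime.datetime.now() + datetime.timedelta(
      minutes=timeout_minutes)
  while datetime.datetime.now() < end_time:
    jobs = batch_api.list_namespaced_job(namespace,
                                         label_selector=label_filter)
    def is_done(job):
      conditions = job.status.conditions or []
      return any(c.type in ("Complete", "Failed") for c in conditions)
    if jobs.items and all(is_done(j) for j in jobs.items):
      return jobs
    time.sleep(30)
  raise TimeoutError("Timed out waiting for jobs with label " + label_filter)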
Example #24
    def run(self, tekton_cluster_info, current_cluster_info):
        """Kicks off all the Tekton pipelines.
    Args:
      tekton_cluster_info: ClusterInfo having the info to run pipelines on.
      Tekton runs on different cluster right now.
      current_cluster_info: Current cluster info.

    Returns:
      a list of UI urls.
    """
        urls = dict()
        try:
            # Currently only tekton tests run in kf-ci-v1.
            util.configure_kubectl(tekton_cluster_info.project,
                                   tekton_cluster_info.zone,
                                   tekton_cluster_info.cluster_name)
            # util.configure_kubectl(project, "us-east1-d", "kf-ci-v1")
            util.load_kube_config()

            for w in self.workflows:
                w.run()
                urls[w.name] = w.ui_url
                if w.teardown_runner:
                    urls[w.teardown_runner.name] = w.teardown_runner.ui_url
                logging.info("URL for workflow: %s", w.ui_url)
        except Exception as e:  # pylint: disable=broad-except
            logging.error(
                "Error when starting Tekton workflow: %s;\nstacktrace:\n%s", e,
                traceback.format_exc())
        finally:
            # Restore kubectl
            util.configure_kubectl(current_cluster_info.project,
                                   current_cluster_info.zone,
                                   current_cluster_info.cluster_name)
            util.load_kube_config()

        return urls
Example #25
def run(args, file_handler): # pylint: disable=too-many-statements,too-many-branches
  job_type = os.getenv("JOB_TYPE")
  repo_owner = os.getenv("REPO_OWNER")
  repo_name = os.getenv("REPO_NAME")
  pull_base_sha = os.getenv("PULL_BASE_SHA")

  # For presubmit/postsubmit jobs, find the list of files changed by the PR.
  diff_command = []
  if job_type == "presubmit":
    diff_command = ["git", "diff", "--name-only", "master"]
  elif job_type == "postsubmit":
    diff_command = ["git", "diff", "--name-only", pull_base_sha + "^", pull_base_sha]

  changed_files = []
  if job_type == "presubmit" or job_type == "postsubmit":
    changed_files = util.run(diff_command,
      cwd=os.path.join(args.repos_dir, repo_owner, repo_name)).splitlines()

  for f in changed_files:
    logging.info("File %s is modified.", f)

  if args.release:
    generate_env_from_head(args)
  workflows = []
  if args.config_file:
    workflows.extend(parse_config_file(args.config_file, args.repos_dir))

  create_started_file(args.bucket)

  util.maybe_activate_service_account()

  util.configure_kubectl(args.project, args.zone, args.cluster)
  util.load_kube_config()

  workflow_names = []
  ui_urls = {}

  for w in workflows:
    # Create the name for the workflow.
    # We truncate sha numbers to prevent the workflow name from being too
    # large. The workflow name should not be more than 63 characters because
    # it's used as a label on the pods.
    workflow_name = os.getenv("JOB_NAME") + "-" + w.name
    ks_cmd = get_ksonnet_cmd(w)

    # Print ksonnet version
    util.run([ks_cmd, "version"])

    # Skip this workflow if it is scoped to a different job type.
    if w.job_types and job_type not in w.job_types:
      logging.info("Skipping workflow %s because job type %s is not one of "
                   "%s.", w.name, job_type, w.job_types)
      continue

    # If we are scoping this workflow to specific directories, check whether
    # any of the modified files match the specified fnmatch patterns.
    dir_modified = False
    if w.include_dirs:
      for f in changed_files:
        for d in w.include_dirs:
          if fnmatch.fnmatch(f, d):
            dir_modified = True
            logging.info("Triggering workflow %s because %s in dir %s is modified.",
                         w.name, f, d)
            break
        if dir_modified:
          break

    # Only consider modified files when the job is pre or post submit, and if
    # the include_dirs stanza is defined.
    if job_type != "periodic" and w.include_dirs and not dir_modified:
      logging.info("Skipping workflow %s because no code modified in %s.",
                   w.name, w.include_dirs)
      continue

    if job_type == "presubmit":
      workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
      workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])

    elif job_type == "postsubmit":
      workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

    workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))

    salt = uuid.uuid4().hex[0:4]
    # Add some salt. This is mostly a convenience for the case where you
    # are submitting jobs manually for testing/debugging, since prow should
    # vend unique build numbers for each job.
    workflow_name += "-{0}".format(salt)

    workflow_names.append(workflow_name)
    # Create a new environment for this run
    env = workflow_name

    util.run([ks_cmd, "env", "add", env], cwd=w.app_dir)

    util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
              "name", workflow_name],
             cwd=w.app_dir)

    # Set the prow environment variables.
    prow_env = []

    names = ["JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
             "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER",
             "REPO_NAME"]
    names.sort()
    for v in names:
      if not os.getenv(v):
        continue
      prow_env.append("{0}={1}".format(v, os.getenv(v)))

    util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "prow_env",
             ",".join(prow_env)], cwd=w.app_dir)
    util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "namespace",
             get_namespace(args)], cwd=w.app_dir)
    util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "bucket",
             args.bucket], cwd=w.app_dir)
    if args.release:
      util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "versionTag",
                os.getenv("VERSION_TAG")], cwd=w.app_dir)

    # Set any extra params. We do this in alphabetical order to make it easier to verify in
    # the unittest.
    param_names = sorted(w.params.keys())
    for k in param_names:
      util.run([ks_cmd, "param", "set", "--env=" + env, w.component, k,
               "{0}".format(w.params[k])], cwd=w.app_dir)

    # For debugging print out the manifest
    util.run([ks_cmd, "show", env, "-c", w.component], cwd=w.app_dir)
    util.run([ks_cmd, "apply", env, "-c", w.component], cwd=w.app_dir)

    ui_url = ("http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
              "?tab=workflow".format(workflow_name))
    ui_urls[workflow_name] = ui_url
    logging.info("URL for workflow: %s", ui_url)

  success = True
  workflow_phase = {}
  try:
    results = argo_client.wait_for_workflows(get_namespace(args),
                                             workflow_names,
                                             timeout=datetime.timedelta(minutes=180),
                                             status_callback=argo_client.log_status)
    for r in results:
      phase = r.get("status", {}).get("phase")
      name = r.get("metadata", {}).get("name")
      workflow_phase[name] = phase
      if phase != "Succeeded":
        success = False
      logging.info("Workflow %s/%s finished phase: %s", get_namespace(args), name, phase)
  except util.TimeoutError:
    success = False
    logging.exception("Time out waiting for Workflows %s to finish", ",".join(workflow_names))
  except Exception as e:
    # We explicitly log any exceptions so that they will be captured in the
    # build-log.txt that is uploaded to Gubernator.
    logging.exception("Exception occurred: %s", e)
    raise
  finally:
    success = prow_artifacts.finalize_prow_job(args.bucket, success, workflow_phase, ui_urls)

    # Upload logs to GCS. No logs after this point will appear in the
    # file in gcs
    file_handler.flush()
    util.upload_file_to_gcs(
      file_handler.baseFilename,
      os.path.join(prow_artifacts.get_gcs_dir(args.bucket), "build-log.txt"))

  return success
Example #26
def test_kf_is_ready(namespace, use_basic_auth, use_istio):
    """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """

    logging.info("Using namespace %s", namespace)

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
        ])

    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    # Verify that components are actually deployed.
    # TODO(jlewi): We need to parameterize this list based on whether
    # we are using IAP or basic auth.
    deployment_names = [
        "argo-ui",
        "centraldashboard",
        "cloud-endpoints-controller",
        "jupyter-web-app-deployment",
        "metadata-db",
        "metadata-deployment",
        "metadata-ui",
        "ml-pipeline",
        "ml-pipeline-scheduledworkflow",
        "ml-pipeline-ui",
        "notebook-controller-deployment",
        "tf-job-operator",
        "pytorch-operator",
        "katib-controller",
        "workflow-controller",
    ]

    stateful_set_names = [
        "kfserving-controller-manager",
    ]

    ingress_related_deployments = []
    ingress_related_stateful_sets = []

    if use_basic_auth:
        deployment_names.extend(["basic-auth-login"])
        ingress_related_stateful_sets.extend(["backend-updater"])
    else:
        ingress_related_deployments.extend(["iap-enabler"])
        ingress_related_stateful_sets.extend(["backend-updater"])

    # TODO(jlewi): Might want to parallelize this.
    for deployment_name in deployment_names:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name, 10)

    ingress_namespace = "istio-system" if use_istio else namespace
    for deployment_name in ingress_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, ingress_namespace,
                                 deployment_name, 10)

    all_stateful_sets = [(namespace, name) for name in stateful_set_names]
    all_stateful_sets.extend([(ingress_namespace, name)
                              for name in ingress_related_stateful_sets])

    for ss_namespace, name in all_stateful_sets:
        logging.info("Verifying that stateful set %s.%s started...",
                     ss_namespace, name)
        try:
            util.wait_for_statefulset(api_client, ss_namespace, name)
        except Exception:
            # Collect debug information by running describe
            util.run([
                "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
            ])
            raise

    # TODO(jlewi): We should verify that the ingress is created and healthy.

    knative_namespace = "knative-serving"
    knative_related_deployments = [
        "activator",
        "autoscaler",
        "controller",
    ]
    for deployment_name in knative_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, knative_namespace,
                                 deployment_name, 10)
Example #27
def setup(args):
  """Test deploying Kubeflow."""
  if args.cluster:
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    logging.info("Using cluster: %s in project: %s in zone: %s",
                 cluster_name, project, zone)
    # Print out config to help debug issues with accounts and
    # credentials.
    util.run(["gcloud", "config", "list"])
    util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()
  else:
    # TODO(jlewi): This is sufficient for API access but it doesn't create
    # a kubeconfig file which ksonnet needs for ks init.
    logging.info("Running inside cluster.")
    incluster_config.load_incluster_config()

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  now = datetime.datetime.now()
  run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

  if not os.path.exists(args.test_dir):
    os.makedirs(args.test_dir)

  logging.info("Using test directory: %s", args.test_dir)

  namespace_name = run_label
  def run():
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = args.github_token

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run(["ks", "init", app_name,], cwd=args.test_dir)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry], cwd=app_dir)

    # Install required packages
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
      util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                          REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow
    util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
              "--namespace=" + namespace.metadata.name], cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand, even though we ran
    # configure_kubectl above, if we don't rerun it we get RBAC errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
      util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)

  main_case = test_util.TestCase()
  main_case.class_name = "KubeFlow"
  main_case.name = "deploy-kubeflow"
  try:
    test_util.wrap_test(run, main_case)
  finally:
    # Delete the namespace
    logging.info("Deleting namespace %s", namespace_name)

    # We report teardown as a separate test case because this will help
    # us track down issues with garbage collecting namespaces.
    teardown = test_util.TestCase(main_case.class_name, "teardown")
    def run_teardown():
      core_api = k8s_client.CoreV1Api(api_client)
      core_api.delete_namespace(namespace_name, {})

    try:
      test_util.wrap_test(run_teardown, teardown)
    except Exception as e:  # pylint: disable-msg=broad-except
      logging.error("There was a problem deleting namespace: %s; %s",
                    namespace_name, e.message)
    junit_path = os.path.join(args.artifacts_dir, "junit_kubeflow-deploy.xml")
    logging.info("Writing test results to %s", junit_path)
    test_util.create_junit_xml_file([main_case, teardown], junit_path)
Example #28
def main():
    parser = argparse.ArgumentParser(
        description='Label an image using Inception')
    parser.add_argument('-p',
                        '--port',
                        type=int,
                        default=9000,
                        help='Port at which Inception model is being served')
    parser.add_argument("--namespace",
                        required=True,
                        type=str,
                        help=("The namespace to use."))
    parser.add_argument("--service_name",
                        required=True,
                        type=str,
                        help=("The TF serving service to use."))
    parser.add_argument(
        "--artifacts_dir",
        default="",
        type=str,
        help="Directory to use for artifacts that should be preserved after "
        "the test runs. Defaults to test_dir if not set.")
    parser.add_argument("--input_path",
                        required=True,
                        type=str,
                        help=("The input file to use."))
    parser.add_argument("--result_path",
                        type=str,
                        help=("The expected result."))
    parser.add_argument("--workflow_name",
                        default="tfserving",
                        type=str,
                        help="The name of the workflow.")

    args = parser.parse_args()

    t = test_util.TestCase()
    t.class_name = "Kubeflow"
    t.name = args.workflow_name + "-" + args.service_name

    start = time.time()

    util.load_kube_config(persist_config=False)
    api_client = k8s_client.ApiClient()
    core_api = k8s_client.CoreV1Api(api_client)
    try:
        with open(args.input_path) as f:
            instances = json.loads(f.read())

        service = core_api.read_namespaced_service(args.service_name,
                                                   args.namespace)
        service_ip = service.spec.cluster_ip
        model_urls = [
            "http://" + service_ip +
            ":8500/v1/models/mnist:predict",  # tf serving's http server
        ]
        for model_url in model_urls:
            logging.info("Try predicting with endpoint {}".format(model_url))
            num_try = 1
            result = None
            while True:
                try:
                    result = requests.post(model_url, json=instances)
                    assert (result.status_code == 200)
                except Exception as e:
                    num_try += 1
                    if num_try > 10:
                        raise
                    logging.info(
                        'prediction failed: {}. Retrying...'.format(e))
                    time.sleep(5)
                else:
                    break
            logging.info('Got result: {}'.format(result.text))
            if args.result_path:
                with open(args.result_path) as f:
                    expected_result = json.loads(f.read())
                    logging.info('Expected result: {}'.format(expected_result))
                    assert (almost_equal(expected_result,
                                         json.loads(result.text)))
    except Exception as e:
        t.failure = "Test failed; " + e.message
        raise
    finally:
        t.time = time.time() - start
        junit_path = os.path.join(
            args.artifacts_dir,
            "junit_kubeflow-tf-serving-image-{}.xml".format(args.service_name))
        logging.info("Writing test results to %s", junit_path)
        test_util.create_junit_xml_file([t], junit_path)
        # Pause to collect Stackdriver logs.
        time.sleep(60)
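The almost_equal helper used above is not shown. A plausible sketch that compares parsed JSON structures while tolerating small float differences; the tolerance value is an assumption:

def almost_equal(expected, actual, tol=0.001):
    """Recursively compare JSON-like values, tolerating small float differences."""
    if isinstance(expected, dict) and isinstance(actual, dict):
        return (expected.keys() == actual.keys() and
                all(almost_equal(expected[k], actual[k], tol) for k in expected))
    if isinstance(expected, list) and isinstance(actual, list):
        return (len(expected) == len(actual) and
                all(almost_equal(e, a, tol) for e, a in zip(expected, actual)))
    if isinstance(expected, float) or isinstance(actual, float):
        return abs(expected - actual) <= tol
    return expected == actual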
Example #29
def test_kf_is_ready(namespace, use_basic_auth, use_istio, app_path):
    """Test that Kubeflow was successfully deployed.

  Args:
    namespace: The namespace Kubeflow is deployed to.
  """
    set_logging()
    logging.info("Using namespace %s", namespace)

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
        ])

    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    # Verify that components are actually deployed.
    # TODO(jlewi): We need to parameterize this list based on whether
    # we are using IAP or basic auth.
    # TODO(yanniszark): This list is incomplete and missing a lot of components.
    deployment_names = [
        "argo-ui",
        "centraldashboard",
        "jupyter-web-app-deployment",
        "minio",
        "ml-pipeline",
        "ml-pipeline-persistenceagent",
        "ml-pipeline-scheduledworkflow",
        "ml-pipeline-ui",
        "ml-pipeline-viewer-controller-deployment",
        "mysql",
        "notebook-controller-deployment",
        "profiles-deployment",
        "pytorch-operator",
        "tf-job-operator",
        "workflow-controller",
    ]

    stateful_set_names = []

    with open(os.path.join(app_path, "app.yaml")) as f:
        kfdef = yaml.safe_load(f)
    platform = kfdef["spec"]["platform"]

    ingress_related_deployments = [
        "istio-citadel",
        "istio-egressgateway",
        "istio-galley",
        "istio-ingressgateway",
        "istio-pilot",
        "istio-policy",
        "istio-sidecar-injector",
        "istio-telemetry",
        "istio-tracing",
        "kiali",
        "prometheus",
    ]
    ingress_related_stateful_sets = []

    knative_namespace = "knative-serving"
    knative_related_deployments = [
        "activator",
        "autoscaler",
        "controller",
    ]

    if platform == "gcp":
        deployment_names.extend(["cloud-endpoints-controller"])
        stateful_set_names.extend(["kfserving-controller-manager"])
        if use_basic_auth:
            deployment_names.extend(["basic-auth-login"])
            ingress_related_stateful_sets.extend(["backend-updater"])
        else:
            ingress_related_deployments.extend(["iap-enabler"])
            ingress_related_stateful_sets.extend(["backend-updater"])
    elif platform == "existing_arrikto":
        deployment_names.extend(["dex"])
        ingress_related_deployments.extend(["authservice"])
        knative_related_deployments = []

    # TODO(jlewi): Might want to parallelize this.
    for deployment_name in deployment_names:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, namespace, deployment_name, 10)

    ingress_namespace = "istio-system" if use_istio else namespace
    for deployment_name in ingress_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, ingress_namespace,
                                 deployment_name, 10)

    all_stateful_sets = [(namespace, name) for name in stateful_set_names]
    all_stateful_sets.extend([(ingress_namespace, name)
                              for name in ingress_related_stateful_sets])

    for ss_namespace, name in all_stateful_sets:
        logging.info("Verifying that stateful set %s.%s started...",
                     ss_namespace, name)
        try:
            util.wait_for_statefulset(api_client, ss_namespace, name)
        except Exception:
            # Collect debug information by running describe
            util.run([
                "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
            ])
            raise

    # TODO(jlewi): We should verify that the ingress is created and healthy.

    for deployment_name in knative_related_deployments:
        logging.info("Verifying that deployment %s started...",
                     deployment_name)
        util.wait_for_deployment(api_client, knative_namespace,
                                 deployment_name, 10)
Example #30
def run_papermill_job(
        notebook_path,
        name,
        namespace,  # pylint: disable=too-many-branches,too-many-statements
        repos,
        image):
    """Generate a K8s job to run a notebook using papermill

  Args:
    notebook_path: Path to the notebook. This should be in the form
      "{REPO_OWNER}/{REPO}/path/to/notebook.ipynb"
    name: Name for the K8s job
    namespace: The namespace where the job should run.
    repos: Which repos to checkout; if None or empty tries
      to infer based on PROW environment variables
    image: The docker image to run the notebook in.
  """

    util.maybe_activate_service_account()

    with open("job.yaml") as hf:
        job = yaml.safe_load(hf)

    if notebook_path.startswith("/"):
        raise ValueError(
            "notebook_path={0} should not start with /".format(notebook_path))

    # We need to checkout the correct version of the code
    # in presubmits and postsubmits. We should check the environment variables
    # for the prow environment variables to get the appropriate values.
    # We should probably also only do that if the
    # See
    # https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables
    if not repos:
        repos = argo_build_util.get_repo_from_prow_env()

    if not repos:
        raise ValueError("Could not get repos from prow environment variable "
                         "and --repos isn't explicitly set")

    repos += ",kubeflow/testing@HEAD"

    logging.info("Repos set to %s", repos)
    job["spec"]["template"]["spec"]["initContainers"][0]["command"] = [
        "/usr/local/bin/checkout_repos.sh",
        "--repos=" + repos,
        "--src_dir=/src",
        "--depth=all",
    ]

    job["spec"]["template"]["spec"]["containers"][0]["image"] = image

    full_notebook_path = os.path.join("/src", notebook_path)
    job["spec"]["template"]["spec"]["containers"][0]["command"] = [
        "python3", "-m", "kubeflow.examples.notebook_tests.execute_notebook",
        "--notebook_path", full_notebook_path
    ]

    job["spec"]["template"]["spec"]["containers"][0][
        "workingDir"] = os.path.dirname(full_notebook_path)

    # The prow bucket to use for results/artifacts
    prow_bucket = prow_artifacts.PROW_RESULTS_BUCKET

    if os.getenv("REPO_OWNER") and os.getenv("REPO_NAME"):
        # Running under prow
        prow_dir = prow_artifacts.get_gcs_dir(prow_bucket)
        logging.info("Prow artifacts dir: %s", prow_dir)
        prow_dir = os.path.join(prow_dir, "artifacts")

        if os.getenv("TEST_TARGET_NAME"):
            prow_dir = os.path.join(prow_dir,
                                    os.getenv("TEST_TARGET_NAME").lstrip("/"))
        prow_bucket, prow_path = util.split_gcs_uri(prow_dir)

    else:
        prow_path = "notebook-test" + datetime.datetime.now().strftime(
            "%H%M%S")
        prow_path = prow_path + "-" + uuid.uuid4().hex[0:3]
        prow_dir = util.to_gcs_uri(prow_bucket, prow_path)

    prow_path = os.path.join(prow_path, name + ".html")
    output_gcs = util.to_gcs_uri(NB_BUCKET, prow_path)

    job["spec"]["template"]["spec"]["containers"][0]["env"] = [
        {
            "name": "OUTPUT_GCS",
            "value": output_gcs
        },
        {
            "name": "PYTHONPATH",
            "value": "/src/kubeflow/testing/py:/src/kubeflow/examples/py"
        },
    ]

    logging.info("Notebook will be written to %s", output_gcs)
    util.load_kube_config(persist_config=False)

    if name:
        job["metadata"]["name"] = name
    else:
        job["metadata"]["name"] = ("notebook-test-" +
                                   datetime.datetime.now().strftime("%H%M%S") +
                                   "-" + uuid.uuid4().hex[0:3])
    name = job["metadata"]["name"]

    job["metadata"]["namespace"] = namespace

    # Create an API client object to talk to the K8s master.
    api_client = k8s_client.ApiClient()
    batch_api = k8s_client.BatchV1Api(api_client)

    logging.info("Creating job:\n%s", yaml.dump(job))
    actual_job = batch_api.create_namespaced_job(job["metadata"]["namespace"],
                                                 job)
    logging.info("Created job %s.%s:\n%s", namespace, name,
                 yaml.safe_dump(actual_job.to_dict()))

    final_job = util.wait_for_job(api_client,
                                  namespace,
                                  name,
                                  timeout=datetime.timedelta(minutes=30))

    logging.info("Final job:\n%s", yaml.safe_dump(final_job.to_dict()))

    # Download notebook html to artifacts
    logging.info("Copying %s to bucket %s", output_gcs, prow_bucket)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(NB_BUCKET)
    blob = bucket.get_blob(prow_path)

    destination_bucket = storage_client.get_bucket(prow_bucket)
    bucket.copy_blob(blob, destination_bucket)

    if not final_job.status.conditions:
        raise RuntimeError("Job {0}.{1}; did not complete".format(
            namespace, name))

    last_condition = final_job.status.conditions[-1]

    if last_condition.type not in ["Complete"]:
        logging.error("Job didn't complete successfully")
        raise RuntimeError("Job {0}.{1} failed".format(namespace, name))
Example #31
def run_test(args):  # pylint: disable=too-many-branches,too-many-statements
    """Run a test."""
    gcs_client = storage.Client(project=args.project)
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    # TODO(jlewi): When using GKE we should copy the .kube config and any other
    # files to the test directory. We should then set the environment variable
    # KUBECONFIG to point at that file. This should prevent us from having
    # to rerun util.configure_kubectl on each step. Instead we could run it once
    # as part of GKE cluster creation and store the config in the NFS directory.
    # This would make the handling of credentials
    # and KUBECONFIG more consistent between GKE and minikube and eventually
    # this could be extended to other K8s deployments.
    if cluster_name:
        util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()

    api_client = k8s_client.ApiClient()
    master_host = api_client.configuration.host

    t = test_util.TestCase()
    t.class_name = "tfjob_test"
    namespace, name, env = _setup_ks_app(args)
    t.name = os.path.basename(name)

    start = time.time()

    try:  # pylint: disable=too-many-nested-blocks
        # We repeat the test multiple times.
        # This ensures that if we delete the job we can create a new job with the
        # same name.

        # TODO(jlewi): We should make this an argument.
        num_trials = 2

        for trial in range(num_trials):
            logging.info("Trial %s", trial)
            util.run(["ks", "apply", env, "-c", args.component],
                     cwd=args.app_dir)

            logging.info("Created job %s in namespaces %s", name, namespace)
            logging.info("tfjob_version=%s", args.tfjob_version)
            # Wait for the job to either be in Running state or a terminal state
            if args.tfjob_version == "v1alpha1":
                logging.info("Wait for Phase Running, Done, or Failed")
                results = tf_job_client.wait_for_phase(
                    api_client,
                    namespace,
                    name, ["Running", "Done", "Failed"],
                    status_callback=tf_job_client.log_status)
            else:
                logging.info(
                    "Wait for conditions Running, Succeeded, or Failed")
                results = tf_job_client.wait_for_condition(
                    api_client,
                    namespace,
                    name, ["Running", "Succeeded", "Failed"],
                    status_callback=tf_job_client.log_status)

            logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

            # The job is now either running or done.
            if args.shutdown_policy:
                logging.info("Enforcing shutdownPolicy %s",
                             args.shutdown_policy)
                if args.shutdown_policy in ["master", "chief"]:
                    if args.tfjob_version == "v1alpha1":
                        replica = "master"
                    else:
                        replica = "chief"
                elif args.shutdown_policy in ["worker", "all_workers"]:
                    replica = "worker"
                else:
                    raise ValueError("Unrecognized shutdown_policy "
                                     "%s" % args.shutdown_policy)

                # Number of targets.
                num_targets = 1
                if args.shutdown_policy in ["all_workers"]:
                    # Assume v1alpha2
                    num_targets = results.get("spec", {}).get(
                        "tfReplicaSpecs", {}).get("Worker",
                                                  {}).get("replicas", 0)
                    logging.info("There are %s worker replicas", num_targets)

                if args.tfjob_version == "v1alpha1":
                    runtime_id = results.get("spec", {}).get("RuntimeId")
                    target = "{name}-{replica}-{runtime}".format(
                        name=name, replica=replica, runtime=runtime_id)
                    pod_labels = get_labels(name, runtime_id)
                    pod_selector = to_selector(pod_labels)
                else:
                    target = "{name}-{replica}".format(name=name,
                                                       replica=replica)
                    pod_labels = get_labels_v1alpha2(namespace, name)
                    pod_selector = to_selector(pod_labels)

                # Wait for the pods to be ready before we shut them down.
                # TODO(jlewi): We get pods using a label selector, so there is
                # a risk that the pod we actually care about isn't present.
                logging.info(
                    "Waiting for pods to be running before shutting down.")
                wait_for_pods_to_be_in_phases(
                    api_client,
                    namespace,
                    pod_selector, ["Running"],
                    timeout=datetime.timedelta(minutes=4))
                logging.info("Pods are ready")
                logging.info("Issuing the terminate request")
                for num in range(num_targets):
                    full_target = target + "-{0}".format(num)
                    terminateReplica(master_host, namespace, full_target)

            logging.info("Waiting for job to finish.")
            results = tf_job_client.wait_for_job(
                api_client,
                namespace,
                name,
                args.tfjob_version,
                status_callback=tf_job_client.log_status)

            if args.tfjob_version == "v1alpha1":
                if results.get("status", {}).get("state",
                                                 {}).lower() != "succeeded":
                    t.failure = "Trial {0} Job {1} in namespace {2} in state {3}".format(
                        trial, name, namespace,
                        results.get("status", {}).get("state", None))
                    logging.error(t.failure)
                    break
            else:
                # For v1alpha2 check that the last condition is Succeeded.
                conditions = results.get("status", {}).get("conditions", [])
                last_condition = conditions[-1] if conditions else {}
                if last_condition.get("type", "").lower() != "succeeded":
                    t.failure = "Trial {0} Job {1} in namespace {2} in status {3}".format(
                        trial, name, namespace, results.get("status", {}))
                    logging.error(t.failure)
                    break

            runtime_id = results.get("spec", {}).get("RuntimeId")
            logging.info("Trial %s Job %s in namespace %s runtime ID %s",
                         trial, name, namespace, runtime_id)

            uid = results.get("metadata", {}).get("uid")
            events = get_events(api_client, namespace, uid)
            # Print out the K8s events because they can be useful for
            # debugging.
            for e in events:
                logging.info("Received K8s event:\n%s", e)
            created_pods, created_services = parse_events(events)

            num_expected = 0
            if args.tfjob_version == "v1alpha1":
                for replica in results.get("spec", {}).get("replicaSpecs", []):
                    num_expected += replica.get("replicas", 0)
            else:
                for replicakey in results.get("spec",
                                              {}).get("tfReplicaSpecs", {}):
                    replica_spec = results.get("spec",
                                               {}).get("tfReplicaSpecs",
                                                       {}).get(replicakey, {})
                    if replica_spec:
                        num_expected += replica_spec.get("replicas", 1)

            creation_failures = []
            if len(created_pods) != num_expected:
                message = ("Expected {0} pods to be created but only "
                           "got {1} create events.").format(
                               num_expected, len(created_pods))
                creation_failures.append(message)

            if len(created_services) != num_expected:
                message = ("Expected {0} services to be created but only "
                           "got {1} create events.").format(
                               num_expected, len(created_services))
                creation_failures.append(message)

            if creation_failures:
                # TODO(jlewi): Starting with
                # https://github.com/kubeflow/tf-operator/pull/646 the number
                # of events no longer seems to match the expected count; it
                # looks like events may be getting combined. For now we just
                # log a warning rather than an error.
                logging.warning("Creation failures: %s", creation_failures)
            if args.tfjob_version == "v1alpha1":
                pod_labels = get_labels(name, runtime_id)
                pod_selector = to_selector(pod_labels)
            else:
                pod_labels = get_labels_v1alpha2(namespace, name)
                pod_selector = to_selector(pod_labels)

            # We don't wait for pods to be deleted in v1alpha2 because CleanPodPolicy
            # means completed pods won't be deleted.
            # TODO(jlewi): We should add a test to deal with deleted pods.
            if args.tfjob_version == "v1alpha1":
                wait_for_pods_to_be_deleted(api_client, namespace,
                                            pod_selector)

            tf_job_client.delete_tf_job(api_client,
                                        namespace,
                                        name,
                                        version=args.tfjob_version)

            logging.info("Waiting for job %s in namespaces %s to be deleted.",
                         name, namespace)
            wait_for_delete(api_client,
                            namespace,
                            name,
                            args.tfjob_version,
                            status_callback=tf_job_client.log_status)

        # TODO(jlewi):
        #  Here are some validation checks to run:
        #  1. Check that all resources are garbage collected.
        # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
        # TODO(jlewi): Are there other generic validation checks we should
        # run?
    except util.TimeoutError:
        t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
            name, namespace)
        logging.exception(t.failure)
    except Exception as e:  # pylint: disable-msg=broad-except
        # TODO(jlewi): I'm observing flakes where the exception has message
        # "status"; in an effort to nail down this exception we print out more
        # information about it.
        logging.exception("There was a problem running the job; Exception %s",
                          e)
        # We want to catch all exceptions because we want to mark the test as
        # failed.
        t.failure = "Exception occurred; type {0} message {1}".format(
            e.__class__, e)
    finally:
        t.time = time.time() - start
        if args.junit_path:
            test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
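
The shutdown path in run_test depends on wait_for_pods_to_be_in_phases, which
the example only calls. A rough sketch of what such a helper might look like,
assuming the standard kubernetes client; the name matches the call above but
the body is an illustration, not the real kubeflow test utility.

import datetime
import time

from kubernetes import client as k8s_client


def wait_for_pods_to_be_in_phases(api_client, namespace, pod_selector, phases,
                                  timeout=datetime.timedelta(minutes=4)):
    """Block until every pod matching pod_selector is in one of phases."""
    core_api = k8s_client.CoreV1Api(api_client)
    deadline = datetime.datetime.now() + timeout
    while True:
        pods = core_api.list_namespaced_pod(namespace,
                                            label_selector=pod_selector)
        # Require at least one matching pod so an empty selector result
        # doesn't count as success.
        if pods.items and all(p.status.phase in phases for p in pods.items):
            return pods
        if datetime.datetime.now() > deadline:
            raise RuntimeError(
                "Timeout waiting for pods matching {0} to reach {1}".format(
                    pod_selector, phases))
        time.sleep(10)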