def ks_deploy(app_dir, component, params, env=None, account=None,
              namespace=None):
    """Deploy the specified ksonnet component.

    Args:
      app_dir: The ksonnet directory.
      component: Name of the component to deploy.
      params: A dictionary of parameters to set; can be empty but must not
        be None.
      env: (Optional) The environment to use. If none is specified a unique
        name of the form "e2e-<MMDD>-<HHMM>-<4 hex chars>" is generated. The
        environment is always registered with "ks env add".
      account: (Optional) The account to use; passed to "ks apply" as --as.
      namespace: (Optional) The namespace to use when adding the environment.

    Raises:
      ValueError: If input arguments aren't valid.
    """
    if not component:
        raise ValueError("component can't be None.")
    if params is None:
        # Fail before any ks command runs rather than part-way through.
        raise ValueError("params can't be None.")

    # TODO(jlewi): It might be better if the test creates the app and uses
    # the latest stable release of the ksonnet configs. That however will
    # cause problems when we make changes to the TFJob operator that require
    # changes to the ksonnet configs. One advantage of checking in the app is
    # that we can modify the files in vendor if needed so that changes to the
    # code and config can be submitted in the same pr.
    if not env:
        now = datetime.datetime.now()
        env = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

    logging.info("Using app directory: %s", app_dir)

    add_env_command = ["ks", "env", "add", env]
    if namespace:
        add_env_command.append("--namespace=" + namespace)
    util.run(add_env_command, cwd=app_dir)

    # items() works on Python 2 and 3; iteritems() exists only on Python 2.
    for k, v in params.items():
        util.run(["ks", "param", "set", "--env=" + env, component, k, v],
                 cwd=app_dir)

    apply_command = ["ks", "apply", env, "-c", component]
    if account:
        apply_command.append("--as=" + account)
    util.run(apply_command, cwd=app_dir)
def generate_env_from_head(args):
    """Fill in missing CI environment variables from the repo's HEAD commit.

    Runs "git rev-parse HEAD" in <args.repos_dir>/$REPO_OWNER/$REPO_NAME and
    derives PULL_BASE_SHA, BUILD_NUMBER and VERSION_TAG from the result.
    A variable is only written to os.environ when it is currently unset or
    empty.

    Args:
      args: Object with a repos_dir attribute.
    """
    repo_dir = os.path.join(args.repos_dir,
                            os.getenv("REPO_OWNER"),
                            os.getenv("REPO_NAME"))
    head = util.run(["git", "rev-parse", "HEAD"], cwd=repo_dir)
    short_sha = head[0:8]
    today = datetime.datetime.now().strftime("%Y%m%d")

    defaults = {
        "PULL_BASE_SHA": short_sha,
        "BUILD_NUMBER": uuid.uuid4().hex[0:4],
        "VERSION_TAG": "v{0}-{1}".format(today, short_sha),
    }
    for name, value in defaults.items():
        # Values already provided by the environment take precedence.
        if not os.getenv(name):
            os.environ[name] = value
def setup_kubeflow_ks_app(args, api_client):
    """Create a ksonnet app for Kubeflow.

    Initializes a ksonnet app named "kubeflow-test" under args.test_dir,
    adds the kubeflow registry, installs the required packages and then
    replaces vendor/kubeflow with a symlink to the checked out source.

    Args:
      args: Parsed arguments; test_dir, namespace and github_token are read.
      api_client: Kubernetes API client passed to _setup_test.

    Returns:
      The path of the created ksonnet app directory.
    """
    if not os.path.exists(args.test_dir):
        os.makedirs(args.test_dir)

    logging.info("Using test directory: %s", args.test_dir)

    namespace_name = args.namespace
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)

    if args.github_token:
        # Never write the token itself to the log; it is a credential.
        logging.info("Setting GITHUB_TOKEN from --github_token.")
        # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
        # see: https://github.com/ksonnet/ksonnet/issues/233
        os.environ["GITHUB_TOKEN"] = args.github_token

    if not os.getenv("GITHUB_TOKEN"):
        logging.warning("GITHUB_TOKEN not set; you will probably hit Github "
                        "API limits.")

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run(["ks", "init", app_name], cwd=args.test_dir)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
             cwd=app_dir)

    # Install required packages
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]
    for p in packages:
        util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                          REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    return app_dir
def run(test_files_dirs, flake8_path_args, test_case):
    """Run flake8 on every eligible file under the given directories.

    Each directory is walked recursively. For every file accepted by
    should_test, flake8 is run from the file's directory and its output is
    parsed as JSON to decide whether the check passed.

    Args:
      test_files_dirs: Iterable of directories to walk.
      flake8_path_args: Not used by this function; the flake8 options come
        from the module-level FLAKE8_OPTS.
      test_case: Object whose add_failure_info(msg) is called once for each
        file that fails.
    """
    # Go through each Python file in test_files_dirs and run flake8
    for test_files_dir in test_files_dirs:
        for root, _, files in os.walk(test_files_dir):
            for test_file in files:
                full_path = os.path.join(root, test_file)
                if not should_test(full_path):
                    continue

                logging.info("Testing: %s", test_file)
                try:
                    output = util.run(['flake8', full_path] + FLAKE8_OPTS,
                                      cwd=root)
                    try:
                        parsed = json.loads(output)
                    except (AttributeError, TypeError, ValueError):
                        # ValueError covers malformed JSON; TypeError and
                        # AttributeError cover non-string output.
                        logging.error(
                            "Output of flake8 could not be parsed as json; "
                            "output: %s", output)
                        parsed = {}

                    if not hasattr(parsed, "get"):
                        # Legacy style tests emit true rather than a json
                        # object. Parsing the string as json converts it to
                        # a bool so we just use parsed as test_passed.
                        # Old style tests actually use std.assert so flake8
                        # will actually return an error in the case the test
                        # did not pass.
                        logging.warning(
                            "flake8 is using old style and not emitting an object. "
                            "Result was: %s. Output will be treated as a boolean",
                            output)
                        test_passed = parsed
                    else:
                        test_passed = parsed.get("pass", False)

                    if not test_passed:
                        msg = '{} test failed'.format(test_file)
                        test_case.add_failure_info(msg)
                        logging.error(
                            '%s. See Subprocess output for details.', msg)
                except Exception as e:  # pylint: disable=broad-except
                    # Keep going so that every file gets checked.
                    msg = '{} test failed'.format(test_file)
                    test_case.add_failure_info(msg)
                    logging.error('%s with exception %s. See Subprocess '
                                  'output for details.', msg, e)
def run(test_files_dirs, jsonnet_path_args, test_case):
    """Evaluate every eligible jsonnet test file and record failures.

    Each directory is walked recursively. For every file accepted by
    should_test, "jsonnet eval" is run from the file's directory. The output
    is parsed as JSON: an object passes when its "pass" key is truthy, and
    any other value (legacy style) is used directly as the result.

    Args:
      test_files_dirs: Iterable of directories to walk.
      jsonnet_path_args: Extra arguments appended to the jsonnet command.
      test_case: Object whose add_failure_info(msg) is called once for each
        file that fails.
    """
    # Go through each jsonnet file in test_files_dirs and run jsonnet eval
    for test_files_dir in test_files_dirs:
        for root, _, files in os.walk(test_files_dir):
            for test_file in files:
                full_path = os.path.join(root, test_file)
                if not should_test(full_path):
                    continue

                logging.info("Testing: %s", test_file)
                try:
                    output = util.run(
                        ['jsonnet', 'eval', full_path] + jsonnet_path_args,
                        cwd=os.path.dirname(full_path))
                    try:
                        parsed = json.loads(output)
                    except (AttributeError, TypeError, ValueError):
                        # ValueError covers malformed JSON; TypeError and
                        # AttributeError cover non-string output.
                        logging.error(
                            "Output of jsonnet eval could not be parsed as json; "
                            "output: %s", output)
                        parsed = {}

                    if not hasattr(parsed, "get"):
                        # Legacy style tests emit true rather than a json
                        # object. Parsing the string as json converts it to
                        # a bool so we just use parsed as test_passed.
                        # Old style tests actually use std.assert so jsonnet
                        # eval will actually return an error in the case the
                        # test didn't pass.
                        logging.warning(
                            "jsonnet test is using old style and not emitting an object. "
                            "Result was: %s. Output will be treated as a boolean",
                            output)
                        test_passed = parsed
                    else:
                        # Must be the Python constant False; a bare `false`
                        # is a NameError and failed every object-style test.
                        test_passed = parsed.get("pass", False)

                    if not test_passed:
                        test_case.add_failure_info(
                            '{} test failed'.format(test_file))
                        logging.error(
                            '%s test failed. See Subprocess output for details.',
                            test_file)
                except Exception as e:  # pylint: disable=broad-except
                    # Keep going so that every file gets evaluated.
                    test_case.add_failure_info(
                        '{} test failed'.format(test_file))
                    logging.error('%s test failed with exception %s. '
                                  'See Subprocess output for details.',
                                  test_file, e)
def test_profiles():
    """Apply the sample profile, check its namespace/RBAC, then delete it.

    The test applies sample_profile.yaml from this file's directory, checks
    that the namespace "john", the "edit" role and the "default" role
    binding can be read, deletes the profile and finally expects reading the
    namespace to raise ApiException.
    """
    key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if key_file:
        logging.info("Activate service account")
        util.run(["gcloud", "auth", "activate-service-account",
                  "--key-file=" + key_file])

    # util.load_kube_config appears to hang on python3
    kube_config.load_kube_config()
    api_client = k8s_client.ApiClient()

    manifest_dir = os.path.dirname(__file__)
    util.run(["kubectl", "apply", "-f", "sample_profile.yaml"],
             cwd=manifest_dir)

    # TODO: check CR status/condition (e.g. wait for "Ready" via
    # util.wait_for_cr_condition) instead of sleeping.
    time.sleep(10)

    # The name of the profile, also the new namespace's name.
    profile_name = "john"

    # Verifies the namespace is created.
    core_api = k8s_client.CoreV1Api(api_client)
    with_backoff = retry(
        # wait 2^i * 1000 ms, on the i-th retry
        wait_exponential_multiplier=1000,
        # 60 sec max
        wait_exponential_max=60000,
    )
    read_namespace_with_retry = with_backoff(core_api.read_namespace)
    found_namespace = read_namespace_with_retry(profile_name)
    logging.info("found namespace: %s", found_namespace)

    rbac_api = k8s_client.RbacAuthorizationV1Api(api_client)
    role = rbac_api.read_namespaced_role("edit", profile_name)
    logging.info("role: %s", role)
    binding = rbac_api.read_namespaced_role_binding("default", profile_name)
    logging.info("role binding: %s", binding)

    # delete the profile and make sure namespace is deleted
    util.run(["kubectl", "delete", "-f", "sample_profile.yaml"],
             cwd=manifest_dir)
    time.sleep(15)

    with pytest.raises(ApiException) as exc_info:
        core_api.read_namespace(profile_name)
    logging.info("exception info: %s", exc_info)
def _check_if_pr_exists(self, commit=None):
    """Check if a PR is already open.

    Lists the PRs of the repository in self.manifests_repo_dir with the hub
    CLI and looks for one whose title equals self._pr_title(commit).

    Args:
      commit: (Optional) The commit to look for; defaults to
        self.last_commit.

    Returns:
      exists: True if a PR updating the image to the specified commit
        already exists and false otherwise.
    """
    # TODO(jlewi): Modeled on
    # https://github.com/kubeflow/examples/blob/master/code_search/docker/ks/update_index.sh
    # TODO(jlewi): We should use the GitHub API and check if there is an
    # existing open pull request. Or potentially just use the hub CLI.
    if not commit:
        commit = self.last_commit
        logging.info("No commit specified defaulting to %s", commit)

    wanted_title = self._pr_title(commit)

    # See hub conventions:
    # https://hub.github.com/hub.1.html
    # The GitHub repository is determined automatically based on the name
    # of remote repositories
    listing = util.run(["hub", "pr", "list", "--format=%U;%t\n"],
                       cwd=self.manifests_repo_dir)

    # Each line is "<url>;<title>"; index the URLs by title (last one wins).
    url_by_title = {}
    for entry in listing.splitlines():
        url, title = entry.split(";", 1)
        url_by_title[title] = url

    if wanted_title not in url_by_title:
        return False

    logging.info(
        "PR %s already exists to update the Jupyter web app image "
        "to %s", url_by_title[wanted_title], commit)
    return True
def test_serve(self):
    """Deploy the serving component with kustomize and wait for it.

    Installs kustomize, applies every "key=value" pair in self.params
    (comma separated) to the kustomization in self.app_dir, pipes the build
    output to "kubectl apply" and then waits for the deployment self.name in
    self.namespace.
    """
    # We repeat the test multiple times.
    # This ensures that if we delete the job we can create a new job with the
    # same name.
    api_client = k8s_client.ApiClient()

    # TODO (jinchihe) below code will be removed once new test-worker image
    # is published in https://github.com/kubeflow/testing/issues/373.
    kustomize_url = ('https://github.com/kubernetes-sigs/kustomize/'
                     'releases/download/v2.0.3/kustomize_2.0.3_linux_amd64')
    kustomize_bin = '/usr/local/bin/kustomize'
    util.run(['wget', '-O', kustomize_bin, kustomize_url], cwd=self.app_dir)
    util.run(['chmod', 'a+x', kustomize_bin], cwd=self.app_dir)

    # Apply the components
    configmap = 'mnist-map-serving'
    for setting in self.params.split(","):
        key, value = setting.split("=", 1)
        if key == "namespace":
            edit_cmd = ['kustomize', 'edit', 'set', key, value]
        else:
            edit_cmd = ['kustomize', 'edit', 'add', 'configmap', configmap,
                        '--from-literal=' + key + '=' + value]
        util.run(edit_cmd, cwd=self.app_dir)

    # Seems the util.run cannot handle pipes case, using check_call.
    pipeline = 'kustomize build ' + self.app_dir + '| kubectl apply -f -'
    subprocess.check_call(pipeline, shell=True)

    util.wait_for_deployment(api_client, self.namespace, self.name,
                             timeout_minutes=4)
def deploy_with_kfctl_go(kfctl_path, args, app_dir, env):
    """Deploy Kubeflow using kfctl go binary.

    Runs "kfctl init", "kfctl generate all" and "kfctl apply all" in turn.

    Args:
      kfctl_path: Path of the kfctl binary.
      args: Parsed arguments; project and zone are read.
      app_dir: Directory of the Kubeflow app to create.
      env: Environment passed to every kfctl invocation.

    Raises:
      ValueError: If the gcloud account cannot be determined.
    """
    # username and password are passed as env vars and won't appear in the
    # logs
    #
    # TODO(https://github.com/kubeflow/kubeflow/issues/2831): We should be
    # loading the config in the repo we have checked out kfctl doesn't
    # support specifying a file URI. Once it does we should change --version
    # to use it.
    #
    # TODO(zhenghuiwang): use the master of kubeflow/manifests once
    # https://github.com/kubeflow/kubeflow/issues/3475 is fixed.
    logging.warning("Loading configs from master.")
    init_cmd = [
        kfctl_path, "init", app_dir, "-V",
        "--platform=gcp",
        "--version=master",
        "--package-manager=kustomize",
        "--skip-init-gcp-project",
        "--disable_usage_report",
        "--use_istio",
        "--project=" + args.project,
    ]
    util.run(init_cmd, env=env)

    # We need to specify a valid email because
    #  1. We need to create appropriate RBAC rules to allow the current user
    #     to create the required K8s resources.
    #  2. Setting the IAM policy will fail if the email is invalid.
    # TODO(jlewi): kfctl should eventually do this automatically.
    email = util.run(["gcloud", "config", "get-value", "account"])
    if not email:
        raise ValueError("Could not determine GCP account being used.")

    generate_cmd = [kfctl_path, "generate", "-V", "all",
                    "--email=" + email, "--zone=" + args.zone]
    util.run(generate_cmd, env=env, cwd=app_dir)

    apply_cmd = [kfctl_path, "apply", "-V", "all"]
    util.run(apply_cmd, env=env, cwd=app_dir)
def _gcloud_list():
    """Run "gcloud config list" followed by "gcloud auth list"."""
    # For debugging purposes output the command
    for group in ("config", "auth"):
        util.run(["gcloud", group, "list"])
def deploy_with_kfctl_go(kfctl_path, args, app_dir, env, labels=None):  # pylint: disable=too-many-branches
    """Deploy Kubeflow using kfctl go binary.

    Loads the KFDef config named by args.kfctl_config (URL or local path),
    rewrites it for the target project, writes it out and runs kfctl with
    it. Optionally configures a self signed certificate on the ingress.

    Args:
      kfctl_path: Path of the kfctl binary.
      args: Parsed arguments; kfctl_config, email, project, zone,
        setup_project and use_self_cert are read.
      app_dir: Directory of the Kubeflow app; its basename is used as the
        deployment name.
      env: Environment passed to every kfctl invocation.
      labels: (Optional) Dictionary of labels merged into the KFDef
        metadata.

    Raises:
      ValueError: If no email is given and none can be read from gcloud.
    """
    # username and password are passed as env vars and won't appear in the
    # logs
    #
    # We need to edit and rewrite the config file to the app dir because
    # kfctl uses the path of the config file as the app dir.
    logging.warning("Loading configs %s.", args.kfctl_config)

    if args.kfctl_config.startswith("http"):
        response = requests.get(args.kfctl_config)
        raw_config = response.content
    else:
        with open(args.kfctl_config) as hf:
            raw_config = hf.read()

    # NOTE(review): this used to be yaml.load() without a Loader, which can
    # construct arbitrary objects from (possibly remote) input and is
    # rejected by recent PyYAML releases. safe_load is assumed to be enough
    # for a KFDef config - confirm no config relies on custom tags.
    config_spec = yaml.safe_load(raw_config)

    # We need to specify a valid email because
    #  1. We need to create appropriate RBAC rules to allow the current user
    #     to create the required K8s resources.
    #  2. Setting the IAM policy will fail if the email is invalid.
    email = args.email

    if not email:
        logging.info("email not set trying to get default from gcloud")
        email = util.run(["gcloud", "auth", "list",
                          "--filter", "status:ACTIVE",
                          "--format", "value(account)"])

    if not email:
        raise ValueError("Could not determine GCP account being used.")

    kfdef_version = config_spec["apiVersion"].strip().lower()

    if kfdef_version == KFDEF_V1ALPHA1:
        config_spec = build_v06_spec(config_spec, args.project, email,
                                     args.zone, args.setup_project)
    else:
        config_spec = build_v07_spec(config_spec, args.project, email,
                                     args.zone, args.setup_project)

    config_spec["spec"] = util.filter_spartakus(config_spec["spec"])

    # Remove name because we will auto infer from directory.
    if "name" in config_spec["metadata"]:
        logging.info("Deleting name in kfdef spec.")
        del config_spec["metadata"]["name"]

    app_name = os.path.basename(app_dir)
    if "labels" not in config_spec["metadata"]:
        config_spec["metadata"]["labels"] = {}

    if labels:
        config_spec["metadata"]["labels"].update(labels)

    logging.info("KFDefSpec:\n%s", yaml.safe_dump(config_spec))

    if kfdef_version == KFDEF_V1ALPHA1:
        logging.info("Deploying using v06 syntax")

        # The arguments follow the placeholders: deployment, then project.
        logging.info("Checking if deployment %s already exists in project %s",
                     app_name, args.project)

        if check_if_kfapp_exists(args.project, app_name, args.zone):
            # With v0.6 kfctl can't successfully run apply a 2nd time so if
            # the deployment already exists we can't redeploy.
            logging.info("Deployment %s already exists in project %s; not "
                         "redeploying", app_name, args.project)
            return

        # Text mode: yaml.dump writes str unless an encoding is given, which
        # fails against the default binary-mode temp file on Python 3.
        # The file is deliberately kept (delete=False) so kfctl can read it.
        with tempfile.NamedTemporaryFile(mode="w", prefix="tmpkf_config",
                                         suffix=".yaml",
                                         delete=False) as hf:
            config_file = hf.name
            logging.info("Writing file %s", config_file)
            yaml.dump(config_spec, hf)

        util.run([kfctl_path, "init", app_dir, "-V",
                  "--config=" + config_file], env=env)

        util.run([kfctl_path, "generate", "-V", "all"], env=env, cwd=app_dir)

        util.run([kfctl_path, "apply", "-V", "all"], env=env, cwd=app_dir)
    else:
        logging.info("Deploying using v07 syntax")

        if not os.path.exists(app_dir):
            logging.info("Creating app dir %s", app_dir)
            os.makedirs(app_dir)

        config_file = os.path.join(app_dir, "kf_config.yaml")
        with open(config_file, "w") as hf:
            logging.info("Writing file %s", config_file)
            yaml.dump(config_spec, hf)

        util.run([kfctl_path, "apply", "-V", "-f", config_file], env=env)

    # We will hit lets encrypt rate limiting with the managed certificates
    # So create a self signed certificate and update the ingress to use it.
    if args.use_self_cert:
        logging.info("Configuring self signed certificate")

        util.load_kube_credentials()

        api_client = k8s_client.ApiClient()
        ingress_namespace = "istio-system"
        ingress_name = "envoy-ingress"
        tls_endpoint = "{0}.endpoints.{1}.cloud.goog".format(app_name,
                                                             args.project)
        logging.info("Configuring self signed cert for %s", tls_endpoint)
        util.use_self_signed_for_ingress(ingress_namespace, ingress_name,
                                         tls_endpoint, api_client)
def main():  # pylint: disable=too-many-locals,too-many-statements
    """Deploy a Kubeflow instance with kfctl.

    Parses the command line, downloads the OAuth client info from GCS,
    builds or fetches kfctl, derives a deployment name and labels, then
    calls deploy_with_kfctl_go followed by add_extra_users.

    Raises:
      ValueError: If not exactly one of --kubeflow_repo and --kfctl_path is
        set, or if the OAuth file cannot be found.
    """
    logging.basicConfig(level=logging.INFO,
                        format=('%(levelname)s|%(asctime)s'
                                '|%(pathname)s|%(lineno)d| %(message)s'),
                        datefmt='%Y-%m-%dT%H:%M:%S',
                        )
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--project", default="kubeflow-ci-deployment", type=str,
        help=("The project."))

    parser.add_argument(
        "--zone", default="us-east1-d", type=str,
        help=("The zone to deploy in."))

    parser.add_argument(
        "--oauth_file",
        default=("gs://kubeflow-ci-deployment_kf-data/"
                 "kf-iap-oauth.kubeflow-ci-deployment.yaml"),
        type=str, help=("The file containing the OAuth client ID & secret"
                        "for IAP."))

    # TODO(jlewi): Should rename this argument to something like kfctl_src
    # We should try to do it in a backwards compatible way.
    parser.add_argument(
        "--kubeflow_repo",
        default="/src/kubeflow/kubeflow",
        type=str, help=("Path to the source for kfctl. Should be the directory "
                        "containing the Makefile to build kfctl"))

    parser.add_argument(
        "--kfctl_path",
        default="",
        type=str, help=("Path to kfctl; can be a URL."))

    parser.add_argument(
        "--kfctl_config",
        default=("https://raw.githubusercontent.com/kubeflow/manifests"
                 "/master/kfdef/kfctl_gcp_iap.yaml"),
        type=str, help=("Path to the kfctl config to use"))

    parser.add_argument(
        "--apps_dir",
        default=os.getcwd(),
        type=str, help=("Directory to store kubeflow apps."))

    parser.add_argument(
        "--name", type=str, default="kf-vmaster-{uid}",
        help=("Name for the deployment. This can be a python format string "
              "with the variable uid. Uid will automatically be substituted "
              "for a unique value based on the time."))

    parser.add_argument(
        "--email", type=str, default="",
        help=("(Optional). Email of the person to create the default profile"
              "for. If not specificied uses the gcloud config value."))

    parser.add_argument(
        "--extra_users", type=str, default="",
        help=("Comma separated list of additional users to grant access. "
              "Should be in the form user:[email protected] or"
              "serviceAccount:[email protected]"))

    parser.add_argument(
        "--labels", type=str, default="",
        help=("Comma separated list of extra labels; e.g "
              "--labels=k1=v1,k2=v2"))

    parser.add_argument("--setup_project", dest="setup_project",
                        action="store_true", help="Setup the project")
    parser.add_argument("--no-setup_project", dest="setup_project",
                        action="store_false",
                        help="Do not setup the project")
    parser.set_defaults(setup_project=True)

    parser.add_argument("--use_self_cert", dest="use_self_cert",
                        action="store_true",
                        help="Use a self signed certificate")
    parser.add_argument("--no-use_self_cert", dest="use_self_cert",
                        action="store_false",
                        help="Do not use a self signed certificate")
    parser.set_defaults(use_self_cert=True)

    args = parser.parse_args()

    # Validate before doing any network I/O. Both flags set and neither flag
    # set are equally invalid.
    if bool(args.kubeflow_repo) == bool(args.kfctl_path):
        raise ValueError("Exactly one of --kubeflow_repo and --kfctl_path "
                         "neeeds to be set.")

    util.maybe_activate_service_account()

    # Wait for credentials to deal with workload identity issues
    gcp_util.get_gcp_credentials()

    # Wrap gcloud commands in retry loop to deal with metadata; workload
    # identity issues.
    @retrying.retry(stop_max_delay=5*60*1000, wait_exponential_max=10000)
    def _gcloud_list():
        # For debugging purposes output the command
        util.run(["gcloud", "config", "list"])
        util.run(["gcloud", "auth", "list"])

    _gcloud_list()

    bucket, blob_path = util.split_gcs_uri(args.oauth_file)

    client = storage.Client(project=args.project)
    bucket = client.get_bucket(bucket)

    blob = bucket.get_blob(blob_path)
    if blob is None:
        # Without this check the failure is an AttributeError on None.
        raise ValueError(
            "OAuth file {0} does not exist.".format(args.oauth_file))
    contents = blob.download_as_string()

    # NOTE(review): this used to be yaml.load() without a Loader, which can
    # construct arbitrary objects and is rejected by recent PyYAML releases.
    # The file is assumed to be a plain mapping of environment variables.
    oauth_info = yaml.safe_load(contents)

    git_describe = ""
    if args.kubeflow_repo:
        git_describe = util.run(
            ["git", "describe", "--tags", "--always", "--dirty"],
            cwd=args.kubeflow_repo).strip("'")

        kfctl_path = build_kfctl_go(args)
    else:
        if args.kfctl_path.startswith("http"):
            temp_dir = tempfile.mkdtemp()
            filename = "kfctl"

            zipped = False
            if args.kfctl_path.endswith(".tar.gz"):
                zipped = True
                filename = filename + ".tar.gz"

            util.run(["curl", "-L", "-o", filename, args.kfctl_path],
                     cwd=temp_dir)
            if zipped:
                util.run(["tar", "-xvf", "kfctl.tar.gz"], cwd=temp_dir)

            kfctl_path = os.path.join(temp_dir, "kfctl")
            logging.info("Changing permissions on %s", kfctl_path)
            os.chmod(kfctl_path, 0o777)
        else:
            kfctl_path = args.kfctl_path

        git_describe = util.run([kfctl_path, "version"])

    logging.info("kfctl path set to %s", kfctl_path)

    # We need to keep the name short to avoid hitting limits with
    # certificates.
    uid = datetime.datetime.now().strftime("%m%d") + "-"
    uid = uid + uuid.uuid4().hex[0:3]

    args.name = args.name.format(uid=uid)
    logging.info("Using name %s", args.name)

    app_dir = os.path.join(args.apps_dir, args.name)

    if not os.path.exists(args.apps_dir):
        os.makedirs(args.apps_dir)

    env = {}
    env.update(os.environ)
    env.update(oauth_info)

    # GCP labels can only take as input alphanumeric characters, hyphens,
    # and underscores. Replace not valid characters with hyphens.
    default_labels = {"kfctl-git": git_describe,
                      "purpose": "kf-test-cluster",
                      "auto-deploy": "true"}
    labels = {
        k: re.sub(r"[^a-z0-9\-_]", "-", v.lower().replace("\"", ""))
        for k, v in default_labels.items()
    }

    if args.labels:
        logging.info("Parsing labels %s", args.labels)
        for pair in args.labels.split(","):
            pieces = pair.split("=")
            if len(pieces) != 2:
                logging.error("Skipping pair %s; not of the form key=value",
                              pair)
                continue
            key = pieces[0].strip()
            value = pieces[1].strip()

            labels[key] = value

    logging.info("labels: %s", labels)
    deploy_with_kfctl_go(kfctl_path, args, app_dir, env, labels=labels)
    add_extra_users(args.project, args.extra_users)
def main(unparsed_args=None):  # pylint: disable=too-many-locals
    """Add GCR image versions matching a regex to the images YAML file.

    Lists images with gcloud, keeps those whose name matches the name part
    of --pattern, lists their tags and merges every version with a matching
    tag into --images_file, keyed by digest. The file is rewritten in place.

    Args:
      unparsed_args: (Optional) List of command line arguments; defaults to
        sys.argv[1:] when None.

    Raises:
      ValueError: If --pattern is not of the form <name regex>:<tag regex>.
    """
    logging.getLogger().setLevel(logging.INFO)

    # create the top-level parser
    parser = argparse.ArgumentParser(description="Get Images by regex")

    parser.add_argument(
        "--pattern",
        default="",
        type=str,
        help="Regex pattern e.g. .*tensorflow.*notebook.*:v20180619.*")

    parser.add_argument("--images_file",
                        default="image_tags.yaml",
                        type=str,
                        help="Yaml file containing the tags to attach.")

    parser.add_argument("--repository",
                        default=None,
                        type=str,
                        help="GCR repository name (optional).")

    # Honour explicitly passed arguments; None falls back to sys.argv.
    args = parser.parse_args(unparsed_args)

    # Validate up front so a bad pattern fails before any gcloud call.
    pattern_pieces = args.pattern.split(":")
    if len(pattern_pieces) != 2:
        raise ValueError(
            "--pattern must be of the form <name regex>:<tag regex>; "
            "got {0!r}".format(args.pattern))
    name_re = re.compile(pattern_pieces[0])
    tag_re = re.compile(pattern_pieces[1])

    with open(args.images_file) as hf:
        # safe_load suffices: the file is rewritten below with safe_dump,
        # and yaml.load() without a Loader is rejected by recent PyYAML.
        config = yaml.safe_load(hf)

    # Maps image name -> digest -> version info.
    existing_images = {}

    for image in config["images"]:
        existing_images[image["name"]] = {}
        for v in image["versions"]:
            existing_images[image["name"]][v["digest"]] = v

    list_images_cmd = [
        "gcloud", "--project=kubeflow-images-public", "container", "images",
        "list", "--format=json"
    ]
    # By default gcloud uses gcr.io/[project] as the repository. However for
    # images like katib, we may need to specify the repository as
    # gcr.io/[project]/katib.
    if args.repository:
        list_images_cmd.append("--repository=" + args.repository)

    raw_images = util.run(list_images_cmd)
    all_images = json.loads(raw_images)

    matching = []
    for image in all_images:
        if not name_re.match(image["name"]):
            continue
        logging.info("Matching image: %s", image["name"])
        matching.append(image)

    # For each image list all tags and find the matching ones
    images_to_add = {}
    for image in matching:
        raw_tags = util.run([
            "gcloud", "--project=kubeflow-images-public", "container",
            "images", "list-tags", image["name"], "--format=json"
        ])

        tags = json.loads(raw_tags)

        for info in tags:
            if any(tag_re.match(t) for t in info["tags"]):
                versions = images_to_add.setdefault(image["name"], {})
                versions[info["digest"]] = info

    # Merge in any missing versions
    # items()/values() work on Python 2 and 3; the iter* variants do not.
    for name, versions in images_to_add.items():
        known_versions = existing_images.setdefault(name, {})

        for v in versions.values():
            if v["digest"] in known_versions:
                logging.info("Image %s sha %s already defined.", name,
                             v["digest"])
            else:
                logging.info("Image %s adding sha %s", name, v["digest"])
                known_versions[v["digest"]] = v

    # Convert to the expected output
    output = {
        "images": [
            {"name": name, "versions": list(existing_images[name].values())}
            for name in sorted(existing_images)
        ]
    }

    with open(args.images_file, "w") as hf:
        hf.write(yaml.safe_dump(output, default_flow_style=False))
    logging.info("Done.")
def test_train(self):
    """Create the training TFJob with kustomize and wait for it to finish.

    Applies every "key=value" pair in self.params (comma separated) to the
    kustomization in self.app_dir, applies the built manifests with kubectl
    and waits for the TFJob self.name in self.namespace. On failure the
    reason is stored in self.failure.

    Raises:
      subprocess.CalledProcessError: If "kustomize build" or
        "kubectl apply" exits with a non-zero status.
    """
    # Local import so the block does not rely on a module-level import.
    import subprocess  # pylint: disable=import-outside-toplevel

    # We repeat the test multiple times.
    # This ensures that if we delete the job we can create a new job with the
    # same name.
    api_client = k8s_client.ApiClient()

    # Setup parameters for kustomize
    # TODO(jinchihe): Should enhance here after the kustomize util created.
    configmap = 'mnist-map-gcs'
    add_configmap = ["kustomize", "edit", "add", "configmap", configmap]

    # Every argv element must be a separate list item and each
    # --from-literal must be a single "key=value" argument, as test_serve
    # does it.
    for pair in self.params.split(","):
        k, v = pair.split("=", 1)
        if k in ("namespace", "image"):
            util.run(["kustomize", "edit", "set", k, v], cwd=self.app_dir)
        elif k == "numPs":
            util.run(["./definition.sh", "--numPs", v], cwd=self.app_dir)
        elif k == "numWorkers":
            util.run(["./definition.sh", "--numWorkers", v],
                     cwd=self.app_dir)
        elif k == "secret":
            secretName, secretMountPath = v.split("=", 1)
            util.run(add_configmap +
                     ["--from-literal=secretName=" + secretName],
                     cwd=self.app_dir)
            util.run(add_configmap +
                     ["--from-literal=secretMountPath=" + secretMountPath],
                     cwd=self.app_dir)
        elif k == "envVariables":
            var_k, var_v = v.split("=", 1)
            util.run(add_configmap +
                     ["--from-literal=" + var_k + "=" + var_v],
                     cwd=self.app_dir)
        else:
            util.run(add_configmap + ["--from-literal=" + k + "=" + v],
                     cwd=self.app_dir)

    # Create the TF job
    # A pipe cannot be expressed in an argv list, so feed the kustomize
    # output to kubectl explicitly (no shell involved).
    manifests = subprocess.check_output(["kustomize", "build", "."],
                                        cwd=self.app_dir)
    apply_cmd = ["kubectl", "apply", "-f", "-"]
    kubectl = subprocess.Popen(apply_cmd, stdin=subprocess.PIPE,
                               cwd=self.app_dir)
    kubectl.communicate(manifests)
    if kubectl.returncode != 0:
        raise subprocess.CalledProcessError(kubectl.returncode, apply_cmd)
    logging.info("Created job %s in namespaces %s", self.name,
                 self.namespace)

    # Wait for the job to complete.
    logging.info("Waiting for job to finish.")
    results = tf_job_client.wait_for_job(
        api_client,
        self.namespace,
        self.name,
        status_callback=tf_job_client.log_status)
    logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

    # Check for errors creating pods and services. Can potentially
    # help debug failed test runs.
    creation_failures = tf_job_client.get_creation_failures_from_tfjob(
        api_client, self.namespace, results)
    if creation_failures:
        logging.warning(creation_failures)

    if not tf_job_client.job_succeeded(results):
        self.failure = "Job {0} in namespace {1} in status {2}".format(  # pylint: disable=attribute-defined-outside-init
            self.name, self.namespace, results.get("status", {}))
        logging.error(self.failure)
        return
def run():
    """Deploy kubeflow-core with ksonnet and verify that it started.

    Uses args, api_client and namespace_name from the enclosing scope.
    Creates the ksonnet app "kubeflow-test" under args.test_dir, links
    vendor/kubeflow to the checked out source, applies the kubeflow-core
    component and waits for the TfJob operator deployment and the tf-hub
    stateful set.
    """
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)

    # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    # Only override the environment when a token was actually supplied:
    # assigning None raises TypeError and an empty value would clobber a
    # token that is already set.
    if args.github_token:
        os.environ["GITHUB_TOKEN"] = args.github_token

    if not os.getenv("GITHUB_TOKEN"):
        logging.warning("GITHUB_TOKEN not set; you will probably hit Github "
                        "API limits.")

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run(["ks", "init", app_name], cwd=args.test_dir)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
             cwd=app_dir)

    # Install required packages
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]
    for p in packages:
        util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                          REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow
    util.run(["ks", "generate", "core", "kubeflow-core",
              "--name=kubeflow-core",
              "--namespace=" + namespace.metadata.name], cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper
    # service account. This might have something to do with the way ksonnet
    # gets its credentials; maybe we need to configure credentials after
    # calling ks init?
    if args.cluster:
        util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = ["ks", "apply", "default", "-c", "kubeflow-core"]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name,
                              jupyter_name)
def test_kf_is_ready(namespace, use_basic_auth, use_istio, app_path):
    """Test that Kubeflow was successfully deployed.

    Args:
      namespace: The namespace Kubeflow is deployed to.
      use_basic_auth: Whether the deployment uses basic auth; selects which
        auth related workloads are expected on GCP.
      use_istio: Whether ingress workloads live in "istio-system" rather
        than in namespace.
      app_path: Directory containing the app.yaml KFDef, from which the
        platform is read.
    """
    set_logging()
    logging.info("Using namespace %s", namespace)

    # Need to activate account for scopes.
    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        key_file = os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
        util.run(["gcloud", "auth", "activate-service-account",
                  "--key-file=" + key_file])

    api_client = deploy_utils.create_k8s_client()

    util.load_kube_config()

    with open(os.path.join(app_path, "app.yaml")) as f:
        kfdef = yaml.safe_load(f)
    platform = kfdef["spec"]["platform"]

    # Verify that components are actually deployed.
    # TODO(jlewi): We need to parameterize this list based on whether
    # we are using IAP or basic auth.
    # TODO(yanniszark): This list is incomplete and missing a lot of
    # components.
    deployment_names = [
        "argo-ui",
        "centraldashboard",
        "jupyter-web-app-deployment",
        "minio",
        "ml-pipeline",
        "ml-pipeline-persistenceagent",
        "ml-pipeline-scheduledworkflow",
        "ml-pipeline-ui",
        "ml-pipeline-viewer-controller-deployment",
        "mysql",
        "notebook-controller-deployment",
        "profiles-deployment",
        "pytorch-operator",
        "tf-job-operator",
        "workflow-controller",
    ]
    stateful_set_names = []

    ingress_related_deployments = [
        "istio-citadel",
        "istio-egressgateway",
        "istio-galley",
        "istio-ingressgateway",
        "istio-pilot",
        "istio-policy",
        "istio-sidecar-injector",
        "istio-telemetry",
        "istio-tracing",
        "kiali",
        "prometheus",
    ]
    ingress_related_stateful_sets = []

    knative_namespace = "knative-serving"
    knative_related_deployments = [
        "activator",
        "autoscaler",
        "controller",
    ]

    if platform == "gcp":
        deployment_names.append("cloud-endpoints-controller")
        stateful_set_names.append("kfserving-controller-manager")
        if use_basic_auth:
            deployment_names.append("basic-auth-login")
        else:
            ingress_related_deployments.append("iap-enabler")
        # Expected for both auth modes.
        ingress_related_stateful_sets.append("backend-updater")
    elif platform == "existing_arrikto":
        deployment_names.append("dex")
        ingress_related_deployments.append("authservice")
        knative_related_deployments = []

    def _wait_for_deployments(target_namespace, names):
        # Wait for each named deployment in target_namespace, in order.
        for deployment_name in names:
            logging.info("Verifying that deployment %s started...",
                         deployment_name)
            util.wait_for_deployment(api_client, target_namespace,
                                     deployment_name, 10)

    # TODO(jlewi): Might want to parallelize this.
    _wait_for_deployments(namespace, deployment_names)

    ingress_namespace = "istio-system" if use_istio else namespace
    _wait_for_deployments(ingress_namespace, ingress_related_deployments)

    all_stateful_sets = (
        [(namespace, name) for name in stateful_set_names] +
        [(ingress_namespace, name) for name in ingress_related_stateful_sets])

    for ss_namespace, name in all_stateful_sets:
        logging.info("Verifying that stateful set %s.%s started...",
                     ss_namespace, name)
        try:
            util.wait_for_statefulset(api_client, ss_namespace, name)
        except BaseException:
            # Collect debug information by running describe
            util.run(["kubectl", "-n", ss_namespace, "describe",
                      "statefulsets", name])
            raise

    # TODO(jlewi): We should verify that the ingress is created and healthy.

    _wait_for_deployments(knative_namespace, knative_related_deployments)
def run_benchmark_job():
  """Submit a kubebench benchmark job to a remote kubernetes cluster.

  Generates a "kubebench-job" ksonnet component named after
  args.experiment_name in $BENCHMARK_DIR/ks-app, sets its parameters from the
  parsed command line arguments, applies it to the "default" ksonnet
  environment, waits for the job and finally cleans it up.

  Every ks command is passed as an argv list so that parameter values
  containing whitespace (or empty values) are handed to ks unchanged.

  Raises:
    KeyError: If BENCHMARK_DIR or KUBECONFIG is not set in the environment.
  """
  args = parse_args()
  app_dir = os.path.join(str(os.environ['BENCHMARK_DIR']), "ks-app")

  kubeconfig_path = str(os.environ['KUBECONFIG'])
  # The returned client is not used below; the call is kept because the
  # original made it. NOTE(review): presumably it is needed for its side
  # effect of loading the kubeconfig -- confirm before removing.
  deploy_utils.create_k8s_client(kubeconfig_path)

  job_name = args.experiment_name

  # The kubebench job always runs in the default namespace.
  namespace = "default"

  # Deploy Kubebench
  util.run(
      ["ks", "generate", "kubebench-job", job_name, "--name=" + job_name],
      cwd=app_dir)

  # (parameter name, value) pairs, applied in this order.
  job_params = [
      ("mainJobKsRegistry", args.training_job_registry),
      ("mainJobKsPackage", args.training_job_pkg),
      ("mainJobKsPrototype", args.training_job_prototype),
      ("mainJobConfig", args.training_job_config),
      ("awsCredentialsSecret", args.aws_secret),
      ("awsCredentialsSecretAccessKeyId", args.aws_access_key_id),
      ("awsCredentialsSecretAccessKey", args.aws_secret_access_key),
      ("awsRegion", args.aws_region),
      ("githubTokenSecret", args.github_secret_name),
      ("githubTokenSecretKey", "GITHUB_TOKEN"),
      ("controllerImage", "seedjeffwan/configurator:20190415"),
      ("postJobImage", "seedjeffwan/mpi-post-processor:logs"),
      ("postJobArgs", "null"),
      ("reporterType", "null"),
      ("experimentDataPvc", args.data_pvc),
  ]
  for param_name, param_value in job_params:
    util.run(["ks", "param", "set", job_name, param_name, param_value],
             cwd=app_dir)

  # cmd = "ks param set " + job_name + " config_args -- --config-file=" + pvc_mount + \
  #     "/config/" + config_name + ".yaml"
  # util.run(cmd.split(), cwd=app_dir)
  # cmd = "ks param set " + job_name + " report_args -- --output-file=" + pvc_mount + \
  #     "/output/results.csv"
  # util.run(cmd.split(), cwd=app_dir)

  apply_command = ["ks", "apply", "default", "-c", job_name]
  util.run(apply_command, cwd=app_dir)

  # TODO: expose timeout setting here.
  deploy_utils.wait_for_benchmark_job(job_name, namespace)
  deploy_utils.cleanup_benchmark_job(app_dir, job_name)
def setup_kubeflow_ks_app(dir, namespace, github_token, api_client):
  """Create a ksonnet app for Kubeflow.

  Initializes a new ksonnet app with a randomized name inside dir, registers
  the kubeflow registry, records the required packages directly in app.yaml
  and symlinks vendor/kubeflow to the source checkout under dir/src so that
  the code at the desired commit is used.

  Args:
    dir: Directory in which to create the app; created if needed.
    namespace: Name of the namespace handed to _setup_test.
    github_token: (Optional) GitHub API token; when set it is exported as
      GITHUB_TOKEN so that ksonnet doesn't get rate limited.
    api_client: K8s API client handed to _setup_test.

  Returns:
    The path of the newly created ksonnet app directory.
  """
  util.makedirs(dir)

  logging.info("Using test directory: %s", dir)

  namespace_name = namespace

  namespace = _setup_test(api_client, namespace_name)
  logging.info("Using namespace: %s", namespace)
  if github_token:
    # Never write the token itself to the logs; it is a credential.
    logging.info("Setting GITHUB_TOKEN from the supplied token.")
    # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = github_token

  if not os.getenv("GITHUB_TOKEN"):
    logging.warning(
        "GITHUB_TOKEN not set; you will probably hit Github API "
        "limits.")
  # Initialize a ksonnet app.
  app_name = "kubeflow-test-" + uuid.uuid4().hex[0:4]
  util.run([
      "ks",
      "init",
      app_name,
  ], cwd=dir)

  app_dir = os.path.join(dir, app_name)

  kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
  util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
           cwd=app_dir)

  # Install required packages
  packages = [
      "kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job",
      "kubeflow/pytorch-job", "kubeflow/argo"
  ]

  # Instead of installing packages we edit the app.yaml file directly
  #for p in packages:
  # util.run(["ks", "pkg", "install", p], cwd=app_dir)
  app_file = os.path.join(app_dir, "app.yaml")
  with open(app_file) as f:
    # app.yaml is plain data, so the safe loader is sufficient and avoids
    # constructing arbitrary Python objects.
    app_yaml = yaml.safe_load(f)

  libraries = {}
  for pkg in packages:
    # Drop the registry prefix, e.g. "kubeflow/core" -> "core".
    pkg = pkg.split("/")[1]
    libraries[pkg] = {
        'gitVersion': {
            'commitSha': 'fake',
            'refSpec': 'fake'
        },
        'name': pkg,
        'registry': "kubeflow"
    }
  app_yaml['libraries'] = libraries

  with open(app_file, "w") as f:
    yaml.dump(app_yaml, f)

  # Create vendor directory with a symlink to the src
  # so that we use the code at the desired commit.
  target_dir = os.path.join(app_dir, "vendor", "kubeflow")

  REPO_ORG = "kubeflow"
  REPO_NAME = "kubeflow"
  REGISTRY_PATH = "kubeflow"
  source = os.path.join(dir, "src", REPO_ORG, REPO_NAME, REGISTRY_PATH)
  logging.info("Creating link %s -> %s", target_dir, source)
  os.symlink(source, target_dir)

  return app_dir
def test_kf_is_ready(namespace, use_basic_auth, use_istio):
  """Check that the workloads of a Kubeflow deployment have started.

  Args:
    namespace: The namespace Kubeflow is deployed to.
    use_basic_auth: Truthy to expect the basic auth login deployment;
      otherwise the IAP enabler is expected.
    use_istio: Truthy if ingress workloads live in "istio-system";
      otherwise they are looked up in namespace.
  """
  logging.info("Using namespace %s", namespace)

  # Need to activate account for scopes.
  key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
  if key_file:
    util.run([
        "gcloud", "auth", "activate-service-account",
        "--key-file=" + key_file
    ])

  api_client = deploy_utils.create_k8s_client()
  util.load_kube_config()

  # Verify that components are actually deployed.
  # TODO(jlewi): We need to parameterize this list based on whether
  # we are using IAP or basic auth.
  core_deployments = [
      "argo-ui",
      "centraldashboard",
      "cloud-endpoints-controller",
      "jupyter-web-app-deployment",
      "metadata-db",
      "metadata-deployment",
      "metadata-ui",
      "ml-pipeline",
      "ml-pipeline-scheduledworkflow",
      "ml-pipeline-ui",
      "notebook-controller-deployment",
      "tf-job-operator",
      "pytorch-operator",
      "katib-controller",
      "workflow-controller",
  ]
  core_stateful_sets = ["kfserving-controller-manager"]

  ingress_deployments = []
  # The backend updater is expected in both auth modes.
  ingress_stateful_sets = ["backend-updater"]
  if use_basic_auth:
    core_deployments.append("basic-auth-login")
  else:
    ingress_deployments.append("iap-enabler")

  ingress_namespace = "istio-system" if use_istio else namespace

  # TODO(jlewi): Might want to parallelize this.
  deployments_to_check = [(namespace, item) for item in core_deployments]
  deployments_to_check += [
      (ingress_namespace, item) for item in ingress_deployments
  ]
  for target_namespace, deployment_name in deployments_to_check:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, target_namespace, deployment_name, 10)

  stateful_sets_to_check = [(namespace, item) for item in core_stateful_sets]
  stateful_sets_to_check += [
      (ingress_namespace, item) for item in ingress_stateful_sets
  ]
  for ss_namespace, name in stateful_sets_to_check:
    logging.info("Verifying that stateful set %s.%s started...", ss_namespace,
                 name)
    try:
      util.wait_for_statefulset(api_client, ss_namespace, name)
    except:  # pylint: disable=bare-except
      # Collect debug information by running describe, then re-raise.
      util.run([
          "kubectl", "-n", ss_namespace, "describe", "statefulsets", name
      ])
      raise

  # TODO(jlewi): We should verify that the ingress is created and healthy.

  knative_namespace = "knative-serving"
  for deployment_name in ("activator", "autoscaler", "controller"):
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, knative_namespace, deployment_name,
                             10)
def install_kubeflow(api_client, app_dir, namespace):
  """Deploy required kubeflow packages to run benchmark.

  Generates the argo, tf-job-operator and mpi-operator ksonnet components,
  points them at namespace when it is not 'default', and applies all three
  to the "default" ksonnet environment.

  Args:
    api_client: K8s API client; not used by this function, kept for
      interface compatibility.
    app_dir: The ksonnet app directory in which the ks commands run.
    namespace: The namespace the components should be deployed to.
  """
  for component in ("argo", "tf-job-operator", "mpi-operator"):
    util.run(["ks", "generate", component, component], cwd=app_dir)

  if namespace != 'default':
    # Each parameter is set exactly once; commands are argv lists so the
    # namespace value is passed to ks verbatim.
    for component in ("tf-job-operator", "mpi-operator", "argo"):
      util.run(["ks", "param", "set", component, "namespace", namespace],
               cwd=app_dir)

  apply_command = ["ks", "apply", "default", "-c", "argo",
                   "-c", "tf-job-operator", "-c", "mpi-operator"]
  util.run(apply_command, cwd=app_dir)
def check_if_kfapp_exists(project, name, zone): # pylint: disable=too-many-branches
  """Check if a deployment with the specified name already exists.

  Args:
    project: The GCP project to look in.
    name: Name of the deployment manager deployment; also used as the name
      of the cluster to fetch credentials for.
    zone: The zone passed to gcloud when fetching cluster credentials.

  Returns:
    False if the deployment doesn't exist (404) or if the ingress
    istio-system.envoy-ingress is missing; True otherwise. When the ingress
    is missing, an existing istio-ingressgateway service is deleted before
    returning so that apply can be rerun.

  Raises:
    ApiNotEnabledError: If fetching the deployment returned a 403; the
      deployment manager API is enabled first so that a retry can succeed.
  """
  credentials = GoogleCredentials.get_application_default()
  dm = discovery.build("deploymentmanager", "v2", credentials=credentials)

  deployments_client = dm.deployments()
  enable_api = False
  try:
    deployments_client.get(project=project, deployment=name).execute()
  except errors.HttpError as e:
    if not e.content:
      raise
    error_content = json.loads(e.content)
    error_code = error_content.get("error", {}).get("code", 0)
    if error_code == 404: # pylint: disable=no-else-return
      return False
    elif error_code == 403:
      # We get a 403 if the deployment manager API isn't enabled
      logging.info("Fetching deployment %s in project %s returned error:\n%s",
                   name, project, error_content)
      enable_api = True
    else:
      raise

  # The API is enabled outside of the except block so that the
  # ApiNotEnabledError isn't raised while handling the HttpError.
  if enable_api:
    logging.info("Enabling the deployment manager api.")
    util.run(["gcloud", "--project=" + project, "services", "enable",
              "deploymentmanager.googleapis.com"])
    logging.info("Api enabled; raising ApiNotEnabledError to force retry")
    raise ApiNotEnabledError

  # TODO(jlewi): It would be better to get the actual zone of the deployment
  util.run(["gcloud", "--project=" + project, "container", "clusters",
            "get-credentials", "--zone=" + zone, name])
  logging.info("Checking if project %s kfapp %s finished setup.", project, name)
  util.load_kube_credentials()

  # TODO(jlewi): This is a bit of a hack for v0.6. For v0.6 we check if the
  # ingress already exists and if it does we report it as true and otherwise
  # false. The reasoning is if the ingress doesn't exist we want to see
  # if we can fix/resume the deployment by running reapply
  # With v0.7 kfctl apply should be an idempotent operation so we can always
  # rerun apply; but with v0.6 rerunning apply if the ingress exists results
  # in an error.
  api_client = k8s_client.ApiClient()
  v1 = k8s_client.CoreV1Api(api_client)
  ingress_namespace = "istio-system"
  ingress_name = "envoy-ingress"

  extensions = k8s_client.ExtensionsV1beta1Api(api_client)

  missing_ingress = True
  try:
    logging.info("Trying to read ingress %s.%s", ingress_namespace,
                 ingress_name)
    extensions.read_namespaced_ingress(ingress_name, ingress_namespace)
    missing_ingress = False
    logging.info("Ingress %s.%s exists", ingress_namespace, ingress_name)
  except rest.ApiException as e:
    if e.status != 404:
      raise
    logging.info("Project: %s, KFApp: %s is missing ingress %s.%s",
                 project, name, ingress_namespace, ingress_name)

  if not missing_ingress:
    return True

  # Check if the service istio-ingressgateway already exists
  # if it does we need to delete it before rerunning apply.
  service_name = "istio-ingressgateway"
  logging.info("ingress %s.%s is missing; checking if service %s.%s exists",
               ingress_namespace, ingress_name, ingress_namespace,
               service_name)

  has_service = False
  try:
    v1.read_namespaced_service(service_name, ingress_namespace)
    has_service = True
  except rest.ApiException as e:
    if e.status != 404:
      raise
    logging.info("Project: %s, KFApp: %s is missing service %s.%s",
                 project, name, ingress_namespace, service_name)

  if has_service:
    logging.info("Deleting service: %s.%s", ingress_namespace, service_name)
    v1.delete_namespaced_service(service_name, ingress_namespace,
                                 body=k8s_client.V1DeleteOptions())
    logging.info("Deleted service: %s.%s", ingress_namespace, service_name)

  return False
def main(): # pylint: disable=too-many-locals
  """Parse the command line, configure logging and gcloud, then run setup.

  A log file is written to <artifacts_dir>/logs/test_deploy.log.txt. When
  GOOGLE_APPLICATION_CREDENTIALS is set, gcloud is switched to that service
  account before setup(args) is invoked.
  """
  root_logger = logging.getLogger()
  root_logger.setLevel(logging.INFO)

  # create the top-level parser
  parser = argparse.ArgumentParser(description="Test Kubeflow E2E.")

  parser.add_argument(
      "--test_dir",
      type=str,
      default="",
      help="Directory to use for all the test files. If not set a temporary "
           "directory is created.")

  parser.add_argument(
      "--artifacts_dir",
      type=str,
      default="",
      help="Directory to use for artifacts that should be preserved after "
           "the test runs. Defaults to test_dir if not set.")

  parser.add_argument(
      "--project", type=str, default=None, help="The project to use.")

  parser.add_argument(
      "--cluster",
      type=str,
      default=None,
      help=("The name of the cluster. If not set assumes the "
            "script is running in a cluster and uses that cluster."))

  parser.add_argument(
      "--zone",
      type=str,
      default="us-east1-d",
      help="The zone for the cluster.")

  parser.add_argument(
      "--github_token",
      type=str,
      default=None,
      help=("The GitHub API token to use. This is needed since ksonnet uses the "
            "GitHub API and without it we get rate limited. For more info see: "
            "https://github.com/ksonnet/ksonnet/blob/master/docs"
            "/troubleshooting.md"))

  args = parser.parse_args()

  if not args.test_dir:
    logging.info("--test_dir not set; using a temporary directory.")

    timestamp = datetime.datetime.now().strftime("%m%d-%H%M-")
    label = "test_deploy-" + timestamp + uuid.uuid4().hex[0:4]

    # Create a temporary directory for this test run
    args.test_dir = os.path.join(tempfile.gettempdir(), label)

  if not args.artifacts_dir:
    args.artifacts_dir = args.test_dir

  # Setup a logging file handler. This way we can upload the log outputs
  # to gubernator.
  log_dir = os.path.join(args.artifacts_dir, "logs")
  test_log = os.path.join(log_dir, "test_deploy.log.txt")
  if not os.path.exists(log_dir):
    os.makedirs(log_dir)

  file_handler = logging.FileHandler(test_log)
  # We need to explicitly set the formatter because it will not pick up
  # the BasicConfig.
  file_handler.setFormatter(
      logging.Formatter(
          fmt=("%(levelname)s|%(asctime)s"
               "|%(pathname)s|%(lineno)d| %(message)s"),
          datefmt="%Y-%m-%dT%H:%M:%S"))
  root_logger.addHandler(file_handler)
  logging.info("Logging to %s", test_log)

  key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
  if key_file:
    logging.info("GOOGLE_APPLICATION_CREDENTIALS is set; configuring gcloud "
                 "to use service account.")
    # Since a service account is set tell gcloud to use it.
    util.run(["gcloud", "auth", "activate-service-account",
              "--key-file=" + key_file])
  setup(args)
def setup(args):
  """Test deploying Kubeflow.

  Deploys the kubeflow-core ksonnet component into a freshly named
  namespace, verifies that the tf-job-operator deployment and the tf-hub
  stateful set start, deletes the namespace and writes a junit XML file
  with the results of the deploy and teardown test cases.

  Args:
    args: Parsed command line arguments; reads cluster, project, zone,
      test_dir, artifacts_dir and github_token.
  """
  if args.cluster:
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    logging.info("Using cluster: %s in project: %s in zone: %s",
                 cluster_name, project, zone)
    # Print out config to help debug issues with accounts and
    # credentials.
    util.run(["gcloud", "config", "list"])
    util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()
  else:
    # TODO(jlewi): This is sufficient for API access but it doesn't create
    # a kubeconfig file which ksonnet needs for ks init.
    logging.info("Running inside cluster.")
    incluster_config.load_incluster_config()

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  now = datetime.datetime.now()
  run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

  if not os.path.exists(args.test_dir):
    os.makedirs(args.test_dir)

  logging.info("Using test directory: %s", args.test_dir)

  namespace_name = run_label

  def run():
    """Create the ksonnet app, deploy kubeflow-core and verify it started."""
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    # --github_token defaults to None and os.environ only accepts strings,
    # so only export it when it was actually provided.
    if args.github_token:
      os.environ["GITHUB_TOKEN"] = args.github_token

    if not os.getenv("GITHUB_TOKEN"):
      logging.warning("GITHUB_TOKEN not set; you will probably hit Github API "
                      "limits.")

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run(["ks", "init", app_name,], cwd=args.test_dir)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
             cwd=app_dir)

    # Install required packages
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
      util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    REPO_ORG = "kubeflow"
    REPO_NAME = "kubeflow"
    REGISTRY_PATH = "kubeflow"
    source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                          REGISTRY_PATH)
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow
    util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
              "--namespace=" + namespace.metadata.name], cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
      util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)

  main_case = test_util.TestCase()
  main_case.class_name = "KubeFlow"
  main_case.name = "deploy-kubeflow"
  try:
    test_util.wrap_test(run, main_case)
  finally:
    # Delete the namespace
    logging.info("Deleting namespace %s", namespace_name)

    # We report teardown as a separate test case because this will help
    # us track down issues with garbage collecting namespaces.
    teardown = test_util.TestCase(main_case.class_name, "teardown")

    def run_teardown():
      """Delete the namespace created for this run."""
      core_api = k8s_client.CoreV1Api(api_client)
      core_api.delete_namespace(namespace_name, {})

    try:
      test_util.wrap_test(run_teardown, teardown)
    except Exception as e:  # pylint: disable-msg=broad-except
      # Log the exception itself; the .message attribute is deprecated and
      # doesn't exist on Python 3 exceptions.
      logging.error("There was a problem deleting namespace: %s; %s",
                    namespace_name, e)
    junit_path = os.path.join(args.artifacts_dir, "junit_kubeflow-deploy.xml")
    logging.info("Writing test results to %s", junit_path)
    test_util.create_junit_xml_file([main_case, teardown], junit_path)
def install_kubebench_nfs(api_client, app_dir, namespace):
  """Install the kubebench quickstarter NFS service and volume components.

  The service component is applied first; once the NFS deployment has
  started, the cluster IP of the NFS service is set as the nfsServiceIP
  parameter of the volume component, which is then applied as well.

  Args:
    api_client: K8s API client used to wait for the deployment and to look
      up the NFS service.
    app_dir: The ksonnet app directory in which the ks commands run.
    namespace: The namespace to deploy the components to.
  """
  service_component = "kubebench-quickstarter-service"
  volume_component = "kubebench-quickstarter-volume"
  components = (service_component, volume_component)

  util.run(["ks", "pkg", "install", "kubebench/kubebench-quickstarter"],
           cwd=app_dir)

  for component in components:
    util.run(["ks", "generate", component, component], cwd=app_dir)

  for component in components:
    util.run(["ks", "param", "set", component, "namespace", namespace],
             cwd=app_dir)

  util.run(["ks", "apply", "default", "-c", service_component], cwd=app_dir)

  logging.info("Verifying NFS deployment started")
  util.wait_for_deployment(api_client, namespace, "kubebench-nfs-deploy")

  # The volume component needs the cluster IP of the running NFS service.
  nfs_service = get_k8s_service(api_client, namespace, "kubebench-nfs-svc")
  util.run(["ks", "param", "set", volume_component, "nfsServiceIP",
            nfs_service.spec.cluster_ip], cwd=app_dir)

  util.run(["ks", "apply", "default", "-c", volume_component], cwd=app_dir)
def run_delete():
  """Run `kfctl delete -V -f <kfdef_path>` with app_path as working directory.

  kfctl_path, kfdef_path and app_path are not parameters; they are resolved
  from the enclosing scope.
  """
  delete_command = [kfctl_path, "delete", "-V", "-f", kfdef_path]
  util.run(delete_command, cwd=app_path)
def test_build_kfctl_go(record_xml_attribute, app_name, app_path, project,
                        use_basic_auth, use_istio, config_path,
                        build_and_apply, kfctl_repo_path,
                        cluster_creation_script, self_signed_cert, values):
  """Test building and deploying Kubeflow.

  Args:
    record_xml_attribute: Passed to util.set_pytest_junit together with the
      test name.
    app_name: kubeflow deployment name.
    app_path: The path to the Kubeflow app.
    project: The GCP project to use.
    use_basic_auth: Whether to use basic_auth.
    use_istio: Whether to use Istio or not
    config_path: Path to the KFDef spec file.
    build_and_apply: whether to build and apply or apply
    kfctl_repo_path: path to the kubeflow/kfctl repo.
    cluster_creation_script: script invoked to create a new cluster
    self_signed_cert: whether to use self-signed cert for ingress.
    values: Comma separated list of key=value variables to substitute into
      config_path. Only the first "=" of each pair separates the key from
      the value, so values may themselves contain "=".

  Raises:
    ValueError: If an entry in values doesn't contain "=".
  """
  util.set_pytest_junit(record_xml_attribute, "test_build_kfctl_go")

  # Need to activate account for scopes.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    util.run([
        "gcloud", "auth", "activate-service-account",
        "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
    ])

  # TODO(yanniszark): split this into a separate workflow step
  if cluster_creation_script:
    logging.info("Cluster creation script specified: %s",
                 cluster_creation_script)
    util.run(["/bin/bash", "-c", cluster_creation_script])

  logging.info("using kfctl repo: %s", kfctl_repo_path)

  if values:
    path_vars = {}
    for pair in values.split(","):
      # Split on the first "=" only so that values containing "=" survive.
      key, value = pair.split("=", 1)
      path_vars[key] = value

    config_path = config_path.format(**path_vars)
    logging.info("config_path after substitution: %s", config_path)

  kfctl_path = kfctl_util.build_kfctl_go(kfctl_repo_path)
  app_path = kfctl_util.kfctl_deploy_kubeflow(app_path, project,
                                              use_basic_auth, use_istio,
                                              config_path, kfctl_path,
                                              build_and_apply)
  if not cluster_creation_script:
    kfctl_util.verify_kubeconfig(app_path)

  # Use self-signed cert for testing to prevent quota limiting.
  if self_signed_cert:
    logging.info("Configuring self signed certificate")
    util.load_kube_credentials()
    api_client = k8s_client.ApiClient()
    ingress_namespace = "istio-system"
    ingress_name = "envoy-ingress"
    tls_endpoint = "{0}.endpoints.{1}.cloud.goog".format(app_name, project)
    logging.info("Configuring self signed cert for %s", tls_endpoint)
    util.use_self_signed_for_ingress(ingress_namespace, ingress_name,
                                     tls_endpoint, api_client)
def test_kf_is_ready(namespace, use_basic_auth, use_istio):
  """Check that the workloads of a Kubeflow deployment have started.

  Args:
    namespace: The namespace Kubeflow is deployed to.
    use_basic_auth: Truthy to expect the basic auth deployment; otherwise
      the IAP enabler is expected.
    use_istio: Truthy if ingress workloads live in "istio-system";
      otherwise they are looked up in namespace.
  """
  logging.info("Using namespace %s", namespace)

  # Need to activate account for scopes.
  key_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
  if key_file:
    util.run([
        "gcloud", "auth", "activate-service-account",
        "--key-file=" + key_file
    ])

  api_client = deploy_utils.create_k8s_client()
  util.load_kube_config()

  # Verify that components are actually deployed.
  # TODO(jlewi): We need to parameterize this list based on whether
  # we are using IAP or basic auth.
  core_deployments = [
      "argo-ui",
      "centraldashboard",
      "cert-manager",
      "cloud-endpoints-controller",
      "jupyter-web-app",
      "ml-pipeline",
      "ml-pipeline-scheduledworkflow",
      "ml-pipeline-ui",
      "notebooks-controller",
      "tf-job-operator",
      "pytorch-operator",
      "studyjob-controller",
      "workflow-controller",
  ]
  ingress_deployments = []
  if use_basic_auth:
    core_deployments.append("basic-auth")
  else:
    ingress_deployments.append("iap-enabler")

  ingress_namespace = "istio-system" if use_istio else namespace

  # TODO(jlewi): Might want to parallelize this.
  deployments_to_check = [(namespace, item) for item in core_deployments]
  deployments_to_check += [
      (ingress_namespace, item) for item in ingress_deployments
  ]
  for target_namespace, deployment_name in deployments_to_check:
    logging.info("Verifying that deployment %s started...", deployment_name)
    util.wait_for_deployment(api_client, target_namespace, deployment_name)

  # Stateful sets are looked up in the ingress namespace.
  for name in ("backend-updater",):
    logging.info("Verifying that statefulset %s started...", name)
    util.wait_for_statefulset(api_client, ingress_namespace, name)
def setup(args):
  """Test deploying Kubeflow.

  Creates a ksonnet app in args.test_dir, deploys the kubeflow-core
  component into args.namespace and verifies that the tf-job-operator
  deployment and the tf-hub stateful set start. Optionally deploys a
  TF serving model server as well.

  Args:
    args: Parsed command line arguments; reads test_dir, namespace,
      github_token, cluster, project, zone, deploy_tf_serving and
      model_server_image.
  """
  api_client = create_k8s_client(args)

  if not os.path.exists(args.test_dir):
    os.makedirs(args.test_dir)

  logging.info("Using test directory: %s", args.test_dir)

  namespace_name = args.namespace

  namespace = _setup_test(api_client, namespace_name)
  logging.info("Using namespace: %s", namespace)
  if args.github_token:
    # Never write the token itself to the logs; it is a credential.
    logging.info("Setting GITHUB_TOKEN from --github_token.")
    # Set a GITHUB_TOKEN so that we don't rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = args.github_token

  if not os.getenv("GITHUB_TOKEN"):
    logging.warning("GITHUB_TOKEN not set; you will probably hit Github API "
                    "limits.")
  # Initialize a ksonnet app.
  app_name = "kubeflow-test"
  util.run([
      "ks",
      "init",
      app_name,
  ], cwd=args.test_dir)

  app_dir = os.path.join(args.test_dir, app_name)

  kubeflow_registry = "github.com/kubeflow/kubeflow/tree/master/kubeflow"
  util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
           cwd=app_dir)

  # Install required packages
  packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

  for p in packages:
    util.run(["ks", "pkg", "install", p], cwd=app_dir)

  # Delete the vendor directory and replace with a symlink to the src
  # so that we use the code at the desired commit.
  target_dir = os.path.join(app_dir, "vendor", "kubeflow")

  logging.info("Deleting %s", target_dir)
  shutil.rmtree(target_dir)

  REPO_ORG = "kubeflow"
  REPO_NAME = "kubeflow"
  REGISTRY_PATH = "kubeflow"
  source = os.path.join(args.test_dir, "src", REPO_ORG, REPO_NAME,
                        REGISTRY_PATH)
  logging.info("Creating link %s -> %s", target_dir, source)
  os.symlink(source, target_dir)

  # Deploy Kubeflow
  util.run([
      "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
      "--namespace=" + namespace.metadata.name
  ], cwd=app_dir)

  # TODO(jlewi): For reasons I don't understand even though we ran
  # configure_kubectl above, if we don't rerun it we get rbac errors
  # when we do ks apply; I think because we aren't using the proper service
  # account. This might have something to do with the way ksonnet gets
  # its credentials; maybe we need to configure credentials after calling
  # ks init?
  if args.cluster:
    util.configure_kubectl(args.project, args.zone, args.cluster)

  apply_command = [
      "ks",
      "apply",
      "default",
      "-c",
      "kubeflow-core",
  ]

  util.run(apply_command, cwd=app_dir)

  # Verify that the TfJob operator is actually deployed.
  tf_job_deployment_name = "tf-job-operator"
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace.metadata.name,
                           tf_job_deployment_name)

  # Verify that JupyterHub is actually deployed.
  jupyter_name = "tf-hub"
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)

  if args.deploy_tf_serving:
    logging.info("Deploying tf-serving.")
    util.run([
        "ks", "generate", "tf-serving", "modelServer", "--name=inception",
        "--namespace=" + namespace.metadata.name,
        "--model_path=gs://kubeflow-models/inception",
        "--model_server_image=" + args.model_server_image
    ], cwd=app_dir)

    apply_command = [
        "ks",
        "apply",
        "default",
        "-c",
        "modelServer",
    ]
    util.run(apply_command, cwd=app_dir)

    # Reading the service checks that it was created; the result itself
    # isn't needed.
    core_api = k8s_client.CoreV1Api(api_client)
    core_api.read_namespaced_service("inception", namespace.metadata.name)

    util.wait_for_deployment(api_client, namespace.metadata.name, "inception")
    logging.info("Verified TF serving started.")
def deploy_minikube(args):
  """Create a VM and setup minikube.

  Inserts a GCE instance, waits for the insert operation to finish, runs
  install_minikube.sh on the VM, copies ~/.kube and the minikube
  certificates/keys into args.test_dir and rewrites the copied kubeconfig
  via modify_minikube_config.

  Args:
    args: Parsed command line arguments; reads vm_name, zone, project and
      test_dir.

  Raises:
    ValueError: If the insert operation doesn't finish with status DONE or
      finishes with an error.
  """
  credentials = GoogleCredentials.get_application_default()
  gce = discovery.build("compute", "v1", credentials=credentials,
                        cache_discovery=False)
  instances = gce.instances()
  body = {
      "name": args.vm_name,
      "machineType": "zones/{0}/machineTypes/n1-standard-16".format(args.zone),
      "disks": [
          {
              "boot": True,
              "initializeParams": {
                  "sourceImage":
                      "projects/ubuntu-os-cloud/global/images/family/ubuntu-1604-lts",
                  "diskSizeGb": 100,
                  "autoDelete": True,
              },
          },
      ],
      "networkInterfaces": [
          {
              "accessConfigs": [
                  {
                      "name": "external-nat",
                      "type": "ONE_TO_ONE_NAT",
                  },
              ],
              "network": "global/networks/default",
          },
      ],
  }
  request = instances.insert(project=args.project, zone=args.zone, body=body)
  response = None
  try:
    response = request.execute()
    print("done")
  except errors.HttpError as e:
    if not e.content:
      raise
    content = json.loads(e.content)
    if content.get("error", {}).get("code") == requests.codes.CONFLICT:
      # We don't want to keep going so we reraise the error after logging
      # a helpful error message.
      logging.error(
          "Either the VM or the disk %s already exists in zone "
          "%s in project %s ", args.vm_name, args.zone, args.project)
    raise

  op_id = response.get("name")
  final_op = vm_util.wait_for_operation(gce, args.project, args.zone, op_id)

  logging.info("Final result for insert operation: %s", final_op)
  status = final_op.get("status")
  if status != "DONE":
    # Format the message here; extra arguments to ValueError are not
    # interpolated into the message.
    raise ValueError("Insert operation has status %s" % status)

  if final_op.get("error"):
    message = "Insert operation resulted in error {0}".format(
        final_op.get("error"))
    logging.error(message)
    raise ValueError(message)

  # Locate the install minikube script.
  install_script = os.path.join(os.path.dirname(__file__),
                                "install_minikube.sh")

  if not os.path.exists(install_script):
    # NOTE(review): execution continues after this message, as in the
    # original code; presumably execute_script below then fails. Confirm
    # whether this should raise instead.
    logging.error("Could not find minikube install script: %s", install_script)

  vm_util.wait_for_vm(args.project, args.zone, args.vm_name)
  vm_util.execute_script(args.project, args.zone, args.vm_name, install_script)

  # Copy the .kube and .minikube files to test_dir
  target = "~/.kube"
  full_target = "{0}:{1}".format(args.vm_name, target)
  logging.info("Copying %s to %s", target, args.test_dir)
  util.run([
      "gcloud", "compute", "--project=" + args.project, "scp", "--recurse",
      full_target, args.test_dir, "--zone=" + args.zone
  ])

  # The .minikube directory contains some really large ISO and other files that we don't need; so we
  # only copy the files we need.
  minikube_dir = os.path.join(args.test_dir, ".minikube")
  try:
    os.makedirs(minikube_dir)
  except OSError as exc:  # Python >2.5
    # An already existing directory is fine; anything else is an error.
    if not (exc.errno == errno.EEXIST and os.path.isdir(minikube_dir)):
      raise

  for target in ["~/.minikube/*.crt", "~/.minikube/client.key"]:
    full_target = "{0}:{1}".format(args.vm_name, target)
    logging.info("Copying %s to %s", target, minikube_dir)
    util.run([
        "gcloud", "compute", "--project=" + args.project, "scp", "--recurse",
        full_target, minikube_dir, "--zone=" + args.zone
    ])

  config_path = os.path.join(args.test_dir, ".kube", "config")
  modify_minikube_config(config_path, minikube_dir)
def get_gcp_identity():
  """Return the output of `gcloud config get-value account`, logging it."""
  account = util.run(["gcloud", "config", "get-value", "account"])
  logging.info("Current GCP account: %s", account)
  return account
def main():  # pylint: disable=too-many-locals,too-many-statements
  """Parse the command line and run the selected test sub-command.

  Each sub-command registers its handler as the `func` default. After parsing,
  the test and artifacts directories are defaulted, a log file handler is
  attached to the root logger and control is handed to wrap_test(args).

  Raises:
    OSError: If the log directory cannot be created.
  """
  logging.getLogger().setLevel(logging.INFO)
  # create the top-level parser
  parser = argparse.ArgumentParser(description="Test Kubeflow E2E.")

  parser.add_argument(
    "--test_dir",
    default="",
    type=str,
    help="Directory to use for all the test files. If not set a temporary "
         "directory is created.")

  parser.add_argument(
    "--artifacts_dir",
    default="",
    type=str,
    help="Directory to use for artifacts that should be preserved after "
         "the test runs. Defaults to test_dir if not set.")

  parser.add_argument(
    "--as_gcloud_user",
    dest="as_gcloud_user",
    action="store_true",
    help=("Impersonate the user corresponding to the gcloud "
          "command with kubectl and ks."))
  parser.add_argument("--no-as_gcloud_user", dest="as_gcloud_user",
                      action="store_false")
  parser.set_defaults(as_gcloud_user=False)

  # TODO(jlewi): This should not be a global flag.
  parser.add_argument("--project", default=None, type=str,
                      help="The project to use.")

  # TODO(jlewi): This should not be a global flag.
  parser.add_argument("--namespace", default=None, type=str,
                      help=("The namespace to use."))

  parser.add_argument(
    "--github_token",
    default=None,
    type=str,
    help=
    ("The GitHub API token to use. This is needed since ksonnet uses the "
     "GitHub API and without it we get rate limited. For more info see: "
     "https://github.com/ksonnet/ksonnet/blob/master/docs"
     "/troubleshooting.md. Can also be set using environment variable "
     "GITHUB_TOKEN."))

  parser.add_argument("--deploy_name", default="", type=str,
                      help="The name of the deployment.")

  parser.add_argument("--workflow_name", default="", type=str,
                      help="The name of the workflow.")

  subparsers = parser.add_subparsers()

  parser_teardown = subparsers.add_parser(
    "teardown", help="teardown the test infrastructure.")
  parser_teardown.set_defaults(func=teardown)

  parser_tf_serving = subparsers.add_parser(
    "deploy_model", help="Deploy a TF serving model.")
  parser_tf_serving.set_defaults(func=deploy_model)
  parser_tf_serving.add_argument(
    "--params",
    default="",
    type=str,
    help=("Comma separated list of parameters to set on the model."))

  parser_pytorch_job = subparsers.add_parser("deploy_pytorchjob",
                                             help="Deploy a pytorch-job")
  parser_pytorch_job.set_defaults(func=deploy_pytorchjob)
  parser_pytorch_job.add_argument(
    "--params",
    default="",
    type=str,
    help=("Comma separated list of parameters to set on the model."))

  parser_argo_job = subparsers.add_parser("deploy_argo", help="Deploy argo")
  parser_argo_job.set_defaults(func=deploy_argo)

  parser_katib_test = subparsers.add_parser("test_katib", help="Test Katib")
  parser_katib_test.set_defaults(func=test_katib)

  parser_minikube = subparsers.add_parser(
    "deploy_minikube", help="Setup a K8s cluster on minikube.")
  parser_minikube.set_defaults(func=deploy_minikube)
  parser_minikube.add_argument("--vm_name", required=True, type=str,
                               help="The name of the VM to use.")
  parser_minikube.add_argument("--zone", default="us-east1-d", type=str,
                               help="The zone for the cluster.")

  parser_teardown_minikube = subparsers.add_parser(
    "teardown_minikube", help="Delete the VM running minikube.")
  parser_teardown_minikube.set_defaults(func=teardown_minikube)
  parser_teardown_minikube.add_argument("--zone", default="us-east1-d",
                                        type=str,
                                        help="The zone for the cluster.")
  parser_teardown_minikube.add_argument("--vm_name", required=True, type=str,
                                        help="The name of the VM to use.")

  args = parser.parse_args()

  if not args.test_dir:
    logging.info("--test_dir not set; using a temporary directory.")

    now = datetime.datetime.now()
    label = "test_deploy-" + now.strftime(
      "%m%d-%H%M-") + uuid.uuid4().hex[0:4]

    # Create a temporary directory for this test run
    args.test_dir = os.path.join(tempfile.gettempdir(), label)

  if not args.artifacts_dir:
    args.artifacts_dir = args.test_dir

  test_log = os.path.join(
    args.artifacts_dir, "logs",
    "test_deploy." + args.func.__name__ + args.deploy_name + ".log.txt")
  log_dir = os.path.dirname(test_log)
  try:
    os.makedirs(log_dir)
  except OSError as exc:  # Python >2.5
    # An already existing directory is fine; anything else is an error.
    if not (exc.errno == errno.EEXIST and os.path.isdir(log_dir)):
      raise

  # TODO(jlewi): We should make this a util routine in kubeflow.testing.util
  # Setup a logging file handler. This way we can upload the log outputs
  # to gubernator.
  root_logger = logging.getLogger()

  file_handler = logging.FileHandler(test_log)
  root_logger.addHandler(file_handler)
  # We need to explicitly set the formatter because it will not pick up
  # the BasicConfig.
  formatter = logging.Formatter(
    fmt=("%(levelname)s|%(asctime)s"
         "|%(pathname)s|%(lineno)d| %(message)s"),
    datefmt="%Y-%m-%dT%H:%M:%S")
  file_handler.setFormatter(formatter)
  logging.info("Logging to %s", test_log)
  # Log the ksonnet version; the binary is invoked by its literal name.
  util.run(["ks", "version"])

  util.maybe_activate_service_account()

  # Print out the config to help debugging.
  output = util.run_and_output(["gcloud", "config", "config-helper"])
  logging.info("gcloud config: \n%s", output)
  wrap_test(args)
def run(args, file_handler):  # pylint: disable=too-many-statements,too-many-branches
  """Submit the configured Argo workflows and wait for them to finish.

  The prow environment variables JOB_TYPE, JOB_NAME, REPO_OWNER, REPO_NAME,
  PULL_BASE_SHA, PULL_NUMBER, PULL_PULL_SHA and BUILD_NUMBER are read to
  decide which workflows to run and how to name them.

  Args:
    args: Parsed command line arguments. The attributes read here are
      repos_dir, release, config_file, bucket, project, zone and cluster.
    file_handler: The logging file handler; it is flushed and the file it
      writes to (baseFilename) is uploaded to GCS as build-log.txt.

  Returns:
    The value returned by prow_artifacts.finalize_prow_job.
  """
  job_type = os.getenv("JOB_TYPE")
  repo_owner = os.getenv("REPO_OWNER")
  repo_name = os.getenv("REPO_NAME")
  pull_base_sha = os.getenv("PULL_BASE_SHA")

  # For presubmit/postsubmit jobs, find the list of files changed by the PR.
  diff_command = []
  if job_type == "presubmit":
    diff_command = ["git", "diff", "--name-only", "master"]
  elif job_type == "postsubmit":
    diff_command = ["git", "diff", "--name-only", pull_base_sha + "^",
                    pull_base_sha]

  changed_files = []
  if job_type == "presubmit" or job_type == "postsubmit":
    changed_files = util.run(
      diff_command,
      cwd=os.path.join(args.repos_dir, repo_owner, repo_name)).splitlines()

  for f in changed_files:
    logging.info("File %s is modified.", f)

  if args.release:
    generate_env_from_head(args)
  workflows = []
  if args.config_file:
    workflows.extend(parse_config_file(args.config_file, args.repos_dir))

  create_started_file(args.bucket)

  util.maybe_activate_service_account()

  util.configure_kubectl(args.project, args.zone, args.cluster)
  util.load_kube_config()

  workflow_names = []
  ui_urls = {}

  for w in workflows:
    # Create the name for the workflow
    # We truncate sha numbers to prevent the workflow name from being too large.
    # Workflow name should not be more than 63 characters because its used
    # as a label on the pods.
    workflow_name = os.getenv("JOB_NAME") + "-" + w.name
    ks_cmd = get_ksonnet_cmd(w)

    # Print ksonnet version
    util.run([ks_cmd, "version"])

    # Skip this workflow if it is scoped to a different job type.
    if w.job_types and job_type not in w.job_types:
      logging.info("Skipping workflow %s because job type %s is not one of "
                   "%s.", w.name, job_type, w.job_types)
      continue

    # If we are scoping this workflow to specific directories, check if any
    # files modified match the specified regex patterns.
    dir_modified = False
    if w.include_dirs:
      for f in changed_files:
        for d in w.include_dirs:
          if fnmatch.fnmatch(f, d):
            dir_modified = True
            logging.info(
              "Triggering workflow %s because %s in dir %s is modified.",
              w.name, f, d)
            break
        if dir_modified:
          break

    # Only consider modified files when the job is pre or post submit, and if
    # the include_dirs stanza is defined.
    if job_type != "periodic" and w.include_dirs and not dir_modified:
      logging.info("Skipping workflow %s because no code modified in %s.",
                   w.name, w.include_dirs)
      continue

    if job_type == "presubmit":
      workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
      workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])
    elif job_type == "postsubmit":
      workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

    workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))

    salt = uuid.uuid4().hex[0:4]
    # Add some salt. This is mostly a convenience for the case where you
    # are submitting jobs manually for testing/debugging. Since the prow should
    # vend unique build numbers for each job.
    workflow_name += "-{0}".format(salt)

    workflow_names.append(workflow_name)
    # Create a new environment for this run
    env = workflow_name

    util.run([ks_cmd, "env", "add", env], cwd=w.app_dir)

    util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
              "name", workflow_name],
             cwd=w.app_dir)

    # Set the prow environment variables.
    prow_env = []

    names = ["JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
             "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER",
             "REPO_NAME"]
    names.sort()
    for v in names:
      if not os.getenv(v):
        continue
      prow_env.append("{0}={1}".format(v, os.getenv(v)))

    util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "prow_env",
              ",".join(prow_env)], cwd=w.app_dir)
    util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "namespace",
              get_namespace(args)], cwd=w.app_dir)
    util.run([ks_cmd, "param", "set", "--env=" + env, w.component, "bucket",
              args.bucket], cwd=w.app_dir)
    if args.release:
      util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
                "versionTag", os.getenv("VERSION_TAG")], cwd=w.app_dir)

    # Set any extra params. We do this in alphabetical order to make it easier
    # to verify in the unittest. sorted() is used because a dict's keys view
    # has no sort() method on Python 3.
    for k in sorted(w.params):
      util.run([ks_cmd, "param", "set", "--env=" + env, w.component, k,
                "{0}".format(w.params[k])], cwd=w.app_dir)

    # For debugging print out the manifest
    util.run([ks_cmd, "show", env, "-c", w.component], cwd=w.app_dir)
    util.run([ks_cmd, "apply", env, "-c", w.component], cwd=w.app_dir)

    ui_url = ("http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
              "?tab=workflow".format(workflow_name))
    ui_urls[workflow_name] = ui_url
    logging.info("URL for workflow: %s", ui_url)

  success = True
  workflow_phase = {}
  try:
    results = argo_client.wait_for_workflows(
      get_namespace(args), workflow_names,
      timeout=datetime.timedelta(minutes=180),
      status_callback=argo_client.log_status)
    for r in results:
      phase = r.get("status", {}).get("phase")
      name = r.get("metadata", {}).get("name")
      workflow_phase[name] = phase
      if phase != "Succeeded":
        success = False
      logging.info("Workflow %s/%s finished phase: %s", get_namespace(args),
                   name, phase)
  except util.TimeoutError:
    success = False
    logging.exception("Time out waiting for Workflows %s to finish",
                      ",".join(workflow_names))
  except Exception as e:
    # We explicitly log any exceptions so that they will be captured in the
    # build-log.txt that is uploaded to Gubernator.
    logging.exception("Exception occurred: %s", e)
    raise
  finally:
    success = prow_artifacts.finalize_prow_job(args.bucket, success,
                                               workflow_phase, ui_urls)

    # Upload logs to GCS. No logs after this point will appear in the
    # file in gcs
    file_handler.flush()
    util.upload_file_to_gcs(
      file_handler.baseFilename,
      os.path.join(prow_artifacts.get_gcs_dir(args.bucket), "build-log.txt"))

  return success
def deploy_kubeflow(test_case):
  """Deploy Kubeflow.

  Generates and applies the ksonnet components and then waits for the jupyter
  statefulset and the operator deployments to come up.

  Args:
    test_case: The running test case; its test_suite.test_dir is used as the
      directory in which the ksonnet app is set up.
  """
  args = parse_args()
  work_dir = test_case.test_suite.test_dir
  namespace = args.namespace
  api_client = deploy_utils.create_k8s_client()
  app_dir = deploy_utils.setup_kubeflow_ks_app(work_dir, namespace,
                                               args.github_token, api_client)

  # ks generate tf-job-operator tf-job-operator
  # TODO(jlewi): We don't need to generate a core component if we are
  # just deploying TFServing. Might be better to refactor this code.
  # Deploy Kubeflow
  components = ("tf-job-operator", "pytorch-operator", "jupyter", "katib")
  for prototype in components:
    # Each component is generated from the prototype of the same name.
    util.run(["ks", "generate", prototype, prototype], cwd=app_dir)

  apply_command = ["ks", "apply", "default"]
  for component in components:
    apply_command.extend(["-c", component])

  if args.as_gcloud_user:
    account = deploy_utils.get_gcp_identity()
    logging.info("Impersonate %s", account)
    # If we don't use --as to impersonate the service account then we
    # observe RBAC errors when doing certain operations. The problem appears
    # to be that we end up using the in cluster config (e.g. pod service
    # account) and not the GCP service account which has more privileges.
    apply_command.append("--as=" + account)
  util.run(apply_command, cwd=app_dir)

  # Verify that Jupyter is actually deployed.
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace, "jupyter")

  # Verify that core components are actually deployed.
  for deployment_name in ("tf-job-operator-v1beta1", "pytorch-operator",
                          "studyjob-controller"):
    logging.info("Verifying that %s started...", deployment_name)
    util.wait_for_deployment(api_client, namespace, deployment_name)