def main():  # pylint: disable=too-many-locals
  """Parse CLI flags for the Kubeflow E2E test and dispatch setup/teardown.

  Side effects: raises the root logger level to INFO, attaches a file handler
  writing under the artifacts directory (so logs can be uploaded to
  gubernator), activates a GCP service account if one is configured, and
  finally invokes the selected subcommand through wrap_test.
  """
  logging.getLogger().setLevel(logging.INFO)

  def _parse_bool(value):
    """Parse a boolean command-line flag value.

    Fixes the classic argparse trap: `type=bool` treats ANY non-empty string
    as True (bool("False") is True), so `--deploy_tf_serving=False` used to
    enable the component. Accept the common truthy spellings explicitly.
    """
    if isinstance(value, bool):
      return value
    return value.strip().lower() in ("true", "t", "yes", "y", "1")

  # create the top-level parser
  parser = argparse.ArgumentParser(description="Test Kubeflow E2E.")

  parser.add_argument(
      "--test_dir",
      default="",
      type=str,
      help="Directory to use for all the test files. If not set a temporary "
      "directory is created.")

  parser.add_argument(
      "--artifacts_dir",
      default="",
      type=str,
      help="Directory to use for artifacts that should be preserved after "
      "the test runs. Defaults to test_dir if not set.")

  parser.add_argument(
      "--project", default=None, type=str, help="The project to use.")

  parser.add_argument(
      "--cluster",
      default=None,
      type=str,
      help=("The name of the cluster. If not set assumes the "
            "script is running in a cluster and uses that cluster."))

  parser.add_argument(
      "--namespace", required=True, type=str, help=("The namespace to use."))

  parser.add_argument(
      "--zone",
      default="us-east1-d",
      type=str,
      help="The zone for the cluster.")

  parser.add_argument(
      "--github_token",
      default=None,
      type=str,
      help=("The GitHub API token to use. This is needed since ksonnet uses "
            "the GitHub API and without it we get rate limited. For more "
            "info see: "
            "https://github.com/ksonnet/ksonnet/blob/master/docs"
            "/troubleshooting.md. Can also be set using environment variable "
            "GITHUB_TOKEN."))

  subparsers = parser.add_subparsers()

  parser_setup = subparsers.add_parser(
      "setup", help="setup the test infrastructure.")
  parser_setup.set_defaults(func=setup)

  parser_teardown = subparsers.add_parser(
      "teardown", help="teardown the test infrastructure.")
  parser_teardown.set_defaults(func=teardown)

  parser_setup.add_argument(
      "--deploy_tf_serving",
      default=False,
      # NOTE: was `type=bool`, which silently returned True for any non-empty
      # string; _parse_bool interprets the value correctly.
      type=_parse_bool,
      help=("If True, deploy the tf-serving component."))

  parser_setup.add_argument(
      "--model_server_image",
      default="gcr.io/kubeflow/model-server:1.0",
      type=str,
      help=("The TF serving image to use."))

  args = parser.parse_args()

  if not args.test_dir:
    logging.info("--test_dir not set; using a temporary directory.")

    now = datetime.datetime.now()
    label = "test_deploy-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

    # Create a temporary directory for this test run
    args.test_dir = os.path.join(tempfile.gettempdir(), label)

  if not args.artifacts_dir:
    args.artifacts_dir = args.test_dir

  test_log = os.path.join(args.artifacts_dir, "logs",
                          "test_deploy." + args.func.__name__ + ".log.txt")
  if not os.path.exists(os.path.dirname(test_log)):
    os.makedirs(os.path.dirname(test_log))

  # TODO(jlewi): We should make this a util routine in kubeflow.testing.util
  # Setup a logging file handler. This way we can upload the log outputs
  # to gubernator.
  root_logger = logging.getLogger()

  file_handler = logging.FileHandler(test_log)
  root_logger.addHandler(file_handler)
  # We need to explicitly set the formatter because it will not pick up
  # the BasicConfig.
  formatter = logging.Formatter(
      fmt=("%(levelname)s|%(asctime)s"
           "|%(pathname)s|%(lineno)d| %(message)s"),
      datefmt="%Y-%m-%dT%H:%M:%S")
  file_handler.setFormatter(formatter)
  logging.info("Logging to %s", test_log)

  util.maybe_activate_service_account()

  wrap_test(args)
def run(args, file_handler): # pylint: disable=too-many-statements,too-many-branches
  """Submit the configured Argo workflows for this Prow job and wait for them.

  For presubmit/postsubmit jobs the list of files changed by the PR/commit is
  computed with `git diff` and used to skip workflows whose `include_dirs`
  patterns match none of the changed files. Each remaining workflow gets a
  uniquely named ksonnet environment, is applied to the cluster, and is then
  awaited; results are finalized as Prow artifacts and the log file is
  uploaded to GCS.

  Args:
    args: Parsed command-line arguments. Reads repos_dir, release,
      config_file, bucket, project, zone, cluster and the namespace used by
      get_namespace(args).
    file_handler: logging.FileHandler whose file is flushed and uploaded to
      GCS as build-log.txt when the job finishes.

  Returns:
    bool: True if all workflows succeeded, False otherwise (also reflects
    prow_artifacts.finalize_prow_job's verdict).
  """
  # These environment variables are injected by Prow; absent when run
  # manually, in which case job_type is None and the diff logic is skipped.
  job_type = os.getenv("JOB_TYPE")
  repo_owner = os.getenv("REPO_OWNER")
  repo_name = os.getenv("REPO_NAME")
  pull_base_sha = os.getenv("PULL_BASE_SHA")

  # For presubmit/postsubmit jobs, find the list of files changed by the PR.
  diff_command = []
  if job_type == "presubmit":
    # We need to get a common ancestor for the PR and the master branch
    common_ancestor = util.run(["git", "merge-base", "HEAD", "master"],
                               cwd=os.path.join(args.repos_dir, repo_owner,
                                                repo_name))
    diff_command = ["git", "diff", "--name-only", common_ancestor]
  elif job_type == "postsubmit":
    # See: https://git-scm.com/docs/git-diff
    # This syntax compares the commit before pull_base_sha with the commit
    # at pull_base_sha
    diff_command = ["git", "diff", "--name-only", pull_base_sha + "^",
                    pull_base_sha]

  changed_files = []
  if job_type == "presubmit" or job_type == "postsubmit":
    changed_files = util.run(diff_command,
                             cwd=os.path.join(args.repos_dir, repo_owner,
                                              repo_name)).splitlines()
    for f in changed_files:
      logging.info("File %s is modified.", f)

  if args.release:
    generate_env_from_head(args)
  workflows = []
  if args.config_file:
    workflows.extend(parse_config_file(args.config_file, args.repos_dir))

  create_started_file(args.bucket)

  util.maybe_activate_service_account()

  util.configure_kubectl(args.project, args.zone, args.cluster)
  util.load_kube_config()

  workflow_names = []
  ui_urls = {}

  for w in workflows:
    # Create the name for the workflow
    # We truncate sha numbers to prevent the workflow name from being too large.
    # Workflow name should not be more than 63 characters because its used
    # as a label on the pods.
    workflow_name = os.getenv("JOB_NAME") + "-" + w.name
    ks_cmd = ks_util.get_ksonnet_cmd(w.app_dir)

    # Print ksonnet version
    util.run([ks_cmd, "version"])

    # Skip this workflow if it is scoped to a different job type.
    if w.job_types and not job_type in w.job_types:
      logging.info(
          "Skipping workflow %s because job type %s is not one of "
          "%s.", w.name, job_type, w.job_types)
      continue

    # If we are scoping this workflow to specific directories, check if any files
    # modified match the specified regex patterns.
    dir_modified = False
    if w.include_dirs:
      for f in changed_files:
        for d in w.include_dirs:
          if fnmatch.fnmatch(f, d):
            dir_modified = True
            logging.info(
                "Triggering workflow %s because %s in dir %s is modified.",
                w.name, f, d)
            break
        if dir_modified:
          break

    # Only consider modified files when the job is pre or post submit, and if
    # the include_dirs stanza is defined.
    if job_type != "periodic" and w.include_dirs and not dir_modified:
      logging.info(
          "Skipping workflow %s because no code modified in %s.", w.name,
          w.include_dirs)
      continue

    if job_type == "presubmit":
      workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
      workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])

    elif job_type == "postsubmit":
      workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

    workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))

    salt = uuid.uuid4().hex[0:4]
    # Add some salt. This is mostly a convenience for the case where you
    # are submitting jobs manually for testing/debugging. Since the prow should
    # vend unique build numbers for each job.
    workflow_name += "-{0}".format(salt)

    workflow_names.append(workflow_name)
    # Create a new environment for this run
    env = workflow_name

    util.run(
        [ks_cmd, "env", "add", env, "--namespace=" + get_namespace(args)],
        cwd=w.app_dir)

    util.run([
        ks_cmd, "param", "set", "--env=" + env, w.component, "name",
        workflow_name
    ],
             cwd=w.app_dir)

    # Set the prow environment variables.
    prow_env = []

    names = [
        "JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER", "PULL_BASE_SHA",
        "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER", "REPO_NAME"
    ]
    names.sort()
    for v in names:
      if not os.getenv(v):
        continue
      prow_env.append("{0}={1}".format(v, os.getenv(v)))

    util.run([
        ks_cmd, "param", "set", "--env=" + env, w.component, "prow_env",
        ",".join(prow_env)
    ],
             cwd=w.app_dir)
    util.run([
        ks_cmd, "param", "set", "--env=" + env, w.component, "namespace",
        get_namespace(args)
    ],
             cwd=w.app_dir)
    util.run([
        ks_cmd, "param", "set", "--env=" + env, w.component, "bucket",
        args.bucket
    ],
             cwd=w.app_dir)
    if args.release:
      util.run([
          ks_cmd, "param", "set", "--env=" + env, w.component, "versionTag",
          os.getenv("VERSION_TAG")
      ],
               cwd=w.app_dir)

    # Set any extra params. We do this in alphabetical order to make it easier to verify in
    # the unittest.
    param_names = w.params.keys()
    param_names.sort()
    for k in param_names:
      util.run([
          ks_cmd, "param", "set", "--env=" + env, w.component, k,
          "{0}".format(w.params[k])
      ],
               cwd=w.app_dir)

    # For debugging print out the manifest
    util.run([ks_cmd, "show", env, "-c", w.component], cwd=w.app_dir)
    util.run([ks_cmd, "apply", env, "-c", w.component], cwd=w.app_dir)

    ui_url = (
        "http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
        "?tab=workflow".format(workflow_name))
    ui_urls[workflow_name] = ui_url
    logging.info("URL for workflow: %s", ui_url)

  success = True
  workflow_phase = {}
  try:
    results = argo_client.wait_for_workflows(
        get_namespace(args),
        workflow_names,
        timeout=datetime.timedelta(minutes=180),
        status_callback=argo_client.log_status)
    for r in results:
      phase = r.get("status", {}).get("phase")
      name = r.get("metadata", {}).get("name")
      workflow_phase[name] = phase
      if phase != "Succeeded":
        success = False
      logging.info("Workflow %s/%s finished phase: %s", get_namespace(args),
                   name, phase)
  except util.TimeoutError:
    success = False
    logging.exception("Time out waiting for Workflows %s to finish",
                      ",".join(workflow_names))
  except Exception as e:
    # We explicitly log any exceptions so that they will be captured in the
    # build-log.txt that is uploaded to Gubernator.
    logging.exception("Exception occurred: %s", e)
    raise
  finally:
    success = prow_artifacts.finalize_prow_job(args.bucket, success,
                                               workflow_phase, ui_urls)

    # Upload logs to GCS. No logs after this point will appear in the
    # file in gcs
    file_handler.flush()
    util.upload_file_to_gcs(
        file_handler.baseFilename,
        os.path.join(prow_artifacts.get_gcs_dir(args.bucket),
                     "build-log.txt"))

  return success
def main():  # pylint: disable=too-many-locals
  """Parse CLI flags for cluster setup/teardown and dispatch the subcommand.

  Subcommands: setup_cluster, setup_kubeflow and teardown; each registers its
  handler through set_defaults(func=...) and the selected handler is invoked
  with the parsed args. Also activates a GCP service account if one is
  configured.
  """
  logging.getLogger().setLevel(logging.INFO)
  util.maybe_activate_service_account()

  now = datetime.datetime.now()

  # create the top-level parser
  parser = argparse.ArgumentParser(description="Setup clusters for testing.")
  subparsers = parser.add_subparsers()

  #############################################################################
  # setup
  #
  parser_setup = subparsers.add_parser(
      "setup_cluster", help="Setup a cluster for testing.")

  parser_setup.add_argument(
      "--accelerator",
      dest="accelerators",
      action="append",
      help="Accelerator to add to the cluster. Should be of the form type=count.")

  parser_setup.add_argument(
      "--namespace",
      default="kubeflow-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4],
      # NOTE: help text previously said "The directory containing the ksonnet
      # app used for testing." — a copy-paste from --test_app_dir.
      help="The namespace to use for the tests.",
  )

  parser_setup.set_defaults(func=setup_cluster)
  add_common_args(parser_setup)

  parser_kubeflow = subparsers.add_parser(
      "setup_kubeflow", help="Deploy Kubeflow for testing.")

  parser_kubeflow.add_argument(
      "--tf_job_version",
      dest="tf_job_version",
      help="Which version of the TFJobOperator to deploy.")

  parser_kubeflow.set_defaults(func=setup_kubeflow)

  parser_kubeflow.add_argument(
      "--namespace",
      default="kubeflow-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4],
      # NOTE: help text fixed; it previously described --test_app_dir.
      help="The namespace to use for the tests.",
  )

  parser_kubeflow.add_argument(
      "--image",
      help="The image to use",
  )

  add_common_args(parser_kubeflow)

  parser_kubeflow.add_argument(
      "--test_app_dir",
      help="The directory containing the ksonnet app used for testing.",
  )

  #############################################################################
  # teardown
  #
  parser_teardown = subparsers.add_parser(
      "teardown", help="Teardown the cluster.")

  parser_teardown.set_defaults(func=teardown)
  add_common_args(parser_teardown)

  # parse the args and call whatever function was selected
  args = parser.parse_args()
  args.func(args)
def main():  # pylint: disable=too-many-locals,too-many-statements
  """Parse CLI flags for the deploy/test subcommands and dispatch via wrap_test.

  Subcommands cover TF-serving model deployment, pytorch jobs, argo, katib
  tests and minikube lifecycle. Side effects: configures INFO-level logging
  with a file handler under the artifacts directory, prints the ksonnet
  version and gcloud config for debugging, and activates a GCP service
  account if one is configured.
  """
  logging.getLogger().setLevel(logging.INFO)
  # create the top-level parser
  parser = argparse.ArgumentParser(description="Test Kubeflow E2E.")

  parser.add_argument(
      "--test_dir",
      default="",
      type=str,
      help="Directory to use for all the test files. If not set a temporary "
      "directory is created.")

  parser.add_argument(
      "--artifacts_dir",
      default="",
      type=str,
      help="Directory to use for artifacts that should be preserved after "
      "the test runs. Defaults to test_dir if not set.")

  parser.add_argument(
      "--as_gcloud_user",
      dest="as_gcloud_user",
      action="store_true",
      help=("Impersonate the user corresponding to the gcloud "
            "command with kubectl and ks."))
  parser.add_argument(
      "--no-as_gcloud_user", dest="as_gcloud_user", action="store_false")
  parser.set_defaults(as_gcloud_user=False)

  # TODO(jlewi): This should not be a global flag.
  parser.add_argument(
      "--project", default=None, type=str, help="The project to use.")

  # TODO(jlewi): This should not be a global flag.
  parser.add_argument(
      "--namespace", default=None, type=str, help=("The namespace to use."))

  parser.add_argument(
      "--github_token",
      default=None,
      type=str,
      help=("The GitHub API token to use. This is needed since ksonnet uses "
            "the GitHub API and without it we get rate limited. For more "
            "info see: "
            "https://github.com/ksonnet/ksonnet/blob/master/docs"
            "/troubleshooting.md. Can also be set using environment variable "
            "GITHUB_TOKEN."))

  parser.add_argument(
      "--deploy_name",
      default="",
      type=str,
      help="The name of the deployment.")

  subparsers = parser.add_subparsers()

  parser_teardown = subparsers.add_parser(
      "teardown", help="teardown the test infrastructure.")
  parser_teardown.set_defaults(func=teardown)

  parser_tf_serving = subparsers.add_parser(
      "deploy_model", help="Deploy a TF serving model.")
  parser_tf_serving.set_defaults(func=deploy_model)
  parser_tf_serving.add_argument(
      "--params",
      default="",
      type=str,
      help=("Comma separated list of parameters to set on the model."))

  parser_pytorch_job = subparsers.add_parser(
      "deploy_pytorchjob", help="Deploy a pytorch-job")
  parser_pytorch_job.set_defaults(func=deploy_pytorchjob)
  parser_pytorch_job.add_argument(
      "--params",
      default="",
      type=str,
      help=("Comma separated list of parameters to set on the model."))

  parser_argo_job = subparsers.add_parser("deploy_argo", help="Deploy argo")
  parser_argo_job.set_defaults(func=deploy_argo)

  parser_katib_test = subparsers.add_parser("test_katib", help="Test Katib")
  parser_katib_test.set_defaults(func=test_katib)

  parser_minikube = subparsers.add_parser(
      "deploy_minikube", help="Setup a K8s cluster on minikube.")
  parser_minikube.set_defaults(func=deploy_minikube)
  parser_minikube.add_argument(
      "--vm_name", required=True, type=str, help="The name of the VM to use.")
  parser_minikube.add_argument(
      "--zone",
      default="us-east1-d",
      type=str,
      help="The zone for the cluster.")

  parser_teardown_minikube = subparsers.add_parser(
      "teardown_minikube", help="Delete the VM running minikube.")
  parser_teardown_minikube.set_defaults(func=teardown_minikube)
  parser_teardown_minikube.add_argument(
      "--zone",
      default="us-east1-d",
      type=str,
      help="The zone for the cluster.")
  parser_teardown_minikube.add_argument(
      "--vm_name", required=True, type=str, help="The name of the VM to use.")

  args = parser.parse_args()

  if not args.test_dir:
    logging.info("--test_dir not set; using a temporary directory.")

    now = datetime.datetime.now()
    label = "test_deploy-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

    # Create a temporary directory for this test run
    args.test_dir = os.path.join(tempfile.gettempdir(), label)

  if not args.artifacts_dir:
    args.artifacts_dir = args.test_dir

  test_log = os.path.join(
      args.artifacts_dir, "logs",
      "test_deploy." + args.func.__name__ + args.deploy_name + ".log.txt")
  if not os.path.exists(os.path.dirname(test_log)):
    os.makedirs(os.path.dirname(test_log))

  # TODO(jlewi): We should make this a util routine in kubeflow.testing.util
  # Setup a logging file handler. This way we can upload the log outputs
  # to gubernator.
  root_logger = logging.getLogger()

  file_handler = logging.FileHandler(test_log)
  root_logger.addHandler(file_handler)
  # We need to explicitly set the formatter because it will not pick up
  # the BasicConfig.
  formatter = logging.Formatter(
      fmt=("%(levelname)s|%(asctime)s"
           "|%(pathname)s|%(lineno)d| %(message)s"),
      datefmt="%Y-%m-%dT%H:%M:%S")
  file_handler.setFormatter(formatter)
  logging.info("Logging to %s", test_log)

  util.run(["ks", "version"])

  util.maybe_activate_service_account()

  # NOTE: a dead assignment of kube_config.KUBE_CONFIG_DEFAULT_LOCATION to an
  # unused local (config_file) was removed here; nothing in this function
  # read it.

  # Print out the config to help debugging.
  output = util.run_and_output(["gcloud", "config", "config-helper"])
  logging.info("gcloud config: \n%s", output)
  wrap_test(args)
def run(args, file_handler): # pylint: disable=too-many-statements,too-many-branches
  """Submit Argo and Tekton workflows for this Prow job and wait for them.

  Computes the set of changed files (presubmit/postsubmit) to decide which
  workflows to trigger, then launches each workflow via one of three paths:
  a ksonnet app (w.app_dir), a Tekton pipeline (w.tekton_run), or a Python
  function rendered through e2e_tool (w.py_func). Afterwards it waits for all
  workflows, uploads per-workflow status YAML and the build log to GCS, and
  finalizes the Prow job artifacts.

  Args:
    args: Parsed command-line arguments. Reads repos_dir, release,
      config_file, bucket, project, zone, cluster, tekton_namespace and the
      namespace used by get_namespace(args).
    file_handler: logging.FileHandler whose file is flushed and uploaded to
      GCS as build-log.txt when the job finishes.

  Returns:
    bool: True if all workflows succeeded and prow artifact finalization
    agreed; False otherwise.
  """
  # Check https://github.com/kubernetes/test-infra/blob/master/prow/jobs.md
  # for a description of the injected environment variables.
  job_type = os.getenv("JOB_TYPE")
  repo_owner = os.getenv("REPO_OWNER")
  repo_name = os.getenv("REPO_NAME")
  base_branch_name = os.getenv("PULL_BASE_REF")
  pull_base_sha = os.getenv("PULL_BASE_SHA")

  # For presubmit/postsubmit jobs, find the list of files changed by the PR.
  diff_command = []
  if job_type == "presubmit":
    # We need to get a common ancestor for the PR and the base branch
    cloned_repo_dir = os.path.join(args.repos_dir, repo_owner, repo_name)
    # Fetch the base branch so merge-base has a local ref to compare against.
    _ = util.run(["git", "fetch", "origin",
                  base_branch_name + ":refs/remotes/origin/" +
                  base_branch_name],
                 cwd=cloned_repo_dir)
    diff_command = ["git", "diff", "--name-only"]
    diff_branch = "remotes/origin/{}".format(base_branch_name)
    try:
      common_ancestor = util.run(["git", "merge-base", "HEAD", diff_branch],
                                 cwd=cloned_repo_dir)
      diff_command.append(common_ancestor)
    except subprocess.CalledProcessError as e:
      # Fall back to diffing against the branch tip; this may over-trigger
      # tests for files not actually changed in the PR.
      logging.warning("git merge-base failed; see "
                      "https://github.com/kubeflow/kubeflow/issues/3523. "
                      "Diff "
                      "will be computed against the current master and "
                      "therefore files not changed in the PR might be "
                      "considered when determining which tests to trigger")
      diff_command.append(diff_branch)

  elif job_type == "postsubmit":
    # See: https://git-scm.com/docs/git-diff
    # This syntax compares the commit before pull_base_sha with the commit
    # at pull_base_sha
    diff_command = ["git", "diff", "--name-only", pull_base_sha + "^",
                    pull_base_sha]

  changed_files = []
  if job_type in ("presubmit", "postsubmit"):
    changed_files = util.run(
        diff_command,
        cwd=os.path.join(args.repos_dir, repo_owner, repo_name)).splitlines()

  for f in changed_files:
    logging.info("File %s is modified.", f)

  if args.release:
    generate_env_from_head(args)
  workflows = []
  config = {}
  if args.config_file:
    config, new_workflows = parse_config_file(args.config_file,
                                              args.repos_dir)
    workflows.extend(new_workflows)

  # Add any paths to the python path
  extra_py_paths = []
  for p in config.get("python_paths", []):
    # Assume that python_paths are in the format $REPO_OWNER/$REPO_NAME/path,
    # we need to ensure that the repo is checked out if it is different from
    # the current one, and if the repo is not kubeflow/testing (which is already
    # checked out).
    segments = p.split("/")
    if ((segments[0] != repo_owner or segments[1] != repo_name)
        and not p.startswith("kubeflow/testing")):
      logging.info("Need to clone %s/%s", segments[0], segments[1])
      util.clone_repo(
          os.path.join(args.repos_dir, segments[0], segments[1]),
          segments[0], segments[1])
    path = os.path.join(args.repos_dir, p)
    extra_py_paths.append(path)

  kf_test_path = os.path.join(args.repos_dir, "kubeflow/testing/py")
  if kf_test_path not in extra_py_paths:
    logging.info("Adding %s to extra python paths", kf_test_path)
    extra_py_paths.append(kf_test_path)

  logging.info("Extra python paths: %s", ":".join(extra_py_paths))

  # Create an initial version of the file with no urls
  create_started_file(args.bucket, {})

  util.maybe_activate_service_account()

  util.configure_kubectl(args.project, args.zone, args.cluster)
  util.load_kube_config()

  tekton_runner = tekton_client.TektonRunner()
  workflow_names = []
  tkn_names = []
  # NOTE(review): tkn_cleanup_args is never appended to or read in this
  # function — looks vestigial; confirm before removing.
  tkn_cleanup_args = []
  ui_urls = {}

  for w in workflows: # pylint: disable=too-many-nested-blocks
    # Create the name for the workflow
    # We truncate sha numbers to prevent the workflow name from being too large.
    # Workflow name should not be more than 63 characters because its used
    # as a label on the pods.
    #
    # TODO(jlewi):This should no longer be used with Tekton. For tekton
    # name should be based on generateName; we should use labels to
    # provide additional metadata info like PR number.
    workflow_name = os.getenv("JOB_NAME", "") + "-" + w.name

    # Skip this workflow if it is scoped to a different job type.
    if w.job_types and not job_type in w.job_types:
      logging.info("Skipping workflow %s because job type %s is not one of "
                   "%s.", w.name, job_type, w.job_types)
      continue

    # If we are scoping this workflow to specific directories, check if any files
    # modified match the specified regex patterns.
    dir_modified = False
    if w.include_dirs:
      for f in changed_files:
        for d in w.include_dirs:
          if fnmatch.fnmatch(f, d):
            dir_modified = True
            logging.info(
                "Triggering workflow %s because %s in dir %s is modified.",
                w.name, f, d)
            break
        if dir_modified:
          break

    # Only consider modified files when the job is pre or post submit, and if
    # the include_dirs stanza is defined.
    if job_type != "periodic" and w.include_dirs and not dir_modified:
      logging.info("Skipping workflow %s because no code modified in %s.",
                   w.name, w.include_dirs)
      continue

    if job_type == "presubmit":
      # When not running under prow we might not set all environment variables
      if os.getenv("PULL_NUMBER"):
        workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))

      if os.getenv("PULL_PULL_SHA"):
        workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])

    elif job_type == "postsubmit":
      if os.getenv("PULL_BASE_SHA"):
        workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

    # Append the last 4 digits of the build number
    if os.getenv("BUILD_NUMBER"):
      workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER")[-4:])

    salt = uuid.uuid4().hex[0:4]
    # Add some salt. This is mostly a convenience for the case where you
    # are submitting jobs manually for testing/debugging. Since the prow should
    # vend unique build numbers for each job.
    workflow_name += "-{0}".format(salt)

    if w.tekton_run:
      tkn_names.append(workflow_name)
    else:
      workflow_names.append(workflow_name)

    # check if ks workflow and run
    if w.app_dir:
      ks_cmd = ks_util.get_ksonnet_cmd(w.app_dir)

      # Print ksonnet version
      util.run([ks_cmd, "version"])

      # Create a new environment for this run
      env = workflow_name
      util.run([ks_cmd, "env", "add", env,
                "--namespace=" + get_namespace(args)],
               cwd=w.app_dir)

      util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
                "name", workflow_name],
               cwd=w.app_dir)

      # Set the prow environment variables.
      prow_env = []

      names = ["JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
               "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA",
               "REPO_OWNER", "REPO_NAME"]
      names.sort()
      for v in names:
        if not os.getenv(v):
          continue
        prow_env.append("{0}={1}".format(v, os.getenv(v)))

      util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
                "prow_env", ",".join(prow_env)],
               cwd=w.app_dir)
      util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
                "namespace", get_namespace(args)],
               cwd=w.app_dir)
      util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
                "bucket", args.bucket],
               cwd=w.app_dir)
      if args.release:
        util.run([ks_cmd, "param", "set", "--env=" + env, w.component,
                  "versionTag", os.getenv("VERSION_TAG")],
                 cwd=w.app_dir)

      # Set any extra params. We do this in alphabetical order to make it easier to verify in
      # the unittest.
      param_names = w.params.keys()
      param_names.sort()
      for k in param_names:
        util.run([ks_cmd, "param", "set", "--env=" + env, w.component, k,
                  "{0}".format(w.params[k])],
                 cwd=w.app_dir)

      # For debugging print out the manifest
      util.run([ks_cmd, "show", env, "-c", w.component], cwd=w.app_dir)
      util.run([ks_cmd, "apply", env, "-c", w.component], cwd=w.app_dir)

      ui_url = ("http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
                "?tab=workflow".format(workflow_name))
      ui_urls[workflow_name] = ui_url
      logging.info("URL for workflow: %s", ui_url)
    elif w.tekton_run:
      # Pick the git revision the Tekton pipeline should check out:
      # the PR head if available, otherwise the base SHA, otherwise master.
      pull_revision = None
      if os.getenv("PULL_NUMBER"):
        pull_revision = "refs/pull/{pull_num}/head".format(
            pull_num=os.getenv("PULL_NUMBER"))
      elif os.getenv("PULL_BASE_SHA"):
        pull_revision = os.getenv("PULL_BASE_SHA")
      else:
        pull_revision = "master"

      logging.info("Adding Tekton pipeline %s", w.name)
      try:
        pipeline_runner = tekton_client.PipelineRunner(
            w.tekton_params, w.kwargs.get(TEST_TARGET_ARG_NAME, w.name),
            w.tekton_run, args.bucket, repo_owner, repo_name, pull_revision)
      except (FileNotFoundError, ValueError) as e:
        # A broken pipeline definition shouldn't sink the whole job; log
        # and move on to the next workflow.
        logging.error("Error when starting Tekton workflow:%s\n Exception %s;\n"
                      "stacktrace:\n%s", w.tekton_run, e,
                      traceback.format_exc())
        continue
      if w.tekton_teardown:
        logging.info("Appending teardown process for Tekton pipeline %s",
                     w.name)
        pipeline_runner.append_teardown(tekton_client.PipelineRunner(
            w.tekton_teardown_params,
            w.kwargs.get(TEST_TARGET_ARG_NAME, w.name),
            w.tekton_teardown, args.bucket, repo_owner, repo_name,
            pull_revision))
      tekton_runner.append(pipeline_runner)
    else:
      # py_func path: render an Argo workflow from a Python function.
      w.kwargs["name"] = workflow_name
      w.kwargs["namespace"] = get_namespace(args)

      if TEST_TARGET_ARG_NAME not in w.kwargs:
        w.kwargs[TEST_TARGET_ARG_NAME] = w.name
        logging.info("Workflow %s doesn't set arg %s; defaulting to %s",
                     w.name, TEST_TARGET_ARG_NAME,
                     w.kwargs[TEST_TARGET_ARG_NAME])

      # TODO(https://github.com/kubeflow/testing/issues/467): We shell out
      # to e2e_tool in order to dumpy the Argo workflow to a file which then
      # reimport. We do this because importing the py_func module appears
      # to break when we have to dynamically adjust sys.path to insert
      # new paths. Setting PYTHONPATH before launching python however appears
      # to work which is why we shell out to e2e_tool.
      command = ["python", "-m", "kubeflow.testing.e2e_tool", "show",
                 w.py_func]
      for k, v in w.kwargs.items():
        # The fire module turns underscores in parameter names into hyphens
        # so we convert underscores in parameter names to hyphens
        command.append("--{0}={1}".format(k.replace("_", "-"), v))

      # delete=False so the file survives the `with` and can be read back
      # after the subprocess writes to it.
      with tempfile.NamedTemporaryFile(delete=False) as hf:
        workflow_file = hf.name
        command.append("--output=" + hf.name)
      env = os.environ.copy()
      env["PYTHONPATH"] = ":".join(extra_py_paths)
      util.run(command, env=env)

      with open(workflow_file) as hf:
        wf_result = yaml.load(hf)

      group, version = wf_result['apiVersion'].split('/')
      k8s_co = k8s_client.CustomObjectsApi()
      workflow_name = wf_result["metadata"]["name"]
      py_func_result = k8s_co.create_namespaced_custom_object(
          group=group,
          version=version,
          namespace=wf_result["metadata"]["namespace"],
          plural='workflows',
          body=wf_result)
      logging.info("Created workflow:\n%s", yaml.safe_dump(py_func_result))

      ui_url = ("http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
                "?tab=workflow".format(workflow_name))
      ui_urls[workflow_name] = ui_url
      logging.info("URL for workflow: %s", ui_url)

  # Launch all accumulated Tekton pipelines on the Tekton cluster.
  ui_urls.update(tekton_runner.run(
      tekton_client.ClusterInfo(args.project, TEKTON_CLUSTER_ZONE,
                                TEKTON_CLUSTER_NAME),
      tekton_client.ClusterInfo(args.project, args.zone, args.cluster)))

  # We delay creating started.json until we know the Argo workflow URLs
  create_started_file(args.bucket, ui_urls)

  workflow_success = False
  workflow_phase = {}
  workflow_status_yamls = {}
  results = []
  tekton_results = []
  try:
    results = argo_client.wait_for_workflows(
        get_namespace(args), workflow_names,
        timeout=datetime.timedelta(minutes=180),
        status_callback=argo_client.log_status)
    # Switch kubectl context to the Tekton cluster before joining its runs.
    util.configure_kubectl(args.project, "us-east1-d", "kf-ci-v1")
    util.load_kube_config()
    tekton_results = tekton_runner.join()
    workflow_success = True
  except util.ExceptionWithWorkflowResults as e:
    # We explicitly log any exceptions so that they will be captured in the
    # build-log.txt that is uploaded to Gubernator.
    logging.exception("Exception occurred: %s", e)
    results = e.workflow_results
    raise
  except Exception as e:
    logging.exception("Other exception: %s", e)
    raise
  finally:
    # Restore kubectl context to the main test cluster before uploading.
    util.configure_kubectl(args.project, args.zone, args.cluster)
    util.load_kube_config()
    prow_artifacts_dir = prow_artifacts.get_gcs_dir(args.bucket)

    # Upload workflow status to GCS.
    for r in results:
      phase = r.get("status", {}).get("phase")
      name = r.get("metadata", {}).get("name")
      workflow_phase[name] = phase
      workflow_status_yamls[name] = yaml.safe_dump(r,
                                                   default_flow_style=False)
      if phase != "Succeeded":
        workflow_success = False
      logging.info("Workflow %s/%s finished phase: %s",
                   get_namespace(args), name, phase)

    for wf_name, wf_status in workflow_status_yamls.items():
      util.upload_to_gcs(
          wf_status,
          os.path.join(prow_artifacts_dir, '{}.yaml'.format(wf_name)))

    for r in tekton_results:
      # Tekton reports a condition reason rather than an Argo-style phase;
      # anything other than "Succeeded" counts as a failure.
      condition = "Failed"
      name = r.get("metadata", {}).get("name")
      if r.get("status", {}).get("conditions", []):
        condition = r["status"]["conditions"][0].get("reason", "Failed")
      workflow_phase[name] = condition
      workflow_status_yamls[name] = yaml.safe_dump(r,
                                                   default_flow_style=False)
      if condition != "Succeeded":
        workflow_success = False
      logging.info("Workflow %s/%s finished phase: %s",
                   args.tekton_namespace, name, condition)

    # Upload logs to GCS. No logs after this point will appear in the
    # file in gcs
    file_handler.flush()
    util.upload_file_to_gcs(
        file_handler.baseFilename,
        os.path.join(prow_artifacts_dir, "build-log.txt"))

  all_tests_success = prow_artifacts.finalize_prow_job(
      args.bucket, workflow_success, workflow_phase, ui_urls)

  return all_tests_success
def run_papermill_job(
    notebook_path, name, namespace, # pylint: disable=too-many-branches,too-many-statements
    repos, image,
    artifacts_gcs="",
    test_target_name=""):
  """Generate a K8s job to run a notebook using papermill

  Builds a batch Job from the local job.yaml template, points its init
  container at the repos to check out and its main container at the notebook,
  submits it, waits up to 30 minutes for completion, and copies the rendered
  notebook HTML into the Prow artifacts bucket.

  Args:
    notebook_path: Path to the notebook. This should be in the form
      "{REPO_OWNER}/{REPO}/path/to/notebook.ipynb"
    name: Name for the K8s job; if empty a timestamped name is generated.
    namespace: The namespace where the job should run.
    repos: Which repos to checkout; if None or empty tries to infer based on
      PROW environment variables
    image: The docker image to run the notebook in.
    artifacts_gcs: Optional GCS URI under which to store the artifacts;
      overrides the Prow-derived location.
    test_target_name: Optional subdirectory under artifacts_gcs for this
      test target.

  Raises:
    ValueError: If notebook_path is absolute or repos cannot be determined.
    RuntimeError: If the job does not complete or does not succeed.
  """
  util.maybe_activate_service_account()

  with open("job.yaml") as hf:
    # job.yaml is a plain-data template we control; safe_load avoids the
    # deprecated (and, pre-PyYAML-5.1, unsafe) bare yaml.load.
    job = yaml.safe_load(hf)

  if notebook_path.startswith("/"):
    raise ValueError(
        "notebook_path={0} should not start with /".format(notebook_path))

  # We need to checkout the correct version of the code
  # in presubmits and postsubmits. We should check the environment variables
  # for the prow environment variables to get the appropriate values.
  # We should probably also only do that if the
  # See
  # https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables
  if not repos:
    repos = argo_build_util.get_repo_from_prow_env()
    # Use lazy %-formatting for consistency with the rest of this module.
    logging.info("Using repos %s", repos)

  if not repos:
    raise ValueError("Could not get repos from prow environment variable "
                     "and --repos isn't explicitly set")

  repos += ",kubeflow/testing@HEAD"

  logging.info("Repos set to %s", repos)
  job["spec"]["template"]["spec"]["initContainers"][0]["command"] = [
      "/usr/local/bin/checkout_repos.sh",
      "--repos=" + repos,
      "--src_dir=/src",
      "--depth=all",
  ]

  job["spec"]["template"]["spec"]["containers"][0]["image"] = image

  full_notebook_path = os.path.join("/src", notebook_path)
  job["spec"]["template"]["spec"]["containers"][0]["command"] = [
      "python3", "-m", "kubeflow.examples.notebook_tests.execute_notebook",
      "--notebook_path", full_notebook_path
  ]

  job["spec"]["template"]["spec"]["containers"][0][
      "workingDir"] = os.path.dirname(full_notebook_path)

  # The prow bucket to use for results/artifacts
  prow_bucket = prow_artifacts.PROW_RESULTS_BUCKET

  if artifacts_gcs:
    prow_dir = os.path.join(artifacts_gcs, "artifacts")
    if test_target_name:
      prow_dir = os.path.join(prow_dir, test_target_name)
    logging.info("Prow artifacts directory: %s", prow_dir)
    prow_bucket, prow_path = util.split_gcs_uri(prow_dir)
  elif os.getenv("REPO_OWNER") and os.getenv("REPO_NAME"):
    # Running under prow
    prow_dir = prow_artifacts.get_gcs_dir(prow_bucket)
    logging.info("Prow artifacts dir: %s", prow_dir)
    prow_dir = os.path.join(prow_dir, "artifacts")

    if os.getenv("TEST_TARGET_NAME"):
      prow_dir = os.path.join(prow_dir,
                              os.getenv("TEST_TARGET_NAME").lstrip("/"))
    prow_bucket, prow_path = util.split_gcs_uri(prow_dir)
  else:
    # Not running under prow and no explicit destination; make up a unique
    # path so parallel manual runs don't collide.
    prow_path = "notebook-test" + datetime.datetime.now().strftime("%H%M%S")
    prow_path = prow_path + "-" + uuid.uuid4().hex[0:3]
    prow_dir = util.to_gcs_uri(prow_bucket, prow_path)

  prow_path = os.path.join(prow_path, name + ".html")
  output_gcs = util.to_gcs_uri(NB_BUCKET, prow_path)

  job["spec"]["template"]["spec"]["containers"][0]["env"] = [
      {
          "name": "OUTPUT_GCS",
          "value": output_gcs
      },
      {
          "name": "PYTHONPATH",
          "value": "/src/kubeflow/testing/py:/src/kubeflow/examples/py"
      },
  ]

  logging.info("Notebook will be written to %s", output_gcs)
  util.load_kube_config(persist_config=False)

  if name:
    job["metadata"]["name"] = name
  else:
    job["metadata"]["name"] = ("notebook-test-" +
                               datetime.datetime.now().strftime("%H%M%S") +
                               "-" + uuid.uuid4().hex[0:3])
  name = job["metadata"]["name"]

  job["metadata"]["namespace"] = namespace

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()
  batch_api = k8s_client.BatchV1Api(api_client)

  logging.info("Creating job:\n%s", yaml.dump(job))
  actual_job = batch_api.create_namespaced_job(job["metadata"]["namespace"],
                                               job)
  logging.info("Created job %s.%s:\n%s", namespace, name,
               yaml.safe_dump(actual_job.to_dict()))

  logging.info("*********************Job logs************************")
  logging.info(logs_for_job(PROJECT, name))
  logging.info("*****************************************************")
  final_job = util.wait_for_job(api_client, namespace, name,
                                timeout=datetime.timedelta(minutes=30))

  logging.info("Final job:\n%s", yaml.safe_dump(final_job.to_dict()))

  logging.info("*********************Job logs************************")
  logging.info(logs_for_job(PROJECT, name))
  logging.info("*****************************************************")

  # Download notebook html to artifacts
  logging.info("Copying %s to bucket %s", output_gcs, prow_bucket)

  storage_client = storage.Client()
  bucket = storage_client.get_bucket(NB_BUCKET)
  blob = bucket.get_blob(prow_path)

  destination_bucket = storage_client.get_bucket(prow_bucket)
  bucket.copy_blob(blob, destination_bucket)

  if not final_job.status.conditions:
    raise RuntimeError("Job {0}.{1}; did not complete".format(
        namespace, name))

  last_condition = final_job.status.conditions[-1]
  if last_condition.type not in ["Complete"]:
    logging.error("Job didn't complete successfully")
    raise RuntimeError("Job {0}.{1} failed".format(namespace, name))
def run(args, file_handler):  # pylint: disable=too-many-statements,too-many-branches
  """Submit an Argo workflow per configured component and wait for them all.

  Args:
    args: Parsed CLI args; uses release, config_file, repos_dir, app_dir,
      component, bucket, project, zone and cluster.
    file_handler: logging.FileHandler whose backing file is uploaded to GCS
      as build-log.txt when the run finishes.

  Returns:
    True if every workflow succeeded and prow finalization reported success.
  """
  # Print ksonnet version
  util.run(["ks", "version"])

  if args.release:
    generate_env_from_head(args)
  workflows = []
  if args.config_file:
    workflows.extend(parse_config_file(args.config_file, args.repos_dir))

  if args.app_dir and args.component:
    # TODO(jlewi): We can get rid of this branch once all repos are using a
    # prow_config.xml file.
    workflows.append(
        WorkflowComponent("legacy", args.app_dir, args.component, {}))
  create_started_file(args.bucket)

  util.maybe_activate_service_account()

  util.configure_kubectl(args.project, args.zone, args.cluster)
  util.load_kube_config()

  api_client = k8s_client.ApiClient()

  workflow_names = []
  ui_urls = {}

  for w in workflows:
    # Create the name for the workflow
    # We truncate sha numbers to prevent the workflow name from being too large.
    # Workflow name should not be more than 63 characters because its used
    # as a label on the pods.
    workflow_name = os.getenv("JOB_NAME") + "-" + w.name
    job_type = os.getenv("JOB_TYPE")
    if job_type == "presubmit":
      workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
      workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])
    elif job_type == "postsubmit":
      workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])
      workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))

    salt = uuid.uuid4().hex[0:4]
    # Add some salt. This is mostly a convenience for the case where you
    # are submitting jobs manually for testing/debugging. Since the prow should
    # vend unique build numbers for each job.
    workflow_name += "-{0}".format(salt)

    workflow_names.append(workflow_name)
    # Create a new environment for this run
    env = workflow_name

    util.run(["ks", "env", "add", env], cwd=w.app_dir)

    util.run([
        "ks", "param", "set", "--env=" + env, w.component, "name",
        workflow_name
    ],
             cwd=w.app_dir)

    # Set the prow environment variables.
    prow_env = []

    names = [
        "JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER", "PULL_BASE_SHA",
        "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER", "REPO_NAME"
    ]
    names.sort()
    for v in names:
      if not os.getenv(v):
        continue
      prow_env.append("{0}={1}".format(v, os.getenv(v)))

    util.run([
        "ks", "param", "set", "--env=" + env, w.component, "prow_env",
        ",".join(prow_env)
    ],
             cwd=w.app_dir)
    util.run([
        "ks", "param", "set", "--env=" + env, w.component, "namespace",
        get_namespace(args)
    ],
             cwd=w.app_dir)
    util.run([
        "ks", "param", "set", "--env=" + env, w.component, "bucket",
        args.bucket
    ],
             cwd=w.app_dir)
    if args.release:
      util.run([
          "ks", "param", "set", "--env=" + env, w.component, "versionTag",
          os.getenv("VERSION_TAG")
      ],
               cwd=w.app_dir)

    # Set any extra params. We do this in alphabetical order to make it easier
    # to verify in the unittest.
    # BUGFIX: dict.keys() returns a view in Python 3 which has no .sort();
    # use sorted() so this works on Python 3.
    param_names = sorted(w.params.keys())
    for k in param_names:
      util.run([
          "ks", "param", "set", "--env=" + env, w.component, k,
          "{0}".format(w.params[k])
      ],
               cwd=w.app_dir)

    # For debugging print out the manifest
    util.run(["ks", "show", env, "-c", w.component], cwd=w.app_dir)
    util.run(["ks", "apply", env, "-c", w.component], cwd=w.app_dir)

    ui_url = (
        "http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
        "?tab=workflow".format(workflow_name))
    ui_urls[workflow_name] = ui_url
    logging.info("URL for workflow: %s", ui_url)

  success = True
  workflow_phase = {}
  try:
    results = argo_client.wait_for_workflows(
        api_client,
        get_namespace(args),
        workflow_names,
        status_callback=argo_client.log_status)
    for r in results:
      phase = r.get("status", {}).get("phase")
      name = r.get("metadata", {}).get("name")
      workflow_phase[name] = phase
      if phase != "Succeeded":
        success = False
      logging.info("Workflow %s/%s finished phase: %s", get_namespace(args),
                   name, phase)
  except util.TimeoutError:
    success = False
    logging.error("Time out waiting for Workflows %s to finish",
                  ",".join(workflow_names))
  except Exception as e:
    # We explicitly log any exceptions so that they will be captured in the
    # build-log.txt that is uploaded to Gubernator.
    logging.error("Exception occurred: %s", e)
    raise
  finally:
    success = prow_artifacts.finalize_prow_job(args.bucket, success,
                                               workflow_phase, ui_urls)

    # Upload logs to GCS. No logs after this point will appear in the
    # file in gcs
    file_handler.flush()
    util.upload_file_to_gcs(
        file_handler.baseFilename,
        os.path.join(prow_artifacts.get_gcs_dir(args.bucket), "build-log.txt"))

  return success
def test_xgboost_synthetic(
    record_xml_attribute,
    name,
    namespace,  # pylint: disable=too-many-branches,too-many-statements
    repos,
    image):
  '''Generate a K8s Job from job.yaml, submit it and wait for completion.

  Args:
    record_xml_attribute: pytest fixture used to set the junit test name.
    name: Name for the K8s Job; autogenerated when empty.
    namespace: Namespace to run the Job in.
    repos: Repos to checkout; inferred from the prow env vars when empty.
    image: Docker image to run the test in.

  Raises:
    RuntimeError: If the Job never reports conditions or does not complete.
  '''
  util.set_pytest_junit(record_xml_attribute, "test_xgboost_synthetic")
  util.maybe_activate_service_account()
  with open("job.yaml") as hf:
    # BUGFIX: yaml.load without an explicit Loader is unsafe and raises
    # TypeError on PyYAML >= 6; job.yaml is plain data so safe_load suffices.
    job = yaml.safe_load(hf)

  # We need to checkout the correct version of the code
  # in presubmits and postsubmits. We should check the environment variables
  # for the prow environment variables to get the appropriate values.
  # We should probably also only do that if the
  # See
  # https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables
  if not repos:
    repos = argo_build_util.get_repo_from_prow_env()

  logging.info("Repos set to %s", repos)
  job["spec"]["template"]["spec"]["initContainers"][0]["command"] = [
      "/usr/local/bin/checkout_repos.sh",
      "--repos=" + repos,
      "--src_dir=/src",
      "--depth=all",
  ]

  job["spec"]["template"]["spec"]["containers"][0]["image"] = image
  util.load_kube_config(persist_config=False)

  if name:
    job["metadata"]["name"] = name
  else:
    job["metadata"]["name"] = ("xgboost-test-" +
                               datetime.datetime.now().strftime("%H%M%S") +
                               "-" + uuid.uuid4().hex[0:3])
  name = job["metadata"]["name"]

  job["metadata"]["namespace"] = namespace

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()
  batch_api = k8s_client.BatchV1Api(api_client)

  logging.info("Creating job:\n%s", yaml.dump(job))
  actual_job = batch_api.create_namespaced_job(job["metadata"]["namespace"],
                                               job)
  logging.info("Created job %s.%s:\n%s", namespace, name,
               yaml.safe_dump(actual_job.to_dict()))

  final_job = util.wait_for_job(api_client,
                                namespace,
                                name,
                                timeout=datetime.timedelta(minutes=30))

  logging.info("Final job:\n%s", yaml.safe_dump(final_job.to_dict()))

  if not final_job.status.conditions:
    raise RuntimeError("Job {0}.{1}; did not complete".format(
        namespace, name))

  last_condition = final_job.status.conditions[-1]
  if last_condition.type not in ["Complete"]:
    logging.error("Job didn't complete successfully")
    raise RuntimeError("Job {0}.{1} failed".format(namespace, name))
def get_credentials(project="kubeflow-ci-deployment", pattern=DEFAULT_PATTERN, location=None, output="", testing_label=None): """Get the latest deployment information and use it to get credentials. Args: project: string, Name of deployed GCP project. pattern: Regex pattern to look for location: zone or region to search for clusters. output: (Optional) if supplied write information about matching cluster to this YAML file. testing_label: string, annotation used to identify testing clusters. Optional. """ logging.info( "Calling get_credential - this call needs gcloud client CLI.") util.maybe_activate_service_account() command = [ "gcloud", "container", "clusters", "get-credentials", "--project=" + project ] info = { "project": project, "location": location, } if location: c = _get_latest_cluster(project, location, pattern) if not c: message = ( "No clusters found matching: project: {0}, location: {1}, " "pattern: {2}").format(project, location, pattern) raise ValueError(message) if ZONE_PATTERN.match(location): command.append("--zone=" + location) else: command.append("--region=" + location) command.append(c["name"]) info["cluster"] = c else: # This is the pre blueprint which is using deployment manager logging.warning( "Invoking deprecated path because location not set") dm = get_latest(project=project, testing_label=testing_label, base_name=pattern, field="all") command.append("--zone=" + dm["zone"], dm["name"]) info["cluster"] = dm if output: logging.info(f"Writing cluster information to {output}") with open(output, "w") as hf: yaml.dump(info, hf) # This call may be flaky due to timeout. @retry(stop_max_attempt_number=10, wait_fixed=5000) def run_get_credentials(): util.run(command) run_get_credentials()
def main(): logging.basicConfig( level=logging.INFO, format=('%(levelname)s|%(asctime)s' '|%(pathname)s|%(lineno)d| %(message)s'), datefmt='%Y-%m-%dT%H:%M:%S', ) logging.getLogger().setLevel(logging.INFO) parser = argparse.ArgumentParser() parser.add_argument("--project", default="kubeflow-ci", type=str, help=("The project.")) parser.add_argument("--max_age_hours", default=3, type=int, help=("The age of deployments to gc.")) subparsers = parser.add_subparsers() ###################################################### # Paraser for everything parser_all = subparsers.add_parser("all", help="Cleanup everything") add_deployments_args(parser_all) add_workflow_args(parser_all) parser_all.set_defaults(func=cleanup_all) ###################################################### # Parser for argo_workflows parser_argo = subparsers.add_parser("workflows", help="Cleanup workflows") add_workflow_args(parser_argo) parser_argo.set_defaults(func=cleanup_workflows) ###################################################### # Parser for endpoints parser_endpoints = subparsers.add_parser("endpoints", help="Cleanup endpoints") parser_endpoints.set_defaults(func=cleanup_endpoints) ###################################################### # Parser for service accounts parser_service_account = subparsers.add_parser( "service_accounts", help="Cleanup service accounts") parser_service_account.set_defaults(func=cleanup_service_accounts) ###################################################### # Parser for deployments parser_deployments = subparsers.add_parser("deployments", help="Cleanup deployments") add_deployments_args(parser_deployments) parser_deployments.set_defaults(func=cleanup_deployments) args = parser.parse_args() util.maybe_activate_service_account() args.func(args)
def test_xgboost_synthetic(
    record_xml_attribute,
    name,
    namespace,  # pylint: disable=too-many-branches,too-many-statements
    repos,
    image,
    notebook_artifacts_dir):
  '''Generate a K8s Job from job.yaml, submit it, and archive the notebook.

  Args:
    record_xml_attribute: pytest fixture used to set the junit test name.
    name: Name for the K8s Job; autogenerated when empty.
    namespace: Namespace to run the Job in.
    repos: Repos to checkout; inferred from the prow env vars when empty.
    image: Docker image to run the test in.
    notebook_artifacts_dir: Local directory to download the rendered
      notebook.html into.

  Raises:
    RuntimeError: If the Job never reports conditions or does not complete.
  '''
  util.set_pytest_junit(record_xml_attribute, "test_xgboost_synthetic")
  util.maybe_activate_service_account()
  with open("job.yaml") as hf:
    # BUGFIX: yaml.load without an explicit Loader is unsafe and raises
    # TypeError on PyYAML >= 6; job.yaml is plain data so safe_load suffices.
    job = yaml.safe_load(hf)

  # We need to checkout the correct version of the code
  # in presubmits and postsubmits. We should check the environment variables
  # for the prow environment variables to get the appropriate values.
  # We should probably also only do that if the
  # See
  # https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables
  if not repos:
    repos = argo_build_util.get_repo_from_prow_env()
    repos += ",kubeflow/testing@HEAD"

  logging.info("Repos set to %s", repos)
  job["spec"]["template"]["spec"]["initContainers"][0]["command"] = [
      "/usr/local/bin/checkout_repos.sh",
      "--repos=" + repos,
      "--src_dir=/src",
      "--depth=all",
  ]

  nb_bucket = "kubeflow-ci-deployment"
  nb_path = os.path.join("xgboost_synthetic_testing", os.getenv("JOB_TYPE"),
                         os.getenv("HOSTNAME"), "notebook.html")
  output_gcs = util.to_gcs_uri(nb_bucket, nb_path)
  logging.info("Tested notebook will be outputed to: %s", output_gcs)
  job["spec"]["template"]["spec"]["containers"][0]["env"] = [
      {
          "name": "PYTHONPATH",
          "value": "/src/kubeflow/testing/py"
      },
      {
          "name": "OUTPUT_GCS",
          "value": output_gcs
      },
  ]
  job["spec"]["template"]["spec"]["containers"][0]["image"] = image
  util.load_kube_config(persist_config=False)

  if name:
    job["metadata"]["name"] = name
  else:
    job["metadata"]["name"] = ("xgboost-test-" +
                               datetime.datetime.now().strftime("%H%M%S") +
                               "-" + uuid.uuid4().hex[0:3])
  name = job["metadata"]["name"]

  job["metadata"]["namespace"] = namespace

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()
  batch_api = k8s_client.BatchV1Api(api_client)

  logging.info("Creating job:\n%s", yaml.dump(job))
  actual_job = batch_api.create_namespaced_job(job["metadata"]["namespace"],
                                               job)
  logging.info("Created job %s.%s:\n%s", namespace, name,
               yaml.safe_dump(actual_job.to_dict()))

  final_job = util.wait_for_job(api_client,
                                namespace,
                                name,
                                timeout=datetime.timedelta(minutes=30))

  logging.info("Final job:\n%s", yaml.safe_dump(final_job.to_dict()))

  if not final_job.status.conditions:
    raise RuntimeError("Job {0}.{1}; did not complete".format(
        namespace, name))

  last_condition = final_job.status.conditions[-1]

  # Download notebook html to artifacts
  notebook_artifacts_path = os.path.join(notebook_artifacts_dir,
                                         "notebook.html")
  logging.info("Writing notebook artifact to: %s", notebook_artifacts_path)
  os.makedirs(notebook_artifacts_dir, exist_ok=True)
  storage_client = storage.Client()
  bucket = storage_client.get_bucket(nb_bucket)
  blob = bucket.get_blob(nb_path)
  blob.download_to_filename(notebook_artifacts_path)

  if last_condition.type not in ["Complete"]:
    logging.error("Job didn't complete successfully")
    raise RuntimeError("Job {0}.{1} failed".format(namespace, name))
def main():  # pylint: disable=too-many-locals,too-many-statements
  """Deploy a Kubeflow instance on GCP using kfctl.

  Builds (or downloads) kfctl, fetches the IAP OAuth secret from GCS,
  generates a short unique deployment name, runs the deployment, and grants
  any extra users access. Exactly one of --kubeflow_repo or --kfctl_path
  must be provided.
  """
  logging.basicConfig(
      level=logging.INFO,
      format=('%(levelname)s|%(asctime)s'
              '|%(pathname)s|%(lineno)d| %(message)s'),
      datefmt='%Y-%m-%dT%H:%M:%S',
  )
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()

  parser.add_argument("--project",
                      default="kubeflow-ci-deployment",
                      type=str,
                      help=("The project."))

  parser.add_argument("--zone",
                      default="us-east1-d",
                      type=str,
                      help=("The zone to deploy in."))

  parser.add_argument(
      "--oauth_file",
      default=("gs://kubeflow-ci-deployment_kf-data/"
               "kf-iap-oauth.kubeflow-ci-deployment.yaml"),
      type=str,
      help=("The file containing the OAuth client ID & secret"
            "for IAP."))

  # TODO(jlewi): Should rename this argument to something like kfctl_src
  # We should try to do it in a backwards compatible way.
  parser.add_argument(
      "--kubeflow_repo",
      default="/src/kubeflow/kubeflow",
      type=str,
      help=("Path to the source for kfctl. Should be the directory "
            "containing the Makefile to build kfctl"))

  parser.add_argument("--kfctl_path",
                      default="",
                      type=str,
                      help=("Path to kfctl; can be a URL."))

  parser.add_argument(
      "--kfctl_config",
      default=("https://raw.githubusercontent.com/kubeflow/manifests"
               "/master/kfdef/kfctl_gcp_iap.yaml"),
      type=str,
      help=("Path to the kfctl config to use"))

  parser.add_argument("--apps_dir",
                      default=os.getcwd(),
                      type=str,
                      help=("Directory to store kubeflow apps."))

  parser.add_argument(
      "--name",
      type=str,
      default="kf-vmaster-{uid}",
      help=("Name for the deployment. This can be a python format string "
            "with the variable uid. Uid will automatically be substituted "
            "for a unique value based on the time."))

  parser.add_argument(
      "--email",
      type=str,
      default="",
      help=("(Optional). Email of the person to create the default profile"
            "for. If not specificied uses the gcloud config value."))

  parser.add_argument(
      "--extra_users",
      type=str,
      default="",
      help=("Comma separated list of additional users to grant access. "
            "Should be in the form user:[email protected] or"
            "serviceAccount:[email protected]"))

  parser.add_argument("--setup_project",
                      dest="setup_project",
                      action="store_true",
                      help="Setup the project")
  parser.add_argument("--no-setup_project",
                      dest="setup_project",
                      action="store_false",
                      help="Do not setup the project")
  parser.set_defaults(setup_project=True)

  parser.add_argument("--use_self_cert",
                      dest="use_self_cert",
                      action="store_true",
                      help="Use a self signed certificate")
  parser.add_argument("--no-use_self_cert",
                      dest="use_self_cert",
                      action="store_false",
                      help="Do not use a self signed certificate")
  parser.set_defaults(use_self_cert=True)

  args = parser.parse_args()

  util.maybe_activate_service_account()

  # For debugging purposes output the command
  util.run(["gcloud", "config", "list"])
  util.run(["gcloud", "auth", "list"])

  # Fetch the IAP OAuth client id/secret from GCS. Note: `bucket` is first
  # the bucket name string, then rebound to the Bucket object.
  bucket, blob_path = util.split_gcs_uri(args.oauth_file)
  client = storage.Client(project=args.project)
  bucket = client.get_bucket(bucket)

  blob = bucket.get_blob(blob_path)
  contents = blob.download_as_string()

  # NOTE(review): yaml.load without an explicit Loader is deprecated and a
  # TypeError on PyYAML >= 6; consider yaml.safe_load here.
  oauth_info = yaml.load(contents)

  # The two checks below together enforce that exactly one of
  # --kubeflow_repo and --kfctl_path is set.
  # NOTE(review): "neeeds" typo in the user-facing message.
  if args.kubeflow_repo and args.kfctl_path:
    raise ValueError(
        "Exactly one of --kubeflow_repo and --kfctl_path neeeds "
        "to be set.")

  if not args.kubeflow_repo and not args.kfctl_path:
    raise ValueError(
        "Exactly one of --kubeflow_repo and --kfctl_path neeeds "
        "to be set.")

  git_describe = ""
  if args.kubeflow_repo:
    # Build kfctl from source; record the git version for labeling.
    git_describe = util.run(
        ["git", "describe", "--tags", "--always", "--dirty"],
        cwd=args.kubeflow_repo).strip("'")

    kfctl_path = build_kfctl_go(args)
  else:
    if args.kfctl_path.startswith("http"):
      # Download and extract a released kfctl binary.
      temp_dir = tempfile.mkdtemp()
      util.run(["curl", "-L", "-o", "kfctl.tar.gz", args.kfctl_path],
               cwd=temp_dir)
      util.run(["tar", "-xvf", "kfctl.tar.gz"], cwd=temp_dir)
      kfctl_path = os.path.join(temp_dir, "kfctl")
      git_describe = util.run([kfctl_path, "version"])
    else:
      kfctl_path = args.kfctl_path

  logging.info("kfctl path set to %s", kfctl_path)

  # We need to keep the name short to avoid hitting limits with certificates.
  uid = datetime.datetime.now().strftime("%m%d") + "-"
  uid = uid + uuid.uuid4().hex[0:3]

  args.name = args.name.format(uid=uid)
  logging.info("Using name %s", args.name)

  app_dir = os.path.join(args.apps_dir, args.name)

  if not os.path.exists(args.apps_dir):
    os.makedirs(args.apps_dir)

  env = {}
  env.update(os.environ)
  env.update(oauth_info)

  # GCP labels can only take as input alphanumeric characters, hyphens, and
  # underscores. Replace not valid characters with hyphens.
  labels = {
      "git": git_describe,
      "purpose": "kf-test-cluster",
  }

  for k, v in labels.items():
    val = v.lower().replace("\"", "")
    val = re.sub(r"[^a-z0-9\-_]", "-", val)
    labels[k] = val

  deploy_with_kfctl_go(kfctl_path, args, app_dir, env, labels=labels)
  add_extra_users(args.project, args.extra_users)
def main():
  """CLI entry point for GCP resource garbage collection.

  Builds an argument parser with one sub-command per resource type
  (workflows, endpoints, firewall rules, service accounts, clusters, ...),
  then dispatches to the handler bound via set_defaults(func=...).
  """
  logging.basicConfig(level=logging.INFO,
                      format=('%(levelname)s|%(asctime)s'
                              '|%(pathname)s|%(lineno)d| %(message)s'),
                      datefmt='%Y-%m-%dT%H:%M:%S',
                      )
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()

  parser.add_argument(
      "--project", default="kubeflow-ci", type=str, help=("The project."))

  # The values prefixed with testing_ refer to the test cluster where the
  # Argo workflows run. In contrast --project is the project where the tests
  # spin up Kubeflow instances.
  parser.add_argument(
      "--testing_project",
      default="kubeflow-ci",
      type=str,
      help=("The cluster used for Argo workflows."))

  parser.add_argument(
      "--testing_cluster",
      default="kubeflow-testing",
      type=str,
      help=("The cluster used for Argo workflows."))

  parser.add_argument(
      "--testing_zone",
      default="us-east1-d",
      type=str,
      help=("The zone of the cluster used for Argo workflows."))

  parser.add_argument(
      "--max_age_hours",
      default=3,
      type=int,
      help=("The age of deployments to gc."))

  # NOTE(review): type=bool with argparse is a known pitfall — any non-empty
  # string (including "False") parses as True.
  parser.add_argument(
      "--gc_backend_services",
      default=False,
      type=bool,
      help=("""Whether to GC backend services."""))

  parser.add_argument(
      "--max_wf_age_hours",
      default=7*24,
      type=int,
      help=("How long to wait before garbage collecting Argo workflows."))

  parser.add_argument('--dryrun', dest='dryrun', action='store_true')
  parser.add_argument('--no-dryrun', dest='dryrun', action='store_false')
  parser.set_defaults(dryrun=False)

  subparsers = parser.add_subparsers()

  ######################################################
  # Parser for everything
  parser_all = subparsers.add_parser(
      "all", help="Cleanup everything")

  add_deployments_args(parser_all)
  add_workflow_args(parser_all)

  parser_all.set_defaults(func=cleanup_all)

  ######################################################
  # Parser for argo_workflows
  parser_argo = subparsers.add_parser(
      "workflows", help="Cleanup workflows")

  add_workflow_args(parser_argo)
  parser_argo.set_defaults(func=cleanup_workflows)

  ######################################################
  # Parser for endpoints
  parser_endpoints = subparsers.add_parser(
      "endpoints", help="Cleanup endpoints")

  parser_endpoints.set_defaults(func=cleanup_endpoints)

  ######################################################
  # Parser for firewallrules
  parser_firewall = subparsers.add_parser(
      "firewall", help="Cleanup firewall rules")

  parser_firewall.set_defaults(func=cleanup_firewall_rules)

  ######################################################
  # Parser for health checks
  parser_health = subparsers.add_parser(
      "health_checks", help="Cleanup health checks")

  parser_health.set_defaults(func=cleanup_health_checks)

  ######################################################
  # Parser for service accounts
  parser_service_account = subparsers.add_parser(
      "service_accounts", help="Cleanup service accounts")

  parser_service_account.set_defaults(func=cleanup_service_accounts)

  ######################################################
  # Parser for service account bindings
  # (reuses the parser_service_account local; each sub-parser object is
  # registered with subparsers before the name is rebound)
  parser_service_account = subparsers.add_parser(
      "service_account_bindings", help="Cleanup service account bindings")

  parser_service_account.set_defaults(func=cleanup_service_account_bindings)

  ######################################################
  # Parser for certificates
  parser_certificates = subparsers.add_parser(
      "certificates", help="Cleanup certificates")

  parser_certificates.set_defaults(func=cleanup_certificates)

  ######################################################
  # Parser for auto deployments
  parser_auto_deployments = subparsers.add_parser(
      "auto_deployments", help="Cleanup auto deployments")

  add_deployments_args(parser_auto_deployments)

  parser_auto_deployments.set_defaults(func=cleanup_auto_deployments)

  ######################################################
  # Parser for deployments
  parser_deployments = subparsers.add_parser(
      "deployments", help="Cleanup deployments")

  add_deployments_args(parser_deployments)

  parser_deployments.set_defaults(func=cleanup_deployments)

  ######################################################
  # Parser for clusters
  parser_clusters = subparsers.add_parser(
      "clusters", help="Cleanup clusters")

  parser_clusters.add_argument(
      "--zones", default="us-east1-d,us-central1-a", type=str,
      help="Comma separated list of zones to check.")

  parser_clusters.set_defaults(func=cleanup_clusters)

  ######################################################
  # Parser for instance groups
  parser_ig = subparsers.add_parser(
      "instance_groups", help="Cleanup instance groups")

  add_deployments_args(parser_ig)

  parser_ig.set_defaults(func=cleanup_instance_groups)

  args = parser.parse_args()

  # Update max age
  MAX_LIFETIME[E2E_INFRA] = datetime.timedelta(hours=args.max_age_hours)
  logging.info("Max lifetime:\n%s", MAX_LIFETIME)

  util.maybe_activate_service_account()
  args.func(args)
def all(
    self,
    build_project,
    registry_project,
    remote_fork,  # pylint: disable=too-many-statements,too-many-branches
    add_github_host=False):
  """Build the latest image and update the prototype.

  Args:
    build_project: GCP project used to build the image.
    registry_project: GCP project used to host the image.
    remote_fork: Url of the remote fork.
      The remote fork used to create the PR;
      e.g. [email protected]:jlewi/kubeflow.git. currently only ssh is
      supported.
    add_github_host: If true will add the github ssh host to known ssh
      hosts.
  """
  repo = git.Repo(self._root_dir())
  util.maybe_activate_service_account()
  last_commit = self.last_commit

  # Ensure github.com is in the known hosts
  if add_github_host:
    output = util.run(["ssh-keyscan", "github.com"])
    with open(
        os.path.join(os.getenv("HOME"), ".ssh", "known_hosts"),
        mode='a') as hf:
      hf.write(output)

  # NOTE(review): this prefix looks like a redacted ssh remote prefix
  # (e.g. "git@github.com"); verify against the original source.
  if not remote_fork.startswith("*****@*****.**"):
    raise ValueError("Remote fork currently only supports ssh")

  remote_repo = self._find_remote_repo(repo, remote_fork)

  if not remote_repo:
    # Derive a remote name from the fork URL and register it.
    fork_name = remote_fork.split(":", 1)[-1].split("/", 1)[0]
    logging.info("Adding remote %s=%s", fork_name, remote_fork)
    remote_repo = repo.create_remote(fork_name, remote_fork)

  logging.info("Last change to components-jupyter-web-app was %s",
               last_commit)

  base = "gcr.io/{0}/jupyter-web-app".format(registry_project)

  # Check if there is already an image tagged with this commit.
  image = base + ":" + self.last_commit
  transport = transport_pool.Http(httplib2.Http)
  src = docker_name.from_string(image)
  creds = docker_creds.DefaultKeychain.Resolve(src)
  image_exists = False
  try:
    with v2_2_image.FromRegistry(src, creds, transport) as src_image:
      logging.info("Image %s exists; digest: %s", image,
                   src_image.digest())
      image_exists = True
  except docker_http.V2DiagnosticException as e:
    # 404 simply means the tag doesn't exist yet; anything else is a
    # genuine registry error and is re-raised.
    if e.status == 404:
      logging.info("%s doesn't exist", image)
    else:
      raise

  if not image_exists:
    logging.info("Building the image")
    image = self.build_image(build_project, registry_project)
    logging.info("Created image: %s", image)
  else:
    logging.info("Image %s already exists", image)

  # We should check what the current image is if and not update it
  # if its the existing image
  prototype_file = self.update_prototype(image)

  if not prototype_file:
    logging.info("Prototype not updated so not creating a PR.")
    return

  branch_name = "update_jupyter_{0}".format(last_commit)

  if repo.active_branch.name != branch_name:
    logging.info("Creating branch %s", branch_name)

    branch_names = [b.name for b in repo.branches]
    if branch_name in branch_names:
      logging.info("Branch %s exists", branch_name)
      util.run(["git", "checkout", branch_name], cwd=self._root_dir())
    else:
      util.run(["git", "checkout", "-b", branch_name],
               cwd=self._root_dir())

  if self._check_if_pr_exists(commit=last_commit):
    # Since a PR already exists updating to the specified commit
    # don't create a new one.
    # We don't want to just push -f because if the PR already exists
    # git push -f will retrigger the tests.
    # To force a recreate of the PR someone could close the existing
    # PR and a new PR will be created on the next cron run.
    return

  logging.info("Add file %s to repo", prototype_file)
  repo.index.add([prototype_file])
  repo.index.commit(
      "Update the jupyter web app image to {0}".format(image))
  util.run(["git", "push", "-f", remote_repo.name],
           cwd=self._root_dir())

  self.create_pull_request(commit=last_commit)
def run_papermill_job(
    notebook_path,
    name,
    namespace,  # pylint: disable=too-many-branches,too-many-statements
    image,
    output=""):
  """Generate a K8s job to run a notebook using papermill

  Args:
    notebook_path: Path to the notebook.
    name: Name for the K8s job
    namespace: The namespace where the job should run.
    image: The docker image to run the notebook in.
    output = Location where artifacts like the rendered notebook should
      be uploaded. Should generally be an object storage path. Currently
      only GCS is supported.

  Raises:
    RuntimeError: If the job never reports conditions or does not complete.
  """
  util.maybe_activate_service_account()
  with open("job.yaml") as hf:
    # BUGFIX: yaml.load without an explicit Loader is unsafe and raises
    # TypeError on PyYAML >= 6; job.yaml is plain data so safe_load suffices.
    job = yaml.safe_load(hf)

  job["spec"]["template"]["spec"]["containers"][0]["image"] = image
  job["spec"]["template"]["spec"]["containers"][0]["command"] = [
      "python3", "-m", "kubeflow.testing.notebook_tests.execute_notebook",
      "--notebook_path", notebook_path
  ]

  job["spec"]["template"]["spec"]["containers"][0]["env"] = [
      {
          "name": "OUTPUT_GCS",
          "value": output
      },
      {
          "name": "PYTHONPATH",
          "value": "/src/kubeflow/testing/py"
      },
  ]

  logging.info("Notebook will be written to %s", output)
  util.load_kube_config(persist_config=False)

  if name:
    job["metadata"]["name"] = name
  else:
    job["metadata"]["name"] = ("notebook-test-" +
                               datetime.datetime.now().strftime("%H%M%S") +
                               "-" + uuid.uuid4().hex[0:3])
  name = job["metadata"]["name"]

  job["metadata"]["namespace"] = namespace

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()
  batch_api = k8s_client.BatchV1Api(api_client)

  logging.info("Creating job:\n%s", yaml.dump(job))
  actual_job = batch_api.create_namespaced_job(job["metadata"]["namespace"],
                                               job)
  logging.info("Created job %s.%s:\n%s", namespace, name,
               yaml.safe_dump(actual_job.to_dict()))

  logging.info("*********************Job logs************************")
  logging.info(logs_for_job(PROJECT, name))
  logging.info("*****************************************************")
  final_job = util.wait_for_job(api_client,
                                namespace,
                                name,
                                timeout=datetime.timedelta(minutes=30))

  logging.info("Final job:\n%s", yaml.safe_dump(final_job.to_dict()))

  logging.info("*********************Job logs************************")
  logging.info(logs_for_job(PROJECT, name))
  logging.info("*****************************************************")

  if not final_job.status.conditions:
    raise RuntimeError("Job {0}.{1}; did not complete".format(
        namespace, name))

  last_condition = final_job.status.conditions[-1]
  if last_condition.type not in ["Complete"]:
    logging.error("Job didn't complete successfully")
    raise RuntimeError("Job {0}.{1} failed".format(namespace, name))
def run(args, file_handler):
  """Submit a single Argo workflow for args.component and wait for it.

  Args:
    args: Parsed CLI args; uses bucket, project, zone, cluster, app_dir
      and component.
    file_handler: logging.FileHandler whose backing file is uploaded to GCS
      as build-log.txt when the run finishes.

  Returns:
    True if the workflow reached phase Succeeded; False otherwise.
  """
  create_started_file(args.bucket)
  util.maybe_activate_service_account()
  util.configure_kubectl(args.project, args.zone, args.cluster)
  util.load_kube_config()

  # Create the name for the workflow
  # We truncate sha numbers to prevent the workflow name from being too large.
  # Workflow name should not be more than 63 characters because its used
  # as a label on the pods.
  workflow_name = os.getenv("JOB_NAME")
  job_type = os.getenv("JOB_TYPE")
  if job_type == "presubmit":
    workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
    workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])
  elif job_type == "postsubmit":
    workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])
    workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))

  salt = uuid.uuid4().hex[0:4]
  # Add some salt. This is mostly a convenience for the case where you
  # are submitting jobs manually for testing/debugging. Since the prow should
  # vend unique build numbers for each job.
  workflow_name += "-{0}".format(salt)

  # Create a new environment for this run
  env = workflow_name

  util.run(["ks", "env", "add", env], cwd=args.app_dir)

  util.run([
      "ks", "param", "set", "--env=" + env, args.component, "name",
      workflow_name
  ],
           cwd=args.app_dir)

  util.load_kube_config()

  api_client = k8s_client.ApiClient()

  # Set the prow environment variables.
  prow_env = []

  names = [
      "JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER", "PULL_BASE_SHA",
      "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER", "REPO_NAME"
  ]
  names.sort()
  for v in names:
    if not os.getenv(v):
      continue
    prow_env.append("{0}={1}".format(v, os.getenv(v)))

  util.run([
      "ks", "param", "set", "--env=" + env, args.component, "prow_env",
      ",".join(prow_env)
  ],
           cwd=args.app_dir)
  util.run([
      "ks", "param", "set", "--env=" + env, args.component, "namespace",
      NAMESPACE
  ],
           cwd=args.app_dir)
  util.run([
      "ks", "param", "set", "--env=" + env, args.component, "bucket",
      args.bucket
  ],
           cwd=args.app_dir)

  # For debugging print out the manifest
  util.run(["ks", "show", env, "-c", args.component], cwd=args.app_dir)
  util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

  ui_url = (
      "http://testing-argo.kubeflow.io/timeline/kubeflow-test-infra/{0}"
      ";tab=workflow".format(workflow_name))
  logging.info("URL for workflow: %s", ui_url)
  success = False
  try:
    results = argo_client.wait_for_workflow(
        api_client,
        NAMESPACE,
        workflow_name,
        status_callback=argo_client.log_status)
    if results["status"]["phase"] == "Succeeded":
      success = True
    logging.info("Workflow %s/%s finished phase: %s", NAMESPACE,
                 workflow_name, results["status"]["phase"])
  except util.TimeoutError:
    success = False
    logging.error("Time out waiting for Workflow %s/%s to finish",
                  NAMESPACE, workflow_name)
  finally:
    create_finished_file(args.bucket, success)

    # Upload logs to GCS. No logs after this point will appear in the
    # file in gcs
    file_handler.flush()
    upload_file_to_gcs(
        file_handler.baseFilename,
        os.path.join(prow_artifacts.get_gcs_dir(args.bucket),
                     "build-log.txt"))

  return success