def run_simple_tfjob(self, component):
    """Run a TFJob end to end: deploy it, wait for it to finish, delete it.

    The ksonnet component is applied to ``self.env``. The method then waits
    for the job to start running (or reach a terminal state) and afterwards
    for it to finish. When the job did not succeed, the reason is stored in
    ``self.failure`` and the method returns early without deleting the job.

    Args:
      component: Name of the ksonnet component that defines the TFJob.
    """
    client = k8s_client.ApiClient()

    # Configure the ksonnet app and submit the job with "ks apply".
    ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                         self.params)
    ksonnet = ks_util.get_ksonnet_cmd(self.app_dir)
    apply_cmd = [ksonnet, "apply", self.env, "-c", component]
    util.run(apply_cmd, cwd=self.app_dir)
    logging.info("Created job %s in namespaces %s", self.name, self.namespace)

    # Block until the job is running or has already reached a terminal state.
    logging.info("Wait for conditions Running, Succeeded, or Failed")
    tfjob = tf_job_client.wait_for_condition(
        client,
        self.namespace,
        self.name,
        ["Running", "Succeeded", "Failed"],
        version=self.tfjob_version,
        status_callback=tf_job_client.log_status)
    logging.info("Current TFJob:\n %s", json.dumps(tfjob, indent=2))

    # Block until the job is done.
    logging.info("Waiting for job to finish.")
    tfjob = tf_job_client.wait_for_job(
        client,
        self.namespace,
        self.name,
        self.tfjob_version,
        status_callback=tf_job_client.log_status)
    logging.info("Final TFJob:\n %s", json.dumps(tfjob, indent=2))

    succeeded = tf_job_client.job_succeeded(tfjob)
    if not succeeded:
        self.failure = "Job {0} in namespace {1} in status {2}".format(
            self.name, self.namespace, tfjob.get("status", {}))
        logging.error(self.failure)
        return

    # Look for resources that failed to be created.
    failures = tf_job_client.get_creation_failures_from_tfjob(
        client, self.namespace, tfjob)
    if failures:
        # TODO(jlewi): Starting with
        # https://github.com/kubeflow/tf-operator/pull/646 the number of
        # events no longer seems to match the expected; it looks like maybe
        # events are being combined? For now we just log a warning rather
        # than an error.
        logging.warning(failures)

    # Clean up: remove the TFJob and wait until it is gone.
    tf_job_client.delete_tf_job(
        client, self.namespace, self.name, version=self.tfjob_version)
    logging.info("Waiting for job %s in namespaces %s to be deleted.",
                 self.name, self.namespace)
    tf_job_client.wait_for_delete(
        client,
        self.namespace,
        self.name,
        self.tfjob_version,
        status_callback=tf_job_client.log_status)
def run_tfjob_with_shutdown_policy(self, component, shutdown_policy):
    """Run a TFJob and terminate one replica type to exercise shutdown.

    After the job is running (or already terminal), replicas of one type are
    terminated through ``tf_job_client.terminate_replicas``; the method then
    waits for the job to finish. On an unsuccessful job the reason is stored
    in ``self.failure`` and the method returns without deleting the job.

    Args:
      component: Name of the ksonnet component that defines the TFJob.
      shutdown_policy: "worker" terminates the "worker" replica type; any
        other value terminates the "chief" replica type.
    """
    tf_operator_util.load_kube_config()
    client = k8s_client.ApiClient()

    # Configure the ksonnet app and submit the job with "ks apply".
    tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace,
                                  component, self.params)
    ksonnet = ks_util.get_ksonnet_cmd(self.app_dir)
    apply_cmd = [ksonnet, "apply", self.env, "-c", component]
    util.run(apply_cmd, cwd=self.app_dir)
    logging.info("Created job %s in namespaces %s", self.name, self.namespace)

    # Block until the job is running or has already reached a terminal state.
    logging.info("Wait for conditions Running, Succeeded, or Failed")
    tfjob = tf_job_client.wait_for_condition(
        client,
        self.namespace,
        self.name,
        ["Running", "Succeeded", "Failed"],
        version=self.tfjob_version,
        status_callback=tf_job_client.log_status)
    logging.info("Current TFJob:\n %s", json.dumps(tfjob, indent=2))

    # Pick the replica type whose termination should shut the job down.
    replica_type = "worker" if shutdown_policy == "worker" else "chief"
    tf_job_client.terminate_replicas(client, self.namespace, self.name,
                                     replica_type, 1)

    # Block until the job is done.
    logging.info("Waiting for job to finish.")
    tfjob = tf_job_client.wait_for_job(
        client,
        self.namespace,
        self.name,
        self.tfjob_version,
        status_callback=tf_job_client.log_status)
    logging.info("Final TFJob:\n %s", json.dumps(tfjob, indent=2))

    succeeded = tf_job_client.job_succeeded(tfjob)
    if not succeeded:
        self.failure = "Job {0} in namespace {1} in status {2}".format(
            self.name, self.namespace, tfjob.get("status", {}))
        logging.error(self.failure)
        return

    # Clean up: remove the TFJob and wait until it is gone.
    tf_job_client.delete_tf_job(
        client, self.namespace, self.name, version=self.tfjob_version)
    logging.info("Waiting for job %s in namespaces %s to be deleted.",
                 self.name, self.namespace)
    tf_job_client.wait_for_delete(
        client,
        self.namespace,
        self.name,
        self.tfjob_version,
        status_callback=tf_job_client.log_status)
def run_distributed_training_job(self, component):
    """Deploy a distributed training TFJob, wait for completion, delete it.

    On an unsuccessful job the reason is stored in ``self.failure`` and the
    method returns early without deleting the job. Creation failures
    reported by ``tf_job_client`` are only logged as a warning.

    Args:
      component: Name of the ksonnet component that defines the TFJob.
    """
    client = k8s_client.ApiClient()

    # Configure the ksonnet app and submit the job with "ks apply".
    ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                         self.params)
    ksonnet = ks_util.get_ksonnet_cmd(self.app_dir)
    apply_cmd = [ksonnet, "apply", self.env, "-c", component]
    util.run(apply_cmd, cwd=self.app_dir)
    logging.info("Created job %s in namespaces %s", self.name, self.namespace)

    # Block until the job is running or has already reached a terminal state.
    logging.info("Wait for conditions Running, Succeeded, or Failed")
    tfjob = tf_job_client.wait_for_condition(
        client,
        self.namespace,
        self.name,
        ["Running", "Succeeded", "Failed"],
        version=self.tfjob_version,
        status_callback=tf_job_client.log_status)
    logging.info("Current TFJob:\n %s", json.dumps(tfjob, indent=2))

    # Block until the job is done.
    logging.info("Waiting for job to finish.")
    tfjob = tf_job_client.wait_for_job(
        client,
        self.namespace,
        self.name,
        self.tfjob_version,
        status_callback=tf_job_client.log_status)
    logging.info("Final TFJob:\n %s", json.dumps(tfjob, indent=2))

    succeeded = tf_job_client.job_succeeded(tfjob)
    if not succeeded:
        self.failure = "Job {0} in namespace {1} in status {2}".format(
            self.name, self.namespace, tfjob.get("status", {}))
        logging.error(self.failure)
        return

    # Look for resources that failed to be created; only warn about them.
    failures = tf_job_client.get_creation_failures_from_tfjob(
        client, self.namespace, tfjob)
    if failures:
        logging.warning(failures)

    # Clean up: remove the TFJob and wait until it is gone.
    tf_job_client.delete_tf_job(
        client, self.namespace, self.name, version=self.tfjob_version)
    logging.info("Waiting for job %s in namespaces %s to be deleted.",
                 self.name, self.namespace)
    tf_job_client.wait_for_delete(
        client,
        self.namespace,
        self.name,
        self.tfjob_version,
        status_callback=tf_job_client.log_status)
def test_jupyter(record_xml_attribute, env, namespace):
    """Deploy the jupyter component and check that the notebook responds.

    The component is applied with ksonnet, the custom resource is awaited
    until it reports the "Running" condition, and finally an HTTP request is
    sent to the notebook through the API server proxy.

    Args:
      record_xml_attribute: Test fixture provided by pytest.
      env: ksonnet environment.
      namespace: namespace to run in.

    Raises:
      RuntimeError: If the proxied request does not return status OK.
    """
    util.set_pytest_junit(record_xml_attribute, "jupyter_test")

    credentials_file = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if credentials_file:
        logging.info("Activate service account")
        util.run([
            "gcloud", "auth", "activate-service-account",
            "--key-file=" + credentials_file
        ])

    # util.load_kube_config appears to hang on python3
    kube_config.load_kube_config()

    api_client = k8s_client.ApiClient()
    host = api_client.configuration.host
    logging.info("Kubernetes master: %s", host)
    # Keep only what follows the last "/" of the host URL.
    master = host.rpartition("/")[2]

    app_dir = os.path.join(os.path.dirname(__file__), "test_app")
    ks_cmd = ks_util.get_ksonnet_cmd(app_dir)

    name = "jupyter-test"
    service = "jupyter-test"
    component = "jupyter"
    params = ""
    ks_util.setup_ks_app(app_dir, env, namespace, component, params)
    util.run([ks_cmd, "apply", env, "-c", component], cwd=app_dir)

    results = util.wait_for_cr_condition(api_client, GROUP, PLURAL, VERSION,
                                         namespace, name, ["Running"])
    logging.info("Result of CRD:\n%s", results)

    # We proxy the request through the APIServer so that we can connect
    # from outside the cluster.
    url_template = (
        "https://{master}/api/v1/namespaces/{namespace}/services/{service}:80"
        "/proxy/default/jupyter/lab?")
    url = url_template.format(
        master=master, namespace=namespace, service=service)
    logging.info("Request: %s", url)

    response = send_request(url, verify=False)
    if response.status_code == requests.codes.OK:
        return

    msg = "Request to {0} exited with status code: {1} and content: {2}".format(
        url, response.status_code, response.content)
    logging.error(msg)
    raise RuntimeError(msg)
def test_invalid_tfjob_spec(self):
    """Check that a TFJob with an invalid spec ends up in the Failed state.

    The invalid component is applied, the job is awaited until it reports
    the "Failed" condition, and the message of its last condition is matched
    against ".*the spec is invalid.*". Problems are recorded in
    ``self.failure``. When the last condition is not "failed" the method
    returns early without deleting the job.
    """
    client = k8s_client.ApiClient()
    component = INVALID_TFJOB_COMPONENT_NAME + "_" + self.tfjob_version

    # Configure the ksonnet app and submit the job with "ks apply".
    ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                         self.params)
    ksonnet = ks_util.get_ksonnet_cmd(self.app_dir)
    apply_cmd = [ksonnet, "apply", self.env, "-c", component]
    util.run(apply_cmd, cwd=self.app_dir)
    logging.info("Created job %s in namespaces %s", self.name, self.namespace)

    logging.info("Wait for conditions Failed")
    tfjob = tf_job_client.wait_for_condition(
        client,
        self.namespace,
        self.name,
        ["Failed"],
        version=self.tfjob_version,
        status_callback=tf_job_client.log_status)
    logging.info("Final TFJob:\n %s", json.dumps(tfjob, indent=2))

    # Only the most recent condition is inspected.
    conditions = tfjob.get("status", {}).get("conditions", [{}])
    last_condition = conditions[-1]
    condition_type = last_condition.get("type", "").lower()
    if condition_type != "failed":
        self.failure = "Job {0} in namespace {1} did not fail; status {2}".format(
            self.name, self.namespace, tfjob.get("status", {}))
        logging.error(self.failure)
        return

    pattern = ".*the spec is invalid.*"
    condition_message = last_condition.get("message", "")
    if re.match(pattern, condition_message) is None:
        # A mismatch is recorded, but the job is still cleaned up below.
        self.failure = "Condition message {0} did not match pattern {1}".format(
            condition_message, pattern)
        logging.error(self.failure)

    # Clean up: remove the TFJob and wait until it is gone.
    tf_job_client.delete_tf_job(
        client, self.namespace, self.name, version=self.tfjob_version)
    logging.info("Waiting for job %s in namespaces %s to be deleted.",
                 self.name, self.namespace)
    tf_job_client.wait_for_delete(
        client,
        self.namespace,
        self.name,
        self.tfjob_version,
        status_callback=tf_job_client.log_status)
def ks_deploy(app_dir, component, params, env=None, account=None):
    """Deploy the specified ksonnet component.

    Args:
      app_dir: The ksonnet directory
      component: Name of the component to deployed
      params: A dictionary of parameters to set; can be empty but should not
        be None.
      env: (Optional) The environment to use, if none is specified a new one
        is created.
      account: (Optional) The account to use.

    Raises:
      ValueError: If input arguments aren't valid.
      subprocess.CalledProcessError: If adding the environment fails for a
        reason other than the environment already existing.
    """
    if not component:
        raise ValueError("component can't be None.")

    # TODO(jlewi): It might be better if the test creates the app and uses
    # the latest stable release of the ksonnet configs. That however will cause
    # problems when we make changes to the TFJob operator that require changes
    # to the ksonnet configs. One advantage of checking in the app is that
    # we can modify the files in vendor if needed so that changes to the code
    # and config can be submitted in the same pr.
    if not env:
        now = datetime.datetime.now()
        env = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

    logging.info("Using app directory: %s", app_dir)

    ks_cmd = ks_util.get_ksonnet_cmd(app_dir)
    logging.info("Using ksonnet cmd: %s", ks_cmd)

    try:
        util.run([ks_cmd, "env", "add", env], cwd=app_dir)
    except subprocess.CalledProcessError as e:
        # The captured output may be missing (None) or bytes; normalise it to
        # text so the check below cannot raise a TypeError that would hide
        # the original error.
        output = e.output or ""
        if isinstance(output, bytes):
            output = output.decode("utf-8", "replace")
        # An already existing environment is fine; anything else is fatal.
        if not re.search(r".*environment.*already exists.*", output):
            raise

    # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for k, v in params.items():
        util.run([ks_cmd, "param", "set", "--env=" + env, component, k, v],
                 cwd=app_dir)

    apply_command = [ks_cmd, "apply", env, "-c", component]
    if account:
        apply_command.append("--as=" + account)
    util.run(apply_command, cwd=app_dir)
def __init__(self, args):
    """Initialise the test from the parsed command line arguments.

    Args:
      args: Parsed arguments; ``app_dir`` and ``params`` are read directly
        and the namespace, name and environment are obtained through
        ``test_runner.parse_runtime_params``.
    """
    namespace, name, env = test_runner.parse_runtime_params(args)

    app_dir = args.app_dir
    if not app_dir:
        # Fall back to the ks_app directory next to this file's parent.
        here = os.path.dirname(__file__)
        app_dir = os.path.abspath(os.path.join(here, "..", "ks_app"))
        logging.info("--app_dir not set defaulting to: %s", app_dir)

    self.app_dir = app_dir
    self.env = env
    self.namespace = namespace
    self.params = args.params
    self.ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
    super(TFJobTest, self).__init__(class_name="TFJobTest", name=name)
def test_invalid_tfjob_spec(self):
    """Check that applying a TFJob with an invalid spec is rejected.

    The apply command is expected to fail with an output containing
    "invalid: spec.tfReplicaSpecs: Required value". Any other failure of the
    apply command is recorded in ``self.failure``.
    """
    tf_operator_util.load_kube_config()
    # The client instance is not used below; it is still constructed so the
    # method performs the same calls as before.
    api_client = k8s_client.ApiClient()  # pylint: disable=unused-variable
    component = INVALID_TFJOB_COMPONENT_NAME + "_" + self.tfjob_version

    # Configure the ksonnet app.
    tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace,
                                  component, self.params)

    # Try to create the TF job; rejection is the expected outcome.
    ksonnet = ks_util.get_ksonnet_cmd(self.app_dir)
    apply_cmd = [ksonnet, "apply", self.env, "-c", component]
    try:
        util.run(apply_cmd, cwd=self.app_dir)
    except subprocess.CalledProcessError as err:
        expected_error = "invalid: spec.tfReplicaSpecs: Required value"
        if expected_error not in err.output:
            self.failure = "Job {0} in namespace {1} failed because {2}".format(
                self.name, self.namespace, err.output)
            logging.error(self.failure)
            return
        logging.info("Created job failed which is expected. Reason %s",
                     err.output)
def test_tfjob_and_verify_runconfig(self):
    """Run a TFJob and verify the run config of every replica type.

    Once the job is running (or already terminal), ``verify_runconfig`` is
    called for the "chief", "worker", "ps" and "evaluator" replica types
    with the PS and Worker replica counts read from the job spec. The chief
    is then terminated and the job is awaited. A failed job is recorded in
    ``self.failure``; the TFJob is deleted in either case.
    """
    client = k8s_client.ApiClient()
    master_host = client.configuration.host
    component = COMPONENT_NAME + "_" + self.tfjob_version

    # Configure the ksonnet app and submit the job with "ks apply".
    ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
                         self.params)
    ksonnet = ks_util.get_ksonnet_cmd(self.app_dir)
    apply_cmd = [ksonnet, "apply", self.env, "-c", component]
    util.run(apply_cmd, cwd=self.app_dir)
    logging.info("Created job %s in namespaces %s", self.name, self.namespace)

    # Block until the job is running or has already reached a terminal state.
    logging.info("Wait for conditions Running, Succeeded, or Failed")
    tfjob = tf_job_client.wait_for_condition(
        client,
        self.namespace,
        self.name,
        ["Running", "Succeeded", "Failed"],
        version=self.tfjob_version,
        status_callback=tf_job_client.log_status)
    logging.info("Current TFJob:\n %s", json.dumps(tfjob, indent=2))

    # Replica counts default to 0 when the spec does not list the type.
    replica_specs = tfjob.get("spec", {}).get("tfReplicaSpecs", {})
    num_ps = replica_specs.get("PS", {}).get("replicas", 0)
    num_workers = replica_specs.get("Worker", {}).get("replicas", 0)

    for replica_type in ("chief", "worker", "ps", "evaluator"):
        verify_runconfig(master_host, self.namespace, self.name, replica_type,
                         num_ps, num_workers)

    tf_job_client.terminate_replicas(client, self.namespace, self.name,
                                     "chief", 1)

    # Block until the job is done.
    logging.info("Waiting for job to finish.")
    tfjob = tf_job_client.wait_for_job(
        client,
        self.namespace,
        self.name,
        self.tfjob_version,
        status_callback=tf_job_client.log_status)
    logging.info("Final TFJob:\n %s", json.dumps(tfjob, indent=2))

    succeeded = tf_job_client.job_succeeded(tfjob)
    if not succeeded:
        # The failure is recorded, but the job is still cleaned up below.
        self.failure = "Job {0} in namespace {1} in status {2}".format(
            self.name, self.namespace, tfjob.get("status", {}))
        logging.error(self.failure)

    # Clean up: remove the TFJob and wait until it is gone.
    tf_job_client.delete_tf_job(
        client, self.namespace, self.name, version=self.tfjob_version)
    logging.info("Waiting for job %s in namespaces %s to be deleted.",
                 self.name, self.namespace)
    tf_job_client.wait_for_delete(
        client,
        self.namespace,
        self.name,
        self.tfjob_version,
        status_callback=tf_job_client.log_status)
def run(args, file_handler):  # pylint: disable=too-many-statements,too-many-branches
    """Submit the configured Argo workflows and wait for them to finish.

    The prow environment variables (JOB_TYPE, REPO_OWNER, REPO_NAME, ...)
    select which workflows from the config file are triggered: a workflow is
    skipped when it is scoped to other job types, or when none of the files
    changed by the PR/commit match its include_dirs patterns. Each selected
    workflow is deployed through ksonnet into a fresh environment.

    Args:
      args: Parsed command line arguments. The attributes read here are
        repos_dir, release, config_file, bucket, project, zone and cluster;
        args is also handed to get_namespace and generate_env_from_head.
      file_handler: Logging file handler. It is flushed and its file
        (``baseFilename``) is uploaded to GCS as build-log.txt.

    Returns:
      The value returned by prow_artifacts.finalize_prow_job.
    """
    job_type = os.getenv("JOB_TYPE")
    repo_owner = os.getenv("REPO_OWNER")
    repo_name = os.getenv("REPO_NAME")
    pull_base_sha = os.getenv("PULL_BASE_SHA")

    # For presubmit/postsubmit jobs, find the list of files changed by the PR.
    diff_command = []
    if job_type == "presubmit":
        # We need to get a common ancestor for the PR and the master branch
        common_ancestor = util.run(
            ["git", "merge-base", "HEAD", "master"],
            cwd=os.path.join(args.repos_dir, repo_owner, repo_name))
        diff_command = ["git", "diff", "--name-only", common_ancestor]
    elif job_type == "postsubmit":
        # See: https://git-scm.com/docs/git-diff
        # This syntax compares the commit before pull_base_sha with the commit
        # at pull_base_sha
        diff_command = [
            "git", "diff", "--name-only", pull_base_sha + "^", pull_base_sha
        ]

    changed_files = []
    if job_type in ("presubmit", "postsubmit"):
        changed_files = util.run(
            diff_command,
            cwd=os.path.join(args.repos_dir, repo_owner,
                             repo_name)).splitlines()

    for f in changed_files:
        logging.info("File %s is modified.", f)

    if args.release:
        generate_env_from_head(args)

    workflows = []
    if args.config_file:
        workflows.extend(parse_config_file(args.config_file, args.repos_dir))

    # Create an initial version of the file with no urls
    create_started_file(args.bucket, {})

    util.maybe_activate_service_account()

    util.configure_kubectl(args.project, args.zone, args.cluster)
    util.load_kube_config()

    workflow_names = []
    ui_urls = {}

    for w in workflows:
        # Create the name for the workflow
        # We truncate sha numbers to prevent the workflow name from being too
        # large. Workflow name should not be more than 63 characters because
        # its used as a label on the pods.
        workflow_name = os.getenv("JOB_NAME") + "-" + w.name
        ks_cmd = ks_util.get_ksonnet_cmd(w.app_dir)

        # Print ksonnet version
        util.run([ks_cmd, "version"])

        # Skip this workflow if it is scoped to a different job type.
        if w.job_types and job_type not in w.job_types:
            logging.info(
                "Skipping workflow %s because job type %s is not one of "
                "%s.", w.name, job_type, w.job_types)
            continue

        # If we are scoping this workflow to specific directories, check if
        # any files modified match the specified regex patterns.
        dir_modified = False
        if w.include_dirs:
            for f in changed_files:
                for d in w.include_dirs:
                    if fnmatch.fnmatch(f, d):
                        dir_modified = True
                        logging.info(
                            "Triggering workflow %s because %s in dir %s is modified.",
                            w.name, f, d)
                        break
                if dir_modified:
                    break

        # Only consider modified files when the job is pre or post submit,
        # and if the include_dirs stanza is defined.
        if job_type != "periodic" and w.include_dirs and not dir_modified:
            logging.info(
                "Skipping workflow %s because no code modified in %s.",
                w.name, w.include_dirs)
            continue

        if job_type == "presubmit":
            workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
            workflow_name += "-{0}".format(os.getenv("PULL_PULL_SHA")[0:7])
        elif job_type == "postsubmit":
            workflow_name += "-{0}".format(os.getenv("PULL_BASE_SHA")[0:7])

        workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER"))

        salt = uuid.uuid4().hex[0:4]
        # Add some salt. This is mostly a convenience for the case where you
        # are submitting jobs manually for testing/debugging. Since the prow
        # should vend unique build numbers for each job.
        workflow_name += "-{0}".format(salt)

        workflow_names.append(workflow_name)

        # Create a new environment for this run
        env = workflow_name

        util.run(
            [ks_cmd, "env", "add", env, "--namespace=" + get_namespace(args)],
            cwd=w.app_dir)

        util.run([
            ks_cmd, "param", "set", "--env=" + env, w.component, "name",
            workflow_name
        ], cwd=w.app_dir)

        # Set the prow environment variables.
        prow_env = []

        names = [
            "JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
            "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER",
            "REPO_NAME"
        ]
        names.sort()
        for v in names:
            if not os.getenv(v):
                continue
            prow_env.append("{0}={1}".format(v, os.getenv(v)))

        util.run([
            ks_cmd, "param", "set", "--env=" + env, w.component, "prow_env",
            ",".join(prow_env)
        ], cwd=w.app_dir)
        util.run([
            ks_cmd, "param", "set", "--env=" + env, w.component, "namespace",
            get_namespace(args)
        ], cwd=w.app_dir)
        util.run([
            ks_cmd, "param", "set", "--env=" + env, w.component, "bucket",
            args.bucket
        ], cwd=w.app_dir)
        if args.release:
            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component,
                "versionTag", os.getenv("VERSION_TAG")
            ], cwd=w.app_dir)

        # Set any extra params. We do this in alphabetical order to make it
        # easier to verify in the unittest.
        # sorted() is used because on Python 3 dict.keys() returns a view
        # that has no sort() method.
        param_names = sorted(w.params)
        for k in param_names:
            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component, k,
                "{0}".format(w.params[k])
            ], cwd=w.app_dir)

        # For debugging print out the manifest
        util.run([ks_cmd, "show", env, "-c", w.component], cwd=w.app_dir)
        util.run([ks_cmd, "apply", env, "-c", w.component], cwd=w.app_dir)

        ui_url = (
            "http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
            "?tab=workflow".format(workflow_name))
        ui_urls[workflow_name] = ui_url
        logging.info("URL for workflow: %s", ui_url)

    # We delay creating started.json until we know the Argo workflow URLs
    create_started_file(args.bucket, ui_urls)

    success = True
    workflow_phase = {}
    try:
        results = argo_client.wait_for_workflows(
            get_namespace(args),
            workflow_names,
            timeout=datetime.timedelta(minutes=180),
            status_callback=argo_client.log_status)
        for r in results:
            phase = r.get("status", {}).get("phase")
            name = r.get("metadata", {}).get("name")
            workflow_phase[name] = phase
            if phase != "Succeeded":
                success = False
            logging.info("Workflow %s/%s finished phase: %s",
                         get_namespace(args), name, phase)
    except util.TimeoutError:
        success = False
        logging.exception("Time out waiting for Workflows %s to finish",
                          ",".join(workflow_names))
    except Exception as e:
        # We explicitly log any exceptions so that they will be captured in
        # the build-log.txt that is uploaded to Gubernator.
        logging.exception("Exception occurred: %s", e)
        raise
    finally:
        success = prow_artifacts.finalize_prow_job(args.bucket, success,
                                                   workflow_phase, ui_urls)

        # Upload logs to GCS. No logs after this point will appear in the
        # file in gcs
        file_handler.flush()
        util.upload_file_to_gcs(
            file_handler.baseFilename,
            os.path.join(
                prow_artifacts.get_gcs_dir(args.bucket), "build-log.txt"))

    return success
def run(args, file_handler):  # pylint: disable=too-many-statements,too-many-branches
    """Submit the configured Argo workflows and wait for them to finish.

    The prow environment variables select which workflows from the config
    file are triggered: a workflow is skipped when it is scoped to other job
    types, or when none of the files changed by the PR/commit match its
    include_dirs patterns. A workflow with an ``app_dir`` is deployed through
    ksonnet; otherwise its ``py_func`` is rendered to a file by shelling out
    to ``kubeflow.testing.e2e_tool`` and the result is created as a custom
    object.

    Args:
      args: Parsed command line arguments. The attributes read here are
        repos_dir, release, config_file, bucket, project, zone and cluster;
        args is also handed to get_namespace and generate_env_from_head.
      file_handler: Logging file handler. It is flushed and its file
        (``baseFilename``) is uploaded to GCS as build-log.txt.

    Returns:
      The value returned by prow_artifacts.finalize_prow_job.

    Raises:
      util.ExceptionWithWorkflowResults: Re-raised after logging when
        waiting for the workflows fails with that exception.
    """
    # Check https://github.com/kubernetes/test-infra/blob/master/prow/jobs.md
    # for a description of the injected environment variables.
    job_type = os.getenv("JOB_TYPE")
    repo_owner = os.getenv("REPO_OWNER")
    repo_name = os.getenv("REPO_NAME")
    base_branch_name = os.getenv("PULL_BASE_REF")
    pull_base_sha = os.getenv("PULL_BASE_SHA")

    # For presubmit/postsubmit jobs, find the list of files changed by the PR.
    diff_command = []
    if job_type == "presubmit":
        # We need to get a common ancestor for the PR and the base branch
        cloned_repo_dir = os.path.join(args.repos_dir, repo_owner, repo_name)

        _ = util.run([
            "git", "fetch", "origin",
            base_branch_name + ":refs/remotes/origin/" + base_branch_name
        ], cwd=cloned_repo_dir)

        diff_command = ["git", "diff", "--name-only"]
        diff_branch = "remotes/origin/{}".format(base_branch_name)
        try:
            common_ancestor = util.run(
                ["git", "merge-base", "HEAD", diff_branch],
                cwd=cloned_repo_dir)
            diff_command.append(common_ancestor)
        except subprocess.CalledProcessError:
            logging.warning(
                "git merge-base failed; see "
                "https://github.com/kubeflow/kubeflow/issues/3523. Diff "
                "will be computed against the current master and "
                "therefore files not changed in the PR might be "
                "considered when determining which tests to trigger")
            diff_command.append(diff_branch)
    elif job_type == "postsubmit":
        # See: https://git-scm.com/docs/git-diff
        # This syntax compares the commit before pull_base_sha with the commit
        # at pull_base_sha
        diff_command = [
            "git", "diff", "--name-only", pull_base_sha + "^", pull_base_sha
        ]

    changed_files = []
    if job_type in ("presubmit", "postsubmit"):
        changed_files = util.run(
            diff_command,
            cwd=os.path.join(args.repos_dir, repo_owner,
                             repo_name)).splitlines()

    for f in changed_files:
        logging.info("File %s is modified.", f)

    if args.release:
        generate_env_from_head(args)

    workflows = []
    config = {}
    if args.config_file:
        config, new_workflows = parse_config_file(args.config_file,
                                                  args.repos_dir)
        workflows.extend(new_workflows)

    # Add any paths to the python path
    extra_py_paths = []
    for p in config.get("python_paths", []):
        # Assume that python_paths are in the format
        # $REPO_OWNER/$REPO_NAME/path, we need to ensure that the repo is
        # checked out if it is different from the current one, and if the
        # repo is not kubeflow/testing (which is already checked out).
        segments = p.split("/")
        if ((segments[0] != repo_owner or segments[1] != repo_name)
                and not p.startswith("kubeflow/testing")):
            logging.info("Need to clone %s/%s", segments[0], segments[1])
            util.clone_repo(
                os.path.join(args.repos_dir, segments[0], segments[1]),
                segments[0], segments[1])

        path = os.path.join(args.repos_dir, p)
        extra_py_paths.append(path)

    kf_test_path = os.path.join(args.repos_dir, "kubeflow/testing/py")
    if kf_test_path not in extra_py_paths:
        logging.info("Adding %s to extra python paths", kf_test_path)
        extra_py_paths.append(kf_test_path)

    logging.info("Extra python paths: %s", ":".join(extra_py_paths))

    # Create an initial version of the file with no urls
    create_started_file(args.bucket, {})

    util.maybe_activate_service_account()

    util.configure_kubectl(args.project, args.zone, args.cluster)
    util.load_kube_config()

    workflow_names = []
    ui_urls = {}

    for w in workflows:  # pylint: disable=too-many-nested-blocks
        # Create the name for the workflow
        # We truncate sha numbers to prevent the workflow name from being too
        # large. Workflow name should not be more than 63 characters because
        # its used as a label on the pods.
        workflow_name = os.getenv("JOB_NAME") + "-" + w.name

        # Skip this workflow if it is scoped to a different job type.
        if w.job_types and job_type not in w.job_types:
            logging.info(
                "Skipping workflow %s because job type %s is not one of "
                "%s.", w.name, job_type, w.job_types)
            continue

        # If we are scoping this workflow to specific directories, check if
        # any files modified match the specified regex patterns.
        dir_modified = False
        if w.include_dirs:
            for f in changed_files:
                for d in w.include_dirs:
                    if fnmatch.fnmatch(f, d):
                        dir_modified = True
                        logging.info(
                            "Triggering workflow %s because %s in dir %s is modified.",
                            w.name, f, d)
                        break
                if dir_modified:
                    break

        # Only consider modified files when the job is pre or post submit,
        # and if the include_dirs stanza is defined.
        if job_type != "periodic" and w.include_dirs and not dir_modified:
            logging.info(
                "Skipping workflow %s because no code modified in %s.",
                w.name, w.include_dirs)
            continue

        if job_type == "presubmit":
            # When not running under prow we might not set all environment
            # variables
            if os.getenv("PULL_NUMBER"):
                workflow_name += "-{0}".format(os.getenv("PULL_NUMBER"))
            if os.getenv("PULL_PULL_SHA"):
                workflow_name += "-{0}".format(
                    os.getenv("PULL_PULL_SHA")[0:7])
        elif job_type == "postsubmit":
            if os.getenv("PULL_BASE_SHA"):
                workflow_name += "-{0}".format(
                    os.getenv("PULL_BASE_SHA")[0:7])

        # Append the last 4 digits of the build number
        if os.getenv("BUILD_NUMBER"):
            workflow_name += "-{0}".format(os.getenv("BUILD_NUMBER")[-4:])

        salt = uuid.uuid4().hex[0:4]
        # Add some salt. This is mostly a convenience for the case where you
        # are submitting jobs manually for testing/debugging. Since the prow
        # should vend unique build numbers for each job.
        workflow_name += "-{0}".format(salt)
        workflow_names.append(workflow_name)

        # check if ks workflow and run
        if w.app_dir:
            ks_cmd = ks_util.get_ksonnet_cmd(w.app_dir)

            # Print ksonnet version
            util.run([ks_cmd, "version"])

            # Create a new environment for this run
            env = workflow_name

            util.run([
                ks_cmd, "env", "add", env,
                "--namespace=" + get_namespace(args)
            ], cwd=w.app_dir)

            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component, "name",
                workflow_name
            ], cwd=w.app_dir)

            # Set the prow environment variables.
            prow_env = []

            names = [
                "JOB_NAME", "JOB_TYPE", "BUILD_ID", "BUILD_NUMBER",
                "PULL_BASE_SHA", "PULL_NUMBER", "PULL_PULL_SHA", "REPO_OWNER",
                "REPO_NAME"
            ]
            names.sort()
            for v in names:
                if not os.getenv(v):
                    continue
                prow_env.append("{0}={1}".format(v, os.getenv(v)))

            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component,
                "prow_env", ",".join(prow_env)
            ], cwd=w.app_dir)
            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component,
                "namespace", get_namespace(args)
            ], cwd=w.app_dir)
            util.run([
                ks_cmd, "param", "set", "--env=" + env, w.component, "bucket",
                args.bucket
            ], cwd=w.app_dir)
            if args.release:
                util.run([
                    ks_cmd, "param", "set", "--env=" + env, w.component,
                    "versionTag", os.getenv("VERSION_TAG")
                ], cwd=w.app_dir)

            # Set any extra params. We do this in alphabetical order to make
            # it easier to verify in the unittest.
            # sorted() is used because on Python 3 dict.keys() returns a view
            # that has no sort() method.
            param_names = sorted(w.params)
            for k in param_names:
                util.run([
                    ks_cmd, "param", "set", "--env=" + env, w.component, k,
                    "{0}".format(w.params[k])
                ], cwd=w.app_dir)

            # For debugging print out the manifest
            util.run([ks_cmd, "show", env, "-c", w.component], cwd=w.app_dir)
            util.run([ks_cmd, "apply", env, "-c", w.component], cwd=w.app_dir)

            ui_url = (
                "http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
                "?tab=workflow".format(workflow_name))
            ui_urls[workflow_name] = ui_url
            logging.info("URL for workflow: %s", ui_url)
        else:
            w.kwargs["name"] = workflow_name
            w.kwargs["namespace"] = get_namespace(args)

            if TEST_TARGET_ARG_NAME not in w.kwargs:
                w.kwargs[TEST_TARGET_ARG_NAME] = w.name
                logging.info(
                    "Workflow %s doesn't set arg %s; defaulting to %s",
                    w.name, TEST_TARGET_ARG_NAME,
                    w.kwargs[TEST_TARGET_ARG_NAME])

            # TODO(https://github.com/kubeflow/testing/issues/467): We shell
            # out to e2e_tool in order to dumpy the Argo workflow to a file
            # which then reimport. We do this because importing the py_func
            # module appears to break when we have to dynamically adjust
            # sys.path to insert new paths. Setting PYTHONPATH before
            # launching python however appears to work which is why we shell
            # out to e2e_tool.
            command = [
                "python", "-m", "kubeflow.testing.e2e_tool", "show", w.py_func
            ]
            for k, v in w.kwargs.items():
                # The fire module turns underscores in parameter names into
                # hyphens so we convert underscores in parameter names to
                # hyphens
                command.append("--{0}={1}".format(k.replace("_", "-"), v))

            # Only the name of the temporary file is needed; the subprocess
            # writes to it, so it must outlive the "with" block.
            with tempfile.NamedTemporaryFile(delete=False) as hf:
                workflow_file = hf.name

            command.append("--output=" + workflow_file)
            env = os.environ.copy()
            env["PYTHONPATH"] = ":".join(extra_py_paths)
            util.run(command, env=env)

            with open(workflow_file) as hf:
                # NOTE(review): yaml.load without an explicit Loader is
                # deprecated by PyYAML and can construct arbitrary Python
                # objects. The file is produced by the e2e_tool subprocess
                # above; consider yaml.safe_load once it is confirmed that
                # the manifest only uses plain YAML tags.
                wf_result = yaml.load(hf)

            group, version = wf_result['apiVersion'].split('/')
            k8s_co = k8s_client.CustomObjectsApi()
            # The rendered manifest carries the authoritative workflow name.
            workflow_name = wf_result["metadata"]["name"]
            py_func_result = k8s_co.create_namespaced_custom_object(
                group=group,
                version=version,
                namespace=wf_result["metadata"]["namespace"],
                plural='workflows',
                body=wf_result)
            logging.info("Created workflow:\n%s",
                         yaml.safe_dump(py_func_result))

            ui_url = (
                "http://testing-argo.kubeflow.org/workflows/kubeflow-test-infra/{0}"
                "?tab=workflow".format(workflow_name))
            ui_urls[workflow_name] = ui_url
            logging.info("URL for workflow: %s", ui_url)

    # We delay creating started.json until we know the Argo workflow URLs
    create_started_file(args.bucket, ui_urls)

    workflow_success = False
    workflow_phase = {}
    workflow_status_yamls = {}
    results = []
    try:
        results = argo_client.wait_for_workflows(
            get_namespace(args),
            workflow_names,
            timeout=datetime.timedelta(minutes=180),
            status_callback=argo_client.log_status)
        workflow_success = True
    except util.ExceptionWithWorkflowResults as e:
        # We explicitly log any exceptions so that they will be captured in
        # the build-log.txt that is uploaded to Gubernator.
        logging.exception("Exception occurred: %s", e)
        results = e.workflow_results
        raise
    finally:
        prow_artifacts_dir = prow_artifacts.get_gcs_dir(args.bucket)
        # Upload logs to GCS. No logs after this point will appear in the
        # file in gcs
        file_handler.flush()
        util.upload_file_to_gcs(
            file_handler.baseFilename,
            os.path.join(prow_artifacts_dir, "build-log.txt"))

        # Upload workflow status to GCS.
        for r in results:
            phase = r.get("status", {}).get("phase")
            name = r.get("metadata", {}).get("name")
            workflow_phase[name] = phase
            workflow_status_yamls[name] = yaml.safe_dump(
                r, default_flow_style=False)
            if phase != "Succeeded":
                workflow_success = False
            logging.info("Workflow %s/%s finished phase: %s",
                         get_namespace(args), name, phase)

        for wf_name, wf_status in workflow_status_yamls.items():
            util.upload_to_gcs(
                wf_status,
                os.path.join(prow_artifacts_dir, '{}.yaml'.format(wf_name)))

        all_tests_success = prow_artifacts.finalize_prow_job(
            args.bucket, workflow_success, workflow_phase, ui_urls)

    return all_tests_success
def run_tfjob_with_cleanpod_policy(self, component, clean_pod_policy):
    """Run a TFJob and wait for the pod state matching a clean pod policy.

    Applies the ksonnet component, waits for the job to start and then to
    finish, performs the pod waits selected by ``clean_pod_policy`` and
    finally deletes the TFJob and waits for the deletion.

    If the job did not succeed, ``self.failure`` is set, the failure is
    logged and the method returns early; the TFJob is not deleted in that
    case.

    Args:
      component: Name of the ksonnet component to apply.
      clean_pod_policy: One of "All", "Running" or "None". Any other value
        skips the pod waits and goes straight to deleting the job.
    """
    tf_operator_util.load_kube_config()
    client = k8s_client.ApiClient()
    namespace = self.namespace
    name = self.name

    def wait_for_phase(replica_type, phase):
        # Block until the given replica type of this job is in `phase`.
        tf_job_client.wait_for_replica_type_in_phases(
            client, namespace, name, replica_type, [phase])

    def wait_for_pods_gone(labels):
        # Block until the pods selected by `labels` have been deleted.
        k8s_util.wait_for_pods_to_be_deleted(
            client, namespace, tf_job_client.to_selector(labels))

    # Set up the ksonnet app and create the TF job.
    ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
    tf_operator_util.setup_ks_app(
        self.app_dir, self.env, namespace, component, self.params)
    util.run([ks_cmd, "apply", self.env, "-c", component], cwd=self.app_dir)
    logging.info("Created job %s in namespaces %s", name, namespace)

    # First wait until the job is running or already in a terminal state.
    logging.info("Wait for conditions Running, Succeeded, or Failed")
    job = tf_job_client.wait_for_condition(
        client,
        namespace,
        name,
        ["Running", "Succeeded", "Failed"],
        version=self.tfjob_version,
        status_callback=tf_job_client.log_status)
    logging.info("Current TFJob:\n %s", json.dumps(job, indent=2))

    # Then wait for the job to complete.
    logging.info("Waiting for job to finish.")
    job = tf_job_client.wait_for_job(
        client,
        namespace,
        name,
        self.tfjob_version,
        status_callback=tf_job_client.log_status)
    logging.info("Final TFJob:\n %s", json.dumps(job, indent=2))

    if not tf_job_client.job_succeeded(job):
        self.failure = "Job {0} in namespace {1} in status {2}".format(
            name, namespace, job.get("status", {}))
        logging.error(self.failure)
        return

    if clean_pod_policy == "All":
        # All pods are deleted.
        wait_for_pods_gone(tf_job_client.get_labels(name))
    elif clean_pod_policy in ("Running", "None"):
        # Under both policies the completed Chief and Worker pods are kept.
        for replica_type in ("Chief", "Worker"):
            wait_for_phase(replica_type, "Succeeded")
        if clean_pod_policy == "Running":
            # Only running pods (PS) are deleted, completed pods are not.
            wait_for_pods_gone(tf_job_client.get_labels(name, "PS"))
        else:
            # No pods are deleted, so the PS replicas keep running.
            wait_for_phase("PS", "Running")

    # Delete the TFJob.
    tf_job_client.delete_tf_job(
        client, namespace, name, version=self.tfjob_version)
    logging.info("Waiting for job %s in namespaces %s to be deleted.",
                 name, namespace)
    tf_job_client.wait_for_delete(
        client,
        namespace,
        name,
        self.tfjob_version,
        status_callback=tf_job_client.log_status)
def run_tfjob_with_replica_restart_policy(self, component,
                                          replica_restart_policy, exit_code):
    """Run a TFJob and check replica behaviour for a given restart policy.

    Applies the ksonnet component, waits until the job is running (or in a
    terminal state), then calls
    ``tf_job_client.terminate_and_verify_start_time`` for replica "ps",
    index 0, with ``exit_code``. Afterwards the TFJob is deleted and the
    method waits for the deletion.

    If the verification returns ``False``, ``self.failure`` is set, the
    failure is logged and the method returns early; the TFJob is not
    deleted in that case.

    Args:
      component: Name of the ksonnet component to apply.
      replica_restart_policy: Restart policy under test, e.g. "Always",
        "OnFailure", "Never" or "ExitCode".
      exit_code: Exit code passed to the termination helper.
    """
    tf_operator_util.load_kube_config()
    api_client = k8s_client.ApiClient()

    # Setup the ksonnet app
    tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace,
                                  component, self.params)

    # Create the TF job
    ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
    util.run([ks_cmd, "apply", self.env, "-c", component], cwd=self.app_dir)
    logging.info("Created job %s in namespaces %s", self.name, self.namespace)

    # Wait for the job to either be in Running state or a terminal state
    logging.info("Wait for conditions Running, Succeeded, or Failed")
    results = tf_job_client.wait_for_condition(
        api_client,
        self.namespace,
        self.name,
        ["Running", "Succeeded", "Failed"],
        version=self.tfjob_version,
        status_callback=tf_job_client.log_status)
    logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

    # (policy, exit_code) combinations for which the final flag passed to
    # terminate_and_verify_start_time is False; every other combination,
    # including unrecognized policies or exit codes, passes True.
    # NOTE(review): the flag presumably means "a restart is expected" --
    # confirm against tf_job_client.terminate_and_verify_start_time.
    no_restart_cases = (
        ("OnFailure", 0),
        ("Never", 1),
        ("Never", 0),
        ("ExitCode", 1),
    )
    expect_restart = (
        (replica_restart_policy, exit_code) not in no_restart_cases)
    res = tf_job_client.terminate_and_verify_start_time(
        api_client, self.namespace, self.name, "ps", 0, exit_code,
        expect_restart)

    if res is False:
        # Adjacent literals are joined at compile time, so the message does
        # not pick up source indentation the way a backslash continuation
        # inside the string would.
        self.failure = (
            "Job {0} in namespace {1} with restart policy {2} failed test "
            "with exit_code {3}").format(self.name, self.namespace,
                                         replica_restart_policy, exit_code)
        logging.error(self.failure)
        return

    # Delete the TFJob.
    tf_job_client.delete_tf_job(
        api_client, self.namespace, self.name, version=self.tfjob_version)
    logging.info("Waiting for job %s in namespaces %s to be deleted.",
                 self.name, self.namespace)
    tf_job_client.wait_for_delete(
        api_client,
        self.namespace,
        self.name,
        self.tfjob_version,
        status_callback=tf_job_client.log_status)
def test_pod_names(self):
    """Check that the pods created for a TFJob carry the expected names.

    Applies the ksonnet component for this TFJob version, waits for the job
    to start, derives the expected pod names from the job's
    ``tfReplicaSpecs`` and verifies that each of them is among the pod names
    reported for the job. It then terminates the "chief" replica, waits for
    the job to finish, and deletes the TFJob.

    A job that does not succeed is recorded in ``self.failure`` and logged;
    the TFJob is still deleted afterwards.

    Raises:
      RuntimeError: If an expected pod name is not among the actual ones.
    """
    client = k8s_client.ApiClient()
    component = "_".join([COMPONENT_NAME, self.tfjob_version])

    # Set up the ksonnet app and create the TF job.
    ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
    ks_util.setup_ks_app(
        self.app_dir, self.env, self.namespace, component, self.params)
    util.run([ks_cmd, "apply", self.env, "-c", component], cwd=self.app_dir)
    logging.info("Created job %s in namespaces %s", self.name, self.namespace)

    logging.info("Wait for conditions Running, Succeeded, or Failed")
    job = tf_job_client.wait_for_condition(
        client,
        self.namespace,
        self.name,
        ["Running", "Succeeded", "Failed"],
        version=self.tfjob_version,
        status_callback=tf_job_client.log_status)
    logging.info("Current TFJob:\n %s", json.dumps(job, indent=2))

    # One expected pod name per (replica type, index) pair in the job spec.
    replica_counts = extract_job_specs(
        job.get("spec", {}).get("tfReplicaSpecs", {}))
    expected = set()
    for replica_type, count in replica_counts.items():
        logging.info("job_type = %s, replica = %s", replica_type, count)
        expected.update(
            POD_NAME_FORMAT.format(
                name=self.name, replica=replica_type, index=idx)
            for idx in range(count))

    actual = tf_job_client.get_pod_names(client, self.namespace, self.name)

    # Pods selected by namespace and job name are not guaranteed to belong
    # to this test run only, so this is a partial check: the expected names
    # only have to be contained in the selected pod names.
    if not expected <= actual:
        msg = "Actual pod names doesn't match. Expected: {0} Actual: {1}".format(
            expected, actual)
        logging.error(msg)
        raise RuntimeError(msg)

    tf_job_client.terminate_replicas(
        client, self.namespace, self.name, "chief", 1)

    # Wait for the job to complete.
    logging.info("Waiting for job to finish.")
    job = tf_job_client.wait_for_job(
        client,
        self.namespace,
        self.name,
        self.tfjob_version,
        status_callback=tf_job_client.log_status)
    logging.info("Final TFJob:\n %s", json.dumps(job, indent=2))

    if not tf_job_client.job_succeeded(job):
        self.failure = "Job {0} in namespace {1} in status {2}".format(
            self.name, self.namespace, job.get("status", {}))
        logging.error(self.failure)

    # Delete the TFJob.
    tf_job_client.delete_tf_job(
        client, self.namespace, self.name, version=self.tfjob_version)
    logging.info("Waiting for job %s in namespaces %s to be deleted.",
                 self.name, self.namespace)
    tf_job_client.wait_for_delete(
        client,
        self.namespace,
        self.name,
        self.tfjob_version,
        status_callback=tf_job_client.log_status)