def test_write_xml(self):
  with tempfile.NamedTemporaryFile(delete=False) as hf:
    pass

  success = test_util.TestCase()
  success.class_name = "some_test"
  success.name = "first"
  success.time = 10

  failure = test_util.TestCase()
  failure.class_name = "some_test"
  failure.name = "first"
  failure.time = 10
  failure.failure = "failed for some reason."

  test_util.create_junit_xml_file([success, failure], hf.name)
  with open(hf.name) as hf:
    output = hf.read()
  print(output)
  expected = ("""<testsuite failures="1" tests="2" time="20">"""
              """<testcase classname="some_test" name="first" time="10" />"""
              """<testcase classname="some_test" """
              """failure="failed for some reason." name="first" """
              """time="10" /></testsuite>""")

  self.assertEquals(expected, output)
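For reference, here is a minimal sketch of how the <testsuite> element asserted above could be built. It is an assumption about test_util.create_xml, not the actual implementation; it uses xml.etree.ElementTree, whose alphabetical attribute ordering matches the expected string.

# Hypothetical sketch only; test_util.create_xml is assumed, not shown in this file.
import xml.etree.ElementTree as ET


def create_xml_sketch(test_cases):
  """Build a <testsuite> element shaped like the one asserted in test_write_xml."""
  failures = sum(1 for c in test_cases if getattr(c, "failure", None))
  total_time = sum(c.time for c in test_cases)
  suite = ET.Element(
      "testsuite",
      failures="{0}".format(failures),
      tests="{0}".format(len(test_cases)),
      time="{0}".format(total_time))
  for c in test_cases:
    attrib = {
        "classname": c.class_name,
        "name": c.name,
        "time": "{0}".format(c.time),
    }
    if getattr(c, "failure", None):
      # Failures are recorded as a testcase attribute, per the expected output.
      attrib["failure"] = c.failure
    ET.SubElement(suite, "testcase", attrib=attrib)
  return ET.ElementTree(suite)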
def test(args):
  """Run the tests."""
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  util.configure_kubectl(project, zone, cluster_name)

  t = test_util.TestCase()
  try:
    start = time.time()
    util.run(["helm", "test", "tf-job"])
  except subprocess.CalledProcessError as e:
    t.failure = "helm test failed;\n" + (e.output or "")
    # Reraise the exception so that the prow job will fail and the test
    # is marked as a failure.
    # TODO(jlewi): It would be better to do this holistically; e.g. by
    # processing all the junit xml files and checking for any failures. This
    # should be more tractable when we migrate off Airflow to Argo.
    raise
  finally:
    t.time = time.time() - start
    t.name = "e2e-test"
    t.class_name = "GKE"
    test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def run_test(args):
  """Run a test."""
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  util.configure_kubectl(project, zone, cluster_name)
  util.load_kube_config()

  api_client = k8s_client.ApiClient()

  t = test_util.TestCase()
  t.class_name = "tfjob_test"
  t.name = os.path.basename(args.spec)

  loader = jinja2.FileSystemLoader(os.path.dirname(args.spec))

  if not args.image_tag:
    raise ValueError("--image_tag must be provided.")
  logging.info("Loading spec from %s with image_tag=%s", args.spec,
               args.image_tag)
  spec_contents = jinja2.Environment(loader=loader).get_template(
      os.path.basename(args.spec)).render(image_tag=args.image_tag)

  spec = yaml.load(spec_contents)

  # Make the job name unique.
  spec["metadata"]["name"] += "-" + uuid.uuid4().hex[0:4]
  try:
    start = time.time()
    api_response = tf_job_client.create_tf_job(api_client, spec)
    namespace = api_response["metadata"]["namespace"]
    name = api_response["metadata"]["name"]

    logging.info("Created job %s in namespace %s", name, namespace)
    results = tf_job_client.wait_for_job(
        api_client, namespace, name, status_callback=tf_job_client.log_status)

    if results["status"]["state"] != "succeeded":
      t.failure = "Job {0} in namespace {1} in state {2}".format(
          name, namespace, results["status"]["state"])

    # TODO(jlewi): Here are some validation checks to run:
    #   1. Check tensorboard is created if it's part of the job spec.
    #   2. Check that all resources are garbage collected.
    # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
    # TODO(jlewi): Are there other generic validation checks we should run?
  except util.TimeoutError:
    t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
        name, namespace)
  finally:
    t.time = time.time() - start
    if args.junit_path:
      test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def testSubprocessError(self):

  def run():
    raise subprocess.CalledProcessError(10, "some command", output="some output")

  t = test_util.TestCase()
  self.assertRaises(subprocess.CalledProcessError, test_util.wrap_test, run, t)
  self.assertGreater(t.time, 0)
  self.assertEquals("Subprocess failed;\nsome output", t.failure)
def testOk(self):

  def ok():
    time.sleep(1)

  t = test_util.TestCase()
  test_util.wrap_test(ok, t)
  self.assertGreater(t.time, 0)
  self.assertEquals(None, t.failure)
def testGeneralError(self):

  def run():
    raise ValueError("some error")

  t = test_util.TestCase()
  self.assertRaises(ValueError, test_util.wrap_test, run, t)
  self.assertGreater(t.time, 0)
  self.assertEquals("Test failed; some error", t.failure)
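The three wrap_test cases above pin down its contract: time the callable, record a failure message on the TestCase, and re-raise. Below is a minimal sketch consistent with those assertions; the actual test_util.wrap_test may differ.

# Hypothetical sketch only; not the real test_util.wrap_test.
import subprocess
import time


def wrap_test_sketch(test_func, test_case):
  """Run test_func, recording its duration and any failure on test_case."""
  start = time.time()
  try:
    test_func()
  except subprocess.CalledProcessError as e:
    # Matches the "Subprocess failed;\nsome output" assertion above.
    test_case.failure = "Subprocess failed;\n" + (e.output or "")
    raise
  except Exception as e:  # pylint: disable=broad-except
    # Matches the "Test failed; some error" assertion above.
    test_case.failure = "Test failed; " + str(e)
    raise
  finally:
    test_case.time = time.time() - start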
def run_test(args, test_case):  # pylint: disable=too-many-branches,too-many-statements
  """Run a test."""
  util.load_kube_config()

  api_client = k8s_client.ApiClient()

  t = test_util.TestCase()
  t.class_name = "tfjob_test"
  namespace, name, env = test_runner.setup_ks_app(args)
  t.name = os.path.basename(name)

  try:  # pylint: disable=too-many-nested-blocks
    util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

    logging.info("Created job %s in namespace %s", name, namespace)

    logging.info("Waiting for conditions Succeeded or Failed")
    results = tf_job_client.wait_for_condition(
        api_client, namespace, name, ["Succeeded", "Failed"],
        status_callback=tf_job_client.log_status)

    logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

    # For v1alpha2, check that the job's terminal condition is Failed.
    last_condition = results.get("status", {}).get("conditions", [])[-1]
    if last_condition.get("type", "").lower() != "failed":
      message = "Job {0} in namespace {1} did not fail; status {2}".format(
          name, namespace, results.get("status", {}))
      logging.error(message)
      test_case.add_failure_info(message)
      return

    pattern = ".*the spec is invalid.*"
    condition_message = last_condition.get("message", "")
    if not re.match(pattern, condition_message):
      message = "Condition message {0} did not match pattern {1}".format(
          condition_message, pattern)
      logging.error(message)
      test_case.add_failure_info(message)
  except tf_operator_util.JobTimeoutError as e:
    if e.job:
      spec = "Job:\n" + json.dumps(e.job, indent=2)
    else:
      spec = "JobTimeoutError did not contain job"
    message = ("Timeout waiting for {0} in namespace {1} to finish; ").format(
        name, namespace) + spec
    logging.exception(message)
    test_case.add_failure_info(message)
  except Exception as e:  # pylint: disable-msg=broad-except
    # TODO(jlewi): I'm observing flakes where the exception has message "status";
    # in an effort to try to nail down this exception we print out more
    # information about the exception.
    message = "There was a problem running the job; Exception {0}".format(e)
    logging.exception(message)
    test_case.add_failure_info(message)
def run_tests(args):
  # Print out the pylint version because different versions can produce
  # different results.
  util.run(["pylint", "--version"])

  # kubeflow_testing is imported as a submodule so we should exclude it.
  # TODO(jlewi): Perhaps we should get a list of submodules and exclude
  # them automatically?
  dir_excludes = ["kubeflow_testing", "vendor"]
  includes = ["*_test.py"]
  test_cases = []

  env = os.environ.copy()
  # TODO(jlewi): Once we switch to using Argo I think we can stop setting
  # the PYTHONPATH here and just inherit it from the environment.
  # When we use Argo each step will run in its own pod and we can set the
  # PYTHONPATH environment variable as needed for that pod.
  env["PYTHONPATH"] = (
      args.src_dir + ":" + os.path.join(args.src_dir, "kubeflow_testing", "py"))

  num_failed = 0
  for root, dirs, files in os.walk(args.src_dir, topdown=True):
    # excludes can be done with fnmatch.filter and complementary set,
    # but it's more annoying to read.
    dirs[:] = [d for d in dirs if d not in dir_excludes]
    for pat in includes:
      for f in fnmatch.filter(files, pat):
        full_path = os.path.join(root, f)

        test_case = test_util.TestCase()
        test_case.class_name = "pytest"
        test_case.name = full_path[len(args.src_dir):]
        start_time = time.time()
        test_cases.append(test_case)
        try:
          util.run(["python", full_path], cwd=args.src_dir, env=env)
        except subprocess.CalledProcessError:
          test_case.failure = "{0} failed.".format(test_case.name)
          num_failed += 1
        finally:
          test_case.time = time.time() - start_time

  if num_failed:
    logging.error("%s tests failed.", num_failed)
  else:
    logging.info("All tests passed.")

  if not args.junit_path:
    logging.info("No --junit_path.")
    return

  gcs_client = None
  if args.junit_path.startswith("gs://"):
    gcs_client = storage.Client(project=args.project)
  test_util.create_junit_xml_file(test_cases, args.junit_path, gcs_client)
def test_get_num_failures_success(self):
  success = test_util.TestCase()
  success.class_name = "some_test"
  success.name = "first"
  success.time = 10

  e = test_util.create_xml([success])
  s = StringIO.StringIO()
  e.write(s)
  xml_value = s.getvalue()
  self.assertEquals(0, test_util.get_num_failures(xml_value))
def test_get_num_failures(self):
  failure = test_util.TestCase()
  failure.class_name = "some_test"
  failure.name = "first"
  failure.time = 10
  failure.failure = "failed for some reason."

  e = test_util.create_xml([failure])
  s = StringIO.StringIO()
  e.write(s)
  xml_value = s.getvalue()
  self.assertEquals(1, test_util.get_num_failures(xml_value))
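Both tests above serialize the suite with create_xml and then only inspect the failures count, so get_num_failures can be as simple as the sketch below. This is an assumption, not the actual test_util code.

# Hypothetical sketch only; not the real test_util.get_num_failures.
import xml.etree.ElementTree as ET


def get_num_failures_sketch(xml_value):
  """Read the failures count back out of a serialized <testsuite>."""
  root = ET.fromstring(xml_value)
  return int(root.get("failures", "0"))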
def run_tests(args):
  # Print out the pylint version because different versions can produce
  # different results.
  util.run(["pylint", "--version"])

  dir_excludes = ["vendor"]
  includes = ["*_test.py"]
  test_cases = []

  env = os.environ.copy()
  env["PYTHONPATH"] = args.src_dir

  num_failed = 0
  for root, dirs, files in os.walk(args.src_dir, topdown=True):
    # excludes can be done with fnmatch.filter and complementary set,
    # but it's more annoying to read.
    dirs[:] = [d for d in dirs if d not in dir_excludes]
    for pat in includes:
      for f in fnmatch.filter(files, pat):
        full_path = os.path.join(root, f)

        test_case = test_util.TestCase()
        test_case.class_name = "pytest"
        # Use the path relative to src_dir as the test name.
        test_case.name = full_path[len(args.src_dir):]
        start_time = time.time()
        test_cases.append(test_case)
        try:
          util.run(["python", full_path], cwd=args.src_dir, env=env)
        except subprocess.CalledProcessError:
          test_case.failure = "{0} failed.".format(test_case.name)
          num_failed += 1
        finally:
          test_case.time = time.time() - start_time

  if num_failed:
    logging.error("%s tests failed.", num_failed)
  else:
    logging.info("All tests passed.")

  if not args.junit_path:
    logging.info("No --junit_path.")
    return

  gcs_client = None
  if args.junit_path.startswith("gs://"):
    gcs_client = storage.Client(project=args.project)
  test_util.create_junit_xml_file(test_cases, args.junit_path, gcs_client)
def run_lint(args):
  start_time = time.time()
  # Print out the pylint version because different versions can produce
  # different results.
  util.run(["pylint", "--version"])

  dir_excludes = ["vendor"]
  includes = ["*.py"]
  failed_files = []
  rc_file = os.path.join(args.src_dir, ".pylintrc")
  for root, dirs, files in os.walk(args.src_dir, topdown=True):
    # excludes can be done with fnmatch.filter and complementary set,
    # but it's more annoying to read.
    dirs[:] = [d for d in dirs if d not in dir_excludes]
    for pat in includes:
      for f in fnmatch.filter(files, pat):
        full_path = os.path.join(root, f)
        try:
          util.run(["pylint", "--rcfile=" + rc_file, full_path],
                   cwd=args.src_dir)
        except subprocess.CalledProcessError:
          # Record the path relative to src_dir.
          failed_files.append(full_path[len(args.src_dir):])

  if failed_files:
    logging.error("%s files had lint errors.", len(failed_files))
  else:
    logging.info("No lint issues.")

  if not args.junit_path:
    logging.info("No --junit_path.")
    return

  test_case = test_util.TestCase()
  test_case.class_name = "pylint"
  test_case.name = "pylint"
  test_case.time = time.time() - start_time
  if failed_files:
    test_case.failure = "Files with lint issues: {0}".format(
        ", ".join(failed_files))

  gcs_client = None
  if args.junit_path.startswith("gs://"):
    gcs_client = storage.Client(project=args.project)
  test_util.create_junit_xml_file([test_case], args.junit_path, gcs_client)
def test(args):
  """Run the tests."""
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  util.configure_kubectl(project, zone, cluster_name)

  t = test_util.TestCase()
  try:
    start = time.time()
    util.run(["helm", "test", "tf-job"])
  except subprocess.CalledProcessError as e:
    # e.output can be None if no output was captured.
    t.failure = "helm test failed;\n" + (e.output or "")
  finally:
    t.time = time.time() - start
    t.name = "e2e-test"
    t.class_name = "GKE"
    test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def setup_kubeflow(args):
  """Setup Kubeflow.

  Args:
    args: Command line arguments that control the setup process.
  """
  project = args.project
  cluster_name = args.cluster
  zone = args.zone

  util.configure_kubectl(project, zone, cluster_name)

  util.load_kube_config()
  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  t = test_util.TestCase()
  try:
    start = time.time()

    params = {
        "tfJobImage": args.image,
        "name": "kubeflow-core",
        "namespace": args.namespace,
        "tfJobVersion": args.tf_job_version,
    }

    component = "core"

    account = util.run_and_output(
        ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
    logging.info("Using GCP account %s", account)

    ks_deploy(args.test_app_dir, component, params, account=account)

    # Verify that the TfJob operator is actually deployed.
    if args.tf_job_version == "v1alpha2":
      tf_job_deployment_name = "tf-job-operator-v1alpha2"
    elif args.tf_job_version == "v1beta1":
      tf_job_deployment_name = "tf-job-operator-v1beta1"
    else:
      raise ValueError("Unrecognized value for tf_job_version %s" %
                       args.tf_job_version)
    logging.info("Verifying TfJob deployment %s started.",
                 tf_job_deployment_name)

    # TODO(jlewi): We should verify the image of the operator is the correct
    # one.
    try:
      util.wait_for_deployment(api_client, args.namespace,
                               tf_job_deployment_name)
    finally:
      # Run kubectl describe to get useful information about the deployment.
      # This will help troubleshoot any errors.
      util.run([
          "kubectl", "-n", args.namespace, "describe", "deploy",
          tf_job_deployment_name
      ])
      util.run([
          "kubectl", "-n", args.namespace, "describe", "pods", "-l",
          "name=tf-job-operator"
      ])

  # Reraise the exception so that the step fails because there's no point
  # continuing the test.
  except subprocess.CalledProcessError as e:
    t.failure = "kubeflow-deploy failed;\n" + (e.output or "")
    raise
  except util.TimeoutError as e:
    t.failure = e.message
    raise
  finally:
    t.time = time.time() - start
    t.name = "kubeflow-deploy"
    t.class_name = "GKE"
    gcs_client = storage.Client(project=args.project)
    test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def run_test(args):  # pylint: disable=too-many-branches,too-many-statements
  """Run a test."""
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  # TODO(jlewi): When using GKE we should copy the .kube config and any other
  # files to the test directory. We should then set the environment variable
  # KUBECONFIG to point at that file. This should prevent us from having
  # to rerun util.configure_kubectl on each step. Instead we could run it once
  # as part of GKE cluster creation and store the config in the NFS directory.
  # This would make the handling of credentials
  # and KUBECONFIG more consistent between GKE and minikube and eventually
  # this could be extended to other K8s deployments.
  if cluster_name:
    util.configure_kubectl(project, zone, cluster_name)
  util.load_kube_config()

  api_client = k8s_client.ApiClient()
  masterHost = api_client.configuration.host

  t = test_util.TestCase()
  t.class_name = "tfjob_test"
  namespace, name, env = _setup_ks_app(args)
  t.name = os.path.basename(name)

  start = time.time()
  try:  # pylint: disable=too-many-nested-blocks
    # We repeat the test multiple times.
    # This ensures that if we delete the job we can create a new job with the
    # same name.
    # TODO(jlewi): We should make this an argument.
    num_trials = 2
    for trial in range(num_trials):
      logging.info("Trial %s", trial)
      util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

      logging.info("Created job %s in namespace %s", name, namespace)
      logging.info("tfjob_version=%s", args.tfjob_version)
      # Wait for the job to either be in Running state or a terminal state.
      if args.tfjob_version == "v1alpha1":
        logging.info("Waiting for phase Running, Done, or Failed")
        results = tf_job_client.wait_for_phase(
            api_client,
            namespace,
            name, ["Running", "Done", "Failed"],
            status_callback=tf_job_client.log_status)
      else:
        logging.info("Waiting for conditions Running, Succeeded, or Failed")
        results = tf_job_client.wait_for_condition(
            api_client,
            namespace,
            name, ["Running", "Succeeded", "Failed"],
            status_callback=tf_job_client.log_status)

      logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

      # The job is now either running or done.
      if args.shutdown_policy:
        logging.info("Enforcing shutdownPolicy %s", args.shutdown_policy)
        if args.shutdown_policy in ["master", "chief"]:
          if args.tfjob_version == "v1alpha1":
            replica = "master"
          else:
            replica = "chief"
        elif args.shutdown_policy in ["worker", "all_workers"]:
          replica = "worker"
        else:
          raise ValueError("Unrecognized shutdown_policy "
                           "%s" % args.shutdown_policy)

        # Number of targets.
        num_targets = 1
        if args.shutdown_policy in ["all_workers"]:
          # Assume v1alpha2.
          num_targets = results.get("spec", {}).get("tfReplicaSpecs", {}).get(
              "Worker", {}).get("replicas", 0)
          logging.info("There are %s worker replicas", num_targets)

        if args.tfjob_version == "v1alpha1":
          runtime_id = results.get("spec", {}).get("RuntimeId")
          target = "{name}-{replica}-{runtime}".format(
              name=name, replica=replica, runtime=runtime_id)
          pod_labels = get_labels(name, runtime_id)
          pod_selector = to_selector(pod_labels)
        else:
          target = "{name}-{replica}".format(name=name, replica=replica)
          pod_labels = get_labels_v1alpha2(namespace, name)
          pod_selector = to_selector(pod_labels)

        # Wait for the pods to be ready before we shut down.
        # TODO(jlewi): We are getting pods using a label selector so there is
        # a risk that the pod we actually care about isn't present.
        logging.info("Waiting for pods to be running before shutting down.")
        wait_for_pods_to_be_in_phases(
            api_client,
            namespace,
            pod_selector, ["Running"],
            timeout=datetime.timedelta(minutes=4))
        logging.info("Pods are ready")
        logging.info("Issuing the terminate request")
        for num in range(num_targets):
          full_target = target + "-{0}".format(num)
          terminateReplica(masterHost, namespace, full_target)

      logging.info("Waiting for job to finish.")
      results = tf_job_client.wait_for_job(
          api_client,
          namespace,
          name,
          args.tfjob_version,
          status_callback=tf_job_client.log_status)

      if args.tfjob_version == "v1alpha1":
        if results.get("status", {}).get("state", {}).lower() != "succeeded":
          t.failure = "Trial {0} Job {1} in namespace {2} in state {3}".format(
              trial, name, namespace,
              results.get("status", {}).get("state", None))
          logging.error(t.failure)
          break
      else:
        # For v1alpha2, check that the terminal condition is Succeeded.
        last_condition = results.get("status", {}).get("conditions", [])[-1]
        if last_condition.get("type", "").lower() != "succeeded":
          t.failure = "Trial {0} Job {1} in namespace {2} in status {3}".format(
              trial, name, namespace, results.get("status", {}))
          logging.error(t.failure)
          break

      runtime_id = results.get("spec", {}).get("RuntimeId")
      logging.info("Trial %s Job %s in namespace %s runtime ID %s", trial,
                   name, namespace, runtime_id)

      uid = results.get("metadata", {}).get("uid")
      events = get_events(api_client, namespace, uid)
      for e in events:
        logging.info("K8s event: %s", e.message)

      # Print out the K8s events because it can be useful for debugging.
      for e in events:
        logging.info("Received K8s Event:\n%s", e)
      created_pods, created_services = parse_events(events)

      num_expected = 0
      if args.tfjob_version == "v1alpha1":
        for replica in results.get("spec", {}).get("replicaSpecs", []):
          num_expected += replica.get("replicas", 0)
      else:
        for replicakey in results.get("spec", {}).get("tfReplicaSpecs", {}):
          replica_spec = results.get("spec", {}).get("tfReplicaSpecs",
                                                     {}).get(replicakey, {})
          if replica_spec:
            num_expected += replica_spec.get("replicas", 1)

      creation_failures = []
      if len(created_pods) != num_expected:
        message = ("Expected {0} pods to be created but only "
                   "got {1} create events.").format(num_expected,
                                                    len(created_pods))
        creation_failures.append(message)

      if len(created_services) != num_expected:
        message = ("Expected {0} services to be created but only "
                   "got {1} create events.").format(num_expected,
                                                    len(created_services))
        creation_failures.append(message)

      if creation_failures:
        # TODO(jlewi): Starting with
        # https://github.com/kubeflow/tf-operator/pull/646 the number of events
        # no longer seems to match the expected; it looks like maybe events
        # are being combined? For now we just log a warning rather than an
        # error.
        logging.warning(creation_failures)

      if args.tfjob_version == "v1alpha1":
        pod_labels = get_labels(name, runtime_id)
        pod_selector = to_selector(pod_labels)
      else:
        pod_labels = get_labels_v1alpha2(name)
        pod_selector = to_selector(pod_labels)

      # We don't wait for pods to be deleted in v1alpha2 because CleanPodPolicy
      # means completed pods won't be deleted.
      # TODO(jlewi): We should add a test to deal with deleted pods.
      if args.tfjob_version == "v1alpha1":
        wait_for_pods_to_be_deleted(api_client, namespace, pod_selector)

      tf_job_client.delete_tf_job(
          api_client, namespace, name, version=args.tfjob_version)

      logging.info("Waiting for job %s in namespace %s to be deleted.", name,
                   namespace)
      wait_for_delete(
          api_client,
          namespace,
          name,
          args.tfjob_version,
          status_callback=tf_job_client.log_status)

    # TODO(jlewi): Here are some validation checks to run:
    #   1. Check that all resources are garbage collected.
    # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
    # TODO(jlewi): Are there other generic validation checks we should run?
  except util.TimeoutError:
    t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
        name, namespace)
    logging.exception(t.failure)
  except Exception as e:  # pylint: disable-msg=broad-except
    # TODO(jlewi): I'm observing flakes where the exception has message "status";
    # in an effort to try to nail down this exception we print out more
    # information about the exception.
    logging.exception("There was a problem running the job; Exception %s", e)
    # We want to catch all exceptions because we want to mark the test as failed.
    t.failure = ("Exception occurred; type {0} message {1}".format(
        e.__class__, e.message))
  finally:
    t.time = time.time() - start
    if args.junit_path:
      test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def setup(args):
  """Setup a GKE cluster for TensorFlow jobs.

  Args:
    args: Command line arguments that control the setup process.
  """
  gke = discovery.build("container", "v1")

  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  machine_type = "n1-standard-8"

  cluster_request = {
      "cluster": {
          "name": cluster_name,
          "description": "A GKE cluster for TF.",
          "initialNodeCount": 1,
          "nodeConfig": {
              "machineType": machine_type,
              "oauthScopes": [
                  "https://www.googleapis.com/auth/cloud-platform",
              ],
          },
      }
  }

  if args.accelerators:
    # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha.
    cluster_request["cluster"]["enableKubernetesAlpha"] = True

    cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
    for accelerator_spec in args.accelerators:
      accelerator_type, accelerator_count = accelerator_spec.split("=", 1)
      cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
          "acceleratorCount": accelerator_count,
          "acceleratorType": accelerator_type,
      })

  util.create_cluster(gke, project, zone, cluster_request)

  util.configure_kubectl(project, zone, cluster_name)

  util.load_kube_config()
  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  t = test_util.TestCase()
  try:
    start = time.time()

    params = {
        "tfJobImage": args.image,
        "name": "kubeflow-core",
        "namespace": args.namespace,
        "tfJobVersion": args.tf_job_version,
    }

    component = "core"

    account = util.run_and_output(
        ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
    logging.info("Using GCP account %s", account)
    util.run([
        "kubectl", "create", "clusterrolebinding", "default-admin",
        "--clusterrole=cluster-admin", "--user=" + account
    ])

    ks_deploy(args.test_app_dir, component, params, account=account)

    # Verify that the TfJob operator is actually deployed.
    if args.tf_job_version == "v1alpha1":
      tf_job_deployment_name = "tf-job-operator"
    elif args.tf_job_version == "v1alpha2":
      tf_job_deployment_name = "tf-job-operator-v1alpha2"
    else:
      raise ValueError(
          "Unrecognized value for tf_job_version %s" % args.tf_job_version)
    logging.info("Verifying TfJob deployment %s started.",
                 tf_job_deployment_name)

    # TODO(jlewi): We should verify the image of the operator is the correct one.
    util.wait_for_deployment(api_client, args.namespace, tf_job_deployment_name)

  # Reraise the exception so that the step fails because there's no point
  # continuing the test.
  except subprocess.CalledProcessError as e:
    t.failure = "kubeflow-deploy failed;\n" + (e.output or "")
    raise
  except util.TimeoutError as e:
    t.failure = e.message
    raise
  finally:
    t.time = time.time() - start
    t.name = "kubeflow-deploy"
    t.class_name = "GKE"
    gcs_client = storage.Client(project=args.project)
    test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def run_lint(args):
  start_time = time.time()
  # Print out the pylint version because different versions can produce
  # different results.
  util.run(["pylint", "--version"])

  # kubeflow_testing is imported as a submodule so we should exclude it.
  # TODO(jlewi): Perhaps we should get a list of submodules and exclude
  # them automatically?
  dir_excludes = [
      "dashboard/frontend/node_modules",
      "kubeflow_testing",
      "test/test-app",
      "vendor",
  ]
  full_dir_excludes = [
      os.path.join(os.path.abspath(args.src_dir), f) for f in dir_excludes
  ]
  includes = ["*.py"]
  failed_files = []
  rc_file = os.path.join(args.src_dir, ".pylintrc")
  for root, dirs, files in os.walk(os.path.abspath(args.src_dir), topdown=True):
    # excludes can be done with fnmatch.filter and complementary set,
    # but it's more annoying to read.
    exclude = False
    for e in full_dir_excludes:
      if root.startswith(e):
        exclude = True
        break
    if exclude:
      continue

    dirs[:] = [d for d in dirs]
    for pat in includes:
      for f in fnmatch.filter(files, pat):
        full_path = os.path.join(root, f)
        try:
          util.run(["pylint", "--rcfile=" + rc_file, full_path],
                   cwd=args.src_dir)
        except subprocess.CalledProcessError:
          failed_files.append(full_path[len(args.src_dir):])

  if failed_files:
    failed_files.sort()
    logging.error("%s files had lint errors:\n%s", len(failed_files),
                  "\n".join(failed_files))
  else:
    logging.info("No lint issues.")

  if not args.junit_path:
    logging.info("No --junit_path.")
    return

  test_case = test_util.TestCase()
  test_case.class_name = "pylint"
  test_case.name = "pylint"
  test_case.time = time.time() - start_time
  if failed_files:
    test_case.failure = "Files with lint issues: {0}".format(
        ", ".join(failed_files))

  gcs_client = None
  if args.junit_path.startswith("gs://"):
    gcs_client = storage.Client(project=args.project)
  test_util.create_junit_xml_file([test_case], args.junit_path, gcs_client)
def setup(args):
  """Test deploying Kubeflow."""
  if args.cluster:
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    logging.info("Using cluster: %s in project: %s in zone: %s", cluster_name,
                 project, zone)
    # Print out config to help debug issues with accounts and
    # credentials.
    util.run(["gcloud", "config", "list"])
    util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()
  else:
    # TODO(jlewi): This is sufficient for API access but it doesn't create
    # a kubeconfig file which ksonnet needs for ks init.
    logging.info("Running inside cluster.")
    incluster_config.load_incluster_config()

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  now = datetime.datetime.now()
  run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

  if not os.path.exists(args.test_dir):
    os.makedirs(args.test_dir)

  logging.info("Using test directory: %s", args.test_dir)

  namespace_name = run_label

  def run():
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    # Set a GITHUB_TOKEN so that we aren't rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = args.github_token

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run([
        "ks",
        "init",
        app_name,
    ], cwd=args.test_dir, use_print=True)

    app_dir = os.path.join(args.test_dir, app_name)

    kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
             cwd=app_dir)

    # Install required packages.
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
      util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Delete the vendor directory and replace it with a symlink to the src
    # so that we use the code at the desired commit.
    target_dir = os.path.join(app_dir, "vendor", "kubeflow")

    logging.info("Deleting %s", target_dir)
    shutil.rmtree(target_dir)

    source = os.path.join(args.test_dir, "src", "kubeflow")
    logging.info("Creating link %s -> %s", target_dir, source)
    os.symlink(source, target_dir)

    # Deploy Kubeflow.
    util.run([
        "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
        "--namespace=" + namespace.metadata.name
    ], cwd=app_dir)

    # TODO(jlewi): For reasons I don't understand, even though we ran
    # configure_kubectl above, if we don't rerun it we get rbac errors
    # when we do ks apply; I think because we aren't using the proper service
    # account. This might have something to do with the way ksonnet gets
    # its credentials; maybe we need to configure credentials after calling
    # ks init?
    if args.cluster:
      util.configure_kubectl(args.project, args.zone, args.cluster)

    apply_command = [
        "ks",
        "apply",
        "default",
        "-c",
        "kubeflow-core",
    ]

    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)

  main_case = test_util.TestCase()
  main_case.class_name = "KubeFlow"
  main_case.name = "deploy-kubeflow"
  try:
    test_util.wrap_test(run, main_case)
  finally:
    # Delete the namespace.
    logging.info("Deleting namespace %s", namespace_name)

    # We report teardown as a separate test case because this will help
    # us track down issues with garbage collecting namespaces.
    teardown = test_util.TestCase(main_case.class_name, "teardown")

    def run_teardown():
      core_api = k8s_client.CoreV1Api(api_client)
      core_api.delete_namespace(namespace_name, {})

    try:
      test_util.wrap_test(run_teardown, teardown)
    except Exception as e:  # pylint: disable-msg=broad-except
      logging.error("There was a problem deleting namespace: %s; %s",
                    namespace_name, e.message)

  junit_path = os.path.join(args.artifacts_dir, "junit_kubeflow-deploy.xml")
  logging.info("Writing test results to %s", junit_path)
  test_util.create_junit_xml_file([main_case, teardown], junit_path)
def run_test(args):  # pylint: disable=too-many-branches,too-many-statements
  """Run a test."""
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  # TODO(jlewi): When using GKE we should copy the .kube config and any other
  # files to the test directory. We should then set the environment variable
  # KUBECONFIG to point at that file. This should prevent us from having
  # to rerun util.configure_kubectl on each step. Instead we could run it once
  # as part of GKE cluster creation and store the config in the NFS directory.
  # This would make the handling of credentials
  # and KUBECONFIG more consistent between GKE and minikube and eventually
  # this could be extended to other K8s deployments.
  if cluster_name:
    util.configure_kubectl(project, zone, cluster_name)
  util.load_kube_config()

  api_client = k8s_client.ApiClient()

  salt = uuid.uuid4().hex[0:4]

  # Create a new environment for this run.
  env = "test-env-{0}".format(salt)

  util.run(["ks", "env", "add", env], cwd=args.app_dir)

  name = None
  namespace = None
  for pair in args.params.split(","):
    k, v = pair.split("=", 1)
    if k == "name":
      name = v

    if k == "namespace":
      namespace = v
    util.run(["ks", "param", "set", "--env=" + env, args.component, k, v],
             cwd=args.app_dir)

  if not name:
    raise ValueError("name must be provided as a parameter.")

  t = test_util.TestCase()
  t.class_name = "tfjob_test"
  t.name = os.path.basename(name)

  if not namespace:
    raise ValueError("namespace must be provided as a parameter.")

  start = time.time()
  try:
    # We repeat the test multiple times.
    # This ensures that if we delete the job we can create a new job with the
    # same name.
    # TODO(jlewi): We should make this an argument.
    num_trials = 2
    for trial in range(num_trials):
      logging.info("Trial %s", trial)
      util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

      logging.info("Created job %s in namespace %s", name, namespace)
      results = tf_job_client.wait_for_job(
          api_client, namespace, name, status_callback=tf_job_client.log_status)

      if results.get("status", {}).get("state", {}).lower() != "succeeded":
        t.failure = "Trial {0} Job {1} in namespace {2} in state {3}".format(
            trial, name, namespace,
            results.get("status", {}).get("state", None))
        logging.error(t.failure)
        break

      runtime_id = results.get("spec", {}).get("RuntimeId")
      logging.info("Trial %s Job %s in namespace %s runtime ID %s", trial,
                   name, namespace, runtime_id)

      uid = results.get("metadata", {}).get("uid")
      events = get_events(api_client, namespace, uid)
      created_pods, created_services = parse_events(events)

      num_expected = 0
      for replica in results.get("spec", {}).get("replicaSpecs", []):
        num_expected += replica.get("replicas", 0)

      creation_failures = []
      if len(created_pods) != num_expected:
        message = ("Expected {0} pods to be created but only "
                   "got {1} create events.").format(num_expected,
                                                    len(created_pods))
        creation_failures.append(message)

      if len(created_services) != num_expected:
        message = ("Expected {0} services to be created but only "
                   "got {1} create events.").format(num_expected,
                                                    len(created_services))
        creation_failures.append(message)

      if creation_failures:
        t.failure = "Trial {0} Job {1} in namespace {2}: {3}".format(
            trial, name, namespace, ", ".join(creation_failures))
        logging.error(t.failure)
        break

      pod_labels = get_labels(name, runtime_id)
      pod_selector = to_selector(pod_labels)
      wait_for_pods_to_be_deleted(api_client, namespace, pod_selector)

      tf_job_client.delete_tf_job(api_client, namespace, name)

      logging.info("Waiting for job %s in namespace %s to be deleted.", name,
                   namespace)
      wait_for_delete(
          api_client, namespace, name, status_callback=tf_job_client.log_status)

    # TODO(jlewi): Here are some validation checks to run:
    #   1. Check that all resources are garbage collected.
    # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
    # TODO(jlewi): Are there other generic validation checks we should run?
  except util.TimeoutError:
    t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
        name, namespace)
    logging.error(t.failure)
  except Exception as e:  # pylint: disable-msg=broad-except
    # TODO(jlewi): I'm observing flakes where the exception has message "status";
    # in an effort to try to nail down this exception we print out more
    # information about the exception.
    logging.error("There was a problem running the job; Exception %s", e)
    logging.error(
        "There was a problem running the job; Exception "
        "message: %s", e.message)
    logging.error("Exception type: %s", e.__class__)
    logging.error("Exception args: %s", e.args)
    # We want to catch all exceptions because we want to mark the test as failed.
    t.failure = ("Exception occurred; type {0} message {1}".format(
        e.__class__, e.message))
  finally:
    t.time = time.time() - start
    if args.junit_path:
      test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def run_test(args):
  """Run a test."""
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  util.configure_kubectl(project, zone, cluster_name)
  util.load_kube_config()

  api_client = k8s_client.ApiClient()

  salt = uuid.uuid4().hex[0:4]

  # Create a new environment for this run.
  env = "test-env-{0}".format(salt)

  util.run(["ks", "env", "add", env], cwd=args.app_dir)

  name = None
  namespace = None
  for pair in args.params.split(","):
    k, v = pair.split("=", 1)
    if k == "name":
      name = v

    if k == "namespace":
      namespace = v
    util.run(["ks", "param", "set", "--env=" + env, args.component, k, v],
             cwd=args.app_dir)

  if not name:
    raise ValueError("name must be provided as a parameter.")

  t = test_util.TestCase()
  t.class_name = "tfjob_test"
  t.name = os.path.basename(name)

  if not namespace:
    raise ValueError("namespace must be provided as a parameter.")

  start = time.time()
  try:
    util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

    logging.info("Created job %s in namespace %s", name, namespace)
    results = tf_job_client.wait_for_job(
        api_client, namespace, name, status_callback=tf_job_client.log_status)

    if results.get("status", {}).get("state", {}).lower() != "succeeded":
      t.failure = "Job {0} in namespace {1} in state {2}".format(
          name, namespace, results.get("status", {}).get("state", None))

    # TODO(jlewi): Here are some validation checks to run:
    #   1. Check tensorboard is created if it's part of the job spec.
    #   2. Check that all resources are garbage collected.
    # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
    # TODO(jlewi): Are there other generic validation checks we should run?
  except util.TimeoutError:
    t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
        name, namespace)
  except Exception as e:  # pylint: disable-msg=broad-except
    # TODO(jlewi): I'm observing flakes where the exception has message "status";
    # in an effort to try to nail down this exception we print out more
    # information about the exception.
    logging.error("There was a problem running the job; Exception %s", e)
    logging.error(
        "There was a problem running the job; Exception "
        "message: %s", e.message)
    logging.error("Exception type: %s", e.__class__)
    logging.error("Exception args: %s", e.args)
    # We want to catch all exceptions because we want to mark the test as failed.
    t.failure = ("Exception occurred; type {0} message {1}".format(
        e.__class__, e.message))
  finally:
    t.time = time.time() - start
    if args.junit_path:
      test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def setup(args):
  """Test deploying Kubeflow."""
  if args.cluster:
    project = args.project
    cluster_name = args.cluster
    zone = args.zone
    logging.info("Using cluster: %s in project: %s in zone: %s", cluster_name,
                 project, zone)
    # Print out config to help debug issues with accounts and
    # credentials.
    util.run(["gcloud", "config", "list"])
    util.configure_kubectl(project, zone, cluster_name)
    util.load_kube_config()
  else:
    # TODO(jlewi): This is sufficient for API access but it doesn't create
    # a kubeconfig file which ksonnet needs for ks init.
    logging.info("Running inside cluster.")
    incluster_config.load_incluster_config()

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  now = datetime.datetime.now()
  run_label = "e2e-" + now.strftime("%m%d-%H%M-") + uuid.uuid4().hex[0:4]

  if not os.path.exists(args.test_dir):
    os.makedirs(args.test_dir)

  logging.info("Using test directory: %s", args.test_dir)

  namespace_name = run_label

  def run():
    namespace = _setup_test(api_client, namespace_name)
    logging.info("Using namespace: %s", namespace)
    # Set a GITHUB_TOKEN so that we aren't rate limited by GitHub;
    # see: https://github.com/ksonnet/ksonnet/issues/233
    os.environ["GITHUB_TOKEN"] = args.github_token

    # Initialize a ksonnet app.
    app_name = "kubeflow-test"
    util.run([
        "ks",
        "init",
        app_name,
    ], cwd=args.test_dir, use_print=True)

    app_dir = os.path.join(args.test_dir, app_name)

    # TODO(jlewi): In presubmits we probably want to change this so we can
    # pull the changes on a branch. It's not clear whether that's well
    # supported in Ksonnet yet.
    kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow"
    util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
             cwd=app_dir)

    # Install required packages.
    # TODO(jlewi): For presubmits how do we pull the package from the desired
    # branch at the desired commit?
    packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

    for p in packages:
      util.run(["ks", "pkg", "install", p], cwd=app_dir)

    # Deploy Kubeflow.
    util.run([
        "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
        "--namespace=" + namespace.metadata.name
    ], cwd=app_dir)

    apply_command = [
        "ks",
        "apply",
        "default",
        "-c",
        "kubeflow-core",
    ]

    if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
      with open(os.getenv("GOOGLE_APPLICATION_CREDENTIALS")) as hf:
        key = json.load(hf)
        apply_command.append("--as=" + key["client_email"])
    util.run(apply_command, cwd=app_dir)

    # Verify that the TfJob operator is actually deployed.
    tf_job_deployment_name = "tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    util.wait_for_deployment(api_client, namespace.metadata.name,
                             tf_job_deployment_name)

    # Verify that JupyterHub is actually deployed.
    jupyter_name = "tf-hub"
    logging.info("Verifying TfHub started.")
    util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)

  main_case = test_util.TestCase()
  main_case.class_name = "KubeFlow"
  main_case.name = "deploy-kubeflow"
  try:
    test_util.wrap_test(run, main_case)
  finally:
    # Delete the namespace.
    logging.info("Deleting namespace %s", namespace_name)

    # We report teardown as a separate test case because this will help
    # us track down issues with garbage collecting namespaces.
    teardown = test_util.TestCase(main_case.class_name, "teardown")

    def run_teardown():
      core_api = k8s_client.CoreV1Api(api_client)
      core_api.delete_namespace(namespace_name, {})

    try:
      test_util.wrap_test(run_teardown, teardown)
    except Exception as e:  # pylint: disable-msg=broad-except
      logging.error("There was a problem deleting namespace: %s; %s",
                    namespace_name, e.message)

  junit_path = os.path.join(args.artifacts_dir, "junit_kubeflow-deploy.xml")
  logging.info("Writing test results to %s", junit_path)
  test_util.create_junit_xml_file([main_case, teardown], junit_path)
def setup(args):
  """Setup a GKE cluster for TensorFlow jobs.

  Args:
    args: Command line arguments that control the setup process.
  """
  gke = discovery.build("container", "v1")

  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  chart = args.chart
  machine_type = "n1-standard-8"

  cluster_request = {
      "cluster": {
          "name": cluster_name,
          "description": "A GKE cluster for TF.",
          "initialNodeCount": 1,
          "nodeConfig": {
              "machineType": machine_type,
              "oauthScopes": [
                  "https://www.googleapis.com/auth/cloud-platform",
              ],
          },
          # TODO(jlewi): Stop pinning the GKE version once 1.8 becomes the default.
          "initialClusterVersion": "1.8.1-gke.1",
      }
  }

  if args.accelerators:
    # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha.
    cluster_request["cluster"]["enableKubernetesAlpha"] = True

    cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
    for accelerator_spec in args.accelerators:
      accelerator_type, accelerator_count = accelerator_spec.split("=", 1)
      cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
          "acceleratorCount": accelerator_count,
          "acceleratorType": accelerator_type,
      })

  util.create_cluster(gke, project, zone, cluster_request)

  util.configure_kubectl(project, zone, cluster_name)

  util.load_kube_config()
  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  util.setup_cluster(api_client)

  # The GCS client is only created when the chart is fetched from GCS;
  # create_junit_xml_file can handle a None client for local junit paths.
  gcs_client = None
  if chart.startswith("gs://"):
    remote = chart
    chart = os.path.join(tempfile.gettempdir(), os.path.basename(chart))
    gcs_client = storage.Client(project=project)
    bucket_name, path = util.split_gcs_uri(remote)

    bucket = gcs_client.get_bucket(bucket_name)
    blob = bucket.blob(path)
    logging.info("Downloading %s to %s", remote, chart)
    blob.download_to_filename(chart)

  t = test_util.TestCase()
  try:
    start = time.time()
    util.run([
        "helm", "install", chart, "-n", "tf-job", "--wait", "--replace",
        "--set", "rbac.install=true,cloud=gke"
    ])
  except subprocess.CalledProcessError as e:
    # e.output can be None if no output was captured.
    t.failure = "helm install failed;\n" + (e.output or "")
  finally:
    t.time = time.time() - start
    t.name = "helm-tfjob-install"
    t.class_name = "GKE"
    test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def setup_cluster(args):
  """Setup a GKE cluster for TensorFlow jobs.

  Args:
    args: Command line arguments that control the setup process.
  """
  gke = discovery.build("container", "v1")

  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  machine_type = "n1-standard-8"

  cluster_request = {
      "cluster": {
          "name": cluster_name,
          "description": "A GKE cluster for TF.",
          "initialNodeCount": 1,
          "nodeConfig": {
              "machineType": machine_type,
              "oauthScopes": [
                  "https://www.googleapis.com/auth/cloud-platform",
              ],
          },
      }
  }

  if args.accelerators:
    # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha.
    cluster_request["cluster"]["enableKubernetesAlpha"] = True

    cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
    for accelerator_spec in args.accelerators:
      accelerator_type, accelerator_count = accelerator_spec.split("=", 1)
      cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
          "acceleratorCount": accelerator_count,
          "acceleratorType": accelerator_type,
      })

  util.create_cluster(gke, project, zone, cluster_request)

  util.configure_kubectl(project, zone, cluster_name)

  util.load_kube_config()
  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  t = test_util.TestCase()
  try:
    start = time.time()
    account = util.run_and_output(
        ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
    logging.info("Using GCP account %s", account)
    util.run([
        "kubectl", "create", "clusterrolebinding", "default-admin",
        "--clusterrole=cluster-admin", "--user=" + account
    ])
  except subprocess.CalledProcessError as e:
    t.failure = "setup-cluster failed;\n" + (e.output or "")
    raise
  except util.TimeoutError as e:
    t.failure = e.message
    raise
  finally:
    t.time = time.time() - start
    t.name = "setup-cluster"
    t.class_name = "GKE"
    gcs_client = storage.Client(project=args.project)
    test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def run_test(args):  # pylint: disable=too-many-branches,too-many-statements
  """Run a test."""
  gcs_client = storage.Client(project=args.project)
  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  util.configure_kubectl(project, zone, cluster_name)
  util.load_kube_config()

  api_client = k8s_client.ApiClient()

  salt = uuid.uuid4().hex[0:4]

  # Create a new environment for this run.
  env = "test-env-{0}".format(salt)

  util.run(["ks", "env", "add", env], cwd=args.app_dir)

  name = None
  namespace = None
  for pair in args.params.split(","):
    k, v = pair.split("=", 1)
    if k == "name":
      name = v

    if k == "namespace":
      namespace = v
    util.run(["ks", "param", "set", "--env=" + env, args.component, k, v],
             cwd=args.app_dir)

  if not name:
    raise ValueError("name must be provided as a parameter.")

  t = test_util.TestCase()
  t.class_name = "tfjob_test"
  t.name = os.path.basename(name)

  if not namespace:
    raise ValueError("namespace must be provided as a parameter.")

  start = time.time()
  try:
    # We repeat the test multiple times.
    # This ensures that if we delete the job we can create a new job with the
    # same name.
    # TODO(jlewi): We should make this an argument.
    num_trials = 2
    for trial in range(num_trials):
      logging.info("Trial %s", trial)
      util.run(["ks", "apply", env, "-c", args.component], cwd=args.app_dir)

      logging.info("Created job %s in namespace %s", name, namespace)
      results = tf_job_client.wait_for_job(
          api_client, namespace, name, status_callback=tf_job_client.log_status)

      if results.get("status", {}).get("state", {}).lower() != "succeeded":
        t.failure = "Trial {0} Job {1} in namespace {2} in state {3}".format(
            trial, name, namespace,
            results.get("status", {}).get("state", None))
        logging.error(t.failure)
        break

      runtime_id = results.get("spec", {}).get("RuntimeId")
      logging.info("Trial %s Job %s in namespace %s runtime ID %s", trial,
                   name, namespace, runtime_id)

      # TODO(jlewi): We should check that pods were created for each replica.
      pod_labels = get_labels(name, runtime_id)
      pod_selector = to_selector(pod_labels)
      pods = list_pods(api_client, namespace, pod_selector)

      logging.info("Trial %s selector: %s matched %s pods", trial, pod_selector,
                   len(pods.items))
      if not pods.items:
        t.failure = ("Trial {0} Job {1} in namespace {2} no pods found for "
                     " selector {3}").format(trial, name, namespace,
                                             pod_selector)
        logging.error(t.failure)
        break

      tf_job_client.delete_tf_job(api_client, namespace, name)

      wait_for_delete(
          api_client, namespace, name, status_callback=tf_job_client.log_status)

      # Verify the pods have been deleted. tf_job_client uses foreground
      # deletion so there shouldn't be any resources for the job left
      # once the job is gone.
      pods = list_pods(api_client, namespace, pod_selector)

      logging.info("Trial %s selector: %s matched %s pods", trial, pod_selector,
                   len(pods.items))
      if pods.items:
        t.failure = ("Trial {0} Job {1} in namespace {2} pods found for "
                     " selector {3}; pods\n{4}").format(trial, name, namespace,
                                                        pod_selector, pods)
        logging.error(t.failure)
        break

      logging.info("Trial %s all pods deleted.", trial)

    # TODO(jlewi): Here are some validation checks to run:
    #   1. Check that all resources are garbage collected.
    # TODO(jlewi): Add an option to add chaos and randomly kill various resources?
    # TODO(jlewi): Are there other generic validation checks we should run?
  except util.TimeoutError:
    t.failure = "Timeout waiting for {0} in namespace {1} to finish.".format(
        name, namespace)
    logging.error(t.failure)
  except Exception as e:  # pylint: disable-msg=broad-except
    # TODO(jlewi): I'm observing flakes where the exception has message "status";
    # in an effort to try to nail down this exception we print out more
    # information about the exception.
    logging.error("There was a problem running the job; Exception %s", e)
    logging.error(
        "There was a problem running the job; Exception "
        "message: %s", e.message)
    logging.error("Exception type: %s", e.__class__)
    logging.error("Exception args: %s", e.args)
    # We want to catch all exceptions because we want to mark the test as failed.
    t.failure = ("Exception occurred; type {0} message {1}".format(
        e.__class__, e.message))
  finally:
    t.time = time.time() - start
    if args.junit_path:
      test_util.create_junit_xml_file([t], args.junit_path, gcs_client)