def test_wait_for_deployment(self):
  """wait_for_deployment returns promptly once the deployment is ready."""
  # Build a fake deployment whose status already reports one ready replica,
  # so the polling loop inside util.wait_for_deployment should exit at once.
  status = k8s_client.ExtensionsV1beta1DeploymentStatus()
  status.ready_replicas = 1
  deployment = k8s_client.ExtensionsV1beta1Deployment()
  deployment.status = status

  # Mock API client: every call_api invocation yields the ready deployment.
  mock_client = mock.MagicMock(spec=k8s_client.ApiClient)
  mock_client.call_api.return_value = deployment

  util.wait_for_deployment(mock_client, "some-namespace", "some-deployment")
def run():
  """Deploy Kubeflow into a test namespace with ksonnet and verify it starts.

  Reads `args`, `api_client` and `namespace_name` from enclosing/module scope
  (presumably set up by the script's main entry point — TODO confirm).
  Shells out to `ks`/`kubectl` via util.run, so this has real side effects
  against the configured cluster and the local filesystem (args.test_dir).
  """
  namespace = _setup_test(api_client, namespace_name)
  logging.info("Using namespace: %s", namespace)
  # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
  # see: https://github.com/ksonnet/ksonnet/issues/233
  os.environ["GITHUB_TOKEN"] = args.github_token

  # Initialize a ksonnet app.
  app_name = "kubeflow-test"
  util.run(["ks", "init", app_name,], cwd=args.test_dir, use_print=True)
  app_dir = os.path.join(args.test_dir, app_name)

  # TODO(jlewi): In presubmits we probably want to change this so we can
  # pull the changes on a branch. Its not clear whether that's well supported
  # in Ksonnet yet.
  kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow"
  util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
           cwd=app_dir)

  # Install required packages.
  # TODO(jlewi): For presubmits how do we pull the package from the desired
  # branch at the desired commit.
  packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]
  for p in packages:
    util.run(["ks", "pkg", "install", p], cwd=app_dir)

  # Deploy Kubeflow into the freshly created test namespace.
  util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
            "--namespace=" + namespace.metadata.name], cwd=app_dir)

  apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",]

  # When running with a service-account key, impersonate that account's
  # client_email so RBAC is evaluated against the service account.
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    with open(os.getenv("GOOGLE_APPLICATION_CREDENTIALS")) as hf:
      key = json.load(hf)
      apply_command.append("--as=" + key["client_email"])
  util.run(apply_command, cwd=app_dir)

  # Verify that the TfJob operator is actually deployed.
  tf_job_deployment_name = "tf-job-operator"
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace.metadata.name,
                           tf_job_deployment_name)

  # Verify that JupyterHub is actually deployed.
  jupyter_name = "tf-hub"
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)
def run():
  """Deploy Kubeflow with ksonnet (vendoring local sources) and verify it.

  Variant of the deploy flow that symlinks the local kubeflow checkout into
  the ksonnet app's vendor directory, so the manifests come from the commit
  under test rather than the remote registry. Reads `args`, `api_client` and
  `namespace_name` from enclosing/module scope — TODO confirm. Shells out to
  `ks` via util.run, so this mutates the cluster and args.test_dir.
  """
  namespace = _setup_test(api_client, namespace_name)
  logging.info("Using namespace: %s", namespace)
  # Set a GITHUB_TOKEN so that we don't get rate limited by GitHub;
  # see: https://github.com/ksonnet/ksonnet/issues/233
  os.environ["GITHUB_TOKEN"] = args.github_token

  # Initialize a ksonnet app.
  app_name = "kubeflow-test"
  util.run([
      "ks",
      "init",
      app_name,
  ], cwd=args.test_dir, use_print=True)
  app_dir = os.path.join(args.test_dir, app_name)

  kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow"
  util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry],
           cwd=app_dir)

  # Install required packages.
  packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]
  for p in packages:
    util.run(["ks", "pkg", "install", p], cwd=app_dir)

  # Delete the vendor directory and replace with a symlink to the src
  # so that we use the code at the desired commit.
  target_dir = os.path.join(app_dir, "vendor", "kubeflow")
  logging.info("Deleting %s", target_dir)
  shutil.rmtree(target_dir)

  source = os.path.join(args.test_dir, "src", "kubeflow")
  logging.info("Creating link %s -> %s", target_dir, source)
  os.symlink(source, target_dir)

  # Deploy Kubeflow.
  util.run([
      "ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
      "--namespace=" + namespace.metadata.name
  ], cwd=app_dir)

  # TODO(jlewi): For reasons I don't understand even though we ran
  # configure_kubectl above, if we don't rerun it we get rbac errors
  # when we do ks apply; I think because we aren't using the proper service
  # account. This might have something to do with the way ksonnet gets
  # its credentials; maybe we need to configure credentials after calling
  # ks init?
  if args.cluster:
    util.configure_kubectl(args.project, args.zone, args.cluster)

  apply_command = [
      "ks",
      "apply",
      "default",
      "-c",
      "kubeflow-core",
  ]
  util.run(apply_command, cwd=app_dir)

  # Verify that the TfJob operator is actually deployed.
  tf_job_deployment_name = "tf-job-operator"
  logging.info("Verifying TfJob controller started.")
  util.wait_for_deployment(api_client, namespace.metadata.name,
                           tf_job_deployment_name)

  # Verify that JupyterHub is actually deployed.
  jupyter_name = "tf-hub"
  logging.info("Verifying TfHub started.")
  util.wait_for_statefulset(api_client, namespace.metadata.name, jupyter_name)
def setup(args):
  """Setup a GKE cluster for TensorFlow jobs.

  Creates the cluster, points kubectl at it, deploys kubeflow-core, waits
  for the TfJob operator, and always records a JUnit result (uploaded via
  GCS) in the finally block — success or failure.

  Args:
    args: Command line arguments that control the setup process.
  """
  gke = discovery.build("container", "v1")

  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  machine_type = "n1-standard-8"

  cluster_request = {
      "cluster": {
          "name": cluster_name,
          "description": "A GKE cluster for TF.",
          "initialNodeCount": 1,
          "nodeConfig": {
              "machineType": machine_type,
              "oauthScopes": [
                  "https://www.googleapis.com/auth/cloud-platform",
              ],
          },
      }
  }

  if args.accelerators:
    # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha
    cluster_request["cluster"]["enableKubernetesAlpha"] = True

    cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
    # Each spec is "TYPE=COUNT"; split on the first "=" only.
    for accelerator_spec in args.accelerators:
      accelerator_type, accelerator_count = accelerator_spec.split("=", 1)
      cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
          "acceleratorCount": accelerator_count,
          "acceleratorType": accelerator_type,
      })

  util.create_cluster(gke, project, zone, cluster_request)
  util.configure_kubectl(project, zone, cluster_name)
  util.load_kube_config()

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  t = test_util.TestCase()
  try:
    start = time.time()
    params = {
        "tfJobImage": args.image,
        "name": "kubeflow-core",
        "namespace": args.namespace,
    }
    component = "core"

    account = util.run_and_output(
        ["gcloud", "config", "get-value", "account", "--quiet"]).strip()
    logging.info("Using GCP account %s", account)
    # NOTE(review): the source is corrupted below — "******" is a redaction
    # that replaced real code. It presumably closed this util.run list with
    # "--user=" + account, performed the actual ksonnet deploy (using
    # `params` and `component`, which are otherwise unused here), and
    # assigned tf_job_deployment_name = "tf-job-operator" (referenced
    # further down). Restore this span from version control; as written
    # this block does not parse.
    util.run([
        "kubectl", "create", "clusterrolebinding", "default-admin",
        "--clusterrole=cluster-admin",
        "--user="******"tf-job-operator"
    logging.info("Verifying TfJob controller started.")
    # TODO(jlewi): We should verify the image of the operator is the correct.
    util.wait_for_deployment(api_client, args.namespace,
                             tf_job_deployment_name)
    # Reraise the exception so that the step fails because there's no point
    # continuing the test.
  except subprocess.CalledProcessError as e:
    # e.output may be None when output wasn't captured; guard the concat.
    t.failure = "kubeflow-deploy failed;\n" + (e.output or "")
    raise
  except util.TimeoutError as e:
    t.failure = e.message
    raise
  finally:
    # Always record timing and emit the JUnit XML, even on failure.
    t.time = time.time() - start
    t.name = "kubeflow-deploy"
    t.class_name = "GKE"
    gcs_client = storage.Client(project=args.project)
    test_util.create_junit_xml_file([t], args.junit_path, gcs_client)
def setup(args):
  """Setup a GKE cluster for TensorFlow jobs.

  Creates a GKE cluster (pinned to 1.8.5-gke.0), configures kubectl,
  installs the tf-job helm chart (downloading it from GCS first when the
  chart path is a gs:// URI), waits for the tf-job operator deployment,
  and always records a JUnit result in the finally block.

  Args:
    args: Command line arguments that control the setup process.
  """
  gke = discovery.build("container", "v1")

  project = args.project
  cluster_name = args.cluster
  zone = args.zone
  chart = args.chart
  machine_type = "n1-standard-8"

  cluster_request = {
      "cluster": {
          "name": cluster_name,
          "description": "A GKE cluster for TF.",
          "initialNodeCount": 1,
          "nodeConfig": {
              "machineType": machine_type,
              "oauthScopes": [
                  "https://www.googleapis.com/auth/cloud-platform",
              ],
          },
          # TODO(jlewi): Stop pinning GKE version once 1.8 becomes the default.
          "initialClusterVersion": "1.8.5-gke.0",
      }
  }

  if args.accelerators:
    # TODO(jlewi): Stop enabling Alpha once GPUs make it out of Alpha
    cluster_request["cluster"]["enableKubernetesAlpha"] = True

    cluster_request["cluster"]["nodeConfig"]["accelerators"] = []
    # Each spec is "TYPE=COUNT"; split on the first "=" only.
    for accelerator_spec in args.accelerators:
      accelerator_type, accelerator_count = accelerator_spec.split("=", 1)
      cluster_request["cluster"]["nodeConfig"]["accelerators"].append({
          "acceleratorCount": accelerator_count,
          "acceleratorType": accelerator_type,
      })

  util.create_cluster(gke, project, zone, cluster_request)
  util.configure_kubectl(project, zone, cluster_name)
  util.load_kube_config()

  # Create an API client object to talk to the K8s master.
  api_client = k8s_client.ApiClient()

  util.setup_cluster(api_client)

  # A None gcs_client should be passed to test_util.create_junit_xml_file
  # unless chart.startswith("gs://"), e.g. https://storage.googleapis.com/...
  gcs_client = None
  if chart.startswith("gs://"):
    # Download the chart from GCS to a local temp file so helm can use it.
    remote = chart
    chart = os.path.join(tempfile.gettempdir(), os.path.basename(chart))
    gcs_client = storage.Client(project=project)
    bucket_name, path = util.split_gcs_uri(remote)

    bucket = gcs_client.get_bucket(bucket_name)
    blob = bucket.blob(path)
    logging.info("Downloading %s to %s", remote, chart)
    blob.download_to_filename(chart)

  t = test_util.TestCase()
  try:
    start = time.time()
    util.run([
        "helm", "install", chart, "-n", "tf-job", "--wait", "--replace",
        "--set", "rbac.install=true,cloud=gke"
    ])
    util.wait_for_deployment(api_client, "default", "tf-job-operator")
  except subprocess.CalledProcessError as e:
    # BUGFIX: e.output is None when output wasn't captured; guard the
    # concatenation (matches the guard used by the kubeflow-deploy variant)
    # so a failed install doesn't mask itself behind a TypeError.
    t.failure = "helm install failed;\n" + (e.output or "")
  except util.TimeoutError as e:
    t.failure = e.message
  finally:
    # Always record timing and emit the JUnit XML, even on failure.
    t.time = time.time() - start
    t.name = "helm-tfjob-install"
    t.class_name = "GKE"
    test_util.create_junit_xml_file([t], args.junit_path, gcs_client)