Exemplo n.º 1
0
  def run_simple_tfjob(self, component):
    """Run a TFJob from a ksonnet component and check that it succeeds.

    The component is set up and applied with ksonnet, the job is polled
    until it finishes, and on success the TFJob is deleted again. When the
    job does not succeed, the reason is stored in ``self.failure`` and the
    method returns without deleting the job.

    Args:
      component: Name of the ksonnet component to set up and apply.
    """
    api_client = k8s_client.ApiClient()

    # Prepare the ksonnet app for this component.
    ks_util.setup_ks_app(
      self.app_dir, self.env, self.namespace, component, self.params)

    # Submit the TF job through ksonnet.
    apply_cmd = [
      ks_util.get_ksonnet_cmd(self.app_dir), "apply", self.env, "-c",
      component
    ]
    util.run(apply_cmd, cwd=self.app_dir)
    logging.info("Created job %s in namespaces %s", self.name, self.namespace)

    # Block until the job is Running or has reached a terminal condition.
    watched_conditions = ["Running", "Succeeded", "Failed"]
    logging.info("Wait for conditions Running, Succeeded, or Failed")
    job = tf_job_client.wait_for_condition(
      api_client,
      self.namespace,
      self.name,
      watched_conditions,
      version=self.tfjob_version,
      status_callback=tf_job_client.log_status)
    logging.info("Current TFJob:\n %s", json.dumps(job, indent=2))

    # Block until the job has completed.
    logging.info("Waiting for job to finish.")
    job = tf_job_client.wait_for_job(
      api_client,
      self.namespace,
      self.name,
      self.tfjob_version,
      status_callback=tf_job_client.log_status)
    logging.info("Final TFJob:\n %s", json.dumps(job, indent=2))

    succeeded = tf_job_client.job_succeeded(job)
    if not succeeded:
      # Record the failure and bail out; the job is left in place.
      self.failure = "Job {0} in namespace {1} in status {2}".format(
        self.name, self.namespace, job.get("status", {}))
      logging.error(self.failure)
      return

    # Look for creation failures reported for the job.
    creation_failures = tf_job_client.get_creation_failures_from_tfjob(
      api_client, self.namespace, job)
    if creation_failures:
      # TODO(jlewi): Since
      # https://github.com/kubeflow/tf-operator/pull/646 the number of
      # events no longer seems to match what is expected (events may be
      # getting combined), so this is only logged as a warning instead of
      # being treated as an error.
      logging.warning(creation_failures)

    # Clean up: delete the TFJob and wait until it is gone.
    tf_job_client.delete_tf_job(
      api_client, self.namespace, self.name, version=self.tfjob_version)
    logging.info("Waiting for job %s in namespaces %s to be deleted.",
                 self.name, self.namespace)
    tf_job_client.wait_for_delete(
      api_client,
      self.namespace,
      self.name,
      self.tfjob_version,
      status_callback=tf_job_client.log_status)
Exemplo n.º 2
0
    def run_tfjob_with_shutdown_policy(self, component, shutdown_policy):
        """Run a TFJob, terminate one replica type, and check it succeeds.

        After the job reaches Running (or a terminal condition),
        ``tf_job_client.terminate_replicas`` is called for either the
        "worker" or the "chief" replica type, and the job is then expected
        to finish successfully. On success the TFJob is deleted; otherwise
        the reason is stored in ``self.failure`` and the job is left in
        place.

        Args:
            component: Name of the ksonnet component to set up and apply.
            shutdown_policy: "worker" terminates the worker replica type;
                any other value terminates the chief.
        """
        tf_operator_util.load_kube_config()
        api_client = k8s_client.ApiClient()

        # Prepare the ksonnet app for this component.
        tf_operator_util.setup_ks_app(
            self.app_dir, self.env, self.namespace, component, self.params)

        # Submit the TF job through ksonnet.
        apply_cmd = [
            ks_util.get_ksonnet_cmd(self.app_dir), "apply", self.env, "-c",
            component
        ]
        util.run(apply_cmd, cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        # Block until the job is Running or has reached a terminal condition.
        watched_conditions = ["Running", "Succeeded", "Failed"]
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        job = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name,
            watched_conditions,
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(job, indent=2))

        # Pick the replica type to terminate based on the requested policy.
        if shutdown_policy == "worker":
            replica_type = "worker"
        else:
            replica_type = "chief"
        tf_job_client.terminate_replicas(api_client, self.namespace,
                                         self.name, replica_type, 1)

        # Block until the job has completed.
        logging.info("Waiting for job to finish.")
        job = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(job, indent=2))

        succeeded = tf_job_client.job_succeeded(job)
        if not succeeded:
            # Record the failure and bail out; the job is left in place.
            self.failure = "Job {0} in namespace {1} in status {2}".format(
                self.name, self.namespace, job.get("status", {}))
            logging.error(self.failure)
            return

        # Clean up: delete the TFJob and wait until it is gone.
        tf_job_client.delete_tf_job(
            api_client, self.namespace, self.name, version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
Exemplo n.º 3
0
    def run_distributed_training_job(self, component):
        """Run a distributed training TFJob and check that it succeeds.

        The component is set up and applied with ksonnet, the job is polled
        until it finishes, creation failures are logged as a warning, and
        on success the TFJob is deleted. When the job does not succeed, the
        reason is stored in ``self.failure`` and the job is left in place.

        Args:
            component: Name of the ksonnet component to set up and apply.
        """
        api_client = k8s_client.ApiClient()

        # Prepare the ksonnet app for this component.
        ks_util.setup_ks_app(
            self.app_dir, self.env, self.namespace, component, self.params)

        # Submit the TF job through ksonnet.
        apply_cmd = [
            ks_util.get_ksonnet_cmd(self.app_dir), "apply", self.env, "-c",
            component
        ]
        util.run(apply_cmd, cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        # Block until the job is Running or has reached a terminal condition.
        watched_conditions = ["Running", "Succeeded", "Failed"]
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        job = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name,
            watched_conditions,
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(job, indent=2))

        # Block until the job has completed.
        logging.info("Waiting for job to finish.")
        job = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(job, indent=2))

        succeeded = tf_job_client.job_succeeded(job)
        if not succeeded:
            # Record the failure and bail out; the job is left in place.
            self.failure = "Job {0} in namespace {1} in status {2}".format(
                self.name, self.namespace, job.get("status", {}))
            logging.error(self.failure)
            return

        # Creation failures are only reported as a warning, not as an error.
        creation_failures = tf_job_client.get_creation_failures_from_tfjob(
            api_client, self.namespace, job)
        if creation_failures:
            logging.warning(creation_failures)

        # Clean up: delete the TFJob and wait until it is gone.
        tf_job_client.delete_tf_job(
            api_client, self.namespace, self.name, version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
Exemplo n.º 4
0
def test_tf_job_simple(test_case):  # pylint: disable=redefined-outer-name
    """Create the "tf-job-simple" TFJob and wait for it to reach Running.

    Any exception raised while waiting is logged and recorded on
    ``test_case`` via ``add_failure_info`` instead of being propagated.

    Args:
        test_case: Object exposing ``add_failure_info(message)`` on which a
            failure is recorded.
    """
    args = parse_args()
    namespace = "default"
    name = "tf-job-simple"

    util.load_kube_config()
    api_client = k8s_client.ApiClient()
    create_app_and_job(args, namespace, name)
    try:
        tf_job_client.wait_for_condition(
            api_client,
            namespace,
            name, ["Running"],
            status_callback=tf_job_client.log_status)
        logging.info("TFJob launched successfully")
    except Exception as e:  # pylint: disable=broad-except
        logging.error("Test failed waiting for job; %s", e)
        # Exceptions have no `.message` attribute on Python 3 (and it is
        # deprecated on Python 2.6+); str(e) works on both.
        test_case.add_failure_info(str(e))
Exemplo n.º 5
0
    def test_invalid_tfjob_spec(self):
        """Check that a TFJob with an invalid spec ends up Failed.

        The invalid-spec component is applied and the job is polled until
        it reports the Failed condition. The last condition must be of type
        "failed" (case-insensitive) and its message must match
        ``.*the spec is invalid.*``. If the condition type is wrong, the
        failure is recorded in ``self.failure`` and the method returns
        without deleting the job. A message mismatch is also recorded, but
        the job is still deleted afterwards.
        """
        api_client = k8s_client.ApiClient()
        component = INVALID_TFJOB_COMPONENT_NAME + "_" + self.tfjob_version

        # Prepare the ksonnet app for this component.
        ks_util.setup_ks_app(
            self.app_dir, self.env, self.namespace, component, self.params)

        # Submit the TF job through ksonnet.
        apply_cmd = [
            ks_util.get_ksonnet_cmd(self.app_dir), "apply", self.env, "-c",
            component
        ]
        util.run(apply_cmd, cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        logging.info("Wait for conditions Failed")
        job = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name,
            ["Failed"],
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)

        logging.info("Final TFJob:\n %s", json.dumps(job, indent=2))

        # Inspect the most recent condition reported in the job status.
        status = job.get("status", {})
        last_condition = status.get("conditions", [{}])[-1]
        condition_type = last_condition.get("type", "").lower()
        if condition_type != "failed":
            self.failure = "Job {0} in namespace {1} did not fail; status {2}".format(
                self.name, self.namespace, status)
            logging.error(self.failure)
            return

        pattern = ".*the spec is invalid.*"
        condition_message = last_condition.get("message", "")
        if re.match(pattern, condition_message) is None:
            # A message mismatch is recorded but does not skip the cleanup.
            self.failure = "Condition message {0} did not match pattern {1}".format(
                condition_message, pattern)
            logging.error(self.failure)

        # Clean up: delete the TFJob and wait until it is gone.
        tf_job_client.delete_tf_job(
            api_client, self.namespace, self.name, version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
Exemplo n.º 6
0
    def test_tfjob_and_verify_runconfig(self):
        """Run a TFJob and verify the runconfig of each replica type.

        Once the job is Running (or in a terminal condition),
        ``verify_runconfig`` is called for the chief, worker, ps and
        evaluator replica types using the PS and Worker replica counts read
        from the job spec. ``terminate_replicas`` is then called for the
        chief and the job is expected to finish successfully. A failure is
        recorded in ``self.failure``; the TFJob is deleted in either case.
        """
        api_client = k8s_client.ApiClient()
        master_host = api_client.configuration.host
        component = COMPONENT_NAME + "_" + self.tfjob_version

        # Prepare the ksonnet app for this component.
        ks_util.setup_ks_app(
            self.app_dir, self.env, self.namespace, component, self.params)

        # Submit the TF job through ksonnet.
        apply_cmd = [
            ks_util.get_ksonnet_cmd(self.app_dir), "apply", self.env, "-c",
            component
        ]
        util.run(apply_cmd, cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        # Block until the job is Running or has reached a terminal condition.
        watched_conditions = ["Running", "Succeeded", "Failed"]
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        job = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name,
            watched_conditions,
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(job, indent=2))

        # Replica counts from the job spec; a missing entry counts as 0.
        replica_specs = job.get("spec", {}).get("tfReplicaSpecs", {})
        num_ps = replica_specs.get("PS", {}).get("replicas", 0)
        num_workers = replica_specs.get("Worker", {}).get("replicas", 0)

        for replica_type in ("chief", "worker", "ps", "evaluator"):
            verify_runconfig(master_host, self.namespace, self.name,
                             replica_type, num_ps, num_workers)

        tf_job_client.terminate_replicas(api_client, self.namespace, self.name,
                                         "chief", 1)

        # Block until the job has completed.
        logging.info("Waiting for job to finish.")
        job = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(job, indent=2))

        succeeded = tf_job_client.job_succeeded(job)
        if not succeeded:
            # Record the failure but still fall through to the cleanup.
            self.failure = "Job {0} in namespace {1} in status {2}".format(
                self.name, self.namespace, job.get("status", {}))
            logging.error(self.failure)

        # Clean up: delete the TFJob and wait until it is gone.
        tf_job_client.delete_tf_job(
            api_client, self.namespace, self.name, version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
Exemplo n.º 7
0
    def run_tfjob_with_cleanpod_policy(self, component, clean_pod_policy):
        """Run a TFJob and check the pod state for the clean-pod policy.

        After the job succeeds, the checks depend on ``clean_pod_policy``:

        * "All": wait until all pods selected by the job's labels are
          deleted.
        * "Running": wait for Chief and Worker to be in phase Succeeded and
          for the PS pods to be deleted.
        * "None": wait for Chief and Worker to be in phase Succeeded and for
          PS to be in phase Running.

        Any other value performs no pod check. The TFJob is deleted at the
        end. When the job does not succeed, the reason is stored in
        ``self.failure`` and the method returns without the pod checks and
        without deleting the job.

        Args:
            component: Name of the ksonnet component to set up and apply.
            clean_pod_policy: One of "All", "Running" or "None".
        """
        api_client = k8s_client.ApiClient()

        def wait_for_phases(replica_type, phases):
            # Wait until the given replica type is in one of `phases`.
            tf_job_client.wait_for_replica_type_in_phases(
                api_client, self.namespace, self.name, replica_type, phases)

        def wait_for_pod_deletion(pod_labels):
            # Wait until the pods matching `pod_labels` have been deleted.
            pod_selector = tf_job_client.to_selector(pod_labels)
            k8s_util.wait_for_pods_to_be_deleted(api_client, self.namespace,
                                                 pod_selector)

        # Prepare the ksonnet app for this component.
        ks_util.setup_ks_app(
            self.app_dir, self.env, self.namespace, component, self.params)

        # Submit the TF job through ksonnet.
        apply_cmd = ["ks", "apply", self.env, "-c", component]
        util.run(apply_cmd, cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        # Block until the job is Running or has reached a terminal condition.
        watched_conditions = ["Running", "Succeeded", "Failed"]
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        job = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name,
            watched_conditions,
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(job, indent=2))

        # Block until the job has completed.
        logging.info("Waiting for job to finish.")
        job = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(job, indent=2))

        succeeded = tf_job_client.job_succeeded(job)
        if not succeeded:
            # Record the failure and bail out; the job is left in place.
            self.failure = "Job {0} in namespace {1} in status {2}".format(
                self.name, self.namespace, job.get("status", {}))
            logging.error(self.failure)
            return

        if clean_pod_policy == "All":
            # Every pod of the job is expected to be deleted.
            wait_for_pod_deletion(tf_job_client.get_labels(self.name))
        elif clean_pod_policy in ("Running", "None"):
            # In both policies the completed Chief and Worker pods remain.
            for replica_type in ("Chief", "Worker"):
                wait_for_phases(replica_type, ["Succeeded"])
            if clean_pod_policy == "Running":
                # Only the still-running pods (PS) are deleted.
                wait_for_pod_deletion(
                    tf_job_client.get_labels(self.name, "PS"))
            else:
                # No pods are deleted, so PS keeps running.
                wait_for_phases("PS", ["Running"])

        # Clean up: delete the TFJob and wait until it is gone.
        tf_job_client.delete_tf_job(
            api_client, self.namespace, self.name, version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
Exemplo n.º 8
0
    def run_tfjob_with_replica_restart_policy(self, component,
                                              replica_restart_policy,
                                              exit_code):
        """Run a TFJob and check how a terminated replica is handled.

        Once the job is Running (or in a terminal condition),
        ``tf_job_client.terminate_and_verify_start_time`` is called for
        replica "ps" index 0 with ``exit_code``. Its last argument is a
        boolean derived from the (policy, exit code) combination. If that
        call returns ``False`` the failure is stored in ``self.failure`` and
        the method returns without deleting the job; otherwise the TFJob is
        deleted.

        Args:
            component: Name of the ksonnet component to set up and apply.
            replica_restart_policy: Restart policy under test; the values
                handled explicitly are "Always", "OnFailure", "Never" and
                "ExitCode".
            exit_code: Exit code forwarded to
                ``terminate_and_verify_start_time``.
        """
        tf_operator_util.load_kube_config()
        api_client = k8s_client.ApiClient()

        # Setup the ksonnet app
        tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace,
                                      component, self.params)

        # Create the TF job
        ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
        util.run([ks_cmd, "apply", self.env, "-c", component],
                 cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        # Wait for the job to either be in Running state or a terminal state
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        results = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name, ["Running", "Succeeded", "Failed"],
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

        # (policy, exit code) combinations for which the final argument of
        # terminate_and_verify_start_time is False; every other combination
        # (including unknown policies and exit codes) passes True.
        # NOTE(review): the flag presumably states whether the replica is
        # expected to be restarted -- confirm against tf_job_client.
        false_flag_cases = (
            ("OnFailure", 0),
            ("Never", 1),
            ("Never", 0),
            ("ExitCode", 1),
        )
        verify_flag = (replica_restart_policy,
                       exit_code) not in false_flag_cases

        res = tf_job_client.terminate_and_verify_start_time(
            api_client, self.namespace, self.name, "ps", 0, exit_code,
            verify_flag)

        if res is False:
            # Adjacent literals are used instead of a backslash continuation
            # inside the string, which embedded the source indentation in
            # the message.
            self.failure = (
                "Job {0} in namespace {1} with restart policy {2} failed test "
                "with exit_code {3}").format(self.name, self.namespace,
                                             replica_restart_policy,
                                             exit_code)
            logging.error(self.failure)
            return

        # Delete the TFJob.
        tf_job_client.delete_tf_job(api_client,
                                    self.namespace,
                                    self.name,
                                    version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(api_client,
                                      self.namespace,
                                      self.name,
                                      self.tfjob_version,
                                      status_callback=tf_job_client.log_status)
Exemplo n.º 9
0
    def test_pod_names(self):
        """Check that the pods created for a TFJob have the expected names.

        Expected names are built with ``POD_NAME_FORMAT`` for every replica
        type and index reported by ``extract_job_specs``. They must all be
        present among the names returned by ``tf_job_client.get_pod_names``.
        ``terminate_replicas`` is then called for the chief, the job is
        expected to finish successfully, and the TFJob is deleted.

        Raises:
            RuntimeError: If an expected pod name is missing from the actual
                pod names.
        """
        api_client = k8s_client.ApiClient()
        component = COMPONENT_NAME + "_" + self.tfjob_version

        ks_util.setup_ks_app(
            self.app_dir, self.env, self.namespace, component, self.params)
        apply_cmd = ["ks", "apply", self.env, "-c", component]
        util.run(apply_cmd, cwd=self.app_dir)
        logging.info("Created job %s in namespaces %s", self.name,
                     self.namespace)

        watched_conditions = ["Running", "Succeeded", "Failed"]
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        job = tf_job_client.wait_for_condition(
            api_client,
            self.namespace,
            self.name,
            watched_conditions,
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(job, indent=2))

        replica_specs = job.get("spec", {}).get("tfReplicaSpecs", {})
        job_specs = extract_job_specs(replica_specs)

        # One expected pod name per (replica type, index) pair.
        expected_pod_names = set()
        for replica_type, replica_num in job_specs.items():
            logging.info("job_type = %s, replica = %s", replica_type,
                         replica_num)
            expected_pod_names.update(
                POD_NAME_FORMAT.format(
                    name=self.name, replica=replica_type, index=i)
                for i in range(replica_num))

        actual_pod_names = tf_job_client.get_pod_names(
            api_client, self.namespace, self.name)

        # Pods selected by the default namespace and job name are not
        # guaranteed to belong to this test run only, so this is a partial
        # check: every expected name must appear among the selected pods.
        if not (expected_pod_names <= actual_pod_names):
            msg = "Actual pod names doesn't match. Expected: {0} Actual: {1}".format(
                str(expected_pod_names), str(actual_pod_names))
            logging.error(msg)
            raise RuntimeError(msg)

        tf_job_client.terminate_replicas(api_client, self.namespace, self.name,
                                         "chief", 1)

        # Block until the job has completed.
        logging.info("Waiting for job to finish.")
        job = tf_job_client.wait_for_job(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(job, indent=2))

        succeeded = tf_job_client.job_succeeded(job)
        if not succeeded:
            # Record the failure but still fall through to the cleanup.
            self.failure = "Job {0} in namespace {1} in status {2}".format(
                self.name, self.namespace, job.get("status", {}))
            logging.error(self.failure)

        # Clean up: delete the TFJob and wait until it is gone.
        tf_job_client.delete_tf_job(
            api_client, self.namespace, self.name, version=self.tfjob_version)
        logging.info("Waiting for job %s in namespaces %s to be deleted.",
                     self.name, self.namespace)
        tf_job_client.wait_for_delete(
            api_client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)