예제 #1
0
    def run_tfjob_with_shutdown_policy(self, component, shutdown_policy):
        """Run a TFJob, terminate one replica, and check the job's outcome.

        The ksonnet component is applied, the job is polled until it reports
        Running, Succeeded, or Failed, and then one replica is terminated.
        After the job finishes, a failure is recorded in ``self.failure`` and
        the method returns without deleting the job; otherwise the job is
        deleted and the method waits for the deletion to complete.

        Args:
            component: Name of the ksonnet component to set up and apply.
            shutdown_policy: ``"worker"`` terminates a worker replica; any
                other value terminates a chief replica.
        """
        client = k8s_client.ApiClient()

        # Prepare the ksonnet application for this component.
        ks_util.setup_ks_app(
            self.app_dir, self.env, self.namespace, component, self.params)

        # Submit the TFJob.
        apply_cmd = ["ks", "apply", self.env, "-c", component]
        util.run(apply_cmd, cwd=self.app_dir)
        logging.info(
            "Created job %s in namespaces %s", self.name, self.namespace)

        # Block until the job is running or has already reached a terminal
        # condition.
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        awaited_conditions = ["Running", "Succeeded", "Failed"]
        results = tf_job_client.wait_for_condition(
            client,
            self.namespace,
            self.name,
            awaited_conditions,
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

        # Only the "worker" policy targets a worker; everything else targets
        # the chief. The trailing 1 is presumably the number of replicas to
        # terminate -- confirm against tf_job_client.terminate_replicas.
        replica_to_kill = "worker" if shutdown_policy == "worker" else "chief"
        tf_job_client.terminate_replicas(
            client, self.namespace, self.name, replica_to_kill, 1)

        # Let the job run to completion.
        logging.info("Waiting for job to finish.")
        results = tf_job_client.wait_for_job(
            client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        if not tf_job_client.job_succeeded(results):
            # Record the failure and stop here; the job is left in place.
            job_status = results.get("status", {})
            self.failure = "Job {0} in namespace {1} in status {2}".format(
                self.name, self.namespace, job_status)
            logging.error(self.failure)
            return

        # The job succeeded: remove it and wait until it is gone.
        tf_job_client.delete_tf_job(
            client, self.namespace, self.name, version=self.tfjob_version)
        logging.info(
            "Waiting for job %s in namespaces %s to be deleted.",
            self.name,
            self.namespace)
        tf_job_client.wait_for_delete(
            client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
예제 #2
0
  def test_tfjob_and_verify_runconfig(self):
    """Run a TFJob and verify the run config seen by each replica type.

    The ksonnet component COMPONENT_NAME is applied and the job is polled
    until it reports Running, Succeeded, or Failed. The PS and Worker replica
    counts are read from the job's ``spec.tfReplicaSpecs`` (defaulting to 0
    when absent) and passed to ``verify_runconfig`` for the chief, worker and
    ps replicas. One chief replica is then terminated and the job is awaited.
    A failed job is recorded in ``self.failure``; the job is deleted in either
    case.
    """
    api_client = k8s_client.ApiClient()
    master_host = api_client.configuration.host

    # Setup the ksonnet app
    ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, COMPONENT_NAME, self.params)

    # Create the TF job
    util.run(["ks", "apply", self.env, "-c", COMPONENT_NAME], cwd=self.app_dir)
    logging.info("Created job %s in namespaces %s", self.name, self.namespace)

    # Wait for the job to either be in Running state or a terminal state.
    # The TFJob version is passed explicitly so this poll targets the same
    # API version as the wait/delete calls below.
    logging.info("Wait for conditions Running, Succeeded, or Failed")
    results = tf_job_client.wait_for_condition(
      api_client, self.namespace, self.name, ["Running", "Succeeded", "Failed"],
      version=self.tfjob_version,
      status_callback=tf_job_client.log_status)
    logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

    # Replica counts default to 0 when a replica type is not in the spec.
    replica_specs = results.get("spec", {}).get("tfReplicaSpecs", {})
    num_ps = replica_specs.get("PS", {}).get("replicas", 0)
    num_workers = replica_specs.get("Worker", {}).get("replicas", 0)
    for replica_type in ("chief", "worker", "ps"):
      verify_runconfig(master_host, self.namespace, self.name, replica_type,
                       num_ps, num_workers)

    # The trailing 1 is presumably the number of replicas to terminate --
    # confirm against tf_job_client.terminate_replicas.
    tf_job_client.terminate_replicas(api_client, self.namespace, self.name, "chief", 1)

    # Wait for the job to complete.
    logging.info("Waiting for job to finish.")
    results = tf_job_client.wait_for_job(
      api_client, self.namespace, self.name, self.tfjob_version,
      status_callback=tf_job_client.log_status)
    logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

    if not tf_job_client.job_succeeded(results):
      # Record the failure but fall through so the job is still cleaned up.
      self.failure = "Job {0} in namespace {1} in status {2}".format(
        self.name, self.namespace, results.get("status", {}))
      logging.error(self.failure)

    # Delete the TFJob.
    tf_job_client.delete_tf_job(api_client, self.namespace, self.name, version=self.tfjob_version)
    logging.info("Waiting for job %s in namespaces %s to be deleted.", self.name,
                 self.namespace)
    tf_job_client.wait_for_delete(
      api_client, self.namespace, self.name, self.tfjob_version,
      status_callback=tf_job_client.log_status)
    def test_pod_names(self):
        """Check that the pods created for a TFJob carry the expected names.

        The component ``COMPONENT_NAME + "_" + self.tfjob_version`` is applied
        and the job is polled until it reports Running, Succeeded, or Failed.
        Expected pod names are built from POD_NAME_FORMAT for every replica
        type and index returned by ``extract_job_specs`` and must all appear
        among the names returned by ``tf_job_client.get_pod_names``. One chief
        replica is then terminated, the job is awaited, a failure is recorded
        in ``self.failure``, and the job is deleted.

        Raises:
            RuntimeError: If any expected pod name is missing from the actual
                pod names.
        """
        client = k8s_client.ApiClient()
        component = COMPONENT_NAME + "_" + self.tfjob_version

        ks_util.setup_ks_app(
            self.app_dir, self.env, self.namespace, component, self.params)
        apply_cmd = ["ks", "apply", self.env, "-c", component]
        util.run(apply_cmd, cwd=self.app_dir)
        logging.info(
            "Created job %s in namespaces %s", self.name, self.namespace)
        logging.info("Wait for conditions Running, Succeeded, or Failed")
        awaited_conditions = ["Running", "Succeeded", "Failed"]
        results = tf_job_client.wait_for_condition(
            client,
            self.namespace,
            self.name,
            awaited_conditions,
            version=self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Current TFJob:\n %s", json.dumps(results, indent=2))

        replica_specs = results.get("spec", {}).get("tfReplicaSpecs", {})
        job_specs = extract_job_specs(replica_specs)

        # One expected pod name per (replica type, replica index) pair.
        name_list = []
        for replica_type, replica_num in job_specs.items():
            logging.info(
                "job_type = %s, replica = %s", replica_type, replica_num)
            name_list.extend(
                POD_NAME_FORMAT.format(
                    name=self.name, replica=replica_type, index=idx)
                for idx in range(replica_num))
        expected_pod_names = set(name_list)
        actual_pod_names = tf_job_client.get_pod_names(
            client, self.namespace, self.name)

        # Pods selected by namespace and job name may include pods that do not
        # belong to this test run, so only a partial check is possible: every
        # expected name must be present among the selected pod names.
        if not (expected_pod_names <= actual_pod_names):
            msg = "Actual pod names doesn't match. Expected: {0} Actual: {1}".format(
                str(expected_pod_names), str(actual_pod_names))
            logging.error(msg)
            raise RuntimeError(msg)

        # The trailing 1 is presumably the number of replicas to terminate --
        # confirm against tf_job_client.terminate_replicas.
        tf_job_client.terminate_replicas(
            client, self.namespace, self.name, "chief", 1)

        # Let the job run to completion.
        logging.info("Waiting for job to finish.")
        results = tf_job_client.wait_for_job(
            client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)
        logging.info("Final TFJob:\n %s", json.dumps(results, indent=2))

        if not tf_job_client.job_succeeded(results):
            # Record the failure but continue so the job is still cleaned up.
            job_status = results.get("status", {})
            self.failure = "Job {0} in namespace {1} in status {2}".format(
                self.name, self.namespace, job_status)
            logging.error(self.failure)

        # Remove the TFJob and wait until it is gone.
        tf_job_client.delete_tf_job(
            client, self.namespace, self.name, version=self.tfjob_version)
        logging.info(
            "Waiting for job %s in namespaces %s to be deleted.",
            self.name,
            self.namespace)
        tf_job_client.wait_for_delete(
            client,
            self.namespace,
            self.name,
            self.tfjob_version,
            status_callback=tf_job_client.log_status)