Exemplo n.º 1
0
    def get(self, name=None, namespace=None):
        """Get one TFJob by name, or list the TFJobs of a namespace.

        :param name: existing tfjob name; when empty or None the TFJobs of
               the namespace are listed instead.
        :param namespace: defaults to current or default namespace
        :return: whatever the custom-objects API returns for the get call
                 (``name`` given) or for the list call (no ``name``).
        :raises RuntimeError: when the API call raises ``ApiException``; the
                original exception is chained as ``__cause__``.
        """
        if namespace is None:
            namespace = utils.get_default_target_namespace()

        if name:
            try:
                return self.api_instance.get_namespaced_custom_object(
                    constants.TFJOB_GROUP, constants.TFJOB_VERSION, namespace,
                    constants.TFJOB_PLURAL, name)
            except client.rest.ApiException as e:
                # Adjacent literals instead of a backslash continuation, so the
                # source indentation no longer leaks into the message.
                raise RuntimeError(
                    "Exception when calling "
                    "CustomObjectsApi->get_namespaced_custom_object: %s\n" %
                    e) from e

        try:
            return self.api_instance.list_namespaced_custom_object(
                constants.TFJOB_GROUP, constants.TFJOB_VERSION, namespace,
                constants.TFJOB_PLURAL)
        except client.rest.ApiException as e:
            raise RuntimeError(
                "Exception when calling "
                "CustomObjectsApi->list_namespaced_custom_object: %s\n" %
                e) from e
Exemplo n.º 2
0
    def wait_for_job(  #pylint: disable=inconsistent-return-statements
            self,
            name,
            namespace=None,
            timeout_seconds=600,
            polling_interval=30,
            watch=False,
            status_callback=None):
        """Wait until the given TFJob finishes, by watching or by polling.

        :param name: Name of the TfJob.
        :param namespace: defaults to current or default namespace.
        :param timeout_seconds: How long to wait for the job.
        :param polling_interval: How often to poll for the status of the job
               (only used when polling).
        :param watch: When true, hand over to ``tfjob_watch`` instead of
               polling; nothing is returned in that mode.
        :param status_callback: (Optional): Callable forwarded to
               ``wait_for_condition`` (only used when polling).
        :return: the result of ``wait_for_condition`` when polling, otherwise
                 None.
        """
        target_namespace = namespace
        if target_namespace is None:
            target_namespace = utils.get_default_target_namespace()

        if not watch:
            terminal_conditions = ["Succeeded", "Failed"]
            return self.wait_for_condition(
                name,
                terminal_conditions,
                namespace=target_namespace,
                timeout_seconds=timeout_seconds,
                polling_interval=polling_interval,
                status_callback=status_callback)

        # Watch mode: the watcher does the reporting, nothing to return.
        tfjob_watch(name=name,
                    namespace=target_namespace,
                    timeout_seconds=timeout_seconds)
        return None
Exemplo n.º 3
0
    def wait_for_job(self,
                     name,
                     namespace=None,
                     timeout_seconds=600,
                     polling_interval=30,
                     status_callback=None):
        """Poll the given TFJob until it reports 'Succeeded' or 'Failed'.

        Args:
          name: Name of the TfJob.
          namespace: defaults to current or default namespace.
          timeout_seconds: How long to wait for the job.
          polling_interval: How often to poll for the status of the job.
          status_callback: (Optional): Callable forwarded unchanged to
            wait_for_condition.

        Returns:
          Whatever wait_for_condition returns.
        """
        target_namespace = namespace
        if target_namespace is None:
            target_namespace = utils.get_default_target_namespace()

        terminal_conditions = ["Succeeded", "Failed"]
        wait_options = dict(namespace=target_namespace,
                            timeout_seconds=timeout_seconds,
                            polling_interval=polling_interval,
                            status_callback=status_callback)
        return self.wait_for_condition(name, terminal_conditions,
                                       **wait_options)
Exemplo n.º 4
0
def watch(name=None, namespace=None, timeout_seconds=600):
  """Watch TFJobs in a namespace and report every status change.

  Streams list events for the TFJob custom resource and, for each event,
  passes the job name, the type of its latest condition and that condition's
  ``lastTransitionTime`` to ``tbl``.

  :param name: optional TFJob name. When given, events for other jobs are
         skipped and watching stops once this job reports 'Succeeded' or
         'Failed'.
  :param namespace: defaults to current or default namespace.
  :param timeout_seconds: forwarded to the watch stream as its timeout.
  """

  if namespace is None:
    namespace = utils.get_default_target_namespace()

  stream = k8s_watch.Watch().stream(
    client.CustomObjectsApi().list_namespaced_custom_object,
    constants.TFJOB_GROUP,
    constants.TFJOB_VERSION,
    namespace,
    constants.TFJOB_PLURAL,
    timeout_seconds=timeout_seconds)

  for event in stream:
    tfjob = event['object']
    tfjob_name = tfjob['metadata']['name']
    if name and name != tfjob_name:
      continue

    # A job may have no status or conditions yet (missing, null or empty);
    # report blanks for it instead of failing on the [-1] lookup.
    conditions = (tfjob.get('status') or {}).get('conditions') or []
    last_condition = conditions[-1] if conditions else {}
    status = last_condition.get('type', '')
    update_time = last_condition.get('lastTransitionTime', '')

    tbl(tfjob_name, status, update_time)

    # Only a named watch ends early; an unnamed one runs until the stream
    # itself ends.
    if name == tfjob_name and status in ('Succeeded', 'Failed'):
      break
Exemplo n.º 5
0
    def get(self, name=None, namespace=None):
        """Get one TFJob by name, or list the TFJobs of a namespace.

        The API request is issued with ``async_req=True`` and its result is
        awaited for at most ``constants.APISERVER_TIMEOUT``.

        :param name: existing tfjob name; when empty or None the TFJobs of
               the namespace are listed instead.
        :param namespace: defaults to current or default namespace
        :return: the result of the get call (``name`` given) or of the list
                 call (no ``name``).
        :raises RuntimeError: on timeout, on ``ApiException`` or on any other
                error while waiting for the result; the original exception is
                chained as ``__cause__``.
        """
        if namespace is None:
            namespace = utils.get_default_target_namespace()

        if name:
            thread = self.api_instance.get_namespaced_custom_object(
                constants.TFJOB_GROUP,
                constants.TFJOB_VERSION,
                namespace,
                constants.TFJOB_PLURAL,
                name,
                async_req=True)
            api_call = "get_namespaced_custom_object"
            problem = ("There was a problem to get TFJob {0} in namespace "
                       "{1}.".format(name, namespace))
        else:
            thread = self.api_instance.list_namespaced_custom_object(
                constants.TFJOB_GROUP,
                constants.TFJOB_VERSION,
                namespace,
                constants.TFJOB_PLURAL,
                async_req=True)
            api_call = "list_namespaced_custom_object"
            problem = ("There was a problem to List TFJob in namespace "
                       "{0}.".format(namespace))

        # Both branches wait for the result the same way; only the wording of
        # the error differs. Messages are built without backslash
        # continuations so source indentation does not leak into them.
        try:
            return thread.get(constants.APISERVER_TIMEOUT)
        except multiprocessing.TimeoutError as e:
            raise RuntimeError("Timeout trying to get TFJob.") from e
        except client.rest.ApiException as e:
            raise RuntimeError(
                "Exception when calling CustomObjectsApi->{0}: {1}\n".format(
                    api_call, e)) from e
        except Exception as e:
            raise RuntimeError("{0} Exception: {1}".format(problem,
                                                           e)) from e
Exemplo n.º 6
0
    def get_job_status(self, name, namespace=None):
        """Return the type of the TFJob's latest condition.

        :param name: The TFJob name.
        :param namespace: defaults to current or default namespace.
        :return: the ``type`` of the last entry in ``status.conditions``
                 (such as Running, Failed or Succeeded), or an empty string
                 when the job has no conditions yet.
        """
        if namespace is None:
            namespace = utils.get_default_target_namespace()

        tfjob = self.get(name, namespace=namespace)

        # status / conditions can be missing, null or an empty list on a job
        # that has not been reconciled yet; treat all three as "no status".
        conditions = (tfjob.get("status") or {}).get("conditions") or []
        if not conditions:
            return ""
        return conditions[-1].get("type", "")
Exemplo n.º 7
0
    def wait_for_condition(self,
                           name,
                           expected_condition,
                           namespace=None,
                           timeout_seconds=600,
                           polling_interval=30,
                           status_callback=None):
        """Poll a TFJob until it reports any of the expected conditions.

        :param name: Name of the job.
        :param expected_condition: A list of conditions. Function waits until
               any of the supplied conditions is reached.
        :param namespace: defaults to current or default namespace.
        :param timeout_seconds: How long to wait for the job.
        :param polling_interval: How often to poll for the status of the job.
        :param status_callback: (Optional): Callable. If supplied this
               callable is invoked after we poll the job. Callable takes a
               single argument which is the job.
        :return: the TFJob as returned by ``get`` once one of its conditions
                 has a type listed in ``expected_condition``.
        :raises RuntimeError: when no expected condition was seen within the
                allowed number of polls; the last polled job is passed as
                the second exception argument.
        """

        if namespace is None:
            namespace = utils.get_default_target_namespace()

        # Poll at least once, even when the timeout is shorter than the
        # polling interval (round() would otherwise yield zero attempts).
        attempts = max(1, round(timeout_seconds / polling_interval))

        # Bound before the loop so the timeout error below can always
        # reference it.
        tfjob = None

        for attempt in range(attempts):
            tfjob = self.get(name, namespace=namespace)

            if tfjob:
                if status_callback:
                    status_callback(tfjob)

                # If we poll the CRD quick enough status won't have been set
                # yet, and conditions might have a value of None in status.
                conditions = (tfjob.get("status") or {}).get("conditions")
                conditions = conditions or []
                if any(
                        c.get("type", "") in expected_condition
                        for c in conditions):
                    return tfjob

            # Sleeping after the final poll would only delay the timeout
            # error.
            if attempt < attempts - 1:
                time.sleep(polling_interval)

        raise RuntimeError(
            "Timeout waiting for TFJob {0} in namespace {1} to enter one of the "
            "conditions {2}.".format(name, namespace, expected_condition),
            tfjob)
Exemplo n.º 8
0
    def get_pod_names(
            self,
            name,
            namespace=None,
            master=False,  #pylint: disable=inconsistent-return-statements
            replica_type=None,
            replica_index=None):
        """Get the names of the pods that belong to a TFJob.

        Pods are selected by the labels ``utils.get_labels`` builds from the
        arguments below.

        :param name: tfjob name
        :param namespace: defaults to current or default namespace.
        :param master: Only get pod with label 'job-role: master' pod if True.
        :param replica_type: User can specify one of 'worker, ps, chief' to
               only get one type pods. By default get all type pods.
        :param replica_index: User can specify replica index to get one pod
               of TFJob.
        :return: set of pod names, or None (after logging a warning) when no
                 pod matched.
        :raises RuntimeError: when listing the pods raises ``ApiException``;
                the original exception is chained as ``__cause__``.
        """

        if namespace is None:
            namespace = utils.get_default_target_namespace()

        labels = utils.get_labels(name,
                                  master=master,
                                  replica_type=replica_type,
                                  replica_index=replica_index)

        try:
            resp = self.core_api.list_namespaced_pod(
                namespace, label_selector=utils.to_selector(labels))
        except client.rest.ApiException as e:
            # Name the call that actually failed (the message used to point
            # at read_namespaced_pod_log).
            raise RuntimeError(
                "Exception when calling CoreV1Api->list_namespaced_pod: %s\n"
                % e) from e

        pod_names = {
            pod.metadata.name
            for pod in resp.items
            if pod.metadata and pod.metadata.name
        }

        if not pod_names:
            logging.warning(
                "Not found Pods of the TFJob %s with the labels %s.", name,
                labels)
            # Kept as None (not an empty set) to preserve the existing
            # contract for callers.
            return None

        return pod_names
Exemplo n.º 9
0
    def delete(self, name, namespace=None):
        """Delete a TFJob.

        :param name: tfjob name
        :param namespace: defaults to current or default namespace
        :return: whatever ``delete_namespaced_custom_object`` returns.
        :raises RuntimeError: when the API call raises ``ApiException``; the
                original exception is chained as ``__cause__``.
        """
        if namespace is None:
            namespace = utils.get_default_target_namespace()

        try:
            return self.api_instance.delete_namespaced_custom_object(
                constants.TFJOB_GROUP, constants.TFJOB_VERSION, namespace,
                constants.TFJOB_PLURAL, name, client.V1DeleteOptions())
        except client.rest.ApiException as e:
            # Adjacent literals instead of a backslash continuation, so the
            # source indentation no longer leaks into the message.
            raise RuntimeError(
                "Exception when calling "
                "CustomObjectsApi->delete_namespaced_custom_object: %s\n" %
                e) from e
Exemplo n.º 10
0
    def get_logs(self,
                 name,
                 namespace=None,
                 master=True,
                 replica_type=None,
                 replica_index=None,
                 follow=False):
        """Read the logs of a TFJob's pods and emit them through ``logging``.

        By default only the pod labelled 'job-role: master' is read.

        :param name: tfjob name
        :param namespace: defaults to current or default namespace.
        :param master: Restrict to the 'job-role: master' pod when True (the
               default); set False to read more pods.
        :param replica_type: One of 'worker, ps, chief' to read only that
               type of pods. By default all types are read.
        :param replica_index: Replica index, to read a single pod of the
               TFJob.
        :param follow: Passed to ``read_namespaced_pod_log`` as ``follow``.
               Defaults to false.
        :return: None; each pod's log is written with ``logging.info``.
        :raises RuntimeError: when no pod is found, or when reading a pod's
                log raises ``ApiException``.
        """

        if namespace is None:
            namespace = utils.get_default_target_namespace()

        pods = self.get_pod_names(name,
                                  namespace=namespace,
                                  master=master,
                                  replica_type=replica_type,
                                  replica_index=replica_index)

        # Nothing to read: fail early instead of nesting the happy path.
        if not pods:
            raise RuntimeError("Not found Pods of the TFJob {} "
                               "in namespace {}".format(name, namespace))

        for pod_name in pods:
            try:
                log_text = self.core_api.read_namespaced_pod_log(
                    pod_name, namespace, follow=follow)
                logging.info("The logs of Pod %s:\n %s", pod_name, log_text)
            except client.rest.ApiException as e:
                raise RuntimeError(
                    "Exception when calling CoreV1Api->read_namespaced_pod_log: %s\n"
                    % e)
Exemplo n.º 11
0
    def get_logs(self,
                 name,
                 namespace=None,
                 master=True,
                 replica_type=None,
                 replica_index=None,
                 follow=False):
        """Read the logs of a TFJob's pods and emit them through ``logging``.

        By default only the pod labelled 'job-role: master' is read.

        :param name: tfjob name
        :param namespace: defaults to current or default namespace.
        :param master: Restrict to the 'job-role: master' pod when True (the
               default); set False to read more pods.
        :param replica_type: One of 'worker, ps, chief' to read only that
               type of pods. By default all types are read.
        :param replica_index: Replica index, to read a single pod of the
               TFJob.
        :param follow: When True, stream the pods' logs and log them line by
               line until every stream has ended; otherwise read each pod's
               log once. Defaults to false.
        :return: None; log content is written with ``logging.info``.
        :raises RuntimeError: when no pod is found, or (non-follow mode) when
                reading a pod's log raises ``ApiException``.
        """

        if namespace is None:
            namespace = utils.get_default_target_namespace()

        # get_pod_names may return None when no pod matches; fall back to an
        # empty sequence so that case reaches the "Not found Pods" error
        # below instead of a TypeError from list(None).
        pod_names = list(
            self.get_pod_names(name,
                               namespace=namespace,
                               master=master,
                               replica_type=replica_type,
                               replica_index=replica_index) or ())

        if not pod_names:
            raise RuntimeError("Not found Pods of the TFJob {} "
                               "in namespace {}".format(name, namespace))

        if not follow:
            for pod in pod_names:
                try:
                    pod_logs = self.core_api.read_namespaced_pod_log(
                        pod, namespace)
                    logging.info("The logs of Pod %s:\n %s", pod, pod_logs)
                except client.rest.ApiException as e:
                    raise RuntimeError(
                        "Exception when calling CoreV1Api->read_namespaced_pod_log: %s\n"
                        % e) from e
            return

        log_streams = [
            k8s_watch.Watch().stream(self.core_api.read_namespaced_pod_log,
                                     name=pod,
                                     namespace=namespace)
            for pod in pod_names
        ]
        finished = [False] * len(log_streams)

        # create thread and queue per stream, for non-blocking iteration
        # NOTE(review): presumably get_log_queue_pool returns one queue per
        # stream, in the same order, and puts None when a stream ends --
        # confirm against the helper.
        log_queue_pool = get_log_queue_pool(log_streams)

        # iterate over every watching pods' log queue
        while True:
            for index, log_queue in enumerate(log_queue_pool):
                if all(finished):
                    return
                if finished[index]:
                    continue
                # group up to 50 log lines of the same pod together
                for _ in range(50):
                    try:
                        logline = log_queue.get(timeout=1)
                        if logline is None:
                            # None marks the end of this pod's stream.
                            finished[index] = True
                            break
                        logging.info("[Pod %s]: %s", pod_names[index],
                                     logline)
                    except queue.Empty:
                        # Nothing new from this pod right now; move on.
                        break
Exemplo n.º 12
0
    def get(self, name=None, namespace=None, watch=False, timeout_seconds=600):  #pylint: disable=inconsistent-return-statements
        """Get one TFJob by name, list the TFJobs of a namespace, or watch them.

        Without ``watch`` the API request is issued with ``async_req=True``
        and its result is awaited for at most ``constants.APISERVER_TIMEOUT``.

        :param name: existing tfjob name, if not defined, then get all tfjobs
               in the namespace.
        :param namespace: defaults to current or default namespace
        :param watch: Watch the TFJob if `True`; ``tfjob_watch`` is called
               and nothing is returned.
        :param timeout_seconds: How long to watch the job.
        :return: the result of the get call (``name`` given) or of the list
                 call (no ``name``); None in watch mode.
        :raises RuntimeError: on timeout, on ``ApiException`` or on any other
                error while waiting for the result; the original exception is
                chained as ``__cause__``.
        """
        if namespace is None:
            namespace = utils.get_default_target_namespace()

        if watch:
            if name:
                tfjob_watch(name=name,
                            namespace=namespace,
                            timeout_seconds=timeout_seconds)
            else:
                tfjob_watch(namespace=namespace,
                            timeout_seconds=timeout_seconds)
            return None

        if name:
            thread = self.custom_api.get_namespaced_custom_object(
                constants.TFJOB_GROUP,
                constants.TFJOB_VERSION,
                namespace,
                constants.TFJOB_PLURAL,
                name,
                async_req=True)
            api_call = "get_namespaced_custom_object"
            problem = ("There was a problem to get TFJob {0} in namespace "
                       "{1}.".format(name, namespace))
        else:
            thread = self.custom_api.list_namespaced_custom_object(
                constants.TFJOB_GROUP,
                constants.TFJOB_VERSION,
                namespace,
                constants.TFJOB_PLURAL,
                async_req=True)
            api_call = "list_namespaced_custom_object"
            problem = ("There was a problem to list TFJobs in namespace "
                       "{0}.".format(namespace))

        # Both branches wait for the result the same way; only the wording of
        # the error differs. Messages are built without backslash
        # continuations so source indentation does not leak into them.
        try:
            return thread.get(constants.APISERVER_TIMEOUT)
        except multiprocessing.TimeoutError as e:
            raise RuntimeError("Timeout trying to get TFJob.") from e
        except client.rest.ApiException as e:
            raise RuntimeError(
                "Exception when calling CustomObjectsApi->{0}: {1}\n".format(
                    api_call, e)) from e
        except Exception as e:
            raise RuntimeError("{0} Exception: {1}".format(problem,
                                                           e)) from e