def wait_for_job(
            self,
            name,  #pylint: disable=inconsistent-return-statements
            namespace=None,
            watch=False,
            timeout_seconds=600,
            polling_interval=30,
            status_callback=None):
        """Block until the named PyTorchJob has finished.

        When ``watch`` is set the call is delegated to ``pytorchjob_watch`` and
        nothing is returned. Otherwise the job is polled through
        ``wait_for_condition`` until it reports "Succeeded" or "Failed".

        :param name: Name of the PyTorchJob.
        :param namespace: defaults to current or default namespace.
        :param watch: Delegate to ``pytorchjob_watch`` instead of polling if `True`.
        :param timeout_seconds: How long to wait for the job.
        :param polling_interval: How often to poll for the status of the job.
        :param status_callback: (Optional): Callable. If supplied this callable is
               invoked after we poll the job. Callable takes a single argument which
               is the job.
        :return: The result of ``wait_for_condition`` when polling, None when
                 watching.
        """
        target_ns = (utils.get_default_target_namespace()
                     if namespace is None else namespace)

        if not watch:
            terminal_conditions = ["Succeeded", "Failed"]
            return self.wait_for_condition(
                name,
                terminal_conditions,
                namespace=target_ns,
                timeout_seconds=timeout_seconds,
                polling_interval=polling_interval,
                status_callback=status_callback)

        pytorchjob_watch(name=name,
                         namespace=target_ns,
                         timeout_seconds=timeout_seconds)
        return None
  def wait_for_job(self, name,
                   namespace=None,
                   timeout_seconds=600,
                   polling_interval=30,
                   status_callback=None):
    """Poll the named PyTorchJob until it has succeeded or failed.

    Args:
      name: Name of the PyTorchJob.
      namespace: defaults to current or default namespace.
      timeout_seconds: How long to wait for the job.
      polling_interval: How often to poll for the status of the job.
      status_callback: (Optional): Callable. If supplied this callable is
        invoked after we poll the job. Callable takes a single argument which
        is the job.

    Returns:
      Whatever ``wait_for_condition`` returns for the conditions
      "Succeeded" / "Failed".
    """
    resolved_namespace = namespace
    if resolved_namespace is None:
      resolved_namespace = utils.get_default_target_namespace()

    finished_states = ["Succeeded", "Failed"]
    poll_options = dict(
      namespace=resolved_namespace,
      timeout_seconds=timeout_seconds,
      polling_interval=polling_interval,
      status_callback=status_callback)
    return self.wait_for_condition(name, finished_states, **poll_options)
示例#3
0
    def get(self, name=None, namespace=None):
        """
        Fetch one pytorchjob by name, or list the pytorchjobs of a namespace
        when no name is given.

        :param name: existing pytorchjob name
        :param namespace: defaults to current or default namespace
        :return: pytorchjob (or the list response when ``name`` is empty)
        :raises RuntimeError: when the API call fails with an ApiException
        """
        target_ns = namespace
        if target_ns is None:
            target_ns = utils.get_default_target_namespace()

        # No name supplied: list every pytorchjob in the namespace.
        if not name:
            try:
                return self.api_instance.list_namespaced_custom_object(
                    constants.PYTORCHJOB_GROUP,
                    constants.PYTORCHJOB_VERSION,
                    target_ns,
                    constants.PYTORCHJOB_PLURAL)
            except client.rest.ApiException as api_error:
                raise RuntimeError(
                    "Exception when calling CustomObjectsApi->list_namespaced_custom_object:\
          %s\n" % api_error)

        try:
            return self.api_instance.get_namespaced_custom_object(
                constants.PYTORCHJOB_GROUP,
                constants.PYTORCHJOB_VERSION,
                target_ns,
                constants.PYTORCHJOB_PLURAL,
                name)
        except client.rest.ApiException as api_error:
            raise RuntimeError(
                "Exception when calling CustomObjectsApi->get_namespaced_custom_object:\
            %s\n" % api_error)
示例#4
0
def watch(name=None, namespace=None, timeout_seconds=600):
    """Watch PyTorchJobs in the specified namespace and report their status.

    Every event from the watch stream is reported through ``tbl`` as
    (job name, type of the latest condition, its lastTransitionTime). Jobs
    without any condition yet are reported with empty status and time.

    :param name: Optional PyTorchJob name. When given, events of other jobs are
           skipped and watching stops once this job is 'Succeeded' or 'Failed'.
    :param namespace: defaults to current or default namespace.
    :param timeout_seconds: Timeout handed to the watch stream.
    :return: None
    """

    if namespace is None:
        namespace = utils.get_default_target_namespace()

    stream = k8s_watch.Watch().stream(
        client.CustomObjectsApi().list_namespaced_custom_object,
        constants.PYTORCHJOB_GROUP,
        constants.PYTORCHJOB_VERSION,
        namespace,
        constants.PYTORCHJOB_PLURAL,
        timeout_seconds=timeout_seconds)

    for event in stream:
        pytorchjob = event['object']
        pytorchjob_name = pytorchjob['metadata']['name']
        if name and name != pytorchjob_name:
            continue

        # A job that was just created may carry no status, or its conditions
        # may be missing / None / empty; fall back to empty fields instead of
        # failing on conditions[-1].
        conditions = (pytorchjob.get('status') or {}).get('conditions') or []
        last_condition = conditions[-1] if conditions else {}
        status = last_condition.get('type', '')
        update_time = last_condition.get('lastTransitionTime', '')

        tbl(pytorchjob_name, status, update_time)

        # Only a named watch terminates early, once the job reached a final
        # condition.
        if name == pytorchjob_name and status in ('Succeeded', 'Failed'):
            break
  def get(self, name=None, namespace=None):
    """
    Retrieve a single pytorchjob, or list the pytorchjobs of a namespace when
    no name is given. The request is issued with ``async_req=True`` and its
    result is collected with ``constants.APISERVER_TIMEOUT`` as the timeout.

    :param name: existing pytorchjob name
    :param namespace: defaults to current or default namespace
    :return: pytorchjob
    :raises RuntimeError: on timeout, on an ApiException, or on any other
            exception raised while collecting the result
    """
    target_ns = namespace
    if target_ns is None:
      target_ns = utils.get_default_target_namespace()

    # No name supplied: list every pytorchjob in the namespace.
    if not name:
      pending = self.api_instance.list_namespaced_custom_object(
        constants.PYTORCHJOB_GROUP,
        constants.PYTORCHJOB_VERSION,
        target_ns,
        constants.PYTORCHJOB_PLURAL,
        async_req=True)

      try:
        return pending.get(constants.APISERVER_TIMEOUT)
      except multiprocessing.TimeoutError:
        raise RuntimeError("Timeout trying to get PyTorchJob.")
      except client.rest.ApiException as api_error:
        raise RuntimeError(
          "Exception when calling CustomObjectsApi->list_namespaced_custom_object: \
          %s\n" % api_error)
      except Exception as error:
        raise RuntimeError(
          "There was a problem to List PyTorchJob in namespace {0}. \
          Exception: {1} ".format(target_ns, error))

    pending = self.api_instance.get_namespaced_custom_object(
      constants.PYTORCHJOB_GROUP,
      constants.PYTORCHJOB_VERSION,
      target_ns,
      constants.PYTORCHJOB_PLURAL,
      name,
      async_req=True)

    try:
      return pending.get(constants.APISERVER_TIMEOUT)
    except multiprocessing.TimeoutError:
      raise RuntimeError("Timeout trying to get PyTorchJob.")
    except client.rest.ApiException as api_error:
      raise RuntimeError(
        "Exception when calling CustomObjectsApi->get_namespaced_custom_object:\
          %s\n" % api_error)
    except Exception as error:
      raise RuntimeError(
        "There was a problem to get PyTorchJob {0} in namespace {1}. Exception: \
          {2} ".format(name, target_ns, error))
    def get_job_status(self, name, namespace=None):
        """Returns PyTorchJob status, such as Running, Failed or Succeeded.

        The status is the ``type`` of the last entry of the job's
        ``status.conditions``. An empty string is returned when the job has no
        conditions (status missing, or conditions empty / None).

        :param name: The PyTorchJob name.
        :param namespace: defaults to current or default namespace.
        :return: str: PyTorchJob status
        """
        if namespace is None:
            namespace = utils.get_default_target_namespace()

        pytorchjob = self.get(name, namespace=namespace)

        # If the job is queried quickly enough the status may not be set yet,
        # and conditions may be None; avoid indexing into an empty list.
        conditions = (pytorchjob.get("status") or {}).get("conditions") or []
        if not conditions:
            return ""
        return conditions[-1].get("type", "")
    def wait_for_condition(self,
                           name,
                           expected_condition,
                           namespace=None,
                           timeout_seconds=600,
                           polling_interval=30,
                           status_callback=None):
        """Waits until any of the specified conditions occur.

        The job is fetched with ``self.get`` roughly every ``polling_interval``
        seconds, at least once and about ``timeout_seconds / polling_interval``
        times in total.

        :param name: Name of the job.
        :param expected_condition: A list of conditions. Function waits until any of the
               supplied conditions is reached.
        :param namespace: defaults to current or default namespace.
        :param timeout_seconds: How long to wait for the job.
        :param polling_interval: How often to poll for the status of the job.
        :param status_callback: (Optional): Callable. If supplied this callable is
               invoked after we poll the job. Callable takes a single argument which
               is the job.
        :return: Object: PyTorchJob
        :raises RuntimeError: if none of the conditions was reached in time. The
                exception args are (message, last polled job).
        """

        if namespace is None:
            namespace = utils.get_default_target_namespace()

        # Bound before the loop so the timeout error below can always refer to it.
        pytorchjob = None

        # round() yields 0 when timeout_seconds is less than half of
        # polling_interval; still poll once in that case.
        attempts = max(1, round(timeout_seconds / polling_interval))

        for _ in range(attempts):
            pytorchjob = self.get(name, namespace=namespace)

            if pytorchjob:
                if status_callback:
                    status_callback(pytorchjob)

                # If we poll the CRD quick enough status won't have been set yet,
                # and conditions might have a value of None in status.
                status = pytorchjob.get("status") or {}
                conditions = status.get("conditions") or []
                if any(c.get("type", "") in expected_condition
                       for c in conditions):
                    return pytorchjob

            time.sleep(polling_interval)

        raise RuntimeError(
            "Timeout waiting for PyTorchJob {0} in namespace {1} to enter one of the "
            "conditions {2}.".format(name, namespace, expected_condition),
            pytorchjob)
    def get_pod_names(
            self,
            name,
            namespace=None,
            master=False,  #pylint: disable=inconsistent-return-statements
            replica_type=None,
            replica_index=None):
        """
        Get pod names of PyTorchJob.

        Pods are listed in the namespace with a label selector built from
        ``utils.get_labels`` / ``utils.to_selector``.

        :param name: PyTorchJob name
        :param namespace: defaults to current or default namespace.
        :param master: Only get pod with label 'job-role: master' pod if True.
        :param replica_type: User can specify one of 'master, worker' to only get one type pods.
               By default get all type pods.
        :param replica_index: User can specify replica index to get one pod of PyTorchJob.
        :return: set: pods name, or None (after logging a warning) when no pod
                 matches the labels.
        :raises RuntimeError: when listing the pods fails with an ApiException.
        """

        if namespace is None:
            namespace = utils.get_default_target_namespace()

        labels = utils.get_labels(name,
                                  master=master,
                                  replica_type=replica_type,
                                  replica_index=replica_index)

        try:
            resp = self.core_api.list_namespaced_pod(
                namespace, label_selector=utils.to_selector(labels))
        except client.rest.ApiException as e:
            # Name the API that was actually called (list_namespaced_pod) and
            # keep the original exception as the explicit cause.
            raise RuntimeError(
                "Exception when calling CoreV1Api->list_namespaced_pod: %s\n"
                % e) from e

        pod_names = {
            pod.metadata.name
            for pod in resp.items
            if pod.metadata and pod.metadata.name
        }

        if not pod_names:
            logging.warning(
                "Not found Pods of the PyTorchJob %s with the labels %s.",
                name, labels)
            # Callers receive None, not an empty set, when nothing matched.
            return None

        return pod_names
    def get_logs(self,
                 name,
                 namespace=None,
                 master=True,
                 replica_type=None,
                 replica_index=None,
                 follow=False):
        """
        Read the training logs of the PyTorchJob's pods and emit them through
        ``logging.info``.
        By default only the Pod that has labels 'job-role: master' is read.

        :param name: PyTorchJob name
        :param namespace: defaults to current or default namespace.
        :param master: By default get pod with label 'job-role: master' pod if True.
                       If need to get more Pod Logs, set False.
        :param replica_type: User can specify one of 'master, worker' to only get one type pods.
               By default get all type pods.
        :param replica_index: User can specify replica index to get one pod of PyTorchJob.
        :param follow: Follow the log stream of the pod. Defaults to false.
        :return: None; the logs are written with ``logging.info`` rather than
                 returned.
        :raises RuntimeError: when no pod is found or reading a pod log fails
                with an ApiException.
        """

        target_ns = namespace
        if target_ns is None:
            target_ns = utils.get_default_target_namespace()

        matching_pods = self.get_pod_names(name,
                                           namespace=target_ns,
                                           master=master,
                                           replica_type=replica_type,
                                           replica_index=replica_index)

        # Nothing to read: fail before touching the log API.
        if not matching_pods:
            raise RuntimeError("Not found Pods of the PyTorchJob {} "
                               "in namespace {}".format(name, target_ns))

        for pod_name in matching_pods:
            try:
                log_text = self.core_api.read_namespaced_pod_log(
                    pod_name, target_ns, follow=follow)
                logging.info("The logs of Pod %s:\n %s", pod_name, log_text)
            except client.rest.ApiException as api_error:
                raise RuntimeError(
                    "Exception when calling CoreV1Api->read_namespaced_pod_log: %s\n"
                    % api_error)
    def delete(self, name, namespace=None):
        """
        Delete the pytorchjob

        :param name: pytorchjob name
        :param namespace: defaults to current or default namespace
        :return: the response of ``delete_namespaced_custom_object``
        :raises RuntimeError: when the API call fails with an ApiException
        """
        if namespace is None:
            namespace = utils.get_default_target_namespace()

        try:
            # The delete options are passed by keyword: kubernetes client
            # releases where ``body`` is an optional keyword argument reject a
            # sixth positional argument, while the keyword form is accepted
            # by both the older and the newer signature.
            return self.custom_api.delete_namespaced_custom_object(
                constants.PYTORCHJOB_GROUP, constants.PYTORCHJOB_VERSION,
                namespace, constants.PYTORCHJOB_PLURAL, name,
                body=client.V1DeleteOptions())
        except client.rest.ApiException as e:
            # Keep the API error as the explicit cause of the RuntimeError.
            raise RuntimeError(
                "Exception when calling CustomObjectsApi->delete_namespaced_custom_object:\
         %s\n" % e) from e
    def get(self, name=None, namespace=None, watch=False, timeout_seconds=600):  #pylint: disable=inconsistent-return-statements
        """
        Get the pytorchjob, or watch it.

        Without ``watch`` the request is issued with ``async_req=True`` and its
        result is collected with ``constants.APISERVER_TIMEOUT`` as the timeout.
        With ``watch`` the call is delegated to ``pytorchjob_watch`` and nothing
        is returned.

        :param name: existing pytorchjob name, if not defined, get all pytorchjobs in the namespace.
        :param namespace: defaults to current or default namespace
        :param watch: Watch the pytorchjob if `True`.
        :param timeout_seconds: How long to watch the pytorchjob.
        :return: pytorchjob (None when watching)
        :raises RuntimeError: on timeout, on an ApiException, or on any other
                exception raised while collecting the result
        """
        target_ns = namespace
        if target_ns is None:
            target_ns = utils.get_default_target_namespace()

        # Watching never returns a job; handle it first for both cases.
        if watch:
            if name:
                pytorchjob_watch(name=name,
                                 namespace=target_ns,
                                 timeout_seconds=timeout_seconds)
            else:
                pytorchjob_watch(namespace=target_ns,
                                 timeout_seconds=timeout_seconds)
            return None

        # No name supplied: list every pytorchjob in the namespace.
        if not name:
            pending = self.custom_api.list_namespaced_custom_object(
                constants.PYTORCHJOB_GROUP,
                constants.PYTORCHJOB_VERSION,
                target_ns,
                constants.PYTORCHJOB_PLURAL,
                async_req=True)

            try:
                return pending.get(constants.APISERVER_TIMEOUT)
            except multiprocessing.TimeoutError:
                raise RuntimeError("Timeout trying to get PyTorchJob.")
            except client.rest.ApiException as api_error:
                raise RuntimeError(
                    "Exception when calling CustomObjectsApi->list_namespaced_custom_object: \
            %s\n" % api_error)
            except Exception as error:
                raise RuntimeError(
                    "There was a problem to List PyTorchJob in namespace {0}. \
            Exception: {1} ".format(target_ns, error))

        pending = self.custom_api.get_namespaced_custom_object(
            constants.PYTORCHJOB_GROUP,
            constants.PYTORCHJOB_VERSION,
            target_ns,
            constants.PYTORCHJOB_PLURAL,
            name,
            async_req=True)

        try:
            return pending.get(constants.APISERVER_TIMEOUT)
        except multiprocessing.TimeoutError:
            raise RuntimeError("Timeout trying to get PyTorchJob.")
        except client.rest.ApiException as api_error:
            raise RuntimeError(
                "Exception when calling CustomObjectsApi->get_namespaced_custom_object:\
            %s\n" % api_error)
        except Exception as error:
            raise RuntimeError(
                "There was a problem to get PyTorchJob {0} in namespace {1}. Exception: \
            {2} ".format(name, target_ns, error))