예제 #1
0
def wait_for_pods_to_be_deleted(
    client,
    namespace,
    pod_selector,
    timeout=datetime.timedelta(minutes=5),
    polling_interval=datetime.timedelta(seconds=30)):
    """Wait until no pods matching the selector remain in the namespace.

    Args:
      client: K8s api client.
      namespace: Namespace to look for the pods in.
      pod_selector: Selector for the pods; passed through to list_pods.
      timeout: datetime.timedelta. How long to wait for the pods to go away.
      polling_interval: datetime.timedelta. How long to sleep between polls.

    Returns:
      None, as soon as a poll finds no matching pods.

    Raises:
      util.TimeoutError: If matching pods still exist and sleeping for
        another polling interval would go past the deadline.
    """
    end_time = datetime.datetime.now() + timeout
    while True:
        pods = list_pods(client, namespace, pod_selector)

        logging.info("%s pods matched %s pods", len(pods.items), pod_selector)

        if not pods.items:
            return

        # Give up now if the next sleep would overshoot the deadline.
        if datetime.datetime.now() + polling_interval > end_time:
            raise util.TimeoutError("Timeout waiting for pods to be deleted.")

        # total_seconds() honours the whole interval; the .seconds attribute
        # silently drops the days and microseconds components.
        time.sleep(polling_interval.total_seconds())
예제 #2
0
def wait_for_condition(client,
                        namespace,
                        name,
                        expected_condition,
                        timeout=datetime.timedelta(minutes=10),
                        polling_interval=datetime.timedelta(seconds=30),
                        status_callback=None):
  """Waits until any of the specified conditions occur.

  This function only works with v1alpha2 jobs.

  Args:
    client: K8s api client.
    namespace: namespace for the job.
    name: Name of the job.
    expected_condition: A list of conditions. Function waits until any of the
      supplied conditions is reached.
    timeout: datetime.timedelta. How long to wait for the job.
    polling_interval: datetime.timedelta. How often to poll for the status
      of the job.
    status_callback: (Optional): Callable. If supplied this callable is
      invoked after we poll the job. Callable takes a single argument which
      is the job.

  Returns:
    The job (as returned by the custom objects API) once one of its
    status conditions has a type listed in expected_condition.

  Raises:
    util.TimeoutError: If none of the expected conditions was observed and
      sleeping for another polling interval would go past the deadline.
  """
  crd_api = k8s_client.CustomObjectsApi(client)
  end_time = datetime.datetime.now() + timeout
  version = "v1alpha2"
  # Conditions seen on the most recent successful poll. Initialised here so
  # the timeout message below is well defined even if no poll ever succeeded.
  conditions = []
  while True:
    # By setting async_req=True ApiClient returns
    # multiprocessing.pool.AsyncResult. If we don't set it then the call
    # could potentially block forever.
    # NOTE: the keyword used to be spelled `async`, which is a reserved word
    # (and therefore a syntax error) on Python 3.7+; this requires a
    # kubernetes client release that accepts `async_req`.
    thread = crd_api.get_namespaced_custom_object(
      TF_JOB_GROUP, version, namespace, TF_JOB_PLURAL, name, async_req=True)

    # Try to get the result but timeout.
    results = None
    try:
      results = thread.get(TIMEOUT)
    except multiprocessing.TimeoutError:
      logging.error("Timeout trying to get TFJob.")

    if results:
      if status_callback:
        status_callback(results)

      # If we poll the CRD quick enough status won't have been set yet.
      conditions = results.get("status", {}).get("conditions", [])
      for c in conditions:
        if c.get("type", "") in expected_condition:
          return results

    # Give up now if the next sleep would overshoot the deadline.
    if datetime.datetime.now() + polling_interval > end_time:
      raise util.TimeoutError(
        "Timeout waiting for job {0} in namespace {1} to enter one of the "
        "conditions {2}. Last observed conditions: {3}.".format(
          name, namespace, expected_condition, conditions))

    # total_seconds() honours the whole interval; the .seconds attribute
    # silently drops the days and microseconds components.
    time.sleep(polling_interval.total_seconds())

  # Linter complains if we don't have a return statement even though
  # this code is unreachable.
  return None
예제 #3
0
def wait_for_job(client,
                 namespace,
                 name,
                 version="v1alpha1",
                 timeout=datetime.timedelta(minutes=10),
                 polling_interval=datetime.timedelta(seconds=30),
                 status_callback=None):
  """Wait for the specified job to finish.

  Args:
    client: K8s api client.
    namespace: namespace for the job.
    name: Name of the job.
    version: API version of the job. For "v1alpha1" the job is finished when
      status.phase is "Done"; for any other version it is finished when
      status.completionTime is non-empty.
    timeout: datetime.timedelta. How long to wait for the job.
    polling_interval: datetime.timedelta. How often to poll for the status
      of the job.
    status_callback: (Optional): Callable. If supplied this callable is
      invoked after we poll the job. Callable takes a single argument which
      is the job.

  Returns:
    The job (as returned by the custom objects API) once it has finished.

  Raises:
    util.TimeoutError: If the job has not finished and sleeping for another
      polling interval would go past the deadline.
  """
  crd_api = k8s_client.CustomObjectsApi(client)
  end_time = datetime.datetime.now() + timeout
  while True:
    # By setting async_req=True ApiClient returns
    # multiprocessing.pool.AsyncResult. If we don't set it then the call
    # could potentially block forever.
    # NOTE: the keyword used to be spelled `async`, which is a reserved word
    # (and therefore a syntax error) on Python 3.7+; this requires a
    # kubernetes client release that accepts `async_req`.
    thread = crd_api.get_namespaced_custom_object(
      TF_JOB_GROUP, version, namespace, TF_JOB_PLURAL, name, async_req=True)

    # Try to get the result but timeout.
    results = None
    try:
      results = thread.get(TIMEOUT)
    except multiprocessing.TimeoutError:
      logging.error("Timeout trying to get TFJob.")

    if results:
      if status_callback:
        status_callback(results)

      # If we poll the CRD quick enough status won't have been set yet.
      if version == "v1alpha1":
        if results.get("status", {}).get("phase", {}) == "Done":
          return results
      else:
        # For v1alpha2 check for non-empty completionTime
        # TODO(jlewi): https://github.com/kubeflow/tf-operator/issues/673
        # Once that issue is fixed we should be able to look at the condition.
        if results.get("status", {}).get("completionTime", ""):
          return results

    # Give up now if the next sleep would overshoot the deadline.
    if datetime.datetime.now() + polling_interval > end_time:
      raise util.TimeoutError(
        "Timeout waiting for job {0} in namespace {1} to finish.".format(
          name, namespace))

    # total_seconds() honours the whole interval; the .seconds attribute
    # silently drops the days and microseconds components.
    time.sleep(polling_interval.total_seconds())

  # Linter complains if we don't have a return statement even though
  # this code is unreachable.
  return None
예제 #4
0
def wait_for_tf_k8s_tests(client,
                          run_id,
                          timeout=datetime.timedelta(minutes=30),
                          polling_interval=datetime.timedelta(seconds=15)):
    """Wait for the E2E pipeline to finish.

    Args:
      client: Airflow client.
      run_id: Id of the Airflow run.
      timeout: datetime.timedelta. Timeout. Defaults to 30 minutes.
      polling_interval: datetime.timedelta. How often to poll for pipeline
        status.

    Returns:
      state: The state of the final task ("done"), i.e. the first non-empty
        state that is not "queued", "running" or "None".

    Raises:
      util.TimeoutError: If the final task has not reached such a state and
        sleeping for another polling interval would go past the deadline.
    """
    endtime = datetime.datetime.now() + timeout
    while True:
        # TODO(jlewi): Airflow only allows us to get the stats of individual tasks
        # not the overall DAG. So we just get the status of the final step.
        # This should be sufficient for our purposes.
        #
        # In the ui it looks like every DAG has a task "undefined" that indicates
        # overall status of the DAG; but we get an error if we try to get this
        # task using the API.
        resp = client.get_task_status(E2E_DAG, run_id, "done")

        state = resp.get("state", "")
        logging.info("State of DAG %s run %s step done: %s", E2E_DAG, run_id,
                     state)
        # If earlier stages fail, the state of the final step is expected to
        # be "upstream_failed", which also counts as finished here.
        if state and state not in ["queued", "running", "None"]:
            return state
        # Give up now if the next sleep would overshoot the deadline.
        if datetime.datetime.now() + polling_interval > endtime:
            raise util.TimeoutError(
                "Timed out waiting for DAG {0} run {1} to finish.".format(
                    E2E_DAG, run_id))
        logging.info("Waiting for DAG %s run %s to finish.", E2E_DAG, run_id)
        # total_seconds() honours the whole interval; the .seconds attribute
        # silently drops the days and microseconds components.
        time.sleep(polling_interval.total_seconds())

    # Linter complains if we don't have a return statement even though
    # this code is unreachable.
    return None
예제 #5
0
def wait_for_job(client,
                 namespace,
                 name,
                 timeout=datetime.timedelta(minutes=5),
                 polling_interval=datetime.timedelta(seconds=30),
                 status_callback=None):
    """Wait for the specified job to finish.

    Args:
      client: K8s api client.
      namespace: namespace for the job.
      name: Name of the job.
      timeout: datetime.timedelta. How long to wait for the job.
      polling_interval: datetime.timedelta. How often to poll for the status
        of the job.
      status_callback: (Optional): Callable. If supplied this callable is
        invoked after we poll the job. Callable takes a single argument which
        is the job.

    Returns:
      The job (as returned by the custom objects API) once its status.phase
      is "Done".

    Raises:
      util.TimeoutError: If the job has not finished and sleeping for another
        polling interval would go past the deadline.
    """
    crd_api = k8s_client.CustomObjectsApi(client)
    end_time = datetime.datetime.now() + timeout
    while True:
        results = crd_api.get_namespaced_custom_object(
            TF_JOB_GROUP, TF_JOB_VERSION, namespace, TF_JOB_PLURAL, name)

        if status_callback:
            status_callback(results)

        # If we poll the CRD quick enough status won't have been set yet.
        if results.get("status", {}).get("phase", {}) == "Done":
            return results

        # Give up now if the next sleep would overshoot the deadline.
        if datetime.datetime.now() + polling_interval > end_time:
            raise util.TimeoutError(
                "Timeout waiting for job {0} in namespace {1} to finish.".
                format(name, namespace))

        # total_seconds() honours the whole interval; the .seconds attribute
        # silently drops the days and microseconds components.
        time.sleep(polling_interval.total_seconds())

    # Linter complains if we don't have a return statement even though
    # this code is unreachable.
    return None
예제 #6
0
def wait_for_delete(client,
                    namespace,
                    name,
                    version="v1alpha1",
                    timeout=datetime.timedelta(minutes=5),
                    polling_interval=datetime.timedelta(seconds=30),
                    status_callback=None):
    """Wait for the specified job to be deleted.

    Args:
      client: K8s api client.
      namespace: namespace for the job.
      name: Name of the job.
      version: API version used to look up the job.
      timeout: datetime.timedelta. How long to wait for the job.
      polling_interval: datetime.timedelta. How often to poll for the status
        of the job.
      status_callback: (Optional): Callable. If supplied this callable is
        invoked after we poll the job. Callable takes a single argument which
        is the job.

    Returns:
      None, as soon as looking up the job fails with a NOT_FOUND status.

    Raises:
      rest.ApiException: If the lookup fails with any other status.
      util.TimeoutError: If the job still exists and sleeping for another
        polling interval would go past the deadline.
    """
    crd_api = k8s_client.CustomObjectsApi(client)
    end_time = datetime.datetime.now() + timeout
    while True:
        try:
            results = crd_api.get_namespaced_custom_object(
                TF_JOB_GROUP, version, namespace, TF_JOB_PLURAL, name)
        except rest.ApiException as e:
            # NOT_FOUND is the success case: the job no longer exists.
            if e.status == httplib.NOT_FOUND:
                return
            logging.exception("rest.ApiException thrown")
            raise
        if status_callback:
            status_callback(results)

        # Give up now if the next sleep would overshoot the deadline.
        if datetime.datetime.now() + polling_interval > end_time:
            raise util.TimeoutError(
                "Timeout waiting for job {0} in namespace {1} to be deleted.".
                format(name, namespace))

        # total_seconds() honours the whole interval; the .seconds attribute
        # silently drops the days and microseconds components.
        time.sleep(polling_interval.total_seconds())
예제 #7
0
def wait_for_workflow(client,
                      namespace,
                      name,
                      timeout=datetime.timedelta(minutes=5),
                      polling_interval=datetime.timedelta(seconds=30),
                      status_callback=None):
    """Wait for the specified workflow to finish.

    Args:
      client: K8s api client.
      namespace: namespace for the workflow.
      name: Name of the workflow.
      timeout: datetime.timedelta. How long to wait for the workflow.
      polling_interval: datetime.timedelta. How often to poll for the status
        of the workflow.
      status_callback: (Optional): Callable. If supplied this callable is
        invoked after we poll the workflow. Callable takes a single argument
        which is the workflow.

    Returns:
      The workflow (as returned by the custom objects API) once its
      status.phase is "Failed" or "Succeeded".

    Raises:
      TimeoutError: If timeout waiting for the workflow to finish.
    """
    crd_api = k8s_client.CustomObjectsApi(client)
    end_time = datetime.datetime.now() + timeout
    while True:
        results = crd_api.get_namespaced_custom_object(GROUP, VERSION,
                                                       namespace, PLURAL, name)

        if status_callback:
            status_callback(results)

        # If we poll the CRD quick enough status won't have been set yet, so
        # treat a missing status/phase as "not finished" rather than letting
        # a KeyError escape.
        phase = results.get("status", {}).get("phase", "")
        if phase in ["Failed", "Succeeded"]:
            return results

        # Give up now if the next sleep would overshoot the deadline.
        if datetime.datetime.now() + polling_interval > end_time:
            raise util.TimeoutError(
                "Timeout waiting for workflow {0} in namespace {1} to finish.".
                format(name, namespace))

        # total_seconds() honours the whole interval; the .seconds attribute
        # silently drops the days and microseconds components.
        time.sleep(polling_interval.total_seconds())