def wait_for_pods_to_be_deleted(
        client,
        namespace,
        pod_selector,
        timeout=datetime.timedelta(minutes=5),
        polling_interval=datetime.timedelta(seconds=30)):
    """Wait until no pods match the given selector.

    Args:
      client: K8s api client.
      namespace: Namespace to look for pods in.
      pod_selector: Selector for the pods.
      timeout: How long to wait for the pods to disappear.
      polling_interval: How often to list the pods.

    Returns:
      None, once listing the pods yields no items.

    Raises:
      util.TimeoutError: If matching pods still exist and the next poll would
        fall after the deadline.
    """
    end_time = datetime.datetime.now() + timeout
    while True:
        pods = list_pods(client, namespace, pod_selector)

        logging.info("%s pods matched %s pods", len(pods.items), pod_selector)

        if not pods.items:
            return

        # Give up early if sleeping one more interval would overshoot the
        # deadline.
        if datetime.datetime.now() + polling_interval > end_time:
            raise util.TimeoutError("Timeout waiting for pods to be deleted.")

        # total_seconds() covers the whole interval; the .seconds attribute is
        # only the seconds component and ignores days and sub-second parts.
        time.sleep(polling_interval.total_seconds())
def wait_for_condition(client,
                       namespace,
                       name,
                       expected_condition,
                       timeout=datetime.timedelta(minutes=10),
                       polling_interval=datetime.timedelta(seconds=30),
                       status_callback=None):
    """Waits until any of the specified conditions occur.

    This function only works with v1alpha2 jobs.

    Args:
      client: K8s api client.
      namespace: namespace for the job.
      name: Name of the job.
      expected_condition: A list of conditions. Function waits until any of the
        supplied conditions is reached.
      timeout: How long to wait for the job.
      polling_interval: How often to poll for the status of the job.
      status_callback: (Optional): Callable. If supplied this callable is
        invoked after we poll the job. Callable takes a single argument which
        is the job.

    Returns:
      The job object as returned by the API once one of its conditions has a
      type contained in expected_condition.

    Raises:
      util.TimeoutError: If none of the expected conditions is observed before
        the deadline.
    """
    crd_api = k8s_client.CustomObjectsApi(client)
    end_time = datetime.datetime.now() + timeout
    version = "v1alpha2"

    # Bound before the loop so the timeout message can always be formatted,
    # even when no poll ever returned a result.
    conditions = []

    while True:
        # By requesting an asynchronous call ApiClient returns a
        # multiprocessing.pool.AsyncResult. If we don't then the call could
        # potentially block forever.
        # The keyword is spelled async_req because `async` is a reserved word
        # from Python 3.7 on; this assumes a kubernetes client that accepts
        # async_req.
        thread = crd_api.get_namespaced_custom_object(
            TF_JOB_GROUP, version, namespace, TF_JOB_PLURAL, name,
            async_req=True)

        # Try to get the result but timeout.
        results = None
        try:
            results = thread.get(TIMEOUT)
        except multiprocessing.TimeoutError:
            logging.error("Timeout trying to get TFJob.")

        if results:
            if status_callback:
                status_callback(results)

            # If we poll the CRD quick enough status won't have been set yet.
            conditions = results.get("status", {}).get("conditions", [])
            for c in conditions:
                if c.get("type", "") in expected_condition:
                    return results

        # Give up early if sleeping one more interval would overshoot the
        # deadline.
        if datetime.datetime.now() + polling_interval > end_time:
            # Report the conditions we were waiting for, plus what was last
            # seen, so the failure is actionable.
            raise util.TimeoutError(
                "Timeout waiting for job {0} in namespace {1} to enter one of "
                "the conditions {2}. Last observed conditions: {3}.".format(
                    name, namespace, expected_condition, conditions))

        # total_seconds() covers the whole interval; the .seconds attribute is
        # only the seconds component and ignores days and sub-second parts.
        time.sleep(polling_interval.total_seconds())

    # Linter complains if we don't have a return statement even though
    # this code is unreachable.
    return None
def wait_for_job(client,
                 namespace,
                 name,
                 version="v1alpha1",
                 timeout=datetime.timedelta(minutes=10),
                 polling_interval=datetime.timedelta(seconds=30),
                 status_callback=None):
    """Wait for the specified job to finish.

    Args:
      client: K8s api client.
      namespace: namespace for the job.
      name: Name of the job.
      version: API version of the job. "v1alpha1" jobs are finished when
        status.phase is "Done"; any other version is treated as finished when
        status.completionTime is non-empty.
      timeout: How long to wait for the job.
      polling_interval: How often to poll for the status of the job.
      status_callback: (Optional): Callable. If supplied this callable is
        invoked after we poll the job. Callable takes a single argument which
        is the job.

    Returns:
      The job object as returned by the API once it is finished.

    Raises:
      util.TimeoutError: If the job does not finish before the deadline.
    """
    crd_api = k8s_client.CustomObjectsApi(client)
    end_time = datetime.datetime.now() + timeout

    while True:
        # By requesting an asynchronous call ApiClient returns a
        # multiprocessing.pool.AsyncResult. If we don't then the call could
        # potentially block forever.
        # The keyword is spelled async_req because `async` is a reserved word
        # from Python 3.7 on; this assumes a kubernetes client that accepts
        # async_req.
        thread = crd_api.get_namespaced_custom_object(
            TF_JOB_GROUP, version, namespace, TF_JOB_PLURAL, name,
            async_req=True)

        # Try to get the result but timeout.
        results = None
        try:
            results = thread.get(TIMEOUT)
        except multiprocessing.TimeoutError:
            logging.error("Timeout trying to get TFJob.")

        if results:
            if status_callback:
                status_callback(results)

            # If we poll the CRD quick enough status won't have been set yet.
            status = results.get("status", {})
            if version == "v1alpha1":
                if status.get("phase", {}) == "Done":
                    return results
            else:
                # For v1alpha2 check for non-empty completionTime
                # TODO(jlewi): https://github.com/kubeflow/tf-operator/issues/673
                # Once that issue is fixed we should be able to look at the
                # condition.
                if status.get("completionTime", ""):
                    return results

        # Give up early if sleeping one more interval would overshoot the
        # deadline.
        if datetime.datetime.now() + polling_interval > end_time:
            raise util.TimeoutError(
                "Timeout waiting for job {0} in namespace {1} to finish.".format(
                    name, namespace))

        # total_seconds() covers the whole interval; the .seconds attribute is
        # only the seconds component and ignores days and sub-second parts.
        time.sleep(polling_interval.total_seconds())

    # Linter complains if we don't have a return statement even though
    # this code is unreachable.
    return None
def wait_for_tf_k8s_tests(client,
                          run_id,
                          timeout=datetime.timedelta(minutes=30),
                          polling_interval=datetime.timedelta(seconds=15)):
    """Wait for the E2E pipeline to finish.

    Args:
      client: Airflow client.
      run_id: Id of the Airflow run
      timeout: Timeout. Defaults to 30 minutes.
      polling_interval: How often to poll for pipeline status.

    Returns:
      state: The state of the final task.

    Raises:
      util.TimeoutError: If the final task is still queued/running (or has no
        state) and the next poll would fall after the deadline.
    """
    endtime = datetime.datetime.now() + timeout
    while True:
        # TODO(jlewi): Airflow only allows us to get the stats of individual
        # tasks not the overall DAG. So we just get the status of the final
        # step. This should be sufficient for our purposes.
        #
        # In the ui it looks like every DAG has a task "undefined" that
        # indicates overall status of the DAG; but we get an error if we try to
        # get this task using the API.
        resp = client.get_task_status(E2E_DAG, run_id, "done")

        state = resp.get("state", "")
        logging.info("State of DAG %s run %s step done: %s", E2E_DAG, run_id,
                     state)

        # If earlier stages fail and teardown_cluster never runs then the state
        # of the step will be "upstream_failed".
        # Any non-empty state other than these in-progress ones is final.
        if state and state not in ["queued", "running", "None"]:
            return state

        # Give up early if sleeping one more interval would overshoot the
        # deadline.
        if datetime.datetime.now() + polling_interval > endtime:
            raise util.TimeoutError(
                "Timed out waiting for DAG {0} run {1} to finish.".format(
                    E2E_DAG, run_id))

        logging.info("Waiting for DAG %s run %s to finish.", E2E_DAG, run_id)

        # total_seconds() covers the whole interval; the .seconds attribute is
        # only the seconds component and ignores days and sub-second parts.
        time.sleep(polling_interval.total_seconds())

    # Linter complains if we don't have a return statement even though
    # this code is unreachable.
    return None
def wait_for_job(client,
                 namespace,
                 name,
                 timeout=datetime.timedelta(minutes=5),
                 polling_interval=datetime.timedelta(seconds=30),
                 status_callback=None):
    """Wait for the specified job to finish.

    Args:
      client: K8s api client.
      namespace: namespace for the job.
      name: Name of the job.
      timeout: How long to wait for the job.
      polling_interval: How often to poll for the status of the job.
      status_callback: (Optional): Callable. If supplied this callable is
        invoked after we poll the job. Callable takes a single argument which
        is the job.

    Returns:
      The job object as returned by the API once status.phase is "Done".

    Raises:
      util.TimeoutError: If the job is not done and the next poll would fall
        after the deadline.
    """
    crd_api = k8s_client.CustomObjectsApi(client)
    end_time = datetime.datetime.now() + timeout

    while True:
        results = crd_api.get_namespaced_custom_object(
            TF_JOB_GROUP, TF_JOB_VERSION, namespace, TF_JOB_PLURAL, name)

        if status_callback:
            status_callback(results)

        # If we poll the CRD quick enough status won't have been set yet.
        if results.get("status", {}).get("phase", {}) == "Done":
            return results

        # Give up early if sleeping one more interval would overshoot the
        # deadline.
        if datetime.datetime.now() + polling_interval > end_time:
            raise util.TimeoutError(
                "Timeout waiting for job {0} in namespace {1} to finish.".format(
                    name, namespace))

        # total_seconds() covers the whole interval; the .seconds attribute is
        # only the seconds component and ignores days and sub-second parts.
        time.sleep(polling_interval.total_seconds())

    # Linter complains if we don't have a return statement even though
    # this code is unreachable.
    return None
def wait_for_delete(client,
                    namespace,
                    name,
                    version="v1alpha1",
                    timeout=datetime.timedelta(minutes=5),
                    polling_interval=datetime.timedelta(seconds=30),
                    status_callback=None):
    """Wait for the specified job to be deleted.

    Args:
      client: K8s api client.
      namespace: namespace for the job.
      name: Name of the job.
      version: API version of the job to look up.
      timeout: How long to wait for the job.
      polling_interval: How often to poll for the status of the job.
      status_callback: (Optional): Callable. If supplied this callable is
        invoked after we poll the job. Callable takes a single argument which
        is the job.

    Returns:
      None, once looking up the job fails with NOT_FOUND.

    Raises:
      rest.ApiException: If the lookup fails with any status other than
        NOT_FOUND.
      util.TimeoutError: If the job still exists and the next poll would fall
        after the deadline.
    """
    crd_api = k8s_client.CustomObjectsApi(client)
    end_time = datetime.datetime.now() + timeout

    while True:
        try:
            results = crd_api.get_namespaced_custom_object(
                TF_JOB_GROUP, version, namespace, TF_JOB_PLURAL, name)
        except rest.ApiException as e:
            # NOT_FOUND is the success signal: the job no longer exists.
            if e.status == httplib.NOT_FOUND:
                return
            logging.exception("rest.ApiException thrown")
            raise

        if status_callback:
            status_callback(results)

        # Give up early if sleeping one more interval would overshoot the
        # deadline.
        if datetime.datetime.now() + polling_interval > end_time:
            raise util.TimeoutError(
                "Timeout waiting for job {0} in namespace {1} to be deleted.".format(
                    name, namespace))

        # total_seconds() covers the whole interval; the .seconds attribute is
        # only the seconds component and ignores days and sub-second parts.
        time.sleep(polling_interval.total_seconds())
def wait_for_workflow(client,
                      namespace,
                      name,
                      timeout=datetime.timedelta(minutes=5),
                      polling_interval=datetime.timedelta(seconds=30),
                      status_callback=None):
    """Wait for the specified workflow to finish.

    Args:
      client: K8s api client.
      namespace: namespace for the workflow.
      name: Name of the workflow.
      timeout: How long to wait for the workflow.
      polling_interval: How often to poll for the status of the workflow.
      status_callback: (Optional): Callable. If supplied this callable is
        invoked after we poll the job. Callable takes a single argument which
        is the job.

    Returns:
      The workflow object as returned by the API once status.phase is
      "Failed" or "Succeeded".

    Raises:
      TimeoutError: If timeout waiting for the job to finish.
    """
    crd_api = k8s_client.CustomObjectsApi(client)
    end_time = datetime.datetime.now() + timeout

    while True:
        results = crd_api.get_namespaced_custom_object(
            GROUP, VERSION, namespace, PLURAL, name)

        if status_callback:
            status_callback(results)

        # Use .get so that an object polled before its status/phase has been
        # populated is treated as "not finished yet" instead of raising
        # KeyError.
        if results.get("status", {}).get("phase", "") in ["Failed", "Succeeded"]:
            return results

        # Give up early if sleeping one more interval would overshoot the
        # deadline.
        if datetime.datetime.now() + polling_interval > end_time:
            raise util.TimeoutError(
                "Timeout waiting for workflow {0} in namespace {1} to finish.".format(
                    name, namespace))

        # total_seconds() covers the whole interval; the .seconds attribute is
        # only the seconds component and ignores days and sub-second parts.
        time.sleep(polling_interval.total_seconds())