def wait_for_job_success(
    self,
    job_name,
    namespace,
    wait_timeout=DEFAULT_WAIT_TIMEOUT,
    wait_time_between_attempts=DEFAULT_WAIT_BETWEEN_ATTEMPTS,
    num_pods_to_wait_for=DEFAULT_JOB_POD_COUNT,
):
    '''Poll a job for successful completion.

    Args:
        job_name (str): Name of the job to wait for.
        namespace (str): Namespace in which the job is located.
        wait_timeout (numeric, optional): Timeout after which to give up and raise exception.
            Defaults to DEFAULT_WAIT_TIMEOUT.
        wait_time_between_attempts (numeric, optional): Wait time between polling attempts.
            Defaults to DEFAULT_WAIT_BETWEEN_ATTEMPTS.

    Raises:
        DagsterK8sError: Raised when wait_timeout is exceeded or an error is encountered.
    '''
    check.str_param(job_name, 'job_name')
    check.str_param(namespace, 'namespace')
    check.numeric_param(wait_timeout, 'wait_timeout')
    check.numeric_param(wait_time_between_attempts, 'wait_time_between_attempts')
    check.int_param(num_pods_to_wait_for, 'num_pods_to_wait_for')

    job = None
    start = self.timer()

    # Ensure we found the job that we launched
    while not job:
        if self.timer() - start > wait_timeout:
            raise DagsterK8sError('Timed out while waiting for job to launch')

        jobs = self.batch_api.list_namespaced_job(namespace=namespace)
        job = next((j for j in jobs.items if j.metadata.name == job_name), None)

        if not job:
            self.logger('Job "{job_name}" not yet launched, waiting'.format(job_name=job_name))
            self.sleeper(wait_time_between_attempts)

    # Wait for job completed status
    while True:
        if self.timer() - start > wait_timeout:
            raise DagsterK8sError('Timed out while waiting for job to complete')

        # See: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.11/#jobstatus-v1-batch
        status = self.batch_api.read_namespaced_job_status(job_name, namespace=namespace).status

        if status.failed and status.failed > 0:
            raise DagsterK8sError('Encountered failed job pods with status: %s' % str(status))

        # done waiting for pod completion
        if status.succeeded == num_pods_to_wait_for:
            break

        self.sleeper(wait_time_between_attempts)
def retry_pg_connection_fn(fn, retry_limit=5, retry_wait=0.2):
    """Reusable retry logic for any psycopg2/sqlalchemy PG connection functions that may fail.

    Intended to be used anywhere we connect to PG, to gracefully handle transient connection issues.
    """
    check.callable_param(fn, "fn")
    check.int_param(retry_limit, "retry_limit")
    check.numeric_param(retry_wait, "retry_wait")

    while True:
        try:
            return fn()
        except (
            # See: https://www.psycopg.org/docs/errors.html
            # These are broad, we may want to list out specific exceptions to capture
            psycopg2.DatabaseError,
            psycopg2.OperationalError,
            sqlalchemy.exc.DatabaseError,
            sqlalchemy.exc.OperationalError,
        ) as exc:
            logging.warning("Retrying failed database connection")
            if retry_limit == 0:
                raise DagsterPostgresException("too many retries for DB connection") from exc

            time.sleep(retry_wait)
            retry_limit -= 1
def retry_mysql_creation_fn(fn, retry_limit=5, retry_wait=0.2):
    # Retry logic to recover from the case where two processes are creating
    # tables at the same time using sqlalchemy
    check.callable_param(fn, "fn")
    check.int_param(retry_limit, "retry_limit")
    check.numeric_param(retry_wait, "retry_wait")

    while True:
        try:
            return fn()
        except (
            mysql.ProgrammingError,
            mysql.IntegrityError,
            db.exc.ProgrammingError,
            db.exc.IntegrityError,
        ) as exc:
            if (
                isinstance(exc, db.exc.ProgrammingError)
                and exc.orig
                and exc.orig.errno == mysql.errorcode.ER_TABLE_EXISTS_ERROR
            ) or (
                isinstance(exc, mysql.ProgrammingError)
                and exc.errno == mysql.errorcode.ER_TABLE_EXISTS_ERROR
            ):
                raise

            logging.warning("Retrying failed database creation")
            if retry_limit == 0:
                raise DagsterMySQLException("too many retries for DB creation") from exc

            time.sleep(retry_wait)
            retry_limit -= 1
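# A minimal usage sketch for retry_mysql_creation_fn, assuming a sqlalchemy
# engine and table definition (the connection URL and table below are
# illustrative placeholders, not part of the original module). The callable is
# retried on transient ProgrammingError/IntegrityError races; an
# ER_TABLE_EXISTS_ERROR is re-raised immediately rather than retried.
import sqlalchemy as db


def create_example_mysql_tables(conn_string):
    engine = db.create_engine(conn_string)
    metadata = db.MetaData()
    db.Table("example_runs", metadata, db.Column("run_id", db.String(255), primary_key=True))

    # Wrap the DDL call so two processes racing to create the same tables
    # do not cause a spurious failure.
    retry_mysql_creation_fn(lambda: metadata.create_all(engine))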
def wait_for_job(
    self,
    job_name,
    namespace,
    wait_timeout=DEFAULT_WAIT_TIMEOUT,
    wait_time_between_attempts=DEFAULT_WAIT_BETWEEN_ATTEMPTS,
    start_time=None,
):
    """Wait for a job to launch and be running.

    Args:
        job_name (str): Name of the job to wait for.
        namespace (str): Namespace in which the job is located.
        wait_timeout (numeric, optional): Timeout after which to give up and raise exception.
            Defaults to DEFAULT_WAIT_TIMEOUT.
        wait_time_between_attempts (numeric, optional): Wait time between polling attempts.
            Defaults to DEFAULT_WAIT_BETWEEN_ATTEMPTS.

    Raises:
        DagsterK8sError: Raised when wait_timeout is exceeded or an error is encountered.
    """
    check.str_param(job_name, "job_name")
    check.str_param(namespace, "namespace")
    check.numeric_param(wait_timeout, "wait_timeout")
    check.numeric_param(wait_time_between_attempts, "wait_time_between_attempts")

    job = None
    start = start_time or self.timer()

    while not job:
        if self.timer() - start > wait_timeout:
            raise DagsterK8sTimeoutError(
                "Timed out while waiting for job {job_name} to launch".format(job_name=job_name)
            )

        # Get all jobs in the namespace and find the matching job
        def _get_jobs_for_namespace():
            jobs = self.batch_api.list_namespaced_job(
                namespace=namespace, field_selector="metadata.name={}".format(job_name)
            )
            if jobs.items:
                check.invariant(
                    len(jobs.items) == 1,
                    'There should only be one k8s job with name "{}", but got multiple jobs: {}'.format(
                        job_name, jobs.items
                    ),
                )
                return jobs.items[0]
            else:
                return None

        job = k8s_api_retry(
            _get_jobs_for_namespace, max_retries=3, timeout=wait_time_between_attempts
        )

        if not job:
            self.logger('Job "{job_name}" not yet launched, waiting'.format(job_name=job_name))
            self.sleeper(wait_time_between_attempts)
def retry_mysql_connection_fn(fn, retry_limit=5, retry_wait=0.2):
    """Reusable retry logic for any MySQL connection functions that may fail.

    Intended to be used anywhere we connect to MySQL, to gracefully handle transient connection issues.
    """
    check.callable_param(fn, "fn")
    check.int_param(retry_limit, "retry_limit")
    check.numeric_param(retry_wait, "retry_wait")

    while True:
        try:
            return fn()
        except (
            mysql.DatabaseError,
            mysql.OperationalError,
            db.exc.DatabaseError,
            db.exc.OperationalError,
            mysql.errors.InterfaceError,
        ) as exc:
            logging.warning("Retrying failed database connection")
            if retry_limit == 0:
                raise DagsterMySQLException("too many retries for DB connection") from exc

            time.sleep(retry_wait)
            retry_limit -= 1
def retry_pg_creation_fn(fn, retry_limit=5, retry_wait=0.2):
    # Retry logic to recover from the case where two processes are creating
    # tables at the same time using sqlalchemy
    check.callable_param(fn, "fn")
    check.int_param(retry_limit, "retry_limit")
    check.numeric_param(retry_wait, "retry_wait")

    while True:
        try:
            return fn()
        except (
            psycopg2.ProgrammingError,
            psycopg2.IntegrityError,
            sqlalchemy.exc.ProgrammingError,
            sqlalchemy.exc.IntegrityError,
        ) as exc:
            # Only programming error we want to retry on is the DuplicateTable error
            if (
                isinstance(exc, sqlalchemy.exc.ProgrammingError)
                and exc.orig
                and exc.orig.pgcode != psycopg2.errorcodes.DUPLICATE_TABLE
            ) or (
                isinstance(exc, psycopg2.ProgrammingError)
                and exc.pgcode != psycopg2.errorcodes.DUPLICATE_TABLE
            ):
                raise

            logging.warning("Retrying failed database creation")
            if retry_limit == 0:
                raise DagsterPostgresException("too many retries for DB creation") from exc

            time.sleep(retry_wait)
            retry_limit -= 1
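# A minimal usage sketch for retry_pg_creation_fn, assuming a sqlalchemy engine
# (the connection URL and table are illustrative placeholders). Here only the
# DuplicateTable error is retried, which covers the race where two processes
# run the same CREATE TABLE statements concurrently.
import sqlalchemy


def create_example_pg_tables(conn_string):
    engine = sqlalchemy.create_engine(conn_string)
    metadata = sqlalchemy.MetaData()
    sqlalchemy.Table(
        "example_events", metadata, sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True)
    )

    retry_pg_creation_fn(lambda: metadata.create_all(engine))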
def retry_pg_connection_fn(fn, retry_limit=5, retry_wait=0.2):
    """Reusable retry logic for any psycopg2/sqlalchemy PG connection functions that may fail.

    Intended to be used anywhere we connect to PG, to gracefully handle transient connection issues.
    """
    check.callable_param(fn, "fn")
    check.int_param(retry_limit, "retry_limit")
    check.numeric_param(retry_wait, "retry_wait")

    attempt_num = 0
    while True:
        attempt_num += 1
        try:
            return fn()
        except (
            # See: https://www.psycopg.org/docs/errors.html
            # These are broad, we may want to list out specific exceptions to capture
            psycopg2.DatabaseError,
            psycopg2.OperationalError,
            sqlalchemy.exc.DatabaseError,
            sqlalchemy.exc.OperationalError,
        ) as exc:
            logging.warning("Retrying failed database connection: %s", exc)
            if attempt_num > retry_limit:
                raise DagsterPostgresException("too many retries for DB connection") from exc

            time.sleep(
                calculate_delay(
                    attempt_num=attempt_num,
                    base_delay=retry_wait,
                    jitter=Jitter.PLUS_MINUS,
                    backoff=Backoff.EXPONENTIAL,
                )
            )
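# A minimal usage sketch for retry_pg_connection_fn, assuming a sqlalchemy
# engine whose URL is an illustrative placeholder. The wrapped callable is
# retried with exponential backoff and jitter, which smooths over transient
# connection failures (e.g. while a Postgres container is still starting up).
import sqlalchemy


def connect_with_retries(conn_string):
    engine = sqlalchemy.create_engine(conn_string)

    # engine.connect() is re-invoked until it succeeds or retry_limit is exceeded.
    return retry_pg_connection_fn(engine.connect, retry_limit=10, retry_wait=0.5)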
def run(self, interval_seconds=2):
    """Run the coordinator daemon

    Arguments:
        interval_seconds (float): time in seconds to wait between dequeuing attempts
    """
    check.numeric_param(interval_seconds, "interval_seconds")

    while True:
        self.attempt_to_launch_runs()
        time.sleep(interval_seconds)
def __init__(self, interval_seconds):
    self._logger = get_default_daemon_logger(type(self).__name__)
    self.interval_seconds = check.numeric_param(interval_seconds, "interval_seconds")

    self._last_iteration_time = None
    self._last_heartbeat_time = None
    self._current_iteration_exceptions = None
    self._last_iteration_exceptions = None
def k8s_api_retry(
    fn,
    max_retries,
    timeout,
    msg_fn=lambda: "Unexpected error encountered in Kubernetes API Client.",
):
    check.callable_param(fn, "fn")
    check.int_param(max_retries, "max_retries")
    check.numeric_param(timeout, "timeout")

    remaining_attempts = 1 + max_retries
    while remaining_attempts > 0:
        remaining_attempts -= 1

        try:
            return fn()
        except kubernetes.client.rest.ApiException as e:
            # Only catch whitelisted ApiExceptions
            status = e.status

            # Check if the status code is generally whitelisted
            whitelisted = status in WHITELISTED_TRANSIENT_K8S_STATUS_CODES

            # If there are remaining attempts, swallow the error
            if whitelisted and remaining_attempts > 0:
                time.sleep(timeout)
            elif whitelisted and remaining_attempts == 0:
                raise_from(
                    DagsterK8sAPIRetryLimitExceeded(
                        msg_fn(),
                        k8s_api_exception=e,
                        max_retries=max_retries,
                        original_exc_info=sys.exc_info(),
                    ),
                    e,
                )
            else:
                raise_from(
                    DagsterK8sUnrecoverableAPIError(
                        msg_fn(),
                        k8s_api_exception=e,
                        original_exc_info=sys.exc_info(),
                    ),
                    e,
                )
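# A minimal usage sketch for k8s_api_retry, assuming a kubeconfig- or
# in-cluster-configured Kubernetes client (the job name and namespace are
# illustrative placeholders). A read call against the API server is wrapped so
# whitelisted transient status codes are retried up to max_retries times,
# sleeping `timeout` seconds between attempts.
import kubernetes


def read_job_with_retries(job_name, namespace="default"):
    kubernetes.config.load_kube_config()  # or load_incluster_config() inside a pod
    batch_api = kubernetes.client.BatchV1Api()

    return k8s_api_retry(
        lambda: batch_api.read_namespaced_job(name=job_name, namespace=namespace),
        max_retries=3,
        timeout=5,
        msg_fn=lambda: "Could not read job {}".format(job_name),
    )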
def wait_for_job(
    self,
    job_name,
    namespace,
    wait_timeout=DEFAULT_WAIT_TIMEOUT,
    wait_time_between_attempts=DEFAULT_WAIT_BETWEEN_ATTEMPTS,
):
    """Wait for a job to launch and be running.

    Args:
        job_name (str): Name of the job to wait for.
        namespace (str): Namespace in which the job is located.
        wait_timeout (numeric, optional): Timeout after which to give up and raise exception.
            Defaults to DEFAULT_WAIT_TIMEOUT.
        wait_time_between_attempts (numeric, optional): Wait time between polling attempts.
            Defaults to DEFAULT_WAIT_BETWEEN_ATTEMPTS.

    Raises:
        DagsterK8sError: Raised when wait_timeout is exceeded or an error is encountered.
    """
    check.str_param(job_name, "job_name")
    check.str_param(namespace, "namespace")
    check.numeric_param(wait_timeout, "wait_timeout")
    check.numeric_param(wait_time_between_attempts, "wait_time_between_attempts")

    job = None
    start = self.timer()

    # Ensure we found the job that we launched
    while not job:
        if self.timer() - start > wait_timeout:
            raise DagsterK8sError("Timed out while waiting for job to launch")

        jobs = self.batch_api.list_namespaced_job(namespace=namespace)
        job = next((j for j in jobs.items if j.metadata.name == job_name), None)

        if not job:
            self.logger('Job "{job_name}" not yet launched, waiting'.format(job_name=job_name))
            self.sleeper(wait_time_between_attempts)
def __init__(self, interval_seconds):
    self._logger = get_default_daemon_logger(type(self).__name__)
    self.interval_seconds = check.numeric_param(interval_seconds, "interval_seconds")

    self._last_iteration_time = None
    self._last_heartbeat_time = None
    self._errors = []  # (SerializableErrorInfo, timestamp) tuples

    self._first_error_logged = False
def numeric_column(
    name,
    expected_dtypes,
    min_value=-float('inf'),
    max_value=float('inf'),
    non_nullable=False,
    unique=False,
):
    return PandasColumn(
        name=check.str_param(name, 'name'),
        constraints=[
            ColumnTypeConstraint(expected_dtypes),
            InRangeColumnConstraint(
                check.numeric_param(min_value, 'min_value'),
                check.numeric_param(max_value, 'max_value'),
            ),
        ]
        + _construct_keyword_constraints(non_nullable=non_nullable, unique=unique),
    )
def float_column(
    name,
    min_value=-float("inf"),
    max_value=float("inf"),
    non_nullable=False,
    unique=False,
    ignore_missing_vals=False,
    is_required=None,
):
    """
    Simple constructor for PandasColumns that expresses numeric constraints on float dtypes.

    Args:
        name (str): Name of the column. This must match up with the column name in the dataframe
            you expect to receive.
        min_value (Optional[Union[int,float]]): The lower bound for values you expect in this
            column. Defaults to -float('inf')
        max_value (Optional[Union[int,float]]): The upper bound for values you expect in this
            column. Defaults to float('inf')
        non_nullable (Optional[bool]): If true, this column will enforce a constraint that all
            values in the column ought to be non null values.
        unique (Optional[bool]): If true, this column will enforce a uniqueness constraint on the
            column values.
        ignore_missing_vals (Optional[bool]): A flag that is passed into most constraints. If true,
            the constraint will only evaluate non-null data. Ignore_missing_vals and non_nullable
            cannot both be True.
        is_required (Optional[bool]): Flag indicating the optional/required presence of the column.
            If the column exists the validate function will validate the column. Default to True.
    """
    return PandasColumn(
        name=check.str_param(name, "name"),
        constraints=[
            ColumnDTypeFnConstraint(is_float_dtype),
            InRangeColumnConstraint(
                check.numeric_param(min_value, "min_value"),
                check.numeric_param(max_value, "max_value"),
                ignore_missing_vals=ignore_missing_vals,
            ),
        ]
        + _construct_keyword_constraints(
            non_nullable=non_nullable, unique=unique, ignore_missing_vals=ignore_missing_vals
        ),
        is_required=is_required,
    )
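# A minimal usage sketch for float_column, assuming dagster_pandas'
# create_dagster_pandas_dataframe_type is available alongside PandasColumn
# (the type name and column name below are illustrative placeholders). The
# constraint only admits float values in [0, 1] and skips nulls rather than
# failing on them.
from dagster_pandas import create_dagster_pandas_dataframe_type

ScoredDataFrame = create_dagster_pandas_dataframe_type(
    name="ScoredDataFrame",
    columns=[
        float_column(
            "score",
            min_value=0.0,
            max_value=1.0,
            ignore_missing_vals=True,
        ),
    ],
)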
def numeric_column(
    cls,
    name,
    expected_dtypes,
    min_value=-float('inf'),
    max_value=float('inf'),
    exists=False,
    unique=False,
):
    return cls(
        name=check.str_param(name, 'name'),
        constraints=cls.add_configurable_constraints(
            [
                ColumnTypeConstraint(expected_dtypes),
                InRangeColumnConstraint(
                    check.numeric_param(min_value, 'min_value'),
                    check.numeric_param(max_value, 'max_value'),
                ),
            ],
            exists=exists,
            unique=unique,
        ),
    )
def __init__(self, address, timeout, inst_data=None):
    self._inst_data = check.opt_inst_param(inst_data, 'inst_data', ConfigurableClassData)
    self._address = check.str_param(address, 'address')
    self._timeout = check.numeric_param(timeout, 'timeout')
    self._handle = None
    self._instance = None
    self._validated = False

    parsed_url = urlparse(address)
    check.invariant(
        parsed_url.scheme and parsed_url.netloc,
        'Address {address} is not a valid URL. Host URL should include scheme ie http://localhost'.format(
            address=self._address
        ),
    )
def __init__(
    self, host, token, poll_interval_sec=10, max_wait_time_sec=_DEFAULT_RUN_MAX_WAIT_TIME_SEC
):
    """Args:
        host (str): Databricks host, e.g. https://uksouth.azuredatabricks.net
        token (str): Databricks token
    """
    self.host = check.str_param(host, "host")
    self.token = check.str_param(token, "token")
    self.poll_interval_sec = check.numeric_param(poll_interval_sec, "poll_interval_sec")
    self.max_wait_time_sec = check.int_param(max_wait_time_sec, "max_wait_time_sec")

    self._client = DatabricksClient(host=self.host, token=self.token)
def wait_for_pod(
    pod_name,
    namespace,
    wait_for_state=WaitForPodState.Ready,
    wait_timeout=DEFAULT_WAIT_TIMEOUT,
    wait_time_between_attempts=DEFAULT_WAIT_BETWEEN_ATTEMPTS,
):
    '''Wait for a pod to launch and be running, or wait for termination (useful for job pods).

    Args:
        pod_name (str): Name of the pod to wait for.
        namespace (str): Namespace in which the pod is located.
        wait_for_state (WaitForPodState, optional): Whether to wait for pod readiness or
            termination. Defaults to waiting for readiness.
        wait_timeout (numeric, optional): Timeout after which to give up and raise exception.
            Defaults to DEFAULT_WAIT_TIMEOUT.
        wait_time_between_attempts (numeric, optional): Wait time between polling attempts.
            Defaults to DEFAULT_WAIT_BETWEEN_ATTEMPTS.

    Raises:
        DagsterK8sError: Raised when wait_timeout is exceeded or an error is encountered
    '''
    check.str_param(pod_name, 'pod_name')
    check.str_param(namespace, 'namespace')
    check.inst_param(wait_for_state, 'wait_for_state', WaitForPodState)
    check.numeric_param(wait_timeout, 'wait_timeout')
    check.numeric_param(wait_time_between_attempts, 'wait_time_between_attempts')

    logging.info('Waiting for pod %s' % pod_name)

    start = time.time()

    while True:
        pods = (
            kubernetes.client.CoreV1Api()
            .list_namespaced_pod(namespace=namespace, field_selector='metadata.name=%s' % pod_name)
            .items
        )
        pod = pods[0] if pods else None

        if time.time() - start > wait_timeout:
            raise DagsterK8sError(
                'Timed out while waiting for pod to become ready with pod info: %s' % str(pod)
            )

        if pod is None:
            logging.info('Waiting for pod "%s" to launch...' % pod_name)
            time.sleep(wait_time_between_attempts)
            continue

        if not pod.status.container_statuses:
            logging.info('Waiting for pod container status to be set by kubernetes...')
            time.sleep(wait_time_between_attempts)
            continue

        # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#containerstatus-v1-core
        container_status = pod.status.container_statuses[0]

        # State checks below, see:
        # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#containerstate-v1-core
        state = container_status.state

        if state.running is not None:
            if wait_for_state == WaitForPodState.Ready:
                # ready is boolean field of container status
                ready = container_status.ready
                if not ready:
                    logging.info('Waiting for pod "%s" to become ready...' % pod_name)
                    time.sleep(wait_time_between_attempts)
                    continue
                else:
                    logging.info('Pod "%s" is ready, done waiting' % pod_name)
                    break
            elif wait_for_state == WaitForPodState.Terminated:
                time.sleep(wait_time_between_attempts)
                continue
            else:
                raise DagsterK8sError('Unknown wait for state %s' % str(wait_for_state.value))
            break

        elif state.waiting is not None:
            # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#containerstatewaiting-v1-core
            if state.waiting.reason == 'PodInitializing':
                logging.info('Waiting for pod "%s" to initialize...' % pod_name)
                time.sleep(wait_time_between_attempts)
                continue
            elif state.waiting.reason == 'ContainerCreating':
                logging.info('Waiting for container creation...')
                time.sleep(wait_time_between_attempts)
                continue
            elif state.waiting.reason in [
                'ErrImagePull',
                'ImagePullBackOff',
                'CrashLoopBackOff',
                'RunContainerError',
            ]:
                raise DagsterK8sError('Failed: %s' % state.waiting.message)
            else:
                raise DagsterK8sError('Unknown issue: %s' % state.waiting)

        # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#containerstateterminated-v1-core
        elif state.terminated is not None:
            if not state.terminated.exit_code == 0:
                raw_logs = retrieve_pod_logs(pod_name, namespace)
                raise DagsterK8sError(
                    'Pod did not exit successfully. Failed with message: %s and pod logs: %s'
                    % (state.terminated.message, str(raw_logs))
                )
            break

        else:
            raise DagsterK8sError('Should not get here, unknown pod state')
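# A minimal usage sketch for the module-level wait_for_pod helper, assuming a
# kubeconfig has already been loaded for the kubernetes client. The pod name
# and namespace are illustrative placeholders.
import kubernetes


if __name__ == '__main__':
    kubernetes.config.load_kube_config()

    # Block until the pod reports a ready container, or raise DagsterK8sError
    # on timeout or image-pull failures.
    wait_for_pod('example-job-pod', namespace='default', wait_timeout=120)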
def __init__(self, interval_seconds):
    self.interval_seconds = check.numeric_param(interval_seconds, "interval_seconds")

    super().__init__()
def __init__(
    self,
    instance,
    daemons,
    gen_workspace,
    heartbeat_interval_seconds=DEFAULT_HEARTBEAT_INTERVAL_SECONDS,
    heartbeat_tolerance_seconds=DEFAULT_DAEMON_HEARTBEAT_TOLERANCE_SECONDS,
    error_interval_seconds=DEFAULT_DAEMON_ERROR_INTERVAL_SECONDS,
    handler="default",
):
    self._daemon_uuid = str(uuid.uuid4())

    self._daemons = {}
    self._daemon_threads = {}

    self._instance = check.inst_param(instance, "instance", DagsterInstance)
    self._daemons = {
        daemon.daemon_type(): daemon
        for daemon in check.list_param(daemons, "daemons", of_type=DagsterDaemon)
    }

    self._gen_workspace = check.callable_param(gen_workspace, "gen_workspace")

    self._heartbeat_interval_seconds = check.numeric_param(
        heartbeat_interval_seconds, "heartbeat_interval_seconds"
    )

    self._heartbeat_tolerance_seconds = check.numeric_param(
        heartbeat_tolerance_seconds, "heartbeat_tolerance_seconds"
    )

    if not self._daemons:
        raise Exception("No daemons configured on the DagsterInstance")

    self._daemon_shutdown_event = threading.Event()

    configure_loggers(handler=handler)
    self._logger = logging.getLogger("dagster.daemon")

    self._logger.info(
        "instance is configured with the following daemons: {}".format(
            _sorted_quoted(type(daemon).__name__ for daemon in self.daemons)
        )
    )

    self._last_healthy_heartbeat_times = {}

    for daemon_type, daemon in self._daemons.items():
        self._daemon_threads[daemon_type] = threading.Thread(
            target=daemon.run_daemon_loop,
            args=(
                self._instance.get_ref(),
                self._daemon_uuid,
                self._daemon_shutdown_event,
                gen_workspace,
                heartbeat_interval_seconds,
                error_interval_seconds,
            ),
            name="dagster-daemon-{daemon_type}".format(daemon_type=daemon_type),
            daemon=True,  # Individual daemons should not outlive controller process
        )
        self._last_healthy_heartbeat_times[daemon_type] = time.time()
        self._daemon_threads[daemon_type].start()

    self._start_time = pendulum.now("UTC")
def wait_for_pod(
    self,
    pod_name,
    namespace,
    wait_for_state=WaitForPodState.Ready,
    wait_timeout=DEFAULT_WAIT_TIMEOUT,
    wait_time_between_attempts=DEFAULT_WAIT_BETWEEN_ATTEMPTS,
):
    """Wait for a pod to launch and be running, or wait for termination (useful for job pods).

    Args:
        pod_name (str): Name of the pod to wait for.
        namespace (str): Namespace in which the pod is located.
        wait_for_state (WaitForPodState, optional): Whether to wait for pod readiness or
            termination. Defaults to waiting for readiness.
        wait_timeout (numeric, optional): Timeout after which to give up and raise exception.
            Defaults to DEFAULT_WAIT_TIMEOUT.
        wait_time_between_attempts (numeric, optional): Wait time between polling attempts.
            Defaults to DEFAULT_WAIT_BETWEEN_ATTEMPTS.

    Raises:
        DagsterK8sError: Raised when wait_timeout is exceeded or an error is encountered
    """
    check.str_param(pod_name, "pod_name")
    check.str_param(namespace, "namespace")
    check.inst_param(wait_for_state, "wait_for_state", WaitForPodState)
    check.numeric_param(wait_timeout, "wait_timeout")
    check.numeric_param(wait_time_between_attempts, "wait_time_between_attempts")

    self.logger('Waiting for pod "%s"' % pod_name)

    start = self.timer()

    while True:
        pods = self.core_api.list_namespaced_pod(
            namespace=namespace, field_selector="metadata.name=%s" % pod_name
        ).items
        pod = pods[0] if pods else None

        if self.timer() - start > wait_timeout:
            raise DagsterK8sError(
                "Timed out while waiting for pod to become ready with pod info: %s" % str(pod)
            )

        if pod is None:
            self.logger('Waiting for pod "%s" to launch...' % pod_name)
            self.sleeper(wait_time_between_attempts)
            continue

        if not pod.status.container_statuses:
            self.logger("Waiting for pod container status to be set by kubernetes...")
            self.sleeper(wait_time_between_attempts)
            continue

        # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#containerstatus-v1-core
        container_status = pod.status.container_statuses[0]

        # State checks below, see:
        # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#containerstate-v1-core
        state = container_status.state

        if state.running is not None:
            if wait_for_state == WaitForPodState.Ready:
                # ready is boolean field of container status
                ready = container_status.ready
                if not ready:
                    self.logger('Waiting for pod "%s" to become ready...' % pod_name)
                    self.sleeper(wait_time_between_attempts)
                    continue
                else:
                    self.logger('Pod "%s" is ready, done waiting' % pod_name)
                    break
            else:
                check.invariant(
                    wait_for_state == WaitForPodState.Terminated, "New invalid WaitForPodState"
                )
                self.sleeper(wait_time_between_attempts)
                continue

        elif state.waiting is not None:
            # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#containerstatewaiting-v1-core
            if state.waiting.reason == KubernetesWaitingReasons.PodInitializing:
                self.logger('Waiting for pod "%s" to initialize...' % pod_name)
                self.sleeper(wait_time_between_attempts)
                continue
            if state.waiting.reason == KubernetesWaitingReasons.CreateContainerConfigError:
                self.logger(
                    'Pod "%s" is waiting due to a CreateContainerConfigError with message "%s"'
                    ' - trying again to see if it recovers' % (pod_name, state.waiting.message)
                )
                self.sleeper(wait_time_between_attempts)
                continue
            elif state.waiting.reason == KubernetesWaitingReasons.ContainerCreating:
                self.logger("Waiting for container creation...")
                self.sleeper(wait_time_between_attempts)
                continue
            elif state.waiting.reason in [
                KubernetesWaitingReasons.ErrImagePull,
                KubernetesWaitingReasons.ImagePullBackOff,
                KubernetesWaitingReasons.CrashLoopBackOff,
                KubernetesWaitingReasons.RunContainerError,
            ]:
                raise DagsterK8sError(
                    'Failed: Reason="{reason}" Message="{message}"'.format(
                        reason=state.waiting.reason, message=state.waiting.message
                    )
                )
            else:
                raise DagsterK8sError("Unknown issue: %s" % state.waiting)

        # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#containerstateterminated-v1-core
        elif state.terminated is not None:
            if not state.terminated.exit_code == 0:
                raw_logs = self.retrieve_pod_logs(pod_name, namespace)
                message = state.terminated.message
                raise DagsterK8sError(
                    f'Pod did not exit successfully. Failed with message: "{message}" '
                    f'and pod logs: "{raw_logs}"'
                )
            else:
                self.logger("Pod {pod_name} exited successfully".format(pod_name=pod_name))
            break

        else:
            raise DagsterK8sError("Should not get here, unknown pod state")
def wait_for_job_success(
    self,
    job_name,
    namespace,
    instance=None,
    run_id=None,
    wait_timeout=DEFAULT_WAIT_TIMEOUT,
    wait_time_between_attempts=DEFAULT_WAIT_BETWEEN_ATTEMPTS,
    num_pods_to_wait_for=DEFAULT_JOB_POD_COUNT,
):
    """Poll a job for successful completion.

    Args:
        job_name (str): Name of the job to wait for.
        namespace (str): Namespace in which the job is located.
        wait_timeout (numeric, optional): Timeout after which to give up and raise exception.
            Defaults to DEFAULT_WAIT_TIMEOUT.
        wait_time_between_attempts (numeric, optional): Wait time between polling attempts.
            Defaults to DEFAULT_WAIT_BETWEEN_ATTEMPTS.

    Raises:
        DagsterK8sError: Raised when wait_timeout is exceeded or an error is encountered.
    """
    check.str_param(job_name, "job_name")
    check.str_param(namespace, "namespace")
    check.opt_inst_param(instance, "instance", DagsterInstance)
    check.opt_str_param(run_id, "run_id")
    check.numeric_param(wait_timeout, "wait_timeout")
    check.numeric_param(wait_time_between_attempts, "wait_time_between_attempts")
    check.int_param(num_pods_to_wait_for, "num_pods_to_wait_for")

    start = self.timer()

    # Wait for job to be running
    self.wait_for_job(
        job_name,
        namespace,
        wait_timeout=wait_timeout,
        wait_time_between_attempts=wait_time_between_attempts,
        start_time=start,
    )

    # Wait for the job status to be completed. We check the status every
    # wait_time_between_attempts seconds
    while True:
        if self.timer() - start > wait_timeout:
            raise DagsterK8sTimeoutError(
                "Timed out while waiting for job {job_name} to complete".format(job_name=job_name)
            )

        # Reads the status of the specified job. Returns a V1Job object that
        # we need to read the status off of.
        status = None

        def _get_job_status():
            job = self.batch_api.read_namespaced_job_status(job_name, namespace=namespace)
            return job.status

        status = k8s_api_retry(_get_job_status, max_retries=3, timeout=wait_time_between_attempts)

        # status.succeeded represents the number of pods which reached phase Succeeded.
        if status.succeeded == num_pods_to_wait_for:
            break

        # status.failed represents the number of pods which reached phase Failed.
        if status.failed and status.failed > 0:
            raise DagsterK8sError(
                "Encountered failed job pods for job {job_name} with status: {status}, "
                "in namespace {namespace}".format(
                    job_name=job_name, status=status, namespace=namespace
                )
            )

        if instance and run_id:
            pipeline_run = instance.get_run_by_id(run_id)
            if not pipeline_run:
                raise DagsterK8sPipelineStatusException()

            pipeline_run_status = pipeline_run.status
            if pipeline_run_status != PipelineRunStatus.STARTED:
                raise DagsterK8sPipelineStatusException()

        self.sleeper(wait_time_between_attempts)
def wait_for_job_success(
    self,
    job_name,
    namespace,
    instance=None,
    run_id=None,
    wait_timeout=DEFAULT_WAIT_TIMEOUT,
    wait_time_between_attempts=DEFAULT_WAIT_BETWEEN_ATTEMPTS,
    num_pods_to_wait_for=DEFAULT_JOB_POD_COUNT,
):
    '''Poll a job for successful completion.

    Args:
        job_name (str): Name of the job to wait for.
        namespace (str): Namespace in which the job is located.
        wait_timeout (numeric, optional): Timeout after which to give up and raise exception.
            Defaults to DEFAULT_WAIT_TIMEOUT.
        wait_time_between_attempts (numeric, optional): Wait time between polling attempts.
            Defaults to DEFAULT_WAIT_BETWEEN_ATTEMPTS.

    Raises:
        DagsterK8sError: Raised when wait_timeout is exceeded or an error is encountered.
    '''
    check.str_param(job_name, 'job_name')
    check.str_param(namespace, 'namespace')
    check.opt_inst_param(instance, 'instance', DagsterInstance)
    check.opt_str_param(run_id, 'run_id')
    check.numeric_param(wait_timeout, 'wait_timeout')
    check.numeric_param(wait_time_between_attempts, 'wait_time_between_attempts')
    check.int_param(num_pods_to_wait_for, 'num_pods_to_wait_for')

    job = None
    start = self.timer()

    # Ensure we found the job that we launched
    while not job:
        if self.timer() - start > wait_timeout:
            raise DagsterK8sError('Timed out while waiting for job to launch')

        jobs = self.batch_api.list_namespaced_job(namespace=namespace)
        job = next((j for j in jobs.items if j.metadata.name == job_name), None)

        if not job:
            self.logger('Job "{job_name}" not yet launched, waiting'.format(job_name=job_name))
            self.sleeper(wait_time_between_attempts)

    # Wait for job completed status
    while True:
        if self.timer() - start > wait_timeout:
            raise DagsterK8sError('Timed out while waiting for job to complete')

        # See: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.11/#jobstatus-v1-batch
        status = self.batch_api.read_namespaced_job_status(job_name, namespace=namespace).status

        if status.failed and status.failed > 0:
            pods = self.core_api.list_namespaced_pod(
                label_selector='job-name=={}'.format(job_name), namespace=namespace
            )
            logs = {}
            for pod in pods.items:
                pod_name = pod.metadata.name
                try:
                    logs[pod_name] = self.core_api.read_namespaced_pod_log(
                        name=pod_name, namespace=namespace
                    )
                except kubernetes.client.rest.ApiException as e:
                    logs[pod_name] = e

            raise DagsterK8sError(
                'Encountered failed job pods with status: {}, and logs: {}'.format(status, logs)
            )

        # done waiting for pod completion
        if status.succeeded == num_pods_to_wait_for:
            break

        if instance and run_id:
            pipeline_run_status = instance.get_run_by_id(run_id).status
            if pipeline_run_status != PipelineRunStatus.STARTED:
                raise DagsterK8sPipelineStatusException()

        self.sleeper(wait_time_between_attempts)
def wait_for_job_success(
    self,
    job_name,
    namespace,
    instance=None,
    run_id=None,
    wait_timeout=DEFAULT_WAIT_TIMEOUT,
    wait_time_between_attempts=DEFAULT_WAIT_BETWEEN_ATTEMPTS,
    num_pods_to_wait_for=DEFAULT_JOB_POD_COUNT,
):
    """Poll a job for successful completion.

    Args:
        job_name (str): Name of the job to wait for.
        namespace (str): Namespace in which the job is located.
        wait_timeout (numeric, optional): Timeout after which to give up and raise exception.
            Defaults to DEFAULT_WAIT_TIMEOUT.
        wait_time_between_attempts (numeric, optional): Wait time between polling attempts.
            Defaults to DEFAULT_WAIT_BETWEEN_ATTEMPTS.

    Raises:
        DagsterK8sError: Raised when wait_timeout is exceeded or an error is encountered.
    """
    check.str_param(job_name, "job_name")
    check.str_param(namespace, "namespace")
    check.opt_inst_param(instance, "instance", DagsterInstance)
    check.opt_str_param(run_id, "run_id")
    check.numeric_param(wait_timeout, "wait_timeout")
    check.numeric_param(wait_time_between_attempts, "wait_time_between_attempts")
    check.int_param(num_pods_to_wait_for, "num_pods_to_wait_for")

    job = None
    start = self.timer()

    # Wait for job to launch
    while not job:
        if self.timer() - start > wait_timeout:
            raise DagsterK8sTimeoutError(
                "Timed out while waiting for job {job_name} to launch".format(job_name=job_name)
            )

        # Get all jobs in the namespace and find the matching job
        def _get_jobs_for_namespace():
            jobs = self.batch_api.list_namespaced_job(
                namespace=namespace, field_selector="metadata.name={}".format(job_name)
            )
            if jobs.items:
                check.invariant(
                    len(jobs.items) == 1,
                    'There should only be one k8s job with name "{}", but got multiple jobs: {}'.format(
                        job_name, jobs.items
                    ),
                )
                return jobs.items[0]
            else:
                return None

        job = k8s_api_retry(_get_jobs_for_namespace, max_retries=3)

        if not job:
            self.logger('Job "{job_name}" not yet launched, waiting'.format(job_name=job_name))
            self.sleeper(wait_time_between_attempts)

    # Wait for the job status to be completed. We check the status every
    # wait_time_between_attempts seconds
    while True:
        if self.timer() - start > wait_timeout:
            raise DagsterK8sTimeoutError(
                "Timed out while waiting for job {job_name} to complete".format(job_name=job_name)
            )

        # Reads the status of the specified job. Returns a V1Job object that
        # we need to read the status off of.
        status = None

        def _get_job_status():
            job = self.batch_api.read_namespaced_job_status(job_name, namespace=namespace)
            return job.status

        status = k8s_api_retry(_get_job_status, max_retries=3)

        # status.succeeded represents the number of pods which reached phase Succeeded.
        if status.succeeded == num_pods_to_wait_for:
            break

        # status.failed represents the number of pods which reached phase Failed.
        if status.failed and status.failed > 0:
            raise DagsterK8sError(
                "Encountered failed job pods for job {job_name} with status: {status}".format(
                    job_name=job_name, status=status
                )
            )

        if instance and run_id:
            pipeline_run = instance.get_run_by_id(run_id)
            if not pipeline_run:
                raise DagsterK8sPipelineStatusException()

            pipeline_run_status = pipeline_run.status
            if pipeline_run_status != PipelineRunStatus.STARTED:
                raise DagsterK8sPipelineStatusException()

        self.sleeper(wait_time_between_attempts)
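# A minimal usage sketch for wait_for_job_success, assuming these methods live
# on a Kubernetes client wrapper such as dagster-k8s' DagsterKubernetesClient
# (the job name and namespace below are illustrative placeholders).
from dagster_k8s.client import DagsterKubernetesClient

client = DagsterKubernetesClient.production_client()

# Polls until the launched job reports num_pods_to_wait_for succeeded pods,
# raising DagsterK8sError / DagsterK8sTimeoutError on failure or timeout.
client.wait_for_job_success(
    job_name="dagster-run-1234",
    namespace="dagster",
    wait_timeout=300,
    wait_time_between_attempts=10,
)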