def test_add_heartbeat(self, storage):
    """Adding a heartbeat stores it; adding another for the same daemon type replaces it."""
    self._skip_in_memory(storage)

    # Insert: the first heartbeat for the sensor daemon yields exactly one entry.
    first_heartbeat = DaemonHeartbeat(
        timestamp=pendulum.from_timestamp(1000).float_timestamp,
        daemon_type=SensorDaemon.daemon_type(),
        daemon_id=None,
        errors=[],
    )
    storage.add_daemon_heartbeat(first_heartbeat)
    assert len(storage.get_daemon_heartbeats()) == 1
    heartbeats_by_type = storage.get_daemon_heartbeats()
    assert heartbeats_by_type[SensorDaemon.daemon_type()] == first_heartbeat

    # Update: a later heartbeat for the same daemon type overwrites the entry
    # rather than adding a second one.
    later_heartbeat = DaemonHeartbeat(
        timestamp=pendulum.from_timestamp(2000).float_timestamp,
        daemon_type=SensorDaemon.daemon_type(),
        daemon_id=None,
        errors=[],
    )
    storage.add_daemon_heartbeat(later_heartbeat)
    assert len(storage.get_daemon_heartbeats()) == 1
    heartbeats_by_type = storage.get_daemon_heartbeats()
    assert heartbeats_by_type[SensorDaemon.daemon_type()] == later_heartbeat
def test_add_heartbeat(self, storage):
    """Adding a heartbeat stores it; adding another for the same daemon type replaces it."""
    import pendulum

    self._skip_in_memory(storage)

    # Insert: the first "foobar" heartbeat yields exactly one stored entry.
    first_heartbeat = DaemonHeartbeat(
        timestamp=pendulum.from_timestamp(1000),
        daemon_type="foobar",
        daemon_id=None,
        info=None,
    )
    storage.add_daemon_heartbeat(first_heartbeat)
    assert len(storage.get_daemon_heartbeats()) == 1
    heartbeats_by_type = storage.get_daemon_heartbeats()
    assert heartbeats_by_type["foobar"] == first_heartbeat

    # Update: a second "foobar" heartbeat overwrites the entry instead of adding one.
    later_heartbeat = DaemonHeartbeat(
        timestamp=pendulum.from_timestamp(2000),
        daemon_type="foobar",
        daemon_id=None,
        info=None,
    )
    storage.add_daemon_heartbeat(later_heartbeat)
    assert len(storage.get_daemon_heartbeats()) == 1
    heartbeats_by_type = storage.get_daemon_heartbeats()
    assert heartbeats_by_type["foobar"] == later_heartbeat
def _add_heartbeat(self, daemon):
    """
    Add a heartbeat for the given daemon.

    The heartbeat carries the current UTC time and the daemon's class name;
    its remaining two positional fields are passed as None.
    """
    heartbeat = DaemonHeartbeat(pendulum.now("UTC"), type(daemon).__name__, None, None)
    self._instance.add_daemon_heartbeat(heartbeat)
def _check_add_heartbeat(self, instance, curr_time, daemon_uuid):
    """
    Write a heartbeat for this daemon unless one was written too recently.

    Nothing is written when the previous heartbeat from this object is less
    than DAEMON_HEARTBEAT_INTERVAL_SECONDS older than ``curr_time``. Otherwise
    the stored heartbeat for this daemon type is inspected, a warning is
    logged if it carries a different daemon id, and a new heartbeat holding
    ``self._last_iteration_exceptions`` is added to ``instance``.
    """
    previous_time = self._last_heartbeat_time
    if previous_time:
        elapsed_seconds = (curr_time - previous_time).total_seconds()
        if elapsed_seconds < DAEMON_HEARTBEAT_INTERVAL_SECONDS:
            return

    daemon_type = self.daemon_type()
    stored_heartbeat = instance.get_daemon_heartbeats().get(daemon_type)

    # A mismatched id is only reported once this object has already sent a
    # heartbeat of its own (i.e. this is not the first heartbeat).
    if previous_time and stored_heartbeat and stored_heartbeat.daemon_id != daemon_uuid:
        message = (
            "Taking over from another {} daemon process. If this "
            "message reoccurs, you may have multiple daemons running which is not supported. "
            "Last heartbeat daemon id: {}, "
            "Current daemon_id: {}"
        ).format(daemon_type, stored_heartbeat.daemon_id, daemon_uuid)
        self._logger.warning(message)

    self._last_heartbeat_time = curr_time

    heartbeat = DaemonHeartbeat(
        curr_time.float_timestamp,
        daemon_type,
        daemon_uuid,
        errors=self._last_iteration_exceptions,
    )
    instance.add_daemon_heartbeat(heartbeat)
def test_get_individual_daemons(self, graphql_context):
    """Query per-daemon health after adding a single sensor heartbeat and check the payload."""
    instance = graphql_context.instance
    if instance.is_ephemeral:
        pytest.skip("The daemon isn't compatible with an in-memory instance")

    sensor_heartbeat = DaemonHeartbeat(
        timestamp=100.0,
        daemon_type=DaemonType.SENSOR,
        daemon_id=None,
        error=None,
    )
    graphql_context.instance.add_daemon_heartbeat(sensor_heartbeat)

    results = execute_dagster_graphql(graphql_context, INDIVIDUAL_DAEMON_QUERY)

    # Only the sensor daemon has a heartbeat; the other two report no
    # heartbeat time and an unknown (None) health.
    expected_daemon_health = {
        "sensor": {
            "daemonType": "SENSOR",
            "required": True,
            "healthy": False,
            "lastHeartbeatTime": 100.0,
        },
        "run_coordinator": {
            "daemonType": "QUEUED_RUN_COORDINATOR",
            "required": False,
            "healthy": None,
            "lastHeartbeatTime": None,
        },
        "scheduler": {
            "daemonType": "SCHEDULER",
            "required": False,
            "healthy": None,
            "lastHeartbeatTime": None,
        },
    }
    assert results.data == {"instance": {"daemonHealth": expected_daemon_health}}
def _check_add_heartbeat(self, daemon_type, curr_time):
    """
    Add a heartbeat for the given daemon type, at most once per interval.

    Nothing is written when a heartbeat for ``daemon_type`` was already
    recorded by this object less than DAEMON_HEARTBEAT_INTERVAL_SECONDS before
    ``curr_time``. Otherwise the stored heartbeat is inspected, a warning is
    logged if it carries a different daemon id than ``self._daemon_uuid``, and
    a new heartbeat holding ``self._last_iteration_exceptions[daemon_type]``
    is added to the instance.
    """
    # Guard clause: skip while the previous heartbeat is still fresh.
    if (
        daemon_type in self._last_heartbeat_times
        and (curr_time - self._last_heartbeat_times[daemon_type]).total_seconds()
        < DAEMON_HEARTBEAT_INTERVAL_SECONDS
    ):
        return

    last_stored_heartbeat = self._instance.get_daemon_heartbeats().get(daemon_type)

    if (
        daemon_type in self._last_heartbeat_times  # not the first heartbeat
        and last_stored_heartbeat
        and last_stored_heartbeat.daemon_id != self._daemon_uuid
    ):
        self._logger.warning(
            "Taking over from another {} daemon process. If this "
            "message reoccurs, you may have multiple daemons running which is not supported. "
            "Last heartbeat daemon id: {}, "
            "Current daemon_id: {}".format(
                daemon_type.value,
                last_stored_heartbeat.daemon_id,
                self._daemon_uuid,
            )
        )

    self._last_heartbeat_times[daemon_type] = curr_time

    # NOTE(review): the stored timestamp comes from a fresh pendulum.now("UTC")
    # rather than curr_time, so it can differ slightly from the throttle time
    # recorded above -- confirm this is intended.
    self._instance.add_daemon_heartbeat(
        DaemonHeartbeat(
            pendulum.now("UTC").float_timestamp,
            daemon_type,
            self._daemon_uuid,
            self._last_iteration_exceptions[daemon_type],
        )
    )
def debug_daemon_heartbeats(instance):
    """Write a sensor-daemon heartbeat to ``instance``, read it back, and print both timestamps."""
    daemon = SensorDaemon()
    timestamp = pendulum.now("UTC").float_timestamp

    heartbeat = DaemonHeartbeat(timestamp, daemon.daemon_type(), None, None)
    instance.add_daemon_heartbeat(heartbeat)

    stored_heartbeats = instance.get_daemon_heartbeats()
    returned_timestamp = stored_heartbeats[daemon.daemon_type()].timestamp

    report = f"Written timestamp: {timestamp}\nRead timestamp: {returned_timestamp}"
    print(report)  # pylint: disable=print-call
def debug_daemon_heartbeats(instance):
    """Write a sensor-daemon heartbeat to ``instance``, read it back, and print both timestamps.

    The printed output lets the written and the stored timestamp be compared
    by eye.
    """
    daemon = SensorDaemon(
        instance,
        interval_seconds=DEFAULT_DAEMON_INTERVAL_SECONDS,
    )
    timestamp = pendulum.now("UTC").float_timestamp
    instance.add_daemon_heartbeat(DaemonHeartbeat(timestamp, daemon.daemon_type(), None, None))
    returned_timestamp = instance.get_daemon_heartbeats()[daemon.daemon_type()].timestamp
    # Fixed the "timetstamp" typo in the label of the first line.
    print(  # pylint: disable=print-call
        f"Written timestamp: {timestamp}\nRead timestamp: {returned_timestamp}"
    )
def _check_add_heartbeat(
    self, instance, daemon_uuid, heartbeat_interval_seconds, error_interval_seconds
):
    """
    Prune old errors, then write a heartbeat (and telemetry) if one is due.

    Steps, in order:
      1. Drop entries from the tail of ``self._errors`` whose timestamp is
         more than ``error_interval_seconds`` in the past.
      2. Return without writing if this object's last heartbeat is less than
         ``heartbeat_interval_seconds`` old.
      3. Log an error if the stored heartbeat for this daemon type carries a
         different daemon id than ``daemon_uuid``.
      4. Add a new heartbeat containing the remaining errors.
      5. Emit a DAEMON_ALIVE telemetry action if none was emitted within
         TELEMETRY_LOGGING_INTERVAL seconds.
    """
    error_max_time = pendulum.now("UTC").subtract(seconds=error_interval_seconds)

    # The loop stops at the first tail entry that is recent enough, so it
    # relies on the last element being the earliest error.
    while self._errors:
        _earliest_error, earliest_timestamp = self._errors[-1]
        if earliest_timestamp >= error_max_time:
            break
        self._errors.pop()

    curr_time = pendulum.now("UTC")

    if (
        self._last_heartbeat_time
        and (curr_time - self._last_heartbeat_time).total_seconds() < heartbeat_interval_seconds
    ):
        return

    daemon_type = self.daemon_type()

    last_stored_heartbeat = instance.get_daemon_heartbeats().get(daemon_type)
    # Only report a conflicting daemon id once this object has already sent
    # a heartbeat of its own.
    if (
        self._last_heartbeat_time
        and last_stored_heartbeat
        and last_stored_heartbeat.daemon_id != daemon_uuid
    ):
        self._logger.error(
            "Another {} daemon is still sending heartbeats. You likely have multiple "
            "daemon processes running at once, which is not supported. "
            "Last heartbeat daemon id: {}, "
            "Current daemon_id: {}".format(
                daemon_type,
                last_stored_heartbeat.daemon_id,
                daemon_uuid,
            )
        )

    self._last_heartbeat_time = curr_time

    instance.add_daemon_heartbeat(
        DaemonHeartbeat(
            curr_time.float_timestamp,
            daemon_type,
            daemon_uuid,
            errors=[error for (error, _) in self._errors],
        )
    )

    if (
        not self._last_log_time
        or (curr_time - self._last_log_time).total_seconds() >= TELEMETRY_LOGGING_INTERVAL
    ):
        log_action(
            instance,
            DAEMON_ALIVE,
            metadata={"DAEMON_SESSION_ID": get_telemetry_daemon_session_id()},
        )
        self._last_log_time = curr_time
def _add_heartbeat(self, daemon):
    """
    Add a heartbeat for the given daemon.

    The heartbeat carries the current UTC time as a float timestamp, the
    daemon's type, None as its third field, and the daemon's
    ``last_iteration_exception``.
    """
    heartbeat = DaemonHeartbeat(
        pendulum.now("UTC").float_timestamp,
        daemon.daemon_type(),
        None,
        daemon.last_iteration_exception,
    )
    self._instance.add_daemon_heartbeat(heartbeat)
def test_wipe_heartbeats(self, storage):
    """Add one sensor-daemon heartbeat and then wipe all heartbeats from storage."""
    self._skip_in_memory(storage)

    heartbeat = DaemonHeartbeat(
        timestamp=pendulum.from_timestamp(1000).float_timestamp,
        daemon_type=SensorDaemon.daemon_type(),
        daemon_id=None,
        errors=[],
    )
    storage.add_daemon_heartbeat(heartbeat)

    # This test only checks that wiping runs without raising.
    storage.wipe_daemon_heartbeats()
def get_daemon_heartbeats(self):
    """Return a dict mapping each stored row's daemon_type to a DaemonHeartbeat built from it."""
    with self.connect() as conn:
        rows = conn.execute(db.select(DaemonHeartbeatsTable.columns))

        # Rows are consumed while the connection is still open. If several
        # rows share a daemon_type, the last one iterated wins.
        heartbeats = {}
        for row in rows:
            heartbeats[row.daemon_type] = DaemonHeartbeat(
                timestamp=row.timestamp,
                daemon_type=row.daemon_type,
                daemon_id=row.daemon_id,
                info=row.info,
            )
        return heartbeats
def test_wipe_heartbeats(self, storage):
    """Wiping heartbeats leaves the storage with no heartbeats at all."""
    self._skip_in_memory(storage)

    heartbeat = DaemonHeartbeat(
        timestamp=pendulum.from_timestamp(1000),
        daemon_type=DaemonType.SENSOR,
        daemon_id=None,
        info=None,
    )
    storage.add_daemon_heartbeat(heartbeat)

    storage.wipe_daemon_heartbeats()

    remaining_heartbeats = storage.get_daemon_heartbeats()
    assert remaining_heartbeats == {}
def test_wipe_heartbeats(self, storage):
    """Add one sensor-daemon heartbeat and then wipe all heartbeats from storage.

    Skipped when ``self.can_delete_runs()`` reports that the storage cannot
    delete.
    """
    self._skip_in_memory(storage)

    can_delete = self.can_delete_runs()
    if not can_delete:
        pytest.skip("storage cannot delete")

    heartbeat = DaemonHeartbeat(
        timestamp=pendulum.from_timestamp(1000).float_timestamp,
        daemon_type=SensorDaemon.daemon_type(),
        daemon_id=None,
        errors=[],
    )
    storage.add_daemon_heartbeat(heartbeat)

    # This test only checks that wiping runs without raising.
    storage.wipe_daemon_heartbeats()
def _check_add_heartbeat(
    self, instance, daemon_uuid, heartbeat_interval_seconds, error_interval_seconds
):
    """
    Prune old errors, then write a heartbeat (and telemetry) if one is due.

    Steps, in order:
      1. Drop entries from the tail of ``self._errors`` whose timestamp is
         more than ``error_interval_seconds`` in the past.
      2. Return without writing if this object's last heartbeat is less than
         ``heartbeat_interval_seconds`` old.
      3. Log a warning if the stored heartbeat for this daemon type carries a
         different daemon id than ``daemon_uuid``.
      4. Add a new heartbeat containing the remaining errors.
      5. Emit a DAEMON_ALIVE telemetry action if none was emitted within
         TELEMETRY_LOGGING_INTERVAL seconds.
    """
    error_max_time = pendulum.now("UTC").subtract(seconds=error_interval_seconds)

    # The loop stops at the first tail entry that is recent enough, so it
    # relies on the last element being the earliest error.
    while self._errors:
        _earliest_error, earliest_timestamp = self._errors[-1]
        if earliest_timestamp >= error_max_time:
            break
        self._errors.pop()

    curr_time = pendulum.now("UTC")

    if (
        self._last_heartbeat_time
        and (curr_time - self._last_heartbeat_time).total_seconds() < heartbeat_interval_seconds
    ):
        return

    daemon_type = self.daemon_type()

    last_stored_heartbeat = instance.get_daemon_heartbeats().get(daemon_type)
    # Only report a conflicting daemon id once this object has already sent
    # a heartbeat of its own.
    if (
        self._last_heartbeat_time
        and last_stored_heartbeat
        and last_stored_heartbeat.daemon_id != daemon_uuid
    ):
        self._logger.warning(
            "Taking over from another {} daemon process. If this "
            "message reoccurs, you may have multiple daemons running which is not supported. "
            "Last heartbeat daemon id: {}, "
            "Current daemon_id: {}".format(
                daemon_type,
                last_stored_heartbeat.daemon_id,
                daemon_uuid,
            )
        )

    self._last_heartbeat_time = curr_time

    instance.add_daemon_heartbeat(
        DaemonHeartbeat(
            curr_time.float_timestamp,
            daemon_type,
            daemon_uuid,
            errors=[error for (error, _) in self._errors],
        )
    )

    if (
        not self._last_log_time
        or (curr_time - self._last_log_time).total_seconds() >= TELEMETRY_LOGGING_INTERVAL
    ):
        log_action(instance, DAEMON_ALIVE)
        self._last_log_time = curr_time
def _check_add_heartbeat(self, instance, daemon_uuid):
    """
    Write a heartbeat for this daemon if one is due.

    A heartbeat is normally skipped while the previous one is less than
    DAEMON_HEARTBEAT_INTERVAL_SECONDS old, but the first heartbeat after an
    iteration reported exceptions is always written.
    """
    # Always log a heartbeat after the first time an iteration returns an error to make sure we
    # don't incorrectly say the daemon is healthy
    first_time_logging_error = self._last_iteration_exceptions and not self._first_error_logged

    curr_time = pendulum.now("UTC")

    if first_time_logging_error:
        # Bypass the throttle once, and remember that it has been used.
        self._first_error_logged = True
    elif (
        self._last_heartbeat_time
        and (curr_time - self._last_heartbeat_time).total_seconds()
        < DAEMON_HEARTBEAT_INTERVAL_SECONDS
    ):
        return

    daemon_type = self.daemon_type()
    stored_heartbeat = instance.get_daemon_heartbeats().get(daemon_type)

    sent_before = self._last_heartbeat_time  # falsy on the first heartbeat
    if sent_before and stored_heartbeat and stored_heartbeat.daemon_id != daemon_uuid:
        message = (
            "Taking over from another {} daemon process. If this "
            "message reoccurs, you may have multiple daemons running which is not supported. "
            "Last heartbeat daemon id: {}, "
            "Current daemon_id: {}"
        ).format(daemon_type, stored_heartbeat.daemon_id, daemon_uuid)
        self._logger.warning(message)

    self._last_heartbeat_time = curr_time

    heartbeat = DaemonHeartbeat(
        curr_time.float_timestamp,
        daemon_type,
        daemon_uuid,
        errors=self._last_iteration_exceptions,
    )
    instance.add_daemon_heartbeat(heartbeat)
def _check_add_heartbeat(
    self, instance, daemon_uuid, heartbeat_interval_seconds, error_interval_seconds
):
    """
    Trim the recorded errors, then write a heartbeat if one is due.

    ``self._errors`` is first cut to its leading DAEMON_HEARTBEAT_ERROR_LIMIT
    entries and then filtered to entries whose timestamp is at most
    ``error_interval_seconds`` old. No heartbeat is written while the previous
    one is less than ``heartbeat_interval_seconds`` old.
    """
    error_max_time = pendulum.now("UTC").subtract(seconds=error_interval_seconds)

    capped_errors = self._errors[:DAEMON_HEARTBEAT_ERROR_LIMIT]
    self._errors = [
        (error, timestamp) for (error, timestamp) in capped_errors if timestamp >= error_max_time
    ]

    curr_time = pendulum.now("UTC")

    previous_time = self._last_heartbeat_time
    if previous_time:
        elapsed_seconds = (curr_time - previous_time).total_seconds()
        if elapsed_seconds < heartbeat_interval_seconds:
            return

    daemon_type = self.daemon_type()
    stored_heartbeat = instance.get_daemon_heartbeats().get(daemon_type)

    # A mismatched id is only reported once this object has already sent a
    # heartbeat of its own.
    if previous_time and stored_heartbeat and stored_heartbeat.daemon_id != daemon_uuid:
        message = (
            "Taking over from another {} daemon process. If this "
            "message reoccurs, you may have multiple daemons running which is not supported. "
            "Last heartbeat daemon id: {}, "
            "Current daemon_id: {}"
        ).format(daemon_type, stored_heartbeat.daemon_id, daemon_uuid)
        self._logger.warning(message)

    self._last_heartbeat_time = curr_time

    heartbeat = DaemonHeartbeat(
        curr_time.float_timestamp,
        daemon_type,
        daemon_uuid,
        errors=[error for (error, _timestamp) in self._errors],
    )
    instance.add_daemon_heartbeat(heartbeat)
def test_get_daemon_error(self, graphql_context):
    """A heartbeat carrying an error surfaces that error's message in the daemon health query."""
    instance = graphql_context.instance
    if instance.is_ephemeral:
        pytest.skip("The daemon isn't compatible with an in-memory instance")

    heartbeat_error = SerializableErrorInfo(message="foobar", stack=[], cls_name=None, cause=None)
    failing_heartbeat = DaemonHeartbeat(
        timestamp=100.0,
        daemon_type=DaemonType.SENSOR,
        daemon_id=None,
        error=heartbeat_error,
    )
    graphql_context.instance.add_daemon_heartbeat(failing_heartbeat)

    results = execute_dagster_graphql(graphql_context, DAEMON_HEALTH_QUERY)

    sensor_health = results.data["instance"]["daemonHealth"]["sensor"]
    assert sensor_health == {"lastHeartbeatError": {"message": "foobar"}}
def _check_add_heartbeat(self, daemon, curr_time):
    """
    Add a heartbeat for the given daemon, at most once per interval.

    Nothing is written when a heartbeat for the daemon's type was already
    recorded by this object less than DAEMON_HEARTBEAT_INTERVAL_SECONDS before
    ``curr_time``. Otherwise ``curr_time`` is remembered for that type and a
    heartbeat carrying the daemon's ``last_iteration_exception`` is added to
    the instance.
    """
    daemon_type = daemon.daemon_type()

    # Guard clause: skip while the previous heartbeat is still fresh.
    if (
        daemon_type in self._last_heartbeat_times
        and (curr_time - self._last_heartbeat_times[daemon_type]).total_seconds()
        < DAEMON_HEARTBEAT_INTERVAL_SECONDS
    ):
        return

    self._last_heartbeat_times[daemon_type] = curr_time

    # NOTE(review): the stored timestamp comes from a fresh pendulum.now("UTC")
    # rather than curr_time -- confirm this is intended.
    self._instance.add_daemon_heartbeat(
        DaemonHeartbeat(
            pendulum.now("UTC").float_timestamp,
            daemon.daemon_type(),
            None,
            daemon.last_iteration_exception,
        )
    )