def test_healthy():
    """Check daemon health and liveness around a single controller iteration.

    With a QueuedRunCoordinator instance, the daemons must be neither healthy
    nor live before the controller has run, both healthy and live when
    evaluated at the iteration time, and neither when evaluated 100 seconds
    after it.
    """
    queued_coordinator_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }
    with instance_for_test(overrides=queued_coordinator_overrides) as instance:
        init_time = pendulum.now("UTC")
        start_ts = init_time.float_timestamp
        stale_ts = start_ts + 100

        controller = DagsterDaemonController(instance)

        # Nothing has run yet, so both checks must fail.
        assert not all_daemons_healthy(instance, curr_time_seconds=start_ts)
        assert not all_daemons_live(instance, curr_time_seconds=start_ts)

        controller.run_iteration(init_time)

        # Evaluated at the iteration time, both checks pass.
        assert all_daemons_healthy(instance, curr_time_seconds=start_ts)
        assert all_daemons_live(instance, curr_time_seconds=start_ts)

        # Evaluated 100 seconds later, both checks fail again.
        assert not all_daemons_healthy(instance, curr_time_seconds=stale_ts)
        assert not all_daemons_live(instance, curr_time_seconds=stale_ts)
def test_heartbeat():
    """Check the health status before, during, and well after a daemon run.

    The instance must report unhealthy before the daemon starts, healthy five
    seconds after it has started, and unhealthy again when the clock is frozen
    past the heartbeat interval plus tolerance (with 5 seconds of margin).
    """
    with instance_for_test() as instance:
        assert all_daemons_healthy(instance) is False

        with start_daemon():
            # Give the daemon time to start up before checking its health.
            time.sleep(5)
            assert all_daemons_healthy(instance) is True

        stale_after_seconds = (
            DEFAULT_HEARTBEAT_INTERVAL_SECONDS
            + DEFAULT_DAEMON_HEARTBEAT_TOLERANCE_SECONDS
            + 5
        )
        frozen_datetime = pendulum.now().add(seconds=stale_after_seconds)
        with pendulum.test(frozen_datetime):
            assert all_daemons_healthy(instance) is False
def test_heartbeat(tmpdir):
    """Check the health status before, during, and well after a daemon run.

    Uses an instance set up under ``tmpdir``. The instance must report
    unhealthy before the daemon starts, healthy five seconds after it has
    started, and unhealthy again when the clock is frozen past the heartbeat
    interval plus tolerance (with 5 seconds of margin).
    """
    with setup_instance(tmpdir.strpath, "") as instance:
        assert all_daemons_healthy(instance) is False

        with start_daemon():
            # Give the daemon time to start up before checking its health.
            time.sleep(5)
            assert all_daemons_healthy(instance) is True

        stale_after_seconds = (
            DEFAULT_HEARTBEAT_INTERVAL_SECONDS
            + DEFAULT_DAEMON_HEARTBEAT_TOLERANCE_SECONDS
            + 5
        )
        frozen_datetime = pendulum.now().add(seconds=stale_after_seconds)
        with pendulum.test(frozen_datetime):
            assert all_daemons_healthy(instance) is False
def health_check_command():
    """Report whether all daemons of the current instance are healthy.

    Echoes "Daemon healthy" on success. Otherwise echoes "Daemon not healthy"
    and exits the process with status 1.
    """
    with DagsterInstance.get() as instance:
        if not all_daemons_healthy(instance):
            click.echo("Daemon not healthy")
            sys.exit(1)

        click.echo("Daemon healthy")
def liveness_check_command():
    """Report whether all daemons of the current instance are running.

    Echoes "Daemon live" on success. Otherwise echoes "Daemon(s) not running"
    and exits the process with status 1.

    This checks liveness with ``all_daemons_live`` rather than health: a
    daemon can be running while its heartbeat reports errors, and such a
    daemon must not be reported as "not running".
    """
    with DagsterInstance.get() as instance:
        if not all_daemons_live(instance):
            click.echo("Daemon(s) not running")
            sys.exit(1)

        click.echo("Daemon live")
def health_check_command():
    """Deprecated health check; warns, then reports daemon health.

    Emits a warning pointing users at liveness-check, then echoes
    "Daemon healthy" on success. Otherwise echoes "Daemon not healthy" and
    exits the process with status 1.
    """
    warnings.warn("health-check is deprecated. Use liveness-check instead.")

    with DagsterInstance.get() as instance:
        if not all_daemons_healthy(instance):
            click.echo("Daemon not healthy")
            sys.exit(1)

        click.echo("Daemon healthy")
def test_healthy():
    """Poll a running daemon controller until the instance is healthy and live.

    Before the controller starts, the instance must be neither healthy nor
    live. Once both checks pass, the controller's daemons are checked and the
    same health/liveness checks evaluated 100 seconds later must fail. Raises
    if the instance is still not healthy and live more than 10 seconds after
    the test started.
    """
    queued_coordinator_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }
    with instance_for_test(overrides=queued_coordinator_overrides) as instance:
        init_time = pendulum.now("UTC")

        def _healthy_at(timestamp):
            return all_daemons_healthy(instance, curr_time_seconds=timestamp)

        def _live_at(timestamp):
            return all_daemons_live(instance, curr_time_seconds=timestamp)

        assert not _healthy_at(init_time.float_timestamp)
        assert not _live_at(init_time.float_timestamp)

        with daemon_controller_from_instance(
            instance, wait_for_processes_on_exit=True
        ) as controller:
            while True:
                now = pendulum.now("UTC")
                now_ts = now.float_timestamp

                if _healthy_at(now_ts) and _live_at(now_ts):
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception(
                        "timed out waiting for instance to become healthy")

                time.sleep(0.5)

            # Reached only once both checks passed at ``now_ts``.
            controller.check_daemons()

            stale_ts = now_ts + 100
            assert not _healthy_at(stale_ts)
            assert not _live_at(stale_ts)
def test_healthy_with_different_daemons():
    """Check that a differently configured instance is not reported healthy.

    Runs one controller iteration against a default test instance, then
    asserts that a separate test instance configured with
    QueuedRunCoordinator is not healthy at the same point in time.
    """
    with instance_for_test() as instance:
        init_time = pendulum.now("UTC")
        controller = DagsterDaemonController(instance)
        controller.run_iteration(init_time)

    # NOTE(review): `instance` is rebound here to a second test instance that
    # uses the queued run coordinator; no controller iteration is run on it.
    with instance_for_test(
        overrides={
            "run_coordinator": {
                "module": "dagster.core.run_coordinator.queued_run_coordinator",
                "class": "QueuedRunCoordinator",
            },
        }
    ) as instance:
        assert not all_daemons_healthy(instance, curr_time=init_time)
def test_monitoring():
    """Check that the daemons are healthy with run monitoring enabled.

    The instance enables run monitoring with a 5 second poll interval and
    uses the DockerRunLauncher. After the daemon has been up for five
    seconds, all daemons must report healthy.
    """
    monitoring_overrides = {
        "run_monitoring": {"enabled": True, "poll_interval_seconds": 5},
        "run_launcher": {
            "class": "DockerRunLauncher",
            "module": "dagster_docker",
            "config": {},
        },
    }
    with instance_for_test(monitoring_overrides) as instance:
        with start_daemon():
            # Give the daemon time to start up before checking its health.
            time.sleep(5)
            assert all_daemons_healthy(instance)
def test_healthy_with_different_daemons():
    """Check that a second instance is not healthy or live from another's controller.

    While a controller created from a default test instance is active, a
    separate test instance configured with QueuedRunCoordinator must be
    neither healthy nor live.
    """
    queued_coordinator_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }
    with instance_for_test() as instance, DagsterDaemonController.create_from_instance(instance):
        with instance_for_test(overrides=queued_coordinator_overrides) as other_instance:
            current_ts = pendulum.now("UTC").float_timestamp

            assert not all_daemons_healthy(other_instance, curr_time_seconds=current_ts)
            assert not all_daemons_live(other_instance, curr_time_seconds=current_ts)
def test_error_daemon(monkeypatch):
    """Check that a daemon raising in its iteration is live but not healthy.

    SensorDaemon.run_iteration is patched to raise. The test polls a running
    controller until all daemons are live, then asserts that the sensor
    daemon's status is unhealthy with exactly one recorded heartbeat error,
    and that the instance as a whole is live but not healthy. Raises if the
    daemons are still not live more than 10 seconds after the test started.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        def _failing_run_iteration(_, _instance, _daemon_shutdown_event, _grpc_server_registry):
            raise DagsterInvariantViolationError("foobar")
            # The unreachable yield keeps this a generator function, so the
            # error is raised when the generator is iterated, not on call.
            yield  # pylint: disable=unreachable

        monkeypatch.setattr(SensorDaemon, "run_iteration", _failing_run_iteration)

        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
            instance, wait_for_processes_on_exit=True
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                if all_daemons_live(instance):
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)

            # Despite error, daemon should still be running
            controller.check_daemons()

            now_ts = now.float_timestamp
            status = get_daemon_status(instance, SensorDaemon.daemon_type(), now_ts)
            assert status.healthy == False

            heartbeat_errors = status.last_heartbeat.errors
            assert len(heartbeat_errors) == 1
            assert (
                heartbeat_errors[0].message.strip()
                == "dagster.core.errors.DagsterInvariantViolationError: foobar"
            )

            assert not all_daemons_healthy(instance, curr_time_seconds=now_ts)
            assert all_daemons_live(instance, curr_time_seconds=now_ts)
def test_error_daemon(monkeypatch):
    """Check that a daemon raising in its iteration is live but not healthy.

    SensorDaemon.run_iteration is patched to raise. After one controller
    iteration, the sensor daemon's status must be unhealthy with exactly one
    recorded heartbeat error, and the instance as a whole must be live but
    not healthy.
    """
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SensorDaemon

        def _failing_run_iteration(_):
            raise DagsterInvariantViolationError("foobar")
            # The unreachable yield keeps this a generator function, so the
            # error is raised when the generator is iterated, not on call.
            yield  # pylint: disable=unreachable

        monkeypatch.setattr(SensorDaemon, "run_iteration", _failing_run_iteration)

        controller = DagsterDaemonController(instance)
        init_time = pendulum.now("UTC")
        controller.run_iteration(init_time)

        init_ts = init_time.float_timestamp
        status = get_daemon_status(instance, SensorDaemon.daemon_type(), init_ts)
        assert status.healthy == False

        heartbeat_errors = status.last_heartbeat.errors
        assert len(heartbeat_errors) == 1
        assert (
            heartbeat_errors[0].message.strip()
            == "dagster.core.errors.DagsterInvariantViolationError: foobar"
        )

        assert not all_daemons_healthy(instance, curr_time_seconds=init_ts)
        assert all_daemons_live(instance, curr_time_seconds=init_ts)
def test_healthy():
    """Poll a running daemon controller until the instance is healthy and live.

    All checks use a 1 second heartbeat interval. Before the controller
    starts, the instance must be neither healthy nor live. Once both checks
    pass, the controller's threads and heartbeats are checked, and the same
    health/liveness checks evaluated just past the heartbeat tolerance must
    fail. Raises if the instance is still not healthy and live more than 10
    seconds after the test started.
    """
    queued_coordinator_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }
    with instance_for_test(overrides=queued_coordinator_overrides) as instance:
        init_time = pendulum.now("UTC")
        heartbeat_interval_seconds = 1

        def _healthy_at(timestamp):
            return all_daemons_healthy(
                instance,
                curr_time_seconds=timestamp,
                heartbeat_interval_seconds=heartbeat_interval_seconds,
            )

        def _live_at(timestamp):
            return all_daemons_live(
                instance,
                curr_time_seconds=timestamp,
                heartbeat_interval_seconds=heartbeat_interval_seconds,
            )

        assert not _healthy_at(init_time.float_timestamp)
        assert not _live_at(init_time.float_timestamp)

        with daemon_controller_from_instance(
            instance, heartbeat_interval_seconds=heartbeat_interval_seconds
        ) as controller:
            while True:
                now = pendulum.now("UTC")
                now_ts = now.float_timestamp

                if _healthy_at(now_ts) and _live_at(now_ts):
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception(
                        "timed out waiting for instance to become healthy")

                time.sleep(0.5)

            # Reached only once both checks passed at ``now_ts``.
            controller.check_daemon_threads()
            controller.check_daemon_heartbeats()

            stale_ts = now_ts + DEFAULT_DAEMON_HEARTBEAT_TOLERANCE_SECONDS + 1
            assert not _healthy_at(stale_ts)
            assert not _live_at(stale_ts)