def test_healthy():
    """Check daemon health/liveness before an iteration, right after it, and 100s later.

    With a queued run coordinator configured, the daemons are neither healthy
    nor live until the controller has run one iteration, and they are again
    reported as neither when evaluated 100 seconds past that iteration.
    """
    queued_coordinator_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }

    with instance_for_test(overrides=queued_coordinator_overrides) as instance:
        start = pendulum.now("UTC")
        start_ts = start.float_timestamp
        stale_ts = start.float_timestamp + 100

        controller = DagsterDaemonController(instance)

        # Nothing has sent a heartbeat yet.
        assert not all_daemons_healthy(instance, curr_time_seconds=start_ts)
        assert not all_daemons_live(instance, curr_time_seconds=start_ts)

        controller.run_iteration(start)

        # Evaluated at the iteration time, everything is healthy and live.
        assert all_daemons_healthy(instance, curr_time_seconds=start_ts)
        assert all_daemons_live(instance, curr_time_seconds=start_ts)

        # Evaluated 100 seconds later, the same heartbeats no longer count.
        assert not all_daemons_healthy(instance, curr_time_seconds=stale_ts)
        assert not all_daemons_live(instance, curr_time_seconds=stale_ts)
def test_different_intervals(caplog):
    """Drive the controller at t, t+5s and t+30s and check which daemons iterate.

    The run coordinator is configured with ``dequeue_interval_seconds: 5``; the
    test expects it to iterate at every step, while the scheduler daemon is
    expected to skip the t+5s step and run again at t+30s.
    """
    overrides = {
        "scheduler": {
            "module": "dagster.core.scheduler",
            "class": "DagsterDaemonScheduler",
        },
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
            "config": {"dequeue_interval_seconds": 5},
        },
    }

    with instance_for_test(overrides=overrides) as instance:
        init_time = pendulum.now("UTC")
        controller = DagsterDaemonController(instance)

        # Constructing the controller logs exactly one line listing the daemons.
        startup_record = (
            "dagster-daemon",
            logging.INFO,
            "instance is configured with the following daemons: ['QueuedRunCoordinatorDaemon', 'SchedulerDaemon', 'SensorDaemon']",
        )
        assert caplog.record_tuples == [startup_record]

        controller.run_iteration(init_time)

        scheduler_daemon = controller.get_daemon(SchedulerDaemon.__name__)
        run_daemon = controller.get_daemon(QueuedRunCoordinatorDaemon.__name__)

        # First iteration: both daemons run.
        assert scheduler_daemon
        assert scheduler_daemon.last_iteration_time == init_time
        assert _scheduler_ran(caplog)

        assert run_daemon
        assert run_daemon.last_iteration_time == init_time
        assert _run_coordinator_ran(caplog)

        caplog.clear()

        five_seconds_later = init_time + datetime.timedelta(seconds=5)
        controller.run_iteration(five_seconds_later)

        # Run coordinator does another iteration, scheduler does not
        assert scheduler_daemon.last_iteration_time == init_time
        assert not _scheduler_ran(caplog)

        assert run_daemon.last_iteration_time == five_seconds_later
        assert _run_coordinator_ran(caplog)

        caplog.clear()

        thirty_seconds_later = init_time + datetime.timedelta(seconds=30)
        controller.run_iteration(thirty_seconds_later)

        # 30 seconds later both daemons do another iteration
        assert scheduler_daemon.last_iteration_time == thirty_seconds_later
        assert _scheduler_ran(caplog)

        assert run_daemon.last_iteration_time == thirty_seconds_later
        assert _run_coordinator_ran(caplog)
def run_command():
    """Run the daemon controller loop on the instance from ``DagsterInstance.get()``.

    The loop never returns on its own: every half second it asks the
    controller to run an iteration stamped with the current UTC time.

    Raises:
        Exception: if the loaded instance reports ``is_ephemeral``; the daemon
            is refused on an in-memory instance, matching the other
            ``run_command`` variants in this file.
    """
    with DagsterInstance.get() as instance:
        # Refuse to start against an in-memory instance.
        if instance.is_ephemeral:
            raise Exception(
                "dagster-daemon can't run using an in-memory instance. Make sure "
                "the DAGSTER_HOME environment variable has been set correctly and that "
                "you have created a dagster.yaml file there."
            )

        controller = DagsterDaemonController(instance)

        while True:
            curr_time = pendulum.now("UTC")
            controller.run_iteration(curr_time)
            time.sleep(0.5)
def run_command():
    """Run the daemon controller loop on the instance from ``DagsterInstance.get()``.

    Loops forever, running one controller iteration stamped with the current
    UTC time and then sleeping for half a second.

    Raises:
        Exception: if the loaded instance reports ``is_ephemeral``.
    """
    with DagsterInstance.get() as instance:
        if instance.is_ephemeral:
            raise Exception(
                "dagster-daemon can't run using an in-memory instance. Make sure "
                "the DAGSTER_HOME environment variable has been set correctly and that "
                "you have created a dagster.yaml file there."
            )

        daemon_controller = DagsterDaemonController(instance)

        while True:
            daemon_controller.run_iteration(pendulum.now("UTC"))
            time.sleep(0.5)
def test_healthy_with_different_daemons():
    """A second instance with a queued run coordinator is not healthy.

    A controller is iterated once on a default test instance; a separate test
    instance configured with the queued run coordinator is then checked at the
    same timestamp and must not be reported as healthy.
    """
    queued_coordinator_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }

    with instance_for_test() as instance:
        init_time = pendulum.now("UTC")
        DagsterDaemonController(instance).run_iteration(init_time)

        # Distinct local name for the second instance (the original rebound `instance`).
        with instance_for_test(overrides=queued_coordinator_overrides) as other_instance:
            assert not all_daemons_healthy(other_instance, curr_time=init_time)
def test_empty_instance():
    """Constructing a controller on a default test instance raises 'No daemons configured'."""
    expected_message = re.escape("No daemons configured on the DagsterInstance")

    with instance_for_test() as instance:
        with pytest.raises(Exception, match=expected_message):
            DagsterDaemonController(instance)
def run_command():
    """Run the daemon controller and periodically check on its daemons.

    Interrupts are captured for the whole command and only surfaced as
    ``KeyboardInterrupt`` inside the one-second sleep window. Once
    ``2 * DAEMON_HEARTBEAT_TOLERANCE_SECONDS`` have elapsed since the last
    check, ``controller.check_daemons()`` is called and the timer restarts.

    Raises:
        Exception: if the loaded instance reports ``is_ephemeral``.
    """
    with capture_interrupts():
        with DagsterInstance.get() as instance:
            if instance.is_ephemeral:
                raise Exception(
                    "dagster-daemon can't run using an in-memory instance. Make sure "
                    "the DAGSTER_HOME environment variable has been set correctly and that "
                    "you have created a dagster.yaml file there."
                )

            daemons = create_daemons_from_instance(instance)
            with DagsterDaemonController(instance, daemons) as controller:
                last_check = pendulum.now("UTC")
                while True:
                    # Wait until a daemon has been unhealthy for a long period of time
                    # before potentially restarting it due to a hanging or failed daemon
                    with raise_interrupts_as(KeyboardInterrupt):
                        time.sleep(1)
                        elapsed_seconds = (pendulum.now("UTC") - last_check).total_seconds()

                    if elapsed_seconds < 2 * DAEMON_HEARTBEAT_TOLERANCE_SECONDS:
                        continue

                    controller.check_daemons()
                    last_check = pendulum.now("UTC")
def test_error_daemon(monkeypatch):
    """A sensor daemon whose iteration raises is unhealthy and its heartbeat carries the error."""
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SensorDaemon

        def _always_fail(_):
            raise DagsterInvariantViolationError("foobar")

        # Replace the sensor daemon's iteration with one that always raises.
        monkeypatch.setattr(SensorDaemon, "run_iteration", _always_fail)

        controller = DagsterDaemonController(instance)
        init_time = pendulum.now("UTC")
        controller.run_iteration(init_time)

        sensor_status = get_daemon_status(
            instance,
            SensorDaemon.daemon_type(),
            init_time.float_timestamp,
        )

        assert sensor_status.healthy == False
        recorded_message = sensor_status.last_heartbeat.error.message.strip()
        assert recorded_message == "dagster.core.errors.DagsterInvariantViolationError: foobar"
def test_warn_multiple_daemons(capsys):
    """The 'taking over' warning is printed only when the first controller iterates again.

    Two controllers each run one iteration at the same time without the
    warning appearing on stdout; when the first controller iterates again 100
    seconds later, the warning is expected.
    """
    takeover_warning = "Taking over from another SENSOR daemon process"

    with instance_for_test() as instance:
        init_time = pendulum.now("UTC")
        next_time = init_time.add(seconds=100)

        first_controller = DagsterDaemonController(instance)
        first_controller.run_iteration(init_time)
        assert takeover_warning not in capsys.readouterr().out

        second_controller = DagsterDaemonController(instance)
        second_controller.run_iteration(init_time)
        assert takeover_warning not in capsys.readouterr().out

        first_controller.run_iteration(next_time)
        assert takeover_warning in capsys.readouterr().out
def test_required():
    """``DagsterDaemonController.required`` is truthy when a queued run coordinator is configured."""
    queued_coordinator_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }

    with instance_for_test(overrides=queued_coordinator_overrides) as instance:
        assert DagsterDaemonController.required(instance)
def test_multiple_error_daemon(monkeypatch):
    """Every error yielded by a daemon iteration is recorded on its heartbeat, in order."""
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SensorDaemon

        def _yield_two_errors(_):
            # Positional fields per the original note: message, stack, cls_name, cause
            for message in ("foobar", "bizbuz"):
                yield SerializableErrorInfo(message, None, None, None)

        monkeypatch.setattr(SensorDaemon, "run_iteration", _yield_two_errors)

        controller = DagsterDaemonController(instance)
        init_time = pendulum.now("UTC")
        controller.run_iteration(init_time)

        sensor_status = get_daemon_status(
            instance,
            SensorDaemon.daemon_type(),
            init_time.float_timestamp,
        )

        assert sensor_status.healthy == False
        heartbeat_errors = sensor_status.last_heartbeat.errors
        assert len(heartbeat_errors) == 2
        assert heartbeat_errors[0].message.strip() == "foobar"
        assert heartbeat_errors[1].message.strip() == "bizbuz"
def test_scheduler_instance():
    """With the daemon scheduler configured, the controller has two daemons, one a SchedulerDaemon."""
    scheduler_overrides = {
        "scheduler": {
            "module": "dagster.core.scheduler",
            "class": "DagsterDaemonScheduler",
        },
    }

    with instance_for_test(overrides=scheduler_overrides) as instance:
        configured_daemons = DagsterDaemonController(instance).daemons

        assert len(configured_daemons) == 2
        assert any(isinstance(daemon, SchedulerDaemon) for daemon in configured_daemons)
def test_backfill_instance():
    """Enabling the backfill daemon yields three daemons, one of them a BackfillDaemon."""
    backfill_overrides = {
        "backfill": {"daemon_enabled": True},
    }

    with instance_for_test(overrides=backfill_overrides) as instance:
        with DagsterDaemonController.create_from_instance(instance) as controller:
            configured_daemons = controller.daemons

            assert len(configured_daemons) == 3
            assert any(isinstance(daemon, BackfillDaemon) for daemon in configured_daemons)
def test_error_daemon(monkeypatch):
    """A sensor daemon whose iteration raises is unhealthy, but all daemons still count as live."""
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SensorDaemon

        def _always_fail(_):
            raise DagsterInvariantViolationError("foobar")
            yield  # pylint: disable=unreachable

        monkeypatch.setattr(SensorDaemon, "run_iteration", _always_fail)

        controller = DagsterDaemonController(instance)
        init_time = pendulum.now("UTC")
        init_ts = init_time.float_timestamp
        controller.run_iteration(init_time)

        sensor_status = get_daemon_status(instance, SensorDaemon.daemon_type(), init_ts)

        assert sensor_status.healthy == False
        heartbeat_errors = sensor_status.last_heartbeat.errors
        assert len(heartbeat_errors) == 1
        assert (
            heartbeat_errors[0].message.strip()
            == "dagster.core.errors.DagsterInvariantViolationError: foobar"
        )

        # The error makes the daemons unhealthy, yet they still count as live.
        assert not all_daemons_healthy(instance, curr_time_seconds=init_ts)
        assert all_daemons_live(instance, curr_time_seconds=init_ts)
def test_run_coordinator_instance():
    """A queued run coordinator yields three daemons, one a QueuedRunCoordinatorDaemon."""
    queued_coordinator_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }

    with instance_for_test(overrides=queued_coordinator_overrides) as instance:
        with DagsterDaemonController.create_from_instance(instance) as controller:
            configured_daemons = controller.daemons

            assert len(configured_daemons) == 3
            assert any(
                isinstance(daemon, QueuedRunCoordinatorDaemon) for daemon in configured_daemons
            )
def test_run_coordinator_instance():
    """A queued run coordinator yields exactly one daemon, a QueuedRunCoordinatorDaemon."""
    queued_coordinator_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }

    with instance_for_test(overrides=queued_coordinator_overrides) as instance:
        configured_daemons = DagsterDaemonController(instance).daemons

        assert len(configured_daemons) == 1
        assert isinstance(configured_daemons[0], QueuedRunCoordinatorDaemon)
def run_command():
    """Run the daemon controller and check on its daemons at a fixed interval.

    Loops forever, sleeping ``2 * DAEMON_HEARTBEAT_TOLERANCE_SECONDS`` between
    calls to ``controller.check_daemons()``.

    Raises:
        Exception: if the loaded instance reports ``is_ephemeral``.
    """
    with DagsterInstance.get() as instance:
        if instance.is_ephemeral:
            raise Exception(
                "dagster-daemon can't run using an in-memory instance. Make sure "
                "the DAGSTER_HOME environment variable has been set correctly and that "
                "you have created a dagster.yaml file there."
            )

        daemons = create_daemons_from_instance(instance)
        with DagsterDaemonController(instance, daemons) as controller:
            while True:
                # Wait until a daemon has been unhealthy for a long period of time
                # before potentially restarting it due to a hanging or failed daemon
                time.sleep(2 * DAEMON_HEARTBEAT_TOLERANCE_SECONDS)
                controller.check_daemons()
def test_set_sensor_interval(caplog):
    """Wait up to ten seconds for ``_sensor_ran`` to report one sensor iteration.

    The instance is configured with ``sensor_settings.interval_seconds`` of 5.
    """
    sensor_overrides = {"sensor_settings": {"interval_seconds": 5}}

    with instance_for_test(overrides=sensor_overrides) as instance:
        started_at = pendulum.now("UTC")

        with DagsterDaemonController.create_from_instance(instance):
            while True:
                now = pendulum.now("UTC")

                # Stop as soon as the log shows exactly one sensor iteration.
                if _sensor_ran(caplog) == 1:
                    break

                waited_seconds = (now - started_at).total_seconds()
                if waited_seconds > 10:
                    raise Exception("Timed out waiting for sensor daemon to execute")

                time.sleep(0.5)
def test_different_intervals(caplog):
    """Check that the run coordinator iterates more often than the scheduler.

    The run coordinator is configured with ``dequeue_interval_seconds: 5``.
    Phase one waits (up to 45s) until the run coordinator has logged three
    iterations and asserts the scheduler has logged only one. Phase two waits
    (up to 45s) for the scheduler's second iteration and asserts the run
    coordinator has logged more than two by then.

    Raises:
        Exception: if either phase times out.
    """
    overrides = {
        "scheduler": {
            "module": "dagster.core.scheduler",
            "class": "DagsterDaemonScheduler",
        },
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
            "config": {"dequeue_interval_seconds": 5},
        },
    }

    with instance_for_test(overrides=overrides) as instance:
        init_time = pendulum.now("UTC")

        with DagsterDaemonController.create_from_instance(instance):
            while True:
                now = pendulum.now("UTC")

                # Wait until the run coordinator has run three times
                # Scheduler has only run once
                if _run_coordinator_ran(caplog) == 3:
                    assert _scheduler_ran(caplog) == 1
                    break

                if (now - init_time).total_seconds() > 45:
                    # Message now matches the condition above (three iterations, not two).
                    raise Exception(
                        "Timed out waiting for run queue daemon to execute three times"
                    )

                time.sleep(0.5)

            # Restart the timeout clock for the second phase.
            init_time = pendulum.now("UTC")

            while True:
                now = pendulum.now("UTC")

                if _scheduler_ran(caplog) == 2:
                    assert _run_coordinator_ran(caplog) > 2
                    break

                if (now - init_time).total_seconds() > 45:
                    raise Exception("Timed out waiting for schedule daemon to execute twice")

                time.sleep(0.5)
def test_healthy_with_different_daemons():
    """A separate instance is neither healthy nor live while a controller runs on another one."""
    queued_coordinator_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }

    with instance_for_test() as instance:
        with DagsterDaemonController.create_from_instance(instance):
            with instance_for_test(overrides=queued_coordinator_overrides) as other_instance:
                now_ts = pendulum.now("UTC").float_timestamp

                assert not all_daemons_healthy(other_instance, curr_time_seconds=now_ts)
                assert not all_daemons_live(other_instance, curr_time_seconds=now_ts)
def test_healthy():
    """``daemon_healthy`` is false before an iteration, true right after, false 60s later."""
    queued_coordinator_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }

    with instance_for_test(overrides=queued_coordinator_overrides) as instance:
        start = pendulum.now("UTC")
        one_minute_later = start + datetime.timedelta(seconds=60)

        controller = DagsterDaemonController(instance)

        assert not controller.daemon_healthy(instance, curr_time=start)

        controller.run_iteration(start)

        assert controller.daemon_healthy(instance, curr_time=start)
        assert not controller.daemon_healthy(instance, curr_time=one_minute_later)
def test_thread_die_daemon(monkeypatch):
    """``check_daemons`` must raise once the sensor daemon's iteration raises KeyboardInterrupt.

    The test polls (for up to ten seconds) until the patched sensor iteration
    has run and the scheduler daemon reports healthy, then expects
    ``check_daemons`` to fail with a message naming the SENSOR thread.
    """
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SchedulerDaemon, SensorDaemon

        # Mutable flag so the patched method can report that it was invoked.
        iteration_ran = {"ran": False}

        def _interrupt_iteration(_, _instance):
            iteration_ran["ran"] = True
            raise KeyboardInterrupt
            yield  # pylint: disable=unreachable

        monkeypatch.setattr(SensorDaemon, "run_iteration", _interrupt_iteration)

        init_time = pendulum.now("UTC")
        with DagsterDaemonController.create_from_instance(instance) as controller:
            while True:
                now = pendulum.now("UTC")

                scheduler_status = get_daemon_status(
                    instance,
                    SchedulerDaemon.daemon_type(),
                    now.float_timestamp,
                )

                if iteration_ran["ran"] and scheduler_status.healthy:
                    try:
                        # Should throw since the sensor thread is interrupted
                        controller.check_daemons()
                    except Exception as e:  # pylint: disable=broad-except
                        assert (
                            "Stopping dagster-daemon process since the following threads are no longer sending heartbeats: ['SENSOR']"
                            in str(e)
                        )
                        break

                    # Only reached when check_daemons returned without raising.
                    raise Exception("check_daemons should fail if a thread has died")

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)
def test_healthy():
    """Daemons become healthy and live once a controller runs, and stop counting 100s later.

    Before the controller starts nothing is healthy or live. The test then
    polls (for up to ten seconds) until both checks pass, calls
    ``check_daemons``, and verifies that both checks fail when evaluated 100
    seconds in the future.
    """
    queued_coordinator_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }

    with instance_for_test(overrides=queued_coordinator_overrides) as instance:
        init_time = pendulum.now("UTC")

        assert not all_daemons_healthy(instance, curr_time_seconds=init_time.float_timestamp)
        assert not all_daemons_live(instance, curr_time_seconds=init_time.float_timestamp)

        with DagsterDaemonController.create_from_instance(instance) as controller:
            while True:
                now = pendulum.now("UTC")

                healthy_now = all_daemons_healthy(instance, curr_time_seconds=now.float_timestamp)
                if healthy_now and all_daemons_live(
                    instance, curr_time_seconds=now.float_timestamp
                ):
                    controller.check_daemons()

                    stale_ts = now.float_timestamp + 100
                    assert not all_daemons_healthy(instance, curr_time_seconds=stale_ts)
                    assert not all_daemons_live(instance, curr_time_seconds=stale_ts)
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for instance to become healthy")

                time.sleep(0.5)
def test_multiple_error_daemon(monkeypatch):
    """Both errors yielded by the patched sensor iteration end up on its heartbeat.

    Polls (for up to ten seconds) until all daemons are live, then checks that
    ``check_daemons`` does not raise and that the sensor status is unhealthy
    with the two yielded errors in order.
    """
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SensorDaemon

        def _yield_two_errors(_, _instance):
            # Positional fields per the original note: message, stack, cls_name, cause
            for message in ("foobar", "bizbuz"):
                yield SerializableErrorInfo(message, None, None, None)

        monkeypatch.setattr(SensorDaemon, "run_iteration", _yield_two_errors)

        init_time = pendulum.now("UTC")
        with DagsterDaemonController.create_from_instance(instance) as controller:
            while True:
                now = pendulum.now("UTC")

                if all_daemons_live(instance):
                    # Despite error, daemon should still be running
                    controller.check_daemons()

                    sensor_status = get_daemon_status(
                        instance,
                        SensorDaemon.daemon_type(),
                        now.float_timestamp,
                    )

                    assert sensor_status.healthy == False
                    heartbeat_errors = sensor_status.last_heartbeat.errors
                    assert len(heartbeat_errors) == 2
                    assert heartbeat_errors[0].message.strip() == "foobar"
                    assert heartbeat_errors[1].message.strip() == "bizbuz"
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)
def test_error_daemon(monkeypatch):
    """A raising sensor iteration makes the daemons unhealthy while they remain live.

    Polls (for up to ten seconds) until all daemons are live, then checks that
    ``check_daemons`` does not raise, that the sensor heartbeat carries the
    single error, and that the instance is live but not healthy.
    """
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SensorDaemon

        def _always_fail(_, _instance):
            raise DagsterInvariantViolationError("foobar")
            yield  # pylint: disable=unreachable

        monkeypatch.setattr(SensorDaemon, "run_iteration", _always_fail)

        init_time = pendulum.now("UTC")
        with DagsterDaemonController.create_from_instance(instance) as controller:
            while True:
                now = pendulum.now("UTC")

                if all_daemons_live(instance):
                    # Despite error, daemon should still be running
                    controller.check_daemons()

                    sensor_status = get_daemon_status(
                        instance,
                        SensorDaemon.daemon_type(),
                        now.float_timestamp,
                    )

                    assert sensor_status.healthy == False
                    heartbeat_errors = sensor_status.last_heartbeat.errors
                    assert len(heartbeat_errors) == 1
                    assert (
                        heartbeat_errors[0].message.strip()
                        == "dagster.core.errors.DagsterInvariantViolationError: foobar"
                    )

                    assert not all_daemons_healthy(
                        instance, curr_time_seconds=now.float_timestamp
                    )
                    assert all_daemons_live(instance, curr_time_seconds=now.float_timestamp)
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)
def test_warn_multiple_daemons(capsys):
    """The 'taking over' warning appears only while two controllers run at once.

    Three phases:
      1. One controller runs until all daemons are live; no warning on stdout.
      2. After it exits, a second controller runs until a new sensor heartbeat
         is seen; still no warning.
      3. While the second controller is still running, a third one is started;
         the warning must show up on stdout within 120 seconds.
    """
    from dagster.daemon.daemon import SensorDaemon

    takeover_warning = "Taking over from another SENSOR daemon process"

    with instance_for_test() as instance:
        init_time = pendulum.now("UTC")

        with DagsterDaemonController.create_from_instance(instance):
            while True:
                now = pendulum.now("UTC")

                if all_daemons_live(instance):
                    assert takeover_warning not in capsys.readouterr().out
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for daemon status")

                time.sleep(0.5)

            # Drain anything else captured while the first controller was up.
            capsys.readouterr()

        init_time = pendulum.now("UTC")

        sensor_status = get_daemon_status(
            instance, SensorDaemon.daemon_type(), now.float_timestamp
        )
        last_heartbeat_time = sensor_status.last_heartbeat.timestamp

        # No warning when a second controller starts up again
        with DagsterDaemonController.create_from_instance(instance):
            while True:
                now = pendulum.now("UTC")

                sensor_status = get_daemon_status(
                    instance, SensorDaemon.daemon_type(), now.float_timestamp
                )

                heartbeat = sensor_status.last_heartbeat
                if heartbeat and sensor_status.last_heartbeat.timestamp != last_heartbeat_time:
                    assert takeover_warning not in capsys.readouterr().out
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for new daemon status")

                time.sleep(0.5)

            sensor_status = get_daemon_status(
                instance, SensorDaemon.daemon_type(), now.float_timestamp
            )
            last_heartbeat_time = sensor_status.last_heartbeat.timestamp

            # Starting up a controller while one is running produces the warning though
            with DagsterDaemonController.create_from_instance(instance):
                # Wait for heartbeats while two controllers are running at once and there will
                # be a warning
                init_time = pendulum.now("UTC")

                while True:
                    now = pendulum.now("UTC")

                    if takeover_warning in capsys.readouterr().out:
                        break

                    if (now - init_time).total_seconds() > 120:
                        raise Exception("timed out waiting for heartbeats")

                    time.sleep(5)