def test_transient_heartbeat_failure(mocker):
    """A transient heartbeat-read failure must not kill the controller immediately.

    get_daemon_status is patched to always raise; the controller should tolerate
    the error at first and only fail the heartbeat check after the tolerance
    window has fully elapsed.
    """
    with instance_for_test() as instance:
        mocker.patch(
            "dagster.daemon.controller.get_daemon_status",
            side_effect=Exception("Transient heartbeat failure"),
        )
        interval = 1
        tolerance = 5
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=interval,
            heartbeat_tolerance_seconds=tolerance,
        ) as controller:
            # Doesn't immediately fail despite the transient error.
            controller.check_daemon_heartbeats()

            # Wait well past the tolerance window, then the check must raise.
            time.sleep(2 * tolerance)
            expected = (
                "Stopping dagster-daemon process since the following threads"
                " are no longer sending heartbeats"
            )
            with pytest.raises(Exception, match=expected):
                controller.check_daemon_heartbeats()
def test_multiple_error_daemon(monkeypatch):
    # A daemon loop that yields two errors should surface both of them in the
    # heartbeat (most recent first) while the daemon itself keeps running.
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_loop_error(_, _instance, _workspace):
            # SerializableErrorInfo fields: message, stack, cls_name, cause
            yield SerializableErrorInfo("foobar", None, None, None)
            yield SerializableErrorInfo("bizbuz", None, None, None)
            # Then keep the daemon alive, yielding no further errors.
            while True:
                yield
                time.sleep(0.5)

        monkeypatch.setattr(SensorDaemon, "core_loop", run_loop_error)

        init_time = pendulum.now("UTC")

        heartbeat_interval_seconds = 1

        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=heartbeat_interval_seconds,
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                if all_daemons_live(
                    instance, heartbeat_interval_seconds=heartbeat_interval_seconds
                ):
                    # Despite error, daemon should still be running
                    controller.check_daemon_threads()
                    controller.check_daemon_heartbeats()

                    status = get_daemon_statuses(
                        instance, [SensorDaemon.daemon_type()], now.float_timestamp
                    )[SensorDaemon.daemon_type()]

                    # Wait until both errors have been recorded on the heartbeat;
                    # errors appear newest-first ("bizbuz" was yielded last).
                    if status.healthy == False and len(status.last_heartbeat.errors) == 2:
                        assert status.last_heartbeat.errors[0].message.strip() == "bizbuz"
                        assert status.last_heartbeat.errors[1].message.strip() == "foobar"
                        break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)
def test_thread_die_daemon(monkeypatch):
    # If one daemon thread dies (here: the sensor thread raises
    # KeyboardInterrupt), check_daemon_threads must eventually fail with a
    # message naming the dead thread, while other daemons keep heartbeating.
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SchedulerDaemon, SensorDaemon

        # Mutable flag so the closure can signal that the loop body actually ran.
        iteration_ran = {"ran": False}

        def run_loop_error(_, _instance, _workspace):
            iteration_ran["ran"] = True
            raise KeyboardInterrupt
            yield  # pylint: disable=unreachable

        monkeypatch.setattr(SensorDaemon, "core_loop", run_loop_error)

        heartbeat_interval_seconds = 1

        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=heartbeat_interval_seconds,
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                # Poll the scheduler daemon (not the broken sensor) to confirm
                # the controller as a whole is still producing heartbeats.
                status = get_daemon_statuses(
                    instance,
                    [SchedulerDaemon.daemon_type()],
                    now.float_timestamp,
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                )[SchedulerDaemon.daemon_type()]

                if iteration_ran["ran"] and status.healthy:
                    try:
                        # Should eventually throw since the sensor thread is interrupted
                        controller.check_daemon_threads()
                    except Exception as e:
                        assert (
                            "Stopping dagster-daemon process since the following threads are no longer running: ['SENSOR']"
                            in str(e)
                        )
                        break

                if (now - init_time).total_seconds() > 20:
                    raise Exception("timed out waiting for check_daemons to fail")

                time.sleep(0.5)
def test_scheduler_instance():
    """An instance configured with DagsterDaemonScheduler spawns a SchedulerDaemon."""
    scheduler_overrides = {
        "scheduler": {
            "module": "dagster.core.scheduler",
            "class": "DagsterDaemonScheduler",
        },
    }
    with instance_for_test(overrides=scheduler_overrides) as instance:
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
        ) as controller:
            running_daemons = controller.daemons
            assert len(running_daemons) == 3
            assert any(isinstance(d, SchedulerDaemon) for d in running_daemons)
def test_run_coordinator_instance():
    """An instance configured with QueuedRunCoordinator spawns a QueuedRunCoordinatorDaemon."""
    coordinator_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }
    with instance_for_test(overrides=coordinator_overrides) as instance:
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
        ) as controller:
            running_daemons = controller.daemons
            assert len(running_daemons) == 4
            assert any(isinstance(d, QueuedRunCoordinatorDaemon) for d in running_daemons)
def test_healthy_with_different_daemons():
    """Heartbeats from a controller on one instance must not make a
    differently-configured instance (which expects an extra daemon) look
    healthy or live."""
    with instance_for_test() as instance:
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
        ):
            queued_overrides = {
                "run_coordinator": {
                    "module": "dagster.core.run_coordinator.queued_run_coordinator",
                    "class": "QueuedRunCoordinator",
                },
            }
            with instance_for_test(overrides=queued_overrides) as other_instance:
                check_time = pendulum.now("UTC")
                assert not all_daemons_healthy(
                    other_instance, curr_time_seconds=check_time.float_timestamp
                )
                assert not all_daemons_live(
                    other_instance, curr_time_seconds=check_time.float_timestamp
                )
def workspace_fixture():
    """Yield a daemon workspace built from an empty workspace target."""
    target = EmptyWorkspaceTarget()
    with create_test_daemon_workspace(workspace_load_target=target) as ws:
        yield ws
def test_warn_multiple_daemons(capsys):
    # Verifies the "Another SENSOR daemon is still sending heartbeats" warning:
    # absent for a single controller, absent when a controller restarts cleanly,
    # and present only when two controllers run concurrently on one instance.
    from dagster.daemon.daemon import SensorDaemon

    with instance_for_test() as instance:
        init_time = pendulum.now("UTC")

        heartbeat_interval_seconds = 1

        # Phase 1: a single controller — no warning expected.
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=heartbeat_interval_seconds,
        ):
            while True:
                now = pendulum.now("UTC")
                if all_daemons_live(
                    instance, heartbeat_interval_seconds=heartbeat_interval_seconds
                ):
                    captured = capsys.readouterr()
                    assert "Another SENSOR daemon is still sending heartbeats" not in captured.out
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for daemon status")

                time.sleep(0.5)

        # Drain any captured output before the next phase.
        capsys.readouterr()

        init_time = pendulum.now("UTC")

        status = get_daemon_status(
            instance,
            SensorDaemon.daemon_type(),
            now.float_timestamp,
            heartbeat_interval_seconds=heartbeat_interval_seconds,
        )
        last_heartbeat_time = status.last_heartbeat.timestamp

        # No warning when a second controller starts up again
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=heartbeat_interval_seconds,
        ):
            # Wait for a heartbeat newer than the one left by the first
            # controller, then confirm no warning was printed.
            while True:
                now = pendulum.now("UTC")

                status = get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    now.float_timestamp,
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                )

                if status.last_heartbeat and status.last_heartbeat.timestamp != last_heartbeat_time:
                    captured = capsys.readouterr()
                    assert "Another SENSOR daemon is still sending heartbeats" not in captured.out
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for new daemon status")

                time.sleep(0.5)

            status = get_daemon_status(
                instance,
                SensorDaemon.daemon_type(),
                now.float_timestamp,
                heartbeat_interval_seconds=heartbeat_interval_seconds,
            )
            last_heartbeat_time = status.last_heartbeat.timestamp

            # Starting up a controller while one is running produces the warning though
            with daemon_controller_from_instance(
                instance,
                workspace_load_target=EmptyWorkspaceTarget(),
                heartbeat_interval_seconds=heartbeat_interval_seconds,
            ):
                # Wait for heartbeats while two controllers are running at once and there will
                # be a warning
                init_time = pendulum.now("UTC")

                while True:
                    now = pendulum.now("UTC")

                    captured = capsys.readouterr()
                    if "Another SENSOR daemon is still sending heartbeats" in captured.out:
                        break

                    if (now - init_time).total_seconds() > 60:
                        raise Exception("timed out waiting for heartbeats")

                    time.sleep(5)
def test_error_daemon(monkeypatch):
    """Errors raised by a daemon loop accumulate on the heartbeat (capped at 5,
    newest first) without killing the daemon, are ignored by health checks when
    ``ignore_errors=True``, and clear once the loop stops raising.

    Fix: corrected the typo "hearrteat" in the timeout message of the final
    wait loop.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        should_raise_errors = True

        # Mutable counter shared with the closure so each raised error carries
        # a unique, increasing sequence number ("foobar:<n>").
        error_count = {"count": 0}

        def run_loop_error(_, _instance, _workspace):
            if should_raise_errors:
                time.sleep(0.5)
                error_count["count"] = error_count["count"] + 1
                raise DagsterInvariantViolationError("foobar:" + str(error_count["count"]))
            # Once errors are disabled, behave like a healthy daemon loop.
            while True:
                yield
                time.sleep(0.5)

        def _get_error_number(error):
            # Extract <n> from an error message of the form "foobar:<n>".
            error_message = error.message.strip()
            return int(error_message.split("foobar:")[1])

        monkeypatch.setattr(SensorDaemon, "core_loop", run_loop_error)

        heartbeat_interval_seconds = 1

        gen_daemons = lambda instance: [SensorDaemon()]

        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=heartbeat_interval_seconds,
            gen_daemons=gen_daemons,
            error_interval_seconds=10,
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                if get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                    ignore_errors=True,
                ).healthy:
                    # Despite error, daemon should still be running
                    controller.check_daemon_threads()
                    controller.check_daemon_heartbeats()

                    status = get_daemon_status(
                        instance,
                        SensorDaemon.daemon_type(),
                        now.float_timestamp,
                        heartbeat_interval_seconds=heartbeat_interval_seconds,
                    )
                    assert status.healthy == False

                    # Errors build up until there are > 5, then pull off the last
                    if len(status.last_heartbeat.errors) >= 5:
                        first_error_number = _get_error_number(status.last_heartbeat.errors[0])

                        if first_error_number > 5:
                            # Verify error numbers decrease consecutively
                            assert [
                                _get_error_number(error) for error in status.last_heartbeat.errors
                            ] == list(range(first_error_number, first_error_number - 5, -1))

                            # Unhealthy when errors count, healthy when ignored.
                            assert not get_daemon_status(
                                instance,
                                SensorDaemon.daemon_type(),
                                curr_time_seconds=now.float_timestamp,
                                heartbeat_interval_seconds=heartbeat_interval_seconds,
                            ).healthy
                            assert get_daemon_status(
                                instance,
                                SensorDaemon.daemon_type(),
                                curr_time_seconds=now.float_timestamp,
                                heartbeat_interval_seconds=heartbeat_interval_seconds,
                                ignore_errors=True,
                            ).healthy

                            time.sleep(3)

                            status = get_daemon_status(
                                instance,
                                SensorDaemon.daemon_type(),
                                now.float_timestamp,
                                heartbeat_interval_seconds=heartbeat_interval_seconds,
                            )

                            # Error count does not rise above 5, continues to increase
                            assert len(status.last_heartbeat.errors) == 5

                            new_first_error_number = _get_error_number(
                                status.last_heartbeat.errors[0]
                            )
                            assert new_first_error_number > first_error_number

                            break

                if (now - init_time).total_seconds() > 15:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)

            # Once the sensor no longer raises errors, they should return to 0 once
            # enough time passes
            should_raise_errors = False

            init_time = pendulum.now("UTC")
            while True:
                now = pendulum.now("UTC")

                status = get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    now.float_timestamp,
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                )

                # Error count does not rise above 5
                if len(status.last_heartbeat.errors) == 0:
                    break

                if (now - init_time).total_seconds() > 15:
                    raise Exception("timed out waiting for heartbeat errors to return to 0")

                time.sleep(0.5)
def test_healthy():
    # End-to-end health check: an instance is neither healthy nor live before
    # any heartbeats exist, becomes both once the controller's daemons start
    # heartbeating, and reads as unhealthy again when evaluated at a timestamp
    # beyond the heartbeat tolerance.
    with instance_for_test(
        overrides={
            "run_coordinator": {
                "module": "dagster.core.run_coordinator.queued_run_coordinator",
                "class": "QueuedRunCoordinator",
            },
        }
    ) as instance:
        init_time = pendulum.now("UTC")

        heartbeat_interval_seconds = 1

        # No controller running yet -> no heartbeats -> not healthy / not live.
        assert not all_daemons_healthy(
            instance,
            curr_time_seconds=init_time.float_timestamp,
            heartbeat_interval_seconds=heartbeat_interval_seconds,
        )
        assert not all_daemons_live(
            instance,
            curr_time_seconds=init_time.float_timestamp,
            heartbeat_interval_seconds=heartbeat_interval_seconds,
        )

        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=heartbeat_interval_seconds,
        ) as controller:
            while True:
                now = pendulum.now("UTC")
                if all_daemons_healthy(
                    instance,
                    curr_time_seconds=now.float_timestamp,
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                ) and all_daemons_live(
                    instance,
                    curr_time_seconds=now.float_timestamp,
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                ):
                    controller.check_daemon_threads()
                    controller.check_daemon_heartbeats()

                    # Evaluating health at a time past the tolerance window
                    # must report the (stale) heartbeats as unhealthy/not live.
                    beyond_tolerated_time = (
                        now.float_timestamp + DEFAULT_DAEMON_HEARTBEAT_TOLERANCE_SECONDS + 1
                    )

                    assert not all_daemons_healthy(
                        instance,
                        curr_time_seconds=beyond_tolerated_time,
                        heartbeat_interval_seconds=heartbeat_interval_seconds,
                    )
                    assert not all_daemons_live(
                        instance,
                        curr_time_seconds=beyond_tolerated_time,
                        heartbeat_interval_seconds=heartbeat_interval_seconds,
                    )
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for instance to become healthy")

                time.sleep(0.5)
def workspace_fixture(instance):
    """Yield a daemon workspace, built from an empty target, bound to ``instance``."""
    with create_test_daemon_workspace(
        workspace_load_target=EmptyWorkspaceTarget(),
        instance=instance,
    ) as ws:
        yield ws
def created_workspace_load_target(kwargs):
    # Translate CLI kwargs into a concrete workspace-load target, enforcing
    # mutual exclusion between the different ways of specifying a workspace
    # (workspace file, python file, module, package, or gRPC server).
    check.dict_param(kwargs, "kwargs")
    if are_all_keys_empty(kwargs, WORKSPACE_CLI_ARGS):
        # No workspace args given: fall back to an explicit empty workspace or
        # a workspace.yaml in the current directory.
        if kwargs.get("empty_workspace"):
            return EmptyWorkspaceTarget()
        if os.path.exists("workspace.yaml"):
            return WorkspaceFileTarget(paths=["workspace.yaml"])
        raise click.UsageError("No arguments given and workspace.yaml not found.")
    if kwargs.get("workspace"):
        _check_cli_arguments_none(
            kwargs,
            "python_file",
            "working_directory",
            "empty_working_directory",
            "module_name",
            "package_name",
            "attribute",
            "grpc_host",
            "grpc_port",
            "grpc_socket",
        )
        return WorkspaceFileTarget(paths=list(kwargs["workspace"]))
    if kwargs.get("python_file"):
        _check_cli_arguments_none(
            kwargs,
            "module_name",
            "package_name",
            "grpc_host",
            "grpc_port",
            "grpc_socket",
        )
        working_directory = get_working_directory_from_kwargs(kwargs)
        return PythonFileTarget(
            python_file=kwargs.get("python_file"),
            attribute=kwargs.get("attribute"),
            working_directory=working_directory,
            location_name=None,
        )
    if kwargs.get("module_name"):
        _check_cli_arguments_none(
            kwargs,
            "package_name",
            "working_directory",
            "empty_working_directory",
            "grpc_host",
            "grpc_port",
            "grpc_socket",
        )
        return ModuleTarget(
            module_name=kwargs.get("module_name"),
            attribute=kwargs.get("attribute"),
            location_name=None,
        )
    if kwargs.get("package_name"):
        _check_cli_arguments_none(
            kwargs,
            "working_directory",
            "empty_working_directory",
            "grpc_host",
            "grpc_port",
            "grpc_socket",
        )
        return PackageTarget(
            package_name=kwargs.get("package_name"),
            attribute=kwargs.get("attribute"),
            location_name=None,
        )
    if kwargs.get("grpc_port"):
        _check_cli_arguments_none(
            kwargs,
            "attribute",
            "working_directory",
            "empty_working_directory",
            "grpc_socket",
        )
        return GrpcServerTarget(
            port=kwargs.get("grpc_port"),
            socket=None,
            host=(kwargs.get("grpc_host") if kwargs.get("grpc_host") else "localhost"),
            location_name=None,
        )
    elif kwargs.get("grpc_socket"):
        _check_cli_arguments_none(
            kwargs,
            "attribute",
            "working_directory",
            "empty_working_directory",
        )
        return GrpcServerTarget(
            port=None,
            socket=kwargs.get("grpc_socket"),
            host=(kwargs.get("grpc_host") if kwargs.get("grpc_host") else "localhost"),
            location_name=None,
        )
    else:
        # No recognized combination of arguments: fail the CLI invariant.
        _cli_load_invariant(False)
def get_workspace_load_target(kwargs: Dict[str, str]):
    # Translate CLI kwargs into a concrete workspace-load target, enforcing
    # mutual exclusion between the different ways of specifying a workspace
    # (workspace file, python file, module, package, or gRPC server).
    check.dict_param(kwargs, "kwargs")
    if are_all_keys_empty(kwargs, WORKSPACE_CLI_ARGS):
        # No workspace args given: fall back to an explicit empty workspace or
        # a workspace.yaml in the current directory.
        if kwargs.get("empty_workspace"):
            return EmptyWorkspaceTarget()
        if os.path.exists("workspace.yaml"):
            return WorkspaceFileTarget(paths=["workspace.yaml"])
        raise click.UsageError("No arguments given and workspace.yaml not found.")

    if kwargs.get("workspace"):
        _check_cli_arguments_none(
            kwargs,
            "python_file",
            "working_directory",
            "module_name",
            "package_name",
            "attribute",
            "grpc_host",
            "grpc_port",
            "grpc_socket",
        )
        return WorkspaceFileTarget(paths=list(cast(Union[List, Tuple], kwargs.get("workspace"))))
    if kwargs.get("python_file"):
        _check_cli_arguments_none(
            kwargs,
            "module_name",
            "package_name",
            "grpc_host",
            "grpc_port",
            "grpc_socket",
        )
        working_directory = get_working_directory_from_kwargs(kwargs)
        return PythonFileTarget(
            python_file=check.str_elem(kwargs, "python_file"),
            attribute=check.opt_str_elem(kwargs, "attribute"),
            working_directory=working_directory,
            location_name=None,
        )
    if kwargs.get("module_name"):
        _check_cli_arguments_none(
            kwargs,
            "package_name",
            "grpc_host",
            "grpc_port",
            "grpc_socket",
        )
        working_directory = get_working_directory_from_kwargs(kwargs)
        return ModuleTarget(
            module_name=check.str_elem(kwargs, "module_name"),
            attribute=check.opt_str_elem(kwargs, "attribute"),
            working_directory=working_directory,
            location_name=None,
        )
    if kwargs.get("package_name"):
        _check_cli_arguments_none(
            kwargs,
            "grpc_host",
            "grpc_port",
            "grpc_socket",
        )
        working_directory = get_working_directory_from_kwargs(kwargs)
        return PackageTarget(
            package_name=check.str_elem(kwargs, "package_name"),
            attribute=check.opt_str_elem(kwargs, "attribute"),
            working_directory=working_directory,
            location_name=None,
        )
    if kwargs.get("grpc_port"):
        _check_cli_arguments_none(
            kwargs,
            "attribute",
            "working_directory",
            "grpc_socket",
        )
        return GrpcServerTarget(
            port=check.int_elem(kwargs, "grpc_port"),
            socket=None,
            host=check.opt_str_elem(kwargs, "grpc_host") or "localhost",
            location_name=None,
        )
    elif kwargs.get("grpc_socket"):
        _check_cli_arguments_none(
            kwargs,
            "attribute",
            "working_directory",
        )
        return GrpcServerTarget(
            port=None,
            socket=check.str_elem(kwargs, "grpc_socket"),
            host=check.opt_str_elem(kwargs, "grpc_host") or "localhost",
            location_name=None,
        )
    else:
        _cli_load_invariant(False)
        # necessary for pyright, does not understand _cli_load_invariant(False) never returns
        assert False