示例#1
0
def test_transient_heartbeat_failure(mocker):
    """Check that a failing heartbeat lookup is tolerated at first but not indefinitely.

    ``dagster.daemon.controller.get_daemon_status`` is patched to always raise. The first
    ``check_daemon_heartbeats`` call is expected to return normally; after sleeping for
    twice the heartbeat tolerance, the next call is expected to raise with the
    "no longer sending heartbeats" message.
    """
    interval_seconds = 1
    tolerance_seconds = 5
    expected_failure = "Stopping dagster-daemon process since the following threads are no longer sending heartbeats"

    with instance_for_test() as instance:
        # Every status lookup made through the controller module now raises.
        mocker.patch(
            "dagster.daemon.controller.get_daemon_status",
            side_effect=Exception("Transient heartbeat failure"),
        )

        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=interval_seconds,
            heartbeat_tolerance_seconds=tolerance_seconds,
        ) as controller:
            # The first check must not raise even though the lookup fails.
            controller.check_daemon_heartbeats()

            # Wait well past the tolerance window before checking again.
            time.sleep(2 * tolerance_seconds)

            with pytest.raises(Exception, match=expected_failure):
                controller.check_daemon_heartbeats()
def test_multiple_error_daemon(monkeypatch):
    """Check that errors yielded by a daemon's core loop are reported on its heartbeat.

    ``SensorDaemon.core_loop`` is replaced with a generator that yields two
    ``SerializableErrorInfo`` objects and then keeps yielding ``None``. The test polls for
    up to 10 seconds until the sensor daemon is reported unhealthy with exactly two
    heartbeat errors, then checks their messages are "bizbuz" followed by "foobar".
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        def two_errors_then_idle(_, _instance, _workspace):
            # SerializableErrorInfo positional arguments: message, stack, cls_name, cause
            yield SerializableErrorInfo("foobar", None, None, None)
            yield SerializableErrorInfo("bizbuz", None, None, None)

            while True:
                yield
                time.sleep(0.5)

        monkeypatch.setattr(SensorDaemon, "core_loop", two_errors_then_idle)

        started_at = pendulum.now("UTC")
        interval_seconds = 1

        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=interval_seconds,
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                if all_daemons_live(instance, heartbeat_interval_seconds=interval_seconds):
                    # Despite the errors, the daemon threads should still be running
                    controller.check_daemon_threads()
                    controller.check_daemon_heartbeats()

                    sensor_type = SensorDaemon.daemon_type()
                    statuses = get_daemon_statuses(instance, [sensor_type], now.float_timestamp)
                    status = statuses[sensor_type]

                    # Explicit comparison kept: unlike ``not status.healthy`` it does not
                    # match a ``None`` value.
                    if status.healthy == False and len(status.last_heartbeat.errors) == 2:
                        first_error, second_error = status.last_heartbeat.errors
                        assert first_error.message.strip() == "bizbuz"
                        assert second_error.message.strip() == "foobar"
                        break

                elapsed_seconds = (now - started_at).total_seconds()
                if elapsed_seconds > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)
def test_thread_die_daemon(monkeypatch):
    """Check that the controller reports a daemon thread that has stopped running.

    ``SensorDaemon.core_loop`` is replaced with a generator function that records that it
    ran and then raises ``KeyboardInterrupt``. Once that has happened and the scheduler
    daemon is reported healthy, ``check_daemon_threads`` is polled until it raises an
    error naming the ``SENSOR`` thread. The test gives up after 20 seconds.
    """
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SchedulerDaemon, SensorDaemon

        # Mutable flag shared with the patched core loop.
        core_loop_state = {"ran": False}

        def interrupted_core_loop(_, _instance, _workspace):
            core_loop_state["ran"] = True
            raise KeyboardInterrupt
            yield  # pylint: disable=unreachable

        monkeypatch.setattr(SensorDaemon, "core_loop", interrupted_core_loop)

        interval_seconds = 1
        expected_message = "Stopping dagster-daemon process since the following threads are no longer running: ['SENSOR']"

        started_at = pendulum.now("UTC")
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=interval_seconds,
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                scheduler_statuses = get_daemon_statuses(
                    instance,
                    [SchedulerDaemon.daemon_type()],
                    now.float_timestamp,
                    heartbeat_interval_seconds=interval_seconds,
                )
                scheduler_status = scheduler_statuses[SchedulerDaemon.daemon_type()]

                if core_loop_state["ran"] and scheduler_status.healthy:
                    # Should eventually throw since the sensor thread is interrupted
                    thread_error = None
                    try:
                        controller.check_daemon_threads()
                    except Exception as exc:
                        thread_error = exc

                    if thread_error is not None:
                        assert expected_message in str(thread_error)
                        break

                if (now - started_at).total_seconds() > 20:
                    raise Exception("timed out waiting for check_daemons to fail")

                time.sleep(0.5)
示例#4
0
def test_scheduler_instance():
    """Check the daemons created for an instance configured with ``DagsterDaemonScheduler``.

    The controller is expected to hold three daemons, at least one of which is a
    ``SchedulerDaemon``.
    """
    scheduler_overrides = {
        "scheduler": {
            "module": "dagster.core.scheduler",
            "class": "DagsterDaemonScheduler",
        },
    }

    with instance_for_test(overrides=scheduler_overrides) as instance:
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
        ) as controller:
            daemons = controller.daemons

            assert len(daemons) == 3

            scheduler_daemons = [
                daemon for daemon in daemons if isinstance(daemon, SchedulerDaemon)
            ]
            assert scheduler_daemons
示例#5
0
def test_run_coordinator_instance():
    """Check the daemons created for an instance configured with ``QueuedRunCoordinator``.

    The controller is expected to hold four daemons, at least one of which is a
    ``QueuedRunCoordinatorDaemon``.
    """
    coordinator_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }

    with instance_for_test(overrides=coordinator_overrides) as instance:
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
        ) as controller:
            daemons = controller.daemons

            assert len(daemons) == 4

            coordinator_daemons = [
                daemon for daemon in daemons if isinstance(daemon, QueuedRunCoordinatorDaemon)
            ]
            assert coordinator_daemons
示例#6
0
def test_healthy_with_different_daemons():
    """Check that a running controller does not make a separate instance look healthy.

    A controller is started for a default test instance. While it runs, a second
    instance configured with ``QueuedRunCoordinator`` is created, and that second
    instance must report neither healthy nor live daemons.
    """
    coordinator_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }

    with instance_for_test() as instance:
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
        ):
            with instance_for_test(overrides=coordinator_overrides) as other_instance:
                current_timestamp = pendulum.now("UTC").float_timestamp

                healthy = all_daemons_healthy(
                    other_instance, curr_time_seconds=current_timestamp
                )
                assert not healthy

                live = all_daemons_live(other_instance, curr_time_seconds=current_timestamp)
                assert not live
示例#7
0
def workspace_fixture():
    """Yield a test daemon workspace created from an empty workspace target.

    The workspace context manager stays open while the consumer uses the yielded value
    and is exited when the generator is resumed or closed.
    """
    empty_target = EmptyWorkspaceTarget()
    with create_test_daemon_workspace(workspace_load_target=empty_target) as daemon_workspace:
        yield daemon_workspace
示例#8
0
def test_warn_multiple_daemons(capsys):
    """Check when the "Another SENSOR daemon is still sending heartbeats" warning is printed.

    Three phases run against the same instance:

    1. A single controller runs until all daemons are live; the warning must not appear
       in captured stdout.
    2. After that controller's context exits, a second controller is started; once a new
       sensor heartbeat timestamp is observed, the warning must still not appear.
    3. While the second controller is still running, a third one is started; captured
       stdout is polled (for up to 60 seconds) until the warning appears.
    """
    from dagster.daemon.daemon import SensorDaemon

    with instance_for_test() as instance:
        init_time = pendulum.now("UTC")

        heartbeat_interval_seconds = 1

        # Phase 1: a single controller; wait (up to 10 seconds) for all daemons to be live.
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=heartbeat_interval_seconds,
        ):
            while True:
                now = pendulum.now("UTC")

                if all_daemons_live(
                    instance, heartbeat_interval_seconds=heartbeat_interval_seconds
                ):
                    # readouterr() returns what was captured so far and resets the buffer
                    captured = capsys.readouterr()
                    assert "Another SENSOR daemon is still sending heartbeats" not in captured.out
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for daemon status")

                time.sleep(0.5)

            # Reset the capture buffer again before leaving the first controller's context
            capsys.readouterr()

        # Restart the timeout clock for phase 2
        init_time = pendulum.now("UTC")

        # `now` still holds the timestamp from the last iteration of the loop above
        status = get_daemon_status(
            instance,
            SensorDaemon.daemon_type(),
            now.float_timestamp,
            heartbeat_interval_seconds=heartbeat_interval_seconds,
        )
        # Remember the last heartbeat so a newer one can be detected below
        last_heartbeat_time = status.last_heartbeat.timestamp

        # No warning when a second controller starts up again
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=heartbeat_interval_seconds,
        ):
            while True:
                now = pendulum.now("UTC")

                status = get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    now.float_timestamp,
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                )

                # A changed timestamp means a new heartbeat was written since phase 1
                if status.last_heartbeat and status.last_heartbeat.timestamp != last_heartbeat_time:
                    captured = capsys.readouterr()
                    assert "Another SENSOR daemon is still sending heartbeats" not in captured.out
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for new daemon status")

                time.sleep(0.5)

            status = get_daemon_status(
                instance,
                SensorDaemon.daemon_type(),
                now.float_timestamp,
                heartbeat_interval_seconds=heartbeat_interval_seconds,
            )
            # NOTE(review): this value is reassigned here but not read again in this test
            last_heartbeat_time = status.last_heartbeat.timestamp

            # Starting up a controller while one is running produces the warning though
            with daemon_controller_from_instance(
                instance,
                workspace_load_target=EmptyWorkspaceTarget(),
                heartbeat_interval_seconds=heartbeat_interval_seconds,
            ):
                # Wait for heartbeats while two controllers are running at once and there will
                # be a warning
                init_time = pendulum.now("UTC")

                while True:
                    now = pendulum.now("UTC")

                    # Each read only sees output produced since the previous read
                    captured = capsys.readouterr()
                    if "Another SENSOR daemon is still sending heartbeats" in captured.out:
                        break

                    if (now - init_time).total_seconds() > 60:
                        raise Exception("timed out waiting for heartbeats")

                    time.sleep(5)
示例#9
0
def test_error_daemon(monkeypatch):
    """Check how repeated core-loop errors are recorded on, and cleared from, a heartbeat.

    ``SensorDaemon.core_loop`` is replaced with a generator function that, while
    ``should_raise_errors`` is true, sleeps briefly and raises a numbered
    ``DagsterInvariantViolationError`` ("foobar:<n>"). The test then verifies that:

    * the daemon is reported unhealthy, yet healthy when ``ignore_errors=True`` is passed;
    * the heartbeat carries exactly five errors with consecutively decreasing numbers,
      and the first (highest) number keeps growing over time;
    * once errors stop being raised, the heartbeat's error list becomes empty again
      within 15 seconds.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        # Read by run_loop_error through its closure; rebound to False further down
        should_raise_errors = True

        # Mutable counter so the nested function can update it without `nonlocal`
        error_count = {"count": 0}

        def run_loop_error(_, _instance, _workspace):
            if should_raise_errors:
                time.sleep(0.5)
                error_count["count"] = error_count["count"] + 1
                raise DagsterInvariantViolationError("foobar:" + str(error_count["count"]))

            while True:
                yield
                time.sleep(0.5)

        def _get_error_number(error):
            # Extract <n> from a message containing "foobar:<n>"
            error_message = error.message.strip()
            return int(error_message.split("foobar:")[1])

        monkeypatch.setattr(SensorDaemon, "core_loop", run_loop_error)

        heartbeat_interval_seconds = 1

        # Run only the sensor daemon in this controller
        gen_daemons = lambda instance: [SensorDaemon()]

        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=heartbeat_interval_seconds,
            gen_daemons=gen_daemons,
            error_interval_seconds=10,
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                # Proceed only once the daemon counts as healthy when errors are ignored
                if get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                    ignore_errors=True,
                ).healthy:
                    # Despite error, daemon should still be running
                    controller.check_daemon_threads()
                    controller.check_daemon_heartbeats()

                    status = get_daemon_status(
                        instance,
                        SensorDaemon.daemon_type(),
                        now.float_timestamp,
                        heartbeat_interval_seconds=heartbeat_interval_seconds,
                    )

                    assert status.healthy == False

                    # Wait until five errors are recorded and the first one's number is
                    # above 5, i.e. more errors have been raised than are being kept
                    if len(status.last_heartbeat.errors) >= 5:

                        first_error_number = _get_error_number(status.last_heartbeat.errors[0])

                        if first_error_number > 5:

                            # Verify error numbers decrease consecutively
                            assert [
                                _get_error_number(error) for error in status.last_heartbeat.errors
                            ] == list(range(first_error_number, first_error_number - 5, -1))

                            # Unhealthy when errors count, healthy when they are ignored
                            assert not get_daemon_status(
                                instance,
                                SensorDaemon.daemon_type(),
                                curr_time_seconds=now.float_timestamp,
                                heartbeat_interval_seconds=heartbeat_interval_seconds,
                            ).healthy
                            assert get_daemon_status(
                                instance,
                                SensorDaemon.daemon_type(),
                                curr_time_seconds=now.float_timestamp,
                                heartbeat_interval_seconds=heartbeat_interval_seconds,
                                ignore_errors=True,
                            ).healthy

                            # Give the patched loop time to raise further errors
                            time.sleep(3)

                            status = get_daemon_status(
                                instance,
                                SensorDaemon.daemon_type(),
                                now.float_timestamp,
                                heartbeat_interval_seconds=heartbeat_interval_seconds,
                            )

                            # Error count does not rise above 5, continues to increase
                            assert len(status.last_heartbeat.errors) == 5

                            new_first_error_number = _get_error_number(
                                status.last_heartbeat.errors[0]
                            )

                            assert new_first_error_number > first_error_number

                            break

                if (now - init_time).total_seconds() > 15:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)

            # Once the sensor no longer raises errors, they should return to 0 once
            # enough time passes
            should_raise_errors = False
            init_time = pendulum.now("UTC")

            while True:
                now = pendulum.now("UTC")

                status = get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    now.float_timestamp,
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                )

                # Done as soon as the heartbeat no longer reports any errors
                if len(status.last_heartbeat.errors) == 0:
                    break

                if (now - init_time).total_seconds() > 15:
                    raise Exception("timed out waiting for hearrteat errors to return to 0")

                time.sleep(0.5)
示例#10
0
def test_healthy():
    """Check daemon health and liveness before, during and "after" a controller run.

    Before a controller is started, the instance must be neither healthy nor live. With
    a controller running, the test polls (up to 10 seconds) until both checks pass, and
    then verifies that both fail again when evaluated at a timestamp more than
    ``DEFAULT_DAEMON_HEARTBEAT_TOLERANCE_SECONDS`` in the future.
    """
    coordinator_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }
    interval_seconds = 1

    with instance_for_test(overrides=coordinator_overrides) as instance:
        started_at = pendulum.now("UTC")

        def healthy_at(timestamp):
            # Health of all daemons as evaluated at the given time
            return all_daemons_healthy(
                instance,
                curr_time_seconds=timestamp,
                heartbeat_interval_seconds=interval_seconds,
            )

        def live_at(timestamp):
            # Liveness of all daemons as evaluated at the given time
            return all_daemons_live(
                instance,
                curr_time_seconds=timestamp,
                heartbeat_interval_seconds=interval_seconds,
            )

        # No controller has been started for this instance yet
        assert not healthy_at(started_at.float_timestamp)
        assert not live_at(started_at.float_timestamp)

        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=interval_seconds,
        ) as controller:
            while True:
                now = pendulum.now("UTC")
                current_timestamp = now.float_timestamp

                if healthy_at(current_timestamp) and live_at(current_timestamp):
                    controller.check_daemon_threads()
                    controller.check_daemon_heartbeats()

                    # Evaluate again at a time just past the heartbeat tolerance
                    stale_timestamp = (
                        current_timestamp + DEFAULT_DAEMON_HEARTBEAT_TOLERANCE_SECONDS + 1
                    )
                    assert not healthy_at(stale_timestamp)
                    assert not live_at(stale_timestamp)
                    break

                if (now - started_at).total_seconds() > 10:
                    raise Exception("timed out waiting for instance to become healthy")

                time.sleep(0.5)
def workspace_fixture(instance):
    """Yield a test daemon workspace for ``instance`` built from an empty workspace target.

    The workspace context manager stays open while the consumer uses the yielded value
    and is exited when the generator is resumed or closed.
    """
    empty_target = EmptyWorkspaceTarget()
    with create_test_daemon_workspace(
        workspace_load_target=empty_target,
        instance=instance,
    ) as daemon_workspace:
        yield daemon_workspace
示例#12
0
def created_workspace_load_target(kwargs):
    """Translate CLI keyword arguments into a workspace load target.

    When none of ``WORKSPACE_CLI_ARGS`` is set, the result is an ``EmptyWorkspaceTarget``
    (if ``empty_workspace`` is truthy) or a ``WorkspaceFileTarget`` for ``workspace.yaml``
    in the current directory; if that file does not exist a ``click.UsageError`` is raised.

    Otherwise the first truthy key among ``workspace``, ``python_file``, ``module_name``,
    ``package_name``, ``grpc_port`` and ``grpc_socket`` selects the target type. Each
    branch first passes the argument names that do not belong with it to
    ``_check_cli_arguments_none``. If no key matches, ``_cli_load_invariant(False)`` is
    called.

    Args:
        kwargs: dict of CLI argument names to values.

    Returns:
        The target object for the selected branch.
    """
    check.dict_param(kwargs, "kwargs")

    if are_all_keys_empty(kwargs, WORKSPACE_CLI_ARGS):
        if kwargs.get("empty_workspace"):
            return EmptyWorkspaceTarget()
        if not os.path.exists("workspace.yaml"):
            raise click.UsageError("No arguments given and workspace.yaml not found.")
        return WorkspaceFileTarget(paths=["workspace.yaml"])

    # Groups of argument names that several branches pass to the exclusivity check.
    directory_args = ("working_directory", "empty_working_directory")
    grpc_args = ("grpc_host", "grpc_port", "grpc_socket")

    if kwargs.get("workspace"):
        _check_cli_arguments_none(
            kwargs,
            "python_file",
            *directory_args,
            "module_name",
            "package_name",
            "attribute",
            *grpc_args,
        )
        return WorkspaceFileTarget(paths=list(kwargs["workspace"]))

    if kwargs.get("python_file"):
        _check_cli_arguments_none(kwargs, "module_name", "package_name", *grpc_args)
        working_directory = get_working_directory_from_kwargs(kwargs)
        return PythonFileTarget(
            python_file=kwargs.get("python_file"),
            attribute=kwargs.get("attribute"),
            working_directory=working_directory,
            location_name=None,
        )

    if kwargs.get("module_name"):
        _check_cli_arguments_none(kwargs, "package_name", *directory_args, *grpc_args)
        return ModuleTarget(
            module_name=kwargs.get("module_name"),
            attribute=kwargs.get("attribute"),
            location_name=None,
        )

    if kwargs.get("package_name"):
        _check_cli_arguments_none(kwargs, *directory_args, *grpc_args)
        return PackageTarget(
            package_name=kwargs.get("package_name"),
            attribute=kwargs.get("attribute"),
            location_name=None,
        )

    if kwargs.get("grpc_port"):
        _check_cli_arguments_none(kwargs, "attribute", *directory_args, "grpc_socket")
        return GrpcServerTarget(
            port=kwargs.get("grpc_port"),
            socket=None,
            # Fall back to localhost when no host is given
            host=kwargs.get("grpc_host") or "localhost",
            location_name=None,
        )

    if kwargs.get("grpc_socket"):
        _check_cli_arguments_none(kwargs, "attribute", *directory_args)
        return GrpcServerTarget(
            port=None,
            socket=kwargs.get("grpc_socket"),
            # Fall back to localhost when no host is given
            host=kwargs.get("grpc_host") or "localhost",
            location_name=None,
        )

    # No recognised combination of arguments was supplied.
    _cli_load_invariant(False)
示例#13
0
def get_workspace_load_target(kwargs: Dict[str, str]):
    """Translate CLI keyword arguments into a workspace load target.

    When none of ``WORKSPACE_CLI_ARGS`` is set, the result is an ``EmptyWorkspaceTarget``
    (if ``empty_workspace`` is truthy) or a ``WorkspaceFileTarget`` for ``workspace.yaml``
    in the current directory; if that file does not exist a ``click.UsageError`` is raised.

    Otherwise the first truthy key among ``workspace``, ``python_file``, ``module_name``,
    ``package_name``, ``grpc_port`` and ``grpc_socket`` selects the target type. Each
    branch first passes the argument names that do not belong with it to
    ``_check_cli_arguments_none``, then reads its values through the ``check.*_elem``
    helpers. If no key matches, ``_cli_load_invariant(False)`` is called.

    Args:
        kwargs: dict of CLI argument names to values.

    Returns:
        The target object for the selected branch.
    """
    check.dict_param(kwargs, "kwargs")

    if are_all_keys_empty(kwargs, WORKSPACE_CLI_ARGS):
        if kwargs.get("empty_workspace"):
            return EmptyWorkspaceTarget()
        if not os.path.exists("workspace.yaml"):
            raise click.UsageError("No arguments given and workspace.yaml not found.")
        return WorkspaceFileTarget(paths=["workspace.yaml"])

    # Argument names that every non-gRPC branch passes to the exclusivity check.
    grpc_args = ("grpc_host", "grpc_port", "grpc_socket")

    if kwargs.get("workspace"):
        _check_cli_arguments_none(
            kwargs,
            "python_file",
            "working_directory",
            "module_name",
            "package_name",
            "attribute",
            *grpc_args,
        )
        workspace_paths = cast(Union[List, Tuple], kwargs.get("workspace"))
        return WorkspaceFileTarget(paths=list(workspace_paths))

    if kwargs.get("python_file"):
        _check_cli_arguments_none(kwargs, "module_name", "package_name", *grpc_args)
        working_directory = get_working_directory_from_kwargs(kwargs)
        return PythonFileTarget(
            python_file=check.str_elem(kwargs, "python_file"),
            attribute=check.opt_str_elem(kwargs, "attribute"),
            working_directory=working_directory,
            location_name=None,
        )

    if kwargs.get("module_name"):
        _check_cli_arguments_none(kwargs, "package_name", *grpc_args)
        working_directory = get_working_directory_from_kwargs(kwargs)
        return ModuleTarget(
            module_name=check.str_elem(kwargs, "module_name"),
            attribute=check.opt_str_elem(kwargs, "attribute"),
            working_directory=working_directory,
            location_name=None,
        )

    if kwargs.get("package_name"):
        _check_cli_arguments_none(kwargs, *grpc_args)
        working_directory = get_working_directory_from_kwargs(kwargs)
        return PackageTarget(
            package_name=check.str_elem(kwargs, "package_name"),
            attribute=check.opt_str_elem(kwargs, "attribute"),
            working_directory=working_directory,
            location_name=None,
        )

    if kwargs.get("grpc_port"):
        _check_cli_arguments_none(kwargs, "attribute", "working_directory", "grpc_socket")
        grpc_port = check.int_elem(kwargs, "grpc_port")
        # Fall back to localhost when no host is given
        grpc_host = check.opt_str_elem(kwargs, "grpc_host") or "localhost"
        return GrpcServerTarget(
            port=grpc_port,
            socket=None,
            host=grpc_host,
            location_name=None,
        )

    if kwargs.get("grpc_socket"):
        _check_cli_arguments_none(kwargs, "attribute", "working_directory")
        grpc_socket = check.str_elem(kwargs, "grpc_socket")
        # Fall back to localhost when no host is given
        grpc_host = check.opt_str_elem(kwargs, "grpc_host") or "localhost"
        return GrpcServerTarget(
            port=None,
            socket=grpc_socket,
            host=grpc_host,
            location_name=None,
        )

    # No recognised combination of arguments was supplied.
    _cli_load_invariant(False)
    # necessary for pyright, does not understand _cli_load_invariant(False) never returns
    assert False