Пример #1
0
def test_healthy():
    """Check daemon health and liveness around one controller iteration.

    Using an instance configured with ``QueuedRunCoordinator``:

    * before the controller has run, daemons are neither healthy nor live;
    * after ``run_iteration`` at the start time, both checks pass for that
      same timestamp;
    * evaluated 100 seconds after the start time, both checks fail again.
    """
    queued_coordinator_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }
    checks = (all_daemons_healthy, all_daemons_live)

    with instance_for_test(overrides=queued_coordinator_overrides) as instance:
        started_at = pendulum.now("UTC")
        started_ts = started_at.float_timestamp
        stale_ts = started_ts + 100

        controller = DagsterDaemonController(instance)

        # No iteration has run yet, so neither check may pass.
        for check in checks:
            assert not check(instance, curr_time_seconds=started_ts)

        controller.run_iteration(started_at)

        # Evaluated at the iteration's own timestamp, both checks pass.
        for check in checks:
            assert check(instance, curr_time_seconds=started_ts)

        # Evaluated 100 seconds later, both checks fail again.
        for check in checks:
            assert not check(instance, curr_time_seconds=stale_ts)
Пример #2
0
def liveness_check_command():
    """Report whether all daemons are live for the current Dagster instance.

    Echoes "Daemon live" when ``all_daemons_live`` is true; otherwise echoes
    "Daemon(s) not running" and exits the process with status 1.
    """
    with DagsterInstance.get() as instance:
        if not all_daemons_live(instance):
            click.echo("Daemon(s) not running")
            sys.exit(1)

        click.echo("Daemon live")
Пример #3
0
def liveness_check_command():
    """Report whether all daemons are live for the current Dagster instance.

    The heartbeat tolerance passed to ``all_daemons_live`` comes from
    ``_get_heartbeat_tolerance()``. Echoes "Daemon live" on success; otherwise
    echoes "Daemon(s) not running" and exits the process with status 1.
    """
    with DagsterInstance.get() as instance:
        tolerance = _get_heartbeat_tolerance()
        if not all_daemons_live(instance, heartbeat_tolerance_seconds=tolerance):
            click.echo("Daemon(s) not running")
            sys.exit(1)

        click.echo("Daemon live")
Пример #4
0
def test_healthy():
    """Check daemon health and liveness against a running daemon controller.

    Using an instance configured with ``QueuedRunCoordinator``, daemons must be
    neither healthy nor live before a controller is started. Once a controller
    is running, the test polls every 0.5 seconds until both checks pass, then
    verifies that both fail when evaluated 100 seconds later.

    Raises:
        Exception: if the daemons are not healthy and live within 10 seconds.
    """
    queued_coordinator_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }

    with instance_for_test(overrides=queued_coordinator_overrides) as instance:
        started_at = pendulum.now("UTC")
        started_ts = started_at.float_timestamp

        # No controller is running yet.
        assert not all_daemons_healthy(instance, curr_time_seconds=started_ts)
        assert not all_daemons_live(instance, curr_time_seconds=started_ts)

        with daemon_controller_from_instance(
            instance, wait_for_processes_on_exit=True
        ) as controller:
            while True:
                current = pendulum.now("UTC")
                current_ts = current.float_timestamp

                ready = all_daemons_healthy(
                    instance, curr_time_seconds=current_ts
                ) and all_daemons_live(instance, curr_time_seconds=current_ts)

                if not ready:
                    # Keep polling until the 10 second budget is exhausted.
                    if (current - started_at).total_seconds() > 10:
                        raise Exception(
                            "timed out waiting for instance to become healthy"
                        )
                    time.sleep(0.5)
                    continue

                controller.check_daemons()

                # Evaluated 100 seconds later, both checks must fail.
                stale_ts = current_ts + 100
                assert not all_daemons_healthy(instance, curr_time_seconds=stale_ts)
                assert not all_daemons_live(instance, curr_time_seconds=stale_ts)
                break
Пример #5
0
def test_error_daemon(monkeypatch):
    """Verify that a daemon whose iteration raises stays live but unhealthy.

    ``SensorDaemon.run_iteration`` is patched with a generator that raises
    ``DagsterInvariantViolationError("foobar")``. The test polls every 0.5
    seconds until all daemons are live, then asserts that the sensor daemon's
    status is unhealthy with exactly one heartbeat error carrying that message,
    that not all daemons are healthy, and that all daemons are still live.

    Args:
        monkeypatch: pytest fixture used to replace ``SensorDaemon.run_iteration``.

    Raises:
        Exception: if the daemons are not live within 10 seconds.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_iteration_error(_, _instance, _daemon_shutdown_event,
                                _grpc_server_registry):
            raise DagsterInvariantViolationError("foobar")
            # The unreachable ``yield`` keeps this function a generator.
            yield  # pylint: disable=unreachable

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)

        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
                instance, wait_for_processes_on_exit=True) as controller:
            while True:
                now = pendulum.now("UTC")

                if all_daemons_live(instance):
                    # Despite error, daemon should still be running
                    controller.check_daemons()

                    status = get_daemon_status(instance,
                                               SensorDaemon.daemon_type(),
                                               now.float_timestamp)

                    # Identity comparison instead of ``== False`` (PEP 8):
                    # the status must be exactly ``False``.
                    assert status.healthy is False
                    assert len(status.last_heartbeat.errors) == 1
                    assert (
                        status.last_heartbeat.errors[0].message.strip() ==
                        "dagster.core.errors.DagsterInvariantViolationError: foobar"
                    )
                    assert not all_daemons_healthy(
                        instance, curr_time_seconds=now.float_timestamp)
                    assert all_daemons_live(
                        instance, curr_time_seconds=now.float_timestamp)
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)
def test_multiple_error_daemon(monkeypatch):
    """Verify that multiple errors yielded by a daemon reach its heartbeat.

    ``SensorDaemon.core_loop`` is patched with a generator that yields two
    ``SerializableErrorInfo`` objects ("foobar", then "bizbuz") and afterwards
    idles. With a 1 second heartbeat interval, the test polls every 0.5 seconds
    until the sensor daemon's status is unhealthy with two heartbeat errors,
    then asserts they are reported as "bizbuz" followed by "foobar".

    Args:
        monkeypatch: pytest fixture used to replace ``SensorDaemon.core_loop``.

    Raises:
        Exception: if the expected status is not observed within 10 seconds.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_loop_error(_, _instance, _workspace):
            # Positional arguments appear to be (message, stack, cls_name,
            # cause) -- NOTE(review): confirm against SerializableErrorInfo.
            yield SerializableErrorInfo("foobar", None, None, None)
            yield SerializableErrorInfo("bizbuz", None, None, None)

            # Keep the loop alive after the errors have been yielded.
            while True:
                yield
                time.sleep(0.5)

        monkeypatch.setattr(SensorDaemon, "core_loop", run_loop_error)

        init_time = pendulum.now("UTC")

        heartbeat_interval_seconds = 1

        with daemon_controller_from_instance(
                instance,
                workspace_load_target=EmptyWorkspaceTarget(),
                heartbeat_interval_seconds=heartbeat_interval_seconds,
        ) as controller:
            while True:

                now = pendulum.now("UTC")

                if all_daemons_live(
                        instance,
                        heartbeat_interval_seconds=heartbeat_interval_seconds):

                    # Despite error, daemon should still be running
                    controller.check_daemon_threads()
                    controller.check_daemon_heartbeats()

                    status = get_daemon_statuses(
                        instance, [SensorDaemon.daemon_type()],
                        now.float_timestamp)[SensorDaemon.daemon_type()]

                    # Identity comparison instead of ``== False`` (PEP 8).
                    # Only assert once both errors are on the heartbeat;
                    # otherwise keep polling.
                    if status.healthy is False and len(
                            status.last_heartbeat.errors) == 2:
                        assert status.last_heartbeat.errors[0].message.strip(
                        ) == "bizbuz"
                        assert status.last_heartbeat.errors[1].message.strip(
                        ) == "foobar"
                        break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)
def test_healthy_with_different_daemons():
    """Check that a running controller does not make another instance healthy.

    A daemon controller is started for a default test instance. A second test
    instance, configured with ``QueuedRunCoordinator``, must then be reported
    as neither healthy nor live at the current time.
    """
    queued_coordinator_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }

    with instance_for_test() as instance, DagsterDaemonController.create_from_instance(instance):
        with instance_for_test(overrides=queued_coordinator_overrides) as other_instance:
            current_ts = pendulum.now("UTC").float_timestamp

            for check in (all_daemons_healthy, all_daemons_live):
                assert not check(other_instance, curr_time_seconds=current_ts)
Пример #8
0
def test_multiple_error_daemon(monkeypatch):
    """Verify that multiple errors yielded by a daemon reach its heartbeat.

    ``SensorDaemon.run_iteration`` is patched with a generator that yields two
    ``SerializableErrorInfo`` objects ("foobar", then "bizbuz"). The test polls
    every 0.5 seconds until all daemons are live and the sensor daemon's status
    is unhealthy, then asserts that the heartbeat carries exactly those two
    errors in that order.

    Args:
        monkeypatch: pytest fixture used to replace ``SensorDaemon.run_iteration``.

    Raises:
        Exception: if the expected status is not observed within 10 seconds.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_iteration_error(_, _instance, _daemon_shutdown_event_,
                                _grpc_server_registry):
            # Positional arguments appear to be (message, stack, cls_name,
            # cause) -- NOTE(review): confirm against SerializableErrorInfo.
            yield SerializableErrorInfo("foobar", None, None, None)
            yield SerializableErrorInfo("bizbuz", None, None, None)

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)

        init_time = pendulum.now("UTC")

        with daemon_controller_from_instance(
                instance, wait_for_processes_on_exit=True) as controller:
            while True:

                now = pendulum.now("UTC")

                if all_daemons_live(instance):

                    # Despite error, daemon should still be running
                    controller.check_daemons()

                    status = get_daemon_status(instance,
                                               SensorDaemon.daemon_type(),
                                               now.float_timestamp)

                    # Identity comparison instead of ``== False`` (PEP 8):
                    # keep polling until the status is exactly ``False``.
                    if status.healthy is False:
                        assert len(status.last_heartbeat.errors) == 2
                        assert status.last_heartbeat.errors[0].message.strip(
                        ) == "foobar"
                        assert status.last_heartbeat.errors[1].message.strip(
                        ) == "bizbuz"
                        break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)
Пример #9
0
def test_error_daemon(monkeypatch):
    """Verify that a daemon whose iteration raises stays live but unhealthy.

    ``SensorDaemon.run_iteration`` is patched with a generator that raises
    ``DagsterInvariantViolationError("foobar")``. After one controller
    iteration, the sensor daemon's status must be unhealthy with exactly one
    heartbeat error carrying that message; not all daemons are healthy, yet all
    daemons are live at the iteration's timestamp.

    Args:
        monkeypatch: pytest fixture used to replace ``SensorDaemon.run_iteration``.
    """
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_iteration_error(_):
            raise DagsterInvariantViolationError("foobar")
            # The unreachable ``yield`` keeps this function a generator.
            yield  # pylint: disable=unreachable

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)
        controller = DagsterDaemonController(instance)
        init_time = pendulum.now("UTC")
        controller.run_iteration(init_time)

        status = get_daemon_status(instance, SensorDaemon.daemon_type(),
                                   init_time.float_timestamp)
        # Identity comparison instead of ``== False`` (PEP 8): the status must
        # be exactly ``False``.
        assert status.healthy is False
        assert len(status.last_heartbeat.errors) == 1
        assert (status.last_heartbeat.errors[0].message.strip() ==
                "dagster.core.errors.DagsterInvariantViolationError: foobar")
        assert not all_daemons_healthy(
            instance, curr_time_seconds=init_time.float_timestamp)
        assert all_daemons_live(instance,
                                curr_time_seconds=init_time.float_timestamp)
def test_warn_multiple_daemons(capsys):
    """Check when the "Taking over from another SENSOR daemon process" message appears.

    The test runs three phases against one test instance, all with a 1 second
    heartbeat interval:

    1. Start a controller and poll until all daemons are live; the message
       must not be in the captured stdout.
    2. After that controller has exited, start a second one and poll until the
       sensor daemon's heartbeat timestamp changes; the message must still not
       be in the captured stdout.
    3. While the second controller is still running, start a third and poll
       the captured stdout until the message shows up.

    Args:
        capsys: pytest fixture used to read (and drain) captured stdout.

    Raises:
        Exception: if phase 1 or 2 does not complete within 10 seconds, or
            phase 3 within 60 seconds.
    """
    from dagster.daemon.daemon import SensorDaemon

    with instance_for_test() as instance:
        init_time = pendulum.now("UTC")

        heartbeat_interval_seconds = 1

        # Phase 1: a single controller on a fresh instance.
        with daemon_controller_from_instance(
                instance,
                heartbeat_interval_seconds=heartbeat_interval_seconds):
            while True:
                now = pendulum.now("UTC")

                if all_daemons_live(
                        instance,
                        heartbeat_interval_seconds=heartbeat_interval_seconds):
                    captured = capsys.readouterr()
                    assert "Taking over from another SENSOR daemon process" not in captured.out
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for daemon status")

                time.sleep(0.5)

            # Drain any output captured so far so later reads start clean.
            capsys.readouterr()

        # Restart the timeout clock for phase 2.
        init_time = pendulum.now("UTC")

        # Record the latest heartbeat timestamp so a newer heartbeat can be
        # detected below.
        status = get_daemon_status(
            instance,
            SensorDaemon.daemon_type(),
            now.float_timestamp,
            heartbeat_interval_seconds=heartbeat_interval_seconds,
        )
        last_heartbeat_time = status.last_heartbeat.timestamp

        # No warning when a second controller starts up again
        with daemon_controller_from_instance(
                instance,
                heartbeat_interval_seconds=heartbeat_interval_seconds):
            while True:
                now = pendulum.now("UTC")

                status = get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    now.float_timestamp,
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                )

                # Wait for a heartbeat whose timestamp differs from the one
                # recorded before this controller started.
                if status.last_heartbeat and status.last_heartbeat.timestamp != last_heartbeat_time:
                    captured = capsys.readouterr()
                    assert "Taking over from another SENSOR daemon process" not in captured.out
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for new daemon status")

                time.sleep(0.5)

            status = get_daemon_status(
                instance,
                SensorDaemon.daemon_type(),
                now.float_timestamp,
                heartbeat_interval_seconds=heartbeat_interval_seconds,
            )
            # NOTE(review): this value is not read again within this function.
            last_heartbeat_time = status.last_heartbeat.timestamp

            # Starting up a controller while one is running produces the warning though
            with daemon_controller_from_instance(
                    instance,
                    heartbeat_interval_seconds=heartbeat_interval_seconds):
                # Wait for heartbeats while two controllers are running at once and there will
                # be a warning
                init_time = pendulum.now("UTC")

                while True:
                    now = pendulum.now("UTC")

                    # Each read drains the capture buffer, so only output
                    # produced since the previous read is inspected.
                    captured = capsys.readouterr()
                    if "Taking over from another SENSOR daemon process" in captured.out:
                        break

                    if (now - init_time).total_seconds() > 60:
                        raise Exception("timed out waiting for heartbeats")

                    time.sleep(5)
def test_healthy():
    """Check daemon health and liveness with a 1 second heartbeat interval.

    Using an instance configured with ``QueuedRunCoordinator``, daemons must be
    neither healthy nor live before a controller is started. Once a controller
    is running, the test polls every 0.5 seconds until both checks pass, then
    verifies that both fail when evaluated
    ``DEFAULT_DAEMON_HEARTBEAT_TOLERANCE_SECONDS + 1`` seconds later.

    Raises:
        Exception: if the daemons are not healthy and live within 10 seconds.
    """
    interval_seconds = 1
    queued_coordinator_overrides = {
        "run_coordinator": {
            "module": "dagster.core.run_coordinator.queued_run_coordinator",
            "class": "QueuedRunCoordinator",
        },
    }

    with instance_for_test(overrides=queued_coordinator_overrides) as instance:

        def healthy_at(timestamp):
            # Health check evaluated at ``timestamp`` with the test's interval.
            return all_daemons_healthy(
                instance,
                curr_time_seconds=timestamp,
                heartbeat_interval_seconds=interval_seconds,
            )

        def live_at(timestamp):
            # Liveness check evaluated at ``timestamp`` with the test's interval.
            return all_daemons_live(
                instance,
                curr_time_seconds=timestamp,
                heartbeat_interval_seconds=interval_seconds,
            )

        started_at = pendulum.now("UTC")
        started_ts = started_at.float_timestamp

        # No controller is running yet.
        assert not healthy_at(started_ts)
        assert not live_at(started_ts)

        with daemon_controller_from_instance(
            instance, heartbeat_interval_seconds=interval_seconds
        ) as controller:
            while True:
                current = pendulum.now("UTC")
                current_ts = current.float_timestamp

                if not (healthy_at(current_ts) and live_at(current_ts)):
                    # Keep polling until the 10 second budget is exhausted.
                    if (current - started_at).total_seconds() > 10:
                        raise Exception(
                            "timed out waiting for instance to become healthy"
                        )
                    time.sleep(0.5)
                    continue

                controller.check_daemon_threads()
                controller.check_daemon_heartbeats()

                # One second past the default tolerance, both checks must fail.
                expired_ts = (
                    current_ts + DEFAULT_DAEMON_HEARTBEAT_TOLERANCE_SECONDS + 1
                )
                assert not healthy_at(expired_ts)
                assert not live_at(expired_ts)
                break