Exemplo n.º 1
0
def test_get_required_daemon_types():
    """Check which daemon types an instance reports as required.

    A default test instance is expected to report the sensor, backfill and
    scheduler daemon types (in that order). With run monitoring enabled in
    the overrides, the monitoring daemon type is expected at the end of the
    same list.
    """
    from dagster.daemon.daemon import (
        BackfillDaemon,
        MonitoringDaemon,
        SchedulerDaemon,
        SensorDaemon,
    )

    # Daemon types expected in both configurations, in the asserted order.
    base_types = [
        SensorDaemon.daemon_type(),
        BackfillDaemon.daemon_type(),
        SchedulerDaemon.daemon_type(),
    ]

    with instance_for_test() as instance:
        assert instance.get_required_daemon_types() == base_types

    monitoring_overrides = {
        "run_launcher": {
            "module": "dagster_tests.daemon_tests.test_monitoring_daemon",
            "class": "TestRunLauncher",
        },
        "run_monitoring": {"enabled": True},
    }
    with instance_for_test(overrides=monitoring_overrides) as instance:
        assert instance.get_required_daemon_types() == base_types + [
            MonitoringDaemon.daemon_type()
        ]
Exemplo n.º 2
0
    def test_add_heartbeat(self, storage):
        """Verify that adding a heartbeat inserts it and a second add replaces it.

        Two sensor-daemon heartbeats with different timestamps are written in
        turn. After each write the storage must hold exactly one heartbeat,
        and the one stored under the sensor daemon type must equal the
        heartbeat that was just written.
        """
        self._skip_in_memory(storage)

        # First pass exercises the insert path, second pass the update path.
        for epoch_seconds in (1000, 2000):
            heartbeat = DaemonHeartbeat(
                timestamp=pendulum.from_timestamp(epoch_seconds).float_timestamp,
                daemon_type=SensorDaemon.daemon_type(),
                daemon_id=None,
                errors=[],
            )
            storage.add_daemon_heartbeat(heartbeat)

            assert len(storage.get_daemon_heartbeats()) == 1
            heartbeats_by_type = storage.get_daemon_heartbeats()
            assert heartbeats_by_type[SensorDaemon.daemon_type()] == heartbeat
Exemplo n.º 3
0
def debug_daemon_heartbeats(instance):
    """Write a sensor daemon heartbeat and print it next to the value read back.

    The heartbeat is stamped with the current UTC time, stored through
    ``instance.add_daemon_heartbeat`` and then fetched again with
    ``instance.get_daemon_heartbeats`` so both timestamps can be compared
    by eye.
    """
    sensor_daemon = SensorDaemon()
    timestamp = pendulum.now("UTC").float_timestamp

    heartbeat = DaemonHeartbeat(timestamp, sensor_daemon.daemon_type(), None, None)
    instance.add_daemon_heartbeat(heartbeat)

    stored_heartbeats = instance.get_daemon_heartbeats()
    returned_timestamp = stored_heartbeats[sensor_daemon.daemon_type()].timestamp

    print(  # pylint: disable=print-call
        f"Written timestamp: {timestamp}\nRead timestamp: {returned_timestamp}"
    )
Exemplo n.º 4
0
def debug_daemon_heartbeats(instance):
    """Write a sensor daemon heartbeat to ``instance`` and print the round trip.

    A heartbeat stamped with the current UTC time is stored via
    ``instance.add_daemon_heartbeat`` and read back via
    ``instance.get_daemon_heartbeats``; the written and the read timestamp
    are printed so any difference introduced by storage is visible.

    Args:
        instance: Object used to construct the ``SensorDaemon`` and to store
            and fetch the heartbeat.
    """
    daemon = SensorDaemon(instance, interval_seconds=DEFAULT_DAEMON_INTERVAL_SECONDS)
    timestamp = pendulum.now("UTC").float_timestamp
    instance.add_daemon_heartbeat(DaemonHeartbeat(timestamp, daemon.daemon_type(), None, None))
    returned_timestamp = instance.get_daemon_heartbeats()[daemon.daemon_type()].timestamp
    # The label previously read "timetstamp"; corrected to "timestamp".
    print(  # pylint: disable=print-call
        f"Written timestamp: {timestamp}\nRead timestamp: {returned_timestamp}"
    )
Exemplo n.º 5
0
def create_daemon_of_type(daemon_type):
    """Build the daemon whose ``daemon_type()`` equals ``daemon_type``.

    The scheduler, sensor and queued-run-coordinator daemon classes are
    checked in that order; the first match is created through its
    ``create_from_instance`` constructor with a fresh ``DagsterInstance.get()``.

    Raises:
        Exception: if ``daemon_type`` matches none of the known classes.
    """
    candidate_classes = (SchedulerDaemon, SensorDaemon, QueuedRunCoordinatorDaemon)

    for daemon_cls in candidate_classes:
        if daemon_type == daemon_cls.daemon_type():
            return daemon_cls.create_from_instance(DagsterInstance.get())

    raise Exception("Unexpected daemon type {daemon_type}".format(
        daemon_type=daemon_type))
def test_multiple_error_daemon(monkeypatch):
    """Check that several errors yielded by the sensor daemon loop reach its heartbeat.

    ``SensorDaemon.core_loop`` is replaced with a generator that yields two
    ``SerializableErrorInfo`` objects and then idles. The test polls the
    daemon status until it is unhealthy with exactly two heartbeat errors,
    and fails with a timeout if that has not happened after 10 seconds.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_loop_error(_, _instance, _workspace):
            # Positional arguments are: message, stack, cls_name, cause
            yield SerializableErrorInfo("foobar", None, None, None)
            yield SerializableErrorInfo("bizbuz", None, None, None)

            # After reporting both errors, keep yielding nothing forever.
            while True:
                yield
                time.sleep(0.5)

        monkeypatch.setattr(SensorDaemon, "core_loop", run_loop_error)

        # Reference point for the 10 second timeout below.
        init_time = pendulum.now("UTC")

        heartbeat_interval_seconds = 1

        with daemon_controller_from_instance(
                instance,
                workspace_load_target=EmptyWorkspaceTarget(),
                heartbeat_interval_seconds=heartbeat_interval_seconds,
        ) as controller:
            while True:

                now = pendulum.now("UTC")

                if all_daemons_live(
                        instance,
                        heartbeat_interval_seconds=heartbeat_interval_seconds):

                    # Despite error, daemon should still be running
                    controller.check_daemon_threads()
                    controller.check_daemon_heartbeats()

                    status = get_daemon_statuses(
                        instance, [SensorDaemon.daemon_type()],
                        now.float_timestamp)[SensorDaemon.daemon_type()]

                    # Only inspect the errors once both have been recorded; until
                    # then keep polling.
                    if status.healthy == False and len(
                            status.last_heartbeat.errors) == 2:
                        # The error yielded last ("bizbuz") is expected at index 0.
                        assert status.last_heartbeat.errors[0].message.strip(
                        ) == "bizbuz"
                        assert status.last_heartbeat.errors[1].message.strip(
                        ) == "foobar"
                        break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)
Exemplo n.º 7
0
def create_daemon_of_type(daemon_type, instance):
    """Construct the daemon matching ``daemon_type`` with its polling interval.

    The scheduler and backfill daemons use ``DEFAULT_DAEMON_INTERVAL_SECONDS``,
    the sensor daemon uses ``DEFAULT_SENSOR_DAEMON_INTERVAL`` and the queued
    run coordinator daemon takes its interval from
    ``instance.run_coordinator.dequeue_interval_seconds``.

    Raises:
        Exception: if ``daemon_type`` is not one of the four known types.
    """
    if daemon_type == SchedulerDaemon.daemon_type():
        return SchedulerDaemon(interval_seconds=DEFAULT_DAEMON_INTERVAL_SECONDS)

    if daemon_type == SensorDaemon.daemon_type():
        return SensorDaemon(interval_seconds=DEFAULT_SENSOR_DAEMON_INTERVAL)

    if daemon_type == QueuedRunCoordinatorDaemon.daemon_type():
        # Only touch the run coordinator when this branch is actually taken.
        dequeue_interval = instance.run_coordinator.dequeue_interval_seconds
        return QueuedRunCoordinatorDaemon(interval_seconds=dequeue_interval)

    if daemon_type == BackfillDaemon.daemon_type():
        return BackfillDaemon(interval_seconds=DEFAULT_DAEMON_INTERVAL_SECONDS)

    raise Exception(f"Unexpected daemon type {daemon_type}")
Exemplo n.º 8
0
    def __init__(self, instance):
        """Register the daemons that ``instance`` is configured for.

        A scheduler daemon is added when the instance scheduler is a
        ``DagsterDaemonScheduler``, a sensor daemon is always added, and a
        queued run coordinator daemon is added when the run coordinator is a
        ``QueuedRunCoordinator``. The resulting daemon class names are logged.

        Raises:
            Exception: if no daemon ended up registered.
        """
        self._instance = instance

        self._daemons = {}

        self._logger = get_default_daemon_logger("dagster-daemon")

        if isinstance(instance.scheduler, DagsterDaemonScheduler):
            scheduler_daemon = SchedulerDaemon(
                instance,
                interval_seconds=30,
                max_catchup_runs=instance.scheduler.max_catchup_runs,
            )
            self._add_daemon(scheduler_daemon)

        sensor_daemon = SensorDaemon(instance, interval_seconds=30)
        self._add_daemon(sensor_daemon)

        if isinstance(instance.run_coordinator, QueuedRunCoordinator):
            coordinator_daemon = QueuedRunCoordinatorDaemon(
                instance,
                interval_seconds=instance.run_coordinator.dequeue_interval_seconds,
                max_concurrent_runs=instance.run_coordinator.max_concurrent_runs,
            )
            self._add_daemon(coordinator_daemon)

        if not self._daemons:
            raise Exception("No daemons configured on the DagsterInstance")

        daemon_class_names = (type(daemon).__name__ for daemon in self.daemons)
        self._logger.info(
            "instance is configured with the following daemons: {}".format(
                _sorted_quoted(daemon_class_names)))
Exemplo n.º 9
0
def required_daemons(instance):
    """
    Return the daemon types required by the instance.

    The sensor daemon type is always included. The scheduler daemon type is
    appended when the instance scheduler is a DagsterDaemonScheduler, and the
    queued run coordinator daemon type when the run coordinator is a
    QueuedRunCoordinator.
    """
    needs_scheduler = isinstance(instance.scheduler, DagsterDaemonScheduler)
    needs_run_queue = isinstance(instance.run_coordinator, QueuedRunCoordinator)

    required = [SensorDaemon.daemon_type()]
    if needs_scheduler:
        required += [SchedulerDaemon.daemon_type()]
    if needs_run_queue:
        required += [QueuedRunCoordinatorDaemon.daemon_type()]
    return required
Exemplo n.º 10
0
    def test_wipe_heartbeats(self, storage):
        """Add one sensor daemon heartbeat, then wipe all heartbeats.

        Nothing is asserted afterwards; the test passes as long as both
        storage calls complete without raising.
        """
        self._skip_in_memory(storage)

        heartbeat_time = pendulum.from_timestamp(1000).float_timestamp
        storage.add_daemon_heartbeat(
            DaemonHeartbeat(
                timestamp=heartbeat_time,
                daemon_type=SensorDaemon.daemon_type(),
                daemon_id=None,
                errors=[],
            )
        )

        storage.wipe_daemon_heartbeats()
Exemplo n.º 11
0
def create_daemons_from_instance(instance):
    """Create one daemon object per daemon type required by ``instance``.

    Each daemon is built through its class's ``create_from_instance`` with
    its own ``DagsterInstance.get()`` result.

    Raises:
        Exception: if a required daemon type matches none of the known classes.
    """
    known_classes = (SchedulerDaemon, SensorDaemon, QueuedRunCoordinatorDaemon)
    daemon_types = required_daemons(instance)

    created = []

    # Separate instance for each daemon since each is in its own thread
    for daemon_type in daemon_types:
        for daemon_cls in known_classes:
            if daemon_type == daemon_cls.daemon_type():
                created.append(
                    daemon_cls.create_from_instance(DagsterInstance.get()))
                break
        else:
            # No class claimed this type.
            raise Exception("Unexpected daemon type {daemon_type}".format(
                daemon_type=daemon_type))

    return created
Exemplo n.º 12
0
    def test_wipe_heartbeats(self, storage):
        """Add one sensor daemon heartbeat, then wipe all heartbeats.

        Skipped when ``self.can_delete_runs()`` is false. Nothing is asserted
        afterwards; the test passes as long as both storage calls complete
        without raising.
        """
        self._skip_in_memory(storage)

        deletion_supported = self.can_delete_runs()
        if not deletion_supported:
            pytest.skip("storage cannot delete")

        heartbeat_time = pendulum.from_timestamp(1000).float_timestamp
        storage.add_daemon_heartbeat(
            DaemonHeartbeat(
                timestamp=heartbeat_time,
                daemon_type=SensorDaemon.daemon_type(),
                daemon_id=None,
                errors=[],
            )
        )

        storage.wipe_daemon_heartbeats()
Exemplo n.º 13
0
    def __init__(self, instance):
        """Register the daemons that ``instance`` is configured for.

        A scheduler daemon is added when the instance scheduler is a
        ``DagsterDaemonScheduler``, a sensor daemon is always added, and a
        queued run coordinator daemon is added when the run coordinator is a
        ``QueuedRunCoordinator``. The registered keys are then checked against
        ``required_daemons(instance)`` and the daemon class names are logged.

        Raises:
            AssertionError: if the registered daemons differ from
                ``required_daemons(instance)``.
            Exception: if no daemon ended up registered.
        """
        self._instance = instance

        # Random identifier generated once per constructed object.
        self._daemon_uuid = str(uuid.uuid4())

        self._daemons = {}
        # Tracking dictionaries, all initially empty.
        self._last_heartbeat_times = {}
        self._last_iteration_times = {}
        self._last_iteration_exceptions = {}
        self._current_iteration_exceptions = {}

        self._logger = get_default_daemon_logger("dagster-daemon")

        if isinstance(instance.scheduler, DagsterDaemonScheduler):
            scheduler_daemon = SchedulerDaemon(
                instance,
                interval_seconds=DEFAULT_DAEMON_INTERVAL_SECONDS,
                max_catchup_runs=instance.scheduler.max_catchup_runs,
            )
            self._add_daemon(scheduler_daemon)

        sensor_daemon = SensorDaemon(instance, interval_seconds=SENSOR_DAEMON_INTERVAL)
        self._add_daemon(sensor_daemon)

        if isinstance(instance.run_coordinator, QueuedRunCoordinator):
            coordinator_daemon = QueuedRunCoordinatorDaemon(
                instance,
                interval_seconds=instance.run_coordinator.dequeue_interval_seconds,
                max_concurrent_runs=instance.run_coordinator.max_concurrent_runs,
                tag_concurrency_limits=instance.run_coordinator.tag_concurrency_limits,
            )
            self._add_daemon(coordinator_daemon)

        expected_types = set(required_daemons(instance))
        assert expected_types == self._daemons.keys()

        if not self._daemons:
            raise Exception("No daemons configured on the DagsterInstance")

        daemon_class_names = (type(daemon).__name__ for daemon in self.daemons)
        self._logger.info(
            "instance is configured with the following daemons: {}".format(
                _sorted_quoted(daemon_class_names)
            )
        )
Exemplo n.º 14
0
def test_error_sensor_daemon(external_repo_context, monkeypatch):
    """Run the sensor daemon loop under frozen time and check its heartbeat errors.

    ``time.sleep`` is replaced so that sleeping advances pendulum's test clock
    instead of blocking. A running sensor job state built from
    ``_get_unloadable_sensor_origin()`` is added, the daemon loop is run until
    65 seconds past the frozen start time, and the "SENSOR" heartbeat is then
    expected to carry exactly ``DAEMON_HEARTBEAT_ERROR_LIMIT`` errors.
    """
    freeze_datetime = to_timezone(
        create_pendulum_time(year=2019, month=2, day=28, tz="UTC"),
        "US/Central")

    # Records every requested sleep duration (not read again in this test).
    sleeps = []

    def fake_sleep(s):
        # Instead of blocking, move pendulum's test clock forward by ``s`` seconds.
        sleeps.append(s)
        pendulum.set_test_now(pendulum.now().add(seconds=s))

    monkeypatch.setattr(time, "sleep", fake_sleep)

    with instance_with_sensors(
            external_repo_context,
            overrides={
                "run_launcher": {
                    "module": "dagster.core.test_utils",
                    "class": "ExplodingRunLauncher",
                },
            },
    ) as (instance, workspace, _external_repo):

        # Context manager handed to run_loop; it always yields the workspace
        # created above and ignores its argument.
        @contextmanager
        def _gen_workspace(_instance):
            yield workspace

        with pendulum.test(freeze_datetime):
            instance.add_job_state(
                JobState(_get_unloadable_sensor_origin(), JobType.SENSOR,
                         JobStatus.RUNNING))
            sensor_daemon = SensorDaemon.create_from_instance(instance)
            daemon_shutdown_event = threading.Event()
            # The shutdown event is never set here; ``until`` is passed as
            # 65 seconds after the frozen start time.
            sensor_daemon.run_loop(
                "my_uuid",
                daemon_shutdown_event,
                _gen_workspace,
                heartbeat_interval_seconds=DEFAULT_HEARTBEAT_INTERVAL_SECONDS,
                error_interval_seconds=DEFAULT_DAEMON_ERROR_INTERVAL_SECONDS,
                until=freeze_datetime.add(seconds=65),
            )

            heartbeats = instance.get_daemon_heartbeats()
            heartbeat = heartbeats["SENSOR"]
            assert heartbeat
            assert heartbeat.errors
            # The number of errors kept on the heartbeat must equal the limit constant.
            assert len(heartbeat.errors) == DAEMON_HEARTBEAT_ERROR_LIMIT
Exemplo n.º 15
0
    def __init__(self, instance):
        """Register the daemons that ``instance`` is configured for.

        A scheduler daemon is added when the instance scheduler is a
        ``DagsterDaemonScheduler``, a sensor daemon is always added, and a
        queued run coordinator daemon is added when the run coordinator is a
        ``QueuedRunCoordinator``. Each daemon's interval comes from
        ``self._get_interval_seconds`` keyed by the daemon class name. The
        registered keys are checked against ``self._expected_daemons(instance)``
        and the daemon class names are logged.

        Raises:
            AssertionError: if the registered daemons differ from the expected ones.
            Exception: if no daemon ended up registered.
        """
        self._instance = instance

        # Random identifier generated once per constructed object.
        self._daemon_uuid = str(uuid.uuid4())

        self._daemons = {}
        self._last_heartbeat_time = None

        self._logger = get_default_daemon_logger("dagster-daemon")

        if isinstance(instance.scheduler, DagsterDaemonScheduler):
            max_catchup_runs = instance.scheduler.max_catchup_runs
            scheduler_interval = self._get_interval_seconds(
                instance, SchedulerDaemon.__name__)
            self._add_daemon(
                SchedulerDaemon(
                    instance,
                    interval_seconds=scheduler_interval,
                    max_catchup_runs=max_catchup_runs,
                ))

        sensor_interval = self._get_interval_seconds(
            instance, SensorDaemon.__name__)
        self._add_daemon(
            SensorDaemon(instance, interval_seconds=sensor_interval))

        if isinstance(instance.run_coordinator, QueuedRunCoordinator):
            max_concurrent_runs = instance.run_coordinator.max_concurrent_runs
            coordinator_interval = self._get_interval_seconds(
                instance, QueuedRunCoordinatorDaemon.__name__)
            self._add_daemon(
                QueuedRunCoordinatorDaemon(
                    instance,
                    interval_seconds=coordinator_interval,
                    max_concurrent_runs=max_concurrent_runs,
                ))

        expected_types = set(self._expected_daemons(instance))
        assert expected_types == self._daemons.keys()

        if not self._daemons:
            raise Exception("No daemons configured on the DagsterInstance")

        daemon_class_names = (type(daemon).__name__ for daemon in self.daemons)
        self._logger.info(
            "instance is configured with the following daemons: {}".format(
                _sorted_quoted(daemon_class_names)))
Exemplo n.º 16
0
def test_error_daemon(monkeypatch):
    """Check that an exception raised by the sensor daemon is recorded on its heartbeat.

    ``SensorDaemon.run_iteration`` is patched to raise, one controller
    iteration is run, and the resulting status must be unhealthy with the
    exception text stored as the heartbeat's error message.
    """
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SensorDaemon

        def _always_fail(_):
            raise DagsterInvariantViolationError("foobar")

        monkeypatch.setattr(SensorDaemon, "run_iteration", _always_fail)

        started_at = pendulum.now("UTC")
        daemon_controller = DagsterDaemonController(instance)
        daemon_controller.run_iteration(started_at)

        status = get_daemon_status(
            instance,
            SensorDaemon.daemon_type(),
            started_at.float_timestamp,
        )
        assert status.healthy == False

        recorded_message = status.last_heartbeat.error.message.strip()
        assert (recorded_message ==
                "dagster.core.errors.DagsterInvariantViolationError: foobar")
Exemplo n.º 17
0
def test_multiple_error_daemon(monkeypatch):
    """Check that every error yielded by the sensor daemon lands on its heartbeat.

    ``SensorDaemon.run_iteration`` is patched with a generator yielding two
    ``SerializableErrorInfo`` objects. After one controller iteration the
    status must be unhealthy and list both messages, "foobar" first.
    """
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SensorDaemon

        def _yield_two_errors(_):
            # SerializableErrorInfo arguments: message, stack, cls_name, cause
            for message in ("foobar", "bizbuz"):
                yield SerializableErrorInfo(message, None, None, None)

        monkeypatch.setattr(SensorDaemon, "run_iteration", _yield_two_errors)

        started_at = pendulum.now("UTC")
        daemon_controller = DagsterDaemonController(instance)
        daemon_controller.run_iteration(started_at)

        status = get_daemon_status(
            instance,
            SensorDaemon.daemon_type(),
            started_at.float_timestamp,
        )
        assert status.healthy == False

        heartbeat_errors = status.last_heartbeat.errors
        assert len(heartbeat_errors) == 2
        first_message = heartbeat_errors[0].message.strip()
        assert first_message == "foobar"
        second_message = heartbeat_errors[1].message.strip()
        assert second_message == "bizbuz"
Exemplo n.º 18
0
def test_error_daemon(monkeypatch):
    """Check that a raising sensor daemon is reported unhealthy but still live.

    ``SensorDaemon.run_iteration`` is patched to raise. The test polls until
    ``all_daemons_live`` is true, then asserts that the sensor status is
    unhealthy with exactly one heartbeat error carrying the exception text.
    It fails with a timeout if the daemons are not live within 10 seconds.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_iteration_error(_, _instance, _daemon_shutdown_event,
                                _grpc_server_registry):
            raise DagsterInvariantViolationError("foobar")
            # The unreachable yield turns this function into a generator, so
            # the exception is raised on iteration rather than at call time.
            yield  # pylint: disable=unreachable

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)

        # Reference point for the 10 second timeout below.
        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
                instance, wait_for_processes_on_exit=True) as controller:
            while True:
                now = pendulum.now("UTC")

                if all_daemons_live(instance):
                    # Despite error, daemon should still be running
                    controller.check_daemons()

                    status = get_daemon_status(instance,
                                               SensorDaemon.daemon_type(),
                                               now.float_timestamp)

                    assert status.healthy == False
                    assert len(status.last_heartbeat.errors) == 1
                    assert (
                        status.last_heartbeat.errors[0].message.strip() ==
                        "dagster.core.errors.DagsterInvariantViolationError: foobar"
                    )
                    # The daemons must count as live but not as healthy.
                    assert not all_daemons_healthy(
                        instance, curr_time_seconds=now.float_timestamp)
                    assert all_daemons_live(
                        instance, curr_time_seconds=now.float_timestamp)
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)
Exemplo n.º 19
0
def test_multiple_error_daemon(monkeypatch):
    """Check that two errors yielded by the sensor daemon both reach its heartbeat.

    ``SensorDaemon.run_iteration`` is patched with a generator yielding two
    ``SerializableErrorInfo`` objects. The test polls until the daemons are
    live and the sensor status is unhealthy, then asserts the heartbeat holds
    both messages with "foobar" first. It fails with a timeout after
    10 seconds.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        def run_iteration_error(_, _instance, _daemon_shutdown_event_,
                                _grpc_server_registry):
            # Positional arguments are: message, stack, cls_name, cause
            yield SerializableErrorInfo("foobar", None, None, None)
            yield SerializableErrorInfo("bizbuz", None, None, None)

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)

        # Reference point for the 10 second timeout below.
        init_time = pendulum.now("UTC")

        with daemon_controller_from_instance(
                instance, wait_for_processes_on_exit=True) as controller:
            while True:

                now = pendulum.now("UTC")

                if all_daemons_live(instance):

                    # Despite error, daemon should still be running
                    controller.check_daemons()

                    status = get_daemon_status(instance,
                                               SensorDaemon.daemon_type(),
                                               now.float_timestamp)

                    # Keep polling while the status is not yet unhealthy.
                    if status.healthy == False:
                        assert len(status.last_heartbeat.errors) == 2
                        assert status.last_heartbeat.errors[0].message.strip(
                        ) == "foobar"
                        assert status.last_heartbeat.errors[1].message.strip(
                        ) == "bizbuz"
                        break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)
Exemplo n.º 20
0
 def test_get_daemon_error(self, graphql_context):
     """Check that a heartbeat error is exposed through the daemon health query.

     A sensor daemon heartbeat carrying a single "foobar" error is stored,
     and the ``sensor`` entry of the ``DAEMON_HEALTH_QUERY`` result must list
     exactly that message. Skipped for ephemeral instances.
     """
     if graphql_context.instance.is_ephemeral:
         pytest.skip(
             "The daemon isn't compatible with an in-memory instance")

     sensor_error = SerializableErrorInfo(
         message="foobar",
         stack=[],
         cls_name=None,
         cause=None,
     )
     heartbeat = DaemonHeartbeat(
         timestamp=100.0,
         daemon_type=SensorDaemon.daemon_type(),
         daemon_id=None,
         errors=[sensor_error],
     )
     graphql_context.instance.add_daemon_heartbeat(heartbeat)

     results = execute_dagster_graphql(graphql_context, DAEMON_HEALTH_QUERY)
     sensor_health = results.data["instance"]["daemonHealth"]["sensor"]
     assert sensor_health == {
         "lastHeartbeatErrors": [{"message": "foobar"}],
     }
Exemplo n.º 21
0
    def test_get_individual_daemons(self, graphql_context):
        """Check the per-daemon health entries returned by the GraphQL query.

        A sensor daemon heartbeat with timestamp 100.0 is stored, then the
        full ``INDIVIDUAL_DAEMON_QUERY`` result is compared against the
        expected sensor, run coordinator and scheduler entries. Whether the
        scheduler entry is required depends on the instance scheduler being
        a ``DagsterDaemonScheduler``. Skipped for ephemeral instances.
        """
        if graphql_context.instance.is_ephemeral:
            pytest.skip(
                "The daemon isn't compatible with an in-memory instance")

        heartbeat = DaemonHeartbeat(
            timestamp=100.0,
            daemon_type=SensorDaemon.daemon_type(),
            daemon_id=None,
            errors=None,
        )
        graphql_context.instance.add_daemon_heartbeat(heartbeat)

        results = execute_dagster_graphql(graphql_context,
                                          INDIVIDUAL_DAEMON_QUERY)

        scheduler_required = isinstance(graphql_context.instance.scheduler,
                                        DagsterDaemonScheduler)
        # A required scheduler without a heartbeat is expected to report
        # False; a scheduler that is not required reports None.
        if scheduler_required:
            scheduler_healthy = False
        else:
            scheduler_healthy = None

        expected_sensor = {
            "daemonType": "SENSOR",
            "required": True,
            "healthy": False,
            "lastHeartbeatTime": 100.0,
        }
        expected_run_coordinator = {
            "daemonType": "QUEUED_RUN_COORDINATOR",
            "required": False,
            "healthy": None,
            "lastHeartbeatTime": None,
        }
        expected_scheduler = {
            "daemonType": "SCHEDULER",
            "required": scheduler_required,
            "healthy": scheduler_healthy,
            "lastHeartbeatTime": None,
        }

        assert results.data == {
            "instance": {
                "daemonHealth": {
                    "id": "daemonHealth",
                    "sensor": expected_sensor,
                    "run_coordinator": expected_run_coordinator,
                    "scheduler": expected_scheduler,
                }
            }
        }
Exemplo n.º 22
0
def test_error_daemon(monkeypatch):
    """Check that a raising sensor daemon is reported unhealthy but still live.

    ``SensorDaemon.run_iteration`` is patched with a generator that raises
    when iterated. After one controller iteration the sensor status must be
    unhealthy with exactly one heartbeat error carrying the exception text,
    while ``all_daemons_live`` still holds and ``all_daemons_healthy`` does not.
    """
    with instance_for_test(overrides={}) as instance:
        from dagster.daemon.daemon import SensorDaemon

        def _failing_iteration(_):
            raise DagsterInvariantViolationError("foobar")
            # Unreachable, but makes this function a generator.
            yield  # pylint: disable=unreachable

        monkeypatch.setattr(SensorDaemon, "run_iteration", _failing_iteration)

        daemon_controller = DagsterDaemonController(instance)
        started_at = pendulum.now("UTC")
        daemon_controller.run_iteration(started_at)

        status = get_daemon_status(
            instance,
            SensorDaemon.daemon_type(),
            started_at.float_timestamp,
        )
        assert status.healthy == False

        heartbeat_errors = status.last_heartbeat.errors
        assert len(heartbeat_errors) == 1
        only_message = heartbeat_errors[0].message.strip()
        assert (only_message ==
                "dagster.core.errors.DagsterInvariantViolationError: foobar")

        check_time = started_at.float_timestamp
        assert not all_daemons_healthy(instance, curr_time_seconds=check_time)
        assert all_daemons_live(instance, curr_time_seconds=check_time)
Exemplo n.º 23
0
def test_warn_multiple_daemons(capsys):
    """Check when the "Taking over from another SENSOR daemon process" warning appears.

    Three phases are exercised against one test instance:

    1. A single controller runs until all daemons are live; the warning must
       not appear in captured stdout.
    2. After that controller has exited, a second one is started; once a new
       heartbeat timestamp is seen the warning must still be absent.
    3. While the second controller is still running, a third one is started;
       the test polls captured stdout until the warning shows up, failing
       after 60 seconds.
    """
    from dagster.daemon.daemon import SensorDaemon

    with instance_for_test() as instance:
        # Reference point for the phase 1 timeout.
        init_time = pendulum.now("UTC")

        heartbeat_interval_seconds = 1

        with daemon_controller_from_instance(
                instance,
                heartbeat_interval_seconds=heartbeat_interval_seconds):
            while True:
                now = pendulum.now("UTC")

                if all_daemons_live(
                        instance,
                        heartbeat_interval_seconds=heartbeat_interval_seconds):
                    captured = capsys.readouterr()
                    assert "Taking over from another SENSOR daemon process" not in captured.out
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for daemon status")

                time.sleep(0.5)

            # Read captured output again and discard it.
            capsys.readouterr()

        # Reset the timeout reference for phase 2.
        init_time = pendulum.now("UTC")

        # Remember the heartbeat timestamp left behind by the first controller
        # (``now`` is the last value assigned inside the loop above).
        status = get_daemon_status(
            instance,
            SensorDaemon.daemon_type(),
            now.float_timestamp,
            heartbeat_interval_seconds=heartbeat_interval_seconds,
        )
        last_heartbeat_time = status.last_heartbeat.timestamp

        # No warning when a second controller starts up again
        with daemon_controller_from_instance(
                instance,
                heartbeat_interval_seconds=heartbeat_interval_seconds):
            while True:
                now = pendulum.now("UTC")

                status = get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    now.float_timestamp,
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                )

                # A heartbeat timestamp different from the remembered one means
                # a new heartbeat has been written since this controller started.
                if status.last_heartbeat and status.last_heartbeat.timestamp != last_heartbeat_time:
                    captured = capsys.readouterr()
                    assert "Taking over from another SENSOR daemon process" not in captured.out
                    break

                if (now - init_time).total_seconds() > 10:
                    raise Exception("timed out waiting for new daemon status")

                time.sleep(0.5)

            status = get_daemon_status(
                instance,
                SensorDaemon.daemon_type(),
                now.float_timestamp,
                heartbeat_interval_seconds=heartbeat_interval_seconds,
            )
            # Reassigned here but not read again before the function ends.
            last_heartbeat_time = status.last_heartbeat.timestamp

            # Starting up a controller while one is running produces the warning though
            with daemon_controller_from_instance(
                    instance,
                    heartbeat_interval_seconds=heartbeat_interval_seconds):
                # Wait for heartbeats while two controllers are running at once and there will
                # be a warning
                init_time = pendulum.now("UTC")

                while True:
                    now = pendulum.now("UTC")

                    # Each read consumes the output captured since the previous
                    # read, so the check looks only at new output.
                    captured = capsys.readouterr()
                    if "Taking over from another SENSOR daemon process" in captured.out:
                        break

                    if (now - init_time).total_seconds() > 60:
                        raise Exception("timed out waiting for heartbeats")

                    time.sleep(5)
Exemplo n.º 24
0
def test_error_daemon(monkeypatch):
    """Check that sensor daemon heartbeat errors accumulate, cap at 5, then clear.

    ``SensorDaemon.run_iteration`` is patched to raise while
    ``should_raise_errors`` is true. The first loop waits until the heartbeat
    holds 5 errors, checks the daemon is unhealthy (but healthy when errors
    are ignored), waits 3 seconds and checks the count is still 5. The flag
    is then cleared and the second loop waits for the error list to become
    empty. Each loop fails with a timeout after 15 seconds.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        # Read by the patched method through its closure; set to False further
        # down to stop the errors.
        should_raise_errors = True

        def run_iteration_error(_, _instance, _workspace):
            if should_raise_errors:
                raise DagsterInvariantViolationError("foobar")
            yield

        monkeypatch.setattr(SensorDaemon, "run_iteration", run_iteration_error)

        heartbeat_interval_seconds = 1

        # Run only a sensor daemon, with a 1 second interval.
        gen_daemons = lambda instance: [SensorDaemon(interval_seconds=1)]

        # Reference point for the first 15 second timeout.
        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
                instance,
                heartbeat_interval_seconds=heartbeat_interval_seconds,
                gen_daemons=gen_daemons,
                error_interval_seconds=10,
        ) as controller:
            while True:
                now = pendulum.now("UTC")

                # Proceed only once the daemon counts as healthy when its
                # errors are ignored.
                if get_daemon_status(
                        instance,
                        SensorDaemon.daemon_type(),
                        heartbeat_interval_seconds=heartbeat_interval_seconds,
                        ignore_errors=True,
                ).healthy:
                    # Despite error, daemon should still be running
                    controller.check_daemon_threads()
                    controller.check_daemon_heartbeats()

                    status = get_daemon_status(
                        instance,
                        SensorDaemon.daemon_type(),
                        now.float_timestamp,
                        heartbeat_interval_seconds=heartbeat_interval_seconds,
                    )

                    assert status.healthy == False

                    # Errors build up until they hit 5
                    if len(status.last_heartbeat.errors) == 5:
                        assert (
                            status.last_heartbeat.errors[0].message.strip() ==
                            "dagster.core.errors.DagsterInvariantViolationError: foobar"
                        )
                        # Unhealthy when errors count, healthy when they are ignored.
                        assert not get_daemon_status(
                            instance,
                            SensorDaemon.daemon_type(),
                            curr_time_seconds=now.float_timestamp,
                            heartbeat_interval_seconds=
                            heartbeat_interval_seconds,
                        ).healthy
                        assert get_daemon_status(
                            instance,
                            SensorDaemon.daemon_type(),
                            curr_time_seconds=now.float_timestamp,
                            heartbeat_interval_seconds=
                            heartbeat_interval_seconds,
                            ignore_errors=True,
                        ).healthy

                        # Wait 3 seconds before re-checking the error count.
                        time.sleep(3)

                        # Note: this reuses ``now`` from before the sleep.
                        status = get_daemon_status(
                            instance,
                            SensorDaemon.daemon_type(),
                            now.float_timestamp,
                            heartbeat_interval_seconds=
                            heartbeat_interval_seconds,
                        )

                        # Error count does not rise above 5
                        assert len(status.last_heartbeat.errors) == 5

                        break

                if (now - init_time).total_seconds() > 15:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)

            # Once the sensor no longer raises errors, they should return to 0 once
            # enough time passes
            should_raise_errors = False
            init_time = pendulum.now("UTC")

            while True:
                now = pendulum.now("UTC")

                status = get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    now.float_timestamp,
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                )

                # Done once the heartbeat no longer reports any errors
                if len(status.last_heartbeat.errors) == 0:
                    break

                if (now - init_time).total_seconds() > 15:
                    # NOTE(review): "hearrteat" in the message below is a typo
                    # for "heartbeat"; left unchanged because it is runtime text.
                    raise Exception(
                        "timed out waiting for hearrteat errors to return to 0"
                    )

                time.sleep(0.5)
Exemplo n.º 25
0
def test_error_daemon(monkeypatch):
    """Check that daemon errors are capped in heartbeats and eventually clear.

    ``SensorDaemon.core_loop`` is monkeypatched with a loop that raises a
    numbered ``DagsterInvariantViolationError`` for as long as
    ``should_raise_errors`` is true. The test then polls the daemon status and
    verifies that:

    * the daemon is reported healthy when errors are ignored and unhealthy
      otherwise;
    * the last heartbeat holds exactly 5 errors, newest first, with
      consecutive error numbers;
    * newer errors keep replacing older ones while the loop keeps failing;
    * once the loop stops raising, the heartbeat's error list becomes empty.

    Args:
        monkeypatch: pytest fixture used to replace ``SensorDaemon.core_loop``.

    Raises:
        Exception: if either polling phase takes longer than 15 seconds.
    """
    with instance_for_test() as instance:
        from dagster.daemon.daemon import SensorDaemon

        # Read by the patched core loop through its closure; set to False
        # further down so the daemon can recover.
        should_raise_errors = True

        # A dict rather than a bare int so the nested function can update the
        # counter in place.
        error_count = {"count": 0}

        def run_loop_error(_, _instance, _workspace):
            # Generator standing in for SensorDaemon.core_loop: because of the
            # ``yield`` below, the raise only fires once the generator is iterated.
            if should_raise_errors:
                time.sleep(0.5)
                error_count["count"] += 1
                raise DagsterInvariantViolationError("foobar:" + str(error_count["count"]))

            while True:
                yield
                time.sleep(0.5)

        def _get_error_number(error):
            # Recover the counter value embedded in a "foobar:<n>" message.
            error_message = error.message.strip()
            return int(error_message.split("foobar:")[1])

        monkeypatch.setattr(SensorDaemon, "core_loop", run_loop_error)

        heartbeat_interval_seconds = 1

        def gen_daemons(instance):
            # The argument is ignored: only the (patched) sensor daemon is run.
            return [SensorDaemon()]

        init_time = pendulum.now("UTC")
        with daemon_controller_from_instance(
            instance,
            workspace_load_target=EmptyWorkspaceTarget(),
            heartbeat_interval_seconds=heartbeat_interval_seconds,
            gen_daemons=gen_daemons,
            error_interval_seconds=10,
        ) as controller:
            # Phase 1: wait until the heartbeat carries a full, rolling window
            # of errors.
            while True:
                now = pendulum.now("UTC")

                if get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                    ignore_errors=True,
                ).healthy:
                    # Despite error, daemon should still be running
                    controller.check_daemon_threads()
                    controller.check_daemon_heartbeats()

                    status = get_daemon_status(
                        instance,
                        SensorDaemon.daemon_type(),
                        now.float_timestamp,
                        heartbeat_interval_seconds=heartbeat_interval_seconds,
                    )

                    # With errors taken into account the daemon must be unhealthy.
                    assert status.healthy is False

                    # Errors build up until there are > 5, then pull off the last
                    if len(status.last_heartbeat.errors) >= 5:

                        first_error_number = _get_error_number(status.last_heartbeat.errors[0])

                        # Only proceed once more than 5 errors have been raised,
                        # i.e. older errors have already been dropped.
                        if first_error_number > 5:

                            # Verify error numbers decrease consecutively
                            assert [
                                _get_error_number(error) for error in status.last_heartbeat.errors
                            ] == list(range(first_error_number, first_error_number - 5, -1))

                            assert not get_daemon_status(
                                instance,
                                SensorDaemon.daemon_type(),
                                curr_time_seconds=now.float_timestamp,
                                heartbeat_interval_seconds=heartbeat_interval_seconds,
                            ).healthy
                            assert get_daemon_status(
                                instance,
                                SensorDaemon.daemon_type(),
                                curr_time_seconds=now.float_timestamp,
                                heartbeat_interval_seconds=heartbeat_interval_seconds,
                                ignore_errors=True,
                            ).healthy

                            # Give the failing loop time to produce newer errors.
                            time.sleep(3)

                            # ``now`` is intentionally not refreshed here: only the
                            # error list of this status is inspected below.
                            status = get_daemon_status(
                                instance,
                                SensorDaemon.daemon_type(),
                                now.float_timestamp,
                                heartbeat_interval_seconds=heartbeat_interval_seconds,
                            )

                            # Error count does not rise above 5, continues to increase
                            assert len(status.last_heartbeat.errors) == 5

                            new_first_error_number = _get_error_number(
                                status.last_heartbeat.errors[0]
                            )

                            assert new_first_error_number > first_error_number

                            break

                if (now - init_time).total_seconds() > 15:
                    raise Exception("timed out waiting for heartbeat error")

                time.sleep(0.5)

            # Once the sensor no longer raises errors, they should return to 0 once
            # enough time passes
            should_raise_errors = False
            init_time = pendulum.now("UTC")

            # Phase 2: wait until the heartbeat no longer carries any errors.
            while True:
                now = pendulum.now("UTC")

                status = get_daemon_status(
                    instance,
                    SensorDaemon.daemon_type(),
                    now.float_timestamp,
                    heartbeat_interval_seconds=heartbeat_interval_seconds,
                )

                # Done as soon as the latest heartbeat reports no errors
                if len(status.last_heartbeat.errors) == 0:
                    break

                if (now - init_time).total_seconds() > 15:
                    raise Exception("timed out waiting for hearrteat errors to return to 0")

                time.sleep(0.5)