def test_partitions_outside_schedule_range():
    """Ticks whose partition lies outside the partition set are skipped.

    Both schedules are evaluated for a tick at 2021-01-01 UTC and the
    resulting ``skip_message`` is compared with the exact expected text.
    """
    tick_time = create_pendulum_time(year=2021, month=1, day=1, tz="UTC")
    tick_context = build_schedule_context(scheduled_execution_time=tick_time)

    # Case 1: the partition selected for the tick precedes ``start_date``.
    @monthly_schedule(
        pipeline_name="too early",
        start_date=create_pendulum_time(year=2021, month=1, day=1, tz="UTC"),
    )
    def too_early(monthly_time):
        return {"monthly_time": monthly_time.isoformat()}

    early_result = too_early.evaluate_tick(tick_context)
    expected_early_message = (
        "Your partition (2020-12-01T00:00:00+00:00) is before the beginning of "
        "the partition set (2021-01-01T00:00:00+00:00). "
        "Verify your schedule's start_date is correct."
    )
    assert early_result.skip_message == expected_early_message

    # Case 2: with a zero offset the partition is the tick month itself,
    # which is past ``end_date``.
    @monthly_schedule(
        pipeline_name="too late",
        start_date=create_pendulum_time(year=2020, month=1, day=1, tz="UTC"),
        end_date=create_pendulum_time(year=2020, month=12, day=1, tz="UTC"),
        partition_months_offset=0,
    )
    def too_late(monthly_time):
        return {"monthly_time": monthly_time.isoformat()}

    late_result = too_late.evaluate_tick(tick_context)
    expected_late_message = (
        "Your partition (2021-01-01T00:00:00+00:00) is after the end of "
        "the partition set (2020-12-01T00:00:00+00:00). "
        "Verify your schedule's end_date is correct."
    )
    assert late_result.skip_message == expected_late_message
def test_partitions_for_monthly_schedule_decorators_without_timezone(
    partition_months_offset: int,
):
    """Exercise a timezone-less ``@monthly_schedule`` under a frozen clock.

    The clock is frozen at 2019-03-27T00:01:01 UTC (expressed in US/Eastern).
    The schedule is ticked once with an explicit execution time and once with
    a context that carries none. Both ticks must request exactly one run for
    the March 2019 partition shifted back by ``partition_months_offset``
    months.
    """
    frozen_now = to_timezone(
        create_pendulum_time(2019, 3, 27, 0, 1, 1, tz="UTC"), "US/Eastern"
    )
    with pendulum.test(frozen_now):
        context_without_time = build_schedule_context()
        start_date = datetime(year=2019, month=1, day=1)

        @monthly_schedule(
            pipeline_name="foo_pipeline",
            execution_day_of_month=3,
            start_date=start_date,
            execution_time=time(9, 30),
            partition_months_offset=partition_months_offset,
        )
        def monthly_foo_schedule(monthly_time):
            return {"monthly_time": monthly_time.isoformat()}

        # Both ticks are expected to resolve to the same partition.
        march_partition = create_pendulum_time(year=2019, month=3, day=1, tz="UTC")
        expected_run_config = {
            "monthly_time": march_partition.subtract(
                months=partition_months_offset
            ).isoformat()
        }

        # Tick with an explicit scheduled execution time.
        explicit_tick_time = create_pendulum_time(
            year=2019, month=3, day=3, hour=9, minute=30, tz="UTC"
        )
        explicit_context = build_schedule_context(
            scheduled_execution_time=explicit_tick_time
        )
        tick_result = monthly_foo_schedule.evaluate_tick(explicit_context)
        assert tick_result.run_requests
        assert len(tick_result.run_requests) == 1
        assert tick_result.run_requests[0].run_config == expected_run_config

        # Tick with a context that has no scheduled execution time.
        tick_result = monthly_foo_schedule.evaluate_tick(context_without_time)
        assert tick_result.run_requests
        assert len(tick_result.run_requests) == 1
        assert tick_result.run_requests[0].run_config == expected_run_config

        _check_partitions(
            monthly_foo_schedule,
            3 - partition_months_offset,
            pendulum.instance(start_date, tz="UTC"),
            DEFAULT_MONTHLY_FORMAT,
            relativedelta(months=1),
        )
def test_partitions_for_hourly_schedule_decorators_without_timezone(
    partition_hours_offset: int,
):
    """Exercise a timezone-less ``@hourly_schedule`` under a frozen clock.

    The clock is frozen at 2019-02-27T00:01:01 UTC (expressed in US/Eastern).
    The partition list is checked first, then the schedule is ticked without
    and with an explicit execution time. Each tick must request exactly one
    run whose partition is shifted back by ``partition_hours_offset`` hours.
    """
    frozen_now = to_timezone(
        create_pendulum_time(2019, 2, 27, 0, 1, 1, tz="UTC"), "US/Eastern"
    )
    with pendulum.test(frozen_now):
        context_without_time = build_schedule_context()
        start_date = datetime(year=2019, month=1, day=1)

        @hourly_schedule(
            pipeline_name="foo_pipeline",
            start_date=start_date,
            execution_time=time(hour=0, minute=25),
            partition_hours_offset=partition_hours_offset,
        )
        def hourly_foo_schedule(hourly_time):
            return {"hourly_time": hourly_time.isoformat()}

        expected_partition_count = HOURS_UNTIL_FEBRUARY_27 + 1 - partition_hours_offset
        _check_partitions(
            hourly_foo_schedule,
            expected_partition_count,
            pendulum.instance(start_date, tz="UTC"),
            DEFAULT_HOURLY_FORMAT_WITHOUT_TIMEZONE,
            relativedelta(hours=1),
        )

        # Tick with a context that has no scheduled execution time.
        tick_result = hourly_foo_schedule.evaluate_tick(context_without_time)
        assert tick_result.run_requests
        assert len(tick_result.run_requests) == 1
        frozen_hour = create_pendulum_time(year=2019, month=2, day=27, tz="UTC")
        assert tick_result.run_requests[0].run_config == {
            "hourly_time": frozen_hour.subtract(hours=partition_hours_offset).isoformat()
        }

        # Tick with an explicit scheduled execution time.
        explicit_tick_time = create_pendulum_time(
            year=2019, month=1, day=27, hour=1, minute=25, tz="UTC"
        )
        explicit_context = build_schedule_context(
            scheduled_execution_time=explicit_tick_time
        )
        tick_result = hourly_foo_schedule.evaluate_tick(explicit_context)
        assert tick_result.run_requests
        assert len(tick_result.run_requests) == 1
        explicit_hour = create_pendulum_time(year=2019, month=1, day=27, hour=1, tz="UTC")
        assert tick_result.run_requests[0].run_config == {
            "hourly_time": explicit_hour.subtract(hours=partition_hours_offset).isoformat()
        }
def _invalid_partition_selector(_cotnext, _partition_set_def):
    """Return a single hard-coded partition named ``"made_up"``.

    Both arguments are ignored; the result is always a one-element list.
    """
    made_up_time = create_pendulum_time(year=2019, month=1, day=27, hour=1, minute=25)
    made_up_partition = Partition(value=made_up_time, name="made_up")
    return [made_up_partition]
def test_time_based_partitioned_job():
    """Run a job configured by a daily partitioned config.

    Verifies that the config function returns ``RUN_CONFIG``, that exactly one
    partition key exists as of 2021-05-06T23:59:59 UTC, that the job succeeds
    for that key, and that an unknown key raises
    ``DagsterUnknownPartitionError``.
    """

    @daily_partitioned_config(start_date="2021-05-05")
    def my_daily_partitioned_config(_start, _end):
        return RUN_CONFIG

    assert my_daily_partitioned_config(None, None) == RUN_CONFIG

    @job(config=my_daily_partitioned_config)
    def my_job():
        my_op()

    freeze_datetime = create_pendulum_time(
        year=2021, month=5, day=6, hour=23, minute=59, second=59, tz="UTC"
    )
    partition_keys = my_daily_partitioned_config.get_partition_keys(freeze_datetime)
    assert len(partition_keys) == 1
    partition_key = partition_keys[0]

    result = my_job.execute_in_process(partition_key=partition_key)
    assert result.success

    with pytest.raises(
        DagsterUnknownPartitionError,
        match="Could not find a partition with key `doesnotexist`",
    ):
        # The call is expected to raise, so its return value is not bound.
        my_job.execute_in_process(partition_key="doesnotexist")
def today_at_midnight(timezone_name="UTC"):
    """Return the start of the current day in the given timezone.

    Args:
        timezone_name: Name of the timezone used to determine "today".
            Validated as a string.

    Returns:
        The value of ``create_pendulum_time`` for the current year, month and
        day, with ``tz`` set to the name of the current time's timezone.
    """
    check.str_param(timezone_name, "timezone_name")
    current = pendulum.now(timezone_name)
    year, month, day = current.year, current.month, current.day
    return create_pendulum_time(year, month, day, tz=current.timezone.name)
def test_partitions_for_weekly_schedule_decorators_with_timezone(
    partition_weeks_offset: int,
):
    """Exercise a ``@weekly_schedule`` that executes in US/Central.

    With the clock frozen at 2019-02-27T00:01:01 US/Central, a tick at
    2019-01-30T09:30 US/Central must request exactly one run for the
    2019-01-29 partition shifted back by ``partition_weeks_offset`` weeks.
    The partition list is then checked as well.
    """
    frozen_now = create_pendulum_time(2019, 2, 27, 0, 1, 1, tz="US/Central")
    with pendulum.test(frozen_now):
        start_date = datetime(year=2019, month=1, day=1)

        @weekly_schedule(
            pipeline_name="foo_pipeline",
            execution_day_of_week=3,
            start_date=start_date,
            execution_time=time(9, 30),
            execution_timezone="US/Central",
            partition_weeks_offset=partition_weeks_offset,
        )
        def weekly_foo_schedule(weekly_time):
            return {"weekly_time": weekly_time.isoformat()}

        assert weekly_foo_schedule.execution_timezone == "US/Central"

        tick_time = create_pendulum_time(
            year=2019, month=1, day=30, hour=9, minute=30, tz="US/Central"
        )
        tick_context = build_schedule_context(scheduled_execution_time=tick_time)
        tick_result = weekly_foo_schedule.evaluate_tick(tick_context)

        assert tick_result.run_requests
        assert len(tick_result.run_requests) == 1
        base_partition = create_pendulum_time(year=2019, month=1, day=29, tz="US/Central")
        assert tick_result.run_requests[0].run_config == {
            "weekly_time": base_partition.subtract(weeks=partition_weeks_offset).isoformat()
        }

        _check_partitions(
            weekly_foo_schedule,
            9 - partition_weeks_offset,
            pendulum.instance(start_date, tz="US/Central"),
            DEFAULT_DATE_FORMAT,
            relativedelta(weeks=1),
        )
def test_failure_recovery_between_multi_runs(instance, external_repo, crash_location, crash_signal):
    """Crash the scheduler part-way through a multi-run tick, then recover.

    The first scheduler subprocess is started with crash flags and must exit
    non-zero after one run exists. A second subprocess, one minute later and
    without crash flags, must exit cleanly, leaving two runs and a single
    successful tick that references all of them.
    """
    initial_datetime = create_pendulum_time(year=2019, month=2, day=28, hour=0, minute=0, second=0)
    current_time = initial_datetime.add()
    schedule = external_repo.get_external_schedule("multi_run_schedule")

    def _run_scheduler(at_time, crash_flags):
        # Run one scheduler iteration in a child process and wait for it.
        process = spawn_ctx.Process(
            target=_test_launch_scheduled_runs_in_subprocess,
            args=[instance.get_ref(), at_time, crash_flags],
        )
        process.start()
        process.join(timeout=60)
        return process

    def _schedule_ticks():
        return instance.get_ticks(schedule.get_external_origin_id(), schedule.selector_id)

    with pendulum.test(current_time):
        instance.start_schedule(schedule)

        crashed = _run_scheduler(current_time, {schedule.name: {crash_location: crash_signal}})
        assert crashed.exitcode != 0

        wait_for_all_runs_to_start(instance)
        assert instance.get_runs_count() == 1
        validate_run_exists(instance.get_runs()[0], initial_datetime)

        ticks = _schedule_ticks()
        assert len(ticks) == 1

    current_time = current_time.add(minutes=1)
    with pendulum.test(current_time):
        recovered = _run_scheduler(current_time, None)
        assert recovered.exitcode == 0

        assert instance.get_runs_count() == 2
        validate_run_exists(instance.get_runs()[0], initial_datetime)

        ticks = _schedule_ticks()
        assert len(ticks) == 1
        all_run_ids = [run.run_id for run in instance.get_runs()]
        validate_tick(
            ticks[0],
            schedule,
            initial_datetime,
            TickStatus.SUCCESS,
            all_run_ids,
        )
def test_unloadable_schedule(graphql_context):
    """Unloadable running schedules are listed and can be stopped via GraphQL.

    One RUNNING and one STOPPED instigator state are stored for schedule
    origins produced by ``_get_unloadable_schedule_origin``. The unloadable
    query must return only the running one, and the stop mutation must move
    it to STOPPED.
    """
    instance = graphql_context.instance
    initial_datetime = create_pendulum_time(
        year=2019,
        month=2,
        day=27,
        hour=23,
        minute=59,
        second=59,
    )

    running_origin = _get_unloadable_schedule_origin("unloadable_running")
    stopped_origin = _get_unloadable_schedule_origin("unloadable_stopped")

    with pendulum.test(initial_datetime):
        # Build both states under the frozen clock so their timestamps are
        # deterministic rather than taken from the wall clock.
        running_instigator_state = InstigatorState(
            running_origin,
            InstigatorType.SCHEDULE,
            InstigatorStatus.RUNNING,
            ScheduleInstigatorData(
                "0 0 * * *",
                pendulum.now("UTC").timestamp(),
            ),
        )
        instance.add_instigator_state(running_instigator_state)

        instance.add_instigator_state(
            InstigatorState(
                stopped_origin,
                InstigatorType.SCHEDULE,
                InstigatorStatus.STOPPED,
                ScheduleInstigatorData(
                    "0 0 * * *",
                    pendulum.now("UTC").timestamp(),
                ),
            )
        )

    result = execute_dagster_graphql(graphql_context, GET_UNLOADABLE_QUERY)
    assert len(result.data["unloadableInstigationStatesOrError"]["results"]) == 1
    assert (
        result.data["unloadableInstigationStatesOrError"]["results"][0]["name"]
        == "unloadable_running"
    )

    # Verify that we can stop the unloadable schedule
    stop_result = execute_dagster_graphql(
        graphql_context,
        STOP_SCHEDULES_QUERY,
        variables={
            "scheduleOriginId": running_instigator_state.instigator_origin_id,
            "scheduleSelectorId": running_instigator_state.selector_id,
        },
    )
    assert (
        stop_result.data["stopRunningSchedule"]["scheduleState"]["status"]
        == InstigatorStatus.STOPPED.value
    )
def test_cron_schedule_advances_past_dst():
    """A 15-minute cron schedule keeps advancing across a DST boundary.

    In Australia/Sydney, DST is at 2AM on 10/3/21. Starting at 1:30:01 local
    time, the sixth tick must land on 4:00 local time, which shows that the
    iterator does not get stuck on the boundary.
    """
    start_time = create_pendulum_time(
        year=2021, month=10, day=3, hour=1, minute=30, second=1, tz="Australia/Sydney"
    )
    time_iter = schedule_execution_time_iterator(
        start_time.timestamp(), "*/15 * * * *", "Australia/Sydney"
    )

    # Expected ticks: 1:45, 3:00, 3:15, 3:30, 3:45, 4:00
    first_six_ticks = [next(time_iter) for _ in range(6)]
    sixth_tick = first_six_ticks[-1]

    expected_sixth_tick = create_pendulum_time(
        year=2021, month=10, day=3, hour=4, tz="Australia/Sydney"
    )
    assert sixth_tick.timestamp() == expected_sixth_tick.timestamp()
def test_run_record_timestamps():
    """Run records carry the frozen clock time as start and end timestamps."""
    # Unix timestamp of 2019-11-02T00:00:00 US/Central (05:00 UTC).
    expected_timestamp = 1572670800.0

    with get_instance() as instance:
        frozen_now = to_timezone(
            create_pendulum_time(2019, 11, 2, 0, 0, 0, tz="US/Central"), "US/Pacific"
        )

        with pendulum.test(frozen_now):
            result = my_job.execute_in_process(instance=instance)
            run_filter = PipelineRunsFilter(run_ids=[result.run_id])
            records = instance.get_run_records(filters=run_filter)
            assert len(records) == 1

            (record,) = records
            assert record.start_time == expected_timestamp
            assert record.end_time == expected_timestamp
def test_invalid_cron_string():
    """A six-field cron string makes the execution-time iterator raise ``CheckError``."""
    start_time = create_pendulum_time(
        year=2022, month=2, day=21, hour=1, minute=30, second=1, tz="US/Pacific"
    )
    start_timestamp = start_time.timestamp()

    with pytest.raises(CheckError):
        time_iter = schedule_execution_time_iterator(start_timestamp, "* * * * * *", "US/Pacific")
        next(time_iter)
def test_get_unloadable_job(graphql_context):
    """Only the RUNNING unloadable schedule state is returned by the query.

    Two schedule states (one RUNNING, one STOPPED) are stored under a frozen
    clock; the unloadable-states query must return just the running one.
    """
    instance = graphql_context.instance
    frozen_now = create_pendulum_time(
        year=2019,
        month=2,
        day=27,
        hour=23,
        minute=59,
        second=59,
    )

    def _schedule_state(origin_name, status):
        # Build a schedule state whose timestamp is the current pendulum time.
        return InstigatorState(
            _get_unloadable_schedule_origin(origin_name),
            InstigatorType.SCHEDULE,
            status,
            ScheduleInstigatorData(
                "0 0 * * *",
                pendulum.now("UTC").timestamp(),
            ),
        )

    with pendulum.test(frozen_now):
        instance.add_job_state(_schedule_state("unloadable_running", InstigatorStatus.RUNNING))
        instance.add_job_state(_schedule_state("unloadable_stopped", InstigatorStatus.STOPPED))

    result = execute_dagster_graphql(graphql_context, GET_UNLOADABLE_QUERY)
    assert len(result.data["unloadableInstigationStatesOrError"]["results"]) == 1
    first_name = result.data["unloadableInstigationStatesOrError"]["results"][0]["name"]
    assert first_name == "unloadable_running"
def test_vixie_cronstring_schedule():
    """The ``@hourly``/``@daily``/``@weekly``/``@monthly``/``@yearly`` shortcuts tick as expected.

    For each shortcut the iterator starts at 2022-02-21T01:30:01 US/Pacific
    and the sixth tick is compared with the expected local time.
    """
    start_time = create_pendulum_time(
        year=2022, month=2, day=21, hour=1, minute=30, second=1, tz="US/Pacific"
    )

    # (cron shortcut, fields of the expected sixth tick in US/Pacific)
    cases = [
        # 2:00, 3:00, 4:00, 5:00, 6:00, 7:00
        ("@hourly", {"year": 2022, "month": 2, "day": 21, "hour": 7}),
        # 2/22, 2/23, 2/24, 2/25, 2/26, 2/27
        ("@daily", {"year": 2022, "month": 2, "day": 27}),
        # 2/27, 3/6, 3/13, 3/20, 3/27, 4/3
        ("@weekly", {"year": 2022, "month": 4, "day": 3}),
        # 3/1, 4/1, 5/1, 6/1, 7/1, 8/1
        ("@monthly", {"year": 2022, "month": 8, "day": 1}),
        # 1/1/2023, 1/1/2024, 1/1/2025, 1/1/2026, 1/1/2027, 1/1/2028
        ("@yearly", {"year": 2028, "month": 1, "day": 1}),
    ]

    for cron_shortcut, expected_fields in cases:
        time_iter = schedule_execution_time_iterator(
            start_time.timestamp(), cron_shortcut, "US/Pacific"
        )
        for _ in range(6):
            next_time = next(time_iter)

        expected_time = create_pendulum_time(tz="US/Pacific", **expected_fields)
        assert next_time.timestamp() == expected_time.timestamp()
def test_run_record_timestamps(self, storage):
    """Run records written through ``storage`` carry the frozen clock time.

    Uses the instance attached to ``storage`` when there is one; otherwise an
    ephemeral instance is built around ``storage`` in a temporary directory.
    """
    assert storage
    self._skip_in_memory(storage)

    @op
    def a():
        pass

    @job
    def my_job():
        a()

    with tempfile.TemporaryDirectory() as temp_dir:
        attached_instance = storage._instance  # pylint: disable=protected-access
        if attached_instance:
            instance = attached_instance
        else:
            instance = DagsterInstance(
                instance_type=InstanceType.EPHEMERAL,
                local_artifact_storage=LocalArtifactStorage(temp_dir),
                run_storage=storage,
                event_storage=InMemoryEventLogStorage(),
                compute_log_manager=NoOpComputeLogManager(),
                run_coordinator=DefaultRunCoordinator(),
                run_launcher=SyncInMemoryRunLauncher(),
            )

        frozen_now = to_timezone(
            create_pendulum_time(2019, 11, 2, 0, 0, 0, tz="US/Central"), "US/Pacific"
        )

        with pendulum.test(frozen_now):
            result = my_job.execute_in_process(instance=instance)
            run_filter = PipelineRunsFilter(run_ids=[result.run_id])
            records = instance.get_run_records(filters=run_filter)
            assert len(records) == 1

            (record,) = records
            assert record.start_time == frozen_now.timestamp()
            assert record.end_time == frozen_now.timestamp()
def _test_backfill_in_subprocess(instance_ref, debug_crash_flags):
    """Run one backfill iteration against the instance behind ``instance_ref``.

    The clock is frozen at the 2021-02-17 time built below (converted to
    US/Central) while the iteration runs. The test instance is cleaned up
    afterwards whether or not the iteration raises.

    Args:
        instance_ref: Reference used to open the ``DagsterInstance``.
        debug_crash_flags: Passed through to ``execute_backfill_iteration``.
    """
    frozen_now = to_timezone(
        create_pendulum_time(year=2021, month=2, day=17),
        "US/Central",
    )
    with DagsterInstance.from_ref(instance_ref) as instance:
        try:
            with pendulum.test(frozen_now):
                with create_test_daemon_workspace() as workspace:
                    backfill_iteration = execute_backfill_iteration(
                        instance,
                        workspace,
                        get_default_daemon_logger("BackfillDaemon"),
                        debug_crash_flags=debug_crash_flags,
                    )
                    # Drain the generator so the whole iteration executes.
                    for _ in backfill_iteration:
                        pass
        finally:
            cleanup_test_instance(instance)
def test_future_ticks_until(graphql_context):
    """Future ticks are limited to the ``ticksAfter``/``ticksUntil`` window.

    After starting ``timezone_schedule``, querying ticks from 2019-02-27 up to
    2019-03-02 (US/Central) must return exactly the three ticks at the start
    of 2019-02-27, 2019-02-28 and 2019-03-01 (US/Central).
    """
    schedule_selector = infer_schedule_selector(graphql_context, "timezone_schedule")

    # Start a single schedule, future tick run requests only available for running schedules
    start_result = execute_dagster_graphql(
        graphql_context,
        START_SCHEDULES_QUERY,
        variables={"scheduleSelector": schedule_selector},
    )
    assert (
        start_result.data["startSchedule"]["scheduleState"]["status"]
        == InstigatorStatus.RUNNING.value
    )

    future_ticks_start_time = create_pendulum_time(2019, 2, 27, tz="US/Central").timestamp()
    future_ticks_end_time = create_pendulum_time(2019, 3, 2, tz="US/Central").timestamp()

    result = execute_dagster_graphql(
        graphql_context,
        GET_SCHEDULE_FUTURE_TICKS_UNTIL,
        variables={
            "scheduleSelector": schedule_selector,
            "ticksAfter": future_ticks_start_time,
            "ticksUntil": future_ticks_end_time,
        },
    )

    future_ticks = result.data["scheduleOrError"]["futureTicks"]
    assert future_ticks
    assert len(future_ticks["results"]) == 3

    timestamps = [future_tick["timestamp"] for future_tick in future_ticks["results"]]
    assert timestamps == [
        create_pendulum_time(2019, 2, 27, tz="US/Central").timestamp(),
        create_pendulum_time(2019, 2, 28, tz="US/Central").timestamp(),
        create_pendulum_time(2019, 3, 1, tz="US/Central").timestamp(),
    ]
def test_differing_timezones(instance, workspace, external_repo):
    """Two daily schedules in different timezones fire at their own midnight.

    One schedule uses US/Central and the other US/Eastern. Nothing runs one
    second before Eastern midnight; one minute later only the Eastern
    schedule has run; an hour after that the Central schedule has run too.
    A further scheduler pass must not launch anything new.
    """

    def _launch_runs():
        # One scheduler pass at the current (frozen) time.
        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )

    def _ticks_for(origin, schedule):
        return instance.get_ticks(origin.get_id(), schedule.selector_id)

    now = to_timezone(
        create_pendulum_time(2019, 2, 27, 23, 59, 59, tz="US/Eastern"), "US/Pacific"
    )
    with pendulum.test(now):
        central_schedule = external_repo.get_external_schedule("daily_central_time_schedule")
        eastern_schedule = external_repo.get_external_schedule("daily_eastern_time_schedule")

        central_origin = central_schedule.get_external_origin()
        eastern_origin = eastern_schedule.get_external_origin()

        instance.start_schedule(central_schedule)
        instance.start_schedule(eastern_schedule)

        assert instance.get_runs_count() == 0
        assert len(_ticks_for(central_origin, central_schedule)) == 0
        assert len(_ticks_for(eastern_origin, eastern_schedule)) == 0

        _launch_runs()

        assert instance.get_runs_count() == 0
        assert len(_ticks_for(central_origin, central_schedule)) == 0
        assert len(_ticks_for(eastern_origin, eastern_schedule)) == 0

    # Past midnight eastern time, the eastern timezone schedule will run, but not the central timezone
    now = now.add(minutes=1)
    with pendulum.test(now):
        _launch_runs()

        assert instance.get_runs_count() == 1
        eastern_ticks = _ticks_for(eastern_origin, eastern_schedule)
        assert len(eastern_ticks) == 1

        expected_tick_time = to_timezone(
            create_pendulum_time(year=2019, month=2, day=28, tz="US/Eastern"), "UTC"
        )
        validate_tick(
            eastern_ticks[0],
            eastern_schedule,
            expected_tick_time,
            TickStatus.SUCCESS,
            [run.run_id for run in instance.get_runs()],
        )

        assert len(_ticks_for(central_origin, central_schedule)) == 0

        wait_for_all_runs_to_start(instance)
        validate_run_started(
            instance,
            instance.get_runs()[0],
            expected_tick_time,
            create_pendulum_time(2019, 2, 27, tz="US/Eastern"),
        )

    # Past midnight central time, the central timezone schedule will now run
    now = now.add(hours=1)
    with pendulum.test(now):
        _launch_runs()

        assert instance.get_runs_count() == 2
        assert len(_ticks_for(eastern_origin, eastern_schedule)) == 1

        central_ticks = _ticks_for(central_origin, central_schedule)
        assert len(central_ticks) == 1

        expected_tick_time = to_timezone(
            create_pendulum_time(year=2019, month=2, day=28, tz="US/Central"), "UTC"
        )
        validate_tick(
            central_ticks[0],
            central_schedule,
            expected_tick_time,
            TickStatus.SUCCESS,
            [instance.get_runs()[0].run_id],
        )

        wait_for_all_runs_to_start(instance)
        validate_run_started(
            instance,
            instance.get_runs()[0],
            expected_tick_time,
            create_pendulum_time(2019, 2, 27, tz="US/Central"),
        )

        # Verify idempotence
        _launch_runs()
        assert instance.get_runs_count() == 2

        central_ticks = _ticks_for(central_origin, central_schedule)
        assert len(central_ticks) == 1
        assert central_ticks[0].status == TickStatus.SUCCESS

        eastern_ticks = _ticks_for(eastern_origin, eastern_schedule)
        assert len(eastern_ticks) == 1
        assert eastern_ticks[0].status == TickStatus.SUCCESS
def test_execute_during_dst_transition_fall_back(instance, workspace, external_repo):
    """A daily schedule at a doubled local time runs once on the fall-back day.

    A schedule that runs daily during a time that occurs twice during a fall
    DST transition only executes once for that day: three days after the
    schedule is started there must be exactly three runs and three ticks.
    """

    def _launch_runs():
        # One scheduler pass at the current (frozen) time.
        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )

    now = to_timezone(
        create_pendulum_time(2019, 11, 2, 0, 0, 0, tz="US/Central"), "US/Pacific"
    )
    with pendulum.test(now):
        schedule = external_repo.get_external_schedule(
            "daily_dst_transition_schedule_doubled_time"
        )
        origin = schedule.get_external_origin()
        instance.start_schedule(schedule)

        assert instance.get_runs_count() == 0
        ticks = instance.get_ticks(origin.get_id(), schedule.selector_id)
        assert len(ticks) == 0

    now = now.add(days=3)
    with pendulum.test(now):
        _launch_runs()
        wait_for_all_runs_to_start(instance)

        assert instance.get_runs_count() == 3
        ticks = instance.get_ticks(origin.get_id(), schedule.selector_id)
        assert len(ticks) == 3

        expected_datetimes_utc = [
            create_pendulum_time(2019, 11, 4, 7, 30, 0, tz="UTC"),
            create_pendulum_time(2019, 11, 3, 7, 30, 0, tz="UTC"),
            create_pendulum_time(2019, 11, 2, 6, 30, 0, tz="UTC"),
        ]
        expected_partition_times = [
            create_pendulum_time(2019, 11, 3, tz="US/Central"),
            create_pendulum_time(2019, 11, 2, tz="US/Central"),
            create_pendulum_time(2019, 11, 1, tz="US/Central"),
        ]

        expectations = zip(expected_datetimes_utc, expected_partition_times)
        for index, (tick_time, partition_time) in enumerate(expectations):
            validate_tick(
                ticks[index],
                schedule,
                tick_time,
                TickStatus.SUCCESS,
                [instance.get_runs()[index].run_id],
            )
            validate_run_started(
                instance,
                instance.get_runs()[index],
                tick_time,
                partition_time=partition_time,
            )

        # Verify idempotence
        _launch_runs()
        assert instance.get_runs_count() == 3
        ticks = instance.get_ticks(origin.get_id(), schedule.selector_id)
        assert len(ticks) == 3
def test_execute_during_dst_transition_spring_forward(instance, workspace, external_repo):
    """Check a daily schedule whose local time is skipped by spring-forward.

    Starting the day before the DST change and advancing three days, there
    must be three runs and three ticks. The expected tick for 2019-03-10 is
    at 03:00 US/Central, while the other two days tick at 02:30 US/Central.
    """

    def _launch_runs():
        # One scheduler pass at the current (frozen) time.
        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )

    # Day before DST
    now = to_timezone(
        create_pendulum_time(2019, 3, 9, 0, 0, 0, tz="US/Central"), "US/Pacific"
    )
    with pendulum.test(now):
        schedule = external_repo.get_external_schedule(
            "daily_dst_transition_schedule_skipped_time"
        )
        origin = schedule.get_external_origin()
        instance.start_schedule(schedule)

        assert instance.get_runs_count() == 0
        ticks = instance.get_ticks(origin.get_id(), schedule.selector_id)
        assert len(ticks) == 0

    now = now.add(days=3)
    with pendulum.test(now):
        _launch_runs()
        wait_for_all_runs_to_start(instance)

        assert instance.get_runs_count() == 3
        ticks = instance.get_ticks(origin.get_id(), schedule.selector_id)
        assert len(ticks) == 3

        expected_datetimes_utc = [
            to_timezone(create_pendulum_time(2019, 3, 11, 2, 30, 0, tz="US/Central"), "UTC"),
            to_timezone(create_pendulum_time(2019, 3, 10, 3, 0, 0, tz="US/Central"), "UTC"),
            to_timezone(create_pendulum_time(2019, 3, 9, 2, 30, 0, tz="US/Central"), "UTC"),
        ]
        expected_partition_times = [
            create_pendulum_time(2019, 3, 10, tz="US/Central"),
            create_pendulum_time(2019, 3, 9, tz="US/Central"),
            create_pendulum_time(2019, 3, 8, tz="US/Central"),
        ]

        partition_set_def = the_repo.get_partition_set_def(
            "daily_dst_transition_schedule_skipped_time_partitions"
        )
        partition_names = partition_set_def.get_partition_names()
        for expected_name in ("2019-03-08", "2019-03-09", "2019-03-10"):
            assert expected_name in partition_names

        expectations = zip(expected_datetimes_utc, expected_partition_times)
        for index, (tick_time, partition_time) in enumerate(expectations):
            validate_tick(
                ticks[index],
                schedule,
                tick_time,
                TickStatus.SUCCESS,
                [instance.get_runs()[index].run_id],
            )
            validate_run_started(
                instance,
                instance.get_runs()[index],
                tick_time,
                partition_time=partition_time,
            )

        # Verify idempotence
        _launch_runs()
        assert instance.get_runs_count() == 3
        ticks = instance.get_ticks(origin.get_id(), schedule.selector_id)
        assert len(ticks) == 3
def test_daily_dst_fall_back(instance, workspace, external_repo):
    """A daily schedule still runs once per day across the fall DST transition.

    The schedule is started at 2019-11-03T00:00 US/Central; two days later
    there must be three runs and three ticks, with the UTC tick time shifting
    by one hour after the transition.
    """

    def _launch_runs():
        # One scheduler pass at the current (frozen) time.
        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )

    # Night before DST
    now = to_timezone(
        create_pendulum_time(2019, 11, 3, 0, 0, 0, tz="US/Central"), "US/Pacific"
    )
    with pendulum.test(now):
        schedule = external_repo.get_external_schedule("daily_central_time_schedule")
        origin = schedule.get_external_origin()
        instance.start_schedule(schedule)

        assert instance.get_runs_count() == 0
        ticks = instance.get_ticks(origin.get_id(), schedule.selector_id)
        assert len(ticks) == 0

    now = now.add(days=2)
    with pendulum.test(now):
        _launch_runs()
        wait_for_all_runs_to_start(instance)

        assert instance.get_runs_count() == 3
        ticks = instance.get_ticks(origin.get_id(), schedule.selector_id)
        assert len(ticks) == 3

        # UTC time changed by one hour after the transition, still running daily at the same
        # time in CT
        expected_datetimes_utc = [
            create_pendulum_time(2019, 11, 5, 6, 0, 0, tz="UTC"),
            create_pendulum_time(2019, 11, 4, 6, 0, 0, tz="UTC"),
            create_pendulum_time(2019, 11, 3, 5, 0, 0, tz="UTC"),
        ]
        expected_partition_times = [
            create_pendulum_time(2019, 11, 4, tz="US/Central"),
            create_pendulum_time(2019, 11, 3, tz="US/Central"),
            create_pendulum_time(2019, 11, 2, tz="US/Central"),
        ]

        expectations = zip(expected_datetimes_utc, expected_partition_times)
        for index, (tick_time, partition_time) in enumerate(expectations):
            validate_tick(
                ticks[index],
                schedule,
                tick_time,
                TickStatus.SUCCESS,
                [instance.get_runs()[index].run_id],
            )
            validate_run_started(
                instance,
                instance.get_runs()[index],
                tick_time,
                partition_time=partition_time,
            )

        # Verify idempotence
        _launch_runs()
        assert instance.get_runs_count() == 3
        ticks = instance.get_ticks(origin.get_id(), schedule.selector_id)
        assert len(ticks) == 3
def test_failure_after_run_launched(crash_location, crash_signal, capfd):
    """Crash the sensor launcher after a run is launched, then re-run it.

    The first subprocess is started with crash flags for ``run_key_sensor``
    and must exit non-zero, leaving one STARTED tick and one run tagged with
    run key ``"only_once"``. The second subprocess, one second later and
    without crash flags, must exit cleanly, launch no additional run, log
    that the run key was skipped, and leave a second tick whose status is
    SKIPPED.
    """
    frozen_datetime = to_timezone(
        create_pendulum_time(
            year=2019,
            month=2,
            day=28,
            hour=0,
            minute=0,
            second=0,
            tz="UTC",
        ),
        "US/Central",
    )
    with instance_with_sensors() as (
        instance,
        _grpc_server_registry,
        external_repo,
    ):
        with pendulum.test(frozen_datetime):
            external_sensor = external_repo.get_external_sensor("run_key_sensor")
            instance.add_instigator_state(
                InstigatorState(
                    external_sensor.get_external_origin(),
                    InstigatorType.SENSOR,
                    InstigatorStatus.RUNNING,
                )
            )

            # create a run, launch but crash
            debug_crash_flags = {external_sensor.name: {crash_location: crash_signal}}

            launch_process = spawn_ctx.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            # NOTE(review): if the child is still alive after the 60s join,
            # exitcode is None, which also satisfies `!= 0`.
            assert launch_process.exitcode != 0

            # The crashed iteration leaves its tick in the STARTED state.
            ticks = instance.get_ticks(external_sensor.get_external_origin_id())
            assert len(ticks) == 1
            assert ticks[0].status == TickStatus.STARTED

            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]
            wait_for_all_runs_to_start(instance)
            assert run.tags.get(SENSOR_NAME_TAG) == "run_key_sensor"
            assert run.tags.get(RUN_KEY_TAG) == "only_once"

            # Discard output captured so far so that only the second
            # subprocess's output is inspected below.
            capfd.readouterr()

            # Second attempt, one second later and without crash flags.
            launch_process = spawn_ctx.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime.add(seconds=1), None],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            assert launch_process.exitcode == 0
            wait_for_all_runs_to_start(instance)

            # No second run was created for the same run key.
            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]
            captured = capfd.readouterr()
            assert (
                'Skipping 1 run for sensor run_key_sensor already completed with run keys: ["only_once"]'
                in captured.out
            )

            # NOTE(review): presumably ticks are returned newest first, so
            # ticks[0] is the tick of the second attempt -- confirm.
            ticks = instance.get_ticks(external_sensor.get_external_origin_id())
            assert len(ticks) == 2
            assert ticks[0].status == TickStatus.SKIPPED
def test_non_utc_timezone_run(instance, workspace, external_repo):
    """A US/Central daily schedule runs at the expected time in its own timezone.

    One second before Central midnight nothing is launched; two seconds later
    exactly one run and one successful tick exist, and a further scheduler
    pass launches nothing new.
    """

    def _launch_runs():
        # One scheduler pass at the current (frozen) time.
        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )

    now = to_timezone(
        create_pendulum_time(2019, 2, 27, 23, 59, 59, tz="US/Central"), "US/Pacific"
    )
    with pendulum.test(now):
        schedule = external_repo.get_external_schedule("daily_central_time_schedule")
        origin = schedule.get_external_origin()
        instance.start_schedule(schedule)

        assert instance.get_runs_count() == 0
        ticks = instance.get_ticks(origin.get_id(), schedule.selector_id)
        assert len(ticks) == 0

        _launch_runs()

        assert instance.get_runs_count() == 0
        ticks = instance.get_ticks(origin.get_id(), schedule.selector_id)
        assert len(ticks) == 0

    now = now.add(seconds=2)
    with pendulum.test(now):
        _launch_runs()

        assert instance.get_runs_count() == 1
        ticks = instance.get_ticks(origin.get_id(), schedule.selector_id)
        assert len(ticks) == 1

        expected_tick_time = to_timezone(
            create_pendulum_time(year=2019, month=2, day=28, tz="US/Central"), "UTC"
        )
        validate_tick(
            ticks[0],
            schedule,
            expected_tick_time,
            TickStatus.SUCCESS,
            [run.run_id for run in instance.get_runs()],
        )

        wait_for_all_runs_to_start(instance)
        validate_run_started(
            instance,
            instance.get_runs()[0],
            expected_tick_time,
            create_pendulum_time(2019, 2, 27, tz="US/Central"),
        )

        # Verify idempotence
        _launch_runs()
        assert instance.get_runs_count() == 1
        ticks = instance.get_ticks(origin.get_id(), schedule.selector_id)
        assert len(ticks) == 1
        assert ticks[0].status == TickStatus.SUCCESS
def test_failure_recovery_after_run_created(instance, external_repo, crash_location, crash_signal):
    # Verify that if the scheduler crashes or is interrupted after a run is created,
    # it will just re-launch the already-created run when it runs again
    initial_datetime = create_pendulum_time(year=2019, month=2, day=27, hour=0, minute=0, second=0)
    # add() with no offset: presumably just a separate copy that can be advanced
    # independently of initial_datetime - TODO confirm
    frozen_datetime = initial_datetime.add()
    external_schedule = external_repo.get_external_schedule("simple_schedule")
    with pendulum.test(frozen_datetime):
        instance.start_schedule(external_schedule)

        debug_crash_flags = {external_schedule.name: {crash_location: crash_signal}}

        scheduler_process = spawn_ctx.Process(
            target=_test_launch_scheduled_runs_in_subprocess,
            args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
        )
        scheduler_process.start()
        scheduler_process.join(timeout=60)

        # join() returns silently on timeout and leaves exitcode as None, which would
        # satisfy `!= 0`; require that the child really exited before checking how.
        assert scheduler_process.exitcode is not None
        assert scheduler_process.exitcode != 0

        ticks = instance.get_ticks(
            external_schedule.get_external_origin_id(), external_schedule.selector_id
        )
        assert len(ticks) == 1
        assert ticks[0].status == TickStatus.STARTED

        assert instance.get_runs_count() == 1

        if crash_location == "RUN_CREATED":
            run = instance.get_runs()[0]
            # Run was created, but hasn't launched yet
            assert run.tags[SCHEDULED_EXECUTION_TIME_TAG] == frozen_datetime.isoformat()
            assert run.tags[PARTITION_NAME_TAG] == "2019-02-26"
            assert run.status == PipelineRunStatus.NOT_STARTED
        else:
            # The run was created and launched - running again should do nothing other than
            # moving the tick to success state.

            # The fact that we need to add this line indicates that there is still a
            # theoretical race condition - if the scheduler fails after launching a run and
            # then runs again between when the run was launched and when its status is
            # changed to STARTED by the executor, we could end up launching the same run
            # twice. Run queueing or some other way to immediately identify that a run was
            # launched would help eliminate this race condition. For now, eliminate the
            # possibility by waiting for the run to start before running the scheduler again.
            wait_for_all_runs_to_start(instance)

            run = instance.get_runs()[0]
            validate_run_exists(run, frozen_datetime, create_pendulum_time(2019, 2, 26))

    frozen_datetime = frozen_datetime.add(minutes=5)
    with pendulum.test(frozen_datetime):
        # Running again just launches the existing run and marks the tick as success
        scheduler_process = spawn_ctx.Process(
            target=_test_launch_scheduled_runs_in_subprocess,
            args=[instance.get_ref(), frozen_datetime, None],
        )
        scheduler_process.start()
        scheduler_process.join(timeout=60)
        assert scheduler_process.exitcode == 0

        assert instance.get_runs_count() == 1
        wait_for_all_runs_to_start(instance)
        validate_run_exists(
            instance.get_runs()[0], initial_datetime, create_pendulum_time(2019, 2, 26)
        )

        # Still a single tick: the one left in STARTED by the crash, now marked SUCCESS
        ticks = instance.get_ticks(
            external_schedule.get_external_origin_id(), external_schedule.selector_id
        )
        assert len(ticks) == 1
        validate_tick(
            ticks[0],
            external_schedule,
            initial_datetime,
            TickStatus.SUCCESS,
            [instance.get_runs()[0].run_id],
        )
def test_failure_recovery_after_tick_success(instance, external_repo, crash_location, crash_signal):
    # Crash the scheduler subprocess at `crash_location` once the run for "simple_schedule"
    # exists, then run the scheduler again and check that it only moves the existing tick
    # to SUCCESS without creating a second run.
    initial_datetime = create_pendulum_time(year=2019, month=2, day=27, hour=0, minute=0, second=0)
    # add() with no offset: presumably just a separate copy that can be advanced
    # independently of initial_datetime - TODO confirm
    frozen_datetime = initial_datetime.add()
    external_schedule = external_repo.get_external_schedule("simple_schedule")
    with pendulum.test(frozen_datetime):
        instance.start_schedule(external_schedule)

        debug_crash_flags = {external_schedule.name: {crash_location: crash_signal}}

        scheduler_process = spawn_ctx.Process(
            target=_test_launch_scheduled_runs_in_subprocess,
            args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
        )
        scheduler_process.start()
        scheduler_process.join(timeout=60)

        # join() returns silently on timeout and leaves exitcode as None, which would
        # satisfy `!= 0`; require that the child really exited before checking how.
        assert scheduler_process.exitcode is not None
        assert scheduler_process.exitcode != 0

        # There is a possible race condition here: if the scheduler crashes right after the
        # launch and is re-run before the run actually starts, it could launch the same run
        # twice. Waiting for the run to start first avoids that.
        wait_for_all_runs_to_start(instance)

        assert instance.get_runs_count() == 1
        validate_run_exists(
            instance.get_runs()[0], initial_datetime, create_pendulum_time(2019, 2, 26)
        )

        ticks = instance.get_ticks(
            external_schedule.get_external_origin_id(), external_schedule.selector_id
        )
        assert len(ticks) == 1

        # With the terminate signal the tick is expected to list no run ids; with any other
        # signal it is expected to list the ids of the existing runs.
        if crash_signal == get_terminate_signal():
            run_ids = []
        else:
            run_ids = [run.run_id for run in instance.get_runs()]

        validate_tick(
            ticks[0],
            external_schedule,
            initial_datetime,
            TickStatus.STARTED,
            run_ids,
        )

    frozen_datetime = frozen_datetime.add(minutes=1)
    with pendulum.test(frozen_datetime):
        # Running again just marks the tick as success since the run has already started
        scheduler_process = spawn_ctx.Process(
            target=_test_launch_scheduled_runs_in_subprocess,
            args=[instance.get_ref(), frozen_datetime, None],
        )
        scheduler_process.start()
        scheduler_process.join(timeout=60)
        assert scheduler_process.exitcode == 0

        assert instance.get_runs_count() == 1
        validate_run_exists(
            instance.get_runs()[0], initial_datetime, create_pendulum_time(2019, 2, 26)
        )

        ticks = instance.get_ticks(
            external_schedule.get_external_origin_id(), external_schedule.selector_id
        )
        assert len(ticks) == 1
        validate_tick(
            ticks[0],
            external_schedule,
            initial_datetime,
            TickStatus.SUCCESS,
            [instance.get_runs()[0].run_id],
        )
def test_different_days_in_different_timezones(instance, workspace, external_repo):
    # Check "daily_late_schedule" around 11PM US/Central while the clock is frozen in
    # US/Pacific: one tick at 23:00 Central on 2019-02-27, with the run validated against
    # the 2019-02-26 (US/Central) partition.
    freeze_datetime = to_timezone(
        create_pendulum_time(2019, 2, 27, 22, 59, 59, tz="US/Central"),
        "US/Pacific",
    )

    def run_scheduler():
        # Exhaust the iterator so the whole scheduler iteration executes; "now" is read
        # from whichever clock is frozen at call time.
        for _ in launch_scheduled_runs(instance, workspace, logger(), pendulum.now("UTC")):
            pass

    def current_ticks():
        return instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)

    with pendulum.test(freeze_datetime):
        # Runs every day at 11PM (CST)
        external_schedule = external_repo.get_external_schedule("daily_late_schedule")
        schedule_origin = external_schedule.get_external_origin()
        instance.start_schedule(external_schedule)

        # Nothing has been evaluated yet
        assert instance.get_runs_count() == 0
        assert len(current_ticks()) == 0

        # One second before 11PM US/Central: an iteration must not launch anything
        run_scheduler()
        assert instance.get_runs_count() == 0
        assert len(current_ticks()) == 0

    freeze_datetime = freeze_datetime.add(seconds=2)
    with pendulum.test(freeze_datetime):
        run_scheduler()

        assert instance.get_runs_count() == 1
        ticks = current_ticks()
        assert len(ticks) == 1

        eleven_pm_central = create_pendulum_time(
            year=2019, month=2, day=27, hour=23, tz="US/Central"
        )
        expected_datetime = to_timezone(eleven_pm_central, "UTC")

        launched_run_id = instance.get_runs()[0].run_id
        validate_tick(
            ticks[0],
            external_schedule,
            expected_datetime,
            TickStatus.SUCCESS,
            [launched_run_id],
        )

        wait_for_all_runs_to_start(instance)
        first_run = instance.get_runs()[0]
        partition_day = create_pendulum_time(2019, 2, 26, tz="US/Central")
        validate_run_started(instance, first_run, expected_datetime, partition_day)

        # Verify idempotence: another iteration at the same instant adds no run or tick
        run_scheduler()
        assert instance.get_runs_count() == 1
        ticks = current_ticks()
        assert len(ticks) == 1
        assert ticks[0].status == TickStatus.SUCCESS
def test_failure_recovery_before_run_created(instance, external_repo, crash_location, crash_signal):
    # Verify that if the scheduler crashes or is interrupted before a run is created,
    # it will create exactly one tick/run when it is re-launched
    initial_datetime = to_timezone(
        create_pendulum_time(year=2019, month=2, day=27, hour=0, minute=0, second=0, tz="UTC"),
        "US/Central",
    )
    # add() with no offset: presumably just a separate copy that can be advanced
    # independently of initial_datetime - TODO confirm
    frozen_datetime = initial_datetime.add()

    external_schedule = external_repo.get_external_schedule("simple_schedule")
    with pendulum.test(frozen_datetime):
        instance.start_schedule(external_schedule)

        debug_crash_flags = {external_schedule.name: {crash_location: crash_signal}}

        scheduler_process = spawn_ctx.Process(
            target=_test_launch_scheduled_runs_in_subprocess,
            args=[instance.get_ref(), frozen_datetime, debug_crash_flags],
        )
        scheduler_process.start()
        scheduler_process.join(timeout=60)

        # join() returns silently on timeout and leaves exitcode as None, which would
        # satisfy `!= 0`; require that the child really exited before checking how.
        assert scheduler_process.exitcode is not None
        assert scheduler_process.exitcode != 0

        # The crash left a STARTED tick behind, but no run was created
        ticks = instance.get_ticks(
            external_schedule.get_external_origin_id(), external_schedule.selector_id
        )
        assert len(ticks) == 1
        assert ticks[0].status == TickStatus.STARTED

        assert instance.get_runs_count() == 0

    frozen_datetime = frozen_datetime.add(minutes=5)
    with pendulum.test(frozen_datetime):
        # Re-run without crash flags
        scheduler_process = spawn_ctx.Process(
            target=_test_launch_scheduled_runs_in_subprocess,
            args=[instance.get_ref(), frozen_datetime, None],
        )
        scheduler_process.start()
        scheduler_process.join(timeout=60)
        assert scheduler_process.exitcode == 0

        assert instance.get_runs_count() == 1
        wait_for_all_runs_to_start(instance)
        validate_run_exists(
            instance.get_runs()[0],
            execution_time=initial_datetime,
            partition_time=create_pendulum_time(2019, 2, 26),
        )

        # Still a single tick, now validated as SUCCESS for the original execution time
        ticks = instance.get_ticks(
            external_schedule.get_external_origin_id(), external_schedule.selector_id
        )
        assert len(ticks) == 1
        validate_tick(
            ticks[0],
            external_schedule,
            initial_datetime,
            TickStatus.SUCCESS,
            [instance.get_runs()[0].run_id],
        )
def test_hourly_dst_spring_forward(instance, workspace, external_repo):
    # Verify that an hourly schedule still runs hourly during the spring DST transition
    # 1AM CST
    freeze_datetime = to_timezone(
        create_pendulum_time(2019, 3, 10, 1, 0, 0, tz="US/Central"), "US/Pacific"
    )

    with pendulum.test(freeze_datetime):
        external_schedule = external_repo.get_external_schedule("hourly_central_time_schedule")
        schedule_origin = external_schedule.get_external_origin()
        instance.start_schedule(external_schedule)

        assert instance.get_runs_count() == 0
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 0

    freeze_datetime = freeze_datetime.add(hours=2)

    # DST has now happened, 2 hours later it is 4AM CDT
    # Should be 3 runs: 1AM CST, 3AM CDT, 4AM CDT
    with pendulum.test(freeze_datetime):
        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )

        wait_for_all_runs_to_start(instance)

        assert instance.get_runs_count() == 3
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 3

        # Listed latest-first; compared position by position with the ticks and runs
        expected_datetimes_utc = [
            to_timezone(create_pendulum_time(2019, 3, 10, 4, 0, 0, tz="US/Central"), "UTC"),
            to_timezone(create_pendulum_time(2019, 3, 10, 3, 0, 0, tz="US/Central"), "UTC"),
            to_timezone(create_pendulum_time(2019, 3, 10, 1, 0, 0, tz="US/Central"), "UTC"),
        ]

        # Query the runs once instead of on every loop iteration
        runs = instance.get_runs()
        for tick, run, expected_utc in zip(ticks, runs, expected_datetimes_utc):
            validate_tick(
                tick,
                external_schedule,
                expected_utc,
                TickStatus.SUCCESS,
                [run.run_id],
            )

            # The partition checked is the hour before the execution time, in US/Central
            validate_run_started(
                instance,
                run,
                expected_utc,
                partition_time=to_timezone(expected_utc, "US/Central").subtract(hours=1),
                partition_fmt=DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE,
            )

        # Verify idempotence
        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )
        assert instance.get_runs_count() == 3
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 3
def test_failure_before_run_created(crash_location, crash_signal, capfd):
    # Evaluate "simple_sensor" three times in subprocesses: a first evaluation that is
    # skipped, a second one that crashes at `crash_location` with `crash_signal` before any
    # run exists, and a third one that must launch exactly one run.
    frozen_datetime = to_timezone(
        create_pendulum_time(year=2019, month=2, day=28, hour=0, minute=0, second=1, tz="UTC"),
        "US/Central",
    )

    with instance_with_sensors() as (
        instance,
        _grpc_server_registry,
        external_repo,
    ):
        with pendulum.test(frozen_datetime):
            external_sensor = external_repo.get_external_sensor("simple_sensor")
            instance.add_instigator_state(
                InstigatorState(
                    external_sensor.get_external_origin(),
                    InstigatorType.SENSOR,
                    InstigatorStatus.RUNNING,
                )
            )

            # create a tick
            launch_process = spawn_ctx.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime, None],
            )
            launch_process.start()
            launch_process.join(timeout=60)
            ticks = instance.get_ticks(external_sensor.get_external_origin_id())
            assert len(ticks) == 1
            assert ticks[0].status == TickStatus.SKIPPED
            # drop output captured so far
            capfd.readouterr()

            # create a starting tick, but crash
            debug_crash_flags = {external_sensor.name: {crash_location: crash_signal}}
            launch_process = spawn_ctx.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime.add(seconds=31), debug_crash_flags],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            # join() returns silently on timeout and leaves exitcode as None, which would
            # satisfy `!= 0`; require that the child really exited before checking how.
            assert launch_process.exitcode is not None
            assert launch_process.exitcode != 0

            # drop the crashed evaluation's output so only the final evaluation is compared
            capfd.readouterr()

            ticks = instance.get_ticks(external_sensor.get_external_origin_id())
            assert len(ticks) == 2
            assert ticks[0].status == TickStatus.STARTED
            assert not int(ticks[0].timestamp) % 2  # skip condition for simple_sensor
            assert instance.get_runs_count() == 0

            # create another tick, but ensure that the last evaluation time used is from the first,
            # successful tick rather than the failed tick
            launch_process = spawn_ctx.Process(
                target=_test_launch_sensor_runs_in_subprocess,
                args=[instance.get_ref(), frozen_datetime.add(seconds=62), None],
            )
            launch_process.start()
            launch_process.join(timeout=60)

            assert launch_process.exitcode == 0
            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 1
            run = instance.get_runs()[0]
            # NOTE(review): the expected text is a single physical line in this file; confirm
            # that its separators match what get_logger_output_from_capfd returns.
            assert (
                get_logger_output_from_capfd(capfd, "dagster.daemon.SensorDaemon")
                == f"""2019-02-27 18:01:03 -0600 - dagster.daemon.SensorDaemon - INFO - Checking for new runs for sensor: simple_sensor 2019-02-27 18:01:03 -0600 - dagster.daemon.SensorDaemon - INFO - Launching run for simple_sensor 2019-02-27 18:01:03 -0600 - dagster.daemon.SensorDaemon - INFO - Completed launch of run {run.run_id} for simple_sensor"""
            )

            ticks = instance.get_ticks(external_sensor.get_external_origin_id())
            assert len(ticks) == 3
            assert ticks[0].status == TickStatus.SUCCESS
def test_hourly_dst_fall_back(instance, workspace, external_repo):
    # Verify that an hourly schedule still runs hourly during the fall DST transition
    # 12:30 AM CDT
    freeze_datetime = to_timezone(
        create_pendulum_time(2019, 11, 3, 0, 30, 0, tz="US/Central"), "US/Pacific"
    )

    with pendulum.test(freeze_datetime):
        external_schedule = external_repo.get_external_schedule("hourly_central_time_schedule")
        schedule_origin = external_schedule.get_external_origin()
        instance.start_schedule(external_schedule)

        assert instance.get_runs_count() == 0
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 0

    freeze_datetime = freeze_datetime.add(hours=4)

    # DST has now happened, 4 hours later it is 3:30AM CST
    # Should be 4 runs: 1AM CDT, 1AM CST, 2AM CST, 3AM CST
    with pendulum.test(freeze_datetime):
        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )

        wait_for_all_runs_to_start(instance)

        assert instance.get_runs_count() == 4
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 4

        # Listed latest-first; compared position by position with the ticks and runs
        expected_datetimes_utc = [
            create_pendulum_time(2019, 11, 3, 9, 0, 0, tz="UTC"),
            create_pendulum_time(2019, 11, 3, 8, 0, 0, tz="UTC"),
            create_pendulum_time(2019, 11, 3, 7, 0, 0, tz="UTC"),
            create_pendulum_time(2019, 11, 3, 6, 0, 0, tz="UTC"),
        ]

        expected_ct_times = [
            "2019-11-03T03:00:00-06:00",  # 3 AM CST
            "2019-11-03T02:00:00-06:00",  # 2 AM CST
            "2019-11-03T01:00:00-06:00",  # 1 AM CST
            "2019-11-03T01:00:00-05:00",  # 1 AM CDT
        ]

        # Query the runs once instead of on every loop iteration
        runs = instance.get_runs()
        for tick, run, expected_utc, expected_ct in zip(
            ticks, runs, expected_datetimes_utc, expected_ct_times
        ):
            # Sanity-check the fixture itself: each UTC instant maps to the intended
            # US/Central wall-clock time and offset
            assert to_timezone(expected_utc, "US/Central").isoformat() == expected_ct

            validate_tick(
                tick,
                external_schedule,
                expected_utc,
                TickStatus.SUCCESS,
                [run.run_id],
            )

            # The partition checked is the hour before the execution time, in US/Central
            validate_run_started(
                instance,
                run,
                expected_utc,
                partition_time=to_timezone(expected_utc, "US/Central").subtract(hours=1),
                partition_fmt=DEFAULT_HOURLY_FORMAT_WITH_TIMEZONE,
            )

        # Verify idempotence
        list(
            launch_scheduled_runs(
                instance,
                workspace,
                logger(),
                pendulum.now("UTC"),
            )
        )
        assert instance.get_runs_count() == 4
        ticks = instance.get_ticks(schedule_origin.get_id(), external_schedule.selector_id)
        assert len(ticks) == 4