示例#1
0
    def test_update_schedule(self, storage):
        """update_job_state persists status changes and job-specific data round trips.

        Covers both directions: STOPPED -> RUNNING (start_timestamp set) and
        RUNNING -> STOPPED (start_timestamp cleared back to None).
        """
        assert storage

        schedule = self.build_schedule("my_schedule", "* * * * *")
        storage.add_job_state(schedule)

        now_time = get_current_datetime_in_utc().timestamp()

        # Flip the schedule to RUNNING and record a start timestamp.
        new_schedule = schedule.with_status(JobStatus.RUNNING).with_data(
            ScheduleJobData(
                cron_schedule=schedule.job_specific_data.cron_schedule,
                start_timestamp=now_time,
            )
        )
        storage.update_job_state(new_schedule)

        schedules = storage.all_stored_job_state(self.fake_repo_target().get_id(), JobType.SCHEDULE)
        assert len(schedules) == 1

        schedule = schedules[0]
        assert schedule.job_name == "my_schedule"
        assert schedule.status == JobStatus.RUNNING
        assert schedule.job_specific_data.start_timestamp == now_time

        # Stop it again; omitting start_timestamp should reset it.
        stopped_schedule = schedule.with_status(JobStatus.STOPPED).with_data(
            ScheduleJobData(schedule.job_specific_data.cron_schedule)
        )
        storage.update_job_state(stopped_schedule)

        schedules = storage.all_stored_job_state(self.fake_repo_target().get_id(), JobType.SCHEDULE)
        assert len(schedules) == 1

        schedule = schedules[0]
        assert schedule.job_name == "my_schedule"
        assert schedule.status == JobStatus.STOPPED
        # Use identity comparison for None (PEP 8); `== None` can be fooled by
        # a custom __eq__ and is flagged by linters (E711).
        assert schedule.job_specific_data.start_timestamp is None
示例#2
0
def test_get_unloadable_job(graphql_context):
    """Only RUNNING job states with unloadable origins show up as unloadable."""
    instance = graphql_context.instance
    frozen_time = pendulum.datetime(year=2019, month=2, day=27, hour=23, minute=59, second=59)
    with pendulum.test(frozen_time):
        # Seed one RUNNING and one STOPPED state, both with origins that
        # cannot be loaded back from the workspace.
        for schedule_name, schedule_status in (
            ("unloadable_running", JobStatus.RUNNING),
            ("unloadable_stopped", JobStatus.STOPPED),
        ):
            instance.add_job_state(
                JobState(
                    _get_unloadable_schedule_origin(schedule_name),
                    JobType.SCHEDULE,
                    schedule_status,
                    ScheduleJobData("0 0 * * *", pendulum.now("UTC").timestamp()),
                )
            )

    result = execute_dagster_graphql(graphql_context, GET_UNLOADABLE_QUERY)
    unloadable = result.data["unloadableJobStatesOrError"]["results"]
    assert len(unloadable) == 1
    assert unloadable[0]["name"] == "unloadable_running"
示例#3
0
    def stop_schedule_and_update_storage_state(self, instance,
                                               schedule_origin_id):
        """
        Updates the status of the given schedule to `JobStatus.STOPPED` in schedule storage,
        then calls `stop_schedule`.

        This should not be overridden by subclasses.

        Args:
            instance (DagsterInstance): The current instance.
            schedule_origin_id (string): The id of the schedule target to stop running.

        Returns:
            The stopped job state that was written to storage.
        """

        check.str_param(schedule_origin_id, "schedule_origin_id")

        schedule_state = self._get_schedule_state(instance, schedule_origin_id)

        # Tear down the external scheduling artifact first, then persist the
        # STOPPED status so storage reflects reality even if persistence fails.
        self.stop_schedule(instance, schedule_origin_id)
        # start_timestamp is intentionally not passed here — presumably it
        # defaults to None so a stopped schedule carries no start time
        # (TODO confirm against ScheduleJobData's signature).
        stopped_schedule = schedule_state.with_status(
            JobStatus.STOPPED).with_data(
                ScheduleJobData(
                    cron_schedule=schedule_state.job_specific_data.
                    cron_schedule,
                    scheduler=self.__class__.__name__,
                ))
        instance.update_job_state(stopped_schedule)
        return stopped_schedule
示例#4
0
def test_reconcile_schedule_without_start_time():
    """A RUNNING schedule state missing a start timestamp gets one on reconcile."""
    with TemporaryDirectory() as tempdir:
        instance = define_scheduler_instance(tempdir)
        with get_test_external_repo() as external_repo:
            external_schedule = external_repo.get_external_schedule(
                "no_config_pipeline_daily_schedule")

            # Simulate a legacy state stored with start_timestamp=None.
            stale_state = JobState(
                external_schedule.get_external_origin(),
                JobType.SCHEDULE,
                JobStatus.RUNNING,
                ScheduleJobData(external_schedule.cron_schedule, None),
            )
            instance.add_job_state(stale_state)

            instance.reconcile_scheduler_state(
                external_repository=external_repo)

            reconciled = instance.get_job_state(
                external_schedule.get_external_origin_id())

            # Status is preserved, and the missing timestamp is backfilled
            # with the current time.
            assert reconciled.status == JobStatus.RUNNING
            expected_ts = get_timestamp_from_utc_datetime(
                get_current_datetime_in_utc())
            assert reconciled.job_specific_data.start_timestamp == expected_ts
示例#5
0
 def build_schedule(
     cls, schedule_name, cron_schedule, status=JobStatus.STOPPED,
 ):
     """Construct a schedule JobState for the fake repo with no start timestamp."""
     origin = cls.fake_repo_target().get_job_origin(schedule_name)
     job_data = ScheduleJobData(cron_schedule, start_timestamp=None)
     return JobState(origin, JobType.SCHEDULE, status, job_data)
示例#6
0
    def get_default_job_state(self):
        """Return the initial (STOPPED, no start time) JobState for this schedule."""
        # Local import — presumably to avoid a circular dependency at module
        # load time (TODO confirm).
        from dagster.core.scheduler.job import JobState, JobStatus, ScheduleJobData

        origin = self.get_external_origin()
        default_data = ScheduleJobData(self.cron_schedule, start_timestamp=None)
        return JobState(origin, JobType.SCHEDULE, JobStatus.STOPPED, default_data)
示例#7
0
    def _create_new_schedule_state(self, instance, external_schedule):
        """Persist and return a fresh STOPPED JobState for external_schedule."""
        new_state = JobState(
            external_schedule.get_external_origin(),
            JobType.SCHEDULE,
            JobStatus.STOPPED,
            ScheduleJobData(external_schedule.cron_schedule),
        )
        instance.add_job_state(new_state)
        return new_state
示例#8
0
def test_bad_load(capfd):
    """An unloadable schedule origin launches no runs, records no ticks, and
    surfaces the load failure on stdout instead of raising."""
    with schedule_instance() as instance:
        fake_origin = _get_unloadable_schedule_origin()
        initial_datetime = pendulum.datetime(
            year=2019,
            month=2,
            day=27,
            hour=23,
            minute=59,
            second=59,
        )
        with pendulum.test(initial_datetime):
            # Store a RUNNING state whose origin cannot be loaded back.
            schedule_state = JobState(
                fake_origin,
                JobType.SCHEDULE,
                JobStatus.RUNNING,
                ScheduleJobData(
                    "0 0 * * *",
                    pendulum.now("UTC").timestamp(),
                ),
            )
            instance.add_job_state(schedule_state)

        # Advance one second so the scheduler iteration runs after storage setup.
        initial_datetime = initial_datetime.add(seconds=1)
        with pendulum.test(initial_datetime):
            launch_scheduled_runs(instance, logger(), pendulum.now("UTC"))

            # Nothing launched and no tick recorded for the bad origin.
            assert instance.get_runs_count() == 0

            ticks = instance.get_job_ticks(fake_origin.get_id())

            assert len(ticks) == 0

            # The failure is reported via captured output, not an exception.
            captured = capfd.readouterr()
            assert "Scheduler failed for doesnt_exist" in captured.out
            assert "doesnt_exist not found at module scope" in captured.out

        # A full day later the schedule is still skipped — no retries leak
        # into runs or ticks.
        initial_datetime = initial_datetime.add(days=1)
        with pendulum.test(initial_datetime):
            launch_scheduled_runs(instance, logger(), pendulum.now("UTC"))
            assert instance.get_runs_count() == 0
            ticks = instance.get_job_ticks(fake_origin.get_id())
            assert len(ticks) == 0
示例#9
0
    def start_schedule_and_update_storage_state(self, instance,
                                                external_schedule):
        """
        Updates the status of the given schedule to `JobStatus.RUNNING` in schedule storage,
        then calls `start_schedule`.

        This should not be overridden by subclasses.

        Args:
            instance (DagsterInstance): The current instance.
            external_schedule (ExternalSchedule): The schedule to start

        """
        check.inst_param(instance, "instance", DagsterInstance)
        check.inst_param(external_schedule, "external_schedule",
                         ExternalSchedule)

        origin_id = external_schedule.get_external_origin_id()
        schedule_state = instance.get_job_state(origin_id)
        if not schedule_state:
            # First time this schedule is seen: materialize a STOPPED state.
            schedule_state = self._create_new_schedule_state(
                instance, external_schedule)

        # Starting an already-running schedule is a caller error.
        if schedule_state.status == JobStatus.RUNNING:
            raise DagsterSchedulerError(
                "You have attempted to start schedule {name}, but it is already running"
                .format(name=external_schedule.name))

        self.start_schedule(instance, external_schedule)
        running_data = ScheduleJobData(
            external_schedule.cron_schedule,
            get_current_datetime_in_utc().timestamp(),
            scheduler=self.__class__.__name__,
        )
        started_schedule = schedule_state.with_status(
            JobStatus.RUNNING).with_data(running_data)
        instance.update_job_state(started_schedule)
        return started_schedule
示例#10
0
    def reconcile_scheduler_state(self, instance, external_repository):
        """Reconcile the ExternalSchedule list from the repository and ScheduleStorage
        on the instance to ensure there is a 1-1 correlation between ExternalSchedule and
        JobStates of type JobType.SCHEDULE, where the ExternalSchedule list is the source of truth.

        If a new ExternalSchedule is introduced, a new JobState is added to storage with status
        JobStatus.STOPPED.

        For every previously existing ExternalSchedule (where target id is the primary key),
        any changes to the definition are persisted in the corresponding JobState and the status is
        left unchanged. The schedule is also restarted to make sure the external artifacts (such
        as a cron job) are up to date.

        For every ScheduleDefinitions that is removed, the corresponding JobState is removed from
        the storage and the corresponding job is ended.

        Raises:
            DagsterScheduleReconciliationError: if any individual restart/stop/delete
                step raised a DagsterSchedulerError; errors are collected and raised
                together at the end rather than aborting on the first failure.
        """

        schedules_to_restart = []
        for external_schedule in external_repository.get_external_schedules():
            # If a schedule already exists for schedule_def, overwrite bash script and
            # metadata file
            existing_schedule_state = instance.get_job_state(
                external_schedule.get_external_origin_id())
            if existing_schedule_state:
                # Backfill a missing start timestamp for states that are
                # already RUNNING (legacy states stored without one).
                new_timestamp = existing_schedule_state.job_specific_data.start_timestamp
                if not new_timestamp and existing_schedule_state.status == JobStatus.RUNNING:
                    new_timestamp = get_timestamp_from_utc_datetime(
                        get_current_datetime_in_utc())

                # Keep the status, update target and cron schedule
                schedule_state = JobState(
                    external_schedule.get_external_origin(),
                    JobType.SCHEDULE,
                    existing_schedule_state.status,
                    ScheduleJobData(
                        external_schedule.cron_schedule,
                        new_timestamp,
                        scheduler=self.__class__.__name__,
                    ),
                )

                instance.update_job_state(schedule_state)
                # Defer refresh/stop until after all states are persisted.
                schedules_to_restart.append(
                    (existing_schedule_state, external_schedule))
            else:
                self._create_new_schedule_state(instance, external_schedule)

        # Delete all existing schedules that are not in external schedules
        external_schedule_origin_ids = {
            s.get_external_origin_id()
            for s in external_repository.get_external_schedules()
        }
        existing_schedule_origin_ids = set([
            job.job_origin_id for job in instance.all_stored_job_state(
                external_repository.get_external_origin_id())
            if job.job_type == JobType.SCHEDULE
        ])
        schedule_origin_ids_to_delete = existing_schedule_origin_ids - external_schedule_origin_ids

        # Errors from individual schedules are collected so one bad schedule
        # does not prevent the rest from being reconciled.
        schedule_reconciliation_errors = []
        for schedule_state, external_schedule in schedules_to_restart:
            # Restart is only needed if the schedule was previously running
            if schedule_state.status == JobStatus.RUNNING:
                try:
                    self.refresh_schedule(instance, external_schedule)
                except DagsterSchedulerError as e:
                    schedule_reconciliation_errors.append(e)

            if schedule_state.status == JobStatus.STOPPED:
                try:
                    self.stop_schedule(
                        instance, external_schedule.get_external_origin_id())
                except DagsterSchedulerError as e:
                    schedule_reconciliation_errors.append(e)

        # Remove states (and end the corresponding jobs) for schedules that no
        # longer exist in the repository.
        for schedule_origin_id in schedule_origin_ids_to_delete:
            try:
                instance.stop_schedule_and_delete_from_storage(
                    schedule_origin_id)
            except DagsterSchedulerError as e:
                schedule_reconciliation_errors.append(e)

        if len(schedule_reconciliation_errors):
            raise DagsterScheduleReconciliationError(
                "One or more errors were encountered by the Scheduler while starting or stopping schedules. "
                "Individual error messages follow:",
                errors=schedule_reconciliation_errors,
            )
示例#11
0
def test_bad_schedules_mixed_with_good_schedule(external_repo_context, capfd):
    """One failing schedule or an unloadable origin must not block other
    schedules from launching runs and recording ticks."""
    with instance_with_schedules(external_repo_context) as (instance,
                                                            external_repo):
        good_schedule = external_repo.get_external_schedule("simple_schedule")
        # This schedule's should_execute raises on odd days (per its name and
        # the tick error asserted below).
        bad_schedule = external_repo.get_external_schedule(
            "bad_should_execute_schedule_on_odd_days")

        good_origin = good_schedule.get_external_origin()
        bad_origin = bad_schedule.get_external_origin()
        unloadable_origin = _get_unloadable_schedule_origin()
        initial_datetime = pendulum.datetime(
            year=2019,
            month=2,
            day=27,
            hour=0,
            minute=0,
            second=0,
        )
        with pendulum.test(initial_datetime):
            instance.start_schedule_and_update_storage_state(good_schedule)
            instance.start_schedule_and_update_storage_state(bad_schedule)

            # A RUNNING state whose origin cannot be loaded back at all.
            unloadable_schedule_state = JobState(
                unloadable_origin,
                JobType.SCHEDULE,
                JobStatus.RUNNING,
                ScheduleJobData("0 0 * * *",
                                pendulum.now("UTC").timestamp()),
            )
            instance.add_job_state(unloadable_schedule_state)

            launch_scheduled_runs(instance, logger(), pendulum.now("UTC"))

            # Day 1 (2019-02-27, odd day): only the good schedule launches.
            assert instance.get_runs_count() == 1
            wait_for_all_runs_to_start(instance)
            validate_run_started(
                instance.get_runs()[0],
                execution_time=initial_datetime,
                partition_time=pendulum.datetime(2019, 2, 26),
            )

            good_ticks = instance.get_job_ticks(good_origin.get_id())
            assert len(good_ticks) == 1
            validate_tick(
                good_ticks[0],
                good_schedule,
                initial_datetime,
                JobTickStatus.SUCCESS,
                instance.get_runs()[0].run_id,
            )

            # The bad schedule records a FAILURE tick with the should_execute
            # error, instead of launching a run.
            bad_ticks = instance.get_job_ticks(bad_origin.get_id())
            assert len(bad_ticks) == 1

            assert bad_ticks[0].status == JobTickStatus.FAILURE

            assert ("Error occurred during the execution of should_execute "
                    "for schedule bad_should_execute_schedule"
                    in bad_ticks[0].error.message)

            # The unloadable schedule never gets a tick; its failure only
            # shows up in the captured output.
            unloadable_ticks = instance.get_job_ticks(
                unloadable_origin.get_id())
            assert len(unloadable_ticks) == 0

            captured = capfd.readouterr()
            assert "Scheduler failed for doesnt_exist" in captured.out
            assert "doesnt_exist not found at module scope" in captured.out

        # Day 2 (2019-02-28, even day): now the bad schedule's should_execute
        # passes, so both good and bad schedules launch.
        initial_datetime = initial_datetime.add(days=1)
        with pendulum.test(initial_datetime):
            new_now = pendulum.now("UTC")
            launch_scheduled_runs(instance, logger(), new_now)

            # 1 run from day 1 + 2 new runs (good + bad).
            assert instance.get_runs_count() == 3
            wait_for_all_runs_to_start(instance)

            good_schedule_runs = instance.get_runs(
                filters=PipelineRunsFilter.for_schedule(good_schedule))
            assert len(good_schedule_runs) == 2
            validate_run_started(
                good_schedule_runs[0],
                execution_time=new_now,
                partition_time=pendulum.datetime(2019, 2, 27),
            )

            good_ticks = instance.get_job_ticks(good_origin.get_id())
            assert len(good_ticks) == 2
            validate_tick(
                good_ticks[0],
                good_schedule,
                new_now,
                JobTickStatus.SUCCESS,
                good_schedule_runs[0].run_id,
            )

            bad_schedule_runs = instance.get_runs(
                filters=PipelineRunsFilter.for_schedule(bad_schedule))
            assert len(bad_schedule_runs) == 1
            validate_run_started(
                bad_schedule_runs[0],
                execution_time=new_now,
                partition_time=pendulum.datetime(2019, 2, 27),
            )

            bad_ticks = instance.get_job_ticks(bad_origin.get_id())
            assert len(bad_ticks) == 2
            validate_tick(
                bad_ticks[0],
                bad_schedule,
                new_now,
                JobTickStatus.SUCCESS,
                bad_schedule_runs[0].run_id,
            )

            # The unloadable origin still produces no ticks, only output.
            unloadable_ticks = instance.get_job_ticks(
                unloadable_origin.get_id())
            assert len(unloadable_ticks) == 0

            captured = capfd.readouterr()
            assert "Scheduler failed for doesnt_exist" in captured.out
            assert "doesnt_exist not found at module scope" in captured.out