def _create_sensor_tick(instance):
    with ProcessGrpcServerRegistry() as grpc_server_registry:
        with RepositoryLocationManager(grpc_server_registry) as location_manager:
            list(
                execute_sensor_iteration(
                    instance, get_default_daemon_logger("SensorDaemon"), location_manager
                )
            )

def evaluate_sensors(instance, grpc_server_registry):
    with RepositoryLocationManager(grpc_server_registry) as location_manager:
        list(
            execute_sensor_iteration(
                instance,
                get_default_daemon_logger("SensorDaemon"),
                location_manager,
            )
        )

def _test_launch_sensor_runs_in_subprocess(instance_ref, execution_datetime, debug_crash_flags):
    with DagsterInstance.from_ref(instance_ref) as instance:
        try:
            with pendulum.test(execution_datetime), ProcessGrpcServerRegistry() as grpc_server_registry:
                with RepositoryLocationManager(grpc_server_registry) as location_manager:
                    list(
                        execute_sensor_iteration(
                            instance,
                            get_default_daemon_logger("SensorDaemon"),
                            location_manager,
                            debug_crash_flags=debug_crash_flags,
                        )
                    )
        finally:
            cleanup_test_instance(instance)

def launch_scheduled_runs(
    instance,
    grpc_server_registry,
    logger,
    end_datetime_utc,
    max_catchup_runs=DEFAULT_MAX_CATCHUP_RUNS,
    debug_crash_flags=None,
):
    schedules = [
        s
        for s in instance.all_stored_job_state(job_type=JobType.SCHEDULE)
        if s.status == JobStatus.RUNNING
    ]

    if not schedules:
        logger.info("Not checking for any runs since no schedules have been started.")
        return

    schedule_names = ", ".join([schedule.job_name for schedule in schedules])
    logger.info(f"Checking for new runs for the following schedules: {schedule_names}")

    with RepositoryLocationManager(grpc_server_registry) as location_manager:
        for schedule_state in schedules:
            error_info = None
            try:
                origin = schedule_state.origin.external_repository_origin.repository_location_origin
                repo_location = location_manager.get_location(origin)
                yield from launch_scheduled_runs_for_schedule(
                    instance,
                    logger,
                    schedule_state,
                    repo_location,
                    end_datetime_utc,
                    max_catchup_runs,
                    (debug_crash_flags.get(schedule_state.job_name) if debug_crash_flags else None),
                )
            except Exception:  # pylint: disable=broad-except
                error_info = serializable_error_info_from_exc_info(sys.exc_info())
                logger.error(
                    f"Scheduler caught an error for schedule {schedule_state.job_name} : {error_info.to_string()}"
                )
            yield error_info

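# Hedged usage sketch (not from the source): one way a test might drive
# launch_scheduled_runs for a single frozen point in time, reusing the same
# helpers the sensor tests above use (ProcessGrpcServerRegistry, pendulum,
# get_default_daemon_logger). The helper name and the "SchedulerDaemon"
# logger name are illustrative assumptions.
def _evaluate_schedules_once(instance, execution_datetime):
    with pendulum.test(execution_datetime), ProcessGrpcServerRegistry() as grpc_server_registry:
        # launch_scheduled_runs is a generator that yields an error info (or None)
        # per schedule; draining it ensures every running schedule is evaluated.
        errors = [
            error
            for error in launch_scheduled_runs(
                instance,
                grpc_server_registry,
                get_default_daemon_logger("SchedulerDaemon"),
                pendulum.now("UTC"),
            )
            if error
        ]
    return errors
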
def execute_sensor_iteration_loop(instance, grpc_server_registry, logger, until=None):
    """
    Helper function that performs sensor evaluations on a tighter loop, while reusing grpc locations
    within a given daemon interval. Rather than relying on the daemon machinery to run the
    iteration loop every 30 seconds, sensors are continuously evaluated, every 5 seconds. We rely on
    each sensor definition's min_interval to check that sensor evaluations are spaced appropriately.
    """
    from dagster.daemon.daemon import CompletedIteration

    location_manager = None
    manager_loaded_time = None

    RELOAD_LOCATION_MANAGER_INTERVAL = 60

    start_time = pendulum.now("UTC").timestamp()
    with ExitStack() as stack:
        while True:
            start_time = pendulum.now("UTC").timestamp()
            if until and start_time >= until:
                # provide a way of organically ending the loop to support test environment
                break

            if (
                not location_manager
                or (start_time - manager_loaded_time) > RELOAD_LOCATION_MANAGER_INTERVAL
            ):
                stack.close()  # remove the previous context
                location_manager = stack.enter_context(
                    RepositoryLocationManager(grpc_server_registry)
                )
                manager_loaded_time = start_time

            yield from execute_sensor_iteration(instance, logger, location_manager)
            loop_duration = pendulum.now("UTC").timestamp() - start_time
            sleep_time = max(0, MIN_INTERVAL_LOOP_TIME - loop_duration)
            yield CompletedIteration()
            time.sleep(sleep_time)

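# Hedged usage sketch (not from the source): driving the loop above for a bounded
# window, as a test might. The `until` timestamp ends the loop organically, and the
# CompletedIteration markers yielded once per pass are filtered out so only sensor
# results and error infos remain. The helper name and the 30-second default window
# are illustrative assumptions.
def _run_sensor_loop_for(instance, grpc_server_registry, seconds=30):
    from dagster.daemon.daemon import CompletedIteration

    until = pendulum.now("UTC").timestamp() + seconds
    results = []
    for result in execute_sensor_iteration_loop(
        instance,
        grpc_server_registry,
        get_default_daemon_logger("SensorDaemon"),
        until=until,
    ):
        if not isinstance(result, CompletedIteration):
            results.append(result)
    return results
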
def run_iteration(self, instance, grpc_server_registry):
    in_progress_runs = self._get_in_progress_runs(instance)
    max_runs_to_launch = self._max_concurrent_runs - len(in_progress_runs)

    # Possibly under 0 if runs were launched without queuing
    if max_runs_to_launch <= 0:
        self._logger.info(
            "{} runs are currently in progress. Maximum is {}, won't launch more.".format(
                len(in_progress_runs), self._max_concurrent_runs
            )
        )
        return

    queued_runs = self._get_queued_runs(instance)

    if not queued_runs:
        self._logger.info("Poll returned no queued runs.")
    else:
        self._logger.info(
            "Retrieved {} queued runs, checking limits.".format(len(queued_runs))
        )

    # place in order
    sorted_runs = self._priority_sort(queued_runs)

    # launch until blocked by limit rules
    num_dequeued_runs = 0
    tag_concurrency_limits_counter = _TagConcurrencyLimitsCounter(
        self._tag_concurrency_limits, in_progress_runs
    )

    with RepositoryLocationManager(grpc_server_registry) as location_manager:
        for run in sorted_runs:
            if num_dequeued_runs >= max_runs_to_launch:
                break

            if tag_concurrency_limits_counter.is_run_blocked(run):
                continue

            error_info = None

            try:
                self._dequeue_run(instance, run, location_manager)
            except Exception:  # pylint: disable=broad-except
                error_info = serializable_error_info_from_exc_info(sys.exc_info())

                message = (
                    f"Caught an error for run {run.run_id} while removing it from the queue."
                    " Marking the run as failed and dropping it from the queue"
                )
                message_with_full_error = f"{message}: {error_info.to_string()}"

                self._logger.error(message_with_full_error)
                instance.report_run_failed(run, message_with_full_error)

                # modify the original error, so that the extra message appears in heartbeats
                error_info = error_info._replace(message=f"{message}: {error_info.message}")
            else:
                tag_concurrency_limits_counter.update_counters_with_launched_run(run)
                num_dequeued_runs += 1

            yield error_info

    self._logger.info("Launched {} runs.".format(num_dequeued_runs))
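
# Hedged illustration (not Dagster's _TagConcurrencyLimitsCounter): a minimal sketch
# of the tag-concurrency-limit idea that run_iteration relies on above. It assumes
# each limit looks like {"key": "team", "value": "data", "limit": 2} and that runs
# expose a `tags` dict; a run matching a limit that is already at capacity is skipped
# for this iteration and stays in the queue. All names and the limit shape here are
# assumptions made for illustration only.
class _IllustrativeTagLimitCounter:
    def __init__(self, tag_concurrency_limits, in_progress_runs):
        self._limits = tag_concurrency_limits or []
        self._counts = {}
        for run in in_progress_runs:
            self._count_run(run)

    def _matches(self, limit, run):
        tags = getattr(run, "tags", {}) or {}
        return tags.get(limit["key"]) == limit["value"]

    def _count_run(self, run):
        for index, limit in enumerate(self._limits):
            if self._matches(limit, run):
                self._counts[index] = self._counts.get(index, 0) + 1

    def is_run_blocked(self, run):
        # blocked if any matching limit has already reached its cap
        return any(
            self._matches(limit, run) and self._counts.get(index, 0) >= limit["limit"]
            for index, limit in enumerate(self._limits)
        )

    def update_counters_with_launched_run(self, run):
        self._count_run(run)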