def get_repository_location_from_kwargs(kwargs, instance):
    check.inst_param(instance, 'instance', DagsterInstance)
    workspace = get_workspace_from_kwargs(kwargs, instance)
    provided_location_name = kwargs.get('location')

    if provided_location_name is None and len(workspace.repository_location_handles) == 1:
        return RepositoryLocation.from_handle(next(iter(workspace.repository_location_handles)))

    if provided_location_name is None:
        raise click.UsageError(
            ('Must provide --location as there is more than one location '
             'available. Options are: {}').format(
                _sorted_quoted(workspace.repository_location_names)
            )
        )

    if not workspace.has_repository_location_handle(provided_location_name):
        raise click.UsageError(
            ('Location "{provided_location_name}" not found in workspace. '
             'Found {found_names} instead.').format(
                provided_location_name=provided_location_name,
                found_names=_sorted_quoted(workspace.repository_location_names),
            )
        )

    return RepositoryLocation.from_handle(
        workspace.get_repository_location_handle(provided_location_name)
    )
def get_repository_location_from_kwargs(kwargs, instance): check.inst_param(instance, "instance", DagsterInstance) with get_workspace_from_kwargs(kwargs, instance) as workspace: provided_location_name = kwargs.get("location") if provided_location_name is None and len( workspace.repository_location_handles) == 1: yield RepositoryLocation.from_handle( next(iter(workspace.repository_location_handles))) elif provided_location_name is None: raise click.UsageError(( "Must provide --location as there are more than one locations " "available. Options are: {}").format( _sorted_quoted(workspace.repository_location_names))) elif not workspace.has_repository_location_handle( provided_location_name): raise click.UsageError( ('Location "{provided_location_name}" not found in workspace. ' "Found {found_names} instead.").format( provided_location_name=provided_location_name, found_names=_sorted_quoted( workspace.repository_location_names), )) else: yield RepositoryLocation.from_handle( workspace.get_repository_location_handle( provided_location_name))
def test_repository_origin_unchanged_cli_api_vs_grpc():
    with get_bar_repo_repository_location_handle(UserProcessApi.CLI) as cli_handle:
        cli_location = RepositoryLocation.from_handle(cli_handle)
        external_repo = cli_location.get_repository("bar_repo")
        cli_origin_id = external_repo.get_origin_id()

    with get_bar_repo_repository_location_handle(UserProcessApi.GRPC) as grpc_handle:
        grpc_location = RepositoryLocation.from_handle(grpc_handle)
        external_repo = grpc_location.get_repository("bar_repo")
        grpc_origin_id = external_repo.get_origin_id()

    assert cli_origin_id == grpc_origin_id
def launch_scheduled_runs_for_schedule(
    instance, schedule_state, end_datetime_utc, max_catchup_runs, debug_crash_flags=None
):
    check.inst_param(instance, "instance", DagsterInstance)
    check.inst_param(schedule_state, "schedule_state", ScheduleState)
    check.inst_param(end_datetime_utc, "end_datetime_utc", datetime.datetime)

    latest_tick = instance.get_latest_tick(schedule_state.schedule_origin_id)

    if not latest_tick:
        start_timestamp_utc = schedule_state.start_timestamp
    elif latest_tick.status == ScheduleTickStatus.STARTED:
        # Scheduler was interrupted while performing this tick, re-do it
        start_timestamp_utc = latest_tick.timestamp
    else:
        start_timestamp_utc = latest_tick.timestamp + 1

    start_datetime_utc = datetime.datetime.fromtimestamp(start_timestamp_utc, tz=get_utc_timezone())

    tick_times = list(
        croniter_range(start_datetime_utc, end_datetime_utc, schedule_state.cron_schedule)
    )

    for schedule_time_utc in tick_times[-max_catchup_runs:]:
        if latest_tick and latest_tick.timestamp == schedule_time_utc.timestamp():
            tick = latest_tick
        else:
            tick = instance.create_schedule_tick(
                ScheduleTickData(
                    schedule_origin_id=schedule_state.schedule_origin_id,
                    schedule_name=schedule_state.name,
                    timestamp=schedule_time_utc.timestamp(),
                    cron_schedule=schedule_state.cron_schedule,
                    status=ScheduleTickStatus.STARTED,
                )
            )

        _check_for_debug_crash(debug_crash_flags, "TICK_CREATED")

        with ScheduleTickHolder(tick, instance) as tick_holder:
            _check_for_debug_crash(debug_crash_flags, "TICK_HELD")
            with RepositoryLocationHandle.create_from_repository_origin(
                schedule_state.origin.repository_origin, instance
            ) as repo_location_handle:
                repo_location = RepositoryLocation.from_handle(repo_location_handle)
                _schedule_run_at_time(
                    instance,
                    repo_location,
                    schedule_state,
                    schedule_time_utc,
                    tick_holder,
                    debug_crash_flags,
                )
def external_pipeline_from_location_handle(
    repository_location_handle, pipeline_name, solid_selection
):
    check.inst_param(
        repository_location_handle, "repository_location_handle", RepositoryLocationHandle
    )
    repo_location = RepositoryLocation.from_handle(repository_location_handle)

    repo_dict = repo_location.get_repositories()
    check.invariant(
        len(repo_dict) == 1,
        "Reconstructed repository location should have exactly one repository",
    )
    external_repo = next(iter(repo_dict.values()))

    pipeline_selector = PipelineSelector(
        location_name=repo_location.name,
        repository_name=external_repo.name,
        pipeline_name=pipeline_name,
        solid_selection=solid_selection,
    )
    subset_pipeline_result = repo_location.get_subset_external_pipeline_result(pipeline_selector)
    external_pipeline = ExternalPipeline(
        subset_pipeline_result.external_pipeline_data,
        external_repo.handle,
    )
    return external_pipeline
def external_pipeline_from_location_handle(
    repository_location_handle, external_pipeline_origin, solid_selection
):
    check.inst_param(
        repository_location_handle, "repository_location_handle", RepositoryLocationHandle
    )
    check.inst_param(external_pipeline_origin, "external_pipeline_origin", ExternalPipelineOrigin)

    repo_location = RepositoryLocation.from_handle(repository_location_handle)
    repo_name = external_pipeline_origin.external_repository_origin.repository_name
    pipeline_name = external_pipeline_origin.pipeline_name

    check.invariant(
        repo_location.has_repository(repo_name),
        "Could not find repository {repo_name} in location {repo_location_name}".format(
            repo_name=repo_name, repo_location_name=repo_location.name
        ),
    )
    external_repo = repo_location.get_repository(repo_name)

    pipeline_selector = PipelineSelector(
        location_name=repo_location.name,
        repository_name=external_repo.name,
        pipeline_name=pipeline_name,
        solid_selection=solid_selection,
    )
    subset_pipeline_result = repo_location.get_subset_external_pipeline_result(pipeline_selector)
    external_pipeline = ExternalPipeline(
        subset_pipeline_result.external_pipeline_data,
        external_repo.handle,
    )
    return external_pipeline
def external_pipeline_from_run(pipeline_run):
    check.inst_param(pipeline_run, "pipeline_run", PipelineRun)
    external_pipeline_origin = check.inst(
        pipeline_run.external_pipeline_origin, ExternalPipelineOrigin
    )
    with RepositoryLocationHandle.create_from_repository_location_origin(
        external_pipeline_origin.external_repository_origin.repository_location_origin
    ) as repo_location_handle:
        repo_location = RepositoryLocation.from_handle(repo_location_handle)

        repo_dict = repo_location.get_repositories()
        check.invariant(
            len(repo_dict) == 1,
            "Reconstructed repository location should have exactly one repository",
        )
        external_repo = next(iter(repo_dict.values()))

        pipeline_selector = PipelineSelector(
            location_name=repo_location.name,
            repository_name=external_repo.name,
            pipeline_name=pipeline_run.pipeline_name,
            solid_selection=pipeline_run.solid_selection,
        )
        subset_pipeline_result = repo_location.get_subset_external_pipeline_result(
            pipeline_selector
        )
        external_pipeline = ExternalPipeline(
            subset_pipeline_result.external_pipeline_data,
            external_repo.handle,
        )
        yield external_pipeline
def legacy_get_bar_repo_handle():
    recon_repo = ReconstructableRepository.from_legacy_repository_yaml(
        file_relative_path(__file__, "legacy_repository_file.yaml")
    )
    return (
        RepositoryLocation.from_handle(
            RepositoryLocationHandle.create_from_repository_location_origin(
                InProcessRepositoryLocationOrigin(recon_repo)
            )
        )
        .get_repository("bar_repo")
        .handle
    )
def get_test_project_external_repo(container_image=None):
    return RepositoryLocation.from_handle(
        RepositoryLocationHandle.create_from_repository_location_origin(
            InProcessRepositoryLocationOrigin(
                ReconstructableRepository.for_file(
                    file_relative_path(__file__, "test_pipelines/repo.py"),
                    "define_demo_execution_repo",
                    container_image=container_image,
                )
            )
        )
    ).get_repository("demo_execution_repo")
def get_main_external_repo(instance):
    return RepositoryLocation.from_handle(
        location_handle_from_python_file(
            python_file=file_relative_path(__file__, "setup.py"),
            attribute=main_repo_name(),
            working_directory=None,
            user_process_api=python_user_process_api_from_instance(instance),
            location_name=main_repo_location_name(),
        )
    ).get_repository(main_repo_name())
def get_test_external_repo():
    with RepositoryLocationHandle.create_from_repository_location_origin(
        ManagedGrpcPythonEnvRepositoryLocationOrigin(
            loadable_target_origin=LoadableTargetOrigin(
                executable_path=sys.executable,
                python_file=__file__,
                attribute="test_repository",
            ),
            location_name="test_location",
        )
    ) as handle:
        yield RepositoryLocation.from_handle(handle).get_repository("test_repository")
def get_main_external_repo():
    with RepositoryLocationHandle.create_from_repository_location_origin(
        location_origin_from_python_file(
            python_file=file_relative_path(__file__, "setup.py"),
            attribute=main_repo_name(),
            working_directory=None,
            location_name=main_repo_location_name(),
        )
    ) as handle:
        yield RepositoryLocation.from_handle(handle).get_repository(main_repo_name())
def reload_repository_location(self, name):
    self._workspace.reload_repository_location(name)

    if self._workspace.has_repository_location_handle(name):
        new_handle = self._workspace.get_repository_location_handle(name)
        new_location = RepositoryLocation.from_handle(new_handle)
        check.invariant(new_location.name == name)
        self._repository_locations[name] = new_location
    elif name in self._repository_locations:
        del self._repository_locations[name]
def default_repo():
    loadable_target_origin = LoadableTargetOrigin(
        executable_path=sys.executable,
        python_file=__file__,
        working_directory=os.getcwd(),
    )
    with RepositoryLocationHandle.create_from_repository_location_origin(
        ManagedGrpcPythonEnvRepositoryLocationOrigin(
            loadable_target_origin=loadable_target_origin,
            location_name="test_location",
        )
    ) as handle:
        yield RepositoryLocation.from_handle(handle).get_repository("the_repo")
def get_test_external_repo():
    return RepositoryLocation.from_handle(
        RepositoryLocationHandle.create_python_env_location(
            loadable_target_origin=LoadableTargetOrigin(
                executable_path=sys.executable,
                python_file=__file__,
                attribute="test_repository",
            ),
            location_name="test_location",
            user_process_api=UserProcessApi.CLI,
        )
    ).get_repository("test_repository")
def launch_scheduled_runs(
    instance,
    logger,
    end_datetime_utc,
    max_catchup_runs=_DEFAULT_MAX_CATCHUP_RUNS,
    debug_crash_flags=None,
):
    schedules = [
        s for s in instance.all_stored_schedule_state() if s.status == ScheduleStatus.RUNNING
    ]

    if not isinstance(instance.scheduler, DagsterCommandLineScheduler):
        raise DagsterInvariantViolationError(
            """Your dagster.yaml must be configured as follows in order to use dagster-scheduler:
scheduler:
  module: dagster.core.scheduler
  class: DagsterCommandLineScheduler
""",
        )

    if not schedules:
        logger.info("Not checking for any runs since no schedules have been started.")
        return

    logger.info(
        "Checking for new runs for the following schedules: {schedule_names}".format(
            schedule_names=", ".join([schedule.name for schedule in schedules]),
        )
    )

    for schedule_state in schedules:
        try:
            with RepositoryLocationHandle.create_from_repository_origin(
                schedule_state.origin.repository_origin, instance
            ) as repo_location_handle:
                repo_location = RepositoryLocation.from_handle(repo_location_handle)
                launch_scheduled_runs_for_schedule(
                    instance,
                    logger,
                    schedule_state,
                    repo_location,
                    end_datetime_utc,
                    max_catchup_runs,
                    (debug_crash_flags.get(schedule_state.name) if debug_crash_flags else None),
                )
        except Exception:  # pylint: disable=broad-except
            logger.error(
                "Scheduler failed for {schedule_name} : {error_info}".format(
                    schedule_name=schedule_state.name,
                    error_info=serializable_error_info_from_exc_info(sys.exc_info()).to_string(),
                )
            )
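# Invocation sketch -- an assumption, not from the source: a scheduler daemon
# would presumably call launch_scheduled_runs on an interval, passing the
# current UTC time so max_catchup_runs bounds how many missed ticks get
# replayed. datetime.timezone.utc stands in for the get_utc_timezone() helper
# used elsewhere in these snippets.
import datetime

def _example_scheduler_iteration(instance, logger):
    launch_scheduled_runs(
        instance,
        logger,
        end_datetime_utc=datetime.datetime.now(datetime.timezone.utc),
    )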
def test_dagster_out_of_process_location():
    with RepositoryLocationHandle.create_from_repository_location_origin(
        ManagedGrpcPythonEnvRepositoryLocationOrigin(
            location_name="test_location",
            loadable_target_origin=LoadableTargetOrigin(
                executable_path=sys.executable,
                python_file=file_relative_path(__file__, "setup.py"),
                attribute="test_repo",
            ),
        )
    ) as handle:
        env = RepositoryLocation.from_handle(handle)
        assert env.get_repository("test_repo")
def test_external_diamond_toposort():
    repo_location = RepositoryLocation.from_handle(
        location_handle_from_python_file(__file__, 'create_diamond_pipeline', UserProcessApi.CLI)
    )
    external_repo = next(iter(repo_location.get_repositories().values()))
    external_pipeline = next(iter(external_repo.get_all_external_pipelines()))
    assert external_pipeline.solid_names_in_topological_order == [
        'A_source',
        'A',
        'B',
        'C',
        'D',
    ]
def grpc_repo_location():
    loadable_target_origin = LoadableTargetOrigin(
        executable_path=sys.executable, python_file=__file__, attribute="the_repo"
    )
    server_process = GrpcServerProcess(loadable_target_origin=loadable_target_origin)
    try:
        with server_process.create_ephemeral_client() as api_client:
            yield RepositoryLocation.from_handle(
                RepositoryLocationHandle.create_grpc_server_location(
                    port=api_client.port,
                    socket=api_client.socket,
                    host=api_client.host,
                )
            )
    finally:
        server_process.wait()
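# Usage sketch -- an assumption, not from the source: since grpc_repo_location
# yields, it is presumably registered as a pytest fixture, so a test receives
# the already-loaded RepositoryLocation and the gRPC server process is torn
# down by the fixture's finally block when the test finishes.
def test_grpc_repo_location_loads(grpc_repo_location):
    assert grpc_repo_location.get_repository("the_repo")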
def __init__(self, instance, workspace, version=None):
    self._instance = check.inst_param(instance, "instance", DagsterInstance)
    self._workspace = workspace
    self._repository_locations = {}

    for handle in self._workspace.repository_location_handles:
        check.invariant(
            self._repository_locations.get(handle.location_name) is None,
            'Cannot have multiple locations with the same name, got multiple "{name}"'.format(
                name=handle.location_name,
            ),
        )
        self._repository_locations[handle.location_name] = RepositoryLocation.from_handle(handle)

    self.version = version
def log_workspace_stats(instance, workspace):
    from dagster.cli.workspace import Workspace
    from dagster.core.host_representation import RepositoryLocation

    check.inst_param(instance, "instance", DagsterInstance)
    check.inst_param(workspace, "workspace", Workspace)

    for repository_location_handle in workspace.repository_location_handles:
        repo_location = RepositoryLocation.from_handle(repository_location_handle)
        for external_repo in repo_location.get_repositories().values():
            log_external_repo_stats(instance, source="dagit", external_repo=external_repo)
def test_external_diamond_toposort():
    with RepositoryLocationHandle.create_from_repository_location_origin(
        location_origin_from_python_file(
            python_file=__file__,
            attribute="create_diamond_pipeline",
            working_directory=None,
        )
    ) as handle:
        repo_location = RepositoryLocation.from_handle(handle)
        external_repo = next(iter(repo_location.get_repositories().values()))
        external_pipeline = next(iter(external_repo.get_all_external_pipelines()))
        assert external_pipeline.solid_names_in_topological_order == [
            "A_source",
            "A",
            "B",
            "C",
            "D",
        ]
def test_origin_id(user_process_api):
    with RepositoryLocationHandle.create_python_env_location(
        loadable_target_origin=LoadableTargetOrigin(
            executable_path=sys.executable, python_file=__file__, attribute="the_repo"
        ),
        location_name="the_location",
        user_process_api=user_process_api,
    ) as handle:
        host_location = RepositoryLocation.from_handle(handle)
        external_pipeline = host_location.get_repository("the_repo").get_full_external_pipeline(
            "the_pipe"
        )
        recon_pipeline = recon_pipeline_from_origin(external_pipeline.get_origin())
        assert external_pipeline.get_origin_id() == recon_pipeline.get_origin_id()
def launch_scheduled_runs(
    instance,
    logger,
    end_datetime_utc,
    max_catchup_runs=_DEFAULT_MAX_CATCHUP_RUNS,
    debug_crash_flags=None,
):
    schedules = [
        s
        for s in instance.all_stored_job_state(job_type=JobType.SCHEDULE)
        if s.status == JobStatus.RUNNING
    ]

    if not schedules:
        logger.info("Not checking for any runs since no schedules have been started.")
        return

    logger.info(
        "Checking for new runs for the following schedules: {schedule_names}".format(
            schedule_names=", ".join([schedule.job_name for schedule in schedules]),
        )
    )

    for schedule_state in schedules:
        try:
            with RepositoryLocationHandle.create_from_repository_location_origin(
                schedule_state.origin.external_repository_origin.repository_location_origin
            ) as repo_location_handle:
                repo_location = RepositoryLocation.from_handle(repo_location_handle)
                launch_scheduled_runs_for_schedule(
                    instance,
                    logger,
                    schedule_state,
                    repo_location,
                    end_datetime_utc,
                    max_catchup_runs,
                    (debug_crash_flags.get(schedule_state.job_name) if debug_crash_flags else None),
                )
        except Exception:  # pylint: disable=broad-except
            logger.error(
                "Scheduler failed for {schedule_name} : {error_info}".format(
                    schedule_name=schedule_state.job_name,
                    error_info=serializable_error_info_from_exc_info(sys.exc_info()).to_string(),
                )
            )
def test_external_diamond_toposort():
    repo_location = RepositoryLocation.from_handle(
        location_handle_from_python_file(
            python_file=__file__,
            attribute='create_diamond_pipeline',
            working_directory=None,
            user_process_api=UserProcessApi.CLI,
        )
    )
    external_repo = next(iter(repo_location.get_repositories().values()))
    external_pipeline = next(iter(external_repo.get_all_external_pipelines()))
    assert external_pipeline.solid_names_in_topological_order == [
        'A_source',
        'A',
        'B',
        'C',
        'D',
    ]
def test_multi_file_override_workspace():
    with load_workspace_from_yaml_paths(
        [
            file_relative_path(__file__, "multi_location.yaml"),
            file_relative_path(__file__, "override_location.yaml"),
        ],
    ) as workspace:
        assert isinstance(workspace, Workspace)
        assert len(workspace.repository_location_handles) == 3
        assert workspace.has_repository_location_handle("loaded_from_file")
        assert workspace.has_repository_location_handle("loaded_from_module")
        assert workspace.has_repository_location_handle("loaded_from_package")

        loaded_from_file = RepositoryLocation.from_handle(
            workspace.get_repository_location_handle("loaded_from_file")
        )

        # Ensure location `loaded_from_file` has been overridden
        external_repositories = loaded_from_file.get_repositories()
        assert len(external_repositories) == 1
        assert "extra_repository" in external_repositories
def create_app_from_workspace(workspace, instance, path_prefix=''):
    check.inst_param(workspace, 'workspace', Workspace)
    check.inst_param(instance, 'instance', DagsterInstance)
    check.str_param(path_prefix, 'path_prefix')

    if path_prefix:
        if not path_prefix.startswith('/'):
            raise Exception('The path prefix should begin with a leading "/".')
        if path_prefix.endswith('/'):
            raise Exception('The path prefix should not include a trailing "/".')

    warn_if_compute_logs_disabled()

    print('Loading repository...')  # pylint: disable=print-call

    locations = []
    for repository_location_handle in workspace.repository_location_handles:
        locations.append(RepositoryLocation.from_handle(repository_location_handle))

    context = DagsterGraphQLContext(instance=instance, locations=locations, version=__version__)

    return instantiate_app_with_views(context, path_prefix)
def test_origin_id():
    loadable_target_origin = LoadableTargetOrigin(
        executable_path=sys.executable, python_file=__file__, attribute="the_repo"
    )
    location_name = "the_location"

    origin = ManagedGrpcPythonEnvRepositoryLocationOrigin(loadable_target_origin, location_name)

    with RepositoryLocationHandle.create_from_repository_location_origin(origin) as handle:
        host_location = RepositoryLocation.from_handle(handle)
        external_pipeline = host_location.get_repository("the_repo").get_full_external_pipeline(
            "the_pipe"
        )
        recon_pipeline = recon_pipeline_from_origin(external_pipeline.get_origin())
        assert external_pipeline.get_origin_id() == recon_pipeline.get_origin_id()
def launch_scheduled_runs_for_schedule(
    instance, logger, schedule_state, end_datetime_utc, max_catchup_runs, debug_crash_flags=None
):
    check.inst_param(instance, "instance", DagsterInstance)
    check.inst_param(schedule_state, "schedule_state", ScheduleState)
    check.inst_param(end_datetime_utc, "end_datetime_utc", datetime.datetime)

    latest_tick = instance.get_latest_tick(schedule_state.schedule_origin_id)

    if not latest_tick:
        start_timestamp_utc = schedule_state.start_timestamp
    elif latest_tick.status == ScheduleTickStatus.STARTED:
        # Scheduler was interrupted while performing this tick, re-do it
        start_timestamp_utc = latest_tick.timestamp
    else:
        start_timestamp_utc = latest_tick.timestamp + 1

    start_datetime_utc = datetime.datetime.fromtimestamp(start_timestamp_utc, tz=get_utc_timezone())

    tick_times = list(
        croniter_range(start_datetime_utc, end_datetime_utc, schedule_state.cron_schedule)
    )

    if not tick_times:
        logger.info("No new runs for {schedule_name}".format(schedule_name=schedule_state.name))
        return

    if len(tick_times) > max_catchup_runs:
        logger.warn(
            "{schedule_name} has fallen behind, only launching {max_catchup_runs} runs".format(
                schedule_name=schedule_state.name, max_catchup_runs=max_catchup_runs
            )
        )
        tick_times = tick_times[-max_catchup_runs:]

    if len(tick_times) == 1:
        logger.info(
            "Launching run for {schedule_name} at {time}".format(
                schedule_name=schedule_state.name,
                time=tick_times[0].strftime(_SCHEDULER_DATETIME_FORMAT),
            )
        )
    else:
        logger.info(
            "Launching {num_runs} runs for {schedule_name} at the following times: {times}".format(
                num_runs=len(tick_times),
                schedule_name=schedule_state.name,
                times=", ".join([time.strftime(_SCHEDULER_DATETIME_FORMAT) for time in tick_times]),
            )
        )

    for schedule_time_utc in tick_times:
        schedule_timestamp = get_timestamp_from_utc_datetime(schedule_time_utc)
        if latest_tick and latest_tick.timestamp == schedule_timestamp:
            tick = latest_tick
            logger.info("Resuming previously interrupted schedule execution")
        else:
            tick = instance.create_schedule_tick(
                ScheduleTickData(
                    schedule_origin_id=schedule_state.schedule_origin_id,
                    schedule_name=schedule_state.name,
                    timestamp=schedule_timestamp,
                    cron_schedule=schedule_state.cron_schedule,
                    status=ScheduleTickStatus.STARTED,
                )
            )

        _check_for_debug_crash(debug_crash_flags, "TICK_CREATED")

        with ScheduleTickHolder(tick, instance, logger) as tick_holder:
            _check_for_debug_crash(debug_crash_flags, "TICK_HELD")
            with RepositoryLocationHandle.create_from_repository_origin(
                schedule_state.origin.repository_origin, instance
            ) as repo_location_handle:
                repo_location = RepositoryLocation.from_handle(repo_location_handle)
                _schedule_run_at_time(
                    instance,
                    logger,
                    repo_location,
                    schedule_state,
                    schedule_time_utc,
                    tick_holder,
                    debug_crash_flags,
                )
def get_repository_location_from_kwargs(kwargs):
    origin = get_repository_location_origin_from_kwargs(kwargs)
    with RepositoryLocationHandle.create_from_repository_location_origin(origin) as handle:
        yield RepositoryLocation.from_handle(handle)
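# Consumption sketch -- an assumption, not from the source: like the other
# yield-based helpers in this set, this variant would presumably be wrapped
# with @contextlib.contextmanager so a CLI command holds the location (and
# tears down its handle) for exactly the duration of the command body.
def _example_print_location_name(cli_kwargs):
    # `cli_kwargs` is a stand-in for the parsed click options.
    with get_repository_location_from_kwargs(cli_kwargs) as location:
        print(location.name)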