def step_context_to_step_run_ref(
    step_context: SystemStepExecutionContext,
    prior_attempts_count: int,
    package_dir: Optional[str] = None,
) -> StepRunRef:
    """
    Args:
        step_context (SystemStepExecutionContext): The step context.
        prior_attempts_count (int): The number of times this step has been tried before in the
            same pipeline run.
        package_dir (Optional[str]): If set, the reconstruction file code pointer will be
            converted to a module pointer relative to the package root. This enables executing
            steps in remote setups where the package containing the pipeline resides at a
            different location on the filesystem in the remote environment than in the
            environment executing the plan process.

    Returns (StepRunRef):
        A reference to the step.
    """
    check.inst_param(step_context, "step_context", SystemStepExecutionContext)
    check.int_param(prior_attempts_count, "prior_attempts_count")

    retries = step_context.retries

    recon_pipeline = step_context.pipeline
    if package_dir:
        if isinstance(recon_pipeline, ReconstructablePipeline) and isinstance(
            recon_pipeline.repository.pointer, FileCodePointer
        ):
            recon_pipeline = ReconstructablePipeline(
                repository=ReconstructableRepository(
                    pointer=ModuleCodePointer(
                        _module_in_package_dir(
                            recon_pipeline.repository.pointer.python_file, package_dir
                        ),
                        recon_pipeline.repository.pointer.fn_name,
                    ),
                ),
                pipeline_name=recon_pipeline.pipeline_name,
                solids_to_execute=recon_pipeline.solids_to_execute,
            )

    return StepRunRef(
        run_config=step_context.run_config,
        pipeline_run=step_context.pipeline_run,
        run_id=step_context.pipeline_run.run_id,
        step_key=step_context.step.key,
        retries=retries,
        recon_pipeline=recon_pipeline,
        prior_attempts_count=prior_attempts_count,
    )
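A minimal usage sketch (an assumption, not from the source): building a StepRunRef for remote execution. The `step_context` is assumed to come from a running step, and "/app/my_package" is a hypothetical package root that triggers the file-to-module pointer remapping described in the docstring above.

    # Hypothetical illustration; "/app/my_package" is an assumed package root.
    step_run_ref = step_context_to_step_run_ref(
        step_context,
        prior_attempts_count=0,
        package_dir="/app/my_package",  # remaps FileCodePointer -> ModuleCodePointer
    )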
def test_notebook_view():
    notebook_path = file_relative_path(__file__, 'render_uuid_notebook.ipynb')

    with create_app_with_reconstructable_repo(
        ReconstructableRepository.from_yaml(file_relative_path(__file__, './repository.yaml')),
        DagsterInstance.ephemeral(),
    ).test_client() as client:
        res = client.get('/dagit/notebook?path={}'.format(notebook_path))

    assert res.status_code == 200
    # This magic guid is hardcoded in the notebook
    assert b'6cac0c38-2c97-49ca-887c-4ac43f141213' in res.data
def _recon_repository_from_origin(self, external_repository_origin):
    check.inst_param(
        external_repository_origin,
        "external_repository_origin",
        ExternalRepositoryOrigin,
    )
    return ReconstructableRepository(
        self._repository_symbols_and_code_pointers.code_pointers_by_repo_name[
            external_repository_origin.repository_name
        ],
        self._get_current_image(),
    )
def test_gcs_storage(
    dagster_airflow_python_operator_pipeline,
):  # pylint: disable=redefined-outer-name
    pipeline_name = "demo_pipeline_gcs"
    environments_path = get_test_project_environments_path()
    results = dagster_airflow_python_operator_pipeline(
        pipeline_name=pipeline_name,
        recon_repo=ReconstructableRepository.for_module(
            "dagster_test.test_project.test_pipelines.repo", pipeline_name
        ),
        environment_yaml=[
            os.path.join(environments_path, "env.yaml"),
            os.path.join(environments_path, "env_gcs.yaml"),
        ],
    )
    validate_pipeline_execution(results)
def test_s3_storage(
    dagster_airflow_python_operator_pipeline,
):  # pylint: disable=redefined-outer-name
    pipeline_name = 'demo_pipeline'
    environments_path = test_project_environments_path()
    results = dagster_airflow_python_operator_pipeline(
        pipeline_name=pipeline_name,
        handle=ReconstructableRepository.for_module('test_pipelines.repo', pipeline_name),
        environment_yaml=[
            os.path.join(environments_path, 'env.yaml'),
            os.path.join(environments_path, 'env_s3.yaml'),
        ],
    )
    validate_pipeline_execution(results)
def test_origin_ids_stable(monkeypatch):
    # This test asserts fixed schedule origin IDs to prevent any changes from
    # accidentally shifting these ids that are persisted to ScheduleStorage

    # stable exe path for test
    monkeypatch.setattr(sys, "executable", "/fake/python")

    file_repo = ReconstructableRepository.for_file(
        "/path/to/file", "the_repo", "/path/to/working_dir"
    )

    # ensure monkeypatch worked
    assert file_repo.get_origin().executable_path == "/fake/python"

    assert file_repo.get_origin_id() == "3766b1c554fd961b88b9301756250febff3d0ffa"
    schedule = file_repo.get_reconstructable_schedule("simple_schedule")
    assert schedule.get_origin_id() == "7c60d01588673ffcaea16b6fd59d998dc63ed3c3"

    module_repo = ReconstructableRepository.for_module("dummy_module", "the_repo")
    assert module_repo.get_origin_id() == "86503fc349d4ecf44bd22ca1de64c10f8ffcebbd"
    module_schedule = module_repo.get_reconstructable_schedule("simple_schedule")
    assert module_schedule.get_origin_id() == "e4c7131b74ad600969876d8fa461f215ced9631a"
def get_test_project_external_repo(container_image=None):
    return (
        InProcessRepositoryLocationOrigin(
            ReconstructableRepository.for_file(
                file_relative_path(__file__, "test_pipelines/repo.py"),
                "define_demo_execution_repo",
                container_image=container_image,
            )
        )
        .create_handle()
        .create_location()
        .get_repository("demo_execution_repo")
    )
def test_skip_operator(
    dagster_airflow_python_operator_pipeline,
):  # pylint: disable=redefined-outer-name
    pipeline_name = "optional_outputs"
    environments_path = test_project_environments_path()
    results = dagster_airflow_python_operator_pipeline(
        pipeline_name=pipeline_name,
        recon_repo=ReconstructableRepository.for_module(
            "dagster_test.test_project.test_pipelines.repo", pipeline_name
        ),
        environment_yaml=[os.path.join(environments_path, "env_filesystem.yaml")],
    )
    validate_skip_pipeline_execution(results)
class TestAirflowizedEventPipeline(object):
    config_yaml = [
        script_relative_path('../../dagster_examples/airline_demo/environments/default.yaml')
    ]
    pipeline_name = 'event_ingest_pipeline'
    recon_repo = ReconstructableRepository.for_module(
        'dagster_examples.event_pipeline_demo', pipeline_name
    )

    # pylint: disable=redefined-outer-name
    def test_airflowized_event_pipeline(self, dagster_airflow_python_operator_pipeline):
        pass
def test_smoke_app():
    flask_app = app.create_app_with_reconstructable_repo(
        ReconstructableRepository.for_module(
            module='dagster_examples.intro_tutorial.repos', fn_name='define_repo'
        ),
        DagsterInstance.ephemeral(),
    )
    client = flask_app.test_client()

    result = client.post(
        '/graphql',
        data={
            'query': 'query { pipelinesOrError { ... on PipelineConnection { nodes { name } } } }'
        },
    )
    data = json.loads(result.data.decode('utf-8'))
    assert len(data['data']['pipelinesOrError']['nodes']) == 2
    assert {
        node_data['name'] for node_data in data['data']['pipelinesOrError']['nodes']
    } == set(['hello_cereal_pipeline', 'complex_pipeline'])

    result = client.get('/graphql')
    assert result.status_code == 400
    data = json.loads(result.data.decode('utf-8'))
    assert len(data['errors']) == 1
    assert data['errors'][0]['message'] == 'Must provide query string.'

    result = client.get('/dagit/notebook?path=foo.bar')
    assert result.status_code == 400
    assert result.data.decode('utf-8') == 'Invalid Path'

    result = client.post('/graphql', data={'query': 'query { version { slkjd } }'})
    data = json.loads(result.data.decode('utf-8'))
    assert 'errors' in data
    assert len(data['errors']) == 1
    assert 'must not have a sub selection' in data['errors'][0]['message']

    # Missing routes return the index.html file of the Dagit react app, so the user
    # gets our UI when they navigate to "synthetic" react router URLs.
    result = client.get('static/foo/bar')
    assert result.status_code == 200
    assert "You need to enable JavaScript to run this app." in result.data.decode('utf-8')

    result = client.get('pipelines/foo')
    assert result.status_code == 200
    assert "You need to enable JavaScript to run this app." in result.data.decode('utf-8')
def recon_repo_for_cli_args(kwargs):
    '''Builds a ReconstructableRepository for CLI arguments, which can be any of the combinations
    for repo loading above.
    '''
    check.dict_param(kwargs, 'kwargs')
    _cli_load_invariant(kwargs.get('pipeline_name') is None)

    if kwargs.get('workspace'):
        check.not_implemented('Workspace not supported yet in this cli command')

    if kwargs.get('repository_yaml') or all_none(kwargs):
        _cli_load_invariant(kwargs.get('module_name') is None)
        _cli_load_invariant(kwargs.get('python_file') is None)
        _cli_load_invariant(kwargs.get('fn_name') is None)
        repo_yaml = (
            os.path.abspath(kwargs.get('repository_yaml'))
            if kwargs.get('repository_yaml')
            else DEFAULT_REPOSITORY_YAML_FILENAME
        )
        _cli_load_invariant(
            os.path.exists(repo_yaml),
            'Expected to use file "{}" to load repository but it does not exist. '
            'Verify your current working directory or CLI arguments.'.format(repo_yaml),
        )
        return ReconstructableRepository.from_legacy_repository_yaml(repo_yaml)
    elif kwargs.get('module_name') and kwargs.get('fn_name'):
        _cli_load_invariant(kwargs.get('repository_yaml') is None)
        _cli_load_invariant(kwargs.get('python_file') is None)
        return ReconstructableRepository.for_module(kwargs['module_name'], kwargs['fn_name'])
    elif kwargs.get('python_file') and kwargs.get('fn_name'):
        _cli_load_invariant(kwargs.get('repository_yaml') is None)
        _cli_load_invariant(kwargs.get('module_name') is None)
        return ReconstructableRepository.for_file(
            os.path.abspath(kwargs['python_file']), kwargs['fn_name']
        )
    else:
        _cli_load_invariant(False)
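A hedged sketch of the three argument combinations the invariants above accept; the file, module, and function names are hypothetical, not from the source.

    # 1. Load from a repository yaml (also the default when all kwargs are None).
    recon_repo_for_cli_args({'repository_yaml': 'repository.yaml'})

    # 2. Load from a module plus a repository-defining function.
    recon_repo_for_cli_args({'module_name': 'my_module.repos', 'fn_name': 'define_repo'})

    # 3. Load from a python file plus a repository-defining function.
    recon_repo_for_cli_args({'python_file': 'repo.py', 'fn_name': 'define_repo'})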
def get_external_pipeline_from_grpc_server_repository(pipeline_name):
    repo_yaml = file_relative_path(__file__, 'repo.yaml')
    recon_repo = ReconstructableRepository.from_legacy_repository_yaml(repo_yaml)
    loadable_target_origin = LoadableTargetOrigin.from_python_origin(recon_repo.get_origin())

    with GrpcServerProcess(
        loadable_target_origin=loadable_target_origin
    ).create_ephemeral_client() as server:
        repository_location = GrpcServerRepositoryLocation(
            RepositoryLocationHandle.create_grpc_server_location(
                location_name='test',
                port=server.port,
                socket=server.socket,
                host='localhost',
            )
        )

        yield repository_location.get_repository('nope').get_full_external_pipeline(pipeline_name)
def test_execute_hammer_through_dagit():
    recon_repo = ReconstructableRepository.for_file(
        file_relative_path(__file__, '../../../../examples/dagster_examples/toys/hammer.py'),
        'hammer_pipeline',
    )
    instance = DagsterInstance.local_temp()

    context = DagsterGraphQLContext(
        locations=[InProcessRepositoryLocation(recon_repo)],
        instance=instance,
    )

    selector = get_legacy_pipeline_selector(context, 'hammer_pipeline')

    executor = SyncExecutor()

    variables = {
        'executionParams': {
            'runConfigData': {
                'storage': {'filesystem': {}},
                'execution': {'dask': {'config': {'cluster': {'local': {}}}}},
            },
            'selector': selector,
            'mode': 'default',
        }
    }

    start_pipeline_result = graphql(
        request_string=START_PIPELINE_EXECUTION_MUTATION,
        schema=create_schema(),
        context=context,
        variables=variables,
        executor=executor,
    )

    if start_pipeline_result.errors:
        raise Exception('{}'.format(start_pipeline_result.errors))

    run_id = start_pipeline_result.data['startPipelineExecution']['run']['runId']

    context.drain_outstanding_executions()

    subscription = execute_dagster_graphql(context, SUBSCRIPTION_QUERY, variables={'runId': run_id})

    subscribe_results = []
    subscription.subscribe(subscribe_results.append)

    messages = [x['__typename'] for x in subscribe_results[0].data['pipelineRunLogs']['messages']]

    assert 'PipelineStartEvent' in messages
    assert 'PipelineSuccessEvent' in messages
def test_skip():
    with seven.TemporaryDirectory() as temp_dir:
        with environ({'DAGSTER_HOME': temp_dir}):
            instance = DagsterInstance.get()

            recon_repo = ReconstructableRepository.for_file(__file__, 'the_repo')
            skip = recon_repo.get_reconstructable_schedule('skip_schedule')
            result = sync_launch_scheduled_execution(skip.get_origin())
            assert isinstance(result, ScheduledExecutionSkipped)

            ticks = instance.get_schedule_ticks(skip.get_origin_id())
            assert ticks[0].status == ScheduleTickStatus.SKIPPED
def execute_plan(
    self, instance, external_pipeline, environment_dict, pipeline_run, step_keys_to_execute
):
    if (
        is_repository_location_in_same_python_env(self.location_handle)
        and len(self.location_handle.repository_code_pointer_dict) == 1
    ):
        check.inst_param(instance, 'instance', DagsterInstance)
        check.inst_param(external_pipeline, 'external_pipeline', ExternalPipeline)
        check.dict_param(environment_dict, 'environment_dict')
        check.inst_param(pipeline_run, 'pipeline_run', PipelineRun)
        check.opt_list_param(step_keys_to_execute, 'step_keys_to_execute', of_type=str)

        pointer = next(iter(self.location_handle.repository_code_pointer_dict.values()))
        recon_repo = ReconstructableRepository(pointer)

        execution_plan = create_execution_plan(
            pipeline=recon_repo.get_reconstructable_pipeline(external_pipeline.name),
            environment_dict=environment_dict,
            mode=pipeline_run.mode,
            step_keys_to_execute=step_keys_to_execute,
        )

        execute_plan(
            execution_plan=execution_plan,
            instance=instance,
            pipeline_run=pipeline_run,
            environment_dict=environment_dict,
        )
    else:
        raise NotImplementedError(
            'execute_plan is currently only supported when the location is a python '
            'environment with the exact same executable and when there is only a single '
            'repository.'
        )
def __init__(self, loadable_target_origin, entry_point):
    self._loadable_target_origin = loadable_target_origin

    self._code_pointers_by_repo_name = {}
    self._recon_repos_by_name = {}
    self._loadable_repository_symbols = []

    if not loadable_target_origin:
        return

    loadable_targets = get_loadable_targets(
        loadable_target_origin.python_file,
        loadable_target_origin.module_name,
        loadable_target_origin.package_name,
        loadable_target_origin.working_directory,
        loadable_target_origin.attribute,
    )
    for loadable_target in loadable_targets:
        pointer = _get_code_pointer(loadable_target_origin, loadable_target)
        recon_repo = ReconstructableRepository(
            pointer,
            _get_current_image(),
            sys.executable,
            entry_point=entry_point,
        )
        repo_def = recon_repo.get_definition()
        # force load of all lazy constructed jobs/pipelines
        repo_def.get_all_pipelines()
        self._code_pointers_by_repo_name[repo_def.name] = pointer
        self._recon_repos_by_name[repo_def.name] = recon_repo
        self._loadable_repository_symbols.append(
            LoadableRepositorySymbol(
                attribute=loadable_target.attribute,
                repository_name=repo_def.name,
            )
        )
def test_skip_operator(
    dagster_airflow_docker_operator_pipeline, dagster_docker_image
):  # pylint: disable=redefined-outer-name
    pipeline_name = "optional_outputs"
    environments_path = test_project_environments_path()
    results = dagster_airflow_docker_operator_pipeline(
        pipeline_name=pipeline_name,
        recon_repo=ReconstructableRepository.for_module(
            "dagster_test.test_project.test_pipelines.repo",
            "define_demo_execution_repo",
        ),
        environment_yaml=[os.path.join(environments_path, "env_filesystem.yaml")],
        op_kwargs={"host_tmp_dir": "/tmp"},
        image=dagster_docker_image,
    )
    validate_skip_pipeline_execution(results)
def test_fs_storage_no_explicit_base_dir(
    dagster_airflow_python_operator_pipeline,
):  # pylint: disable=redefined-outer-name
    pipeline_name = 'demo_pipeline'
    environments_path = test_project_environments_path()
    results = dagster_airflow_python_operator_pipeline(
        pipeline_name=pipeline_name,
        recon_repo=ReconstructableRepository.for_module(
            'dagster_test.test_project.test_pipelines.repo', pipeline_name
        ),
        environment_yaml=[
            os.path.join(environments_path, 'env.yaml'),
            os.path.join(environments_path, 'env_filesystem_no_explicit_base_dir.yaml'),
        ],
    )
    validate_pipeline_execution(results)
def create_invalid_run(instance, **kwargs):
    create_run_for_test(
        instance,
        external_pipeline_origin=ExternalPipelineOrigin(
            ExternalRepositoryOrigin(
                InProcessRepositoryLocationOrigin(
                    ReconstructableRepository(ModuleCodePointer("fake", "fake"))
                ),
                "foo",
            ),
            "wrong-pipeline",
        ),
        pipeline_name="wrong-pipeline",
        **kwargs,
    )
def test_skip_operator(
    dagster_airflow_docker_operator_pipeline, dagster_docker_image
):  # pylint: disable=redefined-outer-name
    pipeline_name = 'optional_outputs'
    environments_path = test_project_environments_path()
    results = dagster_airflow_docker_operator_pipeline(
        pipeline_name=pipeline_name,
        handle=ReconstructableRepository.for_module('test_pipelines.repo', pipeline_name),
        environment_yaml=[os.path.join(environments_path, 'env_filesystem.yaml')],
        op_kwargs={'host_tmp_dir': '/tmp'},
        image=dagster_docker_image,
    )
    validate_skip_pipeline_execution(results)
def test_bad_load():
    # Note: the original also reassigned `instance = DagsterInstance.get()` immediately after
    # entering the context manager, discarding the managed instance; the redundant line is dropped.
    with _default_instance() as instance:
        working_directory = os.path.dirname(__file__)

        recon_repo = ReconstructableRepository.for_file(__file__, "doesnt_exist", working_directory)
        schedule = recon_repo.get_reconstructable_schedule("also_doesnt_exist")

        result = sync_launch_scheduled_execution(schedule.get_origin())
        assert isinstance(result, ScheduledExecutionFailed)
        assert "doesnt_exist not found at module scope in file" in result.errors[0].to_string()

        ticks = instance.get_schedule_ticks(schedule.get_origin_id())
        assert ticks[0].status == ScheduleTickStatus.FAILURE
        assert "doesnt_exist not found at module scope in file" in ticks[0].error.message
def get_test_project_external_pipeline(pipeline_name):
    return (
        RepositoryLocation.from_handle(
            RepositoryLocationHandle.create_from_repository_location_origin(
                InProcessRepositoryLocationOrigin(
                    ReconstructableRepository.for_file(
                        file_relative_path(__file__, "test_pipelines/repo.py"),
                        "define_demo_execution_repo",
                    )
                )
            )
        )
        .get_repository("demo_execution_repo")
        .get_full_external_pipeline(pipeline_name)
    )
def test_bad_load():
    with schedule_instance() as instance:
        working_directory = os.path.dirname(__file__)
        recon_repo = ReconstructableRepository.for_file(__file__, "doesnt_exist", working_directory)
        schedule = recon_repo.get_reconstructable_schedule("also_doesnt_exist")
        fake_origin = schedule.get_origin()

        initial_datetime = datetime(
            year=2019,
            month=2,
            day=27,
            hour=23,
            minute=59,
            second=59,
            tzinfo=get_utc_timezone(),
        )
        with freeze_time(initial_datetime) as frozen_datetime:
            schedule_state = ScheduleState(
                fake_origin,
                ScheduleStatus.RUNNING,
                "0 0 * * *",
                get_timestamp_from_utc_datetime(get_current_datetime_in_utc()),
            )
            instance.add_schedule_state(schedule_state)

            frozen_datetime.tick(delta=timedelta(seconds=1))

            launch_scheduled_runs(instance, get_current_datetime_in_utc())

            assert instance.get_runs_count() == 0

            ticks = instance.get_schedule_ticks(fake_origin.get_id())

            assert len(ticks) == 1
            assert ticks[0].status == ScheduleTickStatus.FAILURE
            assert ticks[0].timestamp == get_timestamp_from_utc_datetime(
                get_current_datetime_in_utc()
            )
            assert "doesnt_exist not found at module scope in file" in ticks[0].error.message

            frozen_datetime.tick(delta=timedelta(days=1))

            launch_scheduled_runs(instance, get_current_datetime_in_utc())
            assert instance.get_runs_count() == 0
            ticks = instance.get_schedule_ticks(fake_origin.get_id())
            assert len(ticks) == 2
            assert ticks[0].status == ScheduleTickStatus.FAILURE
            assert ticks[0].timestamp == get_timestamp_from_utc_datetime(
                get_current_datetime_in_utc()
            )
            assert "doesnt_exist not found at module scope in file" in ticks[0].error.message
def test_error_dag_python():  # pylint: disable=redefined-outer-name
    pipeline_name = 'demo_error_pipeline'
    recon_repo = ReconstructableRepository.for_module('test_pipelines.repo', pipeline_name)
    environments_path = test_project_environments_path()
    environment_yaml = [
        os.path.join(environments_path, 'env_filesystem.yaml'),
    ]
    run_config = load_yaml_from_glob_list(environment_yaml)
    execution_date = timezone.utcnow()

    dag, tasks = make_airflow_dag_for_recon_repo(recon_repo, pipeline_name, run_config)

    with pytest.raises(AirflowException) as exc_info:
        execute_tasks_in_dag(dag, tasks, run_id=make_new_run_id(), execution_date=execution_date)

    assert 'Exception: Unusual error' in str(exc_info.value)
def test_gcs_storage(
    dagster_airflow_docker_operator_pipeline, dagster_docker_image,
):  # pylint: disable=redefined-outer-name
    pipeline_name = 'demo_pipeline_gcs'
    environments_path = test_project_environments_path()
    results = dagster_airflow_docker_operator_pipeline(
        pipeline_name=pipeline_name,
        recon_repo=ReconstructableRepository.for_module(
            'test_pipelines.repo', 'define_demo_execution_repo'
        ),
        environment_yaml=[
            os.path.join(environments_path, 'env.yaml'),
            os.path.join(environments_path, 'env_gcs.yaml'),
        ],
        image=dagster_docker_image,
    )
    validate_pipeline_execution(results)
def test_launch_scheduled_execution():
    with seven.TemporaryDirectory() as temp_dir:
        with environ({'DAGSTER_HOME': temp_dir}):
            instance = DagsterInstance.get()

            recon_repo = ReconstructableRepository.for_file(__file__, 'the_repo')
            simple = recon_repo.get_reconstructable_schedule('simple_schedule')
            result = sync_launch_scheduled_execution(simple.get_origin())
            assert isinstance(result, ScheduledExecutionSuccess)

            run = instance.get_run_by_id(result.run_id)
            assert run.is_success

            ticks = instance.get_schedule_ticks(simple.get_origin_id())
            assert ticks[0].status == ScheduleTickStatus.SUCCESS
def test_bad_load():
    with seven.TemporaryDirectory() as temp_dir:
        with environ({'DAGSTER_HOME': temp_dir}):
            instance = DagsterInstance.get()

            recon_repo = ReconstructableRepository.for_file(__file__, 'doesnt_exist')
            schedule = recon_repo.get_reconstructable_schedule('also_doesnt_exist')

            with pytest.raises(DagsterSubprocessError):
                sync_launch_scheduled_execution(schedule.get_origin())

            ticks = instance.get_schedule_ticks(schedule.get_origin_id())
            assert ticks[0].status == ScheduleTickStatus.FAILURE
            assert 'doesnt_exist not found at module scope in file' in ticks[0].error.message
def test_unknown_error():
    class AnException(Exception):
        pass

    def _raise_custom_error():
        raise AnException('foobar')

    with mock.patch(
        'gevent.pywsgi.WSGIServer', new=_define_mock_server(_raise_custom_error)
    ), seven.TemporaryDirectory() as temp_dir:
        recon_repo = ReconstructableRepository.from_yaml(
            file_relative_path(__file__, './repository.yaml')
        )
        with pytest.raises(AnException):
            host_dagit_ui_with_reconstructable_repo(
                storage_fallback=temp_dir, recon_repo=recon_repo, host=None, port=2343
            )
def test_s3_storage(
    dagster_airflow_docker_operator_pipeline, dagster_docker_image
):  # pylint: disable=redefined-outer-name
    pipeline_name = "demo_pipeline"
    environments_path = test_project_environments_path()
    results = dagster_airflow_docker_operator_pipeline(
        pipeline_name=pipeline_name,
        recon_repo=ReconstructableRepository.for_module(
            "dagster_test.test_project.test_pipelines.repo",
            "define_demo_execution_repo",
        ),
        environment_yaml=[
            os.path.join(environments_path, "env.yaml"),
            os.path.join(environments_path, "env_s3.yaml"),
        ],
        image=dagster_docker_image,
    )
    validate_pipeline_execution(results)
def from_handle(repository_location_handle):
    check.inst_param(
        repository_location_handle, 'repository_location_handle', RepositoryLocationHandle
    )

    if isinstance(repository_location_handle, InProcessRepositoryLocationHandle):
        check.invariant(len(repository_location_handle.repository_code_pointer_dict) == 1)
        pointer = next(iter(repository_location_handle.repository_code_pointer_dict.values()))
        return InProcessRepositoryLocation(ReconstructableRepository(pointer))
    elif isinstance(repository_location_handle, PythonEnvRepositoryLocationHandle):
        return PythonEnvRepositoryLocation(repository_location_handle)
    elif isinstance(
        repository_location_handle,
        (GrpcServerRepositoryLocationHandle, ManagedGrpcPythonEnvRepositoryLocationHandle),
    ):
        return GrpcServerRepositoryLocation(repository_location_handle)
    else:
        check.failed('Unsupported handle: {}'.format(repository_location_handle))