def test_using_s3_for_subplan(s3_bucket):
    pipeline_def = define_inty_pipeline()

    environment_dict = {'storage': {'s3': {'config': {'s3_bucket': s3_bucket}}}}

    run_id = str(uuid.uuid4())

    execution_plan = create_execution_plan(
        pipeline_def, environment_dict=environment_dict, run_config=RunConfig(run_id=run_id)
    )

    assert execution_plan.get_step_by_key('return_one.compute')

    step_keys = ['return_one.compute']
    instance = DagsterInstance.ephemeral()

    return_one_step_events = list(
        execute_plan(
            execution_plan,
            environment_dict=environment_dict,
            run_config=RunConfig(run_id=run_id),
            step_keys_to_execute=step_keys,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, 'return_one.compute')

    with scoped_pipeline_context(
        pipeline_def, environment_dict, RunConfig(run_id=run_id), instance
    ) as context:
        store = S3IntermediateStore(
            s3_bucket, run_id, s3_session=context.scoped_resources_builder.build().s3.session
        )
        assert store.has_intermediate(context, 'return_one.compute')
        assert store.get_intermediate(context, 'return_one.compute', Int).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan,
            environment_dict=environment_dict,
            run_config=RunConfig(run_id=run_id),
            step_keys_to_execute=['add_one.compute'],
            instance=instance,
        )
    )

    assert get_step_output(add_one_step_events, 'add_one.compute')

    with scoped_pipeline_context(
        pipeline_def, environment_dict, RunConfig(run_id=run_id), instance
    ) as context:
        assert store.has_intermediate(context, 'add_one.compute')
        assert store.get_intermediate(context, 'add_one.compute', Int).obj == 2
def test_using_s3_for_subplan(s3_bucket):
    pipeline_def = define_inty_pipeline()

    run_config = {"storage": {"s3": {"config": {"s3_bucket": s3_bucket}}}}

    run_id = make_new_run_id()

    execution_plan = create_execution_plan(pipeline_def, run_config=run_config)

    assert execution_plan.get_step_by_key("return_one.compute")

    step_keys = ["return_one.compute"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name, run_id=run_id, run_config=run_config
    )

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, "return_one.compute")

    with scoped_pipeline_context(
        execution_plan.build_subset_plan(["return_one.compute"]),
        run_config,
        pipeline_run,
        instance,
    ) as context:
        intermediates_manager = S3IntermediateStorage(
            s3_bucket,
            run_id,
            s3_session=context.scoped_resources_builder.build(required_resource_keys={"s3"}).s3,
        )

        step_output_handle = StepOutputHandle("return_one.compute")
        assert intermediates_manager.has_intermediate(context, step_output_handle)
        assert intermediates_manager.get_intermediate(context, Int, step_output_handle).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one.compute"]),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(add_one_step_events, "add_one.compute")

    with scoped_pipeline_context(
        execution_plan.build_subset_plan(["add_one.compute"]),
        run_config,
        pipeline_run,
        instance,
    ) as context:
        step_output_handle = StepOutputHandle("add_one.compute")
        assert intermediates_manager.has_intermediate(context, step_output_handle)
        assert intermediates_manager.get_intermediate(context, Int, step_output_handle).obj == 2
def test_using_gcs_for_subplan(gcs_bucket):
    pipeline_def = define_inty_pipeline()

    environment_dict = {'storage': {'gcs': {'config': {'gcs_bucket': gcs_bucket}}}}

    run_id = str(uuid.uuid4())

    execution_plan = create_execution_plan(
        pipeline_def, environment_dict=environment_dict, run_config=RunConfig(run_id=run_id)
    )

    assert execution_plan.get_step_by_key('return_one.compute')

    step_keys = ['return_one.compute']
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun.create_empty_run(
        pipeline_def.name, run_id=run_id, environment_dict=environment_dict
    )

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            environment_dict=environment_dict,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, 'return_one.compute')

    with scoped_pipeline_context(
        pipeline_def, environment_dict, pipeline_run, instance
    ) as context:
        store = GCSIntermediateStore(
            gcs_bucket, run_id, client=context.scoped_resources_builder.build().gcs.client
        )
        assert store.has_intermediate(context, 'return_one.compute')
        assert store.get_intermediate(context, 'return_one.compute', Int).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['add_one.compute']),
            environment_dict=environment_dict,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(add_one_step_events, 'add_one.compute')

    with scoped_pipeline_context(
        pipeline_def, environment_dict, pipeline_run, instance
    ) as context:
        assert store.has_intermediate(context, 'add_one.compute')
        assert store.get_intermediate(context, 'add_one.compute', Int).obj == 2
def test_using_s3_for_subplan(s3_bucket):
    pipeline = define_inty_pipeline()

    environment_dict = {'storage': {'s3': {'s3_bucket': s3_bucket}}}

    execution_plan = create_execution_plan(pipeline, environment_dict=environment_dict)

    assert execution_plan.get_step_by_key('return_one.compute')

    step_keys = ['return_one.compute']

    run_id = str(uuid.uuid4())

    store = S3IntermediateStore(s3_bucket, run_id)

    try:
        return_one_step_events = list(
            execute_plan(
                execution_plan,
                environment_dict=environment_dict,
                run_config=RunConfig(run_id=run_id),
                step_keys_to_execute=step_keys,
            )
        )

        assert get_step_output(return_one_step_events, 'return_one.compute')
        with scoped_pipeline_context(
            pipeline, environment_dict, RunConfig(run_id=run_id)
        ) as context:
            assert store.has_intermediate(context, 'return_one.compute')
            assert store.get_intermediate(context, 'return_one.compute', Int) == 1

        add_one_step_events = list(
            execute_plan(
                execution_plan,
                environment_dict=environment_dict,
                run_config=RunConfig(run_id=run_id),
                step_keys_to_execute=['add_one.compute'],
            )
        )

        assert get_step_output(add_one_step_events, 'add_one.compute')
        with scoped_pipeline_context(
            pipeline, environment_dict, RunConfig(run_id=run_id)
        ) as context:
            assert store.has_intermediate(context, 'add_one.compute')
            assert store.get_intermediate(context, 'add_one.compute', Int) == 2
    finally:
        with scoped_pipeline_context(
            pipeline, environment_dict, RunConfig(run_id=run_id)
        ) as context:
            store.rm_intermediate(context, 'return_one.compute')
            store.rm_intermediate(context, 'add_one.compute')
def define_out_of_pipeline_context(self, config=None):
    '''Defines a context to be used in a notebook (i.e., not in pipeline execution).'''
    config = check.opt_dict_param(config, 'config')

    pipeline_def = PipelineDefinition([], name='Ephemeral Notebook Pipeline')

    if config.keys():
        warnings.warn(
            'Config keys will not be respected for in-notebook '
            'execution: [{keys}]'.format(
                keys=', '.join(['\'{key}\''.format(key=key) for key in config.keys()])
            )
        )
        config = {}

    run_config = RunConfig()

    with scoped_pipeline_context(
        pipeline_def, config, run_config, scoped_resources_builder_cm=self.setup_resources
    ) as pipeline_context:
        self.context = DagstermillInNotebookExecutionContext(
            pipeline_context, out_of_pipeline=True
        )

    if self.context.resources:  # pylint: disable=protected-access
        warnings.warn(
            'Call dagstermill.teardown() to finalize resources attached to the context.'
        )

    return self.context
def define_out_of_pipeline_context(self, config=None):
    '''Defines a context to be used in a notebook (i.e., not in pipeline execution).'''
    config = check.opt_dict_param(config, 'config')

    pipeline_def = PipelineDefinition([], name='Ephemeral Notebook Pipeline')

    # BUG: If the context cleans up after itself (e.g. closes a db connection or similar)
    # this will instigate that process *before* return. We are going to have to
    # manage this manually (without an if block) in order to make this work.
    # See https://github.com/dagster-io/dagster/issues/796
    if config.keys():
        warnings.warn(
            'Config keys will not be respected for in-notebook '
            'execution: [{keys}]'.format(
                keys=', '.join(['\'{key}\''.format(key=key) for key in config.keys()])
            )
        )
        config = {}

    with scoped_pipeline_context(pipeline_def, config, RunConfig(run_id='')) as pipeline_context:
        self.context = DagstermillInNotebookExecutionContext(
            pipeline_context, out_of_pipeline=True
        )

    return self.context
def test_gcs_pipeline_with_custom_prefix(gcs_bucket):
    run_id = str(uuid.uuid4())
    gcs_prefix = 'custom_prefix'

    pipe = define_inty_pipeline(should_throw=False)
    environment_dict = {
        'storage': {'gcs': {'config': {'gcs_bucket': gcs_bucket, 'gcs_prefix': gcs_prefix}}}
    }

    pipeline_run = PipelineRun.create_empty_run(
        pipe.name, run_id=run_id, environment_dict=environment_dict
    )
    instance = DagsterInstance.ephemeral()

    result = execute_pipeline(
        pipe, environment_dict=environment_dict, run_config=RunConfig(run_id=run_id),
    )
    assert result.success

    execution_plan = create_execution_plan(pipe, environment_dict, run_config=pipeline_run)
    with scoped_pipeline_context(
        pipe, environment_dict, pipeline_run, instance, execution_plan
    ) as context:
        store = GCSIntermediateStore(
            run_id=run_id,
            gcs_bucket=gcs_bucket,
            gcs_prefix=gcs_prefix,
            client=context.scoped_resources_builder.build(
                mapper_fn=SolidInvocation.default_resource_mapper_fn,
                required_resource_keys={'gcs'},
            ).gcs.client,
        )
        assert store.root == '/'.join(['custom_prefix', 'storage', run_id])
        assert store.get_intermediate(context, 'return_one.compute', Int).obj == 1
        assert store.get_intermediate(context, 'add_one.compute', Int).obj == 2
def yield_empty_pipeline_context(run_id=None, instance=None):
    pipeline = InMemoryPipeline(PipelineDefinition([]))
    pipeline_def = pipeline.get_definition()
    instance = check.opt_inst_param(
        instance, "instance", DagsterInstance, default=DagsterInstance.ephemeral()
    )

    execution_plan = create_execution_plan(pipeline)

    pipeline_run = instance.create_run(
        pipeline_name="<empty>",
        run_id=run_id,
        run_config=None,
        mode=None,
        solids_to_execute=None,
        step_keys_to_execute=None,
        status=None,
        tags=None,
        root_run_id=None,
        parent_run_id=None,
        pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),
        execution_plan_snapshot=snapshot_from_execution_plan(
            execution_plan, pipeline_def.get_pipeline_snapshot_id()
        ),
        parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(),
    )
    with scoped_pipeline_context(execution_plan, {}, pipeline_run, instance) as context:
        yield context
def get_context(self, solid_config=None, mode_def=None, environment_dict=None):
    '''Get a dagstermill execution context for interactive exploration and development.

    Args:
        solid_config (Optional[Any]): If specified, this value will be made available on the
            context as its ``solid_config`` property.
        mode_def (Optional[:class:`dagster.ModeDefinition`]): If specified, defines the mode to
            use to construct the context. Specify this if you would like a context constructed
            with specific ``resource_defs`` or ``logger_defs``. By default, an ephemeral mode
            with a console logger will be constructed.
        environment_dict(Optional[dict]): The environment config dict with which to construct
            the context.

    Returns:
        :class:`dagstermill.DagstermillExecutionContext`
    '''
    check.opt_inst_param(mode_def, 'mode_def', ModeDefinition)
    environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)

    solid_def = SolidDefinition(
        name='this_solid',
        input_defs=[],
        compute_fn=lambda *args, **kwargs: None,
        output_defs=[],
        description='Ephemeral solid constructed by dagstermill.get_context()',
    )

    if not mode_def:
        mode_def = ModeDefinition(logger_defs={'dagstermill': colored_console_logger})
        environment_dict['loggers'] = {'dagstermill': {}}

    pipeline_def = PipelineDefinition(
        [solid_def], mode_defs=[mode_def], name='ephemeral_dagstermill_pipeline'
    )

    run_config = RunConfig(mode=mode_def.name)

    self.in_pipeline = False
    self.solid_def = solid_def
    self.pipeline_def = pipeline_def

    with scoped_pipeline_context(
        self.pipeline_def,
        environment_dict,
        run_config,
        instance=DagsterInstance.ephemeral(),
        scoped_resources_builder_cm=self._setup_resources,
    ) as pipeline_context:
        self.context = DagstermillExecutionContext(pipeline_context, solid_config)

    return self.context
def test_s3_pipeline_with_custom_prefix(s3_bucket):
    run_id = make_new_run_id()
    s3_prefix = 'custom_prefix'

    pipe = define_inty_pipeline(should_throw=False)
    environment_dict = {
        'storage': {'s3': {'config': {'s3_bucket': s3_bucket, 's3_prefix': s3_prefix}}}
    }

    pipeline_run = PipelineRun.create_empty_run(
        pipe.name, run_id=run_id, environment_dict=environment_dict
    )
    instance = DagsterInstance.ephemeral()

    result = execute_pipeline(
        pipe, environment_dict=environment_dict, run_config=RunConfig(run_id=run_id),
    )
    assert result.success

    execution_plan = create_execution_plan(pipe, environment_dict, RunConfig(run_id=run_id))
    with scoped_pipeline_context(
        pipe, environment_dict, pipeline_run, instance, execution_plan
    ) as context:
        store = S3IntermediateStore(
            run_id=run_id,
            s3_bucket=s3_bucket,
            s3_prefix=s3_prefix,
            s3_session=context.scoped_resources_builder.build(required_resource_keys={'s3'}).s3,
        )
        assert store.root == '/'.join(['custom_prefix', 'storage', run_id])
        assert store.get_intermediate(context, 'return_one.compute', Int).obj == 1
        assert store.get_intermediate(context, 'add_one.compute', Int).obj == 2
def yield_empty_pipeline_context(run_id=None, instance=None):
    pipeline = InMemoryExecutablePipeline(PipelineDefinition([]))
    instance = check.opt_inst_param(
        instance, 'instance', DagsterInstance, default=DagsterInstance.ephemeral()
    )
    pipeline_run = instance.create_run(
        pipeline_name='<empty>',
        run_id=run_id,
        environment_dict=None,
        mode=None,
        solids_to_execute=None,
        step_keys_to_execute=None,
        status=None,
        tags=None,
        root_run_id=None,
        parent_run_id=None,
        pipeline_snapshot=None,
        execution_plan_snapshot=None,
        parent_pipeline_snapshot=None,
    )
    with scoped_pipeline_context(
        create_execution_plan(pipeline), {}, pipeline_run, instance,
    ) as context:
        yield context
def test_s3_pipeline_with_custom_prefix(s3_bucket): s3_prefix = "custom_prefix" pipe = define_inty_pipeline(should_throw=False) run_config = {"storage": {"s3": {"config": {"s3_bucket": s3_bucket, "s3_prefix": s3_prefix}}}} pipeline_run = PipelineRun(pipeline_name=pipe.name, run_config=run_config) instance = DagsterInstance.ephemeral() result = execute_pipeline(pipe, run_config=run_config,) assert result.success execution_plan = create_execution_plan(pipe, run_config) with scoped_pipeline_context(execution_plan, run_config, pipeline_run, instance,) as context: intermediates_manager = S3IntermediateStorage( run_id=result.run_id, s3_bucket=s3_bucket, s3_prefix=s3_prefix, s3_session=context.scoped_resources_builder.build(required_resource_keys={"s3"}).s3, ) assert intermediates_manager.root == "/".join(["custom_prefix", "storage", result.run_id]) assert ( intermediates_manager.get_intermediate( context, Int, StepOutputHandle("return_one.compute") ).obj == 1 ) assert ( intermediates_manager.get_intermediate( context, Int, StepOutputHandle("add_one.compute") ).obj == 2 )
def yield_empty_pipeline_context(run_id=None, instance=None):
    with scoped_pipeline_context(
        PipelineDefinition([]),
        {},
        RunConfig(run_id=run_id),
        instance or DagsterInstance.ephemeral(),
    ) as context:
        yield context
def yield_empty_pipeline_context(run_id=None, instance=None):
    with scoped_pipeline_context(
        PipelineDefinition([]),
        {},
        PipelineRun.create_empty_run('empty', run_id=run_id),
        instance or DagsterInstance.ephemeral(),
    ) as context:
        yield context
def execute_on_dask(handle, env_config=None, run_config=None, dask_config=None):
    # pylint: disable=too-many-locals
    check.inst_param(handle, 'handle', ExecutionTargetHandle)
    check.opt_dict_param(env_config, 'env_config', key_type=str)
    dask_config = check.opt_inst_param(dask_config, 'dask_config', DaskConfig, DaskConfig())
    run_config = check.opt_inst_param(
        run_config, 'run_config', RunConfig, RunConfig(executor_config=dask_config)
    )
    check.inst(
        run_config.executor_config,
        DaskConfig,
        'run_config.executor_config should be instance of DaskConfig to execute on Dask',
    )

    pipeline_def = handle.build_pipeline_definition()

    execution_plan = create_execution_plan(pipeline_def, env_config, run_config=run_config)

    with scoped_pipeline_context(pipeline_def, env_config, run_config) as pipeline_context:
        events = list(DaskEngine.execute(pipeline_context, execution_plan, None))

        return PipelineExecutionResult(
            pipeline_def,
            run_config.run_id,
            events,
            lambda: scoped_pipeline_context(
                pipeline_def,
                env_config,
                run_config,
                system_storage_data=SystemStorageData(
                    intermediates_manager=pipeline_context.intermediates_manager,
                    run_storage=pipeline_context.run_storage,
                    file_manager=pipeline_context.file_manager,
                ),
            ),
        )
def yield_empty_pipeline_context(run_id=None, instance=None):
    pipeline = PipelineDefinition([])
    with scoped_pipeline_context(
        pipeline,
        {},
        PipelineRun.create_empty_run(
            'empty', run_id=run_id if run_id is not None else 'TESTING',
        ),
        instance or DagsterInstance.ephemeral(),
        create_execution_plan(pipeline),
    ) as context:
        yield context
def test_adls2_pipeline_with_custom_prefix(storage_account, file_system): adls2_prefix = "custom_prefix" pipe = define_inty_pipeline(should_throw=False) run_config = { "resources": { "adls2": { "config": { "storage_account": storage_account, "credential": get_azure_credential() } } }, "intermediate_storage": { "adls2": { "config": { "adls2_file_system": file_system, "adls2_prefix": adls2_prefix } } }, } pipeline_run = PipelineRun(pipeline_name=pipe.name, run_config=run_config) instance = DagsterInstance.ephemeral() result = execute_pipeline( pipe, run_config=run_config, ) assert result.success execution_plan = create_execution_plan(pipe, run_config) with scoped_pipeline_context( execution_plan, InMemoryPipeline(pipe), run_config, pipeline_run, instance, ) as context: resource = context.scoped_resources_builder.build( required_resource_keys={"adls2"}).adls2 intermediate_storage = ADLS2IntermediateStorage( run_id=result.run_id, file_system=file_system, prefix=adls2_prefix, adls2_client=resource.adls2_client, blob_client=resource.blob_client, ) assert intermediate_storage.root == "/".join( ["custom_prefix", "storage", result.run_id]) assert (intermediate_storage.get_intermediate( context, Int, StepOutputHandle("return_one")).obj == 1) assert (intermediate_storage.get_intermediate( context, Int, StepOutputHandle("add_one")).obj == 2)
def test_adls2_pipeline_with_custom_prefix(storage_account, file_system):
    adls2_prefix = 'custom_prefix'

    pipe = define_inty_pipeline(should_throw=False)
    run_config = {
        'resources': {
            'adls2': {
                'config': {
                    'storage_account': storage_account,
                    'credential': get_azure_credential(),
                }
            }
        },
        'storage': {
            'adls2': {
                'config': {'adls2_file_system': file_system, 'adls2_prefix': adls2_prefix}
            }
        },
    }

    pipeline_run = PipelineRun(pipeline_name=pipe.name, run_config=run_config)
    instance = DagsterInstance.ephemeral()

    result = execute_pipeline(pipe, run_config=run_config)
    assert result.success

    execution_plan = create_execution_plan(pipe, run_config)
    with scoped_pipeline_context(execution_plan, run_config, pipeline_run, instance,) as context:
        resource = context.scoped_resources_builder.build(required_resource_keys={'adls2'}).adls2
        store = ADLS2IntermediateStore(
            run_id=result.run_id,
            file_system=file_system,
            prefix=adls2_prefix,
            adls2_client=resource.adls2_client,
            blob_client=resource.blob_client,
        )
        intermediates_manager = IntermediateStoreIntermediatesManager(store)
        assert store.root == '/'.join(['custom_prefix', 'storage', result.run_id])
        assert (
            intermediates_manager.get_intermediate(
                context, Int, StepOutputHandle('return_one.compute')
            ).obj
            == 1
        )
        assert (
            intermediates_manager.get_intermediate(
                context, Int, StepOutputHandle('add_one.compute')
            ).obj
            == 2
        )
def reconstitute_pipeline_context(
    self,
    output_log_path=None,
    marshal_dir=None,
    environment_dict=None,
    handle=None,
    run_config=None,
    solid_subset=None,
    solid_handle=None,
):
    '''Reconstitutes a context for dagstermill-managed execution.

    You'll see this function called to reconstruct a pipeline context within the ``injected
    parameters`` cell of a dagstermill output notebook. Users should not call this function
    interactively except when debugging output notebooks.

    Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a
    context for interactive exploration and development. This call will be replaced by one to
    :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by
    dagstermill.
    '''
    check.opt_str_param(output_log_path, 'output_log_path')
    check.opt_str_param(marshal_dir, 'marshal_dir')
    environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
    check.inst_param(run_config, 'run_config', RunConfig)
    check.inst_param(handle, 'handle', ExecutionTargetHandle)
    check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
    check.inst_param(solid_handle, 'solid_handle', SolidHandle)

    pipeline_def = check.inst_param(
        handle.build_pipeline_definition(),
        'pipeline_def (from handle {handle_dict})'.format(handle_dict=handle.data._asdict()),
        PipelineDefinition,
    ).build_sub_pipeline(solid_subset)

    solid_def = pipeline_def.get_solid(solid_handle)

    run_config = run_config.with_log_sink(construct_sqlite_logger(output_log_path))

    self.marshal_dir = marshal_dir
    self.in_pipeline = True
    self.solid_def = solid_def
    self.pipeline_def = pipeline_def

    with scoped_pipeline_context(
        self.pipeline_def,
        environment_dict,
        run_config,
        scoped_resources_builder_cm=self._setup_resources,
    ) as pipeline_context:
        self.context = DagstermillExecutionContext(pipeline_context)

    return self.context
def yield_empty_pipeline_context(run_id=None, instance=None):
    pipeline = PipelineDefinition([])
    instance = check.opt_inst_param(
        instance, 'instance', DagsterInstance, default=DagsterInstance.ephemeral()
    )
    pipeline_run = instance.get_or_create_run(
        run_id=run_id, pipeline_name='<empty>', pipeline_snapshot=None
    )
    with scoped_pipeline_context(
        pipeline, {}, pipeline_run, instance, create_execution_plan(pipeline),
    ) as context:
        yield context
def test_s3_pipeline_with_custom_prefix(s3_bucket):
    s3_prefix = 'custom_prefix'

    pipe = define_inty_pipeline(should_throw=False)
    environment_dict = {
        'storage': {'s3': {'config': {'s3_bucket': s3_bucket, 's3_prefix': s3_prefix}}}
    }

    pipeline_run = PipelineRun(pipeline_name=pipe.name, environment_dict=environment_dict)
    instance = DagsterInstance.ephemeral()

    result = execute_pipeline(pipe, environment_dict=environment_dict)
    assert result.success

    execution_plan = create_execution_plan(pipe, environment_dict)
    with scoped_pipeline_context(
        execution_plan, environment_dict, pipeline_run, instance,
    ) as context:
        store = S3IntermediateStore(
            run_id=result.run_id,
            s3_bucket=s3_bucket,
            s3_prefix=s3_prefix,
            s3_session=context.scoped_resources_builder.build(required_resource_keys={'s3'}).s3,
        )
        intermediates_manager = IntermediateStoreIntermediatesManager(store)
        assert store.root == '/'.join(['custom_prefix', 'storage', result.run_id])
        assert (
            intermediates_manager.get_intermediate(
                context, Int, StepOutputHandle('return_one.compute')
            ).obj
            == 1
        )
        assert (
            intermediates_manager.get_intermediate(
                context, Int, StepOutputHandle('add_one.compute')
            ).obj
            == 2
        )
def get_context(self, solid_def=None, mode_def=None, environment_dict=None):
    check.opt_inst_param(solid_def, 'solid_def', SolidDefinition)
    check.opt_inst_param(mode_def, 'mode_def', ModeDefinition)
    environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)

    if solid_def is None:
        solid_def = SolidDefinition(
            name='this_solid',
            input_defs=[],
            compute_fn=lambda *args, **kwargs: None,
            output_defs=[],
            description='Ephemeral solid constructed by dagstermill.get_context()',
        )

    if not mode_def:
        mode_def = ModeDefinition(logger_defs={'dagstermill': colored_console_logger})
        environment_dict['loggers'] = {'dagstermill': {}}

    pipeline_def = PipelineDefinition(
        [solid_def], mode_defs=[mode_def], name='ephemeral_dagstermill_pipeline'
    )

    run_config = RunConfig(mode=mode_def.name)

    self.in_pipeline = False
    self.solid_def = solid_def
    self.pipeline_def = pipeline_def

    with scoped_pipeline_context(
        self.pipeline_def,
        environment_dict,
        run_config,
        scoped_resources_builder_cm=self._setup_resources,
    ) as pipeline_context:
        self.context = DagstermillInPipelineExecutionContext(pipeline_context)

    return self.context
def test_gcs_pipeline_with_custom_prefix(gcs_bucket):
    gcs_prefix = 'custom_prefix'

    pipe = define_inty_pipeline(should_throw=False)
    run_config = {
        'storage': {'gcs': {'config': {'gcs_bucket': gcs_bucket, 'gcs_prefix': gcs_prefix}}}
    }

    pipeline_run = PipelineRun(pipeline_name=pipe.name, run_config=run_config)
    instance = DagsterInstance.ephemeral()

    result = execute_pipeline(pipe, run_config=run_config)
    assert result.success

    execution_plan = create_execution_plan(pipe, run_config)
    with scoped_pipeline_context(execution_plan, run_config, pipeline_run, instance,) as context:
        intermediate_storage = GCSIntermediateStorage(
            run_id=result.run_id,
            gcs_bucket=gcs_bucket,
            gcs_prefix=gcs_prefix,
            client=context.scoped_resources_builder.build(required_resource_keys={'gcs'}).gcs,
        )
        assert intermediate_storage.root == '/'.join(['custom_prefix', 'storage', result.run_id])
        assert (
            intermediate_storage.get_intermediate(
                context, Int, StepOutputHandle('return_one.compute')
            ).obj
            == 1
        )
        assert (
            intermediate_storage.get_intermediate(
                context, Int, StepOutputHandle('add_one.compute')
            ).obj
            == 2
        )
def test_gcs_pipeline_with_custom_prefix(gcs_bucket): gcs_prefix = "custom_prefix" pipe = define_inty_pipeline(should_throw=False) run_config = { "intermediate_storage": { "gcs": { "config": { "gcs_bucket": gcs_bucket, "gcs_prefix": gcs_prefix } } } } pipeline_run = PipelineRun(pipeline_name=pipe.name, run_config=run_config) instance = DagsterInstance.ephemeral() result = execute_pipeline( pipe, run_config=run_config, ) assert result.success execution_plan = create_execution_plan(pipe, run_config) with scoped_pipeline_context( execution_plan, run_config, pipeline_run, instance, ) as context: intermediate_storage = GCSIntermediateStorage( run_id=result.run_id, gcs_bucket=gcs_bucket, gcs_prefix=gcs_prefix, client=context.scoped_resources_builder.build( required_resource_keys={"gcs"}, ).gcs, ) assert intermediate_storage.root == "/".join( ["custom_prefix", "storage", result.run_id]) assert (intermediate_storage.get_intermediate( context, Int, StepOutputHandle("return_one")).obj == 1) assert (intermediate_storage.get_intermediate( context, Int, StepOutputHandle("add_one")).obj == 2)
def execute(self):
    from dagster.core.execution.api import scoped_pipeline_context

    check.inst(self.run_config.executor_config, MultiprocessExecutorConfig)
    pipeline = self.run_config.executor_config.handle.build_pipeline_definition()

    with scoped_pipeline_context(
        pipeline, self.environment_dict, self.run_config.with_tags(pid=str(os.getpid()))
    ) as pipeline_context:
        execution_plan = ExecutionPlan.build(
            pipeline_context.pipeline_def, pipeline_context.environment_config
        )

        for step_event in InProcessEngine.execute(
            pipeline_context, execution_plan, step_keys_to_execute=[self.step_key]
        ):
            yield step_event
def yield_empty_pipeline_context(run_id=None):
    with scoped_pipeline_context(PipelineDefinition([]), {}, RunConfig(run_id=run_id)) as context:
        yield context
def populate_context(
    self,
    run_id=None,
    mode=None,
    solid_def_name=None,
    pipeline_name=None,
    marshal_dir=None,
    environment_config=None,
    input_name_type_dict=None,
    output_name_type_dict=None,
    output_log_path=None,
    **_kwargs
):
    check.str_param(run_id, 'run_id')
    check.str_param(mode, 'mode')
    check.str_param(solid_def_name, 'solid_def_name')
    check.str_param(pipeline_name, 'pipeline_name')
    check.str_param(marshal_dir, 'marshal_dir')
    check.dict_param(environment_config, 'environment_config')
    check.dict_param(input_name_type_dict, 'input_name_type_dict')
    check.dict_param(output_name_type_dict, 'output_name_type_dict')
    check.str_param(output_log_path, 'output_log_path')

    self.populated_by_papermill = True
    self.solid_def_name = solid_def_name
    self.marshal_dir = marshal_dir

    logger_def = construct_logger(output_log_path)
    loggers = {'dagstermill': logger_def}

    if self.repository_def is None:
        self.solid_def = None
        self.pipeline_def = PipelineDefinition(
            [],
            mode_definitions=[ModeDefinition(loggers=loggers)],
            name='Dummy Pipeline (No Repo Registration)',
        )
        self.input_name_type_dict = dict_to_enum(input_name_type_dict)
        self.output_name_type_dict = dict_to_enum(output_name_type_dict)
        for _, runtime_type_enum in self.input_name_type_dict.items():
            if runtime_type_enum == SerializableRuntimeType.NONE:
                raise DagstermillError(
                    'If Dagstermill solids have inputs that require serialization strategies '
                    'that are not pickling, then you must register a repository within '
                    'notebook by calling dagstermill.register_repository(repository_def)'
                )
        for _, runtime_type_enum in self.output_name_type_dict.items():
            if runtime_type_enum == SerializableRuntimeType.NONE:
                raise DagstermillError(
                    'If Dagstermill solids have outputs that require serialization strategies '
                    'that are not pickling, then you must register a repository within '
                    'notebook by calling dagstermill.register_repository(repository_def).'
                )
        environment_config = {'loggers': {'dagstermill': {}}}
        run_config = RunConfig(run_id=run_id, mode=mode)
    else:
        self.pipeline_def = self.repository_def.get_pipeline(pipeline_name)
        check.invariant(
            self.pipeline_def.has_solid_def(solid_def_name),
            'solid {} not found'.format(solid_def_name),
        )
        self.solid_def = self.pipeline_def.solid_def_named(solid_def_name)

        logger = logger_def.logger_fn(
            InitLoggerContext({}, self.pipeline_def, logger_def, run_id)
        )

        run_config = RunConfig(run_id, loggers=[logger], mode=mode)

    with scoped_pipeline_context(
        self.pipeline_def,
        environment_config,
        run_config,
        scoped_resources_builder_cm=self.setup_resources,
    ) as pipeline_context:
        self.context = DagstermillInNotebookExecutionContext(pipeline_context)

    return self.context
def reconstitute_pipeline_context(
    self,
    output_log_path=None,
    marshal_dir=None,
    run_config=None,
    executable_dict=None,
    pipeline_run_dict=None,
    solid_handle_kwargs=None,
    instance_ref_dict=None,
):
    """Reconstitutes a context for dagstermill-managed execution.

    You'll see this function called to reconstruct a pipeline context within the ``injected
    parameters`` cell of a dagstermill output notebook. Users should not call this function
    interactively except when debugging output notebooks.

    Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a
    context for interactive exploration and development. This call will be replaced by one to
    :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by
    dagstermill.
    """
    check.opt_str_param(output_log_path, "output_log_path")
    check.opt_str_param(marshal_dir, "marshal_dir")
    run_config = check.opt_dict_param(run_config, "run_config", key_type=str)
    check.dict_param(pipeline_run_dict, "pipeline_run_dict")
    check.dict_param(executable_dict, "executable_dict")
    check.dict_param(solid_handle_kwargs, "solid_handle_kwargs")
    check.dict_param(instance_ref_dict, "instance_ref_dict")

    pipeline = ReconstructablePipeline.from_dict(executable_dict)
    pipeline_def = pipeline.get_definition()

    try:
        instance_ref = unpack_value(instance_ref_dict)
        instance = DagsterInstance.from_ref(instance_ref)
    except Exception as err:  # pylint: disable=broad-except
        raise DagstermillError(
            "Error when attempting to resolve DagsterInstance from serialized InstanceRef"
        ) from err

    pipeline_run = unpack_value(pipeline_run_dict)

    solid_handle = SolidHandle.from_dict(solid_handle_kwargs)
    solid_def = pipeline_def.get_solid(solid_handle).definition

    self.marshal_dir = marshal_dir
    self.in_pipeline = True
    self.solid_def = solid_def
    self.pipeline = pipeline

    environment_config = EnvironmentConfig.build(pipeline_def, run_config, mode=pipeline_run.mode)

    execution_plan = ExecutionPlan.build(
        self.pipeline,
        environment_config,
        step_keys_to_execute=pipeline_run.step_keys_to_execute,
    )

    with scoped_pipeline_context(
        execution_plan,
        pipeline,
        run_config,
        pipeline_run,
        instance,
        scoped_resources_builder_cm=self._setup_resources,
        # Set this flag even though we're not in test for clearer error reporting
        raise_on_error=True,
    ) as pipeline_context:
        self.context = DagstermillRuntimeExecutionContext(
            pipeline_context=pipeline_context,
            pipeline_def=pipeline_def,
            solid_config=run_config.get("solids", {}).get(solid_def.name, {}).get("config"),
            resource_keys_to_init=get_required_resource_keys_to_init(
                execution_plan,
                pipeline_def,
                environment_config,
                pipeline_context.intermediate_storage_def,
            ),
            solid_name=solid_def.name,
        )

    return self.context
def execute_on_dask(
    handle, env_config=None, run_config=None, mode=None, dask_config=None
):  # pylint: disable=too-many-locals
    check.inst_param(handle, 'handle', ExecutionTargetHandle)
    env_config = check.opt_dict_param(env_config, 'env_config', key_type=str)
    dask_config = check.opt_inst_param(dask_config, 'dask_config', DaskConfig, DaskConfig())
    run_config = check.opt_inst_param(
        run_config, 'run_config', RunConfig, RunConfig(storage_mode=RunStorageMode.FILESYSTEM)
    )
    pipeline = handle.build_pipeline_definition()
    mode = check.opt_str_param(mode, 'mode', pipeline.get_default_mode_name())

    # Checks to ensure storage is compatible with Dask configuration
    storage = env_config.get('storage')
    check.invariant(storage.keys(), 'Must specify storage to use Dask execution')

    if dask_config.is_remote_execution:
        check.invariant(
            storage.get('s3'),
            'Must use S3 storage with non-local Dask address {dask_address}'.format(
                dask_address=dask_config.address
            ),
        )
    else:
        check.invariant(
            not storage.get('in_memory'),
            'Cannot use in-memory storage with Dask, use filesystem or S3',
        )

    execution_plan = create_execution_plan(pipeline, env_config, mode=mode)

    step_levels = execution_plan.topological_step_levels()

    query = build_graphql_query()

    with scoped_pipeline_context(pipeline, env_config, run_config) as pipeline_context:
        with dask.distributed.Client(**dask_config.build_dict(pipeline.name)) as client:
            execution_futures = []
            execution_futures_dict = {}

            for step_level in step_levels:
                for step in step_level:
                    step_context = pipeline_context.for_step(step)

                    check.invariant(
                        not step_context.run_config.loggers,
                        'Cannot inject loggers via RunConfig with the Dask executor',
                    )

                    check.invariant(
                        not step_context.event_callback,
                        'Cannot use event_callback with Dask executor',
                    )

                    # We ensure correctness in sequencing by letting Dask schedule futures and
                    # awaiting dependencies within each step.
                    dependencies = [
                        execution_futures_dict[ni.prev_output_handle.step_key]
                        for ni in step.step_inputs
                    ]

                    variables = {
                        'executionParams': {
                            'selector': {'name': pipeline.name},
                            'environmentConfigData': env_config,
                            'mode': mode,
                            'executionMetadata': {'runId': run_config.run_id},
                            'stepKeys': [step.key],
                        }
                    }

                    future = client.submit(
                        query_on_dask_worker, handle, query, variables, dependencies
                    )

                    execution_futures.append(future)
                    execution_futures_dict[step.key] = future

            # This tells Dask to await the step executions and retrieve their results to the
            # master
            execution_step_events = client.gather(execution_futures)

            # execution_step_events is now a list of lists; the inner lists contain the dagster
            # events emitted by each step
            event_list = list(itertools.chain.from_iterable(execution_step_events))

            return PipelineExecutionResult(
                pipeline,
                run_config.run_id,
                event_list,
                lambda: scoped_pipeline_context(
                    pipeline,
                    env_config,
                    run_config,
                    intermediates_manager=pipeline_context.intermediates_manager,
                ),
            )
def get_context(self, solid_config=None, mode_def=None, run_config=None):
    """Get a dagstermill execution context for interactive exploration and development.

    Args:
        solid_config (Optional[Any]): If specified, this value will be made available on the
            context as its ``solid_config`` property.
        mode_def (Optional[:class:`dagster.ModeDefinition`]): If specified, defines the mode to
            use to construct the context. Specify this if you would like a context constructed
            with specific ``resource_defs`` or ``logger_defs``. By default, an ephemeral mode
            with a console logger will be constructed.
        run_config(Optional[dict]): The environment config dict with which to construct the
            context.

    Returns:
        :py:class:`~dagstermill.DagstermillExecutionContext`
    """
    check.opt_inst_param(mode_def, "mode_def", ModeDefinition)
    run_config = check.opt_dict_param(run_config, "run_config", key_type=str)

    # If we are running non-interactively, and there is already a context reconstituted, return
    # that context rather than overwriting it.
    if self.context is not None and isinstance(self.context, DagstermillRuntimeExecutionContext):
        return self.context

    if not mode_def:
        mode_def = ModeDefinition(logger_defs={"dagstermill": colored_console_logger})
        run_config["loggers"] = {"dagstermill": {}}

    solid_def = SolidDefinition(
        name="this_solid",
        input_defs=[],
        compute_fn=lambda *args, **kwargs: None,
        output_defs=[],
        description="Ephemeral solid constructed by dagstermill.get_context()",
        required_resource_keys=mode_def.resource_key_set,
    )

    pipeline_def = PipelineDefinition(
        [solid_def], mode_defs=[mode_def], name="ephemeral_dagstermill_pipeline"
    )

    run_id = make_new_run_id()

    # construct stubbed PipelineRun for notebook exploration...
    # The actual pipeline run during pipeline execution will be serialized and reconstituted
    # in the `reconstitute_pipeline_context` call
    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name,
        run_id=run_id,
        run_config=run_config,
        mode=mode_def.name,
        step_keys_to_execute=None,
        status=PipelineRunStatus.NOT_STARTED,
        tags=None,
    )

    self.in_pipeline = False
    self.solid_def = solid_def
    self.pipeline = pipeline_def

    environment_config = EnvironmentConfig.build(pipeline_def, run_config, mode=mode_def.name)

    pipeline = InMemoryPipeline(pipeline_def)
    execution_plan = ExecutionPlan.build(pipeline, environment_config)

    with scoped_pipeline_context(
        execution_plan,
        pipeline,
        run_config,
        pipeline_run,
        DagsterInstance.ephemeral(),
        scoped_resources_builder_cm=self._setup_resources,
    ) as pipeline_context:
        self.context = DagstermillExecutionContext(
            pipeline_context=pipeline_context,
            pipeline_def=pipeline_def,
            solid_config=solid_config,
            resource_keys_to_init=get_required_resource_keys_to_init(
                execution_plan,
                pipeline_def,
                environment_config,
                pipeline_context.intermediate_storage_def,
            ),
            solid_name=solid_def.name,
        )

    return self.context