@resource(config=Field(Int))
def multer_resource(init_context):
    return lambda x: x * init_context.resource_config


@resource(config={'num_one': Field(Int), 'num_two': Field(Int)})
def double_adder_resource(init_context):
    return (
        lambda x: x
        + init_context.resource_config['num_one']
        + init_context.resource_config['num_two']
    )


@pipeline(
    mode_defs=[
        ModeDefinition(
            name='add_mode',
            resource_defs={'op': adder_resource},
            description='Mode that adds things',
        ),
        ModeDefinition(
            name='mult_mode',
            resource_defs={'op': multer_resource},
            description='Mode that multiplies things',
        ),
        ModeDefinition(
            name='double_adder',
            resource_defs={'op': double_adder_resource},
            description='Mode that adds two numbers to things',
        ),
    ],
    preset_defs=[PresetDefinition.from_files("add", mode="add_mode")],
)
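# --- Added usage sketch (not from the original source). ---
# Illustrates how one of the modes declared above could be selected at
# execution time. The pipeline name `pipeline_with_modes` is an assumption
# (the decorated function is cut off in this excerpt), and the `run_config=` /
# `mode=` arguments assume a dagster version with the 0.8+ style
# `execute_pipeline` signature.
from dagster import execute_pipeline

result = execute_pipeline(
    pipeline_with_modes,  # hypothetical name for the pipeline decorated above
    run_config={'resources': {'op': {'config': 2}}},  # Int config for multer_resource
    mode='mult_mode',
)
assert result.success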
    iterations = context.solid_config['iterations']

    for iteration in range(iterations):
        # Calculates URL contributions to the rank of other URLs.
        contribs = links.join(ranks).flatMap(
            lambda url_urls_rank: computeContribs(url_urls_rank[1][0], url_urls_rank[1][1])
        )
        # Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(add).mapValues(lambda rank: rank * 0.85 + 0.15)
        context.log.info('Completed iteration {}'.format(iteration))

    return ranks


@solid(input_defs=[InputDefinition(name='ranks', dagster_type=SparkRDD)])
def log_ranks_step_five(context, ranks):
    for (link, rank) in ranks.collect():
        context.log.info("%s has rank: %s." % (link, rank))
    return ranks.collect()


@pipeline(mode_defs=[ModeDefinition(resource_defs={'spark': pyspark_resource})])
def pyspark_pagerank_step_five():
    log_ranks_step_five(
        calculate_ranks_step_five(compute_links_step_five(parse_pagerank_data_step_five()))
    )
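# --- Added helper sketch (not from the original source). ---
# `computeContribs` is referenced above but not defined in this excerpt; the
# classic PySpark PageRank example defines it as follows:
def computeContribs(urls, rank):
    """Distribute a url's rank evenly among the urls it links to."""
    num_urls = len(urls)
    for url in urls:
        yield (url, rank / num_urls)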
def test_two_modes():
    pipeline_def = PipelineDefinition(
        name='TwoModePipelines',
        solids=[],
        mode_definitions=[
            ModeDefinition(
                'mode_one',
                resources={
                    'value': dummy_resource(Field(Dict({'mode_one_field': Field(String)})))
                },
            ),
            ModeDefinition(
                'mode_two',
                resources={
                    'value': dummy_resource(Field(Dict({'mode_two_field': Field(Int)})))
                },
            ),
        ],
    )

    assert scaffold_pipeline_config(pipeline_def, mode='mode_one') == {
        'resources': {'value': {'config': {'mode_one_field': ''}}}
    }

    assert scaffold_pipeline_config(pipeline_def, mode='mode_one', skip_optional=False) == {
        'loggers': {'console': {'config': {'log_level': '', 'name': ''}}},
        'solids': {},
        'expectations': {'evaluate': True},
        'storage': {
            'in_memory': {},
            'filesystem': {'base_dir': ''},
            's3': {'s3_bucket': ''},
        },
        'execution': {},
        'resources': {'value': {'config': {'mode_one_field': ''}}},
    }

    assert scaffold_pipeline_config(pipeline_def, mode='mode_two') == {
        'resources': {'value': {'config': {'mode_two_field': 0}}}
    }

    assert scaffold_pipeline_config(pipeline_def, mode='mode_two', skip_optional=False) == {
        'solids': {},
        'expectations': {'evaluate': True},
        'storage': {
            'in_memory': {},
            'filesystem': {'base_dir': ''},
            's3': {'s3_bucket': ''},
        },
        'execution': {},
        'resources': {'value': {'config': {'mode_two_field': 0}}},
        'loggers': {'console': {'config': {'log_level': '', 'name': ''}}},
    }
    SELECT
        FORMAT_DATETIME("%F %H:00:00", DATETIME(TIMESTAMP_SECONDS(CAST(timestamp AS INT64)))) AS ts,
        COUNT(1) AS num_visits
    FROM events.events
    WHERE url = '/explore'
    GROUP BY ts
    ORDER BY ts ASC
    '''
    context.resources.bigquery.query(sql, job_config=query_job_config)


@pipeline(
    mode_defs=[
        ModeDefinition(
            name='default',
            resource_defs={'bigquery': bigquery_resource, 'dataproc': dataproc_resource},
        )
    ],
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            'default',
            pkg_resource_defs=[
                ('dagster_examples.gcp_data_platform.environments', 'resources_pipeline.yaml'),
            ],
        )
    ],
)
def gcp_data_platform():
    load_data_to_database_from_spark,
    process_sfo_weather_data,
    q2_sfo_outbound_flights,
    s3_to_df,
    s3_to_dw_table,
    sfo_delays_by_destination,
    tickets_with_destination,
    westbound_delays,
)

test_mode = ModeDefinition(
    name='test',
    resource_defs={
        'spark': spark_session_local,
        'db_info': redshift_db_info_resource,
        'tempfile': tempfile_resource,
        's3': s3_resource,
        'file_cache': fs_file_cache,
    },
    system_storage_defs=s3_plus_default_storage_defs,
)

local_mode = ModeDefinition(
    name='local',
    resource_defs={
        'spark': spark_session_local,
        's3': s3_resource,
        'db_info': postgres_db_info_resource,
        'tempfile': tempfile_resource,
        'file_cache': fs_file_cache,
    },
"inputs": {"table_name": "weather"}, }, "upload_training_set_to_gcs": { "inputs": {"bucket_name": "dagster-scratch-ccdfe1e", "file_name": "training_data",} }, }, } @pipeline( mode_defs=[ ModeDefinition( name='testing', resource_defs={ 'postgres_db': postgres_db_info_resource, 'gcs_client': testing_client, 'volume': mount, }, description='Mode to be used during testing. Allows us to clean up test artifacts without interfearing with local artifacts.', ), ], ) def generate_test_training_set_pipeline(): upload_training_set_to_gcs = upload_pickled_object_to_gcs_bucket.alias( 'upload_training_set_to_gcs' ) return upload_training_set_to_gcs( produce_training_set( transform_into_traffic_dataset(produce_trip_dataset()), produce_weather_dataset(), ) )
@solid(required_resource_keys={'R1'})
def one(context):
    return 1 + context.resources.R1


@solid(required_resource_keys={'R2'})
def two(_):
    return 1


@solid(required_resource_keys={'R1', 'R2', 'R3'})
def one_and_two_and_three(_):
    return 1


@pipeline(mode_defs=[ModeDefinition(resource_defs=lots_of_resources)])
def resource_pipeline():
    all_resources()
    one()
    two()
    one_and_two_and_three()


if __name__ == '__main__':
    result = execute_pipeline(
        ExecutionTargetHandle.for_pipeline_fn(resource_pipeline).build_pipeline_definition(),
        environment_dict={'storage': {'filesystem': {}}, 'execution': {'multiprocessing': {}}},
    )
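# --- Added sketch (not from the original source). ---
# `lots_of_resources` is referenced above but not defined in this excerpt.
# One plausible shape (it would precede the pipeline definition in the real
# module), using ResourceDefinition.hardcoded_resource so that
# `context.resources.R1` resolves to a number, as the `one` solid expects:
from dagster import ResourceDefinition

lots_of_resources = {
    'R1': ResourceDefinition.hardcoded_resource(1),
    'R2': ResourceDefinition.hardcoded_resource(2),
    'R3': ResourceDefinition.hardcoded_resource(3),
}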
@solid(
    input_defs=[
        InputDefinition("_string_input_1", String),
        InputDefinition("_string_input_2", String),
    ],
    version="take_string_two_inputs_version",
    config_schema={"input_str": Field(String), "base_dir": Field(String)},
)
def take_string_two_inputs(context, _string_input_1, _string_input_2):
    yield Output(
        context.solid_config["input_str"],
        address=os.path.join(
            context.solid_config["base_dir"],
            "intermediates/take_string_two_inputs.compute/result",
        ),
    )


@pipeline(
    mode_defs=[
        ModeDefinition("only_mode", intermediate_storage_defs=[fs_intermediate_storage])
    ]
)
def basic_pipeline():
    take_string_two_inputs(
        _string_input_1=take_string_1(create_string_1()),
        _string_input_2=take_string_2(create_string_2()),
    )
def test_optional_and_required_context():
    pipeline_def = PipelineDefinition(
        name='some_pipeline',
        solids=[],
        mode_definitions=[
            ModeDefinition(
                name='mixed',
                resources={
                    'optional_resource': ResourceDefinition(
                        lambda: None,
                        config_field=Field(
                            dagster_type=Dict(
                                fields={'optional_field': Field(String, is_optional=True)}
                            )
                        ),
                    ),
                    'required_resource': ResourceDefinition(
                        lambda: None,
                        config_field=Field(
                            dagster_type=Dict(fields={'required_field': Field(String)})
                        ),
                    ),
                },
            )
        ],
    )

    env_type = create_environment_type(pipeline_def)
    assert env_type.fields['solids'].is_optional
    assert env_type.fields['execution'].is_optional
    assert env_type.fields['expectations'].is_optional
    assert nested_field(env_type, 'resources').is_required
    assert nested_field(env_type, 'resources', 'optional_resource').is_optional
    assert nested_field(env_type, 'resources', 'optional_resource', 'config').is_optional
    assert nested_field(
        env_type, 'resources', 'optional_resource', 'config', 'optional_field'
    ).is_optional
    assert nested_field(env_type, 'resources', 'required_resource').is_required
    assert nested_field(env_type, 'resources', 'required_resource', 'config').is_required
    assert nested_field(
        env_type, 'resources', 'required_resource', 'config', 'required_field'
    ).is_required

    env_obj = EnvironmentConfig.from_dict(
        throwing_evaluate_config_value(
            env_type,
            {'resources': {'required_resource': {'config': {'required_field': 'foo'}}}},
        )
    )

    assert env_obj.resources == {
        'optional_resource': {'config': {}},
        'required_resource': {'config': {'required_field': 'foo'}},
    }
def define_pipeline():
    @pipeline(mode_defs=[ModeDefinition(resource_defs={'a': resource_a})])
    def spew_pipeline():
        spew(spew(spawn()))

    return spew_pipeline
import pytest

from dagster import ModeDefinition, execute_pipeline, pipeline, solid

# Import path assumed from the dagster_aws package layout; not shown in the
# original excerpt:
from dagster_aws.cloudwatch.loggers import cloudwatch_logger

from .conftest import AWS_REGION, TEST_CLOUDWATCH_LOG_GROUP_NAME, TEST_CLOUDWATCH_LOG_STREAM_NAME

TEN_MINUTES_MS = 10 * 60 * 1000  # in milliseconds
NUM_POLL_ATTEMPTS = 5


@solid
def hello_cloudwatch(context):
    context.log.info('Hello, Cloudwatch!')
    context.log.error('This is an error')


@pipeline(mode_defs=[ModeDefinition(logger_defs={'cloudwatch': cloudwatch_logger})])
def hello_cloudwatch_pipeline():
    hello_cloudwatch()


def test_cloudwatch_logging_bad_log_group_name():
    with pytest.raises(
        Exception,
        match='Failed to initialize Cloudwatch logger: Could not find log group with name foo',
    ):
        execute_pipeline(
            hello_cloudwatch_pipeline,
            {
                'loggers': {
                    'cloudwatch': {
                        'config': {
def test_cache_file_from_s3_overwrite():
    with tempfile.TemporaryDirectory() as temp_dir:
        s3_session_one = mock.MagicMock()
        execute_solid(
            cache_file_from_s3,
            ModeDefinition(
                resource_defs={
                    "file_cache": fs_file_cache,
                    "s3": ResourceDefinition.hardcoded_resource(s3_session_one),
                }
            ),
            run_config={
                "solids": {
                    "cache_file_from_s3": {
                        "inputs": {
                            "s3_coordinate": {"bucket": "some-bucket", "key": "some-key"}
                        }
                    }
                },
                "resources": {
                    "file_cache": {"config": {"target_folder": temp_dir, "overwrite": True}}
                },
            },
        )

        # assert the download occurred
        assert s3_session_one.download_file.call_count == 1

        s3_session_two = mock.MagicMock()
        execute_solid(
            cache_file_from_s3,
            ModeDefinition(
                resource_defs={
                    "file_cache": fs_file_cache,
                    "s3": ResourceDefinition.hardcoded_resource(s3_session_two),
                }
            ),
            run_config={
                "solids": {
                    "cache_file_from_s3": {
                        "inputs": {
                            "s3_coordinate": {"bucket": "some-bucket", "key": "some-key"}
                        }
                    }
                },
                "resources": {
                    "file_cache": {"config": {"target_folder": temp_dir, "overwrite": True}}
                },
            },
        )

        # assert the download did not occur because the file is already there
        assert s3_session_two.download_file.call_count == 0
    Output,
    OutputDefinition,
    PipelineExecutionResult,
    SolidExecutionResult,
    default_executors,
    execute_pipeline,
    lambda_solid,
    pipeline,
    seven,
    solid,
)
from dagster.core.instance import DagsterInstance
from dagster.core.test_utils import nesting_composite_pipeline

celery_mode_defs = [ModeDefinition(executor_defs=default_executors + [celery_executor])]

BUILDKITE = os.getenv('BUILDKITE')

skip_ci = pytest.mark.skipif(
    bool(BUILDKITE),
    reason='Tests hang forever on buildkite for reasons we don\'t currently understand',
)


@solid
def simple(_):
    return 1
def reconstitute_pipeline_context(
    self,
    output_log_path=None,
    marshal_dir=None,
    environment_dict=None,
    handle=None,
    run_config=None,
    solid_handle=None,
):
    '''Reconstitutes a context for dagstermill-managed execution.

    You'll see this function called to reconstruct a pipeline context within the
    ``injected parameters`` cell of a dagstermill output notebook. Users should not call this
    function interactively except when debugging output notebooks.

    Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a
    context for interactive exploration and development. This call will be replaced by one to
    :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by
    dagstermill.
    '''
    check.opt_str_param(output_log_path, 'output_log_path')
    check.opt_str_param(marshal_dir, 'marshal_dir')
    environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
    check.inst_param(run_config, 'run_config', RunConfig)
    check.inst_param(handle, 'handle', ExecutionTargetHandle)
    check.inst_param(solid_handle, 'solid_handle', SolidHandle)

    pipeline_def = check.inst_param(
        handle.build_pipeline_definition(),
        'pipeline_def (from handle {handle_dict})'.format(handle_dict=handle.data._asdict()),
        PipelineDefinition,
    )

    solid_def = pipeline_def.get_solid(solid_handle)

    mode_def = pipeline_def.get_mode_definition(run_config.mode)
    shim_mode_def = ModeDefinition(
        name=mode_def.name,
        logger_defs=dict(
            mode_def.loggers, dagstermill=construct_sqlite_logger(output_log_path)
        ),
        resource_defs=mode_def.resource_defs,
    )

    pipeline_def = PipelineDefinition(
        pipeline_def.solid_defs,
        name=pipeline_def.name,
        description=pipeline_def.description,
        dependencies=pipeline_def.dependencies,
        mode_defs=[shim_mode_def],
        preset_defs=pipeline_def.preset_defs,
    )

    if 'loggers' not in environment_dict:
        environment_dict['loggers'] = {'dagstermill': {}}

    if 'dagstermill' not in environment_dict['loggers']:
        environment_dict['loggers']['dagstermill'] = {}

    self.marshal_dir = marshal_dir
    self.in_pipeline = True
    self.solid_def = solid_def
    self.pipeline_def = pipeline_def

    with scoped_pipeline_context(
        self.pipeline_def,
        environment_dict,
        run_config,
        scoped_resources_builder_cm=self._setup_resources,
    ) as pipeline_context:
        self.context = DagstermillExecutionContext(pipeline_context)

    return self.context
def get_context(self, solid_config=None, mode_def=None, run_config=None):
    """Get a dagstermill execution context for interactive exploration and development.

    Args:
        solid_config (Optional[Any]): If specified, this value will be made available on the
            context as its ``solid_config`` property.
        mode_def (Optional[:class:`dagster.ModeDefinition`]): If specified, defines the mode to
            use to construct the context. Specify this if you would like a context constructed
            with specific ``resource_defs`` or ``logger_defs``. By default, an ephemeral mode
            with a console logger will be constructed.
        run_config (Optional[dict]): The environment config dict with which to construct the
            context.

    Returns:
        :py:class:`~dagstermill.DagstermillExecutionContext`
    """
    check.opt_inst_param(mode_def, "mode_def", ModeDefinition)
    run_config = check.opt_dict_param(run_config, "run_config", key_type=str)

    # If we are running non-interactively, and there is already a context reconstituted,
    # return that context rather than overwriting it.
    if self.context is not None and isinstance(
        self.context, DagstermillRuntimeExecutionContext
    ):
        return self.context

    if not mode_def:
        mode_def = ModeDefinition(logger_defs={"dagstermill": colored_console_logger})
        run_config["loggers"] = {"dagstermill": {}}

    solid_def = SolidDefinition(
        name="this_solid",
        input_defs=[],
        compute_fn=lambda *args, **kwargs: None,
        output_defs=[],
        description="Ephemeral solid constructed by dagstermill.get_context()",
        required_resource_keys=mode_def.resource_key_set,
    )

    pipeline_def = PipelineDefinition(
        [solid_def], mode_defs=[mode_def], name="ephemeral_dagstermill_pipeline"
    )

    run_id = make_new_run_id()

    # Construct a stubbed PipelineRun for notebook exploration...
    # The actual pipeline run during pipeline execution will be serialized and reconstituted
    # in the `reconstitute_pipeline_context` call.
    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name,
        run_id=run_id,
        run_config=run_config,
        mode=mode_def.name,
        step_keys_to_execute=None,
        status=PipelineRunStatus.NOT_STARTED,
        tags=None,
    )

    self.in_pipeline = False
    self.solid_def = solid_def
    self.pipeline = pipeline_def

    execution_plan = create_execution_plan(self.pipeline, run_config, mode=mode_def.name)
    with scoped_pipeline_context(
        execution_plan,
        run_config,
        pipeline_run,
        DagsterInstance.ephemeral(),
        scoped_resources_builder_cm=self._setup_resources,
    ) as pipeline_context:
        self.context = DagstermillExecutionContext(
            pipeline_context=pipeline_context,
            solid_config=solid_config,
            resource_keys_to_init=get_required_resource_keys_to_init(
                execution_plan,
                pipeline_context.intermediate_storage_def,
            ),
            solid_name=solid_def.name,
        )

    return self.context
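# --- Added usage sketch (not from the original source). ---
# Interactive use of get_context() in a notebook `parameters` cell, matching
# the signature documented above; the `solid_config` value here is arbitrary.
import dagstermill

context = dagstermill.get_context(solid_config={'date': '2019-01-01'})
context.log.info('Running with solid_config: {}'.format(context.solid_config))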
    if not safe_isfile(outfile):
        mkdir_p(output_folder)
        with gzip.open(gzip_file, "rb") as f_in, open(outfile, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

    return [path_prefix]


@pipeline(
    mode_defs=[
        ModeDefinition(
            name="default",
            resource_defs={
                "s3": s3_resource,
                "snowflake": snowflake_resource,
                "spark": spark_resource,
            },
        )
    ],
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            "default",
            pkg_resource_defs=[
                ("dagster_examples.event_pipeline_demo.environments", "default.yaml"),
            ],
        )
    ],
)
def event_ingest_pipeline():