def test_yield_resource():
    called = {}

    @solid(required_resource_keys={"a_string"})
    def a_solid(context):
        called["yup"] = True
        assert context.resources.a_string == "foo"

    def _do_resource(init_context):
        yield init_context.resource_config

    yield_string_resource = ResourceDefinition(config_schema=String, resource_fn=_do_resource)

    pipeline_def = PipelineDefinition(
        name="with_a_yield_resource",
        solid_defs=[a_solid],
        mode_defs=[ModeDefinition(resource_defs={"a_string": yield_string_resource})],
    )

    result = execute_pipeline(pipeline_def, {"resources": {"a_string": {"config": "foo"}}})

    assert result.success
    assert called["yup"]
def test_cache_file_from_s3_basic():
    s3_session = mock.MagicMock()
    with get_temp_dir() as temp_dir:
        solid_result = execute_solid(
            cache_file_from_s3,
            ModeDefinition(
                resource_defs={
                    'file_cache': fs_file_cache,
                    's3': ResourceDefinition.hardcoded_resource(S3Resource(s3_session)),
                }
            ),
            environment_dict={
                'solids': {
                    'cache_file_from_s3': {
                        'inputs': {'s3_coordinate': {'bucket': 'some-bucket', 'key': 'some-key'}}
                    }
                },
                'resources': {'file_cache': {'config': {'target_folder': temp_dir}}},
            },
        )

        # assert the download occurred
        assert s3_session.download_file.call_count == 1

        assert solid_result.success

        expectation_results = solid_result.expectation_results_during_compute
        assert len(expectation_results) == 1
        expectation_result = expectation_results[0]
        assert expectation_result.success
        assert expectation_result.label == 'file_handle_exists'
        path_in_metadata = expectation_result.metadata_entries[0].entry_data.path
        assert isinstance(path_in_metadata, str)
        assert os.path.exists(path_in_metadata)

        assert isinstance(solid_result.output_value(), LocalFileHandle)
        assert 'some-key' in solid_result.output_value().path_desc
def define_dagma_resource():
    """Returns a ResourceDefinition appropriate for use of the dagma engine.

    Usage:

        from dagster import PipelineContextDefinition

        PipelineContextDefinition(
            ...,
            resources={
                ...,
                'dagma': define_dagma_resource(),
            },
        )
    """

    def _create_dagma_resource(info):
        sessionmaker = lambda: boto3.Session(  # Otherwise, can't be pickled b/c of ssl.SSLContext
            aws_access_key_id=info.get('aws_access_key_id'),
            aws_secret_access_key=info.get('aws_secret_access_key'),
            aws_session_token=info.get('aws_session_token'),
            region_name=info.config['aws_region_name'],
        )
        storage_config = dict(
            DEFAULT_STORAGE_CONFIG, sessionmaker=sessionmaker, s3_bucket=info.config['s3_bucket']
        )
        return DagmaResourceType(
            sessionmaker=sessionmaker,
            aws_region_name=info.config['aws_region_name'],
            storage=Storage(storage_config),
            s3_bucket=info.config['s3_bucket'],
            runtime_bucket=info.config['runtime_bucket'],
        )

    return ResourceDefinition(
        resource_fn=_create_dagma_resource, config_field=Field(DagmaResourceConfig)
    )
def test_cache_file_from_s3_basic():
    s3_session = mock.MagicMock()
    with tempfile.TemporaryDirectory() as temp_dir:
        solid_result = execute_solid(
            cache_file_from_s3,
            ModeDefinition(
                resource_defs={
                    "file_cache": fs_file_cache,
                    "s3": ResourceDefinition.hardcoded_resource(s3_session),
                }
            ),
            run_config={
                "solids": {
                    "cache_file_from_s3": {
                        "inputs": {"s3_coordinate": {"bucket": "some-bucket", "key": "some-key"}}
                    }
                },
                "resources": {"file_cache": {"config": {"target_folder": temp_dir}}},
            },
        )

        # assert the download occurred
        assert s3_session.download_file.call_count == 1

        assert solid_result.success

        expectation_results = solid_result.expectation_results_during_compute
        assert len(expectation_results) == 1
        expectation_result = expectation_results[0]
        assert expectation_result.success
        assert expectation_result.label == "file_handle_exists"
        path_in_metadata = expectation_result.metadata_entries[0].entry_data.path
        assert isinstance(path_in_metadata, str)
        assert os.path.exists(path_in_metadata)

        assert isinstance(solid_result.output_value(), LocalFileHandle)
        assert "some-key" in solid_result.output_value().path_desc
def test_string_resource():
    called = {}

    @solid
    def solid_test_string(context):
        assert context.resources.test_string == 'foo'
        called['yup'] = True

    pipeline = PipelineDefinition(
        name='test_string_resource',
        solids=[solid_test_string],
        context_definitions={
            'default': PipelineContextDefinition(
                resources={'test_string': ResourceDefinition.string_resource()}
            )
        },
    )

    result = execute_pipeline(
        pipeline, {'context': {'default': {'resources': {'test_string': {'config': 'foo'}}}}}
    )

    assert result.success
    assert called['yup']
def test_required_resource_with_required_subfield():
    pipeline_def = PipelineDefinition(
        name='some_pipeline',
        solid_defs=[],
        mode_defs=[
            ModeDefinition(
                resource_defs={
                    'with_required': ResourceDefinition(
                        resource_fn=lambda _: None,
                        config_schema={'required_field': String},
                    )
                }
            )
        ],
    )

    env_type = create_environment_type(pipeline_def)
    assert env_type.fields['solids'].is_required is False
    assert env_type.fields['execution'].is_required is False
    assert env_type.fields['resources'].is_required
    assert nested_field(env_type, 'resources', 'with_required').is_required
    assert nested_field(env_type, 'resources', 'with_required', 'config').is_required
    assert nested_field(
        env_type, 'resources', 'with_required', 'config', 'required_field'
    ).is_required
def test_whole_environment():
    pipeline_def = PipelineDefinition(
        name='some_pipeline',
        mode_defs=[
            ModeDefinition(
                name='test_mode',
                resource_defs={
                    'test_resource': ResourceDefinition(resource_fn=lambda: None, config=Any)
                },
            )
        ],
        solid_defs=[
            SolidDefinition(
                name='int_config_solid',
                config=Int,
                input_defs=[],
                output_defs=[],
                compute_fn=lambda *args: None,
            ),
            SolidDefinition(
                name='no_config_solid', input_defs=[], output_defs=[], compute_fn=lambda *args: None
            ),
        ],
    )

    env = EnvironmentConfig.build(
        pipeline_def,
        {
            'resources': {'test_resource': {'config': 1}},
            'solids': {'int_config_solid': {'config': 123}},
        },
    )

    assert isinstance(env, EnvironmentConfig)
    assert env.solids == {'int_config_solid': SolidConfig(123), 'no_config_solid': SolidConfig()}
    assert env.resources == {'test_resource': {'config': 1}}
def test_depends_on_adls2_resource_file_manager(storage_account, file_system):
    bar_bytes = 'bar'.encode()

    @solid(output_defs=[OutputDefinition(ADLS2FileHandle)])
    def emit_file(context):
        return context.file_manager.write_data(bar_bytes)

    @solid(input_defs=[InputDefinition('file_handle', ADLS2FileHandle)])
    def accept_file(context, file_handle):
        local_path = context.file_manager.copy_handle_to_local_temp(file_handle)
        assert isinstance(local_path, str)
        assert open(local_path, 'rb').read() == bar_bytes

    adls2_fake_resource = FakeADLS2Resource(storage_account)

    @pipeline(
        mode_defs=[
            ModeDefinition(
                system_storage_defs=adls2_plus_default_storage_defs,
                resource_defs={'adls2': ResourceDefinition.hardcoded_resource(adls2_fake_resource)},
            )
        ]
    )
    def adls2_file_manager_test():
        accept_file(emit_file())

    result = execute_pipeline(
        adls2_file_manager_test,
        environment_dict={'storage': {'adls2': {'config': {'adls2_file_system': file_system}}}},
    )

    assert result.success

    keys_in_bucket = set(adls2_fake_resource.adls2_client.file_systems[file_system].keys())

    for step_key, output_name in [
        ('emit_file.compute', 'result'),
        ('accept_file.compute', 'result'),
    ]:
        keys_in_bucket.remove(create_adls2_key(result.run_id, step_key, output_name))

    assert len(keys_in_bucket) == 1

    file_key = list(keys_in_bucket)[0]
    comps = file_key.split('/')

    assert '/'.join(comps[:-1]) == 'dagster/storage/{run_id}/files'.format(run_id=result.run_id)

    assert uuid.UUID(comps[-1])
def define_string_resource():
    return ResourceDefinition(
        config_schema=String, resource_fn=lambda init_context: init_context.resource_config
    )
from hca_orchestration.support.typing import HcaScratchDatasetName, MetadataType, MetadataTypeFanoutResult
from hca_orchestration.tests.support.gcs import FakeGCSClient, FakeGoogleBucket, HexBlobInfo

test_bucket = FakeGoogleBucket(
    {"gs://my-fake-bucket/fake-prefix": HexBlobInfo(hex_md5="b2d6ec45472467c836f253bd170182c7", content="test content")}
)

test_bucket_name = "my-fake-bucket"

load_table_test_mode = ModeDefinition(
    "test_load_table",
    resource_defs={**test_mode.resource_defs}
)
load_table_test_mode.resource_defs["gcs"] = ResourceDefinition.hardcoded_resource(
    FakeGCSClient(
        buckets={test_bucket_name: test_bucket}
    )
)

run_config = {
    "resources": {
        "scratch_config": {
            "config": {
                "scratch_bucket_name": test_bucket_name,
                "scratch_prefix_name": "prefix_name",
                "scratch_bq_project": "bq_project",
                "scratch_dataset_prefix": "dataset_prefix",
                "scratch_table_expiration_ms": 86400000
            }
        },
        "target_hca_dataset": {
    if inputs is None:
        inputs = []

    return SolidDefinition(
        name=name,
        transform_fn=_create_sql_alchemy_transform_fn(sql_text),
        inputs=inputs,
        outputs=[OutputDefinition()],
    )


InMemSqlLiteEngineResource = ResourceDefinition(
    resource_fn=lambda info: in_mem_engine(info.config['num_table']),
    config_field=Field(
        Dict({'num_table': Field(String, is_optional=True, default_value='num_table')})
    ),
)


def test_resource_format():
    sum_sql_text = '''CREATE TABLE sum_table AS SELECT num1, num2, num1 + num2 as sum FROM num_table'''

    sum_sq_sql_text = '''CREATE TABLE sum_sq_table AS SELECT num1, num2, sum, sum * sum as sum_sq FROM sum_table'''

    sum_sql_solid = create_sql_statement_solid('sum_sql_solid', sum_sql_text)

    sum_sq_sql_solid = create_sql_statement_solid(
def test_airline_demo_load_df():
    db_info_mock = DbInfo(
        engine=mock.MagicMock(),
        url="url",
        jdbc_url="url",
        dialect="dialect",
        load_table=mock.MagicMock(),
        host="host",
        db_name="db_name",
    )

    @solid(
        required_resource_keys={"pyspark"},
        output_defs=[OutputDefinition(io_manager_key="pyspark_io_manager")],
    )
    def emit_mock(context):
        return context.resources.pyspark.spark_session.read.csv(
            file_relative_path(__file__, "../data/test.csv")
        )

    @pipeline(
        mode_defs=[
            ModeDefinition(
                resource_defs={
                    "db_info": ResourceDefinition.hardcoded_resource(db_info_mock),
                    "pyspark": pyspark_resource,
                    "pyspark_step_launcher": no_step_launcher,
                    "pyspark_io_manager": local_parquet_io_manager,
                    "io_manager": fs_io_manager,
                }
            )
        ]
    )
    def load_df_test():
        load_data_to_database_from_spark(emit_mock())

    with tempfile.TemporaryDirectory() as temp_dir:
        solid_result = execute_pipeline(
            load_df_test,
            run_config={
                "solids": {"load_data_to_database_from_spark": {"config": {"table_name": "foo"}}},
                "resources": {
                    "io_manager": {"config": {"base_dir": temp_dir}},
                    "pyspark_io_manager": {"config": {"base_dir": temp_dir}},
                },
            },
        ).result_for_solid("load_data_to_database_from_spark")

        assert solid_result.success

        mats = solid_result.materializations_during_compute
        assert len(mats) == 1
        mat = mats[0]
        assert len(mat.metadata_entries) == 2
        entries = {me.label: me for me in mat.metadata_entries}
        assert entries["Host"].entry_data.text == "host"
        assert entries["Db"].entry_data.text == "db_name"
def test_cache_file_from_s3_overwrite():
    with get_temp_dir() as temp_dir:
        s3_session_one = mock.MagicMock()
        execute_solid(
            cache_file_from_s3,
            ModeDefinition(
                resource_defs={
                    'file_cache': fs_file_cache,
                    's3': ResourceDefinition.hardcoded_resource(s3_session_one),
                }
            ),
            environment_dict={
                'solids': {
                    'cache_file_from_s3': {
                        'inputs': {'s3_coordinate': {'bucket': 'some-bucket', 'key': 'some-key'}}
                    }
                },
                'resources': {
                    'file_cache': {'config': {'target_folder': temp_dir, 'overwrite': True}}
                },
            },
        )

        # assert the download occurred
        assert s3_session_one.download_file.call_count == 1

        s3_session_two = mock.MagicMock()
        execute_solid(
            cache_file_from_s3,
            ModeDefinition(
                resource_defs={
                    'file_cache': fs_file_cache,
                    's3': ResourceDefinition.hardcoded_resource(s3_session_two),
                }
            ),
            environment_dict={
                'solids': {
                    'cache_file_from_s3': {
                        'inputs': {'s3_coordinate': {'bucket': 'some-bucket', 'key': 'some-key'}}
                    }
                },
                'resources': {
                    'file_cache': {'config': {'target_folder': temp_dir, 'overwrite': True}}
                },
            },
        )

        # assert the download did not occur because the file is already there
        assert s3_session_two.download_file.call_count == 0
def define_errorable_resource():
    return ResourceDefinition(resource_fn=resource_init, config={'throw_on_resource_init': Bool})
def test_depends_on_s3_resource_file_manager():
    bar_bytes = "bar".encode()

    @solid(output_defs=[OutputDefinition(S3FileHandle)])
    def emit_file(context):
        return context.file_manager.write_data(bar_bytes)

    @solid(input_defs=[InputDefinition("file_handle", S3FileHandle)])
    def accept_file(context, file_handle):
        local_path = context.file_manager.copy_handle_to_local_temp(file_handle)
        assert isinstance(local_path, str)
        assert open(local_path, "rb").read() == bar_bytes

    # Uses mock S3
    s3 = boto3.client("s3")
    s3.create_bucket(Bucket="some-bucket")

    @pipeline(
        mode_defs=[
            ModeDefinition(
                system_storage_defs=s3_plus_default_storage_defs,
                resource_defs={"s3": ResourceDefinition.hardcoded_resource(s3)},
            )
        ]
    )
    def s3_file_manager_test():
        accept_file(emit_file())

    result = execute_pipeline(
        s3_file_manager_test,
        run_config={"storage": {"s3": {"config": {"s3_bucket": "some-bucket"}}}},
    )

    assert result.success

    keys_in_bucket = set([obj["Key"] for obj in s3.list_objects(Bucket="some-bucket")["Contents"]])

    for step_key, output_name in [
        ("emit_file.compute", "result"),
        ("accept_file.compute", "result"),
    ]:
        keys_in_bucket.remove(create_s3_key(result.run_id, step_key, output_name))

    assert len(keys_in_bucket) == 1

    file_key = list(keys_in_bucket)[0]
    comps = file_key.split("/")

    assert "/".join(comps[:-1]) == "dagster/storage/{run_id}/files".format(run_id=result.run_id)

    assert uuid.UUID(comps[-1])
""" Trains a collaborative filtering model that can recommend HN stories to users based on what stories they've commented on in the past. """ comment_stories = build_comment_stories() user_story_matrix = build_user_story_matrix(comment_stories) recommender_model = build_recommender_model(user_story_matrix) model_perf_notebook(recommender_model) build_component_top_stories(recommender_model, user_story_matrix) build_user_top_recommended_stories(recommender_model, user_story_matrix) story_recommender_prod_job = story_recommender.to_job(resource_defs={ **RESOURCES_PROD, **{ "partition_bounds": ResourceDefinition.none_resource() }, }) story_recommender_staging_job = story_recommender.to_job( resource_defs={ **RESOURCES_STAGING, **{ "partition_bounds": ResourceDefinition.none_resource() }, }) story_recommender_local_job = story_recommender.to_job( resource_defs={ **RESOURCES_LOCAL, **{
"org.apache.hadoop.fs.s3native.NativeS3FileSystem", "spark.hadoop.fs.s3.awsAccessKeyId": os.getenv("AWS_ACCESS_KEY_ID", ""), "spark.hadoop.fs.s3.awsSecretAccessKey": os.getenv("AWS_SECRET_ACCESS_KEY", ""), "spark.hadoop.fs.s3.buffer.dir": "/tmp", } }) snowflake_io_manager_prod = snowflake_io_manager.configured( {"database": "DEMO_DB_ASSETS"}) RESOURCES_PROD = { "s3_bucket": ResourceDefinition.hardcoded_resource("hackernews-elementl-prod"), "io_manager": common_bucket_s3_pickle_io_manager, "s3": s3_resource, "parquet_io_manager": s3_partitioned_parquet_io_manager, "warehouse_io_manager": snowflake_io_manager_prod, "pyspark": configured_pyspark, "warehouse_loader": snowflake_io_manager_prod, } snowflake_io_manager_staging = snowflake_io_manager.configured( {"database": "DEMO_DB_ASSETS_STAGING"}) RESOURCES_STAGING = { "s3_bucket": ResourceDefinition.hardcoded_resource("hackernews-elementl-dev"), "io_manager": common_bucket_s3_pickle_io_manager,
@schedule(job=event_tables, cron_schedule="0 0 * * *")
def event_tables_schedule(_):
    return {}


@graph
def event_reports():
    make_event_reports = make_solid("make_event_reports", required_resource_keys={"mode"})
    make_event_reports()


@sensor(job=event_reports.to_job(resource_defs={"mode": ResourceDefinition.none_resource()}))
def event_reports_sensor():
    pass


event_reports_dev = event_reports.to_job(resource_defs={"mode": ResourceDefinition.none_resource()})


@graph
def crm_ingest():
    """A graph with multiple production jobs"""
    ingest_users = make_solid("ingest_users", required_resource_keys={"crm"})
    ingest_interactions = make_solid("ingest_interactions", required_resource_keys={"crm"})
    raw_events = make_raw_events()
    clean_events(raw_events)


@schedule(job=event_tables, cron_schedule="0 0 * * *")
def event_tables_schedule(_):
    return {}


@graph
def event_reports():
    make_event_reports = make_solid("make_event_reports", required_resource_keys={"mode"})
    make_event_reports()


@sensor(job=event_reports.to_job(resource_defs={"mode": ResourceDefinition.none_resource()}))
def event_reports_sensor():
    pass


event_reports_dev = event_reports.to_job(resource_defs={"mode": ResourceDefinition.none_resource()})


@graph
def crm_ingest():
    """A graph with multiple production jobs"""
    ingest_users = make_solid("ingest_users", required_resource_keys={"crm"})
    ingest_interactions = make_solid("ingest_interactions", required_resource_keys={"crm"})

    ingest_users()
    ingest_interactions()
def define_tempfile_resource():
    return ResourceDefinition(resource_fn=_tempfile_resource_fn)
def test_get_out_of_pipeline_context():
    context = dagstermill.get_context(
        mode_def=ModeDefinition(resource_defs={"list": ResourceDefinition(lambda _: [])})
    )

    assert context.pipeline_name == "ephemeral_dagstermill_pipeline"
    assert context.resources.list == []
def define_lambda_resource(func, *args, **kwargs):
    return ResourceDefinition(lambda _info: func(*args, **kwargs))
@solid
def a(_):
    pass


@solid
def b(_):
    raise Exception()


mode_defs = [
    ModeDefinition(
        'dev',
        resource_defs={
            'slack': ResourceDefinition.hardcoded_resource(
                slack_resource_mock, 'do not send messages in dev'
            )
        },
    ),
    ModeDefinition('prod', resource_defs={'slack': slack_resource}),
]


@slack_on_failure
@pipeline(mode_defs=mode_defs)
def notif_all():
    # the hook "slack_on_failure" is applied on every solid instance within this pipeline
    a()
    b()
def define_value_resource(value):
    return ResourceDefinition(lambda _info: value)
def a(_):
    pass


@solid
def b(_):
    raise Exception()


# start_repo_marker_3
mode_defs = [
    ModeDefinition(
        "dev",
        resource_defs={
            "slack": ResourceDefinition.hardcoded_resource(
                slack_resource_mock, "do not send messages in dev"
            )
        },
    ),
    ModeDefinition("prod", resource_defs={"slack": slack_resource}),
]
# end_repo_marker_3


# start_repo_marker_1
@slack_message_on_failure
@pipeline(mode_defs=mode_defs)
def notif_all():
    # the hook "slack_message_on_failure" is applied on every solid instance within this pipeline
    a()
    b()
def define_string_resource():
    return ResourceDefinition(resource_fn=lambda info: info.config, config_field=Field(String))
def define_string_resource():
    return ResourceDefinition(
        config_field=Field(String),
        resource_fn=lambda init_context: init_context.resource_config,
    )
def dummy_resource(config_field):
    return ResourceDefinition(lambda: None, config_field)
def test_depends_on_adls2_resource_intermediates(storage_account, file_system):
    @solid(
        input_defs=[InputDefinition('num_one', Int), InputDefinition('num_two', Int)],
        output_defs=[OutputDefinition(Int)],
    )
    def add_numbers(_, num_one, num_two):
        return num_one + num_two

    adls2_fake_resource = FakeADLS2Resource(storage_account)

    @pipeline(
        mode_defs=[
            ModeDefinition(
                system_storage_defs=adls2_plus_default_storage_defs,
                resource_defs={'adls2': ResourceDefinition.hardcoded_resource(adls2_fake_resource)},
            )
        ]
    )
    def adls2_internal_pipeline():
        return add_numbers()

    result = execute_pipeline(
        adls2_internal_pipeline,
        environment_dict={
            'solids': {
                'add_numbers': {'inputs': {'num_one': {'value': 2}, 'num_two': {'value': 4}}}
            },
            'storage': {'adls2': {'config': {'adls2_file_system': file_system}}},
        },
    )

    assert result.success

    assert result.result_for_solid('add_numbers').output_value() == 6

    assert file_system in adls2_fake_resource.adls2_client.file_systems
    keys = set()
    for step_key, output_name in [('add_numbers.compute', 'result')]:
        keys.add(create_adls2_key(result.run_id, step_key, output_name))

    assert set(adls2_fake_resource.adls2_client.file_systems[file_system].keys()) == keys
# start_resource_example
class ExternalCerealFetcher:
    def fetch_new_cereals(self, start_ts, end_ts):
        pass


@resource
def cereal_fetcher(init_context):
    return ExternalCerealFetcher()


# end_resource_example

resource_a = ResourceDefinition.hardcoded_resource(1)
resource_b = ResourceDefinition.hardcoded_resource(2)

# start_mode_example
mode_def_ab = ModeDefinition(
    "ab_mode",
    resource_defs={
        "a": resource_a,
        "b": resource_b,
    },
)
# end_mode_example

mode_def_c = ModeDefinition("c_mode", resource_defs={"a": resource_a})