def test_multi_asset_asset_materialization_planned_events():
    @multi_asset(
        outs={
            "my_out_name": Out(asset_key=AssetKey("my_asset_name")),
            "my_other_out_name": Out(asset_key=AssetKey("my_other_asset")),
        }
    )
    def my_asset():
        yield Output(1, "my_out_name")
        yield Output(2, "my_other_out_name")

    assets_job = build_assets_job("assets_job", [my_asset])

    with instance_for_test() as instance:
        result = assets_job.execute_in_process(instance=instance)
        records = instance.get_event_records(
            EventRecordsFilter(
                DagsterEventType.ASSET_MATERIALIZATION_PLANNED, AssetKey("my_asset_name")
            )
        )
        assert result.run_id == records[0].event_log_entry.run_id

        run_id = result.run_id
        assert instance.run_ids_for_asset_key(AssetKey("my_asset_name")) == [run_id]
        assert instance.run_ids_for_asset_key(AssetKey("my_other_asset")) == [run_id]
def test_io_manager():
    df_value = pandas.DataFrame({"foo": ["bar", "baz"], "quux": [1, 2]})

    @asset(partitions_def=hourly_partitions)
    def pandas_df_asset():
        return df_value

    @asset(partitions_def=hourly_partitions)
    def spark_input_asset(pandas_df_asset: SparkDF):
        assert isinstance(pandas_df_asset, SparkDF)
        assert pandas_df_asset.count() == 2
        assert set(pandas_df_asset.columns) == {"foo", "quux"}
        return pandas_df_asset

    with tempfile.TemporaryDirectory() as temp_dir:
        io_manager_test_job = build_assets_job(
            "io_manager_test_job",
            assets=[pandas_df_asset, spark_input_asset],
            resource_defs={
                "pyspark": pyspark_resource,
                "io_manager": local_partitioned_parquet_io_manager.configured(
                    {"base_path": temp_dir}
                ),
            },
        )
        expected_path = os.path.join(
            temp_dir, "pandas_df_asset-20220101160000_20220101170000.pq"
        )

        res = io_manager_test_job.execute_in_process(partition_key="2022-01-01-16:00")

        assert res.success
        assert os.path.exists(expected_path)
        intermediate_df = pandas.read_parquet(expected_path)
        assert all(intermediate_df == df_value)
def test_asset_materialization_planned_event_yielded():
    @asset
    def asset_one():
        raise Exception("foo")

    @asset
    def never_runs_asset(asset_one):
        return asset_one

    asset_job = build_assets_job("asset_job", [asset_one, never_runs_asset])

    with instance_for_test() as instance:
        # test with only one asset selected
        result = asset_job.execute_in_process(
            instance=instance, raise_on_error=False, op_selection=["asset_one"]
        )
        run_id = result.run_id

        assert instance.run_ids_for_asset_key(AssetKey("asset_one")) == [run_id]
        assert instance.run_ids_for_asset_key(AssetKey("never_runs_asset")) == []

    with instance_for_test() as instance:  # fresh event log storage
        # test with both assets selected
        result = asset_job.execute_in_process(instance=instance, raise_on_error=False)
        run_id = result.run_id

        assert instance.run_ids_for_asset_key(AssetKey("asset_one")) == [run_id]
        assert instance.run_ids_for_asset_key(AssetKey("never_runs_asset")) == [run_id]
def test_download():
    with tempfile.TemporaryDirectory() as temp_dir:
        test_job = build_assets_job(
            "test_job",
            assets=ASSETS,
            resource_defs={
                "io_manager": fs_io_manager,
                "partition_start": ResourceDefinition.string_resource(),
                "partition_end": ResourceDefinition.string_resource(),
                "parquet_io_manager": local_partitioned_parquet_io_manager.configured(
                    {"base_path": temp_dir}
                ),
                "warehouse_io_manager": mem_io_manager,
                "pyspark": pyspark_resource,
                "hn_client": hn_snapshot_client,
            },
        )
        result = test_job.execute_in_process(partition_key="2020-12-30-00:00")
        assert result.success
def define_assets_job():
    @asset
    def asset1():
        return 1

    @asset
    def asset2(asset1):
        return asset1 + 1

    return build_assets_job(
        name="assets",
        assets=[asset1, asset2],
        resource_defs={
            "io_manager": s3_pickle_asset_io_manager,
            "s3": s3_test_resource,
        },
    )
def define_assets_job(bucket):
    @asset
    def asset1():
        return 1

    @asset
    def asset2(asset1):
        return asset1 + 1

    @asset(partitions_def=StaticPartitionsDefinition(["apple", "orange"]))
    def partitioned():
        return 8

    return build_assets_job(
        name="assets",
        assets=[asset1, asset2, partitioned],
        resource_defs={
            "io_manager": s3_pickle_asset_io_manager.configured({"s3_bucket": bucket}),
            "s3": s3_test_resource,
        },
    )
def test_io_manager_single_partition_add_input_metadata():
    partitions_def = StaticPartitionsDefinition(["a", "b", "c"])

    @asset(partitions_def=partitions_def)
    def asset_1():
        return 1

    @asset(partitions_def=partitions_def)
    def asset_2(asset_1):
        return asset_1 + 1

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            pass

        def load_input(self, context):
            context.add_input_metadata(metadata={"foo": "bar"}, description="hello world")
            return 1

    @io_manager
    def my_io_manager(_):
        return MyIOManager()

    assets_job = build_assets_job(
        "assets_job", [asset_1, asset_2], resource_defs={"io_manager": my_io_manager}
    )

    result = assets_job.execute_in_process(partition_key="a")

    get_observation = lambda event: event.event_specific_data.asset_observation
    observations = [
        event
        for event in result.all_node_events
        if event.event_type_value == "ASSET_OBSERVATION"
    ]

    assert observations[0].step_key == "asset_2"
    assert get_observation(observations[0]) == AssetObservation(
        asset_key="asset_1",
        metadata={"foo": "bar"},
        description="hello world",
        partition="a",
    )
class LocalFileSystemIOManager(IOManager):
    """Translates between Pandas DataFrames and CSVs on the local filesystem."""

    def _get_fs_path(self, asset_key: AssetKey) -> str:
        rpath = os.path.join(*asset_key.path) + ".csv"
        return os.path.abspath(rpath)

    def handle_output(self, context, obj: DataFrame):
        """This saves the dataframe as a CSV."""
        fpath = self._get_fs_path(context.asset_key)
        obj.to_csv(fpath)

    def load_input(self, context):
        """This reads a dataframe from a CSV."""
        fpath = self._get_fs_path(context.asset_key)
        return pd.read_csv(fpath)


# io_manager_end

# build_assets_job_start
weather_job = build_assets_job(
    "weather",
    assets=[daily_temperature_highs, hottest_dates],
    source_assets=[sfo_q2_weather_sample],
    resource_defs={
        "io_manager": IOManagerDefinition.hardcoded_io_manager(LocalFileSystemIOManager())
    },
)
# build_assets_job_end
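
# Usage sketch (not part of the original snippet): a job built this way can be run
# in-process. This assumes the daily_temperature_highs, hottest_dates, and
# sfo_q2_weather_sample definitions referenced above exist, and that the source
# asset's CSV is already present at the path the IO manager resolves for it.
result = weather_job.execute_in_process()
assert result.success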
import random
from typing import Sequence

from dagster import AssetKey, asset, build_assets_job

N_ASSETS = 1000


def generate_big_honkin_assets() -> Sequence:
    random.seed(5438790)
    assets = []

    for i in range(N_ASSETS):
        non_argument_deps = {
            AssetKey(f"asset_{j}")
            for j in random.sample(range(i), min(i, random.randint(0, 3)))
        }

        @asset(name=f"asset_{i}", non_argument_deps=non_argument_deps)
        def some_asset():
            pass

        assets.append(some_asset)

    return assets


big_honkin_assets_job = build_assets_job("big_honkin_assets_job", generate_big_honkin_assets())
def test_assets(schema_prefix):
    ab_resource = airbyte_resource(
        build_init_resource_context(
            config={
                "host": "some_host",
                "port": "8000",
            }
        )
    )
    destination_tables = ["foo", "bar"]
    if schema_prefix:
        destination_tables = [schema_prefix + t for t in destination_tables]
    ab_assets = build_airbyte_assets(
        "12345",
        destination_tables=destination_tables,
        asset_key_prefix=["some", "prefix"],
    )

    assert ab_assets[0].asset_keys == {
        AssetKey(["some", "prefix", t]) for t in destination_tables
    }
    assert len(ab_assets[0].op.output_defs) == 2

    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/connections/get",
        json=get_sample_connection_json(prefix=schema_prefix),
        status=200,
    )
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/connections/sync",
        json={"job": {"id": 1}},
        status=200,
    )
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/jobs/get",
        json=get_sample_job_json(schema_prefix=schema_prefix),
        status=200,
    )

    ab_job = build_assets_job(
        "ab_job",
        ab_assets,
        resource_defs={
            "airbyte": airbyte_resource.configured(
                {
                    "host": "some_host",
                    "port": "8000",
                }
            )
        },
    )

    res = ab_job.execute_in_process()

    materializations = [
        event.event_specific_data.materialization
        for event in res.events_for_node("airbyte_sync_12345")
        if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 3
    assert {m.asset_key for m in materializations} == {
        AssetKey(["some", "prefix", schema_prefix + "foo"]),
        AssetKey(["some", "prefix", schema_prefix + "bar"]),
        AssetKey(["some", "prefix", schema_prefix + "baz"]),
    }
    assert MetadataEntry("bytesEmitted", value=1234) in materializations[0].metadata_entries
    assert MetadataEntry("recordsCommitted", value=4321) in materializations[0].metadata_entries
    assert (
        MetadataEntry(
            "schema",
            value=TableSchema(
                columns=[
                    TableColumn(name="a", type="str"),
                    TableColumn(name="b", type="int"),
                ]
            ),
        )
        in materializations[0].metadata_entries
    )
def test_assets():
    ab_resource = airbyte_resource(
        build_init_resource_context(
            config={
                "host": "some_host",
                "port": "8000",
            }
        )
    )
    ab_assets = build_airbyte_assets("12345", ["foo", "bar"], asset_key_prefix=["some", "prefix"])

    assert len(ab_assets[0].op.output_defs) == 2

    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/connections/get",
        json={
            "name": "xyz",
            "syncCatalog": {
                "streams": [
                    {
                        "stream": {
                            "name": "foo",
                            "jsonSchema": {
                                "properties": {"a": {"type": "str"}, "b": {"type": "int"}}
                            },
                        },
                        "config": {"selected": True},
                    },
                    {
                        "stream": {
                            "name": "bar",
                            "jsonSchema": {"properties": {"c": {"type": "str"}}},
                        },
                        "config": {"selected": True},
                    },
                    {
                        "stream": {
                            "name": "baz",
                            "jsonSchema": {"properties": {"d": {"type": "str"}}},
                        },
                        "config": {"selected": True},
                    },
                ]
            },
        },
        status=200,
    )
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/connections/sync",
        json={"job": {"id": 1}},
        status=200,
    )
    responses.add(
        method=responses.POST,
        url=ab_resource.api_base_url + "/jobs/get",
        json={
            "job": {"id": 1, "status": AirbyteState.SUCCEEDED},
            "attempts": [
                {
                    "attempt": {
                        "streamStats": [
                            {
                                "streamName": "foo",
                                "stats": {
                                    "bytesEmitted": 1234,
                                    "recordsCommitted": 4321,
                                },
                            },
                            {
                                "streamName": "bar",
                                "stats": {
                                    "bytesEmitted": 1234,
                                    "recordsCommitted": 4321,
                                },
                            },
                            {
                                "streamName": "baz",
                                "stats": {
                                    "bytesEmitted": 1111,
                                    "recordsCommitted": 1111,
                                },
                            },
                        ]
                    }
                }
            ],
        },
        status=200,
    )

    ab_job = build_assets_job(
        "ab_job",
        ab_assets,
        resource_defs={
            "airbyte": airbyte_resource.configured(
                {
                    "host": "some_host",
                    "port": "8000",
                }
            )
        },
    )

    res = ab_job.execute_in_process()

    materializations = [
        event
        for event in res.events_for_node("airbyte_sync_12345")
        if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 3
    assert (
        MetadataEntry.text("a,b", "columns")
        in materializations[0].event_specific_data.materialization.metadata_entries
    )
    assert (
        MetadataEntry.int(1234, "bytesEmitted")
        in materializations[0].event_specific_data.materialization.metadata_entries
    )
    assert (
        MetadataEntry.int(4321, "recordsCommitted")
        in materializations[0].event_specific_data.materialization.metadata_entries
    )
def test_fivetran_asset_run(tables, should_error):
    ft_resource = fivetran_resource.configured({"api_key": "foo", "api_secret": "bar"})
    final_data = {"succeeded_at": "2021-01-01T02:00:00.0Z"}
    api_prefix = f"{FIVETRAN_API_BASE}/{FIVETRAN_CONNECTOR_PATH}{DEFAULT_CONNECTOR_ID}"

    fivetran_assets = build_fivetran_assets(
        connector_id=DEFAULT_CONNECTOR_ID,
        destination_tables=tables,
        poll_interval=0.1,
        poll_timeout=10,
    )
    # expect the multi asset to have one asset key and one output for each specified asset key
    assert fivetran_assets[0].asset_keys == {AssetKey(table.split(".")) for table in tables}
    assert len(fivetran_assets[0].op.output_defs) == len(tables)

    fivetran_assets_job = build_assets_job(
        name="fivetran_assets_job",
        assets=fivetran_assets,
        resource_defs={"fivetran": ft_resource},
    )

    with responses.RequestsMock() as rsps:
        rsps.add(rsps.PATCH, api_prefix, json=get_sample_update_response())
        rsps.add(rsps.POST, f"{api_prefix}/force", json=get_sample_sync_response())
        # connector schema
        rsps.add(
            rsps.GET,
            f"{api_prefix}/schemas",
            json=get_sample_connector_schema_config(
                tables=[
                    ("schema1", "tracked"),
                    ("schema1", "untracked"),
                    ("schema2", "tracked"),
                ]
            ),
        )
        # initial state
        rsps.add(rsps.GET, api_prefix, json=get_sample_connector_response())
        # final state will be updated
        rsps.add(rsps.GET, api_prefix, json=get_sample_connector_response(data=final_data))

        if should_error:
            with pytest.raises(DagsterStepOutputNotFoundError):
                fivetran_assets_job.execute_in_process()
        else:
            result = fivetran_assets_job.execute_in_process()
            assert result.success

            # make sure we only have outputs for the explicit asset keys
            outputs = [
                event
                for event in result.events_for_node(f"fivetran_sync_{DEFAULT_CONNECTOR_ID}")
                if event.event_type_value == "STEP_OUTPUT"
            ]
            assert len(outputs) == len(tables)

            # make sure we have asset materializations for all the schemas/tables that were actually sync'd
            asset_materializations = [
                event
                for event in result.events_for_node(f"fivetran_sync_{DEFAULT_CONNECTOR_ID}")
                if event.event_type_value == "ASSET_MATERIALIZATION"
            ]
            assert len(asset_materializations) == 3
            found_asset_keys = set(
                mat.event_specific_data.materialization.asset_key
                for mat in asset_materializations
            )
            assert found_asset_keys == {
                AssetKey(["schema1", "tracked"]),
                AssetKey(["schema1", "untracked"]),
                AssetKey(["schema2", "tracked"]),
            }
"memory": "2Gi" }, } }, } } ASSETS = [id_range_for_time, items, comments, stories] download_prod_job = build_assets_job( "hacker_news_api_download", assets=ASSETS, resource_defs={ **{ "hn_client": hn_api_subsample_client.configured({ "sample_rate": 10 }) }, **RESOURCES_PROD, }, tags=DOWNLOAD_TAGS, ) download_staging_job = build_assets_job( "hacker_news_api_download", assets=ASSETS, resource_defs={ **{ "hn_client": hn_api_subsample_client.configured({ "sample_rate": 10 })
"Rows": num_rows[0] } # this list has one element per dbt model assets = load_assets_from_dbt_manifest( json.load(open(os.path.join(DBT_PROJECT_DIR, "target", "manifest.json"))), runtime_metadata_fn=asset_metadata, io_manager_key="warehouse_io_manager", ) activity_stats_staging_job = build_assets_job( "activity_stats", assets, [], resource_defs={ **RESOURCES_STAGING, **{ "dbt": dbt_prod_resource } }, ) activity_stats_prod_job = build_assets_job( "activity_stats", assets, [], resource_defs={ **RESOURCES_PROD, **{ "dbt": dbt_prod_resource }
from hacker_news_assets.assets.comment_stories import comment_stories
from hacker_news_assets.assets.items import comments, stories
from hacker_news_assets.assets.recommender_model import component_top_stories, recommender_model
from hacker_news_assets.assets.user_story_matrix import user_story_matrix
from hacker_news_assets.assets.user_top_recommended_stories import user_top_recommended_stories
from hacker_news_assets.resources import RESOURCES_PROD, RESOURCES_STAGING

from dagster import build_assets_job

assets = [
    comment_stories,
    user_story_matrix,
    recommender_model,
    component_top_stories,
    user_top_recommended_stories,
]

source_assets = [comments, stories]

story_recommender_prod_job = build_assets_job(
    "story_recommender",
    assets=assets,
    source_assets=source_assets,
    resource_defs=RESOURCES_PROD,
)

story_recommender_staging_job = build_assets_job(
    "story_recommender",
    assets=assets,
    source_assets=source_assets,
    resource_defs=RESOURCES_STAGING,
)
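
# Hypothetical sketch (not in the original source): jobs like the two above are typically
# collected into a repository so that Dagit and the daemon can load them. The repository
# name below is an assumed placeholder.
from dagster import repository


@repository
def story_recommender_repo():
    return [story_recommender_prod_job, story_recommender_staging_job]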
# pylint: disable=redefined-outer-name
from dagster import AssetIn, asset, build_assets_job

namespace1 = ["s3", "superdomain_1", "subdomain_1", "subsubdomain_1"]


@asset(namespace=namespace1)
def asset1():
    pass


@asset(
    namespace=["s3", "superdomain_2", "subdomain_2", "subsubdomain_2"],
    ins={"asset1": AssetIn(namespace=namespace1)},
)
def asset2(asset1):
    assert asset1 is None


long_asset_keys_job = build_assets_job("long_asset_keys_job", assets=[asset1, asset2])
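
# Usage sketch (not part of the original snippet): executing the job above in-process.
# Both assets effectively return None, so asset2's assertion checks the value loaded
# back through the job's default IO manager.
result = long_asset_keys_job.execute_in_process()
assert result.success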