def test_materialized_assets():
    instance = DagsterInstance.ephemeral()
    res = execute_pipeline(materialization_pipeline, instance=instance)
    assert res.success
    asset_keys = instance.all_asset_keys()
    assert len(asset_keys) == 1
    assert asset_keys[0] == AssetKey(["dashboards", "analytics_dashboard"])
def save(self, key, df):
    path = os.path.join(self.root_dir, key)
    df.to_parquet(path)
    return AssetMaterialization(
        asset_key=AssetKey(["local_metastore", key]),
        metadata_entries=[EventMetadataEntry.path(path, "on_disk")],
    )
def test_asset_materialization(conn_string):
    event_log_storage = PostgresEventLogStorage.create_clean_storage(conn_string)
    asset_key = AssetKey(["path", "to", "asset_one"])

    @solid
    def materialize_one(_):
        yield AssetMaterialization(
            asset_key=asset_key,
            metadata_entries=[
                EventMetadataEntry.text("hello", "text"),
                EventMetadataEntry.json({"hello": "world"}, "json"),
                EventMetadataEntry.float(1.0, "one"),
            ],
        )
        yield Output(1)

    def _solids():
        materialize_one()

    events_one, _ = synthesize_events(_solids)
    for event in events_one:
        event_log_storage.store_event(event)

    assert asset_key in set(event_log_storage.get_all_asset_keys())
    events = event_log_storage.get_asset_events(asset_key)
    assert len(events) == 1
    event = events[0]
    assert isinstance(event, DagsterEventRecord)
    assert event.dagster_event.event_type_value == DagsterEventType.STEP_MATERIALIZATION.value
def test_multi_asset_with_compute_kind():
    @multi_asset(outs={"o1": Out(asset_key=AssetKey("o1"))}, compute_kind="sql")
    def my_asset(arg1):
        return arg1

    assert my_asset.op.tags == {"kind": "sql"}
def migrate_asset_key_data(event_log_storage, print_fn=lambda _: None):
    """
    Utility method to build an asset key index from the data in existing event log records.

    Takes in event_log_storage, and a print_fn to keep track of progress.
    """
    from dagster.core.storage.event_log.sql_event_log import AssetAwareSqlEventLogStorage

    from .schema import AssetKeyTable, SqlEventLogStorageTable

    if not isinstance(event_log_storage, AssetAwareSqlEventLogStorage):
        return

    query = (
        db.select([SqlEventLogStorageTable.c.asset_key])
        .where(SqlEventLogStorageTable.c.asset_key != None)
        .group_by(SqlEventLogStorageTable.c.asset_key)
    )
    with event_log_storage.connect() as conn:
        print_fn("Querying event logs.")
        to_insert = conn.execute(query).fetchall()
        print_fn("Found {} records to index".format(len(to_insert)))
        for (asset_key,) in tqdm(to_insert):
            try:
                conn.execute(
                    AssetKeyTable.insert().values(  # pylint: disable=no-value-for-parameter
                        asset_key=AssetKey.from_db_string(asset_key).to_string()
                    )
                )
            except db.exc.IntegrityError:
                # asset key already present
                pass
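# A minimal usage sketch for migrate_asset_key_data above, assuming a DagsterInstance
# backed by a SQL event log storage; the instance lookup and the use of `print` as the
# progress callback are illustrative, not taken from the original snippet.
from dagster import DagsterInstance

instance = DagsterInstance.get()
migrate_asset_key_data(instance.event_log_storage, print_fn=print)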
def test_backcompat_get_asset_records():
    src_dir = file_relative_path(__file__, "compat_tests/snapshot_0_11_0_asset_materialization")
    # should contain materialization events for asset keys a, b, c, d, e, f
    # events a and b have been wiped, but b has been rematerialized

    def _validate_materialization(asset_key, event, expected_tags):
        assert isinstance(event, EventLogEntry)
        assert event.dagster_event
        assert event.dagster_event.is_step_materialization
        assert event.dagster_event.step_materialization_data.materialization.asset_key == asset_key
        assert event.dagster_event.step_materialization_data.materialization.tags == expected_tags

    b = AssetKey("b")

    with copy_directory(src_dir) as test_dir:
        with DagsterInstance.from_ref(InstanceRef.from_dir(test_dir)) as instance:
            storage = instance.event_log_storage
            records = storage.get_asset_records([b])
            asset_entry = records[0].asset_entry
            assert asset_entry.asset_key == b
            _validate_materialization(b, asset_entry.last_materialization, expected_tags={})
def build_for_materialization(materialization):
    class DummyIOManager(IOManager):
        def __init__(self):
            self.values = {}

        def handle_output(self, context, obj):
            keys = tuple(context.get_output_identifier())
            self.values[keys] = obj

            context.add_output_metadata({"foo": "bar"})
            yield MetadataEntry("baz", value="baz")
            context.add_output_metadata({"bar": "bar"})
            yield materialization

        def load_input(self, context):
            keys = tuple(context.upstream_output.get_output_identifier())
            return self.values[keys]

    @op(out=Out(asset_key=AssetKey("key_on_out")))
    def the_op():
        return 5

    @graph
    def the_graph():
        the_op()

    return the_graph.execute_in_process(resources={"io_manager": DummyIOManager()})
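# A hedged usage sketch for build_for_materialization above: any AssetMaterialization can
# be passed through and will be yielded from handle_output; the asset key shown here is an
# illustrative placeholder.
result = build_for_materialization(
    AssetMaterialization(asset_key=AssetKey(["illustrative", "asset"]))
)
assert result.success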
def test_source_asset():
    @asset
    def asset1(source1):
        assert source1 == 5
        return 1

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            pass

        def load_input(self, context):
            assert context.resource_config["a"] == 7
            assert context.resources.subresource == 9
            assert context.upstream_output.resources.subresource == 9
            return 5

    @io_manager(config_schema={"a": int}, required_resource_keys={"subresource"})
    def my_io_manager(_):
        return MyIOManager()

    job = build_assets_job(
        "a",
        [asset1],
        source_assets=[SourceAsset(AssetKey("source1"), io_manager_key="special_io_manager")],
        resource_defs={
            "special_io_manager": my_io_manager.configured({"a": 7}),
            "subresource": ResourceDefinition.hardcoded_resource(9),
        },
    )
    assert job.graph.node_defs == [asset1.op]
    assert job.execute_in_process().success
def test_asset_group_source_asset():
    foo_fa = SourceAsset(key=AssetKey("foo"), io_manager_key="the_manager")

    @asset
    def asset_depends_on_source(foo):
        return foo

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            pass

        def load_input(self, context):
            return 5

    @io_manager
    def the_manager():
        return MyIOManager()

    group = AssetGroup(
        assets=[asset_depends_on_source],
        source_assets=[foo_fa],
        resource_defs={"the_manager": the_manager},
    )

    @repository
    def the_repo():
        return [group]

    asset_group_underlying_job = the_repo.get_all_jobs()[0]
    assert asset_group_underlying_job.name == group.all_assets_job_name

    result = asset_group_underlying_job.execute_in_process()
    assert result.success
def test_asset_group_from_list():
    @asset
    def asset_foo():
        return "foo"

    @asset
    def asset_bar():
        return "bar"

    @asset(ins={"asset_bar": AssetIn(asset_key=AssetKey("asset_foo"))})
    def last_asset(asset_bar):
        return asset_bar

    group = AssetGroup(assets=[asset_foo, asset_bar, last_asset])

    @repository
    def the_repo():
        return [group]

    assert len(the_repo.get_all_jobs()) == 1
    asset_group_underlying_job = the_repo.get_all_jobs()[0]
    assert asset_group_underlying_job.name == group.all_assets_job_name

    result = asset_group_underlying_job.execute_in_process()
    assert result.success
def toy_asset_sensor(context):
    events = context.instance.events_for_asset_key(
        AssetKey(["model"]), after_cursor=context.cursor, ascending=False, limit=1
    )

    if not events:
        return

    record_id, event = events[0]  # take the most recent materialization
    from_pipeline = event.pipeline_name

    yield RunRequest(
        run_key=str(record_id),
        run_config={
            "solids": {
                "read_materialization": {
                    "config": {"asset_key": ["model"], "pipeline": from_pipeline}
                }
            }
        },
    )

    context.update_cursor(str(record_id))
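# A hedged sketch of turning the sensor body above into a SensorDefinition; the @sensor
# decorator is standard Dagster API of this era, but the target pipeline name
# "read_materialization_pipeline" is an assumed placeholder.
from dagster import sensor

model_materialization_sensor = sensor(pipeline_name="read_materialization_pipeline")(
    toy_asset_sensor
)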
def get_asset_materializations(self, dbt_output: DbtOutput) -> List[AssetMaterialization]:
    ret = []

    # dbt_output.result contains the parsed contents of the results.json file
    # Note that the json schema can change from version to version. This is written for
    # https://schemas.getdbt.com/dbt/run-results/v2.json (also will work with v1.json)
    for result in dbt_output.result["results"]:
        if result["status"] != "success":
            continue
        unique_id = result["unique_id"]

        # Here, we choose a naming scheme for our asset keys that will look something like
        # <asset prefix> / model / <dbt project> / <model name>, but this is pretty arbitrary
        asset_key = AssetKey(self._asset_key_prefix + unique_id.split("."))

        # create an AssetMaterialization with our key and metadata
        ret.append(
            AssetMaterialization(
                description=f"dbt node: {unique_id}",
                metadata_entries=self._get_metadata(result),
                asset_key=asset_key,
            )
        )

    return ret
def handle_output(self, context, obj):
    file_path = os.path.join("my_base_dir", context.step_key, context.name)
    obj.to_csv(file_path)

    yield AssetMaterialization(
        asset_key=AssetKey(file_path), description="Persisted result to storage."
    )
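# A hedged sketch of an IOManager that the handle_output above could belong to; the class
# name, the load_input implementation, and the use of pandas.read_csv are assumptions for
# illustration, not part of the original snippet.
import os

import pandas as pd
from dagster import AssetKey, AssetMaterialization, IOManager, io_manager


class PandasCsvIOManager(IOManager):
    def handle_output(self, context, obj):
        # same path convention and materialization event as the method above
        file_path = os.path.join("my_base_dir", context.step_key, context.name)
        obj.to_csv(file_path)

        yield AssetMaterialization(
            asset_key=AssetKey(file_path), description="Persisted result to storage."
        )

    def load_input(self, context):
        # mirror the path convention used when the upstream output was written
        file_path = os.path.join(
            "my_base_dir", context.upstream_output.step_key, context.upstream_output.name
        )
        return pd.read_csv(file_path)


@io_manager
def pandas_csv_io_manager(_):
    return PandasCsvIOManager()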
def test_asset_materialization(self, storage):
    asset_key = AssetKey(["path", "to", "asset_one"])

    @solid
    def materialize_one(_):
        yield AssetMaterialization(
            asset_key=asset_key,
            metadata={
                "text": "hello",
                "json": {"hello": "world"},
                "one_float": 1.0,
                "one_int": 1,
            },
        )
        yield Output(1)

    def _solids():
        materialize_one()

    events_one, _ = _synthesize_events(_solids)
    for event in events_one:
        storage.store_event(event)

    assert asset_key in set(storage.all_asset_keys())
    events = storage.get_asset_events(asset_key)
    assert len(events) == 1
    event = events[0]
    assert isinstance(event, EventRecord)
    assert event.dagster_event.event_type_value == DagsterEventType.ASSET_MATERIALIZATION.value
def test_multiple_definition_fails():
    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            # store asset
            return

        def load_input(self, context):
            return None

        def get_output_asset_key(self, context):
            return AssetKey([context.step_key, context.name])

    @io_manager
    def my_io_manager(_):
        return MyIOManager()

    @solid(
        output_defs=[
            OutputDefinition(asset_key=AssetKey("x"), io_manager_key="asset_io_manager"),
        ]
    )
    def fail_solid(_):
        return 1

    @pipeline(mode_defs=[ModeDefinition(resource_defs={"asset_io_manager": my_io_manager})])
    def my_pipeline():
        fail_solid()

    with pytest.raises(DagsterInvariantViolationError):
        execute_pipeline(my_pipeline)
def test_asset_events(asset_aware_context):
    with asset_aware_context() as ctx:
        instance, event_log_storage = ctx

        execute_pipeline(pipeline_one, instance=instance)
        execute_pipeline(pipeline_two, instance=instance)

        asset_events = event_log_storage.get_asset_events(AssetKey("asset_1"))
        assert len(asset_events) == 2
        for event in asset_events:
            assert isinstance(event, EventRecord)
            assert event.is_dagster_event
            assert event.dagster_event.event_type == DagsterEventType.ASSET_MATERIALIZATION
            assert event.dagster_event.asset_key

        asset_events = event_log_storage.get_asset_events(AssetKey(["path", "to", "asset_3"]))
        assert len(asset_events) == 1
def test_asset_run_ids(asset_aware_context):
    with asset_aware_context() as ctx:
        instance, event_log_storage = ctx

        one = execute_pipeline(pipeline_one, instance=instance)
        two = execute_pipeline(pipeline_two, instance=instance)

        run_ids = event_log_storage.get_asset_run_ids(AssetKey("asset_1"))
        assert set(run_ids) == set([one.run_id, two.run_id])
def test_asset_key_structure():
    src_dir = file_relative_path(__file__, "compat_tests/snapshot_0_9_16_asset_key_structure")
    with copy_directory(src_dir) as test_dir:
        asset_storage = ConsolidatedSqliteEventLogStorage(test_dir)
        asset_keys = asset_storage.get_all_asset_keys()
        assert len(asset_keys) == 5

        # get a structured asset key
        asset_key = AssetKey(["dashboards", "cost_dashboard"])

        # check that backcompat events are read
        assert asset_storage.has_asset_key(asset_key)
        events = asset_storage.get_asset_events(asset_key)
        assert len(events) == 1
        run_ids = asset_storage.get_asset_run_ids(asset_key)
        assert len(run_ids) == 1

        # check that backcompat events are merged with newly stored events
        run_id = "fake_run_id"
        asset_storage.store_event(_materialization_event_record(run_id, asset_key))
        assert asset_storage.has_asset_key(asset_key)
        events = asset_storage.get_asset_events(asset_key)
        assert len(events) == 2
        run_ids = asset_storage.get_asset_run_ids(asset_key)
        assert len(run_ids) == 2
def test_precedence():
    @solid(
        input_defs=[
            InputDefinition(
                "arg_b",
                dagster_type=str,
                default_value="hi",
                description="legit",
                metadata={"explicit": True},
                root_manager_key="rudy",
                asset_key=AssetKey("table_1"),
                asset_partitions={"0"},
            )
        ]
    )
    def precedence(_context, arg_a: int, arg_b: int, arg_c: int):
        """
        Testing

        Args:
            arg_b: boo
        """
        return arg_a + arg_b + arg_c

    assert precedence.input_defs[0].name == "arg_b"
    assert (
        precedence.input_defs[0].dagster_type
        == InputDefinition("test", dagster_type=str).dagster_type
    )
    assert precedence.input_defs[0].description == "legit"
    assert precedence.input_defs[0].default_value == "hi"
    assert precedence.input_defs[0].metadata["explicit"]
    assert precedence.input_defs[0].root_manager_key == "rudy"
    assert precedence.input_defs[0].get_asset_key(None) is not None
    assert precedence.input_defs[0].get_asset_partitions(None) is not None
def test_same_asset_in_multiple_pipelines():
    @asset
    def asset1():
        return 1

    @pipeline
    def graph1():
        asset1()

    @pipeline
    def graph2():
        asset1()

    external_asset_nodes = external_asset_graph_from_defs(
        [graph1, graph2], foreign_assets_by_key={}
    )

    assert external_asset_nodes == [
        ExternalAssetNode(
            asset_key=AssetKey("asset1"),
            dependencies=[],
            depended_by=[],
            op_name="asset1",
            op_description=None,
            job_names=["graph1", "graph2"],
        ),
    ]
def test_input_namespace():
    @asset(ins={"arg1": AssetIn(namespace="abc")})
    def my_asset(arg1):
        assert arg1

    assert my_asset.op.input_defs[0].get_asset_key(None) == AssetKey(["abc", "arg1"])
def solid_asset_tags(_):
    yield AssetMaterialization(
        asset_key=AssetKey("asset_tags"), tags={"foo": "FOO", "bar": "BAR"}
    )
    yield Output(1)
def test_foreign_asset():
    @asset
    def asset1(source1):
        assert source1 == 5
        return 1

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            pass

        def load_input(self, context):
            return 5

    @io_manager
    def my_io_manager(_):
        return MyIOManager()

    job = build_assets_job(
        "a",
        [asset1],
        source_assets=[ForeignAsset(AssetKey("source1"), io_manager_key="special_io_manager")],
        resource_defs={"special_io_manager": my_io_manager},
    )
    assert job.graph.node_defs == [asset1.op]
    assert job.execute_in_process().success
def test_asset_materialization(conn_string):
    event_log_storage = PostgresEventLogStorage.create_clean_storage(conn_string)
    asset_key = AssetKey(['path', 'to', 'asset_one'])

    @solid
    def materialize_one(_):
        yield Materialization(
            label='one',
            asset_key=asset_key,
            metadata_entries=[
                EventMetadataEntry.text('hello', 'text'),
                EventMetadataEntry.json({'hello': 'world'}, 'json'),
                EventMetadataEntry.float(1.0, 'one'),
            ],
        )
        yield Output(1)

    def _solids():
        materialize_one()

    events_one, _ = synthesize_events(_solids)
    for event in events_one:
        event_log_storage.store_event(event)

    assert asset_key in set(event_log_storage.get_all_asset_keys())
    events = event_log_storage.get_asset_events(asset_key)
    assert len(events) == 1
    event = events[0]
    assert isinstance(event, DagsterEventRecord)
    assert event.dagster_event.event_type_value == DagsterEventType.STEP_MATERIALIZATION.value
def get_output_asset_key(self, context):
    return AssetKey(
        [
            "my_database",
            context.metadata["table_name"],
        ]
    )
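# A hedged sketch of an op whose output metadata would drive the get_output_asset_key
# above; the op name, the "users" table name, the io_manager_key, and the returned rows
# are illustrative placeholders.
from dagster import Out, op


@op(out=Out(metadata={"table_name": "users"}, io_manager_key="my_db_io_manager"))
def build_users_table():
    return [("alice",), ("bob",)]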
def backcompat_materialize(_):
    yield Materialization(
        asset_key="all_types",
        description="a materialization with all metadata types",
        metadata_entries=[
            MetadataEntry("text", value="text is cool"),
            MetadataEntry("url", value=MetadataValue.url("https://bigty.pe/neato")),
            MetadataEntry("path", value=MetadataValue.path("/tmp/awesome")),
            MetadataEntry("json", value={"is_dope": True}),
            MetadataEntry("python class", value=MetadataValue.python_artifact(MetadataEntry)),
            MetadataEntry("python function", value=MetadataValue.python_artifact(file_relative_path)),
            MetadataEntry("float", value=1.2),
            MetadataEntry("int", value=1),
            MetadataEntry("float NaN", value=float("nan")),
            MetadataEntry("long int", value=LONG_INT),
            MetadataEntry("pipeline run", value=MetadataValue.pipeline_run("fake_run_id")),
            MetadataEntry("my asset", value=AssetKey("my_asset")),
        ],
    )
    yield Output(None)
def test_asset_wipe(self, graphql_context):
    _create_run(graphql_context, "single_asset_pipeline")
    _create_run(graphql_context, "multi_asset_pipeline")

    asset_keys = graphql_context.instance.all_asset_keys()
    assert AssetKey("a") in asset_keys

    result = execute_dagster_graphql(
        graphql_context, WIPE_ASSETS, variables={"assetKeys": [{"path": ["a"]}]}
    )

    assert result.data
    assert result.data["wipeAssets"]
    assert result.data["wipeAssets"]["__typename"] == "AssetWipeSuccess"

    asset_keys = graphql_context.instance.all_asset_keys()
    assert AssetKey("a") not in asset_keys
def test_asset_group_from_current_module():
    group = AssetGroup.from_current_module()
    assert {asset.op.name for asset in group.assets} == {"asset_in_current_module"}
    assert len(group.assets) == 1
    assert {source_asset.key for source_asset in group.source_assets} == {
        AssetKey("source_asset_in_current_module")
    }
    assert len(group.source_assets) == 1
def test_input_asset_key_and_namespace():
    with pytest.raises(check.CheckError, match="key and namespace cannot both be set"):

        @asset(ins={"arg1": AssetIn(asset_key=AssetKey("foo"), namespace="bar")})
        def my_asset(arg1):
            assert arg1
def test_cross_pipeline_asset_dependency():
    @asset
    def asset1():
        return 1

    @asset
    def asset2(asset1):
        assert asset1 == 1

    @pipeline
    def asset1_graph():
        asset1()

    @pipeline
    def asset2_graph():
        asset2()  # pylint: disable=no-value-for-parameter

    external_asset_nodes = external_asset_graph_from_defs(
        [asset1_graph, asset2_graph], foreign_assets_by_key={}
    )

    assert external_asset_nodes == [
        ExternalAssetNode(
            asset_key=AssetKey("asset1"),
            dependencies=[],
            depended_by=[
                ExternalAssetDependedBy(
                    downstream_asset_key=AssetKey("asset2"), input_name="asset1"
                )
            ],
            op_name="asset1",
            op_description=None,
            job_names=["asset1_graph"],
        ),
        ExternalAssetNode(
            asset_key=AssetKey("asset2"),
            dependencies=[
                ExternalAssetDependency(
                    upstream_asset_key=AssetKey("asset1"), input_name="asset1"
                )
            ],
            depended_by=[],
            op_name="asset2",
            op_description=None,
            job_names=["asset2_graph"],
        ),
    ]