def test_assets_job_with_different_partitions_defs():
    """An assets job whose assets use mismatched partitions defs must be rejected."""
    with pytest.raises(DagsterInvalidDefinitionError):

        @asset(partitions_def=StaticPartitionsDefinition(["a", "b", "c"]))
        def upstream():
            pass

        # Same scheme but a different partition set ("d" added) -> incompatible.
        @asset(partitions_def=StaticPartitionsDefinition(["a", "b", "c", "d"]))
        def downstream(upstream):
            assert upstream is None

        build_assets_job("my_job", assets=[upstream, downstream])
def test_io_manager():
    """Round-trip a pandas DataFrame through the partitioned parquet IO manager into Spark."""
    source_df = pandas.DataFrame({"foo": ["bar", "baz"], "quux": [1, 2]})

    @asset(partitions_def=hourly_partitions)
    def pandas_df_asset():
        return source_df

    @asset(partitions_def=hourly_partitions)
    def spark_input_asset(pandas_df_asset: SparkDF):
        # The IO manager should hand the downstream asset a Spark DataFrame
        # with the same rows/columns the upstream pandas asset produced.
        assert isinstance(pandas_df_asset, SparkDF)
        assert pandas_df_asset.count() == 2
        assert set(pandas_df_asset.columns) == {"foo", "quux"}
        return pandas_df_asset

    with tempfile.TemporaryDirectory() as scratch_dir:
        job = build_assets_job(
            "io_manager_test_job",
            assets=[pandas_df_asset, spark_input_asset],
            resource_defs={
                "pyspark": pyspark_resource,
                "io_manager": local_partitioned_parquet_io_manager.configured(
                    {"base_path": scratch_dir}
                ),
            },
        )
        # Filename encodes the hourly partition's time window.
        expected_path = os.path.join(
            scratch_dir, "pandas_df_asset-20220101160000_20220101170000.pq"
        )
        result = job.execute_in_process(partition_key="2022-01-01-16:00")
        assert result.success
        assert os.path.exists(expected_path)
        intermediate_df = pandas.read_parquet(expected_path)
        assert all(intermediate_df == source_df)
def test_source_asset_partitions():
    """A daily asset reading an hourly source asset sees the full day's hourly key range."""
    hourly_asset = SourceAsset(
        AssetKey("hourly_asset"),
        partitions_def=HourlyPartitionsDefinition(start_date="2021-05-05-00:00"),
    )

    @asset(partitions_def=DailyPartitionsDefinition(start_date="2021-05-05"))
    def daily_asset(hourly_asset):
        assert hourly_asset is None

    class CustomIOManager(IOManager):
        def handle_output(self, context, obj):
            pass

        def load_input(self, context):
            # One daily partition maps onto the 24 hourly partitions of that day.
            key_range = context.asset_partition_key_range
            assert key_range.start == "2021-06-06-00:00"
            assert key_range.end == "2021-06-06-23:00"

    daily_job = build_assets_job(
        name="daily_job",
        assets=[daily_asset],
        source_assets=[hourly_asset],
        resource_defs={
            "io_manager": IOManagerDefinition.hardcoded_io_manager(CustomIOManager())
        },
    )
    assert daily_job.execute_in_process(partition_key="2021-06-06").success
def test_select_from_manifest(dbt_seed, conn_string, test_project_dir, dbt_config_dir):  # pylint: disable=unused-argument
    """Loading assets from a manifest with explicit unique ids materializes only those models."""
    manifest_path = file_relative_path(__file__, "sample_manifest.json")
    with open(manifest_path, "r") as manifest_file:
        manifest_json = json.load(manifest_file)

    dbt_assets = load_assets_from_dbt_manifest(
        manifest_json,
        selected_unique_ids={
            "model.dagster_dbt_test_project.sort_by_calories",
            "model.dagster_dbt_test_project.least_caloric",
        },
    )
    result = build_assets_job(
        "test_job",
        dbt_assets,
        resource_defs={
            "dbt": dbt_cli_resource.configured(
                {"project_dir": test_project_dir, "profiles_dir": dbt_config_dir}
            )
        },
    ).execute_in_process()

    assert result.success
    materializations = [
        event.event_specific_data.materialization
        for event in result.events_for_node(dbt_assets[0].op.name)
        if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    # Exactly the two selected models should materialize.
    assert len(materializations) == 2
def test_node_info_to_asset_key(dbt_seed, conn_string, test_project_dir, dbt_config_dir):  # pylint: disable=unused-argument
    """A custom node_info_to_asset_key fn should control the keys of loaded dbt assets."""
    dbt_assets = load_assets_from_dbt_project(
        test_project_dir,
        dbt_config_dir,
        # Prefix every model's asset key with "foo".
        node_info_to_asset_key=lambda node_info: AssetKey(["foo", node_info["name"]]),
    )
    result = build_assets_job(
        "test_job",
        dbt_assets,
        resource_defs={
            "dbt": dbt_cli_resource.configured(
                {"project_dir": test_project_dir, "profiles_dir": dbt_config_dir}
            )
        },
    ).execute_in_process()

    assert result.success
    materializations = [
        event.event_specific_data.materialization
        for event in result.events_for_node(dbt_assets[0].op.name)
        if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    assert len(materializations) == 4
    assert materializations[0].asset_key == AssetKey(["foo", "sort_by_calories"])
def test_select_from_project(dbt_seed, conn_string, test_project_dir, dbt_config_dir):  # pylint: disable=unused-argument
    """A dbt `select` string should restrict the loaded assets to the chosen models."""
    dbt_assets = load_assets_from_dbt_project(
        test_project_dir, dbt_config_dir, select="sort_by_calories subdir.least_caloric"
    )
    # The op name is derived from the project name plus a hash of the selection.
    assert dbt_assets[0].op.name == "run_dbt_dagster_dbt_test_project_e4753"

    result = build_assets_job(
        "test_job",
        dbt_assets,
        resource_defs={
            "dbt": dbt_cli_resource.configured(
                {"project_dir": test_project_dir, "profiles_dir": dbt_config_dir}
            )
        },
    ).execute_in_process()

    assert result.success
    materializations = [
        event.event_specific_data.materialization
        for event in result.events_for_node(dbt_assets[0].op.name)
        if event.event_type_value == "ASSET_MATERIALIZATION"
    ]
    # Only the two selected models materialize.
    assert len(materializations) == 2
def test_runtime_metadata_fn():
    """A runtime_metadata_fn should attach its entries to every materialization."""
    manifest_path = file_relative_path(__file__, "sample_manifest.json")
    with open(manifest_path, "r") as manifest_file:
        manifest_json = json.load(manifest_file)

    def runtime_metadata_fn(context, node_info):
        return {"op_name": context.solid_def.name, "dbt_model": node_info["name"]}

    assets = load_assets_from_dbt_manifest(
        manifest_json=manifest_json, runtime_metadata_fn=runtime_metadata_fn
    )
    assert_assets_match_project(assets)

    # The dbt resource is mocked: only event plumbing is under test here.
    dbt = MagicMock()
    assets_job = build_assets_job(
        "assets_job",
        assets,
        resource_defs={"dbt": ResourceDefinition.hardcoded_resource(dbt)},
    )
    result = assets_job.execute_in_process()
    assert result.success

    for asset in assets:
        materializations = [
            event.event_specific_data.materialization
            for event in result.events_for_node(asset.op.name)
            if event.event_type_value == "ASSET_MATERIALIZATION"
        ]
        assert len(materializations) == 1
        # For these assets the op name matches the dbt model name, so both
        # metadata entries carry the same value.
        assert materializations[0].metadata_entries == [
            EventMetadataEntry.text(asset.op.name, label="op_name"),
            EventMetadataEntry.text(asset.op.name, label="dbt_model"),
        ]
def test_source_asset():
    """A SourceAsset's io_manager_key routes loads through the named IO manager,
    with its config and required resources visible on the input context."""

    @asset
    def asset1(source1):
        assert source1 == 5
        return 1

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            pass

        def load_input(self, context):
            # Config and required resources must be resolved on both the
            # input context and the upstream output context.
            assert context.resource_config["a"] == 7
            assert context.resources.subresource == 9
            assert context.upstream_output.resources.subresource == 9
            return 5

    @io_manager(config_schema={"a": int}, required_resource_keys={"subresource"})
    def my_io_manager(_):
        return MyIOManager()

    job = build_assets_job(
        "a",
        [asset1],
        source_assets=[
            SourceAsset(AssetKey("source1"), io_manager_key="special_io_manager")
        ],
        resource_defs={
            "special_io_manager": my_io_manager.configured({"a": 7}),
            "subresource": ResourceDefinition.hardcoded_resource(9),
        },
    )
    # The source asset contributes no node of its own.
    assert job.graph.node_defs == [asset1.op]
    assert job.execute_in_process().success
def test_used_source_asset():
    """A consumed source asset appears in the external graph with no job names."""
    bar = SourceAsset(key=AssetKey("bar"), description="def")

    @asset
    def foo(bar):
        assert bar

    job1 = build_assets_job("job1", [foo], source_assets=[bar])

    external_asset_nodes = external_asset_graph_from_defs(
        [job1], source_assets_by_key={AssetKey("bar"): bar}
    )
    assert external_asset_nodes == [
        # Source asset: no op, no jobs, only a depended_by edge.
        ExternalAssetNode(
            asset_key=AssetKey("bar"),
            op_description="def",
            dependencies=[],
            depended_by=[
                ExternalAssetDependedBy(
                    downstream_asset_key=AssetKey(["foo"]), input_name="bar"
                )
            ],
            job_names=[],
        ),
        ExternalAssetNode(
            asset_key=AssetKey("foo"),
            op_name="foo",
            op_description=None,
            dependencies=[
                ExternalAssetDependency(
                    upstream_asset_key=AssetKey(["bar"]), input_name="bar"
                )
            ],
            depended_by=[],
            job_names=["job1"],
            output_name="result",
            output_description=None,
        ),
    ]
def test_source_op_asset():
    """An op-backed asset used as a source asset is loaded, not recomputed."""

    @asset(io_manager_key="special_io_manager")
    def source1():
        pass

    @asset
    def asset1(source1):
        assert source1 == 5
        return 1

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            pass

        def load_input(self, context):
            return 5

    @io_manager
    def my_io_manager(_):
        return MyIOManager()

    job = build_assets_job(
        "a",
        [asset1],
        source_assets=[source1],
        resource_defs={"special_io_manager": my_io_manager},
    )
    # source1 is not part of the graph; only asset1 executes.
    assert job.graph.node_defs == [asset1.op]
    assert job.execute_in_process().success
def test_download():
    """Smoke-test the snapshot download job for a single hourly partition."""
    with tempfile.TemporaryDirectory() as scratch_dir:
        test_job = build_assets_job(
            "test_job",
            assets=ASSETS,
            resource_defs={
                "io_manager": fs_io_manager,
                "partition_start": ResourceDefinition.string_resource(),
                "partition_end": ResourceDefinition.string_resource(),
                "parquet_io_manager": local_partitioned_parquet_io_manager.configured(
                    {"base_path": scratch_dir}
                ),
                "warehouse_io_manager": mem_io_manager,
                "pyspark": pyspark_resource,
                "hn_client": hn_snapshot_client,
            },
        )
        result = test_job.execute_in_process(partition_key="2020-12-30-00:00")
        assert result.success
def test_single_partitioned_asset_job():
    """Executing a partitioned asset job surfaces the partition on output context
    and on the resulting materialization."""
    partitions_def = StaticPartitionsDefinition(["a", "b", "c", "d"])

    class MyIOManager(IOManager):
        def handle_output(self, context, obj):
            # The run's partition key must be visible to the IO manager.
            assert context.asset_partition_key == "b"

        def load_input(self, context):
            assert False, "shouldn't get here"

    @asset(partitions_def=partitions_def)
    def my_asset():
        pass

    partitioned_job = build_assets_job(
        "my_job",
        assets=[my_asset],
        resource_defs={
            "io_manager": IOManagerDefinition.hardcoded_io_manager(MyIOManager())
        },
    )
    result = partitioned_job.execute_in_process(partition_key="b")
    assert result.asset_materializations_for_node("my_asset") == [
        AssetMaterialization(asset_key=AssetKey(["my_asset"]), partition="b")
    ]
def test_input_context_asset_partitions_time_window():
    """Both output and input contexts expose the run partition's time window."""
    partitions_def = DailyPartitionsDefinition(start_date="2021-05-05")

    class MyIOManager(IOManager):
        def handle_output(self, context, _obj):
            # Daily partition "2021-06-06" spans [2021-06-06, 2021-06-07).
            assert context.asset_partitions_time_window == TimeWindow(
                pendulum.parse("2021-06-06"), pendulum.parse("2021-06-07")
            )

        def load_input(self, context):
            assert context.asset_partitions_time_window == TimeWindow(
                pendulum.parse("2021-06-06"), pendulum.parse("2021-06-07")
            )

    @asset(partitions_def=partitions_def)
    def upstream_asset():
        pass

    @asset(partitions_def=partitions_def)
    def downstream_asset(upstream_asset):
        assert upstream_asset is None

    my_job = build_assets_job(
        "my_job",
        assets=[downstream_asset, upstream_asset],
        resource_defs={
            "io_manager": IOManagerDefinition.hardcoded_io_manager(MyIOManager())
        },
    )
    result = my_job.execute_in_process(partition_key="2021-06-06")
    # Fix: the result was previously discarded; assert success explicitly,
    # matching every sibling test in this file.
    assert result.success
def test_input_name_matches_output_name():
    """An input whose name collides with the default output name still maps to
    the correct upstream asset key in the external graph."""
    not_result = SourceAsset(key=AssetKey("not_result"), description=None)

    # The input is called "result" (the default output name) but points at
    # the "not_result" asset key.
    @asset(ins={"result": AssetIn(asset_key=AssetKey("not_result"))})
    def something(result):  # pylint: disable=unused-argument
        pass

    assets_job = build_assets_job("assets_job", [something], source_assets=[not_result])

    external_asset_nodes = external_asset_graph_from_defs(
        [assets_job], source_assets_by_key={}
    )
    assert external_asset_nodes == [
        ExternalAssetNode(
            asset_key=AssetKey("not_result"),
            dependencies=[],
            depended_by=[
                ExternalAssetDependedBy(
                    downstream_asset_key=AssetKey("something"), input_name="result"
                )
            ],
            job_names=[],
        ),
        ExternalAssetNode(
            asset_key=AssetKey("something"),
            dependencies=[
                ExternalAssetDependency(
                    upstream_asset_key=AssetKey("not_result"), input_name="result"
                )
            ],
            depended_by=[],
            op_name="something",
            output_name="result",
            job_names=["assets_job"],
        ),
    ]
def test_source_asset_with_op():
    """A source asset feeding an op-backed asset yields the expected external nodes."""
    foo = SourceAsset(key=AssetKey("foo"), description=None)

    @asset
    def bar(foo):  # pylint: disable=unused-argument
        pass

    assets_job = build_assets_job("assets_job", [bar], source_assets=[foo])

    external_asset_nodes = external_asset_graph_from_defs(
        [assets_job], source_assets_by_key={}
    )
    assert external_asset_nodes == [
        ExternalAssetNode(
            asset_key=AssetKey("foo"),
            op_description=None,
            dependencies=[],
            depended_by=[ExternalAssetDependedBy(AssetKey("bar"), input_name="foo")],
            job_names=[],
        ),
        ExternalAssetNode(
            asset_key=AssetKey("bar"),
            op_name="bar",
            op_description=None,
            dependencies=[ExternalAssetDependency(AssetKey("foo"), input_name="foo")],
            depended_by=[],
            job_names=["assets_job"],
            output_name="result",
        ),
    ]
def test_basic_multi_asset():
    """Each output of a multi_asset becomes its own external asset node."""

    @multi_asset(
        outs={
            f"out{i}": Out(description=f"foo: {i}", asset_key=AssetKey(f"asset{i}"))
            for i in range(10)
        }
    )
    def assets():
        pass

    assets_job = build_assets_job("assets_job", [assets])

    external_asset_nodes = external_asset_graph_from_defs(
        [assets_job], source_assets_by_key={}
    )
    # All ten nodes share the one op but carry their own output metadata.
    assert external_asset_nodes == [
        ExternalAssetNode(
            asset_key=AssetKey(f"asset{i}"),
            dependencies=[],
            depended_by=[],
            op_name="assets",
            op_description=None,
            job_names=["assets_job"],
            output_name=f"out{i}",
            output_description=f"foo: {i}",
        )
        for i in range(10)
    ]
def test_join():
    """Two independent assets joined by a third produce the expected dependencies."""

    @asset
    def asset1():
        return 1

    @asset
    def asset2():
        return 2

    @asset
    def asset3(asset1, asset2):
        assert asset1 == 1
        assert asset2 == 2

    job = build_assets_job("a", [asset1, asset2, asset3])

    assert job.graph.node_defs == [asset1.op, asset2.op, asset3.op]
    # asset3 depends on the "result" output of each upstream asset.
    assert job.dependencies == {
        "asset1": {},
        "asset2": {},
        "asset3": {
            "asset1": DependencyDefinition("asset1", "result"),
            "asset2": DependencyDefinition("asset2", "result"),
        },
    }
    assert job.execute_in_process().success
def test_two_downstream_assets_job():
    """A fan-out (one upstream, two downstreams) is mirrored in the external graph."""

    @asset
    def asset1():
        return 1

    @asset
    def asset2_a(asset1):
        assert asset1 == 1

    @asset
    def asset2_b(asset1):
        assert asset1 == 1

    assets_job = build_assets_job("assets_job", [asset1, asset2_a, asset2_b])

    external_asset_nodes = external_asset_graph_from_defs(
        [assets_job], source_assets_by_key={}
    )
    assert external_asset_nodes == [
        ExternalAssetNode(
            asset_key=AssetKey("asset1"),
            dependencies=[],
            # Both downstream assets appear as depended_by edges.
            depended_by=[
                ExternalAssetDependedBy(
                    downstream_asset_key=AssetKey("asset2_a"), input_name="asset1"
                ),
                ExternalAssetDependedBy(
                    downstream_asset_key=AssetKey("asset2_b"), input_name="asset1"
                ),
            ],
            op_name="asset1",
            op_description=None,
            job_names=["assets_job"],
            output_name="result",
            output_description=None,
        ),
        ExternalAssetNode(
            asset_key=AssetKey("asset2_a"),
            dependencies=[
                ExternalAssetDependency(
                    upstream_asset_key=AssetKey("asset1"), input_name="asset1"
                )
            ],
            depended_by=[],
            op_name="asset2_a",
            op_description=None,
            job_names=["assets_job"],
            output_name="result",
            output_description=None,
        ),
        ExternalAssetNode(
            asset_key=AssetKey("asset2_b"),
            dependencies=[
                ExternalAssetDependency(
                    upstream_asset_key=AssetKey("asset1"), input_name="asset1"
                )
            ],
            depended_by=[],
            op_name="asset2_b",
            op_description=None,
            job_names=["assets_job"],
            output_name="result",
            output_description=None,
        ),
    ]
def test_single_asset_pipeline():
    """A job built from one asset has exactly that asset's op and runs cleanly."""

    @asset
    def asset1():
        return 1

    job = build_assets_job("a", [asset1])
    assert job.graph.node_defs == [asset1.op]
    assert job.execute_in_process().success
def test_cross_job_asset_dependency():
    """Dependencies between assets in different jobs are stitched into one graph."""

    @asset
    def asset1():
        return 1

    @asset
    def asset2(asset1):
        assert asset1 == 1

    assets_job1 = build_assets_job("assets_job1", [asset1])
    # asset1 is a source asset from job2's perspective.
    assets_job2 = build_assets_job("assets_job2", [asset2], source_assets=[asset1])

    external_asset_nodes = external_asset_graph_from_defs(
        [assets_job1, assets_job2], source_assets_by_key={}
    )
    assert external_asset_nodes == [
        ExternalAssetNode(
            asset_key=AssetKey("asset1"),
            dependencies=[],
            depended_by=[
                ExternalAssetDependedBy(
                    downstream_asset_key=AssetKey("asset2"), input_name="asset1"
                )
            ],
            op_name="asset1",
            op_description=None,
            job_names=["assets_job1"],
            output_name="result",
            output_description=None,
        ),
        ExternalAssetNode(
            asset_key=AssetKey("asset2"),
            dependencies=[
                ExternalAssetDependency(
                    upstream_asset_key=AssetKey("asset1"), input_name="asset1"
                )
            ],
            depended_by=[],
            op_name="asset2",
            op_description=None,
            job_names=["assets_job2"],
            output_name="result",
            output_description=None,
        ),
    ]
def test_source_asset_conflicts_with_asset():
    """A source asset whose key collides with a materialized asset is an error."""
    bar_source_asset = SourceAsset(key=AssetKey("bar"), description="def")

    @asset
    def bar():
        pass

    job1 = build_assets_job("job1", [bar])

    with pytest.raises(DagsterInvariantViolationError):
        external_asset_graph_from_defs(
            [job1], source_assets_by_key={AssetKey("bar"): bar_source_asset}
        )
# NOTE(review): a test with this exact name also appears earlier in this file;
# if both live in the same module, Python redefinition means pytest collects
# only one of them — consider renaming. Verify whether these originate from
# separate files.
def test_single_partitioned_asset_job():
    """A partitioned run's materialization records its partition key."""
    partitions_def = StaticPartitionsDefinition(["a", "b", "c", "d"])

    @asset(partitions_def=partitions_def)
    def my_asset():
        pass

    partitioned_job = build_assets_job("my_job", assets=[my_asset])
    result = partitioned_job.execute_in_process(partition_key="b")
    assert result.asset_materializations_for_node("my_asset") == [
        AssetMaterialization(asset_key=AssetKey(["my_asset"]), partition="b")
    ]
def test_same_asset_in_multiple_pipelines():
    """One asset used by two jobs collapses to a single node listing both jobs."""

    @asset
    def asset1():
        return 1

    job1 = build_assets_job("job1", [asset1])
    job2 = build_assets_job("job2", [asset1])

    external_asset_nodes = external_asset_graph_from_defs(
        [job1, job2], source_assets_by_key={}
    )
    assert external_asset_nodes == [
        ExternalAssetNode(
            asset_key=AssetKey("asset1"),
            dependencies=[],
            depended_by=[],
            op_name="asset1",
            op_description=None,
            job_names=["job1", "job2"],
            output_name="result",
            output_description=None,
        ),
    ]
def test_asset_key_output():
    """An AssetIn may wire an input (here named "hello") to another asset's key."""

    @asset
    def asset1():
        return 1

    @asset(ins={"hello": AssetIn(asset_key=AssetKey("asset1"))})
    def asset2(hello):
        return hello

    job = build_assets_job("boo", [asset1, asset2])
    result = job.execute_in_process()

    assert result.success
    # asset2 passes asset1's value through unchanged.
    assert result.output_for_node("asset2") == 1
def test_load_from_manifest_json():
    """Assets loaded from a manifest match the project and run with a mocked dbt resource."""
    manifest_path = file_relative_path(__file__, "sample_manifest.json")
    with open(manifest_path, "r") as manifest_file:
        manifest_json = json.load(manifest_file)

    assets = load_assets_from_dbt_manifest(manifest_json=manifest_json)
    assert_assets_match_project(assets)

    # dbt itself is mocked out; we only verify the job wiring executes.
    dbt = MagicMock()
    assets_job = build_assets_job(
        "assets_job",
        assets,
        resource_defs={"dbt": ResourceDefinition.hardcoded_resource(dbt)},
    )
    assert assets_job.execute_in_process().success
def test_asset_key_for_asset_with_namespace():
    """A namespaced asset must be referenced by its full, namespaced key."""

    @asset(namespace="hello")
    def asset_foo():
        return "foo"

    @asset(
        ins={"foo": AssetIn(asset_key=AssetKey("asset_foo"))}
    )  # Should fail because asset_foo is defined with namespace, so has asset key ["hello", "asset_foo"]
    def failing_asset(foo):
        pass

    with pytest.raises(DagsterInvalidDefinitionError):
        build_assets_job("lol", [asset_foo, failing_asset])

    # Referencing the fully-qualified key resolves correctly.
    @asset(ins={"foo": AssetIn(asset_key=AssetKey(["hello", "asset_foo"]))})
    def success_asset(foo):
        return foo

    job = build_assets_job("lol", [asset_foo, success_asset])
    result = job.execute_in_process()

    assert result.success
    assert result.output_for_node("success_asset") == "foo"
# NOTE(review): despite the "_str" suffix, this body uses the same list-form
# AssetKey(["hello", "asset_foo"]) as the non-_str test above rather than a
# string-namespace form — confirm the intended variant against the AssetIn API.
def test_asset_key_for_asset_with_namespace_str():
    """A namespaced asset is reachable via its fully-qualified asset key."""

    @asset(namespace="hello")
    def asset_foo():
        return "foo"

    @asset(ins={"foo": AssetIn(asset_key=AssetKey(["hello", "asset_foo"]))})
    def success_asset(foo):
        return foo

    job = build_assets_job("lol", [asset_foo, success_asset])
    result = job.execute_in_process()

    assert result.success
    assert result.output_for_node("success_asset") == "foo"
def get_assets_job(io_manager_def):
    """Build a two-asset job (with cross-namespace dependency) using the given IO manager."""
    upstream_namespace = ["one", "two", "three"]

    @asset(namespace=["one", "two", "three"])
    def asset1():
        return [1, 2, 3]

    # asset2 lives in a different namespace and locates asset1 via its namespace.
    @asset(namespace=["four", "five"], ins={"asset1": AssetIn(namespace=upstream_namespace)})
    def asset2(asset1):
        return asset1 + [4]

    return build_assets_job(
        name="a",
        assets=[asset1, asset2],
        resource_defs={"io_manager": io_manager_def},
    )
def test_invoking_asset_with_deps():
    """An asset with dependencies can still be invoked directly as a function."""

    @asset
    def upstream():
        return [1]

    @asset
    def downstream(upstream):
        return upstream + [2, 3]

    # check that the asset dependencies are in place
    job = build_assets_job("foo", [upstream, downstream])
    assert job.execute_in_process().success

    # Direct invocation bypasses the job and just calls the decorated function.
    out = downstream([3])
    assert out == [3, 2, 3]
def test_non_argument_deps():
    """non_argument_deps orders execution without passing a value downstream."""
    with safe_tempfile_path() as path:

        @asset
        def foo():
            with open(path, "w") as sentinel:
                sentinel.write("yup")

        # bar takes no input from foo but must run after it.
        @asset(non_argument_deps={AssetKey("foo")})
        def bar():
            # assert that the foo asset already executed
            assert os.path.exists(path)

        job = build_assets_job("a", [foo, bar])
        assert job.execute_in_process().success