def test_build_output_context_with_cm_resource():
    entered = []

    @resource
    def cm_resource():
        try:
            yield "foo"
        finally:
            entered.append("yes")

    context = build_output_context(
        step_key="test", name="test", resources={"cm_resource": cm_resource}
    )

    with pytest.raises(
        DagsterInvariantViolationError,
        match=re.escape(
            "At least one provided resource is a generator, but attempting to access "
            "resources outside of context manager scope. You can use the following syntax to "
            "open a context manager: `with build_output_context(...) as context:`"
        ),
    ):
        context.resources  # pylint: disable=pointless-statement

    del context
    assert entered == ["yes"]

    with build_output_context(
        step_key="test", name="test", resources={"cm_resource": cm_resource}
    ) as context:
        assert context.resources.cm_resource == "foo"

    assert entered == ["yes", "yes"]
def test_versioned_pickled_object_filesystem_io_manager():
    with TemporaryDirectory() as temp_dir:
        store = VersionedPickledObjectFilesystemIOManager(temp_dir)
        context = build_output_context(step_key="foo", name="bar", version="version1")
        store.handle_output(context, "cat")

        assert store.has_output(context)
        assert store.load_input(build_input_context(upstream_output=context)) == "cat"

        context_diff_version = build_output_context(step_key="foo", name="bar", version="version2")
        assert not store.has_output(context_diff_version)
def test_handle_output_spark_then_load_input_pandas():
    snowflake_manager = snowflake_io_manager(
        build_init_resource_context(
            config={"database": "TESTDB"}, resources={"partition_bounds": None}
        )
    )
    spark = SparkSession.builder.config(
        "spark.jars.packages",
        "net.snowflake:snowflake-jdbc:3.8.0,net.snowflake:spark-snowflake_2.12:2.8.2-spark_3.0",
    ).getOrCreate()

    schema = StructType([StructField("col1", StringType()), StructField("col2", IntegerType())])
    contents = spark.createDataFrame([Row(col1="Thom", col2=51)], schema)

    with temporary_snowflake_table(PandasDataFrame([{"col1": "a", "col2": 1}])) as temp_table_name:
        metadata = {"table": f"public.{temp_table_name}"}
        output_context = build_output_context(metadata=metadata)

        list(snowflake_manager.handle_output(output_context, contents))  # exhaust the iterator

        input_context = build_input_context(upstream_output=output_context)
        input_value = snowflake_manager.load_input(input_context)

        contents_pandas = contents.toPandas()
        assert str(input_value) == str(contents_pandas), f"{input_value}\n\n{contents_pandas}"
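# The Snowflake tests in this file depend on a `temporary_snowflake_table`
# helper that is not shown in this excerpt. A minimal sketch of the assumed
# behavior: seed a uniquely named table from a DataFrame, yield the name, and
# drop the table afterward. The commented-out calls are hypothetical
# placeholders, not real helpers from this repo.
from contextlib import contextmanager
from uuid import uuid4


@contextmanager
def temporary_snowflake_table_sketch(seed_df):
    table_name = "test_" + str(uuid4()).replace("-", "_")
    # create_table_from_df(table_name, seed_df)  # hypothetical: seed the table
    try:
        yield table_name
    finally:
        pass  # drop_table(table_name)  # hypothetical: clean up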
def test_context_logging_metadata():
    context = build_output_context()

    context.add_output_metadata({"foo": "bar"})

    assert [entry.label for entry in context.get_logged_metadata_entries()] == ["foo"]
def test_my_io_manager_load_input():
    manager = my_io_manager(None)
    manager.storage_dict[("123", "abc")] = 5

    context = build_input_context(upstream_output=build_output_context(name="abc", step_key="123"))
    assert manager.load_input(context) == 5
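# `my_io_manager`, used in the test above and in test_my_io_manager_handle_output
# below, is not defined in this excerpt. Based on how the tests exercise it, a
# plausible sketch is a dict-backed IOManager keyed by (step_key, output name).
# This is an assumption about the helper, not its actual definition.
from dagster import IOManager, io_manager


class MyIOManagerSketch(IOManager):
    def __init__(self):
        self.storage_dict = {}

    def handle_output(self, context, obj):
        # Store outputs under (step_key, output name), matching the tests' keys.
        self.storage_dict[(context.step_key, context.name)] = obj

    def load_input(self, context):
        return self.storage_dict[
            (context.upstream_output.step_key, context.upstream_output.name)
        ]


@io_manager
def my_io_manager_sketch(_init_context):
    return MyIOManagerSketch()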
def test_context_logging_user_events():
    context = build_output_context()

    context.log_event(AssetMaterialization("first"))
    context.log_event(AssetMaterialization("second"))

    assert [event.label for event in context.get_logged_events()] == ["first", "second"]
def test_mem_io_manager_execution():
    mem_io_manager_instance = InMemoryIOManager()

    output_context = build_output_context(step_key="step_key", name="output_name")
    mem_io_manager_instance.handle_output(output_context, 1)

    input_context = build_input_context(upstream_output=output_context)
    assert mem_io_manager_instance.load_input(input_context) == 1
def test_output_identifier_dynamic_memoization():
    context = build_output_context(version="foo", mapping_key="bar", step_key="baz", name="buzz")

    with pytest.raises(
        CheckError,
        match="Mapping key and version both provided for output 'buzz' of step 'baz'. Dynamic "
        "mapping is not supported when using versioning.",
    ):
        context.get_output_identifier()
def test_df_to_csv_io_manager():
    with tempfile.TemporaryDirectory() as temp_dir:
        my_io_manager = df_to_csv_io_manager(
            build_init_resource_context(config={"base_dir": temp_dir})
        )
        test_df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

        # test handle_output
        output_context = build_output_context(name="abc", step_key="123")
        my_io_manager.handle_output(output_context, test_df)
        output_path = my_io_manager._get_path(output_context)  # pylint: disable=protected-access
        assert os.path.exists(output_path)
        assert test_df.equals(pd.read_csv(output_path))

        # test load_input
        input_context = build_input_context(upstream_output=output_context)
        assert test_df.equals(my_io_manager.load_input(input_context))
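# `df_to_csv_io_manager` is another helper assumed by the test above. A hedged
# sketch consistent with how the test uses it (configurable base_dir, a private
# _get_path, CSV round-tripping); the file-naming scheme is an assumption.
import os

import pandas as pd
from dagster import IOManager, io_manager


class DataframeCSVIOManagerSketch(IOManager):
    def __init__(self, base_dir):
        self.base_dir = base_dir

    def _get_path(self, context):
        # One CSV per (step_key, output name) pair.
        return os.path.join(self.base_dir, f"{context.step_key}_{context.name}.csv")

    def handle_output(self, context, obj):
        obj.to_csv(self._get_path(context), index=False)

    def load_input(self, context):
        return pd.read_csv(self._get_path(context.upstream_output))


@io_manager(config_schema={"base_dir": str})
def df_to_csv_io_manager_sketch(init_context):
    return DataframeCSVIOManagerSketch(init_context.resource_config["base_dir"])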
def test_handle_output_then_load_input_pandas():
    snowflake_manager = snowflake_io_manager(
        build_init_resource_context(
            config={"database": "TESTDB"}, resources={"partition_bounds": None}
        )
    )
    contents1 = PandasDataFrame([{"col1": "a", "col2": 1}])  # just to get the types right
    contents2 = PandasDataFrame([{"col1": "b", "col2": 2}])  # contents we will insert

    with temporary_snowflake_table(contents1) as temp_table_name:
        metadata = {"table": f"public.{temp_table_name}"}
        output_context = build_output_context(metadata=metadata)

        list(snowflake_manager.handle_output(output_context, contents2))  # exhaust the iterator

        input_context = build_input_context(upstream_output=output_context)
        input_value = snowflake_manager.load_input(input_context)

        assert input_value.equals(contents2), f"{input_value}\n\n{contents2}"
def test_handle_output_then_load_input():
    snowflake_config = generate_snowflake_config()
    snowflake_manager = snowflake_io_manager(build_init_resource_context(config=snowflake_config))
    contents1 = DataFrame([{"col1": "a", "col2": 1}])  # just to get the types right
    contents2 = DataFrame([{"col1": "b", "col2": 2}])  # contents we will insert

    with temporary_snowflake_table(contents1) as temp_table_name:
        metadata = {"table": f"public.{temp_table_name}"}
        output_context = build_output_context(metadata=metadata, resource_config=snowflake_config)

        list(snowflake_manager.handle_output(output_context, contents2))  # exhaust the iterator

        input_context = build_input_context(
            upstream_output=output_context, resource_config=snowflake_config
        )
        input_value = snowflake_manager.load_input(input_context)

        assert input_value.equals(contents2), f"{input_value}\n\n{contents2}"
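# `generate_snowflake_config`, used above, is assumed to assemble connection
# details, presumably from the environment. A sketch only; the env var names
# and the non-database keys are guesses, not the repo's actual config shape.
import os


def generate_snowflake_config_sketch():
    return {
        "account": os.getenv("SNOWFLAKE_ACCOUNT"),
        "user": os.getenv("SNOWFLAKE_USER"),
        "password": os.getenv("SNOWFLAKE_PASSWORD"),
        "database": "TESTDB",
    }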
def test_handle_output_then_load_input():
    snowflake_manager = SnowflakeIOManager(config=PROD_SNOWFLAKE_CONF)
    contents1 = DataFrame([{"col1": "a", "col2": 1}])  # just to get the types right
    contents2 = DataFrame([{"col1": "b", "col2": 2}])  # contents we will insert

    with temporary_snowflake_table(contents1) as temp_table_name:

        @solid(output_defs=[OutputDefinition(asset_key=AssetKey(temp_table_name))])
        def my_solid():
            pass

        output_context = build_output_context(
            name="result", solid_def=my_solid, resource_config=PROD_SNOWFLAKE_CONF
        )

        list(snowflake_manager.handle_output(output_context, contents2))  # exhaust the iterator

        input_context = build_input_context(
            upstream_output=output_context, resource_config=PROD_SNOWFLAKE_CONF
        )
        input_value = snowflake_manager.load_input(input_context)

        assert input_value.equals(contents2), f"{input_value}\n\n{contents2}"
def test_gcs_pickle_io_manager_execution(gcs_bucket):
    inty_job = define_inty_job()

    run_config = {"resources": {"io_manager": {"config": {"gcs_bucket": gcs_bucket}}}}

    run_id = make_new_run_id()

    resolved_run_config = ResolvedRunConfig.build(inty_job, run_config=run_config)
    execution_plan = ExecutionPlan.build(InMemoryPipeline(inty_job), resolved_run_config)

    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(pipeline_name=inty_job.name, run_id=run_id, run_config=run_config)

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys, inty_job, resolved_run_config),
            pipeline=InMemoryPipeline(inty_job),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, "return_one")

    io_manager = PickledObjectGCSIOManager(gcs_bucket, storage.Client())
    step_output_handle = StepOutputHandle("return_one")
    context = build_input_context(
        upstream_output=build_output_context(
            step_key=step_output_handle.step_key,
            name=step_output_handle.output_name,
            run_id=run_id,
        )
    )
    assert io_manager.load_input(context) == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"], inty_job, resolved_run_config),
            pipeline=InMemoryPipeline(inty_job),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    step_output_handle = StepOutputHandle("add_one")
    context = build_input_context(
        upstream_output=build_output_context(
            step_key=step_output_handle.step_key,
            name=step_output_handle.output_name,
            run_id=run_id,
        )
    )

    assert get_step_output(add_one_step_events, "add_one")
    assert io_manager.load_input(context) == 2
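# `define_inty_job` above (and `define_inty_pipeline` in the next test) are
# assumed helpers that build a two-step graph, return_one -> add_one, with the
# pickling io_manager under test wired in as a resource. A rough sketch; the
# real helper presumably binds the appropriate io_manager itself rather than
# taking it as an argument.
from dagster import job, op


@op
def return_one():
    return 1


@op
def add_one(i: int) -> int:
    return i + 1


def define_inty_job_sketch(io_manager_def):
    @job(resource_defs={"io_manager": io_manager_def})
    def basic_external_plan_execution():
        add_one(return_one())

    return basic_external_plan_execution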
def test_s3_pickle_io_manager_execution(mock_s3_bucket):
    pipeline_def = define_inty_pipeline()

    run_config = {"resources": {"io_manager": {"config": {"s3_bucket": mock_s3_bucket.name}}}}

    run_id = make_new_run_id()

    resolved_run_config = ResolvedRunConfig.build(pipeline_def, run_config=run_config)
    execution_plan = ExecutionPlan.build(InMemoryPipeline(pipeline_def), resolved_run_config)

    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name, run_id=run_id, run_config=run_config
    )

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys, pipeline_def, resolved_run_config),
            pipeline=InMemoryPipeline(pipeline_def),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, "return_one")

    io_manager = PickledObjectS3IOManager(
        mock_s3_bucket.name, construct_s3_client(max_attempts=5), s3_prefix="dagster"
    )
    step_output_handle = StepOutputHandle("return_one")
    context = build_input_context(
        upstream_output=build_output_context(
            step_key=step_output_handle.step_key,
            name=step_output_handle.output_name,
            run_id=run_id,
        )
    )
    assert io_manager.load_input(context) == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"], pipeline_def, resolved_run_config),
            pipeline=InMemoryPipeline(pipeline_def),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    step_output_handle = StepOutputHandle("add_one")
    context = build_input_context(
        upstream_output=build_output_context(
            step_key=step_output_handle.step_key,
            name=step_output_handle.output_name,
            run_id=run_id,
        )
    )

    assert get_step_output(add_one_step_events, "add_one")
    assert io_manager.load_input(context) == 2
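# `get_step_output`, asserted on throughout these execution tests, is assumed
# to scan the emitted events for the STEP_OUTPUT event of a given step. A
# sketch using the public DagsterEventType enum; the attribute access mirrors
# DagsterEvent, but treat the details as assumptions.
from dagster import DagsterEventType


def get_step_output_sketch(step_events, step_key, output_name="result"):
    for step_event in step_events:
        if (
            step_event.event_type == DagsterEventType.STEP_OUTPUT
            and step_event.step_key == step_key
            and step_event.step_output_data.output_name == output_name
        ):
            return step_event
    return None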
def test_adls2_pickle_io_manager_execution(storage_account, file_system, credential):
    job = define_inty_job()

    run_config = {
        "resources": {
            "io_manager": {"config": {"adls2_file_system": file_system}},
            "adls2": {
                "config": {
                    "storage_account": storage_account,
                    "credential": {"key": credential},
                }
            },
        }
    }

    run_id = make_new_run_id()

    resolved_run_config = ResolvedRunConfig.build(job, run_config=run_config)
    execution_plan = ExecutionPlan.build(InMemoryPipeline(job), resolved_run_config)

    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(pipeline_name=job.name, run_id=run_id, run_config=run_config)

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys, job, resolved_run_config),
            pipeline=InMemoryPipeline(job),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, "return_one")

    context = build_input_context(
        upstream_output=build_output_context(
            step_key="return_one",
            name="result",
            run_id=run_id,
        )
    )

    io_manager = PickledObjectADLS2IOManager(
        file_system=file_system,
        adls2_client=create_adls2_client(storage_account, credential),
        blob_client=create_blob_client(storage_account, credential),
    )
    assert io_manager.load_input(context) == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"], job, resolved_run_config),
            pipeline=InMemoryPipeline(job),
            pipeline_run=pipeline_run,
            run_config=run_config,
            instance=instance,
        )
    )

    context = build_input_context(
        upstream_output=build_output_context(
            step_key="add_one",
            name="result",
            run_id=run_id,
            mapping_key="foo",
        )
    )

    assert get_step_output(add_one_step_events, "add_one")
    assert io_manager.load_input(context) == 2
def test_my_io_manager_handle_output():
    manager = my_io_manager(None)

    context = build_output_context(name="abc", step_key="123")
    manager.handle_output(context, 5)

    assert manager.storage_dict[("123", "abc")] == 5
def mock_output_context(table_name):
    @asset(name=table_name)
    def my_asset():
        pass

    return build_output_context(op_def=my_asset.op, name="result")
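# Hypothetical usage of the helper above: "my_table" is an arbitrary example
# value, and the assertion reflects the name passed to build_output_context.
def test_mock_output_context_usage_sketch():
    context = mock_output_context("my_table")
    assert context.name == "result"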
def test_basic_build_output_context():
    context = build_output_context()
    assert isinstance(context, OutputContext)
def test_basic_build_output_context():
    context = build_output_context("fake_key", "fake_name")
    assert isinstance(context, OutputContext)
    assert context.step_key == "fake_key"
    assert context.name == "fake_name"