def test_adls2_intermediate_storage_with_custom_prefix(storage_account, file_system):
    run_id = make_new_run_id()

    intermediate_storage = ADLS2IntermediateStorage(
        adls2_client=get_adls2_client(storage_account),
        blob_client=get_blob_client(storage_account),
        run_id=run_id,
        file_system=file_system,
        prefix="custom_prefix",
    )
    assert intermediate_storage.root == "/".join(["custom_prefix", "storage", run_id])

    try:
        with yield_empty_pipeline_context(run_id=run_id) as context:
            intermediate_storage.set_intermediate(
                context, RuntimeBool, StepOutputHandle("true"), True
            )

            assert intermediate_storage.has_intermediate(context, StepOutputHandle("true"))
            assert intermediate_storage.uri_for_paths(["true"]).startswith(
                "abfss://{fs}@{account}.dfs.core.windows.net/custom_prefix".format(
                    account=storage_account, fs=file_system
                )
            )
    finally:
        intermediate_storage.rm_intermediate(context, StepOutputHandle("true"))

def test_pipeline_step_key_subset_execution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs({'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}})
    result = execute_pipeline(pipeline_def, run_config=run_config, instance=instance)

    assert result.success

    intermediates_manager = IntermediateStoreIntermediatesManager(
        build_fs_intermediate_store(instance.intermediates_directory, result.run_id)
    )
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_one.compute')).obj
        == 4
    )
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_two.compute')).obj
        == 6
    )

    ## re-execute add_two
    pipeline_reexecution_result = reexecute_pipeline(
        pipeline_def,
        parent_run_id=result.run_id,
        run_config=run_config,
        step_keys_to_execute=['add_two.compute'],
        instance=instance,
    )

    assert pipeline_reexecution_result.success

    step_events = pipeline_reexecution_result.step_event_list
    assert step_events

    intermediates_manager = IntermediateStoreIntermediatesManager(
        build_fs_intermediate_store(instance.intermediates_directory, result.run_id)
    )
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_one.compute')).obj
        == 4
    )
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_two.compute')).obj
        == 6
    )

    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')

    with pytest.raises(
        DagsterExecutionStepNotFoundError, match='Execution plan does not contain step'
    ):
        reexecute_pipeline(
            pipeline_def,
            parent_run_id=result.run_id,
            run_config=run_config,
            step_keys_to_execute=['nope.compute'],
            instance=instance,
        )

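# Several tests in this section call a `define_addy_pipeline` helper that is
# not shown. A minimal sketch consistent with the config shape and the
# asserted values (3 -> add_one -> 4 -> add_two -> 6); the exact decorators in
# the real test file may differ:
def define_addy_pipeline():
    @lambda_solid(input_defs=[InputDefinition('num', Int)], output_def=OutputDefinition(Int))
    def add_one(num):
        return num + 1

    @lambda_solid(input_defs=[InputDefinition('num', Int)], output_def=OutputDefinition(Int))
    def add_two(num):
        return num + 2

    @pipeline
    def add_pipeline():
        add_two(add_one())

    return add_pipeline
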
def test_gcs_intermediate_storage_composite_types_with_custom_serializer_for_inner_type(gcs_bucket):
    run_id = make_new_run_id()

    intermediate_storage = GCSIntermediateStorage(run_id=run_id, gcs_bucket=gcs_bucket)

    obj_name = "list"

    with yield_empty_pipeline_context(run_id=run_id) as context:
        try:
            intermediate_storage.set_intermediate(
                context,
                resolve_dagster_type(List[LowercaseString]),
                StepOutputHandle(obj_name),
                ["foo", "bar"],
            )

            assert intermediate_storage.has_intermediate(context, StepOutputHandle(obj_name))
            assert intermediate_storage.get_intermediate(
                context, resolve_dagster_type(List[Bool]), StepOutputHandle(obj_name)
            ).obj == ["foo", "bar"]
        finally:
            intermediate_storage.rm_intermediate(context, StepOutputHandle(obj_name))

def test_adls2_intermediate_storage_composite_types_with_custom_serializer_for_inner_type(
    storage_account, file_system
):
    run_id = make_new_run_id()

    intermediate_storage = ADLS2IntermediateStorage(
        adls2_client=get_adls2_client(storage_account),
        blob_client=get_blob_client(storage_account),
        run_id=run_id,
        file_system=file_system,
    )

    obj_name = "list"

    with yield_empty_pipeline_context(run_id=run_id) as context:
        try:
            intermediate_storage.set_intermediate(
                context,
                resolve_dagster_type(List[LowercaseString]),
                StepOutputHandle(obj_name),
                ["foo", "bar"],
            )

            assert intermediate_storage.has_intermediate(context, StepOutputHandle(obj_name))
            assert intermediate_storage.get_intermediate(
                context, resolve_dagster_type(List[Bool]), StepOutputHandle(obj_name)
            ).obj == ["foo", "bar"]
        finally:
            intermediate_storage.rm_intermediate(context, StepOutputHandle(obj_name))

def test_adls2_intermediate_storage_with_custom_serializer(storage_account, file_system):
    run_id = make_new_run_id()

    intermediate_storage = ADLS2IntermediateStorage(
        adls2_client=get_adls2_client(storage_account),
        blob_client=get_blob_client(storage_account),
        run_id=run_id,
        file_system=file_system,
    )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        try:
            intermediate_storage.set_intermediate(
                context, LowercaseString, StepOutputHandle("foo"), "foo"
            )

            assert (
                intermediate_storage.object_store.file_system_client.get_file_client(
                    os.path.join(*[intermediate_storage.root, "intermediates", "foo", "result"]),
                )
                .download_file()
                .readall()
                .decode("utf-8")
                == "FOO"
            )

            assert intermediate_storage.has_intermediate(context, StepOutputHandle("foo"))
            assert (
                intermediate_storage.get_intermediate(
                    context, LowercaseString, StepOutputHandle("foo")
                ).obj
                == "foo"
            )
        finally:
            intermediate_storage.rm_intermediate(context, StepOutputHandle("foo"))

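# The custom-serializer tests rely on a `LowercaseString` type whose
# serialization strategy uppercases values on write (hence the raw "FOO"
# assertion above) and lowercases them on read. A minimal sketch, assuming
# dagster's SerializationStrategy / create_any_type API of this era; the
# exact helper in the real test file may differ:
class UppercaseSerializationStrategy(SerializationStrategy):
    def serialize(self, value, write_file_obj):
        # Store the uppercased bytes in the backing object store.
        write_file_obj.write(value.upper().encode("utf-8"))

    def deserialize(self, read_file_obj):
        # Recover the lowercase value on read.
        return read_file_obj.read().decode("utf-8").lower()


LowercaseString = create_any_type(
    "LowercaseString",
    serialization_strategy=UppercaseSerializationStrategy("uppercase"),
)
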
def test_gcs_intermediate_storage_with_custom_prefix(gcs_bucket):
    run_id = make_new_run_id()

    intermediate_storage = GCSIntermediateStorage(
        run_id=run_id, gcs_bucket=gcs_bucket, gcs_prefix="custom_prefix"
    )
    assert intermediate_storage.root == "/".join(["custom_prefix", "storage", run_id])

    obj_name = "true"

    try:
        with yield_empty_pipeline_context(run_id=run_id) as context:
            intermediate_storage.set_intermediate(
                context, RuntimeBool, StepOutputHandle(obj_name), True
            )

            assert intermediate_storage.has_intermediate(context, StepOutputHandle(obj_name))
            assert intermediate_storage.uri_for_paths([obj_name]).startswith(
                "gs://%s/custom_prefix" % gcs_bucket
            )
    finally:
        intermediate_storage.rm_intermediate(context, StepOutputHandle(obj_name))

def test_adls2_intermediate_storage_with_type_storage_plugin(storage_account, file_system):
    run_id = make_new_run_id()

    intermediate_storage = ADLS2IntermediateStorage(
        adls2_client=get_adls2_client(storage_account),
        blob_client=get_blob_client(storage_account),
        run_id=run_id,
        file_system=file_system,
        type_storage_plugin_registry=TypeStoragePluginRegistry(
            [(RuntimeString, FancyStringS3TypeStoragePlugin)]
        ),
    )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        try:
            intermediate_storage.set_intermediate(
                context, RuntimeString, StepOutputHandle("obj_name"), "hello"
            )

            assert intermediate_storage.has_intermediate(context, StepOutputHandle("obj_name"))
            # The plugin path returns the deserialized value directly, so there
            # is no `.obj` access here, unlike the non-plugin tests.
            assert (
                intermediate_storage.get_intermediate(
                    context, RuntimeString, StepOutputHandle("obj_name")
                )
                == "hello"
            )
        finally:
            intermediate_storage.rm_intermediate(context, StepOutputHandle("obj_name"))

def test_successful_one_part_execute_plan(graphql_context, snapshot):
    instance = graphql_context.instance
    environment_dict = csv_hello_world_solids_config_fs_storage()
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=csv_hello_world, environment_dict=environment_dict
    )
    selector = infer_pipeline_selector(graphql_context, 'csv_hello_world')
    result = execute_dagster_graphql(
        graphql_context,
        EXECUTE_PLAN_QUERY,
        variables={
            'executionParams': {
                'selector': selector,
                'runConfigData': environment_dict,
                'stepKeys': ['sum_solid.compute'],
                'executionMetadata': {'runId': pipeline_run.run_id},
                'mode': 'default',
            }
        },
    )

    query_result = result.data['executePlan']

    assert query_result['__typename'] == 'ExecutePlanSuccess'
    assert query_result['pipeline']['name'] == 'csv_hello_world'
    assert query_result['hasFailures'] is False

    step_events = query_result['stepEvents']

    assert [se['__typename'] for se in step_events] == [
        'ExecutionStepStartEvent',
        'ExecutionStepInputEvent',
        'ExecutionStepOutputEvent',
        'ObjectStoreOperationEvent',
        'ExecutionStepSuccessEvent',
    ]

    assert step_events[1]['step']['key'] == 'sum_solid.compute'
    assert step_events[2]['outputName'] == 'result'

    expected_value_repr = (
        '''[OrderedDict([('num1', '1'), ('num2', '2'), ('sum', 3)]), '''
        '''OrderedDict([('num1', '3'), ('num2', '4'), ('sum', 7)])]'''
    )

    assert step_events[3]['step']['key'] == 'sum_solid.compute'
    assert step_events[4]['step']['key'] == 'sum_solid.compute'

    snapshot.assert_match(clean_log_messages(result.data))

    store = build_fs_intermediate_store(instance.intermediates_directory, pipeline_run.run_id)
    intermediates_manager = IntermediateStoreIntermediatesManager(store)
    assert intermediates_manager.has_intermediate(None, StepOutputHandle('sum_solid.compute'))
    assert (
        str(
            intermediates_manager.get_intermediate(
                None, PoorMansDataFrame, StepOutputHandle('sum_solid.compute')
            ).obj
        )
        == expected_value_repr
    )

def _create_step_events_for_output(step_context, output):
    check.inst_param(step_context, "step_context", SystemStepExecutionContext)
    check.inst_param(output, "output", Output)

    step = step_context.step
    step_output = step.step_output_named(output.output_name)

    version = resolve_step_output_versions(
        step_context.execution_plan, step_context.environment_config, step_context.mode_def,
    )[StepOutputHandle(step_context.step.key, output.output_name)]

    for output_event in _type_checked_step_output_event_sequence(step_context, output, version):
        yield output_event

    step_output_handle = StepOutputHandle.from_step(step=step, output_name=output.output_name)

    for evt in _set_intermediates(step_context, step_output, step_output_handle, output, version):
        yield evt

    for evt in _create_output_materializations(step_context, output.output_name, output.value):
        yield evt

def test_resolve_memoized_execution_plan_yes_stored_results():
    speculative_execution_plan = create_execution_plan(versioned_pipeline)
    step_output_handle = StepOutputHandle("versioned_solid_no_input.compute", "result")

    instance = DagsterInstance.ephemeral()
    instance.get_addresses_for_step_output_versions = mock.MagicMock(
        return_value={(versioned_pipeline.name, step_output_handle): "some_address"}
    )

    memoized_execution_plan = instance.resolve_memoized_execution_plan(
        speculative_execution_plan, run_config={}, mode="default"
    )

    assert memoized_execution_plan.step_keys_to_execute == ["versioned_solid_takes_input.compute"]

    expected_handle = StepOutputHandle(
        step_key="versioned_solid_no_input.compute", output_name="result"
    )

    assert memoized_execution_plan.step_dict["versioned_solid_takes_input.compute"].step_input_dict[
        "intput"
    ].addresses == {expected_handle: "some_address"}

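# The memoization tests assume a `versioned_pipeline` fixture. A minimal
# sketch consistent with the step keys asserted above and below; note the
# input parameter really is spelled "intput", matching the step_input_dict
# key in the test above. The version strings and solid bodies are
# illustrative assumptions:
@solid(version="42")
def versioned_solid_no_input(_):
    return 4


@solid(version="5")
def versioned_solid_takes_input(_, intput):
    return 2 * intput


@pipeline
def versioned_pipeline():
    versioned_solid_takes_input(versioned_solid_no_input())
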
def test_pipeline_step_key_subset_execution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs({"solids": {"add_one": {"inputs": {"num": {"value": 3}}}}})
    result = execute_pipeline(pipeline_def, run_config=run_config, instance=instance)

    assert result.success

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id
    )
    assert (
        intermediate_storage.get_intermediate(None, Int, StepOutputHandle("add_one.compute")).obj
        == 4
    )
    assert (
        intermediate_storage.get_intermediate(None, Int, StepOutputHandle("add_two.compute")).obj
        == 6
    )

    ## re-execute add_two
    pipeline_reexecution_result = reexecute_pipeline(
        pipeline_def,
        parent_run_id=result.run_id,
        run_config=run_config,
        step_keys_to_execute=["add_two.compute"],
        instance=instance,
    )

    assert pipeline_reexecution_result.success

    step_events = pipeline_reexecution_result.step_event_list
    assert step_events

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id
    )
    assert (
        intermediate_storage.get_intermediate(None, Int, StepOutputHandle("add_one.compute")).obj
        == 4
    )
    assert (
        intermediate_storage.get_intermediate(None, Int, StepOutputHandle("add_two.compute")).obj
        == 6
    )

    assert not get_step_output_event(step_events, "add_one.compute")
    assert get_step_output_event(step_events, "add_two.compute")

    with pytest.raises(
        DagsterExecutionStepNotFoundError, match="Execution plan does not contain step"
    ):
        reexecute_pipeline(
            pipeline_def,
            parent_run_id=result.run_id,
            run_config=run_config,
            step_keys_to_execute=["nope.compute"],
            instance=instance,
        )

def test_s3_intermediate_storage_with_type_storage_plugin(s3_bucket):
    run_id = make_new_run_id()

    intermediate_storage = S3IntermediateStorage(
        run_id=run_id,
        s3_bucket=s3_bucket,
        type_storage_plugin_registry=TypeStoragePluginRegistry(
            [(RuntimeString, FancyStringS3TypeStoragePlugin)]
        ),
    )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        try:
            intermediate_storage.set_intermediate(
                context, RuntimeString, StepOutputHandle("obj_name"), "hello"
            )

            assert intermediate_storage.has_intermediate(context, StepOutputHandle("obj_name"))
            assert (
                intermediate_storage.get_intermediate(
                    context, RuntimeString, StepOutputHandle("obj_name")
                )
                == "hello"
            )
        finally:
            intermediate_storage.rm_intermediate(context, StepOutputHandle("obj_name"))

def test_s3_intermediate_storage_with_custom_serializer(s3_bucket):
    run_id = make_new_run_id()

    intermediate_storage = S3IntermediateStorage(run_id=run_id, s3_bucket=s3_bucket)

    with yield_empty_pipeline_context(run_id=run_id) as context:
        try:
            intermediate_storage.set_intermediate(
                context, LowercaseString, StepOutputHandle("foo"), "foo"
            )

            assert (
                intermediate_storage.object_store.s3.get_object(
                    Bucket=intermediate_storage.object_store.bucket,
                    Key=os.path.join(intermediate_storage.root, "intermediates", "foo", "result"),
                )["Body"]
                .read()
                .decode("utf-8")
                == "FOO"
            )

            assert intermediate_storage.has_intermediate(context, StepOutputHandle("foo"))
            assert (
                intermediate_storage.get_intermediate(
                    context, LowercaseString, StepOutputHandle("foo")
                ).obj
                == "foo"
            )
        finally:
            intermediate_storage.rm_intermediate(context, StepOutputHandle("foo"))

def test_s3_pipeline_with_custom_prefix(s3_bucket):
    s3_prefix = "custom_prefix"

    pipe = define_inty_pipeline(should_throw=False)
    run_config = {"storage": {"s3": {"config": {"s3_bucket": s3_bucket, "s3_prefix": s3_prefix}}}}

    pipeline_run = PipelineRun(pipeline_name=pipe.name, run_config=run_config)
    instance = DagsterInstance.ephemeral()

    result = execute_pipeline(pipe, run_config=run_config)
    assert result.success

    execution_plan = create_execution_plan(pipe, run_config)
    with scoped_pipeline_context(execution_plan, run_config, pipeline_run, instance) as context:
        intermediates_manager = S3IntermediateStorage(
            run_id=result.run_id,
            s3_bucket=s3_bucket,
            s3_prefix=s3_prefix,
            s3_session=context.scoped_resources_builder.build(required_resource_keys={"s3"}).s3,
        )
        assert intermediates_manager.root == "/".join(["custom_prefix", "storage", result.run_id])
        assert (
            intermediates_manager.get_intermediate(
                context, Int, StepOutputHandle("return_one.compute")
            ).obj
            == 1
        )
        assert (
            intermediates_manager.get_intermediate(
                context, Int, StepOutputHandle("add_one.compute")
            ).obj
            == 2
        )

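# Several tests here call a `define_inty_pipeline` helper. A minimal sketch
# consistent with the step keys and values asserted above
# (return_one.compute == 1, add_one.compute == 2); the `should_throw` flag
# and the failing solid are assumptions inferred from the call sites:
def define_inty_pipeline(should_throw=True):
    @lambda_solid
    def return_one():
        return 1

    @lambda_solid(input_defs=[InputDefinition("num", Int)], output_def=OutputDefinition(Int))
    def add_one(num):
        return num + 1

    @lambda_solid
    def user_throw_exception():
        raise Exception("whoops")

    @pipeline
    def inty_pipeline():
        add_one(return_one())
        if should_throw:
            user_throw_exception()

    return inty_pipeline
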
def test_gcs_intermediate_storage_with_type_storage_plugin(gcs_bucket):
    run_id = make_new_run_id()

    intermediate_storage = GCSIntermediateStorage(
        run_id=run_id,
        gcs_bucket=gcs_bucket,
        type_storage_plugin_registry=TypeStoragePluginRegistry(
            [(RuntimeString, FancyStringGCSTypeStoragePlugin)]
        ),
    )

    obj_name = 'obj_name'

    with yield_empty_pipeline_context(run_id=run_id) as context:
        try:
            intermediate_storage.set_intermediate(
                context, RuntimeString, StepOutputHandle(obj_name), 'hello'
            )

            assert intermediate_storage.has_intermediate(context, StepOutputHandle(obj_name))
            assert (
                intermediate_storage.get_intermediate(
                    context, RuntimeString, StepOutputHandle(obj_name)
                )
                == 'hello'
            )
        finally:
            intermediate_storage.rm_intermediate(context, StepOutputHandle(obj_name))

def test_custom_read_write_mode(storage_account, file_system):
    run_id = make_new_run_id()
    data_frame = [OrderedDict({"foo": "1", "bar": "1"}), OrderedDict({"foo": "2", "bar": "2"})]

    # Construct the storage before the try block so the finally clause cannot
    # hit a NameError if context creation fails.
    intermediate_storage = ADLS2IntermediateStorage(
        adls2_client=get_adls2_client(storage_account),
        blob_client=get_blob_client(storage_account),
        run_id=run_id,
        file_system=file_system,
    )

    try:
        with yield_empty_pipeline_context(run_id=run_id) as context:
            intermediate_storage.set_intermediate(
                context,
                resolve_dagster_type(LessSimpleDataFrame),
                StepOutputHandle("data_frame"),
                data_frame,
            )

            assert intermediate_storage.has_intermediate(context, StepOutputHandle("data_frame"))
            assert (
                intermediate_storage.get_intermediate(
                    context,
                    resolve_dagster_type(LessSimpleDataFrame),
                    StepOutputHandle("data_frame"),
                ).obj
                == data_frame
            )
            assert intermediate_storage.uri_for_paths(["data_frame"]).startswith("abfss://")
    finally:
        intermediate_storage.rm_intermediate(context, StepOutputHandle("data_frame"))

def test_successful_one_part_execute_plan(graphql_context, snapshot):
    instance = graphql_context.instance
    run_config = csv_hello_world_solids_config_fs_storage()
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=csv_hello_world, run_config=run_config
    )
    selector = infer_pipeline_selector(graphql_context, "csv_hello_world")
    result = execute_dagster_graphql(
        graphql_context,
        EXECUTE_PLAN_QUERY,
        variables={
            "executionParams": {
                "selector": selector,
                "runConfigData": run_config,
                "stepKeys": ["sum_solid.compute"],
                "executionMetadata": {"runId": pipeline_run.run_id},
                "mode": "default",
            },
        },
    )

    query_result = result.data["executePlan"]

    assert query_result["__typename"] == "ExecutePlanSuccess"
    assert query_result["pipeline"]["name"] == "csv_hello_world"
    assert query_result["hasFailures"] is False

    step_events = query_result["stepEvents"]

    assert [se["__typename"] for se in step_events] == [
        "ExecutionStepStartEvent",
        "ExecutionStepInputEvent",
        "ExecutionStepOutputEvent",
        "ObjectStoreOperationEvent",
        "ExecutionStepSuccessEvent",
    ]

    assert step_events[1]["stepKey"] == "sum_solid.compute"
    assert step_events[2]["outputName"] == "result"

    expected_value_repr = (
        """[OrderedDict([('num1', '1'), ('num2', '2'), ('sum', 3)]), """
        """OrderedDict([('num1', '3'), ('num2', '4'), ('sum', 7)])]"""
    )

    assert step_events[3]["stepKey"] == "sum_solid.compute"
    assert step_events[4]["stepKey"] == "sum_solid.compute"

    snapshot.assert_match(clean_log_messages(result.data))

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, pipeline_run.run_id
    )
    assert intermediate_storage.has_intermediate(None, StepOutputHandle("sum_solid.compute"))
    assert (
        str(
            intermediate_storage.get_intermediate(
                None, PoorMansDataFrame, StepOutputHandle("sum_solid.compute")
            ).obj
        )
        == expected_value_repr
    )

def test_using_s3_for_subplan(s3_bucket):
    pipeline_def = define_inty_pipeline()

    run_config = {"storage": {"s3": {"config": {"s3_bucket": s3_bucket}}}}

    run_id = make_new_run_id()
    execution_plan = create_execution_plan(pipeline_def, run_config=run_config)

    assert execution_plan.get_step_by_key("return_one.compute")

    step_keys = ["return_one.compute"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name, run_id=run_id, run_config=run_config
    )

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, "return_one.compute")
    with scoped_pipeline_context(
        execution_plan.build_subset_plan(["return_one.compute"]),
        run_config,
        pipeline_run,
        instance,
    ) as context:
        intermediates_manager = S3IntermediateStorage(
            s3_bucket,
            run_id,
            s3_session=context.scoped_resources_builder.build(required_resource_keys={"s3"}).s3,
        )
        step_output_handle = StepOutputHandle("return_one.compute")
        assert intermediates_manager.has_intermediate(context, step_output_handle)
        assert intermediates_manager.get_intermediate(context, Int, step_output_handle).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one.compute"]),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(add_one_step_events, "add_one.compute")
    with scoped_pipeline_context(
        execution_plan.build_subset_plan(["add_one.compute"]),
        run_config,
        pipeline_run,
        instance,
    ) as context:
        step_output_handle = StepOutputHandle("add_one.compute")
        assert intermediates_manager.has_intermediate(context, step_output_handle)
        assert intermediates_manager.get_intermediate(context, Int, step_output_handle).obj == 2

def test_execution_plan_reexecution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    environment_dict = env_with_fs({'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}})
    result = execute_pipeline(pipeline_def, environment_dict=environment_dict, instance=instance)

    assert result.success

    intermediates_manager = IntermediateStoreIntermediatesManager(
        build_fs_intermediate_store(instance.intermediates_directory, result.run_id)
    )
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_one.compute')).obj
        == 4
    )
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_two.compute')).obj
        == 6
    )

    ## re-execute add_two
    execution_plan = create_execution_plan(pipeline_def, environment_dict=environment_dict)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def,
        execution_plan=execution_plan,
        environment_dict=environment_dict,
        parent_run_id=result.run_id,
        root_run_id=result.run_id,
    )

    step_events = execute_plan(
        execution_plan.build_subset_plan(['add_two.compute']),
        environment_dict=environment_dict,
        pipeline_run=pipeline_run,
        instance=instance,
    )

    intermediates_manager = IntermediateStoreIntermediatesManager(
        build_fs_intermediate_store(instance.intermediates_directory, result.run_id)
    )
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_one.compute')).obj
        == 4
    )
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_two.compute')).obj
        == 6
    )

    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')

def test_execution_plan_reexecution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs({"solids": {"add_one": {"inputs": {"num": {"value": 3}}}}})
    result = execute_pipeline(pipeline_def, run_config=run_config, instance=instance)

    assert result.success

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id
    )
    assert (
        intermediate_storage.get_intermediate(None, Int, StepOutputHandle("add_one.compute")).obj
        == 4
    )
    assert (
        intermediate_storage.get_intermediate(None, Int, StepOutputHandle("add_two.compute")).obj
        == 6
    )

    ## re-execute add_two
    execution_plan = create_execution_plan(pipeline_def, run_config=run_config)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def,
        execution_plan=execution_plan,
        run_config=run_config,
        parent_run_id=result.run_id,
        root_run_id=result.run_id,
    )

    step_events = execute_plan(
        execution_plan.build_subset_plan(["add_two.compute"]),
        run_config=run_config,
        pipeline_run=pipeline_run,
        instance=instance,
    )

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id
    )
    assert (
        intermediate_storage.get_intermediate(None, Int, StepOutputHandle("add_one.compute")).obj
        == 4
    )
    assert (
        intermediate_storage.get_intermediate(None, Int, StepOutputHandle("add_two.compute")).obj
        == 6
    )

    assert not get_step_output_event(step_events, "add_one.compute")
    assert get_step_output_event(step_events, "add_two.compute")

def test_adls2_pipeline_with_custom_prefix(storage_account, file_system):
    adls2_prefix = 'custom_prefix'

    pipe = define_inty_pipeline(should_throw=False)
    run_config = {
        'resources': {
            'adls2': {
                'config': {
                    'storage_account': storage_account,
                    'credential': get_azure_credential(),
                }
            }
        },
        'storage': {
            'adls2': {
                'config': {'adls2_file_system': file_system, 'adls2_prefix': adls2_prefix}
            }
        },
    }

    pipeline_run = PipelineRun(pipeline_name=pipe.name, run_config=run_config)
    instance = DagsterInstance.ephemeral()

    result = execute_pipeline(pipe, run_config=run_config)
    assert result.success

    execution_plan = create_execution_plan(pipe, run_config)
    with scoped_pipeline_context(execution_plan, run_config, pipeline_run, instance) as context:
        resource = context.scoped_resources_builder.build(required_resource_keys={'adls2'}).adls2
        store = ADLS2IntermediateStore(
            run_id=result.run_id,
            file_system=file_system,
            prefix=adls2_prefix,
            adls2_client=resource.adls2_client,
            blob_client=resource.blob_client,
        )
        intermediates_manager = IntermediateStoreIntermediatesManager(store)
        assert store.root == '/'.join(['custom_prefix', 'storage', result.run_id])
        assert (
            intermediates_manager.get_intermediate(
                context, Int, StepOutputHandle('return_one.compute')
            ).obj
            == 1
        )
        assert (
            intermediates_manager.get_intermediate(
                context, Int, StepOutputHandle('add_one.compute')
            ).obj
            == 2
        )

def test_resolve_step_output_versions_no_external_dependencies():
    speculative_execution_plan = create_execution_plan(versioned_pipeline)

    versions = resolve_step_output_versions_for_test(
        speculative_execution_plan, run_config={}, mode="default"
    )

    assert (
        versions[StepOutputHandle("versioned_solid_no_input.compute", "result")]
        == versioned_pipeline_expected_step1_output_version()
    )
    assert (
        versions[StepOutputHandle("versioned_solid_takes_input.compute", "result")]
        == versioned_pipeline_expected_step2_output_version()
    )

def test_using_intermediate_file_system_for_subplan_multiprocessing():
    with instance_for_test() as instance:
        run_config = {"intermediate_storage": {"filesystem": {}}}

        pipeline = reconstructable(define_inty_pipeline)

        execution_plan = create_execution_plan(pipeline, run_config=run_config)
        pipeline_run = instance.create_run_for_pipeline(
            pipeline_def=pipeline.get_definition(), execution_plan=execution_plan
        )

        assert execution_plan.get_step_by_key("return_one.compute")

        return_one_step_events = list(
            execute_plan(
                execution_plan.build_subset_plan(["return_one.compute"]),
                instance,
                run_config=dict(run_config, execution={"multiprocess": {}}),
                pipeline_run=pipeline_run,
            )
        )

        intermediate_storage = build_fs_intermediate_storage(
            instance.intermediates_directory, pipeline_run.run_id
        )
        assert get_step_output(return_one_step_events, "return_one.compute")
        assert intermediate_storage.has_intermediate(None, StepOutputHandle("return_one.compute"))
        assert (
            intermediate_storage.get_intermediate(
                None, Int, StepOutputHandle("return_one.compute")
            ).obj
            == 1
        )

        add_one_step_events = list(
            execute_plan(
                execution_plan.build_subset_plan(["add_one.compute"]),
                instance,
                run_config=dict(run_config, execution={"multiprocess": {}}),
                pipeline_run=pipeline_run,
            )
        )

        assert get_step_output(add_one_step_events, "add_one.compute")
        assert intermediate_storage.has_intermediate(None, StepOutputHandle("add_one.compute"))
        assert (
            intermediate_storage.get_intermediate(
                None, Int, StepOutputHandle("add_one.compute")
            ).obj
            == 2
        )

def test_file_system_intermediate_storage_composite_types():
    _, _, intermediate_storage = define_intermediate_storage()

    assert intermediate_storage.set_intermediate(
        None, List[Bool], StepOutputHandle('return_true_lst.compute'), [True]
    )

    assert intermediate_storage.has_intermediate(None, StepOutputHandle('return_true_lst.compute'))

    assert intermediate_storage.get_intermediate(
        None, List[Bool], StepOutputHandle('return_true_lst.compute')
    ).obj == [True]

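# The filesystem tests share a `define_intermediate_storage` helper. A
# minimal sketch based on how its return values are unpacked here and below
# (run_id, instance, intermediate_storage); the real helper may accept extra
# arguments such as a type_storage_plugin_registry:
def define_intermediate_storage():
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()
    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, run_id
    )
    return run_id, instance, intermediate_storage
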
def test_output_handles_from_execution_plan():
    execution_plan = create_execution_plan(
        define_pipeline(),
        run_config={"solids": {"add_one": {"inputs": {"num": {"value": 3}}}}},
    )

    assert output_handles_from_execution_plan(execution_plan) == set()

    assert output_handles_from_execution_plan(
        execution_plan.build_subset_plan(["add_two.compute", "add_three.compute"])
    ) == {StepOutputHandle("add_one.compute", "result")}

    assert output_handles_from_execution_plan(
        execution_plan.build_subset_plan(["add_three.compute"])
    ) == {StepOutputHandle("add_two.compute", "result")}

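# The helper under test collects the StepOutputHandles that a (subset) plan
# consumes from steps *outside* the plan, which is why the full plan yields
# the empty set above. A rough sketch of the idea; the ExecutionPlan and
# StepInput attribute names used here (step_input_dict, source_handles,
# step_keys_to_execute) are assumptions about this dagster version's
# internals:
def output_handles_from_execution_plan(execution_plan):
    output_handles = set()
    for step_key in execution_plan.step_keys_to_execute:
        step = execution_plan.get_step_by_key(step_key)
        for step_input in step.step_input_dict.values():
            for handle in step_input.source_handles:
                # Only handles produced by steps outside the subset count as
                # external dependencies of the plan.
                if handle.step_key not in execution_plan.step_keys_to_execute:
                    output_handles.add(handle)
    return output_handles
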
def test_using_file_system_for_subplan_multiprocessing():
    environment_dict = {'storage': {'filesystem': {}}}
    instance = DagsterInstance.local_temp()

    pipeline = reconstructable(define_inty_pipeline)

    execution_plan = create_execution_plan(pipeline, environment_dict=environment_dict)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline.get_definition(), execution_plan=execution_plan
    )

    assert execution_plan.get_step_by_key('return_one.compute')

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['return_one.compute']),
            instance,
            environment_dict=dict(environment_dict, execution={'multiprocess': {}}),
            pipeline_run=pipeline_run,
        )
    )

    store = build_fs_intermediate_store(instance.intermediates_directory, pipeline_run.run_id)
    intermediates_manager = IntermediateStoreIntermediatesManager(store)

    assert get_step_output(return_one_step_events, 'return_one.compute')
    assert intermediates_manager.has_intermediate(None, StepOutputHandle('return_one.compute'))
    assert (
        intermediates_manager.get_intermediate(
            None, Int, StepOutputHandle('return_one.compute')
        ).obj
        == 1
    )

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['add_one.compute']),
            instance,
            environment_dict=dict(environment_dict, execution={'multiprocess': {}}),
            pipeline_run=pipeline_run,
        )
    )

    assert get_step_output(add_one_step_events, 'add_one.compute')
    assert intermediates_manager.has_intermediate(None, StepOutputHandle('add_one.compute'))
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_one.compute')).obj
        == 2
    )

def test_address_operation_using_intermediates_file_system():
    with seven.TemporaryDirectory() as tmpdir_path:
        output_address = os.path.join(tmpdir_path, "solid1.output")
        output_value = 5

        instance = DagsterInstance.ephemeral()
        intermediate_storage = build_fs_intermediate_storage(
            instance.intermediates_directory, run_id="some_run_id"
        )

        object_operation_result = intermediate_storage.set_intermediate_to_address(
            context=None,
            dagster_type=Int,
            step_output_handle=StepOutputHandle("solid1.compute"),
            value=output_value,
            address=output_address,
        )
        assert object_operation_result.key == output_address
        assert object_operation_result.obj == output_value

        assert (
            intermediate_storage.get_intermediate_from_address(
                context=None,
                dagster_type=Int,
                step_output_handle=StepOutputHandle("solid1.compute"),
                address=output_address,
            ).obj
            == output_value
        )

        with pytest.raises(DagsterAddressIOError, match="No such file or directory"):
            intermediate_storage.set_intermediate_to_address(
                context=None,
                dagster_type=Int,
                step_output_handle=StepOutputHandle("solid1.compute"),
                value=1,
                address="invalid_address",
            )

        with pytest.raises(DagsterAddressIOError, match="No such file or directory"):
            intermediate_storage.get_intermediate_from_address(
                context=None,
                dagster_type=Int,
                step_output_handle=StepOutputHandle("solid1.compute"),
                address=os.path.join(tmpdir_path, "invalid.output"),
            )

def test_file_system_intermediate_storage_composite_types_with_custom_serializer_for_inner_type():
    run_id, instance, intermediate_storage = define_intermediate_storage()

    with yield_empty_pipeline_context(run_id=run_id, instance=instance) as context:
        intermediate_storage.set_intermediate(
            context,
            resolve_dagster_type(List[LowercaseString]),
            StepOutputHandle('baz'),
            ['list'],
        )

        assert intermediate_storage.has_intermediate(context, StepOutputHandle('baz'))
        assert intermediate_storage.get_intermediate(
            context, resolve_dagster_type(List[Bool]), StepOutputHandle('baz')
        ).obj == ['list']

def test_using_intermediates_to_override():
    pipeline = define_inty_pipeline()

    run_config = {'storage': {'filesystem': {}}, 'intermediate_storage': {'in_memory': {}}}

    instance = DagsterInstance.ephemeral()
    execution_plan = create_execution_plan(pipeline, run_config=run_config)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline, execution_plan=execution_plan
    )
    assert execution_plan.get_step_by_key('return_one.compute')

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['return_one.compute']),
            instance,
            run_config=run_config,
            pipeline_run=pipeline_run,
        )
    )

    store = build_fs_intermediate_store(instance.intermediates_directory, pipeline_run.run_id)
    intermediates_manager = IntermediateStoreIntermediatesManager(store)

    assert get_step_output(return_one_step_events, 'return_one.compute')
    # intermediate_storage.in_memory overrides storage.filesystem, so nothing
    # should have been written to the filesystem intermediate store.
    assert not intermediates_manager.has_intermediate(None, StepOutputHandle('return_one.compute'))

def test_addresses_for_version(version_storing_context):
    @solid(version="abc")
    def solid1(_):
        yield Output(5, address="some_address")

    @solid(version="123")
    def solid2(_, _input1):
        pass

    @pipeline
    def my_pipeline():
        solid2(solid1())

    with version_storing_context() as ctx:
        instance, _ = ctx
        execute_pipeline(instance=instance, pipeline=my_pipeline)

        step_output_handle = StepOutputHandle("solid1.compute", "result")
        output_version = resolve_step_output_versions(
            create_execution_plan(my_pipeline), run_config={}, mode="default"
        )[step_output_handle]

        assert instance.get_addresses_for_step_output_versions(
            {("my_pipeline", step_output_handle): output_version}
        ) == {("my_pipeline", step_output_handle): "some_address"}