def test_pipeline_step_key_subset_execution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs({'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}})
    result = execute_pipeline(pipeline_def, run_config=run_config, instance=instance)

    assert result.success

    intermediates_manager = IntermediateStoreIntermediatesManager(
        build_fs_intermediate_store(instance.intermediates_directory, result.run_id)
    )
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_one.compute')).obj
        == 4
    )
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_two.compute')).obj
        == 6
    )

    ## re-execute add_two
    pipeline_reexecution_result = reexecute_pipeline(
        pipeline_def,
        parent_run_id=result.run_id,
        run_config=run_config,
        step_keys_to_execute=['add_two.compute'],
        instance=instance,
    )

    assert pipeline_reexecution_result.success

    step_events = pipeline_reexecution_result.step_event_list
    assert step_events

    intermediates_manager = IntermediateStoreIntermediatesManager(
        build_fs_intermediate_store(instance.intermediates_directory, result.run_id)
    )
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_one.compute')).obj
        == 4
    )
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_two.compute')).obj
        == 6
    )

    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')

    with pytest.raises(
        DagsterExecutionStepNotFoundError, match='Execution plan does not contain step'
    ):
        reexecute_pipeline(
            pipeline_def,
            parent_run_id=result.run_id,
            run_config=run_config,
            step_keys_to_execute=['nope.compute'],
            instance=instance,
        )


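# The fixtures used throughout these tests (`define_addy_pipeline`,
# `env_with_fs`, `get_step_output_event`) are defined elsewhere in the test
# suite. Below is a minimal sketch of plausible definitions, assuming the
# legacy @lambda_solid / @pipeline API these tests target; treat it as
# illustrative, not as the actual fixtures. (Imports are assumed from the
# dagster version in use.)
# from dagster import InputDefinition, Int, OutputDefinition, lambda_solid, pipeline
# from dagster.core.events import DagsterEventType
# from dagster.utils import merge_dicts


@lambda_solid(input_defs=[InputDefinition('num', Int)], output_def=OutputDefinition(Int))
def add_one(num):
    return num + 1


@lambda_solid(input_defs=[InputDefinition('num', Int)], output_def=OutputDefinition(Int))
def add_two(num):
    return num + 2


@pipeline
def addy_pipeline():
    add_two(add_one())


def define_addy_pipeline():
    return addy_pipeline


def env_with_fs(environment_dict):
    # Assumed to layer filesystem intermediate storage onto the given config.
    return merge_dicts(environment_dict, {'storage': {'filesystem': {}}})


def get_step_output_event(events, step_key, output_name='result'):
    # Assumed to scan emitted events for the STEP_OUTPUT event of a given step.
    # The GraphQL tests later in this file appear to use a dict-based variant.
    for event in events:
        if (
            event.event_type == DagsterEventType.STEP_OUTPUT
            and event.step_key == step_key
            and event.step_output_data.output_name == output_name
        ):
            return event
    return None

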
def test_pipeline_step_key_subset_execution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    environment_dict = env_with_fs({'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}})
    result = execute_pipeline(pipeline_def, environment_dict=environment_dict, instance=instance)

    assert result.success

    store = build_fs_intermediate_store(instance.intermediates_directory, result.run_id)
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    ## re-execute add_two
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        step_keys_to_execute=['add_two.compute'],
        parent_run_id=result.run_id,
        root_run_id=result.run_id,
    )
    pipeline_reexecution_result = execute_run(pipeline_def, pipeline_run, instance)

    assert pipeline_reexecution_result.success

    step_events = pipeline_reexecution_result.step_event_list
    assert step_events

    store = build_fs_intermediate_store(
        instance.intermediates_directory, pipeline_reexecution_result.run_id
    )
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')

    with pytest.raises(
        DagsterExecutionStepNotFoundError, match='Execution plan does not contain step'
    ):
        pipeline_run = instance.create_run_for_pipeline(
            pipeline_def,
            environment_dict=environment_dict,
            step_keys_to_execute=['nope.compute'],
            parent_run_id=result.run_id,
            root_run_id=result.run_id,
        )


def test_execution_plan_reexecution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    environment_dict = env_with_fs({'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}})
    result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        instance=instance,
    )

    assert result.success

    intermediates_manager = IntermediateStoreIntermediatesManager(
        build_fs_intermediate_store(instance.intermediates_directory, result.run_id)
    )
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_one.compute')).obj
        == 4
    )
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_two.compute')).obj
        == 6
    )

    ## re-execute add_two
    execution_plan = create_execution_plan(pipeline_def, environment_dict=environment_dict)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def,
        execution_plan=execution_plan,
        environment_dict=environment_dict,
        parent_run_id=result.run_id,
        root_run_id=result.run_id,
    )

    step_events = execute_plan(
        execution_plan.build_subset_plan(['add_two.compute']),
        environment_dict=environment_dict,
        pipeline_run=pipeline_run,
        instance=instance,
    )

    intermediates_manager = IntermediateStoreIntermediatesManager(
        build_fs_intermediate_store(instance.intermediates_directory, result.run_id)
    )
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_one.compute')).obj
        == 4
    )
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_two.compute')).obj
        == 6
    )

    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')


def test_execution_plan_reexecution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    old_run_id = str(uuid.uuid4())
    environment_dict = env_with_fs({'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}})
    result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(run_id=old_run_id),
        instance=instance,
    )

    assert result.success

    store = build_fs_intermediate_store(instance.intermediates_directory, result.run_id)
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    ## re-execute add_two
    new_run_id = str(uuid.uuid4())
    run_config = RunConfig(
        run_id=new_run_id,
        reexecution_config=ReexecutionConfig(
            previous_run_id=result.run_id,
            step_output_handles=[StepOutputHandle('add_one.compute')],
        ),
    )

    execution_plan = create_execution_plan(
        pipeline_def, environment_dict=environment_dict, run_config=run_config
    )

    step_events = execute_plan(
        execution_plan,
        environment_dict=environment_dict,
        run_config=run_config,
        step_keys_to_execute=['add_two.compute'],
        instance=instance,
    )

    store = build_fs_intermediate_store(instance.intermediates_directory, new_run_id)
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')


def test_execution_plan_reexecution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    old_run_id = str(uuid.uuid4())
    environment_dict = env_with_fs({'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}})
    result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(run_id=old_run_id),
        instance=instance,
    )

    assert result.success

    store = build_fs_intermediate_store(instance.intermediates_directory, result.run_id)
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    ## re-execute add_two
    new_run_id = str(uuid.uuid4())
    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name,
        run_id=new_run_id,
        environment_dict=environment_dict,
        mode='default',
        previous_run_id=result.run_id,
    )

    execution_plan = create_execution_plan(
        pipeline_def, environment_dict=environment_dict, run_config=pipeline_run
    )

    step_events = execute_plan(
        execution_plan.build_subset_plan(['add_two.compute']),
        environment_dict=environment_dict,
        pipeline_run=pipeline_run,
        instance=instance,
    )

    store = build_fs_intermediate_store(instance.intermediates_directory, new_run_id)
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')


def test_pipeline_step_key_subset_execution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    old_run_id = make_new_run_id()
    environment_dict = env_with_fs({'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}})
    result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(run_id=old_run_id),
        instance=instance,
    )

    assert result.success

    store = build_fs_intermediate_store(instance.intermediates_directory, result.run_id)
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    ## re-execute add_two
    new_run_id = make_new_run_id()

    pipeline_reexecution_result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(
            run_id=new_run_id,
            previous_run_id=result.run_id,
            step_keys_to_execute=['add_two.compute'],
        ),
        instance=instance,
    )

    assert pipeline_reexecution_result.success

    step_events = pipeline_reexecution_result.step_event_list
    assert step_events

    store = build_fs_intermediate_store(instance.intermediates_directory, new_run_id)
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')


def test_file_system_intermediate_store_with_composite_type_storage_plugin():
    run_id = str(uuid.uuid4())

    # FIXME need a dedicated test bucket
    intermediate_store = build_fs_intermediate_store(
        DagsterInstance.ephemeral().intermediates_directory,
        run_id=run_id,
        type_storage_plugin_registry=TypeStoragePluginRegistry(
            {RuntimeString.inst(): FancyStringFilesystemTypeStoragePlugin}
        ),
    )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(
                ['hello'], context, resolve_to_runtime_type(List[String]), ['obj_name']
            )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(
                ['hello'], context, resolve_to_runtime_type(Optional[String]), ['obj_name']
            )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(
                ['hello'], context, resolve_to_runtime_type(List[Optional[String]]), ['obj_name']
            )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(
                ['hello'], context, resolve_to_runtime_type(Optional[List[String]]), ['obj_name']
            )


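# `FancyStringFilesystemTypeStoragePlugin` is defined elsewhere in the test
# suite. A plausible sketch of such a plugin, modeled on the TypeStoragePlugin
# interface these tests exercise (the method signatures and the directory-name
# trick are assumptions):
# import os
# from dagster.core.storage.type_storage import TypeStoragePlugin
# from dagster.utils import mkdir_p


class FancyStringFilesystemTypeStoragePlugin(TypeStoragePlugin):  # pylint: disable=abstract-method
    @classmethod
    def compatible_with_storage_def(cls, _):
        return True

    @classmethod
    def set_object(cls, intermediate_store, obj, context, dagster_type, paths):
        # Store the string as a directory name rather than as file contents.
        paths.append(obj)
        mkdir_p(os.path.join(intermediate_store.root, *paths))

    @classmethod
    def get_object(cls, intermediate_store, context, dagster_type, paths):
        # Recover the string by listing the directory created above.
        return os.listdir(os.path.join(intermediate_store.root, *paths))[0]

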
def test_using_intermediates_to_override():
    pipeline = define_inty_pipeline()

    run_config = {'storage': {'filesystem': {}}, 'intermediate_storage': {'in_memory': {}}}

    instance = DagsterInstance.ephemeral()
    execution_plan = create_execution_plan(
        pipeline,
        run_config=run_config,
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline, execution_plan=execution_plan
    )
    assert execution_plan.get_step_by_key('return_one.compute')

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['return_one.compute']),
            instance,
            run_config=run_config,
            pipeline_run=pipeline_run,
        )
    )

    store = build_fs_intermediate_store(instance.intermediates_directory, pipeline_run.run_id)
    intermediates_manager = IntermediateStoreIntermediatesManager(store)
    assert get_step_output(return_one_step_events, 'return_one.compute')
    assert not intermediates_manager.has_intermediate(None, StepOutputHandle('return_one.compute'))


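# `define_inty_pipeline` and `get_step_output` are likewise shared fixtures.
# A minimal sketch of what they presumably look like, reusing the `add_one`
# solid sketched earlier; illustrative only.


@lambda_solid
def return_one():
    return 1


@pipeline
def inty_pipeline():
    add_one(return_one())


def define_inty_pipeline():
    return inty_pipeline


def get_step_output(step_events, step_key, output_name='result'):
    # Same scan as get_step_output_event, over raw plan-execution events.
    for step_event in step_events:
        if (
            step_event.event_type == DagsterEventType.STEP_OUTPUT
            and step_event.step_key == step_key
            and step_event.step_output_data.output_name == output_name
        ):
            return step_event
    return None

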
def test_file_system_intermediate_store_with_composite_type_storage_plugin():
    run_id = make_new_run_id()

    intermediate_store = build_fs_intermediate_store(
        DagsterInstance.ephemeral().intermediates_directory,
        run_id=run_id,
        type_storage_plugin_registry=TypeStoragePluginRegistry(
            [(RuntimeString, FancyStringFilesystemTypeStoragePlugin)]
        ),
    )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(
                ['hello'], context, resolve_dagster_type(List[String]), ['obj_name']
            )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(
                ['hello'], context, resolve_dagster_type(Optional[String]), ['obj_name']
            )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(
                ['hello'], context, resolve_dagster_type(List[Optional[String]]), ['obj_name']
            )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(
                ['hello'], context, resolve_dagster_type(Optional[List[String]]), ['obj_name']
            )


def test_successful_one_part_execute_plan(graphql_context, snapshot):
    instance = graphql_context.instance
    environment_dict = csv_hello_world_solids_config_fs_storage()
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=csv_hello_world, environment_dict=environment_dict
    )
    selector = get_legacy_pipeline_selector(graphql_context, 'csv_hello_world')

    result = execute_dagster_graphql(
        graphql_context,
        EXECUTE_PLAN_QUERY,
        variables={
            'executionParams': {
                'selector': selector,
                'runConfigData': environment_dict,
                'stepKeys': ['sum_solid.compute'],
                'executionMetadata': {'runId': pipeline_run.run_id},
                'mode': 'default',
            }
        },
    )

    query_result = result.data['executePlan']

    assert query_result['__typename'] == 'ExecutePlanSuccess'
    assert query_result['pipeline']['name'] == 'csv_hello_world'
    assert query_result['hasFailures'] is False

    step_events = query_result['stepEvents']

    assert [se['__typename'] for se in step_events] == [
        'ExecutionStepStartEvent',
        'ExecutionStepInputEvent',
        'ExecutionStepOutputEvent',
        'ObjectStoreOperationEvent',
        'ExecutionStepSuccessEvent',
    ]

    assert step_events[1]['step']['key'] == 'sum_solid.compute'
    assert step_events[2]['outputName'] == 'result'

    expected_value_repr = (
        '''[OrderedDict([('num1', '1'), ('num2', '2'), ('sum', 3)]), '''
        '''OrderedDict([('num1', '3'), ('num2', '4'), ('sum', 7)])]'''
    )

    assert step_events[3]['step']['key'] == 'sum_solid.compute'
    assert step_events[4]['step']['key'] == 'sum_solid.compute'

    snapshot.assert_match(clean_log_messages(result.data))

    store = build_fs_intermediate_store(instance.intermediates_directory, pipeline_run.run_id)
    intermediates_manager = IntermediateStoreIntermediatesManager(store)
    assert intermediates_manager.has_intermediate(None, StepOutputHandle('sum_solid.compute'))
    assert (
        str(
            intermediates_manager.get_intermediate(
                None, PoorMansDataFrame, StepOutputHandle('sum_solid.compute')
            ).obj
        )
        == expected_value_repr
    )


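# `PoorMansDataFrame`, used above and in the reexecution tests below, comes
# from the shared GraphQL test setup: a list of OrderedDicts hydrated from
# CSV. A rough sketch under that assumption (import paths and exact decorator
# names vary by dagster version):
# import csv
# from collections import OrderedDict
# from dagster import Path, as_dagster_type, input_hydration_config


class PoorMansDataFrame_(list):
    pass


@input_hydration_config(Path)
def df_input_schema(_context, path):
    # Hydrate the type from a CSV file path given in solid config.
    with open(path, 'r') as fd:
        return PoorMansDataFrame_(
            [OrderedDict(sorted(x.items(), key=lambda x: x[0])) for x in csv.DictReader(fd)]
        )


PoorMansDataFrame = as_dagster_type(PoorMansDataFrame_, input_hydration_config=df_input_schema)

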
def test_successful_one_part_execute_plan(snapshot):
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()
    instance.create_empty_run(run_id, 'csv_hello_world')
    result = execute_dagster_graphql(
        define_test_context(instance=instance),
        EXECUTE_PLAN_QUERY,
        variables={
            'executionParams': {
                'selector': {'name': 'csv_hello_world'},
                'environmentConfigData': csv_hello_world_solids_config_fs_storage(),
                'stepKeys': ['sum_solid.compute'],
                'executionMetadata': {'runId': run_id},
                'mode': 'default',
            }
        },
    )

    query_result = result.data['executePlan']

    assert query_result['__typename'] == 'ExecutePlanSuccess'
    assert query_result['pipeline']['name'] == 'csv_hello_world'
    assert query_result['hasFailures'] is False

    step_events = query_result['stepEvents']

    assert [se['__typename'] for se in step_events] == [
        'EngineEvent',
        'ExecutionStepStartEvent',
        'ExecutionStepInputEvent',
        'ExecutionStepOutputEvent',
        'ObjectStoreOperationEvent',
        'ExecutionStepSuccessEvent',
        'EngineEvent',
    ]

    assert step_events[1]['step']['key'] == 'sum_solid.compute'
    assert step_events[3]['outputName'] == 'result'

    expected_value_repr = (
        '''[OrderedDict([('num1', '1'), ('num2', '2'), ('sum', 3)]), '''
        '''OrderedDict([('num1', '3'), ('num2', '4'), ('sum', 7)])]'''
    )

    assert step_events[4]['step']['key'] == 'sum_solid.compute'
    assert step_events[5]['step']['key'] == 'sum_solid.compute'

    snapshot.assert_match(clean_log_messages(result.data))

    store = build_fs_intermediate_store(instance.intermediates_directory, run_id)
    assert store.has_intermediate(None, 'sum_solid.compute')
    assert (
        str(store.get_intermediate(None, 'sum_solid.compute', PoorMansDataFrame).obj)
        == expected_value_repr
    )


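# `csv_hello_world` and its config helpers also live in the shared test
# setup. Roughly, under the same assumptions as above (the CSV path and
# helper names here are hypothetical):
# from copy import deepcopy
# from dagster import InputDefinition, OutputDefinition, solid
# from dagster.utils import file_relative_path, merge_dicts


@solid(
    input_defs=[InputDefinition('num', PoorMansDataFrame)],
    output_defs=[OutputDefinition(PoorMansDataFrame)],
)
def sum_solid(_, num):
    sum_df = deepcopy(num)
    for x in sum_df:
        x['sum'] = int(x['num1']) + int(x['num2'])
    return sum_df


@solid(
    input_defs=[InputDefinition('sum_df', PoorMansDataFrame)],
    output_defs=[OutputDefinition(PoorMansDataFrame)],
)
def sum_sq_solid(_, sum_df):
    sum_sq_df = deepcopy(sum_df)
    for x in sum_sq_df:
        x['sum_sq'] = x['sum'] ** 2
    return sum_sq_df


@pipeline
def csv_hello_world():
    sum_sq_solid(sum_df=sum_solid())


def csv_hello_world_solids_config():
    return {
        'solids': {'sum_solid': {'inputs': {'num': file_relative_path(__file__, 'num.csv')}}}
    }


def csv_hello_world_solids_config_fs_storage():
    return merge_dicts(csv_hello_world_solids_config(), {'storage': {'filesystem': {}}})

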
def test_using_file_system_for_subplan_multiprocessing():
    environment_dict = {'storage': {'filesystem': {}}}
    instance = DagsterInstance.local_temp()

    pipeline = reconstructable(define_inty_pipeline)

    execution_plan = create_execution_plan(pipeline, environment_dict=environment_dict)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline.get_definition(), execution_plan=execution_plan
    )

    assert execution_plan.get_step_by_key('return_one.compute')

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['return_one.compute']),
            instance,
            environment_dict=dict(environment_dict, execution={'multiprocess': {}}),
            pipeline_run=pipeline_run,
        )
    )

    store = build_fs_intermediate_store(instance.intermediates_directory, pipeline_run.run_id)
    intermediates_manager = IntermediateStoreIntermediatesManager(store)

    assert get_step_output(return_one_step_events, 'return_one.compute')
    assert intermediates_manager.has_intermediate(None, StepOutputHandle('return_one.compute'))
    assert (
        intermediates_manager.get_intermediate(
            None, Int, StepOutputHandle('return_one.compute')
        ).obj
        == 1
    )

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['add_one.compute']),
            instance,
            environment_dict=dict(environment_dict, execution={'multiprocess': {}}),
            pipeline_run=pipeline_run,
        )
    )

    assert get_step_output(add_one_step_events, 'add_one.compute')
    assert intermediates_manager.has_intermediate(None, StepOutputHandle('add_one.compute'))
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_one.compute')).obj
        == 2
    )


def test_file_system_intermediate_store():
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()

    intermediate_store = build_fs_intermediate_store(
        instance.intermediates_directory, run_id=run_id
    )

    with yield_empty_pipeline_context(run_id=run_id, instance=instance) as context:
        intermediate_store.set_object(True, context, RuntimeBool, ['true'])

        assert intermediate_store.has_object(context, ['true'])
        assert intermediate_store.get_object(context, RuntimeBool, ['true']).obj is True
        assert intermediate_store.uri_for_paths(['true']).startswith('file:///')
        assert intermediate_store.rm_object(context, ['true']) is None
        # rm_object is idempotent: removing an already-removed or nonexistent
        # object is a no-op
        assert intermediate_store.rm_object(context, ['true']) is None
        assert intermediate_store.rm_object(context, ['dslkfhjsdflkjfs']) is None


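# Shared utilities for these store tests, not shown in this section.
# Plausible sketches; the context helper in particular varies across dagster
# versions, so treat the internals as assumptions:
# import uuid
# from contextlib import contextmanager
# from dagster import PipelineDefinition
# from dagster.core.execution.context_creation_pipeline import scoped_pipeline_context


def make_new_run_id():
    return str(uuid.uuid4())


@contextmanager
def yield_empty_pipeline_context(run_id=None, instance=None):
    # Build a throwaway execution context for an empty pipeline so the store
    # has something to log object-store operations against.
    pipeline_def = PipelineDefinition(name='empty', solid_defs=[])
    instance = instance or DagsterInstance.ephemeral()
    pipeline_run = instance.create_run_for_pipeline(pipeline_def=pipeline_def, run_id=run_id)
    with scoped_pipeline_context(pipeline_def, {}, pipeline_run, instance) as context:
        yield context

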
def test_using_file_system_for_subplan_multiprocessing():
    environment_dict = {'storage': {'filesystem': {}}}
    instance = DagsterInstance.local_temp()

    execution_plan = create_execution_plan(
        ExecutionTargetHandle.for_pipeline_fn(define_inty_pipeline).build_pipeline_definition(),
        environment_dict=environment_dict,
    )

    assert execution_plan.get_step_by_key('return_one.compute')

    step_keys = ['return_one.compute']
    run_id = str(uuid.uuid4())
    instance.create_empty_run(run_id, execution_plan.pipeline_def.name)

    return_one_step_events = list(
        execute_plan(
            execution_plan,
            instance,
            environment_dict=dict(environment_dict, execution={'multiprocess': {}}),
            run_config=RunConfig(run_id=run_id),
            step_keys_to_execute=step_keys,
        )
    )

    store = build_fs_intermediate_store(instance.intermediates_directory, run_id)

    assert get_step_output(return_one_step_events, 'return_one.compute')
    assert store.has_intermediate(None, 'return_one.compute')
    assert store.get_intermediate(None, 'return_one.compute', Int).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan,
            instance,
            environment_dict=dict(environment_dict, execution={'multiprocess': {}}),
            run_config=RunConfig(run_id=run_id),
            step_keys_to_execute=['add_one.compute'],
        )
    )

    assert get_step_output(add_one_step_events, 'add_one.compute')
    assert store.has_intermediate(None, 'add_one.compute')
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 2


def test_file_system_intermediate_store_composite_types():
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()

    intermediate_store = build_fs_intermediate_store(
        instance.intermediates_directory, run_id=run_id
    )

    with yield_empty_pipeline_context(instance=instance, run_id=run_id) as context:
        intermediate_store.set_object(
            [True, False], context, resolve_dagster_type(List[Bool]), ['bool']
        )

        assert intermediate_store.has_object(context, ['bool'])
        assert intermediate_store.get_object(
            context, resolve_dagster_type(List[Bool]), ['bool']
        ).obj == [True, False]


def test_file_system_intermediate_store_with_custom_serializer():
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()

    intermediate_store = build_fs_intermediate_store(
        instance.intermediates_directory, run_id=run_id
    )

    with yield_empty_pipeline_context(run_id=run_id, instance=instance) as context:
        intermediate_store.set_object('foo', context, LowercaseString, ['foo'])

        with open(os.path.join(intermediate_store.root, 'foo'), 'rb') as fd:
            assert fd.read().decode('utf-8') == 'FOO'

        assert intermediate_store.has_object(context, ['foo'])
        assert intermediate_store.get_object(context, LowercaseString, ['foo']).obj == 'foo'


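# `LowercaseString` is defined elsewhere. Judging by the assertions above, it
# is a dagster type whose serialization strategy uppercases values on write
# and lowercases them on read. A plausible sketch (names and exact base-class
# signatures are assumptions; import paths vary by version):
# from dagster.core.types.dagster_type import create_any_type
# from dagster.core.types.marshal import SerializationStrategy


class UppercaseSerializationStrategy(SerializationStrategy):
    def serialize(self, value, write_file_obj):
        # Written bytes are uppercased, which is what the open()-based
        # assertion in the test above checks for.
        write_file_obj.write(value.upper().encode('utf-8'))

    def deserialize(self, read_file_obj):
        return read_file_obj.read().decode('utf-8').lower()


LowercaseString = create_any_type(
    'LowercaseString',
    serialization_strategy=UppercaseSerializationStrategy('uppercase'),
)

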
def test_file_system_intermediate_store_composite_types_with_custom_serializer_for_inner_type():
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()

    intermediate_store = build_fs_intermediate_store(
        instance.intermediates_directory, run_id=run_id
    )

    with yield_empty_pipeline_context(run_id=run_id, instance=instance) as context:
        intermediate_store.set_object(
            ['foo', 'bar'], context, resolve_dagster_type(List[LowercaseString]), ['list']
        )
        assert intermediate_store.has_object(context, ['list'])
        assert intermediate_store.get_object(
            context, resolve_dagster_type(List[Bool]), ['list']
        ).obj == ['foo', 'bar']


def test_spark_data_frame_serialization_file_system_file_handle(spark_config):
    @solid
    def nonce(_):
        return LocalFileHandle(file_relative_path(__file__, 'data/test.csv'))

    @pipeline(mode_defs=[spark_mode])
    def spark_df_test_pipeline():
        ingest_csv_file_handle_to_spark(nonce())

    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()
    intermediate_store = build_fs_intermediate_store(
        instance.intermediates_directory, run_id=run_id
    )

    result = execute_pipeline(
        spark_df_test_pipeline,
        run_config=RunConfig(run_id=run_id, mode='spark'),
        environment_dict={
            'storage': {'filesystem': {}},
            'resources': {'spark': {'config': {'spark_conf': spark_config}}},
        },
        instance=instance,
    )

    assert result.success

    result_dir = os.path.join(
        intermediate_store.root,
        'intermediates',
        'ingest_csv_file_handle_to_spark.compute',
        'result',
    )

    assert '_SUCCESS' in os.listdir(result_dir)

    spark = SparkSession.builder.getOrCreate()
    df = spark.read.parquet(result_dir)
    assert isinstance(df, pyspark.sql.dataframe.DataFrame)
    assert df.head()[0] == '1'


def test_success_whole_execution_plan_with_in_memory_config(graphql_context, snapshot):
    instance = graphql_context.instance
    environment_dict = merge_dicts(csv_hello_world_solids_config(), {'storage': {'in_memory': {}}})
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=csv_hello_world, environment_dict=environment_dict
    )
    result = execute_dagster_graphql(
        graphql_context,
        EXECUTE_PLAN_QUERY,
        variables={
            'executionParams': {
                'selector': {'name': 'csv_hello_world'},
                'environmentConfigData': environment_dict,
                'stepKeys': None,
                'executionMetadata': {'runId': pipeline_run.run_id},
                'mode': 'default',
            }
        },
    )

    query_result = result.data['executePlan']

    assert query_result['__typename'] == 'ExecutePlanSuccess'
    assert query_result['pipeline']['name'] == 'csv_hello_world'
    assert query_result['hasFailures'] is False
    step_events = {
        step_event['step']['key']: step_event
        for step_event in query_result['stepEvents']
        if step_event['step']
    }
    assert 'sum_solid.compute' in step_events
    assert 'sum_sq_solid.compute' in step_events

    snapshot.assert_match(clean_log_messages(result.data))

    store = build_fs_intermediate_store(instance.intermediates_directory, pipeline_run.run_id)
    intermediates_manager = IntermediateStoreIntermediatesManager(store)
    assert not intermediates_manager.has_intermediate(None, StepOutputHandle('sum_solid.compute'))
    assert not intermediates_manager.has_intermediate(
        None, StepOutputHandle('sum_sq_solid.compute')
    )


def test_success_whole_execution_plan_with_in_memory_config(snapshot):
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()
    instance.create_empty_run(run_id, 'csv_hello_world')
    result = execute_dagster_graphql(
        define_test_context(instance=instance),
        EXECUTE_PLAN_QUERY,
        variables={
            'executionParams': {
                'selector': {'name': 'csv_hello_world'},
                'environmentConfigData': merge_dicts(
                    csv_hello_world_solids_config(), {'storage': {'in_memory': {}}}
                ),
                'stepKeys': None,
                'executionMetadata': {'runId': run_id},
                'mode': 'default',
            }
        },
    )

    query_result = result.data['executePlan']

    assert query_result['__typename'] == 'ExecutePlanSuccess'
    assert query_result['pipeline']['name'] == 'csv_hello_world'
    assert query_result['hasFailures'] is False
    step_events = {
        step_event['step']['key']: step_event
        for step_event in query_result['stepEvents']
        if step_event['step']
    }
    assert 'sum_solid.compute' in step_events
    assert 'sum_sq_solid.compute' in step_events

    snapshot.assert_match(clean_log_messages(result.data))

    store = build_fs_intermediate_store(instance.intermediates_directory, run_id)
    assert not store.has_intermediate(None, 'sum_solid.compute')
    assert not store.has_intermediate(None, 'sum_sq_solid.compute')


def test_using_file_system_for_subplan_multiprocessing():
    environment_dict = {'storage': {'filesystem': {}}}
    instance = DagsterInstance.local_temp()

    pipeline_def = ExecutionTargetHandle.for_pipeline_fn(
        define_inty_pipeline
    ).build_pipeline_definition()

    execution_plan = create_execution_plan(pipeline_def, environment_dict=environment_dict)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline=pipeline_def, execution_plan=execution_plan
    )

    assert execution_plan.get_step_by_key('return_one.compute')

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['return_one.compute']),
            instance,
            environment_dict=dict(environment_dict, execution={'multiprocess': {}}),
            pipeline_run=pipeline_run,
        )
    )

    store = build_fs_intermediate_store(instance.intermediates_directory, pipeline_run.run_id)

    assert get_step_output(return_one_step_events, 'return_one.compute')
    assert store.has_intermediate(None, 'return_one.compute')
    assert store.get_intermediate(None, 'return_one.compute', Int).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['add_one.compute']),
            instance,
            environment_dict=dict(environment_dict, execution={'multiprocess': {}}),
            pipeline_run=pipeline_run,
        )
    )

    assert get_step_output(add_one_step_events, 'add_one.compute')
    assert store.has_intermediate(None, 'add_one.compute')
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 2


def test_using_file_system_for_subplan():
    pipeline = define_inty_pipeline()
    environment_dict = {'storage': {'filesystem': {}}}

    execution_plan = create_execution_plan(pipeline, environment_dict=environment_dict)
    instance = DagsterInstance.ephemeral()

    assert execution_plan.get_step_by_key('return_one.compute')

    step_keys = ['return_one.compute']
    run_id = str(uuid.uuid4())

    return_one_step_events = list(
        execute_plan(
            execution_plan,
            instance,
            environment_dict=environment_dict,
            run_config=RunConfig(run_id=run_id),
            step_keys_to_execute=step_keys,
        )
    )

    store = build_fs_intermediate_store(instance.intermediates_directory, run_id)

    assert get_step_output(return_one_step_events, 'return_one.compute')
    assert store.has_intermediate(None, 'return_one.compute')
    assert store.get_intermediate(None, 'return_one.compute', Int).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan,
            instance,
            environment_dict=environment_dict,
            run_config=RunConfig(run_id=run_id),
            step_keys_to_execute=['add_one.compute'],
        )
    )

    assert get_step_output(add_one_step_events, 'add_one.compute')
    assert store.has_intermediate(None, 'add_one.compute')
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 2


def test_file_system_intermediate_store_with_type_storage_plugin():
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()

    intermediate_store = build_fs_intermediate_store(
        instance.intermediates_directory,
        run_id=run_id,
        type_storage_plugin_registry=TypeStoragePluginRegistry(
            [(RuntimeString, FancyStringFilesystemTypeStoragePlugin)]
        ),
    )

    with yield_empty_pipeline_context(run_id=run_id, instance=instance) as context:
        try:
            intermediate_store.set_value('hello', context, RuntimeString, ['obj_name'])

            assert intermediate_store.has_object(context, ['obj_name'])
            assert intermediate_store.get_value(context, RuntimeString, ['obj_name']) == 'hello'

        finally:
            intermediate_store.rm_object(context, ['obj_name'])


def test_success_whole_execution_plan(snapshot):
    instance = DagsterInstance.local_temp()
    environment_dict = csv_hello_world_solids_config_fs_storage()
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=csv_hello_world, environment_dict=environment_dict
    )
    result = execute_dagster_graphql(
        define_test_context(instance=instance),
        EXECUTE_PLAN_QUERY,
        variables={
            'executionParams': {
                'selector': {'name': 'csv_hello_world'},
                'environmentConfigData': environment_dict,
                'stepKeys': None,
                'executionMetadata': {'runId': pipeline_run.run_id},
                'mode': 'default',
            }
        },
    )

    query_result = result.data['executePlan']

    assert query_result['__typename'] == 'ExecutePlanSuccess'
    assert query_result['pipeline']['name'] == 'csv_hello_world'
    assert query_result['hasFailures'] is False
    step_events = {
        step_event['step']['key']: step_event
        for step_event in query_result['stepEvents']
        if step_event['step']
    }
    assert 'sum_solid.compute' in step_events
    assert 'sum_sq_solid.compute' in step_events

    snapshot.assert_match(clean_log_messages(result.data))

    store = build_fs_intermediate_store(instance.intermediates_directory, pipeline_run.run_id)
    assert store.has_intermediate(None, 'sum_solid.compute')
    assert store.has_intermediate(None, 'sum_sq_solid.compute')


def test_using_file_system_for_subplan():
    pipeline = define_inty_pipeline()
    environment_dict = {'storage': {'filesystem': {}}}
    instance = DagsterInstance.ephemeral()

    execution_plan = create_execution_plan(
        pipeline,
        environment_dict=environment_dict,
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline, execution_plan=execution_plan
    )

    assert execution_plan.get_step_by_key('return_one.compute')

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['return_one.compute']),
            instance,
            environment_dict=environment_dict,
            pipeline_run=pipeline_run,
        )
    )

    store = build_fs_intermediate_store(instance.intermediates_directory, pipeline_run.run_id)

    assert get_step_output(return_one_step_events, 'return_one.compute')
    assert store.has_intermediate(None, 'return_one.compute')
    assert store.get_intermediate(None, 'return_one.compute', Int).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['add_one.compute']),
            instance,
            environment_dict=environment_dict,
            pipeline_run=pipeline_run,
        )
    )

    assert get_step_output(add_one_step_events, 'add_one.compute')
    assert store.has_intermediate(None, 'add_one.compute')
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 2


def test_file_system_intermediate_store_with_type_storage_plugin():
    run_id = str(uuid.uuid4())
    instance = DagsterInstance.ephemeral()

    # FIXME need a dedicated test bucket
    intermediate_store = build_fs_intermediate_store(
        instance.intermediates_directory,
        run_id=run_id,
        type_storage_plugin_registry=TypeStoragePluginRegistry(
            {RuntimeString.inst(): FancyStringFilesystemTypeStoragePlugin}
        ),
    )

    with yield_empty_pipeline_context(run_id=run_id, instance=instance) as context:
        try:
            intermediate_store.set_value('hello', context, RuntimeString.inst(), ['obj_name'])

            assert intermediate_store.has_object(context, ['obj_name'])
            assert (
                intermediate_store.get_value(context, RuntimeString.inst(), ['obj_name'])
                == 'hello'
            )

        finally:
            intermediate_store.rm_object(context, ['obj_name'])


def test_successful_pipeline_reexecution(self, graphql_context):
    selector = get_legacy_pipeline_selector(graphql_context, 'csv_hello_world')
    run_id = make_new_run_id()
    result_one = execute_dagster_graphql_and_finish_runs(
        graphql_context,
        START_PIPELINE_EXECUTION_SNAPSHOT_QUERY,
        variables={
            'executionParams': {
                'selector': selector,
                'runConfigData': csv_hello_world_solids_config_fs_storage(),
                'executionMetadata': {'runId': run_id},
                'mode': 'default',
            }
        },
    )

    assert result_one.data['startPipelineExecution']['__typename'] == 'StartPipelineRunSuccess'

    expected_value_repr = (
        '''[OrderedDict([('num1', '1'), ('num2', '2'), ('sum', 3), '''
        '''('sum_sq', 9)]), OrderedDict([('num1', '3'), ('num2', '4'), ('sum', 7), '''
        '''('sum_sq', 49)])]'''
    )

    instance = graphql_context.instance
    store = build_fs_intermediate_store(instance.intermediates_directory, run_id)
    intermediates_manager = IntermediateStoreIntermediatesManager(store)
    assert intermediates_manager.has_intermediate(None, StepOutputHandle('sum_solid.compute'))
    assert intermediates_manager.has_intermediate(None, StepOutputHandle('sum_sq_solid.compute'))
    assert (
        str(
            intermediates_manager.get_intermediate(
                None, PoorMansDataFrame, StepOutputHandle('sum_sq_solid.compute')
            ).obj
        )
        == expected_value_repr
    )

    # retry
    new_run_id = make_new_run_id()

    result_two = execute_dagster_graphql_and_finish_runs(
        graphql_context,
        START_PIPELINE_REEXECUTION_SNAPSHOT_QUERY,
        variables={
            'executionParams': {
                'selector': selector,
                'runConfigData': csv_hello_world_solids_config_fs_storage(),
                'stepKeys': ['sum_sq_solid.compute'],
                'executionMetadata': {
                    'runId': new_run_id,
                    'rootRunId': run_id,
                    'parentRunId': run_id,
                    'tags': [{'key': RESUME_RETRY_TAG, 'value': 'true'}],
                },
                'mode': 'default',
            }
        },
    )

    query_result = result_two.data['startPipelineReexecution']
    assert query_result['__typename'] == 'StartPipelineRunSuccess'

    result = get_all_logs_for_finished_run_via_subscription(graphql_context, new_run_id)
    logs = result['pipelineRunLogs']['messages']

    assert isinstance(logs, list)
    assert has_event_of_type(logs, 'PipelineStartEvent')
    assert has_event_of_type(logs, 'PipelineSuccessEvent')
    assert not has_event_of_type(logs, 'PipelineFailureEvent')

    assert not get_step_output_event(logs, 'sum_solid.compute')
    assert get_step_output_event(logs, 'sum_sq_solid.compute')

    store = build_fs_intermediate_store(instance.intermediates_directory, new_run_id)
    intermediates_manager = IntermediateStoreIntermediatesManager(store)
    assert not intermediates_manager.has_intermediate(
        None, StepOutputHandle('sum_solid.inputs.num.read', 'input_thunk_output')
    )
    assert intermediates_manager.has_intermediate(None, StepOutputHandle('sum_solid.compute'))
    assert intermediates_manager.has_intermediate(None, StepOutputHandle('sum_sq_solid.compute'))
    assert (
        str(
            intermediates_manager.get_intermediate(
                None, PoorMansDataFrame, StepOutputHandle('sum_sq_solid.compute')
            ).obj
        )
        == expected_value_repr
    )


def test_successful_pipeline_reexecution(snapshot):
    def sanitize_result_data(result_data):
        if isinstance(result_data, dict):
            if 'path' in result_data:
                result_data['path'] = 'DUMMY_PATH'
            result_data = {k: sanitize_result_data(v) for k, v in result_data.items()}
        elif isinstance(result_data, list):
            for i in range(len(result_data)):
                result_data[i] = sanitize_result_data(result_data[i])
        return result_data

    run_id = str(uuid.uuid4())
    instance = DagsterInstance.ephemeral()
    result_one = execute_dagster_graphql(
        define_context(instance=instance),
        START_PIPELINE_EXECUTION_SNAPSHOT_QUERY,
        variables={
            'executionParams': {
                'selector': {'name': 'csv_hello_world'},
                'environmentConfigData': csv_hello_world_solids_config_fs_storage(),
                'executionMetadata': {'runId': run_id},
                'mode': 'default',
            }
        },
    )

    assert (
        result_one.data['startPipelineExecution']['__typename'] == 'StartPipelineExecutionSuccess'
    )

    snapshot.assert_match(sanitize_result_data(result_one.data))

    expected_value_repr = (
        '''[OrderedDict([('num1', '1'), ('num2', '2'), ('sum', 3), '''
        '''('sum_sq', 9)]), OrderedDict([('num1', '3'), ('num2', '4'), ('sum', 7), '''
        '''('sum_sq', 49)])]'''
    )

    store = build_fs_intermediate_store(instance.intermediates_directory, run_id)
    assert store.has_intermediate(None, 'sum_solid.compute')
    assert store.has_intermediate(None, 'sum_sq_solid.compute')
    assert (
        str(store.get_intermediate(None, 'sum_sq_solid.compute', PoorMansDataFrame).obj)
        == expected_value_repr
    )

    new_run_id = str(uuid.uuid4())
    result_two = execute_dagster_graphql(
        define_context(instance=instance),
        START_PIPELINE_EXECUTION_SNAPSHOT_QUERY,
        variables={
            'executionParams': {
                'selector': {'name': 'csv_hello_world'},
                'environmentConfigData': csv_hello_world_solids_config_fs_storage(),
                'stepKeys': ['sum_sq_solid.compute'],
                'executionMetadata': {'runId': new_run_id},
                'mode': 'default',
                'retryRunId': run_id,
            }
        },
    )

    query_result = result_two.data['startPipelineExecution']
    assert query_result['__typename'] == 'StartPipelineExecutionSuccess'
    logs = query_result['run']['logs']['nodes']

    assert isinstance(logs, list)
    assert has_event_of_type(logs, 'PipelineStartEvent')
    assert has_event_of_type(logs, 'PipelineSuccessEvent')
    assert not has_event_of_type(logs, 'PipelineFailureEvent')

    assert not get_step_output_event(logs, 'sum_solid.compute')
    assert get_step_output_event(logs, 'sum_sq_solid.compute')

    snapshot.assert_match(sanitize_result_data(result_two.data))

    store = build_fs_intermediate_store(instance.intermediates_directory, new_run_id)
    assert not store.has_intermediate(None, 'sum_solid.inputs.num.read', 'input_thunk_output')
    assert store.has_intermediate(None, 'sum_solid.compute')
    assert store.has_intermediate(None, 'sum_sq_solid.compute')
    assert (
        str(store.get_intermediate(None, 'sum_sq_solid.compute', PoorMansDataFrame).obj)
        == expected_value_repr
    )


def test_successful_pipeline_reexecution():
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()
    context = define_test_context(instance=instance)
    result_one = execute_dagster_graphql(
        context,
        START_PIPELINE_EXECUTION_SNAPSHOT_QUERY,
        variables={
            'executionParams': {
                'selector': {'name': 'csv_hello_world'},
                'environmentConfigData': csv_hello_world_solids_config_fs_storage(),
                'executionMetadata': {'runId': run_id},
                'mode': 'default',
            }
        },
    )

    assert result_one.data['startPipelineExecution']['__typename'] == 'StartPipelineRunSuccess'

    expected_value_repr = (
        '''[OrderedDict([('num1', '1'), ('num2', '2'), ('sum', 3), '''
        '''('sum_sq', 9)]), OrderedDict([('num1', '3'), ('num2', '4'), ('sum', 7), '''
        '''('sum_sq', 49)])]'''
    )

    store = build_fs_intermediate_store(instance.intermediates_directory, run_id)
    assert store.has_intermediate(None, 'sum_solid.compute')
    assert store.has_intermediate(None, 'sum_sq_solid.compute')
    assert (
        str(store.get_intermediate(None, 'sum_sq_solid.compute', PoorMansDataFrame).obj)
        == expected_value_repr
    )

    # retry
    new_run_id = make_new_run_id()

    result_two = execute_dagster_graphql(
        define_test_context(instance=instance),
        START_PIPELINE_REEXECUTION_SNAPSHOT_QUERY,
        variables={
            'executionParams': {
                'selector': {'name': 'csv_hello_world'},
                'environmentConfigData': csv_hello_world_solids_config_fs_storage(),
                'stepKeys': ['sum_sq_solid.compute'],
                'executionMetadata': {
                    'runId': new_run_id,
                    'rootRunId': run_id,
                    'parentRunId': run_id,
                    'tags': [{'key': RESUME_RETRY_TAG, 'value': 'true'}],
                },
                'mode': 'default',
            }
        },
    )

    query_result = result_two.data['startPipelineReexecution']
    assert query_result['__typename'] == 'StartPipelineRunSuccess'

    result = sync_get_all_logs_for_run(context, new_run_id)
    logs = result['pipelineRunLogs']['messages']

    assert isinstance(logs, list)
    assert has_event_of_type(logs, 'PipelineStartEvent')
    assert has_event_of_type(logs, 'PipelineSuccessEvent')
    assert not has_event_of_type(logs, 'PipelineFailureEvent')

    assert not get_step_output_event(logs, 'sum_solid.compute')
    assert get_step_output_event(logs, 'sum_sq_solid.compute')

    store = build_fs_intermediate_store(instance.intermediates_directory, new_run_id)
    assert not store.has_intermediate(None, 'sum_solid.inputs.num.read', 'input_thunk_output')
    assert store.has_intermediate(None, 'sum_solid.compute')
    assert store.has_intermediate(None, 'sum_sq_solid.compute')
    assert (
        str(store.get_intermediate(None, 'sum_sq_solid.compute', PoorMansDataFrame).obj)
        == expected_value_repr
    )