def test_pipeline_step_key_subset_execution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs(
        {'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}})
    result = execute_pipeline(pipeline_def, run_config=run_config, instance=instance)
    assert result.success

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle('add_one.compute')).obj == 4)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle('add_two.compute')).obj == 6)

    ## re-execute add_two
    pipeline_reexecution_result = reexecute_pipeline(
        pipeline_def,
        parent_run_id=result.run_id,
        run_config=run_config,
        step_keys_to_execute=['add_two.compute'],
        instance=instance,
    )
    assert pipeline_reexecution_result.success

    step_events = pipeline_reexecution_result.step_event_list
    assert step_events

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle('add_one.compute')).obj == 4)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle('add_two.compute')).obj == 6)

    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')

    with pytest.raises(DagsterExecutionStepNotFoundError,
                       match='Execution plan does not contain step'):
        reexecute_pipeline(
            pipeline_def,
            parent_run_id=result.run_id,
            run_config=run_config,
            step_keys_to_execute=['nope.compute'],
            instance=instance,
        )

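# --- Helper sketch (not part of the original excerpt) ----------------------
# The addy-pipeline tests above and below call define_addy_pipeline,
# env_with_fs, and get_step_output_event without showing them. The following
# is a minimal, hypothetical reconstruction assuming the legacy
# @lambda_solid/@pipeline API and dagster.utils.merge_dicts; the helpers in
# the source repository may differ in detail.

from dagster import (
    DagsterEventType,
    InputDefinition,
    Int,
    OutputDefinition,
    lambda_solid,
    pipeline,
)
from dagster.utils import merge_dicts


def define_addy_pipeline():
    # Two chained solids: add_one feeds add_two, so num=3 yields 4 then 6,
    # matching the intermediate-storage assertions in these tests.
    @lambda_solid(input_defs=[InputDefinition('num', Int)],
                  output_def=OutputDefinition(Int))
    def add_one(num):
        return num + 1

    @lambda_solid(input_defs=[InputDefinition('num', Int)],
                  output_def=OutputDefinition(Int))
    def add_two(num):
        return num + 2

    @pipeline
    def addy():
        add_two(add_one())

    return addy


def env_with_fs(run_config):
    # Layer filesystem intermediate storage onto the given run config so that
    # step outputs persist and can be read back across reexecutions.
    return merge_dicts(run_config, {'intermediate_storage': {'filesystem': {}}})


def get_step_output_event(events, step_key, output_name='result'):
    # Return the STEP_OUTPUT event for the given step, or None if the step
    # produced no output (i.e. it was skipped during reexecution).
    for event in events:
        if (event.event_type == DagsterEventType.STEP_OUTPUT
                and event.step_key == step_key
                and event.step_output_data.output_name == output_name):
            return event
    return None
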
def test_execution_plan_reexecution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs(
        {"solids": {"add_one": {"inputs": {"num": {"value": 3}}}}})
    result = execute_pipeline(
        pipeline_def,
        run_config=run_config,
        instance=instance,
    )
    assert result.success

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one.compute")).obj == 4)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two.compute")).obj == 6)

    ## re-execute add_two
    execution_plan = create_execution_plan(pipeline_def, run_config=run_config)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def,
        execution_plan=execution_plan,
        run_config=run_config,
        parent_run_id=result.run_id,
        root_run_id=result.run_id,
    )
    step_events = execute_plan(
        execution_plan.build_subset_plan(["add_two.compute"]),
        run_config=run_config,
        pipeline_run=pipeline_run,
        instance=instance,
    )

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one.compute")).obj == 4)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two.compute")).obj == 6)

    assert not get_step_output_event(step_events, "add_one.compute")
    assert get_step_output_event(step_events, "add_two.compute")

def test_using_intermediates_to_override():
    pipeline = define_inty_pipeline()
    run_config = {
        "storage": {"filesystem": {}},
        "intermediate_storage": {"in_memory": {}},
    }
    instance = DagsterInstance.ephemeral()
    execution_plan = create_execution_plan(
        pipeline,
        run_config=run_config,
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline, execution_plan=execution_plan)

    assert execution_plan.get_step_by_key("return_one")

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["return_one"]),
            instance,
            run_config=run_config,
            pipeline_run=pipeline_run,
        ))

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, pipeline_run.run_id)
    assert get_step_output(return_one_step_events, "return_one")
    assert not intermediate_storage.has_intermediate(
        None, StepOutputHandle("return_one"))

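# --- Helper sketch (not part of the original excerpt) ----------------------
# The subplan tests call define_inty_pipeline and get_step_output, neither
# shown here. A minimal sketch under the same legacy-API assumption (imports
# as in the sketch above); note that reconstructable() in the multiprocess
# tests requires define_inty_pipeline to live at module scope.


def define_inty_pipeline():
    # return_one feeds add_one, so the subplan assertions see 1 and then 2.
    @lambda_solid
    def return_one():
        return 1

    @lambda_solid(input_defs=[InputDefinition("num", Int)],
                  output_def=OutputDefinition(Int))
    def add_one(num):
        return num + 1

    @pipeline
    def inty_pipeline():
        add_one(return_one())

    return inty_pipeline


def get_step_output(step_events, step_key, output_name="result"):
    # Same scan as get_step_output_event; kept as a separate name to mirror
    # what the tests call.
    for step_event in step_events:
        if (step_event.event_type == DagsterEventType.STEP_OUTPUT
                and step_event.step_key == step_key
                and step_event.step_output_data.output_name == output_name):
            return step_event
    return None
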
def test_successful_one_part_execute_plan(graphql_context, snapshot):
    instance = graphql_context.instance
    run_config = csv_hello_world_solids_config_fs_storage()
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=csv_hello_world, run_config=run_config)
    selector = infer_pipeline_selector(graphql_context, "csv_hello_world")
    result = execute_dagster_graphql(
        graphql_context,
        EXECUTE_PLAN_QUERY,
        variables={
            "executionParams": {
                "selector": selector,
                "runConfigData": run_config,
                "stepKeys": ["sum_solid.compute"],
                "executionMetadata": {"runId": pipeline_run.run_id},
                "mode": "default",
            },
        },
    )

    query_result = result.data["executePlan"]
    assert query_result["__typename"] == "ExecutePlanSuccess"
    assert query_result["pipeline"]["name"] == "csv_hello_world"
    assert query_result["hasFailures"] is False

    step_events = query_result["stepEvents"]
    assert [se["__typename"] for se in step_events] == [
        "ExecutionStepStartEvent",
        "ExecutionStepInputEvent",
        "ExecutionStepOutputEvent",
        "ObjectStoreOperationEvent",
        "ExecutionStepSuccessEvent",
    ]
    assert step_events[1]["stepKey"] == "sum_solid.compute"
    assert step_events[2]["outputName"] == "result"

    expected_value_repr = (
        """[OrderedDict([('num1', '1'), ('num2', '2'), ('sum', 3)]), """
        """OrderedDict([('num1', '3'), ('num2', '4'), ('sum', 7)])]""")

    assert step_events[3]["stepKey"] == "sum_solid.compute"
    assert step_events[4]["stepKey"] == "sum_solid.compute"

    snapshot.assert_match(clean_log_messages(result.data))

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, pipeline_run.run_id)
    assert intermediate_storage.has_intermediate(
        None, StepOutputHandle("sum_solid.compute"))
    assert (str(
        intermediate_storage.get_intermediate(
            None, PoorMansDataFrame,
            StepOutputHandle("sum_solid.compute")).obj) == expected_value_repr)

def test_using_intermediate_file_system_for_subplan_multiprocessing():
    with instance_for_test() as instance:
        run_config = {"intermediate_storage": {"filesystem": {}}}

        pipeline = reconstructable(define_inty_pipeline)

        environment_config = EnvironmentConfig.build(
            pipeline.get_definition(),
            run_config=run_config,
        )
        execution_plan = ExecutionPlan.build(
            pipeline,
            environment_config,
        )
        pipeline_run = instance.create_run_for_pipeline(
            pipeline_def=pipeline.get_definition(),
            execution_plan=execution_plan)

        assert execution_plan.get_step_by_key("return_one")

        return_one_step_events = list(
            execute_plan(
                execution_plan.build_subset_plan(["return_one"],
                                                 pipeline.get_definition(),
                                                 environment_config),
                pipeline,
                instance,
                run_config=dict(run_config, execution={"multiprocess": {}}),
                pipeline_run=pipeline_run,
            ))

        intermediate_storage = build_fs_intermediate_storage(
            instance.intermediates_directory, pipeline_run.run_id)
        assert get_step_output(return_one_step_events, "return_one")
        assert intermediate_storage.has_intermediate(
            None, StepOutputHandle("return_one"))
        assert (intermediate_storage.get_intermediate(
            None, Int, StepOutputHandle("return_one")).obj == 1)

        add_one_step_events = list(
            execute_plan(
                execution_plan.build_subset_plan(["add_one"],
                                                 pipeline.get_definition(),
                                                 environment_config),
                pipeline,
                instance,
                run_config=dict(run_config, execution={"multiprocess": {}}),
                pipeline_run=pipeline_run,
            ))

        assert get_step_output(add_one_step_events, "add_one")
        assert intermediate_storage.has_intermediate(
            None, StepOutputHandle("add_one"))
        assert (intermediate_storage.get_intermediate(
            None, Int, StepOutputHandle("add_one")).obj == 2)

def define_intermediate_storage(type_storage_plugin_registry=None):
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()
    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory,
        run_id=run_id,
        type_storage_plugin_registry=type_storage_plugin_registry,
    )
    return run_id, instance, intermediate_storage

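# --- Hypothetical usage (not part of the original excerpt) ------------------
# define_intermediate_storage above returns an ephemeral filesystem storage
# keyed to a fresh run_id. A round-trip through set_intermediate /
# get_intermediate might look like the sketch below; set_intermediate's exact
# signature is assumed here, and Int / StepOutputHandle are imported from
# dagster as in the surrounding tests.


def test_intermediate_storage_round_trip():
    _run_id, _instance, intermediate_storage = define_intermediate_storage()
    # Write a value for a step output, then read it back by the same handle.
    intermediate_storage.set_intermediate(
        context=None,
        dagster_type=Int,
        step_output_handle=StepOutputHandle("some_step"),
        value=42,
    )
    assert intermediate_storage.has_intermediate(
        None, StepOutputHandle("some_step"))
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("some_step")).obj == 42
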
def test_using_intermediates_file_system_for_subplan():
    pipeline = define_inty_pipeline()

    run_config = {"intermediate_storage": {"filesystem": {}}}

    instance = DagsterInstance.ephemeral()
    environment_config = EnvironmentConfig.build(
        pipeline,
        run_config=run_config,
    )
    execution_plan = ExecutionPlan.build(
        InMemoryPipeline(pipeline),
        environment_config,
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline, execution_plan=execution_plan)

    assert execution_plan.get_step_by_key("return_one")

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["return_one"], pipeline,
                                             environment_config),
            InMemoryPipeline(pipeline),
            instance,
            run_config=run_config,
            pipeline_run=pipeline_run,
        ))

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, pipeline_run.run_id)
    assert get_step_output(return_one_step_events, "return_one")
    assert intermediate_storage.has_intermediate(
        None, StepOutputHandle("return_one"))
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("return_one")).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"], pipeline,
                                             environment_config),
            InMemoryPipeline(pipeline),
            instance,
            run_config=run_config,
            pipeline_run=pipeline_run,
        ))

    assert get_step_output(add_one_step_events, "add_one")
    assert intermediate_storage.has_intermediate(None, StepOutputHandle("add_one"))
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one")).obj == 2

def test_address_operation_using_intermediates_file_system():
    with seven.TemporaryDirectory() as tmpdir_path:
        output_address = os.path.join(tmpdir_path, "solid1.output")
        output_value = 5

        instance = DagsterInstance.ephemeral()
        intermediate_storage = build_fs_intermediate_storage(
            instance.intermediates_directory, run_id="some_run_id")

        object_operation_result = intermediate_storage.set_intermediate_to_address(
            context=None,
            dagster_type=Int,
            step_output_handle=StepOutputHandle("solid1.compute"),
            value=output_value,
            address=output_address,
        )
        assert object_operation_result.key == output_address
        assert object_operation_result.obj == output_value

        assert (intermediate_storage.get_intermediate_from_address(
            context=None,
            dagster_type=Int,
            step_output_handle=StepOutputHandle("solid1.compute"),
            address=output_address,
        ).obj == output_value)

        with pytest.raises(
                DagsterAddressIOError,
                match="No such file or directory",
        ):
            intermediate_storage.set_intermediate_to_address(
                context=None,
                dagster_type=Int,
                step_output_handle=StepOutputHandle("solid1.compute"),
                value=1,
                address="invalid_address",
            )

        with pytest.raises(
                DagsterAddressIOError,
                match="No such file or directory",
        ):
            intermediate_storage.get_intermediate_from_address(
                context=None,
                dagster_type=Int,
                step_output_handle=StepOutputHandle("solid1.compute"),
                address=os.path.join(tmpdir_path, "invalid.output"),
            )

def test_spark_data_frame_serialization_file_system_file_handle(spark_config):
    @solid
    def nonce(_):
        return LocalFileHandle(file_relative_path(__file__, 'data/test.csv'))

    @pipeline(mode_defs=[spark_local_fs_mode])
    def spark_df_test_pipeline():
        ingest_csv_file_handle_to_spark(nonce())

    instance = DagsterInstance.ephemeral()

    result = execute_pipeline(
        spark_df_test_pipeline,
        mode='spark',
        run_config={
            'intermediate_storage': {'filesystem': {}},
            'resources': {'pyspark': {'config': {'spark_conf': spark_config}}},
        },
        instance=instance,
    )

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, run_id=result.run_id)

    assert result.success
    result_dir = os.path.join(
        intermediate_storage.root,
        'intermediates',
        'ingest_csv_file_handle_to_spark.compute',
        'result',
    )

    assert '_SUCCESS' in os.listdir(result_dir)

    spark = SparkSession.builder.getOrCreate()
    df = spark.read.parquet(result_dir)
    assert isinstance(df, pyspark.sql.dataframe.DataFrame)
    assert df.head()[0] == '1'

def test_spark_data_frame_serialization_file_system_file_handle(spark_config):
    @solid
    def nonce(_):
        return LocalFileHandle(file_relative_path(__file__, "data/test.csv"))

    @pipeline(mode_defs=[spark_local_fs_mode])
    def spark_df_test_pipeline():
        ingest_csv_file_handle_to_spark(nonce())

    instance = DagsterInstance.ephemeral()

    result = execute_pipeline(
        spark_df_test_pipeline,
        mode="spark",
        run_config={
            "intermediate_storage": {"filesystem": {}},
            "resources": {"pyspark": {"config": {"spark_conf": spark_config}}},
        },
        instance=instance,
    )

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, run_id=result.run_id)

    assert result.success
    result_dir = os.path.join(
        intermediate_storage.root,
        "intermediates",
        "ingest_csv_file_handle_to_spark",
        "result",
    )

    assert "_SUCCESS" in os.listdir(result_dir)

    spark = SparkSession.builder.getOrCreate()
    df = spark.read.parquet(result_dir)
    assert isinstance(df, pyspark.sql.dataframe.DataFrame)
    assert df.head()[0] == "1"

def test_success_whole_execution_plan_with_in_memory_config(
        graphql_context, snapshot):
    instance = graphql_context.instance
    selector = infer_pipeline_selector(graphql_context, "csv_hello_world")
    run_config = merge_dicts(csv_hello_world_solids_config(),
                             {"storage": {"in_memory": {}}})
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=csv_hello_world, run_config=run_config)
    result = execute_dagster_graphql(
        graphql_context,
        EXECUTE_PLAN_QUERY,
        variables={
            "executionParams": {
                "selector": selector,
                "runConfigData": run_config,
                "stepKeys": None,
                "executionMetadata": {"runId": pipeline_run.run_id},
                "mode": "default",
            },
        },
    )

    query_result = result.data["executePlan"]
    assert query_result["__typename"] == "ExecutePlanSuccess"
    assert query_result["pipeline"]["name"] == "csv_hello_world"
    assert query_result["hasFailures"] is False
    step_events = {
        step_event["stepKey"]: step_event
        for step_event in query_result["stepEvents"] if step_event["stepKey"]
    }
    assert "sum_solid.compute" in step_events
    assert "sum_sq_solid.compute" in step_events

    snapshot.assert_match(clean_log_messages(result.data))

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, pipeline_run.run_id)
    assert not intermediate_storage.has_intermediate(
        None, StepOutputHandle("sum_solid.compute"))
    assert not intermediate_storage.has_intermediate(
        None, StepOutputHandle("sum_sq_solid.compute"))

def test_using_file_system_for_subplan():
    pipeline = define_inty_pipeline()

    run_config = {'storage': {'filesystem': {}}}

    instance = DagsterInstance.ephemeral()

    execution_plan = create_execution_plan(
        pipeline,
        run_config=run_config,
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline, execution_plan=execution_plan)

    assert execution_plan.get_step_by_key('return_one.compute')

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['return_one.compute']),
            instance,
            run_config=run_config,
            pipeline_run=pipeline_run,
        ))

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, pipeline_run.run_id)
    assert get_step_output(return_one_step_events, 'return_one.compute')
    assert intermediate_storage.has_intermediate(
        None, StepOutputHandle('return_one.compute'))
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle('return_one.compute')).obj == 1)

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['add_one.compute']),
            instance,
            run_config=run_config,
            pipeline_run=pipeline_run,
        ))

    assert get_step_output(add_one_step_events, 'add_one.compute')
    assert intermediate_storage.has_intermediate(
        None, StepOutputHandle('add_one.compute'))
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle('add_one.compute')).obj == 2)

def test_using_intermediate_file_system_for_subplan_multiprocessing():
    run_config = {'intermediate_storage': {'filesystem': {}}}
    instance = DagsterInstance.local_temp()

    pipeline = reconstructable(define_inty_pipeline)

    execution_plan = create_execution_plan(pipeline, run_config=run_config)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline.get_definition(), execution_plan=execution_plan)

    assert execution_plan.get_step_by_key('return_one.compute')

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['return_one.compute']),
            instance,
            run_config=dict(run_config, execution={'multiprocess': {}}),
            pipeline_run=pipeline_run,
        ))

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, pipeline_run.run_id)
    assert get_step_output(return_one_step_events, 'return_one.compute')
    assert intermediate_storage.has_intermediate(
        None, StepOutputHandle('return_one.compute'))
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle('return_one.compute')).obj == 1)

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['add_one.compute']),
            instance,
            run_config=dict(run_config, execution={'multiprocess': {}}),
            pipeline_run=pipeline_run,
        ))

    assert get_step_output(add_one_step_events, 'add_one.compute')
    assert intermediate_storage.has_intermediate(
        None, StepOutputHandle('add_one.compute'))
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle('add_one.compute')).obj == 2)

def test_successful_pipeline_reexecution(self, graphql_context):
    selector = infer_pipeline_selector(graphql_context, "csv_hello_world")
    run_id = make_new_run_id()
    result_one = execute_dagster_graphql_and_finish_runs(
        graphql_context,
        LAUNCH_PIPELINE_EXECUTION_MUTATION,
        variables={
            "executionParams": {
                "selector": selector,
                "runConfigData": csv_hello_world_solids_config_fs_storage(),
                "executionMetadata": {"runId": run_id},
                "mode": "default",
            }
        },
    )

    assert (result_one.data["launchPipelineExecution"]["__typename"] ==
            "LaunchPipelineRunSuccess")

    expected_value_repr = (
        """[OrderedDict([('num1', '1'), ('num2', '2'), ('sum', 3), """
        """('sum_sq', 9)]), OrderedDict([('num1', '3'), ('num2', '4'), ('sum', 7), """
        """('sum_sq', 49)])]""")

    instance = graphql_context.instance

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, run_id)
    assert intermediate_storage.has_intermediate(
        None, StepOutputHandle("sum_solid.compute"))
    assert intermediate_storage.has_intermediate(
        None, StepOutputHandle("sum_sq_solid.compute"))
    assert (str(
        intermediate_storage.get_intermediate(
            None, PoorMansDataFrame,
            StepOutputHandle("sum_sq_solid.compute")).obj) == expected_value_repr)

    # retry
    new_run_id = make_new_run_id()

    result_two = execute_dagster_graphql_and_finish_runs(
        graphql_context,
        LAUNCH_PIPELINE_REEXECUTION_MUTATION,
        variables={
            "executionParams": {
                "selector": selector,
                "runConfigData": csv_hello_world_solids_config_fs_storage(),
                "stepKeys": ["sum_sq_solid.compute"],
                "executionMetadata": {
                    "runId": new_run_id,
                    "rootRunId": run_id,
                    "parentRunId": run_id,
                    "tags": [{"key": RESUME_RETRY_TAG, "value": "true"}],
                },
                "mode": "default",
            }
        },
    )

    query_result = result_two.data["launchPipelineReexecution"]
    assert query_result["__typename"] == "LaunchPipelineRunSuccess"

    result = get_all_logs_for_finished_run_via_subscription(
        graphql_context, new_run_id)
    logs = result["pipelineRunLogs"]["messages"]

    assert isinstance(logs, list)
    assert has_event_of_type(logs, "PipelineStartEvent")
    assert has_event_of_type(logs, "PipelineSuccessEvent")
    assert not has_event_of_type(logs, "PipelineFailureEvent")

    assert not get_step_output_event(logs, "sum_solid.compute")
    assert get_step_output_event(logs, "sum_sq_solid.compute")

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, new_run_id)
    assert not intermediate_storage.has_intermediate(
        None,
        StepOutputHandle("sum_solid.inputs.num.read", "input_thunk_output"))
    assert intermediate_storage.has_intermediate(
        None, StepOutputHandle("sum_solid.compute"))
    assert intermediate_storage.has_intermediate(
        None, StepOutputHandle("sum_sq_solid.compute"))
    assert (str(
        intermediate_storage.get_intermediate(
            None, PoorMansDataFrame,
            StepOutputHandle("sum_sq_solid.compute")).obj) == expected_value_repr)

def test_execution_plan_reexecution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs(
        {"solids": {"add_one": {"inputs": {"num": {"value": 3}}}}})
    result = execute_pipeline(
        pipeline_def,
        run_config=run_config,
        instance=instance,
    )
    assert result.success

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one")).obj == 4
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two")).obj == 6

    ## re-execute add_two
    environment_config = EnvironmentConfig.build(
        pipeline_def,
        run_config=run_config,
    )
    execution_plan = ExecutionPlan.build(
        InMemoryPipeline(pipeline_def),
        environment_config,
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def,
        execution_plan=execution_plan,
        run_config=run_config,
        parent_run_id=result.run_id,
        root_run_id=result.run_id,
        step_keys_to_execute=["add_two"],
    )

    step_events = execute_plan(
        execution_plan.build_subset_plan(["add_two"], pipeline_def,
                                         environment_config),
        InMemoryPipeline(pipeline_def),
        run_config=run_config,
        pipeline_run=pipeline_run,
        instance=instance,
    )

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one")).obj == 4
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two")).obj == 6

    assert not get_step_output_event(step_events, "add_one")
    assert get_step_output_event(step_events, "add_two")

def test_pipeline_step_key_subset_execution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs(
        {"solids": {"add_one": {"inputs": {"num": {"value": 3}}}}})
    result = execute_pipeline(pipeline_def, run_config=run_config, instance=instance)
    assert result.success

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one")).obj == 4
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two")).obj == 6

    ## re-execute add_two
    pipeline_reexecution_result = reexecute_pipeline(
        pipeline_def,
        parent_run_id=result.run_id,
        run_config=run_config,
        step_selection=["add_two"],
        instance=instance,
    )
    assert pipeline_reexecution_result.success

    step_events = pipeline_reexecution_result.step_event_list
    assert step_events

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one")).obj == 4
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two")).obj == 6

    assert not get_step_output_event(step_events, "add_one")
    assert get_step_output_event(step_events, "add_two")

    with pytest.raises(
            DagsterExecutionStepNotFoundError,
            match="Can not build subset plan from unknown step: nope",
    ):
        reexecute_pipeline(
            pipeline_def,
            parent_run_id=result.run_id,
            run_config=run_config,
            step_selection=["nope"],
            instance=instance,
        )

def test_pipeline_step_key_subset_execution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs(
        {"solids": {"add_one": {"inputs": {"num": {"value": 3}}}}})
    result = execute_pipeline(pipeline_def, run_config=run_config, instance=instance)
    assert result.success

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one.compute")).obj == 4)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two.compute")).obj == 6)

    ## re-execute add_two
    pipeline_reexecution_result = reexecute_pipeline(
        pipeline_def,
        parent_run_id=result.run_id,
        run_config=run_config,
        step_selection=["add_two.compute"],
        instance=instance,
    )
    assert pipeline_reexecution_result.success

    step_events = pipeline_reexecution_result.step_event_list
    assert step_events

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one.compute")).obj == 4)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two.compute")).obj == 6)

    assert not get_step_output_event(step_events, "add_one.compute")
    assert get_step_output_event(step_events, "add_two.compute")

    with pytest.raises(
            DagsterInvalidSubsetError,
            match="No qualified steps to execute found for step_selection"):
        reexecute_pipeline(
            pipeline_def,
            parent_run_id=result.run_id,
            run_config=run_config,
            step_selection=["nope.compute"],
            instance=instance,
        )