def test_pipeline_step_key_subset_execution():
    """Run the addy pipeline, re-execute only the add_two step, and verify intermediates.

    Also asserts that requesting an unknown step key raises
    DagsterExecutionStepNotFoundError.
    """
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs({'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}})

    full_result = execute_pipeline(pipeline_def, run_config=run_config, instance=instance)
    assert full_result.success

    storage = build_fs_intermediate_storage(instance.intermediates_directory, full_result.run_id)
    assert storage.get_intermediate(None, Int, StepOutputHandle('add_one.compute')).obj == 4
    assert storage.get_intermediate(None, Int, StepOutputHandle('add_two.compute')).obj == 6

    ## re-execute add_two
    reexecution_result = reexecute_pipeline(
        pipeline_def,
        parent_run_id=full_result.run_id,
        run_config=run_config,
        step_keys_to_execute=['add_two.compute'],
        instance=instance,
    )
    assert reexecution_result.success

    step_events = reexecution_result.step_event_list
    assert step_events

    # Intermediates for the original run are untouched by the re-execution.
    storage = build_fs_intermediate_storage(instance.intermediates_directory, full_result.run_id)
    assert storage.get_intermediate(None, Int, StepOutputHandle('add_one.compute')).obj == 4
    assert storage.get_intermediate(None, Int, StepOutputHandle('add_two.compute')).obj == 6

    # Only add_two ran during the re-execution.
    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')

    # Requesting an unknown step key must fail fast.
    with pytest.raises(
        DagsterExecutionStepNotFoundError, match='Execution plan does not contain step'
    ):
        reexecute_pipeline(
            pipeline_def,
            parent_run_id=full_result.run_id,
            run_config=run_config,
            step_keys_to_execute=['nope.compute'],
            instance=instance,
        )
def test_pipeline_step_key_subset_execution():
    """Run the file-system addy pipeline, re-execute only add_two, and check on-disk results.

    Also asserts that an unknown step selection raises
    DagsterExecutionStepNotFoundError.
    """
    pipeline_def = define_addy_pipeline(using_file_system=True)
    instance = DagsterInstance.ephemeral()
    run_config = {"solids": {"add_one": {"inputs": {"num": {"value": 3}}}}}

    full_result = execute_pipeline(pipeline_def, run_config=run_config, instance=instance)
    assert full_result.success

    def _result_path(run_id, step_name):
        # presumably the fs io-manager writes each output to
        # <storage>/<run_id>/<step>/result — layout taken from the original asserts
        return os.path.join(instance.storage_directory(), run_id, step_name, "result")

    with open(_result_path(full_result.run_id, "add_one"), "rb") as read_obj:
        assert pickle.load(read_obj) == 4
    with open(_result_path(full_result.run_id, "add_two"), "rb") as read_obj:
        assert pickle.load(read_obj) == 6

    ## re-execute add_two
    reexec_result = reexecute_pipeline(
        pipeline_def,
        parent_run_id=full_result.run_id,
        run_config=run_config,
        step_selection=["add_two"],
        instance=instance,
    )
    assert reexec_result.success

    step_events = reexec_result.step_event_list
    assert step_events

    # add_one was not re-run, so no output file exists for the new run.
    assert not os.path.exists(_result_path(reexec_result.run_id, "add_one"))
    with open(_result_path(reexec_result.run_id, "add_two"), "rb") as read_obj:
        assert pickle.load(read_obj) == 6

    assert not get_step_output_event(step_events, "add_one")
    assert get_step_output_event(step_events, "add_two")

    with pytest.raises(
        DagsterExecutionStepNotFoundError,
        match="Step selection refers to unknown step: nope",
    ):
        reexecute_pipeline(
            pipeline_def,
            parent_run_id=full_result.run_id,
            run_config=run_config,
            step_selection=["nope"],
            instance=instance,
        )
def test_execution_plan_reexecution():
    """Execute the addy pipeline, then run only add_two via a subset execution plan."""
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    environment_dict = env_with_fs({'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}})

    result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        instance=instance,
    )
    assert result.success

    manager = IntermediateStoreIntermediatesManager(
        build_fs_intermediate_store(instance.intermediates_directory, result.run_id)
    )
    assert manager.get_intermediate(None, Int, StepOutputHandle('add_one.compute')).obj == 4
    assert manager.get_intermediate(None, Int, StepOutputHandle('add_two.compute')).obj == 6

    ## re-execute add_two
    execution_plan = create_execution_plan(pipeline_def, environment_dict=environment_dict)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def,
        execution_plan=execution_plan,
        environment_dict=environment_dict,
        parent_run_id=result.run_id,
        root_run_id=result.run_id,
    )
    step_events = execute_plan(
        execution_plan.build_subset_plan(['add_two.compute']),
        environment_dict=environment_dict,
        pipeline_run=pipeline_run,
        instance=instance,
    )

    # Re-check the original run's intermediates after the subset execution.
    manager = IntermediateStoreIntermediatesManager(
        build_fs_intermediate_store(instance.intermediates_directory, result.run_id)
    )
    assert manager.get_intermediate(None, Int, StepOutputHandle('add_one.compute')).obj == 4
    assert manager.get_intermediate(None, Int, StepOutputHandle('add_two.compute')).obj == 6

    # Only add_two emitted output events in the subset execution.
    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')
def test_pipeline_step_key_subset_execution():
    """Re-execute only add_two via create_run_for_pipeline + execute_run.

    Also asserts that creating a run for an unknown step key raises
    DagsterExecutionStepNotFoundError during plan construction.
    """
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    environment_dict = env_with_fs({'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}})

    result = execute_pipeline(pipeline_def, environment_dict=environment_dict, instance=instance)
    assert result.success

    first_store = build_fs_intermediate_store(instance.intermediates_directory, result.run_id)
    assert first_store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert first_store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    ## re-execute add_two
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        step_keys_to_execute=['add_two.compute'],
        parent_run_id=result.run_id,
        root_run_id=result.run_id,
    )
    reexec_result = execute_run(pipeline_def, pipeline_run, instance)
    assert reexec_result.success

    step_events = reexec_result.step_event_list
    assert step_events

    # The new run's store carries both values: add_one copied forward, add_two recomputed.
    second_store = build_fs_intermediate_store(
        instance.intermediates_directory, reexec_result.run_id
    )
    assert second_store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert second_store.get_intermediate(None, 'add_two.compute', Int).obj == 6
    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')

    with pytest.raises(
        DagsterExecutionStepNotFoundError, match='Execution plan does not contain step'
    ):
        instance.create_run_for_pipeline(
            pipeline_def,
            environment_dict=environment_dict,
            step_keys_to_execute=['nope.compute'],
            parent_run_id=result.run_id,
            root_run_id=result.run_id,
        )
def test_execution_plan_reexecution():
    """Execute with an explicit run id, then execute a subset plan for add_two in a new run."""
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    old_run_id = str(uuid.uuid4())
    environment_dict = env_with_fs({'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}})

    result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(run_id=old_run_id),
        instance=instance,
    )
    assert result.success

    old_store = build_fs_intermediate_store(instance.intermediates_directory, result.run_id)
    assert old_store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert old_store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    ## re-execute add_two
    new_run_id = str(uuid.uuid4())
    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name,
        run_id=new_run_id,
        environment_dict=environment_dict,
        mode='default',
        previous_run_id=result.run_id,
    )
    execution_plan = create_execution_plan(
        pipeline_def, environment_dict=environment_dict, run_config=pipeline_run
    )
    step_events = execute_plan(
        execution_plan.build_subset_plan(['add_two.compute']),
        environment_dict=environment_dict,
        pipeline_run=pipeline_run,
        instance=instance,
    )

    # The new run carries both: add_one copied from the parent, add_two recomputed.
    new_store = build_fs_intermediate_store(instance.intermediates_directory, new_run_id)
    assert new_store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert new_store.get_intermediate(None, 'add_two.compute', Int).obj == 6
    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')
def test_execution_plan_reexecution():
    """Re-execute add_two using a ReexecutionConfig that copies add_one's output forward."""
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    old_run_id = str(uuid.uuid4())
    environment_dict = env_with_fs({'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}})

    result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(run_id=old_run_id),
        instance=instance,
    )
    assert result.success

    old_store = build_fs_intermediate_store(instance.intermediates_directory, result.run_id)
    assert old_store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert old_store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    ## re-execute add_two
    new_run_id = str(uuid.uuid4())
    run_config = RunConfig(
        run_id=new_run_id,
        reexecution_config=ReexecutionConfig(
            previous_run_id=result.run_id,
            step_output_handles=[StepOutputHandle('add_one.compute')],
        ),
    )
    execution_plan = create_execution_plan(
        pipeline_def, environment_dict=environment_dict, run_config=run_config
    )
    step_events = execute_plan(
        execution_plan,
        environment_dict=environment_dict,
        run_config=run_config,
        step_keys_to_execute=['add_two.compute'],
        instance=instance,
    )

    # The new run's store has add_one (copied) and add_two (recomputed).
    new_store = build_fs_intermediate_store(instance.intermediates_directory, new_run_id)
    assert new_store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert new_store.get_intermediate(None, 'add_two.compute', Int).obj == 6
    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')
def test_pipeline_step_key_subset_execution():
    """Legacy RunStorageMode-based subset re-execution of the add_two step."""
    pipeline_def = define_addy_pipeline()
    old_run_id = str(uuid.uuid4())
    environment_dict = {'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}}

    result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(storage_mode=RunStorageMode.FILESYSTEM, run_id=old_run_id),
    )
    assert result.success

    old_store = FileSystemIntermediateStore(result.run_id)
    assert old_store.get_intermediate(None, 'add_one.compute', Int) == 4
    assert old_store.get_intermediate(None, 'add_two.compute', Int) == 6

    ## re-execute add_two
    new_run_id = str(uuid.uuid4())
    reexec_result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(
            run_id=new_run_id,
            reexecution_config=ReexecutionConfig(
                previous_run_id=result.run_id,
                step_output_handles=[StepOutputHandle('add_one.compute')],
            ),
            storage_mode=RunStorageMode.FILESYSTEM,
            step_keys_to_execute=['add_two.compute'],
        ),
    )
    assert reexec_result.success

    step_events = reexec_result.step_event_list
    assert step_events

    # The new run has add_one's output (copied) and a freshly computed add_two.
    new_store = FileSystemIntermediateStore(new_run_id)
    assert new_store.get_intermediate(None, 'add_one.compute', Int) == 4
    assert new_store.get_intermediate(None, 'add_two.compute', Int) == 6
    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')
def test_execution_plan_reexecution():
    """Build a subset plan for add_two and execute it against a child run."""
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs({"solids": {"add_one": {"inputs": {"num": {"value": 3}}}}})

    result = execute_pipeline(
        pipeline_def,
        run_config=run_config,
        instance=instance,
    )
    assert result.success

    storage = build_fs_intermediate_storage(instance.intermediates_directory, result.run_id)
    assert storage.get_intermediate(None, Int, StepOutputHandle("add_one.compute")).obj == 4
    assert storage.get_intermediate(None, Int, StepOutputHandle("add_two.compute")).obj == 6

    ## re-execute add_two
    execution_plan = create_execution_plan(pipeline_def, run_config=run_config)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def,
        execution_plan=execution_plan,
        run_config=run_config,
        parent_run_id=result.run_id,
        root_run_id=result.run_id,
    )
    step_events = execute_plan(
        execution_plan.build_subset_plan(["add_two.compute"]),
        run_config=run_config,
        pipeline_run=pipeline_run,
        instance=instance,
    )

    # The original run's intermediates are unchanged by the subset execution.
    storage = build_fs_intermediate_storage(instance.intermediates_directory, result.run_id)
    assert storage.get_intermediate(None, Int, StepOutputHandle("add_one.compute")).obj == 4
    assert storage.get_intermediate(None, Int, StepOutputHandle("add_two.compute")).obj == 6
    assert not get_step_output_event(step_events, "add_one.compute")
    assert get_step_output_event(step_events, "add_two.compute")
def test_pipeline_step_key_subset_execution():
    """Re-execute add_two via RunConfig(previous_run_id=..., step_keys_to_execute=...)."""
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    old_run_id = make_new_run_id()
    environment_dict = env_with_fs({'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}})

    result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(run_id=old_run_id),
        instance=instance,
    )
    assert result.success

    old_store = build_fs_intermediate_store(instance.intermediates_directory, result.run_id)
    assert old_store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert old_store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    ## re-execute add_two
    new_run_id = make_new_run_id()
    reexec_result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(
            run_id=new_run_id,
            previous_run_id=result.run_id,
            step_keys_to_execute=['add_two.compute'],
        ),
        instance=instance,
    )
    assert reexec_result.success

    step_events = reexec_result.step_event_list
    assert step_events

    # The new run carries add_one's copied output and the recomputed add_two value.
    new_store = build_fs_intermediate_store(instance.intermediates_directory, new_run_id)
    assert new_store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert new_store.get_intermediate(None, 'add_two.compute', Int).obj == 6
    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')
def _get_source_run_id_from_logs(self, step_output_handle: StepOutputHandle) -> Optional[str]:
    """Walk the run lineage backwards to find the run that produced a step output.

    Starting from this run's parent, inspect each ancestor's event logs for a
    StepOutput event matching ``step_output_handle``; return the first ancestor
    run id that emitted one, or ``None`` when no ancestor did.

    Args:
        step_output_handle: the step key / output name pair to look for.

    Returns:
        The run id of the ancestor run that produced the output, or None.
    """
    # walk through event logs to find the right run_id based on the run lineage
    from dagster.core.events import get_step_output_event

    _, runs = self.instance.get_run_group(self.run_id)
    run_id_to_parent_run_id = {run.run_id: run.parent_run_id for run in runs}
    source_run_id = self.pipeline_run.parent_run_id
    while source_run_id:
        # note: this would cost N db calls where N = number of parent runs
        logs = self.instance.all_logs(source_run_id)
        # if the parent run has yielded a StepOutput event for the given step output,
        # we have found the source run id
        if get_step_output_event(
            events=[e.dagster_event for e in logs if e.is_dagster_event],
            step_key=step_output_handle.step_key,
            output_name=step_output_handle.output_name,
        ):
            return source_run_id
        # else, keep looking backwards; use .get() so an ancestor missing from the
        # run group (e.g. a deleted/pruned run) ends the walk instead of raising
        # KeyError — falling through to the None return below
        source_run_id = run_id_to_parent_run_id.get(source_run_id)

    # when a fixed path is provided via io manager, it's able to run step subset using an execution
    # plan when the ascendant outputs were not previously created by dagster-controlled
    # computations. for example, in backfills, with fixed path io manager, we allow users to
    # "re-execute" runs with steps where the outputs weren't previously stored by dagster.
    return None
def test_pipeline_step_key_subset_execution():
    """Re-execute add_two with a ReexecutionConfig carrying add_one's output handle."""
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    old_run_id = str(uuid.uuid4())
    environment_dict = env_with_fs({'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}})

    result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(run_id=old_run_id),
        instance=instance,
    )
    assert result.success

    old_store = FilesystemIntermediateStore.for_instance(instance, result.run_id)
    assert old_store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert old_store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    ## re-execute add_two
    new_run_id = str(uuid.uuid4())
    reexec_result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(
            run_id=new_run_id,
            reexecution_config=ReexecutionConfig(
                previous_run_id=result.run_id,
                step_output_handles=[StepOutputHandle('add_one.compute')],
            ),
            step_keys_to_execute=['add_two.compute'],
        ),
        instance=instance,
    )
    assert reexec_result.success

    step_events = reexec_result.step_event_list
    assert step_events

    # New run: add_one copied from the parent, add_two recomputed.
    new_store = FilesystemIntermediateStore.for_instance(instance, new_run_id)
    assert new_store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert new_store.get_intermediate(None, 'add_two.compute', Int).obj == 6
    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')
def test_execution_plan_reexecution():
    """Legacy transform-step re-execution of add_two with filesystem storage."""
    pipeline_def = define_addy_pipeline()
    old_run_id = str(uuid.uuid4())
    environment_dict = {'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}}

    result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(storage_mode=RunStorageMode.FILESYSTEM, run_id=old_run_id),
    )
    assert result.success
    assert get_filesystem_intermediate(result.run_id, 'add_one.transform', Int) == 4
    assert get_filesystem_intermediate(result.run_id, 'add_two.transform', Int) == 6

    ## re-execute add_two
    new_run_id = str(uuid.uuid4())
    reexec_run_config = RunConfig(
        run_id=new_run_id,
        reexecution_config=ReexecutionConfig(
            previous_run_id=result.run_id,
            step_output_handles=[StepOutputHandle('add_one.transform')],
        ),
        storage_mode=RunStorageMode.FILESYSTEM,
    )
    execution_plan = create_execution_plan(pipeline_def, environment_dict=environment_dict)
    step_events = execute_plan(
        execution_plan,
        environment_dict=environment_dict,
        run_config=reexec_run_config,
        step_keys_to_execute=['add_two.transform'],
    )

    # New run: add_one copied forward, add_two recomputed.
    assert get_filesystem_intermediate(new_run_id, 'add_one.transform', Int) == 4
    assert get_filesystem_intermediate(new_run_id, 'add_two.transform', Int) == 6
    assert not get_step_output_event(step_events, 'add_one.transform')
    assert get_step_output_event(step_events, 'add_two.transform')
def test_pipeline_step_key_subset_execution():
    """Re-execute add_two via step_selection; unknown selections raise DagsterInvalidSubsetError."""
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs({"solids": {"add_one": {"inputs": {"num": {"value": 3}}}}})

    result = execute_pipeline(pipeline_def, run_config=run_config, instance=instance)
    assert result.success

    storage = build_fs_intermediate_storage(instance.intermediates_directory, result.run_id)
    assert storage.get_intermediate(None, Int, StepOutputHandle("add_one.compute")).obj == 4
    assert storage.get_intermediate(None, Int, StepOutputHandle("add_two.compute")).obj == 6

    ## re-execute add_two
    reexec_result = reexecute_pipeline(
        pipeline_def,
        parent_run_id=result.run_id,
        run_config=run_config,
        step_selection=["add_two.compute"],
        instance=instance,
    )
    assert reexec_result.success

    step_events = reexec_result.step_event_list
    assert step_events

    # Original run's intermediates remain intact after re-execution.
    storage = build_fs_intermediate_storage(instance.intermediates_directory, result.run_id)
    assert storage.get_intermediate(None, Int, StepOutputHandle("add_one.compute")).obj == 4
    assert storage.get_intermediate(None, Int, StepOutputHandle("add_two.compute")).obj == 6
    assert not get_step_output_event(step_events, "add_one.compute")
    assert get_step_output_event(step_events, "add_two.compute")

    with pytest.raises(
        DagsterInvalidSubsetError,
        match="No qualified steps to execute found for step_selection",
    ):
        reexecute_pipeline(
            pipeline_def,
            parent_run_id=result.run_id,
            run_config=run_config,
            step_selection=["nope.compute"],
            instance=instance,
        )
def test_pipeline_step_key_subset_execution():
    """Re-execute only add_two via step_selection; unknown step names raise."""
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs({"solids": {"add_one": {"inputs": {"num": {"value": 3}}}}})

    result = execute_pipeline(pipeline_def, run_config=run_config, instance=instance)
    assert result.success

    storage = build_fs_intermediate_storage(instance.intermediates_directory, result.run_id)
    assert storage.get_intermediate(None, Int, StepOutputHandle("add_one")).obj == 4
    assert storage.get_intermediate(None, Int, StepOutputHandle("add_two")).obj == 6

    ## re-execute add_two
    reexec_result = reexecute_pipeline(
        pipeline_def,
        parent_run_id=result.run_id,
        run_config=run_config,
        step_selection=["add_two"],
        instance=instance,
    )
    assert reexec_result.success

    step_events = reexec_result.step_event_list
    assert step_events

    # Original run's intermediates are untouched by the re-execution.
    storage = build_fs_intermediate_storage(instance.intermediates_directory, result.run_id)
    assert storage.get_intermediate(None, Int, StepOutputHandle("add_one")).obj == 4
    assert storage.get_intermediate(None, Int, StepOutputHandle("add_two")).obj == 6
    assert not get_step_output_event(step_events, "add_one")
    assert get_step_output_event(step_events, "add_two")

    with pytest.raises(
        DagsterExecutionStepNotFoundError,
        match="Can not build subset plan from unknown step: nope",
    ):
        reexecute_pipeline(
            pipeline_def,
            parent_run_id=result.run_id,
            run_config=run_config,
            step_selection=["nope"],
            instance=instance,
        )
def test_execution_plan_reexecution():
    """Build an ExecutionPlan directly and execute only the add_two subset."""
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs({"solids": {"add_one": {"inputs": {"num": {"value": 3}}}}})

    result = execute_pipeline(
        pipeline_def,
        run_config=run_config,
        instance=instance,
    )
    assert result.success

    storage = build_fs_intermediate_storage(instance.intermediates_directory, result.run_id)
    assert storage.get_intermediate(None, Int, StepOutputHandle("add_one")).obj == 4
    assert storage.get_intermediate(None, Int, StepOutputHandle("add_two")).obj == 6

    ## re-execute add_two
    environment_config = EnvironmentConfig.build(
        pipeline_def,
        run_config=run_config,
    )
    execution_plan = ExecutionPlan.build(
        InMemoryPipeline(pipeline_def),
        environment_config,
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def,
        execution_plan=execution_plan,
        run_config=run_config,
        parent_run_id=result.run_id,
        root_run_id=result.run_id,
        step_keys_to_execute=["add_two"],
    )
    step_events = execute_plan(
        execution_plan.build_subset_plan(["add_two"], pipeline_def, environment_config),
        InMemoryPipeline(pipeline_def),
        run_config=run_config,
        pipeline_run=pipeline_run,
        instance=instance,
    )

    # Original run's intermediates are unchanged by the subset execution.
    storage = build_fs_intermediate_storage(instance.intermediates_directory, result.run_id)
    assert storage.get_intermediate(None, Int, StepOutputHandle("add_one")).obj == 4
    assert storage.get_intermediate(None, Int, StepOutputHandle("add_two")).obj == 6
    assert not get_step_output_event(step_events, "add_one")
    assert get_step_output_event(step_events, "add_two")
def test_execution_plan_reexecution():
    """Build a subset ExecutionPlan for add_two and verify only add_two's output is written."""
    pipeline_def = define_addy_pipeline(using_file_system=True)
    instance = DagsterInstance.ephemeral()
    run_config = {"solids": {"add_one": {"inputs": {"num": {"value": 3}}}}}

    result = execute_pipeline(
        pipeline_def,
        run_config=run_config,
        instance=instance,
    )
    assert result.success

    def _result_path(run_id, step_name):
        # presumably the fs io-manager writes each output to
        # <storage>/<run_id>/<step>/result — layout taken from the original asserts
        return os.path.join(instance.storage_directory(), run_id, step_name, "result")

    with open(_result_path(result.run_id, "add_one"), "rb") as read_obj:
        assert pickle.load(read_obj) == 4
    with open(_result_path(result.run_id, "add_two"), "rb") as read_obj:
        assert pickle.load(read_obj) == 6

    ## re-execute add_two
    resolved_run_config = ResolvedRunConfig.build(
        pipeline_def,
        run_config=run_config,
    )
    execution_plan = ExecutionPlan.build(
        InMemoryPipeline(pipeline_def),
        resolved_run_config,
    )
    subset_plan = execution_plan.build_subset_plan(["add_two"], pipeline_def, resolved_run_config)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def,
        execution_plan=subset_plan,
        run_config=run_config,
        parent_run_id=result.run_id,
        root_run_id=result.run_id,
    )
    step_events = execute_plan(
        subset_plan,
        InMemoryPipeline(pipeline_def),
        run_config=run_config,
        pipeline_run=pipeline_run,
        instance=instance,
    )

    # add_one was not part of the subset, so no output file exists for the new run.
    assert not os.path.exists(_result_path(pipeline_run.run_id, "add_one"))
    with open(_result_path(pipeline_run.run_id, "add_two"), "rb") as read_obj:
        assert pickle.load(read_obj) == 6
    assert not get_step_output_event(step_events, "add_one")
    assert get_step_output_event(step_events, "add_two")