Example #1
def test_pipeline_step_key_subset_execution():
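    # Run the full pipeline, then re-execute only 'add_two.compute' with reexecute_pipeline;
    # verify both intermediates are present, only add_two emits a step output event, and an
    # unknown step key raises DagsterExecutionStepNotFoundError.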
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs({'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}})
    result = execute_pipeline(pipeline_def, run_config=run_config, instance=instance)

    assert result.success

    intermediates_manager = IntermediateStoreIntermediatesManager(
        build_fs_intermediate_store(instance.intermediates_directory, result.run_id)
    )
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_one.compute')).obj
        == 4
    )
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_two.compute')).obj
        == 6
    )

    ## re-execute add_two

    pipeline_reexecution_result = reexecute_pipeline(
        pipeline_def,
        parent_run_id=result.run_id,
        run_config=run_config,
        step_keys_to_execute=['add_two.compute'],
        instance=instance,
    )

    assert pipeline_reexecution_result.success

    step_events = pipeline_reexecution_result.step_event_list
    assert step_events

    intermediates_manager = IntermediateStoreIntermediatesManager(
        build_fs_intermediate_store(instance.intermediates_directory, result.run_id)
    )
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_one.compute')).obj
        == 4
    )
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_two.compute')).obj
        == 6
    )

    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')

    with pytest.raises(
        DagsterExecutionStepNotFoundError, match='Execution plan does not contain step'
    ):
        reexecute_pipeline(
            pipeline_def,
            parent_run_id=result.run_id,
            run_config=run_config,
            step_keys_to_execute=['nope.compute'],
            instance=instance,
        )
Example #2
def test_pipeline_step_key_subset_execution():
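    # Same scenario with the run created explicitly: instance.create_run_for_pipeline(...,
    # step_keys_to_execute=['add_two.compute']) followed by execute_run.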
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    environment_dict = env_with_fs(
        {'solids': {
            'add_one': {
                'inputs': {
                    'num': {
                        'value': 3
                    }
                }
            }
        }})
    result = execute_pipeline(pipeline_def,
                              environment_dict=environment_dict,
                              instance=instance)

    assert result.success

    store = build_fs_intermediate_store(instance.intermediates_directory,
                                        result.run_id)
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    ## re-execute add_two

    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        step_keys_to_execute=['add_two.compute'],
        parent_run_id=result.run_id,
        root_run_id=result.run_id,
    )

    pipeline_reexecution_result = execute_run(pipeline_def, pipeline_run,
                                              instance)

    assert pipeline_reexecution_result.success

    step_events = pipeline_reexecution_result.step_event_list
    assert step_events

    store = build_fs_intermediate_store(instance.intermediates_directory,
                                        pipeline_reexecution_result.run_id)
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')

    with pytest.raises(DagsterExecutionStepNotFoundError,
                       match='Execution plan does not contain step'):
        pipeline_run = instance.create_run_for_pipeline(
            pipeline_def,
            environment_dict=environment_dict,
            step_keys_to_execute=['nope.compute'],
            parent_run_id=result.run_id,
            root_run_id=result.run_id,
        )
Example #3
def test_execution_plan_reexecution():
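    # Re-execute a single step at the execution-plan level: build the plan, create a run for it,
    # then call execute_plan on build_subset_plan(['add_two.compute']).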
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    environment_dict = env_with_fs(
        {'solids': {
            'add_one': {
                'inputs': {
                    'num': {
                        'value': 3
                    }
                }
            }
        }})
    result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        instance=instance,
    )

    assert result.success

    intermediates_manager = IntermediateStoreIntermediatesManager(
        build_fs_intermediate_store(instance.intermediates_directory,
                                    result.run_id))
    assert (intermediates_manager.get_intermediate(
        None, Int, StepOutputHandle('add_one.compute')).obj == 4)
    assert (intermediates_manager.get_intermediate(
        None, Int, StepOutputHandle('add_two.compute')).obj == 6)

    ## re-execute add_two

    execution_plan = create_execution_plan(pipeline_def,
                                           environment_dict=environment_dict)

    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def,
        execution_plan=execution_plan,
        environment_dict=environment_dict,
        parent_run_id=result.run_id,
        root_run_id=result.run_id,
    )

    step_events = execute_plan(
        execution_plan.build_subset_plan(['add_two.compute']),
        environment_dict=environment_dict,
        pipeline_run=pipeline_run,
        instance=instance,
    )

    intermediates_manager = IntermediateStoreIntermediatesManager(
        build_fs_intermediate_store(instance.intermediates_directory,
                                    result.run_id))
    assert (intermediates_manager.get_intermediate(
        None, Int, StepOutputHandle('add_one.compute')).obj == 4)
    assert (intermediates_manager.get_intermediate(
        None, Int, StepOutputHandle('add_two.compute')).obj == 6)

    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')
Example #4
def test_execution_plan_reexecution():
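    # Older plan-level variant: re-execution is described by RunConfig/ReexecutionConfig and the
    # step subset is passed to execute_plan via step_keys_to_execute.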
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    old_run_id = str(uuid.uuid4())
    environment_dict = env_with_fs(
        {'solids': {
            'add_one': {
                'inputs': {
                    'num': {
                        'value': 3
                    }
                }
            }
        }})
    result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(run_id=old_run_id),
        instance=instance,
    )

    assert result.success

    store = build_fs_intermediate_store(instance.intermediates_directory,
                                        result.run_id)
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    ## re-execute add_two

    new_run_id = str(uuid.uuid4())

    run_config = RunConfig(
        run_id=new_run_id,
        reexecution_config=ReexecutionConfig(
            previous_run_id=result.run_id,
            step_output_handles=[StepOutputHandle('add_one.compute')]),
    )

    execution_plan = create_execution_plan(pipeline_def,
                                           environment_dict=environment_dict,
                                           run_config=run_config)

    step_events = execute_plan(
        execution_plan,
        environment_dict=environment_dict,
        run_config=run_config,
        step_keys_to_execute=['add_two.compute'],
        instance=instance,
    )

    store = build_fs_intermediate_store(instance.intermediates_directory,
                                        new_run_id)
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')
Example #5
def test_execution_plan_reexecution():
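    # Variant that builds the PipelineRun by hand (mode='default', previous_run_id) and executes
    # a subset plan containing only 'add_two.compute'.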
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    old_run_id = str(uuid.uuid4())
    environment_dict = env_with_fs(
        {'solids': {
            'add_one': {
                'inputs': {
                    'num': {
                        'value': 3
                    }
                }
            }
        }})
    result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(run_id=old_run_id),
        instance=instance,
    )

    assert result.success

    store = build_fs_intermediate_store(instance.intermediates_directory,
                                        result.run_id)
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    ## re-execute add_two

    new_run_id = str(uuid.uuid4())

    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name,
        run_id=new_run_id,
        environment_dict=environment_dict,
        mode='default',
        previous_run_id=result.run_id,
    )

    execution_plan = create_execution_plan(pipeline_def,
                                           environment_dict=environment_dict,
                                           run_config=pipeline_run)

    step_events = execute_plan(
        execution_plan.build_subset_plan(['add_two.compute']),
        environment_dict=environment_dict,
        pipeline_run=pipeline_run,
        instance=instance,
    )

    store = build_fs_intermediate_store(instance.intermediates_directory,
                                        new_run_id)
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')
Example #6
def test_pipeline_step_key_subset_execution():
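    # Variant where the re-execution is requested through RunConfig(previous_run_id=...,
    # step_keys_to_execute=['add_two.compute']) passed to execute_pipeline.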
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    old_run_id = make_new_run_id()
    environment_dict = env_with_fs(
        {'solids': {
            'add_one': {
                'inputs': {
                    'num': {
                        'value': 3
                    }
                }
            }
        }})
    result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(run_id=old_run_id),
        instance=instance,
    )

    assert result.success

    store = build_fs_intermediate_store(instance.intermediates_directory,
                                        result.run_id)
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    ## re-execute add_two

    new_run_id = make_new_run_id()

    pipeline_reexecution_result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(
            run_id=new_run_id,
            previous_run_id=result.run_id,
            step_keys_to_execute=['add_two.compute'],
        ),
        instance=instance,
    )

    assert pipeline_reexecution_result.success

    step_events = pipeline_reexecution_result.step_event_list
    assert step_events

    store = build_fs_intermediate_store(instance.intermediates_directory,
                                        new_run_id)
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 4
    assert store.get_intermediate(None, 'add_two.compute', Int).obj == 6

    assert not get_step_output_event(step_events, 'add_one.compute')
    assert get_step_output_event(step_events, 'add_two.compute')
Example #7
def test_file_system_intermediate_store_with_composite_type_storage_plugin():
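    # Composite types (List, Optional, and their nestings) are not supported by type storage
    # plugins, so every set_value call below is expected to raise NotImplementedCheckError.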
    run_id = str(uuid.uuid4())

    # FIXME need a dedicated test bucket
    intermediate_store = build_fs_intermediate_store(
        DagsterInstance.ephemeral().intermediates_directory,
        run_id=run_id,
        type_storage_plugin_registry=TypeStoragePluginRegistry(
            {RuntimeString.inst(): FancyStringFilesystemTypeStoragePlugin}),
    )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(['hello'], context,
                                         resolve_to_runtime_type(List[String]),
                                         ['obj_name'])

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(['hello'], context,
                                         resolve_to_runtime_type(
                                             Optional[String]), ['obj_name'])

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(
                ['hello'], context,
                resolve_to_runtime_type(List[Optional[String]]), ['obj_name'])

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(
                ['hello'], context,
                resolve_to_runtime_type(Optional[List[String]]), ['obj_name'])
Example #8
def test_using_intermediates_to_override():
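    # With both 'storage: filesystem' and 'intermediate_storage: in_memory' configured, the
    # intermediate_storage setting wins, so no intermediate for 'return_one.compute' is written
    # to the filesystem store.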
    pipeline = define_inty_pipeline()

    run_config = {
        'storage': {
            'filesystem': {}
        },
        'intermediate_storage': {
            'in_memory': {}
        }
    }

    instance = DagsterInstance.ephemeral()
    execution_plan = create_execution_plan(
        pipeline,
        run_config=run_config,
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline, execution_plan=execution_plan)
    assert execution_plan.get_step_by_key('return_one.compute')

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['return_one.compute']),
            instance,
            run_config=run_config,
            pipeline_run=pipeline_run,
        ))

    store = build_fs_intermediate_store(instance.intermediates_directory,
                                        pipeline_run.run_id)
    intermediates_manager = IntermediateStoreIntermediatesManager(store)
    assert get_step_output(return_one_step_events, 'return_one.compute')
    assert not intermediates_manager.has_intermediate(
        None, StepOutputHandle('return_one.compute'))
Example #9
def test_file_system_intermediate_store_with_composite_type_storage_plugin():
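    # Same composite-type check against the list-based TypeStoragePluginRegistry and
    # resolve_dagster_type: each set_value with a List/Optional type should raise
    # NotImplementedCheckError.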
    run_id = make_new_run_id()

    intermediate_store = build_fs_intermediate_store(
        DagsterInstance.ephemeral().intermediates_directory,
        run_id=run_id,
        type_storage_plugin_registry=TypeStoragePluginRegistry(
            [(RuntimeString, FancyStringFilesystemTypeStoragePlugin)]
        ),
    )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(
                ['hello'], context, resolve_dagster_type(List[String]), ['obj_name']
            )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(
                ['hello'], context, resolve_dagster_type(Optional[String]), ['obj_name']
            )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(
                ['hello'], context, resolve_dagster_type(List[Optional[String]]), ['obj_name']
            )

    with yield_empty_pipeline_context(run_id=run_id) as context:
        with pytest.raises(check.NotImplementedCheckError):
            intermediate_store.set_value(
                ['hello'], context, resolve_dagster_type(Optional[List[String]]), ['obj_name']
            )
Example #10
def test_successful_one_part_execute_plan(graphql_context, snapshot):
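    # Execute only 'sum_solid.compute' through the executePlan GraphQL mutation, then verify the
    # emitted step events and the DataFrame intermediate written to the filesystem store.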
    instance = graphql_context.instance
    environment_dict = csv_hello_world_solids_config_fs_storage()
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=csv_hello_world, environment_dict=environment_dict)
    selector = get_legacy_pipeline_selector(graphql_context, 'csv_hello_world')

    result = execute_dagster_graphql(
        graphql_context,
        EXECUTE_PLAN_QUERY,
        variables={
            'executionParams': {
                'selector': selector,
                'runConfigData': environment_dict,
                'stepKeys': ['sum_solid.compute'],
                'executionMetadata': {
                    'runId': pipeline_run.run_id
                },
                'mode': 'default',
            }
        },
    )

    query_result = result.data['executePlan']

    assert query_result['__typename'] == 'ExecutePlanSuccess'
    assert query_result['pipeline']['name'] == 'csv_hello_world'
    assert query_result['hasFailures'] is False

    step_events = query_result['stepEvents']

    assert [se['__typename'] for se in step_events] == [
        'ExecutionStepStartEvent',
        'ExecutionStepInputEvent',
        'ExecutionStepOutputEvent',
        'ObjectStoreOperationEvent',
        'ExecutionStepSuccessEvent',
    ]

    assert step_events[1]['step']['key'] == 'sum_solid.compute'
    assert step_events[2]['outputName'] == 'result'

    expected_value_repr = (
        '''[OrderedDict([('num1', '1'), ('num2', '2'), ('sum', 3)]), '''
        '''OrderedDict([('num1', '3'), ('num2', '4'), ('sum', 7)])]''')

    assert step_events[3]['step']['key'] == 'sum_solid.compute'
    assert step_events[4]['step']['key'] == 'sum_solid.compute'

    snapshot.assert_match(clean_log_messages(result.data))

    store = build_fs_intermediate_store(instance.intermediates_directory,
                                        pipeline_run.run_id)
    intermediates_manager = IntermediateStoreIntermediatesManager(store)
    assert intermediates_manager.has_intermediate(
        None, StepOutputHandle('sum_solid.compute'))
    assert (str(
        intermediates_manager.get_intermediate(
            None, PoorMansDataFrame,
            StepOutputHandle('sum_solid.compute')).obj) == expected_value_repr)
Example #11
def test_successful_one_part_execute_plan(snapshot):
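    # Earlier executePlan variant using create_empty_run and environmentConfigData; the step
    # event list is bracketed by EngineEvents.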
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()
    instance.create_empty_run(run_id, 'csv_hello_world')

    result = execute_dagster_graphql(
        define_test_context(instance=instance),
        EXECUTE_PLAN_QUERY,
        variables={
            'executionParams': {
                'selector': {
                    'name': 'csv_hello_world'
                },
                'environmentConfigData':
                csv_hello_world_solids_config_fs_storage(),
                'stepKeys': ['sum_solid.compute'],
                'executionMetadata': {
                    'runId': run_id
                },
                'mode': 'default',
            }
        },
    )

    query_result = result.data['executePlan']

    assert query_result['__typename'] == 'ExecutePlanSuccess'
    assert query_result['pipeline']['name'] == 'csv_hello_world'
    assert query_result['hasFailures'] is False

    step_events = query_result['stepEvents']

    assert [se['__typename'] for se in step_events] == [
        'EngineEvent',
        'ExecutionStepStartEvent',
        'ExecutionStepInputEvent',
        'ExecutionStepOutputEvent',
        'ObjectStoreOperationEvent',
        'ExecutionStepSuccessEvent',
        'EngineEvent',
    ]

    assert step_events[1]['step']['key'] == 'sum_solid.compute'
    assert step_events[3]['outputName'] == 'result'

    expected_value_repr = (
        '''[OrderedDict([('num1', '1'), ('num2', '2'), ('sum', 3)]), '''
        '''OrderedDict([('num1', '3'), ('num2', '4'), ('sum', 7)])]''')

    assert step_events[4]['step']['key'] == 'sum_solid.compute'
    assert step_events[5]['step']['key'] == 'sum_solid.compute'

    snapshot.assert_match(clean_log_messages(result.data))

    store = build_fs_intermediate_store(instance.intermediates_directory,
                                        run_id)
    assert store.has_intermediate(None, 'sum_solid.compute')
    assert (str(
        store.get_intermediate(None, 'sum_solid.compute',
                               PoorMansDataFrame).obj) == expected_value_repr)
Example #12
def test_using_file_system_for_subplan_multiprocessing():
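    # Execute 'return_one.compute' and then 'add_one.compute' as separate subset plans with the
    # multiprocess executor, checking each intermediate through the intermediates manager.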

    environment_dict = {'storage': {'filesystem': {}}}
    instance = DagsterInstance.local_temp()

    pipeline = reconstructable(define_inty_pipeline)

    execution_plan = create_execution_plan(pipeline, environment_dict=environment_dict)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline.get_definition(), execution_plan=execution_plan
    )

    assert execution_plan.get_step_by_key('return_one.compute')

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['return_one.compute']),
            instance,
            environment_dict=dict(environment_dict, execution={'multiprocess': {}}),
            pipeline_run=pipeline_run,
        )
    )

    store = build_fs_intermediate_store(instance.intermediates_directory, pipeline_run.run_id)
    intermediates_manager = IntermediateStoreIntermediatesManager(store)

    assert get_step_output(return_one_step_events, 'return_one.compute')
    assert intermediates_manager.has_intermediate(None, StepOutputHandle('return_one.compute'))
    assert (
        intermediates_manager.get_intermediate(
            None, Int, StepOutputHandle('return_one.compute')
        ).obj
        == 1
    )

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['add_one.compute']),
            instance,
            environment_dict=dict(environment_dict, execution={'multiprocess': {}}),
            pipeline_run=pipeline_run,
        )
    )

    assert get_step_output(add_one_step_events, 'add_one.compute')
    assert intermediates_manager.has_intermediate(None, StepOutputHandle('add_one.compute'))
    assert (
        intermediates_manager.get_intermediate(None, Int, StepOutputHandle('add_one.compute')).obj
        == 2
    )
Example #13
def test_file_system_intermediate_store():
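    # Basic filesystem-store round trip for a Bool: set, has, get, URI check, then rm_object
    # (removing an already-removed or unknown key returns None).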
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()
    intermediate_store = build_fs_intermediate_store(
        instance.intermediates_directory, run_id=run_id
    )

    with yield_empty_pipeline_context(run_id=run_id, instance=instance) as context:
        intermediate_store.set_object(True, context, RuntimeBool, ['true'])
        assert intermediate_store.has_object(context, ['true'])
        assert intermediate_store.get_object(context, RuntimeBool, ['true']).obj is True
        assert intermediate_store.uri_for_paths(['true']).startswith('file:///')
        assert intermediate_store.rm_object(context, ['true']) is None
        assert intermediate_store.rm_object(context, ['true']) is None
        assert intermediate_store.rm_object(context, ['dslkfhjsdflkjfs']) is None
Example #14
def test_using_file_system_for_subplan_multiprocessing():
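    # Multiprocess subplan variant that targets the pipeline via ExecutionTargetHandle and
    # selects steps with step_keys_to_execute on execute_plan.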

    environment_dict = {'storage': {'filesystem': {}}}
    instance = DagsterInstance.local_temp()

    execution_plan = create_execution_plan(
        ExecutionTargetHandle.for_pipeline_fn(
            define_inty_pipeline).build_pipeline_definition(),
        environment_dict=environment_dict,
    )

    assert execution_plan.get_step_by_key('return_one.compute')

    step_keys = ['return_one.compute']

    run_id = str(uuid.uuid4())
    instance.create_empty_run(run_id, execution_plan.pipeline_def.name)

    return_one_step_events = list(
        execute_plan(
            execution_plan,
            instance,
            environment_dict=dict(environment_dict,
                                  execution={'multiprocess': {}}),
            run_config=RunConfig(run_id=run_id),
            step_keys_to_execute=step_keys,
        ))

    store = build_fs_intermediate_store(instance.intermediates_directory,
                                        run_id)

    assert get_step_output(return_one_step_events, 'return_one.compute')
    assert store.has_intermediate(None, 'return_one.compute')
    assert store.get_intermediate(None, 'return_one.compute', Int).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan,
            instance,
            environment_dict=dict(environment_dict,
                                  execution={'multiprocess': {}}),
            run_config=RunConfig(run_id=run_id),
            step_keys_to_execute=['add_one.compute'],
        ))

    assert get_step_output(add_one_step_events, 'add_one.compute')
    assert store.has_intermediate(None, 'add_one.compute')
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 2
Example #15
def test_file_system_intermediate_store_composite_types():
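    # Round-trip a List[Bool] value through the filesystem intermediate store.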
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()

    intermediate_store = build_fs_intermediate_store(
        instance.intermediates_directory, run_id=run_id
    )

    with yield_empty_pipeline_context(instance=instance, run_id=run_id) as context:
        intermediate_store.set_object(
            [True, False], context, resolve_dagster_type(List[Bool]), ['bool']
        )
        assert intermediate_store.has_object(context, ['bool'])
        assert intermediate_store.get_object(
            context, resolve_dagster_type(List[Bool]), ['bool']
        ).obj == [True, False]
Example #16
def test_file_system_intermediate_store_with_custom_serializer():
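    # LowercaseString's custom serializer writes the value uppercased ('FOO') on disk, while
    # get_object returns the original lowercase string.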
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()
    intermediate_store = build_fs_intermediate_store(
        instance.intermediates_directory, run_id=run_id
    )

    with yield_empty_pipeline_context(run_id=run_id, instance=instance) as context:

        intermediate_store.set_object('foo', context, LowercaseString, ['foo'])

        with open(os.path.join(intermediate_store.root, 'foo'), 'rb') as fd:
            assert fd.read().decode('utf-8') == 'FOO'

        assert intermediate_store.has_object(context, ['foo'])
        assert intermediate_store.get_object(context, LowercaseString, ['foo']).obj == 'foo'
Example #17
def test_file_system_intermediate_store_composite_types_with_custom_serializer_for_inner_type():
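    # Round-trip a list whose inner type (LowercaseString) has a custom serializer; note that
    # the read-back below resolves List[Bool] rather than List[LowercaseString].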
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()
    intermediate_store = build_fs_intermediate_store(
        instance.intermediates_directory, run_id=run_id
    )

    with yield_empty_pipeline_context(run_id=run_id, instance=instance) as context:

        intermediate_store.set_object(
            ['foo', 'bar'], context, resolve_dagster_type(List[LowercaseString]), ['list']
        )
        assert intermediate_store.has_object(context, ['list'])
        assert intermediate_store.get_object(
            context, resolve_dagster_type(List[Bool]), ['list']
        ).obj == ['foo', 'bar']
Example #18
def test_spark_data_frame_serialization_file_system_file_handle(spark_config):
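    # Ingest a CSV file handle into a Spark DataFrame and check that the step result is
    # persisted as parquet under the run's filesystem intermediate store.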
    @solid
    def nonce(_):
        return LocalFileHandle(file_relative_path(__file__, 'data/test.csv'))

    @pipeline(mode_defs=[spark_mode])
    def spark_df_test_pipeline():

        ingest_csv_file_handle_to_spark(nonce())

    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()
    intermediate_store = build_fs_intermediate_store(
        instance.intermediates_directory, run_id=run_id)

    result = execute_pipeline(
        spark_df_test_pipeline,
        run_config=RunConfig(run_id=run_id, mode='spark'),
        environment_dict={
            'storage': {
                'filesystem': {}
            },
            'resources': {
                'spark': {
                    'config': {
                        'spark_conf': spark_config
                    }
                }
            },
        },
        instance=instance,
    )

    assert result.success
    result_dir = os.path.join(
        intermediate_store.root,
        'intermediates',
        'ingest_csv_file_handle_to_spark.compute',
        'result',
    )

    assert '_SUCCESS' in os.listdir(result_dir)

    spark = SparkSession.builder.getOrCreate()
    df = spark.read.parquet(result_dir)
    assert isinstance(df, pyspark.sql.dataframe.DataFrame)
    assert df.head()[0] == '1'
Example #19
def test_success_whole_execution_plan_with_in_memory_config(
        graphql_context, snapshot):
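    # Run the whole plan through executePlan with in-memory storage: both steps succeed, but no
    # intermediates end up in the filesystem store.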
    instance = graphql_context.instance
    environment_dict = merge_dicts(csv_hello_world_solids_config(),
                                   {'storage': {
                                       'in_memory': {}
                                   }})
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=csv_hello_world, environment_dict=environment_dict)
    result = execute_dagster_graphql(
        graphql_context,
        EXECUTE_PLAN_QUERY,
        variables={
            'executionParams': {
                'selector': {
                    'name': 'csv_hello_world'
                },
                'environmentConfigData': environment_dict,
                'stepKeys': None,
                'executionMetadata': {
                    'runId': pipeline_run.run_id
                },
                'mode': 'default',
            }
        },
    )

    query_result = result.data['executePlan']

    assert query_result['__typename'] == 'ExecutePlanSuccess'
    assert query_result['pipeline']['name'] == 'csv_hello_world'
    assert query_result['hasFailures'] is False
    step_events = {
        step_event['step']['key']: step_event
        for step_event in query_result['stepEvents'] if step_event['step']
    }
    assert 'sum_solid.compute' in step_events
    assert 'sum_sq_solid.compute' in step_events

    snapshot.assert_match(clean_log_messages(result.data))
    store = build_fs_intermediate_store(instance.intermediates_directory,
                                        pipeline_run.run_id)
    intermediates_manager = IntermediateStoreIntermediatesManager(store)
    assert not intermediates_manager.has_intermediate(
        None, StepOutputHandle('sum_solid.compute'))
    assert not intermediates_manager.has_intermediate(
        None, StepOutputHandle('sum_sq_solid.compute'))
Example #20
def test_success_whole_execution_plan_with_in_memory_config(snapshot):
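    # Earlier variant of the in-memory whole-plan test, driven through create_empty_run and
    # environmentConfigData.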
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()
    instance.create_empty_run(run_id, 'csv_hello_world')
    result = execute_dagster_graphql(
        define_test_context(instance=instance),
        EXECUTE_PLAN_QUERY,
        variables={
            'executionParams': {
                'selector': {
                    'name': 'csv_hello_world'
                },
                'environmentConfigData':
                merge_dicts(csv_hello_world_solids_config(),
                            {'storage': {
                                'in_memory': {}
                            }}),
                'stepKeys':
                None,
                'executionMetadata': {
                    'runId': run_id
                },
                'mode':
                'default',
            }
        },
    )

    query_result = result.data['executePlan']

    assert query_result['__typename'] == 'ExecutePlanSuccess'
    assert query_result['pipeline']['name'] == 'csv_hello_world'
    assert query_result['hasFailures'] is False
    step_events = {
        step_event['step']['key']: step_event
        for step_event in query_result['stepEvents'] if step_event['step']
    }
    assert 'sum_solid.compute' in step_events
    assert 'sum_sq_solid.compute' in step_events

    snapshot.assert_match(clean_log_messages(result.data))
    store = build_fs_intermediate_store(instance.intermediates_directory,
                                        run_id)
    assert not store.has_intermediate(None, 'sum_solid.compute')
    assert not store.has_intermediate(None, 'sum_sq_solid.compute')
Example #21
def test_using_file_system_for_subplan_multiprocessing():
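    # Multiprocess subplan variant where the pipeline definition is rebuilt from an
    # ExecutionTargetHandle before creating the run and executing each subset plan.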

    environment_dict = {'storage': {'filesystem': {}}}
    instance = DagsterInstance.local_temp()

    pipeline_def = ExecutionTargetHandle.for_pipeline_fn(
        define_inty_pipeline
    ).build_pipeline_definition()

    execution_plan = create_execution_plan(pipeline_def, environment_dict=environment_dict)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline=pipeline_def, execution_plan=execution_plan
    )

    assert execution_plan.get_step_by_key('return_one.compute')

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['return_one.compute']),
            instance,
            environment_dict=dict(environment_dict, execution={'multiprocess': {}}),
            pipeline_run=pipeline_run,
        )
    )

    store = build_fs_intermediate_store(instance.intermediates_directory, pipeline_run.run_id)

    assert get_step_output(return_one_step_events, 'return_one.compute')
    assert store.has_intermediate(None, 'return_one.compute')
    assert store.get_intermediate(None, 'return_one.compute', Int).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['add_one.compute']),
            instance,
            environment_dict=dict(environment_dict, execution={'multiprocess': {}}),
            pipeline_run=pipeline_run,
        )
    )

    assert get_step_output(add_one_step_events, 'add_one.compute')
    assert store.has_intermediate(None, 'add_one.compute')
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 2
Example #22
def test_using_file_system_for_subplan():
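    # Single-process subplan execution selecting steps with step_keys_to_execute and a
    # RunConfig(run_id=...), checking the filesystem store after each step.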
    pipeline = define_inty_pipeline()

    environment_dict = {'storage': {'filesystem': {}}}

    execution_plan = create_execution_plan(pipeline,
                                           environment_dict=environment_dict)
    instance = DagsterInstance.ephemeral()
    assert execution_plan.get_step_by_key('return_one.compute')

    step_keys = ['return_one.compute']

    run_id = str(uuid.uuid4())

    return_one_step_events = list(
        execute_plan(
            execution_plan,
            instance,
            environment_dict=environment_dict,
            run_config=RunConfig(run_id=run_id),
            step_keys_to_execute=step_keys,
        ))

    store = build_fs_intermediate_store(instance.intermediates_directory,
                                        run_id)
    assert get_step_output(return_one_step_events, 'return_one.compute')
    assert store.has_intermediate(None, 'return_one.compute')
    assert store.get_intermediate(None, 'return_one.compute', Int).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan,
            instance,
            environment_dict=environment_dict,
            run_config=RunConfig(run_id=run_id),
            step_keys_to_execute=['add_one.compute'],
        ))

    assert get_step_output(add_one_step_events, 'add_one.compute')
    assert store.has_intermediate(None, 'add_one.compute')
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 2
Example #23
def test_file_system_intermediate_store_with_type_storage_plugin():
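    # RuntimeString values are routed through the registered
    # FancyStringFilesystemTypeStoragePlugin; the stored object is removed in the finally block.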
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()

    intermediate_store = build_fs_intermediate_store(
        instance.intermediates_directory,
        run_id=run_id,
        type_storage_plugin_registry=TypeStoragePluginRegistry(
            [(RuntimeString, FancyStringFilesystemTypeStoragePlugin)]
        ),
    )

    with yield_empty_pipeline_context(run_id=run_id, instance=instance) as context:
        try:
            intermediate_store.set_value('hello', context, RuntimeString, ['obj_name'])

            assert intermediate_store.has_object(context, ['obj_name'])
            assert intermediate_store.get_value(context, RuntimeString, ['obj_name']) == 'hello'

        finally:
            intermediate_store.rm_object(context, ['obj_name'])
Example #24
def test_success_whole_execution_plan(snapshot):
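    # Whole-plan executePlan run with filesystem storage: both step intermediates should be
    # present in the store afterwards.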
    instance = DagsterInstance.local_temp()
    environment_dict = csv_hello_world_solids_config_fs_storage()
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=csv_hello_world, environment_dict=environment_dict)
    result = execute_dagster_graphql(
        define_test_context(instance=instance),
        EXECUTE_PLAN_QUERY,
        variables={
            'executionParams': {
                'selector': {
                    'name': 'csv_hello_world'
                },
                'environmentConfigData': environment_dict,
                'stepKeys': None,
                'executionMetadata': {
                    'runId': pipeline_run.run_id
                },
                'mode': 'default',
            }
        },
    )

    query_result = result.data['executePlan']

    assert query_result['__typename'] == 'ExecutePlanSuccess'
    assert query_result['pipeline']['name'] == 'csv_hello_world'
    assert query_result['hasFailures'] is False
    step_events = {
        step_event['step']['key']: step_event
        for step_event in query_result['stepEvents'] if step_event['step']
    }
    assert 'sum_solid.compute' in step_events
    assert 'sum_sq_solid.compute' in step_events

    snapshot.assert_match(clean_log_messages(result.data))
    store = build_fs_intermediate_store(instance.intermediates_directory,
                                        pipeline_run.run_id)
    assert store.has_intermediate(None, 'sum_solid.compute')
    assert store.has_intermediate(None, 'sum_sq_solid.compute')
Example #25
def test_using_file_system_for_subplan():
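    # Newer single-process subplan variant using build_subset_plan and a run created with
    # create_run_for_pipeline.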
    pipeline = define_inty_pipeline()

    environment_dict = {'storage': {'filesystem': {}}}

    instance = DagsterInstance.ephemeral()
    execution_plan = create_execution_plan(
        pipeline,
        environment_dict=environment_dict,
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline, execution_plan=execution_plan)
    assert execution_plan.get_step_by_key('return_one.compute')

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['return_one.compute']),
            instance,
            environment_dict=environment_dict,
            pipeline_run=pipeline_run,
        ))

    store = build_fs_intermediate_store(instance.intermediates_directory,
                                        pipeline_run.run_id)
    assert get_step_output(return_one_step_events, 'return_one.compute')
    assert store.has_intermediate(None, 'return_one.compute')
    assert store.get_intermediate(None, 'return_one.compute', Int).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['add_one.compute']),
            instance,
            environment_dict=environment_dict,
            pipeline_run=pipeline_run,
        ))

    assert get_step_output(add_one_step_events, 'add_one.compute')
    assert store.has_intermediate(None, 'add_one.compute')
    assert store.get_intermediate(None, 'add_one.compute', Int).obj == 2
Example #26
def test_file_system_intermediate_store_with_type_storage_plugin():
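    # Earlier registry API: the plugin is keyed by RuntimeString.inst() in a dict rather than a
    # list of (type, plugin) pairs.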
    run_id = str(uuid.uuid4())
    instance = DagsterInstance.ephemeral()
    # FIXME need a dedicated test bucket
    intermediate_store = build_fs_intermediate_store(
        instance.intermediates_directory,
        run_id=run_id,
        type_storage_plugin_registry=TypeStoragePluginRegistry(
            {RuntimeString.inst(): FancyStringFilesystemTypeStoragePlugin}),
    )

    with yield_empty_pipeline_context(run_id=run_id,
                                      instance=instance) as context:
        try:
            intermediate_store.set_value('hello', context,
                                         RuntimeString.inst(), ['obj_name'])

            assert intermediate_store.has_object(context, ['obj_name'])
            assert (intermediate_store.get_value(context, RuntimeString.inst(),
                                                 ['obj_name']) == 'hello')

        finally:
            intermediate_store.rm_object(context, ['obj_name'])
Example #27
    def test_successful_pipeline_reexecution(self, graphql_context):
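        # Run csv_hello_world, then re-execute only 'sum_sq_solid.compute' as a resume/retry
        # (RESUME_RETRY_TAG); the new run still has the sum_solid intermediate carried over from
        # the parent even though only sum_sq_solid re-ran.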
        selector = get_legacy_pipeline_selector(graphql_context,
                                                'csv_hello_world')
        run_id = make_new_run_id()
        result_one = execute_dagster_graphql_and_finish_runs(
            graphql_context,
            START_PIPELINE_EXECUTION_SNAPSHOT_QUERY,
            variables={
                'executionParams': {
                    'selector': selector,
                    'runConfigData':
                    csv_hello_world_solids_config_fs_storage(),
                    'executionMetadata': {
                        'runId': run_id
                    },
                    'mode': 'default',
                }
            },
        )

        assert result_one.data['startPipelineExecution'][
            '__typename'] == 'StartPipelineRunSuccess'

        expected_value_repr = (
            '''[OrderedDict([('num1', '1'), ('num2', '2'), ('sum', 3), '''
            '''('sum_sq', 9)]), OrderedDict([('num1', '3'), ('num2', '4'), ('sum', 7), '''
            '''('sum_sq', 49)])]''')

        instance = graphql_context.instance

        store = build_fs_intermediate_store(instance.intermediates_directory,
                                            run_id)
        intermediates_manager = IntermediateStoreIntermediatesManager(store)
        assert intermediates_manager.has_intermediate(
            None, StepOutputHandle('sum_solid.compute'))
        assert intermediates_manager.has_intermediate(
            None, StepOutputHandle('sum_sq_solid.compute'))
        assert (str(
            intermediates_manager.get_intermediate(
                None, PoorMansDataFrame,
                StepOutputHandle('sum_sq_solid.compute')).obj) ==
                expected_value_repr)

        # retry
        new_run_id = make_new_run_id()

        result_two = execute_dagster_graphql_and_finish_runs(
            graphql_context,
            START_PIPELINE_REEXECUTION_SNAPSHOT_QUERY,
            variables={
                'executionParams': {
                    'selector': selector,
                    'runConfigData':
                    csv_hello_world_solids_config_fs_storage(),
                    'stepKeys': ['sum_sq_solid.compute'],
                    'executionMetadata': {
                        'runId': new_run_id,
                        'rootRunId': run_id,
                        'parentRunId': run_id,
                        'tags': [{
                            'key': RESUME_RETRY_TAG,
                            'value': 'true'
                        }],
                    },
                    'mode': 'default',
                }
            },
        )

        query_result = result_two.data['startPipelineReexecution']
        assert query_result['__typename'] == 'StartPipelineRunSuccess'

        result = get_all_logs_for_finished_run_via_subscription(
            graphql_context, new_run_id)
        logs = result['pipelineRunLogs']['messages']

        assert isinstance(logs, list)
        assert has_event_of_type(logs, 'PipelineStartEvent')
        assert has_event_of_type(logs, 'PipelineSuccessEvent')
        assert not has_event_of_type(logs, 'PipelineFailureEvent')

        assert not get_step_output_event(logs, 'sum_solid.compute')
        assert get_step_output_event(logs, 'sum_sq_solid.compute')

        store = build_fs_intermediate_store(instance.intermediates_directory,
                                            new_run_id)
        intermediates_manager = IntermediateStoreIntermediatesManager(store)
        assert not intermediates_manager.has_intermediate(
            None,
            StepOutputHandle('sum_solid.inputs.num.read',
                             'input_thunk_output'))
        assert intermediates_manager.has_intermediate(
            None, StepOutputHandle('sum_solid.compute'))
        assert intermediates_manager.has_intermediate(
            None, StepOutputHandle('sum_sq_solid.compute'))
        assert (str(
            intermediates_manager.get_intermediate(
                None, PoorMansDataFrame,
                StepOutputHandle('sum_sq_solid.compute')).obj) ==
                expected_value_repr)
Example #28
def test_successful_pipeline_reexecution(snapshot):
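    # Snapshot-based re-execution variant: the retry is requested with retryRunId and the step
    # logs are read from the startPipelineExecution result.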
    def sanitize_result_data(result_data):
        if isinstance(result_data, dict):
            if 'path' in result_data:
                result_data['path'] = 'DUMMY_PATH'
            result_data = {
                k: sanitize_result_data(v)
                for k, v in result_data.items()
            }
        elif isinstance(result_data, list):
            for i in range(len(result_data)):
                result_data[i] = sanitize_result_data(result_data[i])
        else:
            pass
        return result_data

    run_id = str(uuid.uuid4())
    instance = DagsterInstance.ephemeral()
    result_one = execute_dagster_graphql(
        define_context(instance=instance),
        START_PIPELINE_EXECUTION_SNAPSHOT_QUERY,
        variables={
            'executionParams': {
                'selector': {
                    'name': 'csv_hello_world'
                },
                'environmentConfigData':
                csv_hello_world_solids_config_fs_storage(),
                'executionMetadata': {
                    'runId': run_id
                },
                'mode': 'default',
            }
        },
    )

    assert (result_one.data['startPipelineExecution']['__typename'] ==
            'StartPipelineExecutionSuccess')

    snapshot.assert_match(sanitize_result_data(result_one.data))

    expected_value_repr = (
        '''[OrderedDict([('num1', '1'), ('num2', '2'), ('sum', 3), '''
        '''('sum_sq', 9)]), OrderedDict([('num1', '3'), ('num2', '4'), ('sum', 7), '''
        '''('sum_sq', 49)])]''')

    store = build_fs_intermediate_store(instance.intermediates_directory,
                                        run_id)
    assert store.has_intermediate(None, 'sum_solid.compute')
    assert store.has_intermediate(None, 'sum_sq_solid.compute')
    assert (str(
        store.get_intermediate(None, 'sum_sq_solid.compute',
                               PoorMansDataFrame).obj) == expected_value_repr)

    new_run_id = str(uuid.uuid4())

    result_two = execute_dagster_graphql(
        define_context(instance=instance),
        START_PIPELINE_EXECUTION_SNAPSHOT_QUERY,
        variables={
            'executionParams': {
                'selector': {
                    'name': 'csv_hello_world'
                },
                'environmentConfigData':
                csv_hello_world_solids_config_fs_storage(),
                'stepKeys': ['sum_sq_solid.compute'],
                'executionMetadata': {
                    'runId': new_run_id
                },
                'mode': 'default',
                'retryRunId': run_id,
            }
        },
    )

    query_result = result_two.data['startPipelineExecution']
    assert query_result['__typename'] == 'StartPipelineExecutionSuccess'
    logs = query_result['run']['logs']['nodes']

    assert isinstance(logs, list)
    assert has_event_of_type(logs, 'PipelineStartEvent')
    assert has_event_of_type(logs, 'PipelineSuccessEvent')
    assert not has_event_of_type(logs, 'PipelineFailureEvent')

    assert not get_step_output_event(logs, 'sum_solid.compute')
    assert get_step_output_event(logs, 'sum_sq_solid.compute')

    snapshot.assert_match(sanitize_result_data(result_two.data))

    store = build_fs_intermediate_store(instance.intermediates_directory,
                                        new_run_id)
    assert not store.has_intermediate(None, 'sum_solid.inputs.num.read',
                                      'input_thunk_output')
    assert store.has_intermediate(None, 'sum_solid.compute')
    assert store.has_intermediate(None, 'sum_sq_solid.compute')
    assert (str(
        store.get_intermediate(None, 'sum_sq_solid.compute',
                               PoorMansDataFrame).obj) == expected_value_repr)
Example #29
def test_successful_pipeline_reexecution():
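    # Re-execution through startPipelineReexecution with rootRunId/parentRunId and the
    # RESUME_RETRY_TAG; logs for the new run are fetched with sync_get_all_logs_for_run.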
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()
    context = define_test_context(instance=instance)
    result_one = execute_dagster_graphql(
        context,
        START_PIPELINE_EXECUTION_SNAPSHOT_QUERY,
        variables={
            'executionParams': {
                'selector': {'name': 'csv_hello_world'},
                'environmentConfigData': csv_hello_world_solids_config_fs_storage(),
                'executionMetadata': {'runId': run_id},
                'mode': 'default',
            }
        },
    )

    assert result_one.data['startPipelineExecution']['__typename'] == 'StartPipelineRunSuccess'

    expected_value_repr = (
        '''[OrderedDict([('num1', '1'), ('num2', '2'), ('sum', 3), '''
        '''('sum_sq', 9)]), OrderedDict([('num1', '3'), ('num2', '4'), ('sum', 7), '''
        '''('sum_sq', 49)])]'''
    )

    store = build_fs_intermediate_store(instance.intermediates_directory, run_id)
    assert store.has_intermediate(None, 'sum_solid.compute')
    assert store.has_intermediate(None, 'sum_sq_solid.compute')
    assert (
        str(store.get_intermediate(None, 'sum_sq_solid.compute', PoorMansDataFrame).obj)
        == expected_value_repr
    )

    # retry
    new_run_id = make_new_run_id()

    result_two = execute_dagster_graphql(
        define_test_context(instance=instance),
        START_PIPELINE_REEXECUTION_SNAPSHOT_QUERY,
        variables={
            'executionParams': {
                'selector': {'name': 'csv_hello_world'},
                'environmentConfigData': csv_hello_world_solids_config_fs_storage(),
                'stepKeys': ['sum_sq_solid.compute'],
                'executionMetadata': {
                    'runId': new_run_id,
                    'rootRunId': run_id,
                    'parentRunId': run_id,
                    'tags': [{'key': RESUME_RETRY_TAG, 'value': 'true'}],
                },
                'mode': 'default',
            }
        },
    )

    query_result = result_two.data['startPipelineReexecution']
    assert query_result['__typename'] == 'StartPipelineRunSuccess'

    result = sync_get_all_logs_for_run(context, new_run_id)
    logs = result['pipelineRunLogs']['messages']

    assert isinstance(logs, list)
    assert has_event_of_type(logs, 'PipelineStartEvent')
    assert has_event_of_type(logs, 'PipelineSuccessEvent')
    assert not has_event_of_type(logs, 'PipelineFailureEvent')

    assert not get_step_output_event(logs, 'sum_solid.compute')
    assert get_step_output_event(logs, 'sum_sq_solid.compute')

    store = build_fs_intermediate_store(instance.intermediates_directory, new_run_id)
    assert not store.has_intermediate(None, 'sum_solid.inputs.num.read', 'input_thunk_output')
    assert store.has_intermediate(None, 'sum_solid.compute')
    assert store.has_intermediate(None, 'sum_sq_solid.compute')
    assert (
        str(store.get_intermediate(None, 'sum_sq_solid.compute', PoorMansDataFrame).obj)
        == expected_value_repr
    )