Example #1
def test_basic_int_and_string_json_materialization():

    pipeline = multiple_output_pipeline()

    with get_temp_file_names(2) as file_tuple:
        filename_one, filename_two = file_tuple  # pylint: disable=E0632
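        # The environment dict routes each named output ('string', 'number') to a JSON file materialization.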
        result = execute_pipeline(
            pipeline,
            {
                "solids": {
                    "return_one_and_foo": {
                        "outputs": [
                            {"string": {"json": {"path": filename_one}}},
                            {"number": {"json": {"path": filename_two}}},
                        ]
                    }
                }
            },
        )

        assert result.success

        with open(filename_one, "r") as ff_1:
            value = json.loads(ff_1.read())
            assert value == {"value": "foo"}

        with open(filename_two, "r") as ff_2:
            value = json.loads(ff_2.read())
            assert value == {"value": 1}
Example #2
def test_basic_int_json_multiple_materializations():
    pipeline = single_int_output_pipeline()

    with get_temp_file_names(2) as file_tuple:
        filename_one, filename_two = file_tuple  # pylint: disable=E0632
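        # The single 'result' output is materialized twice, once to each temp path.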
        result = execute_pipeline(
            pipeline,
            {
                'solids': {
                    'return_one': {
                        'outputs': [
                            {'result': {'json': {'path': filename_one}}},
                            {'result': {'json': {'path': filename_two}}},
                        ]
                    }
                }
            },
        )

        assert result.success

        with open(filename_one, 'r') as ff:
            value = json.loads(ff.read())
            assert value == {'value': 1}

        with open(filename_two, 'r') as ff:
            value = json.loads(ff.read())
            assert value == {'value': 1}
Example #5
def test_pandas_multiple_outputs():
    with get_temp_file_names(2) as temp_tuple:
        # false positive on pylint error
        csv_file, parquet_file = temp_tuple  # pylint: disable=E0632
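        # Write the diamond pipeline's sum_mult_table output as both CSV and Parquet.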
        write_sum_mult_csv = dagster_pd.to_csv_solid('write_sum_mult_csv')
        write_sum_mult_parquet = dagster_pd.to_parquet_solid('write_sum_mult_parquet')

        pipeline = create_diamond_pipeline(
            extra_solids=[write_sum_mult_csv, write_sum_mult_parquet],
            extra_dependencies={
                write_sum_mult_csv.name: {
                    'df': DependencyDefinition('sum_mult_table'),
                },
                write_sum_mult_parquet.name: {
                    'df': DependencyDefinition('sum_mult_table'),
                }
            })

        environment = get_num_csv_environment({
            'load_csv': config.Solid({'path': script_relative_path('num.csv')}),
            write_sum_mult_csv.name: config.Solid({'path': csv_file}),
            write_sum_mult_parquet.name: config.Solid({'path': parquet_file}),
        })

        result = execute_pipeline(pipeline, environment)
        assert result.success

        assert os.path.exists(csv_file)
        output_csv_df = pd.read_csv(csv_file)
        assert output_csv_df.to_dict('list') == {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
            'mult': [2, 12],
            'sum_mult': [6, 84],
        }

        assert os.path.exists(parquet_file)
        output_parquet_df = pd.read_parquet(parquet_file)
        assert output_parquet_df.to_dict('list') == {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
            'mult': [2, 12],
            'sum_mult': [6, 84],
        }
Example #6
def test_complex_type_materialization():
    pipeline = multiple_output_pipeline()

    with get_temp_file_names(2) as file_tuple:
        filename_one, filename_two = file_tuple  # pylint: disable=E0632
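        # Materialize both outputs, then inspect the resulting STEP_MATERIALIZATION events.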
        result = execute_pipeline(
            pipeline,
            {
                'solids': {
                    'return_one_and_foo': {
                        'outputs': [
                            {'string': {'json': {'path': filename_one}}},
                            {'number': {'json': {'path': filename_two}}},
                        ]
                    }
                }
            },
        )

        assert result.success
        for event in result.event_list:
            if event.event_type == DagsterEventType.STEP_MATERIALIZATION:
                materialization = event.event_specific_data.materialization
                assert len(materialization.metadata_entries) == 3
                assert materialization.metadata_entries[1] in (
                    EventMetadataEntry(
                        label='system-type-name',
                        description=None,
                        entry_data=TextMetadataEntryData(text='String'),
                    ),
                    EventMetadataEntry(
                        label='system-type-name',
                        description=None,
                        entry_data=TextMetadataEntryData(text='Int'),
                    ),
                )
                assert materialization.metadata_entries[2] == EventMetadataEntry(
                    label='system-type-description',
                    description=None,
                    entry_data=TextMetadataEntryData(text='Any'),
                )
Example #7
def test_pandas_output_intermediate_parquet_files():
    with get_temp_file_names(2) as temp_tuple:
        # false positive on pylint error
        sum_file, mult_file = temp_tuple  # pylint: disable=E0632
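        # Persist the diamond pipeline's intermediate sum and mult tables as Parquet.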

        write_sum_table = dagster_pd.to_parquet_solid('write_sum_table')
        write_mult_table = dagster_pd.to_parquet_solid('write_mult_table')

        pipeline = create_diamond_pipeline(
            extra_solids=[write_sum_table, write_mult_table],
            extra_dependencies={
                write_sum_table.name: {
                    'df': DependencyDefinition('sum_table'),
                },
                write_mult_table.name: {
                    'df': DependencyDefinition('mult_table'),
                }
            })

        environment = get_num_csv_environment({
            'load_csv': config.Solid({'path': script_relative_path('num.csv')}),
            write_sum_table.name: config.Solid({'path': sum_file}),
            write_mult_table.name: config.Solid({'path': mult_file}),
        })

        pipeline_result = execute_pipeline(
            pipeline,
            environment,
        )

        assert pipeline_result.success

        expected_sum = {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
        }
        assert pd.read_parquet(sum_file).to_dict('list') == expected_sum

        # Also check the mult table; expected values match the CSV variant of this test.
        expected_mult = {
            'num1': [1, 3],
            'num2': [2, 4],
            'mult': [2, 12],
        }
        assert pd.read_parquet(mult_file).to_dict('list') == expected_mult
Example #8
def test_basic_pipeline_external_plan_execution():
    pipeline = define_inty_pipeline()

    with get_temp_file_names(2) as temp_files:
        temp_path, write_path = temp_files  # pylint: disable=W0632

        int_type = resolve_to_runtime_type(Int)

        serialize_to_file(int_type.serialization_strategy, 5, temp_path)

        execution_plan = create_execution_plan(pipeline)

        # Execute only the 'add_one.transform' step, unmarshalling its 'num' input
        # from temp_path and marshalling its 'result' output to write_path.
        results = execute_externalized_plan(
            pipeline,
            execution_plan,
            ['add_one.transform'],
            inputs_to_marshal={'add_one.transform': {'num': temp_path}},
            outputs_to_marshal={
                'add_one.transform': [{'output': 'result', 'path': write_path}]
            },
            execution_metadata=ExecutionMetadata(),
        )

        assert deserialize_from_file(int_type.serialization_strategy, write_path) == 6

    assert len(results) == 2

    thunk_step_result = results[0]

    assert thunk_step_result.kind == StepKind.VALUE_THUNK

    transform_step_result = results[1]
    assert transform_step_result.kind == StepKind.TRANSFORM
    assert transform_step_result.success
    assert transform_step_result.success_data.output_name == 'result'
    assert transform_step_result.success_data.value == 6
Example #9
def test_basic_int_and_string_json_multiple_materialization():

    pipeline = multiple_output_pipeline()

    with get_temp_file_names(4) as file_tuple:
        # False positive for unbalanced tuple unpacking
        # pylint: disable=E0632
        filename_one, filename_two, filename_three, filename_four = file_tuple
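        # Fan out: each of the two outputs is materialized to two separate paths.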
        result = execute_pipeline(
            pipeline,
            {
                'solids': {
                    'return_one_and_foo': {
                        'outputs': [
                            {'string': {'json': {'path': filename_one}}},
                            {'string': {'json': {'path': filename_two}}},
                            {'number': {'json': {'path': filename_three}}},
                            {'number': {'json': {'path': filename_four}}},
                        ]
                    }
                }
            },
        )

        assert result.success

        with open(filename_one, 'r') as ff:
            value = json.loads(ff.read())
            assert value == {'value': 'foo'}

        with open(filename_two, 'r') as ff:
            value = json.loads(ff.read())
            assert value == {'value': 'foo'}

        with open(filename_three, 'r') as ff:
            value = json.loads(ff.read())
            assert value == {'value': 1}

        with open(filename_four, 'r') as ff:
            value = json.loads(ff.read())
            assert value == {'value': 1}
Example #11
def test_basic_pipeline_external_plan_execution():
    pipeline = define_inty_pipeline()

    with get_temp_file_names(2) as temp_files:
        temp_path, write_path = temp_files  # pylint: disable=W0632

        int_type = resolve_to_runtime_type(Int)

        serialize_to_file(int_type.serialization_strategy, 5, temp_path)

        # Run only the 'add_one.transform' step, unmarshalling its input from
        # temp_path and marshalling its output to write_path.
        step_events = execute_marshalling(
            pipeline,
            ['add_one.transform'],
            inputs_to_marshal={'add_one.transform': {'num': temp_path}},
            outputs_to_marshal={
                'add_one.transform': [MarshalledOutput('result', write_path)]
            },
        )

        assert deserialize_from_file(int_type.serialization_strategy, write_path) == 6

    assert len(step_events) == 2

    thunk_step_output_event = step_events[0]

    assert thunk_step_output_event.kind == StepKind.UNMARSHAL_INPUT

    transform_step_output_event = step_events[1]
    assert transform_step_output_event.kind == StepKind.TRANSFORM
    assert transform_step_output_event.is_successful_output
    assert transform_step_output_event.success_data.output_name == 'result'
    assert transform_step_output_event.success_data.value == 6
Example #12
@contextmanager
def get_temp_file_locations(num):
    # Like get_temp_file_names, but unlink the files immediately so callers
    # receive fresh path names rather than existing empty files.
    with get_temp_file_names(num) as paths:
        for path in paths:
            os.unlink(path)

        yield paths
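
Every example on this page unpacks the tuple of paths yielded by get_temp_file_names. For reference, here is a minimal sketch of what such a context manager can look like, assuming mkstemp-based allocation; this is an illustrative reconstruction, not necessarily dagster's exact implementation:

import os
import tempfile
from contextlib import contextmanager


@contextmanager
def get_temp_file_names(number):
    # tempfile.mkstemp creates the file and returns (fd, path); close each fd
    # and hand the caller a tuple of paths, deleting whatever exists on exit.
    temp_file_names = []
    for _ in range(number):
        fd, path = tempfile.mkstemp()
        os.close(fd)
        temp_file_names.append(path)

    try:
        yield tuple(temp_file_names)
    finally:
        for path in temp_file_names:
            if os.path.exists(path):
                os.unlink(path)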
Example #13
def test_pandas_output_intermediate_csv_files():

    with get_temp_file_names(2) as temp_tuple:
        sum_file, mult_file = temp_tuple  # pylint: disable=E0632
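        # First run the full pipeline, persisting intermediate tables to CSV; then
        # re-execute a sub-pipeline that reloads those CSVs via injected load solids.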

        write_sum_table = dagster_pd.to_csv_solid('write_sum_table')
        write_mult_table = dagster_pd.to_csv_solid('write_mult_table')

        pipeline = create_diamond_pipeline(
            extra_solids=[write_sum_table, write_mult_table],
            extra_dependencies={
                write_sum_table.name: {
                    'df': DependencyDefinition('sum_table'),
                },
                write_mult_table.name: {
                    'df': DependencyDefinition('mult_table'),
                }
            })

        environment = get_num_csv_environment({
            'load_csv': config.Solid({'path': script_relative_path('num.csv')}),
            write_sum_table.name: config.Solid({'path': sum_file}),
            write_mult_table.name: config.Solid({'path': mult_file}),
        })

        subgraph_one_result = execute_pipeline(pipeline, environment=environment)

        assert len(subgraph_one_result.result_list) == 5

        expected_sum = {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
        }

        assert pd.read_csv(sum_file).to_dict('list') == expected_sum
        sum_table_result = subgraph_one_result.result_for_solid('sum_table')
        assert sum_table_result.transformed_value().to_dict('list') == expected_sum

        expected_mult = {
            'num1': [1, 3],
            'num2': [2, 4],
            'mult': [2, 12],
        }
        assert pd.read_csv(mult_file).to_dict('list') == expected_mult
        mult_table_result = subgraph_one_result.result_for_solid('mult_table')
        assert mult_table_result.transformed_value().to_dict('list') == expected_mult

        injected_solids = {
            'sum_mult_table': {
                'sum_table': dagster_pd.load_csv_solid('load_sum_table'),
                'mult_table': dagster_pd.load_csv_solid('load_mult_table'),
            }
        }

        pipeline_result = execute_pipeline(
            PipelineDefinition.create_sub_pipeline(
                pipeline,
                ['sum_mult_table'],
                ['sum_mult_table'],
                injected_solids,
            ),
            environment=config.Environment(solids={
                'load_sum_table': config.Solid({'path': sum_file}),
                'load_mult_table': config.Solid({'path': mult_file}),
            }),
        )

        assert pipeline_result.success

        subgraph_two_result_list = pipeline_result.result_list

        assert len(subgraph_two_result_list) == 3
        output_df = pipeline_result.result_for_solid('sum_mult_table').transformed_value()
        assert output_df.to_dict('list') == {
            'num1': [1, 3],
            'num2': [2, 4],
            'sum': [3, 7],
            'mult': [2, 12],
            'sum_mult': [6, 84],
        }