示例#1
0
def test_bad_dataframe_type_returns_bad_stuff():
    """Type checks must reject malformed ``event_metadata_fn`` return values.

    Two malformed shapes are exercised: a bare string, and a list whose only
    item is a bare string. In both cases ``check_dagster_type`` is expected
    to raise ``DagsterInvariantViolationError``.
    """
    # Build each type outside the ``pytest.raises`` block so that only the
    # type check itself can satisfy the expected-exception assertion.
    BadDFBadSummaryStats = create_dagster_pandas_dataframe_type(
        "BadDF", event_metadata_fn=lambda _: "ksjdkfsd")
    with pytest.raises(DagsterInvariantViolationError):
        check_dagster_type(BadDFBadSummaryStats, DataFrame({"num": [1]}))

    BadDFBadSummaryStatsListItem = create_dagster_pandas_dataframe_type(
        "BadDF", event_metadata_fn=lambda _: ["ksjdkfsd"])
    with pytest.raises(DagsterInvariantViolationError):
        check_dagster_type(BadDFBadSummaryStatsListItem,
                           DataFrame({"num": [1]}))
def test_bad_dataframe_type_returns_bad_stuff():
    """Type checks must reject malformed ``summary_statistics`` return values.

    Two malformed shapes are exercised: a bare string, and a list whose only
    item is a bare string. In both cases ``check_dagster_type`` is expected
    to raise ``DagsterInvariantViolationError``.
    """
    # Build each type outside the ``pytest.raises`` block so that only the
    # type check itself can satisfy the expected-exception assertion.
    BadDFBadSummaryStats = create_dagster_pandas_dataframe_type(
        'BadDF', summary_statistics=lambda _: 'ksjdkfsd')
    with pytest.raises(DagsterInvariantViolationError):
        check_dagster_type(BadDFBadSummaryStats, DataFrame({'num': [1]}))

    BadDFBadSummaryStatsListItem = create_dagster_pandas_dataframe_type(
        'BadDF', summary_statistics=lambda _: ['ksjdkfsd'])
    with pytest.raises(DagsterInvariantViolationError):
        check_dagster_type(BadDFBadSummaryStatsListItem,
                           DataFrame({'num': [1]}))
示例#3
0
def test_basic_pipeline_with_pandas_dataframe_dagster_type():
    """Run a one-solid pipeline and check the type's event metadata is emitted.

    The solid yields a dataframe through an output typed as ``BasicDF``; every
    ``STEP_OUTPUT`` event must carry exactly one type-check metadata entry,
    labelled ``max_pid``.
    """
    def compute_event_metadata(dataframe):
        return [
            EventMetadataEntry.text(str(max(dataframe['pid'])), 'max_pid', 'maximum pid'),
        ]

    BasicDF = create_dagster_pandas_dataframe_type(
        name='BasicDF',
        columns=[
            PandasColumn.integer_column('pid', non_nullable=True),
            PandasColumn.string_column('names'),
        ],
        event_metadata_fn=compute_event_metadata,
    )

    @solid(output_defs=[OutputDefinition(name='basic_dataframe', dagster_type=BasicDF)])
    def create_dataframe(_):
        yield Output(
            DataFrame({'pid': [1, 2, 3], 'names': ['foo', 'bar', 'baz']}),
            output_name='basic_dataframe',
        )

    @pipeline
    def basic_pipeline():
        return create_dataframe()

    result = execute_pipeline(basic_pipeline)
    assert result.success

    step_output_events = [
        event for event in result.event_list if event.event_type_value == 'STEP_OUTPUT'
    ]
    # Without this guard the metadata assertions below would be skipped
    # entirely (and the test would pass) if no STEP_OUTPUT event was emitted.
    assert step_output_events
    for event in step_output_events:
        mock_df_output_event_metadata = (
            event.event_specific_data.type_check_data.metadata_entries
        )
        assert len(mock_df_output_event_metadata) == 1
        assert any(entry.label == 'max_pid' for entry in mock_df_output_event_metadata)
示例#4
0
def test_custom_dagster_dataframe_hydration_ok():
    """Load a custom dataframe type from CSV config and write it back out.

    The input dataframe is written to a temp CSV, fed to the solid through the
    ``csv`` input config, extended with a ``bar`` column, and written to a
    second temp CSV through the ``csv`` output config, which is then re-read.
    """
    source_df = DataFrame({'foo': [1, 2, 3]})
    with safe_tempfile_path() as input_csv_fp:
        with safe_tempfile_path() as output_csv_fp:
            source_df.to_csv(input_csv_fp)
            frame_type = create_dagster_pandas_dataframe_type(
                name='TestDataFrame',
                columns=[PandasColumn.exists('foo')],
            )

            @solid(
                input_defs=[InputDefinition('test_df', frame_type)],
                output_defs=[OutputDefinition(frame_type)],
            )
            def use_test_dataframe(_, test_df):
                test_df['bar'] = [2, 4, 6]
                return test_df

            # Config for the single solid: CSV in, CSV out.
            solid_config = {
                'inputs': {'test_df': {'csv': {'path': input_csv_fp}}},
                'outputs': [{'result': {'csv': {'path': output_csv_fp}}}],
            }
            solid_result = execute_solid(
                use_test_dataframe,
                run_config={'solids': {'use_test_dataframe': solid_config}},
            )

            assert solid_result.success
            written_df = read_csv(output_csv_fp)
            assert all(written_df['bar'] == [2, 4, 6])
示例#5
0
def test_basic_pipeline_with_pandas_dataframe_dagster_type():
    """Run a one-op graph and check the type's event metadata is emitted.

    The op yields a dataframe through an output typed as ``BasicDF``; every
    ``STEP_OUTPUT`` event must carry exactly one type-check metadata entry,
    labelled ``max_pid``.
    """
    def compute_event_metadata(dataframe):
        return {"max_pid": str(max(dataframe["pid"]))}

    BasicDF = create_dagster_pandas_dataframe_type(
        name="BasicDF",
        columns=[
            PandasColumn.integer_column("pid", non_nullable=True),
            PandasColumn.string_column("names"),
        ],
        event_metadata_fn=compute_event_metadata,
    )

    @op(out={"basic_dataframe": Out(dagster_type=BasicDF)})
    def create_dataframe(_):
        yield Output(
            DataFrame({"pid": [1, 2, 3], "names": ["foo", "bar", "baz"]}),
            output_name="basic_dataframe",
        )

    @graph
    def basic_graph():
        return create_dataframe()

    result = basic_graph.execute_in_process()
    assert result.success

    step_output_events = [
        event for event in result.all_node_events if event.event_type_value == "STEP_OUTPUT"
    ]
    # Without this guard the metadata assertions below would be skipped
    # entirely (and the test would pass) if no STEP_OUTPUT event was emitted.
    assert step_output_events
    for event in step_output_events:
        mock_df_output_event_metadata = (
            event.event_specific_data.type_check_data.metadata_entries
        )
        assert len(mock_df_output_event_metadata) == 1
        assert any(entry.label == "max_pid" for entry in mock_df_output_event_metadata)
def test_create_pandas_dataframe_dagster_type():
    """The factory's result for a single int64-typed column is a ``RuntimeType``."""
    foo_column = PandasColumn(name='foo', constraints=[ColumnTypeConstraint('int64')])
    created_type = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[foo_column],
    )
    assert isinstance(created_type, RuntimeType)
示例#7
0
def test_create_pandas_dataframe_dagster_type():
    """The factory's result for a single int64-typed column is a ``DagsterType``."""
    foo_column = PandasColumn(name="foo", constraints=[ColumnDTypeInSetConstraint({"int64"})])
    created_type = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[foo_column],
    )
    assert isinstance(created_type, DagsterType)
示例#8
0
def test_dataframe_description_generation_just_type_constraint():
    """A column with only a dtype constraint is described as ``**name**: `dtype```."""
    foo_column = PandasColumn(name="foo", constraints=[ColumnDTypeInSetConstraint({"int64"})])
    created_type = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[foo_column],
    )
    expected_description = "\n### Columns\n**foo**: `int64`\n\n"
    assert created_type.description == expected_description
def test_dataframe_description_generation_just_type_constraint():
    """A column with only a type constraint is described as ``**name**: `dtype```."""
    foo_column = PandasColumn(name='foo', constraints=[ColumnTypeConstraint('int64')])
    created_type = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[foo_column],
    )
    expected_description = "\n### Columns\n**foo**: `int64`\n\n"
    assert created_type.description == expected_description
示例#10
0
def test_create_dagster_pandas_dataframe_type_with_null_event_metadata_fn():
    """Passing ``event_metadata_fn=None`` still yields a type that checks data."""
    schema = [
        PandasColumn.integer_column("pid", non_nullable=True),
        PandasColumn.string_column("names"),
    ]
    frame_type = create_dagster_pandas_dataframe_type(
        name="BasicDF", columns=schema, event_metadata_fn=None
    )
    assert isinstance(frame_type, DagsterType)

    # A single conforming row should pass the type check.
    sample = DataFrame({"pid": [1], "names": ["foo"]})
    outcome = check_dagster_type(frame_type, sample)
    assert outcome.success
示例#11
0
def test_create_dagster_pandas_dataframe_type_with_null_event_metadata_fn():
    """Passing ``event_metadata_fn=None`` still yields a type that checks data."""
    schema = [
        PandasColumn.integer_column('pid', non_nullable=True),
        PandasColumn.string_column('names'),
    ]
    frame_type = create_dagster_pandas_dataframe_type(
        name='BasicDF', columns=schema, event_metadata_fn=None
    )
    assert isinstance(frame_type, DagsterType)

    # A single conforming row should pass the type check.
    sample = DataFrame({'pid': [1], 'names': ['foo']})
    outcome = check_dagster_type(frame_type, sample)
    assert outcome.success
示例#12
0
def test_custom_dagster_dataframe_parametrizable_input():
    """Exercise a dataframe type with a custom hydrator and materializer.

    The input is selected via config (``door_a``), the solid passes it through
    unchanged, and the output config (``devnull``) triggers the custom
    materializer, which should emit exactly one materialization event.
    """
    door_selector = Selector({'door_a': Field(str), 'door_b': Field(str), 'door_c': Field(str),})

    @input_selector_schema(door_selector)
    def silly_hydrator(_, which_door, _field):
        # Map each selectable door to the single value stored under 'foo'.
        prizes = {'door_a': 'goat', 'door_b': 'car', 'door_c': 'goat'}
        if which_door not in prizes:
            raise DagsterInvariantViolationError(
                'You did not pick a door. You chose: {which_door}'.format(which_door=which_door)
            )
        return DataFrame({'foo': [prizes[which_door]]})

    sink_selector = Selector({'devnull': Field(str), 'nothing': Field(str)})

    @output_selector_schema(sink_selector)
    def silly_materializer(_, _location, _field, _value):
        return Materialization(label='did nothing', description='just one of those days')

    frame_type = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[PandasColumn.exists('foo')],
        input_hydration_config=silly_hydrator,
        output_materialization_config=silly_materializer,
    )

    @solid(
        input_defs=[InputDefinition('df', frame_type)],
        output_defs=[OutputDefinition(frame_type)],
    )
    def did_i_win(_, df):
        return df

    solid_config = {
        'inputs': {'df': {'door_a': 'bar'}},
        'outputs': [{'result': {'devnull': 'baz'}}],
    }
    solid_result = execute_solid(
        did_i_win,
        run_config={'solids': {'did_i_win': solid_config}},
    )
    assert solid_result.success

    produced_df = solid_result.output_value()
    assert isinstance(produced_df, DataFrame)
    assert produced_df['foo'].tolist() == ['goat']

    emitted = solid_result.materialization_events_during_compute
    assert len(emitted) == 1
    first_materialization = emitted[0].event_specific_data.materialization
    assert first_materialization.label == 'did nothing'
示例#13
0
def test_custom_dagster_dataframe_loading_ok():
    """Load a custom dataframe type from CSV config and write it back out.

    The input dataframe is written to a temp CSV, fed to the op through the
    ``csv`` input config, extended with a ``bar`` column, and written to a
    second temp CSV through the ``csv`` output config, which is then re-read.
    """
    source_df = DataFrame({"foo": [1, 2, 3]})
    with safe_tempfile_path() as input_csv_fp:
        with safe_tempfile_path() as output_csv_fp:
            source_df.to_csv(input_csv_fp)
            frame_type = create_dagster_pandas_dataframe_type(
                name="TestDataFrame",
                columns=[PandasColumn.exists("foo")],
            )

            @op(ins={"test_df": In(frame_type)}, out=Out(frame_type))
            def use_test_dataframe(_, test_df):
                test_df["bar"] = [2, 4, 6]
                return test_df

            @graph
            def basic_graph():
                use_test_dataframe()

            # Config for the single op: CSV in, CSV out.
            op_config = {
                "inputs": {"test_df": {"csv": {"path": input_csv_fp}}},
                "outputs": [{"result": {"csv": {"path": output_csv_fp}}}],
            }
            result = basic_graph.execute_in_process(
                run_config={"ops": {"use_test_dataframe": op_config}}
            )
            assert result.success

            written_df = read_csv(output_csv_fp)
            assert all(written_df["bar"] == [2, 4, 6])
示例#14
0
def test_custom_dagster_dataframe_loading_ok():
    """Load a custom dataframe type from CSV config and write it back out.

    The input dataframe is written to a temp CSV, fed to the solid through the
    ``csv`` input config, extended with a ``bar`` column, and written to a
    second temp CSV through the ``csv`` output config, which is then re-read.
    """
    source_df = DataFrame({"foo": [1, 2, 3]})
    with safe_tempfile_path() as input_csv_fp:
        with safe_tempfile_path() as output_csv_fp:
            source_df.to_csv(input_csv_fp)
            frame_type = create_dagster_pandas_dataframe_type(
                name="TestDataFrame",
                columns=[PandasColumn.exists("foo")],
            )

            @solid(
                input_defs=[InputDefinition("test_df", frame_type)],
                output_defs=[OutputDefinition(frame_type)],
            )
            def use_test_dataframe(_, test_df):
                test_df["bar"] = [2, 4, 6]
                return test_df

            # Config for the single solid: CSV in, CSV out.
            solid_config = {
                "inputs": {"test_df": {"csv": {"path": input_csv_fp}}},
                "outputs": [{"result": {"csv": {"path": output_csv_fp}}}],
            }
            solid_result = execute_solid(
                use_test_dataframe,
                run_config={"solids": {"use_test_dataframe": solid_config}},
            )

            assert solid_result.success
            written_df = read_csv(output_csv_fp)
            assert all(written_df["bar"] == [2, 4, 6])
示例#15
0
def test_dataframe_description_generation_multi_constraints():
    """Each constraint on a column contributes to the generated description."""
    foo_constraints = [
        ColumnDTypeInSetConstraint({"int64"}),
        InRangeColumnConstraint(0, 100, ignore_missing_vals=False),
        NonNullableColumnConstraint(),
    ]
    foo_column = PandasColumn(name="foo", constraints=foo_constraints)
    created_type = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[foo_column],
    )
    expected_description = (
        "\n### Columns\n**foo**: `int64`\n+ 0 < values < 100\n+ No Null values allowed.\n\n"
    )
    assert created_type.description == expected_description
def test_dataframe_description_generation_multi_constraints():
    """Each constraint on a column contributes to the generated description."""
    foo_constraints = [
        ColumnTypeConstraint('int64'),
        InRangeColumnConstraint(0, 100),
        NonNullableColumnConstraint(),
    ]
    foo_column = PandasColumn(name='foo', constraints=foo_constraints)
    created_type = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[foo_column],
    )
    expected_description = (
        "\n### Columns\n**foo**: `int64`\n+ 0 < values < 100\n+ No Null values allowed.\n\n"
    )
    assert created_type.description == expected_description
示例#17
0
def test_basic_pipeline_with_pandas_dataframe_dagster_type():
    """Run a one-solid pipeline and check the type's event metadata is emitted.

    The solid yields a dataframe through an output typed as ``BasicDF``; every
    ``STEP_OUTPUT`` event must carry exactly one type-check metadata entry,
    labelled ``max_pid``.
    """
    def compute_event_metadata(dataframe):
        return [
            EventMetadataEntry.text(str(max(dataframe["pid"])), "max_pid",
                                    "maximum pid"),
        ]

    BasicDF = create_dagster_pandas_dataframe_type(
        name="BasicDF",
        columns=[
            PandasColumn.integer_column("pid", non_nullable=True),
            PandasColumn.string_column("names"),
        ],
        event_metadata_fn=compute_event_metadata,
    )

    @solid(output_defs=[
        OutputDefinition(name="basic_dataframe", dagster_type=BasicDF)
    ])
    def create_dataframe(_):
        yield Output(
            DataFrame({
                "pid": [1, 2, 3],
                "names": ["foo", "bar", "baz"]
            }),
            output_name="basic_dataframe",
        )

    @pipeline
    def basic_pipeline():
        return create_dataframe()

    result = execute_pipeline(basic_pipeline)
    assert result.success

    step_output_events = [
        event for event in result.event_list
        if event.event_type_value == "STEP_OUTPUT"
    ]
    # Without this guard the metadata assertions below would be skipped
    # entirely (and the test would pass) if no STEP_OUTPUT event was emitted.
    assert step_output_events
    for event in step_output_events:
        mock_df_output_event_metadata = (
            event.event_specific_data.type_check_data.metadata_entries)
        assert len(mock_df_output_event_metadata) == 1
        assert any(entry.label == "max_pid"
                   for entry in mock_df_output_event_metadata)
示例#18
0
def test_custom_dagster_dataframe_parametrizable_input():
    """Exercise a dataframe type with a custom loader and materializer.

    The input is selected via config (``door_a``), the solid passes it through
    unchanged, and the output config (``devnull``) triggers the custom
    materializer, which should emit exactly one materialization event.
    """
    door_selector = Selector({
        "door_a": Field(str),
        "door_b": Field(str),
        "door_c": Field(str),
    })

    @dagster_type_loader(door_selector)
    def silly_loader(_, config):
        # The first key of the config is taken as the chosen door.
        which_door = list(config)[0]
        # Map each selectable door to the single value stored under "foo".
        prizes = {"door_a": "goat", "door_b": "car", "door_c": "goat"}
        if which_door not in prizes:
            raise DagsterInvariantViolationError(
                "You did not pick a door. You chose: {which_door}".format(
                    which_door=which_door))
        return DataFrame({"foo": [prizes[which_door]]})

    sink_selector = Selector({"devnull": Field(str), "nothing": Field(str)})

    @dagster_type_materializer(sink_selector)
    def silly_materializer(_, _config, _value):
        return AssetMaterialization(asset_key="nothing",
                                    description="just one of those days")

    frame_type = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[PandasColumn.exists("foo")],
        loader=silly_loader,
        materializer=silly_materializer,
    )

    @solid(
        input_defs=[InputDefinition("df", frame_type)],
        output_defs=[OutputDefinition(frame_type)],
    )
    def did_i_win(_, df):
        return df

    solid_config = {
        "inputs": {"df": {"door_a": "bar"}},
        "outputs": [{"result": {"devnull": "baz"}}],
    }
    solid_result = execute_solid(
        did_i_win,
        run_config={"solids": {"did_i_win": solid_config}},
    )
    assert solid_result.success

    produced_df = solid_result.output_value()
    assert isinstance(produced_df, DataFrame)
    assert produced_df["foo"].tolist() == ["goat"]

    emitted = solid_result.materialization_events_during_compute
    assert len(emitted) == 1
    first_materialization = emitted[0].event_specific_data.materialization
    assert first_materialization.label == "nothing"
示例#19
0
def test_dataframe_description_generation_no_type_constraint():
    """A column without constraints is described by its bolded name only."""
    unconstrained_column = PandasColumn(name="foo")
    created_type = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[unconstrained_column],
    )
    expected_description = "\n### Columns\n**foo**\n\n"
    assert created_type.description == expected_description
示例#20
0
def test_custom_dagster_dataframe_parametrizable_input():
    """Exercise a dataframe type with a custom loader and materializer.

    The input is selected via config (``door_a``), the op passes it through
    unchanged, and the output config (``devnull``) triggers the custom
    materializer, which should emit exactly one materialization event.
    """
    door_selector = Selector({
        "door_a": Field(str),
        "door_b": Field(str),
        "door_c": Field(str),
    })

    @dagster_type_loader(door_selector)
    def silly_loader(_, config):
        # The first key of the config is taken as the chosen door.
        which_door = list(config)[0]
        # Map each selectable door to the single value stored under "foo".
        prizes = {"door_a": "goat", "door_b": "car", "door_c": "goat"}
        if which_door not in prizes:
            raise DagsterInvariantViolationError(
                "You did not pick a door. You chose: {which_door}".format(
                    which_door=which_door))
        return DataFrame({"foo": [prizes[which_door]]})

    sink_selector = Selector({"devnull": Field(str), "nothing": Field(str)})

    @dagster_type_materializer(sink_selector)
    def silly_materializer(_, _config, _value):
        return AssetMaterialization(asset_key="nothing",
                                    description="just one of those days")

    frame_type = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[PandasColumn.exists("foo")],
        loader=silly_loader,
        materializer=silly_materializer,
    )

    @op(ins={"df": In(frame_type)}, out=Out(frame_type))
    def did_i_win(_, df):
        return df

    @graph
    def basic_graph():
        did_i_win()

    op_config = {
        "inputs": {"df": {"door_a": "bar"}},
        "outputs": [{"result": {"devnull": "baz"}}],
    }
    result = basic_graph.execute_in_process(
        run_config={"ops": {"did_i_win": op_config}})
    assert result.success

    produced_df = result.output_for_node("did_i_win")
    assert isinstance(produced_df, DataFrame)
    assert produced_df["foo"].tolist() == ["goat"]

    emitted = [
        node_event for node_event in result.all_node_events
        if node_event.is_step_materialization
    ]
    assert len(emitted) == 1
    first_materialization = emitted[0].event_specific_data.materialization
    assert first_materialization.label == "nothing"
示例#21
0
    PandasColumn.integer_column('bike_id', min_value=0),
    PandasColumn.datetime_column(
        'start_time',
        min_datetime=Timestamp(year=2018, month=1, day=1),
    ),
    PandasColumn.datetime_column(
        'end_time',
        min_datetime=Timestamp(year=2018, month=1, day=1),
    ),
    PandasColumn.string_column('interval_date'),
]

# Dataframe type for raw trip data: one PandasColumn per entry of
# TripDataFrameSchema, rebuilt from the column name alone, with the
# 'interval_date' column left out.
# NOTE(review): presumably a name-only PandasColumn carries none of the
# schema's dtype/range constraints -- confirm against PandasColumn's defaults.
RawTripDataFrame = create_dagster_pandas_dataframe_type(
    name='RawTripDataFrame',
    columns=[
        PandasColumn(column.name) for column in TripDataFrameSchema
        if column.name != 'interval_date'
    ],
)

# Dataframe type that uses the TripDataFrameSchema columns as-is and passes
# compute_trip_dataframe_event_metadata as the event_metadata_fn.
TripDataFrame = create_dagster_pandas_dataframe_type(
    name='TripDataFrame',
    columns=TripDataFrameSchema,
    event_metadata_fn=compute_trip_dataframe_event_metadata,
)


def compute_traffic_dataframe_event_metadata(dataframe):
    return [
        EventMetadataEntry.text(str(min(dataframe['peak_traffic_load'])),
                                'min_traffic_load', 'Best Peak Load'),
示例#22
0
文件: types.py 项目: zuik/dagster
    PandasColumn.integer_column("bike_id", min_value=0),
    PandasColumn.datetime_column(
        "start_time",
        min_datetime=Timestamp(year=2017, month=1, day=1),
    ),
    PandasColumn.datetime_column(
        "end_time",
        min_datetime=Timestamp(year=2017, month=1, day=1),
    ),
    PandasColumn.string_column("interval_date"),
]

# Dataframe type for raw trip data: one PandasColumn per entry of
# TripDataFrameSchema, rebuilt from the column name alone, with the
# "interval_date" column left out.
# NOTE(review): presumably a name-only PandasColumn carries none of the
# schema's dtype/range constraints -- confirm against PandasColumn's defaults.
RawTripDataFrame = create_dagster_pandas_dataframe_type(
    name="RawTripDataFrame",
    columns=[
        PandasColumn(column.name) for column in TripDataFrameSchema
        if column.name != "interval_date"
    ],
)

# Dataframe type that uses the TripDataFrameSchema columns as-is and passes
# compute_trip_dataframe_event_metadata as the event_metadata_fn.
TripDataFrame = create_dagster_pandas_dataframe_type(
    name="TripDataFrame",
    columns=TripDataFrameSchema,
    event_metadata_fn=compute_trip_dataframe_event_metadata,
)


def compute_traffic_dataframe_event_metadata(dataframe):
    return [
        EventMetadataEntry.text(str(min(dataframe["peak_traffic_load"])),
                                "min_traffic_load", "Best Peak Load"),
示例#23
0
    PandasColumn.integer_column('bike_id', min_value=0),
    PandasColumn.datetime_column(
        'start_time',
        min_datetime=Timestamp(year=2018, month=1, day=1),
    ),
    PandasColumn.datetime_column(
        'end_time',
        min_datetime=Timestamp(year=2018, month=1, day=1),
    ),
    PandasColumn.string_column('interval_date'),
]

# Dataframe type for raw trip data: one PandasColumn per entry of
# TripDataFrameSchema, rebuilt from the column name alone, with the
# 'interval_date' column left out.
# NOTE(review): presumably a name-only PandasColumn carries none of the
# schema's dtype/range constraints -- confirm against PandasColumn's defaults.
RawTripDataFrame = create_dagster_pandas_dataframe_type(
    name='RawTripDataFrame',
    columns=[
        PandasColumn(column.name) for column in TripDataFrameSchema
        if column.name != 'interval_date'
    ],
)

# Dataframe type that uses the TripDataFrameSchema columns as-is and passes
# compute_trip_dataframe_summary_statistics as the summary_statistics callable.
TripDataFrame = create_dagster_pandas_dataframe_type(
    name='TripDataFrame',
    columns=TripDataFrameSchema,
    summary_statistics=compute_trip_dataframe_summary_statistics,
)


def compute_traffic_dataframe_summary_statistics(dataframe):
    return [
        EventMetadataEntry.text(str(min(dataframe['peak_traffic_load'])),
                                'min_traffic_load', 'Best Peak Load'),