Exemplo n.º 1
0
def test_basic_pipeline_with_pandas_dataframe_dagster_type():
    """Execute a one-solid pipeline whose output uses a custom pandas dataframe type.

    The type is built with an ``event_metadata_fn``; the test checks that every
    STEP_OUTPUT event carries exactly one type-check metadata entry and that it
    is labelled ``max_pid``.
    """

    def compute_event_metadata(dataframe):
        # Single text entry holding the largest value of the 'pid' column.
        return [
            EventMetadataEntry.text(str(max(dataframe['pid'])), 'max_pid', 'maximum pid'),
        ]

    BasicDF = create_dagster_pandas_dataframe_type(
        name='BasicDF',
        columns=[
            PandasColumn.integer_column('pid', non_nullable=True),
            PandasColumn.string_column('names'),
        ],
        event_metadata_fn=compute_event_metadata,
    )

    @solid(output_defs=[OutputDefinition(name='basic_dataframe', dagster_type=BasicDF)])
    def create_dataframe(_):
        yield Output(
            DataFrame({'pid': [1, 2, 3], 'names': ['foo', 'bar', 'baz']}),
            output_name='basic_dataframe',
        )

    @pipeline
    def basic_pipeline():
        return create_dataframe()

    result = execute_pipeline(basic_pipeline)
    assert result.success

    step_output_events = [
        event for event in result.event_list if event.event_type_value == 'STEP_OUTPUT'
    ]
    # Without this guard the per-event assertions below would be skipped
    # entirely (and the test would pass) if no STEP_OUTPUT event were emitted.
    assert step_output_events

    for event in step_output_events:
        metadata_entries = event.event_specific_data.type_check_data.metadata_entries
        assert len(metadata_entries) == 1
        assert any(entry.label == 'max_pid' for entry in metadata_entries)
Exemplo n.º 2
0
def test_basic_pipeline_with_pandas_dataframe_dagster_type():
    """Execute a one-op graph whose output uses a custom pandas dataframe type.

    The type is built with an ``event_metadata_fn``; the test checks that every
    STEP_OUTPUT event carries exactly one type-check metadata entry and that it
    is labelled ``max_pid``.
    """

    def compute_event_metadata(dataframe):
        # Single entry holding the largest value of the "pid" column.
        return {"max_pid": str(max(dataframe["pid"]))}

    BasicDF = create_dagster_pandas_dataframe_type(
        name="BasicDF",
        columns=[
            PandasColumn.integer_column("pid", non_nullable=True),
            PandasColumn.string_column("names"),
        ],
        event_metadata_fn=compute_event_metadata,
    )

    @op(out={"basic_dataframe": Out(dagster_type=BasicDF)})
    def create_dataframe(_):
        yield Output(
            DataFrame({"pid": [1, 2, 3], "names": ["foo", "bar", "baz"]}),
            output_name="basic_dataframe",
        )

    @graph
    def basic_graph():
        return create_dataframe()

    result = basic_graph.execute_in_process()
    assert result.success

    step_output_events = [
        event for event in result.all_node_events if event.event_type_value == "STEP_OUTPUT"
    ]
    # Without this guard the per-event assertions below would be skipped
    # entirely (and the test would pass) if no STEP_OUTPUT event were emitted.
    assert step_output_events

    for event in step_output_events:
        metadata_entries = event.event_specific_data.type_check_data.metadata_entries
        assert len(metadata_entries) == 1
        assert any(entry.label == "max_pid" for entry in metadata_entries)
Exemplo n.º 3
0
def test_shape_validation_throw_error():
    """Validating a one-row frame against RowCountConstraint(2) must raise.

    The expected exception type is ConstraintViolationException.
    """
    single_row_frame = DataFrame({"foo": [2], "bar": ["hello"]})
    column_specs = [
        PandasColumn.integer_column("foo", min_value=0),
        PandasColumn.string_column("bar"),
    ]
    shape_rules = [RowCountConstraint(2)]

    with pytest.raises(ConstraintViolationException):
        validate_constraints(
            single_row_frame,
            pandas_columns=column_specs,
            dataframe_constraints=shape_rules,
        )
Exemplo n.º 4
0
def test_shape_validation_ok():
    """validate_constraints returns None for a one-row frame and RowCountConstraint(1)."""
    single_row_frame = DataFrame({'foo': [2], 'bar': ['hello']})
    column_specs = [
        PandasColumn.integer_column('foo', min_value=0),
        PandasColumn.string_column('bar'),
    ]
    shape_rules = [RowCountConstraint(1)]

    outcome = validate_constraints(
        single_row_frame,
        pandas_columns=column_specs,
        dataframe_constraints=shape_rules,
    )

    assert outcome is None
Exemplo n.º 5
0
def test_create_dagster_pandas_dataframe_type_with_null_event_metadata_fn():
    """A dataframe type created with ``event_metadata_fn=None`` still type-checks.

    Asserts the factory returns a DagsterType and that a matching one-row
    frame passes ``check_dagster_type``.
    """
    column_specs = [
        PandasColumn.integer_column('pid', non_nullable=True),
        PandasColumn.string_column('names'),
    ]
    BasicDF = create_dagster_pandas_dataframe_type(
        name='BasicDF', columns=column_specs, event_metadata_fn=None
    )
    assert isinstance(BasicDF, DagsterType)

    sample_frame = DataFrame({'pid': [1], 'names': ['foo']})
    outcome = check_dagster_type(BasicDF, sample_frame)
    assert outcome.success
Exemplo n.º 6
0
def test_shape_validation_ok():
    """validate_constraints returns None for a one-row frame and RowCountConstraint(1)."""
    frame = DataFrame({"foo": [2], "bar": ["hello"]})
    columns = [
        PandasColumn.integer_column("foo", min_value=0),
        PandasColumn.string_column("bar"),
    ]
    constraints = [RowCountConstraint(1)]

    validation_result = validate_constraints(
        frame, pandas_columns=columns, dataframe_constraints=constraints
    )

    assert validation_result is None
Exemplo n.º 7
0
def test_create_dagster_pandas_dataframe_type_with_null_event_metadata_fn():
    """A dataframe type created with ``event_metadata_fn=None`` still type-checks.

    Asserts the factory returns a DagsterType and that a matching one-row
    frame passes ``check_dagster_type``.
    """
    columns = [
        PandasColumn.integer_column("pid", non_nullable=True),
        PandasColumn.string_column("names"),
    ]
    BasicDF = create_dagster_pandas_dataframe_type(
        name="BasicDF", columns=columns, event_metadata_fn=None
    )
    assert isinstance(BasicDF, DagsterType)

    frame = DataFrame({"pid": [1], "names": ["foo"]})
    type_check = check_dagster_type(BasicDF, frame)
    assert type_check.success
Exemplo n.º 8
0
def test_shape_validation_throw_error():
    """Validating a one-row frame against RowCountConstraint(2) must raise.

    The expected exception type is ConstraintViolationException.
    """
    frame = DataFrame({'foo': [2], 'bar': ['hello']})
    columns = [
        PandasColumn.integer_column('foo', min_value=0),
        PandasColumn.string_column('bar'),
    ]
    constraints = [RowCountConstraint(2)]

    with pytest.raises(ConstraintViolationException):
        validate_constraints(frame, pandas_columns=columns, dataframe_constraints=constraints)
Exemplo n.º 9
0
def test_basic_pipeline_with_pandas_dataframe_dagster_type():
    """Execute a one-solid pipeline whose output uses a custom pandas dataframe type.

    The type is built with an ``event_metadata_fn``; the test checks that every
    STEP_OUTPUT event carries exactly one type-check metadata entry and that it
    is labelled ``max_pid``.
    """

    def compute_event_metadata(dataframe):
        # Single text entry holding the largest value of the "pid" column.
        return [
            EventMetadataEntry.text(str(max(dataframe["pid"])), "max_pid",
                                    "maximum pid"),
        ]

    BasicDF = create_dagster_pandas_dataframe_type(
        name="BasicDF",
        columns=[
            PandasColumn.integer_column("pid", non_nullable=True),
            PandasColumn.string_column("names"),
        ],
        event_metadata_fn=compute_event_metadata,
    )

    @solid(output_defs=[
        OutputDefinition(name="basic_dataframe", dagster_type=BasicDF)
    ])
    def create_dataframe(_):
        yield Output(
            DataFrame({
                "pid": [1, 2, 3],
                "names": ["foo", "bar", "baz"]
            }),
            output_name="basic_dataframe",
        )

    @pipeline
    def basic_pipeline():
        return create_dataframe()

    result = execute_pipeline(basic_pipeline)
    assert result.success

    step_output_events = [
        event for event in result.event_list
        if event.event_type_value == "STEP_OUTPUT"
    ]
    # Without this guard the per-event assertions below would be skipped
    # entirely (and the test would pass) if no STEP_OUTPUT event were emitted.
    assert step_output_events

    for event in step_output_events:
        metadata_entries = (
            event.event_specific_data.type_check_data.metadata_entries)
        assert len(metadata_entries) == 1
        assert any(entry.label == "max_pid" for entry in metadata_entries)
Exemplo n.º 10
0
        EventMetadataEntry.text(str(dataframe.columns), "columns",
                                "Keys of columns seen in the dataframe"),
    ]


# Column declarations shared by the trip dataframe types below: an integer
# bike_id (min_value=0), two datetime columns bounded below by 2017-01-01,
# and a string interval_date column.
TripDataFrameSchema = [
    PandasColumn.integer_column("bike_id", min_value=0),
    PandasColumn.datetime_column(
        "start_time", min_datetime=Timestamp(year=2017, month=1, day=1)
    ),
    PandasColumn.datetime_column(
        "end_time", min_datetime=Timestamp(year=2017, month=1, day=1)
    ),
    PandasColumn.string_column("interval_date"),
]

# Raw variant: one bare PandasColumn per schema column name, leaving out
# "interval_date". No event_metadata_fn is supplied for this type.
RawTripDataFrame = create_dagster_pandas_dataframe_type(
    name="RawTripDataFrame",
    columns=[
        PandasColumn(schema_column.name)
        for schema_column in TripDataFrameSchema
        if schema_column.name != "interval_date"
    ],
)

# Full variant: the complete schema, with metadata computed by
# compute_trip_dataframe_event_metadata.
TripDataFrame = create_dagster_pandas_dataframe_type(
    name="TripDataFrame",
    columns=TripDataFrameSchema,
    event_metadata_fn=compute_trip_dataframe_event_metadata,
)
Exemplo n.º 11
0
        EventMetadataEntry.text(str(dataframe.columns), 'columns',
                                'Keys of columns seen in the dataframe'),
    ]


# Column declarations shared by the trip dataframe types below: an integer
# bike_id (min_value=0), two datetime columns bounded below by 2018-01-01,
# and a string interval_date column.
TripDataFrameSchema = [
    PandasColumn.integer_column('bike_id', min_value=0),
    PandasColumn.datetime_column(
        'start_time', min_datetime=Timestamp(year=2018, month=1, day=1)
    ),
    PandasColumn.datetime_column(
        'end_time', min_datetime=Timestamp(year=2018, month=1, day=1)
    ),
    PandasColumn.string_column('interval_date'),
]

# Raw variant: one bare PandasColumn per schema column name, leaving out
# 'interval_date'. No event_metadata_fn is supplied for this type.
RawTripDataFrame = create_dagster_pandas_dataframe_type(
    name='RawTripDataFrame',
    columns=[
        PandasColumn(schema_column.name)
        for schema_column in TripDataFrameSchema
        if schema_column.name != 'interval_date'
    ],
)

# Full variant: the complete schema, with metadata computed by
# compute_trip_dataframe_event_metadata.
TripDataFrame = create_dagster_pandas_dataframe_type(
    name='TripDataFrame',
    columns=TripDataFrameSchema,
    event_metadata_fn=compute_trip_dataframe_event_metadata,
)