示例#1
0
def validate_warehouse(
    entity_name, feature_infos, feature_values_filepath, dataset, project=None
):
    """Assert that an entity's warehouse view matches a CSV of expected values.

    The CSV is read with explicit column names (``id``, ``event_timestamp``,
    then one column per entry of ``feature_infos``), so its first row is
    treated as data. The same columns are then selected from
    ``{dataset}.{entity_name}_view``, sorted by ``id`` and ``event_timestamp``,
    and compared with the CSV contents.

    Args:
        entity_name: Entity name; used to build the ``<entity_name>_view``
            view name.
        feature_infos: Re-iterable collection (it is traversed more than once)
            of mappings, each with a ``"name"`` key (column name) and a
            ``"dtype"`` key (dtype handed to ``pd.read_csv``).
        feature_values_filepath: Path of the CSV holding the expected values.
        dataset: Dataset that contains the view.
        project: Forwarded as ``project`` to ``bq_util.query_to_dataframe``.

    Raises:
        AssertionError: If the expected and actual frames differ
            (raised by ``pd.testing.assert_frame_equal``).
    """
    print("\n[Validating warehouse data]")

    feature_names = [info["name"] for info in feature_infos]

    # np.bytes_ is the type that the old np.string_ alias referred to; the
    # alias was removed in NumPy 2.0, so spell the real name to stay portable
    # without changing how pandas parses the column.
    column_dtypes = {"id": np.bytes_}
    column_dtypes.update((info["name"], info["dtype"]) for info in feature_infos)

    expected = pd.read_csv(
        feature_values_filepath,
        names=["id", "event_timestamp"] + feature_names,
        dtype=column_dtypes,
        parse_dates=["event_timestamp"],
    )

    # TODO: Retrieve actual values via Feast Core rather than directly from BigQuery
    #       Need to change Python SDK so can retrieve values via Feast Core while
    #       "ensuring correct value types"
    actual = (
        bq_util.query_to_dataframe(
            f"SELECT {','.join(expected.columns)} FROM `{dataset}.{entity_name}_view`",
            project=project,
        )
        .sort_values(["id", "event_timestamp"])
        .reset_index(drop=True)
        .astype({"event_timestamp": "datetime64[ns]"})
    )[expected.columns]  # re-select so column order matches the expected frame

    pd.testing.assert_frame_equal(expected, actual)

    print("OK")
示例#2
0
def test_query_to_dataframe():
    """Check a live query result against the Avro snapshot kept in the test data.

    The expected frame is built from the records of
    ``austin_bikeshare.bikeshare_stations.avro`` under ``testdata_path``; the
    actual frame comes from ``query_to_dataframe``.
    """
    snapshot_path = os.path.join(
        testdata_path, "austin_bikeshare.bikeshare_stations.avro"
    )
    with open(snapshot_path, "rb") as snapshot_file:
        # Records are consumed while the file is still open.
        records = fastavro.reader(snapshot_file)
        expected = pd.DataFrame.from_records(records)

    actual = query_to_dataframe(
        "SELECT * FROM `bigquery-public-data.austin_bikeshare.bikeshare_stations`"
    )
    assert expected.equals(actual)
示例#3
0
 def validate_warehouse_data(project_id, expected):
     """Assert that ``feast_it.myentity_view`` holds exactly the ``expected`` rows.

     Args:
         project_id: Forwarded as ``project`` to ``bq_util.query_to_dataframe``.
         expected: Frame compared against the query result via ``expected.equals``.

     Raises:
         AssertionError: If the frames are not equal.
     """
     view_rows = bq_util.query_to_dataframe(
         "SELECT * FROM `feast_it.myentity_view`", project=project_id
     )
     # created_timestamp is not relevant for validating correctness of import
     # and retrieval of feature values
     without_created = view_rows.drop(columns=["created_timestamp"])
     ordered = without_created.sort_values(["id", "event_timestamp"])
     actual = ordered.reset_index(drop=True)
     assert expected.equals(actual)
示例#4
0
def test_query_to_dataframe_for_non_existing_dataset():
    """Expect ``NotFound`` when the query targets a dataset that does not exist."""
    missing_dataset_query = (
        "SELECT * FROM "
        "`bigquery-public-data.this_dataset_should_not_exists.bikeshare_stations`"
    )
    with pytest.raises(NotFound):
        query_to_dataframe(missing_dataset_query)