def validate_warehouse(
    entity_name, feature_infos, feature_values_filepath, dataset, project=None
):
    """Validate that feature values loaded into the warehouse match the source CSV.

    Reads the expected feature values from ``feature_values_filepath`` and
    compares them against the rows in the BigQuery view
    ``{dataset}.{entity_name}_view``, raising ``AssertionError`` on mismatch.

    Args:
        entity_name: Name of the Feast entity; used to derive the view name.
        feature_infos: Iterable of dicts with ``name`` and ``dtype`` keys
            describing the expected feature columns.
        feature_values_filepath: Path to the CSV holding the expected values
            (columns: id, event_timestamp, then one per feature).
        dataset: BigQuery dataset containing the entity view.
        project: Optional GCP project to run the query in.
    """
    print("\n[Validating warehouse data]")
    expected = pd.read_csv(
        feature_values_filepath,
        names=["id", "event_timestamp"] + [f["name"] for f in feature_infos],
        dtype=dict(
            # np.bytes_ is the direct replacement for the np.string_ alias,
            # which was deprecated in NumPy 1.20 and removed in NumPy 2.0.
            [("id", np.bytes_)]
            + [(f["name"], f["dtype"]) for f in feature_infos]
        ),
        parse_dates=["event_timestamp"],
    )
    # TODO: Retrieve actual values via Feast Core rather than directly from BigQuery
    # Need to change Python SDK so can retrieve values via Feast Core while
    # "ensuring correct value types"
    actual = (
        bq_util.query_to_dataframe(
            f"SELECT {','.join(expected.columns)} FROM `{dataset}.{entity_name}_view`",
            project=project,
        )
        # Sort deterministically and normalize the timestamp dtype so the
        # frame comparison is order- and dtype-insensitive.
        .sort_values(["id", "event_timestamp"])
        .reset_index(drop=True)
        .astype({"event_timestamp": "datetime64[ns]"})
    )[expected.columns]
    pd.testing.assert_frame_equal(expected, actual)
    print("OK")
def test_query_to_dataframe():
    """query_to_dataframe should return the same rows as the checked-in Avro fixture."""
    fixture_path = os.path.join(
        testdata_path, "austin_bikeshare.bikeshare_stations.avro"
    )
    with open(fixture_path, "rb") as fixture:
        records = fastavro.reader(fixture)
        expected = pd.DataFrame.from_records(records)

    query = "SELECT * FROM `bigquery-public-data.austin_bikeshare.bikeshare_stations`"
    actual = query_to_dataframe(query)
    assert expected.equals(actual)
def validate_warehouse_data(project_id, expected):
    """Assert that the warehouse view contents equal the ``expected`` DataFrame.

    Queries the ``feast_it.myentity_view`` BigQuery view in ``project_id``,
    drops the ``created_timestamp`` column, and compares the deterministically
    sorted result against ``expected`` with ``DataFrame.equals``.

    Args:
        project_id: GCP project to run the query in.
        expected: DataFrame of expected rows, sorted by id and event_timestamp.
    """
    actual = (
        bq_util.query_to_dataframe(
            # Plain string: the former f-prefix had no placeholders (ruff F541).
            "SELECT * FROM `feast_it.myentity_view`", project=project_id
        )
        # created_timestamp is not relevant for validating correctness of import
        # and retrieval of feature values
        .drop(columns=["created_timestamp"])
        .sort_values(["id", "event_timestamp"])
        .reset_index(drop=True)
    )
    assert expected.equals(actual)
def test_query_to_dataframe_for_non_existing_dataset():
    """Querying a dataset that does not exist should raise BigQuery's NotFound."""
    # Adjacent string literals concatenate at compile time; the query text is
    # byte-identical to a single literal.
    missing_dataset_query = (
        "SELECT * FROM "
        "`bigquery-public-data.this_dataset_should_not_exists.bikeshare_stations`"
    )
    with pytest.raises(NotFound):
        query_to_dataframe(missing_dataset_query)