Example #1
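# Applies a feature set inferred from the first five rows of the test dataframe,
# ingests that subset, and asserts that batch retrieval returns the ingested values.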
def test_update_featureset_apply_featureset_and_ingest_first_subset(
        client, update_featureset_dataframe):
    subset_columns = [
        "datetime", "entity_id", "update_feature1", "update_feature2"
    ]
    subset_df = update_featureset_dataframe.iloc[:5][subset_columns]
    update_fs = FeatureSet(
        "update_fs",
        entities=[Entity(name="entity_id", dtype=ValueType.INT64)],
        max_age=Duration(seconds=432000),
    )
    update_fs.infer_fields_from_df(subset_df)
    client.apply(update_fs)

    client.ingest(feature_set=update_fs, source=subset_df)

    time.sleep(15)  # give the ingested rows time to become queryable
    feature_retrieval_job = client.get_batch_features(
        entity_rows=update_featureset_dataframe[["datetime", "entity_id"]].iloc[:5],
        feature_refs=[
            f"{PROJECT_NAME}/update_feature1",
            f"{PROJECT_NAME}/update_feature2",
        ],
    )

    output = feature_retrieval_job.to_dataframe().sort_values(by=["entity_id"])
    print(output.head())

    assert output["update_feature1"].to_list(
    ) == subset_df["update_feature1"].to_list()
    assert output["update_feature2"].to_list(
    ) == subset_df["update_feature2"].to_list()
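
For context: these snippets are excerpts from the Feast SDK test suite and rely on pytest fixtures (client, dataframe, update_featureset_dataframe) and helpers (wait_for, get_rows_ingested, clean_up_remote_files) defined elsewhere in that suite and not reproduced here. Below is a minimal sketch of the imports and module-level names they appear to assume; the PROJECT_NAME value is a hypothetical stand-in, and exact import paths vary across Feast 0.x releases.

import time
from datetime import timedelta
from unittest.mock import MagicMock

import pandas as pd
import pytest
from google.protobuf.duration_pb2 import Duration

# FeatureSet-era (Feast 0.x) SDK imports; later Feast versions removed FeatureSet.
from feast import Client, Entity, Feature, FeatureSet, ValueType
from feast.source import KafkaSource  # module path may differ by release

# Hypothetical placeholder; the real tests derive this from their environment.
PROJECT_NAME = "my_project"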
Example #2
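# Updates the feature set with a second column subset, retries ingestion until all
# rows land in BigQuery, then verifies retrieval of the old and new features.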
def test_update_featureset_update_featureset_and_ingest_second_subset(
        client, update_featureset_dataframe):
    subset_columns = [
        "datetime",
        "entity_id",
        "update_feature1",
        "update_feature3",
        "update_feature4",
    ]
    subset_df = update_featureset_dataframe.iloc[5:][subset_columns]
    update_fs = FeatureSet(
        "update_fs",
        entities=[Entity(name="entity_id", dtype=ValueType.INT64)],
        max_age=Duration(seconds=432000),
    )
    update_fs.infer_fields_from_df(subset_df)
    client.apply(update_fs)

    # Retry ingestion until every row makes it into the buffer. This is
    # necessary because BigQuery streaming inserts cache table schemas, so
    # rows written right after a schema change may be lost.
    while True:
        ingestion_id = client.ingest(feature_set=update_fs, source=subset_df)
        time.sleep(15)  # wait for rows to get written to bq
        rows_ingested = get_rows_ingested(client, update_fs, ingestion_id)
        if rows_ingested == len(subset_df):
            print(
                f"Number of rows successfully ingested: {rows_ingested}. Continuing."
            )
            break
        print(
            f"Number of rows successfully ingested: {rows_ingested}. Retrying ingestion."
        )
        time.sleep(30)

    def check():
        feature_retrieval_job = client.get_batch_features(
            entity_rows=update_featureset_dataframe[["datetime", "entity_id"]].iloc[5:],
            feature_refs=[
                "update_feature1",
                "update_feature3",
                "update_feature4",
            ],
            project=PROJECT_NAME,
        )

        output = feature_retrieval_job.to_dataframe(timeout_sec=180).sort_values(
            by=["entity_id"]
        )
        print(output.head())

        assert output["update_feature1"].to_list(
        ) == subset_df["update_feature1"].to_list()
        assert output["update_feature3"].to_list(
        ) == subset_df["update_feature3"].to_list()
        assert output["update_feature4"].to_list(
        ) == subset_df["update_feature4"].to_list()
        clean_up_remote_files(feature_retrieval_job.get_avro_files())

    wait_for(check, timedelta(minutes=5))
Example #3
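    # Verifies that applying and ingesting an invalid dataframe raises the
    # exception supplied by the parametrized fixture.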
    def test_feature_set_ingest_failure(self, client, dataframe, exception):
        with pytest.raises(exception):
            # Create feature set
            driver_fs = FeatureSet("driver-feature-set")

            # Update based on dataset
            driver_fs.infer_fields_from_df(dataframe)

            # Register with Feast core
            client.apply(driver_fs)

            # Ingest data into Feast
            client.ingest(driver_fs, dataframe=dataframe)
Example #4
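    # Same failure check as Example #3, but with a Kafka source and a mocked
    # message producer so no real broker is required.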
    def test_feature_set_ingest_failure(self, client, dataframe, exception):
        with pytest.raises(exception):
            # Create feature set
            driver_fs = FeatureSet("driver-feature-set")
            driver_fs.source = KafkaSource(topic="feature-topic",
                                           brokers="fake.broker.com")
            client._message_producer = MagicMock()
            client._message_producer.produce = MagicMock()

            # Update based on dataset
            driver_fs.infer_fields_from_df(dataframe)

            # Register with Feast core
            client.apply(driver_fs)

            # Ingest data into Feast
            client.ingest(driver_fs, dataframe=dataframe)
Example #5
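    # Checks that schema inference adds the expected number of features and
    # entities, honoring the discard_unused_fields flag.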
    def test_add_features_from_df_success(
        self,
        dataframe,
        feature_count,
        entity_count,
        discard_unused_fields,
        features,
        entities,
    ):
        my_feature_set = FeatureSet(
            name="my_feature_set",
            features=[Feature(name="dummy_f1", dtype=ValueType.INT64)],
            entities=[Entity(name="dummy_entity_1", dtype=ValueType.INT64)],
        )
        my_feature_set.infer_fields_from_df(
            dataframe,
            discard_unused_fields=discard_unused_fields,
            features=features,
            entities=entities,
        )
        assert len(my_feature_set.features) == feature_count
        assert len(my_feature_set.entities) == entity_count
Example #6
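# Variant of Example #1 that uses the newer get_historical_features API and
# polls with wait_for instead of a fixed sleep.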
def test_update_featureset_apply_featureset_and_ingest_first_subset(
    client, update_featureset_dataframe
):
    subset_columns = ["datetime", "entity_id", "update_feature1", "update_feature2"]
    subset_df = update_featureset_dataframe.iloc[:5][subset_columns]
    update_fs = FeatureSet(
        "update_fs",
        entities=[Entity(name="entity_id", dtype=ValueType.INT64)],
        max_age=Duration(seconds=432000),
    )
    update_fs.infer_fields_from_df(subset_df)
    client.apply(update_fs)

    client.ingest(feature_set=update_fs, source=subset_df)

    def check():
        feature_retrieval_job = client.get_historical_features(
            entity_rows=update_featureset_dataframe[["datetime", "entity_id"]].iloc[:5],
            feature_refs=["update_feature1", "update_feature2"],
            project=PROJECT_NAME,
        )

        output = feature_retrieval_job.to_dataframe(timeout_sec=180).sort_values(
            by=["entity_id"]
        )
        print(output.head())

        assert (
            output["update_feature1"].to_list()
            == subset_df["update_feature1"].to_list()
        )
        assert (
            output["update_feature2"].to_list()
            == subset_df["update_feature2"].to_list()
        )

        clean_up_remote_files(feature_retrieval_job.get_avro_files())

    wait_for(check, timedelta(minutes=5))
Example #7
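    # Inferring fields from an empty dataframe should raise.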
    def test_update_from_source_failure(self):
        with pytest.raises(Exception):
            df = pd.DataFrame()
            fs = FeatureSet("driver-feature-set")
            fs.infer_fields_from_df(df)