Example No. 1
File: job.py Project: vjrkr/feast
    def get_avro_files(self, timeout_sec: int = int(defaults[CONFIG_TIMEOUT_KEY])):
        """
        Wait until the job is done, then return the URIs of the Avro result
        files on Google Cloud Storage.

        Args:
            timeout_sec (int):
                Maximum number of seconds to wait until the job is done. If
                "timeout_sec" is exceeded, an exception will be raised.

        Returns:
            List of parsed Google Cloud Storage URIs of the returned Avro files.
        """

        def try_retrieve():
            self.reload()
            return None, self.status == JOB_STATUS_DONE

        wait_retry_backoff(
            retry_fn=try_retrieve,
            timeout_secs=timeout_sec,
            timeout_msg="Timeout exceeded while waiting for result. Please retry "
            "this method or use a longer timeout value.",
        )

        if self.job_proto.error:
            raise Exception(self.job_proto.error)

        if self.job_proto.data_format != DATA_FORMAT_AVRO:
            raise Exception(
                "Feast only supports Avro data format for now. Please check "
                "your Feast Serving deployment."
            )

        return [urlparse(uri) for uri in self.job_proto.file_uris]
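
Most examples on this page drive polling through wait_retry_backoff, whose contract is visible from try_retrieve above: the retry function returns a (result, is_done) tuple, and the helper keeps calling it until is_done is true or timeout_secs elapses, then fails with timeout_msg. The snippet below is only a minimal sketch reconstructed from that observed usage, not the actual Feast implementation; the backoff constants and the TimeoutError type are assumptions.

import time
from typing import Any, Callable, Tuple


def wait_retry_backoff(
    retry_fn: Callable[[], Tuple[Any, bool]],
    timeout_secs: int = 300,
    timeout_msg: str = "Timed out while waiting for retry_fn to succeed",
) -> Any:
    """Call retry_fn until it reports success or the timeout is exceeded.

    Sketch only: retry_fn must return a (result, is_done) tuple, and result
    is handed back to the caller once is_done is True.
    """
    delay_secs = 1  # assumed initial backoff
    max_delay_secs = 16  # assumed backoff cap
    deadline = time.time() + timeout_secs

    while True:
        result, is_done = retry_fn()
        if is_done:
            return result
        if time.time() >= deadline:
            raise TimeoutError(timeout_msg)
        time.sleep(min(delay_secs, max(0.0, deadline - time.time())))
        delay_secs = min(delay_secs * 2, max_delay_secs)

Returning a (result, is_done) tuple keeps "not ready yet" cheap for callers such as try_retrieve, while hard failures are still surfaced as exceptions raised inside the retry function.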
Example No. 2
def test_ingest_into_bq(
    feast_client: Client,
    customer_entity: Entity,
    driver_entity: Entity,
    bq_dataframe: pd.DataFrame,
    bq_dataset: str,
    pytestconfig,
):
    bq_project = pytestconfig.getoption("bq_project")
    bq_table_id = f"bq_staging_{datetime.now():%Y%m%d%H%M%s}"
    ft = FeatureTable(
        name="basic_featuretable",
        entities=["driver_id", "customer_id"],
        features=[
            Feature(name="dev_feature_float", dtype=ValueType.FLOAT),
            Feature(name="dev_feature_string", dtype=ValueType.STRING),
        ],
        max_age=Duration(seconds=3600),
        batch_source=BigQuerySource(
            table_ref=f"{bq_project}:{bq_dataset}.{bq_table_id}",
            event_timestamp_column="datetime",
            created_timestamp_column="timestamp",
        ),
    )

    # ApplyEntity
    feast_client.apply(customer_entity)
    feast_client.apply(driver_entity)

    # ApplyFeatureTable
    feast_client.apply(ft)
    feast_client.ingest(ft, bq_dataframe, timeout=120)

    bq_client = bigquery.Client(project=bq_project)

    # Poll BQ for table until the table has been created
    def try_get_table():
        try:
            table = bq_client.get_table(
                bigquery.TableReference(
                    bigquery.DatasetReference(bq_project, bq_dataset), bq_table_id
                )
            )
        except NotFound:
            return None, False
        else:
            return table, True

    wait_retry_backoff(
        retry_fn=try_get_table,
        timeout_secs=30,
        timeout_msg="Timed out trying to get bigquery table",
    )

    query_string = f"SELECT * FROM `{bq_project}.{bq_dataset}.{bq_table_id}`"

    job = bq_client.query(query_string)
    query_df = job.to_dataframe()

    assert_frame_equal(query_df, bq_dataframe)
Example No. 3
def test_basic_retrieve_online_multiple_featureset(client, cust_trans_df,
                                                   driver_df):
    # Test retrieval with different variations of the string feature refs,
    # i.e. feature set inference for feature refs without a specified feature set
    feature_ref_df_mapping = [
        ("customer_transactions:daily_transactions", cust_trans_df),
        ("driver:rating", driver_df),
        ("total_transactions", cust_trans_df),
    ]

    # Poll serving for feature values until the correct values are returned
    def try_get_features():
        feature_refs = [mapping[0] for mapping in feature_ref_df_mapping]
        response = client.get_online_features(
            entity_rows=[
                GetOnlineFeaturesRequest.EntityRow(
                    fields={
                        "customer_id":
                        Value(int64_val=cust_trans_df.iloc[0]["customer_id"]),
                        "driver_id":
                        Value(int64_val=driver_df.iloc[0]["driver_id"])
                    })
            ],
            feature_refs=feature_refs,
        )  # type: GetOnlineFeaturesResponse
        is_ok = all([
            check_online_response(ref, df, response)
            for ref, df in feature_ref_df_mapping
        ])
        return response, is_ok

    wait_retry_backoff(
        retry_fn=try_get_features,
        timeout_secs=90,
        timeout_msg="Timed out trying to get online feature values")
Example No. 4
def test_large_volume_retrieve_online_success(client, large_volume_dataframe):
    # Poll serving for feature values until the correct values are returned
    feature_refs = [
        "daily_transactions_large",
        "total_transactions_large",
    ]
    def try_get_features():
        response = client.get_online_features(
            entity_rows=[
                GetOnlineFeaturesRequest.EntityRow(
                    fields={
                        "customer_id":
                        Value(int64_val=large_volume_dataframe.iloc[0]
                              ["customer_id"])
                    })
            ],
            feature_refs=feature_refs,
        )  # type: GetOnlineFeaturesResponse
        is_ok = all([
            check_online_response(ref, large_volume_dataframe, response)
            for ref in feature_refs
        ])
        return response, is_ok

    wait_retry_backoff(
        retry_fn=try_get_features,
        timeout_secs=90,
        timeout_msg="Timed out trying to get online feature values")
Example No. 5
def test_basic_retrieve_online_success(client, cust_trans_df):
    feature_refs = ["daily_transactions", "total_transactions", "null_values"]

    # Poll serving for feature values until the correct values are returned
    def try_get_features():
        response = client.get_online_features(
            entity_rows=[
                GetOnlineFeaturesRequest.EntityRow(
                    fields={
                        "customer_id":
                        Value(int64_val=cust_trans_df.iloc[0]["customer_id"])
                    })
            ],
            feature_refs=feature_refs,
        )  # type: GetOnlineFeaturesResponse
        is_ok = all([
            check_online_response(ref, cust_trans_df, response)
            for ref in feature_refs
        ])
        return response, is_ok

    wait_retry_backoff(
        retry_fn=try_get_features,
        timeout_secs=90,
        timeout_msg="Timed out trying to get online feature values",
    )
Example No. 6
def ingest_and_verify(feast_client: Client, feature_table: FeatureTable,
                      original: pd.DataFrame):
    job = feast_client.start_offline_to_online_ingestion(
        feature_table,
        original.event_timestamp.min().to_pydatetime(),
        original.event_timestamp.max().to_pydatetime() + timedelta(seconds=1),
    )
    assert job.get_feature_table() == feature_table.name

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180)

    features = feast_client.get_online_features(
        [f"{feature_table.name}:unique_drivers"],
        entity_rows=[{
            "s2id": s2_id
        } for s2_id in original["s2id"].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[["s2id", f"{feature_table.name}:unique_drivers"]],
        original[["s2id", "unique_drivers"]].rename(
            columns={"unique_drivers": f"{feature_table.name}:unique_drivers"
                     }),
    )
Example No. 7
def start_job(feast_client: Client, feature_table: FeatureTable, pytestconfig):
    if pytestconfig.getoption("scheduled_streaming_job"):
        return

    job = feast_client.start_stream_to_online_ingestion(feature_table)
    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120)
    return job
Example No. 8
def test_batch_get_historical_features_with_file(client):
    file_fs1 = client.get_feature_set(name="file_feature_set")

    N_ROWS = 10
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    features_1_df = pd.DataFrame(
        {
            "datetime": [time_offset] * N_ROWS,
            "entity_id": [i for i in range(N_ROWS)],
            "feature_value1": [f"{i}" for i in range(N_ROWS)],
        }
    )

    # The feature set may already be READY (the direct runner sets it right
    # after the job is submitted), but the Kafka consumer may not be
    # configured yet, so give the ingestion job some time to warm up.
    wait_retry_backoff(
        retry_fn=(
            lambda: (
                None,
                client.get_feature_set(name="file_feature_set").status
                == FeatureSetStatus.STATUS_READY,
            )
        ),
        timeout_secs=480,
        timeout_msg="Wait for FeatureSet to be READY",
    )
    time.sleep(20)

    client.ingest(file_fs1, features_1_df, timeout=480)

    # Rename column (datetime -> event_timestamp)
    features_1_df = features_1_df.rename(columns={"datetime": "event_timestamp"})

    to_avro(
        df=features_1_df[["event_timestamp", "entity_id"]],
        file_path_or_buffer="file_feature_set.avro",
    )

    time.sleep(10)

    def check():
        feature_retrieval_job = client.get_historical_features(
            entity_rows="file://file_feature_set.avro",
            feature_refs=["feature_value1"],
            project=PROJECT_NAME,
        )

        output = feature_retrieval_job.to_dataframe(timeout_sec=180)
        print(output.head())

        assert output["entity_id"].to_list() == [
            int(i) for i in output["feature_value1"].to_list()
        ]
        clean_up_remote_files(feature_retrieval_job.get_avro_files())

    wait_for(check, timedelta(minutes=10))
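
Unlike wait_retry_backoff, the wait_for helper used above takes a callable that signals failure by raising (the assert inside check) rather than by returning a flag. A minimal sketch under that assumption might look like the following; the retry interval and the re-raise-on-deadline behaviour are guesses, not the Feast helper.

import time
from datetime import timedelta
from typing import Callable


def wait_for(fn: Callable[[], None], timeout: timedelta, interval_secs: float = 5.0) -> None:
    """Retry fn until it returns without raising or until timeout expires.

    Hypothetical sketch matching the call site above: the last exception is
    re-raised once the deadline is reached.
    """
    deadline = time.time() + timeout.total_seconds()
    while True:
        try:
            fn()
            return
        except Exception:
            if time.time() >= deadline:
                raise
            time.sleep(interval_secs)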
Example No. 9
def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(name="long_entity_name" * 10,
                    description="S2id",
                    value_type=ValueType.INT64)

    feature_table = FeatureTable(
        name="just1a2featuretable3with4a5really6really7really8really9really10",
        entities=[entity.name],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    data_sample = generate_data().rename(columns={"s2id": entity.name})
    feast_client.ingest(feature_table, data_sample)

    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        data_sample.event_timestamp.min().to_pydatetime(),
        data_sample.event_timestamp.max().to_pydatetime() +
        timedelta(seconds=1),
    )

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180)
    all_job_ids = [
        job.get_id() for job in feast_spark_client.list_jobs(
            include_terminated=True,
            project=feast_client.project,
            table_name=feature_table.name,
        )
    ]
    assert job.get_id() in all_job_ids

    features = feast_client.get_online_features(
        [f"{feature_table.name}:unique_drivers"],
        entity_rows=[{
            entity.name: key
        } for key in data_sample[entity.name].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[[entity.name, f"{feature_table.name}:unique_drivers"]],
        data_sample[[entity.name, "unique_drivers"]].rename(
            columns={"unique_drivers": f"{feature_table.name}:unique_drivers"
                     }),
    )
Example No. 10
def test_ingest(
    client: Client,
    customer_entity: Entity,
    driver_entity: Entity,
    bq_featuretable: FeatureTable,
    bq_dataset: pd.DataFrame,
    bq_table_id: str,
):
    gcp_project, _ = bq_table_id.split(":")
    bq_table_id = bq_table_id.replace(":", ".")

    # ApplyEntity
    client.apply_entity(customer_entity)
    client.apply_entity(driver_entity)

    # ApplyFeatureTable
    client.apply_feature_table(bq_featuretable)
    client.ingest(bq_featuretable, bq_dataset, timeout=120)

    from google.api_core.exceptions import NotFound
    from google.cloud import bigquery

    bq_client = bigquery.Client(project=gcp_project)

    # Poll BQ for table until the table has been created
    def try_get_table():
        table_exist = False
        table_resp = None
        try:
            table_resp = bq_client.get_table(bq_table_id)

            if table_resp and table_resp.table_id == bq_table_id.split(
                    ".")[-1]:
                table_exist = True
        except NotFound:
            pass

        return table_resp, table_exist

    wait_retry_backoff(
        retry_fn=try_get_table,
        timeout_secs=30,
        timeout_msg="Timed out trying to get bigquery table",
    )

    query_string = f"SELECT * FROM `{bq_table_id}`"

    job = bq_client.query(query_string)
    query_df = job.to_dataframe()

    assert_frame_equal(query_df, bq_dataset)

    bq_client.delete_table(bq_table_id, not_found_ok=True)
Example No. 11
File: job.py Project: zulily/feast
    def wait(self, status: IngestionJobStatus, timeout_secs: int = 300):  # type: ignore
        """
        Wait for this IngestJob to transition to the given status.
        Raises TimeoutError if the wait operation times out.

        Args:
            status: The IngestionJobStatus to wait for.
            timeout_secs: Maximum seconds to wait before timing out.
        """
        # poll & wait for job status to transition
        wait_retry_backoff(
            retry_fn=(lambda: (None, self.status == status)),  # type: ignore
            timeout_secs=timeout_secs,
            timeout_msg="Wait for IngestJob's status to transition timed out",
        )
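
As a hedged usage illustration only: assuming the surrounding SDK exposes a way to obtain IngestJob instances (the list_ingest_jobs call below) and that IngestionJobStatus has a RUNNING member, both of which are assumptions not shown above, the wait method can gate a test on the job actually starting.

# Illustrative sketch; "client", list_ingest_jobs() and the RUNNING member
# are assumed to exist in the surrounding SDK and are not defined above.
jobs = client.list_ingest_jobs()
if jobs:
    job = jobs[0]
    # Block for up to five minutes until the job transitions to RUNNING;
    # the underlying wait_retry_backoff call raises if the timeout is exceeded.
    job.wait(IngestionJobStatus.RUNNING, timeout_secs=300)
    assert job.status == IngestionJobStatus.RUNNING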
Example No. 12
def ingest_and_retrieve(
    feast_client: Client,
    df: pd.DataFrame,
    topic_name: str,
    kafka_broker: str,
    avro_schema_json: str,
    entity_rows: List[Dict[str, Any]],
    feature_names: List[Any],
    expected_ingested_count: Optional[int] = None,
):
    expected_ingested_count = expected_ingested_count or df.shape[0]

    for record in df.to_dict("records"):
        record["event_timestamp"] = (
            record["event_timestamp"].to_pydatetime().replace(tzinfo=pytz.utc))

        send_avro_record_to_kafka(
            topic_name,
            record,
            bootstrap_servers=kafka_broker,
            avro_schema_json=avro_schema_json,
        )

    def get_online_features():
        features = feast_client.get_online_features(
            feature_names,
            entity_rows=entity_rows,
        ).to_dict()
        out_df = pd.DataFrame.from_dict(features)
        return out_df, out_df[feature_names].count().min(
        ) >= expected_ingested_count

    ingested = wait_retry_backoff(get_online_features, 120)
    return ingested
Example No. 13
def test_offline_ingestion(feast_client: Client, local_staging_path: str):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "batch-storage"),
        ),
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    original = generate_data()
    feast_client.ingest(feature_table,
                        original)  # write to batch (offline) storage

    job = feast_client.start_offline_to_online_ingestion(
        feature_table, datetime.today(),
        datetime.today() + timedelta(days=1))

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 60)

    features = feast_client.get_online_features(
        ["drivers:unique_drivers"],
        entity_rows=[{
            "s2id": s2_id
        } for s2_id in original["s2id"].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers:unique_drivers"]],
        original[[
            "s2id", "unique_drivers"
        ]].rename(columns={"unique_drivers": "drivers:unique_drivers"}),
    )
Example No. 14
def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name=
        "just1a2featuretable3with4a5really6really7really8really9really10really11really12long13name",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    data_sample = generate_data()
    feast_client.ingest(feature_table, data_sample)

    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        data_sample.event_timestamp.min().to_pydatetime(),
        data_sample.event_timestamp.max().to_pydatetime() +
        timedelta(seconds=1),
    )

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180)
    all_job_ids = [
        job.get_id() for job in feast_spark_client.list_jobs(
            include_terminated=True,
            project=feast_client.project,
            table_name=feature_table.name,
        )
    ]
    assert job.get_id() in all_job_ids
Example No. 15
def test_all_types_retrieve_online_success(client, all_types_dataframe):
    # Poll serving for feature values until the correct values are returned
    feature_refs = [
        "float_feature",
        "int64_feature",
        "int32_feature",
        "double_feature",
        "string_feature",
        "bool_feature",
        "bytes_feature",
        "float_list_feature",
        "int64_list_feature",
        "int32_list_feature",
        "string_list_feature",
        "bytes_list_feature",
        "double_list_feature",
    ]

    def try_get_features():
        response = client.get_online_features(
            entity_rows=[
                GetOnlineFeaturesRequest.EntityRow(
                    fields={
                        "user_id":
                        Value(int64_val=all_types_dataframe.iloc[0]["user_id"])
                    })
            ],
            feature_refs=feature_refs,
        )  # type: GetOnlineFeaturesResponse
        is_ok = check_online_response("float_feature", all_types_dataframe,
                                      response)
        return response, is_ok

    response = wait_retry_backoff(
        retry_fn=try_get_features,
        timeout_secs=90,
        timeout_msg="Timed out trying to get online feature values",
    )

    # check returned values
    returned_float_list = (response.field_values[0].
                           fields["float_list_feature"].float_list_val.val)
    sent_float_list = all_types_dataframe.iloc[0]["float_list_feature"]
    assert math.isclose(returned_float_list[0],
                        sent_float_list[0],
                        abs_tol=FLOAT_TOLERANCE)
    # check returned metadata
    assert (response.field_values[0].statuses["float_list_feature"] ==
            GetOnlineFeaturesResponse.FieldStatus.PRESENT)
Example No. 16
def test_basic_ingest_retrieval_str(client):
    # Set to another project to test ingestion based on current project context
    client.set_project(PROJECT_NAME + "_NS1")
    customer_fs = FeatureSet(
        name="cust_fs",
        features=[
            Feature(name="cust_rating", dtype=ValueType.INT64),
            Feature(name="cust_cost", dtype=ValueType.FLOAT),
        ],
        entities=[Entity("cust_id", ValueType.INT64)],
        max_age=Duration(seconds=3600),
    )
    client.apply(customer_fs)

    N_ROWS = 2
    time_offset = datetime.utcnow().replace(tzinfo=pytz.utc)
    cust_df = pd.DataFrame({
        "datetime": [time_offset] * N_ROWS,
        "cust_id": [i for i in range(N_ROWS)],
        "cust_rating": [i for i in range(N_ROWS)],
        "cust_cost": [float(i) + 0.5 for i in range(N_ROWS)],
    })
    client.ingest("cust_fs", cust_df, timeout=600)
    time.sleep(15)

    online_request_entity = [{"cust_id": 0}, {"cust_id": 1}]
    online_request_features = ["cust_rating", "cust_cost"]

    def try_get_features():
        response = client.get_online_features(
            entity_rows=online_request_entity,
            feature_refs=online_request_features)
        return response, True

    online_features_actual = wait_retry_backoff(
        retry_fn=try_get_features,
        timeout_secs=90,
        timeout_msg="Timed out trying to get online feature values",
    )

    online_features_expected = {
        "cust_id": [0, 1],
        "cust_rating": [0, 1],
        "cust_cost": [0.5, 1.5],
    }

    assert online_features_actual.to_dict() == online_features_expected
Example No. 17
def test_validation_reports_metrics(feast_client: Client, kafka_server,
                                    statsd_server: StatsDServer):
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    entity, feature_table = create_schema(kafka_broker, topic_name,
                                          "validation_ge_metrics")
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations, feature_table)
    apply_validation(feast_client,
                     feature_table,
                     udf,
                     validation_window_secs=10)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120)

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120)

    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    unexpected_counts = {
        "expect_column_values_to_be_between_num_0_100":
        validation_result.results[0].result["unexpected_count"],
        "expect_column_values_to_be_in_set_set":
        validation_result.results[1].result["unexpected_count"],
    }
    invalid_idx = list({
        idx
        for check in validation_result.results
        for idx in check.result["unexpected_index_list"]
    })

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=[
                "validation_ge_metrics:num", "validation_ge_metrics:set"
            ],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    expected_metrics = [(
        f"feast_feature_validation_check_failed#check:{check_name},"
        f"feature_table:{feature_table.name},project:{feast_client.project}",
        value,
    ) for check_name, value in unexpected_counts.items()]
    wait_retry_backoff(
        lambda: (
            None,
            all(
                statsd_server.metrics.get(m) == v
                for m, v in expected_metrics),
        ),
        timeout_secs=30,
        timeout_msg="Expected metrics were not received: " +
        str(expected_metrics) + "\n"
        "Actual received metrics" + str(statsd_server.metrics),
    )
Example No. 18
def test_basic_retrieve_online_entity_listform(client, list_entity_dataframe):
    # Case 1: Features retrieval with entity in list format check
    district_fs = FeatureSet(
        name="district",
        features=[
            Feature(name="district_rating", dtype=ValueType.INT64),
            Feature(name="district_cost", dtype=ValueType.FLOAT),
            Feature(name="district_past_transactions_int",
                    dtype=ValueType.INT64_LIST),
            Feature(name="district_past_transactions_double",
                    dtype=ValueType.DOUBLE_LIST),
            Feature(name="district_past_transactions_float",
                    dtype=ValueType.FLOAT_LIST),
            Feature(name="district_past_transactions_string",
                    dtype=ValueType.STRING_LIST),
            Feature(name="district_past_transactions_bool",
                    dtype=ValueType.BOOL_LIST),
        ],
        entities=[Entity("district_ids", dtype=ValueType.INT64_LIST)],
        max_age=Duration(seconds=3600),
    )

    client.set_project(PROJECT_NAME)
    client.apply(district_fs)

    district_fs = client.get_feature_set(name="district")
    client.ingest(district_fs, list_entity_dataframe, timeout=600)
    time.sleep(15)

    online_request_entity = [{
        "district_ids": [np.int64(1), np.int64(2),
                         np.int64(3)]
    }]
    online_request_features = [
        "district_rating",
        "district_cost",
        "district_past_transactions_int",
        "district_past_transactions_double",
        "district_past_transactions_float",
        "district_past_transactions_string",
        "district_past_transactions_bool",
    ]
    online_request_entity2 = [{
        "district_ids":
        Value(int64_list_val=Int64List(val=[1, 2, 3]))
    }]

    def try_get_features1():
        response = client.get_online_features(
            entity_rows=online_request_entity,
            feature_refs=online_request_features)
        return response, True

    def try_get_features2():
        response = client.get_online_features(
            entity_rows=online_request_entity2,
            feature_refs=online_request_features)
        return response, True

    online_features_actual = wait_retry_backoff(
        retry_fn=try_get_features1,
        timeout_secs=90,
        timeout_msg="Timed out trying to get online feature values",
    )

    online_features_actual2 = wait_retry_backoff(
        retry_fn=try_get_features2,
        timeout_secs=90,
        timeout_msg="Timed out trying to get online feature values",
    )

    online_features_expected = {
        "district_ids": [[np.int64(1), np.int64(2),
                          np.int64(3)]],
        "district_rating": [1],
        "district_cost": [1.5],
        "district_past_transactions_int": [[1, 3]],
        "district_past_transactions_double": [[1.5, 3.0]],
        "district_past_transactions_float": [[1.5, 3.0]],
        "district_past_transactions_string": [["first_1", "second_1"]],
        "district_past_transactions_bool": [[True, False]],
    }

    assert online_features_actual.to_dict() == online_features_expected
    assert online_features_actual2.to_dict() == online_features_expected

    # Case 2: Features retrieval with entity in list format check with mixed types
    with pytest.raises(ValueError) as excinfo:
        online_request_entity2 = [{
            "district_ids": [np.int64(1), np.int64(2), True]
        }]
        online_features_actual2 = client.get_online_features(
            entity_rows=online_request_entity2,
            feature_refs=online_request_features)

    assert (
        "List value type for field district_ids is inconsistent. ValueType.INT64 different from ValueType.BOOL."
        in str(excinfo.value))
Example No. 19
def test_basic_retrieve_online_entity_nonlistform(client,
                                                  nonlist_entity_dataframe,
                                                  list_entity_dataframe):
    # Case 1: Feature retrieval with multiple entities retrieval check
    customer_fs = FeatureSet(
        name="customer2",
        features=[
            Feature(name="customer2_rating", dtype=ValueType.INT64),
            Feature(name="customer2_cost", dtype=ValueType.FLOAT),
            Feature(name="customer2_past_transactions_int",
                    dtype=ValueType.INT64_LIST),
            Feature(name="customer2_past_transactions_double",
                    dtype=ValueType.DOUBLE_LIST),
            Feature(name="customer2_past_transactions_float",
                    dtype=ValueType.FLOAT_LIST),
            Feature(name="customer2_past_transactions_string",
                    dtype=ValueType.STRING_LIST),
            Feature(name="customer2_past_transactions_bool",
                    dtype=ValueType.BOOL_LIST),
        ],
        entities=[Entity("customer_id2", ValueType.INT64)],
        max_age=Duration(seconds=3600),
    )

    client.set_project(PROJECT_NAME)
    client.apply(customer_fs)

    customer_fs = client.get_feature_set(name="customer2")
    client.ingest(customer_fs, nonlist_entity_dataframe, timeout=600)
    time.sleep(15)

    online_request_entity = [{"customer_id2": 0}, {"customer_id2": 1}]
    online_request_features = [
        "customer2_rating",
        "customer2_cost",
        "customer2_past_transactions_int",
        "customer2_past_transactions_double",
        "customer2_past_transactions_float",
        "customer2_past_transactions_string",
        "customer2_past_transactions_bool",
    ]
    online_request_entity2 = [
        {
            "customer_id2": Value(int64_val=0)
        },
        {
            "customer_id2": Value(int64_val=1)
        },
    ]

    def try_get_features1():
        response = client.get_online_features(
            entity_rows=online_request_entity,
            feature_refs=online_request_features)
        return response, True

    def try_get_features2():
        response = client.get_online_features(
            entity_rows=online_request_entity2,
            feature_refs=online_request_features)
        return response, True

    online_features_actual1 = wait_retry_backoff(
        retry_fn=try_get_features1,
        timeout_secs=90,
        timeout_msg="Timed out trying to get online feature values",
    )

    online_features_actual2 = wait_retry_backoff(
        retry_fn=try_get_features2,
        timeout_secs=90,
        timeout_msg="Timed out trying to get online feature values",
    )

    online_features_expected = {
        "customer_id2": [0, 1],
        "customer2_rating": [0, 1],
        "customer2_cost": [0.5, 1.5],
        "customer2_past_transactions_int": [[0, 2], [1, 3]],
        "customer2_past_transactions_double": [[0.5, 2.0], [1.5, 3.0]],
        "customer2_past_transactions_float": [[0.5, 2.0], [1.5, 3.0]],
        "customer2_past_transactions_string": [
            ["first_0", "second_0"],
            ["first_1", "second_1"],
        ],
        "customer2_past_transactions_bool": [[True, False], [True, False]],
    }

    assert online_features_actual1.to_dict() == online_features_expected
    assert online_features_actual2.to_dict() == online_features_expected

    # Case 2: Feature retrieval with multiple entities retrieval check with mixed types
    with pytest.raises(TypeError) as excinfo:
        online_request_entity2 = [{
            "customer_id": 0
        }, {
            "customer_id": "error_pls"
        }]
        online_features_actual2 = client.get_online_features(
            entity_rows=online_request_entity2,
            feature_refs=online_request_features)

    assert (
        "Input entity customer_id has mixed types, ValueType.STRING and ValueType.INT64. That is not allowed."
        in str(excinfo.value))
Example No. 20
def test_validation_with_ge(feast_client: Client, kafka_server):
    entity = Entity(name="key", description="Key", value_type=ValueType.INT64)
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="validation_test",
        entities=["key"],
        features=[
            Feature("num", ValueType.INT64),
            Feature("set", ValueType.STRING)
        ],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url="/dev/null",
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations)
    apply_validation(feast_client,
                     feature_table,
                     udf,
                     validation_window_secs=1)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120)

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120)

    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    invalid_idx = list({
        idx
        for check in validation_result.results
        for idx in check.result["unexpected_index_list"]
    })

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_test:num", "validation_test:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    test_data["num"] = test_data["num"].astype(np.float64)
    test_data["num"].iloc[invalid_idx] = np.nan
    test_data["set"].iloc[invalid_idx] = None

    pd.testing.assert_frame_equal(
        ingested[["key", "validation_test:num", "validation_test:set"]],
        test_data[["key", "num", "set"]].rename(columns={
            "num": "validation_test:num",
            "set": "validation_test:set"
        }),
    )
Example No. 21
def test_streaming_ingestion(feast_client: Client, local_staging_path: str,
                             kafka_server):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="drivers_stream",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "batch-storage"),
        ),
        stream_source=KafkaSource(
            "event_timestamp",
            "event_timestamp",
            kafka_broker,
            AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 60)

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 60)

    try:
        original = generate_data()[[
            "s2id", "unique_drivers", "event_timestamp"
        ]]
        for record in original.to_dict("records"):
            record["event_timestamp"] = (
                record["event_timestamp"].to_pydatetime().replace(
                    tzinfo=pytz.utc))

            send_avro_record_to_kafka(
                topic_name,
                record,
                bootstrap_servers=kafka_broker,
                avro_schema_json=avro_schema(),
            )

        def get_online_features():
            features = feast_client.get_online_features(
                ["drivers_stream:unique_drivers"],
                entity_rows=[{
                    "s2id": s2_id
                } for s2_id in original["s2id"].tolist()],
            ).to_dict()
            df = pd.DataFrame.from_dict(features)
            return df, not df["drivers_stream:unique_drivers"].isna().any()

        ingested = wait_retry_backoff(get_online_features, 60)
    finally:
        job.cancel()

    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers_stream:unique_drivers"]],
        original[[
            "s2id", "unique_drivers"
        ]].rename(columns={"unique_drivers": "drivers_stream:unique_drivers"}),
    )
Example No. 22
def test_validation_with_ge(feast_client: Client, kafka_server):
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    entity, feature_table = create_schema(kafka_broker, topic_name,
                                          "validation_ge")
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations, feature_table)
    apply_validation(feast_client,
                     feature_table,
                     udf,
                     validation_window_secs=1)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120)

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120)

    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    invalid_idx = list({
        idx
        for check in validation_result.results
        for idx in check.result["unexpected_index_list"]
    })

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_ge:num", "validation_ge:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    test_data["num"] = test_data["num"].astype(np.float64)
    test_data["num"].iloc[invalid_idx] = np.nan
    test_data["set"].iloc[invalid_idx] = None

    pd.testing.assert_frame_equal(
        ingested[["key", "validation_ge:num", "validation_ge:set"]],
        test_data[["key", "num", "set"]].rename(columns={
            "num": "validation_ge:num",
            "set": "validation_ge:set"
        }),
    )
Example No. 23
# Register feature set
client.apply(all_types_fs_expected)

df.info()
df.describe()
df.head()

# Ingest data
client.ingest(all_types_fs_expected, df)


# Wait for data to be available
def try_get_features():
    online_request_entity = [{"user_id": 1001}]
    online_request_features = ["float_feature"]

    response = client.get_online_features(entity_rows=online_request_entity,
                                          feature_refs=online_request_features)
    response_dict = response.to_dict()
    if response_dict['float_feature'] == df.iloc[0]['float_feature']:
        return response_dict, True
    return response_dict, False


online_features_actual = wait_retry_backoff(
    retry_fn=try_get_features,
    timeout_secs=90,
    timeout_msg="Timed out trying to get online feature values",
)
Example No. 24
def test_online_store_cleanup(environment, universal_data_sources):
    """
    Some online store implementations (like Redis) store features from different feature views
    that share common entities together.
    This can result in the deletion of all features attached to an entity
    when only one feature view was the deletion target (see https://github.com/feast-dev/feast/issues/2150).

    Plan:
        1. Register two feature views with common entity "driver"
        2. Materialize data
        3. Check if features are available (via online retrieval)
        4. Delete one feature view
        5. Check that features for the other one are still available
        6. Delete the other feature view (and create it again)
        7. Verify that features for both feature views were deleted
    """
    fs = environment.feature_store
    entities, datasets, data_sources = universal_data_sources
    driver_stats_fv = construct_universal_feature_views(data_sources).driver

    driver_entities = entities.driver_vals
    df = pd.DataFrame({
        "ts_1": [environment.end_date] * len(driver_entities),
        "created_ts": [environment.end_date] * len(driver_entities),
        "driver_id":
        driver_entities,
        "value":
        np.random.random(size=len(driver_entities)),
    })

    ds = environment.data_source_creator.create_data_source(
        df, destination_name="simple_driver_dataset")

    simple_driver_fv = driver_feature_view(
        data_source=ds, name="test_universal_online_simple_driver")

    fs.apply([driver(), simple_driver_fv, driver_stats_fv])

    fs.materialize(
        environment.start_date - timedelta(days=1),
        environment.end_date + timedelta(days=1),
    )
    expected_values = df.sort_values(by="driver_id")

    features = [f"{simple_driver_fv.name}:value"]
    entity_rows = [{
        "driver_id": driver_id
    } for driver_id in sorted(driver_entities)]

    online_features = fs.get_online_features(
        features=features, entity_rows=entity_rows).to_dict()
    assert np.allclose(expected_values["value"], online_features["value"])

    fs.apply(objects=[simple_driver_fv],
             objects_to_delete=[driver_stats_fv],
             partial=False)

    online_features = fs.get_online_features(
        features=features, entity_rows=entity_rows).to_dict()
    assert np.allclose(expected_values["value"], online_features["value"])

    fs.apply(objects=[], objects_to_delete=[simple_driver_fv], partial=False)

    def eventually_apply() -> Tuple[None, bool]:
        try:
            fs.apply([simple_driver_fv])
        except BotoCoreError:
            return None, False

        return None, True

    # Online store backend might have eventual consistency in schema update
    # So recreating table that was just deleted might need some retries
    wait_retry_backoff(eventually_apply, timeout_secs=60)

    online_features = fs.get_online_features(
        features=features, entity_rows=entity_rows).to_dict()
    assert all(v is None for v in online_features["value"])
Example No. 25
def test_streaming_ingestion(feast_client: Client, local_staging_path: str,
                             kafka_server, pytestconfig):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="drivers_stream",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "batch-storage"),
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    if not pytestconfig.getoption("scheduled_streaming_job"):
        job = feast_client.start_stream_to_online_ingestion(feature_table)
        assert job.get_feature_table() == feature_table.name
        wait_retry_backoff(
            lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS),
            120)
    else:
        job = None

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 300)

    test_data = generate_data()[["s2id", "unique_drivers", "event_timestamp"]]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=[{
                "s2id": s2_id
            } for s2_id in test_data["s2id"].tolist()],
            feature_names=["drivers_stream:unique_drivers"],
        )
    finally:
        if job:
            job.cancel()
        else:
            feast_client.delete_feature_table(feature_table.name)

    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers_stream:unique_drivers"]],
        test_data[[
            "s2id", "unique_drivers"
        ]].rename(columns={"unique_drivers": "drivers_stream:unique_drivers"}),
    )