Example #1
def test_telemetry_on_v09(mocker):
    # Setup environment
    old_environ = dict(os.environ)
    os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
    test_telemetry_id = str(uuid.uuid4())
    os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id
    test_client = Client(serving_url=None, core_url=None, telemetry=True)
    test_client.set_project("project1")
    entity = Entity(
        name="driver_car_id",
        description="Car driver id",
        value_type=ValueType.STRING,
        labels={"team": "matchmaking"},
    )

    mocker.patch.object(
        test_client, "_apply_entity", return_value=None,
    )

    test_client.apply(entity)

    os.environ.clear()
    os.environ.update(old_environ)

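    # With telemetry enabled, the forced UUID should eventually show up in the BigQuery telemetry sink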
    ensure_bigquery_telemetry_id_with_retry(test_telemetry_id)
Example #2
def test_telemetry_off_v09(mocker):
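    # Set up the environment with a forced telemetry UUID but telemetry explicitly disabled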
    old_environ = dict(os.environ)
    os.environ["FEAST_IS_TELEMETRY_TEST"] = "True"
    test_telemetry_id = str(uuid.uuid4())
    os.environ["FEAST_FORCE_TELEMETRY_UUID"] = test_telemetry_id
    os.environ["FEAST_TELEMETRY"] = "False"

    test_client = Client(serving_url=None, core_url=None, telemetry=False)
    test_client.set_project("project1")
    entity = Entity(
        name="driver_car_id",
        description="Car driver id",
        value_type=ValueType.STRING,
        labels={"team": "matchmaking"},
    )

    mocker.patch.object(
        test_client, "_apply_entity", return_value=None,
    )

    test_client.apply(entity)

    os.environ.clear()
    os.environ.update(old_environ)
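    # Give any telemetry event time to flush, then confirm nothing was recorded for this UUID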
    sleep(30)
    rows = read_bigquery_telemetry_id(test_telemetry_id)
    assert rows.total_rows == 0
Example #3
def test_offline_ingestion(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    original = generate_data()
    feast_client.ingest(feature_table,
                        original)  # write to batch (offline) storage

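    # Run offline-to-online ingestion and verify the ingested rows are served online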
    ingest_and_verify(feast_client, feast_spark_client, feature_table,
                      original)
Example #4
def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
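    # Entity and feature table names are deliberately long to exercise job listing and retrieval with long names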
    entity = Entity(name="long_entity_name" * 10,
                    description="S2id",
                    value_type=ValueType.INT64)

    feature_table = FeatureTable(
        name="just1a2featuretable3with4a5really6really7really8really9really10",
        entities=[entity.name],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    data_sample = generate_data().rename(columns={"s2id": entity.name})
    feast_client.ingest(feature_table, data_sample)

    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        data_sample.event_timestamp.min().to_pydatetime(),
        data_sample.event_timestamp.max().to_pydatetime() +
        timedelta(seconds=1),
    )

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180)
    all_job_ids = [
        job.get_id() for job in feast_spark_client.list_jobs(
            include_terminated=True,
            project=feast_client.project,
            table_name=feature_table.name,
        )
    ]
    assert job.get_id() in all_job_ids

    features = feast_client.get_online_features(
        [f"{feature_table.name}:unique_drivers"],
        entity_rows=[{
            entity.name: key
        } for key in data_sample[entity.name].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[[entity.name, f"{feature_table.name}:unique_drivers"]],
        data_sample[[entity.name, "unique_drivers"]].rename(
            columns={"unique_drivers": f"{feature_table.name}:unique_drivers"
                     }),
    )
Example #5
def test_schedule_batch_ingestion_jobs(pytestconfig, feast_client: Client,
                                       feast_spark_client: SparkClient):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="gs://example/feast/*",
        event_timestamp_column="datetime_col",
        created_timestamp_column="timestamp",
        date_partition_column="datetime",
    )
    feature_table = FeatureTable(
        name=f"schedule_{str(uuid.uuid4())}".replace("-", "_"),
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )
    feast_client.apply(entity)
    feast_client.apply(feature_table)

    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, "0 0 * * *")
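    # Inspect the ScheduledSparkApplication custom resource created by the Spark operator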
    config.load_incluster_config()
    k8s_api = client.CustomObjectsApi()

    def get_scheduled_spark_application():
        job_hash = hashlib.md5(
            f"{feast_client.project}-{feature_table.name}".encode()).hexdigest()
        resource_name = f"feast-{job_hash}"

        return k8s_api.get_namespaced_custom_object(
            group="sparkoperator.k8s.io",
            version="v1beta2",
            namespace=pytestconfig.getoption("k8s_namespace"),
            plural="scheduledsparkapplications",
            name=resource_name,
        )

    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == "0 0 * * *"
    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, "1 0 * * *")
    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == "1 0 * * *"

    feast_spark_client.unschedule_offline_to_online_ingestion(feature_table)
Example #6
def test_historical_features(feast_client: Client,
                             batch_source: Union[BigQuerySource, FileSource]):
    customer_entity = Entity(name="user_id",
                             description="Customer",
                             value_type=ValueType.INT64)
    feast_client.apply(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )

    feast_client.apply(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    job = feast_client.get_historical_features(feature_refs, customers_df)
    output_dir = job.get_output_file_uri()
    joined_df = read_parquet(output_dir)

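    # Expected result of the point-in-time join: matched customers get daily_transactions, the rest get None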
    expected_joined_df = pd.DataFrame({
        "event_timestamp": customers_df.event_timestamp.tolist(),
        "user_id": customers_df.user_id.tolist(),
        "transactions__daily_transactions": (
            transactions_df.daily_transactions.tolist()
            + [None] * transactions_df.shape[0]
        ),
    })

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True),
        expected_joined_df.sort_values(
            by=["user_id", "event_timestamp"]).reset_index(drop=True),
    )
Example #7
def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="just1a2featuretable3with4a5really6really7really8really9really10really11really12long13name",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    data_sample = generate_data()
    feast_client.ingest(feature_table, data_sample)

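    # Launch a Spark job that copies the ingested batch data into the online store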
    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        data_sample.event_timestamp.min().to_pydatetime(),
        data_sample.event_timestamp.max().to_pydatetime() +
        timedelta(seconds=1),
    )

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180)
    all_job_ids = [
        job.get_id() for job in feast_spark_client.list_jobs(
            include_terminated=True,
            project=feast_client.project,
            table_name=feature_table.name,
        )
    ]
    assert job.get_id() in all_job_ids
Example #8
def test_offline_ingestion_from_bq_view(pytestconfig, bq_dataset,
                                        feast_client: Client,
                                        feast_spark_client: SparkClient):
    original = generate_data()
    bq_project = pytestconfig.getoption("bq_project")

    bq_client = bigquery.Client(project=bq_project)
    source_ref = bigquery.TableReference(
        bigquery.DatasetReference(bq_project, bq_dataset),
        f"ingestion_source_{datetime.now():%Y%m%d%H%M%s}",
    )
    bq_client.load_table_from_dataframe(original, source_ref).result()

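    # Expose the uploaded data through a BigQuery view and point the batch source at the view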
    view_ref = bigquery.TableReference(
        bigquery.DatasetReference(bq_project, bq_dataset),
        f"ingestion_view_{datetime.now():%Y%m%d%H%M%s}",
    )
    view = bigquery.Table(view_ref)
    view.view_query = f"select * from `{source_ref.project}.{source_ref.dataset_id}.{source_ref.table_id}`"
    bq_client.create_table(view)

    entity = Entity(name="s2id",
                    description="S2id",
                    value_type=ValueType.INT64)
    feature_table = FeatureTable(
        name="bq_ingestion",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=BigQuerySource(
            event_timestamp_column="event_timestamp",
            table_ref=f"{view_ref.project}:{view_ref.dataset_id}.{view_ref.table_id}",
        ),
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    ingest_and_verify(feast_client, feast_spark_client, feature_table,
                      original)
Example #9
def test_historical_features(
    feast_client: Client,
    tfrecord_feast_client: Client,
    batch_source: Union[BigQuerySource, FileSource],
):
    customer_entity = Entity(name="user_id",
                             description="Customer",
                             value_type=ValueType.INT64)
    feast_client.apply(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )

    feast_client.apply(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    # remove microseconds because job.get_start_time() does not contain microseconds
    job_submission_time = datetime.utcnow().replace(microsecond=0)
    job = feast_client.get_historical_features(feature_refs, customers_df)
    assert job.get_start_time() >= job_submission_time
    assert job.get_start_time() <= job_submission_time + timedelta(hours=1)

    output_dir = job.get_output_file_uri()

    # will both be None if not using Azure blob storage
    account_name, account_key = _get_azure_creds(feast_client)

    joined_df = read_parquet(output_dir,
                             azure_account_name=account_name,
                             azure_account_key=account_key)

    expected_joined_df = pd.DataFrame({
        "event_timestamp": customers_df.event_timestamp.tolist(),
        "user_id": customers_df.user_id.tolist(),
        "transactions__daily_transactions": (
            transactions_df.daily_transactions.tolist()
            + [None] * transactions_df.shape[0]
        ),
    })

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True),
        expected_joined_df.sort_values(
            by=["user_id", "event_timestamp"]).reset_index(drop=True),
    )

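    # Retrieve the same features with the TFRecord-configured client and verify its job completes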
    job = tfrecord_feast_client.get_historical_features(
        feature_refs, customers_df)
    job.get_output_file_uri()
    assert job.get_status() == SparkJobStatus.COMPLETED
Example #10
        name="dummy_entity_1",
        description="Dummy entity 1",
        value_type=ValueType.STRING,
        labels={"key": "val"},
    )

    # create dummy entity since Feast demands it
    entity_2 = Entity(
        name="dummy_entity_2",
        description="Dummy entity 2",
        value_type=ValueType.INT32,
        labels={"key": "val"},
    )

    # commit entities
    test_client.apply([entity_1, entity_2])

    # dummy file source
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    # first feature table for testing, with all of Feast's datatypes
    table_1 = FeatureTable(
        name="test_feature_table_all_feature_dtypes",
        features=[
            Feature(name="test_BYTES_feature", dtype=ValueType.BYTES),
Example #11
        Feature(name="bytes_feature", dtype=ValueType.BYTES),
        Feature(name="bool_feature", dtype=ValueType.BOOL),
        Feature(name="double_feature", dtype=ValueType.DOUBLE),
        Feature(name="double_list_feature", dtype=ValueType.DOUBLE_LIST),
        Feature(name="float_list_feature", dtype=ValueType.FLOAT_LIST),
        Feature(name="int64_list_feature", dtype=ValueType.INT64_LIST),
        Feature(name="int32_list_feature", dtype=ValueType.INT32_LIST),
        Feature(name="string_list_feature", dtype=ValueType.STRING_LIST),
        Feature(name="bytes_list_feature", dtype=ValueType.BYTES_LIST),
    ],
)

client = Client(core_url=feast_core_url, serving_url=feast_online_serving_url)

# Register feature set
client.apply(all_types_fs_expected)

df.info()
df.describe()
df.head()

# Ingest data
client.ingest(all_types_fs_expected, df)


# Wait for data to be available
def try_get_features():
    online_request_entity = [{"user_id": 1001}]
    online_request_features = ["float_feature"]

    response = client.get_online_features(
        entity_rows=online_request_entity, feature_refs=online_request_features
    )
Example #12
def test_streaming_ingestion(feast_client: Client, local_staging_path: str,
                             kafka_server, pytestconfig):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="drivers_stream",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "batch-storage"),
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

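    # Start an ad-hoc streaming ingestion job unless a pre-scheduled one is expected to be running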
    if not pytestconfig.getoption("scheduled_streaming_job"):
        job = feast_client.start_stream_to_online_ingestion(feature_table)
        assert job.get_feature_table() == feature_table.name
        wait_retry_backoff(
            lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS),
            120)
    else:
        job = None

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 300)

    test_data = generate_data()[["s2id", "unique_drivers", "event_timestamp"]]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=[{
                "s2id": s2_id
            } for s2_id in test_data["s2id"].tolist()],
            feature_names=["drivers_stream:unique_drivers"],
        )
    finally:
        if job:
            job.cancel()
        else:
            feast_client.delete_feature_table(feature_table.name)

    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers_stream:unique_drivers"]],
        test_data[[
            "s2id", "unique_drivers"
        ]].rename(columns={"unique_drivers": "drivers_stream:unique_drivers"}),
    )
Example #13
def test_streaming_ingestion(feast_client: Client, local_staging_path: str,
                             kafka_server):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="drivers_stream",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "batch-storage"),
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120)

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120)

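    # Publish Avro records to Kafka and poll the online store until the features show up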
    try:
        original = generate_data()[[
            "s2id", "unique_drivers", "event_timestamp"
        ]]
        for record in original.to_dict("records"):
            record["event_timestamp"] = (
                record["event_timestamp"].to_pydatetime().replace(
                    tzinfo=pytz.utc))

            send_avro_record_to_kafka(
                topic_name,
                record,
                bootstrap_servers=kafka_broker,
                avro_schema_json=avro_schema(),
            )

        def get_online_features():
            features = feast_client.get_online_features(
                ["drivers_stream:unique_drivers"],
                entity_rows=[{
                    "s2id": s2_id
                } for s2_id in original["s2id"].tolist()],
            ).to_dict()
            df = pd.DataFrame.from_dict(features)
            return df, not df["drivers_stream:unique_drivers"].isna().any()

        ingested = wait_retry_backoff(get_online_features, 60)
    finally:
        job.cancel()

    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers_stream:unique_drivers"]],
        original[[
            "s2id", "unique_drivers"
        ]].rename(columns={"unique_drivers": "drivers_stream:unique_drivers"}),
    )