Example #1
def prep_local_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        df = create_dataset()
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            file_url=f"file://{f.name}",
            event_timestamp_column="ts",
            created_timestamp_column="created_ts",
            date_partition_column="",
            field_mapping={
                "ts_1": "ts",
                "id": "driver_id"
            },
        )
        fv = get_feature_view(file_source)
        with tempfile.TemporaryDirectory(
        ) as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name:
            config = RepoConfig(
                registry=str(Path(repo_dir_name) / "registry.db"),
                project=
                f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
                provider="local",
                online_store=OnlineStoreConfig(local=LocalOnlineStoreConfig(
                    path=str(Path(data_dir_name) / "online_store.db"))),
            )
            fs = FeatureStore(config=config)
            fs.apply([fv])

            yield fs, fv
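A minimal consumption sketch (assumed pytest wiring, not part of the original snippet): because prep_local_fs_and_fv yields, it can be exposed as a fixture so each test gets a freshly applied local registry and online store.

import pytest

@pytest.fixture
def local_fs_and_fv():
    # Delegate to the generator above; its temporary directories are cleaned up
    # when the generator resumes after the test finishes.
    yield from prep_local_fs_and_fv()

def test_feature_view_is_registered(local_fs_and_fv):
    fs, fv = local_fs_and_fv
    assert fv.name in {view.name for view in fs.list_feature_views()}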
Example #2
def stage_dataframe(df, event_timestamp_column: str, config: Config) -> FileSource:
    """
    Helper function to upload a pandas dataframe in parquet format to a temporary location (under
    SPARK_STAGING_LOCATION) and return it wrapped in a FileSource.

    Args:
        df: the pandas dataframe to upload.
        event_timestamp_column (str): the name of the timestamp column in the dataframe.
        config (Config): feast config.
    """
    staging_location = config.get(opt.SPARK_STAGING_LOCATION)
    staging_uri = urlparse(staging_location)

    with tempfile.NamedTemporaryFile() as f:
        df.to_parquet(f)

        file_url = urlunparse(
            get_staging_client(staging_uri.scheme, config).upload_fileobj(
                f,
                f.name,
                remote_path_prefix=os.path.join(staging_location, "dataframes"),
                remote_path_suffix=".parquet",
            )
        )

    return FileSource(
        event_timestamp_column=event_timestamp_column,
        file_format=ParquetFormat(),
        file_url=file_url,
    )
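A hypothetical call site (the staging bucket and credentials are assumptions, not shown in the original code): stage a small dataframe and hand back the resulting FileSource.

import pandas as pd

def stage_example(config) -> FileSource:
    # `config` is assumed to be the same feast Config type used by
    # stage_dataframe above, with SPARK_STAGING_LOCATION pointing at a
    # writable bucket.
    df = pd.DataFrame({
        "driver_id": [1001, 1002],
        "ts": pd.to_datetime(["2021-09-01", "2021-09-02"]),
    })
    # The parquet file lands under <SPARK_STAGING_LOCATION>/dataframes/.
    return stage_dataframe(df, event_timestamp_column="ts", config=config)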
Example #3
def prep_dynamodb_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        df = create_dataset()
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            file_url=f"file://{f.name}",
            event_timestamp_column="ts",
            created_timestamp_column="created_ts",
            date_partition_column="",
            field_mapping={"ts_1": "ts", "id": "driver_id"},
        )
        fv = get_feature_view(file_source)
        e = Entity(
            name="driver",
            description="id for driver",
            join_key="driver_id",
            value_type=ValueType.INT32,
        )
        with tempfile.TemporaryDirectory() as repo_dir_name:
            config = RepoConfig(
                registry=str(Path(repo_dir_name) / "registry.db"),
                project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}",
                provider="aws",
                online_store=DynamoDBOnlineStoreConfig(region="us-west-2"),
                offline_store=FileOfflineStoreConfig(),
            )
            fs = FeatureStore(config=config)
            fs.apply([fv, e])

            yield fs, fv
Example #4
File: file.py Project: qooba/feast
    def create_data_source(
        self,
        df: pd.DataFrame,
        destination_name: Optional[str] = None,
        suffix: Optional[str] = None,
        event_timestamp_column="ts",
        created_timestamp_column="created_ts",
        field_mapping: Dict[str, str] = None,
    ) -> DataSource:
        filename = f"{destination_name}.parquet"
        port = self.minio.get_exposed_port("9000")
        host = self.minio.get_container_host_ip()
        minio_endpoint = f"{host}:{port}"

        self._upload_parquet_file(df, filename, minio_endpoint)

        return FileSource(
            file_format=ParquetFormat(),
            path=f"s3://{self.bucket}/{filename}",
            event_timestamp_column=event_timestamp_column,
            created_timestamp_column=created_timestamp_column,
            date_partition_column="",
            field_mapping=field_mapping or {"ts_1": "ts"},
            s3_endpoint_override=f"http://{host}:{port}",
        )
Example #5
def _ingest_test_getfeaturetable_mocked_resp(file_url: str,
                                             date_partition_col: str = ""):
    return GetFeatureTableResponse(table=FeatureTableProto(
        spec=FeatureTableSpecProto(
            name="ingest_featuretable",
            max_age=Duration(seconds=3600),
            features=[
                FeatureSpecProto(
                    name="dev_feature_float",
                    value_type=ValueProto.ValueType.FLOAT,
                ),
                FeatureSpecProto(
                    name="dev_feature_string",
                    value_type=ValueProto.ValueType.STRING,
                ),
            ],
            entities=["dev_entity"],
            batch_source=DataSourceProto(
                file_options=DataSourceProto.FileOptions(
                    file_format=ParquetFormat().to_proto(), file_url=file_url),
                event_timestamp_column="datetime",
                created_timestamp_column="timestamp",
                date_partition_column=date_partition_col,
            ),
        ),
        meta=FeatureTableMetaProto(),
    ))
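A minimal sketch of how such a canned response might be wired into a test (the stub here is a plain MagicMock standing in for the Core service, not Feast's real client):

from unittest.mock import MagicMock

def test_mocked_feature_table_spec():
    core_stub = MagicMock()
    core_stub.GetFeatureTable.return_value = _ingest_test_getfeaturetable_mocked_resp(
        file_url="file:///tmp/ingest_featuretable.parquet"
    )
    spec = core_stub.GetFeatureTable("ignored-request").table.spec
    assert spec.name == "ingest_featuretable"
    assert spec.batch_source.event_timestamp_column == "datetime"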
Example #6
def alltypes_featuretable():
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )
    return FeatureTable(
        name="alltypes",
        entities=["alltypes_id"],
        features=[
            Feature(name="float_feature", dtype=ValueType.FLOAT),
            Feature(name="int64_feature", dtype=ValueType.INT64),
            Feature(name="int32_feature", dtype=ValueType.INT32),
            Feature(name="string_feature", dtype=ValueType.STRING),
            Feature(name="bytes_feature", dtype=ValueType.BYTES),
            Feature(name="bool_feature", dtype=ValueType.BOOL),
            Feature(name="double_feature", dtype=ValueType.DOUBLE),
            Feature(name="double_list_feature", dtype=ValueType.DOUBLE_LIST),
            Feature(name="float_list_feature", dtype=ValueType.FLOAT_LIST),
            Feature(name="int64_list_feature", dtype=ValueType.INT64_LIST),
            Feature(name="int32_list_feature", dtype=ValueType.INT32_LIST),
            Feature(name="string_list_feature", dtype=ValueType.STRING_LIST),
            Feature(name="bytes_list_feature", dtype=ValueType.BYTES_LIST),
            Feature(name="bool_list_feature", dtype=ValueType.BOOL_LIST),
        ],
        max_age=Duration(seconds=3600),
        batch_source=batch_source,
        labels={"cat": "alltypes"},
    )
Example #7
def stage_entities_to_fs(entity_source: pd.DataFrame, staging_location: str,
                         config: Config) -> FileSource:
    """
    Dumps the given (entities) dataframe as a parquet file and stages it in remote file storage (a subdirectory of staging_location).

    :return: FileSource with remote destination path
    """
    entity_staging_uri = urlparse(
        os.path.join(staging_location, str(uuid.uuid4())))
    staging_client = get_staging_client(entity_staging_uri.scheme, config)
    with tempfile.NamedTemporaryFile() as df_export_path:
        # prevent casting ns -> ms exception inside pyarrow
        entity_source["event_timestamp"] = entity_source[
            "event_timestamp"].dt.floor("ms")

        entity_source.to_parquet(df_export_path.name)

        with open(df_export_path.name, "rb") as f:
            staging_client.upload_fileobj(f,
                                          df_export_path.name,
                                          remote_uri=entity_staging_uri)

    # ToDo: support custom event_timestamp_column
    return FileSource(
        event_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=entity_staging_uri.geturl(),
    )
Example #8
    def create_data_source(
        self,
        df: pd.DataFrame,
        destination_name: str,
        timestamp_field="ts",
        created_timestamp_column="created_ts",
        field_mapping: Dict[str, str] = None,
    ) -> DataSource:

        destination_name = self.get_prefixed_table_name(destination_name)

        f = tempfile.NamedTemporaryFile(
            prefix=f"{self.project_name}_{destination_name}",
            suffix=".parquet",
            delete=False,
        )
        df.to_parquet(f.name)
        self.files.append(f)
        return FileSource(
            file_format=ParquetFormat(),
            path=f"{f.name}",
            timestamp_field=timestamp_field,
            created_timestamp_column=created_timestamp_column,
            field_mapping=field_mapping or {"ts_1": "ts"},
        )
Example #9
def test_apply_feature_view_integration(test_feature_store):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature View
    test_feature_store.apply([fv1])

    feature_views = test_feature_store.list_feature_views()

    # List Feature Views
    assert (len(feature_views) == 1
            and feature_views[0].name == "my_feature_view_1"
            and feature_views[0].features[0].name == "fs1_my_feature_1"
            and feature_views[0].features[0].dtype == ValueType.INT64
            and feature_views[0].features[1].name == "fs1_my_feature_2"
            and feature_views[0].features[1].dtype == ValueType.STRING
            and feature_views[0].features[2].name == "fs1_my_feature_3"
            and feature_views[0].features[2].dtype == ValueType.STRING_LIST
            and feature_views[0].features[3].name == "fs1_my_feature_4"
            and feature_views[0].features[3].dtype == ValueType.BYTES_LIST
            and feature_views[0].entities[0] == "fs1_my_entity_1")

    feature_view = test_feature_store.get_feature_view("my_feature_view_1")
    assert (feature_view.name == "my_feature_view_1"
            and feature_view.features[0].name == "fs1_my_feature_1"
            and feature_view.features[0].dtype == ValueType.INT64
            and feature_view.features[1].name == "fs1_my_feature_2"
            and feature_view.features[1].dtype == ValueType.STRING
            and feature_view.features[2].name == "fs1_my_feature_3"
            and feature_view.features[2].dtype == ValueType.STRING_LIST
            and feature_view.features[3].name == "fs1_my_feature_4"
            and feature_view.features[3].dtype == ValueType.BYTES_LIST
            and feature_view.entities[0] == "fs1_my_entity_1")

    test_feature_store.delete_feature_view("my_feature_view_1")
    feature_views = test_feature_store.list_feature_views()
    assert len(feature_views) == 0

    test_feature_store.teardown()
Example #10
def test_apply_feature_view_integration(test_feature_store):
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
        date_partition_column="date_partition_col",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        schema=[
            Field(name="fs1_my_feature_1", dtype=Int64),
            Field(name="fs1_my_feature_2", dtype=String),
            Field(name="fs1_my_feature_3", dtype=Array(String)),
            Field(name="fs1_my_feature_4", dtype=Array(Bytes)),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature View
    test_feature_store.apply([fv1])

    feature_views = test_feature_store.list_feature_views()

    # List Feature Views
    assert (len(feature_views) == 1
            and feature_views[0].name == "my_feature_view_1"
            and feature_views[0].features[0].name == "fs1_my_feature_1"
            and feature_views[0].features[0].dtype == Int64
            and feature_views[0].features[1].name == "fs1_my_feature_2"
            and feature_views[0].features[1].dtype == String
            and feature_views[0].features[2].name == "fs1_my_feature_3"
            and feature_views[0].features[2].dtype == Array(String)
            and feature_views[0].features[3].name == "fs1_my_feature_4"
            and feature_views[0].features[3].dtype == Array(Bytes)
            and feature_views[0].entities[0] == "fs1_my_entity_1")

    feature_view = test_feature_store.get_feature_view("my_feature_view_1")
    assert (feature_view.name == "my_feature_view_1"
            and feature_view.features[0].name == "fs1_my_feature_1"
            and feature_view.features[0].dtype == Int64
            and feature_view.features[1].name == "fs1_my_feature_2"
            and feature_view.features[1].dtype == String
            and feature_view.features[2].name == "fs1_my_feature_3"
            and feature_view.features[2].dtype == Array(String)
            and feature_view.features[3].name == "fs1_my_feature_4"
            and feature_view.features[3].dtype == Array(Bytes)
            and feature_view.entities[0] == "fs1_my_entity_1")

    test_feature_store.delete_feature_view("my_feature_view_1")
    feature_views = test_feature_store.list_feature_views()
    assert len(feature_views) == 0

    test_feature_store.teardown()
Example #11
def test_apply_object_and_read(test_feature_store):
    assert isinstance(test_feature_store, FeatureStore)
    # Create Feature Views
    batch_source = FileSource(
        file_format=ParquetFormat(),
        path="file://feast/*",
        event_timestamp_column="ts_col",
        created_timestamp_column="timestamp",
    )

    e1 = Entity(name="fs1_my_entity_1",
                value_type=ValueType.STRING,
                description="something")

    e2 = Entity(name="fs1_my_entity_2",
                value_type=ValueType.STRING,
                description="something")

    fv1 = FeatureView(
        name="my_feature_view_1",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    fv2 = FeatureView(
        name="my_feature_view_2",
        features=[
            Feature(name="fs1_my_feature_1", dtype=ValueType.INT64),
            Feature(name="fs1_my_feature_2", dtype=ValueType.STRING),
            Feature(name="fs1_my_feature_3", dtype=ValueType.STRING_LIST),
            Feature(name="fs1_my_feature_4", dtype=ValueType.BYTES_LIST),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    # Register Feature View
    test_feature_store.apply([fv1, e1, fv2, e2])

    fv1_actual = test_feature_store.get_feature_view("my_feature_view_1")
    e1_actual = test_feature_store.get_entity("fs1_my_entity_1")

    assert fv1 == fv1_actual
    assert e1 == e1_actual
    assert fv2 != fv1_actual
    assert e2 != e1_actual

    test_feature_store.teardown()
Example #12
def transactions_feature_table(spark, client):
    schema = StructType([
        StructField("customer_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("created_timestamp", TimestampType()),
        StructField("total_transactions", DoubleType()),
        StructField("is_vip", BooleanType()),
    ])
    df_data = [
        (
            1001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            50.0,
            True,
        ),
        (
            1001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=2),
            100.0,
            True,
        ),
        (
            2001,
            datetime(year=2020, month=9, day=1),
            datetime(year=2020, month=9, day=1),
            400.0,
            False,
        ),
        (
            1001,
            datetime(year=2020, month=9, day=2),
            datetime(year=2020, month=9, day=1),
            200.0,
            False,
        ),
        (
            1001,
            datetime(year=2020, month=9, day=4),
            datetime(year=2020, month=9, day=1),
            300.0,
            False,
        ),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "transactions",
                                                  schema, df_data)
    file_source = FileSource("event_timestamp", "created_timestamp",
                             ParquetFormat(), file_uri)
    features = [
        Feature("total_transactions", ValueType.DOUBLE),
        Feature("is_vip", ValueType.BOOL),
    ]
    feature_table = FeatureTable("transactions", ["customer_id"],
                                 features,
                                 batch_source=file_source)
    yield client.apply_feature_table(feature_table)
    shutil.rmtree(temp_dir)
Example #13
def test_apply_data_source(test_registry: Registry):
    # Create Feature Views
    batch_source = FileSource(
        name="test_source",
        file_format=ParquetFormat(),
        path="file://feast/*",
        timestamp_field="ts_col",
        created_timestamp_column="timestamp",
    )

    fv1 = FeatureView(
        name="my_feature_view_1",
        schema=[
            Field(name="fs1_my_feature_1", dtype=Int64),
            Field(name="fs1_my_feature_2", dtype=String),
            Field(name="fs1_my_feature_3", dtype=Array(String)),
            Field(name="fs1_my_feature_4", dtype=Array(Bytes)),
        ],
        entities=["fs1_my_entity_1"],
        tags={"team": "matchmaking"},
        batch_source=batch_source,
        ttl=timedelta(minutes=5),
    )

    project = "project"

    # Register data source and feature view
    test_registry.apply_data_source(batch_source, project, commit=False)
    test_registry.apply_feature_view(fv1, project, commit=True)

    registry_feature_views = test_registry.list_feature_views(project)
    registry_data_sources = test_registry.list_data_sources(project)
    assert len(registry_feature_views) == 1
    assert len(registry_data_sources) == 1
    registry_feature_view = registry_feature_views[0]
    assert registry_feature_view.batch_source == batch_source
    registry_data_source = registry_data_sources[0]
    assert registry_data_source == batch_source

    # Check that change to batch source propagates
    batch_source.timestamp_field = "new_ts_col"
    test_registry.apply_data_source(batch_source, project, commit=False)
    test_registry.apply_feature_view(fv1, project, commit=True)
    registry_feature_views = test_registry.list_feature_views(project)
    registry_data_sources = test_registry.list_data_sources(project)
    assert len(registry_feature_views) == 1
    assert len(registry_data_sources) == 1
    registry_feature_view = registry_feature_views[0]
    assert registry_feature_view.batch_source == batch_source
    registry_batch_source = test_registry.list_data_sources(project)[0]
    assert registry_batch_source == batch_source

    test_registry.teardown()

    # Will try to reload registry, which will fail because the file has been deleted
    with pytest.raises(FileNotFoundError):
        test_registry._get_registry_proto()
Example #14
    def create_saved_dataset_destination(self) -> SavedDatasetFileStorage:
        port = self.minio.get_exposed_port("9000")
        host = self.minio.get_container_host_ip()

        return SavedDatasetFileStorage(
            path=f"s3://{self.bucket}/persisted/{str(uuid.uuid4())}",
            file_format=ParquetFormat(),
            s3_endpoint_override=f"http://{host}:{port}",
        )
Example #15
def prep_file_source(df, event_timestamp_column=None) -> FileSource:
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            file_url=f.name,
            event_timestamp_column=event_timestamp_column,
        )
        yield file_source
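A minimal usage sketch (wrapping the generator in contextlib.contextmanager is an assumption about how it is consumed, not shown above): the temporary parquet file is created inside the with-block and the yielded FileSource points at it.

from contextlib import contextmanager
import pandas as pd

def example_usage():
    df = pd.DataFrame({
        "driver_id": [1001],
        "ts": [pd.Timestamp("2021-09-01")],
    })
    with contextmanager(prep_file_source)(df, event_timestamp_column="ts") as source:
        # Build a FeatureView or run retrieval against `source` here.
        assert isinstance(source, FileSource)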
Example #16
    def test_apply_feature_table_success(self, test_client):

        test_client.set_project("project1")

        # Create Feature Tables
        batch_source = FileSource(
            file_format=ParquetFormat(),
            file_url="file://feast/*",
            event_timestamp_column="ts_col",
            created_timestamp_column="timestamp",
            date_partition_column="date_partition_col",
        )

        stream_source = KafkaSource(
            bootstrap_servers="localhost:9094",
            message_format=ProtoFormat("class.path"),
            topic="test_topic",
            event_timestamp_column="ts_col",
            created_timestamp_column="timestamp",
        )

        ft1 = FeatureTable(
            name="my-feature-table-1",
            features=[
                Feature(name="fs1-my-feature-1", dtype=ValueType.INT64),
                Feature(name="fs1-my-feature-2", dtype=ValueType.STRING),
                Feature(name="fs1-my-feature-3", dtype=ValueType.STRING_LIST),
                Feature(name="fs1-my-feature-4", dtype=ValueType.BYTES_LIST),
            ],
            entities=["fs1-my-entity-1"],
            labels={"team": "matchmaking"},
            batch_source=batch_source,
            stream_source=stream_source,
        )

        # Register Feature Table with Core
        test_client.apply_feature_table(ft1)

        feature_tables = test_client.list_feature_tables()

        # List Feature Tables
        assert (len(feature_tables) == 1
                and feature_tables[0].name == "my-feature-table-1"
                and feature_tables[0].features[0].name == "fs1-my-feature-1"
                and feature_tables[0].features[0].dtype == ValueType.INT64
                and feature_tables[0].features[1].name == "fs1-my-feature-2"
                and feature_tables[0].features[1].dtype == ValueType.STRING
                and feature_tables[0].features[2].name == "fs1-my-feature-3"
                and feature_tables[0].features[2].dtype
                == ValueType.STRING_LIST
                and feature_tables[0].features[3].name == "fs1-my-feature-4"
                and feature_tables[0].features[3].dtype == ValueType.BYTES_LIST
                and feature_tables[0].entities[0] == "fs1-my-entity-1")
Example #17
 def batch_source(self):
     return FileSource(
         field_mapping={
             "ride_distance": "ride_distance",
             "ride_duration": "ride_duration",
         },
         file_format=ParquetFormat(),
         file_url="file://feast/*",
         event_timestamp_column="ts_col",
         created_timestamp_column="timestamp",
         date_partition_column="date_partition_col",
     )
Example #18
def test_offline_ingestion(feast_client: Client, staging_path: str):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            "event_timestamp",
            "event_timestamp",
            ParquetFormat(),
            os.path.join(staging_path, "batch-storage"),
        ),
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    original = generate_data()
    feast_client.ingest(feature_table,
                        original)  # write to batch (offline) storage

    job = feast_client.start_offline_to_online_ingestion(
        feature_table, datetime.today(),
        datetime.today() + timedelta(days=1))

    status = wait_retry_backoff(
        lambda:
        (job.get_status(), job.get_status() != SparkJobStatus.IN_PROGRESS),
        300)

    assert status == SparkJobStatus.COMPLETED

    features = feast_client.get_online_features(
        ["drivers:unique_drivers"],
        entity_rows=[{
            "s2id": s2_id
        } for s2_id in original["s2id"].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers:unique_drivers"]],
        original[[
            "s2id", "unique_drivers"
        ]].rename(columns={"unique_drivers": "drivers:unique_drivers"}),
    )
Example #19
def test_schedule_batch_ingestion_jobs(pytestconfig, feast_client: Client,
                                       feast_spark_client: SparkClient):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="gs://example/feast/*",
        event_timestamp_column="datetime_col",
        created_timestamp_column="timestamp",
        date_partition_column="datetime",
    )
    feature_table = FeatureTable(
        name=f"schedule_{str(uuid.uuid4())}".replace("-", "_"),
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )
    feast_client.apply(entity)
    feast_client.apply(feature_table)

    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, "0 0 * * *")
    config.load_incluster_config()
    k8s_api = client.CustomObjectsApi()

    def get_scheduled_spark_application():
        job_hash = hashlib.md5(f"{feast_client.project}-{feature_table.name}".
                               encode()).hexdigest()
        resource_name = f"feast-{job_hash}"

        return k8s_api.get_namespaced_custom_object(
            group="sparkoperator.k8s.io",
            version="v1beta2",
            namespace=pytestconfig.getoption("k8s_namespace"),
            plural="scheduledsparkapplications",
            name=resource_name,
        )

    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == "0 0 * * *"
    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, "1 0 * * *")
    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == "1 0 * * *"

    feast_spark_client.unschedule_offline_to_online_ingestion(feature_table)
Example #20
 def stage_dataframe(self, df: pandas.DataFrame,
                     event_timestamp: str) -> FileSource:
     with tempfile.NamedTemporaryFile() as f:
         df.to_parquet(f)
         file_url = _s3_upload(
             f,
             f.name,
             remote_path_prefix=os.path.join(self._staging_location,
                                             "dataframes"),
             remote_path_suffix=".parquet",
         )
     return FileSource(
         event_timestamp_column=event_timestamp,
         file_format=ParquetFormat(),
         file_url=file_url,
     )
Example #21
    def test_feature_table_import_export_yaml(self):

        batch_source = FileSource(
            field_mapping={
                "ride_distance": "ride_distance",
                "ride_duration": "ride_duration",
            },
            file_format=ParquetFormat(),
            file_url="file://feast/*",
            event_timestamp_column="ts_col",
            created_timestamp_column="timestamp",
            date_partition_column="date_partition_col",
        )

        stream_source = KafkaSource(
            field_mapping={
                "ride_distance": "ride_distance",
                "ride_duration": "ride_duration",
            },
            bootstrap_servers="localhost:9094",
            message_format=ProtoFormat(class_path="class.path"),
            topic="test_topic",
            event_timestamp_column="ts_col",
            created_timestamp_column="timestamp",
        )

        test_feature_table = FeatureTable(
            name="car_driver",
            features=[
                Feature(name="ride_distance", dtype=ValueType.FLOAT),
                Feature(name="ride_duration", dtype=ValueType.STRING),
            ],
            entities=["car_driver_entity"],
            labels={"team": "matchmaking"},
            batch_source=batch_source,
            stream_source=stream_source,
        )

        # Create a string YAML representation of the feature table
        string_yaml = test_feature_table.to_yaml()

        # Create a new feature table object from the YAML string
        actual_feature_table_from_string = FeatureTable.from_yaml(string_yaml)

        # Ensure equality is upheld to original feature table
        assert test_feature_table == actual_feature_table_from_string
Example #22
def bookings_feature_table_with_mapping(spark, client):
    schema = StructType([
        StructField("id", IntegerType()),
        StructField("datetime", TimestampType()),
        StructField("created_datetime", TimestampType()),
        StructField("total_completed_bookings", IntegerType()),
    ])
    df_data = [
        (
            8001,
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            datetime(year=2020, month=9, day=1, tzinfo=utc),
            100,
        ),
        (
            8001,
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            150,
        ),
        (
            8002,
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            datetime(year=2020, month=9, day=2, tzinfo=utc),
            200,
        ),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "bookings", schema,
                                                  df_data)

    file_source = FileSource(
        event_timestamp_column="datetime",
        created_timestamp_column="created_datetime",
        file_format=ParquetFormat(),
        file_url=file_uri,
        field_mapping={"id": "driver_id"},
    )
    features = [Feature("total_completed_bookings", ValueType.INT32)]
    max_age = Duration()
    max_age.FromSeconds(86400)
    feature_table = FeatureTable("bookings", ["driver_id"],
                                 features,
                                 batch_source=file_source,
                                 max_age=max_age)
    yield client.apply(feature_table)
    shutil.rmtree(temp_dir)
Example #23
 def create_data_source(
     self,
     destination: str,
     df: pd.DataFrame,
     event_timestamp_column="ts",
     created_timestamp_column="created_ts",
     field_mapping: Dict[str, str] = None,
 ) -> DataSource:
     self.f = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False)
     df.to_parquet(self.f.name)
     return FileSource(
         file_format=ParquetFormat(),
         path=f"file://{self.f.name}",
         event_timestamp_column=event_timestamp_column,
         created_timestamp_column=created_timestamp_column,
         date_partition_column="",
         field_mapping=field_mapping or {"ts_1": "ts"},
     )
Example #24
File: data.py Project: yutiansut/feast
def batch_source(local_staging_path: str, pytestconfig,
                 request: FixtureRequest):
    if pytestconfig.getoption("env") == "gcloud":
        bq_project = pytestconfig.getoption("bq_project")
        bq_dataset = request.getfixturevalue("bq_dataset")
        return BigQuerySource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created_timestamp",
            table_ref=
            f"{bq_project}:{bq_dataset}.source_{datetime.now():%Y%m%d%H%M%s}",
        )
    else:
        return FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="created_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "transactions"),
        )
Example #25
File: common.py Project: tianshizz/feast
def create_schema(kafka_broker, topic_name, feature_table_name):
    entity = Entity(name="key", description="Key", value_type=ValueType.INT64)
    feature_table = FeatureTable(
        name=feature_table_name,
        entities=["key"],
        features=[Feature("num", ValueType.INT64), Feature("set", ValueType.STRING)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url="/dev/null",
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )
    return entity, feature_table
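A hypothetical registration sketch (the broker, topic, and table names are placeholders) reusing the apply calls seen in the other examples:

def register_schema(client):
    # `client` is a feast Client; the Kafka broker and topic are placeholders.
    entity, feature_table = create_schema(
        kafka_broker="localhost:9094",
        topic_name="driver_stats_topic",
        feature_table_name="driver_stats",
    )
    client.apply_entity(entity)
    client.apply_feature_table(feature_table)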
Example #26
def prep_redis_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]:
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        df = create_dataset()
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            path=f"file://{f.name}",
            event_timestamp_column="ts",
            created_timestamp_column="created_ts",
            date_partition_column="",
            field_mapping={
                "ts_1": "ts",
                "id": "driver_id"
            },
        )
        fv = driver_feature_view(file_source)
        e = Entity(
            name="driver",
            description="id for driver",
            join_key="driver_id",
            value_type=ValueType.INT32,
        )
        project = f"test_redis_correctness_{str(uuid.uuid4()).replace('-', '')}"
        print(f"Using project: {project}")
        with tempfile.TemporaryDirectory() as repo_dir_name:
            config = RepoConfig(
                registry=str(Path(repo_dir_name) / "registry.db"),
                project=project,
                provider="local",
                online_store=RedisOnlineStoreConfig(
                    type="redis",
                    redis_type=RedisType.redis,
                    connection_string="localhost:6379,db=0",
                ),
            )
            fs = FeatureStore(config=config)
            fs.apply([fv, e])

            yield fs, fv

            fs.teardown()
Example #27
def test_historical_feature_retrieval_with_field_mappings_from_local_spark_session(
    spark,
    client,
    driver_entity,
    bookings_feature_table_with_mapping,
):
    schema = StructType([
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
    ])
    df_data = [
        (8001, datetime(year=2020, month=9, day=1, tzinfo=utc)),
        (8001, datetime(year=2020, month=9, day=2, tzinfo=utc)),
        (8002, datetime(year=2020, month=9, day=1, tzinfo=utc)),
    ]
    temp_dir, file_uri = create_temp_parquet_file(spark, "drivers", schema,
                                                  df_data)
    entity_source = FileSource(
        event_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=file_uri,
    )
    joined_df = client.get_historical_features_df(
        ["bookings:total_completed_bookings"],
        entity_source,
    )
    expected_joined_df_schema = StructType([
        StructField("driver_id", IntegerType()),
        StructField("event_timestamp", TimestampType()),
        StructField("bookings__total_completed_bookings", IntegerType()),
    ])
    expected_joined_df_data = [
        (8001, datetime(year=2020, month=9, day=1, tzinfo=utc), 100),
        (8001, datetime(year=2020, month=9, day=2, tzinfo=utc), 150),
        (8002, datetime(year=2020, month=9, day=1, tzinfo=utc), None),
    ]
    expected_joined_df = spark.createDataFrame(
        spark.sparkContext.parallelize(expected_joined_df_data),
        expected_joined_df_schema,
    )
    assert_dataframe_equal(joined_df, expected_joined_df)
    shutil.rmtree(temp_dir)
Example #28
def basic_featuretable():
    batch_source = FileSource(
        field_mapping={
            "dev_entity": "dev_entity_field",
            "dev_feature_float": "dev_feature_float_field",
            "dev_feature_string": "dev_feature_string_field",
        },
        file_format=ParquetFormat(),
        file_url="gs://example/feast/*",
        event_timestamp_column="datetime_col",
        created_timestamp_column="timestamp",
        date_partition_column="datetime",
    )
    stream_source = KafkaSource(
        field_mapping={
            "dev_entity": "dev_entity_field",
            "dev_feature_float": "dev_feature_float_field",
            "dev_feature_string": "dev_feature_string_field",
        },
        bootstrap_servers="localhost:9094",
        message_format=ProtoFormat(class_path="class.path"),
        topic="test_topic",
        event_timestamp_column="datetime_col",
        created_timestamp_column="timestamp",
    )
    return FeatureTable(
        name="basic_featuretable",
        entities=["driver_id", "customer_id"],
        features=[
            Feature(name="dev_feature_float", dtype=ValueType.FLOAT),
            Feature(name="dev_feature_string", dtype=ValueType.STRING),
        ],
        max_age=Duration(seconds=3600),
        batch_source=batch_source,
        stream_source=stream_source,
        labels={
            "key1": "val1",
            "key2": "val2"
        },
    )
Example #29
    def _create_ft(self, client: Client, features) -> None:
        entity = Entity(
            name="driver_car_id",
            description="Car driver id",
            value_type=ValueType.STRING,
            labels={"team": "matchmaking"},
        )

        # Register Entity with Core
        client.apply_entity(entity)

        # Create Feature Tables
        batch_source = FileSource(
            file_format=ParquetFormat(),
            file_url="file://feast/*",
            event_timestamp_column="ts_col",
            created_timestamp_column="timestamp",
            date_partition_column="date_partition_col",
        )

        stream_source = KafkaSource(
            bootstrap_servers="localhost:9094",
            message_format=ProtoFormat("class.path"),
            topic="test_topic",
            event_timestamp_column="ts_col",
            created_timestamp_column="timestamp",
        )

        ft1 = FeatureTable(
            name=self.table_name,
            features=features,
            entities=["driver_car_id"],
            labels={"team": "matchmaking"},
            batch_source=batch_source,
            stream_source=stream_source,
        )

        # Register Feature Table with Core
        client.apply_feature_table(ft1)
Example #30
def stage_entities_to_fs(entity_source: pd.DataFrame,
                         staging_location: str) -> FileSource:
    """
    Dumps the given (entities) dataframe as a parquet file and stages it in remote file storage (a subdirectory of staging_location).

    :return: FileSource with remote destination path
    """
    entity_staging_uri = urlparse(
        os.path.join(staging_location, str(uuid.uuid4())))
    staging_client = get_staging_client(entity_staging_uri.scheme)
    with tempfile.NamedTemporaryFile() as df_export_path:
        entity_source.to_parquet(df_export_path.name)
        bucket = (None if entity_staging_uri.scheme == "file" else
                  entity_staging_uri.netloc)
        staging_client.upload_file(df_export_path.name, bucket,
                                   entity_staging_uri.path.lstrip("/"))

    # ToDo: support custom event_timestamp_column
    return FileSource(
        event_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=entity_staging_uri.geturl(),
    )
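A hypothetical call (the bucket path is a placeholder): stage a small entity dataframe and use the returned FileSource as the entity source of a historical retrieval, as in Example #27.

import pandas as pd

def stage_driver_entities() -> FileSource:
    entities = pd.DataFrame({
        "driver_id": [8001, 8002],
        "event_timestamp": pd.to_datetime(["2020-09-01", "2020-09-02"]),
    })
    # "gs://my-bucket/staging" is a placeholder staging location for this sketch.
    return stage_entities_to_fs(entities, staging_location="gs://my-bucket/staging")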