def materialize_single_feature_view(
    self,
    config: RepoConfig,
    feature_view: FeatureView,
    start_date: datetime,
    end_date: datetime,
    registry: Registry,
    project: str,
    tqdm_builder: Callable[[int], tqdm],
) -> None:
    set_usage_attribute("provider", self.__class__.__name__)

    # Resolve the entities this feature view joins against.
    entities = []
    for entity_name in feature_view.entities:
        entities.append(registry.get_entity(entity_name, project))

    (
        join_key_columns,
        feature_name_columns,
        event_timestamp_column,
        created_timestamp_column,
    ) = _get_column_names(feature_view, entities)

    # Pull the latest feature values in [start_date, end_date] from the offline store.
    offline_job = self.offline_store.pull_latest_from_table_or_query(
        config=config,
        data_source=feature_view.batch_source,
        join_key_columns=join_key_columns,
        feature_name_columns=feature_name_columns,
        event_timestamp_column=event_timestamp_column,
        created_timestamp_column=created_timestamp_column,
        start_date=start_date,
        end_date=end_date,
    )

    table = offline_job.to_arrow()

    # Apply any source-to-view column renames.
    if feature_view.batch_source.field_mapping is not None:
        table = _run_field_mapping(table, feature_view.batch_source.field_mapping)

    join_keys = {entity.join_key: entity.value_type for entity in entities}

    # Convert and write in fixed-size batches, updating progress as each batch lands.
    with tqdm_builder(table.num_rows) as pbar:
        for batch in table.to_batches(DEFAULT_BATCH_SIZE):
            rows_to_write = _convert_arrow_to_proto(batch, feature_view, join_keys)
            self.online_write_batch(
                self.repo_config,
                feature_view,
                rows_to_write,
                lambda x: pbar.update(x),
            )
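For context, a minimal sketch of the tqdm_builder factory these methods expect. The signature Callable[[int], tqdm] implies a callable that takes the total row count and returns a progress bar; the name and the unit label here are assumptions, not part of the original code.

from tqdm import tqdm

def default_tqdm_builder(total: int) -> tqdm:
    # Given the number of rows to materialize, return a configured progress bar.
    return tqdm(total=total, unit="rows")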
def benchmark_writes():
    project_id = "test" + "".join(
        random.choice(string.ascii_lowercase + string.digits) for _ in range(10)
    )

    with tempfile.TemporaryDirectory() as temp_dir:
        store = FeatureStore(
            config=RepoConfig(
                registry=os.path.join(temp_dir, "registry.db"),
                project=project_id,
                provider="gcp",
            )
        )

        # This is just to set the data source to something; we're not reading
        # from the parquet source here.
        parquet_path = os.path.join(temp_dir, "data.parquet")

        driver = Entity(name="driver_id", value_type=ValueType.INT64)
        table = create_driver_hourly_stats_feature_view(
            create_driver_hourly_stats_source(parquet_path=parquet_path)
        )
        store.apply([table, driver])

        provider = store._get_provider()

        end_date = datetime.utcnow()
        start_date = end_date - timedelta(days=14)
        customers = list(range(100))
        data = create_driver_hourly_stats_df(customers, start_date, end_date)

        # Show the data for reference
        print(data)
        proto_data = _convert_arrow_to_proto(
            pa.Table.from_pandas(data), table, ["driver_id"]
        )

        # Write it
        with tqdm(total=len(proto_data)) as progress:
            provider.online_write_batch(
                project=store.project,
                table=table,
                data=proto_data,
                progress=progress.update,
            )

        registry_tables = store.list_feature_views()
        registry_entities = store.list_entities()
        provider.teardown_infra(
            store.project, tables=registry_tables, entities=registry_entities
        )
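A minimal sketch of how one might run and time this benchmark; the timing wrapper is an assumption and not part of the original snippet.

import time

if __name__ == "__main__":
    start = time.perf_counter()
    benchmark_writes()
    print(f"benchmark_writes took {time.perf_counter() - start:.2f}s")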
def ingest_df(
    self,
    feature_view: FeatureView,
    entities: List[Entity],
    df: pandas.DataFrame,
):
    table = pa.Table.from_pandas(df)

    if feature_view.batch_source.field_mapping is not None:
        table = _run_field_mapping(table, feature_view.batch_source.field_mapping)

    join_keys = [entity.join_key for entity in entities]
    rows_to_write = _convert_arrow_to_proto(table, feature_view, join_keys)

    self.online_write_batch(
        self.repo_config, feature_view, rows_to_write, progress=None
    )
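A hypothetical caller sketch for ingest_df; provider, feature_view, and driver are assumed to already exist (for example, as set up in benchmark_writes above), and the column names are illustrative rather than taken from the original.

import pandas as pd
from datetime import datetime, timezone

df = pd.DataFrame(
    {
        "driver_id": [1001, 1002],
        "conv_rate": [0.45, 0.72],
        "event_timestamp": [datetime.now(timezone.utc)] * 2,
        "created": [datetime.now(timezone.utc)] * 2,
    }
)
provider.ingest_df(feature_view, [driver], df)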
def materialize_single_feature_view(
    self,
    feature_view: FeatureView,
    start_date: datetime,
    end_date: datetime,
    registry: Registry,
    project: str,
    tqdm_builder: Callable[[int], tqdm],
) -> None:
    entities = []
    for entity_name in feature_view.entities:
        entities.append(registry.get_entity(entity_name, project))

    (
        join_key_columns,
        feature_name_columns,
        event_timestamp_column,
        created_timestamp_column,
    ) = _get_column_names(feature_view, entities)

    # Normalize the window bounds to timezone-aware datetimes.
    start_date = utils.make_tzaware(start_date)
    end_date = utils.make_tzaware(end_date)

    table = self.offline_store.pull_latest_from_table_or_query(
        data_source=feature_view.input,
        join_key_columns=join_key_columns,
        feature_name_columns=feature_name_columns,
        event_timestamp_column=event_timestamp_column,
        created_timestamp_column=created_timestamp_column,
        start_date=start_date,
        end_date=end_date,
    )

    if feature_view.input.field_mapping is not None:
        table = _run_field_mapping(table, feature_view.input.field_mapping)

    join_keys = [entity.join_key for entity in entities]
    rows_to_write = _convert_arrow_to_proto(table, feature_view, join_keys)

    with tqdm_builder(len(rows_to_write)) as pbar:
        self.online_write_batch(
            project, feature_view, rows_to_write, lambda x: pbar.update(x)
        )

    # Record the materialized interval on the view and persist it to the registry.
    feature_view.materialization_intervals.append((start_date, end_date))
    registry.apply_feature_view(feature_view, project)
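A minimal sketch of what utils.make_tzaware plausibly does, judging from its use here: attach UTC to naive datetimes so the offline-store query bounds are unambiguous. This is an assumption about the helper, not its actual source.

from datetime import datetime, timezone

def make_tzaware(t: datetime) -> datetime:
    # Attach UTC if the datetime is naive; leave aware datetimes untouched.
    return t.replace(tzinfo=timezone.utc) if t.tzinfo is None else t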
def materialize_single_feature_view(
    self,
    feature_view: FeatureView,
    start_date: datetime,
    end_date: datetime,
    registry: Registry,
    project: str,
) -> None:
    # This variant only materializes from BigQuery sources.
    assert isinstance(feature_view.input, BigQuerySource)

    entities = []
    for entity_name in feature_view.entities:
        entities.append(registry.get_entity(entity_name, project))

    (
        join_key_columns,
        feature_name_columns,
        event_timestamp_column,
        created_timestamp_column,
    ) = _get_column_names(feature_view, entities)

    start_date = utils.make_tzaware(start_date)
    end_date = utils.make_tzaware(end_date)

    # Pick the offline store implementation that matches the view's source.
    offline_store = get_offline_store_from_sources([feature_view.input])
    table = offline_store.pull_latest_from_table_or_query(
        data_source=feature_view.input,
        join_key_columns=join_key_columns,
        feature_name_columns=feature_name_columns,
        event_timestamp_column=event_timestamp_column,
        created_timestamp_column=created_timestamp_column,
        start_date=start_date,
        end_date=end_date,
    )

    if feature_view.input.field_mapping is not None:
        table = _run_field_mapping(table, feature_view.input.field_mapping)

    join_keys = [entity.join_key for entity in entities]
    rows_to_write = _convert_arrow_to_proto(table, feature_view, join_keys)

    self.online_write_batch(project, feature_view, rows_to_write, None)

    feature_view.materialization_intervals.append((start_date, end_date))
    registry.apply_feature_view(feature_view, project)
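Several of these snippets rename columns via _run_field_mapping before converting to protos. A sketch of what that helper plausibly does with an Arrow table, under the assumption that field_mapping maps source column names to feature-view names:

import pyarrow as pa

def run_field_mapping(table: pa.Table, field_mapping: dict) -> pa.Table:
    # Rename mapped columns; columns absent from the mapping keep their names.
    return table.rename_columns(
        [field_mapping.get(name, name) for name in table.column_names]
    )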
def materialize_single_feature_view(
    self,
    config: RepoConfig,
    feature_view: FeatureView,
    start_date: datetime,
    end_date: datetime,
    registry: Registry,
    project: str,
    tqdm_builder: Callable[[int], tqdm],
) -> None:
    entities = []
    for entity_name in feature_view.entities:
        entities.append(registry.get_entity(entity_name, project))

    (
        join_key_columns,
        feature_name_columns,
        event_timestamp_column,
        created_timestamp_column,
    ) = _get_column_names(feature_view, entities)

    offline_job = self.offline_store.pull_latest_from_table_or_query(
        config=config,
        data_source=feature_view.batch_source,
        join_key_columns=join_key_columns,
        feature_name_columns=feature_name_columns,
        event_timestamp_column=event_timestamp_column,
        created_timestamp_column=created_timestamp_column,
        start_date=start_date,
        end_date=end_date,
    )

    table = offline_job.to_arrow()

    if feature_view.batch_source.field_mapping is not None:
        table = _run_field_mapping(table, feature_view.batch_source.field_mapping)

    join_keys = [entity.join_key for entity in entities]
    rows_to_write = _convert_arrow_to_proto(table, feature_view, join_keys)

    with tqdm_builder(len(rows_to_write)) as pbar:
        self.online_write_batch(
            self.repo_config, feature_view, rows_to_write, lambda x: pbar.update(x)
        )
def ingest_df(
    self,
    feature_view: FeatureView,
    entities: List[Entity],
    df: pandas.DataFrame,
):
    set_usage_attribute("provider", self.__class__.__name__)

    table = pa.Table.from_pandas(df)

    if feature_view.batch_source.field_mapping is not None:
        table = _run_field_mapping(table, feature_view.batch_source.field_mapping)

    join_keys = {entity.join_key: entity.value_type for entity in entities}
    rows_to_write = _convert_arrow_to_proto(table, feature_view, join_keys)

    self.online_write_batch(
        self.repo_config, feature_view, rows_to_write, progress=None
    )