def _write_minibatch(
    client,
    project: str,
    table: Union[FeatureTable, FeatureView],
    data: Sequence[
        Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]]
    ],
    progress: Optional[Callable[[int], Any]],
):
    entities = []
    for entity_key, features, timestamp, created_ts in data:
        document_id = compute_datastore_entity_id(entity_key)

        key = client.key(
            "Project", project, "Table", table.name, "Row", document_id,
        )

        entity = datastore.Entity(
            key=key, exclude_from_indexes=("created_ts", "event_ts", "values")
        )

        entity.update(
            dict(
                key=entity_key.SerializeToString(),
                values={k: v.SerializeToString() for k, v in features.items()},
                event_ts=utils.make_tzaware(timestamp),
                created_ts=(
                    utils.make_tzaware(created_ts) if created_ts is not None else None
                ),
            )
        )
        entities.append(entity)

    with client.transaction():
        client.put_multi(entities)

    if progress:
        progress(len(entities))

def from_proto(cls, feature_view_proto: FeatureViewProto):
    """
    Creates a feature view from a protobuf representation of a feature view.

    Args:
        feature_view_proto: A protobuf representation of a feature view.

    Returns:
        A FeatureView object based on the feature view protobuf.
    """
    batch_source = DataSource.from_proto(feature_view_proto.spec.batch_source)
    stream_source = (
        DataSource.from_proto(feature_view_proto.spec.stream_source)
        if feature_view_proto.spec.HasField("stream_source")
        else None
    )
    feature_view = cls(
        name=feature_view_proto.spec.name,
        entities=[entity for entity in feature_view_proto.spec.entities],
        features=[
            Feature(
                name=feature.name,
                dtype=ValueType(feature.value_type),
                labels=dict(feature.labels),
            )
            for feature in feature_view_proto.spec.features
        ],
        tags=dict(feature_view_proto.spec.tags),
        online=feature_view_proto.spec.online,
        ttl=(
            None
            if feature_view_proto.spec.ttl.seconds == 0
            and feature_view_proto.spec.ttl.nanos == 0
            else feature_view_proto.spec.ttl
        ),
        batch_source=batch_source,
        stream_source=stream_source,
    )

    if feature_view_proto.meta.HasField("created_timestamp"):
        feature_view.created_timestamp = (
            feature_view_proto.meta.created_timestamp.ToDatetime()
        )
    if feature_view_proto.meta.HasField("last_updated_timestamp"):
        feature_view.last_updated_timestamp = (
            feature_view_proto.meta.last_updated_timestamp.ToDatetime()
        )

    for interval in feature_view_proto.meta.materialization_intervals:
        feature_view.materialization_intervals.append(
            (
                utils.make_tzaware(interval.start_time.ToDatetime()),
                utils.make_tzaware(interval.end_time.ToDatetime()),
            )
        )

    return feature_view

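# Hedged usage sketch (not from the original source): from_proto is assumed to be the
# inverse of FeatureView.to_proto(), so a registry round-trip can be exercised as below.
# The FeatureView constructor arguments are illustrative placeholders only.
#
# original = FeatureView(
#     name="driver_hourly_stats",
#     entities=["driver_id"],
#     ttl=timedelta(days=1),
#     batch_source=my_batch_source,  # hypothetical DataSource
# )
# restored = FeatureView.from_proto(original.to_proto())
# assert restored.name == original.name
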
def materialize(
    self,
    start_date: datetime,
    end_date: datetime,
    feature_views: Optional[List[str]] = None,
) -> None:
    """
    Materialize data from the offline store into the online store.

    This method loads feature data in the specified interval from either
    the specified feature views, or all feature views if none are specified,
    into the online store where it is available for online serving.

    Args:
        start_date (datetime): Start date for time range of data to materialize into the online store
        end_date (datetime): End date for time range of data to materialize into the online store
        feature_views (List[str]): Optional list of feature view names. If selected, will only run
            materialization for the specified feature views.

    Examples:
        Materialize all features into the online store over the interval
        from 3 hours ago to 10 minutes ago.

        >>> from datetime import datetime, timedelta
        >>> from feast.feature_store import FeatureStore
        >>>
        >>> fs = FeatureStore(config=RepoConfig(provider="gcp"))
        >>> fs.materialize(
        ...     start_date=datetime.utcnow() - timedelta(hours=3), end_date=datetime.utcnow() - timedelta(minutes=10)
        ... )
    """
    self._tele.log("materialize")

    if utils.make_tzaware(start_date) > utils.make_tzaware(end_date):
        raise ValueError(
            f"The given start_date {start_date} is greater than the given end_date {end_date}."
        )

    feature_views_to_materialize = []
    if feature_views is None:
        feature_views_to_materialize = self._registry.list_feature_views(
            self.config.project
        )
    else:
        for name in feature_views:
            feature_view = self._registry.get_feature_view(name, self.config.project)
            feature_views_to_materialize.append(feature_view)

    # TODO paging large loads
    for feature_view in feature_views_to_materialize:
        provider = self._get_provider()
        _print_materialization_log(start_date, end_date, feature_view)
        provider.materialize_single_feature_view(
            feature_view, start_date, end_date, self._registry, self.project
        )
        print(" done!")

def from_proto(cls, feature_view_proto: FeatureViewProto):
    """
    Creates a feature view from a protobuf representation of a feature view.

    Args:
        feature_view_proto: A protobuf representation of a feature view.

    Returns:
        A FeatureView object based on the feature view protobuf.
    """
    batch_source = DataSource.from_proto(feature_view_proto.spec.batch_source)
    stream_source = (
        DataSource.from_proto(feature_view_proto.spec.stream_source)
        if feature_view_proto.spec.HasField("stream_source")
        else None
    )
    feature_view = cls(
        name=feature_view_proto.spec.name,
        entities=[entity for entity in feature_view_proto.spec.entities],
        schema=[
            Field.from_proto(field_proto)
            for field_proto in feature_view_proto.spec.features
        ],
        description=feature_view_proto.spec.description,
        tags=dict(feature_view_proto.spec.tags),
        owner=feature_view_proto.spec.owner,
        online=feature_view_proto.spec.online,
        ttl=(
            timedelta(days=0)
            if feature_view_proto.spec.ttl.ToNanoseconds() == 0
            else feature_view_proto.spec.ttl.ToTimedelta()
        ),
        source=batch_source,
    )
    if stream_source:
        feature_view.stream_source = stream_source

    # FeatureViewProjections are not saved in the FeatureView proto.
    # Create the default projection.
    feature_view.projection = FeatureViewProjection.from_definition(feature_view)

    if feature_view_proto.meta.HasField("created_timestamp"):
        feature_view.created_timestamp = (
            feature_view_proto.meta.created_timestamp.ToDatetime()
        )
    if feature_view_proto.meta.HasField("last_updated_timestamp"):
        feature_view.last_updated_timestamp = (
            feature_view_proto.meta.last_updated_timestamp.ToDatetime()
        )

    for interval in feature_view_proto.meta.materialization_intervals:
        feature_view.materialization_intervals.append(
            (
                utils.make_tzaware(interval.start_time.ToDatetime()),
                utils.make_tzaware(interval.end_time.ToDatetime()),
            )
        )

    return feature_view

def online_write_batch(
    self,
    project: str,
    table: Union[FeatureTable, FeatureView],
    data: List[
        Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]]
    ],
    progress: Optional[Callable[[int], Any]],
) -> None:
    client = self._get_client()

    entity_hset = {}
    feature_view = table.name

    ex = Timestamp()
    ex.seconds = EX_SECONDS
    ex_str = ex.SerializeToString()

    for entity_key, values, timestamp, created_ts in data:
        redis_key_bin = _redis_key(project, entity_key)
        ts = Timestamp()
        ts.seconds = int(utils.make_tzaware(timestamp).timestamp())
        entity_hset[f"_ts:{feature_view}"] = ts.SerializeToString()
        entity_hset[f"_ex:{feature_view}"] = ex_str

        for feature_name, val in values.items():
            f_key = _mmh3(f"{feature_view}:{feature_name}")
            entity_hset[f_key] = val.SerializeToString()

        client.hset(redis_key_bin, mapping=entity_hset)
        if progress:
            progress(1)

def online_write_batch(
    self,
    config: RepoConfig,
    table: FeatureView,
    data: List[
        Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]]
    ],
    progress: Optional[Callable[[int], Any]],
) -> None:
    online_config = config.online_store
    assert isinstance(online_config, DynamoDBOnlineStoreConfig)
    dynamodb_resource = self._get_dynamodb_resource(online_config.region)

    table_instance = dynamodb_resource.Table(_get_table_name(config, table))
    with table_instance.batch_writer() as batch:
        for entity_key, features, timestamp, created_ts in data:
            entity_id = compute_entity_id(entity_key)
            batch.put_item(
                Item={
                    "entity_id": entity_id,  # PartitionKey
                    "event_ts": str(utils.make_tzaware(timestamp)),
                    "values": {
                        k: v.SerializeToString()
                        for k, v in features.items()  # Serialized Features
                    },
                }
            )
            if progress:
                progress(1)

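# Hedged read-back sketch (assumption, not part of the original source): each row written
# above is keyed by "entity_id" and stores a stringified event timestamp plus a map of
# serialized ValueProto bytes per feature name. Fetching one row back uses the standard
# boto3 Table.get_item call; table_name and entity_id are placeholders.
def _get_online_row_sketch(dynamodb_resource, table_name: str, entity_id: str):
    table = dynamodb_resource.Table(table_name)
    response = table.get_item(Key={"entity_id": entity_id})
    item = response.get("Item")
    if item is None:
        return None
    # "values" holds feature_name -> serialized ValueProto bytes
    return item["event_ts"], item["values"]
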
def materialize_single_feature_view(
    self,
    feature_view: FeatureView,
    start_date: datetime,
    end_date: datetime,
    registry: Registry,
    project: str,
    tqdm_builder: Callable[[int], tqdm],
) -> None:
    entities = []
    for entity_name in feature_view.entities:
        entities.append(registry.get_entity(entity_name, project))

    (
        join_key_columns,
        feature_name_columns,
        event_timestamp_column,
        created_timestamp_column,
    ) = _get_column_names(feature_view, entities)

    start_date = utils.make_tzaware(start_date)
    end_date = utils.make_tzaware(end_date)

    table = self.offline_store.pull_latest_from_table_or_query(
        data_source=feature_view.input,
        join_key_columns=join_key_columns,
        feature_name_columns=feature_name_columns,
        event_timestamp_column=event_timestamp_column,
        created_timestamp_column=created_timestamp_column,
        start_date=start_date,
        end_date=end_date,
    )

    if feature_view.input.field_mapping is not None:
        table = _run_field_mapping(table, feature_view.input.field_mapping)

    join_keys = [entity.join_key for entity in entities]
    rows_to_write = _convert_arrow_to_proto(table, feature_view, join_keys)

    with tqdm_builder(len(rows_to_write)) as pbar:
        self.online_write_batch(
            project, feature_view, rows_to_write, lambda x: pbar.update(x)
        )

    feature_view.materialization_intervals.append((start_date, end_date))
    registry.apply_feature_view(feature_view, project)

def materialize_single_feature_view(
    self,
    feature_view: FeatureView,
    start_date: datetime,
    end_date: datetime,
    registry: Registry,
    project: str,
) -> None:
    assert isinstance(feature_view.input, BigQuerySource)

    entities = []
    for entity_name in feature_view.entities:
        entities.append(registry.get_entity(entity_name, project))

    (
        join_key_columns,
        feature_name_columns,
        event_timestamp_column,
        created_timestamp_column,
    ) = _get_column_names(feature_view, entities)

    start_date = utils.make_tzaware(start_date)
    end_date = utils.make_tzaware(end_date)

    offline_store = get_offline_store_from_sources([feature_view.input])
    table = offline_store.pull_latest_from_table_or_query(
        data_source=feature_view.input,
        join_key_columns=join_key_columns,
        feature_name_columns=feature_name_columns,
        event_timestamp_column=event_timestamp_column,
        created_timestamp_column=created_timestamp_column,
        start_date=start_date,
        end_date=end_date,
    )

    if feature_view.input.field_mapping is not None:
        table = _run_field_mapping(table, feature_view.input.field_mapping)

    join_keys = [entity.join_key for entity in entities]
    rows_to_write = _convert_arrow_to_proto(table, feature_view, join_keys)

    self.online_write_batch(project, feature_view, rows_to_write, None)

    feature_view.materialization_intervals.append((start_date, end_date))
    registry.apply_feature_view(feature_view, project)

def materialize_command(ctx: click.Context, start_ts: str, end_ts: str, views: List[str]):
    """
    Run a (non-incremental) materialization job to ingest data into the online store.

    Feast will read all data between START_TS and END_TS from the offline store and write
    it to the online store. If you don't specify feature view names using --views, all
    registered Feature Views will be materialized.

    START_TS and END_TS should be in ISO 8601 format, e.g. '2021-07-16T19:20:01'
    """
    repo = ctx.obj["CHDIR"]
    cli_check_repo(repo)
    store = FeatureStore(repo_path=str(repo))
    store.materialize(
        feature_views=None if not views else views,
        start_date=utils.make_tzaware(datetime.fromisoformat(start_ts)),
        end_date=utils.make_tzaware(datetime.fromisoformat(end_ts)),
    )

def from_proto(cls, feature_view_proto: FeatureViewProto):
    """
    Creates a feature view from a protobuf representation of a feature view.

    Args:
        feature_view_proto: A protobuf representation of a feature view.

    Returns:
        A FeatureView object based on the feature view protobuf.
    """
    feature_view = cls(
        name=feature_view_proto.spec.name,
        entities=[entity for entity in feature_view_proto.spec.entities],
        features=[
            Feature(
                name=feature.name,
                dtype=ValueType(feature.value_type),
                labels=feature.labels,
            )
            for feature in feature_view_proto.spec.features
        ],
        tags=dict(feature_view_proto.spec.tags),
        online=feature_view_proto.spec.online,
        ttl=(
            None
            if feature_view_proto.spec.ttl.seconds == 0
            and feature_view_proto.spec.ttl.nanos == 0
            else feature_view_proto.spec.ttl
        ),
        input=DataSource.from_proto(feature_view_proto.spec.input),
    )

    feature_view.created_timestamp = feature_view_proto.meta.created_timestamp

    for interval in feature_view_proto.meta.materialization_intervals:
        feature_view.materialization_intervals.append(
            (
                utils.make_tzaware(interval.start_time.ToDatetime()),
                utils.make_tzaware(interval.end_time.ToDatetime()),
            )
        )

    return feature_view

def _insert_data_test_table(data, project, tbl_name, region):
    dynamodb_resource = boto3.resource("dynamodb", region_name=region)
    table_instance = dynamodb_resource.Table(f"{project}.{tbl_name}")
    for entity_key, features, timestamp, created_ts in data:
        entity_id = compute_entity_id(entity_key)
        with table_instance.batch_writer() as batch:
            batch.put_item(
                Item={
                    "entity_id": entity_id,
                    "event_ts": str(utils.make_tzaware(timestamp)),
                    "values": {k: v.SerializeToString() for k, v in features.items()},
                }
            )

def retrieve_saved_dataset(
    self, config: RepoConfig, dataset: SavedDataset
) -> RetrievalJob:
    set_usage_attribute("provider", self.__class__.__name__)

    feature_name_columns = [
        ref.replace(":", "__") if dataset.full_feature_names else ref.split(":")[1]
        for ref in dataset.features
    ]

    # ToDo: replace hardcoded value
    event_ts_column = "event_timestamp"

    return self.offline_store.pull_all_from_table_or_query(
        config=config,
        data_source=dataset.storage.to_data_source(),
        join_key_columns=dataset.join_keys,
        feature_name_columns=feature_name_columns,
        event_timestamp_column=event_ts_column,
        start_date=make_tzaware(dataset.min_event_timestamp),  # type: ignore
        end_date=make_tzaware(dataset.max_event_timestamp + timedelta(seconds=1)),  # type: ignore
    )

def _materialize_single_feature_view(
    self, feature_view: FeatureView, start_date: datetime, end_date: datetime
) -> None:
    (
        entity_names,
        feature_names,
        event_timestamp_column,
        created_timestamp_column,
    ) = _run_reverse_field_mapping(feature_view)

    start_date = utils.make_tzaware(start_date)
    end_date = utils.make_tzaware(end_date)

    provider = self._get_provider()
    table = provider.pull_latest_from_table_or_query(
        feature_view.input,
        entity_names,
        feature_names,
        event_timestamp_column,
        created_timestamp_column,
        start_date,
        end_date,
    )

    if feature_view.input.field_mapping is not None:
        table = _run_forward_field_mapping(table, feature_view.input.field_mapping)

    rows_to_write = _convert_arrow_to_proto(table, feature_view)

    provider = self._get_provider()
    provider.online_write_batch(self.config.project, feature_view, rows_to_write, None)

    feature_view.materialization_intervals.append((start_date, end_date))
    self.apply([feature_view])

def _write_batch_non_duplicates(
    self,
    table_instance,
    data: List[
        Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]]
    ],
    progress: Optional[Callable[[int], Any]],
):
    """Deduplicate write batch request items on ``entity_id`` primary key."""
    with table_instance.batch_writer(overwrite_by_pkeys=["entity_id"]) as batch:
        for entity_key, features, timestamp, created_ts in data:
            entity_id = compute_entity_id(entity_key)
            batch.put_item(
                Item={
                    "entity_id": entity_id,  # PartitionKey
                    "event_ts": str(utils.make_tzaware(timestamp)),
                    "values": {
                        k: v.SerializeToString()
                        for k, v in features.items()  # Serialized Features
                    },
                }
            )
            if progress:
                progress(1)

def _write_minibatch(
    client,
    project: str,
    table: Union[FeatureTable, FeatureView],
    data: Sequence[
        Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]]
    ],
    progress: Optional[Callable[[int], Any]],
):
    from google.api_core.exceptions import Conflict
    from google.cloud import datastore

    num_retries_on_conflict = 3
    row_count = 0
    for retry_number in range(num_retries_on_conflict):
        try:
            row_count = 0
            with client.transaction():
                for entity_key, features, timestamp, created_ts in data:
                    document_id = compute_datastore_entity_id(entity_key)

                    key = client.key(
                        "Project", project, "Table", table.name, "Row", document_id,
                    )

                    entity = client.get(key)
                    if entity is not None:
                        if entity["event_ts"] > utils.make_tzaware(timestamp):
                            # Do not overwrite feature values computed from fresher data
                            continue
                        elif (
                            entity["event_ts"] == utils.make_tzaware(timestamp)
                            and created_ts is not None
                            and entity["created_ts"] is not None
                            and entity["created_ts"] > utils.make_tzaware(created_ts)
                        ):
                            # Do not overwrite feature values computed from the same data, but
                            # computed later than this one
                            continue
                    else:
                        entity = datastore.Entity(key=key)

                    entity.update(
                        dict(
                            key=entity_key.SerializeToString(),
                            values={
                                k: v.SerializeToString() for k, v in features.items()
                            },
                            event_ts=utils.make_tzaware(timestamp),
                            created_ts=(
                                utils.make_tzaware(created_ts)
                                if created_ts is not None
                                else None
                            ),
                        )
                    )
                    client.put(entity)
                    row_count += 1

                    if progress:
                        progress(1)
            break  # make sure to break out of retry loop if all went well
        except Conflict:
            if retry_number == num_retries_on_conflict - 1:
                raise

def online_write_batch(
    self,
    config: RepoConfig,
    table: FeatureView,
    data: List[
        Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]]
    ],
    progress: Optional[Callable[[int], Any]],
) -> None:
    online_store_config = config.online_store
    assert isinstance(online_store_config, RedisOnlineStoreConfig)

    client = self._get_client(online_store_config)
    project = config.project
    feature_view = table.name
    ts_key = f"_ts:{feature_view}"
    keys = []
    # redis pipelining optimization: send multiple commands to redis server without waiting for every reply
    with client.pipeline() as pipe:
        # check if a previous record under the key bin exists
        # TODO: investigate if check and set is a better approach rather than pulling all entity ts and then setting
        # it may be significantly slower but avoids potential (rare) race conditions
        for entity_key, _, _, _ in data:
            redis_key_bin = _redis_key(project, entity_key)
            keys.append(redis_key_bin)
            pipe.hmget(redis_key_bin, ts_key)
        prev_event_timestamps = pipe.execute()
        # flattening the list of lists. `hmget` does the lookup assuming a list of keys in the key bin
        prev_event_timestamps = [i[0] for i in prev_event_timestamps]

        for redis_key_bin, prev_event_time, (_, values, timestamp, _) in zip(
            keys, prev_event_timestamps, data
        ):
            event_time_seconds = int(utils.make_tzaware(timestamp).timestamp())

            # ignore if event_timestamp is before the event features that are currently in the feature store
            if prev_event_time:
                prev_ts = Timestamp()
                prev_ts.ParseFromString(prev_event_time)
                if prev_ts.seconds and event_time_seconds <= prev_ts.seconds:
                    # TODO: somehow signal that it's not overwriting the current record?
                    if progress:
                        progress(1)
                    continue

            ts = Timestamp()
            ts.seconds = event_time_seconds
            entity_hset = dict()
            entity_hset[ts_key] = ts.SerializeToString()

            for feature_name, val in values.items():
                f_key = _mmh3(f"{feature_view}:{feature_name}")
                entity_hset[f_key] = val.SerializeToString()

            pipe.hset(redis_key_bin, mapping=entity_hset)
            # TODO: support expiring the entity / features in Redis
            # otherwise entity features remain in redis until cleaned up in separate process
            # client.expire redis_key_bin based a ttl setting
        results = pipe.execute()
        if progress:
            progress(len(results))

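# Hedged sketch (assumption, not Feast code): the freshness guard used in the loop above,
# isolated as a helper. It returns True when the incoming event time should overwrite the
# previously stored "_ts:<feature_view>" value, i.e. when there is no previous timestamp
# or the new event is strictly newer.
from google.protobuf.timestamp_pb2 import Timestamp


def _should_overwrite_sketch(prev_event_time: bytes, event_time_seconds: int) -> bool:
    if not prev_event_time:
        return True
    prev_ts = Timestamp()
    prev_ts.ParseFromString(prev_event_time)
    return not (prev_ts.seconds and event_time_seconds <= prev_ts.seconds)
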
def materialize(
    self,
    start_date: datetime,
    end_date: datetime,
    feature_views: Optional[List[str]] = None,
) -> None:
    """
    Materialize data from the offline store into the online store.

    This method loads feature data in the specified interval from either
    the specified feature views, or all feature views if none are specified,
    into the online store where it is available for online serving.

    Args:
        start_date (datetime): Start date for time range of data to materialize into the online store
        end_date (datetime): End date for time range of data to materialize into the online store
        feature_views (List[str]): Optional list of feature view names. If selected, will only run
            materialization for the specified feature views.

    Examples:
        Materialize all features into the online store over the interval
        from 3 hours ago to 10 minutes ago.

        >>> from feast import FeatureStore, RepoConfig
        >>> from datetime import datetime, timedelta
        >>> fs = FeatureStore(repo_path="feature_repo")
        >>> fs.materialize(
        ...     start_date=datetime.utcnow() - timedelta(hours=3), end_date=datetime.utcnow() - timedelta(minutes=10)
        ... )
        Materializing...
        <BLANKLINE>
        ...
    """
    if utils.make_tzaware(start_date) > utils.make_tzaware(end_date):
        raise ValueError(
            f"The given start_date {start_date} is greater than the given end_date {end_date}."
        )

    feature_views_to_materialize = []
    if feature_views is None:
        feature_views_to_materialize = self._registry.list_feature_views(self.project)
    else:
        for name in feature_views:
            feature_view = self._registry.get_feature_view(name, self.project)
            feature_views_to_materialize.append(feature_view)

    _print_materialization_log(
        start_date,
        end_date,
        len(feature_views_to_materialize),
        self.config.online_store.type,
    )
    # TODO paging large loads
    for feature_view in feature_views_to_materialize:
        provider = self._get_provider()
        print(f"{Style.BRIGHT + Fore.GREEN}{feature_view.name}{Style.RESET_ALL}:")

        def tqdm_builder(length):
            return tqdm(total=length, ncols=100)

        start_date = utils.make_tzaware(start_date)
        end_date = utils.make_tzaware(end_date)

        provider.materialize_single_feature_view(
            config=self.config,
            feature_view=feature_view,
            start_date=start_date,
            end_date=end_date,
            registry=self._registry,
            project=self.project,
            tqdm_builder=tqdm_builder,
        )

        self._registry.apply_materialization(
            feature_view, self.project, start_date, end_date
        )

def convert_timestamp_records_to_utc(
    records: List[Dict[str, Any]], column: str
) -> List[Dict[str, Any]]:
    for record in records:
        record[column] = utils.make_tzaware(record[column]).astimezone(utc)
    return records

def materialize_incremental(
    self, end_date: datetime, feature_views: Optional[List[str]] = None,
) -> None:
    """
    Materialize incremental new data from the offline store into the online store.

    This method loads incremental new feature data up to the specified end time from either
    the specified feature views, or all feature views if none are specified,
    into the online store where it is available for online serving. The start time of
    the interval materialized is either the most recent end time of a prior materialization or
    (now - ttl) if no such prior materialization exists.

    Args:
        end_date (datetime): End date for time range of data to materialize into the online store
        feature_views (List[str]): Optional list of feature view names. If selected, will only run
            materialization for the specified feature views.

    Examples:
        Materialize all features into the online store up to 5 minutes ago.

        >>> from datetime import datetime, timedelta
        >>> from feast.feature_store import FeatureStore
        >>>
        >>> fs = FeatureStore(config=RepoConfig(provider="gcp", registry="gs://my-fs/", project="my_fs_proj"))
        >>> fs.materialize_incremental(end_date=datetime.utcnow() - timedelta(minutes=5))
    """
    feature_views_to_materialize = []
    if feature_views is None:
        feature_views_to_materialize = self._registry.list_feature_views(self.project)
    else:
        for name in feature_views:
            feature_view = self._registry.get_feature_view(name, self.project)
            feature_views_to_materialize.append(feature_view)

    _print_materialization_log(
        None,
        end_date,
        len(feature_views_to_materialize),
        self.config.online_store.type,
    )
    # TODO paging large loads
    for feature_view in feature_views_to_materialize:
        start_date = feature_view.most_recent_end_time
        if start_date is None:
            if feature_view.ttl is None:
                raise Exception(
                    f"No start time found for feature view {feature_view.name}. materialize_incremental() requires"
                    f" either a ttl to be set or for materialize() to have been run at least once."
                )
            start_date = datetime.utcnow() - feature_view.ttl
        provider = self._get_provider()
        print(
            f"{Style.BRIGHT + Fore.GREEN}{feature_view.name}{Style.RESET_ALL}"
            f" from {Style.BRIGHT + Fore.GREEN}{start_date.replace(microsecond=0).astimezone()}{Style.RESET_ALL}"
            f" to {Style.BRIGHT + Fore.GREEN}{end_date.replace(microsecond=0).astimezone()}{Style.RESET_ALL}:"
        )

        def tqdm_builder(length):
            return tqdm(total=length, ncols=100)

        start_date = utils.make_tzaware(start_date)
        end_date = utils.make_tzaware(end_date)

        provider.materialize_single_feature_view(
            feature_view,
            start_date,
            end_date,
            self._registry,
            self.project,
            tqdm_builder,
        )

        self._registry.apply_materialization(
            feature_view, self.project, start_date, end_date
        )

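# Hedged sketch (assumption, not part of the original source): the incremental start-date
# rule applied per feature view above, shown in isolation. The window starts at the most
# recent prior materialization end time, falling back to (now - ttl) when no prior
# materialization exists; with neither available, materialization cannot proceed.
from datetime import datetime, timedelta
from typing import Optional


def _incremental_start_sketch(
    most_recent_end_time: Optional[datetime], ttl: Optional[timedelta]
) -> datetime:
    if most_recent_end_time is not None:
        return most_recent_end_time
    if ttl is None:
        raise ValueError("Either a ttl or a prior materialize() run is required.")
    return datetime.utcnow() - ttl
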
def convert_timestamp_records_to_utc(records, column):
    for record in records:
        record[column] = utils.make_tzaware(record[column]).astimezone(utc)
    return records

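# Hedged usage sketch (assumption, not from the original source): given a naive
# event_timestamp, convert_timestamp_records_to_utc first makes it timezone-aware via
# utils.make_tzaware (assumed to attach the local timezone) and then normalizes it to
# UTC, mutating the records in place.
#
# from datetime import datetime
# records = [{"driver_id": 1001, "event_timestamp": datetime(2021, 7, 16, 19, 20, 1)}]
# records = convert_timestamp_records_to_utc(records, column="event_timestamp")
# records[0]["event_timestamp"].tzinfo  # -> UTC
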