Example #1
    def online_write_batch(
        self,
        project: str,
        table: Union[FeatureTable, FeatureView],
        data: List[
            Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]]
        ],
        progress: Optional[Callable[[int], Any]],
    ) -> None:
        client = self._get_client()

        feature_view = table.name

        # Serialized expiry timestamp, shared by every row written in this batch.
        ex = Timestamp()
        ex.seconds = EX_SECONDS
        ex_str = ex.SerializeToString()

        for entity_key, values, timestamp, created_ts in data:
            # The Redis key is derived from the project name and the serialized entity key.
            redis_key_bin = _redis_key(project, entity_key)

            # Record the event timestamp and expiry as per-feature-view bookkeeping fields.
            ts = Timestamp()
            ts.seconds = int(utils.make_tzaware(timestamp).timestamp())
            entity_hset = {
                f"_ts:{feature_view}": ts.SerializeToString(),
                f"_ex:{feature_view}": ex_str,
            }

            # Hash each "<feature_view>:<feature_name>" pair into a compact field key
            # and store the serialized feature value under it.
            for feature_name, val in values.items():
                f_key = _mmh3(f"{feature_view}:{feature_name}")
                entity_hset[f_key] = val.SerializeToString()

            # One HSET per entity row; report progress after each row is written.
            client.hset(redis_key_bin, mapping=entity_hset)
            if progress:
                progress(1)
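
Both examples rely on module-level helpers such as _redis_key and _mmh3. The sketch below is a minimal illustration of what such helpers could look like, assuming the entity key is serialized via its protobuf encoding and field keys are derived from a 64-bit MurmurHash3; this is an assumption for illustration, not Feast's exact implementation, and the EntityKey proto import path can vary between Feast versions.

import struct

import mmh3  # MurmurHash3 bindings (pip install mmh3)

from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto


def _redis_key(project: str, entity_key: EntityKeyProto) -> bytes:
    # Illustrative assumption: concatenate the serialized entity key with the
    # project name so that different projects cannot collide on the same key.
    return entity_key.SerializeToString() + project.encode("utf-8")


def _mmh3(key: str) -> bytes:
    # Illustrative assumption: pack the first 64-bit MurmurHash3 value of
    # "<feature_view>:<feature_name>" into 8 bytes to keep hash field names short.
    return struct.pack("<Q", mmh3.hash64(key)[0] & 0xFFFFFFFFFFFFFFFF)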
Example #2
File: redis.py Project: Shopify/feast
    def online_write_batch(
        self,
        config: RepoConfig,
        table: FeatureView,
        data: List[
            Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]]
        ],
        progress: Optional[Callable[[int], Any]],
    ) -> None:
        online_store_config = config.online_store
        assert isinstance(online_store_config, RedisOnlineStoreConfig)

        client = self._get_client(online_store_config)
        project = config.project

        feature_view = table.name
        ts_key = f"_ts:{feature_view}"
        keys = []
        # Redis pipelining optimization: send multiple commands to the Redis server without waiting for each reply
        with client.pipeline() as pipe:
            # Check whether a previous record already exists under each key bin.
            # TODO: investigate whether check-and-set is a better approach than pulling all
            # entity timestamps and then setting; it may be significantly slower but avoids
            # potential (rare) race conditions
            for entity_key, _, _, _ in data:
                redis_key_bin = _redis_key(project, entity_key)
                keys.append(redis_key_bin)
                pipe.hmget(redis_key_bin, ts_key)
            prev_event_timestamps = pipe.execute()
            # `hmget` returns a list of values (one per requested field), so flatten the single-element lists
            prev_event_timestamps = [i[0] for i in prev_event_timestamps]

            for redis_key_bin, prev_event_time, (_, values, timestamp, _) in zip(
                keys, prev_event_timestamps, data
            ):
                event_time_seconds = int(utils.make_tzaware(timestamp).timestamp())

                # Skip the write if the incoming event timestamp is not newer than the record already in the online store
                if prev_event_time:
                    prev_ts = Timestamp()
                    prev_ts.ParseFromString(prev_event_time)
                    if prev_ts.seconds and event_time_seconds <= prev_ts.seconds:
                        # TODO: somehow signal that it's not overwriting the current record?
                        if progress:
                            progress(1)
                        continue

                ts = Timestamp()
                ts.seconds = event_time_seconds
                entity_hset = dict()
                entity_hset[ts_key] = ts.SerializeToString()

                for feature_name, val in values.items():
                    f_key = _mmh3(f"{feature_view}:{feature_name}")
                    entity_hset[f_key] = val.SerializeToString()

                pipe.hset(redis_key_bin, mapping=entity_hset)
                # TODO: support expiring the entity / features in Redis;
                # otherwise entity features remain in Redis until cleaned up by a separate process,
                # e.g. client.expire(redis_key_bin, ...) based on a TTL setting
            results = pipe.execute()
            if progress:
                progress(len(results))
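
For reference, the data argument in both variants is a list of (entity_key, feature_values, event_timestamp, created_timestamp) tuples built from Feast's EntityKey and Value protobufs. The snippet below is a rough sketch of assembling one such row; the driver_id entity and conv_rate feature are made-up placeholders, and the proto import paths may differ between Feast versions.

from datetime import datetime, timezone

from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto
from feast.protos.feast.types.Value_pb2 import Value as ValueProto

# A single row for online_write_batch: one driver entity with one feature value.
entity_key = EntityKeyProto(
    join_keys=["driver_id"],
    entity_values=[ValueProto(int64_val=1001)],
)
feature_values = {"conv_rate": ValueProto(double_val=0.85)}
event_ts = datetime.now(timezone.utc)

data = [(entity_key, feature_values, event_ts, None)]
# Hypothetical call, assuming a RedisOnlineStore instance, RepoConfig and FeatureView
# have already been constructed elsewhere:
# store.online_write_batch(config=repo_config, table=feature_view, data=data, progress=None)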