Example #1
# Imports assumed from the surrounding Feast module (not shown in this excerpt):
from datetime import datetime
from typing import Dict, List, Optional, Tuple

import pyarrow

from feast.feature_view import FeatureView
from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto
from feast.protos.feast.types.Value_pb2 import Value as ValueProto
from feast.type_map import python_value_to_proto_value


def _convert_arrow_to_proto(
    table: pyarrow.Table, feature_view: FeatureView
) -> List[Tuple[EntityKeyProto, Dict[str, ValueProto], datetime,
                Optional[datetime]]]:
    # Build one (entity key, feature protos, event ts, created ts) tuple per row.
    rows_to_write = []
    for row in zip(*table.to_pydict().values()):
        entity_key = EntityKeyProto()
        for entity_name in feature_view.entities:
            entity_key.entity_names.append(entity_name)
            idx = table.column_names.index(entity_name)
            value = python_value_to_proto_value(row[idx])
            entity_key.entity_values.append(value)
        feature_dict = {}
        for feature in feature_view.features:
            idx = table.column_names.index(feature.name)
            value = python_value_to_proto_value(row[idx])
            feature_dict[feature.name] = value
        event_timestamp_idx = table.column_names.index(
            feature_view.input.event_timestamp_column)
        event_timestamp = row[event_timestamp_idx]
        if feature_view.input.created_timestamp_column is not None:
            created_timestamp_idx = table.column_names.index(
                feature_view.input.created_timestamp_column)
            created_timestamp = row[created_timestamp_idx]
        else:
            created_timestamp = None

        rows_to_write.append(
            (entity_key, feature_dict, event_timestamp, created_timestamp))
    return rows_to_write
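For context, python_value_to_proto_value wraps a native Python value in a Feast ValueProto. Below is a minimal standalone sketch of that wrapping (the input values are made up); note that the version in Example #1 lets the proto type be inferred from the Python value, while Example #2 below passes the declared feature dtype explicitly:

from feast.type_map import python_value_to_proto_value

proto = python_value_to_proto_value(1001)      # expected: ValueProto with int64_val set
proto = python_value_to_proto_value("driver")  # expected: ValueProto with string_val set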
Example #2
# Same assumed imports as Example #1, plus pandas for the Timestamp check below.
def _convert_arrow_to_proto(
    table: pyarrow.Table,
    feature_view: FeatureView,
    join_keys: List[str],
) -> List[Tuple[EntityKeyProto, Dict[str, ValueProto], datetime,
                Optional[datetime]]]:
    rows_to_write = []

    def _coerce_datetime(ts):
        """
        Depending on the underlying time resolution, Arrow's to_pydict() sometimes
        returns the pandas Timestamp type (for nanosecond resolution) and sometimes
        the standard Python datetime (for microsecond resolution).

        While the pandas Timestamp class is a subclass of Python's datetime, it does
        not always behave the same way. We convert it to a plain datetime so that
        downstream consumers don't have to deal with these quirks.
        """

        if isinstance(ts, pandas.Timestamp):
            return ts.to_pydatetime()
        else:
            return ts

    column_names_idx = {k: i for i, k in enumerate(table.column_names)}
    for row in zip(*table.to_pydict().values()):
        entity_key = EntityKeyProto()
        for join_key in join_keys:
            entity_key.join_keys.append(join_key)
            idx = column_names_idx[join_key]
            value = python_value_to_proto_value(row[idx])
            entity_key.entity_values.append(value)
        feature_dict = {}
        for feature in feature_view.features:
            idx = column_names_idx[feature.name]
            value = python_value_to_proto_value(row[idx], feature.dtype)
            feature_dict[feature.name] = value
        event_timestamp_idx = column_names_idx[
            feature_view.batch_source.event_timestamp_column]
        event_timestamp = _coerce_datetime(row[event_timestamp_idx])

        if feature_view.batch_source.created_timestamp_column:
            created_timestamp_idx = column_names_idx[
                feature_view.batch_source.created_timestamp_column]
            created_timestamp = _coerce_datetime(row[created_timestamp_idx])
        else:
            created_timestamp = None

        rows_to_write.append(
            (entity_key, feature_dict, event_timestamp, created_timestamp))
    return rows_to_write
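A standalone illustration of the quirk that _coerce_datetime guards against; this sketch is not part of the Feast source and the timestamp value is made up:

import pandas

ts = pandas.Timestamp("2021-01-01T00:00:00.123456789")  # nanosecond resolution
assert isinstance(ts, pandas.Timestamp)  # also a datetime.datetime subclass
dt = ts.to_pydatetime()  # plain datetime.datetime; sub-microsecond digits are dropped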
Example #3
# Relies on a module-level _coerce_datetime helper like the one shown in Example #2.
def _convert_arrow_to_proto(
    table: Union[pyarrow.Table, pyarrow.RecordBatch],
    feature_view: FeatureView,
    join_keys: List[str],
) -> List[Tuple[EntityKeyProto, Dict[str, ValueProto], datetime,
                Optional[datetime]]]:
    # Handle join keys
    join_key_values = {k: table.column(k).to_pylist() for k in join_keys}
    entity_keys = [
        EntityKeyProto(
            join_keys=join_keys,
            entity_values=[
                python_value_to_proto_value(join_key_values[k][idx])
                for k in join_keys
            ],
        ) for idx in range(table.num_rows)
    ]

    # Serialize the features per row
    feature_dict = {
        feature.name: [
            python_value_to_proto_value(val, feature.dtype)
            for val in table.column(feature.name).to_pylist()
        ]
        for feature in feature_view.features
    }
    features = [
        dict(zip(feature_dict, vals)) for vals in zip(*feature_dict.values())
    ]

    # Convert event_timestamps
    event_timestamps = [
        _coerce_datetime(val) for val in table.column(
            feature_view.batch_source.event_timestamp_column).to_pylist()
    ]

    # Convert created_timestamps if they exist
    if feature_view.batch_source.created_timestamp_column:
        created_timestamps = [
            _coerce_datetime(val) for val in table.column(
                feature_view.batch_source.created_timestamp_column).to_pylist()
        ]
    else:
        created_timestamps = [None] * table.num_rows

    return list(
        zip(entity_keys, features, event_timestamps, created_timestamps))
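The dict(zip(...)) line above transposes a dict of per-feature columns into a list of per-row dicts. Shown in isolation with made-up values:

feature_dict = {"conv_rate": [0.1, 0.2], "acc_rate": [0.9, 0.8]}
rows = [dict(zip(feature_dict, vals)) for vals in zip(*feature_dict.values())]
# rows == [{'conv_rate': 0.1, 'acc_rate': 0.9}, {'conv_rate': 0.2, 'acc_rate': 0.8}]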
Example #4
    # Excerpt of a FeatureStore method: applies on-demand feature view (ODFV)
    # transformations and merges the transformed values into result_rows.
    def _augment_response_with_on_demand_transforms(
        self,
        feature_refs: List[str],
        full_feature_names: bool,
        initial_response: OnlineResponse,
        result_rows: List[GetOnlineFeaturesResponse.FieldValues],
    ) -> OnlineResponse:
        all_on_demand_feature_views = {
            view.name: view
            for view in self._registry.list_on_demand_feature_views(
                project=self.project, allow_cache=True)
        }
        all_odfv_feature_names = all_on_demand_feature_views.keys()

        if len(all_on_demand_feature_views) == 0:
            return initial_response
        initial_response_df = initial_response.to_df()

        odfv_feature_refs = defaultdict(list)
        for feature_ref in feature_refs:
            view_name, feature_name = feature_ref.split(":")
            if view_name in all_odfv_feature_names:
                odfv_feature_refs[view_name].append(feature_name)

        # Apply on demand transformations
        for odfv_name, _feature_refs in odfv_feature_refs.items():
            odfv = all_on_demand_feature_views[odfv_name]
            transformed_features_df = odfv.get_transformed_features_df(
                full_feature_names, initial_response_df)
            for row_idx in range(len(result_rows)):
                result_row = result_rows[row_idx]

                selected_subset = [
                    f for f in transformed_features_df.columns
                    if f in _feature_refs
                ]

                for transformed_feature in selected_subset:
                    transformed_feature_name = (
                        f"{odfv.name}__{transformed_feature}"
                        if full_feature_names else transformed_feature)
                    proto_value = python_value_to_proto_value(
                        transformed_features_df[transformed_feature]
                        .values[row_idx])
                    result_row.fields[transformed_feature_name].CopyFrom(
                        proto_value)
                    result_row.statuses[
                        transformed_feature_name] = GetOnlineFeaturesResponse.FieldStatus.PRESENT
        return OnlineResponse(
            GetOnlineFeaturesResponse(field_values=result_rows))
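The full_feature_names flag only changes how a transformed feature is keyed in the response: when set, the key is prefixed with the on-demand view name and a double underscore. A small sketch of that rule (the names are hypothetical):

odfv_name = "conv_rate_plus_100"
feature = "conv_rate_plus_100_value"
full_feature_names = True
key = f"{odfv_name}__{feature}" if full_feature_names else feature
# key == "conv_rate_plus_100__conv_rate_plus_100_value"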
Example #5
    def get_online_features(
        self,
        features: Union[List[str], FeatureService],
        entity_rows: List[Dict[str, Any]],
        feature_refs: Optional[List[str]] = None,
        full_feature_names: bool = False,
    ) -> OnlineResponse:
        """
        Retrieves the latest online feature data.

        Note: This method will download the full feature registry the first time it is run. If you are using a
        remote registry like GCS or S3 then that may take a few seconds. The registry remains cached up to a TTL
        duration (which can be set to infinity). If the cached registry is stale (more time than the TTL has
        passed), then a new registry will be downloaded synchronously by this method. This download may
        introduce latency to online feature retrieval. In order to avoid synchronous downloads, please call
        refresh_registry() prior to the TTL being reached. Remember it is possible to set the cache TTL to
        infinity (cache forever).

        Args:
            features: List of feature references that will be returned for each entity.
                Each feature reference should have the following format:
                "feature_table:feature", where "feature_table" and "feature" refer to
                the feature table and feature names respectively.
            entity_rows: A list of dictionaries where each key is an entity name and
                each value is the corresponding entity value.

        Returns:
            OnlineResponse containing the feature data in records.

        Raises:
            Exception: No entity with the specified name exists.

        Examples:
            Retrieve online features for a set of drivers (assuming these
            features have already been materialized into the online store).

            >>> from feast import FeatureStore, RepoConfig
            >>> fs = FeatureStore(repo_path="feature_repo")
            >>> online_response = fs.get_online_features(
            ...     features=[
            ...         "driver_hourly_stats:conv_rate",
            ...         "driver_hourly_stats:acc_rate",
            ...         "driver_hourly_stats:avg_daily_trips",
            ...     ],
            ...     entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}, {"driver_id": 1003}, {"driver_id": 1004}],
            ... )
            >>> online_response_dict = online_response.to_dict()
        """
        _feature_refs = self._get_features(features, feature_refs)
        all_feature_views = self._list_feature_views(allow_cache=True,
                                                     hide_dummy_entity=False)
        all_on_demand_feature_views = self._registry.list_on_demand_feature_views(
            project=self.project, allow_cache=True)

        _validate_feature_refs(_feature_refs, full_feature_names)
        grouped_refs, grouped_odfv_refs = _group_feature_refs(
            _feature_refs, all_feature_views, all_on_demand_feature_views)
        if len(grouped_odfv_refs) > 0:
            log_event(UsageEvent.GET_ONLINE_FEATURES_WITH_ODFV)

        feature_views = list(view for view, _ in grouped_refs)
        entityless_case = DUMMY_ENTITY_NAME in [
            entity_name for feature_view in feature_views
            for entity_name in feature_view.entities
        ]

        provider = self._get_provider()
        entities = self._list_entities(allow_cache=True,
                                       hide_dummy_entity=False)
        entity_name_to_join_key_map = {}
        for entity in entities:
            entity_name_to_join_key_map[entity.name] = entity.join_key

        needed_request_data_features = self._get_needed_request_data_features(
            grouped_odfv_refs)

        join_key_rows = []
        request_data_features: Dict[str, List[Any]] = {}
        # Entity rows may be either entities or request data.
        for row in entity_rows:
            join_key_row = {}
            for entity_name, entity_value in row.items():
                # Found request data
                if entity_name in needed_request_data_features:
                    if entity_name not in request_data_features:
                        request_data_features[entity_name] = []
                    request_data_features[entity_name].append(entity_value)
                    continue
                try:
                    join_key = entity_name_to_join_key_map[entity_name]
                except KeyError:
                    raise EntityNotFoundException(entity_name, self.project)
                join_key_row[join_key] = entity_value
                if entityless_case:
                    join_key_row[DUMMY_ENTITY_ID] = DUMMY_ENTITY_VAL
            if len(join_key_row) > 0:
                # May be empty if this entity row was request data
                join_key_rows.append(join_key_row)

        if len(needed_request_data_features) != len(
                request_data_features.keys()):
            raise RequestDataNotFoundInEntityRowsException(
                feature_names=needed_request_data_features)

        entity_row_proto_list = _infer_online_entity_rows(join_key_rows)

        union_of_entity_keys: List[EntityKeyProto] = []
        result_rows: List[GetOnlineFeaturesResponse.FieldValues] = []

        for entity_row_proto in entity_row_proto_list:
            # Create a list of entity keys to filter down for each feature view at lookup time.
            union_of_entity_keys.append(_entity_row_to_key(entity_row_proto))
            # Also create entity values to append to the result
            result_rows.append(_entity_row_to_field_values(entity_row_proto))

        # Add more feature values to the existing result rows for the request data features
        for feature_name, feature_values in request_data_features.items():
            for row_idx, feature_value in enumerate(feature_values):
                result_row = result_rows[row_idx]
                result_row.fields[feature_name].CopyFrom(
                    python_value_to_proto_value(feature_value))
                result_row.statuses[
                    feature_name] = GetOnlineFeaturesResponse.FieldStatus.PRESENT

        for table, requested_features in grouped_refs:
            self._populate_result_rows_from_feature_view(
                entity_name_to_join_key_map,
                full_feature_names,
                provider,
                requested_features,
                result_rows,
                table,
                union_of_entity_keys,
            )

        initial_response = OnlineResponse(
            GetOnlineFeaturesResponse(field_values=result_rows))
        return self._augment_response_with_on_demand_transforms(
            _feature_refs, full_feature_names, initial_response, result_rows)
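Since entity_rows may mix true entity keys with request-data inputs consumed by on-demand feature views, a single call can pass both in the same dictionaries. A hedged sketch (the view and field names are hypothetical and depend on the repo's definitions):

online_response = fs.get_online_features(
    features=["transformed_conv_rate:conv_rate_plus_val1"],
    entity_rows=[{"driver_id": 1001, "val_to_add": 100}],
)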