def conv_rate_plus_100_feature_view(
    inputs: Dict[str, Union[RequestDataSource, FeatureView]],
    infer_features: bool = False,
    features: Optional[List[Feature]] = None,
) -> OnDemandFeatureView:
    _features = features or [
        Feature("conv_rate_plus_100", ValueType.DOUBLE),
        Feature("conv_rate_plus_val_to_add", ValueType.DOUBLE),
    ]
    return OnDemandFeatureView(
        name=conv_rate_plus_100.__name__,
        inputs=inputs,
        features=[] if infer_features else _features,
        udf=conv_rate_plus_100,
    )
def similarity_feature_view(
    inputs: Dict[str, Union[RequestDataSource, FeatureView]],
    infer_features: bool = False,
    features: Optional[List[Feature]] = None,
) -> OnDemandFeatureView:
    _features = features or [
        Feature("cos_double", ValueType.DOUBLE),
        Feature("cos_float", ValueType.FLOAT),
    ]
    return OnDemandFeatureView(
        name=similarity.__name__,
        inputs=inputs,
        features=[] if infer_features else _features,
        udf=similarity,
    )
def conv_rate_plus_100_feature_view(
    sources: Dict[str, Union[RequestSource, FeatureView]],
    infer_features: bool = False,
    features: Optional[List[Field]] = None,
) -> OnDemandFeatureView:
    # Test that positional arguments and Features still work for ODFVs.
    _features = features or [
        Field(name="conv_rate_plus_100", dtype=Float64),
        Field(name="conv_rate_plus_val_to_add", dtype=Float64),
        Field(name="conv_rate_plus_100_rounded", dtype=Int32),
    ]
    return OnDemandFeatureView(
        name=conv_rate_plus_100.__name__,
        schema=[] if infer_features else _features,
        sources=sources,
        udf=conv_rate_plus_100,
    )
def similarity_feature_view(
    sources: Dict[str, Union[RequestSource, FeatureView]],
    infer_features: bool = False,
    features: Optional[List[Feature]] = None,
) -> OnDemandFeatureView:
    _fields = [
        Field(name="cos_double", dtype=Float64),
        Field(name="cos_float", dtype=Float32),
    ]
    if features is not None:
        _fields = [Field.from_feature(feature) for feature in features]
    return OnDemandFeatureView(
        name=similarity.__name__,
        sources=sources,
        schema=[] if infer_features else _fields,
        udf=similarity,
    )
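# Note: the `conv_rate_plus_100` and `similarity` udfs referenced by the factories above are
# not shown in this section. The sketch below illustrates what a udf like `conv_rate_plus_100`
# could look like: a pandas transformation whose output columns match the declared schema.
# The input column names `conv_rate` and `val_to_add` are assumptions for illustration only.
import pandas as pd


def conv_rate_plus_100(features_df: pd.DataFrame) -> pd.DataFrame:
    df = pd.DataFrame()
    # Derived features matching the fields declared in the factory's schema above.
    df["conv_rate_plus_100"] = features_df["conv_rate"] + 100.0
    df["conv_rate_plus_val_to_add"] = features_df["conv_rate"] + features_df["val_to_add"]
    df["conv_rate_plus_100_rounded"] = df["conv_rate_plus_100"].round().astype("int32")
    return df


# A hypothetical invocation of the factory, where `driver_stats_fv` is an existing FeatureView
# and `vals_to_add_source` is a RequestSource providing `val_to_add` at request time:
#
#   odfv = conv_rate_plus_100_feature_view(
#       sources={"driver_stats": driver_stats_fv, "vals_to_add": vals_to_add_source},
#   )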
def get_historical_features(
    config: RepoConfig,
    feature_views: List[FeatureView],
    feature_refs: List[str],
    entity_df: Union[pd.DataFrame, str],
    registry: Registry,
    project: str,
    full_feature_names: bool = False,
) -> RetrievalJob:
    if not isinstance(entity_df, pd.DataFrame):
        raise ValueError(
            f"Please provide an entity_df of type {type(pd.DataFrame)} instead of type {type(entity_df)}"
        )
    entity_df_event_timestamp_col = (
        DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL  # local modifiable copy of global variable
    )
    if entity_df_event_timestamp_col not in entity_df.columns:
        datetime_columns = entity_df.select_dtypes(
            include=["datetime", "datetimetz"]
        ).columns
        if len(datetime_columns) == 1:
            print(
                f"Using {datetime_columns[0]} as the event timestamp. To specify a column explicitly, please name it {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL}."
            )
            entity_df_event_timestamp_col = datetime_columns[0]
        else:
            raise ValueError(
                f"Please provide an entity_df with a column named {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL} representing the time of events."
            )

    (
        feature_views_to_features,
        on_demand_feature_views_to_features,
    ) = _get_requested_feature_views_to_features_dict(
        feature_refs,
        feature_views,
        registry.list_on_demand_feature_views(config.project),
    )

    # Create lazy function that is only called from the RetrievalJob object
    def evaluate_historical_retrieval():

        # Make sure all event timestamp fields are tz-aware. We default tz-naive fields to UTC
        entity_df[entity_df_event_timestamp_col] = entity_df[
            entity_df_event_timestamp_col
        ].apply(lambda x: x if x.tzinfo is not None else x.replace(tzinfo=pytz.utc))

        # Create a copy of entity_df to prevent modifying the original
        entity_df_with_features = entity_df.copy()

        # Convert event timestamp column to datetime and normalize time zone to UTC
        # This is necessary to avoid issues with pd.merge_asof
        entity_df_with_features[entity_df_event_timestamp_col] = pd.to_datetime(
            entity_df_with_features[entity_df_event_timestamp_col], utc=True
        )

        # Sort event timestamp values
        entity_df_with_features = entity_df_with_features.sort_values(
            entity_df_event_timestamp_col
        )

        # Load feature view data from sources and join them incrementally
        for feature_view, features in feature_views_to_features.items():
            event_timestamp_column = feature_view.batch_source.event_timestamp_column
            created_timestamp_column = (
                feature_view.batch_source.created_timestamp_column
            )

            # Read offline parquet data in pyarrow format.
            filesystem, path = FileSource.create_filesystem_and_path(
                feature_view.batch_source.path,
                feature_view.batch_source.file_options.s3_endpoint_override,
            )
            table = pyarrow.parquet.read_table(path, filesystem=filesystem)

            # Rename columns by the field mapping dictionary if it exists
            if feature_view.batch_source.field_mapping is not None:
                table = _run_field_mapping(
                    table, feature_view.batch_source.field_mapping
                )
            # Rename entity columns by the join_key_map dictionary if it exists
            if feature_view.projection.join_key_map:
                table = _run_field_mapping(
                    table, feature_view.projection.join_key_map
                )

            # Convert pyarrow table to pandas dataframe. Note, if the underlying data has missing values,
            # pandas will convert those values to np.nan if the dtypes are numerical (floats, ints, etc.) or boolean
            # If the dtype is 'object', then missing values are inferred as python `None`s.
            # More details at:
            # https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html#values-considered-missing
            df_to_join = table.to_pandas()

            # Make sure all timestamp fields are tz-aware. We default tz-naive fields to UTC
            df_to_join[event_timestamp_column] = df_to_join[
                event_timestamp_column
            ].apply(
                lambda x: x if x.tzinfo is not None else x.replace(tzinfo=pytz.utc)
            )

            if created_timestamp_column:
                df_to_join[created_timestamp_column] = df_to_join[
                    created_timestamp_column
                ].apply(
                    lambda x: x if x.tzinfo is not None else x.replace(tzinfo=pytz.utc)
                )

            # Sort dataframe by the event timestamp column
            df_to_join = df_to_join.sort_values(event_timestamp_column)

            # Build a list of all the features we should select from this source
            feature_names = []
            for feature in features:
                # Modify the separator for feature refs in column names to double underscore. We are using
                # double underscore as separator for consistency with other databases like BigQuery,
                # where there are very few characters available for use as separators
                if full_feature_names:
                    formatted_feature_name = (
                        f"{feature_view.projection.name_to_use()}__{feature}"
                    )
                else:
                    formatted_feature_name = feature
                # Add the feature name to the list of columns
                feature_names.append(formatted_feature_name)

                # Ensure that the source dataframe feature column includes the feature view name as a prefix
                df_to_join.rename(
                    columns={feature: formatted_feature_name},
                    inplace=True,
                )

            # Build a list of entity columns to join on (from the right table)
            join_keys = []
            for entity_name in feature_view.entities:
                entity = registry.get_entity(entity_name, project)
                join_key = feature_view.projection.join_key_map.get(
                    entity.join_key, entity.join_key
                )
                join_keys.append(join_key)
            right_entity_columns = join_keys
            right_entity_key_columns = [event_timestamp_column] + right_entity_columns

            # Remove all duplicate entity keys (using created timestamp)
            right_entity_key_sort_columns = right_entity_key_columns
            if created_timestamp_column:
                # If created_timestamp is available, use it to dedupe deterministically
                right_entity_key_sort_columns = right_entity_key_sort_columns + [
                    created_timestamp_column
                ]

            df_to_join.sort_values(by=right_entity_key_sort_columns, inplace=True)
            df_to_join.drop_duplicates(
                right_entity_key_sort_columns,
                keep="last",
                ignore_index=True,
                inplace=True,
            )

            # Select only the columns we need to join from the feature dataframe
            df_to_join = df_to_join[right_entity_key_columns + feature_names]

            # Do point-in-time join between entity_df and feature dataframe
            entity_df_with_features = pd.merge_asof(
                entity_df_with_features,
                df_to_join,
                left_on=entity_df_event_timestamp_col,
                right_on=event_timestamp_column,
                by=right_entity_columns or None,
                tolerance=feature_view.ttl,
            )

            # Remove right (feature table/view) event_timestamp column.
            if event_timestamp_column != entity_df_event_timestamp_col:
                entity_df_with_features.drop(
                    columns=[event_timestamp_column], inplace=True
                )

            # Ensure that we delete dataframes to free up memory
            del df_to_join

        # Move "event_timestamp" column to front
        current_cols = entity_df_with_features.columns.tolist()
        current_cols.remove(entity_df_event_timestamp_col)
        entity_df_with_features = entity_df_with_features[
            [entity_df_event_timestamp_col] + current_cols
        ]

        return entity_df_with_features

    job = FileRetrievalJob(
        evaluation_function=evaluate_historical_retrieval,
        full_feature_names=full_feature_names,
        on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs(
            feature_refs, project, registry
        ),
    )
    return job
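# Minimal, self-contained illustration of the point-in-time join performed by
# `evaluate_historical_retrieval` above: pd.merge_asof matches each entity row with the latest
# feature row whose event timestamp is at or before the entity timestamp, grouped by the join
# key and bounded by the feature view's ttl (the `tolerance` argument). The column and key
# names below are made up for the example.
from datetime import timedelta

import pandas as pd

entity_df = pd.DataFrame(
    {
        "driver_id": [1001, 1001],
        "event_timestamp": pd.to_datetime(
            ["2021-04-12 10:00:00", "2021-04-12 16:00:00"], utc=True
        ),
    }
)
feature_df = pd.DataFrame(
    {
        "driver_id": [1001, 1001],
        "event_timestamp": pd.to_datetime(
            ["2021-04-12 08:00:00", "2021-04-12 15:00:00"], utc=True
        ),
        "conv_rate": [0.5, 0.7],
    }
)

# Both frames must be sorted by the timestamp used for the asof match.
joined = pd.merge_asof(
    entity_df.sort_values("event_timestamp"),
    feature_df.sort_values("event_timestamp"),
    on="event_timestamp",
    by="driver_id",
    tolerance=timedelta(hours=6),
)
# The 10:00 entity row picks up the 08:00 feature value (0.5);
# the 16:00 row picks up the 15:00 value (0.7).
print(joined)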
def get_historical_features(
    config: RepoConfig,
    feature_views: List[FeatureView],
    feature_refs: List[str],
    entity_df: Union[pd.DataFrame, str],
    registry: Registry,
    project: str,
    full_feature_names: bool = False,
) -> RetrievalJob:
    assert isinstance(config.offline_store, SnowflakeOfflineStoreConfig)

    snowflake_conn = get_snowflake_conn(config.offline_store)

    entity_schema = _get_entity_schema(entity_df, snowflake_conn, config)

    entity_df_event_timestamp_col = offline_utils.infer_event_timestamp_from_entity_df(
        entity_schema
    )

    entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range(
        entity_df,
        entity_df_event_timestamp_col,
        snowflake_conn,
    )

    @contextlib.contextmanager
    def query_generator() -> Iterator[str]:
        table_name = offline_utils.get_temp_entity_table_name()

        _upload_entity_df(entity_df, snowflake_conn, config, table_name)

        expected_join_keys = offline_utils.get_expected_join_keys(
            project, feature_views, registry
        )

        offline_utils.assert_expected_columns_in_entity_df(
            entity_schema, expected_join_keys, entity_df_event_timestamp_col
        )

        # Build a query context containing all information required to template the Snowflake SQL query
        query_context = offline_utils.get_feature_view_query_context(
            feature_refs,
            feature_views,
            registry,
            project,
            entity_df_event_timestamp_range,
        )

        query_context = _fix_entity_selections_identifiers(query_context)

        # Generate the Snowflake SQL query from the query context
        query = offline_utils.build_point_in_time_query(
            query_context,
            left_table_query_string=table_name,
            entity_df_event_timestamp_col=entity_df_event_timestamp_col,
            entity_df_columns=entity_schema.keys(),
            query_template=MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN,
            full_feature_names=full_feature_names,
        )

        yield query

    return SnowflakeRetrievalJob(
        query=query_generator,
        snowflake_conn=snowflake_conn,
        config=config,
        full_feature_names=full_feature_names,
        on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs(
            feature_refs, project, registry
        ),
        metadata=RetrievalMetadata(
            features=feature_refs,
            keys=list(entity_schema.keys() - {entity_df_event_timestamp_col}),
            min_event_timestamp=entity_df_event_timestamp_range[0],
            max_event_timestamp=entity_df_event_timestamp_range[1],
        ),
    )
def get_historical_features(
    config: RepoConfig,
    feature_views: List[FeatureView],
    feature_refs: List[str],
    entity_df: Union[pd.DataFrame, str],
    registry: Registry,
    project: str,
    full_feature_names: bool = False,
) -> RetrievalJob:
    assert isinstance(config.offline_store, RedshiftOfflineStoreConfig)

    redshift_client = aws_utils.get_redshift_data_client(
        config.offline_store.region
    )
    s3_resource = aws_utils.get_s3_resource(config.offline_store.region)

    @contextlib.contextmanager
    def query_generator() -> Iterator[str]:
        table_name = offline_utils.get_temp_entity_table_name()

        entity_schema = _upload_entity_df_and_get_entity_schema(
            entity_df, redshift_client, config, s3_resource, table_name
        )

        entity_df_event_timestamp_col = offline_utils.infer_event_timestamp_from_entity_df(
            entity_schema
        )

        expected_join_keys = offline_utils.get_expected_join_keys(
            project, feature_views, registry
        )

        offline_utils.assert_expected_columns_in_entity_df(
            entity_schema, expected_join_keys, entity_df_event_timestamp_col
        )

        # Build a query context containing all information required to template the Redshift SQL query
        query_context = offline_utils.get_feature_view_query_context(
            feature_refs, feature_views, registry, project,
        )

        # Generate the Redshift SQL query from the query context
        query = offline_utils.build_point_in_time_query(
            query_context,
            left_table_query_string=table_name,
            entity_df_event_timestamp_col=entity_df_event_timestamp_col,
            query_template=MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN,
            full_feature_names=full_feature_names,
        )

        yield query

        # Clean up the uploaded Redshift table
        aws_utils.execute_redshift_statement(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            f"DROP TABLE {table_name}",
        )

    return RedshiftRetrievalJob(
        query=query_generator,
        redshift_client=redshift_client,
        s3_resource=s3_resource,
        config=config,
        full_feature_names=full_feature_names,
        on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs(
            feature_refs, project, registry
        ),
        drop_columns=["entity_timestamp"]
        + [
            f"{feature_view.name}__entity_row_unique_id"
            for feature_view in feature_views
        ],
    )
def get_historical_features(
    config: RepoConfig,
    feature_views: List[FeatureView],
    feature_refs: List[str],
    entity_df: Union[pandas.DataFrame, str],
    registry: Registry,
    project: str,
    full_feature_names: bool = False,
) -> RetrievalJob:
    assert isinstance(config.offline_store, SparkOfflineStoreConfig)
    warnings.warn(
        "The spark offline store is an experimental feature in alpha development. "
        "Some functionality may still be unstable so functionality can change in the future.",
        RuntimeWarning,
    )
    spark_session = get_spark_session_or_start_new_with_repoconfig(
        store_config=config.offline_store
    )
    tmp_entity_df_table_name = offline_utils.get_temp_entity_table_name()

    entity_schema = _get_entity_schema(
        spark_session=spark_session,
        entity_df=entity_df,
    )
    event_timestamp_col = offline_utils.infer_event_timestamp_from_entity_df(
        entity_schema=entity_schema,
    )
    entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range(
        entity_df,
        event_timestamp_col,
        spark_session,
    )
    _upload_entity_df(
        spark_session=spark_session,
        table_name=tmp_entity_df_table_name,
        entity_df=entity_df,
        event_timestamp_col=event_timestamp_col,
    )

    expected_join_keys = offline_utils.get_expected_join_keys(
        project=project, feature_views=feature_views, registry=registry
    )
    offline_utils.assert_expected_columns_in_entity_df(
        entity_schema=entity_schema,
        join_keys=expected_join_keys,
        entity_df_event_timestamp_col=event_timestamp_col,
    )

    query_context = offline_utils.get_feature_view_query_context(
        feature_refs,
        feature_views,
        registry,
        project,
        entity_df_event_timestamp_range,
    )

    query = offline_utils.build_point_in_time_query(
        feature_view_query_contexts=query_context,
        left_table_query_string=tmp_entity_df_table_name,
        entity_df_event_timestamp_col=event_timestamp_col,
        entity_df_columns=entity_schema.keys(),
        query_template=MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN,
        full_feature_names=full_feature_names,
    )

    return SparkRetrievalJob(
        spark_session=spark_session,
        query=query,
        full_feature_names=full_feature_names,
        on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs(
            feature_refs, project, registry
        ),
        metadata=RetrievalMetadata(
            features=feature_refs,
            keys=list(set(entity_schema.keys()) - {event_timestamp_col}),
            min_event_timestamp=entity_df_event_timestamp_range[0],
            max_event_timestamp=entity_df_event_timestamp_range[1],
        ),
    )
def get_historical_features(
    config: RepoConfig,
    feature_views: List[FeatureView],
    feature_refs: List[str],
    entity_df: Union[pd.DataFrame, str],
    registry: Registry,
    project: str,
    full_feature_names: bool = False,
) -> RetrievalJob:
    if not isinstance(entity_df, pd.DataFrame) and not isinstance(
        entity_df, dd.DataFrame
    ):
        raise ValueError(
            f"Please provide an entity_df of type {type(pd.DataFrame)} instead of type {type(entity_df)}"
        )
    entity_df_event_timestamp_col = (
        DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL  # local modifiable copy of global variable
    )
    if entity_df_event_timestamp_col not in entity_df.columns:
        datetime_columns = entity_df.select_dtypes(
            include=["datetime", "datetimetz"]
        ).columns
        if len(datetime_columns) == 1:
            print(
                f"Using {datetime_columns[0]} as the event timestamp. To specify a column explicitly, please name it {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL}."
            )
            entity_df_event_timestamp_col = datetime_columns[0]
        else:
            raise ValueError(
                f"Please provide an entity_df with a column named {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL} representing the time of events."
            )

    (
        feature_views_to_features,
        on_demand_feature_views_to_features,
    ) = _get_requested_feature_views_to_features_dict(
        feature_refs,
        feature_views,
        registry.list_on_demand_feature_views(config.project),
    )

    entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range(
        entity_df, entity_df_event_timestamp_col
    )

    # Create lazy function that is only called from the RetrievalJob object
    def evaluate_historical_retrieval():
        # Create a copy of entity_df to prevent modifying the original
        entity_df_with_features = entity_df.copy()

        entity_df_event_timestamp_col_type = entity_df_with_features.dtypes[
            entity_df_event_timestamp_col
        ]
        if (
            not hasattr(entity_df_event_timestamp_col_type, "tz")
            or entity_df_event_timestamp_col_type.tz != pytz.UTC
        ):
            # Make sure all event timestamp fields are tz-aware. We default tz-naive fields to UTC
            entity_df_with_features[
                entity_df_event_timestamp_col
            ] = entity_df_with_features[entity_df_event_timestamp_col].apply(
                lambda x: x if x.tzinfo is not None else x.replace(tzinfo=pytz.utc)
            )

            # Convert event timestamp column to datetime and normalize time zone to UTC
            # This is necessary to avoid issues with pd.merge_asof
            if isinstance(entity_df_with_features, dd.DataFrame):
                entity_df_with_features[
                    entity_df_event_timestamp_col
                ] = dd.to_datetime(
                    entity_df_with_features[entity_df_event_timestamp_col], utc=True
                )
            else:
                entity_df_with_features[
                    entity_df_event_timestamp_col
                ] = pd.to_datetime(
                    entity_df_with_features[entity_df_event_timestamp_col], utc=True
                )

        # Sort event timestamp values
        entity_df_with_features = entity_df_with_features.sort_values(
            entity_df_event_timestamp_col
        )

        join_keys = []
        all_join_keys = []

        # Load feature view data from sources and join them incrementally
        for feature_view, features in feature_views_to_features.items():
            event_timestamp_column = feature_view.batch_source.timestamp_field
            created_timestamp_column = (
                feature_view.batch_source.created_timestamp_column
            )

            # Build a list of entity columns to join on (from the right table)
            join_keys = []
            for entity_name in feature_view.entities:
                entity = registry.get_entity(entity_name, project)
                join_key = feature_view.projection.join_key_map.get(
                    entity.join_key, entity.join_key
                )
                join_keys.append(join_key)

            right_entity_key_columns = [
                event_timestamp_column,
                created_timestamp_column,
            ] + join_keys
            right_entity_key_columns = [c for c in right_entity_key_columns if c]

            all_join_keys = list(set(all_join_keys + join_keys))

            df_to_join = _read_datasource(feature_view.batch_source)

            df_to_join, event_timestamp_column = _field_mapping(
                df_to_join,
                feature_view,
                features,
                right_entity_key_columns,
                entity_df_event_timestamp_col,
                event_timestamp_column,
                full_feature_names,
            )

            df_to_join = _merge(entity_df_with_features, df_to_join, join_keys)

            df_to_join = _normalize_timestamp(
                df_to_join, event_timestamp_column, created_timestamp_column
            )

            df_to_join = _filter_ttl(
                df_to_join,
                feature_view,
                entity_df_event_timestamp_col,
                event_timestamp_column,
            )

            df_to_join = _drop_duplicates(
                df_to_join,
                all_join_keys,
                event_timestamp_column,
                created_timestamp_column,
                entity_df_event_timestamp_col,
            )

            entity_df_with_features = _drop_columns(
                df_to_join, event_timestamp_column, created_timestamp_column
            )

            # Ensure that we delete dataframes to free up memory
            del df_to_join

        return entity_df_with_features.persist()

    job = FileRetrievalJob(
        evaluation_function=evaluate_historical_retrieval,
        full_feature_names=full_feature_names,
        on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs(
            feature_refs, project, registry
        ),
        metadata=RetrievalMetadata(
            features=feature_refs,
            keys=list(set(entity_df.columns) - {entity_df_event_timestamp_col}),
            min_event_timestamp=entity_df_event_timestamp_range[0],
            max_event_timestamp=entity_df_event_timestamp_range[1],
        ),
    )
    return job
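# The `_drop_duplicates` helper used above is not shown in this section. The eager pandas
# implementation earlier in this section removes stale feature rows by sorting on the entity
# key and timestamps and keeping the last row per key, using the created timestamp to break
# ties deterministically. Below is a minimal pandas sketch of that pattern; the column names
# (`driver_id`, `event_timestamp`, `created`) are illustrative assumptions, not the helper's
# actual signature.
import pandas as pd

feature_df = pd.DataFrame(
    {
        "driver_id": [1001, 1001, 1002],
        "event_timestamp": pd.to_datetime(
            ["2021-04-12 08:00:00", "2021-04-12 08:00:00", "2021-04-12 09:00:00"],
            utc=True,
        ),
        "created": pd.to_datetime(
            ["2021-04-12 08:01:00", "2021-04-12 08:05:00", "2021-04-12 09:01:00"],
            utc=True,
        ),
        "conv_rate": [0.4, 0.5, 0.6],
    }
)

# Sort so the most recently created row per (key, event_timestamp) comes last, then keep it.
deduped = feature_df.sort_values(
    ["driver_id", "event_timestamp", "created"]
).drop_duplicates(["driver_id", "event_timestamp"], keep="last", ignore_index=True)
# For driver 1001 at 08:00, only the row with the later `created` timestamp (conv_rate 0.5) remains.
print(deduped)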