def pull_latest_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    created_timestamp_column: Optional[str],
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    """Fetch the newest feature row per entity from a Snowflake source.

    Rows are deduplicated with ROW_NUMBER() partitioned by the join keys and
    ordered by the timestamp column(s) descending, restricted to the
    [start_date, end_date] window.

    Returns:
        A SnowflakeRetrievalJob wrapping the generated SQL.
    """
    assert isinstance(data_source, SnowflakeSource)
    assert isinstance(config.offline_store, SnowflakeOfflineStoreConfig)

    # Fully qualified schema.table (or subquery) string to select from.
    from_expression = data_source.get_table_query_string()

    # Partition per distinct entity-key combination; with no join keys the
    # window covers the whole table (one global partition).
    if join_key_columns:
        quoted_keys = ", ".join(f'"{key}"' for key in join_key_columns)
        partition_by_join_key_string = "PARTITION BY " + quoted_keys
    else:
        partition_by_join_key_string = ""

    timestamp_columns = [event_timestamp_column]
    if created_timestamp_column:
        timestamp_columns.append(created_timestamp_column)

    # Newest first, so row number 1 within each partition is the latest row.
    timestamp_desc_string = ", ".join(f'"{col}" DESC' for col in timestamp_columns)

    field_string = ", ".join(
        f'"{col}"'
        for col in join_key_columns + feature_name_columns + timestamp_columns
    )

    # A warehouse configured on the source overrides the store-level default.
    if data_source.snowflake_options.warehouse:
        config.offline_store.warehouse = data_source.snowflake_options.warehouse

    snowflake_conn = get_snowflake_conn(config.offline_store)

    query = f"""
            SELECT
                {field_string}
                {f''', TRIM({repr(DUMMY_ENTITY_VAL)}::VARIANT,'"') AS "{DUMMY_ENTITY_ID}"''' if not join_key_columns else ""}
            FROM (
                SELECT {field_string},
                ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS "_feast_row"
                FROM {from_expression}
                WHERE "{event_timestamp_column}" BETWEEN TO_TIMESTAMP_NTZ({start_date.timestamp()}) AND TO_TIMESTAMP_NTZ({end_date.timestamp()})
            )
            WHERE "_feast_row" = 1
            """

    return SnowflakeRetrievalJob(
        query=query,
        snowflake_conn=snowflake_conn,
        config=config,
        full_feature_names=False,
        on_demand_feature_views=None,
    )
def pull_latest_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    created_timestamp_column: Optional[str],
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    """Fetch the newest feature row per entity from a Spark offline store.

    Deduplicates with ROW_NUMBER() partitioned by the join keys, ordered by
    the event (and optional created) timestamps descending, restricted to the
    [start_date, end_date] window.

    Returns:
        A SparkRetrievalJob wrapping the generated SQL.
    """
    # Fix: validate types BEFORE fetching/starting a Spark session, so a
    # misconfigured call fails fast without session side effects. (The
    # original created the session first, then asserted.)
    assert isinstance(config.offline_store, SparkOfflineStoreConfig)
    assert isinstance(data_source, SparkSource)

    warnings.warn(
        "The spark offline store is an experimental feature in alpha development. "
        "Some functionality may still be unstable so functionality can change in the future.",
        RuntimeWarning,
    )

    spark_session = get_spark_session_or_start_new_with_repoconfig(
        config.offline_store
    )
    # NOTE: the debug `print(...)` the original emitted here was removed;
    # library code should not write progress messages to stdout.

    from_expression = data_source.get_table_query_string()

    # Partition per distinct entity-key combination; empty when there are no
    # join keys (single global partition).
    partition_by_join_key_string = ", ".join(join_key_columns)
    if partition_by_join_key_string != "":
        partition_by_join_key_string = "PARTITION BY " + partition_by_join_key_string

    timestamps = [event_timestamp_column]
    if created_timestamp_column:
        timestamps.append(created_timestamp_column)
    # Newest first, so feast_row_ = 1 is the latest row per partition.
    timestamp_desc_string = " DESC, ".join(timestamps) + " DESC"
    field_string = ", ".join(join_key_columns + feature_name_columns + timestamps)

    start_date_str = _format_datetime(start_date)
    end_date_str = _format_datetime(end_date)
    query = f"""
            SELECT
                {field_string}
                {f", {repr(DUMMY_ENTITY_VAL)} AS {DUMMY_ENTITY_ID}" if not join_key_columns else ""}
            FROM (
                SELECT {field_string},
                ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS feast_row_
                FROM {from_expression} t1
                WHERE {event_timestamp_column} BETWEEN TIMESTAMP('{start_date_str}') AND TIMESTAMP('{end_date_str}')
            ) t2
            WHERE feast_row_ = 1
            """
    return SparkRetrievalJob(
        spark_session=spark_session,
        query=query,
        full_feature_names=False,
        on_demand_feature_views=None,
    )
def pull_latest_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    created_timestamp_column: Optional[str],
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    """Fetch the newest feature row per entity from a Redshift source.

    Deduplicates with ROW_NUMBER() partitioned by the join keys, ordered by
    the timestamp column(s) descending, restricted to the
    [start_date, end_date] window.

    Returns:
        A RedshiftRetrievalJob wrapping the generated SQL.
    """
    assert isinstance(data_source, RedshiftSource)
    assert isinstance(config.offline_store, RedshiftOfflineStoreConfig)

    from_expression = data_source.get_table_query_string()

    # One window partition per distinct entity-key combination; empty when
    # there are no join keys (single global partition).
    partition_by_join_key_string = ", ".join(join_key_columns)
    if partition_by_join_key_string:
        partition_by_join_key_string = f"PARTITION BY {partition_by_join_key_string}"

    timestamp_columns = [event_timestamp_column]
    if created_timestamp_column:
        timestamp_columns.append(created_timestamp_column)
    # Newest first, so _feast_row = 1 is the latest row per partition.
    timestamp_desc_string = ", ".join(f"{col} DESC" for col in timestamp_columns)

    field_string = ", ".join(
        join_key_columns + feature_name_columns + timestamp_columns
    )

    redshift_client = aws_utils.get_redshift_data_client(config.offline_store.region)
    s3_resource = aws_utils.get_s3_resource(config.offline_store.region)

    query = f"""
            SELECT
                {field_string}
                {f", {repr(DUMMY_ENTITY_VAL)} AS {DUMMY_ENTITY_ID}" if not join_key_columns else ""}
            FROM (
                SELECT {field_string},
                ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row
                FROM {from_expression}
                WHERE {event_timestamp_column} BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}'
            )
            WHERE _feast_row = 1
            """

    # Materializing a single feature view never needs full feature names, and
    # on-demand transforms are not materialized.
    return RedshiftRetrievalJob(
        query=query,
        redshift_client=redshift_client,
        s3_resource=s3_resource,
        config=config,
        full_feature_names=False,
        on_demand_feature_views=None,
    )
def pull_latest_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    created_timestamp_column: Optional[str],
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    """Fetch the newest feature row per entity from a PostgreSQL source.

    Deduplicates with ROW_NUMBER() partitioned by the join keys, ordered by
    the timestamp column(s) descending, restricted to the
    [start_date, end_date] window.

    Returns:
        A PostgreSQLRetrievalJob wrapping the generated SQL.
    """
    assert isinstance(data_source, PostgreSQLSource)

    from_expression = data_source.get_table_query_string()

    timestamps = [event_timestamp_column]
    if created_timestamp_column:
        timestamps.append(created_timestamp_column)

    # The inner query is aliased "a", the outer deduplicated query "b".
    all_columns = join_key_columns + feature_name_columns + timestamps
    a_field_string = ", ".join(_append_alias(all_columns, "a"))
    b_field_string = ", ".join(_append_alias(all_columns, "b"))

    # One window partition per distinct entity-key combination; empty when
    # there are no join keys (single global partition).
    partition_by_join_key_string = ", ".join(_append_alias(join_key_columns, "a"))
    if partition_by_join_key_string:
        partition_by_join_key_string = f"PARTITION BY {partition_by_join_key_string}"

    # Newest first, so _feast_row = 1 is the latest row per partition.
    timestamp_desc_string = ", ".join(
        f"{col} DESC" for col in _append_alias(timestamps, "a")
    )

    query = f"""
            SELECT
                {b_field_string}
                {f", {repr(DUMMY_ENTITY_VAL)} AS {DUMMY_ENTITY_ID}" if not join_key_columns else ""}
            FROM (
                SELECT {a_field_string},
                ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row
                FROM ({from_expression}) a
                WHERE a."{event_timestamp_column}" BETWEEN '{start_date}'::timestamptz AND '{end_date}'::timestamptz
            ) b
            WHERE _feast_row = 1
            """
    return PostgreSQLRetrievalJob(
        query=query,
        config=config,
        full_feature_names=False,
        on_demand_feature_views=None,
    )
def pull_latest_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    created_timestamp_column: Optional[str],
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    """Fetch the newest feature row per entity from a BigQuery source.

    Deduplicates with ROW_NUMBER() partitioned by the join keys, ordered by
    the timestamp column(s) descending, restricted to the
    [start_date, end_date] window.

    Returns:
        A BigQueryRetrievalJob wrapping the generated SQL.
    """
    assert isinstance(data_source, BigQuerySource)

    from_expression = data_source.get_table_query_string()

    # One window partition per distinct entity-key combination; empty when
    # there are no join keys (single global partition).
    partition_by_join_key_string = ", ".join(join_key_columns)
    if partition_by_join_key_string:
        partition_by_join_key_string = f"PARTITION BY {partition_by_join_key_string}"

    timestamps = [event_timestamp_column]
    if created_timestamp_column:
        timestamps.append(created_timestamp_column)
    # Newest first, so _feast_row = 1 is the latest row per partition.
    timestamp_desc_string = ", ".join(f"{col} DESC" for col in timestamps)

    field_string = ", ".join(join_key_columns + feature_name_columns + timestamps)

    client = _get_bigquery_client(
        project=config.offline_store.project_id,
        location=config.offline_store.location,
    )

    query = f"""
            SELECT
                {field_string}
                {f", {repr(DUMMY_ENTITY_VAL)} AS {DUMMY_ENTITY_ID}" if not join_key_columns else ""}
            FROM (
                SELECT {field_string},
                ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row
                FROM {from_expression}
                WHERE {event_timestamp_column} BETWEEN TIMESTAMP('{start_date}') AND TIMESTAMP('{end_date}')
            )
            WHERE _feast_row = 1
            """

    # Materializing a single feature view never needs full feature names, and
    # on-demand transforms are not materialized.
    return BigQueryRetrievalJob(
        query=query,
        client=client,
        config=config,
        full_feature_names=False,
    )
def pull_latest_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    created_timestamp_column: Optional[str],
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    """Fetch the newest feature row per entity from a Redshift source.

    NOTE(review): unlike the other Redshift implementation in this file, this
    variant does not inject a dummy-entity column when join_key_columns is
    empty and does not pass full_feature_names / on_demand_feature_views to
    the retrieval job — presumably an older revision; confirm which variant
    is current.
    """
    assert isinstance(data_source, RedshiftSource)
    assert isinstance(config.offline_store, RedshiftOfflineStoreConfig)

    from_expression = data_source.get_table_query_string()

    # One window partition per distinct entity-key combination; empty when
    # there are no join keys (single global partition).
    entity_partition = ", ".join(join_key_columns)
    partition_by_join_key_string = (
        f"PARTITION BY {entity_partition}" if entity_partition else ""
    )

    timestamp_columns = [event_timestamp_column]
    if created_timestamp_column:
        timestamp_columns.append(created_timestamp_column)
    # Newest first, so _feast_row = 1 is the latest row per partition.
    timestamp_desc_string = ", ".join(f"{col} DESC" for col in timestamp_columns)
    field_string = ", ".join(
        join_key_columns + feature_name_columns + timestamp_columns
    )

    redshift_client = aws_utils.get_redshift_data_client(config.offline_store.region)
    s3_resource = aws_utils.get_s3_resource(config.offline_store.region)

    query = f"""
            SELECT {field_string}
            FROM (
                SELECT {field_string},
                ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row
                FROM {from_expression}
                WHERE {event_timestamp_column} BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}'
            )
            WHERE _feast_row = 1
            """
    return RedshiftRetrievalJob(
        query=query,
        redshift_client=redshift_client,
        s3_resource=s3_resource,
        config=config,
    )
def pull_all_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    """Pull every row in the given time window from a Spark source.

    Note that join_key_columns, feature_name_columns, and
    event_timestamp_column have all already been mapped to column names of
    the source table, and those column names are the values passed into this
    function.

    Returns:
        A SparkRetrievalJob wrapping the generated SQL.
    """
    assert isinstance(data_source, SparkSource)
    warnings.warn(
        "The spark offline store is an experimental feature in alpha development. "
        "This API is unstable and it could and most probably will be changed in the future.",
        RuntimeWarning,
    )
    spark_session = get_spark_session_or_start_new_with_repoconfig(
        store_config=config.offline_store
    )

    projection = join_key_columns + feature_name_columns + [event_timestamp_column]
    fields = ", ".join(projection)
    from_expression = data_source.get_table_query_string()

    # Normalize both bounds to UTC before embedding them as SQL literals.
    start_date = start_date.astimezone(tz=utc)
    end_date = end_date.astimezone(tz=utc)

    query = f"""
            SELECT {fields}
            FROM {from_expression}
            WHERE {event_timestamp_column} BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}'
            """
    return SparkRetrievalJob(
        spark_session=spark_session, query=query, full_feature_names=False
    )
def pull_all_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    """Pull every row in the given time window from a Snowflake source.

    Returns:
        A SnowflakeRetrievalJob wrapping the generated SQL.
    """
    assert isinstance(data_source, SnowflakeSource)

    from_expression = data_source.get_table_query_string()

    # Double-quote every identifier so Snowflake treats names case-sensitively.
    field_string = ", ".join(
        f'"{col}"'
        for col in join_key_columns + feature_name_columns + [event_timestamp_column]
    )

    # A warehouse configured on the source overrides the store-level default.
    if data_source.snowflake_options.warehouse:
        config.offline_store.warehouse = data_source.snowflake_options.warehouse

    snowflake_conn = get_snowflake_conn(config.offline_store)

    # Normalize both bounds to UTC before embedding them as SQL literals.
    start_date = start_date.astimezone(tz=utc)
    end_date = end_date.astimezone(tz=utc)

    query = f"""
            SELECT {field_string}
            FROM {from_expression}
            WHERE "{event_timestamp_column}" BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}'
            """

    return SnowflakeRetrievalJob(
        query=query,
        snowflake_conn=snowflake_conn,
        config=config,
        full_feature_names=False,
    )
def pull_latest_from_table_or_query(
    config: RepoConfig,
    data_source: DataSource,
    join_key_columns: List[str],
    feature_name_columns: List[str],
    event_timestamp_column: str,
    created_timestamp_column: Optional[str],
    start_date: datetime,
    end_date: datetime,
) -> RetrievalJob:
    """Fetch the newest feature row per entity from a BigQuery source.

    NOTE(review): unlike the other BigQuery implementation in this file, this
    variant does not pass a location to the client, injects no dummy-entity
    column, and omits full_feature_names on the job — presumably an older
    revision; confirm which variant is current.
    """
    assert isinstance(data_source, BigQuerySource)

    from_expression = data_source.get_table_query_string()

    # One window partition per distinct entity-key combination; empty when
    # there are no join keys (single global partition).
    entity_partition = ", ".join(join_key_columns)
    partition_by_join_key_string = (
        f"PARTITION BY {entity_partition}" if entity_partition else ""
    )

    timestamps = [event_timestamp_column]
    if created_timestamp_column:
        timestamps.append(created_timestamp_column)
    # Newest first, so _feast_row = 1 is the latest row per partition.
    timestamp_desc_string = ", ".join(f"{col} DESC" for col in timestamps)
    field_string = ", ".join(join_key_columns + feature_name_columns + timestamps)

    client = _get_bigquery_client(project=config.offline_store.project_id)

    query = f"""
            SELECT {field_string}
            FROM (
                SELECT {field_string},
                ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row
                FROM {from_expression}
                WHERE {event_timestamp_column} BETWEEN TIMESTAMP('{start_date}') AND TIMESTAMP('{end_date}')
            )
            WHERE _feast_row = 1
            """
    return BigQueryRetrievalJob(query=query, client=client, config=config)