def _get_entity_df_event_timestamp_range(
    entity_df: Union[pd.DataFrame, str],
    entity_df_event_timestamp_col: str,
    client: Trino,
) -> Tuple[datetime, datetime]:
    if isinstance(entity_df, str):
        results = client.execute_query(
            f"SELECT MIN({entity_df_event_timestamp_col}) AS min, MAX({entity_df_event_timestamp_col}) AS max "
            f"FROM ({entity_df})"
        )
        entity_df_event_timestamp_range = (
            pd.to_datetime(results.data[0][0]).to_pydatetime(),
            pd.to_datetime(results.data[0][1]).to_pydatetime(),
        )
    elif isinstance(entity_df, pd.DataFrame):
        entity_df_event_timestamp = entity_df.loc[
            :, entity_df_event_timestamp_col
        ].infer_objects()
        if pd.api.types.is_string_dtype(entity_df_event_timestamp):
            entity_df_event_timestamp = pd.to_datetime(
                entity_df_event_timestamp, utc=True
            )
        entity_df_event_timestamp_range = (
            entity_df_event_timestamp.min().to_pydatetime(),
            entity_df_event_timestamp.max().to_pydatetime(),
        )
    else:
        raise InvalidEntityType(type(entity_df))

    return entity_df_event_timestamp_range
def _get_entity_df_event_timestamp_range(
    entity_df: Union[pd.DataFrame, str],
    entity_df_event_timestamp_col: str,
    snowflake_conn: SnowflakeConnection,
) -> Tuple[datetime, datetime]:
    if isinstance(entity_df, pd.DataFrame):
        entity_df_event_timestamp = entity_df.loc[
            :, entity_df_event_timestamp_col
        ].infer_objects()
        if pd.api.types.is_string_dtype(entity_df_event_timestamp):
            entity_df_event_timestamp = pd.to_datetime(
                entity_df_event_timestamp, utc=True
            )
        entity_df_event_timestamp_range = (
            entity_df_event_timestamp.min().to_pydatetime(),
            entity_df_event_timestamp.max().to_pydatetime(),
        )
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), determine the range from the table
        query = (
            f'SELECT MIN("{entity_df_event_timestamp_col}") AS "min_value", '
            f'MAX("{entity_df_event_timestamp_col}") AS "max_value" '
            f"FROM ({entity_df})"
        )
        results = execute_snowflake_statement(snowflake_conn, query).fetchall()
        entity_df_event_timestamp_range = cast(Tuple[datetime, datetime], results[0])
    else:
        raise InvalidEntityType(type(entity_df))

    return entity_df_event_timestamp_range
def _upload_entity_df_and_get_entity_schema(
    client: Trino,
    table_name: str,
    entity_df: Union[pd.DataFrame, str],
    connector: Dict[str, str],
) -> Dict[str, np.dtype]:
    """Uploads a Pandas entity dataframe into a Trino table and returns the schema of the resulting table"""
    if isinstance(entity_df, str):
        client.execute_query(f"CREATE TABLE {table_name} AS ({entity_df})")
        results = client.execute_query(f"SELECT * FROM {table_name} LIMIT 1")

        limited_entity_df = pd.DataFrame(
            data=results.data, columns=results.columns_names
        )
        for col_name, col_type in results.schema.items():
            if col_type == "timestamp":
                limited_entity_df[col_name] = pd.to_datetime(limited_entity_df[col_name])
        entity_schema = dict(zip(limited_entity_df.columns, limited_entity_df.dtypes))

        return entity_schema
    elif isinstance(entity_df, pd.DataFrame):
        upload_pandas_dataframe_to_trino(
            client=client,
            df=entity_df,
            table=table_name,
            connector_args=connector,
        )
        entity_schema = dict(zip(entity_df.columns, entity_df.dtypes))

        return entity_schema
    else:
        raise InvalidEntityType(type(entity_df))
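# Hypothetical usage sketch, not taken from the source: it shows how the two Trino
# helpers above might be combined. The `client`, `table_name`, and `connector`
# arguments and the column names are illustrative assumptions; the module-level
# imports (pd, np, datetime, Tuple, Dict) are assumed to be present.
def _example_trino_entity_df_flow(
    client: Trino, table_name: str, connector: Dict[str, str]
) -> Tuple[datetime, datetime]:
    entity_df = pd.DataFrame(
        {
            "driver_id": [1001, 1002],
            "event_timestamp": pd.to_datetime(
                ["2021-04-12 10:59:42", "2021-04-12 08:12:10"], utc=True
            ),
        }
    )
    # Upload the entity dataframe and capture its schema for later type mapping
    entity_schema = _upload_entity_df_and_get_entity_schema(
        client=client, table_name=table_name, entity_df=entity_df, connector=connector
    )
    assert "event_timestamp" in entity_schema
    # Compute the min/max event timestamps used to bound the point-in-time join
    return _get_entity_df_event_timestamp_range(entity_df, "event_timestamp", client)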
def _get_entity_df_event_timestamp_range(
    entity_df: Union[pd.DataFrame, str],
    entity_df_event_timestamp_col: str,
    client: Client,
) -> Tuple[datetime, datetime]:
    if isinstance(entity_df, str):
        job = client.query(
            f"SELECT MIN({entity_df_event_timestamp_col}) AS min, MAX({entity_df_event_timestamp_col}) AS max "
            f"FROM ({entity_df})"
        )
        res = next(iter(job.result()))
        entity_df_event_timestamp_range = (
            res.get("min"),
            res.get("max"),
        )
    elif isinstance(entity_df, pd.DataFrame):
        entity_df_event_timestamp = entity_df.loc[
            :, entity_df_event_timestamp_col
        ].infer_objects()
        if pd.api.types.is_string_dtype(entity_df_event_timestamp):
            entity_df_event_timestamp = pd.to_datetime(
                entity_df_event_timestamp, utc=True
            )
        entity_df_event_timestamp_range = (
            entity_df_event_timestamp.min().to_pydatetime(),
            entity_df_event_timestamp.max().to_pydatetime(),
        )
    else:
        raise InvalidEntityType(type(entity_df))

    return entity_df_event_timestamp_range
def _upload_entity_df(
    entity_df: Union[pd.DataFrame, str],
    snowflake_conn: SnowflakeConnection,
    config: RepoConfig,
    table_name: str,
) -> None:
    if isinstance(entity_df, pd.DataFrame):
        # Write the data from the DataFrame to the table
        write_pandas(
            snowflake_conn,
            entity_df,
            table_name,
            auto_create_table=True,
            create_temp_table=True,
        )
        return None
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), create a Snowflake table out of it
        query = f'CREATE TEMPORARY TABLE "{table_name}" AS ({entity_df})'
        execute_snowflake_statement(snowflake_conn, query)
        return None
    else:
        raise InvalidEntityType(type(entity_df))
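# Hypothetical usage sketch, not taken from the source: it combines the Snowflake
# helpers above. `snowflake_conn`, `config`, and `table_name` are illustrative
# assumptions, and the column names are made up for the example.
def _example_snowflake_entity_df_flow(
    snowflake_conn: SnowflakeConnection, config: RepoConfig, table_name: str
) -> Tuple[datetime, datetime]:
    # A SQL string is also accepted as entity_df; a pandas dataframe is used here
    entity_df = pd.DataFrame(
        {
            "driver_id": [1001, 1002],
            "event_timestamp": pd.to_datetime(
                ["2021-04-12 10:59:42", "2021-04-12 08:12:10"], utc=True
            ),
        }
    )
    # Stage the entity dataframe as a temporary table
    _upload_entity_df(entity_df, snowflake_conn, config, table_name)
    # Bound the point-in-time join by the entity dataframe's timestamp range
    return _get_entity_df_event_timestamp_range(
        entity_df, "event_timestamp", snowflake_conn
    )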
def _upload_entity_df_and_get_entity_schema(
    client: Client,
    table_name: str,
    entity_df: Union[pd.DataFrame, str],
) -> Dict[str, np.dtype]:
    """Uploads a Pandas entity dataframe into a BigQuery table and returns the schema of the resulting table"""
    if isinstance(entity_df, str):
        job = client.query(f"CREATE TABLE {table_name} AS ({entity_df})")
        block_until_done(client, job)

        limited_entity_df = (
            client.query(f"SELECT * FROM {table_name} LIMIT 1").result().to_dataframe()
        )
        entity_schema = dict(zip(limited_entity_df.columns, limited_entity_df.dtypes))
    elif isinstance(entity_df, pd.DataFrame):
        # Drop the index so that we don't have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)

        job = client.load_table_from_dataframe(entity_df, table_name)
        block_until_done(client, job)
        entity_schema = dict(zip(entity_df.columns, entity_df.dtypes))
    else:
        raise InvalidEntityType(type(entity_df))

    # Ensure that the table expires after some time
    table = client.get_table(table=table_name)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return entity_schema
def _upload_entity_df(
    client: Client,
    table_name: str,
    entity_df: Union[pd.DataFrame, str],
) -> Table:
    """Uploads a Pandas entity dataframe into a BigQuery table and returns the resulting table"""
    if isinstance(entity_df, str):
        job = client.query(f"CREATE TABLE {table_name} AS ({entity_df})")
    elif isinstance(entity_df, pd.DataFrame):
        # Drop the index so that we don't have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)
        job = client.load_table_from_dataframe(entity_df, table_name)
    else:
        raise InvalidEntityType(type(entity_df))

    block_until_done(client, job)

    # Ensure that the table expires after some time
    table = client.get_table(table=table_name)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return table
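# Hypothetical usage sketch, not taken from the source: it strings together the
# BigQuery helpers above. `client`, `table_name`, and the SQL entity_df (including
# the `project.dataset.entities` table reference) are illustrative assumptions; the
# 30-minute expiry is set inside _upload_entity_df.
def _example_bigquery_entity_df_flow(
    client: Client, table_name: str
) -> Tuple[datetime, datetime]:
    entity_sql = "SELECT driver_id, event_timestamp FROM `project.dataset.entities`"
    # Materialize the SQL entity_df into a temporary, expiring BigQuery table
    table = _upload_entity_df(client, table_name, entity_sql)
    assert table.expires is not None
    # Compute the timestamp bounds directly from the SQL entity_df
    return _get_entity_df_event_timestamp_range(entity_sql, "event_timestamp", client)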
def _get_entity_df_event_timestamp_range(
    entity_df: Union[pd.DataFrame, str],
    entity_df_event_timestamp_col: str,
    config: RepoConfig,
    table_name: str,
) -> Tuple[datetime, datetime]:
    if isinstance(entity_df, pd.DataFrame):
        entity_df_event_timestamp = entity_df.loc[
            :, entity_df_event_timestamp_col
        ].infer_objects()
        if pd.api.types.is_string_dtype(entity_df_event_timestamp):
            entity_df_event_timestamp = pd.to_datetime(
                entity_df_event_timestamp, utc=True
            )
        entity_df_event_timestamp_range = (
            entity_df_event_timestamp.min(),
            entity_df_event_timestamp.max(),
        )
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), determine the range from the table
        with _get_conn(config.offline_store) as conn, conn.cursor() as cur:
            cur.execute(
                f"SELECT MIN({entity_df_event_timestamp_col}) AS min, "
                f"MAX({entity_df_event_timestamp_col}) AS max FROM {table_name}"
            )
            res = cur.fetchone()
        entity_df_event_timestamp_range = (res[0], res[1])
    else:
        raise InvalidEntityType(type(entity_df))

    return entity_df_event_timestamp_range
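# Hypothetical usage sketch, not taken from the source: it calls the Postgres variant
# above. `config`, `table_name`, and the SQL entity_df are illustrative assumptions,
# and the sketch assumes the query has already been materialized into `table_name`.
def _example_postgres_timestamp_range(
    config: RepoConfig, table_name: str
) -> Tuple[datetime, datetime]:
    entity_sql = "SELECT driver_id, event_timestamp FROM entity_source"
    # For a SQL entity_df the range is read from the already-uploaded staging table
    return _get_entity_df_event_timestamp_range(
        entity_sql, "event_timestamp", config, table_name
    )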
def _get_entity_df_event_timestamp_range(
    entity_df: Union[pd.DataFrame, str],
    entity_df_event_timestamp_col: str,
    spark_session: SparkSession,
) -> Tuple[datetime, datetime]:
    if isinstance(entity_df, pd.DataFrame):
        entity_df_event_timestamp = entity_df.loc[
            :, entity_df_event_timestamp_col
        ].infer_objects()
        if pd.api.types.is_string_dtype(entity_df_event_timestamp):
            entity_df_event_timestamp = pd.to_datetime(
                entity_df_event_timestamp, utc=True
            )
        entity_df_event_timestamp_range = (
            entity_df_event_timestamp.min().to_pydatetime(),
            entity_df_event_timestamp.max().to_pydatetime(),
        )
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), determine the range from the table
        df = spark_session.sql(entity_df).select(entity_df_event_timestamp_col)
        # TODO(kzhang132): need utc conversion here.
        # Return (min, max) to match the ordering of the DataFrame branch above
        entity_df_event_timestamp_range = (
            df.agg({entity_df_event_timestamp_col: "min"}).collect()[0][0],
            df.agg({entity_df_event_timestamp_col: "max"}).collect()[0][0],
        )
    else:
        raise InvalidEntityType(type(entity_df))

    return entity_df_event_timestamp_range
def _upload_entity_df(
    entity_df: Union[pd.DataFrame, str],
    redshift_client,
    config: RepoConfig,
    s3_resource,
    table_name: str,
):
    if isinstance(entity_df, pd.DataFrame):
        # If the entity_df is a pandas dataframe, upload it to Redshift
        aws_utils.upload_df_to_redshift(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            s3_resource,
            f"{config.offline_store.s3_staging_location}/entity_df/{table_name}.parquet",
            config.offline_store.iam_role,
            table_name,
            entity_df,
        )
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), create a Redshift table out of it
        aws_utils.execute_redshift_statement(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            f"CREATE TABLE {table_name} AS ({entity_df})",
        )
    else:
        raise InvalidEntityType(type(entity_df))
def _get_entity_schema(
    spark_session: SparkSession, entity_df: Union[pd.DataFrame, str]
) -> Dict[str, np.dtype]:
    if isinstance(entity_df, pd.DataFrame):
        return dict(zip(entity_df.columns, entity_df.dtypes))
    elif isinstance(entity_df, str):
        entity_spark_df = spark_session.sql(entity_df)
        return dict(
            zip(
                entity_spark_df.columns,
                spark_schema_to_np_dtypes(entity_spark_df.dtypes),
            )
        )
    else:
        raise InvalidEntityType(type(entity_df))
def _get_entity_schema(
    client: Client, entity_df: Union[pd.DataFrame, str]
) -> Dict[str, np.dtype]:
    if isinstance(entity_df, str):
        entity_df_sample = (
            client.query(f"SELECT * FROM ({entity_df}) LIMIT 1").result().to_dataframe()
        )
        entity_schema = dict(zip(entity_df_sample.columns, entity_df_sample.dtypes))
    elif isinstance(entity_df, pd.DataFrame):
        entity_schema = dict(zip(entity_df.columns, entity_df.dtypes))
    else:
        raise InvalidEntityType(type(entity_df))

    return entity_schema
def _upload_entity_df(
    spark_session: SparkSession,
    table_name: str,
    entity_df: Union[pd.DataFrame, str],
    event_timestamp_col: str,
) -> None:
    if isinstance(entity_df, pd.DataFrame):
        entity_df[event_timestamp_col] = pd.to_datetime(
            entity_df[event_timestamp_col], utc=True
        )
        spark_session.createDataFrame(entity_df).createOrReplaceTempView(table_name)
        return
    elif isinstance(entity_df, str):
        spark_session.sql(entity_df).createOrReplaceTempView(table_name)
        return
    else:
        raise InvalidEntityType(type(entity_df))
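# Hypothetical usage sketch, not taken from the source: it combines the Spark
# helpers above. `spark_session`, `table_name`, and the column names are
# illustrative assumptions.
def _example_spark_entity_df_flow(
    spark_session: SparkSession, table_name: str
) -> Tuple[datetime, datetime]:
    entity_df = pd.DataFrame(
        {
            "driver_id": [1001, 1002],
            "event_timestamp": ["2021-04-12 10:59:42", "2021-04-12 08:12:10"],
        }
    )
    # Register the entity dataframe as a temp view so feature queries can join against it
    _upload_entity_df(spark_session, table_name, entity_df, "event_timestamp")
    # Map the pandas dtypes so feature columns can be cast consistently
    entity_schema = _get_entity_schema(spark_session, entity_df)
    assert "event_timestamp" in entity_schema
    # Bound the point-in-time join by the entity dataframe's timestamp range
    return _get_entity_df_event_timestamp_range(
        entity_df, "event_timestamp", spark_session
    )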
def _upload_entity_df_and_get_entity_schema(
    entity_df: Union[pd.DataFrame, str],
    redshift_client,
    config: RepoConfig,
    s3_resource,
    table_name: str,
) -> Dict[str, np.dtype]:
    if isinstance(entity_df, pd.DataFrame):
        # If the entity_df is a pandas dataframe, upload it to Redshift
        # and construct the schema from the original entity_df dataframe
        aws_utils.upload_df_to_redshift(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            s3_resource,
            f"{config.offline_store.s3_staging_location}/entity_df/{table_name}.parquet",
            config.offline_store.iam_role,
            table_name,
            entity_df,
        )
        return dict(zip(entity_df.columns, entity_df.dtypes))
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), create a Redshift table out of it,
        # fetch a one-row sample (LIMIT 1) as a pandas dataframe and generate the schema from it
        aws_utils.execute_redshift_statement(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            f"CREATE TABLE {table_name} AS ({entity_df})",
        )
        limited_entity_df = RedshiftRetrievalJob(
            f"SELECT * FROM {table_name} LIMIT 1",
            redshift_client,
            s3_resource,
            config,
            full_feature_names=False,
            on_demand_feature_views=None,
        ).to_df()
        return dict(zip(limited_entity_df.columns, limited_entity_df.dtypes))
    else:
        raise InvalidEntityType(type(entity_df))
def _get_entity_schema(
    entity_df: Union[pd.DataFrame, str],
    redshift_client,
    config: RepoConfig,
    s3_resource,
) -> Dict[str, np.dtype]:
    if isinstance(entity_df, pd.DataFrame):
        return dict(zip(entity_df.columns, entity_df.dtypes))
    elif isinstance(entity_df, str):
        # Fetch a one-row sample (LIMIT 1) as a pandas dataframe and generate the schema from it
        entity_df_sample = RedshiftRetrievalJob(
            f"SELECT * FROM ({entity_df}) LIMIT 1",
            redshift_client,
            s3_resource,
            config,
            full_feature_names=False,
        ).to_df()
        return dict(zip(entity_df_sample.columns, entity_df_sample.dtypes))
    else:
        raise InvalidEntityType(type(entity_df))
def _get_entity_df_event_timestamp_range(
    entity_df: Union[pd.DataFrame, str],
    entity_df_event_timestamp_col: str,
    redshift_client,
    config: RepoConfig,
    table_name: str,
) -> Tuple[datetime, datetime]:
    if isinstance(entity_df, pd.DataFrame):
        entity_df_event_timestamp = entity_df.loc[
            :, entity_df_event_timestamp_col
        ].infer_objects()
        if pd.api.types.is_string_dtype(entity_df_event_timestamp):
            entity_df_event_timestamp = pd.to_datetime(
                entity_df_event_timestamp, utc=True
            )
        entity_df_event_timestamp_range = (
            entity_df_event_timestamp.min(),
            entity_df_event_timestamp.max(),
        )
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), determine the range from the table
        statement_id = aws_utils.execute_redshift_statement(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            f"SELECT MIN({entity_df_event_timestamp_col}) AS min, "
            f"MAX({entity_df_event_timestamp_col}) AS max FROM {table_name}",
        )
        res = aws_utils.get_redshift_statement_result(redshift_client, statement_id)[
            "Records"
        ][0]
        entity_df_event_timestamp_range = (
            parser.parse(res[0]["stringValue"]),
            parser.parse(res[1]["stringValue"]),
        )
    else:
        raise InvalidEntityType(type(entity_df))

    return entity_df_event_timestamp_range
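# Hypothetical usage sketch, not taken from the source: it chains the Redshift
# helpers above. `redshift_client`, `s3_resource`, `config`, `table_name`, and the
# SQL entity_df are illustrative assumptions.
def _example_redshift_entity_df_flow(
    redshift_client, config: RepoConfig, s3_resource, table_name: str
) -> Tuple[datetime, datetime]:
    entity_sql = "SELECT driver_id, event_timestamp FROM entity_source"
    # Derive the schema from a one-row sample of the query
    entity_schema = _get_entity_schema(entity_sql, redshift_client, config, s3_resource)
    assert "event_timestamp" in entity_schema
    # Materialize the query into a staging table...
    _upload_entity_df(entity_sql, redshift_client, config, s3_resource, table_name)
    # ...and compute the timestamp bounds from that table
    return _get_entity_df_event_timestamp_range(
        entity_sql, "event_timestamp", redshift_client, config, table_name
    )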