Example #1
File: trino.py Project: feast-dev/feast
def _get_entity_df_event_timestamp_range(
    entity_df: Union[pd.DataFrame, str],
    entity_df_event_timestamp_col: str,
    client: Trino,
) -> Tuple[datetime, datetime]:
    if isinstance(entity_df, str):
        results = client.execute_query(
            f"SELECT MIN({entity_df_event_timestamp_col}) AS min, MAX({entity_df_event_timestamp_col}) AS max "
            f"FROM ({entity_df})")

        entity_df_event_timestamp_range = (
            pd.to_datetime(results.data[0][0]).to_pydatetime(),
            pd.to_datetime(results.data[0][1]).to_pydatetime(),
        )
    elif isinstance(entity_df, pd.DataFrame):
        entity_df_event_timestamp = entity_df.loc[
            :, entity_df_event_timestamp_col
        ].infer_objects()
        if pd.api.types.is_string_dtype(entity_df_event_timestamp):
            entity_df_event_timestamp = pd.to_datetime(
                entity_df_event_timestamp, utc=True)
        entity_df_event_timestamp_range = (
            entity_df_event_timestamp.min().to_pydatetime(),
            entity_df_event_timestamp.max().to_pydatetime(),
        )
    else:
        raise InvalidEntityType(type(entity_df))

    return entity_df_event_timestamp_range
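
The DataFrame branch is shared almost verbatim by the backends below; here is a minimal, self-contained sketch of that pattern (toy data and column name are illustrative):

import pandas as pd

# Toy entity dataframe with string timestamps; the column name is illustrative.
entity_df = pd.DataFrame(
    {"event_timestamp": ["2021-01-01T00:00:00Z", "2021-06-01T12:00:00Z"]}
)

ts = entity_df.loc[:, "event_timestamp"].infer_objects()
if pd.api.types.is_string_dtype(ts):
    # String timestamps are normalized to timezone-aware datetimes.
    ts = pd.to_datetime(ts, utc=True)

print(ts.min().to_pydatetime(), ts.max().to_pydatetime())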
Example #2
def _get_entity_df_event_timestamp_range(
    entity_df: Union[pd.DataFrame, str],
    entity_df_event_timestamp_col: str,
    snowflake_conn: SnowflakeConnection,
) -> Tuple[datetime, datetime]:
    if isinstance(entity_df, pd.DataFrame):
        entity_df_event_timestamp = entity_df.loc[
            :, entity_df_event_timestamp_col
        ].infer_objects()
        if pd.api.types.is_string_dtype(entity_df_event_timestamp):
            entity_df_event_timestamp = pd.to_datetime(
                entity_df_event_timestamp, utc=True)
        entity_df_event_timestamp_range = (
            entity_df_event_timestamp.min().to_pydatetime(),
            entity_df_event_timestamp.max().to_pydatetime(),
        )
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), determine the range
        # directly from the query
        query = f'SELECT MIN("{entity_df_event_timestamp_col}") AS "min_value", MAX("{entity_df_event_timestamp_col}") AS "max_value" FROM ({entity_df})'
        results = execute_snowflake_statement(snowflake_conn, query).fetchall()

        entity_df_event_timestamp_range = cast(Tuple[datetime, datetime],
                                               results[0])
    else:
        raise InvalidEntityType(type(entity_df))

    return entity_df_event_timestamp_range
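
Unlike the other backends, this Snowflake branch does not parse the fetched values; it relies on the connector returning datetime objects and uses cast only to satisfy the type checker. A minimal sketch of that no-op (values are illustrative):

from datetime import datetime
from typing import Tuple, cast

# cast() does nothing at runtime; it only tells the type checker that the
# fetched row holds two datetimes.
row = (datetime(2021, 1, 1), datetime(2021, 6, 1, 12))
ts_range = cast(Tuple[datetime, datetime], row)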
Example #3
File: trino.py Project: feast-dev/feast
def _upload_entity_df_and_get_entity_schema(
    client: Trino,
    table_name: str,
    entity_df: Union[pd.DataFrame, str],
    connector: Dict[str, str],
) -> Dict[str, np.dtype]:
    """Uploads a Pandas entity dataframe into a Trino table and returns the resulting table"""
    if isinstance(entity_df, str):
        client.execute_query(f"CREATE TABLE {table_name} AS ({entity_df})")

        results = client.execute_query(f"SELECT * FROM {table_name} LIMIT 1")

        limited_entity_df = pd.DataFrame(data=results.data,
                                         columns=results.columns_names)
        for col_name, col_type in results.schema.items():
            if col_type == "timestamp":
                limited_entity_df[col_name] = pd.to_datetime(
                    limited_entity_df[col_name])
        entity_schema = dict(
            zip(limited_entity_df.columns, limited_entity_df.dtypes))

        return entity_schema
    elif isinstance(entity_df, pd.DataFrame):
        upload_pandas_dataframe_to_trino(client=client,
                                         df=entity_df,
                                         table=table_name,
                                         connector_args=connector)
        entity_schema = dict(zip(entity_df.columns, entity_df.dtypes))
        return entity_schema
    else:
        raise InvalidEntityType(type(entity_df))
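
Both branches build the schema the same way, by zipping column names with pandas dtypes. A quick self-contained illustration (toy data):

import pandas as pd

df = pd.DataFrame(
    {"driver_id": [1001], "event_timestamp": pd.to_datetime(["2021-01-01"])}
)
schema = dict(zip(df.columns, df.dtypes))
# {'driver_id': dtype('int64'), 'event_timestamp': dtype('<M8[ns]')}
print(schema)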
Example #4
File: bigquery.py Project: feast-dev/feast
def _get_entity_df_event_timestamp_range(
    entity_df: Union[pd.DataFrame, str],
    entity_df_event_timestamp_col: str,
    client: Client,
) -> Tuple[datetime, datetime]:
    if isinstance(entity_df, str):
        job = client.query(
            f"SELECT MIN({entity_df_event_timestamp_col}) AS min, MAX({entity_df_event_timestamp_col}) AS max "
            f"FROM ({entity_df})")
        res = next(job.result())
        entity_df_event_timestamp_range = (
            res.get("min"),
            res.get("max"),
        )
    elif isinstance(entity_df, pd.DataFrame):
        entity_df_event_timestamp = entity_df.loc[
            :, entity_df_event_timestamp_col
        ].infer_objects()
        if pd.api.types.is_string_dtype(entity_df_event_timestamp):
            entity_df_event_timestamp = pd.to_datetime(
                entity_df_event_timestamp, utc=True)
        entity_df_event_timestamp_range = (
            entity_df_event_timestamp.min().to_pydatetime(),
            entity_df_event_timestamp.max().to_pydatetime(),
        )
    else:
        raise InvalidEntityType(type(entity_df))

    return entity_df_event_timestamp_range
Example #5
def _upload_entity_df(
    entity_df: Union[pd.DataFrame, str],
    snowflake_conn: SnowflakeConnection,
    config: RepoConfig,
    table_name: str,
) -> None:

    if isinstance(entity_df, pd.DataFrame):
        # Write the data from the DataFrame to the table
        write_pandas(
            snowflake_conn,
            entity_df,
            table_name,
            auto_create_table=True,
            create_temp_table=True,
        )

        return None
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), create a Snowflake table out of it
        query = f'CREATE TEMPORARY TABLE "{table_name}" AS ({entity_df})'
        execute_snowflake_statement(snowflake_conn, query)

        return None
    else:
        raise InvalidEntityType(type(entity_df))
Example #6
def _upload_entity_df_and_get_entity_schema(
    client: Client, table_name: str, entity_df: Union[pd.DataFrame, str],
) -> Dict[str, np.dtype]:
    """Uploads a Pandas entity dataframe into a BigQuery table and returns the resulting table"""

    if isinstance(entity_df, str):
        job = client.query(f"CREATE TABLE {table_name} AS ({entity_df})")
        block_until_done(client, job)

        limited_entity_df = (
            client.query(f"SELECT * FROM {table_name} LIMIT 1").result().to_dataframe()
        )

        entity_schema = dict(zip(limited_entity_df.columns, limited_entity_df.dtypes))
    elif isinstance(entity_df, pd.DataFrame):
        # Drop the index so that we don't have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)
        job = client.load_table_from_dataframe(entity_df, table_name)
        block_until_done(client, job)
        entity_schema = dict(zip(entity_df.columns, entity_df.dtypes))
    else:
        raise InvalidEntityType(type(entity_df))

    # Ensure that the table expires after some time
    table = client.get_table(table=table_name)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return entity_schema
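
The expiry uses datetime.utcnow(), which returns a naive timestamp and is deprecated as of Python 3.12; a timezone-aware equivalent (a suggested variant, not the project's code) would be:

from datetime import datetime, timedelta, timezone

# Timezone-aware replacement for the naive utcnow() call above (suggestion).
table.expires = datetime.now(timezone.utc) + timedelta(minutes=30)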
Example #7
File: bigquery.py Project: feast-dev/feast
def _upload_entity_df(
    client: Client,
    table_name: str,
    entity_df: Union[pd.DataFrame, str],
) -> Table:
    """Uploads a Pandas entity dataframe into a BigQuery table and returns the resulting table"""

    if isinstance(entity_df, str):
        job = client.query(f"CREATE TABLE {table_name} AS ({entity_df})")

    elif isinstance(entity_df, pd.DataFrame):
        # Drop the index so that we don't have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)
        job = client.load_table_from_dataframe(entity_df, table_name)
    else:
        raise InvalidEntityType(type(entity_df))

    block_until_done(client, job)

    # Ensure that the table expires after some time
    table = client.get_table(table=table_name)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return table
Example #8
File: postgres.py Project: feast-dev/feast
def _get_entity_df_event_timestamp_range(
    entity_df: Union[pd.DataFrame, str],
    entity_df_event_timestamp_col: str,
    config: RepoConfig,
    table_name: str,
) -> Tuple[datetime, datetime]:
    if isinstance(entity_df, pd.DataFrame):
        entity_df_event_timestamp = entity_df.loc[
            :, entity_df_event_timestamp_col
        ].infer_objects()
        if pd.api.types.is_string_dtype(entity_df_event_timestamp):
            entity_df_event_timestamp = pd.to_datetime(
                entity_df_event_timestamp, utc=True)
        entity_df_event_timestamp_range = (
            entity_df_event_timestamp.min(),
            entity_df_event_timestamp.max(),
        )
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), determine the range
        # from the uploaded table
        with _get_conn(config.offline_store) as conn, conn.cursor() as cur:
            cur.execute(
                f"SELECT MIN({entity_df_event_timestamp_col}) AS min, MAX({entity_df_event_timestamp_col}) AS max FROM {table_name}"
            )
            res = cur.fetchone()
        entity_df_event_timestamp_range = (res[0], res[1])
    else:
        raise InvalidEntityType(type(entity_df))

    return entity_df_event_timestamp_range
Example #9
def _get_entity_df_event_timestamp_range(
    entity_df: Union[pd.DataFrame, str],
    entity_df_event_timestamp_col: str,
    spark_session: SparkSession,
) -> Tuple[datetime, datetime]:
    if isinstance(entity_df, pd.DataFrame):
        entity_df_event_timestamp = entity_df.loc[
            :, entity_df_event_timestamp_col
        ].infer_objects()
        if pd.api.types.is_string_dtype(entity_df_event_timestamp):
            entity_df_event_timestamp = pd.to_datetime(
                entity_df_event_timestamp, utc=True)
        entity_df_event_timestamp_range = (
            entity_df_event_timestamp.min().to_pydatetime(),
            entity_df_event_timestamp.max().to_pydatetime(),
        )
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), determine the range
        # directly from the query
        df = spark_session.sql(entity_df).select(entity_df_event_timestamp_col)
        # TODO(kzhang132): need utc conversion here.
        entity_df_event_timestamp_range = (
            df.agg({entity_df_event_timestamp_col: "min"}).collect()[0][0],
            df.agg({entity_df_event_timestamp_col: "max"}).collect()[0][0],
        )
    else:
        raise InvalidEntityType(type(entity_df))

    return entity_df_event_timestamp_range
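
Each agg(...).collect() pair above launches its own Spark job; a single-pass variant (a sketch reusing the names from the function above, not the project's code) computes both bounds at once:

from pyspark.sql import functions as F

# One job instead of two: aggregate MIN and MAX together.
row = df.agg(
    F.min(entity_df_event_timestamp_col),
    F.max(entity_df_event_timestamp_col),
).collect()[0]
entity_df_event_timestamp_range = (row[0], row[1])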
Example #10
File: redshift.py Project: feast-dev/feast
def _upload_entity_df(
    entity_df: Union[pd.DataFrame, str],
    redshift_client,
    config: RepoConfig,
    s3_resource,
    table_name: str,
):
    if isinstance(entity_df, pd.DataFrame):
        # If the entity_df is a pandas dataframe, upload it to Redshift
        aws_utils.upload_df_to_redshift(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            s3_resource,
            f"{config.offline_store.s3_staging_location}/entity_df/{table_name}.parquet",
            config.offline_store.iam_role,
            table_name,
            entity_df,
        )
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), create a Redshift table out of it
        aws_utils.execute_redshift_statement(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            f"CREATE TABLE {table_name} AS ({entity_df})",
        )
    else:
        raise InvalidEntityType(type(entity_df))
Example #11
def _get_entity_schema(
    spark_session: SparkSession,
    entity_df: Union[pd.DataFrame, str],
) -> Dict[str, np.dtype]:
    if isinstance(entity_df, pd.DataFrame):
        return dict(zip(entity_df.columns, entity_df.dtypes))
    elif isinstance(entity_df, str):
        entity_spark_df = spark_session.sql(entity_df)
        return dict(
            zip(
                entity_spark_df.columns,
                spark_schema_to_np_dtypes(entity_spark_df.dtypes),
            ))
    else:
        raise InvalidEntityType(type(entity_df))
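
Spark's DataFrame.dtypes, which spark_schema_to_np_dtypes consumes here, is a list of (column, type-string) pairs; a quick local check (assumes a local SparkSession is available):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1, "a")], ["id", "name"])
print(df.dtypes)  # [('id', 'bigint'), ('name', 'string')]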
Example #12
File: bigquery.py Project: feast-dev/feast
def _get_entity_schema(
    client: Client,
    entity_df: Union[pd.DataFrame, str],
) -> Dict[str, np.dtype]:
    if isinstance(entity_df, str):
        entity_df_sample = (
            client.query(f"SELECT * FROM ({entity_df}) LIMIT 1")
            .result()
            .to_dataframe()
        )

        entity_schema = dict(
            zip(entity_df_sample.columns, entity_df_sample.dtypes))
    elif isinstance(entity_df, pd.DataFrame):
        entity_schema = dict(zip(entity_df.columns, entity_df.dtypes))
    else:
        raise InvalidEntityType(type(entity_df))

    return entity_schema
Example #13
def _upload_entity_df(
    spark_session: SparkSession,
    table_name: str,
    entity_df: Union[pd.DataFrame, str],
    event_timestamp_col: str,
) -> None:
    if isinstance(entity_df, pd.DataFrame):
        entity_df[event_timestamp_col] = pd.to_datetime(
            entity_df[event_timestamp_col], utc=True)
        spark_session.createDataFrame(entity_df).createOrReplaceTempView(
            table_name)
        return
    elif isinstance(entity_df, str):
        spark_session.sql(entity_df).createOrReplaceTempView(table_name)
        return
    else:
        raise InvalidEntityType(type(entity_df))
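
Once registered, the temp view is queryable by name within the same SparkSession; a minimal end-to-end sketch (toy data and names are illustrative):

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
pdf = pd.DataFrame(
    {"driver_id": [1001],
     "event_timestamp": pd.to_datetime(["2021-01-01"], utc=True)}
)
spark.createDataFrame(pdf).createOrReplaceTempView("entity_df")
spark.sql("SELECT * FROM entity_df LIMIT 1").show()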
Example #14
File: redshift.py Project: dmatrix/feast
def _upload_entity_df_and_get_entity_schema(
    entity_df: Union[pd.DataFrame, str],
    redshift_client,
    config: RepoConfig,
    s3_resource,
    table_name: str,
) -> Dict[str, np.dtype]:
    if isinstance(entity_df, pd.DataFrame):
        # If the entity_df is a pandas dataframe, upload it to Redshift
        # and construct the schema from the original entity_df dataframe
        aws_utils.upload_df_to_redshift(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            s3_resource,
            f"{config.offline_store.s3_staging_location}/entity_df/{table_name}.parquet",
            config.offline_store.iam_role,
            table_name,
            entity_df,
        )
        return dict(zip(entity_df.columns, entity_df.dtypes))
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), create a Redshift table from it,
        # fetch a one-row sample (LIMIT 1), and derive the schema from that sample
        aws_utils.execute_redshift_statement(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            f"CREATE TABLE {table_name} AS ({entity_df})",
        )
        limited_entity_df = RedshiftRetrievalJob(
            f"SELECT * FROM {table_name} LIMIT 1",
            redshift_client,
            s3_resource,
            config,
            full_feature_names=False,
            on_demand_feature_views=None,
        ).to_df()
        return dict(zip(limited_entity_df.columns, limited_entity_df.dtypes))
    else:
        raise InvalidEntityType(type(entity_df))
Example #15
File: redshift.py Project: feast-dev/feast
def _get_entity_schema(
    entity_df: Union[pd.DataFrame, str],
    redshift_client,
    config: RepoConfig,
    s3_resource,
) -> Dict[str, np.dtype]:
    if isinstance(entity_df, pd.DataFrame):
        return dict(zip(entity_df.columns, entity_df.dtypes))

    elif isinstance(entity_df, str):
        # fetch a one-row sample (LIMIT 1) and derive the schema from it
        entity_df_sample = RedshiftRetrievalJob(
            f"SELECT * FROM ({entity_df}) LIMIT 1",
            redshift_client,
            s3_resource,
            config,
            full_feature_names=False,
        ).to_df()
        return dict(zip(entity_df_sample.columns, entity_df_sample.dtypes))
    else:
        raise InvalidEntityType(type(entity_df))
Example #16
def _get_entity_df_event_timestamp_range(
    entity_df: Union[pd.DataFrame, str],
    entity_df_event_timestamp_col: str,
    redshift_client,
    config: RepoConfig,
    table_name: str,
) -> Tuple[datetime, datetime]:
    if isinstance(entity_df, pd.DataFrame):
        entity_df_event_timestamp = entity_df.loc[
            :, entity_df_event_timestamp_col
        ].infer_objects()
        if pd.api.types.is_string_dtype(entity_df_event_timestamp):
            entity_df_event_timestamp = pd.to_datetime(
                entity_df_event_timestamp, utc=True)
        entity_df_event_timestamp_range = (
            entity_df_event_timestamp.min(),
            entity_df_event_timestamp.max(),
        )
    elif isinstance(entity_df, str):
        # If the entity_df is a string (SQL query), determine the range
        # from the uploaded table
        statement_id = aws_utils.execute_redshift_statement(
            redshift_client,
            config.offline_store.cluster_id,
            config.offline_store.database,
            config.offline_store.user,
            f"SELECT MIN({entity_df_event_timestamp_col}) AS min, MAX({entity_df_event_timestamp_col}) AS max FROM {table_name}",
        )
        res = aws_utils.get_redshift_statement_result(
            redshift_client, statement_id)["Records"][0]
        entity_df_event_timestamp_range = (
            parser.parse(res[0]["stringValue"]),
            parser.parse(res[1]["stringValue"]),
        )
    else:
        raise InvalidEntityType(type(entity_df))

    return entity_df_event_timestamp_range
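
The Records indexing above depends on the Redshift Data API result shape: a list of rows, each a list of tagged field dicts. An illustrative payload (not a live call) showing why res[0]["stringValue"] holds the minimum:

from dateutil import parser

# Illustrative Data API payload: one row, two fields (min, max).
records = [
    [{"stringValue": "2021-01-01 00:00:00"},
     {"stringValue": "2021-06-01 12:00:00"}]
]
res = records[0]
print(parser.parse(res[0]["stringValue"]), parser.parse(res[1]["stringValue"]))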