Example #1
0
def _do_get_featuregroup_partitions(featuregroup_name,
                                    featurestore_metadata,
                                    featurestore=None,
                                    featuregroup_version=1):
    """
    Looks up a feature group in the featurestore metadata and returns the
    partitions of its backing Hive table.

     Args:
        :featuregroup_name: the featuregroup to get partitions for
        :featurestore_metadata: the metadata of the featurestore to query
        :featurestore: the featurestore where the featuregroup resides, defaults to the project's featurestore
        :featuregroup_version: the version of the featuregroup, defaults to 1

     Returns:
        a dataframe with the partitions of the featuregroup

     Raises:
        :CannotGetPartitionsOfOnDemandFeatureGroup: if the feature group is on-demand
     """
    featuregroup = query_planner._find_featuregroup(
        featurestore_metadata.featuregroups,
        featuregroup_name,
        featuregroup_version)
    # On-demand feature groups are not backed by a cached Hive table, so
    # there are no partitions to list for them.
    on_demand_type = featurestore_metadata.settings.on_demand_featuregroup_type
    if featuregroup.featuregroup_type == on_demand_type:
        raise CannotGetPartitionsOfOnDemandFeatureGroup(
            "The feature group with name: {} , and version: {} "
            "is an on-demand feature group. "
            "Get partitions operation is only supported for "
            "cached feature groups.".format(featuregroup_name,
                                            featuregroup_version))
    hive = util._create_hive_connection(featurestore)
    table_name = fs_utils._get_table_name(featuregroup_name,
                                          featuregroup_version)
    return _run_and_log_sql(hive, "SHOW PARTITIONS " + table_name)
Example #2
0
def _do_get_feature(feature,
                    featurestore_metadata,
                    featurestore=None,
                    featuregroup=None,
                    featuregroup_version=1):
    """
    Gets a single feature (column) from the featurestore. When no featuregroup
    is given, the hopsworks metastore is consulted to locate a featuregroup
    containing the feature; supplying the featuregroup explicitly skips that
    lookup and is therefore faster.

    Args:
        :feature: the feature name to get
        :featurestore_metadata: the metadata of the featurestore to query
        :featurestore: the featurestore where the featuregroup resides, defaults to the project's featurestore
        :featuregroup: (Optional) the featuregroup where the feature resides
        :featuregroup_version: (Optional) the version of the featuregroup

    Returns:
        A pandas dataframe with the feature

    """
    connection = util._create_hive_connection(featurestore)

    # Build the SQL via the logical query planner, then execute it over Hive.
    query = FeatureQuery(feature, featurestore_metadata, featurestore,
                         featuregroup, featuregroup_version)
    plan = LogicalQueryPlan(query)
    plan.create_logical_plan()
    plan.construct_sql()

    return _run_and_log_sql(connection, plan.sql_str)
Example #3
0
def _do_get_features(features, featurestore_metadata, featurestore=None, featuregroups_version_dict=None, join_key=None):
    """
    Gets a list of features (columns) from the featurestore. If no featuregroup is specified it will query hopsworks
    metastore to find where the features are stored.

    Args:
        :features: a list of features to get from the featurestore
        :featurestore_metadata: the metadata of the featurestore
        :featurestore: the featurestore where the featuregroup resides, defaults to the project's featurestore
        :featuregroups_version_dict: (Optional) a dict with (featuregroup --> version) for all the featuregroups
                                     where the features reside, defaults to an empty dict
        :join_key: (Optional) column name to join on

    Returns:
        A pandas dataframe with all the features

    """
    # BUG FIX: the default used to be a mutable `{}`, which is shared across
    # calls in Python; use None as the sentinel and create a fresh dict here.
    if featuregroups_version_dict is None:
        featuregroups_version_dict = {}
    if featurestore is None:
        featurestore = fs_utils._do_get_project_featurestore()

    hive_conn = util._create_hive_connection(featurestore)

    # Build the SQL via the logical query planner, then execute it over Hive.
    features_query = FeaturesQuery(features, featurestore_metadata, featurestore, featuregroups_version_dict, join_key)
    logical_query_plan = LogicalQueryPlan(features_query)
    logical_query_plan.create_logical_plan()
    logical_query_plan.construct_sql()

    result = _run_and_log_sql(hive_conn, logical_query_plan.sql_str)

    return result
Example #4
0
def _run_and_log_sql(sql_str, featurestore, online=False):
    """
    Runs and logs an SQL query with pyHive

    Args:
        :sql_str: the query to run
        :featurestore: name of the featurestore
        :online: if true, run the query using online feature store JDBC connector

    Returns:
        :pd.DataFrame: the result of the SQL query as pandas dataframe
    """
    if not online:
        hive_conn = None
        try:
            fs_utils._log(
                "Running sql: {} against the offline feature store".format(
                    sql_str))
            hive_conn = util._create_hive_connection(featurestore)
            dataframe = pd.read_sql(sql_str, hive_conn)
        finally:
            if hive_conn:
                hive_conn.close()
    else:
        db_connection = None
        engine = None
        try:
            fs_utils._log(
                "Running sql: {} against online feature store".format(sql_str))
            metadata = _get_featurestore_metadata(featurestore,
                                                  update_cache=False)
            storage_connector = _do_get_online_featurestore_connector(
                featurestore, metadata)
            pw, user = _get_online_feature_store_password_and_user(
                storage_connector)
            # The connector's connection_string embeds another URL in its
            # path, hence the double urlparse to recover host and database.
            parsed = urllib.parse.urlparse(
                urllib.parse.urlparse(
                    storage_connector.connection_string).path)
            db_connection_str = 'mysql+pymysql://' + user + \
                ':' + pw + '@' + parsed.netloc + parsed.path
            engine = sqlalchemy.create_engine(db_connection_str,
                                              poolclass=NullPool)
            db_connection = engine.connect()
            dataframe = pd.read_sql(sql_str, con=db_connection)
        finally:
            # BUG FIX: the original finally checked an unrelated `connection`
            # variable that was never assigned, so the SQLAlchemy connection
            # (and engine) leaked on every online query.
            if db_connection is not None:
                db_connection.close()
            if engine is not None:
                engine.dispose()

    # pd.read_sql returns columns in table.column format if columns are not specified in SQL query, i.e. SELECT * FROM..
    # this also occurs when sql query specifies table, i.e. SELECT table1.column1 table2.column2 FROM ... JOIN ...
    # we want only want hive table column names as dataframe column names
    dataframe.columns = [
        column.split('.')[1] if '.' in column else column
        for column in dataframe.columns
    ]

    return dataframe
Example #5
0
def _do_get_cached_featuregroup(featuregroup_name,
                                featurestore=None,
                                featuregroup_version=1):
    """
    Reads the contents of a cached featuregroup from a featurestore into a
    pandas dataframe.

    Args:
        :featuregroup_name: name of the featuregroup to get
        :featurestore: the featurestore where the featuregroup resides, defaults to the project's featurestore
        :featuregroup_version: (Optional) the version of the featuregroup

    Returns:
        a pandas dataframe with the contents of the feature group

    """
    connection = util._create_hive_connection(featurestore)

    # Build the SQL via the logical query planner, then execute it over Hive.
    query = FeaturegroupQuery(featuregroup_name, featurestore,
                              featuregroup_version)
    plan = LogicalQueryPlan(query)
    plan.create_logical_plan()
    plan.construct_sql()
    return _run_and_log_sql(connection, plan.sql_str)