예제 #1
0
def _do_get_features(features,
                     featurestore_metadata,
                     featurestore=None,
                     featuregroups_version_dict=None,
                     join_key=None,
                     online=False):
    """
    Gets a list of features (columns) from the featurestore by building a SQL query with the query planner
    and running it against the offline or online feature store.

    Args:
        :features: a list of features to get from the featurestore
        :featurestore_metadata: the metadata of the featurestore
        :featurestore: the featurestore to query, defaults to the project's featurestore
        :featuregroups_version_dict: (Optional) a dict with (fg --> version) for the featuregroups where the
                                     features reside, defaults to an empty dict
        :join_key: (Optional) column name to join on
        :online: a boolean flag whether to fetch the online feature or the offline one (assuming that the
                 feature group that the feature is stored in has online serving enabled)
                 (for cached feature groups only)

    Returns:
        A pandas dataframe with all the features

    Raises:
        :OnlineFeaturestoreNotEnabled: if online is requested but the online feature store is disabled in the
                                       settings or for this featurestore
    """
    if online and ((not featurestore_metadata.settings.online_enabled) or
                   (not featurestore_metadata.featurestore.online_enabled)):
        raise OnlineFeaturestoreNotEnabled(
            "Online Feature Store is not enabled for this project or cluster, "
            "talk to an administrator to enable it")
    if featurestore is None:
        featurestore = fs_utils._do_get_project_featurestore()
    # Use a fresh dict per call: a `{}` default in the signature is created once and shared
    # between all calls, so any mutation downstream would leak into later calls.
    if featuregroups_version_dict is None:
        featuregroups_version_dict = {}

    features_query = FeaturesQuery(features, featurestore_metadata,
                                   featurestore, featuregroups_version_dict,
                                   join_key)
    logical_query_plan = LogicalQueryPlan(features_query)
    logical_query_plan.create_logical_plan()
    logical_query_plan.construct_sql()

    result = _run_and_log_sql(logical_query_plan.sql_str, featurestore, online)

    return result
예제 #2
0
def _do_get_feature(feature,
                    featurestore_metadata,
                    featurestore=None,
                    featuregroup=None,
                    featuregroup_version=1,
                    online=False):
    """
    Fetches a single feature (column) from a featurestore by planning a SQL query for it and running
    that query against the offline or online feature store.

    Args:
        :feature: the feature name to get
        :featurestore_metadata: the metadata of the featurestore to query
        :featurestore: the featurestore to query, defaults to the project's featurestore
        :featuregroup: (Optional) the featuregroup where the feature resides
        :featuregroup_version: (Optional) the version of the featuregroup
        :online: a boolean flag whether to fetch the online feature or the offline one (assuming that the
                 feature group that the feature is stored in has online serving enabled)
                 (for cached feature groups only)

    Returns:
        A pandas dataframe with the feature

    Raises:
        :OnlineFeaturestoreNotEnabled: if online is requested but the online feature store is disabled in the
                                       settings or for this featurestore
    """
    if online:
        # Online serving must be switched on both in the settings and for the featurestore itself.
        online_available = (featurestore_metadata.settings.online_enabled
                            and featurestore_metadata.featurestore.online_enabled)
        if not online_available:
            raise OnlineFeaturestoreNotEnabled(
                "Online Feature Store is not enabled for this project or cluster, "
                "talk to an administrator to enable it")

    if featurestore is None:
        featurestore = fs_utils._do_get_project_featurestore()

    query = FeatureQuery(feature, featurestore_metadata, featurestore,
                         featuregroup, featuregroup_version)
    plan = LogicalQueryPlan(query)
    plan.create_logical_plan()
    plan.construct_sql()

    return _run_and_log_sql(plan.sql_str, featurestore, online)
예제 #3
0
def _do_get_featuregroup(featuregroup_name,
                         featurestore_metadata,
                         featurestore=None,
                         featuregroup_version=1,
                         online=False):
    """
    Reads a whole featuregroup from a featurestore as a pandas dataframe. Only featuregroups of the
    cached type are served here; any other type raises a ValueError.

    Args:
        :featuregroup_name: name of the featuregroup to get
        :featurestore_metadata: featurestore metadata
        :featurestore: the featurestore where the featuregroup resides, defaults to the project's featurestore
        :featuregroup_version: (Optional) the version of the featuregroup
        :online: a boolean flag whether to fetch the online feature or the offline one (assuming that the
                 feature group that the feature is stored in has online serving enabled)
                 (for cached feature groups only)

    Returns:
        a pandas dataframe with the contents of the featuregroup

    Raises:
        :OnlineFeaturestoreNotEnabled: if online is requested but the online feature store is disabled in the
                                       settings or for this featurestore
        :ValueError: if the featuregroup is not of the cached featuregroup type
    """
    settings = featurestore_metadata.settings
    if online:
        # Online serving must be switched on both in the settings and for the featurestore itself.
        if not (settings.online_enabled
                and featurestore_metadata.featurestore.online_enabled):
            raise OnlineFeaturestoreNotEnabled(
                "Online Feature Store is not enabled for this project or cluster, "
                "talk to an administrator to enable it")

    if featurestore is None:
        featurestore = fs_utils._do_get_project_featurestore()

    matched_fg = query_planner._find_featuregroup(
        featurestore_metadata.featuregroups, featuregroup_name,
        featuregroup_version)

    if matched_fg.featuregroup_type != settings.cached_featuregroup_type:
        message_head = "The feature group type: " + matched_fg.featuregroup_type
        message_tail = " was not recognized. Recognized types include: {} and {}".format(
            settings.on_demand_featuregroup_type,
            settings.cached_featuregroup_type)
        raise ValueError(message_head + message_tail)

    return _do_get_cached_featuregroup(featuregroup_name, featurestore,
                                       featuregroup_version, online)
예제 #4
0
def _run_and_log_sql(sql_str, featurestore, online=False):
    """
    Logs an SQL query and runs it against either the offline feature store (through the connection
    returned by util._create_hive_connection) or the online feature store (through a SQLAlchemy
    MySQL connection built from the online storage connector).

    Args:
        :sql_str: the query to run
        :featurestore: name of the featurestore
        :online: if true, run the query using online feature store JDBC connector

    Returns:
        :pd.DataFrame: the result of the SQL query as pandas dataframe, with any "table." prefix
                       stripped from the column names

    Raises:
        :OnlineFeaturestoreNotEnabled: if online is true but the online feature store is disabled in the
                                       settings or for this featurestore
    """
    if not online:
        hive_conn = None
        try:
            fs_utils._log(
                "Running sql: {} against the offline feature store".format(
                    sql_str))
            hive_conn = util._create_hive_connection(featurestore)
            dataframe = pd.read_sql(sql_str, hive_conn)
        finally:
            # Close the connection even when the query fails.
            if hive_conn is not None:
                hive_conn.close()
    else:
        db_connection = None
        try:
            fs_utils._log(
                "Running sql: {} against online feature store".format(sql_str))
            featurestore_metadata = _get_featurestore_metadata(
                featurestore, update_cache=False)
            if ((not featurestore_metadata.settings.online_enabled) or
                    (not featurestore_metadata.featurestore.online_enabled)):
                raise OnlineFeaturestoreNotEnabled(
                    "Online Feature Store is not enabled for this project or cluster, "
                    "talk to an administrator to enable it")
            storage_connector = _do_get_online_featurestore_connector(
                featurestore, featurestore_metadata)
            pw, user = _get_online_feature_store_password_and_user(
                storage_connector)
            # The connection string is parsed twice: the outer parse runs on the *path* of the
            # inner parse. Presumably the string has the JDBC form "jdbc:mysql://host:port/db",
            # so the first parse strips the "jdbc:" scheme and the second yields host:port
            # (netloc) and the database (path) -- TODO confirm against the connector format.
            parsed = urllib.parse.urlparse(
                urllib.parse.urlparse(
                    storage_connector.connection_string).path)
            db_connection_str = 'mysql+pymysql://' + user + \
                ':' + pw + '@' + parsed.netloc + parsed.path
            # NullPool: no connection pooling, so closing the connection below releases it.
            engine = sqlalchemy.create_engine(db_connection_str,
                                              poolclass=NullPool)
            db_connection = engine.connect()
            dataframe = pd.read_sql(sql_str, con=db_connection)
        finally:
            # Close the connection that was actually opened, even when the query fails.
            if db_connection is not None:
                db_connection.close()

    # pd.read_sql returns columns in table.column format if columns are not specified in SQL query, i.e. SELECT * FROM..
    # this also occurs when sql query specifies table, i.e. SELECT table1.column1 table2.column2 FROM ... JOIN ...
    # we only want the bare column names as dataframe column names
    dataframe.columns = [
        column.split('.')[1] if '.' in column else column
        for column in dataframe.columns
    ]

    return dataframe