def _do_get_featuregroup_partitions(featuregroup_name, featurestore_metadata, featurestore=None,
                                    featuregroup_version=1):
    """
    Gets the partitions of a featuregroup

    Args:
        :featuregroup_name: the featuregroup to get partitions for
        :featurestore_metadata: the metadata of the featurestore to query
        :featurestore: the featurestore where the featuregroup resides, defaults to the project's featurestore
        :featuregroup_version: the version of the featuregroup, defaults to 1

    Returns:
        a pandas dataframe with the partitions of the featuregroup
    """
    if featurestore is None:
        featurestore = fs_utils._do_get_project_featurestore()
    fg = query_planner._find_featuregroup(featurestore_metadata.featuregroups, featuregroup_name,
                                          featuregroup_version)
    if fg.featuregroup_type == featurestore_metadata.settings.on_demand_featuregroup_type:
        raise CannotGetPartitionsOfOnDemandFeatureGroup(
            "The feature group with name: {} , and version: {} "
            "is an on-demand feature group. "
            "Get partitions operation is only supported for "
            "cached feature groups.".format(featuregroup_name, featuregroup_version))
    sql_str = "SHOW PARTITIONS " + fs_utils._get_table_name(featuregroup_name, featuregroup_version)
    return _run_and_log_sql(sql_str, featurestore)
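
# Example (hypothetical usage sketch): assuming a cached feature group named "games_features"
# exists with version 1, the call below would issue "SHOW PARTITIONS games_features_1" against
# Hive and return one partition spec per row, e.g. "valid_date=2019-01-01". The feature group
# name and partition column are made-up illustrations, not part of this module:
#
#   partitions_df = _do_get_featuregroup_partitions("games_features", featurestore_metadata)
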
def _do_get_feature(feature, featurestore_metadata, featurestore=None, featuregroup=None,
                    featuregroup_version=1):
    """
    Gets a particular feature (column) from a featurestore. If no featuregroup is specified, it queries the
    Hopsworks metastore to see if the feature exists in any of the featuregroups in the featurestore.
    If the user knows which featuregroup contains the feature, it should be specified, as this improves
    the performance of the query.

    Args:
        :feature: the feature name to get
        :featurestore_metadata: the metadata of the featurestore to query
        :featurestore: the featurestore where the featuregroup resides, defaults to the project's featurestore
        :featuregroup: (Optional) the featuregroup where the feature resides
        :featuregroup_version: (Optional) the version of the featuregroup, defaults to 1

    Returns:
        A pandas dataframe with the feature
    """
    if featurestore is None:
        featurestore = fs_utils._do_get_project_featurestore()
    feature_query = FeatureQuery(feature, featurestore_metadata, featurestore, featuregroup,
                                 featuregroup_version)
    logical_query_plan = LogicalQueryPlan(feature_query)
    logical_query_plan.create_logical_plan()
    logical_query_plan.construct_sql()
    return _run_and_log_sql(logical_query_plan.sql_str, featurestore)
def _do_get_features(features, featurestore_metadata, featurestore=None, featuregroups_version_dict={},
                     join_key=None):
    """
    Gets a list of features (columns) from the featurestore. If no featuregroups are specified, it will query the
    Hopsworks metastore to find where the features are stored.

    Args:
        :features: a list of features to get from the featurestore
        :featurestore_metadata: the metadata of the featurestore
        :featurestore: the featurestore where the featuregroups reside, defaults to the project's featurestore
        :featuregroups_version_dict: (Optional) a dict of (featuregroup --> version) for all the featuregroups
                                     where the features reside
        :join_key: (Optional) column name to join on

    Returns:
        A pandas dataframe with all the features
    """
    if featurestore is None:
        featurestore = fs_utils._do_get_project_featurestore()
    features_query = FeaturesQuery(features, featurestore_metadata, featurestore, featuregroups_version_dict,
                                   join_key)
    logical_query_plan = LogicalQueryPlan(features_query)
    logical_query_plan.create_logical_plan()
    logical_query_plan.construct_sql()
    return _run_and_log_sql(logical_query_plan.sql_str, featurestore)
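
# Example (hypothetical names): fetching two features that live in different feature groups.
# Supplying featuregroups_version_dict pins each feature group to a version and lets the query
# planner skip the metastore lookup for feature locations; join_key overrides the inferred join
# column. All feature/group/column names below are illustrative only:
#
#   df = _do_get_features(
#       ["team_budget", "average_attendance"],
#       featurestore_metadata,
#       featuregroups_version_dict={"teams_features": 1, "attendance_features": 2},
#       join_key="team_id")
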
def _run_and_log_sql(sql_str, featurestore, online=False):
    """
    Runs and logs an SQL query with pyHive

    Args:
        :sql_str: the query to run
        :featurestore: name of the featurestore
        :online: if true, run the query against the online feature store using the online
                 feature store JDBC connector

    Returns:
        :pd.DataFrame: the result of the SQL query as a pandas dataframe
    """
    if not online:
        hive_conn = None
        try:
            fs_utils._log("Running sql: {} against the offline feature store".format(sql_str))
            hive_conn = util._create_hive_connection(featurestore)
            dataframe = pd.read_sql(sql_str, hive_conn)
        finally:
            if hive_conn:
                hive_conn.close()
    else:
        db_connection = None
        try:
            fs_utils._log("Running sql: {} against online feature store".format(sql_str))
            metadata = _get_featurestore_metadata(featurestore, update_cache=False)
            storage_connector = _do_get_online_featurestore_connector(featurestore, metadata)
            pw, user = _get_online_feature_store_password_and_user(storage_connector)
            parsed = urllib.parse.urlparse(
                urllib.parse.urlparse(storage_connector.connection_string).path)
            db_connection_str = 'mysql+pymysql://' + user + ':' + pw + '@' + parsed.netloc + parsed.path
            engine = sqlalchemy.create_engine(db_connection_str, poolclass=NullPool)
            db_connection = engine.connect()
            dataframe = pd.read_sql(sql_str, con=db_connection)
        finally:
            if db_connection:
                db_connection.close()

    # pd.read_sql returns columns in table.column format when columns are not specified in the SQL
    # query, i.e. SELECT * FROM ..., and also when the query qualifies columns with table names,
    # i.e. SELECT table1.column1, table2.column2 FROM ... JOIN ...
    # We only want the hive table column names as dataframe column names
    dataframe.columns = [column.split('.')[1] if '.' in column else column
                         for column in dataframe.columns]
    return dataframe
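
# A minimal, self-contained sketch of the column-name normalization performed at the end of
# _run_and_log_sql, so the behaviour can be checked without a feature store connection. The
# dataframe contents and column names are made up for illustration:
def _example_strip_table_prefixes():
    import pandas as pd  # local import keeps this sketch self-contained

    df = pd.DataFrame({"games_features.team_id": [1, 2], "games_features.score": [0.5, 0.7]})
    # "table.column" -> "column", mirroring the list comprehension in _run_and_log_sql
    df.columns = [column.split('.')[1] if '.' in column else column for column in df.columns]
    assert list(df.columns) == ["team_id", "score"]
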
def _do_get_cached_featuregroup(featuregroup_name, featurestore=None, featuregroup_version=1):
    """
    Gets a cached featuregroup from a featurestore as a pandas dataframe

    Args:
        :featuregroup_name: name of the featuregroup to get
        :featurestore: the featurestore where the featuregroup resides, defaults to the project's featurestore
        :featuregroup_version: (Optional) the version of the featuregroup, defaults to 1

    Returns:
        a pandas dataframe with the contents of the feature group
    """
    if featurestore is None:
        featurestore = fs_utils._do_get_project_featurestore()
    featuregroup_query = FeaturegroupQuery(featuregroup_name, featurestore, featuregroup_version)
    logical_query_plan = LogicalQueryPlan(featuregroup_query)
    logical_query_plan.create_logical_plan()
    logical_query_plan.construct_sql()
    return _run_and_log_sql(logical_query_plan.sql_str, featurestore)
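
# Example (hypothetical): reading an entire cached feature group into pandas. Under the hood,
# the query plan constructed above is roughly equivalent to "SELECT * FROM games_features_1"
# against the project's offline feature store; the feature group name is illustrative only:
#
#   df = _do_get_cached_featuregroup("games_features", featuregroup_version=1)
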