def _feature_query(self):
    """
    Creates a logical query plan for a user-query for a single feature

    If the query names a featuregroup, that featuregroup is looked up directly in the
    featurestore metadata (keyed by table name). Otherwise the metadata is searched
    for a featuregroup containing the feature. The resulting plan is stored on self:
    features_str, featuregroups_str, featuregroups, and join_str (always None here).

    Returns:
        None

    Raises:
        :FeaturegroupNotFound: if the feature could not be found in any of the featuregroups in the metadata
                               (raised here when the metadata holds no featuregroups at all; presumably
                               also raised by query_planner._find_feature when nothing matches - not
                               visible from this function)
    """
    query = self.query
    self.join_str = None
    self.features_str = query.feature

    if query.featuregroup is not None:
        # The user told us which featuregroup to read from: resolve its table name once
        # and use it both as the FROM target and as the metadata lookup key.
        table_name = fs_utils._get_table_name(query.featuregroup, query.featuregroup_version)
        self.featuregroups_str = table_name
        self.featuregroups = [query.featurestore_metadata.featuregroups[table_name]]
    else:
        # No featuregroup given: search every featuregroup known to the metadata.
        candidates = query.featurestore_metadata.featuregroups.values()
        if not candidates:
            raise FeaturegroupNotFound("Could not find any featuregroups in the metastore "
                                       "that contains the given feature, "
                                       "please explicitly supply featuregroups as an argument to the API call")
        featuregroup_matched = query_planner._find_feature(query.feature, query.featurestore, candidates)
        self.featuregroups_str = fs_utils._get_table_name(featuregroup_matched.name,
                                                          featuregroup_matched.version)
        self.featuregroups = [featuregroup_matched]

    fs_utils._log("Logical query plan for getting 1 feature from the featurestore created successfully")
def _run_and_log_sql(sql_str, featurestore, online=False):
    """
    Runs and logs an SQL query against the offline feature store (pyHive) or the
    online feature store (MySQL through SQLAlchemy/pymysql)

    Args:
        :sql_str: the query to run
        :featurestore: name of the featurestore
        :online: if true, run the query using online feature store JDBC connector

    Returns:
        :pd.DataFrame: the result of the SQL query as pandas dataframe
    """
    if not online:
        hive_conn = None
        try:
            fs_utils._log(
                "Running sql: {} against the offline feature store".format(sql_str))
            hive_conn = util._create_hive_connection(featurestore)
            dataframe = pd.read_sql(sql_str, hive_conn)
        finally:
            # Close the connection even when the query fails
            if hive_conn is not None:
                hive_conn.close()
    else:
        db_connection = None
        try:
            fs_utils._log(
                "Running sql: {} against online feature store".format(sql_str))
            metadata = _get_featurestore_metadata(featurestore, update_cache=False)
            storage_connector = _do_get_online_featurestore_connector(featurestore, metadata)
            pw, user = _get_online_feature_store_password_and_user(storage_connector)
            # The connector string is parsed twice: the first pass strips the leading scheme,
            # the second splits what remains (itself a URL) into host and database path.
            parsed = urllib.parse.urlparse(
                urllib.parse.urlparse(storage_connector.connection_string).path)
            # NOTE(review): user and pw are concatenated unescaped; credentials containing
            # characters such as '@' or '/' would need URL-encoding - confirm whether that can occur.
            db_connection_str = 'mysql+pymysql://' + user + ':' + pw + '@' + parsed.netloc + parsed.path
            engine = sqlalchemy.create_engine(db_connection_str, poolclass=NullPool)
            db_connection = engine.connect()
            dataframe = pd.read_sql(sql_str, con=db_connection)
        finally:
            # The handle opened above is db_connection; it must be the one that is closed,
            # otherwise every online query leaks a database connection.
            if db_connection is not None:
                db_connection.close()

    # pd.read_sql returns columns in table.column format if columns are not specified in SQL query, i.e. SELECT * FROM..
    # this also occurs when sql query specifies table, i.e. SELECT table1.column1 table2.column2 FROM ... JOIN ...
    # we only want hive table column names as dataframe column names
    dataframe.columns = [
        column.split('.')[1] if '.' in column else column
        for column in dataframe.columns
    ]
    return dataframe
def construct_sql(self):
    """
    Builds the HiveSQL query string from the logical plan and stores it in self.sql_str

    The query is "SELECT <features_str> FROM <featuregroups_str>", followed by the join
    clause when the plan has one (join_str is not None).

    Returns:
        None
    """
    clauses = ["SELECT " + self.features_str + " FROM " + self.featuregroups_str]
    if self.join_str is not None:
        clauses.append(self.join_str)
    self.sql_str = " ".join(clauses)
    fs_utils._log("SQL string for the query created successfully")
def _run_and_log_sql(hive_conn, sql_str):
    """
    Runs and logs an SQL query with pyHive

    The connection is always closed before returning, including when the query fails.

    Args:
        :hive_conn: the hive session
        :sql_str: the query to run

    Returns:
        :pd.DataFrame: the result of the SQL query as pandas dataframe
    """
    fs_utils._log("Running sql: {}".format(sql_str))
    # ToDo: right now hive connection is closed after every call. Manage connections better in future (pooling)
    try:
        dataframe = pd.read_sql(sql_str, hive_conn)
    finally:
        # Without the finally a failing query would leave the hive session open
        hive_conn.close()
    return dataframe
def _run_and_log_sql(hive_conn, sql_str):
    """
    Runs and logs an SQL query with pyHive and strips table prefixes from the result columns

    The connection is always closed before returning, including when the query fails.

    Args:
        :hive_conn: the hive session
        :sql_str: the query to run

    Returns:
        :pd.DataFrame: the result of the SQL query as pandas dataframe
    """
    fs_utils._log("Running sql: {}".format(sql_str))
    # ToDo: right now hive connection is closed after every call. Manage connections better in future (pooling)
    try:
        dataframe = pd.read_sql(sql_str, hive_conn)
    finally:
        # Without the finally a failing query would leave the hive session open
        hive_conn.close()

    # pd.read_sql returns columns in table.column format if columns are not specified in SQL query, i.e. SELECT * FROM..
    # this also occurs when sql query specifies table, i.e. SELECT table1.column1 table2.column2 FROM ... JOIN ...
    # we only want hive table column names as dataframe column names
    dataframe.columns = [
        column.split('.')[1] if '.' in column else column
        for column in dataframe.columns
    ]
    return dataframe
def _features_query(self):
    """
    Creates a logical query plan from a user query to get a list of features

    The plan depends on how many featuregroups the user supplied:

    - exactly one: query that featuregroup directly, no join
    - more than one: join them, on the user-supplied join key if there is one, otherwise on
      the column returned by query_planner._get_join_col
    - none: look up, for each requested feature, the featuregroup that
      query_planner._find_feature returns, and join the distinct featuregroups if
      there is more than one

    The plan is stored on self: features_str, featuregroups_str, join_str and featuregroups.

    Returns:
        None

    Raises:
        :FeaturegroupNotFound: if the some of the features could not be found in any of the featuregroups
                               (raised here when the metadata holds no featuregroups at all; presumably
                               also raised by query_planner._find_feature - not visible from this function)
    """
    query = self.query
    name_key = constants.REST_CONFIG.JSON_FEATUREGROUP_NAME
    version_key = constants.REST_CONFIG.JSON_FEATUREGROUP_VERSION

    self.features_str = ", ".join(query.features)
    self.join_str = None

    # Despite its name, featuregroups_version_dict is used as a sequence of
    # {name, version} entries: it is indexed with [0] and iterated entry by entry.
    requested = query.featuregroups_version_dict
    num_requested = len(requested)

    if num_requested == 1:
        table_name = fs_utils._get_table_name(requested[0][name_key], requested[0][version_key])
        self.featuregroups_str = table_name
        self.featuregroups = [query.featurestore_metadata.featuregroups[table_name]]

    elif num_requested > 1:
        if query.join_key is not None:
            # Join the requested featuregroups on the key supplied by the user
            featuregroups = [
                query.featurestore_metadata.featuregroups[
                    fs_utils._get_table_name(entry[name_key], entry[version_key])]
                for entry in requested
            ]
            self.join_str = query_planner._get_join_str(featuregroups, query.join_key)
            self.featuregroups_str = fs_utils._get_table_name(featuregroups[0].name,
                                                              featuregroups[0].version)
            self.featuregroups = featuregroups
        else:
            # No join key given: let the query planner pick the join column
            featuregroups_parsed = query.featurestore_metadata.featuregroups
            if not featuregroups_parsed.values():
                raise FeaturegroupNotFound(
                    "Could not find any featuregroups containing "
                    "the features in the metastore, "
                    "please explicitly supply featuregroups as an argument to the API call")
            # Keep only the metadata featuregroups whose name and version were requested
            requested_versions = query.featuregroups_version_dict_orig
            featuregroups_filtered = [
                fg for fg in featuregroups_parsed.values()
                if fg.name in requested_versions and requested_versions[fg.name] == fg.version
            ]
            join_col = query_planner._get_join_col(featuregroups_filtered)
            self.join_str = query_planner._get_join_str(featuregroups_filtered, join_col)
            self.featuregroups_str = fs_utils._get_table_name(featuregroups_filtered[0].name,
                                                              featuregroups_filtered[0].version)
            self.featuregroups = featuregroups_filtered

    else:
        # No featuregroups supplied: find the featuregroup of every feature in the metadata
        featuregroups_parsed = query.featurestore_metadata.featuregroups
        if not featuregroups_parsed.values():
            raise FeaturegroupNotFound(
                "Could not find any featuregroups in the metastore, "
                "please explicitly supply featuregroups as an argument to the API call")
        feature_featuregroups = []
        for feature in query.features:
            featuregroup_matched = query_planner._find_feature(
                feature, query.featurestore, featuregroups_parsed.values())
            # Collect each (name, version) featuregroup only once
            if not query_planner._check_if_list_of_featuregroups_contains_featuregroup(
                    feature_featuregroups, featuregroup_matched.name, featuregroup_matched.version):
                feature_featuregroups.append(featuregroup_matched)
        if len(feature_featuregroups) != 1:
            # Features are spread over several featuregroups, so a join is needed
            join_col = query_planner._get_join_col(feature_featuregroups)
            self.join_str = query_planner._get_join_str(feature_featuregroups, join_col)
        self.featuregroups_str = fs_utils._get_table_name(feature_featuregroups[0].name,
                                                          feature_featuregroups[0].version)
        self.featuregroups = feature_featuregroups

    fs_utils._log(
        "Logical query plan for getting {} features from the featurestore created successfully"
        .format(len(query.features)))