def _get_join_str(featuregroups, join_key):
    """
    Builds the "JOIN <table> ... ON <condition> AND ..." part of a SQL query that
    joins every feature group after the first one onto the first feature group

    Args:
        :featuregroups: the featuregroups to join (objects exposing .name and .version)
        :join_key: the name of the column to join on

    Returns:
        SQL join string to join a set of feature groups together. When fewer than
        two feature groups are supplied the result is just "ON ".

    """
    # The first feature group is the left-hand side of every join condition,
    # so only the remaining ones contribute JOIN clauses and conditions.
    joined_fgs = [fg for position, fg in enumerate(featuregroups) if position != 0]

    join_clauses = "".join(
        "JOIN " + fs_utils._get_table_name(fg.name, fg.version) + " "
        for fg in joined_fgs)

    # Each condition equates the join column of the first table with the same
    # column in one of the joined tables; column names are back-tick quoted.
    conditions = " AND ".join(
        fs_utils._get_table_name(featuregroups[0].name, featuregroups[0].version)
        + ".`" + join_key + "`="
        + fs_utils._get_table_name(fg.name, fg.version) + ".`" + join_key + "`"
        for fg in joined_fgs)

    return join_clauses + "ON " + conditions
def _find_featuregroup(featuregroups, featuregroup_name, featuregroup_version):
    """
    Looks up a feature group by name and version among the feature groups of the feature store

    Args:
        :featuregroups: mapping from table name (as produced by fs_utils._get_table_name)
                        to featuregroup metadata
        :featuregroup_name: name of the feature group
        :featuregroup_version: version of the feature group

    Returns:
        The feature group stored under the table name derived from the name and version

    Raises:
        :FeaturegroupNotFound: if the requested feature group could not be found
    """
    try:
        table_name = fs_utils._get_table_name(featuregroup_name, featuregroup_version)
        return featuregroups[table_name]
    except KeyError:
        # The error message lists the table names of all known feature groups
        available_names = [
            fs_utils._get_table_name(fg.name, fg.version)
            for fg in featuregroups.values()
        ]
        message = ("Could not find the requested feature group with name: {} "
                   "and version: {} among the list of available feature groups: {}")
        raise FeaturegroupNotFound(
            message.format(featuregroup_name, featuregroup_version, available_names))
def _feature_query(self):
    """
    Creates a logical query plan for a user-query for a single feature.

    Sets self.join_str (always None for a single feature), self.features_str,
    self.featuregroups_str and self.featuregroups.

    Returns:
        None

    Raises:
        :FeaturegroupNotFound: if no feature group was specified in the query and the
                               featurestore metadata contains no feature groups at all
        :KeyError: if a feature group was specified in the query but its table name is
                   not present in the featurestore metadata (assuming the metadata
                   mapping is a plain dict)
    """
    query = self.query
    self.join_str = None
    self.features_str = query.feature

    if query.featuregroup is not None:
        # The user named the feature group explicitly: look it up directly by table name
        table_name = fs_utils._get_table_name(query.featuregroup, query.featuregroup_version)
        self.featuregroups_str = table_name
        self.featuregroups = [query.featurestore_metadata.featuregroups[table_name]]
    else:
        # No feature group given: search all feature groups in the metadata for the feature
        featuregroups_parsed = query.featurestore_metadata.featuregroups
        if not featuregroups_parsed:
            raise FeaturegroupNotFound("Could not find any featuregroups in the metastore "
                                       "that contains the given feature, "
                                       "please explicitly supply featuregroups as an argument to the API call")
        featuregroup_matched = query_planner._find_feature(query.feature, query.featurestore,
                                                           featuregroups_parsed.values())
        self.featuregroups_str = fs_utils._get_table_name(featuregroup_matched.name,
                                                          featuregroup_matched.version)
        self.featuregroups = [featuregroup_matched]

    fs_utils._log("Logical query plan for getting 1 feature from the featurestore created successfully")
def _find_training_dataset(training_datasets, training_dataset, training_dataset_version):
    """
    Looks up a training dataset by name and version among the available training datasets

    Args:
        :training_datasets: mapping from table name (as produced by fs_utils._get_table_name)
                            to training dataset metadata
        :training_dataset: name of the training dataset
        :training_dataset_version: version of the training dataset

    Returns:
        The training dataset stored under the table name derived from the name and version

    Raises:
        :TrainingDatasetNotFound: if the requested training dataset could not be found
    """
    try:
        table_name = fs_utils._get_table_name(training_dataset, training_dataset_version)
        return training_datasets[table_name]
    except KeyError:
        # The error message lists the table names of all known training datasets
        available_names = [
            fs_utils._get_table_name(td.name, td.version)
            for td in training_datasets.values()
        ]
        message = ("Could not find the requested training dataset with name: {} "
                   "and version: {} among the list of available training datasets: {}")
        raise TrainingDatasetNotFound(
            message.format(training_dataset, training_dataset_version, available_names))
def _get_feature_featuregroup_mapping(logical_query_plan, featurestore, featurestore_metadata):
    """
    Derives, from a logical query plan, which feature group table each requested feature belongs to.

    Args:
        :logical_query_plan: the logical query plan containing information about the features to be fetched
        :featurestore: the featurestore
        :featurestore_metadata: featurestore metadata

    Returns:
        a dict mapping the short name of each feature to the table name of its featuregroup
    """
    query = logical_query_plan.query
    feature_to_table = {}

    # Single feature: the plan holds exactly the feature group that was matched
    if isinstance(query, FeatureQuery):
        matched_fg = logical_query_plan.featuregroups[0]
        table_name = fs_utils._get_table_name(matched_fg.name, matched_fg.version)
        feature_to_table[_get_feature_short_name(query.feature)] = table_name

    # Several features: resolve each one among the feature groups of the plan
    if isinstance(query, FeaturesQuery):
        for feature in query.features:
            matched_fg = _find_feature(feature, featurestore, logical_query_plan.featuregroups)
            table_name = fs_utils._get_table_name(matched_fg.name, matched_fg.version)
            feature_to_table[_get_feature_short_name(feature)] = table_name

    # Whole feature group: every feature of the group maps to the group's table
    if isinstance(query, FeaturegroupQuery):
        matched_fg = _find_featuregroup(featurestore_metadata.featuregroups,
                                        query.featuregroup,
                                        query.featuregroup_version)
        for feature in matched_fg.features:
            table_name = fs_utils._get_table_name(matched_fg.name, matched_fg.version)
            feature_to_table[_get_feature_short_name(feature.name)] = table_name

    return feature_to_table
def _parse_featurestore_metadata(self, metadata_json):
    """
    Parses the featurestore metadata from the REST API into dictionaries keyed by
    table name / feature name / connector name, so that featuregroups, training datasets,
    features and storage connectors can be looked up directly

    Args:
        :metadata_json: the JSON metadata of the featurestore returned by hopsworks

    Returns:
        A tuple (featuregroups, training_datasets, features_to_featuregroups,
        featurestore, settings, storage_connectors) where
        featuregroups maps table name -> Featuregroup,
        training_datasets maps table name -> TrainingDataset,
        features_to_featuregroups maps feature name -> list of Featuregroup containing it,
        storage_connectors maps connector name -> storage connector object
    """
    rest = constants.REST_CONFIG

    featuregroups = {}
    training_datasets = {}
    features_to_featuregroups = {}
    storage_connectors = {}

    for fg_json in metadata_json[rest.JSON_FEATUREGROUPS]:
        parsed_fg = Featuregroup(fg_json)
        fg_table_name = fs_utils._get_table_name(fg_json[rest.JSON_FEATUREGROUP_NAME],
                                                 fg_json[rest.JSON_FEATUREGROUP_VERSION])
        featuregroups[fg_table_name] = parsed_fg
        # Index the feature group under each of its feature names; a fresh
        # Featuregroup object is created for every entry.
        for feature_json in fg_json[rest.JSON_FEATUREGROUP_FEATURES]:
            features_to_featuregroups.setdefault(
                feature_json[rest.JSON_FEATURE_NAME], []).append(Featuregroup(fg_json))

    for td_json in metadata_json[rest.JSON_TRAINING_DATASETS]:
        parsed_td = TrainingDataset(td_json)
        td_table_name = fs_utils._get_table_name(td_json[rest.JSON_TRAINING_DATASET_NAME],
                                                 td_json[rest.JSON_TRAINING_DATASET_VERSION])
        training_datasets[td_table_name] = parsed_td

    settings = FeaturestoreSettings(metadata_json[rest.JSON_FEATURESTORE_SETTINGS])

    # Connector type (as named by the settings) -> class used to wrap the connector JSON.
    # Connectors of any other type are skipped.
    connector_classes = (
        (settings.jdbc_connector_type, JDBCStorageConnector),
        (settings.s3_connector_type, S3StorageConnector),
        (settings.hopsfs_connector_type, HopsfsStorageConnector),
    )
    for sc_json in metadata_json[rest.JSON_FEATURESTORE_STORAGE_CONNECTORS]:
        for connector_type, connector_class in connector_classes:
            if sc_json[rest.JSON_FEATURESTORE_CONNECTOR_TYPE] == connector_type:
                parsed_connector = connector_class(sc_json)
                storage_connectors[sc_json[rest.JSON_FEATURESTORE_CONNECTOR_NAME]] = parsed_connector

    featurestore = Featurestore(metadata_json[rest.JSON_FEATURESTORE])
    return featuregroups, training_datasets, features_to_featuregroups, featurestore, settings, storage_connectors
def _find_feature(feature, featurestore, featuregroups_parsed):
    """
    Resolves a feature to the single featuregroup that contains it.
    Fails if the feature is found in no featuregroup or in more than one.

    Args:
        :feature: the feature to search for
        :featurestore: the featurestore where the featuregroups resides (only used in error messages)
        :featuregroups_parsed: the featuregroups to look through

    Returns:
        the featuregroup that contains the feature

    Raises:
        :FeatureNotFound: if the requested feature could not be found
        :FeatureNameCollisionError: if the feature was found in more than one featuregroup
    """
    candidates = _find_featuregroup_that_contains_feature(featuregroups_parsed, feature)
    number_of_matches = len(candidates)

    if number_of_matches == 0:
        raise FeatureNotFound(
            "Could not find the feature with name '{}' in any of the featuregroups of the featurestore: '{}'"
            .format(feature, featurestore))

    if number_of_matches > 1:
        # Ambiguous: report the table names of all matching featuregroups
        matched_table_names = ",".join(
            [fs_utils._get_table_name(fg.name, fg.version) for fg in candidates])
        raise FeatureNameCollisionError(
            "Found the feature with name '{}' "
            "in more than one of the featuregroups of the featurestore: '{}', "
            "please specify the optional argument 'featuregroup=', "
            "the matched featuregroups were: {}".format(
                feature, featurestore, matched_table_names))

    return candidates[0]
def _do_get_featuregroup_partitions(featuregroup_name, featurestore_metadata, featurestore=None,
                                    featuregroup_version=1, online=False):
    """
    Runs "SHOW PARTITIONS" for the table of a featuregroup

    Args:
        :featuregroup_name: the featuregroup to get partitions for
        :featurestore_metadata: metadata of the featurestore, used to look up the featuregroup
                                and the on-demand featuregroup type
        :featurestore: the featurestore where the featuregroup resides, defaults to the project's featurestore
        :featuregroup_version: the version of the featuregroup, defaults to 1
        :online: a boolean flag forwarded to _run_and_log_sql; per the original documentation it
                 selects the online feature group rather than the offline one
                 (for cached feature groups with online serving enabled)

    Returns:
        the result of _run_and_log_sql for the SHOW PARTITIONS statement
        (documented by the original author as a dataframe with the partitions of the featuregroup)

    Raises:
        :CannotGetPartitionsOfOnDemandFeatureGroup: if the featuregroup is an on-demand featuregroup
    """
    featuregroup = query_planner._find_featuregroup(
        featurestore_metadata.featuregroups, featuregroup_name, featuregroup_version)

    on_demand_type = featurestore_metadata.settings.on_demand_featuregroup_type
    if featuregroup.featuregroup_type == on_demand_type:
        raise CannotGetPartitionsOfOnDemandFeatureGroup(
            "The feature group with name: {} , and version: {} "
            "is an on-demand feature group. "
            "Get partitions operation is only supported for "
            "cached feature groups.".format(featuregroup_name, featuregroup_version))

    table_name = fs_utils._get_table_name(featuregroup_name, featuregroup_version)
    show_partitions_sql = "SHOW PARTITIONS " + table_name
    return _run_and_log_sql(show_partitions_sql, featurestore, online)
def _do_get_featuregroup_partitions(featuregroup_name, featurestore_metadata, featurestore=None,
                                    featuregroup_version=1):
    """
    Runs "SHOW PARTITIONS" for the table of a featuregroup over a hive connection

    Args:
        :featuregroup_name: the featuregroup to get partitions for
        :featurestore_metadata: metadata of the featurestore, used to look up the featuregroup
                                and the on-demand featuregroup type
        :featurestore: the featurestore where the featuregroup resides, defaults to the project's featurestore
        :featuregroup_version: the version of the featuregroup, defaults to 1

    Returns:
        the result of _run_and_log_sql for the SHOW PARTITIONS statement
        (documented by the original author as a dataframe with the partitions of the featuregroup)

    Raises:
        :CannotGetPartitionsOfOnDemandFeatureGroup: if the featuregroup is an on-demand featuregroup
    """
    featuregroup = query_planner._find_featuregroup(
        featurestore_metadata.featuregroups, featuregroup_name, featuregroup_version)

    on_demand_type = featurestore_metadata.settings.on_demand_featuregroup_type
    if featuregroup.featuregroup_type == on_demand_type:
        raise CannotGetPartitionsOfOnDemandFeatureGroup(
            "The feature group with name: {} , and version: {} "
            "is an on-demand feature group. "
            "Get partitions operation is only supported for "
            "cached feature groups.".format(featuregroup_name, featuregroup_version))

    # The connection is only opened once the featuregroup is known to be a cached one
    hive_connection = util._create_hive_connection(featurestore)
    table_name = fs_utils._get_table_name(featuregroup_name, featuregroup_version)
    show_partitions_sql = "SHOW PARTITIONS " + table_name
    return _run_and_log_sql(hive_connection, show_partitions_sql)
def _featuregroup_query(self):
    """
    Creates a logical query plan for a user query that fetches a whole featuregroup:
    all columns ("*") from the featuregroup's table, without any join

    Returns:
        None

    """
    # Selecting an entire featuregroup needs every column and no join
    self.features_str, self.join_str = "*", None

    requested = self.query
    table_name = fs_utils._get_table_name(requested.featuregroup,
                                          requested.featuregroup_version)
    self.featuregroups_str = table_name
def _do_get_training_datasets(featurestore_metadata):
    """
    Lists the table names of all training datasets in a featurestore

    Args:
        :featurestore_metadata: metadata of the featurestore

    Returns:
        A list with one table name (derived from name and version via
        fs_utils._get_table_name) per training dataset in this featurestore
    """
    return [
        fs_utils._get_table_name(dataset.name, dataset.version)
        for dataset in featurestore_metadata.training_datasets.values()
    ]
def _find_featuregroup_that_contains_feature(featuregroups, feature):
    """
    Goes through the featuregroups and collects the ones that contain the feature

    A featuregroup matches if, for any of its features, one of the following holds:
    the feature name equals the requested feature; "<table name>.<feature name>" equals
    the requested feature; or both the feature name and the featuregroup name occur
    as substrings of the requested feature.

    Args:
        :featuregroups: featuregroups to search through
        :feature: the feature to look for

    Returns:
        a list of the featuregroups that contain the given feature
        (each matching featuregroup appears once, in iteration order)
    """
    matching_featuregroups = []
    for candidate in featuregroups:
        contains_feature = any(
            column.name == feature
            or fs_utils._get_table_name(candidate.name, candidate.version) + "." + column.name == feature
            or (column.name in feature and candidate.name in feature)
            for column in candidate.features)
        if contains_feature:
            matching_featuregroups.append(candidate)
    return matching_featuregroups
def _features_query(self):
    """
    Creates a logical query plan from a user query to get a list of features.

    Sets self.features_str, self.join_str, self.featuregroups_str and self.featuregroups.
    Three cases are handled, depending on how many featuregroups the query names:

    * exactly one: select from that featuregroup, no join
    * more than one: join them, on the query's join key if given, otherwise on the
      column returned by query_planner._get_join_col
    * none: infer the featuregroups by searching for each feature in the metadata,
      and join them if more than one featuregroup is needed

    Returns:
        None

    Raises:
        :FeaturegroupNotFound: if featuregroups have to be looked up in the featurestore
                               metadata but the metadata contains no featuregroups
        :KeyError: if a featuregroup named in the query is not present in the featurestore
                   metadata (assuming the metadata mapping is a plain dict)
    """
    query = self.query
    name_key = constants.REST_CONFIG.JSON_FEATUREGROUP_NAME
    version_key = constants.REST_CONFIG.JSON_FEATUREGROUP_VERSION

    self.features_str = ", ".join(query.features)
    self.join_str = None

    requested = query.featuregroups_version_dict
    if len(requested) == 1:
        # A single featuregroup was named: no join needed
        table_name = fs_utils._get_table_name(requested[0][name_key], requested[0][version_key])
        self.featuregroups_str = table_name
        self.featuregroups = [query.featurestore_metadata.featuregroups[table_name]]

    elif len(requested) > 1:
        if query.join_key is not None:
            # Explicit join key: look up the named featuregroups in the order given
            featuregroups = [
                query.featurestore_metadata.featuregroups[
                    fs_utils._get_table_name(entry[name_key], entry[version_key])]
                for entry in requested
            ]
            join_col = query.join_key
        else:
            featuregroups_parsed = query.featurestore_metadata.featuregroups
            if not featuregroups_parsed:
                raise FeaturegroupNotFound(
                    "Could not find any featuregroups containing "
                    "the features in the metastore, "
                    "please explicitly supply featuregroups as an argument to the API call")
            # Keep only the featuregroups whose name and version were requested,
            # then let the query planner pick the join column
            requested_versions = query.featuregroups_version_dict_orig
            featuregroups = [
                fg for fg in featuregroups_parsed.values()
                if fg.name in requested_versions and requested_versions[fg.name] == fg.version
            ]
            join_col = query_planner._get_join_col(featuregroups)
        self.join_str = query_planner._get_join_str(featuregroups, join_col)
        self.featuregroups_str = fs_utils._get_table_name(featuregroups[0].name,
                                                          featuregroups[0].version)
        self.featuregroups = featuregroups

    else:
        # No featuregroups named: infer them from the requested features
        featuregroups_parsed = query.featurestore_metadata.featuregroups
        if not featuregroups_parsed:
            raise FeaturegroupNotFound(
                "Could not find any featuregroups in the metastore, "
                "please explicitly supply featuregroups as an argument to the API call")
        feature_featuregroups = []
        for feature in query.features:
            featuregroup_matched = query_planner._find_feature(
                feature, query.featurestore, featuregroups_parsed.values())
            # Several features may live in the same featuregroup; keep each group once
            if not query_planner._check_if_list_of_featuregroups_contains_featuregroup(
                    feature_featuregroups, featuregroup_matched.name, featuregroup_matched.version):
                feature_featuregroups.append(featuregroup_matched)
        if len(feature_featuregroups) != 1:
            join_col = query_planner._get_join_col(feature_featuregroups)
            self.join_str = query_planner._get_join_str(feature_featuregroups, join_col)
        self.featuregroups_str = fs_utils._get_table_name(feature_featuregroups[0].name,
                                                          feature_featuregroups[0].version)
        self.featuregroups = feature_featuregroups

    fs_utils._log(
        "Logical query plan for getting {} features from the featurestore created successfully"
        .format(len(query.features)))