Exemplo n.º 1
0
def _get_join_str(featuregroups, join_key):
    """
    Builds the "JOIN t1 JOIN t2 ... ON ..." fragment used to join feature groups in SQL.

    The first feature group acts as the base table of the join: every other
    feature group contributes one "JOIN <table> " clause and one equality
    condition that compares its join column with the join column of the first
    feature group. The conditions are chained with " AND ".

    Args:
        :featuregroups: the featuregroups to join, the first one is the base table
        :join_key: name of the column to join on

    Returns:
        SQL join string to join a set of feature groups together. With fewer
        than two feature groups only the bare "ON " keyword is returned.
    """
    joined = featuregroups[1:]
    join_clauses = "".join(
        "JOIN " + fs_utils._get_table_name(fg.name, fg.version) + " "
        for fg in joined)
    if not joined:
        # Nothing to join against, so no table name needs to be resolved.
        return join_clauses + "ON "

    base_table = fs_utils._get_table_name(featuregroups[0].name,
                                          featuregroups[0].version)
    conditions = [
        base_table + ".`" + join_key + "`="
        + fs_utils._get_table_name(fg.name, fg.version) + ".`" + join_key + "`"
        for fg in joined
    ]
    return join_clauses + "ON " + " AND ".join(conditions)
Exemplo n.º 2
0
def _find_featuregroup(featuregroups, featuregroup_name, featuregroup_version):
    """
    Looks up a feature group by name and version among the parsed feature groups

    Args:
        :featuregroups: featuregroup metadata of the feature store, indexed by the table name
                        that fs_utils._get_table_name builds from name and version
        :featuregroup_name: name of the feature group
        :featuregroup_version: version of the feature group

    Returns:
        The matching feature group

    Raises:
        :FeaturegroupNotFound: if the requested feature group could not be found
    """
    try:
        table_name = fs_utils._get_table_name(featuregroup_name,
                                              featuregroup_version)
        return featuregroups[table_name]
    except KeyError:
        # List everything that is available so the error message helps to spot typos.
        available = [
            fs_utils._get_table_name(candidate.name, candidate.version)
            for candidate in featuregroups.values()
        ]
        message = ("Could not find the requested feature group with name: {} "
                   "and version: {} among the list of available feature groups: {}")
        raise FeaturegroupNotFound(
            message.format(featuregroup_name, featuregroup_version, available))
    def _feature_query(self):
        """
        Creates a logical query plan for a user-query for a single feature

        Populates self.join_str (always None here), self.features_str,
        self.featuregroups_str and self.featuregroups. When the query names a
        featuregroup it is looked up directly in the featurestore metadata,
        otherwise the featuregroup is inferred by searching all featuregroups
        for the feature.

        Returns:
            None

        Raises:
            :FeaturegroupNotFound: if no featuregroup was supplied and the featurestore metadata
                                   does not contain any featuregroups
        """
        query = self.query
        self.join_str = None
        self.features_str = query.feature
        if query.featuregroup is not None:
            # The table name is both the FROM target and the key into the metadata.
            table_name = fs_utils._get_table_name(query.featuregroup,
                                                  query.featuregroup_version)
            self.featuregroups_str = table_name
            self.featuregroups = [query.featurestore_metadata.featuregroups[table_name]]
        else:
            featuregroups_parsed = query.featurestore_metadata.featuregroups
            if not featuregroups_parsed:
                raise FeaturegroupNotFound("Could not find any featuregroups in the metastore "
                                           "that contains the given feature, "
                                           "please explicitly supply featuregroups as an argument to the API call")
            featuregroup_matched = query_planner._find_feature(query.feature, query.featurestore,
                                                               featuregroups_parsed.values())
            self.featuregroups_str = fs_utils._get_table_name(featuregroup_matched.name,
                                                              featuregroup_matched.version)
            self.featuregroups = [featuregroup_matched]

        fs_utils._log("Logical query plan for getting 1 feature from the featurestore created successfully")
Exemplo n.º 4
0
def _find_training_dataset(training_datasets, training_dataset,
                           training_dataset_version):
    """
    Looks up a training dataset by name and version among the parsed training datasets

    Args:
        :training_datasets: training dataset metadata, indexed by the table name that
                            fs_utils._get_table_name builds from name and version
        :training_dataset: name of the training dataset
        :training_dataset_version: version of the training dataset

    Returns:
        The matching training dataset

    Raises:
        :TrainingDatasetNotFound: if the requested training dataset could not be found
    """
    try:
        table_name = fs_utils._get_table_name(training_dataset,
                                              training_dataset_version)
        return training_datasets[table_name]
    except KeyError:
        # List everything that is available so the error message helps to spot typos.
        available = [
            fs_utils._get_table_name(candidate.name, candidate.version)
            for candidate in training_datasets.values()
        ]
        message = ("Could not find the requested training dataset with name: {} "
                   "and version: {} among the list of available training datasets: {}")
        raise TrainingDatasetNotFound(
            message.format(training_dataset, training_dataset_version, available))
Exemplo n.º 5
0
def _get_feature_featuregroup_mapping(logical_query_plan, featurestore,
                                      featurestore_metadata):
    """
    Builds a mapping from feature short name to the table name of the
    featuregroup that provides it, based on the logical query plan.

    The kind of query held by the plan decides how the featuregroup is resolved:
    a FeatureQuery uses the first featuregroup of the plan, a FeaturesQuery
    searches the featuregroups of the plan for each feature, and a
    FeaturegroupQuery maps every feature of the requested featuregroup.

    Args:
        :logical_query_plan: the logical query plan containing information about the features to be fetched
        :featurestore: the featurestore
        :featurestore_metadata: featurestore metadata

    Returns:
        a dict of feature short name to featuregroup table name
    """
    query = logical_query_plan.query
    feature_to_table = {}

    if isinstance(query, FeatureQuery):
        owner = logical_query_plan.featuregroups[0]
        feature_to_table[_get_feature_short_name(query.feature)] = \
            fs_utils._get_table_name(owner.name, owner.version)

    if isinstance(query, FeaturesQuery):
        for feature in query.features:
            owner = _find_feature(feature, featurestore,
                                  logical_query_plan.featuregroups)
            feature_to_table[_get_feature_short_name(feature)] = \
                fs_utils._get_table_name(owner.name, owner.version)

    if isinstance(query, FeaturegroupQuery):
        owner = _find_featuregroup(featurestore_metadata.featuregroups,
                                   query.featuregroup,
                                   query.featuregroup_version)
        for feature in owner.features:
            feature_to_table[_get_feature_short_name(feature.name)] = \
                fs_utils._get_table_name(owner.name, owner.version)

    return feature_to_table
    def _parse_featurestore_metadata(self, metadata_json):
        """
        Parses the featurestore metadata from the REST API into dict-based lookup
        structures for featuregroups, training datasets, features and storage connectors

        Args:
            :metadata_json: the JSON metadata of the featurestore returned by hopsworks

        Returns:
            A tuple (featuregroups, training_datasets, features_to_featuregroups, featurestore,
            settings, storage_connectors) where

            * featuregroups maps table name -> Featuregroup
            * training_datasets maps table name -> TrainingDataset
            * features_to_featuregroups maps feature name -> list of Featuregroup that contain it
            * featurestore is the parsed Featurestore
            * settings is the parsed FeaturestoreSettings
            * storage_connectors maps connector name -> parsed storage connector

        """
        rest = constants.REST_CONFIG

        featuregroups = {}
        features_to_featuregroups = {}
        for fg_json in metadata_json[rest.JSON_FEATUREGROUPS]:
            parsed_fg = Featuregroup(fg_json)
            featuregroups[fs_utils._get_table_name(
                fg_json[rest.JSON_FEATUREGROUP_NAME],
                fg_json[rest.JSON_FEATUREGROUP_VERSION])] = parsed_fg
            for feature_json in fg_json[rest.JSON_FEATUREGROUP_FEATURES]:
                # A separate Featuregroup instance is created per feature entry.
                feature_name = feature_json[rest.JSON_FEATURE_NAME]
                features_to_featuregroups.setdefault(feature_name, []).append(
                    Featuregroup(fg_json))

        training_datasets = {}
        for td_json in metadata_json[rest.JSON_TRAINING_DATASETS]:
            parsed_td = TrainingDataset(td_json)
            training_datasets[fs_utils._get_table_name(
                td_json[rest.JSON_TRAINING_DATASET_NAME],
                td_json[rest.JSON_TRAINING_DATASET_VERSION])] = parsed_td

        settings = FeaturestoreSettings(
            metadata_json[rest.JSON_FEATURESTORE_SETTINGS])

        storage_connectors = {}
        for sc_json in metadata_json[rest.JSON_FEATURESTORE_STORAGE_CONNECTORS]:
            connector_type = sc_json[rest.JSON_FEATURESTORE_CONNECTOR_TYPE]
            # The checks are deliberately independent (no elif): connectors of an
            # unknown type are skipped, and a later match overrides an earlier one.
            if connector_type == settings.jdbc_connector_type:
                storage_connectors[sc_json[rest.JSON_FEATURESTORE_CONNECTOR_NAME]] = \
                    JDBCStorageConnector(sc_json)
            if connector_type == settings.s3_connector_type:
                storage_connectors[sc_json[rest.JSON_FEATURESTORE_CONNECTOR_NAME]] = \
                    S3StorageConnector(sc_json)
            if connector_type == settings.hopsfs_connector_type:
                storage_connectors[sc_json[rest.JSON_FEATURESTORE_CONNECTOR_NAME]] = \
                    HopsfsStorageConnector(sc_json)

        featurestore = Featurestore(metadata_json[rest.JSON_FEATURESTORE])
        return (featuregroups, training_datasets, features_to_featuregroups,
                featurestore, settings, storage_connectors)
Exemplo n.º 7
0
def _find_feature(feature, featurestore, featuregroups_parsed):
    """
    Resolves the single featuregroup that contains a given feature.

    Args:
        :feature: the feature to search for
        :featurestore: the featurestore where the featuregroups resides (only used in error messages)
        :featuregroups_parsed: the featuregroups to look through

    Returns:
        the featuregroup that contains the feature

    Raises:
        :FeatureNotFound: if none of the featuregroups contains the feature
        :FeatureNameCollisionError: if more than one featuregroup contains the feature
    """
    candidates = _find_featuregroup_that_contains_feature(
        featuregroups_parsed, feature)
    match_count = len(candidates)

    if match_count == 1:
        return candidates[0]

    if match_count == 0:
        raise FeatureNotFound(
            "Could not find the feature with name '{}' in any of the featuregroups of the featurestore: '{}'"
            .format(feature, featurestore))

    # Ambiguous feature name: report every featuregroup that matched.
    candidate_tables = ",".join(
        fs_utils._get_table_name(fg.name, fg.version) for fg in candidates)
    raise FeatureNameCollisionError(
        "Found the feature with name '{}' "
        "in more than one of the featuregroups of the featurestore: '{}', "
        "please specify the optional argument 'featuregroup=', "
        "the matched featuregroups were: {}".format(
            feature, featurestore, candidate_tables))
Exemplo n.º 8
0
def _do_get_featuregroup_partitions(featuregroup_name, featurestore_metadata, featurestore=None, featuregroup_version=1,
                                    online=False):
    """
    Runs "SHOW PARTITIONS" for the table of a cached featuregroup

     Args:
        :featuregroup_name: the featuregroup to get partitions for
        :featurestore_metadata: metadata of the featurestore, used to look up the featuregroup and
                                the type identifier of on-demand featuregroups
        :featurestore: the featurestore where the featuregroup resides, defaults to the project's featurestore
        :featuregroup_version: the version of the featuregroup, defaults to 1
        :online: a boolean flag whether to fetch the online feature or the offline one (assuming that the
                 feature group that the feature is stored in has online serving enabled)
                 (for cached feature groups only)

     Returns:
        the result of _run_and_log_sql for the SHOW PARTITIONS statement
        (presumably a dataframe with the partitions of the featuregroup)

     Raises:
        :CannotGetPartitionsOfOnDemandFeatureGroup: if the featuregroup is an on-demand featuregroup
     """
    featuregroup = query_planner._find_featuregroup(
        featurestore_metadata.featuregroups, featuregroup_name, featuregroup_version)

    if featuregroup.featuregroup_type == featurestore_metadata.settings.on_demand_featuregroup_type:
        error_template = ("The feature group with name: {} , and version: {} "
                          "is an on-demand feature group. "
                          "Get partitions operation is only supported for "
                          "cached feature groups.")
        raise CannotGetPartitionsOfOnDemandFeatureGroup(
            error_template.format(featuregroup_name, featuregroup_version))

    table_name = fs_utils._get_table_name(featuregroup_name, featuregroup_version)
    return _run_and_log_sql("SHOW PARTITIONS " + table_name, featurestore, online)
Exemplo n.º 9
0
def _do_get_featuregroup_partitions(featuregroup_name,
                                    featurestore_metadata,
                                    featurestore=None,
                                    featuregroup_version=1):
    """
    Runs "SHOW PARTITIONS" over a hive connection for the table of a cached featuregroup

     Args:
        :featuregroup_name: the featuregroup to get partitions for
        :featurestore_metadata: metadata of the featurestore, used to look up the featuregroup and
                                the type identifier of on-demand featuregroups
        :featurestore: the featurestore where the featuregroup resides, defaults to the project's featurestore
        :featuregroup_version: the version of the featuregroup, defaults to 1

     Returns:
        the result of _run_and_log_sql for the SHOW PARTITIONS statement
        (presumably a dataframe with the partitions of the featuregroup)

     Raises:
        :CannotGetPartitionsOfOnDemandFeatureGroup: if the featuregroup is an on-demand featuregroup
     """
    featuregroup = query_planner._find_featuregroup(
        featurestore_metadata.featuregroups, featuregroup_name, featuregroup_version)

    if featuregroup.featuregroup_type == featurestore_metadata.settings.on_demand_featuregroup_type:
        error_template = ("The feature group with name: {} , and version: {} "
                          "is an on-demand feature group. "
                          "Get partitions operation is only supported for "
                          "cached feature groups.")
        raise CannotGetPartitionsOfOnDemandFeatureGroup(
            error_template.format(featuregroup_name, featuregroup_version))

    # The connection is only opened once the featuregroup is known to be cached.
    hive_connection = util._create_hive_connection(featurestore)
    table_name = fs_utils._get_table_name(featuregroup_name, featuregroup_version)
    return _run_and_log_sql(hive_connection, "SHOW PARTITIONS " + table_name)
Exemplo n.º 10
0
    def _featuregroup_query(self):
        """
        Creates a logical query plan for a user query to get a featuregroup

        All columns are selected ("*"), no join is needed, and the table to read
        from is the one derived from the featuregroup name and version of the query.

        Returns:
            None

        """
        requested = self.query
        self.features_str = "*"
        self.join_str = None
        self.featuregroups_str = fs_utils._get_table_name(requested.featuregroup,
                                                          requested.featuregroup_version)
Exemplo n.º 11
0
def _do_get_training_datasets(featurestore_metadata):
    """
    Lists the table names of all training datasets in a featurestore

    Args:
        :featurestore_metadata: metadata of the featurestore

    Returns:
        A list with one table name (built from name and version) per training
        dataset in the metadata
    """
    return [
        fs_utils._get_table_name(dataset.name, dataset.version)
        for dataset in featurestore_metadata.training_datasets.values()
    ]
Exemplo n.º 12
0
def _find_featuregroup_that_contains_feature(featuregroups, feature):
    """
    Go through the featuregroups and collect the ones that contain the feature

    A featuregroup matches when one of its features has exactly the requested
    name, when "<table name>.<feature name>" equals the requested name, or when
    both the feature name and the featuregroup name occur as substrings of the
    requested name.

    Args:
        :featuregroups: featuregroups to search through
        :feature: the feature to look for

    Returns:
        a list of the featuregroups that contain the given feature, each
        featuregroup at most once and in the order they were given

    """
    def _holds_feature(fg):
        for candidate in fg.features:
            qualified_name = fs_utils._get_table_name(fg.name, fg.version) + "." + candidate.name
            if candidate.name == feature or qualified_name == feature:
                return True
            if candidate.name in feature and fg.name in feature:
                return True
        return False

    return [fg for fg in featuregroups if _holds_feature(fg)]
Exemplo n.º 13
0
    def _features_query(self):
        """
        Creates a logical query plan from a user query to get a list of features

        Populates self.features_str, self.join_str, self.featuregroups_str and
        self.featuregroups. How the featuregroups are resolved depends on how
        many featuregroups the query names:

        * exactly one: that featuregroup is used and no join is needed
        * more than one: the named featuregroups are joined, on the join key of the
          query if it is set, otherwise on a join column inferred by the query planner
        * none: the featuregroup of every feature is inferred from the metadata and
          the distinct featuregroups are joined if there is more than one

        Returns:
            None

        Raises:
            :FeaturegroupNotFound: if featuregroups or the join column have to be inferred from the
                                   metadata but the metadata does not contain any featuregroups
        """
        query = self.query
        rest = constants.REST_CONFIG
        self.features_str = ", ".join(query.features)
        self.join_str = None

        requested = query.featuregroups_version_dict
        if len(requested) == 1:
            # The table name is both the FROM target and the key into the metadata.
            table_name = fs_utils._get_table_name(
                requested[0][rest.JSON_FEATUREGROUP_NAME],
                requested[0][rest.JSON_FEATUREGROUP_VERSION])
            self.featuregroups_str = table_name
            self.featuregroups = [query.featurestore_metadata.featuregroups[table_name]]

        elif len(requested) > 1:
            if query.join_key is not None:
                featuregroups = [
                    query.featurestore_metadata.featuregroups[
                        fs_utils._get_table_name(
                            entry[rest.JSON_FEATUREGROUP_NAME],
                            entry[rest.JSON_FEATUREGROUP_VERSION])]
                    for entry in requested
                ]
                self.join_str = query_planner._get_join_str(
                    featuregroups, query.join_key)
                self.featuregroups_str = fs_utils._get_table_name(
                    featuregroups[0].name, featuregroups[0].version)
                self.featuregroups = featuregroups
            else:
                featuregroups_parsed = query.featurestore_metadata.featuregroups
                if not featuregroups_parsed:
                    raise FeaturegroupNotFound(
                        "Could not find any featuregroups containing "
                        "the features in the metastore, "
                        "please explicitly supply featuregroups as an argument to the API call"
                    )
                # Keep only the featuregroups whose name and version were requested.
                requested_versions = query.featuregroups_version_dict_orig
                featuregroups_filtered = [
                    fg for fg in featuregroups_parsed.values()
                    if fg.name in requested_versions
                    and requested_versions[fg.name] == fg.version
                ]
                join_col = query_planner._get_join_col(featuregroups_filtered)
                self.join_str = query_planner._get_join_str(
                    featuregroups_filtered, join_col)
                self.featuregroups_str = fs_utils._get_table_name(
                    featuregroups_filtered[0].name,
                    featuregroups_filtered[0].version)
                self.featuregroups = featuregroups_filtered

        else:
            # No featuregroups were named: infer them feature by feature.
            featuregroups_parsed = query.featurestore_metadata.featuregroups
            if not featuregroups_parsed:
                raise FeaturegroupNotFound(
                    "Could not find any featuregroups in the metastore, "
                    "please explicitly supply featuregroups as an argument to the API call"
                )
            feature_featuregroups = []
            for feature in query.features:
                featuregroup_matched = query_planner._find_feature(
                    feature, query.featurestore,
                    featuregroups_parsed.values())
                # Collect every featuregroup only once, in order of first appearance.
                if not query_planner._check_if_list_of_featuregroups_contains_featuregroup(
                        feature_featuregroups, featuregroup_matched.name,
                        featuregroup_matched.version):
                    feature_featuregroups.append(featuregroup_matched)

            if len(feature_featuregroups) != 1:
                join_col = query_planner._get_join_col(feature_featuregroups)
                self.join_str = query_planner._get_join_str(
                    feature_featuregroups, join_col)
            self.featuregroups_str = fs_utils._get_table_name(
                feature_featuregroups[0].name,
                feature_featuregroups[0].version)
            self.featuregroups = feature_featuregroups

        fs_utils._log(
            "Logical query plan for getting {} features from the featurestore created successfully"
            .format(len(query.features)))