def _feature_query(self):
        """
        Creates a logical query plan for a user-query for a single feature

        The plan is stored on the instance: self.features_str (the requested feature),
        self.featuregroups_str (the table to select from), self.featuregroups (a one-element
        list with the featuregroup metadata) and self.join_str (always None here, since a
        single feature is read from a single featuregroup).

        Returns:
            None

        Raises:
            :FeaturegroupNotFound: if no featuregroup was supplied and the metadata contains no featuregroups
                                   (query_planner._find_feature may also raise when the feature
                                   cannot be located; that helper is not visible here)
        """
        query = self.query
        self.join_str = None
        self.features_str = query.feature
        if query.featuregroup is not None:
            # The featuregroup was given explicitly: the table name is both the FROM target
            # and the key into the metadata, so compute it once.
            table_name = fs_utils._get_table_name(query.featuregroup, query.featuregroup_version)
            self.featuregroups_str = table_name
            self.featuregroups = [query.featurestore_metadata.featuregroups[table_name]]
        else:
            # No featuregroup supplied: search all featuregroups in the metadata for the feature
            featuregroups_parsed = query.featurestore_metadata.featuregroups
            if len(featuregroups_parsed.values()) == 0:
                raise FeaturegroupNotFound("Could not find any featuregroups in the metastore "
                                           "that contains the given feature, "
                                           "please explicitly supply featuregroups as an argument to the API call")
            featuregroup_matched = query_planner._find_feature(query.feature, query.featurestore,
                                                               featuregroups_parsed.values())
            self.featuregroups_str = fs_utils._get_table_name(featuregroup_matched.name, featuregroup_matched.version)
            self.featuregroups = [featuregroup_matched]

        fs_utils._log("Logical query plan for getting 1 feature from the featurestore created successfully")
示例#2
0
def _run_and_log_sql(sql_str, featurestore, online=False):
    """
    Runs and logs an SQL query against the offline (Hive) or online (MySQL via SQLAlchemy) feature store

    Args:
        :sql_str: the query to run
        :featurestore: name of the featurestore
        :online: if true, run the query using online feature store JDBC connector

    Returns:
        :pd.DataFrame: the result of the SQL query as pandas dataframe, with any
                       "table." prefix stripped from the column names
    """
    if not online:
        hive_conn = None
        try:
            fs_utils._log(
                "Running sql: {} against the offline feature store".format(
                    sql_str))
            hive_conn = util._create_hive_connection(featurestore)
            dataframe = pd.read_sql(sql_str, hive_conn)
        finally:
            # Close the connection even when the query fails
            if hive_conn is not None:
                hive_conn.close()
    else:
        # Must be the same name that engine.connect() is bound to below, otherwise
        # the finally-clause never sees the open connection and it leaks.
        db_connection = None
        try:
            fs_utils._log(
                "Running sql: {} against online feature store".format(sql_str))
            metadata = _get_featurestore_metadata(featurestore,
                                                  update_cache=False)
            storage_connector = _do_get_online_featurestore_connector(
                featurestore, metadata)
            pw, user = _get_online_feature_store_password_and_user(
                storage_connector)
            # The connection string is parsed twice: the first parse takes the path
            # component of the stored string, the second splits that path into
            # netloc and path (presumably to strip a leading "jdbc:" scheme -- confirm).
            parsed = urllib.parse.urlparse(
                urllib.parse.urlparse(
                    storage_connector.connection_string).path)
            db_connection_str = 'mysql+pymysql://' + user + \
                ':' + pw + '@' + parsed.netloc + parsed.path
            engine = sqlalchemy.create_engine(db_connection_str,
                                              poolclass=NullPool)
            db_connection = engine.connect()
            dataframe = pd.read_sql(sql_str, con=db_connection)
        finally:
            if db_connection is not None:
                db_connection.close()

    # pd.read_sql returns columns in table.column format if columns are not specified in SQL query, i.e. SELECT * FROM..
    # this also occurs when sql query specifies table, i.e. SELECT table1.column1 table2.column2 FROM ... JOIN ...
    # we only want the table column names as dataframe column names
    dataframe.columns = [
        column.split('.')[1] if '.' in column else column
        for column in dataframe.columns
    ]

    return dataframe
    def construct_sql(self):
        """
        Builds the HiveSQL query string from the logical plan and stores it in self.sql_str

        Reads self.features_str, self.featuregroups_str and self.join_str; the join clause
        is appended (separated by a single space) only when self.join_str is not None.

        Returns:
            None
        """
        clauses = ["SELECT " + self.features_str + " FROM " + self.featuregroups_str]
        if self.join_str is not None:
            clauses.append(self.join_str)
        self.sql_str = " ".join(clauses)
        fs_utils._log("SQL string for the query created successfully")
示例#4
0
def _run_and_log_sql(hive_conn, sql_str):
    """
    Runs and logs an SQL query with pyHive

    The connection is always closed before returning, also when the query fails.

    Args:
        :hive_conn: the hive session
        :sql_str: the query to run

    Returns:
        :pd.DataFrame: the result of the SQL query as pandas dataframe
    """
    fs_utils._log("Running sql: {}".format(sql_str))
    # ToDo: right now hive connection is closed after every call. Manage connections better in future (pooling)

    try:
        dataframe = pd.read_sql(sql_str, hive_conn)
    finally:
        # Release the connection even if the query raises, so it does not leak
        hive_conn.close()

    return dataframe
示例#5
0
def _run_and_log_sql(hive_conn, sql_str):
    """
    Runs and logs an SQL query with pyHive

    The connection is always closed before returning, also when the query fails.

    Args:
        :hive_conn: the hive session
        :sql_str: the query to run

    Returns:
        :pd.DataFrame: the result of the SQL query as pandas dataframe, with any
                       "table." prefix stripped from the column names
    """
    fs_utils._log("Running sql: {}".format(sql_str))
    # ToDo: right now hive connection is closed after every call. Manage connections better in future (pooling)

    try:
        dataframe = pd.read_sql(sql_str, hive_conn)
    finally:
        # Release the connection even if the query raises, so it does not leak
        hive_conn.close()

    # pd.read_sql returns columns in table.column format if columns are not specified in SQL query, i.e. SELECT * FROM..
    # this also occurs when sql query specifies table, i.e. SELECT table1.column1 table2.column2 FROM ... JOIN ...
    # we only want the hive table column names as dataframe column names
    dataframe.columns = [column.split('.')[1] if '.' in column else column for column in dataframe.columns]

    return dataframe
    def _features_query(self):
        """
        Creates a logical query plan from a user query to get a list of features

        The plan is stored on the instance: self.features_str (comma-separated features),
        self.featuregroups_str (the first table to select from), self.join_str (the join
        clause, or None when a single featuregroup suffices) and self.featuregroups.

        Three cases are handled, depending on how many featuregroups the user supplied:
            * exactly one: select from that featuregroup, no join
            * more than one: join them, on the user's join key if given, otherwise on a
              column picked by query_planner._get_join_col
            * none: locate each feature's featuregroup in the metadata and join if needed

        Returns:
            None

        Raises:
            :FeaturegroupNotFound: if featuregroups have to be looked up in the metadata and the metadata
                                   contains none (the query_planner helpers may also raise it;
                                   they are not visible here)
        """
        query = self.query
        self.features_str = ", ".join(query.features)
        self.join_str = None

        requested = query.featuregroups_version_dict
        num_requested = len(requested)

        if num_requested == 1:
            # The table name is both the FROM target and the key into the metadata
            table_name = fs_utils._get_table_name(
                requested[0][constants.REST_CONFIG.JSON_FEATUREGROUP_NAME],
                requested[0][constants.REST_CONFIG.JSON_FEATUREGROUP_VERSION])
            self.featuregroups_str = table_name
            self.featuregroups = [
                query.featurestore_metadata.featuregroups[table_name]
            ]

        elif num_requested > 1:
            if query.join_key is not None:
                # Join the user-specified featuregroups on the user-specified key
                featuregroups = [
                    query.featurestore_metadata.featuregroups[
                        fs_utils._get_table_name(
                            entry[constants.REST_CONFIG.JSON_FEATUREGROUP_NAME],
                            entry[constants.REST_CONFIG.JSON_FEATUREGROUP_VERSION])]
                    for entry in requested
                ]
                self.join_str = query_planner._get_join_str(
                    featuregroups, query.join_key)
                self.featuregroups_str = fs_utils._get_table_name(
                    featuregroups[0].name, featuregroups[0].version)
                self.featuregroups = featuregroups

            else:
                # No join key supplied: keep the metadata entries matching the requested
                # name/version pairs and let the planner pick the join column
                featuregroups_parsed = query.featurestore_metadata.featuregroups
                if len(featuregroups_parsed.values()) == 0:
                    raise FeaturegroupNotFound(
                        "Could not find any featuregroups containing "
                        "the features in the metastore, "
                        "please explicitly supply featuregroups as an argument to the API call"
                    )
                requested_versions = query.featuregroups_version_dict_orig
                featuregroups_filtered = [
                    fg for fg in featuregroups_parsed.values()
                    if fg.name in requested_versions
                    and requested_versions[fg.name] == fg.version
                ]
                join_col = query_planner._get_join_col(featuregroups_filtered)
                self.join_str = query_planner._get_join_str(
                    featuregroups_filtered, join_col)
                self.featuregroups_str = fs_utils._get_table_name(
                    featuregroups_filtered[0].name,
                    featuregroups_filtered[0].version)
                self.featuregroups = featuregroups_filtered

        else:
            # No featuregroups supplied: find the featuregroup of every requested feature
            featuregroups_parsed = query.featurestore_metadata.featuregroups
            if len(featuregroups_parsed.values()) == 0:
                raise FeaturegroupNotFound(
                    "Could not find any featuregroups in the metastore, "
                    "please explicitly supply featuregroups as an argument to the API call"
                )
            feature_featuregroups = []
            for feature in query.features:
                featuregroup_matched = query_planner._find_feature(
                    feature, query.featurestore,
                    featuregroups_parsed.values())
                # Record each distinct featuregroup (name + version) only once
                if not query_planner._check_if_list_of_featuregroups_contains_featuregroup(
                        feature_featuregroups, featuregroup_matched.name,
                        featuregroup_matched.version):
                    feature_featuregroups.append(featuregroup_matched)

            # A join is only needed when the features are not all in one featuregroup
            if len(feature_featuregroups) != 1:
                join_col = query_planner._get_join_col(feature_featuregroups)
                self.join_str = query_planner._get_join_str(
                    feature_featuregroups, join_col)
            self.featuregroups_str = fs_utils._get_table_name(
                feature_featuregroups[0].name,
                feature_featuregroups[0].version)
            self.featuregroups = feature_featuregroups

        fs_utils._log(
            "Logical query plan for getting {} features from the featurestore created successfully"
            .format(len(query.features)))