Exemplo n.º 1
0
    def compute_stats(self, tbl: LuxSQLTable):
        """
        Function which computes the min and max values for each variable within the specified Lux DataFrame's SQL table.
        Populates the metadata parameters of the specified Lux DataFrame.

        Parameters
        ----------
        tbl: lux.LuxSQLTable
            lux.LuxSQLTable object whose metadata will be calculated

        Returns
        -------
        None
        """
        # precompute statistics
        tbl.unique_values = {}
        tbl._min_max = {}
        length_query = pandas.read_sql(lux.config.query_templates['length_query'].format(table_name = tbl.table_name, where_clause = ""),lux.config.SQLconnection,)
        tbl._length = list(length_query["length"])[0]

        self.get_unique_values(tbl)
        for attribute in tbl.columns:
            if tbl._data_type[attribute] == "quantitative":
                min_max_query = pandas.read_sql(lux.config.query_templates['min_max_query'].format(attribute = attribute, table_name = tbl.table_name),lux.config.SQLconnection,)
                tbl._min_max[attribute] = (list(min_max_query["min"])[0],list(min_max_query["max"])[0],)
Exemplo n.º 2
0
    def compute_data_type(self, lst: LuxSQLTable):
        """
        Function which the equivalent Pandas data type of each variable within the specified Lux DataFrame's SQL table.
        Populates the metadata parameters of the specified Lux DataFrame.

        Parameters
        ----------
        lst: lux.LuxSQLTable
            lux.LuxSQLTable object whose metadata will be calculated

        Returns
        -------
        None
        """
        data_type = {}
        sql_dtypes = {}
        self.get_cardinality(lst)
        if "." in lst.table_name:
            table_name = lst.table_name[lst.table_name.index(".") + 1 :]
        else:
            table_name = lst.table_name
        # get the data types of the attributes in the SQL table
        for attr in list(lst.columns):
            datatype_query = "SELECT DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{}' AND COLUMN_NAME = '{}'".format(
                table_name, attr
            )
            datatype = list(pandas.read_sql(datatype_query, lux.config.SQLconnection)["data_type"])[0]

            sql_dtypes[attr] = datatype
        for attr in list(lst.columns):
            if str(attr).lower() in ["month", "year"]:
                data_type[attr] = "temporal"
            elif sql_dtypes[attr] in [
                "character",
                "character varying",
                "boolean",
                "uuid",
                "text",
            ]:
                data_type[attr] = "nominal"
            elif sql_dtypes[attr] in [
                "integer",
                "numeric",
                "decimal",
                "bigint",
                "real",
                "smallint",
                "smallserial",
                "serial",
                "double precision",
            ]:
                if lst.cardinality[attr] < 13:
                    data_type[attr] = "nominal"
                elif check_if_id_like(lst, attr):
                    lst._data_type[attr] = "id"
                else:
                    data_type[attr] = "quantitative"
            elif "time" in sql_dtypes[attr] or "date" in sql_dtypes[attr]:
                data_type[attr] = "temporal"
        lst._data_type = data_type
Exemplo n.º 3
0
    def get_unique_values(self, tbl: LuxSQLTable):
        """
        Function which collects the unique values for each variable within the specified Lux DataFrame's SQL table.
        Populates the metadata parameters of the specified Lux DataFrame.

        Parameters
        ----------
        tbl: lux.LuxSQLTable
            lux.LuxSQLTable object whose metadata will be calculated

        Returns
        -------
        None
        """
        unique_vals = {}
        for attr in list(tbl.columns):
            unique_query = 'SELECT Distinct("{}") FROM {} WHERE "{}" IS NOT NULL'.format(
                attr, tbl.table_name, attr
            )
            unique_data = pandas.read_sql(
                unique_query,
                lux.config.SQLconnection,
            )
            unique_vals[attr] = list(unique_data[attr])
        tbl.unique_values = unique_vals
Exemplo n.º 4
0
    def get_cardinality(self, tbl: LuxSQLTable):
        """
        Function which computes the cardinality for each variable within the specified Lux DataFrame's SQL table.
        Populates the metadata parameters of the specified Lux DataFrame.

        Parameters
        ----------
        tbl: lux.LuxSQLTable
            lux.LuxSQLTable object whose metadata will be calculated

        Returns
        -------
        None
        """
        cardinality = {}
        for attr in list(tbl.columns):
            card_query = 'SELECT Count(Distinct("{}")) FROM {} WHERE "{}" IS NOT NULL'.format(
                attr, tbl.table_name, attr
            )
            card_data = pandas.read_sql(
                card_query,
                lux.config.SQLconnection,
            )
            cardinality[attr] = list(card_data["count"])[0]
        tbl.cardinality = cardinality
Exemplo n.º 5
0
    def get_SQL_attributes(self, tbl: LuxSQLTable):
        """
        Retrieves the names of variables within a specified Lux DataFrame's Postgres SQL table.
        Uses these variables to populate the Lux DataFrame's columns list.

        Parameters
        ----------
        tbl: lux.LuxSQLTable
            lux.LuxSQLTable object whose columns will be populated

        Returns
        -------
        None
        """
        if "." in tbl.table_name:
            table_name = tbl.table_name[self.table_name.index(".") + 1 :]
        else:
            table_name = tbl.table_name
        attr_query = "SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS where TABLE_NAME = '{}'".format(
            table_name
        )
        attributes = list(pandas.read_sql(attr_query, lux.config.SQLconnection)["column_name"])
        for attr in attributes:
            tbl[attr] = None
        tbl._setup_done = True
Exemplo n.º 6
0
    def get_SQL_attributes(self, tbl: LuxSQLTable):
        """
        Retrieves the names of variables within a specified Lux DataFrame's Postgres SQL table.
        Uses these variables to populate the Lux DataFrame's columns list.

        Parameters
        ----------
        tbl: lux.LuxSQLTable
            lux.LuxSQLTable object whose columns will be populated

        Returns
        -------
        None
        """
        if "." in tbl.table_name:
            table_name = tbl.table_name[self.table_name.index(".") + 1 :]
        else:
            table_name = tbl.table_name
        attr_query = lux.config.query_templates['table_attributes_query'].format(
            table_name = table_name, 
        )
        attributes = list(pandas.read_sql(attr_query, lux.config.SQLconnection)["column_name"])
        for attr in attributes:
            tbl[attr] = None
        tbl._setup_done = True
Exemplo n.º 7
0
    def execute_sampling(tbl: LuxSQLTable):
        SAMPLE_FLAG = lux.config.sampling
        SAMPLE_START = lux.config.sampling_start
        SAMPLE_CAP = lux.config.sampling_cap
        SAMPLE_FRAC = 0.2

        length_query = pandas.read_sql(lux.config.query_templates['length_query'].format(table_name = tbl.table_name, where_clause = ""),lux.config.SQLconnection,)
        limit = int(list(length_query["length"])[0]) * SAMPLE_FRAC
        sample_query = lux.config.query_templates['sample_query'].format(table_name = tbl.table_name, where_clause = "", num_rows = str(int(limit)))
        tbl._sampled = pandas.read_sql(sample_query, lux.config.SQLconnection)
Exemplo n.º 8
0
    def compute_data_type(self, tbl: LuxSQLTable):
        """
        Function which the equivalent Pandas data type of each variable within the specified Lux DataFrame's SQL table.
        Populates the metadata parameters of the specified Lux DataFrame.

        Parameters
        ----------
        tbl: lux.LuxSQLTable
            lux.LuxSQLTable object whose metadata will be calculated

        Returns
        -------
        None
        """
        data_type = {}
        self.get_cardinality(tbl)
        if "." in tbl.table_name:
            table_name = tbl.table_name[tbl.table_name.index(".") + 1 :]
        else:
            table_name = tbl.table_name
        # get the data types of the attributes in the SQL table
        for attr in list(tbl.columns):
            datatype_query = lux.config.query_templates['datatype_query'].format(table_name = table_name, attribute = attr)
            datatype = list(pandas.read_sql(datatype_query, lux.config.SQLconnection)["data_type"])[0]
            if str(attr).lower() in {"month", "year"} or "time" in datatype or "date" in datatype:
                data_type[attr] = "temporal"
            elif datatype in {
                "character",
                "character varying",
                "boolean",
                "uuid",
                "text",
            }:
                data_type[attr] = "nominal"
            elif datatype in {
                "integer",
                "numeric",
                "decimal",
                "bigint",
                "real",
                "smallint",
                "smallserial",
                "serial",
                "double",
                "double precision",
            }:
                if tbl.cardinality[attr] < 13:
                    data_type[attr] = "nominal"
                elif check_if_id_like(tbl, attr):
                    data_type[attr] = "id"
                else:
                    data_type[attr] = "quantitative"

        tbl._data_type = data_type
Exemplo n.º 9
0
    def execute_sampling(tbl: LuxSQLTable):
        SAMPLE_FLAG = lux.config.sampling
        SAMPLE_START = lux.config.sampling_start
        SAMPLE_CAP = lux.config.sampling_cap
        SAMPLE_FRAC = 0.2

        length_query = pandas.read_sql(
            "SELECT COUNT(*) as length FROM {}".format(tbl.table_name),
            lux.config.SQLconnection,
        )
        limit = int(list(length_query["length"])[0]) * SAMPLE_FRAC
        tbl._sampled = pandas.read_sql(
            "SELECT * from {} LIMIT {}".format(tbl.table_name, str(limit)), lux.config.SQLconnection
        )
Exemplo n.º 10
0
    def compute_stats(self, tbl: LuxSQLTable):
        """
        Function which computes the min and max values for each variable within the specified Lux DataFrame's SQL table.
        Populates the metadata parameters of the specified Lux DataFrame.

        Parameters
        ----------
        tbl: lux.LuxSQLTable
            lux.LuxSQLTable object whose metadata will be calculated

        Returns
        -------
        None
        """
        # precompute statistics
        tbl.unique_values = {}
        tbl._min_max = {}
        length_query = pandas.read_sql(
            "SELECT COUNT(1) as length FROM {}".format(tbl.table_name),
            lux.config.SQLconnection,
        )
        tbl._length = list(length_query["length"])[0]

        self.get_unique_values(tbl)
        for attribute in tbl.columns:
            if tbl._data_type[attribute] == "quantitative":
                min_max_query = pandas.read_sql(
                    'SELECT MIN("{}") as min, MAX("{}") as max FROM {}'.format(
                        attribute, attribute, tbl.table_name
                    ),
                    lux.config.SQLconnection,
                )
                tbl._min_max[attribute] = (
                    list(min_max_query["min"])[0],
                    list(min_max_query["max"])[0],
                )
Exemplo n.º 11
0
    def get_unique_values(self, tbl: LuxSQLTable):
        """
        Function which collects the unique values for each variable within the specified Lux DataFrame's SQL table.
        Populates the metadata parameters of the specified Lux DataFrame.

        Parameters
        ----------
        tbl: lux.LuxSQLTable
            lux.LuxSQLTable object whose metadata will be calculated

        Returns
        -------
        None
        """
        unique_vals = {}
        for attr in list(tbl.columns):
            unique_query = lux.config.query_templates['unique_query'].format(attribute = attr, table_name = tbl.table_name)
            unique_data = pandas.read_sql(unique_query,lux.config.SQLconnection,)
            unique_vals[attr] = list(unique_data[attr])
        tbl.unique_values = unique_vals
Exemplo n.º 12
0
    def get_cardinality(self, tbl: LuxSQLTable):
        """
        Function which computes the cardinality for each variable within the specified Lux DataFrame's SQL table.
        Populates the metadata parameters of the specified Lux DataFrame.

        Parameters
        ----------
        tbl: lux.LuxSQLTable
            lux.LuxSQLTable object whose metadata will be calculated

        Returns
        -------
        None
        """
        cardinality = {}
        for attr in list(tbl.columns):
            card_query = lux.config.query_templates['cardinality_query'].format(attribute = attr, table_name = tbl.table_name)
            card_data = pandas.read_sql(card_query,lux.config.SQLconnection,)
            cardinality[attr] = list(card_data["count"])[0]
        tbl.cardinality = cardinality
Exemplo n.º 13
0
    def compute_dataset_metadata(self, tbl: LuxSQLTable):
        """
        Function which computes the metadata required for the Lux recommendation system.
        Populates the metadata parameters of the specified Lux DataFrame.

        Parameters
        ----------
        tbl: lux.LuxSQLTable
            lux.LuxSQLTable object whose metadata will be calculated

        Returns
        -------
        None
        """
        if not tbl._setup_done:
            self.get_SQL_attributes(tbl)
        tbl._data_type = {}
        #####NOTE: since we aren't expecting users to do much data processing with the SQL database, should we just keep this
        #####      in the initialization and do it just once
        self.compute_data_type(tbl)
        self.compute_stats(tbl)
Exemplo n.º 14
0
 def execute_preview(lst: LuxSQLTable):
     lst._sampled = pandas.read_sql(
         "SELECT * from {} LIMIT 5".format(lst.table_name), lux.config.SQLconnection
     )