def test_inequalityfilter():
    """execute_filter emits correct WHERE clauses for `>` and `<=` filter ops."""
    connection = psycopg2.connect(
        "host=localhost dbname=postgres user=postgres password=lux")
    sql_df = lux.LuxSQLTable()
    lux.config.set_SQL_connection(connection)
    sql_df.set_SQL_table("car")

    # Strict greater-than filter combined with a non-filter attribute clause.
    vis = Vis(
        [
            lux.Clause(attribute="Horsepower", filter_op=">", value=50),
            lux.Clause(attribute="MilesPerGal"),
        ]
    )
    vis._vis_data = sql_df
    where_clause, applied_filters = SQLExecutor.execute_filter(vis)
    assert where_clause == 'WHERE "Horsepower" > \'50\' AND "MilesPerGal" IS NOT NULL'
    assert applied_filters == ["Horsepower"]

    # Less-than-or-equal filter, this time built through an intent list.
    intent = [
        lux.Clause(attribute="Horsepower", filter_op="<=", value=100),
        lux.Clause(attribute="MilesPerGal"),
    ]
    vis = Vis(intent, sql_df)
    vis._vis_data = sql_df
    where_clause, applied_filters = SQLExecutor.execute_filter(vis)
    assert where_clause == 'WHERE "Horsepower" <= \'100\' AND "MilesPerGal" IS NOT NULL'
    assert applied_filters == ["Horsepower"]
def test_color_barchart_code_export(global_var):
    """Exported python code for a colored bar chart should execute cleanly.

    The test round-trips a Vis through `to_code("python")` and re-executes the
    generated snippet; any exception in the generated code is a failure.
    """
    tbl = lux.LuxSQLTable()
    tbl.set_SQL_table("cars")
    vis = Vis([lux.Clause("origin"), lux.Clause("cylinders")], tbl)
    SQLExecutor.execute([vis], tbl)
    code = vis.to_code("python")
    try:
        exec(code, globals())
        create_chart_data(tbl, vis)
    # BUG FIX: the original bare `except:` also swallowed SystemExit and
    # KeyboardInterrupt; narrow to Exception and report what went wrong.
    except Exception as exc:
        assert False, f"exported chart code failed to execute: {exc!r}"
def test_histogram_code_export(global_var):
    """Exported python code for a histogram should execute cleanly.

    Round-trips a Vis through `to_code("python")` and re-executes the snippet;
    any exception raised by the generated code is a failure.
    """
    tbl = lux.LuxSQLTable()
    tbl.set_SQL_table("cars")
    vis = Vis([lux.Clause("horsepower")], tbl)
    SQLExecutor.execute([vis], tbl)
    code = vis.to_code("python")
    try:
        exec(code, globals())
        create_chart_data(tbl, vis)
    # BUG FIX: the original bare `except:` also swallowed SystemExit and
    # KeyboardInterrupt; narrow to Exception and report what went wrong.
    except Exception as exc:
        assert False, f"exported histogram code failed to execute: {exc!r}"
def test_lazy_execution():
    """Vis data stays empty until the SQL executor is explicitly invoked."""
    tbl = lux.LuxSQLTable()
    tbl.set_SQL_table("cars")
    intent = [
        lux.Clause(attribute="horsepower", aggregation="mean"),
        lux.Clause(attribute="origin"),
    ]
    vis = Vis(intent)
    # Check data field in vis is empty before calling executor
    assert vis.data is None
    SQLExecutor.execute([vis], tbl)
    # IDIOM FIX: use isinstance instead of `type(x) == T` for the type check.
    assert isinstance(vis.data, lux.core.frame.LuxDataFrame)
def test_lazy_execution():
    """Vis data stays empty until the SQL executor is explicitly invoked (Postgres)."""
    connection = psycopg2.connect(
        "host=localhost dbname=postgres user=postgres password=lux")
    sql_df = lux.LuxSQLTable()
    lux.config.set_SQL_connection(connection)
    sql_df.set_SQL_table("car")
    intent = [
        lux.Clause(attribute="Horsepower", aggregation="mean"),
        lux.Clause(attribute="Origin"),
    ]
    vis = Vis(intent)
    # Check data field in vis is empty before calling executor
    assert vis.data is None
    SQLExecutor.execute([vis], sql_df)
    # IDIOM FIX: use isinstance instead of `type(x) == T` for the type check.
    assert isinstance(vis.data, lux.core.frame.LuxDataFrame)
def __init__(self, *args, **kw):
    """Initialize Lux bookkeeping state, then delegate to the pandas constructor.

    Also selects the global executor: Pandas when no SQL connection is
    configured in ``lux.config``, SQL otherwise.
    """
    self._history = History()  # record of operations applied to this frame
    self._intent = []  # user-specified intent (list of lux.Clause)
    self._inferred_intent = []  # intent after inference/expansion
    self._recommendation = {}  # action name -> recommended visualizations
    self._saved_export = None
    self._current_vis = []
    self._prev = None
    self._widget = None
    super(LuxDataFrame, self).__init__(*args, **kw)

    self.table_name = ""
    # Pick the executor from the globally configured SQL connection
    # (empty string means no SQL backend is configured).
    if lux.config.SQLconnection == "":
        from lux.executor.PandasExecutor import PandasExecutor

        lux.config.executor = PandasExecutor()
    else:
        from lux.executor.SQLExecutor import SQLExecutor

        lux.config.executor = SQLExecutor()

    self._sampled = None
    self._toggle_pandas_display = True
    self._message = Message()
    self._pandas_only = False
    # Metadata
    self._data_type = {}
    self.unique_values = None
    self.cardinality = None
    self._min_max = None
    self.pre_aggregated = None
    self._type_override = {}
    warnings.formatwarning = lux.warning_format
def __init__(self, *args, table_name="", **kw):
    """Construct a LuxSQLTable, forcing the SQL executor globally.

    Parameters
    ----------
    table_name : str, optional
        If non-empty, immediately binds this object to the named SQL table
        via ``set_SQL_table``.
    """
    super(LuxSQLTable, self).__init__(*args, **kw)
    # SQL tables always use the SQL executor, regardless of prior config.
    from lux.executor.SQLExecutor import SQLExecutor

    lux.config.executor = SQLExecutor()
    self._length = 0  # cached row count (filled in lazily)
    self._setup_done = False
    if table_name != "":
        self.set_SQL_table(table_name)
    warnings.formatwarning = lux.warning_format
def set_executor_type(self, exe):
    """Select the executor backend.

    Parameters
    ----------
    exe : str
        Either ``"SQL"`` or ``"Pandas"``.

    Raises
    ------
    ValueError
        If *exe* is neither of the two supported backends.
    """
    if exe == "Pandas":
        from lux.executor.PandasExecutor import PandasExecutor

        # Switching back to Pandas discards any stored SQL connection.
        self.SQLconnection = ""
        self.executor = PandasExecutor()
    elif exe == "SQL":
        from lux.executor.SQLExecutor import SQLExecutor

        self.executor = SQLExecutor()
    else:
        raise ValueError("Executor type must be either 'Pandas' or 'SQL'")
def maintain_metadata(self):
    """Compute dataset metadata and statistics, but only when stale.

    Ensures the SQL executor is active whenever a SQL connection is
    configured, then (re)computes stats/metadata if they have not been
    computed yet.
    """
    # BUG FIX: the executor's name is "SQLExecutor" (see the check further
    # below in this same function); the original compared against "SQL",
    # which is always unequal and re-instantiated the executor on every call.
    if lux.config.SQLconnection != "" and lux.config.executor.name != "SQLExecutor":
        from lux.executor.SQLExecutor import SQLExecutor

        lux.config.executor = SQLExecutor()
    # Check that metadata has not yet been computed
    if not hasattr(self, "_metadata_fresh") or not self._metadata_fresh:
        # only compute metadata information if the dataframe is non-empty
        # (SQL tables report len 0 locally, so always compute for SQL)
        if len(self) > 0 or lux.config.executor.name == "SQLExecutor":
            lux.config.executor.compute_stats(self)
            lux.config.executor.compute_dataset_metadata(self)
            self._infer_structure()
            self._metadata_fresh = True
def test_inequalityfilter():
    """execute_filter builds the expected WHERE clause for `>` and `<=` ops."""
    tbl = lux.LuxSQLTable()
    tbl.set_SQL_table("cars")

    # Greater-than filter plus a plain attribute clause.
    vis = Vis(
        [
            lux.Clause(attribute="horsepower", filter_op=">", value=50),
            lux.Clause(attribute="milespergal"),
        ]
    )
    vis._vis_data = tbl
    where_clause, applied = SQLExecutor.execute_filter(vis)
    assert where_clause == 'WHERE "horsepower" > \'50\' AND "milespergal" IS NOT NULL'
    assert applied == ["horsepower"]

    # Less-than-or-equal filter built through an intent list.
    intent = [
        lux.Clause(attribute="horsepower", filter_op="<=", value=100),
        lux.Clause(attribute="milespergal"),
    ]
    vis = Vis(intent, tbl)
    vis._vis_data = tbl
    where_clause, applied = SQLExecutor.execute_filter(vis)
    assert where_clause == 'WHERE "horsepower" <= \'100\' AND "milespergal" IS NOT NULL'
    assert applied == ["horsepower"]
def read_query_template(self, query_template):
    """Parse a ``key: value`` query template and switch to the SQL executor.

    Parameters
    ----------
    query_template : str or path-like
        Either the template text itself (newline-separated ``key: value``
        lines) or a path to a file containing such lines.
    """
    from lux.executor.SQLExecutor import SQLExecutor

    query_dict = {}
    if isinstance(query_template, str):
        for line in query_template.split("\n"):
            # maxsplit=1 so values containing ':' (common in SQL) don't crash
            (key, val) = line.split(":", 1)
            query_dict[key] = val.strip()
    else:
        # BUG FIX: the original opened the undefined name `query_file`
        # (a guaranteed NameError); the file argument is `query_template`.
        with open(query_template) as f:
            for line in f:
                (key, val) = line.split(":", 1)
                query_dict[key] = val.strip()
    self.query_templates = query_dict
    self.executor = SQLExecutor()
def maintain_metadata(self) -> None:
    """
    Maintain dataset metadata and statistics (Compute only if needed)
    """
    using_sql_executor = lux.config.executor.name == "SQLExecutor"
    if lux.config.SQLconnection != "" and using_sql_executor:
        from lux.executor.SQLExecutor import SQLExecutor

        lux.config.executor = SQLExecutor()
    if not lux.config.lazy_maintain:
        # Eager mode: always recompute metadata.
        self.compute_metadata()
    elif not getattr(self, "_metadata_fresh", False):
        # Lazy mode: recompute only when metadata is missing or stale.
        self.compute_metadata()
def set_executor_type(self, exe):
    """Select the executor backend, verifying psycopg2 is available for SQL.

    Parameters
    ----------
    exe : str
        ``"SQL"`` selects the SQL executor (requires psycopg2); anything
        else falls back to the Pandas executor.

    Raises
    ------
    ImportError
        If ``"SQL"`` is requested but psycopg2 is not installed.
    """
    if exe == "SQL":
        import pkgutil

        # Guard clause: fail fast when the Postgres driver is missing.
        if pkgutil.find_loader("psycopg2") is None:
            raise ImportError(
                "psycopg2 is not installed. Run `pip install psycopg2' to install psycopg2 to enable the Postgres connection."
            )
        import psycopg2
        from lux.executor.SQLExecutor import SQLExecutor

        self.executor = SQLExecutor()
    else:
        from lux.executor.PandasExecutor import PandasExecutor

        self.executor = PandasExecutor()
def __init__(self, *args, joins=None, **kw):
    """Construct a joined SQL table backed by a database view.

    Parameters
    ----------
    joins : list, optional
        Join specifications; the referenced tables are extracted and a view
        is created over them. Defaults to no joins.
    """
    # BUG FIX: `joins=[]` was a mutable default argument; use None sentinel.
    if joins is None:
        joins = []
    super(JoinedSQLTable, self).__init__(*args, **kw)
    from lux.executor.SQLExecutor import SQLExecutor

    lux.config.executor = SQLExecutor()
    # self._metadata.joins = []
    tables = self.extract_tables(joins)
    if len(tables) > 4:
        warnings.warn(
            "\nPlease provide a maximum of 4 (Four) unique tables to ensure optimal performance.",
            stacklevel=2,
        )
    view_name = self.create_view(tables, joins)
    self._length = 0
    if view_name != "":
        self.set_SQL_table(view_name)
    # self._metadata.using_view = True
    warnings.formatwarning = lux.warning_format
def test_filter(global_var):
    """execute_filter combines an equality filter with NOT NULL attribute guards."""
    tbl = lux.LuxSQLTable()
    tbl.set_SQL_table("cars")
    intent = [
        lux.Clause(attribute="horsepower"),
        lux.Clause(attribute="year"),
        lux.Clause(attribute="origin", filter_op="=", value="USA"),
    ]
    vis = Vis(intent, tbl)
    vis._vis_data = tbl
    where_clause, applied = SQLExecutor.execute_filter(vis)
    # Clause ordering inside the WHERE string is not guaranteed, so compare
    # the AND-separated pieces individually.
    pieces = where_clause.split(" AND ")
    assert "WHERE \"origin\" = 'USA'" in pieces
    assert '"horsepower" IS NOT NULL' in pieces
    assert '"year" IS NOT NULL' in pieces
    assert applied == ["origin"]
def test_filter():
    """execute_filter produces the full WHERE string for an equality filter (Postgres)."""
    connection = psycopg2.connect(
        "host=localhost dbname=postgres user=postgres password=lux")
    sql_df = lux.LuxSQLTable()
    lux.config.set_SQL_connection(connection)
    sql_df.set_SQL_table("car")
    intent = [
        lux.Clause(attribute="Horsepower"),
        lux.Clause(attribute="Year"),
        lux.Clause(attribute="Origin", filter_op="=", value="USA"),
    ]
    vis = Vis(intent, sql_df)
    vis._vis_data = sql_df
    where_clause, applied = SQLExecutor.execute_filter(vis)
    expected = 'WHERE "Origin" = \'USA\' AND "Year" IS NOT NULL AND "Horsepower" IS NOT NULL'
    assert where_clause == expected
    assert applied == ["Origin"]
def check_if_id_like(df, attribute): import re # Strong signals # so that aggregated reset_index fields don't get misclassified high_cardinality = df.cardinality[attribute] > 500 attribute_contain_id = re.search(r"id|ID|iD|Id", str(attribute)) is not None almost_all_vals_unique = df.cardinality[attribute] >= 0.98 * len(df) is_string = pd.api.types.is_string_dtype(df[attribute]) if is_string: # For string IDs, usually serial numbers or codes with alphanumerics have a consistent length (eg., CG-39405) with little deviation. For a high cardinality string field but not ID field (like Name or Brand), there is less uniformity across the string lengths. if len(df) > 50: if lux.config.executor.name == "PandasExecutor": sampled = df[attribute].sample(50, random_state=99) else: from lux.executor.SQLExecutor import SQLExecutor sampled = SQLExecutor.execute_preview(df, preview_size=50) else: sampled = df[attribute] str_length_uniformity = sampled.apply( lambda x: type(x) == str and len(x)).std() < 3 return (high_cardinality and (attribute_contain_id or almost_all_vals_unique) and str_length_uniformity) else: if len(df) >= 2: series = df[attribute] diff = series.diff() evenly_spaced = all(diff.iloc[1:] == diff.iloc[1]) else: evenly_spaced = True if attribute_contain_id: almost_all_vals_unique = df.cardinality[attribute] >= 0.75 * len( df) return high_cardinality and (almost_all_vals_unique or evenly_spaced)
def deviation_from_overall(
    vis: Vis,
    ldf: LuxDataFrame,
    filter_specs: list,
    msr_attribute: str,
    exclude_nan: bool = True,
) -> float:
    """
    Difference in bar chart/histogram shape from overall chart
    Note: this function assumes that the filtered vis.data is operating on the same range as the unfiltered vis.data.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame
    filter_specs : list
        List of filters from the Vis
    msr_attribute : str
        The attribute name of the measure value of the chart
    exclude_nan: bool
        Whether to include/exclude NaN values as part of the deviation calculation

    Returns
    -------
    float
        Score describing how different the vis is from the overall vis
        (BUG FIX: annotation corrected from int — the euclidean product is a float)
    """
    # Gather filtered data and sizes; the mechanism depends on the executor.
    if lux.config.executor.name == "PandasExecutor":
        vdata = vis.data.dropna() if exclude_nan else vis.data
        v_filter_size = get_filtered_size(filter_specs, ldf)
        v_size = len(vis.data)
    elif lux.config.executor.name == "SQLExecutor":
        from lux.executor.SQLExecutor import SQLExecutor

        v_filter_size = SQLExecutor.get_filtered_size(filter_specs, ldf)
        v_size = len(ldf)
        vdata = vis.data
    v_filter = vdata[msr_attribute]
    total = v_filter.sum()
    # BUG FIX: guard the zero-total case *before* normalizing. The original
    # divided by `total` first, producing NaN/inf, and only then returned 0.
    if total == 0:
        return 0
    v_filter = v_filter / total  # normalize by total to get ratio
    # Generate an "Overall" Vis (TODO: This is computed multiple times for every vis, alternative is to directly access df.current_vis but we do not have guaruntee that will always be unfiltered vis (in the non-Filter action scenario))
    import copy

    unfiltered_vis = copy.copy(vis)
    # Remove filters, keep only attribute intent
    unfiltered_vis._inferred_intent = utils.get_attrs_specs(vis._inferred_intent)
    lux.config.executor.execute([unfiltered_vis], ldf)
    uv = unfiltered_vis.data.dropna() if exclude_nan else unfiltered_vis.data
    v = uv[msr_attribute]
    v = v / v.sum()
    assert len(v) == len(v_filter), "Data for filtered and unfiltered vis have unequal length."
    sig = v_filter_size / v_size  # significance factor
    # Euclidean distance as L2 function
    rankSig = 1  # category measure value ranking significance factor
    # if the vis is a barchart, count how many categories' rank, based on
    # measure value, changes after the filter is applied
    if vis.mark == "bar":
        dimList = vis.get_attr_by_data_model("dimension")
        # use Pandas rank function to calculate rank positions for each category
        v_rank = uv.rank()
        v_filter_rank = vdata.rank()
        # go through and count the number of ranking changes between the
        # filtered and unfiltered data
        numCategories = ldf.cardinality[dimList[0].attribute]
        for r in range(0, numCategories - 1):
            if v_rank[msr_attribute][r] != v_filter_rank[msr_attribute][r]:
                rankSig += 1
        # normalize ranking significance factor
        rankSig = rankSig / numCategories
    from scipy.spatial.distance import euclidean

    return sig * rankSig * euclidean(v, v_filter)