def test_autoencoding_histogram(global_var): lux.config.set_executor_type("Pandas") # No channel specified df = pytest.car_df # change pandas dtype for the column "Year" to datetype df["Year"] = pd.to_datetime(df["Year"], format="%Y") vis = Vis([lux.Clause(attribute="MilesPerGal", channel="y")], df) check_attribute_on_channel(vis, "MilesPerGal", "y") vis = Vis([lux.Clause(attribute="MilesPerGal", channel="x")], df) assert vis.get_attr_by_channel("x")[0].attribute == "MilesPerGal" assert vis.get_attr_by_channel("y")[0].attribute == "Record" # No channel specified # test for sql executor connection = psycopg2.connect( "host=localhost dbname=postgres user=postgres password=lux") lux.config.set_SQL_connection(connection) sql_df = lux.LuxSQLTable(table_name="cars") vis = Vis([lux.Clause(attribute="milespergal", channel="y")], sql_df) check_attribute_on_channel(vis, "milespergal", "y") vis = Vis([lux.Clause(attribute="milespergal", channel="x")], sql_df) assert vis.get_attr_by_channel("x")[0].attribute == "milespergal" assert vis.get_attr_by_channel("y")[0].attribute == "Record"
def test_special_char():
    """Columns whose names contain "." still render as a valid bar chart.

    Altair shorthand cannot contain ".", so the encoding uses the stripped
    name ('specialchar') while the axis title keeps the original column name.
    """
    dataset = [
        {"special.char": 1, "normal": 2},
        {"special.char": 1, "normal": 2},
        {"special.char": 1, "normal": 5},
        {"special.char": 1, "normal": 2},
        {"special.char": 1, "normal": 3},
        {"special.char": 1, "normal": 2},
        {"special.char": 1, "normal": 6},
        {"special.char": 1, "normal": 2},
        {"special.char": 1, "normal": 7},
        {"special.char": 1, "normal": 2},
        {"special.char": 3, "normal": 10},
        {"special.char": 1, "normal": 1},
        {"special.char": 5, "normal": 2},
        {"special.char": 1, "normal": 2},
        {"special.char": 1, "normal": 2},
        {"special.char": 1, "normal": 2},
        {"special.char": 1, "normal": 2},
    ]
    test = pd.DataFrame(dataset)
    from lux.vis.Vis import Vis

    # TODO: add assert that checks that the bar chart is rendered correctly in Altair
    vis = Vis(["special.char"], test)
    assert vis.mark == "bar"
    assert vis.intent == ["special.char"]
    assert vis.get_attr_by_channel("x")[0].attribute == "Record"
    assert vis.get_attr_by_channel("y")[0].attribute == "special.char"
    # Check the emitted Altair code: encoding name is stripped of ".", title is not.
    vis = vis.to_Altair()
    assert (
        "alt.Y('specialchar', type= 'nominal', axis=alt.Axis(labelOverlap=True, title='special.char'))"
        in vis
    )
    assert (
        "alt.X('Record', type= 'quantitative', title='Number of Records', axis=alt.Axis(title='Number of Records')"
        in vis
    )
    # Checking that this works even when there are multiple "." in column
    test = test.rename(columns={"special.char": "special..char.."})
    # TODO: add assert that checks that the bar chart is rendered correctly in Altair
    vis = Vis(["special..char.."], test)
    assert vis.mark == "bar"
    assert vis.intent == ["special..char.."]
    assert vis.get_attr_by_channel("x")[0].attribute == "Record"
    assert vis.get_attr_by_channel("y")[0].attribute == "special..char.."
    vis = vis.to_Altair()
    assert (
        "alt.Y('specialchar', type= 'nominal', axis=alt.Axis(labelOverlap=True, title='special..char..')"
        in vis
    )
    assert (
        "alt.X('Record', type= 'quantitative', title='Number of Records', axis=alt.Axis(title='Number of Records')"
        in vis
    )
def execute_2D_binning(vis: Vis) -> None:
    """
    Apply 2D binning (heatmap) to vis.data.

    Bins the x/y scatter data into a grid of lux.config.heatmap_bin_size
    intervals per axis and replaces vis._vis_data with one row per grid cell,
    carrying the cell count (and, if a color channel is present, an aggregate
    of the color attribute per cell).

    Parameters
    ----------
    vis : Vis
        Vis whose _vis_data has already been populated by the executor.
    """
    pd.reset_option("mode.chained_assignment")
    with pd.option_context("mode.chained_assignment", None):
        x_attr = vis.get_attr_by_channel("x")[0].attribute
        y_attr = vis.get_attr_by_channel("y")[0].attribute
        # Assign every point to a fixed-width interval on each axis.
        vis._vis_data["xBin"] = pd.cut(vis._vis_data[x_attr], bins=lux.config.heatmap_bin_size)
        vis._vis_data["yBin"] = pd.cut(vis._vis_data[y_attr], bins=lux.config.heatmap_bin_size)
        color_attr = vis.get_attr_by_channel("color")
        if len(color_attr) > 0:
            color_attr = color_attr[0]
            # `history=False` is a Lux-specific groupby kwarg — presumably it
            # suppresses action-history logging; confirm against LuxDataFrame.groupby.
            groups = vis._vis_data.groupby(
                ["xBin", "yBin"], history=False)[color_attr.attribute]
            if color_attr.data_type == "nominal":
                # Compute mode and count. Mode aggregates each cell by taking the majority vote for the category variable. In cases where there is ties across categories, pick the first item (.iat[0])
                result = groups.agg([
                    ("count", "count"),
                    (color_attr.attribute, lambda x: pd.Series.mode(x).iat[0]),
                ]).reset_index()
            elif color_attr.data_type == "quantitative" or color_attr.data_type == "temporal":
                # Compute the average of all values in the bin
                result = groups.agg([("count", "count"),
                                     (color_attr.attribute, "mean")
                                     ]).reset_index()
                result = result.dropna()
            # NOTE(review): if color_attr.data_type is none of the above,
            # `result` is never bound and the code below raises NameError —
            # confirm every possible data_type is covered upstream.
        else:
            # No color channel: each cell just carries its point count.
            groups = vis._vis_data.groupby(["xBin", "yBin"], history=False)[x_attr]
            result = groups.count().reset_index(name=x_attr)
            result = result.rename(columns={x_attr: "count"})
            result = result[result["count"] != 0]
        # convert type to facilitate weighted correlation interestingess calculation
        result["xBinStart"] = result["xBin"].apply(
            lambda x: x.left).astype("float")
        result["xBinEnd"] = result["xBin"].apply(lambda x: x.right)
        result["yBinStart"] = result["yBin"].apply(
            lambda x: x.left).astype("float")
        result["yBinEnd"] = result["yBin"].apply(lambda x: x.right)
        vis._vis_data = result.drop(columns=["xBin", "yBin"])
def execute_2D_binning(vis: Vis):
    """Apply 2D binning (heatmap) to vis._vis_data using a fixed 40x40 grid.

    Replaces vis._vis_data with one row per non-empty grid cell, carrying the
    cell count and, if a color channel is present, a per-cell aggregate of the
    color attribute (mode for nominal, mean for quantitative).
    """
    pd.reset_option('mode.chained_assignment')
    # Suppress SettingWithCopyWarning raised by the .loc/.agg calls below.
    with pd.option_context('mode.chained_assignment', None):
        x_attr = vis.get_attr_by_channel("x")[0]
        y_attr = vis.get_attr_by_channel("y")[0]
        # Assign each point to a fixed-width interval along both axes.
        vis._vis_data.loc[:, "xBin"] = pd.cut(vis._vis_data[x_attr.attribute], bins=40)
        vis._vis_data.loc[:, "yBin"] = pd.cut(vis._vis_data[y_attr.attribute], bins=40)
        color_attr = vis.get_attr_by_channel("color")
        if (len(color_attr) > 0):
            color_attr = color_attr[0]
            groups = vis._vis_data.groupby(['xBin', 'yBin'])[color_attr.attribute]
            if (color_attr.data_type == "nominal"):
                # Compute mode and count. Mode aggregates each cell by taking the majority vote for the category variable. In cases where there is ties across categories, pick the first item (.iat[0])
                result = groups.agg([("count", "count"),
                                     (color_attr.attribute,
                                      lambda x: pd.Series.mode(x).iat[0])
                                     ]).reset_index()
            elif (color_attr.data_type == "quantitative"):
                # Compute the average of all values in the bin
                result = groups.agg([("count", "count"),
                                     (color_attr.attribute, "mean")
                                     ]).reset_index()
                result = result.dropna()
            # NOTE(review): a color attribute with any other data_type (e.g.
            # temporal) leaves `result` unbound and the code below raises
            # NameError — confirm callers never reach here in that case.
        else:
            # No color channel: each cell just carries its point count.
            groups = vis._vis_data.groupby(['xBin', 'yBin'])[x_attr.attribute]
            result = groups.agg("count").reset_index(
            )  # .agg in this line throws SettingWithCopyWarning
            result = result.rename(columns={x_attr.attribute: "count"})
            result = result[result["count"] != 0]
        # convert type to facilitate weighted correlation interestingess calculation
        result.loc[:, "xBinStart"] = result["xBin"].apply(
            lambda x: x.left).astype('float')
        result.loc[:, "xBinEnd"] = result["xBin"].apply(lambda x: x.right)
        result.loc[:, "yBinStart"] = result["yBin"].apply(
            lambda x: x.left).astype('float')
        result.loc[:, "yBinEnd"] = result["yBin"].apply(lambda x: x.right)
        vis._vis_data = result.drop(columns=["xBin", "yBin"])
def execute_aggregate(view: Vis, ldf: LuxDataFrame):
    """Aggregate data for a bar/line chart by issuing GROUP BY SQL queries.

    Determines which axis is the group-by dimension and which is the measure
    (the clause with a non-empty .aggregation is the measure), runs the
    matching COUNT/AVG/SUM/MAX query against ldf's SQL connection, and stores
    the result in view.data.

    NOTE(review): queries are built with str.format on attribute/table names —
    these come from the table schema rather than user free-text, but confirm
    they are never attacker-controlled (no parameterization here).
    """
    import pandas as pd
    x_attr = view.get_attr_by_channel("x")[0]
    y_attr = view.get_attr_by_channel("y")[0]
    groupby_attr = ""
    measure_attr = ""
    # The clause carrying an aggregation function is the measure; the other
    # axis becomes the group-by attribute. If both carry one, x wins (second if).
    if (y_attr.aggregation != ""):
        groupby_attr = x_attr
        measure_attr = y_attr
        agg_func = y_attr.aggregation
    if (x_attr.aggregation != ""):
        groupby_attr = y_attr
        measure_attr = x_attr
        agg_func = x_attr.aggregation
    if (measure_attr != ""):
        #barchart case, need count data for each group
        if (measure_attr.attribute == "Record"):
            # filterVars is unused here; only the WHERE clause text is needed.
            where_clause, filterVars = SQLExecutor.execute_filter(view)
            count_query = "SELECT {}, COUNT({}) FROM {} {} GROUP BY {}".format(
                groupby_attr.attribute, groupby_attr.attribute, ldf.table_name,
                where_clause, groupby_attr.attribute)
            view.data = pd.read_sql(count_query, ldf.SQLconnection)
            view.data = view.data.rename(columns={"count": "Record"})
            view.data = utils.pandas_to_lux(view.data)
        else:
            # Aggregate barchart: AVG/SUM/MAX of the measure per group.
            where_clause, filterVars = SQLExecutor.execute_filter(view)
            if agg_func == "mean":
                mean_query = "SELECT {}, AVG({}) as {} FROM {} {} GROUP BY {}".format(
                    groupby_attr.attribute, measure_attr.attribute,
                    measure_attr.attribute, ldf.table_name, where_clause,
                    groupby_attr.attribute)
                view.data = pd.read_sql(mean_query, ldf.SQLconnection)
                view.data = utils.pandas_to_lux(view.data)
            if agg_func == "sum":
                mean_query = "SELECT {}, SUM({}) as {} FROM {} {} GROUP BY {}".format(
                    groupby_attr.attribute, measure_attr.attribute,
                    measure_attr.attribute, ldf.table_name, where_clause,
                    groupby_attr.attribute)
                view.data = pd.read_sql(mean_query, ldf.SQLconnection)
                view.data = utils.pandas_to_lux(view.data)
            if agg_func == "max":
                mean_query = "SELECT {}, MAX({}) as {} FROM {} {} GROUP BY {}".format(
                    groupby_attr.attribute, measure_attr.attribute,
                    measure_attr.attribute, ldf.table_name, where_clause,
                    groupby_attr.attribute)
                view.data = pd.read_sql(mean_query, ldf.SQLconnection)
                view.data = utils.pandas_to_lux(view.data)
            # NOTE(review): an agg_func outside {mean,sum,max} leaves view.data
            # untouched here; the padding below then reads stale data — confirm
            # upstream validation restricts agg_func.
        #pad empty categories with 0 counts after filter is applied
        all_attr_vals = ldf.unique_values[groupby_attr.attribute]
        result_vals = list(view.data[groupby_attr.attribute])
        if (len(result_vals) != len(all_attr_vals)):
            # For filtered aggregation that have missing groupby-attribute values, set these aggregated value as 0, since no datapoints
            for vals in all_attr_vals:
                if (vals not in result_vals):
                    view.data.loc[len(view.data)] = [
                        vals
                    ] + [0] * (len(view.data.columns) - 1)
def execute_2D_binning(vis: Vis):
    """Bin the x/y scatter data of `vis` into a 30x30 grid.

    Replaces vis._vis_data with one row per non-empty cell: the point count
    (column "z") plus the numeric start/end edges of each bin, which the
    weighted-correlation interestingness computation consumes downstream.
    """
    pd.reset_option('mode.chained_assignment')
    # The .loc/.agg calls below can emit SettingWithCopyWarning; silence it
    # for the duration of this computation only.
    with pd.option_context('mode.chained_assignment', None):
        x_clause = vis.get_attr_by_channel("x")[0]
        y_clause = vis.get_attr_by_channel("y")[0]
        data = vis._vis_data
        # Assign every point to a fixed-width interval on each axis.
        data.loc[:, "xBin"] = pd.cut(data[x_clause.attribute], bins=30)
        data.loc[:, "yBin"] = pd.cut(data[y_clause.attribute], bins=30)
        # Count points per (xBin, yBin) cell, then drop the empty cells.
        cells = (
            data.groupby(['xBin', 'yBin'])[x_clause.attribute]
            .agg("count")  # .agg here is what triggers SettingWithCopyWarning
            .reset_index()
            .rename(columns={x_clause.attribute: "z"})
        )
        cells = cells[cells["z"] != 0]
        # Expose numeric bin edges (starts cast to float) for the
        # weighted correlation interestingness calculation.
        cells.loc[:, "xBinStart"] = cells["xBin"].apply(lambda iv: iv.left).astype('float')
        cells.loc[:, "xBinEnd"] = cells["xBin"].apply(lambda iv: iv.right)
        cells.loc[:, "yBinStart"] = cells["yBin"].apply(lambda iv: iv.left).astype('float')
        cells.loc[:, "yBinEnd"] = cells["yBin"].apply(lambda iv: iv.right)
        vis._vis_data = cells.drop(columns=["xBin", "yBin"])
def enforce_specified_channel(vis: Vis, auto_channel: Dict[str, str]):
    """
    Enforces that the channels specified in the Vis by users overrides the showMe autoChannels.

    Parameters
    ----------
    vis : lux.vis.Vis
        Input Vis without channel specification.
    auto_channel : Dict[str,str]
        Key-value pair in the form [channel: attributeName] specifying the showMe recommended channel location.

    Returns
    -------
    vis : lux.vis.Vis
        Vis with channel specification combining both original and auto_channel specification.
        Note: vis is also mutated in place (its _inferred_intent is replaced),
        and auto_channel has matched entries popped from it.

    Raises
    ------
    ValueError
        Ensures no more than one attribute is placed in the same channel.
    """
    result_dict = (
        {}
    )  # result of enforcing specified channel will be stored in result_dict
    specified_dict = (
        {}
    )  # specified_dict={"x":[],"y":[list of Dobj with y specified as channel]}
    # create a dictionary of specified channels in the given dobj
    for val in auto_channel.keys():
        specified_dict[val] = vis.get_attr_by_channel(val)
        result_dict[val] = ""  # "" marks the channel as still unassigned
    # for every element, replace with what's in specified_dict if specified
    for sVal, sAttr in specified_dict.items():
        if len(sAttr) == 1:  # if specified in dobj
            # remove the specified channel from auto_channel (matching by value, since channel key may not be same)
            # iterate over a snapshot of the keys because we pop inside the loop
            for i in list(auto_channel.keys()):
                if (auto_channel[i].attribute == sAttr[0].attribute) and (
                    auto_channel[i].channel == sVal
                ):  # need to ensure that the channel is the same (edge case when duplicate Cols with same attribute name)
                    auto_channel.pop(i)
                    break
            sAttr[0].channel = sVal
            result_dict[sVal] = sAttr[0]
        elif len(sAttr) > 1:
            raise ValueError(
                "There should not be more than one attribute specified in the same channel."
            )
    # For the leftover channels that are still unspecified in result_dict,
    # and the leftovers in the auto_channel specification,
    # step through them together and fill it automatically.
    leftover_channels = list(
        filter(lambda x: result_dict[x] == "", result_dict))
    for leftover_channel, leftover_encoding in zip(leftover_channels,
                                                   auto_channel.values()):
        leftover_encoding.channel = leftover_channel
        result_dict[leftover_channel] = leftover_encoding
    vis._inferred_intent = list(result_dict.values())
    return vis
def test_autoencoding_histogram(global_var):
    """SQL executor: a histogram honors the user-specified channel, and the
    synthetic "Record" count is auto-encoded on the remaining axis."""
    # No channel auto-assignment should override an explicit user choice.
    cars_table = lux.LuxSQLTable(table_name="cars")
    histogram = Vis([lux.Clause(attribute="milespergal", channel="y")], cars_table)
    check_attribute_on_channel(histogram, "milespergal", "y")
    # Pinning the measure to x should place "Record" on y automatically.
    histogram = Vis([lux.Clause(attribute="milespergal", channel="x")], cars_table)
    assert histogram.get_attr_by_channel("x")[0].attribute == "milespergal"
    assert histogram.get_attr_by_channel("y")[0].attribute == "Record"
def test_autoencoding_histogram(global_var):
    """Pandas executor: a histogram honors the user-specified channel, and the
    synthetic "Record" count is auto-encoded on the remaining axis."""
    cars = pytest.car_df
    # Re-type "Year" from plain integers to a proper datetime dtype.
    cars["Year"] = pd.to_datetime(cars["Year"], format="%Y")
    histogram = Vis([lux.Clause(attribute="MilesPerGal", channel="y")], cars)
    check_attribute_on_channel(histogram, "MilesPerGal", "y")
    # Pinning the measure to x should place "Record" on y automatically.
    histogram = Vis([lux.Clause(attribute="MilesPerGal", channel="x")], cars)
    assert histogram.get_attr_by_channel("x")[0].attribute == "MilesPerGal"
    assert histogram.get_attr_by_channel("y")[0].attribute == "Record"
def test_autoencoding_histogram():
    """Histogram auto-encoding respects an explicit channel choice; the
    "Record" count lands on whichever axis is left over."""
    cars = pd.read_csv("lux/data/car.csv")
    # Re-type "Year" from plain integers to a datetime dtype.
    cars["Year"] = pd.to_datetime(cars["Year"], format='%Y')
    histogram = Vis([lux.Clause(attribute="MilesPerGal", channel="y")], cars)
    check_attribute_on_channel(histogram, "MilesPerGal", "y")
    # Pinning the measure to x should place "Record" on y automatically.
    histogram = Vis([lux.Clause(attribute="MilesPerGal", channel="x")], cars)
    assert histogram.get_attr_by_channel("x")[0].attribute == "MilesPerGal"
    assert histogram.get_attr_by_channel("y")[0].attribute == "Record"
def test_refresh_inplace():
    """Vis.refresh_source picks up an in-place dtype change on its source:
    a string date column re-parsed as datetime flips the mark to a line chart."""
    frame = pd.DataFrame({
        'date': ['2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01'],
        'value': [10.5, 15.2, 20.3, 25.2]
    })
    # Before conversion, the string column is detected as nominal.
    assert frame.data_type['nominal'][0] == 'date'
    from lux.vis.Vis import Vis
    chart = Vis(["date", "value"], frame)
    # Convert in place; Lux should now classify the column as temporal.
    frame['date'] = pd.to_datetime(frame['date'], format="%Y-%m-%d")
    assert frame.data_type['temporal'][0] == 'date'
    # Refreshing the existing Vis against the updated frame yields a line chart.
    chart.refresh_source(frame)
    assert chart.mark == "line"
    assert chart.get_attr_by_channel("x")[0].attribute == "date"
    assert chart.get_attr_by_channel("y")[0].attribute == "value"
def test_refresh_inplace():
    """Vis.refresh_source picks up an in-place dtype change on the source frame.

    A string date column is first detected as temporal-looking (with a warning),
    then converted in place; the refreshed Vis should render as a line chart.
    """
    df = pd.DataFrame({
        'date': ['2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01'],
        'value': [10.5, 15.2, 20.3, 25.2]
    })
    # Rendering triggers metadata computation; Lux warns that the string
    # "date" column looks temporal.
    with pytest.warns(
            UserWarning,
            match="Lux detects that the attribute 'date' may be temporal."):
        df._repr_html_()
    assert df.data_type_lookup["date"] == "temporal"
    from lux.vis.Vis import Vis
    vis = Vis(["date", "value"], df)
    # Convert the column in place and recompute Lux metadata.
    df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d")
    df.maintain_metadata()
    assert df.data_type['temporal'][0] == 'date'
    # The existing Vis re-renders against the updated frame as a line chart.
    vis.refresh_source(df)
    assert vis.mark == "line"
    assert vis.get_attr_by_channel("x")[0].attribute == "date"
    assert vis.get_attr_by_channel("y")[0].attribute == "value"
def test_refresh_inplace():
    """Vis.refresh_source picks up an in-place dtype change on the source frame.

    A string date column is first detected as temporal (with a warning), then
    converted in place; the refreshed Vis should render as a line chart.
    """
    df = pd.DataFrame(
        {
            "date": ["2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01"],
            "value": [10.5, 15.2, 20.3, 25.2],
        }
    )
    # Displaying the frame triggers metadata computation; Lux warns that the
    # string "date" column looks temporal.
    with pytest.warns(UserWarning, match="Lux detects that the attribute 'date' may be temporal."):
        df._ipython_display_()
    assert df.data_type["date"] == "temporal"
    from lux.vis.Vis import Vis

    vis = Vis(["date", "value"], df)
    # Convert the column in place and recompute Lux metadata.
    df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
    df.maintain_metadata()
    # invert_data_type maps {column: type} back to {type: [columns]}.
    inverted_data_type = lux.config.executor.invert_data_type(df.data_type)
    assert inverted_data_type["temporal"][0] == "date"
    # The existing Vis re-renders against the updated frame as a line chart.
    vis.refresh_source(df)
    assert vis.mark == "line"
    assert vis.get_attr_by_channel("x")[0].attribute == "date"
    assert vis.get_attr_by_channel("y")[0].attribute == "value"
def execute_aggregate(vis: Vis, isFiltered=True):
    """
    Aggregate data points on an axis for bar or line charts.

    Determines the group-by dimension and the measure (the clause with a
    non-empty .aggregation), aggregates vis.data per group (and per color
    value when a color channel is present), pads missing groups with zeros
    when a filter/color could have dropped categories, and stores the sorted
    result back into vis._vis_data.

    Parameters
    ----------
    vis : lux.Vis
        lux.Vis object that represents a visualization; its .data must
        already be populated.
    isFiltered : bool
        Whether a filter has been applied to the vis data; controls zero-padding
        of missing group-by values.

    Returns
    -------
    None
    """
    import numpy as np  # NOTE(review): unused in this function body
    x_attr = vis.get_attr_by_channel("x")[0]
    y_attr = vis.get_attr_by_channel("y")[0]
    has_color = False
    groupby_attr = ""
    measure_attr = ""
    # An aggregation of None means "no aggregation applies" — nothing to do.
    if (x_attr.aggregation is None or y_attr.aggregation is None):
        return
    # The clause carrying an aggregation function is the measure; the other
    # axis is the group-by attribute. If both carry one, x wins (second if).
    if (y_attr.aggregation != ""):
        groupby_attr = x_attr
        measure_attr = y_attr
        agg_func = y_attr.aggregation
    if (x_attr.aggregation != ""):
        groupby_attr = y_attr
        measure_attr = x_attr
        agg_func = x_attr.aggregation
    # NOTE(review): attr_unique_vals is only bound when this lookup succeeds,
    # but it is read unconditionally in the padding branch below — confirm the
    # group-by attribute is always present in unique_values.
    if (groupby_attr.attribute in vis.data.unique_values.keys()):
        attr_unique_vals = vis.data.unique_values[groupby_attr.attribute]
    #checks if color is specified in the Vis
    if len(vis.get_attr_by_channel("color")) == 1:
        color_attr = vis.get_attr_by_channel("color")[0]
        color_attr_vals = vis.data.unique_values[color_attr.attribute]
        color_cardinality = len(color_attr_vals)
        #NOTE: might want to have a check somewhere to not use categorical variables with greater than some number of categories as a Color variable----------------
        has_color = True
    else:
        color_cardinality = 1
    if (measure_attr != ""):
        if (measure_attr.attribute == "Record"):
            # Count per group ("Record" is the synthetic count attribute).
            vis._vis_data = vis.data.reset_index()
            #if color is specified, need to group by groupby_attr and color_attr
            if has_color:
                vis._vis_data = vis.data.groupby(
                    [groupby_attr.attribute, color_attr.attribute]).count().reset_index()
                vis._vis_data = vis.data.rename(
                    columns={"index": "Record"})
                vis._vis_data = vis.data[[
                    groupby_attr.attribute, color_attr.attribute, "Record"
                ]]
            else:
                vis._vis_data = vis.data.groupby(
                    groupby_attr.attribute).count().reset_index()
                vis._vis_data = vis.data.rename(
                    columns={"index": "Record"})
                vis._vis_data = vis.data[[
                    groupby_attr.attribute, "Record"
                ]]
        else:
            # Aggregate the measure per group with the requested function.
            #if color is specified, need to group by groupby_attr and color_attr
            if has_color:
                groupby_result = vis.data.groupby(
                    [groupby_attr.attribute, color_attr.attribute])
            else:
                groupby_result = vis.data.groupby(groupby_attr.attribute)
            groupby_result = groupby_result.agg(agg_func)
            intermediate = groupby_result.reset_index()
            # __finalize__ propagates pandas metadata from the original frame.
            vis._vis_data = intermediate.__finalize__(vis.data)
        result_vals = list(vis.data[groupby_attr.attribute])
        #create existing group by attribute combinations if color is specified
        #this is needed to check what combinations of group_by_attr and color_attr values have a non-zero number of elements in them
        if has_color:
            res_color_combi_vals = []
            result_color_vals = list(vis.data[color_attr.attribute])
            for i in range(0, len(result_vals)):
                res_color_combi_vals.append(
                    [result_vals[i], result_color_vals[i]])
        # For filtered aggregation that have missing groupby-attribute values, set these aggregated value as 0, since no datapoints
        # NOTE(review): precedence here is `isFiltered or (has_color and
        # attr_unique_vals)` — confirm `(isFiltered or has_color) and ...`
        # was not intended.
        if (isFiltered or has_color and attr_unique_vals):
            N_unique_vals = len(attr_unique_vals)
            if (len(result_vals) != N_unique_vals * color_cardinality):
                columns = vis.data.columns
                if has_color:
                    # Right-merge against the full (group, color) cross product
                    # so every combination appears, then zero-fill the gaps.
                    df = pd.DataFrame({
                        columns[0]:
                        attr_unique_vals * color_cardinality,
                        columns[1]:
                        pd.Series(color_attr_vals).repeat(N_unique_vals)
                    })
                    vis._vis_data = vis.data.merge(
                        df,
                        on=[columns[0], columns[1]],
                        how='right',
                        suffixes=['', '_right'])
                    for col in columns[2:]:
                        vis.data[col] = vis.data[col].fillna(
                            0)  #Triggers __setitem__
                    assert len(
                        list(vis.data[groupby_attr.attribute])
                    ) == N_unique_vals * len(
                        color_attr_vals
                    ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute, color_attr.attribute}`."
                    vis._vis_data = vis.data.iloc[:, :
                                                  3]  # Keep only the three relevant columns not the *_right columns resulting from merge
                else:
                    df = pd.DataFrame({columns[0]: attr_unique_vals})
                    vis._vis_data = vis.data.merge(df,
                                                   on=columns[0],
                                                   how='right',
                                                   suffixes=['', '_right'])
                    for col in columns[1:]:
                        vis.data[col] = vis.data[col].fillna(0)
                    assert len(
                        list(vis.data[groupby_attr.attribute])
                    ) == N_unique_vals, f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`."
        # Deterministic ordering for rendering; drop the stale integer index.
        vis._vis_data = vis.data.sort_values(by=groupby_attr.attribute,
                                             ascending=True)
        vis._vis_data = vis.data.reset_index()
        vis._vis_data = vis.data.drop(columns="index")
def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
    """
    Compute the interestingness score of the vis.
    The interestingness metric is dependent on the vis type.

    Parameters
    ----------
    vis : Vis
        Vis whose .data has already been populated by an executor.
    ldf : LuxDataFrame
        Source data frame (used for cardinality and overall-distribution stats).

    Returns
    -------
    int
        Interestingness Score (-1 for uninteresting / unscorable cases).

    Raises
    ------
    Exception
        If vis.data has not been populated yet.
    """
    if vis.data is None or len(vis.data) == 0:
        raise Exception(
            "Vis.data needs to be populated before interestingness can be computed. Run Executor.execute(vis,ldf)."
        )
    # Tally dimensions and measures among the attribute clauses, excluding the
    # synthetic "Record" count attribute.
    n_dim = 0
    n_msr = 0
    filter_specs = utils.get_filter_specs(vis._inferred_intent)
    vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)
    for clause in vis_attrs_specs:
        if clause.attribute != "Record":
            if clause.data_model == "dimension":
                n_dim += 1
            if clause.data_model == "measure":
                n_msr += 1
    n_filter = len(filter_specs)
    attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"]
    dimension_lst = vis.get_attr_by_data_model("dimension")
    measure_lst = vis.get_attr_by_data_model("measure")
    v_size = len(vis.data)
    # Line/Bar Chart: one dimension, at most one measure.
    if n_dim == 1 and (n_msr == 0 or n_msr == 1):
        if v_size < 2:
            return -1
        if n_filter == 0:
            return unevenness(vis, ldf, measure_lst, dimension_lst)
        elif n_filter == 1:
            return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute)
        # NOTE(review): with more than one filter this branch implicitly
        # returns None (preserved from the original) — confirm intended.
    # Histogram: a single measure, no dimension.
    elif n_dim == 0 and n_msr == 1:
        if v_size < 2:
            return -1
        if n_filter == 0:
            v = vis.data["Number of Records"]
            return skewness(v)
        elif n_filter == 1:
            return deviation_from_overall(vis, ldf, filter_specs, "Number of Records")
    # Scatter Plot: two measures.
    elif n_dim == 0 and n_msr == 2:
        if v_size < 2:
            return -1
        if n_filter == 1:
            # Weight the monotonicity score by the fraction of data kept by the filter.
            v_filter_size = get_filtered_size(filter_specs, vis.data)
            sig = v_filter_size / v_size
        else:
            sig = 1
        return sig * monotonicity(vis, attr_specs)
    # Scatterplot colored by Dimension: favor low-cardinality color attributes.
    elif n_dim == 1 and n_msr == 2:
        if v_size < 5:
            return -1
        color_attr = vis.get_attr_by_channel("color")[0].attribute
        C = ldf.cardinality[color_attr]
        if C < 40:
            return 1 / C
        else:
            return -1
    # (A duplicate `elif n_dim == 1 and n_msr == 2: return 0.2` branch was
    # unreachable dead code and has been removed.)
    # Scatterplot colored by measure
    elif n_msr == 3:
        return 0.1
    # colored line and barchart cases
    elif (vis.mark == "line" or vis.mark == "bar") and n_dim == 2:
        return 0.2
    # Default
    else:
        return -1
def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
    """
    Compute the interestingness score of the vis.
    The interestingness metric is dependent on the vis type.

    Parameters
    ----------
    vis : Vis
        Vis whose .data has already been populated by an executor.
    ldf : LuxDataFrame
        Source data frame (used for cardinality/unique-value stats).

    Returns
    -------
    int
        Interestingness Score (-1 for uninteresting / unscorable cases).
    """
    # Unscorable rather than an error: unexecuted vis scores -1.
    if vis.data is None or len(vis.data) == 0:
        return -1
    # Tally dimensions and measures among the attribute clauses, excluding the
    # synthetic "Record" count attribute.
    n_dim = 0
    n_msr = 0
    filter_specs = utils.get_filter_specs(vis._inferred_intent)
    vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)
    for clause in vis_attrs_specs:
        if clause.attribute != "Record":
            if clause.data_model == "dimension":
                n_dim += 1
            if clause.data_model == "measure":
                n_msr += 1
    n_filter = len(filter_specs)
    attr_specs = [
        clause for clause in vis_attrs_specs if clause.attribute != "Record"
    ]
    dimension_lst = vis.get_attr_by_data_model("dimension")
    measure_lst = vis.get_attr_by_data_model("measure")
    v_size = len(vis.data)
    # Line/Bar Chart: one dimension, at most one measure.
    if n_dim == 1 and (n_msr == 0 or n_msr == 1):
        if v_size < 2:
            return -1
        if n_filter == 0:
            return unevenness(vis, ldf, measure_lst, dimension_lst)
        elif n_filter == 1:
            return deviation_from_overall(vis, ldf, filter_specs,
                                          measure_lst[0].attribute)
        # NOTE(review): with more than one filter this branch implicitly
        # returns None (preserved from the original) — confirm intended.
    # Histogram: a single measure, no dimension.
    elif n_dim == 0 and n_msr == 1:
        if v_size < 2:
            return -1
        # "Number of Records" may be absent (e.g. unbinned data): score -1 then.
        if n_filter == 0 and "Number of Records" in vis.data:
            v = vis.data["Number of Records"]
            return skewness(v)
        elif n_filter == 1 and "Number of Records" in vis.data:
            return deviation_from_overall(vis, ldf, filter_specs,
                                          "Number of Records")
        return -1
    # Scatter Plot: two measures.
    elif n_dim == 0 and n_msr == 2:
        if v_size < 10:
            return -1
        if vis.mark == "heatmap":
            # Binned scatter: correlation of bin starts weighted by cell counts.
            return weighted_correlation(vis.data["xBinStart"],
                                        vis.data["yBinStart"],
                                        vis.data["count"])
        if n_filter == 1:
            # Weight the monotonicity score by the fraction of data kept by the filter.
            v_filter_size = get_filtered_size(filter_specs, vis.data)
            sig = v_filter_size / v_size
        else:
            sig = 1
        return sig * monotonicity(vis, attr_specs)
    # Scatterplot colored by Dimension: favor low-cardinality color attributes.
    elif n_dim == 1 and n_msr == 2:
        if v_size < 10:
            return -1
        color_attr = vis.get_attr_by_channel("color")[0].attribute
        C = ldf.cardinality[color_attr]
        if C < 40:
            return 1 / C
        else:
            return -1
    # (A duplicate `elif n_dim == 1 and n_msr == 2: return 0.2` branch was
    # unreachable dead code and has been removed.)
    # Scatterplot colored by measure
    elif n_msr == 3:
        return 0.1
    # colored line chart case
    elif vis.mark == "line" and n_dim == 2:
        return 0.15
    # for colored bar chart, scoring based on Chi-square test for independence score.
    # gives higher scores to colored bar charts with fewer total categories as these charts are easier to read and thus more useful for users
    elif vis.mark == "bar" and n_dim == 2:
        from scipy.stats import chi2_contingency

        measure_column = vis.get_attr_by_data_model("measure")[0].attribute
        dimension_columns = vis.get_attr_by_data_model("dimension")
        groupby_column = dimension_columns[0].attribute
        color_column = dimension_columns[1].attribute
        contingency_table = []
        groupby_cardinality = ldf.cardinality[groupby_column]
        groupby_unique_vals = ldf.unique_values[groupby_column]
        for c in range(0, groupby_cardinality):
            contingency_table.append(
                vis.data[vis.data[groupby_column] ==
                         groupby_unique_vals[c]][measure_column])
        score = 0.12
        # ValueError results if an entire column of the contingency table is 0, can happen if an applied filter results in
        # a category having no counts
        try:
            color_cardinality = ldf.cardinality[color_column]
            # scale down score based on number of categories
            chi2_score = chi2_contingency(contingency_table)[0] * 0.9**(
                color_cardinality + groupby_cardinality)
            score = min(0.10, chi2_score)
        except ValueError:
            pass
        return score
    # Default
    else:
        return -1
def execute_aggregate(view: Vis, tbl: LuxSQLTable, isFiltered=True):
    """
    Aggregate data points on an axis for bar or line charts.

    Determines the group-by dimension and the measure (the clause with a
    non-empty .aggregation), issues the matching COUNT/AVG/SUM/MAX GROUP BY
    query against the configured SQL connection (with an extra color grouping
    column when a color channel is present), zero-pads group values dropped by
    filtering, and stores the sorted result in view._vis_data.

    Parameters
    ----------
    vis: lux.Vis
        lux.Vis object that represents a visualization
    tbl : lux.core.frame
        LuxSQLTable with specified intent.
    isFiltered: boolean
        boolean that represents whether a vis has had a filter applied to its data

    Returns
    -------
    None
    """
    x_attr = view.get_attr_by_channel("x")[0]
    y_attr = view.get_attr_by_channel("y")[0]
    has_color = False
    groupby_attr = ""
    measure_attr = ""
    # An aggregation of None means "no aggregation applies" — nothing to do.
    if x_attr.aggregation is None or y_attr.aggregation is None:
        return
    # The clause carrying an aggregation function is the measure; the other
    # axis is the group-by attribute. If both carry one, x wins (second if).
    if y_attr.aggregation != "":
        groupby_attr = x_attr
        measure_attr = y_attr
        agg_func = y_attr.aggregation
    if x_attr.aggregation != "":
        groupby_attr = y_attr
        measure_attr = x_attr
        agg_func = x_attr.aggregation
    # NOTE(review): attr_unique_vals is only bound when this lookup succeeds
    # but is read unconditionally in the padding branch below — confirm the
    # group-by attribute is always present in unique_values.
    if groupby_attr.attribute in tbl.unique_values.keys():
        attr_unique_vals = tbl.unique_values[groupby_attr.attribute]
    # checks if color is specified in the Vis
    if len(view.get_attr_by_channel("color")) == 1:
        color_attr = view.get_attr_by_channel("color")[0]
        color_attr_vals = tbl.unique_values[color_attr.attribute]
        color_cardinality = len(color_attr_vals)
        # NOTE: might want to have a check somewhere to not use categorical variables with greater than some number of categories as a Color variable----------------
        has_color = True
    else:
        color_cardinality = 1
    if measure_attr != "":
        # barchart case, need count data for each group
        if measure_attr.attribute == "Record":
            where_clause, filterVars = SQLExecutor.execute_filter(view)
            # NOTE(review): length_query is computed but unused (see the
            # commented-out assignment below) — candidate for removal.
            length_query = pandas.read_sql(
                "SELECT COUNT(*) as length FROM {} {}".format(tbl.table_name, where_clause),
                lux.config.SQLconnection,
            )
            # generates query for colored barchart case
            if has_color:
                count_query = 'SELECT "{}", "{}", COUNT("{}") FROM {} {} GROUP BY "{}", "{}"'.format(
                    groupby_attr.attribute,
                    color_attr.attribute,
                    groupby_attr.attribute,
                    tbl.table_name,
                    where_clause,
                    groupby_attr.attribute,
                    color_attr.attribute,
                )
                view._vis_data = pandas.read_sql(count_query, lux.config.SQLconnection)
                view._vis_data = view._vis_data.rename(columns={"count": "Record"})
                view._vis_data = utils.pandas_to_lux(view._vis_data)
            # generates query for normal barchart case
            else:
                count_query = 'SELECT "{}", COUNT("{}") FROM {} {} GROUP BY "{}"'.format(
                    groupby_attr.attribute,
                    groupby_attr.attribute,
                    tbl.table_name,
                    where_clause,
                    groupby_attr.attribute,
                )
                view._vis_data = pandas.read_sql(count_query, lux.config.SQLconnection)
                view._vis_data = view._vis_data.rename(columns={"count": "Record"})
                view._vis_data = utils.pandas_to_lux(view._vis_data)
            # view._vis_data.length = list(length_query["length"])[0]
        # aggregate barchart case, need aggregate data (mean, sum, max) for each group
        else:
            where_clause, filterVars = SQLExecutor.execute_filter(view)
            length_query = pandas.read_sql(
                "SELECT COUNT(*) as length FROM {} {}".format(tbl.table_name, where_clause),
                lux.config.SQLconnection,
            )
            # generates query for colored barchart case
            if has_color:
                if agg_func == "mean":
                    agg_query = (
                        'SELECT "{}", "{}", AVG("{}") as "{}" FROM {} {} GROUP BY "{}", "{}"'.format(
                            groupby_attr.attribute,
                            color_attr.attribute,
                            measure_attr.attribute,
                            measure_attr.attribute,
                            tbl.table_name,
                            where_clause,
                            groupby_attr.attribute,
                            color_attr.attribute,
                        )
                    )
                    view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection)
                    view._vis_data = utils.pandas_to_lux(view._vis_data)
                if agg_func == "sum":
                    agg_query = (
                        'SELECT "{}", "{}", SUM("{}") as "{}" FROM {} {} GROUP BY "{}", "{}"'.format(
                            groupby_attr.attribute,
                            color_attr.attribute,
                            measure_attr.attribute,
                            measure_attr.attribute,
                            tbl.table_name,
                            where_clause,
                            groupby_attr.attribute,
                            color_attr.attribute,
                        )
                    )
                    view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection)
                    view._vis_data = utils.pandas_to_lux(view._vis_data)
                if agg_func == "max":
                    agg_query = (
                        'SELECT "{}", "{}", MAX("{}") as "{}" FROM {} {} GROUP BY "{}", "{}"'.format(
                            groupby_attr.attribute,
                            color_attr.attribute,
                            measure_attr.attribute,
                            measure_attr.attribute,
                            tbl.table_name,
                            where_clause,
                            groupby_attr.attribute,
                            color_attr.attribute,
                        )
                    )
                    view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection)
                    view._vis_data = utils.pandas_to_lux(view._vis_data)
            # generates query for normal barchart case
            else:
                if agg_func == "mean":
                    agg_query = 'SELECT "{}", AVG("{}") as "{}" FROM {} {} GROUP BY "{}"'.format(
                        groupby_attr.attribute,
                        measure_attr.attribute,
                        measure_attr.attribute,
                        tbl.table_name,
                        where_clause,
                        groupby_attr.attribute,
                    )
                    view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection)
                    view._vis_data = utils.pandas_to_lux(view._vis_data)
                if agg_func == "sum":
                    agg_query = 'SELECT "{}", SUM("{}") as "{}" FROM {} {} GROUP BY "{}"'.format(
                        groupby_attr.attribute,
                        measure_attr.attribute,
                        measure_attr.attribute,
                        tbl.table_name,
                        where_clause,
                        groupby_attr.attribute,
                    )
                    view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection)
                    view._vis_data = utils.pandas_to_lux(view._vis_data)
                if agg_func == "max":
                    agg_query = 'SELECT "{}", MAX("{}") as "{}" FROM {} {} GROUP BY "{}"'.format(
                        groupby_attr.attribute,
                        measure_attr.attribute,
                        measure_attr.attribute,
                        tbl.table_name,
                        where_clause,
                        groupby_attr.attribute,
                    )
                    view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection)
                    view._vis_data = utils.pandas_to_lux(view._vis_data)
        result_vals = list(view._vis_data[groupby_attr.attribute])
        # create existing group by attribute combinations if color is specified
        # this is needed to check what combinations of group_by_attr and color_attr values have a non-zero number of elements in them
        if has_color:
            res_color_combi_vals = []
            result_color_vals = list(view._vis_data[color_attr.attribute])
            for i in range(0, len(result_vals)):
                res_color_combi_vals.append([result_vals[i], result_color_vals[i]])
        # For filtered aggregation that have missing groupby-attribute values, set these aggregated value as 0, since no datapoints
        # NOTE(review): precedence here is `isFiltered or (has_color and
        # attr_unique_vals)` — confirm `(isFiltered or has_color) and ...`
        # was not intended.
        if isFiltered or has_color and attr_unique_vals:
            N_unique_vals = len(attr_unique_vals)
            if len(result_vals) != N_unique_vals * color_cardinality:
                columns = view._vis_data.columns
                if has_color:
                    # Right-merge against the full (group, color) cross product
                    # so every combination appears, then zero-fill the gaps.
                    df = pandas.DataFrame(
                        {
                            columns[0]: attr_unique_vals * color_cardinality,
                            columns[1]: pandas.Series(color_attr_vals).repeat(N_unique_vals),
                        }
                    )
                    view._vis_data = view._vis_data.merge(
                        df,
                        on=[columns[0], columns[1]],
                        how="right",
                        suffixes=["", "_right"],
                    )
                    for col in columns[2:]:
                        view._vis_data[col] = view._vis_data[col].fillna(0)  # Triggers __setitem__
                    assert len(list(view._vis_data[groupby_attr.attribute])) == N_unique_vals * len(
                        color_attr_vals
                    ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute, color_attr.attribute}`."
                    view._vis_data = view._vis_data.iloc[
                        :, :3
                    ]  # Keep only the three relevant columns not the *_right columns resulting from merge
                else:
                    df = pandas.DataFrame({columns[0]: attr_unique_vals})
                    view._vis_data = view._vis_data.merge(
                        df, on=columns[0], how="right", suffixes=["", "_right"]
                    )
                    for col in columns[1:]:
                        view._vis_data[col] = view._vis_data[col].fillna(0)
                    assert (
                        len(list(view._vis_data[groupby_attr.attribute])) == N_unique_vals
                    ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`."
        # Deterministic ordering for rendering; drop the stale integer index.
        view._vis_data = view._vis_data.sort_values(by=groupby_attr.attribute, ascending=True)
        view._vis_data = view._vis_data.reset_index()
        view._vis_data = view._vis_data.drop(columns="index")
def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
    """
    Compute the interestingness score of the vis.
    The interestingness metric is dependent on the vis type.

    Parameters
    ----------
    vis : Vis
        Vis whose populated ``vis.data`` is scored.
    ldf : LuxDataFrame
        Source frame; supplies cardinality, intent, and current_vis context.

    Returns
    -------
    int
        Interestingness score; -1 signals "not scorable" (empty/too-small data,
        or an error while ``lux.config.interestingness_fallback`` is enabled).
    """
    if vis.data is None or len(vis.data) == 0:
        return -1
    try:
        filter_specs = utils.get_filter_specs(vis._inferred_intent)
        vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)
        n_dim = vis._ndim
        n_msr = vis._nmsr
        n_filter = len(filter_specs)
        attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"]
        dimension_lst = vis.get_attr_by_data_model("dimension")
        measure_lst = vis.get_attr_by_data_model("measure")
        v_size = len(vis.data)

        # Special case: compare against the single current line-chart vis when
        # an intent filter is active — score by similarity (1 - distance).
        if (
            n_dim == 1
            and (n_msr == 0 or n_msr == 1)
            and ldf.current_vis is not None
            and vis.get_attr_by_channel("y")[0].data_type == "quantitative"
            and len(ldf.current_vis) == 1
            and ldf.current_vis[0].mark == "line"
            and len(get_filter_specs(ldf.intent)) > 0
        ):
            query_vc = VisList(ldf.current_vis, ldf)
            query_vis = query_vc[0]
            preprocess(query_vis)
            preprocess(vis)
            return 1 - euclidean_dist(query_vis, vis)

        # Line/Bar Chart
        if n_dim == 1 and (n_msr == 0 or n_msr == 1):
            if v_size < 2:
                return -1
            if n_filter == 0:
                return unevenness(vis, ldf, measure_lst, dimension_lst)
            elif n_filter == 1:
                return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute)
        # Histogram
        elif n_dim == 0 and n_msr == 1:
            if v_size < 2:
                return -1
            # NOTE: collapsed a redundant nested `"Number of Records" in vis.data`
            # check that duplicated the branch condition.
            if n_filter == 0 and "Number of Records" in vis.data:
                v = vis.data["Number of Records"]
                return skewness(v)
            elif n_filter == 1 and "Number of Records" in vis.data:
                return deviation_from_overall(vis, ldf, filter_specs, "Number of Records")
            return -1
        # Scatter Plot
        elif n_dim == 0 and n_msr == 2:
            if v_size < 10:
                return -1
            if vis.mark == "heatmap":
                return weighted_correlation(
                    vis.data["xBinStart"], vis.data["yBinStart"], vis.data["count"]
                )
            if n_filter == 1:
                v_filter_size = get_filtered_size(filter_specs, vis.data)
                sig = v_filter_size / v_size
            else:
                sig = 1
            return sig * monotonicity(vis, attr_specs)
        # Scatterplot colored by Dimension
        # NOTE: a second, unreachable `elif n_dim == 1 and n_msr == 2: return 0.2`
        # branch (dead code — same condition as this one) was removed.
        elif n_dim == 1 and n_msr == 2:
            if v_size < 10:
                return -1
            color_attr = vis.get_attr_by_channel("color")[0].attribute
            C = ldf.cardinality[color_attr]
            if C < 40:
                return 1 / C
            else:
                return -1
        # Scatterplot colored by measure
        elif n_msr == 3:
            return 0.1
        # colored line and barchart cases
        elif vis.mark == "line" and n_dim == 2:
            return 0.15
        # for colored bar chart, scoring based on Chi-square test for independence score.
        # gives higher scores to colored bar charts with fewer total categories
        # as these charts are easier to read and thus more useful for users
        elif vis.mark == "bar" and n_dim == 2:
            from scipy.stats import chi2_contingency

            measure_column = vis.get_attr_by_data_model("measure")[0].attribute
            dimension_columns = vis.get_attr_by_data_model("dimension")
            groupby_column = dimension_columns[0].attribute
            color_column = dimension_columns[1].attribute
            contingency_tbl = pd.crosstab(
                vis.data[groupby_column],
                vis.data[color_column],
                values=vis.data[measure_column],
                aggfunc=sum,
            )
            try:
                color_cardinality = ldf.cardinality[color_column]
                groupby_cardinality = ldf.cardinality[groupby_column]
                # scale down score based on number of categories
                chi2_score = chi2_contingency(contingency_tbl)[0] * 0.9 ** (
                    color_cardinality + groupby_cardinality
                )
                score = min(0.10, chi2_score)
            except (ValueError, KeyError):
                # ValueError results if an entire column of the contingency table is 0,
                # can happen if an applied filter results in a category having no counts
                score = -1
            return score
        # Default
        else:
            return -1
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are never
    # swallowed by the fallback path.
    except Exception:
        if lux.config.interestingness_fallback:
            # Suppress interestingness related issues
            warnings.warn(f"An error occurred when computing interestingness for: {vis}")
            return -1
        else:
            raise
def execute_aggregate(view: Vis, isFiltered=True):
    """
    Aggregate data points on an axis for bar or line charts.

    Mutates ``view.data`` in place: groups by the non-aggregated axis
    (and color, if present), applies the aggregation function, and back-fills
    missing groupby values with 0 so every category appears in the chart.

    Parameters
    ----------
    view : lux.Vis
        lux.Vis object that represents a visualization
    isFiltered : bool
        Whether a filter was applied; filtered results may be missing
        groupby-attribute values, which are then filled in as zeros.

    Returns
    -------
    None
    """
    # NOTE: removed unused `import numpy as np` / `import time` and the large
    # commented-out "ORIGINAL"/"SOLUTION 1-3" experiment blocks; the merge-based
    # fill (previously "SOLUTION 4") is the retained implementation.
    import pandas as pd

    x_attr = view.get_attr_by_channel("x")[0]
    y_attr = view.get_attr_by_channel("y")[0]
    has_color = False
    groupby_attr = ""
    measure_attr = ""
    if x_attr.aggregation is None or y_attr.aggregation is None:
        return
    if y_attr.aggregation != "":
        groupby_attr = x_attr
        measure_attr = y_attr
        agg_func = y_attr.aggregation
    if x_attr.aggregation != "":
        groupby_attr = y_attr
        measure_attr = x_attr
        agg_func = x_attr.aggregation
    # checks if color is specified in the Vis
    if len(view.get_attr_by_channel("color")) == 1:
        color_attr = view.get_attr_by_channel("color")[0]
        color_attr_vals = view.data.unique_values[color_attr.attribute]
        color_cardinality = len(color_attr_vals)
        # NOTE: might want to have a check somewhere to not use categorical variables
        # with greater than some number of categories as a Color variable----------------
        has_color = True
    else:
        color_cardinality = 1
    all_attr_vals = view.data.unique_values[groupby_attr.attribute]
    if measure_attr != "":
        if measure_attr.attribute == "Record":
            view.data = view.data.reset_index()
            # if color is specified, need to group by groupby_attr and color_attr
            if has_color:
                view.data = (
                    view.data.groupby([groupby_attr.attribute, color_attr.attribute])
                    .count()
                    .reset_index()
                )
                view.data = view.data.rename(columns={"index": "Record"})
                view.data = view.data[[groupby_attr.attribute, color_attr.attribute, "Record"]]
            else:
                view.data = view.data.groupby(groupby_attr.attribute).count().reset_index()
                view.data = view.data.rename(columns={"index": "Record"})
                view.data = view.data[[groupby_attr.attribute, "Record"]]
        else:
            # if color is specified, need to group by groupby_attr and color_attr
            if has_color:
                groupby_result = view.data.groupby([groupby_attr.attribute, color_attr.attribute])
            else:
                groupby_result = view.data.groupby(groupby_attr.attribute)
            view.data = groupby_result.agg(agg_func).reset_index()
        result_vals = list(view.data[groupby_attr.attribute])
        # create existing group by attribute combinations if color is specified
        # this is needed to check what combinations of group_by_attr and color_attr
        # values have a non-zero number of elements in them
        if has_color:
            res_color_combi_vals = []
            result_color_vals = list(view.data[color_attr.attribute])
            for i in range(0, len(result_vals)):
                res_color_combi_vals.append([result_vals[i], result_color_vals[i]])
        if len(result_vals) != len(all_attr_vals) * color_cardinality and (isFiltered or has_color):
            # For filtered aggregation that have missing groupby-attribute values,
            # set these aggregated values as 0, since no datapoints
            columns = view.data.columns
            if has_color:
                # Right-merge against the full cross product of groupby x color values,
                # then zero-fill the aggregated columns for absent combinations.
                df = pd.DataFrame(
                    {
                        columns[0]: all_attr_vals * color_cardinality,
                        columns[1]: pd.Series(color_attr_vals).repeat(len(all_attr_vals)),
                    }
                )
                view.data = view.data.merge(
                    df, on=[columns[0], columns[1]], how="right", suffixes=["", "_right"]
                )
                for col in columns[2:]:
                    view.data[col] = view.data[col].fillna(0)
                assert len(list(view.data[groupby_attr.attribute])) == len(all_attr_vals) * len(
                    color_attr_vals
                ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute, color_attr.attribute}`."
            else:
                df = pd.DataFrame({columns[0]: all_attr_vals})
                view.data = view.data.merge(df, on=columns[0], how="right", suffixes=["", "_right"])
                for col in columns[1:]:
                    view.data[col] = view.data[col].fillna(0)
                assert len(list(view.data[groupby_attr.attribute])) == len(
                    all_attr_vals
                ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`."
        view.data = view.data.sort_values(by=groupby_attr.attribute, ascending=True)
        view.data = view.data.reset_index()
        view.data = view.data.drop(columns="index")
def execute_aggregate(vis: Vis, isFiltered=True):
    """
    Aggregate data points on an axis for bar or line charts.

    Mutates ``vis._vis_data`` in place: groups by the non-aggregated axis
    (and color, if present), applies the aggregation function, back-fills
    missing groupby values with 0, and sorts by the groupby attribute.

    Parameters
    ----------
    vis : lux.Vis
        lux.Vis object that represents a visualization
    isFiltered : bool
        Whether a filter was applied; filtered results may be missing
        groupby-attribute values, which are then filled in as zeros.

    Returns
    -------
    None
    """
    # NOTE: removed unused `import numpy as np`.
    x_attr = vis.get_attr_by_channel("x")[0]
    y_attr = vis.get_attr_by_channel("y")[0]
    has_color = False
    groupby_attr = ""
    measure_attr = ""
    if x_attr.aggregation is None or y_attr.aggregation is None:
        return
    if y_attr.aggregation != "":
        groupby_attr = x_attr
        measure_attr = y_attr
        agg_func = y_attr.aggregation
    if x_attr.aggregation != "":
        groupby_attr = y_attr
        measure_attr = x_attr
        agg_func = x_attr.aggregation
    # BUGFIX: default to an empty list so the zero-fill step below is skipped
    # (instead of raising NameError) when the groupby attribute has no entry in
    # the precomputed unique_values map.
    attr_unique_vals = []
    if groupby_attr.attribute in vis.data.unique_values.keys():
        attr_unique_vals = vis.data.unique_values[groupby_attr.attribute]
    # checks if color is specified in the Vis
    if len(vis.get_attr_by_channel("color")) == 1:
        color_attr = vis.get_attr_by_channel("color")[0]
        color_attr_vals = vis.data.unique_values[color_attr.attribute]
        color_cardinality = len(color_attr_vals)
        # NOTE: might want to have a check somewhere to not use categorical variables
        # with greater than some number of categories as a Color variable----------------
        has_color = True
    else:
        color_cardinality = 1
    if measure_attr != "":
        if measure_attr.attribute == "Record":
            # need to get the index name so that we can rename the index column to "Record"
            # if there is no index, default to "index"
            index_name = vis.data.index.name
            if index_name == None:
                index_name = "index"
            vis._vis_data = vis.data.reset_index()
            # if color is specified, need to group by groupby_attr and color_attr
            if has_color:
                vis._vis_data = (
                    vis.data.groupby(
                        [groupby_attr.attribute, color_attr.attribute], dropna=False, history=False
                    )
                    .count()
                    .reset_index()
                    .rename(columns={index_name: "Record"})
                )
                vis._vis_data = vis.data[[groupby_attr.attribute, color_attr.attribute, "Record"]]
            else:
                vis._vis_data = (
                    vis.data.groupby(groupby_attr.attribute, dropna=False, history=False)
                    .count()
                    .reset_index()
                    .rename(columns={index_name: "Record"})
                )
                vis._vis_data = vis.data[[groupby_attr.attribute, "Record"]]
        else:
            # if color is specified, need to group by groupby_attr and color_attr
            if has_color:
                groupby_result = vis.data.groupby(
                    [groupby_attr.attribute, color_attr.attribute], dropna=False, history=False
                )
            else:
                groupby_result = vis.data.groupby(groupby_attr.attribute, dropna=False, history=False)
            groupby_result = groupby_result.agg(agg_func)
            intermediate = groupby_result.reset_index()
            vis._vis_data = intermediate.__finalize__(vis.data)
        result_vals = list(vis.data[groupby_attr.attribute])
        # create existing group by attribute combinations if color is specified
        # this is needed to check what combinations of group_by_attr and color_attr
        # values have a non-zero number of elements in them
        if has_color:
            res_color_combi_vals = []
            result_color_vals = list(vis.data[color_attr.attribute])
            for i in range(0, len(result_vals)):
                res_color_combi_vals.append([result_vals[i], result_color_vals[i]])
        # For filtered aggregation that have missing groupby-attribute values,
        # set these aggregated values as 0, since no datapoints.
        # BUGFIX: parenthesized the condition — it previously parsed as
        # `isFiltered or (has_color and attr_unique_vals)`, so the
        # attr_unique_vals guard never protected the isFiltered path.
        if (isFiltered or has_color) and attr_unique_vals:
            N_unique_vals = len(attr_unique_vals)
            if len(result_vals) != N_unique_vals * color_cardinality:
                columns = vis.data.columns
                if has_color:
                    # Right-merge against the full cross product of groupby x color values,
                    # then zero-fill the aggregated columns for absent combinations.
                    df = pd.DataFrame(
                        {
                            columns[0]: attr_unique_vals * color_cardinality,
                            columns[1]: pd.Series(color_attr_vals).repeat(N_unique_vals),
                        }
                    )
                    vis._vis_data = vis.data.merge(
                        df,
                        on=[columns[0], columns[1]],
                        how="right",
                        suffixes=["", "_right"],
                    )
                    for col in columns[2:]:
                        vis.data[col] = vis.data[col].fillna(0)  # Triggers __setitem__
                    assert len(list(vis.data[groupby_attr.attribute])) == N_unique_vals * len(
                        color_attr_vals
                    ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute, color_attr.attribute}`."
                    # Keep only the three relevant columns not the *_right columns resulting from merge
                    vis._vis_data = vis.data.iloc[:, :3]
                else:
                    df = pd.DataFrame({columns[0]: attr_unique_vals})
                    vis._vis_data = vis.data.merge(
                        df, on=columns[0], how="right", suffixes=["", "_right"]
                    )
                    for col in columns[1:]:
                        vis.data[col] = vis.data[col].fillna(0)
                    assert (
                        len(list(vis.data[groupby_attr.attribute])) == N_unique_vals
                    ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`."
        vis._vis_data = vis._vis_data.dropna(subset=[measure_attr.attribute])
        try:
            vis._vis_data = vis._vis_data.sort_values(by=groupby_attr.attribute, ascending=True)
        except TypeError:
            # Mixed-type groupby columns cannot be sorted; coerce to str and retry.
            warnings.warn(
                f"\nLux detects that the attribute '{groupby_attr.attribute}' maybe contain mixed type."
                + f"\nTo visualize this attribute, you may want to convert the '{groupby_attr.attribute}' into a uniform type as follows:"
                + f"\n\tdf['{groupby_attr.attribute}'] = df['{groupby_attr.attribute}'].astype(str)"
            )
            vis._vis_data[groupby_attr.attribute] = vis._vis_data[groupby_attr.attribute].astype(str)
            vis._vis_data = vis._vis_data.sort_values(by=groupby_attr.attribute, ascending=True)
        vis._vis_data = vis._vis_data.reset_index()
        vis._vis_data = vis._vis_data.drop(columns="index")