Example #1
def test_autoencoding_histogram(global_var):
    lux.config.set_executor_type("Pandas")
    # No channel specified
    df = pytest.car_df
    # change pandas dtype for the column "Year" to datetime
    df["Year"] = pd.to_datetime(df["Year"], format="%Y")
    vis = Vis([lux.Clause(attribute="MilesPerGal", channel="y")], df)
    check_attribute_on_channel(vis, "MilesPerGal", "y")

    vis = Vis([lux.Clause(attribute="MilesPerGal", channel="x")], df)
    assert vis.get_attr_by_channel("x")[0].attribute == "MilesPerGal"
    assert vis.get_attr_by_channel("y")[0].attribute == "Record"

    # No channel specified
    # test for sql executor
    connection = psycopg2.connect(
        "host=localhost dbname=postgres user=postgres password=lux")
    lux.config.set_SQL_connection(connection)
    sql_df = lux.LuxSQLTable(table_name="cars")
    vis = Vis([lux.Clause(attribute="milespergal", channel="y")], sql_df)
    check_attribute_on_channel(vis, "milespergal", "y")

    vis = Vis([lux.Clause(attribute="milespergal", channel="x")], sql_df)
    assert vis.get_attr_by_channel("x")[0].attribute == "milespergal"
    assert vis.get_attr_by_channel("y")[0].attribute == "Record"
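
Note: the check_attribute_on_channel helper used throughout these tests is defined elsewhere in the test suite and is not shown in this excerpt. A minimal sketch of what it likely asserts, reconstructed from how it is called here (the actual body may differ):

def check_attribute_on_channel(vis, attr_name, channel):
    # hedged reconstruction: the attribute encoded on `channel` should be `attr_name`
    assert vis.get_attr_by_channel(channel)[0].attribute == attr_name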
Example #2
def test_special_char():
    dataset = [
        {"special.char": 1, "normal": 2},
        {"special.char": 1, "normal": 2},
        {"special.char": 1, "normal": 5},
        {"special.char": 1, "normal": 2},
        {"special.char": 1, "normal": 3},
        {"special.char": 1, "normal": 2},
        {"special.char": 1, "normal": 6},
        {"special.char": 1, "normal": 2},
        {"special.char": 1, "normal": 7},
        {"special.char": 1, "normal": 2},
        {"special.char": 3, "normal": 10},
        {"special.char": 1, "normal": 1},
        {"special.char": 5, "normal": 2},
        {"special.char": 1, "normal": 2},
        {"special.char": 1, "normal": 2},
        {"special.char": 1, "normal": 2},
        {"special.char": 1, "normal": 2},
    ]
    test = pd.DataFrame(dataset)

    from lux.vis.Vis import Vis

    # TODO: add assert that checks that the bar chart is rendered correctly in Altair
    vis = Vis(["special.char"], test)
    assert vis.mark == "bar"
    assert vis.intent == ["special.char"]
    assert vis.get_attr_by_channel("x")[0].attribute == "Record"
    assert vis.get_attr_by_channel("y")[0].attribute == "special.char"
    vis = vis.to_Altair()
    assert (
        "alt.Y('specialchar', type= 'nominal', axis=alt.Axis(labelOverlap=True, title='special.char'))"
        in vis
    )
    assert (
        "alt.X('Record', type= 'quantitative', title='Number of Records', axis=alt.Axis(title='Number of Records')"
        in vis
    )
    # Checking that this works even when there are multiple "." in the column name
    test = test.rename(columns={"special.char": "special..char.."})
    # TODO: add assert that checks that the bar chart is rendered correctly in Altair
    vis = Vis(["special..char.."], test)
    assert vis.mark == "bar"
    assert vis.intent == ["special..char.."]
    assert vis.get_attr_by_channel("x")[0].attribute == "Record"
    assert vis.get_attr_by_channel("y")[0].attribute == "special..char.."
    vis = vis.to_Altair()
    assert (
        "alt.Y('specialchar', type= 'nominal', axis=alt.Axis(labelOverlap=True, title='special..char..')"
        in vis
    )
    assert (
        "alt.X('Record', type= 'quantitative', title='Number of Records', axis=alt.Axis(title='Number of Records')"
        in vis
    )
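
The assertions above work because Lux strips "." from column names before handing them to Altair (Vega-Lite treats "." as nested-field access), while the original name is preserved as the axis title. A minimal sketch of that sanitization step (an assumption about the mechanism, not the library's exact code):

column = "special..char.."
safe_field = column.replace(".", "")  # field name passed to Altair -> "specialchar"
axis_title = column                   # human-readable title kept on the axis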
Example #3
    def execute_2D_binning(vis: Vis) -> None:
        """
        Apply 2D binning (heatmap) to vis.data

        Parameters
        ----------
        vis : Vis
        """
        pd.reset_option("mode.chained_assignment")
        with pd.option_context("mode.chained_assignment", None):
            x_attr = vis.get_attr_by_channel("x")[0].attribute
            y_attr = vis.get_attr_by_channel("y")[0].attribute

            vis._vis_data["xBin"] = pd.cut(vis._vis_data[x_attr],
                                           bins=lux.config.heatmap_bin_size)
            vis._vis_data["yBin"] = pd.cut(vis._vis_data[y_attr],
                                           bins=lux.config.heatmap_bin_size)

            color_attr = vis.get_attr_by_channel("color")
            if len(color_attr) > 0:
                color_attr = color_attr[0]
                groups = vis._vis_data.groupby(
                    ["xBin", "yBin"], history=False)[color_attr.attribute]
                if color_attr.data_type == "nominal":
                    # Compute mode and count. Mode aggregates each cell by taking the majority vote for the categorical variable; in case of ties across categories, pick the first item (.iat[0]).
                    result = groups.agg([
                        ("count", "count"),
                        (color_attr.attribute,
                         lambda x: pd.Series.mode(x).iat[0]),
                    ]).reset_index()
                elif color_attr.data_type == "quantitative" or color_attr.data_type == "temporal":
                    # Compute the average of all values in the bin
                    result = groups.agg([("count", "count"),
                                         (color_attr.attribute, "mean")
                                         ]).reset_index()
                result = result.dropna()
            else:
                groups = vis._vis_data.groupby(["xBin", "yBin"],
                                               history=False)[x_attr]
                result = groups.count().reset_index(name=x_attr)
                result = result.rename(columns={x_attr: "count"})
                result = result[result["count"] != 0]

            # convert type to facilitate weighted correlation interestingness calculation
            result["xBinStart"] = result["xBin"].apply(
                lambda x: x.left).astype("float")
            result["xBinEnd"] = result["xBin"].apply(lambda x: x.right)

            result["yBinStart"] = result["yBin"].apply(
                lambda x: x.left).astype("float")
            result["yBinEnd"] = result["yBin"].apply(lambda x: x.right)

            vis._vis_data = result.drop(columns=["xBin", "yBin"])
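
For reference, the heart of the binning step above is pd.cut, which yields interval-valued bins whose endpoints become the xBinStart/xBinEnd columns. A minimal standalone sketch with hypothetical data:

import pandas as pd

s = pd.Series([1.0, 2.5, 4.0, 7.5, 9.9])                 # hypothetical x values
bins = pd.cut(s, bins=5)                                  # interval column, like "xBin"
starts = bins.apply(lambda iv: iv.left).astype("float")   # like "xBinStart"
ends = bins.apply(lambda iv: iv.right)                    # like "xBinEnd"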
Example #4
File: PandasExecutor.py, Project: ccubc/lux
    def execute_2D_binning(vis: Vis):
        pd.reset_option('mode.chained_assignment')
        with pd.option_context('mode.chained_assignment', None):
            x_attr = vis.get_attr_by_channel("x")[0]
            y_attr = vis.get_attr_by_channel("y")[0]

            vis._vis_data.loc[:,
                              "xBin"] = pd.cut(vis._vis_data[x_attr.attribute],
                                               bins=40)
            vis._vis_data.loc[:,
                              "yBin"] = pd.cut(vis._vis_data[y_attr.attribute],
                                               bins=40)

            color_attr = vis.get_attr_by_channel("color")
            if (len(color_attr) > 0):
                color_attr = color_attr[0]
                groups = vis._vis_data.groupby(['xBin',
                                                'yBin'])[color_attr.attribute]
                if (color_attr.data_type == "nominal"):
                    # Compute mode and count. Mode aggregates each cell by taking the majority vote for the categorical variable; in case of ties across categories, pick the first item (.iat[0]).
                    result = groups.agg([("count", "count"),
                                         (color_attr.attribute,
                                          lambda x: pd.Series.mode(x).iat[0])
                                         ]).reset_index()
                elif (color_attr.data_type == "quantitative"):
                    # Compute the average of all values in the bin
                    result = groups.agg([("count", "count"),
                                         (color_attr.attribute, "mean")
                                         ]).reset_index()
                result = result.dropna()
            else:
                groups = vis._vis_data.groupby(['xBin',
                                                'yBin'])[x_attr.attribute]
                result = groups.agg("count").reset_index(
                )  # .agg in this line throws SettingWithCopyWarning
                result = result.rename(columns={x_attr.attribute: "count"})
                result = result[result["count"] != 0]

            # convert type to facilitate weighted correlation interestingness calculation
            result.loc[:, "xBinStart"] = result["xBin"].apply(
                lambda x: x.left).astype('float')
            result.loc[:, "xBinEnd"] = result["xBin"].apply(lambda x: x.right)

            result.loc[:, "yBinStart"] = result["yBin"].apply(
                lambda x: x.left).astype('float')
            result.loc[:, "yBinEnd"] = result["yBin"].apply(lambda x: x.right)

            vis._vis_data = result.drop(columns=["xBin", "yBin"])
Example #5
    def execute_aggregate(view: Vis, ldf: LuxDataFrame):
        import pandas as pd
        x_attr = view.get_attr_by_channel("x")[0]
        y_attr = view.get_attr_by_channel("y")[0]
        groupby_attr = ""
        measure_attr = ""
        if (y_attr.aggregation != ""):
            groupby_attr = x_attr
            measure_attr = y_attr
            agg_func = y_attr.aggregation
        if (x_attr.aggregation != ""):
            groupby_attr = y_attr
            measure_attr = x_attr
            agg_func = x_attr.aggregation

        if (measure_attr != ""):
            # bar chart case: need count data for each group
            if (measure_attr.attribute == "Record"):
                where_clause, filterVars = SQLExecutor.execute_filter(view)
                count_query = "SELECT {}, COUNT({}) FROM {} {} GROUP BY {}".format(
                    groupby_attr.attribute, groupby_attr.attribute,
                    ldf.table_name, where_clause, groupby_attr.attribute)
                view.data = pd.read_sql(count_query, ldf.SQLconnection)
                view.data = view.data.rename(columns={"count": "Record"})
                view.data = utils.pandas_to_lux(view.data)

            else:
                where_clause, filterVars = SQLExecutor.execute_filter(view)
                if agg_func == "mean":
                    mean_query = "SELECT {}, AVG({}) as {} FROM {} {} GROUP BY {}".format(
                        groupby_attr.attribute, measure_attr.attribute,
                        measure_attr.attribute, ldf.table_name, where_clause,
                        groupby_attr.attribute)
                    view.data = pd.read_sql(mean_query, ldf.SQLconnection)
                    view.data = utils.pandas_to_lux(view.data)
                if agg_func == "sum":
                    mean_query = "SELECT {}, SUM({}) as {} FROM {} {} GROUP BY {}".format(
                        groupby_attr.attribute, measure_attr.attribute,
                        measure_attr.attribute, ldf.table_name, where_clause,
                        groupby_attr.attribute)
                    view.data = pd.read_sql(mean_query, ldf.SQLconnection)
                    view.data = utils.pandas_to_lux(view.data)
                if agg_func == "max":
                    mean_query = "SELECT {}, MAX({}) as {} FROM {} {} GROUP BY {}".format(
                        groupby_attr.attribute, measure_attr.attribute,
                        measure_attr.attribute, ldf.table_name, where_clause,
                        groupby_attr.attribute)
                    view.data = pd.read_sql(mean_query, ldf.SQLconnection)
                    view.data = utils.pandas_to_lux(view.data)

            # pad empty categories with 0 counts after the filter is applied
            all_attr_vals = ldf.unique_values[groupby_attr.attribute]
            result_vals = list(view.data[groupby_attr.attribute])
            if (len(result_vals) != len(all_attr_vals)):
                # For filtered aggregations with missing groupby-attribute values, set the aggregated value to 0, since there are no data points
                for vals in all_attr_vals:
                    if (vals not in result_vals):
                        view.data.loc[len(view.data)] = [
                            vals
                        ] + [0] * (len(view.data.columns) - 1)
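
For the "Record" (count) branch above, the generated SQL takes the following shape; a hedged illustration with hypothetical values (attribute "Origin", table "cars", empty where clause):

count_query = "SELECT {}, COUNT({}) FROM {} {} GROUP BY {}".format(
    "Origin", "Origin", "cars", "", "Origin")
# -> 'SELECT Origin, COUNT(Origin) FROM cars  GROUP BY Origin'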
Example #6
    def execute_2D_binning(vis: Vis):
        pd.reset_option('mode.chained_assignment')
        with pd.option_context('mode.chained_assignment', None):
            x_attr = vis.get_attr_by_channel("x")[0]
            y_attr = vis.get_attr_by_channel("y")[0]

            vis._vis_data.loc[:,
                              "xBin"] = pd.cut(vis._vis_data[x_attr.attribute],
                                               bins=30)
            vis._vis_data.loc[:,
                              "yBin"] = pd.cut(vis._vis_data[y_attr.attribute],
                                               bins=30)
            groups = vis._vis_data.groupby(['xBin', 'yBin'])[x_attr.attribute]
            result = groups.agg("count").reset_index(
            )  # .agg in this line throws SettingWithCopyWarning
            result = result.rename(columns={x_attr.attribute: "z"})
            result = result[result["z"] != 0]

            # convert type to facilitate weighted correlation interestingness calculation
            result.loc[:, "xBinStart"] = result["xBin"].apply(
                lambda x: x.left).astype('float')
            result.loc[:, "xBinEnd"] = result["xBin"].apply(lambda x: x.right)

            result.loc[:, "yBinStart"] = result["yBin"].apply(
                lambda x: x.left).astype('float')
            result.loc[:, "yBinEnd"] = result["yBin"].apply(lambda x: x.right)

            vis._vis_data = result.drop(columns=["xBin", "yBin"])
Example #7
File: Compiler.py, Project: Qutubkhan/lux
    def enforce_specified_channel(vis: Vis, auto_channel: Dict[str, str]):
        """
        Enforces that channels specified by the user in the Vis override the showMe autoChannels.

        Parameters
        ----------
        vis : lux.vis.Vis
                Input Vis without channel specification.
        auto_channel : Dict[str,str]
                Key-value pairs in the form {channel: attributeName} specifying the showMe-recommended channel locations.

        Returns
        -------
        vis : lux.vis.Vis
                Vis with channel specification combining both original and auto_channel specification.

        Raises
        ------
        ValueError
                Raised when more than one attribute is specified in the same channel.
        """
        result_dict = (
            {}
        )  # result of enforcing specified channel will be stored in result_dict
        specified_dict = (
            {}
        )  # specified_dict={"x":[],"y":[list of Dobj with y specified as channel]}
        # create a dictionary of specified channels in the given dobj
        for val in auto_channel.keys():
            specified_dict[val] = vis.get_attr_by_channel(val)
            result_dict[val] = ""
        # for every element, replace with what's in specified_dict if specified
        for sVal, sAttr in specified_dict.items():
            if len(sAttr) == 1:  # if specified in dobj
                # remove the specified channel from auto_channel (matching by value, since channel key may not be same)
                for i in list(auto_channel.keys()):
                    if (auto_channel[i].attribute == sAttr[0].attribute) and (
                            auto_channel[i].channel == sVal
                    ):  # need to ensure the channel is the same (edge case: duplicate columns with the same attribute name)
                        auto_channel.pop(i)
                        break
                sAttr[0].channel = sVal
                result_dict[sVal] = sAttr[0]
            elif len(sAttr) > 1:
                raise ValueError(
                    "There should not be more than one attribute specified in the same channel."
                )
        # For the leftover channels that are still unspecified in result_dict,
        # and the leftovers in the auto_channel specification,
        # step through them together and fill them in automatically.
        leftover_channels = list(
            filter(lambda x: result_dict[x] == "", result_dict))
        for leftover_channel, leftover_encoding in zip(leftover_channels,
                                                       auto_channel.values()):
            leftover_encoding.channel = leftover_channel
            result_dict[leftover_channel] = leftover_encoding
        vis._inferred_intent = list(result_dict.values())
        return vis
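
The override-then-fill behavior can be illustrated with plain strings standing in for Clause objects (a simplified sketch with hypothetical attribute names, not the library's data structures):

auto_channel = {"x": "Horsepower", "y": "MilesPerGal"}  # showMe recommendation
specified = {"x": "MilesPerGal"}                        # user pinned "x"
result = dict(specified)
leftover_attrs = [a for a in auto_channel.values() if a not in result.values()]
leftover_channels = [c for c in auto_channel if c not in result]
result.update(zip(leftover_channels, leftover_attrs))
# result == {"x": "MilesPerGal", "y": "Horsepower"}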
Example #8
def test_autoencoding_histogram(global_var):
    # No channel specified
    # test for sql executor
    sql_df = lux.LuxSQLTable(table_name="cars")
    vis = Vis([lux.Clause(attribute="milespergal", channel="y")], sql_df)
    check_attribute_on_channel(vis, "milespergal", "y")

    vis = Vis([lux.Clause(attribute="milespergal", channel="x")], sql_df)
    assert vis.get_attr_by_channel("x")[0].attribute == "milespergal"
    assert vis.get_attr_by_channel("y")[0].attribute == "Record"
Example #9
def test_autoencoding_histogram(global_var):
    # No channel specified
    df = pytest.car_df
    # change pandas dtype for the column "Year" to datetime
    df["Year"] = pd.to_datetime(df["Year"], format="%Y")
    vis = Vis([lux.Clause(attribute="MilesPerGal", channel="y")], df)
    check_attribute_on_channel(vis, "MilesPerGal", "y")

    vis = Vis([lux.Clause(attribute="MilesPerGal", channel="x")], df)
    assert vis.get_attr_by_channel("x")[0].attribute == "MilesPerGal"
    assert vis.get_attr_by_channel("y")[0].attribute == "Record"
Example #10
def test_autoencoding_histogram():
    # No channel specified
    df = pd.read_csv("lux/data/car.csv")
    df["Year"] = pd.to_datetime(
        df["Year"],
        format='%Y')  # change pandas dtype for the column "Year" to datetime
    vis = Vis([lux.Clause(attribute="MilesPerGal", channel="y")], df)
    check_attribute_on_channel(vis, "MilesPerGal", "y")

    vis = Vis([lux.Clause(attribute="MilesPerGal", channel="x")], df)
    assert vis.get_attr_by_channel("x")[0].attribute == "MilesPerGal"
    assert vis.get_attr_by_channel("y")[0].attribute == "Record"
Example #11
def test_refresh_inplace():
    df = pd.DataFrame({
        'date': ['2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01'],
        'value': [10.5, 15.2, 20.3, 25.2]
    })

    assert df.data_type['nominal'][0] == 'date'

    from lux.vis.Vis import Vis
    vis = Vis(["date", "value"], df)

    df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d")

    assert df.data_type['temporal'][0] == 'date'

    vis.refresh_source(df)
    assert vis.mark == "line"
    assert vis.get_attr_by_channel("x")[0].attribute == "date"
    assert vis.get_attr_by_channel("y")[0].attribute == "value"
Example #12
def test_refresh_inplace():
    df = pd.DataFrame({
        'date': ['2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01'],
        'value': [10.5, 15.2, 20.3, 25.2]
    })
    with pytest.warns(
            UserWarning,
            match="Lux detects that the attribute 'date' may be temporal."):
        df._repr_html_()
    assert df.data_type_lookup["date"] == "temporal"

    from lux.vis.Vis import Vis
    vis = Vis(["date", "value"], df)

    df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d")
    df.maintain_metadata()
    assert df.data_type['temporal'][0] == 'date'

    vis.refresh_source(df)
    assert vis.mark == "line"
    assert vis.get_attr_by_channel("x")[0].attribute == "date"
    assert vis.get_attr_by_channel("y")[0].attribute == "value"
Example #13
def test_refresh_inplace():
    df = pd.DataFrame(
        {
            "date": ["2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01"],
            "value": [10.5, 15.2, 20.3, 25.2],
        }
    )
    with pytest.warns(UserWarning, match="Lux detects that the attribute 'date' may be temporal."):
        df._ipython_display_()
    assert df.data_type["date"] == "temporal"

    from lux.vis.Vis import Vis

    vis = Vis(["date", "value"], df)

    df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
    df.maintain_metadata()
    inverted_data_type = lux.config.executor.invert_data_type(df.data_type)
    assert inverted_data_type["temporal"][0] == "date"

    vis.refresh_source(df)
    assert vis.mark == "line"
    assert vis.get_attr_by_channel("x")[0].attribute == "date"
    assert vis.get_attr_by_channel("y")[0].attribute == "value"
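
invert_data_type flips the {column: type} mapping checked above into {type: [columns]}. A minimal sketch of that transformation, inferred from the two assertions (the executor's actual implementation may differ):

def invert_data_type(data_type):
    # {"date": "temporal", ...} -> {"temporal": ["date"], ...}
    inverted = {}
    for col, dtype in data_type.items():
        inverted.setdefault(dtype, []).append(col)
    return inverted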
Example #14
    def execute_aggregate(vis: Vis, isFiltered=True):
        '''
        Aggregate data points on an axis for bar or line charts

        Parameters
        ----------
        vis: lux.Vis
            lux.Vis object that represents a visualization
        isFiltered: boolean
            boolean that represents whether a filter has been applied to the vis's data

        Returns
        -------
        None
        '''
        import numpy as np

        x_attr = vis.get_attr_by_channel("x")[0]
        y_attr = vis.get_attr_by_channel("y")[0]
        has_color = False
        groupby_attr = ""
        measure_attr = ""
        if (x_attr.aggregation is None or y_attr.aggregation is None):
            return
        if (y_attr.aggregation != ""):
            groupby_attr = x_attr
            measure_attr = y_attr
            agg_func = y_attr.aggregation
        if (x_attr.aggregation != ""):
            groupby_attr = y_attr
            measure_attr = x_attr
            agg_func = x_attr.aggregation
        if (groupby_attr.attribute in vis.data.unique_values.keys()):
            attr_unique_vals = vis.data.unique_values[groupby_attr.attribute]
        #checks if color is specified in the Vis
        if len(vis.get_attr_by_channel("color")) == 1:
            color_attr = vis.get_attr_by_channel("color")[0]
            color_attr_vals = vis.data.unique_values[color_attr.attribute]
            color_cardinality = len(color_attr_vals)
            #NOTE: might want to have a check somewhere to not use categorical variables with greater than some number of categories as a Color variable----------------
            has_color = True
        else:
            color_cardinality = 1

        if (measure_attr != ""):
            if (measure_attr.attribute == "Record"):
                vis._vis_data = vis.data.reset_index()
                #if color is specified, need to group by groupby_attr and color_attr
                if has_color:
                    vis._vis_data = vis.data.groupby(
                        [groupby_attr.attribute,
                         color_attr.attribute]).count().reset_index()
                    vis._vis_data = vis.data.rename(
                        columns={"index": "Record"})
                    vis._vis_data = vis.data[[
                        groupby_attr.attribute, color_attr.attribute, "Record"
                    ]]
                else:
                    vis._vis_data = vis.data.groupby(
                        groupby_attr.attribute).count().reset_index()
                    vis._vis_data = vis.data.rename(
                        columns={"index": "Record"})
                    vis._vis_data = vis.data[[
                        groupby_attr.attribute, "Record"
                    ]]
            else:
                #if color is specified, need to group by groupby_attr and color_attr
                if has_color:
                    groupby_result = vis.data.groupby(
                        [groupby_attr.attribute, color_attr.attribute])
                else:
                    groupby_result = vis.data.groupby(groupby_attr.attribute)
                groupby_result = groupby_result.agg(agg_func)
                intermediate = groupby_result.reset_index()
                vis._vis_data = intermediate.__finalize__(vis.data)
            result_vals = list(vis.data[groupby_attr.attribute])
            #create existing group by attribute combinations if color is specified
            #this is needed to check what combinations of group_by_attr and color_attr values have a non-zero number of elements in them
            if has_color:
                res_color_combi_vals = []
                result_color_vals = list(vis.data[color_attr.attribute])
                for i in range(0, len(result_vals)):
                    res_color_combi_vals.append(
                        [result_vals[i], result_color_vals[i]])
            # For filtered aggregations with missing groupby-attribute values, set the aggregated value to 0, since there are no data points
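            # NOTE: "and" binds tighter than "or", so this evaluates as
            # isFiltered or (has_color and attr_unique_vals); attr_unique_vals
            # is only bound above when the groupby attribute has unique values.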
            if (isFiltered or has_color and attr_unique_vals):
                N_unique_vals = len(attr_unique_vals)
                if (len(result_vals) != N_unique_vals * color_cardinality):
                    columns = vis.data.columns
                    if has_color:
                        df = pd.DataFrame({
                            columns[0]:
                            attr_unique_vals * color_cardinality,
                            columns[1]:
                            pd.Series(color_attr_vals).repeat(N_unique_vals)
                        })
                        vis._vis_data = vis.data.merge(
                            df,
                            on=[columns[0], columns[1]],
                            how='right',
                            suffixes=['', '_right'])
                        for col in columns[2:]:
                            vis.data[col] = vis.data[col].fillna(
                                0)  #Triggers __setitem__
                        assert len(
                            list(vis.data[groupby_attr.attribute])
                        ) == N_unique_vals * len(
                            color_attr_vals
                        ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute, color_attr.attribute}`."
                        vis._vis_data = vis.data.iloc[:, :
                                                      3]  # Keep only the three relevant columns not the *_right columns resulting from merge
                    else:
                        df = pd.DataFrame({columns[0]: attr_unique_vals})

                        vis._vis_data = vis.data.merge(df,
                                                       on=columns[0],
                                                       how='right',
                                                       suffixes=['', '_right'])

                        for col in columns[1:]:
                            vis.data[col] = vis.data[col].fillna(0)
                        assert len(
                            list(vis.data[groupby_attr.attribute])
                        ) == N_unique_vals, f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`."
            vis._vis_data = vis.data.sort_values(by=groupby_attr.attribute,
                                                 ascending=True)
            vis._vis_data = vis.data.reset_index()
            vis._vis_data = vis.data.drop(columns="index")
Example #15
def interestingness(vis:Vis ,ldf:LuxDataFrame) -> int:
	"""
	Compute the interestingness score of the vis.
	The interestingness metric is dependent on the vis type.

	Parameters
	----------
	vis : Vis
	ldf : LuxDataFrame

	Returns
	-------
	int
		Interestingness Score
	"""	
	

	if vis.data is None or len(vis.data)==0:
		raise Exception("Vis.data needs to be populated before interestingness can be computed. Run Executor.execute(vis,ldf).")

	n_dim = 0
	n_msr = 0
	
	filter_specs = utils.get_filter_specs(vis._inferred_intent)
	vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)

	record_attrs = list(filter(lambda x: x.attribute=="Record" and x.data_model=="measure", vis_attrs_specs))
	n_record = len(record_attrs)
	for clause in vis_attrs_specs:
		if (clause.attribute!="Record"):
			if (clause.data_model == 'dimension'):
				n_dim += 1
			if (clause.data_model == 'measure'):
				n_msr += 1
	n_filter = len(filter_specs)
	attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"]
	dimension_lst = vis.get_attr_by_data_model("dimension")
	measure_lst = vis.get_attr_by_data_model("measure")
	v_size = len(vis.data)
	# Line/Bar Chart
	#print("r:", n_record, "m:", n_msr, "d:",n_dim)
	if (n_dim == 1 and (n_msr==0 or n_msr==1)):
		if (v_size<2): return -1 
		if (n_filter == 0):
			return unevenness(vis, ldf, measure_lst, dimension_lst)
		elif(n_filter==1):
			return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute)
	# Histogram
	elif (n_dim == 0 and n_msr == 1):
		if (v_size<2): return -1 
		if (n_filter == 0):
			v = vis.data["Number of Records"]
			return skewness(v)
		elif (n_filter == 1):
			return deviation_from_overall(vis, ldf, filter_specs, "Number of Records")
	# Scatter Plot
	elif (n_dim == 0 and n_msr == 2):
		if (v_size<2): return -1 
		if (n_filter==1):
			v_filter_size = get_filtered_size(filter_specs, vis.data)
			sig = v_filter_size/v_size
		else:
			sig = 1
		return sig * monotonicity(vis,attr_specs)
	# Scatterplot colored by Dimension
	elif (n_dim == 1 and n_msr == 2):
		if (v_size<5): return -1 
		color_attr = vis.get_attr_by_channel("color")[0].attribute
		
		C = ldf.cardinality[color_attr]
		if (C<40):
			return 1/C
		else:
			return -1
	# Scatterplot colored by dimension (NOTE: unreachable; duplicates the condition of the branch above)
	elif (n_dim == 1 and n_msr == 2):
		return 0.2
	# Scatterplot colored by measure
	elif (n_msr == 3):
		return 0.1	
	# colored line and barchart cases
	elif ((vis.mark == "line" or vis.mark == "bar") and n_dim == 2):
		return 0.2
	# Default
	else:
		return -1
Example #16
def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
    """
    Compute the interestingness score of the vis.
    The interestingness metric is dependent on the vis type.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame

    Returns
    -------
    int
            Interestingness Score
    """

    if vis.data is None or len(vis.data) == 0:
        return -1
        # raise Exception("Vis.data needs to be populated before interestingness can be computed. Run Executor.execute(vis,ldf).")

    n_dim = 0
    n_msr = 0

    filter_specs = utils.get_filter_specs(vis._inferred_intent)
    vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)

    record_attrs = list(
        filter(
            lambda x: x.attribute == "Record" and x.data_model == "measure",
            vis_attrs_specs,
        ))
    n_record = len(record_attrs)
    for clause in vis_attrs_specs:
        if clause.attribute != "Record":
            if clause.data_model == "dimension":
                n_dim += 1
            if clause.data_model == "measure":
                n_msr += 1
    n_filter = len(filter_specs)
    attr_specs = [
        clause for clause in vis_attrs_specs if clause.attribute != "Record"
    ]
    dimension_lst = vis.get_attr_by_data_model("dimension")
    measure_lst = vis.get_attr_by_data_model("measure")
    v_size = len(vis.data)
    # Line/Bar Chart
    # print("r:", n_record, "m:", n_msr, "d:",n_dim)
    if n_dim == 1 and (n_msr == 0 or n_msr == 1):
        if v_size < 2:
            return -1
        if n_filter == 0:
            return unevenness(vis, ldf, measure_lst, dimension_lst)
        elif n_filter == 1:
            return deviation_from_overall(vis, ldf, filter_specs,
                                          measure_lst[0].attribute)
    # Histogram
    elif n_dim == 0 and n_msr == 1:
        if v_size < 2:
            return -1
        if n_filter == 0 and "Number of Records" in vis.data:
            v = vis.data["Number of Records"]
            return skewness(v)
        elif n_filter == 1 and "Number of Records" in vis.data:
            return deviation_from_overall(vis, ldf, filter_specs,
                                          "Number of Records")
        return -1
    # Scatter Plot
    elif n_dim == 0 and n_msr == 2:
        if v_size < 10:
            return -1
        if vis.mark == "heatmap":
            return weighted_correlation(vis.data["xBinStart"],
                                        vis.data["yBinStart"],
                                        vis.data["count"])
        if n_filter == 1:
            v_filter_size = get_filtered_size(filter_specs, vis.data)
            sig = v_filter_size / v_size
        else:
            sig = 1
        return sig * monotonicity(vis, attr_specs)
    # Scatterplot colored by Dimension
    elif n_dim == 1 and n_msr == 2:
        if v_size < 10:
            return -1
        color_attr = vis.get_attr_by_channel("color")[0].attribute

        C = ldf.cardinality[color_attr]
        if C < 40:
            return 1 / C
        else:
            return -1
    # Scatterplot colored by dimension (NOTE: unreachable; duplicates the condition of the branch above)
    elif n_dim == 1 and n_msr == 2:
        return 0.2
    # Scatterplot colored by measure
    elif n_msr == 3:
        return 0.1
    # colored line and barchart cases
    elif vis.mark == "line" and n_dim == 2:
        return 0.15
    # for colored bar charts, scoring is based on the chi-square test of independence;
    # higher scores go to colored bar charts with fewer total categories, as these charts are easier to read and thus more useful to users
    elif vis.mark == "bar" and n_dim == 2:
        from scipy.stats import chi2_contingency

        measure_column = vis.get_attr_by_data_model("measure")[0].attribute
        dimension_columns = vis.get_attr_by_data_model("dimension")

        groupby_column = dimension_columns[0].attribute
        color_column = dimension_columns[1].attribute

        contingency_table = []
        groupby_cardinality = ldf.cardinality[groupby_column]
        groupby_unique_vals = ldf.unique_values[groupby_column]
        for c in range(0, groupby_cardinality):
            contingency_table.append(
                vis.data[vis.data[groupby_column] ==
                         groupby_unique_vals[c]][measure_column])
        score = 0.12
        # ValueError results if an entire column of the contingency table is 0, can happen if an applied filter results in
        # a category having no counts

        try:
            color_cardinality = ldf.cardinality[color_column]
            # scale down score based on number of categories
            chi2_score = chi2_contingency(contingency_table)[0] * 0.9**(
                color_cardinality + groupby_cardinality)
            score = min(0.10, chi2_score)
        except ValueError:
            pass
        return score
    # Default
    else:
        return -1
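
The colored-bar branch above damps the raw chi-square statistic by 0.9 ** (color_cardinality + groupby_cardinality) and caps the result at 0.10. A quick arithmetic check with hypothetical cardinalities:

chi2_raw = 25.0                     # hypothetical chi-square statistic
damped = chi2_raw * 0.9 ** (3 + 5)  # 25.0 * 0.9**8 ~= 10.76
score = min(0.10, damped)           # -> 0.10 (capped)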
Example #17
    def execute_aggregate(view: Vis, tbl: LuxSQLTable, isFiltered=True):
        """
        Aggregate data points on an axis for bar or line charts
        Parameters
        ----------
        vis: lux.Vis
            lux.Vis object that represents a visualization
        tbl : lux.core.frame
            LuxSQLTable with specified intent.
        isFiltered: boolean
            boolean that represents whether a vis has had a filter applied to its data
        Returns
        -------
        None
        """
        x_attr = view.get_attr_by_channel("x")[0]
        y_attr = view.get_attr_by_channel("y")[0]
        has_color = False
        groupby_attr = ""
        measure_attr = ""
        if x_attr.aggregation is None or y_attr.aggregation is None:
            return
        if y_attr.aggregation != "":
            groupby_attr = x_attr
            measure_attr = y_attr
            agg_func = y_attr.aggregation
        if x_attr.aggregation != "":
            groupby_attr = y_attr
            measure_attr = x_attr
            agg_func = x_attr.aggregation
        if groupby_attr.attribute in tbl.unique_values.keys():
            attr_unique_vals = tbl.unique_values[groupby_attr.attribute]
        # checks if color is specified in the Vis
        if len(view.get_attr_by_channel("color")) == 1:
            color_attr = view.get_attr_by_channel("color")[0]
            color_attr_vals = tbl.unique_values[color_attr.attribute]
            color_cardinality = len(color_attr_vals)
            # NOTE: might want to have a check somewhere to not use categorical variables with greater than some number of categories as a Color variable----------------
            has_color = True
        else:
            color_cardinality = 1
        if measure_attr != "":
            # barchart case, need count data for each group
            if measure_attr.attribute == "Record":
                where_clause, filterVars = SQLExecutor.execute_filter(view)

                length_query = pandas.read_sql(
                    "SELECT COUNT(*) as length FROM {} {}".format(tbl.table_name, where_clause),
                    lux.config.SQLconnection,
                )
                # generates query for colored barchart case
                if has_color:
                    count_query = 'SELECT "{}", "{}", COUNT("{}") FROM {} {} GROUP BY "{}", "{}"'.format(
                        groupby_attr.attribute,
                        color_attr.attribute,
                        groupby_attr.attribute,
                        tbl.table_name,
                        where_clause,
                        groupby_attr.attribute,
                        color_attr.attribute,
                    )
                    view._vis_data = pandas.read_sql(count_query, lux.config.SQLconnection)
                    view._vis_data = view._vis_data.rename(columns={"count": "Record"})
                    view._vis_data = utils.pandas_to_lux(view._vis_data)
                # generates query for normal barchart case
                else:
                    count_query = 'SELECT "{}", COUNT("{}") FROM {} {} GROUP BY "{}"'.format(
                        groupby_attr.attribute,
                        groupby_attr.attribute,
                        tbl.table_name,
                        where_clause,
                        groupby_attr.attribute,
                    )
                    view._vis_data = pandas.read_sql(count_query, lux.config.SQLconnection)
                    view._vis_data = view._vis_data.rename(columns={"count": "Record"})
                    view._vis_data = utils.pandas_to_lux(view._vis_data)
                # view._vis_data.length = list(length_query["length"])[0]
            # aggregate barchart case, need aggregate data (mean, sum, max) for each group
            else:
                where_clause, filterVars = SQLExecutor.execute_filter(view)

                length_query = pandas.read_sql(
                    "SELECT COUNT(*) as length FROM {} {}".format(tbl.table_name, where_clause),
                    lux.config.SQLconnection,
                )
                # generates query for colored barchart case
                if has_color:
                    if agg_func == "mean":
                        agg_query = (
                            'SELECT "{}", "{}", AVG("{}") as "{}" FROM {} {} GROUP BY "{}", "{}"'.format(
                                groupby_attr.attribute,
                                color_attr.attribute,
                                measure_attr.attribute,
                                measure_attr.attribute,
                                tbl.table_name,
                                where_clause,
                                groupby_attr.attribute,
                                color_attr.attribute,
                            )
                        )
                        view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection)

                        view._vis_data = utils.pandas_to_lux(view._vis_data)
                    if agg_func == "sum":
                        agg_query = (
                            'SELECT "{}", "{}", SUM("{}") as "{}" FROM {} {} GROUP BY "{}", "{}"'.format(
                                groupby_attr.attribute,
                                color_attr.attribute,
                                measure_attr.attribute,
                                measure_attr.attribute,
                                tbl.table_name,
                                where_clause,
                                groupby_attr.attribute,
                                color_attr.attribute,
                            )
                        )
                        view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection)
                        view._vis_data = utils.pandas_to_lux(view._vis_data)
                    if agg_func == "max":
                        agg_query = (
                            'SELECT "{}", "{}", MAX("{}") as "{}" FROM {} {} GROUP BY "{}", "{}"'.format(
                                groupby_attr.attribute,
                                color_attr.attribute,
                                measure_attr.attribute,
                                measure_attr.attribute,
                                tbl.table_name,
                                where_clause,
                                groupby_attr.attribute,
                                color_attr.attribute,
                            )
                        )
                        view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection)
                        view._vis_data = utils.pandas_to_lux(view._vis_data)
                # generates query for normal barchart case
                else:
                    if agg_func == "mean":
                        agg_query = 'SELECT "{}", AVG("{}") as "{}" FROM {} {} GROUP BY "{}"'.format(
                            groupby_attr.attribute,
                            measure_attr.attribute,
                            measure_attr.attribute,
                            tbl.table_name,
                            where_clause,
                            groupby_attr.attribute,
                        )
                        view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection)
                        view._vis_data = utils.pandas_to_lux(view._vis_data)
                    if agg_func == "sum":
                        agg_query = 'SELECT "{}", SUM("{}") as "{}" FROM {} {} GROUP BY "{}"'.format(
                            groupby_attr.attribute,
                            measure_attr.attribute,
                            measure_attr.attribute,
                            tbl.table_name,
                            where_clause,
                            groupby_attr.attribute,
                        )
                        view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection)
                        view._vis_data = utils.pandas_to_lux(view._vis_data)
                    if agg_func == "max":
                        agg_query = 'SELECT "{}", MAX("{}") as "{}" FROM {} {} GROUP BY "{}"'.format(
                            groupby_attr.attribute,
                            measure_attr.attribute,
                            measure_attr.attribute,
                            tbl.table_name,
                            where_clause,
                            groupby_attr.attribute,
                        )
                        view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection)
                        view._vis_data = utils.pandas_to_lux(view._vis_data)
            result_vals = list(view._vis_data[groupby_attr.attribute])
            # create existing group by attribute combinations if color is specified
            # this is needed to check what combinations of group_by_attr and color_attr values have a non-zero number of elements in them
            if has_color:
                res_color_combi_vals = []
                result_color_vals = list(view._vis_data[color_attr.attribute])
                for i in range(0, len(result_vals)):
                    res_color_combi_vals.append([result_vals[i], result_color_vals[i]])
            # For filtered aggregations with missing groupby-attribute values, set the aggregated value to 0, since there are no data points
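            # NOTE: "and" binds tighter than "or", so this evaluates as
            # isFiltered or (has_color and attr_unique_vals); attr_unique_vals
            # is only bound above when the groupby attribute appears in tbl.unique_values.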
            if isFiltered or has_color and attr_unique_vals:
                N_unique_vals = len(attr_unique_vals)
                if len(result_vals) != N_unique_vals * color_cardinality:
                    columns = view._vis_data.columns
                    if has_color:
                        df = pandas.DataFrame(
                            {
                                columns[0]: attr_unique_vals * color_cardinality,
                                columns[1]: pandas.Series(color_attr_vals).repeat(N_unique_vals),
                            }
                        )
                        view._vis_data = view._vis_data.merge(
                            df,
                            on=[columns[0], columns[1]],
                            how="right",
                            suffixes=["", "_right"],
                        )
                        for col in columns[2:]:
                            view._vis_data[col] = view._vis_data[col].fillna(0)  # Triggers __setitem__
                        assert len(list(view._vis_data[groupby_attr.attribute])) == N_unique_vals * len(
                            color_attr_vals
                        ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute, color_attr.attribute}`."
                        view._vis_data = view._vis_data.iloc[
                            :, :3
                        ]  # Keep only the three relevant columns not the *_right columns resulting from merge
                    else:
                        df = pandas.DataFrame({columns[0]: attr_unique_vals})

                        view._vis_data = view._vis_data.merge(
                            df, on=columns[0], how="right", suffixes=["", "_right"]
                        )

                        for col in columns[1:]:
                            view._vis_data[col] = view._vis_data[col].fillna(0)
                        assert (
                            len(list(view._vis_data[groupby_attr.attribute])) == N_unique_vals
                        ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`."
            view._vis_data = view._vis_data.sort_values(by=groupby_attr.attribute, ascending=True)
            view._vis_data = view._vis_data.reset_index()
            view._vis_data = view._vis_data.drop(columns="index")
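
Unlike Example #5, this executor double-quotes identifiers, which protects column names containing special characters in PostgreSQL. A hedged illustration of the colored-barchart count query with hypothetical values:

count_query = 'SELECT "{}", "{}", COUNT("{}") FROM {} {} GROUP BY "{}", "{}"'.format(
    "origin", "cylinders", "origin", "cars", "", "origin", "cylinders")
# -> 'SELECT "origin", "cylinders", COUNT("origin") FROM cars  GROUP BY "origin", "cylinders"'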
Example #18
def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
    """
    Compute the interestingness score of the vis.
    The interestingness metric is dependent on the vis type.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame

    Returns
    -------
    int
            Interestingness Score
    """

    if vis.data is None or len(vis.data) == 0:
        return -1
        # raise Exception("Vis.data needs to be populated before interestingness can be computed. Run Executor.execute(vis,ldf).")
    try:
        filter_specs = utils.get_filter_specs(vis._inferred_intent)
        vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)
        n_dim = vis._ndim
        n_msr = vis._nmsr
        n_filter = len(filter_specs)
        attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"]
        dimension_lst = vis.get_attr_by_data_model("dimension")
        measure_lst = vis.get_attr_by_data_model("measure")
        v_size = len(vis.data)

        if (
            n_dim == 1
            and (n_msr == 0 or n_msr == 1)
            and ldf.current_vis is not None
            and vis.get_attr_by_channel("y")[0].data_type == "quantitative"
            and len(ldf.current_vis) == 1
            and ldf.current_vis[0].mark == "line"
            and len(get_filter_specs(ldf.intent)) > 0
        ):
            query_vc = VisList(ldf.current_vis, ldf)
            query_vis = query_vc[0]
            preprocess(query_vis)
            preprocess(vis)
            return 1 - euclidean_dist(query_vis, vis)

        # Line/Bar Chart
        # print("r:", n_record, "m:", n_msr, "d:",n_dim)
        if n_dim == 1 and (n_msr == 0 or n_msr == 1):
            if v_size < 2:
                return -1

            if n_filter == 0:
                return unevenness(vis, ldf, measure_lst, dimension_lst)
            elif n_filter == 1:
                return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute)
        # Histogram
        elif n_dim == 0 and n_msr == 1:
            if v_size < 2:
                return -1
            if n_filter == 0 and "Number of Records" in vis.data:
                v = vis.data["Number of Records"]
                return skewness(v)
            elif n_filter == 1 and "Number of Records" in vis.data:
                return deviation_from_overall(vis, ldf, filter_specs, "Number of Records")
            return -1
        # Scatter Plot
        elif n_dim == 0 and n_msr == 2:
            if v_size < 10:
                return -1
            if vis.mark == "heatmap":
                return weighted_correlation(
                    vis.data["xBinStart"], vis.data["yBinStart"], vis.data["count"]
                )
            if n_filter == 1:
                v_filter_size = get_filtered_size(filter_specs, vis.data)
                sig = v_filter_size / v_size
            else:
                sig = 1
            return sig * monotonicity(vis, attr_specs)
        # Scatterplot colored by Dimension
        elif n_dim == 1 and n_msr == 2:
            if v_size < 10:
                return -1
            color_attr = vis.get_attr_by_channel("color")[0].attribute

            C = ldf.cardinality[color_attr]
            if C < 40:
                return 1 / C
            else:
                return -1
        # Scatterplot colored by dimension (NOTE: unreachable; duplicates the condition of the branch above)
        elif n_dim == 1 and n_msr == 2:
            return 0.2
        # Scatterplot colored by measure
        elif n_msr == 3:
            return 0.1
        # colored line and barchart cases
        elif vis.mark == "line" and n_dim == 2:
            return 0.15
        # for colored bar charts, scoring is based on the chi-square test of independence;
        # higher scores go to colored bar charts with fewer total categories, as these charts are easier to read and thus more useful to users
        elif vis.mark == "bar" and n_dim == 2:
            from scipy.stats import chi2_contingency

            measure_column = vis.get_attr_by_data_model("measure")[0].attribute
            dimension_columns = vis.get_attr_by_data_model("dimension")

            groupby_column = dimension_columns[0].attribute
            color_column = dimension_columns[1].attribute

            contingency_tbl = pd.crosstab(
                vis.data[groupby_column],
                vis.data[color_column],
                values=vis.data[measure_column],
                aggfunc=sum,
            )

            try:
                color_cardinality = ldf.cardinality[color_column]
                groupby_cardinality = ldf.cardinality[groupby_column]
                # scale down score based on number of categories
                chi2_score = chi2_contingency(contingency_tbl)[0] * 0.9 ** (
                    color_cardinality + groupby_cardinality
                )
                score = min(0.10, chi2_score)
            except (ValueError, KeyError):
                # ValueError results if an entire column of the contingency table is 0, can happen if an applied filter results in a category having no counts
                score = -1
            return score
        # Default
        else:
            return -1
    except:
        if lux.config.interestingness_fallback:
            # Suppress interestingness-related issues
            warnings.warn(f"An error occurred when computing interestingness for: {vis}")
            return -1
        else:
            raise
Example #19
    def execute_aggregate(view: Vis, isFiltered=True):
        '''
        Aggregate data points on an axis for bar or line charts

        Parameters
        ----------
        view: lux.Vis
            lux.Vis object that represents a visualization
        isFiltered: boolean
            boolean that represents whether a filter has been applied to the view's data

        Returns
        -------
        None
        '''
        import numpy as np
        import pandas as pd
        import time

        x_attr = view.get_attr_by_channel("x")[0]
        y_attr = view.get_attr_by_channel("y")[0]
        has_color = False
        groupby_attr = ""
        measure_attr = ""
        if (x_attr.aggregation is None or y_attr.aggregation is None):
            return
        if (y_attr.aggregation != ""):
            groupby_attr = x_attr
            measure_attr = y_attr
            agg_func = y_attr.aggregation
        if (x_attr.aggregation != ""):
            groupby_attr = y_attr
            measure_attr = x_attr
            agg_func = x_attr.aggregation
        #checks if color is specified in the Vis
        if len(view.get_attr_by_channel("color")) == 1:
            color_attr = view.get_attr_by_channel("color")[0]
            color_attr_vals = view.data.unique_values[color_attr.attribute]
            color_cardinality = len(color_attr_vals)
            #NOTE: might want to have a check somewhere to not use categorical variables with greater than some number of categories as a Color variable----------------
            has_color = True
        else:
            color_cardinality = 1
        all_attr_vals = view.data.unique_values[groupby_attr.attribute]
        if (measure_attr != ""):
            if (measure_attr.attribute == "Record"):
                view.data = view.data.reset_index()
                #if color is specified, need to group by groupby_attr and color_attr
                if has_color:
                    view.data = view.data.groupby(
                        [groupby_attr.attribute,
                         color_attr.attribute]).count().reset_index()
                    view.data = view.data.rename(columns={"index": "Record"})
                    view.data = view.data[[
                        groupby_attr.attribute, color_attr.attribute, "Record"
                    ]]
                else:
                    view.data = view.data.groupby(
                        groupby_attr.attribute).count().reset_index()
                    view.data = view.data.rename(columns={"index": "Record"})
                    view.data = view.data[[groupby_attr.attribute, "Record"]]
            else:
                #if color is specified, need to group by groupby_attr and color_attr
                if has_color:
                    groupby_result = view.data.groupby(
                        [groupby_attr.attribute, color_attr.attribute])
                else:
                    groupby_result = view.data.groupby(groupby_attr.attribute)
                view.data = groupby_result.agg(agg_func).reset_index()
            result_vals = list(view.data[groupby_attr.attribute])
            #create existing group by attribute combinations if color is specified
            #this is needed to check what combinations of group_by_attr and color_attr values have a non-zero number of elements in them
            if has_color:
                res_color_combi_vals = []
                result_color_vals = list(view.data[color_attr.attribute])
                for i in range(0, len(result_vals)):
                    res_color_combi_vals.append(
                        [result_vals[i], result_color_vals[i]])
            if (len(result_vals) != len(all_attr_vals) * color_cardinality
                    and (isFiltered or has_color)):
                # For filtered aggregations that are missing group-by attribute values,
                # pad those groups into the result with zeros, since they have no
                # datapoints. (Several earlier variants, appending missing rows one at
                # a time or merging with per-column suffix bookkeeping, were abandoned;
                # the single right-merge below replaces them.)
                columns = view.data.columns
                if has_color:
                    df = pd.DataFrame({
                        columns[0]:
                        all_attr_vals * color_cardinality,
                        columns[1]:
                        pd.Series(color_attr_vals).repeat(len(all_attr_vals))
                    })
                    view.data = view.data.merge(df,
                                                on=[columns[0], columns[1]],
                                                how='right',
                                                suffixes=['', '_right'])
                    for col in columns[2:]:
                        view.data[col] = view.data[col].fillna(0)
                    assert len(
                        list(view.data[groupby_attr.attribute])
                    ) == len(all_attr_vals) * len(
                        color_attr_vals
                    ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute, color_attr.attribute}`."
                    # (the merge above replaces an earlier nested-loop append over
                    # all value/color combinations)
                else:
                    df = pd.DataFrame({columns[0]: all_attr_vals})

                    view.data = view.data.merge(df,
                                                on=columns[0],
                                                how='right',
                                                suffixes=['', '_right'])

                    for col in columns[1:]:
                        view.data[col] = view.data[col].fillna(0)
                    assert len(list(view.data[groupby_attr.attribute])) == len(
                        all_attr_vals
                    ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`."
            view.data = view.data.sort_values(by=groupby_attr.attribute,
                                              ascending=True)
            view.data = view.data.reset_index(drop=True)
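The padding step above fills in group-by values that a filter dropped by right-merging the aggregate against the full value domain and zero-filling the holes. Below is a minimal sketch of that idea on a toy frame; the column names "origin" and "Record" are illustrative stand-ins, not tied to any particular dataset:

import pandas as pd

# Aggregated counts are missing category "c" because a filter removed it.
agg = pd.DataFrame({"origin": ["a", "b"], "Record": [3, 5]})
domain = pd.DataFrame({"origin": ["a", "b", "c"]})

# Right-merge against the full domain, then zero-fill the resulting NaNs.
padded = agg.merge(domain, on="origin", how="right")
padded["Record"] = padded["Record"].fillna(0)
print(padded)
#   origin  Record
# 0      a     3.0
# 1      b     5.0
# 2      c     0.0

When a color channel is present, the same merge is applied to the cross-product of group-by values and color values, built by tiling one list and repeating the other so that every combination appears exactly once.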
Example #20
File: PandasExecutor.py Project: whmz/lux
    def execute_aggregate(vis: Vis, isFiltered=True):
        """
        Aggregate data points on an axis for bar or line charts

        Parameters
        ----------
        vis : lux.Vis
            lux.Vis object that represents a visualization
        isFiltered : bool
            whether the vis data comes from a filtered query; if so, group-by
            values with no datapoints are padded into the result as zeros

        Returns
        -------
        None
        """

        x_attr = vis.get_attr_by_channel("x")[0]
        y_attr = vis.get_attr_by_channel("y")[0]
        has_color = False
        groupby_attr = ""
        measure_attr = ""
        if x_attr.aggregation is None or y_attr.aggregation is None:
            return
        if y_attr.aggregation != "":
            groupby_attr = x_attr
            measure_attr = y_attr
            agg_func = y_attr.aggregation
        if x_attr.aggregation != "":
            groupby_attr = y_attr
            measure_attr = x_attr
            agg_func = x_attr.aggregation
        attr_unique_vals = []
        if groupby_attr.attribute in vis.data.unique_values.keys():
            attr_unique_vals = vis.data.unique_values[groupby_attr.attribute]
        # checks if color is specified in the Vis
        if len(vis.get_attr_by_channel("color")) == 1:
            color_attr = vis.get_attr_by_channel("color")[0]
            color_attr_vals = vis.data.unique_values[color_attr.attribute]
            color_cardinality = len(color_attr_vals)
            # NOTE: might want a check somewhere that avoids using categorical variables with more than some number of categories as a Color variable
            has_color = True
        else:
            color_cardinality = 1
        if measure_attr != "":
            if measure_attr.attribute == "Record":
                # need to get the index name so that we can rename the index column to "Record"
                # if there is no index, default to "index"
                index_name = vis.data.index.name
                if index_name is None:
                    index_name = "index"

                vis._vis_data = vis.data.reset_index()
                # if color is specified, need to group by groupby_attr and color_attr

                if has_color:
                    vis._vis_data = (vis.data.groupby(
                        [groupby_attr.attribute, color_attr.attribute],
                        dropna=False,
                        history=False).count().reset_index().rename(
                            columns={index_name: "Record"}))
                    vis._vis_data = vis.data[[
                        groupby_attr.attribute, color_attr.attribute, "Record"
                    ]]
                else:
                    vis._vis_data = (vis.data.groupby(
                        groupby_attr.attribute, dropna=False,
                        history=False).count().reset_index().rename(
                            columns={index_name: "Record"}))
                    vis._vis_data = vis.data[[
                        groupby_attr.attribute, "Record"
                    ]]
            else:
                # if color is specified, need to group by groupby_attr and color_attr
                if has_color:
                    groupby_result = vis.data.groupby(
                        [groupby_attr.attribute, color_attr.attribute],
                        dropna=False,
                        history=False)
                else:
                    groupby_result = vis.data.groupby(groupby_attr.attribute,
                                                      dropna=False,
                                                      history=False)
                groupby_result = groupby_result.agg(agg_func)
                intermediate = groupby_result.reset_index()
                vis._vis_data = intermediate.__finalize__(vis.data)
            result_vals = list(vis.data[groupby_attr.attribute])
            # record the existing group-by/color value combinations when color is
            # specified; this is needed to check which combinations of groupby_attr
            # and color_attr values have a non-zero number of elements
            if has_color:
                res_color_combi_vals = []
                result_color_vals = list(vis.data[color_attr.attribute])
                for i in range(0, len(result_vals)):
                    res_color_combi_vals.append(
                        [result_vals[i], result_color_vals[i]])
            # For filtered aggregations that are missing group-by attribute values,
            # set those aggregated values to 0, since there are no datapoints
            if (isFiltered or has_color) and attr_unique_vals:
                N_unique_vals = len(attr_unique_vals)
                if len(result_vals) != N_unique_vals * color_cardinality:
                    columns = vis.data.columns
                    if has_color:
                        df = pd.DataFrame({
                            columns[0]:
                            attr_unique_vals * color_cardinality,
                            columns[1]:
                            pd.Series(color_attr_vals).repeat(N_unique_vals),
                        })
                        vis._vis_data = vis.data.merge(
                            df,
                            on=[columns[0], columns[1]],
                            how="right",
                            suffixes=["", "_right"],
                        )
                        for col in columns[2:]:
                            vis.data[col] = vis.data[col].fillna(
                                0)  # Triggers __setitem__
                        assert len(
                            list(vis.data[groupby_attr.attribute])
                        ) == N_unique_vals * len(
                            color_attr_vals
                        ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute, color_attr.attribute}`."

                        # Keep only the three relevant columns, not the *_right columns produced by the merge
                        vis._vis_data = vis.data.iloc[:, :3]

                    else:
                        df = pd.DataFrame({columns[0]: attr_unique_vals})

                        vis._vis_data = vis.data.merge(df,
                                                       on=columns[0],
                                                       how="right",
                                                       suffixes=["", "_right"])

                        for col in columns[1:]:
                            vis.data[col] = vis.data[col].fillna(0)
                        assert (
                            len(list(vis.data[
                                groupby_attr.attribute])) == N_unique_vals
                        ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`."

            vis._vis_data = vis._vis_data.dropna(
                subset=[measure_attr.attribute])
            try:
                vis._vis_data = vis._vis_data.sort_values(
                    by=groupby_attr.attribute, ascending=True)
            except TypeError:
                warnings.warn(
                    f"\nLux detects that the attribute '{groupby_attr.attribute}' may contain mixed types."
                    f"\nTo visualize this attribute, you may want to convert '{groupby_attr.attribute}' to a uniform type as follows:"
                    f"\n\tdf['{groupby_attr.attribute}'] = df['{groupby_attr.attribute}'].astype(str)"
                )
                vis._vis_data[groupby_attr.attribute] = vis._vis_data[
                    groupby_attr.attribute].astype(str)
                vis._vis_data = vis._vis_data.sort_values(
                    by=groupby_attr.attribute, ascending=True)
            vis._vis_data = vis._vis_data.reset_index(drop=True)
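The try/except around the final sort guards against mixed-type group-by columns: sorting an object column that mixes, say, ints and strings raises TypeError in pandas, and the fallback casts the column to str before retrying, exactly as the warning suggests to the user. A minimal reproduction of the failure mode being handled, using toy data rather than anything from Lux:

import pandas as pd

df = pd.DataFrame({"year": [2000, "2001?", 1999], "count": [3, 1, 2]})
try:
    df = df.sort_values(by="year")
except TypeError:
    # int and str values are not mutually orderable; coerce to one type first.
    df["year"] = df["year"].astype(str)
    df = df.sort_values(by="year")
print(df)  # rows sorted lexicographically: "1999", "2000", "2001?"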