Example #1
0
    def execute_binning(view: Vis, ldf: LuxDataFrame):
        """Compute histogram data for the binned attribute of `view` via SQL.

        Bin edges are computed client-side from the attribute's min/max, then
        passed to PostgreSQL's `width_bucket` to count rows per bucket. The
        result (bin centers vs. counts) is stored on `view.data`.

        Parameters
        ----------
        view : lux.Vis
            Visualization whose intent contains exactly one binned clause.
        ldf : lux.luxDataFrame.LuxDataFrame
            Source frame providing `unique_values`, `table_name`, and the
            live `SQLconnection`.

        Returns
        -------
        None
        """
        import math
        import numpy as np
        import pandas as pd
        bin_attribute = list(
            filter(lambda x: x.bin_size != 0, view._inferred_intent))[0]
        num_bins = bin_attribute.bin_size
        attr_min = min(ldf.unique_values[bin_attribute.attribute])
        attr_max = max(ldf.unique_values[bin_attribute.attribute])
        attr_type = type(ldf.unique_values[bin_attribute.attribute][0])

        # Need to calculate the bin edges before querying for the relevant
        # data; width_bucket takes the N-1 interior edges as an array literal.
        bin_width = (attr_max - attr_min) / num_bins
        upper_edges = []
        for e in range(1, num_bins):
            curr_edge = attr_min + e * bin_width
            # Integer attributes get integer-aligned edges.
            if attr_type == int:
                upper_edges.append(str(math.ceil(curr_edge)))
            else:
                upper_edges.append(str(curr_edge))
        upper_edges = ",".join(upper_edges)
        view_filter, filter_vars = SQLExecutor.execute_filter(view)
        bin_count_query = "SELECT width_bucket, COUNT(width_bucket) FROM (SELECT width_bucket({}, '{}') FROM {}) as Buckets GROUP BY width_bucket ORDER BY width_bucket".format(
            bin_attribute.attribute, '{' + upper_edges + '}', ldf.table_name)
        bin_count_data = pd.read_sql(bin_count_query, ldf.SQLconnection)

        # N-1 interior edges -> N buckets; compute each bucket's center so the
        # histogram can be plotted on a numeric axis.
        upper_edges = [float(i) for i in upper_edges.split(",")]
        if attr_type == int:
            bin_centers = np.array(
                [math.ceil((attr_min + attr_min + bin_width) / 2)])
        else:
            bin_centers = np.array([(attr_min + attr_min + bin_width) / 2])
        bin_centers = np.append(
            bin_centers,
            np.mean(np.vstack([upper_edges[0:-1], upper_edges[1:]]), axis=0))
        if attr_type == int:
            bin_centers = np.append(
                bin_centers, math.ceil((upper_edges[-1] + attr_max) / 2))
        else:
            bin_centers = np.append(bin_centers,
                                    (upper_edges[-1] + attr_max) / 2)

        # Pad buckets that matched no rows with zero counts so every bin
        # center has a corresponding row.
        if len(bin_centers) > len(bin_count_data):
            bucket_labels = bin_count_data['width_bucket'].unique()
            missing_rows = [[i, 0] for i in range(0, len(bin_centers))
                            if i not in bucket_labels]
            if missing_rows:
                # DataFrame.append was removed in pandas 2.0; use pd.concat.
                bin_count_data = pd.concat(
                    [
                        bin_count_data,
                        pd.DataFrame(missing_rows,
                                     columns=bin_count_data.columns)
                    ],
                    ignore_index=True)
                # Re-sort so padded rows line up with bin_centers positionally
                # (appended rows would otherwise land at the end).
                bin_count_data = bin_count_data.sort_values(by="width_bucket")

        view.data = pd.DataFrame(
            np.array([bin_centers, list(bin_count_data['count'])]).T,
            columns=[bin_attribute.attribute, "Number of Records"])
        view.data = utils.pandas_to_lux(view.data)
Example #2
0
    def execute_aggregate(view: Vis, ldf: LuxDataFrame):
        """Aggregate data for a bar/line chart by pushing a GROUP BY to SQL.

        Determines which axis is the measure (the one carrying an
        aggregation) and which is the group-by attribute, issues the
        corresponding SQL query, and stores the result on `view.data`.
        Missing group-by values (e.g. filtered out) are padded with 0.

        Parameters
        ----------
        view : lux.Vis
            Visualization whose x/y channels define the aggregation.
        ldf : lux.luxDataFrame.LuxDataFrame
            Source frame providing `unique_values`, `table_name`, and the
            live `SQLconnection`.

        Returns
        -------
        None
        """
        import pandas as pd
        x_attr = view.get_attr_by_channel("x")[0]
        y_attr = view.get_attr_by_channel("y")[0]
        groupby_attr = ""
        measure_attr = ""
        # Whichever axis carries an aggregation is the measure; the other
        # axis is the group-by attribute.
        if (y_attr.aggregation != ""):
            groupby_attr = x_attr
            measure_attr = y_attr
            agg_func = y_attr.aggregation
        if (x_attr.aggregation != ""):
            groupby_attr = y_attr
            measure_attr = x_attr
            agg_func = x_attr.aggregation

        if (measure_attr != ""):
            # Both branches need the WHERE clause, so build it once.
            where_clause, filter_vars = SQLExecutor.execute_filter(view)
            if (measure_attr.attribute == "Record"):
                # Bar-chart case: count rows for each group.
                count_query = "SELECT {}, COUNT({}) FROM {} {} GROUP BY {}".format(
                    groupby_attr.attribute, groupby_attr.attribute,
                    ldf.table_name, where_clause, groupby_attr.attribute)
                view.data = pd.read_sql(count_query, ldf.SQLconnection)
                view.data = view.data.rename(columns={"count": "Record"})
                view.data = utils.pandas_to_lux(view.data)
            else:
                # Map lux aggregation names to SQL functions instead of
                # duplicating the query construction per function.
                sql_agg = {"mean": "AVG", "sum": "SUM", "max": "MAX"}
                if agg_func in sql_agg:
                    agg_query = "SELECT {}, {}({}) as {} FROM {} {} GROUP BY {}".format(
                        groupby_attr.attribute, sql_agg[agg_func],
                        measure_attr.attribute, measure_attr.attribute,
                        ldf.table_name, where_clause, groupby_attr.attribute)
                    view.data = pd.read_sql(agg_query, ldf.SQLconnection)
                    view.data = utils.pandas_to_lux(view.data)

            # Pad empty categories with 0 counts after filter is applied.
            all_attr_vals = ldf.unique_values[groupby_attr.attribute]
            result_vals = list(view.data[groupby_attr.attribute])
            if (len(result_vals) != len(all_attr_vals)):
                # For filtered aggregations that are missing group-by values,
                # set the aggregated value to 0 since there are no datapoints.
                for vals in all_attr_vals:
                    if (vals not in result_vals):
                        view.data.loc[len(view.data)] = [
                            vals
                        ] + [0] * (len(view.data.columns) - 1)
Example #3
0
    def execute_binning(view: Vis):
        '''
        Binning of data points for generating histograms

        Parameters
        ----------
        view: lux.Vis
            lux.Vis object that represents a visualization
        ldf : lux.luxDataFrame.LuxDataFrame
            LuxDataFrame with specified intent.

        Returns
        -------
        None
        '''
        import numpy as np
        import pandas as pd  # is this import going to be conflicting with LuxDf?
        # Locate the single clause in the intent that requests binning.
        bin_attribute = [
            clause for clause in view._inferred_intent if clause.bin_size != 0
        ][0]
        #TODO:binning runs for name attribte. Name attribute has datatype quantitative which is wrong.
        series = view.data[bin_attribute.attribute]
        counts, bin_edges = np.histogram(series, bins=bin_attribute.bin_size)
        # histogram yields N+1 edges for N bins; midpoints of consecutive
        # edges give the bin centers used as the plotted x positions.
        bin_center = (bin_edges[0:-1] + bin_edges[1:]) / 2
        # TODO: Should view.data be a LuxDataFrame or a Pandas DataFrame?
        view.data = pd.DataFrame(
            np.array([bin_center, counts]).T,
            columns=[bin_attribute.attribute, "Number of Records"])
Example #4
0
    def execute_filter(view: Vis):
        """Apply every filter clause in the view's intent to `view.data`.

        Parameters
        ----------
        view : lux.Vis
            Visualization whose `data` is filtered in place.

        Returns
        -------
        bool
            True if at least one filter was applied, False otherwise.
        """
        assert view.data is not None, "execute_filter assumes input view.data is populated (if not, populate with LuxDataFrame values)"
        filters = utils.get_filter_specs(view._inferred_intent)
        if not filters:
            return False
        # TODO: Need to handle OR logic
        # `filter_spec` instead of `filter` so the builtin is not shadowed.
        for filter_spec in filters:
            view.data = PandasExecutor.apply_filter(view.data,
                                                    filter_spec.attribute,
                                                    filter_spec.filter_op,
                                                    filter_spec.value)
        return True
Example #5
0
def test_vis_private_properties():
    """Vis exposes data/code/min_max/mark as read-only properties."""
    from lux.vis.Vis import Vis
    df = pd.read_csv("lux/data/car.csv")
    vis = Vis(["Horsepower", "Weight"], df)
    vis._repr_html_()

    # Each property has the expected type/value after rendering.
    assert isinstance(vis.data, lux.core.frame.LuxDataFrame)
    assert isinstance(vis.code, dict)
    assert isinstance(vis.min_max, dict)
    assert vis.mark == "scatter"

    # None of them may be reassigned.
    for prop in ("data", "code", "min_max", "mark"):
        with pytest.raises(AttributeError, match="can't set attribute"):
            setattr(vis, prop, "some val")
Example #6
0
def test_vis_private_properties(global_var):
    """Vis exposes data/code/min_max/mark as read-only properties."""
    from lux.vis.Vis import Vis

    df = pytest.car_df
    vis = Vis(["Horsepower", "Weight"], df)
    vis._ipython_display_()

    # (property name, validator for its value) pairs; every property must
    # hold the expected value and reject assignment.
    checks = [
        ("data", lambda v: isinstance(v, lux.core.frame.LuxDataFrame)),
        ("code", lambda v: isinstance(v, dict)),
        ("min_max", lambda v: isinstance(v, dict)),
        ("mark", lambda v: v == "scatter"),
    ]
    for attr, is_valid in checks:
        assert is_valid(getattr(vis, attr))
        with pytest.raises(AttributeError, match="can't set attribute"):
            setattr(vis, attr, "some val")
Example #7
0
    def execute_aggregate(view: Vis, isFiltered=True):
        '''
        Aggregate data points on an axis for bar or line charts.

        Groups `view.data` by the non-aggregated axis (and the color channel
        if present), applies the aggregation function, and pads group/color
        combinations that have no rows with zeros so the chart axis stays
        complete.

        Parameters
        ----------
        view: lux.Vis
            lux.Vis object that represents a visualization; its `data` is
            replaced with the aggregated result.
        isFiltered: bool
            Whether the view has a filter applied; zero-padding of missing
            groups only happens for filtered (or colored) views.

        Returns
        -------
        None
        '''
        import pandas as pd

        x_attr = view.get_attr_by_channel("x")[0]
        y_attr = view.get_attr_by_channel("y")[0]
        has_color = False
        groupby_attr = ""
        measure_attr = ""
        if (x_attr.aggregation is None or y_attr.aggregation is None):
            return
        # Whichever axis carries an aggregation is the measure; the other
        # axis is the group-by attribute.
        if (y_attr.aggregation != ""):
            groupby_attr = x_attr
            measure_attr = y_attr
            agg_func = y_attr.aggregation
        if (x_attr.aggregation != ""):
            groupby_attr = y_attr
            measure_attr = x_attr
            agg_func = x_attr.aggregation
        # Checks if color is specified in the Vis.
        if len(view.get_attr_by_channel("color")) == 1:
            color_attr = view.get_attr_by_channel("color")[0]
            color_attr_vals = view.data.unique_values[color_attr.attribute]
            color_cardinality = len(color_attr_vals)
            # NOTE: might want to have a check somewhere to not use categorical
            # variables with too many categories as a Color variable.
            has_color = True
        else:
            color_cardinality = 1
        all_attr_vals = view.data.unique_values[groupby_attr.attribute]
        if (measure_attr != ""):
            if (measure_attr.attribute == "Record"):
                view.data = view.data.reset_index()
                # If color is specified, need to group by both groupby_attr
                # and color_attr.
                if has_color:
                    view.data = view.data.groupby(
                        [groupby_attr.attribute,
                         color_attr.attribute]).count().reset_index()
                    view.data = view.data.rename(columns={"index": "Record"})
                    view.data = view.data[[
                        groupby_attr.attribute, color_attr.attribute, "Record"
                    ]]
                else:
                    view.data = view.data.groupby(
                        groupby_attr.attribute).count().reset_index()
                    view.data = view.data.rename(columns={"index": "Record"})
                    view.data = view.data[[groupby_attr.attribute, "Record"]]
            else:
                # If color is specified, need to group by both groupby_attr
                # and color_attr.
                if has_color:
                    groupby_result = view.data.groupby(
                        [groupby_attr.attribute, color_attr.attribute])
                else:
                    groupby_result = view.data.groupby(groupby_attr.attribute)
                view.data = groupby_result.agg(agg_func).reset_index()
            result_vals = list(view.data[groupby_attr.attribute])
            if (len(result_vals) != len(all_attr_vals) * color_cardinality
                    and (isFiltered or has_color)):
                # Pad missing group (and color) combinations with 0 by
                # right-merging against the full set of expected values;
                # this replaces the old O(n*m) per-value append loop.
                columns = view.data.columns
                if has_color:
                    df = pd.DataFrame({
                        columns[0]:
                        all_attr_vals * color_cardinality,
                        columns[1]:
                        pd.Series(color_attr_vals).repeat(len(all_attr_vals))
                    })
                    view.data = view.data.merge(df,
                                                on=[columns[0], columns[1]],
                                                how='right',
                                                suffixes=['', '_right'])
                    for col in columns[2:]:
                        view.data[col] = view.data[col].fillna(0)
                    assert len(
                        list(view.data[groupby_attr.attribute])
                    ) == len(all_attr_vals) * len(
                        color_attr_vals
                    ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute, color_attr.attribute}`."
                else:
                    df = pd.DataFrame({columns[0]: all_attr_vals})

                    view.data = view.data.merge(df,
                                                on=columns[0],
                                                how='right',
                                                suffixes=['', '_right'])

                    for col in columns[1:]:
                        view.data[col] = view.data[col].fillna(0)
                    assert len(list(view.data[groupby_attr.attribute])) == len(
                        all_attr_vals
                    ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`."
            view.data = view.data.sort_values(by=groupby_attr.attribute,
                                              ascending=True)
            view.data = view.data.reset_index()
            view.data = view.data.drop(columns="index")