def test_only_nan_in_group(self):
    """A group whose B values are all NaN must aggregate to NaN/empty results."""
    table = Table(
        Domain([ContinuousVariable("A"), ContinuousVariable("B")]),
        np.array([[1, np.nan], [2, 1], [1, np.nan], [2, 1]]),
    )
    self.send_signal(self.widget.Inputs.data, table)

    # group by feature A
    self._set_selection(self.widget.gb_attrs_view, [0])

    # turn on every aggregation for feature B
    self.select_table_rows(self.widget.agg_table_view, [1])
    for checkbox in self.widget.agg_checkboxes.values():
        while not checkbox.isChecked():
            checkbox.click()

    # turn off every aggregation for feature A
    self.select_table_rows(self.widget.agg_table_view, [0])
    for checkbox in self.widget.agg_checkboxes.values():
        while checkbox.isChecked():
            checkbox.click()

    expected_columns = [
        "B - Mean",
        "B - Median",
        "B - Mode",
        "B - Standard deviation",
        "B - Variance",
        "B - Sum",
        "B - Min. value",
        "B - Max. value",
        "B - Span",
        "B - First value",
        "B - Last value",
        "B - Random value",
        "B - Count defined",
        "B - Count",
        "B - Proportion defined",
        "B - Concatenate",
        "A",
    ]
    n = np.nan
    # group A=1 has only NaNs in B; group A=2 has two defined values
    expected_df = pd.DataFrame(
        [
            [n, n, n, n, n, 0, n, n, n, n, n, n, 0, 2, 0, "", 1],
            [1, 1, 1, 0, 0, 2, 1, 1, 0, 1, 1, 1, 2, 2, 1, "1.0 1.0", 2],
        ],
        columns=expected_columns,
    )
    output_df = table_to_frame(
        self.get_output(self.widget.Outputs.data), include_metas=True
    )
    pd.testing.assert_frame_equal(
        output_df,
        expected_df,
        check_dtype=False,
        check_column_type=False,
        check_categorical=False,
    )
def test_only_nan_in_group(self):
    """A group whose B values are all NaN must aggregate to NaN/empty results.

    NOTE: the expected column names below are localized (Chinese) runtime
    strings produced by the widget under test — they are test data and must
    match the widget's labels exactly.
    """
    data = Table(
        Domain([ContinuousVariable("A"), ContinuousVariable("B")]),
        np.array([[1, np.nan], [2, 1], [1, np.nan], [2, 1]]),
    )
    self.send_signal(self.widget.Inputs.data, data)
    # select feature A as group-by
    self._set_selection(self.widget.gb_attrs_view, [0])
    # select all aggregations for feature B
    self.select_table_rows(self.widget.agg_table_view, [1])
    for cb in self.widget.agg_checkboxes.values():
        while not cb.isChecked():
            cb.click()
    # unselect all aggregations for attr A
    self.select_table_rows(self.widget.agg_table_view, [0])
    for cb in self.widget.agg_checkboxes.values():
        while cb.isChecked():
            cb.click()
    expected_columns = [
        "B - 平均值",
        "B - 中位数",
        "B - 取模",
        "B - 标准差",
        "B - 方差",
        "B - 和",
        "B - 最小值",
        "B - 最大值",
        "B - 跨度",
        "B - 首值",
        "B - 末值",
        "B - 随机值",
        "B - 非缺失数量",
        "B - 数量",
        "B - 非缺失占比",
        "B - 串接(Concatenate)",
        "A",
    ]
    n = np.nan
    # group A=1 has only NaNs in B; group A=2 has two defined values
    expected_df = pd.DataFrame(
        [
            [n, n, n, n, n, 0, n, n, n, n, n, n, 0, 2, 0, "", 1],
            [1, 1, 1, 0, 0, 2, 1, 1, 0, 1, 1, 1, 2, 2, 1, "1.0 1.0", 2],
        ],
        columns=expected_columns,
    )
    output_df = table_to_frame(
        self.get_output(self.widget.Outputs.data), include_metas=True
    )
    pd.testing.assert_frame_equal(
        output_df,
        expected_df,
        check_dtype=False,
        check_column_type=False,
        check_categorical=False,
    )
def __init__(self, table: Table, by: List[Variable]):
    """Wrap *table* in a pandas groupby over the variables in *by*.

    Args:
        table: Input data table to be grouped.
        by: Variables whose values define the groups.
    """
    self.table = table
    df = table_to_frame(table, include_metas=True)
    # observed=True keeps only groups with at least one instance
    self.group_by = df.groupby([a.name for a in by], observed=True)
    # lru_cache bound per instance so results are cached on the object level
    # (decorating the method directly would share one cache across instances
    # and keep them alive for the cache's lifetime)
    self.compute_aggregation = lru_cache()(self._compute_aggregation)
def test_aggregation(self):
    """Aggregate cvar/dvar/svar grouped by (a, b) and compare to pandas."""
    domain = self.data.domain
    grouped = self.data.groupby([self.data.domain["a"], self.data.domain["b"]])
    aggregations = {
        domain["cvar"]: [
            ("Mean", "mean"),
            ("Median", "median"),
            ("Mean1", np.mean),
        ],
        domain["dvar"]: [("Count defined", "count"), ("Count", "size")],
        domain["svar"]: [("Concatenate", "".join)],
    }
    output = grouped.aggregate(aggregations)

    expected_columns = [
        "cvar - Mean",
        "cvar - Median",
        "cvar - Mean1",
        "dvar - Count defined",
        "dvar - Count",
        "svar - Concatenate",
        "a",  # groupby variables are last two in metas
        "b",
    ]
    expected_df = pd.DataFrame(
        [
            [0.15, 0.15, 0.15, 2, 2, "sval1sval2", 1, 1],
            [0.3, 0.3, 0.3, 1, 2, "sval2", 1, 2],
            [0.433, 0.4, 0.433, 3, 3, "sval1sval2sval1", 1, 3],
            [1.5, 1.5, 1.5, 2, 2, "sval2sval1", 2, 1],
            [-0.5, -0.5, -0.5, 2, 2, "sval2sval1", 2, 2],
            [5, 5, 5, 2, 2, "sval2sval1", 2, 3],
        ],
        columns=expected_columns,
    )
    output_df = table_to_frame(output, include_metas=True)
    pd.testing.assert_frame_equal(
        output_df,
        expected_df,
        check_dtype=False,
        check_column_type=False,
        check_categorical=False,
        atol=1e-3,
    )
def test_aggregation(self):
    """Check default aggregations, then enable all aggregations for
    cvar/dvar/svar (rows 2-4) and none for a/b, and compare the full
    output table against the expected data frame."""
    self.send_signal(self.widget.Inputs.data, self.data)
    output = self.get_output(self.widget.Outputs.data)
    # default output: mean-style aggregations on X ...
    np.testing.assert_array_almost_equal(
        output.X, [[1, 2.143, 0.317, 0], [2, 2, 2, 0]], decimal=3)
    # ... and concatenated string values plus the group key in metas
    np.testing.assert_array_equal(
        output.metas,
        np.array(
            [
                [
                    "sval1 sval2 sval2 sval1 sval2 sval1",
                    1.0,
                ],
                [
                    "sval2 sval1 sval2 sval1 sval2 sval1",
                    2.0,
                ],
            ],
            dtype=object,
        ),
    )
    # select all aggregations for all features except a and b
    self._set_selection(self.widget.gb_attrs_view, [1, 2])
    self.select_table_rows(self.widget.agg_table_view, [2, 3, 4])
    # select all aggregations; the extra while-loop re-clicks a checkbox
    # the first click may have toggled off
    for cb in self.widget.agg_checkboxes.values():
        cb.click()
        while not cb.isChecked():
            cb.click()
    self.select_table_rows(self.widget.agg_table_view, [0, 1])
    # unselect all aggregations for attr a and b
    for cb in self.widget.agg_checkboxes.values():
        while cb.isChecked():
            cb.click()
    expected_columns = [
        "cvar - Mean",
        "cvar - Median",
        "cvar - Mode",
        "cvar - Standard deviation",
        "cvar - Variance",
        "cvar - Sum",
        "cvar - Min. value",
        "cvar - Max. value",
        "cvar - Span",
        "cvar - First value",
        "cvar - Last value",
        "cvar - Count defined",
        "cvar - Count",
        "cvar - Proportion defined",
        "dvar - Mode",
        "dvar - First value",
        "dvar - Last value",
        "dvar - Count defined",
        "dvar - Count",
        "dvar - Proportion defined",
        "svar - First value",
        "svar - Last value",
        "svar - Count defined",
        "svar - Count",
        "svar - Proportion defined",
        "cvar - Concatenate",
        "dvar - Concatenate",
        "svar - Concatenate",
        "a",  # groupby variables are last two in metas
        "b",
    ]
    # one row per (a, b) group; column order matches expected_columns
    # fmt: off
    expected_df = pd.DataFrame(
        [[
            .15, .15, .1, .07, .005, .3, .1, .2, .1, 0.1, 0.2, 2, 2, 1,
            "val1", "val1", "val2", 2, 2, 1,
            "sval1", "sval2", 2, 2, 1,
            "0.1 0.2", "val1 val2", "sval1 sval2", 1, 1
        ], [
            .3, .3, .3, np.nan, np.nan, .3, .3, .3, 0, .3, .3, 1, 2, 0.5,
            "val2", "val2", "val2", 1, 2, 0.5,
            "", "sval2", 2, 2, 1,
            "0.3", "val2", "sval2", 1, 2
        ], [
            .433, .4, .3, 0.153, 0.023, 1.3, .3, .6, .3, .3, .6, 3, 3, 1,
            "val1", "val1", "val1", 3, 3, 1,
            "sval1", "sval1", 3, 3, 1,
            "0.3 0.4 0.6", "val1 val2 val1", "sval1 sval2 sval1", 1, 3
        ], [
            1.5, 1.5, 1, 0.707, 0.5, 3, 1, 2, 1, 1, 2, 2, 2, 1,
            "val1", "val2", "val1", 2, 2, 1,
            "sval2", "sval1", 2, 2, 1,
            "1.0 2.0", "val2 val1", "sval2 sval1", 2, 1
        ], [
            -0.5, -0.5, -4, 4.95, 24.5, -1, -4, 3, 7, 3, -4, 2, 2, 1,
            "val1", "val2", "val1", 2, 2, 1,
            "sval2", "sval1", 2, 2, 1,
            "3.0 -4.0", "val2 val1", "sval2 sval1", 2, 2
        ], [
            5, 5, 5, 0, 0, 10, 5, 5, 0, 5, 5, 2, 2, 1,
            "val1", "val2", "val1", 2, 2, 1,
            "sval2", "sval1", 2, 2, 1,
            "5.0 5.0", "val2 val1", "sval2 sval1", 2, 3
        ]],
        columns=expected_columns)
    # fmt: on
    output_df = table_to_frame(
        self.get_output(self.widget.Outputs.data), include_metas=True)
    # remove random since it is not possible to test
    output_df = output_df.loc[:, ~output_df.columns.str.endswith("Random value")]
    pd.testing.assert_frame_equal(
        output_df,
        expected_df,
        check_dtype=False,
        check_column_type=False,
        check_categorical=False,
        atol=1e-3,
    )