def test_select_to(self):
    """Check `gr.columns_to` with label, symbolic and positional stop columns.

    The expected frame holds only `carat` and `cut`, so the stop column
    (`color`, position 2) is itself not part of the selection.
    """
    expected = data.df_diamonds[["carat", "cut"]]
    # Three spellings of the same stop column must give the same selection.
    for stop in ("color", X.color, 2):
        selected = data.df_diamonds >> gr.tf_select(gr.columns_to(stop))
        self.assertTrue(expected.equals(selected))
def test_select_from(self):
    """Check `gr.columns_from` with label, symbolic and positional start columns."""
    expected = data.df_diamonds[["x", "y", "z"]]
    # Three spellings of the same start column must give the same selection.
    for start in ("x", X.x, 7):
        selected = data.df_diamonds >> gr.tf_select(gr.columns_from(start))
        self.assertTrue(expected.equals(selected))

    # A start position of 100 is expected to select no columns at all.
    no_columns = data.df_diamonds[[]]
    selected = data.df_diamonds >> gr.tf_select(gr.columns_from(100))
    self.assertTrue(no_columns.equals(selected))
def test_var(self):
    """Exercise `gr.var` through summarize and mutate on a diamonds subset.

    Numeric results are compared with hard-coded reference values using an
    absolute tolerance of 1e-5; the one-row-per-group case must be all NaN.
    """
    tol = 0.00001
    cut_labels = ["Fair", "Good", "Ideal", "Premium", "Very Good"]

    # First three rows of each cut, reduced to `cut` and `x`, grouping removed.
    df_small = (
        data.df_diamonds
        >> gr.tf_group_by(X.cut)
        >> gr.tf_head(3)
        >> gr.tf_select(X.cut, X.x)
        >> gr.tf_ungroup()
    )

    # Ungrouped summarize: a single value for the whole frame.
    result = df_small >> gr.tf_summarize(v=gr.var(X.x))
    reference = pd.DataFrame({"v": [0.687392]})
    deviation = abs(result.v - reference.v)
    self.assertTrue(all(deviation < tol))

    # Grouped summarize: one value per cut.
    result = (
        df_small
        >> gr.tf_group_by(X.cut)
        >> gr.tf_summarize(v=gr.var(X.x))
    )
    reference = pd.DataFrame(
        {
            "cut": cut_labels,
            "v": [2.074800, 0.022033, 0.056133, 0.033100, 0.005233],
        }
    )
    deviation = abs(result.v - reference.v)
    self.assertTrue(all(deviation < tol))

    # Ungrouped mutate: the same scalar is expected on every row.
    result = df_small >> gr.tf_mutate(v=gr.var(X.x))
    reference = df_small.copy()
    reference["v"] = 0.687392
    deviation = abs(result.v - reference.v)
    self.assertTrue(all(deviation < tol))

    # TODO: the grouped-mutate case is not exercised here. The intended
    # expectation repeats each per-cut reference value above on the three
    # rows of that cut.

    # One row per cut: the expected result is NaN for every group.
    df_single = (
        data.df_diamonds
        >> gr.tf_group_by(X.cut)
        >> gr.tf_head(1)
        >> gr.tf_select(X.cut, X.x)
    )
    result = (
        df_single
        >> gr.tf_group_by(X.cut)
        >> gr.tf_summarize(v=gr.var(X.x))
    )
    reference = pd.DataFrame(
        {
            "cut": cut_labels,
            "v": [np.nan, np.nan, np.nan, np.nan, np.nan],
        }
    )
    self.assertTrue(result.equals(reference))
def test_sd(self):
    """Exercise `gr.sd` through summarize and mutate on a diamonds subset.

    Numeric results are compared with hard-coded reference values using an
    absolute tolerance of 1e-5; the one-row-per-group case must be all NaN.
    """
    tol = 0.00001
    # Reference standard deviation of `x` for each cut (first 3 rows per cut).
    sd_by_cut = {
        "Fair": 1.440417,
        "Good": 0.148436,
        "Ideal": 0.236925,
        "Premium": 0.181934,
        "Very Good": 0.072342,
    }

    df = (
        data.df_diamonds
        >> gr.tf_group_by(X.cut)
        >> gr.tf_head(3)
        >> gr.tf_select(X.cut, X.x)
        >> gr.tf_ungroup()
    )

    # straight summarize
    t = df >> gr.tf_summarize(s=gr.sd(X.x))
    df_truth = pd.DataFrame({"s": [0.829091]})
    test_vector = abs(t.s - df_truth.s)
    self.assertTrue(all(test_vector < tol))

    # grouped summarize
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(s=gr.sd(X.x))
    df_truth = pd.DataFrame(
        {
            "cut": list(sd_by_cut.keys()),
            "s": list(sd_by_cut.values()),
        }
    )
    test_vector = abs(t.s - df_truth.s)
    self.assertTrue(all(test_vector < tol))

    # straight mutate
    t = df >> gr.tf_mutate(s=gr.sd(X.x))
    df_truth = df.copy()
    df_truth["s"] = 0.829091
    test_vector = abs(t.s - df_truth.s)
    self.assertTrue(all(test_vector < tol))

    # grouped mutate: every row should carry the reference value of its cut.
    # The previous version re-asserted the stale `test_vector` from the
    # straight-mutate case, so this result was never actually checked.
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(s=gr.sd(X.x))
    # Derive the expectation from t's own `cut` column so it shares t's index
    # and row order, whatever order the grouped mutate returns the rows in.
    expected_s = t.cut.map(sd_by_cut).astype(float)
    test_vector = abs(t.s - expected_s)
    self.assertTrue(all(test_vector < tol))

    # test with single value (sd undefined)
    df = (
        data.df_diamonds
        >> gr.tf_group_by(X.cut)
        >> gr.tf_head(1)
        >> gr.tf_select(X.cut, X.x)
    )
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(s=gr.sd(X.x))
    df_truth = pd.DataFrame(
        {
            "cut": ["Fair", "Good", "Ideal", "Premium", "Very Good"],
            "s": [np.nan, np.nan, np.nan, np.nan, np.nan],
        }
    )
    self.assertTrue(t.equals(df_truth))
def select_through(self):
    """Check `gr.columns_to(..., inclusive=True)` for three stop-column spellings.

    With `inclusive=True` the expected frame also contains the stop column
    (`carat`, `cut`, `color`).

    NOTE(review): the method name has no `test_` prefix, so a default
    unittest/pytest collector would presumably skip it -- confirm whether
    that is intentional before renaming.
    """
    expected = data.df_diamonds[["carat", "cut", "color"]]
    for stop in ("color", X.color, 2):
        selector = gr.columns_to(stop, inclusive=True)
        selected = data.df_diamonds >> gr.tf_select(selector)
        self.assertTrue(expected.equals(selected))
def test_median(self):
    """Exercise `gr.median` through summarize and mutate on a diamonds subset.

    The odd-count cases are compared exactly with `DataFrame.equals`; the
    even-count case is compared with an absolute tolerance of 1e-9.
    """
    cut_labels = ["Fair", "Good", "Ideal", "Premium", "Very Good"]

    # First three rows of each cut, reduced to `cut` and `x`, grouping removed.
    df_odd = (
        data.df_diamonds
        >> gr.tf_group_by(X.cut)
        >> gr.tf_head(3)
        >> gr.tf_select(X.cut, X.x)
        >> gr.tf_ungroup()
    )

    # Ungrouped summarize: a single value for the whole frame.
    result = df_odd >> gr.tf_summarize(m=gr.median(X.x))
    reference = pd.DataFrame({"m": [4.05]})
    self.assertTrue(result.equals(reference))

    # Grouped summarize: one value per cut.
    result = (
        df_odd
        >> gr.tf_group_by(X.cut)
        >> gr.tf_summarize(m=gr.median(X.x))
    )
    reference = pd.DataFrame(
        {
            "cut": cut_labels,
            "m": [6.27, 4.25, 3.95, 3.89, 3.95],
        }
    )
    self.assertTrue(result.equals(reference))

    # Ungrouped mutate: the same scalar is expected on every row.
    result = df_odd >> gr.tf_mutate(m=gr.median(X.x))
    reference = df_odd.copy()
    reference["m"] = 4.05
    self.assertTrue(result.equals(reference))

    # TODO: the grouped-mutate case is not exercised here. The intended
    # expectation repeats each per-cut reference value above on the three
    # rows of that cut.

    # Two rows per cut: make sure even counts are handled properly.
    df_even = (
        data.df_diamonds
        >> gr.tf_group_by(X.cut)
        >> gr.tf_head(2)
        >> gr.tf_select(X.cut, X.x)
    )
    result = (
        df_even
        >> gr.tf_group_by(X.cut)
        >> gr.tf_summarize(m=gr.median(X.x))
    )
    reference = pd.DataFrame(
        {
            "cut": cut_labels,
            "m": [5.160, 4.195, 3.940, 4.045, 3.945],
        }
    )
    deviation = abs(result.m - reference.m)
    self.assertTrue(all(deviation < 0.000000001))
def test_select_between(self):
    """Check `gr.columns_between` with symbolic, label and positional bounds.

    Every check goes through `self.assertTrue`; the final one used a bare
    `assert`, which is removed when Python runs with `-O`.
    """
    df = data.df_diamonds[["cut", "color", "clarity"]]
    self.assertTrue(
        df.equals(
            data.df_diamonds
            >> gr.tf_select(gr.columns_between(X.cut, X.clarity))
        )
    )
    self.assertTrue(
        df.equals(
            data.df_diamonds
            >> gr.tf_select(gr.columns_between("cut", "clarity"))
        )
    )
    self.assertTrue(
        df.equals(
            data.df_diamonds >> gr.tf_select(gr.columns_between(1, 3))
        )
    )

    # An upper bound of 20 is expected to select through the last column.
    df = data.df_diamonds[["x", "y", "z"]]
    self.assertTrue(
        df.equals(
            data.df_diamonds >> gr.tf_select(gr.columns_between("x", 20))
        )
    )
def test_n(self):
    """Exercise `gr.n` with an explicit column and in implicit (no-argument) mode."""
    df = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(5)
    # Expected group size on each of the five rows. The earlier literal had a
    # sixth value that index alignment silently discarded.
    per_row_counts = [1, 2, 2, 2, 2]

    # straight summarize
    t = df >> gr.tf_summarize(n=gr.n(X.x))
    df_truth = pd.DataFrame({"n": [5]})
    self.assertTrue(t.equals(df_truth))

    # grouped summarize
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(n=gr.n(X.x))
    df_truth = pd.DataFrame({
        "cut": ["Good", "Ideal", "Premium"],
        "n": [2, 1, 2]
    })
    self.assertTrue(t.equals(df_truth))

    # straight mutate
    t = df >> gr.tf_mutate(n=gr.n(X.x))
    df_truth = df.copy()
    df_truth["n"] = 5
    self.assertTrue(t.equals(df_truth))

    # grouped mutate; the result is re-sorted by index before comparing
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(n=gr.n(X.x))
    df_truth["n"] = pd.Series(per_row_counts)
    self.assertTrue(t.sort_index().equals(df_truth))

    # Implicit mode summarize
    t = df >> gr.tf_summarize(n=gr.n())
    df_truth = pd.DataFrame({"n": [5]})
    self.assertTrue(t.equals(df_truth))

    # Implicit mode mutate
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(n=gr.n())
    df_truth = df.copy()
    df_truth["n"] = pd.Series(per_row_counts)
    self.assertTrue(t.sort_index().equals(df_truth))
def test_first(self):
    """Exercise `gr.first` through summarize and mutate, including `order_by`."""
    df = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(5)

    # straight summarize
    t = df >> gr.tf_summarize(f=gr.first(X.x))
    df_truth = pd.DataFrame({"f": [3.95]})
    self.assertTrue(t.equals(df_truth))

    # grouped summarize
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(f=gr.first(X.x))
    df_truth = pd.DataFrame({
        "cut": ["Good", "Ideal", "Premium"],
        "f": [4.05, 3.95, 3.89]
    })
    self.assertTrue(t.equals(df_truth))

    # summarize with order_by
    # This case previously built `t` and `df_truth` without comparing them.
    # NOTE(review): 3.89 is also the first Premium value in the grouped case
    # above; if several rows tie on `cut`, this expectation presumably relies
    # on ties keeping their original row order -- confirm.
    t = df >> gr.tf_summarize(f=gr.first(X.x, order_by=gr.desc(X.cut)))
    df_truth = pd.DataFrame({"f": [3.89]})
    self.assertTrue(t.equals(df_truth))

    # straight mutate
    t = df >> gr.tf_mutate(f=gr.first(X.x))
    df_truth = df.copy()
    df_truth["f"] = df_truth.x.iloc[0]
    self.assertTrue(t.equals(df_truth))

    # grouped mutate; the result is re-sorted by index before comparing
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(f=gr.first(X.x))
    df_truth["f"] = pd.Series([3.95, 3.89, 4.05, 3.89, 4.05])
    self.assertTrue(t.sort_index().equals(df_truth))
def test_last(self):
    """Exercise `gr.last` through summarize and mutate, including `order_by`.

    Every check goes through `self.assertTrue`; the order_by case used a bare
    `assert`, which is removed when Python runs with `-O`.
    """
    df = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(5)

    # straight summarize
    t = df >> gr.tf_summarize(l=gr.last(X.x))
    df_truth = pd.DataFrame({"l": [4.34]})
    self.assertTrue(t.equals(df_truth))

    # grouped summarize
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(l=gr.last(X.x))
    df_truth = pd.DataFrame({
        "cut": ["Good", "Ideal", "Premium"],
        "l": [4.34, 3.95, 4.20]
    })
    self.assertTrue(t.equals(df_truth))

    # summarize with order_by (two descending sort keys)
    t = df >> gr.tf_summarize(
        f=gr.last(X.x, order_by=[gr.desc(X.cut), gr.desc(X.x)])
    )
    df_truth = pd.DataFrame({"f": [4.05]})
    self.assertTrue(df_truth.equals(t))

    # straight mutate
    t = df >> gr.tf_mutate(l=gr.last(X.x))
    df_truth = df.copy()
    df_truth["l"] = df_truth.x.iloc[4]
    self.assertTrue(t.equals(df_truth))

    # grouped mutate; the result is re-sorted by index before comparing
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(l=gr.last(X.x))
    df_truth["l"] = pd.Series([3.95, 4.20, 4.34, 4.20, 4.34])
    self.assertTrue(t.sort_index().equals(df_truth))
def test_nth(self):
    """Exercise `gr.nth` with positive, negative and out-of-range positions."""
    df_head = (
        data.df_diamonds
        >> gr.tf_select(X.cut, X.x)
        >> gr.tf_head(10)
    )

    # Ungrouped summarize: position 1 is the second value.
    result = df_head >> gr.tf_summarize(second=gr.nth(X.x, 1))
    reference = pd.DataFrame({"second": [3.89]})
    self.assertTrue(result.equals(reference))

    # Grouped summarize: position 0 within each cut.
    result = (
        df_head
        >> gr.tf_group_by(X.cut)
        >> gr.tf_summarize(first=gr.nth(X.x, 0))
    )
    reference = pd.DataFrame({
        "cut": ["Fair", "Good", "Ideal", "Premium", "Very Good"],
        "first": [3.87, 4.05, 3.95, 3.89, 3.94],
    })
    self.assertTrue(result.equals(reference))

    # Summarize with order_by: position -1 after two descending sort keys.
    ordering = [gr.desc(X.cut), gr.desc(X.x)]
    result = df_head >> gr.tf_summarize(
        last=gr.nth(X.x, -1, order_by=ordering)
    )
    reference = pd.DataFrame({"last": [3.87]})
    self.assertTrue(reference.equals(result))

    # Ungrouped mutate: position 500 on a 10-row frame is expected to be NaN.
    result = df_head >> gr.tf_mutate(out_of_range=gr.nth(X.x, 500))
    reference = df_head.copy()
    reference["out_of_range"] = np.nan
    self.assertTrue(result.equals(reference))

    # Grouped mutate: position -2 within each cut; NaN is expected on two rows.
    # The result is re-sorted by index before comparing.
    result = (
        df_head
        >> gr.tf_group_by(X.cut)
        >> gr.tf_mutate(penultimate=gr.nth(X.x, -2))
    )
    reference = df_head.copy()
    reference["penultimate"] = pd.Series(
        [np.nan, 3.89, 4.05, 3.89, 4.05, 4.07, 4.07, 4.07, np.nan, 4.07]
    )
    self.assertTrue(result.sort_index().equals(reference))
def test_kmeans(self):
    """Fit a two-cluster k-means model and compare one cluster with the truth.

    Cluster labels are matched through the row with the smallest `x`: the
    true cluster and the predicted cluster containing that row must hold the
    same (x, y) points.
    """
    ## Fit routine creates usable model
    features = ["x", "y"]
    model = fit.fit_kmeans(self.df_cluster, var=features, n_clusters=2)
    df_pred = gr.eval_df(model, self.df_cluster[features])

    ## Check correctness
    # Label of the min-x row in the reference data (column `c`) ...
    row_true = self.df_cluster >> gr.tf_filter(X.x == gr.colmin(X.x))
    label_true = row_true.c[0]
    # ... and in the model output (column `cluster_id`).
    row_pred = df_pred >> gr.tf_filter(X.x == gr.colmin(X.x))
    label_pred = row_pred.cluster_id[0]

    points_true = (
        self.df_cluster
        >> gr.tf_filter(X.c == label_true)
        >> gr.tf_select(X.x, X.y)
    )
    points_pred = (
        df_pred
        >> gr.tf_filter(X.cluster_id == label_pred)
        >> gr.tf_select(X.x, X.y)
    )

    self.assertTrue(gr.df_equal(points_true, points_pred))
def test_desc(self):
    """Check `gr.desc` inside a pipeline and directly on pandas Series.

    For a plain Series, `gr.desc` is expected to equal a descending rank with
    `method="min"`.
    """
    # desc() used as an ordering key inside a pipeline. The result was
    # previously computed and discarded; it is now checked.
    df = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(10)
    t = df >> gr.tf_summarize(
        last=gr.nth(X.x, -1, order_by=[gr.desc(X.cut), gr.desc(X.x)])
    )
    df_truth = pd.DataFrame({"last": [3.87]})
    self.assertTrue(df_truth.equals(t))

    # desc() applied directly to numeric, boolean and string Series.
    series_num = pd.Series([4, 1, 3, 2])
    series_bool = pd.Series([True, False, True, False])
    series_str = pd.Series(["d", "a", "c", "b"])

    num_truth = series_num.rank(method="min", ascending=False)
    bool_truth = series_bool.rank(method="min", ascending=False)
    str_truth = series_str.rank(method="min", ascending=False)

    self.assertTrue(gr.desc(series_num).equals(num_truth))
    self.assertTrue(gr.desc(series_bool).equals(bool_truth))
    self.assertTrue(gr.desc(series_str).equals(str_truth))
def test_select(self):
    """Check that `gr.tf_select` accepts many spellings of the same columns.

    Every argument tuple below must select `carat`, `cut` and `price`.
    """
    expected = data.df_diamonds[["carat", "cut", "price"]]

    selector_args = [
        # plain labels
        ("carat", "cut", "price"),
        # integer positions
        (0, 1, 6),
        # positions mixed with a label
        (0, 1, "price"),
        # a list mixing a position and a symbolic column
        ([0, X.cut], X.price),
        # symbolic columns via attribute and item access
        (X.carat, X["cut"], X.price),
        # symbolic multi-column item access
        (X[["carat", "cut", "price"]],),
        (X[["carat", "cut"]], X.price),
        # symbolic iloc / loc indexers
        (X.iloc[:, [0, 1, 6]],),
        ([X.loc[:, ["carat", "cut", "price"]]],),
    ]

    for args in selector_args:
        selected = data.df_diamonds >> gr.tf_select(*args)
        self.assertTrue(expected.equals(selected))
def test_max(self):
    """Exercise `gr.max` through summarize and mutate, grouped and ungrouped."""
    df_head = (
        data.df_diamonds
        >> gr.tf_select(X.cut, X.x)
        >> gr.tf_head(5)
    )

    # Ungrouped summarize: a single value for the whole frame.
    result = df_head >> gr.tf_summarize(m=gr.max(X.x))
    reference = pd.DataFrame({"m": [4.34]})
    self.assertTrue(result.equals(reference))

    # Grouped summarize: one value per cut.
    result = (
        df_head
        >> gr.tf_group_by(X.cut)
        >> gr.tf_summarize(m=gr.max(X.x))
    )
    reference = pd.DataFrame(
        {"cut": ["Good", "Ideal", "Premium"], "m": [4.34, 3.95, 4.20]}
    )
    self.assertTrue(result.equals(reference))

    # Ungrouped mutate: the same scalar is expected on every row.
    result = df_head >> gr.tf_mutate(m=gr.max(X.x))
    reference = df_head.copy()
    reference["m"] = 4.34
    self.assertTrue(result.equals(reference))

    # Grouped mutate: per-cut value on each row; the result is re-sorted by
    # index before comparing.
    result = (
        df_head
        >> gr.tf_group_by(X.cut)
        >> gr.tf_mutate(m=gr.max(X.x))
    )
    reference["m"] = pd.Series([3.95, 4.20, 4.34, 4.20, 4.34])
    self.assertTrue(result.sort_index().equals(reference))
def test_IQR(self):
    """Exercise `gr.IQR` through summarize and mutate, grouped and ungrouped.

    Every check goes through `self.assertTrue`; the grouped-summarize case
    used a bare `assert`, which is removed when Python runs with `-O`.
    """
    tol = 0.000000001
    df = data.df_diamonds >> gr.tf_select(X.cut, X.x) >> gr.tf_head(5)

    # straight summarize
    t = df >> gr.tf_summarize(i=gr.IQR(X.x))
    df_truth = pd.DataFrame({"i": [0.25]})
    self.assertTrue(t.equals(df_truth))

    # grouped summarize (compared with an absolute tolerance)
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_summarize(i=gr.IQR(X.x))
    df_truth = pd.DataFrame(
        {"cut": ["Good", "Ideal", "Premium"], "i": [0.145, 0.000, 0.155]}
    )
    test_vector = abs(t.i - df_truth.i)
    self.assertTrue(all(test_vector < tol))

    # straight mutate
    t = df >> gr.tf_mutate(i=gr.IQR(X.x))
    df_truth = df.copy()
    df_truth["i"] = 0.25
    self.assertTrue(t.equals(df_truth))

    # grouped mutate (compared with an absolute tolerance)
    t = df >> gr.tf_group_by(X.cut) >> gr.tf_mutate(i=gr.IQR(X.x))
    df_truth["i"] = pd.Series([0.000, 0.155, 0.145, 0.155, 0.145])
    test_vector = abs(t.i - df_truth.i)
    self.assertTrue(all(test_vector < tol))
def test_select_endswith(self):
    """Check that `gr.ends_with("e")` selects `table` and `price`.

    Uses `self.assertTrue` rather than a bare `assert`, which is removed when
    Python runs with `-O`.
    """
    df = data.df_diamonds[["table", "price"]]
    self.assertTrue(
        df.equals(data.df_diamonds >> gr.tf_select(gr.ends_with("e")))
    )
def test_select_inversion(self):
    """Check that `~`-negated columns are dropped by `gr.tf_select`.

    Negating `carat`, `cut` and `color` must leave the columns from
    position 3 onward.
    """
    remaining = data.df_diamonds.iloc[:, 3:]
    dropped = (~X.carat, ~X.cut, ~X.color)
    selected = data.df_diamonds >> gr.tf_select(*dropped)
    self.assertTrue(remaining.equals(selected))
def test_select_containing(self):
    """Check that `gr.contains("c")` selects the five columns listed below.

    Uses `self.assertTrue` rather than a bare `assert`, which is removed when
    Python runs with `-O`.
    """
    df = data.df_diamonds[["carat", "cut", "color", "clarity", "price"]]
    self.assertTrue(
        df.equals(data.df_diamonds >> gr.tf_select(gr.contains("c")))
    )
def test_select_matches(self):
    """Check that `gr.matches` selects columns by regular expression.

    The pattern `^c[auol]|pri` is expected to pick `carat`, `cut`, `color`,
    `clarity` and `price`. Uses `self.assertTrue` rather than a bare
    `assert`, which is removed when Python runs with `-O`.
    """
    df = data.df_diamonds[["carat", "cut", "color", "clarity", "price"]]
    self.assertTrue(
        df.equals(
            data.df_diamonds >> gr.tf_select(gr.matches("^c[auol]|pri"))
        )
    )
def test_select_startswith(self):
    """Check that `gr.starts_with("c")` selects the four columns listed below.

    Uses `self.assertTrue` rather than a bare `assert`, which is removed when
    Python runs with `-O`.
    """
    df = data.df_diamonds[["carat", "cut", "color", "clarity"]]
    self.assertTrue(
        df.equals(data.df_diamonds >> gr.tf_select(gr.starts_with("c")))
    )