def test_gapply_all_cols(self):
    """When no value columns are named, gapply hands the UDF every
    non-key column, in the DataFrame's current column order."""
    schema = StructType().add("val2", LongType())
    rows = range(GapplyTests.NROWS)
    localDF = pd.DataFrame.from_dict({
        "key": [random.randrange(GapplyTests.NKEYS) for _ in rows],
        "val1": [random.randrange(GapplyTests.NVALS) for _ in rows],
        "val2": [random.randrange(GapplyTests.NVALS) for _ in rows]})
    df = self.spark.createDataFrame(localDF)

    def sumVal2(_, pdf):
        # Columns arrive in the original frame's order: val1 before val2.
        assert pdf.columns.tolist() == ["val1", "val2"], pdf.columns
        return pd.DataFrame.from_records([(pdf["val2"].sum(),)])

    expected = localDF.groupby("key", as_index=False).agg({"val2": "sum"})
    actual = gapply(df.groupBy("key"), sumVal2, schema).toPandas()
    _assertPandasAlmostEqual(actual, expected)

    def sumVal2Reordered(_, pdf):
        # After a reordering select, val2 precedes val1.
        assert pdf.columns.tolist() == ["val2", "val1"], pdf.columns
        return pd.DataFrame.from_records([(pdf["val2"].sum(),)])

    reordered = df.select("val2", "key", "val1").groupBy("key")
    actual = gapply(reordered, sumVal2Reordered, schema).toPandas()
    _assertPandasAlmostEqual(actual, expected)
def test_gapply_all_cols(self):
    """gapply with no explicit value columns should pass all non-key
    columns through to the UDF, in DataFrame column order."""
    schema = StructType().add("val2", LongType())

    def randColumn(bound):
        # One random column of NROWS entries drawn from [0, bound).
        return [random.randrange(bound) for _ in range(GapplyTests.NROWS)]

    pandasDF = pd.DataFrame.from_dict({
        "key": randColumn(GapplyTests.NKEYS),
        "val1": randColumn(GapplyTests.NVALS),
        "val2": randColumn(GapplyTests.NVALS),
    })
    df = self.spark.createDataFrame(pandasDF)
    expected = pandasDF.groupby("key", as_index=False).agg({"val2": "sum"})

    def makeChecker(expectedOrder):
        # UDF that asserts the column order it receives, then sums val2.
        def udf(key, vals):
            assert vals.columns.tolist() == expectedOrder, vals.columns
            return pd.DataFrame.from_records([(vals["val2"].sum(), )])
        return udf

    actual = gapply(df.groupBy("key"),
                    makeChecker(["val1", "val2"]), schema).toPandas()
    _assertPandasAlmostEqual(actual, expected)

    gd = df.select("val2", "key", "val1").groupBy("key")
    actual = gapply(gd, makeChecker(["val2", "val1"]), schema).toPandas()
    _assertPandasAlmostEqual(actual, expected)
def test_gapply_empty_schema(self):
    """With an empty output StructType, gapply yields no rows even though
    the input group is non-empty."""
    schema = StructType().add("a", LongType()).add("b", LongType())
    oneRowDF = self.spark.createDataFrame([(1, 2)], schema=schema)
    result = gapply(oneRowDF.groupBy("a"), _emptyFunc, StructType(), "b")
    self.assertEqual(result.collect(), [])
def test_gapply_double_key(self):
    """Grouping on two columns hands the UDF its keys in groupBy order
    (key2 first, then key1)."""
    schema = StructType().add("val", LongType())
    baseKeys = [random.randrange(GapplyTests.NKEYS)
                for _ in range(GapplyTests.NROWS)]
    pandasDF = pd.DataFrame.from_dict({
        "key1": baseKeys,
        "key2": [GapplyTests.NKEYS + k for k in baseKeys],
        "val": [random.randrange(GapplyTests.NVALS)
                for _ in range(GapplyTests.NROWS)]})
    gd = self.spark.createDataFrame(pandasDF).groupBy("key2", "key1")

    def sumPerGroup(keys, vals):
        # keys follows the groupBy column order: (key2, key1).
        assert keys[0] == keys[1] + GapplyTests.NKEYS
        return pd.DataFrame.from_records([(vals["val"].sum(),)])

    expected = pandasDF.groupby(["key2", "key1"],
                                as_index=False).agg({"val": "sum"})
    actual = gapply(gd, sumPerGroup, schema, "val").toPandas()
    _assertPandasAlmostEqual(actual, expected)
def test_gapply_empty(self):
    """gapply over an empty DataFrame produces an empty result.

    This also implicitly checks that the pandas version is large enough;
    unit-testing the version check itself would require serious mocking.
    """
    schema = StructType().add("a", LongType()).add("b", LongType())
    emptyDF = self.spark.createDataFrame([], schema=schema)
    result = gapply(emptyDF.groupBy("a"), _emptyFunc, schema, "b")
    self.assertEqual(result.collect(), [])
def test_gapply_empty(self):
    """gapply over zero input rows yields zero output rows.

    Implicitly checks that the installed pandas version is large enough
    (unit tests for the actual version checking itself would require some
    serious mocking).
    """
    longLongSchema = StructType().add("a", LongType()).add("b", LongType())
    gd = self.spark.createDataFrame([], schema=longLongSchema).groupBy("a")
    collected = gapply(gd, _emptyFunc, longLongSchema, "b").collect()
    self.assertEqual([], collected)
def checkGapplyEquivalentToPandas(self, pandasAggFunction, dataType, dataGen):
    """Assert gapply matches a plain pandas groupby/agg on random data.

    :param pandasAggFunction: aggregator applied to each group's "val"s
    :param dataType: Spark SQL type of the aggregated "val" column
    :param dataGen: zero-arg callable producing one random "val" entry
    """
    schema = StructType().add("val", dataType)
    pandasDF = pd.DataFrame.from_dict({
        "key": [random.randrange(GapplyTests.NKEYS)
                for _ in range(GapplyTests.NROWS)],
        "val": [dataGen() for _ in range(GapplyTests.NROWS)]})
    gd = self.spark.createDataFrame(pandasDF).groupBy("key")

    def sparkSideAgg(key, vals):
        # Mirror the pandas aggregation inside the gapply UDF.
        return pd.DataFrame.from_records([(pandasAggFunction(vals["val"]),)])

    expected = pandasDF.groupby("key",
                                as_index=False).agg({"val": pandasAggFunction})
    actual = gapply(gd, sparkSideAgg, schema, "val").toPandas()
    _assertPandasAlmostEqual(actual, expected)
def test_gapply_no_keys(self):
    """The output schema names only "val", so the expected frame drops
    the grouping key before comparison."""
    schema = StructType().add("val", LongType())
    rows = range(GapplyTests.NROWS)
    pandasDF = pd.DataFrame.from_dict({
        "key": [random.randrange(GapplyTests.NKEYS) for _ in rows],
        "val": [random.randrange(GapplyTests.NVALS) for _ in rows]})
    gd = self.spark.createDataFrame(pandasDF).groupBy("key")

    def sumVals(_, vals):
        return pd.DataFrame.from_records([(vals["val"].sum(),)])

    expected = pandasDF.groupby("key",
                                as_index=False).agg({"val": "sum"})[["val"]]
    actual = gapply(gd, sumVals, schema, "val").toPandas()
    _assertPandasAlmostEqual(actual, expected)
def test_gapply_double_key(self):
    """When grouping on two columns, the UDF receives the key tuple in
    groupBy order (key2, key1), not column order."""
    schema = StructType().add("val", LongType())
    n = GapplyTests.NROWS
    key1s = [random.randrange(GapplyTests.NKEYS) for _ in range(n)]
    key2s = [GapplyTests.NKEYS + k for k in key1s]
    vals = [random.randrange(GapplyTests.NVALS) for _ in range(n)]
    pandasDF = pd.DataFrame.from_dict(
        {"key1": key1s, "key2": key2s, "val": vals})
    gd = self.spark.createDataFrame(pandasDF).groupBy("key2", "key1")

    def func(keys, groupVals):
        # key2 was constructed as key1 + NKEYS, and arrives first.
        assert keys[0] == keys[1] + GapplyTests.NKEYS
        return pd.DataFrame.from_records([(groupVals["val"].sum(),)])

    expected = pandasDF.groupby(["key2", "key1"],
                                as_index=False).agg({"val": "sum"})
    actual = gapply(gd, func, schema, "val").toPandas()
    _assertPandasAlmostEqual(actual, expected)
def checkGapplyEquivalentToPandas(self, pandasAggFunction, dataType, dataGen):
    """Check that gapply and a native pandas groupby/agg agree.

    :param pandasAggFunction: aggregation applied to each group's "val" series
    :param dataType: Spark SQL type of the aggregated output column
    :param dataGen: zero-arg callable generating a single random value
    """
    schema = StructType().add("val", dataType)
    keyColumn = [random.randrange(GapplyTests.NKEYS)
                 for _ in range(GapplyTests.NROWS)]
    valColumn = [dataGen() for _ in range(GapplyTests.NROWS)]
    pandasDF = pd.DataFrame.from_dict({"key": keyColumn, "val": valColumn})
    gd = self.spark.createDataFrame(pandasDF).groupBy("key")

    def applyAgg(key, vals):
        # Same aggregation the pandas reference uses, wrapped for gapply.
        return pd.DataFrame.from_records(
            [(pandasAggFunction(vals["val"]), )])

    expected = pandasDF.groupby("key", as_index=False).agg(
        {"val": pandasAggFunction})
    actual = gapply(gd, applyAgg, schema, "val").toPandas()
    _assertPandasAlmostEqual(actual, expected)
def test_gapply_name_change(self):
    """The output column name comes from the declared schema ("VAL"),
    not from the input column name ("val")."""
    schema = StructType().add("VAL", LongType())
    pandasDF = pd.DataFrame.from_dict({
        "key": [random.randrange(GapplyTests.NKEYS)
                for _ in range(GapplyTests.NROWS)],
        "val": [random.randrange(GapplyTests.NVALS)
                for _ in range(GapplyTests.NROWS)]})
    gd = self.spark.createDataFrame(pandasDF).groupBy("key")

    def sumVals(key, vals):
        return pd.DataFrame.from_records([(vals["val"].sum(),)])

    expected = (pandasDF.groupby("key", as_index=False)
                .agg({"val": "sum"})
                .rename(columns={"val": "VAL"}))
    actual = gapply(gd, sumVals, schema, "val").toPandas()
    _assertPandasAlmostEqual(actual, expected)
def test_gapply_empty_schema(self):
    """Passing an empty StructType as the output schema makes gapply
    return no rows, even when the input group has data."""
    longLongSchema = StructType().add("a", LongType()).add("b", LongType())
    df = self.spark.createDataFrame([(1, 2)], schema=longLongSchema)
    gd = df.groupBy("a")
    collected = gapply(gd, _emptyFunc, StructType(), "b").collect()
    self.assertEqual(collected, [])