def test_concath(self):
    """Horizontal concatenation (axis=1) of two equally-long streams.

    Also checks that concatenating streams of different lengths raises
    ``RuntimeError``.
    """
    # Both frames must have 20 rows: axis=1 concatenation requires
    # identical lengths.  (The originals were misleadingly named
    # sdf30/df30 although they held 20 rows.)
    sdf_a = dummy_streaming_dataframe(20)
    sdf_b = dummy_streaming_dataframe(20)
    df_a = sdf_a.to_dataframe()
    df_b = sdf_b.to_dataframe()
    expected = pandas.concat([df_a, df_b], axis=1)
    res = sdf_a.concat(sdf_b, axis=1)
    self.assertEqualDataFrame(res.to_dataframe(), expected)
    # Mismatched lengths cannot be concatenated horizontally.
    sdf22 = dummy_streaming_dataframe(22)
    sdf25 = dummy_streaming_dataframe(25)
    self.assertRaise(lambda: sdf22.concat(sdf25, axis=1).to_dataframe(),
                     RuntimeError)
def test_groupby(self):
    """groupby on a streaming dataframe must match pandas' groupby."""
    df20 = dummy_streaming_dataframe(20).to_dataframe()
    df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0)
    sdf20 = StreamingDataFrame.read_df(df20, chunksize=5)

    got = sdf20.groupby("key", lambda gr: gr.sum())
    expected = df20.groupby("key").sum()
    self.assertEqualDataFrame(got, expected)

    # Out-of-memory grouping is not implemented.
    self.assertRaise(lambda: sdf20.groupby("key", in_memory=False),
                     NotImplementedError)

    # Do not replace lambda c:sum(c) by sum or...
    # pandas.core.base.SpecificationError: Function names must be unique,
    # found multiple named sum
    expected = df20.groupby("key").agg([numpy.sum, lambda c: sum(c)])
    got = sdf20.groupby(
        "key", lambda gr: gr.agg([numpy.sum, lambda c: sum(c)]))
    self.assertEqualDataFrame(got, expected)

    got = sdf20.groupby("key", lambda gr: gr.count())
    expected = df20.groupby("key").count()
    self.assertEqualDataFrame(got, expected)

    # Without an explicit aggregation, sum is the default.
    df = pandas.DataFrame(dict(A=[3, 4, 3], B=[5, 6, 7]))
    sdf = StreamingDataFrame.read_df(df)
    got = sdf.groupby("A")
    expected = df.groupby("A").sum()
    self.assertEqualDataFrame(got, expected)
def test_train_test_split_streaming_strat(self):
    """Stratified streaming split: no row lost, strata present on both sides."""
    sdf = dummy_streaming_dataframe(
        100, asfloat=True,
        tify=["t1" if i % 3 else "t0" for i in range(0, 100)])
    trsdf, tesdf = sdf.train_test_split(
        streaming=True, unique_rows=True, stratify="tify")

    trdf = trsdf.to_dataframe()
    tedf = tesdf.to_dataframe()
    df_exp = sdf.to_dataframe()

    def check_union(train, test):
        # Train + test must contain exactly the original rows.
        union = pandas.concat([train, test])
        self.assertEqual(df_exp.shape, union.shape)
        union = union.sort_values("cfloat").reset_index(drop=True)
        self.assertEqualDataFrame(union, df_exp)

    check_union(trdf, tedf)
    # Second materialization: the split must be reproducible.
    trdf = trsdf.to_dataframe()
    tedf = tesdf.to_dataframe()
    check_union(trdf, tedf)

    # Every stratum must be reasonably represented on both sides.
    trgr = trdf.groupby("tify").count()
    trgr["part"] = 0
    tegr = tedf.groupby("tify").count()
    tegr["part"] = 1
    gr = pandas.concat([trgr, tegr])
    self.assertGreater(gr['cfloat'].min(), 4)
def test_head(self):
    """head() defaults to 5 rows; an explicit n is honored and repeatable."""
    sdf = dummy_streaming_dataframe(100)
    self.assertEqual(sdf.head().shape, (5, 2))
    # Called twice on purpose: the stream must be replayable.
    for _ in range(2):
        self.assertEqual(sdf.head(n=20).shape, (20, 2))
def test_sample(self):
    """sample(frac=...) works on streams; sample(n=...) raises ValueError."""
    sdf = dummy_streaming_dataframe(100)
    # Two identical passes: sampling a replayable stream must keep working.
    for _ in range(2):
        sampled = sdf.sample(frac=0.1)
        self.assertLesser(sampled.shape[0], 30)
        self.assertRaise(lambda: sdf.sample(n=5), ValueError)
def test_getitem(self):
    """Selecting with a list of columns works; other indexings do not."""
    sdf = dummy_streaming_dataframe(100)
    selected = sdf[["cint"]]
    self.assertEqual(selected.shape, (100, 1))
    full = sdf.to_df()
    sub = selected.to_df()
    self.assertEqualDataFrame(full[["cint"]], sub)
    # Only list-of-columns selection is implemented.
    self.assertRaise(lambda: sdf["cint"], NotImplementedError)
    self.assertRaise(lambda: sdf[:, "cint"], NotImplementedError)
def test_apply(self):
    """applymap and apply compose lazily over the stream's chunks."""
    stream = dummy_streaming_dataframe(100)
    self.assertNotEmpty(list(stream))
    as_str = stream.applymap(str)
    self.assertNotEmpty(list(as_str))
    suffixed = as_str.apply(lambda row: row[["cint"]] + "r", axis=1)
    self.assertNotEmpty(list(suffixed))
    text = suffixed.to_csv(header=False)
    self.assertStartsWith("0,0r\n1,1r\n2,2r\n3,3r", text)
def test_shape(self):
    """Iteration yields chunks; shape aggregates them; sorting is rejected."""
    sdf = dummy_streaming_dataframe(100)
    chunks = list(sdf)
    # 100 rows / default chunk size -> 10 chunks; asserted twice as in
    # the original to confirm the materialized list is stable.
    self.assertEqual(len(chunks), 10)
    self.assertEqual(len(chunks), 10)
    self.assertEqual(sdf.shape, (100, 2))
    # sort_values would require the whole dataset in memory.
    self.assertRaise(lambda: sdf.sort_values("r"),
                     StreamingInefficientException)
def test_sample_cache(self):
    """A cached sample is frozen: successive reads return the same rows."""
    sdf = dummy_streaming_dataframe(100)
    cached = sdf.sample(frac=0.1, cache=True)
    first = cached.to_df()
    second = cached.to_df()
    self.assertEqualDataFrame(first, second)
    self.assertTrue(cached.is_stable(n=first.shape[0], do_check=True))
    self.assertTrue(cached.is_stable(n=first.shape[0], do_check=False))
    # Without the cache, the sample is drawn again on every pass.
    uncached = sdf.sample(frac=0.1, cache=False)
    self.assertFalse(uncached.is_stable(n=first.shape[0], do_check=False))
def test_train_test_split(self):
    """Non-streaming split returns strings convertible back into streams."""
    sdf = dummy_streaming_dataframe(100)
    train_str, test_str = sdf.train_test_split(index=False, streaming=False)
    trdf = StreamingDataFrame.read_str(train_str).to_dataframe()
    tedf = StreamingDataFrame.read_str(test_str).to_dataframe()
    expected = sdf.to_dataframe()
    recombined = pandas.concat([trdf, tedf])
    # Train + test must contain exactly the original rows.
    self.assertEqual(expected.shape, recombined.shape)
    recombined = recombined.sort_values("cint").reset_index(drop=True)
    self.assertEqualDataFrame(recombined, expected)
def test_sample_reservoir_cache(self):
    """Reservoir sampling needs a cache and an absolute n, not a fraction."""
    sdf = dummy_streaming_dataframe(100)
    res = sdf.sample(n=10, cache=True, reservoir=True)
    first = res.to_df()
    second = res.to_df()
    self.assertEqualDataFrame(first, second)
    self.assertEqual(first.shape, (10, res.shape[1]))
    # reservoir=True is incompatible with cache=False and with frac.
    self.assertRaise(
        lambda: sdf.sample(n=10, cache=False, reservoir=True), ValueError)
    self.assertRaise(
        lambda: sdf.sample(frac=0.1, cache=True, reservoir=True), ValueError)
def test_where(self):
    """where keeps the shape and masks non-matching rows."""
    sdf = dummy_streaming_dataframe(100)
    self.assertEqual(list(sdf.columns), ['cint', 'cstr'])
    self.assertEqual(len(sdf.dtypes), 2)
    # Two identical passes: the filtered stream must be replayable.
    for _ in range(2):
        filtered = sdf.where(lambda row: row["cint"] == 1)
        self.assertStartsWith(",cint,cstr\n0,,\n1,1.0,s1",
                              filtered.to_csv())
def test_groupby_streaming(self):
    """Streaming groupby yields partials that re-aggregate to the total."""
    df20 = dummy_streaming_dataframe(20).to_dataframe()
    df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0)
    sdf20 = StreamingDataFrame.read_df(df20, chunksize=5)
    sgr = sdf20.groupby_streaming(
        "key", lambda gr: gr.sum(), strategy='streaming', as_index=False)
    expected = df20.groupby("key", as_index=False).sum()
    parts = list(sgr)
    # Per-chunk aggregates, grouped again, give the global aggregate.
    got = pandas.concat(parts).groupby("key", as_index=False).sum()
    self.assertEqualDataFrame(got, expected)
def test_concatv(self):
    """Vertical concat accepts streams, dataframes and iterables of frames."""
    sdf20 = dummy_streaming_dataframe(20)
    sdf30 = dummy_streaming_dataframe(30)
    df20 = sdf20.to_dataframe()
    df30 = sdf30.to_dataframe()
    expected = pandas.concat([df20, df30], axis=0)
    # stream + stream
    self.assertEqualDataFrame(
        sdf20.concat(sdf30, axis=0).to_dataframe(), expected)
    # stream + plain dataframe
    self.assertEqualDataFrame(
        sdf20.concat(df30, axis=0).to_dataframe(), expected)
    # stream + iterator of dataframes, run twice to check replayability
    self.assertEqualDataFrame(
        sdf20.concat(map(lambda x: x, [df30]), axis=0).to_dataframe(),
        expected)
    self.assertEqualDataFrame(
        sdf20.concat(map(lambda x: x, [df30]), axis=0).to_dataframe(),
        expected)
    # Schema mismatches are rejected with explicit messages.
    df30["g"] = 4
    self.assertRaise(lambda: sdf20.concat(df30).to_dataframe(),
                     ValueError,
                     "Frame others[0] do not have the same column names")
    df20["cint"] = df20["cint"].astype(float)
    self.assertRaise(lambda: sdf20.concat(df20).to_dataframe(),
                     ValueError,
                     "Frame others[0] do not have the same column types")
def test_groupby_cum_asindex(self):
    """With strategy='cum' the last yielded aggregate equals the full groupby."""
    df20 = dummy_streaming_dataframe(20).to_dataframe()
    df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0)
    sdf20 = StreamingDataFrame.read_df(df20, chunksize=5)
    sgr = sdf20.groupby_streaming(
        "key", lambda gr: gr.sum(), strategy='cum', as_index=True)
    expected = df20.groupby("key", as_index=True).sum()
    last = None
    for partial in sgr:
        # Every cumulative aggregate exposes the same columns.
        self.assertEqual(list(partial.columns), list(expected.columns))
        last = partial
    self.assertEqualDataFrame(last, expected)
def test_merge(self):
    """Streaming merge must match pandas.DataFrame.merge for every 'how'."""
    def compares(a, b, how):
        # BUG FIX: the original ignored 'how', so every loop iteration
        # tested the default inner join only.  Forward it to both merges.
        m = a.merge(b, on="cint", how=how, indicator=True)
        dm = m.to_dataframe()
        da = a.to_dataframe()
        db = b.to_dataframe()
        exp = da.merge(db, on="cint", how=how, indicator=True)
        # Row order is not guaranteed across implementations for non-inner
        # joins: compare after sorting on the (unique) merge key.
        self.assertEqualDataFrame(
            dm.sort_values("cint").reset_index(drop=True),
            exp.sort_values("cint").reset_index(drop=True))

    sdf20 = dummy_streaming_dataframe(20)
    sdf30 = dummy_streaming_dataframe(30)
    hows = "inner left right outer".split()
    # Merge with itself, then with a longer and a shorter frame;
    # each case twice to verify the streams can be replayed.
    for how in hows:
        compares(sdf20, sdf20, how)
        compares(sdf20, sdf20, how)
    for how in hows:
        compares(sdf20, sdf30, how)
        compares(sdf20, sdf30, how)
    for how in hows:
        compares(sdf30, sdf20, how)
        compares(sdf30, sdf20, how)
    # A plain dataframe is accepted as the right-hand side.
    sdf20.merge(sdf20.to_dataframe(), on="cint", indicator=True)
def test_train_test_split_file_pattern(self):
    """File-pattern splitting requires an empty '{}' placeholder."""
    temp = get_temp_folder(__file__, "temp_train_test_split_file_pattern")
    sdf = dummy_streaming_dataframe(100)
    # A positional placeholder such as '{0}' is rejected.
    bad_pattern = os.path.join(temp, "spl_{0}.txt")
    self.assertRaise(
        lambda: sdf.train_test_split(bad_pattern, index=False,
                                     streaming=False),
        ValueError)
    pattern = os.path.join(temp, "spl_{}.txt")
    tr, te = sdf.train_test_split(pattern, index=False, streaming=False)
    trdf = StreamingDataFrame.read_csv(tr).to_dataframe()
    tedf = StreamingDataFrame.read_csv(te).to_dataframe()
    expected = sdf.to_dataframe()
    recombined = pandas.concat([trdf, tedf])
    # Train + test must contain exactly the original rows.
    self.assertEqual(expected.shape, recombined.shape)
    recombined = recombined.sort_values("cint").reset_index(drop=True)
    self.assertEqualDataFrame(recombined, expected)
def test_train_test_split_file(self):
    """Splitting into two explicitly named files keeps every row."""
    temp = get_temp_folder(__file__, "temp_train_test_split_file")
    names = [os.path.join(temp, "train.txt"),
             os.path.join(temp, "test.txt")]
    sdf = dummy_streaming_dataframe(100)
    sdf.train_test_split(names, index=False, streaming=False)
    trsdf = StreamingDataFrame.read_csv(names[0])
    tesdf = StreamingDataFrame.read_csv(names[1])
    # Both sides get a substantial share of the 100 rows.
    self.assertGreater(trsdf.shape[0], 20)
    self.assertGreater(tesdf.shape[0], 20)
    trdf = trsdf.to_dataframe()
    tedf = tesdf.to_dataframe()
    self.assertGreater(trdf.shape[0], 20)
    self.assertGreater(tedf.shape[0], 20)
    expected = sdf.to_dataframe()
    recombined = pandas.concat([trdf, tedf])
    # Train + test must contain exactly the original rows.
    self.assertEqual(expected.shape, recombined.shape)
    recombined = recombined.sort_values("cint").reset_index(drop=True)
    self.assertEqualDataFrame(recombined, expected)
def test_train_test_split_streaming(self):
    """Streaming split is reproducible and honors the 0.7/0.3 partition."""
    sdf = dummy_streaming_dataframe(100, asfloat=True)
    trsdf, tesdf = sdf.train_test_split(
        streaming=True, unique_rows=True, partitions=[0.7, 0.3])
    trdf = trsdf.to_dataframe()
    tedf = tesdf.to_dataframe()
    df_exp = sdf.to_dataframe()

    def check_union(train, test):
        # Train + test must contain exactly the original rows.
        union = pandas.concat([train, test])
        self.assertEqual(df_exp.shape, union.shape)
        union = union.sort_values("cfloat").reset_index(drop=True)
        self.assertEqualDataFrame(union, df_exp)

    check_union(trdf, tedf)
    # Second materialization of the same split streams.
    trdf2 = trsdf.to_dataframe()
    tedf2 = tesdf.to_dataframe()
    check_union(trdf2, tedf2)
    # Both passes produce identical partition sizes.
    self.assertEqual(trdf.shape, trdf2.shape)
    self.assertEqual(tedf.shape, tedf2.shape)
    # 70/30 split: the train side is the larger one.
    self.assertGreater(trdf.shape[0], tedf.shape[0])
    self.assertGreater(trdf2.shape[0], tedf2.shape[0])
def test_tail(self):
    """tail() returns the default 5 rows; a large n is truncated."""
    sdf = dummy_streaming_dataframe(100)
    self.assertEqual(sdf.tail().shape, (5, 2))
    # NOTE(review): n=20 yields only 10 rows — presumably tail only looks
    # at the last chunk; confirm against StreamingDataFrame.tail.
    self.assertEqual(sdf.tail(n=20).shape, (10, 2))
def test_iterrows(self):
    """iterrows yields one item per row and the stream can be replayed."""
    sdf = dummy_streaming_dataframe(100)
    for _ in range(2):
        rows = list(sdf.iterrows())
        self.assertEqual(sdf.shape[0], len(rows))
def test_to_csv(self):
    """to_csv serializes the whole stream and can be called repeatedly."""
    sdf = dummy_streaming_dataframe(100)
    for _ in range(2):
        text = sdf.to_csv()
        self.assertStartsWith(",cint,cstr\n0,0,s0", text)
def test_init(self):
    """Wrapping one StreamingDataFrame in another yields identical data."""
    source = dummy_streaming_dataframe(100)
    expected = source.to_df()
    wrapped = StreamingDataFrame(source)
    self.assertEqualDataFrame(expected, wrapped.to_df())
def test_dataframe(self):
    """to_dataframe materializes the full 100x2 frame."""
    sdf = dummy_streaming_dataframe(100)
    self.assertEqual(sdf.to_dataframe().shape, (100, 2))