def test_groupby_streaming(self):
    """A streaming groupby (strategy='streaming') re-aggregated over its
    emitted chunks must equal a plain pandas groupby on the full frame."""
    base = dummy_streaming_dataframe(20).to_dataframe()
    base["key"] = base["cint"].apply(lambda i: i % 3 == 0)
    stream = StreamingDataFrame.read_df(base, chunksize=5)
    chunks = list(stream.groupby_streaming(
        "key", lambda gr: gr.sum(), strategy='streaming', as_index=False))
    # each chunk holds a partial aggregate: combine them with a second groupby
    merged = pandas.concat(chunks).groupby("key", as_index=False).sum()
    expected = base.groupby("key", as_index=False).sum()
    self.assertEqualDataFrame(merged, expected)
def test_train_test_split_read_str_roundtrip(self):
    """Non-streaming split whose two string parts are reloaded via read_str.

    Fix: this method was originally named ``test_train_test_split``, the
    same name as another method later in this class; the later definition
    shadowed this one so it was never executed. Renamed to a unique name
    so the test actually runs.
    """
    sdf = dummy_streaming_dataframe(100)
    tr, te = sdf.train_test_split(index=False, streaming=False)
    trsdf = StreamingDataFrame.read_str(tr)
    tesdf = StreamingDataFrame.read_str(te)
    trdf = trsdf.to_dataframe()
    tedf = tesdf.to_dataframe()
    df_exp = sdf.to_dataframe()
    df_val = pandas.concat([trdf, tedf])
    self.assertEqual(df_exp.shape, df_val.shape)
    # row order is not preserved by the split: sort before comparing
    df_val = df_val.sort_values("cint").reset_index(drop=True)
    self.assertEqualDataFrame(df_val, df_exp)
def test_groupby_cum_asindex(self):
    """Cumulative streaming groupby (strategy='cum', as_index=True): the
    last emitted aggregate must equal the full pandas groupby."""
    base = dummy_streaming_dataframe(20).to_dataframe()
    base["key"] = base["cint"].apply(lambda i: i % 3 == 0)
    stream = StreamingDataFrame.read_df(base, chunksize=5)
    expected = base.groupby("key", as_index=True).sum()
    last = None
    for chunk in stream.groupby_streaming(
            "key", lambda gr: gr.sum(), strategy='cum', as_index=True):
        # every intermediate aggregate carries the final column layout
        self.assertEqual(list(chunk.columns), list(expected.columns))
        last = chunk
    self.assertEqualDataFrame(last, expected)
def test_sample_reservoir_cache(self):
    """Reservoir sampling requires cache=True and an absolute n; a cached
    sample must be stable across repeated materializations."""
    sdf = dummy_streaming_dataframe(100)
    sample = sdf.sample(n=10, cache=True, reservoir=True)
    first = sample.to_df()
    second = sample.to_df()
    self.assertEqualDataFrame(first, second)
    self.assertEqual(first.shape, (10, sample.shape[1]))
    # reservoir sampling without a cache is rejected
    self.assertRaise(
        lambda: sdf.sample(n=10, cache=False, reservoir=True), ValueError)
    # reservoir sampling only accepts n, not frac
    self.assertRaise(
        lambda: sdf.sample(frac=0.1, cache=True, reservoir=True), ValueError)
def test_concatv(self):
    """Vertical concat accepts a StreamingDataFrame, a DataFrame or an
    iterator of frames, and rejects mismatched columns or dtypes."""
    sdf20 = dummy_streaming_dataframe(20)
    sdf30 = dummy_streaming_dataframe(30)
    df20 = sdf20.to_dataframe()
    df30 = sdf30.to_dataframe()
    expected = pandas.concat([df20, df30], axis=0)
    # same four concat calls as before, every input form accepted
    for other in (sdf30, df30,
                  map(lambda x: x, [df30]),
                  map(lambda x: x, [df30])):
        combined = sdf20.concat(other, axis=0)
        self.assertEqualDataFrame(combined.to_dataframe(), expected)
    # an extra column must be detected when materializing
    df30["g"] = 4
    self.assertRaise(lambda: sdf20.concat(df30).to_dataframe(),
                     ValueError,
                     "Frame others[0] do not have the same column names")
    # a dtype mismatch must be detected as well
    df20["cint"] = df20["cint"].astype(float)
    self.assertRaise(lambda: sdf20.concat(df20).to_dataframe(),
                     ValueError,
                     "Frame others[0] do not have the same column types")
def test_where(self):
    """where masks non-matching rows while keeping the shape; the stream
    can be built and consumed twice with the same output."""
    sdf = dummy_streaming_dataframe(100)
    self.assertEqual(list(sdf.columns), ['cint', 'cstr'])
    self.assertEqual(len(sdf.dtypes), 2)
    for _ in range(2):  # second pass checks the stream is reusable
        filtered = sdf.where(lambda row: row["cint"] == 1)
        text = filtered.to_csv()
        self.assertStartsWith(",cint,cstr\n0,,\n1,1.0,s1",
                              text.replace('\r', ''))
def test_merge(self):
    """Streaming merge must match pandas.DataFrame.merge for every join
    type.

    Fix: the inner helper received ``how`` but never forwarded it to
    either merge call, so the loops over ``hows`` only ever exercised the
    default inner join. ``how`` is now passed through; results are sorted
    on the key before comparison because non-inner joins do not guarantee
    row order.
    """
    def compares(a, b, how):
        # forward the join type to both the streaming and pandas merges
        m = a.merge(b, on="cint", how=how, indicator=True)
        dm = m.to_dataframe()
        da = a.to_dataframe()
        db = b.to_dataframe()
        exp = da.merge(db, on="cint", how=how, indicator=True)
        self.assertEqualDataFrame(
            dm.sort_values("cint").reset_index(drop=True),
            exp.sort_values("cint").reset_index(drop=True))

    sdf20 = dummy_streaming_dataframe(20)
    sdf30 = dummy_streaming_dataframe(30)
    hows = "inner left right outer".split()
    # each pairing is run twice: the streams must be reusable
    for how in hows:
        compares(sdf20, sdf20, how)  # merge with itself
        compares(sdf20, sdf20, how)
    for how in hows:
        compares(sdf20, sdf30, how)
        compares(sdf20, sdf30, how)
    for how in hows:
        compares(sdf30, sdf20, how)
        compares(sdf30, sdf20, how)
    # merging against a plain DataFrame is accepted too
    sdf20.merge(sdf20.to_dataframe(), on="cint", indicator=True)
def test_train_test_split_file_pattern(self):
    """Splitting to a file pattern requires an empty '{}' placeholder;
    the two written files together rebuild the original data."""
    temp = get_temp_folder(__file__, "temp_train_test_split_file_pattern")
    sdf = dummy_streaming_dataframe(100)
    # an indexed placeholder such as '{0}' is rejected
    bad_pattern = os.path.join(temp, "spl_{0}.txt")
    self.assertRaise(
        lambda: sdf.train_test_split(bad_pattern, index=False,
                                     streaming=False),
        ValueError)
    pattern = os.path.join(temp, "spl_{}.txt")
    train_name, test_name = sdf.train_test_split(
        pattern, index=False, streaming=False)
    train_df = StreamingDataFrame.read_csv(train_name).to_dataframe()
    test_df = StreamingDataFrame.read_csv(test_name).to_dataframe()
    expected = sdf.to_dataframe()
    rebuilt = pandas.concat([train_df, test_df])
    self.assertEqual(expected.shape, rebuilt.shape)
    # the split does not preserve row order: sort before comparing
    rebuilt = rebuilt.sort_values("cint").reset_index(drop=True)
    self.assertEqualDataFrame(rebuilt, expected)
def test_train_test_split_file(self):
    """Split into two explicit files, reload them, and check the union
    equals the original dataframe."""
    temp = get_temp_folder(__file__, "temp_train_test_split_file")
    paths = [os.path.join(temp, "train.txt"),
             os.path.join(temp, "test.txt")]
    sdf = dummy_streaming_dataframe(100)
    sdf.train_test_split(paths, index=False, streaming=False)
    train_stream = StreamingDataFrame.read_csv(paths[0])
    test_stream = StreamingDataFrame.read_csv(paths[1])
    # both parts should keep a significant share of the 100 rows
    self.assertGreater(train_stream.shape[0], 20)
    self.assertGreater(test_stream.shape[0], 20)
    train_df = train_stream.to_dataframe()
    test_df = test_stream.to_dataframe()
    self.assertGreater(train_df.shape[0], 20)
    self.assertGreater(test_df.shape[0], 20)
    expected = sdf.to_dataframe()
    rebuilt = pandas.concat([train_df, test_df])
    self.assertEqual(expected.shape, rebuilt.shape)
    rebuilt = rebuilt.sort_values("cint").reset_index(drop=True)
    self.assertEqualDataFrame(rebuilt, expected)
def test_train_test_split(self):
    """read_str rejects chunksize=None and iterator=False, accepts bytes
    input, and a non-streaming split rebuilds the original data."""
    sdf = dummy_streaming_dataframe(100)
    tr, te = sdf.train_test_split(index=False, streaming=False)
    # invalid read_str options must raise
    self.assertRaise(
        lambda: StreamingDataFrame.read_str(tr, chunksize=None), ValueError)
    self.assertRaise(
        lambda: StreamingDataFrame.read_str(tr, iterator=False), ValueError)
    # encoded bytes are accepted as well as text
    StreamingDataFrame.read_str(tr.encode('utf-8'))
    train_df = StreamingDataFrame.read_str(tr).to_dataframe()
    test_df = StreamingDataFrame.read_str(te).to_dataframe()
    expected = sdf.to_dataframe()
    rebuilt = pandas.concat([train_df, test_df])
    self.assertEqual(expected.shape, rebuilt.shape)
    rebuilt = rebuilt.sort_values("cint").reset_index(drop=True)
    self.assertEqualDataFrame(rebuilt, expected)
def test_train_test_split_streaming(self):
    """Streaming split with unique_rows: both parts rebuild the data,
    a second materialization gives the same split, and the 0.7/0.3
    partition leaves the train side larger."""
    sdf = dummy_streaming_dataframe(100, asfloat=True)
    train_stream, test_stream = sdf.train_test_split(
        streaming=True, unique_rows=True, partitions=[0.7, 0.3])
    expected = sdf.to_dataframe()

    def check(train_df, test_df):
        # union of both parts must equal the source once reordered
        both = pandas.concat([train_df, test_df])
        self.assertEqual(expected.shape, both.shape)
        both = both.sort_values("cfloat").reset_index(drop=True)
        self.assertEqualDataFrame(both, expected)

    first_train = train_stream.to_dataframe()
    first_test = test_stream.to_dataframe()
    check(first_train, first_test)
    # a second pass over the streams must reproduce the same split
    second_train = train_stream.to_dataframe()
    second_test = test_stream.to_dataframe()
    check(second_train, second_test)
    self.assertEqual(first_train.shape, second_train.shape)
    self.assertEqual(first_test.shape, second_test.shape)
    self.assertGreater(first_train.shape[0], first_test.shape[0])
    self.assertGreater(second_train.shape[0], second_test.shape[0])
def test_train_test_split_streaming_strat(self):
    """Stratified streaming split: the data is rebuilt exactly and every
    stratum keeps enough rows on each side of the split."""
    labels = ["t1" if i % 3 else "t0" for i in range(0, 100)]
    sdf = dummy_streaming_dataframe(100, asfloat=True, tify=labels)
    train_stream, test_stream = sdf.train_test_split(
        streaming=True, unique_rows=True, stratify="tify")
    expected = sdf.to_dataframe()
    train_df = test_df = None
    for _ in range(2):  # the split must be reproducible on a second pass
        train_df = train_stream.to_dataframe()
        test_df = test_stream.to_dataframe()
        both = pandas.concat([train_df, test_df])
        self.assertEqual(expected.shape, both.shape)
        both = both.sort_values("cfloat").reset_index(drop=True)
        self.assertEqualDataFrame(both, expected)
    # per-stratum row counts: each class keeps more than 4 rows per part
    train_counts = train_df.groupby("tify").count()
    train_counts["part"] = 0
    test_counts = test_df.groupby("tify").count()
    test_counts["part"] = 1
    counts = pandas.concat([train_counts, test_counts])
    self.assertGreater(counts['cfloat'].min(), 4)
def test_init(self):
    """A StreamingDataFrame built from another one yields identical data."""
    original = dummy_streaming_dataframe(100)
    wrapped = StreamingDataFrame(original)
    self.assertEqualDataFrame(original.to_df(), wrapped.to_df())
def test_tail(self):
    """tail defaults to 5 rows; with n=20 only 10 rows come back here
    (presumably capped by the last chunk — behavior under test, confirm
    against the implementation)."""
    sdf = dummy_streaming_dataframe(100)
    self.assertEqual(sdf.tail().shape, (5, 2))
    self.assertEqual(sdf.tail(n=20).shape, (10, 2))
def test_iterrows(self):
    """iterrows yields one item per row and can be replayed on the
    same stream."""
    sdf = dummy_streaming_dataframe(100)
    for _ in range(2):
        rows = list(sdf.iterrows())
        self.assertEqual(sdf.shape[0], len(rows))
def test_to_csv(self):
    """to_csv produces the same header and first rows on every call."""
    sdf = dummy_streaming_dataframe(100)
    for _ in range(2):  # the stream must serialize identically twice
        text = sdf.to_csv()
        self.assertStartsWith(",cint,cstr\n0,0,s0", text.replace('\r', ''))
def test_dataframe(self):
    """Materializing the 100-row dummy stream gives a (100, 2) frame."""
    stream = dummy_streaming_dataframe(100)
    self.assertEqual(stream.to_dataframe().shape, (100, 2))