def test_groupby(self):
    """Compare ``StreamingDataFrame.groupby`` with ``pandas`` group-bys.

    A 20-row frame is streamed in chunks of 5 and grouped on a boolean
    ``key`` column. The streamed result is compared with the pandas result
    for ``sum``, a two-function ``agg`` and ``count``. The test also checks
    that ``in_memory=False`` raises ``NotImplementedError`` and that calling
    ``groupby`` without a function gives the same frame as a pandas ``sum``.
    """
    def multiple_of_three(value):
        return value % 3 == 0

    def sum_both_ways(grouped):
        # Do not replace lambda c: sum(c) by sum or...
        # pandas.core.base.SpecificationError: Function names must be
        # unique, found multiple named sum
        return grouped.agg([numpy.sum, lambda c: sum(c)])

    reference = dummy_streaming_dataframe(20).to_dataframe()
    reference["key"] = reference["cint"].apply(multiple_of_three)
    stream = StreamingDataFrame.read_df(reference, chunksize=5)

    # Aggregation by sum.
    streamed = stream.groupby("key", lambda grouped: grouped.sum())
    expected = reference.groupby("key").sum()
    self.assertEqualDataFrame(streamed, expected)

    # Only the in-memory implementation is expected to be available.
    self.assertRaise(
        lambda: stream.groupby("key", in_memory=False),
        NotImplementedError)

    # Two aggregation functions at once.
    expected = sum_both_ways(reference.groupby("key"))
    streamed = stream.groupby("key", sum_both_ways)
    self.assertEqualDataFrame(streamed, expected)

    # Aggregation by count.
    streamed = stream.groupby("key", lambda grouped: grouped.count())
    expected = reference.groupby("key").count()
    self.assertEqualDataFrame(streamed, expected)

    # No function given: the result is compared with a pandas sum.
    small = pandas.DataFrame(dict(A=[3, 4, 3], B=[5, 6, 7]))
    small_stream = StreamingDataFrame.read_df(small)
    streamed = small_stream.groupby("A")
    expected = small.groupby("A").sum()
    self.assertEqualDataFrame(streamed, expected)
def test_read_csv(self):
    """Round-trip a small frame through CSV files and the streaming reader.

    The frame is written with and without its index, read back with
    ``StreamingDataFrame.read_csv`` and serialized again with ``to_csv``,
    both to a string and to a file. The outputs are compared with the files
    pandas wrote. ``read_csv`` is also expected to raise ``ValueError`` when
    called with ``chunksize=None`` or ``iterator=False``.
    """
    def slurp(path):
        with open(path, "r", encoding='utf-8') as handle:
            return handle.read()

    folder = get_temp_folder(__file__, "temp_read_csv")
    frame = pandas.DataFrame(data=dict(a=[5, 6], b=["er", "r"]))
    plain_csv = os.path.join(folder, "df.csv")
    indexed_csv = os.path.join(folder, "df2.csv")
    rewritten_csv = os.path.join(folder, "df3.csv")
    frame.to_csv(plain_csv, index=False)
    frame.to_csv(indexed_csv, index=True)

    plain_stream = StreamingDataFrame.read_csv(plain_csv)
    plain_text = plain_stream.to_csv(index=False)

    # Options that would disable chunked reading must be rejected.
    self.assertRaise(
        lambda: StreamingDataFrame.read_csv(
            indexed_csv, index_col=0, chunksize=None),
        ValueError)
    self.assertRaise(
        lambda: StreamingDataFrame.read_csv(
            indexed_csv, index_col=0, iterator=False),
        ValueError)

    indexed_stream = StreamingDataFrame.read_csv(indexed_csv, index_col=0)
    indexed_text = indexed_stream.to_csv(index=True)
    indexed_stream.to_csv(rewritten_csv, index=True)

    expected_plain = slurp(plain_csv)
    expected_indexed = slurp(indexed_csv)
    rewritten_text = slurp(rewritten_csv)

    # Carriage returns are stripped so that line endings do not matter.
    self.assertEqual(plain_text.replace('\r', ''), expected_plain)
    from_memory = StreamingDataFrame.read_df(frame)
    self.assertEqualDataFrame(
        plain_stream.to_dataframe(), from_memory.to_dataframe())
    self.assertEqual(indexed_text.replace('\r', ''), expected_indexed)
    self.assertEqual(
        rewritten_text.replace('\r', '').replace('\n\n', '\n'),
        expected_indexed.replace('\r', ''))
def test_set_item_function(self):
    """Check that a column derived with ``apply`` can be assigned by name.

    Building a ``StreamingDataFrame`` directly from a ``pandas.DataFrame``
    is expected to raise ``TypeError``; ``read_df`` is used instead. A new
    column ``bb`` is then set from ``sdf['b'].apply(...)`` and the
    materialized frame is compared with the expected values.
    """
    df = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7]))
    self.assertRaise(lambda: StreamingDataFrame(df), TypeError)
    sdf = StreamingDataFrame.read_df(df)

    # bb = b + 11, i.e. 6 + 11 = 17 for the single row.
    sdf['bb'] = sdf['b'].apply(lambda x: x + 11)
    df = sdf.to_df()
    ddf = pandas.DataFrame(
        data=dict(a=[4.5], b=[6], c=[7], bb=[17]))
    self.assertEqualDataFrame(df, ddf)
def test_add_column(self):
    """Check ``add_column`` with a constant-returning function, a constant
    and a row-based function.

    The reference frame is extended step by step with the same column, and
    each later stream is built from the already extended reference.
    """
    reference = pandas.DataFrame(
        data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"]))

    # Column produced by a function that ignores the row.
    stream = StreamingDataFrame.read_df(reference)
    with_d = stream.add_column("d", lambda row: 1).to_dataframe()
    reference["d"] = 1
    self.assertEqualDataFrame(reference, with_d)

    # Column produced from a constant value.
    stream = StreamingDataFrame.read_df(reference)
    with_dd = stream.add_column("dd", 2).to_dataframe()
    reference["dd"] = 2
    self.assertEqualDataFrame(reference, with_dd)

    # Column computed from another column of the same row (2 + 10 = 12).
    stream = StreamingDataFrame.read_df(reference)
    with_dd12 = stream.add_column(
        "dd12", lambda row: row["dd"] + 10).to_dataframe()
    reference["dd12"] = 12
    self.assertEqualDataFrame(reference, with_dd12)
def test_groupby_streaming(self):
    """Check ``groupby_streaming`` with ``strategy='streaming'``.

    The frames produced by the streaming group-by are concatenated and
    summed again per key; the result must equal the pandas group-by sum
    computed on the whole frame.
    """
    def multiple_of_three(value):
        return value % 3 == 0

    reference = dummy_streaming_dataframe(20).to_dataframe()
    reference["key"] = reference["cint"].apply(multiple_of_three)
    stream = StreamingDataFrame.read_df(reference, chunksize=5)

    partial_sums = stream.groupby_streaming(
        "key", lambda grouped: grouped.sum(),
        strategy='streaming', as_index=False)
    expected = reference.groupby("key", as_index=False).sum()

    # Merge the partial results into one total per key.
    pieces = list(partial_sums)
    combined = pandas.concat(pieces).groupby("key", as_index=False).sum()
    self.assertEqualDataFrame(combined, expected)
def test_merge_2(self):
    """Check ``concat`` of a stream with itself followed by an outer merge.

    The concatenated stream is compared with ``pandas.concat``, then both
    are merged with a small frame on column ``Y`` and compared after
    sorting on ``X`` and ``Y`` and resetting the index.
    """
    def normalized(frame):
        return frame.sort_values(["X", "Y"]).reset_index(drop=True)

    base = pandas.DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"]))
    doubled = pandas.concat([base, base])
    stream = StreamingDataFrame.read_df(base)
    doubled_stream = stream.concat(stream, axis=0)

    # The same check runs twice, as in the original test;
    # presumably this verifies the stream can be consumed more than once.
    self.assertEqualDataFrame(doubled, doubled_stream.to_dataframe())
    self.assertEqualDataFrame(doubled, doubled_stream.to_dataframe())

    lookup = pandas.DataFrame(dict(Y=["a", "b"], Z=[10, 20]))
    expected = doubled.merge(
        lookup, left_on="Y", right_on="Y", how="outer")
    streamed = doubled_stream.merge(
        lookup, left_on="Y", right_on="Y", how="outer")
    self.assertEqualDataFrame(
        normalized(expected), normalized(streamed.to_dataframe()))
def test_groupby_cum_asindex(self):
    """Check ``groupby_streaming`` with ``strategy='cum'`` and
    ``as_index=True``.

    Every frame yielded must have the same columns as the pandas group-by
    sum, and the last frame yielded must equal it.
    """
    def multiple_of_three(value):
        return value % 3 == 0

    reference = dummy_streaming_dataframe(20).to_dataframe()
    reference["key"] = reference["cint"].apply(multiple_of_three)
    stream = StreamingDataFrame.read_df(reference, chunksize=5)

    cumulated = stream.groupby_streaming(
        "key", lambda grouped: grouped.sum(),
        strategy='cum', as_index=True)
    expected = reference.groupby("key", as_index=True).sum()
    expected_columns = list(expected.columns)

    final = None
    for current in cumulated:
        self.assertEqual(list(current.columns), expected_columns)
        final = current
    self.assertEqualDataFrame(final, expected)
def test_sort_values_reverse(self):
    """Check a descending ``sort_values`` on column ``a``.

    The frame, which has missing values in some columns, is streamed in
    chunks of 2 and sorted with a temporary file. The result must equal
    the pandas sort of the same frame.
    """
    folder = get_temp_folder(__file__, "temp_sort_values_reverse")
    scratch = os.path.join(folder, "_data_")

    rows = [
        dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
        dict(a=5, b="f", c=5.7, ind="a2", ai=2),
        dict(a=4, b="g", ind="a3", ai=3),
        dict(a=8, b="h", c=5.9, ai=4),
        dict(a=16, b="i", c=6.2, ind="a5", ai=5),
    ]
    frame = pandas.DataFrame(rows)
    stream = StreamingDataFrame.read_df(frame, chunksize=2)

    expected = frame.sort_values(by="a", ascending=False)
    streamed = stream.sort_values(
        by="a", temp_file=scratch, ascending=False)
    self.assertEqualDataFrame(expected, streamed.to_df())
def test_train_test_split_streaming_tiny(self):
    """Check that ``head`` is repeatable on both parts of a train/test split.

    The split is done on a six-row stream built in two ways: from a pandas
    concatenation, and from a streaming ``concat``. For each part,
    ``head()`` is called twice and the two results are compared unless
    both are ``None``.
    """
    def check_head_twice(part):
        first = part.head()
        second = part.head()
        if not (first is None and second is None):
            self.assertEqualDataFrame(first, second)

    def check_split(stream):
        train_part, test_part = stream.train_test_split(test_size=0.5)
        check_head_twice(test_part)
        check_head_twice(train_part)

    base = pandas.DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"]))

    # Stream built from a frame concatenated by pandas.
    check_split(StreamingDataFrame.read_df(pandas.concat([base, base])))

    # Stream built by concatenating a stream with itself.
    single = StreamingDataFrame.read_df(base)
    check_split(single.concat(single, axis=0))
def test_fillna(self):
    """Check ``fillna`` with a dictionary of per-column replacement values.

    First both columns are filled, then only ``X``; in the second case the
    missing value of ``Y`` is expected to stay missing.
    """
    df = pandas.DataFrame(
        data=dict(X=[4.5, numpy.nan, 7], Y=["a", "b", numpy.nan]))
    sdf = StreamingDataFrame.read_df(df)

    # Replacement values given for both columns.
    df2 = pandas.DataFrame(
        data=dict(X=[4.5, 10.0, 7], Y=["a", "b", "NAN"]))
    na = sdf.fillna(value=dict(X=10.0, Y="NAN"))
    ndf = na.to_df()
    # Frames are compared with assertEqualDataFrame, like the other tests
    # in this file: ``==`` between two DataFrames is element-wise, so a
    # plain assertEqual cannot reduce it to a single boolean.
    self.assertEqualDataFrame(ndf, df2)

    # Replacement value given for X only: Y keeps its missing value.
    df3 = pandas.DataFrame(
        data=dict(X=[4.5, 10.0, 7], Y=["a", "b", numpy.nan]))
    na = sdf.fillna(value=dict(X=10.0))
    ndf = na.to_df()
    self.assertEqualDataFrame(ndf, df3)
def test_describe(self):
    """Check the statistics returned by ``describe`` on a 100001-row stream.

    ``X`` is evenly spaced from -0.5 to 0.5, ``Y`` holds the integers from
    0 to 100000 and ``Z`` is a string column. Only ``X`` and ``Y`` are
    expected in the result.
    """
    ints = numpy.arange(100001).astype(numpy.int64)
    floats = numpy.arange(100001).astype(numpy.float64) / 100000 - 0.5
    letters = numpy.array([chr(65 + j % 45) for j in ints])
    frame = pandas.DataFrame(data=dict(X=floats, Y=ints, Z=letters))
    stream = StreamingDataFrame.read_df(frame)

    stats = stream.describe()
    self.assertEqual(['X', 'Y'], list(stats.columns))

    # Extremes are compared exactly.
    self.assertEqual(stats.loc['min', :].tolist(), [-0.5, 0])
    self.assertEqual(stats.loc['max', :].tolist(), [0.5, 100000])

    # Mean and quartiles, checked in this order.
    expected_rows = [
        ('mean', [0, 50000]),
        ('25%', [-0.25, 25000]),
        ('50%', [0.0, 50000]),
        ('75%', [0.25, 75000]),
    ]
    for label, values in expected_rows:
        self.assertEqualArray(stats.loc[label, :], numpy.array(values))

    # The standard deviation is compared to 4 decimals.
    expected_std = numpy.array([2.886795e-01, 28867.946472])
    self.assertEqualArray(stats.loc['std', :], expected_std, decimal=4)
def test_set_item(self):
    """Check which ``sdf[...] = value`` assignments are accepted.

    Expected to fail: building a ``StreamingDataFrame`` directly from a
    frame (``TypeError``), assigning to a list of columns (``ValueError``)
    and assigning a list as value (``NotImplementedError``). Expected to
    work: assigning a constant and assigning an expression built from
    another column.
    """
    df = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7]))
    self.assertRaise(lambda: StreamingDataFrame(df), TypeError)
    sdf = StreamingDataFrame.read_df(df)

    # Assignments are statements, so they are wrapped in functions to be
    # passed to assertRaise.
    def f():
        sdf[['a']] = 10
    self.assertRaise(f, ValueError)

    def g():
        sdf['a'] = [10]
    self.assertRaise(g, NotImplementedError)

    # New column from a constant.
    sdf['aa'] = 10
    df = sdf.to_df()
    ddf = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7], aa=[10]))
    self.assertEqualDataFrame(df, ddf)

    # New column from another column (6 + 10 = 16).
    sdf['bb'] = sdf['b'] + 10
    df = sdf.to_df()
    ddf = pandas.DataFrame(
        data=dict(a=[4.5], b=[6], c=[7], aa=[10], bb=[16]))
    self.assertEqualDataFrame(df, ddf)
class MyStdout(object):
    """Stream wrapper that passes text through ``colorize`` before writing.

    :param term: stream receiving the colorized text; the default is the
        object ``sys.stdout`` refers to when the class is defined
    """

    def __init__(self, term=sys.stdout):
        self.term = term

    def write(self, text):
        """Colorize *text* and write it to the wrapped stream."""
        text = colorize(text)
        self.term.write(text)

    def flush(self):
        """Flush the wrapped stream.

        The script below calls ``sys.stdout.flush()`` after writing a
        partial line; that call lands here and has to reach the wrapped
        stream to have any effect.
        """
        self.term.flush()


# From here on, everything printed goes through MyStdout.
sys.stdout = MyStdout()

from pandas_streaming.df import StreamingDataFrame

sdf = StreamingDataFrame.read_df(data)
# NOTE(review): the loop variable rebinds the module-level name ``data``.
for data in sdf:
    print(data)
print("")

# Sum of the per-row occurrence counts of each word in the status column.
# NOTE(review): "inactive" contains "active", so such rows are also counted
# in ``active`` -- confirm this is intended.
active = df.status.str.count("active").sum()
inactive = df.status.str.count("inactive").sum()
invalid = df.status.str.count("invalid").sum()
print("\033[0;32mNumber of active SatNOGS: \033[0m", active)
print("\033[0;31mNumber of inactive SatNOGS: \033[0m", inactive)
print("\033[0;33mNumber of invalid SatNOGS: \033[0m", invalid)

# Countdown rewritten in place: "\r" moves back to the start of the line.
# NOTE(review): no delay is visible between iterations -- confirm whether
# a pause is intended here.
for remaining in range(300, 0, -1):
    sys.stdout.write("\r")
    sys.stdout.write("Refreshing in {:2d} seconds...".format(remaining))
    sys.stdout.flush()