def test_groupby(self):
        """Compare the in-memory streaming ``groupby`` with plain pandas.

        Covers ``sum``, a two-function ``agg``, ``count`` and a call made
        without any aggregation function (compared against pandas ``sum``),
        and checks that ``in_memory=False`` raises ``NotImplementedError``.
        """
        def sum_twice(grouped):
            # Do not replace lambda c:sum(c) by sum or...
            # pandas.core.base.SpecificationError: Function names must be
            # unique, found multiple named sum
            return grouped.agg([numpy.sum, lambda c: sum(c)])

        frame = dummy_streaming_dataframe(20).to_dataframe()
        frame["key"] = frame["cint"].apply(lambda i: i % 3 == 0)
        stream = StreamingDataFrame.read_df(frame, chunksize=5)

        # Sum per group.
        streamed = stream.groupby("key", lambda gr: gr.sum())
        expected = frame.groupby("key").sum()
        self.assertEqualDataFrame(streamed, expected)

        # The out-of-memory variant is expected to be rejected.
        self.assertRaise(
            lambda: stream.groupby("key", in_memory=False),
            NotImplementedError)

        # Two aggregations at once.
        expected = sum_twice(frame.groupby("key"))
        streamed = stream.groupby("key", sum_twice)
        self.assertEqualDataFrame(streamed, expected)

        # Count per group.
        streamed = stream.groupby("key", lambda gr: gr.count())
        expected = frame.groupby("key").count()
        self.assertEqualDataFrame(streamed, expected)

        # No aggregation function given: the result is compared to a sum.
        small = pandas.DataFrame(dict(A=[3, 4, 3], B=[5, 6, 7]))
        small_stream = StreamingDataFrame.read_df(small)
        streamed = small_stream.groupby("A")
        expected = small.groupby("A").sum()
        self.assertEqualDataFrame(streamed, expected)
 def test_read_csv(self):
     """Round-trip a small frame through CSV files and StreamingDataFrame.

     Writes the frame with and without its index, reads both files back
     as streams, dumps them again (to a string and to a third file) and
     compares the text with the files pandas wrote.  Also checks that
     ``chunksize=None`` and ``iterator=False`` raise ``ValueError``.
     """
     def slurp(path):
         # Whole content of a UTF-8 text file.
         with open(path, "r", encoding='utf-8') as handle:
             return handle.read()

     folder = get_temp_folder(__file__, "temp_read_csv")
     frame = pandas.DataFrame(data=dict(a=[5, 6], b=["er", "r"]))
     plain_csv, indexed_csv, rewritten_csv = (
         os.path.join(folder, fname)
         for fname in ("df.csv", "df2.csv", "df3.csv"))
     frame.to_csv(plain_csv, index=False)
     frame.to_csv(indexed_csv, index=True)

     stream = StreamingDataFrame.read_csv(plain_csv)
     dumped = stream.to_csv(index=False)

     # Options that would disable streaming are expected to be refused.
     for rejected in (dict(chunksize=None), dict(iterator=False)):
         self.assertRaise(
             lambda: StreamingDataFrame.read_csv(
                 indexed_csv, index_col=0, **rejected),
             ValueError)

     indexed_stream = StreamingDataFrame.read_csv(indexed_csv, index_col=0)
     dumped_indexed = indexed_stream.to_csv(index=True)
     indexed_stream.to_csv(rewritten_csv, index=True)

     expected_plain = slurp(plain_csv)
     expected_indexed = slurp(indexed_csv)
     reread = slurp(rewritten_csv)

     # Carriage returns are stripped before comparing the text.
     self.assertEqual(dumped.replace('\r', ''), expected_plain)
     from_frame = StreamingDataFrame.read_df(frame)
     self.assertEqualDataFrame(
         stream.to_dataframe(), from_frame.to_dataframe())
     self.assertEqual(dumped_indexed.replace('\r', ''), expected_indexed)
     self.assertEqual(
         reread.replace('\r', '').replace('\n\n', '\n'),
         expected_indexed.replace('\r', ''))
 def test_set_item_function(self):
     """Assign a column computed with ``apply`` on a streamed column."""
     source = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7]))
     # Calling the constructor directly on a DataFrame must raise TypeError.
     self.assertRaise(lambda: StreamingDataFrame(source), TypeError)

     stream = StreamingDataFrame.read_df(source)
     stream['bb'] = stream['b'].apply(lambda x: x + 11)
     materialized = stream.to_df()

     expected = pandas.DataFrame(
         data=dict(a=[4.5], b=[6], c=[7], bb=[17]))
     self.assertEqualDataFrame(materialized, expected)
    def test_add_column(self):
        """Check ``add_column`` with a callable, a constant and a row function.

        The reference frame is extended step by step; each streamed result
        is materialized before the reference gets its matching column.
        """
        reference = pandas.DataFrame(
            data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"]))

        # Column produced by a function that ignores the row.
        with_d = StreamingDataFrame.read_df(reference).add_column(
            "d", lambda row: int(1))
        got = with_d.to_dataframe()
        reference["d"] = 1
        self.assertEqualDataFrame(reference, got)

        # Column produced from a constant.
        with_dd = StreamingDataFrame.read_df(reference).add_column("dd", 2)
        got = with_dd.to_dataframe()
        reference["dd"] = 2
        self.assertEqualDataFrame(reference, got)

        # Column derived from another column of the same row.
        with_dd12 = StreamingDataFrame.read_df(reference).add_column(
            "dd12", lambda row: row["dd"] + 10)
        got = with_dd12.to_dataframe()
        reference["dd12"] = 12
        self.assertEqualDataFrame(reference, got)
 def test_groupby_streaming(self):
     """Check ``groupby_streaming`` with ``strategy='streaming'``.

     The partial results are concatenated and summed again per key, then
     compared with a single pandas ``groupby(...).sum()``.
     """
     frame = dummy_streaming_dataframe(20).to_dataframe()
     frame["key"] = frame["cint"].apply(lambda i: i % 3 == 0)
     stream = StreamingDataFrame.read_df(frame, chunksize=5)

     partials = stream.groupby_streaming(
         "key", lambda gr: gr.sum(), strategy='streaming', as_index=False)
     expected = frame.groupby("key", as_index=False).sum()

     pieces = list(partials)
     stacked = pandas.concat(pieces)
     combined = stacked.groupby("key", as_index=False).sum()
     self.assertEqualDataFrame(combined, expected)
 def test_merge_2(self):
     """Concatenate a stream with itself, then outer-merge it with a frame."""
     def canonical(frame):
         # Row order is not compared: sort and renumber before the check.
         return frame.sort_values(["X", "Y"]).reset_index(drop=True)

     base = pandas.DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"]))
     doubled = pandas.concat([base, base])
     stream = StreamingDataFrame.read_df(base)
     doubled_stream = stream.concat(stream, axis=0)

     # The concatenated stream is materialized and checked twice in a row.
     for _ in range(2):
         self.assertEqualDataFrame(doubled, doubled_stream.to_dataframe())

     lookup = pandas.DataFrame(dict(Y=["a", "b"], Z=[10, 20]))
     join_args = dict(left_on="Y", right_on="Y", how="outer")
     expected = doubled.merge(lookup, **join_args)
     streamed = doubled_stream.merge(lookup, **join_args)
     self.assertEqualDataFrame(
         canonical(expected), canonical(streamed.to_dataframe()))
 def test_groupby_cum_asindex(self):
     """Check ``groupby_streaming`` with ``strategy='cum'`` and an index.

     Every partial result must expose the same columns as the pandas
     result, and the last partial must equal it.
     """
     frame = dummy_streaming_dataframe(20).to_dataframe()
     frame["key"] = frame["cint"].apply(lambda i: i % 3 == 0)
     stream = StreamingDataFrame.read_df(frame, chunksize=5)

     partials = stream.groupby_streaming(
         "key", lambda gr: gr.sum(), strategy='cum', as_index=True)
     expected = frame.groupby("key", as_index=True).sum()
     expected_columns = list(expected.columns)

     final = None
     for partial in partials:
         self.assertEqual(list(partial.columns), expected_columns)
         final = partial
     self.assertEqualDataFrame(final, expected)
 def test_sort_values_reverse(self):
     """Sort a chunked stream in descending order and compare with pandas."""
     folder = get_temp_folder(__file__, "temp_sort_values_reverse")
     scratch = os.path.join(folder, "_data_")

     # Some rows deliberately omit "c" or "ind".
     rows = [
         dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
         dict(a=5, b="f", c=5.7, ind="a2", ai=2),
         dict(a=4, b="g", ind="a3", ai=3),
         dict(a=8, b="h", c=5.9, ai=4),
         dict(a=16, b="i", c=6.2, ind="a5", ai=5),
     ]
     frame = pandas.DataFrame(rows)
     stream = StreamingDataFrame.read_df(frame, chunksize=2)

     expected = frame.sort_values(by="a", ascending=False)
     streamed = stream.sort_values(
         by="a", temp_file=scratch, ascending=False)
     self.assertEqualDataFrame(expected, streamed.to_df())
    def test_train_test_split_streaming_tiny(self):
        """Check that ``head`` is repeatable on both halves of a tiny split.

        The same check runs on a stream built from a concatenated frame
        and on a stream concatenated with itself.
        """
        def check_repeatable_head(part):
            # Two consecutive calls to head() must return the same rows;
            # the comparison is skipped only when both calls return None.
            first = part.head()
            second = part.head()
            if not (first is None and second is None):
                self.assertEqualDataFrame(first, second)

        def check_split(stream):
            train, test = stream.train_test_split(test_size=0.5)
            check_repeatable_head(test)
            check_repeatable_head(train)

        base = pandas.DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"]))

        check_split(StreamingDataFrame.read_df(pandas.concat([base, base])))

        single = StreamingDataFrame.read_df(base)
        check_split(single.concat(single, axis=0))
    def test_fillna(self):
        """Check ``StreamingDataFrame.fillna`` with a per-column value mapping.

        Two cases are covered: a mapping that names both columns, and a
        mapping that names only ``X`` (missing values in ``Y`` must then
        stay missing).
        """
        df = pandas.DataFrame(
            data=dict(X=[4.5, numpy.nan, 7], Y=["a", "b", numpy.nan]))
        sdf = StreamingDataFrame.read_df(df)

        # Both columns receive a replacement value.
        df2 = pandas.DataFrame(
            data=dict(X=[4.5, 10.0, 7], Y=["a", "b", "NAN"]))
        na = sdf.fillna(value=dict(X=10.0, Y="NAN"))
        ndf = na.to_df()
        # DataFrames are compared with the dedicated helper, as in the other
        # tests: a plain assertEqual evaluates ``not (a == b)`` and the truth
        # value of a DataFrame is ambiguous.
        self.assertEqualDataFrame(ndf, df2)

        # Only X is filled; the missing value in Y is kept.
        df3 = pandas.DataFrame(
            data=dict(X=[4.5, 10.0, 7], Y=["a", "b", numpy.nan]))
        na = sdf.fillna(value=dict(X=10.0))
        ndf = na.to_df()
        self.assertEqualDataFrame(ndf, df3)
    def test_describe(self):
        """Check ``describe`` on a stream with float, int and string columns.

        Only the numeric columns ``X`` and ``Y`` are expected in the result.
        """
        indices = numpy.arange(100001)
        floats = indices.astype(numpy.float64) / 100000 - 0.5
        ints = numpy.arange(100001).astype(numpy.int64)
        letters = numpy.array([chr(65 + j % 45) for j in ints])
        frame = pandas.DataFrame(data=dict(X=floats, Y=ints, Z=letters))
        stream = StreamingDataFrame.read_df(frame)

        desc = stream.describe()
        self.assertEqual(['X', 'Y'], list(desc.columns))

        # Rows compared exactly, as plain lists.
        for row, bounds in (('min', [-0.5, 0]), ('max', [0.5, 100000])):
            self.assertEqual(desc.loc[row, :].tolist(), bounds)

        # Rows compared as arrays.
        array_rows = (
            ('mean', [0, 50000]),
            ('25%', [-0.25, 25000]),
            ('50%', [0.0, 50000]),
            ('75%', [0.25, 75000]),
        )
        for row, values in array_rows:
            self.assertEqualArray(desc.loc[row, :], numpy.array(values))

        # The standard deviation is compared with a tolerance.
        expected_std = numpy.array([2.886795e-01, 28867.946472])
        self.assertEqualArray(desc.loc['std', :], expected_std, decimal=4)
    def test_set_item(self):
        """Check column assignment on a stream.

        A list of column names as key must raise ``ValueError``, a list as
        value must raise ``NotImplementedError``; a constant and an
        expression built from another column must both be accepted.
        """
        source = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7]))
        self.assertRaise(lambda: StreamingDataFrame(source), TypeError)
        stream = StreamingDataFrame.read_df(source)

        def assign_with_list_key():
            stream[['a']] = 10

        def assign_list_value():
            stream['a'] = [10]

        self.assertRaise(assign_with_list_key, ValueError)
        self.assertRaise(assign_list_value, NotImplementedError)

        # Constant column.
        stream['aa'] = 10
        materialized = stream.to_df()
        expected = pandas.DataFrame(
            data=dict(a=[4.5], b=[6], c=[7], aa=[10]))
        self.assertEqualDataFrame(materialized, expected)

        # Column computed from an existing one.
        stream['bb'] = stream['b'] + 10
        materialized = stream.to_df()
        expected = pandas.DataFrame(
            data=dict(a=[4.5], b=[6], c=[7], aa=[10], bb=[16]))
        self.assertEqualDataFrame(materialized, expected)
    class MyStdout(object):
        """File-like wrapper that colorizes text before writing it out.

        :param term: underlying stream receiving the colorized text.  The
            default, ``sys.stdout``, is evaluated once when the class body
            runs, so it refers to the stream that was installed at that
            moment even if ``sys.stdout`` is replaced afterwards.
        """

        def __init__(self, term=sys.stdout):
            self.term = term

        def write(self, text):
            """Pass *text* through ``colorize`` and write it to ``term``."""
            # ``colorize`` is not defined in this class; it must be
            # available in the enclosing scope.
            self.term.write(colorize(text))

        def flush(self):
            """Flush the wrapped stream.

            Delegating matters for callers that write partial lines (for
            instance a ``"\\r"``-based progress line) and then call
            ``flush``: a no-op here would leave the text in the wrapped
            stream's buffer.  Streams without a ``flush`` method are
            tolerated.
            """
            flush = getattr(self.term, "flush", None)
            if flush is not None:
                flush()
    # Route everything printed from here on through the colorizing wrapper.
    sys.stdout = MyStdout()

    from pandas_streaming.df import StreamingDataFrame

    # NOTE(review): `data` is not defined in the visible code; it must be
    # bound before this point — confirm against the surrounding script.
    sdf = StreamingDataFrame.read_df(data)

    # The loop variable rebinds `data` to each item yielded by the stream.
    for data in sdf:
        print(data)
        print("")
        # NOTE(review): the counts are taken from `df`, which is not defined
        # in the visible code, rather than from the current chunk `data` —
        # confirm this is intended.
        # Series.str.count treats its argument as a regular expression and
        # counts matches inside each string, so a bare "active" would also
        # match within "inactive"; word boundaries keep the totals apart.
        active = df.status.str.count(r"\bactive\b").sum()
        inactive = df.status.str.count("inactive").sum()
        invalid = df.status.str.count("invalid").sum()
        # ANSI colour codes: 32 green, 31 red, 33 yellow; 0m resets.
        print("\033[0;32mNumber of active SatNOGS: \033[0m", active)
        print("\033[0;31mNumber of inactive SatNOGS: \033[0m", inactive)
        print("\033[0;33mNumber of invalid SatNOGS: \033[0m", invalid)

    for remaining in range(300, 0, -1):
        sys.stdout.write("\r")
        sys.stdout.write("Refreshing in {:2d} seconds...".format(remaining))
        sys.stdout.flush()