Example #1
    def test_concath(self):
        sdf20 = dummy_streaming_dataframe(20)
        sdf30 = dummy_streaming_dataframe(20)
        df20 = sdf20.to_dataframe()
        df30 = sdf30.to_dataframe()
        df = pandas.concat([df20, df30], axis=1)

        m1 = sdf20.concat(sdf30, axis=1)
        self.assertEqualDataFrame(m1.to_dataframe(), df)
        sdf22 = dummy_streaming_dataframe(22)
        sdf25 = dummy_streaming_dataframe(25)
        self.assertRaise(lambda: sdf22.concat(sdf25, axis=1).to_dataframe(),
                         RuntimeError)
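
The later examples build streams with StreamingDataFrame.read_df, which wraps an in-memory pandas frame as a sequence of fixed-size chunks. A minimal sketch of that round trip, assuming pandas_streaming's usual import path pandas_streaming.df and reusing only calls that already appear in these tests (read_df, to_dataframe):

    import pandas
    from pandas_streaming.df import StreamingDataFrame

    # Two columns mirroring dummy_streaming_dataframe: an int column and a string column.
    df = pandas.DataFrame(dict(cint=list(range(10)),
                               cstr=["s%d" % i for i in range(10)]))
    sdf = StreamingDataFrame.read_df(df, chunksize=3)   # iterate the frame in 3-row chunks
    assert sdf.to_dataframe().shape == (10, 2)          # concatenating the chunks restores the data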
Example #2
    def test_groupby(self):
        df20 = dummy_streaming_dataframe(20).to_dataframe()
        df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0)
        sdf20 = StreamingDataFrame.read_df(df20, chunksize=5)
        gr = sdf20.groupby("key", lambda gr: gr.sum())
        gr2 = df20.groupby("key").sum()
        self.assertEqualDataFrame(gr, gr2)
        self.assertRaise(lambda: sdf20.groupby("key", in_memory=False),
                         NotImplementedError)

        # Do not replace lambda c: sum(c) by the builtin sum, otherwise pandas raises
        # pandas.core.base.SpecificationError: Function names must be unique, found multiple named sum
        gr2 = df20.groupby("key").agg([numpy.sum, lambda c: sum(c)])
        gr = sdf20.groupby("key",
                           lambda gr: gr.agg([numpy.sum, lambda c: sum(c)]))
        self.assertEqualDataFrame(gr, gr2)

        gr = sdf20.groupby("key", lambda gr: gr.count())
        gr2 = df20.groupby("key").count()
        self.assertEqualDataFrame(gr, gr2)

        df = pandas.DataFrame(dict(A=[3, 4, 3], B=[5, 6, 7]))
        sdf = StreamingDataFrame.read_df(df)
        gr = sdf.groupby("A")
        gr2 = df.groupby("A").sum()
        self.assertEqualDataFrame(gr, gr2)
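
The comment in test_groupby points at a plain-pandas pitfall: numpy.sum and the builtin sum both expose the name "sum", so passing both to agg makes pandas reject the specification, while a lambda keeps the distinct name "<lambda>". A small sketch of that behaviour (the exact exception class and message depend on the pandas version):

    import numpy
    import pandas

    df = pandas.DataFrame(dict(key=[0, 0, 1], cint=[1, 2, 3]))
    try:
        # Both aggregators are named "sum" -> duplicate function names are rejected.
        df.groupby("key").agg([numpy.sum, sum])
    except Exception as exc:
        print(type(exc).__name__, exc)
    # The lambda is named "<lambda>", so the names stay unique and the call succeeds.
    df.groupby("key").agg([numpy.sum, lambda c: sum(c)])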
Example #3
 def test_train_test_split_streaming_strat(self):
     sdf = dummy_streaming_dataframe(
         100,
         asfloat=True,
         tify=["t1" if i % 3 else "t0" for i in range(0, 100)])
     trsdf, tesdf = sdf.train_test_split(streaming=True,
                                         unique_rows=True,
                                         stratify="tify")
     trdf = trsdf.to_dataframe()
     tedf = tesdf.to_dataframe()
     df_exp = sdf.to_dataframe()
     df_val = pandas.concat([trdf, tedf])
     self.assertEqual(df_exp.shape, df_val.shape)
     df_val = df_val.sort_values("cfloat").reset_index(drop=True)
     self.assertEqualDataFrame(df_val, df_exp)
     trdf = trsdf.to_dataframe()
     tedf = tesdf.to_dataframe()
     df_val = pandas.concat([trdf, tedf])
     self.assertEqual(df_exp.shape, df_val.shape)
     df_val = df_val.sort_values("cfloat").reset_index(drop=True)
     self.assertEqualDataFrame(df_val, df_exp)
     trgr = trdf.groupby("tify").count()
     trgr["part"] = 0
     tegr = tedf.groupby("tify").count()
     tegr["part"] = 1
     gr = pandas.concat([trgr, tegr])
     self.assertGreater(gr['cfloat'].min(), 4)
Example #4
 def test_head(self):
     sdf = dummy_streaming_dataframe(100)
     st = sdf.head()
     self.assertEqual(st.shape, (5, 2))
     st = sdf.head(n=20)
     self.assertEqual(st.shape, (20, 2))
     st = sdf.head(n=20)
     self.assertEqual(st.shape, (20, 2))
Example #5
 def test_sample(self):
     sdf = dummy_streaming_dataframe(100)
     res = sdf.sample(frac=0.1)
     self.assertLesser(res.shape[0], 30)
     self.assertRaise(lambda: sdf.sample(n=5), ValueError)
     res = sdf.sample(frac=0.1)
     self.assertLesser(res.shape[0], 30)
     self.assertRaise(lambda: sdf.sample(n=5), ValueError)
Example #6
 def test_getitem(self):
     sdf = dummy_streaming_dataframe(100)
     sdf2 = sdf[["cint"]]
     self.assertEqual(sdf2.shape, (100, 1))
     df1 = sdf.to_df()
     df2 = sdf2.to_df()
     self.assertEqualDataFrame(df1[["cint"]], df2)
     self.assertRaise(lambda: sdf["cint"], NotImplementedError)
     self.assertRaise(lambda: sdf[:, "cint"], NotImplementedError)
Example #7
 def test_apply(self):
     sdf = dummy_streaming_dataframe(100)
     self.assertNotEmpty(list(sdf))
     sdf = sdf.applymap(str)
     self.assertNotEmpty(list(sdf))
     sdf = sdf.apply(lambda row: row[["cint"]] + "r", axis=1)
     self.assertNotEmpty(list(sdf))
     text = sdf.to_csv(header=False)
     self.assertStartsWith("0,0r\n1,1r\n2,2r\n3,3r", text)
Example #8
 def test_shape(self):
     sdf = dummy_streaming_dataframe(100)
     dfs = [df for df in sdf]
     self.assertEqual(len(dfs), 10)
     self.assertEqual(len(dfs), 10)
     shape = sdf.shape
     self.assertEqual(shape, (100, 2))
     self.assertRaise(lambda: sdf.sort_values("r"),
                      StreamingInefficientException)
Example #9
 def test_sample_cache(self):
     sdf = dummy_streaming_dataframe(100)
     res = sdf.sample(frac=0.1, cache=True)
     df1 = res.to_df()
     df2 = res.to_df()
     self.assertEqualDataFrame(df1, df2)
     self.assertTrue(res.is_stable(n=df1.shape[0], do_check=True))
     self.assertTrue(res.is_stable(n=df1.shape[0], do_check=False))
     res = sdf.sample(frac=0.1, cache=False)
     self.assertFalse(res.is_stable(n=df1.shape[0], do_check=False))
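
With cache=True the sampled rows are materialized once, so iterating the result again (and is_stable with do_check=True) sees the same data; with cache=False each pass over the stream draws a new sample, which is why is_stable then returns False.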
Example #10
 def test_train_test_split(self):
     sdf = dummy_streaming_dataframe(100)
     tr, te = sdf.train_test_split(index=False, streaming=False)
     trsdf = StreamingDataFrame.read_str(tr)
     tesdf = StreamingDataFrame.read_str(te)
     trdf = trsdf.to_dataframe()
     tedf = tesdf.to_dataframe()
     df_exp = sdf.to_dataframe()
     df_val = pandas.concat([trdf, tedf])
     self.assertEqual(df_exp.shape, df_val.shape)
     df_val = df_val.sort_values("cint").reset_index(drop=True)
     self.assertEqualDataFrame(df_val, df_exp)
Example #11
 def test_sample_reservoir_cache(self):
     sdf = dummy_streaming_dataframe(100)
     res = sdf.sample(n=10, cache=True, reservoir=True)
     df1 = res.to_df()
     df2 = res.to_df()
     self.assertEqualDataFrame(df1, df2)
     self.assertEqual(df1.shape, (10, res.shape[1]))
     self.assertRaise(lambda: sdf.sample(n=10, cache=False, reservoir=True),
                      ValueError)
     self.assertRaise(
         lambda: sdf.sample(frac=0.1, cache=True, reservoir=True),
         ValueError)
Example #12
 def test_where(self):
     sdf = dummy_streaming_dataframe(100)
     cols = sdf.columns
     self.assertEqual(list(cols), ['cint', 'cstr'])
     dts = sdf.dtypes
     self.assertEqual(len(dts), 2)
     res = sdf.where(lambda row: row["cint"] == 1)
     st = res.to_csv()
     self.assertStartsWith(",cint,cstr\n0,,\n1,1.0,s1", st)
     res = sdf.where(lambda row: row["cint"] == 1)
     st = res.to_csv()
     self.assertStartsWith(",cint,cstr\n0,,\n1,1.0,s1", st)
Example #13
 def test_groupby_streaming(self):
     df20 = dummy_streaming_dataframe(20).to_dataframe()
     df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0)
     sdf20 = StreamingDataFrame.read_df(df20, chunksize=5)
     sgr = sdf20.groupby_streaming("key",
                                   lambda gr: gr.sum(),
                                   strategy='streaming',
                                   as_index=False)
     gr2 = df20.groupby("key", as_index=False).sum()
     grs = [gr for gr in sgr]
     gr = pandas.concat(grs).groupby("key", as_index=False).sum()
     self.assertEqualDataFrame(gr, gr2)
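
With strategy='streaming', each frame yielded by groupby_streaming aggregates a single chunk, which is why the test concatenates the pieces and groups again. Re-aggregating partial sums is exact because sum is associative; the same pattern in plain pandas:

    import pandas

    chunks = [pandas.DataFrame(dict(key=[True, False], cint=[1, 2])),
              pandas.DataFrame(dict(key=[True, False], cint=[3, 4]))]
    partial = [c.groupby("key", as_index=False).sum() for c in chunks]   # one frame per chunk
    total = pandas.concat(partial).groupby("key", as_index=False).sum()  # re-aggregate
    # total matches pandas.concat(chunks).groupby("key", as_index=False).sum()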
Example #14
    def test_concatv(self):
        sdf20 = dummy_streaming_dataframe(20)
        sdf30 = dummy_streaming_dataframe(30)
        df20 = sdf20.to_dataframe()
        df30 = sdf30.to_dataframe()
        df = pandas.concat([df20, df30], axis=0)

        m1 = sdf20.concat(sdf30, axis=0)
        self.assertEqualDataFrame(m1.to_dataframe(), df)
        m1 = sdf20.concat(df30, axis=0)
        self.assertEqualDataFrame(m1.to_dataframe(), df)
        m1 = sdf20.concat(map(lambda x: x, [df30]), axis=0)
        self.assertEqualDataFrame(m1.to_dataframe(), df)
        m1 = sdf20.concat(map(lambda x: x, [df30]), axis=0)
        self.assertEqualDataFrame(m1.to_dataframe(), df)

        df30["g"] = 4
        self.assertRaise(lambda: sdf20.concat(df30).to_dataframe(), ValueError,
                         "Frame others[0] do not have the same column names")
        df20["cint"] = df20["cint"].astype(float)
        self.assertRaise(lambda: sdf20.concat(df20).to_dataframe(), ValueError,
                         "Frame others[0] do not have the same column types")
Example #15
 def test_groupby_cum_asindex(self):
     df20 = dummy_streaming_dataframe(20).to_dataframe()
     df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0)
     sdf20 = StreamingDataFrame.read_df(df20, chunksize=5)
     sgr = sdf20.groupby_streaming("key",
                                   lambda gr: gr.sum(),
                                   strategy='cum',
                                   as_index=True)
     gr2 = df20.groupby("key", as_index=True).sum()
     lastgr = None
     for gr in sgr:
         self.assertEqual(list(gr.columns), list(gr2.columns))
         lastgr = gr
     self.assertEqualDataFrame(lastgr, gr2)
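
With strategy='cum', every frame yielded by groupby_streaming covers all chunks seen so far, so the last one is the aggregate of the whole stream; that is what the final comparison of lastgr against the in-memory groupby checks.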
Example #16
    def test_merge(self):
        def compares(a, b, how):
            m = a.merge(b, on="cint", indicator=True, how=how)
            dm = m.to_dataframe()
            da = a.to_dataframe()
            db = b.to_dataframe()
            exp = da.merge(db, on="cint", indicator=True, how=how)
            self.assertEqualDataFrame(dm.reset_index(drop=True),
                                      exp.reset_index(drop=True))

        sdf20 = dummy_streaming_dataframe(20)
        sdf30 = dummy_streaming_dataframe(30)
        # itself
        hows = "inner left right outer".split()
        for how in hows:
            compares(sdf20, sdf20, how)
            compares(sdf20, sdf20, how)
        for how in hows:
            compares(sdf20, sdf30, how)
            compares(sdf20, sdf30, how)
        for how in hows:
            compares(sdf30, sdf20, how)
            compares(sdf30, sdf20, how)
        sdf20.merge(sdf20.to_dataframe(), on="cint", indicator=True)
Example #17
 def test_train_test_split_file_pattern(self):
     temp = get_temp_folder(__file__, "temp_train_test_split_file_pattern")
     sdf = dummy_streaming_dataframe(100)
     names = os.path.join(temp, "spl_{0}.txt")
     self.assertRaise(
         lambda: sdf.train_test_split(names, index=False, streaming=False),
         ValueError)
     names = os.path.join(temp, "spl_{}.txt")
     tr, te = sdf.train_test_split(names, index=False, streaming=False)
     trsdf = StreamingDataFrame.read_csv(tr)
     tesdf = StreamingDataFrame.read_csv(te)
     trdf = trsdf.to_dataframe()
     tedf = tesdf.to_dataframe()
     df_exp = sdf.to_dataframe()
     df_val = pandas.concat([trdf, tedf])
     self.assertEqual(df_exp.shape, df_val.shape)
     df_val = df_val.sort_values("cint").reset_index(drop=True)
     self.assertEqualDataFrame(df_val, df_exp)
Example #18
 def test_train_test_split_file(self):
     temp = get_temp_folder(__file__, "temp_train_test_split_file")
     names = [
         os.path.join(temp, "train.txt"),
         os.path.join(temp, "test.txt")
     ]
     sdf = dummy_streaming_dataframe(100)
     sdf.train_test_split(names, index=False, streaming=False)
     trsdf = StreamingDataFrame.read_csv(names[0])
     tesdf = StreamingDataFrame.read_csv(names[1])
     self.assertGreater(trsdf.shape[0], 20)
     self.assertGreater(tesdf.shape[0], 20)
     trdf = trsdf.to_dataframe()
     tedf = tesdf.to_dataframe()
     self.assertGreater(trdf.shape[0], 20)
     self.assertGreater(tedf.shape[0], 20)
     df_exp = sdf.to_dataframe()
     df_val = pandas.concat([trdf, tedf])
     self.assertEqual(df_exp.shape, df_val.shape)
     df_val = df_val.sort_values("cint").reset_index(drop=True)
     self.assertEqualDataFrame(df_val, df_exp)
Example #19
 def test_train_test_split_streaming(self):
     sdf = dummy_streaming_dataframe(100, asfloat=True)
     trsdf, tesdf = sdf.train_test_split(streaming=True,
                                         unique_rows=True,
                                         partitions=[0.7, 0.3])
     trdf = trsdf.to_dataframe()
     tedf = tesdf.to_dataframe()
     df_exp = sdf.to_dataframe()
     df_val = pandas.concat([trdf, tedf])
     self.assertEqual(df_exp.shape, df_val.shape)
     df_val = df_val.sort_values("cfloat").reset_index(drop=True)
     self.assertEqualDataFrame(df_val, df_exp)
     trdf2 = trsdf.to_dataframe()
     tedf2 = tesdf.to_dataframe()
     df_val = pandas.concat([trdf2, tedf2])
     self.assertEqual(df_exp.shape, df_val.shape)
     df_val = df_val.sort_values("cfloat").reset_index(drop=True)
     self.assertEqualDataFrame(df_val, df_exp)
     self.assertEqual(trdf.shape, trdf2.shape)
     self.assertEqual(tedf.shape, tedf2.shape)
     self.assertGreater(trdf.shape[0], tedf.shape[0])
     self.assertGreater(trdf2.shape[0], tedf2.shape[0])
Example #20
 def test_tail(self):
     sdf = dummy_streaming_dataframe(100)
     st = sdf.tail()
     self.assertEqual(st.shape, (5, 2))
     st = sdf.tail(n=20)
     self.assertEqual(st.shape, (10, 2))
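
tail(n=20) returns only 10 rows here: dummy_streaming_dataframe(100) streams 10-row chunks (see Example #8), and tail presumably inspects only the last chunk, so it cannot return more rows than that chunk holds.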
Example #21
 def test_iterrows(self):
     sdf = dummy_streaming_dataframe(100)
     rows = list(sdf.iterrows())
     self.assertEqual(sdf.shape[0], len(rows))
     rows = list(sdf.iterrows())
     self.assertEqual(sdf.shape[0], len(rows))
Example #22
 def test_to_csv(self):
     sdf = dummy_streaming_dataframe(100)
     st = sdf.to_csv()
     self.assertStartsWith(",cint,cstr\n0,0,s0", st)
     st = sdf.to_csv()
     self.assertStartsWith(",cint,cstr\n0,0,s0", st)
Example #23
 def test_init(self):
     sdf = dummy_streaming_dataframe(100)
     df1 = sdf.to_df()
     sdf2 = StreamingDataFrame(sdf)
     df2 = sdf2.to_df()
     self.assertEqualDataFrame(df1, df2)
Example #24
 def test_dataframe(self):
     sdf = dummy_streaming_dataframe(100)
     df = sdf.to_dataframe()
     self.assertEqual(df.shape, (100, 2))