def test_groupby(self):
        df20 = dummy_streaming_dataframe(20).to_dataframe()
        df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0)
        sdf20 = StreamingDataFrame.read_df(df20, chunksize=5)
        gr = sdf20.groupby("key", lambda gr: gr.sum())
        gr2 = df20.groupby("key").sum()
        self.assertEqualDataFrame(gr, gr2)
        self.assertRaise(lambda: sdf20.groupby(
            "key", in_memory=False), NotImplementedError)

        # Do not replace lambda c: sum(c) by sum, otherwise pandas raises
        # pandas.core.base.SpecificationError: Function names must be unique,
        # found multiple named sum (see the short demo after this test).
        gr2 = df20.groupby("key").agg([numpy.sum, lambda c: sum(c)])
        gr = sdf20.groupby("key", lambda gr: gr.agg(
            [numpy.sum, lambda c: sum(c)]))
        self.assertEqualDataFrame(gr, gr2)

        gr = sdf20.groupby("key", lambda gr: gr.count())
        gr2 = df20.groupby("key").count()
        self.assertEqualDataFrame(gr, gr2)

        df = pandas.DataFrame(dict(A=[3, 4, 3], B=[5, 6, 7]))
        sdf = StreamingDataFrame.read_df(df)
        gr = sdf.groupby("A")
        gr2 = df.groupby("A").sum()
        self.assertEqualDataFrame(gr, gr2)
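
A minimal standalone demo of the pitfall mentioned in the comment above (hypothetical data, not part of the test): pandas rejects an aggregation list in which two callables both carry the name sum, while a lambda keeps the names unique.

import numpy
import pandas

df = pandas.DataFrame(dict(key=[0, 0, 1], x=[1, 2, 3]))
try:
    # both callables are named 'sum', so pandas refuses the aggregation
    df.groupby("key").agg([numpy.sum, sum])
except Exception as e:  # pandas.core.base.SpecificationError
    print(type(e).__name__, e)
# the lambda is named '<lambda>', so the two names stay distinct
print(df.groupby("key").agg([numpy.sum, lambda c: sum(c)]))
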
 def test_read_csv(self):
     temp = get_temp_folder(__file__, "temp_read_csv")
     df = pandas.DataFrame(data=dict(a=[5, 6], b=["er", "r"]))
     name = os.path.join(temp, "df.csv")
     name2 = os.path.join(temp, "df2.csv")
     name3 = os.path.join(temp, "df3.csv")
     df.to_csv(name, index=False)
     df.to_csv(name2, index=True)
     sdf = StreamingDataFrame.read_csv(name)
     text = sdf.to_csv(index=False)
     self.assertRaise(
         lambda: StreamingDataFrame.read_csv(
             name2, index_col=0, chunksize=None),
         ValueError)
     self.assertRaise(
         lambda: StreamingDataFrame.read_csv(
             name2, index_col=0, iterator=False),
         ValueError)
     sdf2 = StreamingDataFrame.read_csv(name2, index_col=0)
     text2 = sdf2.to_csv(index=True)
     sdf2.to_csv(name3, index=True)
     with open(name, "r", encoding='utf-8') as f:
         exp = f.read()
     with open(name2, "r", encoding='utf-8') as f:
         exp2 = f.read()
     with open(name3, "r", encoding='utf-8') as f:
         text3 = f.read()
     self.assertEqual(text.replace('\r', ''), exp)
     sdf2 = StreamingDataFrame.read_df(df)
     self.assertEqualDataFrame(sdf.to_dataframe(), sdf2.to_dataframe())
     self.assertEqual(text2.replace('\r', ''), exp2)
     self.assertEqual(text3.replace('\r', '').replace('\n\n', '\n'),
                      exp2.replace('\r', ''))
 def test_read_json_raw(self):
     data = [{
         'id': 1,
         'name': {
             'first': 'Coleen',
             'last': 'Volk'
         }
     }, {
         'name': {
             'given': 'Mose',
             'family': 'Regner'
         }
     }, {
         'id': 2,
         'name': 'FayeRaker'
     }]
     exp = """[{"id":1.0,"name":null,"name.family":null,"name.first":"Coleen","name.given":null,"name.last":"Volk"},
             {"id":null,"name":null,"name.family":"Regner","name.first":null,"name.given":"Mose","name.last":null},
             {"id":2.0,"name":"FayeRaker","name.family":null,"name.first":null,
             "name.given":null,"name.last":null}]""".replace(" ",
                                                             "").replace(
                                                                 "\n", "")
     self.assertRaise(lambda: StreamingDataFrame.read_json(data),
                      NotImplementedError)
     it = StreamingDataFrame.read_json(data, flatten=True)
     dfs = list(it)
     self.assertEqual(len(dfs), 1)
     js = dfs[0].to_json(orient='records')
     js_read = loads(js)
     js_exp = loads(exp)
     self.assertEqual(js_exp, js_read)
Example #4
def json_to_dataframe_streaming(js, chunksize=100000, flatten=False, **kwargs):
    """
    Converts a big json dump (from @see fn convert_trace_to_json)
    to a dataframe. The function processes the data by streaming to avoid
    loading huge data in memory.
    Returns an iterator on dataframes.
    The function relies on :epkg:`pandas_streaming`.

    :param js: a filename or a stream containing json
        (a plain json string is not supported by the streaming version)
    :param chunksize:
        see :func:`pandas_streaming.df.StreamingDataFrame.read_json`
    :param flatten:
        see :func:`pandas_streaming.df.StreamingDataFrame.read_json`
    :param kwargs:
        see :func:`pandas_streaming.df.StreamingDataFrame.read_json`
    :return: a :epkg:`StreamingDataFrame` (an iterator on dataframes)
    """
    from pandas_streaming.df import StreamingDataFrame  # pylint: disable=C0415
    if isinstance(js, str):
        if len(js) < 5000 and os.path.exists(js):
            sdf = StreamingDataFrame.read_json(
                js, chunksize=chunksize, flatten=flatten, **kwargs)
        else:
            raise RuntimeError(
                "Use a stream or the function json_to_dataframe "
                "instead of the streaming version.")
    else:
        sdf = StreamingDataFrame.read_json(
            js, chunksize=chunksize, flatten=flatten, **kwargs)

    sdf['ts_sec'] = sdf['ts'].apply(lambda t: t / 1e9)
    return sdf
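
A minimal usage sketch, assuming a hypothetical line-delimited dump trace.json produced by convert_trace_to_json and containing a ts column in nanoseconds; the result is consumed chunk by chunk, so the full file never has to fit in memory.

# 'trace.json' is a hypothetical path to an existing json dump
sdf = json_to_dataframe_streaming("trace.json", chunksize=100000)
for chunk in sdf:
    # each chunk is a regular pandas DataFrame with the derived 'ts_sec' column
    print(chunk[["ts", "ts_sec"]].head())
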
 def test_set_item_function(self):
     df = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7]))
     self.assertRaise(lambda: StreamingDataFrame(df), TypeError)
     sdf = StreamingDataFrame.read_df(df)
     sdf['bb'] = sdf['b'].apply(lambda x: x + 11)
     df = sdf.to_df()
      ddf = pandas.DataFrame(
          data=dict(a=[4.5], b=[6], c=[7], bb=[17]))
     self.assertEqualDataFrame(df, ddf)
 def test_train_test_split(self):
     sdf = dummy_streaming_dataframe(100)
     tr, te = sdf.train_test_split(index=False, streaming=False)
     trsdf = StreamingDataFrame.read_str(tr)
     tesdf = StreamingDataFrame.read_str(te)
     trdf = trsdf.to_dataframe()
     tedf = tesdf.to_dataframe()
     df_exp = sdf.to_dataframe()
     df_val = pandas.concat([trdf, tedf])
     self.assertEqual(df_exp.shape, df_val.shape)
     df_val = df_val.sort_values("cint").reset_index(drop=True)
     self.assertEqualDataFrame(df_val, df_exp)
 def test_schema_consistant(self):
     df = pandas.DataFrame([dict(cf=0, cint=0, cstr="0"), dict(cf=1, cint=1, cstr="1"),
                            dict(cf=2, cint="s2", cstr="2"), dict(cf=3, cint=3, cstr="3")])
     temp = get_temp_folder(__file__, "temp_schema_consistant")
     name = os.path.join(temp, "df.csv")
     stio = StringIO()
     df.to_csv(stio, index=False)
     self.assertNotEmpty(stio.getvalue())
     df.to_csv(name, index=False)
     self.assertEqual(df.shape, (4, 3))
     sdf = StreamingDataFrame.read_csv(name, chunksize=2)
     self.assertRaise(lambda: list(sdf), StreamingDataFrameSchemaError)
     sdf = StreamingDataFrame.read_csv(
         name, chunksize=2, check_schema=False)
     pieces = list(sdf)
     self.assertEqual(len(pieces), 2)
 def test_train_test_split_file_pattern(self):
     temp = get_temp_folder(__file__, "temp_train_test_split_file_pattern")
     sdf = dummy_streaming_dataframe(100)
     names = os.path.join(temp, "spl_{0}.txt")
     self.assertRaise(lambda: sdf.train_test_split(
         names, index=False, streaming=False), ValueError)
     names = os.path.join(temp, "spl_{}.txt")
     tr, te = sdf.train_test_split(names, index=False, streaming=False)
     trsdf = StreamingDataFrame.read_csv(tr)
     tesdf = StreamingDataFrame.read_csv(te)
     trdf = trsdf.to_dataframe()
     tedf = tesdf.to_dataframe()
     df_exp = sdf.to_dataframe()
     df_val = pandas.concat([trdf, tedf])
     self.assertEqual(df_exp.shape, df_val.shape)
     df_val = df_val.sort_values("cint").reset_index(drop=True)
     self.assertEqualDataFrame(df_val, df_exp)
Example #9
 def test_read_json_ijson(self):
     it = StreamingDataFrame.read_json(
         BytesIO(TestDataFrameIOHelpers.text_json))
     dfs = list(it)
     self.assertEqual(len(dfs), 1)
     js = dfs[0].to_json(orient='records', lines=True)
     jsjson = loads('[' + js.replace("\n", ",").strip(',') + ']')
     self.assertEqual(jsjson, TestDataFrameIOHelpers.text_json_exp)
Example #10
 def test_read_json_rows(self):
     data = '''{"a": 1, "b": 2}
               {"a": 3, "b": 4}'''
     it = StreamingDataFrame.read_json(StringIO(data), lines=True)
     dfs = list(it)
     self.assertEqual(len(dfs), 1)
     js = dfs[0].to_json(orient='records')
     self.assertEqual(js, '[{"a":1,"b":2},{"a":3,"b":4}]')
 def test_read_csv_names(self):
     this = os.path.abspath(os.path.dirname(__file__))
     data = os.path.join(this, "data", "buggy_hash2.csv")
     df = pandas.read_csv(data, sep="\t", names=[
                          "A", "B", "C"], header=None)
     sdf = StreamingDataFrame.read_csv(
         data, sep="\t", names=["A", "B", "C"], chunksize=2, header=None)
     head = sdf.head(n=1)
     self.assertEqualDataFrame(df.head(n=1), head)
Example #12
 def test_read_json_rows2(self):
     data = b'''{"a": 1, "b": 2}
               {"a": 3, "b": 4}'''
     dfs = pandas.read_json(BytesIO(data), lines=True)
     self.assertEqual(dfs.shape, (2, 2))
     it = StreamingDataFrame.read_json(BytesIO(data), lines="stream")
     dfs = list(it)
     self.assertEqual(len(dfs), 1)
     js = dfs[0].to_json(orient='records')
     self.assertEqual('[{"a":1,"b":2},{"a":3,"b":4}]', js)
 def test_train_test_split_file(self):
     temp = get_temp_folder(__file__, "temp_train_test_split_file")
     names = [os.path.join(temp, "train.txt"),
              os.path.join(temp, "test.txt")]
     sdf = dummy_streaming_dataframe(100)
     sdf.train_test_split(names, index=False, streaming=False)
     trsdf = StreamingDataFrame.read_csv(names[0])
     tesdf = StreamingDataFrame.read_csv(names[1])
     self.assertGreater(trsdf.shape[0], 20)
     self.assertGreater(tesdf.shape[0], 20)
     trdf = trsdf.to_dataframe()
     tedf = tesdf.to_dataframe()
     self.assertGreater(trdf.shape[0], 20)
     self.assertGreater(tedf.shape[0], 20)
     df_exp = sdf.to_dataframe()
     df_val = pandas.concat([trdf, tedf])
     self.assertEqual(df_exp.shape, df_val.shape)
     df_val = df_val.sort_values("cint").reset_index(drop=True)
     self.assertEqualDataFrame(df_val, df_exp)
Example #14
 def test_read_json_rows_file_lines_head(self):
     data = self.abs_path_join(__file__, 'data', 'example.json')
     dfs = pandas.read_json(data, orient='records', lines=True)
     self.assertEqual(dfs.shape, (2, 2))
     it = StreamingDataFrame.read_json(data, lines="stream")
     h1 = it.head()
     h2 = it.head()
     self.assertNotEmpty(h1)
     self.assertNotEmpty(h2)
     self.assertEqualDataFrame(h1, h2)
 def test_groupby_streaming(self):
     df20 = dummy_streaming_dataframe(20).to_dataframe()
     df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0)
     sdf20 = StreamingDataFrame.read_df(df20, chunksize=5)
     sgr = sdf20.groupby_streaming(
         "key", lambda gr: gr.sum(), strategy='streaming', as_index=False)
     gr2 = df20.groupby("key", as_index=False).sum()
     grs = list(sgr)
     gr = pandas.concat(grs).groupby("key", as_index=False).sum()
     self.assertEqualDataFrame(gr, gr2)
Example #16
 def test_read_json_raw_head(self):
     data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
             {'name': {'given': 'Mose', 'family': 'Regner'}},
             {'id': 2, 'name': 'FayeRaker'}]
     it = StreamingDataFrame.read_json(data, flatten=True, chunksize=1)
     h1 = it.head()
     h2 = it.head()
     self.assertEqualDataFrame(h1, h2)
     self.assertGreater(h1.shape[0], 1)
     self.assertGreater(h2.shape[0], 1)
    def test_add_column(self):
        df = pandas.DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"]))
        sdf = StreamingDataFrame.read_df(df)
        sdf2 = sdf.add_column("d", lambda row: int(1))
        df2 = sdf2.to_dataframe()
        df["d"] = 1
        self.assertEqualDataFrame(df, df2)

        sdf3 = StreamingDataFrame.read_df(df)
        sdf4 = sdf3.add_column("dd", 2)
        df4 = sdf4.to_dataframe()
        df["dd"] = 2
        self.assertEqualDataFrame(df, df4)

        sdfA = StreamingDataFrame.read_df(df)
        sdfB = sdfA.add_column("dd12", lambda row: row["dd"] + 10)
        dfB = sdfB.to_dataframe()
        df["dd12"] = 12
        self.assertEqualDataFrame(df, dfB)
Example #18
 def test_read_json_rows2_head(self):
     data = b'''{"a": 1, "b": 2}
               {"a": 3, "b": 4}'''
     dfs = pandas.read_json(BytesIO(data), lines=True)
     self.assertEqual(dfs.shape, (2, 2))
     it = StreamingDataFrame.read_json(BytesIO(data), lines="stream")
     h1 = it.head()
     h2 = it.head()
     self.assertNotEmpty(h1)
     self.assertNotEmpty(h2)
     self.assertEqualDataFrame(h1, h2)
 def test_merge_2(self):
     df = pandas.DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"]))
     df2 = pandas.concat([df, df])
     sdf = StreamingDataFrame.read_df(df)
     sdf2 = sdf.concat(sdf, axis=0)
     self.assertEqualDataFrame(df2, sdf2.to_dataframe())
     self.assertEqualDataFrame(df2, sdf2.to_dataframe())
     m = pandas.DataFrame(dict(Y=["a", "b"], Z=[10, 20]))
     jm = df2.merge(m, left_on="Y", right_on="Y", how="outer")
     sjm = sdf2.merge(m, left_on="Y", right_on="Y", how="outer")
     self.assertEqualDataFrame(jm.sort_values(["X", "Y"]).reset_index(drop=True),
                               sjm.to_dataframe().sort_values(["X", "Y"]).reset_index(drop=True))
 def test_groupby_cum_asindex(self):
     df20 = dummy_streaming_dataframe(20).to_dataframe()
     df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0)
     sdf20 = StreamingDataFrame.read_df(df20, chunksize=5)
     sgr = sdf20.groupby_streaming(
         "key", lambda gr: gr.sum(), strategy='cum', as_index=True)
     gr2 = df20.groupby("key", as_index=True).sum()
     lastgr = None
     for gr in sgr:
         self.assertEqual(list(gr.columns), list(gr2.columns))
         lastgr = gr
     self.assertEqualDataFrame(lastgr, gr2)
Example #21
 def test_read_json_classic_file(self):
     data = self.abs_path_join(__file__, 'data', 'classic.json')
     dfs = pandas.read_json(data, orient='records')
     self.assertEqual(dfs.shape[1], 8)
     self.assertGreater(dfs.shape[0], 2)
     with open(data, "r", encoding="utf-8") as f:
         it = StreamingDataFrame.read_json(f, orient='records')
         h1 = it.to_df()
         h2 = it.to_df()
     self.assertNotEmpty(h1)
     self.assertNotEmpty(h2)
     self.assertEqualDataFrame(h1, h2)
     self.assertEqual(h1.shape[1], 8)
    def test_set_item(self):
        df = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7]))
        self.assertRaise(lambda: StreamingDataFrame(df), TypeError)
        sdf = StreamingDataFrame.read_df(df)

        def f():
            sdf[['a']] = 10
        self.assertRaise(f, ValueError)

        def g():
            sdf['a'] = [10]
        self.assertRaise(g, NotImplementedError)

        sdf['aa'] = 10
        df = sdf.to_df()
        ddf = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7], aa=[10]))
        self.assertEqualDataFrame(df, ddf)
        sdf['bb'] = sdf['b'] + 10
        df = sdf.to_df()
        ddf = pandas.DataFrame(
            data=dict(a=[4.5], b=[6], c=[7], aa=[10], bb=[16]))
        self.assertEqualDataFrame(df, ddf)
 def test_sort_values_reverse(self):
     temp = get_temp_folder(__file__, "temp_sort_values_reverse")
     name = os.path.join(temp, "_data_")
     df = pandas.DataFrame([dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
                            dict(a=5, b="f", c=5.7, ind="a2", ai=2),
                            dict(a=4, b="g", ind="a3", ai=3),
                            dict(a=8, b="h", c=5.9, ai=4),
                            dict(a=16, b="i", c=6.2, ind="a5", ai=5)])
     sdf = StreamingDataFrame.read_df(df, chunksize=2)
     sorted_df = df.sort_values(by="a", ascending=False)
     res = sdf.sort_values(by="a", temp_file=name, ascending=False)
     res_df = res.to_df()
     self.assertEqualDataFrame(sorted_df, res_df)
Example #24
 def test_read_json_classic(self):
     data = self.abs_path_join(__file__, 'data', 'classic.json')
     dfs = pandas.read_json(data, orient='records')
     dfs['ts2'] = dfs['ts'].apply(lambda t: t / 1e9)
     self.assertEqual(dfs.shape[1], 9)
     self.assertGreater(dfs.shape[0], 2)
     it = StreamingDataFrame.read_json(data)
     it['ts2'] = it['ts'].apply(lambda t: t / 1e9)
     h1 = it.to_df()
     h2 = it.to_df()
     self.assertNotEmpty(h1)
     self.assertNotEmpty(h2)
     self.assertEqualDataFrame(h1, h2)
     self.assertEqual(h1.shape[1], 9)
    def test_train_test_split_streaming_tiny(self):
        df = pandas.DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"]))

        sdf2 = StreamingDataFrame.read_df(pandas.concat([df, df]))
        sdftr, sdfte = sdf2.train_test_split(test_size=0.5)
        df1 = sdfte.head()
        df2 = sdfte.head()
        if df1 is not None or df2 is not None:
            self.assertEqualDataFrame(df1, df2)
        df1 = sdftr.head()
        df2 = sdftr.head()
        if df1 is not None or df2 is not None:
            self.assertEqualDataFrame(df1, df2)
        sdf = StreamingDataFrame.read_df(df)
        sdf2 = sdf.concat(sdf, axis=0)
        sdftr, sdfte = sdf2.train_test_split(test_size=0.5)
        df1 = sdfte.head()
        df2 = sdfte.head()
        if df1 is not None or df2 is not None:
            self.assertEqualDataFrame(df1, df2)
        df1 = sdftr.head()
        df2 = sdftr.head()
        if df1 is not None or df2 is not None:
            self.assertEqualDataFrame(df1, df2)
    def test_fillna(self):
        df = pandas.DataFrame(
            data=dict(X=[4.5, numpy.nan, 7], Y=["a", "b", numpy.nan]))
        sdf = StreamingDataFrame.read_df(df)

        df2 = pandas.DataFrame(
            data=dict(X=[4.5, 10.0, 7], Y=["a", "b", "NAN"]))
        na = sdf.fillna(value=dict(X=10.0, Y="NAN"))
        ndf = na.to_df()
        self.assertEqual(ndf, df2)

        df3 = pandas.DataFrame(
            data=dict(X=[4.5, 10.0, 7], Y=["a", "b", numpy.nan]))
        na = sdf.fillna(value=dict(X=10.0))
        ndf = na.to_df()
        self.assertEqual(ndf, df3)
    def test_describe(self):
        x = numpy.arange(100001).astype(numpy.float64) / 100000 - 0.5
        y = numpy.arange(100001).astype(numpy.int64)
        z = numpy.array([chr(65 + j % 45) for j in y])
        df = pandas.DataFrame(data=dict(X=x, Y=y, Z=z))
        sdf = StreamingDataFrame.read_df(df)

        desc = sdf.describe()
        self.assertEqual(['X', 'Y'], list(desc.columns))
        self.assertEqual(desc.loc['min', :].tolist(), [-0.5, 0])
        self.assertEqual(desc.loc['max', :].tolist(), [0.5, 100000])
        self.assertEqualArray(desc.loc['mean', :], numpy.array([0, 50000]))
        self.assertEqualArray(desc.loc['25%', :], numpy.array([-0.25, 25000]))
        self.assertEqualArray(desc.loc['50%', :], numpy.array([0.0, 50000]))
        self.assertEqualArray(desc.loc['75%', :], numpy.array([0.25, 75000]))
        self.assertEqualArray(desc.loc['std', :], numpy.array(
            [2.886795e-01, 28867.946472]), decimal=4)
Example #28
    def test_read_json_file2(self):
        data = b'''{"a": {"c": 1}, "b": [2, 3]}
                   {"a": {"a": 3}, "b": [4, 5, "r"]}'''

        obj1 = list(
            enumerate_json_items(BytesIO(data), flatten=False, lines=True))
        obj2 = list(
            enumerate_json_items(BytesIO(data), flatten=True, lines=True))
        self.assertNotEqual(obj1, obj2)
        self.assertEqual(obj2, [{
            'a_c': 1,
            'b_0': 2,
            'b_1': 3
        }, {
            'a_a': 3,
            'b_0': 4,
            'b_1': 5,
            'b_2': 'r'
        }])

        it = StreamingDataFrame.read_json(BytesIO(data),
                                          lines="stream",
                                          flatten=True)
        dfs = list(it)
        self.assertEqual(
            ['a_a', 'a_c', 'b_0', 'b_1', 'b_2'],
            list(sorted(dfs[0].columns)),
        )
        self.assertEqual(len(dfs), 1)
        js = dfs[0].to_json(orient='records', lines=True)
        jsjson = loads('[' + js.replace("\n", ",").strip(',') + ']')
        exp = [{
            'a_a': None,
            'a_c': 1.0,
            'b_0': 2,
            'b_1': 3,
            'b_2': None
        }, {
            'a_a': 3.0,
            'a_c': None,
            'b_0': 4,
            'b_1': 5,
            'b_2': 'r'
        }]
        self.assertEqual(exp, jsjson)
 def test_train_test_split(self):
     sdf = dummy_streaming_dataframe(100)
     tr, te = sdf.train_test_split(index=False, streaming=False)
     self.assertRaise(
         lambda: StreamingDataFrame.read_str(tr, chunksize=None),
         ValueError)
     self.assertRaise(
         lambda: StreamingDataFrame.read_str(tr, iterator=False),
         ValueError)
     StreamingDataFrame.read_str(tr.encode('utf-8'))
     trsdf = StreamingDataFrame.read_str(tr)
     tesdf = StreamingDataFrame.read_str(te)
     trdf = trsdf.to_dataframe()
     tedf = tesdf.to_dataframe()
     df_exp = sdf.to_dataframe()
     df_val = pandas.concat([trdf, tedf])
     self.assertEqual(df_exp.shape, df_val.shape)
     df_val = df_val.sort_values("cint").reset_index(drop=True)
     self.assertEqualDataFrame(df_val, df_exp)
 def test_init(self):
     sdf = dummy_streaming_dataframe(100)
     df1 = sdf.to_df()
     sdf2 = StreamingDataFrame(sdf)
     df2 = sdf2.to_df()
     self.assertEqualDataFrame(df1, df2)