def test_read_csv(self):
     """Round-trip a two-row frame through CSV files and StreamingDataFrame.

     The frame is written once without and once with its index column,
     read back through ``StreamingDataFrame.read_csv`` and serialised
     again; the regenerated text must match the files pandas wrote.
     """
     folder = get_temp_folder(__file__, "temp_read_csv")
     frame = pandas.DataFrame(data=dict(a=[5, 6], b=["er", "r"]))
     path_plain = os.path.join(folder, "df.csv")
     path_indexed = os.path.join(folder, "df2.csv")
     path_copy = os.path.join(folder, "df3.csv")
     frame.to_csv(path_plain, index=False)
     frame.to_csv(path_indexed, index=True)

     stream_plain = StreamingDataFrame.read_csv(path_plain)
     dumped_plain = stream_plain.to_csv(index=False)

     # The test expects a ValueError both for chunksize=None and for
     # iterator=False.
     def read_without_chunks():
         return StreamingDataFrame.read_csv(
             path_indexed, index_col=0, chunksize=None)

     def read_without_iterator():
         return StreamingDataFrame.read_csv(
             path_indexed, index_col=0, iterator=False)

     self.assertRaise(read_without_chunks, ValueError)
     self.assertRaise(read_without_iterator, ValueError)

     stream_indexed = StreamingDataFrame.read_csv(path_indexed, index_col=0)
     dumped_indexed = stream_indexed.to_csv(index=True)
     stream_indexed.to_csv(path_copy, index=True)

     def slurp(path):
         # Return the whole content of a UTF-8 text file.
         with open(path, "r", encoding='utf-8') as handle:
             return handle.read()

     expected_plain = slurp(path_plain)
     expected_indexed = slurp(path_indexed)
     copied = slurp(path_copy)

     # Carriage returns are dropped before comparing the texts.
     self.assertEqual(dumped_plain.replace('\r', ''), expected_plain)
     from_memory = StreamingDataFrame.read_df(frame)
     self.assertEqualDataFrame(
         stream_plain.to_dataframe(), from_memory.to_dataframe())
     self.assertEqual(dumped_indexed.replace('\r', ''), expected_indexed)
     # The file written by the streaming frame additionally has doubled
     # newlines collapsed before the comparison.
     normalised_copy = copied.replace('\r', '').replace('\n\n', '\n')
     self.assertEqual(normalised_copy, expected_indexed.replace('\r', ''))
 def test_schema_consistant(self):
     """Check schema verification across chunks read from a CSV file.

     Column ``cint`` holds integers except for one string value
     (``"s2"``) in the third row. Reading the file in chunks of two rows
     is expected to raise ``StreamingDataFrameSchemaError`` unless
     ``check_schema=False`` is given.
     """
     # Rows 0..3; only the third one carries a string in ``cint``.
     rows = [dict(cf=pos, cint=value, cstr=str(pos))
             for pos, value in enumerate([0, 1, "s2", 3])]
     frame = pandas.DataFrame(rows)
     folder = get_temp_folder(__file__, "temp_schema_consistant")
     csv_path = os.path.join(folder, "df.csv")

     # Dump to memory first and make sure something was written.
     buffer = StringIO()
     frame.to_csv(buffer, index=False)
     self.assertNotEmpty(buffer.getvalue())

     frame.to_csv(csv_path, index=False)
     self.assertEqual(frame.shape, (4, 3))

     checked = StreamingDataFrame.read_csv(csv_path, chunksize=2)
     self.assertRaise(lambda: list(checked), StreamingDataFrameSchemaError)

     unchecked = StreamingDataFrame.read_csv(
         csv_path, chunksize=2, check_schema=False)
     # Four rows in chunks of two: two pieces expected.
     self.assertEqual(len(list(unchecked)), 2)
 def test_train_test_split_file_pattern(self):
     """Split a streaming frame into files named from a pattern.

     A pattern containing ``{0}`` is expected to raise ``ValueError``;
     with ``{}`` the split returns two file names. Reading both files
     back and concatenating them must reproduce the original frame once
     sorted on ``cint``.
     """
     folder = get_temp_folder(__file__, "temp_train_test_split_file_pattern")
     stream = dummy_streaming_dataframe(100)

     rejected_pattern = os.path.join(folder, "spl_{0}.txt")

     def split_with_rejected_pattern():
         return stream.train_test_split(
             rejected_pattern, index=False, streaming=False)

     self.assertRaise(split_with_rejected_pattern, ValueError)

     pattern = os.path.join(folder, "spl_{}.txt")
     train_path, test_path = stream.train_test_split(
         pattern, index=False, streaming=False)

     # Open both files first, then materialise them in the same order.
     parts = [StreamingDataFrame.read_csv(path)
              for path in (train_path, test_path)]
     frames = [part.to_dataframe() for part in parts]

     expected = stream.to_dataframe()
     merged = pandas.concat(frames)
     self.assertEqual(expected.shape, merged.shape)
     # Sort and reindex the merged rows before comparing with the original.
     merged = merged.sort_values("cint").reset_index(drop=True)
     self.assertEqualDataFrame(merged, expected)
 def test_read_csv_names(self):
     """Compare the first row read by pandas and by StreamingDataFrame.

     The tab-separated file ``data/buggy_hash2.csv`` is read without a
     header line and with explicit column names ``A``, ``B``, ``C``.
     """
     here = os.path.abspath(os.path.dirname(__file__))
     csv_path = os.path.join(here, "data", "buggy_hash2.csv")
     columns = ("A", "B", "C")

     # Each reader receives its own list of column names.
     expected = pandas.read_csv(
         csv_path, sep="\t", names=list(columns), header=None)
     stream = StreamingDataFrame.read_csv(
         csv_path, sep="\t", names=list(columns), chunksize=2, header=None)

     first_row = stream.head(n=1)
     self.assertEqualDataFrame(expected.head(n=1), first_row)
 def test_train_test_split_file(self):
     """Split a streaming frame into two explicitly named files.

     Both files must hold more than 20 rows, and their concatenation,
     sorted on ``cint``, must equal the original frame.
     """
     folder = get_temp_folder(__file__, "temp_train_test_split_file")
     targets = [os.path.join(folder, fname)
                for fname in ("train.txt", "test.txt")]
     stream = dummy_streaming_dataframe(100)
     stream.train_test_split(targets, index=False, streaming=False)

     train_part = StreamingDataFrame.read_csv(targets[0])
     test_part = StreamingDataFrame.read_csv(targets[1])
     parts = (train_part, test_part)

     # Row counts are checked on the streaming frames first...
     for part in parts:
         self.assertGreater(part.shape[0], 20)

     # ...and again on the materialised pandas frames.
     frames = [part.to_dataframe() for part in parts]
     for frame in frames:
         self.assertGreater(frame.shape[0], 20)

     expected = stream.to_dataframe()
     merged = pandas.concat(frames)
     self.assertEqual(expected.shape, merged.shape)
     # Sort and reindex the merged rows before comparing with the original.
     merged = merged.sort_values("cint").reset_index(drop=True)
     self.assertEqualDataFrame(merged, expected)