def test_read_json_raw(self):
    """Nested records require ``flatten=True``; the flattened frame
    round-trips to the expected dotted-column json."""
    records = [
        {'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
        {'name': {'given': 'Mose', 'family': 'Regner'}},
        {'id': 2, 'name': 'FayeRaker'},
    ]
    # Expected output, already stripped of spaces and newlines.
    expected = (
        '[{"id":1.0,"name":null,"name.family":null,"name.first":"Coleen",'
        '"name.given":null,"name.last":"Volk"},'
        '{"id":null,"name":null,"name.family":"Regner","name.first":null,'
        '"name.given":"Mose","name.last":null},'
        '{"id":2.0,"name":"FayeRaker","name.family":null,"name.first":null,'
        '"name.given":null,"name.last":null}]'
    )
    # Nested dictionaries are rejected unless flattening is requested.
    self.assertRaise(lambda: StreamingDataFrame.read_json(records),
                     NotImplementedError)
    chunks = list(StreamingDataFrame.read_json(records, flatten=True))
    self.assertEqual(len(chunks), 1)
    produced = loads(chunks[0].to_json(orient='records'))
    self.assertEqual(loads(expected), produced)
def json_to_dataframe_streaming(js, chunksize=100000, flatten=False, **kwargs):
    """
    Converts a big json dump (from @see fn convert_trace_to_json) to a
    dataframe. The function processes the data by streaming to avoid
    loading huge data in memory. Returns an iterator on dataframes.
    The function relies on :epkg:`pandas_streaming`.

    :param js: a filename, a json string, a stream containing json
    :param chunksize: see :func:`pandas_streaming.df.StreamingDataFrame.read_json`
    :param flatten: see :func:`pandas_streaming.df.StreamingDataFrame.read_json`
    :param kwargs: see :func:`pandas_streaming.df.StreamingDataFrame.read_json`
    :return: a dataframe
    """
    from pandas_streaming.df import StreamingDataFrame  # pylint: disable=C0415
    if isinstance(js, str):
        # A short existing path is treated as a filename; an in-memory json
        # string cannot be streamed and must use json_to_dataframe instead.
        if len(js) < 5000 and os.path.exists(js):
            # BUG FIX: chunksize, flatten and kwargs were documented but
            # silently dropped; they are now forwarded to read_json.
            sdf = StreamingDataFrame.read_json(
                js, chunksize=chunksize, flatten=flatten, **kwargs)
        else:
            raise RuntimeError(
                "Use a stream or function json_to_dataframe instead of "
                "the streaming version.")
    else:
        sdf = StreamingDataFrame.read_json(
            js, chunksize=chunksize, flatten=flatten, **kwargs)
    # Column 'ts' holds nanosecond timestamps; expose them in seconds too.
    # NOTE(review): assumes every input carries a 'ts' column — confirm.
    sdf['ts_sec'] = sdf['ts'].apply(lambda t: t / 1e9)
    return sdf
def test_read_json_ijson(self):
    """Reading a binary json stream yields the expected records."""
    sdf = StreamingDataFrame.read_json(
        BytesIO(TestDataFrameIOHelpers.text_json))
    chunks = list(sdf)
    self.assertEqual(len(chunks), 1)
    text = chunks[0].to_json(orient='records', lines=True)
    parsed = loads('[' + text.replace("\n", ",").strip(',') + ']')
    self.assertEqual(parsed, TestDataFrameIOHelpers.text_json_exp)
def test_read_json_rows(self):
    """``lines=True`` parses newline-delimited json from a text stream."""
    raw = '{"a": 1, "b": 2}\n{"a": 3, "b": 4}'
    chunks = list(StreamingDataFrame.read_json(StringIO(raw), lines=True))
    self.assertEqual(len(chunks), 1)
    self.assertEqual(chunks[0].to_json(orient='records'),
                     '[{"a":1,"b":2},{"a":3,"b":4}]')
def test_read_json_rows_file_lines_head(self):
    """``head`` is repeatable on a file read with ``lines='stream'``."""
    path = self.abs_path_join(__file__, 'data', 'example.json')
    reference = pandas.read_json(path, orient='records', lines=True)
    self.assertEqual(reference.shape, (2, 2))
    sdf = StreamingDataFrame.read_json(path, lines="stream")
    first = sdf.head()
    second = sdf.head()
    self.assertNotEmpty(first)
    self.assertNotEmpty(second)
    self.assertEqualDataFrame(first, second)
def test_read_json_rows2(self):
    """``lines='stream'`` on a binary stream matches pandas' parsing."""
    raw = b'{"a": 1, "b": 2}\n{"a": 3, "b": 4}'
    reference = pandas.read_json(BytesIO(raw), lines=True)
    self.assertEqual(reference.shape, (2, 2))
    chunks = list(StreamingDataFrame.read_json(BytesIO(raw), lines="stream"))
    self.assertEqual(len(chunks), 1)
    produced = chunks[0].to_json(orient='records')
    self.assertEqual('[{"a":1,"b":2},{"a":3,"b":4}]', produced)
def test_read_json_raw_head(self):
    """``head`` is repeatable even with ``chunksize=1`` over raw records."""
    records = [
        {'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
        {'name': {'given': 'Mose', 'family': 'Regner'}},
        {'id': 2, 'name': 'FayeRaker'},
    ]
    sdf = StreamingDataFrame.read_json(records, flatten=True, chunksize=1)
    first = sdf.head()
    second = sdf.head()
    self.assertEqualDataFrame(first, second)
    self.assertGreater(first.shape[0], 1)
    self.assertGreater(second.shape[0], 1)
def test_read_json_rows2_head(self):
    """``head`` is repeatable on a binary stream read with ``lines='stream'``."""
    raw = b'{"a": 1, "b": 2}\n{"a": 3, "b": 4}'
    reference = pandas.read_json(BytesIO(raw), lines=True)
    self.assertEqual(reference.shape, (2, 2))
    sdf = StreamingDataFrame.read_json(BytesIO(raw), lines="stream")
    first = sdf.head()
    second = sdf.head()
    self.assertNotEmpty(first)
    self.assertNotEmpty(second)
    self.assertEqualDataFrame(first, second)
def test_read_json_classic_file(self):
    """``to_df`` is repeatable on an open file handle with ``orient='records'``."""
    path = self.abs_path_join(__file__, 'data', 'classic.json')
    reference = pandas.read_json(path, orient='records')
    self.assertEqual(reference.shape[1], 8)
    self.assertGreater(reference.shape[0], 2)
    with open(path, "r", encoding="utf-8") as f:
        sdf = StreamingDataFrame.read_json(f, orient='records')
        df1 = sdf.to_df()
        df2 = sdf.to_df()
        self.assertNotEmpty(df1)
        self.assertNotEmpty(df2)
        self.assertEqualDataFrame(df1, df2)
        self.assertEqual(df1.shape[1], 8)
def test_read_json_classic(self):
    """Column assignment on a StreamingDataFrame mirrors pandas behaviour."""
    path = self.abs_path_join(__file__, 'data', 'classic.json')
    reference = pandas.read_json(path, orient='records')
    reference['ts2'] = reference['ts'].apply(lambda t: t / 1e9)
    self.assertEqual(reference.shape[1], 9)
    self.assertGreater(reference.shape[0], 2)
    sdf = StreamingDataFrame.read_json(path)
    sdf['ts2'] = sdf['ts'].apply(lambda t: t / 1e9)
    df1 = sdf.to_df()
    df2 = sdf.to_df()
    self.assertNotEmpty(df1)
    self.assertNotEmpty(df2)
    self.assertEqualDataFrame(df1, df2)
    self.assertEqual(df1.shape[1], 9)
def test_read_json_file2(self):
    """Flattening nested objects and lists produces suffixed columns."""
    raw = b'{"a": {"c": 1}, "b": [2, 3]}\n{"a": {"a": 3}, "b": [4, 5, "r"]}'
    plain = list(enumerate_json_items(BytesIO(raw), flatten=False, lines=True))
    flat = list(enumerate_json_items(BytesIO(raw), flatten=True, lines=True))
    self.assertNotEqual(plain, flat)
    self.assertEqual(flat, [
        {'a_c': 1, 'b_0': 2, 'b_1': 3},
        {'a_a': 3, 'b_0': 4, 'b_1': 5, 'b_2': 'r'},
    ])
    chunks = list(StreamingDataFrame.read_json(
        BytesIO(raw), lines="stream", flatten=True))
    self.assertEqual(
        ['a_a', 'a_c', 'b_0', 'b_1', 'b_2'],
        list(sorted(chunks[0].columns)),
    )
    self.assertEqual(len(chunks), 1)
    text = chunks[0].to_json(orient='records', lines=True)
    parsed = loads('[' + text.replace("\n", ",").strip(',') + ']')
    expected = [
        {'a_a': None, 'a_c': 1.0, 'b_0': 2, 'b_1': 3, 'b_2': None},
        {'a_a': 3.0, 'a_c': None, 'b_0': 4, 'b_1': 5, 'b_2': 'r'},
    ]
    self.assertEqual(expected, parsed)