def test_read_json_raw(self):
    """
    Checks ``StreamingDataFrame.read_json`` on a list of nested records:
    the call without ``flatten`` must raise ``NotImplementedError`` and
    the call with ``flatten=True`` must produce a single chunk whose
    nested keys appear as dotted columns (``name.first``, ...).
    """
    records = [
        {'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
        {'name': {'given': 'Mose', 'family': 'Regner'}},
        {'id': 2, 'name': 'FayeRaker'},
    ]
    expected_text = """[{"id":1.0,"name":null,"name.family":null,"name.first":"Coleen","name.given":null,"name.last":"Volk"},
             {"id":null,"name":null,"name.family":"Regner","name.first":null,"name.given":"Mose","name.last":null},
             {"id":2.0,"name":"FayeRaker","name.family":null,"name.first":null,
             "name.given":null,"name.last":null}]"""
    # The literal is laid out for readability; strip the layout whitespace.
    expected_text = expected_text.replace(" ", "").replace("\n", "")

    # Nested records are rejected unless flattening is requested.
    self.assertRaise(
        lambda: StreamingDataFrame.read_json(records), NotImplementedError)

    chunks = list(StreamingDataFrame.read_json(records, flatten=True))
    self.assertEqual(len(chunks), 1)

    # Compare parsed JSON rather than raw text.
    actual = loads(chunks[0].to_json(orient='records'))
    expected = loads(expected_text)
    self.assertEqual(expected, actual)
Exemplo n.º 2
0
def json_to_dataframe_streaming(js, chunksize=100000, flatten=False, **kwargs):
    """
    Converts a big json dump (from @see fn convert_trace_to_json)
    to a dataframe. The function processes the data by streaming to avoid
    loading huge data in memory.
    Returns an iterator on dataframes.
    The function relies on :epkg:`pandas_streaming`.

    :param js: a filename, a stream containing json; a string which is
        not the name of an existing file is rejected
    :param chunksize:
        see :func:`pandas_streaming.df.StreamingDataFrame.read_json`
    :param flatten:
        see :func:`pandas_streaming.df.StreamingDataFrame.read_json`
    :param kwargs:
        see :func:`pandas_streaming.df.StreamingDataFrame.read_json`
    :return: a streaming dataframe (an iterator on dataframes) with an
        additional column ``ts_sec`` equal to column ``ts`` divided
        by ``1e9``
    :raises RuntimeError: if *js* is a string which is not the name of
        an existing file
    """
    # Validate the input before importing the optional dependency so that an
    # unsupported string is reported even if pandas_streaming is missing.
    # Only short strings are tested as paths (presumably to avoid probing
    # the file system with a whole json document).
    if isinstance(js, str) and not (len(js) < 5000 and os.path.exists(js)):
        raise RuntimeError(
            "Use a stream or function json_to_dataframe instead of "
            "the streaming version.")

    from pandas_streaming.df import StreamingDataFrame  # pylint: disable=C0415

    # Forward the reading options: they were previously accepted but ignored.
    sdf = StreamingDataFrame.read_json(
        js, chunksize=chunksize, flatten=flatten, **kwargs)

    sdf['ts_sec'] = sdf['ts'].apply(lambda t: t / 1e9)
    return sdf
Exemplo n.º 3
0
 def test_read_json_ijson(self):
     """
     Reads ``TestDataFrameIOHelpers.text_json`` from a byte stream and
     checks that the single resulting chunk, serialized as json lines,
     matches ``TestDataFrameIOHelpers.text_json_exp``.
     """
     stream = BytesIO(TestDataFrameIOHelpers.text_json)
     chunks = list(StreamingDataFrame.read_json(stream))
     self.assertEqual(len(chunks), 1)
     json_lines = chunks[0].to_json(orient='records', lines=True)
     # Turn the newline-separated records into one json array.
     array_text = '[' + json_lines.replace("\n", ",").strip(',') + ']'
     self.assertEqual(loads(array_text),
                      TestDataFrameIOHelpers.text_json_exp)
Exemplo n.º 4
0
 def test_read_json_rows(self):
     """
     Reads two json lines from a text stream with ``lines=True`` and
     checks that they come back as a single chunk with both records.
     """
     text = '''{"a": 1, "b": 2}
               {"a": 3, "b": 4}'''
     chunks = list(StreamingDataFrame.read_json(StringIO(text), lines=True))
     self.assertEqual(len(chunks), 1)
     serialized = chunks[0].to_json(orient='records')
     self.assertEqual(serialized, '[{"a":1,"b":2},{"a":3,"b":4}]')
Exemplo n.º 5
0
 def test_read_json_rows_file_lines_head(self):
     """
     Reads ``data/example.json`` with ``lines="stream"`` and checks that
     two consecutive calls to ``head()`` return the same non-empty
     dataframe. pandas is used first to confirm the file shape is (2, 2).
     """
     path = self.abs_path_join(__file__, 'data', 'example.json')
     reference = pandas.read_json(path, orient='records', lines=True)
     self.assertEqual(reference.shape, (2, 2))

     sdf = StreamingDataFrame.read_json(path, lines="stream")
     first = sdf.head()
     second = sdf.head()
     for head in (first, second):
         self.assertNotEmpty(head)
     self.assertEqualDataFrame(first, second)
Exemplo n.º 6
0
 def test_read_json_rows2(self):
     """
     Reads two json lines from a byte stream with ``lines="stream"`` and
     checks that they come back as a single chunk with both records.
     pandas is used first to confirm the payload shape is (2, 2).
     """
     payload = b'''{"a": 1, "b": 2}
               {"a": 3, "b": 4}'''
     reference = pandas.read_json(BytesIO(payload), lines=True)
     self.assertEqual(reference.shape, (2, 2))

     chunks = list(
         StreamingDataFrame.read_json(BytesIO(payload), lines="stream"))
     self.assertEqual(len(chunks), 1)
     serialized = chunks[0].to_json(orient='records')
     self.assertEqual('[{"a":1,"b":2},{"a":3,"b":4}]', serialized)
Exemplo n.º 7
0
 def test_read_json_raw_head(self):
     """
     Reads nested raw records with ``flatten=True`` and ``chunksize=1``
     and checks that two consecutive calls to ``head()`` return equal
     dataframes, each holding more than one row.
     """
     records = [
         {
             'id': 1,
             'name': {'first': 'Coleen', 'last': 'Volk'},
         },
         {
             'name': {'given': 'Mose', 'family': 'Regner'},
         },
         {
             'id': 2,
             'name': 'FayeRaker',
         },
     ]
     sdf = StreamingDataFrame.read_json(records, flatten=True, chunksize=1)
     first = sdf.head()
     second = sdf.head()
     self.assertEqualDataFrame(first, second)
     for head in (first, second):
         self.assertGreater(head.shape[0], 1)
Exemplo n.º 8
0
 def test_read_json_rows2_head(self):
     """
     Reads two json lines from a byte stream with ``lines="stream"`` and
     checks that two consecutive calls to ``head()`` return the same
     non-empty dataframe. pandas is used first to confirm the payload
     shape is (2, 2).
     """
     payload = b'''{"a": 1, "b": 2}
               {"a": 3, "b": 4}'''
     reference = pandas.read_json(BytesIO(payload), lines=True)
     self.assertEqual(reference.shape, (2, 2))

     sdf = StreamingDataFrame.read_json(BytesIO(payload), lines="stream")
     first = sdf.head()
     second = sdf.head()
     for head in (first, second):
         self.assertNotEmpty(head)
     self.assertEqualDataFrame(first, second)
Exemplo n.º 9
0
 def test_read_json_classic_file(self):
     """
     Reads ``data/classic.json`` through an open text file handle and
     checks that two consecutive calls to ``to_df()`` return the same
     non-empty dataframe with 8 columns. pandas is used first to confirm
     the file has 8 columns and more than 2 rows.
     """
     path = self.abs_path_join(__file__, 'data', 'classic.json')
     reference = pandas.read_json(path, orient='records')
     self.assertEqual(reference.shape[1], 8)
     self.assertGreater(reference.shape[0], 2)

     # Both conversions run while the file handle is still open.
     with open(path, "r", encoding="utf-8") as handle:
         sdf = StreamingDataFrame.read_json(handle, orient='records')
         first = sdf.to_df()
         second = sdf.to_df()

     for frame in (first, second):
         self.assertNotEmpty(frame)
     self.assertEqualDataFrame(first, second)
     self.assertEqual(first.shape[1], 8)
Exemplo n.º 10
0
 def test_read_json_classic(self):
     """
     Reads ``data/classic.json`` by file name, adds a column ``ts2``
     computed from ``ts`` and checks that two consecutive calls to
     ``to_df()`` return the same non-empty dataframe with 9 columns.
     The same column is added to a pandas dataframe first to confirm
     the expected shape.
     """
     def _scaled(t):
         # Same transformation for the pandas and the streaming dataframe.
         return t / 1e9

     path = self.abs_path_join(__file__, 'data', 'classic.json')
     reference = pandas.read_json(path, orient='records')
     reference['ts2'] = reference['ts'].apply(_scaled)
     self.assertEqual(reference.shape[1], 9)
     self.assertGreater(reference.shape[0], 2)

     sdf = StreamingDataFrame.read_json(path)
     sdf['ts2'] = sdf['ts'].apply(_scaled)
     first = sdf.to_df()
     second = sdf.to_df()
     for frame in (first, second):
         self.assertNotEmpty(frame)
     self.assertEqualDataFrame(first, second)
     self.assertEqual(first.shape[1], 9)
Exemplo n.º 11
0
    def test_read_json_file2(self):
        """
        Checks flattening of nested json lines, first with
        ``enumerate_json_items`` (nested keys and list positions joined
        with ``_``), then with ``StreamingDataFrame.read_json`` where the
        two records end up in a single chunk with the union of columns.
        """
        payload = b'''{"a": {"c": 1}, "b": [2, 3]}
                   {"a": {"a": 3}, "b": [4, 5, "r"]}'''

        nested_items = list(
            enumerate_json_items(BytesIO(payload), flatten=False, lines=True))
        flat_items = list(
            enumerate_json_items(BytesIO(payload), flatten=True, lines=True))
        self.assertNotEqual(nested_items, flat_items)
        expected_flat = [
            {'a_c': 1, 'b_0': 2, 'b_1': 3},
            {'a_a': 3, 'b_0': 4, 'b_1': 5, 'b_2': 'r'},
        ]
        self.assertEqual(flat_items, expected_flat)

        sdf = StreamingDataFrame.read_json(
            BytesIO(payload), lines="stream", flatten=True)
        chunks = list(sdf)
        column_names = sorted(chunks[0].columns)
        self.assertEqual(['a_a', 'a_c', 'b_0', 'b_1', 'b_2'], column_names)
        self.assertEqual(len(chunks), 1)

        json_lines = chunks[0].to_json(orient='records', lines=True)
        # Turn the newline-separated records into one json array.
        parsed = loads('[' + json_lines.replace("\n", ",").strip(',') + ']')
        expected_rows = [
            {'a_a': None, 'a_c': 1.0, 'b_0': 2, 'b_1': 3, 'b_2': None},
            {'a_a': 3.0, 'a_c': None, 'b_0': 4, 'b_1': 5, 'b_2': 'r'},
        ]
        self.assertEqual(expected_rows, parsed)