예제 #1
0
    def test_stress_block_sizes(self):
        """Stress block stitching by reading with several small block sizes.

        The same generated payload is read with and without its trailing
        line terminator, with ``newlines_in_values`` both off and on, and
        with block sizes 22, 23 and 37.  Every resulting table must match
        the expected table produced alongside the payload.
        """
        payload, reference = make_random_json(num_cols=2, num_rows=100)
        read_opts = ReadOptions()
        parse_opts = ParseOptions()
        # Second variant drops any trailing CR/LF bytes from the payload.
        variants = (payload, payload.rstrip(b'\r\n'))

        for raw in variants:
            for allow_newlines in (False, True):
                parse_opts.newlines_in_values = allow_newlines
                for size in (22, 23, 37):
                    read_opts.block_size = size
                    result = self.read_bytes(
                        raw,
                        read_options=read_opts,
                        parse_options=parse_opts,
                    )
                    assert result.schema == reference.schema
                    if result.equals(reference):
                        continue
                    # Comparing plain dicts gives a more readable failure
                    # message than a bare ``equals`` assertion would.
                    assert result.to_pydict() == reference.to_pydict()
예제 #2
0
    def test_block_sizes(self):
        """Check reader behaviour across a range of block sizes.

        Three single-key rows are read with and without a trailing
        newline, and with ``newlines_in_values`` both off and on.  A block
        size of 4 must raise ``ValueError`` asking to increase the block
        size; block sizes 9 through 19 must all yield the three rows.
        """
        base = b'{"a": 1}\n{"a": 2}\n{"a": 3}'
        payloads = (base, base + b'\n')
        read_opts = ReadOptions()
        parse_opts = ParseOptions()

        for payload in payloads:
            for allow_newlines in (False, True):
                parse_opts.newlines_in_values = allow_newlines

                # A block size of 4 is expected to be rejected.
                read_opts.block_size = 4
                with pytest.raises(ValueError,
                                   match="try to increase block size"):
                    self.read_bytes(
                        payload,
                        read_options=read_opts,
                        parse_options=parse_opts,
                    )

                # Validate reader behavior with various block sizes.
                # There used to be bugs in this area.
                for size in range(9, 20):
                    read_opts.block_size = size
                    result = self.read_bytes(
                        payload,
                        read_options=read_opts,
                        parse_options=parse_opts,
                    )
                    assert result.to_pydict() == {'a': [1, 2, 3]}
예제 #3
0
 def test_reconcile_accross_blocks(self):
     """ARROW-12065: reconcile inferred types across blocks.

     The first row is an empty object padded with spaces, and the block
     size is set to exactly that row's length.  The rows that follow
     introduce column "a" with various types; the resulting column must
     hold the expected values and span more than one chunk.
     """
     padded_empty_row = b'{                               }\n'
     read_opts = ReadOptions(block_size=len(padded_empty_row))
     # (rows appended after the first row, expected values of column "a")
     cases = [
         (b'{"a": 0}', [None, 0]),
         (b'{"a": []}', [None, []]),
         (b'{"a": []}\n{"a": [[1]]}', [None, [], [[1]]]),
         (b'{"a": {}}', [None, {}]),
         (b'{"a": {}}\n{"a": {"b": {"c": 1}}}',
          [None, {"b": None}, {"b": {"c": 1}}]),
     ]
     for trailing_rows, wanted_values in cases:
         result = self.read_bytes(padded_empty_row + trailing_rows,
                                  read_options=read_opts)
         assert result.to_pydict() == {"a": wanted_values}
         # Check that the issue was exercised
         chunk_count = result.column("a").num_chunks
         assert chunk_count > 1
예제 #4
0
 def read_json(self, *args, **kwargs):
     """Read JSON with threading enabled and validate the resulting table.

     All positional and keyword arguments are forwarded to the
     module-level ``read_json``.  If the ``read_options`` keyword is
     missing or explicitly ``None``, a fresh ``ReadOptions()`` is used.
     ``use_threads`` is then set to ``True`` on the options object; a
     caller-supplied object is mutated in place.

     Returns the table after calling ``validate()`` on it.
     """
     read_options = kwargs.get('read_options')
     if read_options is None:
         # An explicit ``read_options=None`` previously slipped through
         # ``setdefault`` and failed on the attribute assignment below.
         read_options = ReadOptions()
         kwargs['read_options'] = read_options
     read_options.use_threads = True
     table = read_json(*args, **kwargs)
     table.validate()
     return table