def test_stress_block_sizes(self):
    # Test a number of small block sizes to stress block stitching
    data_base, expected = make_random_json(num_cols=2, num_rows=100)
    read_options = ReadOptions()
    parse_options = ParseOptions()

    for data in [data_base, data_base.rstrip(b'\r\n')]:
        for newlines_in_values in [False, True]:
            parse_options.newlines_in_values = newlines_in_values
            for block_size in [22, 23, 37]:
                read_options.block_size = block_size
                table = self.read_bytes(data, read_options=read_options,
                                        parse_options=parse_options)
                assert table.schema == expected.schema
                if not table.equals(expected):
                    # Better error output
                    assert table.to_pydict() == expected.to_pydict()
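
# make_random_json is a helper defined elsewhere in this test module.  A
# hypothetical minimal stand-in (assuming it returns CRLF-delimited JSON
# bytes plus the pyarrow.Table the reader is expected to produce):
def _example_make_random_json(num_cols=2, num_rows=100):
    import json
    import random
    import string

    import pyarrow as pa

    names = list(string.ascii_lowercase[:num_cols])
    columns = {name: [random.randint(0, 1000) for _ in range(num_rows)]
               for name in names}
    lines = [json.dumps({name: columns[name][i] for name in names})
             for i in range(num_rows)]
    data = b'\r\n'.join(line.encode() for line in lines) + b'\r\n'
    expected = pa.table(columns)
    return data, expected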
def test_block_sizes(self):
    rows = b'{"a": 1}\n{"a": 2}\n{"a": 3}'
    read_options = ReadOptions()
    parse_options = ParseOptions()

    for data in [rows, rows + b'\n']:
        for newlines_in_values in [False, True]:
            parse_options.newlines_in_values = newlines_in_values
            read_options.block_size = 4
            with pytest.raises(ValueError,
                               match="try to increase block size"):
                self.read_bytes(data, read_options=read_options,
                                parse_options=parse_options)

            # Validate reader behavior with various block sizes.
            # There used to be bugs in this area.
            for block_size in range(9, 20):
                read_options.block_size = block_size
                table = self.read_bytes(data, read_options=read_options,
                                        parse_options=parse_options)
                assert table.to_pydict() == {'a': [1, 2, 3]}
def test_reconcile_across_blocks(self):
    # ARROW-12065: reconciling inferred types across blocks
    # (e.g. one block infers null for "a", the next infers int64 or list;
    # the reader must unify the types across the resulting chunks).
    # The first row is padded with spaces so that the block size derived
    # from its length is large enough for each of the rows that follow.
    first_row = b'{                               }\n'
    read_options = ReadOptions(block_size=len(first_row))
    for next_rows, expected_pylist in [
            (b'{"a": 0}', [None, 0]),
            (b'{"a": []}', [None, []]),
            (b'{"a": []}\n{"a": [[1]]}', [None, [], [[1]]]),
            (b'{"a": {}}', [None, {}]),
            (b'{"a": {}}\n{"a": {"b": {"c": 1}}}',
             [None, {"b": None}, {"b": {"c": 1}}]),
            ]:
        table = self.read_bytes(first_row + next_rows,
                                read_options=read_options)
        expected = {"a": expected_pylist}
        assert table.to_pydict() == expected
        # Check that the issue was exercised
        assert table.column("a").num_chunks > 1
def read_json(self, *args, **kwargs):
    # Force multi-threaded reads and validate the resulting table.
    # The unqualified read_json call below resolves to the module-level
    # pyarrow.json.read_json, not to this method.
    read_options = kwargs.setdefault('read_options', ReadOptions())
    read_options.use_threads = True
    table = read_json(*args, **kwargs)
    table.validate()
    return table
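
# Illustrative sketch (not part of the test suite above): the read_bytes /
# read_json helpers ultimately feed an in-memory buffer to
# pyarrow.json.read_json.  Assuming the standard pyarrow.json API, a direct,
# standalone equivalent of what these tests exercise looks like this:
def _example_direct_read():
    import io

    import pyarrow.json as pajson

    data = b'{"a": 1}\n{"a": 2}\n{"a": 3}\n'
    # Small block size so the data spans several blocks, as in the tests above.
    read_options = pajson.ReadOptions(use_threads=True, block_size=16)
    parse_options = pajson.ParseOptions(newlines_in_values=False)
    table = pajson.read_json(io.BytesIO(data),
                             read_options=read_options,
                             parse_options=parse_options)
    assert table.to_pydict() == {'a': [1, 2, 3]}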