Exemplo n.º 1
0
    def test_inference_failure(self):
        # Inference on first block, then conversion failure on second block
        rows = b"a,b\n123,456\nabc,de\xff\ngh,ij\n"
        read_options = ReadOptions()
        read_options.block_size = len(rows) - 7
        reader = self.open_bytes(rows, read_options=read_options)
        expected_schema = pa.schema([('a', pa.int64()), ('b', pa.int64())])
        assert reader.schema == expected_schema
        assert reader.read_next_batch().to_pydict() == {'a': [123], 'b': [456]}
        # Second block
        with pytest.raises(ValueError, match="CSV conversion error to int64"):
            reader.read_next_batch()
        # EOF
        with pytest.raises(StopIteration):
            reader.read_next_batch()

        # Inference on first block, then conversion failure on second block,
        # then success on third block
        rows = b"a,b\n1,2\nabc,def\n45,67\n"
        read_options.block_size = 8
        reader = self.open_bytes(rows, read_options=read_options)
        expected_schema = pa.schema([('a', pa.int64()), ('b', pa.int64())])
        assert reader.schema == expected_schema
        assert reader.read_next_batch().to_pydict() == {'a': [1], 'b': [2]}
        # Second block
        with pytest.raises(ValueError, match="CSV conversion error to int64"):
            reader.read_next_batch()
        # Third block
        assert reader.read_next_batch().to_pydict() == {'a': [45], 'b': [67]}
        # EOF
        with pytest.raises(StopIteration):
            reader.read_next_batch()
Exemplo n.º 2
0
    def test_batch_lifetime(self):
        gc.collect()
        old_allocated = pa.total_allocated_bytes()

        # Memory occupation should not grow with CSV file size
        def check_one_batch(reader, expected):
            batch = reader.read_next_batch()
            assert batch.to_pydict() == expected

        rows = b"10,11\n12,13\n14,15\n16,17\n"
        read_options = ReadOptions()
        read_options.column_names = ['a', 'b']
        read_options.block_size = 6
        reader = self.open_bytes(rows, read_options=read_options)
        check_one_batch(reader, {'a': [10], 'b': [11]})
        allocated_after_first_batch = pa.total_allocated_bytes()
        check_one_batch(reader, {'a': [12], 'b': [13]})
        assert pa.total_allocated_bytes() == allocated_after_first_batch
        check_one_batch(reader, {'a': [14], 'b': [15]})
        assert pa.total_allocated_bytes() == allocated_after_first_batch
        check_one_batch(reader, {'a': [16], 'b': [17]})
        assert pa.total_allocated_bytes() == allocated_after_first_batch
        with pytest.raises(StopIteration):
            reader.read_next_batch()
        assert pa.total_allocated_bytes() == old_allocated
        reader = None
        assert pa.total_allocated_bytes() == old_allocated
Exemplo n.º 3
0
    def test_invalid_csv(self):
        # CSV errors on first block
        rows = b"a,b\n1,2,3\n4,5\n6,7\n"
        read_options = ReadOptions()
        read_options.block_size = 10
        with pytest.raises(pa.ArrowInvalid, match="Expected 2 columns, got 3"):
            reader = self.open_bytes(rows, read_options=read_options)

        # CSV errors on second block
        rows = b"a,b\n1,2\n3,4,5\n6,7\n"
        read_options.block_size = 8
        reader = self.open_bytes(rows, read_options=read_options)
        assert reader.read_next_batch().to_pydict() == {'a': [1], 'b': [2]}
        with pytest.raises(pa.ArrowInvalid, match="Expected 2 columns, got 3"):
            reader.read_next_batch()
        # Cannot continue after a parse error
        with pytest.raises(StopIteration):
            reader.read_next_batch()
Exemplo n.º 4
0
    def test_inference(self):
        # Inference is done on first block
        rows = b"a,b\n123,456\nabc,de\xff\ngh,ij\n"
        expected_schema = pa.schema([('a', pa.string()),
                                     ('b', pa.binary())])

        read_options = ReadOptions()
        read_options.block_size = len(rows)
        reader = self.open_bytes(rows, read_options=read_options)
        self.check_reader(reader, expected_schema,
                          [{'a': ['123', 'abc', 'gh'],
                            'b': [b'456', b'de\xff', b'ij']}])

        read_options.block_size = len(rows) - 1
        reader = self.open_bytes(rows, read_options=read_options)
        self.check_reader(reader, expected_schema,
                          [{'a': ['123', 'abc'],
                            'b': [b'456', b'de\xff']},
                           {'a': ['gh'],
                            'b': [b'ij']}])