Exemplo n.º 1
0
    def test_include_missing_columns(self):
        """
        Check that names listed in `include_columns` but absent from the
        input come back as all-null columns once `include_missing_columns`
        is enabled, both alone and combined with `column_names` and
        `column_types`.
        """
        data = b"ab,cd\nef,gh\nij,kl\nmn,op\n"
        read_opts = ReadOptions()
        conv_opts = ConvertOptions()

        def load():
            # The option objects are mutated between calls, so every call
            # reads with whatever state they currently hold.
            return self.read_bytes(data, read_options=read_opts,
                                   convert_options=conv_opts)

        # Names taken from the first row: only 'ab' is present in the data
        conv_opts.include_columns = ['xx', 'ab', 'yy']
        conv_opts.include_missing_columns = True
        result = load()
        expected_schema = pa.schema([
            ('xx', pa.null()),
            ('ab', pa.string()),
            ('yy', pa.null()),
        ])
        expected_data = {
            "xx": [None, None, None],
            "ab": ["ef", "ij", "mn"],
            "yy": [None, None, None],
        }
        assert result.schema == expected_schema
        assert result.to_pydict() == expected_data

        # Combining with `column_names`: the first row is now read as data
        # and 'cd' is no longer the name of an existing column
        read_opts.column_names = ["xx", "yy"]
        conv_opts.include_columns = ["yy", "cd"]
        result = load()
        expected_schema = pa.schema([
            ('yy', pa.string()),
            ('cd', pa.null()),
        ])
        expected_data = {
            "yy": ["cd", "gh", "kl", "op"],
            "cd": [None, None, None, None],
        }
        assert result.schema == expected_schema
        assert result.to_pydict() == expected_data

        # And with `column_types` as well: the requested types are used
        # for the existing column and for the missing one
        conv_opts.column_types = {"yy": pa.binary(),
                                  "cd": pa.int32()}
        result = load()
        expected_schema = pa.schema([
            ('yy', pa.binary()),
            ('cd', pa.int32()),
        ])
        expected_data = {
            "yy": [b"cd", b"gh", b"kl", b"op"],
            "cd": [None, None, None, None],
        }
        assert result.schema == expected_schema
        assert result.to_pydict() == expected_data
Exemplo n.º 2
0
    def test_column_options(self):
        """
        Exercise the reader with `column_names`, `include_columns`,
        `column_types` and `include_missing_columns`, checking the
        resulting schema and data after each change of options.
        """
        data = b"1,2,3\n4,5,6"

        def verify(reader, fields, columns):
            # A single batch of data is expected from every reader here
            self.check_reader(reader, pa.schema(fields), [columns])

        # With column_names
        read_opts = ReadOptions()
        read_opts.column_names = ['d', 'e', 'f']
        verify(
            self.open_bytes(data, read_options=read_opts),
            [('d', pa.int64()), ('e', pa.int64()), ('f', pa.int64())],
            {'d': [1, 4], 'e': [2, 5], 'f': [3, 6]},
        )

        # With include_columns
        conv_opts = ConvertOptions()
        conv_opts.include_columns = ['f', 'e']
        verify(
            self.open_bytes(data, read_options=read_opts,
                            convert_options=conv_opts),
            [('f', pa.int64()), ('e', pa.int64())],
            {'e': [2, 5], 'f': [3, 6]},
        )

        # With column_types
        conv_opts.column_types = {'e': pa.string()}
        verify(
            self.open_bytes(data, read_options=read_opts,
                            convert_options=conv_opts),
            [('f', pa.int64()), ('e', pa.string())],
            {'e': ["2", "5"], 'f': [3, 6]},
        )

        # Missing columns in include_columns: opening fails with KeyError
        # as long as include_missing_columns has not been enabled
        conv_opts.include_columns = ['g', 'f', 'e']
        expected_message = "Column 'g' in include_columns does not exist"
        with pytest.raises(KeyError, match=expected_message):
            self.open_bytes(data, read_options=read_opts,
                            convert_options=conv_opts)

        # Once enabled, the missing column is returned as a null column
        conv_opts.include_missing_columns = True
        verify(
            self.open_bytes(data, read_options=read_opts,
                            convert_options=conv_opts),
            [('g', pa.null()), ('f', pa.int64()), ('e', pa.string())],
            {'g': [None, None], 'e': ["2", "5"], 'f': [3, 6]},
        )

        # A type given in column_types also applies to the missing column
        conv_opts.column_types = {'e': pa.string(), 'g': pa.float64()}
        verify(
            self.open_bytes(data, read_options=read_opts,
                            convert_options=conv_opts),
            [('g', pa.float64()), ('f', pa.int64()), ('e', pa.string())],
            {'g': [None, None], 'e': ["2", "5"], 'f': [3, 6]},
        )