def test_dictionary(self, duckdb_cursor):
        """Round-trip Arrow dictionary arrays through DuckDB and verify values."""
        if not can_run:
            return
        # Small dictionary column whose dictionary contains a NULL entry.
        dict_column = pa.DictionaryArray.from_arrays(
            pa.array([0, 1, 0, 1, 2, 1, 0, 2]), pa.array([10, 100, None]))
        table = pa.Table.from_arrays([dict_column], ['a'])
        rel = duckdb.from_arrow(table)

        expected = [(10, ), (100, ), (10, ), (100, ), (None, ), (100, ),
                    (10, ), (None, )]
        assert rel.execute().fetchall() == expected

        # Repeat the pattern enough times to exceed DuckDB's vector size.
        index_values = [0, 1, 0, 1, 2, 1, 0, 2, 3] * 10000
        dict_column = pa.DictionaryArray.from_arrays(
            pa.array(index_values), pa.array([10, 100, None, 999999]))
        table = pa.Table.from_arrays([dict_column], ['a'])
        rel = duckdb.from_arrow(table)
        expected = [(10, ), (100, ), (10, ), (100, ), (None, ), (100, ),
                    (10, ), (None, ), (999999, )] * 10000
        assert rel.execute().fetchall() == expected

        # Mix a dictionary column with a plain (non-dictionary) column.
        table = pa.Table.from_arrays(
            [dict_column, pa.array(index_values)], ['a', 'b'])
        rel = duckdb.from_arrow(table)
        expected = [(10, 0), (100, 1), (10, 0), (100, 1), (None, 2),
                    (100, 1), (10, 0), (None, 2), (999999, 3)] * 10000
        assert rel.execute().fetchall() == expected
示例#2
0
    def test_max_times(self, duckdb_cursor):
        """Verify maximum time32/time64 values survive the DuckDB round trip.

        Each input time column is compared against its expected
        microsecond (time64('us')) equivalent after going through
        DuckDB and back to Arrow.
        """
        if not can_run:
            return
        # Max seconds: 2147483647 s expressed as microseconds.
        data = pa.array([2147483647000000], type=pa.time64('us'))
        result = pa.Table.from_arrays([data], ['a'])
        data = pa.array([2147483647], type=pa.time32('s'))
        arrow_table = pa.Table.from_arrays([data], ['a'])
        rel = duckdb.from_arrow(arrow_table).arrow()
        assert (rel['a'] == result['a'])

        # Max milliseconds: 2147483647 ms expressed as microseconds.
        data = pa.array([2147483647000], type=pa.time64('us'))
        result = pa.Table.from_arrays([data], ['a'])
        data = pa.array([2147483647], type=pa.time32('ms'))
        arrow_table = pa.Table.from_arrays([data], ['a'])
        rel = duckdb.from_arrow(arrow_table).arrow()
        assert (rel['a'] == result['a'])

        # Max nanoseconds that still map onto a valid microsecond value.
        data = pa.array([9223372036854774], type=pa.time64('us'))
        result = pa.Table.from_arrays([data], ['a'])
        data = pa.array([9223372036854774000], type=pa.time64('ns'))
        arrow_table = pa.Table.from_arrays([data], ['a'])
        rel = duckdb.from_arrow(arrow_table).arrow()
        # Debug prints that re-rendered both columns were removed here.
        assert (rel['a'] == result['a'])
示例#3
0
    def test_timestamp_overflow(self, duckdb_cursor):
        """INT64-max timestamps round-trip unchanged, but casting them to a
        finer time unit must raise."""
        if not can_run:
            return
        int64_max = 9223372036854775807
        columns = [pa.array([int64_max], pa.timestamp(unit))
                   for unit in ('s', 'ms', 'us')]
        arrow_table = pa.Table.from_arrays(columns, ['a', 'b', 'c'])
        arrow_from_duck = duckdb.from_arrow(arrow_table).arrow()
        for name in ('a', 'b', 'c'):
            assert (arrow_from_duck[name] == arrow_table[name])

        # Each cast below scales the value up and must overflow.
        for cast_expr in ('a::TIMESTAMP_US', 'b::TIMESTAMP_US',
                          'c::TIMESTAMP_NS'):
            with pytest.raises(Exception):
                duck_rel = duckdb.from_arrow(arrow_table)
                res = duck_rel.project(cast_expr)
                res.fetchone()
示例#4
0
    def test_unsigned_roundtrip(self, duckdb_cursor):
        """Compare the Arrow and Parquet ingestion paths for unsigned
        columns, then round-trip a mixed-type query result through Arrow."""
        if not can_run:
            return
        parquet_filename = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'data',
            'unsigned.parquet')
        cols = 'a, b, c, d'

        unsigned_parquet_table = pyarrow.parquet.read_table(parquet_filename)
        unsigned_parquet_table.validate(full=True)

        # The same projection through both ingestion paths must agree exactly.
        rel_from_arrow = duckdb.arrow(unsigned_parquet_table).project(
            cols).arrow()
        rel_from_arrow.validate(full=True)
        rel_from_duckdb = duckdb.from_parquet(parquet_filename).project(
            cols).arrow()
        rel_from_duckdb.validate(full=True)
        assert rel_from_arrow.equals(rel_from_duckdb, check_metadata=True)

        # Round trip: DuckDB query -> Arrow table -> DuckDB -> Arrow table.
        con = duckdb.connect()
        con.execute(
            "select NULL c_null, (c % 4 = 0)::bool c_bool, (c%128)::tinyint c_tinyint, c::smallint*1000 c_smallint, c::integer*100000 c_integer, c::bigint*1000000000000 c_bigint, c::float c_float, c::double c_double, 'c_' || c::string c_string from (select case when range % 2 == 0 then range else null end as c from range(-10000, 10000)) sq"
        )
        arrow_result = con.fetch_arrow_table()
        arrow_result.validate(full=True)
        arrow_result.combine_chunks()
        arrow_result.validate(full=True)

        round_tripping = duckdb.from_arrow(arrow_result).to_arrow_table()
        round_tripping.validate(full=True)
        assert round_tripping.equals(arrow_result, check_metadata=True)
示例#5
0
    def test_parallel_reader_default_conn(self, duckdb_cursor):
        """Scan a RecordBatchReader through the default connection; the
        reader is single-shot, so a second scan yields zero rows."""
        if not can_run:
            return

        parquet_filename = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'data',
            'userdata1.parquet')

        # Three copies of the same file -> three times the matching rows.
        userdata_parquet_dataset = pyarrow.dataset.dataset(
            [parquet_filename] * 3, format="parquet")

        batches = list(userdata_parquet_dataset.to_batches())
        reader = pyarrow.dataset.Scanner.from_batches(
            batches, userdata_parquet_dataset.schema).to_reader()

        rel = duckdb.from_arrow(reader)

        count = rel.filter("first_name=\'Jose\' and salary > 134708.82"
                           ).aggregate('count(*)').execute().fetchone()[0]
        assert count == 12
        # The reader is already consumed so this should be 0
        count = rel.filter("first_name=\'Jose\' and salary > 134708.82"
                           ).aggregate('count(*)').execute().fetchone()[0]
        assert count == 0
    def test_dictionary_index_types(self, duckdb_cursor):
        """Dictionary arrays must decode correctly for every Arrow integer
        index type (signed and unsigned, 8 through 64 bits).

        Replaces eight near-identical ``append`` calls with a single
        comprehension over the index-type constructors, and hoists the
        loop-invariant expected result out of the loop.
        """
        if not can_run:
            return
        indices_list = [None, 1, None, 1, 2, 1, 0]
        dictionary = pa.array([10, 100, 100], type=pyarrow.uint8())
        # Same index pattern, materialized once per supported index width.
        index_types = [
            pa.array(indices_list * 10000, type=make_type())
            for make_type in (pyarrow.uint8, pyarrow.uint16, pyarrow.uint32,
                              pyarrow.uint64, pyarrow.int8, pyarrow.int16,
                              pyarrow.int32, pyarrow.int64)
        ]

        # Expected decoded rows are identical regardless of index width.
        result = [(None, ), (100, ), (None, ), (100, ), (100, ), (100, ),
                  (10, )] * 10000
        for index_type in index_types:
            dict_array = pa.DictionaryArray.from_arrays(index_type, dictionary)
            arrow_table = pa.Table.from_arrays([dict_array], ['a'])
            rel = duckdb.from_arrow(arrow_table)
            assert rel.execute().fetchall() == result
 def test_timestamp_timezone_overflow(self, duckdb_cursor):
     """Timezone-aware timestamps at INT64 max must fail to convert.

     For 's' and 'ms' precision the value presumably overflows when
     rescaled to a finer unit, and DuckDB reports 'Could not convert'.
     """
     if not can_run:
         return
     precisions = ['s', 'ms']
     # INT64 max — the largest representable timestamp value.
     current_time = 9223372036854775807
     for precision in precisions:
         with pytest.raises(Exception, match='Could not convert'):
             # generate_table is a helper defined elsewhere in this module;
             # presumably builds a tz-aware one-row table — TODO confirm.
             arrow_table = generate_table(current_time, precision, 'UTC')
             res_utc = duckdb.from_arrow(arrow_table).execute().fetchall()
示例#8
0
 def test_date_null(self, duckdb_cursor):
     """NULL date64 and NULL date32 inputs both come back equal to the
     NULL date32 column after the DuckDB round trip."""
     if not can_run:
         return
     null_date64 = pa.array([None], type=pa.date64())
     null_date32 = pa.array([None], type=pa.date32())
     arrow_table = pa.Table.from_arrays([null_date64, null_date32],
                                        ['a', 'b'])
     rel = duckdb.from_arrow(arrow_table).arrow()
     assert (rel['a'] == arrow_table['b'])
     assert (rel['b'] == arrow_table['b'])
示例#9
0
    def test_duration_overflow(self, duckdb_cursor):
        """A max-int64 duration in seconds must raise when converted by
        DuckDB, since it cannot be scaled to a finer unit."""
        if not can_run:
            return

        # Only seconds can overflow
        seconds = pa.array([9223372036854775807], pa.duration('s'))
        arrow_table = pa.Table.from_arrays([seconds], ['a'])

        with pytest.raises(Exception):
            arrow_from_duck = duckdb.from_arrow(arrow_table).arrow()
示例#10
0
 def test_time_null(self, duckdb_cursor):
     """NULL time values of every Arrow time unit compare equal to the
     NULL time64('us') column after the round trip."""
     if not can_run:
         return
     columns = [pa.array([None], type=pa.time32('s')),
                pa.array([None], type=pa.time32('ms')),
                pa.array([None], pa.time64('us')),
                pa.array([None], pa.time64('ns'))]
     arrow_table = pa.Table.from_arrays(columns, ['a', 'b', 'c', 'd'])
     rel = duckdb.from_arrow(arrow_table).arrow()
     # 'c' (time64('us')) is the reference column for all comparisons.
     for name in ('a', 'b', 'c', 'd'):
         assert (rel[name] == arrow_table['c'])
示例#11
0
    def test_large_string_type(self, duckdb_cursor):
        """Arrow large_string columns are fetched back as plain strings."""
        if not can_run:
            return

        table_schema = pa.schema([("data", pa.large_string())])
        column = pa.array(["foo", "baaaar", "b"], type=pa.large_string())
        arrow_table = pa.Table.from_arrays([column], schema=table_schema)

        fetched = duckdb.from_arrow(arrow_table).execute().fetchall()
        assert fetched == [('foo', ), ('baaaar', ), ('b', )]
示例#12
0
    def test_multiple_queries_same_relation(self, duckdb_cursor):
        """The same Arrow-backed relation must be queryable repeatedly."""
        if not can_run:
            return
        parquet_filename = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'data',
            'userdata1.parquet')
        cols = 'id, first_name, last_name, email, gender, ip_address, cc, country, birthdate, salary, title, comments'

        userdata_parquet_table = pyarrow.parquet.read_table(parquet_filename)
        userdata_parquet_table.validate(full=True)
        rel = duckdb.from_arrow(userdata_parquet_table)
        # Run the identical aggregate twice: the source must be rescannable.
        for _ in range(2):
            assert (rel.aggregate("(avg(salary))::INT").execute().fetchone()[0]
                    == 149005)
示例#13
0
    def test_binary_types(self, duckdb_cursor):
        """Fixed-size, plain, and large binary columns all fetch as bytes.

        The original block repeated the same four lines for each binary
        layout; a loop over the three type constructors removes the
        duplication without changing what is asserted.
        """
        if not can_run:
            return

        expected = [(b"foo", ), (b"bar", ), (b"baz", )]
        # Fixed Size Binary, Normal Binary, Large Binary — all must
        # produce identical rows once scanned by DuckDB.
        for binary_type in (pa.binary(3), pa.binary(), pa.large_binary()):
            arrow_table = create_binary_table(binary_type)
            rel = duckdb.from_arrow(arrow_table)
            res = rel.execute().fetchall()
            assert res == expected
示例#14
0
 def test_max_date(self, duckdb_cursor):
     """Max date32 round-trips unchanged, and the equivalent date64 value
     (days scaled to milliseconds) compares equal to the same date32."""
     if not can_run:
         return
     # Expected output: both columns hold the max date32 value.
     data = (pa.array([2147483647], type=pa.date32()),
             pa.array([2147483647], type=pa.date32()))
     result = pa.Table.from_arrays([data[0], data[1]], ['a', 'b'])
     # Input: column 'a' is the same date expressed as date64 milliseconds
     # (2147483647 days * ms per day); 'b' stays date32.
     data = (pa.array([2147483647 * (1000 * 60 * 60 * 24)],
                      type=pa.date64()),
             pa.array([2147483647], type=pa.date32()))
     arrow_table = pa.Table.from_arrays([data[0], data[1]], ['a', 'b'])
     rel = duckdb.from_arrow(arrow_table).arrow()
     assert (rel['a'] == result['a'])
     assert (rel['b'] == result['b'])
示例#15
0
    def test_dictionary_null_index(self, duckdb_cursor):
        """NULLs in the index array and NULLs in the dictionary itself must
        both surface as SQL NULLs.

        Removes a leftover debug ``print`` that also re-executed the
        relation, so each relation is now scanned exactly once per assert.
        """
        if not can_run:
            return
        # NULL index plus a NULL dictionary entry.
        indices = pa.array([None, 1, 0, 1, 2, 1, 0, 2])
        dictionary = pa.array([10, 100, None])
        dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
        arrow_table = pa.Table.from_arrays([dict_array], ['a'])
        rel = duckdb.from_arrow(arrow_table)

        assert rel.execute().fetchall() == [(None, ), (100, ), (10, ), (100, ),
                                            (None, ), (100, ), (10, ),
                                            (None, )]

        # NULLs come only from the indices; the dictionary is dense.
        indices = pa.array([None, 1, None, 1, 2, 1, 0])
        dictionary = pa.array([10, 100, 100])
        dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
        arrow_table = pa.Table.from_arrays([dict_array], ['a'])
        rel = duckdb.from_arrow(arrow_table)
        assert rel.execute().fetchall() == [(None, ), (100, ), (None, ),
                                            (100, ), (100, ), (100, ), (10, )]

        # Test Big Vector
        indices_list = [None, 1, None, 1, 2, 1, 0]
        indices = pa.array(indices_list * 1000)
        dictionary = pa.array([10, 100, 100])
        dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
        arrow_table = pa.Table.from_arrays([dict_array], ['a'])
        rel = duckdb.from_arrow(arrow_table)
        result = [(None, ), (100, ), (None, ), (100, ), (100, ), (100, ),
                  (10, )] * 1000
        assert rel.execute().fetchall() == result

        # Table with dictionary and normal array
        arrow_table = pa.Table.from_arrays([dict_array, indices], ['a', 'b'])
        rel = duckdb.from_arrow(arrow_table)
        result = [(None, None), (100, 1), (None, None), (100, 1), (100, 2),
                  (100, 1), (10, 0)] * 1000
        assert rel.execute().fetchall() == result
示例#16
0
 def test_duration_types(self, duckdb_cursor):
     """One second expressed in ns/us/ms/s duration units must all compare
     equal after the DuckDB round trip."""
     if not can_run:
         return
     # The same duration (1 second) in four different Arrow units.
     data = (pa.array([1000000000], type=pa.duration('ns')),
             pa.array([1000000], type=pa.duration('us')),
             pa.array([1000],
                      pa.duration('ms')), pa.array([1], pa.duration('s')))
     arrow_table = pa.Table.from_arrays(
         [data[0], data[1], data[2], data[3]], ['a', 'b', 'c', 'd'])
     rel = duckdb.from_arrow(arrow_table).arrow()
     # Every output column is compared against input column 'c' as the
     # common reference value.
     assert (rel['a'] == arrow_table['c'])
     assert (rel['b'] == arrow_table['c'])
     assert (rel['c'] == arrow_table['c'])
     assert (rel['d'] == arrow_table['c'])
示例#17
0
    def test_null_type(self, duckdb_cursor):
        """A pa.null() column round-trips through DuckDB and is compared
        against a column of int32 NULLs."""
        if not can_run:
            return
        schema = pa.schema([("data", pa.null())])
        inputs = [pa.array([None, None, None], type=pa.null())]
        arrow_table = pa.Table.from_arrays(inputs, schema=schema)
        # NOTE(review): this registration is never used by the assertion
        # below (rel is built via from_arrow) — possibly leftover.
        duckdb_conn = duckdb.connect()
        duckdb_conn.register("testarrow", arrow_table)
        rel = duckdb.from_arrow(arrow_table).arrow()
        # We turn it to an array of int32 nulls
        schema = pa.schema([("data", pa.int32())])
        inputs = [pa.array([None, None, None], type=pa.null())]
        arrow_table = pa.Table.from_arrays(inputs, schema=schema)

        assert rel['data'] == arrow_table['data']
示例#18
0
    def test_dictionary_batches(self, duckdb_cursor):
        """Dictionary columns split across many small record batches must
        decode the same as one contiguous array."""
        if not can_run:
            return

        indices_list = [None, 1, None, 1, 2, 1, 0]
        indices = pa.array(indices_list * 10000)
        dictionary = pa.array([10, 100, 100])
        dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)

        # Split the table into 10-row batches before handing it to DuckDB.
        arrow_table = pa.Table.from_arrays([dict_array], ['a'])
        batch_arrow_table = pyarrow.Table.from_batches(
            arrow_table.to_batches(10))
        rel = duckdb.from_arrow(batch_arrow_table)
        expected = [(None, ), (100, ), (None, ), (100, ), (100, ), (100, ),
                    (10, )] * 10000
        assert rel.execute().fetchall() == expected

        # Table with dictionary and normal array
        arrow_table = pa.Table.from_arrays([dict_array, indices], ['a', 'b'])
        batch_arrow_table = pyarrow.Table.from_batches(
            arrow_table.to_batches(10))
        rel = duckdb.from_arrow(batch_arrow_table)
        expected = [(None, None), (100, 1), (None, None), (100, 1), (100, 2),
                    (100, 1), (10, 0)] * 10000
        assert rel.execute().fetchall() == expected
示例#19
0
    def test_dictionary_strings(self, duckdb_cursor):
        """String dictionaries decode to their values; NULL indices decode
        to NULL."""
        if not can_run:
            return

        names = ['Matt Daaaaaaaaamon', 'Alec Baldwin', 'Sean Penn',
                 'Tim Robbins', 'Samuel L. Jackson']
        indices = pa.array([None, 0, 1, 2, 3, 4, None] * 1000)
        dict_array = pa.DictionaryArray.from_arrays(indices, pa.array(names))
        arrow_table = pa.Table.from_arrays([dict_array], ['a'])
        rel = duckdb.from_arrow(arrow_table)
        # One NULL, the five names in order, then another NULL — repeated.
        expected = ([(None, )] + [(name, ) for name in names] +
                    [(None, )]) * 1000
        assert rel.execute().fetchall() == expected
示例#20
0
 def test_dictionary_timestamps(self, duckdb_cursor):
     """Timestamp dictionaries fetch as datetime objects; NULL indices
     fetch as None.

     Removes a leftover debug ``print`` that also re-executed the
     relation before the assertion.
     """
     if not can_run:
         return
     indices_list = [None, 0, 1, 2, None]
     indices = pa.array(indices_list * 1000)
     # The fourth dictionary entry (2018-11-02) is never referenced.
     dictionary = pa.array([
         Timestamp(year=2001, month=9, day=25),
         Timestamp(year=2006, month=11, day=14),
         Timestamp(year=2012, month=5, day=15),
         Timestamp(year=2018, month=11, day=2)
     ])
     dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
     arrow_table = pa.Table.from_arrays([dict_array], ['a'])
     rel = duckdb.from_arrow(arrow_table)
     result = [(None, ), (datetime.datetime(2001, 9, 25, 0, 0), ),
               (datetime.datetime(2006, 11, 14, 0, 0), ),
               (datetime.datetime(2012, 5, 15, 0, 0), ), (None, )] * 1000
     assert rel.execute().fetchall() == result
示例#21
0
    def test_from_arrow(self, duckdb_cursor):
        """from_arrow/arrow with an explicit connection let the Arrow table
        be joined against tables that already exist on that connection.

        The original bare ``except:`` swallowed every exception (including
        KeyboardInterrupt); only a missing pyarrow should skip the test.
        """
        try:
            import pyarrow as pa
        except ImportError:
            # pyarrow is an optional dependency; skip when unavailable.
            return

        conn = duckdb.connect()
        conn.execute("create table t (a integer)")
        conn.execute("insert into t values (1)")
        test_df = pd.DataFrame.from_dict({"i": [1, 2, 3, 4]})
        test_arrow = pa.Table.from_pandas(test_df)
        # Both entry points (from_arrow and arrow) must behave identically.
        rel = duckdb.from_arrow(test_arrow, connection=conn)
        assert rel.query('t_2',
                         'select count(*) from t inner join t_2 on (a = i)'
                         ).fetchall()[0] == (1, )
        rel = duckdb.arrow(test_arrow, connection=conn)
        assert rel.query('t_2',
                         'select count(*) from t inner join t_2 on (a = i)'
                         ).fetchall()[0] == (1, )
示例#22
0
    def test_parallel_scanner_default_conn(self, duckdb_cursor):
        """A filtered Arrow Scanner read via the default connection yields
        only the rows matching the scanner's predicate."""
        if not can_run:
            return

        parquet_filename = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'data',
            'userdata1.parquet')

        # Three copies of the same file, so every match appears three times.
        arrow_dataset = pyarrow.dataset.dataset(
            [parquet_filename, parquet_filename, parquet_filename],
            format="parquet")

        scanner_filter = ((pc.field("first_name") == pc.scalar('Jose'))
                          & (pc.field("salary") > pc.scalar(134708.82)))
        arrow_scanner = Scanner.from_dataset(arrow_dataset,
                                             filter=scanner_filter)

        rel = duckdb.from_arrow(arrow_scanner)
        assert rel.aggregate('count(*)').execute().fetchone()[0] == 12
示例#23
0
    def test_unsigned_roundtrip(self, duckdb_cursor):
        """Write max-value unsigned columns to Parquet, compare the Arrow
        and Parquet ingestion paths, then round-trip a query result."""
        if not can_run:
            return
        parquet_filename = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'data',
            'unsigned.parquet')
        # Each column ends with the maximum value of its unsigned width.
        columns = (
            pyarrow.array([1, 2, 3, 4, 5, 255], type=pyarrow.uint8()),
            pyarrow.array([1, 2, 3, 4, 5, 65535], type=pyarrow.uint16()),
            pyarrow.array([1, 2, 3, 4, 5, 4294967295],
                          type=pyarrow.uint32()),
            pyarrow.array([1, 2, 3, 4, 5, 18446744073709551615],
                          type=pyarrow.uint64()),
        )
        tbl = pyarrow.Table.from_arrays(
            [columns[0], columns[1], columns[2], columns[3]],
            ['a', 'b', 'c', 'd'])
        pyarrow.parquet.write_table(tbl, parquet_filename)

        cols = 'a, b, c, d'

        unsigned_parquet_table = pyarrow.parquet.read_table(parquet_filename)
        unsigned_parquet_table.validate(full=True)
        # Both ingestion paths must produce identical projected output.
        rel_from_arrow = duckdb.arrow(unsigned_parquet_table).project(
            cols).arrow()
        rel_from_arrow.validate(full=True)
        rel_from_duckdb = duckdb.from_parquet(parquet_filename).project(
            cols).arrow()
        rel_from_duckdb.validate(full=True)
        assert rel_from_arrow.equals(rel_from_duckdb, check_metadata=True)

        # Round trip: DuckDB query -> Arrow -> DuckDB -> Arrow.
        con = duckdb.connect()
        con.execute(
            "select NULL c_null, (c % 4 = 0)::bool c_bool, (c%128)::tinyint c_tinyint, c::smallint*1000 c_smallint, c::integer*100000 c_integer, c::bigint*1000000000000 c_bigint, c::float c_float, c::double c_double, 'c_' || c::string c_string from (select case when range % 2 == 0 then range else null end as c from range(-10000, 10000)) sq"
        )
        arrow_result = con.fetch_arrow_table()
        arrow_result.validate(full=True)
        arrow_result.combine_chunks()
        arrow_result.validate(full=True)

        round_tripping = duckdb.from_arrow(arrow_result).to_arrow_table()
        round_tripping.validate(full=True)
        assert round_tripping.equals(arrow_result, check_metadata=True)
示例#24
0
def from_arrow():
    """Build a one-million-row float32 Arrow table and hand it to DuckDB.

    Returns:
        The DuckDB relation created from the Arrow table. The original
        version discarded it, making the call useless to callers; returning
        it is backward compatible (callers ignoring the result see no
        change) and lets new callers query the relation.
    """
    data = pa.array(np.random.rand(1_000_000), type=pa.float32())
    arrow_table = pa.Table.from_arrays([data], ['a'])
    return duckdb.from_arrow(arrow_table)