Example #1
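A regression test for DuckDB's filter pushdown into Arrow scans (the 2145 in the name presumably refers to the issue it guards against): the same date predicate is applied to an Arrow-backed relation and to a direct Parquet scan, and the two result frames must be identical.

All snippets on this page are methods lifted from larger test files, so their module-level setup is not shown. A minimal sketch of the imports and the can_run guard they appear to assume (an illustration, not the original header):

import os
import tempfile
import urllib.request

import numpy as np
import pandas as pd
import duckdb

try:
    import pyarrow
    import pyarrow as pa  # some snippets use the short alias
    import pyarrow.parquet as pq
    can_run = True
except ImportError:
    can_run = False  # Arrow-dependent tests return early when pyarrow is missing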
    def test_filter_pushdown_2145(self, duckdb_cursor):
        if not can_run:
            return

        date1 = pd.date_range("2018-01-01", "2018-12-31", freq="B")
        df1 = pd.DataFrame(np.random.randn(date1.shape[0], 5),
                           columns=list("ABCDE"))
        df1["date"] = date1

        date2 = pd.date_range("2019-01-01", "2019-12-31", freq="B")
        df2 = pd.DataFrame(np.random.randn(date2.shape[0], 5),
                           columns=list("ABCDE"))
        df2["date"] = date2

        pq.write_table(pa.table(df1), "data1.parquet")
        pq.write_table(pa.table(df2), "data2.parquet")

        table = pq.ParquetDataset(["data1.parquet", "data2.parquet"]).read()

        con = duckdb.connect()
        con.register_arrow("testarrow", table)  # registered view is not exercised below

        output_df = duckdb.arrow(table).filter("date > '2019-01-01'").df()
        expected_df = duckdb.from_parquet("data*.parquet").filter(
            "date > '2019-01-01'").df()
        pd.testing.assert_frame_equal(expected_df, output_df)

        os.remove("data1.parquet")
        os.remove("data2.parquet")
Example #2
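Round-trips a local userdata1.parquet fixture through Arrow: the file is read with pyarrow, projected through a DuckDB relation back to an Arrow table, and compared against the same projection over a direct Parquet scan, including rebatched copies at several batch sizes.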
    def test_parquet_roundtrip(self, duckdb_cursor):
        if not can_run:
            return
        parquet_filename = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'data',
            'userdata1.parquet')
        cols = 'id, first_name, last_name, email, gender, ip_address, cc, country, birthdate, salary, title, comments'

        # TODO timestamp

        userdata_parquet_table = pyarrow.parquet.read_table(parquet_filename)
        userdata_parquet_table.validate(full=True)
        rel_from_arrow = duckdb.arrow(userdata_parquet_table).project(
            cols).arrow()
        rel_from_arrow.validate(full=True)

        rel_from_duckdb = duckdb.from_parquet(parquet_filename).project(
            cols).arrow()
        rel_from_duckdb.validate(full=True)

        # batched version, let's use various values for batch size
        for i in [7, 51, 99, 100, 101, 500, 1000, 2000]:
            userdata_parquet_table2 = pyarrow.Table.from_batches(
                userdata_parquet_table.to_batches(i))
            assert userdata_parquet_table.equals(userdata_parquet_table2,
                                                 check_metadata=True)

            rel_from_arrow2 = duckdb.arrow(userdata_parquet_table2).project(
                cols).arrow()
            rel_from_arrow2.validate(full=True)

            assert rel_from_arrow.equals(rel_from_arrow2, check_metadata=True)
            assert rel_from_arrow.equals(rel_from_duckdb, check_metadata=True)
Example #3
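The same projection round trip for a fixture of unsigned integer columns, followed by a second round trip in the other direction: a DuckDB query covering NULLs and the main scalar types is fetched as an Arrow table, fed back through duckdb.from_arrow, and must come out unchanged.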
    def test_unsigned_roundtrip(self, duckdb_cursor):
        if not can_run:
            return
        parquet_filename = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'data',
            'unsigned.parquet')
        cols = 'a, b, c, d'

        unsigned_parquet_table = pyarrow.parquet.read_table(parquet_filename)
        unsigned_parquet_table.validate(full=True)
        rel_from_arrow = duckdb.arrow(unsigned_parquet_table).project(
            cols).arrow()
        rel_from_arrow.validate(full=True)

        rel_from_duckdb = duckdb.from_parquet(parquet_filename).project(
            cols).arrow()
        rel_from_duckdb.validate(full=True)

        assert rel_from_arrow.equals(rel_from_duckdb, check_metadata=True)

        con = duckdb.connect()
        con.execute(
            "select NULL c_null, (c % 4 = 0)::bool c_bool, (c%128)::tinyint c_tinyint, c::smallint*1000 c_smallint, c::integer*100000 c_integer, c::bigint*1000000000000 c_bigint, c::float c_float, c::double c_double, 'c_' || c::string c_string from (select case when range % 2 == 0 then range else null end as c from range(-10000, 10000)) sq"
        )
        arrow_result = con.fetch_arrow_table()
        arrow_result.validate(full=True)
        arrow_result.combine_chunks()
        arrow_result.validate(full=True)

        round_tripping = duckdb.from_arrow(arrow_result).to_arrow_table()
        round_tripping.validate(full=True)

        assert round_tripping.equals(arrow_result, check_metadata=True)
Example #4
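A variant of Example #3 that writes the unsigned.parquet fixture itself, one column per unsigned width with each type's maximum as the last value. Note the final round trip here calls duckdb.from_arrow_table where Example #3 calls duckdb.from_arrow; the former looks like an older spelling of the same entry point.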
    def test_unsigned_roundtrip(self, duckdb_cursor):
        if not can_run:
            return
        parquet_filename = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'data',
            'unsigned.parquet')
        # one column per unsigned width, each ending in the type's maximum value
        data = (
            pyarrow.array([1, 2, 3, 4, 5, 255], type=pyarrow.uint8()),
            pyarrow.array([1, 2, 3, 4, 5, 65535], type=pyarrow.uint16()),
            pyarrow.array([1, 2, 3, 4, 5, 4294967295], type=pyarrow.uint32()),
            pyarrow.array([1, 2, 3, 4, 5, 18446744073709551615],
                          type=pyarrow.uint64()),
        )

        tbl = pyarrow.Table.from_arrays(list(data), ['a', 'b', 'c', 'd'])
        pyarrow.parquet.write_table(tbl, parquet_filename)

        cols = 'a, b, c, d'

        unsigned_parquet_table = pyarrow.parquet.read_table(parquet_filename)
        unsigned_parquet_table.validate(full=True)
        rel_from_arrow = duckdb.arrow(unsigned_parquet_table).project(cols).arrow()
        rel_from_arrow.validate(full=True)

        rel_from_duckdb = duckdb.from_parquet(parquet_filename).project(cols).arrow()
        rel_from_duckdb.validate(full=True)

        assert rel_from_arrow.equals(rel_from_duckdb, check_metadata=True)

        con = duckdb.connect()
        con.execute("select NULL c_null, (c % 4 = 0)::bool c_bool, (c%128)::tinyint c_tinyint, c::smallint*1000 c_smallint, c::integer*100000 c_integer, c::bigint*1000000000000 c_bigint, c::float c_float, c::double c_double, 'c_' || c::string c_string from (select case when range % 2 == 0 then range else null end as c from range(-10000, 10000)) sq")
        arrow_result = con.fetch_arrow_table()
        arrow_result.validate(full=True)
        arrow_result.combine_chunks()
        arrow_result.validate(full=True)

        round_tripping = duckdb.from_arrow_table(arrow_result).to_arrow_table()
        round_tripping.validate(full=True)

        assert round_tripping.equals(arrow_result, check_metadata=True)
Example #5
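Sets the binary_as_string PRAGMA on the default connection (and passes the corresponding flag to from_parquet), so a Parquet binary column reads back as VARCHAR rather than BLOB. filename is a module-level fixture defined outside the snippet; see the sketch at the end of the page.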
    def test_from_parquet_binary_as_string_default_conn(self, duckdb_cursor):
        duckdb.default_connection.execute("PRAGMA binary_as_string=1")

        rel = duckdb.from_parquet(filename, True)
        assert rel.types == ['VARCHAR']

        res = rel.execute().fetchall()
        assert res[0] == ('foo', )
Example #6
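The most complete variant: downloads userdata1.parquet, performs the projection and rebatching checks of Example #2, then finishes with the DuckDB-to-Arrow-and-back round trip of Example #3.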
    def test_arrow(self, duckdb_cursor):

        if not can_run:
            return

        parquet_filename = 'userdata1.parquet'
        urllib.request.urlretrieve(
            'https://github.com/cwida/duckdb-data/releases/download/v1.0/userdata1.parquet',
            parquet_filename)

        cols = 'id, first_name, last_name, email, gender, ip_address, cc, country, birthdate, salary, title, comments'

        # TODO timestamp

        userdata_parquet_table = pyarrow.parquet.read_table(parquet_filename)
        userdata_parquet_table.validate(full=True)

        rel_from_arrow = duckdb.arrow(userdata_parquet_table).project(
            cols).arrow()
        rel_from_arrow.validate(full=True)

        rel_from_duckdb = duckdb.from_parquet(parquet_filename).project(
            cols).arrow()
        rel_from_duckdb.validate(full=True)

        # batched version, let's use various values for batch size
        for i in [7, 51, 99, 100, 101, 500, 1000, 2000]:
            userdata_parquet_table2 = pyarrow.Table.from_batches(
                userdata_parquet_table.to_batches(i))
            assert userdata_parquet_table.equals(userdata_parquet_table2,
                                                 check_metadata=True)

            rel_from_arrow2 = duckdb.arrow(userdata_parquet_table2).project(
                cols).arrow()
            rel_from_arrow2.validate(full=True)

            assert rel_from_arrow.equals(rel_from_arrow2, check_metadata=True)
            assert rel_from_arrow.equals(rel_from_duckdb, check_metadata=True)

        con = duckdb.connect()
        con.execute(
            "select NULL c_null, (c % 4 = 0)::bool c_bool, (c%128)::tinyint c_tinyint, c::smallint*1000 c_smallint, c::integer*100000 c_integer, c::bigint*1000000000000 c_bigint, c::float c_float, c::double c_double, 'c_' || c::string c_string from (select case when range % 2 == 0 then range else null end as c from range(-10000, 10000)) sq"
        )
        arrow_result = con.fetch_arrow_table()
        arrow_result.validate(full=True)
        arrow_result.combine_chunks()
        arrow_result.validate(full=True)

        round_tripping = duckdb.from_arrow_table(arrow_result).to_arrow_table()
        round_tripping.validate(full=True)

        assert round_tripping.equals(arrow_result, check_metadata=True)
Example #7
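Shows from_parquet against an explicit connection: the Parquet-backed relation is exposed as t_2 inside rel.query() and joined against a native table on that connection.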
    def test_from_parquet(self, duckdb_cursor):
        try:
            import pyarrow as pa  # availability check only; the alias is unused below
        except ImportError:
            return
        # note: tempfile._get_candidate_names() is a private stdlib API
        temp_file_name = os.path.join(tempfile.mkdtemp(),
                                      next(tempfile._get_candidate_names()))
        conn = duckdb.connect()
        conn.execute("create table t (a integer)")
        conn.execute("insert into t values (1)")
        test_df = pd.DataFrame.from_dict({"i": [1, 2, 3, 4]})
        test_df.to_parquet(temp_file_name, index=False)
        rel = duckdb.from_parquet(temp_file_name, connection=conn)
        # rel.query() exposes the relation as 't_2' for the duration of the query
        assert rel.query(
            't_2',
            'select count(*) from t inner join t_2 on (a = i)'
        ).fetchall()[0] == (1,)
Example #8
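Identical to the first half of Example #6: download, projection round trip, and rebatching checks, without the closing Arrow round trip.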
    def test_arrow(self, duckdb_cursor):

        if not can_run:
            return

        parquet_filename = 'userdata1.parquet'
        urllib.request.urlretrieve(
            'https://github.com/cwida/duckdb-data/releases/download/v1.0/userdata1.parquet',
            parquet_filename)

        cols = 'id, first_name, last_name, email, gender, ip_address, cc, country, birthdate, salary, title, comments'

        # TODO timestamp

        userdata_parquet_table = pyarrow.parquet.read_table(parquet_filename)
        userdata_parquet_table.validate(full=True)

        rel_from_arrow = duckdb.arrow(userdata_parquet_table).project(
            cols).arrow()
        rel_from_arrow.validate(full=True)

        rel_from_duckdb = duckdb.from_parquet(parquet_filename).project(
            cols).arrow()
        rel_from_duckdb.validate(full=True)

        # batched version, let's use various values for batch size
        for i in [7, 51, 99, 100, 101, 500, 1000, 2000]:
            userdata_parquet_table2 = pyarrow.Table.from_batches(
                userdata_parquet_table.to_batches(i))
            assert userdata_parquet_table.equals(userdata_parquet_table2,
                                                 check_metadata=True)

            rel_from_arrow2 = duckdb.arrow(userdata_parquet_table2).project(
                cols).arrow()
            rel_from_arrow2.validate(full=True)

            assert rel_from_arrow.equals(rel_from_arrow2, check_metadata=True)
            assert rel_from_arrow.equals(rel_from_duckdb, check_metadata=True)
Example #9
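Like Example #5 but without touching any PRAGMA; the second positional argument to from_parquet (True here) appears to be the binary_as_string flag, which on its own is enough to surface the column as VARCHAR.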
    def test_from_parquet_binary_as_string(self, duckdb_cursor):
        rel = duckdb.from_parquet(filename, True)
        assert rel.types == ['VARCHAR']

        res = rel.execute().fetchall()
        assert res[0] == ('foo', )
Example #10
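The control case for the two previous examples: with the flag left at its default, the same column comes back as BLOB and the row as bytes.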
    def test_from_parquet_binary(self, duckdb_cursor):
        rel = duckdb.from_parquet(filename)
        assert rel.types == ['BLOB']

        res = rel.execute().fetchall()
        assert res[0] == (b'foo', )
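Examples #5, #9 and #10 read a module-level filename that their source file defines elsewhere. A plausible fixture consistent with the assertions (one binary column, one row holding b'foo'); the path and column name below are illustrative, not the originals:

import os
import tempfile
import pyarrow as pa
import pyarrow.parquet as pq

# a single-column, single-row Parquet file whose value round-trips as BLOB b'foo'
filename = os.path.join(tempfile.gettempdir(), 'binary_string.parquet')
pq.write_table(pa.table({'a': pa.array([b'foo'], type=pa.binary())}), filename)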