Пример #1
0
def test_table_pickle():
    """Round-trip a Table through pickle and verify it survives intact."""
    columns = [
        pa.chunked_array([[1, 2], [3, 4]], type=pa.uint32()),
        pa.chunked_array([["some", "strings", None, ""]], type=pa.string()),
    ]
    fields = [pa.field('ints', pa.uint32()),
              pa.field('strs', pa.string())]
    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    table = pa.Table.from_arrays(columns, schema=schema)

    restored = pickle.loads(pickle.dumps(table))
    restored._validate()
    assert restored.equals(table)
Пример #2
0
def test_type_to_pandas_dtype():
    """Every Arrow type must report the expected pandas/numpy dtype."""
    m8_ns = np.dtype('datetime64[ns]')
    expectations = {
        pa.null(): np.float64,
        pa.bool_(): np.bool_,
        pa.int8(): np.int8,
        pa.int16(): np.int16,
        pa.int32(): np.int32,
        pa.int64(): np.int64,
        pa.uint8(): np.uint8,
        pa.uint16(): np.uint16,
        pa.uint32(): np.uint32,
        pa.uint64(): np.uint64,
        pa.float16(): np.float16,
        pa.float32(): np.float32,
        pa.float64(): np.float64,
        pa.date32(): m8_ns,
        pa.date64(): m8_ns,
        pa.timestamp('ms'): m8_ns,
        pa.binary(): np.object_,
        pa.binary(12): np.object_,
        pa.string(): np.object_,
        pa.list_(pa.int8()): np.object_,
    }
    for arrow_type, expected in expectations.items():
        assert arrow_type.to_pandas_dtype() == expected
Пример #3
0
def test_type_to_pandas_dtype():
    """Check the Arrow -> pandas dtype mapping for a spread of types."""
    m8 = np.dtype('datetime64[ns]')
    pairs = [
        (pa.null(), np.float64), (pa.bool_(), np.bool_),
        (pa.int8(), np.int8), (pa.int16(), np.int16),
        (pa.int32(), np.int32), (pa.int64(), np.int64),
        (pa.uint8(), np.uint8), (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32), (pa.uint64(), np.uint64),
        (pa.float16(), np.float16), (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), m8), (pa.date64(), m8), (pa.timestamp('ms'), m8),
        (pa.binary(), np.object_), (pa.binary(12), np.object_),
        (pa.string(), np.object_), (pa.list_(pa.int8()), np.object_),
    ]
    assert all(t.to_pandas_dtype() == dt for t, dt in pairs)
Пример #4
0
File: jvm.py  Project: rok/arrow
def _from_jvm_int_type(jvm_type):
    """
    Convert a JVM int type to its Python equivalent.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Int

    Returns
    -------
    typ: pyarrow.DataType

    Raises
    ------
    NotImplementedError
        If the JVM type's bit width is not one of 8, 16, 32 or 64.
        (Previously such widths silently fell through and returned None.)
    """
    if jvm_type.isSigned:
        by_width = {8: pa.int8(), 16: pa.int16(),
                    32: pa.int32(), 64: pa.int64()}
    else:
        by_width = {8: pa.uint8(), 16: pa.uint16(),
                    32: pa.uint32(), 64: pa.uint64()}
    try:
        return by_width[jvm_type.bitWidth]
    except KeyError:
        raise NotImplementedError(
            "Unsupported JVM int bit width: {}".format(jvm_type.bitWidth))
Пример #5
0
def test_empty_cast():
    """ARROW-4766: casting empty arrays between common types must not crash."""
    common_types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]

    for source, target in itertools.product(common_types, repeat=2):
        empty = pa.array([], type=source)
        try:
            empty.cast(target)
        except pa.lib.ArrowNotImplementedError:
            # Unsupported conversions are fine; we only guard against segfaults.
            pass
Пример #6
0
def test_from_numpy_dtype():
    """pa.from_numpy_dtype maps numpy dtypes (and dtype-likes) to Arrow types."""
    cases = [(np.dtype('bool'), pa.bool_()), (np.dtype('int8'), pa.int8()),
             (np.dtype('int16'), pa.int16()), (np.dtype('int32'), pa.int32()),
             (np.dtype('int64'), pa.int64()), (np.dtype('uint8'), pa.uint8()),
             (np.dtype('uint16'), pa.uint16()),
             (np.dtype('uint32'), pa.uint32()),
             (np.dtype('float16'), pa.float16()),
             (np.dtype('float32'), pa.float32()),
             (np.dtype('float64'), pa.float64()), (np.dtype('U'), pa.string()),
             (np.dtype('S'), pa.binary()),
             (np.dtype('datetime64[s]'), pa.timestamp('s')),
             (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
             (np.dtype('datetime64[us]'), pa.timestamp('us')),
             (np.dtype('datetime64[ns]'), pa.timestamp('ns'))]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    # np.unicode was a deprecated alias removed in NumPy 1.24; np.str_ is the
    # supported spelling of the same scalar type.
    assert pa.from_numpy_dtype(np.str_) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')
Пример #7
0
def test_empty_cast():
    """Ensure every pairwise cast of an empty array is safe (ARROW-4766)."""
    all_types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]

    for src in all_types:
        for dst in all_types:
            try:
                # Only checking that no segfault occurs on empty input.
                pa.array([], type=src).cast(dst)
            except pa.lib.ArrowNotImplementedError:
                continue
Пример #8
0
def test_bit_width():
    """bit_width is defined for fixed-width types and raises otherwise."""
    fixed_width_cases = [(pa.bool_(), 1), (pa.int8(), 8), (pa.uint32(), 32),
                         (pa.float16(), 16), (pa.decimal128(19, 4), 128),
                         (pa.binary(42), 42 * 8)]
    for typ, width in fixed_width_cases:
        assert typ.bit_width == width
    variable_width = (pa.binary(), pa.string(), pa.list_(pa.int16()))
    for typ in variable_width:
        with pytest.raises(ValueError, match="fixed width"):
            typ.bit_width
Пример #9
0
def test_table(n, types=None, offset=None, length=None, nullable=True):
    """Build a pyarrow Table of generated columns.

    When *types* is omitted, a broad default list covering primitive,
    temporal, dictionary and nested types is used.  For each type one
    non-nullable column is generated, plus a nullable twin (suffixed
    " (null)") when *nullable* is set.  When *offset* is given, every
    array is first sliced to (offset, length).
    """
    if types is None:
        types = [
            pyarrow.null(),
            pyarrow.bool_(),
            pyarrow.int8(),
            pyarrow.int16(),
            pyarrow.int32(),
            pyarrow.int64(),
            pyarrow.uint8(),
            pyarrow.uint16(),
            pyarrow.uint32(),
            pyarrow.uint64(),
            pyarrow.float16(),
            pyarrow.float32(),
            pyarrow.float64(),
            pyarrow.date32(),
            pyarrow.date64(),
            pyarrow.timestamp('s'),
            pyarrow.timestamp('ms'),
            pyarrow.timestamp('us'),
            pyarrow.timestamp('ns'),
            pyarrow.time32('s'),
            pyarrow.time32('ms'),
            pyarrow.time64('us'),
            pyarrow.time64('ns'),
            pyarrow.string(),
            pyarrow.binary(),
            pyarrow.binary(4),
            pyarrow.dictionary(pyarrow.int32(), pyarrow.string(), True),
            pyarrow.dictionary(pyarrow.int64(), pyarrow.int64(), True),
            pyarrow.dictionary(pyarrow.int32(), pyarrow.string(), False),
            pyarrow.dictionary(pyarrow.int64(), pyarrow.int64(), False),
            pyarrow.list_(pyarrow.int32()),
            pyarrow.struct([pyarrow.field('int32', pyarrow.int32())]),
            pyarrow.list_(
                pyarrow.struct([pyarrow.field('int32', pyarrow.int32())])),
            pyarrow.struct(
                [pyarrow.field('int32', pyarrow.list_(pyarrow.int32()))]),
        ]

    columns = []
    for typ in types:
        # Non-nullable column first, then (optionally) its nullable twin,
        # preserving the original column ordering.
        variants = [(str(typ), False)]
        if nullable:
            variants.append((str(typ) + ' (null)', True))
        for col_name, with_nulls in variants:
            arr = TestArrayGenerator(n, typ, with_nulls).array
            if offset is not None:
                arr = arr.slice(offset, length)
            columns.append(pyarrow.column(col_name, arr))

    return pyarrow.Table.from_arrays(columns)
Пример #10
0
def test_type_for_alias():
    """Every string alias should resolve to the corresponding Arrow type."""
    alias_map = {
        'i1': pa.int8(),
        'int8': pa.int8(),
        'i2': pa.int16(),
        'int16': pa.int16(),
        'i4': pa.int32(),
        'int32': pa.int32(),
        'i8': pa.int64(),
        'int64': pa.int64(),
        'u1': pa.uint8(),
        'uint8': pa.uint8(),
        'u2': pa.uint16(),
        'uint16': pa.uint16(),
        'u4': pa.uint32(),
        'uint32': pa.uint32(),
        'u8': pa.uint64(),
        'uint64': pa.uint64(),
        'f4': pa.float32(),
        'float32': pa.float32(),
        'f8': pa.float64(),
        'float64': pa.float64(),
        'date32': pa.date32(),
        'date64': pa.date64(),
        'string': pa.string(),
        'str': pa.string(),
        'binary': pa.binary(),
        'time32[s]': pa.time32('s'),
        'time32[ms]': pa.time32('ms'),
        'time64[us]': pa.time64('us'),
        'time64[ns]': pa.time64('ns'),
        'timestamp[s]': pa.timestamp('s'),
        'timestamp[ms]': pa.timestamp('ms'),
        'timestamp[us]': pa.timestamp('us'),
        'timestamp[ns]': pa.timestamp('ns'),
        'duration[s]': pa.duration('s'),
        'duration[ms]': pa.duration('ms'),
        'duration[us]': pa.duration('us'),
        'duration[ns]': pa.duration('ns'),
        'month_day_nano_interval': pa.month_day_nano_interval(),
    }

    for alias, expected in alias_map.items():
        assert pa.type_for_alias(alias) == expected
Пример #11
0
    def test_update_arrow_update_float_schema_with_uint32(self, util):
        """Updating a float column with uint32 arrow data keeps the values."""
        values = [random.randint(0, 2000000) for _ in range(100)]
        frame = pd.DataFrame({"a": np.array(values, dtype=np.uint32)})

        schema = pa.schema({"a": pa.uint32()})

        arrow = util.make_arrow_from_pandas(frame, schema)
        target = Table({"a": float})
        target.update(arrow)
        assert target.view().to_dict()["a"] == values
Пример #12
0
def dataframe_with_arrays(include_index=False):
    """
    Dataframe with numpy arrays columns of every possible primitive type.

    Each column holds lists (numpy arrays) of values, including a None
    entry, so the resulting Arrow schema uses list types throughout.

    Parameters
    ----------
    include_index : bool, default False
        If True, append an int64 '__index_level_0__' field to the schema
        (the field only; the DataFrame itself is unchanged).

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    # (numpy dtype string, matching Arrow value type) for each numeric column.
    dtypes = [('i1', pa.int8()), ('i2', pa.int16()),
              ('i4', pa.int32()), ('i8', pa.int64()),
              ('u1', pa.uint8()), ('u2', pa.uint16()),
              ('u4', pa.uint32()), ('u8', pa.uint64()),
              ('f4', pa.float32()), ('f8', pa.float64())]

    arrays = OrderedDict()
    fields = []
    for dtype, arrow_dtype in dtypes:
        # Column name doubles as the numpy dtype string.
        fields.append(pa.field(dtype, pa.list_(arrow_dtype)))
        arrays[dtype] = [
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ]

    fields.append(pa.field('str', pa.list_(pa.string())))
    arrays['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object")
    ]

    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    # Nanosecond-precision strings truncate to the 'ms' unit of the dtype.
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))
    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
Пример #13
0
def test_context():
    """Exercise a fletcher Context end to end against the echo platform.

    Builds a record batch plus two stand-alone arrays, queues them on a
    Context, enables it (writing the buffers), then terminates the
    platform.  NOTE(review): `pf` is presumably the fletcher platform
    module — confirm against the surrounding imports.
    """
    # Create
    platform = pf.Platform("echo")

    # Init
    platform.init()

    # Create a schema with some stuff
    fields = [
        pa.field("a", pa.uint64(), False),
        pa.field("b", pa.string(), False),
        pa.field("c", pa.uint64(), True),
        pa.field("d", pa.list_(pa.field("e", pa.uint32(), True)), False)
    ]

    schema = pa.schema(fields)

    a = pa.array([1, 2, 3, 4], type=pa.uint64())
    b = pa.array(["hello", "world", "fletcher", "arrow"], type=pa.string())
    # Mask marks entries as null, matching the nullable field "c".
    c = pa.array([5, 6, 7, 8],
                 mask=np.array([True, False, True, True]),
                 type=pa.uint64())
    d = pa.array([[9, 10, 11, 12], [13, 14], [15, 16, 17], [18]],
                 type=pa.list_(pa.uint32()))
    f = pa.array([19, 20, 21, 22], type=pa.uint32())
    g = pa.array([23, 24, 25, 26], type=pa.uint32())

    rb = pa.RecordBatch.from_arrays([a, b, c, d], schema)

    context = pf.Context(platform)

    context.queue_record_batch(rb)

    # Queue one array without, and one with, an explicit field.
    context.queue_array(f)

    context.queue_array(g, field=pa.field("g", pa.uint32(), False))

    # Write buffers
    context.enable()

    # Terminate
    platform.terminate()
Пример #14
0
def normalize_arrow_dtype(dtype):  # noqa: C901
    """Translate a textual (C++-flavoured) type name into a pyarrow DataType.

    Parameters
    ----------
    dtype : str or None
        A name such as 'int32_t', 'double', 'std::string', or a
        'large_list<item: ...>' spelling. None maps to pa.null().

    Returns
    -------
    pyarrow.DataType

    Raises
    ------
    ValueError
        If the name is not recognized.
    """
    if dtype in ['bool']:
        return pa.bool_()
    if dtype in ['int8_t', 'int8', 'byte']:
        return pa.int8()
    if dtype in ['uint8_t', 'uint8', 'char']:
        return pa.uint8()
    if dtype in ['int16_t', 'int16', 'short']:
        return pa.int16()
    if dtype in ['uint16_t', 'uint16']:
        return pa.uint16()
    if dtype in ['int32_t', 'int32', 'int']:
        return pa.int32()
    if dtype in ['uint32_t', 'uint32']:
        return pa.uint32()
    if dtype in ['int64_t', 'int64', 'long']:
        return pa.int64()
    if dtype in ['uint64_t', 'uint64']:
        return pa.uint64()
    if dtype in ['half']:
        return pa.float16()
    if dtype in ['float', 'float32']:
        return pa.float32()
    if dtype in ['double', 'float64']:
        return pa.float64()
    if dtype in ['string', 'std::string', 'std::__1::string', 'str']:
        return pa.large_string()
    if dtype in ['large_list<item: int32>']:
        return pa.large_list(pa.int32())
    if dtype in ['large_list<item: uint32>']:
        return pa.large_list(pa.uint32())
    if dtype in ['large_list<item: int64>']:
        return pa.large_list(pa.int64())
    if dtype in ['large_list<item: uint64>']:
        return pa.large_list(pa.uint64())
    if dtype in ['large_list<item: float>']:
        # BUG FIX: pa.float() does not exist — float is 32-bit, pa.float32().
        return pa.large_list(pa.float32())
    if dtype in ['large_list<item: double>']:
        # BUG FIX: pa.double() does not exist — double is 64-bit, pa.float64().
        return pa.large_list(pa.float64())
    if dtype in ['null', 'NULL', 'None', None]:
        return pa.null()
    raise ValueError('Unsupported data type: %s' % dtype)
Пример #15
0
def test_bit_width():
    """Fixed-width types report bit_width; variable-width ones raise."""
    cases = {
        pa.bool_(): 1,
        pa.int8(): 8,
        pa.uint32(): 32,
        pa.float16(): 16,
        pa.decimal128(19, 4): 128,
        pa.binary(42): 42 * 8,
    }
    for data_type, bits in cases.items():
        assert data_type.bit_width == bits
    for data_type in (pa.binary(), pa.string(), pa.list_(pa.int16())):
        with pytest.raises(ValueError, match="fixed width"):
            data_type.bit_width
Пример #16
0
def list_array_builder(client, array, builder):
    """Register a (Large)ListArray with vineyard and return its metadata."""
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::LargeListArray'
    meta['length_'] = len(array)
    meta['null_count_'] = array.null_count
    meta['offset_'] = array.offset

    if isinstance(array, pa.ListArray):
        # 32-bit offsets: widen them to the 64-bit offsets vineyard stores.
        raw_offsets = array.buffers()[1]
        n_offsets = len(raw_offsets) // (pa.uint32().bit_width // 8)
        as_uint32 = pa.Array.from_buffers(pa.uint32(), n_offsets,
                                          [None, raw_offsets])
        offset_buffer = as_uint32.cast(pa.uint64()).buffers()[1]
    else:  # pa.LargeListArray already carries 64-bit offsets
        offset_buffer = array.buffers()[1]

    meta.add_member('null_bitmap_',
                    buffer_builder(client, array.buffers()[0], builder))
    meta.add_member('buffer_offsets_',
                    buffer_builder(client, offset_buffer, builder))
    meta.add_member('values_', builder.run(client, array.values))
    meta['nbytes'] = array.nbytes
    return client.create_metadata(meta)
Пример #17
0
    def read_product(self,
                     keep_groups=None,
                     drop_groups=None,
                     keep_modules=None,
                     drop_modules=None):
        """Load the product master file into ``self.prod_df``.

        Reads the tab-delimited product file with explicit column types,
        optionally filters rows by product group / module codes,
        dictionary-encodes a few repetitive columns to save memory, and
        cleans up the descriptive text columns.

        Args:
            keep_groups: iterable of product_group_code values to keep.
            drop_groups: iterable of product_group_code values to drop.
            keep_modules: iterable of product_module_code values to keep.
            drop_modules: iterable of product_module_code values to drop.

        Returns:
            None. The resulting DataFrame is stored on ``self.prod_df``.
        """
        # Columns to load from the product file.
        prod_cols = [
            'upc', 'upc_ver_uc', 'upc_descr', 'product_module_code',
            'product_module_descr', 'product_group_code',
            'product_group_descr', 'brand_code_uc', 'brand_descr', 'multi',
            'size1_code_uc', 'size1_amount', 'size1_units', 'dataset_found_uc',
            'size1_change_flag_uc'
        ]

        # Narrow Arrow integer types for columns pandas would default to int64.
        prod_dict = {
            'upc': pa.int64(),
            'upc_ver_uc': pa.int8(),
            'product_module_code': pa.uint16(),
            'brand_code_uc': pa.uint32(),
            'multi': pa.uint16(),
            'size1_code_uc': pa.uint16()
        }

        prod_df = csv.read_csv(self.product_file,
                               read_options=csv.ReadOptions(encoding='latin'),
                               parse_options=csv.ParseOptions(delimiter='\t'),
                               convert_options=csv.ConvertOptions(
                                   column_types=prod_dict,
                                   include_columns=prod_cols)).to_pandas()
        if keep_groups:
            prod_df = prod_df[prod_df['product_group_code'].isin(keep_groups)]
        if drop_groups:
            prod_df = prod_df[~prod_df['product_group_code'].isin(drop_groups)]
        if keep_modules:
            prod_df = prod_df[prod_df['product_module_code'].isin(
                keep_modules)]
        if drop_modules:
            prod_df = prod_df[~prod_df['product_module_code'].isin(drop_modules
                                                                   )]

        # dictionary encoding to save space
        prod_df['size1_units'] = prod_df['size1_units'].astype('category')
        prod_df['product_module_descr'] = prod_df[
            'product_module_descr'].astype('category')
        prod_df['product_group_code'] = prod_df['product_group_code'].astype(
            'category')

        # clean up product info
        prod_df['upc_descr'] = prod_df['upc_descr'].str.strip().str.replace(
            'RTE', '')
        prod_df['brand_descr'] = prod_df['brand_descr'].str.strip(
        ).str.replace('CTL BR', 'Private Label')
        self.prod_df = prod_df.copy()
        return
Пример #18
0
def test_format_uint32_array():
    """Thousands-separator formatting of a uint32 array; nulls pass through."""
    source = pa.array(
        [1, 1, 2, 2, 3_000, 3_000, 4_000_000, 4_000_000, None, None, 6, 6],
        pa.uint32(),
    )
    formatted = format_number_array(source, parse_number_format("{:,d}"))
    expected = [
        "1", "1",
        "2", "2",
        "3,000", "3,000",
        "4,000,000", "4,000,000",
        None, None,
        "6", "6",
    ]
    assert formatted.to_pylist() == expected
Пример #19
0
def test_type_for_alias():
    """pa.type_for_alias resolves each known alias to its Arrow type."""
    cases = [
        ('i1', pa.int8()), ('int8', pa.int8()),
        ('i2', pa.int16()), ('int16', pa.int16()),
        ('i4', pa.int32()), ('int32', pa.int32()),
        ('i8', pa.int64()), ('int64', pa.int64()),
        ('u1', pa.uint8()), ('uint8', pa.uint8()),
        ('u2', pa.uint16()), ('uint16', pa.uint16()),
        ('u4', pa.uint32()), ('uint32', pa.uint32()),
        ('u8', pa.uint64()), ('uint64', pa.uint64()),
        ('f4', pa.float32()), ('float32', pa.float32()),
        ('f8', pa.float64()), ('float64', pa.float64()),
        ('date32', pa.date32()), ('date64', pa.date64()),
        ('string', pa.string()), ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')), ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')), ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
    ]

    for alias, expected_type in cases:
        resolved = pa.type_for_alias(alias)
        assert resolved == expected_type
def _map_arrow_type(arrow_type):
    """Map an Arrow data type to a Deephaven type-annotation dict.

    Returns ``{"deephaven:type": <name>}``. Empty-string entries in the
    table mark Arrow types with no Deephaven equivalent; for those (and
    any unknown type) a DHError is raised, except timezone-aware
    timestamps which map to io.deephaven.time.DateTime.
    """
    arrow_to_dh = {
        pa.null(): '',
        pa.bool_(): '',
        pa.int8(): 'byte',
        pa.int16(): 'short',
        pa.int32(): 'int',
        pa.int64(): 'long',
        pa.uint8(): '',
        pa.uint16(): 'char',
        pa.uint32(): '',
        pa.uint64(): '',
        pa.float16(): '',
        pa.float32(): 'float',
        pa.float64(): 'double',
        pa.time32('s'): '',
        pa.time32('ms'): '',
        pa.time64('us'): '',
        pa.time64('ns'): 'io.deephaven.time.DateTime',
        pa.timestamp('us', tz=None): '',
        pa.timestamp('ns', tz=None): '',
        pa.date32(): 'java.time.LocalDate',
        pa.date64(): 'java.time.LocalDate',
        pa.binary(): '',
        # NOTE(review): pa.string() and pa.utf8() are aliases for the same
        # type, so these two entries collapse to a single dict key.
        pa.string(): 'java.lang.String',
        pa.utf8(): 'java.lang.String',
        pa.large_binary(): '',
        pa.large_string(): '',
        pa.large_utf8(): '',
        # decimal128(int precision, int scale=0)
        # list_(value_type, int list_size=-1)
        # large_list(value_type)
        # map_(key_type, item_type[, keys_sorted])
        # struct(fields)
        # dictionary(index_type, value_type, …)
        # field(name, type, bool nullable = True[, metadata])
        # schema(fields[, metadata])
        # from_numpy_dtype(dtype)
    }

    dh_type = arrow_to_dh.get(arrow_type)
    if not dh_type:
        # if this is a case of timestamp with tz specified
        if isinstance(arrow_type, pa.TimestampType):
            dh_type = "io.deephaven.time.DateTime"

    if not dh_type:
        raise DHError(f'unsupported arrow data type : {arrow_type}')

    return {"deephaven:type": dh_type}
Пример #21
0
def _get_numba_typ_from_pa_typ(pa_typ):
    """Translate a pyarrow DataType into the corresponding numba type.

    All date/timestamp variants collapse to NPDatetime('ns').
    Raises ValueError for Arrow types not present in the mapping.
    """
    import pyarrow as pa
    _typ_map = {
        # boolean
        pa.bool_():
        types.bool_,
        # signed int types
        pa.int8():
        types.int8,
        pa.int16():
        types.int16,
        pa.int32():
        types.int32,
        pa.int64():
        types.int64,
        # unsigned int types
        pa.uint8():
        types.uint8,
        pa.uint16():
        types.uint16,
        pa.uint32():
        types.uint32,
        pa.uint64():
        types.uint64,
        # float types (TODO: float16?)
        pa.float32():
        types.float32,
        pa.float64():
        types.float64,
        # String
        pa.string():
        string_type,
        # date
        pa.date32():
        types.NPDatetime('ns'),
        pa.date64():
        types.NPDatetime('ns'),
        # time (TODO: time32, time64, ...)
        pa.timestamp('ns'):
        types.NPDatetime('ns'),
        pa.timestamp('us'):
        types.NPDatetime('ns'),
        pa.timestamp('ms'):
        types.NPDatetime('ns'),
        pa.timestamp('s'):
        types.NPDatetime('ns'),
    }
    if pa_typ not in _typ_map:
        raise ValueError("Arrow data type {} not supported yet".format(pa_typ))
    return _typ_map[pa_typ]
Пример #22
0
def string_array_builder(client, array, builder):
    """Register a pyarrow (Large)StringArray with vineyard.

    Builds vineyard metadata for the array's three buffers (null bitmap,
    offsets, character data) and returns the created metadata object.
    """
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::BaseBinaryArray<arrow::LargeStringArray>'
    meta['length_'] = len(array)
    meta['null_count_'] = array.null_count
    meta['offset_'] = array.offset

    null_bitmap = buffer_builder(client, array.buffers()[0], builder)
    if isinstance(array, pa.StringArray):
        # StringArray has 32-bit offsets; widen them to the 64-bit offsets
        # that the vineyard LargeStringArray layout expects.
        buffer = array.buffers()[1]
        length = len(buffer) // (pa.uint32().bit_width // 8)
        offset_array = pa.Array.from_buffers(pa.uint32(), length, [None, buffer])
        offset_array = offset_array.cast(pa.uint64())
        offset_buffer = offset_array.buffers()[1]
    else:  # is pa.LargeStringArray
        offset_buffer = array.buffers()[1]
    buffer_offsets = buffer_builder(client, offset_buffer, builder)
    buffer_data = buffer_builder(client, array.buffers()[2], builder)

    meta.add_member('buffer_offsets_', buffer_offsets)
    meta.add_member('buffer_data_', buffer_data)
    meta.add_member('null_bitmap_', null_bitmap)
    meta['nbytes'] = array.nbytes
    return client.create_metadata(meta)
Пример #23
0
def test_filter():
    """Gandiva filter should select exactly the rows where a < 1000."""
    import pyarrow.gandiva as gandiva

    df = pd.DataFrame({"a": [1.0 * i for i in range(10000)]})
    table = pa.Table.from_pandas(df)

    builder = gandiva.TreeExprBuilder()
    # Schema.field_by_name is deprecated/removed in modern pyarrow;
    # Schema.field is the supported accessor (as used elsewhere in this file).
    node_a = builder.make_field(table.schema.field("a"))
    thousand = builder.make_literal(1000.0, pa.float64())
    cond = builder.make_function("less_than", [node_a, thousand], pa.bool_())
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert result.to_array().equals(pa.array(range(1000), type=pa.uint32()))
Пример #24
0
def test_nested_ndarray_different_dtypes():
    """Mixed-dtype ndarray lists infer list<int64> and cast cleanly."""
    ragged = [
        np.array([1, 2, 3], dtype='int64'), None,
        np.array([4, 5, 6], dtype='uint32')
    ]

    inferred = pa.array(ragged)
    as_int64 = pa.array([[1, 2, 3], None, [4, 5, 6]],
                        type=pa.list_(pa.int64()))
    assert inferred.equals(as_int64)

    uint32_list = pa.list_(pa.uint32())
    converted = pa.array(ragged, type=uint32_list)
    assert converted.equals(as_int64.cast(uint32_list))
Пример #25
0
def test_nested_ndarray_different_dtypes():
    """int64 + uint32 sub-arrays: inference picks int64; explicit type casts."""
    arrays = [np.array([1, 2, 3], dtype='int64'),
              None,
              np.array([4, 5, 6], dtype='uint32')]

    result = pa.array(arrays)
    expected = pa.array([[1, 2, 3], None, [4, 5, 6]],
                        type=pa.list_(pa.int64()))
    assert result.equals(expected)

    target_type = pa.list_(pa.uint32())
    result2 = pa.array(arrays, type=target_type)
    assert result2.equals(expected.cast(target_type))
Пример #26
0
    def test2DSparseTensor(self):
        """A rank-3 SparseTensor (batch x 10 x 20) is built from value and
        index columns, with a non-int64 index column accepted."""
        tensor_representation = text_format.Parse(
            """
        sparse_tensor {
          value_column_name: "values"
          index_column_names: ["d0", "d1"]
          dense_shape {
            dim {
              size: 10
            }
            dim {
              size: 20
            }
          }
        }
        """, schema_pb2.TensorRepresentation())
        record_batch = pa.RecordBatch.from_arrays(
            [
                pa.array([[1], None, [2], [3, 4, 5], []],
                         type=pa.list_(pa.int64())),
                # Also test that the index column can be of an integral type other
                # than int64.
                pa.array([[9], None, [9], [7, 8, 9], []],
                         type=pa.list_(pa.uint32())),
                pa.array([[0], None, [0], [0, 1, 2], []],
                         type=pa.list_(pa.int64()))
            ],
            ["values", "d0", "d1"])
        adapter = tensor_adapter.TensorAdapter(
            tensor_adapter.TensorAdapterConfig(
                record_batch.schema, {"output": tensor_representation}))
        converted = adapter.ToBatchTensors(record_batch)
        self.assertLen(converted, 1)
        self.assertIn("output", converted)
        actual_output = converted["output"]
        self.assertIsInstance(
            actual_output, (tf.SparseTensor, tf.compat.v1.SparseTensorValue))
        # Expected indices are (row, d0, d1) triples; None rows contribute
        # no values.
        self.assertSparseAllEqual(
            tf.compat.v1.SparseTensorValue(dense_shape=[5, 10, 20],
                                           indices=[[0, 9, 0], [2, 9, 0],
                                                    [3, 7, 0], [3, 8, 1],
                                                    [3, 9, 2]],
                                           values=tf.convert_to_tensor(
                                               [1, 2, 3, 4, 5],
                                               dtype=tf.int64)), actual_output)

        self.assertAdapterCanProduceNonEagerInEagerMode(adapter, record_batch)
Пример #27
0
def test_is_integer():
    """is_integer / is_signed_integer / is_unsigned_integer classify correctly."""
    signed = [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
    unsigned = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]

    assert all(types.is_integer(t) for t in signed + unsigned)

    assert all(types.is_signed_integer(t) for t in signed)
    assert not any(types.is_unsigned_integer(t) for t in signed)

    assert all(types.is_unsigned_integer(t) for t in unsigned)
    assert not any(types.is_signed_integer(t) for t in unsigned)

    assert not types.is_integer(pa.float32())
    assert not types.is_signed_integer(pa.float32())
Пример #28
0
def test_is_integer():
    """Signed and unsigned int types satisfy the matching predicates only."""
    int_groups = {
        'signed': [pa.int8(), pa.int16(), pa.int32(), pa.int64()],
        'unsigned': [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()],
    }

    for group in int_groups.values():
        for t in group:
            assert types.is_integer(t)

    for t in int_groups['signed']:
        assert types.is_signed_integer(t) and not types.is_unsigned_integer(t)

    for t in int_groups['unsigned']:
        assert types.is_unsigned_integer(t) and not types.is_signed_integer(t)

    assert not types.is_integer(pa.float32())
    assert not types.is_signed_integer(pa.float32())
Пример #29
0
def _create_parquet_schema(dtypes):
    """Create parquet schema from Pandas dtypes

    Args:
        dtypes: A dict or Series of dtypes, keyed by variable name
    Returns:
        pyarrow.Schema

    Raises:
        AssertionError: if any dtype could not be mapped to an Arrow type.
    """
    import pyarrow as pa

    # (numpy scalar type, Arrow type factory) pairs, checked in order with
    # dtype equality (== works for numpy dtypes; dict hashing would not).
    _numeric_map = [
        (np.float16, pa.float16),
        (np.float32, pa.float32),
        (np.float64, pa.float64),
        (np.int8, pa.int8),
        (np.int16, pa.int16),
        (np.int32, pa.int32),
        (np.int64, pa.int64),
        (np.uint8, pa.uint8),
        (np.uint16, pa.uint16),
        (np.uint32, pa.uint32),
        (np.uint64, pa.uint64),
        (np.bool_, pa.bool_),
    ]

    dtypes = dict(dtypes)
    fields = []
    for varname, vartype in dtypes.items():
        arrow_type = None
        for np_type, pa_factory in _numeric_map:
            if vartype == np_type:
                arrow_type = pa_factory()
                break
        if arrow_type is None:
            # Strings / categoricals are stored as strings.
            if (vartype == object) | (vartype.name == 'category'):
                arrow_type = pa.string()
            elif np.issubdtype(vartype, np.datetime64):
                arrow_type = pa.timestamp('ns')
        if arrow_type is not None:
            fields.append(pa.field(varname, arrow_type))

    # Every input column must have been mapped to a field.
    assert len(dtypes) == len(fields)
    schema = pa.schema(fields)
    return schema
def test_filter():
    """Gandiva filter selects rows where a < 1000 and emits named kernels."""
    import pyarrow.gandiva as gandiva

    table = pa.Table.from_arrays([pa.array([1.0 * i for i in range(10000)])],
                                 ['a'])

    tree = gandiva.TreeExprBuilder()
    field_a = tree.make_field(table.schema.field("a"))
    limit = tree.make_literal(1000.0, pa.float64())
    predicate = tree.make_function("less_than", [field_a, limit], pa.bool_())
    condition = tree.make_condition(predicate)

    row_filter = gandiva.make_filter(table.schema, condition)
    # Gandiva generates compute kernel function named `@expr_X`
    assert row_filter.llvm_ir.find("@expr_") != -1

    selected = row_filter.evaluate(table.to_batches()[0],
                                   pa.default_memory_pool())
    assert selected.to_array().equals(pa.array(range(1000), type=pa.uint32()))
Пример #31
0
def coerce_arrow(array: pa.Array, rechunk: bool = True) -> pa.Array:
    """Coerce an Arrow array into a form suitable for polars ingestion.

    Decimal128 columns are cast to float64, and multi-chunk dictionary
    arrays with small integer keys are re-keyed to uint32 and combined.
    """
    # note: Decimal256 could not be cast to float
    if isinstance(array.type, pa.Decimal128Type):
        array = pa.compute.cast(array, pa.float64())

    # `num_chunks` only exists on ChunkedArray; default 0 skips the branch.
    if rechunk and getattr(array, "num_chunks", 0) > 1:
        # small integer keys can often not be combined, so let's already cast
        # to the uint32 used by polars
        if pa.types.is_dictionary(array.type):
            index_type = array.type.index_type
            has_small_keys = (pa.types.is_int8(index_type)
                              or pa.types.is_uint8(index_type)
                              or pa.types.is_int16(index_type)
                              or pa.types.is_uint16(index_type)
                              or pa.types.is_int32(index_type))
            if has_small_keys:
                target = pa.dictionary(pa.uint32(), pa.large_string())
                array = pa.compute.cast(array, target).combine_chunks()
    return array
Пример #32
0
def test_convert_uint8_uint16_uint32():
    # parquet only stores int32/int64 values natively. These are upcast to
    # be encoded.
    columns = {
        "u8": pyarrow.array([1, 138, None], type=pyarrow.uint8()),
        "u16": pyarrow.array([1, 38383, None], type=pyarrow.uint16()),
        "u32": pyarrow.array([1, 4294967291, None], type=pyarrow.uint32()),
    }
    expected_csv = "u8,u16,u32\r\n1,1,1\r\n138,38383,4294967291\r\n,,"
    expected_rows = [
        {"u8": 1, "u16": 1, "u32": 1},
        {"u8": 138, "u16": 38383, "u32": 4294967291},
        {"u8": None, "u16": None, "u32": None},
    ]
    _test_convert_via_arrow(pyarrow.table(columns), expected_csv,
                            expected_rows)
Пример #33
0
def write_case1_pyarrow(size=1, page_version=1):
    """Write the `case1` fixture to parquet with the given data-page version."""
    data, path = case1(size)

    schema = pa.schema([
        pa.field('int64', pa.int64()),
        pa.field('float64', pa.float64()),
        pa.field('string', pa.utf8()),
        pa.field('bool', pa.bool_()),
        pa.field('date', pa.timestamp('ms')),
        pa.field('uint32', pa.uint32()),
    ])

    base_path = f"{PYARROW_PATH}/v{page_version}"
    os.makedirs(base_path, exist_ok=True)

    table = pa.table(data, schema=schema)
    pa.parquet.write_table(table,
                           f"{base_path}/{path}",
                           data_page_version=f"{page_version}.0")
Пример #34
0
def test_validate_schema_write_table(tempdir):
    # ARROW-2926: writing a table whose schema does not match the writer's
    # declared schema must raise.
    simple_schema = pa.schema([
        pa.field('POS', pa.uint32()),
        pa.field('desc', pa.string()),
    ])

    # simple_table schema does not match simple_schema
    arrays = [pa.array([1]), pa.array(['bla'])]
    simple_table = pa.Table.from_arrays(arrays, ['POS', 'desc'])

    target = tempdir / 'simple_validate_schema.parquet'

    writer = pq.ParquetWriter(target, simple_schema,
                              version='2.0',
                              compression='snappy', flavor='spark')
    with writer as w:
        with pytest.raises(ValueError):
            w.write_table(simple_table)
Пример #35
0
    def test_unsigned_roundtrip(self, duckdb_cursor):
        """Round-trip max unsigned integer values through parquet and duckdb."""
        if not can_run:
            return
        parquet_filename = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'data',
            'unsigned.parquet')

        # one column per unsigned width, each ending at the type's maximum
        columns = [
            pyarrow.array([1, 2, 3, 4, 5, 255], type=pyarrow.uint8()),
            pyarrow.array([1, 2, 3, 4, 5, 65535], type=pyarrow.uint16()),
            pyarrow.array([1, 2, 3, 4, 5, 4294967295], type=pyarrow.uint32()),
            pyarrow.array([1, 2, 3, 4, 5, 18446744073709551615],
                          type=pyarrow.uint64()),
        ]
        tbl = pyarrow.Table.from_arrays(columns, ['a', 'b', 'c', 'd'])
        pyarrow.parquet.write_table(tbl, parquet_filename)

        cols = 'a, b, c, d'

        unsigned_parquet_table = pyarrow.parquet.read_table(parquet_filename)
        unsigned_parquet_table.validate(full=True)

        rel_from_arrow = duckdb.arrow(unsigned_parquet_table).project(
            cols).arrow()
        rel_from_arrow.validate(full=True)

        rel_from_duckdb = duckdb.from_parquet(parquet_filename).project(
            cols).arrow()
        rel_from_duckdb.validate(full=True)

        assert rel_from_arrow.equals(rel_from_duckdb, check_metadata=True)

        # second round-trip: duckdb query result -> arrow -> duckdb -> arrow
        con = duckdb.connect()
        con.execute(
            "select NULL c_null, (c % 4 = 0)::bool c_bool, (c%128)::tinyint c_tinyint, c::smallint*1000 c_smallint, c::integer*100000 c_integer, c::bigint*1000000000000 c_bigint, c::float c_float, c::double c_double, 'c_' || c::string c_string from (select case when range % 2 == 0 then range else null end as c from range(-10000, 10000)) sq"
        )
        arrow_result = con.fetch_arrow_table()
        arrow_result.validate(full=True)
        arrow_result.combine_chunks()
        arrow_result.validate(full=True)

        round_tripping = duckdb.from_arrow_table(arrow_result).to_arrow_table()
        round_tripping.validate(full=True)

        assert round_tripping.equals(arrow_result, check_metadata=True)
Пример #36
0
    def test_integer_no_nulls(self):
        """Round-trip every fixed-width integer dtype through pandas."""
        dtype_to_arrow = [('i1', A.int8()), ('i2', A.int16()),
                          ('i4', A.int32()), ('i8', A.int64()),
                          ('u1', A.uint8()), ('u2', A.uint16()),
                          ('u4', A.uint32()), ('u8', A.uint64())]
        num_values = 100

        data = {}
        fields = []
        for np_dtype, arrow_dtype in dtype_to_arrow:
            info = np.iinfo(np_dtype)
            # clamp the upper bound so randint stays within int64
            high = min(info.max, np.iinfo('i8').max)
            values = np.random.randint(info.min, high, size=num_values)
            data[np_dtype] = values.astype(np_dtype)
            fields.append(A.Field.from_py(np_dtype, arrow_dtype))

        df = pd.DataFrame(data)
        schema = A.Schema.from_fields(fields)
        self._check_pandas_roundtrip(df, expected_schema=schema)
Пример #37
0
    def test_integer_no_nulls(self):
        """Integer columns without nulls must survive a pandas round-trip."""
        pairs = [('i1', A.int8()), ('i2', A.int16()),
                 ('i4', A.int32()), ('i8', A.int64()),
                 ('u1', A.uint8()), ('u2', A.uint16()),
                 ('u4', A.uint32()), ('u8', A.uint64())]
        n = 100

        data, fields = {}, []
        for dtype, arrow_dtype in pairs:
            bounds = np.iinfo(dtype)
            # upper bound is clamped to int64's max for randint
            data[dtype] = np.random.randint(
                bounds.min, min(bounds.max, np.iinfo('i8').max),
                size=n).astype(dtype)
            fields.append(A.Field.from_py(dtype, arrow_dtype))

        schema = A.Schema.from_fields(fields)
        self._check_pandas_roundtrip(pd.DataFrame(data),
                                     expected_schema=schema)
Пример #38
0
def get_schemas():
    """Return the Arrow schemas for variants, annotations and genotypes."""
    variants = pa.schema([
        pa.field('vId', pa.int64()),
        pa.field('chrom', pa.string()),
        pa.field('pos', pa.int32()),
        pa.field('ref', pa.string()),
        pa.field('alt', pa.string()),
    ])

    annotations = pa.schema([
        pa.field('vId', pa.int64()),
        pa.field('geneSymbol', pa.string()),
    ])

    gts = pa.schema([
        pa.field('vId', pa.uint64()),
        pa.field('callsetId', pa.uint32()),
        pa.field('genotype', pa.uint8()),
    ])

    return {"variants": variants, "annotations": annotations, "gts": gts}
Пример #39
0
def test_dictionary_type():
    """Exercise construction and attributes of dictionary types."""
    plain = pa.dictionary(pa.int32(), pa.string())
    assert plain.index_type == pa.int32()
    assert plain.value_type == pa.string()
    assert plain.ordered is False

    ordered = pa.dictionary(pa.int8(), pa.float64(), ordered=True)
    assert ordered.index_type == pa.int8()
    assert ordered.value_type == pa.float64()
    assert ordered.ordered is True

    # construct from non-arrow objects
    from_strings = pa.dictionary('int8', 'string')
    assert from_strings.index_type == pa.int8()
    assert from_strings.value_type == pa.string()
    assert from_strings.ordered is False

    # invalid index type raises
    with pytest.raises(TypeError):
        pa.dictionary(pa.string(), pa.int64())
    with pytest.raises(TypeError):
        pa.dictionary(pa.uint32(), pa.string())
    def test_integer_no_nulls(self):
        """Round-trip integer columns, including (u)longlong aliases."""
        dtype_pairs = [
            ('i1', pa.int8()), ('i2', pa.int16()),
            ('i4', pa.int32()), ('i8', pa.int64()),
            ('u1', pa.uint8()), ('u2', pa.uint16()),
            ('u4', pa.uint32()), ('u8', pa.uint64()),
            ('longlong', pa.int64()), ('ulonglong', pa.uint64())
        ]
        num_values = 100

        data = OrderedDict()
        fields = []
        for dtype, arrow_dtype in dtype_pairs:
            info = np.iinfo(dtype)
            # clamp both bounds to the platform int range randint accepts
            low = max(info.min, np.iinfo(np.int_).min)
            high = min(info.max, np.iinfo(np.int_).max)
            data[dtype] = np.random.randint(low, high,
                                            size=num_values).astype(dtype)
            fields.append(pa.field(dtype, arrow_dtype))

        df = pd.DataFrame(data)
        schema = pa.schema(fields)
        self._check_pandas_roundtrip(df, expected_schema=schema)
Пример #41
0
def test_from_numpy_dtype():
    """Check `pa.from_numpy_dtype` over all supported numpy dtypes.

    Also verifies that dtype-convertible objects are accepted and that
    unsupported inputs raise the documented exceptions.
    """
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    # np.unicode was deprecated in NumPy 1.20 and removed in 1.24;
    # np.str_ is the supported spelling of the same scalar type.
    assert pa.from_numpy_dtype(np.str_) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    # object dtype has no unambiguous arrow mapping
    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')
Пример #42
0
def test_tensor_base_object():
    """`to_numpy()` must hold a reference to the backing Tensor."""
    tensor = pa.Tensor.from_numpy(np.random.randn(10, 4))
    baseline = sys.getrefcount(tensor)
    view = tensor.to_numpy()  # the ndarray keeps `tensor` alive
    assert sys.getrefcount(tensor) == baseline + 1


# One (numpy dtype string, arrow type) pair per fixed-width primitive type.
@pytest.mark.parametrize('dtype_str,arrow_type', [
    ('i1', pa.int8()),
    ('i2', pa.int16()),
    ('i4', pa.int32()),
    ('i8', pa.int64()),
    ('u1', pa.uint8()),
    ('u2', pa.uint16()),
    ('u4', pa.uint32()),
    ('u8', pa.uint64()),
    ('f2', pa.float16()),
    ('f4', pa.float32()),
    ('f8', pa.float64())
])
def test_tensor_numpy_roundtrip(dtype_str, arrow_type):
    """Tensor <-> ndarray round-trip must map the numpy dtype to arrow_type."""
    dtype = np.dtype(dtype_str)
    data = (100 * np.random.randn(10, 4)).astype(dtype)

    tensor = pa.Tensor.from_numpy(data)
    assert tensor.type == arrow_type

    # smoke-check that repr does not raise
    repr(tensor)

    # NOTE(review): snippet appears truncated here — the original test
    # presumably goes on to compare `result` against `data`; confirm upstream.
    result = tensor.to_numpy()
Пример #43
0
#
# The specifications were created using:
#
#   om = jpype.JClass('com.fasterxml.jackson.databind.ObjectMapper')()
#   field = …  # Code to instantiate the field
#   jvm_spec = om.writeValueAsString(field)
@pytest.mark.parametrize('pa_type,jvm_spec', [
    (pa.null(), '{"name":"null"}'),
    (pa.bool_(), '{"name":"bool"}'),
    (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'),
    (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'),
    (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'),
    (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'),
    (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'),
    (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'),
    (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'),
    (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'),
    (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'),
    (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'),
    (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'),
    (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'),
    (pa.time32('ms'), '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
    (pa.time64('us'), '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
    (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'),
    (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",'
        '"timezone":null}'),
    (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",'
        '"timezone":null}'),
    (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",'
        '"timezone":null}'),
    (pa.timestamp('ns'), '{"name":"timestamp","unit":"NANOSECOND",'
Пример #44
0
# Hypothesis strategies that generate pyarrow DataType instances,
# composed bottom-up into broader categories for property-based tests.
null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())

# Fixed-width integer types, split by signedness.
signed_integer_types = st.sampled_from([
    pa.int8(),
    pa.int16(),
    pa.int32(),
    pa.int64()
])
unsigned_integer_types = st.sampled_from([
    pa.uint8(),
    pa.uint16(),
    pa.uint32(),
    pa.uint64()
])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([
    pa.float16(),
    pa.float32(),
    pa.float64()
])
# NOTE(review): scale is drawn independently of precision, so scale may
# exceed precision — verify pa.decimal128 accepts such combinations.
decimal_type = st.builds(
    pa.decimal128,
    precision=st.integers(min_value=1, max_value=38),
    scale=st.integers(min_value=1, max_value=38)
)
numeric_types = st.one_of(integer_types, floating_types, decimal_type)
Пример #45
0
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())])
    ]

    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i


# Pair every exact primitive type with its dedicated type-check predicate.
@pytest.mark.parametrize('t,check_func', [
    (pa.date32(), types.is_date32), (pa.date64(), types.is_date64),
    (pa.time32('s'), types.is_time32), (pa.time64('ns'), types.is_time64),
    (pa.int8(), types.is_int8), (pa.int16(), types.is_int16),
    (pa.int32(), types.is_int32), (pa.int64(), types.is_int64),
    (pa.uint8(), types.is_uint8), (pa.uint16(), types.is_uint16),
    (pa.uint32(), types.is_uint32), (pa.uint64(), types.is_uint64),
    (pa.float16(), types.is_float16), (pa.float32(), types.is_float32),
    (pa.float64(), types.is_float64),
])
def test_exact_primitive_types(t, check_func):
    """Each exact-type predicate must accept its own type."""
    assert check_func(t)
Пример #46
0
import datetime
import decimal
import itertools
import numpy as np
import six
import pytz


# (numpy dtype, matching arrow type) for every fixed-width integer type.
int_type_pairs = [
    (np.int8, pa.int8()), (np.int16, pa.int16()),
    (np.int32, pa.int32()), (np.int64, pa.int64()),
    (np.uint8, pa.uint8()), (np.uint16, pa.uint16()),
    (np.uint32, pa.uint32()), (np.uint64, pa.uint64()),
]

# Just the numpy side of the pairs above, as a tuple.
np_int_types = tuple(pair[0] for pair in int_type_pairs)


class StrangeIterable:
    """A minimal iterable wrapper: supports iteration but nothing else."""

    def __init__(self, lst):
        # hold the underlying container; iteration delegates to it
        self.lst = lst

    def __iter__(self):
        return iter(self.lst)


def check_struct_type(ty, expected):
Пример #47
0
    result = pickle.loads(pickle.dumps(array))
    assert array.equals(result)


@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')