Пример #1
0
def _numpy_and_codec_from_arrow_type(field_type):
    from pyarrow import types

    if types.is_int8(field_type):
        np_type = np.int8
    elif types.is_int16(field_type):
        np_type = np.int16
    elif types.is_int32(field_type):
        np_type = np.int32
    elif types.is_int64(field_type):
        np_type = np.int64
    elif types.is_string(field_type):
        np_type = np.unicode_
    elif types.is_boolean(field_type):
        np_type = np.bool_
    elif types.is_float32(field_type):
        np_type = np.float32
    elif types.is_float64(field_type):
        np_type = np.float64
    elif types.is_decimal(field_type):
        np_type = Decimal
    elif types.is_binary(field_type):
        np_type = np.string_
    elif types.is_fixed_size_binary(field_type):
        np_type = np.string_
    elif types.is_date(field_type):
        np_type = np.datetime64
    elif types.is_timestamp(field_type):
        np_type = np.datetime64
    elif types.is_list(field_type):
        np_type = _numpy_and_codec_from_arrow_type(field_type.value_type)
    else:
        raise ValueError('Cannot auto-create unischema due to unsupported column type {}'.format(field_type))
    return np_type
def convertPyArrowTypeToGlueType(pyarrowType: pa.DataType) -> str:
    if (types.is_string(pyarrowType) or types.is_unicode(pyarrowType)
            or types.is_large_string(pyarrowType)
            or types.is_large_unicode(pyarrowType)):
        return 'string'
    if (types.is_int64(pyarrowType) or types.is_uint64(pyarrowType)):
        return 'bigint'
    if (types.is_binary(pyarrowType)):
        return 'binary'
    if (types.is_boolean(pyarrowType)):
        return 'boolean'
    if (types.is_date(pyarrowType) or types.is_date32(pyarrowType)
            or types.is_date64(pyarrowType)):
        return 'date'
    if (types.is_decimal(pyarrowType)):
        return 'decimal(16,2)'
    if (types.is_float64(pyarrowType)):
        'return double'
    if (types.is_float16(pyarrowType) or types.is_float32(pyarrowType)):
        return 'float'
    if (types.is_int16(pyarrowType) or types.is_int32(pyarrowType)
            or types.is_uint16(pyarrowType) or types.is_uint32(pyarrowType)):
        return 'int'
    if (types.is_map(pyarrowType)):
        return 'map'
    if (types.is_struct(pyarrowType)):
        return 'struct'
    if (types.is_timestamp(pyarrowType)):
        return 'timestamp'
    if (types.is_union(pyarrowType)):
        return 'union'
    return str(pyarrowType)
Пример #3
0
def _numpy_and_codec_from_arrow_type(field_type):
    from pyarrow import types

    if types.is_int8(field_type):
        np_type = np.int8
        codec = ScalarCodec(ByteType())
    elif types.is_int16(field_type):
        np_type = np.int16
        codec = ScalarCodec(ShortType())
    elif types.is_int32(field_type):
        np_type = np.int32
        codec = ScalarCodec(IntegerType())
    elif types.is_int64(field_type):
        np_type = np.int64
        codec = ScalarCodec(LongType())
    elif types.is_string(field_type):
        np_type = np.unicode_
        codec = ScalarCodec(StringType())
    elif types.is_boolean(field_type):
        np_type = np.bool_
        codec = ScalarCodec(BooleanType())
    elif types.is_float32(field_type):
        np_type = np.float32
        codec = ScalarCodec(FloatType())
    elif types.is_float64(field_type):
        np_type = np.float64
        codec = ScalarCodec(DoubleType())
    elif types.is_decimal(field_type):
        np_type = Decimal
        codec = ScalarCodec(DecimalType(field_type.precision,
                                        field_type.scale))
    elif types.is_binary(field_type):
        codec = ScalarCodec(StringType())
        np_type = np.string_
    elif types.is_fixed_size_binary(field_type):
        codec = ScalarCodec(StringType())
        np_type = np.string_
    elif types.is_date(field_type):
        np_type = np.datetime64
        codec = ScalarCodec(DateType())
    elif types.is_timestamp(field_type):
        np_type = np.datetime64
        codec = ScalarCodec(TimestampType())
    elif types.is_list(field_type):
        _, np_type = _numpy_and_codec_from_arrow_type(field_type.value_type)
        codec = None
    else:
        raise ValueError(
            'Cannot auto-create unischema due to unsupported column type {}'.
            format(field_type))
    return codec, np_type
Пример #4
0
def test_is_temporal_date_time_timestamp():
    date_types = [pa.date32(), pa.date64()]
    time_types = [pa.time32('s'), pa.time64('ns')]
    timestamp_types = [pa.timestamp('ms')]

    for case in date_types + time_types + timestamp_types:
        assert types.is_temporal(case)

    for case in date_types:
        assert types.is_date(case)
        assert not types.is_time(case)
        assert not types.is_timestamp(case)

    for case in time_types:
        assert types.is_time(case)
        assert not types.is_date(case)
        assert not types.is_timestamp(case)

    for case in timestamp_types:
        assert types.is_timestamp(case)
        assert not types.is_date(case)
        assert not types.is_time(case)

    assert not types.is_temporal(pa.int32())