Exemplo n.º 1
0
def test_is_binary_string():
    """Check the binary, string and fixed-size-binary type predicates."""
    variable_binary = pa.binary()
    utf8 = pa.string()

    # Variable-length binary is binary; a string type is not.
    assert types.is_binary(variable_binary)
    assert not types.is_binary(utf8)

    # A string type satisfies both is_string and is_unicode; binary does not
    # satisfy is_string.
    assert types.is_string(utf8)
    assert types.is_unicode(utf8)
    assert not types.is_string(variable_binary)

    # Only a binary type built with an explicit width is fixed-size binary.
    fixed_binary = pa.binary(5)
    assert types.is_fixed_size_binary(fixed_binary)
    assert not types.is_fixed_size_binary(variable_binary)
def test_is_binary_string():
    """Verify each type predicate against the types it should accept/reject."""
    # (predicate, arrow type, whether the predicate is expected to hold)
    expectations = (
        (types.is_binary, pa.binary(), True),
        (types.is_binary, pa.string(), False),
        (types.is_string, pa.string(), True),
        (types.is_unicode, pa.string(), True),
        (types.is_string, pa.binary(), False),
        (types.is_fixed_size_binary, pa.binary(5), True),
        (types.is_fixed_size_binary, pa.binary(), False),
    )
    for predicate, arrow_type, should_match in expectations:
        if should_match:
            assert predicate(arrow_type)
        else:
            assert not predicate(arrow_type)
Exemplo n.º 3
0
def _numpy_and_codec_from_arrow_type(field_type):
    """Return the numpy type used to represent values of a pyarrow type.

    List types resolve (recursively) to the numpy type of their element type.

    :param field_type: the pyarrow data type to translate.
    :return: a numpy scalar type, or ``Decimal`` for decimal types.
    :raises ValueError: if ``field_type`` is none of the handled types.
    """
    from pyarrow import types

    if types.is_int8(field_type):
        np_type = np.int8
    elif types.is_int16(field_type):
        np_type = np.int16
    elif types.is_int32(field_type):
        np_type = np.int32
    elif types.is_int64(field_type):
        np_type = np.int64
    elif types.is_string(field_type):
        # np.str_ is the canonical name; the np.unicode_ alias was removed in
        # NumPy 2.0 (both names refer to the same type on NumPy 1.x).
        np_type = np.str_
    elif types.is_boolean(field_type):
        np_type = np.bool_
    elif types.is_float32(field_type):
        np_type = np.float32
    elif types.is_float64(field_type):
        np_type = np.float64
    elif types.is_decimal(field_type):
        np_type = Decimal
    elif types.is_binary(field_type) or types.is_fixed_size_binary(field_type):
        # np.bytes_ replaces the np.string_ alias removed in NumPy 2.0.
        np_type = np.bytes_
    elif types.is_date(field_type) or types.is_timestamp(field_type):
        np_type = np.datetime64
    elif types.is_list(field_type):
        # A list column is described by the numpy type of its elements.
        np_type = _numpy_and_codec_from_arrow_type(field_type.value_type)
    else:
        raise ValueError('Cannot auto-create unischema due to unsupported column type {}'.format(field_type))
    return np_type
def convertPyArrowTypeToGlueType(pyarrowType: pa.DataType) -> str:
    """Return the Glue type name corresponding to a pyarrow data type.

    Types without an explicit mapping fall back to ``str(pyarrowType)``.
    Decimal types always map to ``'decimal(16,2)'`` regardless of the
    precision and scale carried by ``pyarrowType``.

    :param pyarrowType: the pyarrow data type to translate.
    :return: the Glue type name as a string.
    """
    if (types.is_string(pyarrowType) or types.is_unicode(pyarrowType)
            or types.is_large_string(pyarrowType)
            or types.is_large_unicode(pyarrowType)):
        return 'string'
    if types.is_int64(pyarrowType) or types.is_uint64(pyarrowType):
        return 'bigint'
    if types.is_binary(pyarrowType):
        return 'binary'
    if types.is_boolean(pyarrowType):
        return 'boolean'
    if (types.is_date(pyarrowType) or types.is_date32(pyarrowType)
            or types.is_date64(pyarrowType)):
        return 'date'
    if types.is_decimal(pyarrowType):
        return 'decimal(16,2)'
    if types.is_float64(pyarrowType):
        # Previously this branch held the bare string 'return double', which
        # is a no-op expression; the mapping now returns explicitly instead of
        # relying on the str() fallback at the end of the function.
        return 'double'
    if types.is_float16(pyarrowType) or types.is_float32(pyarrowType):
        return 'float'
    if (types.is_int16(pyarrowType) or types.is_int32(pyarrowType)
            or types.is_uint16(pyarrowType) or types.is_uint32(pyarrowType)):
        return 'int'
    if types.is_map(pyarrowType):
        return 'map'
    if types.is_struct(pyarrowType):
        return 'struct'
    if types.is_timestamp(pyarrowType):
        return 'timestamp'
    if types.is_union(pyarrowType):
        return 'union'
    return str(pyarrowType)
Exemplo n.º 5
0
def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type.

    Lists and maps containing timestamps, structs nested directly inside a
    struct, and any type without a mapping are rejected with ``TypeError``.
    Map types additionally require pyarrow 2.0.0 or newer. Dictionary types
    are converted as their value type.

    :param at: the pyarrow data type to convert.
    :return: the corresponding Spark data type instance.
    :raises TypeError: if the type (or a nested type) is unsupported.
    """
    import pyarrow as pa
    import pyarrow.types as types
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        # Only the major version matters for the 2.0.0 floor, so parse it
        # directly: distutils (and its LooseVersion) was removed in
        # Python 3.12, and importing it would break every call.
        pyarrow_major = int(pa.__version__.split(".", 1)[0])
        if pyarrow_major < 2:
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType(
            [StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
             for field in at])
    elif types.is_dictionary(at):
        # Dictionary-encoded data is exposed as its decoded value type.
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
Exemplo n.º 6
0
def _numpy_and_codec_from_arrow_type(field_type):
    """Derive the codec and numpy type for a pyarrow data type.

    :param field_type: the pyarrow data type to translate.
    :return: a ``(codec, np_type)`` tuple. For list types ``codec`` is
        ``None`` and ``np_type`` is the numpy type of the list's value type.
    :raises ValueError: if ``field_type`` is none of the handled types.
    """
    from pyarrow import types

    if types.is_int8(field_type):
        np_type = np.int8
        codec = ScalarCodec(ByteType())
    elif types.is_int16(field_type):
        np_type = np.int16
        codec = ScalarCodec(ShortType())
    elif types.is_int32(field_type):
        np_type = np.int32
        codec = ScalarCodec(IntegerType())
    elif types.is_int64(field_type):
        np_type = np.int64
        codec = ScalarCodec(LongType())
    elif types.is_string(field_type):
        # np.str_ is the canonical name; the np.unicode_ alias was removed in
        # NumPy 2.0 (both names refer to the same type on NumPy 1.x).
        np_type = np.str_
        codec = ScalarCodec(StringType())
    elif types.is_boolean(field_type):
        np_type = np.bool_
        codec = ScalarCodec(BooleanType())
    elif types.is_float32(field_type):
        np_type = np.float32
        codec = ScalarCodec(FloatType())
    elif types.is_float64(field_type):
        np_type = np.float64
        codec = ScalarCodec(DoubleType())
    elif types.is_decimal(field_type):
        np_type = Decimal
        codec = ScalarCodec(DecimalType(field_type.precision,
                                        field_type.scale))
    elif types.is_binary(field_type) or types.is_fixed_size_binary(field_type):
        # np.bytes_ replaces the np.string_ alias removed in NumPy 2.0.
        codec = ScalarCodec(StringType())
        np_type = np.bytes_
    elif types.is_date(field_type):
        np_type = np.datetime64
        codec = ScalarCodec(DateType())
    elif types.is_timestamp(field_type):
        np_type = np.datetime64
        codec = ScalarCodec(TimestampType())
    elif types.is_list(field_type):
        # Only the element numpy type is kept for lists; no codec is assigned.
        _, np_type = _numpy_and_codec_from_arrow_type(field_type.value_type)
        codec = None
    else:
        raise ValueError(
            'Cannot auto-create unischema due to unsupported column type {}'.
            format(field_type))
    return codec, np_type
Exemplo n.º 7
0
def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type.

    Raises ``TypeError`` for lists of timestamps, structs that directly
    contain a struct field, and any type without a mapping. Dictionary types
    are converted as their value type.
    """
    import pyarrow.types as types

    # Ordered (predicate, factory) pairs for the non-nested types; the first
    # predicate that matches decides the result.
    scalar_factories = (
        (types.is_boolean, BooleanType),
        (types.is_int8, ByteType),
        (types.is_int16, ShortType),
        (types.is_int32, IntegerType),
        (types.is_int64, LongType),
        (types.is_float32, FloatType),
        (types.is_float64, DoubleType),
        (types.is_decimal,
         lambda: DecimalType(precision=at.precision, scale=at.scale)),
        (types.is_string, StringType),
        (types.is_binary, BinaryType),
        (types.is_date32, DateType),
        (types.is_timestamp, TimestampType),
    )
    for matches, make_spark_type in scalar_factories:
        if matches(at):
            return make_spark_type()

    if types.is_list(at):
        element_type = at.value_type
        if types.is_timestamp(element_type):
            raise TypeError("Unsupported type in conversion from Arrow: " +
                            str(at))
        return ArrayType(from_arrow_type(element_type))

    if types.is_struct(at):
        for member in at:
            if types.is_struct(member.type):
                raise TypeError(
                    "Nested StructType not supported in conversion from Arrow: " +
                    str(at))
        struct_fields = []
        for member in at:
            struct_fields.append(
                StructField(member.name,
                            from_arrow_type(member.type),
                            nullable=member.nullable))
        return StructType(struct_fields)

    if types.is_dictionary(at):
        # Dictionary-encoded data is exposed as its decoded value type.
        return from_arrow_type(at.value_type)

    raise TypeError("Unsupported type in conversion from Arrow: " +
                    str(at))
Exemplo n.º 8
0
    def _cast(self, val: Any, dtype: pa.DataType) -> Any:
        """Coerce ``val`` so that it matches the arrow ``dtype``.

        Any falsy value (``None``, ``0``, ``""``, empty containers, ...) is
        returned as ``None``. String dtypes get ``str(val)``; floating and
        temporal dtypes are delegated to ``_cast_float`` / ``_cast_temporal``;
        every other dtype returns ``val`` unchanged.
        """
        if not val:
            return None

        if is_string(dtype):
            return str(val)
        if is_floating(dtype):
            return self._cast_float(val, dtype)
        if is_temporal(dtype):
            return self._cast_temporal(val, dtype)
        # No conversion is defined for the remaining dtypes.
        return val
Exemplo n.º 9
0
 def _getType(value):
     """Pick the Spark data type and Python type matching ``value``.

     Returns a ``(dtype, ptype)`` pair. Values that match none of the checks
     are reported via ``print`` and treated as double/float.
     """
     if patypes.is_float_value(value):
         return DoubleType(), float  # VectorDataType.DOUBLE
     if patypes.is_integer_value(value):
         return IntegerType(), int  # VectorDataType.INTEGER
     # NOTE(review): the two checks above are `*_value` predicates, while
     # `is_string` is not - confirm it accepts a plain value rather than a
     # data type object.
     if patypes.is_string(value):
         return StringType(), str  # VectorDataType.STRING
     # maybe not the best default choice, but...
     print("Unrecognized datatype {}, attempting to use Double".format(
         type(value)))
     return DoubleType(), float  # VectorDataType.DOUBLE
Exemplo n.º 10
0
 def _getVectorLengthAndType(self, name, row):
     """Inspect the vector stored under ``name`` in ``row``.

     Returns a ``(num_elements, dtype)`` pair where ``num_elements`` is the
     length of the vector and ``dtype`` is the ``VectorDataType`` chosen from
     its first element. If the lookup fails, the value is not a
     ``DenseVector``/``SparseVector``, or inspection raises, the error is
     printed (where applicable) and the defaults ``(0, VectorDataType.DOUBLE)``
     - or the length gathered so far - are returned.
     """
     # Defaults for the "skipped" case. Without initialising num_elements the
     # final return raised UnboundLocalError whenever the try block did not
     # reach the length assignment.
     num_elements = 0
     dtype = VectorDataType.DOUBLE
     try:
         v0 = row[name]
         if isinstance(v0, (DenseVector, SparseVector)):
             num_elements = len(v0)
             # The first element is taken as representative of the vector.
             value = v0[0]
             if patypes.is_float_value(value):
                 dtype = VectorDataType.DOUBLE  # DoubleType()
             elif patypes.is_integer_value(value):
                 dtype = VectorDataType.INTEGER  # IntegerType()
             elif patypes.is_string(value):
                 dtype = VectorDataType.STRING  # StringType()
             else:
                 # maybe not the best default choice, but...
                 print("Unrecognized datatype {}, attempting to use Double".
                       format(type(value)))
                 dtype = VectorDataType.DOUBLE  # DoubleType()
     except Exception as exc:
         # Best effort: report the problem and fall back to the defaults.
         print("Skipping VectorUDT `{}` due to error:\n{}".format(
             name, exc))
     return num_elements, dtype
Exemplo n.º 11
0
def to_legate_dtype(dtype):
    """Translate a dtype-like object into the corresponding Legate dtype.

    Strings and numpy dtypes are looked up in ``_DTYPE_MAPPING`` (numpy dtypes
    by their ``name``). pyarrow string types map to ``string``; other pyarrow
    types are converted through ``to_pandas_dtype()``. Anything else is tried
    as a pandas bool/string dtype and finally coerced with ``np.dtype``.

    :param dtype: a dtype name, numpy dtype, pyarrow data type, or any object
        accepted by the pandas dtype checks or ``np.dtype``.
    :return: the matching Legate dtype.
    :raises ValueError: if a string or numpy dtype is not in the mapping.
    :raises TypeError: if ``dtype`` cannot be interpreted as a numpy dtype.
    """
    if isinstance(dtype, str):
        if dtype not in _DTYPE_MAPPING:
            raise ValueError(f"invalid dtype {dtype}")
        return _DTYPE_MAPPING[dtype]
    if isinstance(dtype, np.dtype):
        if dtype.name not in _DTYPE_MAPPING:
            raise ValueError(f"unsupported dtype {dtype}")
        return _DTYPE_MAPPING[dtype.name]
    if isinstance(dtype, pa.DataType):
        if pyarrow_dtype.is_string(dtype):
            return string
        return to_legate_dtype(dtype.to_pandas_dtype())
    if pandas_dtype.is_bool_dtype(dtype):
        return bool
    if pandas_dtype.is_string_dtype(dtype):
        return string

    # Last resort: let numpy interpret the object. Only the coercion itself is
    # guarded so that errors from the recursive lookup are not masked.
    try:
        numpy_dtype = np.dtype(dtype)
    except TypeError as exc:
        raise TypeError("Unsupported dtype: %s " % str(dtype)) from exc
    return to_legate_dtype(numpy_dtype)
0
def is_possible_feature(arrow_type: DataType) -> bool:
    """Tell whether a data type could serve as an ML feature.

    A type qualifies when it satisfies ``is_boolean``, ``is_string`` or
    ``is_num``; the checks are tried in that order and stop at the first hit.
    """
    outcome = is_boolean(arrow_type)
    if not outcome:
        outcome = is_string(arrow_type)
    if not outcome:
        outcome = is_num(arrow_type)
    return outcome
Exemplo n.º 13
0
def is_possible_cat(arrow_type: DataType, /) -> bool:
    """Tell whether a data type could be categorical.

    Boolean and string types are checked first; anything else is decided by
    ``is_num``.
    """
    for predicate in (is_boolean, is_string):
        verdict = predicate(arrow_type)
        if verdict:
            return verdict
    return is_num(arrow_type)
Exemplo n.º 14
0
def is_possible_cat(arrow_type):
    """Return a truthy value when the type is boolean, string or numeric.

    The checks run in that order and later ones are skipped after a match.
    """
    boolean_hit = is_boolean(arrow_type)
    if boolean_hit:
        return boolean_hit
    string_hit = is_string(arrow_type)
    if string_hit:
        return string_hit
    return is_num(arrow_type)