def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type. """
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType(
            [StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
             for field in at])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
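# Hedged usage sketch for from_arrow_type above: a minimal illustration, assuming a
# PySpark module where the Spark SQL type classes (IntegerType, ArrayType, StringType,
# StructType, ...) come from pyspark.sql.types. The example values are illustrative,
# not taken from the source.
import pyarrow as pa

assert isinstance(from_arrow_type(pa.int32()), IntegerType)
assert isinstance(from_arrow_type(pa.list_(pa.string())), ArrayType)
assert isinstance(from_arrow_type(pa.struct([pa.field("x", pa.int64())])), StructType)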
def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type. """
    import pyarrow.types as types
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError(
                "Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType([
            StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
            for field in at
        ])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
def _cast_temporal(val: Union[str, int], dtype: pa.DataType) -> Union[date, datetime]:
    if is_date32(dtype):  # and type(val) == str:
        casted = datetime.strptime(val, "%Y-%m-%d").date()
    elif is_timestamp(dtype):
        if type(val) == str:
            casted = datetime.strptime(val, "%Y-%m-%dT%H:%M:%S.%f%z")
        elif type(val) == int:
            # check if it's UTC
            assert len(str(val)) == 13, "Unrecognized timestamp format"
            tz_str = str(val)[-3:]
            utc_tz_str = "000"
            if tz_str == utc_tz_str:
                casted = datetime.fromtimestamp(val / 1000)
            else:
                # should convert to UTC, but hopefully we don't have to bother
                raise NotImplementedError(
                    "Casting non-UTC timestamps is not yet supported.")
        else:
            raise ValueError(
                "A serialized date must be a string or integer")
    else:
        raise NotImplementedError(
            "Currently, only casting to date32 and timestamp is supported")
    return casted
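# Hedged usage sketch for _cast_temporal above (illustrative values; assumes
# `import pyarrow as pa`, `from datetime import date, datetime`, and the pyarrow
# predicates imported by name, e.g. `from pyarrow.types import is_date32, is_timestamp`):
assert _cast_temporal("2021-06-01", pa.date32()) == date(2021, 6, 1)
assert isinstance(_cast_temporal("2021-06-01T12:00:00.000000+0000", pa.timestamp("ms")), datetime)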
def _numpy_and_codec_from_arrow_type(field_type):
    from pyarrow import types
    if types.is_int8(field_type):
        np_type = np.int8
    elif types.is_int16(field_type):
        np_type = np.int16
    elif types.is_int32(field_type):
        np_type = np.int32
    elif types.is_int64(field_type):
        np_type = np.int64
    elif types.is_string(field_type):
        np_type = np.unicode_
    elif types.is_boolean(field_type):
        np_type = np.bool_
    elif types.is_float32(field_type):
        np_type = np.float32
    elif types.is_float64(field_type):
        np_type = np.float64
    elif types.is_decimal(field_type):
        np_type = Decimal
    elif types.is_binary(field_type):
        np_type = np.string_
    elif types.is_fixed_size_binary(field_type):
        np_type = np.string_
    elif types.is_date(field_type):
        np_type = np.datetime64
    elif types.is_timestamp(field_type):
        np_type = np.datetime64
    elif types.is_list(field_type):
        np_type = _numpy_and_codec_from_arrow_type(field_type.value_type)
    else:
        raise ValueError('Cannot auto-create unischema due to unsupported column type {}'.format(field_type))
    return np_type
def convertPyArrowTypeToGlueType(pyarrowType: pa.DataType) -> str:
    if (types.is_string(pyarrowType) or types.is_unicode(pyarrowType)
            or types.is_large_string(pyarrowType) or types.is_large_unicode(pyarrowType)):
        return 'string'
    if (types.is_int64(pyarrowType) or types.is_uint64(pyarrowType)):
        return 'bigint'
    if (types.is_binary(pyarrowType)):
        return 'binary'
    if (types.is_boolean(pyarrowType)):
        return 'boolean'
    if (types.is_date(pyarrowType) or types.is_date32(pyarrowType) or types.is_date64(pyarrowType)):
        return 'date'
    if (types.is_decimal(pyarrowType)):
        return 'decimal(16,2)'
    if (types.is_float64(pyarrowType)):
        return 'double'
    if (types.is_float16(pyarrowType) or types.is_float32(pyarrowType)):
        return 'float'
    if (types.is_int16(pyarrowType) or types.is_int32(pyarrowType)
            or types.is_uint16(pyarrowType) or types.is_uint32(pyarrowType)):
        return 'int'
    if (types.is_map(pyarrowType)):
        return 'map'
    if (types.is_struct(pyarrowType)):
        return 'struct'
    if (types.is_timestamp(pyarrowType)):
        return 'timestamp'
    if (types.is_union(pyarrowType)):
        return 'union'
    return str(pyarrowType)
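# Hedged usage sketch for convertPyArrowTypeToGlueType above (assumes
# `import pyarrow as pa` and `from pyarrow import types` at module level;
# values are illustrative):
assert convertPyArrowTypeToGlueType(pa.int64()) == 'bigint'
assert convertPyArrowTypeToGlueType(pa.float64()) == 'double'
assert convertPyArrowTypeToGlueType(pa.timestamp('ms')) == 'timestamp'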
def _check_series_roundtrip(s, type_=None):
    arr = pa.array(s, from_pandas=True, type=type_)
    result = pd.Series(arr.to_pandas(), name=s.name)
    if patypes.is_timestamp(arr.type) and arr.type.tz is not None:
        result = (result.dt.tz_localize('utc').dt.tz_convert(arr.type.tz))
    tm.assert_series_equal(s, result)
def _check_series_roundtrip(self, s, type_=None):
    arr = pa.array(s, from_pandas=True, type=type_)
    result = pd.Series(arr.to_pandas(), name=s.name)
    if patypes.is_timestamp(arr.type) and arr.type.tz is not None:
        result = (result.dt.tz_localize('utc')
                  .dt.tz_convert(arr.type.tz))
    tm.assert_series_equal(s, result)
def _numpy_and_codec_from_arrow_type(field_type):
    from pyarrow import types
    if types.is_int8(field_type):
        np_type = np.int8
        codec = ScalarCodec(ByteType())
    elif types.is_int16(field_type):
        np_type = np.int16
        codec = ScalarCodec(ShortType())
    elif types.is_int32(field_type):
        np_type = np.int32
        codec = ScalarCodec(IntegerType())
    elif types.is_int64(field_type):
        np_type = np.int64
        codec = ScalarCodec(LongType())
    elif types.is_string(field_type):
        np_type = np.unicode_
        codec = ScalarCodec(StringType())
    elif types.is_boolean(field_type):
        np_type = np.bool_
        codec = ScalarCodec(BooleanType())
    elif types.is_float32(field_type):
        np_type = np.float32
        codec = ScalarCodec(FloatType())
    elif types.is_float64(field_type):
        np_type = np.float64
        codec = ScalarCodec(DoubleType())
    elif types.is_decimal(field_type):
        np_type = Decimal
        codec = ScalarCodec(DecimalType(field_type.precision, field_type.scale))
    elif types.is_binary(field_type):
        codec = ScalarCodec(StringType())
        np_type = np.string_
    elif types.is_fixed_size_binary(field_type):
        codec = ScalarCodec(StringType())
        np_type = np.string_
    elif types.is_date(field_type):
        np_type = np.datetime64
        codec = ScalarCodec(DateType())
    elif types.is_timestamp(field_type):
        np_type = np.datetime64
        codec = ScalarCodec(TimestampType())
    elif types.is_list(field_type):
        _, np_type = _numpy_and_codec_from_arrow_type(field_type.value_type)
        codec = None
    else:
        raise ValueError(
            'Cannot auto-create unischema due to unsupported column type {}'.format(field_type))
    return codec, np_type
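# Hedged usage sketch for the codec-returning helper above (assumes petastorm's
# ScalarCodec and the Spark SQL type classes are importable at module level, plus
# `import pyarrow as pa`; the input type is illustrative):
codec, np_type = _numpy_and_codec_from_arrow_type(pa.float32())
# codec wraps FloatType() in a ScalarCodec; np_type is np.float32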
def test_is_temporal_date_time_timestamp():
    date_types = [pa.date32(), pa.date64()]
    time_types = [pa.time32('s'), pa.time64('ns')]
    timestamp_types = [pa.timestamp('ms')]

    for case in date_types + time_types + timestamp_types:
        assert types.is_temporal(case)

    for case in date_types:
        assert types.is_date(case)
        assert not types.is_time(case)
        assert not types.is_timestamp(case)

    for case in time_types:
        assert types.is_time(case)
        assert not types.is_date(case)
        assert not types.is_timestamp(case)

    for case in timestamp_types:
        assert types.is_timestamp(case)
        assert not types.is_date(case)
        assert not types.is_time(case)

    assert not types.is_temporal(pa.int32())
def _from_arrow_type(dt: pa.DataType) -> pt.DataType:
    if is_struct(dt):
        return pt.StructType([
            pt.StructField(
                # field.name, _from_arrow_type(field.type), nullable=field.nullable
                field.name, _from_arrow_type(field.type), nullable=True,
            )
            for field in dt
        ])
    elif is_list(dt):
        if is_timestamp(dt.value_type):
            raise TypeError(  # pragma: no cover
                "Spark: unsupported type in conversion from Arrow: " + str(dt))
        return pt.ArrayType(_from_arrow_type(dt.value_type))
    return from_arrow_type(dt)
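# Hedged usage sketch for _from_arrow_type above (assumes `import pyarrow as pa`,
# `import pyspark.sql.types as pt`, the pyarrow predicates imported by name, and
# PySpark's from_arrow_type for the scalar fallback; the struct layout is illustrative):
nested = pa.struct([pa.field("a", pa.int32()), pa.field("tags", pa.list_(pa.string()))])
spark_struct = _from_arrow_type(nested)
# spark_struct is a pt.StructType whose fields are all forced to nullable=True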