def test_column_rename(primitive_type_test_file):
    """Read with an expected schema whose field 3 is named "string_col"; the
    returned table carries the expected-schema names."""
    read_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get())
    ])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, read_schema, {}, Expressions.always_true(), True)

    arrow_schema = pa.schema([
        pa.field("int_col", pa.int32(), False),
        pa.field("bigint_col", pa.int64(), True),
        pa.field("string_col", pa.string(), True),
        pa.field("float_col", pa.float32(), True),
        pa.field("dbl_col", pa.float64(), True)
    ])
    arrow_columns = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64())
    ]
    expected_table = pa.Table.from_arrays(arrow_columns, schema=arrow_schema)

    assert expected_table == reader.read()
def test_from_bytes(self):
    """Decode little-endian single-value buffers for int, long and double."""
    exact_cases = [
        (1234, IntegerType.get(), b'\xd2\x04\x00\x00'),
        (1234, LongType.get(), b'\xd2\x04\x00\x00\x00\x00\x00\x00'),
    ]
    for expected, iceberg_type, buf in exact_cases:
        self.assertEqual(expected, Conversions.from_byte_buffer(iceberg_type, buf))
    # doubles compare approximately to tolerate floating-point representation
    self.assertAlmostEqual(
        1.2345,
        Conversions.from_byte_buffer(DoubleType.get(), b'\x8d\x97\x6e\x12\x83\xc0\xf3\x3f'))
def test_to_bytes(self):
    """Serialize literals to their single-value byte-buffer representation.

    Fixed: booleans are a single byte (0x00 for false, 0x01 for true), not two
    bytes — this matches the boolean buffers asserted elsewhere in this suite
    (test_byte_buffer_conversions) and the Iceberg single-value serialization.
    """
    self.assertEqual(b'\x00', Literal.of(False).to_byte_buffer())
    self.assertEqual(b'\x01', Literal.of(True).to_byte_buffer())
    # ints are 4 bytes little-endian, longs 8 bytes little-endian
    self.assertEqual(b'\xd2\x04\x00\x00', Literal.of(1234).to_byte_buffer())
    self.assertEqual(b'\xd2\x04\x00\x00\x00\x00\x00\x00',
                     Literal.of(1234).to(LongType.get()).to_byte_buffer())
    # floats/doubles are IEEE-754 little-endian, 4/8 bytes
    self.assertEqual(b'\x19\x04\x9e?', Literal.of(1.2345).to_byte_buffer())
    self.assertEqual(
        b'\x8d\x97\x6e\x12\x83\xc0\xf3\x3f',
        Literal.of(1.2345).to(DoubleType.get()).to_byte_buffer())
    # dates are a 4-byte little-endian int
    self.assertEqual(b'\xd2\x04\x00\x00',
                     Literal.of(1234).to(DateType.get()).to_byte_buffer())
    # time/timestamps are an 8-byte little-endian long (microseconds)
    self.assertEqual(
        b'\x00\xe8vH\x17\x00\x00\x00',
        Literal.of(100000000000).to(TimeType.get()).to_byte_buffer())
    self.assertEqual(
        b'\x00\xe8vH\x17\x00\x00\x00',
        Literal.of(100000000000).to(
            TimestampType.with_timezone()).to_byte_buffer())
    self.assertEqual(
        b'\x00\xe8vH\x17\x00\x00\x00',
        Literal.of(100000000000).to(
            TimestampType.without_timezone()).to_byte_buffer())
    # strings are UTF-8 bytes without a length prefix
    self.assertEqual(b'foo', Literal.of("foo").to_byte_buffer())
    # uuids are 16 big-endian bytes
    self.assertEqual(
        b'\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7',
        Literal.of(uuid.UUID(
            "f79c3e09-677c-4bbd-a479-3f349cb785e7")).to_byte_buffer())
    # fixed/binary values are stored directly
    self.assertEqual(b'foo', Literal.of(bytes(b'foo')).to_byte_buffer())
    self.assertEqual(b'foo', Literal.of(bytearray(b'foo')).to_byte_buffer())
def test_from_bytes(self):
    """Deserialize single-value byte buffers back into python values for every primitive type."""
    # booleans: a single byte, 0x00 false / 0x01 true
    self.assertEqual(
        False,
        Conversions.from_byte_buffer(BooleanType.get(), b'\x00'))
    self.assertEqual(
        True,
        Conversions.from_byte_buffer(BooleanType.get(), b'\x01'))
    # ints are 4 bytes little-endian, longs 8 bytes little-endian
    self.assertEqual(
        1234,
        Conversions.from_byte_buffer(IntegerType.get(), b'\xd2\x04\x00\x00'))
    self.assertEqual(
        1234,
        Conversions.from_byte_buffer(LongType.get(), b'\xd2\x04\x00\x00\x00\x00\x00\x00'))
    # floats lose precision through 32-bit IEEE-754, so compare to 5 places only
    self.assertAlmostEqual(1.2345,
                           Conversions.from_byte_buffer(FloatType.get(), b'\x19\x04\x9e?'),
                           places=5)
    self.assertAlmostEqual(
        1.2345,
        Conversions.from_byte_buffer(DoubleType.get(), b'\x8d\x97\x6e\x12\x83\xc0\xf3\x3f'))
    # dates: 4-byte little-endian int (days from 1970-01-01)
    self.assertEqual(
        1234,
        Conversions.from_byte_buffer(DateType.get(), b'\xd2\x04\x00\x00'))
    # time/timestamps: 8-byte little-endian long (microsecond counts)
    self.assertEqual(
        100000000000,
        Conversions.from_byte_buffer(TimeType.get(), b'\x00\xe8vH\x17\x00\x00\x00'))
    self.assertEqual(
        100000000000,
        Conversions.from_byte_buffer(TimestampType.with_timezone(),
                                     b'\x00\xe8vH\x17\x00\x00\x00'))
    self.assertEqual(
        100000000000,
        Conversions.from_byte_buffer(TimestampType.without_timezone(),
                                     b'\x00\xe8vH\x17\x00\x00\x00'))
    # strings: UTF-8 bytes without a length prefix
    self.assertEqual(
        "foo",
        Conversions.from_byte_buffer(StringType.get(), b'foo'))
    # uuids: 16 big-endian bytes
    self.assertEqual(
        uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"),
        Conversions.from_byte_buffer(UUIDType.get(),
                                     b'\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7'))
    # fixed/binary values are stored directly
    self.assertEqual(
        b'foo',
        Conversions.from_byte_buffer(FixedType.of_length(3), b'foo'))
    self.assertEqual(
        b'foo',
        Conversions.from_byte_buffer(BinaryType.get(), b'foo'))
    # decimals: unscaled value in minimal two's-complement big-endian bytes
    self.assertEqual(
        Decimal(123.45).quantize(Decimal(".01")),
        Conversions.from_byte_buffer(DecimalType.of(5, 2), b'\x30\x39'))
    self.assertEqual(
        Decimal(123.4567).quantize(Decimal(".0001")),
        Conversions.from_byte_buffer(DecimalType.of(5, 4), b'\x00\x12\xd6\x87'))
    # negative decimal exercises the two's-complement path
    self.assertEqual(
        Decimal(-123.4567).quantize(Decimal(".0001")),
        Conversions.from_byte_buffer(DecimalType.of(5, 4), b'\xff\xed\x29\x79'))
def test_schema_evolution_filter(primitive_type_test_file):
    """Filter on a column ("new_col") that does not exist in the data file:
    not_null matches no rows, is_null matches every row (with nulls filled in)."""
    read_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(16, "other_new_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(15, "new_col", StringType.get())
    ])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})

    arrow_schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("other_new_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
        pa.field("new_col", pa.string(), nullable=True)
    ])
    empty_columns = [
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.int64()),
        # NOTE(review): declared int32 here while the schema field is int64 —
        # preserved as-is; confirm pa.table() accepts/casts the empty array
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.float32()),
        pa.array([], type=pa.float64()),
        pa.array([], type=pa.string()),
        pa.array([], type=pa.string())
    ]
    not_null_table = pa.table(empty_columns, schema=arrow_schema)

    populated_columns = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([None, None, None, None, None], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([None, None, None, None, None], type=pa.string())
    ]
    null_table = pa.table(populated_columns, schema=arrow_schema)

    not_null_reader = ParquetReader(input_file, read_schema, {},
                                    Expressions.not_null("new_col"), True)
    assert not_null_table == not_null_reader.read()

    is_null_reader = ParquetReader(input_file, read_schema, {},
                                   Expressions.is_null("new_col"), True)
    assert null_table == is_null_reader.read()
def test_primitive_types(primitive_type_test_parquet_file):
    """Converting the parquet test file yields the expected iceberg schema
    covering ints, strings, floats, decimals, dates, timestamps and booleans."""
    fields = [
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get())
    ]
    converted = convert_parquet_to_iceberg(primitive_type_test_parquet_file)
    compare_schema(Schema(fields), converted)
def test_to_bytes(self):
    """Serialize literals of every primitive type to single-value byte buffers."""
    # booleans: a single byte, 0x00 false / 0x01 true
    self.assertEqual(b'\x00', Literal.of(False).to_byte_buffer())
    self.assertEqual(b'\x01', Literal.of(True).to_byte_buffer())
    # ints are 4 bytes little-endian, longs 8 bytes little-endian
    self.assertEqual(b'\xd2\x04\x00\x00', Literal.of(1234).to_byte_buffer())
    self.assertEqual(b'\xd2\x04\x00\x00\x00\x00\x00\x00',
                     Literal.of(1234).to(LongType.get()).to_byte_buffer())
    # floats/doubles are IEEE-754 little-endian, 4/8 bytes
    self.assertEqual(b'\x19\x04\x9e?', Literal.of(1.2345).to_byte_buffer())
    self.assertEqual(
        b'\x8d\x97\x6e\x12\x83\xc0\xf3\x3f',
        Literal.of(1.2345).to(DoubleType.get()).to_byte_buffer())
    # dates: 4-byte little-endian int
    self.assertEqual(b'\xd2\x04\x00\x00',
                     Literal.of(1234).to(DateType.get()).to_byte_buffer())
    # time/timestamps: 8-byte little-endian long
    self.assertEqual(
        b'\x00\xe8vH\x17\x00\x00\x00',
        Literal.of(100000000000).to(TimeType.get()).to_byte_buffer())
    self.assertEqual(
        b'\x00\xe8vH\x17\x00\x00\x00',
        Literal.of(100000000000).to(
            TimestampType.with_timezone()).to_byte_buffer())
    self.assertEqual(
        b'\x00\xe8vH\x17\x00\x00\x00',
        Literal.of(100000000000).to(
            TimestampType.without_timezone()).to_byte_buffer())
    # strings: UTF-8 bytes without a length prefix
    self.assertEqual(b'foo', Literal.of("foo").to_byte_buffer())
    # uuids: 16 big-endian bytes
    self.assertEqual(
        b'\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7',
        Literal.of(uuid.UUID(
            "f79c3e09-677c-4bbd-a479-3f349cb785e7")).to_byte_buffer())
    # fixed/binary values are stored directly
    self.assertEqual(b'foo', Literal.of(bytes(b'foo')).to_byte_buffer())
    self.assertEqual(b'foo', Literal.of(bytearray(b'foo')).to_byte_buffer())
    # Decimal on 2-bytes
    self.assertEqual(
        b'\x30\x39',
        Literal.of(123.45).to(DecimalType.of(5, 2)).to_byte_buffer())
    # Decimal on 3-bytes to test that we use the minimum number of bytes
    self.assertEqual(
        b'\x12\xd6\x87',
        Literal.of(123.4567).to(DecimalType.of(7, 4)).to_byte_buffer())
    # Negative decimal to test two's complement
    self.assertEqual(
        b'\xed\x29\x79',
        Literal.of(-123.4567).to(DecimalType.of(7, 4)).to_byte_buffer())
def supported_primitives():
    """Return a StructType containing one field of every supported primitive type.

    Fixed: "dec_9_0", "dec_11_2" and "dec_38_10" previously all shared field
    id 114; Iceberg field ids must be unique within a schema, so the decimal
    fields now use 114, 115 and 116 respectively.
    """
    return StructType.of([
        NestedField.required(100, "id", LongType.get()),
        NestedField.optional(101, "data", StringType.get()),
        NestedField.required(102, "b", BooleanType.get()),
        NestedField.optional(103, "i", IntegerType.get()),
        NestedField.required(104, "l", LongType.get()),
        NestedField.optional(105, "f", FloatType.get()),
        NestedField.required(106, "d", DoubleType.get()),
        NestedField.optional(107, "date", DateType.get()),
        NestedField.required(108, "ts", TimestampType.with_timezone()),
        NestedField.required(110, "s", StringType.get()),
        NestedField.required(111, "uuid", UUIDType.get()),
        NestedField.required(112, "fixed", FixedType.of_length(7)),
        NestedField.optional(113, "bytes", BinaryType.get()),
        NestedField.required(114, "dec_9_0", DecimalType.of(9, 0)),
        NestedField.required(115, "dec_11_2", DecimalType.of(11, 2)),
        NestedField.required(116, "dec_38_10", DecimalType.of(38, 10))
    ])
def test_compound_filter(primitive_type_test_file):
    """An AND of two equality predicates narrows the scan to the single matching row."""
    read_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get())
    ])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    row_filter = Expressions.and_(Expressions.equal("string_col", "us"),
                                  Expressions.equal("int_col", 1))
    reader = ParquetReader(input_file, read_schema, {}, row_filter, True)

    arrow_schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True)
    ])
    expected_table = pa.Table.from_arrays(
        [pa.array([1], type=pa.int32()),
         pa.array([1], type=pa.int64()),
         pa.array([1.0], type=pa.float32()),
         pa.array([1.0], type=pa.float64()),
         pa.array(['us'], type=pa.string())],
        schema=arrow_schema)

    assert expected_table == reader.read()
def test_decimal_column_add(primitive_type_test_file):
    """A decimal column added to the schema after the file was written reads
    back as all nulls."""
    read_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(13, "new_dec_col", DecimalType.of(38, 9))
    ])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, read_schema, {}, Expressions.always_true(), True)

    arrow_schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("new_dec_col", pa.decimal128(38, 9), nullable=True)
    ])
    expected_table = pa.Table.from_arrays(
        [pa.array([1, 2, 3, 4, 5], type=pa.int32()),
         pa.array([1, 2, 3, None, 5], type=pa.int64()),
         pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
         pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
         pa.array([None, None, None, None, None], type=pa.decimal128(38, 9))],
        schema=arrow_schema)

    assert expected_table == reader.read()
def test_basic_read(primitive_type_test_file, pyarrow_primitive_array, pyarrow_schema):
    """An always-true filter returns the file contents unchanged."""
    fields = [
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get())
    ]
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, Schema(fields), {}, Expressions.always_true(), True)

    expected_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    assert reader.read() == expected_table
def test_long_to_double_conversion():
    """Converting a long literal to double preserves the numeric value."""
    long_lit = Literal.of(34).to(LongType.get())
    assert math.isclose(long_lit.value, long_lit.to(DoubleType.get()).value)
def test_double_to_decimal_conversion(float_type_val_tuples):
    """A double literal converted to a decimal type matches the expected Decimal digits."""
    decimal_type, expected_value = float_type_val_tuples
    converted = Literal.of(34.56).to(DoubleType.get()).to(decimal_type)
    assert converted.value.as_tuple() == Decimal(expected_value).as_tuple()
class AvroToIceberg(object):
    """Convert Avro JSON schemas to Iceberg schemas and Avro rows to Iceberg rows.

    Schema conversion walks the Avro JSON structure and dispatches on the python
    type of each "type" entry (str / dict / list) via TYPE_PROCESSING_MAP; row
    conversion dispatches on the Iceberg TypeID via PROCESS_FUNCS.
    """

    # Property names read out of the Avro JSON schema dicts.
    FIELD_ID_PROP = "field-id"
    FIELD_TYPE_PROP = "type"
    FIELD_NAME_PROP = "name"
    FIELD_LOGICAL_TYPE_PROP = "logicalType"
    FIELD_FIELDS_PROP = "fields"
    FIELD_ITEMS_PROP = "items"
    FIELD_ELEMENT_ID_PROP = "element-id"

    AVRO_JSON_PRIMITIVE_TYPES = ("boolean", "int", "long", "float", "double", "bytes", "string")
    AVRO_JSON_COMPLEX_TYPES = ("record", "array", "enum", "fixed")

    # Dispatch on the python type of the Avro "type" value:
    # a plain string is a named type, a dict an inline complex type, a list a union.
    TYPE_PROCESSING_MAP = {str: lambda x, y: AvroToIceberg.convert_str_type(x, y),
                           dict: lambda x, y: AvroToIceberg.convert_complex_type(x, y),
                           list: lambda x, y: AvroToIceberg.convert_union_type(x, y)}
    COMPLEX_TYPE_PROCESSING_MAP = {"record": lambda x, y: AvroToIceberg.convert_record_type(x, y),
                                   "array": lambda x, y: AvroToIceberg.convert_array_type(x, y),
                                   "map": lambda x, y: AvroToIceberg.convert_map_type(x, y)}
    # Avro primitive / logical type name -> Iceberg primitive type.
    PRIMITIVE_FIELD_TYPE_MAP = {"boolean": BooleanType.get(),
                                "bytes": BinaryType.get(),
                                "date": DateType.get(),
                                "double": DoubleType.get(),
                                "float": FloatType.get(),
                                "int": IntegerType.get(),
                                "long": LongType.get(),
                                "string": StringType.get(),
                                "time-millis": TimeType.get(),
                                "timestamp-millis": TimestampType.without_timezone()}
    # Row-value extractors per Iceberg TypeID; primitives fall through to
    # get_field_from_primitive in get_field_from_avro.
    PROCESS_FUNCS = {TypeID.STRUCT: lambda avro_row, field: AvroToIceberg.get_field_from_struct(avro_row, field),
                     TypeID.LIST: lambda avro_row, field: AvroToIceberg.get_field_from_list(avro_row, field),
                     TypeID.MAP: lambda avro_row, field: AvroToIceberg.get_field_from_map(avro_row, field)}

    @staticmethod
    def convert_avro_schema_to_iceberg(avro_schema):
        """Convert a top-level Avro record schema (a dict) into an iceberg Schema.

        Raises RuntimeError when the top-level type is not "record".
        """
        if avro_schema.get(AvroToIceberg.FIELD_TYPE_PROP) != "record":
            raise RuntimeError("Cannot convert avro schema to iceberg %s" % avro_schema)
        # convert_type returns (type, optional, next_id); only the struct is needed here.
        struct = AvroToIceberg.convert_type(avro_schema, None)
        return Schema(struct[0].fields)

    @staticmethod
    def convert_record_type(avro_field, next_id=None):
        """Convert an Avro "record" dict to (StructType, next_id).

        NOTE(review): the error message below misspells "must" as "muse";
        left unchanged here because it is runtime behavior.
        """
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        if avro_field_type != "record":
            raise RuntimeError("Field type muse be 'record': %s" % avro_field_type)
        fields = avro_field.get(AvroToIceberg.FIELD_FIELDS_PROP)
        iceberg_fields = []
        # Seed the id counter with the number of fields for the outermost record
        # (nested records inherit the caller's counter).
        if next_id is None:
            next_id = len(fields)
        for field in fields:
            iceberg_field, next_id = AvroToIceberg.convert_avro_field_to_iceberg(field, next_id=next_id)
            iceberg_fields.append(iceberg_field)
        return StructType.of(iceberg_fields), next_id

    @staticmethod
    def convert_avro_field_to_iceberg(field, next_id):
        """Convert one Avro field dict to (NestedField, next_id).

        Fields without a "field-id" property return the bare iceberg type
        instead of a NestedField.
        """
        field_type, is_optional, next_id = AvroToIceberg.convert_type(field, next_id)
        if field.get(AvroToIceberg.FIELD_ID_PROP) is None:
            return field_type, next_id
        if is_optional:
            return NestedField.optional(field.get(AvroToIceberg.FIELD_ID_PROP),
                                        field.get(AvroToIceberg.FIELD_NAME_PROP),
                                        field_type), next_id
        else:
            return NestedField.required(field.get(AvroToIceberg.FIELD_ID_PROP),
                                        field.get(AvroToIceberg.FIELD_NAME_PROP),
                                        field_type), next_id

    @staticmethod
    def convert_type(field, next_id=None):
        """Dispatch on the python type of the field's "type" value.

        Returns (iceberg_type, optional, next_id); optional is True for a
        two-item union with "null".
        """
        avro_field_type = field.get(AvroToIceberg.FIELD_TYPE_PROP)
        optional = AvroToIceberg.is_option_schema(avro_field_type)
        processing_func = AvroToIceberg.TYPE_PROCESSING_MAP.get(type(avro_field_type))
        if processing_func is None:
            raise RuntimeError("No function found to process %s" % avro_field_type)
        iceberg_type, next_id = processing_func(field, next_id)
        return iceberg_type, optional, next_id

    @staticmethod
    def convert_str_type(avro_field, next_id=None):
        """Convert a field whose "type" is a plain string name.

        The "logicalType" property, when present, takes precedence over the
        raw type name for both primitive and complex lookups.
        """
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        logical_type = avro_field.get(AvroToIceberg.FIELD_LOGICAL_TYPE_PROP)
        if not isinstance(avro_field_type, str):
            raise RuntimeError("Field type must be of type str: %s" % avro_field_type)
        if avro_field_type in AvroToIceberg.AVRO_JSON_PRIMITIVE_TYPES:
            if logical_type is not None:
                return AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(logical_type), next_id
            else:
                return AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(avro_field_type), next_id
        elif avro_field_type in AvroToIceberg.AVRO_JSON_COMPLEX_TYPES:
            if logical_type is not None:
                processing_func = AvroToIceberg.COMPLEX_TYPE_PROCESSING_MAP.get(logical_type)
            else:
                processing_func = AvroToIceberg.COMPLEX_TYPE_PROCESSING_MAP.get(avro_field_type)
            if processing_func is None:
                raise RuntimeError("No function found to process %s" % avro_field_type)
            return processing_func(avro_field, next_id)
        else:
            raise RuntimeError("Unknown type %s" % avro_field_type)

    @staticmethod
    def convert_complex_type(avro_field, next_id=None):
        """Convert a field whose "type" is an inline dict (nested complex type)."""
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        if not isinstance(avro_field_type, dict):
            raise RuntimeError("Complex field type must be of type dict: %s" % avro_field_type)
        return AvroToIceberg.convert_avro_field_to_iceberg(avro_field_type, next_id)

    @staticmethod
    def convert_union_type(avro_field, next_id=None):
        """Convert a union "type" (a list); only two-item unions are supported.

        The non-"null" branch is substituted into the field dict in place
        (NOTE: mutates the caller's dict) and converted as that type.
        """
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        if not isinstance(avro_field_type, list):
            raise RuntimeError("Union field type must be of type list: %s" % avro_field_type)
        if len(avro_field_type) > 2:
            raise RuntimeError("Cannot process unions larger than 2 items: %s" % avro_field_type)
        for item in avro_field_type:
            if isinstance(item, str) and item == "null":
                continue
            avro_field_type = item
        avro_field[AvroToIceberg.FIELD_TYPE_PROP] = avro_field_type
        # convert_type returns (type, optional, next_id); optionality was
        # already determined by the caller, so drop the middle element.
        items = AvroToIceberg.convert_type(avro_field, next_id)
        return items[0], items[2]

    @staticmethod
    def convert_array_type(avro_field, next_id=None):
        """Convert an Avro "array" to (ListType, next_id).

        Only primitive element types are supported; complex elements raise.
        """
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        if avro_field_type != "array":
            raise RuntimeError("Avro type must be array: %s" % avro_field_type)
        element_id = avro_field.get(AvroToIceberg.FIELD_ELEMENT_ID_PROP)
        items = avro_field.get(AvroToIceberg.FIELD_ITEMS_PROP)
        is_optional = AvroToIceberg.is_option_schema(items)
        if isinstance(items, str) and items in AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP:
            item_type = AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(items)
            if item_type is None:
                raise RuntimeError("No mapping found for type %s" % items)
        else:
            raise RuntimeError("Complex list types not yet implemented")
        if is_optional:
            return ListType.of_optional(element_id, item_type), next_id
        else:
            return ListType.of_required(element_id, item_type), next_id

    @staticmethod
    def convert_map_type(avro_field, next_id=None):
        """Convert the key/value-record array encoding of a map to (MapType, next_id).

        NOTE(review): key_id/key_type/value_id/value_type are only bound when
        "key"/"value" fields are present — missing entries would raise
        NameError below; confirm the inputs always provide both.
        NOTE(review): "imeplemented" typo in the value error message is runtime
        behavior and left unchanged.
        """
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        avro_logical_type = avro_field.get(AvroToIceberg.FIELD_LOGICAL_TYPE_PROP)
        if avro_field_type != "array" or avro_logical_type != "map":
            raise RuntimeError("Avro type must be array and logical type must be map: %s" % avro_logical_type)
        # is_optional is never updated in this implementation, so the
        # of_required branch is always taken.
        is_optional = False
        items = avro_field.get(AvroToIceberg.FIELD_ITEMS_PROP)
        for field in items.get(AvroToIceberg.FIELD_FIELDS_PROP, list()):
            if field.get(AvroToIceberg.FIELD_NAME_PROP) == "key":
                key_id = field.get(AvroToIceberg.FIELD_ID_PROP)
                if not isinstance(field.get(AvroToIceberg.FIELD_TYPE_PROP), str):
                    raise RuntimeError("Support for complex map keys not yet implemented")
                key_type = AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(field.get(AvroToIceberg.FIELD_TYPE_PROP))
            elif field.get(AvroToIceberg.FIELD_NAME_PROP) == "value":
                value_id = field.get(AvroToIceberg.FIELD_ID_PROP)
                if not isinstance(field.get(AvroToIceberg.FIELD_TYPE_PROP), str):
                    raise RuntimeError("Support for complex map values not yet imeplemented")
                value_type = AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(field.get(AvroToIceberg.FIELD_TYPE_PROP))
        if is_optional:
            return MapType.of_optional(key_id, value_id, key_type, value_type), next_id
        else:
            return MapType.of_required(key_id, value_id, key_type, value_type), next_id

    @staticmethod
    def is_option_schema(field_type):
        """Return True for a two-item union containing "null" (an optional field)."""
        if isinstance(field_type, list) and len(field_type) == 2 and "null" in field_type:
            return True
        return False

    @staticmethod
    def read_avro_file(iceberg_schema, data_file):
        """Yield each row of an Avro data file as a dict keyed by iceberg field names.

        NOTE(review): the file object is only closed after full iteration —
        abandoning the generator early or an exception mid-read leaks the
        handle; consider try/finally or a with-block.
        """
        fo = data_file.new_fo()
        avro_reader = fastavro.reader(fo)
        for avro_row in avro_reader:
            iceberg_row = dict()
            for field in iceberg_schema.as_struct().fields:
                iceberg_row[field.name] = AvroToIceberg.get_field_from_avro(avro_row, field)
            yield iceberg_row
        fo.close()

    @staticmethod
    def read_avro_row(iceberg_schema, avro_reader):
        """Yield rows from an already-open fastavro reader, mapped to iceberg field names."""
        try:
            for avro_row in avro_reader:
                iceberg_row = dict()
                for field in iceberg_schema.as_struct().fields:
                    iceberg_row[field.name] = AvroToIceberg.get_field_from_avro(avro_row, field)
                yield iceberg_row
        except StopIteration:
            return

    @staticmethod
    def get_field_from_avro(avro_row, field):
        """Extract one field's value from an Avro row, dispatching on the field's TypeID."""
        try:
            return AvroToIceberg.PROCESS_FUNCS.get(field.type.type_id,
                                                   AvroToIceberg.get_field_from_primitive)(avro_row, field)
        except KeyError:
            raise RuntimeError("Don't know how to get field of type: %s" % field.type.type_id)

    @staticmethod
    def get_field_from_primitive(avro_row, field):
        """Return a primitive field value; missing optional fields yield None."""
        try:
            return avro_row[field.name]
        except KeyError:
            if field.is_required:
                raise RuntimeError("Field is required but missing in source %s\n%s:" % (field, avro_row))

    @staticmethod
    def get_field_from_struct(avro_row, field):
        """Recursively extract a nested struct as a plain dict."""
        field_obj = {}
        for nested_field in field.type.fields:
            field_obj[nested_field.name] = AvroToIceberg.get_field_from_avro(avro_row[field.name], nested_field)
        return field_obj

    @staticmethod
    def get_field_from_list(avro_row, field):
        """Return a list field value; missing optional fields yield None."""
        try:
            return avro_row[field.name]
        except KeyError:
            if field.is_required:
                raise RuntimeError("Field is required but missing in source %s\n%s:" % (field, avro_row))

    @staticmethod
    def get_field_from_map(avro_row, field):
        """Rebuild a python dict from the Avro key/value-record encoding of a map."""
        val_map = dict()
        try:
            avro_value = avro_row[field.name]
        except KeyError:
            if field.is_required:
                raise RuntimeError("Field is required but missing in source %s\n%s:" % (field, avro_row))
            else:
                return None
        for val in avro_value:
            val_map[val['key']] = val['value']
        return val_map
FixedType, FloatType, IntegerType, LongType, StringType, TimestampType, TimeType, UUIDType) PRIMITIVES = [BinaryType.get(), BooleanType.get(), DateType.get(), DecimalType.of(9, 2), DecimalType.of(11, 2), DecimalType.of(9, 3), DoubleType.get(), FixedType.of_length(3), FixedType.of_length(4), FloatType.get(), IntegerType.get(), LongType.get(), StringType.get(), TimestampType.with_timezone(), TimestampType.without_timezone(), TimeType.get(), UUIDType.get()] class TestReadabilityChecks(unittest.TestCase): def test_primitive_types(self):
LongType, MapType, NestedField, StringType, StructType, TimestampType) from iceberg.api.types import Type import pyarrow as pa from pyarrow.parquet import lib, ParquetFile _logger = logging.getLogger(__name__) arrow_type_map = {lib.Type_BOOL: lambda x=None: BooleanType.get(), lib.Type_DATE32: lambda x=None: DateType.get(), lib.Type_DECIMAL128: lambda x=None: DecimalType.of(x.precision, x.scale), lib.Type_DOUBLE: lambda x=None: DoubleType.get(), lib.Type_FIXED_SIZE_BINARY: lambda x=None: FixedType.of_length(x.byte_width), lib.Type_BINARY: lambda x=None: BinaryType.get(), lib.Type_FLOAT: lambda x=None: FloatType.get(), lib.Type_STRING: lambda x=None: StringType.get(), lib.Type_INT32: lambda x=None: IntegerType.get(), lib.Type_INT64: lambda x=None: LongType.get(), lib.Type_TIMESTAMP: lambda x=None: (TimestampType.without_timezone() if x.tz is None else TimestampType.with_timezone()) } def get_nested_field(field_id: int, field_name: str, field_type: Type, nullable: bool) -> NestedField: if nullable: return NestedField.optional(field_id, field_name, field_type)
def test_float_to_double():
    """Widening a float literal to double keeps the numeric value."""
    float_lit = Literal.of(34.56)
    assert math.isclose(float_lit.value, float_lit.to(DoubleType.get()).value)
def test_double_to_float():
    """Narrowing a double literal to float keeps the value within float tolerance."""
    double_lit = Literal.of(34.56).to(DoubleType.get())
    assert math.isclose(double_lit.value, double_lit.to(FloatType.get()).value)
def test_byte_buffer_conversions(self):
    """Round-trip every primitive type through Conversions and Literal byte buffers,
    spelling out the expected little/big-endian encodings byte by byte."""
    # booleans are stored as 0x00 for 'false' and a non-zero byte for 'true'
    self.assertConversion(False, BooleanType.get(), b'\x00')
    self.assertConversion(True, BooleanType.get(), b'\x01')
    self.assertEqual(b'\x00', Literal.of(False).to_byte_buffer())
    self.assertEqual(b'\x01', Literal.of(True).to_byte_buffer())

    # integers are stored as 4 bytes in little-endian order
    # 84202 is 0...01|01001000|11101010 in binary
    # 11101010 -> 234 (-22), 01001000 -> 72, 00000001 -> 1, 00000000 -> 0
    self.assertConversion(84202, IntegerType.get(), bytes([234, 72, 1, 0]))
    self.assertEqual(bytes([234, 72, 1, 0]), Literal.of(84202).to_byte_buffer())

    # longs are stored as 8 bytes in little-endian order
    # 200L is 0...0|11001000 in binary
    # 11001000 -> 200 (-56), 00000000 -> 0, ... , 00000000 -> 0
    self.assertConversion(200, LongType.get(), bytes([200, 0, 0, 0, 0, 0, 0, 0]))
    self.assertEqual(bytes([200, 0, 0, 0, 0, 0, 0, 0]),
                     Literal.of(200).to(LongType.get()).to_byte_buffer())

    # floats are stored as 4 bytes in little-endian order
    # floating point numbers are represented as sign * 2ˆexponent * mantissa
    # -4.5F is -1 * 2ˆ2 * 1.125 and encoded as 11000000|10010000|0...0 in binary
    # 00000000 -> 0, 00000000 -> 0, 10010000 -> 144 (-112), 11000000 -> 192 (-64),
    self.assertConversion(-4.5, FloatType.get(), bytes([0, 0, 144, 192]))
    self.assertEqual(bytes([0, 0, 144, 192]), Literal.of(-4.5).to_byte_buffer())

    # doubles are stored as 8 bytes in little-endian order
    # floating point numbers are represented as sign * 2ˆexponent * mantissa
    # 6.0 is 1 * 2ˆ4 * 1.5 and encoded as 01000000|00011000|0...0
    # 00000000 -> 0, ... , 00011000 -> 24, 01000000 -> 64
    self.assertConversion(6.0, DoubleType.get(), bytes([0, 0, 0, 0, 0, 0, 24, 64]))
    self.assertEqual(bytes([0, 0, 0, 0, 0, 0, 24, 64]),
                     Literal.of(6.0).to(DoubleType.get()).to_byte_buffer())

    # dates are stored as days from 1970-01-01 in a 4-byte little-endian int
    # 1000 is 0...0|00000011|11101000 in binary
    # 11101000 -> 232 (-24), 00000011 -> 3, ... , 00000000 -> 0
    self.assertConversion(1000, DateType.get(), bytes([232, 3, 0, 0]))
    self.assertEqual(bytes([232, 3, 0, 0]),
                     Literal.of(1000).to(DateType.get()).to_byte_buffer())

    # time is stored as microseconds from midnight in an 8-byte little-endian long
    # 10000L is 0...0|00100111|00010000 in binary
    # 00010000 -> 16, 00100111 -> 39, ... , 00000000 -> 0
    self.assertConversion(10000, TimeType.get(), bytes([16, 39, 0, 0, 0, 0, 0, 0]))
    self.assertEqual(
        bytes([16, 39, 0, 0, 0, 0, 0, 0]),
        Literal.of(10000).to(LongType.get()).to(
            TimeType.get()).to_byte_buffer())

    # timestamps are stored as microseconds from 1970-01-01 00:00:00.000000 in an 8-byte little-endian long
    # 400000L is 0...110|00011010|10000000 in binary
    # 10000000 -> 128 (-128), 00011010 -> 26, 00000110 -> 6, ... , 00000000 -> 0
    self.assertConversion(400000, TimestampType.without_timezone(),
                          bytes([128, 26, 6, 0, 0, 0, 0, 0]))
    self.assertConversion(400000, TimestampType.with_timezone(),
                          bytes([128, 26, 6, 0, 0, 0, 0, 0]))
    self.assertEqual(
        bytes([128, 26, 6, 0, 0, 0, 0, 0]),
        Literal.of(400000).to(LongType.get()).to(
            TimestampType.without_timezone()).to_byte_buffer())
    self.assertEqual(
        bytes([128, 26, 6, 0, 0, 0, 0, 0]),
        Literal.of(400000).to(LongType.get()).to(
            TimestampType.with_timezone()).to_byte_buffer())

    # strings are stored as UTF-8 bytes (without length)
    # 'A' -> 65, 'B' -> 66, 'C' -> 67
    self.assertConversion("ABC", StringType.get(), bytes([65, 66, 67]))
    self.assertEqual(bytes([65, 66, 67]), Literal.of("ABC").to_byte_buffer())

    # uuids are stored as 16-byte big-endian values
    # f79c3e09-677c-4bbd-a479-3f349cb785e7 is encoded as F7 9C 3E 09 67 7C 4B BD A4 79 3F 34 9C B7 85 E7
    # 0xF7 -> 11110111 -> 247 (-9), 0x9C -> 10011100 -> 156 (-100), 0x3E -> 00111110 -> 62,
    # 0x09 -> 00001001 -> 9, 0x67 -> 01100111 -> 103, 0x7C -> 01111100 -> 124,
    # 0x4B -> 01001011 -> 75, 0xBD -> 10111101 -> 189 (-67), 0xA4 -> 10100100 -> 164 (-92),
    # 0x79 -> 01111001 -> 121, 0x3F -> 00111111 -> 63, 0x34 -> 00110100 -> 52,
    # 0x9C -> 10011100 -> 156 (-100), 0xB7 -> 10110111 -> 183 (-73), 0x85 -> 10000101 -> 133 (-123),
    # 0xE7 -> 11100111 -> 231 (-25)
    self.assertConversion(
        uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), UUIDType.get(),
        bytes([
            247, 156, 62, 9, 103, 124, 75, 189, 164, 121, 63, 52, 156, 183, 133, 231
        ]))
    self.assertEqual(
        bytes([
            247, 156, 62, 9, 103, 124, 75, 189, 164, 121, 63, 52, 156, 183, 133, 231
        ]),
        Literal.of(uuid.UUID(
            "f79c3e09-677c-4bbd-a479-3f349cb785e7")).to_byte_buffer())

    # fixed values are stored directly
    # 'a' -> 97, 'b' -> 98
    self.assertConversion(bytes("ab", "utf8"), FixedType.of_length(2), bytes([97, 98]))
    self.assertEqual(bytes([97, 98]), Literal.of(bytes("ab", "utf8")).to_byte_buffer())

    # binary values are stored directly
    # 'Z' -> 90
    self.assertConversion(bytearray("Z", "utf8"), BinaryType.get(), bytes([90]))
    self.assertEqual(bytes([90]), Literal.of(bytearray("Z", "utf8")).to_byte_buffer())

    # decimals are stored as unscaled values in the form of two's-complement big-endian binary,
    # using the minimum number of bytes for the values
    # 345 is 0...1|01011001 in binary
    # 00000001 -> 1, 01011001 -> 89
    self.assertConversion(
        Decimal(3.45).quantize(Decimal(".01")), DecimalType.of(3, 2), bytes([1, 89]))
    self.assertEqual(
        bytes([1, 89]),
        Literal.of(3.45).to(DecimalType.of(3, 2)).to_byte_buffer())

    # decimal on 3-bytes to test that we use the minimum number of bytes and not a power of 2
    # 1234567 is 00010010|11010110|10000111 in binary
    # 00010010 -> 18, 11010110 -> 214, 10000111 -> 135
    self.assertConversion(
        Decimal(123.4567).quantize(Decimal(".0001")), DecimalType.of(7, 4),
        bytes([18, 214, 135]))
    self.assertEqual(
        bytes([18, 214, 135]),
        Literal.of(123.4567).to(DecimalType.of(7, 4)).to_byte_buffer())

    # negative decimal to test two's complement
    # -1234567 is 11101101|00101001|01111001 in binary
    # 11101101 -> 237, 00101001 -> 41, 01111001 -> 121
    self.assertConversion(
        Decimal(-123.4567).quantize(Decimal(".0001")), DecimalType.of(7, 4),
        bytes([237, 41, 121]))
    self.assertEqual(
        bytes([237, 41, 121]),
        Literal.of(-123.4567).to(DecimalType.of(7, 4)).to_byte_buffer())

    # test empty byte in decimal
    # 11 is 00001011 in binary
    # 00001011 -> 11
    self.assertConversion(
        Decimal(0.011).quantize(Decimal(".001")), DecimalType.of(10, 3), bytes([11]))
    self.assertEqual(
        bytes([11]),
        Literal.of(0.011).to(DecimalType.of(10, 3)).to_byte_buffer())