def get_type(partition_type):
    return StructType.of([NestedField.required(100, "file_path", StringType.get()),
                          NestedField.required(101, "file_format", StringType.get()),
                          NestedField.required(102, "partition", partition_type),
                          NestedField.required(103, "record_count", LongType.get()),
                          NestedField.required(104, "file_size_in_bytes", LongType.get()),
                          NestedField.required(105, "block_size_in_bytes", LongType.get()),
                          NestedField.optional(106, "file_ordinal", IntegerType.get()),
                          NestedField.optional(107, "sort_columns", ListType.of_required(112, IntegerType.get())),
                          NestedField.optional(108, "column_sizes",
                                               MapType.of_required(117, 118, IntegerType.get(), LongType.get())),
                          NestedField.optional(109, "value_counts",
                                               MapType.of_required(119, 120, IntegerType.get(), LongType.get())),
                          NestedField.optional(110, "null_value_counts",
                                               MapType.of_required(121, 122, IntegerType.get(), LongType.get())),
                          NestedField.optional(125, "lower_bounds",
                                               MapType.of_required(126, 127, IntegerType.get(), BinaryType.get())),
                          NestedField.optional(128, "upper_bounds",
                                               MapType.of_required(129, 130, IntegerType.get(), BinaryType.get()))
                          # NEXT ID TO ASSIGN: 131
                          ])
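# A minimal usage sketch of get_type (hypothetical: the field id 1000 and the
# single date partition column are illustration-only, not part of this module).
# Field 102 of the resulting manifest-entry struct carries the supplied
# partition struct.
example_partition_type = StructType.of([NestedField.required(1000, "d", DateType.get())])
example_manifest_entry_type = get_type(example_partition_type)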
def test_from_bytes(self):
    self.assertEqual(False, Conversions.from_byte_buffer(BooleanType.get(), b'\x00'))
    self.assertEqual(True, Conversions.from_byte_buffer(BooleanType.get(), b'\x01'))
    self.assertEqual(1234, Conversions.from_byte_buffer(IntegerType.get(), b'\xd2\x04\x00\x00'))
    self.assertEqual(1234, Conversions.from_byte_buffer(LongType.get(), b'\xd2\x04\x00\x00\x00\x00\x00\x00'))
    self.assertAlmostEqual(1.2345, Conversions.from_byte_buffer(FloatType.get(), b'\x19\x04\x9e?'), places=5)
    self.assertAlmostEqual(1.2345,
                           Conversions.from_byte_buffer(DoubleType.get(), b'\x8d\x97\x6e\x12\x83\xc0\xf3\x3f'))
    self.assertEqual(1234, Conversions.from_byte_buffer(DateType.get(), b'\xd2\x04\x00\x00'))
    self.assertEqual(100000000000, Conversions.from_byte_buffer(TimeType.get(), b'\x00\xe8vH\x17\x00\x00\x00'))
    self.assertEqual(100000000000,
                     Conversions.from_byte_buffer(TimestampType.with_timezone(), b'\x00\xe8vH\x17\x00\x00\x00'))
    self.assertEqual(100000000000,
                     Conversions.from_byte_buffer(TimestampType.without_timezone(), b'\x00\xe8vH\x17\x00\x00\x00'))
    self.assertEqual("foo", Conversions.from_byte_buffer(StringType.get(), b'foo'))
    self.assertEqual(uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"),
                     Conversions.from_byte_buffer(UUIDType.get(), b'\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7'))
    self.assertEqual(b'foo', Conversions.from_byte_buffer(FixedType.of_length(3), b'foo'))
    self.assertEqual(b'foo', Conversions.from_byte_buffer(BinaryType.get(), b'foo'))
    self.assertEqual(Decimal(123.45).quantize(Decimal(".01")),
                     Conversions.from_byte_buffer(DecimalType.of(5, 2), b'\x30\x39'))
    self.assertEqual(Decimal(123.4567).quantize(Decimal(".0001")),
                     Conversions.from_byte_buffer(DecimalType.of(5, 4), b'\x00\x12\xd6\x87'))
    self.assertEqual(Decimal(-123.4567).quantize(Decimal(".0001")),
                     Conversions.from_byte_buffer(DecimalType.of(5, 4), b'\xff\xed\x29\x79'))
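# For reference (standalone snippet): the little-endian int and long buffers
# asserted above can be reproduced with the standard library's struct module.
import struct

assert struct.pack('<i', 1234) == b'\xd2\x04\x00\x00'
assert struct.pack('<q', 100000000000) == b'\x00\xe8vH\x17\x00\x00\x00'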
def to_option(field):
    if field.type == BinaryType.get():
        type_name = "bytes"
    else:
        type_name = str(field.type)

    if field.is_optional:
        return ["null", type_name]
    else:
        return type_name
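# A small illustration of to_option (the fields here are hypothetical, and the
# second assert assumes str(StringType.get()) renders as "string"): optional
# fields become an Avro union with "null", and BinaryType maps to Avro "bytes".
assert to_option(NestedField.optional(1, "payload", BinaryType.get())) == ["null", "bytes"]
assert to_option(NestedField.required(2, "path", StringType.get())) == "string"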
def test_partition_spec(self):
    schema = Schema(NestedField.required(1, "i", IntegerType.get()),
                    NestedField.required(2, "l", LongType.get()),
                    NestedField.required(3, "d", DateType.get()),
                    NestedField.required(4, "t", TimeType.get()),
                    NestedField.required(5, "ts", TimestampType.without_timezone()),
                    NestedField.required(6, "dec", DecimalType.of(9, 2)),
                    NestedField.required(7, "s", StringType.get()),
                    NestedField.required(8, "u", UUIDType.get()),
                    NestedField.required(9, "f", FixedType.of_length(3)),
                    NestedField.required(10, "b", BinaryType.get()))
    specs = [PartitionSpec.builder_for(schema).identity("i").build(),
             PartitionSpec.builder_for(schema).identity("l").build(),
             PartitionSpec.builder_for(schema).identity("d").build(),
             PartitionSpec.builder_for(schema).identity("t").build(),
             PartitionSpec.builder_for(schema).identity("ts").build(),
             PartitionSpec.builder_for(schema).identity("dec").build(),
             PartitionSpec.builder_for(schema).identity("s").build(),
             PartitionSpec.builder_for(schema).identity("u").build(),
             PartitionSpec.builder_for(schema).identity("f").build(),
             PartitionSpec.builder_for(schema).identity("b").build(),
             PartitionSpec.builder_for(schema).bucket("i", 128).build(),
             PartitionSpec.builder_for(schema).bucket("l", 128).build(),
             PartitionSpec.builder_for(schema).bucket("d", 128).build(),
             PartitionSpec.builder_for(schema).bucket("t", 128).build(),
             PartitionSpec.builder_for(schema).bucket("ts", 128).build(),
             PartitionSpec.builder_for(schema).bucket("dec", 128).build(),
             PartitionSpec.builder_for(schema).bucket("s", 128).build(),
             PartitionSpec.builder_for(schema).bucket("u", 128).build(),
             PartitionSpec.builder_for(schema).bucket("f", 128).build(),
             PartitionSpec.builder_for(schema).bucket("b", 128).build(),
             PartitionSpec.builder_for(schema).year("d").build(),
             PartitionSpec.builder_for(schema).month("d").build(),
             PartitionSpec.builder_for(schema).day("d").build(),
             PartitionSpec.builder_for(schema).year("ts").build(),
             PartitionSpec.builder_for(schema).month("ts").build(),
             PartitionSpec.builder_for(schema).day("ts").build(),
             PartitionSpec.builder_for(schema).hour("ts").build(),
             PartitionSpec.builder_for(schema).truncate("i", 10).build(),
             PartitionSpec.builder_for(schema).truncate("l", 10).build(),
             PartitionSpec.builder_for(schema).truncate("dec", 10).build(),
             PartitionSpec.builder_for(schema).truncate("s", 10).build(),
             PartitionSpec.builder_for(schema).add_without_field_id(6, "dec_unsupported", "unsupported").build(),
             PartitionSpec.builder_for(schema).add(6, 1111, "dec_unsupported", "unsupported").build()]

    for spec in specs:
        self.assertEqual(spec, TestHelpers.round_trip_serialize(spec))
def test_bucket_hash(self):
    buckets = [[Transforms.bucket(IntegerType.get(), 100), 34, 2017239379],
               [Transforms.bucket(LongType.get(), 100), 34, 2017239379],
               [Transforms.bucket(DateType.get(), 100), 17486, -653330422],
               [Transforms.bucket(TimeType.get(), 100), 81068000000, -662762989],
               [Transforms.bucket(TimestampType.without_timezone(), 100), 1510871468000000, -2047944441],
               [Transforms.bucket(DecimalType.of(9, 2), 100), decimal.Decimal("14.20"), -500754589],
               [Transforms.bucket(StringType.get(), 100), "iceberg", 1210000089],
               [Transforms.bucket(UUIDType.get(), 100),
                uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), 1488055340],
               [Transforms.bucket(FixedType.of_length(3), 128), b'foo', -156908512],
               [Transforms.bucket(BinaryType.get(), 128), b'\x00\x01\x02\x03', -188683207]]

    for bucket in buckets:
        self.assertEqual(bucket[2], bucket[0].hash(bucket[1]))
def supported_primitives():
    # field ids must be unique within a schema, so the decimal fields use
    # ids 114-116
    return StructType.of([NestedField.required(100, "id", LongType.get()),
                          NestedField.optional(101, "data", StringType.get()),
                          NestedField.required(102, "b", BooleanType.get()),
                          NestedField.optional(103, "i", IntegerType.get()),
                          NestedField.required(104, "l", LongType.get()),
                          NestedField.optional(105, "f", FloatType.get()),
                          NestedField.required(106, "d", DoubleType.get()),
                          NestedField.optional(107, "date", DateType.get()),
                          NestedField.required(108, "ts", TimestampType.with_timezone()),
                          NestedField.required(110, "s", StringType.get()),
                          NestedField.required(111, "uuid", UUIDType.get()),
                          NestedField.required(112, "fixed", FixedType.of_length(7)),
                          NestedField.optional(113, "bytes", BinaryType.get()),
                          NestedField.required(114, "dec_9_0", DecimalType.of(9, 0)),
                          NestedField.required(115, "dec_11_2", DecimalType.of(11, 2)),
                          NestedField.required(116, "dec_38_10", DecimalType.of(38, 10))])
def test_to_json_conversion():
    spec_schema = Schema(NestedField.required(1, "i", IntegerType.get()),
                         NestedField.required(2, "l", LongType.get()),
                         NestedField.required(3, "d", DateType.get()),
                         NestedField.required(4, "t", TimeType.get()),
                         NestedField.required(5, "ts", TimestampType.without_timezone()),
                         NestedField.required(6, "dec", DecimalType.of(9, 2)),
                         NestedField.required(7, "s", StringType.get()),
                         NestedField.required(8, "u", UUIDType.get()),
                         NestedField.required(9, "f", FixedType.of_length(3)),
                         NestedField.required(10, "b", BinaryType.get()))

    specs = [PartitionSpec.builder_for(spec_schema).identity("i").build(),
             PartitionSpec.builder_for(spec_schema).identity("l").build(),
             PartitionSpec.builder_for(spec_schema).identity("d").build(),
             PartitionSpec.builder_for(spec_schema).identity("t").build(),
             PartitionSpec.builder_for(spec_schema).identity("ts").build(),
             PartitionSpec.builder_for(spec_schema).identity("dec").build(),
             PartitionSpec.builder_for(spec_schema).identity("s").build(),
             PartitionSpec.builder_for(spec_schema).identity("u").build(),
             PartitionSpec.builder_for(spec_schema).identity("f").build(),
             PartitionSpec.builder_for(spec_schema).identity("b").build(),
             PartitionSpec.builder_for(spec_schema).bucket("i", 128).build(),
             PartitionSpec.builder_for(spec_schema).bucket("l", 128).build(),
             PartitionSpec.builder_for(spec_schema).bucket("d", 128).build(),
             PartitionSpec.builder_for(spec_schema).bucket("t", 128).build(),
             PartitionSpec.builder_for(spec_schema).bucket("ts", 128).build(),
             PartitionSpec.builder_for(spec_schema).bucket("dec", 128).build(),
             PartitionSpec.builder_for(spec_schema).bucket("s", 128).build(),
             PartitionSpec.builder_for(spec_schema).year("d").build(),
             PartitionSpec.builder_for(spec_schema).month("d").build(),
             PartitionSpec.builder_for(spec_schema).day("d").build(),
             PartitionSpec.builder_for(spec_schema).year("ts").build(),
             PartitionSpec.builder_for(spec_schema).month("ts").build(),
             PartitionSpec.builder_for(spec_schema).day("ts").build(),
             PartitionSpec.builder_for(spec_schema).hour("ts").build(),
             PartitionSpec.builder_for(spec_schema).truncate("i", 10).build(),
             PartitionSpec.builder_for(spec_schema).truncate("l", 10).build(),
             PartitionSpec.builder_for(spec_schema).truncate("dec", 10).build(),
             PartitionSpec.builder_for(spec_schema).truncate("s", 10).build(),
             PartitionSpec.builder_for(spec_schema).add(6, "dec_bucket", "bucket[16]").build()]

    expected_spec_strs = ["[\n i: identity(1)\n]",
                          "[\n l: identity(2)\n]",
                          "[\n d: identity(3)\n]",
                          "[\n t: identity(4)\n]",
                          "[\n ts: identity(5)\n]",
                          "[\n dec: identity(6)\n]",
                          "[\n s: identity(7)\n]",
                          "[\n u: identity(8)\n]",
                          "[\n f: identity(9)\n]",
                          "[\n b: identity(10)\n]",
                          "[\n i_bucket: bucket[128](1)\n]",
                          "[\n l_bucket: bucket[128](2)\n]",
                          "[\n d_bucket: bucket[128](3)\n]",
                          "[\n t_bucket: bucket[128](4)\n]",
                          "[\n ts_bucket: bucket[128](5)\n]",
                          "[\n dec_bucket: bucket[128](6)\n]",
                          "[\n s_bucket: bucket[128](7)\n]",
                          "[\n d_year: year(3)\n]",
                          "[\n d_month: month(3)\n]",
                          "[\n d_day: day(3)\n]",
                          "[\n ts_year: year(5)\n]",
                          "[\n ts_month: month(5)\n]",
                          "[\n ts_day: day(5)\n]",
                          "[\n ts_hour: hour(5)\n]",
                          "[\n i_truncate: truncate[10](1)\n]",
                          "[\n l_truncate: truncate[10](2)\n]",
                          "[\n dec_truncate: truncate[10](6)\n]",
                          "[\n s_truncate: truncate[10](7)\n]",
                          "[\n dec_bucket: bucket[16](6)\n]"]

    for (spec, expected_spec_str) in zip(specs, expected_spec_strs):
        assert str(spec) == expected_spec_str
class AvroToIceberg(object):
    FIELD_ID_PROP = "field-id"
    FIELD_TYPE_PROP = "type"
    FIELD_NAME_PROP = "name"
    FIELD_LOGICAL_TYPE_PROP = "logicalType"
    FIELD_FIELDS_PROP = "fields"
    FIELD_ITEMS_PROP = "items"
    FIELD_ELEMENT_ID_PROP = "element-id"

    AVRO_JSON_PRIMITIVE_TYPES = ("boolean", "int", "long", "float", "double", "bytes", "string")
    AVRO_JSON_COMPLEX_TYPES = ("record", "array", "enum", "fixed")

    TYPE_PROCESSING_MAP = {str: lambda x, y: AvroToIceberg.convert_str_type(x, y),
                           dict: lambda x, y: AvroToIceberg.convert_complex_type(x, y),
                           list: lambda x, y: AvroToIceberg.convert_union_type(x, y)}
    COMPLEX_TYPE_PROCESSING_MAP = {"record": lambda x, y: AvroToIceberg.convert_record_type(x, y),
                                   "array": lambda x, y: AvroToIceberg.convert_array_type(x, y),
                                   "map": lambda x, y: AvroToIceberg.convert_map_type(x, y)}
    PRIMITIVE_FIELD_TYPE_MAP = {"boolean": BooleanType.get(),
                                "bytes": BinaryType.get(),
                                "date": DateType.get(),
                                "double": DoubleType.get(),
                                "float": FloatType.get(),
                                "int": IntegerType.get(),
                                "long": LongType.get(),
                                "string": StringType.get(),
                                "time-millis": TimeType.get(),
                                "timestamp-millis": TimestampType.without_timezone()}
    PROCESS_FUNCS = {TypeID.STRUCT: lambda avro_row, field: AvroToIceberg.get_field_from_struct(avro_row, field),
                     TypeID.LIST: lambda avro_row, field: AvroToIceberg.get_field_from_list(avro_row, field),
                     TypeID.MAP: lambda avro_row, field: AvroToIceberg.get_field_from_map(avro_row, field)}

    @staticmethod
    def convert_avro_schema_to_iceberg(avro_schema):
        if avro_schema.get(AvroToIceberg.FIELD_TYPE_PROP) != "record":
            raise RuntimeError("Cannot convert avro schema to iceberg %s" % avro_schema)

        struct = AvroToIceberg.convert_type(avro_schema, None)

        return Schema(struct[0].fields)

    @staticmethod
    def convert_record_type(avro_field, next_id=None):
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)

        if avro_field_type != "record":
            raise RuntimeError("Field type must be 'record': %s" % avro_field_type)

        fields = avro_field.get(AvroToIceberg.FIELD_FIELDS_PROP)
        iceberg_fields = []
        if next_id is None:
            next_id = len(fields)
        for field in fields:
            iceberg_field, next_id = AvroToIceberg.convert_avro_field_to_iceberg(field, next_id=next_id)
            iceberg_fields.append(iceberg_field)

        return StructType.of(iceberg_fields), next_id

    @staticmethod
    def convert_avro_field_to_iceberg(field, next_id):
        field_type, is_optional, next_id = AvroToIceberg.convert_type(field, next_id)

        if field.get(AvroToIceberg.FIELD_ID_PROP) is None:
            return field_type, next_id

        if is_optional:
            return NestedField.optional(field.get(AvroToIceberg.FIELD_ID_PROP),
                                        field.get(AvroToIceberg.FIELD_NAME_PROP),
                                        field_type), next_id
        else:
            return NestedField.required(field.get(AvroToIceberg.FIELD_ID_PROP),
                                        field.get(AvroToIceberg.FIELD_NAME_PROP),
                                        field_type), next_id

    @staticmethod
    def convert_type(field, next_id=None):
        avro_field_type = field.get(AvroToIceberg.FIELD_TYPE_PROP)

        optional = AvroToIceberg.is_option_schema(avro_field_type)

        processing_func = AvroToIceberg.TYPE_PROCESSING_MAP.get(type(avro_field_type))
        if processing_func is None:
            raise RuntimeError("No function found to process %s" % avro_field_type)

        iceberg_type, next_id = processing_func(field, next_id)

        return iceberg_type, optional, next_id

    @staticmethod
    def convert_str_type(avro_field, next_id=None):
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        logical_type = avro_field.get(AvroToIceberg.FIELD_LOGICAL_TYPE_PROP)
        if not isinstance(avro_field_type, str):
            raise RuntimeError("Field type must be of type str: %s" % avro_field_type)

        if avro_field_type in AvroToIceberg.AVRO_JSON_PRIMITIVE_TYPES:
            if logical_type is not None:
                return AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(logical_type), next_id
            else:
                return AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(avro_field_type), next_id
        elif avro_field_type in AvroToIceberg.AVRO_JSON_COMPLEX_TYPES:
            if logical_type is not None:
                processing_func = AvroToIceberg.COMPLEX_TYPE_PROCESSING_MAP.get(logical_type)
            else:
                processing_func = AvroToIceberg.COMPLEX_TYPE_PROCESSING_MAP.get(avro_field_type)
            if processing_func is None:
                raise RuntimeError("No function found to process %s" % avro_field_type)

            return processing_func(avro_field, next_id)
        else:
            raise RuntimeError("Unknown type %s" % avro_field_type)

    @staticmethod
    def convert_complex_type(avro_field, next_id=None):
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        if not isinstance(avro_field_type, dict):
            raise RuntimeError("Complex field type must be of type dict: %s" % avro_field_type)

        return AvroToIceberg.convert_avro_field_to_iceberg(avro_field_type, next_id)

    @staticmethod
    def convert_union_type(avro_field, next_id=None):
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        if not isinstance(avro_field_type, list):
            raise RuntimeError("Union field type must be of type list: %s" % avro_field_type)

        if len(avro_field_type) > 2:
            raise RuntimeError("Cannot process unions larger than 2 items: %s" % avro_field_type)

        for item in avro_field_type:
            if isinstance(item, str) and item == "null":
                continue
            avro_field_type = item

        avro_field[AvroToIceberg.FIELD_TYPE_PROP] = avro_field_type
        items = AvroToIceberg.convert_type(avro_field, next_id)

        return items[0], items[2]

    @staticmethod
    def convert_array_type(avro_field, next_id=None):
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        if avro_field_type != "array":
            raise RuntimeError("Avro type must be array: %s" % avro_field_type)

        element_id = avro_field.get(AvroToIceberg.FIELD_ELEMENT_ID_PROP)
        items = avro_field.get(AvroToIceberg.FIELD_ITEMS_PROP)
        is_optional = AvroToIceberg.is_option_schema(items)

        if isinstance(items, str) and items in AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP:
            item_type = AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(items)
            if item_type is None:
                raise RuntimeError("No mapping found for type %s" % items)
        else:
            raise RuntimeError("Complex list types not yet implemented")

        if is_optional:
            return ListType.of_optional(element_id, item_type), next_id
        else:
            return ListType.of_required(element_id, item_type), next_id

    @staticmethod
    def convert_map_type(avro_field, next_id=None):
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        avro_logical_type = avro_field.get(AvroToIceberg.FIELD_LOGICAL_TYPE_PROP)
        if avro_field_type != "array" or avro_logical_type != "map":
            raise RuntimeError("Avro type must be array and logical type must be map: %s" % avro_logical_type)

        is_optional = False
        items = avro_field.get(AvroToIceberg.FIELD_ITEMS_PROP)
        for field in items.get(AvroToIceberg.FIELD_FIELDS_PROP, list()):
            if field.get(AvroToIceberg.FIELD_NAME_PROP) == "key":
                key_id = field.get(AvroToIceberg.FIELD_ID_PROP)
                if not isinstance(field.get(AvroToIceberg.FIELD_TYPE_PROP), str):
                    raise RuntimeError("Support for complex map keys not yet implemented")
                key_type = AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(field.get(AvroToIceberg.FIELD_TYPE_PROP))
            elif field.get(AvroToIceberg.FIELD_NAME_PROP) == "value":
                value_id = field.get(AvroToIceberg.FIELD_ID_PROP)
                if not isinstance(field.get(AvroToIceberg.FIELD_TYPE_PROP), str):
                    raise RuntimeError("Support for complex map values not yet implemented")
                value_type = AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(field.get(AvroToIceberg.FIELD_TYPE_PROP))

        if is_optional:
            return MapType.of_optional(key_id, value_id, key_type, value_type), next_id
        else:
            return MapType.of_required(key_id, value_id, key_type, value_type), next_id

    @staticmethod
    def is_option_schema(field_type):
        if isinstance(field_type, list) and len(field_type) == 2 and "null" in field_type:
            return True

        return False

    @staticmethod
    def read_avro_file(iceberg_schema, data_file):
        fo = data_file.new_fo()
        avro_reader = fastavro.reader(fo)

        for avro_row in avro_reader:
            iceberg_row = dict()
            for field in iceberg_schema.as_struct().fields:
                iceberg_row[field.name] = AvroToIceberg.get_field_from_avro(avro_row, field)
            yield iceberg_row
        fo.close()

    @staticmethod
    def read_avro_row(iceberg_schema, avro_reader):
        try:
            for avro_row in avro_reader:
                iceberg_row = dict()
                for field in iceberg_schema.as_struct().fields:
                    iceberg_row[field.name] = AvroToIceberg.get_field_from_avro(avro_row, field)
                yield iceberg_row
        except StopIteration:
            return

    @staticmethod
    def get_field_from_avro(avro_row, field):
        try:
            return AvroToIceberg.PROCESS_FUNCS.get(field.type.type_id,
                                                   AvroToIceberg.get_field_from_primitive)(avro_row, field)
        except KeyError:
            raise RuntimeError("Don't know how to get field of type: %s" % field.type.type_id)

    @staticmethod
    def get_field_from_primitive(avro_row, field):
        try:
            return avro_row[field.name]
        except KeyError:
            if field.is_required:
                raise RuntimeError("Field is required but missing in source %s\n%s" % (field, avro_row))

    @staticmethod
    def get_field_from_struct(avro_row, field):
        field_obj = {}
        for nested_field in field.type.fields:
            field_obj[nested_field.name] = AvroToIceberg.get_field_from_avro(avro_row[field.name], nested_field)
        return field_obj

    @staticmethod
    def get_field_from_list(avro_row, field):
        try:
            return avro_row[field.name]
        except KeyError:
            if field.is_required:
                raise RuntimeError("Field is required but missing in source %s\n%s" % (field, avro_row))

    @staticmethod
    def get_field_from_map(avro_row, field):
        val_map = dict()
        try:
            avro_value = avro_row[field.name]
        except KeyError:
            if field.is_required:
                raise RuntimeError("Field is required but missing in source %s\n%s" % (field, avro_row))
            else:
                return None

        for val in avro_value:
            val_map[val['key']] = val['value']

        return val_map
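# A sketch of converting a parsed Avro record schema (a plain dict, e.g. the
# result of json.loads on an .avsc file) into an Iceberg Schema. The schema
# contents below are hypothetical; note that every field carries the
# "field-id" annotation this converter reads.
example_avro_schema = {
    "type": "record",
    "name": "example",
    "fields": [
        {"name": "id", "type": "long", "field-id": 1},
        # a 2-item union with "null" becomes an optional Iceberg field
        {"name": "data", "type": ["null", "string"], "field-id": 2},
    ],
}
example_iceberg_schema = AvroToIceberg.convert_avro_schema_to_iceberg(example_avro_schema)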
                     Literal.of(34),
                     Literal.of(35),
                     Literal.of(36.75),
                     Literal.of(8.75),
                     Literal.of("2017-11-29").to(DateType.get()),
                     Literal.of("11:30:0").to(TimeType.get()),
                     Literal.of("2017-11-29T11:30:07.123").to(TimestampType.without_timezone()),
                     Literal.of("2017-11-29T11:30:07.123+01:00").to(TimestampType.with_timezone()),
                     Literal.of("abc"),
                     Literal.of(uuid.uuid4()),
                     Literal.of(bytes([0x01, 0x02, 0x03])).to(FixedType.of_length(3)),
                     Literal.of(bytes([0x03, 0x04, 0x05, 0x06])).to(BinaryType.get()),
                     Literal.of(Decimal(122.50).quantize(Decimal(".01")))])
def literal(request):
    yield request.param


@pytest.fixture(scope="session",
                params=[(DecimalType.of(9, 0), "34"),
                        (DecimalType.of(9, 2), "34.00"),
                        (DecimalType.of(9, 4), "34.0000")])
def type_val_tuples(request):
    yield request.param


@pytest.fixture(scope="session",
def test_byte_buffer_conversions(self):
    # booleans are stored as 0x00 for 'false' and a non-zero byte for 'true'
    self.assertConversion(False, BooleanType.get(), b'\x00')
    self.assertConversion(True, BooleanType.get(), b'\x01')
    self.assertEqual(b'\x00', Literal.of(False).to_byte_buffer())
    self.assertEqual(b'\x01', Literal.of(True).to_byte_buffer())

    # integers are stored as 4 bytes in little-endian order
    # 84202 is 0...01|01001000|11101010 in binary
    # 11101010 -> 234 (-22), 01001000 -> 72, 00000001 -> 1, 00000000 -> 0
    self.assertConversion(84202, IntegerType.get(), bytes([234, 72, 1, 0]))
    self.assertEqual(bytes([234, 72, 1, 0]), Literal.of(84202).to_byte_buffer())

    # longs are stored as 8 bytes in little-endian order
    # 200L is 0...0|11001000 in binary
    # 11001000 -> 200 (-56), 00000000 -> 0, ... , 00000000 -> 0
    self.assertConversion(200, LongType.get(), bytes([200, 0, 0, 0, 0, 0, 0, 0]))
    self.assertEqual(bytes([200, 0, 0, 0, 0, 0, 0, 0]),
                     Literal.of(200).to(LongType.get()).to_byte_buffer())

    # floats are stored as 4 bytes in little-endian order
    # floating point numbers are represented as sign * 2^exponent * mantissa
    # -4.5F is -1 * 2^2 * 1.125 and encoded as 11000000|10010000|0...0 in binary
    # 00000000 -> 0, 00000000 -> 0, 10010000 -> 144 (-112), 11000000 -> 192 (-64)
    self.assertConversion(-4.5, FloatType.get(), bytes([0, 0, 144, 192]))
    self.assertEqual(bytes([0, 0, 144, 192]), Literal.of(-4.5).to_byte_buffer())

    # doubles are stored as 8 bytes in little-endian order
    # floating point numbers are represented as sign * 2^exponent * mantissa
    # 6.0 is 1 * 2^4 * 1.5 and encoded as 01000000|00011000|0...0
    # 00000000 -> 0, ... , 00011000 -> 24, 01000000 -> 64
    self.assertConversion(6.0, DoubleType.get(), bytes([0, 0, 0, 0, 0, 0, 24, 64]))
    self.assertEqual(bytes([0, 0, 0, 0, 0, 0, 24, 64]),
                     Literal.of(6.0).to(DoubleType.get()).to_byte_buffer())

    # dates are stored as days from 1970-01-01 in a 4-byte little-endian int
    # 1000 is 0...0|00000011|11101000 in binary
    # 11101000 -> 232 (-24), 00000011 -> 3, ... , 00000000 -> 0
    self.assertConversion(1000, DateType.get(), bytes([232, 3, 0, 0]))
    self.assertEqual(bytes([232, 3, 0, 0]), Literal.of(1000).to(DateType.get()).to_byte_buffer())

    # time is stored as microseconds from midnight in an 8-byte little-endian long
    # 10000L is 0...0|00100111|00010000 in binary
    # 00010000 -> 16, 00100111 -> 39, ... , 00000000 -> 0
    self.assertConversion(10000, TimeType.get(), bytes([16, 39, 0, 0, 0, 0, 0, 0]))
    self.assertEqual(bytes([16, 39, 0, 0, 0, 0, 0, 0]),
                     Literal.of(10000).to(LongType.get()).to(TimeType.get()).to_byte_buffer())

    # timestamps are stored as microseconds from 1970-01-01 00:00:00.000000 in an 8-byte little-endian long
    # 400000L is 0...110|00011010|10000000 in binary
    # 10000000 -> 128 (-128), 00011010 -> 26, 00000110 -> 6, ... , 00000000 -> 0
    self.assertConversion(400000, TimestampType.without_timezone(), bytes([128, 26, 6, 0, 0, 0, 0, 0]))
    self.assertConversion(400000, TimestampType.with_timezone(), bytes([128, 26, 6, 0, 0, 0, 0, 0]))
    self.assertEqual(bytes([128, 26, 6, 0, 0, 0, 0, 0]),
                     Literal.of(400000).to(LongType.get()).to(TimestampType.without_timezone()).to_byte_buffer())
    self.assertEqual(bytes([128, 26, 6, 0, 0, 0, 0, 0]),
                     Literal.of(400000).to(LongType.get()).to(TimestampType.with_timezone()).to_byte_buffer())

    # strings are stored as UTF-8 bytes (without length)
    # 'A' -> 65, 'B' -> 66, 'C' -> 67
    self.assertConversion("ABC", StringType.get(), bytes([65, 66, 67]))
    self.assertEqual(bytes([65, 66, 67]), Literal.of("ABC").to_byte_buffer())

    # uuids are stored as 16-byte big-endian values
    # f79c3e09-677c-4bbd-a479-3f349cb785e7 is encoded as F7 9C 3E 09 67 7C 4B BD A4 79 3F 34 9C B7 85 E7
    # 0xF7 -> 11110111 -> 247 (-9), 0x9C -> 10011100 -> 156 (-100), 0x3E -> 00111110 -> 62,
    # 0x09 -> 00001001 -> 9, 0x67 -> 01100111 -> 103, 0x7C -> 01111100 -> 124,
    # 0x4B -> 01001011 -> 75, 0xBD -> 10111101 -> 189 (-67), 0xA4 -> 10100100 -> 164 (-92),
    # 0x79 -> 01111001 -> 121, 0x3F -> 00111111 -> 63, 0x34 -> 00110100 -> 52,
    # 0x9C -> 10011100 -> 156 (-100), 0xB7 -> 10110111 -> 183 (-73), 0x85 -> 10000101 -> 133 (-123),
    # 0xE7 -> 11100111 -> 231 (-25)
    self.assertConversion(uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"),
                          UUIDType.get(),
                          bytes([247, 156, 62, 9, 103, 124, 75, 189,
                                 164, 121, 63, 52, 156, 183, 133, 231]))
    self.assertEqual(bytes([247, 156, 62, 9, 103, 124, 75, 189,
                            164, 121, 63, 52, 156, 183, 133, 231]),
                     Literal.of(uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7")).to_byte_buffer())

    # fixed values are stored directly
    # 'a' -> 97, 'b' -> 98
    self.assertConversion(bytes("ab", "utf8"), FixedType.of_length(2), bytes([97, 98]))
    self.assertEqual(bytes([97, 98]), Literal.of(bytes("ab", "utf8")).to_byte_buffer())

    # binary values are stored directly
    # 'Z' -> 90
    self.assertConversion(bytearray("Z", "utf8"), BinaryType.get(), bytes([90]))
    self.assertEqual(bytes([90]), Literal.of(bytearray("Z", "utf8")).to_byte_buffer())

    # decimals are stored as unscaled values in the form of two's-complement big-endian binary,
    # using the minimum number of bytes for the values
    # 345 is 0...1|01011001 in binary
    # 00000001 -> 1, 01011001 -> 89
    self.assertConversion(Decimal(3.45).quantize(Decimal(".01")), DecimalType.of(3, 2), bytes([1, 89]))
    self.assertEqual(bytes([1, 89]), Literal.of(3.45).to(DecimalType.of(3, 2)).to_byte_buffer())

    # decimal on 3-bytes to test that we use the minimum number of bytes and not a power of 2
    # 1234567 is 00010010|11010110|10000111 in binary
    # 00010010 -> 18, 11010110 -> 214, 10000111 -> 135
    self.assertConversion(Decimal(123.4567).quantize(Decimal(".0001")),
                          DecimalType.of(7, 4), bytes([18, 214, 135]))
    self.assertEqual(bytes([18, 214, 135]),
                     Literal.of(123.4567).to(DecimalType.of(7, 4)).to_byte_buffer())

    # negative decimal to test two's complement
    # -1234567 is 11101101|00101001|01111001 in binary
    # 11101101 -> 237, 00101001 -> 41, 01111001 -> 121
    self.assertConversion(Decimal(-123.4567).quantize(Decimal(".0001")),
                          DecimalType.of(7, 4), bytes([237, 41, 121]))
    self.assertEqual(bytes([237, 41, 121]),
                     Literal.of(-123.4567).to(DecimalType.of(7, 4)).to_byte_buffer())

    # test empty byte in decimal
    # 11 is 00001011 in binary
    # 00001011 -> 11
    self.assertConversion(Decimal(0.011).quantize(Decimal(".001")),
                          DecimalType.of(10, 3), bytes([11]))
    self.assertEqual(bytes([11]),
                     Literal.of(0.011).to(DecimalType.of(10, 3)).to_byte_buffer())
import logging

from iceberg.api.types import (BinaryType,
                               BooleanType,
                               DateType,
                               DecimalType,
                               DoubleType,
                               FixedType,
                               FloatType,
                               IntegerType,
                               LongType,
                               NestedField,
                               StringType,
                               StructType,
                               TimestampType)
from iceberg.api.types import Type
import pyarrow as pa
from pyarrow.parquet import lib, ParquetFile

_logger = logging.getLogger(__name__)

arrow_type_map = {lib.Type_BOOL: lambda x=None: BooleanType.get(),
                  lib.Type_DATE32: lambda x=None: DateType.get(),
                  lib.Type_DECIMAL128: lambda x=None: DecimalType.of(x.precision, x.scale),
                  lib.Type_DOUBLE: lambda x=None: DoubleType.get(),
                  lib.Type_FIXED_SIZE_BINARY: lambda x=None: FixedType.of_length(x.byte_width),
                  lib.Type_BINARY: lambda x=None: BinaryType.get(),
                  lib.Type_FLOAT: lambda x=None: FloatType.get(),
                  lib.Type_STRING: lambda x=None: StringType.get(),
                  lib.Type_INT32: lambda x=None: IntegerType.get(),
                  lib.Type_INT64: lambda x=None: LongType.get(),
                  lib.Type_TIMESTAMP: lambda x=None: (TimestampType.without_timezone()
                                                      if x.tz is None
                                                      else TimestampType.with_timezone())}


def get_nested_field(field_id: int, field_name: str, field_type: Type, nullable: bool) -> NestedField:
    if nullable:
        return NestedField.optional(field_id, field_name, field_type)
    else:
        return NestedField.required(field_id, field_name, field_type)
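# A hedged sketch of how arrow_type_map and get_nested_field might combine to
# translate a single pyarrow field; arrow_field_to_iceberg and the caller-
# supplied field_id are assumptions for illustration, not part of the module
# above.
def arrow_field_to_iceberg(arrow_field: pa.Field, field_id: int) -> NestedField:
    # look up the Iceberg type factory by the pyarrow type id, passing the
    # arrow type through for parameterized types (decimal, fixed, timestamp)
    iceberg_type = arrow_type_map[arrow_field.type.id](arrow_field.type)
    return get_nested_field(field_id, arrow_field.name, iceberg_type, arrow_field.nullable)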
from iceberg.api.types import (BinaryType,
                               BooleanType,
                               DateType,
                               DecimalType,
                               DoubleType,
                               FixedType,
                               FloatType,
                               IntegerType,
                               LongType,
                               StringType,
                               TimestampType,
                               TimeType,
                               UUIDType)

PRIMITIVES = [BinaryType.get(),
              BooleanType.get(),
              DateType.get(),
              DecimalType.of(9, 2),
              DecimalType.of(11, 2),
              DecimalType.of(9, 3),
              DoubleType.get(),
              FixedType.of_length(3),
              FixedType.of_length(4),
              FloatType.get(),
              IntegerType.get(),
              LongType.get(),
              StringType.get(),
              TimestampType.with_timezone(),
              TimestampType.without_timezone(),
              TimeType.get(),