def get_list_field(col_type: pa.ListType) -> ListType:
    # The Parquet field id is carried on the element field's metadata as bytes under b"PARQUET:field_id".
    field_id = int(col_type.value_field.metadata[b"PARQUET:field_id"].decode("utf-8"))
    element_type = arrow_type_map[col_type.value_field.type.id](col_type.value_field.type)

    if col_type.value_field.nullable:
        return ListType.of_optional(field_id, element_type)
    return ListType.of_required(field_id, element_type)
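
# Hedged usage sketch (not part of the original module): builds a PyArrow list type whose
# element field carries the Parquet field-id annotation that get_list_field reads, then
# converts it. Assumes `import pyarrow as pa` and the module-level arrow_type_map are in
# scope, and that arrow_type_map maps pa.int32()'s type id to a callable returning IntegerType.
def _example_get_list_field():
    element = pa.field("element", pa.int32(), nullable=True,
                       metadata={b"PARQUET:field_id": b"3"})
    # Expected result: ListType.of_optional(3, IntegerType.get())
    return get_list_field(pa.list_(element))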
def list_from_dict(dict_obj):
    elem_id = dict_obj.get(SchemaParser.ELEMENT_ID)
    elem_type = SchemaParser.type_from_dict(dict_obj.get(SchemaParser.ELEMENT))
    is_required = dict_obj.get(SchemaParser.REQUIRED)

    if is_required:
        return ListType.of_required(elem_id, elem_type)
    return ListType.of_optional(elem_id, elem_type)
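
# Hedged usage sketch (illustrative only): the dict mirrors the parsed JSON layout that
# SchemaParser consumes; using the SchemaParser.* constants avoids hard-coding the literal
# key strings. Assumes type_from_dict accepts the primitive name "int" for the element.
def _example_list_from_dict():
    dict_obj = {
        SchemaParser.ELEMENT_ID: 3,
        SchemaParser.ELEMENT: "int",
        SchemaParser.REQUIRED: False,
    }
    # Expected result: ListType.of_optional(3, IntegerType.get())
    return list_from_dict(dict_obj)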
def get_type(partition_type):
    return StructType.of([
        NestedField.required(100, "file_path", StringType.get()),
        NestedField.required(101, "file_format", StringType.get()),
        NestedField.required(102, "partition", partition_type),
        NestedField.required(103, "record_count", LongType.get()),
        NestedField.required(104, "file_size_in_bytes", LongType.get()),
        NestedField.required(105, "block_size_in_bytes", LongType.get()),
        NestedField.optional(106, "file_ordinal", IntegerType.get()),
        NestedField.optional(107, "sort_columns", ListType.of_required(112, IntegerType.get())),
        NestedField.optional(108, "column_sizes",
                             MapType.of_required(117, 118, IntegerType.get(), LongType.get())),
        NestedField.optional(109, "value_counts",
                             MapType.of_required(119, 120, IntegerType.get(), LongType.get())),
        NestedField.optional(110, "null_value_counts",
                             MapType.of_required(121, 122, IntegerType.get(), LongType.get())),
        NestedField.optional(125, "lower_bounds",
                             MapType.of_required(126, 127, IntegerType.get(), BinaryType.get())),
        NestedField.optional(128, "upper_bounds",
                             MapType.of_required(129, 130, IntegerType.get(), BinaryType.get()))
    ]  # NEXT ID TO ASSIGN: 131
    )
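
# Hedged usage sketch (illustrative only): builds the data-file struct for a table
# partitioned by a single string column. The partition field id 1000 and name "ds" are
# assumed, illustrative values; they are not taken from the original code.
def _example_get_type():
    partition_type = StructType.of([
        NestedField.required(1000, "ds", StringType.get())
    ])
    return get_type(partition_type)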
def test_unnested_complex_types(unnested_complex_type_test_parquet_file):
    expected_schema = Schema([
        NestedField.optional(1, "list_int_col", ListType.of_optional(3, IntegerType.get())),
        NestedField.optional(4, "list_str_col", ListType.of_optional(6, StringType.get())),
        NestedField.optional(7, "struct_col", StructType.of([
            NestedField.optional(8, "f1", IntegerType.get()),
            NestedField.optional(9, "f2", StringType.get())
        ]))
    ])
    converted_schema = convert_parquet_to_iceberg(unnested_complex_type_test_parquet_file)
    compare_schema(expected_schema, converted_schema)
def convert_array_type(avro_field, next_id=None):
    # Converts an Avro "array" schema node into an Iceberg ListType. Only primitive
    # element types are supported so far; complex element types raise.
    avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
    if avro_field_type != "array":
        raise RuntimeError("Avro type must be array: %s" % avro_field_type)

    element_id = avro_field.get(AvroToIceberg.FIELD_ELEMENT_ID_PROP)
    items = avro_field.get(AvroToIceberg.FIELD_ITEMS_PROP)
    is_optional = AvroToIceberg.is_option_schema(items)

    if isinstance(items, str) and items in AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP:
        item_type = AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(items)
        if item_type is None:
            raise RuntimeError("No mapping found for type %s" % items)
    else:
        raise RuntimeError("Complex list types not yet implemented")

    if is_optional:
        return ListType.of_optional(element_id, item_type), next_id
    return ListType.of_required(element_id, item_type), next_id
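
# Hedged usage sketch (illustrative only): an Avro array field with a primitive element
# type, expressed through the AvroToIceberg.* property constants rather than literal key
# strings. Assumes is_option_schema treats a plain "int" element as non-optional, so the
# expected result would be (ListType.of_required(3, IntegerType.get()), None).
def _example_convert_array_type():
    avro_field = {
        AvroToIceberg.FIELD_TYPE_PROP: "array",
        AvroToIceberg.FIELD_ELEMENT_ID_PROP: 3,
        AvroToIceberg.FIELD_ITEMS_PROP: "int",
    }
    return convert_array_type(avro_field)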