def schema():
    """Test schema: required id plus columns covering each stats/null pattern."""
    fields = [
        NestedField.required(1, "id", IntegerType.get()),
        NestedField.optional(2, "no_stats", IntegerType.get()),
        NestedField.required(3, "required", StringType.get()),
        NestedField.optional(4, "all_nulls", StringType.get()),
        NestedField.optional(5, "some_nulls", StringType.get()),
        NestedField.optional(6, "no_nulls", StringType.get()),
    ]
    return Schema(*fields)
def test_multiple_fields(assert_and_unwrap):
    """Binding by name resolves to the matching field among several candidates."""
    struct = StructType.of([
        NestedField.required(10, 'x', IntegerType.get()),
        NestedField.required(11, 'y', IntegerType.get()),
        NestedField.required(12, 'z', IntegerType.get()),
    ])
    predicate = UnboundPredicate(Operation.LT, Expressions.ref("y"), 6)
    bound = assert_and_unwrap(predicate.bind(struct))
    # 'y' is field id 11; the operation and literal carry over unchanged
    assert bound.ref.field.field_id == 11
    assert bound.op == Operation.LT
    assert bound.lit.value == 6
def inc_man_file():
    """Manifest-file fixture with one field summary per null-pattern column."""
    int_lower = Conversions.to_byte_buffer(IntegerType.get(), 30)
    int_upper = Conversions.to_byte_buffer(IntegerType.get(), 79)
    str_lower = Conversions.to_byte_buffer(StringType.get(), 'a')
    str_upper = Conversions.to_byte_buffer(StringType.get(), 'z')
    summaries = (
        TestFieldSummary(False, int_lower, int_upper),
        TestFieldSummary(True, None, None),             # contains nulls, no bounds
        TestFieldSummary(True, str_lower, str_upper),   # contains nulls, bounded
        TestFieldSummary(False, str_lower, str_upper),  # no nulls, bounded
    )
    return TestManifestFile("manifest-list.avro", 1024, 0, int(time.time() * 1000),
                            5, 10, 0, summaries)
def test_unnested_complex_types(unnested_complex_type_test_parquet_file):
    """Top-level parquet lists and structs convert to matching Iceberg types."""
    struct_field = NestedField.optional(
        7, "struct_col",
        StructType.of([NestedField.optional(8, "f1", IntegerType.get()),
                       NestedField.optional(9, "f2", StringType.get())]))
    expected_schema = Schema([
        NestedField.optional(1, "list_int_col", ListType.of_optional(3, IntegerType.get())),
        NestedField.optional(4, "list_str_col", ListType.of_optional(6, StringType.get())),
        struct_field,
    ])
    converted = convert_parquet_to_iceberg(unnested_complex_type_test_parquet_file)
    compare_schema(expected_schema, converted)
def test_from_bytes(self):
    """Decode little-endian buffers back to int, long, and double values."""
    exact_cases = [
        (IntegerType.get(), b'\xd2\x04\x00\x00', 1234),
        (LongType.get(), b'\xd2\x04\x00\x00\x00\x00\x00\x00', 1234),
    ]
    for iceberg_type, buf, expected in exact_cases:
        self.assertEqual(expected, Conversions.from_byte_buffer(iceberg_type, buf))
    # floating point compares approximately
    self.assertAlmostEqual(
        1.2345,
        Conversions.from_byte_buffer(DoubleType.get(), b'\x8d\x97\x6e\x12\x83\xc0\xf3\x3f'))
def strict_file():
    """Data-file fixture (50 rows) with counts for columns 4-6 and bounds for 1 and 7."""
    value_counts = {4: 50, 5: 50, 6: 50}
    null_counts = {4: 50, 5: 10, 6: 0}
    lower_bounds = {1: Conversions.to_byte_buffer(IntegerType.get(), 30),
                    7: Conversions.to_byte_buffer(IntegerType.get(), 5)}
    upper_bounds = {1: Conversions.to_byte_buffer(IntegerType.get(), 79),
                    7: Conversions.to_byte_buffer(IntegerType.get(), 5)}
    return TestDataFile("file.avro", TestHelpers.Row.of(), 50,
                        value_counts, null_counts, lower_bounds, upper_bounds)
def test_column_rename(primitive_type_test_file):
    """Reading through a schema with renamed columns still maps data by field id."""
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get())
    ])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)
    arrow_schema = pa.schema([
        pa.field("int_col", pa.int32(), False),
        pa.field("bigint_col", pa.int64(), True),
        pa.field("string_col", pa.string(), True),
        pa.field("float_col", pa.float32(), True),
        pa.field("dbl_col", pa.float64(), True)
    ])
    columns = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64())
    ]
    expected_table = pa.table(columns, schema=arrow_schema)
    assert expected_table == reader.read()
def test_comparison_predicate_binding(op, assert_and_unwrap):
    """Each comparison op binds to the referenced field with its literal intact."""
    struct = StructType.of([NestedField.required(14, "x", IntegerType.get())])
    bound = assert_and_unwrap(
        UnboundPredicate(op, Expressions.ref("x"), 5).bind(struct))
    assert bound.lit.value == 5
    assert bound.ref.field.field_id == 14
    assert bound.op == op
def test_missing_field():
    """Binding a reference to a nonexistent column must raise ValidationException.

    Raises an explicit failure if bind() returns normally — the original
    try/except passed silently when no exception was raised, asserting nothing.
    """
    struct = StructType.of([NestedField.required(13, "x", IntegerType.get())])
    unbound = UnboundPredicate(Operation.LT, Expressions.ref("missing"), 6)
    try:
        unbound.bind(struct)
    except ValidationException as e:
        # message format comes from the binder
        assert e.args[0].startswith("Cannot find field 'missing' in struct")
    else:
        raise AssertionError(
            "Expected ValidationException when binding a missing field")
def test_from_bytes(self):
    """Decode a little-endian byte buffer into each primitive Iceberg type."""
    # booleans: single byte 0 / 1
    self.assertEqual(
        False, Conversions.from_byte_buffer(BooleanType.get(), b'\x00'))
    self.assertEqual(
        True, Conversions.from_byte_buffer(BooleanType.get(), b'\x01'))
    # 1234 as a 4-byte int and an 8-byte long
    self.assertEqual(
        1234, Conversions.from_byte_buffer(IntegerType.get(), b'\xd2\x04\x00\x00'))
    self.assertEqual(
        1234,
        Conversions.from_byte_buffer(LongType.get(), b'\xd2\x04\x00\x00\x00\x00\x00\x00'))
    # floating point values compare approximately
    self.assertAlmostEqual(1.2345,
                           Conversions.from_byte_buffer(FloatType.get(), b'\x19\x04\x9e?'),
                           places=5)
    self.assertAlmostEqual(
        1.2345,
        Conversions.from_byte_buffer(DoubleType.get(), b'\x8d\x97\x6e\x12\x83\xc0\xf3\x3f'))
    # date shares the 4-byte int encoding; time/timestamp share the 8-byte long one
    self.assertEqual(
        1234, Conversions.from_byte_buffer(DateType.get(), b'\xd2\x04\x00\x00'))
    self.assertEqual(
        100000000000,
        Conversions.from_byte_buffer(TimeType.get(), b'\x00\xe8vH\x17\x00\x00\x00'))
    self.assertEqual(
        100000000000,
        Conversions.from_byte_buffer(TimestampType.with_timezone(), b'\x00\xe8vH\x17\x00\x00\x00'))
    self.assertEqual(
        100000000000,
        Conversions.from_byte_buffer(TimestampType.without_timezone(), b'\x00\xe8vH\x17\x00\x00\x00'))
    # strings decode from bytes; fixed and binary pass bytes through unchanged
    self.assertEqual(
        "foo", Conversions.from_byte_buffer(StringType.get(), b'foo'))
    self.assertEqual(
        uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"),
        Conversions.from_byte_buffer(UUIDType.get(),
                                     b'\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7'))
    self.assertEqual(
        b'foo', Conversions.from_byte_buffer(FixedType.of_length(3), b'foo'))
    self.assertEqual(
        b'foo', Conversions.from_byte_buffer(BinaryType.get(), b'foo'))
    # decimals: unscaled integer bytes, e.g. 0x3039 == 12345 -> 123.45 at scale 2
    self.assertEqual(
        Decimal(123.45).quantize(Decimal(".01")),
        Conversions.from_byte_buffer(DecimalType.of(5, 2), b'\x30\x39'))
    self.assertEqual(
        Decimal(123.4567).quantize(Decimal(".0001")),
        Conversions.from_byte_buffer(DecimalType.of(5, 4), b'\x00\x12\xd6\x87'))
    # negative unscaled values are two's-complement encoded
    self.assertEqual(
        Decimal(-123.4567).quantize(Decimal(".0001")),
        Conversions.from_byte_buffer(DecimalType.of(5, 4), b'\xff\xed\x29\x79'))
def get_type(partition_type):
    """Return the data_file StructType for manifests with the given partition type.

    Field ids are fixed constants (100+ for columns, higher ids for nested
    list/map element and key/value fields) — presumably mandated by the table
    format; confirm against the Iceberg spec before changing any of them.
    """
    return StructType.of([
        NestedField.required(100, "file_path", StringType.get()),
        NestedField.required(101, "file_format", StringType.get()),
        NestedField.required(102, "partition", partition_type),
        NestedField.required(103, "record_count", LongType.get()),
        NestedField.required(104, "file_size_in_bytes", LongType.get()),
        NestedField.required(105, "block_size_in_bytes", LongType.get()),
        NestedField.optional(106, "file_ordinal", IntegerType.get()),
        NestedField.optional(107, "sort_columns",
                             ListType.of_required(112, IntegerType.get())),
        NestedField.optional(
            108, "column_sizes",
            MapType.of_required(117, 118, IntegerType.get(), LongType.get())),
        NestedField.optional(
            109, "value_counts",
            MapType.of_required(119, 120, IntegerType.get(), LongType.get())),
        NestedField.optional(
            110, "null_value_counts",
            MapType.of_required(121, 122, IntegerType.get(), LongType.get())),
        NestedField.optional(
            125, "lower_bounds",
            MapType.of_required(126, 127, IntegerType.get(), BinaryType.get())),
        NestedField.optional(
            128, "upper_bounds",
            MapType.of_required(129, 130, IntegerType.get(), BinaryType.get()))
    ]  # NEXT ID TO ASSIGN: 131
    )
def inc_man_spec():
    """Identity partition spec (spec id 0) over id and the null-pattern columns."""
    inc_schema = Schema(
        NestedField.required(1, "id", IntegerType.get()),
        NestedField.optional(4, "all_nulls", StringType.get()),
        NestedField.optional(5, "some_nulls", StringType.get()),
        NestedField.optional(6, "no_nulls", StringType.get()))
    builder = PartitionSpec.builder_for(inc_schema).with_spec_id(0)
    for column in ("id", "all_nulls", "some_nulls", "no_nulls"):
        builder = builder.identity(column)
    return builder.build()
def test_table_scan_honors_select_without_case_sensitivity(ts_table):
    """select() and case_sensitive(False) compose in either order."""
    expected = Schema([NestedField.required(1, "id", IntegerType.get())]).as_struct()
    # order of refinements shouldn't matter
    scans = [ts_table.new_scan().case_sensitive(False).select(["ID"]),
             ts_table.new_scan().select(["ID"]).case_sensitive(False)]
    for scan in scans:
        assert scan.schema.as_struct() == expected
def test_schema_evolution_filter(primitive_type_test_file):
    """Filter on a column added after the file was written (all values null):

    not_null("new_col") must read zero rows; is_null("new_col") must read all.
    """
    # ids 15/16 are new columns that do not exist in the underlying file
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(16, "other_new_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(15, "new_col", StringType.get())
    ])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.not_null("new_col"), True)
    schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("other_new_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
        pa.field("new_col", pa.string(), nullable=True)
    ])
    # NOTE(review): the third empty array is int32 while the schema column is
    # int64 — presumably pa.table casts empty arrays; confirm.
    pyarrow_not_null_array = [
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.int64()),
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.float32()),
        pa.array([], type=pa.float64()),
        pa.array([], type=pa.string()),
        pa.array([], type=pa.string())
    ]
    not_null_table = pa.table(pyarrow_not_null_array, schema=schema)
    pyarrow_null_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([None, None, None, None, None], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([None, None, None, None, None], type=pa.string())
    ]
    null_table = pa.table(pyarrow_null_array, schema=schema)
    # not_null on an all-null column: empty result
    target_table = reader.read()
    assert not_null_table == target_table
    # is_null on the same column: every row comes back
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.is_null("new_col"),
                           True)
    target_table = reader.read()
    assert null_table == target_table
def test_long_to_integer_conversion(assert_and_unwrap):
    """Long literals bound against an int column:

    values outside the int range fold to always_true/always_false depending on
    the operation; values exactly at the boundaries stay bound unchanged.
    """
    struct = StructType.of([NestedField.required(17, "i", IntegerType.get())])
    # above int max: < and <= are trivially true
    lt = UnboundPredicate(Operation.LT, Expressions.ref("i"), Literal.JAVA_MAX_INT + 1)
    assert lt.bind(struct) == Expressions.always_true()
    lt_eq = UnboundPredicate(Operation.LT_EQ, Expressions.ref("i"), Literal.JAVA_MAX_INT + 1)
    assert lt_eq.bind(struct) == Expressions.always_true()
    # below int min: > and >= are trivially true
    gt = UnboundPredicate(Operation.GT, Expressions.ref("i"), Literal.JAVA_MIN_INT - 1)
    assert gt.bind(struct) == Expressions.always_true()
    gt_eq = UnboundPredicate(Operation.GT_EQ, Expressions.ref("i"), Literal.JAVA_MIN_INT - 1)
    assert gt_eq.bind(struct) == Expressions.always_true()
    # above int max: > and >= can never match
    gt_max = UnboundPredicate(Operation.GT, Expressions.ref("i"), Literal.JAVA_MAX_INT + 1)
    assert gt_max.bind(struct) == Expressions.always_false()
    gt_eq_max = UnboundPredicate(Operation.GT_EQ, Expressions.ref("i"), Literal.JAVA_MAX_INT + 1)
    assert gt_eq_max.bind(struct) == Expressions.always_false()
    # below int min: < and <= can never match
    lt_min = UnboundPredicate(Operation.LT, Expressions.ref("i"), Literal.JAVA_MIN_INT - 1)
    assert lt_min.bind(struct) == Expressions.always_false()
    lt_eq_min = UnboundPredicate(Operation.LT_EQ, Expressions.ref("i"), Literal.JAVA_MIN_INT - 1)
    assert lt_eq_min.bind(struct) == Expressions.always_false()
    # literals exactly at the int boundaries bind with their value preserved
    lt_expr = UnboundPredicate(Operation.LT, Expressions.ref("i"),
                               Literal.JAVA_MAX_INT).bind(struct)
    lt_max = assert_and_unwrap(lt_expr)
    assert lt_max.lit.value == Literal.JAVA_MAX_INT
    lt_eq_expr = UnboundPredicate(Operation.LT_EQ, Expressions.ref("i"),
                                  Literal.JAVA_MAX_INT).bind(struct)
    lt_eq_max = assert_and_unwrap(lt_eq_expr)
    assert lt_eq_max.lit.value == Literal.JAVA_MAX_INT
    gt_expr = UnboundPredicate(Operation.GT, Expressions.ref("i"),
                               Literal.JAVA_MIN_INT).bind(struct)
    gt_min = assert_and_unwrap(gt_expr)
    assert gt_min.lit.value == Literal.JAVA_MIN_INT
    gt_eq_expr = UnboundPredicate(Operation.GT_EQ, Expressions.ref("i"),
                                  Literal.JAVA_MIN_INT).bind(struct)
    gt_eq_min = assert_and_unwrap(gt_eq_expr)
    assert gt_eq_min.lit.value == Literal.JAVA_MIN_INT
def file():
    """Basic data-file fixture: 50 records, stats on columns 4-6, bounds on column 1."""
    stats_value_counts = {4: 50, 5: 50, 6: 50}
    stats_null_counts = {4: 50, 5: 10, 6: 0}
    lower = {1: Conversions.to_byte_buffer(IntegerType.get(), 30)}
    upper = {1: Conversions.to_byte_buffer(IntegerType.get(), 79)}
    return TestDataFile("file.avro", TestHelpers.Row.of(), 50,
                        stats_value_counts, stats_null_counts, lower, upper)
def test_raise_exception_with_invalid_json():
    """A spec JSON that mixes fields with and without field-ids must be rejected."""
    spec_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
                         NestedField.required(2, "data", StringType.get()),
                         NestedField.required(3, "num", DecimalType.of(9, 2)))
    spec_string = ('{"spec-id": 0, "fields": ['
                   '{"name": "id", "transform": "identity", "source-id": 1, "field-id": 1000}, '
                   '{"name": "data_bucket", "transform": "bucket[16]", "source-id": 2, "field-id": 1001}, '
                   '{"name": "data1", "transform": "bucket[16]", "source-id": 2}, '
                   '{"name": "data2", "transform": "bucket[8]", "source-id": 2}, '
                   '{"name": "num_bucket", "transform": "bucket[8]", "source-id": 3}]}')
    with pytest.raises(RuntimeError):
        PartitionSpecParser.from_json(spec_schema, spec_string)
def test_to_json_conversion():
    """Identity + bucket partition fields serialize to the expected JSON."""
    spec_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
                         NestedField.required(2, "data", StringType.get()))
    spec = (PartitionSpec
            .builder_for(spec_schema)
            .identity("id")
            .bucket("data", 16)
            .build())
    expected = ('{"spec-id": 0, "fields": ['
                '{"name": "id", "transform": "identity", "source-id": 1}, '
                '{"name": "data_bucket", "transform": "bucket[16]", "source-id": 2}]}')
    assert PartitionSpecParser.to_json(spec) == expected
def test_partition_spec(self):
    """Every transform applied to every applicable column type must survive
    round-trip serialization unchanged (spec == round_trip_serialize(spec))."""
    schema = Schema(NestedField.required(1, "i", IntegerType.get()),
                    NestedField.required(2, "l", LongType.get()),
                    NestedField.required(3, "d", DateType.get()),
                    NestedField.required(4, "t", TimeType.get()),
                    NestedField.required(5, "ts", TimestampType.without_timezone()),
                    NestedField.required(6, "dec", DecimalType.of(9, 2)),
                    NestedField.required(7, "s", StringType.get()),
                    NestedField.required(8, "u", UUIDType.get()),
                    NestedField.required(9, "f", FixedType.of_length(3)),
                    NestedField.required(10, "b", BinaryType.get()))
    # one single-field spec per (transform, column) combination
    specs = [PartitionSpec.builder_for(schema).identity("i").build(),
             PartitionSpec.builder_for(schema).identity("l").build(),
             PartitionSpec.builder_for(schema).identity("d").build(),
             PartitionSpec.builder_for(schema).identity("t").build(),
             PartitionSpec.builder_for(schema).identity("ts").build(),
             PartitionSpec.builder_for(schema).identity("dec").build(),
             PartitionSpec.builder_for(schema).identity("s").build(),
             PartitionSpec.builder_for(schema).identity("u").build(),
             PartitionSpec.builder_for(schema).identity("f").build(),
             PartitionSpec.builder_for(schema).identity("b").build(),
             PartitionSpec.builder_for(schema).bucket("i", 128).build(),
             PartitionSpec.builder_for(schema).bucket("l", 128).build(),
             PartitionSpec.builder_for(schema).bucket("d", 128).build(),
             PartitionSpec.builder_for(schema).bucket("t", 128).build(),
             PartitionSpec.builder_for(schema).bucket("ts", 128).build(),
             PartitionSpec.builder_for(schema).bucket("dec", 128).build(),
             PartitionSpec.builder_for(schema).bucket("s", 128).build(),
             PartitionSpec.builder_for(schema).bucket("u", 128).build(),
             PartitionSpec.builder_for(schema).bucket("f", 128).build(),
             PartitionSpec.builder_for(schema).bucket("b", 128).build(),
             PartitionSpec.builder_for(schema).year("d").build(),
             PartitionSpec.builder_for(schema).month("d").build(),
             PartitionSpec.builder_for(schema).day("d").build(),
             PartitionSpec.builder_for(schema).year("ts").build(),
             PartitionSpec.builder_for(schema).month("ts").build(),
             PartitionSpec.builder_for(schema).day("ts").build(),
             PartitionSpec.builder_for(schema).hour("ts").build(),
             PartitionSpec.builder_for(schema).truncate("i", 10).build(),
             PartitionSpec.builder_for(schema).truncate("l", 10).build(),
             PartitionSpec.builder_for(schema).truncate("dec", 10).build(),
             PartitionSpec.builder_for(schema).truncate("s", 10).build(),
             # unknown transform names should also round-trip
             PartitionSpec.builder_for(schema).add_without_field_id(
                 6, "dec_unsupported", "unsupported").build(),
             PartitionSpec.builder_for(schema).add(
                 6, 1111, "dec_unsupported", "unsupported").build(),
             ]
    for spec in specs:
        self.assertEqual(spec, TestHelpers.round_trip_serialize(spec))
def rg_expected_schema():
    """Expected Iceberg schema fixture for the row-group / stats filtering tests.

    Includes columns with no stats, all nulls, and a column missing from the
    file so evaluators can be exercised against each case.
    """
    return Schema([
        NestedField.required(1, "string_col", StringType.get()),
        NestedField.required(2, "long_col", LongType.get()),
        NestedField.required(3, "int_col", IntegerType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "null_col", StringType.get()),
        NestedField.optional(6, "missing_col", StringType.get()),
        NestedField.optional(7, "no_stats_col", StringType.get()),
        NestedField.optional(8, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(9, "ts_wotz_col", TimestampType.without_timezone()),
        NestedField.optional(10, "big_decimal_type", DecimalType.of(38, 5)),
        NestedField.optional(11, "small_decimal_type", DecimalType.of(10, 2)),
        NestedField.optional(12, "date_type", DateType.get()),
    ])
def test_bucket_hash(self):
    """Bucket transform hash values match the fixed reference values."""
    cases = [
        (Transforms.bucket(IntegerType.get(), 100), 34, 2017239379),
        (Transforms.bucket(LongType.get(), 100), 34, 2017239379),
        (Transforms.bucket(DateType.get(), 100), 17486, -653330422),
        (Transforms.bucket(TimeType.get(), 100), 81068000000, -662762989),
        (Transforms.bucket(TimestampType.without_timezone(), 100),
         1510871468000000, -2047944441),
        (Transforms.bucket(DecimalType.of(9, 2), 100),
         decimal.Decimal("14.20"), -500754589),
        (Transforms.bucket(StringType.get(), 100), "iceberg", 1210000089),
        (Transforms.bucket(UUIDType.get(), 100),
         uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), 1488055340),
        (Transforms.bucket(FixedType.of_length(3), 128), b'foo', -156908512),
        (Transforms.bucket(BinaryType.get(), 128), b'\x00\x01\x02\x03', -188683207),
    ]
    for transform, value, expected_hash in cases:
        self.assertEqual(expected_hash, transform.hash(value))
def test_primitive_types(primitive_type_test_parquet_file):
    """All primitive parquet columns convert to their Iceberg counterparts."""
    fields = [
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get())
    ]
    converted = convert_parquet_to_iceberg(primitive_type_test_parquet_file)
    compare_schema(Schema(fields), converted)
def test_projection(primitive_type_test_file, pyarrow_primitive_array, pyarrow_schema):
    """Projecting only the first two columns drops the rest from the read."""
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get())
    ])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)
    # build the expectation by peeling trailing columns until two remain
    projected = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    while projected.num_columns > 2:
        projected = projected.remove_column(projected.num_columns - 1)
    assert projected == reader.read()
def supported_primitives():
    """Struct covering every primitive type exercised by the conversion tests.

    Field ids must be unique within a struct; the decimal fields previously
    all reused id 114 — renumbered to 114/115/116.
    """
    return StructType.of([
        NestedField.required(100, "id", LongType.get()),
        NestedField.optional(101, "data", StringType.get()),
        NestedField.required(102, "b", BooleanType.get()),
        NestedField.optional(103, "i", IntegerType.get()),
        NestedField.required(104, "l", LongType.get()),
        NestedField.optional(105, "f", FloatType.get()),
        NestedField.required(106, "d", DoubleType.get()),
        NestedField.optional(107, "date", DateType.get()),
        NestedField.required(108, "ts", TimestampType.with_timezone()),
        NestedField.required(110, "s", StringType.get()),
        NestedField.required(111, "uuid", UUIDType.get()),
        NestedField.required(112, "fixed", FixedType.of_length(7)),
        NestedField.optional(113, "bytes", BinaryType.get()),
        NestedField.required(114, "dec_9_0", DecimalType.of(9, 0)),
        NestedField.required(115, "dec_11_2", DecimalType.of(11, 2)),
        NestedField.required(116, "dec_38_10", DecimalType.of(38, 10))
    ])
def test_compound_filter(primitive_type_test_file):
    """AND of two equality predicates narrows the read down to a single row."""
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get())
    ])
    filter_expr = Expressions.and_(Expressions.equal("string_col", "us"),
                                   Expressions.equal("int_col", 1))
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {}, filter_expr, True)
    # only the first row satisfies string_col == 'us' AND int_col == 1
    expected_table = pa.table(
        [pa.array([1], type=pa.int32()),
         pa.array([1], type=pa.int64()),
         pa.array([1.0], type=pa.float32()),
         pa.array([1.0], type=pa.float64()),
         pa.array(['us'], type=pa.string())],
        schema=pa.schema([pa.field("int_col", pa.int32(), nullable=False),
                          pa.field("bigint_col", pa.int64(), nullable=True),
                          pa.field("float_col", pa.float32(), nullable=True),
                          pa.field("dbl_col", pa.float64(), nullable=True),
                          pa.field("string_col", pa.string(), nullable=True)]))
    assert expected_table == reader.read()
def test_decimal_column_add(primitive_type_test_file):
    """A decimal column added after write reads back as all nulls."""
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(13, "new_dec_col", DecimalType.of(38, 9))
    ])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)
    columns = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        # field id 13 is absent from the file, so every value is null
        pa.array([None, None, None, None, None], type=pa.decimal128(38, 9))
    ]
    expected_table = pa.table(
        columns,
        schema=pa.schema([pa.field("int_col", pa.int32(), nullable=False),
                          pa.field("bigint_col", pa.int64(), nullable=True),
                          pa.field("float_col", pa.float32(), nullable=True),
                          pa.field("dbl_col", pa.float64(), nullable=True),
                          pa.field("new_dec_col", pa.decimal128(38, 9), nullable=True)]))
    assert expected_table == reader.read()
def test_to_json_conversion():
    """Explicit and auto-assigned field ids both serialize into the JSON."""
    spec_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
                         NestedField.required(2, "data", StringType.get()),
                         NestedField.required(3, "num", DecimalType.of(9, 2)))
    spec = (PartitionSpec
            .builder_for(spec_schema)
            .identity("id")
            .bucket("data", 16)
            .add_without_field_id(2, "data1", "bucket[16]")
            .add(2, 1010, "data2", "bucket[8]")
            .bucket("num", 8)
            .build())
    # auto-assigned ids continue from the highest explicit id (1010 -> 1011)
    expected = ('{"spec-id": 0, "fields": ['
                '{"name": "id", "transform": "identity", "source-id": 1, "field-id": 1000}, '
                '{"name": "data_bucket", "transform": "bucket[16]", "source-id": 2, "field-id": 1001}, '
                '{"name": "data1", "transform": "bucket[16]", "source-id": 2, "field-id": 1002}, '
                '{"name": "data2", "transform": "bucket[8]", "source-id": 2, "field-id": 1010}, '
                '{"name": "num_bucket", "transform": "bucket[8]", "source-id": 3, "field-id": 1011}]}')
    assert PartitionSpecParser.to_json(spec) == expected
def test_basic_read(primitive_type_test_file, pyarrow_primitive_array, pyarrow_schema):
    """An always-true filter reads the file back exactly as written."""
    fields = [
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get())
    ]
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, Schema(fields), {},
                           Expressions.always_true(), True)
    expected_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    assert reader.read() == expected_table
def test_to_json_conversion():
    """Check the string form of a spec for every transform/column combination.

    NOTE(review): despite its name, this test asserts on str(spec), not on
    JSON serialization — consider renaming.
    """
    spec_schema = Schema(NestedField.required(1, "i", IntegerType.get()),
                         NestedField.required(2, "l", LongType.get()),
                         NestedField.required(3, "d", DateType.get()),
                         NestedField.required(4, "t", TimeType.get()),
                         NestedField.required(5, "ts", TimestampType.without_timezone()),
                         NestedField.required(6, "dec", DecimalType.of(9, 2)),
                         NestedField.required(7, "s", StringType.get()),
                         NestedField.required(8, "u", UUIDType.get()),
                         NestedField.required(9, "f", FixedType.of_length(3)),
                         NestedField.required(10, "b", BinaryType.get()))
    specs = [
        PartitionSpec.builder_for(spec_schema).identity("i").build(),
        PartitionSpec.builder_for(spec_schema).identity("l").build(),
        PartitionSpec.builder_for(spec_schema).identity("d").build(),
        PartitionSpec.builder_for(spec_schema).identity("t").build(),
        PartitionSpec.builder_for(spec_schema).identity("ts").build(),
        PartitionSpec.builder_for(spec_schema).identity("dec").build(),
        PartitionSpec.builder_for(spec_schema).identity("s").build(),
        PartitionSpec.builder_for(spec_schema).identity("u").build(),
        PartitionSpec.builder_for(spec_schema).identity("f").build(),
        PartitionSpec.builder_for(spec_schema).identity("b").build(),
        PartitionSpec.builder_for(spec_schema).bucket("i", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("l", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("d", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("t", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("ts", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("dec", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("s", 128).build(),
        PartitionSpec.builder_for(spec_schema).year("d").build(),
        PartitionSpec.builder_for(spec_schema).month("d").build(),
        PartitionSpec.builder_for(spec_schema).day("d").build(),
        PartitionSpec.builder_for(spec_schema).year("ts").build(),
        PartitionSpec.builder_for(spec_schema).month("ts").build(),
        PartitionSpec.builder_for(spec_schema).day("ts").build(),
        PartitionSpec.builder_for(spec_schema).hour("ts").build(),
        PartitionSpec.builder_for(spec_schema).truncate("i", 10).build(),
        PartitionSpec.builder_for(spec_schema).truncate("l", 10).build(),
        PartitionSpec.builder_for(spec_schema).truncate("dec", 10).build(),
        PartitionSpec.builder_for(spec_schema).truncate("s", 10).build(),
        PartitionSpec.builder_for(spec_schema).add(6, "dec_bucket", "bucket[16]").build()
    ]
    # expected format: "[\n <name>: <transform>(<source-id>)\n]"
    expected_spec_strs = [
        "[\n  i: identity(1)\n]",
        "[\n  l: identity(2)\n]",
        "[\n  d: identity(3)\n]",
        "[\n  t: identity(4)\n]",
        "[\n  ts: identity(5)\n]",
        "[\n  dec: identity(6)\n]",
        "[\n  s: identity(7)\n]",
        "[\n  u: identity(8)\n]",
        "[\n  f: identity(9)\n]",
        "[\n  b: identity(10)\n]",
        "[\n  i_bucket: bucket[128](1)\n]",
        "[\n  l_bucket: bucket[128](2)\n]",
        "[\n  d_bucket: bucket[128](3)\n]",
        "[\n  t_bucket: bucket[128](4)\n]",
        "[\n  ts_bucket: bucket[128](5)\n]",
        "[\n  dec_bucket: bucket[128](6)\n]",
        "[\n  s_bucket: bucket[128](7)\n]",
        "[\n  d_year: year(3)\n]",
        "[\n  d_month: month(3)\n]",
        "[\n  d_day: day(3)\n]",
        "[\n  ts_year: year(5)\n]",
        "[\n  ts_month: month(5)\n]",
        "[\n  ts_day: day(5)\n]",
        "[\n  ts_hour: hour(5)\n]",
        "[\n  i_truncate: truncate[10](1)\n]",
        "[\n  l_truncate: truncate[10](2)\n]",
        "[\n  dec_truncate: truncate[10](6)\n]",
        "[\n  s_truncate: truncate[10](7)\n]",
        "[\n  dec_bucket: bucket[16](6)\n]",
    ]
    for (spec, expected_spec_str) in zip(specs, expected_spec_strs):
        assert str(spec) == expected_spec_str
# software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import iceberg.api.expressions as exp from iceberg.api.types import (IntegerType, NestedField, StringType, StructType) from iceberg.exceptions import ValidationException from pytest import raises STRUCT = StructType.of([NestedField.required(13, "x", IntegerType.get()), NestedField.required(14, "y", IntegerType.get()), NestedField.optional(15, "z", IntegerType.get())]) def test_less_than(row_of): evaluator = exp.evaluator.Evaluator(STRUCT, exp.expressions.Expressions.less_than("x", 7)) assert not evaluator.eval(row_of((7, 8, None))) assert evaluator.eval(row_of((6, 8, None))) def test_less_than_or_equal(row_of): evaluator = exp.evaluator.Evaluator(STRUCT, exp.expressions.Expressions.less_than_or_equal("x", 7)) assert evaluator.eval(row_of((7, 8, None)))