Example #1
def test_table_scan_honors_select_without_case_sensitivity(ts_table):
    scan1 = ts_table.new_scan().case_sensitive(False).select(["ID"])
    # order of refinements shouldn't matter
    scan2 = ts_table.new_scan().select(["ID"]).case_sensitive(False)

    expected_schema = Schema([NestedField.required(1, "id", IntegerType.get())])

    assert scan1.schema.as_struct() == expected_schema.as_struct()
    assert scan2.schema.as_struct() == expected_schema.as_struct()
Example #2
    def bind(self,
             struct: StructType,
             case_sensitive: bool = True) -> BoundReference:
        from iceberg.api import Schema
        schema = Schema(struct.fields)
        field = schema.find_field(
            self.name
        ) if case_sensitive else schema.case_insensitive_find_field(self.name)

        ValidationException.check(field is not None,
                                  "Cannot find field '%s' in struct: %s",
                                  (self.name, schema.as_struct()))

        return BoundReference(struct, field)
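The case_sensitive flag above simply switches between exact and case-insensitive name lookup. A minimal sketch of that difference on a plain Schema, assuming find_field returns None for a missing name (which the is-not-None check above implies):

from iceberg.api import Schema
from iceberg.api.types import IntegerType, NestedField

schema = Schema([NestedField.required(1, "id", IntegerType.get())])

# exact-match lookup misses the upper-cased name...
assert schema.find_field("ID") is None
# ...while the case-insensitive variant still resolves it to field id 1
assert schema.case_insensitive_find_field("ID").field_id == 1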
Example #3
def test_column_rename(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)
    pyarrow_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64())
    ]
    schema = pa.schema([
        pa.field("int_col", pa.int32(), False),
        pa.field("bigint_col", pa.int64(), True),
        pa.field("string_col", pa.string(), True),
        pa.field("float_col", pa.float32(), True),
        pa.field("dbl_col", pa.float64(), True)
    ])

    source_table = pa.table(pyarrow_array, schema=schema)

    target_table = reader.read()
    assert source_table == target_table
Example #4
def expected_metadata_sorting():
    spec_schema = Schema(NestedField.required(1, "x", LongType.get()),
                         NestedField.required(2, "y", LongType.get()),
                         NestedField.required(3, "z", LongType.get()))

    spec = PartitionSpec \
        .builder_for(spec_schema) \
        .with_spec_id(5) \
        .build()

    random.seed(1234)
    previous_snapshot_id = int(time.time()) - random.randint(0, 3600)

    previous_snapshot = BaseSnapshot(ops, previous_snapshot_id, None,
                                     timestamp_millis=previous_snapshot_id,
                                     manifests=[GenericManifestFile(file=Files.local_input("file:/tmp/manfiest.1.avro"),
                                                                    spec_id=spec.spec_id)])

    current_snapshot_id = int(time.time())
    current_snapshot = BaseSnapshot(ops, current_snapshot_id, previous_snapshot_id,
                                    timestamp_millis=current_snapshot_id,
                                    manifests=[GenericManifestFile(file=Files.local_input("file:/tmp/manfiest.2.avro"),
                                                                   spec_id=spec.spec_id)])

    reversed_snapshot_log = list()
    metadata = TableMetadata(ops, None, "s3://bucket/test/location",
                             int(time.time()), 3, spec_schema, 5, [spec], {"property": "value"}, current_snapshot_id,
                             [previous_snapshot, current_snapshot], reversed_snapshot_log)

    reversed_snapshot_log.append(SnapshotLogEntry(current_snapshot.timestamp_millis, current_snapshot.snapshot_id))
    reversed_snapshot_log.append(SnapshotLogEntry(previous_snapshot.timestamp_millis, previous_snapshot.snapshot_id))

    return metadata
Example #5
    def convert_avro_schema_to_iceberg(avro_schema):
        if avro_schema.get(AvroToIceberg.FIELD_TYPE_PROP) != "record":
            raise RuntimeError("Cannot convert avro schema to iceberg %s" % avro_schema)

        struct = AvroToIceberg.convert_type(avro_schema, None)

        return Schema(struct[0].fields)
Example #6
def test_schema_evolution_filter(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(16, "other_new_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(15, "new_col", StringType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.not_null("new_col"), True)

    schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("other_new_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
        pa.field("new_col", pa.string(), nullable=True)
    ])

    pyarrow_not_null_array = [
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.int64()),
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.float32()),
        pa.array([], type=pa.float64()),
        pa.array([], type=pa.string()),
        pa.array([], type=pa.string())
    ]

    not_null_table = pa.table(pyarrow_not_null_array, schema=schema)
    pyarrow_null_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([None, None, None, None, None], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([None, None, None, None, None], type=pa.string())
    ]
    null_table = pa.table(pyarrow_null_array, schema=schema)

    target_table = reader.read()
    assert not_null_table == target_table

    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.is_null("new_col"), True)
    target_table = reader.read()
    assert null_table == target_table
Example #7
    def from_json(json_obj):
        if isinstance(json_obj, str):
            type_var = SchemaParser.type_from_dict(json.loads(json_obj))
        else:
            type_var = SchemaParser.type_from_dict(json_obj)

        if type_var is not None and type_var.is_nested_type(
        ) and type_var.as_nested_type().is_struct_type():
            return Schema(type_var.as_nested_type().as_struct_type().fields)
        else:
            raise RuntimeError("Cannot create schema, not a struct type: %s" %
                               type_var)
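A hedged sketch of the kind of JSON this parser accepts, following the Iceberg schema JSON layout with a top-level struct type; the field names and ids are illustrative only, and it assumes the method above is exposed as SchemaParser.from_json:

import json

schema_json = json.dumps({
    "type": "struct",
    "fields": [
        {"id": 1, "name": "id", "required": True, "type": "int"},
        {"id": 2, "name": "data", "required": False, "type": "string"}
    ]
})

schema = SchemaParser.from_json(schema_json)
assert schema.find_field("data") is not None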
Example #8
def test_to_json_conversion():
    spec_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
                         NestedField.required(2, "data", StringType.get()))

    spec = PartitionSpec\
        .builder_for(spec_schema) \
        .identity("id")\
        .bucket("data", 16)\
        .build()

    expected = '{"spec-id": 0, "fields": [' \
               '{"name": "id", "transform": "identity", "source-id": 1}, ' \
               '{"name": "data_bucket", "transform": "bucket[16]", "source-id": 2}]}'
    assert expected == PartitionSpecParser.to_json(spec)
Example #9
def test_raise_exception_with_invalid_json():
    spec_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
                         NestedField.required(2, "data", StringType.get()),
                         NestedField.required(3, "num", DecimalType.of(9, 2)))

    spec_string = '{"spec-id": 0, "fields": [' \
                  '{"name": "id", "transform": "identity", "source-id": 1, "field-id": 1000}, ' \
                  '{"name": "data_bucket", "transform": "bucket[16]", "source-id": 2, "field-id": 1001}, ' \
                  '{"name": "data1", "transform": "bucket[16]", "source-id": 2}, ' \
                  '{"name": "data2", "transform": "bucket[8]", "source-id": 2}, ' \
                  '{"name": "num_bucket", "transform": "bucket[8]", "source-id": 3}]}'

    with pytest.raises(RuntimeError):
        PartitionSpecParser.from_json(spec_schema, spec_string)
Example #10
def test_column_upcast(primitive_type_test_file):
    expected_schema = Schema(
        [NestedField.required(1, "int_col", LongType.get())])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)
    pyarrow_array = [pa.array([1, 2, 3, 4, 5], type=pa.int32())]
    source_table = pa.table(
        pyarrow_array,
        schema=pa.schema([pa.field("int_col", pa.int64(), nullable=False)]))

    target_table = reader.read()
    assert source_table == target_table
Example #11
def arrow_to_iceberg(arrow_schema: pa.Schema) -> Schema:
    """
    Use an arrow schema, which contains the field_id metadata, to create an equivalent iceberg Schema

    Parameters
    ----------
    arrow_schema : pyarrow.Schema
        An Arrow schema with the parquet field_id metadata

    Returns
    -------
    iceberg.api.Schema
        returns an equivalent iceberg Schema based on the arrow schema read from the file
    """
    return Schema([get_field(col) for col in arrow_schema])
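A minimal usage sketch for arrow_to_iceberg; the file path is hypothetical and it assumes the Parquet file was written with the field_id metadata described above:

import pyarrow.parquet as pq

# read only the Arrow schema of an existing Parquet file, then convert it
arrow_schema = pq.read_schema("/tmp/example.parquet")  # hypothetical path
iceberg_schema = arrow_to_iceberg(arrow_schema)
print(iceberg_schema.as_struct())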
Example #12
def test_unnested_complex_types(unnested_complex_type_test_parquet_file):
    expected_schema = Schema([
        NestedField.optional(1, "list_int_col",
                             ListType.of_optional(3, IntegerType.get())),
        NestedField.optional(4, "list_str_col",
                             ListType.of_optional(6, StringType.get())),
        NestedField.optional(
            7, "struct_col",
            StructType.of([
                NestedField.optional(8, "f1", IntegerType.get()),
                NestedField.optional(9, "f2", StringType.get())
            ]))
    ])
    converted_schema = convert_parquet_to_iceberg(
        unnested_complex_type_test_parquet_file)
    compare_schema(expected_schema, converted_schema)
Example #13
def rg_expected_schema():
    return Schema([
        NestedField.required(1, "string_col", StringType.get()),
        NestedField.required(2, "long_col", LongType.get()),
        NestedField.required(3, "int_col", IntegerType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "null_col", StringType.get()),
        NestedField.optional(6, "missing_col", StringType.get()),
        NestedField.optional(7, "no_stats_col", StringType.get()),
        NestedField.optional(8, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(9, "ts_wotz_col",
                             TimestampType.without_timezone()),
        NestedField.optional(10, "big_decimal_type", DecimalType.of(38, 5)),
        NestedField.optional(11, "small_decimal_type", DecimalType.of(10, 2)),
        NestedField.optional(12, "date_type", DateType.get()),
    ])
Example #14
def test_primitive_types(primitive_type_test_parquet_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get())
    ])
    compare_schema(
        expected_schema,
        convert_parquet_to_iceberg(primitive_type_test_parquet_file))
Example #15
def prune_columns(file_schema: Schema, expected_schema: Schema) -> List[str]:
    """
    Given two Iceberg schemas, returns a list of column names for all ids in the
    file schema that are projected in the expected schema

    Parameters
    ----------
    file_schema : iceberg.api.Schema
        An Iceberg schema of the file being read
    expected_schema : iceberg.api.Schema
        An Iceberg schema of the final projection

    Returns
    -------
    list
        The column names in the file that matched ids in the expected schema
    """
    return [column.name for column in file_schema.as_struct().fields
            if column.id in get_projected_ids(expected_schema)]
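A hedged usage sketch for prune_columns, assuming get_projected_ids simply collects the field ids of the expected schema: only file columns whose ids survive the projection are returned.

file_schema = Schema([
    NestedField.required(1, "int_col", IntegerType.get()),
    NestedField.optional(2, "bigint_col", LongType.get()),
    NestedField.optional(3, "string_col", StringType.get())
])
expected_schema = Schema([NestedField.required(1, "int_col", IntegerType.get())])

# only "int_col" (field id 1) is projected in the expected schema
assert prune_columns(file_schema, expected_schema) == ["int_col"]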
Example #16
def test_projection(primitive_type_test_file, pyarrow_primitive_array,
                    pyarrow_schema):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    num_cols = source_table.num_columns
    for i in range(1, num_cols - 1):
        source_table = source_table.remove_column(num_cols - i)

    assert source_table == reader.read()
Example #17
def test_compound_filter(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(
        input_file, expected_schema, {},
        Expressions.and_(Expressions.equal("string_col", "us"),
                         Expressions.equal("int_col", 1)), True)
    pyarrow_array = [
        pa.array([1], type=pa.int32()),
        pa.array([1], type=pa.int64()),
        pa.array([1.0], type=pa.float32()),
        pa.array([1.0], type=pa.float64()),
        pa.array(['us'], type=pa.string())
    ]

    source_table = pa.table(pyarrow_array,
                            schema=pa.schema([
                                pa.field("int_col", pa.int32(),
                                         nullable=False),
                                pa.field("bigint_col",
                                         pa.int64(),
                                         nullable=True),
                                pa.field("float_col",
                                         pa.float32(),
                                         nullable=True),
                                pa.field("dbl_col",
                                         pa.float64(),
                                         nullable=True),
                                pa.field("string_col",
                                         pa.string(),
                                         nullable=True)
                            ]))

    target_table = reader.read()
    assert source_table == target_table
Example #18
def test_decimal_column_add(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(13, "new_dec_col", DecimalType.of(38, 9))
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)
    pyarrow_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array([None, None, None, None, None], type=pa.decimal128(38, 9))
    ]

    source_table = pa.table(pyarrow_array,
                            schema=pa.schema([
                                pa.field("int_col", pa.int32(),
                                         nullable=False),
                                pa.field("bigint_col",
                                         pa.int64(),
                                         nullable=True),
                                pa.field("float_col",
                                         pa.float32(),
                                         nullable=True),
                                pa.field("dbl_col",
                                         pa.float64(),
                                         nullable=True),
                                pa.field("new_dec_col",
                                         pa.decimal128(38, 9),
                                         nullable=True)
                            ]))

    target_table = reader.read()
    assert source_table == target_table
Example #19
def test_to_json_conversion():
    spec_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
                         NestedField.required(2, "data", StringType.get()),
                         NestedField.required(3, "num", DecimalType.of(9, 2)))

    spec = PartitionSpec \
        .builder_for(spec_schema) \
        .identity("id") \
        .bucket("data", 16) \
        .add_without_field_id(2, "data1", "bucket[16]") \
        .add(2, 1010, "data2", "bucket[8]") \
        .bucket("num", 8) \
        .build()

    expected = '{"spec-id": 0, "fields": [' \
               '{"name": "id", "transform": "identity", "source-id": 1, "field-id": 1000}, ' \
               '{"name": "data_bucket", "transform": "bucket[16]", "source-id": 2, "field-id": 1001}, ' \
               '{"name": "data1", "transform": "bucket[16]", "source-id": 2, "field-id": 1002}, ' \
               '{"name": "data2", "transform": "bucket[8]", "source-id": 2, "field-id": 1010}, ' \
               '{"name": "num_bucket", "transform": "bucket[8]", "source-id": 3, "field-id": 1011}]}'
    assert expected == PartitionSpecParser.to_json(spec)
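    # A hedged round-trip sketch (assumption: PartitionSpecParser.from_json
    # accepts the JSON that to_json emits): since every field above carries a
    # field-id, parsing the string back and re-serializing should be a no-op.
    round_tripped = PartitionSpecParser.from_json(spec_schema, expected)
    assert PartitionSpecParser.to_json(round_tripped) == expected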
Example #20
def missing_spec_list():
    schema = Schema(NestedField.required(1, "x", LongType.get()),
                    NestedField.required(2, "y", LongType.get()),
                    NestedField.required(3, "z", LongType.get()))

    spec = PartitionSpec.builder_for(schema).identity("x").with_spec_id(6).build()
    random.seed(1234)
    previous_snapshot_id = int(time.time()) - random.randint(0, 3600)

    previous_snapshot = BaseSnapshot(ops, previous_snapshot_id, None,
                                     timestamp_millis=previous_snapshot_id,
                                     manifests=[GenericManifestFile(file=Files.local_input("file:/tmp/manfiest.1.avro"),
                                                                    spec_id=spec.spec_id)])

    current_snapshot_id = int(time.time())
    current_snapshot = BaseSnapshot(ops, current_snapshot_id, previous_snapshot_id,
                                    timestamp_millis=current_snapshot_id,
                                    manifests=[GenericManifestFile(file=Files.local_input("file:/tmp/manfiest.2.avro"),
                                                                   spec_id=spec.spec_id)])
    return TableMetadata(ops, None, "s3://bucket/test/location", int(time.time()), 3, schema, 6,
                         (spec,), {"property": "value"}, current_snapshot_id, [previous_snapshot, current_snapshot],
                         [])
Example #21
    def new_table_metadata(ops: TableOperations,
                           schema: Schema,
                           spec: PartitionSpec,
                           location: str,
                           properties: dict = None) -> "TableMetadata":
        last_column_id = AtomicInteger(0)
        fresh_schema = assign_fresh_ids(schema,
                                        last_column_id.increment_and_get)

        spec_builder = PartitionSpec.builder_for(fresh_schema)
        for field in spec.fields:
            src_name = schema.find_column_name(field.source_id)
            spec_builder.add(field.source_id,
                             fresh_schema.find_field(src_name).field_id,
                             field.name, str(field.transform))

        fresh_spec = spec_builder.build()
        properties = properties if properties is not None else dict()

        return TableMetadata(ops, None, location, int(time.time() * 1000),
                             last_column_id.get(), fresh_schema,
                             TableMetadata.INITIAL_SPEC_ID, [fresh_spec],
                             properties, -1, list(), list())
Example #22
def test_basic_read(primitive_type_test_file, pyarrow_primitive_array,
                    pyarrow_schema):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    assert reader.read() == source_table
Example #23
def test_to_json_conversion():
    spec_schema = Schema(NestedField.required(1, "i", IntegerType.get()),
                         NestedField.required(2, "l", LongType.get()),
                         NestedField.required(3, "d", DateType.get()),
                         NestedField.required(4, "t", TimeType.get()),
                         NestedField.required(5, "ts", TimestampType.without_timezone()),
                         NestedField.required(6, "dec", DecimalType.of(9, 2)),
                         NestedField.required(7, "s", StringType.get()),
                         NestedField.required(8, "u", UUIDType.get()),
                         NestedField.required(9, "f", FixedType.of_length(3)),
                         NestedField.required(10, "b", BinaryType.get()))

    specs = [
        PartitionSpec.builder_for(spec_schema).identity("i").build(),
        PartitionSpec.builder_for(spec_schema).identity("l").build(),
        PartitionSpec.builder_for(spec_schema).identity("d").build(),
        PartitionSpec.builder_for(spec_schema).identity("t").build(),
        PartitionSpec.builder_for(spec_schema).identity("ts").build(),
        PartitionSpec.builder_for(spec_schema).identity("dec").build(),
        PartitionSpec.builder_for(spec_schema).identity("s").build(),
        PartitionSpec.builder_for(spec_schema).identity("u").build(),
        PartitionSpec.builder_for(spec_schema).identity("f").build(),
        PartitionSpec.builder_for(spec_schema).identity("b").build(),
        PartitionSpec.builder_for(spec_schema).bucket("i", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("l", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("d", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("t", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("ts", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("dec", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("s", 128).build(),
        PartitionSpec.builder_for(spec_schema).year("d").build(),
        PartitionSpec.builder_for(spec_schema).month("d").build(),
        PartitionSpec.builder_for(spec_schema).day("d").build(),
        PartitionSpec.builder_for(spec_schema).year("ts").build(),
        PartitionSpec.builder_for(spec_schema).month("ts").build(),
        PartitionSpec.builder_for(spec_schema).day("ts").build(),
        PartitionSpec.builder_for(spec_schema).hour("ts").build(),
        PartitionSpec.builder_for(spec_schema).truncate("i", 10).build(),
        PartitionSpec.builder_for(spec_schema).truncate("l", 10).build(),
        PartitionSpec.builder_for(spec_schema).truncate("dec", 10).build(),
        PartitionSpec.builder_for(spec_schema).truncate("s", 10).build(),
        PartitionSpec.builder_for(spec_schema).add(6, "dec_bucket", "bucket[16]").build()
    ]

    expected_spec_strs = [
        "[\n i: identity(1)\n]",
        "[\n l: identity(2)\n]",
        "[\n d: identity(3)\n]",
        "[\n t: identity(4)\n]",
        "[\n ts: identity(5)\n]",
        "[\n dec: identity(6)\n]",
        "[\n s: identity(7)\n]",
        "[\n u: identity(8)\n]",
        "[\n f: identity(9)\n]",
        "[\n b: identity(10)\n]",
        "[\n i_bucket: bucket[128](1)\n]",
        "[\n l_bucket: bucket[128](2)\n]",
        "[\n d_bucket: bucket[128](3)\n]",
        "[\n t_bucket: bucket[128](4)\n]",
        "[\n ts_bucket: bucket[128](5)\n]",
        "[\n dec_bucket: bucket[128](6)\n]",
        "[\n s_bucket: bucket[128](7)\n]",
        "[\n d_year: year(3)\n]",
        "[\n d_month: month(3)\n]",
        "[\n d_day: day(3)\n]",
        "[\n ts_year: year(5)\n]",
        "[\n ts_month: month(5)\n]",
        "[\n ts_day: day(5)\n]",
        "[\n ts_hour: hour(5)\n]",
        "[\n i_truncate: truncate[10](1)\n]",
        "[\n l_truncate: truncate[10](2)\n]",
        "[\n dec_truncate: truncate[10](6)\n]",
        "[\n s_truncate: truncate[10](7)\n]",
        "[\n dec_bucket: bucket[16](6)\n]",
    ]

    for (spec, expected_spec_str) in zip(specs, expected_spec_strs):
        assert str(spec) == expected_spec_str
Example #24
    def wrap_file_schema(file_struct):
        return Schema(NestedField.required(0, "status", IntegerType.get()),
                      NestedField.required(1, "snapshot_id", LongType.get()),
                      NestedField.required(2, "data_file", file_struct))
Example #25
    def project_schema(part_type, columns):
        return ManifestEntry.wrap_file_schema(Schema(DataFile.get_type(part_type).fields)
                                              .select(columns)
                                              .as_struct())
Example #26
def base_scan_schema():
    return Schema([
        NestedField.required(1, "id", IntegerType.get()),
        NestedField.required(2, "data", StringType.get())
    ])
Example #27
def test_table_scan_honors_select(ts_table):
    scan = ts_table.new_scan().select(["id"])

    expected_schema = Schema([NestedField.required(1, "id", IntegerType.get())])

    assert scan.schema.as_struct() == expected_schema.as_struct()
Example #28
import os
import random
import tempfile
import time

from iceberg.api import Files, PartitionSpec, Schema
from iceberg.api.types import BooleanType, IntegerType, LongType, NestedField, StringType
from iceberg.core import (BaseSnapshot, BaseTable, ConfigProperties,
                          GenericManifestFile, SnapshotLogEntry, TableMetadata,
                          TableMetadataParser, TableOperations,
                          TableProperties)
from iceberg.exceptions import AlreadyExistsException, CommitFailedException
import pytest

SCHEMA = Schema([NestedField.optional(1, "b", BooleanType.get())])
METADATA = dict()
VERSIONS = dict()


class LocalTableOperations(TableOperations):
    def current(self):
        raise RuntimeError("Not implemented for tests")

    def refresh(self):
        raise RuntimeError("Not implemented for tests")

    def commit(self, base, metadata):
        raise RuntimeError("Not implemented for tests")

    def new_input_file(self, path):
Example #29
def iceberg_full_read_projection_schema():
    return Schema([
        NestedField.required(0, "id", LongType.get()),
        NestedField.optional(1, "data", StringType.get())
    ])