Пример #1
0
    def from_json(schema, json_obj):
        """Build a partition spec for ``schema`` from its JSON representation.

        :param schema: schema handed to ``PartitionSpec.builder_for``
        :param json_obj: a JSON string, or an already decoded JSON object
        :return: whatever ``build()`` returns for the populated builder
        :raises RuntimeError: if the document is not an object, its fields
            entry is not a list/tuple, or a field entry is not an object
        """
        if isinstance(json_obj, str):
            json_obj = json.loads(json_obj)

        if not isinstance(json_obj, dict):
            # The rejected value is wrapped in a 1-tuple on purpose: a bare
            # tuple operand would be unpacked by "%" and raise TypeError
            # (or lose its parentheses) instead of producing this error.
            raise RuntimeError(
                "Cannot parse partition spec, not an object: %s" % (json_obj,))

        spec_id = json_obj.get(PartitionSpecParser.SPEC_ID)

        builder = PartitionSpec.builder_for(schema).with_spec_id(spec_id)
        fields = json_obj.get(PartitionSpecParser.FIELDS)
        if not isinstance(fields, (list, tuple)):
            raise RuntimeError(
                "Cannot parse partition spec fields, not an array: %s" %
                (fields,))

        for element in fields:
            if not isinstance(element, dict):
                # Entries of a tuple of fields may themselves be tuples, so
                # the same 1-tuple wrapping applies here.
                raise RuntimeError(
                    "Cannot parse partition field, not an object: %s" %
                    (element,))

            builder.add(element.get(PartitionSpecParser.SOURCE_ID),
                        element.get(PartitionSpecParser.NAME),
                        element.get(PartitionSpecParser.TRANSFORM))

        return builder.build()
    def from_json_fields(schema, spec_id, json_obj):
        """Build a partition spec with id ``spec_id`` from a JSON field list.

        :param schema: schema handed to ``PartitionSpec.builder_for``
        :param spec_id: id applied to the builder via ``with_spec_id``
        :param json_obj: a JSON string, or the already decoded value
        :return: the result of the private ``__build_from_json_fields`` helper
        """
        # The builder is created before decoding, exactly as callers have
        # always observed; only then is a string payload parsed.
        spec_builder = PartitionSpec.builder_for(schema).with_spec_id(spec_id)

        decoded = json.loads(json_obj) if isinstance(json_obj, str) else json_obj

        return PartitionSpecParser.__build_from_json_fields(spec_builder, decoded)
Пример #3
0
def inc_man_spec():
    """Return a spec (id 0) that identity-partitions on all four columns.

    The schema has a required integer ``id`` column and three optional
    string columns: ``all_nulls``, ``some_nulls`` and ``no_nulls``.
    """
    columns = [
        NestedField.required(1, "id", IntegerType.get()),
        NestedField.optional(4, "all_nulls", StringType.get()),
        NestedField.optional(5, "some_nulls", StringType.get()),
        NestedField.optional(6, "no_nulls", StringType.get()),
    ]
    inc_schema = Schema(*columns)

    # Chain one identity transform per column, in declaration order.
    builder = PartitionSpec.builder_for(inc_schema).with_spec_id(0)
    for column_name in ("id", "all_nulls", "some_nulls", "no_nulls"):
        builder = builder.identity(column_name)
    return builder.build()
    def from_json(schema, json_obj):
        """Build a partition spec for ``schema`` from its JSON representation.

        :param schema: schema handed to ``PartitionSpec.builder_for``
        :param json_obj: a JSON string, or an already decoded JSON object
        :return: the result of the private ``__build_from_json_fields`` helper
        :raises RuntimeError: if the (decoded) document is not a JSON object
        """
        if isinstance(json_obj, str):
            json_obj = json.loads(json_obj)

        if not isinstance(json_obj, dict):
            # The rejected value is wrapped in a 1-tuple on purpose: a bare
            # tuple operand would be unpacked by "%" and raise TypeError
            # (or lose its parentheses) instead of producing this error.
            raise RuntimeError("Cannot parse partition spec, not an object: %s" % (json_obj,))

        spec_id = json_obj.get(PartitionSpecParser.SPEC_ID)

        builder = PartitionSpec.builder_for(schema).with_spec_id(spec_id)
        fields = json_obj.get(PartitionSpecParser.FIELDS)

        # Validation of the field list is left to the shared helper.
        return PartitionSpecParser.__build_from_json_fields(builder, fields)
Пример #5
0
    def new_table_metadata(ops, schema, spec, location):
        """Create the initial ``TableMetadata`` for a new table.

        Column ids are reassigned through ``assign_fresh_ids`` and the
        partition spec is rebuilt against the re-numbered schema.

        :param ops: passed through unchanged to ``TableMetadata``
        :param schema: the schema supplied by the caller
        :param spec: partition spec whose fields refer to ``schema`` ids
        :param location: passed through unchanged to ``TableMetadata``
        :return: a ``TableMetadata`` holding the fresh schema and fresh spec
        """
        last_column_id = AtomicInteger(0)
        fresh_schema = assign_fresh_ids(schema, last_column_id.increment_and_get)

        spec_builder = PartitionSpec.builder_for(fresh_schema)
        for field in spec.fields:
            # Look the source column up by name, since its id changed when
            # the schema was re-numbered.
            src_name = schema.find_column_name(field.source_id)
            # add() takes (source id, partition name, transform string) --
            # assumed from the other three-argument add() callers; the old
            # code passed whole objects and called a misspelled
            # ``fransform()``, which could never succeed.
            spec_builder.add(fresh_schema.find_field(src_name).field_id,
                             field.name,
                             str(field.transform))

        fresh_spec = spec_builder.build()
        # The timestamp is the current wall-clock time in milliseconds.
        # NOTE(review): -1 and the empty collections presumably mean "no
        # current snapshot / no history yet" -- confirm against TableMetadata.
        return TableMetadata(ops, None, location,
                             int(time.time() * 1000),
                             last_column_id.get(), fresh_schema, TableMetadata.INITIAL_SPEC_ID, [fresh_spec],
                             dict(), -1, list(), list())
Пример #6
0
    def from_json_fields(schema, spec_id, json_obj):
        """Build a partition spec with id ``spec_id`` from a JSON field list.

        :param schema: schema handed to ``PartitionSpec.builder_for``
        :param spec_id: id applied to the builder via ``with_spec_id``
        :param json_obj: a JSON string, or an already decoded list of objects
        :return: whatever ``build()`` returns for the populated builder
        :raises RuntimeError: if the decoded value is not a list, or one of
            its entries is not an object
        """
        builder = PartitionSpec.builder_for(schema).with_spec_id(spec_id)

        if isinstance(json_obj, str):
            json_obj = json.loads(json_obj)

        if not isinstance(json_obj, list):
            # Wrapped in a 1-tuple so a tuple input is reported through this
            # error instead of being unpacked by "%" (TypeError).
            raise RuntimeError(
                "Cannot parse partition spec fields, not an array: %s" %
                (json_obj,))

        for item in json_obj:
            if not isinstance(item, dict):
                # Report the offending entry itself; the message used to
                # print the entire array, hiding which entry was invalid.
                raise RuntimeError(
                    "Cannot parse partition field, not an object: %s" %
                    (item,))
            builder.add(item.get(PartitionSpecParser.SOURCE_ID),
                        item.get(PartitionSpecParser.NAME),
                        item.get(PartitionSpecParser.TRANSFORM))

        return builder.build()
Пример #7
0
def missing_spec_list():
    """Build ``TableMetadata`` with two snapshots and a single spec (id 6).

    The schema has three required long columns ``x``, ``y`` and ``z``; the
    spec identity-partitions on ``x``. Snapshot ids are derived from the
    current time, with the previous one shifted back by a seeded random
    offset of up to 3600.
    """
    xyz_schema = Schema(NestedField.required(1, "x", LongType.get()),
                        NestedField.required(2, "y", LongType.get()),
                        NestedField.required(3, "z", LongType.get()))

    partition_spec = (PartitionSpec.builder_for(xyz_schema)
                      .identity("x")
                      .with_spec_id(6)
                      .build())

    def manifest_at(path):
        # One manifest entry bound to the spec built above.
        return GenericManifestFile(file=Files.local_input(path),
                                   spec_id=partition_spec.spec_id)

    # Seeding the module-level RNG makes the offset below repeatable.
    random.seed(1234)
    parent_id = int(time.time()) - random.randint(0, 3600)
    parent_snapshot = BaseSnapshot(
        ops, parent_id, None,
        timestamp_millis=parent_id,
        manifests=[manifest_at("file:/tmp/manfiest.1.avro")])

    head_id = int(time.time())
    head_snapshot = BaseSnapshot(
        ops, head_id, parent_id,
        timestamp_millis=head_id,
        manifests=[manifest_at("file:/tmp/manfiest.2.avro")])

    return TableMetadata(
        ops, None, "s3://bucket/test/location", int(time.time()), 3,
        xyz_schema, 6, (partition_spec,), {"property": "value"}, head_id,
        [parent_snapshot, head_snapshot], [])
Пример #8
0
    def new_table_metadata(ops: TableOperations,
                           schema: Schema,
                           spec: PartitionSpec,
                           location: str,
                           properties: dict = None) -> "TableMetadata":
        """Create the initial ``TableMetadata`` for a new table.

        Column ids are reassigned through ``assign_fresh_ids`` and the
        partition spec is rebuilt against the re-numbered schema.

        :param ops: passed through unchanged to ``TableMetadata``
        :param schema: the schema supplied by the caller
        :param spec: partition spec whose fields refer to ``schema`` ids
        :param location: passed through unchanged to ``TableMetadata``
        :param properties: optional properties; a new empty dict when omitted
        :return: a ``TableMetadata`` holding the fresh schema and fresh spec
        """
        column_counter = AtomicInteger(0)
        renumbered = assign_fresh_ids(schema, column_counter.increment_and_get)

        fresh_builder = PartitionSpec.builder_for(renumbered)
        for part_field in spec.fields:
            # Resolve the source column by name in the re-numbered schema.
            column_name = schema.find_column_name(part_field.source_id)
            fresh_builder.add(
                part_field.source_id,
                renumbered.find_field(column_name).field_id,
                part_field.name,
                str(part_field.transform),
            )

        rebuilt_spec = fresh_builder.build()
        if properties is None:
            properties = dict()

        # The timestamp is the current wall-clock time in milliseconds.
        created_at_ms = int(time.time() * 1000)
        return TableMetadata(
            ops, None, location, created_at_ms,
            column_counter.get(), renumbered,
            TableMetadata.INITIAL_SPEC_ID, [rebuilt_spec],
            properties, -1, list(), list(),
        )
def test_to_json_conversion():
    """Check the ``str()`` rendering of single-field partition specs.

    Despite the name, the assertions compare ``str(spec)`` with an expected
    text; every spec is listed next to the string it must render as.
    """
    spec_schema = Schema(
        NestedField.required(1, "i", IntegerType.get()),
        NestedField.required(2, "l", LongType.get()),
        NestedField.required(3, "d", DateType.get()),
        NestedField.required(4, "t", TimeType.get()),
        NestedField.required(5, "ts", TimestampType.without_timezone()),
        NestedField.required(6, "dec", DecimalType.of(9, 2)),
        NestedField.required(7, "s", StringType.get()),
        NestedField.required(8, "u", UUIDType.get()),
        NestedField.required(9, "f", FixedType.of_length(3)),
        NestedField.required(10, "b", BinaryType.get()),
    )

    def builder():
        # Every case starts from a fresh builder on the shared schema.
        return PartitionSpec.builder_for(spec_schema)

    cases = [
        # identity transforms
        (builder().identity("i").build(), "[\n i: identity(1)\n]"),
        (builder().identity("l").build(), "[\n l: identity(2)\n]"),
        (builder().identity("d").build(), "[\n d: identity(3)\n]"),
        (builder().identity("t").build(), "[\n t: identity(4)\n]"),
        (builder().identity("ts").build(), "[\n ts: identity(5)\n]"),
        (builder().identity("dec").build(), "[\n dec: identity(6)\n]"),
        (builder().identity("s").build(), "[\n s: identity(7)\n]"),
        (builder().identity("u").build(), "[\n u: identity(8)\n]"),
        (builder().identity("f").build(), "[\n f: identity(9)\n]"),
        (builder().identity("b").build(), "[\n b: identity(10)\n]"),
        # bucket transforms
        (builder().bucket("i", 128).build(), "[\n i_bucket: bucket[128](1)\n]"),
        (builder().bucket("l", 128).build(), "[\n l_bucket: bucket[128](2)\n]"),
        (builder().bucket("d", 128).build(), "[\n d_bucket: bucket[128](3)\n]"),
        (builder().bucket("t", 128).build(), "[\n t_bucket: bucket[128](4)\n]"),
        (builder().bucket("ts", 128).build(), "[\n ts_bucket: bucket[128](5)\n]"),
        (builder().bucket("dec", 128).build(), "[\n dec_bucket: bucket[128](6)\n]"),
        (builder().bucket("s", 128).build(), "[\n s_bucket: bucket[128](7)\n]"),
        # date and timestamp transforms
        (builder().year("d").build(), "[\n d_year: year(3)\n]"),
        (builder().month("d").build(), "[\n d_month: month(3)\n]"),
        (builder().day("d").build(), "[\n d_day: day(3)\n]"),
        (builder().year("ts").build(), "[\n ts_year: year(5)\n]"),
        (builder().month("ts").build(), "[\n ts_month: month(5)\n]"),
        (builder().day("ts").build(), "[\n ts_day: day(5)\n]"),
        (builder().hour("ts").build(), "[\n ts_hour: hour(5)\n]"),
        # truncate transforms
        (builder().truncate("i", 10).build(), "[\n i_truncate: truncate[10](1)\n]"),
        (builder().truncate("l", 10).build(), "[\n l_truncate: truncate[10](2)\n]"),
        (builder().truncate("dec", 10).build(), "[\n dec_truncate: truncate[10](6)\n]"),
        (builder().truncate("s", 10).build(), "[\n s_truncate: truncate[10](7)\n]"),
        # explicit add() with a transform given as a string
        (builder().add(6, "dec_bucket", "bucket[16]").build(), "[\n dec_bucket: bucket[16](6)\n]"),
    ]

    for built_spec, rendered in cases:
        assert str(built_spec) == rendered
Пример #10
0
    def test_partition_spec(self):
        """Each partition spec must equal its round-trip-serialized copy."""
        schema = Schema(
            NestedField.required(1, "i", IntegerType.get()),
            NestedField.required(2, "l", LongType.get()),
            NestedField.required(3, "d", DateType.get()),
            NestedField.required(4, "t", TimeType.get()),
            NestedField.required(5, "ts", TimestampType.without_timezone()),
            NestedField.required(6, "dec", DecimalType.of(9, 2)),
            NestedField.required(7, "s", StringType.get()),
            NestedField.required(8, "u", UUIDType.get()),
            NestedField.required(9, "f", FixedType.of_length(3)),
            NestedField.required(10, "b", BinaryType.get()),
        )

        def builder():
            # Every spec starts from a fresh builder on the shared schema.
            return PartitionSpec.builder_for(schema)

        every_column = ("i", "l", "d", "t", "ts", "dec", "s", "u", "f", "b")

        candidates = []
        for column in every_column:
            candidates.append(builder().identity(column).build())
        for column in every_column:
            candidates.append(builder().bucket(column, 128).build())
        for column in ("d", "ts"):
            candidates.append(builder().year(column).build())
            candidates.append(builder().month(column).build())
            candidates.append(builder().day(column).build())
        candidates.append(builder().hour("ts").build())
        for column in ("i", "l", "dec", "s"):
            candidates.append(builder().truncate(column, 10).build())
        # Transform names the builder receives only as raw strings.
        candidates.append(
            builder().add_without_field_id(6, "dec_unsupported", "unsupported").build())
        candidates.append(
            builder().add(6, 1111, "dec_unsupported", "unsupported").build())

        for candidate in candidates:
            self.assertEqual(candidate, TestHelpers.round_trip_serialize(candidate))