Пример #1
0
  def test_encoding_position_reorder_fields(self):
    """Encode a row with one schema and decode it with a reordered schema.

    The first schema sets no encoding positions; the second lists the same two
    fields in the opposite order, sets ``encoding_positions_set`` and assigns
    ``f_int32`` position 0 and ``f_str`` position 1. The decoded row must equal
    the second schema's NamedTuple built from the same values.
    """
    int32_type = schema_pb2.FieldType(atomic_type=schema_pb2.INT32)
    string_type = schema_pb2.FieldType(atomic_type=schema_pb2.STRING)

    writer_schema = schema_pb2.Schema(
        id="reorder_test_schema1",
        fields=[
            schema_pb2.Field(name="f_int32", type=int32_type),
            schema_pb2.Field(name="f_str", type=string_type),
        ])
    reader_schema = schema_pb2.Schema(
        id="reorder_test_schema2",
        encoding_positions_set=True,
        fields=[
            schema_pb2.Field(
                name="f_str", type=string_type, encoding_position=1),
            schema_pb2.Field(
                name="f_int32", type=int32_type, encoding_position=0),
        ])

    WriterRow = named_tuple_from_schema(writer_schema)
    ReaderRow = named_tuple_from_schema(reader_schema)

    reader_coder = RowCoder(reader_schema)
    encoded = RowCoder(writer_schema).encode(WriterRow(42, "Hello World!"))
    roundtripped = reader_coder.decode(encoded)

    self.assertEqual(ReaderRow(f_int32=42, f_str="Hello World!"), roundtripped)
Пример #2
0
  def test_create_row_coder_from_schema(self):
    """Round-trip every row in ``self.TEST_CASES`` through a schema-built coder.

    The "person" schema has a string name, an int32 age, a nullable string
    address and an array-of-string aliases field.
    """
    string_type = schema_pb2.FieldType(atomic_type=schema_pb2.STRING)
    person_fields = [
        schema_pb2.Field(name="name", type=string_type),
        schema_pb2.Field(
            name="age",
            type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32)),
        schema_pb2.Field(
            name="address",
            type=schema_pb2.FieldType(
                atomic_type=schema_pb2.STRING, nullable=True)),
        schema_pb2.Field(
            name="aliases",
            type=schema_pb2.FieldType(
                array_type=schema_pb2.ArrayType(element_type=string_type))),
    ]
    row_coder = RowCoder(schema_pb2.Schema(id="person", fields=person_fields))

    for row in self.TEST_CASES:
      roundtripped = row_coder.decode(row_coder.encode(row))
      self.assertEqual(row, roundtripped)
Пример #3
0
  def test_encoding_position_reorder_fields(self):
    """Compare bytes from a type-hint coder and a reordered-schema coder.

    One coder is derived from the ``expected`` NamedTuple type hint, the other
    from a schema that lists the fields in reverse with explicit encoding
    positions. The test asserts both produce the same encoded bytes.
    """
    expected = typing.NamedTuple(
        'expected', [("field1", str), ("field2", int), ("field3", int)])

    def positioned_field(name, atomic_type, position):
      # Build one schema field with an explicit encoding position.
      return schema_pb2.Field(
          name=name,
          type=schema_pb2.FieldType(atomic_type=atomic_type),
          encoding_position=position)

    # NOTE(review): the atomic types declared here (field3 STRING, field1
    # INT32) do not line up with the ``expected`` type hints (field1 str,
    # field3 int) - confirm this is intentional.
    reorder = schema_pb2.Schema(
        id="new_order",
        fields=[
            positioned_field("field3", schema_pb2.STRING, 2),
            positioned_field("field2", schema_pb2.INT32, 1),
            positioned_field("field1", schema_pb2.INT32, 0),
        ])

    old_coder = RowCoder.from_type_hint(expected, None)
    new_coder = RowCoder(reorder)

    row_in_hint_order = expected("foo", 7, 12)
    row_in_schema_order = expected(12, 7, "foo")

    encode_expected = old_coder.encode(row_in_hint_order)
    encode_reorder = new_coder.encode(row_in_schema_order)
    self.assertEqual(encode_expected, encode_reorder)
Пример #4
0
def typing_to_runner_api(type_):
    """Convert a Python type annotation into a ``schema_pb2.FieldType`` proto.

    The branches are tried in order: NamedTuple types (row types), entries of
    ``PRIMITIVE_TO_ATOMIC_TYPE``, exact mapping types, optional types,
    ``Sequence`` subclasses and finally ``Mapping`` subclasses.

    Args:
        type_: the type annotation to convert.

    Returns:
        The ``schema_pb2.FieldType`` describing ``type_``.

    Raises:
        ValueError: for ``str`` when running on Python 2, and for any type
            that none of the branches handles.
    """
    def _map_field_type(mapping_type):
        # Shared by the exact-mapping and Mapping-subclass branches.
        key_type, value_type = map(typing_to_runner_api,
                                   _get_args(mapping_type))
        return schema_pb2.FieldType(map_type=schema_pb2.MapType(
            key_type=key_type, value_type=value_type))

    if _match_is_named_tuple(type_):
        # Reuse the registered schema when this type already carries an id.
        schema = None
        if hasattr(type_, _BEAM_SCHEMA_ID):
            registered_id = getattr(type_, _BEAM_SCHEMA_ID)
            schema = SCHEMA_REGISTRY.get_schema_by_id(registered_id)

        if schema is None:
            field_protos = []
            for field_name in type_._fields:
                field_type = typing_to_runner_api(
                    type_._field_types[field_name])
                field_protos.append(
                    schema_pb2.Field(name=field_name, type=field_type))

            # Tag the type with a fresh id and register the new schema.
            type_id = str(uuid4())
            schema = schema_pb2.Schema(fields=field_protos, id=type_id)
            setattr(type_, _BEAM_SCHEMA_ID, type_id)
            SCHEMA_REGISTRY.add(type_, schema)

        return schema_pb2.FieldType(row_type=schema_pb2.RowType(schema=schema))

    # All concrete types (other than NamedTuple sub-classes) should map to
    # a supported primitive type.
    if type_ in PRIMITIVE_TO_ATOMIC_TYPE:
        atomic_type = PRIMITIVE_TO_ATOMIC_TYPE[type_]
        return schema_pb2.FieldType(atomic_type=atomic_type)

    if sys.version_info.major == 2 and type_ == str:
        raise ValueError(
            "type 'str' is not supported in python 2. Please use 'unicode' or "
            "'typing.ByteString' instead to unambiguously indicate if this is a "
            "UTF-8 string or a byte array.")

    if _match_is_exactly_mapping(type_):
        return _map_field_type(type_)

    if _match_is_optional(type_):
        # It's possible that a user passes us Optional[Optional[T]], but in
        # python typing this is indistinguishable from Optional[T] - both
        # resolve to Union[T, None] - so there's no need to check for that
        # case here.
        inner = typing_to_runner_api(extract_optional_type(type_))
        inner.nullable = True
        return inner

    if _safe_issubclass(type_, Sequence):
        element_type = typing_to_runner_api(_get_args(type_)[0])
        array_type = schema_pb2.ArrayType(element_type=element_type)
        return schema_pb2.FieldType(array_type=array_type)

    if _safe_issubclass(type_, Mapping):
        return _map_field_type(type_)

    raise ValueError("Unsupported type: %s" % type_)
Пример #5
0
 def test_python_callable_maps_to_logical_type(self):
     """Check both conversion directions for ``PythonCallableWithSource``.

     The type must convert to a logical type carrying the python_callable URN
     with the conversion of ``str`` as its representation, and that proto must
     convert back to ``PythonCallableWithSource``.
     """
     from apache_beam.utils.python_callable import PythonCallableWithSource

     def callable_field_type():
         # Build a fresh proto on each call, as each assertion needs its own.
         return schema_pb2.FieldType(logical_type=schema_pb2.LogicalType(
             urn=common_urns.python_callable.urn,
             representation=typing_to_runner_api(str)))

     self.assertEqual(
         callable_field_type(),
         typing_to_runner_api(PythonCallableWithSource))
     self.assertEqual(
         typing_from_runner_api(callable_field_type()),
         PythonCallableWithSource)
Пример #6
0
    def test_row_coder_fail_early_bad_schema(self):
        """Constructing a RowCoder from a schema with an untyped field fails.

        The single field's ``FieldType`` has no type information set; the
        expected ValueError must mention the field's name.
        """
        untyped_field = schema_pb2.Field(
            name="type_with_no_typeinfo", type=schema_pb2.FieldType())
        schema_proto = schema_pb2.Schema(fields=[untyped_field])

        # Should raise an exception referencing the problem field
        with self.assertRaisesRegex(ValueError, "type_with_no_typeinfo"):
            RowCoder(schema_proto)
Пример #7
0
    def test_schema_with_bad_field_raises_helpful_error(self):
        """Converting a schema with an untyped field names the bad field.

        The single field's ``FieldType`` has no type information set; the
        expected ValueError must mention the field's name.
        """
        untyped_field = schema_pb2.Field(
            name="type_with_no_typeinfo", type=schema_pb2.FieldType())
        schema_proto = schema_pb2.Schema(fields=[untyped_field])

        # Should raise an exception referencing the problem field
        with self.assertRaisesRegex(ValueError, "type_with_no_typeinfo"):
            named_tuple_from_schema(schema_proto)
Пример #8
0
def typing_to_runner_api(type_):
  """Convert a Python type annotation into a ``schema_pb2.FieldType`` proto.

  The branches are tried in order: NamedTuple types (row types), entries of
  ``PRIMITIVE_TO_ATOMIC_TYPE``, exact mapping types, optional types,
  ``Sequence`` subclasses, ``Mapping`` subclasses, and finally logical types.
  A type for which ``LogicalType.from_typing`` raises ValueError is encoded as
  a logical type with the ``PYTHON_ANY_URN`` urn.

  Args:
    type_: the type annotation to convert.

  Returns:
    The ``schema_pb2.FieldType`` describing ``type_``.
  """
  def _map_field_type(mapping_type):
    # Shared by the exact-mapping and Mapping-subclass branches.
    key_type, value_type = map(typing_to_runner_api, _get_args(mapping_type))
    return schema_pb2.FieldType(
        map_type=schema_pb2.MapType(key_type=key_type, value_type=value_type))

  if match_is_named_tuple(type_):
    # Reuse the registered schema when this type already carries an id.
    schema = None
    if hasattr(type_, _BEAM_SCHEMA_ID):
      registered_id = getattr(type_, _BEAM_SCHEMA_ID)
      schema = SCHEMA_REGISTRY.get_schema_by_id(registered_id)

    if schema is None:
      field_protos = []
      for field_name in type_._fields:
        field_type = typing_to_runner_api(type_._field_types[field_name])
        field_protos.append(schema_pb2.Field(name=field_name, type=field_type))

      # Tag the type with a fresh id and register the new schema.
      type_id = str(uuid4())
      schema = schema_pb2.Schema(fields=field_protos, id=type_id)
      setattr(type_, _BEAM_SCHEMA_ID, type_id)
      SCHEMA_REGISTRY.add(type_, schema)

    return schema_pb2.FieldType(row_type=schema_pb2.RowType(schema=schema))

  # All concrete types (other than NamedTuple sub-classes) should map to
  # a supported primitive type.
  if type_ in PRIMITIVE_TO_ATOMIC_TYPE:
    atomic_type = PRIMITIVE_TO_ATOMIC_TYPE[type_]
    return schema_pb2.FieldType(atomic_type=atomic_type)

  if _match_is_exactly_mapping(type_):
    return _map_field_type(type_)

  if _match_is_optional(type_):
    # It's possible that a user passes us Optional[Optional[T]], but in python
    # typing this is indistinguishable from Optional[T] - both resolve to
    # Union[T, None] - so there's no need to check for that case here.
    inner = typing_to_runner_api(extract_optional_type(type_))
    inner.nullable = True
    return inner

  if _safe_issubclass(type_, Sequence):
    element_type = typing_to_runner_api(_get_args(type_)[0])
    array_type = schema_pb2.ArrayType(element_type=element_type)
    return schema_pb2.FieldType(array_type=array_type)

  if _safe_issubclass(type_, Mapping):
    return _map_field_type(type_)

  try:
    logical_type = LogicalType.from_typing(type_)
  except ValueError:
    # Unknown type, just treat it like Any
    any_type = schema_pb2.LogicalType(urn=PYTHON_ANY_URN)
    return schema_pb2.FieldType(logical_type=any_type)

  # TODO(bhulette): Add support for logical types that require arguments
  logical_proto = schema_pb2.LogicalType(
      urn=logical_type.urn(),
      representation=typing_to_runner_api(logical_type.representation_type()))
  return schema_pb2.FieldType(logical_type=logical_proto)
Пример #9
0
def typing_from_runner_api(fieldtype_proto):
  """Convert a ``schema_pb2.FieldType`` proto into a Python typing type.

  Args:
    fieldtype_proto: the ``schema_pb2.FieldType`` message to convert.

  Returns:
    ``Optional[T]`` for nullable protos; the ``ATOMIC_TYPE_TO_PRIMITIVE`` entry
    for atomic types; ``Sequence[T]`` for arrays; ``Mapping[K, V]`` for maps; a
    NamedTuple type (looked up in ``SCHEMA_REGISTRY`` or generated and
    registered) for rows; ``Any`` or the logical type's language type for
    logical types.

  Raises:
    ValueError: if the atomic type has no entry in
      ``ATOMIC_TYPE_TO_PRIMITIVE``, or if the proto has no recognized
      ``type_info`` member set.
  """
  if fieldtype_proto.nullable:
    # In order to determine the inner type, create a copy of fieldtype_proto
    # with nullable=False and pass back to typing_from_runner_api
    base_type = schema_pb2.FieldType()
    base_type.CopyFrom(fieldtype_proto)
    base_type.nullable = False
    return Optional[typing_from_runner_api(base_type)]

  type_info = fieldtype_proto.WhichOneof("type_info")
  if type_info == "atomic_type":
    try:
      return ATOMIC_TYPE_TO_PRIMITIVE[fieldtype_proto.atomic_type]
    except KeyError:
      raise ValueError(
          "Unsupported atomic type: {0}".format(fieldtype_proto.atomic_type))
  elif type_info == "array_type":
    return Sequence[typing_from_runner_api(
        fieldtype_proto.array_type.element_type)]
  elif type_info == "map_type":
    return Mapping[typing_from_runner_api(fieldtype_proto.map_type.key_type),
                   typing_from_runner_api(fieldtype_proto.map_type.value_type)]
  elif type_info == "row_type":
    schema = fieldtype_proto.row_type.schema
    user_type = SCHEMA_REGISTRY.get_typing_by_id(schema.id)
    if user_type is None:
      # Imported here rather than at module level; presumably to avoid a
      # circular import - TODO confirm.
      from apache_beam import coders

      # '-' is replaced because the schema id is embedded in a class name.
      type_name = 'BeamSchema_{}'.format(schema.id.replace('-', '_'))
      user_type = NamedTuple(
          type_name,
          [(field.name, typing_from_runner_api(field.type))
           for field in schema.fields])

      setattr(user_type, _BEAM_SCHEMA_ID, schema.id)

      # Define a reduce function, otherwise these types can't be pickled
      # (See BEAM-9574)
      def __reduce__(self):
        return (
            _hydrate_namedtuple_instance,
            (schema.SerializeToString(), tuple(self)))

      setattr(user_type, '__reduce__', __reduce__)

      SCHEMA_REGISTRY.add(user_type, schema)
      coders.registry.register_coder(user_type, coders.RowCoder)
    return user_type

  elif type_info == "logical_type":
    if fieldtype_proto.logical_type.urn == PYTHON_ANY_URN:
      return Any
    else:
      return LogicalType.from_runner_api(
          fieldtype_proto.logical_type).language_type()

  else:
    # Without this branch a FieldType with no type_info set would fall off
    # the end and silently yield None as the "type".
    raise ValueError("Unrecognized type_info: {0!r}".format(type_info))
Пример #10
0
  def test_encoding_position_add_fields_and_reorder(self):
    """Decode an old-schema row with a schema that adds and reorders fields.

    The new schema sets ``encoding_positions_set``, keeps ``f_int32`` and
    ``f_str`` at positions 0 and 1, and adds a nullable ``f_new_str`` at
    position 2. The decoded row is expected to carry ``None`` for the new
    field.
    """
    int32_type = schema_pb2.FieldType(atomic_type=schema_pb2.INT32)
    string_type = schema_pb2.FieldType(atomic_type=schema_pb2.STRING)
    nullable_string_type = schema_pb2.FieldType(
        atomic_type=schema_pb2.STRING, nullable=True)

    old_schema = schema_pb2.Schema(
        id="add_test_old",
        fields=[
            schema_pb2.Field(name="f_int32", type=int32_type),
            schema_pb2.Field(name="f_str", type=string_type),
        ])
    new_schema = schema_pb2.Schema(
        encoding_positions_set=True,
        id="add_test_new",
        fields=[
            schema_pb2.Field(
                name="f_new_str",
                type=nullable_string_type,
                encoding_position=2),
            schema_pb2.Field(
                name="f_int32", type=int32_type, encoding_position=0),
            schema_pb2.Field(
                name="f_str", type=string_type, encoding_position=1),
        ])

    Old = named_tuple_from_schema(old_schema)
    New = named_tuple_from_schema(new_schema)

    new_coder = RowCoder(new_schema)
    encoded_old_row = RowCoder(old_schema).encode(Old(42, "Hello World!"))
    roundtripped = new_coder.decode(encoded_old_row)

    expected = New(f_new_str=None, f_int32=42, f_str="Hello World!")
    self.assertEqual(expected, roundtripped)
Пример #11
0
    def test_create_row_coder_from_schema(self):
        """Round-trip every row in ``self.PEOPLE`` through a schema-built coder.

        The "person" schema covers string, int32, nullable string,
        array-of-string, boolean, nullable bytes and string-to-int64 map
        fields.
        """
        string_type = schema_pb2.FieldType(atomic_type=schema_pb2.STRING)
        int64_type = schema_pb2.FieldType(atomic_type=schema_pb2.INT64)

        person_fields = [
            schema_pb2.Field(name="name", type=string_type),
            schema_pb2.Field(
                name="age",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32)),
            schema_pb2.Field(
                name="address",
                type=schema_pb2.FieldType(
                    atomic_type=schema_pb2.STRING, nullable=True)),
            schema_pb2.Field(
                name="aliases",
                type=schema_pb2.FieldType(
                    array_type=schema_pb2.ArrayType(
                        element_type=string_type))),
            schema_pb2.Field(
                name="knows_javascript",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.BOOLEAN)),
            schema_pb2.Field(
                name="payload",
                type=schema_pb2.FieldType(
                    atomic_type=schema_pb2.BYTES, nullable=True)),
            schema_pb2.Field(
                name="custom_metadata",
                type=schema_pb2.FieldType(
                    map_type=schema_pb2.MapType(
                        key_type=string_type, value_type=int64_type))),
        ]
        row_coder = RowCoder(
            schema_pb2.Schema(id="person", fields=person_fields))

        for person in self.PEOPLE:
            roundtripped = row_coder.decode(row_coder.encode(person))
            self.assertEqual(person, roundtripped)
Пример #12
0
    def test_generated_class_pickle(self):
        """An instance of a schema-generated NamedTuple survives pickling."""
        name_field = schema_pb2.Field(
            name='name',
            type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING))
        schema = schema_pb2.Schema(id="some-uuid", fields=[name_field])

        generated_type = named_tuple_from_schema(schema)
        original = generated_type(name="test")

        restored = pickle.loads(pickle.dumps(original))
        self.assertEqual(original, restored)
Пример #13
0
    def test_schema_with_bad_field_raises_helpful_error(self):
        """Converting a schema with an untyped field names the bad field.

        The single field's ``FieldType`` has no type information set; the
        expected ValueError must mention the field's name. A new
        ``SchemaTypeRegistry`` is passed so a cached type is not returned.
        """
        untyped_field = schema_pb2.Field(
            name="type_with_no_typeinfo", type=schema_pb2.FieldType())
        schema_proto = schema_pb2.Schema(
            fields=[untyped_field], id="helpful-error-uuid")

        # Should raise an exception referencing the problem field
        with self.assertRaisesRegex(ValueError, "type_with_no_typeinfo"):
            # bypass schema cache
            named_tuple_from_schema(
                schema_proto, schema_registry=SchemaTypeRegistry())
Пример #14
0
  def test_trivial_example(self):
    """A simple NamedTuple converts to the expected list of schema fields."""
    MyCuteClass = NamedTuple(
        'MyCuteClass',
        [
            ('name', unicode),
            ('age', Optional[int]),
            ('interests', List[unicode]),
            ('height', float),
            ('blob', ByteString),
        ])

    def field(name, **type_kwargs):
      # Build a schema field whose FieldType takes the given keyword args.
      return schema_pb2.Field(
          name=name, type=schema_pb2.FieldType(**type_kwargs))

    expected_fields = [
        field('name', atomic_type=schema_pb2.STRING),
        field('age', nullable=True, atomic_type=schema_pb2.INT64),
        field(
            'interests',
            array_type=schema_pb2.ArrayType(
                element_type=schema_pb2.FieldType(
                    atomic_type=schema_pb2.STRING))),
        field('height', atomic_type=schema_pb2.DOUBLE),
        field('blob', atomic_type=schema_pb2.BYTES),
    ]
    expected = schema_pb2.FieldType(
        row_type=schema_pb2.RowType(
            schema=schema_pb2.Schema(fields=expected_fields)))

    # Only test that the fields are equal. If we attempt to test the entire type
    # or the entire schema, the generated id will break equality.
    actual = typing_to_runner_api(MyCuteClass)
    self.assertEqual(
        expected.row_type.schema.fields, actual.row_type.schema.fields)
Пример #15
0
def typing_from_runner_api(fieldtype_proto):
    """Convert a ``schema_pb2.FieldType`` proto into a Python typing type.

    Args:
        fieldtype_proto: the ``schema_pb2.FieldType`` message to convert.

    Returns:
        ``Optional[T]`` for nullable protos; the ``ATOMIC_TYPE_TO_PRIMITIVE``
        entry for atomic types; ``Sequence[T]`` for arrays; ``Mapping[K, V]``
        for maps; a NamedTuple type (looked up in ``SCHEMA_REGISTRY`` or
        generated and registered) for rows. Logical types are not implemented
        yet and yield ``None``.

    Raises:
        ValueError: if the atomic type has no entry in
            ``ATOMIC_TYPE_TO_PRIMITIVE``, or if the proto has no recognized
            ``type_info`` member set.
    """
    if fieldtype_proto.nullable:
        # In order to determine the inner type, create a copy of fieldtype_proto
        # with nullable=False and pass back to typing_from_runner_api
        base_type = schema_pb2.FieldType()
        base_type.CopyFrom(fieldtype_proto)
        base_type.nullable = False
        return Optional[typing_from_runner_api(base_type)]

    type_info = fieldtype_proto.WhichOneof("type_info")
    if type_info == "atomic_type":
        try:
            return ATOMIC_TYPE_TO_PRIMITIVE[fieldtype_proto.atomic_type]
        except KeyError:
            raise ValueError("Unsupported atomic type: {0}".format(
                fieldtype_proto.atomic_type))
    elif type_info == "array_type":
        return Sequence[typing_from_runner_api(
            fieldtype_proto.array_type.element_type)]
    elif type_info == "map_type":
        return Mapping[
            typing_from_runner_api(fieldtype_proto.map_type.key_type),
            typing_from_runner_api(fieldtype_proto.map_type.value_type)]
    elif type_info == "row_type":
        schema = fieldtype_proto.row_type.schema
        user_type = SCHEMA_REGISTRY.get_typing_by_id(schema.id)
        if user_type is None:
            # Imported here rather than at module level; presumably to avoid a
            # circular import - TODO confirm.
            from apache_beam import coders
            # '-' is replaced because the schema id is embedded in a class
            # name.
            type_name = 'BeamSchema_{}'.format(schema.id.replace('-', '_'))
            user_type = NamedTuple(
                type_name, [(field.name, typing_from_runner_api(field.type))
                            for field in schema.fields])
            user_type.id = schema.id
            SCHEMA_REGISTRY.add(user_type, schema)
            coders.registry.register_coder(user_type, coders.RowCoder)
        return user_type

    elif type_info == "logical_type":
        # TODO: logical types are not supported yet. The previous behaviour
        # (returning None) is kept so existing callers are unaffected.
        return None

    else:
        # Without this branch a FieldType with no type_info set would fall
        # off the end and silently yield None as the "type".
        raise ValueError("Unrecognized type_info: {0!r}".format(type_info))
Пример #16
0
def named_tuple_from_schema(schema):
    """Return the Python type for ``schema`` by converting it as a row type.

    The schema is wrapped in a ``RowType`` inside a ``FieldType`` and handed to
    ``typing_from_runner_api``.
    """
    row_field_type = schema_pb2.FieldType(
        row_type=schema_pb2.RowType(schema=schema))
    return typing_from_runner_api(row_field_type)
Пример #17
0
 def test_float_maps_to_float64(self):
     """Python ``float`` converts to the DOUBLE atomic type."""
     expected = schema_pb2.FieldType(atomic_type=schema_pb2.DOUBLE)
     actual = typing_to_runner_api(float)
     self.assertEqual(expected, actual)
Пример #18
0
 def test_int_maps_to_int64(self):
     """Python ``int`` converts to the INT64 atomic type."""
     expected = schema_pb2.FieldType(atomic_type=schema_pb2.INT64)
     actual = typing_to_runner_api(int)
     self.assertEqual(expected, actual)
Пример #19
0
 def test_unknown_atomic_raise_valueerror(self):
     """Converting a FieldType with the UNSPECIFIED atomic type raises."""
     with self.assertRaises(ValueError):
         typing_from_runner_api(
             schema_pb2.FieldType(atomic_type=schema_pb2.UNSPECIFIED))
Пример #20
0
    def test_proto_survives_typing_roundtrip(self):
        """Check FieldType protos survive conversion to typing and back.

        Covers every atomic type except UNSPECIFIED (optional and
        non-optional), arrays and maps over those primitives, and two
        hand-written row schemas with fixed ids.
        """
        # Compare enum values with != rather than an identity check: they are
        # plain ints, and `is not` on ints relies on interpreter caching.
        atomic_types = [
            typ for typ in schema_pb2.AtomicType.values()
            if typ != schema_pb2.UNSPECIFIED
        ]

        all_nonoptional_primitives = [
            schema_pb2.FieldType(atomic_type=typ) for typ in atomic_types
        ]

        # The bytes type cannot survive a roundtrip to/from proto in Python 2.
        # In order to use BYTES a user type has to use typing.ByteString (because
        # bytes == str, and we map str to STRING).
        if not IS_PYTHON_3:
            all_nonoptional_primitives.remove(
                schema_pb2.FieldType(atomic_type=schema_pb2.BYTES))

        all_optional_primitives = [
            schema_pb2.FieldType(nullable=True, atomic_type=typ)
            for typ in atomic_types
        ]

        all_primitives = all_nonoptional_primitives + all_optional_primitives

        basic_array_types = [
            schema_pb2.FieldType(array_type=schema_pb2.ArrayType(
                element_type=typ)) for typ in all_primitives
        ]

        basic_map_types = [
            schema_pb2.FieldType(map_type=schema_pb2.MapType(
                key_type=key_type, value_type=value_type))
            for key_type, value_type in itertools.product(
                all_primitives, all_primitives)
        ]

        selected_schemas = [
            # One field per primitive type.
            schema_pb2.FieldType(row_type=schema_pb2.RowType(
                schema=schema_pb2.Schema(
                    id='32497414-85e8-46b7-9c90-9a9cc62fe390',
                    fields=[
                        schema_pb2.Field(name='field%d' % i, type=typ)
                        for i, typ in enumerate(all_primitives)
                    ]))),
            # Mix of nullable containers and containers of nullable elements.
            schema_pb2.FieldType(row_type=schema_pb2.RowType(
                schema=schema_pb2.Schema(
                    id='dead1637-3204-4bcb-acf8-99675f338600',
                    fields=[
                        schema_pb2.Field(name='id',
                                         type=schema_pb2.FieldType(
                                             atomic_type=schema_pb2.INT64)),
                        schema_pb2.Field(name='name',
                                         type=schema_pb2.FieldType(
                                             atomic_type=schema_pb2.STRING)),
                        schema_pb2.Field(
                            name='optional_map',
                            type=schema_pb2.FieldType(
                                nullable=True,
                                map_type=schema_pb2.MapType(
                                    key_type=schema_pb2.FieldType(
                                        atomic_type=schema_pb2.STRING),
                                    value_type=schema_pb2.FieldType(
                                        atomic_type=schema_pb2.DOUBLE)))),
                        schema_pb2.Field(
                            name='optional_array',
                            type=schema_pb2.FieldType(
                                nullable=True,
                                array_type=schema_pb2.ArrayType(
                                    element_type=schema_pb2.FieldType(
                                        atomic_type=schema_pb2.FLOAT)))),
                        schema_pb2.Field(
                            name='array_optional',
                            type=schema_pb2.FieldType(
                                array_type=schema_pb2.ArrayType(
                                    element_type=schema_pb2.FieldType(
                                        nullable=True,
                                        atomic_type=schema_pb2.BYTES)))),
                    ]))),
        ]

        test_cases = (
            all_primitives + basic_array_types + basic_map_types +
            selected_schemas)

        for test_case in test_cases:
            self.assertEqual(
                test_case,
                typing_to_runner_api(typing_from_runner_api(test_case)))
Пример #21
0
 def test_unknown_primitive_maps_to_any(self):
     """``np.uint32`` converts to the nullable pythonsdk_any logical type."""
     actual = typing_to_runner_api(np.uint32)
     any_logical_type = schema_pb2.LogicalType(
         urn="beam:logical:pythonsdk_any:v1")
     expected = schema_pb2.FieldType(
         logical_type=any_logical_type, nullable=True)
     self.assertEqual(actual, expected)
Пример #22
0
    def test_proto_survives_typing_roundtrip(self):
        """Check FieldType protos survive conversion to typing and back.

        Covers every atomic type except UNSPECIFIED (optional and
        non-optional), arrays and maps over those primitives, and two
        hand-written row schemas with fixed ids. Each conversion is given a
        new ``SchemaTypeRegistry``.
        """
        # Compare enum values with != rather than an identity check: they are
        # plain ints, and `is not` on ints relies on interpreter caching.
        atomic_types = [
            typ for typ in schema_pb2.AtomicType.values()
            if typ != schema_pb2.UNSPECIFIED
        ]

        all_nonoptional_primitives = [
            schema_pb2.FieldType(atomic_type=typ) for typ in atomic_types
        ]

        all_optional_primitives = [
            schema_pb2.FieldType(nullable=True, atomic_type=typ)
            for typ in atomic_types
        ]

        all_primitives = all_nonoptional_primitives + all_optional_primitives

        basic_array_types = [
            schema_pb2.FieldType(array_type=schema_pb2.ArrayType(
                element_type=typ)) for typ in all_primitives
        ]

        basic_map_types = [
            schema_pb2.FieldType(map_type=schema_pb2.MapType(
                key_type=key_type, value_type=value_type))
            for key_type, value_type in itertools.product(
                all_primitives, all_primitives)
        ]

        selected_schemas = [
            # One field per primitive type.
            schema_pb2.FieldType(row_type=schema_pb2.RowType(
                schema=schema_pb2.Schema(
                    id='32497414-85e8-46b7-9c90-9a9cc62fe390',
                    fields=[
                        schema_pb2.Field(name='field%d' % i, type=typ)
                        for i, typ in enumerate(all_primitives)
                    ]))),
            # Mix of nullable containers and containers of nullable elements.
            schema_pb2.FieldType(row_type=schema_pb2.RowType(
                schema=schema_pb2.Schema(
                    id='dead1637-3204-4bcb-acf8-99675f338600',
                    fields=[
                        schema_pb2.Field(name='id',
                                         type=schema_pb2.FieldType(
                                             atomic_type=schema_pb2.INT64)),
                        schema_pb2.Field(name='name',
                                         type=schema_pb2.FieldType(
                                             atomic_type=schema_pb2.STRING)),
                        schema_pb2.Field(
                            name='optional_map',
                            type=schema_pb2.FieldType(
                                nullable=True,
                                map_type=schema_pb2.MapType(
                                    key_type=schema_pb2.FieldType(
                                        atomic_type=schema_pb2.STRING),
                                    value_type=schema_pb2.FieldType(
                                        atomic_type=schema_pb2.DOUBLE)))),
                        schema_pb2.Field(
                            name='optional_array',
                            type=schema_pb2.FieldType(
                                nullable=True,
                                array_type=schema_pb2.ArrayType(
                                    element_type=schema_pb2.FieldType(
                                        atomic_type=schema_pb2.FLOAT)))),
                        schema_pb2.Field(
                            name='array_optional',
                            type=schema_pb2.FieldType(
                                array_type=schema_pb2.ArrayType(
                                    element_type=schema_pb2.FieldType(
                                        nullable=True,
                                        atomic_type=schema_pb2.BYTES)))),
                    ]))),
        ]

        test_cases = (
            all_primitives + basic_array_types + basic_map_types +
            selected_schemas)

        for test_case in test_cases:
            python_type = typing_from_runner_api(
                test_case, schema_registry=SchemaTypeRegistry())
            roundtripped = typing_to_runner_api(
                python_type, schema_registry=SchemaTypeRegistry())
            self.assertEqual(test_case, roundtripped)
Пример #23
0
    def typing_from_runner_api(self,
                               fieldtype_proto: schema_pb2.FieldType) -> type:
        """Convert a ``schema_pb2.FieldType`` proto into a Python typing type.

        Args:
            fieldtype_proto: the ``schema_pb2.FieldType`` message to convert.

        Returns:
            ``Optional[T]`` for nullable protos (``Any`` is returned
            unwrapped); the ``ATOMIC_TYPE_TO_PRIMITIVE`` entry for atomic
            types; ``Sequence[T]`` for arrays; ``Mapping[K, V]`` for maps; a
            NamedTuple type (looked up in ``self.schema_registry`` or
            generated and registered) for rows; ``Any`` or the logical type's
            language type for logical types.

        Raises:
            ValueError: if the atomic type has no entry in
                ``ATOMIC_TYPE_TO_PRIMITIVE``, if a row field cannot be
                converted, or if no recognized ``type_info`` member is set.
        """
        if fieldtype_proto.nullable:
            # Resolve the inner type from a copy of the proto with nullable
            # cleared, by passing the copy back through this method.
            non_null_proto = schema_pb2.FieldType()
            non_null_proto.CopyFrom(fieldtype_proto)
            non_null_proto.nullable = False
            inner_type = self.typing_from_runner_api(non_null_proto)
            # Any is returned as-is rather than wrapped in Optional.
            return inner_type if inner_type == Any else Optional[inner_type]

        type_info = fieldtype_proto.WhichOneof("type_info")

        if type_info == "atomic_type":
            try:
                return ATOMIC_TYPE_TO_PRIMITIVE[fieldtype_proto.atomic_type]
            except KeyError:
                raise ValueError("Unsupported atomic type: {0}".format(
                    fieldtype_proto.atomic_type))

        if type_info == "array_type":
            element_type = self.typing_from_runner_api(
                fieldtype_proto.array_type.element_type)
            return Sequence[element_type]

        if type_info == "map_type":
            map_proto = fieldtype_proto.map_type
            key_type = self.typing_from_runner_api(map_proto.key_type)
            value_type = self.typing_from_runner_api(map_proto.value_type)
            return Mapping[key_type, value_type]

        if type_info == "row_type":
            schema = fieldtype_proto.row_type.schema
            user_type = self.schema_registry.get_typing_by_id(schema.id)
            if user_type is not None:
                return user_type

            from apache_beam import coders

            type_name = 'BeamSchema_{}'.format(schema.id.replace('-', '_'))

            subfields = []
            for field in schema.fields:
                try:
                    field_py_type = self.typing_from_runner_api(field.type)
                except ValueError as e:
                    # Re-raise with the offending field proto in the message.
                    raise ValueError(
                        "Failed to decode schema due to an issue with Field proto:\n\n"
                        f"{text_format.MessageToString(field)}") from e
                subfields.append((field.name, field_py_type))

            user_type = NamedTuple(type_name, subfields)
            setattr(user_type, _BEAM_SCHEMA_ID, schema.id)

            # Define a reduce function, otherwise these types can't be pickled
            # (See BEAM-9574)
            def __reduce__(self):
                return (_hydrate_namedtuple_instance,
                        (schema.SerializeToString(), tuple(self)))

            setattr(user_type, '__reduce__', __reduce__)

            self.schema_registry.add(user_type, schema)
            coders.registry.register_coder(user_type, coders.RowCoder)
            return user_type

        if type_info == "logical_type":
            logical_proto = fieldtype_proto.logical_type
            if logical_proto.urn == PYTHON_ANY_URN:
                return Any
            return LogicalType.from_runner_api(logical_proto).language_type()

        raise ValueError(f"Unrecognized type_info: {type_info!r}")
Пример #24
0
def named_tuple_from_schema(
        schema, schema_registry: SchemaTypeRegistry = SCHEMA_REGISTRY) -> type:
    """Return the Python type for ``schema`` by converting it as a row type.

    The schema is wrapped in a ``RowType`` inside a ``FieldType`` and handed to
    ``typing_from_runner_api`` together with ``schema_registry``, which
    defaults to the module-level ``SCHEMA_REGISTRY``.
    """
    row_field_type = schema_pb2.FieldType(
        row_type=schema_pb2.RowType(schema=schema))
    return typing_from_runner_api(row_field_type, schema_registry)
Пример #25
0
    def test_create_row_coder_from_schema(self):
        """Round-trip every row in ``self.PEOPLE`` through a schema-built coder.

        The "person" schema covers string, int32, nullable string,
        array-of-string, boolean, nullable bytes, string-to-int64 map and a
        micros_instant logical-type field whose representation is a row of
        two int64 fields.
        """
        string_type = schema_pb2.FieldType(atomic_type=schema_pb2.STRING)
        int64_type = schema_pb2.FieldType(atomic_type=schema_pb2.INT64)

        # Row representation used by the micros_instant logical type below.
        micros_instant_schema = schema_pb2.Schema(
            id="micros_instant",
            fields=[
                schema_pb2.Field(name="seconds", type=int64_type),
                schema_pb2.Field(name="micros", type=int64_type),
            ])
        micros_instant_type = schema_pb2.FieldType(
            logical_type=schema_pb2.LogicalType(
                urn="beam:logical_type:micros_instant:v1",
                representation=schema_pb2.FieldType(
                    row_type=schema_pb2.RowType(
                        schema=micros_instant_schema))))

        person_fields = [
            schema_pb2.Field(name="name", type=string_type),
            schema_pb2.Field(
                name="age",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32)),
            schema_pb2.Field(
                name="address",
                type=schema_pb2.FieldType(
                    atomic_type=schema_pb2.STRING, nullable=True)),
            schema_pb2.Field(
                name="aliases",
                type=schema_pb2.FieldType(
                    array_type=schema_pb2.ArrayType(
                        element_type=string_type))),
            schema_pb2.Field(
                name="knows_javascript",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.BOOLEAN)),
            schema_pb2.Field(
                name="payload",
                type=schema_pb2.FieldType(
                    atomic_type=schema_pb2.BYTES, nullable=True)),
            schema_pb2.Field(
                name="custom_metadata",
                type=schema_pb2.FieldType(
                    map_type=schema_pb2.MapType(
                        key_type=string_type, value_type=int64_type))),
            schema_pb2.Field(name="favorite_time", type=micros_instant_type),
        ]
        row_coder = RowCoder(
            schema_pb2.Schema(id="person", fields=person_fields))

        for person in self.PEOPLE:
            roundtripped = row_coder.decode(row_coder.encode(person))
            self.assertEqual(person, roundtripped)