def test_encoding_position_reorder_fields(self):
    """A schema with explicit encoding positions can decode rows that were
    encoded under a schema listing the same fields in a different order."""
    int32_type = schema_pb2.FieldType(atomic_type=schema_pb2.INT32)
    str_type = schema_pb2.FieldType(atomic_type=schema_pb2.STRING)
    source_schema = schema_pb2.Schema(
        id="reorder_test_schema1",
        fields=[
            schema_pb2.Field(name="f_int32", type=int32_type),
            schema_pb2.Field(name="f_str", type=str_type),
        ])
    # Same fields, declared in the opposite order; encoding_position pins
    # each field back to its wire position in source_schema.
    reordered_schema = schema_pb2.Schema(
        id="reorder_test_schema2",
        encoding_positions_set=True,
        fields=[
            schema_pb2.Field(name="f_str", type=str_type, encoding_position=1),
            schema_pb2.Field(
                name="f_int32", type=int32_type, encoding_position=0),
        ])
    SourceRow = named_tuple_from_schema(source_schema)
    ReorderedRow = named_tuple_from_schema(reordered_schema)
    encoded = RowCoder(source_schema).encode(SourceRow(42, "Hello World!"))
    roundtripped = RowCoder(reordered_schema).decode(encoded)
    self.assertEqual(
        ReorderedRow(f_int32=42, f_str="Hello World!"), roundtripped)
def test_encoding_position_reorder_fields_same_encoding(self):
    """Encoding under a schema whose field declarations are reordered (with
    encoding_position annotations) matches the original encoding.

    Renamed from ``test_encoding_position_reorder_fields``: that name collides
    with the roundtrip test defined earlier in this file, and the later
    definition silently masks the earlier one so only one of the two ever ran.
    """
    fields = [("field1", str), ("field2", int), ("field3", int)]
    expected = typing.NamedTuple('expected', fields)
    # NOTE(review): this schema does NOT set encoding_positions_set, so the
    # encoding_position values may be ignored by RowCoder and the fields
    # encoded in declaration order — confirm which behavior is intended here.
    reorder = schema_pb2.Schema(
        id="new_order",
        fields=[
            schema_pb2.Field(
                name="field3",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING),
                encoding_position=2),
            schema_pb2.Field(
                name="field2",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32),
                encoding_position=1),
            schema_pb2.Field(
                name="field1",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32),
                encoding_position=0)
        ])
    old_coder = RowCoder.from_type_hint(expected, None)
    new_coder = RowCoder(reorder)
    encode_expected = old_coder.encode(expected("foo", 7, 12))
    encode_reorder = new_coder.encode(expected(12, 7, "foo"))
    self.assertEqual(encode_expected, encode_reorder)
def test_row_coder_nested_struct(self):
    """A row containing row-typed fields (nested structs) roundtrips."""
    Pair = typing.NamedTuple('Pair', [('left', Person), ('right', Person)])
    pair = Pair(self.PEOPLE[0], self.PEOPLE[1])
    pair_coder = RowCoder(typing_to_runner_api(Pair).row_type.schema)
    self.assertEqual(pair, pair_coder.decode(pair_coder.encode(pair)))
def test_create_row_coder_from_schema(self):
    """A RowCoder built directly from a schema proto (atomic, nullable,
    array, map and logical-type fields) roundtrips every PEOPLE fixture."""

    def atomic(atomic_type, nullable=False):
        # Shorthand for an atomic FieldType proto.
        return schema_pb2.FieldType(atomic_type=atomic_type, nullable=nullable)

    micros_instant_schema = schema_pb2.Schema(
        id="micros_instant",
        fields=[
            schema_pb2.Field(name="seconds", type=atomic(schema_pb2.INT64)),
            schema_pb2.Field(name="micros", type=atomic(schema_pb2.INT64)),
        ])
    schema = schema_pb2.Schema(
        id="person",
        fields=[
            schema_pb2.Field(name="name", type=atomic(schema_pb2.STRING)),
            schema_pb2.Field(name="age", type=atomic(schema_pb2.INT32)),
            schema_pb2.Field(
                name="address", type=atomic(schema_pb2.STRING, nullable=True)),
            schema_pb2.Field(
                name="aliases",
                type=schema_pb2.FieldType(
                    array_type=schema_pb2.ArrayType(
                        element_type=atomic(schema_pb2.STRING)))),
            schema_pb2.Field(
                name="knows_javascript", type=atomic(schema_pb2.BOOLEAN)),
            schema_pb2.Field(
                name="payload", type=atomic(schema_pb2.BYTES, nullable=True)),
            schema_pb2.Field(
                name="custom_metadata",
                type=schema_pb2.FieldType(
                    map_type=schema_pb2.MapType(
                        key_type=atomic(schema_pb2.STRING),
                        value_type=atomic(schema_pb2.INT64)))),
            schema_pb2.Field(
                name="favorite_time",
                type=schema_pb2.FieldType(
                    logical_type=schema_pb2.LogicalType(
                        urn="beam:logical_type:micros_instant:v1",
                        representation=schema_pb2.FieldType(
                            row_type=schema_pb2.RowType(
                                schema=micros_instant_schema))))),
        ])
    coder = RowCoder(schema)
    for person in self.PEOPLE:
        self.assertEqual(person, coder.decode(coder.encode(person)))
def _verify_row(self, schema, row_payload, expected_values):
    """Decode row_payload with a RowCoder for schema and assert every
    (attribute, value) pair in expected_values is present on the result."""
    decoded = RowCoder(schema).decode(row_payload)
    for name, expected in expected_values.items():
        self.assertTrue(hasattr(decoded, name))
        self.assertEqual(expected, getattr(decoded, name))
def _get_schema_proto_and_payload(self, *args, **kwargs):
    """Build a (schema proto, encoded row payload) pair from the given
    positional and keyword values.

    Positional values are named using IGNORED_ARG_FORMAT with their index;
    keyword values keep their key as the field name. None values are
    rejected since their type cannot be inferred.
    """
    named_fields = []
    fields_to_values = OrderedDict()
    for index, value in enumerate(args):
        if value is None:
            raise ValueError(
                'Received value None. None values are currently not supported')
        field_name = JavaClassLookupPayloadBuilder.IGNORED_ARG_FORMAT % index
        named_fields.append(
            (field_name, convert_to_typing_type(instance_to_type(value))))
        fields_to_values[field_name] = value
    for key, value in kwargs.items():
        if not key:
            raise ValueError('Parameter name cannot be empty')
        if value is None:
            raise ValueError(
                'Received value None for key %s. None values are currently not '
                'supported' % key)
        named_fields.append(
            (key, convert_to_typing_type(instance_to_type(value))))
        fields_to_values[key] = value
    schema_proto = named_fields_to_schema(named_fields)
    row = named_tuple_from_schema(schema_proto)(**fields_to_values)
    schema = named_tuple_to_schema(type(row))
    payload = RowCoder(schema).encode(row)
    return (schema_proto, payload)
def test_create_row_coder_from_simple_schema(self):
    """Roundtrip PEOPLE through a RowCoder built from a minimal schema
    (atomic, nullable and array fields only).

    Renamed from ``test_create_row_coder_from_schema``: that name collides
    with the more comprehensive test defined earlier in this file, and the
    later definition silently masks the earlier one so only one of the two
    ever ran.

    NOTE(review): this schema has fewer fields than the comprehensive
    variant — confirm that self.PEOPLE instances match this 6-field shape.
    """
    schema = schema_pb2.Schema(
        id="person",
        fields=[
            schema_pb2.Field(
                name="name",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.STRING)),
            schema_pb2.Field(
                name="age",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.INT32)),
            schema_pb2.Field(
                name="address",
                type=schema_pb2.FieldType(
                    atomic_type=schema_pb2.STRING, nullable=True)),
            schema_pb2.Field(
                name="aliases",
                type=schema_pb2.FieldType(
                    array_type=schema_pb2.ArrayType(
                        element_type=schema_pb2.FieldType(
                            atomic_type=schema_pb2.STRING)))),
            schema_pb2.Field(
                name="knows_javascript",
                type=schema_pb2.FieldType(atomic_type=schema_pb2.BOOLEAN)),
            schema_pb2.Field(
                name="payload",
                type=schema_pb2.FieldType(
                    atomic_type=schema_pb2.BYTES, nullable=True)),
        ])
    coder = RowCoder(schema)
    for test_case in self.PEOPLE:
        self.assertEqual(test_case, coder.decode(coder.encode(test_case)))
def __init__(
        self,
        table_name,
        driver_class_name,
        jdbc_url,
        username,
        password,
        statement=None,
        connection_properties=None,
        connection_init_sqls=None,
        expansion_service=None,
        classpath=None,
):
    """
    Initializes a write operation to Jdbc.

    :param driver_class_name: name of the jdbc driver class
    :param jdbc_url: full jdbc url to the database.
    :param username: database username
    :param password: database password
    :param statement: sql statement to be executed
    :param connection_properties: properties of the jdbc connection
                                  passed as string with format
                                  [propertyName=property;]*
    :param connection_init_sqls: required only for MySql and MariaDB.
                                 passed as list of strings
    :param expansion_service: The address (host:port) of the ExpansionService.
    :param classpath: A list of JARs or Java packages to include in the
                      classpath for the expansion service. This option is
                      usually needed for `jdbc` to include extra JDBC driver
                      packages. The packages can be in these three formats:
                      (1) A local file, (2) A URL, (3) A gradle-style
                      identifier of a Maven package (e.g.
                      "org.postgresql:postgresql:42.3.1"). By default, this
                      argument includes a Postgres SQL JDBC driver.
    """
    # Fall back to the bundled Postgres driver when no classpath is given.
    classpath = classpath or DEFAULT_JDBC_CLASSPATH
    # Write-only config: read_query and the read tuning knobs stay None.
    config = Config(
        driver_class_name=driver_class_name,
        jdbc_url=jdbc_url,
        username=username,
        password=password,
        connection_properties=connection_properties,
        connection_init_sqls=connection_init_sqls,
        write_statement=statement,
        read_query=None,
        fetch_size=None,
        output_parallelization=None,
    )
    config_coder = RowCoder(typing_to_runner_api(Config).row_type.schema)
    super().__init__(
        self.URN,
        NamedTupleBasedPayloadBuilder(
            JdbcConfigSchema(
                location=table_name, config=config_coder.encode(config)),
        ),
        expansion_service or default_io_expansion_service(classpath),
    )
def test_typing_payload_builder_with_bytes(self):
    """Bytes-valued fields roundtrip and decode equal to the canonical
    values (string_utf8 coder historically accepted non-unicode on py2)."""
    result = self.get_payload_from_typing_hints(self.bytes_values)
    decoded = RowCoder(result.schema).decode(result.payload)
    for key, expected in self.values.items():
        self.assertEqual(getattr(decoded, key), expected)
def test_row_coder_cloud_object_schema(self):
    """as_cloud_object() embeds the schema proto as JSON-encoded bytes."""
    schema_proto = schema_pb2.Schema()
    expected_json = json_format.MessageToJson(schema_proto).encode('utf-8')
    cloud_object = RowCoder(schema_proto).as_cloud_object()
    self.assertEqual(expected_json, cloud_object['schema'])
def test_row_coder_fail_early_bad_schema(self):
    """RowCoder construction fails fast on a field with no type info,
    naming the offending field in the error."""
    bad_schema = schema_pb2.Schema(
        fields=[
            schema_pb2.Field(
                name="type_with_no_typeinfo", type=schema_pb2.FieldType())
        ])
    # Should raise an exception referencing the problem field
    with self.assertRaisesRegex(ValueError, "type_with_no_typeinfo"):
        RowCoder(bad_schema)
def test_implicit_payload_builder(self):
    """ImplicitSchemaPayloadBuilder output decodes back to the inputs."""
    result = ImplicitSchemaPayloadBuilder(PayloadBase.values).build()
    decoded = RowCoder(result.schema).decode(result.payload)
    # ImplicitSchemaPayloadBuilder omits fields whose value is None (their
    # type cannot be inferred), hence the default in the getattr call.
    for key, expected in PayloadBase.values.items():
        self.assertEqual(getattr(decoded, key, None), expected)
def test_create_row_coder_from_named_tuple(self):
    """The registry coder for Person encodes identically to a RowCoder
    built from Person's schema, and roundtrips every fixture."""
    schema_coder = RowCoder(typing_to_runner_api(Person).row_type.schema)
    registry_coder = coders_registry.get_coder(Person)
    for person in self.PEOPLE:
        self.assertEqual(
            schema_coder.encode(person), registry_coder.encode(person))
        self.assertEqual(
            person, registry_coder.decode(registry_coder.encode(person)))
def test_encoding_position_add_fields_and_reorder(self):
    """A new schema may add a nullable field and reorder declarations as
    long as encoding positions preserve the original wire layout; decoding
    an old row fills the added field with None."""
    int32_type = schema_pb2.FieldType(atomic_type=schema_pb2.INT32)
    str_type = schema_pb2.FieldType(atomic_type=schema_pb2.STRING)
    old_schema = schema_pb2.Schema(
        id="add_test_old",
        fields=[
            schema_pb2.Field(name="f_int32", type=int32_type),
            schema_pb2.Field(name="f_str", type=str_type),
        ])
    new_schema = schema_pb2.Schema(
        encoding_positions_set=True,
        id="add_test_new",
        fields=[
            # The added field encodes after both original fields.
            schema_pb2.Field(
                name="f_new_str",
                type=schema_pb2.FieldType(
                    atomic_type=schema_pb2.STRING, nullable=True),
                encoding_position=2),
            schema_pb2.Field(
                name="f_int32", type=int32_type, encoding_position=0),
            schema_pb2.Field(name="f_str", type=str_type, encoding_position=1),
        ])
    OldRow = named_tuple_from_schema(old_schema)
    NewRow = named_tuple_from_schema(new_schema)
    encoded = RowCoder(old_schema).encode(OldRow(42, "Hello World!"))
    roundtripped = RowCoder(new_schema).decode(encoded)
    self.assertEqual(
        NewRow(f_new_str=None, f_int32=42, f_str="Hello World!"),
        roundtripped)
def test_row_accepts_trailing_zeros_truncated(self):
    # Verifies the row decoder accepts a payload whose trailing all-zero
    # null-marker bytes were truncated by the writer: the header is rebuilt
    # by hand with fewer null bytes than the encoder emitted, and decoding
    # must still reproduce the original value.
    expected_coder = RowCoder(
        typing_to_runner_api(NullablePerson).row_type.schema)
    person = NullablePerson(
        None,
        np.int32(25),
        "Westeros", ["Mother of Dragons"],
        False,
        None, {"dragons": 3},
        None,
        "NotNull")
    out = expected_coder.encode(person)
    # 9 fields, 1 null byte, field 0, 5, 7 are null
    # Hand-built header: field count 9, a single null-marker byte with bits
    # 0, 5 and 7 set, then the field data copied from the original encoding
    # (out[4:] skips the original header bytes — presumably field count plus
    # two null bytes for 9 fields; TODO confirm against the encoder output).
    new_payload = bytes([9, 1, 1 | 1 << 5 | 1 << 7]) + out[4:]
    new_value = expected_coder.decode(new_payload)
    self.assertEqual(person, new_value)
def __init__(
        self,
        table_name,
        driver_class_name,
        jdbc_url,
        username,
        password,
        query=None,
        output_parallelization=None,
        fetch_size=None,
        connection_properties=None,
        connection_init_sqls=None,
        expansion_service=None,
        classpath=None,
):
    """
    Initializes a read operation from Jdbc.

    :param driver_class_name: name of the jdbc driver class
    :param jdbc_url: full jdbc url to the database.
    :param username: database username
    :param password: database password
    :param query: sql query to be executed
    :param output_parallelization: is output parallelization on
    :param fetch_size: how many rows to fetch
    :param connection_properties: properties of the jdbc connection
                                  passed as string with format
                                  [propertyName=property;]*
    :param connection_init_sqls: required only for MySql and MariaDB.
                                 passed as list of strings
    :param expansion_service: The address (host:port) of the ExpansionService.
    :param classpath: A list of JARs or Java packages to include in the
                      classpath for the expansion service. This option is
                      usually needed for `jdbc` to include extra JDBC driver
                      packages. The packages can be in these three formats:
                      (1) A local file, (2) A URL, (3) A gradle-style
                      identifier of a Maven package (e.g.
                      "org.postgresql:postgresql:42.3.1"). By default, this
                      argument includes a Postgres SQL JDBC driver.
    """
    # Consistency fix: the write transform already accepts a classpath and
    # defaults it to DEFAULT_JDBC_CLASSPATH; the read transform previously
    # launched the expansion service without the JDBC driver jars.
    classpath = classpath or DEFAULT_JDBC_CLASSPATH
    super().__init__(
        self.URN,
        NamedTupleBasedPayloadBuilder(
            JdbcConfigSchema(
                location=table_name,
                # Read-only config: write_statement stays None.
                config=RowCoder(
                    typing_to_runner_api(Config).row_type.schema).encode(
                        Config(
                            driver_class_name=driver_class_name,
                            jdbc_url=jdbc_url,
                            username=username,
                            password=password,
                            connection_properties=connection_properties,
                            connection_init_sqls=connection_init_sqls,
                            write_statement=None,
                            read_query=query,
                            fetch_size=fetch_size,
                            output_parallelization=output_parallelization,
                        ))),
        ),
        expansion_service or default_io_expansion_service(classpath),
    )
def test_implicit_payload_builder_with_bytes(self):
    """Bytes-valued fields roundtrip through the implicit payload builder
    without corrupting cached type conversions."""
    result = ImplicitSchemaPayloadBuilder(PayloadBase.bytes_values).build()
    decoded = RowCoder(result.schema).decode(result.payload)
    # ImplicitSchemaPayloadBuilder omits fields whose value is None (their
    # type cannot be inferred), hence the default in the getattr call.
    for key, expected in PayloadBase.values.items():
        self.assertEqual(getattr(decoded, key, None), expected)
    # Verify we have not modified a cached type (BEAM-10766)
    # TODO(BEAM-7372): Remove when bytes coercion code is removed.
    self.assertEqual(
        typehints.List[bytes], convert_to_beam_type(typing.List[bytes]))
def test_implicit_payload_builder_with_bytes_roundtrip(self):
    """Bytes-valued fields roundtrip through the implicit payload builder.

    Renamed from ``test_implicit_payload_builder_with_bytes``: that name
    collides with another test in this file and the later definition
    silently masked the earlier one. The ``sys.version_info[0] < 3`` branch
    was removed as dead code — this codebase is Python-3-only (it uses
    argument-free ``super().__init__()`` elsewhere).
    """
    values = PayloadBase.bytes_values
    builder = ImplicitSchemaPayloadBuilder(values)
    result = builder.build()
    decoded = RowCoder(result.schema).decode(result.payload)
    for key, value in PayloadBase.values.items():
        # Note the default value in the getattr call.
        # ImplicitSchemaPayloadBuilder omits fields with value=None since
        # their type cannot be inferred.
        self.assertEqual(getattr(decoded, key, None), value)
def parse_string_payload(input_byte):
    """Deserialize an ExternalConfigurationPayload and decode its row
    payload into a plain dict keyed by field name."""
    payload = external_transforms_pb2.ExternalConfigurationPayload()
    payload.ParseFromString(input_byte)
    decoded_row = RowCoder(payload.schema).decode(payload.payload)
    return decoded_row._asdict()
def build(self):
    """Encode this builder's named-tuple instance as an
    ExternalConfigurationPayload carrying its schema and row payload."""
    row = self._get_named_tuple_instance()
    schema = named_tuple_to_schema(type(row))
    encoded_row = RowCoder(schema).encode(row)
    return ExternalConfigurationPayload(schema=schema, payload=encoded_row)
def test_typehints_payload_builder(self):
    """A payload built from typing hints decodes back to the input values."""
    result = self.get_payload_from_typing_hints(self.values)
    decoded = RowCoder(result.schema).decode(result.payload)
    for key, expected in self.values.items():
        self.assertEqual(getattr(decoded, key), expected)