def test_other_attributes(self):
    print_name('TEST OTHER ATTRIBUTES')
    correct = 0
    props = {}
    for example in OTHER_PROP_EXAMPLES:
        original_schema = schema.parse(example.schema_string)
        round_trip_schema = schema.parse(str(original_schema))
        self.assertEqual(original_schema.other_props, round_trip_schema.other_props)
        if original_schema.type == "record":
            field_props = 0
            for f in original_schema.fields:
                if f.other_props:
                    props.update(f.other_props)
                    field_props += 1
            self.assertEqual(field_props, len(original_schema.fields))
        if original_schema.other_props:
            props.update(original_schema.other_props)
            correct += 1
    for k in props:
        v = props[k]
        if k == "cp_boolean":
            self.assertEqual(type(v), bool)
        elif k == "cp_int":
            self.assertEqual(type(v), int)
        elif k == "cp_object":
            self.assertEqual(type(v), dict)
        elif k == "cp_float":
            self.assertEqual(type(v), float)
        elif k == "cp_array":
            self.assertEqual(type(v), list)
    self.assertEqual(correct, len(OTHER_PROP_EXAMPLES))
def test_correct_recursive_extraction(self):
    s = schema.parse(
        '{"type": "record", "name": "X", "fields": ['
        '{"name": "y", "type": {"type": "record", "name": "Y", "fields": ['
        '{"name": "Z", "type": "X"}]}}]}'
    )
    t = schema.parse(str(s.fields[0].type))
    # If we've made it this far, the subschema was reasonably stringified;
    # it could be reparsed.
    self.assertEqual("X", t.fields[0].type.name)
def validate_avro_schema(value):
    '''
    Attempt to parse ``value`` into an Avro schema.
    Raise ``ValidationError`` on error.
    '''
    try:
        parse(json.dumps(value))
    except SchemaParseException as e:
        raise ValidationError(str(e))
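# Minimal usage sketch for validate_avro_schema. The schema dict below is
# illustrative, not from the source; ValidationError is whatever exception
# class this module already imports.
good = {
    "type": "record",
    "name": "Person",
    "fields": [{"name": "name", "type": "string"}],
}
validate_avro_schema(good)  # parses cleanly, returns None

try:
    validate_avro_schema({"type": "not_a_type"})
except ValidationError as err:
    print(err)  # carries the underlying SchemaParseException message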
def test_exception_is_not_swallowed_on_parse_error(self):
    print_name('TEST EXCEPTION NOT SWALLOWED ON PARSE ERROR')
    try:
        schema.parse('/not/a/real/file')
        caught_exception = False
    except schema.SchemaParseException as e:
        expected_message = 'Error parsing JSON: /not/a/real/file, error = ' \
                           'No JSON object could be decoded'
        self.assertEqual(expected_message, e.args[0])
        caught_exception = True
    self.assertTrue(caught_exception, 'Exception was not caught')
def test_valid_cast_to_string_after_parse(self):
    # Test that the string generated by an Avro Schema object
    # is, in fact, a valid Avro schema.
    print_name('TEST CAST TO STRING AFTER PARSE')
    correct = 0
    for example in VALID_EXAMPLES:
        schema_data = schema.parse(example.schema_string)
        schema.parse(str(schema_data))
        correct += 1
    fail_msg = "Cast to string success on %d out of %d schemas" % \
               (correct, len(VALID_EXAMPLES))
    self.assertEqual(correct, len(VALID_EXAMPLES), fail_msg)
def test_unknown_symbol(self):
    print_name('TEST UNKNOWN SYMBOL')
    writers_schema = schema.parse("""\
        {"type": "enum", "name": "Test", "symbols": ["FOO", "BAR"]}""")
    datum_to_write = 'FOO'
    readers_schema = schema.parse("""\
        {"type": "enum", "name": "Test", "symbols": ["BAR", "BAZ"]}""")
    with self.assertRaises(io.SchemaResolutionException):
        writer, encoder, datum_writer = write_datum(datum_to_write, writers_schema)
        reader = StringIO(writer.getvalue())
        decoder = io.BinaryDecoder(reader)
        datum_reader = io.DatumReader(writers_schema, readers_schema)
        # resolving 'FOO' against a reader schema that lacks it must fail
        datum_reader.read(decoder)
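# Many tests in this section call write_datum/read_datum helpers that are not
# shown in the excerpt. A plausible sketch, modeled on the avro test suite
# (the exact signatures are an assumption, not part of this code base); it
# relies on the same StringIO and io imports the tests already use.
def write_datum(datum, writers_schema):
    # encode a single datum into an in-memory buffer
    writer = StringIO()
    encoder = io.BinaryEncoder(writer)
    datum_writer = io.DatumWriter(writers_schema)
    datum_writer.write(datum, encoder)
    return writer, encoder, datum_writer

def read_datum(buffer, writers_schema, readers_schema=None):
    # decode a single datum back out, resolving reader vs. writer schema
    reader = StringIO(buffer.getvalue())
    decoder = io.BinaryDecoder(reader)
    datum_reader = io.DatumReader(writers_schema, readers_schema)
    return datum_reader.read(decoder)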
def check_skip_number(number_type):
    print_name('TEST SKIP %s' % number_type.upper())
    correct = 0
    for value_to_skip, hex_encoding in BINARY_ENCODINGS:
        VALUE_TO_READ = 6253
        print('Value to Skip: %d' % value_to_skip)

        # write the value to skip and a known value
        writers_schema = schema.parse('"%s"' % number_type.lower())
        writer, encoder, datum_writer = write_datum(value_to_skip, writers_schema)
        datum_writer.write(VALUE_TO_READ, encoder)

        # skip the value
        reader = StringIO(writer.getvalue())
        decoder = io.BinaryDecoder(reader)
        decoder.skip_long()

        # read data from string buffer
        datum_reader = io.DatumReader(writers_schema)
        read_value = datum_reader.read(decoder)

        print('Read Value: %d' % read_value)
        if read_value == VALUE_TO_READ:
            correct += 1
        print('')
    return correct
def test_schema_promotion(self):
    print_name('TEST SCHEMA PROMOTION')
    # note that checking writers_schema.type in read_data
    # allows us to handle promotion correctly
    promotable_schemas = ['"int"', '"long"', '"float"', '"double"']
    incorrect = 0
    for i, ws in enumerate(promotable_schemas):
        writers_schema = schema.parse(ws)
        datum_to_write = 219
        for rs in promotable_schemas[i + 1:]:
            readers_schema = schema.parse(rs)
            writer, enc, dw = write_datum(datum_to_write, writers_schema)
            datum_read = read_datum(writer, writers_schema, readers_schema)
            print('Writer: %s Reader: %s' % (writers_schema, readers_schema))
            print('Datum Read: %s' % datum_read)
            if datum_read != datum_to_write:
                incorrect += 1
    self.assertEqual(incorrect, 0)
def test_type_exception(self):
    print_name('TEST TYPE EXCEPTION')
    writers_schema = schema.parse("""\
        {"type": "record", "name": "Test",
         "fields": [{"name": "F", "type": "int"},
                    {"name": "E", "type": "int"}]}""")
    datum_to_write = {'E': 5, 'F': 'Bad'}
    with self.assertRaises(io.AvroTypeException):
        write_datum(datum_to_write, writers_schema)
def validate_entity_payload(schema_definition, payload):
    # Use spavro to validate the payload against the linked schema
    try:
        avro_schema = parse(json.dumps(schema_definition))
        valid = validate(avro_schema, payload)
    except Exception as err:
        raise ValidationError(str(err))
    # raise outside the try block so this error is not re-wrapped above
    if not valid:
        raise ValidationError(MESSAGE_NOT_VALID)
    return True
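# Usage sketch for validate_entity_payload. The schema and payloads are
# invented for illustration; MESSAGE_NOT_VALID and ValidationError come from
# the surrounding module.
person = {
    "type": "record",
    "name": "Person",
    "fields": [{"name": "name", "type": "string"}],
}
validate_entity_payload(person, {"name": "Ada"})  # -> True
# validate_entity_payload(person, {"name": 42})   # raises ValidationError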
def configure(self, taskType, inSchemaText, outSchemaText):
    """
    Parameters
    ----------
    taskType - What type of task (e.g. map, reduce)
        - This is an enumeration which is specified in the input protocol.
    inSchemaText - string containing the input schema
        - This is the actual schema with which the data was encoded,
          i.e. it is the writer's schema
          (see http://avro.apache.org/docs/current/spec.html#Schema+Resolution).
          This is the schema the parent process is using, which might be
          different from the one provided by the subclass of tether_task.
    outSchemaText - string containing the output schema
        - This is the schema expected by the parent process for the output.
    """
    self.taskType = taskType
    try:
        inSchema = schema.parse(inSchemaText)
        outSchema = schema.parse(outSchemaText)

        if taskType == TaskType.MAP:
            self.inReader = avio.DatumReader(writers_schema=inSchema,
                                             readers_schema=self.inschema)
            self.midCollector = Collector(outSchemaText, self.outputClient)
        elif taskType == TaskType.REDUCE:
            self.midReader = avio.DatumReader(writers_schema=inSchema,
                                              readers_schema=self.midschema)
            # this.outCollector = new Collector<OUT>(outSchema);
            self.outCollector = Collector(outSchemaText, self.outputClient)

            # determine which fields in the input record are the keys for the reducer
            self._red_fkeys = [f.name for f in self.midschema.fields
                               if not (f.order == 'ignore')]
    except Exception as e:
        estr = traceback.format_exc()
        self.fail(estr)
def test_validate(self):
    print_name('TEST VALIDATE')
    passed = 0
    for example_schema, datum in SCHEMAS_TO_VALIDATE:
        print('Schema: %s' % example_schema)
        print('Datum: %s' % datum)
        validated = io.validate(schema.parse(example_schema), datum)
        print('Valid: %s' % validated)
        if validated:
            passed += 1
    self.assertEqual(passed, len(SCHEMAS_TO_VALIDATE))
def validate_avro(schema, datum):
    result = tools.AvroValidator(
        schema=parse(json.dumps(schema)),
        datum=datum,
    )
    errors = []
    for error in result.errors:
        errors.append({
            'description': tools.format_validation_error(error),
        })
    return errors
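# Illustrative call for validate_avro; the schema and datum are made up. An
# empty list means the datum conforms; otherwise each entry carries a
# human-readable description of one validation error.
errors = validate_avro(
    {"type": "record", "name": "P",
     "fields": [{"name": "n", "type": "string"}]},
    {"n": 1},  # wrong type for "n", so expect one error entry
)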
def test_parse(self):
    correct = 0
    for example in EXAMPLES:
        try:
            schema.parse(example.schema_string)
            parsed = True
        except Exception:
            parsed = False
        # check the outcome against the example's expectation; doing this
        # outside the try block keeps self.fail() from being swallowed
        if parsed == example.valid:
            correct += 1
        elif parsed:
            self.fail("Invalid schema was parsed: " + example.schema_string)
        else:
            self.fail("Valid schema failed to parse: " + example.schema_string)
    fail_msg = "Parse behavior correct on %d out of %d schemas." % \
               (correct, len(EXAMPLES))
    self.assertEqual(correct, len(EXAMPLES), fail_msg)
def __init__(self, writer, datum_writer, writers_schema=None, codec='null'):
    """
    If the schema is not present, presume we're appending.

    @param writer: File-like object to write into.
    """
    self._writer = writer
    self._encoder = io.BinaryEncoder(writer)
    self._datum_writer = datum_writer
    self._buffer_writer = StringIO()
    self._buffer_encoder = io.BinaryEncoder(self._buffer_writer)
    self._block_count = 0
    self._meta = {}
    self._header_written = False

    if writers_schema is not None:
        if codec not in VALID_CODECS:
            raise DataFileException("Unknown codec: %r" % codec)
        self._sync_marker = DataFileWriter.generate_sync_marker()
        self.set_meta('avro.codec', codec)
        self.set_meta('avro.schema', str(writers_schema))
        self.datum_writer.writers_schema = writers_schema
    else:
        # appending: the file object must be readable as well as writable
        # so the existing header can be recovered
        append_mode_error = (
            "When appending records to an Avro data file, the file object "
            "passed into DataFileWriter must be opened in read/write mode, "
            'e.g. for files: "rb+" or "ab+"')
        if hasattr(writer, 'mode'):
            if writer.mode not in ('rb+', 'ab+'):
                raise DataFileException(append_mode_error)
        else:
            if not (writer.readable() and writer.writable()):
                raise DataFileException(append_mode_error)

        # open writer for reading to collect metadata
        dfr = DataFileReader(writer, io.DatumReader())

        # TODO(hammer): collect arbitrary metadata
        # collect metadata
        self._sync_marker = dfr.sync_marker
        self.set_meta('avro.codec', dfr.get_meta('avro.codec'))

        # get schema used to write existing file
        schema_from_file = dfr.get_meta('avro.schema')
        self.set_meta('avro.schema', schema_from_file)
        self.datum_writer.writers_schema = schema.parse(schema_from_file)

        # seek to the end of the file and prepare for writing
        writer.seek(0, 2)
        self._header_written = True
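# Sketch of both DataFileWriter construction modes, mirroring the append test
# below; the file name and record schema are placeholders.
ws = schema.parse(
    '{"type": "record", "name": "R", "fields": [{"name": "x", "type": "int"}]}')
# fresh file: schema given, header written up front
with DataFileWriter(open('data.avro', 'wb'), io.DatumWriter(), ws) as dfw:
    dfw.append({'x': 1})
# append: no schema given, so the writer recovers it from the existing header
with DataFileWriter(open('data.avro', 'ab+'), io.DatumWriter()) as dfw:
    dfw.append({'x': 2})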
def test_equivalence_after_round_trip(self):
    # 1. Given a string, parse it to get Avro schema "original".
    # 2. Serialize "original" to a string and parse that string
    #    to generate Avro schema "round trip".
    # 3. Ensure "original" and "round trip" schemas are equivalent.
    print_name('TEST ROUND TRIP')
    correct = 0
    for example in VALID_EXAMPLES:
        original_schema = schema.parse(example.schema_string)
        round_trip_schema = schema.parse(str(original_schema))
        if original_schema == round_trip_schema:
            correct += 1
            debug_msg = "%s: ROUND TRIP SUCCESS" % example.name
        else:
            debug_msg = "%s: ROUND TRIP FAILURE" % example.name
            self.fail("Round trip failure: %s, %s, %s" %
                      (example.name, original_schema, round_trip_schema))
    fail_msg = "Round trip success on %d out of %d schemas" % \
               (correct, len(VALID_EXAMPLES))
    self.assertEqual(correct, len(VALID_EXAMPLES), fail_msg)
def test_doc_attributes(self):
    print_name('TEST DOC ATTRIBUTES')
    correct = 0
    for example in DOC_EXAMPLES:
        original_schema = schema.parse(example.schema_string)
        if original_schema.doc is not None:
            correct += 1
        if original_schema.type == 'record':
            for f in original_schema.fields:
                if f.doc is None:
                    self.fail("Failed to preserve 'doc' in fields: " +
                              example.schema_string)
    self.assertEqual(correct, len(DOC_EXAMPLES))
def test_no_default_value(self):
    print_name('TEST NO DEFAULT VALUE')
    writers_schema = LONG_RECORD_SCHEMA
    datum_to_write = LONG_RECORD_DATUM
    readers_schema = schema.parse("""\
        {"type": "record", "name": "Test",
         "fields": [{"name": "H", "type": "int"}]}""")
    writer, encoder, datum_writer = write_datum(datum_to_write, writers_schema)
    with self.assertRaises(io.SchemaResolutionException):
        reader = StringIO(writer.getvalue())
        decoder = io.BinaryDecoder(reader)
        datum_reader = io.DatumReader(writers_schema, readers_schema)
        # field "H" has no default, so schema resolution must fail on read
        datum_reader.read(decoder)
def test_field_order(self):
    print_name('TEST FIELD ORDER')
    writers_schema = LONG_RECORD_SCHEMA
    datum_to_write = LONG_RECORD_DATUM
    readers_schema = schema.parse("""\
        {"type": "record", "name": "Test",
         "fields": [{"name": "F", "type": "int"},
                    {"name": "E", "type": "int"}]}""")
    datum_to_read = {'E': 5, 'F': 6}
    writer, encoder, datum_writer = write_datum(datum_to_write, writers_schema)
    datum_read = read_datum(writer, writers_schema, readers_schema)
    print('Datum Read: %s' % datum_read)
    self.assertEqual(datum_to_read, datum_read)
def test_append(self):
    print('')
    print('TEST APPEND')
    print('===========')
    print('')
    correct = 0
    for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
        for codec in CODECS_TO_VALIDATE:
            print('')
            print('SCHEMA NUMBER %d' % (i + 1))
            print('================')
            print('')
            print('Schema: %s' % example_schema)
            print('Datum: %s' % datum)
            print('Codec: %s' % codec)

            # write data in binary to file once
            writer = open(FILENAME, 'wb')
            datum_writer = io.DatumWriter()
            schema_object = schema.parse(example_schema)
            dfw = datafile.DataFileWriter(writer, datum_writer, schema_object,
                                          codec=codec)
            dfw.append(datum)
            dfw.close()

            # open file, write, and close nine times
            # (use "_" so the outer loop index "i" is not clobbered)
            for _ in range(9):
                writer = open(FILENAME, 'ab+')
                dfw = datafile.DataFileWriter(writer, io.DatumWriter())
                dfw.append(datum)
                dfw.close()

            # read data in binary from file
            reader = open(FILENAME, 'rb')
            datum_reader = io.DatumReader()
            dfr = datafile.DataFileReader(reader, datum_reader)
            appended_data = []
            # avoid shadowing "datum", which is compared against below
            for appended_datum in dfr:
                appended_data.append(appended_datum)

            print('Appended Data: %s' % appended_data)
            print('Appended Data Length: %d' % len(appended_data))
            is_correct = [datum] * 10 == appended_data
            if is_correct:
                correct += 1
            print('Correct Appended: %s' % is_correct)
            print('')
    os.remove(FILENAME)
    self.assertEqual(correct,
                     len(CODECS_TO_VALIDATE) * len(SCHEMAS_TO_VALIDATE))
def check_binary_encoding(number_type):
    print_name('TEST BINARY %s ENCODING' % number_type.upper())
    correct = 0
    for datum, hex_encoding in BINARY_ENCODINGS:
        print('Datum: %d' % datum)
        print('Correct Encoding: %s' % hex_encoding)

        writers_schema = schema.parse('"%s"' % number_type.lower())
        writer, encoder, datum_writer = write_datum(datum, writers_schema)
        writer.seek(0)
        hex_val = avro_hexlify(writer)

        print('Read Encoding: %s' % hex_val)
        if hex_encoding == hex_val:
            correct += 1
        print('')
    return correct
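# For context: Avro encodes int/long with zigzag + varint, so the entries of
# a table like BINARY_ENCODINGS pair a datum with its hex bytes, e.g.
#   0  -> '00'      (zigzag(0)  = 0)
#   -1 -> '01'      (zigzag(-1) = 1)
#   1  -> '02'      (zigzag(1)  = 2)
#   64 -> '80 01'   (zigzag(64) = 128, which spills into a second varint byte)
# The exact table contents are not shown in this excerpt.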
def test_round_trip(self):
    print_name('TEST ROUND TRIP')
    correct = 0
    for example_schema, datum in SCHEMAS_TO_VALIDATE:
        print('Schema: %s' % example_schema)
        print('Datum: %s' % datum)
        writers_schema = schema.parse(example_schema)
        writer, encoder, datum_writer = write_datum(datum, writers_schema)
        round_trip_datum = read_datum(writer, writers_schema)
        print('Round Trip Datum: %s' % round_trip_datum)
        if datum == round_trip_datum:
            correct += 1
        else:
            print("Mismatch: {} != {}".format(datum, round_trip_datum))
    self.assertEqual(correct, len(SCHEMAS_TO_VALIDATE))
def test_default_value(self):
    print_name('TEST DEFAULT VALUE')
    writers_schema = LONG_RECORD_SCHEMA
    datum_to_write = LONG_RECORD_DATUM
    correct = 0
    for field_type, default_json, default_datum in DEFAULT_VALUE_EXAMPLES:
        readers_schema = schema.parse("""\
            {"type": "record", "name": "Test",
             "fields": [{"name": "H", "type": %s, "default": %s}]}
            """ % (field_type, default_json))
        datum_to_read = {'H': default_datum}
        writer, encoder, datum_writer = write_datum(datum_to_write, writers_schema)
        datum_read = read_datum(writer, writers_schema, readers_schema)
        print('Datum Read: %s' % datum_read)
        if datum_to_read == datum_read:
            correct += 1
    self.assertEqual(correct, len(DEFAULT_VALUE_EXAMPLES))
def create_remote_kafka_assets(request, sample_generator, *args):
    # @mark annotation does not work with autouse=True.
    if 'integration' not in request.config.invocation_params.args:
        LOG.debug('NOT creating Kafka Assets')
        # yield once to satisfy the yield-fixture contract, then skip the work
        yield None
        return
    LOG.debug('Creating Kafka Assets')
    kafka_security = config.get_kafka_admin_config()
    kadmin = get_admin_client(kafka_security)
    new_topic = f'{TENANT}.{TEST_TOPIC}'
    create_topic(kadmin, new_topic)
    GENERATED_SAMPLES[new_topic] = []
    producer = get_producer(kafka_security)
    schema = parse(json.dumps(ANNOTATED_SCHEMA))
    for subset in sample_generator(max=100, chunk=10):
        GENERATED_SAMPLES[new_topic].extend(subset)
        produce(subset, schema, new_topic, producer)
    yield None  # end of work before clean-up
    LOG.debug(f'deleting topic: {new_topic}')
    delete_topic(kadmin, new_topic)
def test_round_trip(self):
    print('')
    print('TEST ROUND TRIP')
    print('===============')
    print('')
    correct = 0
    print(SCHEMAS_TO_VALIDATE)
    for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
        for codec in CODECS_TO_VALIDATE:
            print('')
            print('SCHEMA NUMBER %d' % (i + 1))
            print('================')
            print('')
            print('Schema: %s' % example_schema)
            print('Datum: %s' % datum)
            print('Codec: %s' % codec)

            # write data in binary to file 10 times
            writer = open(FILENAME, 'wb')
            datum_writer = io.DatumWriter()
            schema_object = schema.parse(example_schema)
            dfw = datafile.DataFileWriter(writer, datum_writer, schema_object,
                                          codec=codec)
            for datum_counter in range(10):
                dfw.append(datum)
            dfw.close()

            # read data in binary from file
            reader = open(FILENAME, 'rb')
            datum_reader = io.DatumReader()
            dfr = datafile.DataFileReader(reader, datum_reader)
            round_trip_data = []
            for read_datum in dfr:
                round_trip_data.append(read_datum)

            print('Round Trip Data: %s' % round_trip_data)
            print('Round Trip Data Length: %d' % len(round_trip_data))
            is_correct = [datum] * 10 == round_trip_data
            if is_correct:
                correct += 1
            print('Correct Round Trip: %s' % is_correct)
            print('')
    os.remove(FILENAME)
    self.assertEqual(correct,
                     len(CODECS_TO_VALIDATE) * len(SCHEMAS_TO_VALIDATE))
def _send_kafka(objs: List[Any], schema, _type,
                max_size=MAX_KAFKA_MESSAGE_SIZE, callback=None):
    # check size
    total_size = fb_utils.utf8size(schema) + fb_utils.utf8size(objs)
    _logger.debug(f'Sending {len(objs)} of {_type} to kafka @ size {total_size}')
    if total_size >= max_size:
        raise RuntimeError(
            f'Message size: {total_size} exceeds maximum: {max_size}. Chunking.')
    if not get_broker_info(KADMIN):
        raise ConnectionError('Could not connect to Kafka.')
    schema = parse(schema)
    tenant = CONF.get('tenant')
    topic = fb_utils.sanitize_topic(f'{tenant}.fbs.{_type}')
    produce(objs, schema, topic, PRODUCER, callback=callback)
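# Hypothetical call, assuming `rows` already match `person_schema_json` and
# that the module globals (CONF, KADMIN, PRODUCER) are configured:
# _send_kafka(rows, person_schema_json, 'person')
# The records land on the sanitized topic '<tenant>.fbs.person'.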
def test_metadata(self):
    # Test the writer with a 'with' statement.
    writer = open(FILENAME, 'wb')
    datum_writer = io.DatumWriter()
    sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
    schema_object = schema.parse(sample_schema)
    with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
        dfw.set_meta('test.string', 'foo')
        dfw.set_meta('test.number', '1')
        dfw.append(sample_datum)
    self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    reader = open(FILENAME, 'rb')
    datum_reader = io.DatumReader()
    with datafile.DataFileReader(reader, datum_reader) as dfr:
        self.assertEqual('foo', dfr.get_meta('test.string'))
        self.assertEqual('1', dfr.get_meta('test.number'))
        for datum in dfr:
            datums.append(datum)
    self.assertTrue(reader.closed)
def __init__(self, scheme=None, outputClient=None):
    """
    Parameters
    ----------
    scheme - The schema for the datums to output
        - can be a JSON string or an instance of Schema
    outputClient - The output client used to send messages to the parent
    """
    if not isinstance(scheme, schema.Schema):
        scheme = schema.parse(scheme)

    if outputClient is None:
        raise ValueError("output client can't be None.")

    self.scheme = scheme
    self.buff = StringIO()
    self.encoder = avio.BinaryEncoder(self.buff)
    self.datum_writer = avio.DatumWriter(writers_schema=self.scheme)
    self.outputClient = outputClient
def __init__(self, reader, datum_reader):
    self._reader = reader
    self._raw_decoder = io.BinaryDecoder(reader)
    self._datum_decoder = None  # Maybe reset at every block.
    self._datum_reader = datum_reader

    # read the header: magic, meta, sync
    self._read_header()

    # ensure codec is valid
    self.codec = self.get_meta(CODEC_KEY)
    if self.codec is None:
        self.codec = "null"
    if self.codec not in VALID_CODECS:
        raise DataFileException('Unknown codec: %s.' % self.codec)

    # get file length
    self._file_length = self.determine_file_length()

    # get ready to read
    self._block_count = 0
    self.datum_reader.writers_schema = schema.parse(self.get_meta(SCHEMA_KEY))
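# Typical read path, mirroring the datafile tests above (FILENAME is a
# placeholder for an existing Avro container file):
with DataFileReader(open(FILENAME, 'rb'), io.DatumReader()) as dfr:
    for record in dfr:
        print(record)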