def write_header(fo, metadata, sync_marker):
    """Write the avro container-file header to *fo*.

    The header carries the magic bytes, the metadata map and the sync
    marker, and is itself serialized with HEADER_SCHEMA.
    """
    # Header `meta` values must be bytes per the header schema, hence utob().
    encoded_meta = {key: utob(value) for key, value in iteritems(metadata)}
    header = {
        'magic': MAGIC,
        'meta': encoded_meta,
        'sync': sync_marker,
    }
    write_data(fo, header, HEADER_SCHEMA)
def read_record(fo, writer_schema, reader_schema=None):
    """Read a record from *fo* and return it as a dict.

    A record is encoded by encoding the values of its fields in the order
    that they are declared. In other words, a record is encoded as just
    the concatenation of the encodings of its fields. Field values are
    encoded per their schema.

    Schema Resolution:
     * the ordering of fields may be different: fields are matched by name.
     * schemas for fields with the same name in both records are resolved
       recursively.
     * if the writer's record contains a field with a name not present in
       the reader's record, the writer's value for that field is ignored.
     * if the reader's record schema has a field that contains a default
       value, and writer's schema does not have a field with the same name,
       then the reader should use the default value from its field.
     * if the reader's record schema has a field with no default value, and
       writer's schema does not have a field with the same name, then the
       field's value is unset.

    Raises SchemaResolutionError when a reader field is absent from the
    writer schema and carries no default.
    """
    record = {}
    if reader_schema is None:
        # No resolution needed: decode writer fields in declaration order.
        for field in writer_schema['fields']:
            record[field['name']] = read_data(fo, field['type'])
    else:
        readers_field_dict = \
            dict((f['name'], f) for f in reader_schema['fields'])
        for field in writer_schema['fields']:
            readers_field = readers_field_dict.get(field['name'])
            if readers_field:
                record[field['name']] = read_data(fo,
                                                  field['type'],
                                                  readers_field['type'])
            else:
                # should implement skip
                read_data(fo, field['type'], field['type'])

        # fill in default values
        if len(readers_field_dict) > len(record):
            writer_fields = [f['name'] for f in writer_schema['fields']]
            for field_name, field in iteritems(readers_field_dict):
                if field_name not in writer_fields:
                    # BUGFIX: test key presence, not truthiness — falsy
                    # defaults (0, '', False, None) are valid defaults and
                    # previously raised SchemaResolutionError.
                    if 'default' in field:
                        record[field['name']] = field['default']
                    else:
                        msg = 'No default value for %s' % field['name']
                        raise SchemaResolutionError(msg)
    return record
def read_record(fo, writer_schema, reader_schema=None):
    """Decode one record from *fo* into a dict keyed by field name.

    A record is encoded simply as the concatenation of the encodings of
    its fields, in declaration order, each per its own schema.

    Schema Resolution:
     * the ordering of fields may be different: fields are matched by name.
     * schemas for fields with the same name in both records are resolved
       recursively.
     * if the writer's record contains a field with a name not present in
       the reader's record, the writer's value for that field is ignored.
     * if the reader's record schema has a field that contains a default
       value, and writer's schema does not have a field with the same name,
       then the reader should use the default value from its field.
     * if the reader's record schema has a field with no default value, and
       writer's schema does not have a field with the same name, then the
       field's value is unset.
    """
    record = {}
    if reader_schema is None:
        # Fast path: no resolution, just decode in declaration order.
        for field in writer_schema['fields']:
            record[field['name']] = read_data(fo, field['type'])
        return record

    readers_field_dict = {f['name']: f for f in reader_schema['fields']}
    for field in writer_schema['fields']:
        name = field['name']
        readers_field = readers_field_dict.get(name)
        if readers_field:
            record[name] = read_data(fo, field['type'],
                                     readers_field['type'])
        else:
            # should implement skip
            read_data(fo, field['type'], field['type'])

    # fill in default values
    if len(readers_field_dict) > len(record):
        writer_fields = [f['name'] for f in writer_schema['fields']]
        for field_name, field in iteritems(readers_field_dict):
            if field_name in writer_fields:
                continue
            # Key-presence test so falsy defaults (0, '', False) count.
            if 'default' in field:
                record[field_name] = field['default']
            else:
                raise SchemaResolutionError(
                    'No default value for %s' % field_name)
    return record
def write_map(fo, datum, schema):
    """Write the dict *datum* to *fo* as an avro map.

    Maps are encoded as a series of blocks. Each block consists of a long
    count value, followed by that many key/value pairs. A block with count
    zero indicates the end of the map. Each item is encoded per the map's
    value schema.

    If a block's count is negative, then the count is followed immediately
    by a long block size, indicating the number of bytes in the block. The
    actual count in this case is the absolute value of the count written.
    """
    if datum:
        # Emit a single block holding every entry.
        write_long(fo, len(datum))
        value_schema = schema['values']
        for key, value in iteritems(datum):
            write_utf8(fo, key)
            write_data(fo, value, value_schema)
    # Zero-count block terminates the map.
    write_long(fo, 0)
def __init__(self, fo, reader_schema=None):
    """Creates a new iterator

    Parameters
    ----------
    fo: file like
        Input stream
    reader_schema: dict, optional
        Reader schema

    Example
    -------
    >>> with open('some-file.avro', 'rb') as fo:
    >>>     avro = iter_avro(fo)
    >>>     schema = avro.schema
    >>>     for record in avro:
    >>>         process_record(record)
    """
    self.fo = fo
    try:
        # The container-file header is itself avro-encoded.
        self._header = read_data(fo, HEADER_SCHEMA)
    except StopIteration:
        # read_data signals EOF via StopIteration; translate it to a
        # clearer error for callers passing a non-avro stream.
        raise ValueError('cannot read header - is it an avro file?')
    # `meta` values are bytes. So, the actual decoding has to be external.
    self.metadata = \
        dict((k, btou(v)) for k, v in iteritems(self._header['meta']))
    # The writer schema travels inside the file's metadata as JSON text.
    self.schema = self.writer_schema = \
        json.loads(self.metadata['avro.schema'])
    # 'null' codec (no compression) is the avro default when unspecified.
    self.codec = self.metadata.get('avro.codec', 'null')
    self.reader_schema = reader_schema
    # Register any named types from the writer schema for later lookup.
    acquaint_schema(self.writer_schema, READERS)
    if reader_schema:
        populate_schema_defs(reader_schema, SCHEMA_DEFS)
    # Lazy record iterator; consumed as the instance is iterated.
    self._records = _iter_avro(fo, self._header, self.codec,
                               self.writer_schema, reader_schema)
def __init__(self, fo, reader_schema=None):
    """Creates a new iterator

    Parameters
    ----------
    fo: file like
        Input stream
    reader_schema: dict, optional
        Reader schema

    Example
    -------
    >>> with open('some-file.avro', 'rb') as fo:
    >>>     avro = iter_avro(fo)
    >>>     schema = avro.schema
    >>>     for record in avro:
    >>>         process_record(record)
    """
    self.fo = fo
    try:
        # The container-file header is itself avro-encoded.
        self._header = read_data(fo, HEADER_SCHEMA)
    except StopIteration:
        raise ValueError('cannot read header - is it an avro file?')
    # `meta` values are bytes. So, the actual decoding has to be external.
    self.metadata = \
        dict((k, btou(v)) for k, v in iteritems(self._header['meta']))
    # The writer schema travels inside the file's metadata as JSON text.
    self.schema = self.writer_schema = \
        json.loads(self.metadata['avro.schema'])
    # 'null' codec (no compression) is the avro default when unspecified.
    self.codec = self.metadata.get('avro.codec', 'null')
    self.reader_schema = reader_schema
    acquaint_schema(self.writer_schema, READERS)
    if reader_schema:
        # CONSISTENCY FIX: sibling constructors pass the SCHEMA_DEFS
        # registry explicitly; this copy dropped the second argument.
        populate_schema_defs(reader_schema, SCHEMA_DEFS)
    self._records = _iter_avro(fo, self._header, self.codec,
                               self.writer_schema, reader_schema)
def __init__(self, fo, reader_schema=None):
    """Create a record iterator over the avro container file in *fo*.

    *fo* is a binary file-like input stream; *reader_schema* is an
    optional dict used for schema resolution while reading.
    """
    self.fo = fo
    try:
        self._header = read_data(fo, HEADER_SCHEMA)
    except StopIteration:
        raise ValueError('cannot read header - is it an avro file?')

    # `meta` values are bytes. So, the actual decoding has to be external.
    self.metadata = {k: btou(v)
                     for k, v in iteritems(self._header['meta'])}

    # The writer schema is stored as JSON text in the file metadata.
    self.schema = self.writer_schema = \
        json.loads(self.metadata['avro.schema'])
    self.codec = self.metadata.get('avro.codec', 'null')
    self.reader_schema = reader_schema

    acquaint_schema(self.writer_schema, READERS)
    if reader_schema:
        populate_schema_defs(reader_schema, SCHEMA_DEFS)

    self._records = _iter_avro(fo, self._header, self.codec,
                               self.writer_schema, reader_schema)