def test_write_in_multiple_blocks(self):
    writer = AvroWriter(
        self.client,
        'weather.avro',
        schema=self.schema,
        sync_interval=1  # Flush block on every write.
    )
    with writer:
        for record in self.records:
            writer.write(record)
    with AvroReader(self.client, 'weather.avro') as reader:
        eq_(list(reader), self.records)
def test_write_codec(self):
    with open(osp.join(self.dpath, 'weather.jsonl')) as reader:
        main(
            [
                'write', 'weather.avro',
                '--schema', dumps(self.schema),
                '--codec', 'deflate',
            ],
            client=self.client,
            stdin=reader
        )
    # Correct content.
    with AvroReader(self.client, 'weather.avro') as reader:
        records = list(reader)
    eq_(records, self.records)
    # Different size (might not be smaller, since the file is very small).
    compressed_size = self.client.content('weather.avro')['length']
    uncompressed_size = osp.getsize(osp.join(self.dpath, 'weather.avro'))
    ok_(compressed_size != uncompressed_size)
def test_read_with_compatible_schema(self):
    self.client.upload('w.avro', osp.join(self.dpath, 'weather.avro'))
    schema = {
        'name': 'test.Weather',
        'type': 'record',
        'fields': [
            { 'name': 'temp', 'type': 'int' },
            { 'name': 'tag', 'type': 'string', 'default': '' },
        ],
    }
    with AvroReader(self.client, 'w.avro', reader_schema=schema) as reader:
        eq_(
            list(reader),
            [{ 'temp': r['temp'], 'tag': '' } for r in self.records]
        )
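Note on the snippet above: it relies on Avro's standard schema resolution rules. Fields present in the file's writer schema but absent from the reader schema are dropped, while fields that exist only in the reader schema (here, `tag`) must declare a default value, which is filled in for every record.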
from hdfs import Config
from hdfs.ext.avro import AvroReader, AvroWriter

# Get the default alias' client.
client = Config().get_client()

# Some sample data.
records = [
    { 'name': 'Ann', 'age': 23 },
    { 'name': 'Bob', 'age': 22 },
]

# Write an Avro file to HDFS (since our records' schema is very simple, we let
# the writer infer it automatically; otherwise we would pass it as an argument).
with AvroWriter(client, 'names.avro', overwrite=True) as writer:
    for record in records:
        writer.write(record)

# Read it back.
with AvroReader(client, 'names.avro') as reader:
    schema = reader.schema  # The inferred schema.
    content = reader.content  # The remote file's HDFS content object.
    assert list(reader) == records  # The records match!
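For records with a less trivial shape, we would typically skip inference and hand the writer an explicit schema via its `schema` argument. A minimal sketch of that variant, reusing the `client` and `records` defined above (the `example.Person` record name is a hypothetical choice for illustration):

# Same write as above, but with an explicit (rather than inferred) schema.
schema = {
    'name': 'example.Person',  # Hypothetical record name, for illustration.
    'type': 'record',
    'fields': [
        { 'name': 'name', 'type': 'string' },
        { 'name': 'age', 'type': 'int' },
    ],
}

with AvroWriter(client, 'names.avro', schema=schema, overwrite=True) as writer:
    for record in records:
        writer.write(record)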
def read():
    """Read back the example Avro file from HDFS and print each record."""
    with AvroReader(client, '/tmp/hdfscli_avro/example.avro') as reader:
        for record in reader:
            print(record)
def test_write(self):
    write_dataframe(self.client, 'weather.avro', self.df)
    with AvroReader(self.client, 'weather.avro') as reader:
        eq_(list(reader), self.records)
def test_infer_schema(self):
    with AvroWriter(self.client, 'weather.avro') as writer:
        for record in self.records:
            writer.write(record)
    with AvroReader(self.client, 'weather.avro') as reader:
        eq_(list(reader), self.records)
def test_write_empty(self):
    with AvroWriter(self.client, 'empty.avro', schema=self.schema):
        pass
    with AvroReader(self.client, 'empty.avro') as reader:
        eq_(reader.schema, self.schema)
        eq_(list(reader), [])
def test_read(self):
    self.client.upload('weather.avro', osp.join(self.dpath, 'weather.avro'))
    with AvroReader(self.client, 'weather.avro') as reader:
        eq_(list(reader), self.records)
def test_read_with_same_schema(self):
    self.client.upload('w.avro', osp.join(self.dpath, 'weather.avro'))
    with AvroReader(self.client, 'w.avro', reader_schema=self.schema) as reader:
        eq_(list(reader), self.records)
def testAvroLength(self, status, destFileName, client):
    # Reject files larger than 5000 bytes outright.
    if status[u"length"] > 5000:
        return False
    # Otherwise, the Avro file passes only if it contains no records.
    with AvroReader(client, destFileName) as reader:
        return len(list(reader)) == 0