def test_write_in_multiple_blocks(self):
    """Round-trip records through a multi-block Avro file.

    Using ``sync_interval=1`` forces the writer to flush a block after
    every record, so the reader must correctly stitch blocks back together.
    """
    writer = AvroWriter(
        self.client,
        'weather.avro',
        schema=self.schema,
        sync_interval=1,  # Flush block on every write.
    )
    with writer:
        for rec in self.records:
            writer.write(rec)
    with AvroReader(self.client, 'weather.avro') as reader:
        eq_(list(reader), self.records)
def test_write_in_multiple_blocks(self):
    """Write one Avro block per record, then verify a full read-back."""
    # sync_interval=1 flushes a block on every single write.
    writer = AvroWriter(self.client, 'weather.avro', schema=self.schema,
                        sync_interval=1)
    with writer:
        for record in self.records:
            writer.write(record)
    with AvroReader(self.client, 'weather.avro') as reader:
        eq_(list(reader), self.records)
def test_write(self):
    """Records written remotely must match the local fixture byte-for-byte."""
    with AvroWriter(self.client, 'weather.avro', schema=self.schema) as writer:
        for record in self.records:
            writer.write(record)
    with temppath() as tpath:
        self.client.download('weather.avro', tpath)
        eq_(
            self._get_data_bytes(osp.join(self.dpath, 'weather.avro')),
            self._get_data_bytes(tpath),
        )
def test_write(self):
    """Compare the uploaded Avro file's bytes against the local fixture."""
    writer = AvroWriter(self.client, 'weather.avro', schema=self.schema)
    with writer:
        for rec in self.records:
            writer.write(rec)
    with temppath() as tpath:
        self.client.download('weather.avro', tpath)
        expected = self._get_data_bytes(osp.join(self.dpath, 'weather.avro'))
        actual = self._get_data_bytes(tpath)
        eq_(expected, actual)
def write():
    """Dump the module-level `records` to an Avro file on HDFS, overwriting
    any existing file at that path."""
    path = '/tmp/hdfscli_avro/example.avro'
    with AvroWriter(client, path, overwrite=True, schema=schema) as writer:
        for record in records:
            writer.write(record)
def test_read_part_file(self):
    """The CLI `read --parts` option must return only the selected part file."""
    data = {
        'part-m-00000.avro': [{'name': 'jane'}, {'name': 'bob'}],
        'part-m-00001.avro': [{'name': 'john'}, {'name': 'liz'}],
    }
    # Build a two-part Avro "directory" dataset on HDFS.
    for fname, records in data.items():
        with AvroWriter(self.client, 'data.avro/%s' % (fname, )) as writer:
            for record in records:
                writer.write(record)
    with temppath() as tpath:
        # `--parts 1,` selects parts from index 1 onwards, i.e. only part 1.
        with open(tpath, 'w') as writer:
            main(
                ['read', 'data.avro', '--parts', '1,'],
                client=self.client,
                stdout=writer,
            )
        with open(tpath) as reader:
            records = [loads(line) for line in reader]
        eq_(records, data['part-m-00001.avro'])
def test_write_overwrite_error(self):
    """Writing over an existing directory must fail cleanly.

    To check that the background `AsyncWriter` thread doesn't hang when the
    remote path already exists as a directory.
    """
    self.client.makedirs('weather.avro')
    with AvroWriter(self.client, 'weather.avro', schema=self.schema) as writer:
        for rec in self.records:
            writer.write(rec)
def saveToHDFS(self):
    """Export one day's rows (past the last checkpointed id) from MySQL to Avro on HDFS.

    Builds a range query from the persisted checkpoint ``self.curID_PT``
    (``"PT"`` = day being exported, ``"ID"`` = last exported row id), streams
    the matching rows into a new Avro file via ``AvroWriter``, converting
    ``datetime`` values to POSIX timestamps, and advances the checkpoint id
    after each written row. The checkpoint is always persisted in ``finally``
    via ``self.saveWhere()``, even when the query or the upload fails.
    """
    # NOTE(review): values interpolated here come from internal state, not
    # user input; if that ever changes, switch to parameterized queries.
    day = self.curID_PT["PT"]
    qs = (
        "select %s from %s where push_time >= '%s 00:00:00' "
        "and push_time < '%s 00:00:00' and id>%d order by id asc;" % (
            self.needFields,
            self.prefix,
            day.strftime('%Y-%m-%d'),
            (day + timedelta(1)).strftime('%Y-%m-%d'),
            self.curID_PT["ID"],
        )
    )
    curAvroFileName = self.buildFileName()
    try:
        with self.con, AvroWriter(self.clientHDFS, curAvroFileName,
                                  schema=self.schema, overwrite=True) as writer:
            db = self.con.cursor(mdb.cursors.DictCursor)
            db.execute(qs)
            for r in db.fetchall():
                # Avro can't serialize datetime objects directly; store them
                # as POSIX timestamps instead. isinstance (not type ==) also
                # covers datetime subclasses.
                for k, v in r.items():
                    if isinstance(v, datetime.datetime):
                        r[k] = time.mktime(v.timetuple())
                writer.write(r)
                # Advance the checkpoint only after the row is written.
                self.curID_PT["ID"] = r["id"]
    except mdb.Error as e:
        self.slogger.info(e)
    finally:
        self.slogger.info("Now here:%s\t%d" % (
            self.curID_PT["PT"].isoformat(), self.curID_PT["ID"]))
        self.saveWhere()
def upload_to_hdfs(hdfs, records):
    """Write *records* to a timestamped Avro file on HDFS (best effort).

    Records that the schema rejects are skipped rather than aborting the
    whole upload.
    """
    print('saving to AVRO ...')
    path = f'{BASE_DIR}{FILE_NAME}-{round(time.time())}.avro'
    with AvroWriter(hdfs, path, schema=parsed_schema) as writer:
        for record in records:
            try:
                writer.write(record)
            except Exception:
                # Deliberate best-effort skip — but unlike the bare `except:`
                # this replaces, don't swallow SystemExit/KeyboardInterrupt.
                continue
    # NOTE(review): presumably gives the writer's background thread time to
    # settle — confirm whether this sleep is actually needed.
    time.sleep(1)
def file_conversion_to_avro(client, fp_to_write, fp_to_read, schema):
    """Convert a CSV file on HDFS to Avro.

    The first multi-field line supplies the record keys; every later
    multi-field line becomes one Avro record. Single-field lines are skipped.
    """
    with AvroWriter(client, fp_to_write, schema) as writer:
        with client.read(fp_to_read, encoding='utf-8', delimiter='\n') as reader:
            for line_no, raw in enumerate(reader, start=1):
                fields = raw.split(',')
                if len(fields) == 1:
                    # Blank/degenerate line — nothing to convert.
                    continue
                if line_no == 1:
                    # Header row: remember the column names.
                    # NOTE(review): if line 1 were single-field, `keys` would
                    # never be bound — same behavior as the original.
                    keys = fields
                    continue
                record = MakeStringDict(keys, fields)
                print(record)
                writer.write(record)
def write_avro(file_name, predictions):
    """Serialize prediction documents to an Avro file on HDFS, overwriting."""
    avro_schema = generate_avro_schema()
    with AvroWriter(hdfs_client, file_name, schema=avro_schema,
                    overwrite=True) as writer:
        for pred in predictions:
            # `valid_on` and `created_on` share the same wall-clock timestamp.
            ts = int(pred['WCT'].timestamp())
            result = pred['result']['result']
            writer.write({
                'event_id': 'pred_' + str(pred['_id']),
                'valid_on': ts,
                'created_on': ts,
                'input_events': str(pred['_id']),
                'patient_id': pred['VISIT_NUMBER'],
                'provenance': ['psPredsExtract', 'PredictSepsis'],
                'Prediction': float(result['predict']),
                'Score': float(result['score']),
                'heuristic_rule': result['heuristic_alert'],
            })
from hdfs import Config
from hdfs.ext.avro import AvroReader, AvroWriter

# Client for the default alias in the hdfscli configuration.
client = Config().get_client()

# Sample records to round-trip.
records = [
    {'name': 'Ann', 'age': 23},
    {'name': 'Bob', 'age': 22},
]

# Write an Avro File to HDFS (since our records' schema is very simple, we let
# the writer infer it automatically, otherwise we would pass it as argument).
with AvroWriter(client, 'names.avro', overwrite=True) as writer:
    for record in records:
        writer.write(record)

# Read it back and check the round trip.
with AvroReader(client, 'names.avro') as reader:
    schema = reader.schema  # The inferred schema.
    content = reader.content  # The remote file's HDFS content object.
    assert list(reader) == records  # The records match!
def test_infer_schema(self):
    """Writing without an explicit schema must still round-trip all records."""
    with AvroWriter(self.client, 'weather.avro') as writer:
        for rec in self.records:
            writer.write(rec)
    with AvroReader(self.client, 'weather.avro') as reader:
        eq_(list(reader), self.records)
def test_write_empty(self):
    """An Avro file with zero records still carries its schema."""
    with AvroWriter(self.client, 'empty.avro', schema=self.schema):
        pass  # Write nothing.
    with AvroReader(self.client, 'empty.avro') as reader:
        eq_(reader.schema, self.schema)
        eq_(list(reader), [])
'name': 'xiaoming', 'favorite_number': 123, 'favorite_color': 'red' }, { 'name': 'xiaohong', 'favorite_number': 123, 'favorite_color': 'yellow' }, { 'name': 'xiaoliang', 'favorite_number': 123, 'favorite_color': 'black' }] # # with client.write(path, overwrite=True) as writer: # # for record in records: # # writer.write(record) # with DataFileWriter(writer, DatumWriter(), schema) as data_file_writer: # for record in records: # data_file_writer.append(record) with AvroWriter(client, path) as writer: for record in records: writer.write(record) # # with AvroReader(client, '/tmp/test_avro/data-1560762234.783534.avro') as reader: # schema = reader.schema # The remote file's Avro schema. # print(schema) # content = reader.content # Content metadata (e.g. size). # for record in reader: # print(record)