def run(self, n):
    # JSON Serializer
    # serializer = ajs.AvroJsonSerializer(self.movies_schema)
    # json_data = serializer.to_json(self.movies_data)
    total_ser = 0
    total_deser = 0
    bytes_len = 0
    for i in range(0, n):
        # serialize
        datum_writer = DatumWriter(self.movies_schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        tic = timeit.default_timer()
        datum_writer.write(self.movies_data, encoder)
        elapsed = timeit.default_timer() - tic
        payload = bytes_writer.getvalue()
        total_ser = total_ser + elapsed
        bytes_len = len(payload)

        # deserialize
        bytes_reader = io.BytesIO(payload)
        decoder = BinaryDecoder(bytes_reader)
        reader = DatumReader(self.movies_schema)
        tic2 = timeit.default_timer()
        movies = reader.read(decoder)
        elapsed2 = timeit.default_timer() - tic2
        total_deser = total_deser + elapsed2

    self.logger.log(logging.INFO, "serialized len: %s bytes", bytes_len)
    # average per-iteration times, converted from seconds to nanoseconds
    avg_ser = (total_ser * (10 ** 9)) / n
    avg_deser = (total_deser * (10 ** 9)) / n
    self.logger.log(logging.INFO, "Serialization time: \n%s", avg_ser)
    self.logger.log(logging.INFO, "De-serialization time: \n%s", avg_deser)

def main(args):
    log = logging.getLogger(__name__)
    log.setLevel(logging.INFO)
    sys_log = logging.handlers.SysLogHandler("/dev/log")
    sys_format = logging.Formatter('%(name)s[%(process)d]: %(levelname)s %(message)s')
    sys_log.setFormatter(sys_format)
    log.addHandler(sys_log)

    # Avro container files must be opened in binary mode
    reader = DataFileReader(open(args.avro_file, "rb"), DatumReader())
    schema = reader.datum_reader.writers_schema

    for i, row in enumerate(reader):
        log.debug("Consumer row:" + str(row))
        writer = DatumWriter(schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        writer.write(row, encoder)
        raw_bytes = bytes_writer.getvalue()
        b64enc = base64.b64encode(raw_bytes)
        msg = {"messages": [{"data": b64enc}]}
        json_str = json.dumps(msg)
        log.debug("json msg:" + json_str)
        publish(json_str, args.ams_endpoint, args.ams_project, args.ams_topic,
                args.ams_key, log)

def toKey(self, x, avroType):
    x = jsonEncoder(avroType, x, False)
    bytes = io.BytesIO()
    writer = DatumWriter(avroType.schema)
    writer.write(x, BinaryEncoder(bytes))
    bytes.flush()
    return base64.b64encode(bytes.getvalue())

def __call__(self, state, scope, pos, paramTypes, x):
    schema = avro.schema.parse(json.dumps(paramTypes[0]))
    x = untagUnion(x, paramTypes[0])
    bytes = io.BytesIO()
    writer = DatumWriter(schema)
    writer.write(x, BinaryEncoder(bytes))
    bytes.flush()
    return bytes.getvalue()

def compose_data(timestamp, src_vmtype, host_ip, account_id, dest_ip):
    writer = DatumWriter(get_schema())
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    message = '{"eventName": "Neighbour_Unreachable", "accountId":"%s", "destIp":"%s"}' \
              % (account_id, dest_ip)
    raw_data = bytes(message)
    writer.write({"timestamp": timestamp, "src": src_vmtype, "host_ip": host_ip,
                  "rawdata": raw_data}, encoder)
    raw_bytes = bytes_writer.getvalue()
    return raw_bytes

def compose_data(timestamp, src_vmtype, host_ip, account_id, proc_name):
    writer = DatumWriter(get_schema())
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    message = '{"eventName": "Process_Down", "accountId":"%s", "ProcName":"%s"}' \
              % (account_id, proc_name)
    raw_data = bytes(message)
    writer.write({"timestamp": timestamp, "src": src_vmtype, "host_ip": host_ip,
                  "rawdata": raw_data}, encoder)
    raw_bytes = bytes_writer.getvalue()
    return raw_bytes

class AvroSerializer(object):

    def __init__(self, schema):
        self.schema = schema
        self.datum_writer = DatumWriter(schema)

    def serialize(self, record):
        f = StringIO()
        encoder = BinaryEncoder(f)
        self.datum_writer.write(record, encoder)
        return f.getvalue()

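# A minimal usage sketch for the AvroSerializer above, assuming Python 2-style
# dependencies to match the snippet (StringIO, avro.schema.parse). The "User"
# schema and the sample record are hypothetical.
import avro.schema

USER_SCHEMA = avro.schema.parse(
    '{"type": "record", "name": "User", "fields": ['
    '{"name": "name", "type": "string"}, {"name": "age", "type": "int"}]}'
)
serializer = AvroSerializer(USER_SCHEMA)
payload = serializer.serialize({"name": "Ada", "age": 36})
# payload holds schemaless Avro bytes; a reader needs the same schema to decode them.
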
class Serializer(object):

    def __init__(self, schema_str):
        schema = avro.schema.parse(schema_str)
        self.writer = DatumWriter(schema)

    def serialize(self, record):
        f = StringIO()
        encoder = BinaryEncoder(f)
        self.writer.write(record, encoder)
        return f.getvalue()

class Serializer(object):

    def __init__(self, schema_str):
        if sys.version_info >= (3,):
            schema = avro.schema.Parse(schema_str)
        else:
            schema = avro.schema.parse(schema_str)
        self.writer = DatumWriter(schema)

    def serialize(self, record):
        f = string_io()
        encoder = BinaryEncoder(f)
        self.writer.write(record, encoder)
        return f.getvalue()

def __init__(self, context):
    super(AvroWriter, self).__init__(context)
    job_conf = context.job_conf
    part = int(job_conf['mapreduce.task.partition'])
    outdir = job_conf["mapreduce.task.output.dir"]
    outfn = "%s/part-r-%05d.avro" % (outdir, part)
    wh = hdfs.open(outfn, "w")
    self.writer = DataFileWriter(wh, DatumWriter(), self.schema)

def encode(self, raw_data):
    byte_stream = BytesIO()
    writer = DataFileWriter(byte_stream, DatumWriter(), self._schema)
    writer.append(raw_data)
    writer.flush()
    serialized_data = byte_stream.getvalue()
    writer.close()
    return serialized_data

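# A hedged counterpart to encode() above: because the returned bytes form a
# complete Avro object-container file, they can be read back without knowing
# the schema in advance. The decode() helper name is an assumption, not part
# of the original class.
from io import BytesIO
from avro.datafile import DataFileReader
from avro.io import DatumReader

def decode(serialized_data):
    # DataFileReader recovers the writer's schema from the container header.
    reader = DataFileReader(BytesIO(serialized_data), DatumReader())
    records = list(reader)
    reader.close()
    return records
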
def write_pipeline_template_to_avro(pipeline, rtasks_d, output_file):
    d = pipeline_template_to_dict(pipeline, rtasks_d)
    f = open(output_file, 'w')
    with DataFileWriter(f, DatumWriter(), PT_SCHEMA) as writer:
        writer.append(d)
    return d

def write_avro_file(self, rec_creator, n_samples, sync_interval):
    avdf.SYNC_INTERVAL = sync_interval
    self.assertEqual(avdf.SYNC_INTERVAL, sync_interval)
    fo = self._mkf('data.avro')
    with avdf.DataFileWriter(fo, DatumWriter(), self.schema) as writer:
        for i in xrange(n_samples):
            writer.append(rec_creator(i))
    return fo.name

def gen_avro(filename):
    schema = avro.schema.parse(SCHEMA)
    fo = open(filename, "wb")
    writer = DataFileWriter(fo, DatumWriter(), schema)
    for record in looney_records():
        writer.append(record)
    writer.close()
    fo.close()

def _write(self, data):
    "Internal write API"
    wmaid = self.wmaid(data)
    schema = self.schema
    fname = file_name(self.hdir, wmaid)
    with open_file(fname, 'w') as ostream:
        with DataFileWriter(ostream, DatumWriter(), schema) as writer:
            writer.append(data)

def __create_nested(out_path):
    os.makedirs(out_path)
    schema_path = os.path.join(os.path.dirname(__file__), 'data/nested.avsc')
    schema = avro.schema.parse(open(schema_path).read())
    with DataFileWriter(open(os.path.join(out_path, 'part-m-00004.avro'), 'w'),
                        DatumWriter(), schema) as writer:
        writer.append({'sup': 1, 'sub': {'level2': 2}})
        writer.append({'sup': 2, 'sub': {'level2': 1}})

def prepare(producer, arr, root, level):
    for it in arr:
        buf = io.BytesIO()
        writer = DataFileWriter(buf, DatumWriter(), sch)
        item = Item(root, it, False)
        writer.append(item.get_dict())
        writer.flush()
        send(buf, level, producer)

def produce(self, msg):
    if self.ser_type == kfkcfg.SERIALIZATIO_JSON:
        # s = json.dumps(msg)
        s = json.dumps(msg, default=json_util.default)
        future = self.kfkprod.produce(bytes(s, 'utf-8'))
        # msg = json.dumps(msg, default=json_util.default).encode('utf-8')
        # future = self.kfkprod.produce(bytes(msg))
    elif self.ser_type == kfkcfg.SERIALIZATIO_AVRO:
        writer = DatumWriter(self.avro_schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        writer.write(msg, encoder)
        raw_bytes = bytes_writer.getvalue()
        future = self.kfkprod.produce(raw_bytes)

def run(self):
    # for normalizing alcohol
    minimum, maximum, average = 100, 0, 0

    with open('raw.csv', 'r') as fd:
        csv_reader = csv.reader(fd, delimiter=',')
        collection = {}
        for i, row in enumerate(csv_reader):
            desc = row[3].lower().replace('.', '').replace(',', '')
            alc = float(row[-1])
            if alc < minimum:
                minimum = alc
            if alc > maximum:
                maximum = alc
            average += alc

            # Remove gifts or items without description
            if 'engin' in desc:
                continue
            if 'gjafa' in desc or 'gjafa' in row[0]:
                continue
            if 'öskju' in desc or 'öskju' in row[0]:
                continue
            if 'flöskur m/glasi' in desc or 'kútur' in row[0]:
                continue

            features = self.parse(desc.split(), row[0])
            features['alcohol'] = alc
            collection[row[0]] = features
        average = average / (i + 1)

    with open('beers.avsc', 'r') as fd:
        schema = avro.schema.Parse(fd.read())

    with open('beers.avro', 'wb') as fd:
        writer = DataFileWriter(fd, DatumWriter(), schema)
        denominator_alc = maximum - minimum
        for k, v in collection.items():
            v['bitterness'] = self.BITTERNESS['class'][
                v['bitterness']] / self.BITTERNESS['maximum']
            v['color'] = self.COLOR['class'][
                v['color']] / self.COLOR['maximum']
            v['clarity'] = self.CLARITY['class'][
                v['clarity']] / self.CLARITY['maximum']
            v['sweetness'] = self.SWEETNESS['class'][
                v['sweetness']] / self.SWEETNESS['maximum']
            v['alcohol'] = (v['alcohol'] - minimum) / denominator_alc
            v['name'] = k
            writer.append(v)
        writer.close()

class AvroInference():
    """Class representing a sink of Avro inference data to Apache Kafka.

    Args:
        boostrap_servers (str): List of Kafka brokers
        topic (str): Kafka topic
        data_scheme_filename (str): Filename of the AVRO scheme for training data
        group_id (str): Group ID of the Kafka consumer. Defaults to sink
    """

    def __init__(self, boostrap_servers, topic, data_scheme_filename, group_id='sink'):
        self.boostrap_servers = boostrap_servers
        self.topic = topic
        self.data_scheme_filename = data_scheme_filename
        self.data_schema = open(self.data_scheme_filename, "r").read()
        self.avro_data_schema = avro.schema.Parse(self.data_schema)
        self.data_writer = DatumWriter(self.avro_data_schema)
        self.data_io = io.BytesIO()
        self.data_encoder = BinaryEncoder(self.data_io)
        self.__producer = KafkaProducer(bootstrap_servers=self.boostrap_servers)

    def send(self, data):
        self.data_writer.write(data, self.data_encoder)
        data_bytes = self.data_io.getvalue()
        self.__producer.send(self.topic, data_bytes)
        # Cleans data buffer for the next message
        self.data_io.seek(0)
        self.data_io.truncate(0)

    def close(self):
        self.__producer.flush()
        self.__producer.close()

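# A hedged usage sketch for AvroInference; the broker address, topic name,
# schema path and record fields below are placeholders, and the record must
# match whatever the referenced .avsc actually declares.
sink = AvroInference("localhost:9092", "inference", "inference.avsc")
sink.send({"prediction": 0.87, "label": "spam"})
sink.close()
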
def check_schema(self, data, schema_path):
    schema = avro.schema.Parse(open(schema_path, "rb").read().decode("utf-8"))
    writer = DataFileWriter(open('_test.avro', "wb"), DatumWriter(), schema)
    writer.append(data)
    writer.close()

def _load_datawriter(self):
    try:
        lschema = load_schema(self.schema)
        self.avrofile = open(self.outfile, 'w+b')
        self.datawrite = DataFileWriter(self.avrofile, DatumWriter(), lschema)
    except Exception:
        return False
    return True

def _write_items(base_name, schema_str, items):
    avro_schema = schema.Parse(schema_str)
    avro_file = base_name + '.avro'
    # the with-block closes (and flushes) the writer on exit
    with DataFileWriter(open(avro_file, "wb"), DatumWriter(), avro_schema) as writer:
        for i in items:
            writer.append(i)
    return avro_file

def _create_avro_file(schema, items, file_prefix):
    _, result_file_path = tempfile.mkstemp(prefix=file_prefix, suffix='.avro')
    parsed_schema = avro.schema.Parse(schema)
    with open(result_file_path, 'wb') as f:
        writer = DataFileWriter(f, DatumWriter(), parsed_schema)
        for s in items:
            writer.append(s)
        writer.close()
    return result_file_path

def _create_or_update_table(
    self,
    data,
    table_name,
    schema_name,
    schema_suffix,
    columns_definition,
    load_strategy,
    upload_call_count,
    database_name=None,
    primary_key=None,
):
    # This method doesn't actually create or update a table. It just creates
    # and populates a single .avro file which is used in the data upload.
    # The actual upload happens when the commit() method is called.
    if upload_call_count == 1:
        # Create avro writer and file in temporary folder
        self.avro_folder = TemporaryDirectory()
        self.avro_file_name = self.avro_folder.name + os.sep + table_name + ".avro"
        avro_schema = avro.schema.parse(
            json.dumps({
                "type": "record",
                "name": table_name,
                "namespace": table_name,
                "fields": [{
                    "name": name,
                    "type": [
                        "null",
                        map_bq_data_type_to_avro(field["data_type"]),
                    ],
                } for name, field in columns_definition.items()],
            }))
        # Create the avro_writer object to be used going forward
        self.avro_writer = DataFileWriter(
            open(self.avro_file_name, "wb"), DatumWriter(), avro_schema)
        # Save the relevant kwargs for later use in the commit() method
        self.table_creation_config = {
            "table_name": table_name,
            "schema_name": schema_name,
            "schema_suffix": schema_suffix,
            "columns_definition": columns_definition,
            "load_strategy": load_strategy,
            "database_name": database_name,
            "primary_key": primary_key,
        }
        self.log.info(
            "BigQuery Uploader writes data into Avro file for later one-off upload!"
        )

    while data:
        # Write records to .avro file
        self.avro_writer.append(data.pop(0))

def serialize_records(records, coin, avro_output=None):
    if avro_output is None:
        avro_output = str(coin) + ".avro"
    transformer = transform_data()
    schema = transformer.parse_schema()
    # avro_output = str(coin) + ".avro"
    with open(avro_output, 'wb') as out:
        writer = DataFileWriter(out, DatumWriter(), schema)
        for record in records:
            writer.append(record)
        # close the writer so buffered blocks are flushed to the file
        writer.close()

def test_sanity():
    """
    Ensures that our "base" and "good" schemas are actually forwards- and
    backwards-compatible
    """
    # fst schema / record
    fst_schema = schema.parse(open("%s/MyRecord.base.avsc" % BASE_DIR).read())
    fst_writer = DatumWriter(writers_schema=fst_schema)
    fst_record = {
        "fieldWithoutDefaultValue": 0,
        "properField": 0,
        "enumField": "A",
        "unionField": None,
        "arrayField": ["world"],
        "mapField": {"hello": "world"},
        "fixedField": "aaaaaaaaaaaaaaaa"
    }

    # sec schema / record
    sec_schema = schema.parse(open("%s/MyRecord.good.avsc" % BASE_DIR).read())
    sec_writer = DatumWriter(writers_schema=sec_schema)
    sec_record = {
        "fieldWithoutDefaultValue": 0,
        "properField2": 0,
        "enumField": "B",
        "unionField": None,
        "arrayField": ["world"],
        "fixedField": "bbbbbbbbbbbbbbbb"
    }

    # Encode record w/ fst
    fst_buf = StringIO.StringIO()
    fst_encoder = BinaryEncoder(fst_buf)
    fst_writer.write(fst_record, fst_encoder)
    fst_data = fst_buf.getvalue()

    # Encode record w/ sec
    sec_buf = StringIO.StringIO()
    sec_encoder = BinaryEncoder(sec_buf)
    sec_writer.write(sec_record, sec_encoder)
    sec_data = sec_buf.getvalue()

    # writers == fst, readers == sec
    sec_reader = DatumReader(writers_schema=fst_schema, readers_schema=sec_schema)
    sec_decoder = BinaryDecoder(StringIO.StringIO(fst_data))
    sec_from_fst = sec_reader.read(sec_decoder)  # no exception -> good

    # writers == sec, readers == fst
    fst_reader = DatumReader(writers_schema=sec_schema, readers_schema=fst_schema)
    fst_decoder = BinaryDecoder(StringIO.StringIO(sec_data))
    fst_from_sec = fst_reader.read(fst_decoder)  # no exception -> good

def put_frame(video_name, video_number, pic):
    writer = DatumWriter(SCHEMA)
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    writer.write(
        {
            "rtsp": "rtsp",
            "createTime": time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
            "videoName": video_name,
            "videoNumber": video_number,
            "picContents": pic
        }, encoder)
    raw_bytes = bytes_writer.getvalue()
    PRODUCER.send_messages(TOPIC, raw_bytes)

def _load_file(self, file_path, schema) -> DataFileWriter:
    f = open(file_path, 'ab+')
    self.cache[file_path] = dict()
    self.cache[file_path]['file_io'] = f
    writer = DataFileWriter(f, DatumWriter(), schema)
    self.cache[file_path]['datum_writer'] = writer
    self.cache.move_to_end(file_path)
    if len(self.cache) > self.capacity:
        self._remove_item()
    return writer

def serialize_records(records, outpath="funding.avro"):
    schema = parse_schema()
    # with open(outpath, 'wb') as out:
    out = StringIO()
    writer = DataFileWriter(out, DatumWriter(), schema)
    for record in records:
        record = dict((f, getattr(record, f)) for f in record._fields)
        record['fundedDate'] = record['fundedDate'].strftime('%Y-%m-%dT%H:%M:%S')
        writer.append(record)
    # flush so the buffered block is written into the StringIO before returning
    writer.flush()
    return out

def objToBin2():
    file = io.BytesIO()
    datum_writer = DatumWriter()
    fwriter = DataFileWriter(file, datum_writer, sc)
    for d in datum:
        fwriter.append(d)
    # flush before reading the buffer, then close the writer
    fwriter.flush()
    ab = file.getvalue()
    fwriter.close()
    return ab

def import_data(schema, src, dest, index, debug):
    global next_update
    global verbose

    index = int(index)
    verbose = int(debug)
    in_file = os.path.join(src, "MLHD_%03d.tar" % index)
    out_file = os.path.join(dest, "MLHD_%03d.avro" % index)
    count = 0
    next_update = time() + UPDATE_INTERVAL

    schema = avro.schema.Parse(open(schema, "rb").read().decode('ascii'))
    with DataFileWriter(open(out_file, "wb"), DatumWriter(), schema, codec='deflate') as writer:
        tar = tarfile.open(in_file)
        total = 0
        chunks = []
        size = 0
        for i, member in enumerate(tar.getnames()):
            count, data = handle_file(member, tar.extractfile(member).read())
            chunks.append(data)
            total += count
            size += len(data)
            if verbose:
                print("%03d: %d rows processed, %s total rows, %d bytes of output." %
                      (index, count, total, size))
                sys.stdout.flush()

            if size > MAX_SIZE:
                for chunk in chunks:
                    try:
                        for js in chunk:
                            writer.append(js)
                    except Exception as err:
                        print("%03d: err writing file: %s" % (index, err))
                        sys.exit(-1)
                chunks = []
                size = 0

        tar.close()
        if verbose:
            print("%03d: finish writing output file." % index)
            sys.stdout.flush()

        for chunk in chunks:
            try:
                for js in chunk:
                    writer.append(js)
            except Exception as err:
                print("%03d: err writing file: %s" % (index, err))
                sys.exit(-1)

def main():
    # Define schema of avro file.
    schema = avro.schema.Parse(open("logs_uuid.avsc", "rb").read())

    # Create a datum writer.
    rwriter = DatumWriter(schema)

    files = ['logs_0.txt', 'logs_1.txt', 'logs_2.txt', 'logs_3.txt']

    # Loop to process the files
    for f in files:
        # open file and store in a variable
        logfile = open(f, "r")
        text = logfile.readlines()
        logfile.close()

        # Set the avro file name (new)
        newfile = str(f).replace('.txt', 'uuid.avro')

        # Create a data file writer.
        dfwriter = DataFileWriter(open(newfile, "wb"), DatumWriter(), schema)

        # Loop to get information from each line
        for line in text:
            # Get the variables from line.
            sdt, surl, suser = line.strip().split('\t')

            # Defines a dictionary structure
            data = {}
            data['timestamp'] = sdt
            data['url'] = surl
            data['user'] = suser
            data['uuid'] = str(uuid.uuid1())

            # Write the data in the file.
            dfwriter.append(data)

        # Close the file after the loop.
        dfwriter.close()

def testWrite(filename, schema):
    fd = open(filename, 'wb')
    datum = DatumWriter()
    writer = DataFileWriter(fd, datum, schema)
    writer.append(makeObject("Person A", 23))
    writer.append(makeObject("Person B", 31))
    writer.append(makeObject("Person C", 28))
    writer.close()

def make_record_set(self, schema_path: str, items: list) -> bytes:
    if schema_path not in self.schemas:
        with open(schema_path, 'rb') as raw:
            self.schemas[schema_path] = avro.schema.Parse(raw.read())
    out = BytesIO()
    writer = DataFileWriter(out, DatumWriter(), self.schemas[schema_path])
    for item in items:
        writer.append(item)
    writer.flush()
    return out.getvalue()

def _write_records_to_avro(schema, _d_or_ds, output_file):
    # FIXME. There's only one record being written here,
    # why does this not support a single item
    if isinstance(_d_or_ds, dict):
        _d_or_ds = [_d_or_ds]
    with open(output_file, 'w') as outs:
        with DataFileWriter(outs, DatumWriter(), schema) as writer:
            for record in _d_or_ds:
                writer.append(record)
    log.debug("Write avro file to {p}".format(p=output_file))
    return _d_or_ds

def __init__(self, context):
    super(AvroWriter, self).__init__(context)
    self.logger = LOGGER.getChild('AvroWriter')
    job_conf = context.job_conf
    part = int(job_conf['mapreduce.task.partition'])
    outdir = job_conf["mapreduce.task.output.dir"]
    outfn = "%s/part-r-%05d.avro" % (outdir, part)
    wh = hdfs.open(outfn, "w")
    self.logger.debug('created hdfs file %s', outfn)
    self.writer = DataFileWriter(wh, DatumWriter(), self.schema)
    self.logger.debug('opened AvroWriter')

def __init__(self, boostrap_servers, topic, data_scheme_filename, group_id='sink'):
    self.boostrap_servers = boostrap_servers
    self.topic = topic
    self.data_scheme_filename = data_scheme_filename
    self.data_schema = open(self.data_scheme_filename, "r").read()
    self.avro_data_schema = avro.schema.Parse(self.data_schema)
    self.data_writer = DatumWriter(self.avro_data_schema)
    self.data_io = io.BytesIO()
    self.data_encoder = BinaryEncoder(self.data_io)
    self.__producer = KafkaProducer(bootstrap_servers=self.boostrap_servers)

def write(self, data):
    # Parsing data to select only keys in schema
    store_data = {}
    for key in self.keys:
        if key in data:
            store_data[key] = data[key]
        else:
            store_data[key] = None

    # Serialize data using AVRO
    writer = DatumWriter(self.schema)
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    writer.write(store_data, encoder)
    raw_bytes = bytes_writer.getvalue()

    # Place into pipeline
    print(data)
    self.producer.send(self.topic, raw_bytes)

def __init__(self, schema_str):
    schema = avro.schema.parse(schema_str)
    self.writer = DatumWriter(schema)

def toKey(self, x, schema):
    bytes = io.BytesIO()
    writer = DatumWriter(schema)
    writer.write(x, BinaryEncoder(bytes))
    bytes.flush()
    return base64.b64encode(bytes.getvalue())

def serialize(data):
    writer = DatumWriter(schema)
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    writer.write(data, encoder)
    return bytes_writer.getvalue()

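# A hedged counterpart to serialize() above, decoding the schemaless bytes with
# the same module-level schema object. The deserialize() name is an assumption.
import io
import avro.io

def deserialize(raw_bytes):
    decoder = avro.io.BinaryDecoder(io.BytesIO(raw_bytes))
    reader = avro.io.DatumReader(schema)  # same writer schema used by serialize()
    return reader.read(decoder)
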
def __init__(self, schema_str):
    if sys.version_info >= (3,):
        schema = avro.schema.Parse(schema_str)
    else:
        schema = avro.schema.parse(schema_str)
    self.writer = DatumWriter(schema)

"""Python avro official implementation encoding benchmark.""" from io import BytesIO from itertools import repeat from time import time from avro.datafile import DataFileReader, DataFileWriter from avro.io import DatumReader, DatumWriter, BinaryEncoder, BinaryDecoder import sys LOOPS = 1 with open(sys.argv[1]) as reader: datum_reader = DatumReader() file_reader = DataFileReader(reader, datum_reader) SCHEMA = datum_reader.writers_schema RECORDS = list(file_reader) buf = BytesIO() datum_writer = DatumWriter(SCHEMA) start = time() n = 0 for _ in repeat(None, LOOPS): for record in RECORDS: buf.seek(0) encoder = BinaryEncoder(buf) datum_writer.write(record, encoder) n += 1 print 1000. * (time() - start) / n
def __init__(self, schema):
    self.schema = schema
    self.datum_writer = DatumWriter(schema)

def createAvroMemoryRecord(data, schema):
    f = StringIO()
    encoder = BinaryEncoder(f)
    writer = DatumWriter(schema)
    writer.write(dict(data), encoder)
    return f.getvalue()

#
# NB: the AvroOutputReader specific portion begins here
#

def new_column(name, value):
    column = dict()
    column['name'] = '%s' % name
    column['value'] = '%s' % value
    column['timestamp'] = long(time.time() * 1e6)
    column['ttl'] = 0
    return column

# parse the current avro schema
proto = avro.protocol.parse(open('cassandra.avpr').read())
schema = proto.types_dict['StreamingMutation']

# open an avro encoder and writer for stdout
enc = BinaryEncoder(sys.stdout)
writer = DatumWriter(schema)

# output a series of objects matching 'StreamingMutation' in the Avro interface
smutation = dict()
try:
    for word, count in word2count.iteritems():
        smutation['key'] = word
        smutation['mutation'] = {'column_or_supercolumn': {'column': new_column('count', count)}}
        writer.write(smutation, enc)
finally:
    sys.stdout.flush()

from time import time

# To send messages synchronously
producer = KafkaProducer(bootstrap_servers="localhost:9092",
                         compression_type="gzip")

# Kafka topic
topic = "tnx"

# Path to user.avsc avro schema
schema_path = "/home/cloudera/workspace/kafka-clients-python/transactions.avsc"
schema = avro.schema.Parse(open(schema_path).read())
print("Schema", schema.to_json())

writer = DatumWriter(schema)
bytes_writer = io.BytesIO()
encoder = avro.io.BinaryEncoder(bytes_writer)

def get_record():
    return {"id": "123",
            "merchant_id": "m123",
            "customer_id": "c345",
            "amount": 100.1,
            "category": "pos",
            "timestamp": int(time())}

for i in range(10):
    record = get_record()
    writer.write(record, encoder)

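# As written, the loop above only accumulates encoded records in bytes_writer
# and never publishes them. A hedged completion, assuming one Kafka message per
# record: reset the buffer each iteration and send the raw Avro bytes.
for i in range(10):
    bytes_writer.seek(0)
    bytes_writer.truncate(0)
    writer.write(get_record(), encoder)
    producer.send(topic, bytes_writer.getvalue())
producer.flush()
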