def _produce_test_input(self):
    """Write a one-record Avro fixture to ``tmp.avro`` and upload it.

    The record uses a nested-record schema (``col1`` is an inner record
    with two nullable fields that the sample datum leaves unset).  The
    local file is registered for removal via ``addCleanup`` and then copied
    to ``<gcs_dir_url>/tmp.avro`` through ``self.gcs_client``.
    """
    local_path = "tmp.avro"
    # Adjacent literals concatenate to exactly the schema text used before.
    schema_json = (
        ' { "name": "TestQueryTask_record", "type": "record", "doc": "The description", "fields": [ '
        '{"name": "col0", "type": "int", "doc": "The bold"}, '
        '{"name": "col1", "type": { "name": "inner_record", "type": "record", '
        '"doc": "This field shall be an inner", "fields": [ '
        '{"name": "inner", "type": "int", "doc": "A inner field"}, '
        '{"name": "col0", "type": "int", "doc": "Same name as outer but different doc"}, '
        '{"name": "col1", "type": ["null", "string"], "default": null, "doc": "Nullable primitive"}, '
        '{"name": "col2", "type": ["null", { "type": "map", "values": "string" }], "default": null, "doc": "Nullable map"} '
        '] }, "doc": "This field shall be an inner"}, '
        '{"name": "col2", "type": "int", "doc": "The beautiful"}, '
        '{"name": "col3", "type": "double"} ] }'
    )
    schema = avro.schema.parse(schema_json)
    self.addCleanup(os.remove, local_path)

    sample = {
        'col0': 1000,
        'col1': {'inner': 1234, 'col0': 3000},
        'col2': 1001,
        'col3': 1.001,
    }
    container = DataFileWriter(open(local_path, "wb"), DatumWriter(), schema)
    container.append(sample)
    container.close()

    self.gcs_client.put(local_path, self.gcs_dir_url + "/tmp.avro")
def serializeDataToOCFFile(schemaFile, outputFile, dataToSerialize):
    """Serialize one datum into an Avro Object Container File.

    Args:
        schemaFile: handed to ``parse_schema`` to obtain the writer schema.
        outputFile: path of the container file to create (overwritten).
        dataToSerialize: the single datum appended to the container.
    """
    logging.debug("Parsing in avro schema:" + schemaFile)
    schema = parse_schema(schemaFile)
    logging.debug("Writing avro data to:" + outputFile)
    # Avro containers are binary: text mode ("w") fails on Python 3 and
    # corrupts the stream wherever newline translation is active.
    writer = DataFileWriter(open(outputFile, "wb"), DatumWriter(), schema)
    try:
        writer.append(dataToSerialize)
    finally:
        # Always release the file handle, even when the datum is rejected.
        writer.close()
def __create_standard(out_path):
    """Create ``out_path`` and fill it with four Avro part files of user records.

    The schema is read from ``data/user.avsc`` next to this module.  Part
    ``part-m-00002.avro`` is intentionally written with no records.

    Args:
        out_path: directory to create; ``os.makedirs`` raises if it exists.
    """
    os.makedirs(out_path)
    schema_path = os.path.join(os.path.dirname(__file__), 'data/user.avsc')
    with open(schema_path) as schema_file:
        schema = avro.schema.parse(schema_file.read())

    parts = [
        ('part-m-00000.avro', [
            {'position': 0, 'name': 'Alyssa', 'favorite_number': 256},
            {'position': 1, 'name': 'Ben', 'favorite_number': 4, 'favorite_color': 'red'},
        ]),
        ('part-m-00001.avro', [
            {'position': 2, 'name': 'Alyssa2', 'favorite_number': 512},
            {'position': 3, 'name': 'Ben2', 'favorite_number': 8, 'favorite_color': 'blue',
             'secret': b'0987654321'},
            {'position': 4, 'name': 'Ben3', 'favorite_number': 2, 'favorite_color': 'green',
             'secret': b'12345abcd'},
        ]),
        # Header-only container: exercises readers against an empty part.
        ('part-m-00002.avro', []),
        ('part-m-00003.avro', [
            {'position': 5, 'name': 'Alyssa3', 'favorite_number': 16},
            {'position': 6, 'name': 'Mallet', 'favorite_color': 'blue', 'secret': b'asdfgf'},
            {'position': 7, 'name': 'Mikel', 'favorite_color': ''},
        ]),
    ]
    for part_name, records in parts:
        # Binary mode: Avro containers are not text.
        part_file = open(os.path.join(out_path, part_name), 'wb')
        with DataFileWriter(part_file, DatumWriter(), schema) as writer:
            for record in records:
                writer.append(record)
class AvroWriter(object):
    """Write one batch of records to an Avro container file.

    The output file is opened at construction time.  Failures while opening
    are not raised; they surface on the first ``write`` call, which then
    returns ``(False, error)``.
    """

    def __init__(self, schema, outfile):
        """Remember ``schema``/``outfile`` and try to open the writer.

        Args:
            schema: value passed to ``load_schema`` to get the writer schema.
            outfile: path of the Avro file to create.
        """
        self.schema = schema
        self.outfile = outfile
        self.datawrite = None
        self.avrofile = None
        self._load_datawriter()

    def _load_datawriter(self):
        """Open the output file and wrap it in a ``DataFileWriter``.

        Returns:
            True on success, False if the schema could not be loaded or the
            file/writer could not be created.
        """
        try:
            lschema = load_schema(self.schema)
            self.avrofile = open(self.outfile, 'w+b')
            self.datawrite = DataFileWriter(self.avrofile, DatumWriter(), lschema)
        except Exception:
            # Do not leak a file handle that was opened before the failure.
            if self.avrofile is not None:
                self.avrofile.close()
            self.avrofile = None
            self.datawrite = None
            return False
        return True

    def write(self, data):
        """Append every element of ``data`` and close the container.

        Args:
            data: iterable of datums matching the writer schema.

        Returns:
            ``(True, None)`` on success, ``(False, exception)`` otherwise.
        """
        try:
            if not self.datawrite or not self.avrofile:
                raise AvroWriteException('AvroFileWriter not initalized')
            for elem in data:
                self.datawrite.append(elem)
            self.datawrite.close()
            self.avrofile.close()
        except Exception as e:
            return False, e
        return True, None
def save_records_to_avrofile(
        flows_towrite,
        fn_output,
        avro_schema=cons.DEFAULT_AVRO_NFCAP_FLOWS_SCHEMA_FILEPATH):
    """
    Write to an AVRO file a given a dictionary or a list of dicts containing
    flow records.

    :param flows_towrite: dict (its values are written) or list of flow
        records. Any other type results in a container with no records.
    :param fn_output: .avro output filepath and name.
    :param avro_schema: schema to write the records to an .avro file.
    :return: none
    """
    # load schema
    with open(avro_schema, "rb") as schema_file:
        schema = avro.schema.parse(schema_file.read())

    # isinstance (not ``type(x) is``) so dict/list subclasses are written too.
    if isinstance(flows_towrite, dict):
        records = flows_towrite.values()
    elif isinstance(flows_towrite, list):
        records = flows_towrite
    else:
        records = ()

    # create object writer
    writer = DataFileWriter(open(fn_output, "wb"), DatumWriter(), schema, codec="deflate")
    try:
        # write records to avro file output
        for record in records:
            writer.append(record)
    finally:
        # Previously the writer stayed open for unsupported input types.
        writer.close()
def testAppend(filename):
    """Append test persons 10..19 to the existing Avro container ``filename``.

    The writer is created without a schema argument.
    NOTE(review): presumably this makes the writer reuse the schema stored
    in the file's header - confirm against the Avro library in use.
    """
    appender = DataFileWriter(open(filename, 'a+b'), DatumWriter())
    for index in xrange(10, 20):
        appender.append(_makeTestPerson(index))
    appender.close()
def __init__(self, simulator, stream):
    """Build deserializers and the output ``DataFileWriter`` for ``stream``.

    The output schema is chosen from the simulator settings:
    the key schema if set, overridden by the value schema if set, and, when
    ``avro_output == 'kv'``, a synthetic ``kv`` record with ``key`` and
    ``value`` fields.  If key and value share a full name, the value field
    refers to the key type by name instead of redefining it.
    """
    super(AvroRecordWriter, self).__init__(simulator, stream)
    self.deserializers = {}
    sim = self.simulator
    out_schema = None

    if sim.avro_output_key_schema:
        self.deserializers['k'] = AvroDeserializer(sim.avro_output_key_schema)
        out_schema = avro.schema.parse(sim.avro_output_key_schema)
    if sim.avro_output_value_schema:
        self.deserializers['v'] = AvroDeserializer(sim.avro_output_value_schema)
        out_schema = avro.schema.parse(sim.avro_output_value_schema)

    if sim.avro_output == 'kv':
        key_parsed = avro.schema.parse(sim.avro_output_key_schema)
        value_parsed = avro.schema.parse(sim.avro_output_value_schema)
        key_json = json.loads(sim.avro_output_key_schema)
        key_json.pop('namespace', None)
        value_json = json.loads(sim.avro_output_value_schema)
        value_json.pop('namespace', None)

        if key_parsed.fullname != value_parsed.fullname:
            value_type = value_json
        else:
            # Same named type on both sides: reference it by name.
            value_type = key_parsed.name

        pair_schema = {
            'type': 'record',
            'name': 'kv',
            'fields': [
                {'name': 'key', 'type': key_json},
                {'name': 'value', 'type': value_type},
            ],
        }
        out_schema = avro.schema.parse(json.dumps(pair_schema))

    self.writer = DataFileWriter(self.stream, DatumWriter(), out_schema)
def write(self, format):
    """Serialize the held data to ``./output`` in ``format`` and time it.

    Supported formats:
      * ``'json'`` / ``'jsch'`` - one compact JSON document per person,
        written back to back into ``output.json``.
      * ``'avro'`` - every item of ``self._data_dict`` into ``output.avro``.
      * ``'protobuf'`` - serialized messages back to back in ``output.pb``.
      * ``'gzjson'`` - ``self._data_dict`` as gzip-compressed JSON in
        ``output.jsz``.
    Any other value writes nothing.

    Args:
        format: one of the format names above.

    Returns:
        Elapsed wall-clock time in seconds (float).
    """
    time_start = time.time()
    if format == 'json' or format == 'jsch':
        with open('./output/output.json', 'w') as out:
            for base_person_obj in self._base_person_list:
                out.write(json.dumps(self._get_json_person(base_person_obj),
                                     separators=(',', ':')))
    elif format == 'avro':
        writer = DataFileWriter(open('./output/output.avro', 'wb'),
                                DatumWriter(), self._schema_avro)
        try:
            for user in self._data_dict:
                writer.append(user)
        finally:
            writer.close()
    elif format == 'protobuf':
        with open('./output/output.pb', 'wb') as out:
            for base_person_obj in self._base_person_list:
                protobuf_person = self._get_proto_buf_person(base_person_obj)
                out.write(protobuf_person.SerializeToString())
    elif format == 'gzjson':
        with gzip.open('./output/output.jsz', 'wb') as out:
            payload = json.dumps(self._data_dict, separators=(',', ':'))
            # The gzip stream is binary: encode, or Python 3 raises TypeError.
            out.write(payload.encode('utf-8'))
    time_end = time.time()
    return time_end - time_start
def ExportToBin(self, data, schema=None) -> tuple:
    '''
    Export ``data`` as an Avro container in binary form (bytes).

    Args:
        data: a single datum, or a list of datums (each list element is
            appended as its own record).
        schema: optional schema; when given it is first run through
            ``self._parseschema`` and the result of a failed parse is
            returned unchanged.

    Returns:
        ``(True, bytes, schema_infos)`` on success, or
        ``(False, error_message, schema_infos)`` if writing fails.
    '''
    if schema is not None:
        pschema = self._parseschema(schema)
        if not pschema[0]:
            return pschema
    # In every remaining case the schema held in self._data is used.
    schema = self._data['schema']
    if type(schema) is not avro.schema.RecordSchema:
        schema = None
    try:
        with tempfile.SpooledTemporaryFile(suffix='.avro') as tmp:
            writer = DataFileWriter(tmp, DatumWriter(), schema)
            # The old test ``data is list`` compared against the type object
            # itself, so list input was appended as one (invalid) datum.
            records = data if isinstance(data, list) else [data]
            for record in records:
                writer.append(record)
            writer.flush()
            tmp.seek(0)
            export_bin = tmp.read()
            writer.close()
        self._data['data'] = export_bin
        return (True, export_bin, self.getSchemaInfos())
    except Exception as e:
        return (False, str(e), self.getSchemaInfos())
def _write_lines(self, lines, fname):
    """
    Write the lines to an avro file named fname

    Parameters
    --------------------------------------------------------
    lines - list of strings to write
    fname - the name of the file to write to. Missing parent directories
            are created.
    """
    import avro.io as avio
    from avro.datafile import DataFileWriter
    from avro import schema

    # Create all missing parent directories.  The previous hand-rolled loop
    # prefixed os.sep to every component and so broke relative paths.
    parent = os.path.dirname(fname)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)

    inschema = """{"type":"string"}"""
    # open(..., 'wb') instead of the Python 2-only file(..., 'w'): the
    # container is binary data.
    with open(fname, 'wb') as hf:
        writer = DataFileWriter(hf, avio.DatumWriter(),
                                writers_schema=schema.parse(inschema))
        for datum in lines:
            writer.append(datum)
        writer.close()
def generate_sample_datasets (host_ips, metric_ids, year, month, day, hour):
    """Generate one hour of per-second sample records and push them to HDFS.

    The Avro schema is read from HDFS with ``hadoop fs -cat``.  One record
    per host and second is written to a temporary local Avro file, which is
    then handed to ``hadoop fs -copyFromLocal``.

    NOTE(review): the copy is started with ``Popen`` and not waited for, so
    the file may not be in HDFS yet when this function returns.

    Args:
        host_ips: iterable of host IPs; one record per IP per second.
        metric_ids: passed to ``generate_random_metrics`` for each record.
        year, month, day, hour: start of the hour to generate.

    Returns:
        Target path: the directory from ``create_hdfs_dirs`` + file name.
    """
    # load data from hdfs
    cat = subprocess.Popen(['sudo', '-u', 'hdfs', 'hadoop', 'fs', '-cat',
                            '/user/pnda/PNDA_datasets/datasets/.metadata/schema.avsc'],
                           stdout=subprocess.PIPE)
    avro_schema = cat.communicate()[0]
    schema = avro.schema.parse(avro_schema)

    # create hdfs folder structure
    hdfs_dir = create_hdfs_dirs (year, month, day, hour)
    filename = str(uuid.uuid4()) + '.avro'
    filepath = hdfs_dir + filename
    tmp_file = '/tmp/' + filename

    start_dt = datetime.datetime(year, month, day, hour, 0, 0)
    # timedelta instead of replace(hour=hour+1), which raised for hour == 23.
    end_dt = start_dt + datetime.timedelta(hours=1)
    start_ts = int(time.mktime(start_dt.timetuple()))
    end_ts = int(time.mktime(end_dt.timetuple()))

    # Binary mode: Avro containers are not text.
    writer = DataFileWriter(open(tmp_file, "wb"), DatumWriter(), schema)
    try:
        for ts in xrange(start_ts, end_ts, 1):
            # generate random pnda record on per host ip basis
            for host_ip in host_ips:
                record = {
                    'timestamp': ts * 1000,
                    'src': 'test',
                    'host_ip': host_ip,
                    'rawdata': generate_random_metrics(metric_ids),
                }
                # encode avro
                writer.append(record)
    finally:
        writer.close()

    subprocess.Popen(['sudo', '-u', 'hdfs', 'hadoop', 'fs', '-copyFromLocal',
                      tmp_file, hdfs_dir])
    return filepath
def dump_data(bso_number, schema, dsn, args):
    """Dump the rows of one bso table into chunked Avro files.

    Output files are named ``<stem>_<bso_number>_<hex chunk>.<ext>``, where
    stem and ext come from splitting ``args.output`` at its last dot.
    Dumping stops early when a chunk yields no rows.

    Args:
        bso_number: table number, passed to ``count_rows``/``dump_rows``.
        schema: Avro writer schema.
        dsn: connection string passed to ``conf_db``.
        args: options; ``args.offset`` and ``args.output`` are read here.

    Returns:
        The row count reported by ``dump_rows`` for the last chunk processed.
        NOTE(review): this is not the total over all chunks - confirm that
        callers expect this.
    """
    offset = args.offset or 0
    # things time out around 1_500_000 rows.
    db = conf_db(dsn)
    out_file = args.output.rsplit('.', 1)
    row_count = count_rows(db, bso_number)
    chunk_total = max(1, math.trunc(math.ceil(row_count / MAX_ROWS)))
    for chunk in range(chunk_total):
        print("Dumping {} rows from bso#{} into chunk {}".format(
            row_count, bso_number, chunk))
        out_file_name = "{}_{}_{}.{}".format(out_file[0], bso_number,
                                             hex(chunk), out_file[1])
        writer = DataFileWriter(open(out_file_name, "wb"), DatumWriter(), schema)
        try:
            rows = dump_rows(bso_number=bso_number, chunk_offset=offset,
                             db=db, writer=writer, args=args)
        finally:
            # Close the chunk file even if the dump fails part-way.
            writer.close()
        if rows == 0:
            break
        offset = offset + rows
    return rows
def __init__(self, context):
    """Open this task's ``part-r-NNNNN.avro`` file and wrap it in a writer.

    The partition number and output directory are read from the job
    configuration keys ``mapreduce.task.partition`` and
    ``mapreduce.task.output.dir``.
    """
    super(AvroWriter, self).__init__(context)
    conf = context.job_conf
    partition = int(conf['mapreduce.task.partition'])
    out_path = "%s/part-r-%05d.avro" % (conf["mapreduce.task.output.dir"], partition)
    self.writer = DataFileWriter(hdfs.open(out_path, "w"), DatumWriter(), self.schema)
def gen_avro(filename):
    """Write every record from ``looney_records()`` to the Avro file ``filename``.

    The writer schema is parsed from the module-level ``SCHEMA``.
    """
    parsed_schema = avro.schema.parse(SCHEMA)
    sink = open(filename, "wb")
    container = DataFileWriter(sink, DatumWriter(), parsed_schema)
    for rec in looney_records():
        container.append(rec)
    container.close()
    sink.close()
def _load_datawriter(self):
    """Open ``self.outfile`` and wrap it in a ``DataFileWriter``.

    The schema is obtained with ``load_schema(self.schema)``.  On success
    ``self.avrofile`` and ``self.datawrite`` are set.

    Returns:
        True on success, False if any step raised.
    """
    opened = None
    try:
        lschema = load_schema(self.schema)
        opened = open(self.outfile, 'w+b')
        self.avrofile = opened
        self.datawrite = DataFileWriter(self.avrofile, DatumWriter(), lschema)
    except Exception:
        # Do not leak the handle if the writer could not be built on top
        # of the file this call just opened.
        if opened is not None:
            opened.close()
            self.avrofile = None
        return False
    return True
def gen_single_day_data(date, schema):
    """Write 100,000 random event records for ``date`` to ``events2-<date>.avro``.

    Each record has four random tags (``t1``..``t10``), a random cookie id
    (``CK.1``..``CK.100000``), the given date and ``count`` 1.

    Args:
        date: value stored in each record and used in the file name.
        schema: Avro writer schema for the records.
    """
    # Binary mode: Avro containers are not text.
    writer = DataFileWriter(open("events2-{}.avro".format(date), "wb"),
                            DatumWriter(), schema)
    N = 10 ** 5
    try:
        for i in xrange(0, N):
            tag1, tag2, tag3, tag4 = [
                "t{}".format(random.randint(1, 10)) for x in range(0, 4)]
            cookie = 'CK.{}'.format(random.randint(1, 10 ** 5))
            writer.append({"tag1": tag1, "tag2": tag2, "tag3": tag3,
                           "tag4": tag4, "date": date, "cookie": cookie,
                           "count": 1})
    finally:
        writer.close()
def testWrite(filename):
    """Create ``filename`` as an Avro container holding test persons 0..9.

    The writer schema is parsed from the module-level ``TEST_SCHEMA``.
    """
    parsed_schema = avro.schema.parse(TEST_SCHEMA)
    container = DataFileWriter(open(filename, 'wb'), DatumWriter(), parsed_schema)
    for index in xrange(10):
        container.append(_makeTestPerson(index))
    container.close()
class Avro_Merger(object):
    """Merge the ``.avro`` files of a directory into one deflate-compressed file.

    NOTE(review): this class references ``schema_file``, ``output_file`` and
    ``input_dir``, none of which are defined in the class; presumably they
    are module-level names - confirm.  Several other issues are flagged
    inline; the code is left untouched because the intended behaviour is
    not clear from this file alone.
    """
    # Never read or written by the methods shown here.
    _merge_started = False
    # File-name suffix used to select input files.
    _avro_extention = '.avro'
    # Last record seen in the most recently consumed input (set by the
    # decorator wrapper below).
    _avro_stats_record = None

    def __init__(self, path, new_filename):
        """Collect the ``.avro`` file names in ``path`` and open the output writer.

        ``new_filename`` is not used by this method.
        Any failure is re-raised as ``avro.schema.AvroException``.
        """
        try:
            self._avro_files = filter(lambda x: x.endswith(self._avro_extention), iter(os.listdir(path)))
            schema = avro.schema.parse(open(schema_file).read())
            # NOTE(review): the output is opened in text mode ('w'); Avro
            # containers are binary, so 'wb' is probably intended.
            self._writter = DataFileWriter(open(output_file, 'w'), DatumWriter(), schema, 'deflate')
        except Exception as e:
            raise avro.schema.AvroException(e)
            # Unreachable: the raise above always leaves the handler.
            sys.exit(1)

    def flog_metadata_handler(func):
        """
        Decorator for methods that consume an iterator of avro records: it
        deals with the leading meta data record and remembers the very last
        (stats) record of each file during merging.
        """
        def wrapper(self, avro_records):
            """ Wrapper method for consuming flog avro file """
            # Handle meta data: skip one record unless the writer position
            # is 0.
            if self._writter.tell() != 0:  # TODO, need to fix this
                next(avro_records)
            # Handle stats line.  The deque drains the whole iterator and
            # keeps only its final element, so avro_records is exhausted
            # after this statement.
            self._avro_stats_record = deque(avro_records, maxlen=1).pop()
            # NOTE(review): func is called without ``self`` although the
            # decorated method expects (self, avro_records).
            func(avro_records)
        return wrapper

    @flog_metadata_handler
    def consume_avro(self, avro_records):
        """ Write the avro data from the buffer to file """
        # NOTE(review): reads self._avro_record, which is not assigned
        # anywhere in this class, and ignores the avro_records argument.
        # On Python 3, map() is lazy, so nothing would be appended here.
        map(self._writter.append, iter(self._avro_record))

    def merge(self):
        """ Loop through the avros and merge each file """
        for file_ in self._avro_files:
            try:
                avro_records = DataFileReader(open(os.path.join(input_dir, file_), "r"), DatumReader())
            except Exception as e:
                raise avro.schema.AvroException(e)
            # Consume the records!
            self.consume_avro(avro_records)
        # Write stats data to the last of the file
        self._writter.append(self._avro_stats_record)
        self._writter.close()
class AvroFileWriter(Writer):
    """``Writer`` implementation that appends objects to an Avro container file."""

    def __init__(self, schemaFile, avroFile):
        """Parse the schema in ``schemaFile`` and open ``avroFile`` for writing."""
        schema_text = open(schemaFile, "rb").read()
        self.schema = avro.schema.Parse(schema_text)
        sink = open(avroFile, "wb")
        self.writer = DataFileWriter(sink, DatumWriter(), self.schema)

    def write(self, obj):
        """Append ``obj`` as one record."""
        self.writer.append(obj)

    def close(self):
        """Close the underlying Avro writer."""
        self.writer.close()
def _create_or_update_table(
    self,
    data,
    table_name,
    schema_name,
    schema_suffix,
    columns_definition,
    load_strategy,
    upload_call_count,
    database_name=None,
    primary_key=None,
):
    """Buffer ``data`` into the upload's Avro file.

    This method doesn't actually create or update a table. It just creates
    and populates a single .avro file which is used in the data upload.
    The actual upload happens when the commit() method is called.

    On the first call (``upload_call_count == 1``) a temporary folder, the
    Avro file and its writer are created, and the table arguments are saved
    in ``self.table_creation_config``.  Every field is declared as a union
    of ``"null"`` and the Avro type mapped from its ``data_type``.

    Args:
        data: list of records; written records are removed from the list.
        table_name: also used as Avro record name, namespace and file name.
        columns_definition: mapping of column name to a dict that has a
            ``"data_type"`` entry.
        upload_call_count: 1 for the first batch of an upload.
        schema_name, schema_suffix, load_strategy, database_name,
        primary_key: stored unchanged for later use.
    """
    if upload_call_count == 1:
        # Create avro writer and file in temporary folder
        self.avro_folder = TemporaryDirectory()
        self.avro_file_name = self.avro_folder.name + os.sep + table_name + ".avro"
        avro_schema = avro.schema.parse(
            json.dumps({
                "type": "record",
                "name": table_name,
                "namespace": table_name,
                "fields": [{
                    "name": name,
                    "type": [
                        "null",
                        map_bq_data_type_to_avro(field["data_type"]),
                    ],
                } for name, field in columns_definition.items()],
            }))
        # Create the avro_writer object to be used going forward
        self.avro_writer = DataFileWriter(open(self.avro_file_name, "wb"),
                                          DatumWriter(), avro_schema)
        # Save the relevant kwargs for later use in the commit() method
        self.table_creation_config = {
            "table_name": table_name,
            "schema_name": schema_name,
            "schema_suffix": schema_suffix,
            "columns_definition": columns_definition,
            "load_strategy": load_strategy,
            "database_name": database_name,
            "primary_key": primary_key,
        }
        self.log.info(
            "BigQuery Uploader writes data into Avro file for later one-off upload!"
        )

    # Write records to .avro file.  Popping from the front of the list one
    # record at a time is quadratic; instead iterate once and drop the
    # consumed prefix afterwards.  As before, a record counts as consumed
    # as soon as its append is attempted.
    consumed = 0
    try:
        for record in data:
            consumed += 1
            self.avro_writer.append(record)
    finally:
        del data[:consumed]
def serialize_records(records, coin, avro_output=None):
    """Write ``records`` to an Avro container file.

    Args:
        records: iterable of datums matching the schema from
            ``transform_data().parse_schema()``.
        coin: used to derive the default file name ``<coin>.avro``.
        avro_output: output path; defaults to ``str(coin) + ".avro"``.
    """
    if avro_output is None:
        avro_output = str(coin) + ".avro"

    transformer = transform_data()
    schema = transformer.parse_schema()
    with open(avro_output, 'wb') as out:
        writer = DataFileWriter(out, DatumWriter(), schema)
        for record in records:
            writer.append(record)
        # Records are buffered inside the writer; without this flush they
        # would be lost when the file is closed by the ``with`` block.
        writer.flush()
def serialize_records(records, outpath="funding.avro"):
    """Serialize namedtuple-like ``records`` into an in-memory Avro container.

    Each record is converted to a dict via its ``_fields``; ``fundedDate``
    is rendered as ``YYYY-MM-DDTHH:MM:SS``.

    Args:
        records: iterable of objects exposing ``_fields`` and a
            ``fundedDate`` attribute that supports ``strftime``.
        outpath: currently unused; output goes to memory, not to a file.

    Returns:
        The ``StringIO`` buffer holding the container data.
    """
    schema = parse_schema()
    out = StringIO()
    writer = DataFileWriter(out, DatumWriter(), schema)
    for record in records:
        row = dict((f, getattr(record, f)) for f in record._fields)
        # Was '%H:M:S', which emitted the literal text "M:S".
        row['fundedDate'] = row['fundedDate'].strftime('%Y-%m-%dT%H:%M:%S')
        writer.append(row)
    # Push buffered records into ``out``; not close(), which would also
    # close the buffer being returned.
    writer.flush()
    return out
def write_json_to_avro(schema_uri, output_uri, json_str):
    """Write the rows of a JSON array to an Avro container file.

    Args:
        schema_uri: path of the Avro schema file.
        output_uri: path of the Avro file to create (overwritten).
        json_str: JSON text that decodes to an iterable of rows.
    """
    with open(schema_uri) as schema_file:
        schema = avro.schema.parse(schema_file.read())
    # Decode first so invalid JSON does not leave a truncated output file.
    json_list = json.loads(json_str)

    # Binary mode: Avro containers are not text.
    writer = DataFileWriter(open(output_uri, "wb"), DatumWriter(), schema)
    try:
        for row in json_list:
            writer.append(row)
    finally:
        writer.close()
def main():
    """Command-line entry point for the user-event Avro file.

    Usage:
        add [num of events to add] filename
        list filename

    ``add`` rewrites ``filename`` with its existing events followed by the
    requested number of new events from ``createUserEvent()``; ``list``
    delegates to ``listAllUserEvents``.  Exits with status 1 on bad usage
    or an unknown command.
    """
    def usage_and_exit():
        print("Usage: " + sys.argv[0])
        print("add [num of events to add] filename")
        print("list filename")
        sys.exit(1)

    if len(sys.argv) < 3:
        usage_and_exit()

    command = sys.argv[1]
    if command == 'add':
        # 'add' takes two operands; previously a missing file name raised
        # IndexError instead of printing the usage.
        if len(sys.argv) < 4:
            usage_and_exit()
        noEvents = sys.argv[2]
        filename = sys.argv[3]

        # load existing events
        existingEvents = []
        try:
            reader = DataFileReader(open(filename, "rb"), DatumReader())
            try:
                # Materialize before closing; the reader itself is useless
                # once closed, and the file is truncated below.
                existingEvents = list(reader)
            finally:
                reader.close()
        except IOError:
            print(filename + ": Could not open file.  Creating a new one.")

        # Write back out to disk
        try:
            schema = avro.schema.parse(open("etc/userevent.avsc").read())
            f = open(filename, "wb")
            writer = DataFileWriter(f, DatumWriter(), schema)
            for event in existingEvents:
                writer.append(event)

            # Append new user events
            for i in range(0, int(noEvents)):
                newEvent = createUserEvent()
                print(newEvent)
                writer.append(newEvent)

            writer.close()
            print("Wrote {0} user events".format(noEvents))
        except IOError:
            print(filename + ": Could not save file.")

    elif command == 'list':
        listAllUserEvents(sys.argv[2])
    else:
        print("Unregistered command.  Exiting")
        sys.exit(1)
def __init__(self, context):
    """Open this task's ``part-r-NNNNN.avro`` file and wrap it in a writer.

    Partition number and output directory come from the job configuration
    (``mapreduce.task.partition`` / ``mapreduce.task.output.dir``).  Both
    steps are reported at debug level on the ``AvroWriter`` child logger.
    """
    super(AvroWriter, self).__init__(context)
    self.logger = LOGGER.getChild('AvroWriter')
    conf = context.job_conf
    partition = int(conf['mapreduce.task.partition'])
    out_path = "%s/part-r-%05d.avro" % (conf["mapreduce.task.output.dir"], partition)
    handle = hdfs.open(out_path, "w")
    self.logger.debug('created hdfs file %s', out_path)
    self.writer = DataFileWriter(handle, DatumWriter(), self.schema)
    self.logger.debug('opened AvroWriter')
def writer(self, outputs, stdout, stderr=sys.stderr):
    """Overrides base method for hadoop.JobTask

    Writes element ``[1]`` of every item in ``outputs`` to ``stdout`` as an
    Avro container, using the schema returned by ``self.avro_schema()``.
    """
    schema_text = json.dumps(self.avro_schema())
    parsed_schema = avro.schema.parse(schema_text)
    container = DataFileWriter(stdout, DatumWriter(), parsed_schema)
    for item in outputs:
        container.append(item[1])
    # Needn't call close, cause the luigi job will do that.
    container.flush()
def main(schema_fn, csv_fn, avro_fn):
    """Convert a ``;``-delimited CSV file into an Avro container file.

    Each CSV row is zipped with the module-level ``FIELDS`` to form one
    record.

    Args:
        schema_fn: path of the Avro schema file.
        csv_fn: path of the input CSV file.
        avro_fn: path of the Avro file to create.
    """
    with open(schema_fn) as schema_file:
        schema = avro.schema.parse(schema_file.read())

    with open(csv_fn) as csv_file:
        rows = csv.reader(csv_file, delimiter=';')
        with open(avro_fn, 'wb') as avro_file:
            container = DataFileWriter(avro_file, DatumWriter(), schema)
            for fields in rows:
                record = dict(zip(FIELDS, fields))
                container.append(record)
            container.close()
def create_archive(basedir, destdir):
    """Bundle everything under ``basedir`` into Avro part files, then delete it.

    Directories are recorded with ``makedir`` and files are recorded chunk
    by chunk with ``makefile``.  Output goes to
    ``<destdir>/<timestamp>-part-NNNN.avro``; a new part is started through
    ``rotate_avro_file`` when the next chunk would push the current file
    past ``maxfilesize * 1.1``.  After a successful run all bundled files
    and directories are removed.

    Args:
        basedir: root of the tree to archive (and then delete).
        destdir: directory receiving the Avro part files.
    """
    all_files = []
    all_dirs = []

    # make a snapshot in case the output directory is the bundle source -
    # so we don't recursively bundle the output
    for path, dirs, files in os.walk(basedir):
        all_dirs.extend(os.path.join(path, d) for d in dirs)
        all_files.extend(os.path.join(path, f) for f in files)

    schema_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                               "avro-schemas.json")
    schema = avro.schema.parse(open(schema_path).read())

    fileprefix = time.strftime("%Y%m%d-%H%M%S")
    avrofile = fileprefix + "-part-0001.avro"
    iteration = 1
    fd = open(os.path.join(destdir, avrofile), 'wb')
    datum = avro.io.DatumWriter()
    writer = DataFileWriter(fd, datum, schema, codec='deflate')

    try:
        for d in all_dirs:
            val = makedir(os.path.basename(os.path.normpath(d)),
                          os.path.relpath(d, basedir))
            writer.append(val)

        for f in all_files:
            for sibling, numsiblings, chunk in get_file_chunks(f):
                if (fd.tell() + len(chunk)) > maxfilesize * 1.1:
                    fd, writer, iteration = rotate_avro_file(
                        fd, writer, iteration, fileprefix, destdir, datum, schema)
                entry = makefile(os.path.basename(os.path.normpath(f)),
                                 os.path.relpath(f, basedir),
                                 numsiblings, sibling, chunk)
                writer.append(entry)
                writer.flush()
                # Drop the reference to the chunk before reading the next one.
                del entry

        for f in all_files:
            os.remove(f)

        # os.walk lists parents before their children, so remove in reverse:
        # rmdir on a parent that still contains sub-directories fails.
        for d in reversed(all_dirs):
            os.rmdir(d)
    finally:
        writer.close()
        fd.close()
def create_archive(basedir, destdir):
    """Archive the tree under ``basedir`` into Avro part files and remove it.

    Every directory becomes a ``makedir`` record and every file a series of
    ``makefile`` records (one per chunk from ``get_file_chunks``).  Parts
    are written to ``destdir`` as ``<timestamp>-part-NNNN.avro`` and are
    rotated via ``rotate_avro_file`` once a chunk would exceed
    ``maxfilesize * 1.1``.  On success the archived files and directories
    are deleted from disk.

    Args:
        basedir: root of the tree to archive (and then delete).
        destdir: directory receiving the Avro part files.
    """
    all_files = []
    all_dirs = []

    # make a snapshot in case the output directory is the bundle source -
    # so we don't recursively bundle the output
    for path, dirs, files in os.walk(basedir):
        all_dirs += [os.path.join(path, d) for d in dirs]
        all_files += [os.path.join(path, f) for f in files]

    here = os.path.dirname(os.path.realpath(__file__))
    schema = avro.schema.parse(open(os.path.join(here, "avro-schemas.json")).read())

    fileprefix = time.strftime("%Y%m%d-%H%M%S")
    avrofile = fileprefix + "-part-0001.avro"
    iteration = 1
    fd = open(os.path.join(destdir, avrofile), 'wb')
    datum = avro.io.DatumWriter()
    writer = DataFileWriter(fd, datum, schema, codec='deflate')

    try:
        for d in all_dirs:
            writer.append(makedir(os.path.basename(os.path.normpath(d)),
                                  os.path.relpath(d, basedir)))

        for f in all_files:
            base_name = os.path.basename(os.path.normpath(f))
            rel_path = os.path.relpath(f, basedir)
            for sibling, numsiblings, chunk in get_file_chunks(f):
                if (fd.tell() + len(chunk)) > maxfilesize * 1.1:
                    fd, writer, iteration = rotate_avro_file(
                        fd, writer, iteration, fileprefix, destdir, datum, schema)
                record = makefile(base_name, rel_path, numsiblings, sibling, chunk)
                writer.append(record)
                writer.flush()
                # Release the chunk payload before the next one is read.
                del record

        for f in all_files:
            os.remove(f)

        # Children must go before parents: os.walk yielded parents first,
        # and rmdir refuses non-empty directories.
        for d in reversed(all_dirs):
            os.rmdir(d)
    finally:
        writer.close()
        fd.close()
def produce_kafka_messages(topic, cluster, message, data_format):
    """Send ``message`` to the Kafka ``topic``, encoded according to ``data_format``.

    * basic formats (XML, CSV, ..., JSON): the message is sent unchanged.
    * ``WITH_KEY``: sent unchanged with a random 10-letter key.
    * ``AVRO``: the message is binary-encoded with ``SCHEMA`` as a bare datum.
    * ``AVRO_WITHOUT_SCHEMA``: the message is written through a
      ``DataFileWriter`` and the resulting container bytes are sent.

    Any other format sends nothing.  The producer is flushed at the end.
    """
    def parse_schema():
        return avro.schema.Parse(json.dumps(SCHEMA))

    # Get Kafka producer
    producer = cluster.kafka.producer()

    basic_data_formats = ['XML', 'CSV', 'SYSLOG', 'NETFLOW', 'COLLECTD',
                          'BINARY', 'LOG', 'TEXT', 'JSON']

    # Write records into Kafka depending on the data_format.
    if data_format in basic_data_formats:
        producer.send(topic, message)

    elif data_format == 'WITH_KEY':
        key = get_random_string(string.ascii_letters, 10).encode()
        producer.send(topic, message, key=key)

    elif data_format == 'AVRO':
        datum_writer = avro.io.DatumWriter(parse_schema())
        buffer = io.BytesIO()
        datum_writer.write(message, avro.io.BinaryEncoder(buffer))
        producer.send(topic, buffer.getvalue())

    elif data_format == 'AVRO_WITHOUT_SCHEMA':
        buffer = io.BytesIO()
        datum_writer = avro.io.DatumWriter(parse_schema())
        container = DataFileWriter(writer=buffer, datum_writer=datum_writer,
                                   writer_schema=parse_schema())
        container.append(message)
        container.flush()
        # Grab the bytes before closing the writer.
        payload = buffer.getvalue()
        container.close()
        producer.send(topic, payload)

    producer.flush()
def dump_data(args, schema):
    """Dump all rows into the Avro file ``args.output``.

    ``dump_rows`` is called repeatedly with the current offset until the
    offset reaches the row count or stops advancing.

    Args:
        args: options; passed to ``conf_spanner`` and ``dump_rows``.
        schema: Avro writer schema.

    Returns:
        The row count reported by ``count_rows``.
    """
    # things time out around 1_500_000 rows.
    db = conf_spanner(args)
    writer = DataFileWriter(open(args.output, "wb"), DatumWriter(), schema)
    row_count = count_rows(db)
    print("Dumping {} rows".format(row_count))

    offset = 0
    while offset < row_count:
        next_offset = dump_rows(offset=offset, db=db, writer=writer, args=args)
        made_progress = next_offset != offset
        offset = next_offset
        if not made_progress:
            break

    writer.close()
    return row_count
def score(graphs, schema, url, port):
    """
    Serialize graphs to Avro and POST them to the unstructured prediction
    endpoint.

    graphs is expected to be a list of dictionaries, where each entry in the
    list represents a graph with
    * key idx -> index value
    * key vertices -> vertices of the graph
    * key edges -> edges of the graph
    * key label (optional) -> label; sent as None when missing

    Returns the ``requests`` response of the POST to
    ``<url>:<port>/predictUnstructured/?ret_mode=binary``.
    """
    stream = BufferedWriter(BytesIO())
    container = DataFileWriter(stream, avro.io.DatumWriter(), schema)
    for graph in graphs:
        record = {
            "edges": graph["edges"],
            "vertices": graph["vertices"],
            "index": graph["idx"],
            "label": graph.get("label"),
        }
        container.append(record)
    container.flush()
    # Read the bytes from the underlying BytesIO before closing the writer.
    payload = stream.raw.getvalue()
    container.close()

    endpoint = "{}:{}/predictUnstructured/?ret_mode=binary".format(
        url.strip("/"), port)
    return requests.request(
        "POST",
        endpoint,
        headers={'Content-Type': 'application/octet-stream'},
        data=payload,
    )
def encode(self, obj: BaseRecord) -> bytes:
    """ Encode *BaseHandlerEvent / BaseHandlerCommand / BaseHandlerResult* to bytes format

    This function is used by kafka-python

    Args:
        obj (BaseModel): *BaseHandlerEvent / BaseHandlerCommand / BaseHandlerResult*

    Raises:
        MissingEventClass: no schema is registered in ``self._schemas``
            under ``obj.event_name()``
        AvroEncodeError: the record's dict does not fit its Avro schema

    Returns:
        bytes: the record serialized as an Avro container
    """
    try:
        schema = self._schemas[obj.event_name()]
    except KeyError as err:
        self.logger.exception('%s', str(err))
        raise MissingEventClass

    try:
        buffer = BytesIO()
        container = DataFileWriter(buffer, DatumWriter(), schema)
        container.append(obj.to_dict())
        container.flush()
        # Take the bytes before close(), which closes the buffer too.
        payload = buffer.getvalue()
        container.close()
    except AvroTypeException as err:
        self.logger.exception('%s', str(err))
        raise AvroEncodeError
    return payload
def encode(self, raw_data):
    """Serialize ``raw_data`` as a one-record Avro container using ``self._schema``.

    Returns:
        The container bytes.
    """
    buffer = BytesIO()
    container = DataFileWriter(buffer, DatumWriter(), self._schema)
    container.append(raw_data)
    container.flush()
    # The bytes are taken after the flush and before the writer is closed.
    payload = buffer.getvalue()
    container.close()
    return payload
def _write_avro_data(self, avro_container_uri: str, data_as_dictionary: Dict[str, Any]):
    """Write ``data_as_dictionary`` as a single record to an Avro file.

    The writer schema is taken from
    ``type(self).get_avro_schema_record().avro_schema``.

    Args:
        avro_container_uri: non-empty path of the file to create.
        data_as_dictionary: the record; must be exactly a ``dict``.

    Raises:
        EmeraldMessageSerializationError: on wrong argument types, or when
            the data does not match the Avro schema.
    """
    cls = type(self)

    uri_ok = type(avro_container_uri) is str and len(avro_container_uri) != 0
    if not uri_ok:
        raise EmeraldMessageSerializationError(
            'Unable to write avro - avro_container_uri parameter' +
            ' must be a string specifying container location in writable form'
        )
    if type(data_as_dictionary) is not dict:
        raise EmeraldMessageSerializationError(
            'Unable to write avro - data_dictionary parameter is incorrect type ("' +
            type(data_as_dictionary).__name__ + os.linesep +
            'Should be a dictionary of k,v data pairs')

    if self.debug:
        print('Avro schema type = ' + str(type(cls.get_avro_schema_record().avro_schema)))
        print('Avro schema = ' + str(cls.get_avro_schema_record().avro_schema))

    with open(avro_container_uri, "wb") as writer_fp:
        schema = cls.get_avro_schema_record().avro_schema
        with DataFileWriter(writer_fp, DatumWriter(), schema) as writer:
            if self.debug:
                print('Opened data file write')
            try:
                writer.append(data_as_dictionary)
            except AvroTypeException as iex:
                raise EmeraldMessageSerializationError(
                    'Unable to serialize object of type ' + cls.__name__ +
                    ' due to data mismatch in Avro schema' + os.linesep +
                    'Error info: ' + str(iex.args[0]))
def open_avro_writer(self):
    """Create ``self.avro_writer`` from the ``Store`` section of the config.

    Settings (each read as the first element of its config value):
      * ``avro_schema`` - schema path, default ``tweet.avsc``
      * ``avro_file``   - data file path, default ``tweet.avro``
      * ``avro_append`` - when ``'True'`` and the data file exists, the
        file is opened ``rb+`` and the writer is created without a schema;
        otherwise the schema is parsed (stored in ``self.avro_schema``) and
        the file is created anew.
    The deflate codec is requested in both cases.
    """
    store = self.config['Store']
    # set path from ini file
    schema_path = store['avro_schema'][0] if 'avro_schema' in store else 'tweet.avsc'
    file_path = store['avro_file'][0] if 'avro_file' in store else 'tweet.avro'

    append_requested = 'avro_append' in store and store['avro_append'][0] == 'True'
    if append_requested and os.path.isfile(file_path) is True:
        self.avro_writer = DataFileWriter(open(file_path, 'rb+'), DatumWriter(),
                                          codec="deflate")
        return

    self.avro_schema = avro.schema.Parse(open(schema_path).read())
    self.avro_writer = DataFileWriter(open(file_path, 'wb'), DatumWriter(),
                                      self.avro_schema, codec="deflate")
def write(self, filename, records):
    """Run ``etl`` on each record and write it to the Avro file ``filename``.

    The codec is ``snappy`` when the second-to-last dot-separated part of
    ``filename`` is ``snappy``, otherwise ``null``.  For every record
    ``self.exit()`` and ``self.log_count()`` are called and
    ``self.save_count`` is incremented after a successful append.

    An ``AttributeError`` is logged and swallowed; an ``AvroTypeException``
    is logged and re-raised.
    """
    compress = 'snappy' if filename.split('.')[-2] == 'snappy' else 'null'

    try:
        with client.write(filename, overwrite=True) as writer:
            avro_out = DataFileWriter(writer, DatumWriter(), self.schema, codec=compress)
            with avro_out as data_file_writer:
                for record in records:
                    self.exit()
                    # Kept for the error reports in the handlers below.
                    _id = record['_id']['$oid']
                    etl(record)
                    self.log_count()
                    data_file_writer.append(record)
                    self.save_count += 1
    except AttributeError as e:
        logger.error(f'record: {_id}')
        logger.info(json.dumps(record, indent=4, ensure_ascii=False))
        traceback.print_exc()
        # raise e
    except AvroTypeException as e:
        logger.info(f'Save Count: {self.save_count}')
        logger.error(f'record: {_id}')
        logger.info(json.dumps(record, indent=4, ensure_ascii=False))
        raise e
def _write_data(self,
                directory=None,
                prefix=tempfile.template,
                codec='null',
                count=len(RECORDS)):
    """Write ``count`` records to a new temporary Avro file and return its path.

    Records are taken from ``self.RECORDS`` in order, wrapping around when
    ``count`` exceeds their number.  The file is not deleted on close; its
    name is added to ``self._temp_files``.

    Args:
        directory: directory for the temporary file (default location if None).
        prefix: file-name prefix for the temporary file.
        codec: Avro codec name.
        count: number of records to write.
    """
    with tempfile.NamedTemporaryFile(
            delete=False, dir=directory, prefix=prefix) as f:
        container = DataFileWriter(f, DatumWriter(), self.SCHEMA, codec=codec)
        available = len(self.RECORDS)
        for index in range(count):
            container.append(self.RECORDS[index % available])
        container.close()

        path = f.name
        self._temp_files.append(path)
        return path
def __init__(self, file):
    """Open the Avro output ``file`` and start the appender thread.

    Args:
        file: path of the Avro file to create; records are written with
            the module-level ``schema``.
    """
    threading.Thread.__init__(self)
    # Binary mode: Avro containers are not text ("w" breaks on Python 3
    # and wherever newline translation is active).
    self.avro_writer = DataFileWriter(open(file, "wb"), DatumWriter(), schema)
    self.queue = Queue.Queue()
    self.should_stop = False
    self.mutex = threading.Lock()
    # Started last so the thread only ever sees a fully built instance.
    self.start()
class AvroWriter(RecordWriter):
    """Record writer that stores a task's output in an Avro file on HDFS.

    ``schema`` is None here and is passed to the ``DataFileWriter`` as is.
    NOTE(review): presumably subclasses are expected to set it - confirm.
    """

    schema = None

    def __init__(self, context):
        """Open ``<output dir>/part-r-NNNNN.avro`` for this task's partition."""
        super(AvroWriter, self).__init__(context)
        conf = context.job_conf
        partition = int(conf['mapreduce.task.partition'])
        out_path = "%s/part-r-%05d.avro" % (conf["mapreduce.task.output.dir"], partition)
        handle = hdfs.open(out_path, "w")
        self.writer = DataFileWriter(handle, DatumWriter(), self.schema)

    def close(self):
        """Close the Avro writer and then the file's HDFS filesystem handle."""
        container = self.writer
        container.close()
        # FIXME do we really need to explicitly close the filesystem?
        container.writer.fs.close()
def __init__(self, path, new_filename):
    """Collect the ``.avro`` file names in ``path`` and open the output writer.

    NOTE(review): ``schema_file`` and ``output_file`` are not defined in
    this method; presumably they are module-level names - confirm.
    ``new_filename`` is currently unused.

    Args:
        path: directory whose ``.avro`` entries are to be merged.
        new_filename: unused.

    Raises:
        avro.schema.AvroException: wrapping any error raised during set-up.
    """
    try:
        self._avro_files = filter(lambda x: x.endswith(self._avro_extention),
                                  iter(os.listdir(path)))
        with open(schema_file) as schema_fh:
            schema = avro.schema.parse(schema_fh.read())
        # Binary mode: Avro containers are not text.
        self._writter = DataFileWriter(open(output_file, 'wb'), DatumWriter(),
                                       schema, 'deflate')
    except Exception as e:
        # The sys.exit(1) that used to follow this raise was unreachable.
        raise avro.schema.AvroException(e)
def _write_items(base_name, schema_str, items):
    """Write ``items`` to ``<base_name>.avro`` and return that file name.

    Args:
        base_name: output path without the ``.avro`` extension.
        schema_str: Avro schema as JSON text.
        items: iterable of datums matching the schema.

    Returns:
        The name of the Avro file written.
    """
    avro_schema = schema.Parse(schema_str)
    avro_file = base_name + '.avro'
    # Binary mode: Avro containers are not text.  The ``with`` block closes
    # the writer; the old bare ``writer.close`` (no call) did nothing.
    with DataFileWriter(open(avro_file, "wb"), DatumWriter(), avro_schema) as writer:
        for item in items:
            writer.append(item)
    return avro_file
def __create_nested(out_path):
    """Create ``out_path`` and write two nested records to ``part-m-00004.avro``.

    The schema is read from ``data/nested.avsc`` next to this module.

    Args:
        out_path: directory to create; ``os.makedirs`` raises if it exists.
    """
    os.makedirs(out_path)
    schema_path = os.path.join(os.path.dirname(__file__), 'data/nested.avsc')
    with open(schema_path) as schema_file:
        schema = avro.schema.parse(schema_file.read())

    # Binary mode: Avro containers are not text.
    part_file = open(os.path.join(out_path, 'part-m-00004.avro'), 'wb')
    with DataFileWriter(part_file, DatumWriter(), schema) as writer:
        writer.append({'sup': 1, 'sub': {'level2': 2}})
        writer.append({'sup': 2, 'sub': {'level2': 1}})
def _write(self, data):
    """Internal write API.

    Writes ``data`` as a single record to the file named by
    ``file_name(self.hdir, self.wmaid(data))`` using ``self.schema``.
    """
    doc_id = self.wmaid(data)
    writer_schema = self.schema
    target = file_name(self.hdir, doc_id)
    with open_file(target, 'w') as ostream:
        container = DataFileWriter(ostream, DatumWriter(), writer_schema)
        with container as writer:
            writer.append(data)
def write_pipeline_template_to_avro(pipeline, rtasks_d, output_file):
    """Write the pipeline template as a single Avro record to ``output_file``.

    Args:
        pipeline, rtasks_d: passed to ``pipeline_template_to_dict``.
        output_file: path of the Avro file to create.

    Returns:
        The dict that was written.
    """
    d = pipeline_template_to_dict(pipeline, rtasks_d)
    # Binary mode: Avro containers are not text.
    f = open(output_file, 'wb')
    with DataFileWriter(f, DatumWriter(), PT_SCHEMA) as writer:
        writer.append(d)
    return d
class AvroAppender(threading.Thread):
    """Background thread that appends queued log records to an Avro file.

    Records are queued by ``log_append`` and written by the thread's
    ``run`` loop.  ``close_appender`` asks the thread to stop; it exits and
    closes the file once the queue has been found empty after the request.
    """

    def __init__(self, file):
        """Open the Avro output ``file`` and start the thread.

        Records are written with the module-level ``schema``.
        """
        threading.Thread.__init__(self)
        # Binary mode: Avro containers are not text.
        self.avro_writer = DataFileWriter(open(file, "wb"), DatumWriter(), schema)
        self.queue = Queue.Queue()
        self.should_stop = False
        self.mutex = threading.Lock()
        self.start()

    def log_append(self, user, advertiser, **kwargs):
        """Queue one record; prints "Missing user" if user or advertiser is None.

        Optional keyword arguments (ignored when missing or falsy):
            ip, agent: copied into the record.
            time: stored as float ``timestamp``; defaults to the current time.
            keywords: comma-separated string, stored as a list of unique,
                stripped keywords.
        """
        if user is None or advertiser is None:
            print("Missing user")
            return

        record = dict(user=user, advertiser=advertiser)
        # .get(): every keyword is optional; direct indexing raised KeyError
        # whenever a caller omitted one of them.
        if kwargs.get("ip"):
            record["ip"] = kwargs["ip"]
        if kwargs.get("agent"):
            record["agent"] = kwargs["agent"]
        if kwargs.get("time"):
            record["timestamp"] = float(kwargs["time"])
        else:
            record["timestamp"] = float(time.time())
        if kwargs.get("keywords"):
            record["keywords"] = list(
                set([word.strip() for word in kwargs["keywords"].split(",")]))
        self.queue.put_nowait(record)

    def close_appender(self):
        """Ask the writer thread to stop once the queue is drained."""
        self.mutex.acquire()
        self.should_stop = True
        self.mutex.release()

    def run(self):
        """Write queued records until a stop is requested and the queue is empty."""
        while True:
            try:
                # Block for up to one second.  The previous non-blocking
                # get(False, 1000) ignored its timeout and spun the CPU
                # whenever the queue was empty.
                record = self.queue.get(True, 1)
            except Queue.Empty:
                self.mutex.acquire()
                stop = self.should_stop
                self.mutex.release()
                if stop:
                    break
            else:
                self.avro_writer.append(record)
        self.avro_writer.close()
def main():
    """Pull messages from an AMS subscription and store them in an Avro file.

    Messages are pulled and acknowledged, each payload is decoded with
    ``avro_deserialize`` and appended to ``--outfile``. An existing output
    file is appended to; otherwise a new one is created with ``--schema``.
    Exits with status 1 on any AMS or write error.
    """
    parser = ArgumentParser(description="Simple AMS example of subscription pull/consume")
    parser.add_argument('--host', type=str, default='messaging-devel.argo.grnet.gr', help='FQDN of AMS Service')
    parser.add_argument('--token', type=str, required=True, help='Given token')
    parser.add_argument('--project', type=str, required=True, help='Project registered in AMS Service')
    parser.add_argument('--subscription', type=str, required=True, help='Subscription name')
    parser.add_argument('--topic', type=str, required=True, help='Given topic')
    parser.add_argument('--nummsgs', type=int, default=3, help='Number of messages to pull and ack')
    parser.add_argument('--schema', type=str, required=True, help='Avro schema')
    parser.add_argument('--outfile', type=str, required=True, help='Output avro file')
    args = parser.parse_args()

    # initialize service with given token and project
    ams = ArgoMessagingService(endpoint=args.host, token=args.token, project=args.project)

    # ensure that subscription is created in first run. messages can be
    # pulled from the subscription only when subscription already exists
    # for given topic prior messages being published to topic
    try:
        if not ams.has_sub(args.subscription):
            ams.create_sub(args.subscription, args.topic)
        subscription = ams.get_sub(args.subscription, retobj=True)
    except AmsException as e:
        print(e)
        raise SystemExit(1)

    # try to pull number of messages from subscription. method will
    # return (ackIds, AmsMessage) tuples from which ackIds and messages
    # payload will be extracted.
    avro_payloads = list()
    for msg in subscription.pullack(args.nummsgs, retry=5, retrysleep=15, return_immediately=True):
        data = msg.get_data()
        msgid = msg.get_msgid()
        print('msgid={0}'.format(msgid))
        avro_payloads.append(data)

    try:
        schema = load_schema(args.schema)
        appending = os.path.exists(args.outfile)
        # Avro container files are binary: open with the 'b' flag. The
        # ``with`` block releases the handle even when a payload fails to
        # decode or append; only a fully successful run flushes the writer.
        with open(args.outfile, 'ab+' if appending else 'wb+') as avroFile:
            if appending:
                # No schema: the writer takes it from the existing file.
                writer = DataFileWriter(avroFile, DatumWriter())
            else:
                writer = DataFileWriter(avroFile, DatumWriter(), schema)

            for am in avro_payloads:
                msg = avro_deserialize(am, args.schema)
                writer.append(msg)

            writer.close()
    except Exception as e:
        print(e)
        raise SystemExit(1)
def write(self):
    """Write ``self.listdata`` to ``self.outfile`` as an Avro container file.

    The schema is parsed from the file named by ``self.schema``. Schema or
    type errors and I/O errors are logged via ``self.logger`` and terminate
    the process with exit status 1.
    """
    try:
        # Close the schema file deterministically instead of leaking it.
        with open(self.schema) as schema_file:
            schema = avro.schema.parse(schema_file.read())
        # Avro container files are binary, hence the 'b' flag. The ``with``
        # block releases the handle even when an append fails.
        with open(self.outfile, 'wb+') as avrofile:
            datawrite = DataFileWriter(avrofile, DatumWriter(), schema)
            for elem in self.listdata:
                datawrite.append(elem)
            datawrite.close()
    except (avro.schema.SchemaParseException, avro.io.AvroTypeException):
        self.logger.error(" couldn't parse %s" % self.schema)
        raise SystemExit(1)
    except IOError as e:
        self.logger.error(e)
        raise SystemExit(1)
def rotate_avro_file(fd, writer, iteration, fileprefix, destdir, datum, schema):
    """Close the current Avro part file and open the next numbered one.

    Args:
        fd: Currently open output file; closed here.
        writer: Avro writer bound to ``fd``; closed here.
        iteration: Number of the current part; the new part is ``iteration + 1``.
        fileprefix: Prefix of the part file name.
        destdir: Directory in which the new part file is created.
        datum: Datum writer handed to the new ``DataFileWriter``.
        schema: Avro schema for the new file.

    Returns:
        Tuple ``(new_fd, new_writer, new_iteration)``.
    """
    next_iteration = iteration + 1
    part_name = fileprefix + "-part-{0:04d}.avro".format(next_iteration)

    # Finish the previous part before the next one is created.
    writer.close()
    fd.close()

    next_fd = open(os.path.join(destdir, part_name), 'wb')
    next_writer = DataFileWriter(next_fd, datum, schema, codec='deflate')
    return next_fd, next_writer, next_iteration
def testWrite(filename, schema):
    """Write three sample objects built by ``makeObject`` to an Avro file.

    Args:
        filename: Path of the Avro file to create.
        schema: Avro schema used for the container file.
    """
    out = open(filename, 'wb')
    avro_out = DataFileWriter(out, DatumWriter(), schema)
    samples = (("Person A", 23), ("Person B", 31), ("Person C", 28))
    for label, number in samples:
        avro_out.append(makeObject(label, number))
    avro_out.close()
def main():
    """Start of execution.

    Loads the two parameter schemas, prints the combined parameter schema as
    JSON, then writes two sample parameters to ``parameters.avro`` and reads
    them back, printing each one.
    """
    # combine the schemas
    known_schemas = avro.schema.Names()
    # Loaded first so that its names are in known_schemas when
    # parameter.avsc is loaded; the returned schema itself is not used.
    types_schema = LoadAvsc("parameter_types.avsc", known_schemas)
    param_schema = LoadAvsc("parameter.avsc", known_schemas)
    print(json.dumps(param_schema.to_json(avro.schema.Names()), indent=2))

    # test the schema works
    param_1 = {"name": "test", "description": "An Avro test.", "type": "int"}
    param_2 = {"name": "test", "description": "An Avro test.", "type": "boolean"}
    # Avro container files are binary: use "wb"/"rb". The ``with`` blocks
    # release the handles even if an append or a read fails.
    with open("parameters.avro", "wb") as param_file:
        writer = DataFileWriter(param_file, DatumWriter(), param_schema)
        writer.append(param_1)
        writer.append(param_2)
        writer.close()

    with open("parameters.avro", "rb") as param_file:
        reader = DataFileReader(param_file, DatumReader())
        for parameter in reader:
            print(parameter)
        reader.close()
def readAndWriteAvro():
    """Write one tweet to ``tweets.avro`` and read it back.

    Unlike java, avro does not let you generate code for Tweet in python,
    so the only way to read and write data is without code generation.
    """
    # Read the schema (closing the file instead of leaking the handle)
    with open("tweet.avsc") as schema_file:
        schema = avro.schema.parse(schema_file.read())

    # write some data; Avro container files are binary, hence "wb"
    with open("tweets.avro", "wb") as out_file:
        writer = DataFileWriter(out_file, DatumWriter(), schema)
        writer.append({"tweetId": 5, "user": "******", "text": "Tweeting from python as well"})
        writer.close()

    # read the same data
    with open("tweets.avro", "rb") as in_file:
        tweets = DataFileReader(in_file, DatumReader())
        for tweet in tweets:
            print(tweet)
        tweets.close()
def main(argv):
    """Generate an Avro file of random user records.

    Args:
        argv: ``[prog, SCHEMA_FILE, N_USERS, AVRO_FILE]``.

    Exits with a usage message when an argument is missing or ``N_USERS``
    is not an integer.
    """
    try:
        schema_fn = argv[1]
        n_users = int(argv[2])
        avro_fn = argv[3]
    except (IndexError, ValueError):
        # ValueError: N_USERS was given but is not an integer. Report it the
        # same way as a missing argument instead of dumping a traceback.
        sys.exit('Usage: %s SCHEMA_FILE N_USERS AVRO_FILE' % argv[0])
    with open(schema_fn) as f_in:
        schema = avro.schema.parse(f_in.read())
    with open(avro_fn, 'wb') as f_out:
        writer = DataFileWriter(f_out, DatumWriter(), schema)
        for i in xrange(n_users):
            writer.append({
                'name': random.choice(NAME_POOL),
                'office': random.choice(OFFICE_POOL),
                'favorite_color': random.choice(COLOR_POOL),
                'favorite_number': i,
            })
        writer.close()
def __init__(self, xmlfile, avroschema, outputavro):
    """Setup, check if files exists, create schema and writer handler.

    Args:
        xmlfile: Path of the input XML file; must exist.
        avroschema: Path of the Avro schema file to parse.
        outputavro: Path of the Avro file to create (snappy codec).

    Raises:
        IOError: If ``xmlfile`` does not exist.
    """
    if not isfile(xmlfile):
        raise IOError("File {} doesn't exist.".format(xmlfile))
    self.xmlfile = xmlfile

    # Close the schema file deterministically instead of leaking the handle.
    with open(avroschema) as schema_file:
        self.schema = avro.schema.parse(schema_file.read())
    # Avro container files are binary, hence "wb".
    self.writer = DataFileWriter(open(outputavro, "wb"), DatumWriter(), self.schema, codec="snappy")
    self.schema_fields = {}
    # Init expected fields for the schema
    self.list_fields()
def main():
    """Re-encode Avro files under a new schema into an output directory.

    Each input file given with ``-i`` is read and every entry is appended to
    a file of the same base name in the ``-o`` directory, written with the
    ``-s`` schema. With ``-ts``, integer values in an entry's ``tags`` are
    converted to strings first. A ``UnicodeDecodeError`` while copying a
    file is printed together with the file name and the next file is tried.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', nargs=1, help='new schema', required=True, metavar='avro schema')
    parser.add_argument('-i', nargs='+', help='avro files', required=True, metavar='avro file')
    parser.add_argument('-ts', action='store_true', help='convert int tag values to str', required=False)
    parser.add_argument('-o', nargs=1, help='output directory', required=True, metavar='output directory')
    args = parser.parse_args()

    # The destination directory and the schema do not depend on the input
    # file, so they are prepared once instead of once per file.
    if args.o[0].startswith('/'):
        dest = args.o[0]
    else:
        dest = os.path.join(os.path.abspath('.'), args.o[0])

    try:
        os.makedirs(dest)
    except OSError as e:
        # An already existing destination is fine; anything else is fatal.
        if e.args[0] != errno.EEXIST:
            print('%s %s %s' % (os.strerror(e.args[0]), e.args[1], args.o[0]))
            raise SystemExit(1)

    with open(args.s[0]) as schema_file:
        schema = avro.schema.parse(schema_file.read())

    for f in args.i:
        out_path = os.path.join(dest, os.path.basename(f))
        # Avro container files are binary: use 'wb'/'rb'. The ``with`` block
        # releases both handles even when copying fails part-way; the writer
        # is only flushed (writer.close()) after a complete copy.
        with open(out_path, 'wb') as out_file, open(f, 'rb') as in_file:
            writer = DataFileWriter(out_file, DatumWriter(), schema)
            reader = DataFileReader(in_file, DatumReader())

            try:
                for entry in reader:
                    if args.ts:
                        for t in entry['tags']:
                            if isinstance(entry['tags'][t], int):
                                entry['tags'][t] = str(entry['tags'][t])
                    writer.append(entry)

                writer.close()

            except UnicodeDecodeError as e:
                pprint.pprint(e)
                print(f)