def read(self, format):
    time_start = time.time()
    if format == 'json':
        with open('./output/output.json') as file:
            json.loads(file.read())
    elif format == 'jsch':
        with open('./output/output.json') as file:
            validate(json.loads(file.read()), self._schema_json)
    elif format == 'avro':
        # Avro container files are binary; open in 'rb' mode.
        reader = DataFileReader(open('./output/output.avro', 'rb'), DatumReader())
        for user in reader:
            pass
        reader.close()
    elif format == 'protobuf':
        with open('./output/output.pb', 'rb') as file:
            addressbook_pb2.AddressBook().ParseFromString(file.read())
    elif format == 'gzjson':
        with gzip.open('./output/output.jsz', 'rb') as file:
            json.loads(file.read())
    time_end = time.time()
    return time_end - time_start

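For symmetry with the benchmark above, a minimal sketch of the Avro write side that could produce the ./output/output.avro fixture. The self._schema_avro attribute is a hypothetical parsed avro.schema object; everything else is the standard DataFileWriter API.

def write_avro(self, records):
    # Hedged sketch: assumes self._schema_avro is an avro.schema.Schema
    # parsed elsewhere; the path mirrors the read() benchmark above.
    time_start = time.time()
    with open('./output/output.avro', 'wb') as out:
        writer = DataFileWriter(out, DatumWriter(), self._schema_avro)
        for record in records:
            writer.append(record)
        writer.close()
    return time.time() - time_start
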
def deserializeDataFromFile2Str(inputFile):
    logging.debug("Deserializing file: %s", inputFile)
    # Container files are binary; open in 'rb' mode.
    reader = DataFileReader(open(inputFile, "rb"), DatumReader())
    # Join the records instead of concatenating strings in a loop.
    data = "".join(str(item) for item in reader)
    reader.close()
    return data

def generic_dataframe(self, df, avro_schema, assert_fns=None):
    """Generic test running function for arbitrary avro schemas.

    Writes a dataframe containing the records to avro.
    Reads back and compares with the original.
    """
    print(avro_schema)
    cyavro.write_avro_file_from_dataframe(df, self.filename,
                                          json.dumps(avro_schema),
                                          codec='null')
    if assert_fns is None:
        assert_fns = {}
    df_read = cyavro.read_avro_file_as_dataframe(self.filename)

    import avro.schema
    from avro.datafile import DataFileReader, DataFileWriter
    from avro.io import DatumReader, DatumWriter

    # Build a reference dataframe with the plain avro reader.
    with open(self.filename, 'rb') as fo:
        reader = DataFileReader(fo, DatumReader())
        records = [user for user in reader]
        df_reference = pd.DataFrame(records)
        reader.close()

    success = True
    for col in avro_schema["fields"]:
        colname = col['name']
        assert_fn = assert_fns.get(colname, np.testing.assert_array_equal)

        def print_fail_header(s):
            print('#' * len(s))
            print("FAIL: Column {}".format(col))
            print('#' * len(s))
            print(s)

        try:
            assert_fn(df_read[colname], df[colname])
        except AssertionError:
            print_fail_header("Failed for cyavro read comparison {}\n".format(col))
            traceback.print_exc(file=sys.stdout)
            success = False

        try:
            assert_fn(df_reference[colname], df[colname])
        except AssertionError:
            print_fail_header("Failed for cyavro write comparison {}\n".format(col))
            traceback.print_exc(file=sys.stdout)
            success = False

    assert success

def testRead(filename):
    fd = open(filename, 'rb')
    # This is a DatumReader, not a writer; the original name was misleading.
    datum_reader = DatumReader()
    freader = DataFileReader(fd, datum_reader)
    for datum in freader:
        print(datum['name'], datum['company'])
        print(datum['website'])
    # close() returns None, so there is nothing useful to print here.
    freader.close()

def testRead(filename):
    fd = open(filename, 'rb')
    datum_reader = DatumReader()
    reader = DataFileReader(fd, datum_reader)
    for record in reader:
        print(record['name'], record['age'])
    reader.close()

def main():
    if len(sys.argv) < 3:
        print("Usage:", sys.argv[0])
        print("add [num of events to add] filename")
        print("list filename")
        exit(1)
    command = sys.argv[1]
    if command == 'add':
        noEvents = sys.argv[2]
        filename = sys.argv[3]
        # Load existing events; iterating the reader materializes the records
        # (assigning the reader object itself, as before, loads nothing).
        existingEvents = []
        try:
            reader = DataFileReader(open(filename, "rb"), DatumReader())
            existingEvents = list(reader)
            reader.close()
        except IOError:
            print(filename + ": Could not open file. Creating a new one.")
        # Write back out to disk (container files are binary, hence "wb").
        try:
            schema = avro.schema.parse(open("etc/userevent.avsc").read())
            f = open(filename, "wb")
            writer = DataFileWriter(f, DatumWriter(), schema)
            # Re-write the existing events, then append the new ones.
            for event in existingEvents:
                writer.append(event)
            for i in range(0, int(noEvents)):
                newEvent = createUserEvent()
                print(newEvent)
                writer.append(newEvent)
            writer.close()
            print("Wrote {0} user events".format(noEvents))
        except IOError:
            print(filename + ": Could not save file.")
    elif command == 'list':
        listAllUserEvents(sys.argv[2])
    else:
        print("Unregistered command. Exiting")
        sys.exit(1)

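The read-then-rewrite cycle above can usually be avoided: the avro package's DataFileWriter appends in place when the file is opened in "ab+" mode and no schema is passed, in which case the schema is taken from the existing file header. A minimal sketch under that assumption:

def appendUserEvents(filename, noEvents):
    # Hedged sketch: with no writers_schema argument, DataFileWriter
    # reuses the schema already stored in the container file's header.
    writer = DataFileWriter(open(filename, "ab+"), DatumWriter())
    for _ in range(int(noEvents)):
        writer.append(createUserEvent())
    writer.close()
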
def loadOldData(filename):
    oldDataDict = dict()
    if not os.path.isfile(filename):
        return oldDataDict
    # Open the container file in binary mode.
    reader = DataFileReader(open(filename, "rb"), DatumReader())
    for weight in reader:
        oldDataDict[weight["site"]] = weight["weight"]
    reader.close()
    return oldDataDict

def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hi:s:",
                                   ["help", "input-file=", "schema="])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err))  # will print something like "option -a not recognized"
        usage(sys.argv[0])
        sys.exit(2)
    avro_file = None
    avro_schema_file = None
    required_cl = 0
    for o, a in opts:
        if o in ("-h", "--help"):
            usage(sys.argv[0])
            sys.exit()
        elif o in ("-i", "--input-file"):
            required_cl += 1
            avro_file = a
        elif o in ("-s", "--schema"):
            avro_schema_file = a
        else:
            assert False, "unhandled option"
    if required_cl < 1:
        print("ERROR: Missing required argument")
        usage(sys.argv[0])
        sys.exit(1)
    if not avro_schema_file:
        # No schema given: the container file carries its own schema.
        reader = DataFileReader(open(avro_file, "rb"), DatumReader())
        for datum in reader:
            print(datum)
        reader.close()
    else:
        # Schema given: treat the input as raw binary-encoded datums.
        with open(avro_schema_file, "r") as reader_schema:
            avro_schema = reader_schema.read()
        parsed_avro_schema = avro.schema.parse(avro_schema)
        with open(avro_file, "rb") as reader_data:
            inputio = io.BytesIO(reader_data.read())
        decoder = avro.io.BinaryDecoder(inputio)
        reader = avro.io.DatumReader(parsed_avro_schema)
        while inputio.tell() < len(inputio.getvalue()):
            avro_datum = reader.read(decoder)
            print(avro_datum)

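The schemaless branch above has a natural write-side counterpart. A minimal sketch, assuming the same parsed_avro_schema object: DatumWriter plus BinaryEncoder emit bare binary datums with no container header, which is exactly what the BinaryDecoder loop consumes.

def write_raw_datums(out_path, parsed_avro_schema, datums):
    # Hedged sketch: bare datums carry no header, so the schema
    # must travel separately (e.g. the -s flag above).
    with open(out_path, "wb") as out:
        encoder = avro.io.BinaryEncoder(out)
        writer = avro.io.DatumWriter(parsed_avro_schema)
        for datum in datums:
            writer.write(datum, encoder)
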
def listAllUserEvents(filename):
    try:
        reader = DataFileReader(open(filename, "rb"), DatumReader())
        for event in reader:
            # Query uuids of events
            print("event id: {0}, event data extra fields: {1}".format(
                event["uuid"], event["eventData"]["otherEventData"]))
        reader.close()
    except IOError:
        print(filename + ": Could not open file. Exiting")
        sys.exit(1)

def handle(self):
    data = self.request.recv(8024).strip()
    # The payload is binary Avro, so wrap it in BytesIO (from io), not StringIO.
    buffer = BytesIO(data)
    reader = DataFileReader(buffer, DatumReader())
    for fileData in reader:
        # Avoid shadowing the built-in id() and the outer `data` variable.
        file_id = fileData['id']
        payload = fileData['data']
        print(fileData)
        if file_id not in fileDict:
            # "wb" assumes the record's 'data' field carries bytes.
            fileDict[file_id] = open("./" + file_id, "wb")
        f = fileDict[file_id]
        f.write(payload)
        f.flush()
    reader.close()

def main():
    """Start of execution"""
    # combine the schemas
    known_schemas = avro.schema.Names()
    types_schema = LoadAvsc("parameter_types.avsc", known_schemas)
    param_schema = LoadAvsc("parameter.avsc", known_schemas)
    print(json.dumps(param_schema.to_json(avro.schema.Names()), indent=2))
    # test the schema works (container files are binary: "wb"/"rb")
    param_file = open("parameters.avro", "wb")
    writer = DataFileWriter(param_file, DatumWriter(), param_schema)
    param_1 = {"name": "test", "description": "An Avro test.", "type": "int"}
    param_2 = {"name": "test", "description": "An Avro test.", "type": "boolean"}
    writer.append(param_1)
    writer.append(param_2)
    writer.close()
    reader = DataFileReader(open("parameters.avro", "rb"), DatumReader())
    for parameter in reader:
        print(parameter)
    reader.close()

def readAndWriteAvro():
    """Unlike Java, Avro does not let you generate code for Tweet in Python,
    so the only way to read and write data is without code generation."""
    # Read the schema
    schema = avro.schema.parse(open("tweet.avsc").read())
    # write some data (container files are binary, hence "wb")
    writer = DataFileWriter(open("tweets.avro", "wb"), DatumWriter(), schema)
    writer.append({"tweetId": 5, "user": "******", "text": "Tweeting from python as well"})
    writer.close()
    # read the same data back
    tweets = DataFileReader(open("tweets.avro", "rb"), DatumReader())
    for tweet in tweets:
        print(tweet)
    tweets.close()

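Container files can also be compressed at write time via the codec argument of DataFileWriter ('deflate' ships with the avro package; 'snappy' additionally needs python-snappy). A minimal sketch reusing the same tweet.avsc schema; readers decompress blocks transparently:

def writeCompressedAvro():
    # Hedged sketch: same schema as above, with deflate block compression.
    schema = avro.schema.parse(open("tweet.avsc").read())
    writer = DataFileWriter(open("tweets-deflate.avro", "wb"), DatumWriter(),
                            schema, codec='deflate')
    writer.append({"tweetId": 6, "user": "example", "text": "Compressed tweet"})
    writer.close()
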
def read(fin, fout=None, nrecords=0):
    "Read the given avro file according to its schema and dump its content to stdout (or fout)"
    reader = DataFileReader(open(fin, "rb"), DatumReader())
    fobj = open(fout, 'w') if fout else None
    count = 0
    if fobj:
        fobj.write("[\n")
    for rec in reader:
        # Check the limit before processing the record, so exactly
        # nrecords records are emitted (the old check was off by one).
        if nrecords and count >= nrecords:
            break
        if fobj:
            if count:
                fobj.write(",\n")
            fobj.write(json.dumps(rec))
        else:
            pprint.pprint(rec)
        count += 1
    if fobj:
        fobj.write("\n]\n")
        fobj.close()
    reader.close()

def test_data_format_avro(sdc_builder, sdc_executor, couchbase):
    bucket_name = get_random_string(string.ascii_letters, 10)
    document_key = 'id'
    DATA = {
        'name': 'boss',
        'age': 60,
        'emails': ['*****@*****.**', '*****@*****.**'],
        'boss': None
    }
    SCHEMA = {
        'namespace': 'example.avro',
        'type': 'record',
        'name': 'Employee',
        'fields': [
            {'name': 'name', 'type': 'string'},
            {'name': 'age', 'type': 'int'},
            {'name': 'emails', 'type': {'type': 'array', 'items': 'string'}},
            {'name': 'boss', 'type': ['Employee', 'null']}
        ]
    }
    cluster = couchbase.cluster

    # Build the pipeline
    builder = sdc_builder.get_pipeline_builder()
    source = builder.add_stage('Dev Raw Data Source').set_attributes(
        data_format='JSON', raw_data=json.dumps(DATA), stop_after_first_batch=True)
    destination = builder.add_stage('Couchbase', type='destination')
    destination.set_attributes(authentication_mode='USER',
                               bucket=bucket_name,
                               document_key=document_key,
                               data_format='AVRO',
                               avro_schema=json.dumps(SCHEMA),
                               avro_schema_location='INLINE')
    source >> destination
    pipeline = builder.build().configure_for_environment(couchbase)
    sdc_executor.add_pipeline(pipeline)
    try:
        logger.info('Creating %s Couchbase bucket ...', bucket_name)
        couchbase.bucket_manager.create_bucket(
            CreateBucketSettings(name=bucket_name, bucket_type='couchbase',
                                 ram_quota_mb=256))
        couchbase.wait_for_healthy_bucket(bucket_name)
        sdc_executor.start_pipeline(pipeline).wait_for_finished()
        bucket = cluster.bucket(bucket_name)
        doc_value = bucket.get(document_key).value
        # decode the bytes object returned by Couchbase
        file = BytesIO(doc_value)
        reader = DataFileReader(file, DatumReader())
        records = [record for record in reader]
        assert len(records) == 1, \
            'Number of records stored should equal number of records that entered the pipeline'
        assert records[0] == DATA
        reader.close()
    finally:
        if pipeline and sdc_executor.get_pipeline_status(
                pipeline).response.json().get('status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
        try:
            logger.info('Deleting %s Couchbase bucket ...', bucket_name)
            couchbase.bucket_manager.drop_bucket(bucket_name)
        except Exception as e:
            logger.error(f"Can't delete bucket: {e}")

def readFile():
    reader = DataFileReader(open("part-00000.avro", "rb"), DatumReader())
    for user in reader:
        print(user)
    reader.close()

import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

with open("blog.avsc") as schema_file:
    schema = avro.schema.parse(schema_file.read())

with open("blog.avro", "wb") as out_file:
    writer = DataFileWriter(out_file, DatumWriter(), schema)
    writer.append({
        "title": "Avro is awesome",
        "content": "Let's learn Avro!",
        "is_published": False
    })
    writer.close()

# Container files are binary, so read them back in "rb" mode.
with open("blog.avro", "rb") as in_file:
    reader = DataFileReader(in_file, DatumReader())
    for blog in reader:
        print(blog)
    reader.close()

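Recent releases of the avro package let DataFileReader and DataFileWriter act as context managers themselves, which makes the explicit close() calls above unnecessary. A minimal sketch under that assumption:

# Hedged sketch: assumes an avro release where DataFileReader implements
# the context-manager protocol (__enter__/__exit__).
with DataFileReader(open("blog.avro", "rb"), DatumReader()) as reader:
    for blog in reader:
        print(blog)
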
def main():
    known_schemas = avro.schema.Names()

    with open("point.avsc", "rb") as fp:
        point = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

    # Register the review schema under its own name (the original bound it
    # to `place`, which was immediately overwritten below).
    with open("review.avsc", "rb") as fp:
        review = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

    with open("place.avsc", "rb") as fp:
        place = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

    # Avro container data is binary: buffer it with io.BytesIO, not StringIO.
    output = io.BytesIO()
    writer = DataFileWriter(output, DatumWriter(), point)
    writer.append({'x': 1.5, 'y': 2.75})
    writer.flush()
    serialized = output.getvalue()
    reader = DataFileReader(io.BytesIO(serialized), DatumReader())
    deserialized = tuple(reader)[0]
    assert deserialized['x'] == 1.5
    assert deserialized['y'] == 2.75
    reader.close()
    writer.close()

    try:
        output = io.BytesIO()
        writer = DataFileWriter(output, DatumWriter(), point)
        writer.append({'x': 1.5})
        assert False
    except AvroTypeException:
        pass

    try:
        output = io.BytesIO()
        writer = DataFileWriter(output, DatumWriter(), point)
        writer.append({'x': 1.5, 'y': "wtanaka.com"})
        assert False
    except AvroTypeException:
        pass

    output = io.BytesIO()
    writer = DataFileWriter(output, DatumWriter(), place)
    writer.append({
        'name': 'wtanaka.com',
        'location': {'x': 1.5, 'y': 2.75}
    })
    writer.flush()
    serialized = output.getvalue()
    reader = DataFileReader(io.BytesIO(serialized), DatumReader())
    deserialized = tuple(reader)[0]
    assert deserialized['location']['x'] == 1.5
    assert deserialized['location']['y'] == 2.75
    reader.close()
    writer.close()

    output = io.BytesIO()
    writer = DataFileWriter(output, DatumWriter(), place)
    writer.append({
        'name': 'wtanaka.com',
        'location': {'x': 1.5, 'y': 2.75},
        'review': {'rating': 4, 'text': '4 stars would come again'},
    })
    writer.flush()
    serialized = output.getvalue()
    reader = DataFileReader(io.BytesIO(serialized), DatumReader())
    deserialized = tuple(reader)[0]
    assert deserialized['location']['x'] == 1.5
    assert deserialized['location']['y'] == 2.75
    reader.close()
    writer.close()

    try:
        output = io.BytesIO()
        writer = DataFileWriter(output, DatumWriter(), place)
        writer.append({
            'name': 'wtanaka.com',
            'location': {'x': 1.5, 'y': 2.75},
            'review': {'x': 1.5, 'y': 2.75},
        })
        assert False
    except AvroTypeException:
        pass

def main():
    parser = optparse.OptionParser(description="""Filters consumer messages
        based on various criteria (allowed NGIs, service flavours, metrics...)""")
    parser.add_option('-g', dest='gloconf', nargs=1, metavar='global.conf',
                      help='path to global configuration file', type=str)
    group = optparse.OptionGroup(parser, 'Compute Engine usage')
    group.add_option('-d', dest='date', nargs=1, metavar='YEAR-MONTH-DAY')
    parser.add_option_group(group)
    group = optparse.OptionGroup(parser, 'Debugging usage')
    group.add_option('-f', dest='cfile', nargs=1,
                     metavar='consumer_log_YEAR-MONTH-DAY.avro')
    parser.add_option_group(group)
    (options, args) = parser.parse_args()

    global logger
    logger = Logger(os.path.basename(sys.argv[0]))

    prefilter = {'Prefilter': ['ConsumerFilePath', 'PoemExpandedProfiles',
                               'PoemNameMapping', 'LookbackPoemExpandedProfiles']}
    schemas = {'AvroSchemas': ['Prefilter']}
    output = {'Output': ['Prefilter']}
    confpath = options.gloconf if options.gloconf else None
    cglob = Global(confpath, schemas, output, prefilter)
    global globopts
    globopts = cglob.parse()

    stats = ()

    if options.cfile and options.date:
        parser.print_help()
        raise SystemExit(1)
    elif options.cfile:
        fname = options.cfile
        date = options.cfile.split('_')[-1]
        date = date.split('.')[0]
        date = date.split('-')
    elif options.date:
        date = options.date.split('-')
    else:
        parser.print_help()
        raise SystemExit(1)

    if len(date) != 3:
        logger.error('Consumer file does not end with correctly formatted date')
        parser.print_help()
        raise SystemExit(1)

    year, month, day = date

    # avro files
    if options.cfile:
        inputFile = options.cfile
    else:
        inputFile = gen_fname_repdate(logger, year+'-'+month+'-'+day,
                                      globopts['PrefilterConsumerFilePath'.lower()], '')
    outputFile = gen_fname_repdate(logger, year+'_'+month+'_'+day,
                                   globopts['OutputPrefilter'.lower()], '')

    try:
        schema = avro.schema.parse(open(globopts['AvroSchemasPrefilter'.lower()]).read())
        # Container files are binary: write with "wb" and read with "rb".
        writer = DataFileWriter(open(outputFile, "wb"), DatumWriter(), schema)
        reader = DataFileReader(open(inputFile, "rb"), DatumReader())
    except IOError as e:
        logger.error(str(e))
        raise SystemExit(1)

    # load poem data
    ngis = loadNGIs(year, month, day)
    profiles = loadFilteredProfiles(year, month, day)
    nameMapping = loadNameMapping(year, month, day)

    s = time.time()
    msgs, msgswrit, msgsfilt, falsemonhost, falseroc, falseprofile = prefilterit(
        reader, writer, ngis, profiles, nameMapping)
    e = time.time()

    logger.info('ExecTime:%.2fs ConsumerDate:%s Read:%d Written:%d Filtered:%d(Monitoring_Host:%d,ROC:%d,ServiceTypes_Metrics:%d)'
                % (round(e - s, 2), year+'-'+month+'-'+day, msgs, msgswrit,
                   msgsfilt, falsemonhost, falseroc, falseprofile))

    reader.close()
    writer.close()

def test1(self):
    """
    Run a tethered map-reduce job.

    Assumptions: 1) bash is available in /bin/bash
    """
    from word_count_task import WordCountTask
    from avro.tether import tether_task_runner
    from avro.datafile import DataFileReader
    from avro.io import DatumReader
    import avro
    import subprocess
    import shutil
    import tempfile
    import inspect

    proc = None
    exfile = None  # initialized so the finally block can test it safely
    try:
        # TODO we use the tempfile module to generate random names
        # for the files
        base_dir = "/tmp/test_tether_word_count"
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)

        inpath = os.path.join(base_dir, "in")
        infile = os.path.join(inpath, "lines.avro")

        lines = [
            "the quick brown fox jumps over the lazy dog",
            "the cow jumps over the moon",
            "the rain in spain falls mainly on the plains"
        ]

        self._write_lines(lines, infile)
        true_counts = self._count_words(lines)

        if not os.path.exists(infile):
            self.fail("Missing the input file {0}".format(infile))

        # The schema for the output of the mapper and reducer
        oschema = """
{"type":"record",
 "name":"Pair","namespace":"org.apache.avro.mapred","fields":[
   {"name":"key","type":"string"},
   {"name":"value","type":"long","order":"ignore"}
 ]
}
"""

        # write the schema to a temporary file
        osfile = tempfile.NamedTemporaryFile(mode='w', suffix=".avsc",
                                             prefix="wordcount", delete=False)
        outschema = osfile.name
        osfile.write(oschema)
        osfile.close()

        if not os.path.exists(outschema):
            self.fail("Missing the schema file")

        outpath = os.path.join(base_dir, "out")

        args = []
        args.append("java")
        args.append("-jar")
        args.append(os.path.abspath(
            "@TOPDIR@/../java/tools/target/avro-tools-@[email protected]"))
        args.append("tether")
        args.extend(["--in", inpath])
        args.extend(["--out", outpath])
        args.extend(["--outschema", outschema])
        args.extend(["--protocol", "http"])

        # form the arguments for the subprocess
        srcfile = inspect.getsourcefile(tether_task_runner)

        # Create a shell script to act as the program we want to execute
        # We do this so we can set the python path appropriately
        script = """#!/bin/bash
export PYTHONPATH={0}
python -m avro.tether.tether_task_runner word_count_task.WordCountTask
"""
        # We need to make sure avro is on the path
        # getsourcefile(avro) returns .../avro/__init__.py
        asrc = inspect.getsourcefile(avro)
        apath = asrc.rsplit(os.sep, 2)[0]

        # path to where the tests lie
        tpath = os.path.split(__file__)[0]

        exhf = tempfile.NamedTemporaryFile(mode='w', prefix="exec_word_count_",
                                           delete=False)
        exfile = exhf.name
        exhf.write(script.format((os.pathsep).join([apath, tpath]), srcfile))
        exhf.close()

        # make it world executable (0o755 is the Python 3 octal literal)
        os.chmod(exfile, 0o755)

        args.extend(["--program", exfile])
        print("Command:\n\t{0}".format(" ".join(args)))
        proc = subprocess.Popen(args)
        proc.wait()

        # read the output (binary container file, so "rb")
        with open(os.path.join(outpath, "part-00000.avro"), "rb") as hf:
            reader = DataFileReader(hf, DatumReader())
            for record in reader:
                self.assertEqual(record["value"], true_counts[record["key"]])
            reader.close()
    finally:
        # close the process
        if proc is not None and proc.returncode is None:
            proc.kill()
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)
        if exfile is not None and os.path.exists(exfile):
            os.remove(exfile)

import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

schema = avro.schema.parse(open('./schema.avsc', 'rb').read())

# Create an avro file
writer = DataFileWriter(open('user.avro', 'wb'), DatumWriter(), schema)
writer.append({'name': 'Eric', 'favorite_number': 128})
writer.append({
    'name': 'Tanya',
    'favorite_color': 'red',
    'favorite_number': 383
})
writer.close()

# Now read that file back
reader = DataFileReader(open('user.avro', 'rb'), DatumReader())
for user in reader:
    print(user)
reader.close()
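
If the reading side wants a different but compatible record shape, Avro schema resolution can convert at read time. A minimal sketch, assuming a hypothetical ./reader_schema.avsc that is a compatible evolution of ./schema.avsc (e.g. it drops favorite_color): the writer's schema comes from the user.avro header, and DatumReader resolves between the two.

# Hedged sketch: reader_schema.avsc is a hypothetical, compatible evolution
# of ./schema.avsc; the writer's schema is read from the user.avro header.
reader_schema = avro.schema.parse(open('./reader_schema.avsc', 'rb').read())
reader = DataFileReader(open('user.avro', 'rb'),
                        DatumReader(readers_schema=reader_schema))
for user in reader:
    print(user)  # records decoded into the reader schema's shape
reader.close()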