Exemplo n.º 1
  def read(self, format):
    time_start = time.time()

    if format == 'json':
      with open('./output/output.json') as file:
        json.loads(file.read())

    elif format == 'jsch':
      with open('./output/output.json') as file:
        validate(json.loads(file.read()), self._schema_json)

    elif format == 'avro':
      reader = DataFileReader(open('./output/output.avro', 'rb'), DatumReader())
      for user in reader:
        pass
      reader.close()

    elif format == 'protobuf':
      with open('./output/output.pb', 'rb') as file:
        addressbook_pb2.AddressBook().ParseFromString(file.read())

    elif format == 'gzjson':
      with gzip.open('./output/output.jsz', 'rb') as file:
        json.loads(file.read())

    time_end = time.time()

    return time_end - time_start
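A hypothetical driver for the benchmark above (the Benchmark class name and its construction are assumptions made for illustration, not part of the original snippet) would simply call read() once per format and report the elapsed times:

# Hypothetical usage of the read() benchmark above; Benchmark() is an assumption.
benchmark = Benchmark()
for fmt in ('json', 'jsch', 'avro', 'protobuf', 'gzjson'):
    elapsed = benchmark.read(fmt)
    print('{}: {:.4f}s'.format(fmt, elapsed))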
Exemplo n.º 2
def deserializeDataFromFile2Str(inputFile):
	logging.debug("Deserializing file:"+inputFile)
	reader = DataFileReader(open(inputFile, "rb"), DatumReader())
	data=""
	for item in reader:
		data=data+str(item)
	reader.close()
	return data
Exemplo n.º 3
    def generic_dataframe(self, df, avro_schema, assert_fns=None):
        """Generic test running function for arbitrary avro schemas.

        Writes a dataframe containing the records to avro.

        Reads back and compares with the original
        """
        print(avro_schema)

        cyavro.write_avro_file_from_dataframe(df, self.filename,
                                              json.dumps(avro_schema),
                                              codec='null'
                                              )

        if assert_fns is None:
            assert_fns = {}

        df_read = cyavro.read_avro_file_as_dataframe(self.filename)

        import avro.schema
        from avro.datafile import DataFileReader, DataFileWriter
        from avro.io import DatumReader, DatumWriter

        with open(self.filename, 'rb') as fo:
            reader = DataFileReader(fo, DatumReader())
            records = []
            for user in reader:
                records.append(user)
            df_reference = pd.DataFrame(records)
            reader.close()

        success = True

        for col in avro_schema["fields"]:
            colname = col['name']
            assert_fn = assert_fns.get(colname, np.testing.assert_array_equal)

            def print_fail_header(s):
                print('#' * len(s))
                print("FAIL: Column {}".format(col))
                print('#' * len(s))
                print(s)

            try:
                assert_fn(df_read[colname], df[colname])
            except AssertionError:
                print_fail_header("Failed for cyavro read comparison  {}\n".format(col))
                traceback.print_exc(file=sys.stdout)
                success = False

            try:
                assert_fn(df_reference[colname], df[colname])
            except AssertionError:
                print_fail_header("Failed for cyavro write comparison {}\n".format(col))
                traceback.print_exc(file=sys.stdout)
                success = False

        assert success
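A hedged usage sketch for generic_dataframe, as it might be called from inside the test class; the record name, column and values below are illustrative assumptions:

# Illustrative only: a tiny schema/DataFrame pair for generic_dataframe.
schema = {
    "type": "record",
    "name": "Test",
    "fields": [{"name": "x", "type": "long"}],
}
df = pd.DataFrame({"x": [1, 2, 3]})
self.generic_dataframe(df, schema)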
Exemplo n.º 4
def testRead(filename):
    fd = open(filename, 'rb')
    datum_reader = DatumReader()
    freader = DataFileReader(fd, datum_reader)
    for datum in freader:
        print(datum['name'], datum['company'])
        print(datum['website'])
        print()
    freader.close()
Exemplo n.º 5
def testRead(filename):
    fd = open(filename, 'rb')

    datum = DatumReader()
    reader = DataFileReader(fd, datum)

    for record in reader:
        print(record['name'], record['age'])

    reader.close()
Exemplo n.º 6
def main():

    if len(sys.argv) < 3:
        print("Usage:", sys.argv[0])
        print("add [num of events to add] filename")
        print("list filename")
        exit(1)

    command = sys.argv[1]

    if command == 'add':

        noEvents = sys.argv[2]
        filename = sys.argv[3]

        # load existing events

        existingEvents = []

        try:
            reader = DataFileReader(open(filename, "rb"), DatumReader())
            existingEvents = list(reader)
            reader.close()
        except IOError:
            print(filename + ": Could not open file.  Creating a new one.")

        # Write back out to disk

        try:

            schema = avro.schema.parse(open("etc/userevent.avsc").read())

            f = open(filename, "wb")
            writer = DataFileWriter(f, DatumWriter(), schema)

            # Append new user events

            for i in range(0, int(noEvents)):
                newEvent = createUserEvent()
                print(newEvent)
                writer.append(newEvent)

            writer.close()

            print("Wrote {0} user events".format(noEvents))
        except IOError:
            print(filename + ": Could not save file.")

    elif command == 'list':

        listAllUserEvents(sys.argv[2])

    else:
        print "Unregistered command. Exiting"
        sys.exit(1)
Exemplo n.º 7
def loadOldData(filename):
    oldDataDict = dict()

    if not os.path.isfile(filename):
        return oldDataDict

    reader = DataFileReader(open(filename, "r"), DatumReader())
    for weight in reader:
        oldDataDict[weight["site"]] = weight["weight"]
    reader.close()

    return oldDataDict
Exemplo n.º 8
def main():
	try:
		opts, args = getopt.getopt(sys.argv[1:], "hi:s:", ["help", "input-file=",
						"schema="])
	except getopt.GetoptError as err:
		# print help information and exit:
		print(str(err))  # will print something like "option -a not recognized"
		usage(sys.argv[0])
		sys.exit(2)

	avro_file = None
	avro_schema_file = None

	required_cl = 0

	for o, a in opts:
		if o in ("-h", "--help"):
			usage(sys.argv[0])
			sys.exit()
		elif o in ("-i", "--input-file"):
			required_cl += 1
            		avro_file = a
		elif o in ("-s", "--schema"):
			avro_schema_file = a
		else:
			assert False, "unhandled option"

	if (required_cl < 1): 
		print "ERROR: Missing required argument"
		usage(sys.argv[0])
		sys.exit(1)

	if not avro_schema_file:
		reader = DataFileReader(open(avro_file, "r"), DatumReader())
		for datum in reader:
			print(datum)
		reader.close()
	else:
		reader_schema = open(avro_schema_file, "r")
		avro_schema = reader_schema.read()
		reader_schema.close()
		parsed_avro_schema = avro.schema.parse(avro_schema)

		with open(avro_file, "rb") as reader_data:
			inputio = io.BytesIO(reader_data.read())
			decoder = avro.io.BinaryDecoder(inputio)
			reader = avro.io.DatumReader(parsed_avro_schema)
			while inputio.tell() < len(inputio.getvalue()):
				avro_datum = reader.read(decoder)
				print(avro_datum)
		reader_data.close()
Exemplo n.º 9
def listAllUserEvents(filename):

    try:

        reader = DataFileReader(open(filename, "r"), DatumReader())
        for event in reader:

            # Query uuids of events
            print "event id: {0}, event data extra fields: {1}".format(event["uuid"], event["eventData"]["otherEventData"])

        reader.close()
    except IOError:
        print filename + ": Could not open file.  Exiting"
        sys.exit(1)
Exemplo n.º 10
def processBlob(filename):
    reader = DataFileReader(open(filename, 'rb'), DatumReader())
    readings_by_id = {}
    for reading in reader:
        parsed_json = json.loads(reading["Body"])
        if 'id' not in parsed_json:
            return
        # append every reading, including the first one seen for each id
        readings_by_id.setdefault(parsed_json['id'], []).append(parsed_json)
    reader.close()
    for device, readings in readings_by_id.items():
        with open(device + '.csv', "a") as deviceFile:
            for r in readings:
                deviceFile.write(", ".join([str(r[x]) for x in r.keys()]) + '\n')
Exemplo n.º 11
def evaluate_file(fname: str):
    logger.info("Opening file %s", fname)
    reader = DataFileReader(open(fname, "rb"), DatumReader())
    logger.info("Counting lines...")
    i = 0
    for val in reader:
        i += 1
        if i % 1000 == 0:
            logger.debug("Read %d lines", i)
    logger.info("Found %d lines in file", i)
Exemplo n.º 12
    def doKmeans(self):
        numpy.seterr(divide="ignore", invalid="ignore")

        # get a dataset for the k-means generator
        dataset = []
        reader = DataFileReader(open("test/prettypfa/exoplanets.avro", "rb"),
                                DatumReader())
        for record in reader:
            mag, dist, mass, radius = record.get("mag"), record.get(
                "dist"), record.get("mass"), record.get("radius")
            if mag is not None and dist is not None and mass is not None and radius is not None:
                dataset.append([mag, dist, mass, radius])
        reader.close()

        # set up and run the k-means generator
        TestClustering.kmeansResult = KMeans(len(self.clusterNames),
                                             numpy.array(dataset))
        TestClustering.kmeansResult.optimize(
            whileall(moving(), maxIterations(1000)))
Exemplo n.º 13
def read_log(topic, log):
    schema = avro.schema.parse(open(os.path.abspath(os.path.dirname(__file__)) + "/avro_schema/" + topic + ".avsc").read())
    print "schema:", schema
    writer = DataFileWriter(open(os.path.abspath(os.path.dirname(__file__)) + topic + ".avro", "w"), DatumWriter(), schema)
    for i in range(5):
        writer.append(log)
    writer.close()
    reader = DataFileReader(open(os.path.abspath(os.path.dirname(__file__)) + topic + ".avro", "r"), DatumReader())
    for log in reader:
        print log
Exemplo n.º 14
def readAndWriteAvro():
    """ Unlike java, avro does not let you generate
        code for Tweet in python. So only way to read and write
        data is without using code generation"""

    #Read the schema
    schema = avro.schema.parse(open("tweet.avsc").read())


    #write some data
    writer = DataFileWriter(open("tweets.avro", "w"), DatumWriter(), schema)
    writer.append({"tweetId": 5, "user": "******", "text" : "Tweeting from python as well"})
    writer.close()

    #read the same data
    tweets = DataFileReader(open("tweets.avro", "r"), DatumReader())
    for tweet in tweets:
        print tweet
    tweets.close()
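The snippet assumes a tweet.avsc file on disk. A minimal schema consistent with the record appended above might look like the following (the field types are guesses, not the project's actual schema):

# A guess at tweet.avsc, consistent with the record appended above.
tweet_avsc = """
{
  "type": "record",
  "name": "Tweet",
  "fields": [
    {"name": "tweetId", "type": "long"},
    {"name": "user", "type": "string"},
    {"name": "text", "type": "string"}
  ]
}
"""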
Exemplo n.º 15
def main(fn, out_fn, avro_mode=''):
    with open(out_fn, 'w') as fo:
        with open(fn, 'rb') as f:
            reader = DataFileReader(f, DatumReader())
            for r in reader:
                if avro_mode.upper() == 'KV':
                    r = r['key']

                fo.write('%s\t%r\n' % (r['office'], r['counts']))
    print('wrote', out_fn)
Exemplo n.º 16
def read_corpus(corpus_path):
    avro_files_path = [
        os.path.join(corpus_path, filename)
        for filename in os.listdir(corpus_path)
        if os.path.splitext(filename)[1] == '.avro'
    ]
    for avro_file in avro_files_path:
        small_corpus = DataFileReader(open(avro_file, 'rb'), DatumReader())
        for article in small_corpus:
            yield article
Exemplo n.º 17
    def handle(self):
        data = self.request.recv(8024).strip()
        data = StringIO(data)

        reader = DataFileReader(data, DatumReader())
        for fileData in reader:
            id = fileData['id']
            data = fileData['data']

            print(fileData)

            if id not in fileDict:
                fileDict[id] = open("./" + id, "w")

            f = fileDict[id]

            f.write(data)
            f.flush()
        reader.close()
Exemplo n.º 18
def processBlob(filename):
    reader = DataFileReader(open(filename, 'rb'), DatumReader())
    events_by_id = {}
    readingNb = 0
    for reading in reader:
        readingNb += 1
        try:
            parsed_json = json.loads(reading["Body"])
            # print(parsed_json)
            print("-----------------------------")
            print("id:")
            print(parsed_json[0]["id"])
            # print(parsed_json[0])
            if 'id' not in parsed_json[0]:
                print("no id found...")
                return
            if parsed_json[0]['id'] not in events_by_id:
                events_by_id[parsed_json[0]['id']] = [parsed_json[0]]
            else:
                events_by_id[parsed_json[0]['id']].append(parsed_json[0])
            first_event = events_by_id[parsed_json[0]['id']][0]
            print("id:")
            print(first_event["id"])
            print("eventTime:")
            print(first_event["eventTime"])
            print("eventType:")
            print(first_event["eventType"])
            print("resourceUri:")
            print(first_event["data"]["resourceUri"])
            print("operationName:")
            print(first_event["data"]["operationName"])
            print("resourceProvider:")
            print(first_event["data"]["resourceProvider"])
            print("status:")
            print(first_event["data"]["status"])
            print("subject:")
            print(first_event["subject"])
        except Exception:
            print("exception in converting blob to json")
    reader.close()
    print(readingNb)
Exemplo n.º 19
def main():
  """Start of execution"""
  #combine the schemas 
  known_schemas = avro.schema.Names()
  types_schema = LoadAvsc("parameter_types.avsc", known_schemas)
  param_schema = LoadAvsc("parameter.avsc", known_schemas)
  print(json.dumps(param_schema.to_json(avro.schema.Names()), indent=2))
  #test the schema works 
  param_file = open("parameters.avro", "w")
  writer = DataFileWriter(param_file, DatumWriter(), param_schema)
  param_1 = {"name": "test", "description":"An Avro test.", "type":"int"}
  param_2 = {"name": "test", "description":"An Avro test.", "type":"boolean"}
  writer.append(param_1)
  writer.append(param_2)
  writer.close()
  reader = DataFileReader(open("parameters.avro", "r"), DatumReader())
  for parameter in reader:
      print parameter
  reader.close()  
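LoadAvsc is referenced but not defined in this example. A minimal sketch, assuming it only parses an .avsc file and registers its named types in known_schemas, could be:

# Minimal sketch of LoadAvsc (assumption: it parses the .avsc file and
# registers its named types in the shared avro.schema.Names instance).
def LoadAvsc(file_path, names=None):
    with open(file_path) as schema_file:
        return avro.schema.make_avsc_object(json.loads(schema_file.read()), names)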
Exemplo n.º 20
def read_avro(iostream, runs=1):
    times = []
    for _ in range(runs):
        iostream.seek(0)
        start = time.time()
        records = list(DataFileReader(iostream, DatumReader()))
        end = time.time()
        times.append(end - start)
    print(f'... {runs} runs averaged {sum(times) / runs} seconds')
    return records
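A usage sketch for read_avro (the schema and record values are illustrative assumptions): write a small Avro stream into memory with DataFileWriter, then benchmark the reads.

# Illustrative usage of read_avro(); schema and values are assumptions.
import io
import json
import avro.schema
from avro.datafile import DataFileWriter
from avro.io import DatumWriter

schema = avro.schema.parse(json.dumps({
    "type": "record", "name": "Row",
    "fields": [{"name": "n", "type": "long"}],
}))
buf = io.BytesIO()
writer = DataFileWriter(buf, DatumWriter(), schema)
for n in range(1000):
    writer.append({"n": n})
writer.flush()  # flush (not close) so the BytesIO stays open for reading
records = read_avro(buf, runs=3)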
Exemplo n.º 21
def deserialize(value):
    """Deserialize AVRO encoded binary string and yield records.
    Args:
        value (str): binary string value.
    Yields:
        dict: deserialized record.
    """
    with DataFileReader(io.BytesIO(value), DatumReader()) as reader:
        for record in reader:
            yield record
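A roundtrip sketch for deserialize() (the Event schema and the records are assumptions): serialize a couple of records into bytes with DataFileWriter, then iterate the generator.

# Illustrative roundtrip for deserialize(); the Event schema is an assumption.
import io
import json
import avro.schema
from avro.datafile import DataFileWriter
from avro.io import DatumWriter

schema = avro.schema.parse(json.dumps({
    "type": "record", "name": "Event",
    "fields": [{"name": "id", "type": "int"}],
}))
buf = io.BytesIO()
writer = DataFileWriter(buf, DatumWriter(), schema)
writer.append({"id": 1})
writer.append({"id": 2})
writer.flush()

for record in deserialize(buf.getvalue()):
    print(record)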
Exemplo n.º 22
def read_data():
    # read avro file into an array of dicts
    reader = DataFileReader(open(DATA_FILE_PATH, "rb"), DatumReader())

    try:
        data = []
        for row in reader:
            data.append(row)

        # pandas can only read json or csv
        # convert data to json object
        json_data = json.dumps(data)

        # read the json into a pandas dataframe
        dataset = pd.read_json(json_data)

        # separate features and labels
        features = dataset.copy().drop('rating', axis=1)
        labels = dataset.copy().pop('rating')

        # normalize features
        features = normalize_features(features)

        # split into train and test data
        train_features = features.sample(frac=0.8, random_state=0)
        test_features = features.drop(train_features.index)

        train_labels = labels[labels.index.isin(train_features.index)]
        test_labels = labels.drop(train_features.index)

        # convert features to numpy arrays
        train_features = train_features.to_numpy()
        test_features = test_features.to_numpy()

        # left shift labels to convert them from the range [1,10]
        # to the range [0, 9]
        train_labels = left_shift_labels(train_labels)
        test_labels = left_shift_labels(test_labels)

        return train_features, train_labels, test_features, test_labels
    finally:
        reader.close()
Exemplo n.º 23
    def get_manifest_hdfs_path_list(self, tmp_path_prefix,
                                    manifest_list_hdfs_path):
        local_path = '%s_%s.manifest_list.avro' % (tmp_path_prefix,
                                                   random.randint(0, 10000))
        check_call([
            'hadoop', 'fs', '-copyToLocal', manifest_list_hdfs_path, local_path
        ])

        manifest_hdfs_path_list = []
        reader = None
        try:
            with open(local_path, 'rb') as fp:
                reader = DataFileReader(fp, DatumReader())
                for manifest in reader:
                    manifest_hdfs_path_list.append(manifest['manifest_path'])
        finally:
            if reader:
                reader.close()
            os.remove(local_path)
        return manifest_hdfs_path_list
Exemplo n.º 24
def read_avro_file(file):
    reader = DataFileReader(open(file, "rb"), DatumReader())
    data = []
    fields = json.loads(reader.meta['avro.schema'])['fields']

    for i in range(min(100, reader.file_length)):
        rec = next(reader)
        data.append(rec)
    reader.close()

    json_fields = json.dumps(fields)
    json_data = json.dumps(data)

    return render_template('tables.html',
                           columns=len(fields),
                           rows=reader.file_length,
                           shown_rows=min(100,
                                          reader.file_length)) + \
           json2html.convert(json=json_fields) + \
           json2html.convert(json=json_data)
Exemplo n.º 25
def read_avro_with_schema(avro_filepath, schema_filepath):
    print("\nfile:{}\nschema:{}".format(avro_filepath, schema_filepath))

    with open(schema_filepath) as f:
        schema = avro.schema.Parse(f.read())

    datum_reader = DatumReader(reader_schema=schema)
    with open(avro_filepath, 'rb') as f:
        with DataFileReader(f, datum_reader) as dfr:
            for record in dfr:
                print(record)
Exemplo n.º 26
def read(fin, fout=None, nrecords=0):
    "Read given avro file according to its schema and dump on stdout its content"
    reader = DataFileReader(open(fin, "r"), DatumReader())
    fobj = open(fout, 'w') if fout else None
    count = 0
    if  fobj:
        fobj.write("[\n")
    for rec in reader:
        if  fobj:
            if  count:
                fobj.write(",\n")
            fobj.write(json.dumps(rec))
        else:
            pprint.pprint(rec)
        if  nrecords and count >= nrecords:
            break
        count += 1
    if  fobj:
        fobj.write("]\n")
        fobj.close()
    reader.close()
Exemplo n.º 27
 def json_avro_schema(self):
     if self._json_avro_schema is None:
         # dependency on the avro python reference implementation since getting full json
         # avro schema from the c-api is elusive
         from avro.datafile import DataFileReader
         from avro.io import DatumReader
         import json
         with open(self.filename, 'rb') as fo:
             with DataFileReader(fo, DatumReader()) as avf:
                 self._json_avro_schema = json.loads(
                     avf.meta['avro.schema'])
     return self._json_avro_schema
Exemplo n.º 28
def read(fin, fout=None, nrecords=0):
    "Read given avro file according to its schema and dump on stdout its content"
    reader = DataFileReader(open(fin, "r"), DatumReader())
    fobj = open(fout, 'w') if fout else None
    count = 0
    if fobj:
        fobj.write("[\n")
    for rec in reader:
        if fobj:
            if count:
                fobj.write(",\n")
            fobj.write(json.dumps(rec))
        else:
            pprint.pprint(rec)
        if nrecords and count >= nrecords:
            break
        count += 1
    if fobj:
        fobj.write("]\n")
        fobj.close()
    reader.close()
Exemplo n.º 29
def main(args):
    global in_file_name, out_file_name
    processParams(args)

    print(' * Processing ' + in_file_name)
    ifh = openFile(in_file_name, "r")
    reader = DataFileReader(ifh, DatumReader())

    if out_file_name is None:
        print(' * Sending Output to STDOUT')
        ofh = sys.stdout
        print_progress_status = False
    else:
        print(' * Sending Output to ' + out_file_name)
        ofh = openFile(out_file_name, "w")
        print_progress_status = True

    rec_count = 0
    start_time = time.time()
    prev_time = start_time
    for rec in reader:
        rec_count += 1
        if is_pretty_print:
            rec_str = json.dumps(rec, indent=4, sort_keys=True)
            ofh.write("[" if (rec_count == 1) else ",\n")
        else:
            rec_str = json.dumps(rec)
            ofh.write("[" if (rec_count == 1) else ",")
        ofh.write(rec_str)
        cur_time = time.time()
        if (print_progress_status == True) and (
                int(cur_time - prev_time) >= STATUS_IN_TERMINAL_AFTER_SECONDS):
            print(" .... Processed record # " + str(rec_count))
            prev_time = cur_time

    ofh.write("]")
    reader.close()
    cur_time = time.time()
    print('\n * Processed ' + str(rec_count) + ' records in ' +
          str(int(round(cur_time - start_time))) + ' seconds.')
Exemplo n.º 30
    def runEngine(self, engine):
        last = [None]

        if engine.config.method == "emit":

            def emit(x):
                last[0] = x

            engine.emit = emit

            for record in DataFileReader(
                    open("test/prettypfa/exoplanets.avro", "r"),
                    DatumReader()):
                engine.action(record)

        else:
            for record in DataFileReader(
                    open("test/prettypfa/exoplanets.avro", "r"),
                    DatumReader()):
                last[0] = engine.action(record)

        return last[0]
Exemplo n.º 31
 def check_avro(self, filehandle):
     try:
         DataFileReader(filehandle, DatumReader())
         print(self.valid_avro_msg)
     except avro.datafile.DataFileException as _:
         if 'snappy' in str(_):
             die("%s => ERROR: %s - Is the python-snappy module installed? ('pip install python-snappy')" \
                 % (filehandle.name, _))
         die("%s => ERROR: %s" % (filehandle.name, _))
     except TypeError as _:
         if self.verbose > 2:
             print(_)
         die(self.invalid_avro_msg)
Exemplo n.º 32
 def file_read(self, fname):
     "Read documents from given file name"
     try:
         schema = self.schema
         out = []
         with DataFileReader(open_file(fname), DatumReader()) as reader:
             for rec in reader:
                 out.append(rec)
         return out
     except Exception as exc:
         err = traceback.format_exc(limit=1).splitlines()[-1]
         msg = 'Failure in %s storage, error=%s' % (self.stype, err)
         raise ReadError(msg)
Exemplo n.º 33
def readAndWriteAvro():
    """ Unlike java, avro does not let you generate
        code for Tweet in python. So only way to read and write
        data is without using code generation"""

    #Read the schema
    schema = avro.schema.parse(open("tweet.avsc").read())

    #write some data
    writer = DataFileWriter(open("tweets.avro", "w"), DatumWriter(), schema)
    writer.append({
        "tweetId": 5,
        "user": "******",
        "text": "Tweeting from python as well"
    })
    writer.close()

    #read the same data
    tweets = DataFileReader(open("tweets.avro", "r"), DatumReader())
    for tweet in tweets:
        print tweet
    tweets.close()
Exemplo n.º 34
def read_orders(in_filename):
    sample = None
    counter = 0
    t0 = time()
    reader = DataFileReader(open(in_filename, 'rb'), DatumReader())
    for pedido in reader:
        if counter == 0:
            print("Primeira iteracao em {:0.8f}s".format(time() - t0))
            sample = pedido
        counter += 1
    delta = time() - t0
    print("{} registros lidos em {:0.3f}s".format(counter, delta))
    print("Exemplo de registro:")
    pprint(sample)
Exemplo n.º 35
    def _get_jc_for_avro_input(self, file_in, job_conf):

        jc = dict(job_conf)
        if self.avro_input:
            jc[AVRO_INPUT] = self.avro_input
            reader = DataFileReader(file_in, DatumReader())
            schema = reader.get_meta('avro.schema')
            file_in.seek(0)
            if self.avro_input == 'v':
                jc[AVRO_VALUE_INPUT_SCHEMA] = schema
            elif self.avro_input == 'k':
                jc[AVRO_KEY_INPUT_SCHEMA] = schema
            else:
                schema_obj = json.loads(schema)
                for field in schema_obj['fields']:
                    if field['name'] == 'key':
                        key_schema = field['type']
                    else:
                        value_schema = field['type']
                jc[AVRO_KEY_INPUT_SCHEMA] = json.dumps(key_schema)
                jc[AVRO_VALUE_INPUT_SCHEMA] = json.dumps(value_schema)

        return jc
Exemplo n.º 36
    def test_HKMA_Bondtrades(self):
        avroFile = "testbondtrade.avro"
        numOfInvestors = 10
        numOfTradesEach = 100
        generateHKMATrades(numOfInvestors, numOfTradesEach,
                           "HKMA/SelectedSecurity.json", avroFile)

        reader = DataFileReader(open(avroFile, "rb"), DatumReader())
        cnt = 0
        for bondtrade in reader:
            self.assertIsNotNone(bondtrade["cust"])
            self.assertIsNotNone(bondtrade["tradeDate"])
            self.assertIsNotNone(bondtrade["asset"]["securityId"])
            self.assertGreater(bondtrade["asset"]["notional"], 1000000)
            dt = datetime.datetime.fromtimestamp(bondtrade["timestamp"] / 1000)
            nowdt = datetime.datetime.now()
            self.assertEqual(dt.year, nowdt.year)
            self.assertEqual(dt.month, nowdt.month)
            self.assertEqual(dt.day, nowdt.day)
            cnt += 1
        self.assertEqual(numOfInvestors * numOfTradesEach, cnt)

        reader.close()
Exemplo n.º 37
    def binary_roundtrip(self, model_class, data):
        model = model_class(data)
        schema_dumper = self.mk_schema_dumper()
        schema = avro.schema.parse(schema_dumper.dump_schema(model_class))

        fp, file_name = self.get_tempfile(text=False)
        with DataFileWriter(fp, DatumWriter(), schema) as writer:
            writer.append(dict(model))

        with DataFileReader(
                open(file_name, 'rb'),
                DatumReader(readers_schema=schema)) as reader:
            [row] = reader
            return row
Exemplo n.º 38
def avro2dataframe(path, verbose=False):
    ''' Transforms DNA snapshot data in a pandas DataFrame object.
    '''
    read_schema = avro.schema.Parse(json.dumps(djdna_avro_schema))
    file_content = list()
    files = sorted(os.listdir(path))
    for avro_file in files:
        if (os.path.isfile(os.path.join(path, avro_file))
                and avro_file.split('.')[-1] == 'avro'):
            if verbose:
                print('Reading file {} \r'.format(avro_file), end='')
            file_path = os.path.join(path, avro_file)
            reader = DataFileReader(open(file_path, 'rb'),
                                    DatumReader(read_schema))
            # new_schema = reader.GetMeta('avro.schema')
            users = []
            for user in reader:
                users.append(user)
            file_content.append(users)
            reader.close()
    data = [pd.DataFrame(content) for content in file_content]
    data = pd.concat(data, ignore_index=True)
    return data
Exemplo n.º 39
def new_schema_create_new_table(filename, table_name, database_name = "braze"):
    reader = DataFileReader(open(filename, "rb"), DatumReader())
    schema = json.loads(reader.meta['avro.schema'])
    create_table = "CREATE TABLE IF NOT EXISTS " + table_name
    all_field_string = ''
    for field in  schema['fields']:
        comma = ', '
        if(all_field_string == ""):
            comma = ' '
        all_field_string = all_field_string + comma + convert_schema_to_Presto(field)
    create_table = create_table + ' ( ' + all_field_string +  ' ); '
    td = tdclient.Client(os.environ['td_apikey'])
    job = td.query(database_name, create_table, type = "presto")
    job.wait()
Exemplo n.º 40
    def decode(
        self, encoded_obj: Any
    ) -> Dict[str, Union[BaseRecord, StoreRecord, BaseHandler,
                         BaseStoreRecordHandler]]:
        """ Decode bytes format to BaseModel and return dict which contains decoded *BaseModel / BaseStoreRecord*

        This function is used by kafka-python / internal call

        Args:
            encoded_obj (Any): Bytes encode BaseModel / BaseStoreRecord

        Raises:
            AvroDecodeError: failed to decode the bytes into a BaseModel
            MissingEventClass: cannot find a matching BaseModel in the registered BaseModel list (self._events)
            MissingHandlerClass: cannot find a matching BaseHandlerModel in the registered handler list (self._handlers)

        Returns:
            Dict[str, Union[BaseModel, BaseStoreRecord, BaseHandler, BaseStoreRecordHandler]]:
                                                                    example: {'event_class': ..., 'handler_class': ...}
        """
        try:
            reader = DataFileReader(BytesIO(encoded_obj), DatumReader())
            schema = json.loads(reader.meta.get('avro.schema').decode('utf-8'))
            schema_name = schema['namespace'] + '.' + schema['name']
            dict_data = next(reader)
        except AvroTypeException as err:
            self.logger.exception('%s', err.__str__())
            raise AvroDecodeError

        # Finds a matching event name
        for e_name, event in self._events.items():
            if e_name.match(schema_name):  # type: ignore
                record_class = event
                break
        else:
            raise MissingEventClass

        # Finds a matching handler name
        for e_name, handler in self._handlers.items():
            if e_name.match(schema_name):  # type: ignore
                handler_class = handler
                break
        else:
            raise MissingHandlerClass
        return {
            'record_class': record_class.from_dict(dict_data=dict_data),
            'handler_class': handler_class
        }
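Not part of the original class: a hedged sketch of the matching encode step, written as a standalone helper that takes the record dict and its Avro schema (both assumptions; the class's real encoding path is not shown here).

# Hedged sketch of an encode counterpart to decode(); not from the original code.
from io import BytesIO
from avro.datafile import DataFileWriter
from avro.io import DatumWriter

def encode_record(record_dict, schema) -> bytes:
    out = BytesIO()
    writer = DataFileWriter(out, DatumWriter(), schema)
    writer.append(record_dict)
    writer.flush()
    data = out.getvalue()
    writer.close()
    return data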
Exemplo n.º 41
 def _read(self, spec, fields=None):
     "Internal read API"
     if PAT_UID.match(str(spec)):  # requested to read concrete file
         out = []
         fname = file_name(self.hdir, spec)
         with open_file(fname) as istream:
             reader = DataFileReader(istream, DatumReader())
             for data in reader:
                 if isinstance(data, list):
                     for rec in data:
                         self.check(rec)
                     return data
                 self.check(data)
                 out.append(data)
         return out
     return self.empty_data
Exemplo n.º 42
    def process(self, elem):
        # TODO: figure out how to cache the client locally
        gcs = google.cloud.storage.Client()

        event = json.loads(elem)

        bucket = event["bucket"]
        name = event["name"]
        blob = gcs.get_bucket(bucket).blob(name)

        contents = blob.download_as_string()
        print("fetched {}/{}: {} bytes".format(bucket, name, len(contents)))

        rd = DataFileReader(io.BytesIO(contents), DatumReader())
        for record in rd:
            print(record)
        return []
Exemplo n.º 43
  def test1(self):
    """
    Run a tethered map-reduce job.

    Assumptions: 1) bash is available in /bin/bash
    """
    from word_count_task import WordCountTask
    from avro.tether import tether_task_runner
    from avro.datafile import DataFileReader
    from avro.io import DatumReader
    import avro

    import subprocess
    import StringIO
    import shutil
    import tempfile
    import inspect

    proc=None

    try:


      # TODO we use the tempfile module to generate random names
      # for the files
      base_dir = "/tmp/test_tether_word_count"
      if os.path.exists(base_dir):
        shutil.rmtree(base_dir)

      inpath = os.path.join(base_dir, "in")
      infile=os.path.join(inpath, "lines.avro")
      lines=["the quick brown fox jumps over the lazy dog",
             "the cow jumps over the moon",
             "the rain in spain falls mainly on the plains"]

      self._write_lines(lines,infile)

      true_counts=self._count_words(lines)

      if not(os.path.exists(infile)):
        self.fail("Missing the input file {0}".format(infile))


      # The schema for the output of the mapper and reducer
      oschema="""
{"type":"record",
 "name":"Pair","namespace":"org.apache.avro.mapred","fields":[
     {"name":"key","type":"string"},
     {"name":"value","type":"long","order":"ignore"}
 ]
}
"""

      # write the schema to a temporary file
      osfile=tempfile.NamedTemporaryFile(mode='w',suffix=".avsc",prefix="wordcount",delete=False)
      outschema=osfile.name
      osfile.write(oschema)
      osfile.close()

      if not(os.path.exists(outschema)):
        self.fail("Missing the schema file")

      outpath = os.path.join(base_dir, "out")

      args=[]

      args.append("java")
      args.append("-jar")
      args.append(os.path.abspath("@TOPDIR@/../java/tools/target/avro-tools-@[email protected]"))


      args.append("tether")
      args.extend(["--in",inpath])
      args.extend(["--out",outpath])
      args.extend(["--outschema",outschema])
      args.extend(["--protocol","http"])

      # form the arguments for the subprocess
      subargs=[]

      srcfile=inspect.getsourcefile(tether_task_runner)

      # Create a shell script to act as the program we want to execute
      # We do this so we can set the python path appropriately
      script="""#!/bin/bash
export PYTHONPATH={0}
python -m avro.tether.tether_task_runner word_count_task.WordCountTask
"""
      # We need to make sure avro is on the path
      # getsourcefile(avro) returns .../avro/__init__.py
      asrc=inspect.getsourcefile(avro)
      apath=asrc.rsplit(os.sep,2)[0]

      # path to where the tests lie
      tpath=os.path.split(__file__)[0]

      exhf=tempfile.NamedTemporaryFile(mode='w',prefix="exec_word_count_",delete=False)
      exfile=exhf.name
      exhf.write(script.format((os.pathsep).join([apath,tpath]),srcfile))
      exhf.close()

      # make it world executable
      os.chmod(exfile, 0o755)

      args.extend(["--program",exfile])

      print "Command:\n\t{0}".format(" ".join(args))
      proc=subprocess.Popen(args)


      proc.wait()

      # read the output
      with open(os.path.join(outpath, "part-00000.avro"), "rb") as hf:
        reader=DataFileReader(hf, DatumReader())
        for record in reader:
          self.assertEqual(record["value"],true_counts[record["key"]])

        reader.close()

    except Exception as e:
      raise
    finally:
      # close the process
      if proc is not None and proc.returncode is None:
        proc.kill()
      if os.path.exists(base_dir):
        shutil.rmtree(base_dir)
      if os.path.exists(exfile):
        os.remove(exfile)
Exemplo n.º 44
def main():
    parser = optparse.OptionParser(description="""Filters consumer messages based on various criteria
                                                    (allowed NGIs, service flavours, metrics...)""")
    parser.add_option('-g', dest='gloconf', nargs=1, metavar='global.conf', help='path to global configuration file', type=str)
    group = optparse.OptionGroup(parser, 'Compute Engine usage')
    group.add_option('-d', dest='date', nargs=1, metavar='YEAR-MONTH-DAY')
    parser.add_option_group(group)
    group = optparse.OptionGroup(parser, 'Debugging usage')
    group.add_option('-f', dest='cfile', nargs=1, metavar='consumer_log_YEAR-MONTH-DAY.avro')
    parser.add_option_group(group)
    (options, args) = parser.parse_args()

    global logger
    logger = Logger(os.path.basename(sys.argv[0]))

    prefilter = {'Prefilter': ['ConsumerFilePath', 'PoemExpandedProfiles', 'PoemNameMapping', 'LookbackPoemExpandedProfiles']}
    schemas = {'AvroSchemas': ['Prefilter']}
    output = {'Output': ['Prefilter']}
    confpath = options.gloconf if options.gloconf else None
    cglob = Global(confpath, schemas, output, prefilter)
    global globopts
    globopts = cglob.parse()

    stats = ()

    if options.cfile and options.date:
        parser.print_help()
        raise SystemExit(1)
    elif options.cfile:
        fname = options.cfile
        date = options.cfile.split('_')[-1]
        date = date.split('.')[0]
        date = date.split('-')
    elif options.date:
        date = options.date.split('-')
    else:
        parser.print_help()
        raise SystemExit(1)

    if len(date) == 0 or len(date) != 3:
        logger.error('Consumer file does not end with correctly formatted date')
        parser.print_help()
        raise SystemExit(1)

    year, month, day = date

    # avro files
    if options.cfile:
        inputFile = options.cfile
    else:
        inputFile = gen_fname_repdate(logger, year+'-'+month+'-'+day, globopts['PrefilterConsumerFilePath'.lower()], '')
    outputFile = gen_fname_repdate(logger, year+'_'+month+'_'+day, globopts['OutputPrefilter'.lower()], '')

    try:
        schema = avro.schema.parse(open(globopts['AvroSchemasPrefilter'.lower()]).read())
        writer = DataFileWriter(open(outputFile, "w"), DatumWriter(), schema)
        reader = DataFileReader(open(inputFile, "r"), DatumReader())
    except IOError as e:
        logger.error(str(e))
        raise SystemExit(1)

    # load poem data
    ngis = loadNGIs(year, month, day)
    profiles = loadFilteredProfiles(year, month, day)
    nameMapping = loadNameMapping(year, month, day)

    s = time.time()
    msgs, msgswrit, msgsfilt, falsemonhost, falseroc, falseprofile = prefilterit(reader, writer, ngis, profiles, nameMapping)
    e = time.time()

    logger.info('ExecTime:%.2fs ConsumerDate:%s Read:%d Written:%d Filtered:%d(Monitoring_Host:%d,ROC:%d,ServiceTypes_Metrics:%d)' % (round(e - s, 2), year+'-'+month+'-'+day,
                                                                                    msgs, msgswrit, msgsfilt, falsemonhost, falseroc,
                                                                                    falseprofile))

    reader.close()
    writer.close()
Exemplo n.º 45
def readFile():
    reader = DataFileReader(open("part-00000.avro", "r"), DatumReader())
    for user in reader:
        print user
    reader.close()
Exemplo n.º 46
import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

with open("blog.avsc") as schema_file:
    schema = avro.schema.parse(schema_file.read())

with open("blog.avro", "wb") as out_file:
    writer = DataFileWriter(out_file, DatumWriter(), schema)
    writer.append({
        "title": "Avro is awesome",
        "content": "Let's learn Avro!",
        "is_published": False })
    writer.close()

with open("blog.avro") as in_file:
    reader = DataFileReader(in_file, DatumReader())
    for blog in reader:
        print blog
    reader.close()
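blog.avsc is read from disk above and is not shown; a schema consistent with the record being appended (the field types are guesses) could be:

# A guess at the contents of blog.avsc, matching the fields appended above.
blog_avsc = {
    "type": "record",
    "name": "Blog",
    "fields": [
        {"name": "title", "type": "string"},
        {"name": "content", "type": "string"},
        {"name": "is_published", "type": "boolean"}
    ]
}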
Exemplo n.º 47
 def getit(avroType):
     reader = DataFileReader(urllib.urlopen(url), DatumReader())
     return next(reader)
Exemplo n.º 48
def main():
   known_schemas = avro.schema.Names()

   with open("point.avsc", "rb") as fp:
      point = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

   with open("review.avsc", "rb") as fp:
      place = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

   with open("place.avsc", "rb") as fp:
      place = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

   output = StringIO.StringIO()
   writer = DataFileWriter(output, DatumWriter(), point)
   writer.append({'x': 1.5, 'y': 2.75})
   writer.flush()
   serialized = output.getvalue()
   reader = DataFileReader(StringIO.StringIO(serialized), DatumReader())
   deserialized = tuple(reader)[0]
   assert deserialized['x'] == 1.5
   assert deserialized['y'] == 2.75
   reader.close()
   writer.close()

   try:
      output = StringIO.StringIO()
      writer = DataFileWriter(output, DatumWriter(), point)
      writer.append({'x': 1.5})
      assert False
   except AvroTypeException as e:
      pass

   try:
      output = StringIO.StringIO()
      writer = DataFileWriter(output, DatumWriter(), point)
      writer.append({'x': 1.5, 'y': "wtanaka.com"})
      assert False
   except AvroTypeException as e:
      pass

   output = StringIO.StringIO()
   writer = DataFileWriter(output, DatumWriter(), place)
   writer.append({
         'name': 'wtanaka.com',
         'location': {'x': 1.5, 'y': 2.75}
         })
   writer.flush()
   serialized = output.getvalue()
   reader = DataFileReader(StringIO.StringIO(serialized), DatumReader())
   deserialized = tuple(reader)[0]
   assert deserialized['location']['x'] == 1.5
   assert deserialized['location']['y'] == 2.75
   reader.close()
   writer.close()

   output = StringIO.StringIO()
   writer = DataFileWriter(output, DatumWriter(), place)
   writer.append({
         'name': 'wtanaka.com',
         'location': {'x': 1.5, 'y': 2.75},
         'review': {'rating': 4, 'text': '4 stars would come again'},
         })
   writer.flush()
   serialized = output.getvalue()
   reader = DataFileReader(StringIO.StringIO(serialized), DatumReader())
   deserialized = tuple(reader)[0]
   assert deserialized['location']['x'] == 1.5
   assert deserialized['location']['y'] == 2.75
   reader.close()
   writer.close()

   try:
      output = StringIO.StringIO()
      writer = DataFileWriter(output, DatumWriter(), place)
      writer.append({
            'name': 'wtanaka.com',
            'location': {'x': 1.5, 'y': 2.75},
            'review': {'x': 1.5, 'y': 2.75},
            })
      assert False
   except AvroTypeException as e:
      pass