Example #1
def main(args=sys.argv):
    if len(args) == 1:
        print "Usage: %s [dump|rpcreceive|rpcsend]" % args[0]
        return 1

    if args[1] == "dump":
        if len(args) != 3:
            print "Usage: %s dump input_file" % args[0]
            return 1
        for d in datafile.DataFileReader(file_or_stdin(args[2]),
                                         io.DatumReader()):
            print repr(d)
    elif args[1] == "rpcreceive":
        usage_str = "Usage: %s rpcreceive uri protocol_file " % args[0]
        usage_str += "message_name (-data d | -file f)"
        if len(args) not in [5, 7]:
            print usage_str
            return 1
        uri, proto, msg = args[2:5]
        datum = None
        if len(args) > 5:
            if args[5] == "-file":
                reader = open(args[6], 'rb')
                datum_reader = io.DatumReader()
                dfr = datafile.DataFileReader(reader, datum_reader)
                datum = dfr.next()
            elif args[5] == "-data":
                print "JSON Decoder not yet implemented."
                return 1
            else:
                print usage_str
                return 1
        run_server(uri, proto, msg, datum)
    elif args[1] == "rpcsend":
        usage_str = "Usage: %s rpcsend uri protocol_file " % args[0]
        usage_str += "message_name (-data d | -file f)"
        if len(args) not in [5, 7]:
            print usage_str
            return 1
        uri, proto, msg = args[2:5]
        datum = None
        if len(args) > 5:
            if args[5] == "-file":
                reader = open(args[6], 'rb')
                datum_reader = io.DatumReader()
                dfr = datafile.DataFileReader(reader, datum_reader)
                datum = dfr.next()
            elif args[5] == "-data":
                print "JSON Decoder not yet implemented."
                return 1
            else:
                print usage_str
                return 1
        send_message(uri, proto, msg, datum)
    return 0
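
A hedged usage sketch for the dispatcher above: main() takes an argv-style list, so a subcommand can be driven directly from Python (the script and file names are hypothetical):

import sys

# Equivalent to running: python tool.py dump data.avro
sys.exit(main(['tool.py', 'dump', 'data.avro']))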
Example #2
def _read_avro(fs, path, offset, length):
    contents = ''
    fhandle = None
    try:
        fhandle = fs.open(path)
        try:
            fhandle.seek(offset)
            data_file_reader = datafile.DataFileReader(fhandle,
                                                       io.DatumReader())
            contents_list = []
            read_start = fhandle.tell()
            # Iterate over records starting at the sought offset.
            for datum in data_file_reader:
                read_length = fhandle.tell() - read_start
                if read_length > length and len(contents_list) > 0:
                    break
                else:
                    datum_str = str(datum) + "\n"
                    contents_list.append(datum_str)
            data_file_reader.close()
            contents = "".join(contents_list)
        except:
            logging.warn("Could not read avro file at %s" % path,
                         exc_info=True)
            raise PopupException(_("Failed to read Avro file."))
    finally:
        if fhandle is not None:
            fhandle.close()
    return contents
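
A usage sketch for the helper above: any object whose open(path) returns a seekable binary file can stand in for fs (the original is a Hadoop filesystem wrapper). LocalFS and the path here are hypothetical:

class LocalFS(object):
    # Minimal stand-in for the fs argument; open() must return a
    # seekable binary file object.
    def open(self, path):
        return open(path, 'rb')

# Render roughly the first 4 KiB worth of records, starting at offset 0.
print(_read_avro(LocalFS(), '/tmp/events.avro', offset=0, length=4096))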
Example #3
    def test_container(self):
        writer = open('data.avro', 'wb')
        datum_writer = io.DatumWriter()
        schema_object = schema.parse("""\
{ "type": "record",
  "name": "StringPair",
  "doc": "A pair of strings.",
  "fields": [
    {"name": "left", "type": "string"},
    {"name": "right", "type": "string"}
  ]
}
    """)
        dfw = datafile.DataFileWriter(writer, datum_writer, schema_object)
        datum = {'left': 'L', 'right': 'R'}
        dfw.append(datum)
        dfw.close()

        reader = open('data.avro', 'rb')
        datum_reader = io.DatumReader()
        dfr = datafile.DataFileReader(reader, datum_reader)
        data = []
        for d in dfr:
            data.append(d)

        self.assertEquals(1, len(data))
        self.assertEquals(datum, data[0])
Example #4
    def test_interop(self):
        ran = False
        print()
        print('TEST INTEROP')
        print('============')
        print()
        for f in os.listdir(_INTEROP_DATA_DIR):
            ran = True

            base_ext = os.path.splitext(os.path.basename(f))[0].split('_', 1)
            if len(base_ext) < 2 or base_ext[1] in datafile.VALID_CODECS:
                print('READING %s' % f)
                print('')

                # read data in binary from file
                reader = open(os.path.join(_INTEROP_DATA_DIR, f), 'rb')
                datum_reader = io.DatumReader()
                dfr = datafile.DataFileReader(reader, datum_reader)
                i = 0
                for i, datum in enumerate(dfr, 1):
                    assert datum is not None
                assert i > 0
            else:
                print('SKIPPING %s due to an unsupported codec' % f)
                print('')
        self.assertTrue(ran, "Didn't find any interop data files to test")
Example #5
    def test_context_manager(self):
        # The context manager protocol became a first-class
        # language feature only in Python 2.6 and above.
        import sys
        if sys.version_info < (2, 6):
            print 'Skipping context manager tests on this Python version.'
            return
        # Test the writer with a 'with' statement.
        writer = open(FILENAME, 'wb')
        datum_writer = io.DatumWriter()
        sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
        schema_object = schema.parse(sample_schema)
        with datafile.DataFileWriter(writer, datum_writer,
                                     schema_object) as dfw:
            dfw.append(sample_datum)
        self.assertTrue(writer.closed)

        # Test the reader with a 'with' statement.
        datums = []
        reader = open(FILENAME, 'rb')
        datum_reader = io.DatumReader()
        with datafile.DataFileReader(reader, datum_reader) as dfr:
            for datum in dfr:
                datums.append(datum)
        self.assertTrue(reader.closed)
Example #6
    def testMetadata(self):
        file_path = self.NewTempFile()

        # Test the writer with a 'with' statement.
        with open(file_path, 'wb') as writer:
            datum_writer = io.DatumWriter()
            sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
            schema_object = schema.parse(sample_schema)
            with datafile.DataFileWriter(writer, datum_writer,
                                         schema_object) as dfw:
                dfw.SetMeta('test.string', 'foo')
                dfw.SetMeta('test.number', '1')
                dfw.append(sample_datum)
            self.assertTrue(writer.closed)

        # Test the reader with a 'with' statement.
        datums = []
        with open(file_path, 'rb') as reader:
            datum_reader = io.DatumReader()
            with datafile.DataFileReader(reader, datum_reader) as dfr:
                self.assertEqual(b'foo', dfr.GetMeta('test.string'))
                self.assertEqual(b'1', dfr.GetMeta('test.number'))
                for datum in dfr:
                    datums.append(datum)
            self.assertTrue(reader.closed)
Example #7
def data_access_dir_binary_avro():
    dir_location = request.args.get('datadir_avro')
    print dir_location
    dir_url = base_url + dir_location + '?user.name=hdfs&op=OPEN'
    r = requests.get(dir_url, stream=True)
    print r.status_code

    with open('p.avro', 'wb') as fo:
        for chunk in r:
            fo.write(chunk)

    print "created"

    OUTFILE_NAME = 'p.avro'

    rec_reader = io.DatumReader()
    df_reader = datafile.DataFileReader(open(OUTFILE_NAME, 'rb'), rec_reader)
    # Read all records stored inside
    mydata = []
    for record in df_reader:
        mydata.append(record)

    de = pd.DataFrame(mydata)
    #r=requests.get(dir_url)

    return de.to_html()
Example #8
    def testInterop(self):
        with tempfile.NamedTemporaryFile() as temp_path:
            WriteDataFile(temp_path.name, INTEROP_DATUM, INTEROP_SCHEMA)

            # read data in binary from file
            datum_reader = io.DatumReader()
            with open(temp_path.name, 'rb') as reader:
                dfr = datafile.DataFileReader(reader, datum_reader)
                for datum in dfr:
                    self.assertEqual(INTEROP_DATUM, datum)
Example #9
    def testInterop(self):
        with tempfile.NamedTemporaryFile() as temp_path:
            write_data_file(temp_path.name, INTEROP_DATUM,
                            get_interop_schema())

            # read data in binary from file
            datum_reader = io.DatumReader()
            with open(temp_path.name, "rb") as reader:
                dfr = datafile.DataFileReader(reader, datum_reader)
                for datum in dfr:
                    self.assertEqual(INTEROP_DATUM, datum)
Example #10
  def testAppend(self):
    correct = 0
    codecs_to_validate = get_codecs_to_validate()
    for iexample, (writer_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
      for codec in codecs_to_validate:
        file_path = self.NewTempFile()

        logging.debug(
            'Performing append with codec %r in file %s for example #%d\n'
            'Writing datum: %r using writer schema:\n%s',
            codec, file_path, iexample,
            datum, writer_schema)

        logging.debug('Creating data file %r', file_path)
        with open(file_path, 'wb') as writer:
          datum_writer = io.DatumWriter()
          schema_object = schema.parse(writer_schema)
          with datafile.DataFileWriter(
              writer=writer,
              datum_writer=datum_writer,
              writer_schema=schema_object,
              codec=codec,
          ) as dfw:
            dfw.append(datum)

        logging.debug('Appending data to %r', file_path)
        for i in range(9):
          with open(file_path, 'ab+') as writer:
            with datafile.DataFileWriter(writer, io.DatumWriter()) as dfw:
              dfw.append(datum)

        logging.debug('Reading appended data from %r', file_path)
        with open(file_path, 'rb') as reader:
          datum_reader = io.DatumReader()
          with datafile.DataFileReader(reader, datum_reader) as dfr:
            appended_data = list(dfr)

        logging.debug(
            'Appended data has %d items: %r',
            len(appended_data), appended_data)

        if ([datum] * 10) == appended_data:
          correct += 1
        else:
          logging.error(
              'Appended data does not match:\n'
              'Expect: %r\n'
              'Actual: %r',
              [datum] * 10,
              appended_data)

    self.assertEqual(
        correct,
        len(codecs_to_validate) * len(SCHEMAS_TO_VALIDATE))
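
The detail that makes the append loop above work: with the file opened in 'ab+' mode and no writer schema passed, DataFileWriter reads the schema and sync marker back from the existing header rather than starting a new file. A standalone sketch of that pattern (file name and datum are hypothetical):

from avro import datafile, io

# Append one more record to an existing Avro container file; no schema
# is passed, so DataFileWriter recovers it from the file's own header.
with open('existing.avro', 'ab+') as writer:
    with datafile.DataFileWriter(writer, io.DatumWriter()) as dfw:
        dfw.append({'left': 'L', 'right': 'R'})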
Example #11
  def test_append(self):
    print ''
    print 'TEST APPEND'
    print '==========='
    print ''
    correct = 0
    for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
      for codec in CODECS_TO_VALIDATE:
        if (codec == 'snappy'):
          try:
            import snappy
          except:
            print 'Snappy not present. Skipping.'
            correct += 1
            continue
        print ''
        print 'SCHEMA NUMBER %d' % (i + 1)
        print '================'
        print ''
        print 'Schema: %s' % example_schema
        print 'Datum: %s' % datum
        print 'Codec: %s' % codec

        # write data in binary to file once
        writer = open(FILENAME, 'wb')
        datum_writer = io.DatumWriter()
        schema_object = schema.parse(example_schema)
        dfw = datafile.DataFileWriter(writer, datum_writer, schema_object, codec=codec)
        dfw.append(datum)
        dfw.close()

        # open file, write, and close nine times
        for i in range(9):
          writer = open(FILENAME, 'ab+')
          dfw = datafile.DataFileWriter(writer, io.DatumWriter())
          dfw.append(datum)
          dfw.close()

        # read data in binary from file
        reader = open(FILENAME, 'rb')
        datum_reader = io.DatumReader()
        dfr = datafile.DataFileReader(reader, datum_reader)
        appended_data = []
        for rec in dfr:
          appended_data.append(rec)

        print 'Appended Data: %s' % appended_data
        print 'Appended Data Length: %d' % len(appended_data)
        is_correct = [datum] * 10 == appended_data
        if is_correct: correct += 1
        print 'Correct Appended: %s' % is_correct
        print ''
    os.remove(FILENAME)
    self.assertEquals(correct, len(CODECS_TO_VALIDATE)*len(SCHEMAS_TO_VALIDATE))
Example #12
def read_avro_file(insource='results.avro'):
    rec_reader = io.DatumReader()
    if insource == sys.stdin:
        data = sys.stdin.read()
        temp_file = StringIO(data)

        df_reader = datafile.DataFileReader(temp_file, rec_reader)
    else:
        df_reader = datafile.DataFileReader(open(insource, 'rb'), rec_reader)
    del stored[:]  # 'stored' is a module-level list defined elsewhere
    """
    for record in df_reader:
        size = record['size']
        for i in range(size):
            i = i+1
            arg = record["arg%s"%(i)]
            #print arg
            stored.append(arg)
    """
    return df_reader
Example #13
    def testInterop(self):
        datum_reader = io.DatumReader()
        for avro_file in glob.glob('../../build/interop/data/*.avro'):
            base_ext = os.path.splitext(os.path.basename(avro_file))[0].split(
                '_', 1)
            if len(base_ext) < 2 or base_ext[1] in datafile.VALID_CODECS:
                with open(avro_file, 'rb') as reader, \
                        datafile.DataFileReader(reader, datum_reader) as dfr:
                    i = 0
                    for i, datum in enumerate(dfr, 1):
                        self.assertIsNotNone(datum)
                    self.assertGreater(i, 0)
Example #14
def main():

    # Create a datum reader.
    rec_reader = io.DatumReader()

    # Define files to convert into parquet files
    files = ['logs_0.avro', 'logs_1.avro', 'logs_2.avro', 'logs_3.avro']
    pqfiles = []
    #files = ['logs_small.avro']

    # Loop to process the files
    for f in files:

        # Print message
        print("Converting", f, "to parquet format..")

        # Define reader to avro format.
        df_reader = datafile.DataFileReader(open(f, "rb"), rec_reader)

        # Convert the records from avro into pandas dataframe.
        df = pd.DataFrame.from_records(df_reader)

        # Convert pandas dataframe into parquet table
        table = pa.Table.from_pandas(df)

        # Set the avro file name (new) and append on the list.
        newfile = str(f).replace('.avro', '.parquet')
        pqfiles.append(newfile)

        # Write the data into a parquet file format.
        pq.write_table(table, newfile)

        # Close the dataframe reader
        df_reader.close()

    # S3 setup
    s3 = boto3.resource('s3',
                        aws_access_key_id=ACCESS_KEY_ID,
                        aws_secret_access_key=ACCESS_SECRET_KEY,
                        config=Config(signature_version='s3v4'))

    # Loop to save the files on S3.
    for f in pqfiles:

        # File to upload on S3
        upfile = open(f, 'rb')

        # Upload the file on S3.
        print("Uploading", f, "on S3 AWS..")
        s3.Bucket(BUCKET_NAME).put_object(Key=f, Body=upfile)

        # Close the file.
        upfile.close()
Example #15
  def testRoundTrip(self):
    correct = 0
    codecs_to_validate = get_codecs_to_validate()
    for iexample, (writer_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
      for codec in codecs_to_validate:
        file_path = self.NewTempFile()

        # Write the datum this many times in the data file:
        nitems = 10

        logging.debug(
            'Performing round-trip with codec %r in file %s for example #%d\n'
            'Writing datum: %r using writer schema:\n%s',
            codec, file_path, iexample,
            datum, writer_schema)

        logging.debug('Creating data file %r', file_path)
        with open(file_path, 'wb') as writer:
          datum_writer = io.DatumWriter()
          schema_object = schema.parse(writer_schema)
          with datafile.DataFileWriter(
              writer=writer,
              datum_writer=datum_writer,
              writer_schema=schema_object,
              codec=codec,
          ) as dfw:
            for _ in range(nitems):
              dfw.append(datum)

        logging.debug('Reading data from %r', file_path)
        with open(file_path, 'rb') as reader:
          datum_reader = io.DatumReader()
          with datafile.DataFileReader(reader, datum_reader) as dfr:
            round_trip_data = list(dfr)

        logging.debug(
            'Round-trip data has %d items: %r',
            len(round_trip_data), round_trip_data)

        if ([datum] * nitems) == round_trip_data:
          correct += 1
        else:
          logging.error(
              'Round-trip data does not match:\n'
              'Expect: %r\n'
              'Actual: %r',
              [datum] * nitems,
              round_trip_data)

    self.assertEqual(
        correct,
        len(codecs_to_validate) * len(SCHEMAS_TO_VALIDATE))
Example #16
  def test_empty_datafile(self):
    """A reader should not fail to read a file consisting of a single empty block."""
    sample_schema = schema.parse(SCHEMAS_TO_VALIDATE[1][0])
    with datafile.DataFileWriter(open(FILENAME, 'wb'), io.DatumWriter(),
        sample_schema) as dfw:
      dfw.flush()
      # Write an empty block
      dfw.encoder.write_long(0)
      dfw.encoder.write_long(0)
      dfw.writer.write(dfw.sync_marker)

    with datafile.DataFileReader(open(FILENAME, 'rb'), io.DatumReader()) as dfr:
      self.assertEqual([], list(dfr))
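
For reference, a minimal sketch of the bytes the test appends, per the Avro object container spec: a block is a record count, a byte size (both zig-zag longs), the block data, and the 16-byte sync marker, so an empty block is just two zero bytes plus the marker (the marker value below is hypothetical):

import io as pyio

def append_empty_block(buf, sync_marker):
    buf.write(b'\x00')      # record count = 0 (zig-zag varint of 0)
    buf.write(b'\x00')      # block byte size = 0
    buf.write(sync_marker)  # 16-byte sync marker from the file header

buf = pyio.BytesIO()
append_empty_block(buf, b'\x16' * 16)
assert buf.getvalue() == b'\x00\x00' + b'\x16' * 16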
Example #17
def read_avro_file(name):
    # Create a 'record' (datum) reader
    # You can pass an 'expected=SCHEMA' kwarg
    # if you want it to expect a particular
    # schema (Strict)
    rec_reader = io.DatumReader()

    # Create a 'data file' (avro file) reader
    df_reader = datafile.DataFileReader(open(name, 'rb'), rec_reader)

    # Read all records stored inside
    for record in df_reader:
        with open(record['filename'], 'wb') as f:
            f.write(record['content'])
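
As the comment in the example notes, the datum reader can be handed an explicit 'expected' (reader) schema, and records are then resolved against it. A minimal sketch, assuming a recent avro release where the keyword is spelled readers_schema (very old releases used expected=); the schema and file name are hypothetical, and the record name must match the writer schema's:

from avro import datafile, io, schema

# Hypothetical reader schema that projects out a single field.
READER_SCHEMA = schema.parse("""
{"type": "record", "name": "FileRecord",
 "fields": [{"name": "filename", "type": "string"}]}
""")

rec_reader = io.DatumReader(readers_schema=READER_SCHEMA)
with open('files.avro', 'rb') as f:
    for record in datafile.DataFileReader(f, rec_reader):
        print(record['filename'])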
Example #18
def read_avro_file():
    # Create a 'record' (datum) reader
    # You can pass an 'expected=SCHEMA' kwarg
    # if you want it to expect a particular
    # schema (Strict)
    rec_reader = io.DatumReader()

    # Create a 'data file' (avro file) reader
    df_reader = datafile.DataFileReader(open(OUTFILE_NAME, 'rb'), rec_reader)

    # Read all records stored inside
    for record in df_reader:
        print record['name'], record['age']
        print record['address'], record['value']
Example #19
  def test_interop(self):
    print ''
    print 'TEST INTEROP'
    print '============'
    print ''
    for f in os.listdir('/home/blue/avro/lang/py/../../build/interop/data'):
      print 'READING %s' % f
      print ''

      # read data in binary from file
      reader = open(os.path.join('/home/blue/avro/lang/py/../../build/interop/data', f), 'rb')
      datum_reader = io.DatumReader()
      dfr = datafile.DataFileReader(reader, datum_reader)
      for datum in dfr:
        assert datum is not None
Example #20
    def test_interop(self):
        print('')
        print('TEST INTEROP')
        print('============')
        print('')
        for f in os.listdir('@INTEROP_DATA_DIR@'):
            print('READING %s' % f)
            print('')

            # read data in binary from file
            reader = open(os.path.join('@INTEROP_DATA_DIR@', f), 'rb')
            datum_reader = io.DatumReader()
            dfr = datafile.DataFileReader(reader, datum_reader)
            for datum in dfr:
                assert datum is not None
Example #21
    def test_interop(self):
        print ''
        print 'TEST INTEROP'
        print '============'
        print ''
        for f in os.listdir(INTEROP_DATA_DIR):
            print 'READING %s' % f
            print ''

            # read data in binary from file
            reader = open(os.path.join(INTEROP_DATA_DIR, f), 'rb')
            datum_reader = io.DatumReader()
            dfr = datafile.DataFileReader(reader, datum_reader)
            for datum in dfr:
                assert datum is not None
Example #22
  def test_round_trip(self):
    print ''
    print 'TEST ROUND TRIP'
    print '==============='
    print ''
    correct = 0
    for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
      for codec in CODECS_TO_VALIDATE:
        if (codec == 'snappy'):
          try:
            import snappy
          except:
            print 'Snappy not present. Skipping.'
            correct += 1
            continue
        print ''
        print 'SCHEMA NUMBER %d' % (i + 1)
        print '================'
        print ''
        print 'Schema: %s' % example_schema
        print 'Datum: %s' % datum
        print 'Codec: %s' % codec

        # write data in binary to file 10 times
        writer = open(FILENAME, 'wb')
        datum_writer = io.DatumWriter()
        schema_object = schema.parse(example_schema)
        dfw = datafile.DataFileWriter(writer, datum_writer, schema_object, codec=codec)
        for i in range(10):
          dfw.append(datum)
        dfw.close()

        # read data in binary from file
        reader = open(FILENAME, 'rb')
        datum_reader = io.DatumReader()
        dfr = datafile.DataFileReader(reader, datum_reader)
        round_trip_data = []
        for rec in dfr:
          round_trip_data.append(rec)

        print 'Round Trip Data: %s' % round_trip_data
        print 'Round Trip Data Length: %d' % len(round_trip_data)
        is_correct = [datum] * 10 == round_trip_data
        if is_correct: correct += 1
        print 'Correct Round Trip: %s' % is_correct
        print ''
    os.remove(FILENAME)
    self.assertEquals(correct, len(CODECS_TO_VALIDATE)*len(SCHEMAS_TO_VALIDATE))
Example #23
def process_file(afile, output_path):
    rec_reader = io.DatumReader()
    df_reader = datafile.DataFileReader(open(afile, 'rb'), rec_reader)
    for record in df_reader:
        basename = os.path.basename(record['file_path'])
        dirname = os.path.dirname(record['file_path'])
        x = output_path + dirname
        x = x.replace('//','/')
        c = ['mkdir', '-p', x]
        subprocess.call(c)
        fpath = x + '/' + basename
        print fpath
        with open(fpath, 'wb') as f:
            raw = base64.b64decode(record['content'])
            f.write(raw)
Example #24
  def test_interop(self):
    print ''
    print 'TEST INTEROP'
    print '============'
    print ''
    for f in os.listdir('@INTEROP_DATA_DIR@'):
      print 'READING %s' % f
      print ''

      # read data in binary from file
      reader = open(os.path.join('@INTEROP_DATA_DIR@', f), 'rb')
      datum_reader = io.DatumReader()
      dfr = datafile.DataFileReader(reader, datum_reader)
      i = 0
      for i, datum in enumerate(dfr, 1):
        assert datum is not None
      assert i > 0
Example #25
def cat(opts, args):
    if not args:
        raise AvroError('No files to show')

    for filename in args:
        try:
            fo = open(filename, 'rb')
        except (OSError, IOError) as e:
            raise AvroError('Cannot open %s - %s' % (filename, e))

        avro = datafile.DataFileReader(fo, avro_io.DatumReader())

        if opts.print_schema:
            print_schema(avro)
            continue

        print_avro(avro, opts)
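
print_schema and print_avro are helpers defined elsewhere in the tool; one plausible sketch of print_schema, using the GetMeta-style reader API from Example #6 (the container spec stores the writer schema as JSON under the 'avro.schema' metadata key); the real helper may differ:

import json

def print_schema(avro):
    # The writer schema is embedded in the file header metadata.
    schema_json = avro.GetMeta('avro.schema')
    print(json.dumps(json.loads(schema_json), indent=4))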
Example #26
    def test_round_trip(self):
        print('')
        print('TEST ROUND TRIP')
        print('===============')
        print('')
        correct = 0
        for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
            for codec in CODECS_TO_VALIDATE:
                print('')
                print('SCHEMA NUMBER %d' % (i + 1))
                print('================')
                print('')
                print('Schema: %s' % example_schema)
                print('Datum: %s' % datum)
                print('Codec: %s' % codec)

                # write data in binary to file 10 times
                writer = open(FILENAME, 'wb')
                datum_writer = io.DatumWriter()
                schema_object = schema.parse(example_schema)
                dfw = datafile.DataFileWriter(writer,
                                              datum_writer,
                                              schema_object,
                                              codec=codec)
                for i in range(10):
                    dfw.append(datum)
                dfw.close()

                # read data in binary from file
                reader = open(FILENAME, 'rb')
                datum_reader = io.DatumReader()
                dfr = datafile.DataFileReader(reader, datum_reader)
                round_trip_data = []
                for rec in dfr:
                    round_trip_data.append(rec)

                print('Round Trip Data: %s' % round_trip_data)
                print('Round Trip Data Length: %d' % len(round_trip_data))
                is_correct = [datum] * 10 == round_trip_data
                if is_correct:
                    correct += 1
                print('Correct Round Trip: %s' % is_correct)
                print('')
        os.remove(FILENAME)
        self.assertEquals(correct,
                          len(CODECS_TO_VALIDATE) * len(SCHEMAS_TO_VALIDATE))
Example #27
  def test_context_manager(self):
    """Test the writer with a 'with' statement."""
    writer = open(FILENAME, 'wb')
    datum_writer = io.DatumWriter()
    sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
    schema_object = schema.parse(sample_schema)
    with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
      dfw.append(sample_datum)
    self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    reader = open(FILENAME, 'rb')
    datum_reader = io.DatumReader()
    with datafile.DataFileReader(reader, datum_reader) as dfr:
      for datum in dfr:
        datums.append(datum)
    self.assertTrue(reader.closed)
Example #28
    def clean(data):
        try:
            json.dumps(data)
            return data
        except:
            LOG.exception('Failed to dump data as JSON')
            cleaned = {}
            lim = [0]
            if isinstance(
                    data, str
            ):  # Not JSON dumpable, meaning some sort of bytestring or byte data
                # Detect an Avro container file by its magic bytes (b'Obj')
                if (data[:3] == '\x4F\x62\x6A'):
                    # Write the data to an in-memory file
                    output = StringIO.StringIO()
                    output.write(data)

                    #read and parse avro
                    rec_reader = io.DatumReader()
                    df_reader = datafile.DataFileReader(output, rec_reader)
                    return json.dumps(clean([record for record in df_reader]))
                return base64.b64encode(data)

            if hasattr(data, "__iter__"):
                if type(data) is dict:
                    for i in data:
                        cleaned[i] = clean(data[i])
                elif type(data) is list:
                    cleaned = []
                    for i, item in enumerate(data):
                        cleaned += [clean(item)]
                else:
                    for i, item in enumerate(data):
                        cleaned[i] = clean(item)
            else:
                for key in dir(data):
                    value = getattr(data, key)
                    if value is not None and not hasattr(
                            value, '__call__') and sum([
                                int(bool(re.search(ignore, key)))
                                for ignore in ignored_fields
                            ]) == 0:
                        cleaned[key] = clean(value)
            return cleaned
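
For context on the magic-byte test above: '\x4F\x62\x6A' is ASCII 'Obj', the first three bytes of the four-byte Avro container magic 'Obj' + chr(1). A sketch of the same check against the full magic:

AVRO_MAGIC = b'Obj\x01'  # four-byte container magic per the Avro spec

def looks_like_avro(data):
    return data[:4] == AVRO_MAGIC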
Example #29
  def testContextManager(self):
    file_path = self.NewTempFile()

    # Test the writer with a 'with' statement.
    with open(file_path, 'wb') as writer:
      datum_writer = io.DatumWriter()
      sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
      schema_object = schema.parse(sample_schema)
      with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
        dfw.append(sample_datum)
      self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    with open(file_path, 'rb') as reader:
      datum_reader = io.DatumReader()
      with datafile.DataFileReader(reader, datum_reader) as dfr:
        for datum in dfr:
          datums.append(datum)
      self.assertTrue(reader.closed)
Example #30
def read_avro_file():
    # Create a 'record' (datum) reader
    # You can pass an 'expected=SCHEMA' kwarg
    # if you want it to expect a particular
    # schema (Strict)
    rec_reader = io.DatumReader()

    # Create a 'data file' (avro file) reader
    df_reader = datafile.DataFileReader(open(INFILE_NAME, 'rb'), rec_reader)

    # Read all records stored inside
    n = 0
    for record in df_reader:
        # print record
        n = n + 1
        # print record['name'], record['age']
        # print record['address'], record['value']
        # Do whatever read-processing you wanna do
        # for each record here ...
    print "No. of Records in file :- {0}".format(str(n))