Example #1
def test_not_avro():
    try:
        with open(__file__, 'rb') as fo:
            fastavro.reader(fo)
        assert False, 'opened non avro file'
    except ValueError:
        pass
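The same negative check can also be written with pytest's exception helper; a minimal sketch, assuming pytest is installed and that, as in the example above, the test module itself is not a valid Avro file:

import fastavro
import pytest

def test_not_avro_pytest():
    # fastavro.reader raises ValueError when the file header is not Avro
    with pytest.raises(ValueError):
        with open(__file__, 'rb') as fo:
            fastavro.reader(fo)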
Example #2
def check(filename):
    with open(filename, 'rb') as fo:
        reader = fastavro.reader(fo)
        assert hasattr(reader, 'schema'), 'no schema on file'

        if basename(filename) in NO_DATA:
            return

        records = list(reader)
        assert len(records) > 0, 'no records found'

    new_file = MemoryIO()
    fastavro.writer(new_file, reader.schema, records, reader.codec)
    new_file_bytes = new_file.getvalue()

    new_file = NoSeekMemoryIO(new_file_bytes)
    new_reader = fastavro.reader(new_file)
    assert hasattr(new_reader, 'schema'), "schema wasn't written"
    assert new_reader.schema == reader.schema
    assert new_reader.codec == reader.codec
    new_records = list(new_reader)

    assert new_records == records

    # Test schema migration with the same schema
    new_file = NoSeekMemoryIO(new_file_bytes)
    schema_migration_reader = fastavro.reader(new_file, reader.schema)
    assert schema_migration_reader.reader_schema == reader.schema
    new_records = list(schema_migration_reader)

    assert new_records == records
Example #3
def test_repo_caching_issue():
    schema = {
        "type": "record",
        "name": "B",
        "fields": [{
            "name": "b",
            "type": {
                "type": "record",
                "name": "C",
                "fields": [{
                    "name": "c",
                    "type": "string"
                }]
            }
        }]
    }

    new_file = MemoryIO()
    records = [{"b": {"c": "test"}}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    new_records = list(new_reader)
    assert new_records == records

    other_schema = {
        "name": "A",
        "type": "record",
        "fields": [{
            "name": "a",
            "type": {
                "type": "record",
                "name": "B",
                "fields": [{
                    "name": "b",
                    "type": {
                        "type": "record",
                        "name": "C",
                        "fields": [{
                            "name": "c",
                            "type": "int"
                        }]
                    }
                }]
            }
        }, {
            "name": "aa",
            "type": "B"
        }]
    }

    new_file = MemoryIO()
    records = [{"a": {"b": {"c": 1}}, "aa": {"b": {"c": 2}}}]
    fastavro.writer(new_file, other_schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    new_records = list(new_reader)
    assert new_records == records
Example #4
def test_schema_migration_array_with_union_promotion():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "array",
                "items": ["boolean", "long"]
            },
        }]
    }

    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "array",
                "items": ["string", "float"]
            },
        }]
    }

    new_file = MemoryIO()
    records = [{"test": [1, 2, 3]}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    new_records = list(new_reader)
    assert new_records == records
Example #5
def test_schema_migration_array_failure():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "array",
                "items": ["string", "int"]
            },
        }]
    }

    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "array",
                "items": ["string", "boolean"]
            },
        }]
    }

    new_file = MemoryIO()
    records = [{"test": [1, 2, 3]}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    list(new_reader)
Example #6
def roundtrip(record, writer_schema, reader_schema):
    new_file = MemoryIO()
    fastavro.writer(new_file, writer_schema, [record])
    new_file.seek(0)

    new_records = list(fastavro.reader(new_file, reader_schema))
    return new_records[0]
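A hypothetical use of the roundtrip() helper above: promote an int field to long through the reader schema (int to long is a standard Avro promotion). The "Promotable" record name and the field value are illustrative only:

writer_schema = {
    "type": "record",
    "name": "Promotable",
    "fields": [{"name": "field", "type": "int"}],
}
reader_schema = {
    "type": "record",
    "name": "Promotable",
    "fields": [{"name": "field", "type": "long"}],
}

# The value written as int comes back unchanged when read as long.
assert roundtrip({"field": 1}, writer_schema, reader_schema) == {"field": 1}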
Example #7
def test_schema_migration_maps_with_union_promotion():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "map",
                "values": ["string", "int"]
            },
        }]
    }

    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "map",
                "values": ["string", "long"]
            },
        }]
    }

    new_file = MemoryIO()
    records = [{"test": {"foo": 1}}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    new_records = list(new_reader)
    assert new_records == records
Example #8
def test_schema_migration_maps_failure():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "map",
                "values": "string"
            },
        }]
    }

    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "map",
                "values": "long"
            },
        }]
    }

    new_file = MemoryIO()
    records = [{"test": {"foo": "a"}}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    try:
        list(new_reader)
    except fastavro._reader.SchemaResolutionError:
        pass
    else:
        assert False
Example #9
def extract_file(filename):
    print >> sys.stderr, "Processing " + filename
    result = []
    with open(filename, 'rb') as avro_file:
        reader = avro.reader(avro_file)
        schema = reader.schema
        fields = global_fields
        add_header = args.add_header
        for index, record in enumerate(reader):
            if not fields:
                fields = tuple(get_fields(record))
            if args.list_fields:
                print 'Fields in %s:' % (filename,)
                for f in fields:
                    print '  Field: ' + '/'.join(f)
                break
            if add_header:
                print '\t'.join( '/'.join( p.encode('utf-8') for p in f ) for f in fields )
                add_header = False
            if index and not (index % 1000):
                if samples and all( s.is_full for s in samples.itervalues() ):
                    break
                sys.stderr.write("Read %d lines of input\r" % (index,))
            extracted_values = extract(record, fields)
            if samples is None:
                result.append('\t'.join(extracted_values))
        if samples:
            print 'Samples values from %s:' % (filename,)
            for f in fields:
                print '  ' + '/'.join(f) + ':'
                for v in sorted(samples[f]):
                    print '    ' + v
        print >> sys.stderr, "Read %d lines of input\r" % (index,)
    return result
Example #10
def test_str_py3():
    letters = ascii_uppercase + digits
    id_size = 100

    seed('str_py3')  # Repeatable results

    def gen_id():
        return ''.join(choice(letters) for _ in range(id_size))

    keys = ['first', 'second', 'third', 'fourth']

    testdata = [{key: gen_id() for key in keys} for _ in range(50)]

    schema = {
        "fields": [{'name': key, 'type': 'string'} for key in keys],
        "namespace": "namespace",
        "name": "zerobyte",
        "type": "record"
    }

    buf = BytesIO()
    fastavro.writer(buf, schema, testdata)

    buf.seek(0, SEEK_SET)
    for i, rec in enumerate(fastavro.reader(buf), 1):
        pass

    size = len(testdata)

    assert i == size, 'bad number of records'
    assert rec == testdata[-1], 'bad last record'
Example #11
def _read_avro(fn, executor=None, hdfs=None, lazy=False, **kwargs):
    """ See distributed.hdfs.read_avro for docstring """
    from hdfs3 import HDFileSystem
    from dask import do
    import fastavro
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)

    filenames = hdfs.glob(fn)
    blockss = []

    for fn in filenames:
        with hdfs.open(fn, 'r') as f:
            av = fastavro.reader(f)
            header = av._header
        schema = json.loads(header['meta']['avro.schema'])

        blockss.extend([read_bytes(fn, executor, hdfs, lazy=True,
                                   delimiter=header['sync'], not_zero=True)
                        for fn in filenames])

    lazy_values = [do(avro_body)(b, header) for blocks in blockss
                                            for b in blocks]

    if lazy:
        raise gen.Return(lazy_values)
    else:
        futures = executor.compute(*lazy_values)
        raise gen.Return(futures)
Example #12
def test_schema_migration_maps_failure():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "map",
                "values": "string"
            },
        }]
    }

    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "map",
                "values": "long"
            },
        }]
    }

    new_file = MemoryIO()
    records = [{"test": {"foo": "a"}}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    list(new_reader)
Example #13
def test_schema_migration_array_failure():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "array",
                "items": ["string", "int"]
            },
        }]
    }

    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {
                "type": "array",
                "items": ["string", "boolean"]
            },
        }]
    }

    new_file = MemoryIO()
    records = [{"test": [1, 2, 3]}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    try:
        list(new_reader)
    except fastavro._reader.SchemaResolutionError:
        pass
    else:
        assert False
Example #14
def test_schema_migration_schema_mismatch():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": "string",
        }]
    }

    new_schema = {
        "type": "enum",
        "name": "test",
        "symbols": ["FOO", "BAR"],
    }

    new_file = MemoryIO()
    records = [{"test": "test"}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    try:
        list(new_reader)
    except fastavro._reader.SchemaResolutionError:
        pass
    else:
        assert False
Example #15
def roundtrip(schema, records, new_schema):
    new_file = MemoryIO()
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)

    reader = fastavro.reader(new_file, new_schema)
    new_records = list(reader)
    return new_records
Example #16
def test_default_values():
    schema = {"type": "record", "fields": [{"name": "default_field", "type": "string", "default": "default_value"}]}
    new_file = MemoryIO()
    records = [{}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    new_records = list(new_reader)
    assert new_records == [{"default_field": "default_value"}]
Example #17
def test_metadata():
    schema = {"type": "record", "fields": []}

    new_file = MemoryIO()
    records = [{}]
    metadata = {"key": "value"}
    fastavro.writer(new_file, schema, records, metadata=metadata)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    assert new_reader.metadata["key"] == metadata["key"]
Example #18
File: genpy.py Project: bwengals/hadrian
 def avroInputIterator(self, inputStream, interpreter="avro"):
     if interpreter == "avro":
         return DataFileReader(inputStream, DatumReader())
     elif interpreter == "fastavro":
         import fastavro
         return fastavro.reader(inputStream)
     elif interpreter == "correct-fastavro":
         return FastAvroCorrector(inputStream, self.config.input)
     else:
         raise ValueError("interpreter must be one of \"avro\", \"fastavro\", and \"correct-fastavro\" (which corrects fastavro's handling of Unicode strings)")
Example #19
def read(iostream, runs=1):
    times = []
    for _ in range(runs):
        iostream.seek(0)
        start = time.time()
        records = list(reader(iostream))
        end = time.time()
        times.append(end - start)
    print('... {0} runs averaged {1} seconds'.format(runs, (sum(times) / runs)))
    return records
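A sketch of driving the read() benchmark above, assuming `reader` was imported from fastavro at module level as the function implies; the one-field schema and record count are illustrative:

from io import BytesIO

import fastavro

schema = {
    "type": "record",
    "name": "Bench",
    "fields": [{"name": "n", "type": "int"}],
}

buf = BytesIO()
fastavro.writer(buf, schema, [{"n": i} for i in range(1000)])

# read() seeks back to the start of the stream itself on every run.
records = read(buf, runs=3)
assert len(records) == 1000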
Example #20
File: __init__.py Project: mtth/hdfs
 def _reader():
   """Record generator over all part-files."""
   for path in self._paths:
     with self._client.read(path) as bytes_reader:
       reader = fastavro.reader(_SeekableReader(bytes_reader))
       if not self._schema:
         schema = reader.writer_schema
         _logger.debug('Read schema from %r.', path)
         yield (schema, reader.metadata)
       for record in reader:
         yield record
Example #21
def test_fastavro():
    print("fastavro: reading file...")

    with open(filename, "rb") as fp:
        av = fastavro.reader(fp)

        t0 = datetime.datetime.now()
        res = list(av)
        t1 = datetime.datetime.now()

    return (t1 - t0, len(res))
Example #22
def gen_cat(sources, cat_type=None):
    for s in sources:
        if cat_type is None:
            for item in s:
                yield item
        elif cat_type == "avro":
            reader = fastavro.reader(s)
            for item in reader:
                yield json.dumps(item)
        else:
            print "unknown cat type: " + str(cat_type)
Example #23
def test_schema_migration_add_default_field():
    schema = {"type": "record", "fields": []}

    new_schema = {"type": "record", "fields": [{"name": "test", "type": "string", "default": "default"}]}

    new_file = MemoryIO()
    records = [{}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    new_records = list(new_reader)
    assert new_records == [{"test": "default"}]
Example #24
def test_schema_migration_reader_union():
    schema = {"type": "record", "fields": [{"name": "test", "type": "int"}]}

    new_schema = {"type": "record", "fields": [{"name": "test", "type": ["string", "int"]}]}

    new_file = MemoryIO()
    records = [{"test": 1}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    new_records = list(new_reader)
    assert new_records == records
Example #25
def check(filename):
    with open(filename, 'rb') as fo:
        reader = fastavro.reader(fo)
        assert hasattr(reader, 'schema'), 'no schema on file'

        if basename(filename) in NO_DATA:
            return

        records = list(reader)
        assert len(records) > 0, 'no records found'

    new_file = MemoryIO()
    fastavro.writer(new_file, reader.schema, records, reader.codec)

    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    assert hasattr(new_reader, 'schema'), "schema wasn't written"
    assert new_reader.schema == reader.schema
    assert new_reader.codec == reader.codec
    new_records = list(new_reader)

    assert new_records == records
Example #26
    def extract(self, infile, job):
        minx, miny, maxx, maxy = (None, None, None, None)
        poly_wkt = None
        job.set_field(VgDexField.NAME, os.path.basename(infile))
        job.set_field(VgDexField.PATH, infile)
        with open(infile, 'rb') as avro_file:
            reader = fastavro.reader(avro_file)
            for record in reader:
                for k, v in record.iteritems():
                    if k.lower() == 'footprint_geometry':
                        poly_wkt = v
                        job.set_field(VgDexField.GEO_WKT, poly_wkt)
                        if job.get(VgDexField.GEO):
                            job.geo['wkt'] = poly_wkt
                        job.set_field(VgDexField.GEO, job.get_field(VgDexField.GEO_WKT))
                    else:
                        if k == 'MBR_EAST' and v:
                            minx = float(v)
                        elif k == 'MBR_WEST' and v:
                            maxx = float(v)
                        elif k == 'MBR_NORTH' and v:
                            maxy = float(v)
                        elif k == 'MBR_SOUTH' and v:
                            miny = float(v)

                    # Map values to correct data type.
                    if isinstance(v, str):
                        job.set_field("fs_{0}".format(k), v)
                    elif isinstance(v, unicode):
                        job.set_field("fs_{0}".format(k), v)
                    elif isinstance(v, bool):
                        job.set_field("fs_{0}".format(k), v)
                    elif isinstance(v, int):
                        job.set_field("fl_{0}".format(k), v)
                    elif isinstance(v, float):
                        job.set_field("fu_{0}".format(k), v)
                    elif isinstance(v, datetime.datetime):
                        job.set_field("fd_{0}".format(k), self.format_date(v))
                    elif (v and "Date" in k) or (isinstance(v, unicode) and len(v.strip()) == 14):
                        job.set_field("fd_{0}".format(k), self.format_date(v))
                    elif isinstance(v, list):
                        job.set_field("fs_{0}".format(k), v)
                    else:
                        job.set_field("meta_{0}".format(k), v)

            if minx and not poly_wkt:
                poly_wkt = "POLYGON (({0} {1}, {0} {3}, {2} {3}, {2} {1}, {0} {1}))".format(minx, miny, maxx, maxy)
                job.set_field(VgDexField.GEO_WKT, poly_wkt)
                if job.get(VgDexField.GEO):
                    job.geo['wkt'] = poly_wkt
                job.set_field(VgDexField.GEO, job.get_field(VgDexField.GEO_WKT))
Example #27
def main(argv=None):
    import sys
    from argparse import ArgumentParser

    argv = argv or sys.argv

    parser = ArgumentParser(
        description='iter over avro file, emit records as JSON')
    parser.add_argument('file', help='file(s) to parse', nargs='*')
    parser.add_argument('--schema', help='dump schema instead of records',
                        action='store_true', default=False)
    parser.add_argument('--codecs', help='print supported codecs',
                        action='store_true', default=False)
    parser.add_argument('--version', action='version',
                        version='fastavro %s' % avro.__version__)
    parser.add_argument('-p', '--pretty', help='pretty print json',
                        action='store_true', default=False)
    args = parser.parse_args(argv[1:])

    if args.codecs:
        import fastavro
        print('\n'.join(sorted(fastavro._reader.BLOCK_READERS)))
        raise SystemExit

    files = args.file or ['-']
    for filename in files:
        if filename == '-':
            fo = sys.stdin
        else:
            try:
                fo = open(filename, 'rb')
            except IOError as e:
                raise SystemExit('error: cannot open %s - %s' % (filename, e))

        try:
            reader = avro.reader(fo)
        except ValueError as e:
            raise SystemExit('error: %s' % e)

        if args.schema:
            json_dump(reader.schema, True)
            sys.stdout.write('\n')
            continue

        indent = 4 if args.pretty else None
        try:
            for record in reader:
                json_dump(record, indent)
                sys.stdout.write('\n')
        except (IOError, KeyboardInterrupt):
            pass
Example #28
def main(args):
    if 3 != len(args):
        print("usage %s input output" % args[0])
        return 1
    input_file_path = args[1]
    output_file_path = args[2]
    with open(input_file_path, 'rb') as infile:
        reader = avro.reader(infile)
        schema = reader.schema

        with open(output_file_path, 'w') as of:
            for record in reader:
                as_json = json.dumps(record)
                of.write(as_json)
                of.write("\n")
Example #29
def read_fastavro_original_from_buffer(bufferbytes):
    """
    Reads the AVRO binary data contained in the specified bytes buffer and returns it as a python data structure.
    The avro buffer to read must contain schema, headers, and the binary representation of the data. This is basically
    what is written by default by DataFileWriter

    :param bufferbytes: the buffer of bytes containing original binary AVRO representation
    :return: recs  : python list containing all avro recs
             schema: the avro schema object
    """

    bytes_reader = BytesIO(bufferbytes)
    freader = reader(bytes_reader)
    schema = freader.schema
    recs = []
    for datum in freader:
        recs.append(datum)
    return recs, schema
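A hypothetical round trip through the helper above: serialize two records with fastavro.writer, then hand the raw bytes to read_fastavro_original_from_buffer (which assumes BytesIO and fastavro's reader are already imported at module level). The "Example" schema and messages are illustrative:

from io import BytesIO

import fastavro

schema = {
    "type": "record",
    "name": "Example",
    "fields": [{"name": "msg", "type": "string"}],
}

buf = BytesIO()
fastavro.writer(buf, schema, [{"msg": "hello"}, {"msg": "world"}])

recs, parsed_schema = read_fastavro_original_from_buffer(buf.getvalue())
assert recs == [{"msg": "hello"}, {"msg": "world"}]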
Example #30
def test_serialize(ephemeral, schema):
    user = {
        'name': 'Foo Bar Matic',
        'favorite_number': 24,
        'favorite_color': 'Nonyabusiness',
    }

    avro_blob = serialize(schema, [user], ephemeral_storage=ephemeral)
    buf = BytesIO()
    buf.write(avro_blob)
    buf.seek(0)

    read = reader(buf)
    meta = read.metadata
    value = meta.get('postmates.storage.ephemeral', None)
    assert value == ('1' if ephemeral else None)
    records = [r for r in read]
    assert records == [user]
Example #31
def test_schema_migration_remove_field():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": "string",
        }]
    }

    new_schema = {"type": "record", "fields": []}

    new_file = MemoryIO()
    records = [{'test': 'test'}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    new_records = list(new_reader)
    assert new_records == [{}]
Example #32
def test_py37_runtime_error():
    """On Python 3.7 this test would cause the StopIteration to get raised as
    a RuntimeError.

    See https://www.python.org/dev/peps/pep-0479/
    """
    weather_file = join(data_dir, 'weather.avro')

    zip_io = MemoryIO()
    with zipfile.ZipFile(zip_io, mode='w') as zio:
        zio.write(weather_file, arcname='weather')

    with zipfile.ZipFile(zip_io) as zio:
        with zio.open('weather') as fo:
            # Need to read fo into a bytes buffer for python versions less
            # than 3.7
            reader = fastavro.reader(MemoryIO(fo.read()))
            list(reader)
Example #33
def test_schema_migration_reader_union():
    schema = {"type": "record", "fields": [{"name": "test", "type": "int"}]}

    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": ["string", "int"]
        }]
    }

    new_file = MemoryIO()
    records = [{"test": 1}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    new_records = list(new_reader)
    assert new_records == records
Example #34
def deserialize(avro_bytes, decode_schema=False, reader_schema=None):
    """
        Deserialize encoded avro bytes.

        Args:
            avro_bytes: IOBase | bytes - Avro blob to decode.

        Kwargs:
            decode_schema: Bool - Load metadata['avro.schema'] as Python dictionary?.  Default = False.

            reader_schema: Dict - Schema to use when deserializing. If None, use writer_schema.  Default = None.

        Returns:
            (metadata, values) where:
                metadata: dict - Avro metadata as raw bytes.  When decode_schema is True,
                    the key 'avro.schema' value will be loaded as a Python dictionary instead of a string of JSON.

                values: generator - Generator for values corresponding to the schema contained
                    in metadata.

    """
    def _avro_generator(datafile_reader):
        for value in datafile_reader:
            yield value

    if isinstance(avro_bytes, IOBase):
        buffer = avro_bytes
    elif isinstance(avro_bytes, bytes):
        buffer = BytesIO(avro_bytes)
    else:
        raise ValueError(
            "avro_bytes must be a bytes object or file-like io object")

    read = reader(buffer, reader_schema=reader_schema)
    values = _avro_generator(read)
    metadata = read.metadata

    if decode_schema:
        schema = metadata['avro.schema']
        if sys.version_info < (3, 0):
            schema = schema.decode('utf-8')
        metadata['avro.schema'] = json.loads(schema)

    return metadata, values
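A hypothetical call of deserialize() above on bytes produced by fastavro.writer; the "User" schema is illustrative. With decode_schema=True the 'avro.schema' metadata entry comes back as a dictionary rather than a JSON string:

from io import BytesIO

import fastavro

schema = {
    "type": "record",
    "name": "User",
    "fields": [{"name": "name", "type": "string"}],
}

buf = BytesIO()
fastavro.writer(buf, schema, [{"name": "alice"}])

metadata, values = deserialize(buf.getvalue(), decode_schema=True)
assert list(values) == [{"name": "alice"}]
assert isinstance(metadata["avro.schema"], dict)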
Example #35
    def create_book(self, data):
        Session = sessionmaker(bind=engine)
        session = Session()
        with BytesIO() as f:
            f.write(bytes.fromhex(data))
            f.seek(0)
            avro_reader = reader(f, book_schema)
            for data in avro_reader:
                category = None
                if data["category"]:
                    category = session.query(BookCategory).filter_by(id=data["category"]).first()
                book = Book(name=data["name"], amount=data["amount"])
                if category:
                    book.category = category
                session.add(book)
                session.commit()

        session.close()
        return "success"
Example #36
def avro_csvconcat():
    head = True
    count = 0
    #f = csv.writer(open("test.csv", "w+"))
    with open(avro_concat_06, 'rb') as fo, open('OI_06_concat.csv',
                                                mode='a') as csv_out:
        f = csv.writer(csv_out)
        avro_reader = reader(fo)
        for emp in avro_reader:
            print(count)
            #print(emp)
            if head == True:
                header = emp.keys()
                f.writerow(header)
                head = False
            count += 1
            f.writerow(emp.values())
            #print(emp.values())
    print(count)
Example #37
def test_reading_after_writing_with_load_schema():
    schema_path = join(data_dir, 'Parent.avsc')
    schema = fastavro.schema.load_schema(schema_path)

    records = [{'child': {}}]

    new_file = MemoryIO()
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)

    # Clean the Child and Parent entries so we are forced to get them from the
    # schema
    for repo in (SCHEMA_DEFS, fastavro.write.WRITERS, fastavro.read.READERS):
        del repo['Child']
        del repo['Parent']

    reader = fastavro.reader(new_file)
    new_records = list(reader)
    assert new_records == records
Example #38
def read_file(filepath, only_stats=True, merge_body=True) -> pd.DataFrame:
    """Reads a single Dow Jones snapshot datafile
    Parameters
    ----------
    filepath : str
        Relative or absolute file path
    only_stats : bool, optional
        Specifies if only file metadata is loaded (True), or if the full article content is loaded (False). On average,
        only_stats loads about 1/10 and is recommended for quick metadata-based analysis. (Default is True)
    merge_body : bool, optional
        Specifies if the body field should be merged with the snippet and this last column being dropped.
        (default is True)
    Returns
    -------
    pandas.DataFrame
        A single Pandas Dataframe with the file content
    """
    with open(filepath, "rb") as fp:
        reader = fastavro.reader(fp)
        records = [r for r in reader]
        r_df = pd.DataFrame.from_records(records)

    if only_stats is True:
        r_df = r_df[ARTICLES_STAT_FIELDS]

    if (only_stats is False) & (merge_body is True):
        r_df['body'] = r_df['snippet'] + "\n\n" + r_df['body']
        r_df.drop('snippet', axis=1, inplace=True)

    for d_field in ARTICLE_DELETE_FIELDS:
        if d_field in r_df.columns:
            r_df.drop(d_field, axis=1, inplace=True)
    r_df['publication_date'] = r_df['publication_date'].astype(
        'datetime64[ms]')
    r_df['publication_datetime'] = r_df['publication_datetime'].astype(
        'datetime64[ms]')
    r_df['modification_date'] = r_df['modification_date'].astype(
        'datetime64[ms]')
    r_df['modification_datetime'] = r_df['modification_datetime'].astype(
        'datetime64[ms]')
    r_df['ingestion_datetime'] = r_df['ingestion_datetime'].astype(
        'datetime64[ms]')
    return r_df
Example #39
 def __enter__(self):
     rv = super(PFBReader, self).__enter__()
     self._reader = reader(self._file_obj)
     schema = []
     self.set_encoded_schema(self._reader.writer_schema)
     for f in self._reader.writer_schema["fields"]:
         if f["name"] == "object":
             it = iter(f["type"])
             # skip metadata
             next(it)
             for node in it:
                 node = deepcopy(node)
                 schema.append(node)
                 for field in node["fields"]:
                     handle_schema_field_unicode(field, encode=False)
     self.set_schema(
         json.loads(json.dumps(schema), object_pairs_hook=str_hook))
     self.set_metadata(next(self._reader)["object"])
     return rv
Example #40
def test_write_union_tuple_primitive():
    '''
    Test that when we can use tuple style of writing unions
    (see function `write_union` in `_write`) with primitives
     not only with records.
    '''

    schema = {
        'name': 'test_name',
        'namespace': 'test',
        'type': 'record',
        'fields': [{
            'name': 'val',
            'type': ['string', 'int']
        }]
    }

    data = [
        {
            "val": ("int", 1)
        },
        {
            "val": ("string", "string")
        },
    ]

    expected_data = [
        {
            "val": 1
        },
        {
            "val": "string"
        },
    ]

    new_file = MemoryIO()
    fastavro.writer(new_file, schema, data)
    new_file.seek(0)

    new_reader = fastavro.reader(new_file)
    new_records = list(new_reader)

    assert new_records == expected_data
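For unambiguous primitive branches like these, fastavro can usually resolve the union branch by validating the plain value, so the tuple form matters most when several branches (for example, multiple record types) could match. A sketch reusing the schema, expected_data, and MemoryIO buffer defined in the test above:

new_file = MemoryIO()
fastavro.writer(new_file, schema, expected_data)  # plain values, no tuples
new_file.seek(0)
assert list(fastavro.reader(new_file)) == expected_data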
Example #41
def test_export(tmpwd):
    ref_path = tmpwd.old / 'xun/tests/test_data/data.bin'

    set_xun_sima_root_args = cli.parser.parse_args(
        ['sima-export', 'i24fi',
         str(ref_path), '-o', 'out.avro'])
    set_xun_sima_root_args.func(set_xun_sima_root_args)

    test_data = {}
    with open(str(ref_path), 'rb') as db:
        td = struct.unpack('i24fi', db.read())
        test_data = dict(('col_{}'.format(i), v) for (i, v) in enumerate(td))

    avro_data = {}
    with open('out.avro', 'rb') as oa:
        for record in fastavro.reader(oa):
            avro_data = record

    assert avro_data == pytest.approx(test_data)
Example #42
def test_schema_migration_enum_failure():
    schema = {
        "type": "enum",
        "name": "test",
        "symbols": ["FOO", "BAR"],
    }

    new_schema = {
        "type": "enum",
        "name": "test",
        "symbols": ["BAZ", "BAR"],
    }

    new_file = MemoryIO()
    records = ["FOO"]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    list(new_reader)
Example #43
 def alerts(self, limit: None | int = None) -> Iterator[io.BytesIO]:
     """
     Generate alerts until timeout is reached
     :returns: dict instance of the alert content
     :raises StopIteration: when next(fastavro.reader) has dried out
     """
     topic_stats: defaultdict[str, list[float]] = defaultdict(
         lambda: [float("inf"), -float("inf"), 0])
     for message in itertools.islice(self._consumer, limit):
         reader = fastavro.reader(io.BytesIO(message.value()))
         alert = next(reader)  # raise StopIteration
         stats = topic_stats[message.topic()]
         if alert["candidate"]["jd"] < stats[0]:
             stats[0] = alert["candidate"]["jd"]
         if alert["candidate"]["jd"] > stats[1]:
             stats[1] = alert["candidate"]["jd"]
         stats[2] += 1
         yield io.BytesIO(message.value())
     log.info("Got messages from topics: {}".format(dict(topic_stats)))
Example #44
def main():
    parser = argparse.ArgumentParser(
        description='get the clone counts in the given Avro files',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('lineage_label',
                        metavar='label',
                        help='the clone label to use')
    parser.add_argument('filenames',
                        metavar='file',
                        nargs='+',
                        help='the Avro files to read')
    args = parser.parse_args()

    clones_counts = defaultdict(int)

    for filename in args.filenames:
        with open_compressed(filename, 'rb') as read_handle:
            reader = fastavro.reader(read_handle)

            for record in reader:
                if args.lineage_label in record['lineages']:
                    subject = record['subject']
                    source = record['source']
                    type_ = record['sequence']['annotations']['target1']
                    lineage = record['lineages'][args.lineage_label]

                    clones_counts[(subject, source, type_, lineage)] += 1

    writer = csv.DictWriter(
        sys.stdout,
        fieldnames=['subject', 'source', 'type', 'lineage', 'read_count'])
    writer.writeheader()

    for (subject, source, type_, lineage), read_count in clones_counts.items():
        row = {
            'subject': subject,
            'source': source,
            'type': type_,
            'lineage': lineage,
            'read_count': read_count
        }
        writer.writerow(row)
Example #45
def test_zephyre(tmpwd):
    argv = ['tag', '2030-01-01', '2030-01-02']
    args = cli.parser.parse_args(['zephyre', *argv])

    with patch('camille.source.zephyre.Zephyre._get_token',
               return_value='token'), patch(
                   'camille.source.zephyre.requests.get',
                   side_effect=requests_get_mock,
               ):
        args.func(args)

    expected_filename = filename_from_args(args,
                                           prefix='zephyre.',
                                           postfix='.avro')

    with open(expected_filename, 'rb') as f:
        avro_reader = fastavro.reader(f, reader_schema=schema)
        result = list(avro_reader)

    assert result == reference
Example #46
File: test_hdfs.py Project: rongou/cudf
def test_read_avro(datadir, hdfs, test_url):
    fname = datadir / "avro" / "example.avro"
    # Read from local file system as buffer
    with open(fname, mode="rb") as f:
        buffer = BytesIO(f.read())
    # Write to hdfs
    hdfs.upload(basedir + "/file.avro", buffer)

    if test_url:
        hd_fpath = f"hdfs://{host}:{port}{basedir}/file.avro"
    else:
        hd_fpath = f"hdfs://{basedir}/file.avro"

    got = cudf.read_avro(hd_fpath)
    with open(fname, mode="rb") as f:
        expect = pd.DataFrame.from_records(fa.reader(f))

    for col in expect.columns:
        expect[col] = expect[col].astype(got[col].dtype)
    assert_eq(expect, got)
Example #47
async def test_translate_record():
    schema = make_pfb_schema([person_entity_def])
    file = make_avro_file(schema, [
        {'name': 'person', 'id': '123', 'object': {
            'first_name': 'Test', 'last_name': 'Dummy', 'eye_color': 'gray'
        }}
    ])

    result = await translate(fastavro.reader(file))
    assert result == [
        {
            'name': '123',
            'entityType': 'person',
            'operations': [
                add_update_attribute('first_name', 'Test'),
                add_update_attribute('last_name', 'Dummy'),
                add_update_attribute('eye_color', 'gray')
            ]
        }
    ]
Example #48
	def load_ztf_alert(arg):
		"""	
		Convenience method.
		Do not use for production!
		"""
		import fastavro
		with open(arg, "rb") as fo:
			al = next(fastavro.reader(fo), None)

		if al.get('prv_candidates') is None:
			return AmpelAlert(
				al['objectId'], tuple([MappingProxyType(al['candidate'])]), None
			)
		else:
			pps = [MappingProxyType(d) for d in al['prv_candidates'] if d.get('candid') is not None]
			pps.insert(0,  MappingProxyType(al['candidate']))
			return AmpelAlert(
				al['objectId'], tuple(pps), 
				tuple(MappingProxyType(d) for d in al['prv_candidates'] if d.get('candid') is None)
			)
Example #49
 def deserialise(self, buffer):
     output = reader(io.BytesIO(buffer), schema)
     new_message = None
     for message in output:
         if MessageType(message['type']) is MessageType.TEXT:
             new_message = TextMessage(
                 message['author'], 'last_author',
                 datetime.fromtimestamp(message['timestamp']),
                 datetime.fromtimestamp(0), message['topic'],
                 message['raw_text'])
         elif MessageType(message['type']) is MessageType.PYTHON:
             new_message = PythonMessage(message['author'],
                                         'last_author',
                                         datetime.fromtimestamp(
                                             message['timestamp']),
                                         datetime.fromtimestamp(0),
                                         message['raw_text'],
                                         message['topic'],
                                         html=message['html'])
         elif MessageType(message['type']) is MessageType.R:
             new_message = RMessage(message['author'],
                                    'last_author',
                                    datetime.fromtimestamp(
                                        message['timestamp']),
                                    datetime.fromtimestamp(0),
                                    message['raw_text'],
                                    message['topic'],
                                    html=message['html'])
         elif MessageType(message['type']) is MessageType.IMAGE:
             new_message = ImageMessage(message['author'],
                                        'last_author',
                                        datetime.fromtimestamp(
                                            message['timestamp']),
                                        datetime.fromtimestamp(0),
                                        message['binary'],
                                        message['topic'],
                                        html=message['html'])
         else:
             raise ValueError(
                 'Unrecognised message type in AvroSerialise.deserialise')
     return new_message
Example #50
    def _validate_avro_for_batch_retrieval(self, source: str,
                                           feature_sets_request):
        """
        Validate whether the entity rows in an Avro source file contains the
        correct information for batch retrieval.

        Only gs:// and local files (file://) uri schemes are allowed.

        Avro file must have a column named "event_timestamp".

        No checks will be done if a GCS path is provided.

        Args:
            source (str):
                File path to Avro.

            feature_sets_request:
                Feature sets that will be requested.
        """
        p = urlparse(source)

        if p.scheme == "gs":
            # GCS path provided (Risk is delegated to user)
            # No validation if GCS path is provided
            return
        elif p.scheme == "file" or not p.scheme:
            # Local file (file://) provided
            file_path = os.path.abspath(os.path.join(p.netloc, p.path))
        else:
            raise Exception(
                f"Unsupported uri scheme provided {p.scheme}, only "
                f"local files (file://), and gs:// schemes are "
                f"allowed")

        with open(file_path, "rb") as f:
            reader = fastavro.reader(f)
            schema = json.loads(reader.metadata["avro.schema"])
            columns = [x["name"] for x in schema["fields"]]
            self._validate_columns(columns=columns,
                                   feature_sets_request=feature_sets_request,
                                   datetime_field="event_timestamp")
Example #51
def _read_avro(urlpath, **kwargs):
    """Read avro file in given path and returns a list of delayed objects."""
    values = []
    for fn in open_files(urlpath):
        with fn as fp:
            av = fastavro.reader(fp)
            header = av._header

        # TODO: If the avro block size in the file is larger than the blocksize
        # passed here then some returned blocks may be empty because they don't
        # contain the delimiter.
        _, blockss = read_bytes(fn.path, delimiter=header['sync'], not_zero=True,
                                sample=False, **kwargs)
        values.extend(
            delayed(_avro_body)(block, header) for blocks in blockss for block in blocks
        )

    if not values:
        raise ValueError("urlpath is empty: %s" % urlpath)

    return values
Example #52
def test_schema_migration_schema_mismatch():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": "string",
        }]
    }

    new_schema = {
        "type": "enum",
        "name": "test",
        "symbols": ["FOO", "BAR"],
    }

    new_file = MemoryIO()
    records = [{"test": "test"}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    list(new_reader)
Example #53
File: job.py Project: vjrkr/feast
    def result(self, timeout_sec: int = int(defaults[CONFIG_TIMEOUT_KEY])):
        """
        Wait until job is done to get an iterable rows of result. The row can
        only represent an Avro row in Feast 0.3.

        Args:
            timeout_sec (int):
                Max no of seconds to wait until job is done. If "timeout_sec"
                is exceeded, an exception will be raised.

        Returns:
            Iterable of Avro rows.
        """
        uris = self.get_avro_files(timeout_sec)
        for file_uri in uris:
            file_obj = get_staging_client(file_uri.scheme).download_file(file_uri)
            file_obj.seek(0)
            avro_reader = fastavro.reader(file_obj)

            for record in avro_reader:
                yield record
Example #54
def test_enum_evolution_no_default_failure():
    original_schema = {
        "type": "enum",
        "name": "test",
        "symbols": ["FOO", "BAR"],
    }

    new_schema = {
        "type": "enum",
        "name": "test",
        "symbols": ["BAZ", "BAR"],
    }

    original_records = ["FOO"]

    bio = BytesIO()
    fastavro.writer(bio, original_schema, original_records)
    bio.seek(0)

    with pytest.raises(fastavro.read.SchemaResolutionError):
        list(fastavro.reader(bio, new_schema))
Example #55
    def _avro_to_df(self, avro_buffer, data_types):
        """Read an avro structure into a dataframe and minimially parse it

        returns: (schema, pandas.Dataframe)
        """
        def parse_row(row):
            return {
                col["name"]: pandas.to_datetime(row[col["name"]])
                if col["data_type"] == "date" else row[col["name"]]
                for col in data_types
            }

        reader = fastavro.reader(six.BytesIO(avro_buffer))
        metadata = reader.writer_schema.get("structure", ())

        if not metadata:
            raise DataMonsterError(
                "DataMonster does not currently support this request")

        records = [parse_row(r) for r in reader]
        return metadata, pandas.DataFrame.from_records(records)
Example #56
def _load_single_avro(path: str, **kwargs: Any) -> pd.DataFrame:
    from fastavro import reader

    kw = ParamDict(kwargs)
    process_record = None
    if "process_record" in kw:
        process_record = kw["process_record"]
        del kw["process_record"]

    with FileSystem().openbin(path) as fp:
        # Configure Avro reader
        avro_reader = reader(fp)
        # Load records in memory
        if process_record:
            records = [process_record(r) for r in avro_reader]

        else:
            records = list(avro_reader)

        # Populate pandas.DataFrame with records
        return pd.DataFrame.from_records(records)
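A hypothetical call of _load_single_avro() above with a process_record hook; "events.avro" is an illustrative path and the lambda simply uppercases one assumed field per record before the DataFrame is built:

df = _load_single_avro(
    "events.avro",
    process_record=lambda r: {**r, "name": r["name"].upper()},
)
print(df.head())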
Example #57
def test_avro_reader_basic(datadir, inputfile, columns, engine):
    path = datadir / inputfile
    try:
        reader = fa.reader(open(path, "rb"))
    except Exception as excpr:
        if type(excpr).__name__ == "FileNotFoundError":
            pytest.skip(".avro file is not found")
        else:
            print(type(excpr).__name__)

    expect = pd.DataFrame.from_records(reader)
    got = cudf.read_avro(path, engine=engine, columns=columns)

    # PANDAS uses NaN to represent invalid data, which forces float dtype
    # For comparison, we can replace NaN with 0 and cast to the cuDF dtype
    # FASTAVRO produces int64 columns from avro int32 dtype, so convert
    # it back to int32 here
    for col in expect.columns:
        expect[col] = expect[col].astype(got[col].dtype)

    assert_eq(expect, got, check_categorical=False)
Example #58
def createZonemaps(filename, output, startID):
    listPT = [0.0 for x in range(0, Constants.IMPRINTS_NUM_PT)]
    with open(filename, 'rb') as fo:
        reader = avro.reader(fo)
        schema = reader.schema
        counter = 0
        print "Create zonemaps..."
        print "Input file: " + str(filename)
        for record in reader:
            tempMuon = len(record['Muon'])
            if (tempMuon > 0):
                for j in range(0, tempMuon):
                    temppt = Decimal(record['Muon'][j]['pt'])
                    listPT[counter] = temppt
                    counter += 1
                    if (counter == Constants.IMPRINTS_NUM_PT):
                        writeZonemaps(listPT, output, startID)
                        counter = 0
                        startID += 1
    print "Finish!"
    return startID