def test_not_avro():
    try:
        with open(__file__, 'rb') as fo:
            fastavro.reader(fo)
        assert False, 'opened non avro file'
    except ValueError:
        pass
def check(filename):
    with open(filename, 'rb') as fo:
        reader = fastavro.reader(fo)
        assert hasattr(reader, 'schema'), 'no schema on file'
        if basename(filename) in NO_DATA:
            return
        records = list(reader)
        assert len(records) > 0, 'no records found'

    new_file = MemoryIO()
    fastavro.writer(new_file, reader.schema, records, reader.codec)
    new_file_bytes = new_file.getvalue()

    new_file = NoSeekMemoryIO(new_file_bytes)
    new_reader = fastavro.reader(new_file)
    assert hasattr(new_reader, 'schema'), "schema wasn't written"
    assert new_reader.schema == reader.schema
    assert new_reader.codec == reader.codec
    new_records = list(new_reader)
    assert new_records == records

    # Test schema migration with the same schema
    new_file = NoSeekMemoryIO(new_file_bytes)
    schema_migration_reader = fastavro.reader(new_file, reader.schema)
    assert schema_migration_reader.reader_schema == reader.schema
    new_records = list(schema_migration_reader)
    assert new_records == records
def test_repo_caching_issue():
    schema = {
        "type": "record",
        "name": "B",
        "fields": [{
            "name": "b",
            "type": {
                "type": "record",
                "name": "C",
                "fields": [{"name": "c", "type": "string"}]
            }
        }]
    }

    new_file = MemoryIO()
    records = [{"b": {"c": "test"}}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    new_records = list(new_reader)
    assert new_records == records

    other_schema = {
        "name": "A",
        "type": "record",
        "fields": [{
            "name": "a",
            "type": {
                "type": "record",
                "name": "B",
                "fields": [{
                    "name": "b",
                    "type": {
                        "type": "record",
                        "name": "C",
                        "fields": [{"name": "c", "type": "int"}]
                    }
                }]
            }
        }, {
            "name": "aa",
            "type": "B"
        }]
    }

    new_file = MemoryIO()
    records = [{"a": {"b": {"c": 1}}, "aa": {"b": {"c": 2}}}]
    fastavro.writer(new_file, other_schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    new_records = list(new_reader)
    assert new_records == records
def test_schema_migration_array_with_union_promotion():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {"type": "array", "items": ["boolean", "long"]},
        }]
    }
    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {"type": "array", "items": ["string", "float"]},
        }]
    }
    new_file = MemoryIO()
    records = [{"test": [1, 2, 3]}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    new_records = list(new_reader)
    assert new_records == records
def test_schema_migration_array_failure():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {"type": "array", "items": ["string", "int"]},
        }]
    }
    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {"type": "array", "items": ["string", "boolean"]},
        }]
    }
    new_file = MemoryIO()
    records = [{"test": [1, 2, 3]}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    list(new_reader)
def roundtrip(record, writer_schema, reader_schema):
    new_file = MemoryIO()
    fastavro.writer(new_file, writer_schema, [record])
    new_file.seek(0)
    new_records = list(fastavro.reader(new_file, reader_schema))
    return new_records[0]
def test_schema_migration_maps_with_union_promotion():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {"type": "map", "values": ["string", "int"]},
        }]
    }
    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {"type": "map", "values": ["string", "long"]},
        }]
    }
    new_file = MemoryIO()
    records = [{"test": {"foo": 1}}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    new_records = list(new_reader)
    assert new_records == records
def test_schema_migration_maps_failure():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {"type": "map", "values": "string"},
        }]
    }
    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {"type": "map", "values": "long"},
        }]
    }
    new_file = MemoryIO()
    records = [{"test": {"foo": "a"}}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    try:
        list(new_reader)
    except fastavro._reader.SchemaResolutionError:
        pass
    else:
        assert False
def extract_file(filename):
    print >> sys.stderr, "Processing " + filename
    result = []
    with open(filename, 'rb') as avro_file:
        reader = avro.reader(avro_file)
        schema = reader.schema
        fields = global_fields
        add_header = args.add_header
        for index, record in enumerate(reader):
            if not fields:
                fields = tuple(get_fields(record))
            if args.list_fields:
                print 'Fields in %s:' % (filename,)
                for f in fields:
                    print ' Field: ' + '/'.join(f)
                break
            if add_header:
                print '\t'.join(
                    '/'.join(
                        p.encode('utf-8') for p in f
                    ) for f in fields
                )
                add_header = False
            if index and not (index % 1000):
                if samples and all(
                    s.is_full for s in samples.itervalues()
                ):
                    break
                sys.stderr.write("Read %d lines of input\r" % (index,))
            extracted_values = extract(record, fields)
            if samples is None:
                result.append('\t'.join(extracted_values))
        if samples:
            print 'Samples values from %s:' % (filename,)
            for f in fields:
                print ' ' + '/'.join(f) + ':'
                for v in sorted(samples[f]):
                    print '  ' + v
    print >> sys.stderr, "Read %d lines of input\r" % (index,)
    return result
def test_str_py3():
    letters = ascii_uppercase + digits
    id_size = 100
    seed('str_py3')  # Repeatable results

    def gen_id():
        return ''.join(choice(letters) for _ in range(id_size))

    keys = ['first', 'second', 'third', 'fourth']
    testdata = [{key: gen_id() for key in keys} for _ in range(50)]

    schema = {
        "fields": [{'name': key, 'type': 'string'} for key in keys],
        "namespace": "namespace",
        "name": "zerobyte",
        "type": "record"
    }

    buf = BytesIO()
    fastavro.writer(buf, schema, testdata)

    buf.seek(0, SEEK_SET)
    for i, rec in enumerate(fastavro.reader(buf), 1):
        pass

    size = len(testdata)
    assert i == size, 'bad number of records'
    assert rec == testdata[-1], 'bad last record'
def _read_avro(fn, executor=None, hdfs=None, lazy=False, **kwargs):
    """ See distributed.hdfs.read_avro for docstring """
    from hdfs3 import HDFileSystem
    from dask import do
    import fastavro
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    filenames = hdfs.glob(fn)
    blockss = []

    for fn in filenames:
        with hdfs.open(fn, 'r') as f:
            av = fastavro.reader(f)
            header = av._header
        schema = json.loads(header['meta']['avro.schema'])

        blockss.extend([read_bytes(fn, executor, hdfs, lazy=True,
                                   delimiter=header['sync'], not_zero=True)
                        for fn in filenames])

    lazy_values = [do(avro_body)(b, header)
                   for blocks in blockss
                   for b in blocks]

    if lazy:
        raise gen.Return(lazy_values)
    else:
        futures = executor.compute(*lazy_values)
        raise gen.Return(futures)
def test_schema_migration_maps_failure():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {"type": "map", "values": "string"},
        }]
    }
    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {"type": "map", "values": "long"},
        }]
    }
    new_file = MemoryIO()
    records = [{"test": {"foo": "a"}}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    list(new_reader)
def test_schema_migration_array_failure():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {"type": "array", "items": ["string", "int"]},
        }]
    }
    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {"type": "array", "items": ["string", "boolean"]},
        }]
    }
    new_file = MemoryIO()
    records = [{"test": [1, 2, 3]}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    try:
        list(new_reader)
    except fastavro._reader.SchemaResolutionError:
        pass
    else:
        assert False
def test_schema_migration_schema_mismatch():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": "string",
        }]
    }
    new_schema = {
        "type": "enum",
        "name": "test",
        "symbols": ["FOO", "BAR"],
    }
    new_file = MemoryIO()
    records = [{"test": "test"}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    try:
        list(new_reader)
    except fastavro._reader.SchemaResolutionError:
        pass
    else:
        assert False
def roundtrip(schema, records, new_schema):
    new_file = MemoryIO()
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    reader = fastavro.reader(new_file, new_schema)
    new_records = list(reader)
    return new_records
def test_default_values():
    schema = {
        "type": "record",
        "fields": [{
            "name": "default_field",
            "type": "string",
            "default": "default_value"
        }]
    }
    new_file = MemoryIO()
    records = [{}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    new_records = list(new_reader)
    assert new_records == [{"default_field": "default_value"}]
def test_metadata():
    schema = {"type": "record", "fields": []}
    new_file = MemoryIO()
    records = [{}]
    metadata = {"key": "value"}
    fastavro.writer(new_file, schema, records, metadata=metadata)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    assert new_reader.metadata["key"] == metadata["key"]
def avroInputIterator(self, inputStream, interpreter="avro"):
    if interpreter == "avro":
        return DataFileReader(inputStream, DatumReader())
    elif interpreter == "fastavro":
        import fastavro
        return fastavro.reader(inputStream)
    elif interpreter == "correct-fastavro":
        return FastAvroCorrector(inputStream, self.config.input)
    else:
        raise ValueError(
            "interpreter must be one of \"avro\", \"fastavro\", and "
            "\"correct-fastavro\" (which corrects fastavro's handling of "
            "Unicode strings)")
def read(iostream, runs=1):
    times = []
    for _ in range(runs):
        iostream.seek(0)
        start = time.time()
        records = list(reader(iostream))
        end = time.time()
        times.append(end - start)
    print('... {0} runs averaged {1} seconds'.format(runs, (sum(times) / runs)))
    return records
def _reader():
    """Record generator over all part-files."""
    for path in self._paths:
        with self._client.read(path) as bytes_reader:
            reader = fastavro.reader(_SeekableReader(bytes_reader))
            if not self._schema:
                schema = reader.writer_schema
                _logger.debug('Read schema from %r.', path)
                yield (schema, reader.metadata)
            for record in reader:
                yield record
def test_fastavro():
    print("fastavro: reading file...")
    with open(filename, "rb") as fp:
        av = fastavro.reader(fp)
        t0 = datetime.datetime.now()
        res = list(av)
        t1 = datetime.datetime.now()
    return (t1 - t0, len(res))
def gen_cat(sources, cat_type=None):
    for s in sources:
        if cat_type is None:
            for item in s:
                yield item
        elif cat_type == "avro":
            reader = fastavro.reader(s)
            for item in reader:
                yield json.dumps(item)
        else:
            print "unknown cat type: " + str(cat_type)
def test_schema_migration_add_default_field():
    schema = {"type": "record", "fields": []}
    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": "string",
            "default": "default"
        }]
    }
    new_file = MemoryIO()
    records = [{}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    new_records = list(new_reader)
    assert new_records == [{"test": "default"}]
def test_schema_migration_reader_union():
    schema = {"type": "record", "fields": [{"name": "test", "type": "int"}]}
    new_schema = {
        "type": "record",
        "fields": [{"name": "test", "type": ["string", "int"]}]
    }
    new_file = MemoryIO()
    records = [{"test": 1}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    new_records = list(new_reader)
    assert new_records == records
def check(filename):
    with open(filename, 'rb') as fo:
        reader = fastavro.reader(fo)
        assert hasattr(reader, 'schema'), 'no schema on file'
        if basename(filename) in NO_DATA:
            return
        records = list(reader)
        assert len(records) > 0, 'no records found'

    new_file = MemoryIO()
    fastavro.writer(new_file, reader.schema, records, reader.codec)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    assert hasattr(new_reader, 'schema'), "schema wasn't written"
    assert new_reader.schema == reader.schema
    assert new_reader.codec == reader.codec
    new_records = list(new_reader)
    assert new_records == records
def extract(self, infile, job):
    minx, miny, maxx, maxy = (None, None, None, None)
    poly_wkt = None
    job.set_field(VgDexField.NAME, os.path.basename(infile))
    job.set_field(VgDexField.PATH, infile)
    with open(infile, 'rb') as avro_file:
        reader = fastavro.reader(avro_file)
        for record in reader:
            for k, v in record.iteritems():
                if k.lower() == 'footprint_geometry':
                    poly_wkt = v
                    job.set_field(VgDexField.GEO_WKT, poly_wkt)
                    if job.get(VgDexField.GEO):
                        job.geo['wkt'] = poly_wkt
                    job.set_field(VgDexField.GEO, job.get_field(VgDexField.GEO_WKT))
                else:
                    if k == 'MBR_EAST' and v:
                        minx = float(v)
                    elif k == 'MBR_WEST' and v:
                        maxx = float(v)
                    elif k == 'MBR_NORTH' and v:
                        maxy = float(v)
                    elif k == 'MBR_SOUTH' and v:
                        miny = float(v)

                # Map values to correct data type.
                if isinstance(v, str):
                    job.set_field("fs_{0}".format(k), v)
                elif isinstance(v, unicode):
                    job.set_field("fs_{0}".format(k), v)
                elif isinstance(v, bool):
                    job.set_field("fs_{0}".format(k), v)
                elif isinstance(v, int):
                    job.set_field("fl_{0}".format(k), v)
                elif isinstance(v, float):
                    job.set_field("fu_{0}".format(k), v)
                elif isinstance(v, datetime.datetime):
                    job.set_field("fd_{0}".format(k), self.format_date(v))
                elif (v and "Date" in k) or (isinstance(v, unicode) and len(v.strip()) == 14):
                    job.set_field("fd_{0}".format(k), self.format_date(v))
                elif isinstance(v, list):
                    job.set_field("fs_{0}".format(k), v)
                else:
                    job.set_field("meta_{0}".format(k), v)

    if minx and not poly_wkt:
        poly_wkt = "POLYGON (({0} {1}, {0} {3}, {2} {3}, {2} {1}, {0} {1}))".format(minx, miny, maxx, maxy)
        job.set_field(VgDexField.GEO_WKT, poly_wkt)
        if job.get(VgDexField.GEO):
            job.geo['wkt'] = poly_wkt
        job.set_field(VgDexField.GEO, job.get_field(VgDexField.GEO_WKT))
def main(argv=None):
    import sys
    from argparse import ArgumentParser

    argv = argv or sys.argv

    parser = ArgumentParser(
        description='iter over avro file, emit records as JSON')
    parser.add_argument('file', help='file(s) to parse', nargs='*')
    parser.add_argument('--schema', help='dump schema instead of records',
                        action='store_true', default=False)
    parser.add_argument('--codecs', help='print supported codecs',
                        action='store_true', default=False)
    parser.add_argument('--version', action='version',
                        version='fastavro %s' % avro.__version__)
    parser.add_argument('-p', '--pretty', help='pretty print json',
                        action='store_true', default=False)
    args = parser.parse_args(argv[1:])

    if args.codecs:
        import fastavro
        print('\n'.join(sorted(fastavro._reader.BLOCK_READERS)))
        raise SystemExit

    files = args.file or ['-']
    for filename in files:
        if filename == '-':
            fo = sys.stdin
        else:
            try:
                fo = open(filename, 'rb')
            except IOError as e:
                raise SystemExit('error: cannot open %s - %s' % (filename, e))

        try:
            reader = avro.reader(fo)
        except ValueError as e:
            raise SystemExit('error: %s' % e)

        if args.schema:
            json_dump(reader.schema, True)
            sys.stdout.write('\n')
            continue

        indent = 4 if args.pretty else None
        try:
            for record in reader:
                json_dump(record, indent)
                sys.stdout.write('\n')
        except (IOError, KeyboardInterrupt):
            pass
def main(args):
    if 3 != len(args):
        print("usage %s input output" % args[0])
        return 1

    input_file_path = args[1]
    output_file_path = args[2]

    with open(input_file_path, 'rb') as infile:
        reader = avro.reader(infile)
        schema = reader.schema
        with open(output_file_path, 'w') as of:
            for record in reader:
                as_json = json.dumps(record)
                of.write(as_json)
                of.write("\n")
def read_fastavro_original_from_buffer(bufferbytes):
    """
    Reads the AVRO binary data contained in the specified bytes buffer and
    returns it as a python data structure. The avro buffer to read must
    contain schema, headers, and the binary representation of the data.
    This is basically what is written by default by DataFileWriter

    :param bufferbytes: the buffer of bytes containing original binary AVRO
                        representation
    :return: recs: python list containing all avro recs
             schema: the avro schema object
    """
    bytes_reader = BytesIO(bufferbytes)
    freader = reader(bytes_reader)
    schema = freader.schema

    recs = []
    for datum in freader:
        recs.append(datum)

    return recs, schema
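# Hedged usage sketch (not from the original source): it assumes this module's
# `reader` and `BytesIO` imports plus `fastavro.writer`, and uses a made-up
# one-field schema purely to illustrate calling
# read_fastavro_original_from_buffer() on an in-memory Avro container.
def _example_read_fastavro_original_from_buffer():
    import fastavro

    example_schema = {
        "type": "record",
        "name": "Example",  # illustrative name, not from the source project
        "fields": [{"name": "id", "type": "int"}],
    }
    buf = BytesIO()
    fastavro.writer(buf, example_schema, [{"id": 1}, {"id": 2}])

    recs, schema = read_fastavro_original_from_buffer(buf.getvalue())
    assert recs == [{"id": 1}, {"id": 2}]
    return recs, schema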
def test_serialize(ephemeral, schema):
    user = {
        'name': 'Foo Bar Matic',
        'favorite_number': 24,
        'favorite_color': 'Nonyabusiness',
    }

    avro_blob = serialize(schema, [user], ephemeral_storage=ephemeral)

    buf = BytesIO()
    buf.write(avro_blob)
    buf.seek(0)

    read = reader(buf)
    meta = read.metadata
    value = meta.get('postmates.storage.ephemeral', None)
    assert value == ('1' if ephemeral else None)

    records = [r for r in read]
    assert records == [user]
def test_schema_migration_remove_field():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": "string",
        }]
    }
    new_schema = {"type": "record", "fields": []}
    new_file = MemoryIO()
    records = [{'test': 'test'}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    new_records = list(new_reader)
    assert new_records == [{}]
def test_py37_runtime_error():
    """On Python 3.7 this test would cause the StopIteration to get raised as
    a RuntimeError.

    See https://www.python.org/dev/peps/pep-0479/
    """
    weather_file = join(data_dir, 'weather.avro')

    zip_io = MemoryIO()
    with zipfile.ZipFile(zip_io, mode='w') as zio:
        zio.write(weather_file, arcname='weather')

    with zipfile.ZipFile(zip_io) as zio:
        with zio.open('weather') as fo:
            # Need to read fo into a bytes buffer for python versions less
            # than 3.7
            reader = fastavro.reader(MemoryIO(fo.read()))
            list(reader)
def test_schema_migration_reader_union():
    schema = {"type": "record", "fields": [{"name": "test", "type": "int"}]}
    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": ["string", "int"]
        }]
    }
    new_file = MemoryIO()
    records = [{"test": 1}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    new_records = list(new_reader)
    assert new_records == records
def deserialize(avro_bytes, decode_schema=False, reader_schema=None):
    """
    Deserialize encoded avro bytes.

    Args:
        avro_bytes: IOBase | bytes - Avro blob to decode.

    Kwargs:
        decode_schema: Bool - Load metadata['avro.schema'] as a Python
            dictionary? Default = False.
        reader_schema: Dict - Schema to use when deserializing. If None, use
            writer_schema. Default = None.

    Returns:
        (metadata, values) where:
            metadata: dict - Avro metadata as raw bytes. When decode_schema is
                True, the key 'avro.schema' value will be loaded as a Python
                dictionary instead of a string of JSON.
            values: generator - Generator for values corresponding to the
                schema contained in metadata.
    """
    def _avro_generator(datafile_reader):
        for value in datafile_reader:
            yield value

    if isinstance(avro_bytes, IOBase):
        buffer = avro_bytes
    elif isinstance(avro_bytes, bytes):
        buffer = BytesIO(avro_bytes)
    else:
        raise ValueError(
            "avro_bytes must be a bytes object or file-like io object")

    read = reader(buffer, reader_schema=reader_schema)
    values = _avro_generator(read)

    metadata = read.metadata
    if decode_schema:
        schema = metadata['avro.schema']
        if sys.version_info < (3, 0):
            schema = schema.decode('utf-8')
        metadata['avro.schema'] = json.loads(schema)

    return metadata, values
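# Hedged usage sketch (not part of the original source): it assumes this
# module's `reader`, `BytesIO`, and `json` imports plus `fastavro.writer`, and
# uses a made-up one-field schema to show the (metadata, values) return shape
# of deserialize().
def _example_deserialize():
    import fastavro

    example_schema = {
        "type": "record",
        "name": "User",  # illustrative name, not from the source project
        "fields": [{"name": "name", "type": "string"}],
    }
    buf = BytesIO()
    fastavro.writer(buf, example_schema, [{"name": "alice"}])

    metadata, values = deserialize(buf.getvalue(), decode_schema=True)
    assert metadata["avro.schema"]["type"] == "record"
    assert list(values) == [{"name": "alice"}]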
def create_book(self, data):
    Session = sessionmaker(bind=engine)
    session = Session()
    with BytesIO() as f:
        f.write(bytes.fromhex(data))
        f.seek(0)
        avro_reader = reader(f, book_schema)
        for data in avro_reader:
            category = None
            if data["category"]:
                category = session.query(BookCategory).filter_by(
                    id=data["category"]).first()
            book = Book(name=data["name"], amount=data["amount"])
            if category:
                book.category = category
            session.add(book)
    session.commit()
    session.close()
    return "success"
def avro_csvconcat():
    head = True
    count = 0
    #f = csv.writer(open("test.csv", "w+"))
    with open(avro_concat_06, 'rb') as fo, open('OI_06_concat.csv', mode='a') as csv_out:
        f = csv.writer(csv_out)
        avro_reader = reader(fo)
        for emp in avro_reader:
            print(count)
            #print(emp)
            if head == True:
                header = emp.keys()
                f.writerow(header)
                head = False
            count += 1
            f.writerow(emp.values())
            #print(emp.values())
    print(count)
def test_reading_after_writing_with_load_schema():
    schema_path = join(data_dir, 'Parent.avsc')
    schema = fastavro.schema.load_schema(schema_path)

    records = [{'child': {}}]

    new_file = MemoryIO()
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)

    # Clean the Child and Parent entries so we are forced to get them from the
    # schema
    for repo in (SCHEMA_DEFS, fastavro.write.WRITERS, fastavro.read.READERS):
        del repo['Child']
        del repo['Parent']

    reader = fastavro.reader(new_file)
    new_records = list(reader)
    assert new_records == records
def read_file(filepath, only_stats=True, merge_body=True) -> pd.DataFrame:
    """Reads a single Dow Jones snapshot datafile

    Parameters
    ----------
    filepath : str
        Relative or absolute file path
    only_stats : bool, optional
        Specifies if only file metadata is loaded (True), or if the full
        article content is loaded (False). On average, only_stats loads about
        1/10 and is recommended for quick metadata-based analysis.
        (Default is True)
    merge_body : bool, optional
        Specifies if the body field should be merged with the snippet and this
        last column being dropped. (default is True)

    Returns
    -------
    pandas.DataFrame
        A single Pandas Dataframe with the file content
    """
    with open(filepath, "rb") as fp:
        reader = fastavro.reader(fp)
        records = [r for r in reader]
        r_df = pd.DataFrame.from_records(records)
        if only_stats is True:
            r_df = r_df[ARTICLES_STAT_FIELDS]
        if (only_stats is False) & (merge_body is True):
            r_df['body'] = r_df['snippet'] + "\n\n" + r_df['body']
            r_df.drop('snippet', axis=1, inplace=True)
        for d_field in ARTICLE_DELETE_FIELDS:
            if d_field in r_df.columns:
                r_df.drop(d_field, axis=1, inplace=True)
        r_df['publication_date'] = r_df['publication_date'].astype(
            'datetime64[ms]')
        r_df['publication_datetime'] = r_df['publication_datetime'].astype(
            'datetime64[ms]')
        r_df['modification_date'] = r_df['modification_date'].astype(
            'datetime64[ms]')
        r_df['modification_datetime'] = r_df['modification_datetime'].astype(
            'datetime64[ms]')
        r_df['ingestion_datetime'] = r_df['ingestion_datetime'].astype(
            'datetime64[ms]')
    return r_df
def __enter__(self):
    rv = super(PFBReader, self).__enter__()
    self._reader = reader(self._file_obj)
    schema = []
    self.set_encoded_schema(self._reader.writer_schema)
    for f in self._reader.writer_schema["fields"]:
        if f["name"] == "object":
            it = iter(f["type"])
            # skip metadata
            next(it)
            for node in it:
                node = deepcopy(node)
                schema.append(node)
                for field in node["fields"]:
                    handle_schema_field_unicode(field, encode=False)
    self.set_schema(
        json.loads(json.dumps(schema), object_pairs_hook=str_hook))
    self.set_metadata(next(self._reader)["object"])
    return rv
def test_write_union_tuple_primitive():
    '''
    Test that the tuple style of writing unions (see function `write_union`
    in `_write`) works with primitives, not only with records.
    '''
    schema = {
        'name': 'test_name',
        'namespace': 'test',
        'type': 'record',
        'fields': [{
            'name': 'val',
            'type': ['string', 'int']
        }]
    }

    data = [
        {"val": ("int", 1)},
        {"val": ("string", "string")},
    ]

    expected_data = [
        {"val": 1},
        {"val": "string"},
    ]

    new_file = MemoryIO()
    fastavro.writer(new_file, schema, data)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    new_records = list(new_reader)
    assert new_records == expected_data
def test_export(tmpwd):
    ref_path = tmpwd.old / 'xun/tests/test_data/data.bin'

    set_xun_sima_root_args = cli.parser.parse_args(
        ['sima-export', 'i24fi', str(ref_path), '-o', 'out.avro'])
    set_xun_sima_root_args.func(set_xun_sima_root_args)

    test_data = {}
    with open(str(ref_path), 'rb') as db:
        td = struct.unpack('i24fi', db.read())
        test_data = dict(('col_{}'.format(i), v) for (i, v) in enumerate(td))

    avro_data = {}
    with open('out.avro', 'rb') as oa:
        for record in fastavro.reader(oa):
            avro_data = record

    assert avro_data == pytest.approx(test_data)
def test_schema_migration_enum_failure():
    schema = {
        "type": "enum",
        "name": "test",
        "symbols": ["FOO", "BAR"],
    }
    new_schema = {
        "type": "enum",
        "name": "test",
        "symbols": ["BAZ", "BAR"],
    }
    new_file = MemoryIO()
    records = ["FOO"]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    list(new_reader)
def alerts(self, limit: None | int = None) -> Iterator[io.BytesIO]:
    """
    Generate alerts until timeout is reached

    :returns: dict instance of the alert content
    :raises StopIteration: when next(fastavro.reader) has dried out
    """
    topic_stats: defaultdict[str, list[float]] = defaultdict(
        lambda: [float("inf"), -float("inf"), 0])

    for message in itertools.islice(self._consumer, limit):
        reader = fastavro.reader(io.BytesIO(message.value()))
        alert = next(reader)  # raise StopIteration
        stats = topic_stats[message.topic()]
        if alert["candidate"]["jd"] < stats[0]:
            stats[0] = alert["candidate"]["jd"]
        if alert["candidate"]["jd"] > stats[1]:
            stats[1] = alert["candidate"]["jd"]
        stats[2] += 1
        yield io.BytesIO(message.value())

    log.info("Got messages from topics: {}".format(dict(topic_stats)))
def main():
    parser = argparse.ArgumentParser(
        description='get the clone counts in the given Avro files',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('lineage_label', metavar='label',
                        help='the clone label to use')
    parser.add_argument('filenames', metavar='file', nargs='+',
                        help='the Avro files to read')
    args = parser.parse_args()

    clones_counts = defaultdict(int)
    for filename in args.filenames:
        with open_compressed(filename, 'rb') as read_handle:
            reader = fastavro.reader(read_handle)
            for record in reader:
                if args.lineage_label in record['lineages']:
                    subject = record['subject']
                    source = record['source']
                    type_ = record['sequence']['annotations']['target1']
                    lineage = record['lineages'][args.lineage_label]
                    clones_counts[(subject, source, type_, lineage)] += 1

    writer = csv.DictWriter(
        sys.stdout,
        fieldnames=['subject', 'source', 'type', 'lineage', 'read_count'])
    writer.writeheader()
    for (subject, source, type_, lineage), read_count in clones_counts.items():
        row = {
            'subject': subject,
            'source': source,
            'type': type_,
            'lineage': lineage,
            'read_count': read_count
        }
        writer.writerow(row)
def test_zephyre(tmpwd):
    argv = ['tag', '2030-01-01', '2030-01-02']
    args = cli.parser.parse_args(['zephyre', *argv])

    with patch('camille.source.zephyre.Zephyre._get_token',
               return_value='token'), patch(
                   'camille.source.zephyre.requests.get',
                   side_effect=requests_get_mock,
    ):
        args.func(args)

    expected_filename = filename_from_args(args, prefix='zephyre.',
                                           postfix='.avro')
    with open(expected_filename, 'rb') as f:
        avro_reader = fastavro.reader(f, reader_schema=schema)
        result = list(avro_reader)

    assert result == reference
def test_read_avro(datadir, hdfs, test_url):
    fname = datadir / "avro" / "example.avro"
    # Read from local file system as buffer
    with open(fname, mode="rb") as f:
        buffer = BytesIO(f.read())
    # Write to hdfs
    hdfs.upload(basedir + "/file.avro", buffer)

    if test_url:
        hd_fpath = f"hdfs://{host}:{port}{basedir}/file.avro"
    else:
        hd_fpath = f"hdfs://{basedir}/file.avro"

    got = cudf.read_avro(hd_fpath)
    with open(fname, mode="rb") as f:
        expect = pd.DataFrame.from_records(fa.reader(f))

    for col in expect.columns:
        expect[col] = expect[col].astype(got[col].dtype)
    assert_eq(expect, got)
async def test_translate_record():
    schema = make_pfb_schema([person_entity_def])
    file = make_avro_file(schema, [
        {'name': 'person', 'id': '123', 'object': {
            'first_name': 'Test',
            'last_name': 'Dummy',
            'eye_color': 'gray'
        }}
    ])

    result = await translate(fastavro.reader(file))

    assert result == [
        {
            'name': '123',
            'entityType': 'person',
            'operations': [
                add_update_attribute('first_name', 'Test'),
                add_update_attribute('last_name', 'Dummy'),
                add_update_attribute('eye_color', 'gray')
            ]
        }
    ]
def load_ztf_alert(arg):
    """ Convenience method. Do not use for production! """
    import fastavro
    with open(arg, "rb") as fo:
        al = next(fastavro.reader(fo), None)

    if al.get('prv_candidates') is None:
        return AmpelAlert(
            al['objectId'], tuple([MappingProxyType(al['candidate'])]), None
        )
    else:
        pps = [MappingProxyType(d) for d in al['prv_candidates']
               if d.get('candid') is not None]
        pps.insert(0, MappingProxyType(al['candidate']))
        return AmpelAlert(
            al['objectId'],
            tuple(pps),
            tuple(MappingProxyType(d) for d in al['prv_candidates']
                  if d.get('candid') is None)
        )
def deserialise(self, buffer):
    output = reader(io.BytesIO(buffer), schema)
    new_message = None
    for message in output:
        if MessageType(message['type']) is MessageType.TEXT:
            new_message = TextMessage(
                message['author'], 'last_author',
                datetime.fromtimestamp(message['timestamp']),
                datetime.fromtimestamp(0),
                message['topic'], message['raw_text'])
        elif MessageType(message['type']) is MessageType.PYTHON:
            new_message = PythonMessage(
                message['author'], 'last_author',
                datetime.fromtimestamp(message['timestamp']),
                datetime.fromtimestamp(0),
                message['raw_text'], message['topic'],
                html=message['html'])
        elif MessageType(message['type']) is MessageType.R:
            new_message = RMessage(
                message['author'], 'last_author',
                datetime.fromtimestamp(message['timestamp']),
                datetime.fromtimestamp(0),
                message['raw_text'], message['topic'],
                html=message['html'])
        elif MessageType(message['type']) is MessageType.IMAGE:
            new_message = ImageMessage(
                message['author'], 'last_author',
                datetime.fromtimestamp(message['timestamp']),
                datetime.fromtimestamp(0),
                message['binary'], message['topic'],
                html=message['html'])
        else:
            raise ValueError(
                'Unrecognised message type in AvroSerialise.deserialise')
    return new_message
def _validate_avro_for_batch_retrieval(self, source: str,
                                       feature_sets_request):
    """
    Validate whether the entity rows in an Avro source file contain the
    correct information for batch retrieval.

    Only gs:// and local file (file://) uri schemes are allowed.

    The Avro file must have a column named "event_timestamp".

    No checks will be done if a GCS path is provided.

    Args:
        source (str):
            File path to Avro.
        feature_sets_request:
            Feature sets that will be requested.
    """
    p = urlparse(source)

    if p.scheme == "gs":
        # GCS path provided (risk is delegated to user)
        # No validation if GCS path is provided
        return
    elif p.scheme == "file" or not p.scheme:
        # Local file (file://) provided
        file_path = os.path.abspath(os.path.join(p.netloc, p.path))
    else:
        raise Exception(
            f"Unsupported uri scheme provided {p.scheme}, only "
            f"local files (file://), and gs:// schemes are "
            f"allowed")

    with open(file_path, "rb") as f:
        reader = fastavro.reader(f)
        schema = json.loads(reader.metadata["avro.schema"])
        columns = [x["name"] for x in schema["fields"]]
        self._validate_columns(columns=columns,
                               feature_sets_request=feature_sets_request,
                               datetime_field="event_timestamp")
def _read_avro(urlpath, **kwargs):
    """Read the avro file(s) at the given path and return a list of delayed
    objects."""
    values = []
    for fn in open_files(urlpath):
        with fn as fp:
            av = fastavro.reader(fp)
            header = av._header
        # TODO: If the avro block size in the file is larger than the
        # blocksize passed here then some returned blocks may be empty
        # because they don't contain the delimiter.
        _, blockss = read_bytes(fn.path, delimiter=header['sync'],
                                not_zero=True, sample=False, **kwargs)
        values.extend(
            delayed(_avro_body)(block, header)
            for blocks in blockss
            for block in blocks
        )
    if not values:
        raise ValueError("urlpath is empty: %s" % urlpath)
    return values
def test_schema_migration_schema_mismatch():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": "string",
        }]
    }
    new_schema = {
        "type": "enum",
        "name": "test",
        "symbols": ["FOO", "BAR"],
    }
    new_file = MemoryIO()
    records = [{"test": "test"}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    list(new_reader)
def result(self, timeout_sec: int = int(defaults[CONFIG_TIMEOUT_KEY])):
    """
    Wait until the job is done, then return an iterable of result rows.
    A row can only represent an Avro row in Feast 0.3.

    Args:
        timeout_sec (int):
            Max number of seconds to wait until the job is done. If
            "timeout_sec" is exceeded, an exception will be raised.

    Returns:
        Iterable of Avro rows.
    """
    uris = self.get_avro_files(timeout_sec)
    for file_uri in uris:
        file_obj = get_staging_client(file_uri.scheme).download_file(file_uri)
        file_obj.seek(0)
        avro_reader = fastavro.reader(file_obj)
        for record in avro_reader:
            yield record
def test_enum_evolution_no_default_failure():
    original_schema = {
        "type": "enum",
        "name": "test",
        "symbols": ["FOO", "BAR"],
    }
    new_schema = {
        "type": "enum",
        "name": "test",
        "symbols": ["BAZ", "BAR"],
    }
    original_records = ["FOO"]

    bio = BytesIO()
    fastavro.writer(bio, original_schema, original_records)
    bio.seek(0)

    with pytest.raises(fastavro.read.SchemaResolutionError):
        list(fastavro.reader(bio, new_schema))
def _avro_to_df(self, avro_buffer, data_types):
    """Read an avro structure into a dataframe and minimally parse it

    returns: (schema, pandas.Dataframe)
    """
    def parse_row(row):
        return {
            col["name"]: pandas.to_datetime(row[col["name"]])
            if col["data_type"] == "date" else row[col["name"]]
            for col in data_types
        }

    reader = fastavro.reader(six.BytesIO(avro_buffer))
    metadata = reader.writer_schema.get("structure", ())
    if not metadata:
        raise DataMonsterError(
            "DataMonster does not currently support this request")

    records = [parse_row(r) for r in reader]
    return metadata, pandas.DataFrame.from_records(records)
def _load_single_avro(path: str, **kwargs: Any) -> pd.DataFrame:
    from fastavro import reader

    kw = ParamDict(kwargs)
    process_record = None
    if "process_record" in kw:
        process_record = kw["process_record"]
        del kw["process_record"]

    with FileSystem().openbin(path) as fp:
        # Configure Avro reader
        avro_reader = reader(fp)
        # Load records in memory
        if process_record:
            records = [process_record(r) for r in avro_reader]
        else:
            records = list(avro_reader)
        # Populate pandas.DataFrame with records
        return pd.DataFrame.from_records(records)
def test_avro_reader_basic(datadir, inputfile, columns, engine):
    path = datadir / inputfile
    try:
        reader = fa.reader(open(path, "rb"))
    except Exception as excpr:
        if type(excpr).__name__ == "FileNotFoundError":
            pytest.skip(".avro file is not found")
        else:
            print(type(excpr).__name__)

    expect = pd.DataFrame.from_records(reader)
    got = cudf.read_avro(path, engine=engine, columns=columns)

    # PANDAS uses NaN to represent invalid data, which forces float dtype
    # For comparison, we can replace NaN with 0 and cast to the cuDF dtype
    # FASTAVRO produces int64 columns from avro int32 dtype, so convert
    # it back to int32 here
    for col in expect.columns:
        expect[col] = expect[col].astype(got[col].dtype)

    assert_eq(expect, got, check_categorical=False)
def createZonemaps(filename, output, startID):
    listPT = [0.0 for x in range(0, Constants.IMPRINTS_NUM_PT)]
    with open(filename, 'rb') as fo:
        reader = avro.reader(fo)
        schema = reader.schema
        counter = 0
        print "Create zonemaps..."
        print "Input file: " + str(filename)
        for record in reader:
            tempMuon = len(record['Muon'])
            if (tempMuon > 0):
                for j in range(0, tempMuon):
                    temppt = Decimal(record['Muon'][j]['pt'])
                    listPT[counter] = temppt
                    counter += 1
                    if (counter == Constants.IMPRINTS_NUM_PT):
                        writeZonemaps(listPT, output, startID)
                        counter = 0
                        startID += 1
    print "Finish!"
    return startID