import json
import os
import subprocess

import dictdiffer
import fastavro


def concat_avro_files(input_paths: list, output_path: str,
                      avro_tools_path: str = None) -> None:
    """Concatenate Avro files using avro-tools jar utility."""
    # _get_avro_tools_cli is a project helper that resolves the
    # `java -jar avro-tools.jar` command line.
    avro_tools_cli = _get_avro_tools_cli(avro_tools_path)
    first_input_schema = None
    default_subprocess_kwargs = dict(shell=True,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)
    for file_path in input_paths:
        if not fastavro.is_avro(file_path):
            raise ValueError(f'Input {file_path!r} is not an Avro file')
        print(f'Reading schema from {file_path!r}...')
        getmeta = subprocess.run(
            f'{avro_tools_cli} getmeta {file_path} --key avro.schema',
            **default_subprocess_kwargs)
        getmeta.check_returncode()
        if getmeta.stderr:
            print(f'getmeta.stderr:\n{getmeta.stderr.decode()}')
        avro_schema = json.loads(getmeta.stdout.decode())
        if not first_input_schema:
            first_input_schema = avro_schema
        else:
            assert avro_schema == first_input_schema
    print(f'Concatenating Avro files into {output_path!r}...')
    concat = subprocess.run(
        f'{avro_tools_cli} concat {" ".join(input_paths)} {output_path}',
        **default_subprocess_kwargs)
    concat.check_returncode()
    if not os.path.isfile(output_path):
        raise AssertionError(
            f'{output_path!r} was not created '
            f'(avro-tools stdout: {concat.stdout.decode()!r})')
    print(f'Checking schema in output file {output_path!r}...')
    getmeta = subprocess.run(
        f'{avro_tools_cli} getmeta {output_path} --key avro.schema',
        **default_subprocess_kwargs)
    getmeta.check_returncode()
    output_schema = json.loads(getmeta.stdout.decode())
    diffs = list(dictdiffer.diff(output_schema, first_input_schema))
    if diffs:
        print('Differences in output vs input schema:')
        for diff in diffs:
            print(diff)
        # Metadata may differ, but the field definitions must match exactly.
        assert list(dictdiffer.diff(output_schema['fields'],
                                    first_input_schema['fields'])) == []
    else:
        print('Input/output schema identical')

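# Hedged usage sketch for concat_avro_files: the shard names and the
# avro-tools jar location below are hypothetical.
concat_avro_files(
    input_paths=['part-0001.avro', 'part-0002.avro'],  # hypothetical inputs
    output_path='merged.avro',
    avro_tools_path='/opt/avro-tools.jar',  # hypothetical jar location
)
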
def is_match(cls, file_path, options=None):
    """
    Test the given file to check whether it is in valid AVRO format.

    :param file_path: path to the file to be examined
    :type file_path: str
    :param options: avro read options
    :type options: dict
    :return: True if the file is a valid Avro file
    :rtype: bool
    """
    if options is None:
        options = dict()
    # get current position of stream
    if data_utils.is_stream_buffer(file_path):
        starting_location = file_path.tell()
    is_valid_avro = fastavro.is_avro(file_path)
    # return to original position in stream
    if data_utils.is_stream_buffer(file_path):
        file_path.seek(starting_location, 0)
    return is_valid_avro

def _load_avro_files(self, ext_path: str = None):
    """
    Load Avro alert data

    Parameters
    ----------
    ext_path: str, optional
        If not None, explicitly load data under `ext_path`.
        Default is None (self.path is used).
    """
    if ext_path is not None:
        path = ext_path
    else:
        path = self.path

    if isinstance(path, list):
        self.filenames = path
    elif os.path.isdir(path):
        self.filenames = glob.glob(os.path.join(path, '*.avro'))
    elif path == '':
        print('WARNING: path to avro files is empty')
        self.filenames = []
    elif fastavro.is_avro(path):
        self.filenames = [path]
    else:
        msg = """
        Data path not understood: {}

        You must give an avro file with its extension (.avro),
        or a folder with avro files.
        """.format(path)
        raise IOError(msg)

import glob
from typing import Iterable

import fastavro


def read_avro_alerts(data_path: str) -> Iterable[dict]:
    """
    Read avro alert files and return an iterable with dicts of alert data

    Parameters
    ----------
    data_path: str
        a directory path where to look for avro alert files

    Returns
    -------
    record: Iterable
        a generator that yields records (dict) after reading avro files
        in the given directory
    """
    avro_files = glob.glob(data_path + '/*.avro')

    for avro_file in avro_files:
        # skip anything that is not a valid avro file
        if not fastavro.is_avro(avro_file):
            continue
        with open(avro_file, 'rb') as f:
            reader = fastavro.reader(f)
            # each alert file is expected to hold a single record
            record = next(reader)
            yield record

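# Hedged usage sketch for read_avro_alerts; 'alerts' is a hypothetical
# directory containing .avro alert files.
for alert in read_avro_alerts('alerts'):
    print(list(alert.keys()))  # inspect the fields of each alert record
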
import fastavro
import pandas


def get_dataframe_from_avro(input_path: str) -> pandas.DataFrame:
    """Create a DataFrame from an Avro file (in-memory, mind your sizes)."""
    if not fastavro.is_avro(input_path):
        raise ValueError(f'Input {input_path!r} is not an Avro file')
    with open(input_path, 'rb') as avro_file:
        avro_reader = fastavro.reader(avro_file)
        df = pandas.DataFrame.from_records(list(avro_reader))
    return df

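# Hedged usage sketch for get_dataframe_from_avro; 'events.avro' is a
# hypothetical input file.
df = get_dataframe_from_avro('events.avro')
print(df.head())
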
def is_match(cls, file_path, options=None):
    """
    Test the given file to check whether it is in valid AVRO format.

    :param file_path: path to the file to be examined
    :type file_path: str
    :param options: avro read options
    :type options: dict
    :return: True if the file is a valid Avro file
    :rtype: bool
    """
    is_valid_avro = fastavro.is_avro(file_path)
    return is_valid_avro

import subprocess

import fastavro


def sample_avro_file(input_paths: list, output_path: str, limit: int,
                     sample_rate: float = 0.5,
                     avro_tools_path: str = None) -> None:
    """Sample records from an Avro file using avro-tools jar utility."""
    # _get_avro_tools_cli is a project helper (see concat_avro_files above).
    avro_tools_cli = _get_avro_tools_cli(avro_tools_path)
    default_subprocess_kwargs = dict(shell=True,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)
    for input_path in input_paths:
        if not fastavro.is_avro(input_path):
            raise ValueError(f'Input {input_path!r} is not an Avro file')
    sample_cmd = (f'{avro_tools_cli} cat --limit {limit} '
                  f'--samplerate {sample_rate} {" ".join(input_paths)} '
                  f'{output_path}')
    print(f'Sampling: {sample_cmd!r}...')
    cat = subprocess.run(sample_cmd, **default_subprocess_kwargs)
    cat.check_returncode()
    print(f'Result: {cat.stdout.decode()!r}')

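# Hedged usage sketch for sample_avro_file: paths are hypothetical; this
# keeps at most 1000 records, sampling roughly 10% of the input.
sample_avro_file(
    input_paths=['big-dataset.avro'],  # hypothetical input
    output_path='sample.avro',
    limit=1000,
    sample_rate=0.1,
)
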
from glob import iglob

import fastavro

# data_dir is defined elsewhere in the test module and points at a
# directory of .avro test fixtures.


def test_is_avro_fo():
    for path in iglob('%s/*.avro' % data_dir):
        with open(path, 'rb') as fp:
            assert fastavro.is_avro(fp)
    with open(__file__, 'rb') as fp:
        assert not fastavro.is_avro(fp)

def test_is_avro_str():
    for path in iglob('%s/*.avro' % data_dir):
        assert fastavro.is_avro(path)
    assert not fastavro.is_avro(__file__)

import os
import pathlib
import sys
from collections.abc import Hashable

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from fastavro import is_avro, reader

# `file_crawler` and `log` are the surrounding project's helpers: a file
# tree walker and a configured logger.


def convert(in_path, out_path, dedup_threshold):
    """
    Convert .avro files in in_path into .parquet files in out_path.

    :param in_path: The input path for the .avro files.
    :type in_path: str
    :param out_path: The output path to write .parquet files.
    :type out_path: str
    :param dedup_threshold: The duplication percentage for dictionary compression.
    :type dedup_threshold: float
    """
    for avro_file_path in file_crawler.crawl(in_path):
        log.info(f"Opening Avro file {avro_file_path}")
        if not is_avro(str(avro_file_path)):
            log.error(f"error: {avro_file_path} is not an Avro file")
            sys.exit(1)
        with open(avro_file_path, "rb") as open_file:
            avro_data = reader(open_file)
            # Get the ordered list of field names from the avro schema
            avro_file_schema = avro_data.metadata['avro.schema']
            log.debug(f"avro_file_schema: {avro_file_schema}")
            avro_schema = avro_data.writer_schema
            log.debug(f"avro_schema: {avro_schema}")
            # Read Avro file into Pandas dataframe
            data_frame = pd.DataFrame(
                data=avro_data,
                # Preserve column ordering
                columns=[x['name'] for x in avro_schema['fields']])
            log.debug(f"Data Frame info: {data_frame}")
            # Get a list of columns with hashable types
            log.debug(f"All Columns: {[x for x in data_frame.columns]}")
            hashable_cols = [
                x for x in data_frame.columns
                if isinstance(data_frame[x][0], Hashable)
            ]
            log.debug(f"Hashable columns from the data_frame: {hashable_cols}")
            # Find columns whose duplication ratio exceeds dedup_threshold,
            # for use with dictionary encoding
            dupcols = [
                x.encode('UTF-8') for x in hashable_cols
                if (data_frame[x].duplicated().sum() /
                    (int(data_frame[x].size) - 1)) > dedup_threshold
            ]
            log.debug(f"Columns to dedup: {dupcols}")
            table = pa.Table.from_pandas(data_frame).replace_schema_metadata({
                'parquet.avro.schema': avro_file_schema,
                'writer.model.name': 'avro'
            })
            parts = avro_file_path.parts
            parquet_file_path = pathlib.Path(os.path.join(out_path, *parts[3:]))
            parquet_file_path.parent.mkdir(parents=True, exist_ok=True)
            parquet_file_path = os.path.splitext(parquet_file_path)[0] + '.parquet'
            log.info(f"Writing parquet file: {parquet_file_path}")
            pq.write_table(table,
                           parquet_file_path,
                           compression='gzip',
                           use_dictionary=dupcols,
                           compression_level=5,
                           coerce_timestamps='ms',
                           allow_truncated_timestamps=False)

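# Hedged usage sketch for convert; the directories and the 0.3 duplication
# threshold are hypothetical. Note that convert() rebuilds output locations
# from the fourth path component onward (parts[3:]), so the actual layout
# under out_path depends on the depth of the input tree.
convert('/data/avro', '/data/parquet', dedup_threshold=0.3)
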
from fastavro import reader, is_avro

file_path = "/Users/vpeche/tmp/avro/000000000000.AVRO"
print("File is AVRO: {}".format(is_avro(file_path)))

with open(file_path, 'rb') as fo:
    avro_reader = reader(fo)
    for record in avro_reader:
        print(record)

def __init__(self, data, schema=None):
    """
    :param data: dict, list of dicts, JSON str, file, bytes
    :param schema: dict
    """
    self._last_error = None  # Last error captured
    self._object_data = None
    self._json_data = None
    self._avro_data = None
    self._origin = None
    self._schema = None
    self._schema_origin = None
    self._ok = False

    # Resolve the schema: accept a parsed dict or a string that
    # AvroTools.fetch_json (a project helper) can turn into JSON text.
    if schema is None:
        self._schema = None
    elif isinstance(schema, str):
        try:
            success, schema, origin = AvroTools.fetch_json(schema)
            if success:
                schema = json.loads(schema)
                self._schema_origin = origin
            else:
                schema = None
        except Exception as e:
            self._last_error = str(e)
            schema = None

    if schema is not None:
        try:
            self._schema = parse_schema(schema)
            if self._schema_origin is None:
                self._schema_origin = type(schema).__name__
        except Exception as e:
            self._last_error = str(e)
            schema = None

    if isinstance(data, bytes):
        b_avro = False
        try:
            bdata = io.BytesIO(data)
            if is_avro(bdata):
                self._origin = 'binary_avro'
                bdata.seek(0)
                b_avro = True
                avro_reader = reader(bdata)
                self._schema = avro_reader.schema
                obj_data = []
                for record in avro_reader:
                    obj_data.append(record)
                # Unwrap single-record payloads; keep lists otherwise
                if len(obj_data) == 0:
                    self._object_data = None
                elif len(obj_data) == 1:
                    self._object_data = obj_data[0]
                else:
                    self._object_data = obj_data
                self._ok = True
            else:
                # Not Avro: treat the bytes as an encoded string and fall
                # through to the str branch below.
                self._origin = 'binary_string'
                data = data.decode('utf-8')
        except Exception as e:
            self._last_error = ('Avro binary' if b_avro
                                else 'String decoding') + f' error: {e}'

    if isinstance(data, str):
        success, json_data, origin = AvroTools.fetch_json(data)
        if not self._origin:
            self._origin = origin
        if not success:
            self._last_error = json_data
            return
        try:
            self._object_data = json.loads(json_data)
            self._json_data = json_data
            if self._schema is None:
                self._ok = True
        except Exception as e:
            self._last_error = f'JSON parsing error: {e}'
    elif isinstance(data, (dict, list)):
        self._origin = type(data).__name__
        self._object_data = data
        if self._schema is None:
            self._ok = True

    if self._object_data is not None and not self._ok and self._schema is not None:
        try:
            validate(self._object_data, self._schema)
            self._ok = True
        except Exception as e:
            self._last_error = f'Schema error: {e}'