def _load_token_database(db, domain: Pattern[str]) -> tokens.Database:
    """Loads a Database from a database object, ELF, CSV, or binary database."""
    # No database given: start from an empty one.
    if db is None:
        return tokens.Database()

    # Already a loaded database: use it as-is.
    if isinstance(db, tokens.Database):
        return db

    # A parsed ELF object: extract the entries for the requested domain.
    if isinstance(db, elf_reader.Elf):
        return _database_from_elf(db, domain)

    # A path: sniff the file contents to decide between ELF and CSV/binary.
    if isinstance(db, (str, Path)):
        if not os.path.exists(db):
            raise FileNotFoundError(
                f'"{db}" is not a path to a token database')

        with open(db, 'rb') as handle:
            if elf_reader.compatible_file(handle):
                return _database_from_elf(handle, domain)

        # Not an ELF; treat the path as a packed binary or CSV database.
        return tokens.DatabaseFile(db)

    # Otherwise assume a file object; it may still hold an ELF image.
    if elf_reader.compatible_file(db):
        return _database_from_elf(db, domain)

    # A file object backed by a real path: load CSV/binary from that path.
    if hasattr(db, 'name') and os.path.exists(db.name):
        return tokens.DatabaseFile(db.name)

    # Fall back to parsing the file object's contents directly as CSV.
    return tokens.Database(tokens.parse_csv(db))
def _read_elf_with_domain(elf: str,
                          domain: Pattern[str]) -> Iterable[tokens.Database]:
    """Yields a token database for each ELF file matching a path or glob.

    Raises:
        ValueError: a matched file is not an ELF-compatible file.
    """
    for elf_path in expand_paths_or_globs(elf):
        with elf_path.open('rb') as elf_file:
            if elf_reader.compatible_file(elf_file):
                yield _database_from_elf(elf_file, domain)
            else:
                raise ValueError(f'{elf} is not an ELF file, '
                                 f'but the "{domain}" domain was specified')
def expand_paths_or_globs(*paths_or_globs: str) -> Iterable[Path]:
    """Expands any globs in a list of paths; raises FileNotFoundError.

    Literal paths are yielded as-is; glob patterns are expanded, keeping only
    CSV files and ELF-compatible binaries.

    Raises:
        FileNotFoundError: an argument is neither an existing path nor a glob.
    """
    for path_or_glob in paths_or_globs:
        if os.path.exists(path_or_glob):
            # This is a valid path; yield it without evaluating it as a glob.
            yield Path(path_or_glob)
        else:
            paths = glob.glob(path_or_glob, recursive=True)

            # If no paths were found and the path is not a glob, raise an Error.
            if not paths and not any(c in path_or_glob for c in '*?[]!'):
                raise FileNotFoundError(f'{path_or_glob} is not a valid path')

            # Resolve globs to CSV or compatible binary files.
            for path in paths:
                if path.endswith('.csv'):
                    yield Path(path)
                else:
                    # Fix: compatible_file() expects an open binary file
                    # object (as used elsewhere in this file), not a path
                    # string, so open the file before checking it.
                    with open(path, 'rb') as fd:
                        if elf_reader.compatible_file(fd):
                            yield Path(path)
def generate_reports(paths: Iterable[Path]) -> _DatabaseReport:
    """Returns a dictionary with information about the provided databases."""
    reports: _DatabaseReport = {}

    for path in paths:
        # ELF files may carry multiple tokenization domains; every other
        # database format gets the single default domain.
        with path.open('rb') as fd:
            if elf_reader.compatible_file(fd):
                domains = list(tokenization_domains(fd))
            else:
                domains = ['']

        reports[str(path)] = {
            domain: database_summary(load_token_database(path, domain=domain))
            for domain in domains
        }

    return reports
def _handle_report(token_database_or_elf, output):
    """Writes a human-readable summary for each database or ELF file.

    Args:
        token_database_or_elf: iterable of Paths to databases or ELF files.
        output: text stream to which the per-domain report is written.
    """
    for path in token_database_or_elf:
        with path.open('rb') as file:
            if elf_reader.compatible_file(file):
                domains = list(tokenization_domains(file))
            else:
                # Non-ELF databases have only the default domain. Use '' for
                # consistency with generate_reports(); previously path.name
                # was used, which mislabeled the report's domain and passed a
                # filename as the domain pattern to load_token_database().
                domains = ['']

        for domain in domains:
            output.write(
                '[{name}]\n'
                ' Domain: {domain}\n'
                ' Entries present: {present_entries}\n'
                ' Size of strings: {present_size_bytes} B\n'
                ' Total entries: {total_entries}\n'
                ' Total size of strings: {total_size_bytes} B\n'
                ' Collisions: {collisions} tokens\n'.format(
                    name=path,
                    domain=domain,
                    **generate_report(load_token_database(path,
                                                          domain=domain))))
def test_compatible_file_for_invalid_archive(self):
    """An archive magic missing its trailing newline is not compatible."""
    truncated_magic = io.BytesIO(b'!<arch>')
    self.assertFalse(elf_reader.compatible_file(truncated_magic))
def test_compatible_file_for_archive(self):
    """The full 8-byte archive magic and a real archive are compatible."""
    magic_only = io.BytesIO(b'!<arch>\n')
    self.assertTrue(elf_reader.compatible_file(magic_only))
    self.assertTrue(elf_reader.compatible_file(self._archive))
def test_compatible_file_for_invalid_elf(self):
    """Data that merely resembles the ELF magic is rejected."""
    near_miss = io.BytesIO(b'\x7fELVESF')
    self.assertFalse(elf_reader.compatible_file(near_miss))
def test_compatible_file_for_elf_start_at_offset(self):
    """compatible_file checks from the start and restores the position."""
    offset = 13
    self._elf_file.seek(offset)  # Seek ahead to get out of sync
    self.assertTrue(elf_reader.compatible_file(self._elf_file))
    # The file position must be restored after the compatibility check.
    self.assertEqual(offset, self._elf_file.tell())
def test_compatible_file_for_elf(self):
    """Both a real ELF file and the bare 4-byte ELF magic are compatible."""
    self.assertTrue(elf_reader.compatible_file(self._elf_file))
    bare_magic = io.BytesIO(b'\x7fELF')
    self.assertTrue(elf_reader.compatible_file(bare_magic))