Пример #1
0
def _load_token_database(db, domain: Pattern[str]) -> tokens.Database:
    """Builds a token Database from whichever kind of source ``db`` is.

    Args:
        db: None, a tokens.Database, an elf_reader.Elf, a str or Path naming
            an existing file, or a file object.
        domain: Passed through to _database_from_elf whenever the source is
            handled as an ELF.

    Returns:
        An empty Database for None, ``db`` itself when it already is a
        Database, and otherwise a database built from the ELF, from the file
        path (via tokens.DatabaseFile), or from CSV parsed out of the file
        object.

    Raises:
        FileNotFoundError: ``db`` is a str or Path that does not exist.
    """
    # Sources that need no file access at all.
    if db is None:
        return tokens.Database()
    if isinstance(db, tokens.Database):
        return db
    if isinstance(db, elf_reader.Elf):
        return _database_from_elf(db, domain)

    # A str or Path is treated as a filesystem location.
    if isinstance(db, (str, Path)):
        if os.path.exists(db):
            # Try the file as an ELF first.
            with open(db, 'rb') as candidate:
                if elf_reader.compatible_file(candidate):
                    return _database_from_elf(candidate, domain)

            # Not an ELF: let DatabaseFile read it as a packed binary or CSV.
            return tokens.DatabaseFile(db)

        raise FileNotFoundError(
            f'"{db}" is not a path to a token database')

    # Anything else is assumed to be a file object; check for an ELF first.
    is_elf_stream = elf_reader.compatible_file(db)
    if is_elf_stream:
        return _database_from_elf(db, domain)

    # Prefer re-reading through the stream's backing path when it has one
    # that exists on disk (CSV or packed binary).
    if hasattr(db, 'name') and os.path.exists(db.name):
        return tokens.DatabaseFile(db.name)

    # Last resort: parse CSV straight from the file object.
    csv_entries = tokens.parse_csv(db)
    return tokens.Database(csv_entries)
Пример #2
0
def _read_elf_with_domain(elf: str,
                          domain: Pattern[str]) -> Iterable[tokens.Database]:
    """Yields one database for every file that ``elf`` expands to.

    Args:
        elf: A path or glob, expanded with expand_paths_or_globs.
        domain: Passed to _database_from_elf for each file; also quoted in
            the error message.

    Yields:
        The result of _database_from_elf for each expanded file. The file is
        still open while the yielded value is being consumed.

    Raises:
        ValueError: An expanded file is not accepted by
            elf_reader.compatible_file.
    """
    for elf_path in expand_paths_or_globs(elf):
        with elf_path.open('rb') as elf_file:
            if elf_reader.compatible_file(elf_file):
                yield _database_from_elf(elf_file, domain)
            else:
                # Note: the message names the original argument, not the
                # individual expanded path.
                raise ValueError(f'{elf} is not an ELF file, '
                                 f'but the "{domain}" domain was specified')
Пример #3
0
def expand_paths_or_globs(*paths_or_globs: str) -> Iterable[Path]:
    """Turns a mix of literal paths and glob patterns into Path objects.

    Args:
        *paths_or_globs: Strings that are either existing paths or glob
            patterns.

    Yields:
        A Path for each argument that exists on disk as given. For any other
        argument, a Path for each recursive glob match that
        elf_reader.compatible_file accepts or whose name ends in '.csv'.

    Raises:
        FileNotFoundError: An argument does not exist, matches nothing, and
            contains none of the characters '*?[]!'.
    """
    glob_characters = '*?[]!'

    for candidate in paths_or_globs:
        # An existing path is returned verbatim and never treated as a glob.
        if os.path.exists(candidate):
            yield Path(candidate)
            continue

        matches = glob.glob(candidate, recursive=True)

        # Nothing matched and the text does not look like a pattern, so the
        # caller most likely mistyped a plain path.
        if not (matches or any(ch in candidate for ch in glob_characters)):
            raise FileNotFoundError(f'{candidate} is not a valid path')

        # Keep only the matches that are compatible binaries or CSV files.
        for match in matches:
            if elf_reader.compatible_file(match) or match.endswith('.csv'):
                yield Path(match)
Пример #4
0
def generate_reports(paths: Iterable[Path]) -> _DatabaseReport:
    """Summarizes every database found at the given paths.

    Args:
        paths: Files to inspect; each is opened in binary mode.

    Returns:
        A dict keyed by ``str(path)``. Each value is a dict that maps a
        domain to the database_summary of the database loaded from that path
        for that domain. Files rejected by elf_reader.compatible_file get a
        single entry under the empty-string domain.
    """
    reports: _DatabaseReport = {}

    for path in paths:
        # Only the domain list is read here; the file is closed again before
        # the database itself is loaded from the path.
        with path.open('rb') as file:
            is_elf = elf_reader.compatible_file(file)
            domains = list(tokenization_domains(file)) if is_elf else ['']

        reports[str(path)] = {
            domain: database_summary(load_token_database(path, domain=domain))
            for domain in domains
        }

    return reports
Пример #5
0
def _handle_report(token_database_or_elf, output):
    """Writes a text report for every database path to ``output``.

    Args:
        token_database_or_elf: Iterable of path objects providing ``open``
            and ``name``.
        output: Object with a ``write`` method that receives the report text.

    One section is written per (path, domain) pair. Files accepted by
    elf_reader.compatible_file contribute one section per domain from
    tokenization_domains; any other file contributes one section whose
    domain is the file's name.
    """
    section_template = (
        '[{name}]\n'
        '                 Domain: {domain}\n'
        '        Entries present: {present_entries}\n'
        '        Size of strings: {present_size_bytes} B\n'
        '          Total entries: {total_entries}\n'
        '  Total size of strings: {total_size_bytes} B\n'
        '             Collisions: {collisions} tokens\n')

    for path in token_database_or_elf:
        # The file is only opened to discover its domains; the database is
        # loaded from the path afterwards.
        with path.open('rb') as file:
            is_elf = elf_reader.compatible_file(file)
            domains = (list(tokenization_domains(file))
                       if is_elf else [path.name])

        for domain in domains:
            # generate_report supplies the remaining template fields.
            output.write(
                section_template.format(
                    name=path,
                    domain=domain,
                    **generate_report(
                        load_token_database(path, domain=domain))))
Пример #6
0
 def test_compatible_file_for_invalid_archive(self):
     # A stream holding only b'!<arch>' (no trailing newline) is expected to
     # be reported as incompatible.
     truncated_header = io.BytesIO(b'!<arch>')
     self.assertFalse(elf_reader.compatible_file(truncated_header))
Пример #7
0
 def test_compatible_file_for_archive(self):
     # A stream containing just b'!<arch>\n' is expected to be accepted.
     header_only = io.BytesIO(b'!<arch>\n')
     self.assertTrue(elf_reader.compatible_file(header_only))

     # The archive fixture held by the test case is expected to be accepted.
     archive_fixture = self._archive
     self.assertTrue(elf_reader.compatible_file(archive_fixture))
Пример #8
0
 def test_compatible_file_for_invalid_elf(self):
     # Bytes that start with b'\x7fEL' but do not continue as b'\x7fELF' are
     # expected to be reported as incompatible.
     bogus_header = io.BytesIO(b'\x7fELVESF')
     self.assertFalse(elf_reader.compatible_file(bogus_header))
Пример #9
0
 def test_compatible_file_for_elf_start_at_offset(self):
     # Move the stream off its start so the check begins out of sync.
     start_offset = 13
     self._elf_file.seek(start_offset)

     self.assertTrue(elf_reader.compatible_file(self._elf_file))

     # The stream position is expected to be unchanged by the check.
     self.assertEqual(start_offset, self._elf_file.tell())
Пример #10
0
 def test_compatible_file_for_elf(self):
     # The ELF fixture held by the test case is expected to be accepted.
     elf_fixture = self._elf_file
     self.assertTrue(elf_reader.compatible_file(elf_fixture))

     # A stream containing only b'\x7fELF' is expected to be accepted too.
     magic_only = io.BytesIO(b'\x7fELF')
     self.assertTrue(elf_reader.compatible_file(magic_only))