def _load_token_database(db, domain: Pattern[str]) -> tokens.Database:
    """Loads a Database from a database object, ELF, CSV, or binary database."""
    if db is None:
        return tokens.Database()

    if isinstance(db, tokens.Database):
        return db

    if isinstance(db, elf_reader.Elf):
        return _database_from_elf(db, domain)

    # If it's a str, it might be a path. Check if it's an ELF or CSV.
    if isinstance(db, (str, Path)):
        if not os.path.exists(db):
            raise FileNotFoundError(
                f'"{db}" is not a path to a token database')

        # Read the path as an ELF file.
        with open(db, 'rb') as fd:
            if elf_reader.compatible_file(fd):
                return _database_from_elf(fd, domain)

        # Read the path as a packed binary or CSV file.
        return tokens.DatabaseFile(db)

    # Assume that it's a file object and check if it's an ELF.
    if elf_reader.compatible_file(db):
        return _database_from_elf(db, domain)

    # Read the database as CSV or packed binary from a file object's path.
    if hasattr(db, 'name') and os.path.exists(db.name):
        return tokens.DatabaseFile(db.name)

    # Read CSV directly from the file object.
    return tokens.Database(tokens.parse_csv(db))
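# A minimal usage sketch for the loader above, illustrative only: it assumes
# this function and the tokens module are importable as in pw_tokenizer, and
# 'example_tokens.csv' is a hypothetical path.
import re

_ANY_DOMAIN = re.compile('.*')

# None yields a fresh, empty database; an existing Database is returned as-is.
assert len(_load_token_database(None, _ANY_DOMAIN).entries()) == 0

_db = tokens.Database()
assert _load_token_database(_db, _ANY_DOMAIN) is _db

# A str or Path is treated as a file path; a missing path raises FileNotFoundError.
# _load_token_database('example_tokens.csv', _ANY_DOMAIN)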
def test_simple(self):
    detok = detokenize.Detokenizer(
        tokens.Database([
            tokens.TokenizedStringEntry(0xcdab, '%02d %s %c%%',
                                        dt.datetime.now())
        ]))
    # Payload: 4-byte little-endian token 0xcdab, then the encoded arguments.
    self.assertEqual(str(detok.detokenize(b'\xab\xcd\0\0\x02\x03Two\x66')),
                     '01 Two 3%')
def _database_from_elf(elf, domain: Pattern[str]) -> tokens.Database:
    """Reads the tokenized strings from an elf_reader.Elf or ELF file object."""
    _LOG.debug('Reading tokenized strings in domain "%s" from %s', domain, elf)

    reader = _elf_reader(elf)

    # Read tokenized string entries.
    section_data = reader.dump_section_contents(_TOKENIZED_ENTRY_SECTIONS)
    if section_data is not None:
        return tokens.Database(_read_tokenized_entries(section_data, domain))

    # Read legacy null-terminated string entries.
    sections = reader.dump_sections(_LEGACY_STRING_SECTIONS)
    if sections:
        return tokens.Database.merged(
            *_read_tokenized_strings(sections, domain))

    return tokens.Database([])
def setUp(self):
    super().setUp()
    self.detok = detokenize.Detokenizer(
        tokens.Database([
            tokens.TokenizedStringEntry(0, '$AAAAAA=='),  # token for 0
            tokens.TokenizedStringEntry(1, '$AgAAAA=='),  # token for 2
            tokens.TokenizedStringEntry(2, '$AwAAAA=='),  # token for 3
            tokens.TokenizedStringEntry(3, '$AgAAAA=='),  # token for 2
        ]))
def setUp(self):
    self.db = tokens.Database([
        tokens.TokenizedStringEntry(1, 'Luke'),
        tokens.TokenizedStringEntry(2, 'Leia'),
        tokens.TokenizedStringEntry(2, 'Darth Vader'),
        tokens.TokenizedStringEntry(2, 'Emperor Palpatine'),
        tokens.TokenizedStringEntry(3, 'Han'),
        tokens.TokenizedStringEntry(4, 'Chewbacca'),
        tokens.TokenizedStringEntry(5, 'Darth Maul'),
        tokens.TokenizedStringEntry(6, 'Han Solo'),
    ])
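def test_token_to_entries_keeps_collisions(self):
    # Illustrative check, not part of the original suite: token_to_entries is
    # the same lookup used by the merge tests below, and colliding entries for
    # a single token (here token 2) are all retained.
    self.assertEqual(len(self.db.token_to_entries[2]), 3)
    self.assertEqual(len(self.db.token_to_entries[1]), 1)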
def test_unparsed_data(self):
    detok = detokenize.Detokenizer(
        tokens.Database([
            tokens.TokenizedStringEntry(1, 'no args',
                                        dt.datetime(100, 1, 1)),
        ]))

    result = detok.detokenize(b'\x01\0\0\0o_o')
    self.assertFalse(result.ok())
    self.assertEqual('no args', str(result))
    self.assertIn('o_o', repr(result))
    self.assertIn('decoding failed', result.error_message())
def test_detokenize_missing_data_with_errors_is_unsuccessful(self):
    detok = detokenize.Detokenizer(
        tokens.Database(
            [tokens.TokenizedStringEntry(2, '%s', dt.datetime(1, 1, 1))]),
        show_errors=True)

    result = detok.detokenize(b'\x02\0\0\0')
    string, args, remaining = result.failures[0]
    self.assertIn('%s MISSING', string)
    self.assertEqual(len(args), 1)
    self.assertEqual(b'', remaining)
    self.assertEqual(len(result.failures), 1)
    self.assertIn('%s MISSING', str(result))
def test_merge_multiple(self):
    db = tokens.Database.merged(
        tokens.Database(
            [tokens.TokenizedStringEntry(1, 'one', datetime.datetime.max)]),
        tokens.Database(
            [tokens.TokenizedStringEntry(2, 'two', datetime.datetime.min)]),
        tokens.Database(
            [tokens.TokenizedStringEntry(1, 'one', datetime.datetime.min)]))
    self.assertEqual({str(e) for e in db.entries()}, {'one', 'two'})

    db.merge(
        tokens.Database(
            [tokens.TokenizedStringEntry(4, 'four', datetime.datetime.max)]),
        tokens.Database(
            [tokens.TokenizedStringEntry(2, 'two', datetime.datetime.max)]),
        tokens.Database(
            [tokens.TokenizedStringEntry(3, 'three', datetime.datetime.min)]))
    self.assertEqual({str(e) for e in db.entries()},
                     {'one', 'two', 'three', 'four'})
def test_detokenize_extra_data_is_unsuccessful(self):
    detok = detokenize.Detokenizer(
        tokens.Database([
            tokens.TokenizedStringEntry(1, 'no args', dt.datetime(1, 1, 1))
        ]))

    result = detok.detokenize(b'\x01\0\0\0\x04args')
    self.assertEqual(len(result.failures), 1)
    string, args, remaining = result.failures[0]
    self.assertEqual('no args', string)
    self.assertFalse(args)
    self.assertEqual(b'\x04args', remaining)
    self.assertEqual('no args', str(result))
def test_detokenize_missing_data_is_unsuccessful(self):
    detok = detokenize.Detokenizer(
        tokens.Database([
            tokens.TokenizedStringEntry(2, '%s',
                                        date_removed=dt.datetime(1, 1, 1))
        ]))

    result = detok.detokenize(b'\x02\0\0\0')
    string, args, remaining = result.failures[0]
    self.assertEqual('%s', string)
    self.assertEqual(len(args), 1)
    self.assertEqual(b'', remaining)
    self.assertEqual(len(result.failures), 1)
    self.assertEqual('%s', str(result))
def test_add(self):
    db = tokens.Database()
    db.add(_entries('MILK', 'apples'))
    self.assertEqual({e.string for e in db.entries()}, {'MILK', 'apples'})

    db.add(_entries('oranges', 'CHEESE', 'pears'))
    self.assertEqual(len(db.entries()), 5)

    db.add(_entries('MILK', 'apples', 'only this one is new'))
    self.assertEqual(len(db.entries()), 6)

    db.add(_entries('MILK'))
    self.assertEqual({e.string for e in db.entries()}, {
        'MILK', 'apples', 'oranges', 'CHEESE', 'pears', 'only this one is new'
    })
def setUp(self):
    super().setUp()
    token = 0xbaad

    # Database with several conflicting tokens.
    self.detok = detokenize.Detokenizer(tokens.Database([
        tokens.TokenizedStringEntry(token, 'REMOVED', dt.datetime(9, 1, 1)),
        tokens.TokenizedStringEntry(token, 'newer'),
        tokens.TokenizedStringEntry(token, 'A: %d', dt.datetime(30, 5, 9)),
        tokens.TokenizedStringEntry(token, 'B: %c', dt.datetime(30, 5, 10)),
        tokens.TokenizedStringEntry(token, 'C: %s'),
        tokens.TokenizedStringEntry(token, '%d%u'),
        tokens.TokenizedStringEntry(token, '%s%u %d'),
        tokens.TokenizedStringEntry(1, '%s'),
        tokens.TokenizedStringEntry(1, '%d'),
        tokens.TokenizedStringEntry(2, 'Three %s %s %s'),
        tokens.TokenizedStringEntry(2, 'Five %d %d %d %d %s'),
    ]))  # yapf: disable
def test_merge_multiple_databases_in_one_call(self):
    """Tests the merge and merged methods with multiple databases."""
    db = tokens.Database.merged(
        tokens.Database([
            tokens.TokenizedStringEntry(
                1, 'one', date_removed=datetime.datetime.max)
        ]),
        tokens.Database([
            tokens.TokenizedStringEntry(
                2, 'two', date_removed=datetime.datetime.min)
        ]),
        tokens.Database([
            tokens.TokenizedStringEntry(
                1, 'one', date_removed=datetime.datetime.min)
        ]))
    self.assertEqual({str(e) for e in db.entries()}, {'one', 'two'})

    db.merge(
        tokens.Database([
            tokens.TokenizedStringEntry(
                4, 'four', date_removed=datetime.datetime.max)
        ]),
        tokens.Database([
            tokens.TokenizedStringEntry(
                2, 'two', date_removed=datetime.datetime.max)
        ]),
        tokens.Database([
            tokens.TokenizedStringEntry(
                3, 'three', date_removed=datetime.datetime.min)
        ]))
    self.assertEqual({str(e) for e in db.entries()},
                     {'one', 'two', 'three', 'four'})
def read_db_from_csv(csv_str: str) -> tokens.Database:
    with io.StringIO(csv_str) as csv_db:
        return tokens.Database(tokens.parse_csv(csv_db))
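# Illustrative round trip through read_db_from_csv. The entry is made up; the
# CSV layout (hex token, date-removed column, quoted string) is assumed to
# match the token database format that tokens.parse_csv expects.
_EXAMPLE_CSV = '0000000a,,"Hello world"\n'
assert len(read_db_from_csv(_EXAMPLE_CSV).entries()) == 1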
def test_binary_format_parse(self):
    with io.BytesIO(BINARY_DATABASE) as binary_db:
        db = tokens.Database(tokens.parse_binary(binary_db))

    self.assertEqual(str(db), CSV_DATABASE)
def test_merge(self):
    """Tests the tokens.Database merge method."""
    db = tokens.Database()

    # Test basic merging into an empty database.
    db.merge(
        tokens.Database([
            tokens.TokenizedStringEntry(
                1, 'one', date_removed=datetime.datetime.min),
            tokens.TokenizedStringEntry(
                2, 'two', date_removed=datetime.datetime.min),
        ]))
    self.assertEqual({str(e) for e in db.entries()}, {'one', 'two'})
    self.assertEqual(db.token_to_entries[1][0].date_removed,
                     datetime.datetime.min)
    self.assertEqual(db.token_to_entries[2][0].date_removed,
                     datetime.datetime.min)

    # Test merging in an entry with a removal date.
    db.merge(
        tokens.Database([
            tokens.TokenizedStringEntry(3, 'three'),
            tokens.TokenizedStringEntry(
                4, 'four', date_removed=datetime.datetime.min),
        ]))
    self.assertEqual({str(e) for e in db.entries()},
                     {'one', 'two', 'three', 'four'})
    self.assertIsNone(db.token_to_entries[3][0].date_removed)
    self.assertEqual(db.token_to_entries[4][0].date_removed,
                     datetime.datetime.min)

    # Test merging in one entry.
    db.merge(tokens.Database([
        tokens.TokenizedStringEntry(5, 'five'),
    ]))
    self.assertEqual({str(e) for e in db.entries()},
                     {'one', 'two', 'three', 'four', 'five'})
    self.assertEqual(db.token_to_entries[4][0].date_removed,
                     datetime.datetime.min)
    self.assertIsNone(db.token_to_entries[5][0].date_removed)

    # Merge in repeated entries with different removal dates.
    db.merge(
        tokens.Database([
            tokens.TokenizedStringEntry(
                4, 'four', date_removed=datetime.datetime.max),
            tokens.TokenizedStringEntry(
                5, 'five', date_removed=datetime.datetime.max),
        ]))
    self.assertEqual(len(db.entries()), 5)
    self.assertEqual({str(e) for e in db.entries()},
                     {'one', 'two', 'three', 'four', 'five'})
    self.assertEqual(db.token_to_entries[4][0].date_removed,
                     datetime.datetime.max)
    self.assertIsNone(db.token_to_entries[5][0].date_removed)

    # Merge in the same repeated entries, now without removal dates.
    db.merge(
        tokens.Database([
            tokens.TokenizedStringEntry(4, 'four'),
            tokens.TokenizedStringEntry(5, 'five')
        ]))
    self.assertEqual(len(db.entries()), 5)
    self.assertEqual({str(e) for e in db.entries()},
                     {'one', 'two', 'three', 'four', 'five'})
    self.assertIsNone(db.token_to_entries[4][0].date_removed)
    self.assertIsNone(db.token_to_entries[5][0].date_removed)

    # Merge in an empty database.
    db.merge(tokens.Database([]))
    self.assertEqual({str(e) for e in db.entries()},
                     {'one', 'two', 'three', 'four', 'five'})