def test_getitem(self, tessfile_list):
    for f in tessfile_list:
        lines = []
        with open(f, 'r') as tess:
            for line in tess.readlines():
                lines.append(line)
        indices = [i for i in range(len(lines))]

        # Test __getitem__ with buffering, in order
        t = TessFile(f)
        for i in indices:
            assert t[i] == lines[i]

        # Test __getitem__ without buffering, in order
        t = TessFile(f, buffer=False)
        for i in indices:
            assert t[i] == lines[i]

        random.shuffle(indices)

        # Test __getitem__ with buffering, in random order
        t = TessFile(f)
        for i in indices:
            assert t[i] == lines[i]

        # Test __getitem__ without buffering, in random order
        t = TessFile(f, buffer=False)
        for i in indices:
            assert t[i] == lines[i]
def populate_database(search_connection, test_data):
    """Set up the database to conduct searches on the test texts.

    Fixtures
    --------
    search_connection
        TessMongoConnection for search unit tests.
    test_data
        Example data for unit testing.
    """
    for text in test_data['texts']:
        tessfile = TessFile(text['path'], metadata=Text(**text))
        search_connection.insert(tessfile.metadata)
        if text['language'] == 'latin':
            tok = LatinTokenizer(search_connection)
            unitizer = Unitizer()
            tokens, tags, features = tok.tokenize(tessfile.read(),
                                                  text=tessfile.metadata)
            search_connection.update(features)
            lines, phrases = unitizer.unitize(tokens, tags, tessfile.metadata)
            search_connection.insert(lines + phrases)
            search_connection.insert(tokens)

    yield

    search_connection.connection['texts'].delete_many({})
    search_connection.connection['tokens'].delete_many({})
    search_connection.connection['features'].delete_many({})
    search_connection.connection['units'].delete_many({})
    search_connection.connection['matches'].delete_many({})
    search_connection.connection['searches'].delete_many({})
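# A hedged sketch of the `test_data` shape this fixture consumes: it reads
# test_data['texts'], uses each entry's 'path' and 'language', and unpacks
# the rest into Text(**text). The concrete values below are hypothetical
# placeholders, not the project's real fixture data.
example_test_data = {
    'texts': [
        {
            'title': 'Aeneid',        # hypothetical metadata
            'author': 'Vergil',
            'language': 'latin',
            'path': 'tests/data/example.tess',  # hypothetical path
        },
    ]
}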
def test_normalize(self, greek_files, greek_tokens):
    grc = self.__test_class__()
    for i in range(len(greek_files)):
        fname = greek_files[i]
        ref_tokens = [t for t in greek_tokens[i] if t['FORM'] != '']
        t = TessFile(fname)
        token_idx = 0
        # Use distinct loop variables so the TessFile `t` and the file
        # index `i` are not shadowed inside the line loop.
        for line_no, line in enumerate(t.readlines(include_tag=False)):
            tokens = [
                tok for tok in grc.normalize(line)
                if re.search('[' + grc.word_characters + ']+',
                             tok,
                             flags=re.UNICODE)
            ]
            offset = token_idx + len(tokens)
            # Materialize the map so `all()` can be called twice without
            # exhausting an iterator.
            correct = list(
                map(lambda x: x[0] == x[1]['FORM'],
                    zip(tokens, ref_tokens[token_idx:offset])))
            if not all(correct):
                print(fname, line_no, line)
                print(ref_tokens[token_idx:offset])
                for j in range(len(tokens)):
                    if tokens[j] != ref_tokens[token_idx + j]['FORM']:
                        print('{}->{}'.format(
                            tokens[j], ref_tokens[token_idx + j]['FORM']))
            assert all(correct)
            token_idx = offset
def test_unitize_notag_file(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(tessfiles_latin_path.joinpath('test.notag.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
def test_unitize_elision_file(unit_connection, tessfiles_greek_path):
    tokenizer = GreekTokenizer(unit_connection)
    t = Text(path=str(tessfiles_greek_path.joinpath('test.elision.tess')),
             language='greek')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
def read_files_count(filepath):
    '''Moves through a .tess file and counts each token it encounters.'''
    tessobj = TessFile(filepath)
    # read_tokens() already yields tokens one at a time, so a plain for
    # loop replaces the original next()/StopIteration bookkeeping.
    for rawtoken in tessobj.read_tokens():
        cleantoken_list = token_cleanup(rawtoken)
        countgram(cleantoken_list[0])
def test_unitize_linebreak_file(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(tessfiles_latin_path.joinpath('test.linebreak.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 1
    first_tag = phrases[0].tags[0]
    for phrase in phrases[1:]:
        assert phrase.tags[0] == first_tag
def unitizer_inputs(unit_tessfiles, unit_connection):
    inputs = []
    tokenizer_selector = {
        'latin': LatinTokenizer(unit_connection),
        'greek': GreekTokenizer(unit_connection)
    }
    for t in unit_tessfiles:
        tessfile = TessFile(t.path, metadata=t)
        tokens, tags, features = tokenizer_selector[t.language].tokenize(
            tessfile.read(), text=t)
        features.sort(key=lambda x: x.index)
        inputs.append((tokens, tags, features))
    yield inputs
def test_readlines(self, tessfile_list):
    for f in tessfile_list:
        lines = []
        with open(f, 'r') as tess:
            for line in tess.readlines():
                lines.append(line)

        # Ensure that readlines works with a buffer
        t = TessFile(f)
        for i, line in enumerate(t.readlines()):
            assert line == lines[i]

        # Ensure that the buffer resets on second call
        reset = False
        for i, line in enumerate(t.readlines()):
            assert line == lines[i]
            reset = True
        assert reset

        # Ensure that readlines works with initial read
        t = TessFile(f, buffer=False)
        for i, line in enumerate(t.readlines()):
            assert line == lines[i]

        # Ensure that the iterator resets on second call
        reset = False
        for i, line in enumerate(t.readlines()):
            assert line == lines[i]
            reset = True
        assert reset
def test_normalize(self, latin_files, latin_tokens):
    la = self.__test_class__()
    for i in range(len(latin_files)):
        fname = latin_files[i]
        ref_tokens = [t for t in latin_tokens[i] if 'FORM' in t]
        t = TessFile(fname)
        tokens = la.normalize(t.read())
        # Materialize the map so the assert below consumes a list rather
        # than a one-shot iterator.
        correct = list(
            map(
                lambda x: ('FORM' in x[1] and x[0] == x[1]['FORM']) or
                x[0] == '', zip(tokens, ref_tokens)))
        assert all(correct)
def main():
    """Ingest a text into Tesserae.

    Takes a .tess file and computes tokens, features, frequencies, and
    units. All computed components are inserted into the database.
    """
    args = parse_args()
    if args.password:
        password = getpass(prompt='Tesserae MongoDB Password: ')
    else:
        password = None

    connection = TessMongoConnection(args.host,
                                     args.port,
                                     args.user,
                                     password,
                                     db=args.database)

    text_hash = hashlib.md5()
    text_hash.update(TessFile(args.input).read().encode())
    text_hash = text_hash.hexdigest()

    text = Text(language=args.language,
                title=args.title,
                author=args.author,
                year=args.year,
                path=args.input,
                hash=text_hash,
                is_prose=args.prose)

    ingest_text(connection, text, enable_multitext=args.enable_multitext)
def test_unitize_linebreak_end(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(
        tessfiles_latin_path.joinpath('test.linebreak_end.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    print('# lines')
    for line in lines:
        print(line.snippet)
    print('# phrases')
    for phrase in phrases:
        print(phrase.snippet)
    assert len(lines) == 2
def test_unitize_diacrit_in_latin(unit_connection, tessfiles_latin_path):
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(
        tessfiles_latin_path.joinpath('test.diacrit_in_latin.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    forms = {f.index: f.token for f in features if f.feature == 'form'}
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    for phrase in phrases:
        for t in phrase.tokens:
            cur_form = t['features']['form'][0]
            if cur_form != -1:
                normalized = tokenizer.normalize(t['display'])[0][0]
                assert normalized == forms[cur_form], phrase.snippet
def read_files(filepath):
    '''Moves through a .tess file and calls 'count_lemma' on each cleaned
    token. Updates the COUNT_LIBRARY global object.

    Parameters
    ----------
    filepath: a file in .tess format
    '''
    tessobj = TessFile(filepath)
    # read_tokens() already yields tokens one at a time, so a plain for
    # loop replaces the original next()/StopIteration bookkeeping.
    for rawtoken in tessobj.read_tokens():
        cleantoken_list = token_cleanup(rawtoken)
        count_lemma(cleantoken_list[0])
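# `token_cleanup` is not defined in this excerpt. Judging from its call
# sites (the clean token is read from index 0, and read_files_skipgram
# checks whether the last element is sentence-final punctuation), a minimal
# stand-in might look like the sketch below; the exact normalization it
# performs is an assumption.
def token_cleanup_sketch(rawtoken):
    token = rawtoken.lower()
    if token and token[-1] in '.;?!':
        # Split sentence-final punctuation off into its own element.
        return [token[:-1], token[-1]]
    return [token]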
def test_unitize_nopunctuation_file(unit_connection, tessfiles_latin_path):
    # When there is no ending punctuation despite coming to the end of a
    # poem, and another poem starts after a blank line
    tokenizer = LatinTokenizer(unit_connection)
    t = Text(path=str(
        tessfiles_latin_path.joinpath('test.nopunctuation.tess')),
             language='latin')
    tessfile = TessFile(t.path, metadata=t)
    unitizer = Unitizer()
    tokens, tags, features = tokenizer.tokenize(tessfile.read(), text=t)
    lines, phrases = unitizer.unitize(tokens, tags, tokens[0].text)
    assert len(lines) == 68
    for prev_phrase, cur_phrase in zip(phrases[:-1], phrases[1:]):
        if '2.13' in prev_phrase.tags[0] and '2.14' in cur_phrase.tags[0]:
            assert prev_phrase.snippet == (
                'quin et Prometheus et Pelopis parens / dulci laborem '
                'decipitur sono / nec curat Orion leones / aut timidos '
                'agitare lyncas / Eheu fugaces, Postume, Postume, / '
                'labuntur anni nec pietas moram / rugis et instanti '
                'senectae / adferet indomitaeque morti, / non, si trecenis '
                'quotquot eunt dies, / amice, places inlacrimabilem / '
                'Plutona tauris, qui ter amplum / Geryonen Tityonque '
                'tristi / conpescit unda, scilicet omnibus / quicumque '
                'terrae munere vescimur / enaviganda, sive reges / sive '
                'inopes erimus coloni. / ')
            assert cur_phrase.snippet == (
                'frustra cruento Marte carebimus / fractisque rauci '
                'fluctibus Hadriae, / frustra per autumnos nocentem / '
                'corporibus metuemus Austrum: / ')
            break
def test_tokenize(self, greek_files, greek_tokens, greek_word_frequencies):
    grc = self.__test_class__()
    for k in range(len(greek_files)):
        fname = greek_files[k]
        ref_tokens = [t for t in greek_tokens[k] if 'FORM' in t]
        ref_freqs = greek_word_frequencies[k]
        t = TessFile(fname)

        tokens, frequencies = grc.tokenize(t.read())
        tokens = [
            t for t in tokens
            if re.search(r'[\w]', t.display, flags=re.UNICODE)
        ]

        # Materialize the maps so `all()` can be called twice without
        # exhausting an iterator.
        correct = list(
            map(lambda x: x[0].display == x[1]['DISPLAY'],
                zip(tokens, ref_tokens)))
        if not all(correct):
            print(fname)
            for j in range(len(tokens)):
                if tokens[j].display != ref_tokens[j]['DISPLAY']:
                    print(ref_tokens[j])
                    print('{}->{}'.format(tokens[j].display,
                                          ref_tokens[j]['DISPLAY']))
                    print('{}->{}'.format(tokens[j].form,
                                          ref_tokens[j]['FORM']))
        assert all(correct)

        correct = list(
            map(lambda x: x[0].form == x[1]['FORM'],
                zip(tokens, ref_tokens)))
        if not all(correct):
            print(fname)
            for j in range(len(tokens)):
                if tokens[j].form != ref_tokens[j]['FORM']:
                    print(ref_tokens[j])
                    print('{}->{}'.format(tokens[j].form,
                                          ref_tokens[j]['FORM']))
        assert all(correct)
def test_hash(self, tessfile_list):
    for f in tessfile_list:
        hashitizer = hashlib.md5()
        with open(f, 'r') as tess:
            hashitizer.update(tess.read().encode('utf-8'))
        h = hashitizer.hexdigest()

        # Test that the hash is computed correctly
        t = TessFile(f)
        assert t._TessFile__hash is None
        assert t.hash == h
        assert t._TessFile__hash == h
def test_init(self, tessfile_list):
    for f in tessfile_list:
        # Test initializing as buffer
        t = TessFile(f)
        assert t.path == f
        assert t.mode == 'r'
        assert t.buffer
        assert t._TessFile__hash is None
        assert t._TessFile__len is None
        assert isinstance(t.file, io.TextIOWrapper)
        assert t.file.name == f
        assert t.file.mode == 'r'

        # Test initializing with read
        result = []
        with open(f, 'r') as tess:
            for line in tess.readlines():
                result.append(line)

        t = TessFile(f, buffer=False)
        assert t.path == f
        assert t.mode == 'r'
        assert not t.buffer
        assert t._TessFile__hash is None
        assert t._TessFile__len is None
        assert t.file == result

        # # Test initializing as buffer with validation
        # t = TessFile(f, validate=True)
        # assert t.mode == 'r'
        # assert t.buffer
        # assert t._TessFile__hash is None
        # assert t._TessFile__len is None
        # assert isinstance(t.file, io.TextIOWrapper)
        # assert t.file.name == f
        # assert t.file.mode == 'r'
        #
        # # Test initializing with read and validation
        # t = TessFile(f, buffer=False, validate=True)
        # assert t.mode == 'r'
        # assert not t.buffer
        # assert t._TessFile__hash is None
        # assert t._TessFile__len is None
        # assert t.file == result

    # Test instantiating with a non-existent file
    with pytest.raises(FileNotFoundError):
        t = TessFile('/foo/bar.tess')
    with pytest.raises(FileNotFoundError):
        t = TessFile('/foo/bar.tess', buffer=False)

    # Test instantiating with a directory as path
    with pytest.raises(IsADirectoryError):
        t = TessFile(os.path.dirname(os.path.abspath(__file__)))
    with pytest.raises(IsADirectoryError):
        t = TessFile(os.path.dirname(os.path.abspath(__file__)),
                     buffer=False)
def greek_tessfiles(test_data, token_connection):
    # Get the test data and filter for Greek texts only.
    tessfiles = [t for t in test_data['texts'] if t['language'] == 'greek']
    tessfiles = [Text(**text) for text in tessfiles]

    # Prep the database with the text metadata
    token_connection.insert(tessfiles)

    # Create file readers for each text
    tessfiles = [TessFile(text.path, metadata=text) for text in tessfiles]

    yield sorted(tessfiles, key=lambda x: x.metadata.path)

    token_connection.delete([t.metadata for t in tessfiles])
def load_text(client, cts_urn, mode='r', buffer=True):
    """Open a .tess file for reading.

    Parameters
    ----------
    client
        Database connection used to look up the text.
    cts_urn : str
        Unique collection-level identifier.
    mode : str
        File open mode ('r', 'w', 'a', etc.)
    buffer : bool
        If True, load file contents into memory on-the-fly. Otherwise, load
        in contents on initialization.

    Returns
    -------
    text : `tesserae.utils.TessFile` or None
        A non-/buffered reader for the requested file. If the file does not
        exist in the database, returns None.

    Raises
    ------
    NoTextError
        Raised when the requested text does not exist in the database.
    DuplicateTextError
        Raised when the CTS URN matches more than one database entry.
    """
    # Retrieve text data from the database by CTS URN
    text_objs = retrieve_text_list(client, cts_urn=cts_urn)

    # If more than one text was retrieved, database integrity has been
    # compromised. Raise an exception.
    if len(text_objs) > 1:
        raise DuplicateTextError(cts_urn)

    # Attempt to load the first text in the list of text objects. If the
    # list is empty, raise an exception.
    try:
        text = TessFile(text_objs[0].path,
                        mode=mode,
                        buffer=buffer,
                        metadata=text_objs[0])
    except IndexError:
        raise NoTextError(cts_urn)

    return text
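# Hedged usage sketch: `load_text` resolves a CTS URN through the database
# and hands back a TessFile reader. The URN and downstream consumer below
# are hypothetical.
#
#     text = load_text(client, 'urn:cts:latinLit:phi0690.phi003')
#     for line in text.readlines():
#         process(line)  # hypothetical downstream consumer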
def test_unitize(self, units):
    for unit in units:
        u = Unitizer()
        metadata = unit['metadata']
        tess = TessFile(metadata.path, metadata=metadata)
        tokens = unit['tokens']
        lines = unit['lines']
        phrases = unit['phrases']

        if metadata.language == 'greek':
            tokenizer = GreekTokenizer()
        elif metadata.language == 'latin':
            tokenizer = LatinTokenizer()
        tokenizer.clear()

        for i, line in enumerate(tess.readlines(include_tag=False)):
            stop = (i == len(tess) - 1)
            u.unitize(line, metadata, tokenizer=tokenizer, stop=stop)

        print(metadata.path)

        assert len(u.lines) == len(lines)
        for i in range(len(lines)):
            line_tokens = [
                tokenizer.tokens[j].form for j in u.lines[i].tokens
                if re.search(r'[\w\d]',
                             tokenizer.tokens[j].display,
                             flags=re.UNICODE) and tokenizer.tokens[j].form
            ]
            correct_tokens = [
                tokens[j]['FORM'] for j in lines[i]['TOKEN_ID']
                if 'FORM' in tokens[j] and tokens[j]['FORM']
            ]
            if line_tokens != correct_tokens:
                print('Line {}'.format(i))
                print(line_tokens)
                print(correct_tokens)
            assert line_tokens == correct_tokens

        print(u.phrases[-1].tokens)

        assert len(u.phrases) == len(phrases)
        for i in range(len(u.phrases)):
            phrase_tokens = [
                tokenizer.tokens[j].form for j in u.phrases[i].tokens
                if re.search(r'[\w\d]',
                             tokenizer.tokens[j].display,
                             flags=re.UNICODE) and tokenizer.tokens[j].form
            ]
            correct_tokens = [
                tokens[j]['FORM'] for j in phrases[i]['TOKEN_ID']
                if 'FORM' in tokens[j] and tokens[j]['FORM']
            ]
            if phrase_tokens != correct_tokens:
                print('Phrase {}'.format(i))
                phrase_tokens = [
                    tokenizer.tokens[j].form for j in u.phrases[i - 1].tokens
                    if re.search(r'[\w]',
                                 tokenizer.tokens[j].display,
                                 flags=re.UNICODE) and tokenizer.tokens[j].form
                ]
                correct_tokens = [
                    tokens[j]['FORM'] for j in phrases[i - 1]['TOKEN_ID']
                    if 'FORM' in tokens[j]
                ]
                print(phrase_tokens)
                print(correct_tokens)
            assert phrase_tokens == correct_tokens

        assert len(u.phrases) == len(phrases)

        u.clear()
        tokenizer.clear()
def read_files_skipgram(filepath, context_window):
    '''Moves through a .tess file and calls the 'next' and 'skipgram'
    functions as needed. Updates the SKIP_LIBRARY global object.

    Parameters
    ----------
    filepath: a file in .tess format
    context_window: how many words on either side of the target to look at.
    '''
    tessobj = TessFile(filepath)
    tokengenerator = iter(tessobj.read_tokens())
    tokens = new_file(tokengenerator, context_window)
    stop = 0
    clearflag = 0
    target_position = context_window
    while stop != 1:
        # The target sits `context_window` tokens from the end of the list.
        # Don't pop the target token; it is kept for the next round.
        targettoken = tokens[target_position]
        # Grab all the other tokens but the target.
        contexttokens = [
            x for i, x in enumerate(tokens) if i != target_position
        ]
        # Add this context to the skipgram map.
        skipgram(targettoken, contexttokens)
        # Prep the next token in the file.
        try:
            rawtoken = next(tokengenerator)
            cleantoken_list = token_cleanup(rawtoken)
            if len(cleantoken_list) > 1 and \
                    cleantoken_list[-1] in punctuation_list:
                # A sentence has ended; the token list must be cleared
                # *after* this iteration.
                clearflag = 1
            tokens.append(cleantoken_list[0])
            if clearflag == 1:
                # End-of-sentence punctuation has been seen and the token
                # list has just received the sentence's final word, so
                # count down to the end of the sentence.
                tokens.pop(0)
                while len(tokens) > context_window:
                    # Perform the usual dictionary operation, but don't
                    # add a new token.
                    targettoken = tokens[target_position]
                    contexttokens = [
                        x for i, x in enumerate(tokens)
                        if i != target_position
                    ]
                    skipgram(targettoken, contexttokens)
                    tokens.pop(0)
                # Initialize the next sentence.
                tokens = new_file(tokengenerator, context_window)
                clearflag = 0
            else:
                tokens.pop(0)
        except StopIteration:
            # EOF has been reached. Pop the first index off and keep
            # sliding the window until only `context_window` tokens
            # remain, then quit. (This tail loop is largely obsolete now
            # that punctuation is accounted for.)
            try:
                tokens.pop(0)
            except IndexError:
                pass
            while len(tokens) > context_window:
                targettoken = tokens[target_position]
                # Grab all the other tokens but the target.
                contexttokens = [
                    x for i, x in enumerate(tokens)
                    if i != target_position
                ]
                # Add this context to the skipgram map.
                skipgram(targettoken, contexttokens)
                tokens.pop(0)
            stop = 1
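# `skipgram` and SKIP_LIBRARY are not shown in this excerpt. Given the call
# pattern above (a target token plus its surrounding context tokens), one
# plausible stand-in records co-occurrence counts per target; the concrete
# data structure is an assumption.
from collections import Counter, defaultdict

SKIP_LIBRARY_SKETCH = defaultdict(Counter)


def skipgram_sketch(targettoken, contexttokens):
    # Tally each context token as a co-occurrence of the target.
    SKIP_LIBRARY_SKETCH[targettoken].update(contexttokens)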
    if len(lemmas) > 1:
        all_lemmas_total = sum([COUNT_LIBRARY[l] for l in lemmas])
        try:
            lemmalist = [(l, (COUNT_LIBRARY[l] / all_lemmas_total))
                         for l in lemmas]
        except ZeroDivisionError:
            print([(COUNT_LIBRARY[l], l) for l in lemmas])
            # Avoid an UnboundLocalError on the return below when none of
            # the lemmata have been counted.
            lemmalist = []
        return lemmalist
    else:
        lemmalist = []
        lemmaobj = (lemmas[0], 1)
        lemmalist.append(lemmaobj)
        return lemmalist


tessobj = TessFile(onlyfiles[258])
tokengenerator = iter(tessobj.read_tokens())
tokens = new_file(tokengenerator, 2)
target = tokens.pop(0)
compare_context(target, tokens)

rel_path = os.path.join(
    '~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
path = os.path.expanduser(rel_path)
file = 'latin_pos_lemmatized_sents.pickle'
latin_pos_lemmatized_sents_path = os.path.join(path, file)

if os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
else:
    print('The file %s is not available in cltk_data' % file)
def test_read_tokens(self, tessfile_list):
    for f in tessfile_list:
        lines = []
        with open(f, 'r') as tess:
            for line in tess.readlines():
                lines.append(line)

        t_b = TessFile(f)
        t_r = TessFile(f, buffer=False)

        # Ensure that tokens omit the tag when requested
        # Grab all tokens from the text
        tokens = []
        for line in lines:
            start = line.find('>')
            if start >= 0:
                tokens.extend(
                    line[start + 1:].strip(string.whitespace).split())

        # Test with buffer
        for i, token in enumerate(t_b.read_tokens()):
            # print(token, tokens[i])
            assert token == tokens[i]

        # Ensure that the iterator resets
        reset = False
        for i, token in enumerate(t_b.read_tokens()):
            assert token == tokens[i]
            reset = True
        assert reset

        # Test with initial read
        for i, token in enumerate(t_r.read_tokens()):
            assert token == tokens[i]

        # Ensure that the iterator resets
        reset = False
        for i, token in enumerate(t_r.read_tokens()):
            assert token == tokens[i]
            reset = True
        assert reset

        # Ensure that tokens include the tag when requested
        # Lines now start before the tag
        tokens = []
        for line in lines:
            tokens.extend(line.strip().split())

        # Test with buffer
        for i, token in enumerate(t_b.read_tokens(include_tag=True)):
            print(token, tokens[i])
            assert token == tokens[i]

        # Ensure that the iterator resets
        reset = False
        for i, token in enumerate(t_b.read_tokens(include_tag=True)):
            assert token == tokens[i]
            reset = True
        assert reset

        # Test with initial read
        for i, token in enumerate(t_r.read_tokens(include_tag=True)):
            assert token == tokens[i]

        # Ensure that the iterator resets
        reset = False
        for i, token in enumerate(t_r.read_tokens(include_tag=True)):
            assert token == tokens[i]
            reset = True
        assert reset
def insert_text(connection, cts_urn, language, author, title, year,
                unit_types, path):
    """Insert a new text into the database.

    Attempt to insert a new text in the database, sanitized to match the
    fields and data types of existing texts.

    Parameters
    ----------
    cts_urn : str
        Unique collection-level identifier.
    language : str
        Language the text is written in.
    author : str
        Full name of the text author.
    title : str
        Title of the text.
    year : int
        Year of text authorship.
    unit_types : str or list of str
        Valid unit-level delimiters for this text.
    path : str
        Path to the raw text file. May be a remote URL.

    Returns
    -------
    result : `pymongo.InsertOneResult`
        The result of the insert operation.

    Raises
    ------
    TextExistsError
        Raised when attempting to insert a text that already exists in the
        database.

    Notes
    -----
    This function should not be made available to everyone. To properly
    secure the database, ensure that only MongoDB users NOT connected to a
    public-facing client application are able to write to the database. See
    the `MongoDB documentation on role-based access control`_ for more
    information.

    .. _MongoDB documentation on role-based access control: https://docs.mongodb.com/manual/core/authorization/
    """
    # Attempt to load the file and any database entry with the same CTS URN
    text_file = TessFile(path)
    db_texts = retrieve_text_list(connection,
                                  cts_urn=cts_urn,
                                  hash=text_file.hash)

    # If no entries with the same CTS URN were found in the database,
    # insert. Otherwise, raise an exception.
    if len(db_texts) == 0:
        text = Text(cts_urn=cts_urn,
                    language=language,
                    author=author,
                    title=title,
                    year=year,
                    unit_types=unit_types,
                    path=path,
                    hash=text_file.hash)
        result = connection.texts.insert_one(text.json_encode(exclude=['_id']))
        return result
    else:
        # Pass the computed file hash, not the built-in `hash`.
        raise TextExistsError(cts_urn, text_file.hash)
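# Hedged usage sketch of `insert_text`; the URN, metadata, and path below
# are hypothetical placeholders.
#
#     result = insert_text(connection,
#                          'urn:cts:latinLit:phi0690.phi003',
#                          'latin', 'Vergil', 'Aeneid', -19,
#                          ['line', 'phrase'],
#                          'tests/data/example.tess')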
def test_tokenize(self, latin_files, latin_tokens, latin_word_frequencies):
    la = self.__test_class__()
    for k in range(len(latin_files)):
        fname = latin_files[k]
        ref_tokens = [t for t in latin_tokens[k] if 'FORM' in t]
        ref_freqs = latin_word_frequencies[k]
        t = TessFile(fname)

        tokens, frequencies = la.tokenize(t.read(), text=t.metadata)
        tokens = [
            t for t in tokens
            if re.search(r'^[a-zA-Z]+$', t.display, flags=re.UNICODE)
        ]

        # Materialize the maps so `all()` can be called twice without
        # exhausting an iterator.
        correct = list(
            map(lambda x: x[0].display == x[1]['DISPLAY'],
                zip(tokens, ref_tokens)))
        if not all(correct):
            print(fname)
            for j in range(len(tokens)):
                if tokens[j].display != ref_tokens[j]['DISPLAY']:
                    print('{}->{}'.format(tokens[j].display,
                                          ref_tokens[j]['DISPLAY']))
        assert all(correct)

        correct = list(
            map(
                lambda x: ('FORM' in x[1] and x[0].form == x[1]['FORM']) or
                not x[0].form, zip(tokens, ref_tokens)))
        if not all(correct):
            print(fname)
            # for j in range(len(tokens)):
            #     if tokens[j].form != ref_tokens[j]['FORM']:
            #         print('{}->{}'.format(tokens[j].form,
            #                               ref_tokens[j]['FORM']))
        assert all(correct)

        for key in ref_freqs:
            assert key in la.frequencies
            assert la.frequencies[key] == ref_freqs[key]

        diff = []
        for word in frequencies:
            if word.form not in ref_freqs and re.search(
                    r'[a-zA-Z]', word.form, flags=re.UNICODE):
                diff.append(word.form)
        print(diff)
        assert len(diff) == 0

        keys = sorted(list(ref_freqs.keys()))
        frequencies.sort(key=lambda x: x.form)
        correct = list(
            map(
                lambda x: x[0].form == x[1] and x[0].frequency == ref_freqs[
                    x[1]], zip(frequencies, keys)))
        assert all(correct)

        la.clear()