def _find_ellipsis_words(self, input_string: str) -> Words:
    """Determine whether the input string has a lemma ellipsis pattern
    and return the preceding and following words as elements of a Words
    object. If there is no ellipsis pattern, return an empty Words list.
    """
    settings_pat = '|'.join(settings['ellipsis_patterns'])
    ellipsis_pat = regex.compile('(' + settings_pat + ')')
    ellipsis_search = regex.search(ellipsis_pat, input_string)
    if ellipsis_search:
        spos, epos = ellipsis_search.span()
        return (Tokenizer(input_string[:spos]).wordlist
                + Tokenizer(input_string[spos:epos]).wordlist
                + Tokenizer(input_string[epos:]).wordlist)
    return Words()
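# A behavior sketch for `_find_ellipsis_words` (hypothetical input; it
# assumes settings['ellipsis_patterns'] contains a pattern matching
# r'\dots'):
#
#   self._find_ellipsis_words(r'first \dots last')
#   # -> Words containing 'first', the ellipsis token, and 'last'
#   self._find_ellipsis_words('no ellipsis here')
#   # -> Words() (empty, so callers can test the result for truthiness)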
def test_unicode_blocks(self):
    for block in blocks:
        # Test the last code point in each block.
        point = chr(int(blocks[block][1], 16))
        if not regex.match(
                r'[{}]+'.format(''.join(settings['punctuation'])), point):
            assert Tokenizer(point).wordlist == [point]
def test_edtext_with_nested_brackets(self):
    text = r'\edtext{entry \emph{nested \emph{b}}}{\Bfootnote{fnote}} nice'
    expect = ['entry', 'nested', 'b', 'nice']
    registry = [{'lvl': 0, 'data': [0, 2]}]
    tokenization = Tokenizer(text)
    assert tokenization.wordlist == expect
    assert tokenization.wordlist.write() == text
    assert tokenization.registry == registry
def test_edtext_by_another_edtext_are_separated(self):
    text = r"\edtext{a}{\Bfootnote{b}},\edtext{c}{\Bfootnote{d}}"
    wordlist = ['a', 'c']
    registry = [{'lvl': 0, 'data': [0, 0]}, {'lvl': 0, 'data': [1, 1]}]
    tokenization = Tokenizer(text)
    assert tokenization.wordlist == wordlist
    assert tokenization.registry == registry
    assert tokenization.wordlist.write() == text
def test_registry(self):
    text = r'text \edtext{emphasis}{\Bfootnote{fnote}} is nice'
    expect = ['text', 'emphasis', 'is', 'nice']
    registry = [{'lvl': 0, 'data': [1, 1]}]
    tokenization = Tokenizer(text)
    assert tokenization.wordlist == expect
    assert tokenization.wordlist.write() == text
    assert tokenization.registry == registry
def test_tokenizer_two_levels(self):
    text = r"""
    \edtext{lvl1 \edtext{lvl2 }{\Bfootnote{l2-note}}}{\Bfootnote{l1-note}}
    """
    expect = ['', 'lvl1', 'lvl2', '']
    registry = [{'lvl': 0, 'data': [1, 3]}, {'lvl': 1, 'data': [2, 3]}]
    tokenization = Tokenizer(text)
    assert tokenization.wordlist == expect
    assert tokenization.wordlist.write() == text
    assert tokenization.registry == registry
def run_annotation(input_text: str, method: str = 'annotate') -> str:
    tokenization = Tokenizer(input_text)
    matcher = Matcher(tokenization.wordlist, tokenization.registry)
    if method == 'annotate':
        words = matcher.annotate()
    elif method == 'update':
        words = matcher.update()
    else:
        words = matcher.cleanup()
    return words.write()
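# A minimal usage sketch for `run_annotation`. The input string is
# hypothetical; assuming default settings, a word that occurs both
# inside an \edtext{}{} element and in its surrounding context should
# come back wrapped in \sameword{} annotations:
#
#   run_annotation(r"so \edtext{so}{\Bfootnote{note}} so")
#   # expected to return something like:
#   # r"\sameword{so} \edtext{\sameword[1]{so}}{\Bfootnote{note}} \sameword{so}"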
def cleanup(self, wordlist: Words = None) -> Words:
    """Given a Words list, remove all sameword annotations."""
    if not wordlist:
        wordlist = self.words
    for val, word in enumerate(wordlist):
        if word.has_sameword:
            # Remove the macro itself.
            for i in [v for v, m in enumerate(word.macros)
                      if m.name == '\\sameword']:
                close_idx = val + word.macros[i].to_closing
                word.pop_macro(i)
                wordlist[close_idx].pop_suffix()
            # Clean the app note's `\lemma{}` if there is any.
            for i in [v for v, a in enumerate(word.clean_apps)
                      if regex.search(r'\\lemma', a.cont)]:
                # Get the relevant app Element.
                app_note = word.clean_apps[i]
                # Split the apparatus note into before, lemma, and after.
                s, e = self._find_lemma_pos(app_note)
                if s == e:
                    # Empty lemma.
                    break
                el_words = self._find_ellipsis_words(app_note.cont[s:e])
                if el_words:
                    # Tokenize the lemma words and ellipsis.
                    lem_words = el_words
                else:
                    lem_words = Tokenizer(app_note.cont[s:e]).wordlist
                lem_words = self.cleanup(lem_words)
                # Patch the app note up again with the new lemma content.
                bef = app_note.cont[:s]
                after = app_note.cont[e:]
                new = bef + lem_words.write() + after
                # Update the app note Element with the new content.
                word.update_element(app_note, new)
    return wordlist
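# A before/after sketch for `cleanup` (hypothetical input). It is the
# inverse of `annotate`: all \sameword{} wrappers, including any inside
# a \lemma{}, should be stripped while the rest of the string is kept
# intact:
#
#   run_annotation(
#       r"\sameword{a} \edtext{\sameword[1]{a}}{\Bfootnote{n}} \sameword{a}",
#       method='cleanup')
#   # expected: r"a \edtext{a}{\Bfootnote{n}} a"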
def _define_search_words(self, edtext: Words) -> Tuple[List, bool]:
    """
    From the Words that make up the edtext element, determine the
    search words based on either (1) the content of the lemma element
    in the apparatus note or (2) the content of the critical note.

    When the apparatus notes are analyzed (the .clean_apps attribute),
    they are moved into the .ann_apps attribute. This means that an app
    element can never occur in both attributes (as that would result in
    duplicate entries of the app in printing).
    """
    # The apparatus note is the first item in app_entries of the last Word.
    app_note = edtext[-1].clean_apps.pop()
    start, end = self._find_lemma_pos(app_note)
    if start != -1:
        # Content excluding the brackets.
        lemma_content = app_note.cont[start:end]
    else:
        lemma_content = ''
    if lemma_content:
        tokens = self._find_ellipsis_words(lemma_content)
        if tokens:
            ellipsis = True
        else:
            tokens = Tokenizer(lemma_content).wordlist
            ellipsis = False
        lem_wl = Words([w for w in tokens if w.content])
        if ellipsis:
            # Covers ellipsis lemma.
            content = [lem_wl[0].get_text()] + [lem_wl[-1].get_text()]
        elif len(lem_wl) == 1:
            # Covers single-word lemma.
            content = [lem_wl[0].get_text()]
        elif len(lem_wl) > 1:
            # Covers multiword lemma.
            content = lem_wl.clean()
        else:
            content = []
            ellipsis = False
    else:
        content = edtext.clean()
        ellipsis = False
    if not settings['sensitive_context_match']:
        content = [w.lower() for w in content]
    edtext[-1].ann_apps.append(app_note)
    return content, ellipsis
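# A sketch of the lemma cases handled above (hypothetical apparatus
# note contents; it assumes r'\dots' is registered in
# settings['ellipsis_patterns']):
#
#   \lemma{word}             -> content = ['word'],          ellipsis False
#   \lemma{one two three}    -> content = lem_wl.clean(),    ellipsis False
#   \lemma{first \dots last} -> content = ['first', 'last'], ellipsis True
#
# With no \lemma{} in the apparatus note, the search words fall back to
# edtext.clean(), i.e. the edtext content itself.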
def test_registry_with_three_close_levels(self):
    text = (r"so \edtext{\edtext{\edtext{so}{\lemma{so}\Bfootnote{lev "
            r"3}}}{\lemma{so}\Bfootnote{lev 2}}}{\lemma{so}\Bfootnote{lev"
            r" 1}}")
    expect = ['so', 'so']
    registry = [{'lvl': 0, 'data': [1, 1]},
                {'lvl': 1, 'data': [1, 1]},
                {'lvl': 2, 'data': [1, 1]}]
    tokenization = Tokenizer(text)
    assert tokenization.wordlist == expect
    assert tokenization.registry == registry
    assert tokenization.wordlist.write() == text
def test_registry_with_nesting_and_sequential_nested_entries(self):
    text = r"""
    \edtext{lvl1 \edtext{lvl2 \edtext{lvl3-1}{\Bfootnote{n3}} inter
    \edtext{lvl3-2}{\Bfootnote{n4}}}{\Bfootnote{n2}}}{\Bfootnote{n1}}
    """
    expect = ['', 'lvl1', 'lvl2', 'lvl3-1', 'inter', 'lvl3-2']
    registry = [{'lvl': 0, 'data': [1, 5]},
                {'lvl': 1, 'data': [2, 5]},
                {'lvl': 2, 'data': [3, 3]},
                {'lvl': 2, 'data': [5, 5]}]
    tokenization = Tokenizer(text)
    assert tokenization.wordlist == expect
    assert tokenization.wordlist.write() == text
    assert tokenization.registry == registry
def write_tokenization(self, input_text):
    return Tokenizer(input_text).wordlist.write()
def test_whitespace(self):
    text = 'short text\t with some\n space and stuff'
    expect = ['short', 'text', 'with', 'some', 'space', 'and', 'stuff']
    assert Tokenizer(text).wordlist == expect
    assert Tokenizer(text).wordlist.write() == text
def test_punctuation(self):
    text = 'text, with. punctuation.-!"#$&()*+,-./:;<=>?@[]^`| enough?!'
    expect = ['text', 'with', 'punctuation', 'enough']
    tokens = Tokenizer(text)
    assert tokens.wordlist == expect
    assert tokens.wordlist.write() == text
def test_space_macros(self):
    thinspace1 = r'A\,B'
    thinspace2 = r'A\thinspace B'
    enskip = r'A\enskip B'
    quad = r'A\quad B'
    qquad = r'A\qquad B'
    hskip = r'A\hskip{10pt}B'
    enspace = r'A\enspace B'
    negthinspace = r'A\negthinspace B'
    kern = r'A\kern{.5em}B'
    singleword_result = ['AB']
    non_spaced_result = ['A', 'B']
    spaced_result = ['A', '', 'B']
    assert Tokenizer(thinspace1).wordlist == singleword_result
    assert Tokenizer(thinspace2).wordlist == non_spaced_result
    assert Tokenizer(enskip).wordlist == spaced_result
    assert Tokenizer(quad).wordlist == spaced_result
    assert Tokenizer(qquad).wordlist == spaced_result
    assert Tokenizer(hskip).wordlist == non_spaced_result
    assert Tokenizer(enspace).wordlist == spaced_result
    assert Tokenizer(negthinspace).wordlist == spaced_result
    assert Tokenizer(kern).wordlist == non_spaced_result
    assert Tokenizer(thinspace1).wordlist.write() == thinspace1
    assert Tokenizer(thinspace2).wordlist.write() == thinspace2
    assert Tokenizer(enskip).wordlist.write() == enskip
    assert Tokenizer(quad).wordlist.write() == quad
    assert Tokenizer(qquad).wordlist.write() == qquad
    assert Tokenizer(hskip).wordlist.write() == hskip
    assert Tokenizer(enspace).wordlist.write() == enspace
    assert Tokenizer(negthinspace).wordlist.write() == negthinspace
    assert Tokenizer(kern).wordlist.write() == kern
def test_edtext_after_word_content(self):
    text = r"word\edtext{content}{\Bfootnote{note}}"
    assert Tokenizer(text).wordlist == ['word', 'content']
def annotate(self, registry: Registry = None) -> Words:
    """
    Given a registry, determine whether there is a context match of
    the edtext lemma content for each entry and annotate accordingly.
    """
    if not registry:
        registry = self.registry
    for entry in registry:
        # Get the data points for the phrase and its start and end.
        edtext_start = entry['data'][0]
        edtext_end = entry['data'][1] + 1
        edtext_lvl = entry['lvl'] + 1  # Reledmac 1-indexes the levels.
        edtext = self.words[edtext_start:edtext_end]

        # Identify the search words and ellipsis.
        search_ws, ellipsis = self._define_search_words(edtext)
        if ellipsis:
            # If we have a lemma note with ellipsis, we need to establish
            # context for both ellipsis elements (which may be nested
            # inside the edtext).
            ell_sidx = edtext.index(search_ws[0], default=0) + edtext_start
            ell_eidx = edtext.rindex(search_ws[1], default=0) + edtext_start
            el1_ctxt = self._get_contexts(self.words, ell_sidx)
            el2_ctxt = self._get_contexts(self.words, ell_eidx)
            contexts = el1_ctxt + el2_ctxt
        else:
            # Establish the context.
            ctxt_before = self._get_context_before(self.words, edtext_start)
            ctxt_after = self._get_context_after(self.words, edtext_end)
            contexts = ([w.get_text() for w in ctxt_before]
                        + [w.get_text() for w in ctxt_after])

        # Is there a match in either context?
        if search_ws and self._in_context(contexts, search_ws, ellipsis):
            # Annotate the edtext
            # -------------------
            if ellipsis:
                sidx = edtext.index(search_ws[0], default=0)
                eidx = edtext.rindex(search_ws[1], default=0)
                if self._in_context(el1_ctxt, search_ws[0:1], ellipsis):
                    self._add_sameword(edtext[sidx:sidx + 1], edtext_lvl)
                if self._in_context(el2_ctxt, search_ws[-1:], ellipsis):
                    self._add_sameword(edtext[eidx:eidx + 1], edtext_lvl)
            else:
                try:
                    with temp_settings({'sensitive_context_match': False}):
                        sidx, eidx = self._find_index(edtext, search_ws)
                except TypeError:
                    raise ValueError("Looks like edtext and lemma content "
                                     "don't match in "
                                     "'{}'".format(edtext.write()))
                self._process_annotation(edtext, sidx, eidx, edtext_lvl)

            # Annotate the lemma if relevant
            # ------------------------------
            if r'\lemma' in edtext[-1].ann_apps[-1].cont:
                # Get the relevant app Element.
                app_note = edtext[-1].ann_apps[-1]
                # Split the apparatus note into before, lemma, and after.
                s, e = self._find_lemma_pos(app_note)
                if ellipsis:
                    # Tokenize the lemma words and ellipsis, and annotate
                    # the lemma word where the context matches. We want to
                    # annotate words even though they may not be the first
                    # or last index in the tokenized text, so we get the
                    # indices of those (list comp `idxs`) and use them to
                    # index into the tokenized list when replacing.
                    lemma = self._find_ellipsis_words(app_note.cont[s:e])
                    idxs = [i for i, w in enumerate(lemma) if w.content]
                    if self._in_context(el1_ctxt, search_ws[0:1], ellipsis):
                        lemma[idxs[0]] = self._add_sameword(
                            lemma[idxs[0]:idxs[0] + 1], level=0)[0]
                    if self._in_context(el2_ctxt, search_ws[-1:], ellipsis):
                        lemma[idxs[-1]] = self._add_sameword(
                            lemma[idxs[-1]:idxs[-1] + 1], level=0)[0]
                else:
                    lemma = Tokenizer(app_note.cont[s:e]).wordlist
                    lemma = self._process_annotation(lemma, 0, len(lemma), 0)
                # Patch the app note up again with the new lemma content.
                bef = app_note.cont[:s]
                after = app_note.cont[e:]
                new = bef + lemma.write() + after
                # Update the app note Element with the new content.
                edtext[-1].update_element(app_note, new)

            # Then annotate the contexts
            # --------------------------
            if ellipsis:
                for pos, word in zip([ell_sidx, ell_eidx], search_ws):
                    ctxt = (self._get_context_before(self.words, pos)
                            + self._get_context_after(self.words, pos + 1))
                    if self._in_context(ctxt, [word], ellipsis):
                        self._annotate_context(ctxt, [word])
            else:
                for ctxt in [ctxt_before, ctxt_after]:
                    self._annotate_context(ctxt, search_ws)
    return self.words
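# An end-to-end sketch of `annotate` (hypothetical input). Assuming
# default settings, the registry entry for the \edtext{} element makes
# annotate compare its lemma words against the surrounding context and
# wrap every match, in both the running text and the \lemma{}:
#
#   tokenization = Tokenizer(r"one \edtext{one}{\lemma{one}\Bfootnote{n}}")
#   matcher = Matcher(tokenization.wordlist, tokenization.registry)
#   matcher.annotate().write()
#   # expected to return something like:
#   # r"\sameword{one} \edtext{\sameword[1]{one}}"
#   # r"{\lemma{\sameword{one}}\Bfootnote{n}}"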
def test_nested_macro(self):
    text = r'text \emph{with \textbf{nesting} emphasis}'
    expect = ['text', 'with', 'nesting', 'emphasis']
    tokens = Tokenizer(text)
    assert tokens.wordlist == expect
    assert tokens.wordlist.write() == text
def test_single_macro(self):
    text = r'text \emph{emphasis} is nice'
    expect = ['text', 'emphasis', 'is', 'nice']
    tokens = Tokenizer(text)
    assert tokens.wordlist == expect
    assert tokens.wordlist.write() == text
def test_latex_non_breaking_space(self):
    text = '2~dollars'
    expect = ['2', 'dollars']
    assert Tokenizer(text).wordlist == expect
    assert Tokenizer(text).wordlist.write() == text