def _find_ellipsis_words(self, input_string: str) -> Words:
    """Determine whether the input string has a lemma ellipsis pattern
    and return the preceding and following words as elements of a Words
    object. If there is no ellipsis pattern, return an empty Words list.
    """
    settings_pat = '|'.join(settings['ellipsis_patterns'])
    ellipsis_pat = regex.compile('(' + settings_pat + ')')
    ellipsis_search = regex.search(ellipsis_pat, input_string)
    if ellipsis_search:
        spos, epos = ellipsis_search.span()
        return (Tokenizer(input_string[:spos]).wordlist
                + Tokenizer(input_string[spos:epos]).wordlist
                + Tokenizer(input_string[epos:]).wordlist)
    return Words()
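# A behavior sketch for `_find_ellipsis_words` (hypothetical input; it
# assumes settings['ellipsis_patterns'] contains a pattern matching
# r'\dots'):
#
#   self._find_ellipsis_words(r'first \dots last')
#   # -> Words containing 'first', the ellipsis token, and 'last'
#   self._find_ellipsis_words('no ellipsis here')
#   # -> Words() (empty, so callers can test the result for truthiness)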
def test_unicode_blocks(self):
    for block in blocks:
        # Test the last code point in each block.
        point = chr(int(blocks[block][1], 16))
        if not regex.match(
                r'[{}]+'.format(''.join(settings['punctuation'])), point):
            assert Tokenizer(point).wordlist == [point]
def test_edtext_with_nested_brackets(self):
    text = r'\edtext{entry \emph{nested \emph{b}}}{\Bfootnote{fnote}} nice'
    expect = ['entry', 'nested', 'b', 'nice']
    registry = [{'lvl': 0, 'data': [0, 2]}]
    tokenization = Tokenizer(text)
    assert tokenization.wordlist == expect
    assert tokenization.wordlist.write() == text
    assert tokenization.registry == registry
def test_edtext_by_another_edtext_are_separated(self):
    text = r"\edtext{a}{\Bfootnote{b}},\edtext{c}{\Bfootnote{d}}"
    wordlist = ['a', 'c']
    registry = [{'lvl': 0, 'data': [0, 0]}, {'lvl': 0, 'data': [1, 1]}]
    tokenization = Tokenizer(text)
    assert tokenization.wordlist == wordlist
    assert tokenization.registry == registry
    assert tokenization.wordlist.write() == text
def test_registry(self):
    text = r'text \edtext{emphasis}{\Bfootnote{fnote}} is nice'
    expect = ['text', 'emphasis', 'is', 'nice']
    registry = [{'lvl': 0, 'data': [1, 1]}]
    tokenization = Tokenizer(text)
    assert tokenization.wordlist == expect
    assert tokenization.wordlist.write() == text
    assert tokenization.registry == registry
def test_tokenizer_two_levels(self):
    text = r"""
    \edtext{lvl1 \edtext{lvl2 }{\Bfootnote{l2-note}}}{\Bfootnote{l1-note}}
    """
    expect = ['', 'lvl1', 'lvl2', '']
    registry = [{'lvl': 0, 'data': [1, 3]}, {'lvl': 1, 'data': [2, 3]}]
    tokenization = Tokenizer(text)
    assert tokenization.wordlist == expect
    assert tokenization.wordlist.write() == text
    assert tokenization.registry == registry
def run_annotation(input_text: str, method: str = 'annotate') -> str:
    tokenization = Tokenizer(input_text)
    matcher = Matcher(tokenization.wordlist, tokenization.registry)
    if method == 'annotate':
        words = matcher.annotate()
    elif method == 'update':
        words = matcher.update()
    else:
        words = matcher.cleanup()
    return words.write()
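# A minimal usage sketch for `run_annotation`. The input string is
# hypothetical; assuming default settings, a word that occurs both
# inside an \edtext{}{} element and in its surrounding context should
# come back wrapped in \sameword{} annotations:
#
#   run_annotation(r"so \edtext{so}{\Bfootnote{note}} so")
#   # expected to return something like:
#   # r"\sameword{so} \edtext{\sameword[1]{so}}{\Bfootnote{note}} \sameword{so}"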
def cleanup(self, wordlist: Words = None) -> Words:
    """Given a Words list, remove all sameword annotations."""
    if not wordlist:
        wordlist = self.words
    for val, word in enumerate(wordlist):
        if word.has_sameword:
            # Remove the macro itself.
            for i in [v for v, m in enumerate(word.macros)
                      if m.name == '\\sameword']:
                close_idx = val + word.macros[i].to_closing
                word.pop_macro(i)
                wordlist[close_idx].pop_suffix()
            # Clean the app note's `\lemma{}` if there is any.
            for i in [v for v, a in enumerate(word.clean_apps)
                      if regex.search(r'\\lemma', a.cont)]:
                # Get the relevant app Element.
                app_note = word.clean_apps[i]
                # Split the apparatus note into before, lemma, and after.
                s, e = self._find_lemma_pos(app_note)
                if s == e:
                    # Empty lemma.
                    break
                el_words = self._find_ellipsis_words(app_note.cont[s:e])
                if el_words:
                    # Tokenize the lemma words and ellipsis.
                    lem_words = el_words
                else:
                    lem_words = Tokenizer(app_note.cont[s:e]).wordlist
                lem_words = self.cleanup(lem_words)
                # Patch the app note up again with the new lemma content.
                bef = app_note.cont[:s]
                after = app_note.cont[e:]
                new = bef + lem_words.write() + after
                # Update the app note Element with the new content.
                word.update_element(app_note, new)
    return wordlist
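# A before/after sketch for `cleanup` (hypothetical input). It is the
# inverse of `annotate`: all \sameword{} wrappers, including any inside
# a \lemma{}, should be stripped while the rest of the string is kept
# intact:
#
#   run_annotation(
#       r"\sameword{a} \edtext{\sameword[1]{a}}{\Bfootnote{n}} \sameword{a}",
#       method='cleanup')
#   # expected: r"a \edtext{a}{\Bfootnote{n}} a"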
def _define_search_words(self, edtext: Words) -> Tuple[List, bool]:
    """
    From the Words that make up the edtext element, determine the
    search words based on either (1) the content of the lemma element
    in the apparatus note or (2) the content of the critical note.

    When the apparatus notes are analyzed (the .clean_apps attribute),
    they are moved into the .ann_apps attribute. This means that an app
    element can never occur in both attributes (as that would result in
    duplicate entries of the app in printing).
    """
    # The apparatus note is the first item in app_entries of the last Word.
    app_note = edtext[-1].clean_apps.pop()
    start, end = self._find_lemma_pos(app_note)
    if start != -1:
        # Content excluding the brackets.
        lemma_content = app_note.cont[start:end]
    else:
        lemma_content = ''
    if lemma_content:
        tokens = self._find_ellipsis_words(lemma_content)
        if tokens:
            ellipsis = True
        else:
            tokens = Tokenizer(lemma_content).wordlist
            ellipsis = False
        lem_wl = Words([w for w in tokens if w.content])
        if ellipsis:
            # Covers ellipsis lemma.
            content = [lem_wl[0].get_text()] + [lem_wl[-1].get_text()]
        elif len(lem_wl) == 1:
            # Covers single-word lemma.
            content = [lem_wl[0].get_text()]
        elif len(lem_wl) > 1:
            # Covers multiword lemma.
            content = lem_wl.clean()
        else:
            content = []
            ellipsis = False
    else:
        content = edtext.clean()
        ellipsis = False
    if not settings['sensitive_context_match']:
        content = [w.lower() for w in content]
    edtext[-1].ann_apps.append(app_note)
    return content, ellipsis
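# A sketch of the lemma cases handled above (hypothetical apparatus
# note contents; it assumes r'\dots' is registered in
# settings['ellipsis_patterns']):
#
#   \lemma{word}             -> content = ['word'],          ellipsis False
#   \lemma{one two three}    -> content = lem_wl.clean(),    ellipsis False
#   \lemma{first \dots last} -> content = ['first', 'last'], ellipsis True
#
# With no \lemma{} in the apparatus note, the search words fall back to
# edtext.clean(), i.e. the edtext content itself.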
def test_registry_with_three_close_levels(self):
    text = (r"so \edtext{\edtext{\edtext{so}{\lemma{so}\Bfootnote{lev "
            r"3}}}{\lemma{so}\Bfootnote{lev 2}}}{\lemma{so}\Bfootnote{lev"
            r" 1}}")
    expect = ['so', 'so']
    registry = [{'lvl': 0, 'data': [1, 1]},
                {'lvl': 1, 'data': [1, 1]},
                {'lvl': 2, 'data': [1, 1]}]
    tokenization = Tokenizer(text)
    assert tokenization.wordlist == expect
    assert tokenization.registry == registry
    assert tokenization.wordlist.write() == text
def test_registry_with_nesting_and_sequential_nested_entries(self):
    text = r"""
    \edtext{lvl1 \edtext{lvl2 \edtext{lvl3-1}{\Bfootnote{n3}} inter
    \edtext{lvl3-2}{\Bfootnote{n4}}}{\Bfootnote{n2}}}{\Bfootnote{n1}}
    """
    expect = ['', 'lvl1', 'lvl2', 'lvl3-1', 'inter', 'lvl3-2']
    registry = [{'lvl': 0, 'data': [1, 5]},
                {'lvl': 1, 'data': [2, 5]},
                {'lvl': 2, 'data': [3, 3]},
                {'lvl': 2, 'data': [5, 5]}]
    tokenization = Tokenizer(text)
    assert tokenization.wordlist == expect
    assert tokenization.wordlist.write() == text
    assert tokenization.registry == registry
def write_tokenization(self, input_text):
    return Tokenizer(input_text).wordlist.write()
def test_whitespace(self):
    text = 'short text\t with some\n space and stuff'
    expect = ['short', 'text', 'with', 'some', 'space', 'and', 'stuff']
    assert Tokenizer(text).wordlist == expect
    assert Tokenizer(text).wordlist.write() == text
def test_punctuation(self):
    text = 'text, with. punctuation.-!"#$&()*+,-./:;<=>?@[]^`| enough?!'
    expect = ['text', 'with', 'punctuation', 'enough']
    tokens = Tokenizer(text)
    assert tokens.wordlist == expect
    assert tokens.wordlist.write() == text
def test_space_macros(self):
    thinspace1 = r'A\,B'
    thinspace2 = r'A\thinspace B'
    enskip = r'A\enskip B'
    quad = r'A\quad B'
    qquad = r'A\qquad B'
    hskip = r'A\hskip{10pt}B'
    enspace = r'A\enspace B'
    negthinspace = r'A\negthinspace B'
    kern = r'A\kern{.5em}B'
    singleword_result = ['AB']
    non_spaced_result = ['A', 'B']
    spaced_result = ['A', '', 'B']
    assert Tokenizer(thinspace1).wordlist == singleword_result
    assert Tokenizer(thinspace2).wordlist == non_spaced_result
    assert Tokenizer(enskip).wordlist == spaced_result
    assert Tokenizer(quad).wordlist == spaced_result
    assert Tokenizer(qquad).wordlist == spaced_result
    assert Tokenizer(hskip).wordlist == non_spaced_result
    assert Tokenizer(enspace).wordlist == spaced_result
    assert Tokenizer(negthinspace).wordlist == spaced_result
    assert Tokenizer(kern).wordlist == non_spaced_result
    assert Tokenizer(thinspace1).wordlist.write() == thinspace1
    assert Tokenizer(thinspace2).wordlist.write() == thinspace2
    assert Tokenizer(enskip).wordlist.write() == enskip
    assert Tokenizer(quad).wordlist.write() == quad
    assert Tokenizer(qquad).wordlist.write() == qquad
    assert Tokenizer(hskip).wordlist.write() == hskip
    assert Tokenizer(enspace).wordlist.write() == enspace
    assert Tokenizer(negthinspace).wordlist.write() == negthinspace
    assert Tokenizer(kern).wordlist.write() == kern
def test_edtext_after_word_content(self):
    text = r"word\edtext{content}{\Bfootnote{note}}"
    assert Tokenizer(text).wordlist == ['word', 'content']
def annotate(self, registry: Registry = None) -> Words:
    """
    Given a registry, determine whether there is a context match of
    the edtext lemma content for each entry and annotate accordingly.
    """
    if not registry:
        registry = self.registry
    for entry in registry:
        # Get the data points for the phrase and its start and end.
        edtext_start = entry['data'][0]
        edtext_end = entry['data'][1] + 1
        edtext_lvl = entry['lvl'] + 1  # Reledmac 1-indexes the levels.
        edtext = self.words[edtext_start:edtext_end]

        # Identify the search words and ellipsis.
        search_ws, ellipsis = self._define_search_words(edtext)
        if ellipsis:
            # If we have a lemma note with ellipsis, we need to establish
            # context for both ellipsis elements (which may be nested
            # inside the edtext).
            ell_sidx = edtext.index(search_ws[0], default=0) + edtext_start
            ell_eidx = edtext.rindex(search_ws[1], default=0) + edtext_start
            el1_ctxt = self._get_contexts(self.words, ell_sidx)
            el2_ctxt = self._get_contexts(self.words, ell_eidx)
            contexts = el1_ctxt + el2_ctxt
        else:
            # Establish the context.
            ctxt_before = self._get_context_before(self.words, edtext_start)
            ctxt_after = self._get_context_after(self.words, edtext_end)
            contexts = ([w.get_text() for w in ctxt_before]
                        + [w.get_text() for w in ctxt_after])

        # Is there a match in either context?
        if search_ws and self._in_context(contexts, search_ws, ellipsis):
            # Annotate the edtext
            # -------------------
            if ellipsis:
                sidx = edtext.index(search_ws[0], default=0)
                eidx = edtext.rindex(search_ws[1], default=0)
                if self._in_context(el1_ctxt, search_ws[0:1], ellipsis):
                    self._add_sameword(edtext[sidx:sidx + 1], edtext_lvl)
                if self._in_context(el2_ctxt, search_ws[-1:], ellipsis):
                    self._add_sameword(edtext[eidx:eidx + 1], edtext_lvl)
            else:
                try:
                    with temp_settings({'sensitive_context_match': False}):
                        sidx, eidx = self._find_index(edtext, search_ws)
                except TypeError:
                    raise ValueError("Looks like edtext and lemma content "
                                     "don't match in "
                                     "'{}'".format(edtext.write()))
                self._process_annotation(edtext, sidx, eidx, edtext_lvl)

            # Annotate the lemma if relevant
            # ------------------------------
            if r'\lemma' in edtext[-1].ann_apps[-1].cont:
                # Get the relevant app Element.
                app_note = edtext[-1].ann_apps[-1]
                # Split the apparatus note into before, lemma, and after.
                s, e = self._find_lemma_pos(app_note)
                if ellipsis:
                    # Tokenize the lemma words and ellipsis, and annotate
                    # the lemma word where the context matches. We want to
                    # annotate words even though they may not be the first
                    # or last index in the tokenized text, so we get the
                    # indices of those (list comp `idxs`) and use them to
                    # index into the tokenized list when replacing.
                    lemma = self._find_ellipsis_words(app_note.cont[s:e])
                    idxs = [i for i, w in enumerate(lemma) if w.content]
                    if self._in_context(el1_ctxt, search_ws[0:1], ellipsis):
                        lemma[idxs[0]] = self._add_sameword(
                            lemma[idxs[0]:idxs[0] + 1], level=0)[0]
                    if self._in_context(el2_ctxt, search_ws[-1:], ellipsis):
                        lemma[idxs[-1]] = self._add_sameword(
                            lemma[idxs[-1]:idxs[-1] + 1], level=0)[0]
                else:
                    lemma = Tokenizer(app_note.cont[s:e]).wordlist
                    lemma = self._process_annotation(lemma, 0, len(lemma), 0)
                # Patch the app note up again with the new lemma content.
                bef = app_note.cont[:s]
                after = app_note.cont[e:]
                new = bef + lemma.write() + after
                # Update the app note Element with the new content.
                edtext[-1].update_element(app_note, new)

            # Then annotate the contexts
            # --------------------------
            if ellipsis:
                for pos, word in zip([ell_sidx, ell_eidx], search_ws):
                    ctxt = (self._get_context_before(self.words, pos)
                            + self._get_context_after(self.words, pos + 1))
                    if self._in_context(ctxt, [word], ellipsis):
                        self._annotate_context(ctxt, [word])
            else:
                for ctxt in [ctxt_before, ctxt_after]:
                    self._annotate_context(ctxt, search_ws)
    return self.words
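# An end-to-end sketch of `annotate` (hypothetical input). Assuming
# default settings, the registry entry for the \edtext{} element makes
# annotate compare its lemma words against the surrounding context and
# wrap every match, in both the running text and the \lemma{}:
#
#   tokenization = Tokenizer(r"one \edtext{one}{\lemma{one}\Bfootnote{n}}")
#   matcher = Matcher(tokenization.wordlist, tokenization.registry)
#   matcher.annotate().write()
#   # expected to return something like:
#   # r"\sameword{one} \edtext{\sameword[1]{one}}"
#   # r"{\lemma{\sameword{one}}\Bfootnote{n}}"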
def test_nested_macro(self):
    text = r'text \emph{with \textbf{nesting} emphasis}'
    expect = ['text', 'with', 'nesting', 'emphasis']
    tokens = Tokenizer(text)
    assert tokens.wordlist == expect
    assert tokens.wordlist.write() == text
def test_single_macro(self):
    text = r'text \emph{emphasis} is nice'
    expect = ['text', 'emphasis', 'is', 'nice']
    tokens = Tokenizer(text)
    assert tokens.wordlist == expect
    assert tokens.wordlist.write() == text
def test_latex_non_breaking_space(self):
    text = '2~dollars'
    expect = ['2', 'dollars']
    assert Tokenizer(text).wordlist == expect
    assert Tokenizer(text).wordlist.write() == text