Example #1
import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex
from spacy.lang.char_classes import (ALPHA, ALPHA_LOWER, ALPHA_UPPER,
                                     CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS)


def custom_tokenizer(nlp):

    infixes = (
            LIST_ELLIPSES
            + LIST_ICONS
            + [
                r"(?<=[0-9])[+\-\*^](?=[0-9-])",
                r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                    al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
                ),
                # REMOVE: commented out regex that splits on hyphens between letters:
                # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
                # EDIT: remove split on slash between letters, and add comma
                # r'(?<=[{a}0-9])[:<>=/](?=[{a}])'.format(a=ALPHA),
                r'(?<=[{a}0-9])[:<>=,](?=[{a}])'.format(a=ALPHA),
                # ADD: ampersand as an infix character except for dual upper FOO&FOO variant
                r'(?<=[{a}0-9])[&](?=[{al}0-9])'.format(a=ALPHA, al=ALPHA_LOWER),
                r'(?<=[{al}0-9])[&](?=[{a}0-9])'.format(a=ALPHA, al=ALPHA_LOWER),
            ]
    )

    infix_re = compile_infix_regex(infixes)

    return Tokenizer(nlp.vocab, prefix_search=nlp.tokenizer.prefix_search,
                     suffix_search=nlp.tokenizer.suffix_search,
                     infix_finditer=infix_re.finditer,
                     token_match=nlp.tokenizer.token_match,
                     rules=nlp.Defaults.tokenizer_exceptions)
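A minimal usage sketch for the function above; the model name and sample
text are illustrative assumptions:

nlp = spacy.load('en_core_web_sm')
nlp.tokenizer = custom_tokenizer(nlp)
# hyphenated compounds stay whole; '&' splits between lowercase letters,
# but dual-uppercase names like 'AT&T' are left intact
print([t.text for t in nlp('A state-of-the-art AT&T deal in black&white')])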
Example #2
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 use_regex: bool = True) -> None:
        super().__init__(lazy=False)
        self.token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }

        self.nlp = spacy.load('en_core_web_sm')

        if use_regex:
            # extra patterns are appended as whole regex strings
            infix_re = compile_infix_regex(self.nlp.Defaults.infixes +
                                           (r'-', r'[/+=\(\)\[\]]'))
            prefix_re = compile_prefix_regex(self.nlp.Defaults.prefixes +
                                             (r'[\'\(\[]',))
            suffix_re = compile_suffix_regex(self.nlp.Defaults.suffixes +
                                             (r'[\.\+\)\]]',))

            self.nlp.tokenizer = Tokenizer(
                self.nlp.vocab,
                prefix_search=prefix_re.search,
                suffix_search=suffix_re.search,
                infix_finditer=infix_re.finditer,
                token_match=self.nlp.tokenizer.token_match)
Example #3
from spacy.lang.fr import French


def clean_text(txt):
    nlp = French()
    listcode = [x + 45 for x in range(99)]
    postalcod = lambda dd, liscode: (str(int(dd) * 1000)
                                     if dd.isdigit() and int(dd) in liscode
                                     else dd)
    customize_remove_PUNCT = ['%']
    for w in customize_remove_PUNCT:
        nlp.vocab[w].is_punct = False
    customize_add_PUNCT = [
        '>', '=', '$', '™', 'eee', 'ee', 'e', "EE", "EEE", "E", ":"
    ]
    for w in customize_add_PUNCT:
        nlp.vocab[w].is_punct = True
    reg = r"(?<=[0-9])[+\-\*^](?=[0-9-])"
    list_infixes_defaults = list(nlp.Defaults.infixes)
    if reg in list_infixes_defaults:
        list_infixes_defaults.remove(reg)
    # modify process_text infix patterns(dd-dd-dd)
    infixes = (list_infixes_defaults + [r"(?<=[0-9])[\+\*^](?=[0-9-])"])
    infix_re = compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infix_re.finditer
    doc = nlp(txt)
    tokens = [
        postalcod(w.text.lower(), listcode) for w in doc
        if w.text != 'n' and not w.is_punct and not w.is_space
        and not (w.like_num and len(w.text) > 5) and not len(w.text) > 11
        and not w.is_quote
    ]
    return ' '.join(tokens)
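A hypothetical call (the French sentence is illustrative); department code
'45' falls inside listcode, so it should be expanded to '45000':

print(clean_text('Il vit dans le 45 depuis 2004'))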
Example #4
    def custom_tokenizer(self):
        """ Function that defines a tokenizer in order to be used
        
        Parameters
        -----------
        nlp:  spacy loaded object
        return: prepared tokenizer
        """

        infixes = (
            LIST_ELLIPSES + LIST_ICONS + [
                r"(?<=[0-9])[+\-\*^](?=[0-9-])",
                r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                    al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
                r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
                #r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
                r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
            ])

        infix_re = compile_infix_regex(infixes)

        return Tokenizer(self.nlp.vocab,
                         prefix_search=self.nlp.tokenizer.prefix_search,
                         suffix_search=self.nlp.tokenizer.suffix_search,
                         infix_finditer=infix_re.finditer,
                         token_match=self.nlp.tokenizer.token_match,
                         rules=self.nlp.Defaults.tokenizer_exceptions)
Example #5
def custom_tokenizer(nlp):
    # _api_invoc, _var, _hashtags and _protect are regex lists / patterns
    # defined elsewhere in the source module
    # add '\.|-|~' and remove '#' (default prefixes list)
    hashtag_index = nlp.Defaults.prefixes.index('#')
    _prefixes = list(nlp.Defaults.prefixes) + [r'^\.|^~|^-(?=\S)']
    del _prefixes[hashtag_index]
    # add '\.' and remove '#' (default suffixes list)
    # add _api_calls regex
    hashtag_index = nlp.Defaults.suffixes.index('#')
    _suffixes = list(nlp.Defaults.suffixes) + _api_invoc + _var + [r'\.$']
    del _suffixes[hashtag_index]
    # add '\(|\[' to split nested api calls, arrays etc (default infixes list)
    # add _hashtags regex
    _infixes = list(nlp.Defaults.infixes) + _hashtags + \
        [r'\(|\)|\[|\]|\{|\}|<|>|,|=|\+|-|:|;|\'|\"|\/|&|\?']
    # setup each regex using native spaCy util functions
    prefix_re = util.compile_prefix_regex(_prefixes)
    suffix_re = util.compile_suffix_regex(_suffixes)
    infix_re = util.compile_infix_regex(_infixes)
    _tokenizer_exceptions = nlp.Defaults.tokenizer_exceptions
    return Tokenizer(nlp.vocab,
                     _tokenizer_exceptions,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=_protect.match)
Example #6
def setup_tokenizer():
    '''
    Function to set up a tokenizer with specific rules to not split words or
    numbers that include hyphens.
    '''

    nlp = spacy.load('en_core_web_sm')

    # Default infixes
    inf = list(nlp.Defaults.infixes)

    # Remove the generic op between numbers or between a number and a hyphen
    inf.remove(r"(?<=[0-9])[+\-\*^](?=[0-9-])")

    # Convert inf to tuple
    inf = tuple(inf)

    # Add the removed rule after subtracting (?<=[0-9])-(?=[0-9]) pattern
    infixes = inf + tuple([r"(?<=[0-9])[+*^](?=[0-9-])", r"(?<=[0-9])-(?=-)"])

    # Remove hyphen between letters rule
    infixes = [x for x in infixes if '-|–|—|--|---|——|~' not in x]
    infix_re = compile_infix_regex(infixes)

    nlp.tokenizer = Tokenizer(nlp.vocab,
                              prefix_search=nlp.tokenizer.prefix_search,
                              suffix_search=nlp.tokenizer.suffix_search,
                              infix_finditer=infix_re.finditer,
                              token_match=nlp.tokenizer.token_match,
                              rules=nlp.Defaults.tokenizer_exceptions)

    return nlp
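Illustrative usage, assuming the same imports as the other examples here;
the sample text is an assumption:

nlp = setup_tokenizer()
# '3-4' and 'well-known' should each survive as single tokens
print([t.text for t in nlp('a 3-4 week course of a well-known drug')])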
Example #7
def create_custom_tokenizer(nlp):
    from spacy import util
    from spacy.tokenizer import Tokenizer
    from spacy.lang.tokenizer_exceptions import TOKEN_MATCH
    prefixes = nlp.Defaults.prefixes + ('^<i>', )
    suffixes = nlp.Defaults.suffixes + ('</i>$', )
    # remove the tag symbols from prefixes and suffixes
    prefixes = list(prefixes)
    prefixes.remove('<')
    prefixes = tuple(prefixes)
    suffixes = list(suffixes)
    suffixes.remove('>')
    suffixes = tuple(suffixes)
    infixes = nlp.Defaults.infixes
    rules = nlp.Defaults.tokenizer_exceptions
    token_match = TOKEN_MATCH
    prefix_search = (util.compile_prefix_regex(prefixes).search)
    suffix_search = (util.compile_suffix_regex(suffixes).search)
    infix_finditer = (util.compile_infix_regex(infixes).finditer)
    return Tokenizer(nlp.vocab,
                     rules=rules,
                     prefix_search=prefix_search,
                     suffix_search=suffix_search,
                     infix_finditer=infix_finditer,
                     token_match=token_match)
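A sketch of the intended effect on italic markup (model name and sample
text assumed):

import spacy
nlp = spacy.load('en_core_web_sm')
nlp.tokenizer = create_custom_tokenizer(nlp)
# '<i>' and '</i>' should come off as single prefix/suffix tokens
print([t.text for t in nlp('<i>word</i>')])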
Example #8
def custom_tokenizer(nlp):
    prefix_re = compile_prefix_regex(Language.Defaults.prefixes + (';', r'\*'))
    suffix_re = compile_suffix_regex(Language.Defaults.suffixes + (';', r'\*'))
    infix_re = compile_infix_regex(Language.Defaults.infixes +
                                   (r'\(', r'\)', '/', '-', ';', r'\*'))
    return Tokenizer(nlp.vocab,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=None)
Example #9
    def _get_infix_regex(self):
        """
        Custom infix tokenization rules
        :return:
        """
        custom_infixes = [r'\[\]', r'(?<=[0-9])-(?=[0-9])', r'[!&:,()\*/><-]']
        infix_re = compile_infix_regex(
            tuple(list(self.nlp.Defaults.infixes) + custom_infixes))

        return infix_re
Example #10
    def _get_infix_regex(self):
        """
        Custom infix tokenization rules
        :return:
        """
        custom_infixes = [r'\.']  # a bare '.' would match any character
        infix_re = compile_infix_regex(
            tuple(list(self.nlp.Defaults.infixes) + custom_infixes))

        return infix_re
Example #11
 def _get_infix_regex(self):
     # Customize Spacy tokenization to NOT split words with hyphens
     # Source: https://spacy.io/usage/linguistic-features#native-tokenizers
     return compile_infix_regex(LIST_ELLIPSES + LIST_ICONS + [
         r"(?<=[0-9])[+\-\*^](?=[0-9-])",
         r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
             al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         # EDIT: commented out regex that splits on hyphens between letters:
         # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
         r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
     ]).finditer
Example #12
def customize_infixes():
    infixes = (
        LIST_ELLIPSES + LIST_ICONS + [
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
            r"(?<=[{a}0-9])[:<>/](?=[{a}])".format(a=ALPHA),
        ])
    infix_re = compile_infix_regex(infixes)
    return infix_re
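The compiled regex can then be attached to an existing pipeline, e.g.
(model name assumed):

import spacy
nlp = spacy.load('en_core_web_sm')
nlp.tokenizer.infix_finditer = customize_infixes().finditer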
Example #13
def create_custom_tokenizer(nlp):
    # custom_suffixes is a list of extra suffix patterns defined elsewhere
    prefixes = compile_prefix_regex(nlp.Defaults.prefixes)
    infixes = compile_infix_regex(nlp.Defaults.infixes)
    suffixes = compile_suffix_regex(
        tuple(list(nlp.Defaults.suffixes) + custom_suffixes))

    return Tokenizer(nlp.vocab,
                     rules=nlp.Defaults.tokenizer_exceptions,
                     prefix_search=prefixes.search,
                     infix_finditer=infixes.finditer,
                     suffix_search=suffixes.search,
                     token_match=None)
Example #14
def my_nlp(model):
    # reg (the infix pattern to drop) and patterns (for the EntityRuler)
    # are defined elsewhere in the source module
    nlp = spacy.load(model)
    list_infixes_defaults = list(nlp.Defaults.infixes)
    if reg in list_infixes_defaults:
        list_infixes_defaults.remove(reg)
    # modify tokenizer infix patterns(dd-dd-dd)
    infixes = (list_infixes_defaults + [r"(?<=[0-9])[\+\*^](?=[0-9-])"])
    infix_re = compile_infix_regex(infixes)

    nlp.tokenizer.infix_finditer = infix_re.finditer
    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler, before='ner')
    return nlp
Example #15
 def __init__(self):
     self.nlp = spacy.load("en_core_web_lg")
     infixes = (
         LIST_ELLIPSES + LIST_ICONS + [
             r"(?<=[0-9])[+\-\*^](?=[0-9-])",
             r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                 al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
             r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
             # EDIT: commented out regex that splits on hyphens between letters:
             # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
             r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
         ])
     infix_re = compile_infix_regex(infixes)
     self.nlp.tokenizer.infix_finditer = infix_re.finditer
Example #16
def extend_tokenizer(nlp, pref, inf, suf):
    pref = tuple(
        pref + list(nlp.Defaults.prefixes)) if pref else nlp.Defaults.prefixes
    suf = tuple(suf +
                list(nlp.Defaults.suffixes)) if suf else nlp.Defaults.suffixes
    inf = tuple(inf +
                list(nlp.Defaults.infixes)) if inf else nlp.Defaults.infixes
    tok = "^(?:" + "|".join([RE[r]["str"] for r in RE['tok_patterns']]) + ")$"
    return Tokenizer(
        nlp.vocab,
        rules=nlp.Defaults.tokenizer_exceptions,
        prefix_search=spacyUtil.compile_prefix_regex(pref).search,
        suffix_search=spacyUtil.compile_suffix_regex(suf).search,
        infix_finditer=spacyUtil.compile_infix_regex(inf).finditer,
        token_match=re.compile(tok).match)
Example #17
def test_tokenizer_infix_prefix(en_vocab):
    # the prefix and suffix matches overlap in the suffix lookbehind
    infixes = ["±"]
    suffixes = ["%"]
    infix_re = compile_infix_regex(infixes)
    suffix_re = compile_suffix_regex(suffixes)
    tokenizer = Tokenizer(
        en_vocab,
        infix_finditer=infix_re.finditer,
        suffix_search=suffix_re.search,
    )
    tokens = [t.text for t in tokenizer("±10%")]
    assert tokens == ["±10", "%"]
    explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
    assert tokens == explain_tokens
Example #18
 def custom_tokenizer(nlp):
     infixes = (LIST_ELLIPSES + LIST_ICONS + [
         r"(?<=[0-9])[+\-\*^](?=[0-9-])",
         r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
             al=ALPHA_LOWER, au=ALPHA_UPPER,
             q=CONCAT_QUOTES), r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA)
     ])
     infix_re = compile_infix_regex(infixes)
     return Tokenizer(nlp.vocab,
                      prefix_search=nlp.tokenizer.prefix_search,
                      suffix_search=nlp.tokenizer.suffix_search,
                      infix_finditer=infix_re.finditer,
                      token_match=nlp.tokenizer.token_match,
                      rules=nlp.Defaults.tokenizer_exceptions)
Example #19
def spacy_nlp(nlp):
    customize_add_PUNCT = ['/', '=', '$', '|', '\\', "-"]
    for w in customize_add_PUNCT:
        nlp.vocab[w].is_punct = True

    # modify tokenizer infix patterns
    prefixes = (list(nlp.Defaults.prefixes) + ['/'])
    prefixes_regex = compile_prefix_regex(prefixes)
    nlp.tokenizer.prefix_search = prefixes_regex.search

    infixes = (list(nlp.Defaults.infixes) +
               [r'(?<=[0-9])[|/+\-\*^](?=[0-9-])'])
    infix_re = compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infix_re.finditer

    return nlp
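Illustrative usage (model name and sample text assumed):

import spacy
nlp = spacy_nlp(spacy.load('en_core_web_sm'))
# digits joined by '|', '/', '+', '-' or '*' are now split apart
print([t.text for t in nlp('a ratio of 3/4 or 3|4')])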
Example #20
 def spacy_tokenizer(self, text_spacy):
     # modify tokenizer infix patterns
     infixes = (
         LIST_ELLIPSES + LIST_ICONS + [
             r"(?<=[0-9])[+\-\*^](?=[0-9-])",
             r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                 al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
             r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
             # EDIT: commented out regex that splits on hyphens between letters:
             #r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
             r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
         ])
     infix_re = compile_infix_regex(infixes)
     self.nlp.tokenizer.infix_finditer = infix_re.finditer
     doc = self.nlp(text_spacy)
     return [(t.text, t.pos_) for t in doc]
Example #21
def create_medspacy_tokenizer(nlp):
    """Generates a custom tokenizer to augment the default spacy tokenizer 
        for situations commonly seen in clinical text.
        This includes:
            * Punctuation infixes.  
                For example, this allows the following examples to be more aggresively tokenized as :
                    "Patient complains of c/o" -> [..., 'c', '/', 'o']
                    "chf+cp" -> ['chf', '+', 'cp']
       @param nlp: Spacy language model
    """

    # augment the defaults
    # this is not quite correct: we do not want to break on uppercase, and we
    # do not want to break on all punctuation (periods in particular)
    # infixes = nlp.Defaults.infixes + (r'''[^a-z0-9]''',)
    # instead, escape exactly the punctuation that should break up tokens

    # get all python punctuation
    punctuation_chars = string.punctuation
    # remove periods so that we do not break up '1.5 mg' into '1 . 5 mg'
    punctuation_chars = punctuation_chars.replace('.', '')

    infixes = nlp.Defaults.infixes + (r'''[{}]'''.format(
        re.escape(punctuation_chars)), )
    prefixes = nlp.Defaults.prefixes
    suffixes = nlp.Defaults.suffixes

    # compile
    infix_re = compile_infix_regex(infixes)
    prefix_re = compile_prefix_regex(prefixes)
    suffix_re = compile_suffix_regex(suffixes)

    # Default exceptions could be extended later
    tokenizer_exceptions = nlp.Defaults.tokenizer_exceptions.copy()

    # now create this
    tokenizer = Tokenizer(
        nlp.vocab,
        tokenizer_exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )

    return tokenizer
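Usage following the docstring's own examples (the model name is an
assumption):

import spacy
nlp = spacy.load('en_core_web_sm')
nlp.tokenizer = create_medspacy_tokenizer(nlp)
# 'chf+cp' -> ['chf', '+', 'cp'], while '1.5' stays intact
print([t.text for t in nlp('chf+cp, taking 1.5 mg')])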
Example #22
def custom_en_tokenizer(en_vocab):
    prefix_re = compile_prefix_regex(English.Defaults.prefixes)
    suffix_re = compile_suffix_regex(English.Defaults.suffixes)
    custom_infixes = [
        r"\.\.\.+",
        r"(?<=[0-9])-(?=[0-9])",
        r"[0-9]+(,[0-9]+)+",
        r"[\[\]!&:,()\*—–\/-]",
    ]
    infix_re = compile_infix_regex(custom_infixes)
    return Tokenizer(
        en_vocab,
        English.Defaults.tokenizer_exceptions,
        prefix_re.search,
        suffix_re.search,
        infix_re.finditer,
        token_match=None,
    )
Example #23
def setup_spacy_parser():
    SPACY_PARSER = spacy.load('en', disable=['parser', 'ner'])

    #prefix_re = re.compile(r'''^[]''')
    #suffix_re = re.compile(r'''[]$''')
    #infix_re = re.compile(r'''''')

    # modify tokenizer infix patterns
    infixes = (LIST_ELLIPSES + LIST_ICONS)

    infix_re = compile_infix_regex(infixes)

    SPACY_PARSER.tokenizer.infix_finditer = infix_re.finditer
    SPACY_PARSER.tokenizer.add_special_case("``", [{"ORTH": "``"}])
    SPACY_PARSER.tokenizer.add_special_case("´´", [{"ORTH": "´´"}])
    #SPACY_PARSER.tokenizer.prefix_search = prefix_re.search
    #SPACY_PARSER.tokenizer.suffix_search = suffix_re.search
    return SPACY_PARSER
Example #24
def _custom_tokenizer(nlp):
    inf = list(nlp.Defaults.infixes)  # Default infixes
    inf.remove(
        r"(?<=[0-9])[+\-\*^](?=[0-9-])"
    )  # Remove the generic op between numbers or between a number and a -
    inf = tuple(inf)  # Convert inf to tuple
    infixes = inf + tuple([
        r"(?<=[0-9])[+*^](?=[0-9-])", r"(?<=[0-9])-(?=-)"
    ])  # Add the removed rule after subtracting (?<=[0-9])-(?=[0-9]) pattern
    infixes = [x for x in infixes
               if '-|–|—|--|---|——|~' not in x]  # remove hyphen-between-letters rule
    infix_re = compile_infix_regex(infixes)

    return Tokenizer(nlp.vocab,
                     prefix_search=nlp.tokenizer.prefix_search,
                     suffix_search=nlp.tokenizer.suffix_search,
                     infix_finditer=infix_re.finditer,
                     token_match=nlp.tokenizer.token_match,
                     rules=nlp.Defaults.tokenizer_exceptions)
Example #25
def custom_tokenizer(nlp):
    infixes = list(nlp.Defaults.infixes)

    # add custom tokenization cases:
    # for the case <word>-<word> (probably better left out for German?)
    #infixes.append(r'(?<=[{a}"])[-](?=[{a}])'.format(a=ALPHA))
    # for the case <number>-<word>
    infixes.append(r'(?<=[0-9])[-](?=[{a}])'.format(a=ALPHA))

    infix_re = compile_infix_regex(infixes)
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

    return Tokenizer(nlp.vocab,
                     rules=nlp.Defaults.tokenizer_exceptions,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=None)
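A sketch with a German pipeline; the model name, sample text, and the
ALPHA import from spacy.lang.char_classes are assumptions:

import spacy
nlp = spacy.load('de_core_news_sm')
nlp.tokenizer = custom_tokenizer(nlp)
# the added <number>-<word> rule splits '5-köpfige' into '5', '-', 'köpfige'
print([t.text for t in nlp('eine 5-köpfige Familie')])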
Example #26
def custom_tokenizer(nlp, never_split):
    cls = nlp.Defaults
    rules = cls.tokenizer_exceptions
    token_match = cls.token_match
    prefix_search = (util.compile_prefix_regex(cls.prefixes).search
                     if cls.prefixes else None)
    suffix_search = (util.compile_suffix_regex(cls.suffixes).search
                     if cls.suffixes else None)
    infix_finditer = (util.compile_infix_regex(cls.infixes).finditer
                      if cls.infixes else None)
    vocab = nlp.vocab
    return Tokenizer(
        vocab,
        rules=rules,
        prefix_search=prefix_search,
        suffix_search=suffix_search,
        infix_finditer=infix_finditer,
        token_match=lambda x: token_match(x) or x in never_split,
    )
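A sketch of the never_split hook (model name and sample strings assumed;
util must be imported from spacy):

import spacy
nlp = spacy.load('en_core_web_sm')
nlp.tokenizer = custom_tokenizer(nlp, never_split={'C++', ':)'})
# token_match short-circuits affix splitting for the listed strings
print([t.text for t in nlp('I like C++ :)')])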
Example #27
def add_special_tokenizer_cases(nlp: Language) -> Language:
    # ORTH, NORM, POS and X are spaCy attribute symbols; TOK_VERSION and the
    # TOKENIZER_* lists are defined/imported elsewhere in the module
    infix_re = compile_infix_regex(
        tuple(TOKENIZER_INFIXES + [
            r"(?<=[{a}0-9])([()#\.]+|(-)+([->])+)(?=[{a}0-9])".format(a=ALPHA)
        ]))
    prefix_re = compile_prefix_regex(tuple(TOKENIZER_PREFIXES + [r'^[.-]+']))
    suffix_re = compile_suffix_regex(tuple(TOKENIZER_SUFFIXES + [r'[.-]+$']))
    nlp.tokenizer = Tokenizer(nlp.vocab,
                              prefix_search=prefix_re.search,
                              suffix_search=suffix_re.search,
                              infix_finditer=infix_re.finditer,
                              token_match=None)

    for tok in [
            '==', '+=', '-=', '*=', '/=', '%=', '!=', '<>', '->', '-->', '--',
            '---', TOK_VERSION
    ]:
        nlp.tokenizer.add_special_case(tok, [{ORTH: tok, NORM: tok, POS: X}])
    return nlp
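Hypothetical usage; this assumes TOK_VERSION, the TOKENIZER_* lists, and
the attribute symbols are defined as noted above:

import spacy
nlp = add_special_tokenizer_cases(spacy.load('en_core_web_sm'))
# the special cases keep operators such as '!=' and '-->' as single tokens
print([t.text for t in nlp('a != b --> c')])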
Example #28
def customize_tokenizer(nlp):
    prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes)
    infix_re = util.compile_infix_regex(nlp.Defaults.infixes)
    # Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
    exceptions = {
        k: v
        for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
        if not (len(k) == 2 and k[1] == ".")
    }
    new_tokenizer = Tokenizer(
        nlp.vocab,
        exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )
    nlp.tokenizer = new_tokenizer
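Illustrative effect (model name and sample text assumed; util must be
imported from spacy):

import spacy
nlp = spacy.load('en_core_web_sm')
customize_tokenizer(nlp)
# with the single-letter abbreviation exceptions gone, 'h.' should now
# split into 'h' and '.'
print([t.text for t in nlp('repeat every h. as needed')])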
Example #29
def custom_tokenizer(nlp):
    infixes = (LIST_ELLIPSES + LIST_ICONS + [
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}0-9\.])[:<>=()+—](?=[{a}0-9\.])".format(a=ALPHA),
        r"(?<=[A-Za-z]{2})/(?=[A-Za-z]{2})",
        r"(?:[{a}]\.)+ [{a}0-9]".format(a=ALPHA),
    ])

    infix_re = compile_infix_regex(infixes)
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes + ('-', ))
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes + ('-', ))

    return Tokenizer(nlp.vocab,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=nlp.tokenizer.token_match,
                     rules=nlp.Defaults.tokenizer_exceptions)
Example #30
    def custom_tokenizer(self):
        '''Set up custom tokenizer'''

        default_infix = self.nlp.Defaults.infixes
        default_prefix = self.nlp.Defaults.prefixes
        default_suffix = self.nlp.Defaults.suffixes

        prefix_list = ['mr', 'dr', 'mrs', 'prof', 'ms', 'mx']
        prefix_re_list = self._make_prefix_cases(prefix_list)

        all_infix_re = compile_infix_regex(default_infix)
        all_prefix_re = spacy.util.compile_prefix_regex(
            tuple(list(default_prefix) + prefix_re_list))

        all_suffix_re = compile_suffix_regex(default_suffix)

        return Tokenizer(self.nlp.vocab,
                         prefix_search=all_prefix_re.search,
                         suffix_search=all_suffix_re.search,
                         infix_finditer=all_infix_re.finditer,
                         token_match=None)