Example #1
def combined_rule_prefixes() -> List[str]:
    """Helper function that returns the prefix pattern for the tokenizer.
       It is a helper function to accommodate spacy tests that only test
       prefixes.
    """
    prefix_punct = char_classes.PUNCT.replace("|", " ")

    prefixes = (["§", "%", "=", r"\+"] +
                char_classes.split_chars(prefix_punct) +
                char_classes.LIST_ELLIPSES + char_classes.LIST_QUOTES +
                char_classes.LIST_CURRENCY + char_classes.LIST_ICONS)
    return prefixes
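
A minimal sketch of how this list is consumed, assuming spaCy is installed and the combined_rule_prefixes defined above is in scope; the sample strings and expected matches are illustrative only:

from spacy.util import compile_prefix_regex

# compile_prefix_regex anchors every entry at the start of the candidate token
prefix_re = compile_prefix_regex(combined_rule_prefixes())

print(prefix_re.search("§10"))    # expected: match on "§"
print(prefix_re.search("(word"))  # expected: match on "(" (plain PUNCT, no lookahead in this variant)
print(prefix_re.search("word"))   # expected: None, no prefix to split off
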
Example #2
def combined_rule_prefixes() -> List[str]:
    """Helper function that returns the prefix pattern for the tokenizer.
       It is a helper function to accommodate spacy tests that only test
       prefixes.
    """
    # add lookahead assertions for brackets (may not work properly for unbalanced brackets)
    prefix_punct = char_classes.PUNCT.replace("|", " ")
    prefix_punct = prefix_punct.replace(r"\(", r"\((?![^\(\s]+\)\S+)")
    prefix_punct = prefix_punct.replace(r"\[", r"\[(?![^\[\s]+\]\S+)")
    prefix_punct = prefix_punct.replace(r"\{", r"\{(?![^\{\s]+\}\S+)")

    prefixes = (["§", "%", "=", r"\+"] +
                char_classes.split_chars(prefix_punct) +
                char_classes.LIST_ELLIPSES + char_classes.LIST_QUOTES +
                char_classes.LIST_CURRENCY + char_classes.LIST_ICONS)
    return prefixes
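
The only change from Example #1 is the negative lookahead on opening brackets, which keeps the bracket attached when the bracketed span runs straight into following text, e.g. "(R)-enantiomer". A minimal sketch of the effect, assuming the variant above is in scope; the sample strings are illustrative only:

from spacy.util import compile_prefix_regex

prefix_re = compile_prefix_regex(combined_rule_prefixes())

# "(R)" is immediately followed by "-enantiomer", so the lookahead suppresses the split
print(prefix_re.search("(R)-enantiomer"))  # expected: None
# no closing bracket before the next whitespace, so "(" is still split off
print(prefix_re.search("(see below"))      # expected: match on "("
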
Example #3
def combined_rule_tokenizer(nlp: Language) -> Tokenizer:
    """Creates a custom tokenizer on top of spaCy's default tokenizer. The
       intended use of this function is to replace the tokenizer in a spaCy
       pipeline like so:

            nlp = spacy.load("some_spacy_model")
            nlp.tokenizer = combined_rule_tokenizer(nlp)

       @param nlp: a loaded spaCy model
    """
    # remove the first hyphen to prevent tokenization of the normal hyphen
    hyphens = char_classes.HYPHENS.replace("-|", "", 1)

    infixes = (
        char_classes.LIST_ELLIPSES + char_classes.LIST_ICONS + [
            r"×",  # added this special x character to tokenize it separately
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[{al}])\.(?=[{au}])".format(al=char_classes.ALPHA_LOWER,
                                              au=char_classes.ALPHA_UPPER),
            r"(?<=[{a}]),(?=[{a}])".format(a=char_classes.ALPHA),
            r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(
                a=char_classes.ALPHA, h=hyphens),
            # removed / to prevent tokenization of /
            r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=char_classes.ALPHA),
        ])

    prefixes = combined_rule_prefixes()

    # add the last apostrophe
    quotes = char_classes.LIST_QUOTES.copy() + ["’"]

    # add lookbehind assertions for brackets (may not work properly for unbalanced brackets)
    suffix_punct = char_classes.PUNCT.replace("|", " ")
    # These lookbehinds are commented out because they are variable width lookbehinds, and as of spacy 2.1,
    # spacy uses the re package instead of the regex package. The re package does not support variable width
    # lookbehinds. Hacking spacy internals to allow us to use the regex package is doable, but would require
    # creating our own instance of the language class, with our own Tokenizer class, with the from_bytes method
    # using the regex package instead of the re package
    # suffix_punct = suffix_punct.replace(r"\)", r"(?<!\S+\([^\)\s]+)\)")
    # suffix_punct = suffix_punct.replace(r"\]", r"(?<!\S+\[[^\]\s]+)\]")
    # suffix_punct = suffix_punct.replace(r"\}", r"(?<!\S+\{[^\}\s]+)\}")

    suffixes = (
        char_classes.split_chars(suffix_punct) + char_classes.LIST_ELLIPSES +
        quotes + char_classes.LIST_ICONS +
        ["'s", "'S", "’s", "’S", "’s", "’S"] + [
            r"(?<=[0-9])\+",
            r"(?<=°[FfCcKk])\.",
            r"(?<=[0-9])(?:{})".format(char_classes.CURRENCY),
            # this is another place where a variable-width lookbehind was used
            # (previously (^[0-9]+)), which re does not support; with the
            # fixed-width version, strings like 'H3g' are tokenized as ['H3', 'g']
            r"(?<=[0-9])(?:{u})".format(u=char_classes.UNITS),
            r"(?<=[0-9{}{}(?:{})])\.".format(char_classes.ALPHA_LOWER,
                                             r"%²\-\)\]\+", "|".join(quotes)),
            # add |\d to split off the period of a sentence that ends with 1D.
            r"(?<=[{a}|\d][{a}])\.".format(a=char_classes.ALPHA_UPPER),
        ])

    infix_re = compile_infix_regex(infixes)
    prefix_re = compile_prefix_regex(prefixes)
    suffix_re = compile_suffix_regex(suffixes)

    # Update exclusions to include these abbreviations so the period is not split off
    exclusions = {
        abbreviation: [{
            ORTH: abbreviation
        }]
        for abbreviation in ABBREVIATIONS
    }
    tokenizer_exceptions = nlp.Defaults.tokenizer_exceptions.copy()
    tokenizer_exceptions.update(exclusions)

    tokenizer = Tokenizer(
        nlp.vocab,
        tokenizer_exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )
    return tokenizer
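
A minimal usage sketch of the replacement pattern described in the docstring, assuming the small English model en_core_web_sm is installed and combined_rule_tokenizer above is importable; the sample sentence is illustrative only:

import spacy

nlp = spacy.load("en_core_web_sm")  # any installed spaCy model; the docstring says "some_spacy_model"
nlp.tokenizer = combined_rule_tokenizer(nlp)

doc = nlp("The (R)-enantiomer was dissolved in 5ml of H2O.")
print([token.text for token in doc])
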
Example #4
def combined_rule_tokenizer(nlp: Language) -> Tokenizer:
    """Creates a custom tokenizer on top of spaCy's default tokenizer. The
       intended use of this function is to replace the tokenizer in a spaCy
       pipeline like so:

            nlp = spacy.load("some_spacy_model")
            nlp.tokenizer = combined_rule_tokenizer(nlp)

       @param nlp: a loaded spaCy model
    """
    # remove the first hyphen to prevent tokenization of the normal hyphen
    hyphens = char_classes.HYPHENS.replace("-|", "", 1)

    infixes = (
        char_classes.LIST_ELLIPSES + char_classes.LIST_ICONS + [
            r'×',  # added this special x character to tokenize it separately
            r'(?<=[0-9])[+\-\*^](?=[0-9-])',
            r'(?<=[{}])\.(?=[{}])'.format(char_classes.ALPHA_LOWER,
                                          char_classes.ALPHA_UPPER),
            r'(?<=[{a}]),(?=[{a}])'.format(a=char_classes.ALPHA),
            r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(
                a=char_classes.ALPHA, h=hyphens),
            # removed / to prevent tokenization of /
            r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=char_classes.ALPHA)
        ])

    prefixes = combined_rule_prefixes()

    # add the last apostrophe
    quotes = char_classes.QUOTES.replace('|', ' ') + " ’"

    # add lookbehind assertions for brackets (may not work properly for unbalanced brackets)
    suffix_punct = char_classes.PUNCT.replace('|', ' ')
    suffix_punct = suffix_punct.replace(r"\)", r"(?<!\S+\([^\)\s]+)\)")
    suffix_punct = suffix_punct.replace(r"\]", r"(?<!\S+\[[^\]\s]+)\]")
    suffix_punct = suffix_punct.replace(r"\}", r"(?<!\S+\{[^\}\s]+)\}")

    suffixes = (
        char_classes.split_chars(suffix_punct) + char_classes.LIST_ELLIPSES +
        char_classes.split_chars(quotes) + char_classes.LIST_ICONS +
        ["'s", "'S", "’s", "’S", "’s", "’S"] + [
            r'(?<=[0-9])\+',
            r'(?<=°[FfCcKk])\.',
            r'(?<=[0-9])(?:{})'.format(char_classes.CURRENCY),
            # the lookbehind requires the digits to start the token (or follow a prefix),
            # so tokens like 'h3g' do not get the 'g' split off as a unit
            r'(?<=(^[0-9]+|\s{p}*[0-9]+))(?:{u})'.format(u=char_classes.UNITS,
                                                         p=prefixes),
            r'(?<=[0-9{}{}(?:{})])\.'.format(char_classes.ALPHA_LOWER,
                                             r'%²\-\)\]\+',
                                             char_classes.merge_chars(quotes)),
            # add |\d to split off the period of a sentence that ends with 1D.
            r'(?<=[{a}|\d][{a}])\.'.format(a=char_classes.ALPHA_UPPER)
        ])
    infix_re = compile_infix_regex(infixes)
    prefix_re = compile_prefix_regex(prefixes)
    suffix_re = compile_suffix_regex(suffixes)

    # Update exclusions to include these abbreviations so the period is not split off
    exclusions = {
        abbreviation: [{
            ORTH: abbreviation
        }]
        for abbreviation in ABBREVIATIONS
    }
    tokenizer_exceptions = nlp.Defaults.tokenizer_exceptions.copy()
    tokenizer_exceptions.update(exclusions)

    tokenizer = Tokenizer(nlp.vocab,
                          tokenizer_exceptions,
                          prefix_search=prefix_re.search,
                          suffix_search=suffix_re.search,
                          infix_finditer=infix_re.finditer,
                          token_match=nlp.tokenizer.token_match)
    return tokenizer
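
This variant keeps the bracket lookbehinds that Example #3 comments out. They are variable-width, which the standard library re module rejects; the third-party regex package accepts them. A minimal sketch of that difference, assuming the regex package is installed:

import re
import regex

pattern = r"(?<!\S+\([^\)\s]+)\)"

try:
    re.compile(pattern)
except re.error as err:
    print("re:", err)   # re raises "look-behind requires fixed-width pattern"

regex.compile(pattern)  # the regex module supports variable-width lookbehinds
print("regex: compiled OK")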