Пример #1
0
def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
    """
    Split `text` with `jieba_tokenize` and return the case-folded tokens that
    pass the token regex.

    `jieba_tokenize` is imported from `wordfreq.chinese` the first time it is
    needed, i.e. while the module-level name is still None.

    `include_punctuation` switches the filter from TOKEN_RE to
    TOKEN_RE_WITH_PUNCTUATION. `external_wordlist` is handed to
    `jieba_tokenize` unchanged.
    """
    global jieba_tokenize
    if jieba_tokenize is None:
        # Rebinds the module-level name, so later calls skip this import.
        from wordfreq.chinese import jieba_tokenize

    segments = jieba_tokenize(text, external_wordlist=external_wordlist)

    if include_punctuation:
        pattern = TOKEN_RE_WITH_PUNCTUATION
    else:
        pattern = TOKEN_RE

    # Keep only segments the pattern matches at their start, case-folding
    # each survivor.
    kept = []
    for segment in segments:
        if pattern.match(segment):
            kept.append(segment.casefold())
    return kept
Пример #2
0
def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
    """
    Tokenize Chinese text and return a list of case-folded tokens.

    The segmentation itself is done by `jieba_tokenize`, which is imported
    from `wordfreq.chinese` on demand if the module-level name is still None.
    `external_wordlist` is forwarded to it as a keyword argument.

    Tokens are kept only if they match TOKEN_RE, or
    TOKEN_RE_WITH_PUNCTUATION when `include_punctuation` is true.
    """
    global jieba_tokenize
    if jieba_tokenize is None:
        # Lazy import; the `global` declaration makes this binding stick at
        # module level for subsequent calls.
        from wordfreq.chinese import jieba_tokenize

    pieces = jieba_tokenize(text, external_wordlist=external_wordlist)

    accept = TOKEN_RE
    if include_punctuation:
        accept = TOKEN_RE_WITH_PUNCTUATION

    return [piece.casefold() for piece in filter(accept.match, pieces)]
Пример #3
0
def tokenize(text, lang):
    """
    Tokenize `text` using a strategy chosen by the language code `lang`.

    The dispatch is:

    - "ja": segmented by `mecab_tokenize` (imported from `wordfreq.japanese`
      on first use).
    - "zh": segmented by `jieba_tokenize` (imported from `wordfreq.chinese`
      on first use).
    - "tr": delegated entirely to `turkish_tokenize`, which is expected to
      apply Turkish case folding, so that capital I and İ map to ı and i
      respectively.
    - "ar": the text is NFKC-normalized and passed through
      `remove_arabic_marks`, then handled like any other language.
    - Anything else: handled by `simple_tokenize`, a regex-based tokenizer
      that mostly follows the Word Segmentation section of Unicode Annex #29.
      See `simple_tokenize` for details.

    For Japanese and Chinese, tokens that do not match TOKEN_RE are dropped
    and the remaining ones are case-folded here. On the other paths, any
    case folding is up to the helper that the text is handed to.

    Chinese or Japanese text passed with a different language code goes
    through `simple_tokenize` instead of the dedicated segmenters, so it is
    not split into individual words the way the "ja" and "zh" paths do.

    Strings that are looked up in wordfreq are meant to be run through this
    function first, so that they can be expected to match the data.
    """
    global mecab_tokenize, jieba_tokenize

    if lang == "ja":
        if mecab_tokenize is None:
            # Lazy import; rebinds the module-level name for later calls.
            from wordfreq.japanese import mecab_tokenize
        segment = mecab_tokenize
    elif lang == "zh":
        if jieba_tokenize is None:
            # Lazy import; rebinds the module-level name for later calls.
            from wordfreq.chinese import jieba_tokenize
        segment = jieba_tokenize
    elif lang == "tr":
        return turkish_tokenize(text)
    else:
        if lang == "ar":
            normalized = unicodedata.normalize("NFKC", text)
            text = remove_arabic_marks(normalized)
        return simple_tokenize(text)

    # Shared tail for the "ja" and "zh" branches: filter, then case-fold.
    return [word.casefold() for word in segment(text) if TOKEN_RE.match(word)]