def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
    """
    Tokenize Chinese text, initializing the Jieba tokenizer if necessary.

    `include_punctuation` keeps punctuation tokens in the output;
    `external_wordlist` is passed through to the Jieba tokenizer.
    Returns a list of case-folded tokens that match the token regex.
    """
    global jieba_tokenize
    # Lazily import the Jieba-based tokenizer on first use, caching it in
    # the module-level name so later calls skip the import machinery.
    if jieba_tokenize is None:
        from wordfreq.chinese import jieba_tokenize
    tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
    return [token.casefold() for token in tokens if token_expr.match(token)]
def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
    """
    Tokenize Chinese text, initializing the Jieba tokenizer if necessary.

    Tokens are filtered by the token regex (optionally including
    punctuation) and case-folded before being returned.
    """
    global jieba_tokenize
    # First call: bind the Jieba tokenizer into the module-level slot so
    # the import only ever happens once.
    if jieba_tokenize is None:
        from wordfreq.chinese import jieba_tokenize

    if include_punctuation:
        pattern = TOKEN_RE_WITH_PUNCTUATION
    else:
        pattern = TOKEN_RE

    result = []
    for tok in jieba_tokenize(text, external_wordlist=external_wordlist):
        if pattern.match(tok):
            result.append(tok.casefold())
    return result
def tokenize(text, lang):
    """
    Tokenize this text in a way that's relatively simple but appropriate for
    the language. So far, this means:

    - Chinese will be tokenized using the Jieba tokenizer (via
      `chinese_tokenize`).
    - Japanese will be delegated to the external mecab-python module.
    - Turkish will use a different case-folding procedure, so that capital
      I and İ map to ı and i respectively.
    - All other languages will be tokenized using a regex that mostly
      implements the Word Segmentation section of Unicode Annex #29.
      See `simple_tokenize` for details.

    Additionally, the text will be case-folded to lowercase, and text marked
    as Arabic will be normalized more strongly and have combining marks and
    tatweels removed.

    Strings that are looked up in wordfreq will be run through this function
    first, so that they can be expected to match the data.
    """
    if lang == "ja":
        global mecab_tokenize
        # Lazily import MeCab on the first Japanese call.
        if mecab_tokenize is None:
            from wordfreq.japanese import mecab_tokenize
        tokens = mecab_tokenize(text)
        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]

    if lang == "zh":
        # Delegate to the chinese_tokenize helper defined in this module,
        # which handles the lazy Jieba import itself. The defaults
        # (no punctuation, internal wordlist) match the previous inline
        # behavior exactly.
        return chinese_tokenize(text)

    if lang == "tr":
        return turkish_tokenize(text)

    if lang == "ar":
        # Normalize Arabic more aggressively before the generic tokenizer:
        # NFKC compatibility normalization, then strip combining marks and
        # tatweels.
        text = remove_arabic_marks(unicodedata.normalize("NFKC", text))

    return simple_tokenize(text)