def StemmingAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                     minsize=2, maxsize=None, gaps=False, stemfn=stem,
                     ignore=None, cachesize=50000):
    """Composes a RegexTokenizer with a lower case filter, an optional stop
    filter, and a stemming filter.

    >>> ana = StemmingAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["test", "test", "test"]

    :param expression: The regular expression pattern to use to extract
        tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param ignore: a set of words to not stem.
    :param cachesize: the maximum number of stemmed words to cache. The
        larger this number, the faster stemming will be but the more memory
        it will use. Use None for no cache, or -1 for an unbounded cache.
    """

    ret = RegexTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain | StemFilter(stemfn=stemfn, ignore=ignore,
                              cachesize=cachesize)
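
# A minimal usage sketch for StemmingAnalyzer (Schema and TEXT are from
# whoosh.fields; the "content" field name is only an example):
#
#     from whoosh.fields import Schema, TEXT
#
#     # Stemming at index time means "testing", "tested" and "tests"
#     # all reduce to "test", so they match a query for "test".
#     schema = Schema(content=TEXT(analyzer=StemmingAnalyzer()))
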
def ChineseAnalyzer(expression=default_pattern, stoplist=None,
                    minsize=2, maxsize=None, gaps=False, stemfn=stem,
                    ignore=None, cachesize=50000):
    """Composes a ChineseTokenizer with a lower case filter, an optional stop
    filter, and a stemming filter.

    >>> ana = ChineseAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["test", "test", "test"]

    :param expression: The regular expression pattern to use to extract
        tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param ignore: a set of words to not stem.
    :param cachesize: the maximum number of stemmed words to cache. The
        larger this number, the faster stemming will be but the more memory
        it will use. Use None for no cache, or -1 for an unbounded cache.
    """

    ret = ChineseTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain | StemFilter(stemfn=stemfn, ignore=ignore,
                              cachesize=cachesize)
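
# A sketch of wiring ChineseAnalyzer into a Whoosh schema (Schema, TEXT and
# ID are from whoosh.fields; the "path"/"content" field names are only
# examples):
#
#     from whoosh.fields import Schema, TEXT, ID
#
#     # The analyzer segments Chinese text into word tokens before indexing,
#     # so term and phrase queries work on word boundaries rather than on
#     # whitespace-delimited runs of characters.
#     schema = Schema(path=ID(stored=True),
#                     content=TEXT(analyzer=ChineseAnalyzer()))
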
def LanguageAnalyzer(lang, expression=default_pattern, gaps=False,
                     cachesize=50000):
    """Configures a simple analyzer for the given language, with a
    LowercaseFilter, StopFilter, and StemFilter.

    >>> ana = LanguageAnalyzer("es")
    >>> [token.text for token in ana("Por el mar corren las liebres")]
    ['mar', 'corr', 'liebr']

    The list of available languages is in `whoosh.lang.languages`.
    You can use :func:`whoosh.lang.has_stemmer` and
    :func:`whoosh.lang.has_stopwords` to check if a given language has a
    stemming function and/or stop word list available.

    :param expression: The regular expression pattern to use to extract
        tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param cachesize: the maximum number of stemmed words to cache. The
        larger this number, the faster stemming will be but the more memory
        it will use.
    """

    from whoosh.lang import NoStemmer, NoStopWords

    # Make the start of the chain
    chain = (RegexTokenizer(expression=expression, gaps=gaps)
             | LowercaseFilter())

    # Add a stop word filter
    try:
        chain = chain | StopFilter(lang=lang)
    except NoStopWords:
        pass

    # Add a stemming filter
    try:
        chain = chain | StemFilter(lang=lang, cachesize=cachesize)
    except NoStemmer:
        pass

    return chain
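
# A sketch of checking language support up front with the whoosh.lang helpers
# mentioned in the docstring above; the fallback to StandardAnalyzer and the
# analyzer_for() helper name are assumptions, not part of this module:
#
#     from whoosh.lang import has_stemmer, has_stopwords
#
#     def analyzer_for(lang):
#         # If the language has neither a stemmer nor a stop word list,
#         # LanguageAnalyzer would only lowercase tokens, so fall back to a
#         # plain StandardAnalyzer instead.
#         if has_stemmer(lang) or has_stopwords(lang):
#             return LanguageAnalyzer(lang)
#         return StandardAnalyzer()
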