def __init__(
    self,
    stoplist=STOP_WORDS,
    minsize=2,
    maxsize=None,
    renumber=True,
    lang=None,
):
    """Build the stop-word set and record the token length limits.

    :param stoplist: Iterable of words to drop from the token stream. Its
        contents are copied into a frozenset. Defaults to a list of common
        English stop words.
    :param minsize: Shortest token text allowed; shorter tokens are
        stopped. Defaults to 2.
    :param maxsize: Longest token text allowed; longer tokens are stopped.
        ``None`` means no upper limit.
    :param renumber: Whether the 'pos' attribute of surviving tokens is
        rewritten to reflect their position once stopped words are removed.
    :param lang: Optional language name/code; when given, that language's
        stop words are added to the ones from ``stoplist``.
    """
    # Start from the explicit list (if any); a falsy stoplist adds nothing.
    words = set(stoplist) if stoplist else set()

    if lang:
        # Imported lazily, only when a language was actually requested.
        from whoosh.lang import stopwords_for_language

        words.update(stopwords_for_language(lang))

    # Freeze the merged collection so it cannot be mutated afterwards.
    self.stops = frozenset(words)
    self.min, self.max = minsize, maxsize
    self.renumber = renumber
def __init__(self, stoplist=STOP_WORDS, minsize=2, maxsize=None,
             renumber=True, lang=None):
    """Configure which words and token lengths this filter removes.

    :param stoplist: A collection of words to remove from the stream; it is
        merged into a frozenset. By default this is a list of common
        English stop words.
    :param minsize: Minimum token text length. Tokens whose text is
        shorter than this are stopped. The default is 2.
    :param maxsize: Maximum token text length. Tokens whose text is longer
        than this are stopped. Pass None to allow any length.
    :param renumber: If true, the 'pos' attribute of unstopped tokens is
        changed to reflect their position with the stopped words removed.
    :param lang: If given, the stop words for this language are looked up
        automatically and combined with ``stoplist``.
    """
    self.renumber = renumber
    self.min = minsize
    self.max = maxsize

    # An empty or None stoplist contributes no words.
    merged = set(stoplist or ())
    if lang:
        from whoosh.lang import stopwords_for_language

        # Language-specific words are added on top of the explicit list.
        merged |= set(stopwords_for_language(lang))
    self.stops = frozenset(merged)
def __init__(self, rtepair, stop=True, lemmatize=False):
    """Tokenize a text/hypothesis pair and precompute its word-set features.

    :param rtepair: Object providing ``text`` and ``hyp`` attributes, which
        are passed to the tokenizer.
    :param stop: If true, the Portuguese stop words are removed from both
        word sets before the overlap/difference sets are computed.
    :param lemmatize: Either a callable mapping a token to its lemma, or a
        truthy flag. A flag cannot be called itself, so in that case a
        module-level function named ``lemmatize`` is looked up and used.
    :raises TypeError: If ``lemmatize`` is truthy but no callable
        lemmatizer can be resolved.
    """
    self.stop = stop
    self.stopwords = lang.stopwords_for_language("pt")
    # Each negation word must be its own element; a missing comma would
    # silently fuse two adjacent literals into one bogus word.
    self.negwords = {
        'não', 'nunca', 'falhou', 'rejeitou', 'negou', 'sem', 'jamais',
        'nada', 'nenhum', 'nem', 'ninguém', 'menos', 'pouco',
    }

    # Try to tokenize so that abbreviations like U.S. and monetary amounts
    # like "$23.00" are kept as tokens.
    from nltk.tokenize import RegexpTokenizer

    # Raw string avoids invalid-escape warnings. The group is non-capturing
    # because RegexpTokenizer patterns must not contain capturing
    # parentheses (findall would return the group instead of the token).
    tokenizer = RegexpTokenizer(r'(?:[A-Z]\.)+|\w+|\$[\d\.]+')

    # Get the token sequences and the sets of word types for text and
    # hypothesis.
    self.text_tokens = tokenizer.tokenize(rtepair.text)
    self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
    self.text_words = set(self.text_tokens)
    self.hyp_words = set(self.hyp_tokens)

    if lemmatize:
        if callable(lemmatize):
            lemmatizer = lemmatize
        else:
            # The parameter shadows any module-level ``lemmatize`` helper,
            # so it has to be fetched from the module namespace explicitly.
            # NOTE(review): presumably such a helper exists in this module
            # -- confirm; otherwise callers must pass a callable.
            lemmatizer = globals().get("lemmatize")
            if not callable(lemmatizer):
                raise TypeError(
                    "lemmatize must be a callable, or a module-level "
                    "lemmatize() function must be defined"
                )
        self.text_words = {lemmatizer(token) for token in self.text_tokens}
        self.hyp_words = {lemmatizer(token) for token in self.hyp_tokens}

    if self.stop:
        self.text_words = self.text_words - self.stopwords
        self.hyp_words = self.hyp_words - self.stopwords

    # Words shared by both sides, and words unique to each side.
    self._overlap = self.hyp_words & self.text_words
    self._hyp_extra = self.hyp_words - self.text_words
    self._txt_extra = self.text_words - self.hyp_words
def LanguageAnalyzer(lang, expression=default_pattern, gaps=False, cachesize=50000):
    """Build a basic analyzer pipeline for one language.

    The pipeline is a RegexTokenizer followed by a LowercaseFilter, then a
    StopFilter and a StemFilter for the language. The stop filter is left
    out when no stop-word list is available for the language, and the
    stemming filter is left out when no stemmer is available.

    >>> ana = LanguageAnalyzer("es")
    >>> [token.text for token in ana("Por el mar corren las liebres")]
    ['mar', 'corr', 'liebr']

    :param lang: The language whose stop words and stemmer should be used.
    :param expression: The regular expression pattern used to extract
        tokens.
    :param gaps: If True, the tokenizer *splits* on the expression instead
        of matching on it.
    :param cachesize: The maximum number of stemmed words to cache. A
        larger cache makes stemming faster at the cost of more memory.
    """
    from whoosh.lang import NoStemmer, NoStopWords, stopwords_for_language

    # Tokenize, then lowercase: the part of the pipeline that is always present.
    tokenizer = RegexTokenizer(expression=expression, gaps=gaps)
    analyzer = tokenizer | LowercaseFilter()

    # Stop-word removal is optional: skipped if the language has no list.
    try:
        stop_filter = StopFilter(stoplist=stopwords_for_language(lang))
        analyzer = analyzer | stop_filter
    except NoStopWords:
        pass

    # Stemming is optional too: skipped if the language has no stemmer.
    try:
        stem_filter = StemFilter(lang=lang, cachesize=cachesize)
        analyzer = analyzer | stem_filter
    except NoStemmer:
        pass

    return analyzer