def get_words(self, unit):
    """Return list of word pairs for a unit."""
    words = set()
    source_language = unit.translation.component.project.source_language

    # Filters stop words for a language
    try:
        stopfilter = StopFilter(lang=source_language.base_code)
    except NoStopWords:
        stopfilter = StopFilter()

    # Prepare analyzers
    # - simple analyzer just splits words based on regexp
    # - language analyzer if available (it is for English)
    analyzers = [
        SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
        LanguageAnalyzer(source_language.base_code),
    ]

    # Add ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))

    # Extract words from all plurals and from context
    flags = unit.all_flags
    for text in unit.get_source_plurals() + [unit.context]:
        text = strip_string(text, flags).lower()
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode
            try:
                words.update(token.text for token in analyzer(text))
            except (UnicodeDecodeError, IndexError) as error:
                report_error(error)
            if len(words) > 1000:
                break
        if len(words) > 1000:
            break

    if '' in words:
        words.remove('')

    if not words:
        # No extracted words, no dictionary
        return self.none()

    # Build the query for fetching the words
    # We want case insensitive lookup
    return self.filter(
        project=unit.translation.component.project,
        language=unit.translation.language,
        source__iregex=r'(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])'.format(
            '|'.join(re_escape(word) for word in islice(words, 1000))
        ),
    )
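# A minimal standalone sketch of the case-insensitive lookup built above,
# using plain `re` in place of Django's `source__iregex` lookup and a literal
# word set standing in for the analyzer output; everything here is
# illustrative, not part of the original manager code.
import re
from itertools import islice

extracted = {"hello", "world"}
pattern = r"(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])".format(
    "|".join(re.escape(word) for word in islice(extracted, 1000))
)
assert re.search(pattern, "Hello there", re.IGNORECASE)
assert not re.search(pattern, "helloworld", re.IGNORECASE)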
def ChineseAnalyzer(expression=default_pattern, stoplist=None,
                    minsize=2, maxsize=None, gaps=False, stemfn=stem,
                    ignore=None, cachesize=50000):
    """Composes a ChineseTokenizer with a lower case filter, an optional stop
    filter, and a stemming filter.

    >>> ana = ChineseAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["test", "test", "test"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param ignore: a set of words to not stem.
    :param cachesize: the maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use. Use None for no cache, or -1 for an unbounded cache.
    """
    ret = ChineseTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain | StemFilter(stemfn=stemfn, ignore=ignore,
                              cachesize=cachesize)
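# Hedged sketch: apart from the tokenizer, the chain above mirrors Whoosh's
# StemmingAnalyzer, so its behaviour can be approximated with the stock
# RegexTokenizer standing in for ChineseTokenizer (assumed to come from a
# jieba-style integration that is not shown here).
from whoosh.analysis import LowercaseFilter, RegexTokenizer, StemFilter

approx_chain = RegexTokenizer() | LowercaseFilter() | StemFilter()
print([t.text for t in approx_chain("Testing is testing and testing")])
# -> ['test', 'is', 'test', 'and', 'test'] (no stop filter here, since
#    stoplist defaults to None in the function above)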
def FancyAnalyzer(expression=r"\s+", stoplist=STOP_WORDS, minsize=2,
                  maxsize=None, gaps=True, splitwords=True, splitnums=True,
                  mergewords=False, mergenums=False):
    """Composes a RegexTokenizer with an IntraWordFilter, LowercaseFilter, and
    StopFilter.

    >>> ana = FancyAnalyzer()
    >>> [token.text for token in ana("Should I call getInt or get_real?")]
    ["should", "call", "getInt", "get", "int", "get_real", "get", "real"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """
    return (RegexTokenizer(expression=expression, gaps=gaps)
            | IntraWordFilter(splitwords=splitwords, splitnums=splitnums,
                              mergewords=mergewords, mergenums=mergenums)
            | LowercaseFilter()
            | StopFilter(stoplist=stoplist, minsize=minsize))
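# Hedged usage sketch: FancyAnalyzer also ships with Whoosh, so the intra-word
# splitting can be tried against the stock analyzer directly (aliased to avoid
# shadowing the definition above).
from whoosh.analysis import FancyAnalyzer as WhooshFancyAnalyzer

fancy = WhooshFancyAnalyzer()
print([t.text for t in fancy("Should I call getInt or get_real?")])
# The docstring above expects:
# ['should', 'call', 'getInt', 'get', 'int', 'get_real', 'get', 'real']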
def StemmingAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                     minsize=2, maxsize=None, gaps=False, stemfn=stem,
                     ignore=None, cachesize=50000):
    """Composes a RegexTokenizer with a lower case filter, an optional stop
    filter, and a stemming filter.

    >>> ana = StemmingAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["test", "test", "test"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param ignore: a set of words to not stem.
    :param cachesize: the maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use. Use None for no cache, or -1 for an unbounded cache.
    """
    ret = RegexTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain | StemFilter(stemfn=stemfn, ignore=ignore,
                              cachesize=cachesize)
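# Hedged usage sketch against the stock Whoosh analyzer of the same name
# (aliased to avoid shadowing the definition above).
from whoosh.analysis import StemmingAnalyzer as WhooshStemmingAnalyzer

stemming = WhooshStemmingAnalyzer()
print([t.text for t in stemming("Testing is testing and testing")])
# -> ['test', 'test', 'test'] (lowercased, stop words dropped, then stemmed)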
def StandardAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                     minsize=2, maxsize=None, gaps=False):
    """Composes a RegexTokenizer with a LowercaseFilter and optional
    StopFilter.

    >>> ana = StandardAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["testing", "testing", "testing"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """
    ret = RegexTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain
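# Hedged sketch: passing stoplist=None skips the optional StopFilter, so the
# stop words survive (contrast with the default output in the docstring above).
from whoosh.analysis import StandardAnalyzer as WhooshStandardAnalyzer

standard = WhooshStandardAnalyzer(stoplist=None)
print([t.text for t in standard("Testing is testing and testing")])
# -> ['testing', 'is', 'testing', 'and', 'testing']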
def stop(text):
    stopper = StopFilter()
    tokenizer = SimpleAnalyzer()
    tokens = tokenizer(text)
    result = [repr(token.text) for token in stopper(tokens)]
    return ' '.join(result)
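# Hedged usage sketch for the helper above; assumes StopFilter and
# SimpleAnalyzer are the Whoosh classes imported elsewhere in this module.
# With the default stop list, only non-stop tokens remain, each shown via repr().
print(stop("This is a test"))
# Expected output along the lines of: 'test'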
def LanguageAnalyzer(lang, expression=default_pattern, gaps=False,
                     cachesize=50000):
    """Configures a simple analyzer for the given language, with a
    LowercaseFilter, StopFilter, and StemFilter.

    >>> ana = LanguageAnalyzer("es")
    >>> [token.text for token in ana("Por el mar corren las liebres")]
    ['mar', 'corr', 'liebr']

    The list of available languages is in `whoosh.lang.languages`.
    You can use :func:`whoosh.lang.has_stemmer` and
    :func:`whoosh.lang.has_stopwords` to check if a given language has a
    stemming function and/or stop word list available.

    :param expression: The regular expression pattern to use to extract tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param cachesize: the maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use.
    """
    from whoosh.lang import NoStemmer, NoStopWords

    # Make the start of the chain
    chain = (RegexTokenizer(expression=expression, gaps=gaps)
             | LowercaseFilter())

    # Add a stop word filter
    try:
        chain = chain | StopFilter(lang=lang)
    except NoStopWords:
        pass

    # Add a stemming filter
    try:
        chain = chain | StemFilter(lang=lang, cachesize=cachesize)
    except NoStemmer:
        pass

    return chain
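# Hedged sketch: as the docstring notes, whoosh.lang.has_stemmer and
# whoosh.lang.has_stopwords report what the chain will contain before it is
# built; the Spanish example below mirrors the docstring.
from whoosh.lang import has_stemmer, has_stopwords

print(has_stemmer("es"), has_stopwords("es"))  # both True for Spanish
spanish = LanguageAnalyzer("es")
print([t.text for t in spanish("Por el mar corren las liebres")])
# Per the docstring above: ['mar', 'corr', 'liebr']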
def get_terms(self, unit):
    """Return list of term pairs for a unit."""
    words = set()
    source_language = unit.translation.component.project.source_language

    # Filters stop words for a language
    try:
        stopfilter = StopFilter(lang=source_language.base_code)
    except NoStopWords:
        stopfilter = StopFilter()

    # Prepare analyzers
    # - basic simple analyzer to split on non-word chars
    # - simple analyzer just splits words based on regexp to catch in-word dashes
    # - language analyzer if available (it is for English)
    analyzers = [
        SimpleAnalyzer() | stopfilter,
        SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
        LanguageAnalyzer(source_language.base_code),
    ]

    # Add ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))

    # Extract words from all plurals and from context
    flags = unit.all_flags
    for text in unit.get_source_plurals() + [unit.context]:
        text = strip_string(text, flags).lower()
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode
            try:
                words.update(token.text for token in analyzer(text))
            except (UnicodeDecodeError, IndexError):
                report_error(cause="Term words parsing")
            if len(words) > 1000:
                break
        if len(words) > 1000:
            break

    if "" in words:
        words.remove("")

    if not words:
        # No extracted words, no glossary
        return self.none()

    # Build the query for fetching the words
    # We want case insensitive lookup
    words = islice(words, 1000)
    if settings.DATABASES["default"]["ENGINE"] == "django.db.backends.postgresql":
        # Use a regex lookup as it utilizes the pg_trgm index
        results = self.filter(
            source__iregex=r"(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])".format(
                "|".join(re_escape(word) for word in words)
            ),
        )
    else:
        # MySQL
        results = self.filter(
            reduce(
                lambda x, y: x | y,
                (models.Q(source__search=word) for word in words),
            ),
        )

    return results.for_project(unit.translation.component.project).filter(
        language=unit.translation.language
    )
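# Hedged sketch of the non-PostgreSQL branch above: the per-word Q objects are
# OR-ed together with functools.reduce before being passed to filter().
# Assumes Django is importable; the word list and field name are illustrative.
from functools import reduce
from django.db.models import Q

terms = ["term", "glossary"]
combined_q = reduce(lambda x, y: x | y, (Q(source__search=word) for word in terms))
# combined_q is equivalent to Q(source__search='term') | Q(source__search='glossary')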
def get_words(self, unit):
    """Return list of word pairs for a unit."""
    words = set()
    source_language = unit.translation.component.project.source_language

    # Filters stop words for a language
    try:
        stopfilter = StopFilter(lang=source_language.base_code)
    except NoStopWords:
        stopfilter = StopFilter()

    # Prepare analyzers
    # - simple analyzer just splits words based on regexp
    # - language analyzer if available (it is for English)
    analyzers = [
        SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
        LanguageAnalyzer(source_language.base_code),
    ]

    # Add ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))

    # Extract words from all plurals and from context
    flags = unit.all_flags
    for text in unit.get_source_plurals() + [unit.context]:
        text = strip_string(text, flags).lower()
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode
            try:
                words.update(token.text for token in analyzer(text))
            except (UnicodeDecodeError, IndexError) as error:
                report_error(error)
            if len(words) > 1000:
                break
        if len(words) > 1000:
            break

    if "" in words:
        words.remove("")

    if not words:
        # No extracted words, no dictionary
        return self.none()

    # Build the query for fetching the words
    # We want case insensitive lookup
    words = islice(words, 1000)
    if settings.DATABASES["default"]["ENGINE"] == "django.db.backends.postgresql":
        results = self.filter(
            source__search=reduce(
                lambda x, y: x | y, (SearchQuery(word) for word in words)
            ),
        )
    else:
        # MySQL
        results = self.filter(
            reduce(
                lambda x, y: x | y,
                (models.Q(source__search=word) for word in words),
            ),
        )

    return results.filter(
        project=unit.translation.component.project,
        language=unit.translation.language,
    )
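# Hedged sketch of the PostgreSQL branch above: SearchQuery objects from
# django.contrib.postgres.search support the | operator, and the combined
# query is matched via the `source__search` lookup. Assumes a PostgreSQL-backed
# Django project; the word list is illustrative only.
from functools import reduce
from django.contrib.postgres.search import SearchQuery

query_words = ["hello", "world"]
combined_query = reduce(lambda x, y: x | y, (SearchQuery(word) for word in query_words))
# Used as: queryset.filter(source__search=combined_query)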