def get_words(self, unit):
    """Return a dictionary queryset of word pairs matching an unit.

    Tokenizes all source plurals and the context of *unit* with several
    Whoosh analyzers, then filters this manager's queryset down to
    dictionary entries whose source matches any extracted word
    (case-insensitively).
    """
    words = set()

    # Prepare analyzers
    # - simple/standard analyzers simply split words
    # - stemming extracts stems, to catch things like plurals
    # The boolean flag marks analyzers whose tokens should also be
    # combined into multi-word phrases.
    analyzers = [
        (SimpleAnalyzer(), True),
        (SimpleAnalyzer(expression=SPLIT_RE, gaps=True), True),
        (StandardAnalyzer(), False),
        (StemmingAnalyzer(), False),
    ]
    source_language = unit.translation.subproject.project.source_language
    lang_code = source_language.base_code()
    # Add per language analyzer if Whoosh has it
    if has_stemmer(lang_code):
        analyzers.append((LanguageAnalyzer(lang_code), False))
    # Add ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append((NgramAnalyzer(4), False))

    # Extract words from all plurals and from context
    for text in unit.get_source_plurals() + [unit.context]:
        for analyzer, combine in analyzers:
            # Some Whoosh analyzers break on unicode
            new_words = []
            try:
                new_words = [token.text for token in analyzer(text)]
            except (UnicodeDecodeError, IndexError) as error:
                report_error(error, sys.exc_info())
            words.update(new_words)
            # Add combined strings to allow match against multiple word
            # entries, combining up to 5 consecutive words.
            if combine:
                # Start y at x + 1 so only non-empty slices of 1..5
                # words are generated (the previous range also produced
                # empty strings for y <= x, discarded below anyway).
                words.update(
                    ' '.join(new_words[x:y])
                    for x in range(len(new_words))
                    for y in range(x + 1, min(x + 6, len(new_words) + 1))
                )

    # Grab all words in the dictionary
    dictionary = self.filter(
        project=unit.translation.subproject.project,
        language=unit.translation.language
    )

    # An empty token would make the regexp below match everything
    words.discard('')

    if not words:
        # No extracted words, no dictionary
        dictionary = dictionary.none()
    else:
        # Build the query for fetching the words
        # Can not use __in as we want case insensitive lookup
        dictionary = dictionary.filter(
            source__iregex=r'^({0})$'.format(
                '|'.join(re_escape(word) for word in words)
            )
        )

    return dictionary
def get_words(self, unit):
    """
    Returns list of word pairs for an unit.
    """
    extracted = set()

    # Analyzer line-up:
    # - the standard analyzer simply splits words
    # - the stemming analyzer extracts stems, to catch things like plurals
    analyzers = [StandardAnalyzer(), StemmingAnalyzer()]
    source_language = unit.translation.subproject.project.source_language
    lang_code = source_language.base_code()
    # A per-language analyzer, when Whoosh ships one for this language
    if has_stemmer(lang_code):
        analyzers.append(LanguageAnalyzer(lang_code))
    # Ngram analyzer for languages such as Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))

    # Run every analyzer over all plurals and over the context
    for text in unit.get_source_plurals() + [unit.context]:
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode input
            try:
                tokens = [token.text for token in analyzer(force_text(text))]
                extracted.update(tokens)
            except (UnicodeDecodeError, IndexError) as error:
                report_error(error, sys.exc_info())

    # Start from all dictionary entries for this project and language
    dictionary = self.filter(
        project=unit.translation.subproject.project,
        language=unit.translation.language
    )

    if not extracted:
        # No extracted words, no dictionary
        return dictionary.none()

    # Build the query for fetching the words
    # Can not use __in as we want case insensitive lookup
    query = Q()
    for word in extracted:
        query |= Q(source__iexact=word)
    return dictionary.filter(query)
def __init__(self, index_path, language):
    """Open or create a Whoosh search index at *index_path*.

    Sets up the schema, highlight formatter and query parser for the
    given *language*.
    """
    from whoosh import index as whoosh_index
    from whoosh.fields import Schema, TEXT, ID
    from whoosh import qparser
    from whoosh.highlight import UppercaseFormatter
    from whoosh.analysis import SimpleAnalyzer, LanguageAnalyzer
    from whoosh.lang import has_stemmer, has_stopwords
    import os

    # Pick the language-aware analyzer only when Whoosh supports both
    # stemming and stopwords for it; otherwise fall back to a plain split.
    if has_stemmer(language) and has_stopwords(language):
        analyzer = LanguageAnalyzer(language)
    else:
        # TODO Display a warning?
        analyzer = SimpleAnalyzer()

    self.schema = Schema(
        path=ID(unique=True, stored=True),
        body=TEXT(analyzer=analyzer),
    )
    self.formatter = UppercaseFormatter()
    self.index_path = index_path

    # Create the index directory when missing
    if not os.path.exists(index_path):
        try:
            os.mkdir(index_path)
        except OSError as e:
            sys.exit("Error creating Whoosh index: %s" % e)

    # Reuse an existing index, otherwise create a fresh one
    if whoosh_index.exists_in(index_path):
        try:
            self.search_index = whoosh_index.open_dir(index_path)
        except whoosh_index.IndexError as e:
            sys.exit("Error opening whoosh index: {0}".format(e))
    else:
        self.search_index = whoosh_index.create_in(index_path, self.schema)

    # Queries match against either the document body or its path
    self.query_parser = qparser.MultifieldParser(
        ["body", "path"], schema=self.schema
    )
    self.query_parser.add_plugin(qparser.FuzzyTermPlugin())
def __init__(self, index_path, language):
    """Set up the Whoosh schema, index directory and query parser."""
    import os

    from whoosh import index as whoosh_index
    from whoosh import qparser
    from whoosh.analysis import LanguageAnalyzer, SimpleAnalyzer
    from whoosh.fields import ID, TEXT, Schema
    from whoosh.highlight import UppercaseFormatter
    from whoosh.lang import has_stemmer, has_stopwords

    if not has_stemmer(language) or not has_stopwords(language):
        # Whoosh lacks full support for this language, use a dumb splitter.
        # TODO Display a warning?
        analyzer = SimpleAnalyzer()
    else:
        analyzer = LanguageAnalyzer(language)

    self.schema = Schema(path=ID(unique=True, stored=True),
                         body=TEXT(analyzer=analyzer))
    self.formatter = UppercaseFormatter()
    self.index_path = index_path

    # Make sure the on-disk directory exists before touching the index.
    if not os.path.exists(index_path):
        try:
            os.mkdir(index_path)
        except OSError as err:
            sys.exit("Error creating Whoosh index: %s" % err)

    if whoosh_index.exists_in(index_path):
        try:
            self.search_index = whoosh_index.open_dir(index_path)
        except whoosh_index.IndexError as err:
            sys.exit("Error opening whoosh index: %s" % (err))
    else:
        self.search_index = whoosh_index.create_in(index_path, self.schema)

    # Queries match against either the document body or its path.
    parser = qparser.MultifieldParser(["body", "path"], schema=self.schema)
    parser.add_plugin(qparser.FuzzyTermPlugin())
    self.query_parser = parser