Example #1
def search_for_results(userquery, corrected_flag=True):
    searcher = None
    try:
        if os.path.exists(settings.SEARCH_INDEX_DIR):
            # open index directory and create object for searcher class
            index_reference = open_dir(settings.SEARCH_INDEX_DIR)
            searcher = index_reference.searcher()

            # Apply stemming to each word of the user query (stem() returns
            # the stemmed string, so the result must be reassigned)
            userquery = " ".join(stem(word) for word in userquery.split())

            # OrGroup.factory scores results by relevance rather than by
            # naive term frequency of the words in the query
            og = qparser.OrGroup.factory(0.9)

            # initializing Multifield Parser for searching in the multiple fields
            queryparser = qparser.MultifieldParser(
                ["tags", "foss", "title", "outline"], schema=index_reference.schema, group=og)

            # Removing this plugin takes away the user's ability to specify fields to search
            queryparser.remove_plugin_class(qparser.FieldsPlugin)

            # To remove the ability to search for wildcards, which can be harmful to query performance
            queryparser.remove_plugin_class(qparser.WildcardPlugin)

            # Allows specifying a fuzzy term by adding a ~ followed by an optional maximum edit distance (e.g. jav~1)
            queryparser.add_plugin(qparser.FuzzyTermPlugin())

            # Parse the Given Query
            q = queryparser.parse(userquery)

            # Correct spelling with a maximum edit distance of 3; larger distances may hurt performance
            corrected = searcher.correct_query(q, userquery, maxdist=3)

            # If the corrected query differs from the parsed query, offer a "Did you mean" suggestion.
            # If the user then submits the suggested query, that query is searched;
            # otherwise the user's original query is used.
            corrected_string = None
            if corrected_flag:
                if corrected.query != q:
                    corrected_string = corrected.string

            results = searcher.search(q, terms=True, limit=None)

            # print the number of videos found for the query
            print("%d Videos Found for %s " % (results.scored_length(), userquery))
            if (results.has_matched_terms() and results.scored_length() > 0):
                collection = []
                for hit in results:
                    row = TutorialResource.objects.filter(
                        tutorial_detail_id=hit['VideoId'], language__name='English').first()
                    collection.append(row)
                return collection, corrected_string
            else:
                return None, corrected_string

    # finally, close the searcher object if it was opened
    finally:
        if searcher is not None:
            searcher.close()
    return None, None
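A quick illustration of the point fixed above (not part of the project code): whoosh.lang.porter.stem returns the stemmed form of a single word, so its result has to be captured.

from whoosh.lang.porter import stem

print(stem("searching"))  # 'search'
print(stem("videos"))     # 'video'
query = " ".join(stem(word) for word in "searching videos".split())  # 'search video'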
Example #2
def query_categories(request):
    """
    A simple AJAX view that returns suggested product categories based on supplied product terms
    """

    query_words = request.GET.get('q', None)

    if query_words is None:
        return {}

    results, suggestion = perform_category_query(stem(query_words))

    if len(query_words) > 3 and len(results) == 0 and suggestion is not None:
        query_words = suggestion
        results, suggestion = perform_category_query(query_words)

    ordered_categories = OrderedDict(sorted(results.items(), key=lambda x: len(x[1]), reverse=True))

    categories = []

    for category, sub_categories in ordered_categories.items():
        if category == 'Mature':
            categories.append([category, ""])
        else:
            categories.append([category, Truncator(", ".join(sub_categories)).words(8)])

    resp = {
        "query": query_words,
        "categories": categories,
        "suggestion": suggestion
    }

    return JsonResponse(resp)
Example #3
    def clean(self, query_fragment):
        """
        Provides a mechanism for sanitizing user input before presenting the
        value to the backend.

        Whoosh 1.X differs here in that you can no longer use a backslash
        to escape reserved characters. Instead, the whole word should be
        quoted.
        """
        words = query_fragment.split()
        cleaned_words = []

        for word in words:
            if word in self.backend.RESERVED_WORDS:
                word = word.lower()

            for char in self.backend.RESERVED_CHARACTERS:
                if char in word:
                    word = "'%s'" % word
                    break

            if self.stemming:
                word = stem(word)
            cleaned_words.append(word)

        return ' '.join(cleaned_words)
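A standalone sketch of the same sanitizing idea, runnable outside the backend class; the RESERVED_WORDS and RESERVED_CHARACTERS values below are assumptions for illustration (the real ones come from the search backend).

from whoosh.lang.porter import stem

RESERVED_WORDS = {"AND", "OR", "NOT", "TO"}        # assumed for illustration
RESERVED_CHARACTERS = set('\\+-&|!(){}[]^"~*?:')   # assumed for illustration

def clean_fragment(query_fragment, stemming=True):
    cleaned_words = []
    for word in query_fragment.split():
        if word in RESERVED_WORDS:
            word = word.lower()                    # neutralize query operators
        if any(char in word for char in RESERVED_CHARACTERS):
            word = "'%s'" % word                   # quote the whole word instead of escaping
        if stemming:
            word = stem(word)
        cleaned_words.append(word)
    return " ".join(cleaned_words)

print(clean_fragment("running AND jumping"))       # e.g. 'run and jump'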
Example #4
def doStemming(s):
    ret = ""
    SS = s.split(' ')
    for text in SS:
        ret = ret + stem(text)
        ret += " "
    return ret
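An equivalent, more compact form of the same helper (dropping the trailing space the loop version leaves behind):

def do_stemming(s):
    # stem each whitespace-separated token and rejoin
    return " ".join(stem(text) for text in s.split(" "))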
Example #5
def queryParsing(query):
	from whoosh import qparser
	from whoosh.analysis import RegexTokenizer
	from whoosh.lang.porter import stem
	from whoosh.lang.morph_en import variations
	from whoosh.analysis import StopFilter
	print("inside queryParsing")
	tokenizer = RegexTokenizer()
	return_list = []   
	
	#Removing stop words
	stopper = StopFilter()
	tokens = stopper(tokenizer(query))

	for t in tokens:
		
		#converting to lower case
		t.text = t.text.lower()
		
		#stemming
		s=stem(t.text)
		return_list.append(s)
		
		#adding variations
		termVariations = variations(t.text)
		for u in termVariations:
			return_list.append(u)

	return return_list
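In isolation, the per-term expansion this function performs looks like the sketch below; the exact variation set returned by whoosh.lang.morph_en.variations depends on the word.

from whoosh.lang.porter import stem
from whoosh.lang.morph_en import variations

term = "search"
expanded = [stem(term)] + sorted(variations(term))
print(expanded)  # the stem first, followed by the morphological variations of the term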
Example #6
def doStemming(s):
    ret = ""
    SS = s.split(" ")
    for text in SS:
        ret = ret + stem(text)
        ret += " "
    return ret
Example #7
    def do_search(self, params):
        self.send_response(200)
        self.send_header('Content-Type', 'text/html')

        indices_dir = HOME_FOLDER + '/.indices'
        if not os.path.exists(indices_dir):
            self.end_headers()
            self.wfile.write('<div class="search_result">Index folder %s does not exist.</div>' % indices_dir)
            return

        n = 10
        page = 1
        if 'n' in params:
            n = int(params['n'])
            if n < 10:
                n = 10
            if n > 100:
                n = 100
        if 'p' in params:
            page = int(params['p'])
        if 'q' in params:
            keywords = unicode(' '.join([stem(param.strip()).decode('utf-8') for param in params['q'].lower().split()]))
            query = MultifieldParser(SEARCH_FIELDS, schema=INDEX_SCHEMA).parse(keywords)
            # print query
            weighting = MultiWeighting(BM25F(),
                title=BoostWeighting(10.0), path=BoostWeighting(8.0),
                h1=BoostWeighting(8.0), h2=BoostWeighting(6.0), h3=BoostWeighting(3.0),
                h4=BoostWeighting(2.0), h5=BoostWeighting(1.2),
            )
            searcher = open_dir(indices_dir).searcher(weighting = weighting)
            # print 'n=', n, 'page=', page
            results = searcher.search(query, limit=None)
            self.send_header('Search-Query', keywords)
            self.send_header('Search-Size', len(results))
            pages = len(results) // n
            if len(results) % n > 0:
                pages += 1
            if page < 1:
                page = 1
            elif page > pages:
                page = pages
            self.send_header('Search-Page', page)
            self.send_header('Search-Pages', pages)
            if page > 1:
                self.send_header('Search-Prev', page - 1)
            if page < pages:
                self.send_header('Search-Next', page + 1)
            self.send_header('Search-Limit', n)
            self.end_headers()

            for result in results[(page-1)*n : page*n]:
                # print result.rank, result.score, result.docnum
                response = SEARCH_RESULT % (result['url'], result['title'], result['content'][:200])
                self.wfile.write(response.encode('utf-8'))
        else:
            self.end_headers()
            self.wfile.write('<div class="search_result">No query parameter.</div>')
Example #8
def term_frequency(searcher, term, qp):
    qt = stem(term.lower())
    q = qp.parse(qt)

    try:
        tf = searcher.term_info('content', qt)._weight
        results = searcher.search(q)
        #print("term:", term, "TF", tf)
        return tf

    except Exception:
        # term not present in the index
        return 0
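The same lookup can go through the public TermInfo accessor rather than the private _weight attribute; a minimal sketch, assuming the field is still named 'content':

def term_weight(searcher, term):
    try:
        # total weight of the stemmed term across the 'content' field
        return searcher.term_info('content', stem(term.lower())).weight()
    except Exception:
        return 0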
Example #9
def term_modifications(file):
    final_terms = []
    with open(file, 'r') as f:
        content = f.read()
        text_for_file = str.split(content)
        text_without_stop = [
            i for i in text_for_file if i.lower() not in stop_words
        ]
        for term in text_without_stop:
            h = stem(term.lower())
            # print (h)
            final_terms.append(h)
    return final_terms
Example #10
def query(query_str, items_per_page=10, current_page=1):
    query_str = stem(query_str)
    with ix.searcher(weighting=scoring.Frequency) as searcher:
        query = QueryParser("description", ix.schema).parse(query_str)
        results = searcher.search(query, limit=None)
        num_query_results = len(results)
        query_results = []
        start_index = (current_page - 1) * items_per_page
        end_index = start_index + items_per_page

        for i in range(start_index, min(len(results), end_index)):
            d={}
            d['url'] = "https://www.youtube.com/watch?v=%s" % results[i]['id']
            d['title'] = results[i]['title']
            d['description'] = results[i].highlights('description')
            d['score'] = results[i].score
            query_results.append(d)

        return query_results, num_query_results
Example #11
    def __call__(self, tokens):
        cache = self.cache
        ignores = self.ignores

        for t in tokens:
            if t.stopped:
                yield t
                continue

            text = t.text
            if text in ignores:
                yield t
            elif text in cache:
                t.text = cache[text]
                yield t
            else:
                t.text = s = stem(text)
                cache[text] = s
                yield t
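Whoosh ships this cached-stemming behaviour as whoosh.analysis.StemFilter, which is typically composed into an analyzer chain; a minimal sketch:

from whoosh.analysis import RegexTokenizer, LowercaseFilter, StemFilter

analyzer = RegexTokenizer() | LowercaseFilter() | StemFilter()
print([t.text for t in analyzer("Searching stemmed tokens")])
# e.g. ['search', 'stem', 'token']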
Example #12
 def __call__(self, tokens):
     cache = self.cache
     ignores = self.ignores
     
     for t in tokens:
         if t.stopped:
             yield t
             continue
         
         text = t.text
         if text in ignores:
             yield t
         elif text in cache:
             t.text = cache[text]
             yield t
         else:
             t.text = s = stem(text)
             cache[text] = s
             yield t
Example #13
def query_on_music(index_name, query_str):
    query_str = stem("Spotify " + query_str)
    index = open_dir(index_name)
    with index.searcher(weighting=scoring.Frequency) as searcher:
        query = QueryParser("description", index.schema).parse(query_str)
        results = searcher.search(query, limit=None)

        formatted_results = []
        for result in results:
            d = {}
            d['url'] = "https://www.youtube.com/watch?v=" + result['id']
            d['snippet'] = {}
            d['snippet']['title'] = result['title']
            d['snippet']['description'] = result.highlights('description')
            d['id'] = {}
            d['id']['videoId'] = result['id']
            d['score'] = result.score
            formatted_results.append(d)

    return formatted_results
Example #14
def queryIndex(query):
    tokenizer = RegexTokenizer()
    return_list = []

    # Removing stop words
    with open("../smartStopList.txt", "r") as fp:
        line = fp.readline()
        words = []
        while line:
            words.append(line.replace('\n', ''))
            line = fp.readline()

    stopper = StopFilter(stoplist=frozenset(words))
    tokens = stopper(tokenizer(query))

    for t in tokens:
        t.text = t.text.lower()  # Converting to lower case
        s = stem(t.text)  # stemming
        if len(s) > 2:
            return_list.append(s)
    return return_list
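A more compact way to load the same custom stop list into a StopFilter; a sketch assuming the same smartStopList.txt file:

from whoosh.analysis import StopFilter

with open("../smartStopList.txt", "r") as fp:
    words = frozenset(line.strip() for line in fp if line.strip())
stopper = StopFilter(stoplist=words)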
Example #15
def queryParsing(query):
    print("inside queryParsing")
    tokenizer = RegexTokenizer()
    return_list = []   
    
    #Removing stop words
    stopper = StopFilter()
    tokens = stopper(tokenizer(query))

    for t in tokens:
        
        #converting to lower case
        t.text = t.text.lower()
        
        #stemming
        s=stem(t.text)
        return_list.append(s)
        
        #adding variations
        termVariations = variations(t.text)
        for u in termVariations:
            return_list.append(u)

    return return_list
Example #16
 def _stem_term(self, term):
     """
     Applies the Porter stemming algorithm (implementation from the Whoosh IR toolkit) to a given term, term.
     The returned string represents the stemmed version of the term.
     """
     return stem(term)
Example #17
#print ("the words are :" , words)

# queries is a dictionary with the keys being the query numbers and the elements are the query words
queries = {}
queries_path = '/home/niloo/rb04-queries2'
with open(queries_path, 'r') as q_file:

    for line in q_file:
        query = []
        line = line.replace(':', ' ')
        q = line.split()
        key = q[0]  # key is the Query number
        #print(key)
        q_without_stop = [i for i in q if i.lower() not in stop_words]
        for term in q_without_stop:
            h = stem(term).lower()
            query.append(h)
        q_words = query[1:]
        queries[key] = q_words

# DQ is a dictionary with keys being the query numbers and elements are the corresponding top ranked documents
DQ = {}
Dq_dir = '/home/niloo/run.robust04.bm25.topics.robust04.txt'

for key in queries:
    dq_names = []
    with open(Dq_dir, 'r') as dq:
        lines = dq.readlines()
        for line in lines:
            #print(line)
            if line.startswith(key):
Example #18
def main():
    """ The main loop for the program """
    g = Gui()
    ix = index.open_dir("indexdir")

    while True:
        event, values = g.window.read()
        g.window["_output_"]('')

        # close windows
        if event is None:
            break

        if event == '_SEARCH_' and values['TERM'] is not None:

            # the 'fieldboosts' parameter controls how much weight matches in each field receive
            qp = MultifieldParser(
                ["procTitle", "topics", "categories", "procContent"],
                termclass=Variations,
                schema=ix.schema,
                fieldboosts={
                    "procTitle": 1.5,
                    "categories": 1.3
                })
            qp.add_plugin(FuzzyTermPlugin())

            terms = str(values['TERM'])
            terms = terms.replace("title", "procTitle").replace("topic", "topics") \
                 .replace("category", "categories").replace("content", "procContent")

            # If the synonym option is enabled, expand the query with synonyms, taking care to keep
            # boolean tokens unchanged and to correctly carry over any field specifiers
            # on the terms being searched.
            if values['syn_search']:
                with open("utils/wn_s.pl", "r") as f:
                    thesaurus = Thesaurus.from_file(f)
                termsWithSynonyms = []
                for term in terms.split(" "):
                    field = None
                    if ":" in term:
                        field = term.split(":")[0]
                        term = term.split(":")[1]
                    if term not in booleanTokens:
                        termSynonyms = thesaurus.synonyms(term)
                        if field is not None:
                            termSynonyms = [
                                f"{field}:{t}" for t in termSynonyms
                            ]
                            termSynonyms.append(f"{field}:{term}")
                        else:
                            termSynonyms.append(term)
                        termsWithSynonyms.append(" OR ".join(termSynonyms))
                    else:
                        termsWithSynonyms.append(term)
                terms = ' '.join(termsWithSynonyms)

            print("- Searching for >>> " + str(terms))

            # stem the query terms and append a tilde for "fuzzy" search to the ones that were actually changed
            words = terms.split(' ')
            stemmedWords = list()
            for word in words:
                stemmed = stem(word)
                if word != stemmed:
                    stemmedWords.append(stemmed + '~')
                else:
                    stemmedWords.append(stemmed)

            q = qp.parse(' '.join(stemmedWords))

            with ix.searcher() as searcher:
                if not values['syn_search']:
                    correction = searcher.correct_query(q=q,
                                                        qstring=terms,
                                                        maxdist=2)
                    if terms != correction.string:
                        print("- Did you mean >>> " + correction.string)
                results = searcher.search(q, terms=True)

                if not values['syn_search'] and results.is_empty():
                    print(
                        "- No relevant result has been found for query, trying corrected query"
                    )
                    results = searcher.search(qp.parse(correction.string))

                numb = 1
                if not results.is_empty():
                    for elem in results:
                        # print(elem)
                        print(
                            f"Result n.{numb} >>> Title: {str(elem['docTitle'])}\n\tScore: {str(elem.score)}\n"
                            f"\tLink to the page: {str(elem['pageUrl'])}\n")
                        numb += 1
                else:
                    print("- No relevant result has been found")
Example #19
def stemfn(word): return asm_stemming.get_root(stem(word))[0][0]
analyzer = StandardAnalyzer(expression = r"[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]+(?:\.?[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]+)*") | StemFilter(stemfn)
Example #20
def stemfn(word): return stemArabic(stem(word))
# word_re = ur"[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]"
analyzer = StandardAnalyzer(expression = ur"[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]+(?:\.?[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]+)*") | StemFilter(stemfn)
Example #21
 def _stem_term(self, term):
     """
     Applies the Porter stemming algorithm (implementation from the Whoosh IR toolkit) to a given term, term.
     The returned string represents the stemmed version of the term.
     """
     return stem(term)
Example #22
def search_for_results(userquery, corrected_flag=True):
    searcher = None
    try:
        if os.path.exists(settings.SEARCH_INDEX_DIR):
            # open index directory and create object for searcher class
            index_reference = open_dir(settings.SEARCH_INDEX_DIR)
            searcher = index_reference.searcher()

            # Apply stemming to each word of the user query (stem() returns
            # the stemmed string, so the result must be reassigned)
            userquery = " ".join(stem(word) for word in userquery.split())

            # OrGroup.factory scores results by relevance rather than by
            # naive term frequency of the words in the query
            og = qparser.OrGroup.factory(0.9)

            # initializing Multifield Parser for searching in the multiple fields
            queryparser = qparser.MultifieldParser(
                ["tags", "foss", "title", "outline"],
                schema=index_reference.schema,
                group=og)

            # Removing this plugin takes away the user's ability to specify fields to search
            queryparser.remove_plugin_class(qparser.FieldsPlugin)

            # To remove the ability to search for wildcards, which can be harmful to query performance
            queryparser.remove_plugin_class(qparser.WildcardPlugin)

            # Allows specifying a fuzzy term by adding a ~ followed by an optional maximum edit distance (e.g. jav~1)
            queryparser.add_plugin(qparser.FuzzyTermPlugin())

            # Parse the Given Query
            q = queryparser.parse(userquery)

            # Correct spelling with a maximum edit distance of 3; larger distances may hurt performance
            corrected = searcher.correct_query(q, userquery, maxdist=3)

            # If the corrected query differs from the parsed query, offer a "Did you mean" suggestion.
            # If the user then submits the suggested query, that query is searched;
            # otherwise the user's original query is used.
            corrected_string = None
            if corrected_flag:
                if corrected.query != q:
                    corrected_string = corrected.string

            results = searcher.search(q, terms=True, limit=None)

            # print the number of videos found for the query
            print("%d Videos Found for %s " %
                  (results.scored_length(), userquery))
            if (results.has_matched_terms() and results.scored_length() > 0):
                collection = []
                for hit in results:
                    row = TutorialResource.objects.filter(
                        tutorial_detail_id=hit['VideoId'],
                        language__name='English').first()
                    collection.append(row)
                return collection, corrected_string
            else:
                return None, corrected_string

    # finally, close the searcher object if it was opened
    finally:
        if searcher is not None:
            searcher.close()
    return None, None
Example #23
def do_search(txt, sumlevel=None, kind=None, tries=0, limit=10, is_stem=None, my_vars=None):
    txt = txt.replace(",", "")

    my_filter = None

    if kind and sumlevel:
        kf = query.Term("kind", kind)
        sf = query.Term("sumlevel", sumlevel)
        my_filter = query.And([kf, sf])
    elif kind:
        my_filter = query.Term("kind", kind)
    elif sumlevel:
        my_filter = query.Term("sumlevel", sumlevel)
    if is_stem and is_stem > 0 and my_filter is not None:
        my_filter = my_filter & query.NumericRange("is_stem", 1, is_stem)
    elif is_stem and is_stem > 0 and my_filter is None:
        my_filter = query.NumericRange("is_stem", 1, is_stem)

    if tries > 2:
        return [], [], [], []
    q = qp.parse(txt)

    rext = RegexTokenizer()
    var_txt = u" ".join([stem(token.text) if len(token.text) > 3 else token.text for token in rext(unicode(txt))])

    var_q = vars_qp.parse(var_txt)
    var_keywords = {}
    vars_max_score = None
    # search for variables in query
    if not my_vars:
        # my_vars can save original vars detected before autocorrecting for spelling,
        # so we'll only do var searches that haven't yet been autocorrected
        with vars_ix.searcher() as s:
        # s = vars_ix.searcher()
            results = s.search(var_q)
            # raise Exception(list(results)[0])
            vscores = [r.score for r in results]
            vars_max_score = max(vscores) if vscores else None

            my_vars = [{"matched_on": r.highlights("name"),
                        "name": r["name"],
                        "description": r["description"].split(","),
                        "section": r["section"],
                        "section_title": r["section_title"],
                        "related_attrs": r["related_attrs"].split(","),
                        "related_vars": r["related_vars"].split(","),
                        "params": json.loads(r["params"]) if 'params' in r else None} for r in results]
        if my_vars:
            already_seen = []
            filtered_my_vars = []
            for my_var in my_vars:
                if my_var["related_vars"] not in already_seen:
                    filtered_my_vars.append(my_var)
                already_seen.append(my_var["related_vars"])
                highlight_txt = my_var["matched_on"]

                if highlight_txt:
                    matches = re.findall(r'<b class="[^>]+">([^>]+)</b>', highlight_txt)
                    if matches:
                        for matched_txt in matches:
                            var_keywords[matched_txt] = True
            my_vars = filtered_my_vars

    try:
        for term in q:
            for keyword in var_keywords.keys():
                if term.text == 'in' and " in " in txt:
                    term.boost = -1
                elif term.text in keyword or keyword in term.text:
                    term.boost = -0.5
    except NotImplementedError:
        for keyword in var_keywords.keys():
            if q.text == 'in' and " in " in txt:
                q.boost = -1
            elif q.text in keyword or keyword in q.text:
                q.boost = -0.5

    weighter = SimpleWeighter(txt, B=.6, content_B=1.0, K1=2.75)
    with ix.searcher(weighting=weighter) as s:
        if len(txt) > 2:
            corrector = s.corrector("display")
            suggs = corrector.suggest(txt, limit=10, maxdist=2, prefix=3)
        else:
            suggs = []
        results = s.search_page(q, 1, sortedby=[scores], pagelen=20, filter=my_filter)
        data = [[r["id"], r["name"], r["zvalue"],
                 r["kind"], r["display"],
                 r["sumlevel"] if "sumlevel" in r else "",
                 r["is_stem"] if "is_stem" in r else False,
                 r["url_name"] if "url_name" in r else None]
                for r in results]

        if not data and suggs:
            return do_search(suggs[0], sumlevel, kind, tries=tries+1, limit=limit, is_stem=is_stem,
                             my_vars=my_vars)

        ascores = [r.score for r in results]
        attr_max_score = max(ascores) if ascores else 0
        # raise Exception(attr_max_score, vars_max_score)
        # insert nationwide linkage
        data = nationwide_results(data, my_vars, attr_max_score, vars_max_score, txt)

        return data, suggs, tries, my_vars
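The stem usage in this example, isolated: tokens longer than three characters are stemmed, shorter ones are left alone. A minimal sketch:

from whoosh.analysis import RegexTokenizer
from whoosh.lang.porter import stem

def stem_long_tokens(txt):
    rext = RegexTokenizer()
    # stem only tokens longer than three characters
    return u" ".join(stem(t.text) if len(t.text) > 3 else t.text for t in rext(txt))

print(stem_long_tokens(u"jobs in engineering"))  # e.g. 'job in engin'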
Example #24
def stemfn(word):
    return stemArabic(stem(word))
Example #25
def do_search(txt,
              sumlevel=None,
              kind=None,
              tries=0,
              limit=10,
              is_stem=None,
              my_vars=None):
    txt = txt.replace(",", "")

    my_filter = None

    if kind and sumlevel:
        kf = query.Term("kind", kind)
        sf = query.Term("sumlevel", sumlevel)
        my_filter = query.And([kf, sf])
    elif kind:
        my_filter = query.Term("kind", kind)
    elif sumlevel:
        my_filter = query.Term("sumlevel", sumlevel)
    if is_stem and is_stem > 0 and my_filter is not None:
        my_filter = my_filter & query.NumericRange("is_stem", 1, is_stem)
    elif is_stem and is_stem > 0 and my_filter is None:
        my_filter = query.NumericRange("is_stem", 1, is_stem)

    if tries > 2:
        return [], [], [], []
    q = qp.parse(txt)

    rext = RegexTokenizer()
    var_txt = u" ".join([
        stem(token.text) if len(token.text) > 3 else token.text
        for token in rext(unicode(txt))
    ])

    var_q = vars_qp.parse(var_txt)
    var_keywords = {}
    vars_max_score = None
    # search for variables in query
    if not my_vars:
        # my_vars can save original vars detected before autocorrecting for spelling,
        # so we'll only do var searches that haven't yet been autocorrected
        with vars_ix.searcher() as s:
            # s = vars_ix.searcher()
            results = s.search(var_q)
            # raise Exception(list(results)[0])
            vscores = [r.score for r in results]
            vars_max_score = max(vscores) if vscores else None

            my_vars = [{
                "matched_on": r.highlights("name"),
                "name": r["name"],
                "description": r["description"].split(","),
                "section": r["section"],
                "section_title": r["section_title"],
                "related_attrs": r["related_attrs"].split(","),
                "related_vars": r["related_vars"].split(","),
                "params": json.loads(r["params"]) if 'params' in r else None
            } for r in results]
        if my_vars:
            already_seen = []
            filtered_my_vars = []
            for my_var in my_vars:
                if my_var["related_vars"] not in already_seen:
                    filtered_my_vars.append(my_var)
                already_seen.append(my_var["related_vars"])
                highlight_txt = my_var["matched_on"]

                if highlight_txt:
                    matches = re.findall(r'<b class="[^>]+">([^>]+)</b>',
                                         highlight_txt)
                    if matches:
                        for matched_txt in matches:
                            var_keywords[matched_txt] = True
            my_vars = filtered_my_vars

    try:
        for term in q:
            for keyword in var_keywords.keys():
                if term.text == 'in' and " in " in txt:
                    term.boost = -1
                elif term.text in keyword or keyword in term.text:
                    term.boost = -0.5
    except NotImplementedError:
        for keyword in var_keywords.keys():
            if q.text == 'in' and " in " in txt:
                q.boost = -1
            elif q.text in keyword or keyword in q.text:
                q.boost = -0.5

    weighter = SimpleWeighter(txt, B=.45, content_B=1.0, K1=1.5)
    with ix.searcher(weighting=weighter) as s:
        if len(txt) > 2:
            corrector = s.corrector("display")
            suggs = corrector.suggest(txt, limit=10, maxdist=2, prefix=3)
        else:
            suggs = []
        results = s.search_page(q,
                                1,
                                sortedby=[scores],
                                pagelen=20,
                                filter=my_filter)
        data = [[
            r["id"], r["name"], r["zvalue"], r["kind"], r["display"],
            r["sumlevel"] if "sumlevel" in r else "",
            r["is_stem"] if "is_stem" in r else False,
            r["url_name"] if "url_name" in r else None
        ] for r in results]

        if not data and suggs:
            return do_search(suggs[0],
                             sumlevel,
                             kind,
                             tries=tries + 1,
                             limit=limit,
                             is_stem=is_stem,
                             my_vars=my_vars)

        ascores = [r.score for r in results]
        attr_max_score = max(ascores) if ascores else 0
        # raise Exception(attr_max_score, vars_max_score)
        # insert nationwide linkage
        data = nationwide_results(data, my_vars, attr_max_score,
                                  vars_max_score, txt)

        return data, suggs, tries, my_vars