def search_for_results(userquery, corrected_flag=True):
    searcher = None
    try:
        if os.path.exists(settings.SEARCH_INDEX_DIR):
            # Open the index directory and create a searcher object
            index_reference = open_dir(settings.SEARCH_INDEX_DIR)
            searcher = index_reference.searcher()

            # Apply stemming to the user query (the original call discarded
            # the return value; stem() returns the stemmed string)
            userquery = stem(userquery)

            # OrGroup.factory scores by relevance rather than by the naive
            # term frequency of the words in the query
            og = qparser.OrGroup.factory(0.9)

            # Initialize a MultifieldParser to search across multiple fields
            queryparser = qparser.MultifieldParser(
                ["tags", "foss", "title", "outline"],
                schema=index_reference.schema,
                group=og)

            # Remove the user's ability to specify fields to search
            queryparser.remove_plugin_class(qparser.FieldsPlugin)
            # Remove wildcard searches, which can hurt query performance
            queryparser.remove_plugin_class(qparser.WildcardPlugin)
            # Allow fuzzy terms: a ~ followed by an optional maximum edit
            # distance (e.g. jav~1)
            queryparser.add_plugin(qparser.FuzzyTermPlugin())

            # Parse the given query
            q = queryparser.parse(userquery)

            # Correct spelling with a maximum edit distance of 3; larger
            # distances can hurt performance
            corrected = searcher.correct_query(q, userquery, maxdist=3)

            # If the corrected query differs from the parsed query, offer a
            # "Did you mean" suggestion: the suggested query is searched if
            # the user accepts it, otherwise the original query is searched
            corrected_string = None
            if corrected_flag:
                if corrected.query != q:
                    corrected_string = corrected.string

            results = searcher.search(q, terms=True, limit=None)
            # Print the number of videos found for the query
            print("%d Videos Found for %s " % (results.scored_length(), userquery))
            if results.has_matched_terms() and results.scored_length() > 0:
                collection = []
                for hit in results:
                    row = TutorialResource.objects.filter(
                        tutorial_detail_id=hit['VideoId'],
                        language__name='English').first()
                    collection.append(row)
                return collection, corrected_string
            else:
                return None, corrected_string
    finally:
        # Close the searcher object if it was opened
        if searcher is not None:
            searcher.close()
    return None, None
def query_categories(request):
    """ A simple AJAX view that returns suggested product categories
    based on supplied product terms """
    query_words = request.GET.get('q', None)
    if query_words is None:
        return {}

    results, suggestion = perform_category_query(stem(query_words))
    if len(query_words) > 3 and len(results) == 0 and suggestion is not None:
        query_words = suggestion
        results, suggestion = perform_category_query(query_words)

    ordered_categories = OrderedDict(
        sorted(results.items(), key=lambda x: len(x[1]), reverse=True))
    categories = []
    for category, sub_categories in ordered_categories.items():
        if category == 'Mature':
            categories.append([category, ""])
        else:
            categories.append(
                [category, Truncator(", ".join(sub_categories)).words(8)])

    resp = {
        "query": query_words,
        "categories": categories,
        "suggestion": suggestion
    }
    return JsonResponse(resp)
def clean(self, query_fragment):
    """
    Provides a mechanism for sanitizing user input before presenting the
    value to the backend.

    Whoosh 1.X differs here in that you can no longer use a backslash
    to escape reserved characters. Instead, the whole word should be
    quoted.
    """
    words = query_fragment.split()
    cleaned_words = []

    for word in words:
        if word in self.backend.RESERVED_WORDS:
            word = word.replace(word, word.lower())

        for char in self.backend.RESERVED_CHARACTERS:
            if char in word:
                word = "'%s'" % word
                break

        if self.stemming:
            word = stem(word)
        cleaned_words.append(word)

    return ' '.join(cleaned_words)
def doStemming(s):
    ret = ""
    SS = s.split(' ')
    for text in SS:
        ret = ret + stem(text)
        ret += " "
    return ret
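# A minimal usage sketch (added for illustration, not from the original
# source): shows what doStemming returns for a sample string, assuming
# whoosh's Porter stemmer is in scope as `stem`.
from whoosh.lang.porter import stem

print(doStemming("searching libraries"))  # -> "search librari " (note the trailing space)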
def queryParsing(query):
    from whoosh import qparser
    from whoosh.analysis import RegexTokenizer
    from whoosh.lang.porter import stem
    from whoosh.lang.morph_en import variations
    from whoosh.analysis import StopFilter

    print("inside queryParsing")
    tokenizer = RegexTokenizer()
    return_list = []

    # Remove stop words
    stopper = StopFilter()
    tokens = stopper(tokenizer(query))
    for t in tokens:
        # Convert to lower case
        t.text = t.text.lower()
        # Stemming
        s = stem(t.text)
        return_list.append(s)
        # Add morphological variations
        termVariations = variations(t.text)
        for u in termVariations:
            return_list.append(u)
    return return_list
def do_search(self, params):
    self.send_response(200)
    self.send_header('Content-Type', 'text/html')
    indices_dir = HOME_FOLDER + '/.indices'
    if not os.path.exists(indices_dir):
        # Headers must be finished before the body is written
        self.end_headers()
        self.wfile.write('<div class="search_result">Index folder %s does not exist.</div>'
                         % indices_dir)
        return

    n = 10
    page = 1
    if 'n' in params:
        n = int(params['n'])
        if n < 10:
            n = 10
        if n > 100:
            n = 100
    if 'p' in params:
        page = int(params['p'])

    if 'q' in params:
        # Stem each query term before parsing
        keywords = unicode(' '.join([stem(param.strip()).decode('utf-8')
                                     for param in params['q'].lower().split()]))
        query = MultifieldParser(SEARCH_FIELDS, schema=INDEX_SCHEMA).parse(keywords)
        # Boost matches in the title, path and heading fields over body text
        weighting = MultiWeighting(BM25F(),
                                   title=BoostWeighting(10.0),
                                   path=BoostWeighting(8.0),
                                   h1=BoostWeighting(8.0),
                                   h2=BoostWeighting(6.0),
                                   h3=BoostWeighting(3.0),
                                   h4=BoostWeighting(2.0),
                                   h5=BoostWeighting(1.2))
        searcher = open_dir(indices_dir).searcher(weighting=weighting)
        results = searcher.search(query, limit=None)

        self.send_header('Search-Query', keywords)
        self.send_header('Search-Size', len(results))
        pages = len(results) // n
        if len(results) % n > 0:
            pages += 1
        if page < 1:
            page = 1
        elif page > pages:
            page = pages
        self.send_header('Search-Page', page)
        self.send_header('Search-Pages', pages)
        if page > 1:
            self.send_header('Search-Prev', page - 1)
        if page < pages:
            self.send_header('Search-Next', page + 1)
        self.send_header('Search-Limit', n)
        self.end_headers()

        for result in results[(page - 1) * n:page * n]:
            response = SEARCH_RESULT % (result['url'], result['title'],
                                        result['content'][:200])
            self.wfile.write(response.encode('utf-8'))
    else:
        self.end_headers()
        self.wfile.write('<div class="search_result">No query parameter.</div>')
def term_frequency(searcher, term, qp):
    qt = stem(term.lower())
    q = qp.parse(qt)
    try:
        tf = searcher.term_info('content', qt)._weight
        results = searcher.search(q)  # unused; kept from the original
        return tf
    except Exception:
        # The stemmed term does not occur in the 'content' field
        return 0
def term_modifications(file):
    final_terms = []
    with open(file, 'r') as f:
        content = f.read()
        text_for_file = content.split()
        text_without_stop = [i for i in text_for_file
                             if i.lower() not in stop_words]
        for term in text_without_stop:
            h = stem(term.lower())
            final_terms.append(h)
    return final_terms
def query(query_str, items_per_page=10, current_page=1):
    query_str = stem(query_str)
    with ix.searcher(weighting=scoring.Frequency) as searcher:
        query = QueryParser("description", ix.schema).parse(query_str)
        results = searcher.search(query, limit=None)
        num_query_results = len(results)
        query_results = []
        start_index = (current_page - 1) * items_per_page
        end_index = start_index + items_per_page
        for i in range(start_index, min(len(results), end_index)):
            d = {}
            d['url'] = "https://www.youtube.com/watch?v=%s" % results[i]['id']
            d['title'] = results[i]['title']
            d['description'] = results[i].highlights('description')
            d['score'] = results[i].score
            query_results.append(d)
        return query_results, num_query_results
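# Hedged usage sketch (added for illustration): paginating through stemmed
# query results. The query text is an arbitrary sample; `ix` is the
# module-level index assumed by query() above.
hits, total = query("cooking tutorial", items_per_page=5, current_page=1)
print("%d results total" % total)
for hit in hits:
    print(hit['score'], hit['title'], hit['url'])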
def __call__(self, tokens):
    cache = self.cache
    ignores = self.ignores

    for t in tokens:
        if t.stopped:
            yield t
            continue

        text = t.text
        if text in ignores:
            yield t
        elif text in cache:
            # Reuse the previously computed stem for this word
            t.text = cache[text]
            yield t
        else:
            # Stem the word and memoize the result
            t.text = s = stem(text)
            cache[text] = s
            yield t
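# For context (an assumption, not part of the original snippet): a caching
# stem filter like the __call__ above is normally composed into a whoosh
# analyzer chain, e.g. with the stock StemFilter:
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StemFilter

analyzer = RegexTokenizer() | LowercaseFilter() | StemFilter()
print([t.text for t in analyzer(u"Searching stemmed tokens")])
# -> ['search', 'stem', 'token']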
def query_on_music(index_name, query_str):
    query_str = stem("Spotify " + query_str)
    index = open_dir(index_name)
    with index.searcher(weighting=scoring.Frequency) as searcher:
        query = QueryParser("description", index.schema).parse(query_str)
        results = searcher.search(query, limit=None)
        formatted_results = []
        for result in results:
            d = {}
            d['url'] = "https://www.youtube.com/watch?v=" + result['id']
            d['snippet'] = {}
            d['snippet']['title'] = result['title']
            d['snippet']['description'] = result.highlights('description')
            d['id'] = {}
            d['id']['videoId'] = result['id']
            d['score'] = result.score
            formatted_results.append(d)
        return formatted_results
def queryIndex(query):
    tokenizer = RegexTokenizer()
    return_list = []

    # Load the stop word list and remove stop words from the query
    with open("../smartStopList.txt", "r") as fp:
        line = fp.readline()
        words = []
        while line:
            words.append(line.replace('\n', ''))
            line = fp.readline()
    stopper = StopFilter(stoplist=frozenset(words))
    tokens = stopper(tokenizer(query))

    for t in tokens:
        t.text = t.text.lower()  # Convert to lower case
        s = stem(t.text)         # Stemming
        if len(s) > 2:
            return_list.append(s)
    return return_list
def _stem_term(self, term):
    """
    Applies the Porter stemming algorithm (implementation from the Whoosh
    IR toolkit) to the given term.

    The returned string represents the stemmed version of the term.
    """
    return stem(term)
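# Illustration (added, not from the original source) of the Porter stemmer
# that _stem_term wraps:
from whoosh.lang.porter import stem

assert stem("running") == "run"
assert stem("searching") == "search"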
#print ("the words are :" , words) # queries is a dictionary with the keys being the query numbers and the elements are the query words queries = {} queries_path = '/home/niloo/rb04-queries2' with open(queries_path, 'r') as q_file: for line in q_file: query = [] line = line.replace(':', ' ') q = line.split() key = q[0] # key is the Query number #print(key) q_without_stop = [i for i in q if i.lower() not in stop_words] for term in q_without_stop: h = stem(term).lower() query.append(h) q_words = query[1:] queries[key] = q_words # DQ is a dictionary with keys being the query numbers and elements are the corresponding top ranked documents DQ = {} Dq_dir = '/home/niloo/run.robust04.bm25.topics.robust04.txt' for key in queries: dq_names = [] with open(Dq_dir, 'r') as dq: lines = dq.readlines() for line in lines: #print(line) if line.startswith(key):
def main():
    """ The main loop for the program """
    g = Gui()
    ix = index.open_dir("indexdir")
    while True:
        event, values = g.window.read()
        g.window["_output_"]('')
        # Close the window
        if event is None:
            break
        if event == '_SEARCH_' and values['TERM'] is not None:
            # The 'fieldboosts' parameter controls how much weight matches
            # in each field are given
            qp = MultifieldParser(
                ["procTitle", "topics", "categories", "procContent"],
                termclass=Variations,
                schema=ix.schema,
                fieldboosts={
                    "procTitle": 1.5,
                    "categories": 1.3
                })
            qp.add_plugin(FuzzyTermPlugin())

            terms = str(values['TERM'])
            terms = terms.replace("title", "procTitle").replace("topic", "topics") \
                .replace("category", "categories").replace("content", "procContent")

            # If synonym search is enabled, expand the query with synonyms,
            # taking care to pass boolean tokens through unchanged and to
            # carry any field prefixes over to the added synonyms.
            if values['syn_search']:
                with open("utils/wn_s.pl", "r") as f:
                    thesaurus = Thesaurus.from_file(f)
                termsWithSynonyms = []
                for term in terms.split(" "):
                    field = None
                    if ":" in term:
                        field = term.split(":")[0]
                        term = term.split(":")[1]
                    if term not in booleanTokens:
                        termSynonyms = thesaurus.synonyms(term)
                        if field is not None:
                            termSynonyms = [f"{field}:{t}" for t in termSynonyms]
                            termSynonyms.append(f"{field}:{term}")
                        else:
                            termSynonyms.append(term)
                        termsWithSynonyms.append(" OR ".join(termSynonyms))
                    else:
                        termsWithSynonyms.append(term)
                terms = ' '.join(termsWithSynonyms)
            print("- Searching for >>> " + str(terms))

            # Stem the query terms, appending a tilde for fuzzy search to
            # the terms that were actually modified by stemming
            words = terms.split(' ')
            stemmedWords = list()
            for word in words:
                stemmed = stem(word)
                if word != stemmed:
                    stemmedWords.append(stemmed + '~')
                else:
                    stemmedWords.append(stemmed)
            q = qp.parse(' '.join(stemmedWords))

            with ix.searcher() as searcher:
                if not values['syn_search']:
                    correction = searcher.correct_query(q=q, qstring=terms, maxdist=2)
                    if terms != correction.string:
                        print("- Did you mean >>> " + correction.string)
                results = searcher.search(q, terms=True)
                if not values['syn_search'] and results.is_empty():
                    print("- No relevant result has been found for query, "
                          "trying corrected query")
                    results = searcher.search(qp.parse(correction.string))
                numb = 1
                if not results.is_empty():
                    for elem in results:
                        print(
                            f"Result n.{numb} >>> Title: {str(elem['docTitle'])}\n"
                            f"\tScore: {str(elem.score)}\n"
                            f"\tLink to the page: {str(elem['pageUrl'])}\n")
                        numb += 1
                else:
                    print("- No relevant result has been found")
def stemfn(word):
    return asm_stemming.get_root(stem(word))[0][0]

# Assumption: the expression below was truncated in the original; it is
# completed here to match the equivalent analyzer in the stemArabic variant.
analyzer = StandardAnalyzer(
    expression=r"[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]+(?:\.?[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]+)*") | StemFilter(stemfn)
def stemfn(word):
    return stemArabic(stem(word))

# word_re = ur"[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]"
analyzer = StandardAnalyzer(
    expression=ur"[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]+(?:\.?[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]+)*") | StemFilter(stemfn)
def do_search(txt, sumlevel=None, kind=None, tries=0, limit=10, is_stem=None, my_vars=None):
    txt = txt.replace(",", "")

    # Build an optional filter from the kind/sumlevel/is_stem arguments
    my_filter = None
    if kind and sumlevel:
        kf = query.Term("kind", kind)
        sf = query.Term("sumlevel", sumlevel)
        my_filter = query.And([kf, sf])
    elif kind:
        my_filter = query.Term("kind", kind)
    elif sumlevel:
        my_filter = query.Term("sumlevel", sumlevel)
    if is_stem and is_stem > 0 and my_filter is not None:
        my_filter = my_filter & query.NumericRange("is_stem", 1, is_stem)
    elif is_stem and is_stem > 0 and my_filter is None:
        my_filter = query.NumericRange("is_stem", 1, is_stem)

    if tries > 2:
        return [], [], [], []

    q = qp.parse(txt)

    # Stem the longer tokens for the variable search query
    rext = RegexTokenizer()
    var_txt = u" ".join([stem(token.text) if len(token.text) > 3 else token.text
                         for token in rext(unicode(txt))])
    var_q = vars_qp.parse(var_txt)
    var_keywords = {}
    vars_max_score = None

    # Search for variables in the query. my_vars can save the original vars
    # detected before autocorrecting for spelling, so we only do var searches
    # that haven't yet been autocorrected.
    if not my_vars:
        with vars_ix.searcher() as s:
            results = s.search(var_q)
            vscores = [r.score for r in results]
            vars_max_score = max(vscores) if vscores else None
            my_vars = [{"matched_on": r.highlights("name"),
                        "name": r["name"],
                        "description": r["description"].split(","),
                        "section": r["section"],
                        "section_title": r["section_title"],
                        "related_attrs": r["related_attrs"].split(","),
                        "related_vars": r["related_vars"].split(","),
                        "params": json.loads(r["params"]) if 'params' in r else None}
                       for r in results]
            if my_vars:
                already_seen = []
                filtered_my_vars = []
                for my_var in my_vars:
                    if my_var["related_vars"] not in already_seen:
                        filtered_my_vars.append(my_var)
                        already_seen.append(my_var["related_vars"])
                    highlight_txt = my_var["matched_on"]
                    if highlight_txt:
                        matches = re.findall(r'<b class="[^>]+">([^>]+)</b>', highlight_txt)
                        if matches:
                            for matched_txt in matches:
                                var_keywords[matched_txt] = True
                my_vars = filtered_my_vars

    # Down-boost query terms that matched variable keywords
    try:
        for term in q:
            for keyword in var_keywords.keys():
                if term.text == 'in' and " in " in txt:
                    term.boost = -1
                elif term.text in keyword or keyword in term.text:
                    term.boost = -0.5
    except NotImplementedError:
        for keyword in var_keywords.keys():
            if q.text == 'in' and " in " in txt:
                q.boost = -1
            elif q.text in keyword or keyword in q.text:
                q.boost = -0.5

    weighter = SimpleWeighter(txt, B=.6, content_B=1.0, K1=2.75)
    with ix.searcher(weighting=weighter) as s:
        if len(txt) > 2:
            corrector = s.corrector("display")
            suggs = corrector.suggest(txt, limit=10, maxdist=2, prefix=3)
        else:
            suggs = []
        results = s.search_page(q, 1, sortedby=[scores], pagelen=20, filter=my_filter)
        data = [[r["id"], r["name"], r["zvalue"], r["kind"], r["display"],
                 r["sumlevel"] if "sumlevel" in r else "",
                 r["is_stem"] if "is_stem" in r else False,
                 r["url_name"] if "url_name" in r else None]
                for r in results]

        if not data and suggs:
            return do_search(suggs[0], sumlevel, kind, tries=tries + 1,
                             limit=limit, is_stem=is_stem, my_vars=my_vars)

        ascores = [r.score for r in results]
        attr_max_score = max(ascores) if ascores else 0

        # Insert nationwide linkage
        data = nationwide_results(data, my_vars, attr_max_score, vars_max_score, txt)

        return data, suggs, tries, my_vars
def stemfn(word):
    return stemArabic(stem(word))