def full_search(self, query, time_limit=-1, search_limit=50, edit_dist=0):
    """Run *query* against the index with TF-IDF scoring.

    Parameters:
        query: a parsed whoosh query object.
        time_limit: maximum seconds for the search; <= 0 disables the limit.
        search_limit: maximum number of hits to collect.
        edit_dist: unused here; kept for interface compatibility.

    Returns:
        dict mapping hit title -> (docnum, scaled score).  When the time
        limit fires, whatever partial results are available are returned.
    """
    val = {}
    # Create the searcher BEFORE the try block: the original created it
    # inside, so a failure during construction made the finally clause
    # reference an unbound name (NameError masking the real error).
    searcher = self._index.searcher(weighting=scoring.TF_IDF())
    try:
        if time_limit > 0:
            c = searcher.collector(limit=search_limit)
            tlc = TimeLimitCollector(c, timelimit=time_limit)
            try:
                searcher.search_with_collector(query, tlc)
            except TimeLimit:
                # Was a bare `None` expression (no-op); fall through and
                # salvage partial results from the collector below.
                pass
            try:
                res = tlc.results()
            except TimeLimit:
                res = []
        else:
            res = searcher.search(query, limit=search_limit)
        for ii in res:
            val[ii['title']] = (ii.docnum, self.scale(ii.score))
    finally:
        searcher.close()
    return val
def search(self, text: str, limit: int, timelimit=3.0):
    """Search the 'content' and 'quiz_bowl' fields for *text* under a time limit.

    Args:
        text: raw query string; parsed with a MultifieldParser.
        limit: maximum number of hits to collect.
        timelimit: seconds before the search is cut off (partial results kept).

    Returns:
        A list of (page, score) tuples, or [('<UNK_ANSWER>', 0)] when the
        (possibly partial) search produced no hits.
    """
    with self.index.searcher() as searcher:
        # OrGroup.factory(.9) boosts documents that match more query terms
        # without requiring every term to match.
        or_group = OrGroup.factory(.9)
        parser = MultifieldParser(['content', 'quiz_bowl'], schema=self.schema, group=or_group)
        text_query = parser.parse(text)
        collector = searcher.collector(limit=limit)
        tlc = TimeLimitCollector(collector, timelimit=timelimit)
        # `partial` starts True and is only cleared if the search completes,
        # so an exception leaves it correctly marked as a partial result.
        partial = True
        try:
            searcher.search_with_collector(text_query, tlc)
            partial = False
        except searching.TimeLimit:
            pass
        # There is a bug in whoosh that makes calling len directly or indirectly fail
        # which is why we don't use list()
        results = [(r['page'], r.score) for r in tlc.results()]
        # Logging via `partial` instead of logging directly is required due to a mysterious race
        # condition between whoosh time limits and log.info. It's important that all of whoosh's
        # functions including search_with_collector() and tlc.results() are called before
        # logging anything
        if partial:
            log.info(
                'Search took longer than {}s, getting partial results'.
                format(timelimit))
        if len(results) == 0:
            return [('<UNK_ANSWER>', 0)]
        return results
def doSearch(self, text):
    """Search the meeting index for *text* and render highlighted snippets
    plus meeting links into the searchResults text widget.
    """
    q = self.qp.parse(text)  # build query
    with self.ix.searcher(weighting=scoring.Frequency) as s:  # simple scorer may help
        c = s.collector(limit=self.MaxResults)
        # Wrap the collector so a runaway search is cut off after 0.5s.
        c = TimeLimitCollector(c, 0.5)
        try:
            s.search_with_collector(q, c)
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are no longer swallowed; a whoosh TimeLimit lands here.
            print("TIMEOUT!")
        results = c.results()  # partial results if hung
        self.searchResults.clear()
        # ContextFragmenter shows ~160 chars of context around each match.
        my_cf = highlight.ContextFragmenter(maxchars=160, surround=30)
        results.fragmenter = my_cf
        if len(results) > 0:
            for res in results:
                res.fragmenter = my_cf
                self.searchResults.append(res.highlights('Text', top=1))
                self.searchResults.append('-Link to Meeting -')
                self.searchResults.append(res['MeetingLink'] + '\n')
                self.searchResults.append('----------')
                self.searchResults.append('----------')
        # Scroll back to the top of the results view.
        cursor = self.searchResults.moveCursor(QtGui.QTextCursor.Start)
def doSearch(self, text):
    """Search the MSID index for *text* and render 'msid - technical_name'
    lines (with match highlighting) into the searchResults widget.
    """
    q = self.qp.parse(text)  # build query with event-provided search key
    with self.ix.searcher(weighting=scoring.BM25F) as s:  # NLP-style scorer
        c = s.collector(limit=self.MaxResults)
        # The "collector" allows setting the timeout for a search (0.5s).
        c = TimeLimitCollector(c, 0.5)
        try:
            s.search_with_collector(q, c)
        except Exception:
            # Narrowed from a bare `except:`; a whoosh TimeLimit lands here.
            print("TIMEOUT!")  # DEBUG output to console if we're timing out a lot
        # On timeout we still return whatever we've got, i.e. partial results.
        results = c.results()
        self.searchResults.clear()
        # WholeFragmenter: we want the full technical name, not just local context.
        results.fragmenter = WholeFragmenter()
        self.MaudeResults.clear()
        if len(results) > 0:
            self.results = []
            for res in results:
                self.results.append(res['msid'])
                # Prefer the highlighted variant when the match is in that field.
                msid_str = res.highlights('msid') or res['msid']
                tech_str = res.highlights('technical_name') or res['technical_name']
                self.searchResults.append(msid_str + ' - ' + tech_str)
        # Return cursor to beginning of search results.
        cursor = self.searchResults.moveCursor(QtGui.QTextCursor.Start)
def search_metaindex_by_keyword(self, text, limit=None, timelimit=1):
    """Query the metadata search index on its 'key' field.

    Arguments:
        text: string used to perform the search in the index.
        limit: maximum number of results to return (None = unlimited).
        timelimit: maximum seconds the search may run; longer searches
            return only the partial results collected so far.

    Returns:
        A list of dicts, one per hit, each containing the fields of the
        matching metadata document.  Empty when no index is loaded.
    """
    # No metadata index loaded -> nothing to search.
    if not self.metaindex:
        return []

    hits = []
    with self.metaindex.searcher() as searcher:
        query = QueryParser('key', self.metaindex.schema).parse(text)
        timed = TimeLimitCollector(searcher.collector(limit), timelimit,
                                   use_alarm=False)
        try:
            searcher.search_with_collector(query, timed)
        except TimeLimit:
            print(
                "searchByKeyWord: Index search took too long, aborting!"
            )
        # Partial results are still readable after a timeout.
        hits = [dict(hit) for hit in timed.results()]
    return hits
def test_media(): index_path = os.path.join(config.index_root_dir, 'media') storage = FileStorage(index_path) ix = storage.open_index() with ix.searcher() as searcher: #print list(searcher.lexicon('title')) myquery = Term('title', u'尾巴') #myquery = Term('movieid', u'mi1022160') tc = searcher.collector(limit=200) tlc = TimeLimitCollector(tc, timelimit=1) #limit seacher time searcher.search_with_collector(myquery, tlc) for hit in tlc.results(): #print hit.fields() print hit.fields()
def test_video(): index_path = os.path.join(config.index_root_dir, 'video') storage = FileStorage(index_path) ix = storage.open_index() with ix.searcher() as searcher: #print list(searcher.lexicon('title')) myquery = Term('title', u'全面') #myquery = Term('movieid', u'mi1022160') tc = searcher.collector(limit=20) tlc = TimeLimitCollector(tc, timelimit=1) #limit seacher time searcher.search_with_collector(myquery, tlc) #for hit in tlc.results(): #print hit.fields() # print hit.fields().get('title') print '===========================' results = searcher.search_page(myquery, 1, 10) #for hit in results: # print hit.fields().get('title') print '===============================' parser = MultifieldParser(['title', 'pinyin_title'], ix.schema) # parser = QueryParser('title', schema = ix.schema) q = parser.parse(u'quan mian') results = searcher.search_page(q, 1, 10) for hit in results: print hit.fields()
def __init__(self, MSID_index_dir, Searchable, MaxResults=10, Timeout=0.5):
    '''
    Initializes the wrapper object with index reference and preferences
    parameter MSID_index_dir = (string) Existing Whoosh Index directory
    parameter Searchable = (list of strings) Fieldnames of the index to search
    parameter MaxResults = (numeric) Maximum # of results to return
    parameter Timeout = (numeric) Maximum # of seconds to wait before ending search
    '''
    self.ix = index.open_dir(MSID_index_dir)
    # BUG FIX: this assignment was commented out, but doSearch() calls
    # self.qp.parse(...), which raised AttributeError on every search.
    # Search all the specified fields.
    self.qp = MultifieldParser(Searchable, schema=self.ix.schema)
    self.s = self.ix.searcher(weighting=scoring.BM25F)  # BM25F "fancy" scorer
    c = self.s.collector(limit=MaxResults)
    # The "collector" enforces the search timeout (default 0.5s).
    self.c = TimeLimitCollector(c, Timeout)
    self.Searchable = Searchable
    self.LastResults = None
def search_for_track(self, querystring):
    """Search the track index over artist/album/title for *querystring*.

    Queries shorter than 3 characters return [] without touching the index.
    When the exact query yields nothing, retries with per-word fuzzy terms
    (edit distance 2).  Returns the matching track objects.
    """
    if len(querystring) < 3:
        return []
    with self.ix.searcher() as searcher:
        collector = searcher.collector(limit=20)
        tlc = TimeLimitCollector(collector, timelimit=1.4, use_alarm=False)
        parser = MultifieldParser(["artist", "album", "title"], self.ix.schema)
        parser.add_plugin(qparser.FuzzyTermPlugin())
        myquery = parser.parse(querystring)
        try:
            searcher.search_with_collector(myquery, tlc)
            if len(tlc.results()) == 0:
                # No exact hits: retry with fuzzy matching on every word.
                myquery = parser.parse(
                    " ".join(word + "~2" for word in querystring.split()))
                searcher.search_with_collector(myquery, tlc)
        except TimeLimit:
            logging.info("Time Limit for query reached!")
        # BUG FIX: the original passed collector.runtime as a lazy argument to
        # a format string with no placeholder, producing a logging error
        # ("not all arguments converted") instead of the runtime.
        logging.debug("czas zapytania: %s", collector.runtime)
        return [self.__tracks[int(result["id"])] for result in tlc.results()]
def find_closest(self, raw_query, threshold=50):
    """Return the id of the indexed question most similar to *raw_query*,
    or -1 when nothing sufficiently similar (fuzz ratio > threshold) exists
    or the search timed out with no results.
    """
    from whoosh import qparser
    from whoosh.qparser import QueryParser
    from fuzzywuzzy import fuzz
    from extractors.ir import IrIndex

    # Build the parser lazily, once, and cache it on the instance.
    if self.parser is None:
        og = qparser.OrGroup.factory(0.9)
        self.parser = QueryParser("text", schema=self.schema, group=og)
    query_text, query_len = IrIndex.prepare_query(raw_query.lower())
    print("Query: %s" % query_text)
    query = self.parser.parse(query_text)
    print("-------------")
    closest_question = -1
    with self.index.searcher() as s:
        c = s.collector(limit=10)
        tlc = TimeLimitCollector(c, timelimit=5)
        try:
            s.search_with_collector(query, tlc)
        except TimeLimit:
            # Was a bare `None` expression (no-op); partial results are
            # read from the collector below.
            pass
        try:
            results = tlc.results()
        except TimeLimit:
            print("Time limit reached!")
            return -1
        # BUG FIX: guard against an empty result set, which previously
        # raised IndexError on results[0].
        if len(results) == 0:
            return -1
        print(results[0]['id'], self.raw[results[0]['id']][:50])
        similarity = fuzz.ratio(self.raw[results[0]['id']], raw_query.lower())
        if similarity > threshold:
            closest_question = results[0]['id']
            print("Old!", closest_question, similarity)
        else:
            print("NEW! %f" % similarity)
    print("-------------")
    return closest_question
def searchIndex(self, sq):
    """Search the query/target translation-memory index for *sq*.

    Returns a jsonable list of dicts with keys 'sourcelang', 'targetlang'
    and 'distance' (inverse score).  On timeout, partial hits are returned.
    """
    indexParser = MultifieldParser(["query", "target"],
                                   schema=self.schema).parse(unicode(sq))
    with self.ix.searcher() as s:
        collector = s.collector(limit=None)
        timed_collector = TimeLimitCollector(collector, timelimit=30.0)
        try:
            # NOTE: search_with_collector() returns None; the original bound
            # that None to a name, misleadingly.  Hits are read from the
            # collector below.
            s.search_with_collector(indexParser, timed_collector)
        except TimeLimit:
            # Typo fixed in the message ("ime" -> "time").
            print('Search time limit of 30 seconds exceeded.')
        hits = timed_collector.results()  # partial results on timeout
        # Convert result structure into a jsonable list
        # TODO: improve this structure
        matches = []
        for i in hits:
            matches.append({"sourcelang": i["query"],
                            "targetlang": i["target"],
                            "distance": (1.0 / i.score)})
        return matches
def find_closest(self, raw_query, threshold=50):
    """Return the id of the indexed question most similar to *raw_query*,
    or -1 when nothing sufficiently similar (fuzz ratio > threshold) exists
    or the search timed out with no results.
    """
    from whoosh import qparser
    from whoosh.qparser import QueryParser
    from fuzzywuzzy import fuzz
    from extractors.ir import IrIndex

    # Build the parser lazily, once, and cache it on the instance.
    if self.parser is None:
        og = qparser.OrGroup.factory(0.9)
        self.parser = QueryParser("text", schema=self.schema, group=og)
    query_text, query_len = IrIndex.prepare_query(raw_query.lower())
    print("Query: %s" % query_text)
    query = self.parser.parse(query_text)
    print("-------------")
    closest_question = -1
    with self.index.searcher() as s:
        c = s.collector(limit=10)
        tlc = TimeLimitCollector(c, timelimit=5)
        try:
            s.search_with_collector(query, tlc)
        except TimeLimit:
            # Was a bare `None` expression (no-op); partial results are
            # read from the collector below.
            pass
        try:
            results = tlc.results()
        except TimeLimit:
            print("Time limit reached!")
            return -1
        # BUG FIX: guard against an empty result set, which previously
        # raised IndexError on results[0].
        if len(results) == 0:
            return -1
        print(results[0]["id"], self.raw[results[0]["id"]][:50])
        similarity = fuzz.ratio(self.raw[results[0]["id"]], raw_query.lower())
        if similarity > threshold:
            closest_question = results[0]["id"]
            print("Old!", closest_question, similarity)
        else:
            print("NEW! %f" % similarity)
    print("-------------")
    return closest_question
class WhooshWrap():
    '''
    Wrapper class to make Whoosh API a little simpler

    Initialize by pointing to an existing Whoosh index and specifying
    searchable fields, Max Results and Timeout.

    Query by running self.doSearch, providing the query string.

    Results of the last search are stored on the object as a Whoosh results
    object (requires open index to access) and returned as a plain dict.
    '''

    def __init__(self, MSID_index_dir, Searchable, MaxResults=10, Timeout=0.5):
        '''
        Initializes the wrapper object with index reference and preferences
        parameter MSID_index_dir = (string) Existing Whoosh Index directory
        parameter Searchable = (list of strings) Fieldnames of the index to search
        parameter MaxResults = (numeric) Maximum # of results to return
        parameter Timeout = (numeric) Maximum # of seconds to wait before ending search
        '''
        self.ix = index.open_dir(MSID_index_dir)
        # BUG FIX: this assignment was commented out, but doSearch() calls
        # self.qp.parse(...), which raised AttributeError on every search.
        self.qp = MultifieldParser(Searchable, schema=self.ix.schema)
        self.s = self.ix.searcher(weighting=scoring.BM25F)  # BM25F "fancy" scorer
        c = self.s.collector(limit=MaxResults)
        # The "collector" enforces the search timeout (default 0.5s).
        self.c = TimeLimitCollector(c, Timeout)
        self.Searchable = Searchable
        self.LastResults = None

    def doSearch(self, qstring, ReturnFields):
        '''
        Performs a search on the index with the provided query and returns a
        dict of results.
        parameter qstring = (string) Search key
        parameter ReturnFields = (list of strings) Fieldnames to include in the
            return value.  NOTE: may differ from Searchable, but the fields
            must exist in the index.
        returnval = dict of field name -> list of result strings, i.e.
            {'Field 1': [results...], 'Field 2': [results...], ...}
        '''
        q = self.qp.parse(qstring)  # build query with event-provided search key
        try:
            self.s.search_with_collector(q, self.c)
        except Exception:
            # Narrowed from a bare `except:`; a whoosh TimeLimit lands here.
            print("TIMEOUT!")  # DEBUG output to console if we're timing out a lot
        # On timeout we still return whatever we've got, i.e. partial results.
        results = self.c.results()
        self.LastResults = results
        ResultsDict = {}
        for field in ReturnFields:
            ResultsDict[field] = []
            for res in results:
                # NOTE(review): assumes `field` exists in every hit — a missing
                # field raises KeyError here; verify against the index schema.
                ResultsDict[field].append(res[field])
        return ResultsDict
#!/usr/local/bin/python #-*- encoding:utf-8 -*- from whoosh.index import open_dir from whoosh.fields import * from whoosh import qparser; from chinesetokenizer import ChineseAnalyzer #from whoosh.analysis import RegexAnalyzer #analyzer = RegexAnalyzer(ur"([\u4e00-\u9fa5])|(\w+(\.?\w+)*)") from whoosh.collectors import TimeLimitCollector, TimeLimit analyzer = ChineseAnalyzer() ix = open_dir('IndexDir/titleIndex'); with ix.searcher() as searcher: qp = qparser.QueryParser("content", ix.schema,group=qparser.syntax.OrGroup); c = searcher.collector(limit=10); tlc = TimeLimitCollector(c, timelimit=15); q = qp.parse(u'五子棋GOMOKU') for pair in q.all_terms(): print pair; results = searcher.search_with_collector(q, tlc); print 'Here' if results.has_matched_terms(): print('YYY',results.matched_terms()) if 0 != len(results): for hit in results: print 'xxx'; print hit['content'].encode('utf-8');
def cal_sim(train_data_path, test_data_path, dst_result_path=None,
            save_n_best_search=1):
    """Index post/response pairs from *train_data_path* and, for each post in
    *test_data_path*, write its best-matching train pairs (TF-IDF search on
    the 'post' field) to *dst_result_path*, one tab-separated line per match.
    """
    schema = Schema(context=TEXT(stored=True), response=STORED,
                    post=TEXT(stored=True))
    # Raw string for the regex: '\d' is an invalid escape in recent Pythons.
    index_i = re.findall(r'\d', train_data_path)[0]
    index_path = "../tmp/ix_index/" + index_i
    if not os.path.exists(index_path):
        os.makedirs(index_path)
    ix = create_in(index_path, schema)
    writer = ix.writer()

    def get_cpr(line):
        # Split "post \t response"; context is always empty in this data.
        lines = line.lower().strip().split('\t')
        context = ''
        post = lines[0]
        response = lines[1]
        return context.strip().decode('utf-8'), response.decode(
            'utf-8'), post.decode('utf-8')

    def load_train_data(file_name, writer):
        # `with` closes the file; the original leaked the handle.
        with open(file_name) as f:
            for line in f:
                context, response, post = get_cpr(line)
                if context != '':
                    writer.add_document(context=context, response=response,
                                        post=post)
                else:
                    writer.add_document(response=response, post=post)
        writer.commit()

    def get_query(line, ix):
        # OR together all terms of the parsed post so partial overlap matches.
        lines = line.strip().split('\t')
        post = lines[0].decode('utf-8')
        q2 = QueryParser("post", ix.schema).parse(post)
        terms = list(q2.all_terms())
        return Or([Term(*x) for x in terms])

    load_train_data(train_data_path, writer)
    # `with` guarantees both files are closed even on error; the original
    # never closed the test-data handle at all.
    with ix.searcher(weighting=scoring.TF_IDF()) as searcher, \
            open(test_data_path, 'r') as f, \
            open(dst_result_path, 'w') as fw_search:
        for line in f:
            try:
                # BUG FIX: build a fresh collector per query.  The original
                # reused one TimeLimitCollector for every search, so its
                # timer and collected docs went stale after the first query.
                c = searcher.collector(limit=10)
                tlc = TimeLimitCollector(c, timelimit=10.0)
                query = get_query(line, ix)
                searcher.search_with_collector(query, tlc)
                results = tlc.results()
                for i in range(min(len(results), save_n_best_search)):
                    fw_search.write(line.strip() + '\t' +
                                    str(results[i]["post"]) + '\t' +
                                    str(results[i]["response"]) + '\n')
            except Exception as e:
                # Best-effort: skip bad lines / timeouts and keep going.
                print('TimeLimit, ignore it!')
                print(line)
query[i] = QueryParser("content", ix.schema).parse(sentenceToBeParsed[i]) # Top 'n' documents as result #topN = 2 overlaps = [set() for _ in range(indexStore)] overlapCount = 0 with ix.searcher() as searcher: # Get a collector object print("Finished loading searcher") for i, k in zip(range(0, len(fullSentence)), tqdm(range(len(overlaps)))): c = searcher.collector(limit=50, terms=True) # Wrap it in a TimeLimitedCollector and set the time limit to 10 seconds tlc = TimeLimitCollector(c, timelimit=120.0) # Try searching try: searcher.search_with_collector(query[i], tlc) except TimeLimit: print("Search took too long, aborting!") results = tlc.results() #results = searcher.search(query, terms=True,limit=10) #results= searcher.search(query,limit=10) if results.scored_length() > 0: overlapCount += 1 for j in range(0, results.scored_length()):
def home(request):
    """Render the search form; on a valid POST, run the Whoosh search under
    the user-supplied time limit and email the results to the given address.
    """
    title = "Search text"
    form = SearchForm(request.POST or None)
    context = {
        "title": title,
        "form": form
    }
    if form.is_valid():
        instance = form.save(commit=False)
        instance.save()
        message = "You will get search results for: %s via %s soon" % (instance.searching_text, instance.email)
        context = {
            "title": "Thank you",
            "message": message,
        }
        with ix.searcher() as searcher:
            query = QueryParser("text", ix.schema).parse(instance.searching_text)
            # Collector with the user-supplied time limit; partial results
            # are still readable from the collector after a TimeLimit.
            c = searcher.collector(limit=None)
            tlc = TimeLimitCollector(c, timelimit=instance.t_limit)
            try:
                searcher.search_with_collector(query, tlc)
            except TimeLimit:
                pass
            results = tlc.results()
            # One human-readable location line per hit.
            lst = ['Book: ' + r["book"] + ', chapter: ' + r["chapter"] +
                   ', page: ' + str(r["page"]) for r in results]
        # NOTE(review): configuring logging inside a view is unusual; kept
        # for behavior, but this belongs in project-level settings.
        logging.basicConfig(format=u'%(levelname)-8s [%(asctime)s] %(message)s',
                            level=logging.DEBUG, filename=u'mylog.log')
        time_diff = datetime.datetime.now(timezone.utc) - instance.timestamp
        logging.info(time_diff.total_seconds())
        subject = 'Search results for: ' + form.cleaned_data.get('searching_text')
        message = 'Search results for: ' + form.cleaned_data.get('searching_text') + '\n'
        for i in range(0, len(lst)):
            message += str(i + 1) + ') '
            message += lst[i]
            message += '\n'
        from_email = settings.EMAIL_HOST_USER
        to_email = form.cleaned_data.get('email')
        send_mail(subject, message, from_email, [to_email], fail_silently=True)
    return render(request, "home.html", context)
def home(request):
    """Render the search form; on a valid POST, run the Whoosh search under
    the user-supplied time limit and email the results to the given address.
    """
    title = "Search text"
    form = SearchForm(request.POST or None)
    context = {"title": title, "form": form}
    if form.is_valid():
        instance = form.save(commit=False)
        instance.save()
        message = "You will get search results for: %s via %s soon" % (
            instance.searching_text, instance.email)
        context = {
            "title": "Thank you",
            "message": message,
        }
        with ix.searcher() as searcher:
            query = QueryParser("text", ix.schema).parse(instance.searching_text)
            # Get a collector object
            c = searcher.collector(limit=None)
            # Wrap it in a TimeLimitCollector with the user-supplied limit
            tlc = TimeLimitCollector(c, timelimit=instance.t_limit)
            # Try searching
            try:
                searcher.search_with_collector(query, tlc)
            except TimeLimit:
                pass
            # You can still get partial results from the collector
            results = tlc.results()
            # Build one human-readable location line per hit.
            lst = []
            for i in range(0, len(results)):
                st = ''
                st += 'Book: '
                st += results[i]["book"]
                st += ', chapter: '
                st += results[i]["chapter"]
                st += ', page: '
                st += str(results[i]["page"])
                lst.append(st)
        # with ix.searcher() as searcher:
        #     query = QueryParser("text", ix.schema).parse(instance.searching_text)
        #     results = searcher.search(query)
        #     lst = []
        #     for i in range(0, len(results)):
        #         st = ''
        #         st += 'Book: '
        #         st += results[i]["book"]
        #         st += ', chapter: '
        #         st += results[i]["chapter"]
        #         st += ', page: '
        #         st += str(results[i]["page"])
        #         lst.append(st)
        # NOTE(review): configuring logging inside a view is unusual; this
        # belongs in project-level settings.
        logging.basicConfig(
            format=u'%(levelname)-8s [%(asctime)s] %(message)s',
            level=logging.DEBUG,
            filename=u'mylog.log')
        time_diff = datetime.datetime.now(timezone.utc) - instance.timestamp
        logging.info(time_diff.total_seconds())
        # Compose and send the results email.
        subject = 'Search results for: ' + form.cleaned_data.get(
            'searching_text')
        message = 'Search results for: ' + form.cleaned_data.get(
            'searching_text') + '\n'
        for i in range(0, len(lst)):
            message += str(i + 1) + ') '
            message += lst[i]
            message += '\n'
        from_email = settings.EMAIL_HOST_USER
        to_email = form.cleaned_data.get('email')
        send_mail(subject, message, from_email, [to_email],
                  fail_silently=True)
    return render(request, "home.html", context)