def search_metaindex_by_keyword(self, text, limit=None, timelimit=1):
    """
    Performs a query on the 'key' field of the metadata search index.

    Arguments:
        text: String used to perform the search in the index.
        limit: Maximum number of results to return. By default there is
            no limit.
        timelimit: Maximum number of seconds the search may run. Searches
            that take longer than timelimit return only partial results.

    Returns:
        A list of dictionaries, each containing the fields of the metadata
        index, whose values match the query text in the 'key' field.
    """
    results_list = []
    if self.metaindex:
        with self.metaindex.searcher() as searcher:
            query = QueryParser('key', self.metaindex.schema).parse(text)
            coll = searcher.collector(limit)
            tlc = TimeLimitCollector(coll, timelimit, use_alarm=False)
            # Try searching
            try:
                searcher.search_with_collector(query, tlc)
            except TimeLimit:
                print("search_metaindex_by_keyword: Index search took too long, aborting!")
            # Get partial results, if available
            results = tlc.results()
            for res in results:
                results_list.append(dict(res))
    return results_list
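# Every snippet in this file follows the same Whoosh pattern: wrap a regular
# collector in a TimeLimitCollector, run the search through
# searcher.search_with_collector(), catch TimeLimit, and read whatever
# (possibly partial) results the collector gathered. A minimal
# self-contained sketch of that pattern follows; the schema, field names,
# and documents are hypothetical, not taken from any snippet here.
from whoosh.collectors import TimeLimitCollector, TimeLimit
from whoosh.fields import Schema, TEXT, ID
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

def _demo_time_limited_search():
    # Build a throwaway in-memory index with two documents.
    schema = Schema(path=ID(stored=True), body=TEXT(stored=True))
    ix = RamStorage().create_index(schema)
    writer = ix.writer()
    writer.add_document(path=u'/a', body=u'time limited search example')
    writer.add_document(path=u'/b', body=u'another example document')
    writer.commit()

    with ix.searcher() as searcher:
        query = QueryParser('body', ix.schema).parse(u'example')
        # TimeLimit is raised if the search runs longer than timelimit seconds.
        tlc = TimeLimitCollector(searcher.collector(limit=10), timelimit=1.0)
        try:
            searcher.search_with_collector(query, tlc)
        except TimeLimit:
            pass  # fall through and keep whatever was collected so far
        # results() returns the (possibly partial) hits either way.
        return [hit['path'] for hit in tlc.results()]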
def doSearch(self, text):
    q = self.qp.parse(text)  # build query
    with self.ix.searcher(weighting=scoring.Frequency) as s:  # a simple scorer is enough here
        c = s.collector(limit=self.MaxResults)
        c = TimeLimitCollector(c, 0.5)
        try:
            s.search_with_collector(q, c)
        except TimeLimit:
            print("TIMEOUT!")
        results = c.results()  # partial results if the search timed out
        self.searchResults.clear()
        # my_cf = highlight.PinpointFragmenter(maxchars=100, surround=60)
        my_cf = highlight.ContextFragmenter(maxchars=160, surround=30)
        # my_cf = highlight.SentenceFragmenter(maxchars=200, sentencechars='\n')
        results.fragmenter = my_cf
        if len(results) > 0:
            for res in results:
                res.fragmenter = my_cf
                # self.searchResults.append(res.highlights('Text', top=1) + '*--*\n' + res['MeetingLink'] + '\n')
                self.searchResults.append(res.highlights('Text', top=1))
                self.searchResults.append('-Link to Meeting -')
                self.searchResults.append(res['MeetingLink'] + '\n')
                self.searchResults.append('----------')
                self.searchResults.append('----------')
        cursor = self.searchResults.moveCursor(QtGui.QTextCursor.Start)
def full_search(self, query, time_limit=-1, search_limit=50, edit_dist=0):
    val = {}
    # Create the searcher outside the try block so the finally clause
    # can always close it.
    searcher = self._index.searcher(weighting=scoring.TF_IDF())
    try:
        if time_limit > 0:
            c = searcher.collector(limit=search_limit)
            tlc = TimeLimitCollector(c, timelimit=time_limit)
            try:
                searcher.search_with_collector(query, tlc)
            except TimeLimit:
                pass  # keep whatever was collected before the timeout
            try:
                res = tlc.results()
            except TimeLimit:
                res = []
        else:
            res = searcher.search(query, limit=search_limit)
        for ii in res:
            val[ii['title']] = (ii.docnum, self.scale(ii.score))
    finally:
        searcher.close()
    return val
def doSearch(self, text):
    q = self.qp.parse(text)  # build query with the event-provided search key
    with self.ix.searcher(weighting=scoring.BM25F) as s:  # Whoosh offers several NLP-style scorers
        # The collector allows setting a timeout for the search.
        # Half a second is a little long, but safe.
        c = s.collector(limit=self.MaxResults)
        c = TimeLimitCollector(c, 0.5)
        try:
            s.search_with_collector(q, c)
        except TimeLimit:
            print("TIMEOUT!")  # debug output to the console if we're timing out a lot
        # On a timeout we still return whatever we've got, i.e. partial results
        results = c.results()
        # -----------------------------------------------------
        self.searchResults.clear()
        # ** Now format the results for display **
        results.fragmenter = WholeFragmenter()  # we want the full technical name, not just the local context
        self.MaudeResults.clear()
        if len(results) > 0:
            self.results = []
            for res in results:
                self.results.append(res['msid'])
                # Construct the MSID string with highlights, if that's where the match is...
                HighLightedMsid = res.highlights('msid')
                if len(HighLightedMsid) > 0:
                    msid_str = HighLightedMsid
                else:
                    msid_str = res['msid']
                # Construct the technical_name string with highlights, if relevant
                HighLightedTechName = res.highlights('technical_name')
                if len(HighLightedTechName) > 0:
                    tech_str = HighLightedTechName
                else:
                    tech_str = res['technical_name']
                self.searchResults.append(msid_str + ' - ' + tech_str)
        cursor = self.searchResults.moveCursor(QtGui.QTextCursor.Start)  # return cursor to the top of the results
def search(self, text: str, limit: int, timelimit=3.0):
    with self.index.searcher() as searcher:
        or_group = OrGroup.factory(.9)
        parser = MultifieldParser(['content', 'quiz_bowl'], schema=self.schema, group=or_group)
        text_query = parser.parse(text)
        collector = searcher.collector(limit=limit)
        tlc = TimeLimitCollector(collector, timelimit=timelimit)
        partial = True
        try:
            searcher.search_with_collector(text_query, tlc)
            partial = False
        except searching.TimeLimit:
            pass
        # There is a bug in whoosh that makes calling len directly or
        # indirectly fail, which is why we don't use list()
        results = [(r['page'], r.score) for r in tlc.results()]
        # Logging via the partial flag instead of logging directly is required
        # due to a mysterious race condition between whoosh time limits and
        # log.info. It is important that all of whoosh's functions, including
        # search_with_collector() and tlc.results(), are called before
        # logging anything
        if partial:
            log.info('Search took longer than {}s, getting partial results'.format(timelimit))
        if len(results) == 0:
            return [('<UNK_ANSWER>', 0)]
        return results
def test_video():
    index_path = os.path.join(config.index_root_dir, 'video')
    storage = FileStorage(index_path)
    ix = storage.open_index()
    with ix.searcher() as searcher:
        # print(list(searcher.lexicon('title')))
        myquery = Term('title', u'全面')
        # myquery = Term('movieid', u'mi1022160')
        tc = searcher.collector(limit=20)
        tlc = TimeLimitCollector(tc, timelimit=1)  # limit search time
        searcher.search_with_collector(myquery, tlc)
        # for hit in tlc.results():
        #     print(hit.fields())
        #     print(hit.fields().get('title'))
        print('===========================')
        results = searcher.search_page(myquery, 1, 10)
        # for hit in results:
        #     print(hit.fields().get('title'))
        print('===============================')
        parser = MultifieldParser(['title', 'pinyin_title'], ix.schema)
        # parser = QueryParser('title', schema=ix.schema)
        q = parser.parse(u'quan mian')
        results = searcher.search_page(q, 1, 10)
        for hit in results:
            print(hit.fields())
def test_media():
    index_path = os.path.join(config.index_root_dir, 'media')
    storage = FileStorage(index_path)
    ix = storage.open_index()
    with ix.searcher() as searcher:
        # print(list(searcher.lexicon('title')))
        myquery = Term('title', u'尾巴')
        # myquery = Term('movieid', u'mi1022160')
        tc = searcher.collector(limit=200)
        tlc = TimeLimitCollector(tc, timelimit=1)  # limit search time
        searcher.search_with_collector(myquery, tlc)
        for hit in tlc.results():
            print(hit.fields())
def __init__(self, MSID_index_dir, Searchable, MaxResults=10, Timeout=0.5):
    '''
    Initializes the wrapper object with an index reference and preferences.

    parameter MSID_index_dir = (string) Existing Whoosh index directory
    parameter Searchable     = (list of strings) Fieldnames of the index to search
    parameter MaxResults     = (numeric) Maximum number of results to return
    parameter Timeout        = (numeric) Maximum number of seconds to wait before ending a search
    '''
    self.ix = index.open_dir(MSID_index_dir)
    self.qp = MultifieldParser(Searchable, schema=self.ix.schema)  # search all the specified fields
    # self.qp = QueryParser(Searchable[0], schema=self.ix.schema)  # search ONLY the first field
    # self.s = self.ix.searcher(weighting=scoring.Frequency)  # simple scorer
    self.s = self.ix.searcher(weighting=scoring.BM25F)  # fancy scorer
    # The collector allows setting a timeout for a search. Half a second is a little long, but safe.
    c = self.s.collector(limit=MaxResults)
    self.c = TimeLimitCollector(c, Timeout)
    self.Searchable = Searchable
    self.LastResults = None
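# A hedged sketch of a search method that could pair with the __init__
# above; the method name and return value are assumptions, not part of the
# original snippet. It reuses the stored collector self.c, which is how this
# class is set up; the other snippets in this file build a fresh collector
# per query, which is the more common pattern.
def search(self, text):
    q = self.qp.parse(text)
    try:
        self.s.search_with_collector(q, self.c)
    except TimeLimit:
        pass  # the collector still holds whatever it gathered in time
    self.LastResults = self.c.results()
    return self.LastResults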
def find_closest(self, raw_query, threshold=50):
    """
    Returns the id of the indexed question most similar to raw_query,
    or -1 if no candidate clears the similarity threshold.
    """
    from whoosh import qparser
    from whoosh.qparser import QueryParser
    from fuzzywuzzy import fuzz
    from extractors.ir import IrIndex

    if self.parser is None:
        og = qparser.OrGroup.factory(0.9)
        self.parser = QueryParser("text", schema=self.schema, group=og)

    query_text, query_len = IrIndex.prepare_query(raw_query.lower())
    print("Query: %s" % query_text)
    query = self.parser.parse(query_text)
    print("-------------")

    closest_question = -1
    with self.index.searcher() as s:
        c = s.collector(limit=10)
        tlc = TimeLimitCollector(c, timelimit=5)
        try:
            s.search_with_collector(query, tlc)
        except TimeLimit:
            pass  # partial results may still be available
        try:
            results = tlc.results()
        except TimeLimit:
            print("Time limit reached!")
            return -1
        if len(results) == 0:
            return -1  # nothing matched at all
        print(results[0]['id'], self.raw[results[0]['id']][:50])
        similarity = fuzz.ratio(self.raw[results[0]['id']], raw_query.lower())
        if similarity > threshold:
            closest_question = results[0]['id']
            print("Old!", closest_question, similarity)
        else:
            print("NEW! %f" % similarity)
    print("-------------")
    return closest_question
def searchIndex(self, sq):
    indexParser = MultifieldParser(["query", "target"], schema=self.schema).parse(str(sq))
    with self.ix.searcher() as s:
        collector = s.collector(limit=None)
        timed_collector = TimeLimitCollector(collector, timelimit=30.0)
        try:
            # search_with_collector() returns None; the results are read
            # back from the collector below
            s.search_with_collector(indexParser, timed_collector)
        except TimeLimit:
            print('Search time limit of 30 seconds exceeded.')
        hits = timed_collector.results()
        # Convert the result structure into a jsonable list
        # TODO: improve this structure
        matches = []
        for i in hits:
            matches.append({"sourcelang": i["query"],
                            "targetlang": i["target"],
                            "distance": (1.0 / i.score)})
        return matches
#!/usr/local/bin/python
# -*- encoding: utf-8 -*-
from whoosh.index import open_dir
from whoosh.fields import *
from whoosh import qparser
from chinesetokenizer import ChineseAnalyzer
# from whoosh.analysis import RegexAnalyzer
# analyzer = RegexAnalyzer(ur"([\u4e00-\u9fa5])|(\w+(\.?\w+)*)")
from whoosh.collectors import TimeLimitCollector, TimeLimit

analyzer = ChineseAnalyzer()
ix = open_dir('IndexDir/titleIndex')
with ix.searcher() as searcher:
    qp = qparser.QueryParser("content", ix.schema, group=qparser.syntax.OrGroup)
    # terms=True is needed so matched_terms() works on the results
    c = searcher.collector(limit=10, terms=True)
    tlc = TimeLimitCollector(c, timelimit=15)
    q = qp.parse(u'五子棋GOMOKU')
    for pair in q.all_terms():
        print(pair)
    # search_with_collector() returns None; read the (possibly partial)
    # results back from the collector instead
    try:
        searcher.search_with_collector(q, tlc)
    except TimeLimit:
        print('Search timed out; using partial results')
    results = tlc.results()
    print('Here')
    if results.has_matched_terms():
        print('YYY', results.matched_terms())
    if 0 != len(results):
        for hit in results:
            print('xxx')
            print(hit['content'])
def cal_sim(train_data_path, test_data_path, dst_result_path=None, save_n_best_search=1):
    schema = Schema(context=TEXT(stored=True), response=STORED, post=TEXT(stored=True))
    index_i = re.findall(r'\d', train_data_path)[0]
    index_path = "../tmp/ix_index/" + index_i
    if not os.path.exists(index_path):
        os.makedirs(index_path)
    ix = create_in(index_path, schema)
    writer = ix.writer()

    def get_cpr(line):
        lines = line.lower().strip().split('\t')
        context = ''
        post = lines[0]
        response = lines[1]
        return (context.strip().decode('utf-8'), response.decode('utf-8'),
                post.decode('utf-8'))

    def load_train_data(file_name, writer):
        f = open(file_name)
        for line in f:
            context, response, post = get_cpr(line)
            if context != '':
                writer.add_document(context=context, response=response, post=post)
            else:
                writer.add_document(response=response, post=post)
        writer.commit()

    def get_query(line, ix):
        lines = line.strip().split('\t')
        post = lines[0].decode('utf-8')
        q2 = QueryParser("post", ix.schema).parse(post)
        terms = list(q2.all_terms())
        # Search for any of the post's terms, rather than requiring all of them
        query = Or([Term(*x) for x in terms])
        return query

    load_train_data(train_data_path, writer)
    f = open(test_data_path, 'r')
    fw_search = open(dst_result_path, 'w')
    with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
        c = searcher.collector(limit=10)
        tlc = TimeLimitCollector(c, timelimit=10.0)
        for line in f:
            try:
                query = get_query(line, ix)
                searcher.search_with_collector(query, tlc)
                results = tlc.results()
                for i in range(min(len(results), save_n_best_search)):
                    fw_search.write(line.strip() + '\t' +
                                    str(results[i]["post"]) + '\t' +
                                    str(results[i]["response"]) + '\n')
            except Exception as e:
                print('TimeLimit, ignore it!')
                print(line)
    fw_search.close()
    query[i] = QueryParser("content", ix.schema).parse(sentenceToBeParsed[i])

# Top 'n' documents as result
# topN = 2
overlaps = [set() for _ in range(indexStore)]
overlapCount = 0
with ix.searcher() as searcher:
    print("Finished loading searcher")
    for i, k in zip(range(0, len(fullSentence)), tqdm(range(len(overlaps)))):
        # Get a collector object
        c = searcher.collector(limit=50, terms=True)
        # Wrap it in a TimeLimitCollector and set the time limit to 120 seconds
        tlc = TimeLimitCollector(c, timelimit=120.0)
        # Try searching
        try:
            searcher.search_with_collector(query[i], tlc)
        except TimeLimit:
            print("Search took too long, aborting!")
        # Partial results are still available after a timeout
        results = tlc.results()
        # results = searcher.search(query, terms=True, limit=10)
        # results = searcher.search(query, limit=10)
        if results.scored_length() > 0:
            overlapCount += 1
            for j in range(0, results.scored_length()):
def home(request):
    title = "Search text"
    form = SearchForm(request.POST or None)
    context = {"title": title, "form": form}
    if form.is_valid():
        instance = form.save(commit=False)
        instance.save()
        message = "You will get search results for: %s via %s soon" % (
            instance.searching_text, instance.email)
        context = {
            "title": "Thank you",
            "message": message,
        }
        with ix.searcher() as searcher:
            query = QueryParser("text", ix.schema).parse(instance.searching_text)
            # Get a collector object
            c = searcher.collector(limit=None)
            # Wrap it in a TimeLimitCollector with the user-supplied time limit
            tlc = TimeLimitCollector(c, timelimit=instance.t_limit)
            # Try searching
            try:
                searcher.search_with_collector(query, tlc)
            except TimeLimit:
                pass
            # You can still get partial results from the collector
            results = tlc.results()
            lst = []
            for i in range(0, len(results)):
                st = 'Book: ' + results[i]["book"]
                st += ', chapter: ' + results[i]["chapter"]
                st += ', page: ' + str(results[i]["page"])
                lst.append(st)
        logging.basicConfig(
            format=u'%(levelname)-8s [%(asctime)s] %(message)s',
            level=logging.DEBUG,
            filename=u'mylog.log')
        time_diff = datetime.datetime.now(timezone.utc) - instance.timestamp
        logging.info(time_diff.total_seconds())
        subject = 'Search results for: ' + form.cleaned_data.get('searching_text')
        message = 'Search results for: ' + form.cleaned_data.get('searching_text') + '\n'
        for i in range(0, len(lst)):
            message += str(i + 1) + ') ' + lst[i] + '\n'
        from_email = settings.EMAIL_HOST_USER
        to_email = form.cleaned_data.get('email')
        send_mail(subject, message, from_email, [to_email], fail_silently=True)
    return render(request, "home.html", context)