from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from whoosh import index, scoring
from whoosh.qparser import OrGroup, QueryParser


def getSearchEngineResult(query_dict):
    result_dict = {}
    ix = index.open_dir("index")

    # with ix.searcher(weighting=scoring.BM25F()) as searcher:
    # ScoringFunction is a project-specific weighting model, not part of whoosh
    with ix.searcher(weighting=scoring.ScoringFunction()) as searcher:
        # TODO - Define your own query parser
        parser = QueryParser("contents", schema=ix.schema, group=OrGroup.factory(0))
        stemmizer = LancasterStemmer()
        stopWords = set(stopwords.words('english'))
        # print(stopWords)

        for qid, q in query_dict.items():
            # str.maketrans requires both strings to have equal length, so each
            # of the five punctuation characters is mapped to its own space
            table = str.maketrans('\n?.,!', '     ')
            q_nomark = q.translate(table)
            new_q = ''
            for word in q_nomark.split(' '):
                if word.lower() not in stopWords:
                    word_stem = stemmizer.stem(word.lower())
                    new_q += word_stem + ' '
            # print(new_q)
            query = parser.parse(new_q.lower())
            results = searcher.search(query, limit=None)
            # for result in results:
            #     print(result.fields()['docID'], result.score)
            result_dict[qid] = [result.fields()['docID'] for result in results]

    return result_dict
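# A minimal usage sketch for getSearchEngineResult; the query IDs and texts are
# hypothetical, and an index with 'docID' and 'contents' fields is assumed to
# already exist in ./index:
if __name__ == '__main__':
    queries = {
        1: "How do airplanes achieve lift?",
        2: "What causes earthquakes?",
    }
    for qid, doc_ids in getSearchEngineResult(queries).items():
        print(qid, doc_ids[:5])  # top five docIDs per query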
def search(self, text: str, limit: int, timelimit=3.0):
    with self.index.searcher() as searcher:
        or_group = OrGroup.factory(.9)
        parser = MultifieldParser(['content', 'quiz_bowl'], schema=self.schema, group=or_group)
        text_query = parser.parse(text)
        collector = searcher.collector(limit=limit)
        tlc = TimeLimitCollector(collector, timelimit=timelimit)
        partial = True
        try:
            searcher.search_with_collector(text_query, tlc)
            partial = False
        except searching.TimeLimit:
            pass
        # There is a bug in whoosh that makes calling len directly or indirectly
        # fail, which is why we don't use list()
        results = [(r['page'], r.score) for r in tlc.results()]
        # Logging via the partial flag instead of logging directly is required due
        # to a mysterious race condition between whoosh time limits and log.info.
        # It's important that all of whoosh's functions, including
        # search_with_collector() and tlc.results(), are called before logging
        # anything.
        if partial:
            log.info('Search took longer than {}s, getting partial results'.format(timelimit))
        if len(results) == 0:
            return [('<UNK_ANSWER>', 0)]
        return results
def __init__(self, index_dir, index_name='dialogues', context_size=1, rewrite=False,
             num_candidates=5, slot_detector=None):
    '''
    Load index from index_dir or build it from dialogues.
    context_size: number of previous utterances to include
    '''
    self.slot_detector = slot_detector
    self.index_name = index_name
    if not index.exists_in(index_dir, indexname=index_name) or rewrite:
        if not os.path.exists(index_dir):
            print('Create index in', index_dir)
            os.makedirs(index_dir)
        elif rewrite:
            print('Rewrite index in', index_dir)
            shutil.rmtree(index_dir)
            os.makedirs(index_dir)
        self.ix = index.create_in(index_dir, schema=DialogueSchema, indexname=index_name)
        self.loaded_index = False
    else:
        print('Load index from', index_dir)
        self.ix = index.open_dir(index_dir, indexname=index_name)
        self.loaded_index = True

    self.context_size = context_size
    self.parser_icontext = QueryParser('immediate_context', schema=self.ix.schema,
                                       group=OrGroup.factory(0.9))
    self.parser_pcontext = QueryParser('prev_context', schema=self.ix.schema,
                                       group=OrGroup.factory(0.9))
    self.parser_title = QueryParser('title', schema=self.ix.schema,
                                    group=OrGroup.factory(0.9))
    self.num_candidates = num_candidates
    self.empty_candidates = [{} for _ in range(self.num_candidates)]

    self.search_time = 0.
    self.num_query = 0.
    self.num_empty = 0
def before_first_request():
    global parser, searcher
    index = load_index()
    og = OrGroup.factory(0.9)
    parser = QueryParser("text", schema=index.schema, group=og)
    searcher = index.searcher()
    app.logger.addHandler(logging.StreamHandler(sys.stdout))
    app.logger.setLevel(logging.INFO)
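# A sketch of how the module-level parser and searcher initialized above might be
# used in a request handler; the route, query parameter name, and response shape
# are assumptions, not part of the original app:
from flask import jsonify, request


@app.route("/search")
def handle_search():
    hits = searcher.search(parser.parse(request.args.get("q", "")), limit=10)
    return jsonify([hit.fields() for hit in hits])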
def search(self, q: str, tool_name_boost: CanConvertToFloat, tool_id_boost: CanConvertToFloat,
           tool_section_boost: CanConvertToFloat, tool_description_boost: CanConvertToFloat,
           tool_label_boost: CanConvertToFloat, tool_stub_boost: CanConvertToFloat,
           tool_help_boost: CanConvertToFloat, tool_search_limit: CanConvertToFloat,
           tool_enable_ngram_search: bool, tool_ngram_minsize: CanConvertToInt,
           tool_ngram_maxsize: CanConvertToInt) -> List[str]:
    """
    Perform a search on the in-memory index, weighting in the given boosts.
    """
    # Change field boosts for the searcher
    self.searcher = self.index.searcher(weighting=MultiWeighting(
        BM25F(),
        old_id=BM25F(old_id_B=float(tool_id_boost)),
        name=BM25F(name_B=float(tool_name_boost)),
        section=BM25F(section_B=float(tool_section_boost)),
        description=BM25F(description_B=float(tool_description_boost)),
        labels=BM25F(labels_B=float(tool_label_boost)),
        stub=BM25F(stub_B=float(tool_stub_boost)),
        help=BM25F(help_B=float(tool_help_boost))))
    # Use OrGroup to change the default operation for joining multiple terms to logical OR,
    # so that e.g. for the search 'bowtie of king arthur' a document containing only 'bowtie'
    # is still a match.
    # https://whoosh.readthedocs.io/en/latest/api/qparser.html#whoosh.qparser.MultifieldPlugin
    # However, this changes scoring: for 'bowtie of king arthur', a document with
    # 'arthur arthur arthur' would outscore one with 'bowtie arthur', which is usually
    # unexpected for a user. Hence we introduce a bonus on multi-hits via the 'factory()'
    # method, using a scaling factor between 0 and 1.
    # https://whoosh.readthedocs.io/en/latest/parsing.html#searching-for-any-terms-instead-of-all-terms-by-default
    og = OrGroup.factory(0.9)
    self.parser = MultifieldParser(
        ['name', 'old_id', 'description', 'section', 'help', 'labels', 'stub'],
        schema=self.schema, group=og)
    cleaned_query = q.lower()
    if tool_enable_ngram_search is True:
        return self._search_ngrams(cleaned_query, tool_ngram_minsize,
                                   tool_ngram_maxsize, tool_search_limit)
    else:
        cleaned_query = ' '.join(token.text for token in self.rex(cleaned_query))
        # Use the asterisk Whoosh wildcard so e.g. 'bow' easily matches 'bowtie'
        parsed_query = self.parser.parse(f"*{cleaned_query}*")
        hits = self.searcher.search(parsed_query, limit=float(tool_search_limit), sortedby='')
        return [hit['id'] for hit in hits]
def search(self, q, tool_name_boost, tool_section_boost, tool_description_boost,
           tool_label_boost, tool_stub_boost, tool_help_boost, tool_search_limit,
           tool_enable_ngram_search, tool_ngram_minsize, tool_ngram_maxsize):
    """
    Perform a search on the in-memory index, weighting in the given boosts.
    """
    # Change field boosts for the searcher; BM25F takes field-specific B values
    # as fieldname_B keyword arguments, not as a dict
    self.searcher = self.index.searcher(weighting=BM25F(
        name_B=float(tool_name_boost),
        section_B=float(tool_section_boost),
        description_B=float(tool_description_boost),
        labels_B=float(tool_label_boost),
        stub_B=float(tool_stub_boost),
        help_B=float(tool_help_boost)))
    # Use OrGroup to change the default operation for joining multiple terms to logical OR,
    # so that e.g. for the search 'bowtie of king arthur' a document containing only 'bowtie'
    # is still a match.
    # https://whoosh.readthedocs.io/en/latest/api/qparser.html#whoosh.qparser.MultifieldPlugin
    # However, this changes scoring: for 'bowtie of king arthur', a document with
    # 'arthur arthur arthur' would outscore one with 'bowtie arthur', which is usually
    # unexpected for a user. Hence we introduce a bonus on multi-hits via the 'factory()'
    # method, using a scaling factor between 0 and 1.
    # https://whoosh.readthedocs.io/en/latest/parsing.html#searching-for-any-terms-instead-of-all-terms-by-default
    og = OrGroup.factory(0.9)
    self.parser = MultifieldParser(
        ['name', 'description', 'section', 'help', 'labels', 'stub'],
        schema=self.schema, group=og)
    cleaned_query = q.lower()
    # Re-tokenize away hyphens, since they are wildcards in Whoosh causing false positives
    if cleaned_query.find('-') != -1:
        cleaned_query = ' '.join(token.text for token in self.rex(to_unicode(cleaned_query)))
    if tool_enable_ngram_search is True:
        return self._search_ngrams(cleaned_query, tool_ngram_minsize,
                                   tool_ngram_maxsize, tool_search_limit)
    else:
        # Use the asterisk Whoosh wildcard so e.g. 'bow' easily matches 'bowtie'
        parsed_query = self.parser.parse(cleaned_query + '*')
        hits = self.searcher.search(parsed_query, limit=float(tool_search_limit), sortedby='')
        return [hit['id'] for hit in hits]
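# A self-contained demonstration of the OrGroup.factory(0.9) bonus described in
# the comments above, using a throwaway in-memory index (the schema, field names,
# and documents are illustrative only). With plain OrGroup, the document that
# repeats 'arthur' three times tends to outscore the one matching two distinct
# query terms; factory(0.9) rewards matching more distinct terms instead:
from whoosh.fields import ID, TEXT, Schema
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import OrGroup, QueryParser

schema = Schema(docID=ID(stored=True), content=TEXT)
ix = RamStorage().create_index(schema)
writer = ix.writer()
writer.add_document(docID=u"1", content=u"bowtie arthur")
writer.add_document(docID=u"2", content=u"arthur arthur arthur")
writer.commit()

parser = QueryParser("content", schema=ix.schema, group=OrGroup.factory(0.9))
query = parser.parse(u"bowtie of king arthur")
with ix.searcher() as searcher:
    for hit in searcher.search(query):
        print(hit["docID"], hit.score)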
def getRandom15SearchResult(query_dict):
    """
    Get search results for 15 randomly sampled queries, to check the variance,
    maximum, and minimum of BPREF.
    Queries 8, 50, and 59 are excluded because each of them is related to only
    one document.
    :param query_dict: standard query_dict
    :return: result dictionary containing the search results for each sampled query
    """
    result_dict = {}
    tot_result_dict = {}
    ix = index.open_dir("index")
    with ix.searcher(weighting=scoring.ScoringFunction()) as searcher:
        parser = QueryParser("contents", schema=ix.schema, group=OrGroup.factory(0.4))

        from whoosh.lang.porter2 import stem as ps2
        query_dict = {k: split_stem(stem_fn=ps2, value=v) for k, v in query_dict.items()}

        import random
        r = [i for i in range(1, 94) if i not in [8, 50, 59]]
        rand_15 = random.sample(r, 15)

        for qid in rand_15:
            q = query_dict[qid]
            query = parser.parse(q)
            results = searcher.search(query, limit=None)
            result_dict[qid] = [result.fields()['docID'] for result in results]

            # Expand the query with additional terms returned by parse_document
            query_expansion = parse_document(result_dict[qid], 3, 2)
            expand_set = set([ps2(word[0]) for word in query_expansion]).union(
                set([word[0] for word in query_expansion]))
            q_set = set(q.split())
            unique_set = expand_set - q_set
            q += " " + " ".join(unique_set)

            tot_query = parser.parse(q)
            tot_results = searcher.search(tot_query, limit=None)
            tot_result_dict[qid] = [result.fields()['docID'] for result in tot_results]

    print("Random 15 queries: " + ",".join(str(x) for x in rand_15))
    return tot_result_dict
def search_index(query, score_func_name, dirname):
    ix = index.open_dir(dirname, schema=get_schema())
    og = OrGroup.factory(0.9)
    qp = QueryParser("content", schema=get_schema(), group=og)
    # qp.add_plugin(FuzzyTermPlugin())
    # query = ' '.join([(x + '~' if len(x) > 5 else x) for x in query.split(' ')])
    q = qp.parse(query)

    # Map the score function name to a weighting model (defaults to OkBM25)
    score_func = OkBM25()
    if score_func_name == 'ok':
        score_func = OkBM25()
    elif score_func_name == 'bm25f':
        score_func = BM25F()
    elif score_func_name == 'pln':
        score_func = PLN()
    elif score_func_name == 'tfidf':
        score_func = TF_IDF()
    elif score_func_name == 'freq':
        score_func = Frequency()

    searcher = ix.searcher(weighting=score_func)
    results = searcher.search(q, limit=None)
    results.fragmenter.surround = 100
    return results
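# A hypothetical call to search_index; it assumes an index built with get_schema()
# already exists under ./index, and prints each hit's stored fields and score:
hits = search_index("whoosh scoring", "bm25f", "index")
for hit in hits:
    print(hit.fields(), hit.score)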
"""Set up app config.""" import os from pathlib import Path from whoosh.index import open_dir from whoosh.qparser import MultifieldParser, OrGroup WHOOSH_INDEX_DIR = Path(os.getenv("WHOOSH_INDEX_DIR", "whoosh_index")) SEARCH_INDEX = open_dir(WHOOSH_INDEX_DIR) QUERY_PARSER = MultifieldParser(["name", "artist_name"], SEARCH_INDEX.schema, group=OrGroup.factory(0.9))
# Method 1: use a FileStorage object
from whoosh.filedb.filestore import FileStorage

storage = FileStorage('index')  # 'index' is the path to the index directory
idx1 = storage.open_index(indexname='idx1')

# Method 2: use the open_dir function
from whoosh import index
from whoosh.index import open_dir

idx2 = open_dir('index', indexname='idx2')  # indexname is the name of the index
print(index.exists_in('index', indexname='idx2'))

from whoosh.qparser import FieldsPlugin, MultifieldParser, OrGroup, QueryParser

og = OrGroup.factory(0.9)
qp = QueryParser("content", schema=idx1.schema)  # group=OrGroup
qp.remove_plugin_class(FieldsPlugin)
q = qp.parse("reset")
print(q)

# mqp = MultifieldParser(["title", "content"], schema=idx1.schema)
# mq = mqp.parse(u"many only")
#
# from whoosh.query import *
# myquery = And([Term("title", u"third"), q])
# # myquery = Term("title", u"ird")
# print(myquery)

searcher = idx1.searcher()
r = searcher.search(q=q, limit=None)
print(len(r))
for hit in r:
    print(hit)