    print(pre, recall, f1)
    return (pre, recall, f1)


import os
import torch
from whoosh import scoring
from whoosh.index import open_dir
from whoosh.fields import Schema, TEXT
from whoosh.analysis import StemmingAnalyzer

if __name__ == '__main__':
    Score = "bm25"  # bm25, tfidf, tf
    Pivots_N = 10  # number of plausible English mentions
    Search_N = 500  # number of searched entities for each plausible English mention
    InputIndexDir = "data_process/DBIndex2"
    input_data_file = "Release/output_toy_de.json"
    output_data_file = "Release/output_toy_de_search.json"
    # ------------------------------------------------------
    if Score == "bm25":
        myscore = scoring.BM25F()
    elif Score == "tfidf":
        myscore = scoring.TF_IDF()
    elif Score == "tf":
        myscore = scoring.Frequency()
    elif Score == "multi":
        myscore = scoring.MultiWeighting(scoring.BM25F(),
                                         id=scoring.Frequency(),
                                         keys=scoring.TF_IDF())
    else:
        myscore = scoring.BM25F()
    # ---------------Input Query----------------------
    schema = Schema(title=TEXT(stored=True, analyzer=StemmingAnalyzer()),
                    content=TEXT(stored=True))
    All_Result = []
    ix = open_dir(InputIndexDir)
    sf = torch.nn.Softmax(dim=0)
    alldata = read_json(input_data_file)
    with ix.searcher(weighting=myscore) as searcher:
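# Hedged, standalone sketch (not from the original script) of the
# Score -> weighting dispatch above, run against a throwaway in-memory index.
# The schema mirrors the one above, but the documents and the query are
# invented purely for illustration.
from whoosh import scoring
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

demo_schema = Schema(title=TEXT(stored=True, analyzer=StemmingAnalyzer()),
                     content=TEXT(stored=True))
demo_ix = RamStorage().create_index(demo_schema)
writer = demo_ix.writer()
writer.add_document(title=u"Berlin", content=u"capital city of Germany")
writer.add_document(title=u"Germany", content=u"country in central Europe")
writer.commit()

demo_weightings = [
    ("bm25", scoring.BM25F()),
    ("tfidf", scoring.TF_IDF()),
    ("tf", scoring.Frequency()),
    # Per-field weighting, analogous to the "multi" branch above
    ("multi", scoring.MultiWeighting(scoring.BM25F(), title=scoring.Frequency())),
]
for name, weighting in demo_weightings:
    with demo_ix.searcher(weighting=weighting) as demo_searcher:
        q = QueryParser("content", demo_ix.schema).parse(u"germany")
        hits = demo_searcher.search(q, limit=10)
        print(name, [(hit["title"], round(hit.score, 3)) for hit in hits])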
def src(indexdir, quer, dsc=True, flds="2", lim=100, w="bm", opt=[], lo="o", resdir="", run=""):
    """
    This function retrieves results for the queries contained in the file
    indicated by quer from the index indicated by indexdir, using objects
    and functions of the whoosh module.

    Parameters
    ----------
    indexdir : string
        Path of the directory containing the index to search.
    quer : string
        Path of the file containing the queries to use.
    dsc : bool
        Selects which query field to use, title or description; by default
        the description is used.
    flds : string
        Number of document fields to use; may only take the values "1", "2" or "3".
    lim : int
        Maximum number of documents retrieved per query; must be a positive number.
    w : string
        Weighting scheme; may only take the values "bm" or "tf". "bm" stands
        for BM25 and "tf" for TF_IDF (see whoosh.scoring for more information).
    lo : string
        Logical operator applied to the query words; may take the values "o"
        for OR or "a" for AND.
    opt : list
        Should contain two numeric values to assign to the BM25 parameters.
    resdir : string
        Path of the directory where the results file is saved. If it is the
        empty string, as by default, the results are printed with print.
    run : string
        Optional string; it is appended to the tag and to the name of the
        results files.

    Returns
    -------
    None

    Notes
    -----
    This function was written to search an index of the already mentioned
    ohsumed collection; it is not guaranteed to work for others. In
    particular, the query file must be organized in the same way as the file
    containing the experimental queries for the ohsumed collection (which
    should be in the same directory as this program), and the documents of
    the index should have the fields 'identifier', 'title', 'abstract' and
    'terms'.
""" fst = FileStorage(indexdir) ix = fst.open_index() # Creo il runtag utilizzando tutti i parametri che si possono cambiare tag = run + "_BATCH_DESC" + str(dsc)[0] + "_" + flds + "C_GRP" + lo.upper( ) + "_" + w.upper() + "_" + str(lim) + "RES" # ------------------------------------------------------------------------------------------------ # # Interpreta la scelta di quale operatore logico si usa per raggruppare le parole delle query if lo == "o": lgroup = qparser.OrGroup elif lo == "a": lgroup = qparser.AndGroup # ------------------------------------------------------------------------------------------------ # # Interpreta la scelta dello schema di peastura if w == "tf": score = scoring.TF_IDF() elif w == "bm": if opt: # opt dovrebbe contenere il punto che ottimizza un valore(come MAP) per i due parametri score = scoring.BM25F(opt[0], opt[1]) else: score = scoring.BM25F() # ------------------------------------------------------------------------------------------------ # # Interpeta il numero di campi dei documenti da utilizzare if flds == "1": campi = "title" parser = qp elif flds == "2": campi = ["title", "abstract"] parser = mp elif flds == "3": campi = ["title", "abstract", "terms"] parser = mp # ----------------------------------------------------------------------------------------------- # #--- apertura del file delle query ---# infile = open(quer, 'r') #--- lettura del file text = infile.read() #--- dom delle query dom = parseString(text) #--- estrazione dei dati della query title = gettagdata(dom, 'title') # Utilizzare il campo title delle query if dsc == True: title = gettagdata(dom, 'desc') # Utilizzare il campo desc delle query # Togliere i commenti dalle righe successive e commentare la riga prcedente per usare entrambi #desc = gettagdata(dom,'desc') #for x in range(len(title)-1): # title[x]=title[x]+" "+desc[x] num = gettagdata(dom, 'num') infile.close() # ------------------------------------------------------------------------------------------------- # # Apre il file dove inserire i risultati se esiste if resdir and os.path.exists(resdir): resfile = open( resdir + "/" + run + "_" + flds + "C" + ".treceval", 'w' ) # Se si cambiano piu' parametri e' consigliato usare la variabile tag al posto di run+"_"+flds+"C" per non rischiare di sovrascrivere risultati print "File dei risultati " + run + "_" + flds + "C" + ".treceval" else: print resdir, "does not exist" resdir = None # Effettua la ricerca per ogni query for qid in num[:]: title[int(qid) - 1].encode('utf-8') query = parser(campi, ix.schema, group=lgroup).parse(title[int(qid) - 1]) new_query = parser(campi, ix.schema, group=lgroup).parse( expq_cor(ix, query) ) # Corregge la query se le parole hanno una lettera sbagliata #print new_query results = ix.searcher(weighting=score).search( new_query, limit=lim) # Effettua la ricerca effettiva if results: if not resdir: # Stampa i risultati in console res(results, qid, lim, tag) else: # Stampa i risultati su file print "sta stampando i risultati della query " + qid + " su file" res(results, qid, lim, tag, resfile) else: print "non ha trovato risultati" resfile.close() ix.searcher().close() return None
from whoosh.qparser import QueryParser
from whoosh import scoring
from whoosh.index import open_dir
import sys

ix = open_dir("index")
print("==========SUMMARY SEARCH ENGINE==========")
query_str = input("Search: ")
print("\nResults: ")
with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
    query = QueryParser("synopsis", schema=ix.schema).parse(query_str)
    results = searcher.search(query, limit=1000)
    last = len(results)
    lastPage = last / 2 / 10.0
    if (lastPage % 1 != 0):  # round a partial page up to a whole page
        lastPage = int(lastPage) + 1
    else:
        lastPage = int(lastPage)
    pageNum = 1

    def openRes(results, choose, pageNum):
        print("Title:", results[choose * 2 - 2]['title'])
        print("Author:", results[choose * 2 - 2]['author'])
        print("Date:", results[choose * 2 - 2]['date'])
        print("Genre:", results[choose * 2 - 2]['genre'])
        print("Summary:\n", results[choose * 2 - 2]['synopsis'])
        action = input("Return to search? (y/n):")
        if (action == 'y' or action == 'Y'):
    def search(self,
               given_query='',  # search function
               in_query=[''],
               ex_query=[''],
               diets=[],
               allergies=[],
               page=1,
               ranking="BM25"):
        # These keys are only for parsing, not for filling the results
        keys = ['name', 'ingredients', 'cautions', 'dietLabels', 'healthLabels']
        try:
            # Open the index
            index = open_dir('WhooshIndex')
        except Exception:
            self.index()  # build the index if it doesn't exist
            index = open_dir('WhooshIndex')
        if ranking == "TF-IDF":  # set the ranking algorithm
            ranking = scoring.TF_IDF()
        else:
            ranking = scoring.BM25F()
        with index.searcher(weighting=ranking) as searcher:
            # Match all documents in case of None, because in the
            # intersection the smaller result will be returned
            parser = QueryParser('url', schema=index.schema)
            q = parser.parse('http OR https')
            all_docs = searcher.search(q, limit=None)
            # Create an empty result for a filter and a mask
            p = QueryParser('id', schema=index.schema)
            q = p.parse('')
            myMask = searcher.search(q, limit=None)
            myFilter = searcher.search(q, limit=None)
            # Include-query parsing
            if in_query != ['']:
                in_parser = QueryParser('ingredients', schema=index.schema)
                inFilter = searcher.search(q, limit=None)
                in_q = in_parser.parse(in_query[0])  # get the first ingredient...
                in_r = searcher.search(in_q, limit=None)
                inFilter.extend(in_r)
                for q in in_query:
                    # Take the intersection of the remaining docs with the
                    # docs containing the next ingredient
                    in_q = in_parser.parse(q)
                    in_r = searcher.search(in_q, limit=None)
                    inFilter.filter(in_r)
                myFilter.extend(inFilter)
            # Exclude-query parsing
            if ex_query != ['']:
                ex_parser = QueryParser('ingredients', schema=index.schema)
                for q in ex_query:
                    ex_q = ex_parser.parse(q)
                    ex_r = searcher.search(ex_q, limit=None)
                    myMask.extend(ex_r)  # list of docs to mask
            # Allergies query parsing
            if allergies != []:
                allergy_parser = QueryParser('cautions', schema=index.schema)
                for q in allergies:
                    allergy_q = allergy_parser.parse(q)
                    allergy_r = searcher.search(allergy_q, limit=None)
                    myMask.extend(allergy_r)  # list of docs to mask
            # Diets query parsing
            if diets != []:
                p = QueryParser('id', schema=index.schema)
                q = p.parse('')
                dietFilter = searcher.search(q, limit=None)
                diet_parser = QueryParser('dietInfo', schema=index.schema)
                diet_q = diet_parser.parse(diets[0])
                diet_r = searcher.search(diet_q, limit=None)  # get the first diet
                dietFilter.extend(diet_r)
                for d in diets:
                    # Take the intersection of what is already in the filter
                    # and the new docs to filter by
                    diet_q = diet_parser.parse(d)
                    diet_r = searcher.search(diet_q, limit=None)
                    dietFilter.filter(diet_r)
                if in_query == ['']:
                    # If there was no ingredients filter, let the filter be
                    # the diet filter
                    myFilter.extend(dietFilter)
                else:
                    # Otherwise the filter is the intersection of the two filters
                    myFilter.filter(dietFilter)
            # Filtering results to get the intersection.
            # Check whether the filter is empty so we don't intersect nothing
            if diets == [] and in_query == ['']:
                myFilter = all_docs
            elif myFilter.scored_length() == 0:
                # If we filtered and got nothing, we should return nothing
                payload = {}
                payload_entries = list()
                payload['entries'] = payload_entries
                payload['total'] = 0
                return payload
            if given_query != '' and given_query is not None:  # the actual search
                if given_query[0] == '"' and given_query[-1] == '"':
                    given_query = given_query[1:-1]
                    parser = MultifieldParser(keys, schema=index.schema)
                else:
                    parser = MultifieldParser(keys, schema=index.schema, group=OrGroup)
                query = parser.parse(given_query)
                results = searcher.search_page(query, page, filter=myFilter, mask=myMask)
            else:
                # If we aren't given a query for the search, filter and mask all docs
                parser = QueryParser('url', schema=index.schema)
                q = parser.parse('http OR https')
                results = searcher.search_page(q, page, filter=myFilter, mask=myMask)
            # Format the results for returning
            payload = {}
            payload_entries = list()
            for x in results:
                payload_entries.append({
                    'name': x['name'],
                    'image': x['image'],
                    'id': x['id']
                })
            payload['entries'] = payload_entries
            payload['total'] = len(results)
            return payload
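# Hedged usage sketch for search() above. The owning class (RecipeIndex here)
# and its constructor are illustrative assumptions; the argument names and the
# payload shape come from the method itself:
#
#   engine = RecipeIndex()
#   payload = engine.search(given_query='pasta',
#                           in_query=['tomato', 'basil'],
#                           ex_query=['peanut'],
#                           diets=['vegan'],
#                           allergies=['gluten'],
#                           page=1,
#                           ranking='TF-IDF')
#   for entry in payload['entries']:
#       print(entry['id'], entry['name'])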
def searcher(queryString, p):
    ls = []
    schema = Schema(id=ID(stored=True),
                    img=TEXT(stored=True),
                    title=TEXT(stored=True),
                    h1=TEXT(analyzer=StemmingAnalyzer(), stored=True),
                    content=TEXT(analyzer=StemmingAnalyzer(), stored=True))
    if p == "1":
        print("TF_IDF")
        with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
            qparserObject = qparser.MultifieldParser(
                ["id", "h1", "title", "content"], ix.schema)
            query = qparserObject.parse(queryString)
            results = searcher.search(query, limit=10)
            corrected = searcher.correct_query(query, queryString)
            if corrected.query != query:
                print("Did you mean: " + corrected.string)
                print("Showing results for " + corrected.string)
                newQuery = qparserObject.parse(corrected.string)
                correctedResults = searcher.search(newQuery, terms=True)
                l = len(correctedResults)
                if len(correctedResults) != 0:
                    for hit in correctedResults:
                        a = {}
                        print(hit["title"])
                        a["title"] = hit["title"]
                        a["imagetag"] = hit["img"]
                        a["file"] = "../final_database/" + hit["id"] + ".html"
                        a["data"] = hit.highlights("content", top=5)
                        ls.append(a)
                    print("hello")
                    return render_template('result.html', entries=ls, num=l,
                                           query=queryString, sim=0,
                                           correct=corrected.string)
                else:
                    return render_template('result.html', correct=queryString, num=0)
            else:
                results = searcher.search(query)
                l = len(results)
                if len(results) != 0:
                    for hit in results:
                        a = {}
                        a["title"] = hit["title"]
                        a["imagetag"] = hit["img"]
                        a["file"] = "../final_database/" + hit["id"] + ".html"
                        a["data"] = hit.highlights("content", top=5)
                        ls.append(a)
                    return render_template('result.html', entries=ls, num=l,
                                           query=queryString, sim=2,
                                           correct=corrected.string)
                else:
                    return render_template('result.html', correct=queryString, num=0)
    else:
        with ix.searcher() as searcher:
            qparserObject = qparser.MultifieldParser(["id", "h1", "content"],
                                                     ix.schema)
            query = qparserObject.parse(queryString)
            results = searcher.search(query, limit=None)
            corrected = searcher.correct_query(query, queryString)
            if corrected.query != query:
                print("Did you mean: " + corrected.string)
                print("Showing results for " + corrected.string)
                newQuery = qparserObject.parse(corrected.string)
                correctedResults = searcher.search(newQuery, terms=True)
                l = len(correctedResults)
                if len(correctedResults) != 0:
                    for hit in correctedResults:
                        print(hit["title"])
                        a = {}
                        a["title"] = hit["title"]
                        a["imagetag"] = hit["img"]
                        a["file"] = "../final_database/" + hit["id"] + ".html"
                        a["data"] = hit.highlights("content", top=5)
                        ls.append(a)
                    print("hello")
                    return render_template('result.html', entries=ls, num=l,
                                           query=queryString, sim=0,
                                           correct=corrected.string)
                else:
                    return render_template('result.html', correct=queryString, num=0)
            else:
                results = searcher.search(query)
                l = len(results)
                if len(results) != 0:
                    for hit in results:
                        a = {}
                        a["title"] = hit["title"]
                        a["imagetag"] = hit["img"]
                        a["file"] = "../final_database/" + hit["id"] + ".html"
                        a["data"] = hit.highlights("content", top=5)
                        ls.append(a)
                    return render_template('result.html', entries=ls, num=l,
                                           query=queryString, sim=3,
                                           correct=corrected.string)
                else:
                    return render_template('result.html', correct=queryString, num=0)
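# Hypothetical Flask wiring for searcher() above. The app object, route path
# and request parameter names are assumptions; only the render_template usage
# comes from the original code:
#
#   from flask import Flask, request, render_template
#
#   @app.route('/search')
#   def search_route():
#       # p == "1" selects the TF_IDF branch; anything else uses default BM25F
#       return searcher(request.args.get('q', ''), request.args.get('p', '0'))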
                                                  analyzer_names[i], schema)
    sw1_utils_schema.Fill_Empty_Schema(datasets[idx], directory_containing_the_index,
                                       datasets_len[idx])
    dir_idx_list.append(directory_containing_the_index)

###
### Open the Index
###

# For each of the datasets, open the index. For each of the scoring functions,
# create a searcher, process the queries and save the retrieved results into
# Q_Res. Compute the MRR value for each of the search-engine configurations.
Q_Res = []
config_names = []
# Scoring functions
sc_functions = [scoring.Frequency(), scoring.TF_IDF(), scoring.BM25F()]
sc_fun_name = ['Frequency', 'TF_IDF', 'BM25F']
for idx in range(len(datasets)):
    max_number_of_results = datasets_len[idx]
    print('Search Engine Configuration' + '\t\t\t' + 'MRR')
    if datasets[idx] == 'Cranfield_DATASET':
        Q_dict = Cran_Q
        GT_dict = Cran_GT
    for elem in dir_idx_list[:3]:
        directory_containing_the_index = elem
        ix = index.open_dir(directory_containing_the_index)
        ### Select a scoring function
        for s in range(len(sc_functions)):
            scoring_function = sc_functions[s]
            ### Create a searcher for the index with the selected scoring function
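# Hedged sketch of the MRR computation referred to in the comment above; the
# function name and data layout are assumptions (ranked_ids_per_query maps a
# query id to a ranked list of doc ids, GT_dict maps a query id to the set of
# relevant doc ids):
def mean_reciprocal_rank(ranked_ids_per_query, GT_dict):
    total = 0.0
    for qid, ranked_ids in ranked_ids_per_query.items():
        # Add the reciprocal rank of the first relevant document, 0 if none
        for rank, doc_id in enumerate(ranked_ids, start=1):
            if doc_id in GT_dict.get(qid, set()):
                total += 1.0 / rank
                break
    return total / max(len(ranked_ids_per_query), 1)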
else:
    basePath = "./datasets/NPL/"
indexDir = basePath + "index/"
docDir = basePath + "docs/"
benchmarkPath = basePath + "benchmark-data.json"

if arg["search"]:
    query = arg['<query>']
    spellCheck = not arg["--no-spell-check"]
    batch = arg["--batch"]
    limit = parseLimit(arg["--limit"])
    wildcard = not arg["--no-wildcard"]
    if arg['--tf-idf']:
        modelScoring = scoring.TF_IDF()
    elif arg['--freq']:
        modelScoring = scoring.Frequency()
    else:
        modelScoring = scoring.BM25F()
    if spellCheck:
        fixed = correct(query)
        if fixed != query:
            if batch or confirm("Did you mean: `" + fixed + "`?"):
                query = fixed
    print("Searching for `" + query + "`")
    index = openIndex(indexDir)
    r = search(query, index, modelScoring, limit, wildcard)
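# For reference, a docopt-style arg dict that would drive the branch above
# might look like this (keys are inferred from the lookups in the code; the
# actual usage string is not shown in this snippet):
#
#   arg = {"search": True, "<query>": "moon landing",
#          "--no-spell-check": False, "--batch": False, "--limit": "10",
#          "--no-wildcard": False, "--tf-idf": True, "--freq": False}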
    def __init__(self):
        self._ix = indexer.im.get_index(song_index=True)
        self._searcher = self._ix.searcher(weighting=scoring.TF_IDF())
    def refresh_searcher(self):
        self._ix = indexer.im.get_index()
        self._searcher = self._ix.searcher(weighting=scoring.TF_IDF())
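# Sketch of how these cached-searcher methods might be used by a caller; the
# query() method and the "title" field are illustrative assumptions, not part
# of the original class:
#
#   def query(self, text, limit=10):
#       parsed = QueryParser("title", self._ix.schema).parse(text)
#       return self._searcher.search(parsed, limit=limit)
#
# A whoosh searcher is a snapshot of the index, so refresh_searcher() must be
# called after the index is updated for new documents to become visible.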
def main():
    args = parse_args()
    query = args.query
    number = args.number
    rank_func = args.rank_func
    index_loc = args.index_loc
    B = args.B
    weight_B = args.weight_B
    K1 = args.K1
    if query is None:
        query_list = read_query()
    else:
        temp_str = ' '
        query = temp_str.join(query)
        query_list = [query]
    if index_loc is None:
        index_loc = 'index'
    if weight_B is not None:
        rank_func = 1
    if rank_func == 1:
        B1, B2, B3, B4, B5 = get_B(weight_B)
        weighting = scoring.BM25F(B=B, K1=K1, title_B=B1, body_B=B2,
                                  category_B=B3, date_B=B4, rating_B=B5)
        rank_name = 'bm25f'
    elif rank_func == 2:
        weighting = scoring.TF_IDF()
        rank_name = 'tf-idf'
    elif rank_func == 3:
        weighting = scoring.Frequency()
        rank_name = 'frequency'
    else:
        weighting = scoring.BM25F(B=B, K1=K1)
        rank_name = 'bm25'
    ix = open_dir(index_loc)
    with ix.searcher(weighting=weighting) as searcher:
        # parser = QueryParser(schema=ix.schema)
        parser = MultifieldParser(['title', 'body', 'category', 'date', 'rating'],
                                  schema=ix.schema)
        for this_query in query_list:
            que = parser.parse(this_query)
            print('\n')
            print('--', this_query)
            results = searcher.search(que, limit=number)
            if len(results) == 0:
                print(' ')
                print('no matched result. please try again.')
            else:
                for hit in results:
                    print(' ')
                    print('#', hit.rank, rank_name, 'score:', round(hit.score, 10))
                    print('title:', hit['title'])
                    print('imdb:', hit['imdbid'], 'date:', hit['date'],
                          'rating:', hit['rating'], 'category:', hit['category'])
                    print('body:', hit['body'])
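# The module presumably invokes main() when executed directly; this standard
# entry-point guard is assumed, as it is not shown in the snippet above:
if __name__ == '__main__':
    main()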