def query_processor(self, query):
    """
    Retrieve candidate passages with Anserini and pass them to the processor.

    Anserini (through pyserini) is used for search as a lightweight
    replacement for Elasticsearch.

    :param query: query from user
    :return: whatever ``self.processor(query, documents)`` returns
    """
    # Retrieve set of candidate documents
    searcher = pysearch.SimpleSearcher(self.index)
    results = searcher.search(query, self.number_docs)

    documents = []
    for res in results:
        did = res.docid
        title = res.lucene_document.get("title")
        # One entry for the stored abstract field ...
        documents.append({
            "id": did,
            "title": title,
            "text": res.lucene_document.get("abstract"),
        })
        # ... and one entry per remaining line of the hit's contents.  The
        # first two lines are skipped (presumably they repeat the title and
        # abstract -- TODO confirm against the indexer).
        for para in res.contents.split("\n")[2:]:
            documents.append({"id": did, "title": title, "text": para})

    return self.processor(query, documents)
def evaluate1(examples_df, args):
    """Print document-level retrieval recall over ``examples_df``.

    For every row, the row's ``"query"`` is searched in the Lucene index at
    ``<args.input_dir>/lucene-index-nq-doc`` with ``k=args.k1``.  The example
    counts as correct when any returned hit maps, through
    ``index_table_doc.json``, to the row's ``"doc_id"``.

    Args:
        examples_df: DataFrame with ``"query"`` and ``"doc_id"`` columns.
        args: namespace providing ``input_dir`` and ``k1``.

    Returns:
        None.  The example count and the recall are printed.
    """
    with open(args.input_dir + '/index_table_doc.json') as f:
        index_table = json.load(f)
    k1 = args.k1
    searcher = pysearch.SimpleSearcher(args.input_dir + '/lucene-index-nq-doc')

    print("Begin evaluation 1")
    correct = 0
    num_of_examples = 0
    for _, example in examples_df.iterrows():
        num_of_examples += 1
        question = example["query"]
        actual_doc_id = example["doc_id"]

        # Search using anserini
        hits = searcher.search(question, k=k1)
        # Iterate over the hits actually returned: a query can yield fewer
        # than k1 results, so indexing hits[0..k1-1] would raise IndexError.
        if any(index_table[hit.docid]["doc_id"] == actual_doc_id
               for hit in hits):
            correct += 1

    print("Total number of examples : ", num_of_examples)
    # Avoid ZeroDivisionError when the frame has no rows.
    recall = correct / num_of_examples if num_of_examples else 0.0
    print("Proportion of correct retrieval : %f" % (recall))
def query(self, query, b, k1):
    """Run a BM25 search and return the hit scores keyed by document id.

    Args:
        query: query string handed to the searcher.
        b: BM25 ``b`` parameter.
        k1: BM25 ``k1`` parameter.

    Returns:
        OrderedDict mapping ``hit.docid`` to ``hit.score`` in the order the
        searcher returned the hits.
    """
    # create_index() is invoked before every search (presumably a no-op when
    # the index already exists -- verify against the index module).
    self["index"].create_index()
    index_dir = self["index"].get_index_path().as_posix()

    bm25 = pysearch.SimpleSearcher(index_dir)
    bm25.set_bm25_similarity(k1, b)

    scored = OrderedDict()
    for hit in bm25.search(query):
        scored[hit.docid] = hit.score
    return scored
def query(self, query):
    """Run a Dirichlet-smoothed language-model search.

    The smoothing parameter is read from ``self.cfg["mu"]``.

    Args:
        query: query string handed to the searcher.

    Returns:
        OrderedDict mapping ``hit.docid`` to ``hit.score`` in the order the
        searcher returned the hits.
    """
    # create_index() is invoked before every search (presumably a no-op when
    # the index already exists -- verify against the index module).
    self["index"].create_index()
    index_dir = self["index"].get_index_path().as_posix()

    lm_searcher = pysearch.SimpleSearcher(index_dir)
    lm_searcher.set_lm_dirichlet_similarity(self.cfg["mu"])

    scored = OrderedDict()
    for hit in lm_searcher.search(query):
        scored[hit.docid] = hit.score
    return scored
def __init__(self, index_location, k=1000, wmodel="BM25", **kwargs):
    """Open an Anserini index and configure its similarity.

    Args:
        index_location: path of the Anserini (Lucene) index.
        k: stored as ``self.k``; defaults to 1000.
        wmodel: name of the weighting model, forwarded to
            ``self._setsimilarty``; defaults to ``"BM25"``.
        **kwargs: forwarded to the parent constructor as a single dict.
    """
    # NOTE: the comma before **kwargs was missing, which made the default
    # value the expression `"BM25" ** kwargs` and left `kwargs` undefined.
    super().__init__(kwargs)
    self.index_location = index_location
    self.k = k
    # Anserini is initialised before pyserini is imported; the import is kept
    # local so that ordering is preserved.
    init_anserini()
    from pyserini.search import pysearch
    self.searcher = pysearch.SimpleSearcher(index_location)
    self.wmodel = wmodel
    self._setsimilarty(wmodel)
def test_basic(self):
    """Check docid and score of the 1st and 10th hit on the CACM index."""
    index_path = '{}lucene-index.cacm'.format(self.index_dir)
    hits = pysearch.SimpleSearcher(index_path).search('information retrieval')

    # (position in the ranking, expected docid, expected score)
    expectations = (
        (0, 'CACM-3134', 4.76550),
        (9, 'CACM-2516', 4.21740),
    )
    for position, docid, score in expectations:
        hit = hits[position]
        self.assertEqual(hit.docid, docid)
        self.assertAlmostEqual(hit.score, score, places=5)
def query(self, query, b, k1, fbterms, fbdocs, ow):
    """Run a BM25 search with RM3 re-ranking.

    Args:
        query: query string handed to the searcher.
        b: BM25 ``b`` parameter.
        k1: BM25 ``k1`` parameter.
        fbterms: passed to the RM3 reranker as ``fb_terms``.
        fbdocs: passed to the RM3 reranker as ``fb_docs``.
        ow: passed to the RM3 reranker as ``original_query_weight``.

    Returns:
        OrderedDict mapping ``hit.docid`` to ``hit.score`` in the order the
        searcher returned the hits.
    """
    # create_index() is invoked before every search (presumably a no-op when
    # the index already exists -- verify against the index module).
    self["index"].create_index()
    index_dir = self["index"].get_index_path().as_posix()

    rm3_searcher = pysearch.SimpleSearcher(index_dir)
    rm3_searcher.set_bm25_similarity(k1, b)
    rm3_searcher.set_rm3_reranker(
        fb_terms=fbterms,
        fb_docs=fbdocs,
        original_query_weight=ow,
    )

    scored = OrderedDict()
    for hit in rm3_searcher.search(query):
        scored[hit.docid] = hit.score
    return scored
def searchDatabase(question, keywords=KEYWORDS, pysearch=pysearch,
                   lucene_database='lucene-index-covid-2020-03-27/',
                   BERTSQuAD_Model=model, displayTable=True,
                   displayHTML=False):
    """Search the Lucene database and answer ``question`` with BERT-SQuAD.

    Args:
        question: natural-language question.
        keywords: extra keywords appended to the question for retrieval.
        pysearch: module/object providing ``SimpleSearcher``.
        lucene_database: path of the Lucene index.
        BERTSQuAD_Model: model handed to ``searchAbstracts``.
        displayTable: forwarded to ``displayResults``.
        displayHTML: forwarded to ``displayResults``.

    Returns:
        Whatever ``displayResults`` returns.
    """
    ## search the lucene database with a combination of the question and the keywords
    searcher = pysearch.SimpleSearcher(lucene_database)
    hits = searcher.search(question + '. ' + keywords)

    ## collect the relevant data in a hit dictionary
    hit_dictionary = {}
    # Never index past the hits actually returned: a query may produce fewer
    # than N_HITS results.
    for i in range(min(N_HITS, len(hits))):
        hit = hits[i]
        doc_json = json.loads(hit.raw)
        lucene_doc = hit.lucene_document
        doc_json['title'] = lucene_doc.get("title")
        doc_json['authors'] = lucene_doc.get("authors")
        doc_json['doi'] = lucene_doc.get("doi")
        hit_dictionary[int(hit.docid)] = doc_json

    ## scrub the abstracts in prep for BERT-SQuAD
    for v in hit_dictionary.values():
        # The abstract value can be missing, an empty list, a list of
        # paragraph dicts (text under the 'text' key) or a plain string.
        abs_dirty = v.get('abstract')
        if isinstance(abs_dirty, list):
            paragraphs = [p['text'] for p in abs_dirty]
        elif isinstance(abs_dirty, str) and abs_dirty:
            paragraphs = [abs_dirty]
        else:
            paragraphs = []
        # Keep both the per-paragraph list and the full text, as either can
        # be valuable for BERT-derived QA.
        v['abstract_paragraphs'] = paragraphs
        v['abstract_full'] = ''.join(p + ' \n\n' for p in paragraphs)

    ## Search collected abstracts with BERT-SQuAD
    answers = searchAbstracts(hit_dictionary, BERTSQuAD_Model, question)

    ## display results in a nice format
    return displayResults(hit_dictionary, answers, question,
                          displayTable=displayTable, displayHTML=displayHTML)
def evaluate3(examples_df, args):
    """Print element-level recall constrained by document-level retrieval.

    For every row, the row's ``"query"`` is searched in both the document
    index (``k=args.k1``) and the element index (``k=args.k2``) under
    ``args.input_dir``.  The example counts as correct when some element hit
    belongs to a retrieved document, that document is the row's ``"doc_id"``
    and the element is the row's ``"context_id"``.

    Args:
        examples_df: DataFrame with ``"query"``, ``"context_id"`` and
            ``"doc_id"`` columns.
        args: namespace providing ``input_dir``, ``k1`` and ``k2``.

    Returns:
        None.  The example count and the recall are printed.
    """
    # TODO: This doesn't make sense !!!
    with open(args.input_dir + '/index_table_doc.json') as f:
        index_table_doc = json.load(f)
    with open(args.input_dir + '/index_table_ele.json') as f:
        index_table_ele = json.load(f)
    k1 = args.k1
    k2 = args.k2
    searcher1 = pysearch.SimpleSearcher(args.input_dir + '/lucene-index-nq-doc')
    searcher2 = pysearch.SimpleSearcher(args.input_dir + '/lucene-index-nq-ele')

    print("Begin evaluation 3")
    correct = 0
    num_of_examples = 0
    for _, example in examples_df.iterrows():
        num_of_examples += 1
        question = example["query"]
        actual_element_id = example["context_id"]
        actual_doc_id = example["doc_id"]

        doc_hits = searcher1.search(question, k=k1)
        ele_hits = searcher2.search(question, k=k2)

        # Iterate over the hits actually returned: either search can yield
        # fewer results than requested, so positional indexing up to k1/k2
        # would raise IndexError.
        doc_ids = [index_table_doc[hit.docid]["doc_id"] for hit in doc_hits]
        for hit in ele_hits:
            element = index_table_ele[hit.docid]
            if (element["doc_id"] in doc_ids
                    and element["doc_id"] == actual_doc_id
                    and element["element_id"] == actual_element_id):
                correct += 1
                break

    print("Total number of examples : ", num_of_examples)
    # Avoid ZeroDivisionError when the frame has no rows.
    recall = correct / num_of_examples if num_of_examples else 0.0
    print("Proportion of correct retrieval : %f" % (recall))
def setUp(self):
    """Download and unpack the pre-built CACM index, then open a searcher."""
    # Download pre-built CACM index; append a random value to avoid filename clashes.
    r = randint(0, 10000000)
    self.collection_url = 'https://github.com/castorini/anserini-data/raw/master/CACM/lucene-index.cacm.tar.gz'
    self.tarball_name = 'lucene-index.cacm-{}.tar.gz'.format(r)
    self.index_dir = 'index{}/'.format(r)

    urlretrieve(self.collection_url, self.tarball_name)

    # The context manager closes the archive even if extraction fails.
    with tarfile.open(self.tarball_name) as tarball:
        tarball.extractall(self.index_dir)

    self.searcher = pysearch.SimpleSearcher(f'{self.index_dir}lucene-index.cacm')
def build_searcher(self, index_path):
    """Create a BM25 searcher for ``index_path``, optionally with RM3.

    BM25 and RM3 parameters are read from the module-level ``settings``;
    RM3 is enabled only when ``settings.rm3`` is truthy.  Each configuration
    step is reported on stdout.

    Args:
        index_path: path of the Lucene index to open.

    Returns:
        The configured ``SimpleSearcher``.
    """
    bm25 = pysearch.SimpleSearcher(index_path)
    bm25.set_bm25_similarity(settings.bm25_k1, settings.bm25_b)
    print(f'Initializing BM25 {index_path}, '
          f'setting k1={settings.bm25_k1} and b={settings.bm25_b}')

    # Plain BM25: nothing more to configure.
    if not settings.rm3:
        return bm25

    bm25.set_rm3_reranker(settings.rm3_fb_terms,
                          settings.rm3_fb_docs,
                          settings.rm3_original_query_weight)
    print('Initializing RM3, setting '
          f'fbTerms={settings.rm3_fb_terms}, '
          f'fbDocs={settings.rm3_fb_docs} and '
          f'originalQueryWeight={settings.rm3_original_query_weight}')
    return bm25
def __init__(self):
    """Load the QA model/tokenizer and open the Lucene searcher.

    The BERT question-answering model and its tokenizer are loaded from
    ``BIOASQ_DIR``; the model is moved to ``torch_device`` and put in eval
    mode.  The BART summarisation model is loaded only when ``USE_SUMMARY``
    is true, and that flag is hard-coded to ``False`` here.
    """
    qa_model = BertForQuestionAnswering.from_pretrained(BIOASQ_DIR)
    self.QA_MODEL = qa_model
    self.QA_TOKENIZER = BertTokenizer.from_pretrained(BIOASQ_DIR)
    qa_model.to(torch_device)
    qa_model.eval()

    self.searcher = pysearch.SimpleSearcher(luceneDir)

    self.USE_SUMMARY = False
    if not self.USE_SUMMARY:
        return

    # Summarisation is opt-in: only load BART when the flag is set.
    self.SUMMARY_TOKENIZER = BartTokenizer.from_pretrained('bart-large-cnn')
    summary_model = BartForConditionalGeneration.from_pretrained(
        'bart-large-cnn')
    self.SUMMARY_MODEL = summary_model
    summary_model.to(torch_device)
    summary_model.eval()
def do_query(query_string, index, k1=1.2, b=0.75, n=100):
    """
    Does a bm25 search and returns the ids of the top ``n`` hits together
    with the corresponding documents fetched through ``index.getdoc``.

    Args:
        query_string: query handed to the searcher.
        index: object exposing ``index_path`` and ``getdoc(doc_id)``.
        k1: BM25 ``k1`` parameter.
        b: BM25 ``b`` parameter.
        n: number of hits requested (default 100).

    Returns:
        Tuple ``(doc_ids, docs)`` in ranking order.
    """
    # pyserini is imported while configure_classpath is replaced by a stub
    # that returns None.
    with patch("pyserini.setup.configure_classpath") as mock_setup:
        mock_setup.return_value = None
        from pyserini.search import pysearch

    bm25 = pysearch.SimpleSearcher(index.index_path)
    bm25.set_bm25_similarity(k1, b)

    doc_ids = [hit.docid for hit in bm25.search(query_string, n)]
    docs = list(map(index.getdoc, doc_ids))
    return doc_ids, docs
def __init__(self,
             index_path: Path,
             topic_path: Path,
             qrel_path: Path,
             columns: List[str],
             topic_reader: TopicReader,
             searcher_name: str = "bm25"):
    """Open the index and remember the topic/qrel locations.

    Args:
        index_path: location of the Lucene index (converted to ``str``).
        topic_path: stored as ``self.topic``.
        qrel_path: stored as ``self.qrel``.
        columns: stored as ``self.columns``.
        topic_reader: stored as ``self.topic_reader``.
        searcher_name: forwarded to ``self._set_searcher``; defaults to
            ``"bm25"``.
    """
    self.qrel, self.topic = qrel_path, topic_path

    # Both the Java index utilities and the searcher take a string path.
    index_dir = str(index_path)
    self.index_utils = JIndexUtils(JString(index_dir))
    self.searcher = pysearch.SimpleSearcher(index_dir)
    self._set_searcher(searcher_name)

    self.columns = columns
    self.topic_reader = topic_reader
def build_searcher(self, index_path):
    """Create ``self.searcher`` as a BM25 searcher, optionally with RM3.

    BM25 and RM3 parameters are read from the module-level ``settings``;
    RM3 is enabled only when ``settings.rm3`` is truthy.  Each configuration
    step is reported on stdout.

    Args:
        index_path: path of the Lucene index to open.
    """
    self.searcher = pysearch.SimpleSearcher(index_path)
    self.searcher.set_bm25_similarity(settings.bm25_k1, settings.bm25_b)
    print(f"Initializing BM25 {index_path}, "
          f"setting k1={settings.bm25_k1} and b={settings.bm25_b}")

    # Plain BM25: nothing more to configure.
    if not settings.rm3:
        return

    self.searcher.set_rm3_reranker(
        settings.rm3_fb_terms,
        settings.rm3_fb_docs,
        settings.rm3_original_query_weight,
    )
    print("Initializing RM3, setting "
          f"fbTerms={settings.rm3_fb_terms}, "
          f"fbDocs={settings.rm3_fb_docs} and "
          f"originalQueryWeight={settings.rm3_original_query_weight}")
def search(self):
    """Search engine.

    Retrieves up to 50 answer candidates for the query, re-ranks them with
    ``self.predict`` and prints the top-k answers.  The query comes from
    stdin when ``self.config['user_input']`` equals ``True``, otherwise from
    ``self.config['query']``.  The process exits when nothing is retrieved.
    """
    # Download model
    model_name = get_trained_model("finbert-qa")
    model_path = path + "/model/trained/finbert-qa/" + model_name
    # Load model
    self.model.load_state_dict(torch.load(model_path), strict=False)
    self.model.eval()

    retriever = pysearch.SimpleSearcher(fiqa_index)
    self.k = self.config['top_k']

    # Ask the user for a keyword query, or fall back to the configured one.
    self.query = (input("\nPlease enter your question: ")
                  if self.config['user_input'] == True
                  else self.config['query'])

    cands = [int(hit.docid) for hit in retriever.search(self.query, k=50)]
    if not cands:
        print("\nNo answers found.")
        sys.exit()

    print("\nRanking...\n")
    self.rank, self.scores = self.predict(self.model, self.query, cands)

    print("Question: \n\t{}\n".format(self.query))

    # Cannot show more answers than there are candidates.
    self.k = min(self.k, len(cands))

    print("Top-{} Answers: \n".format(self.k))
    for position in range(1, self.k + 1):
        answer = docid_to_text[self.rank[position - 1]]
        print("{}.\t{}\n".format(position, answer))
def query_batches_each_item(valid, n):
    """Retrieve the top ``n`` BM25 hits for every query in ``valid``.

    Args:
        valid: DataFrame with ``query_text`` and ``description_id`` columns.
        n: number of hits to keep per query (the original comment notes the
            usual value is 3).

    Returns:
        Tuple ``(submit, submit_score)`` of string arrays with shape
        ``(len(valid), n + 1)``.  Column 0 holds the description id; the
        remaining columns hold the hit docids (``submit``) or scores
        (``submit_score``), padded with ``FLAGS.default_cite`` /
        ``FLAGS.default_score`` when fewer than ``n`` hits are found.
    """
    null_size = 0
    query_text = valid['query_text'].values
    ids = valid['description_id'].values

    # Use the builtin ``str``: the ``np.str`` alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    submit = np.zeros((len(ids), n + 1)).astype(str)
    submit_score = np.zeros((len(ids), n + 1)).astype(str)

    searcher = pysearch.SimpleSearcher(index_path)
    searcher.set_bm25_similarity(FLAGS.k1, FLAGS.b)

    for i in tqdm(range(len(ids))):
        hits = searcher.search(query_text[i].encode('utf-8'), k=n)
        if len(hits) == 0:
            null_size += 1

        # Keep at most n hits and pad the remainder with the defaults; this
        # also covers the empty-result case.
        found = min(len(hits), n)
        padding = n - found
        col = ([ids[i]]
               + [hits[idx].docid for idx in range(found)]
               + [FLAGS.default_cite] * padding)
        col_score = ([ids[i]]
                     + [hits[idx].score for idx in range(found)]
                     + [FLAGS.default_score] * padding)

        submit[i] = col
        submit_score[i] = col_score

    print('nullsize:{}'.format(null_size))
    return submit, submit_score
def __init__(self, index_location, k=1000, wmodel="BM25", **kwargs):
    """
        Construct an AnseriniBatchRetrieve retrieve.

        Args:
            index_location(str): The location of the Anserini index.
            k(int): number of results to return. Default is 1000.
            wmodel(str): Weighting models supported by Anserini. There are three options:

             * `"BM25"` - the BM25 weighting model
             * `"QLD"`  - Dirichlet language modelling
             * `"TFIDF"` - Lucene's `ClassicSimilarity <https://lucene.apache.org/core/8_1_0/core/org/apache/lucene/search/similarities/ClassicSimilarity.html>`_.
    """
    super().__init__(kwargs)
    self.index_location, self.k = index_location, k

    # Anserini is initialised before pyserini is imported, hence the
    # function-local import.
    _init_anserini()
    from pyserini.search import pysearch

    searcher = pysearch.SimpleSearcher(index_location)
    self.searcher = searcher
    self.wmodel = wmodel
    self._setsimilarty(wmodel)
def __init__(self, lucene_index_path: str, min_df: int = 1) -> None:
    """Read term statistics from a Lucene index and build the vocabulary.

    Args:
        lucene_index_path: path of the Lucene index.
        min_df: document-frequency threshold; only terms whose ``df`` is
            strictly greater than this value enter the vocabulary.

    After construction ``idf_`` maps every index term to
    ``log(num_docs / df)``, ``vocabulary_`` holds the retained terms and
    ``term_to_index`` assigns each retained term a position.
    """
    self.min_df: int = min_df
    self.index_utils = pyutils.IndexReaderUtils(lucene_index_path)

    # get num_docs
    self.searcher = pysearch.SimpleSearcher(lucene_index_path)
    self.num_docs: int = self.searcher.num_docs

    # pre-processing
    self.vocabulary_ = vocabulary = set()
    self.idf_ = idf = {}
    for entry in self.index_utils.terms():
        idf[entry.term] = math.log(self.num_docs / entry.df)
        if entry.df > self.min_df:
            vocabulary.add(entry.term)

    self.term_to_index = {
        term: position for position, term in enumerate(self.vocabulary_)
    }
    self.vocabulary_size = len(self.vocabulary_)
    print(f'Found {self.vocabulary_size} terms')
def create_dataset(question_df, labels, cands_size):
    """Retrieve the top-k candidate answers for every question.

    Returns:
        dataset: list of lists in the form
            [qid, [pos ans], [ans candidates]]
    ----------
    Arguments:
        question_df: Dataframe containing the qid and question text
        labels: mapping indexed by qid; ``labels[qid]`` is stored as the
            second element of each entry
        cands_size: int - number of candidates to retrieve
    """
    dataset = []
    # Calls retriever
    retriever = pysearch.SimpleSearcher(fiqa_index)

    for _, row in question_df.iterrows():
        qid = row['qid']
        relevant = labels[qid]
        # The characters £, € and § are stripped from the question before
        # it is sent to the searcher.
        cleaned = re.sub('[£€§]', '', row['question'])
        candidates = [int(hit.docid)
                      for hit in retriever.search(cleaned, k=cands_size)]
        dataset.append([qid, relevant, candidates])

    return dataset
from pyserini.search import pysearch

searcher = pysearch.SimpleSearcher('natural_questions/lucene-index-msmarco/')
hits = searcher.search('who is the announcer on americas got talent?')

# Print the first 10 hits (or fewer, when the search returns fewer, instead
# of failing with IndexError):
for i in range(min(10, len(hits))):
    print(f'{i+1} {hits[i].docid} {hits[i].score}')

# Grab the actual text (only when there is at least one hit):
if len(hits) > 0:
    hits[0].content
def __init__(self, index_path: str):
    """Open a ``SimpleSearcher`` on ``index_path`` and keep it on the instance."""
    searcher = pysearch.SimpleSearcher(index_path)
    self.searcher = searcher
import numpy as np


def make_run_file(file, topics, searcher, w_bm25, w_rnp):
    """Write a TREC-style run file that mixes BM25 with a per-document prior.

    For every topic, the topic title is searched (top 10) and each hit is
    scored as ``w_bm25 * hit.score + w_rnp * prob``, where ``prob`` is looked
    up by ``str(docid)`` in the dict stored in ``trueProbs_d2v.npy``.

    Args:
        file: path of the run file to write.
        topics: mapping of topic id to a dict with a ``'title'`` entry.
        searcher: object exposing ``search(query, k)``.
        w_bm25: weight of the BM25 score.
        w_rnp: weight of the stored probability.
    """
    probTrue = np.load('trueProbs_d2v.npy', allow_pickle='TRUE').item()

    with open(file, 'w') as runfile:
        print('Running {} queries in total'.format(len(topics)))
        for cnt, topic_id in enumerate(topics, start=1):
            query = topics[topic_id]['title'].encode('utf-8')
            for rank, hit in enumerate(searcher.search(query, 10), start=1):
                combined = (w_bm25 * hit.score
                            + w_rnp * probTrue[str(hit.docid)])
                runfile.write('{} Q0 {} {} {:.6f} Anserini\n'.format(
                    topic_id, hit.docid, rank, combined))
            # Progress report every 100 topics.
            if cnt % 100 == 0:
                print(f'{cnt} queries completed')


if __name__ == "__main__":
    topics = pysearch.get_topics('robust04')
    searcher = pysearch.SimpleSearcher('robust_index')
    make_run_file('run.fnc-reranker.txt', topics, searcher, 0.5, 0.5)
import os

# JAVA_HOME is set before pyserini is imported (presumably so the JVM can be
# located -- confirm for the target environment).
os.environ["JAVA_HOME"] = "/usr/lib/jvm/jdk-11.0.2"

from pyserini.search import pysearch
import pandas as pd
from IPython.core.display import display, HTML
import json

query = 'What collaborations are happening within 2019-nCoV research community'
keywords = 'inter-sectorial, international, collaboration, global, coronavirus, novel coronavirus, sharing'

searcher = pysearch.SimpleSearcher('lucene-index-covid-2020-03-27/')
hits = searcher.search(query + '. ' + keywords)

# Inspect the top 3 hits only, but never more than the search returned
# (indexing past the end of ``hits`` would raise IndexError).
n_hits = min(3, len(hits))

display(HTML('<div style="font-family: Times New Roman; font-size: 20px; padding-bottom:12px"><b>Query</b>: '+query+'</div>'))

# Count how often each JSON key occurs across the inspected hits.
available_keys = {}
for i in range(0, n_hits):
    doc_json = json.loads(hits[i].raw)
    for k in doc_json:
        available_keys[k] = available_keys.get(k, 0) + 1

hit_dictionary = {}
for i in range(0, n_hits):
    doc_json = json.loads(hits[i].raw)
parser.add_argument('--rm3', action='store_true', default=False, help='use RM3') parser.add_argument('--fbTerms', default=10, type=int, help='RM3 parameter: number of expansion terms') parser.add_argument('--fbDocs', default=10, type=int, help='RM3 parameter: number of documents') parser.add_argument('--originalQueryWeight', default=0.5, type=float, help='RM3 parameter: weight to assign to the original query') args = parser.parse_args() data_type = 'oc' if args.valid_docs: data_type = 'pd' valid_docs = set(open(args.valid_docs).read().strip().split('\n')) searcher = pysearch.SimpleSearcher(args.index) searcher.set_bm25_similarity(args.k1, args.b) print('Initializing BM25, setting k1={} and b={}'.format(args.k1, args.b)) if args.rm3: searcher.setRM3Reranker(args.fbTerms, args.fbDocs, args.originalQueryWeight) print('Initializing RM3, setting fbTerms={}, fbDocs={} and originalQueryWeight={}'.format( args.fbTerms, args.fbDocs, args.originalQueryWeight)) with open(args.output, 'w') as fout: start_time = time.time() for line_number, line in tqdm(enumerate(open(args.qid_queries, encoding='utf-8'))): query_id, query = line.strip().split('\t') # We return one more result because it is almost certain that we will # retrieve the document that originated the query. hits = searcher.search(query, args.hits + 1)
def __init__(self, index_dir):
    """Open a ``SimpleSearcher`` on ``index_dir`` and keep it on the instance."""
    searcher = pysearch.SimpleSearcher(index_dir)
    self.simple_searcher = searcher
lmir_dir = LMIR('DIR', np.array([query_tfs], dtype=np.float32), np.array([idfs], dtype=np.float32), [dl], [dl_set], Pw, count)[0] lmir_abs = LMIR('ABS', np.array([query_tfs], dtype=np.float32), np.array([idfs], dtype=np.float32), [dl], [dl_set], Pw, count)[0] except Exception as e: lmir_jm = lmir_dir = lmir_abs = None features[idx + 5 * 4] = lmir_abs features[idx + 6 * 4] = lmir_dir features[idx + 7 * 4] = lmir_jm if __name__ == '__main__': filename = 'features.txt' searcher = pysearch.SimpleSearcher(CLUEWEB_INDEX) baseline_docs = get_baseline_mapping() urls = get_urls() for www, baseline in baseline_docs.items(): print('Processing {}'.format(www)) queries = get_queries(www) term_doc_freq = get_document_frequencies(queries.values()) if www in ('www1', 'www2'): rels = get_relevance(www) else: rels = None with open(data_folder / www / filename, 'w') as f: for qid, dids in tqdm.tqdm(baseline.items()): # For query-level normalization scores = {i: [] for i in range(32)} scores['docs'] = []
from pyserini.search import pysearch
import subprocess
from tqdm.auto import tqdm
import random
import pickle
import sys
import unicodedata
import string
import re
import os
from collections import defaultdict
import math

index_path = path("lucene-index.msmarco-doc.pos+docvectors+rawdocs")
searcher = pysearch.SimpleSearcher(index_path)

# query id -> list of relevant document ids, read from the train and dev qrels.
# ``defaultdict(list)`` behaves like the former ``lambda: []`` factory but
# stays picklable.
relevant_docs = defaultdict(list)
for file in [path("qrels/msmarco-doctrain-qrels.tsv"), path("qrels/msmarco-docdev-qrels.tsv")]:
    # Close each qrels file as soon as it has been read.
    with open(file) as qrels:
        for line in qrels:
            query_id, _, doc_id, rel = line.split()
            assert rel == "1"
            relevant_docs[query_id].append(doc_id)

# Raw string: same pattern text, without the invalid-escape warning that
# '\s' / '\w' trigger in a normal string literal.
pattern = re.compile(r'([^\s\w]|_)+')

anserini_top_10 = defaultdict(list)
searcher.set_bm25_similarity(0.9, 0.4)
pairs_per_split = defaultdict(list)

threads = 42  # Number of Threads to use when retrieving
k = 10  # Number of documents to retrieve
neg_samples = 2  # Number of negatives samples to use
def extract_docs_for_reranking(query, index_path, K):
    """Return the top ``K`` hits for ``query`` from the index at ``index_path``.

    A new ``SimpleSearcher`` is opened on every call.
    """
    return pysearch.SimpleSearcher(index_path).search(query, K)
def search(self, query: str):
    """Search the sample collection and return up to ``settings.max_docs`` hits.

    The searcher is re-created on every call and stored on ``self.searcher``.
    """
    index_dir = 'indexes/sample_collection_jsonl'
    self.searcher = pysearch.SimpleSearcher(index_dir)
    hits = self.searcher.search(q=query, k=settings.max_docs)
    return hits