def query_processor(self, query):
        """
        Using aserini for search as a lite weight replacement for elasticsearch
        :param query: query from user
        :return: a function
        """

        # Retrieve the set of candidate documents
        searcher = pysearch.SimpleSearcher(self.index)
        results = searcher.search(query, self.number_docs)
        documents = []
        history = set()

        for res in results:
            did = res.docid
            history.add(did)
            title = res.lucene_document.get("title")
            text = res.lucene_document.get("abstract")
            inp_dict = {"id": did, "title": title, "text": text}
            documents.append(inp_dict)
            # res.contents appears to begin with the title and abstract; skip
            # those two lines and index each remaining paragraph separately.
            for i, para in enumerate(res.contents.split("\n")):
                if i < 2:
                    continue
                documents.append({"id": did, "title": title, "text": para})

        return self.processor(query, documents)
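
A minimal sketch of how this method might be wired up; the class name, index path, and processor callback are hypothetical stand-ins for whatever the surrounding code defines.

from pyserini.search import pysearch

class Retriever:
    def __init__(self, index, number_docs, processor):
        self.index = index              # path to a Lucene index
        self.number_docs = number_docs  # number of candidates to retrieve
        self.processor = processor      # callable(query, documents)

    query_processor = query_processor   # reuse the function defined above

# Hypothetical usage: a processor that just keeps the first three candidates.
retriever = Retriever('lucene-index-covid-2020-03-27/', 10,
                      lambda query, docs: docs[:3])
candidates = retriever.query_processor('incubation period of the coronavirus')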
Example #2
def evaluate1(examples_df, args):
    with open(args.input_dir + '/index_table_doc.json') as f:
        index_table = json.load(f)
    k1 = args.k1
    searcher = pysearch.SimpleSearcher(args.input_dir + '/lucene-index-nq-doc')
    print("Begin evaluation 1")
    correct = 0
    num_of_examples = 0
    for index, example in examples_df.iterrows():
        num_of_examples += 1
        question = example["query"]
        actual_doc_id = example["doc_id"]
        #print("******************** Example id : ", num_of_examples, "***************")
        #print("question : ", question)
        #print("context : ")
        # Search using anserini
        hits = searcher.search(question, k=k1)
        for i in range(0, k1):
            docid = hits[i].docid  # renamed to avoid shadowing the iterrows() index
            #print(hits[i].content)
            #print(i, ":", index_table[docid]["content"])
            #print("score : ", hits[i].score)
            if index_table[docid]["doc_id"] == actual_doc_id:
                correct += 1
                break
    print("Total number of examples : ", num_of_examples)
    recall = correct / num_of_examples
    print("Proportion of correct retrieval : %f" % (recall))
Example #3
    def query(self, query, b, k1):
        self["index"].create_index()
        searcher = pysearch.SimpleSearcher(
            self["index"].get_index_path().as_posix())
        searcher.set_bm25_similarity(k1, b)

        hits = searcher.search(query)
        return OrderedDict({hit.docid: hit.score for hit in hits})
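
The same idea as a standalone sketch, without the surrounding class; the index path is a hypothetical placeholder.

from collections import OrderedDict
from pyserini.search import pysearch

def bm25_query(index_path, query, k1=0.9, b=0.4):
    # Score documents with BM25 under the given k1/b parameters.
    searcher = pysearch.SimpleSearcher(index_path)
    searcher.set_bm25_similarity(k1, b)
    hits = searcher.search(query)
    return OrderedDict((hit.docid, hit.score) for hit in hits)

# Hypothetical index path; rerun with different k1/b to compare rankings.
print(bm25_query('lucene-index/', 'information retrieval', k1=1.2, b=0.75))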
Example #4
    def query(self, query):
        self["index"].create_index()
        searcher = pysearch.SimpleSearcher(
            self["index"].get_index_path().as_posix())
        searcher.set_lm_dirichlet_similarity(self.cfg["mu"])

        hits = searcher.search(query)
        return OrderedDict({hit.docid: hit.score for hit in hits})
Example #5
    def __init__(self, index_location, k=1000, wmodel="BM25", **kwargs):
        super().__init__(kwargs)
        self.index_location = index_location
        self.k = k
        _init_anserini()
        from pyserini.search import pysearch
        self.searcher = pysearch.SimpleSearcher(index_location)
        self.wmodel = wmodel
        self._setsimilarty(wmodel)
Example #6
    def test_basic(self):
        searcher = pysearch.SimpleSearcher('{}lucene-index.cacm'.format(
            self.index_dir))
        hits = searcher.search('information retrieval')

        self.assertEqual(hits[0].docid, 'CACM-3134')
        self.assertAlmostEqual(hits[0].score, 4.76550, places=5)

        self.assertEqual(hits[9].docid, 'CACM-2516')
        self.assertAlmostEqual(hits[9].score, 4.21740, places=5)
Example #7
    def query(self, query, b, k1, fbterms, fbdocs, ow):
        self["index"].create_index()
        searcher = pysearch.SimpleSearcher(
            self["index"].get_index_path().as_posix())
        searcher.set_bm25_similarity(k1, b)
        searcher.set_rm3_reranker(fb_terms=fbterms,
                                  fb_docs=fbdocs,
                                  original_query_weight=ow)

        hits = searcher.search(query)
        return OrderedDict({hit.docid: hit.score for hit in hits})
Example #8
def searchDatabase(question,
                   keywords=KEYWORDS,
                   pysearch=pysearch,
                   lucene_database='lucene-index-covid-2020-03-27/',
                   BERTSQuAD_Model=model,
                   displayTable=True,
                   displayHTML=False):
    """Search Database
    """
    ## search the lucene database with a combination of the question and the keywords
    searcher = pysearch.SimpleSearcher(lucene_database)
    hits = searcher.search(question + '. ' + keywords)

    ## collect the relevant data in a hit dictionary
    hit_dictionary = {}
    for i in range(0, N_HITS):
        doc_json = json.loads(hits[i].raw)
        idx = int(hits[i].docid)
        hit_dictionary[idx] = doc_json
        hit_dictionary[idx]['title'] = hits[i].lucene_document.get("title")
        hit_dictionary[idx]['authors'] = hits[i].lucene_document.get("authors")
        hit_dictionary[idx]['doi'] = hits[i].lucene_document.get("doi")

    ## scrub the abstracts in prep for BERT-SQuAD
    for idx, v in hit_dictionary.items():
        abs_dirty = v['abstract']
        # looks like the abstract value can be an empty list
        v['abstract_paragraphs'] = []
        v['abstract_full'] = ''

        if abs_dirty:
            # If the abstract is a list, each entry is a dictionary whose text sits under
            # the 'text' key, one entry per paragraph. Build a list of paragraphs plus a
            # full-abstract entry, as both could be valuable for BERT-derived QA.
            if isinstance(abs_dirty, list):
                for p in abs_dirty:
                    v['abstract_paragraphs'].append(p['text'])
                    v['abstract_full'] += p['text'] + ' \n\n'

            # In some cases the abstract is a plain string, which can be used as-is.
            if isinstance(abs_dirty, str):
                v['abstract_paragraphs'].append(abs_dirty)
                v['abstract_full'] += abs_dirty + ' \n\n'
    ## Search collected abstracts with BERT-SQuAD
    answers = searchAbstracts(hit_dictionary, BERTSQuAD_Model, question)
    # print(answers)
    ## display results in a nice format
    return displayResults(hit_dictionary,
                          answers,
                          question,
                          displayTable=displayTable,
                          displayHTML=displayHTML)
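
The abstract scrubbing above can be factored into a small helper; a sketch under the same assumptions (the field is either a plain string or a list of {'text': ...} paragraph dictionaries).

def normalize_abstract(abstract):
    """Return (paragraphs, full_text) for a CORD-19-style abstract field."""
    paragraphs, full = [], ''
    if isinstance(abstract, list):      # list of {'text': ...} paragraphs
        for p in abstract:
            paragraphs.append(p['text'])
            full += p['text'] + ' \n\n'
    elif isinstance(abstract, str) and abstract:  # plain string abstract
        paragraphs.append(abstract)
        full = abstract + ' \n\n'
    return paragraphs, full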
Example #9
def evaluate3(examples_df, args):
    # TODO: This doesn't make sense !!!
    with open(args.input_dir + '/index_table_doc.json') as f:
        index_table_doc = json.load(f)
    with open(args.input_dir + '/index_table_ele.json') as f:
        index_table_ele = json.load(f)
    k1 = args.k1
    k2 = args.k2
    searcher1 = pysearch.SimpleSearcher(args.input_dir +
                                        '/lucene-index-nq-doc')
    searcher2 = pysearch.SimpleSearcher(args.input_dir +
                                        '/lucene-index-nq-ele')
    print("Begin evaluation 3")
    correct = 0
    num_of_examples = 0
    for index, example in examples_df.iterrows():
        num_of_examples += 1
        question = example["query"]
        actual_element_id = example["context_id"]
        actual_doc_id = example["doc_id"]

        doc_hits = searcher1.search(question, k=k1)
        ele_hits = searcher2.search(question, k=k2)
        doc_ids = []

        for i in range(0, k1):
            docid = doc_hits[i].docid  # renamed to avoid shadowing the iterrows() index
            doc_ids.append(index_table_doc[docid]["doc_id"])
        for i in range(0, k2):
            entry = index_table_ele[ele_hits[i].docid]
            if (entry["doc_id"] in doc_ids
                    and entry["doc_id"] == actual_doc_id
                    and entry["element_id"] == actual_element_id):
                correct += 1
                break

    print("Total number of examples : ", num_of_examples)
    recall = correct / num_of_examples
    print("Proportion of correct retrieval : %f" % (recall))
Example #10
    def setUp(self):
        # Download pre-built CACM index; append a random value to avoid filename clashes.
        r = randint(0, 10000000)
        self.collection_url = 'https://github.com/castorini/anserini-data/raw/master/CACM/lucene-index.cacm.tar.gz'
        self.tarball_name = 'lucene-index.cacm-{}.tar.gz'.format(r)
        self.index_dir = 'index{}/'.format(r)

        filename, headers = urlretrieve(self.collection_url, self.tarball_name)

        tarball = tarfile.open(self.tarball_name)
        tarball.extractall(self.index_dir)
        tarball.close()

        self.searcher = pysearch.SimpleSearcher(f'{self.index_dir}lucene-index.cacm')
Example #11
    def build_searcher(self, index_path):
        searcher = pysearch.SimpleSearcher(index_path)
        searcher.set_bm25_similarity(settings.bm25_k1, settings.bm25_b)
        print(f'Initializing BM25 {index_path}, '
              f'setting k1={settings.bm25_k1} and b={settings.bm25_b}')
        if settings.rm3:
            searcher.set_rm3_reranker(settings.rm3_fb_terms,
                                      settings.rm3_fb_docs,
                                      settings.rm3_original_query_weight)

            print('Initializing RM3, setting '
                  f'fbTerms={settings.rm3_fb_terms}, '
                  f'fbDocs={settings.rm3_fb_docs} and '
                  f'originalQueryWeight={settings.rm3_original_query_weight}')
        return searcher
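
The method reads its parameters from a module-level settings object; a sketch of what that object might look like (attribute names taken from the code above, values hypothetical).

from types import SimpleNamespace

# Hypothetical stand-in for the settings module the method expects.
settings = SimpleNamespace(bm25_k1=0.9, bm25_b=0.4, rm3=True,
                           rm3_fb_terms=10, rm3_fb_docs=10,
                           rm3_original_query_weight=0.5)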
Example #12
    def __init__(self):
        self.QA_MODEL = BertForQuestionAnswering.from_pretrained(BIOASQ_DIR)
        self.QA_TOKENIZER = BertTokenizer.from_pretrained(BIOASQ_DIR)
        self.QA_MODEL.to(torch_device)
        self.QA_MODEL.eval()
        self.searcher = pysearch.SimpleSearcher(luceneDir)
        self.USE_SUMMARY = False

        if self.USE_SUMMARY:
            self.SUMMARY_TOKENIZER = BartTokenizer.from_pretrained(
                'bart-large-cnn')
            self.SUMMARY_MODEL = BartForConditionalGeneration.from_pretrained(
                'bart-large-cnn')
            self.SUMMARY_MODEL.to(torch_device)
            self.SUMMARY_MODEL.eval()
Example #13
    def do_query(query_string, index, k1=1.2, b=0.75, n=100):
        """
        Does a bm25 search and returns the most relevant 1000 docs and their ids
        """
        with patch("pyserini.setup.configure_classpath") as mock_setup:
            mock_setup.return_value = None
            from pyserini.search import pysearch

            searcher = pysearch.SimpleSearcher(index.index_path)
            searcher.set_bm25_similarity(k1, b)
            hits = searcher.search(query_string, n)
            doc_ids = [hit.docid for hit in hits]
            docs = [index.getdoc(doc_id) for doc_id in doc_ids]

            return doc_ids, docs
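Example #14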
    def __init__(self,
                 index_path: Path,
                 topic_path: Path,
                 qrel_path: Path,
                 columns: List[str],
                 topic_reader: TopicReader,
                 searcher_name: str = "bm25"):

        self.qrel = qrel_path
        self.topic = topic_path
        index_path = str(index_path)
        self.index_utils = JIndexUtils(JString(index_path))
        self.searcher = pysearch.SimpleSearcher(index_path)
        self._set_searcher(searcher_name)
        self.columns = columns
        self.topic_reader = topic_reader
Example #15
    def build_searcher(self, index_path):
        self.searcher = pysearch.SimpleSearcher(index_path)
        self.searcher.set_bm25_similarity(settings.bm25_k1, settings.bm25_b)
        print(f"Initializing BM25 {index_path}, "
              f"setting k1={settings.bm25_k1} and b={settings.bm25_b}")
        if settings.rm3:
            self.searcher.set_rm3_reranker(
                settings.rm3_fb_terms,
                settings.rm3_fb_docs,
                settings.rm3_original_query_weight,
            )

            print("Initializing RM3, setting "
                  f"fbTerms={settings.rm3_fb_terms}, "
                  f"fbDocs={settings.rm3_fb_docs} and "
                  f"originalQueryWeight={settings.rm3_original_query_weight}")
Example #16
    def search(self):
        """Search engine. Retrieves and re-ranks the answer candidates given a query.
        Renders the top-k answers for a query.
        """
        # Download model
        model_name = get_trained_model("finbert-qa")
        model_path = path + "/model/trained/finbert-qa/" + model_name
        # Load model
        self.model.load_state_dict(torch.load(model_path), strict=False)
        self.model.eval()

        searcher = pysearch.SimpleSearcher(fiqa_index)
        self.k = self.config['top_k']

        if self.config['user_input']:
            # Ask the user for a keyword query.
            self.query = input("\nPlease enter your question: ")
        else:
            self.query = self.config['query']

        hits = searcher.search(self.query, k=50)

        cands = []

        for i in range(0, len(hits)):
            cands.append(int(hits[i].docid))

        if len(cands) == 0:
            print("\nNo answers found.")
            sys.exit()
        else:
            print("\nRanking...\n")
            self.rank, self.scores = self.predict(self.model, self.query,
                                                  cands)

            print("Question: \n\t{}\n".format(self.query))

            # Never show more answers than there are candidates.
            self.k = min(self.k, len(cands))

            print("Top-{} Answers: \n".format(self.k))
            for i in range(0, self.k):
                print("{}.\t{}\n".format(i + 1, docid_to_text[self.rank[i]]))
def query_batches_each_item(valid, n):  # default n is 3

    null_size = 0
    query_text = valid['query_text'].values
    ids = valid['description_id'].values
    submit = np.zeros((len(ids), n + 1)).astype(str)
    submit_score = np.zeros((len(ids), n + 1)).astype(str)

    searcher = pysearch.SimpleSearcher(index_path)
    searcher.set_bm25_similarity(FLAGS.k1, FLAGS.b)
    count = len(ids)
    bar = tqdm(range(count))
    for i in bar:
        col = list()
        col_score = list()
        col.append(ids[i])
        col_score.append(ids[i])

        cur_query = query_text[i]
        hits = searcher.search(cur_query.encode('utf-8'), k=n)

        if len(hits) == 0:
            null_size += 1
            for idx in range(0, n):
                col.append(FLAGS.default_cite)
                col_score.append(FLAGS.default_score)
        else:
            min_cnt = min(len(hits), n)
            for idx in range(0, min_cnt):
                col.append(hits[idx].docid)
                col_score.append(hits[idx].score)
            while min_cnt < n:
                col.append(FLAGS.default_cite)
                col_score.append(FLAGS.default_score)
                min_cnt += 1

        submit[i] = col
        submit_score[i] = col_score
    print('nullsize:{}'.format(null_size))
    return submit, submit_score
Example #18
    def __init__(self, index_location, k=1000, wmodel="BM25", **kwargs):
        """
            Construct an AnseriniBatchRetrieve retrieve. 

            Args:

                index_location(str): The location of the Anserini index.
                wmodel(str): Weighting models supported by Anserini. There are three options: 
                
                 * `"BM25"` - the BM25 weighting model
                 * `"QLD"`  - Dirichlet language modelling
                 *  `"TFIDF"` - Lucene's `ClassicSimilarity <https://lucene.apache.org/core/8_1_0/core/org/apache/lucene/search/similarities/ClassicSimilarity.html>`_.
                k(int): number of results to return. Default is 1000.
        """
        super().__init__(kwargs)
        self.index_location = index_location
        self.k = k
        _init_anserini()
        from pyserini.search import pysearch
        self.searcher = pysearch.SimpleSearcher(index_location)
        self.wmodel = wmodel
        self._setsimilarty(wmodel)
Example #19
    def __init__(self, lucene_index_path: str, min_df: int = 1) -> None:
        self.min_df: int = min_df
        self.index_utils = pyutils.IndexReaderUtils(lucene_index_path)

        # get num_docs
        self.searcher = pysearch.SimpleSearcher(lucene_index_path)
        self.num_docs: int = self.searcher.num_docs

        # pre-processing
        self.vocabulary_ = set()
        self.idf_ = {}

        for term in self.index_utils.terms():
            self.idf_[term.term] = math.log(self.num_docs / term.df)
            if term.df > self.min_df:
                self.vocabulary_.add(term.term)

        self.term_to_index = {}
        for index, term in enumerate(self.vocabulary_):
            self.term_to_index[term] = index
        self.vocabulary_size = len(self.vocabulary_)
        print(f'Found {self.vocabulary_size} terms')
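
A sketch of a method that could be added to this class to embed an already-analyzed term list as an idf-weighted bag-of-words vector; the method name and input format are hypothetical.

import numpy as np

def embed(self, analyzed_terms):
    # Sparse idf weighting over the learned vocabulary; terms outside
    # the vocabulary (df <= min_df) are simply dropped.
    vec = np.zeros(self.vocabulary_size, dtype=np.float32)
    for t in analyzed_terms:
        if t in self.term_to_index:
            vec[self.term_to_index[t]] += self.idf_[t]
    return vec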
Example #20
def create_dataset(question_df, labels, cands_size):
    """Retrieves the top-k candidate answers for a question and
    creates a list of lists of the dataset containing the question id,
    list of relevant answer ids, and the list of answer candidates

    Returns:
        dataset: list of lists in the form [qid, [pos ans], [ans candidates]]
    ----------
    Arguments:
        question_df: DataFrame containing the qid and question text
        labels: Dictionary mapping each qid to its list of relevant answer ids
        cands_size: int - number of candidates to retrieve
    """
    dataset = []
    # Calls retriever
    searcher = pysearch.SimpleSearcher(fiqa_index)
    # For each question
    for i, row in question_df.iterrows():
        qid = row['qid']
        tmp = []
        # Append qid
        tmp.append(qid)
        # Append list of relevant docs
        tmp.append(labels[qid])
        # Retrieves answer candidates
        cands = []
        query = row['question']
        query = re.sub('[£€§]', '', query)
        hits = searcher.search(query, k=cands_size)

        for i in range(0, len(hits)):
            cands.append(int(hits[i].docid))
        # Append candidate answers
        tmp.append(cands)
        dataset.append(tmp)

    return dataset
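
Hypothetical usage with a tiny DataFrame; fiqa_index is assumed to be the module-level path to the FiQA Lucene index used above.

import pandas as pd

questions = pd.DataFrame({'qid': [1, 2],
                          'question': ['What is a bond?', 'How do ETFs work?']})
labels = {1: [11, 12], 2: [42]}
dataset = create_dataset(questions, labels, cands_size=50)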
Example #21
from pyserini.search import pysearch

searcher = pysearch.SimpleSearcher('natural_questions/lucene-index-msmarco/')
hits = searcher.search('who is the announcer on americas got talent?')

# Print the first 10 hits:
for i in range(0, 10):
    print(f'{i+1} {hits[i].docid} {hits[i].score}')

# Grab the actual text:
hits[0].content
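
If the index stores raw documents as JSON (as in the CORD-19 examples elsewhere on this page), the stored form can be recovered from a hit as well.

import json

# Parse the raw stored document behind the top hit.
doc = json.loads(hits[0].raw)
print(list(doc.keys()))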
Example #22
    def __init__(self, index_path: str):
        self.searcher = pysearch.SimpleSearcher(index_path)
Example #23
import numpy as np


def make_run_file(file, topics, searcher, w_bm25, w_rnp):
    probTrue = np.load('trueProbs_d2v.npy', allow_pickle=True).item()
    with open(file, 'w') as runfile:
        cnt = 0
        print('Running {} queries in total'.format(len(topics)))
        for qid in topics:
            query = topics[qid]['title'].encode('utf-8')
            hits = searcher.search(query, 10)
            for i in range(0, len(hits)):
                doc_id = hits[i].docid

                bm25_score = hits[i].score
                real_news_prob = probTrue[str(doc_id)]

                # Blend BM25 with the real-news probability.
                score = w_bm25 * bm25_score + w_rnp * real_news_prob

                _ = runfile.write('{} Q0 {} {} {:.6f} Anserini\n'.format(qid, hits[i].docid, i+1, score))
                cnt += 1
                if cnt % 100 == 0:
                    print(f'{cnt} result lines written')

if __name__ == "__main__":
	topics = pysearch.get_topics('robust04')
	searcher = pysearch.SimpleSearcher('robust_index')


	make_run_file('run.fnc-reranker.txt', topics , searcher, 0.5, 0.5)
Example #24
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/jdk-11.0.2"
from pyserini.search import pysearch
import pandas as pd
from IPython.core.display import display, HTML
import json

query = 'What collaborations are happening within 2019-nCoV research community'
keywords = 'inter-sectorial, international, collaboration, global, coronavirus, novel coronavirus, sharing'

searcher = pysearch.SimpleSearcher('lucene-index-covid-2020-03-27/')
hits = searcher.search(query + '. ' + keywords)
# n_hits = len(hits)
n_hits = 3

display(HTML('<div style="font-family: Times New Roman; font-size: 20px; padding-bottom:12px"><b>Query</b>: '+query+'</div>'))

for i in range(0, n_hits):
    doc_json = json.loads(hits[i].raw)
    
# Count which JSON fields are present across the top hits.
available_keys = {}
for i in range(0, n_hits):
    doc_json = json.loads(hits[i].raw)
    for k in doc_json.keys():
        if k in available_keys:
            available_keys[k] += 1
        else:
            available_keys[k] = 1
hit_dictionary = {}
for i in range(0, n_hits):
    doc_json = json.loads(hits[i].raw)
Example #25
    parser.add_argument('--rm3', action='store_true', default=False, help='use RM3')
    parser.add_argument('--fbTerms', default=10, type=int,
                        help='RM3 parameter: number of expansion terms')
    parser.add_argument('--fbDocs', default=10, type=int,
                        help='RM3 parameter: number of documents')
    parser.add_argument('--originalQueryWeight', default=0.5, type=float,
                        help='RM3 parameter: weight to assign to the original query')

    args = parser.parse_args()

    data_type = 'oc'
    if args.valid_docs:
        data_type = 'pd'
        valid_docs = set(open(args.valid_docs).read().strip().split('\n'))

    searcher = pysearch.SimpleSearcher(args.index)
    searcher.set_bm25_similarity(args.k1, args.b)
    print('Initializing BM25, setting k1={} and b={}'.format(args.k1, args.b))
    if args.rm3:
        searcher.set_rm3_reranker(args.fbTerms, args.fbDocs, args.originalQueryWeight)
        print('Initializing RM3, setting fbTerms={}, fbDocs={} and originalQueryWeight={}'.format(
            args.fbTerms, args.fbDocs, args.originalQueryWeight))

    with open(args.output, 'w') as fout:
        start_time = time.time()
        for line_number, line in tqdm(enumerate(open(args.qid_queries, encoding='utf-8'))):
            query_id, query = line.strip().split('\t')
            # We return one more result because it is almost certain that we will
            # retrieve the document that originated the query.
            hits = searcher.search(query, args.hits + 1)
Example #26
    def __init__(self, index_dir):
        self.simple_searcher = pysearch.SimpleSearcher(index_dir)
Example #27
        lmir_dir = LMIR('DIR', np.array([query_tfs], dtype=np.float32),
                        np.array([idfs], dtype=np.float32), [dl], [dl_set], Pw,
                        count)[0]
        lmir_abs = LMIR('ABS', np.array([query_tfs], dtype=np.float32),
                        np.array([idfs], dtype=np.float32), [dl], [dl_set], Pw,
                        count)[0]
    except Exception as e:
        lmir_jm = lmir_dir = lmir_abs = None
    features[idx + 5 * 4] = lmir_abs
    features[idx + 6 * 4] = lmir_dir
    features[idx + 7 * 4] = lmir_jm


if __name__ == '__main__':
    filename = 'features.txt'
    searcher = pysearch.SimpleSearcher(CLUEWEB_INDEX)
    baseline_docs = get_baseline_mapping()
    urls = get_urls()
    for www, baseline in baseline_docs.items():
        print('Processing {}'.format(www))
        queries = get_queries(www)
        term_doc_freq = get_document_frequencies(queries.values())
        if www in ('www1', 'www2'):
            rels = get_relevance(www)
        else:
            rels = None
        with open(data_folder / www / filename, 'w') as f:
            for qid, dids in tqdm.tqdm(baseline.items()):
                # For query-level normalization
                scores = {i: [] for i in range(32)}
                scores['docs'] = []
Example #28
from pyserini.search import pysearch
import subprocess
from tqdm.auto import tqdm
import random
import pickle
import sys
import unicodedata
import string
import re
import os
from collections import defaultdict
import math


index_path = path("lucene-index.msmarco-doc.pos+docvectors+rawdocs")
searcher = pysearch.SimpleSearcher(index_path)
relevant_docs = defaultdict(list)
for file in [path("qrels/msmarco-doctrain-qrels.tsv"), path("qrels/msmarco-docdev-qrels.tsv")]:
    for line in open(file):
        query_id, _, doc_id, rel = line.split()
        assert rel == "1"
        relevant_docs[query_id].append(doc_id)                            

pattern = re.compile(r'([^\s\w]|_)+')

anserini_top_10 = defaultdict(list)
searcher.set_bm25_similarity(0.9, 0.4)
pairs_per_split = defaultdict(list)
threads = 42     # Number of threads to use when retrieving
k = 10           # Number of documents to retrieve
neg_samples = 2  # Number of negative samples to use
Example #29
def extract_docs_for_reranking(query, index_path, K):
    searcher = pysearch.SimpleSearcher(index_path)
    hits = searcher.search(query, K)
    return hits
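
Hypothetical usage with a placeholder index path, assuming pysearch is imported as in the other examples.

hits = extract_docs_for_reranking('information retrieval', 'lucene-index/', K=100)
for hit in hits[:10]:
    print(hit.docid, round(hit.score, 4))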
Example #30
    def search(self, query: str):
        self.searcher = pysearch.SimpleSearcher(
            'indexes/sample_collection_jsonl')
        return self.searcher.search(q=query, k=settings.max_docs)