def testTermQuery(self):
    """Compare a SHOULD-joined two-term boolean query with the free-text search.

    Hits from both searches are walked pairwise and must agree on docid
    and on score.
    """
    occur_should = querybuilder.JBooleanClauseOccur['should'].value
    builder = querybuilder.get_boolean_query_builder()
    for word in ('information', 'retrieval'):
        builder.add(querybuilder.get_term_query(word), occur_should)

    built_hits = self.searcher.search(builder.build())
    text_hits = self.searcher.search('information retrieval')

    for built, text in zip(built_hits, text_hits):
        self.assertEqual(built.docid, text.docid)
        self.assertEqual(built.score, text.score)
def testBuildBoostedQuery(self):
    """Check how boost queries affect the scores of a two-term disjunction.

    Boosting both terms by 2 must keep the docids and give scores equal to
    twice the unboosted scores (within 0.001). Boosting the terms by 2 and 3
    must give scores that differ from the uniformly boosted ones.
    """
    info_term = querybuilder.get_term_query('information')
    retrieval_term = querybuilder.get_term_query('retrieval')
    uniform_boosts = [
        querybuilder.get_boost_query(term, 2.)
        for term in (info_term, retrieval_term)
    ]
    occur_should = querybuilder.JBooleanClauseOccur['should'].value

    def search_disjunction(clauses):
        # Join the clauses with SHOULD and run the resulting boolean query.
        builder = querybuilder.get_boolean_query_builder()
        for clause in clauses:
            builder.add(clause, occur_should)
        return self.searcher.search(builder.build())

    boosted_hits = search_disjunction(uniform_boosts)
    plain_hits = search_disjunction([info_term, retrieval_term])
    for boosted, plain in zip(boosted_hits, plain_hits):
        self.assertEqual(boosted.docid, plain.docid)
        self.assertAlmostEqual(boosted.score, plain.score * 2, delta=0.001)

    uneven_boosts = [
        querybuilder.get_boost_query(info_term, 2.),
        querybuilder.get_boost_query(retrieval_term, 3.),
    ]
    uneven_hits = search_disjunction(uneven_boosts)
    for boosted, uneven in zip(boosted_hits, uneven_hits):
        self.assertNotEqual(boosted.score, uneven.score)
def testIncompatabilityWithRM3(self):
    """Check that searching with a query object raises once RM3 is enabled.

    The same boolean query first returns 10 hits; after ``set_rm3()`` the
    search call is expected to raise ``NotImplementedError``.
    """
    occur_should = querybuilder.JBooleanClauseOccur['should'].value
    builder = querybuilder.get_boolean_query_builder()
    for word in ('information', 'retrieval'):
        builder.add(querybuilder.get_term_query(word), occur_should)
    bool_query = builder.build()

    # Before RM3 is switched on the query object is searchable.
    self.assertEqual(10, len(self.searcher.search(bool_query)))

    self.searcher.set_rm3()
    self.assertTrue(self.searcher.is_using_rm3())
    self.assertRaises(NotImplementedError, self.searcher.search, bool_query)
def buildQuery(queries, en, es, de):
    """Build a boolean SHOULD query from the words of the enabled languages.

    Args:
        queries: Mapping with the keys ``"en"``, ``"es"`` and ``"de"``, each
            holding an iterable of words. A key is only read when its flag
            is truthy.
        en: Include the words under ``"en"`` when truthy.
        es: Include the words under ``"es"`` when truthy.
        de: Include the words under ``"de"`` when truthy.

    Returns:
        The query produced by the boolean query builder's ``build()``.
    """
    occur_should = querybuilder.JBooleanClauseOccur['should'].value
    builder = querybuilder.get_boolean_query_builder()
    # Languages are processed in the fixed order en, es, de.
    for lang, enabled in (("en", en), ("es", es), ("de", de)):
        if not enabled:
            continue
        for word in queries[lang]:
            builder.add(querybuilder.get_term_query(word), occur_should)
    return builder.build()
def testTermQuery2(self):
    """Compare term queries built with stemming disabled against free text.

    The terms 'inform' and 'retriev' are passed through an analyzer created
    with ``stemming=False``; the resulting disjunction must return the same
    docids and scores, pairwise, as searching 'information retrieval'.
    """
    stem_queries = [
        querybuilder.get_term_query(stem, analyzer=get_lucene_analyzer(stemming=False))
        for stem in ('inform', 'retriev')
    ]
    occur_should = querybuilder.JBooleanClauseOccur['should'].value
    builder = querybuilder.get_boolean_query_builder()
    for stem_query in stem_queries:
        builder.add(stem_query, occur_should)

    built_hits = self.searcher.search(builder.build())
    text_hits = self.searcher.search('information retrieval')

    for built, text in zip(built_hits, text_hits):
        self.assertEqual(built.docid, text.docid)
        self.assertEqual(built.score, text.score)
def search(expander, rankers, topicreader, index, anserini, output):
    """Run retrieval for every ranker and write one run file per ranker.

    Three paths are taken, chosen per ranker:

    * ``expander`` is an ``OnFields`` or ``BertQE`` instance: each query is a
      ``{term: weight}`` mapping, turned into a boosted boolean query and
      searched through ``SimpleSearcher``.
    * ``topicreader == 'TsvString'``: each ``qid<TAB>text`` line of the query
      file is searched through ``SimpleSearcher``.
    * otherwise: Anserini's ``SearchCollection`` is invoked as a shell command.

    Args:
        expander: Object providing ``get_model_name()``.
        rankers: Iterable of ranker flags such as ``'-bm25'`` or ``'-qld'``.
        topicreader: Topic reader name passed to the query loader / CLI.
        index: Index location passed to ``SimpleSearcher`` and the CLI.
        anserini: Prefix of the Anserini installation; it is concatenated
            directly with ``target/appassembler/bin/SearchCollection``.
        output: Prefix for both the query file
            (``<output>.<model>.txt``) and the run files
            (``<output>.<model>.<ranker>.txt``).

    Raises:
        Exception: Any error raised while searching is reported on stdout
            and re-raised unchanged.
    """
    # Information Retrieval using Anserini
    rank_cmd = '{}target/appassembler/bin/SearchCollection'.format(anserini)
    model_name = expander.get_model_name()
    try:
        Q_filename = '{}.{}.txt'.format(output, model_name)
        for ranker in rankers:
            Q_pred = '{}.{}.{}.txt'.format(output, model_name, utils.get_ranker_name(ranker))
            searcher = SimpleSearcher(index)
            # NOTE(review): any other ranker flag leaves the searcher as
            # constructed for the two in-process branches below -- confirm
            # that this is intended.
            if ranker == '-bm25':
                searcher.set_bm25(0.9, 0.4)
            elif ranker == '-qld':
                searcher.set_qld()

            if isinstance(expander, (OnFields, BertQE)):
                raw_queries = utils.get_raw_query(topicreader, Q_filename)
                # SECURITY: eval() executes whatever text the query file
                # contains. Only use with query files produced by this
                # pipeline; consider ast.literal_eval if the stored values
                # are plain literals.
                weighted_queries = {
                    qid.strip(): eval(query) for qid, query in raw_queries.items()
                }
                with open(Q_pred, 'w') as run_file:
                    for qid, term_weights in weighted_queries.items():
                        query = _build_weighted_query(term_weights)
                        hits = searcher.search(query, k=10000)
                        # Only the first 1000 positions are written.
                        _write_run_lines(run_file, qid, hits, ' \n', limit=1000)
            elif topicreader == 'TsvString':
                with open(Q_filename, 'r') as topics, open(Q_pred, 'w') as run_file:
                    for line in topics:
                        line = line.rstrip('\n')
                        if not line:
                            # Tolerate blank lines (e.g. a trailing newline).
                            continue
                        # maxsplit=1 keeps any further tabs inside the text.
                        qid, qtext = line.split('\t', 1)
                        hits = searcher.search(qtext, k=1000)
                        _write_run_lines(run_file, qid, hits, '\n')
            else:
                cli_cmd = '\"{}\" {} -threads 44 -topicreader {} -index {} -topics {} -output {}'.format(
                    rank_cmd, ranker, topicreader, index, Q_filename, Q_pred)
                print('{}\n'.format(cli_cmd))
                with os.popen(cli_cmd) as stream:
                    print(stream.read())
    except Exception:
        # Errors raised inside the SearchCollection child process cannot be
        # caught here because they happen outside this process.
        print('INFO: MAIN: SEARCH: There has been error in {}!\n{}'.format(expander, traceback.format_exc()))
        raise


def _build_weighted_query(term_weights):
    """Build a boolean SHOULD query of boosted terms from ``{term: weight}``."""
    should = querybuilder.JBooleanClauseOccur['should'].value
    builder = querybuilder.get_boolean_query_builder()
    for term, weight in term_weights.items():
        try:
            clause = querybuilder.get_boost_query(querybuilder.get_term_query(term), weight)
        except Exception:
            # Best effort: terms that cannot be turned into a query (e.g.
            # stop words absent from the indexed collection) are skipped.
            continue
        builder.add(clause, should)
    return builder.build()


def _write_run_lines(run_file, qid, hits, line_end, limit=None):
    """Write the hits of one query to ``run_file``, skipping repeated docids.

    The rank column is the 1-based position in ``hits``, so skipped
    duplicates leave gaps. ``limit`` caps how many positions are considered.
    """
    seen_docids = set()
    for position, hit in enumerate(hits):
        if limit is not None and position >= limit:
            break
        if hit.docid in seen_docids:
            continue
        seen_docids.add(hit.docid)
        run_file.write(f'{qid} Q0 {hit.docid:15} {position+1:2} {hit.score:.5f} Pyserini{line_end}')