class SSQA_S_Searcher:
    """Search the ``content`` field of an existing Lucene index.

    The searcher scores hits with ``mySimilarity`` and parses queries with
    the analyzer supplied by the caller.
    """

    def __init__(self, indexDir, analyzer):
        """Start the JVM and open a reader/searcher pair on ``indexDir``.

        Args:
            indexDir: Lucene directory object handed to
                ``DirectoryReader.open``.
            analyzer: Analyzer used when parsing query text.
        """
        lucene.initVM()
        self.reader = DirectoryReader.open(indexDir)
        self.searcher = IndexSearcher(self.reader)
        self.searcher.setSimilarity(mySimilarity())
        self.analyzer = analyzer
        active_similarity = self.searcher.getSimilarity()
        logger.debug("Search similarity func: {}".format(active_similarity))

    def search(self, query_text, top_n):
        """Return the ``content`` value of the ``top_n`` best-matching docs.

        The query text is stripped and escaped before parsing, so query
        syntax characters in it are treated literally.

        Args:
            query_text: Raw query string.
            top_n: Maximum number of hits to retrieve.

        Returns:
            A list with one ``content`` value per hit, in ranking order.
        """
        escaped = QueryParser.escape(query_text.strip())
        query = QueryParser("content", self.analyzer).parse(escaped)
        hits = self.searcher.search(query, top_n).scoreDocs

        sentences = []
        for hit in tqdm(hits):
            doc_id = hit.doc
            document = self.searcher.doc(doc_id)
            # Log both the stored document and the scoring explanation.
            log_debug(document, logger)
            log_debug(self.searcher.explain(query, doc_id), logger)
            sentences.append(document['content'])

        logger.info("Added {} sentences".format(len(sentences)))
        return sentences

    def close(self):
        """Close the underlying index reader."""
        self.reader.close()
class ParagSearcher:
    """Search the paragraphs of a single lesson through an ad-hoc index.

    On construction the lesson text is loaded from the database, split into
    paragraphs with ``str_lesson2parags`` and indexed with
    ``_ChineseRamIndexer``; queries are then run against that index using a
    ``SmartChineseAnalyzer`` and ``mySimilarity``.

    Instances can be used as context managers; leaving the ``with`` block
    calls :meth:`close`.
    """

    def __init__(self, Lid, db_path=config.DB_SSQA):
        """Load lesson ``Lid`` from the database and index its paragraphs.

        Args:
            Lid: Lesson identifier passed to ``SSQA_DB.get_lesson_str``.
            db_path: Location of the database handed to ``SSQA_DB``.
        """
        lucene.initVM()
        self.db = SSQA_DB(db_path)
        try:
            lesson_str = self.db.get_lesson_str(Lid)
            parags = str_lesson2parags(lesson_str)

            # Index a Lesson
            myIndexer = _ChineseRamIndexer()
            myIndexer.index_lesson(parags)
            myIndexer.close()

            self.reader = DirectoryReader.open(myIndexer.indexDir)
        except Exception:
            # Do not leak the database handle when setup fails part-way.
            self.db.close()
            raise
        self.searcher = IndexSearcher(self.reader)
        self.searcher.setSimilarity(mySimilarity())
        self.analyzer = SmartChineseAnalyzer()
        logger.debug('search similarity:{}'.format(
            self.searcher.getSimilarity()))

    def __enter__(self):
        """Return the searcher itself so ``with ParagSearcher(...) as s`` works."""
        return self

    def __exit__(self, *args):
        """Release the database and the index reader on leaving ``with``."""
        self.close()

    def search(self, query_text, top_n=1):
        """Return the best-matching paragraphs for ``query_text``.

        The stripped text is handed to the query parser without
        ``QueryParser.escape``, so query-syntax characters in it are
        interpreted by the parser rather than matched literally.

        Args:
            query_text: Raw query string.
            top_n: Maximum number of hits to retrieve.

        Returns:
            A list of ``(pid, content, score)`` tuples in ranking order.
        """
        query_text = query_text.strip()
        query = QueryParser("content", self.analyzer).parse(query_text)
        scoreDocs = self.searcher.search(query, top_n).scoreDocs

        out_list = []
        for scoreDoc in scoreDocs:
            docIndex = scoreDoc.doc
            doc = self.searcher.doc(docIndex)
            log_debug(doc, logger)
            log_debug(self.searcher.explain(query, docIndex), logger)
            out_list.append((doc['pid'], doc['content'], scoreDoc.score))
        return out_list

    def close(self):
        """Close the database connection and the index reader."""
        try:
            self.db.close()
        finally:
            # Close the reader even if closing the database raised.
            self.reader.close()
class CosQASearcher:
    """Search one of the two prebuilt CosQA indexes (Chinese or English)."""

    def __init__(self, lang):
        """Open the index and pick the analyzer that belong to ``lang``.

        Args:
            lang: ``'zh'`` for the index at ``config.IDX_COS_ZH`` or ``'en'``
                for the index at ``config.IDX_COS_EN``.

        Raises:
            ValueError: If ``lang`` is neither ``'zh'`` nor ``'en'``.
        """
        lucene.initVM()

        # Reject unsupported languages before touching any index.
        if lang not in ('zh', 'en'):
            raise ValueError(
                'lang should be "zh" or "en", {} is invalid!'.format(lang))

        if lang == 'zh':
            index_location = SimpleFSDirectory(
                Paths.get(str(config.IDX_COS_ZH)))
            chosen_analyzer = SmartChineseAnalyzer()
        else:
            index_location = SimpleFSDirectory(
                Paths.get(str(config.IDX_COS_EN)))
            chosen_analyzer = EnglishAnalyzer()

        self.reader = DirectoryReader.open(index_location)
        self.searcher = IndexSearcher(self.reader)
        self.searcher.setSimilarity(mySimilarity())
        self.analyzer = chosen_analyzer
        active_similarity = self.searcher.getSimilarity()
        logger.debug('search similarity func: {}'.format(active_similarity))

    def search(self, query_text, top_n=1):
        """Return the best-matching documents for ``query_text``.

        The query text is stripped and escaped before parsing, so query
        syntax characters in it are treated literally.

        Args:
            query_text: Raw query string.
            top_n: Maximum number of hits to retrieve.

        Returns:
            A list of ``(did, title_en, content, score)`` tuples in ranking
            order.
        """
        escaped = QueryParser.escape(query_text.strip())
        query = QueryParser("content", self.analyzer).parse(escaped)
        hits = self.searcher.search(query, top_n).scoreDocs

        matches = []
        for hit in hits:
            doc_id = hit.doc
            document = self.searcher.doc(doc_id)
            # Log both the stored document and the scoring explanation.
            log_debug(document, logger)
            log_debug(self.searcher.explain(query, doc_id), logger)
            row = (document['did'], document['title_en'],
                   document['content'], hit.score)
            matches.append(row)
        return matches

    def close(self):
        """Close the underlying index reader."""
        self.reader.close()
class SearchBuilder(object):
    """Run queries against a Lucene index and emit TREC-style result rows.

    Supports plain retrieval, multi-pass pseudo relevance feedback (query
    expansion from the top hits of the current query) and explicit relevance
    feedback (query expansion from documents listed in qrels).
    """

    def __init__(self,
                 index_path,
                 field,
                 similarity="boolean",
                 use_relevance_feedback=False,
                 feedback_index_path=None):
        """Open the index and configure scoring and query parsing.

        Args:
            index_path: Filesystem path of the index that is searched.
            field: Default field used by the query parser.
            similarity: One of ``"boolean"``, ``"tf"``, ``"tfidf"`` or
                ``"BM25"``; any other value falls back to BM25(1.2, 0.2).
                The value is also used as the run id of the results.
            use_relevance_feedback: Whether to open the feedback index.
            feedback_index_path: Filesystem path of the feedback index; it
                is only opened when ``use_relevance_feedback`` is true.
        """
        self.reader = DirectoryReader.open(
            FSDirectory.open(Paths.get(index_path)))
        self.searcher = IndexSearcher(self.reader)

        # Always define the feedback attributes so that a missing feedback
        # index can be reported clearly instead of as an AttributeError.
        self.feedback_reader = None
        self.feedback_searcher = None
        if use_relevance_feedback and feedback_index_path is not None:
            self.feedback_reader = DirectoryReader.open(
                FSDirectory.open(Paths.get(feedback_index_path)))
            self.feedback_searcher = IndexSearcher(self.feedback_reader)

        self.similarity = similarity
        self.stopwords = stop_words()
        if similarity == "boolean":
            self.searcher.setSimilarity(BooleanSimilarity())
        elif similarity == "tf":
            self.searcher.setSimilarity(TFSimilarity())
        elif similarity == "tfidf":
            self.searcher.setSimilarity(ClassicSimilarity())
        elif similarity == "BM25":
            self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
        else:
            print("Unknown similarity, so we use BM25(1.2, 0.2) as default")
            self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
        analyzer = StandardAnalyzer()
        print(self.searcher.getSimilarity())
        self.parser = QueryParser(field, analyzer)

    def remove_stopwords(self, query_text):
        """Return ``query_text`` with every stopword token removed.

        Tokens are obtained by splitting on whitespace and the survivors are
        re-joined with single spaces.
        """
        return " ".join(token for token in query_text.split()
                        if token not in self.stopwords)

    def _add_term_scores(self, term_vector, doc_count, term_tf_idf):
        """Add one document's term weights to ``term_tf_idf`` in place.

        Args:
            term_vector: Term vector of the document, or ``None`` when the
                document has none (such documents contribute nothing).
            doc_count: Number of feedback documents of the current pass.
            term_tf_idf: Dict mapping term text to its accumulated weight.
        """
        if term_vector is None:
            return

        terms_ref = BytesRefIterator.cast_(term_vector.iterator())
        term_freq = {}
        # NOTE(review): this counter is rebuilt for every document, and a
        # term vector presumably lists each term once, so it appears to be
        # always 1 rather than a cross-document document frequency. The
        # formula is kept unchanged to preserve existing scores.
        term_idf = {}
        while terms_ref.next():
            term_val = TermsEnum.cast_(terms_ref)
            term_text = term_val.term().utf8ToString()
            if term_text in self.stopwords:
                continue
            term_freq[term_text] = (term_freq.get(term_text, 0) +
                                    term_val.totalTermFreq())
            term_idf[term_text] = term_idf.get(term_text, 0) + 1

        # Frequencies are normalised by the number of distinct kept terms.
        n_terms = len(term_freq)
        for term, freq in term_freq.items():
            weight = freq / n_terms * (
                1 + math.log(doc_count / (term_idf[term] + 1)))
            term_tf_idf[term] = term_tf_idf.get(term, 0) + weight

    def _to_trec_results(self, hits, query, run_id):
        """Turn ranked ``hits`` into result dicts for ``query``."""
        trec_results = []
        for rank, hit in enumerate(hits):
            doc = self.searcher.doc(hit.doc)
            trec_results.append({
                "QueryID": query["Number"],
                "Q0": "Q0",
                "DocID": doc.get(".U"),
                "Rank": str(rank + 1),
                "Score": str(hit.score),
                "RunID": run_id,
            })
        return trec_results

    def search_query(self,
                     query,
                     num_returns=50,
                     use_multipass_pseudo_relevance_feedback=False,
                     doc_counts=None,
                     add_nums=None):
        """Search for one query, optionally expanding it over several passes.

        Args:
            query: Mapping with the keys ``"description"`` (query text) and
                ``"Number"`` (query id).
            num_returns: Maximum number of results to return.
            use_multipass_pseudo_relevance_feedback: When true, the query is
                expanded once per pass with the highest-weighted terms of
                the top hits of the current query.
            doc_counts: Number of top hits inspected in each pass; defaults
                to ``[5, 9]``.
            add_nums: Number of terms appended in each pass; defaults to
                ``[2, 13]``.

        Returns:
            A list of result dicts with the keys ``QueryID``, ``Q0``,
            ``DocID``, ``Rank``, ``Score`` and ``RunID``.

        Raises:
            ValueError: If ``doc_counts`` and ``add_nums`` differ in length.
        """
        query_text = query["description"]
        print(query_text.lower())
        query_text = " ".join(tokenizer.tokenize(query_text))
        query_text = self.remove_stopwords(query_text.lower())
        print(query_text)
        query_search = self.parser.parse(query_text)

        run_id = self.similarity
        if use_multipass_pseudo_relevance_feedback:
            if doc_counts is None:
                doc_counts = [5, 9]
            if add_nums is None:
                add_nums = [2, 13]
            if len(doc_counts) != len(add_nums):
                raise ValueError("The number of pass is inconsistent!")
            run_id = (self.similarity + "-mpprf-" + str(len(doc_counts)) +
                      "passes")

            for doc_count, add_num in zip(doc_counts, add_nums):
                initial_hits = self.searcher.search(query_search,
                                                    doc_count).scoreDocs
                term_tf_idf = {}
                for initial_hit in initial_hits:
                    self._add_term_scores(
                        self.reader.getTermVector(initial_hit.doc, "text"),
                        doc_count, term_tf_idf)
                ranked_terms = sorted(term_tf_idf.items(),
                                      key=lambda x: x[1],
                                      reverse=True)
                # Stopwords were already skipped while scoring.
                final_list = [term for term, _ in ranked_terms]
                print("added query tokens:", final_list[:add_num])
                query_text = query_text + " " + " ".join(final_list[:add_num])
                query_search = self.parser.parse(query_text)

        hits = self.searcher.search(query_search, num_returns).scoreDocs
        return self._to_trec_results(hits, query, run_id)

    def search_query_with_relevance_feedback(self,
                                             query,
                                             feedback_qrels,
                                             num_returns=50,
                                             add_num=1):
        """Search for one query after expanding it from judged documents.

        Args:
            query: Mapping with the keys ``"description"`` (query text) and
                ``"Number"`` (query id).
            feedback_qrels: Iterable of mappings with the keys ``"qid"`` and
                ``"docno"``; entries whose ``"qid"`` equals the query id
                name the feedback documents.
            num_returns: Maximum number of results to return.
            add_num: Number of expansion terms appended to the query.

        Returns:
            A list of result dicts with the keys ``QueryID``, ``Q0``,
            ``DocID``, ``Rank``, ``Score`` and ``RunID``.

        Raises:
            RuntimeError: If the instance was created without a feedback
                index.
        """
        feedback_searcher = getattr(self, "feedback_searcher", None)
        feedback_reader = getattr(self, "feedback_reader", None)
        if feedback_searcher is None or feedback_reader is None:
            raise RuntimeError(
                "Relevance feedback requires use_relevance_feedback=True "
                "and a feedback_index_path when building SearchBuilder")

        query_text = query["description"]
        print(query_text)
        query_text = " ".join(tokenizer.tokenize(query_text))
        query_text = self.remove_stopwords(query_text.lower())
        print(query_text)

        query_number = query["Number"]
        qrel_doc_ids = [
            qrel["docno"] for qrel in feedback_qrels
            if qrel["qid"] == query_number
        ]
        term_tf_idf = {}
        doc_count = len(qrel_doc_ids)
        for qrel_doc_id in qrel_doc_ids:
            initial_hit = feedback_searcher.search(
                TermQuery(Term(".U", qrel_doc_id)), 1).scoreDocs
            if len(initial_hit) == 0:
                continue
            # The hit's doc id belongs to the feedback index, so its term
            # vector must be read from the feedback reader as well.
            self._add_term_scores(
                feedback_reader.getTermVector(initial_hit[0].doc, "text"),
                doc_count, term_tf_idf)

        ranked_terms = sorted(term_tf_idf.items(),
                              key=lambda x: x[1],
                              reverse=True)
        # Skip numbers and terms the query already contains; stopwords were
        # already skipped while scoring.
        query_tokens = set(query_text.split(" "))
        final_list = [
            term for term, _ in ranked_terms
            if not str(term).isnumeric() and term not in query_tokens
        ]
        print(final_list[:add_num])
        query_text = query_text + " " + " ".join(final_list[:add_num])
        print(query_text)

        query_search = self.parser.parse(query_text)
        hits = self.searcher.search(query_search, num_returns).scoreDocs
        return self._to_trec_results(hits, query, self.similarity)

    # Disabled embedding-based variants, kept for reference.
    #
    # def search_query_with_glove(self, query, doc_vectors, num_returns=50, index2word_set=None):
    #     query_text = query["description"]
    #     query_text = " ".join(word_tokenize(query_text))
    #     query_text = self.remove_stopwords(query_text)
    #     query_vec = avg_feature_vector(query_text, model=glove_vectors, num_features=300, index2word_set=index2word_set)
    #     doc_similarity = {}
    #     for doc_id in tqdm(doc_vectors, desc="compute doc similarity:", total=len(doc_vectors.items())):
    #         doc_similarity[doc_id] = 1 - spatial.distance.cosine(query_vec, doc_vectors[doc_id])
    #     doc_similarity = sorted(doc_similarity.items(), key=lambda x: x[1], reverse=True)[:num_returns]
    #     trec_results = []
    #     for i, doc_id in tqdm(enumerate(doc_similarity), desc="output results:", total=len(doc_similarity)):
    #         trec_result = {"QueryID": query["Number"],
    #                        "Q0": "Q0",
    #                        "DocID": doc_id[0],
    #                        "Rank": str(i + 1),
    #                        "Score": str(doc_id[1]),
    #                        "RunID": self.similarity+"+embedding"}
    #         trec_results.append(trec_result)
    #     return trec_results
    #
    # def search_query_with_transformers(self, query, doc_vectors, num_returns=50):
    #     query_text = query["description"]
    #     query_text = " ".join(word_tokenize(query_text))
    #     query_text = self.remove_stopwords(query_text)
    #     query_vec = distilroberta_model.encode(query_text, convert_to_tensor=True)
    #     doc_similarity = {}
    #     for doc_id in tqdm(doc_vectors, desc="compute doc similarity:", total=len(doc_vectors.items())):
    #         doc_similarity[doc_id] = util.pytorch_cos_sim(query_vec, doc_vectors[doc_id])
    #     doc_similarity = sorted(doc_similarity.items(), key=lambda x: x[1], reverse=True)[:num_returns]
    #     trec_results = []
    #     for i, doc_id in tqdm(enumerate(doc_similarity), desc="output results:", total=len(doc_similarity)):
    #         trec_result = {"QueryID": query["Number"],
    #                        "Q0": "Q0",
    #                        "DocID": doc_id[0],
    #                        "Rank": str(i + 1),
    #                        "Score": str(doc_id[1]),
    #                        "RunID": self.similarity+"+embedding"}
    #         trec_results.append(trec_result)
    #     return trec_results

    def get_results_from_queries(self,
                                 queries,
                                 num_returns=50,
                                 use_pseudo_relevance_feedback=False):
        """Run :meth:`search_query` for every query and concatenate the rows.

        Args:
            queries: Iterable of query mappings.
            num_returns: Maximum number of results per query.
            use_pseudo_relevance_feedback: Forwarded to ``search_query`` as
                its multi-pass pseudo relevance feedback switch.

        Returns:
            A single list holding the result dicts of all queries, in query
            order.
        """
        trec_results = []
        for query in queries:
            trec_results.extend(
                self.search_query(query, num_returns,
                                  use_pseudo_relevance_feedback))
        return trec_results

    #
    # def get_results_from_queries_with_pretrained_embedding_similariy(self, queries, doc_vectors, num_returns=50):
    #     trec_results = []
    #     for query in tqdm(queries, desc="queries", total=len(queries)):
    #         search_results = self.search_query_with_glove(query, doc_vectors, num_returns)
    #         trec_results = trec_results + search_results
    #     return trec_results
    #
    # def get_results_from_queries_with_transformers(self, queries, doc_vectors, num_returns=50):
    #     trec_results = []
    #     for query in tqdm(queries, desc="queries", total=len(queries)):
    #         search_results = self.search_query_with_transformers(query, doc_vectors, num_returns)
    #         trec_results = trec_results + search_results
    #     return trec_results

    def get_results_from_queries_with_relevance_feedback(
            self, queries, feedback_qrels, num_returns=50):
        """Run relevance-feedback search for every query and concatenate.

        Args:
            queries: Iterable of query mappings.
            feedback_qrels: Relevance judgements forwarded to
                ``search_query_with_relevance_feedback``.
            num_returns: Maximum number of results per query.

        Returns:
            A single list holding the result dicts of all queries, in query
            order.
        """
        trec_results = []
        for query in queries:
            trec_results.extend(
                self.search_query_with_relevance_feedback(
                    query, feedback_qrels, num_returns=num_returns))
        return trec_results