def getIndexSearcher(self):
    indexSearcher = IndexSearcher(self.mIndexReader)
    if self.mSimilarity is not None:
        indexSearcher.setSimilarity(self.mSimilarity)
    return indexSearcher
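A minimal, hypothetical sketch of how a helper like the one above might be wired up: the class name `SimpleSearcherFactory`, the index path, and the choice of `BM25Similarity` are assumptions for illustration, not part of the original snippet; the import layout assumes PyLucene 4+ where similarities live under `org.apache.lucene.search.similarities`.

```python
# Hypothetical wiring for a getIndexSearcher-style helper (PyLucene 4+ assumed).
import lucene
from java.nio.file import Paths
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search.similarities import BM25Similarity
from org.apache.lucene.store import SimpleFSDirectory


class SimpleSearcherFactory:
    def __init__(self, index_dir, similarity=None):
        # index_dir is a placeholder path to an existing Lucene index
        self.mIndexReader = DirectoryReader.open(
            SimpleFSDirectory(Paths.get(index_dir)))
        self.mSimilarity = similarity  # e.g. BM25Similarity(); None keeps Lucene's default

    def getIndexSearcher(self):
        indexSearcher = IndexSearcher(self.mIndexReader)
        if self.mSimilarity is not None:
            indexSearcher.setSimilarity(self.mSimilarity)
        return indexSearcher


if __name__ == '__main__':
    lucene.initVM()
    factory = SimpleSearcherFactory('/path/to/index', BM25Similarity())
    searcher = factory.getIndexSearcher()
```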
class SSQA_S_Searcher:
    def __init__(self, indexDir, analyzer):
        lucene.initVM()
        self.reader = DirectoryReader.open(indexDir)
        self.searcher = IndexSearcher(self.reader)
        self.searcher.setSimilarity(mySimilarity())
        self.analyzer = analyzer
        logger.debug("Search similarity func: {}".format(
            self.searcher.getSimilarity()))

    def search(self, query_text, top_n):
        query_text = query_text.strip()
        query = QueryParser("content", self.analyzer).parse(
            QueryParser.escape(query_text))
        scoreDocs = self.searcher.search(query, top_n).scoreDocs

        count = 0
        out_list = []
        for scoreDoc in tqdm(scoreDocs):
            docIndex = scoreDoc.doc
            doc = self.searcher.doc(docIndex)
            log_debug(doc, logger)
            log_debug(self.searcher.explain(query, docIndex), logger)
            out_list.append(doc['content'])
            count += 1
        logger.info("Added {} sentences".format(count))
        return out_list

    def close(self):
        self.reader.close()
def process_q_test(q, out_q):
    lucene.initVM()
    lucene.getVMEnv().attachCurrentThread()
    index = DirectoryReader.open(SimpleFSDirectory(
        Paths.get(robust_index_dir)))
    searcher = IndexSearcher(index)
    searcher.setSimilarity(BM25Similarity())
    analyzer = EnglishAnalyzer()
    qparser = QueryParser("contents", analyzer)
    preprocessor = Preprocess()

    while not exitFlag:
        qid, query = q.get()
        tname = multiprocessing.current_process().name
        # print(tname, qid, query)
        if query == "DONE":
            break
        try:
            # dids, scores = get_lm_matched_docs(query, searcher, qparser, 2000)
            # if len(dids) >= 10:
            #     out_q.put((qid, dids, scores))
            dids_text = get_lm_doc_snippets(query, searcher, qparser,
                                            analyzer, preprocessor)
            out_q.put((qid, dids_text))
        except:
            print('%s exception %s, %s' % (tname, qid, query))
class IndexAndTaxonomy(object):

    def __init__(self, settings, indexDirectory=None, taxoDirectory=None):
        self._settings = settings
        self._similarity = settings.similarity
        self._numberOfConcurrentTasks = settings.numberOfConcurrentTasks
        self._reader = DirectoryReader.open(indexDirectory)
        self.taxoReader = DirectoryTaxonomyReader(taxoDirectory)
        self._readerSettingsWrapper = ReaderSettingsWrapper()
        self._readerSettingsWrapper.get = lambda: {
            "similarity": self.searcher.getSimilarity().toString(),
            "numberOfConcurrentTasks": self._numberOfConcurrentTasks
        }
        self._readerSettingsWrapper.set = self._setReadSettings
        self._searcher = None
        self._executor = None
        self._reopenSearcher = True

    def reopen(self):
        reader = DirectoryReader.openIfChanged(self._reader)
        if reader is None:
            return
        self._reader.close()
        self._reader = reader
        self._reopenSearcher = True
        taxoReader = DirectoryTaxonomyReader.openIfChanged(self.taxoReader)
        if taxoReader is None:
            return
        self.taxoReader.close()
        self.taxoReader = taxoReader

    @property
    def searcher(self):
        if not self._reopenSearcher:
            return self._searcher
        if self._settings.multithreaded:
            if self._executor:
                self._executor.shutdown()
            self._executor = Executors.newFixedThreadPool(
                self._numberOfConcurrentTasks)
            self._searcher = SuperIndexSearcher(self._reader, self._executor,
                                                self._numberOfConcurrentTasks)
        else:
            self._searcher = IndexSearcher(self._reader)
        self._searcher.setSimilarity(self._similarity)
        self._reopenSearcher = False
        return self._searcher

    def _setReadSettings(self, similarity=None, numberOfConcurrentTasks=None):
        # This method must be thread-safe
        if similarity is None:
            self._similarity = self._settings.similarity
        else:
            self._similarity = BM25Similarity(similarity["k1"], similarity["b"])
        if numberOfConcurrentTasks is None:
            self._numberOfConcurrentTasks = self._settings.numberOfConcurrentTasks
        else:
            self._numberOfConcurrentTasks = numberOfConcurrentTasks
        self._reopenSearcher = True

    def close(self):
        self.taxoReader.close()
        self._reader.close()
def config():
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    bm25Sim = BM25Similarity(2.0, 0.75)  # BM25 with k1 = 2.0, b = 0.75 (Lucene defaults are k1 = 1.2, b = 0.75)
    searcher.setSimilarity(bm25Sim)
    analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT)
    return searcher, analyzer
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    multifield: different query string for different field
    not same word on different field
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type)  # feature_type is a list of function

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur
    # MUST implies that the keyword must occur
    # SHOULD implies that the keyword SHOULD occur
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
def run(self):
    print("Starting " + self.name)
    lucene.getVMEnv().attachCurrentThread()
    index = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(robust_index_dir)))
    searcher = IndexSearcher(index)
    searcher.setSimilarity(BM25Similarity())
    analyzer = EnglishAnalyzer()
    qparser = QueryParser("contents", analyzer)
    # process_query(self.name, self.q, self.out_q, searcher, qparser)
    print("Exiting " + self.name)
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        # feature_type is a list of function
        return map(lambda f: f(doc_score_list), feature_type) \
            if len(doc_score_list) != 0 else [0] * len(feature_type)

    # escape special characters via escape function
    query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
class DocRepo:

    def __init__(self):
        # self.analyzer = StandardAnalyzer()
        # self.analyzer = PersianAnalyzer(StopFilter.makeStopSet(sw))
        # self.analyzer = PersianAnalyzer()
        self.analyzer = StopAnalyzer(Paths.get(Config.stop_words_address))
        self.config = IndexWriterConfig(self.analyzer)
        self.index = RAMDirectory()
        self.w = IndexWriter(self.index, self.config)

    def addDocument(self, id):
        global answers_train
        preA = answers_train[id]
        doc = Document()
        doc.add(TextField("pa", preA, Field.Store.YES))
        doc.add(StringField("id", str(id), Field.Store.YES))
        self.w.addDocument(doc)
        self.w.commit()

    def __del__(self):
        self.w.close()

    def get_most_similar(self, sentence, do_log=False):
        # print('query string is', string)
        # q = QueryParser('pa', self.analyzer).parse(sentence)
        query_builder = BooleanQuery.Builder()
        for token in sentence.split(' '):
            if token not in sw:
                qtq = TermQuery(Term("pa", token))
                query_builder.add(
                    BooleanClause(qtq, BooleanClause.Occur.SHOULD))
        q = query_builder.build()

        hitsPerPage = 2
        reader = DirectoryReader.open(self.w)
        self.searcher = IndexSearcher(reader)
        simi = BM25Similarity(Config.k1, Config.b)
        # simi = ClassicSimilarity()
        self.searcher.setSimilarity(simi)

        docs = self.searcher.search(q, hitsPerPage)
        hits = docs.scoreDocs
        # print("Found " + str(len(hits)) + " hits.")
        if len(hits) > 0:
            mate = self.searcher.doc(hits[0].doc).get("id")
            if do_log:
                print("found something. mate: ", mate, "- score : ", hits[0].score)
            return hits[0], int(mate)
        else:
            return None, -1
def __init__(self, index_dir, index_file, rawQuery):
    self.indexFile = os.path.join(index_dir, index_file)
    # lucene.initVM(vmargs=['-Djava.awt.headless=true'])  # uncomment when run Retrieve separately
    directory = SimpleFSDirectory(File(self.indexFile))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    searcher.setSimilarity(BM25Similarity(1.2, 0.75))  # set BM25 as the similarity metric, k1=1.2, b=0.75

    if 'Standard' in self.indexFile:
        print "Use the StandardAnalyzer"
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)  # build a standard analyzer with default stop words
    if 'Porter' in self.indexFile:
        print "Use the PorterStemmer analyzer"
        analyzer = PorterStemmerAnalyzer()

    self.run(searcher, analyzer, rawQuery)
    del searcher
def main():
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    queries = makeQueryList(args["queryFile"])
    print 'lucene', lucene.VERSION
    print "\n"
    directory = SimpleFSDirectory(Paths.get(os.getcwd(), INDEX_DIR))
    print directory.getDirectory()
    searcher = IndexSearcher(DirectoryReader.open(directory))
    searcher.setSimilarity(ClassicSimilarity())
    analyzer = StandardAnalyzer()
    run(searcher, analyzer, queries)
    del searcher
def find(self, query):
    transformer = StringTransformer()
    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity())

    processed_query = ' '.join(
        self._preprocessor(transformer.transform(query)))
    query = QueryParser(Version.LUCENE_CURRENT, "content",
                        analyzer).parse(processed_query)
    hits = searcher.search(query, 10)

    result_list = []
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        result_list.append(doc.get("path").encode("utf-8"))

    return result_list
def lucene_retrieval(q_string, use_BM25=False):
    """
    :param q_string:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def doc_text(hists):
        """
        return doc_name & score
        :param hists:
        """
        text = '_NONE_'
        for h in hists:
            docID = h.doc
            doc = searcher.doc(docID)
            # file_name = doc.get("corpus_name")
            # doc_name = doc.get("doc_name")
            text = doc.get("text")
            # score = h.score
            # yield (file_name, doc_name, score, text)
        return text

    result = '_NONE_'
    # escape special characters via escape function
    if q_string and q_string.strip():  # when pre-process answers, `none of the above` -> '' cause error here
        # print(q_string)
        query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

        # search
        reader = IndexReader.open(index)
        searcher = IndexSearcher(reader)

        if use_BM25:
            searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

        collector = TopScoreDocCollector.create(hitsPerPage, True)
        searcher.search(query, collector)
        hs = collector.topDocs().scoreDocs  # hists

        result = doc_text(hs)
        # reader.close()
    return result  # text: also nodes
def get_sorted_results(self, query):
    SHOULD = BooleanClause.Occur.SHOULD
    parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, query,
                                               ['docno', 'content'],
                                               [SHOULD, SHOULD], self.analyzer)

    reader = IndexReader.open(self.directory)
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity())

    topDocs = searcher.search(parsed_query, 10)
    j = 0
    for i in topDocs.scoreDocs:
        d = searcher.doc(i.doc)
        print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(i.score)
        j += 1
class ParagSearcher:
    def __init__(self, Lid, db_path=config.DB_SSQA):
        lucene.initVM()
        self.db = SSQA_DB(db_path)
        lesson_str = self.db.get_lesson_str(Lid)
        parags = str_lesson2parags(lesson_str)

        # Index a Lesson
        myIndexer = _ChineseRamIndexer()
        myIndexer.index_lesson(parags)
        myIndexer.close()

        self.reader = DirectoryReader.open(myIndexer.indexDir)
        self.searcher = IndexSearcher(self.reader)
        self.searcher.setSimilarity(mySimilarity())
        self.analyzer = SmartChineseAnalyzer()
        logger.debug('search similarity:{}'.format(
            self.searcher.getSimilarity()))

    def __exit__(self, *args):
        self.close()

    def search(self, query_text, top_n=1):
        query_text = query_text.strip()
        # query = QueryParser("content", self.analyzer).parse(QueryParser.escape(query_text.strip()))
        query = QueryParser("content", self.analyzer).parse(query_text)
        scoreDocs = self.searcher.search(query, top_n).scoreDocs

        out_list = []
        for scoreDoc in scoreDocs:
            docIndex = scoreDoc.doc
            doc = self.searcher.doc(docIndex)
            log_debug(doc, logger)
            log_debug(self.searcher.explain(query, docIndex), logger)
            out_list.append((doc['pid'], doc['content'], scoreDoc.score))
        return out_list

    def close(self):
        self.db.close()
        self.reader.close()
class CosQASearcher:
    def __init__(self, lang):
        lucene.initVM()
        if lang == 'zh':
            indexDir = SimpleFSDirectory(Paths.get(str(config.IDX_COS_ZH)))
            analyzer = SmartChineseAnalyzer()
        elif lang == 'en':
            indexDir = SimpleFSDirectory(Paths.get(str(config.IDX_COS_EN)))
            analyzer = EnglishAnalyzer()
        else:
            raise ValueError(
                'lang should be "zh" or "en", {} is invalid!'.format(lang))

        self.reader = DirectoryReader.open(indexDir)
        self.searcher = IndexSearcher(self.reader)
        self.searcher.setSimilarity(mySimilarity())
        self.analyzer = analyzer
        logger.debug('search similarity func: {}'.format(
            self.searcher.getSimilarity()))

    def search(self, query_text, top_n=1):
        query_text = query_text.strip()
        query = QueryParser("content", self.analyzer).parse(
            QueryParser.escape(query_text))
        # query = QueryParser("content", self.analyzer).parse(query_text)
        scoreDocs = self.searcher.search(query, top_n).scoreDocs

        out_list = []
        for scoreDoc in scoreDocs:
            docIndex = scoreDoc.doc
            doc = self.searcher.doc(docIndex)
            log_debug(doc, logger)
            log_debug(self.searcher.explain(query, docIndex), logger)
            out_list.append(
                (doc['did'], doc['title_en'], doc['content'], scoreDoc.score))
        return out_list

    def close(self):
        self.reader.close()
class QuestionLuceneSearch(): def __init__(self): self.env = lucene.initVM(initialheap='6g', maxheap='6g', vmargs=['-Djava.awt.headless=true']) self.vocab = None BooleanQuery.setMaxClauseCount(2048) if not os.path.exists(prm.index_folder): print('Creating index at', prm.index_folder) if prm.docs_path == prm.docs_path_term: add_terms = True else: add_terms = False self.create_index(prm.index_folder, prm.docs_path, add_terms) if prm.local_index_folder: print('copying index from', prm.index_folder, 'to', prm.local_index_folder) if os.path.exists(prm.local_index_folder): print('Folder', prm.local_index_folder, 'already exists! Doing nothing.') else: shutil.copytree(prm.index_folder, prm.local_index_folder) self.index_folder = prm.local_index_folder else: self.index_folder = prm.index_folder fsDir = MMapDirectory(Paths.get(prm.index_folder)) self.searcher = IndexSearcher(DirectoryReader.open(fsDir)) self.searcher.setSimilarity(BM25Similarity()) if prm.docs_path != prm.docs_path_term: if not os.path.exists(prm.index_folder_term): print('Creating index at', prm.index_folder_term) self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True) if prm.local_index_folder_term: print('copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term) if os.path.exists(prm.local_index_folder_term): print('Folder', prm.local_index_folder_term, 'already exists! Doing nothing.') else: shutil.copytree(prm.index_folder_term, prm.local_index_folder_term) self.index_folder_term = prm.local_index_folder_term else: self.index_folder_term = prm.index_folder_term fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term)) self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term)) self.analyzer = StandardAnalyzer() self.pool = ThreadPool(processes=prm.n_threads) self.cache = {} print('Loading Text-ID mapping...') self.text_id_map, self.id_text_map = self.get_text_id_map() def get_text_id_map(self): # get number of docs n_docs = self.searcher.getIndexReader().numDocs() text_id = {} id_text = {} query = MatchAllDocsQuery() hits = self.searcher.search(query, n_docs) for hit in hits.scoreDocs: doc = self.searcher.doc(hit.doc) idd = int(doc['id']) text = doc['text'] text_id[text] = idd id_text[idd] = text return text_id, id_text # def add_doc(self, doc_id, title, txt, add_terms): def add_doc(self, doc_id, txt, add_terms): doc = Document() txt = utils.clean(txt) if add_terms: txt_ = txt.lower() words_idx, words = utils.text2idx2([txt_], self.vocab, prm.max_terms_per_doc) words_idx = words_idx[0] words = words[0] doc.add(Field("id", str(doc_id), self.t1)) # doc.add(Field("title", title, self.t1)) doc.add(Field("text", txt, self.t2)) if add_terms: doc.add(Field("word_idx", ' '.join(map(str,words_idx)), self.t3)) doc.add(Field("word", '<&>'.join(words), self.t3)) self.writer.addDocument(doc) def create_index(self, index_folder, docs_path, add_terms=False): print('Loading Vocab...') if not self.vocab: self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words) os.mkdir(index_folder) self.t1 = FieldType() self.t1.setStored(True) self.t1.setIndexOptions(IndexOptions.DOCS) self.t2 = FieldType() self.t2.setStored(False) self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS) self.t3 = FieldType() self.t3.setStored(True) self.t3.setIndexOptions(IndexOptions.NONE) fsDir = MMapDirectory(Paths.get(index_folder)) writerConfig = IndexWriterConfig(StandardAnalyzer()) self.writer = IndexWriter(fsDir, writerConfig) print("%d docs in index" % self.writer.numDocs()) print("Indexing documents...") # import 
corpus_hdf5 # corpus = corpus_hdf5.MSMARCOCorpusHDF5(docs_path) import pickle with open(docs_path, "rb") as read_file: corpus = pickle.load(read_file) idx_cnt = 0 # for doc_id, txt in zip(corpus.get_id_iter(), corpus.get_text_iter()): # for doc_id, txt in corpus.items(): for txt in corpus: self.add_doc(idx_cnt, txt, add_terms) # not lowered if idx_cnt % 1000 == 0: print('indexing doc', idx_cnt) idx_cnt += 1 print("Index of %d docs..." % self.writer.numDocs()) self.writer.close() def search_multithread(self, qs, max_cand, max_full_cand, searcher): self.max_cand = max_cand self.max_full_cand = max_full_cand self.curr_searcher = searcher out = self.pool.map(self.search_multithread_part, qs) return out def search_multithread_part(self, q): if not self.env.isCurrentThreadAttached(): self.env.attachCurrentThread() if q in self.cache: return self.cache[q] else: try: q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT') query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) except: print('Unexpected error when processing query:', str(q)) print('Using query "dummy".') q = 'dummy' query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) c = OrderedDict() hits = self.curr_searcher.search(query, self.max_cand) for i, hit in enumerate(hits.scoreDocs): doc = self.curr_searcher.doc(hit.doc) if i < self.max_full_cand: word_idx = list(map(int, doc['word_idx'].split(' '))) word = doc['word'].split('<&>') else: word_idx = [] word = [] # c[int(doc['id'])] = [word_idx, word] c[int(doc['id'])] = [word_idx, word, hit.score] # print(c) return c def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher): out = [] for q in qs: if q in self.cache: out.append(self.cache[q]) else: try: q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT') query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) except: print('Unexpected error when processing query:', str(q)) print('Using query "dummy".') query = QueryParser("text", self.analyzer).parse(QueryParser.escape('dummy')) c = OrderedDict() hits = curr_searcher.search(query, max_cand) for i, hit in enumerate(hits.scoreDocs): doc = curr_searcher.doc(hit.doc) if i < max_full_cand: word_idx = list(map(int, doc['word_idx'].split(' '))) word = doc['word'].split('<&>') else: word_idx = [] word = [] # c[int(doc['id'])] = [word_idx, word] c[int(doc['id'])] = [word_idx, word, hit.score] out.append(c) return out def get_candidates(self, qs, max_cand, max_full_cand=None, save_cache=False, extra_terms=True): if not max_full_cand: max_full_cand = max_cand if prm.docs_path != prm.docs_path_term: max_cand2 = 0 else: max_cand2 = max_full_cand if prm.n_threads > 1: out = self.search_multithread(qs, max_cand, max_cand2, self.searcher) if (prm.docs_path != prm.docs_path_term) and extra_terms: terms = self.search_multithread(qs, max_full_cand, max_full_cand, self.searcher_term) else: out = self.search_singlethread(qs, max_cand, max_cand2, self.searcher) if (prm.docs_path != prm.docs_path_term) and extra_terms: terms = self.search_singlethread(qs, max_full_cand, max_full_cand, self.searcher_term) if (prm.docs_path != prm.docs_path_term) and extra_terms: for outt, termss in zip(out, terms): for cand_id, term in zip(list(outt.keys())[:max_full_cand], list(termss.values())): outt[cand_id] = term if save_cache: for q, c in zip(qs, out): if q not in self.cache: self.cache[q] = c return out def get_pair_scores(self, q, doc_int, save_cache=False, extra_terms=True): # if prm.n_threads > 1: # out = 
self.search_pair_score_multithread(qs_trailing_doc, self.searcher) # if (prm.docs_path != prm.docs_path_term) and extra_terms: # terms = self.search_pair_score_multithread(qs_trailing_doc, self.searcher_term) # else: # out = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher) # if (prm.docs_path != prm.docs_path_term) and extra_terms: # terms = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher_term) out = [] try: q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT') query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q)) except: print('Unexpected error when processing query:', str(q)) print('Using query "dummy".') q = 'dummy' query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q)) c = OrderedDict() exp = self.searcher.explain(query, doc_int) c[1] = exp out.append(c) return out def search_pair_score_singlethread(self, q, doc_int, searcher): out = [] try: q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT') query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q)) except: print('Unexpected error when processing query:', str(q)) print('Using query "dummy".') q = 'dummy' query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q)) c = OrderedDict() exp = searcher.explain(query, doc_int) c[1] = exp out.append(c) return out def search_pair_score_multithread(self, qs_trailing_doc, searcher): self.curr_searcher = searcher # out = self.pool.map(self.search_pair_score_multithread_part, product(qs,doc_int)) out = self.pool.map(self.search_pair_score_multithread_part, qs_trailing_doc) return out def search_pair_score_multithread_part(self, q_doc_int): # print(q_doc_int) spl=q_doc_int.split('<|endoftext|>') q = spl[0] print(q) doc_int = int(spl[1]) print(doc_int) if not self.env.isCurrentThreadAttached(): self.env.attachCurrentThread() try: q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT') query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) except: print('Unexpected error when processing query:', str(q)) print('Using query "dummy".') q = 'dummy' query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q)) c = OrderedDict() exp = self.curr_searcher.explain(query, doc_int) c[1] = exp return c
        total_recall += recall
        total_precision += precision
        total_FB += FB
        print '%3s Recall: %.6f Precision: %.6f FB: %.6f' % (qid, recall, precision, FB)

    query_data_length = len(query_data)
    avg_recall = total_recall / query_data_length
    avg_precision = total_precision / query_data_length
    avg_FB = total_FB / query_data_length
    print 'Avg Recall: %.6f Avg Precision: %.6f Avg FB: %.6f' % (avg_recall, avg_precision, avg_FB)


if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    # Available similarity: BM25Similarity, MultiSimilarity, PerFieldSimilarityWrapper, SimilarityBase, TFIDFSimilarity
    searcher.setSimilarity(similarities.BM25Similarity())
    # analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    analyzer = MyAnalyzer(Version.LUCENE_CURRENT)
    fs = FileSearcher(searcher, analyzer)

    if len(sys.argv) < 2:
        fs.perform_user_query(searcher, analyzer)
    else:
        fs.results_comparison(searcher, analyzer, sys.argv[1])

    del searcher
def __recs_query(self, positive_rated_document_list, scores, recs_number, items_directory, candidate_list: List) -> pd.DataFrame: """ Builds a query using the contents that the user liked. The terms relative to the contents that the user liked are boosted by the rating he/she gave. A filter clause is added to the query to consider only candidate items Args: positive_rated_document_list: List of contents that the user liked scores: Ratings given by the user recs_number: How many items must be recommended. You can only specify the number, not a specific item for which compute the prediction items_directory: Directory where the items are stored Returns: score_frame (pd.DataFrame): DataFrame containing the recommendations for the user """ BooleanQuery.setMaxClauseCount(2000000) searcher = IndexSearcher( DirectoryReader.open(SimpleFSDirectory( Paths.get(items_directory)))) if self.__classic_similarity: searcher.setSimilarity(ClassicSimilarity()) field_list = searcher.doc(positive_rated_document_list[0]).getFields() user_fields = {} field_parsers = {} analyzer = SimpleAnalyzer() for field in field_list: if field.name() == 'content_id': continue user_fields[field.name()] = field.stringValue() field_parsers[field.name()] = QueryParser(field.name(), analyzer) positive_rated_document_list.remove(positive_rated_document_list[0]) for _ in positive_rated_document_list: for field in field_list: if field.name() == 'content_id': continue user_fields[field.name()] += field.stringValue() logger.info("Building query") query_builder = BooleanQuery.Builder() for score in scores: for field_name in user_fields.keys(): if field_name == 'content_id': continue field_parsers[field_name].setDefaultOperator( QueryParser.Operator.OR) field_query = field_parsers[field_name].escape( user_fields[field_name]) field_query = field_parsers[field_name].parse(field_query) field_query = BoostQuery(field_query, score) query_builder.add(field_query, BooleanClause.Occur.SHOULD) if candidate_list is not None: id_query_string = ' OR '.join("content_id:\"" + content_id + "\"" for content_id in candidate_list) id_query = QueryParser("testo_libero", KeywordAnalyzer()).parse(id_query_string) query_builder.add(id_query, BooleanClause.Occur.MUST) query = query_builder.build() docs_to_search = len(positive_rated_document_list) + recs_number scoreDocs = searcher.search(query, docs_to_search).scoreDocs logger.info("Building score frame to return") recorded_items = 0 columns = ['to_id', 'rating'] score_frame = pd.DataFrame(columns=columns) for scoreDoc in scoreDocs: if recorded_items >= recs_number: break if scoreDoc.doc not in positive_rated_document_list: doc = searcher.doc(scoreDoc.doc) item_id = doc.getField("content_id").stringValue() recorded_items += 1 score_frame = pd.concat([ score_frame, pd.DataFrame.from_records([(item_id, scoreDoc.score)], columns=columns) ]) return score_frame
class Searcher(object):
    # Search helper class
    def __init__(self, indexDir, computeLengthNorm=True):
        # Initialize: indexDir - index directory; computeLengthNorm - whether to apply the custom
        # similarity (True - do not apply, False - apply)
        # if not jpype.isJVMStarted():
        #     lucene.initVM()
        lucene.getVMEnv().attachCurrentThread()
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)  # standard analyzer; for English it splits on delimiters
        self.path = os.path.join(INDEX_PATH, indexDir)  # storage path
        self.store = SimpleFSDirectory(File(self.path))  # storage directory
        # self.reader = DirectoryReader.open(self.store)
        self.reader = IndexReader.open(self.store)
        self.numDocs = self.reader.maxDoc()
        self.searcher = IndexSearcher(self.reader)

        sim = CustomSimilarity()  # addby zmq
        if not computeLengthNorm:
            sim = CustomSimilarity()
            self.searcher.setSimilarity(sim)

        self.mlt = MoreLikeThis(self.reader, sim)
        self.mlt.setAnalyzer(self.analyzer)
        self.mlt.setMinTermFreq(1)
        self.mlt.setMinDocFreq(1)
        # debug
        self.mlt.setMinWordLen(1)
        self.mlt.setMaxNumTokensParsed(100000000)
        BooleanQuery.setMaxClauseCount(1024 * 1024)  # raise the max clause count (fix for "too many clauses")
        # debug

    def searchKeyWords(self, key_value, max_num):
        # Search documents whose text field matches the given string
        key_value = str(key_value.encode('utf-8'))
        if type(key_value) != type('') or len(key_value) == 0:
            raise Exception('Please provide a string.')
        # term = ('text', key_value)
        # termquery = TermQuery(Term(*term))
        query = QueryParser(Version.LUCENE_CURRENT, "title", self.analyzer).parse(key_value)
        # query = FuzzyQuery(Version.LUCENE_CURRENT, "title", self.analyzer).parse(key_value)  # fuzzy query
        self.query = query
        results = self.searcher.search(query, max_num)
        result_list = []
        # print len(results.scoreDocs), results.scoreDocs
        for each_result in results.scoreDocs:
            docid = each_result.doc
            result_list.append(self.searcher.doc(docid)['key'])
        return result_list

    def getDocID(self, dictID):
        # Return the doc id hit by a given term; dictID is a single key-value pair
        # (e.g. key 'title', value seller_id)
        if len(dictID) != 1:
            raise Exception('Please provide a dict with one pair of field and value.')
        term = dictID.items()[0]
        termquery = TermQuery(Term(*term))
        self.query = termquery
        results = self.searcher.search(termquery, 10)
        try:
            if len(results.scoreDocs) < 1:
                return None
            docid = results.scoreDocs[0].doc
            print 'id:', docid
            return docid
        except Exception, e:
            logger.error('Doc not found: %s', str(dictID))
            raise Exception('Doc not found: %s', str(dictID))
class SearchEngine(object):
    def __init__(self, root, storedir, isindexing=False, isBM25=True):
        if not os.path.exists(storedir):
            os.mkdir(storedir)
        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)

        if isindexing:
            store = SimpleFSDirectory(Paths.get(storedir))
            config = IndexWriterConfig(self.analyzer)
            # TODO BM25 parameter tuning
            if isBM25:
                config.setSimilarity(BM25Similarity())
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
            writer = IndexWriter(store, config)
            self.indexer(root, writer)
            ticker = Ticker()
            print('commit index')
            threading.Thread(target=ticker.run).start()
            writer.commit()
            writer.close()
            ticker.tick = False
            print('done')

        search_dir = SimpleFSDirectory(Paths.get(storedir))
        self.searcher = IndexSearcher(DirectoryReader.open(search_dir))
        if isBM25:
            self.searcher.setSimilarity(BM25Similarity())

    def indexer(self, root, writer):
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        def repalcer(text):
            chars = '\\`*_{}[]()>#+-.!$‘'
            for c in chars:
                if c in text:
                    text = text.replace(c, ' ')
            return text

        for root, dirnames, filenames in os.walk(root):
            i = 0
            for filename in filenames:
                i += 1
                with open(os.path.join(root, filename)) as f:
                    for line in f.readlines():
                        line = line.split(' ', 2)
                        docname = line[0] + ' ' + line[1]
                        name = repalcer(line[0])
                        contents = line[2]
                        doc = Document()
                        doc.add(Field('docname', docname, t1))
                        doc.add(Field('name', name, t1))
                        doc.add(Field('contents', contents, t1))
                        writer.addDocument(doc)
                print('File %d done indexing' % i)

    def search(self, query, topk=10):
        qp = PythonMultiFieldQueryParser(['name', 'contents'], self.analyzer)
        query = qp.parse(
            query, ['name', 'contents'],
            [BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD],
            self.analyzer)
        # print(query)
        scores = self.searcher.search(query, topk).scoreDocs
        # print('%s total matching documents.' % len(scores))

        docnames = []
        doccontents = []
        for score in scores:
            doc = self.searcher.doc(score.doc)
            docnames.append(doc.get('docname'))
            doccontents.append(doc.get('contents'))
        return docnames, doccontents

    def retrieve(self, term, sid):
        query = term + ' ' + str(sid)
        query = QueryParser.escape(query)
        query = QueryParser('docname', self.analyzer).parse(query)
        score = self.searcher.search(query, 1).scoreDocs
        doc = self.searcher.doc(score[0].doc)
        return doc.get('docname'), doc.get('contents')
class SearchIndex: def __init__(self, indexPath): lucene.initVM(vmargs=['-Djava.awt.headless=true']) print 'lucene', lucene.VERSION #initialize the index self.INDEX_DIR = indexPath #"Clue_Index" self.results = None self.searcher = IndexSearcher(DirectoryReader.open( SimpleFSDirectory(File(self.INDEX_DIR)))) self.searcher.setSimilarity(BM25Similarity()) def initializeAnalyzer(self): #self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT,JavaSet(stopSet)) sSet = CharArraySet(Version.LUCENE_CURRENT, 0, True) for entry in stopSet: sSet.add(entry) self.stopSet = sSet #self.analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT,sSet) self.analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT) def getTopDocuments(self, query, limit, sfield, dfield): queryObj = QueryParser(Version.LUCENE_CURRENT, sfield, self.analyzer).parse(query) print queryObj scoreDocs = self.searcher.search(queryObj, limit).scoreDocs print '%s total matching documents.' % len(scoreDocs) self.results = scoreDocs rresults = [] i = 0 #reader = self.searcher.getIndexReader(); #print type(reader) for scoreDoc in scoreDocs: doc = self.searcher.doc(scoreDoc.doc) rresults.append((doc.get(dfield), scoreDoc.score)) #rresults.append(doc.get(dfield));#,scoreDoc.score)) i += 1 if i == limit: break return rresults #print 'path:', doc.get("URL"), 'name:', doc.get("id"), 'title:', doc.get("title") def getTopDocumentsWithExpansion(self, query, expTerms, limit, sfield, dfield ): print expTerms query = query + ' ' + ' '.join('{0}^{1}'.format(x[0], round(x[1], 2)) for x in expTerms) sSet = CharArraySet(Version.LUCENE_CURRENT, 0, True) for entry in expTerms: sSet.add(entry[0]) analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT, self.stopSet, sSet) queryObj = QueryParser(Version.LUCENE_CURRENT, sfield, analyzer).parse(query) scoreDocs = self.searcher.search(queryObj, limit).scoreDocs print '%s total matching documents.' % len(scoreDocs), queryObj self.results = scoreDocs rresults = [] i = 0 for scoreDoc in scoreDocs: doc = self.searcher.doc(scoreDoc.doc) #rresults.append(doc.get(dfield));#,scoreDoc.score)) rresults.append((doc.get(dfield), scoreDoc.score)) i += 1 if i == limit: break return rresults def getField(self, dfield, name, limit): toReturn = [] i = 0 for scoreDoc in self.results: doc = self.searcher.doc(scoreDoc.doc) toReturn.append((doc.get(dfield), doc.get(name))) i += 1 if i == limit: break return toReturn def close(self): del self.searcher
def __init__(self, tweets, storeDir, analyzer):
    # first, index the tweets
    if not path.exists(storeDir):
        mkdir(storeDir)
    store = SimpleFSDirectory(File(storeDir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    self.index_docs(tweets, writer)
    writer.commit()
    writer.close()

    # set up IndexSearcher
    reader = IndexReader.open(store)
    n_docs = reader.numDocs()
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity())
    queryparser = QueryParser(Version.LUCENE_CURRENT, "contents",
                              StandardAnalyzer(Version.LUCENE_CURRENT))

    # create document vectors
    doc_vectors = self.get_doc_vectors(reader, tweets, n_docs)
    cs_scorer = CosineSimilarityScorer(doc_vectors, reader, searcher, tweets)
    bm25_scorer = BM25Scorer(doc_vectors, searcher, queryparser)

    # find relevant tweets
    for fav_doc in (1, 26, 51):
        cs_scores = cs_scorer.get_scores(fav_doc)
        bm25_scores = bm25_scorer.get_scores(fav_doc)
        top_cs_scores = dict(sorted(cs_scores.iteritems(), key=itemgetter(1), reverse=True)[:5])
        top_bm25_scores = dict(sorted(bm25_scores.iteritems(), key=itemgetter(1), reverse=True)[:5])
        # print "top_cs_scores", top_cs_scores
        # print "top_bm25_scores", top_bm25_scores

        # calculate composite score by multiplying cs scores by 100 and keeping bm25 scores as is.
        # cs is bounded from 0.0-1.0. bm25 scores is actually idf * bm25_similarity_score so values
        # above 10.0 are not uncommon
        top_blended_scores = {}
        for key, value in top_cs_scores.iteritems():
            top_blended_scores[key] = value * 100.0
        for key, value in top_bm25_scores.iteritems():
            if key not in top_blended_scores:
                top_blended_scores[key] = 0.0
            top_blended_scores[key] += value
        top_score = dict(sorted(top_blended_scores.iteritems(), key=itemgetter(1), reverse=True)[:1])

        # print "\n"
        # print "results for", fav_doc
        # print tweets[fav_doc]
        print searcher.doc(fav_doc).get("contents")
        print top_score

        # if the top score fails to reach 10.0, this result is probably not of high quality so onlyworthy
        # will decline to identify a relevant match
        if top_score.values()[0] < 10.0:
            print "skipping"
            continue
        # print tweets[top_score.keys()[0]]
        print searcher.doc(top_score.keys()[0]).get("contents")
        print "\n"
# for txtName in gutenberg_list:
#     words = nltk.corpus.gutenberg.words(txtName)
#     sents = " ".join(words).split(".")
#     print(sents[:100])
#     # print("Indexing ", txtName, "...")
#     # for i in range(0, len(sents), 10):
#     #     text = " ".join(sents[i:i+10])
#     #     doc = Document()
#     #     doc.add(Field("fieldname", text, TextField.TYPE_STORED))
#     #     iwriter.addDocument(doc)
#     # iwriter.close()

# now search the index
ireader = DirectoryReader.open(directory)
isearcher = IndexSearcher(ireader)

# set similarity method
bm25 = BM25Similarity()
isearcher.setSimilarity(bm25)

# parse a simple query that searches for "text"
parser = QueryParser("fieldname", analyzer)
query = parser.parse("her sister was reading")
hits = isearcher.search(query, 5).scoreDocs
print(len(hits))

for hit in hits:
    result = isearcher.doc(hit.doc)
    print("[%8.4f] %s" % (hit.score, result.get("fieldname")))
class LuceneSearch(object): def __init__(self, args): self.env = lucene.initVM(initialheap='28g', maxheap='28g', vmargs=['-Djava.awt.headless=true']) self.args = args index_folder = os.path.join(DATA_DIR, args.index_folder) if not os.path.exists(index_folder): self.doc_db = DocDB() logger.info(f'Creating index at {index_folder}') self.create_index(index_folder) fsDir = MMapDirectory(Paths.get(index_folder)) self.searcher = IndexSearcher(DirectoryReader.open(fsDir)) self.searcher.setSimilarity(MyTFIDFSimilarity()) self.analyzer = MySimpleAnalyzer( CharArraySet(collections.JavaSet(utils.STOPWORDS), True)) self.pool = ThreadPool(processes=args.num_search_workers) def add_doc(self, title, text, tokens): doc = Document() doc.add(Field("title", title, self.t1)) doc.add(Field("text", text, self.t2)) doc.add(Field("token", tokens, self.t3)) self.writer.addDocument(doc) def create_index(self, index_folder): os.mkdir(index_folder) self.t1 = FieldType() self.t1.setStored(True) self.t1.setIndexOptions(IndexOptions.DOCS) self.t2 = FieldType() self.t2.setStored(True) self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) self.t3 = FieldType() self.t3.setStored(True) self.t3.setIndexOptions(IndexOptions.NONE) fsDir = MMapDirectory(Paths.get(index_folder)) writerConfig = IndexWriterConfig( MySimpleAnalyzer( CharArraySet(collections.JavaSet(utils.STOPWORDS), True))) writerConfig.setSimilarity(MyTFIDFSimilarity()) writerConfig.setRAMBufferSizeMB(16384.0) # 14g self.writer = IndexWriter(fsDir, writerConfig) logger.info(f"{self.writer.numDocs()} docs in index") logger.info("Indexing documents...") doc_ids = self.doc_db.get_doc_ids() for doc_id in tqdm(doc_ids, total=len(doc_ids)): text = self.doc_db.get_doc_text(doc_id) tokens = self.doc_db.get_doc_tokens(doc_id) self.add_doc(doc_id, text, tokens) logger.info(f"Indexed {self.writer.numDocs()} docs.") self.writer.forceMerge(1) # to increase search performance self.writer.close() def search_multithread(self, qs, ranker_doc_max, searcher): self.ranker_doc_max = ranker_doc_max self.curr_searcher = searcher out = self.pool.map(self.search_multithread_part, qs) return out def search_multithread_part(self, q): if not self.env.isCurrentThreadAttached(): self.env.attachCurrentThread() try: if self.args.ngram == 2: query = self._parse_query(field_name='text', query=q) else: # self.args.ngram == 1 query = QueryParser('text', self.analyzer).parse(QueryParser.escape(q)) except Exception as e: logger.warning(colored(f'{e}: {q}, use query dummy.'), 'yellow') if self.args.ngram == 2: query = self._parse_query(field_name='text', query=q) else: # self.args.ngram == 1 query = QueryParser('text', self.analyzer).parse('dummy') doc_scores, doc_titles, doc_texts, doc_words = [], [], [], [] hits = self.curr_searcher.search(query, self.ranker_doc_max) for i, hit in enumerate(hits.scoreDocs): doc = self.curr_searcher.doc(hit.doc) doc_score = hit.score doc_title = doc['title'] doc_word = doc['token'].split('<&>') doc_text = doc['text'] doc_scores.append(doc_score) doc_titles.append(doc_title) doc_words.append(doc_word) doc_texts.append(doc_text) if len(doc_scores) == 0: logger.warning( colored( f'WARN: search engine returns no results for query: {q}.', 'yellow')) return doc_scores, doc_titles, doc_texts, doc_words def search_singlethread(self, qs, ranker_doc_max, curr_searcher): out = [] for q in qs: try: if self.args.ngram == 2: query = self._parse_query(field_name='text', query=q) else: # self.args.ngram == 1 query = QueryParser('text', self.analyzer).parse( 
QueryParser.escape(q)) except Exception as e: logger.warning(colored(f'{e}: {q}, use query dummy.'), 'yellow') if self.args.ngram == 2: query = self._parse_query(field_name='text', query=q) else: # self.args.ngram == 1 query = QueryParser('text', self.analyzer).parse('dummy') doc_scores, doc_titles, doc_texts, doc_words = [], [], [], [] hits = curr_searcher.search(query, ranker_doc_max) for i, hit in enumerate(hits.scoreDocs): doc = curr_searcher.doc(hit.doc) doc_score = hit.score doc_title = doc['title'] doc_word = doc['token'].split('<&>') doc_text = doc['text'] doc_scores.append(doc_score) doc_titles.append(doc_title) doc_words.append(doc_word) doc_texts.append(doc_text) if len(doc_scores) == 0: logger.warning( colored( f'WARN: search engine returns no results for query: {q}.', 'yellow')) out.append((doc_scores, doc_titles, doc_texts, doc_words)) return out def batch_closest_docs(self, qs, ranker_doc_max): if self.args.num_search_workers > 1: out = self.search_multithread(qs, ranker_doc_max, self.searcher) else: out = self.search_singlethread(qs, ranker_doc_max, self.searcher) return out def _parse_query(self, field_name, query): ts = self.analyzer.tokenStream("dummy", StringReader(query)) termAtt = ts.getAttribute(CharTermAttribute.class_) ts.reset() tokens = [] while ts.incrementToken(): tokens.append(termAtt.toString()) ts.end() ts.close() booleanQuery = BooleanQuery.Builder() for token in tokens: builder = PhraseQuery.Builder() for i, word in enumerate(token.split(' ')): builder.add(Term(field_name, word), i) pq = builder.build() booleanQuery.add(pq, BooleanClause.Occur.SHOULD) final_query = booleanQuery.build() return final_query
def createIndexSearcher(indexDir):
    directory = DirectoryReader.open(FSDirectory.open(Paths.get(indexDir)))
    searcher = IndexSearcher(directory)
    similarity = BM25Similarity(K1, B)
    searcher.setSimilarity(similarity)
    return searcher
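A hedged usage sketch for a helper like `createIndexSearcher` above. It assumes the function (and its module-level `K1`/`B` constants) are importable from the same module; the index path, the `'contents'` field name, and the `StandardAnalyzer` choice are placeholders, not part of the original snippet.

```python
# Hypothetical usage of the createIndexSearcher helper above.
# Assumes K1 and B are defined alongside it and that documents were
# indexed with a "contents" field; the path is a placeholder.
import lucene
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.queryparser.classic import QueryParser

if __name__ == '__main__':
    lucene.initVM()
    searcher = createIndexSearcher('/path/to/index')
    parser = QueryParser('contents', StandardAnalyzer())
    query = parser.parse(QueryParser.escape('information retrieval'))
    for hit in searcher.search(query, 10).scoreDocs:
        doc = searcher.doc(hit.doc)
        print(hit.score, doc.get('contents'))
```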
class SearchBuilder(object): def __init__(self, index_path, field, similarity="boolean", use_relevance_feedback=False, feedback_index_path=None): self.reader = DirectoryReader.open( FSDirectory.open(Paths.get(index_path))) self.searcher = IndexSearcher(self.reader) if use_relevance_feedback and feedback_index_path is not None: self.feedback_reader = DirectoryReader.open( FSDirectory.open(Paths.get(feedback_index_path))) self.feedback_searcher = IndexSearcher(self.feedback_reader) self.similarity = similarity self.stopwords = stop_words() if similarity == "boolean": self.searcher.setSimilarity(BooleanSimilarity()) elif similarity == "tf": self.searcher.setSimilarity(TFSimilarity()) elif similarity == "tfidf": self.searcher.setSimilarity(ClassicSimilarity()) elif similarity == "BM25": self.searcher.setSimilarity(BM25Similarity(1.2, 0.2)) else: print("Unknown similarity, so we use BM25(1.2, 0.2) as default") self.searcher.setSimilarity(BM25Similarity(1.2, 0.2)) analyzer = StandardAnalyzer() print(self.searcher.getSimilarity()) self.parser = QueryParser(field, analyzer) def remove_stopwords(self, query_text): new_query_tokens = [] query_tokens = query_text.split() for query_token in query_tokens: if query_token not in self.stopwords: new_query_tokens.append(query_token) return " ".join(new_query_tokens) def search_query(self, query, num_returns=50, use_multipass_pseudo_relevance_feedback=False, doc_counts=None, add_nums=None): query_text = query["description"] print(query_text.lower()) query_text = " ".join(tokenizer.tokenize(query_text)) query_text = self.remove_stopwords(query_text.lower()) print(query_text) query_search = self.parser.parse(query_text) if use_multipass_pseudo_relevance_feedback: if doc_counts is None: doc_counts = [5, 9] if add_nums is None: add_nums = [2, 13] assert len(doc_counts) == len( add_nums), "The number of pass is inconsistent!" 
for doc_count, add_num in zip(doc_counts, add_nums): final_list = [] initial_hits = self.searcher.search(query_search, doc_count).scoreDocs term_tf_idf = {} for initial_hit in initial_hits: termVector = self.reader.getTermVector( initial_hit.doc, "text") terms_enum = termVector.iterator() termsref = BytesRefIterator.cast_(terms_enum) N_terms = 0 term_idf = {} term_freq = {} term_list = [] while (termsref.next()): termval = TermsEnum.cast_(termsref) termText = termval.term().utf8ToString() if termText in self.stopwords: continue tc = termval.totalTermFreq() if termText in term_freq: term_freq[termText] += tc else: term_freq[termText] = tc if termText in term_idf: term_idf[termText] += 1 else: term_idf[termText] = 1 if termText not in term_list: term_list.append(termText) N_terms = N_terms + 1 for term in term_list: if term in term_tf_idf: term_tf_idf[term] += term_freq[term] / N_terms * ( 1 + math.log(doc_count / (term_idf[term] + 1))) else: term_tf_idf[term] = term_freq[term] / N_terms * ( 1 + math.log(doc_count / (term_idf[term] + 1))) sorted_term_tf_idf = sorted(term_tf_idf.items(), key=lambda x: x[1], reverse=True) for each in sorted_term_tf_idf: if each[0] not in self.stopwords: final_list.append(each[0]) print("added query tokens:", final_list[:add_num]) query_text = query_text + " " + " ".join(final_list[:add_num]) query_search = self.parser.parse(query_text) results = self.searcher.search(query_search, num_returns) hits = results.scoreDocs trec_results = [] for rank, hit in enumerate(hits): doc = self.searcher.doc(hit.doc) trec_result = { "QueryID": query["Number"], "Q0": "Q0", "DocID": doc.get(".U"), "Rank": str(rank + 1), "Score": str(hit.score), "RunID": self.similarity + "-mpprf-" + str(len(doc_counts)) + "passes" if use_multipass_pseudo_relevance_feedback else self.similarity } trec_results.append(trec_result) return trec_results def search_query_with_relevance_feedback(self, query, feedback_qrels, num_returns=50, add_num=1): query_text = query["description"] print(query_text) query_text = " ".join(tokenizer.tokenize(query_text)) query_text = self.remove_stopwords(query_text.lower()) print(query_text) query_number = query["Number"] qrel_doc_ids = [ qrel["docno"] for qrel in feedback_qrels if qrel["qid"] == query_number ] final_list = [] term_tf_idf = {} doc_count = len(qrel_doc_ids) for qrel_doc_id in qrel_doc_ids: initial_hit = self.feedback_searcher.search( TermQuery(Term(".U", qrel_doc_id)), 1).scoreDocs if len(initial_hit) == 0: continue assert len(initial_hit) == 1 termVector = self.reader.getTermVector(initial_hit[0].doc, "text") terms_enum = termVector.iterator() termsref = BytesRefIterator.cast_(terms_enum) N_terms = 0 term_idf = {} term_freq = {} term_list = [] while (termsref.next()): termval = TermsEnum.cast_(termsref) termText = termval.term().utf8ToString() if termText in self.stopwords: continue tc = termval.totalTermFreq() if termText in term_freq: term_freq[termText] += tc else: term_freq[termText] = tc if termText in term_idf: term_idf[termText] += 1 else: term_idf[termText] = 1 if termText not in term_list: term_list.append(termText) N_terms = N_terms + 1 for term in term_list: if term in term_tf_idf: term_tf_idf[term] += term_freq[term] / N_terms * ( 1 + math.log(doc_count / (term_idf[term] + 1))) else: term_tf_idf[term] = term_freq[term] / N_terms * ( 1 + math.log(doc_count / (term_idf[term] + 1))) sorted_tf_idf = sorted(term_tf_idf.items(), key=lambda x: x[1], reverse=True) for each in sorted_tf_idf: if each[0] not in self.stopwords and not 
str(each[0]).isnumeric( ) and each[0] not in query_text.split(" "): final_list.append(each[0]) print(final_list[:add_num]) query_text = query_text + " " + " ".join(final_list[:add_num]) query_text = " ".join(query_text.split(" ")) print(query_text) query_search = self.parser.parse(query_text) results = self.searcher.search(query_search, num_returns) hits = results.scoreDocs trec_results = [] for rank, hit in enumerate(hits): doc = self.searcher.doc(hit.doc) trec_result = { "QueryID": query["Number"], "Q0": "Q0", "DocID": doc.get(".U"), "Rank": str(rank + 1), "Score": str(hit.score), "RunID": self.similarity } trec_results.append(trec_result) return trec_results # def search_query_with_glove(self, query, doc_vectors, num_returns=50, index2word_set=None): # query_text = query["description"] # query_text = " ".join(word_tokenize(query_text)) # query_text = self.remove_stopwords(query_text) # query_vec = avg_feature_vector(query_text, model=glove_vectors, num_features=300, index2word_set=index2word_set) # doc_similarity = {} # for doc_id in tqdm(doc_vectors, desc="compute doc similarity:", total=len(doc_vectors.items())): # doc_similarity[doc_id] = 1 - spatial.distance.cosine(query_vec, doc_vectors[doc_id]) # doc_similarity = sorted(doc_similarity.items(), key=lambda x: x[1], reverse=True)[:num_returns] # trec_results = [] # for i, doc_id in tqdm(enumerate(doc_similarity), desc="output results:", total=len(doc_similarity)): # trec_result = {"QueryID": query["Number"], # "Q0": "Q0", # "DocID": doc_id[0], # "Rank": str(i + 1), # "Score": str(doc_id[1]), # "RunID": self.similarity+"+embedding"} # trec_results.append(trec_result) # return trec_results # # def search_query_with_transformers(self, query, doc_vectors, num_returns=50): # query_text = query["description"] # query_text = " ".join(word_tokenize(query_text)) # query_text = self.remove_stopwords(query_text) # query_vec = distilroberta_model.encode(query_text, convert_to_tensor=True) # doc_similarity = {} # for doc_id in tqdm(doc_vectors, desc="compute doc similarity:", total=len(doc_vectors.items())): # doc_similarity[doc_id] = util.pytorch_cos_sim(query_vec, doc_vectors[doc_id]) # doc_similarity = sorted(doc_similarity.items(), key=lambda x: x[1], reverse=True)[:num_returns] # trec_results = [] # for i, doc_id in tqdm(enumerate(doc_similarity), desc="output results:", total=len(doc_similarity)): # trec_result = {"QueryID": query["Number"], # "Q0": "Q0", # "DocID": doc_id[0], # "Rank": str(i + 1), # "Score": str(doc_id[1]), # "RunID": self.similarity+"+embedding"} # trec_results.append(trec_result) # return trec_results def get_results_from_queries(self, queries, num_returns=50, use_pseudo_relevance_feedback=False): trec_results = [] for query in queries: search_results = self.search_query(query, num_returns, use_pseudo_relevance_feedback) trec_results = trec_results + search_results return trec_results # # def get_results_from_queries_with_pretrained_embedding_similariy(self, queries, doc_vectors, num_returns=50): # trec_results = [] # for query in tqdm(queries, desc="queries", total=len(queries)): # search_results = self.search_query_with_glove(query, doc_vectors, num_returns) # trec_results = trec_results + search_results # return trec_results # # def get_results_from_queries_with_transformers(self, queries, doc_vectors, num_returns=50): # trec_results = [] # for query in tqdm(queries, desc="queries", total=len(queries)): # search_results = self.search_query_with_transformers(query, doc_vectors, num_returns) # trec_results = 
trec_results + search_results # return trec_results def get_results_from_queries_with_relevance_feedback( self, queries, feedback_qrels, num_returns=50): trec_results = [] for query in queries: search_results = self.search_query_with_relevance_feedback( query, feedback_qrels, num_returns=num_returns) trec_results = trec_results + search_results return trec_results
def main(): global lucene_vm_init if not lucene_vm_init: lucene.initVM(vmargs=['-Djava.awt.headless=true']) lucene_vm_init = True is_index_Exist = os.path.exists(LUCENE_INDEX_DIR) # specify index path index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR)) # configure search engine analyzer = StandardAnalyzer() config = IndexWriterConfig(analyzer) # load index to search engine reader = DirectoryReader.open(index_mm) searcher1 = IndexSearcher(reader) searcher1.setSimilarity(BM25Similarity()) searcher2 = IndexSearcher(reader) w = IndexWriter(index_mm,config) # read query read_query() # initialize mongodb client mongoObj=Mongo_Object('localhost',27017) # search docDup=set() finalDup={} for i in xrange(len(queries)): print 'process query %d' %(i) query = queries[i] querystr = stemSentence(query[3]) # build searcher q_lucene = QueryParser("all_text", analyzer).parse(querystr) collector = TopScoreDocCollector.create(hitsPerPage); searcher1.search(q_lucene, collector); hits = collector.topDocs().scoreDocs; # find candidate results after 1st round filter docDup.clear() for j in xrange(len(hits)): docID=hits[j].doc d=searcher1.doc(docID) if d['title'] in docDup: finalDup[d['title']]=d continue docDup.add(d['title']) docDup.clear() for j in xrange(len(hits)): docID=hits[j].doc d=searcher1.doc(docID) title=d['title'] if d['title'] in docDup: continue docDup.add(title) item=(mongoObj.conn_me).find_one({'title':title}) if item is None: continue entitylist=item['entitylist'].split('|') for en_title in entitylist: if title==en_title: continue t=Term('title',en_title) q=TermQuery(t) docs=searcher2.search(q,2) if docs.totalHits<=1: continue docID2=(docs.scoreDocs)[0].doc doc=searcher2.doc(docID2) finalDup[doc['title']]=doc print 'begin to clean index, there are %d dup records' %(len(finalDup)) for title in finalDup: doc=finalDup[title] # title,name,value,category,skos_category,all_text,raw_name,raw_value,abstract name=doc['name'] value=doc['value'] category=doc['category'] skos_category=doc['skos_category'] all_text=doc['all_text'] raw_name=doc['raw_name'] raw_value=doc['raw_value'] abstract=doc['abstract'] print 'process '+title t=Term('title',title) q=TermQuery(t) w.deleteDocuments(q) addDoc(w,title,name,value,category,skos_category,all_text,raw_name,raw_value,abstract) # process remaining records #global batch,cnt_batch #if cnt_batch>0: #w.addDocuments(batch) #cnt_batch=0 #del batch[:] w.close()
class LuceneRetrieval(BaseRetrieval): """ Encapsulates the Lucene retrieval engine """ def __init__(self, index_path, method, logger=None, use_default_similarity=False): self.index_path=index_path directory = SimpleFSDirectory(File(self.index_path)) self.analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT) self.reader=DirectoryReader.open(directory) self.searcher = IndexSearcher(self.reader) # uncomment one of these lines to change the type of parser, query and weight used if use_default_similarity: self.query_parser=QueryParser else: self.query_parser=FieldAgnosticQueryParser if use_default_similarity: similarity=DefaultSimilarity() self.useExplainQuery=False else: similarity=FieldAgnosticSimilarity() self.useExplainQuery=True # by default, FieldAgnosticSimilarity uses coord factor, can be disabled ## similarity.useCoord=False self.searcher.setSimilarity(similarity) self.method=method # never used? self.logger=logger def runQueryViaExplain(self,query, max_results): """ Really crappy solution to make sure that explanations and searches are the same while I fix Lucene """ results=[] index=0 for index in range(self.reader.numDocs()): explanation=self.searcher.explain(query,index) score=explanation.getValue() ## match=re.search(r"(.*?)\s=",explanation.toString(),re.IGNORECASE|re.DOTALL) ## if match: ## score=float(match.group(1)) hit=namedtuple("Hit",["doc","score"]) hit.doc=index hit.score=score ## heapq.heappush(results,hit) results.append(hit) results.sort(key=lambda x:x.score,reverse=True) if max_results < self.reader.numDocs(): results=results[:max_results] return results def runQuery(self, structured_query, max_results=MAX_RESULTS_RECALL): """ LOTS OF SWEET LUCENE """ original_query=structured_query if not structured_query or len(structured_query) == 0 : return [] self.last_query=structured_query query_text=self.rewriteQuery(structured_query["structured_query"], ["text"]) try: query = self.query_parser(lucene.Version.LUCENE_CURRENT, "text", self.analyzer).parse(query_text) except: print("Lucene exception:",sys.exc_info()[:2]) return None structured_query["lucene_query"]=query_text if self.useExplainQuery: # this should only exist until I fix the lucene bulkScorer to give the same results hits=self.runQueryViaExplain(query,max_results) else: collector=TopScoreDocCollector.create(max_results, True) self.searcher.search(query, collector) hits = collector.topDocs().scoreDocs ## print("Found %d document(s) that matched query '%s':" % (hits.totalHits, query)) res=[] ## if len(hits.scoreDocs) ==0: ## print "Original query:",original_query ## print "Query:", query for hit in hits: doc = self.searcher.doc(hit.doc) metadata= json.loads(doc.get("metadata")) res.append((hit.score,metadata)) return res def formulaFromExplanation(self, query, doc_id): """ Runs .explain() for one query/doc pair, generates and returns a \ StoredFormula instance from it :param query: Elastic DSL Query :param doc_id: id of document to run .explain() for :returns: """ explanation=self.searcher.explain(query,doc_id) formula=StoredFormula() formula.fromLuceneExplanation(explanation) return formula
class Searcher: """ Class that contains the search methods """ def __init__(self, searchDir): self.analyzer = MyPythonEnglishAnalyzer( stopwords=Indexer.ENGLISH_STOP_WORDS_SET) self.directory = FSDirectory.open(Paths.get(searchDir)) self.reader = DirectoryReader.open(self.directory) self.searcher = IndexSearcher(self.reader) def simpleSearch(self, query, sim): """ Method that searches through documents using only content_section Field searchDir : the path to the folder that contains the index. """ # Now search the index: parser = QueryParser("content_section", self.analyzer) query = parser.parse(QueryParser.escape(query)) self.searcher.setSimilarity(sim) hits = self.searcher.search(query, 6).scoreDocs return hits def simpleSearchID(self, query, sim): """ Method that searches through documents using only content_section Field searchDir : the path to the folder that contains the index. """ # Now search the index: parser = QueryParser("id_section", self.analyzer) query = parser.parse(QueryParser.escape(query)) self.searcher.setSimilarity(sim) hits = self.searcher.search(query, 6).scoreDocs return hits def multiFieldsSearch(self, query, sim): """ Method that searches through documents using content_section and title_article Fields searchDir : the path to the folder that contains the index. """ # Now search the index: lucene.getVMEnv().attachCurrentThread() parser = MultiFieldQueryParser(["content_section", "title_article"], self.analyzer) parser.setDefaultOperator(QueryParserBase.OR_OPERATOR) query = MultiFieldQueryParser.parse(parser, QueryParser.escape(query)) self.searcher.setSimilarity(sim) hits = self.searcher.search(query, 6).scoreDocs return hits def pairSearch(self, pair, sim): """ Method that searches through documents using only content_section Field searchDir : the path to the folder that contains the index. """ # Now search the index: title = pair[0].replace('_', ' ') content = pair[1] parser = QueryParser("content_section", self.analyzer) query1 = parser.parse(QueryParser.escape(title)) query2 = parser.parse(QueryParser.escape(content)) bq = BooleanQuery.Builder() bq.add(query1, BooleanClause.Occur.FILTER) bq.add(query2, BooleanClause.Occur.SHOULD) self.searcher.setSimilarity(sim) hits = self.searcher.search(bq.build(), 6).scoreDocs return hits def multiFieldsPairSearch(self, pair, sim): """ Method that searches through documents using only content_section Field searchDir : the path to the folder that contains the index. """ # Now search the index: title = pair[0].replace('_', ' ') content = pair[1] parser = MultiFieldQueryParser(["content_section", "title_article"], self.analyzer) parser.setDefaultOperator(QueryParserBase.OR_OPERATOR) query1 = MultiFieldQueryParser.parse(parser, QueryParser.escape(title)) query2 = MultiFieldQueryParser.parse(parser, QueryParser.escape(content)) bq = BooleanQuery.Builder() bq.add(query1, BooleanClause.Occur.FILTER) bq.add(query2, BooleanClause.Occur.SHOULD) self.searcher.setSimilarity(sim) hits = self.searcher.search(bq.build(), 6).scoreDocs return hits
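# --- Usage sketch (not from the original sources): running the same multi-field query under two
# --- similarities. MyPythonEnglishAnalyzer and Indexer come from this project's own modules; the
# --- index path and query string are placeholders, and whether title_article is stored depends
# --- on how the index was built.
import lucene
from org.apache.lucene.search.similarities import BM25Similarity, ClassicSimilarity

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
searcher = Searcher("/path/to/index")

for sim in (BM25Similarity(), ClassicSimilarity()):
    # multiFieldsSearch queries content_section and title_article with an OR default operator
    hits = searcher.multiFieldsSearch("semantic role labeling", sim)
    for hit in hits:
        doc = searcher.searcher.doc(hit.doc)
        print("{:>10.4f}  {}".format(hit.score, doc.get("title_article")))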
import argparse parser = argparse.ArgumentParser( description='Execute queries on comment body') parser.add_argument('user_name', type=str, help="User name (profile to use)") parser.add_argument('index_dir', metavar='dir', type=str, help="Index directory") parser.add_argument('--sim', type=str, nargs='?', default="tfidf", help="Similarity (in [tfidf, lm, bm25])") parser.add_argument('--reorder', type=str, nargs='?', default="no", help="Reordering (in [ups, normups])") parser.add_argument('--short', action='store_true', help="Don't show the body of comments") # store_true so that passing --short hides comment bodies (show_bodies=not args.short) args = parser.parse_args() if args.sim in ['bm25']: similarity = BM25Similarity() elif args.sim in ['lm']: similarity = LMDirichletSimilarity() else: similarity = ClassicSimilarity() # Sample query storeDir = SimpleFSDirectory(Paths.get(args.index_dir)) searcher = IndexSearcher(DirectoryReader.open(storeDir)) if similarity is not None: searcher.setSimilarity(similarity) analyzer = StandardAnalyzer() run(searcher, analyzer, args.user_name, reordering=args.reorder, show_bodies=not args.short)
class LuceneCorpus(object): # to init a LuceneCorpus, we need the outputdir, which is passed as index_dir # we need filenames that contains one for more corpus we just created # we need a parser, this parser should implement function 'parse' which knows how to split, how to stem def __init__(self, index_dir, filenames, parser, similarity=None): """ :param index_dir: where to store the Lucene index :param filenames: the corpus created previously. Note that the format of corpus that has been created is consistent :param parser: SimpleWordParser in Parser.py, where we can apply functions such as stemming :param similarity: We can put None here(then default Vector Space Model with TF-IDF is used) or we can use BM25 similarity to index :return: """ self._index_dir = index_dir self._filenames = filenames self._parser = parser self._similarity = similarity lucene.initVM() # the WhitespaceAnalyzer split the text based on whitespace self._analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) self._store = SimpleFSDirectory(File(self._index_dir)) self._searcher = None def prp_index(self): ''' Prepare the index given our "corpus" file(s) ''' print '=> Preparing Lucene index %s' % self._index_dir writer = self._get_writer(create=True) print ' Currently %d docs (dir %s)' % (writer.numDocs(), self._index_dir) num_pages, num_sections = 0, 0 page_name, section_name = None, None num_lines = 0 for ifname,fname in enumerate(self._filenames): print ' Adding lines to index from file #%d: %s' % (ifname, fname) with open(fname,'rt') as infile: for text in infile: if len(text)==0: print 'Reached EOF' break # EOF # CorpusReader.PAGE_NAME_PREFIX is <Page> # all our corpus we manipulated them to have this tag as the start of a page if text.startswith(CorpusReader.PAGE_NAME_PREFIX): page_name = text[len(CorpusReader.PAGE_NAME_PREFIX):].strip() section_name = None num_pages += 1 elif text.startswith(CorpusReader.SECTION_NAME_PREFIX): section_name = text[len(CorpusReader.SECTION_NAME_PREFIX):].strip() num_sections += 1 else: assert (page_name is not None) and (section_name is not None) if self._parser is None: luc_text = text else: # note in our case the we always have SimpleWordParser section_words = self._parser.parse(text, calc_weights=False) #True) luc_text = ' '.join(section_words) # for each section, we add the whole section to Lucene index, we store the text and makes it searchable # seems like page is not necessary here since we do not add document page by page but section by section doc = Document() # there is only one field for each document, which is the text field # section_name is not used as a field doc.add(Field("text", luc_text, Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) num_lines += 1 if num_lines % 100000 == 0: print ' read %d lines so far: %d pages, %d sections' % (num_lines, num_pages, num_sections) print ' Finished - %d docs (dir %s)' % (writer.numDocs(), self._index_dir) writer.close() def search(self, words, max_docs, weight_func=lambda n: np.ones(n), score_func=lambda s: s): ''' Search the index for the given words, return total score ''' searcher = self._get_searcher() if type(words)==str: search_text = words search_text = AsciiConvertor.convert(search_text) for c in '/+-&|!(){}[]^"~*?:': search_text = search_text.replace('%s'%c, '\%s'%c) else: search_text = ' '.join(words) print 'search_text: %s' % search_text # note that whatever parser that we put as our argument, eventually when searching with query, we will use Lucene parser to split query words query = 
QueryParser(Version.LUCENE_CURRENT, "text", self._analyzer).parse(search_text) hits = searcher.search(query, max_docs) score_sum = 0.0 weights = weight_func(len(hits.scoreDocs)) for hit,weight in zip(hits.scoreDocs, weights): score_sum += weight * score_func(hit.score) return score_sum def _get_writer(self, analyzer=None, create=False): config = IndexWriterConfig(Version.LUCENE_CURRENT, self._analyzer) if create: config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) if self._similarity is not None: config.setSimilarity(self._similarity) writer = IndexWriter(self._store, config) return writer def _get_searcher(self): if self._searcher is None: self._searcher = IndexSearcher(DirectoryReader.open(self._store)) if self._similarity is not None: self._searcher.setSimilarity(self._similarity) return self._searcher
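# --- Usage sketch (not from the original sources): building and querying a LuceneCorpus with
# --- BM25 instead of the default TF-IDF vector-space ranking. SimpleWordParser comes from this
# --- project's Parser.py; the index directory and corpus file names are placeholders.
from org.apache.lucene.search.similarities import BM25Similarity
from Parser import SimpleWordParser   # project-local module (assumption)

corpus = LuceneCorpus(index_dir="./lucene_idx",
                      filenames=["corpus_part0.txt", "corpus_part1.txt"],
                      parser=SimpleWordParser(),
                      similarity=BM25Similarity())   # None would keep the default TF-IDF ranking
corpus.prp_index()                                    # build the index once from the corpus files
# search() returns a weighted sum over the scores of the top matching documents
total_score = corpus.search(["photosynthesis", "chlorophyll"], max_docs=20)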
class Lucene(object): # default fieldnames for id and contents FIELDNAME_ID = "id" FIELDNAME_CONTENTS = "contents" # internal fieldtypes # used as Enum, the actual values don't matter FIELDTYPE_ID = "id" FIELDTYPE_ID_TV = "id_tv" FIELDTYPE_TEXT = "text" FIELDTYPE_TEXT_TV = "text_tv" FIELDTYPE_TEXT_TVP = "text_tvp" FIELDTYPE_TEXT_NTV = "text_ntv" FIELDTYPE_TEXT_NTVP = "text_ntvp" def __init__(self, index_dir, max_shingle_size=None): global lucene_vm_init if not lucene_vm_init: lucene.initVM(vmargs=['-Djava.awt.headless=true']) lucene_vm_init = True self.dir = SimpleFSDirectory(File(index_dir)) self.max_shingle_size = max_shingle_size self.analyzer = None self.reader = None self.searcher = None self.writer = None self.ldf = None @staticmethod def get_version(): """Get Lucene version.""" return Version.LUCENE_48 @staticmethod def preprocess(text): """Tokenize and stop the input text.""" ts = StandardTokenizer(Lucene.get_version(), StringReader(text.lower())) ts = StopFilter(Lucene.get_version(), ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET) string_builder = StringBuilder() ts.reset() char_term_attr = ts.addAttribute(CharTermAttribute.class_) while ts.incrementToken(): if string_builder.length() > 0: string_builder.append(" ") string_builder.append(char_term_attr.toString()) return string_builder.toString() def get_analyzer(self): """Get analyzer.""" if self.analyzer is None: std_analyzer = StandardAnalyzer(Lucene.get_version()) if self.max_shingle_size is None: self.analyzer = std_analyzer else: self.analyzer = ShingleAnalyzerWrapper(std_analyzer, self.max_shingle_size) return self.analyzer def open_reader(self): """Open IndexReader.""" if self.reader is None: self.reader = DirectoryReader.open(self.dir) def get_reader(self): return self.reader def close_reader(self): """Close IndexReader.""" if self.reader is not None: self.reader.close() self.reader = None else: raise Exception("There is no open IndexReader to close") def open_searcher(self): """ Open IndexSearcher. Automatically opens an IndexReader too, if it is not already open. There is no close method for the searcher. """ if self.searcher is None: self.open_reader() self.searcher = IndexSearcher(self.reader) def get_searcher(self): """Returns index searcher (opens it if needed).""" self.open_searcher() return self.searcher def set_lm_similarity_jm(self, method="jm", smoothing_param=0.1): """ Set searcher to use LM similarity. :param method: LM similarity ("jm" or "dirichlet") :param smoothing_param: smoothing parameter (lambda or mu) """ if method == "jm": similarity = LMJelinekMercerSimilarity(smoothing_param) elif method == "dirichlet": similarity = LMDirichletSimilarity(smoothing_param) else: raise Exception("Unknown method") if self.searcher is None: raise Exception("Searcher has not been created") self.searcher.setSimilarity(similarity) def open_writer(self): """Open IndexWriter.""" if self.writer is None: config = IndexWriterConfig(Lucene.get_version(), self.get_analyzer()) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) self.writer = IndexWriter(self.dir, config) else: raise Exception("IndexWriter is already open") def close_writer(self): """Close IndexWriter.""" if self.writer is not None: self.writer.close() self.writer = None else: raise Exception("There is no open IndexWriter to close") def add_document(self, contents): """ Adds a Lucene document with the specified contents to the index. See LuceneDocument.create_document() for the explanation of contents. 
""" if self.ldf is None: # create a single LuceneDocument object that will be reused self.ldf = LuceneDocument() self.writer.addDocument(self.ldf.create_document(contents)) def get_lucene_document_id(self, doc_id): """Loads a document from a Lucene index based on its id.""" self.open_searcher() query = TermQuery(Term(self.FIELDNAME_ID, doc_id)) tophit = self.searcher.search(query, 1).scoreDocs if len(tophit) == 1: return tophit[0].doc else: return None def get_document_id(self, lucene_doc_id): """Gets lucene document id and returns the document id.""" self.open_reader() return self.reader.document(lucene_doc_id).get(self.FIELDNAME_ID) def print_document(self, lucene_doc_id, term_vect=False): """Prints document contents.""" if lucene_doc_id is None: print "Document is not found in the index." else: doc = self.reader.document(lucene_doc_id) print "Document ID (field '" + self.FIELDNAME_ID + "'): " + doc.get( self.FIELDNAME_ID) # first collect (unique) field names fields = [] for f in doc.getFields(): if f.name() != self.FIELDNAME_ID and f.name() not in fields: fields.append(f.name()) for fname in fields: print fname for fv in doc.getValues( fname): # printing (possibly multiple) field values print "\t" + fv # term vector if term_vect: print "-----" termfreqs = self.get_doc_termfreqs(lucene_doc_id, fname) for term in termfreqs: print term + " : " + str(termfreqs[term]) print "-----" def get_lucene_query(self, query, field=FIELDNAME_CONTENTS): """Creates Lucene query from keyword query.""" query = query.replace("(", "").replace(")", "").replace("!", "") return QueryParser(Lucene.get_version(), field, self.get_analyzer()).parse(query) def analyze_query(self, query, field=FIELDNAME_CONTENTS): """ Analyses the query and returns query terms. :param query: query :param field: field name :return: list of query terms """ qterms = [] # holds a list of analyzed query terms ts = self.get_analyzer().tokenStream(field, query) term = ts.addAttribute(CharTermAttribute.class_) ts.reset() while ts.incrementToken(): qterms.append(term.toString()) ts.end() ts.close() return qterms def get_id_lookup_query(self, id, field=None): """Creates Lucene query for searching by (external) document id.""" if field is None: field = self.FIELDNAME_ID return TermQuery(Term(field, id)) def get_and_query(self, queries): """Creates an AND Boolean query from multiple Lucene queries.""" # empty boolean query with Similarity.coord() disabled bq = BooleanQuery(False) for q in queries: bq.add(q, BooleanClause.Occur.MUST) return bq def get_or_query(self, queries): """Creates an OR Boolean query from multiple Lucene queries.""" # empty boolean query with Similarity.coord() disabled bq = BooleanQuery(False) for q in queries: bq.add(q, BooleanClause.Occur.SHOULD) return bq def get_phrase_query(self, query, field): """Creates phrase query for searching exact phrase.""" phq = PhraseQuery() for t in query.split(): phq.add(Term(field, t)) return phq def get_span_query(self, terms, field, slop, ordered=True): """ Creates near span query :param terms: list of terms :param field: field name :param slop: number of terms between the query terms :param ordered: If true, ordered search; otherwise unordered search :return: lucene span near query """ span_queries = [] for term in terms: span_queries.append(SpanTermQuery(Term(field, term))) span_near_query = SpanNearQuery(span_queries, slop, ordered) return span_near_query def get_doc_phrase_freq(self, phrase, field, slop, ordered): """ Returns collection frequency for a given phrase and field. 
:param phrase: str :param field: field name :param slop: number of terms in between :param ordered: If true, term occurrences should be ordered :return: dictionary {doc: freq, ...} """ # creates span near query span_near_query = self.get_span_query(phrase.split(" "), field, slop=slop, ordered=ordered) # extracts document frequency self.open_searcher() index_reader_context = self.searcher.getTopReaderContext() term_contexts = HashMap() terms = TreeSet() span_near_query.extractTerms(terms) for term in terms: term_contexts.put(term, TermContext.build(index_reader_context, term)) leaves = index_reader_context.leaves() doc_phrase_freq = {} # iterates over all atomic readers for atomic_reader_context in leaves: bits = atomic_reader_context.reader().getLiveDocs() spans = span_near_query.getSpans(atomic_reader_context, bits, term_contexts) while spans.next(): lucene_doc_id = spans.doc() doc_id = atomic_reader_context.reader().document( lucene_doc_id).get(self.FIELDNAME_ID) if doc_id not in doc_phrase_freq: doc_phrase_freq[doc_id] = 1 else: doc_phrase_freq[doc_id] += 1 return doc_phrase_freq def get_id_filter(self): return FieldValueFilter(self.FIELDNAME_ID) def __to_retrieval_results(self, scoredocs, field_id=FIELDNAME_ID): """Converts Lucene scoreDocs results to RetrievalResults format.""" rr = RetrievalResults() if scoredocs is not None: for i in xrange(len(scoredocs)): score = scoredocs[i].score lucene_doc_id = scoredocs[i].doc # internal doc_id doc_id = self.reader.document(lucene_doc_id).get(field_id) rr.append(doc_id, score, lucene_doc_id) return rr def score_query(self, query, field_content=FIELDNAME_CONTENTS, field_id=FIELDNAME_ID, num_docs=100): """Scores a given query and return results as a RetrievalScores object.""" lucene_query = self.get_lucene_query(query, field_content) scoredocs = self.searcher.search(lucene_query, num_docs).scoreDocs return self.__to_retrieval_results(scoredocs, field_id) def num_docs(self): """Returns number of documents in the index.""" self.open_reader() return self.reader.numDocs() def num_fields(self): """Returns number of fields in the index.""" self.open_reader() atomic_reader = SlowCompositeReaderWrapper.wrap(self.reader) return atomic_reader.getFieldInfos().size() def get_fields(self): """Returns name of fields in the index.""" fields = [] self.open_reader() atomic_reader = SlowCompositeReaderWrapper.wrap(self.reader) for fieldInfo in atomic_reader.getFieldInfos().iterator(): fields.append(fieldInfo.name) return fields def get_doc_termvector(self, lucene_doc_id, field): """Outputs the document term vector as a generator.""" terms = self.reader.getTermVector(lucene_doc_id, field) if terms: termenum = terms.iterator(None) for bytesref in BytesRefIterator.cast_(termenum): yield bytesref.utf8ToString(), termenum def get_doc_termfreqs(self, lucene_doc_id, field): """ Returns term frequencies for a given document field. :param lucene_doc_id: Lucene document ID :param field: document field :return dict: with terms """ termfreqs = {} for term, termenum in self.get_doc_termvector(lucene_doc_id, field): termfreqs[term] = int(termenum.totalTermFreq()) return termfreqs def get_doc_termfreqs_all_fields(self, lucene_doc_id): """ Returns term frequency for all fields in the given document. 
:param lucene_doc_id: Lucene document ID :return: dictionary {field: {term: freq, ...}, ...} """ doc_termfreqs = {} vectors = self.reader.getTermVectors(lucene_doc_id) if vectors: for field in vectors.iterator(): doc_termfreqs[field] = {} terms = vectors.terms(field) if terms: termenum = terms.iterator(None) for bytesref in BytesRefIterator.cast_(termenum): doc_termfreqs[field][bytesref.utf8ToString()] = int( termenum.totalTermFreq()) print doc_termfreqs[field] return doc_termfreqs def get_coll_termvector(self, field): """ Returns collection term vector for the given field.""" self.open_reader() fields = MultiFields.getFields(self.reader) if fields is not None: terms = fields.terms(field) if terms: termenum = terms.iterator(None) for bytesref in BytesRefIterator.cast_(termenum): yield bytesref.utf8ToString(), termenum def get_coll_termfreq(self, term, field): """ Returns collection term frequency for the given field. :param term: string :param field: string, document field :return: int """ self.open_reader() return self.reader.totalTermFreq(Term(field, term)) def get_doc_freq(self, term, field): """ Returns document frequency for the given term and field. :param term: string, term :param field: string, document field :return: int """ self.open_reader() return self.reader.docFreq(Term(field, term)) def get_doc_count(self, field): """ Returns number of documents with at least one term for the given field. :param field: string, field name :return: int """ self.open_reader() return self.reader.getDocCount(field) def get_coll_length(self, field): """ Returns length of field in the collection. :param field: string, field name :return: int """ self.open_reader() return self.reader.getSumTotalTermFreq(field) def get_avg_len(self, field): """ Returns average length of a field in the collection. :param field: string, field name """ self.open_reader() n = self.reader.getDocCount( field) # number of documents with at least one term for this field len_all = self.reader.getSumTotalTermFreq(field) if n == 0: return 0 else: return len_all / float(n)
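# --- Usage sketch (not from the original sources): scoring a query under LM-Dirichlet with the
# --- wrapper above. The searcher must be opened before set_lm_similarity_jm() is called, since
# --- that method raises if no searcher exists yet. Index path and query are placeholders.
index = Lucene("/path/to/index")
index.open_searcher()                                      # opens the reader too, if needed
index.set_lm_similarity_jm(method="dirichlet", smoothing_param=2000)
results = index.score_query("barack obama", num_docs=10)   # returns a RetrievalResults object
index.close_reader()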
class Index: def __init__(self, folder=None, fields=[], similarity="tfidf"): self.jcc = lucene.initVM() if folder: self.directory = SimpleFSDirectory(File(folder)) else: self.directory = RAMDirectory() self.fields = {} for field in fields: ft = FieldType() for pname, pvalue in field.props.items(): setter = getattr(ft, "set" + pname.capitalize()) setter(pvalue) ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) # ft.setOmitNorms(True) self.fields[field.name] = ft self.similarity = similarity.lower() self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) self.writer = None self.searcher = None def attach_thread(self): self.jcc.attachCurrentThread() def open_writer(self): config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) self.writer = IndexWriter(self.directory, config) def add(self, **doc): if not self.writer: self.open_writer() d = Document() for field, value in doc.items(): # try : d.add(Field(field, value, self.fields[field])) # except Exception, e : # print # print "Fudeu" # pass self.writer.addDocument(d) def commit(self): self.writer.commit() def close(self): if self.writer: self.writer.close() def open_searcher(self): self.reader = DirectoryReader.open(self.directory) self.searcher = IndexSearcher(self.reader) if (self.similarity == "bm25"): self.searcher.setSimilarity(BM25Similarity()) def preprocess_query(self, query, fields, mode="ANY"): ''' Fix query according to provided mode. If the value is not supported, the query remains unchanged ''' terms = query.lower().strip().split() if mode == "ANY": query = " OR ".join(terms) elif mode == "ALL": query = " AND ".join(terms) else: print "Invalid mode parameter '%s'." % mode query = QueryParser.escape(query) parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, self.analyzer) query = MultiFieldQueryParser.parse(parser, query) return query def search(self, query, search_fields, return_fields, filter=None, ignore=set(), mode="ANY", return_scores=False, limit=1000000): ''' Search documents in the index using a standard analyzer (tokenizes and removes top words). Supports two search modes: ANY and ALL ANY: include documents that contain at least one term of the query. ALL: include only documents that contain all terms of the query. ''' if not self.searcher: self.open_searcher() # Return empty results if query is empty (Lucene can't handle it nicely) if query.strip() == '': if return_scores: return [], [] else: return [] query = self.preprocess_query(query, search_fields, mode) # If limit is not provided, return all matched documents. A little hack is required # to do that. We query for one document and get the count total matched documents. 
# if not limit : # hits = self.searcher.search(query, 1) # limit = hits.totalHits # Fetch more than asked in case we have to remove entries from the ignore set, but remember how many were actually requested requested = limit if limit is not None: limit += len(ignore) hits = self.searcher.search(query, filter, limit) hits = hits.scoreDocs docs = [] scores = [] for hit in hits: doc = self.searcher.doc(hit.doc) if doc['id'] not in ignore: # collect the score together with the document so both lists stay aligned docs.append([doc[f] for f in return_fields]) scores.append(hit.score) if return_scores: return docs[:requested], scores[:requested] return docs[:requested] def explain(self, query, fields, doc): if not self.searcher: self.open_searcher() query = QueryParser.escape(query) parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, self.analyzer) query = MultiFieldQueryParser.parse(parser, query) return self.searcher.explain(query, doc) def get_documents(self, doc_ids, fields): docs = [] for doc_id in doc_ids: doc = self.reader.document(doc_id) if isinstance(fields, basestring): docs.append(doc.get(fields)) else: docs.append({f: doc.get(f) for f in fields}) return docs def get_query_scores(self, query, fields, doc_ids, mode="ANY"): if not self.searcher: self.open_searcher() # Creates pre-filter to ignore all other documents filter = TermsFilter([Term("id", id) for id in doc_ids]) query = self.preprocess_query(query, fields, mode) hits = self.searcher.search(query, filter, len(doc_ids)).scoreDocs # Creates scores' mapping using entity id instead of internal index id scores = { str(self.reader.document(hit.doc).get("id")): hit.score for hit in hits } # Normalize to 0..1 interval # n = 1.0/sum(scores.values()) # scores # Adds to the mapping entries for the non-returned docs (no term found) for doc_id in doc_ids: if doc_id not in scores: scores[doc_id] = 0.0 return scores
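# --- Usage sketch (not from the original sources): Index expects field definitions that expose a
# --- name and a props dict whose keys map onto FieldType setters (stored, tokenized, ...). The
# --- FieldSpec helper and the sample documents below are illustrative only.
from collections import namedtuple

FieldSpec = namedtuple("FieldSpec", ["name", "props"])

fields = [FieldSpec("id",   {"stored": True, "tokenized": False}),
          FieldSpec("text", {"stored": True, "tokenized": True})]

index = Index(folder=None, fields=fields, similarity="bm25")   # RAMDirectory when no folder given
index.add(id="1", text="lucene is a search engine library")
index.add(id="2", text="bm25 is a probabilistic ranking function")
index.commit()
index.close()

docs, scores = index.search("ranking function", search_fields=["text"],
                            return_fields=["id"], return_scores=True)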
print "Hit enter with no input to quit." command = raw_input("Query:") if command == '': return translate = makeFrenchQuery.makeFrenchQuery(command) commande = "" for word in translate: commande += word commande += " " print print "Searching for:", commande query = QueryParser("contents", analyzer).parse(commande) scoreDocs = searcher.search(query, 50).scoreDocs print "%s total matching documents." % len(scoreDocs) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print 'path:', doc.get("path"), 'name:', doc.get("name"), ( 'score: %f' % (scoreDoc.score)) if __name__ == '__main__': lucene.initVM(vmargs=['-Djava.awt.headless=true']) print 'lucene', lucene.VERSION directory = SimpleFSDirectory(Paths.get(os.getcwd(), INDEX_DIR)) searcher = IndexSearcher(DirectoryReader.open(directory)) searcher.setSimilarity(ClassicSimilarity()) analyzer = FrenchAnalyzer() run(searcher, analyzer) del searcher
def run(searcher, analyzer): while True: print print "Hit enter with no input to quit." command = raw_input("Query:") if command == '': return print print "Searching for:", command query = QueryParser("contents", analyzer).parse(command) scoreDocs = searcher.search(query, 50).scoreDocs print "%s total matching documents." % len(scoreDocs) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print 'path:', doc.get("path"), 'name:', doc.get("name") if __name__ == '__main__': STORE_DIR = "/usr/src/pylucene/aclImdb/index" lucene.initVM(vmargs=['-Djava.awt.headless=true']) print 'lucene', lucene.VERSION directory = SimpleFSDirectory(Paths.get(STORE_DIR)) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = StandardAnalyzer() searcher.setSimilarity(BM25Similarity()) run(searcher, analyzer) del searcher
def main(): global lucene_vm_init if not lucene_vm_init: lucene.initVM(vmargs=['-Djava.awt.headless=true']) lucene_vm_init = True is_index_Exist = os.path.exists(LUCENE_INDEX_DIR) # specify index path index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR)) # configure search engine analyzer = StandardAnalyzer() config = IndexWriterConfig(analyzer) # load index to search engine reader = DirectoryReader.open(index_mm) searcher = IndexSearcher(reader) searcher.setSimilarity(BM25Similarity()) # read query read_query() # initialize mongodb client mongoObj = Mongo_Object('localhost', 27017) # initialize word2vec print 'load word2vec model' w2vmodel = gensim.models.Word2Vec.load_word2vec_format( "F:\\modified_w2v\\w2v_wiki_trigram_phrase_20170101\\wiki.en.text.vector.binary", binary=True) print 'finish loading word2vec model' # search global hitsPerPage fields = ['name', 'value'] #parser=MultiFieldQueryParser(fields,analyzer) #parser.setDefaultOperator(QueryParserBase.AND_OPERATOR) rec_result = open('pylucene.runs', 'w') for i in range(len(queries)): query = queries[i] print 'processing query ' + str(i) + ':' + query[0] querystr = remove_duplicate(stemSentence(query[1])) #q_lucene=MultiFieldQueryParser.parse(parser,querystr) q_lucene = QueryParser("all_text", analyzer).parse(querystr) print "q_lucene: " + q_lucene.toString() collector = TopScoreDocCollector.create(hitsPerPage) searcher.search(q_lucene, collector) hits = collector.topDocs().scoreDocs # build query object for computeScore #queryObj=Query_Object(query,mongoObj,w2vmodel) # initialize duplicate remover docDup = set() # find candidate results after 1st round filter candidates = PriorityQueue() for j in xrange(len(hits)): docID = hits[j].doc d = searcher.doc(docID) name = cleanSentence(d['title'].strip()) if name in docDup: continue docDup.add(name) # build entity object entityObj = Entity_Object(d, mongoObj, w2vmodel) #score = computeScore(queryObj,entityObj,mongoObj,w2vmodel) score = hits[j].score candidates.put((-score, j)) # output results from priority queue larger score first rank = 0 while candidates.empty() == False and rank < 100: rank = rank + 1 item = candidates.get() score = -item[0] j = item[1] # index of hits[] docID = hits[j].doc d = searcher.doc(docID) title = '<dbpedia:' + d.get('title') + '>' res_line = query[0] + '\t' + 'Q0' + '\t' + title + '\t' + str( rank) + '\t' + str(score) + '\t' + 'pylucene_multifield' rec_result.writelines(res_line + '\n') rec_result.close()