def __init__(self, root, storedir, isindexing=False, isBM25=True):
    if not os.path.exists(storedir):
        os.mkdir(storedir)
    self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
    if isindexing:
        store = SimpleFSDirectory(Paths.get(storedir))
        config = IndexWriterConfig(self.analyzer)
        # TODO BM25 parameter tuning
        if isBM25:
            config.setSimilarity(BM25Similarity())
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)
        self.indexer(root, writer)
        ticker = Ticker()
        print('commit index')
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print('done')
    search_dir = SimpleFSDirectory(Paths.get(storedir))
    self.searcher = IndexSearcher(DirectoryReader.open(search_dir))
    if isBM25:
        self.searcher.setSimilarity(BM25Similarity())
def __init__(self, index_path, field, similarity="boolean",
             use_relevance_feedback=False, feedback_index_path=None):
    self.reader = DirectoryReader.open(
        FSDirectory.open(Paths.get(index_path)))
    self.searcher = IndexSearcher(self.reader)
    if use_relevance_feedback and feedback_index_path is not None:
        self.feedback_reader = DirectoryReader.open(
            FSDirectory.open(Paths.get(feedback_index_path)))
        self.feedback_searcher = IndexSearcher(self.feedback_reader)
    self.similarity = similarity
    self.stopwords = stop_words()
    if similarity == "boolean":
        self.searcher.setSimilarity(BooleanSimilarity())
    elif similarity == "tf":
        self.searcher.setSimilarity(TFSimilarity())
    elif similarity == "tfidf":
        self.searcher.setSimilarity(ClassicSimilarity())
    elif similarity == "BM25":
        self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
    else:
        print("Unknown similarity, so we use BM25(1.2, 0.2) as default")
        self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
    analyzer = StandardAnalyzer()
    print(self.searcher.getSimilarity())
    self.parser = QueryParser(field, analyzer)
def __init__(self, LUCENE_INDEX_DIR, similarity='BM25', lucene_vm_flag=False,
             is_bigram_cache_used=False, mongoObj=None):
    if lucene_vm_flag == False:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
    self.index_dir = LUCENE_INDEX_DIR
    self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    #self.analyzer = StandardAnalyzer()
    self.analyzer = SimpleAnalyzer()
    self.config = IndexWriterConfig(self.analyzer)
    self.reader = DirectoryReader.open(self.index_mm)
    self.searcher = IndexSearcher(self.reader)
    self.dict_term_freq = {}
    if similarity == 'BM25':
        (self.searcher).setSimilarity(BM25Similarity())
    # load bigram cache
    self.is_bigram_cache_used = is_bigram_cache_used
    if is_bigram_cache_used == True:
        seperate_char = '/' if self.index_dir.find('/') > -1 else '\\'
        index_name = self.index_dir.split(seperate_char)[-1]
        self.index_name = index_name
        self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
        self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
def get_most_similar(self, sentence, do_log=False):
    # print('query string is', string)
    # q = QueryParser('pa', self.analyzer).parse(sentence)
    query_builder = BooleanQuery.Builder()
    for token in sentence.split(' '):
        if token not in sw:
            qtq = TermQuery(Term("pa", token))
            query_builder.add(
                BooleanClause(qtq, BooleanClause.Occur.SHOULD))
    q = query_builder.build()

    hitsPerPage = 2
    reader = DirectoryReader.open(self.w)
    self.searcher = IndexSearcher(reader)
    simi = BM25Similarity(Config.k1, Config.b)
    # simi = ClassicSimilarity()
    self.searcher.setSimilarity(simi)

    docs = self.searcher.search(q, hitsPerPage)
    hits = docs.scoreDocs
    # print("Found " + str(len(hits)) + " hits.")
    if len(hits) > 0:
        mate = self.searcher.doc(hits[0].doc).get("id")
        if do_log:
            print("found something. mate: ", mate, "- score : ", hits[0].score)
        return hits[0], int(mate)
    else:
        return None, -1
def publish_services(self, service_list):
    transformer = WSDLTransformer()
    current_document = 1
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(
        Version.LUCENE_CURRENT, EnglishAnalyzer(Version.LUCENE_CURRENT))
    writerConfig.setSimilarity(BM25Similarity())
    index_writer = IndexWriter(indexDir, writerConfig)
    for wsdl in service_list:
        if self._document_expansion:
            #bag_of_words = ' '.join(self._preprocessor(self._semantic_transformer.transform(transformer.transform(wsdl))))
            bag_of_words = ' '.join(
                self._semantic_transformer.transform(
                    transformer.transform(wsdl)))
        else:
            #bag_of_words = ' '.join(self._preprocessor(transformer.transform(wsdl)))
            bag_of_words = ' '.join(transformer.transform(wsdl))
        doc = Document()
        doc.add(
            Field("content", bag_of_words, Field.Store.YES,
                  Field.Index.ANALYZED))
        doc.add(Field("path", wsdl, Field.Store.YES, Field.Index.NO))
        index_writer.addDocument(doc)
        current_document += 1
    index_writer.close()
def process_q_test(q, out_q):
    lucene.initVM()
    lucene.getVMEnv().attachCurrentThread()
    index = DirectoryReader.open(SimpleFSDirectory(
        Paths.get(robust_index_dir)))
    searcher = IndexSearcher(index)
    searcher.setSimilarity(BM25Similarity())
    analyzer = EnglishAnalyzer()
    qparser = QueryParser("contents", analyzer)
    preprocessor = Preprocess()

    while not exitFlag:
        qid, query = q.get()
        tname = multiprocessing.current_process().name
        # print(tname, qid, query)
        if query == "DONE":
            break
        try:
            # dids, scores = get_lm_matched_docs(query, searcher, qparser, 2000)
            # if len(dids) >= 10:
            #     out_q.put((qid, dids, scores))
            dids_text = get_lm_doc_snippets(query, searcher, qparser,
                                            analyzer, preprocessor)
            out_q.put((qid, dids_text))
        except:
            print('%s exception %s, %s' % (tname, qid, query))
def __init__(self,
             commitTimeout=10,
             commitCount=100000,
             multithreaded=True,
             readonly=False,
             lruTaxonomyWriterCacheSize=4000,
             analyzer=MerescoStandardAnalyzer(),
             similarity=BM25Similarity(),
             fieldRegistry=FieldRegistry(),
             maxMergeAtOnce=2,
             segmentsPerTier=8.0,
             numberOfConcurrentTasks=6,
             verbose=True,
             ):
    self.commitTimeout = commitTimeout
    self.commitCount = commitCount
    self.multithreaded = multithreaded
    self.readonly = readonly
    self.lruTaxonomyWriterCacheSize = lruTaxonomyWriterCacheSize
    self.analyzer = analyzer
    self.similarity = similarity
    self.fieldRegistry = fieldRegistry
    self.maxMergeAtOnce = maxMergeAtOnce
    self.segmentsPerTier = segmentsPerTier
    self.numberOfConcurrentTasks = numberOfConcurrentTasks
    self.verbose = verbose
def __init__(self, LUCENE_INDEX_DIR, similarity='BM25', lucene_vm_flag=False,
             is_bigram_cache_used=False, mongoObj=None):
    if lucene_vm_flag == False:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
    self.index_dir = LUCENE_INDEX_DIR
    self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    self.analyzer = SimpleAnalyzer()
    self.config = IndexWriterConfig(self.analyzer)
    self.reader = DirectoryReader.open(self.index_mm)
    self.searchers = []
    self.searchers.append(IndexSearcher(self.reader))
    if similarity == 'BM25':
        (self.searchers[0]).setSimilarity(BM25Similarity())
    # load bigram cache
    self.is_bigram_cache_used = is_bigram_cache_used
    if is_bigram_cache_used == True:
        seperate_char = '/' if self.index_dir.find('/') > -1 else '\\'
        index_name = self.index_dir.split(seperate_char)[-1]
        self.index_name = index_name
        self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
        self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
        if 'stemmed_wikipedia' in LIST_F or 'wikipedia' in LIST_F:
            self.conn_mapping_prob_cache = mongoObj.db[
                index_name + '_mapping_prob_cache_with_wikipedia']
        else:
            self.conn_mapping_prob_cache = mongoObj.db[
                index_name + '_mapping_prob_cache']
def __init__(self):
    indexdir = './IndexFiles.index'
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    search_dir = SimpleFSDirectory(Paths.get(indexdir))
    self.searcher = IndexSearcher(DirectoryReader.open(search_dir))
    self.searcher.setSimilarity(BM25Similarity())
    self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
    self.lemmatizer = nltk.stem.WordNetLemmatizer()
def run(self):
    print("Starting " + self.name)
    lucene.getVMEnv().attachCurrentThread()
    index = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(robust_index_dir)))
    searcher = IndexSearcher(index)
    searcher.setSimilarity(BM25Similarity())
    analyzer = EnglishAnalyzer()
    qparser = QueryParser("contents", analyzer)
    # process_query(self.name, self.q, self.out_q, searcher, qparser)
    print("Exiting " + self.name)
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score

        doc_score_list = list(doc_score(hists))
        # feature_type is a list of functions
        return map(lambda f: f(doc_score_list), feature_type) \
            if len(doc_score_list) != 0 else [0] * len(feature_type)

    # escape special characters via escape function
    query = QueryParser(version, 'text',
                        analyzer).parse(QueryParser.escape(q_string))

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)
    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters
    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists
    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
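# A hedged usage sketch for lucene_retrieval() above (not part of the original
# source): feature_type is a list of aggregation functions, each applied to the
# list of BM25 hit scores, so one feature value is produced per function.
# Assuming the module-level globals (set_lucene_index, analyzer, version,
# hitsPerPage) have already been set up, a call could look like:
#
#   features = lucene_retrieval("Why is the sky blue?",
#                               feature_type=[max, min, sum],
#                               use_BM25=True)
#   # -> one aggregated score per function in feature_type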
def __init__(self):
    self.env = lucene.initVM(initialheap='6g',
                             maxheap='6g',
                             vmargs=['-Djava.awt.headless=true'])
    self.vocab = None

    BooleanQuery.setMaxClauseCount(2048)

    if not os.path.exists(prm.index_folder):
        print('Creating index at', prm.index_folder)
        if prm.docs_path == prm.docs_path_term:
            add_terms = True
        else:
            add_terms = False
        self.create_index(prm.index_folder, prm.docs_path, add_terms)

    if prm.local_index_folder:
        print('copying index from', prm.index_folder, 'to',
              prm.local_index_folder)
        if os.path.exists(prm.local_index_folder):
            print('Folder', prm.local_index_folder,
                  'already exists! Doing nothing.')
        else:
            shutil.copytree(prm.index_folder, prm.local_index_folder)
        self.index_folder = prm.local_index_folder
    else:
        self.index_folder = prm.index_folder

    fsDir = MMapDirectory(Paths.get(prm.index_folder))
    self.searcher = IndexSearcher(DirectoryReader.open(fsDir))
    self.searcher.setSimilarity(BM25Similarity())

    if prm.docs_path != prm.docs_path_term:
        if not os.path.exists(prm.index_folder_term):
            print('Creating index at', prm.index_folder_term)
            self.create_index(prm.index_folder_term,
                              prm.docs_path_term,
                              add_terms=True)
        if prm.local_index_folder_term:
            print('copying index from', prm.index_folder_term, 'to',
                  prm.local_index_folder_term)
            if os.path.exists(prm.local_index_folder_term):
                print('Folder', prm.local_index_folder_term,
                      'already exists! Doing nothing.')
            else:
                shutil.copytree(prm.index_folder_term,
                                prm.local_index_folder_term)
            self.index_folder_term = prm.local_index_folder_term
        else:
            self.index_folder_term = prm.index_folder_term
        fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
        self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term))

    self.analyzer = StandardAnalyzer()
    self.pool = ThreadPool(processes=prm.n_threads)
    self.cache = {}

    print('Loading Text-ID mapping...')
    self.text_id_map, self.id_text_map = self.get_text_id_map()
def _setReadSettings(self, similarity=None, numberOfConcurrentTasks=None):
    # This method must be thread-safe
    if similarity is None:
        self._similarity = self._settings.similarity
    else:
        self._similarity = BM25Similarity(similarity["k1"], similarity["b"])
    if numberOfConcurrentTasks is None:
        self._numberOfConcurrentTasks = self._settings.numberOfConcurrentTasks
    else:
        self._numberOfConcurrentTasks = numberOfConcurrentTasks
    self._reopenSearcher = True
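# A minimal usage sketch for _setReadSettings() above (hypothetical caller and
# values, not from the original source): passing a dict with "k1" and "b"
# switches the searcher to a custom-tuned BM25Similarity, while passing None
# falls back to the defaults held in self._settings.
#
#   index._setReadSettings(similarity={"k1": 1.2, "b": 0.75},
#                          numberOfConcurrentTasks=4)
#   index._setReadSettings()  # revert to the configured defaults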
def find(self, query):
    transformer = StringTransformer()
    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity())
    processed_query = ' '.join(
        self._preprocessor(transformer.transform(query)))
    query = QueryParser(Version.LUCENE_CURRENT, "content",
                        analyzer).parse(processed_query)
    hits = searcher.search(query, 10)
    result_list = []
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        result_list.append(doc.get("path").encode("utf-8"))
    return result_list
def __init__(self, LUCENE_INDEX_DIR, similarity='BM25', lucene_vm_flag=False):
    if lucene_vm_flag == False:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
    self.index_dir = LUCENE_INDEX_DIR
    self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    self.analyzer = StandardAnalyzer()
    self.config = IndexWriterConfig(self.analyzer)
    self.reader = DirectoryReader.open(self.index_mm)
    self.searcher = IndexSearcher(self.reader)
    self.dict_term_freq = {}
    self.dict_doc_field_title = {}
    if similarity == 'BM25':
        (self.searcher).setSimilarity(BM25Similarity())
def lucene_retrieval(q_string, use_BM25=False):
    """
    :param q_string:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def doc_text(hists):
        """
        return doc_name & score
        :param hists:
        """
        text = '_NONE_'
        for h in hists:
            docID = h.doc
            doc = searcher.doc(docID)
            # file_name = doc.get("corpus_name")
            # doc_name = doc.get("doc_name")
            text = doc.get("text")
            #score = h.score
            # yield (file_name, doc_name, score, text)
        return text

    result = '_NONE_'
    # escape special characters via escape function
    if q_string and q_string.strip():  # when pre-processing answers, `none of the above` -> '' causes an error here
        #print(q_string)
        query = QueryParser(version, 'text',
                            analyzer).parse(QueryParser.escape(q_string))

        # search
        reader = IndexReader.open(index)
        searcher = IndexSearcher(reader)
        if use_BM25:
            searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters
        collector = TopScoreDocCollector.create(hitsPerPage, True)
        searcher.search(query, collector)
        hs = collector.topDocs().scoreDocs  # hists
        result = doc_text(hs)
        # reader.close()
    return result  # text: also nodes
def __init__(self, lang=None, dataset=None, analyzer=None, index_path=None,
             k1=None, b=None):
    super().__init__(k1, b)
    print("Searcher k1: {}, b: {}".format(self.k1, self.b))
    self.similarity = BM25Similarity(self.k1, self.b)
    self.searcher = {}
    self.parser = {}
    self.languages = []
    self.lang = lang
    self.dataset = dataset
    self.__call__ = self.query
    if lang != None or dataset != None or analyzer != None:
        self.addLang(lang, dataset, analyzer, index_path)
def get_sorted_results(self, query):
    SHOULD = BooleanClause.Occur.SHOULD
    parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, query,
                                               ['docno', 'content'],
                                               [SHOULD, SHOULD],
                                               self.analyzer)
    reader = IndexReader.open(self.directory)
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity())
    topDocs = searcher.search(parsed_query, 10)
    j = 0
    for i in topDocs.scoreDocs:
        d = searcher.doc(i.doc)
        print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(i.score)
        j += 1
def __init__(self):
    wikidir = './wiki-pages-text'
    indexdir = './IndexFiles.index'
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    if not os.path.exists(indexdir):
        os.mkdir(indexdir)
    self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
    store = SimpleFSDirectory(Paths.get(indexdir))
    config = IndexWriterConfig(self.analyzer)
    config.setSimilarity(BM25Similarity())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    self.indexer(wikidir, writer)
    ticker = Ticker()
    print('commit index')
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print('done')
def open_searcher(self):
    self.reader = DirectoryReader.open(self.directory)
    self.searcher = IndexSearcher(self.reader)
    if self.similarity == "bm25":
        self.searcher.setSimilarity(BM25Similarity())
import argparse

parser = argparse.ArgumentParser(
    description='Execute queries on comment body')
parser.add_argument('user_name', type=str,
                    help="User name (profile to use)")
parser.add_argument('index_dir', metavar='dir', type=str,
                    help="Index directory")
parser.add_argument('--sim', type=str, nargs='?', default="tfidf",
                    help="Similarity (in [tfidf, lm, bm25])")
parser.add_argument('--reorder', type=str, nargs='?', default="no",
                    help="Reordering (in [ups, normups])")
parser.add_argument('--short', action='store_false',
                    help="Don't show the body of comments")
args = parser.parse_args()

if args.sim in ['bm25']:
    similarity = BM25Similarity()
elif args.sim in ['lm']:
    similarity = LMDirichletSimilarity()
else:
    similarity = ClassicSimilarity()

# Sample query
storeDir = SimpleFSDirectory(Paths.get(args.index_dir))
searcher = IndexSearcher(DirectoryReader.open(storeDir))
if similarity is not None:
    searcher.setSimilarity(similarity)
analyzer = StandardAnalyzer()

run(searcher, analyzer, args.user_name,
    reordering=args.reorder, show_bodies=not args.short)
def main():
    global lucene_vm_init
    if not lucene_vm_init:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)

    # load index to search engine
    reader = DirectoryReader.open(index_mm)
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity())

    # read query
    read_query()

    # initialize mongodb client
    mongoObj = Mongo_Object('localhost', 27017)

    # initialize word2vec
    print 'load word2vec model'
    w2vmodel = gensim.models.Word2Vec.load_word2vec_format(
        "F:\\modified_w2v\\w2v_wiki_trigram_phrase_20170101\\wiki.en.text.vector.binary",
        binary=True)
    print 'finish loading word2vec model'

    # search
    global hitsPerPage
    fields = ['name', 'value']
    #parser=MultiFieldQueryParser(fields,analyzer)
    #parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    rec_result = open('pylucene.runs', 'w')
    for i in range(len(queries)):
        query = queries[i]
        print 'processing query ' + str(i) + ':' + query[0]
        querystr = remove_duplicate(stemSentence(query[1]))
        #q_lucene=MultiFieldQueryParser.parse(parser,querystr)
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        print "q_lucene: " + q_lucene.toString()
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs

        # build query object for computeScore
        #queryObj=Query_Object(query,mongoObj,w2vmodel)

        # initialize duplicate remover
        docDup = set()

        # find candidate results after 1st round filter
        candidates = PriorityQueue()
        for j in xrange(len(hits)):
            docID = hits[j].doc
            d = searcher.doc(docID)
            name = cleanSentence(d['title'].strip())
            if name in docDup:
                continue
            docDup.add(name)
            # build entity object
            entityObj = Entity_Object(d, mongoObj, w2vmodel)
            #score = computeScore(queryObj,entityObj,mongoObj,w2vmodel)
            score = hits[j].score
            candidates.put((-score, j))

        # output results from priority queue, larger score first
        rank = 0
        while candidates.empty() == False and rank < 100:
            rank = rank + 1
            item = candidates.get()
            score = -item[0]
            j = item[1]  # index of hits[]
            docID = hits[j].doc
            d = searcher.doc(docID)
            title = '<dbpedia:' + d.get('title') + '>'
            res_line = query[0] + '\t' + 'Q0' + '\t' + title + '\t' + str(
                rank) + '\t' + str(score) + '\t' + 'pylucene_multifield'
            rec_result.writelines(res_line + '\n')
    rec_result.close()
def main():
    global lucene_vm_init
    if not lucene_vm_init:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)

    # load index to search engine
    reader = DirectoryReader.open(index_mm)
    searcher1 = IndexSearcher(reader)
    searcher1.setSimilarity(BM25Similarity())
    searcher2 = IndexSearcher(reader)
    w = IndexWriter(index_mm, config)

    # read query
    read_query()

    # initialize mongodb client
    mongoObj = Mongo_Object('localhost', 27017)

    # search
    docDup = set()
    finalDup = {}
    for i in xrange(len(queries)):
        print 'process query %d' % (i)
        query = queries[i]
        querystr = stemSentence(query[3])

        # build searcher
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher1.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs

        # find candidate results after 1st round filter
        docDup.clear()
        for j in xrange(len(hits)):
            docID = hits[j].doc
            d = searcher1.doc(docID)
            if d['title'] in docDup:
                finalDup[d['title']] = d
                continue
            docDup.add(d['title'])

        docDup.clear()
        for j in xrange(len(hits)):
            docID = hits[j].doc
            d = searcher1.doc(docID)
            title = d['title']
            if d['title'] in docDup:
                continue
            docDup.add(title)
            item = (mongoObj.conn_me).find_one({'title': title})
            if item is None:
                continue
            entitylist = item['entitylist'].split('|')
            for en_title in entitylist:
                if title == en_title:
                    continue
                t = Term('title', en_title)
                q = TermQuery(t)
                docs = searcher2.search(q, 2)
                if docs.totalHits <= 1:
                    continue
                docID2 = (docs.scoreDocs)[0].doc
                doc = searcher2.doc(docID2)
                finalDup[doc['title']] = doc

    print 'begin to clean index, there are %d dup records' % (len(finalDup))
    for title in finalDup:
        doc = finalDup[title]
        # title,name,value,category,skos_category,all_text,raw_name,raw_value,abstract
        name = doc['name']
        value = doc['value']
        category = doc['category']
        skos_category = doc['skos_category']
        all_text = doc['all_text']
        raw_name = doc['raw_name']
        raw_value = doc['raw_value']
        abstract = doc['abstract']
        print 'process ' + title
        t = Term('title', title)
        q = TermQuery(t)
        w.deleteDocuments(q)
        addDoc(w, title, name, value, category, skos_category, all_text,
               raw_name, raw_value, abstract)

    # process remaining records
    #global batch,cnt_batch
    #if cnt_batch>0:
    #    w.addDocuments(batch)
    #    cnt_batch=0
    #    del batch[:]
    w.close()
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    multifield: different query string for different field,
    not the same word on different fields
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score

        doc_score_list = list(doc_score(hists))
        # feature_type is a list of functions
        return map(lambda f: f(doc_score_list), feature_type)

    text_query = QueryParser(version, 'text',
                             analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name',
                                analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur
    # MUST implies that the keyword must occur
    # SHOULD implies that the keyword SHOULD occur
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)
    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters
    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists
    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
def createIndexSearcher(indexDir):
    directory = DirectoryReader.open(FSDirectory.open(Paths.get(indexDir)))
    searcher = IndexSearcher(directory)
    similarity = BM25Similarity(K1, B)
    searcher.setSimilarity(similarity)
    return searcher
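# A minimal sketch (not from the original source) of how a searcher returned by
# createIndexSearcher() above might be queried; the "contents" and "path" field
# names are assumptions, and the imports mirror the PyLucene classes already
# used in these snippets.
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.queryparser.classic import QueryParser


def run_query(searcher, query_text, topk=10):
    # parse the free-text query against the assumed "contents" field
    parser = QueryParser("contents", StandardAnalyzer())
    query = parser.parse(QueryParser.escape(query_text))
    # hit scores reflect the BM25Similarity(K1, B) set on the searcher
    for hit in searcher.search(query, topk).scoreDocs:
        doc = searcher.doc(hit.doc)
        print("%.4f\t%s" % (hit.score, doc.get("path")))


# Example (hypothetical index path):
# run_query(createIndexSearcher("/path/to/index"), "bm25 ranking", 5)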
""" import time import json import lucene import nltk from org.apache.lucene import queryparser, analysis from org.apache.lucene.search.similarities import BM25Similarity from lupyne import engine start_time = time.time() lucene.initVM() dest = "lucene_wikis.index" indexer = engine.Indexer(dest) indexer.setSimilarity(BM25Similarity()) analyzer = analysis.standard.StandardAnalyzer() parser = queryparser.classic.QueryParser("text", analyzer) prediction = {} word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer() nltk.download('averaged_perceptron_tagger') related_sentences = [] def re_process_string(str): str = str.replace(" ", "_") str = str.replace("(", "-LRB-") str = str.replace(")", "-RRB-") return str
# for txtName in gutenberg_list:
#     words = nltk.corpus.gutenberg.words(txtName)
#     sents = " ".join(words).split(".")
#     print(sents[:100])
#     # print("Indexing ", txtName, "...")
#     # for i in range(0, len(sents), 10):
#     #     text = " ".join(sents[i:i+10])
#     #     doc = Document()
#     #     doc.add(Field("fieldname", text, TextField.TYPE_STORED))
#     #     iwriter.addDocument(doc)
#     # iwriter.close()

# now search the index
ireader = DirectoryReader.open(directory)
isearcher = IndexSearcher(ireader)

# set similarity method
bm25 = BM25Similarity()
isearcher.setSimilarity(bm25)

# parse a simple query that searches for "text"
parser = QueryParser("fieldname", analyzer)
query = parser.parse("her sister was reading")
hits = isearcher.search(query, 5).scoreDocs
print(len(hits))

for hit in hits:
    result = isearcher.doc(hit.doc)
    print("[%8.4f] %s" % (hit.score, result.get("fieldname")))
]

answers = [item["answer"] for item in qas]

print("Loading Lucene Index ...")
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
analyzer = StandardAnalyzer()
searchDir = NIOFSDirectory(Paths.get(args.index_path))
searcher = IndexSearcher(DirectoryReader.open(searchDir))

# try tuning the hyperparameters of bm25
for k1 in [0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2]:
    for b in [0.5, 0.6, 0.7, 0.8, 0.9]:
        print(f"Grid search.... k1: {k1}; b: {b}")
        searcher.setSimilarity(BM25Similarity(k1, b))
        parser = QueryParser('Context', analyzer)
        retrieved = []
        print("Searching ...")
        for q in tqdm(questions):
            query = parser.parse(QueryParser.escape(q))
            # print(q, "|", QueryParser.escape(q), "|", query)
            # import pdb; pdb.set_trace()
            scoreDocs = searcher.search(query, args.topk).scoreDocs
            topkDocs = []
            for hit in scoreDocs:
                doc = searcher.doc(hit.doc)
                topkDocs.append({
                    "title": doc.get("Title"),
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return
        print
        print "Searching for:", command
        query = QueryParser("contents", analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), 'name:', doc.get("name")


if __name__ == '__main__':
    STORE_DIR = "/usr/src/pylucene/aclImdb/index"
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    directory = SimpleFSDirectory(Paths.get(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer()
    searcher.setSimilarity(BM25Similarity())
    run(searcher, analyzer)
    del searcher
def __init__(self, lang, dataset, analyzer, index_path=None, data_path=None,
             ram_size=2048):
    """
    Returns scored documents in multiple languages.

    Parameters:
    dataset (str): ['mlqa_dev', 'mlqa_test', 'wiki']
    lang (str): ['en', 'es', 'de']
    analyzer (str): ['en', 'es', 'de', 'standard']
    ram_size (int): Size of memory used while indexing

    Returns:
    """
    super().__init__()
    idxdir = self.get_index(lang, dataset, index_path)
    self.mlqa = True
    if dataset == 'mlqa_dev':
        self.dataset = MLQADataset('dev', lang, lang, data_path)
    elif dataset == 'mlqa_test':
        self.dataset = MLQADataset('test', lang, lang, data_path)
    elif dataset == 'wiki':
        self.mlqa = False
        self.dataset = Wiki(lang, data_path)
    else:
        raise RuntimeError("No dataloader for {}".format(dataset))

    # stores index files; for poor concurrency try NIOFSDirectory instead
    store = SimpleFSDirectory(Paths.get(idxdir))

    # limit max. number of tokens per document;
    # the analyzer will not consume more tokens than that
    #analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)

    # configuration for index writer
    config = IndexWriterConfig(analyzers[analyzer]())
    # creates or overwrites index
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    # setting similarity BM25Similarity(k1=1.2, b=0.75)
    similarity = BM25Similarity(self.k1, self.b)
    config.setSimilarity(similarity)
    config.setRAMBufferSizeMB(float(ram_size))
    # create index writer
    self.writer = IndexWriter(store, config)

    self.ftdata = FieldType()
    self.ftmeta = FieldType()
    # IndexSearcher will return the value of the field
    self.ftdata.setStored(True)
    self.ftmeta.setStored(True)
    # will be analyzed by Analyzer
    self.ftdata.setTokenized(True)
    self.ftmeta.setTokenized(False)
    # what information is stored (probably DOCS would be sufficient)
    # DOCS: Only documents are indexed: term frequencies and positions are
    #   omitted. Phrase and other positional queries on the field will throw
    #   an exception, and scoring will behave as if any term in the document
    #   appears only once.
    # DOCS_AND_FREQS: Only documents and term frequencies are indexed:
    #   positions are omitted. This enables normal scoring, except Phrase and
    #   other positional queries will throw an exception.
    # DOCS_AND_FREQS_AND_POSITIONS: Indexes documents, frequencies and
    #   positions. This is a typical default for full-text search: full
    #   scoring is enabled and positional queries are supported.
    self.ftdata.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    self.ftmeta.setIndexOptions(IndexOptions.DOCS)

    # instantiate some reusable objects
    # TODO: create the document, add the fields, then change only the field
    # values and re-add the document
    self.doc = Document()
    # Id cannot be reused because there are multiple values.
    # I could store a list of fields and add one if it is not enough.
    #self.fieldId = Field("id", "dummy", self.ftmeta)
    self.fieldTitle = Field("title", "dummy", self.ftdata)
    self.doc.add(self.fieldTitle)
    self.fieldContext = Field("context", "dummy", self.ftdata)
    self.doc.add(self.fieldContext)
    self.fieldIds = [Field("id", "dummy", self.ftmeta)]