def irsolver(data_file, index):
    from questions import get_input_data
    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(index)))
    searcher = IndexSearcher(reader)
    pred = []
    mapp = {1: 'A', 2: 'B', 3: 'C', 4: 'D'}
    idx, ques, ans = get_input_data(data_file)
    for acm, (idq, q, a) in enumerate(zip(idx, ques, ans)):
        max_score = -1000000
        best_ans = 'A'
        for i, ai in enumerate(a):
            sc = query(q, ai, analyzer, searcher)
            print(acm, i, sc)
            if sc > max_score:
                max_score = sc
                best_ans = mapp[i + 1]
        pred.append(best_ans)
    return idx, pred
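# `query(q, ai, analyzer, searcher)` is not defined in the snippet above. A
# minimal sketch of what it plausibly does (score one question/answer pair
# against the index); the "text" field name and the summed-score aggregation
# are assumptions, not the original implementation:
def query(question, answer, analyzer, searcher, max_hits=10):
    q = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(
        QueryParser.escape(question + ' ' + answer))
    hits = searcher.search(q, max_hits)
    return sum(hit.score for hit in hits.scoreDocs)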
def getRandomDoc2():
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    MAX = 1000
    docNum = randrange(0, reader.maxDoc())
    doc = reader.document(docNum)
    files = []
    fileRoots = []
    paths = [doc.get("articlepath")]
    pth = paths[0].replace("/home/kevin/Downloads/",
                           "/home/kevin/git/YIF/imageFinder/web/static/web/")
    for root, directories, filenames in os.walk(pth):
        for filename in filenames:
            # the original test `(".jpg" or ".gif" or ".png") in filename`
            # only ever checked for ".jpg"; endswith fixes that
            if filename.endswith((".jpg", ".gif", ".png")):
                files.append(root.replace(
                    "/home/kevin/git/YIF/imageFinder/web/static/web/", "") + "/" + filename)  # temporary; will need to change
                fileRoots.append(root)
                print(root.replace(
                    "/home/kevin/git/YIF/imageFinder/web/static/web/", "") + "/" + filename)
    if not files:
        return -1
    return files[randrange(0, len(files))]
def get_candidates(qatp):
    if prm.create_index:
        create_index()
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    n = 0
    for q, a, t, p in qatp:
        if n % 100 == 0:
            print 'finding candidates sample', n
        n += 1
        q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        c = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            c.append(doc.get("id"))
        candidates.append(c)
    return candidates
def shourcut_retriever(keyword):
    '''Retriever: searches the "shortcut" (summary) field of the index.'''
    global flag
    if flag:
        lucene.initVM()
        flag = False
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)
    query = QueryParser(Version.LUCENE_4_10_1, "shortcut", analyzer).parse(keyword)
    MAX = 20
    hits = searcher.search(query, MAX)
    print("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))
    results = []
    for hit in hits.scoreDocs:
        print(hit.score, hit.doc, hit.toString())
        doc = searcher.doc(hit.doc)
        result = [doc.get('shortcut'), doc.get('url'), doc.get('name')]
        print(doc.get('url'))
        results.append(result)
    return results
def load_index(self):
    indexDir = File(self.index_path)
    index = SimpleFSDirectory(indexDir)
    self.reader = IndexReader.open(index)
    n_docs = self.reader.numDocs()
    self.searcher = IndexSearcher(self.reader)
    print("Index contains %d documents." % n_docs)
def __init__(self, index_path):
    # Load the indices via Lucene and initialize the reader and searcher.
    indexDir = File(index_path)
    index = SimpleFSDirectory(indexDir)
    # Note: the IndexReader was previously opened but never closed.
    self.reader = IndexReader.open(index)
    self.searcher = IndexSearcher(self.reader)
def retrieve(indexdir, queries):
    lucene.initVM()
    f = open("results_lucene.txt", "w")
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(indexdir)))
    searcher = IndexSearcher(reader)
    fields = ["title", "abstract", "authors"]
    st = PorterStemmer()
    for id, q in queries.iteritems():
        # stem the query terms before parsing
        tokenizer = RegexpTokenizer(r'\w+')
        qwords = tokenizer.tokenize(q)
        qwords_k = [st.stem(w) for w in qwords]
        query = " ".join(qwords_k)
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query = MultiFieldQueryParser.parse(parser, query)
        MAX = 1000
        hits = searcher.search(query, MAX)
        # write one result line per hit
        for i, hit in enumerate(hits.scoreDocs):
            f.write("%s Q0 %s %s %s G17R3\n" % (id, hit.doc + 1, i + 1, hit.score))
    f.close()
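# The write above emits the standard TREC run format, one line per hit:
#   <query-id> Q0 <doc-id> <rank> <score> <run-tag>
# e.g. "12 Q0 305 1 7.81 G17R3", where G17R3 is the run tag hard-coded in the
# snippet (the example line itself is illustrative).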
def get_image_pmcid(pmcid, classes=""):
    fields = ["pmcid", "class"]
    docs = []
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    # multi-field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    MAX = 10000
    if classes == "all":
        queryStr = "pmcid:(" + ' '.join(pmcid) + ")"
    else:
        queryStr = "pmcid:(" + ' '.join(pmcid) + ")" + " AND class:" + classes
    query = QueryParser(Version.LUCENE_4_10_1, "pmcid", analyzer)  # needed to build a custom query
    q = query.parse(queryStr)
    hits = searcher.search(q, MAX)
    for hit in hits.scoreDocs:
        docs.append(searcher.doc(hit.doc))
    return docs  # the image documents that belong to a pmcid (article)
def evaluate_index(index_dir, context, analyzer):
    # eval time of indexing (overall); we should also measure the elapsed
    # time of each index_document call separately
    start = time.clock()
    Indexer(index_dir, context, analyzer)
    end = time.clock()
    duration = end - start

    directory = SimpleFSDirectory(File(index_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()
    # size() may return -1; in that case count the terms manually
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)
    # other per-field stats: vocabulary.getDocCount(),
    # vocabulary.getSumTotalTermFreq(), vocabulary.getSumDocFreq()
    reader.close()
    return duration, vocab_size
def __init__(self, lucene_index_dir='lucene_index/', num_docs_to_return=1000):
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    directory = SimpleFSDirectory(File(lucene_index_dir))
    self.searcher = IndexSearcher(DirectoryReader.open(directory))
    self.num_docs_to_return = num_docs_to_return
    self.ireader = IndexReader.open(directory)
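# A sketch (not in the original) of how the wrapper above might be queried;
# the "text" field name and the analyzer choice are assumptions:
def search_index(wrapper, q_string):
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(
        QueryParser.escape(q_string))
    hits = wrapper.searcher.search(query, wrapper.num_docs_to_return)
    return [wrapper.searcher.doc(hit.doc) for hit in hits.scoreDocs]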
def __init__(self, path):
    print "Loading data.json..."
    with open(path, "r") as f:
        self.data = json.load(f)
    lucene.initVM()
    self.analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    self.reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    self.searcher = IndexSearcher(self.reader)
def __init__(self, lucene_dir_path):
    if lucene_dir_path is not None and lucene_dir_path != '':
        lucene.initVM()
        directory = SimpleFSDirectory(File(lucene_dir_path))
        self.indexReader = IndexReader.open(directory)
        self.is_init = True
    else:
        self.is_init = False
def SearchQuery(queryString, fields, classification):
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    # multi-field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    query = MultiFieldQueryParser.parse(query, queryString)
    MAX = 10000
    hits = searcher.search(query, MAX)
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    pmcids = []
    documentDict = {}
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        pmcids.append(doc.get("pmcid"))
        docDict = {"title": doc.get("title")}  # any other stored field could be added here
        documentDict[doc.get("pmcid")] = docDict

    # fetch the images for all the pmcids
    images = get_image_pmcid(pmcids, classification)
    # dictionary of image paths keyed by pmcid
    imagesDict = {}
    for img in images:
        img_pmcid = img.get("pmcid")
        if img_pmcid in imagesDict:
            imagesDict[img_pmcid].append(img.get("filepath") + "/" + img.get("figureid"))
        else:
            imagesDict[img_pmcid] = [img.get("filepath") + "/" + img.get("figureid")]

    # assign one image per pmcid for the search results; the original checked
    # `if imagesDict:`, which raises KeyError for a pmcid that has no images,
    # so check membership instead
    for pmcid in pmcids:
        docDict = documentDict[pmcid]
        if pmcid in imagesDict:
            docDict["imgURL"] = imagesDict[pmcid][0]
        else:
            docDict["imgURL"] = "images/NoImageAvailable.jpg"
        documentDict[pmcid] = docDict

    return documentDict
def __init__(self, lucene_index_dir='/dccstor/cssblr/amrita/dialog_qa/code/prepro_lucene/lucene_index/'):
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    directory = SimpleFSDirectory(File(lucene_index_dir))
    self.searcher = IndexSearcher(DirectoryReader.open(directory))
    self.num_docs_to_return = 5
    self.ireader = IndexReader.open(directory)
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    multifield: a different query string for each field, not the same words
    on every field
    :param q_string:
    :param q_class:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        aggregate the hit scores with each feature function
        :param hists:
        """
        def doc_score(hists):
            """
            yield the score of each hit
            :param hists:
            """
            for h in hists:
                yield h.score
        doc_score_list = list(doc_score(hists))
        # feature_type is a list of aggregation functions
        return map(lambda f: f(doc_score_list), feature_type)

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()
    # BooleanClause.Occur: MUST means the clause must match, SHOULD that it may
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)
    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters
    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hits
    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
def search_results(search):
    search_string = search.data['search']
    print "searching: " + search_string
    lucene.initVM()
    analyzer = SpanishAnalyzer()
    reader = IndexReader.open(SimpleFSDirectory(File(indexDirectory)))
    searcher = IndexSearcher(reader)
    query = QueryParser("text", analyzer).parse(search_string)
    MAX = 1000
    hits = searcher.search(query, MAX)
    if not hits.totalHits:
        flash('No results found!')
        return redirect('/')
    flash("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))
    items = []
    for hit in hits.scoreDocs:
        if len(items) > 10:
            flash('Returning only first 10 results')
            break
        doc = searcher.doc(hit.doc)
        items.append(Row(
            hit.score,
            doc.get("tipo_sesion"),
            doc.get("organo"),
            doc.get("presidente"),
            doc.get("dia") + "/" + doc.get("mes") + "/" + doc.get("anio"),
            doc.get("tipo_epigrafe"),
            doc.get("text").replace(
                search_string,
                '<span class="highlightme">' + search_string + '</span>'),
            '<a href="' + doc.get("filename") + '">' + doc.get("filename") + '</a>'))
    return render_template('index.html', form=search, items=items,
                           search_string=search_string)
def load_index(self):
    indexDir = File(self.index_path)
    a = {"code": self.porter_analyzer}
    self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
    index = SimpleFSDirectory(indexDir)
    self.reader = IndexReader.open(index)
    n_docs = self.reader.numDocs()
    self.searcher = IndexSearcher(self.reader)
    print("Index contains %d documents." % n_docs)
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        aggregate the hit scores with each feature function
        :param hists:
        """
        def doc_score(hists):
            """
            yield the score of each hit
            :param hists:
            """
            for h in hists:
                yield h.score
        doc_score_list = list(doc_score(hists))
        # feature_type is a list of aggregation functions; fall back to
        # zeros when there are no hits
        return map(lambda f: f(doc_score_list), feature_type) \
            if len(doc_score_list) != 0 else [0] * len(feature_type)

    # escape special characters via the escape function
    query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)
    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters
    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hits
    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
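# Hedged usage sketch for lucene_retrieval: `feature_type` is a list of
# aggregation functions applied to the list of hit scores, e.g.
#
#   max_sum_count = lucene_retrieval("Why is the sky blue?",
#                                    [max, sum, len], use_BM25=True)
#
# (`version`, `analyzer`, `hitsPerPage` and `set_lucene_index` are
# module-level globals in the original project.)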
def xmlrpc_getStatistics(self, instance):
    reader = IndexReader.open(self.indexPath)
    filter = RangeFilter('instance', instance, instance, 1, 1)
    num = filter.bits(reader).cardinality()
    stat = Vector()
    stat.add(num)
    stat.add(0)  # len(index.terms())
    reader.close()
    return stat
def get_wiki_docids(data_file, wikipedia_index):
    from questions import get_input_data
    data = get_input_data(data_file)
    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(wikipedia_index)))
    searcher = IndexSearcher(reader)
    generate_docids(data, data_file, analyzer, searcher)
def __init__(self):
    INDEXDIR = './Myindex'
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    self.directory = SimpleFSDirectory(File(INDEXDIR))
    self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
    self.analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    self.reader = IndexReader.open(self.directory)
def group_tests():
    TP = 0.0
    FN = 0.0
    n = 0.0
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File("./articleTitleIndex/")))
    searcher = IndexSearcher(reader)
    with open('Labeled800Queries/labeler3.txt', 'r') as f:
        for line in f:
            n += 1
            line = line.split('\t')
            user_query = line[0]
            labels = line[1:]
            user_query = re.sub('[^0-9a-zA-Z]+', ' ', user_query)
            print user_query
            print labels
            res = predict(user_query, analyzer, reader, searcher, test="group")
            converted_res = []
            for label in res:
                converted_res.append(cvt.WikiToKDD[label[0].replace('_', ' ')])
            if not res:
                print "empty goal category set"
            print converted_res
            # compare labels and converted_res
            for label in labels:
                label = label.replace('\r', '').replace('\n', '')
                if label not in cvt.WikiToKDD.values():
                    continue
                if label in converted_res:
                    TP += 1.0
                else:
                    FN += 1.0
            print "=========================================================="
    precision = TP / (SIZE * n)
    recall = TP / (TP + FN)
    print "precision:", precision
    print "recall:", recall
def find(self, query):
    transformer = StringTransformer()
    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity())
    processed_query = ' '.join(self._preprocessor(transformer.transform(query)))
    query = QueryParser(Version.LUCENE_CURRENT, "content", analyzer).parse(processed_query)
    # `get_description` is not an IndexSearcher method; `search` is intended here
    hits = searcher.search(query, 10)
    result_list = []
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        result_list.append(doc.get("path").encode("utf-8"))
    return result_list
def make_request(query, analyzer, index, qparser_regexp=None, max_results=100):
    """
    Returns a list of ScoreDoc objects, which expose `doc` (the Lucene
    document ID) and `score` attributes.
    :param query:
    :param analyzer:
    :param index:
    :param qparser_regexp:
    :param max_results:
    :return:
    """
    reader = IndexReader.open(SimpleFSDirectory(File(index)))
    searcher = IndexSearcher(reader)
    query = QueryParser(Version.LUCENE_30, "content", analyzer).parse(
        query if not qparser_regexp else re.sub(qparser_regexp, " ", query))
    hits = searcher.search(query, max_results)
    return hits.scoreDocs
def delete(primary_keys_map, collection_name, todelete, commit=False):
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    try:
        tofind_keyvalue_pairs = json.loads(todelete)
    except ValueError:
        return 100
    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    # writer configuration
    try:
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
        ireader = IndexReader.open(direc)
    except Exception:
        return 105

    # as of now, deletion of documents is supported only for indexed keys
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    # separate out primary and non-primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    # filter documents according to primary keys
    query = BooleanQuery()
    for key in tofind_primary_keyvalue_pairs.keys():
        temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(
            tofind_primary_keyvalue_pairs[key])
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

    writer.deleteDocuments(query)
    if commit:
        writer.commit()
    writer.close()
    return 0  # the original returned the octal literal 000
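# Hedged usage sketch for delete(): `todelete` is a JSON object of key/value
# pairs, and only keys listed in `primary_keys_map` are used for matching.
# The collection name and key below are hypothetical:
#
#   status = delete(["id"], "myCollection", '{"id": "42"}', commit=True)
#   # 0 = deleted, 100 = malformed JSON, 105 = index could not be opened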
def lucene_retrieval(q_string, use_BM25=False):
    """
    :param q_string:
    :param use_BM25:
    :return: text of the top-ranked document
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def doc_text(hists):
        """
        return the "text" field of the last hit
        :param hists:
        """
        text = '_NONE_'
        for h in hists:
            docID = h.doc
            doc = searcher.doc(docID)
            text = doc.get("text")
        return text

    result = '_NONE_'
    # escape special characters via the escape function; when answers are
    # pre-processed, `none of the above` -> '' would cause an error here
    if q_string and q_string.strip():
        query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
        # search
        reader = IndexReader.open(index)
        searcher = IndexSearcher(reader)
        if use_BM25:
            searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters
        collector = TopScoreDocCollector.create(hitsPerPage, True)
        searcher.search(query, collector)
        hs = collector.topDocs().scoreDocs  # hits
        result = doc_text(hs)
        # reader.close()
    return result
def number(collection_name):
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    try:
        ireader = IndexReader.open(direc)
    except Exception:
        return 105
    numdocs = int(ireader.numDocs())
    ireader.close()
    return numdocs
def get_sorted_results(self, query):
    SHOULD = BooleanClause.Occur.SHOULD
    parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, query,
                                               ['docno', 'content'],
                                               [SHOULD, SHOULD], self.analyzer)
    reader = IndexReader.open(self.directory)
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity())
    topDocs = searcher.search(parsed_query, 10)
    for j, i in enumerate(topDocs.scoreDocs):
        d = searcher.doc(i.doc)
        print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(i.score)
def getDocumentPMC_ID(pmcid, imageAndTitle=0):
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    query = QueryParser(Version.LUCENE_4_10_1, "pmcid", analyzer).parse(pmcid)
    MAX = 1000
    hits = searcher.search(query, MAX)

    title = ""
    abstract = ""
    fullText = "http://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
    doi = ""  # needs to be split
    volume = ""
    year = ""
    publisher = ""
    for hit in hits.scoreDocs:  # should only be one
        doc = searcher.doc(hit.doc)
        if imageAndTitle == 1:
            paths = [doc.get("articlepath")]
            image = get_image(paths)
        abstract = doc.get("abstract")
        doi = doc.get("doi")
        title = doc.get("title")
        volume = doc.get("volume")
        year = doc.get("year")
        publisher = doc.get("publisher")

    # the second part of the DOI names the PDF, e.g.
    # http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3363814/pdf/cc11003.pdf
    if doi is not None:
        doiSecond = doi.split('/')[1]
    else:
        doiSecond = ""
    pdf = "http://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/pdf/" + doiSecond + ".pdf"

    if imageAndTitle == 1:
        return title, image, pmcid  # image may sometimes be missing
    return abstract, doi, title, volume, year, publisher, fullText, pdf, pmcid
def individual_test():
    user_query = "microsoft forms"
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File("./articleTitleIndex_withTitle/")))
    searcher = IndexSearcher(reader)
    res = predict(user_query, analyzer, reader, searcher)
    print "goal_categories:"
    print res
    converted_res = []
    for label in res:
        converted_res.append(cvt.WikiToKDD[label[0].replace('_', ' ')])
    if not res:
        print "empty goal category set"
    print "converted goal_categories:"
    print converted_res
def xmlrpc_unindexDocument(self, instance, id):
    """Unindex a document."""
    filter = BooleanFilter()
    filter.add(FilterClause(RangeFilter('id', id, id, 1, 1),
                            BooleanClause.Occur.MUST))
    filter.add(FilterClause(RangeFilter('instance', instance, instance, 1, 1),
                            BooleanClause.Occur.MUST))
    reader = IndexReader.open(self.indexPath)
    bits = filter.bits(reader)
    # walk the bit set and delete every matching document
    docId = bits.nextSetBit(0)
    while docId >= 0:
        reader.deleteDocument(docId)
        docId = bits.nextSetBit(docId + 1)
    reader.close()
def searchLucene(requestParameter):
    "this method is used to search Lucene"
    searchResults = []
    requestParameter = requestParameter.replace("/", " ")

    # 1. open the index
    if __name__ == "luceneSearch":
        lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    index = SimpleFSDirectory(File("Home/WishMatcherIndex"))
    reader = IndexReader.open(index)
    n_docs = reader.numDocs()
    print("Index contains %d documents." % n_docs)

    # 2. parse the query
    fields = ["AdLine", "FieldString", "FieldRelatedWords"]
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
    parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
    query = MultiFieldQueryParser.parse(parser, requestParameter)
    print(query)

    # 3. search the index for the query; we retrieve and sort all documents
    # that match. In a real application, use a TopScoreDocCollector.
    searcher = IndexSearcher(reader)
    hits = searcher.search(query, n_docs).scoreDocs

    # 4. display results
    print("Found %d hits:" % len(hits))
    for i, hit in enumerate(hits):
        doc = searcher.doc(hit.doc)
        product = doc.get("AdLine")
        url = doc.get("URL")
        # note: doc.get returns a string, so comparing it to the int 1200
        # is always True unless coerced; kept as in the original
        if doc.get("AdId") != 1200:
            product = product[:-1]
            url = url[:-1]
        print("%d. %s" % (i + 1, doc.get("AdLine")))
        r = result(str(product), str(url))
        searchResults.append(r)

    print(searchResults)
    return searchResults
def get_wiki_nums(data_file, wikipedia_index):
    lucene.initVM()
    reader = IndexReader.open(SimpleFSDirectory(File(wikipedia_index)))
    searcher = IndexSearcher(reader)
    id_file = open(data_file + '.docid')
    num_file = open(data_file + '.nums', 'w')
    what = []
    for line in id_file:
        line = line.strip()
        if len(line) == 0:
            continue
        line = line.split('\t')
        if len(line) == 2 and int(line[1]) not in [-1, 0, 1, 2, 3]:
            what.append(int(line[1]))
    what = list(set(what))
    for item in what:
        num_file.write(str(item) + '\t' +
                       searcher.doc(item).get("num").encode('utf-8') + '\n')
def load_index(self):
    indexDir = File(self.index_path)
    a = {
        "code": self.porter_analyzer,
        "description": self.porter_analyzer,
        "typed_method_call": KeywordAnalyzer(),
        "extends": KeywordAnalyzer(),
        "used_classes": KeywordAnalyzer(),
        "methods": KeywordAnalyzer(),
        "class_instance_creation": KeywordAnalyzer(),
        "id": KeywordAnalyzer()
    }
    self.analyzer = PerFieldAnalyzerWrapper(self.porter_analyzer, a)
    index = SimpleFSDirectory(indexDir)
    self.reader = IndexReader.open(index)
    n_docs = self.reader.numDocs()
    self.searcher = IndexSearcher(self.reader)
    print("\nLoading Indices... GitHub index contains [%d] documents." % n_docs)
def load_index(self):
    indexDir = File(self.index_path)
    porter_analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
    a = {
        "typed_method_call": KeywordAnalyzer(),
        "extends": KeywordAnalyzer(),
        "used_classes": KeywordAnalyzer(),
        "methods": KeywordAnalyzer(),
        "class_instance_creation": KeywordAnalyzer(),
        "id": KeywordAnalyzer(),
        "code": JavaCodeAnalyzer()
    }
    self.analyzer = PerFieldAnalyzerWrapper(porter_analyzer, a)
    index = SimpleFSDirectory(indexDir)
    # Note: the IndexReader was previously opened but never closed.
    self.reader = IndexReader.open(index)
    n_docs = self.reader.numDocs()
    print("Index contains %d documents." % n_docs)
def getRandomDoc():
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    MAX = 1000
    docNum = randrange(0, reader.maxDoc())
    doc = reader.document(docNum)
    fileName = doc.get("filename")
    filePath = doc.get("filepath")
    result = filePath + "/" + fileName
    result = result.replace("/home/kevin/Downloads/", "/")
    return (result, docNum)
def getSimilarityGenerator(field, minTermFreq, minDocFreq, minWordLen):
    # maxQueryTerms as parameter
    maxQueryTerms = 30
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    simil = MoreLikeThis(reader)
    simil.setFieldNames(field)
    simil.setMinTermFreq(minTermFreq)
    simil.setMinDocFreq(minDocFreq)
    simil.setMinWordLen(minWordLen)
    # use the same maxQueryTerms to prevent longer queries
    simil.setMaxQueryTerms(maxQueryTerms)
    simil.setBoost(True)  # boost terms within queries by tf-idf score
    return simil
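# Hedged usage sketch for getSimilarityGenerator: MoreLikeThis.like(docNum)
# builds a query from an already-indexed document. The field name and doc ID
# below are assumptions for illustration:
#
#   mlt = getSimilarityGenerator("text", 2, 5, 4)
#   query = mlt.like(42)          # 42 = Lucene doc ID of the seed article
#   hits = searcher.search(query, 10)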
def evaluate_index(data_dir, store_dir, analyzer):
    """
    Evaluates vocabulary size and indexing speed for different analyzer
    configurations.
    """
    start = time.clock()
    Indexer(data_dir, store_dir, analyzer)
    end = time.clock()
    duration = end - start

    directory = SimpleFSDirectory(File(store_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()
    # sometimes .size() doesn't return the correct size; in this case
    # we have to count manually
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)
    reader.close()
    return duration, vocab_size
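# Hedged usage sketch (the Indexer class comes from the snippet above; the
# paths and analyzer choice here are assumptions):
#
#   analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
#   duration, vocab_size = evaluate_index('./data', './eval_index', analyzer)
#   print("indexed in %.2fs, %d unique 'title' terms" % (duration, vocab_size))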
def getDocumentClass(reqClass):
    import random
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    # pick a random class and search for documents in it
    query = QueryParser(Version.LUCENE_4_10_1, "class",
                        analyzer).parse(str(random.choice(reqClass)))
    MAX = 1000
    hits = searcher.search(query, MAX)
    docs = []
    for hit in hits.scoreDocs:
        docs.append((searcher.doc(hit.doc), hit.doc))
    if not docs:
        return -1
    return random.choice(docs)  # return document and ID
def __init__(self, indexDir, computeLengthNorm=True):
    # Initialize: indexDir is the index file directory; computeLengthNorm
    # controls whether the custom similarity is applied (True: no, False: yes)
    lucene.getVMEnv().attachCurrentThread()
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)  # standard analyzer; for English it splits on delimiters
    self.path = os.path.join(INDEX_PATH, indexDir)  # storage path
    self.store = SimpleFSDirectory(File(self.path))
    self.reader = IndexReader.open(self.store)
    self.numDocs = self.reader.maxDoc()
    self.searcher = IndexSearcher(self.reader)
    sim = CustomSimilarity()  # added by zmq
    if not computeLengthNorm:
        sim = CustomSimilarity()
        self.searcher.setSimilarity(sim)
    self.mlt = MoreLikeThis(self.reader, sim)
    self.mlt.setAnalyzer(self.analyzer)
    self.mlt.setMinTermFreq(1)
    self.mlt.setMinDocFreq(1)  # debug
    self.mlt.setMinWordLen(1)
    self.mlt.setMaxNumTokensParsed(100000000)
    BooleanQuery.setMaxClauseCount(1024 * 1024)  # raise the clause limit to avoid the "too many clauses" error on long queries
lucene.initVM()

# ANALYZER
analyzer = StandardAnalyzer(util.Version.LUCENE_CURRENT)

# DIRECTORY
# don't forget to remove the Lucene index directory every time you run this code
directory = SimpleFSDirectory(File(luceneIndexPath))

# INDEX WRITER code removed

# INDEX READER
reader = IndexReader.open(directory)
searcher = IndexSearcher(reader)

# QUERYING FOR A QUESTION
queryParser = QueryParser(util.Version.LUCENE_CURRENT, "text", analyzer)

# ----- creating vectors (inferring) for testing -----
# inference hyper-parameters
tic = time.time()
start_alpha = 0.01
infer_epoch = 1000
testingFilePath = '/home/tarun/PE/testFiles/testNum18000.csv'
def predict_test(indexed_data, index_destination, source='directory', already_indexed=False):
    """
    :param indexed_data:
    :param index_destination:
    :return:
    """
    def choose_best():
        # pick the docs-per-question count with the highest accuracy
        scores = []
        for k, v in sorted(res.items(), key=lambda x: x[0]):
            scores.append((k, 1. * sum(data_test['correctAnswer'] == v) / len(v)))
        return sorted(scores, key=lambda x: -x[-1])[0][0]

    def calculate_score(res):
        """
        :param res:
        :return: fraction of correctly answered questions
        """
        correct = 0
        total = 0
        for index, row in data_test.iterrows():
            if res[index] == row['correctAnswer']:
                correct += 1
            total += 1
        return float(correct) / total

    if not already_indexed:
        make_index(indexed_data, index_destination, source)
    res = {}
    MAX = 100
    docs_per_q = range(1, 20)
    records = []
    analyzer = SnowballAnalyzer(Version.LUCENE_30, "English", StandardAnalyzer.STOP_WORDS_SET)
    reader = IndexReader.open(SimpleFSDirectory(File(index_destination)))
    searcher = IndexSearcher(reader)
    for index, row in data_test.iterrows():
        queries = [row['answerA'], row['answerB'], row['answerC'], row['answerD']]
        queries = [row['question'] + ' ' + q for q in queries]
        scores = {}
        for q in queries:
            query = QueryParser(Version.LUCENE_30, "content", analyzer).parse(
                re.sub("[^a-zA-Z0-9]", " ", q))
            hits = searcher.search(query, MAX)
            doc_importance = [hit.score for hit in hits.scoreDocs]
            # score each answer by the summed importance of its top-n documents
            for n in docs_per_q:
                scores.setdefault(n, [])
                scores[n].append(sum(doc_importance[:n]))
        to_records = [index + 102501]
        to_records.append(['A', 'B', 'C', 'D'][np.argmax(scores[4])])
        records.append(to_records)
        for n in docs_per_q:
            res.setdefault(n, [])
            res[n].append(['A', 'B', 'C', 'D'][np.argmax(scores[n])])
    df = pandas.DataFrame.from_records(records, columns=["id", "correctAnswer"])
    df = df.set_index("id")
    df.to_csv("ololo.csv")
    best = choose_best()
    print best
    score = calculate_score(res[best])
    print score
def __init__(self, lucene_index_dir='/data/zjy/csqa_data/lucene_dir/'):
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    directory = SimpleFSDirectory(File(lucene_index_dir))
    self.searcher = IndexSearcher(DirectoryReader.open(directory))
    self.num_docs_to_return = 5
    self.ireader = IndexReader.open(directory)
def open(self):
    debug('Opening index "%s"' % self.indexPath)
    self.reader = IndexReader.open(self.indexDir)
    self.searcher = IndexSearcher(self.reader)
    self.totalDocs = self.getTotalSentenceCount()