def irsolver(data_file, index):
    from questions import get_input_data
    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(index)))
    searcher = IndexSearcher(reader)
    pred = []
    mapp = {1: 'A', 2: 'B', 3: 'C', 4: 'D'}

    idx, ques, ans = get_input_data(data_file)
    for acm, (idq, q, a) in enumerate(zip(idx, ques, ans)):
        max_score = -1000000
        best_ans = 'A'
        for i, ai in enumerate(a):
            sc = query(q, ai, analyzer, searcher)
            print(acm, i, sc)
            if sc > max_score:
                max_score = sc
                best_ans = mapp[i + 1]
        pred.append(best_ans)

    return idx, pred
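The snippet above relies on a `query` helper that is not shown. A minimal, hypothetical version in the style of the other snippets here (the "text" field name and the question+answer scoring scheme are assumptions, not the original helper):

def query(question, answer, analyzer, searcher):
    # Hypothetical helper: score the concatenated question+answer string
    # against the index and return the top hit's score (0.0 if nothing matches).
    q = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(
        QueryParser.escape(question + " " + answer))
    hits = searcher.search(q, 1).scoreDocs
    return hits[0].score if hits else 0.0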
def get_candidates(qatp):
    if prm.create_index:
        create_index()

    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    n = 0
    for q, a, t, p in qatp:
        if n % 100 == 0:
            print 'finding candidates sample', n
        n += 1

        q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        c = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            c.append(doc.get("id"))

        candidates.append(c)

    return candidates
def evaluate_index(index_dir, context, analyzer):
    # eval time of indexing (overall)
    # we should also measure the elapsed time of
    # each index_document call separately
    start = time.clock()
    Indexer(index_dir, context, analyzer)
    end = time.clock()
    duration = end - start

    directory = SimpleFSDirectory(File(index_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()
    if vocab_size == -1:  # .size() may return -1; count the terms manually
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)

    # print str(vocab_size)                        # size of vocabulary
    # print str(vocabulary.getDocCount())          # #docs that have at least one term for title field
    # print str(vocabulary.getSumTotalTermFreq())  # #tokens
    # print str(vocabulary.getSumDocFreq())        # #postings

    reader.close()
    return duration, vocab_size
def getRandomDoc2():
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    # lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    # query = QueryParser(Version.LUCENE_4_10_1, "keywords", analyzer).parse(queryString)  # "Shigella sonnei"
    MAX = 1000
    docNum = randrange(0, reader.maxDoc())
    doc = reader.document(docNum)
    # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    files = []
    fileRoots = []
    paths = []
    paths.append(doc.get("articlepath"))
    pth = paths[0].replace("/home/kevin/Downloads/",
                           "/home/kevin/git/YIF/imageFinder/web/static/web/")  # os.path.join(tools.__path__, "static/web/images")
    for root, directories, filenames in os.walk(pth):  # probably something wrong with the location
        for filename in filenames:
            # note: the original test `(".jpg" or ".gif" or ".png") in filename`
            # only ever checked ".jpg"; check each extension explicitly instead
            if any(ext in filename for ext in (".jpg", ".gif", ".png")):
                files.append(root.replace("/home/kevin/git/YIF/imageFinder/web/static/web/", "") + "/" + filename)  # temp, will need to change
                fileRoots.append(root)
                print (root.replace("/home/kevin/git/YIF/imageFinder/web/static/web/", "") + "/" + filename)
    try:
        rng = randrange(0, len(files))
    except:
        return -1
    else:
        return files[randrange(0, len(files))]
def get_image_pmcid(pmcid, classes = ""): fields = ["pmcid", "class"] docs = [] location = web.__path__[0] + "/static/web/files/index/index.figures" #lucene.initVM() vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() analyzer = StandardAnalyzer(Version.LUCENE_4_10_1) reader = IndexReader.open(SimpleFSDirectory(File(location))) searcher = IndexSearcher(reader) # multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser #query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer) # query.setDefaultOperator(QueryParserBase.AND_OPERATOR) #query = query.parse(query, ('4175339','1')) # query.parse(queryString)#"Shigella sonnei" # query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)#"Shigella sonnei" MAX = 10000 #hits = searcher.search(query, MAX) if classes == "all": queryStr = "pmcid:(" + ' '.join(pmcid) +")" else: queryStr = "pmcid:(" + ' '.join(pmcid) +")" + " AND class:" + classes query = QueryParser(Version.LUCENE_4_10_1, "pmcid",analyzer)#needed to build a custom query q = query.parse(queryStr) hits = searcher.search(q, MAX) for hit in hits.scoreDocs:#should only be one #print hit.score, hit.doc, hit.toString() docs.append(searcher.doc(hit.doc)) return docs #This will return the image documents that belong to a pmcid(article)
def retrieve(indexdir, queries):
    lucene.initVM()
    f = open("results_lucene.txt", "w")
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(indexdir)))
    searcher = IndexSearcher(reader)

    fields = ["title", "abstract", "authors"]

    st = PorterStemmer()
    for id, q in queries.iteritems():
        query = q
        tokenizer = RegexpTokenizer(r'\w+')
        qwords = tokenizer.tokenize(query)
        qwords_k = [st.stem(q) for q in qwords]
        query = " ".join(qwords_k)
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query = MultiFieldQueryParser.parse(parser, query)
        MAX = 1000
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        for i, hit in enumerate(hits.scoreDocs):
            f.write("%s Q0 %s %s %s G17R3\n" % (id, hit.doc + 1, i + 1, hit.score))
            # print hit.doc + 1, hit.score
            # doc = searcher.doc(hit.doc)
            # print doc.get("authors").encode("utf-8")

    f.close()
def __init__(self, path):
    print "Loading data.json..."
    with open(path, "r") as f:
        self.data = json.load(f)
    lucene.initVM()
    self.analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    self.reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    self.searcher = IndexSearcher(self.reader)
def __init__(self, lucene_index_dir='lucene_index/', num_docs_to_return=100):
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    directory = SimpleFSDirectory(File(lucene_index_dir))
    self.searcher = IndexSearcher(DirectoryReader.open(directory))
    self.num_docs_to_return = num_docs_to_return
    self.ireader = IndexReader.open(directory)
def __init__(self, lucene_dir_path):
    if lucene_dir_path is not None and lucene_dir_path != '':
        lucene.initVM()
        directory = SimpleFSDirectory(File(lucene_dir_path))
        self.indexReader = IndexReader.open(directory)
        self.is_init = True
    else:
        self.is_init = False
def SearchQuery(queryString, fields, classification):
    # if __name__ == "__main__":
    # if __name__ == "retriever":
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    # lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)

    # multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    # query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(query, queryString)
    # query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)  # "Shigella sonnei"
    MAX = 10000
    hits = searcher.search(query, MAX)
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)

    paths = []
    pmcids = []
    documentDict = {}
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        pmcids.append(doc.get("pmcid"))
        docDict = {"title": doc.get("title")}  # we can add any other field we want...
        documentDict[doc.get("pmcid")] = docDict

    # Where we get the images for all the pmcids
    images = get_image_pmcid(pmcids, classification)  # takes in pmcids and class

    # create dictionary of images with pmcid being their key
    imagesDict = {}
    for img in images:
        img_pmcid = img.get("pmcid")
        if img_pmcid in imagesDict:
            imagesDict[img_pmcid].append(img.get("filepath") + "/" + img.get("figureid"))
        else:
            imagesDict[img_pmcid] = [(img.get("filepath") + "/" + img.get("figureid"))]

    # for each pmcid, assign an image to it for the search results;
    # membership must be checked per pmcid (the original `if imagesDict:`
    # raised a KeyError for articles without any images)
    for pmcid in pmcids:
        docDict = documentDict[pmcid]
        if pmcid in imagesDict:
            docDict["imgURL"] = imagesDict[pmcid][0]
        else:
            docDict["imgURL"] = "images/NoImageAvailable.jpg"
        documentDict[pmcid] = docDict
    # END - Where we get the images for all the pmcids

    return documentDict
def __init__(
        self,
        lucene_index_dir='/dccstor/cssblr/amrita/dialog_qa/code/prepro_lucene/lucene_index/'
):
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    directory = SimpleFSDirectory(File(lucene_index_dir))
    self.searcher = IndexSearcher(DirectoryReader.open(directory))
    self.num_docs_to_return = 5
    self.ireader = IndexReader.open(directory)
def load_index(self):
    indexDir = File(self.index_path)
    a = {"code": self.porter_analyzer}
    self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
    index = SimpleFSDirectory(indexDir)
    self.reader = IndexReader.open(index)
    n_docs = self.reader.numDocs()
    self.searcher = IndexSearcher(self.reader)
    print("Index contains %d documents." % n_docs)
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    Multifield search: a different query string for each field, rather than
    matching the same words against every field.
    :param q_string:
    :param q_class:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type)  # feature_type is a list of functions

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur: MUST means the keyword must occur,
    # SHOULD means the keyword may occur
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)
    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters
    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hits

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
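The two-field BooleanQuery pattern above is worth isolating. A minimal sketch, assuming the same PyLucene 4.x setup (module-level `version` and `analyzer`) as in the snippet:

from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import BooleanQuery, BooleanClause

def build_two_field_query(text, corpus_name):
    # Parse each string against its own field, then combine them with
    # SHOULD clauses: a document matching either field can be a hit.
    q1 = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(text))
    q2 = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(corpus_name))
    combined = BooleanQuery()
    combined.add(q1, BooleanClause.Occur.SHOULD)
    combined.add(q2, BooleanClause.Occur.SHOULD)
    return combined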
def search_results(search):
    results = []
    search_string = search.data['search']
    print "searching: " + search_string
    lucene.initVM()
    analyzer = SpanishAnalyzer()
    reader = IndexReader.open(SimpleFSDirectory(File(indexDirectory)))
    searcher = IndexSearcher(reader)

    query = QueryParser("text", analyzer).parse(search_string)
    MAX = 1000
    hits = searcher.search(query, MAX)

    if not hits.totalHits:
        flash('No results found!')
        return redirect('/')
    else:
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        flash("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))
        # render_template('index.html', form=search)
        items = []
        for hit in hits.scoreDocs:
            # print hit.score, hit.doc, hit.toString()
            if len(items) > 10:
                flash('Returning only first 10 results')
                break
            doc = searcher.doc(hit.doc)
            items.append(Row(
                hit.score,
                doc.get("tipo_sesion"),
                doc.get("organo"),
                doc.get("presidente"),
                doc.get("dia") + "/" + doc.get("mes") + "/" + doc.get("anio"),
                doc.get("tipo_epigrafe"),
                doc.get("text").replace(
                    search_string,
                    '<span class="highlightme">' + search_string + '</span>'),
                '<a href="' + doc.get("filename") + '">' + doc.get("filename") + '</a>'))

        # display results
        return render_template('index.html', form=search, items=items,
                               search_string=search_string)
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        # feature_type is a list of functions
        return map(lambda f: f(doc_score_list), feature_type) if len(doc_score_list) != 0 else [0] * len(feature_type)

    # escape special characters via escape function
    query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)
    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters
    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hits

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
def xmlrpc_getStatistics(self, instance):
    reader = IndexReader.open(self.indexPath)
    filter = RangeFilter('instance', instance, instance, 1, 1)
    num = filter.bits(reader).cardinality()

    stat = Vector()
    stat.add(num)
    stat.add(0)  # len(index.terms())
    reader.close()
    return stat
def __init__(self):
    # self.segmentor.load('./cws.model')
    INDEXDIR = './Myindex'
    # lucene.initVM(vmargs='-Xcheck:jni,-verbose:jni,-verbose:gc')
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    # vm_env = lucene.getVMEnv()
    # vm_env.attachCurrentThread()
    # print 'lucene', lucene.VERSION
    self.directory = SimpleFSDirectory(File(INDEXDIR))
    self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
    self.analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    self.reader = IndexReader.open(self.directory)
def get_wiki_docids(data_file, wikipedia_index):
    from questions import get_input_data
    data = get_input_data(data_file)

    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(wikipedia_index)))
    searcher = IndexSearcher(reader)
    generate_docids(data, data_file, analyzer, searcher)
def group_tests():
    TP = 0.0
    FN = 0.0
    n = 0.0
    precision = 0
    recall = 0

    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File("./articleTitleIndex/")))
    searcher = IndexSearcher(reader)

    with open('Labeled800Queries/labeler3.txt', 'r') as f:
        for line in f:
            n += 1
            line = line.split('\t')
            user_query = line[0]
            labels = line[1:]
            user_query = re.sub('[^0-9a-zA-Z]+', ' ', user_query)
            print user_query
            print labels

            res = predict(user_query, analyzer, reader, searcher, test="group")
            converted_res = []
            for label in res:
                # print label[0]
                converted_res.append(cvt.WikiToKDD[label[0].replace('_', ' ')])
            if not res:
                print "empty goal category set"
            print converted_res

            # compare labels and converted_res
            for label in labels:
                label = label.replace('\r', '')
                label = label.replace('\n', '')
                if label not in cvt.WikiToKDD.values():
                    continue
                # print label
                if label in converted_res:
                    TP += 1.0
                else:
                    FN += 1.0
            print "=========================================================="

    precision = TP / (SIZE * n)
    recall = TP / (TP + FN)
    print "precision:", precision
    print "recall:", recall
def make_request(query, analyzer, index, qparser_regexp=None, max_results=100):
    """
    Returns a list of ScoreDoc objects, each exposing `doc` (the document ID
    in Lucene) and `score` attributes.
    :param query:
    :param analyzer:
    :param index:
    :param qparser_regexp:
    :param max_results:
    :return:
    """
    reader = IndexReader.open(SimpleFSDirectory(File(index)))
    searcher = IndexSearcher(reader)
    query = QueryParser(Version.LUCENE_30, "content", analyzer).parse(
        query if not qparser_regexp else re.sub(qparser_regexp, " ", query))
    hits = searcher.search(query, max_results)
    return hits.scoreDocs
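A possible invocation of `make_request` (the index path and query string are hypothetical; the analyzer version matches the Lucene 3.0 parser used above):

analyzer = StandardAnalyzer(Version.LUCENE_30)
for sd in make_request("information retrieval", analyzer, "/tmp/my_index", max_results=10):
    print sd.doc, sd.score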
def find(self, query):
    transformer = StringTransformer()
    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity())

    processed_query = ' '.join(
        self._preprocessor(transformer.transform(query)))
    query = QueryParser(Version.LUCENE_CURRENT, "content",
                        analyzer).parse(processed_query)
    # IndexSearcher has no `get_description`; the query is run with `search`
    hits = searcher.search(query, 10)

    result_list = []
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        result_list.append(doc.get("path").encode("utf-8"))

    return result_list
def delete(primary_keys_map, collection_name, todelete, commit=False):
    INDEX_DIR_DEFAULT = "IndexFiles.index"
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT

    try:
        tofind_keyvalue_pairs = json.loads(todelete)
    except:
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # setting writer configurations
    try:
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
        ireader = IndexReader.open(direc)
    except:
        return 105

    # as of now, deletion of documents is supported only on indexed keys
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    # separating out primary and non-primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    # filtering documents according to primary keys
    query = BooleanQuery()
    for key in tofind_primary_keyvalue_pairs.keys():
        temp = QueryParser(Version.LUCENE_CURRENT, key,
                           analyzer).parse(tofind_primary_keyvalue_pairs[key])
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

    a = writer.deleteDocuments(query)
    if commit == True:
        writer.commit()
    writer.close()
    return 0
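A hypothetical call to `delete` (the key names and JSON payload are assumptions); the integer return codes come from the function itself:

status = delete(["id"], "DEFAULT", '{"id": "42"}', commit=True)
print status  # 0 on success, 100 for malformed JSON, 105 if the index cannot be opened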
def number(collection_name):
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    try:
        ireader = IndexReader.open(direc)
    except:
        return 105

    numdocs = int(ireader.numDocs())
    ireader.close()
    return numdocs
def lucene_retrieval(q_string, use_BM25=False):
    """
    :param q_string:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def doc_text(hists):
        """
        return doc_name & score
        :param hists:
        """
        text = '_NONE_'
        for h in hists:
            docID = h.doc
            doc = searcher.doc(docID)
            # file_name = doc.get("corpus_name")
            # doc_name = doc.get("doc_name")
            text = doc.get("text")
            # score = h.score
            # yield (file_name, doc_name, score, text)
        return text

    result = '_NONE_'
    # escape special characters via escape function;
    # when answers are pre-processed, `none of the above` -> '' would cause an error here
    if q_string and q_string.strip():
        # print(q_string)
        query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

        # search
        reader = IndexReader.open(index)
        searcher = IndexSearcher(reader)
        if use_BM25:
            searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters
        collector = TopScoreDocCollector.create(hitsPerPage, True)
        searcher.search(query, collector)
        hs = collector.topDocs().scoreDocs  # hits

        result = doc_text(hs)
        # reader.close()
    return result  # text; also nodes
def get_sorted_results(self, query):
    SHOULD = BooleanClause.Occur.SHOULD
    parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, query,
                                               ['docno', 'content'],
                                               [SHOULD, SHOULD], self.analyzer)

    reader = IndexReader.open(self.directory)
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity())

    topDocs = searcher.search(parsed_query, 10)
    j = 0
    for i in topDocs.scoreDocs:
        d = searcher.doc(i.doc)
        print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(i.score)
        j += 1
def getDocumentPMC_ID(pmcid, imageAndTitle=0):
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    # lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)

    # multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    query = QueryParser(Version.LUCENE_4_10_1, "pmcid", analyzer).parse(pmcid)  # "Shigella sonnei"
    MAX = 1000
    hits = searcher.search(query, MAX)

    title = ""
    abstract = ""
    fullText = "http://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
    doi = ""  # needs to be split
    volume = ""
    year = ""
    publisher = ""
    for hit in hits.scoreDocs:  # should only be one
        # print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        if imageAndTitle == 1:
            paths = []
            paths.append(doc.get("articlepath"))
            image = get_image(paths)
        abstract = doc.get("abstract")
        doi = doc.get("doi")
        title = doc.get("title")
        volume = doc.get("volume")
        year = doc.get("year")
        publisher = doc.get("publisher")

    if doi is not None:
        doiSecond = doi.split('/')
        doiSecond = doiSecond[1]  # second part
    else:
        doiSecond = ""

    # http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3363814/pdf/cc11003.pdf
    pdf = "http://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/pdf/" + doiSecond + ".pdf"

    if imageAndTitle == 1:
        return title, image, pmcid  # image may sometimes show up
    else:
        return abstract, doi, title, volume, year, publisher, fullText, pdf, pmcid  # image may sometimes show up
def individual_test():
    user_query = "microsoft forms"

    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File("./articleTitleIndex_withTitle/")))
    searcher = IndexSearcher(reader)

    res = predict(user_query, analyzer, reader, searcher)
    print "goal_categories:"
    print res

    converted_res = []
    for label in res:
        # print label[0]
        converted_res.append(cvt.WikiToKDD[label[0].replace('_', ' ')])
    if not res:
        print "empty goal category set"
    print "converted goal_categories:"
    print converted_res
def xmlrpc_unindexDocument(self, instance, id):
    """ Unindex document """
    filter = BooleanFilter()
    filter.add(FilterClause(RangeFilter('id', id, id, 1, 1),
                            BooleanClause.Occur.MUST))
    filter.add(FilterClause(RangeFilter('instance', instance, instance, 1, 1),
                            BooleanClause.Occur.MUST))

    reader = IndexReader.open(self.indexPath)
    bits = filter.bits(reader)
    docId = bits.nextSetBit(0)
    while docId >= 0:
        reader.deleteDocument(docId)
        docId = bits.nextSetBit(docId + 1)
    reader.close()
def searchLucene(requestParameter):
    "this method is used to search Lucene"
    searchResults = []
    requestParameter = requestParameter.replace("/", " ")

    # 1. open the index
    if __name__ == "luceneSearch":
        lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    index = SimpleFSDirectory(File("Home/WishMatcherIndex"))
    reader = IndexReader.open(index)
    n_docs = reader.numDocs()
    print("Index contains %d documents." % n_docs)

    # 2. parse the query from the command line
    fields = ["AdLine", "FieldString", "FieldRelatedWords"]
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
    parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
    query = MultiFieldQueryParser.parse(parser, requestParameter)
    print(query)

    # 3. search the index for the query
    # We retrieve and sort all documents that match the query.
    # In a real application, use a TopScoreDocCollector to sort the hits.
    searcher = IndexSearcher(reader)
    hits = searcher.search(query, n_docs).scoreDocs

    # 4. display results
    print("Found %d hits:" % len(hits))
    for i, hit in enumerate(hits):
        doc = searcher.doc(hit.doc)
        product = doc.get("AdLine")
        url = doc.get("URL")
        if doc.get("AdId") != 1200:
            product = product[:-1]
            url = url[:-1]
        print("%d. %s" % (i + 1, doc.get("AdLine")))
        r = result(str(product), str(url))
        searchResults.append(r)

    # 5. close resources
    # searcher.close()
    print(searchResults)
    return searchResults
def get_wiki_nums(data_file, wikipedia_index):
    lucene.initVM()
    reader = IndexReader.open(SimpleFSDirectory(File(wikipedia_index)))
    searcher = IndexSearcher(reader)

    id_file = open(data_file + '.docid')
    num_file = open(data_file + '.nums', 'w')
    what = []
    for line in id_file:
        line = line.strip()
        if len(line) == 0:
            continue
        line = line.split('\t')
        if len(line) == 2 and int(line[1]) not in [-1, 0, 1, 2, 3]:
            what.append(int(line[1]))

    what = list(set(what))
    for item in what:
        num_file.write(str(item) + '\t' + searcher.doc(item).get("num").encode('utf-8') + '\n')
def load_index(self):
    indexDir = File(self.index_path)
    porter_analyzer = PorterAnalyzer(
        StandardAnalyzer(Version.LUCENE_CURRENT))
    a = {
        "typed_method_call": KeywordAnalyzer(),
        "extends": KeywordAnalyzer(),
        "used_classes": KeywordAnalyzer(),
        "methods": KeywordAnalyzer(),
        "class_instance_creation": KeywordAnalyzer(),
        "id": KeywordAnalyzer(),
        "code": JavaCodeAnalyzer()
    }
    self.analyzer = PerFieldAnalyzerWrapper(porter_analyzer, a)
    index = SimpleFSDirectory(indexDir)
    self.reader = IndexReader.open(index)  # note: the IndexReader was being opened but never closed
    n_docs = self.reader.numDocs()
    print("Index contains %d documents." % n_docs)
def load_index(self):
    indexDir = File(self.index_path)
    a = {
        "code": self.porter_analyzer,
        "description": self.porter_analyzer,
        "typed_method_call": KeywordAnalyzer(),
        "extends": KeywordAnalyzer(),
        "used_classes": KeywordAnalyzer(),
        "methods": KeywordAnalyzer(),
        "class_instance_creation": KeywordAnalyzer(),
        "id": KeywordAnalyzer()
    }
    self.analyzer = PerFieldAnalyzerWrapper(self.porter_analyzer, a)
    index = SimpleFSDirectory(indexDir)
    self.reader = IndexReader.open(index)
    n_docs = self.reader.numDocs()
    self.searcher = IndexSearcher(self.reader)
    print("\nLoading Indices... GitHub index contains [%d] documents." % n_docs)
def getSimilarityGenerator(field, minTermFreq, minDocFreq, minWordLen):
    # maxQueryTerms as parameter
    maxQueryTerms = 30
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    # lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    reader = IndexReader.open(SimpleFSDirectory(File(location)))

    simil = MoreLikeThis(reader)
    simil.setFieldNames(field)
    simil.setMinTermFreq(minTermFreq)
    simil.setMinDocFreq(minDocFreq)
    simil.setMinWordLen(minWordLen)
    # use the same maxQueryTerms to prevent longer queries
    simil.setMaxQueryTerms(maxQueryTerms)
    simil.setBoost(True)  # boost terms within queries by tf-idf score

    return simil
def getRandomDoc():
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    # lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    # query = QueryParser(Version.LUCENE_4_10_1, "keywords", analyzer).parse(queryString)  # "Shigella sonnei"
    MAX = 1000
    docNum = randrange(0, reader.maxDoc())
    doc = reader.document(docNum)

    fileName = doc.get("filename")
    filePath = doc.get("filepath")
    result = filePath + "/" + fileName
    result = result.replace("/home/kevin/Downloads/", "/")
    return (result, docNum)
def evaluate_index(data_dir, store_dir, analyzer):
    """
    Evaluates vocabulary size and indexing speed for different
    analyzer configurations.
    """
    start = time.clock()
    Indexer(data_dir, store_dir, analyzer)
    end = time.clock()
    duration = end - start

    directory = SimpleFSDirectory(File(store_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()

    # sometimes .size() doesn't return the correct size, in this case
    # we have to count manually
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)

    reader.close()
    return duration, vocab_size
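A hypothetical benchmarking loop over `evaluate_index` (the directories and analyzer pairings are assumptions; WhitespaceAnalyzer appears in another snippet above):

for name, analyzer in [("standard", StandardAnalyzer(Version.LUCENE_CURRENT)),
                       ("whitespace", WhitespaceAnalyzer(Version.LUCENE_CURRENT))]:
    duration, vocab_size = evaluate_index("data/", "index_" + name, analyzer)
    print "%s: indexed in %.2fs, %d distinct title terms" % (name, duration, vocab_size)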
def getDocumentClass(reqClass):
    import random
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    # lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)

    # multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    # pick a random class and search for a document from it
    query = QueryParser(Version.LUCENE_4_10_1, "class", analyzer).parse(str(random.choice(reqClass)))
    MAX = 1000
    hits = searcher.search(query, MAX)

    docs = []
    for hit in hits.scoreDocs:
        # print hit.score, hit.doc, hit.toString()
        docs.append((searcher.doc(hit.doc), hit.doc))

    if not docs:
        return -1
    else:
        doc = random.choice(docs)
        return doc  # return document and ID
def __init__(self, indexDir, computeLengthNorm=True):
    # Initialization. indexDir: directory of the index files.
    # computeLengthNorm: whether to disable the custom similarity
    # (True: do not apply it, False: apply it).
    # if not jpype.isJVMStarted():
    #     lucene.initVM()
    lucene.getVMEnv().attachCurrentThread()

    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)  # standard analyzer; for English it tokenizes on delimiters
    self.path = os.path.join(INDEX_PATH, indexDir)  # storage path
    self.store = SimpleFSDirectory(File(self.path))  # storage directory
    # self.reader = DirectoryReader.open(self.store)
    self.reader = IndexReader.open(self.store)
    self.numDocs = self.reader.maxDoc()
    self.searcher = IndexSearcher(self.reader)

    sim = CustomSimilarity()  # added by zmq
    if not computeLengthNorm:
        sim = CustomSimilarity()
        self.searcher.setSimilarity(sim)

    self.mlt = MoreLikeThis(self.reader, sim)
    self.mlt.setAnalyzer(self.analyzer)
    self.mlt.setMinTermFreq(1)
    self.mlt.setMinDocFreq(1)
    # debug
    self.mlt.setMinWordLen(1)
    self.mlt.setMaxNumTokensParsed(100000000)
    BooleanQuery.setMaxClauseCount(1024 * 1024)  # raise the max clause count to work around the long-query clause limit
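A sketch of how the MoreLikeThis instance configured above might be queried. `like(docNum)` builds a query from the terms of an already-indexed document; the method name and field setup are inherited from the snippet, while the wrapper itself is an assumption:

def similar_to_doc(self, doc_num, n=10):
    # Build a "more like this" query from document doc_num and return the top n hits.
    query = self.mlt.like(doc_num)
    return self.searcher.search(query, n).scoreDocs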
import lucene

from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import IndexReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

if __name__ == "__main__":
    lucene.initVM()
    print "lucene version is:", Version

    # Get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Get index storage
    indexDir = SimpleFSDirectory(File("index/"))

    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_CURRENT, "country", analyzer).parse("India")
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print doc.get("country").encode("utf-8")
def __init__(self, lucene_index_dir='/data/zjy/csqa_data/lucene_dir/'):
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    directory = SimpleFSDirectory(File(lucene_index_dir))
    self.searcher = IndexSearcher(DirectoryReader.open(directory))
    self.num_docs_to_return = 5
    self.ireader = IndexReader.open(directory)
docs = coll.find(BasicDBObject({"question_id": question_id}))

apis = []
for doc in docs:
    answer = doc.toMap()
    apis.append(answer["typed_method_call"])

print apis

indexDir = File("/tmp/stackoverflow")

# 1. open the index
analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
index = SimpleFSDirectory(indexDir)
reader = IndexReader.open(index)
n_docs = reader.numDocs()
print("Index contains %d documents." % n_docs)

# 2. parse the query from the command line
a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer()}
wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)

query_string = "lucene get similar documents to the current one"
query_parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, ["title"], wrapper_analyzer)
# base_query = getSpanNearQuery(analyzer, query_string)
base_query = query_parser.parse(query_string)
import sys
import lucene

# assumed import paths for the classes this fragment uses
from java.io import File
from org.apache.lucene.analysis.es import SpanishAnalyzer
from org.apache.lucene.index import IndexReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

if __name__ == "__main__":
    if len(sys.argv) != 3:
        # argv[1] is used as the index directory below, so the usage string reflects that
        print sys.argv[0] + ' <indexDirectory> <searchString>'
        exit()

    indexDirectory = sys.argv[1]
    searchString = sys.argv[2]

    lucene.initVM()
    analyzer = SpanishAnalyzer()
    reader = IndexReader.open(SimpleFSDirectory(File(indexDirectory)))
    searcher = IndexSearcher(reader)

    query = QueryParser("text", analyzer).parse(searchString)
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        # print doc.get("text").encode("utf-8")
        print doc.get("filename")
def open(self):
    debug('Opening index "%s"' % self.indexPath)
    self.reader = IndexReader.open(self.indexDir)
    self.searcher = IndexSearcher(self.reader)
    self.totalDocs = self.getTotalSentenceCount()
def __init__(self, index_path):
    indexDir = File(index_path)
    index = SimpleFSDirectory(indexDir)
    self.reader = IndexReader.open(index)
    n_docs = self.reader.numDocs()
    self.searcher = IndexSearcher(self.reader)
def search(collection_name, tofind):
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT

    try:
        tofind_keyvalue_pairs = json.loads(tofind)
    except:
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    try:
        ireader = IndexReader.open(direc)
        searcher = IndexSearcher(ireader)
    except:
        return 105

    # initializing return list
    return_list = []
    # check_list = []
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    # separating out primary and non-primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map[collection_name]:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    # filtering documents
    if len(tofind_primary_keyvalue_pairs) > 0:
        query = BooleanQuery()
        for key in tofind_primary_keyvalue_pairs.keys():
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(
                tofind_primary_keyvalue_pairs[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

        hits = searcher.search(query, MAX_RESULTS).scoreDocs
        for hit in hits:
            doc = searcher.doc(hit.doc)
            if to_be_compressed_map[collection_name] == True:
                temp = doc.get("$DATA$")
                data = snappy.uncompress(base64.b64decode(temp))
            else:
                temp = doc.get("$DATA$")
                data = base64.b64decode(temp)

            # non-primary key filtering (without having to load all the
            # primary-key-filtered values into main memory!)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    return_list.append(data)
            else:
                return_list.append(data)
    else:
        for i in range(0, ireader.numDocs()):
            doc = searcher.doc(i)
            if to_be_compressed_map[collection_name] == True:
                temp = doc.get("$DATA$")
                data = snappy.uncompress(base64.b64decode(temp))
            else:
                temp = doc.get("$DATA$")
                data = base64.b64decode(temp)

            # non-primary key filtering (without having to load all the
            # primary-key-filtered values into main memory!)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    return_list.append(data)
            else:
                return_list.append(data)

    ireader.close()
    if len(return_list) == 0:
        return None
    else:
        return return_list
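A hypothetical round trip against `search` (the collection name and filter values are assumptions); per the function above, it returns None when nothing matches, otherwise a list of raw JSON strings:

results = search("DEFAULT", '{"id": "42", "status": "active"}')
if results is not None:
    for raw in results:
        print json.loads(raw)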
lucene.initVM()

# ANALYZER
analyzer = StandardAnalyzer(util.Version.LUCENE_CURRENT)

# DIRECTORY
directory = SimpleFSDirectory(File(luceneIndexPath))
# don't forget to remove the luceneIndexDirectory file every time you run this code

'''
# INDEX WRITER code removed
'''

# INDEX READER
reader = IndexReader.open(directory)
searcher = IndexSearcher(reader)

# QUERYING FOR A QUESTION
queryParser = QueryParser(util.Version.LUCENE_CURRENT, "text", analyzer)

###################### end

###################### creating vectors (inferring) for testing
# inference hyper-parameters
tic = time.time()
start_alpha = 0.01
infer_epoch = 1000
testingFilePath = '/home/tarun/PE/testFiles/testNum18000.csv'
#!/usr/bin/python
import sys, os
sys.path.append("../lib/lucene-core-3.6.2.jar")
sys.path.append("../lib/lucene-core-3.6.2-javadoc.jar")

import lucene  # the JVM still has to be started through PyLucene; the jar paths above alone are not enough
from java.io import File
from java.util import Scanner
from org.apache.lucene.index import IndexReader, Term
from org.apache.lucene.store import SimpleFSDirectory
import pdb

if __name__ == "__main__":
    lucene.initVM()
    r = IndexReader.open(SimpleFSDirectory(File('../index')))
    print "... total number of documents in the index is " + str(r.maxDoc())

    t = r.terms()
    i = 0
    count_add = 0
    while t.next():
        i = i + 1
        if i > 100010:
            break
        if i > 100000:
            print "[" + str(i) + "]" + t.term().text()

    te = Term("contents", "brute")
    print "... number of documents with the word brute is : " + str(r.docFreq(te))
    td = r.termDocs(te)
def predict_test(indexed_data, index_destination, source='directory', already_indexed=False):
    """
    :param indexed_data:
    :param index_destination:
    :return:
    """
    def choose_best():
        scores = []
        for k, v in sorted(res.items(), key=lambda x: x[0]):
            scores.append((k, 1. * sum(data_test['correctAnswer'] == v) / len(v)))
        return sorted(scores, key=lambda x: -x[-1])[0][0]

    def calculate_score(res):
        """
        :param res:
        :return:
        """
        correct = 0
        total = 0
        for index, row in data_test.iterrows():
            if res[index] == row['correctAnswer']:
                correct += 1
            total += 1
        return float(correct) / total

    if not already_indexed:
        make_index(indexed_data, index_destination, source)

    res = {}
    MAX = 100
    docs_per_q = range(1, 20)
    records = []

    # analyzer = StandardAnalyzer(Version.LUCENE_30)
    analyzer = SnowballAnalyzer(Version.LUCENE_30, "English", StandardAnalyzer.STOP_WORDS_SET)
    reader = IndexReader.open(SimpleFSDirectory(File(index_destination)))
    searcher = IndexSearcher(reader)

    for index, row in data_test.iterrows():
        queries = [row['answerA'], row['answerB'], row['answerC'], row['answerD']]
        queries = [row['question'] + ' ' + q for q in queries]
        scores = {}
        for q in queries:
            query = QueryParser(Version.LUCENE_30, "content", analyzer).parse(re.sub("[^a-zA-Z0-9]", " ", q))
            # query = QueryParser(Version.LUCENE_30, "content", analyzer).parse(re.sub("[/^]", "\^", q))
            hits = searcher.search(query, MAX)
            doc_importance = [hit.score for hit in hits.scoreDocs]
            for n in docs_per_q:
                scores.setdefault(n, [])
                scores[n].append(sum(doc_importance[:n]))
        to_records = [index + 102501]
        to_records.append(['A', 'B', 'C', 'D'][np.argmax(scores[4])])
        records.append(to_records)
        for n in docs_per_q:
            res.setdefault(n, [])
            res[n].append(['A', 'B', 'C', 'D'][np.argmax(scores[n])])

    df = pandas.DataFrame.from_records(records, columns=["id", "correctAnswer"])
    df = df.set_index("id")
    df.to_csv("ololo.csv")

    # print res[4]
    best = choose_best()
    print best
    score = calculate_score(res[best])
    # score = calculate_score(res)
    print score
def create_searcher():
    init_lucene()
    reader = IndexReader.open(SimpleFSDirectory(File(INDEX_DIR)))
    return IndexSearcher(reader)