class TextSearcher:
    """Searches a Lucene index and produces HTML-highlighted fragments
    from the 'contents' field."""

    def __init__(self, fs_directory):
        """Open the index at *fs_directory* and build the search components
        (shared reader, searcher, analyzer, dictionary, HTML formatter)."""
        directory = SimpleFSDirectory(Paths.get(fs_directory))
        self.index_reader = DirectoryReader.open(directory)
        # Share one reader with the searcher instead of opening a second
        # DirectoryReader (the original opened two and leaked one); also
        # drop the duplicated StandardAnalyzer assignment.
        self.searcher = IndexSearcher(self.index_reader)
        self.analyzer = StandardAnalyzer()
        self.query = None
        self.lucene_dictionary = LuceneDictionary(self.index_reader, 'contents')
        self.formatter = SimpleHTMLFormatter()
        self.hits = None

    def search(self, searchtext):
        """Parse and run *searchtext* against 'contents'; return the hit
        count (0 for None input).  Stores the parsed query in self.query."""
        if searchtext is None:
            return 0
        self.query = QueryParser("contents", self.analyzer).parse(searchtext)
        score_docs = self.searcher.search(self.query, 50).scoreDocs
        print("%s total matching documents." % len(score_docs))
        return len(score_docs)

    def find_documents(self, search_text):
        """Run *search_text* and return (and store in self.hits) the
        TopDocs for the top 50 matches."""
        self.query = QueryParser("contents", self.analyzer).parse(search_text)
        self.hits = self.searcher.search(self.query, 50)
        return self.hits

    def get_document(self, document_id):
        """Fetch a stored document by its Lucene doc id."""
        return self.searcher.doc(document_id)

    def get_current_query(self):
        """Return the most recently parsed query (None before any search)."""
        return self.query

    def get_highlighted_hits(self):
        """Return [(doc_id, best_fragments)] for the hits of the last
        find_documents() call, with query terms wrapped by the HTML
        formatter.  Requires find_documents() to have been called first."""
        extracted_fragments = []
        scorer = QueryScorer(self.query)
        fragmenter = SimpleSpanFragmenter(scorer, 10)
        highlighter = Highlighter(self.formatter, scorer)
        highlighter.setTextFragmenter(fragmenter)
        for hit in self.hits.scoreDocs:
            document = self.searcher.doc(hit.doc)
            stream = TokenSources.getAnyTokenStream(self.index_reader, hit.doc, 'contents', self.analyzer)
            best_fragments = highlighter.getBestFragments(
                stream, document.get('contents'), 10)
            for fragment in best_fragments:
                print('fragment: ', fragment)
            extracted_fragments.append((hit.doc, best_fragments))
        return extracted_fragments
def run(command): vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() STORE_DIR = "index1" directory = SimpleFSDirectory(File(STORE_DIR)) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(analysis(command)) HighlightFormatter = SimpleHTMLFormatter() highlighter = Highlighter(HighlightFormatter, QueryScorer(query)) scoreDocs = searcher.search(query, 500).scoreDocs print "%s total matching documents." % len(scoreDocs) result = [] for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print 'path:', doc.get("path"), 'name:', doc.get( "name"), 'url:', doc.get("url"), 'title:', doc.get("title") text = doc.get('contents') highLightText = highlighter.getBestFragment(analyzer, "contents", text) if highLightText != None: highLightText = ''.join(highLightText.split(' ')) data = {} data['url'] = doc.get("url") data['title'] = doc.get('title') data['highlight'] = highLightText result.append(data) return result
def search_loop(index_dir, field="contents", explain=False):
    """Interactive query prompt over the index at *index_dir*.

    Reads queries from stdin until a blank line is entered.  Prints the
    top 50 matches for each query; when *explain* is true, Lucene's
    scoring explanation is printed per hit as well.
    """
    reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(index_dir)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer()
    print("Hit enter with no input to quit.")
    while True:
        user_input = input("Query:")
        if user_input == '':
            return
        print("Searching for: %s" % user_input)
        parsed = QueryParser(field, analyzer).parse(user_input)
        top_docs = searcher.search(parsed, 50).scoreDocs
        print("%s total matching documents." % len(top_docs))
        for sd in top_docs:
            hit_doc = searcher.doc(sd.doc)
            if field == 'web':
                print(
                    f'{hit_doc.get("web")} | {hit_doc.get("raw")} | {sd.score}')
            else:
                print('path:', hit_doc.get("path"), 'name:', hit_doc.get("name"))
            if explain:
                print(searcher.explain(parsed, sd.doc))
                print('------------')
def l_searcher(query_string, directory, number_documents): lucene.initVM() # analyzer = StandardAnalyzer() reader = DirectoryReader.open(FSDirectory.open(Paths.get(directory))) searcher = IndexSearcher(reader) # Top 'n' documents as result topN = number_documents try: # query = QueryParser("question", analyzer).parse(query_string) query = FuzzyQuery(Term("question", query_string), 2) print("The query was: {}".format(query)) hits = searcher.search(query, topN) print("The hits were: ") options = [] options_answers = [] # print(hits.totalHits) for hit in hits.scoreDocs: print(hit.doc) # print(hit.score, hit.doc, hit.toString()) doc = searcher.doc(hit.doc) options_answers.append(doc.get("answer")) options.append(doc.get("question")) # print(doc.get("answer")) return options, options_answers except IndexError: return None
def run_img(command):
    """Search index2 over both 'urlcontent' and 'title' (OR semantics) for
    *command* and return a list of {title, url, imgurl} dicts.

    Attaches the current thread to the running JVM; prints a warning when
    there are no hits.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index2"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    # OR the two per-field queries together: a document matches if either
    # its content or its title matches.
    querys = BooleanQuery()
    query_content = QueryParser(Version.LUCENE_CURRENT, "urlcontent",
                                analyzer).parse(command)
    query_title = QueryParser(Version.LUCENE_CURRENT, "title",
                              analyzer).parse(command)
    querys.add(query_content, BooleanClause.Occur.SHOULD)
    querys.add(query_title, BooleanClause.Occur.SHOULD)
    scoreDocs = searcher.search(querys, 50).scoreDocs
    if len(scoreDocs) == 0:
        print "WARNING: No result"
    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print doc.get("title")
        data = {}
        data['title'] = doc.get('title')
        data['url'] = doc.get('url')
        data['imgurl'] = doc.get('imgurl')
        result.append(data)
    return result
def retriever(file_dir):
    """For each test AST query, write the source and summary of its most
    similar training example.

    Searches the 'code' field of the index under file_dir/lucene_index/
    with each (sanitized) line of test/test.ast.src, then writes the best
    hit's training source line to test/test.ref.src.0 and its summary to
    output/ast.out.  Exits the process with -1 if any query has no hit.
    """
    analyzer = WhitespaceAnalyzer()
    reader = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/")))
    searcher = IndexSearcher(reader)
    queryParser = QueryParser("code", analyzer)
    # Long code lines expand into many boolean clauses; lift the cap.
    BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE)
    with open(file_dir + "/train/train.spl.src", 'r') as fso, open(
            file_dir + "/train/train.txt.tgt", 'r') as fsu:
        sources = [line.strip() for line in fso.readlines()]
        summaries = [line.strip() for line in fsu.readlines()]
    with open(file_dir+"/test/test.ast.src") as ft, open(file_dir+"/test/test.ref.src.0", 'w') as fwo, \
            open(file_dir+"/output/ast.out", 'w') as fws:
        # Strip non-word characters and boolean operator keywords so the
        # line is safe to hand to the query parser.
        queries = [
            re.sub("[\W\s]+|AND|NOT|OR", ' ', line.strip())
            for line in ft.readlines()
        ]
        for i, line in enumerate(queries):
            print("query %d" % i)
            query = queryParser.parse(QueryParser.escape(line))
            # Only the single best match is used.
            hits = searcher.search(query, 1).scoreDocs
            flag = False
            for hit in hits:
                doc = searcher.doc(hit.doc)
                # NOTE(review): eval() of an indexed field; int() would be
                # safer if ids are plain integers -- confirm.
                _id = eval(doc.get("id"))
                flag = True
                fwo.write(sources[_id] + '\n')
                fws.write(summaries[_id] + '\n')
            if not flag:
                print(query)
                print(hits)
                exit(-1)
def search():
    """Flask view: build a Lucene query string from the POSTed form and
    render the matching documents in busca.html.

    'ies', 'area' and 'professor' become required (+) field clauses;
    'conceito' expands into optional m:/d:/f: clauses.  The actual search
    is delegated to mansearch.buscar; this view then loads each hit's
    stored fields for the template.
    """
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    args = []
    if request.method == 'POST':
        if request.form['ies']:
            args.append('+ies:'+request.form['ies'])
        if request.form['area']:
            args.append('+area:'+request.form['area'])
        if request.form['professor']:
            args.append('+professor:'+request.form['professor'])
        if request.form['conceito']:
            #args.append('m:'+request.form['conceito']+'d:'+request.form['conceito']+'f:'+request.form['conceito'])
            args.append('m:'+request.form['conceito'])
            args.append('d:'+request.form['conceito'])
            args.append('f:'+request.form['conceito'])
    table = []
    if(len(args) > 0):
        scoreDocs = mansearch.buscar('indexer/',args)
        fsDir = SimpleFSDirectory(File(indexDir))
        searcher = IndexSearcher(DirectoryReader.open(fsDir))
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            # Flatten every stored field of the hit into a dict for the template.
            table.append(dict((field.name(), field.stringValue())
                              for field in doc.getFields()))
    return render_template('busca.html',table = table)
    pass
def query(self, data):
    """Search the 'id' field for data['query'] and return a results dict.

    :param data: mapping with a 'query' key holding the query string
    :returns: {'totalHits': n, 'hits': {doc_id: {'score': ..., field: value}}}
        when the index file exists; implicitly None otherwise.

    Closes the searcher's underlying reader before returning.
    """
    if self.fil.exists():
        searcher = IndexSearcher(DirectoryReader.open(self.d))
        query = QueryParser(
            Version.LUCENE_30, "id", self.analyzer).parse(
            data['query'])
        # Very large cap: effectively "all matches".
        hits = searcher.search(query, 100000)
        results = {}
        results['totalHits'] = hits.totalHits
        results['hits'] = {}
        for hit in hits.scoreDocs:
            record = {}
            doc = searcher.doc(hit.doc)
            fields = doc.getFields()
            record['score'] = hit.score
            # Copy every stored field except the id (used as the key).
            for field in fields:
                if field.name() != "id":
                    record[field.name()] = field.stringValue()
            results['hits'][doc.get('id')] = record
        searcher.getIndexReader().close()
        return results
def func2(command):
    """Search the 'zhuliao' (main-ingredient) field of index1 for *command*
    and return up to 9 recipe tuples sorted by collect_num descending.

    Each tuple: (name, collect_num, zhuliao list, zuofa step list,
    img_url, url).  Returns None for an empty command.
    """
    STORE_DIR = "index1"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    res = []
    if command == '':
        return
    query = QueryParser(Version.LUCENE_CURRENT, "zhuliao",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 9).scoreDocs
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        try:
            res.append([
                doc.get("name"),
                doc.get("collect_num"),
                doc.get("zhuliao").split(' '),
                doc.get("zuofa").split('\n'),
                doc.get("img_url"),
                doc.get("url")
            ])
        except:
            # NOTE(review): bare except silently drops documents with a
            # missing field (doc.get(...) returning None makes .split
            # raise AttributeError); consider catching that explicitly.
            pass
    res1 = []
    for i in res:
        # collect_num is stored as a string; convert for numeric sorting.
        i[1] = int(i[1])
        res1.append(tuple(i))
    res2 = sorted(res1, cmp=None, key=lambda x: x[1], reverse=True)
    return res2
class Searcher:
    """Multi-field searcher over the two FIELDS of an existing index."""

    # comment out to run searcher by itself
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    def __init__(self, indexDir):
        """Open the index at *indexDir* and record per-field weights
        (FIELDS[0] weighted 1, FIELDS[1] weighted 0.2)."""
        f = Paths.get(indexDir)
        self._dir = SimpleFSDirectory(f)
        self._indexSearcher = IndexSearcher(DirectoryReader.open(self._dir))
        self._weights = HashMap()
        self._weights.put(FIELDS[0], 1)
        self._weights.put(FIELDS[1], 0.2)

    def search(self, query):
        """Search both FIELDS (OR semantics) and return up to 10 tuples
        (rank, doc_id, filename, contents)."""
        SHOULD = BooleanClause.Occur.SHOULD
        q = MultiFieldQueryParser.parse(query, FIELDS, [SHOULD, SHOULD],
                                        StandardAnalyzer())
        # print(q.toString())
        topHits = 100
        scores = self._indexSearcher.search(q, topHits).scoreDocs
        results = []
        # Bug fixes vs. original: list.append takes a single argument, so
        # collect each row as a tuple; and cap the loop at len(scores) to
        # avoid an IndexError when there are fewer than 10 hits.
        for i in range(min(10, len(scores))):
            doc = self._indexSearcher.doc(scores[i].doc)
            results.append((i + 1, scores[i].doc,
                            doc.get("filename"), doc.get("contents")))
        return results
def search(querystr):
    """Query the 'name' field of the on-disk 'index' directory and return
    a list of person dicts (Name plus birth/death date and note fields),
    printing each matched name as it is collected."""
    print('lucene', lucene.VERSION)
    # lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    store = FSDirectory.open(Paths.get("index"))
    searcher = IndexSearcher(DirectoryReader.open(store))
    parsed = QueryParser("name", StandardAnalyzer()).parse(querystr)
    top_docs = searcher.search(parsed, 20)
    people = []
    for rank, hit in enumerate(top_docs.scoreDocs, start=1):
        # print(hit.doc, hit.score)
        record = searcher.doc(hit.doc)
        print(rank, record.get("name"))
        people.append({
            'Name': record.get("name"),
            'Birth date': record.get("birth_date"),
            'Death date': record.get("death_date"),
            'Birth note': record.get("birth_note"),
            'Death note': record.get("death_note"),
        })
    return people
def get_image_pmcid(pmcid, classes = ""):
    """Return the image documents belonging to the given pmcid(s).

    :param pmcid: iterable of PMC ids, joined into one pmcid:(...) clause
    :param classes: image class filter; the literal string "all" disables
        the class restriction
    :returns: list of matched Lucene Documents from the figures index
    """
    fields = ["pmcid", "class"]
    docs = []
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    # multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    #query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    # query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    #query = query.parse(query, ('4175339','1'))
    # query.parse(queryString)#"Shigella sonnei"
    # query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)#"Shigella sonnei"
    MAX = 10000
    #hits = searcher.search(query, MAX)
    # "all" means no class restriction; otherwise AND in the class filter.
    if classes == "all":
        queryStr = "pmcid:(" + ' '.join(pmcid) +")"
    else:
        queryStr = "pmcid:(" + ' '.join(pmcid) +")" + " AND class:" + classes
    query = QueryParser(Version.LUCENE_4_10_1, "pmcid",analyzer)#needed to build a custom query
    q = query.parse(queryStr)
    hits = searcher.search(q, MAX)
    for hit in hits.scoreDocs:#should only be one
        #print hit.score, hit.doc, hit.toString()
        docs.append(searcher.doc(hit.doc))
    return docs #This will return the image documents that belong to a pmcid(article)
class LuceneSearcher(object):
    """Read-only wrapper over a Lucene index whose documents carry the
    fields listed in `fields`; results come back as plain dicts."""

    fields = ['id', 'text', 'types']

    def __init__(self, db_path):
        """Open the index stored at *db_path* and log its document count."""
        reader = DirectoryReader.open(SimpleFSDirectory(File(db_path)))
        self.searcher = IndexSearcher(reader)
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        logger.info("Loaded DB from %s with %d documents: ", db_path,
                    reader.numDocs())

    def search(self, query, max_matches=1000):
        """Sanitize *query*, run it against the 'text' field, and return
        up to *max_matches* documents as dicts."""
        sanitized = VALID_CHARS_PATTERN.sub(' ', query)
        logger.debug("Searching for %s", sanitized)
        parsed = QueryParser(Version.LUCENE_CURRENT, "text",
                             self.analyzer).parse(sanitized)
        score_docs = self.searcher.search(parsed, max_matches).scoreDocs
        logger.debug("%s total matching documents.", len(score_docs))
        return [self.convert_to_dict(self.searcher.doc(sd.doc))
                for sd in score_docs]

    def convert_to_dict(self, doc):
        """Project a Lucene Document onto the configured field names."""
        return dict((name, doc.get(name)) for name in self.fields)
def search(self, field, text):
    """
    search text within indexed data

    input:
        field   fieldname of the value that will be indexed
        text    text to search

    output:
        hits    return a list of (score, url, title) tuples
    """
    results = []
    idx_reader = DirectoryReader.open(self.directory)
    idx_searcher = IndexSearcher(idx_reader)

    # parse query
    parser = AnalyzingQueryParser(Version.LUCENE_CURRENT, field,
                                  self.analyser)
    query = parser.parse(text)

    # search -- .tolist() converts the JArray of ScoreDocs to a Python list
    hits = idx_searcher.search(query, 1000).scoreDocs.tolist()
    for hit in hits:
        doc = idx_searcher.doc(hit.doc)
        score = hit.score
        title = doc.get(field)
        url = doc.get("url")
        results.append((score, url, title))
    return results
def search(self, index_dir):
    """Run self.query against the index at *index_dir* and return the
    stored FIELD_PATH values of the top self.retrieve_count hits as ints."""
    store = SimpleFSDirectory(File(index_dir))
    reader = DirectoryReader.open(store)
    # One IndexSearcher can be reused across queries for performance.
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    # Parser targeting the contents field.
    parser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
    parsed = parser.parse(self.query)
    top_docs = searcher.search(parsed, self.retrieve_count)
    return [int(searcher.doc(sd.doc).get(FIELD_PATH))
            for sd in top_docs.scoreDocs]
def get_candidates(qatp):
    """For each (question, answer, t, p) tuple, return the candidate
    document ids retrieved for the question text.

    Escapes Lucene boolean keywords in the question, searches the 'text'
    field, and collects up to prm.max_candidates 'id' values per query.
    Optionally (re)builds the index first when prm.create_index is set.
    """
    if prm.create_index:
        create_index()
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    n = 0
    for q, a, t, p in qatp:
        # Progress marker every 100 samples.
        if n % 100 == 0:
            print 'finding candidates sample', n
        n += 1
        # Escape boolean operator keywords so they are treated as literals.
        q = q.replace('AND', '\\AND').replace('OR',
                                              '\\OR').replace('NOT', '\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text",
                            analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        c = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            c.append(doc.get("id"))
        candidates.append(c)
    return candidates
def search(self, input_query=None, max_answers=10):
    '''Searches the given query in the index.

    :param input_query: query string; None returns None immediately
    :param max_answers: maximum number of hits to return
    :returns: list of {field name: stored value} dicts, one per hit
    '''
    if input_query is None:
        return None

    base_dir = '.'
    directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # query = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(input_query)
    # Search across all post and answer fields simultaneously.
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT,
                                   (self._posts_fields + self._answer_fields),
                                   analyzer)
    query = MultiFieldQueryParser.parse(parser, input_query)

    scoreDocs = searcher.search(query, max_answers).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    docs = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        # Flatten the hit's stored fields into a plain dict.
        doc_dict = dict((field.name(), field.stringValue())
                        for field in doc.getFields())
        docs.append(doc_dict)
        # print doc
    return docs
class Searcher(object):
    """Simple query interface over an existing Lucene index on disk."""

    def __init__(self, **kwargs):
        """
        Initialize a new instance of the Searcher

        :param count: The number of counts to return from a query
        :param root: The output directory of the underlying index
        """
        self.count = kwargs.get("count", 100)
        self.output = kwargs.get("root", "index")
        self.store = SimpleFSDirectory(File(self.output))
        self.analyzer = StandardAnalyzer(Version.LUCENE_30)
        self.searcher = IndexSearcher(DirectoryReader.open(self.store))

    def search(self, query):
        """
        Given a query, apply it against the existing index.

        :param query: The query to apply to the index
        :returns: A generator of (path, score) pairs for each match
        """
        parsed = QueryParser(Version.LUCENE_30, "data",
                             self.analyzer).parse(query)
        hits = self.searcher.search(parsed, self.count)
        for hit in hits.scoreDocs or []:
            matched = self.searcher.doc(hit.doc)
            yield matched.get("path"), hit.score
def get_candidates(qatp):
    """For each (question, answer, t, p) tuple, return the candidate
    document ids retrieved for the question text.

    Escapes Lucene boolean keywords in the question, searches the 'text'
    field, and collects up to prm.max_candidates 'id' values per query.
    Optionally (re)builds the index first when prm.create_index is set.
    """
    if prm.create_index:
        create_index()
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    n = 0
    for q,a,t,p in qatp:
        # Progress marker every 100 samples.
        if n % 100 == 0:
            print 'finding candidates sample', n
        n+=1
        # Escape boolean operator keywords so they are treated as literals.
        q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text",
                            analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        c = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            c.append(doc.get("id"))
        candidates.append(c)
    return candidates
def search(self):
    '''Searches the given query in the index.

    Interactive loop: initializes the JVM, opens the index under
    self.index_dir, and prompts for queries on stdin until a blank line
    is entered.  Prints each of the top 50 matching documents.
    '''
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    # print 'lucene', lucene.VERSION
    # base_dir = os.path.dirname(os.path.abspath('.'))
    base_dir = '.'
    directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return
        print
        print "Searching for:", command
        # Queries target the 'title' field only.
        query = QueryParser(Version.LUCENE_CURRENT, "title",
                            analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            # print 'path:', doc.get("path"), 'name:', doc.get("name")
            print doc
def func1(command):
    """Search the 'contents' field of the 'index' store for *command*
    (tokenized with jieba) and return up to 50 results as
    {title, url, sentence} dicts.  Returns [] for an empty command.
    """
    global vm_env
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    #lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    # base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    if command == '':
        return []
    # Segment the (Chinese) query into space-separated tokens so the
    # whitespace analyzer can match the indexed terms.
    command_list = jieba.cut(command)
    command = " ".join(command_list)
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 50).scoreDocs
    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doct = {
            'title': doc.get("title"),
            'url': doc.get("url"),
            "sentence": doc.get("sentence")
        }
        result.append(doct)
    del searcher
    return result
def GET(self):
    """Handle a JSONP search request: query the 'contents' field of the
    jdindex store with the 'command' parameter and return de-duplicated
    titles wrapped as fn({...});."""
    command = web.input().command.encode('utf-8')
    initvm.vm_env.attachCurrentThread()
    store = SimpleFSDirectory(File("jdindex"))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    parsed = QueryParser(Version.LUCENE_CURRENT, "contents",
                         analyzer).parse(command)
    titles = []
    for sd in searcher.search(parsed, 20).scoreDocs:
        title = searcher.doc(sd.doc).get("title").strip('\n')
        if title not in titles:
            titles.append(title)
    web.header('content-type', 'text/json')
    payload = {}
    payload['q'] = command
    payload['p'] = 'false'
    payload['s'] = titles
    return 'fn(' + json.dumps(payload) + ');'
def SearchImgCommand(command):
    """Search the jdindex 'contents' field for *command* (jieba-tokenized)
    and return one {imgurl, title, itemurl, score} dict per unique itemurl,
    in descending score order."""
    initvm.vm_env.attachCurrentThread()
    store = SimpleFSDirectory(File("jdindex"))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    tokenized = ' '.join(jieba.cut(command))
    parsed = QueryParser(Version.LUCENE_CURRENT, "contents",
                         analyzer).parse(tokenized)
    seen_urls = []
    unique_docs = []
    for sd in searcher.search(parsed, 50).scoreDocs:
        doc = searcher.doc(sd.doc)
        itemurl = doc.get("itemurl")
        if itemurl in seen_urls:
            continue  # keep only the first (highest-scoring) hit per item
        unique_docs.append({
            'imgurl': doc.get("imgurl"),
            'title': doc.get("title").strip('\n'),
            'itemurl': itemurl,
            'score': sd.score,
        })
        seen_urls.append(itemurl)
    return unique_docs
def get_query_results(reader, query, n, field):
    """Print the top *n* hits for the pre-built *query*, showing the
    stored *field* value of each matched document."""
    searcher = IndexSearcher(reader)
    score_docs = searcher.search(query, n).scoreDocs
    print("Found %d hits:" % len(score_docs))
    for rank, sd in enumerate(score_docs, start=1):
        matched = searcher.doc(sd.doc)
        print("%d. %s" % (rank, matched.get(field)))
def search_docs(self, value, field="general_info"):
    """Return up to 1000 Lucene Documents whose *field* matches *value*."""
    MAX_RESULTS = 1000
    searcher = IndexSearcher(DirectoryReader.open(self.store))
    parsed = QueryParser(Version.LUCENE_CURRENT, field,
                         self.analyzer).parse(value)
    top_docs = searcher.search(parsed, MAX_RESULTS)
    return [searcher.doc(sd.doc) for sd in top_docs.scoreDocs]
def perform_search(self, searchterm, results_per_page, page):
    """Search title/description/content for *searchterm* (OR semantics).

    :param searchterm: raw user query, parsed independently per field
    :param results_per_page: page size
    :param page: 0-based page index; hits on earlier pages are skipped
    :returns: (results, duration, count_results) where results is a list
        of {field name: stored value} dicts for the requested page,
        duration is the search wall time, count_results the total hits
    """
    # Dead commented-out alternative implementation removed.
    # One QueryParser per searched field, OR-combined into a BooleanQuery.
    parser_title = QueryParser(Version.LUCENE_CURRENT, "title",
                               self.analyzer)
    parser_description = QueryParser(Version.LUCENE_CURRENT, "description",
                                     self.analyzer)
    parser_content = QueryParser(Version.LUCENE_CURRENT, "content",
                                 self.analyzer)
    query = BooleanQuery()
    query.add(parser_title.parse(searchterm), BooleanClause.Occur.SHOULD)
    query.add(parser_description.parse(searchterm), BooleanClause.Occur.SHOULD)
    query.add(parser_content.parse(searchterm), BooleanClause.Occur.SHOULD)

    # Conduct the search, asking for enough hits to cover every page up
    # to and including the requested one.
    searcher = IndexSearcher(DirectoryReader.open(self.store))
    start = datetime.now()
    hits = searcher.search(query, results_per_page + (results_per_page * page))
    score_docs = hits.scoreDocs
    count_results = hits.totalHits
    duration = datetime.now() - start

    # Collect only the hits belonging to the requested page.
    results = []
    offset = results_per_page * page
    for position, scoreDoc in enumerate(score_docs):
        if position < offset:
            continue  # hit belongs to an earlier page
        doc = searcher.doc(scoreDoc.doc)
        results.append(dict((field.name(), field.stringValue())
                            for field in doc.getFields()))
    return results, duration, count_results
def SearchQuery(queryString, fields, classification):
    """Search the articles index across *fields* for *queryString*.

    :param queryString: user query, parsed across all *fields*
    :param fields: field names for the MultiFieldQueryParser
    :param classification: image class filter passed to get_image_pmcid
    :returns: {pmcid: {"title": ..., "imgURL": ...}} for every hit; each
        article gets a representative figure image or a placeholder.
    """
    #if __name__ == "__main__":
    #if __name__ == "retriever":
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    #lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    #multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    #query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(query, queryString)
    #query.parse(queryString)#"Shigella sonnei"
    #query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)#"Shigella sonnei"
    MAX = 10000
    hits = searcher.search(query, MAX)
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    paths = []
    pmcids = []
    documentDict = {}
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        pmcids.append(doc.get("pmcid"))
        docDict = {"title" : doc.get("title")}#we can add any other field we want...
        documentDict[doc.get("pmcid")] = docDict

    # Where we get the images for all the pmcids
    images = get_image_pmcid(pmcids, classification)#should take in pmcids and class
    # create dictionary of images with pmcid being their key
    imagesDict = {}
    for img in images:
        img_pmcid = img.get("pmcid")
        if img_pmcid in imagesDict.keys():
            imagesDict[img_pmcid].append(img.get("filepath") + "/" + img.get("figureid"))
        else:
            imagesDict[img_pmcid] = [(img.get("filepath") + "/" + img.get("figureid"))]

    # for each pmcid, we will assign an image to it for the search results
    for pmcid in pmcids:
        if imagesDict:
            docDict = documentDict[pmcid]
            docDict["imgURL"] = imagesDict[pmcid][0]
            documentDict[pmcid] = docDict
        else:
            # No figures at all: fall back to a placeholder image.
            docDict = documentDict[pmcid]
            docDict["imgURL"] = "images/NoImageAvailable.jpg"
            documentDict[pmcid] = docDict
    #END - Where we get the images for all the pmcids
    return documentDict
class SearchIndex(object):
    """Paginated searcher over the app's index with HTML highlighting and
    optional duplicate filtering."""

    def __init__(self):
        """Attach to the running JVM and open the configured index."""
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH']))
        self.searcher = IndexSearcher(DirectoryReader.open(indexDir))

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        self.parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)

    def search(self, q, page = 1, duplicates = False):
        """Search for *q* and return (totalPages, docs) for the requested
        1-based *page* (10 results per page).  Unless *duplicates* is
        true, documents flagged duplicate='true' are excluded."""
        query = self.parser.parse(q)

        if not duplicates:
            query = self.addDuplicatesQuery(query)

        perPage = 10
        start = (page - 1) * perPage

        # Collect up to 1000 hits, then slice out the requested page.
        results = TopScoreDocCollector.create(1000, True)
        self.searcher.search(query, results)

        highlighter = Highlighter(QueryScorer(query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))

        docs = []
        for scoreDoc in results.topDocs(start, perPage).scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            tokenStream = self.analyzer.tokenStream("contents", StringReader(doc['contents']))
            # Up to 3 highlighted fragments joined by "..."
            highlight = highlighter.getBestFragments(tokenStream, doc['contents'], 3, "...")

            docs.append({
                'title': doc['title'],
                'url': doc['url'],
                'duplicate': doc['duplicate'],
                'highlight': highlight}
            )

        del self.searcher

        totalPages = int(math.ceil(results.getTotalHits()/float(perPage)))

        return totalPages, docs

    def addDuplicatesQuery(self, query):
        """AND the original query with duplicate='false' so only
        non-duplicate documents match."""
        not_duplicate = TermQuery(Term('duplicate', 'false'))
        booleanQuery = BooleanQuery()
        booleanQuery.add(not_duplicate, BooleanClause.Occur.MUST)
        booleanQuery.add(query, BooleanClause.Occur.MUST)
        return booleanQuery
class WikiPageIndex():
    """Builds and searches a Lucene index of wiki pages with stored
    'Title' and 'Text' fields."""

    def __init__(self, index_dir):
        """Open (and configure for CREATE mode) the index at *index_dir*."""
        #lucene.initVM(vmargs=['-Djava.awt.headless=true', '-Xmx4g'])
        self.index_dir = index_dir
        self.directory = SimpleFSDirectory(File(self.index_dir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        # CREATE: any existing index at this path is overwritten.
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.searcher = IndexSearcher(DirectoryReader.open(self.directory))

    def createIndex(self):
        """Open an IndexWriter for this directory.

        NOTE(review): the mkdir happens after the writer is constructed,
        which looks backwards -- confirm SimpleFSDirectory creates the
        path itself.
        """
        self.writer = IndexWriter(self.directory, self.config)

        if not os.path.exists(self.index_dir):
            os.mkdir(self.index_dir)

    def addDocumentToIndex(self, title, text):
        """Add one analyzed+stored document with the given title and text."""
        doc = Document()

        doc.add(Field("Title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("Text", text, Field.Store.YES, Field.Index.ANALYZED))

        self.writer.addDocument(doc)

    def closeIndex(self):
        """Commit pending documents and close the writer."""
        self.writer.commit()
        self.writer.close()

    def searchIndex(self, queryString, field="Text", max_results=100):
        """Return up to *max_results* Documents matching *queryString* in
        *field*."""
        query = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(queryString)
        scoreDocs = self.searcher.search(query, max_results).scoreDocs
        log.debug("Found {0} documents for query [{1}]".format(len(scoreDocs), queryString))

        docs = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            log.debug(WikiPageIndex.cleanWikiText(doc.get("Text")))

            #print("title: {0}\ncontents: {1}".format(doc.get("Title"), doc.get("Text")[:70]))
            docs.append(doc)
        return docs

    @staticmethod
    def cleanWikiText(text):
        """Strip wiki markup ([[...]], {{...}}, {|...|}) and non-word
        characters, collapsing blank lines."""
        text = text.encode('ascii', 'ignore')
        text = re.sub('(\[\[.*?\]\]|\{\{.*?\}\}|\{\|.*?\|\})', '', text)
        text = re.sub('[^\na-zA-Z0-9\n_-]+', ' ', text)
        text = re.sub('([ \t]*[\n]+[ \t]*)+', '\n', text)
        return text.strip()
def search(term, n_docs=10, index='index'):
    """Return [(score, art_body)] for the top *n_docs* matches of *term*
    against the 'art_body' field of the index at *index*."""
    store = SimpleFSDirectory(File(index))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    parsed = QueryParser(Version.LUCENE_CURRENT, 'art_body',
                         analyzer).parse(term)
    # str(query.getClass().toString()) == "class org.apache.lucene.search.TermQuery"
    hits = searcher.search(parsed, n_docs).scoreDocs
    return [(hit.score, unicode(searcher.doc(hit.doc).get('art_body')))
            for hit in hits]
def retrieve_wiki(text_query, index_directory_name):
    """Yield the 'contents' field of up to 1000 documents matching
    *text_query* in the index at *index_directory_name*."""
    lucene.initVM()
    store = FSDirectory.open(File(index_directory_name))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    parsed = QueryParser(Version.LUCENE_CURRENT, "contents",
                         analyzer).parse(text_query)
    for sd in searcher.search(parsed, 1000).scoreDocs:
        yield searcher.doc(sd.doc).get('contents')
def main(indexDir, inputDir):
    """Creates a SQLite database with news linked to other news by at least one term, backed by a Lucene Index.

    For every .json file in *inputDir* (one journal per file), each
    article title is tokenized and every token is searched against the
    index, excluding the article's own journal and URL.  Matching pairs
    are written to relationships.csv as (left url, token, right url).
    """
    lucene.initVM()

    # Open index
    logger.info("Opening Lucene index [%s]..." % indexDir)
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = KeywordAnalyzer(Version.LUCENE_CURRENT)
    reader = DirectoryReader.open(dir)
    searcher = IndexSearcher(reader)

    # Search documents
    onlyfiles = [
        f for f in listdir(inputDir)
        if isfile(join(inputDir, f)) and f.endswith('.json')
    ]
    rels = list()
    for f in onlyfiles:
        # Journal code is the filename stem.
        journal_code = f.split('.')[0]
        f = join(inputDir, f)
        json_data = open(f)
        data = json.load(json_data)
        # The results collected after comparison
        for entry in data:
            url = entry['url']
            date = entry['date']
            title = entry['title']
            logger.debug("Processing URL [%s] date [%s] - [%s]" % (url, date, title))
            tt = nltk.word_tokenize(title)
            tokens = []
            for t in tt:
                tokens.append(t.lower())
            for token in tokens:
                # Same-day articles in other journals containing this token,
                # excluding the source article itself.
                q = 'title: "%s" AND date: "%s" AND NOT journal: "%s" AND NOT url: "%s"' % (token, date, journal_code, url)
                query = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(q)
                hits = searcher.search(query, MAX_HITS)
                logger.debug("Found %d document(s) that matched query '%s':" % (hits.totalHits, q))
                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    logger.debug(doc)
                    rels.append({'left': url, 'token': token, 'right': doc.get('url')})
        json_data.close()
    # Dump all discovered relationships as UTF-8 CSV (Python 2 'wb' mode).
    with open('relationships.csv', 'wb') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        for rel in rels:
            csvwriter.writerow([rel['left'].encode('utf8'), rel['token'].encode('utf8'), rel['right'].encode('utf8')])
class PyLucene(object):
    """Query helper over the texts/index Lucene store; documents carry
    'doc_id', 'head' and 'text' fields."""

    def __init__(self):
        """Start the JVM and open the on-disk index read-only."""
        if luceneImport:
            self.lucene = True
        else:
            self.lucene = False

        #Lucene connection
        lucene.initVM()
        indexDir = "texts/index"
        directory = MMapDirectory(File(indexDir))
        directory = DirectoryReader.open(directory)
        self.analyzer = StandardAnalyzer(Version.LUCENE_30)
        self.searcher = IndexSearcher(directory)

    def query(self, terms = []):
        """OR the *terms* together against the 'text' field and return
        [[doc_id, head], ...] for up to 1000 hits (utf-8 encoded)."""
        query = QueryParser(Version.LUCENE_30, "text", self.analyzer).parse(" OR ".join(terms))
        MAX = 1000
        hits = self.searcher.search(query, MAX)

        results = []
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            results.append([doc.get("doc_id").encode("utf-8"), doc.get("head").encode("utf-8")])

        return results

    def occurencies(self, term, morphs):
        """Query all *morphs* (and their doubled forms) and return the
        de-duplicated results plus their count."""
        query = []
        already = []

        for morph in morphs:
            query.append(morph)
            #Sometime, when there is doubt about a term, because of xml hashing in Lucene, you would find twice a lemma like wordword
            query.append(morph+morph)

        results = self.query(query)

        # Keep only the first result per doc_id.
        resultsReturned = []
        for result in results:
            if result[0] not in already:
                resultsReturned.append(result)
                already.append(result[0])

        return resultsReturned, len(resultsReturned)

    def chunk(self, occurency):
        #Could be updated using the section information but could be only milesone
        return occurency#, len(occurency)
def author_search(qry, limit): helper.initPyLucene() RNLP_ctxt = _get_rnlp_ctxt() entry_map = RNLP_ctxt.get_entry_map() rootdir = OUT_RAW_DIR from org.apache.lucene.index import DirectoryReader from org.apache.lucene.search import IndexSearcher from org.apache.lucene.queryparser.classic import QueryParser from org.apache.lucene.analysis.standard import StandardAnalyzer from org.apache.lucene.store import FSDirectory from org.apache.lucene.util import Version from java.io import File reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR))) searcher = IndexSearcher(reader) analyzer = StandardAnalyzer(Version.LUCENE_40) field = 'contents' parser = QueryParser(Version.LUCENE_40, field, analyzer); query = parser.parse(qry); print 'Searching for:', query.toString(field) raw_results = searcher.search(query, limit) hits = raw_results.scoreDocs numTotalHits = raw_results.totalHits print numTotalHits, 'total matching documents' results = {} for hit in hits: doc = searcher.doc(hit.doc) entry_id = doc.get('entry_id') entry = entry_map.get(entry_id) short_title = entry['short_title'] print(entry['prim_author']) if qry in entry['prim_author'].lower(): fname = short_title + CONTENT_EXT results[entry_id] = {'title': short_title, 'file': fname } f = open ('/Users/Nelle/Documents/coding/text_analysis/newsvn/RenaissanceNLP/data/dataResults/authorPaths/' + qry + '.json', 'w') f.write(json.dumps(results)) f.close() return json.dumps(results)
def get_sorted_results(self, query): SHOULD = BooleanClause.Occur.SHOULD parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, query, ['docno', 'content'], [SHOULD, SHOULD], self.analyzer) reader = IndexReader.open(self.directory) searcher = IndexSearcher(reader) searcher.setSimilarity(BM25Similarity()) topDocs = searcher.search(parsed_query, 10) j = 0 for i in topDocs.scoreDocs: d = searcher.doc(i.doc) print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(i.score) j += 1
def getDocumentPMC_ID(pmcid, imageAndTitle=0):
    """Look up one article by PubMed Central id in the local index.

    pmcid         -- PMC identifier, also used to build NCBI URLs.
    imageAndTitle -- when 1, return (title, image, pmcid); otherwise
                     return (abstract, doi, title, volume, year,
                     publisher, fullText, pdf, pmcid).

    Fixes vs. previous revision: with zero hits `doi` stayed "" and
    `"" is not None` was True, so `"".split('/')[1]` raised IndexError;
    `image` could also be unbound at the return. Both now guarded.
    """
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    # VM already initialised elsewhere; just attach this thread.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)

    # multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    query = QueryParser(Version.LUCENE_4_10_1, "pmcid", analyzer).parse(pmcid)
    MAX = 1000
    hits = searcher.search(query, MAX)

    # Defaults used when the id is not found in the index.
    title = ""
    abstract = ""
    fullText = "http://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/"
    doi = ""  # split below to recover the publisher-specific suffix
    volume = ""
    year = ""
    publisher = ""
    image = None

    for hit in hits.scoreDocs:  # should only be one
        doc = searcher.doc(hit.doc)
        if imageAndTitle == 1:
            paths = [doc.get("articlepath")]
            image = get_image(paths)
        abstract = doc.get("abstract")
        doi = doc.get("doi")
        title = doc.get("title")
        volume = doc.get("volume")
        year = doc.get("year")
        publisher = doc.get("publisher")

    # DOI is "prefix/suffix"; the suffix names the PDF file. Guard both
    # the empty default and a DOI with no slash.
    if doi:
        doi_parts = doi.split('/')
        doiSecond = doi_parts[1] if len(doi_parts) > 1 else ""
    else:
        doiSecond = ""

    # e.g. http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3363814/pdf/cc11003.pdf
    pdf = "http://www.ncbi.nlm.nih.gov/pmc/articles/" + pmcid + "/pdf/" + doiSecond + ".pdf"

    if imageAndTitle == 1:
        return title, image, pmcid  # image may sometimes show up
    else:
        return abstract, doi, title, volume, year, publisher, fullText, pdf, pmcid
def get_wiki_nums(data_file, wikipedia_index):
    """Resolve Wikipedia doc ids from <data_file>.docid to their 'num'
    field and write 'id<TAB>num' lines to <data_file>.nums.

    Ids -1..3 are skipped (sentinel/special documents — TODO confirm).
    Fixes vs. previous revision: both files were opened without being
    closed; 'with' now guarantees closure.
    """
    lucene.initVM()
    reader = IndexReader.open(SimpleFSDirectory(File(wikipedia_index)))
    searcher = IndexSearcher(reader)

    # Collect unique, non-sentinel doc ids from the .docid file.
    what = set()
    with open(data_file + '.docid') as id_file:
        for line in id_file:
            line = line.strip()
            if len(line) == 0:
                continue
            parts = line.split('\t')
            if len(parts) == 2 and int(parts[1]) not in [-1, 0, 1, 2, 3]:
                what.add(int(parts[1]))

    with open(data_file + '.nums', 'w') as num_file:
        for item in what:
            num_file.write(str(item) + '\t' + searcher.doc(item).get("num").encode('utf-8') + '\n')
def searchLucene(requestParameter):
    """Search the ad index for *requestParameter* and return matching
    ads as `result` objects.

    Slashes in the query are treated as spaces. Matches are collected
    over the AdLine, FieldString and FieldRelatedWords fields.
    """
    searchResults = []
    requestParameter = requestParameter.replace("/", " ")
    # 1. open the index
    # NOTE(review): the VM is only initialised when this module is
    # imported under the name "luceneSearch" — presumably the caller
    # initialises it otherwise; confirm.
    if __name__ == "luceneSearch":
        lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    index = SimpleFSDirectory(File("Home/WishMatcherIndex"))
    reader = IndexReader.open(index)
    n_docs = reader.numDocs()
    print("Index contains %d documents." % n_docs)

    # 2. parse the query from the command line
    fields = ["AdLine", "FieldString", "FieldRelatedWords"]
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
    parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
    query = MultiFieldQueryParser.parse(parser, requestParameter)
    print(query)

    # 3. search the index for the query
    # We retrieve and sort all documents that match the query.
    # In a real application, use a TopScoreDocCollector to sort the hits.
    searcher = IndexSearcher(reader)
    hits = searcher.search(query, n_docs).scoreDocs

    # 4. display results
    print("Found %d hits:" % len(hits))
    for i, hit in enumerate(hits):
        doc = searcher.doc(hit.doc)
        product = doc.get("AdLine")
        url = doc.get("URL")
        # NOTE(review): doc.get returns a string, so the comparison with
        # the int 1200 is always True in Python 2 — looks like it should
        # compare against "1200"; confirm before changing. The slicing
        # presumably strips a trailing newline stored in the field.
        if (doc.get("AdId") != 1200):
            product = product[:-1]
            url = url[:-1]
        print("%d. %s" % (i + 1, doc.get("AdLine")))
        r = result(str(product), str(url))
        searchResults.append(r)
    # 5. close resources
    #searcher.close()
    print(searchResults)
    return searchResults
def similarityOfSynopsis(self):
    """Score every movie synopsis against the synopsis index and store
    the pairwise scores in the Similarities table.

    Walks settings.SYNOPSIS (file names are numeric movie pks), uses
    each file's cleaned text as a giant query, and records hit scores
    for movies with a larger pk (each unordered pair handled once).
    """
    directory = SimpleFSDirectory(File(settings.SYNOPSIS_INDEX))
    ireader = DirectoryReader.open(directory)
    searcher = IndexSearcher(ireader)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
    for root, dirnames, filenames in os.walk(settings.SYNOPSIS):
        # Sort numerically (names are integer pks), then back to str.
        filenames = [int(item) for item in filenames]
        filenames.sort()
        filenames = [str(item) for item in filenames]
        for filename in filenames:
            path = os.path.join(root, filename)
            major_movie = models.Movie.objects.get(pk=filename)
            with open(path, 'r') as moviedoc:
                # Flatten to one line and strip non-alphanumerics so the
                # whole synopsis can be parsed as a single query.
                content = moviedoc.read().replace('\n', ' ')
                content = re.sub('[^A-Za-z0-9 ]+', '', content)
            # Retry loop: a long synopsis can exceed BooleanQuery's max
            # clause count; double the limit until the parse succeeds.
            # NOTE(review): retries forever if parse fails for any other
            # reason — confirm TooManyClauses is the only expected error.
            while True:
                try:
                    query = queryParser.parse(QueryParser.escape(content))
                except Exception as e:
                    self.boolean_query.setMaxClauseCount(self.boolean_query.maxClauseCount * 2)
                    print self.boolean_query.maxClauseCount
                    continue
                break
            topDocs = searcher.search(query, len(filenames))
            scoreDocs = topDocs.scoreDocs
            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                movie_id = int(doc.get(FIELD_PATH))
                # Only record pairs once: skip ids <= the current movie.
                if movie_id <= major_movie.id:
                    continue
                minor_movie = models.Movie.objects.get(pk=movie_id)
                try:
                    # The pair may be stored in either orientation.
                    similarity = models.Similarities.objects.filter(first_movie=major_movie, second_movie=minor_movie).first()
                    if not similarity:
                        similarity = models.Similarities.objects.filter(first_movie=minor_movie, second_movie=major_movie).first()
                    similarity.synopsis = scoreDoc.score
                    similarity.save()
                except Exception as e:
                    # Row missing in both orientations (similarity is
                    # None) — report the pair and re-raise.
                    print major_movie.id, minor_movie.id
                    raise e
            print u"{0} completed.".format(major_movie.id)
class Indexer:
    """Phrase search over the './Myindex' Lucene index.

    Relies on a module-level segmenter ``seg`` for word segmentation.
    """
    #segmentor = Segmentor()

    def __init__(self):
        # Start the JVM headless and open reader/searcher over the
        # fixed local index directory.
        #self.segmentor.load('./cws.model')
        INDEXDIR = './Myindex'
        #lucene.initVM(vmargs='-Xcheck:jni,-verbose:jni,-verbose:gc')
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        #vm_env = lucene.getVMEnv()
        #vm_env.attachCurrentThread()
        #lucene.initVM(vmargs='-')
        #print 'lucene', lucene.VERSION
        self.directory = SimpleFSDirectory(File(INDEXDIR))
        self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
        # Whitespace analyzer: queries are pre-segmented words.
        self.analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
        self.reader = IndexReader.open(self.directory)

    def Qsearch(self, query):
        """Segment *query*, run it as an exact phrase over 'contents',
        and return the total term frequency of the joined phrase.

        NOTE(review): the phrase search results themselves are iterated
        but unused — only reader.totalTermFreq is returned; confirm the
        loop is intentional.
        """
        words = seg.segment(query.strip())
        #words = self.segmentor.segment(query.strip())
        #print ' '.join(words)
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        result = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)
        result.setPhraseSlop(0)
        # "\""+' '.join(words)+"\"~0" means words should be continuous
        query = result.parse("\"" + ' '.join(words) + "\"~0")
        totalHits = self.searcher.search(query, 50)
        #print "%s total matching documents." % totalHits.totalHits
        #return totalHits.totalHits
        for hit in totalHits.scoreDocs:
            #print"Hit Score: ",hit.score, "Hit Doc:",hit.doc, "HitString:",hit.toString()
            doc = self.searcher.doc(hit.doc)
            #print doc.get("name").encode("utf-8")
            #print "----------------------------------------"
        t = Term('contents', ' '.join(words))
        #termDocs = ireader.termDocs(t)
        #for tt in termDocs:
        #    print ireader.document(termDocs.docs).getFeildable('neme'),termDocs.freq()
        #print self.reader.totalTermFreq(t)
        return self.reader.totalTermFreq(t)
def custom_search(qry, limit): helper.initPyLucene() RNLP_ctxt = _get_rnlp_ctxt() entry_map = RNLP_ctxt.get_entry_map() rootdir = OUT_RAW_DIR from org.apache.lucene.index import DirectoryReader from org.apache.lucene.search import IndexSearcher from org.apache.lucene.queryparser.classic import QueryParser from org.apache.lucene.analysis.standard import StandardAnalyzer from org.apache.lucene.store import FSDirectory from org.apache.lucene.util import Version from java.io import File reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR))) searcher = IndexSearcher(reader) analyzer = StandardAnalyzer(Version.LUCENE_40) field = 'contents' parser = QueryParser(Version.LUCENE_40, field, analyzer); query = parser.parse(qry); print 'Searching for:', query.toString(field) raw_results = searcher.search(query, limit) hits = raw_results.scoreDocs numTotalHits = raw_results.totalHits print numTotalHits, 'total matching documents' print rootdir results = {} for hit in hits: doc = searcher.doc(hit.doc) entry_id = doc.get('entry_id') entry = entry_map.get(entry_id) short_title = entry['short_title'] year = entry['publ_year'] fname = short_title + CONTENT_EXT results[fname] = year;
def do_search(qry, limit): helper.initPyLucene() RNLP_ctxt = _get_rnlp_ctxt() entry_map = RNLP_ctxt.get_entry_map() from org.apache.lucene.index import DirectoryReader from org.apache.lucene.search import IndexSearcher from org.apache.lucene.queryparser.classic import QueryParser from org.apache.lucene.analysis.standard import StandardAnalyzer from org.apache.lucene.store import FSDirectory from org.apache.lucene.util import Version from java.io import File print os.path.abspath(os.path.pardir) reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR))) searcher = IndexSearcher(reader) analyzer = StandardAnalyzer(Version.LUCENE_40) field = 'contents' parser = QueryParser(Version.LUCENE_40, field, analyzer); query = parser.parse(qry); print 'Searching for:', query.toString(field) raw_results = searcher.search(query, limit) hits = raw_results.scoreDocs numTotalHits = raw_results.totalHits print numTotalHits, 'total matching documents' results = [] for hit in hits: doc = searcher.doc(hit.doc); entry_id = doc.get('entry_id') entry = entry_map.get(entry_id) #print 'entry:', entry score = hit.score #print 'Hit:', entry['short_title'], score results.append((score, doc, entry)) return results
def search(termo, **args):
    """Search the politicians index for *termo* plus any extra keyword
    argument values, OR-combined.

    Returns a list of dicts, one per hit, mapping stored field names to
    their string values.
    """
    # Index directory from $MANDEX, falling back to the '3iteracao' run.
    indexDir = os.environ.get('MANDEX') or '3iteracao'
    fsDir = SimpleFSDirectory(File(indexDir))
    searcher = IndexSearcher(DirectoryReader.open(fsDir))

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    # NOTE(review): 'field' is not defined in this function — presumably
    # a module-level default field name; confirm.
    parser = QueryParser(Version.LUCENE_CURRENT, field, analyzer)
    parser.setDefaultOperator(QueryParser.Operator.OR)
    # NOTE(review): no separator between termo and the first extra value
    # (termo + ' '.join(...)) — looks like a missing ' '; confirm
    # intended query shape before changing. dict value order is also
    # arbitrary in Python 2.
    query = parser.parse(termo + ' '.join(args.values()))
    start = datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.now() - start

    politicos = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        # All stored fields of the hit, as {name: stringValue}.
        table = dict((field.name(), field.stringValue()) for field in doc.getFields())
        politicos.append(table)
    return politicos
def printDoc(indexDir,scoreDocs,args,stats,duration): """ formato: IES , Nota Doutorado , Nota Mestrado , UF , Nota mestrado Profissional , Programa """ format =" #ies , #d , #m , #uf , #f , #program , #professor " #print indexDir class CustomTemplate(Template): delimiter = '#' template = CustomTemplate(format) fsDir = SimpleFSDirectory(File(indexDir)) #print fsDir #Criando buscador baseado no diretorio dos indices passados pelo usuario searcher = IndexSearcher(DirectoryReader.open(fsDir)) #imprimindo a quantidade e os documentos que tem a consulta feita if stats: #Juntando parametros passados com o valor do mesmo command = ' '.join(args) #print command print >>sys.stderr, "Encontrado %d documento(s) (em %s) com consulta igual a '%s':" %(len(scoreDocs), duration,command) newTable = [] for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) table = dict((field.name(), field.stringValue()) for field in doc.getFields()) newTable.append(template.substitute(table).split(",")) if newTable : headers = ["IES"," Nota Doutorado", " Nota Mestrado "," UF "," Nota mestrado Profissional "," Programa ","Professor"] print tabulate(newTable,headers,tablefmt="grid")