def luceneRetriver(query):
    lucene.initVM()
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    my_query = QueryParser(Version.LUCENE_30, "text",
                           lucene_analyzer).parse(query)
    MAX = 1000
    total_hits = lucene_searcher.search(my_query, MAX)

    print "Hits: ", total_hits.totalHits
    for hit in total_hits.scoreDocs:
        print "Hit Score: ", hit.score, "Hit Doc:", hit.doc, "Hit String:", hit.toString()
        doc = lucene_searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
def retrieve(self, query, max_res=10):
    lucene.initVM()
    inDir = SimpleFSDirectory(File(self.INDEX_DIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(inDir)
    my_query = QueryParser(Version.LUCENE_30, 'content',
                           lucene_analyzer).parse(query)
    MAX = max_res
    total_hits = lucene_searcher.search(my_query, MAX)

    res_head = '{"query":"' + query + '","results":['
    res_tail = ']}'
    result = res_head

    hits = total_hits.totalHits
    if hits > 0:
        res_body = ''
        it = 0
        for hit in total_hits.scoreDocs:
            it += 1
            doc = lucene_searcher.doc(hit.doc)
            res_body += '{"rank":' + \
                        str(it) + \
                        ',"score":"' + \
                        str(hit.score) + \
                        '","title":"' + \
                        doc.get('title').encode('utf-8') + \
                        '","id":"' + \
                        doc.get('id').encode('utf-8') + \
                        '"}'
            if it < hits:
                res_body += ','
        result += res_body

    result += res_tail
    return result
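# Note: concatenating JSON by hand, as in retrieve() above, breaks as soon as a
# title contains a quote or backslash. A minimal alternative sketch (not part of
# the original code; it assumes the same stored 'title' and 'id' fields and the
# hypothetical method name retrieve_as_json) builds plain Python objects and
# lets the standard json module handle the escaping:
import json

def retrieve_as_json(self, query, max_res=10):
    # Same search as retrieve(); only the result serialization differs.
    lucene.initVM()
    searcher = IndexSearcher(SimpleFSDirectory(File(self.INDEX_DIR)))
    parsed = QueryParser(Version.LUCENE_30, 'content',
                         StandardAnalyzer(Version.LUCENE_30)).parse(query)
    hits = searcher.search(parsed, max_res)
    results = []
    for rank, hit in enumerate(hits.scoreDocs, 1):
        doc = searcher.doc(hit.doc)
        results.append({'rank': rank,
                        'score': str(hit.score),
                        'title': doc.get('title'),   # assumed stored field
                        'id': doc.get('id')})        # assumed stored field
    return json.dumps({'query': query, 'results': results})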
def run(writer, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return

        print "Searching for:", command
        reader = writer.getReader()  # avoid shadowing the IndexReader class name
        searcher = IndexSearcher(reader)
        # query = QueryParser(Version.LUCENE_CURRENT, "hashtag", analyzer).parse(command)
        # scoreDocs = searcher.search(query, 50).scoreDocs
        wildquery = command + "*"
        term = Term("hashtag", wildquery)
        query = WildcardQuery(term)
        scoreDocs = searcher.search(query, 5).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            # use float division so the length ratio is not truncated to 0
            score = (float(len(command)) / len(doc.get("hashtag"))) * scoreDoc.score
            print 'tweet:', doc.get("contents")
            print 'user_name:', doc.get("user_name")
            print 'when:', doc.get("creation_date")
def displayResults(self, query, sort):
    searcher = IndexSearcher(self.directory, True)

    fillFields = False
    computeMaxScore = False
    docsScoredInOrder = False
    computeScores = True

    collector = TopFieldCollector.create(sort, 20,
                                         fillFields,
                                         computeScores,
                                         computeMaxScore,
                                         docsScoredInOrder)
    searcher.search(query, None, collector)
    scoreDocs = collector.topDocs().scoreDocs

    print "\nResults for:", query, "sorted by", sort
    print "Title".rjust(30), "pubmonth".rjust(10), \
          "id".center(4), "score".center(15)

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        title = doc["title"]
        if len(title) > 30:
            title = title[:30]
        print title.encode('ascii', 'replace').rjust(30), \
              doc["pubmonth"].rjust(10), \
              str(scoreDoc.doc).center(4), \
              ("%06f" % (scoreDoc.score)).rjust(12)
        print "  ", doc["category"]
        # print searcher.explain(query, scoreDoc.doc)

    searcher.close()
def post(self):
    q = self.get_argument("query")
    # self.write(key)
    # def query(query):
    #     query = self.get_argument("q")
    lucene.initVM()
    indexDir = "index"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 10
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    items = []
    rQ = []
    # for key, value in doc_urls.iteritems():
    #     print (key, value)

    for hit in hits.scoreDocs:
        # items.append({'score': hit.score, 'doc': hit.doc, 'blah': hit.toString(), 'url': doc_urls[str(hit.doc)]})
        print hit.score, hit.doc, hit.toString()
        print(len(doc_urls))
        items.append(doc_urls[str(hit.doc)])
        doc = searcher.doc(hit.doc)
        print(hit.doc)

    self.render("index.html", title="Results", items=items, query=q)
def get_indexed_file_details(ts_results, lucene_index_dir):
    '''
    This function gets each file's details from the lucene index.

    Arguments:
        ts_results - topic search results; each item contains
                     [file id, root, file name, similarity score]
        lucene_index_dir - lucene index directory

    Returns:
        file details in a list
    '''
    store = SimpleFSDirectory(File(lucene_index_dir))
    searcher = IndexSearcher(store, True)

    rows = []
    for rs in ts_results:
        doc = searcher.doc(rs[0])
        table = dict((field.name(), field.stringValue())
                     for field in doc.getFields())
        row = []
        metadata = MetadataType._types
        for field in metadata:
            if table.get(field, 'empty') != 'empty':
                row.append(table.get(field, 'empty'))
            else:
                row.append('')
        row.append(str(table.get(MetadataType.FILE_ID, 'empty')))
        row.append(str(rs[3]))  # similarity score
        rows.append(row)

    return rows
def retrieveDocs(q):
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 1000
    hits = searcher.search(query, MAX)

    nonDiverse = []
    docsToScores = {}
    # create a list of html files with relevant websites
    rQ = []

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
        # print(new_urls[str(hit.doc)])
        result = str(hit.score) + " " + str(hit.doc) + " " + hit.toString()
        if len(nonDiverse) < 10:
            nonDiverse.append(new_urls[str(hit.doc)])
        # find the document that corresponds to the html website and append to a list for min distance
        website = new_urls[str(hit.doc)]
        # html_files numbers of the hit websites added to rQ
        rQ.append(inv_map[website])
        docsToScores[int(inv_map[website])] = hit.score
        print(inv_map[website])

    return docsToScores, rQ, nonDiverse
def document(self, docId, max_res=1):
    lucene.initVM()
    inDir = SimpleFSDirectory(File(self.INDEX_DIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(inDir)
    my_query = QueryParser(Version.LUCENE_30, 'id',
                           lucene_analyzer).parse(docId)
    MAX = max_res
    total_hits = lucene_searcher.search(my_query, MAX)

    result = '{'
    hits = total_hits.totalHits
    if hits == 1:
        for hit in total_hits.scoreDocs:
            doc = lucene_searcher.doc(hit.doc)
            result += '"id":"' + \
                      doc.get('id') + \
                      '","title":"' + \
                      doc.get('title') + \
                      '","abstract":"' + \
                      doc.get('abstract') + \
                      '","keyword":"' + \
                      doc.get('keyword') + \
                      '","content":"' + \
                      doc.get('content') + \
                      '","authors":"' + \
                      doc.get('authors') + \
                      '"'
    result += '}'
    return result
def get_doc_details(doc_id, lucene_index_dir):
    '''
    This function gets a file's details from the lucene index.

    Arguments:
        doc_id - file id
        lucene_index_dir - lucene index directory

    Returns:
        file details as a list
    '''
    store = SimpleFSDirectory(File(lucene_index_dir))
    searcher = IndexSearcher(store, True)
    doc = searcher.doc(doc_id)
    table = dict((field.name(), field.stringValue())
                 for field in doc.getFields())
    row = []
    metadata = MetadataType._types
    for field in metadata:
        if table.get(field, 'empty') != 'empty':
            row.append(table.get(field, 'empty'))
        else:
            row.append('')
    row.append(str(table.get(MetadataType.FILE_ID, 'empty')))
    return row
class OccuredCandidates:
    indexDir = 'data/index'
    max_candidates = 30

    def __init__(self):
        lucene.initVM()
        self._lversion = Version.LUCENE_30
        self._analyzer = EnglishAnalyzer(self._lversion)
        self._searcher = IndexSearcher(SimpleFSDirectory(File(self.indexDir)))
        self._translation = loadTranslation()
        self._links = loadLinks()

    def find(self, phrase):
        phrase = phrase.lower().encode('utf8')
        query = ' '.join(['+' + word for word in phrase.split(' ')])
        query = QueryParser(self._lversion, 'contents',
                            self._analyzer).parse(query)
        hits = self._searcher.search(query, self.max_candidates)
        # if not hits.totalHits: print "%d documents for '%s'" % (hits.totalHits, str(query))  # potential bug

        # todo: put article_id in the lucene index instead of translating the document title
        links = {}
        for hit in hits.scoreDocs:
            title = quote(self._searcher.doc(hit.doc).get("title")
                          .encode('utf-8')
                          .replace(' ', '_')).replace('%28', '(').replace('%29', ')')
            if title in self._translation:
                links[self._translation[title]] = hit.score
            # else: print title  # potential bug
        return self._links[phrase].get(-1, 0), links

    def clear_links(self, annotations):
        return filter(lambda annotation: annotation['links'] and
                      max(annotation['links'].values()) > 1, annotations)
def lucene_search(index_dir, limit, query_text):
    '''
    lucene_search: searches a built index and returns up to `limit` responses.

    Arguments: input index folder, limit on the number of results returned,
               query (as a string)
    Returns: paths of responsive files as a list
    '''
    logging.basicConfig(filename=os.path.join(index_dir, "lucene_search.log"))
    logger.info("Initializing search....")
    lucene.initVM()
    logger.info("Reading index from " + index_dir)
    index = SimpleFSDirectory(File(index_dir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)  # Lucene version used to generate the index
    searcher = IndexSearcher(index)

    logger.info("Parsing query: " + query_text)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query_text)
    hits = searcher.search(query, limit)
    logger.info("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))

    hit_paths = []
    for hit in hits.scoreDocs:
        # The following also exposes the score of each responsive document and
        # the matched content index:
        # print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        hit_paths.append(doc.get("path"))

    return hit_paths
def query(self, title):
    self._th.attachCurrentThread()
    searcher = IndexSearcher(self._dir)
    query = QueryParser(Version.LUCENE_30, "title",
                        self._analyzer).parse(title)
    total_hits = searcher.search(query, 10)
    for hit in total_hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        return doc.get("title") + "\n" + doc.get("content") + "--------------------------------"
    return "None"
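# Note: query() above calls attachCurrentThread() before touching Lucene. With
# PyLucene, only the thread that called initVM() is attached to the JVM
# automatically; any other thread must attach itself first. A minimal sketch of
# that pattern (not from the original code; searcher and analyzer are assumed
# to be set up elsewhere, as in the surrounding snippets):
import threading
import lucene

vm_env = lucene.initVM()  # call once, in the main thread

def search_in_thread(searcher, analyzer, text):
    # Required before any Lucene call made from this worker thread.
    vm_env.attachCurrentThread()
    q = QueryParser(Version.LUCENE_30, "title", analyzer).parse(text)
    print searcher.search(q, 10).totalHits

# t = threading.Thread(target=search_in_thread, args=(searcher, analyzer, "test"))
# t.start(); t.join()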
def search(command=command1):
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(command)
    res = searcher.search(query, 1000000)
    print 'Total hits:', res.totalHits
    # return searcher, res
    return [searcher.doc(doc.doc) for doc in res.scoreDocs[:20]]
def find(self, query, indir):
    lucene.initVM()
    INDEXDIR = indir
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    my_query = QueryParser(Version.LUCENE_30, "<default field>",
                           lucene_analyzer).parse("text:" + query + " OR title:" + query)
    MAX = 1000
    total_hits = lucene_searcher.search(my_query, MAX)

    print "\nHits: ", total_hits.totalHits, "\n"
    for hit in total_hits.scoreDocs:
        print "Hit Score:", "%.4f" % hit.score, \
              "Department:", lucene_searcher.doc(hit.doc).get("department").encode("utf-8"), \
              "Title:", lucene_searcher.doc(hit.doc).get("title").encode("utf-8")
        print lucene_searcher.doc(hit.doc).get("url").encode("utf-8"), '\n'
def getCrowds(self, query, field=CrowdFields.text):
    searcher = IndexSearcher(self.index, True)
    q = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(query)
    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(q, collector)
    hits = collector.topDocs().scoreDocs
    return [searcher.doc(scoreDoc.doc).get(CrowdFields.id)
            for scoreDoc in hits]
class LuceneSearch(object):

    def __init__(self):
        STORE_DIR = "index"
        initVM()
        print 'lucene', VERSION
        self.directory = SimpleFSDirectory(File(STORE_DIR))
        print self.directory
        self.searcher = IndexSearcher(self.directory, True)
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    def close(self):
        self.searcher.close()

    def raw_search(self, query_string):
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            self.analyzer).parse(query_string)
        scoreDocs = self.searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        matches = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            # print 'doc matched = ', dir(doc)
            contents = LuceneDoc.load(doc.get('name'))
            matches.append({'contents': contents, 'doc': doc})
        return matches

    def search(self, query):
        matches = self.raw_search(query)
        results = ''
        if len(matches) > 0:
            results += str(len(matches)) + " results <br/>"
            for match in matches:
                results += '<a href=' + str(match['contents']['dealUrl']) + '>' + str(match['contents']['merchant']) + '</a><br />'
                results += '<p>' + str(match['contents']['shortAnnouncementTitle']) + ',' + str(match['contents']['redemptionLocation']) + '</p><br/>'
        else:
            results = "0 results <br/>"
        return results

    def cli_search(self):
        while True:
            print
            print "Hit enter with no input to quit."
            command = raw_input("Query:")
            if command == '':
                return
            matches = self.raw_search(command)
            print
            print "Searching for:", command
            for match in matches:
                print match['contents']['dealUrl']
                print match['contents']['merchant'], ',', match['contents']['redemptionLocation'], ', ', match['contents']['div']
                print match['contents']['shortAnnouncementTitle']
                print '-' * 80
def pesquisar_com_lucene():
    initVM()
    # print 'lucene', VERSION

    # Get a handle to the index directory
    directory = SimpleFSDirectory(File(STORE_DIR))

    # Creates a reader over the provided index.
    ireader = IndexReader.open(directory, True)

    # Implements search over a single IndexReader.
    # Use a single instance and reuse it across queries
    # to improve performance.
    searcher = IndexSearcher(ireader)

    # Get the analyzer
    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)

    for query in querys:
        query_number = query.query_number

        # Constructs a query parser. We specify what field to search into.
        query.query_text = query.query_text.replace('?', '')
        query.query_text = query.query_text.replace('*', '')
        queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

        # Create the query
        query = queryParser.parse(query.query_text)

        # Run the query and get the top 50,000 results
        topDocs = searcher.search(query, 50000)

        # Get the top hits
        scoreDocs = topDocs.scoreDocs
        r = resultado_query(query_number, scoreDocs)
        resultados.append(r)
        # print "%s total matching documents." % len(scoreDocs)
        # for scoreDoc in scoreDocs:
        #     doc = searcher.doc(scoreDoc.doc)
        #     print doc.get(FIELD_PATH)

    with open('resultados_da_busca/resultados.csv', 'w') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=';', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        for row in resultados:
            resultados_da_row = []
            i = 1
            for resultado_da_query in row.query_results:
                doc = searcher.doc(resultado_da_query.doc)
                resultados_da_row.append((i, int(doc.get(FIELD_PATH))))
                i = i + 1
            spamwriter.writerow([row.query_number, resultados_da_row])
def search_image(command):
    if command == ' ':
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()

    STORE_DIR = "index_img"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    # Split the command into per-field sub-queries, segmenting CJK text with jieba.
    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = command_dict.get(opt, '') + ' ' + " ".join(seg_list)

    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            t = Term(k, '*' + v)
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        ts = analyzer.tokenStream(doc.get("contents"), StringReader(text))
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)

    searcher.close()
    return Docs
def search_lucene_index(search_params, index_dir, index_metadata, records_per_page):
    """
    Uses the provided query term to search the disease ontology lucene index.
    """
    results = []

    index_dir = SimpleFSDirectory(File(index_dir))
    analyzer = build_perfield_analyzer(index_metadata)
    searcher = IndexSearcher(index_dir)
    index_fields = index_metadata.keys()

    # Since we are paging results we want to know what page we are on
    page = (int(search_params.get('page', 1))) - 1

    # Doing something pretty hacky here since we are trying to move from 0-based
    # to 1-based indexing to match our pagination display
    offset = int(page) * records_per_page

    # If we are executing an advanced search we build a BooleanQuery in parts,
    # as opposed to the single MultiFieldQueryParser used for a basic search
    query = None
    if search_params.get('adv_search') == "True":
        query = build_advanced_search_query(search_params,
                                            search_params.get('operator'),
                                            analyzer)
    else:
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, index_fields, analyzer)
        query = MultiFieldQueryParser.parse(
            parser, process_query_param((search_params.get('q'))))

    # Parse through our hits
    hits = searcher.search(query, 10000)
    total_hits = hits.totalHits
    count = min(hits.totalHits - offset, records_per_page)

    for i in xrange(0, count):
        score_doc = hits.scoreDocs[offset + i]
        doc = searcher.doc(score_doc.doc)
        term_id = doc.get('term id')
        name = doc.get('name')
        explain = searcher.explain(query, score_doc.doc)
        match_fields = get_field_matches(explain.toString(), index_fields)

        results.append((term_id, name, list(match_fields)))

    searcher.close()
    return (results, total_hits)
def retrieve_document_details(docid, index_dir):
    '''
    Retrieves the single document associated with the docid passed as a
    parameter. The document is looked up in the directory referred to by
    index_dir. To read a specific field's value, call document.get(<field_name>)
    on the returned document, where <field_name> is a string.
    '''
    store = SimpleFSDirectory(File(index_dir))
    searcher = IndexSearcher(store, True)
    document = searcher.doc(int(docid))
    return document
def testQueryParser(self):
    searcher = IndexSearcher(self.directory, True)

    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        SimpleAnalyzer()).parse("+JUNIT +ANT -MOCK")
    scoreDocs = searcher.search(query, 50).scoreDocs
    self.assertEqual(1, len(scoreDocs))
    d = searcher.doc(scoreDocs[0].doc)
    self.assertEqual("Java Development with Ant", d.get("title"))

    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        SimpleAnalyzer()).parse("mock OR junit")
    scoreDocs = searcher.search(query, 50).scoreDocs
    self.assertEqual(2, len(scoreDocs), "JDwA and JIA")
def testSecurityFilter(self):
    query = TermQuery(Term("keywords", "info"))

    searcher = IndexSearcher(self.directory, True)
    topDocs = searcher.search(query, 50)
    self.assertEqual(2, topDocs.totalHits, "Both documents match")

    jakeFilter = QueryWrapperFilter(TermQuery(Term("owner", "jake")))

    scoreDocs = searcher.search(query, jakeFilter, 50).scoreDocs
    self.assertEqual(1, len(scoreDocs))
    self.assertEqual("jakes sensitive info",
                     searcher.doc(scoreDocs[0].doc).get("keywords"),
                     "elwood is safe")
def search(cls, indexDir, q):
    fsDir = SimpleFSDirectory(File(indexDir))
    searcher = IndexSearcher(fsDir, True)

    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        StandardAnalyzer(Version.LUCENE_CURRENT)).parse(q)
    start = time()
    hits = searcher.search(query, 50).scoreDocs
    duration = timedelta(seconds=time() - start)

    print "Found %d document(s) (in %s) that matched query '%s':" % (len(hits), duration, q)
    for hit in hits:
        doc = searcher.doc(hit.doc)
        print 'path:', doc.get("path")
def luceneRetriver(queryString):
    lucene.initVM()

    # location of the index
    indexDir = "C:\\index"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(queryString)

    MAX = 1000  # maximum number of records to return
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print doc.get("path").encode("utf-8")
def query(query):
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query)
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString(), doc_urls[str(hit.doc)]
        doc = searcher.doc(hit.doc)
def boolean_search_lucene_index(index_dir, query_text, limit):
    '''
    This function searches a boolean query in the learned lucene index.

    Arguments:
        index_dir - the lucene index directory
        query_text - the query text, which follows
                     http://lucene.apache.org/core/3_6_0/queryparsersyntax.html
        limit - the number of records to be retrieved

    Return:
        rows - the returned document details
    '''
    DEFAULT_QUERY_FIELD = 'all'
    store = SimpleFSDirectory(File(index_dir))
    searcher = IndexSearcher(store, True)
    parser = QueryParser(Version.LUCENE_CURRENT, DEFAULT_QUERY_FIELD, STD_ANALYZER)
    query = parser.parse(query_text)

    start = datetime.datetime.now()
    scoreDocs = searcher.search(query, limit).scoreDocs
    duration = datetime.datetime.now() - start
    # print "Lucene Search: Found %d document(s) (in %s) that matched query '%s':" % (len(scoreDocs), duration, query)

    rows = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        table = dict((field.name(), field.stringValue())
                     for field in doc.getFields())
        row = []
        metadata = MetadataType._types
        for field in metadata:
            if table.get(field, 'empty') != 'empty':
                row.append(table.get(field, 'empty'))
            else:
                row.append('')
        row.append(str(table.get(MetadataType.FILE_ID, 'empty')))  # the unique file id of a file
        row.append(scoreDoc.score)
        rows.append(row)

    return rows
def delete_old(self, index):
    existing_ids = set([book.id for book in Book.objects.all()])

    reader = IndexReader.open(index.index, False)
    searcher = IndexSearcher(reader)
    try:
        num = searcher.docFreq(Term('is_book', 'true'))
        docs = searcher.search(Search.make_term_query(['true'], 'is_book'), num)
        for result in docs.scoreDocs:
            stored = searcher.doc(result.doc)
            book_id = int(stored.get('book_id'))
            if not book_id in existing_ids:
                print "book id %d doesn't exist." % book_id
                index.remove_book(book_id)
    finally:
        searcher.close()
        reader.close()
class WordNetSynonymEngine(object):

    def __init__(self, indexDir):
        self.directory = RAMDirectory(SimpleFSDirectory(File(indexDir)))
        self.searcher = IndexSearcher(self.directory)

    def getSynonyms(self, word):
        synList = []

        topDocs = self.searcher.search(TermQuery(Term("word", word)), 50)
        for scoreDoc in topDocs.scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            for value in doc.getValues("syn"):
                synList.append(value)

        return synList
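# Usage sketch for WordNetSynonymEngine (not part of the original code; the
# index path and lookup word are placeholders, and it assumes an index with
# stored 'word' and 'syn' fields, which is what the class above expects):
engine = WordNetSynonymEngine("wordnet-index")
for syn in engine.getSynonyms("quick"):
    print syn  # synonyms stored for "quick", depending on the index contents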
def testFuzzy(self):
    self.indexSingleFieldDocs([Field("contents", "fuzzy",
                                     Field.Store.YES, Field.Index.ANALYZED),
                               Field("contents", "wuzzy",
                                     Field.Store.YES, Field.Index.ANALYZED)])

    searcher = IndexSearcher(self.directory)
    query = FuzzyQuery(Term("contents", "wuzza"))
    scoreDocs = searcher.search(query, 50).scoreDocs
    self.assertEqual(2, len(scoreDocs), "both close enough")
    self.assert_(scoreDocs[0].score != scoreDocs[1].score,
                 "wuzzy closer than fuzzy")
    self.assertEqual("wuzzy",
                     searcher.doc(scoreDocs[0].doc).get("contents"),
                     "wuzza bear")
def testHits(self):
    searcher = IndexSearcher(self.directory, True)
    query = TermQuery(Term("title", "action"))
    scoreDocs = searcher.search(query, 50).scoreDocs

    scorer = QueryScorer(query)
    highlighter = Highlighter(scorer)

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        title = doc["title"]
        stream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream(
            "title", StringReader(title))
        fragment = highlighter.getBestFragment(stream, title)
        print fragment
def doSearch(self, searchString, fieldToSearch, luceneDir):
    searchResult = []
    store = SimpleFSDirectory(File(luceneDir))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    searcher = IndexSearcher(store)
    query = QueryParser(Version.LUCENE_CURRENT, fieldToSearch, analyzer).parse(searchString)
    hits = searcher.search(query, self.MAX)

    print "Found %d documents that matched the query '%s'" % (hits.totalHits, searchString)

    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        # docdict['score'] = hit.score
        # docdict['docid'] = hit.doc
        # docdict['content'] = doc.get("contents").encode("utf-8")
        searchResult.append([doc.get("title").encode("utf-8"),
                             doc.get("contents").encode("utf-8")])

    searcher.close()
    return searchResult
def run(writer, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return

        print "Searching for:", command
        reader = writer.getReader()  # avoid shadowing the IndexReader class name
        searcher = IndexSearcher(reader)
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), 'name:', doc.get("name")
def retrieve(string, tweetID):
    global eventNum
    global eventDict
    global eventList

    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    try:
        searcher = IndexSearcher(dir)
    except lucene.JavaError:
        # Opening the searcher failed (e.g. the index does not exist yet):
        # start a new event for this tweet.
        # print 'Inside First Except'
        # index(string)
        eventDict[tweetID] = eventNum
        eventNum = eventNum + 1
        return

    # searcher = IndexSearcher(dir)
    try:
        query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(string)
        # e = sys.exc_info()[0]
        # print e
        MAX = 2
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        if hits.totalHits > 0:
            # Assign the tweet to the same event as its best-matching document.
            print tweetID, hits.scoreDocs[0].doc
            eventDict[tweetID] = eventDict[hits.scoreDocs[0].doc]
        elif hits.totalHits == 0:
            eventDict[tweetID] = eventNum
            eventNum = eventNum + 1
        for hit in hits.scoreDocs:
            # print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            # print doc.get("text").encode("utf-8")
    except lucene.JavaError:
        pass
def search_lucene_index(index_dir, query_model, limit):
    '''
    This function searches a query model (query terms along with their
    metadata) in the learned lucene index.

    Arguments:
        index_dir - the lucene index directory
        query_model - the query model (contains query terms, metadata,
                      and conjunctions)
        limit - the number of records to be retrieved

    Return:
        rows - the returned document details
    '''
    store = SimpleFSDirectory(File(index_dir))
    searcher = IndexSearcher(store, True)
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT,
                                   query_model[1], STD_ANALYZER)
    query = parser.parse(Version.LUCENE_CURRENT, query_model[0],
                         query_model[1], query_model[2], STD_ANALYZER)
    scoreDocs = searcher.search(query, limit).scoreDocs

    print "Found %d document(s) that matched query '%s':" % (len(scoreDocs), query)

    rows = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        table = dict((field.name(), field.stringValue())
                     for field in doc.getFields())
        row = []
        metadata = MetadataType._types
        for field in metadata:
            if table.get(field, 'empty') != 'empty':
                row.append(table.get(field, 'empty'))
            else:
                row.append('')
        row.append(str(table.get(MetadataType.FILE_ID, 'empty')))  # the unique file id of a file
        row.append(scoreDoc.score)
        rows.append(row)

    return rows
def luceneRetriver(query):
    # print ('-------------Searching-------------')
    # print (query)
    lucene.initVM()
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    my_query = QueryParser(Version.LUCENE_30, 'text',
                           lucene_analyzer).parse(query)
    MAX = 1000

    # list holding the titles of the returned documents
    title_list = []

    total_hits = lucene_searcher.search(my_query, MAX)
    # print "Hits: ", total_hits.totalHits

    for hit in total_hits.scoreDocs[:10]:
        # print "Hit Score: ", hit.score, "Hit Doc:", hit.doc, "HitString:", hit.toString()
        doc = lucene_searcher.doc(hit.doc)
        # print doc.get("title").encode("utf-8").lstrip(str(TXTDIR))
        # print doc.get("text").encode("utf-8")
        # print ('\n')
        title_list.append({
            doc.get("title").encode("utf-8").lstrip(str(TXTDIR)): round(hit.score, 5)
        }.copy())

    return title_list

# print ('Query: 八卦')
# print ('Results:')
# print ('\n')
# luceneRetriver("下列 关于 中国 八卦 不正确 人类 历史 东西方 平等 交流 见证")
def test_search(index_dir):
    '''
    The test function to test the created index.
    '''
    store = SimpleFSDirectory(File(index_dir))
    searcher = IndexSearcher(store, True)
    parser = QueryParser(Version.LUCENE_CURRENT, "keywords", STD_ANALYZER)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    query = parser.parse('email_subject:Training')

    start = datetime.datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.datetime.now() - start

    print "Found %d document(s) (in %s) that matched query '%s':" % (len(scoreDocs), duration, query)
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print scoreDoc.score
        table = dict((field.name(), field.stringValue())
                     for field in doc.getFields())
        print table
def main(cls, argv):
    if len(argv) != 3:
        print "Usage: Explainer <index dir> <query>"
    else:
        indexDir = argv[1]
        queryExpression = argv[2]

        directory = SimpleFSDirectory(File(indexDir))
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            SimpleAnalyzer()).parse(queryExpression)

        print "Query:", queryExpression

        searcher = IndexSearcher(directory)
        scoreDocs = searcher.search(query, 50).scoreDocs

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            explanation = searcher.explain(query, scoreDoc.doc)

            print "----------"
            print doc["title"].encode('utf-8')
            print explanation
    writer.addDocument(doc)  # fragment: writer and doc come from the indexing loop earlier in the script

print("Indexed lines from stdin (%d documents in index)" % (writer.numDocs()))
print("About to optimize index of %d documents..." % writer.numDocs())
writer.optimize()
print("...done optimizing index of %d documents" % writer.numDocs())
print("Closing index of %d documents..." % writer.numDocs())
print("...done closing index of %d documents" % writer.numDocs())
writer.close()

# RETRIEVAL
dir = SimpleFSDirectory(File(fullIndexDir))
analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
searcher = IndexSearcher(dir)

query = QueryParser(lucene.Version.LUCENE_CURRENT, "text",
                    analyzer).parse(u"¿Dónde está La Mancha?")
MAX = 1000
hits = searcher.search(query, MAX)

print("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))
for hit in hits.scoreDocs:
    print(hit.score, hit.doc, hit.toString())
    doc = searcher.doc(hit.doc)
    print(doc.get("text").encode("utf-8"))
    print(doc.get("metadata").encode("utf-8"))
# Get the analyzer
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

# Constructs a query parser.
queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

# Create a query
query = queryParser.parse(QUERY_STRING)

topDocs = searcher.search(query, 50)

# Get top hits
scoreDocs = topDocs.scoreDocs
print "%s total matching documents." % len(scoreDocs)

HighlightFormatter = SimpleHTMLFormatter()
query_score = QueryScorer(query)
highlighter = Highlighter(HighlightFormatter, query_score)

# Set the fragment size. We break the text into fragments of 64 characters.
fragmenter = SimpleSpanFragmenter(query_score, 64)
highlighter.setTextFragmenter(fragmenter)

for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    text = doc.get(FIELD_CONTENTS)
    ts = analyzer.tokenStream(FIELD_CONTENTS, StringReader(text))
    print doc.get(FIELD_PATH)
    print highlighter.getBestFragments(ts, text, 3, "...")
    print ""
def main(indexDir, inputDir):
    """Creates a SQLite database with news linked to other news by at least one
    term, backed by a Lucene index."""
    lucene.initVM()

    # Open the index
    logger.info("Opening Lucene index [%s]..." % indexDir)
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = KeywordAnalyzer(Version.LUCENE_CURRENT)
    searcher = IndexSearcher(dir)

    # Search documents
    onlyfiles = [f for f in listdir(inputDir)
                 if isfile(join(inputDir, f)) and f.endswith('.json')]
    for f in onlyfiles:
        json_data = open(inputDir + '/' + f)
        data = json.load(json_data)

        # The results collected after comparison
        results = list()

        journal_code = f.split('.')[0]

        for entry in data:
            url = entry['url']
            date = entry['date']
            title = entry['title']

            logger.debug("Processing URL [%s] date [%s] - [%s]" % (url, date, title))

            tt = nltk.word_tokenize(title)
            tokens = []
            for t in tt:
                tokens.append(t.lower())

            entry['similars'] = list()

            for token in tokens:
                q = 'title: "%s" AND date: "%s" AND NOT journal: "%s" AND NOT url: "%s"' % (
                    token, date, journal_code, url)
                query = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(q)
                hits = searcher.search(query, MAX_HITS)

                logger.debug("Found %d document(s) that matched query '%s':" % (hits.totalHits, q))

                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    logger.debug(doc)
                    entry['similars'].append({'token': token,
                                              'url': doc.get('url'),
                                              'title': doc.get('title')})

            results.append(entry)

        json_data.close()

        print """<html>
<body>
<table><thead>
<tr>
<th>Jornal</th><th>Data</th><th>Título</th><th>URL</th><th>Notícias semelhantes</th>
</tr>
</thead>
<tbody>
"""
        for entry in results:
            similars = entry['similars']
            similars_text = '<ul>'
            for s in similars:
                similars_text += '<li>[%s] [%s] [%s]</li>' % (
                    s['token'].encode('iso-8859-1', 'ignore'),
                    s['title'].encode('iso-8859-1', 'ignore'),
                    s['url'].encode('iso-8859-1', 'ignore'))
            similars_text += '</ul>'
            print """<tr>
<td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td>
</tr>
""" % (journal_code,
       entry['date'].encode('iso-8859-1', 'ignore'),
       entry['title'].encode('iso-8859-1', 'ignore'),
       entry['url'].encode('iso-8859-1', 'ignore'),
       similars_text)
class DistanceSortingTest(TestCase):

    def setUp(self):
        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        self.addPoint(writer, "El Charro", "restaurant", 1, 2)
        self.addPoint(writer, "Cafe Poca Cosa", "restaurant", 5, 9)
        self.addPoint(writer, "Los Betos", "restaurant", 9, 6)
        self.addPoint(writer, "Nico's Taco Shop", "restaurant", 3, 8)
        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
        self.query = TermQuery(Term("type", "restaurant"))

    def addPoint(self, writer, name, type, x, y):
        doc = Document()
        doc.add(Field("name", name, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("type", type, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("x", str(x), Field.Store.YES,
                      Field.Index.NOT_ANALYZED_NO_NORMS))
        doc.add(Field("y", str(y), Field.Store.YES,
                      Field.Index.NOT_ANALYZED_NO_NORMS))
        writer.addDocument(doc)

    def testNearestRestaurantToHome(self):
        sort = Sort(SortField("location", DistanceComparatorSource(0, 0)))
        scoreDocs = self.searcher.search(self.query, None, 50, sort).scoreDocs

        self.assertEqual("El Charro",
                         self.searcher.doc(scoreDocs[0].doc).get("name"),
                         "closest")
        self.assertEqual("Los Betos",
                         self.searcher.doc(scoreDocs[3].doc).get("name"),
                         "furthest")

    def testNeareastRestaurantToWork(self):
        sort = Sort(SortField("location", DistanceComparatorSource(10, 10)))
        docs = self.searcher.search(self.query, None, 3, sort)

        self.assertEqual(4, docs.totalHits)
        self.assertEqual(3, len(docs.scoreDocs))

        fieldDoc = FieldDoc.cast_(docs.scoreDocs[0])
        distance = Double.cast_(fieldDoc.fields[0]).doubleValue()
        self.assertEqual(sqrt(17), distance, "(10,10) -> (9,6) = sqrt(17)")

        document = self.searcher.doc(fieldDoc.doc)
        self.assertEqual("Los Betos", document["name"])

        self.dumpDocs(sort, docs)

    def dumpDocs(self, sort, docs):
        print "Sorted by:", sort
        for scoreDoc in docs.scoreDocs:
            fieldDoc = FieldDoc.cast_(scoreDoc)
            distance = Double.cast_(fieldDoc.fields[0]).doubleValue()
            doc = self.searcher.doc(fieldDoc.doc)
            print "  %(name)s @ (%(location)s) ->" % doc, distance
class BooksLikeThis(object):

    def main(cls, argv):
        indexDir = System.getProperty("index.dir")
        directory = SimpleFSDirectory(File(indexDir))

        reader = IndexReader.open(directory, True)
        blt = BooksLikeThis(reader)

        for id in xrange(reader.maxDoc()):
            if reader.isDeleted(id):
                continue
            doc = reader.document(id)
            print ''
            print doc.get("title").encode('utf-8')

            docs = blt.docsLike(id, doc, 10)
            if not docs:
                print "  None like this"
            else:
                for doc in docs:
                    print "  ->", doc.get("title").encode('utf-8')

    def __init__(self, reader):
        self.reader = reader
        self.searcher = IndexSearcher(reader)

    def docsLike(self, id, doc, max):
        authors = doc.getValues("author")
        authorQuery = BooleanQuery()
        for author in authors:
            authorQuery.add(TermQuery(Term("author", author)),
                            BooleanClause.Occur.SHOULD)
        authorQuery.setBoost(2.0)

        vector = self.reader.getTermFreqVector(id, "subject")
        subjectQuery = BooleanQuery()
        for term in vector.getTerms():
            tq = TermQuery(Term("subject", term))
            subjectQuery.add(tq, BooleanClause.Occur.SHOULD)

        likeThisQuery = BooleanQuery()
        likeThisQuery.add(authorQuery, BooleanClause.Occur.SHOULD)
        likeThisQuery.add(subjectQuery, BooleanClause.Occur.SHOULD)

        # exclude myself
        likeThisQuery.add(TermQuery(Term("isbn", doc.get("isbn"))),
                          BooleanClause.Occur.MUST_NOT)

        print "  Query:", likeThisQuery.toString("contents")
        scoreDocs = self.searcher.search(likeThisQuery, 50).scoreDocs

        docs = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            if len(docs) < max:
                docs.append(doc)
            else:
                break
        return docs

    main = classmethod(main)
def post(self):
    q = self.get_argument("query")
    k = self.get_argument("kTerms")
    # self.write(key)
    # def query(query):
    #     query = self.get_argument("q")
    lucene.initVM()
    indexDir = "index"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 10
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    items = []
    rQ = []
    # for key, value in doc_urls.iteritems():
    #     print (key, value)

    for hit in hits.scoreDocs:
        # items.append({'score': hit.score, 'doc': hit.doc, 'blah': hit.toString(), 'url': doc_urls[str(hit.doc)]})
        print hit.score, hit.doc, hit.toString()
        print(len(doc_urls))
        items.append(doc_urls[str(hit.doc)])
        print(doc_urls[str(hit.doc)])
        doc = searcher.doc(hit.doc)
        print(hit.doc)
        rQ.append("html_files/" + str(hit.doc))

    i = 0
    rqSize = 0
    for url in rQ:
        rqSize = rqSize + 1
        print(url)
        f = codecs.open(url, 'r')
        html = f.read()
        html = html.decode('utf-8')
        tag_free = strip_tags(html)
        path = 'strippedHTML_files'
        if not os.path.exists(path):
            os.makedirs(path)
        filename = str(i)
        with open(os.path.join(path, filename), 'wb') as temp_file:
            temp_file.write(tag_free.encode('utf-8'))
        i = i + 1

    path = 'strippedHTML_files'
    i = 0
    for filename in os.listdir(path):
        with open(os.path.join(path, filename), 'r') as myfile:
            data = myfile.read()
            stripStopWords(data, i)
            i = i + 1

    if k > 0:
        newQuery = calcNewQuery(k, q, rqSize)
        q = newQuery
        print("new query is ")
        print(q)

    self.render("index.html", title="Results", items=items, query=q, kTerms=k)