def main(cls, argv):
    allBooks = MatchAllDocsQuery()
    parser = QueryParser(Version.LUCENE_CURRENT, "contents",
                         StandardAnalyzer(Version.LUCENE_CURRENT))
    query = BooleanQuery()
    query.add(allBooks, BooleanClause.Occur.SHOULD)
    query.add(parser.parse("java OR action"), BooleanClause.Occur.SHOULD)

    indexDir = System.getProperty("index.dir")
    directory = SimpleFSDirectory(File(indexDir))

    example = SortingExample(directory)

    example.displayResults(query, Sort.RELEVANCE)
    example.displayResults(query, Sort.INDEXORDER)
    example.displayResults(query,
                           Sort(SortField("category", SortField.STRING)))
    example.displayResults(query,
                           Sort(SortField("pubmonth", SortField.INT, True)))
    example.displayResults(query,
                           Sort([SortField("category", SortField.STRING),
                                 SortField.FIELD_SCORE,
                                 SortField("pubmonth", SortField.INT, True)]))
    example.displayResults(query,
                           Sort([SortField.FIELD_SCORE,
                                 SortField("category", SortField.STRING)]))
    directory.close()

class LiaTestCase(TestCase):

    TEST_VERSION = Version.LUCENE_CURRENT

    def __init__(self, *args):
        super(LiaTestCase, self).__init__(*args)
        self.indexDir = System.getProperty("index.dir")

    def setUp(self):
        self.directory = SimpleFSDirectory(File(self.indexDir))

    def tearDown(self):
        self.directory.close()

    def getWriter(self, directory=None, analyzer=None, open_mode=None):
        config = IndexWriterConfig(self.TEST_VERSION, analyzer or
                                   LimitTokenCountAnalyzer(
                                       WhitespaceAnalyzer(Version.LUCENE_CURRENT),
                                       10000))
        config.setOpenMode(open_mode or IndexWriterConfig.OpenMode.CREATE)
        return IndexWriter(directory or self.directory, config)

    def getSearcher(self, directory=None, reader=None):
        if reader is not None:
            return IndexSearcher(reader)
        return IndexSearcher(DirectoryReader.open(directory or self.directory))

    #
    # For troubleshooting
    #
    def dumpHits(self, searcher, scoreDocs):
        if not scoreDocs:
            print "No hits"
        else:
            for scoreDoc in scoreDocs:
                print "%s: %s" % (scoreDoc.score,
                                  searcher.doc(scoreDoc.doc).get('title'))

    def assertHitsIncludeTitle(self, searcher, scoreDocs, title, fail=False):
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            if title == doc.get("title"):
                if fail:
                    self.fail("title '%s' found" % (title,))
                return
        if not fail:
            self.fail("title '%s' not found" % (title,))

    def parseDate(self, s):
        return SimpleDateFormat("yyyy-MM-dd").parse(s)

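# A minimal usage sketch (not from the original source): a hypothetical
# subclass showing how getWriter()/getSearcher() above are meant to be
# combined in a test. The field name, document title, and class name are
# illustrative assumptions only.
class SubjectSearchTest(LiaTestCase):

    def testTitleHit(self):
        writer = self.getWriter()
        doc = Document()
        doc.add(Field("title", "Lucene in Action",
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
        writer.close()

        searcher = self.getSearcher()
        scoreDocs = searcher.search(
            TermQuery(Term("title", "Lucene in Action")), 10).scoreDocs
        self.assertHitsIncludeTitle(searcher, scoreDocs, "Lucene in Action")
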
def setUp(self):
    indexDir = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                            "index-dir")
    cIndexDir = "%s-compound" % (indexDir)
    mIndexDir = "%s-multi" % (indexDir)
    self.rmdir(cIndexDir)
    self.rmdir(mIndexDir)
    self.cDir = SimpleFSDirectory(File(cIndexDir))
    self.mDir = SimpleFSDirectory(File(mIndexDir))

def setUp(self):
    fsIndexDir = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                              "fs-index")
    self.rmdir(fsIndexDir)
    self.ramDir = RAMDirectory()
    self.fsDir = SimpleFSDirectory(File(fsIndexDir))

def __init__(self):
    # Borg pattern: every instance shares one state dictionary, so the JVM
    # and index are only initialized by whichever instance runs first.
    self.__dict__ = self.__shared_state
    if not self.__shared_state:
        self.jccvm = lucene.initVM()
        self.index = SimpleFSDirectory(
            lucene.File(settings.lucene_index_dir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

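# For reference, the Borg idiom used above in its bare form (explanatory
# sketch, not part of the original): all instances alias the same
# class-level dictionary, so attribute writes are visible everywhere.
class Borg(object):
    __shared_state = {}

    def __init__(self):
        self.__dict__ = self.__shared_state

a, b = Borg(), Borg()
a.x = 1
assert b.x == 1  # state is shared across instances
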
def luceneRetriver(query):
    lucene.initVM()
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    my_query = QueryParser(Version.LUCENE_30, "text",
                           lucene_analyzer).parse(query)
    MAX = 1000
    total_hits = lucene_searcher.search(my_query, MAX)
    print "Hits: ", total_hits.totalHits
    for hit in total_hits.scoreDocs:
        print "Hit Score: ", hit.score, "Hit Doc:", hit.doc, \
            "HitString:", hit.toString()
        doc = lucene_searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")

def luceneIndexer(contents):
    lucene.initVM()
    INDEXIDR = settings.INDEX_DIR
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in contents:
        print "Indexing: ", tfile
        document = Document()
        content = tfile.getvalue()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()

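# A hedged usage sketch for the function above: since it calls
# tfile.getvalue(), "contents" is assumed to be an iterable of file-like
# objects such as StringIO instances, and settings.INDEX_DIR is assumed to
# point at a writable directory.
from StringIO import StringIO

docs = [StringIO("lucene is a search library"),
        StringIO("pylucene wraps lucene for python")]
luceneIndexer(docs)
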
def names():
    lst = []
    search = "spax"  # request.form['product']
    lucene.initVM()
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)
    query = QueryParser(lucene.Version.LUCENE_CURRENT, "text",
                        analyzer).parse(search)
    MAX = 1000
    hits = searcher.search(query, MAX)
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    for hit in hits.scoreDocs:
        if hit.score >= 1:
            print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            print doc.get("text").encode("utf-8")
            items = doc.get("text").encode("utf-8").split(',')
            for item in items:
                if item == search:
                    pass
                elif item not in lst:
                    lst.append(item)
    # print lst
    data = {"products": lst}
    if request.method == 'POST':
        return jsonify(data)
    else:
        return jsonify(data)

def configure_lucene():
    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    # indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True,
                         IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
    print >> sys.stderr, "Reading lines from clique.txt..."
    for line in f:
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    print >> sys.stderr, "Indexed lines from clique.txt (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()

def post(self):
    q = self.get_argument("query")
    # self.write(key)
    # def query(query):
    #     query = self.get_argument("q")
    lucene.initVM()
    indexDir = "index"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 10
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    items = []
    rQ = []
    # for key, value in doc_urls.iteritems():
    #     print (key, value)
    for hit in hits.scoreDocs:
        # items.append({'score': hit.score, 'doc': hit.doc,
        #               'blah': hit.toString(),
        #               'url': doc_urls[str(hit.doc)]})
        print hit.score, hit.doc, hit.toString()
        print(len(doc_urls))
        items.append(doc_urls[str(hit.doc)])
        doc = searcher.doc(hit.doc)
        print(hit.doc)
    self.render("index.html", title="Results", items=items, query=q)

def indexDocuments():
    # empty index directory
    indexDir = Wikipedia.directory + 'index/'
    for filename in os.listdir(indexDir):
        os.remove(indexDir + filename)

    # index documents
    lucene.initVM()
    version = Version.LUCENE_CURRENT
    analyzer = EnglishAnalyzer(version)
    writer = IndexWriter(SimpleFSDirectory(File(indexDir)), analyzer, True,
                         IndexWriter.MaxFieldLength.LIMITED)

    for article in Wikipedia():
        doc = Document()
        doc.add(Field('id', str(article['id'][0]),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('title', article['url'],
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', article['text'],
                      Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)

    print 'Optimization'
    writer.optimize()
    writer.close()

def __init__(self, location):
    lucene.initVM()
    directory = SimpleFSDirectory(File(location))
    self.reader = IndexReader.open(directory, True)
    self.searcher = IndexSearcher(self.reader)
    self.query_parser = QueryParser(Version.LUCENE_CURRENT, "text",
                                    WhitespaceAnalyzer())

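# Illustrative only: assuming the __init__ above belongs to a searcher class
# (here called Searcher, a hypothetical name), a query could be run like so.
s = Searcher("/path/to/index")
query = s.query_parser.parse("hello world")
for scoreDoc in s.searcher.search(query, 10).scoreDocs:
    print s.searcher.doc(scoreDoc.doc).get("text")
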
def reindex(self):
    writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                         self.corpus.analyzer, False,
                         IndexWriter.MaxFieldLength.LIMITED)
    indexutils.reindex_all(self.reader, writer, self.corpus.analyzer)
    writer.optimize()
    writer.close()
    self.parent.write({'message':
                       "Reindex successful. Corpus analyzer is now set to %s."
                       % (self.corpus.analyzer_str,)})
    self.parent.write({'status': "Ready!"})

def retrieveDocs(q):
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 1000
    hits = searcher.search(query, MAX)

    nonDiverse = []
    docsToScores = {}
    # create a list of html files with relevant websites
    rQ = []
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
        # print(new_urls[str(hit.doc)])
        result = str(hit.score) + " " + str(hit.doc) + " " + hit.toString()
        if len(nonDiverse) < 10:
            nonDiverse.append(new_urls[str(hit.doc)])
        # find the document that corresponds to the html website and
        # append to a list for min distance
        website = new_urls[str(hit.doc)]
        # html_files numbers of the hit websites added to rQ
        rQ.append(inv_map[website])
        docsToScores[int(inv_map[website])] = hit.score
        print(inv_map[website])
    return docsToScores, rQ, nonDiverse

def setUp(self):
    indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
                            'index-dir')
    self.rmdir(indexDir)
    self.dir = SimpleFSDirectory(File(indexDir))
    self.addDocuments(self.dir)

def init():
    global searcher, analyzer, vm
    vm = initVM()
    STORE_DIR = "index_qst"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

def __init__(self):
    lucene.initVM()
    self._lversion = Version.LUCENE_30
    self._analyzer = EnglishAnalyzer(self._lversion)
    self._searcher = IndexSearcher(SimpleFSDirectory(File(self.indexDir)))
    self._translation = loadTranslation()
    self._links = loadLinks()

def _init_index(self):
    if not os.path.exists(self.corpus.path):
        os.mkdir(self.corpus.path)
    try:
        searcher = IndexSearcher(SimpleFSDirectory(File(self.corpus.path)),
                                 True)
    # except lucene.JavaError:
    except:
        analyzer = self.corpus.analyzer
        writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                             analyzer, True,
                             IndexWriter.MaxFieldLength.LIMITED)
        writer.setMaxFieldLength(1048576)
        writer.optimize()
        writer.close()

    self.lucene_index = SimpleFSDirectory(File(self.corpus.path))
    self.searcher = IndexSearcher(self.lucene_index, True)
    self.reader = IndexReader.open(self.lucene_index, True)
    self.analyzer = self.corpus.analyzer

def index(cls, indexDir, dataDir):
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError, "%s does not exist or is not a directory" % (dataDir)

    dir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(False)

    cls.indexDirectory(writer, dataDir)

    numIndexed = writer.numDocs()
    writer.commit()
    writer.close()
    dir.close()

    return numIndexed

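# Hypothetical invocation of the classmethod above; the owning class is not
# shown in the original, so the name "Indexer" and both paths are assumed:
numIndexed = Indexer.index("/tmp/books-index", "/path/to/data")
print "Indexed %d document(s)" % numIndexed
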
def init():
    global STORE_DIR, directory, searcher, analyzer, vm_env
    STORE_DIR = "index_lucene_v3_highlight"
    if vm_env is None:
        vm_env = initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

def index_files(files, index_directory):
    lucene.initVM()
    d = SimpleFSDirectory(File(index_directory))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(d, analyzer, True, IndexWriter.MaxFieldLength(512))
    for f in files:
        parse_file(f, writer)
    writer.optimize()
    writer.close()

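# parse_file() is not defined in this snippet; a minimal sketch of what it
# might look like, assuming each input is a path to a plain-text file and
# the index uses a single stored "text" field:
def parse_file(f, writer):
    doc = Document()
    text = open(f).read()
    doc.add(Field("text", text, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
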
def search(command):
    STORE_DIR = "index"
    vm_env = initVM()
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    result = run(searcher, analyzer, command)
    searcher.close()
    return result

def SearchFiles(command):
    STORE_DIR = "lucene/index"
    getVMEnv().attachCurrentThread()
    # print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    rankedfiles = run(searcher, analyzer, command)
    searcher.close()
    return rankedfiles

def index(cls, indexDir, dataDir):
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError, "%s does not exist or is not a directory" % (dataDir)

    dir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(False)

    cls.indexDirectory(writer, dataDir)

    numIndexed = writer.numDocs()
    writer.optimize()
    writer.close()
    dir.close()

    return numIndexed

class LiaTestCase(TestCase):

    def __init__(self, *args):
        super(LiaTestCase, self).__init__(*args)
        self.indexDir = os.environ["index.dir"]

    def setUp(self):
        # SimpleFSDirectory expects a java.io.File, not a plain path string
        self.directory = SimpleFSDirectory(File(self.indexDir))

    def tearDown(self):
        self.directory.close()

    #
    # For troubleshooting
    #
    def dumpHits(self, searcher, scoreDocs):
        if not scoreDocs:
            print "No hits"
        else:
            for scoreDoc in scoreDocs:
                print "%s: %s" % (scoreDoc.score,
                                  searcher.doc(scoreDoc.doc).get('title'))

    def assertHitsIncludeTitle(self, searcher, scoreDocs, title, fail=False):
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            if title == doc.get("title"):
                if fail:
                    self.fail("title '%s' found" % (title,))
                return
        if not fail:
            self.fail("title '%s' not found" % (title,))

    def parseDate(self, s):
        # datetime.date("yyyy-MM-dd") is not a valid call; parse the string
        return datetime.datetime.strptime(s, "%Y-%m-%d").date()

def Searchfile(command, prior, page, RPP):
    STORE_DIR = "index_ans"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    store = run(searcher, analyzer, command, prior)
    searcher.close()

    start = (page - 1) * RPP
    end = start + RPP

    return store[start:end], len(store)

def __init__(self, rows=None):
    # lucene.initVM()
    # Call lucene.initVM() once in Django's settings.py and reuse it here.
    vm_env = lucene.getVMEnv()
    if vm_env is None:
        lucene.initVM()
    else:
        vm_env.attachCurrentThread()
    self.analyzer = lucene.StandardAnalyzer(Version.LUCENE_30)
    self.indexDir = SimpleFSDirectory(File(INDEX_DIRECTORY))
    self.rows = rows

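# The getVMEnv()/attachCurrentThread() dance above is the standard way to
# reuse one JVM across threads (e.g. Django request handlers). A small
# helper, sketched here, keeps that boilerplate in one place:
def ensure_vm():
    vm_env = lucene.getVMEnv()
    if vm_env is None:
        # first caller anywhere in the process starts the JVM
        lucene.initVM()
    else:
        # later callers, possibly on other threads, just attach
        vm_env.attachCurrentThread()
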
class LiaTestCase(TestCase):

    def __init__(self, *args):
        super(LiaTestCase, self).__init__(*args)
        self.indexDir = System.getProperty("index.dir")

    def setUp(self):
        self.directory = SimpleFSDirectory(File(self.indexDir))

    def tearDown(self):
        self.directory.close()

    #
    # For troubleshooting
    #
    def dumpHits(self, searcher, scoreDocs):
        if not scoreDocs:
            print "No hits"
        else:
            for scoreDoc in scoreDocs:
                print "%s: %s" % (scoreDoc.score,
                                  searcher.doc(scoreDoc.doc).get('title'))

    def assertHitsIncludeTitle(self, searcher, scoreDocs, title, fail=False):
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            if title == doc.get("title"):
                if fail:
                    self.fail("title '%s' found" % (title,))
                return
        if not fail:
            self.fail("title '%s' not found" % (title,))

    def parseDate(self, s):
        return SimpleDateFormat("yyyy-MM-dd").parse(s)

def import_csv_with_content(self, csv_file, content_field):
    try:
        writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                             self.analyzer, False,
                             IndexWriter.MaxFieldLength.LIMITED)
        changed_rows = addmetadata.add_metadata_and_content_from_csv(
            self.searcher, self.reader, writer, csv_file, content_field,
            self.args_dir)
        writer.close()
    except UnicodeDecodeError:
        try:
            writer.close()
        except:
            pass
        self.parent.write({'error':
                           'CSV import failed: file contained non-unicode '
                           'characters. Please save the file with UTF-8 '
                           'encoding and try again!'})
        return
    self.parent.write({'message':
                       "CSV import complete: %s rows added."
                       % (changed_rows,)})

def search_lucene_index(search_params, index_dir, index_metadata,
                        records_per_page):
    """
    Uses the query term provided to search the disease ontology lucene index
    """
    results = []

    index_dir = SimpleFSDirectory(File(index_dir))
    analyzer = build_perfield_analyzer(index_metadata)
    searcher = IndexSearcher(index_dir)
    index_fields = index_metadata.keys()

    # Since we are paging results we want to grab what page we are on
    page = (int(search_params.get('page', 1))) - 1

    # Doing something pretty hacky here since we are trying to move from
    # 0-based to 1-based indexing to match our pagination display
    offset = int(page) * records_per_page

    # If we are executing an advanced search we will be building a
    # BooleanQuery in parts, as opposed to the one MultiFieldQueryParser
    # used when doing a basic search
    query = None
    if search_params.get('adv_search') == "True":
        query = build_advanced_search_query(search_params,
                                            search_params.get('operator'),
                                            analyzer)
    else:
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT,
                                       index_fields, analyzer)
        query = MultiFieldQueryParser.parse(
            parser, process_query_param((search_params.get('q'))))

    # Parse through our hits
    hits = searcher.search(query, 10000)
    total_hits = hits.totalHits
    count = min(hits.totalHits - offset, records_per_page)

    for i in xrange(0, count):
        score_doc = hits.scoreDocs[offset + i]
        doc = searcher.doc(score_doc.doc)
        term_id = doc.get('term id')
        name = doc.get('name')
        explain = searcher.explain(query, score_doc.doc)
        match_fields = get_field_matches(explain.toString(), index_fields)

        results.append((term_id, name, list(match_fields)))

    searcher.close()
    return (results, total_hits)

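# build_perfield_analyzer() is defined elsewhere; one plausible shape for it,
# assuming index_metadata maps field names to their configuration, is a
# PerFieldAnalyzerWrapper that falls back to StandardAnalyzer. This is a
# sketch under those assumptions, not the original implementation.
def build_perfield_analyzer(index_metadata):
    analyzer = PerFieldAnalyzerWrapper(
        StandardAnalyzer(Version.LUCENE_CURRENT))
    for field_name in index_metadata.keys():
        # e.g. keep identifier-like fields as single untokenized terms
        analyzer.addAnalyzer(field_name, KeywordAnalyzer())
    return analyzer
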
def search_image(command):
    if command == ' ':
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = command_dict.get(opt, '') + ' ' + \
                " ".join(seg_list)

    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            t = Term(k, '*' + v)
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k, analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        # tokenStream() takes the field name, not the field value
        ts = analyzer.tokenStream("contents", StringReader(text))
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)
    searcher.close()
    return Docs

def createIndex(cls, dataDir, indexDir, useCompound):
    indexDir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(indexDir, StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(useCompound)

    for dir, dirnames, filenames in os.walk(dataDir):
        for filename in filenames:
            if filename.endswith('.properties'):
                cls.indexFile(writer, os.path.join(dir, filename), dataDir)

    writer.optimize()
    writer.close()

def query(query):
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query)
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits,
                                                             query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString(), doc_urls[str(hit.doc)]
        doc = searcher.doc(hit.doc)

def search(cls, indexDir, q):
    fsDir = SimpleFSDirectory(File(indexDir))
    searcher = IndexSearcher(fsDir, True)

    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        StandardAnalyzer(Version.LUCENE_CURRENT)).parse(q)
    start = time()
    hits = searcher.search(query, 50).scoreDocs
    duration = timedelta(seconds=time() - start)

    print "Found %d document(s) (in %s) that matched query '%s':" % (
        len(hits), duration, q)

    for hit in hits:
        doc = searcher.doc(hit.doc)
        print 'path:', doc.get("path")

def setUp(self):
    # wrap the path in a java.io.File, as SimpleFSDirectory requires
    self.directory = SimpleFSDirectory(File(self.indexDir))