def setUp(self):
    indexDir = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                            "index-dir")
    cIndexDir = "%s-compound" % (indexDir)
    mIndexDir = "%s-multi" % (indexDir)
    self.rmdir(cIndexDir)
    self.rmdir(mIndexDir)
    self.cDir = SimpleFSDirectory(File(cIndexDir))
    self.mDir = SimpleFSDirectory(File(mIndexDir))

def setUp(self):
    fsIndexDir = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                              "fs-index")
    self.rmdir(fsIndexDir)
    self.ramDir = RAMDirectory()
    self.fsDir = SimpleFSDirectory(File(fsIndexDir))

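# Both setUp methods above call self.rmdir(), which is not shown in these
# snippets. A minimal sketch of what such a helper might look like (the
# recursive-cleanup logic is an assumption, not part of the original code):
def rmdir(self, dir):
    import shutil
    # ignore_errors covers the common case where the directory
    # does not exist yet on the first run
    shutil.rmtree(dir, ignore_errors=True)
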
def __init__(self):
    # Borg pattern: every instance shares one state dict (assumes a
    # class-level `__shared_state = {}` attribute is defined), so the
    # JVM and index are initialized only once
    self.__dict__ = self.__shared_state
    if not self.__shared_state:
        self.jccvm = lucene.initVM()
        self.index = SimpleFSDirectory(
            lucene.File(settings.lucene_index_dir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

def luceneRetriver(query):
    lucene.initVM()
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    my_query = QueryParser(Version.LUCENE_30, "text",
                           lucene_analyzer).parse(query)
    MAX = 1000
    total_hits = lucene_searcher.search(my_query, MAX)
    print "Hits: ", total_hits.totalHits
    for hit in total_hits.scoreDocs:
        print "Hit Score: ", hit.score, "Hit Doc:", hit.doc, \
            "HitString:", hit.toString()
        doc = lucene_searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")

def __init__(self, location):
    lucene.initVM()
    directory = SimpleFSDirectory(File(location))
    self.reader = IndexReader.open(directory, True)
    self.searcher = IndexSearcher(self.reader)
    self.query_parser = QueryParser(Version.LUCENE_CURRENT, "text",
                                    WhitespaceAnalyzer())

def luceneIndexer(contents):
    lucene.initVM()
    INDEXDIR = settings.INDEX_DIR
    indexdir = SimpleFSDirectory(File(INDEXDIR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in contents:
        print "Indexing: ", tfile
        document = Document()
        content = tfile.getvalue()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()

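# luceneIndexer() calls tfile.getvalue() on each element, so `contents`
# is expected to be a sequence of file-like buffers. A hypothetical call
# (the sample strings are assumptions for illustration):
from StringIO import StringIO

luceneIndexer([StringIO("first document"), StringIO("second document")])
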
def post(self):
    q = self.get_argument("query")
    lucene.initVM()
    indexDir = "index"
    directory = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(directory)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 10
    hits = searcher.search(query, MAX)
    print "Found %d document(s) that matched query '%s':" % (
        hits.totalHits, query)
    items = []
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        print len(doc_urls)
        items.append(doc_urls[str(hit.doc)])
        doc = searcher.doc(hit.doc)
        print hit.doc
    self.render("index.html", title="Results", items=items, query=q)

def names():
    lst = []
    search = "spax"  # request.form['product']
    lucene.initVM()
    directory = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(directory)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(search)
    MAX = 1000
    hits = searcher.search(query, MAX)
    print "Found %d document(s) that matched query '%s':" % (
        hits.totalHits, query)
    for hit in hits.scoreDocs:
        if hit.score >= 1:
            print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            print doc.get("text").encode("utf-8")
            items = doc.get("text").encode("utf-8").split(',')
            for item in items:
                if item == search:
                    pass
                elif item not in lst:
                    lst.append(item)
    data = {"products": lst}
    # the original branched on request.method here, but both branches
    # returned the same payload
    return jsonify(data)

def configure_lucene():
    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    indexDir = "/tmp/luceneindex"
    directory = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(directory, analyzer, True,
                         IndexWriter.MaxFieldLength(512))
    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
    print >> sys.stderr, "Reading lines from clique.txt..."
    for line in f:
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    print >> sys.stderr, "Indexed lines from clique.txt (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
    f.close()

def init():
    global searcher, analyzer, vm
    vm = initVM()
    STORE_DIR = "index_qst"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

def reindex(self):
    writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                         self.corpus.analyzer, False,
                         IndexWriter.MaxFieldLength.LIMITED)
    indexutils.reindex_all(self.reader, writer, self.corpus.analyzer)
    writer.optimize()
    writer.close()
    self.parent.write({'message':
                       "Reindex successful. Corpus analyzer is now set to %s."
                       % (self.corpus.analyzer_str,)})
    self.parent.write({'status': "Ready!"})

def setUp(self):
    indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
                            'index-dir')
    self.rmdir(indexDir)
    self.dir = SimpleFSDirectory(File(indexDir))
    self.addDocuments(self.dir)

def indexDocuments():
    # empty the index directory
    indexDir = Wikipedia.directory + 'index/'
    for filename in os.listdir(indexDir):
        os.remove(indexDir + filename)
    # index documents
    lucene.initVM()
    version = Version.LUCENE_CURRENT
    analyzer = EnglishAnalyzer(version)
    writer = IndexWriter(SimpleFSDirectory(File(indexDir)), analyzer, True,
                         IndexWriter.MaxFieldLength.LIMITED)
    for article in Wikipedia():
        doc = Document()
        doc.add(Field('id', str(article['id'][0]),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('title', article['url'],
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', article['text'],
                      Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
    print 'Optimization'
    writer.optimize()
    writer.close()

def retrieveDocs(q):
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 1000
    hits = searcher.search(query, MAX)
    nonDiverse = []
    docsToScores = {}
    # create a list of html files with relevant websites
    rQ = []
    print "Found %d document(s) that matched query '%s':" % (
        hits.totalHits, query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
        if len(nonDiverse) < 10:
            nonDiverse.append(new_urls[str(hit.doc)])
        # find the document that corresponds to the html website and
        # append to a list for min distance
        website = new_urls[str(hit.doc)]
        # html_files numbers of the hit websites added to rQ
        rQ.append(inv_map[website])
        docsToScores[int(inv_map[website])] = hit.score
        print inv_map[website]
    return docsToScores, rQ, nonDiverse

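# retrieveDocs() depends on two module-level mappings that are not shown
# in the snippet. A hypothetical illustration of their assumed shapes
# (the URL and ids are placeholders, not real data):
new_urls = {'0': 'http://example.com/page.html'}   # Lucene doc id -> URL
inv_map = {'http://example.com/page.html': '0'}    # URL -> html_files number
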
def __init__(self):
    lucene.initVM()
    self._lversion = Version.LUCENE_30
    self._analyzer = EnglishAnalyzer(self._lversion)
    # self.indexDir is expected to be defined on the class
    self._searcher = IndexSearcher(SimpleFSDirectory(File(self.indexDir)))
    self._translation = loadTranslation()
    self._links = loadLinks()

def _init_index(self):
    if not os.path.exists(self.corpus.path):
        os.mkdir(self.corpus.path)
    try:
        searcher = IndexSearcher(SimpleFSDirectory(File(self.corpus.path)),
                                 True)
    except lucene.JavaError:
        # no usable index yet: create an empty one
        analyzer = self.corpus.analyzer
        writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                             analyzer, True,
                             IndexWriter.MaxFieldLength.LIMITED)
        writer.setMaxFieldLength(1048576)
        writer.optimize()
        writer.close()
    self.lucene_index = SimpleFSDirectory(File(self.corpus.path))
    self.searcher = IndexSearcher(self.lucene_index, True)
    self.reader = IndexReader.open(self.lucene_index, True)
    self.analyzer = self.corpus.analyzer

def index_files(files, index_directory):
    lucene.initVM()
    d = SimpleFSDirectory(File(index_directory))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(d, analyzer, True, IndexWriter.MaxFieldLength(512))
    for f in files:
        parse_file(f, writer)
    writer.optimize()
    writer.close()

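# A hypothetical invocation of index_files(); the paths are assumptions,
# and parse_file() (which turns one input file into Document objects and
# adds them to the writer) must be supplied by the surrounding module:
index_files(['docs/a.txt', 'docs/b.txt'], '/tmp/lucene-index')
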
def init():
    global STORE_DIR, directory, searcher, analyzer, vm_env
    STORE_DIR = "index_lucene_v3_highlight"
    if vm_env is None:
        vm_env = initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

def search(command):
    STORE_DIR = "index"
    vm_env = initVM()
    print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    result = run(searcher, analyzer, command)
    searcher.close()
    return result

def SearchFiles(command):
    STORE_DIR = "lucene/index"
    getVMEnv().attachCurrentThread()
    # print 'lucene', VERSION
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    rankedfiles = run(searcher, analyzer, command)
    searcher.close()
    return rankedfiles

def Searchfile(command, prior, page, RPP):
    STORE_DIR = "index_ans"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    store = run(searcher, analyzer, command, prior)
    searcher.close()
    start = (page - 1) * RPP
    end = start + RPP
    return store[start:end], len(store)

def __init__(self, rows=None):
    # Call lucene.initVM() once (e.g. in Django's settings.py) and only
    # attach the current thread to the running VM here.
    vm_env = lucene.getVMEnv()
    if vm_env is None:
        lucene.initVM()
    else:
        vm_env.attachCurrentThread()
    self.analyzer = lucene.StandardAnalyzer(Version.LUCENE_30)
    self.indexDir = SimpleFSDirectory(File(INDEX_DIRECTORY))
    self.rows = rows

def import_csv_with_content(self, csv_file, content_field):
    try:
        writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                             self.analyzer, False,
                             IndexWriter.MaxFieldLength.LIMITED)
        changed_rows = addmetadata.add_metadata_and_content_from_csv(
            self.searcher, self.reader, writer, csv_file, content_field,
            self.args_dir)
        writer.close()
    except UnicodeDecodeError:
        try:
            writer.close()
        except:
            pass
        self.parent.write({'error': 'CSV import failed: file contained '
                           'non-unicode characters. Please save the file '
                           'with UTF-8 encoding and try again!'})
        return
    self.parent.write({'message': "CSV import complete: %s rows added."
                       % (changed_rows,)})

def search_lucene_index(search_params, index_dir, index_metadata,
                        records_per_page):
    """
    Uses the query term provided to search the disease ontology
    lucene index.
    """
    results = []
    index_dir = SimpleFSDirectory(File(index_dir))
    analyzer = build_perfield_analyzer(index_metadata)
    searcher = IndexSearcher(index_dir)
    index_fields = index_metadata.keys()

    # Since we are paging results, grab what page we are on; the offset
    # converts the 1-based page shown in the pagination display to the
    # 0-based indexing the searcher expects.
    page = int(search_params.get('page', 1)) - 1
    offset = page * records_per_page

    # For an advanced search, build a BooleanQuery in parts; a basic
    # search uses a single MultiFieldQueryParser.
    query = None
    if search_params.get('adv_search') == "True":
        query = build_advanced_search_query(search_params,
                                            search_params.get('operator'),
                                            analyzer)
    else:
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT,
                                       index_fields, analyzer)
        query = MultiFieldQueryParser.parse(
            parser, process_query_param(search_params.get('q')))

    # Parse through our hits
    hits = searcher.search(query, 10000)
    total_hits = hits.totalHits
    count = min(hits.totalHits - offset, records_per_page)

    for i in xrange(0, count):
        score_doc = hits.scoreDocs[offset + i]
        doc = searcher.doc(score_doc.doc)
        term_id = doc.get('term id')
        name = doc.get('name')
        explain = searcher.explain(query, score_doc.doc)
        match_fields = get_field_matches(explain.toString(), index_fields)
        results.append((term_id, name, list(match_fields)))

    searcher.close()
    return (results, total_hits)

def search_image(command):
    if command == ' ':
        return []
    Docs = []
    vm_env = getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_img"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(directory, True)
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    # split the command into per-field clauses; bare terms are segmented
    # with jieba and searched against the 'contents' field
    command_dict = {}
    allowed_opt = ['site']
    opt = 'contents'
    for i in command.split(' '):
        if ':' in i:
            opt, value = i.split(':')[:2]
            opt = opt.lower()
            if opt in allowed_opt and value != '':
                command_dict[opt] = command_dict.get(opt, '') + ' ' + value
        else:
            seg_list = jieba.cut(i)
            command_dict[opt] = (command_dict.get(opt, '') + ' ' +
                                 " ".join(seg_list))

    querys = BooleanQuery()
    for k, v in command_dict.iteritems():
        if k == 'site':
            t = Term(k, '*' + v)
            query = WildcardQuery(t)
        else:
            query = QueryParser(Version.LUCENE_CURRENT, k,
                                analyzer).parse(v)
        querys.add(query, BooleanClause.Occur.MUST)

    scoreDocs = searcher.search(querys, 10000).scoreDocs
    formatter = SimpleHTMLFormatter("<font color=#FF0000>", "</font>")
    highlighter = Highlighter(formatter, QueryScorer(querys))
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dic = {}
        doc_dic["url"] = doc.get("url")
        doc_dic["imgurl"] = doc.get("imgurl")
        doc_dic["urltitle"] = doc.get("urltitle")
        text = doc.get("contents")
        # tokenStream() takes the field name, not the field value
        ts = analyzer.tokenStream("contents", StringReader(text))
        doc_dic["contents"] = highlighter.getBestFragments(ts, text, 2, "...")
        Docs.append(doc_dic)
    searcher.close()
    return Docs

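# search_image() accepts "field:value" prefixes for the allowed 'site'
# option; any other term is segmented and matched against 'contents'.
# A hypothetical call (the query string is an assumption):
results = search_image('site:example.com scenery')
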
def createIndex(cls, dataDir, indexDir, useCompound):
    indexDir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(indexDir,
                         StandardAnalyzer(Version.LUCENE_CURRENT), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(useCompound)
    for dir, dirnames, filenames in os.walk(dataDir):
        for filename in filenames:
            if filename.endswith('.properties'):
                cls.indexFile(writer, os.path.join(dir, filename), dataDir)
    writer.optimize()
    writer.close()

def query(query):
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"
    directory = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(directory)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(query)
    MAX = 1000
    hits = searcher.search(query, MAX)
    print "Found %d document(s) that matched query '%s':" % (
        hits.totalHits, query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString(), doc_urls[str(hit.doc)]
        doc = searcher.doc(hit.doc)

def search(cls, indexDir, q):
    fsDir = SimpleFSDirectory(File(indexDir))
    searcher = IndexSearcher(fsDir, True)
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        StandardAnalyzer(Version.LUCENE_CURRENT)).parse(q)
    start = time()
    hits = searcher.search(query, 50).scoreDocs
    duration = timedelta(seconds=time() - start)
    print "Found %d document(s) (in %s) that matched query '%s':" % (
        len(hits), duration, q)
    for hit in hits:
        doc = searcher.doc(hit.doc)
        print 'path:', doc.get("path")

def find(self, query, indir):
    lucene.initVM()
    INDEXDIR = indir
    indir = SimpleFSDirectory(File(INDEXDIR))
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_30)
    lucene_searcher = IndexSearcher(indir)
    my_query = QueryParser(Version.LUCENE_30, "<default field>",
                           lucene_analyzer).parse(
        "text:" + query + " OR title:" + query)
    MAX = 1000
    total_hits = lucene_searcher.search(my_query, MAX)
    print "\nHits: ", total_hits.totalHits, "\n"
    for hit in total_hits.scoreDocs:
        print "Hit Score:", "%.4f" % hit.score, \
            "Department:", lucene_searcher.doc(hit.doc).get("department").encode("utf-8"), \
            "Title:", lucene_searcher.doc(hit.doc).get("title").encode("utf-8")
        print lucene_searcher.doc(hit.doc).get("url").encode("utf-8"), '\n'

def index(cls, indexDir, dataDir):
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError("%s does not exist or is not a directory" % dataDir)
    dir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(False)
    cls.indexDirectory(writer, dataDir)
    numIndexed = writer.numDocs()
    writer.optimize()
    writer.close()
    dir.close()
    return numIndexed

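# A hypothetical driver tying together the index()/search() classmethods
# above; the Indexer/Searcher class names and the paths are assumptions:
count = Indexer.index('/tmp/lucene-index', 'data/')
print 'Indexed %d documents' % count
Searcher.search('/tmp/lucene-index', 'lucene')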