def main1(): print "retrieve and display files......" direc = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR)) analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT) searcher = lucene.IndexSearcher(direc) search(searcher, analyzer) search2(searcher, analyzer)
def createIndex():
    """Build a Lucene index over every HTML file in ``html_files``.

    Each file is parsed with ``parsehtml`` and its extracted text is stored
    in a single analyzed ``text`` field of a new document.
    """
    # Initialize lucene and the JVM before touching any Lucene class.
    print("started indexer")
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    # Renamed from ``dir`` to avoid shadowing the builtin of the same name.
    store = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = IndexWriter(store, analyzer, True,
                         IndexWriter.MaxFieldLength(512))
    src_dir = 'html_files'
    for name in os.listdir(src_dir):
        path = os.path.join(src_dir, name)
        with open(path, 'r') as myfile:
            data = myfile.read()
        # parsehtml returns (text, errors); only the text is indexed.
        document, errors = parsehtml(data)
        doc = Document()
        doc.add(Field("text", document, Field.Store.YES,
                      Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def __init__(self):
    """Set up the Chinese analyzer, the index searcher and service handles."""
    self.analyzers = {
        "smartcn": lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33),
    }
    index_dir = lucene.SimpleFSDirectory(lucene.File(self.STORE_DIR))
    self.searcher = lucene.IndexSearcher(index_dir, True)
    self.pgconn = mypass.getConn()
    self.sw = sinaweibooauth.SinaWeiboOauth()
def __init__(self, index_dir):
    """Initialise index parameters.

    Creates ``index_dir`` on disk if needed, then opens an IndexWriter in
    create mode and a read-only reader over the same directory.
    """
    lucene.initVM()
    self.index_dir = index_dir
    if not os.path.exists(self.index_dir):
        os.mkdir(self.index_dir)
    store = lucene.SimpleFSDirectory(lucene.File(self.index_dir))
    self.analyser = PorterStemmerAnalyzer()
    self.writer = lucene.IndexWriter(
        store, self.analyser, True,
        lucene.IndexWriter.MaxFieldLength.LIMITED)
    self.writer.setMaxFieldLength(1048576)
    # FIX: the original opened a second SimpleFSDirectory over the very same
    # path just for the reader; reuse the existing directory handle instead.
    self.reader = lucene.FilterIndexReader.open(store, True)
def search(self, restrictions, destination):
    """
    @see: L{NullPrincipalSearcher<datafinder.persistence.search.searcher.NullSearcher>}

    E1101: Pylint cannot detect the internals of the modules solr and lucene.
    """
    # pylint: disable=E1101
    results = list()
    queryString = search_restriction_mapping.mapSearchRestriction(restrictions)
    if self._configuration.luceneIndexUri.startswith("file:///"):
        try:
            self._configuration.env.attachCurrentThread()
            indexDir = lucene.SimpleFSDirectory(lucene.File(
                self._configuration.luceneIndexUri.replace("file:///", "")))
            analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
            searcher = lucene.IndexSearcher(indexDir)
            try:
                query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT,
                                           "content", analyzer).parse(queryString)
                hits = searcher.search(query, constants.MAX_RESULTS)
                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    results.append("/%s" % urllib.unquote(
                        doc.get(constants.FILEPATH_FIELD).encode("utf-8")))
            finally:
                # FIX: the original leaked the searcher when the query failed;
                # always release it.
                searcher.close()
        # FIX: ``except Exception, error`` is Python-2-only; ``as`` works on 2.6+.
        except Exception as error:
            errorMessage = "Cannot search items. Reason: '%s'" % error
            raise PersistenceError(errorMessage)
    # FIX: ``results`` was built but never returned in the original.
    return results
def __init__(self, forumname):
    """Bind this instance to ``forumname``; exit if it is unsupported."""
    # Guard clause instead of the original if/else nesting.
    if forumname not in self.supported_forums:
        sys.exit()
    self.forum = forumname
    self.STORE_DIR = self.STORE_BASE_DIR + forumname
    self.analyzers = {
        "smartcn": lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33),
    }
    store = lucene.SimpleFSDirectory(lucene.File(self.STORE_DIR))
    self.searcher = lucene.IndexSearcher(store, True)
    self.pgconn = mypass.getConn()
def __init__(self, network): self.network = network smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33) #analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_33) analyzers = {"smartcn": smartcn} self.pgconn = mypass.getConn() writerconfig = lucene.IndexWriterConfig(lucene.Version.LUCENE_33, analyzers["smartcn"]) writerconfig.setWriteLockTimeout(600000L) writerconfig.setMaxThreadStates(50) writerconfig.setRAMBufferSizeMB(128.0) self.storeDir = self.storeDirBase + self.network store = lucene.SimpleFSDirectory(lucene.File(self.storeDir)) self.writer = lucene.IndexWriter(store, writerconfig)
def __init__(self, root, storeDir, analyzer):
    # Build a fresh index of everything under ``root`` into ``storeDir``.
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = lucene.SimpleFSDirectory(lucene.File(storeDir))
    # Third argument True: create a new index, overwriting any existing one.
    writer = lucene.IndexWriter(store, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength.LIMITED)
    # Raise the per-field token cap well beyond the LIMITED default.
    writer.setMaxFieldLength(1048576)
    self.indexDocs(root, writer)
    ticker = Ticker()
    # A background ticker thread prints progress while optimize() blocks;
    # clearing ``tick`` afterwards stops it.
    print 'optimizing index',
    threading.Thread(target=ticker.run).start()
    writer.optimize()
    writer.close()
    ticker.tick = False
    print 'done'
def __init__(self): smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33) #analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_33) analyzers = { "smartcn": smartcn } self.pgconn = mypass.getConn() self.sw = sinaweibooauth.SinaWeiboOauth() if not os.path.exists(self.storeDir): os.mkdir(self.storeDir) store = lucene.SimpleFSDirectory(lucene.File(self.storeDir)) writerconfig = lucene.IndexWriterConfig(lucene.Version.LUCENE_33, analyzers["smartcn"]) writerconfig.setWriteLockTimeout(600000L) writerconfig.setMaxThreadStates(50) writerconfig.setRAMBufferSizeMB(128.0) self.writer = lucene.IndexWriter(store, writerconfig)
def func_pic(command):
    """Search the image index for ``command``.

    Returns the ``(resultInfo, title, url, imgurl, score)`` tuple produced
    by ``run``.
    """
    global vm_env
    # The JVM was started elsewhere; just attach this thread to it.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "graphIndex"
    directory = lucene.SimpleFSDirectory(lucene.File(STORE_DIR))
    searcher = lucene.IndexSearcher(directory, True)
    analyzer = lucene.SimpleAnalyzer(lucene.Version.LUCENE_CURRENT)
    try:
        # FIX: dropped the dead empty-list pre-initialisations — ``run``
        # supplies every result value anyway.
        resultInfo, title, url, imgurl, score = run(command, searcher, analyzer)
    finally:
        # FIX: close the searcher even when ``run`` raises.
        searcher.close()
    return resultInfo, title, url, imgurl, score
def index_files(board, time_delta):
    """Index all files on ``board`` changed within ``time_delta``.

    File name, owner and title are stored verbatim (NOT_ANALYZED); only the
    filtered ``contents`` field is analyzed. Files whose gbk decoding fails
    are skipped.
    """
    store = lucene.SimpleFSDirectory(
        lucene.File(BOARDSPATH + board + '/' + RECENT_INDEX))
    writer = lucene.IndexWriter(
        store, lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT), True,
        lucene.IndexWriter.MaxFieldLength.UNLIMITED)
    flist = get_all_files(board, time_delta)
    for filename, owner, title in flist:
        path = BOARDSPATH + board + '/' + filename
        if not os.path.exists(path):
            continue
        # FIX: ``with`` closes the file on every path; the original had to
        # call f.close() by hand in both the error and the success branch.
        with open(path, 'r') as f:
            contents = filter_file(f)
        debug(contents)
        try:
            title = title.decode('gbk')
            owner = owner.decode('gbk')
            contents = unicode(contents, 'gbk')
        except UnicodeDecodeError:
            debug(filename)
            continue
        if len(contents) > 0:
            doc = lucene.Document()
            doc.add(lucene.Field("name", filename, lucene.Field.Store.YES,
                                 lucene.Field.Index.NOT_ANALYZED))
            doc.add(lucene.Field("owner", owner, lucene.Field.Store.YES,
                                 lucene.Field.Index.NOT_ANALYZED))
            doc.add(lucene.Field("title", title, lucene.Field.Store.YES,
                                 lucene.Field.Index.NOT_ANALYZED))
            doc.add(lucene.Field("contents", contents, lucene.Field.Store.NO,
                                 lucene.Field.Index.ANALYZED))
            writer.addDocument(doc)
            debug('adding ' + filename)
    writer.optimize()
    writer.close()
def main1(): print "started indexing sample files......" direc = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR)) analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT) config = lucene.IndexWriterConfig(lucene.Version.LUCENE_CURRENT, analyzer) config.setOpenMode(lucene.IndexWriterConfig.OpenMode.CREATE_OR_APPEND) writer = lucene.IndexWriter(direc, config) #fix this later.....FieldType not defined #field_type=lucene.FieldType() #field_type.setIndexed(True) #field_type.setStored(False) #field_type.setTokenized(False) file1 = open("nitin.json") data = file1.read() contents = json.loads(data) doc = lucene.Document() field = lucene.Field("name", contents['name'], lucene.Field.Store.NO, lucene.Field.Index.ANALYZED) doc.add(field) field = lucene.Field("data", data, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED) doc.add(field) writer.addDocument(doc) file1.close() file1 = open("nitin2.json") data = file1.read() contents = json.loads(data) doc = lucene.Document() field = lucene.Field("name", contents['name'], lucene.Field.Store.NO, lucene.Field.Index.ANALYZED) doc.add(field) field = lucene.Field("data", data, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED) doc.add(field) writer.addDocument(doc) file1.close() writer.optimize() print "Indexed and optimized %d documents" % writer.numDocs() writer.close()
def __init__(self, root, storeDir, analyzer, startDate, endDate):
    # Index the documents under ``root`` into ``storeDir``, restricted to
    # the [startDate, endDate] range, then optimize and close the index.
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = lucene.SimpleFSDirectory(lucene.File(storeDir))
    # (Translated from the Chinese original:) Create the IndexWriter — the
    # first argument is the Directory, the second the analyzer, the third
    # whether to create a fresh index (False here: modify the existing one),
    # and the fourth the maximum field length, e.g. MaxFieldLength(2) splits
    # every two characters; IndexWriter.MaxFieldLength.LIMITED is typical.
    writer = lucene.IndexWriter(store, analyzer, False,
                                lucene.IndexWriter.MaxFieldLength.LIMITED)
    writer.setMaxFieldLength(1048576)
    self.indexDocs(root, writer, startDate, endDate)
    ticker = Ticker()
    # A background ticker thread prints progress while optimize() blocks;
    # clearing ``tick`` afterwards stops it.
    print 'optimizing index',
    threading.Thread(target=ticker.run).start()
    writer.optimize()
    writer.close()
    ticker.tick = False
    print 'done'
def search(self, query, field="content", limit=None): ''' Searches the index based on the query supplied. ''' directory = lucene.SimpleFSDirectory(lucene.File(self.index_dir)) searcher = lucene.IndexSearcher(directory, True) query = lucene.QueryParser(lucene.Version.LUCENE_CURRENT, field, self.analyser).parse(query) try: #if there's no limit then use a collector to retrieve them all if limit is None: collector = DocumentHitCollector(searcher) scoreDocs = searcher.search(query, collector) results = collector.get_collected_documents() else: scoreDocs = searcher.search(query, limit).scoreDocs results = [] for scoreDoc in scoreDocs: results.append(searcher.doc(scoreDoc.doc)) except lucene.JavaError, e: print e
# -*_ coding: utf-8 -*- # from lucene import * import lucene text = ["a b c d" , "c d e e"] texts = ["Python 是 一个 很有 吸引力 的 语言", "C++ 语言 也 很 有 吸引力 , 长久 不衰", "我们 希望 Python 和 C++ 高手加入", "我们 的 技术 巨牛 ,人人 都是 高手"] initVM() INDEX_DIR = '/root/weibo_corpus/post_index' directory = lucene.SimpleFSDirectory(lucene.File(INDEX_DIR)) analyzer = SimpleAnalyzer() def read(filename): text = [] with open(filename,'r') as f: count = 0 for line in f: text.append(line.strip()) count = count + 1 if(count%10000==1): print(count) return text def search(searcher,qtext):
            doc.get("title").encode('gbk')
        ])
    # sort result
    # NOTE(review): this is the tail of ``run`` — its head lies outside this
    # view; it emits the collected (name, owner, title) rows sorted by name.
    results.sort(lambda x, y: cmp(x[0], y[0]))
    for name, owner, title in results:
        print name, owner, title


def test_fixture():
    # Point BOARDSPATH at the working directory for local testing.
    global BOARDSPATH
    BOARDSPATH = './'


if __name__ == '__main__':
    #test_fixture()
    # Usage: script <board> <query-in-gbk>; exits -1 on missing index or
    # empty query.
    board = sys.argv[1]
    querystr = sys.argv[2].decode('gbk').strip()
    lucene.initVM()
    path = BOARDSPATH + board + '/' + RECENT_INDEX
    if not os.path.exists(path) or len(querystr) == 0:
        sys.exit(-1)
    directory = lucene.SimpleFSDirectory(lucene.File(path))
    searcher = IndexSearcher(directory)
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    run(searcher, analyzer, querystr)
    searcher.close()
def __init__(self, storeDir):
    # Start the JVM (required before any Lucene call), report the binding
    # version, and open ``storeDir`` as the on-disk index directory.
    lucene.initVM()
    print 'lucene', lucene.VERSION
    self.dir = lucene.SimpleFSDirectory(lucene.File(storeDir))
#OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, #SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT #LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, #DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY #THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT #(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE #OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ Creates a sample Lucene index for the full-text search feature. """ import lucene import sys if __name__ == "__main__": lucene.initVM() indexDir = "D:/Downloads/index" dir_ = lucene.SimpleFSDirectory(lucene.File(indexDir)) analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT) writer = lucene.IndexWriter(dir_, analyzer, True, lucene.IndexWriter.MaxFieldLength(512)) print("Currently there are %d documents in the index..." % writer.numDocs()) content = ( "Strategische Konzeption, Umsetzung und Betreuung von langfristig " + "hochwirksamen und messbar erfolgreichen Maßnahmen in Social Media.") doc = lucene.Document() doc.add( lucene.Field("content", content, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED)) doc.add(
#!/usr/bin/python #coding: utf-8 #建索引的文件 import lucene import csv index_dir = '../../data/index/' data_dir = '../../data/corpus.csv' lucene.initVM() directory = lucene.SimpleFSDirectory(lucene.File(index_dir)) analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT) def build_index(): f = open(data_dir) reader = csv.reader(f) print("开始创建索引") indx = 0 writer = lucene.IndexWriter(directory, analyzer, True, lucene.IndexWriter.MaxFieldLength.UNLIMITED) for line in reader: eng, zh = line[0], line[1] doc = lucene.Document()
import lucene
import sys
sys.path.append("..")
import util
from util.rake import Rake

print("load vm")
index_dir = '../../data/index/'
# Directory of collocation data (translated from the Chinese original).
location_dir = '../../data/location/'
lucene.initVM()
directory = lucene.SimpleFSDirectory(lucene.File(index_dir))
analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
directory1 = lucene.SimpleFSDirectory(lucene.File(location_dir))
analyzer1 = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
rake = Rake("../../data/SmartStoplist.txt")


def search(word):
    # Attach the current thread to the already-running JVM before any Lucene
    # call. NOTE(review): the function body is truncated in this view — it
    # continues past the IndexSearcher creation.
    print("searching ")
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    searcher = lucene.IndexSearcher(directory, True)
def index_ontology_files(oboFile, outDir, xref_map):
    """
    Create a Lucene index for a single ontology file.

    The index is written to ``outDir/<basename-of-oboFile>``; raises
    ExistingIndexDirectoryException if that directory already exists.
    Obsolete terms are skipped.
    """
    lucene.initVM()
    analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

    # One index directory per ontology file, named after the file.
    filename, _ext = os.path.splitext(os.path.basename(oboFile))
    indexDir = os.path.join(outDir, filename)
    if os.path.exists(indexDir):
        raise ExistingIndexDirectoryException(
            'Error, attempted to index same file twice or index two files named the same'
        )
    # Renamed from ``dir`` to avoid shadowing the builtin of the same name.
    store = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = lucene.IndexWriter(store, analyzer, True,
                                lucene.IndexWriter.MaxFieldLength(512))

    for term in oboparser.parse(oboFile, ['is_a']):
        if term.obsolete:
            continue
        doc = lucene.Document()
        add_field_to_document(doc, "term id", term.id, lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED)
        # The name is the primary lookup field, so boost it heavily.
        add_field_to_document(doc, "name", term.name, lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 4.0)
        # Definition text often contains URLs/hyperlinks that could produce
        # query hits we would not want, errantly increasing the field score;
        # strip them and index just the prose, slightly down-weighted.
        add_field_to_document(doc, "definition",
                              strip_urls_from_text(term.definition),
                              lucene.Field.Store.YES,
                              lucene.Field.Index.ANALYZED, 0.4)
        # Synonyms, relationships, xrefs, subsets, and alternate IDs are all
        # lists on the Ontology object and are entered one value at a time.
        add_fields_to_document(doc, "synonym",
                               [x[0] for x in term.synonyms if x],
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED, 0.7)
        add_fields_to_document(doc, "alt_id", term.alternateIds,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        add_fields_to_document(
            doc, "xref",
            [replace_xref_identifier(x, xref_map) for x in term.xrefs],
            lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)
        add_fields_to_document(
            doc, "relationship",
            [" ".join(list(x)) for x in list(term.relationships)],
            lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED)
        add_fields_to_document(doc, "subset", term.subsets,
                               lucene.Field.Store.NO,
                               lucene.Field.Index.ANALYZED)
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def __init__(self, dir_file_path):
    """Boot the JVM and open a searcher over the index at ``dir_file_path``."""
    lucene.initVM()
    index_store = lucene.SimpleFSDirectory(lucene.File(dir_file_path))
    self.directory = index_store
    self.analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_30)
    self.search = lucene.IndexSearcher(self.directory)