def luceneIndexer(contents):
    lucene.initVM()
    INDEXIDR = settings.INDEX_DIR
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in contents:
        print "Indexing: ", tfile
        document = Document()
        content = tfile.getvalue()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
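A minimal usage sketch for luceneIndexer() above: it calls tfile.getvalue() on each item, so contents is expected to hold StringIO-like objects rather than file paths, and settings.INDEX_DIR must point at a writable directory. The sample strings here are made up.

from StringIO import StringIO

# Hypothetical sample documents; any object with a .getvalue() method works.
docs = [StringIO("the quick brown fox"), StringIO("jumps over the lazy dog")]
luceneIndexer(docs)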
def configure_lucene():
    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    indexDir = "/tmp/luceneindex"  # was commented out, but is used below
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
    print >> sys.stderr, "Reading lines from clique.txt..."
    for line in f:
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    print >> sys.stderr, "Indexed lines from clique.txt (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
def luceneIndexer(docdir, indir):
    """Index documents from a directory.

    Args:
        docdir: folder containing the documents
        indir: folder in which the index is stored
    Returns:
        nothing
    Notes:
        FieldType().setStored    = as-is value stored in the Lucene index
        FieldType().setTokenized = field is analyzed using the specified
                                   Analyzer - the tokens emitted are indexed
        FieldType().Indexed      = the text (either as-is with keyword fields,
                                   or the tokens from tokenized fields) is made
                                   searchable (aka inverted)
        FieldType().Vectored     = term frequency per document is stored in the
                                   index in an easily retrievable fashion
    """
    """# Type 1 attributes: for fields that must be searchable and also
    # returned for display, use setStored(True).
    type1 = FieldType()
    type1.setIndexed(True)
    type1.setStored(True)
    type1.setTokenized(False)
    type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    # Type 2 attributes: for fields that must be searchable but need not be
    # returned for display. The text content (content) is of this kind, as is,
    # typically, a file's META information.
    type2 = FieldType()
    type2.setIndexed(True)
    type2.setStored(False)
    type2.setTokenized(True)
    type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)"""
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    # Construct a new index writer with the given analyzer.
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing:", tfile
        document = Document()
        content = open(tfile, 'r').read()
        # How the types above would be used:
        # document.add(Field("path", tfile, type1))
        # Add a field to the document: {name: "text", store: YES, index: ANALYZED}
        document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
        document.add(Field("path", tfile, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
def Indexer(docdir, indir):
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXDIR = indir
    # SimpleFSDirectory, not the abstract FSDirectory, is instantiable.
    indexdir = SimpleFSDirectory(File(INDEXDIR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing ", tfile
        document = Document()
        content = open(tfile, 'r').read()
        document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done"
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
def lucene_index(input_folder, output_folder):
    '''
    Indexes fresh text data using Lucene 3.6. Does not support incremental
    index generation as of now. Currently crashes on neo by running out of
    heap space.
    Arguments: input folder for text files,
               output folder for the index location.
    Returns: void. The index is stored if generated.
    '''
    # Make the output directory first so the log file can be created inside it.
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    # Set up the log file.
    logging.basicConfig(filename=os.path.join(output_folder, "lucene_index.log"))
    logger = logging.getLogger(__name__)
    logger.info("Input directory for indexing: " + input_folder)
    logger.info("Output directory of index: " + output_folder)
    # Set up Lucene's heap size for the index and the version of the indexer.
    lucene.initVM(initialheap='1024m', maxheap='2048m')
    index_folder = SimpleFSDirectory(File(output_folder))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(index_folder, analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    # Optimization to reduce heap space usage while generating the index:
    # merge the buffer with the current index after 15 docs.
    writer.setMergeFactor(15)
    writer.setRAMBufferSizeMB(32.0)
    # Search to find the files to index.
    files_to_index = find_files_in_folder(input_folder)
    for input_file in files_to_index:
        doc = Document()
        content = open(input_file, 'r').read()
        # Do not store the text. Only index it.
        doc.add(Field("text", content, Field.Store.NO, Field.Index.ANALYZED))
        # Store the path to assist in retrieving the file.
        doc.add(Field("path", input_file, Field.Store.YES, Field.Index.NO))
        writer.addDocument(doc)  # Index
    logger.info("Indexed lines from " + input_folder +
                " (%d documents in index)" % writer.numDocs())
    logger.info("About to optimize index of %d documents..." % writer.numDocs())
    writer.optimize()  # Compress the index
    logger.info("...done optimizing index of %d documents" % writer.numDocs())
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()
    logger.info("Closed index")
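find_files_in_folder() is referenced above but not defined in this snippet. A minimal sketch of what it presumably does, under the assumption that every regular file beneath input_folder should be indexed:

import os

def find_files_in_folder(folder):
    # Walk the tree and collect the path of every regular file.
    paths = []
    for root, _dirs, files in os.walk(folder):
        for name in files:
            paths.append(os.path.join(root, name))
    return paths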
def luceneIndexer(docdir, indir):
    """Index documents from a directory."""
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing: ", tfile
        document = Document()
        content = open(tfile, 'r').read()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
def index(string):
    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
    doc = Document()
    doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    print >> sys.stderr, "Indexed the given string (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
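The index built by index() can be read back with IndexSearcher and QueryParser from the same PyLucene 3.x API. A minimal sketch, assuming the same REMOVEME.index-dir location, that both classes have been imported from lucene alongside the ones used above, and that initVM() has already been called once in the process:

def search(querystring):
    dir = SimpleFSDirectory(File("REMOVEME.index-dir"))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    searcher = IndexSearcher(dir)
    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(querystring)
    for hit in searcher.search(query, 10).scoreDocs:
        doc = searcher.doc(hit.doc)
        print doc.get("text")  # available because the field was stored
    searcher.close()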
def index(cls, indexDir, dataDir):
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError, "%s does not exist or is not a directory" % (dataDir)
    dir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(False)
    cls.indexDirectory(writer, dataDir)
    numIndexed = writer.numDocs()
    writer.commit()
    writer.close()
    dir.close()
    return numIndexed
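cls.indexDirectory() is assumed to exist on the same class and is not shown here. A minimal sketch of such a helper, assuming one document per *.txt file; the field names are illustrative, not taken from the original:

def indexDirectory(cls, writer, dataDir):
    # Index the body of each file without storing it; store the path so
    # matching files can be located again at search time.
    for path in glob.glob(os.path.join(dataDir, '*.txt')):
        doc = Document()
        doc.add(Field("contents", open(path, 'r').read(),
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("path", path, Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)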
def index(cls, indexDir, dataDir):
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError, "%s does not exist or is not a directory" % (dataDir)
    dir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(False)
    cls.indexDirectory(writer, dataDir)
    numIndexed = writer.numDocs()
    writer.optimize()
    writer.close()
    dir.close()
    return numIndexed
def luceneIndexer(docdir, indir):
    """Index documents from a directory.

    Args:
        docdir: the path of the txt files
        indir: the path of the index files generated by the code below
    """
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    # for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
    for name in os.listdir(DIRTOINDEX):
        tfile = os.path.join(DIRTOINDEX, name)
        if os.path.isfile(tfile):
            print("Indexing: ", tfile)
            document = Document()
            content = open(tfile, 'r').read()
            document.add(Field("text", content, Field.Store.YES,
                               Field.Index.ANALYZED))
            # Use the file name without its extension as the title;
            # str.strip('.txt') strips characters, not the suffix.
            title = os.path.splitext(os.path.basename(tfile))[0]
            document.add(Field("title", title, Field.Store.YES,
                               Field.Index.ANALYZED))
            index_writer.addDocument(document)
            print("Done: ", tfile)
    index_writer.optimize()
    print(index_writer.numDocs())
    index_writer.close()
def testIndexWriter(self):
    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    self.assertEqual(len(self.keywords), writer.numDocs())
    writer.close()
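This test relies on a fixture that has already built an index containing one document per entry in self.keywords. A minimal sketch of a matching setUp, with the RAMDirectory, keyword list, and field name purely illustrative:

def setUp(self):
    # Hypothetical fixture: index one document per keyword in memory.
    self.dir = RAMDirectory()
    self.keywords = ["apple", "banana", "cherry"]
    writer = IndexWriter(self.dir, self.getAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for kw in self.keywords:
        doc = Document()
        doc.add(Field("keyword", kw, Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
    writer.close()

def getAnalyzer(self):
    return StandardAnalyzer(Version.LUCENE_30)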
from htmlparser import parsehtml
import lucene
import os
from lucene import SimpleFSDirectory, System, File, Document, Field, \
    StandardAnalyzer, IndexWriter, Version

if __name__ == "__main__":
    lucene.initVM()
    src_dir = "html_files"
    indexDir = "index"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    print("Currently there are %d documents in the index..." % writer.numDocs())
    print("Reading files from directory...")
    i = 0
    for l in os.listdir(src_dir):
        l = os.path.join(src_dir, l)
        with open(l, 'r') as myfile:
            data = myfile.read()
        document, errors = parsehtml(data)
        print(l)
        i += 1
        doc = Document()
        doc.add(Field("text", document, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    print("Indexed %d files (%d documents in index)" % (i, writer.numDocs()))
    print("About to optimize index of %d documents..." % writer.numDocs())
    writer.optimize()
    print("...done optimizing index of %d documents" % writer.numDocs())
    numDocs = writer.numDocs()  # numDocs() cannot be called on a closed writer
    print("Closing index of %d documents..." % numDocs)
    writer.close()
    print("...done closing index of %d documents" % numDocs)
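parsehtml() comes from a local htmlparser module that is not shown; judging by the call site it returns a (text, errors) pair. A crude stand-in, under the assumption that plain tag stripping is good enough for indexing:

import re

def parsehtml(data):
    # Strip tags, collapse whitespace, and report no parse errors.
    text = re.sub(r'<[^>]+>', ' ', data)
    return ' '.join(text.split()), []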
INDEXDIR = "texts.index"
dir = SimpleFSDirectory(File(INDEXDIR))
analyzer = SimpleAnalyzer(Version.LUCENE_35)
writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
conn = psycopg2.connect("dbname=texts user=swasheck")
cur = conn.cursor()
cur.execute("select reference, version_id, analysis_text from verse;")
for verse in cur.fetchall():
    print "Adding %s (version=%s)" % (verse[0], verse[1])
    doc = Document()
    doc.add(Field("reference", verse[0], Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("version", str(verse[1]), Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("text", verse[2], Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
print 'Optimizing the index of %d documents...' % writer.numDocs()
writer.optimize()
print 'Closing the index'
writer.close()

'''
INDEXDIR = "greek.texts.index"
dir = SimpleFSDirectory(File(INDEXDIR))
el_analyzer = GreekAnalyzer(Version.LUCENE_35)
analyzer = SimpleAnalyzer(Version.LUCENE_35)
writer = IndexWriter(dir, el_analyzer, True, IndexWriter.MaxFieldLength(512))
conn = psycopg2.connect("dbname=texts user=swasheck")
cur = conn.cursor()
cur.execute("select reference, version_id, analysis_text from verse where version_id in (2,3);")
for verse in cur.fetchall():
    print "Adding %s (version=%s)" % (verse[0], verse[1])
    doc = Document()
'''
import json
import lucene
from lucene import (SimpleFSDirectory, System, File, Document, Field,
                    StandardAnalyzer, IndexWriter, IndexSearcher, QueryParser)

if __name__ == "__main__":
    lucene.initVM()
    fullIndexDir = r"c:\NLP\PhD\bob\fileDB\LuceneFullIndex"
    print "lucene version is:", lucene.VERSION
    fullIndex = SimpleFSDirectory(File(fullIndexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(fullIndex, analyzer, True,
                         IndexWriter.MaxFieldLength(20000000))
    ## writer = IndexWriter(store, analyzer, True, IndexWriter.MaxFieldLength(512))
    print "Currently there are %d documents in the index..." % writer.numDocs()
    ## print "Reading lines from sys.stdin..."
    lines = ["bla bla bla bla bla",
             "Erase una vez que se era",
             "En un lugar de La Mancha de cuyo nombre no quiero acordarme, "
             "no ha mucho que vivia un hidalgo de los de lanza en ristre",
             "Manchame mancha mancha que te mancha la mancha"]
    for l in lines:
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        metadata = {"asdfa": "asdfa"}
        json_metadata = json.dumps(metadata)
        doc.add(Field("metadata", json_metadata, Field.Store.YES, Field.Index.NO))
        writer.addDocument(doc)
    print "Indexed %d lines (%d documents in index)" % (len(lines), writer.numDocs())
    print "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
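Because the metadata field above is stored but not indexed (Field.Index.NO), it cannot be queried directly, but it comes back verbatim with any hit and can be decoded with json.loads. A minimal retrieval sketch against the index built above, assuming the writer has been closed first:

searcher = IndexSearcher(fullIndex)
query = QueryParser(lucene.Version.LUCENE_CURRENT, "text", analyzer).parse("mancha")
for hit in searcher.search(query, 10).scoreDocs:
    doc = searcher.doc(hit.doc)
    print doc.get("text"), json.loads(doc.get("metadata"))
searcher.close()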
indexDir = "/Tmp/REMOVEME.index-dir"
dir = SimpleFSDirectory(File(indexDir))
analyzer = StandardAnalyzer(Version.LUCENE_30)
writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

# set variables that affect speed of indexing
# writer.setMergeFactor(int(argv[2]))
# writer.setMaxMergeDocs(int(argv[3]))
# writer.setMaxBufferedDocs(int(argv[4]))
#
# writer.infoStream = System.out
#
# print "Merge factor: ", writer.getMergeFactor()
# print "Max merge docs:", writer.getMaxMergeDocs()
# print "Max buffered docs:", writer.getMaxBufferedDocs()

print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
i = 0
print >> sys.stderr, "Reading lines from sys.stdin..."
for l in sys.stdin:
    i += 1
    if l.strip() == "":
        continue
    doc = Document()
    doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    if i % 10000 == 0:
        print >> sys.stderr, "Read %d lines from stdin (%d documents in index)..." % (i, writer.numDocs())
print >> sys.stderr, stats()