def luceneIndexer(contents): lucene.initVM() INDEXIDR= settings.INDEX_DIR indexdir= SimpleFSDirectory(File(INDEXIDR)) analyzer= StandardAnalyzer(Version.LUCENE_30) index_writer= IndexWriter(indexdir,analyzer,True,\ IndexWriter.MaxFieldLength(512)) for tfile in contents: print"Indexing: ", tfile document= Document() content= tfile.getvalue() document.add(Field("text",content,Field.Store.YES,\ Field.Index.ANALYZED)) index_writer.addDocument(document) print"Done: ", tfile index_writer.optimize() print index_writer.numDocs() index_writer.close()
def configure_lucene(): f = open('clique.txt', 'r') lucene.initVM() print 'Inside Function' #indexDir = "/tmp/luceneindex" dir = SimpleFSDirectory(File(indexDir)) analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT) writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512)) print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs( ) print >> sys.stderr, "Reading lines from sys.stdin..." for line in f: line = line.replace('\t', '') line = line.replace('\r', '') line = line.replace('\n', '') line = line.replace('^', '') line = line.strip() doc = Document() doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % ( writer.numDocs()) print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs( ) writer.optimize() print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs( ) print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs() writer.close()
def setUp(self):
    """Build two RAM indexes of animal names, split alphabetically (a-m / n-z)."""
    animals = [
        "aardvark", "beaver", "coati", "dog", "elephant", "frog",
        "gila monster", "horse", "iguana", "javelina", "kangaroo",
        "lemur", "moose", "nematode", "orca", "python", "quokka",
        "rat", "scorpion", "tarantula", "uromastyx", "vicuna",
        "walrus", "xiphias", "yak", "zebra"
    ]
    analyzer = WhitespaceAnalyzer()
    firstHalfDir = RAMDirectory()
    secondHalfDir = RAMDirectory()
    firstWriter = IndexWriter(firstHalfDir, analyzer, True,
                              IndexWriter.MaxFieldLength.UNLIMITED)
    secondWriter = IndexWriter(secondHalfDir, analyzer, True,
                               IndexWriter.MaxFieldLength.UNLIMITED)
    for name in animals:
        doc = Document()
        doc.add(Field("animal", name,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        # Route by first letter: a-m into the first index, n-z into the second.
        if name[0].lower() < "n":
            firstWriter.addDocument(doc)
        else:
            secondWriter.addDocument(doc)
    firstWriter.close()
    secondWriter.close()
    # One searcher per half, consumed by multi-searcher tests.
    self.searchers = [IndexSearcher(firstHalfDir),
                      IndexSearcher(secondHalfDir)]
def main(cls, argv): if len(argv) < 5: print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>" return docsInIndex = int(argv[1]) # create an index called 'index-dir' in a temp directory indexDir = os.path.join(tempfile.gettempdir(), 'index-dir') dir = FSDirectory.open(indexDir,) analyzer = SimpleAnalyzer() writer = IndexWriter(dir, analyzer, True) # set variables that affect speed of indexing writer.setMergeFactor(int(argv[2])) writer.setMaxMergeDocs(int(argv[3])) writer.setMaxBufferedDocs(int(argv[4])) # writer.infoStream = tempfile.out print "Merge factor: ", writer.getMergeFactor() print "Max merge docs:", writer.getMaxMergeDocs() print "Max buffered docs:", writer.getMaxBufferedDocs() start = time() for i in xrange(docsInIndex): doc = Document() doc.add(Field("fieldname", "Bibamus", Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) writer.close() print "Time: ", timedelta(seconds=time() - start)
def index(source, indexName): if(not os.path.exists(indexName)): os.mkdir(indexName) indexDir = File(indexName) writer = IndexWriter(SimpleFSDirectory(File(indexName)),StandardAnalyzer(Version.LUCENE_CURRENT), True,IndexWriter.MaxFieldLength.LIMITED) p = re.compile("(GH\d+\-\d+)\n(.*?)\n+", re.DOTALL) res = p.findall(source) i = 0 for pair in res: i += 1 doc = Document() doc.add(Field("id", pair[0], Field.Store.YES, Field.Index.NO)) for t in pair[1].split(): doc.add(Field("content", t.replace("-","_"), Field.Store.NO, Field.Index.NOT_ANALYZED)); #doc.add(Field("content", pair[1], Field.Store.NO, Field.Index.ANALYZED)); writer.addDocument(doc) writer.close() print str(i)+ " docs indexed"
def setUp(self):
    """Index two fox sentences and prebuild one SpanTermQuery per word of interest."""
    self.directory = RAMDirectory()
    self.analyzer = WhitespaceAnalyzer()
    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for sentence in ("the quick brown fox jumps over the lazy dog",
                     "the quick red fox jumps over the sleepy cat"):
        doc = Document()
        doc.add(Field("f", sentence, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.close()
    self.searcher = IndexSearcher(self.directory, True)
    self.reader = IndexReader.open(self.directory, True)
    # Expose each query as an attribute: self.quick, self.brown, ...
    for word in ("quick", "brown", "red", "fox",
                 "lazy", "sleepy", "dog", "cat"):
        setattr(self, word, SpanTermQuery(Term("f", word)))
def createIndex():
    """Parse every file under html_files/ and index its extracted text."""
    print("started indexer")
    # Initialize lucene and the JVM.
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    dir = lucene.SimpleFSDirectory(lucene.File(indexDir))
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    src_dir = 'html_files'
    count = 0
    for name in os.listdir(src_dir):
        path = os.path.join(src_dir, name)
        with open(path, 'r') as handle:
            raw = handle.read()
        count += 1
        # parsehtml strips the markup; errors are ignored here.
        text, errors = parsehtml(raw)
        doc = Document()
        doc.add(Field("text", text, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def setUp(self):
    """Create a two-document RAM index and a set of span-query fixtures."""
    self.directory = RAMDirectory()
    self.analyzer = WhitespaceAnalyzer()
    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    sentences = ["the quick brown fox jumps over the lazy dog",
                 "the quick red fox jumps over the sleepy cat"]
    for text in sentences:
        doc = Document()
        doc.add(Field("f", text, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.close()
    self.searcher = IndexSearcher(self.directory, True)
    self.reader = IndexReader.open(self.directory, True)
    # One SpanTermQuery per vocabulary word, bound as an attribute.
    words = ["quick", "brown", "red", "fox", "lazy", "sleepy", "dog", "cat"]
    for w in words:
        setattr(self, w, SpanTermQuery(Term("f", w)))
def main(cls, argv): if len(argv) < 5: print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>" return docsInIndex = int(argv[1]) # create an index called 'index-dir' in a temp directory indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'), 'index-dir') dir = FSDirectory.getDirectory(indexDir, True) analyzer = SimpleAnalyzer() writer = IndexWriter(dir, analyzer, True) # set variables that affect speed of indexing writer.setMergeFactor(int(argv[2])) writer.setMaxMergeDocs(int(argv[3])) writer.setMaxBufferedDocs(int(argv[4])) # writer.infoStream = System.out print "Merge factor: ", writer.getMergeFactor() print "Max merge docs:", writer.getMaxMergeDocs() print "Max buffered docs:", writer.getMaxBufferedDocs() start = time() for i in xrange(docsInIndex): doc = Document() doc.add( Field("fieldname", "Bibamus", Field.Store.YES, Field.Index.TOKENIZED)) writer.addDocument(doc) writer.close() print "Time: ", timedelta(seconds=time() - start)
def index(string):
    """Append *string* as one document to REMOVEME.index-dir.

    Opens the index in append mode; when the index does not exist yet the
    open raises a JavaError and the writer is reopened in create mode.
    """
    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    store = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    try:
        writer = IndexWriter(store, analyzer, False,
                             IndexWriter.MaxFieldLength(512))
    except lucene.JavaError:
        # No existing index: create a fresh one.
        writer = IndexWriter(store, analyzer, True,
                             IndexWriter.MaxFieldLength(512))
    doc = Document()
    doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
def indexfeeds(self,writer): """ """ feedlist=['http://today.reuters.com/rss/topNews', 'http://today.reuters.com/rss/domesticNews', 'http://today.reuters.com/rss/wordNews', 'http://rss.cnn.com/rss/edition.rss', 'http://rss.cnn.com/rss/edition_word.rss', 'http://rss.cnn.com/rss/edition_us.rss'] articletitles=[] for feed in feedlist: f=feedparser.parse(feed) for e in f.entries: if e.title in articletitles: continue contents = e.title.encode('utf8') + self.strphtml(e.description.encode('utf8')) try: doc = Document() doc.add(Field("name", e.title, Field.Store.YES, Field.Index.NOT_ANALYZED)) if len(contents) > 0: doc.add(Field("contents", contents, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES)) writer.addDocument(doc) except Exception, e: print 'Unable to index'
def configure_lucene(): f = open('clique.txt','r') lucene.initVM() print 'Inside Function' #indexDir = "/tmp/luceneindex" dir = SimpleFSDirectory(File(indexDir)) analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT) writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512)) print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs() print >> sys.stderr, "Reading lines from sys.stdin..." for line in f: line = line.replace('\t','') line = line.replace('\r','') line = line.replace('\n','') line = line.replace('^','') line = line.strip() doc = Document() doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs()) print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs() writer.optimize() print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs() print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs() writer.close()
def run(self):
    """Consume the Twitter sample stream, indexing screen name and text.

    Any per-tweet failure (missing keys, writer errors) is swallowed so
    the stream keeps flowing.
    """
    env.attachCurrentThread()
    stream = tweetstream.SampleStream("username", "password")
    for tweet in stream:
        try:
            body = unicode(tweet['text'])
            author = tweet['user']['screen_name']
            doc = Document()
            doc.add(Field("user_name", author,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            if len(body) > 0:
                doc.add(Field("contents", body,
                              Field.Store.YES, Field.Index.ANALYZED))
            self.writer.addDocument(doc)
            # Optimize for fast search and commit so the tweet is
            # immediately visible to searchers.
            self.writer.optimize()
            self.writer.commit()
        except Exception as e:
            pass
def add_new_document_with_metadata(writer,filepath,fieldnames,values): file = open(filepath) contents = unicode(file.read(), 'UTF-8') file.close() doc = Document() # add name, path, and contents fields doc.add(Field("name", os.path.basename(filepath), Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add(Field("path", os.path.realpath(filepath), Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add(Field("txtorg_id", str(uuid.uuid1()), Field.Store.YES, Field.Index.NOT_ANALYZED)) if len(contents) > 0: doc.add(Field("contents", contents, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES)) else: print "warning: no content in %s" % filename for idx in range(len(fieldnames)): doc.add(Field(fieldnames[idx].lower(),values[idx].lower(),Field.Store.YES,Field.Index.NOT_ANALYZED)) writer.addDocument(doc)
def reindex_all(reader, writer, analyzer): for i in xrange(reader.maxDoc()): if reader.isDeleted(i): continue doc = reader.document(i) p = doc.get("path") pkid = doc.get('txtorg_id') if p is None: # No filepath specified, just use original document writer.updateDocument(Term("txtorg_id",pkid),doc,analyzer) else: # if a path field is found, try to read the file it points to and add a contents field edited_doc = Document() for f in doc.getFields(): edited_doc.add(Field.cast_(f)) try: inf = open(p) contents = unicode(inf.read(), 'UTF-8') inf.close() if len(contents) > 0: edited_doc.add(Field("contents", contents, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES)) else: print "warning: no content in %s" % filename except: print "Could not read file; skipping" writer.updateDocument(Term("txtorg_id",pkid),edited_doc,analyzer)
def _addDoc(self, text, writer):
    """Add one document to the index, storing *text* under the analyzed
    field named "field"."""
    stored = Field("field", text, Field.Store.YES, Field.Index.ANALYZED)
    doc = Document()
    doc.add(stored)
    writer.addDocument(doc)
def addCrowd(self, id, text):
    """Insert or replace the crowd document keyed by *id*."""
    idField = Field(CrowdFields.id, id,
                    Field.Store.YES, Field.Index.NOT_ANALYZED)
    textField = Field(CrowdFields.text, text,
                      Field.Store.YES, Field.Index.ANALYZED)
    doc = Document()
    doc.add(idField)
    doc.add(textField)
    # updateDocument removes any existing doc with this id, then adds.
    self.writer.updateDocument(Term(CrowdFields.id, id), doc)
def addContents(self,contents): try: #iwconfig = IndexWriterConfig(SimpleAnalyzer(),IndexWriter.MaxFieldLength.LIMITED) writer = IndexWriter(self.ramIndex,SimpleAnalyzer(Version.LUCENE_CURRENT),True,IndexWriter.MaxFieldLength.LIMITED) for content in contents: doc = Document() doc.add(Field("contents",content[1],Field.Store.NO,Field.Index.ANALYZED,Field.TermVector.YES)) writer.addDocument(doc) writer.close() except Exception,e: print 'Unable to add content to RAM index'
def indexSingleFieldDocs(self, fields):
    """Write one single-field document per Field, then optimize and close."""
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for single in fields:
        wrapper = Document()
        wrapper.add(single)
        writer.addDocument(wrapper)
    writer.optimize()
    writer.close()
def indexSingleFieldDocs(self, fields):
    """Write one single-field document per Field, commit, and close."""
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for single in fields:
        wrapper = Document()
        wrapper.add(single)
        writer.addDocument(wrapper)
    # Commit (rather than optimize) so the docs become visible.
    writer.commit()
    writer.close()
def setUp(self):
    """Create a one-document RAM index analyzed with the porter analyzer."""
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, self.porterAnalyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    sentence = "The quick brown fox jumps over the lazy dogs"
    doc = Document()
    doc.add(Field("contents", sentence,
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()
def setUp(self):
    """Index a single sample sentence and open a searcher over it."""
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    sample = Document()
    sample.add(Field("field",
                     "the quick brown fox jumped over the lazy dog",
                     Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(sample)
    writer.close()
    self.searcher = IndexSearcher(directory)
def setUp(self):
    """Index one document per owner (elwood, jake) with owner + keywords fields."""
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    records = [("elwood", "elwoods sensitive info"),
               ("jake", "jakes sensitive info")]
    for owner, keywords in records:
        document = Document()
        document.add(Field("owner", owner,
                           Field.Store.YES, Field.Index.NOT_ANALYZED))
        document.add(Field("keywords", keywords,
                           Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(document)
    writer.close()
def index(self,path_to_index,path_files): 'indexes anchor texts from a given folder' #lucene.initVM() indexDir = path_to_index directory_index = SimpleFSDirectory(File(indexDir)) analyzer = StandardAnalyzer(Version.LUCENE_35) writer = IndexWriter(directory_index, analyzer, True, IndexWriter.MaxFieldLength(512)) listOfPathes = [] listOfPathes.extend(glob.glob(path_files+"*.txt")) counter = 0 for path_to_file in listOfPathes: print path_to_file f = open(path_to_file,"r") for line in f: entry = line.split("\t") counter+=1 """ optimizes index after a certain amount of added documents """ if counter%500000==0: print counter writer.optimize() doc = Document() doc.add(Field("anchor", entry[0], Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("anchor_uri", entry[1], Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("dbpedia_uri", entry[2], Field.Store.YES, Field.Index.ANALYZED)) doc.add(Field("number", entry[3].replace("\n",""), Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(doc) writer.optimize() f.close() writer.close() print counter print "done"
def indexDocs(self, root, writer): for root, dirnames, filenames in os.walk(root): for filename in filenames: if not filename.endswith('.txt'): continue print "adding", filename try: path = os.path.join(root, filename) file = open(path) for line in file: doc = Document() arr = line.split('\t') field = Field("name", arr[2].lower(), Field.Store.YES, Field.Index.TOKENIZED) field.setBoost(1.5) doc.add(field) doc.add( Field("alternate_names", arr[3].lower(), Field.Store.YES, Field.Index.TOKENIZED)) doc.add( Field("state", arr[10].lower(), Field.Store.YES, Field.Index.TOKENIZED)) doc.add( Field("population", arr[14], Field.Store.YES, Field.Index.UN_TOKENIZED)) if int(arr[14]) > 1000000: doc.setBoost(1.2) writer.addDocument(doc) file.close() except Exception, e: print "Failed in indexDocs:", e
def luceneIndexer(docdir,indir): """ IndexDocuments from a directory. Args: docdir:文档所在文件夹 indir:索引存放文件夹 Returns: 无返回值 说明: FieldType().setStored=as-is value stored in the Lucene index FieldType().setTokenized=field is analyzed using the specified Analyzer - the tokens emitted are indexed FieldType().Indexed = the text (either as-is with keyword fields, or the tokens from tokenized fields) is made searchable (aka inverted) FieldType().Vectored = term frequency per document is stored in the index in an easily retrievable fashion. """ """#类型1属性:对于需要检索,需要返回显示setStored(True) type1 = FieldType() type1.setIndexed(True) type1.setStored(True) type1.setTokenized(False) type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) #类型2属性:对于不用返回显示,但是需要进行检索的字段。这里我认为文本内容(content)是这一种的,通常例如文件的META信息。 type2 = FieldType() type2.setIndexed(True) type2.setStored(False) type2.setTokenized(True) type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)""" lucene.initVM() DIRTOINDEX= docdir INDEXIDR= indir indexdir= SimpleFSDirectory(File(INDEXIDR)) analyzer= StandardAnalyzer(Version.LUCENE_30) #用指定的语言分析器构造一个新的写索引器. index_writer= IndexWriter(indexdir,analyzer,True,IndexWriter.MaxFieldLength(512)) for tfile in glob.glob(os.path.join(DIRTOINDEX,'*.txt')): #print "Indexing: " print "Indexing:", tfile; document = Document() content = open(tfile,'r').read() #类型使用方式 #doc.add(Field("path", tfile, type1)) #文档新增字段(Field){字段名:"text",存储:“YES”,索引:"YES"} document.add(Field("text",content,Field.Store.YES,Field.Index.ANALYZED)) document.add(Field("path",tfile,Field.Store.YES,Field.Index.ANALYZED)) index_writer.addDocument(document) print "Done: ", tfile index_writer.optimize() print index_writer.numDocs() index_writer.close()
def do_index(): initVM() indexDir = "/home/william/woyaoo/luceneindex" version = Version.LUCENE_CURRENT standardAnalyzer = StandardAnalyzer(version) # chineseAnalyzer = CJKAnalyzer(version) engine = data.engine_from_config("indexdb.config") # engine = data.engine_from_config() db = data.init_datafactory(engine) docs = dbfactory.Session().query(doc_model.Doc).filter(doc_model.Doc.dateCreated > "20121220").all() print len(docs) idxDir = SimpleFSDirectory(File(indexDir)) perIndexCount = 5000 writer = IndexWriter(idxDir, standardAnalyzer, True, IndexWriter.MaxFieldLength(512)) # add field for doc in docs: # print repr(doc.description) lucenedoc = Document() descriptionValue = doc.description.strip("\r\n").encode("UTF-8") # descriptionValue ='中国 abc' print repr(descriptionValue) lucenedoc.add(Field("url", doc.url, Field.Store.YES, Field.Index.NOT_ANALYZED)) lucenedoc.add(Field("intent", doc.intent, Field.Store.YES, Field.Index.NOT_ANALYZED)) # lucenedoc.add(Field('description', doc.description, Field.Store.YES, Field.Index.ANALYZED)) lucenedoc.add(Field("description", descriptionValue, Field.Store.YES, Field.Index.ANALYZED)) lucenedoc.add(Field("title", doc.title, Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(lucenedoc) writer.optimize() writer.close() print "index finished"
def addDocuments(self, dir):
    """Populate *dir* with one document per word in self.docs.

    Every document carries the same value under four field configurations:
    keyword (stored, unanalyzed), unindexed (stored only), unstored
    (analyzed only) and text (stored + analyzed).
    """
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    # mergeFactor / maxMergeDocs / minMergeDocs could be tuned here to
    # adjust FSDirectory indexing performance.
    for word in self.docs:
        doc = Document()
        doc.add(Field("keyword", word,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def indexDocs(self, root, writer):
    """Load picIndex.txt ('seg^*'-separated records) and index one doc per image.

    Record layout: url, title, src, alt.  Docs are keyed by src; only the
    alt text is analyzed.
    """
    f = codecs.open('picIndex.txt', 'r', encoding='utf-8')
    picDict = {}
    for line in f.xreadlines():
        parts = line.split('seg^*')
        # Later records with the same src overwrite earlier ones.
        picDict[parts[2]] = [parts[0], parts[1], parts[3]]
    f.close()
    for src in picDict:
        url, title, alt = picDict[src]
        doc = Document()
        doc.add(Field("src", src, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("title", title, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("alt", alt, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
def addDocuments(self, dir, isCompound):
    """Index self.docs into *dir*, toggling compound-file format.

    Same four-field layout per word as the sibling addDocuments helper.
    """
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(isCompound)
    # mergeFactor / maxMergeDocs / minMergeDocs could be tuned here to
    # adjust FSDirectory indexing performance.
    for word in self.docs:
        doc = Document()
        doc.add(Field("keyword", word,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def indexDocs(self, root, writer): for root, dirnames, filenames in os.walk(root): for filename in filenames: if not filename.endswith('.txt'): continue print "adding", filename try: path = os.path.join(root, filename) file = open(path) for line in file: doc = Document() arr = line.split('\t') field = Field("name", arr[2].lower(), Field.Store.YES, Field.Index.TOKENIZED) field.setBoost(1.5) doc.add(field) doc.add(Field("alternate_names", arr[3].lower(), Field.Store.YES, Field.Index.TOKENIZED)) doc.add(Field("state", arr[10].lower(), Field.Store.YES, Field.Index.TOKENIZED)) doc.add(Field("population", arr[14], Field.Store.YES, Field.Index.UN_TOKENIZED)) if int(arr[14]) > 1000000: doc.setBoost(1.2) writer.addDocument(doc) file.close() except Exception, e: print "Failed in indexDocs:", e
def setUp(self):
    """Build a one-document sample index and a searcher for the tests."""
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    text = "the quick brown fox jumped over the lazy dog"
    doc = Document()
    doc.add(Field("field", text, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()
    self.searcher = IndexSearcher(directory)
def setUp(self):
    """Index 500 documents whose ids are zero-padded via NumberUtils.pad."""
    self.analyzer = WhitespaceAnalyzer()
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.LIMITED)
    for n in xrange(1, 501):
        doc = Document()
        doc.add(Field("id", NumberUtils.pad(n),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
    writer.close()
def setUp(self):
    """Index one part (Q36) and obtain a searcher via self.getSearcher()."""
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    part = Document()
    part.add(Field("partnum", "Q36",
                   Field.Store.YES, Field.Index.NOT_ANALYZED))
    part.add(Field("description", "Illidium Space Modulator",
                   Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(part)
    writer.close()
    self.searcher = self.getSearcher()
def indexFile(self, writer, path):
    """Index *path* as latin-1 text: reader-fed contents + stored filename.

    Returns the added Document; JavaErrors from opening propagate.
    """
    try:
        stream = InputStreamReader(FileInputStream(path), 'iso-8859-1')
    except JavaError:
        raise
    else:
        doc = Document()
        # A reader-backed field is indexed but never stored.
        doc.add(Field("contents", stream))
        doc.add(Field("filename", os.path.abspath(path),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
        stream.close()
        return doc
def setUp(self):
    """Index one part (Q36) and open a read-only searcher over the index."""
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    part = Document()
    part.add(Field("partnum", "Q36",
                   Field.Store.YES, Field.Index.NOT_ANALYZED))
    part.add(Field("description", "Illidium Space Modulator",
                   Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(part)
    writer.close()
    self.searcher = IndexSearcher(self.directory, True)
def indexFile(self, writer, path):
    """Index an HTML file: reader-based contents field + stored filename.

    Returns the added Document; any read/parse error propagates.
    """
    try:
        handle = open(path)
        text = HTMLReader(InputStreamReader(handle, 'utf-8')).read()
        handle.close()
    except:
        raise
    else:
        doc = Document()
        doc.add(Field("contents", StringReader(text)))
        doc.add(Field("filename", os.path.abspath(path),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
        return doc
def write_index(self, workflow, property=False):
    """Add all keywords from *workflow* to the index.

    When *property* is True, keywords keep their own field names instead
    of being folded into the generic "text" field.  Any previously
    indexed versions of this workflow are deleted first.
    """
    self.ddict = dict()
    # Workflow-level keys; the id doubles as the document identity.
    self.adddd("workflow_id", str(workflow.id))
    self.adddd("text", workflow.id)
    self.adddd("text", workflow.name)
    # workflow.source is deliberately not indexed.
    self.adddd("text", workflow.type)
    self.indexAnnotations(workflow.annotations, property)
    for mod in workflow.modules:
        self.adddd("module_name" if property else "text", mod.name)
        self.adddd("package" if property else "text", mod.package)
        # module.version is deliberately not indexed.
        self.adddd("module_type" if property else "text", mod.type)
        self.indexAnnotations(mod.annotations, property)
        for param in mod.parameters:
            self.adddd("parameter_name" if property else "text", param.name)
            self.adddd("parameter_value" if property else "text", param.value)
            self.adddd("parameter_type" if property else "text", param.type)
            self.indexAnnotations(param.annotations, property)
    for conn in workflow.connections:
        self.adddd("port_name" if property else "text", conn.startPort)
        self.adddd("port_name" if property else "text", conn.endPort)
        self.indexAnnotations(conn.annotations, property)
    # Flush the accumulated key/value pairs into one Lucene document.
    document = Document()
    for (key, value) in self.ddict.iteritems():
        document.add(Field(key, value, self.save, Field.Index.TOKENIZED))
    # Delete old versions, then add the new one.
    WorkflowIndexer.writer.deleteDocuments(
        [Term('workflow_id', str(workflow.id))])
    WorkflowIndexer.writer.addDocument(document)
def lucene_index(input_folder, output_folder):
    '''
    Indexes fresh text data using lucene 3.6.
    Doesn't support incremental generation of index as of now.
    Currently crashes on neo by running out of heap space.
    Arguments: Input folder for text files. output folder for index location
    Returns: void. The index is stored if generated.

    Fixes: logging.basicConfig takes `filename` (the original passed
    `file`, which was silently ignored); `logger` was used but never
    defined; the output directory is created before the log file that
    lives inside it.
    '''
    # Create the output directory first so the log file can be created in it.
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    logging.basicConfig(
        filename=os.path.join(output_folder, "lucene_index.log"))
    logger = logging.getLogger(__name__)
    logger.info("Input directory for logging: " + input_folder)
    logger.info("Output directory of index: " + output_folder)
    # Setting up lucene's heap size for index and version of indexer
    lucene.initVM(initialheap='1024m', maxheap='2048m')
    index_folder = SimpleFSDirectory(File(output_folder))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(index_folder, analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    # Optimization to reduce heap space usage for generation of index.
    # Merges buffer with current index after 15 docs.
    writer.setMergeFactor(15)
    writer.setRAMBufferSizeMB(32.0)
    # Search to find the files to index
    files_to_index = find_files_in_folder(input_folder)
    for input_file in files_to_index:
        doc = Document()
        content = open(input_file, 'r').read()
        # Do not store text. Only index.
        doc.add(Field("text", content, Field.Store.NO, Field.Index.ANALYZED))
        # Store path to assist in retrieving the file
        doc.add(Field("path", input_file, Field.Store.YES, Field.Index.NO))
        writer.addDocument(doc)  # Index
    logger.info("Indexed lines from " + input_folder +
                " (%d documents in index)" % (writer.numDocs()))
    logger.info("About to optimize index of %d documents..." % writer.numDocs())
    writer.optimize()  # Compress index
    logger.info("...done optimizing index of %d documents" % writer.numDocs())
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()
    logger.info("Closed index")
def Indexer(docdir,indir): lucene.initVM() DIRTOINDEX = docdir INDEXDIR = indir indexdir = FSDirectory(File(INDEXDIR)) analyzer = StandardAnalyzer(VERSION.LUCENE_30) index_writer = IndexWriter(indexdir,analyzer,True,IndexWriter.MaxFieldLength(512)) for tfile in glob.glob(os.path.join(DIRTOINDEX,'*.txt')): print "Indexing ",tfile document=Document() content = open(tfile,'r').read() document.add(Field("text",content,Field.Store.YES,Field.Index.ANALYZED)) index_writer.addDocument(document) print "Done" index_writer.optimize() print index_writer.numDocs() index_writer.close()
def indexFile(self, writer, path):
    """Read an HTML file, strip markup, and index the remaining text.

    The "contents" field is fed from a StringReader (indexed, not
    stored); the absolute filename is stored unanalyzed.  Returns the
    Document that was added.
    """
    try:
        fh = open(path)
        plain = HTMLReader(InputStreamReader(fh, 'utf-8')).read()
        fh.close()
    except:
        raise
    else:
        document = Document()
        document.add(Field("contents", StringReader(plain)))
        document.add(Field("filename", os.path.abspath(path),
                           Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(document)
        return document
def index(self):
    """Index 100 identical docs into '<tmp>/verbose-index', with diagnostics
    routed through InfoStreamOut."""
    dirPath = os.path.join(tempfile.gettempdir(), "verbose-index")
    store = FSDirectory.open(dirPath)
    writer = IndexWriter(store, SimpleAnalyzer(), True)
    writer.setInfoStream(InfoStreamOut())
    for _ in xrange(100):
        doc = Document()
        doc.add(Field("keyword", "goober",
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def index(self):
    """Index 100 identical docs into '<java tmpdir>/verbose-index', streaming
    the writer's diagnostics to System.out."""
    dirPath = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                           "verbose-index")
    store = FSDirectory.getDirectory(dirPath, True)
    writer = IndexWriter(store, SimpleAnalyzer(), True)
    writer.setInfoStream(System.out)
    for _ in xrange(100):
        doc = Document()
        doc.add(Field("keyword", "goober",
                      Field.Store.YES, Field.Index.UN_TOKENIZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
def setUp(self):
    """Index two fox sentences and open a read-only searcher over them."""
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for sentence in ("the quick brown fox jumped over the lazy dog",
                     "the fast fox hopped over the hound"):
        doc = Document()
        doc.add(Field("field", sentence,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.close()
    self.searcher = IndexSearcher(directory, True)
def indexFile(self, writer, path):
    """Index one UTF-8 text file; returns the Document added to *writer*.

    Read errors propagate; the file handle is closed either way.
    """
    with codecs.open(path, encoding='utf-8') as handle:
        text = handle.read()
    doc = Document()
    # Reader-backed contents field: indexed but not stored.
    doc.add(Field("contents", StringReader(text)))
    doc.add(Field("filename", os.path.abspath(path),
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    writer.addDocument(doc)
    return doc
def luceneIndexer(docdir, indir): """ IndexDocuments from a directory """ lucene.initVM() DIRTOINDEX = docdir INDEXIDR = indir indexdir = SimpleFSDirectory(File(INDEXIDR)) analyzer = StandardAnalyzer(Version.LUCENE_30) index_writer= IndexWriter(indexdir,analyzer,True,\ IndexWriter.MaxFieldLength(512)) for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')): print "Indexing: ", tfile document = Document() content = open(tfile, 'r').read() document.add(Field("text",content,Field.Store.YES,\ Field.Index.ANALYZED)) index_writer.addDocument(document) print "Done: ", tfile index_writer.optimize() print index_writer.numDocs() index_writer.close()
def someMethod(self):
    """Demonstrate indexing and query parsing with a legacy Lucene API.

    NOTE(review): ``Field.Text`` / ``Field.UnStored`` and the static
    ``QueryParser.parse`` belong to the old Lucene 1.x/2.x API and were
    removed in later versions -- confirm which PyLucene this targets.
    """
    directory = RAMDirectory()
    analyzer = StandardAnalyzer()
    writer = IndexWriter(directory, analyzer, True)
    doc = Document()
    # Legacy convenience constructors: Text = stored + analyzed,
    # UnStored = analyzed but not stored.
    doc.add(Field.Text("title", "This is the title"))
    doc.add(Field.UnStored("contents", "...document contents..."))
    writer.addDocument(doc)
    # Second add passes a per-document analyzer override.
    writer.addDocument(doc, analyzer)
    expression = "some query"
    # Deprecated static parse shown first; the instance-based call below
    # overwrites ``query``, so only the second result survives.
    query = QueryParser.parse(expression, "contents", analyzer)
    parser = QueryParser("contents", analyzer)
    # NOTE(review): the usual instance method is ``parse`` --
    # ``parseQuery`` may be version- or project-specific; verify.
    query = parser.parseQuery(expression)
def setUp(self):
    """Create a two-document in-memory index and a read-only searcher.

    Both documents share the field name "field"; whitespace analysis
    keeps the tokens exactly as written.
    """
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    first = Document()
    first.add(Field("field",
                    "the quick brown fox jumped over the lazy dog",
                    Field.Store.YES, Field.Index.ANALYZED))
    second = Document()
    second.add(Field("field",
                     "the fast fox hopped over the hound",
                     Field.Store.YES, Field.Index.ANALYZED))
    map(writer.addDocument, (first, second))
    writer.close()
    self.searcher = IndexSearcher(directory, True)
def indexFile(self, writer, path): doc = Document() try: process = popen2.Popen4(["antiword", "-m", "UTF-8", path]) string = InputStreamReader(process.fromchild, 'utf-8').read() except: raise else: doc.add(Field("contents", StringReader(string))) doc.add(Field("filename", os.path.abspath(path), Field.Store.YES, Field.Index.NOT_ANALYZED)) writer.addDocument(doc) exitCode = process.wait() if exitCode != 0: raise RuntimeError, "pdftotext exit code %d" %(exitCode) return doc
def testUpdate(self):
    """Delete the Amsterdam document, add a St. Petersburg one, and check
    that the hit counts flip accordingly."""
    self.assertEqual(1, self.getHitCount("city", "Amsterdam"))
    # Remove the old document through a writable reader.
    reader = IndexReader.open(self.dir, False)
    reader.deleteDocuments(Term("city", "Amsterdam"))
    reader.close()
    # False -> append to the existing index instead of recreating it.
    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    fieldSpecs = (
        ("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED),
        ("country", "Russia", Field.Store.YES, Field.Index.NO),
        ("contents", "St. Petersburg has lots of bridges",
         Field.Store.NO, Field.Index.ANALYZED),
        ("city", "St. Petersburg", Field.Store.YES, Field.Index.ANALYZED),
    )
    for name, value, store, idx in fieldSpecs:
        doc.add(Field(name, value, store, idx))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
    self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
    self.assertEqual(1, self.getHitCount("city", "Petersburg"))
def indexDocuments(): # empty index directory indexDir = Wikipedia.directory + 'index/' for filename in os.listdir(indexDir): os.remove(indexDir + filename) # index documents lucene.initVM() version = Version.LUCENE_CURRENT analyzer = EnglishAnalyzer(version) writer = IndexWriter(SimpleFSDirectory(File(indexDir)), analyzer, True, IndexWriter.MaxFieldLength.LIMITED) for article in Wikipedia(): doc = Document() doc.add( Field('id', str(article['id'][0]), Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add( Field('title', article['url'], Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add( Field('content', article['text'], Field.Store.NO, Field.Index.ANALYZED)) writer.addDocument(doc) print 'Optimization' writer.optimize() writer.close()
def indexReader(self, indexWriter, reader, uItem, uAttr, uValue, version):
    """Add one item/attribute/value document, streaming the indexed
    contents out of *reader*."""
    stored = Field.Store.YES
    notIndexed = Field.Index.NO
    asKeyword = Field.Index.UN_TOKENIZED
    doc = Document()
    # Exact-match identifiers.
    doc.add(Field("item", uItem.str64(), stored, asKeyword))
    doc.add(Field("attribute", uAttr.str64(), stored, asKeyword))
    # Stored-only payload fields.
    doc.add(Field("value", uValue.str64(), stored, notIndexed))
    doc.add(Field("version", str(version), stored, notIndexed))
    # Drain the incoming reader and re-wrap the text so the contents are
    # indexed with term vectors.
    contents = StringReader(reader.read())
    doc.add(Field("contents", contents, Field.TermVector.YES))
    indexWriter.addDocument(doc)
def setUp(self):
    """Split 26 animal names across two in-memory indexes (a-m and n-z)
    and open one searcher per index."""
    animals = [
        "aardvark", "beaver", "coati", "dog", "elephant", "frog",
        "gila monster", "horse", "iguana", "javelina", "kangaroo",
        "lemur", "moose", "nematode", "orca", "python", "quokka",
        "rat", "scorpion", "tarantula", "uromastyx", "vicuna",
        "walrus", "xiphias", "yak", "zebra",
    ]
    analyzer = WhitespaceAnalyzer()
    directories = [RAMDirectory(), RAMDirectory()]
    writers = [IndexWriter(d, analyzer, True,
                           IndexWriter.MaxFieldLength.UNLIMITED)
               for d in directories]
    for animal in animals:
        doc = Document()
        doc.add(Field("animal", animal,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        # Names alphabetically before 'n' go to the first index.
        target = writers[0] if animal[0].lower() < "n" else writers[1]
        target.addDocument(doc)
    for w in writers:
        w.close()
    self.searchers = [IndexSearcher(d) for d in directories]
def luceneIndexer(docdir, indir):
    """frpFile IndexDocuments from a directory.

    Parameters:
        docdir: the path of the txt files to read and index.
        indir: the path of the index file which is generated here.
    """
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    # True -> create a fresh index, overwriting any existing one.
    index_writer = IndexWriter(indexdir, analyzer, True,
                               IndexWriter.MaxFieldLength(512))
    # Iterate directory entries directly (was range(len(...)) over a
    # local named ``list``, shadowing the builtin).
    for name in os.listdir(DIRTOINDEX):
        tfile = os.path.join(DIRTOINDEX, name)
        if not os.path.isfile(tfile):
            continue
        print ("Indexing: ", tfile)
        print ('okokokook')
        document = Document()
        # Close the file handle explicitly instead of leaking it.
        f = open(tfile, 'r')
        try:
            content = f.read()
        finally:
            f.close()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        # Bug fix: str.strip('.txt') removes any of the CHARACTERS
        # '.', 't', 'x' from both ends of the path -- use splitext to
        # drop only the extension.
        title = os.path.splitext(tfile)[0]
        document.add(Field("title", title, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print ("Done: ", tfile)
    index_writer.optimize()
    print (index_writer.numDocs())
    index_writer.close()
def indexValue(self, indexWriter, value, uItem, uAttr, uValue, version):
    """Add one item/attribute/value document whose indexed contents come
    straight from the *value* string."""
    stored = Field.Store.YES
    unstored = Field.Store.NO
    analyzed = Field.Index.TOKENIZED
    notIndexed = Field.Index.NO
    asKeyword = Field.Index.UN_TOKENIZED
    doc = Document()
    # Exact-match identifiers.
    for fieldName, text in (("item", uItem.str64()),
                            ("attribute", uAttr.str64())):
        doc.add(Field(fieldName, text, stored, asKeyword))
    # Stored-only payload fields.
    doc.add(Field("value", uValue.str64(), stored, notIndexed))
    doc.add(Field("version", str(version), stored, notIndexed))
    # Contents: tokenized, unstored, with term vectors.
    doc.add(Field("contents", value, unstored, analyzed,
                  Field.TermVector.YES))
    indexWriter.addDocument(doc)
def addDocuments(self, _id, title, content):
    """Index one document; the title and content fields are only added
    when they are present and non-empty."""
    doc = Document()
    # The id is always stored as an exact-match keyword.
    doc.add(Field("id", _id, Field.Store.YES, Field.Index.NOT_ANALYZED))
    optional = (("titleKeyword", title), ("contentKeyword", content))
    for fieldName, text in optional:
        if text is not None and len(text) > 0:
            doc.add(Field(fieldName, text,
                          Field.Store.NO, Field.Index.ANALYZED))
    self.index_writer.addDocument(doc)