def index(self):
    """Build the index for ``self._dataDir`` and return the document count.

    Opens a ``SimpleFSDirectory`` on ``self._indexDir``, creates an
    ``IndexWriter`` over it with a ``StandardAnalyzer``, hands the writer to
    ``self.indexDirectory`` together with ``self._dataDir``, then optimizes
    the index.

    Returns:
        The value of ``writer.numDocs()`` read after ``indexDirectory``
        returns and before ``optimize()`` is called.

    Raises:
        IOError: if ``self._dataDir`` is missing or is not a directory.
    """
    # os.path.isdir() is already False for a path that does not exist, so a
    # separate os.path.exists() check adds nothing.
    if not os.path.isdir(self._dataDir):
        raise IOError(
            "%s isn't existed or is not a directory" % (self._dataDir,))

    # NOTE(review): SimpleFSDirectory(Paths.get(...)) and
    # IndexWriter.MaxFieldLength / optimize() look like they belong to
    # different Lucene releases -- confirm against the PyLucene version in use.
    directory = SimpleFSDirectory(Paths.get(self._indexDir))
    try:
        # Third argument True: presumably Lucene's "create" flag (start a new
        # index instead of appending) -- verify against the IndexWriter docs.
        writer = IndexWriter(directory, StandardAnalyzer(), True,
                             IndexWriter.MaxFieldLength.LIMITED)
        try:
            writer.setUseCompoundFile(False)
            self.indexDirectory(writer, self._dataDir)
            numIndexed = writer.numDocs()
            writer.optimize()
        finally:
            # Close the writer even when indexing fails, so it is not left
            # open on the index directory.
            writer.close()
    finally:
        directory.close()
    return numIndexed
class FileIndexer:
    """Collect files with a given extension under a directory and index them.

    Construction walks ``dataDir`` and records every matching file path, then
    opens an ``IndexWriter`` on ``indexDir``. ``indexFiles()`` adds one
    document per recorded file; ``optimizeAndClose()`` finishes the index.
    """

    def __init__(self, dataDir, fileExtension, indexDir):
        """Scan ``dataDir`` for ``*.fileExtension`` files and open the writer.

        Args:
            dataDir: root directory to walk recursively.
            fileExtension: extension to match, without the leading dot
                (e.g. ``"txt"``).
            indexDir: passed as-is to ``IndexWriter`` as the index location.
        """
        # Per-instance list: a class-level list would be shared, so a second
        # FileIndexer would also see (and re-index) the first one's files.
        self.__fileList = []
        # os.walk exists on both Python 2 and 3 (os.path.walk is Python 2
        # only) and reports plain files separately from sub-directories, so
        # directories are never queued for open().
        for dirname, _subdirs, filenames in os.walk(dataDir):
            self.__fileSearcher(fileExtension, dirname, filenames)
        # Third argument True: presumably Lucene's "create" flag -- verify
        # against the IndexWriter docs for the PyLucene version in use.
        self.__writer = IndexWriter(indexDir, StandardAnalyzer(), True)
        self.__writer.setUseCompoundFile(False)

    def __fileSearcher(self, fileExtension, dirname, filenames):
        """Append every name in ``filenames`` ending in ``.fileExtension``.

        Each match is stored as ``os.path.join(dirname, filename)``.
        """
        suffix = '.' + fileExtension
        # Requiring the dot keeps a file literally named e.g. "txt" from being
        # treated as having the extension "txt".
        self.__fileList.extend(
            os.path.join(dirname, filename)
            for filename in filenames
            if filename.endswith(suffix))

    def getNames(self):
        """Return the list of file paths collected at construction time."""
        return self.__fileList

    def indexFiles(self):
        """Add one document per collected file to the index.

        Each document stores the file text in a tokenized ``contents`` field
        and the file's canonical path in an untokenized ``path`` field. The
        file name and canonical path are printed as each file is processed.
        """
        for filename in self.__fileList:
            print(filename)
            canonicalPath = File(filename).getCanonicalPath()
            print(canonicalPath)
            # Read through a context manager so the handle is released even
            # if read() fails.
            with open(filename, 'r') as source:
                contents = source.read()
            doc = Document()
            doc.add(Field("contents", contents,
                          Field.Store.YES, Field.Index.TOKENIZED))
            doc.add(Field("path", canonicalPath,
                          Field.Store.YES, Field.Index.UN_TOKENIZED))
            self.__writer.addDocument(doc)

    def optimizeAndClose(self):
        """Optimize the index, close the writer and return its document count.

        Returns:
            The value of ``docCount()`` read before ``optimize()`` is called.
        """
        try:
            docCount = self.__writer.docCount()
            self.__writer.optimize()
        finally:
            # Always close the writer, even when optimize() raises.
            self.__writer.close()
        return docCount