class PyLuceneDatabase(CommonIndexer.CommonDatabase):
    """manage and use a pylucene indexing database"""

    # query class produced/consumed by this indexer backend
    QUERY_TYPE = PyLucene.Query
    # subdirectory name used below the configured base directory
    INDEX_DIRECTORY_NAME = "lucene"

    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """initialize or open an indexing database

        Any derived class must override __init__.

        @raise ValueError: the given location exists, but the database type
                is incompatible (e.g. created by a different indexing engine)
        @raise OSError: the database failed to initialize
        @param basedir: the parent directory of the database
        @type basedir: str
        @param analyzer: bitwise combination of possible analyzer flags
                to be used as the default analyzer for this database. Leave it
                empty to use the system default analyzer
                (self.ANALYZER_DEFAULT).
                see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
        @type analyzer: int
        @param create_allowed: create the database, if necessary; default: True
        @type create_allowed: bool
        """
        super(PyLuceneDatabase, self).__init__(basedir, analyzer=analyzer,
                create_allowed=create_allowed)
        self.pyl_analyzer = PyLucene.StandardAnalyzer()
        self.writer = None
        self.reader = None
        self.index_version = None
        try:
            # probe for an existing database with a throw-away reader
            tempreader = PyLucene.IndexReader.open(self.location)
            tempreader.close()
        except PyLucene.JavaError as err_msg:
            # Write an error out, in case this is a real problem instead of
            # an absence of an index
            # TODO: turn the following two lines into debug output
            #errorstr = str(e).strip() + "\n" + self.errorhandler.traceback_str()
            #DEBUG_FOO("could not open index, so going to create: " + errorstr)
            # Create the index, so we can open cached readers on it
            if not create_allowed:
                raise OSError("Indexer: skipping database creation")
            try:
                # create the parent directory if it does not exist
                parent_path = os.path.dirname(self.location)
                if not os.path.isdir(parent_path):
                    # recursively create all directories up to parent_path
                    os.makedirs(parent_path)
            except (IOError, OSError) as err_msg:
                # BUGFIX: os.makedirs reports failure via OSError (not
                # IOError), so the original "except IOError" never fired
                # and creation failures escaped with the raw error instead
                # of this message
                raise OSError("Indexer: failed to create the parent " \
                        + "directory (%s) of the indexing database: %s" \
                        % (parent_path, err_msg))
            try:
                tempwriter = PyLucene.IndexWriter(self.location,
                        self.pyl_analyzer, True)
                tempwriter.close()
            except PyLucene.JavaError as err_msg:
                raise OSError("Indexer: failed to open or create a Lucene" \
                        + " database (%s): %s" % (self.location, err_msg))
def create_index(self, arg):
    """ Post download setup callback for creating a lucene index """
    moreinfo("Creating lucene index")
    storeDir = "index"
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = PyLucene.FSDirectory.getDirectory(storeDir, True)
    self.lucene_writer = PyLucene.IndexWriter(store,
                                              PyLucene.StandardAnalyzer(),
                                              True)
    # Uncomment this line to enable a PorterStemmer analyzer
    # self.lucene_writer = PyLucene.IndexWriter(store, PorterStemmerAnalyzer(), True)
    self.lucene_writer.setMaxFieldLength(1048576)
    count = 0
    # set gives O(1) dedupe instead of the original O(n) list.index scan
    seen_urls = set()
    for urlobj in self._urldict.values():
        filename = urlobj.get_full_filename()
        url = urlobj.get_full_url()
        if url in seen_urls:
            # already indexed this URL
            continue
        seen_urls.add(url)
        if not filename in self._downloaddict['_savedfiles']:
            continue
        data = ''
        moreinfo('Adding index for URL', url)
        if os.path.isfile(filename):
            # BUGFIX: close the file handle instead of leaking it
            fp = open(filename)
            try:
                data = unicode(fp.read(), 'iso-8859-1')
            except UnicodeDecodeError:
                data = ''
            finally:
                fp.close()
        doc = PyLucene.Document()
        doc.add(
            PyLucene.Field("name", filename,
                           PyLucene.Field.Store.YES,
                           PyLucene.Field.Index.UN_TOKENIZED))
        doc.add(
            PyLucene.Field("path", url,
                           PyLucene.Field.Store.YES,
                           PyLucene.Field.Index.UN_TOKENIZED))
        if data and len(data) > 0:
            doc.add(
                PyLucene.Field("contents", data,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))
        else:
            extrainfo("warning: no content in %s" % filename)
        self.lucene_writer.addDocument(doc)
        count += 1
def get_writer(self, create=False):
    """Open a lucene IndexWriter on self.store, retrying while the
    index is locked by another writer.

    @param create: wipe and recreate the index if True; default: False
    @return: the opened IndexWriter
    @raise Exception: the last error, if the writer could not be opened
            after repeated attempts
    """
    # BUGFIX: the original looped forever on a persistent failure and
    # never returned the writer it built. Retry for ~10 seconds, then
    # re-raise the last error.
    last_error = None
    for _ in range(100):
        try:
            writer = lucene.IndexWriter(self.store, self.analyzer, create)
            writer.setMaxFieldLength(1048576)
            return writer
        except Exception as e:
            print(e)
            last_error = e
            time.sleep(.1)
    raise last_error
def index_node(self, iba_node):
    """Replace the index entry for a single node and bump the counter.

    @param iba_node: node to (re-)index; its nid identifies the old entry
    """
    # drop any stale entry for this node first
    self.delete_node(iba_node.nid)
    # create a fresh index only when the directory is still empty
    create = len(os.listdir('index')) == 0
    analyzer = PyLucene.StandardAnalyzer()
    writer = PyLucene.IndexWriter("index", analyzer, create)
    try:
        writer.addDocument(self._document_node(iba_node))
    finally:
        # BUGFIX: always close the writer - otherwise a failing
        # addDocument left the lucene write lock held forever
        writer.close()
    self.count = self.count + 1
def _writer_open(self):
    """open write access for the indexing database and acquire an
    exclusive lock
    """
    if self._writer_is_open():
        # a writer is already active - nothing to do
        return
    self._delete_stale_lock()
    self.writer = PyLucene.IndexWriter(self.location, self.pyl_analyzer,
            False)
    # "setMaxFieldLength" only exists since PyLucene v2; we must stay
    # compatible with v1 for the derived class (PyLuceneIndexer1), so
    # this step is optional
    if hasattr(self.writer, "setMaxFieldLength"):
        self.writer.setMaxFieldLength(MAX_FIELD_SIZE)
def __init__(self, root, storeDir, analyzer): if not os.path.exists(storeDir): os.mkdir(storeDir) store = PyLucene.FSDirectory.getDirectory(storeDir, True) writer = PyLucene.IndexWriter(store, analyzer, True) writer.setMaxFieldLength(1048576) self.indexDocs(root, writer) ticker = Ticker() print 'optimizing index', threading.Thread(target=ticker.run).start() writer.optimize() writer.close() ticker.tick = False print 'done'
def openWriteIndex(self):
    """Open an IndexWriter on this store's index directory.

    A brand-new directory means the index must be created from scratch
    (the same flag is passed to both FSDirectory and IndexWriter).

    NOTE(review): `writer` is built but never returned or stored here -
    confirm whether a `return writer` was lost or the caller obtains it
    another way.
    """
    luceneDir = self.store.newDirectory(self.indexDirectory)
    # create the index only when the directory does not exist yet
    create = not luceneDir.exists()
    analyzer = self._analyzer()
    fsdir = PyLucene.FSDirectory.getDirectory(luceneDir.path, create)
    try:
        writer = PyLucene.IndexWriter(fsdir, analyzer, create)
    except PyLucene.JavaError, e:
        # if the failure was a lock timeout, remember the lock file path
        # so the stale lock can be cleaned up later
        lockTimeout = u'Lock obtain timed out: Lock@'
        msg = e.getJavaException().getMessage()
        if msg.startswith(lockTimeout):
            self._lockfile = msg[len(lockTimeout):]
        # NOTE(review): every JavaError is surfaced as IndexCorrupt,
        # including plain lock timeouts - confirm this is intended
        raise IndexCorrupt()
def index_files(board, time_delta):
    """(Re)build the recent-posts lucene index for a board.

    @param board: board name; used to locate files under BOARDSPATH
    @param time_delta: recency window passed to get_all_files
    """
    store = PyLucene.FSDirectory.getDirectory(
        BOARDSPATH + board + '/' + RECENT_INDEX, True)
    # True -> recreate the index from scratch
    writer = PyLucene.IndexWriter(store, PyLucene.StandardAnalyzer(), True)
    writer.setMaxFieldLength(1048576)  # 1MB
    flist = get_all_files(board, time_delta)
    for filename, owner, title in flist:
        path = BOARDSPATH + board + '/' + filename
        if not os.path.exists(path):
            continue
        # BUGFIX: "with" guarantees the file is closed even when
        # filter_file raises; the original leaked the handle in that case
        with open(path, 'r') as f:
            contents = filter_file(f)
        debug(contents)
        try:
            title = title.decode('gbk')
            owner = owner.decode('gbk')
            contents = unicode(contents, 'gbk')
        except UnicodeDecodeError:
            # skip files that are not valid gbk
            debug(filename)
            continue
        if len(contents) > 0:
            doc = PyLucene.Document()
            doc.add(
                PyLucene.Field("name", filename,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
            doc.add(
                PyLucene.Field("owner", owner,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
            doc.add(
                PyLucene.Field("title", title,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
            doc.add(
                PyLucene.Field("contents", contents,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))
            writer.addDocument(doc)
            debug('adding ' + filename)
    writer.optimize()
    writer.close()
def create_index(self):
    """ Post download setup callback for creating a lucene index """
    info("Creating lucene index")
    count = 0
    # set gives O(1) dedupe instead of the original O(n) list.index scan
    seen_indices = set()
    urldb = objects.datamgr.get_urldb()
    storeDir = "index"
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = PyLucene.FSDirectory.getDirectory(storeDir, True)
    lucene_writer = PyLucene.IndexWriter(store,
                                         PyLucene.StandardAnalyzer(),
                                         True)
    lucene_writer.setMaxFieldLength(1048576)
    for node in urldb.preorder():
        urlobj = node.get()
        # Only index if web-page or document
        if not urlobj.is_webpage() and not urlobj.is_document():
            continue
        filename = urlobj.get_full_filename()
        url = urlobj.get_full_url()
        if urlobj.index in seen_indices:
            # already indexed this URL object
            continue
        seen_indices.add(urlobj.index)
        if not os.path.isfile(filename):
            continue
        data = ''
        extrainfo('Adding index for URL', url)
        # BUGFIX: close the file handle instead of leaking it
        fp = open(filename)
        try:
            data = unicode(fp.read(), 'iso-8859-1')
        except UnicodeDecodeError:
            data = ''
        finally:
            fp.close()
        try:
            doc = PyLucene.Document()
            doc.add(
                PyLucene.Field("name", 'file://' + filename,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
            doc.add(
                PyLucene.Field("path", url,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
            if data and len(data) > 0:
                doc.add(
                    PyLucene.Field("contents", data,
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.TOKENIZED))
            else:
                warning("warning: no content in %s" % filename)
            lucene_writer.addDocument(doc)
        except PyLucene.JavaError as e:
            # best-effort: log and keep indexing the remaining URLs
            print(e)
            continue
def __init__(self, basedir, analyzer=None, create_allowed=True):
    """Initialize or open an indexing database.

    Any derived class must override __init__.

    :raise ValueError: The given location exists, but the database type
            is incompatible (e.g. created by a different indexing engine)
    :raise OSError: the database failed to initialize
    :param basedir: The parent directory of the database
    :type basedir: str
    :param analyzer: Bitwise combination of possible analyzer flags
            to be used as the default analyzer for this database.
            Leave it empty to use the system default analyzer
            (self.ANALYZER_DEFAULT).
            See self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
    :type analyzer: int
    :param create_allowed: create the database, if necessary; default: True
    :type create_allowed: bool
    """
    # make sure the current thread is attached to the JVM before any
    # PyLucene call
    jvm = PyLucene.getVMEnv()
    jvm.attachCurrentThread()
    super(PyLuceneDatabase, self).__init__(basedir, analyzer=analyzer,
            create_allowed=create_allowed)
    self.pyl_analyzer = PyLucene.StandardAnalyzer()
    self.writer = None
    self.reader = None
    self.index_version = None
    try:
        # try to open an existing database
        tempreader = PyLucene.IndexReader.open(self.location)
        tempreader.close()
    except PyLucene.JavaError as err_msg:
        # Write an error out, in case this is a real problem instead of
        # an absence of an index
        # TODO: turn the following two lines into debug output
        #errorstr = str(e).strip() + "\n" + self.errorhandler.traceback_str()
        #DEBUG_FOO("could not open index, so going to create: " + errorstr)
        # Create the index, so we can open cached readers on it
        if not create_allowed:
            raise OSError("Indexer: skipping database creation")
        try:
            # create the parent directory if it does not exist
            parent_path = os.path.dirname(self.location)
            if not os.path.isdir(parent_path):
                # recursively create all directories up to parent_path
                os.makedirs(parent_path)
        except IOError as err_msg:
            # NOTE(review): os.makedirs typically raises OSError, which
            # this IOError clause would not catch - confirm intended
            raise OSError("Indexer: failed to create the parent "
                          "directory (%s) of the indexing database: %s"
                          % (parent_path, err_msg))
        try:
            tempwriter = PyLucene.IndexWriter(self.location,
                                              self.pyl_analyzer, True)
            tempwriter.close()
        except PyLucene.JavaError as err_msg:
            raise OSError("Indexer: failed to open or create a Lucene"
                          " database (%s): %s" % (self.location, err_msg))
    # the indexer is initialized - now we prepare the searcher
    # windows file locking seems inconsistent, so we try 10 times
    numtries = 0
    #self.dir_lock.acquire(blocking=True)
    # read "self.reader", "self.indexVersion" and "self.searcher"
    try:
        while numtries < 10:
            try:
                self.reader = PyLucene.IndexReader.open(self.location)
                # NOTE(review): this sets "indexVersion" although the
                # attribute initialized above is "index_version" -
                # confirm which spelling the rest of the class uses
                self.indexVersion = self.reader.getCurrentVersion(
                        self.location)
                self.searcher = PyLucene.IndexSearcher(self.reader)
                break
            except PyLucene.JavaError as e:
                # store error message for possible later re-raise (below)
                lock_error_msg = e
                time.sleep(0.01)
                numtries += 1
        else:
            # locking failed for 10 times ("else" runs only when the
            # loop finished without break)
            raise OSError("Indexer: failed to lock index database"
                          " (%s)" % lock_error_msg)
    finally:
        pass
        # self.dir_lock.release()
    # initialize the searcher and the reader
    self._index_refresh()
def get_search_index_writer (self, clobber=False):
    """Return an IndexWriter over the configured search index directory.

    The clobber flag is forwarded to both the FSDirectory and the
    IndexWriter, so passing True recreates the index from scratch.
    """
    index_dir = self.config.search_index_dir
    store = PyLucene.FSDirectory.getDirectory(index_dir, clobber)
    analyzer = PyLucene.StandardAnalyzer()
    return PyLucene.IndexWriter(store, analyzer, clobber)