def _create_query_for_field(self, field, value, analyzer=None):
    """generate a field query

    this functions creates a field->value query

    @param field: the fieldname to be used
    @type field: str
    @param value: the wanted value of the field
    @type value: str
    @param analyzer: the analyzer to be used
        possible analyzers are:
         - L{CommonDatabase.ANALYZER_TOKENIZE}
           the field value is splitted to be matched word-wise
         - L{CommonDatabase.ANALYZER_PARTIAL}
           the field value must start with the query string
         - L{CommonDatabase.ANALYZER_EXACT}
           keep special characters and the like
    @type analyzer: bool
    @return: resulting query object
    @rtype: PyLucene.Query
    """
    if analyzer is None:
        analyzer = self.analyzer
    # exact matching keeps the value untouched; every other mode needs the
    # term escaped before it reaches the query parser
    if analyzer == self.ANALYZER_EXACT:
        lucene_analyzer = self.ExactAnalyzer()
    else:
        value = _escape_term_value(value)
        lucene_analyzer = PyLucene.StandardAnalyzer()
    if (analyzer & self.ANALYZER_PARTIAL) > 0:
        # PyLucene uses explicit wildcards for partial matching
        value += "*"
    return PyLucene.QueryParser.parse(value, field, lucene_analyzer)
def create_index(self, arg): """ Post download setup callback for creating a lucene index """ moreinfo("Creating lucene index") storeDir = "index" if not os.path.exists(storeDir): os.mkdir(storeDir) store = PyLucene.FSDirectory.getDirectory(storeDir, True) self.lucene_writer = PyLucene.IndexWriter(store, PyLucene.StandardAnalyzer(), True) # Uncomment this line to enable a PorterStemmer analyzer # self.lucene_writer = PyLucene.IndexWriter(store, PorterStemmerAnalyzer(), True) self.lucene_writer.setMaxFieldLength(1048576) count = 0 urllist = [] for urlobj in self._urldict.values(): filename = urlobj.get_full_filename() url = urlobj.get_full_url() try: urllist.index(url) continue except ValueError: urllist.append(url) if not filename in self._downloaddict['_savedfiles']: continue data = '' moreinfo('Adding index for URL', url) if os.path.isfile(filename): try: data = unicode(open(filename).read(), 'iso-8859-1') except UnicodeDecodeError, e: data = '' doc = PyLucene.Document() doc.add( PyLucene.Field("name", filename, PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED)) doc.add( PyLucene.Field("path", url, PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED)) if data and len(data) > 0: doc.add( PyLucene.Field("contents", data, PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED)) else: extrainfo("warning: no content in %s" % filename) self.lucene_writer.addDocument(doc) count += 1
def _create_query_for_field(self, field, value, analyzer=None):
    """generate a field query

    this functions creates a field->value query

    :param field: The fieldname to be used
    :type field: str
    :param value: The wanted value of the field
    :type value: str
    :param analyzer: The analyzer to be used
        Possible analyzers are:
          - :attr:`CommonDatabase.ANALYZER_TOKENIZE`
            the field value is splitted to be matched word-wise
          - :attr:`CommonDatabase.ANALYZER_PARTIAL`
            the field value must start with the query string
          - :attr:`CommonDatabase.ANALYZER_EXACT`
            keep special characters and the like
    :type analyzer: bool
    :return: resulting query object
    :rtype: PyLucene.Query
    """
    if analyzer is None:
        analyzer = self.analyzer
    if analyzer == self.ANALYZER_EXACT:
        # the keyword analyzer passes the value through untouched
        chosen_analyzer = PyLucene.KeywordAnalyzer()
    else:
        value = self._escape_term_value(value)
        chosen_analyzer = PyLucene.StandardAnalyzer()
    if (analyzer & self.ANALYZER_PARTIAL) > 0:
        # PyLucene uses explicit wildcards for partial matching
        value += "*"
    parser = PyLucene.QueryParser(field, chosen_analyzer)
    return parser.parse(value)
def index_node(self, iba_node):
    """Re-index a node: drop any stale entry, then add a fresh document."""
    self.delete_node(iba_node.nid)
    # an empty 'index' directory means the lucene index must still be created
    must_create = len(os.listdir('index')) == 0
    writer = PyLucene.IndexWriter("index", PyLucene.StandardAnalyzer(),
                                  must_create)
    writer.addDocument(self._document_node(iba_node))
    writer.close()
    self.count += 1
def search_node_by_name2(self, name):
    """Query the index on the name column and return the hits as a list."""
    if self.searcher is None:
        # open the index searcher lazily, on first use
        self.searcher = PyLucene.IndexSearcher("index")
    parser = PyLucene.QueryParser(COLUMN_NAME, PyLucene.StandardAnalyzer())
    query = parser.parse(name)
    hits = self.searcher.search(query)
    return self.hits_to_list(hits)
def __init__(self, store_dir=STORE_DIR, destroy=False, analyzer=None):
    """Set up the feed and entry index modifiers under *store_dir*.

    :param store_dir: base directory holding the 'feeds' and 'entries' indexes
    :param destroy: passed through to the modifiers (recreate the indexes)
    :param analyzer: lucene analyzer to use; defaults to StandardAnalyzer
    """
    self.store_dir = store_dir
    self.analyzer = analyzer or lucene.StandardAnalyzer()
    # Forward the resolved analyzer (not the raw, possibly-None argument)
    # so both modifiers share this instance instead of each falling back
    # to its own default.
    self.feed_modifier = IndexModifier(
        store_dir=os.path.join(store_dir, 'feeds'),
        destroy=destroy, analyzer=self.analyzer)
    self.entry_modifier = IndexModifier(
        store_dir=os.path.join(store_dir, 'entries'),
        destroy=destroy, analyzer=self.analyzer)
def __init__(self, basedir, analyzer=None, create_allowed=True):
    """Initialize or open an indexing database.

    Any derived class must override __init__.

    :raise ValueError: The given location exists, but the database type
            is incompatible (e.g. created by a different indexing engine)
    :raise OSError: the database failed to initialize
    :param basedir: The parent directory of the database
    :type basedir: str
    :param analyzer: Bitwise combination of possible analyzer flags
            to be used as the default analyzer for this database. Leave
            it empty to use the system default analyzer
            (self.ANALYZER_DEFAULT). See self.ANALYZER_TOKENIZE,
            self.ANALYZER_PARTIAL, ...
    :type analyzer: int
    :param create_allowed: create the database, if necessary; default: True
    :type create_allowed: bool
    """
    # the current thread must be attached to the JVM before any
    # PyLucene call can be made from it
    jvm = PyLucene.getVMEnv()
    jvm.attachCurrentThread()
    super(PyLuceneDatabase, self).__init__(basedir, analyzer=analyzer,
            create_allowed=create_allowed)
    # NOTE(review): this analyzer object is PyLucene's, distinct from the
    # integer 'analyzer' flag combination handled by the base class
    self.pyl_analyzer = PyLucene.StandardAnalyzer()
    self.writer = None
    self.reader = None
    self.index_version = None
    try:
        # try to open an existing database
        tempreader = PyLucene.IndexReader.open(self.location)
        tempreader.close()
    except PyLucene.JavaError, err_msg:
        # Write an error out, in case this is a real problem instead of an absence of an index
        # TODO: turn the following two lines into debug output
        #errorstr = str(e).strip() + "\n" + self.errorhandler.traceback_str()
        #DEBUG_FOO("could not open index, so going to create: " + errorstr)
        # Create the index, so we can open cached readers on it
        if not create_allowed:
            raise OSError("Indexer: skipping database creation")
        try:
            # create the parent directory if it does not exist
            parent_path = os.path.dirname(self.location)
            if not os.path.isdir(parent_path):
                # recursively create all directories up to parent_path
                os.makedirs(parent_path)
        except IOError, err_msg:
            raise OSError("Indexer: failed to create the parent " \
                    + "directory (%s) of the indexing database: %s" \
                    % (parent_path, err_msg))
def _create_query_for_string(self, text, require_all=True, analyzer=None):
    """generate a query for a plain term of a string query

    basically this function parses the string and returns the resulting query

    :param text: The query string
    :type text: str
    :param require_all: boolean operator
        (True -> AND (default) / False -> OR)
    :type require_all: bool
    :param analyzer: The analyzer to be used
        Possible analyzers are:
          - :attr:`CommonDatabase.ANALYZER_TOKENIZE`
            the field value is splitted to be matched word-wise
          - :attr:`CommonDatabase.ANALYZER_PARTIAL`
            the field value must start with the query string
          - :attr:`CommonDatabase.ANALYZER_EXACT`
            keep special characters and the like
    :type analyzer: bool
    :return: resulting query object
    :rtype: PyLucene.Query
    """
    if analyzer is None:
        analyzer = self.analyzer
    # The original had a second, redundant "if analyzer ==
    # self.ANALYZER_EXACT: pass" block above this one; it was dead code
    # and has been removed.
    if analyzer == self.ANALYZER_EXACT:
        # exact matching - no escaping; the exact analyzer keeps special
        # characters and the like untouched
        analyzer_obj = self.ExactAnalyzer()
    else:
        text = _escape_term_value(text)
        analyzer_obj = PyLucene.StandardAnalyzer()
    qp = PyLucene.QueryParser(analyzer=analyzer_obj)
    # boolean operator joining the parsed terms
    if require_all:
        qp.setDefaultOperator(qp.Operator.AND)
    else:
        qp.setDefaultOperator(qp.Operator.OR)
    if (analyzer & self.ANALYZER_PARTIAL) > 0:
        # PyLucene uses explicit wildcards for partial matching
        text += "*"
    return qp.parse(text)
def __init__(self, store_dir=STORE_DIR, destroy=False, analyzer=None):
    """Open (or create) a lucene FSDirectory store under *store_dir*.

    :param store_dir: directory holding the index files
    :param destroy: wipe any existing store before opening
    :param analyzer: lucene analyzer to use; defaults to StandardAnalyzer
    """
    self.store_dir = store_dir
    self.create = False
    if os.path.exists(self.store_dir) and destroy:
        shutil.rmtree(self.store_dir)
    if not os.path.exists(self.store_dir):
        try:
            os.makedirs(self.store_dir)
        except OSError:
            # Best effort only; FSDirectory.getDirectory below will fail
            # loudly if the directory really could not be created.  The
            # original bare "except: {}" silently swallowed *every*
            # exception (including KeyboardInterrupt).
            pass
        self.create = True
    self.store = lucene.FSDirectory.getDirectory(self.store_dir, self.create)
    self.analyzer = analyzer or lucene.StandardAnalyzer()
    if self.create:
        # opening and closing a writer once initializes the index segment
        self.get_writer(self.create).close()
def search(self, query_string='', require_visible=True, allow_curated=True): hits = [] query_string = str(query_string) self.logger.info('Performing search: %s' % query_string) disassembled_query = disassemble_user_query(query_string) self.logger.debug('Disassembled query: %s' % str(disassembled_query)) reassembled_query = '+(%s)' % reassemble_user_query(disassembled_query) self.logger.debug('Reassembled query: %s', reassembled_query) if not allow_curated: reassembled_query += \ ' -record-status:%s' % canary.loader.QueuedRecord.STATUS_CURATED if require_visible: reassembled_query += ' +article-type:[%s TO %s]' % \ (Study.ARTICLE_TYPES['traditional'], Study.ARTICLE_TYPES['curated']) reassembled_query += ' +record-status:%s' % \ canary.loader.QueuedRecord.STATUS_CURATED try: searcher = PyLucene.IndexSearcher( PyLucene.FSDirectory.getDirectory( self.context.config.search_index_dir, False)) analyzer = PyLucene.StandardAnalyzer() query_parser = PyLucene.QueryParser('all', analyzer) query_parser.setOperator(PyLucene.QueryParser.DEFAULT_OPERATOR_AND) query = query_parser.parseQuery(reassembled_query) self.logger.info('Search query: %s', query) hits = searcher.search(query) return hits, searcher except Exception, e: self.logger.error('Search failed: %s', e) #self.logger.error(traceback.format_stack()) if hits \ and searcher: return hits, searcher else: return [], None
def index_files(board, time_delta):
    """Build the recent-posts lucene index for *board*.

    Collects all files changed within *time_delta*, decodes title/owner/
    contents from gbk, and indexes each as a document with name, owner,
    title and contents fields.  Files that fail gbk decoding are skipped.
    """
    store = PyLucene.FSDirectory.getDirectory(
        BOARDSPATH + board + '/' + RECENT_INDEX, True)
    writer = PyLucene.IndexWriter(store, PyLucene.StandardAnalyzer(), True)
    writer.setMaxFieldLength(1048576)  # 1MB
    flist = get_all_files(board, time_delta)
    for filename, owner, title in flist:
        path = BOARDSPATH + board + '/' + filename
        if not os.path.exists(path):
            continue
        # try/finally guarantees the handle is closed even if filter_file
        # raises; the original had duplicated f.close() calls on two paths
        # and leaked the handle on any other exception.
        f = open(path, 'r')
        try:
            contents = filter_file(f)
        finally:
            f.close()
        debug(contents)
        try:
            title = title.decode('gbk')
            owner = owner.decode('gbk')
            contents = unicode(contents, 'gbk')
        except UnicodeDecodeError:
            # skip posts that are not valid gbk
            debug(filename)
            continue
        if len(contents) > 0:
            doc = PyLucene.Document()
            doc.add(PyLucene.Field("name", filename,
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.UN_TOKENIZED))
            doc.add(PyLucene.Field("owner", owner,
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.UN_TOKENIZED))
            doc.add(PyLucene.Field("title", title,
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.UN_TOKENIZED))
            doc.add(PyLucene.Field("contents", contents,
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.TOKENIZED))
            writer.addDocument(doc)
            debug('adding ' + filename)
    writer.optimize()
    writer.close()
def search_node_by_attribute2(self, att_type, att_value):
    """Search nodes by attribute type, attribute value, or both.

    :param att_type: attribute type nid to match ("" to ignore)
    :param att_value: attribute value to match ("" to ignore)
    :raise ValueError: if both arguments are empty
    :return: list of matching nodes
    """
    if self.searcher is None:
        # open the index searcher lazily, on first use
        self.searcher = PyLucene.IndexSearcher("index")
    analyzer = PyLucene.StandardAnalyzer()
    if att_type != "" and att_value == "":
        parser = PyLucene.QueryParser(COLUMN_ATTRIBUTE_TYPE_NID, analyzer)
        query = parser.parse(att_type)
    elif att_type == "" and att_value != "":
        parser = PyLucene.QueryParser(COLUMN_ATTRIBUTE_VALUE, analyzer)
        query = parser.parse(att_value)
    elif att_type != "" and att_value != "":
        # default field is the value column; the type is qualified explicitly
        parser = PyLucene.QueryParser(COLUMN_ATTRIBUTE_VALUE, analyzer)
        query = parser.parse(COLUMN_ATTRIBUTE_TYPE_NID + ":" + att_type
                             + " AND " + att_value)
    else:
        # previously this case fell through and crashed with an unrelated
        # NameError on 'query'; fail with a clear message instead
        raise ValueError("at least one of att_type or att_value must be non-empty")
    hits = self.searcher.search(query)
    return self.hits_to_list(hits)
def _create_query_for_string(self, text, require_all=True, analyzer=None):
    """generate a query for a plain term of a string query

    basically this function parses the string and returns the resulting query

    @param text: the query string
    @type text: str
    @param require_all: boolean operator
        (True -> AND (default) / False -> OR)
    @type require_all: bool
    @param analyzer: the analyzer to be used
        possible analyzers are:
         - L{CommonDatabase.ANALYZER_TOKENIZE}
           the field value is splitted to be matched word-wise
         - L{CommonDatabase.ANALYZER_PARTIAL}
           the field value must start with the query string
         - L{CommonDatabase.ANALYZER_EXACT}
           keep special characters and the like
    @type analyzer: bool
    @return: resulting query object
    @rtype: PyLucene.Query
    """
    if analyzer is None:
        analyzer = self.analyzer
    if analyzer == self.ANALYZER_EXACT:
        # the keyword analyzer passes the text through untouched
        chosen_analyzer = PyLucene.KeywordAnalyzer()
    else:
        text = _escape_term_value(text)
        chosen_analyzer = PyLucene.StandardAnalyzer()
    parser = PyLucene.QueryParser(UNNAMED_FIELD_NAME, chosen_analyzer)
    # boolean operator joining the parsed terms
    if require_all:
        parser.setDefaultOperator(parser.Operator.AND)
    else:
        parser.setDefaultOperator(parser.Operator.OR)
    if (analyzer & self.ANALYZER_PARTIAL) > 0:
        # PyLucene uses explicit wildcards for partial matching
        text += "*"
    return parser.parse(text)
PyLucene.Field("name", filename, PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED)) doc.add( PyLucene.Field("path", path, PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED)) if len(contents) > 0: doc.add( PyLucene.Field("contents", contents, PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED)) else: print "warning: no content in %s" % filename writer.addDocument(doc) except Exception, e: print "Failed in indexDocs:", e if __name__ == '__main__': if len(sys.argv) < 2: print IndexFiles.__doc__ sys.exit(1) print 'PyLucene', PyLucene.VERSION, 'Lucene', PyLucene.LUCENE_VERSION start = datetime.now() try: IndexFiles(sys.argv[1], "index", PyLucene.StandardAnalyzer()) end = datetime.now() print end - start except Exception, e: print "Failed: ", e
def create_index(self): """ Post download setup callback for creating a lucene index """ info("Creating lucene index") count = 0 urllist = [] urldb = objects.datamgr.get_urldb() storeDir = "index" if not os.path.exists(storeDir): os.mkdir(storeDir) store = PyLucene.FSDirectory.getDirectory(storeDir, True) lucene_writer = PyLucene.IndexWriter(store, PyLucene.StandardAnalyzer(), True) lucene_writer.setMaxFieldLength(1048576) for node in urldb.preorder(): urlobj = node.get() # Only index if web-page or document if not urlobj.is_webpage() and not urlobj.is_document(): continue filename = urlobj.get_full_filename() url = urlobj.get_full_url() try: urllist.index(urlobj.index) continue except ValueError: urllist.append(urlobj.index) if not os.path.isfile(filename): continue data = '' extrainfo('Adding index for URL', url) try: data = unicode(open(filename).read(), 'iso-8859-1') except UnicodeDecodeError, e: data = '' try: doc = PyLucene.Document() doc.add( PyLucene.Field("name", 'file://' + filename, PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED)) doc.add( PyLucene.Field("path", url, PyLucene.Field.Store.YES, PyLucene.Field.Index.UN_TOKENIZED)) if data and len(data) > 0: doc.add( PyLucene.Field("contents", data, PyLucene.Field.Store.YES, PyLucene.Field.Index.TOKENIZED)) else: warning("warning: no content in %s" % filename) lucene_writer.addDocument(doc) except PyLucene.JavaError, e: print e continue
def _analyzer(self):
    """Return the analyzer for this index: a StandardAnalyzer built with
    an empty stop-word list, so no tokens are filtered out."""
    no_stop_words = []
    return PyLucene.StandardAnalyzer(no_stop_words)
def __init__(self, basedir, analyzer=None, create_allowed=True):
    """Initialize or open an indexing database.

    Any derived class must override __init__.

    :raise ValueError: The given location exists, but the database type
            is incompatible (e.g. created by a different indexing engine)
    :raise OSError: the database failed to initialize
    :param basedir: The parent directory of the database
    :type basedir: str
    :param analyzer: Bitwise combination of possible analyzer flags
            to be used as the default analyzer for this database. Leave
            it empty to use the system default analyzer
            (self.ANALYZER_DEFAULT). See self.ANALYZER_TOKENIZE,
            self.ANALYZER_PARTIAL, ...
    :type analyzer: int
    :param create_allowed: create the database, if necessary; default: True
    :type create_allowed: bool
    """
    # the current thread must be attached to the JVM before any
    # PyLucene call can be made from it
    jvm = PyLucene.getVMEnv()
    jvm.attachCurrentThread()
    super(PyLuceneDatabase, self).__init__(basedir, analyzer=analyzer,
                                           create_allowed=create_allowed)
    # NOTE(review): this analyzer object is PyLucene's, distinct from the
    # integer 'analyzer' flag combination handled by the base class
    self.pyl_analyzer = PyLucene.StandardAnalyzer()
    self.writer = None
    self.reader = None
    self.index_version = None
    try:
        # try to open an existing database
        tempreader = PyLucene.IndexReader.open(self.location)
        tempreader.close()
    except PyLucene.JavaError as err_msg:
        # Write an error out, in case this is a real problem instead of an absence of an index
        # TODO: turn the following two lines into debug output
        #errorstr = str(e).strip() + "\n" + self.errorhandler.traceback_str()
        #DEBUG_FOO("could not open index, so going to create: " + errorstr)
        # Create the index, so we can open cached readers on it
        if not create_allowed:
            raise OSError("Indexer: skipping database creation")
        try:
            # create the parent directory if it does not exist
            parent_path = os.path.dirname(self.location)
            if not os.path.isdir(parent_path):
                # recursively create all directories up to parent_path
                os.makedirs(parent_path)
        except IOError as err_msg:
            raise OSError("Indexer: failed to create the parent "
                          "directory (%s) of the indexing database: %s"
                          % (parent_path, err_msg))
        try:
            # creating a writer (with create=True) initializes the index
            tempwriter = PyLucene.IndexWriter(self.location,
                                              self.pyl_analyzer, True)
            tempwriter.close()
        except PyLucene.JavaError as err_msg:
            raise OSError("Indexer: failed to open or create a Lucene"
                          " database (%s): %s"
                          % (self.location, err_msg))
    # the indexer is initialized - now we prepare the searcher
    # windows file locking seems inconsistent, so we try 10 times
    numtries = 0
    #self.dir_lock.acquire(blocking=True)
    # read "self.reader", "self.indexVersion" and "self.searcher"
    try:
        while numtries < 10:
            try:
                self.reader = PyLucene.IndexReader.open(self.location)
                self.indexVersion = self.reader.getCurrentVersion(
                        self.location)
                self.searcher = PyLucene.IndexSearcher(self.reader)
                break
            except PyLucene.JavaError as e:
                # store error message for possible later re-raise (below)
                lock_error_msg = e
                time.sleep(0.01)
                numtries += 1
        else:
            # locking failed for 10 times
            raise OSError("Indexer: failed to lock index database"
                          " (%s)" % lock_error_msg)
    finally:
        pass
        # self.dir_lock.release()
    # initialize the searcher and the reader
    self._index_refresh()
def get_search_index_writer (self, clobber=False):
    """Return an IndexWriter over the configured search index directory.

    When clobber is True the index is recreated from scratch.
    """
    index_dir = PyLucene.FSDirectory.getDirectory(
        self.config.search_index_dir, clobber)
    return PyLucene.IndexWriter(index_dir,
                                PyLucene.StandardAnalyzer(),
                                clobber)