示例#1
0
    def _create_query_for_field(self, field, value, analyzer=None):
        """Build a query that matches C{value} within a single field.

        The value is handed to the PyLucene query parser together with an
        analyzer object that is selected from the analyzer flags.

        @param field: the fieldname to be used
        @type field: str
        @param value: the wanted value of the field
        @type value: str
        @param analyzer: the analyzer flags to be used (C{self.analyzer} is
            used when omitted); possible analyzers are:
              - L{CommonDatabase.ANALYZER_TOKENIZE}
                    the field value is split to be matched word-wise
              - L{CommonDatabase.ANALYZER_PARTIAL}
                    the field value must start with the query string
              - L{CommonDatabase.ANALYZER_EXACT}
                    keep special characters and the like
        @type analyzer: int
        @return: resulting query object
        @rtype: PyLucene.Query
        """
        flags = analyzer
        if flags is None:
            # fall back to the default flags of this database
            flags = self.analyzer
        if flags == self.ANALYZER_EXACT:
            lucene_analyzer = self.ExactAnalyzer()
        else:
            # every non-exact mode escapes the term before it is parsed
            value = _escape_term_value(value)
            lucene_analyzer = PyLucene.StandardAnalyzer()
        wants_partial = (flags & self.ANALYZER_PARTIAL) > 0
        if wants_partial:
            # PyLucene uses explicit wildcards for partial matching
            value = value + "*"
        return PyLucene.QueryParser.parse(value, field, lucene_analyzer)
示例#2
0
def create_index(self, arg):
    """Post download setup callback for creating a lucene index.

    A new index is created in the ``index`` directory (which is created if
    missing) and the writer is stored in ``self.lucene_writer``. Every URL
    object of ``self._urldict`` whose file name is listed in
    ``self._downloaddict['_savedfiles']`` is added as one document with the
    fields ``name`` (file name), ``path`` (URL) and - if the file could be
    read - ``contents``. Each URL is indexed at most once.

    ``arg`` is not used by this callback.
    """

    moreinfo("Creating lucene index")
    storeDir = "index"
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)

    store = PyLucene.FSDirectory.getDirectory(storeDir, True)

    self.lucene_writer = PyLucene.IndexWriter(store,
                                              PyLucene.StandardAnalyzer(),
                                              True)
    # Uncomment this line to enable a PorterStemmer analyzer
    # self.lucene_writer = PyLucene.IndexWriter(store, PorterStemmerAnalyzer(), True)
    self.lucene_writer.setMaxFieldLength(1048576)

    # number of documents handed to the writer
    count = 0

    # URLs that were already seen; a set keeps the duplicate check O(1)
    # (the URL is a string, it is also used as a Lucene field value below)
    seen_urls = set()

    for urlobj in self._urldict.values():

        filename = urlobj.get_full_filename()
        url = urlobj.get_full_url()

        if url in seen_urls:
            continue
        seen_urls.add(url)

        if filename not in self._downloaddict['_savedfiles']:
            continue

        data = ''

        moreinfo('Adding index for URL', url)

        if os.path.isfile(filename):
            infile = open(filename)
            try:
                raw = infile.read()
            finally:
                # release the file handle even if reading fails
                infile.close()
            try:
                data = unicode(raw, 'iso-8859-1')
            except UnicodeDecodeError:
                data = ''

        doc = PyLucene.Document()
        doc.add(
            PyLucene.Field("name", filename, PyLucene.Field.Store.YES,
                           PyLucene.Field.Index.UN_TOKENIZED))
        doc.add(
            PyLucene.Field("path", url, PyLucene.Field.Store.YES,
                           PyLucene.Field.Index.UN_TOKENIZED))
        if data:
            doc.add(
                PyLucene.Field("contents", data, PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))
        else:
            # the document is still added, just without a contents field
            extrainfo("warning: no content in %s" % filename)

        self.lucene_writer.addDocument(doc)
        count += 1
示例#3
0
    def _create_query_for_field(self, field, value, analyzer=None):
        """Build a query that matches ``value`` within a single field.

        The value is parsed by a PyLucene query parser that is bound to the
        given field and to an analyzer object selected from the flags.

        :param field: The fieldname to be used
        :type field: str
        :param value: The wanted value of the field
        :type value: str
        :param analyzer: The analyzer flags to be used (``self.analyzer`` is
                         used when omitted).
                         Possible analyzers are:
                         - :attr:`CommonDatabase.ANALYZER_TOKENIZE`
                           the field value is split to be matched word-wise
                         - :attr:`CommonDatabase.ANALYZER_PARTIAL`
                           the field value must start with the query string
                         - :attr:`CommonDatabase.ANALYZER_EXACT`
                           keep special characters and the like
        :type analyzer: int
        :return: resulting query object
        :rtype: PyLucene.Query
        """
        flags = analyzer
        if flags is None:
            # fall back to the default flags of this database
            flags = self.analyzer
        if flags == self.ANALYZER_EXACT:
            lucene_analyzer = PyLucene.KeywordAnalyzer()
        else:
            # every non-exact mode escapes the term before it is parsed
            value = self._escape_term_value(value)
            lucene_analyzer = PyLucene.StandardAnalyzer()
        parser = PyLucene.QueryParser(field, lucene_analyzer)
        wants_partial = (flags & self.ANALYZER_PARTIAL) > 0
        if wants_partial:
            # PyLucene uses explicit wildcards for partial matching
            value = value + "*"
        return parser.parse(value)
示例#4
0
    def index_node(self, iba_node):
        """(Re-)index a single node.

        Any entry for ``iba_node.nid`` is removed first via ``delete_node``;
        the node is then converted with ``_document_node`` and written to the
        index in the ``index`` directory. ``self.count`` is incremented once
        the document has been written.
        """
        self.delete_node(iba_node.nid)
        # an empty directory means that the index has to be created
        create = len(os.listdir('index')) == 0
        analyzer = PyLucene.StandardAnalyzer()
        writer = PyLucene.IndexWriter("index", analyzer, create)
        try:
            writer.addDocument(self._document_node(iba_node))
        finally:
            # close the writer even when adding fails, so that a failed
            # call does not leave an open writer behind
            writer.close()
        self.count += 1
示例#5
0
    def search_node_by_name2(self, name):
        """Search the index for nodes whose name field matches ``name``.

        ``name`` is parsed as a query against the ``COLUMN_NAME`` field. The
        searcher on the ``index`` directory is opened on first use and kept
        in ``self.searcher``. The hits are converted with ``hits_to_list``.
        """
        if self.searcher is None:
            # lazily open the searcher and reuse it for later calls
            self.searcher = PyLucene.IndexSearcher("index")

        name_parser = PyLucene.QueryParser(COLUMN_NAME,
                                           PyLucene.StandardAnalyzer())
        name_query = name_parser.parse(name)
        matches = self.searcher.search(name_query)
        return self.hits_to_list(matches)
示例#6
0
    def __init__(self, store_dir=STORE_DIR, destroy=False, analyzer=None):
        """Set up one index modifier for feeds and one for entries.

        The two ``IndexModifier`` instances work on the ``feeds`` and
        ``entries`` sub-directories of ``store_dir``; ``destroy`` and
        ``analyzer`` are passed through to both of them unchanged.
        """
        self.store_dir = store_dir
        # default to a standard analyzer when none (or a false value) is given
        self.analyzer = analyzer or lucene.StandardAnalyzer()

        # NOTE(review): the modifiers receive the raw ``analyzer`` argument
        # (possibly None), not ``self.analyzer``
        feeds_dir = os.path.join(store_dir, 'feeds')
        self.feed_modifier = IndexModifier(store_dir=feeds_dir,
                                           destroy=destroy,
                                           analyzer=analyzer)
        entries_dir = os.path.join(store_dir, 'entries')
        self.entry_modifier = IndexModifier(store_dir=entries_dir,
                                            destroy=destroy,
                                            analyzer=analyzer)
示例#7
0
    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """Initialize or open an indexing database.

        Any derived class must override __init__.

        The current thread is attached to the PyLucene JVM, the base class is
        initialized and an attempt is made to open an existing index at
        ``self.location``. If that fails, the parent directory of the index
        is created (when ``create_allowed`` is true).

        :raise ValueError: The given location exists, but the database type
                           is incompatible (e.g. created by a different indexing engine)
        :raise OSError: the database failed to initialize

        :param basedir: The parent directory of the database
        :type basedir: str
        :param analyzer: Bitwise combination of possible analyzer flags
                         to be used as the default analyzer for this database.
                         Leave it empty to use the system default analyzer
                         (self.ANALYZER_DEFAULT). See self.ANALYZER_TOKENIZE,
                         self.ANALYZER_PARTIAL, ...
        :type analyzer: int
        :param create_allowed: create the database, if necessary; default: True
        :type create_allowed: bool
        """
        jvm = PyLucene.getVMEnv()
        jvm.attachCurrentThread()
        super(PyLuceneDatabase, self).__init__(basedir,
                                               analyzer=analyzer,
                                               create_allowed=create_allowed)
        self.pyl_analyzer = PyLucene.StandardAnalyzer()
        self.writer = None
        self.reader = None
        self.index_version = None
        try:
            # try to open an existing database
            tempreader = PyLucene.IndexReader.open(self.location)
            tempreader.close()
        except PyLucene.JavaError:
            # The index could not be opened. This may be a real problem
            # instead of an absence of an index, but it is handled by trying
            # to create the index.
            # TODO: emit the error as debug output
            if not create_allowed:
                raise OSError("Indexer: skipping database creation")
            # create the parent directory if it does not exist
            parent_path = os.path.dirname(self.location)
            try:
                if not os.path.isdir(parent_path):
                    # recursively create all directories up to parent_path
                    os.makedirs(parent_path)
            except EnvironmentError as err_msg:
                # os.makedirs reports failures as OSError (not IOError);
                # EnvironmentError covers both of them
                raise OSError("Indexer: failed to create the parent "
                              "directory (%s) of the indexing database: %s"
                              % (parent_path, err_msg))
示例#8
0
    def _create_query_for_string(self, text, require_all=True,
                analyzer=None):
        """Build a query from a plain query string.

        The string is parsed by a PyLucene query parser whose default
        operator and analyzer object are selected from the arguments.

        :param text: The query string
        :type text: str
        :param require_all: boolean operator
                            (True -> AND (default) / False -> OR)
        :type require_all: bool
        :param analyzer: The analyzer flags to be used (``self.analyzer`` is
                         used when omitted).
                         Possible analyzers are:
                         - :attr:`CommonDatabase.ANALYZER_TOKENIZE`
                           the field value is split to be matched word-wise
                         - :attr:`CommonDatabase.ANALYZER_PARTIAL`
                           the field value must start with the query string
                         - :attr:`CommonDatabase.ANALYZER_EXACT`
                           keep special characters and the like
        :type analyzer: int
        :return: resulting query object
        :rtype: PyLucene.Query
        """
        flags = analyzer
        if flags is None:
            # fall back to the default flags of this database
            flags = self.analyzer
        if flags == self.ANALYZER_EXACT:
            # exact matching: the text is passed on without escaping
            lucene_analyzer = self.ExactAnalyzer()
        else:
            text = _escape_term_value(text)
            lucene_analyzer = PyLucene.StandardAnalyzer()
        parser = PyLucene.QueryParser(analyzer=lucene_analyzer)
        if require_all:
            default_operator = parser.Operator.AND
        else:
            default_operator = parser.Operator.OR
        parser.setDefaultOperator(default_operator)
        wants_partial = (flags & self.ANALYZER_PARTIAL) > 0
        if wants_partial:
            # PyLucene uses explicit wildcards for partial matching
            text = text + "*"
        return parser.parse(text)
示例#9
0
    def __init__(self, store_dir=STORE_DIR, destroy=False, analyzer=None):
        """Open the index directory ``store_dir``, creating it if necessary.

        With ``destroy`` set, an existing directory is removed first. When
        the directory did not exist (any more), ``self.create`` is set and a
        writer is opened and closed once to initialize the index.
        ``analyzer`` defaults to a ``lucene.StandardAnalyzer``.
        """
        self.store_dir = store_dir
        self.create = False

        if destroy and os.path.exists(self.store_dir):
            shutil.rmtree(self.store_dir)

        if not os.path.exists(self.store_dir):
            try:
                os.makedirs(self.store_dir)
            except OSError:
                # best effort: a failure here (e.g. the directory was
                # created in the meantime) is ignored, opening the store
                # below fails if the directory is really unusable
                pass
            self.create = True

        self.store = lucene.FSDirectory.getDirectory(self.store_dir,
                                                     self.create)
        self.analyzer = analyzer or lucene.StandardAnalyzer()
        if self.create:
            # opening and closing a writer initializes the segment
            self.get_writer(self.create).close()
示例#10
0
    def search(self,
               query_string='',
               require_visible=True,
               allow_curated=True):
        """Run a user query against the search index.

        The query string is disassembled and reassembled into a required
        clause; depending on the flags, further clauses are appended:

        - ``allow_curated`` false: exclude records with curated status
        - ``require_visible`` true: require an article type in the range
          traditional..curated and a curated record status

        Returns a tuple ``(hits, searcher)``. The searcher is returned still
        open on success. If anything fails, the error is logged, a searcher
        that was already opened is closed and ``([], None)`` is returned.
        """

        hits = []
        # bound up front so that the error handler can always inspect it
        searcher = None
        query_string = str(query_string)
        self.logger.info('Performing search: %s', query_string)
        disassembled_query = disassemble_user_query(query_string)
        self.logger.debug('Disassembled query: %s', str(disassembled_query))
        reassembled_query = '+(%s)' % reassemble_user_query(disassembled_query)
        self.logger.debug('Reassembled query: %s', reassembled_query)

        if not allow_curated:
            reassembled_query += \
                ' -record-status:%s' % canary.loader.QueuedRecord.STATUS_CURATED

        if require_visible:
            reassembled_query += ' +article-type:[%s TO %s]' % \
                (Study.ARTICLE_TYPES['traditional'],
                Study.ARTICLE_TYPES['curated'])
            reassembled_query += ' +record-status:%s' % \
                canary.loader.QueuedRecord.STATUS_CURATED

        try:
            searcher = PyLucene.IndexSearcher(
                PyLucene.FSDirectory.getDirectory(
                    self.context.config.search_index_dir, False))
            analyzer = PyLucene.StandardAnalyzer()
            query_parser = PyLucene.QueryParser('all', analyzer)
            query_parser.setOperator(PyLucene.QueryParser.DEFAULT_OPERATOR_AND)
            query = query_parser.parseQuery(reassembled_query)
            self.logger.info('Search query: %s', query)
            hits = searcher.search(query)
            return hits, searcher
        except Exception as e:
            self.logger.error('Search failed: %s', e)
            # The caller only gets ([], None) and therefore cannot close the
            # searcher itself - release it here instead of leaking it.
            if searcher is not None:
                try:
                    searcher.close()
                except Exception as close_error:
                    self.logger.error('Could not close searcher: %s',
                                      close_error)
            return [], None
示例#11
0
文件: indexer.py 项目: iecnu/bmybbs
def index_files(board, time_delta):
    """Build the recent-posts index of a board from scratch.

    The files returned by ``get_all_files(board, time_delta)`` are read
    through ``filter_file``; file content, title and owner are decoded from
    GBK. Files that do not exist, cannot be decoded or have no content are
    skipped. Every remaining file becomes one document with the fields
    ``name``, ``owner``, ``title`` and ``contents``. The index is optimized
    after all files were added; the writer is closed in any case.
    """
    board_dir = BOARDSPATH + board + '/'
    store = PyLucene.FSDirectory.getDirectory(board_dir + RECENT_INDEX, True)
    writer = PyLucene.IndexWriter(store, PyLucene.StandardAnalyzer(), True)
    try:
        writer.setMaxFieldLength(1048576)  # 1MB

        for filename, owner, title in get_all_files(board, time_delta):
            path = board_dir + filename
            if not os.path.exists(path):
                continue

            f = open(path, 'r')
            try:
                contents = filter_file(f)
            finally:
                # release the file handle even if filtering fails
                f.close()
            debug(contents)
            try:
                title = title.decode('gbk')
                owner = owner.decode('gbk')
                contents = unicode(contents, 'gbk')
            except UnicodeDecodeError:
                # skip files that are not valid GBK
                debug(filename)
                continue

            if not contents:
                continue

            doc = PyLucene.Document()
            doc.add(
                PyLucene.Field("name", filename, PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
            doc.add(
                PyLucene.Field("owner", owner, PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
            doc.add(
                PyLucene.Field("title", title, PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))
            doc.add(
                PyLucene.Field("contents", contents, PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))
            writer.addDocument(doc)
            debug('adding ' + filename)
        writer.optimize()
    finally:
        # close the writer even when indexing fails half-way
        writer.close()
示例#12
0
    def search_node_by_attribute2(self, att_type, att_value):
        """Search nodes by attribute type and/or attribute value.

        - only ``att_type`` given: it is parsed as a query against the
          ``COLUMN_ATTRIBUTE_TYPE_NID`` field
        - only ``att_value`` given: it is parsed as a query against the
          ``COLUMN_ATTRIBUTE_VALUE`` field
        - both given: the type clause and the value are combined with AND

        An empty string means "not given". The searcher on the ``index``
        directory is opened on first use and kept in ``self.searcher``.
        The hits are converted with ``hits_to_list``.

        Raises ValueError when both arguments are empty, as there is nothing
        to search for (this used to fail with an UnboundLocalError).
        """
        has_type = att_type != ""
        has_value = att_value != ""
        if not (has_type or has_value):
            raise ValueError("att_type and att_value must not both be empty")

        if self.searcher is None:
            # lazily open the searcher and reuse it for later calls
            self.searcher = PyLucene.IndexSearcher("index")

        analyzer = PyLucene.StandardAnalyzer()

        if has_type and has_value:
            parser = PyLucene.QueryParser(COLUMN_ATTRIBUTE_VALUE, analyzer)
            query_text = (COLUMN_ATTRIBUTE_TYPE_NID + ":" + att_type +
                          " AND " + att_value)
        elif has_type:
            parser = PyLucene.QueryParser(COLUMN_ATTRIBUTE_TYPE_NID, analyzer)
            query_text = att_type
        else:
            parser = PyLucene.QueryParser(COLUMN_ATTRIBUTE_VALUE, analyzer)
            query_text = att_value
        query = parser.parse(query_text)

        hits = self.searcher.search(query)
        result = self.hits_to_list(hits)

        return result
示例#13
0
    def _create_query_for_string(self, text, require_all=True,
            analyzer=None):
        """Build a query from a plain query string.

        The string is parsed by a PyLucene query parser that is bound to
        C{UNNAMED_FIELD_NAME}; its default operator and analyzer object are
        selected from the arguments.

        @param text: the query string
        @type text: str
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: the analyzer flags to be used (C{self.analyzer} is
            used when omitted); possible analyzers are:
             -  L{CommonDatabase.ANALYZER_TOKENIZE}
                    the field value is split to be matched word-wise
             -  L{CommonDatabase.ANALYZER_PARTIAL}
                    the field value must start with the query string
             -  L{CommonDatabase.ANALYZER_EXACT}
                    keep special characters and the like
        @type analyzer: int
        @return: resulting query object
        @rtype: PyLucene.Query
        """
        flags = analyzer
        if flags is None:
            # fall back to the default flags of this database
            flags = self.analyzer
        if flags == self.ANALYZER_EXACT:
            lucene_analyzer = PyLucene.KeywordAnalyzer()
        else:
            # every non-exact mode escapes the text before it is parsed
            text = _escape_term_value(text)
            lucene_analyzer = PyLucene.StandardAnalyzer()
        parser = PyLucene.QueryParser(UNNAMED_FIELD_NAME, lucene_analyzer)
        wants_partial = (flags & self.ANALYZER_PARTIAL) > 0
        if wants_partial:
            # PyLucene uses explicit wildcards for partial matching
            text = text + "*"
        if require_all:
            default_operator = parser.Operator.AND
        else:
            default_operator = parser.Operator.OR
        parser.setDefaultOperator(default_operator)
        return parser.parse(text)
示例#14
0
                        PyLucene.Field("name", filename,
                                       PyLucene.Field.Store.YES,
                                       PyLucene.Field.Index.UN_TOKENIZED))
                    doc.add(
                        PyLucene.Field("path", path, PyLucene.Field.Store.YES,
                                       PyLucene.Field.Index.UN_TOKENIZED))
                    if len(contents) > 0:
                        doc.add(
                            PyLucene.Field("contents", contents,
                                           PyLucene.Field.Store.YES,
                                           PyLucene.Field.Index.TOKENIZED))
                    else:
                        print "warning: no content in %s" % filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "Failed in indexDocs:", e


# Script entry point: index the directory given as the first command line
# argument into the "index" directory and report the elapsed time.
if __name__ == '__main__':
    # without an argument: show the usage text (the IndexFiles docstring)
    # and exit with status 1
    if len(sys.argv) < 2:
        print IndexFiles.__doc__
        sys.exit(1)
    print 'PyLucene', PyLucene.VERSION, 'Lucene', PyLucene.LUCENE_VERSION
    start = datetime.now()
    try:
        IndexFiles(sys.argv[1], "index", PyLucene.StandardAnalyzer())
        end = datetime.now()
        # elapsed time of the indexing run
        print end - start
    except Exception, e:
        # any failure is only reported; no explicit exit status is set here
        print "Failed: ", e
    def create_index(self):
        """Post download setup callback for creating a lucene index.

        A new index is created in the ``index`` directory (which is created
        if missing). The URL database is walked in pre-order; every web-page
        or document whose file exists on disk is added as one document with
        the fields ``name`` (``file://`` + file name), ``path`` (URL) and -
        if the file has content - ``contents``. Each URL object index is
        handled at most once. Documents that PyLucene rejects are reported
        and skipped.

        NOTE(review): the writer is neither optimized nor closed in this
        method - confirm that this happens elsewhere.
        """

        info("Creating lucene index")

        # URL object indices that were already handled
        seen_indices = []

        urldb = objects.datamgr.get_urldb()

        storeDir = "index"
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        lucene_writer = PyLucene.IndexWriter(store,
                                             PyLucene.StandardAnalyzer(), True)
        lucene_writer.setMaxFieldLength(1048576)

        for node in urldb.preorder():
            urlobj = node.get()

            # Only index if web-page or document
            if not urlobj.is_webpage() and not urlobj.is_document():
                continue

            filename = urlobj.get_full_filename()
            url = urlobj.get_full_url()

            url_index = urlobj.index
            if url_index in seen_indices:
                continue
            seen_indices.append(url_index)

            if not os.path.isfile(filename):
                continue

            data = ''

            extrainfo('Adding index for URL', url)

            infile = open(filename)
            try:
                raw = infile.read()
            finally:
                # release the file handle even if reading fails
                infile.close()
            try:
                data = unicode(raw, 'iso-8859-1')
            except UnicodeDecodeError:
                data = ''

            try:
                doc = PyLucene.Document()
                doc.add(
                    PyLucene.Field("name", 'file://' + filename,
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.UN_TOKENIZED))
                doc.add(
                    PyLucene.Field("path", url, PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.UN_TOKENIZED))
                if data:
                    doc.add(
                        PyLucene.Field("contents", data,
                                       PyLucene.Field.Store.YES,
                                       PyLucene.Field.Index.TOKENIZED))
                else:
                    # the document is still added, just without contents
                    warning("warning: no content in %s" % filename)

                lucene_writer.addDocument(doc)
            except PyLucene.JavaError as e:
                # report the failure and go on with the next URL
                print(e)
示例#16
0
 def _analyzer(self):
     """Return a new PyLucene StandardAnalyzer built from an empty list."""
     # presumably the list is the analyzer's stop-word list, i.e. no stop
     # words are configured - TODO confirm against the PyLucene API
     analyzer_args = []
     return PyLucene.StandardAnalyzer(analyzer_args)
示例#17
0
    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """Initialize or open an indexing database.

        Any derived class must override __init__.

        The current thread is attached to the PyLucene JVM and the base class
        is initialized. If no index can be opened at ``self.location``, a new
        one is created (when ``create_allowed`` is true). Afterwards a reader
        and a searcher are opened, retrying up to ten times.

        :raise ValueError: The given location exists, but the database type
                           is incompatible (e.g. created by a different indexing engine)
        :raise OSError: the database failed to initialize

        :param basedir: The parent directory of the database
        :type basedir: str
        :param analyzer: Bitwise combination of possible analyzer flags
                         to be used as the default analyzer for this database.
                         Leave it empty to use the system default analyzer
                         (self.ANALYZER_DEFAULT). See self.ANALYZER_TOKENIZE,
                         self.ANALYZER_PARTIAL, ...
        :type analyzer: int
        :param create_allowed: create the database, if necessary; default: True
        :type create_allowed: bool
        """
        jvm = PyLucene.getVMEnv()
        jvm.attachCurrentThread()
        super(PyLuceneDatabase, self).__init__(basedir,
                                               analyzer=analyzer,
                                               create_allowed=create_allowed)
        self.pyl_analyzer = PyLucene.StandardAnalyzer()
        self.writer = None
        self.reader = None
        self.index_version = None
        try:
            # try to open an existing database
            tempreader = PyLucene.IndexReader.open(self.location)
            tempreader.close()
        except PyLucene.JavaError:
            # The index could not be opened. This may be a real problem
            # instead of an absence of an index, but it is handled by
            # creating the index, so we can open cached readers on it.
            # TODO: emit the error as debug output
            if not create_allowed:
                raise OSError("Indexer: skipping database creation")
            # create the parent directory if it does not exist
            parent_path = os.path.dirname(self.location)
            try:
                if not os.path.isdir(parent_path):
                    # recursively create all directories up to parent_path
                    os.makedirs(parent_path)
            except EnvironmentError as err_msg:
                # os.makedirs reports failures as OSError (not IOError);
                # EnvironmentError covers both of them
                raise OSError("Indexer: failed to create the parent "
                              "directory (%s) of the indexing database: %s" %
                              (parent_path, err_msg))
            try:
                tempwriter = PyLucene.IndexWriter(self.location,
                                                  self.pyl_analyzer, True)
                tempwriter.close()
            except PyLucene.JavaError as err_msg:
                raise OSError("Indexer: failed to open or create a Lucene"
                              " database (%s): %s" % (self.location, err_msg))
        # the indexer is initialized - now we prepare the searcher
        # windows file locking seems inconsistent, so we try 10 times
        # (locking via self.dir_lock is currently disabled)
        numtries = 0
        while numtries < 10:
            try:
                self.reader = PyLucene.IndexReader.open(self.location)
                # NOTE(review): this sets "indexVersion" while
                # "index_version" is initialized above - confirm which
                # attribute is meant to be used
                self.indexVersion = self.reader.getCurrentVersion(
                    self.location)
                self.searcher = PyLucene.IndexSearcher(self.reader)
                break
            except PyLucene.JavaError as e:
                # store error message for possible later re-raise (below)
                lock_error_msg = e
                time.sleep(0.01)
                numtries += 1
        else:
            # locking failed for 10 times
            raise OSError("Indexer: failed to lock index database"
                          " (%s)" % lock_error_msg)
        # initialize the searcher and the reader
        self._index_refresh()
示例#18
0
 def get_search_index_writer(self, clobber=False):
     """Return a new IndexWriter on the configured search index directory.

     ``clobber`` is passed on to both ``FSDirectory.getDirectory`` and
     ``IndexWriter`` (presumably Lucene's "create" flag, i.e. True starts
     with a fresh index - TODO confirm).
     """
     index_dir = self.config.search_index_dir
     directory = PyLucene.FSDirectory.getDirectory(index_dir, clobber)
     analyzer = PyLucene.StandardAnalyzer()
     return PyLucene.IndexWriter(directory, analyzer, clobber)