Python IndexWriter.optimize示例

编程语言: Python

命名空间/包名称: lupy.index.indexwriter

类/类型: IndexWriter

方法/功能: optimize

hotexamples.com的示例: 5

Python IndexWriter.optimize - 共找到5个示例。这些是从开源项目中提取的、最受好评的 lupy.index.indexwriter.IndexWriter.optimize 真实 Python 示例。您可以为示例评分，以帮助我们提高示例质量。

常用方法

显示隐藏

IndexWriter(4)

close(3)

addDocument(2)

optimize(2)

flushRamSegments(1)

mergeFactor(1)

示例#1

显示文件

文件： testLupy.py 项目： kjk/ipedia-palm

def index(fileName, limit):
    """Build a fresh index at C{g_indexPath} from the articles in C{fileName}.

    Only articles whose namespace is 0 are indexed. The loop stops once
    C{count >= limit}; the check runs after each successful add. Articles
    that fail to decode or index are counted and skipped. Afterwards the
    index is optimized and closed, and timing information plus the
    indexed/failed counts are printed.

    @param fileName: Passed to C{iterGetArticle}, which yields
        C{(title, ns, txt)} tuples.
    @param limit: Number of successfully indexed articles after which
        indexing stops.
    """
    startTiming()
    # create a new index in a directory
    indexer = IndexWriter(g_indexPath, True)
    try:
        # supposed to speed up indexing by avoiding disk i/o
        # that's how many documents to index in memory before flushing
        # to disk
        indexer.mergeFactor = 1100

        count = 0
        failedCount = 0
        for (title, ns, txt) in iterGetArticle(fileName):
            if ns != 0:
                continue
            try:
                # iso-8859-1 is the default encoding used in Wikipedia
                txt = txt.decode("iso-8859-1")
                title = title.decode("iso-8859-1")
                indexOneArticle(indexer, title, txt)
            except Exception:
                # Best effort: skip articles that cannot be decoded or
                # indexed. Catching Exception (not a bare except) lets
                # KeyboardInterrupt and SystemExit still stop the run.
                if g_fVerbose:
                    sys.stdout.write('failed to index %s' % title)
                failedCount += 1
                continue

            count += 1
            if count >= limit:
                break
            if count % 500 == 0:
                print("indexed %d articles" % count)

        # optimize() merges all the segments into one. It can be quite
        # an expensive operation, but it can save space and speed up
        # searches. Have a look in the index dir before and after to
        # compare the number of segment files.
        indexer.optimize()
    finally:
        # Always release the writer, even if iteration or optimize() fails.
        indexer.close()
    endTiming()

    print("indexed %d articles" % count)
    print("failed to index %d articles" % failedCount)
    dumpTiming()

示例#2

显示文件

def index(fileName, limit):
    """Build a fresh index at C{g_indexPath} from the articles in C{fileName}.

    Only articles whose namespace is 0 are indexed. The loop stops once
    C{count >= limit}; the check runs after each successful add. Articles
    that fail to decode or index are counted and skipped. Afterwards the
    index is optimized and closed, and timing information plus the
    indexed/failed counts are printed.

    @param fileName: Passed to C{iterGetArticle}, which yields
        C{(title, ns, txt)} tuples.
    @param limit: Number of successfully indexed articles after which
        indexing stops.
    """
    startTiming()
    # create a new index in a directory
    indexer = IndexWriter(g_indexPath, True)
    try:
        # supposed to speed up indexing by avoiding disk i/o
        # that's how many documents to index in memory before flushing
        # to disk
        indexer.mergeFactor = 1100

        count = 0
        failedCount = 0
        for (title, ns, txt) in iterGetArticle(fileName):
            if ns != 0:
                continue
            try:
                # iso-8859-1 is the default encoding used in Wikipedia
                txt = txt.decode("iso-8859-1")
                title = title.decode("iso-8859-1")
                indexOneArticle(indexer, title, txt)
            except Exception:
                # Best effort: skip articles that cannot be decoded or
                # indexed. Catching Exception (not a bare except) lets
                # KeyboardInterrupt and SystemExit still stop the run.
                if g_fVerbose:
                    sys.stdout.write('failed to index %s' % title)
                failedCount += 1
                continue

            count += 1
            if count >= limit:
                break
            if count % 500 == 0:
                print("indexed %d articles" % count)

        # optimize() merges all the segments into one. It can be quite
        # an expensive operation, but it can save space and speed up
        # searches. Have a look in the index dir before and after to
        # compare the number of segment files.
        indexer.optimize()
    finally:
        # Always release the writer, even if iteration or optimize() fails.
        indexer.close()
    endTiming()

    print("indexed %d articles" % count)
    print("failed to index %d articles" % failedCount)
    dumpTiming()

示例#3

显示文件

文件： indexer.py 项目： Atom66/tain335

class Index:
    """Thin convenience wrapper around a Lupy index kept in one directory.

    The instance keeps at most one handle open at a time: C{self.indexer}
    (an C{IndexWriter}) for adding documents, or C{self.searcher} (an
    C{IndexSearcher}) for querying and single-term deletes. The private
    C{_setupIndexer} / C{_setupSearcher} helpers close one before opening
    the other.
    """

    def __init__(self, name, create=False, analyzer=None):
        """
        @param name: Name of the directory for this index.
        @param create: Whether to create this directory or not.
        @type create: boolean
        @param analyzer: Callable applied to query strings to split them
            into words; C{standardTokenizer} is used when omitted.
        """
        self.name = name
        self.analyzer = analyzer or standardTokenizer
        # Create the index if we need to. From here on we assume
        # that the index exists
        self.indexer = IndexWriter(self.name, create, analyzer)
        # Remember the default merge factor
        self.mergeFactor = self.indexer.mergeFactor
        # Clean up
        self.indexer.close()
        self.indexer = self.searcher = None

    def index(self, **kw):
        """Add a document to the index.

        **kw contains the name and values of each Field in the
        Document that we are creating.

        If the key in **kw starts with '_' the field will be created
        as a Keyword. If it starts with '__', it is created as a
        stored Text field (i.e. tokenized and stored), otherwise it
        will be created as a Text field. The leading '_' are removed
        before field creation.

        Text fields will have their value tokenized before
        indexing. The value is not stored in the index.  This is the
        usual type of field that you need for plain text.

        Keyword fields will not have their value tokenized.  The value
        is stored in the index and is returned with search hits on the
        Document. If you wanted to store the path to a document along
        with each document, you would use a Keyword field. The path
        would not be tokenized and its value would be returned in the
        query results, so you could easily open and display the file.
        """
        self._setupIndexer()

        # create document
        d = document.Document()

        # TODO - Please find another way of defining fields
        # than magic field names!!!

        # Turn every keyword argument into one field of the document.
        # startswith() is used instead of key[0] so that an empty key
        # does not raise IndexError.
        for key, value in kw.items():
            if key.startswith('__'):
                # Tokenized and stored
                f = document.Text(key[2:], value, True)
            elif key.startswith('_'):
                # Not tokenized and stored (keyword)
                f = document.Keyword(key[1:], value)
            else:
                # Tokenized and not stored
                f = document.Text(key, value, False)
            d.add(f)
        self.indexer.addDocument(d)

    def _setupIndexer(self):
        """Close any open searcher and make sure C{self.indexer} is open."""
        if self.searcher is not None:
            self.searcher.close()
            self.searcher = None
        if self.indexer is None:
            self.indexer = IndexWriter(self.name, False, self.analyzer)
            self.indexer.mergeFactor = self.mergeFactor

    def _setupSearcher(self):
        """Close any open indexer and make sure C{self.searcher} is open."""
        if self.indexer is not None:
            self.indexer.close()
            self.indexer = None
        if self.searcher is None:
            self.searcher = indexsearcher.IndexSearcher(self.name)

    def delete(self, **kw):
        """Delete the first document containing the specified term. See also L{deleteAll}.

        Exactly one keyword argument is expected: C{field=value}.

        @raise RuntimeError: If not exactly one keyword argument is given.
        """
        # Not very efficient for bulk deletes
        # Use deleteAll for bulk deletes
        # Validate before touching any handles, as findInField does.
        if len(kw) != 1:
            raise RuntimeError('one and only one field for the moment')
        self._setupSearcher()
        field, value = list(kw.items())[0]
        t = Term(field, value)
        self.searcher.reader.deleteTerm(t)

    def deleteAll(self, **kw):
        """Remove all documents containing this field and value.

        Exactly one keyword argument is expected: C{field=values}, where
        C{values} is an iterable; C{deleteTerm} is called once per value.

        @raise RuntimeError: If not exactly one keyword argument is given.
        """
        # Validate first so that a bad call neither closes the current
        # handles nor leaves a freshly opened reader unclosed.
        if len(kw) != 1:
            raise RuntimeError('one and only one field for the moment')
        self.close()
        reader = indexsearcher.open(self.name)
        field, values = list(kw.items())[0]
        for value in values:
            t = Term(field, value)
            reader.deleteTerm(t)
        # commit the deletes
        reader.close()

    def close(self):
        """Close whichever of the searcher and the indexer is open.

        Indexer and Searchers are different and we have to open the
        right kind for the operation we are performing. The actual
        creation is done in the index and find methods.
        """
        if self.searcher is not None:
            self.searcher.close()
            self.searcher = None
        if self.indexer is not None:
            self.indexer.close()
            self.indexer = None

    def flush(self):
        """Flush outstanding indexes to disk.

        This makes sure we are searching the latest stuff. Does nothing
        when no indexer is currently open.
        """
        if self.indexer is not None:
            self.indexer.flushRamSegments()

    def optimize(self):
        """Merge all on-disk segments into a single segment. Saves space and can speed up queries."""
        self._setupIndexer()
        self.indexer.optimize()

    def parse(self, field, qString):
        """Build a query object for C{qString} against C{field}.

        A string that starts with a double quote becomes a phrase query.
        Otherwise a single word becomes a term query, and any other
        number of words becomes a boolean query in which every word is
        required.
        """
        if qString.startswith('"'):
            qString = qString.strip('"')
            qWords = self._tokenize(qString)
            return self.phraseSearch(field, qWords)
        else:
            qWords = self._tokenize(qString)
            if len(qWords) == 1:
                return self.termSearch(field, qWords[0])
            else:
                return self.boolSearch(field, qWords)

    def _tokenize(self, qString):
        """Return the list of words C{self.analyzer} yields for C{qString}."""
        return list(self.analyzer(qString))

    def find(self, qStr):
        """Perform a search in any field in this index.

        If the search string is enclosed in double quotes, a phrase
        search will be run; otherwise, the search will be for
        documents containing all words specified.

        Returns an empty list when the index has no fields, otherwise
        whatever C{self.searcher.search} returns."""

        self._setupSearcher()

        fields = self.searcher.fieldNames()
        if not fields:
            return []
        queries = [self.parse(field, qStr) for field in fields]
        # Compare by value: identity of small ints is an implementation
        # detail of the interpreter.
        if len(queries) == 1:
            # simple case
            return self.searcher.search(queries[0])

        q = BooleanQuery()
        for query in queries:
            # OR all of the field queries
            q.add(query, False, False)
        hits = self.searcher.search(q)
        return hits

    def findInField(self, **kw):
        """Search only in a single field.

        Exactly one keyword argument is expected, e.g.
        C{index.findInField(text='flute')}.

        @raise RuntimeError: If not exactly one keyword argument is given.
        """
        if len(kw) != 1:
            raise RuntimeError('one and only one field for the moment')
        self._setupSearcher()
        field, query = list(kw.items())[0]
        q = self.parse(field, query)
        hits = self.searcher.search(q)
        return hits

    def termSearch(self, field, term):
        "Search for a single C{term} in a C{field}."
        t = Term(field, term)
        q = TermQuery(t)
        return q

    def phraseSearch(self, field, words):
        "Search for a phrase (given as a list of words) in C{field}."
        q = PhraseQuery()
        for word in words:
            t = Term(field, word)
            q.add(t)
        return q

    def boolSearch(self, field, ands=(), ors=(), nots=()):
        """Build a simple boolean query.

        Each word in C{ands} is equiv to +word
        Each word in C{ors} is equiv to word
        Each word in C{nots} is equiv to -word

        E.g. C{boolSearch('text', ['spam'], ['eggs'], ['parrot', 'cheese'])}
        is equiv to C{+spam eggs -parrot -cheese} in Google/Lucene syntax,
        applied to the C{text} field.
        """
        q = BooleanQuery()

        # The two flags passed to q.add() are (True, False) for +word,
        # (False, False) for plain word and (False, True) for -word.
        clauses = (
            (ands, True, False),
            (ors, False, False),
            (nots, False, True),
        )
        for words, flagA, flagB in clauses:
            for word in words:
                q.add(TermQuery(Term(field, word)), flagA, flagB)

        return q

    def printHits(self, hits):
        """Print each hit's document and score, or a notice when there are none."""
        if len(hits) == 0:
            print('Nothing found!')
        else:
            for i in range(len(hits)):
                # Single formatted argument so the output is the same
                # with the print statement and the print function.
                print('%s %s' % (hits.doc(i), hits.score(i)))

    def setMergeFactor(self, anInt):
        """Set how many documents will be processed before the indexes will be merged. Never less than 2.

        Values below 2 are ignored. The new value is also applied to the
        indexer when one is currently open; otherwise it takes effect
        the next time C{_setupIndexer} opens one.
        """
        # Never less than 2
        if anInt >= 2:
            self.mergeFactor = anInt
            if self.indexer is not None:
                self.indexer.mergeFactor = anInt

示例#4

显示文件

文件： search_lupy.py 项目： mrmaple/open_qon

 def _optimize_index(self):
     """Open a writer on C{self._lupy_index_dir}, optimize the index, and close the writer.

     The writer is opened with C{False} as its second argument and is
     closed even when C{optimize()} raises, so the handle is never left open.
     """
     index = IndexWriter(self._lupy_index_dir, False)
     try:
         index.optimize()
     finally:
         index.close()

示例#5

显示文件

class Index:
    """Thin convenience wrapper around a Lupy index kept in one directory.

    The instance keeps at most one handle open at a time: C{self.indexer}
    (an C{IndexWriter}) for adding documents, or C{self.searcher} (an
    C{IndexSearcher}) for querying and single-term deletes. The private
    C{_setupIndexer} / C{_setupSearcher} helpers close one before opening
    the other.
    """

    def __init__(self, name, create=False, analyzer=None):
        """
        @param name: Name of the directory for this index.
        @param create: Whether to create this directory or not.
        @type create: boolean
        @param analyzer: Callable applied to query strings to split them
            into words; C{standardTokenizer} is used when omitted.
        """
        self.name = name
        self.analyzer = analyzer or standardTokenizer
        # Create the index if we need to. From here on we assume
        # that the index exists
        self.indexer = IndexWriter(self.name, create, analyzer)
        # Remember the default merge factor
        self.mergeFactor = self.indexer.mergeFactor
        # Clean up
        self.indexer.close()
        self.indexer = self.searcher = None

    def index(self, **kw):
        """Add a document to the index.

        **kw contains the name and values of each Field in the
        Document that we are creating.

        If the key in **kw starts with '_' the field will be created
        as a Keyword. If it starts with '__', it is created as a
        stored Text field (i.e. tokenized and stored), otherwise it
        will be created as a Text field. The leading '_' are removed
        before field creation.

        Text fields will have their value tokenized before
        indexing. The value is not stored in the index.  This is the
        usual type of field that you need for plain text.

        Keyword fields will not have their value tokenized.  The value
        is stored in the index and is returned with search hits on the
        Document. If you wanted to store the path to a document along
        with each document, you would use a Keyword field. The path
        would not be tokenized and its value would be returned in the
        query results, so you could easily open and display the file.
        """
        self._setupIndexer()

        # create document
        d = document.Document()

        # TODO - Please find another way of defining fields
        # than magic field names!!!

        # Turn every keyword argument into one field of the document.
        # startswith() is used instead of key[0] so that an empty key
        # does not raise IndexError.
        for key, value in kw.items():
            if key.startswith('__'):
                # Tokenized and stored
                f = document.Text(key[2:], value, True)
            elif key.startswith('_'):
                # Not tokenized and stored (keyword)
                f = document.Keyword(key[1:], value)
            else:
                # Tokenized and not stored
                f = document.Text(key, value, False)
            d.add(f)
        self.indexer.addDocument(d)

    def _setupIndexer(self):
        """Close any open searcher and make sure C{self.indexer} is open."""
        if self.searcher is not None:
            self.searcher.close()
            self.searcher = None
        if self.indexer is None:
            self.indexer = IndexWriter(self.name, False, self.analyzer)
            self.indexer.mergeFactor = self.mergeFactor

    def _setupSearcher(self):
        """Close any open indexer and make sure C{self.searcher} is open."""
        if self.indexer is not None:
            self.indexer.close()
            self.indexer = None
        if self.searcher is None:
            self.searcher = indexsearcher.IndexSearcher(self.name)

    def delete(self, **kw):
        """Delete the first document containing the specified term. See also L{deleteAll}.

        Exactly one keyword argument is expected: C{field=value}.

        @raise RuntimeError: If not exactly one keyword argument is given.
        """
        # Not very efficient for bulk deletes
        # Use deleteAll for bulk deletes
        # Validate before touching any handles, as findInField does.
        if len(kw) != 1:
            raise RuntimeError('one and only one field for the moment')
        self._setupSearcher()
        field, value = list(kw.items())[0]
        t = Term(field, value)
        self.searcher.reader.deleteTerm(t)

    def deleteAll(self, **kw):
        """Remove all documents containing this field and value.

        Exactly one keyword argument is expected: C{field=values}, where
        C{values} is an iterable; C{deleteTerm} is called once per value.

        @raise RuntimeError: If not exactly one keyword argument is given.
        """
        # Validate first so that a bad call neither closes the current
        # handles nor leaves a freshly opened reader unclosed.
        if len(kw) != 1:
            raise RuntimeError('one and only one field for the moment')
        self.close()
        reader = indexsearcher.open(self.name)
        field, values = list(kw.items())[0]
        for value in values:
            t = Term(field, value)
            reader.deleteTerm(t)
        # commit the deletes
        reader.close()

    def close(self):
        """Close whichever of the searcher and the indexer is open.

        Indexer and Searchers are different and we have to open the
        right kind for the operation we are performing. The actual
        creation is done in the index and find methods.
        """
        if self.searcher is not None:
            self.searcher.close()
            self.searcher = None
        if self.indexer is not None:
            self.indexer.close()
            self.indexer = None

    def flush(self):
        """Flush outstanding indexes to disk.

        This makes sure we are searching the latest stuff. Does nothing
        when no indexer is currently open.
        """
        if self.indexer is not None:
            self.indexer.flushRamSegments()

    def optimize(self):
        """Merge all on-disk segments into a single segment. Saves space and can speed up queries."""
        self._setupIndexer()
        self.indexer.optimize()

    def parse(self, field, qString):
        """Build a query object for C{qString} against C{field}.

        A string that starts with a double quote becomes a phrase query.
        Otherwise a single word becomes a term query, and any other
        number of words becomes a boolean query in which every word is
        required.
        """
        if qString.startswith('"'):
            qString = qString.strip('"')
            qWords = self._tokenize(qString)
            return self.phraseSearch(field, qWords)
        else:
            qWords = self._tokenize(qString)
            if len(qWords) == 1:
                return self.termSearch(field, qWords[0])
            else:
                return self.boolSearch(field, qWords)

    def _tokenize(self, qString):
        """Return the list of words C{self.analyzer} yields for C{qString}."""
        return list(self.analyzer(qString))

    def find(self, qStr):
        """Perform a search in any field in this index.

        If the search string is enclosed in double quotes, a phrase
        search will be run; otherwise, the search will be for
        documents containing all words specified.

        Returns an empty list when the index has no fields, otherwise
        whatever C{self.searcher.search} returns."""

        self._setupSearcher()

        fields = self.searcher.fieldNames()
        if not fields:
            return []
        queries = [self.parse(field, qStr) for field in fields]
        # Compare by value: identity of small ints is an implementation
        # detail of the interpreter.
        if len(queries) == 1:
            # simple case
            return self.searcher.search(queries[0])

        q = BooleanQuery()
        for query in queries:
            # OR all of the field queries
            q.add(query, False, False)
        hits = self.searcher.search(q)
        return hits

    def findInField(self, **kw):
        """Search only in a single field.

        Exactly one keyword argument is expected, e.g.
        C{index.findInField(text='flute')}.

        @raise RuntimeError: If not exactly one keyword argument is given.
        """
        if len(kw) != 1:
            raise RuntimeError('one and only one field for the moment')
        self._setupSearcher()
        field, query = list(kw.items())[0]
        q = self.parse(field, query)
        hits = self.searcher.search(q)
        return hits

    def termSearch(self, field, term):
        "Search for a single C{term} in a C{field}."
        t = Term(field, term)
        q = TermQuery(t)
        return q

    def phraseSearch(self, field, words):
        "Search for a phrase (given as a list of words) in C{field}."
        q = PhraseQuery()
        for word in words:
            t = Term(field, word)
            q.add(t)
        return q

    def boolSearch(self, field, ands=(), ors=(), nots=()):
        """Build a simple boolean query.

        Each word in C{ands} is equiv to +word
        Each word in C{ors} is equiv to word
        Each word in C{nots} is equiv to -word

        E.g. C{boolSearch('text', ['spam'], ['eggs'], ['parrot', 'cheese'])}
        is equiv to C{+spam eggs -parrot -cheese} in Google/Lucene syntax,
        applied to the C{text} field.
        """
        q = BooleanQuery()

        # The two flags passed to q.add() are (True, False) for +word,
        # (False, False) for plain word and (False, True) for -word.
        clauses = (
            (ands, True, False),
            (ors, False, False),
            (nots, False, True),
        )
        for words, flagA, flagB in clauses:
            for word in words:
                q.add(TermQuery(Term(field, word)), flagA, flagB)

        return q

    def printHits(self, hits):
        """Print each hit's document and score, or a notice when there are none."""
        if len(hits) == 0:
            print('Nothing found!')
        else:
            for i in range(len(hits)):
                # Single formatted argument so the output is the same
                # with the print statement and the print function.
                print('%s %s' % (hits.doc(i), hits.score(i)))

    def setMergeFactor(self, anInt):
        """Set how many documents will be processed before the indexes will be merged. Never less than 2.

        Values below 2 are ignored. The new value is also applied to the
        indexer when one is currently open; otherwise it takes effect
        the next time C{_setupIndexer} opens one.
        """
        # Never less than 2
        if anInt >= 2:
            self.mergeFactor = anInt
            if self.indexer is not None:
                self.indexer.mergeFactor = anInt