Example #1
    def _setupIndexer(self):
        # Keep only one of writer/searcher open at a time: close the
        # searcher before (re)opening the index for writing.
        if self.searcher is not None:
            self.searcher.close()
            self.searcher = None
        if self.indexer is None:
            # create=False: open the existing index rather than recreating it
            self.indexer = IndexWriter(self.name, False, self.analyzer)
            self.indexer.mergeFactor = self.mergeFactor
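
This method is one half of a toggle: the wrapper keeps either a writer or a searcher open, never both. A minimal sketch of the mirror-image method is shown below; the IndexSearcher class name is an assumption, since only IndexWriter appears in these examples.

    def _setupSearcher(self):
        # Mirror image of _setupIndexer: flush and close the writer,
        # then open the index for read-only searching.
        if self.indexer is not None:
            self.indexer.close()
            self.indexer = None
        if self.searcher is None:
            self.searcher = IndexSearcher(self.name)  # assumed class name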
Example #2
    def __init__(self, name, create=False, analyzer=None):
        """
        @param name: Name of the directory for this index.
        @param create: Whether to create this directory or not.
        @type create: boolean
        """

        self.name = name
        self.analyzer = analyzer or standardTokenizer
        # Create the index if we need to. From here on we assume
        # that the index exists
        self.indexer = IndexWriter(self.name, create, self.analyzer)
        # Remember the default merge factor
        self.mergeFactor = self.indexer.mergeFactor
        # Clean up
        self.indexer.close()
        self.indexer = self.searcher = None
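
A hedged usage sketch of this constructor; the wrapper class name Index is an assumption, as the excerpt does not show the class statement.

# Hypothetical usage; the class name `Index` is assumed.
idx = Index('wiki-index', create=True)   # create a fresh index directory
idx = Index('wiki-index')                # later: reopen the existing index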
Example #3
import sys

def index(fileName, limit):
    startTiming()
    # create a new index in a directory
    indexer = IndexWriter(g_indexPath, True)

    # mergeFactor controls how many documents are buffered in memory
    # before being flushed to disk; buffering more documents speeds up
    # indexing by avoiding disk I/O
    indexer.mergeFactor = 1100

    count = 0
    failedCount = 0
    for (title, ns, txt) in iterGetArticle(fileName):
        if ns != 0:
            continue
        try:
            # iso-8859-1 is the default encoding used in Wikipedia
            txt = txt.decode("iso-8859-1")
            title = title.decode("iso-8859-1")
            indexOneArticle(indexer, title, txt)
            count += 1
            if count >= limit:
                break
            if count % 500 == 0:
                print "indexed %d articles" % count
        except Exception:
            # for now just ignore possible decoding errors
            if g_fVerbose:
                sys.stdout.write('failed to index %s\n' % title)
            failedCount += 1

    # optimize() merges all the index segments into one. Before it
    # runs, the index directory typically contains dozens of files
    # from several segments. It can be quite an expensive operation,
    # but it saves space and speeds up searches.
    indexer.optimize()
    indexer.close()
    endTiming()

    print "indexed %d articles" % count
    print "failed to index %d articles" % failedCount
    dumpTiming()
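
Example #3 calls startTiming, endTiming, and dumpTiming helpers that are not shown. A minimal sketch of what they might look like, using time.time() and module-level state; this is an assumption, not the original code.

import time

g_startTime = None
g_elapsed = None

def startTiming():
    # record the wall-clock start time
    global g_startTime
    g_startTime = time.time()

def endTiming():
    # compute elapsed seconds since startTiming()
    global g_elapsed
    g_elapsed = time.time() - g_startTime

def dumpTiming():
    print "took %.2f seconds" % g_elapsed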
Example #4
    def __init__(self, path, create=False):
        """Create an indexer, writing and index to the directory B{path}.
        The boolean flag B{create} determines whether the index is created
        (overwriting an existing index) or updated"""

        self.indexer = IndexWriter(path, create)
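
A hedged usage sketch; the enclosing class name Indexer is an assumption, and the close() call follows the pattern shown in Example #2.

# Hypothetical usage; the class name `Indexer` is assumed.
ix = Indexer('/tmp/test-index', create=True)  # create, overwriting any index
ix.indexer.close()                            # release the index when done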