def _setupIndexer(self):
    """Put the index into writing mode.

    Any open searcher is closed and dropped first. If no writer is open
    yet, an ``IndexWriter`` is opened on ``self.name`` (without
    re-creating the index) using ``self.analyzer``, and the remembered
    merge factor is applied to it.
    """
    open_searcher = self.searcher
    if open_searcher is not None:
        open_searcher.close()
        self.searcher = None

    if self.indexer is not None:
        # A writer is already open; nothing more to do.
        return

    self.indexer = IndexWriter(self.name, False, self.analyzer)
    self.indexer.mergeFactor = self.mergeFactor
def __init__(self, name, create=False, analyzer=None):
    """Set up an index stored in the directory ``name``.

    A writer is opened once up front (creating the index when ``create``
    is true) so that everything afterwards can assume the index exists,
    and so that the writer's default merge factor can be recorded. The
    writer is closed again before returning; ``self.indexer`` and
    ``self.searcher`` both start out as ``None``.

    @param name: Name of the directory for this index.
    @param create: Whether to create this directory or not.
    @type create: boolean
    @param analyzer: Analyzer handed to the index writer. When omitted
        (or otherwise falsy) ``standardTokenizer`` is used instead.
    """
    self.name = name
    self.analyzer = analyzer or standardTokenizer

    # Create the index if we need to. From here on we assume
    # that the index exists. The resolved analyzer is passed (rather
    # than the raw argument) so this writer is configured the same way
    # as any writer later built from self.analyzer.
    writer = IndexWriter(self.name, create, self.analyzer)
    try:
        # Remember the default merge factor
        self.mergeFactor = writer.mergeFactor
    finally:
        # Clean up, even if reading the merge factor failed
        writer.close()

    self.indexer = self.searcher = None
def index(fileName, limit):
    """Build a fresh index at ``g_indexPath`` from the articles in ``fileName``.

    Articles are read with ``iterGetArticle``; only those whose namespace
    is 0 are indexed. Title and text are decoded as iso-8859-1 before
    being handed to ``indexOneArticle``. Articles that fail are counted
    and skipped. Progress, the final counts and the timing are printed.

    @param fileName: Source file passed to ``iterGetArticle``.
    @param limit: Stop after this many articles have been indexed. The
        check happens after an article is indexed, so one article is
        still indexed when ``limit`` is zero or negative.
    """
    startTiming()
    # create a new index in a directory
    indexer = IndexWriter(g_indexPath, True)
    count = 0
    failedCount = 0
    try:
        # supposed to speed up indexing by avoiding disk i/o
        # that's how many documents to index in memory before flushing
        # to disk
        indexer.mergeFactor = 1100
        for (title, ns, txt) in iterGetArticle(fileName):
            if ns != 0:
                continue
            try:
                # iso-8859-1 is the default encoding used in Wikipedia
                txt = txt.decode("iso-8859-1")
                title = title.decode("iso-8859-1")
                indexOneArticle(indexer, title, txt)
            except (KeyboardInterrupt, SystemExit):
                # Never treat a user interrupt / interpreter exit as a
                # failed article.
                raise
            except Exception:
                # for now just ignore possible decoding/indexing errors
                if g_fVerbose:
                    sys.stdout.write('failed to index %s\n' % title)
                failedCount += 1
                continue
            count += 1
            if count >= limit:
                break
            if count % 500 == 0:
                print("indexed %d articles" % count)
        # Optimize the index. Before optimizing, the index dir will
        # probably contain dozens of files from several segments.
        # optimize() merges all the segments into one. It can be quite
        # an expensive operation, but it can save space and speed up
        # searches.
        indexer.optimize()
    finally:
        # Always release the writer, even when indexing is aborted.
        indexer.close()
    endTiming()
    print("indexed %d articles" % count)
    print("failed to index %d articles" % failedCount)
    dumpTiming()
def __init__(self, path, create=False):
    """Create an indexer, writing an index to the directory B{path}.

    The boolean flag B{create} determines whether the index is created
    (overwriting an existing index) or updated.
    """
    writer = IndexWriter(path, create)
    self.indexer = writer