class Writer(Indexer):
    """Write documents to the index, committing in small batches.

    In the default mode a writer obtained from ``self.index.writer()`` is used;
    after every ``commit_count`` calls to :meth:`save` it is committed and
    replaced by a fresh writer. Once :meth:`set_multiSegmentWriter` has been
    called the automatic commit is switched off and :meth:`commit` has to be
    called explicitly.

    ``self.index`` is not assigned in this class; presumably it is provided by
    ``Indexer.__init__`` -- confirm against the base class.
    """

    def __init__(self, schema, name, index_type, commit_count=5):
        """Initialise the base indexer and open the first writer.

        :param schema: forwarded unchanged to ``Indexer.__init__``.
        :param name: forwarded unchanged to ``Indexer.__init__``.
        :param index_type: forwarded unchanged to ``Indexer.__init__``.
        :param commit_count: number of :meth:`save` calls between automatic
            commits (only used while not in multi-segment mode).
        """
        super(Writer, self).__init__(schema, name, index_type)
        self.__commit_count = commit_count
        self.writer = self.get_writer()
        self.__count = 0
        self.__isMultiSegment = False

    def get_writer(self):
        """Return a new writer from ``self.index``."""
        return self.index.writer()

    def set_multiSegmentWriter(self, limitmb=128, procs=4):
        """Switch to a ``MultiSegmentWriter`` and disable automatic commits.

        :param limitmb: forwarded to ``MultiSegmentWriter`` as ``limitmb``.
        :param procs: forwarded to ``MultiSegmentWriter`` as ``procs``.
        """
        # NOTE(review): the writer opened earlier is replaced here without
        # commit() or cancel(); confirm that any lock it holds does not block
        # the new writer and that no pending documents are lost.
        self.__isMultiSegment = True
        # Pass the options by keyword: positionally the value after the index
        # is not the memory limit, so (index, limitmb, procs) would hand the
        # numbers to the wrong parameters.
        self.writer = MultiSegmentWriter(self.index, procs=procs,
                                         limitmb=limitmb)

    def save(self, item):
        """Add or replace one document and auto-commit when a batch is full.

        :param item: mapping of field names to values; expanded as keyword
            arguments to ``writer.update_document``.
        """
        self.writer.update_document(**item)
        self.__count += 1
        # Batching only applies to the plain writer; in multi-segment mode the
        # caller decides when to commit.
        if not self.__isMultiSegment and self.__count == self.__commit_count:
            self.commit()
            # A fresh writer is fetched after each commit for the next batch.
            self.writer = self.get_writer()
            self.__count = 0

    def commit(self):
        """Commit the current writer."""
        self.writer.commit()
def indexer(self, create=True):
    """Prepare ``self.writer`` (and ``self._procdoc``) for indexing.

    The index lives in ``<options.dir>/<options.indexname>_whoosh``; the
    directory is created when missing. With ``create`` true a new index is
    created there using the benchmark spec's schema, otherwise the existing
    index is opened.

    The writer is a ``MultiSegmentWriter`` when ``options.expw`` is set and
    the index's own writer otherwise; both receive the same keyword options
    built from ``self.options``.
    """
    spec = self.bench.spec
    schema = spec.whoosh_schema()

    opts = self.options
    index_dir = os.path.join(opts.dir, "%s_whoosh" % opts.indexname)
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)

    ix = index.create_in(index_dir, schema) if create else index.open_dir(index_dir)

    # The pool class is optional and resolved from the configured name.
    poolclass = find_object(opts.pool) if opts.pool else None

    writer_args = {
        "limitmb": int(opts.limitmb),
        "poolclass": poolclass,
        "dir": opts.tempdir,
        "procs": int(opts.procs),
        "batchsize": int(opts.batch),
    }

    if opts.expw:
        # Imported lazily so the multiprocessing writer is only loaded when
        # it was actually requested.
        from whoosh.filedb.multiproc import MultiSegmentWriter
        self.writer = MultiSegmentWriter(ix, **writer_args)
    else:
        self.writer = ix.writer(**writer_args)

    # Optional per-document hook supplied by the benchmark spec.
    has_hook = hasattr(spec, "process_document_whoosh")
    self._procdoc = spec.process_document_whoosh if has_hook else None
def create_index(use_multiprocessing=False):
    """Rebuild the Whoosh index in ``WHOOSH_INDEX_DIR`` from scratch.

    Any existing index directory is removed first. Every document returned by
    ``get_documents()`` is then added and the writer is committed.

    :param use_multiprocessing: when true, index through a
        ``MultiSegmentWriter`` (``limitmb=128``); otherwise use the index's
        own writer (``limitmb=256``).
    """
    schema = Schema(
        id=NUMERIC(stored=True),
        slug=TEXT,
        title=TEXT,
        description=TEXT,
    )

    # Start from an empty directory so no stale segments survive.
    if os.path.exists(WHOOSH_INDEX_DIR):
        shutil.rmtree(WHOOSH_INDEX_DIR)
    os.mkdir(WHOOSH_INDEX_DIR)

    ix = create_in(WHOOSH_INDEX_DIR, schema)
    try:
        if use_multiprocessing:
            writer = MultiSegmentWriter(ix, limitmb=128)
        else:
            writer = ix.writer(limitmb=256)

        # As a context manager the writer commits on success and cancels if
        # anything below raises, so a failure does not leave it open.
        with writer:
            for doc in get_documents():
                writer.add_document(**doc)
    finally:
        # Close the index even when indexing failed.
        ix.close()
class WhooshModule(Module):
    """Benchmark module that indexes and searches through Whoosh.

    Configuration is read from ``self.options`` and the benchmark
    specification from ``self.bench.spec``.
    """

    def indexer(self, create=True):
        """Prepare ``self.writer`` (and ``self._procdoc``) for indexing.

        The index lives in ``<options.dir>/<options.indexname>_whoosh``; the
        directory is created when missing. With ``create`` true a new index
        is created using the spec's schema, otherwise the existing one is
        opened. ``options.expw`` selects ``MultiSegmentWriter`` over the
        index's own writer.
        """
        spec = self.bench.spec
        schema = spec.whoosh_schema()

        opts = self.options
        index_dir = os.path.join(opts.dir, "%s_whoosh" % opts.indexname)
        if not os.path.exists(index_dir):
            os.mkdir(index_dir)

        ix = index.create_in(index_dir, schema) if create else index.open_dir(index_dir)

        # The pool class is optional and resolved from the configured name.
        poolclass = find_object(opts.pool) if opts.pool else None

        writer_args = {
            "limitmb": int(opts.limitmb),
            "poolclass": poolclass,
            "dir": opts.tempdir,
            "procs": int(opts.procs),
            "batchsize": int(opts.batch),
        }

        if opts.expw:
            # Imported lazily so the multiprocessing writer is only loaded
            # when it was actually requested.
            from whoosh.filedb.multiproc import MultiSegmentWriter
            self.writer = MultiSegmentWriter(ix, **writer_args)
        else:
            self.writer = ix.writer(**writer_args)

        # Optional per-document hook supplied by the benchmark spec.
        has_hook = hasattr(spec, "process_document_whoosh")
        self._procdoc = spec.process_document_whoosh if has_hook else None

    def index_document(self, d):
        """Run the optional spec hook on ``d``, then add it to the writer."""
        hook = self._procdoc
        if hook:
            hook(d)
        self.writer.add_document(**d)

    def finish(self, merge=True, optimize=False):
        """Commit the writer, forwarding ``merge`` and ``optimize``."""
        self.writer.commit(merge=merge, optimize=optimize)

    def searcher(self):
        """Open the index and set up ``self.srch`` and ``self.parser``.

        The parser targets the spec's ``main_field`` and uses the schema of
        the opened index.
        """
        opts = self.options
        index_dir = os.path.join(opts.dir, "%s_whoosh" % opts.indexname)
        ix = index.open_dir(index_dir)
        self.srch = ix.searcher()
        self.parser = qparser.QueryParser(self.bench.spec.main_field,
                                          schema=ix.schema)

    def query(self):
        """Parse ``self.args`` (joined by spaces, decoded as UTF-8)."""
        joined = " ".join(self.args)
        return self.parser.parse(joined.decode("utf8"))

    def find(self, q):
        """Search for ``q``, returning at most ``options.limit`` results."""
        search = self.srch.search
        max_hits = int(self.options.limit)
        return search(q, limit=max_hits)

    def findterms(self, terms):
        """Yield the search results for each term in ``terms``.

        A single ``Term`` query on the spec's ``main_field`` is created once
        and its ``text`` is overwritten for every term.
        """
        max_hits = int(self.options.limit)
        srch = self.srch
        term_query = query.Term(self.bench.spec.main_field, None)
        for text in terms:
            term_query.text = text
            yield srch.search(term_query, limit=max_hits)
def _modify_index(self, index, schema, wikiname, revids, mode='add', procs=1, limitmb=256):
    """
    modify index contents - add, update, delete the indexed documents for all given revids

    Note: mode == 'add' is faster but you need to make sure to not create duplicate
          documents in the index.

    :param index: index to modify; must provide ``writer()``.
    :param schema: passed to ``backend_to_index`` when building documents.
    :param wikiname: passed to ``backend_to_index`` when building documents.
    :param revids: iterable of revision ids to process.
    :param mode: ``'add'``, ``'update'`` or ``'delete'``.
    :param procs: number of processes; ``1`` selects the simple writer,
                  anything else a ``MultiSegmentWriter``.
    :param limitmb: forwarded to ``MultiSegmentWriter`` only; the simple
                    writer is created without it.
    :raises ValueError: when a revid is processed with an unknown ``mode``.
    """
    if procs == 1:
        # MultiSegmentWriter sometimes has issues and is pointless for procs == 1,
        # so use the simple writer when --procs 1 is given:
        writer = index.writer()
    else:
        # Pass the options by keyword: the third positional parameter of
        # MultiSegmentWriter is not guaranteed to be the memory limit.
        writer = MultiSegmentWriter(index, procs=procs, limitmb=limitmb)
    with writer as writer:
        for revid in revids:
            if mode in ['add', 'update', ]:
                meta, data = self.backend.retrieve(revid)
                content = convert_to_indexable(meta, data)
                doc = backend_to_index(meta, content, schema, wikiname)
            if mode == 'update':
                writer.update_document(**doc)
            elif mode == 'add':
                writer.add_document(**doc)
            elif mode == 'delete':
                writer.delete_by_term(REVID, revid)
            else:
                raise ValueError("mode must be 'update', 'add' or 'delete', not '%s'" % mode)
def set_multiSegmentWriter(self, limitmb=128, procs=4):
    """Switch ``self.writer`` to a ``MultiSegmentWriter`` on ``self.index``.

    Also sets the multi-segment flag on the instance.

    :param limitmb: forwarded to ``MultiSegmentWriter`` as ``limitmb``.
    :param procs: forwarded to ``MultiSegmentWriter`` as ``procs``.
    """
    self.__isMultiSegment = True
    # Pass the options by keyword: positionally the value after the index is
    # not the memory limit, so (index, limitmb, procs) would hand the numbers
    # to the wrong parameters.
    self.writer = MultiSegmentWriter(self.index, procs=procs, limitmb=limitmb)
def test_multisegwriter():
    """Run ``_check_writer`` with a four-process ``MultiSegmentWriter`` factory."""
    from whoosh.filedb.multiproc import MultiSegmentWriter

    def make_writer(ix):
        # Factory handed to the checker: builds the writer for a given index.
        return MultiSegmentWriter(ix, procs=4)

    _check_writer("multisegw", make_writer)