예제 #1
0
class Writer(Indexer):
	"""Indexer front-end that saves documents and commits them in batches.

	In the default single-writer mode, every ``commit_count``-th call to
	``save`` commits the current writer and obtains a fresh one.  Once
	``set_multiSegmentWriter`` has been called, automatic commits are
	disabled and the caller has to invoke ``commit`` explicitly.
	"""

	def __init__(self, schema, name, index_type, commit_count = 5):
		"""Initialise the base ``Indexer`` and open the first writer.

		``schema``, ``name`` and ``index_type`` are forwarded unchanged to
		``Indexer.__init__``.  ``commit_count`` is the number of saved items
		that triggers an automatic commit in single-writer mode.
		"""
		super(Writer, self).__init__(schema, name, index_type)
		self.__commit_count = commit_count

		# NOTE(review): self.index is presumably set up by Indexer.__init__ --
		# it is not assigned anywhere in this class.
		self.writer = self.get_writer()
		self.__count = 0
		self.__isMultiSegment = False

	def get_writer(self):
		"""Return a new writer obtained from ``self.index``."""
		return self.index.writer()

	def set_multiSegmentWriter(self, limitmb = 128, procs = 4):
		"""Replace the current writer with a ``MultiSegmentWriter``.

		``limitmb`` and ``procs`` are handed to ``MultiSegmentWriter`` as
		keyword arguments of the same name.  After this call ``save`` no
		longer commits automatically.
		"""
		self.__isMultiSegment = True
		# Pass the tuning values by keyword: passed positionally they depend on
		# the constructor's parameter order and could be bound to the wrong
		# parameters (e.g. limitmb being taken as the process count).
		# NOTE(review): the previously opened writer is replaced without being
		# committed or cancelled -- confirm it holds no pending documents.
		self.writer = MultiSegmentWriter(self.index, procs = procs, limitmb = limitmb)

	def save(self, item):
		"""Update the document described by the ``item`` mapping.

		``item`` is expanded into keyword arguments for ``update_document``.
		In single-writer mode the writer is committed and replaced once the
		number of saved items reaches ``commit_count``.
		"""
		self.writer.update_document(**item)
		self.__count += 1

		if not self.__isMultiSegment and self.__count == self.__commit_count:
			self.commit()
			# A new writer is fetched right after the commit so the next
			# save() has one to work with.
			self.writer = self.get_writer()
			self.__count = 0

	def commit(self):
		"""Commit the current writer."""
		self.writer.commit()
예제 #2
0
    def indexer(self, create=True):
        """Prepare ``self.writer`` and ``self._procdoc`` for indexing.

        The index directory is ``<options.dir>/<options.indexname>_whoosh``
        and is created when missing.  When ``create`` is true a new index is
        created there using the spec's Whoosh schema; otherwise the existing
        index is opened.  The writer is a ``MultiSegmentWriter`` when
        ``options.expw`` is set, else the index's own writer.
        """
        spec = self.bench.spec
        schema = spec.whoosh_schema()
        opts = self.options
        index_dir = os.path.join(opts.dir, "%s_whoosh" % opts.indexname)

        if not os.path.exists(index_dir):
            os.mkdir(index_dir)
        ix = index.create_in(index_dir, schema) if create else index.open_dir(index_dir)

        # Optional pool class, looked up from the name given in the options.
        poolclass = find_object(opts.pool) if opts.pool else None

        writer_args = {
            "limitmb": int(opts.limitmb),
            "poolclass": poolclass,
            "dir": opts.tempdir,
            "procs": int(opts.procs),
            "batchsize": int(opts.batch),
        }

        if not opts.expw:
            self.writer = ix.writer(**writer_args)
        else:
            from whoosh.filedb.multiproc import MultiSegmentWriter
            self.writer = MultiSegmentWriter(ix, **writer_args)

        # Optional per-document hook supplied by the spec.
        if hasattr(spec, "process_document_whoosh"):
            self._procdoc = spec.process_document_whoosh
        else:
            self._procdoc = None
예제 #3
0
def create_index(use_multiprocessing=False):
    """Rebuild the Whoosh index in ``WHOOSH_INDEX_DIR`` from scratch.

    Any existing index directory is removed, a new index with the fields
    ``id``, ``slug``, ``title`` and ``description`` is created, and every
    document returned by ``get_documents()`` is added to it.

    Args:
        use_multiprocessing: When true, index through ``MultiSegmentWriter``
            (``limitmb=128``); otherwise use the index's regular writer
            (``limitmb=256``).
    """
    schema_fields = {
        'id': NUMERIC(stored=True),
        'slug': TEXT,
        'title': TEXT,
        'description': TEXT,
    }
    schema = Schema(**schema_fields)

    # Start from an empty directory so no stale segments survive.
    if os.path.exists(WHOOSH_INDEX_DIR):
        shutil.rmtree(WHOOSH_INDEX_DIR)
    os.mkdir(WHOOSH_INDEX_DIR)

    ix = create_in(WHOOSH_INDEX_DIR, schema)
    try:
        if use_multiprocessing:
            writer = MultiSegmentWriter(ix, limitmb=128)
        else:
            writer = ix.writer(limitmb=256)

        # Used as a context manager, the writer commits when the block
        # succeeds and cancels (releasing its lock) when it raises, so a bad
        # document no longer leaves a half-open writer behind.
        with writer:
            for doc in get_documents():
                writer.add_document(**doc)
    finally:
        # Close the index even when indexing failed.
        ix.close()
예제 #4
0
파일: bench.py 프로젝트: MapofLife/MOL
class WhooshModule(Module):
    """``Module`` implementation that indexes and searches through Whoosh.

    The index lives in ``<options.dir>/<options.indexname>_whoosh``.
    """

    def indexer(self, create=True):
        """Prepare ``self.writer`` and ``self._procdoc`` for indexing.

        When ``create`` is true a new index is created from the spec's Whoosh
        schema, otherwise the existing index is opened.  The index directory
        is created if it does not exist yet.
        """
        schema = self.bench.spec.whoosh_schema()
        path = os.path.join(self.options.dir, "%s_whoosh" % self.options.indexname)

        if not os.path.exists(path):
            os.mkdir(path)
        if create:
            ix = index.create_in(path, schema)
        else:
            ix = index.open_dir(path)

        # Optional pool class, looked up from the name given in the options.
        poolclass = None
        if self.options.pool:
            poolclass = find_object(self.options.pool)

        kwargs = dict(limitmb=int(self.options.limitmb), poolclass=poolclass,
                      dir=self.options.tempdir, procs=int(self.options.procs),
                      batchsize=int(self.options.batch))

        # options.expw selects the multi-process writer.
        if self.options.expw:
            from whoosh.filedb.multiproc import MultiSegmentWriter
            self.writer = MultiSegmentWriter(ix, **kwargs)
        else:
            self.writer = ix.writer(**kwargs)

        # Optional per-document hook supplied by the spec.
        self._procdoc = None
        if hasattr(self.bench.spec, "process_document_whoosh"):
            self._procdoc = self.bench.spec.process_document_whoosh

    def index_document(self, d):
        """Add the document ``d`` (a mapping of field values) to the writer.

        If the spec provided a ``process_document_whoosh`` hook it is called
        with ``d`` first; its return value is ignored.
        """
        _procdoc = self._procdoc
        if _procdoc:
            _procdoc(d)
        self.writer.add_document(**d)

    def finish(self, merge=True, optimize=False):
        """Commit the writer, forwarding ``merge`` and ``optimize``."""
        self.writer.commit(merge=merge, optimize=optimize)

    def searcher(self):
        """Open the index and set up ``self.srch`` and ``self.parser``."""
        path = os.path.join(self.options.dir, "%s_whoosh" % self.options.indexname)
        ix = index.open_dir(path)
        self.srch = ix.searcher()
        self.parser = qparser.QueryParser(self.bench.spec.main_field, schema=ix.schema)

    def query(self):
        """Return the parsed query built from the space-joined ``self.args``."""
        qstring = " ".join(self.args)
        # Only byte strings need decoding.  Text strings are passed through
        # as-is: they have no ``decode`` on Python 3, and on Python 2 decoding
        # a unicode string fails for non-ASCII input.
        if isinstance(qstring, bytes):
            qstring = qstring.decode("utf8")
        return self.parser.parse(qstring)

    def find(self, q):
        """Run query ``q`` on the open searcher, limited to ``options.limit`` hits."""
        return self.srch.search(q, limit=int(self.options.limit))

    def findterms(self, terms):
        """Yield the search results for each term in ``terms``.

        A single ``Term`` query on the spec's main field is reused; its text
        is overwritten for every term before searching.
        """
        limit = int(self.options.limit)
        s = self.srch
        q = query.Term(self.bench.spec.main_field, None)
        for term in terms:
            q.text = term
            yield s.search(q, limit=limit)
예제 #5
0
    def _modify_index(self, index, schema, wikiname, revids, mode='add', procs=1, limitmb=256):
        """
        modify index contents - add, update, delete the indexed documents for all given revids

        Note: mode == 'add' is faster but you need to make sure to not create duplicate
              documents in the index.

        :param index: index to modify; must provide ``writer()``
        :param schema: passed through to ``backend_to_index``
        :param wikiname: passed through to ``backend_to_index``
        :param revids: iterable of revision ids to process
        :param mode: 'add', 'update' or 'delete'
        :param procs: number of writer processes; 1 selects the simple writer
        :param limitmb: memory limit handed to MultiSegmentWriter (only used
                        when procs != 1)
        :raises ValueError: for an unsupported mode; raised while processing
                            the first revid, so never for empty ``revids``
        """
        if procs == 1:
            # MultiSegmentWriter sometimes has issues and is pointless for procs == 1,
            # so use the simple writer when --procs 1 is given:
            writer = index.writer()
        else:
            # Keyword arguments keep procs/limitmb bound to the right
            # parameters regardless of the constructor's positional order.
            writer = MultiSegmentWriter(index, procs=procs, limitmb=limitmb)
        # The writer is used as a context manager around the whole batch.
        with writer as writer:
            for revid in revids:
                if mode in ('add', 'update'):
                    # Only these modes need the revision converted to an index document.
                    meta, data = self.backend.retrieve(revid)
                    content = convert_to_indexable(meta, data)
                    doc = backend_to_index(meta, content, schema, wikiname)
                if mode == 'update':
                    writer.update_document(**doc)
                elif mode == 'add':
                    writer.add_document(**doc)
                elif mode == 'delete':
                    writer.delete_by_term(REVID, revid)
                else:
                    # Format with a 1-tuple so a tuple-valued mode cannot break the message.
                    raise ValueError("mode must be 'update', 'add' or 'delete', not '%s'" % (mode, ))
예제 #6
0
파일: bench.py 프로젝트: MapofLife/MOL
    def indexer(self, create=True):
        """Set up ``self.writer`` and ``self._procdoc`` for an indexing run.

        The index is kept in ``<options.dir>/<options.indexname>_whoosh``; the
        directory is created when absent.  ``create`` chooses between
        creating a new index from the spec's Whoosh schema and opening the
        existing one.  ``options.expw`` selects ``MultiSegmentWriter`` instead
        of the index's own writer.
        """
        spec = self.bench.spec
        schema = spec.whoosh_schema()
        opts = self.options
        index_dir = os.path.join(opts.dir, "%s_whoosh" % opts.indexname)

        if not os.path.exists(index_dir):
            os.mkdir(index_dir)
        ix = index.create_in(index_dir, schema) if create else index.open_dir(index_dir)

        # Optional pool class, looked up from the name given in the options.
        poolclass = find_object(opts.pool) if opts.pool else None

        writer_args = {
            "limitmb": int(opts.limitmb),
            "poolclass": poolclass,
            "dir": opts.tempdir,
            "procs": int(opts.procs),
            "batchsize": int(opts.batch),
        }

        if not opts.expw:
            self.writer = ix.writer(**writer_args)
        else:
            from whoosh.filedb.multiproc import MultiSegmentWriter
            self.writer = MultiSegmentWriter(ix, **writer_args)

        # Optional per-document hook supplied by the spec.
        if hasattr(spec, "process_document_whoosh"):
            self._procdoc = spec.process_document_whoosh
        else:
            self._procdoc = None
예제 #7
0
	def set_multiSegmentWriter(self, limitmb = 128, procs = 4):
		"""Replace ``self.writer`` with a ``MultiSegmentWriter`` on ``self.index``.

		``limitmb`` and ``procs`` are handed to ``MultiSegmentWriter`` as
		keyword arguments of the same name, and the instance is flagged as
		being in multi-segment mode.
		"""
		self.__isMultiSegment = True
		# Pass the tuning values by keyword: passed positionally they depend on
		# the constructor's parameter order and could be bound to the wrong
		# parameters (e.g. limitmb being taken as the process count).
		self.writer = MultiSegmentWriter(self.index, procs = procs, limitmb = limitmb)
예제 #8
0
class WhooshModule(Module):
    """``Module`` implementation that indexes and searches through Whoosh.

    The index lives in ``<options.dir>/<options.indexname>_whoosh``.
    """

    def indexer(self, create=True):
        """Prepare ``self.writer`` and ``self._procdoc`` for indexing.

        When ``create`` is true a new index is created from the spec's Whoosh
        schema, otherwise the existing index is opened.  The index directory
        is created if it does not exist yet.
        """
        schema = self.bench.spec.whoosh_schema()
        path = os.path.join(self.options.dir,
                            "%s_whoosh" % self.options.indexname)

        if not os.path.exists(path):
            os.mkdir(path)
        if create:
            ix = index.create_in(path, schema)
        else:
            ix = index.open_dir(path)

        # Optional pool class, looked up from the name given in the options.
        poolclass = None
        if self.options.pool:
            poolclass = find_object(self.options.pool)

        kwargs = dict(limitmb=int(self.options.limitmb),
                      poolclass=poolclass,
                      dir=self.options.tempdir,
                      procs=int(self.options.procs),
                      batchsize=int(self.options.batch))

        # options.expw selects the multi-process writer.
        if self.options.expw:
            from whoosh.filedb.multiproc import MultiSegmentWriter
            self.writer = MultiSegmentWriter(ix, **kwargs)
        else:
            self.writer = ix.writer(**kwargs)

        # Optional per-document hook supplied by the spec.
        self._procdoc = None
        if hasattr(self.bench.spec, "process_document_whoosh"):
            self._procdoc = self.bench.spec.process_document_whoosh

    def index_document(self, d):
        """Add the document ``d`` (a mapping of field values) to the writer.

        If the spec provided a ``process_document_whoosh`` hook it is called
        with ``d`` first; its return value is ignored.
        """
        _procdoc = self._procdoc
        if _procdoc:
            _procdoc(d)
        self.writer.add_document(**d)

    def finish(self, merge=True, optimize=False):
        """Commit the writer, forwarding ``merge`` and ``optimize``."""
        self.writer.commit(merge=merge, optimize=optimize)

    def searcher(self):
        """Open the index and set up ``self.srch`` and ``self.parser``."""
        path = os.path.join(self.options.dir,
                            "%s_whoosh" % self.options.indexname)
        ix = index.open_dir(path)
        self.srch = ix.searcher()
        self.parser = qparser.QueryParser(self.bench.spec.main_field,
                                          schema=ix.schema)

    def query(self):
        """Return the parsed query built from the space-joined ``self.args``."""
        qstring = " ".join(self.args)
        # Only byte strings need decoding.  Text strings are passed through
        # as-is: they have no ``decode`` on Python 3, and on Python 2 decoding
        # a unicode string fails for non-ASCII input.
        if isinstance(qstring, bytes):
            qstring = qstring.decode("utf8")
        return self.parser.parse(qstring)

    def find(self, q):
        """Run query ``q`` on the open searcher, limited to ``options.limit`` hits."""
        return self.srch.search(q, limit=int(self.options.limit))

    def findterms(self, terms):
        """Yield the search results for each term in ``terms``.

        A single ``Term`` query on the spec's main field is reused; its text
        is overwritten for every term before searching.
        """
        limit = int(self.options.limit)
        s = self.srch
        q = query.Term(self.bench.spec.main_field, None)
        for term in terms:
            q.text = term
            yield s.search(q, limit=limit)
예제 #9
0
def test_multisegwriter():
    """Run the shared writer check against a four-process MultiSegmentWriter."""
    from whoosh.filedb.multiproc import MultiSegmentWriter

    def make_writer(ix):
        # Factory handed to _check_writer: builds the writer under test for ``ix``.
        return MultiSegmentWriter(ix, procs=4)

    _check_writer("multisegw", make_writer)