# Imports assumed from the Whoosh test modules these functions come from.
from nose.tools import assert_equal

from whoosh import analysis, fields
from whoosh.compat import u
from whoosh.filedb.filestore import RamStorage


def test_stored_fields2():
    schema = fields.Schema(content=fields.TEXT(stored=True),
                           title=fields.TEXT(stored=True),
                           summary=fields.STORED,
                           path=fields.ID(stored=True),
                           helpid=fields.KEYWORD,
                           parent=fields.KEYWORD,
                           context=fields.KEYWORD(stored=True),
                           type=fields.KEYWORD(stored=True),
                           status=fields.KEYWORD(stored=True),
                           superclass=fields.KEYWORD(stored=True),
                           exampleFor=fields.KEYWORD(stored=True),
                           chapter=fields.KEYWORD(stored=True),
                           replaces=fields.KEYWORD,
                           time=fields.STORED,
                           methods=fields.STORED,
                           exampleFile=fields.STORED)

    storedkeys = ["chapter", "content", "context", "exampleFile", "exampleFor",
                  "methods", "path", "status", "summary", "superclass", "time",
                  "title", "type"]
    assert_equal(storedkeys, schema.stored_names())

    st = RamStorage()
    ix = st.create_index(schema)

    writer = ix.writer()
    writer.add_document(content=u("Content of this document."),
                        title=u("This is the title"),
                        summary=u("This is the summary"), path=u("/main"))
    writer.add_document(content=u("Second document."), title=u("Second title"),
                        summary=u("Summary numero due"), path=u("/second"))
    writer.add_document(content=u("Third document."), title=u("Title 3"),
                        summary=u("Summary treo"), path=u("/san"))
    writer.commit()
    ix.close()

    ix = st.open_index()
    with ix.searcher() as s:
        doc = s.document(path="/main")
        assert doc is not None
        assert ([doc[k] for k in sorted(doc.keys())]
                == ["Content of this document.", "/main",
                    "This is the summary", "This is the title"])
    ix.close()
def test_pystemmer():
    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.PyStemmerFilter())
    schema = fields.Schema(text=fields.TEXT(analyzer=ana))
    st = RamStorage()
    ix = st.create_index(schema)
    with ix.writer() as w:
        w.add_document(text=u("rains falling strangely"))

    ix = st.open_index()
    with ix.writer() as w:
        w.add_document(text=u("pains stalling strongly"))

    ix = st.open_index()
    with ix.reader() as r:
        assert_equal(list(r.lexicon("text")),
                     ["fall", "pain", "rain", "stall", "strang", "strong"])
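# A minimal standalone sketch (not part of the original test suite) showing
# what the analyzer chain above produces; it assumes the PyStemmer package is
# installed. Running the chain directly yields the stemmed tokens the test
# expects to find in the index lexicon.
def _show_pystemmer_tokens():
    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.PyStemmerFilter())
    # Expected: ["rain", "fall", "strang"]
    return [t.text for t in ana(u("rains falling strangely"))]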
def test_persistent_cache():
    schema = fields.Schema(id=fields.ID(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)
    with ix.writer() as w:
        for term in u("charlie alfa echo bravo delta").split():
            w.add_document(id=term)

    # Load the field cache once so it gets written out to storage
    ix = st.open_index()
    with ix.reader() as r:
        _ = r.fieldcache("id")
        del _

    # On reopen the cache is available on disk but not loaded until requested
    ix = st.open_index()
    with ix.reader() as r:
        assert r.fieldcache_available("id")
        assert not r.fieldcache_loaded("id")
        fc = r.fieldcache("id")
        assert r.fieldcache_loaded("id")
        # order maps each document number to the position of its term in texts
        # (documents were added as charlie, alfa, echo, bravo, delta)
        assert_equal(list(fc.order), [3, 1, 5, 2, 4])
        assert_equal(list(fc.texts), [u('\uffff'), u('alfa'), u('bravo'),
                                      u('charlie'), u('delta'), u('echo')])
class WhooshStore(SAMLStoreBase):
    # Relies on pyFF helpers defined elsewhere in this package: SAMLStoreBase,
    # ATTRS, ATTRS_INV, NS, entity_info, iter_entities, object_id, root and
    # b2u, plus the module-level re, six and whoosh imports.

    def __init__(self):
        self.schema = Schema(scopes=KEYWORD(),
                             descr=TEXT(),
                             service_name=TEXT(),
                             service_descr=TEXT(),
                             keywords=KEYWORD())
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        for a in list(ATTRS.keys()):
            self.schema.add(a, KEYWORD())
        self._collections = set()
        from whoosh.filedb.filestore import RamStorage, FileStorage
        self.storage = RamStorage()
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self.objects = dict()
        self.infos = dict()

    def dump(self):
        ix = self.storage.open_index()
        print(ix.schema)
        from whoosh.query import Every
        with ix.searcher() as searcher:
            for result in searcher.search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        if 'entity_attributes' in info:
            for a, v in list(info.pop('entity_attributes').items()):
                info[a] = v
        for a, v in list(info.items()):
            if type(v) is not list and type(v) is not tuple:
                info[a] = [info.pop(a)]
            if a in ATTRS_INV:
                info[ATTRS_INV[a]] = info.pop(a)
        for a in list(info.keys()):
            if a not in self.schema.names():
                del info[a]
        for a, v in list(info.items()):
            info[a] = [six.text_type(vv) for vv in v]

    def _index(self, e, tid=None):
        info = entity_info(e)
        if tid is not None:
            info['collection_id'] = tid
        self._index_prep(info)
        id = six.text_type(object_id(e))  # mix in tid here
        self.infos[id] = info
        self.objects[id] = e
        ix = self.storage.open_index()
        with ix.writer() as writer:
            writer.add_document(object_id=id, **info)
            writer.mergetype = writing.CLEAR

    def update(self, t, tid=None, ts=None, merge_strategy=None):
        relt = root(t)
        assert (relt is not None)
        ne = 0
        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            self._index(relt)
            ne += 1
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            self._collections.add(tid)
            for e in iter_entities(t):
                self._index(e, tid=tid)
                ne += 1
        return ne

    def collections(self):
        return b2u(self._collections)

    def reset(self):
        self.__init__()

    def size(self, a=None, v=None):
        if a is None:
            return len(list(self.objects.keys()))
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield b2u(ATTRS[n])

    def attributes(self):
        return b2u(list(self._attributes()))

    def attribute(self, a):
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return b2u(list(searcher.lexicon(n)))
        else:
            return []

    def lookup(self, key, raw=True, field="entity_id"):
        if key == 'entities' or key is None:
            if raw:
                return b2u(list(self.objects.values()))
            else:
                return b2u(list(self.infos.values()))

        from whoosh.qparser import QueryParser
        # Rewrite "field=value" pairs joined by '+' into a Whoosh query string
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        for uri, a in list(ATTRS_INV.items()):
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub(r"([^=]+)=(\S+)", r"\1:\2", key)
        key = re.sub(r"{([^}]+)}(\S+)", r"\1:\2", key)
        key = key.strip()

        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                if raw:
                    lst.add(self.objects[result['object_id']])
                else:
                    lst.add(self.infos[result['object_id']])
        return b2u(list(lst))
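# A minimal, self-contained sketch (separate from WhooshStore, using an assumed
# toy schema and key) of the query rewriting that lookup() applies before
# handing the key to Whoosh's QueryParser: '+' joins clauses with AND, and each
# "field=value" pair becomes a Whoosh "field:value" term.
def _lookup_rewrite_example():
    import re
    from whoosh.fields import Schema, ID, KEYWORD
    from whoosh.qparser import QueryParser

    toy_schema = Schema(object_id=ID(stored=True, unique=True),
                        scopes=KEYWORD())
    key = "scopes=example.org+scopes=example.com"
    key = key.strip('+').replace('+', ' AND ')
    key = " {!s} ".format(key)
    key = re.sub(r"([^=]+)=(\S+)", r"\1:\2", key)
    key = re.sub(r"{([^}]+)}(\S+)", r"\1:\2", key)
    key = key.strip()
    # Parses to roughly: And([Term('scopes', 'example.org'),
    #                         Term('scopes', 'example.com')])
    return QueryParser("object_id", schema=toy_schema).parse(key)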