Example #1
  def test_build_attrs(self):
    schema = Schema()
    adapter = SAAdapter(SANotIndexable, schema)
    self.assertEqual(adapter.indexable, False)
    self.assertEqual(adapter.doc_attrs, {})

    adapter = SAAdapter(Entity, schema)
    self.assertEqual(adapter.indexable, False)

    adapter = SAAdapter(SubclassEntityIndexable, schema)
    self.assertEqual(adapter.indexable, True)
    self.assertEqual(set(adapter.doc_attrs),
                     set(('object_key', 'id', 'name', 'object_type',
                          'text', 'created_at', 'updated_at', 'name_prefix',
                          'owner', 'owner_name', 'creator_name', 'creator')))
    # all(lambda ...) would always be true; test the values themselves.
    self.assertTrue(all(callable(f) for f in adapter.doc_attrs.values()))

    self.assertEqual(set(schema.names()),
                     set(('object_key', 'id', 'object_type', 'name',
                          'text', 'created_at', 'updated_at', 'name_prefix',
                          'owner', 'owner_name', 'creator_name', 'creator')))

    schema = Schema(
      id=NUMERIC(numtype=int, bits=64, signed=False, stored=True, unique=True),
    )
    adapter = SAAdapter(Indexable, schema)
    self.assertEqual(adapter.indexable, True)
    self.assertEqual(set(adapter.doc_attrs),
                     set(('id', 'text', 'num', 'name')))
    self.assertTrue(all(callable(f) for f in adapter.doc_attrs.values()))

    self.assertEqual(set(schema.names()),
                     set(('id', 'text', 'num', 'name')))
    self.assertTrue(isinstance(schema['text'], TEXT))
    self.assertTrue(isinstance(schema['num'], NUMERIC))
Example #2
def test_build_attrs():
    schema = Schema()
    adapter = SAAdapter(SANotIndexable, schema)
    assert not adapter.indexable
    assert adapter.doc_attrs == {}

    adapter = SAAdapter(Entity, schema)
    assert not adapter.indexable

    adapter = SAAdapter(SubclassEntityIndexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {
        "object_key",
        "id",
        "name",
        "slug",
        "object_type",
        "text",
        "created_at",
        "updated_at",
        "name_prefix",
        "owner",
        "owner_name",
        "creator_name",
        "creator",
        "allowed_roles_and_users",
        "tag_ids",
        "tag_text",
    }
    assert all(callable(f) for f in six.itervalues(adapter.doc_attrs))

    assert set(schema.names()) == {
        "object_key",
        "id",
        "object_type",
        "name",
        "slug",
        "text",
        "created_at",
        "updated_at",
        "name_prefix",
        "owner",
        "owner_name",
        "creator_name",
        "creator",
        "allowed_roles_and_users",
        "tag_ids",
        "tag_text",
    }

    schema = Schema(id=NUMERIC(bits=64, signed=False, stored=True, unique=True))
    adapter = SAAdapter(Indexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {"id", "text", "num", "name"}
    assert all(callable(f) for f in six.itervalues(adapter.doc_attrs))

    assert set(schema.names()) == {"id", "text", "num", "name"}
    assert isinstance(schema["text"], TEXT)
    assert isinstance(schema["num"], NUMERIC)
Example #3
def test_build_attrs_4() -> None:
    schema = Schema(
        id=NUMERIC(bits=64, signed=False, stored=True, unique=True))
    adapter = SAAdapter(Indexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {
        "id",
        "text",
        "num",
        "name",
        "object_type",
        "object_key",
    }
    assert all(callable(f) for f in adapter.doc_attrs.values())

    assert set(schema.names()) == {
        "id",
        "text",
        "num",
        "name",
        "object_type",
        "object_key",
    }
    assert isinstance(schema["text"], TEXT)
    assert isinstance(schema["num"], NUMERIC)
Example #4
  def test_build_attrs(self):
    schema = Schema()
    adapter = SAAdapter(SANotIndexable, schema)
    assert not adapter.indexable
    assert adapter.doc_attrs == {}

    adapter = SAAdapter(Entity, schema)
    assert not adapter.indexable

    adapter = SAAdapter(SubclassEntityIndexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {
      'object_key', 'id', 'name', 'slug', 'object_type',
      'text', 'created_at', 'updated_at', 'name_prefix',
      'owner', 'owner_name', 'creator_name', 'creator',
      'allowed_roles_and_users', 'tag_ids', 'tag_text',
    }
    assert all(callable(f) for f in adapter.doc_attrs.values())

    assert set(schema.names()) == {
      'object_key', 'id', 'object_type', 'name', 'slug',
      'text', 'created_at', 'updated_at', 'name_prefix',
      'owner', 'owner_name', 'creator_name', 'creator',
      'allowed_roles_and_users', 'tag_ids', 'tag_text',
    }

    schema = Schema(
      id=NUMERIC(numtype=int, bits=64, signed=False, stored=True, unique=True),
    )
    adapter = SAAdapter(Indexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {'id', 'text', 'num', 'name'}
    assert all(callable(f) for f in adapter.doc_attrs.values())

    assert set(schema.names()) == {'id', 'text', 'num', 'name'}
    assert isinstance(schema['text'], TEXT)
    assert isinstance(schema['num'], NUMERIC)
Example #5
    def __init__(self, model: t.Type[Model], schema: Schema, index_name: t.Optional[str] = None):
        """
        :param model: django.db.models.Model subclass whose instances are to be searched
        :param schema: field schema for the search index
        :param index_name: name of the search index
        ``pk_name`` is derived from ``model._meta.pk`` rather than passed in.
        """
        self.model = model
        self.schema = schema
        self.pk_name = model._meta.pk.name
        self.schema.add(self.pk_name, PK_FIELDTYPE)
        self.index = self._init_index(index_name)
        query_fields = set(schema.names()) - {self.pk_name}
        self.query_parser = LectorQueryParser(query_fields, self.schema)

        self._search_cache: t.MutableMapping[str, Results] = cachetools.TTLCache(64, 60.0)
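A minimal instantiation sketch for the adapter above. The enclosing class name (ModelSearchAdapter), the Django model (Article), and the final parse() call are illustrative assumptions; only the constructor behaviour is taken from the source:

# Sketch only: ModelSearchAdapter and Article are assumed names, not from the source.
from whoosh.fields import Schema, TEXT

schema = Schema(title=TEXT(stored=True), body=TEXT())
adapter = ModelSearchAdapter(Article, schema, index_name="articles")
# The constructor adds the model's primary-key field to the schema itself,
# then builds a query parser over the remaining fields.
query = adapter.query_parser.parse("title:whoosh")  # assumes a QueryParser-like parse()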
Example #6
def test_build_attrs_3() -> None:
    schema = Schema()
    adapter = SAAdapter(SubclassEntityIndexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {
        "allowed_roles_and_users",
        "created_at",
        "creator",
        "creator_name",
        "id",
        "name",
        "name_prefix",
        "object_key",
        "object_type",
        "owner",
        "owner_name",
        "slug",
        "tag_ids",
        "tag_text",
        "text",
        "updated_at",
    }
    assert all(callable(f) for f in adapter.doc_attrs.values())

    assert set(schema.names()) == {
        "allowed_roles_and_users",
        "created_at",
        "creator",
        "creator_name",
        "id",
        "name",
        "name_prefix",
        "object_key",
        "object_type",
        "owner",
        "owner_name",
        "slug",
        "tag_ids",
        "tag_text",
        "text",
        "updated_at",
    }
Example #7
def test_build_attrs_3():
    schema = Schema()
    adapter = SAAdapter(SubclassEntityIndexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {
        "allowed_roles_and_users",
        "created_at",
        "creator",
        "creator_name",
        "id",
        "name",
        "name_prefix",
        "object_key",
        "object_type",
        "owner",
        "owner_name",
        "slug",
        "tag_ids",
        "tag_text",
        "text",
        "updated_at",
    }
    assert all(callable(f) for f in adapter.doc_attrs.values())

    assert set(schema.names()) == {
        "allowed_roles_and_users",
        "created_at",
        "creator",
        "creator_name",
        "id",
        "name",
        "name_prefix",
        "object_key",
        "object_type",
        "owner",
        "owner_name",
        "slug",
        "tag_ids",
        "tag_text",
        "text",
        "updated_at",
    }
Example #8
def test_build_attrs_4():
    schema = Schema(id=NUMERIC(bits=64, signed=False, stored=True, unique=True))
    adapter = SAAdapter(Indexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {
        "id",
        "text",
        "num",
        "name",
        "object_type",
        "object_key",
    }
    assert all(callable(f) for f in adapter.doc_attrs.values())

    assert set(schema.names()) == {
        "id",
        "text",
        "num",
        "name",
        "object_type",
        "object_key",
    }
    assert isinstance(schema["text"], TEXT)
    assert isinstance(schema["num"], NUMERIC)
Example #9
from whoosh import fields
from whoosh.index import create_in

# Print the supported field types
print([item for item in dir(fields)[:10] if item.isupper()])  # ['BOOLEAN', 'COLUMN', 'DATETIME', 'ID', 'IDLIST', 'KEYWORD']
print(len(schema.items()))  # 3
print(schema.items()[0])  # ('content', TEXT(format=Positions(boost=1.0), scorable=True, stored=False, unique=None))
print(schema.items()[1])  # ('path', ID(format=Existence(boost=1.0), scorable=None, stored=True, unique=False))
print(schema.items()[2])  # ('title', TEXT(format=Positions(boost=1.0), scorable=True, stored=True, unique=None))
print(schema.names())  # ['content', 'path', 'title']; Returns a list of the names of the fields in this schema.
print(schema.scorable_names())  # ['content', 'title']; Returns a list of the names of fields that store field lengths.
print(schema.stored_names())  # ['path', 'title']; Returns a list of the names of fields that are stored.
print(schema.has_scorable_fields())  # True
##################################################################
## 2. Index creation
## create_in(dirname, schema, indexname=None)
## Convenience function to create an index in a directory. Takes care of creating a FileStorage object for you.
ix = create_in('./tmp', schema)  # Persists the index under ./tmp/; ** run this only once, otherwise a LockError is raised **
print(type(ix))  # <class 'whoosh.index.FileIndex'>
print(ix.schema)  # <Schema: ['content', 'path', 'title']>
Example #10
class WhooshStore(SAMLStoreBase):
    def __init__(self):
        self.schema = Schema(scopes=KEYWORD(),
                             descr=TEXT(),
                             service_name=TEXT(),
                             service_descr=TEXT(),
                             keywords=KEYWORD())
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        for a in ATTRS.keys():
            self.schema.add(a, KEYWORD())
        self._collections = set()
        from whoosh.filedb.filestore import RamStorage, FileStorage
        self.storage = RamStorage()
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self.objects = dict()
        self.infos = dict()

    def dump(self):
        ix = self.storage.open_index()
        print(ix.schema)
        from whoosh.query import Every
        with ix.searcher() as searcher:
            for result in searcher.search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        if 'entity_attributes' in info:
            for a, v in list(info.pop('entity_attributes').items()):
                info[a] = v
        # Iterate over a copy: the loop pops and re-inserts keys as it goes.
        for a, v in list(info.items()):
            if type(v) is not list and type(v) is not tuple:
                info[a] = [info.pop(a)]

            if a in ATTRS_INV:
                info[ATTRS_INV[a]] = info.pop(a)

        for a in list(info.keys()):
            if a not in self.schema.names():
                del info[a]

        for a, v in info.items():
            info[a] = [six.text_type(vv) for vv in v]

    def _index(self, e, tid=None):
        info = entity_info(e)
        if tid is not None:
            info['collection_id'] = tid
        self._index_prep(info)
        id = six.text_type(object_id(e))
        # mix in tid here
        self.infos[id] = info
        self.objects[id] = e
        ix = self.storage.open_index()
        with ix.writer() as writer:
            writer.add_document(object_id=id, **info)
            writer.mergetype = writing.CLEAR

    def update(self, t, tid=None, ts=None, merge_strategy=None):
        relt = root(t)
        assert (relt is not None)
        ne = 0

        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            self._index(relt)
            ne += 1
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            self._collections.add(tid)
            for e in iter_entities(t):
                self._index(e, tid=tid)
                ne += 1

        return ne

    def collections(self):
        return self._collections

    def reset(self):
        self.__init__()

    def size(self, a=None, v=None):
        if a is None:
            return len(self.objects.keys())
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield ATTRS[n]

    def attributes(self):
        return list(self._attributes())

    def attribute(self, a):
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return list(searcher.lexicon(n))
        else:
            return []

    def lookup(self, key, raw=True, field="entity_id"):
        if key == 'entities' or key is None:
            if raw:
                return self.objects.values()
            else:
                return self.infos.values()

        from whoosh.qparser import QueryParser
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        for uri, a in ATTRS_INV.items():
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub("([^=]+)=(\S+)", "\\1:\\2", key)
        key = re.sub("{([^}]+)}(\S+)", "\\1:\\2", key)
        key = key.strip()

        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                if raw:
                    lst.add(self.objects[result['object_id']])
                else:
                    lst.add(self.infos[result['object_id']])

        return list(lst)
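The key-rewriting block above turns attr=value and {uri}value pairs into Whoosh field:value query syntax before parsing. A standalone sketch of just that transformation (the sample key is illustrative):

# Demonstrates the key rewriting used by lookup(); the sample key is made up.
import re

key = " {!s} ".format("foo=bar".strip('+').replace('+', ' AND '))
key = re.sub(r"([^=]+)=(\S+)", r"\1:\2", key)
key = re.sub(r"{([^}]+)}(\S+)", r"\1:\2", key)
print(key.strip())  # 'foo:bar' -- ready for whoosh.qparser.QueryParser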
Example #11
File: store.py  Project: mrvanes/pyFFplus
class RedisWhooshStore(SAMLStoreBase):  # TODO: This needs a gc mechanism for keys (uuids)
    def json_dict(self, name):
        return LRUProxyDict(JSONDict(key='{}_{}'.format(self._name, name),
                                     redis=self._redis,
                                     writeback=True),
                            maxsize=config.cache_size)

    def xml_dict(self, name):
        return LRUProxyDict(XMLDict(key='{}_{}'.format(self._name, name),
                                    redis=self._redis,
                                    writeback=True),
                            maxsize=config.cache_size)

    def __init__(self, *args, **kwargs):
        self._dir = kwargs.pop('directory', '.whoosh')
        clear = bool(kwargs.pop('clear', config.store_clear))
        self._name = kwargs.pop('name', config.store_name)
        self._redis = kwargs.pop('redis', redis())
        if clear:
            shutil.rmtree(self._dir)
        now = datetime.now()
        self._last_index_time = now
        self._last_modified = now
        self._setup()
        if clear:
            self.reset()

    def _setup(self):
        self._redis = getattr(self, '_redis', None)
        if not self._redis:
            self._redis = redis()  # XXX test cases won't get correctly unpickled because of this
        self.schema = Schema(content=NGRAMWORDS(stored=False))
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        self.schema.add('sha1', ID(stored=True, unique=True))
        for a in list(ATTRS.keys()):
            self.schema.add(a, KEYWORD())
        self.objects = self.xml_dict('objects')
        self.parts = self.json_dict('parts')
        self.storage = FileStorage(os.path.join(self._dir, self._name))
        try:
            self.index = self.storage.open_index(schema=self.schema)
        except BaseException as ex:
            log.warning(ex)
            self.storage.create()
            self.index = self.storage.create_index(self.schema)
            self._reindex()

    def __getstate__(self):
        state = dict()
        for p in ('_dir', '_name', '_last_index_time', '_last_modified'):
            state[p] = getattr(self, p)
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        self._setup()

    def __call__(self, *args, **kwargs):
        watched = kwargs.pop('watched', None)
        scheduler = kwargs.pop('scheduler', None)
        if watched is not None and scheduler is not None:
            super(RedisWhooshStore, self).__call__(watched=watched,
                                                   scheduler=scheduler)
            log.debug("indexing using {}".format(scheduler))
            scheduler.add_job(RedisWhooshStore._reindex,
                              args=[self],
                              max_instances=1,
                              coalesce=True,
                              misfire_grace_time=2 * config.update_frequency)

    def _reindex(self):
        log.debug("indexing the store...")
        self._last_index_time = datetime.now()
        seen = set()
        refs = set([b2u(s) for s in self.objects.keys()])
        parts = self.parts.values()
        for ref in refs:
            for part in parts:
                if ref in part['items']:
                    seen.add(ref)

        ix = self.storage.open_index()
        lock = ix.lock("reindex")
        try:
            log.debug("waiting for index lock")
            lock.acquire(True)
            log.debug("got index lock")
            with ix.writer() as writer:
                for ref in refs:
                    if ref not in seen:
                        log.debug("removing unseen ref {}".format(ref))
                        del self.objects[ref]
                        del self.parts[ref]

                log.debug("updating index")
                for e in self.objects.values():
                    info = self._index_prep(entity_simple_info(e))
                    ref = object_id(e)
                    writer.add_document(object_id=ref, **info)

                writer.mergetype = CLEAR
        finally:
            try:
                log.debug("releasing index lock")
                lock.release()
            except ThreadError:
                pass

    def dump(self):
        ix = self.storage.open_index()
        from whoosh.query import Every
        with ix.searcher() as searcher:
            for result in searcher.search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        res = dict()
        if 'entity_attributes' in info:
            for a, v in list(info.pop('entity_attributes').items()):
                info[a] = v

        content = " ".join(
            filter(lambda x: x is not None, [
                info.get(x, '') for x in ('service_name', 'title', 'domain',
                                          'keywords', 'scopes')
            ]))
        res['content'] = content.strip()
        for a, v in info.items():
            k = a
            if a in ATTRS_INV:
                k = ATTRS_INV[a]

            if k in self.schema.names():
                if type(v) in (list, tuple):
                    res[k] = " ".join([vv.lower() for vv in v])
                elif isinstance(v, six.string_types):
                    res[k] = info[a].lower()
        res['sha1'] = hash_id(info['entity_id'], prefix=False)
        return res

    def update(self, t, tid=None, etag=None, lazy=True):
        relt = root(t)
        assert (relt is not None)

        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            ref = object_id(relt)
            parts = None
            if ref in self.parts:
                parts = self.parts[ref]
            if etag is not None and (parts is None
                                     or parts.get('etag', None) != etag):
                self.parts[ref] = {
                    'id': relt.get('entityID'),
                    'etag': etag,
                    'count': 1,
                    'items': [ref]
                }
                self.objects[ref] = relt
                self._last_modified = datetime.now()
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            if etag is None:
                etag = hex_digest(dumptree(t, pretty_print=False), 'sha256')
            parts = None
            if tid in self.parts:
                parts = self.parts[tid]
            if parts is None or parts.get('etag', None) != etag:
                items = set()
                for e in iter_entities(t):
                    ref = object_id(e)
                    items.add(ref)
                    self.objects[ref] = e
                self.parts[tid] = {
                    'id': tid,
                    'count': len(items),
                    'etag': etag,
                    'items': list(items)
                }
                self._last_modified = datetime.now()

        if not lazy:
            self._reindex()

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def collections(self):
        return [b2u(ref) for ref in self.parts.keys()]

    def reset(self):
        for k in ('{}_{}'.format(self._name, 'parts'),
                  '{}_{}'.format(self._name, 'objects')):
            self._redis.delete(k)

    def size(self, a=None, v=None):
        if a is None:
            return len(self.objects.keys())
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield b2u(ATTRS[n])

    def attributes(self):
        return b2u(list(self._attributes()))

    def attribute(self, a):
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return b2u(list(searcher.lexicon(n)))
        else:
            return []

    def _prep_key(self, key):
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        key = key.replace('-', ' AND NOT ')
        for uri, a in list(ATTRS_INV.items()):
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub("([^=]+)=(\S+)", "\\1:\\2", key)
        key = re.sub("{([^}]+)}(\S+)", "\\1:\\2", key)
        key = key.strip()

        return key

    def _entities(self):
        lst = set()
        for ref_data in self.parts.values():
            for ref in ref_data['items']:
                e = self.objects.get(ref, None)
                if e is not None:
                    lst.add(e)

        return b2u(list(lst))

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def lookup(self, key):
        if key == 'entities' or key is None:
            return self._entities()

        bkey = six.b(key)
        if bkey in self.objects:
            return [self.objects.get(bkey)]

        if bkey in self.parts:
            res = []
            part = self.parts.get(bkey)
            for item in part['items']:
                res.extend(self.lookup(item))
            return res

        key = self._prep_key(key)
        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                e = self.objects.get(result['object_id'], None)
                if e is not None:
                    lst.add(e)

        return b2u(list(lst))

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def search(self, query=None, path=None, entity_filter=None, related=None):
        if entity_filter:
            query = "{!s} AND {!s}".format(query, entity_filter)
        query = self._prep_key(query)
        qp = MultifieldParser(['content', 'domain'], schema=self.schema)
        q = qp.parse(query)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            log.debug(results)
            for result in results:
                lst.add(result['object_id'])

        res = list()
        for ref in lst:
            e = self.objects.get(ref, None)
            if e is not None:
                res.append(discojson(e))
        return res
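Both stores rebuild their index through a writer whose mergetype is set to CLEAR, which makes the commit replace all existing segments instead of merging into them. A minimal standalone sketch of that idiom (the index directory and document are illustrative):

# Sketch of the CLEAR-reindex idiom used by _index()/_reindex() above.
from whoosh import writing
from whoosh.fields import Schema, ID
from whoosh.filedb.filestore import FileStorage

storage = FileStorage("/tmp/clear_demo").create()  # illustrative path
ix = storage.create_index(Schema(object_id=ID(stored=True, unique=True)))
with ix.writer() as writer:
    writer.add_document(object_id=u"example")
    writer.mergetype = writing.CLEAR  # the commit on context exit drops old segments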
Example #12
    def test_build_attrs(self):
        schema = Schema()
        adapter = SAAdapter(SANotIndexable, schema)
        assert not adapter.indexable
        assert adapter.doc_attrs == {}

        adapter = SAAdapter(Entity, schema)
        assert not adapter.indexable

        adapter = SAAdapter(SubclassEntityIndexable, schema)
        assert adapter.indexable
        assert set(adapter.doc_attrs) == {
            'object_key',
            'id',
            'name',
            'slug',
            'object_type',
            'text',
            'created_at',
            'updated_at',
            'name_prefix',
            'owner',
            'owner_name',
            'creator_name',
            'creator',
            'allowed_roles_and_users',
            'tag_ids',
            'tag_text',
        }
        assert all(callable(f) for f in six.itervalues(adapter.doc_attrs))

        assert set(schema.names()) == {
            'object_key',
            'id',
            'object_type',
            'name',
            'slug',
            'text',
            'created_at',
            'updated_at',
            'name_prefix',
            'owner',
            'owner_name',
            'creator_name',
            'creator',
            'allowed_roles_and_users',
            'tag_ids',
            'tag_text',
        }

        schema = Schema(
            id=NUMERIC(numtype=int, bits=64, signed=False, stored=True, unique=True))
        adapter = SAAdapter(Indexable, schema)
        assert adapter.indexable
        assert set(adapter.doc_attrs) == {'id', 'text', 'num', 'name'}
        assert all(callable(f) for f in six.itervalues(adapter.doc_attrs))

        assert set(schema.names()) == {'id', 'text', 'num', 'name'}
        assert isinstance(schema['text'], TEXT)
        assert isinstance(schema['num'], NUMERIC)
Example #13
File: store.py  Project: leifj/pyFF
class WhooshStore(SAMLStoreBase):

    def __init__(self):
        self.schema = Schema(scopes=KEYWORD(),
                             descr=TEXT(),
                             service_name=TEXT(),
                             service_descr=TEXT(),
                             keywords=KEYWORD())
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        for a in list(ATTRS.keys()):
            self.schema.add(a, KEYWORD())
        self._collections = set()
        from whoosh.filedb.filestore import RamStorage, FileStorage
        self.storage = RamStorage()
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self.objects = dict()
        self.infos = dict()

    def dump(self):
        ix = self.storage.open_index()
        print(ix.schema)
        from whoosh.query import Every
        with ix.searcher() as searcher:
            for result in searcher.search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        if 'entity_attributes' in info:
            for a, v in list(info.pop('entity_attributes').items()):
                info[a] = v
        for a, v in list(info.items()):
            if type(v) is not list and type(v) is not tuple:
                info[a] = [info.pop(a)]

            if a in ATTRS_INV:
                info[ATTRS_INV[a]] = info.pop(a)

        for a in list(info.keys()):
            if a not in self.schema.names():
                del info[a]

        for a, v in list(info.items()):
            info[a] = [six.text_type(vv) for vv in v]

    def _index(self, e, tid=None):
        info = entity_info(e)
        if tid is not None:
            info['collection_id'] = tid
        self._index_prep(info)
        id = six.text_type(object_id(e))
        # mix in tid here
        self.infos[id] = info
        self.objects[id] = e
        ix = self.storage.open_index()
        with ix.writer() as writer:
            writer.add_document(object_id=id, **info)
            writer.mergetype = writing.CLEAR

    def update(self, t, tid=None, ts=None, merge_strategy=None):
        relt = root(t)
        assert (relt is not None)
        ne = 0

        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            self._index(relt)
            ne += 1
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            self._collections.add(tid)
            for e in iter_entities(t):
                self._index(e, tid=tid)
                ne += 1

        return ne

    def collections(self):
        return b2u(self._collections)

    def reset(self):
        self.__init__()

    def size(self, a=None, v=None):
        if a is None:
            return len(list(self.objects.keys()))
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield b2u(ATTRS[n])

    def attributes(self):
        return b2u(list(self._attributes()))

    def attribute(self, a):
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return b2u(list(searcher.lexicon(n)))
        else:
            return []

    def lookup(self, key, raw=True, field="entity_id"):
        if key == 'entities' or key is None:
            if raw:
                return b2u(list(self.objects.values()))
            else:
                return b2u(list(self.infos.values()))

        from whoosh.qparser import QueryParser
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        for uri, a in list(ATTRS_INV.items()):
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub("([^=]+)=(\S+)", "\\1:\\2", key)
        key = re.sub("{([^}]+)}(\S+)", "\\1:\\2", key)
        key = key.strip()

        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                if raw:
                    lst.add(self.objects[result['object_id']])
                else:
                    lst.add(self.infos[result['object_id']])

        return b2u(list(lst))
Example #14
##################################################################
## 1. Create the schema
from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in

schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)  # stored=True means the field's value can be returned with search results
# All keyword arguments to the constructor are treated as fieldname = fieldtype pairs.
# The fieldtype can be an instantiated FieldType object, or a FieldType sub-class
#     (in which case the Schema will instantiate it with the default constructor before adding it).
# For example: s = Schema(content=TEXT, title=TEXT(stored = True), tags=KEYWORD(stored = True))
# Search results usually only need the title and path; the body is read by opening the document,
#     which is why content is not declared with stored=True.
from whoosh import fields
# Print the supported field types
print([item for item in dir(fields)[:10] if item.isupper()])  # ['BOOLEAN', 'COLUMN', 'DATETIME', 'ID', 'IDLIST', 'KEYWORD']
print(len(schema.items()))  # 3
print(schema.items()[0])  # ('content', TEXT(format=Positions(boost=1.0), scorable=True, stored=False, unique=None))
print(schema.items()[1])  # ('path', ID(format=Existence(boost=1.0), scorable=None, stored=True, unique=False))
print(schema.items()[2])  # ('title', TEXT(format=Positions(boost=1.0), scorable=True, stored=True, unique=None))
print(schema.names())  # ['content', 'path', 'title']; Returns a list of the names of the fields in this schema.
print(schema.scorable_names())  # ['content', 'title']; Returns a list of the names of fields that store field lengths.
print(schema.stored_names())  # ['path', 'title']; Returns a list of the names of fields that are stored.
print(schema.has_scorable_fields())  # True
##################################################################
## 2. Index creation
## create_in(dirname, schema, indexname=None)
## Convenience function to create an index in a directory. Takes care of creating a FileStorage object for you.
ix = create_in('./tmp', schema)  # Persists the index under ./tmp/; ** run this only once, otherwise a LockError is raised **
print(type(ix))  # <class 'whoosh.index.FileIndex'>
print(ix.schema)  # <Schema: ['content', 'path', 'title']>
## writer(procs=1, **kwargs): Returns an IndexWriter object for this index.
writer = ix.writer()  # Add the documents to be indexed, following the schema definition
print(type(writer))  # <class 'whoosh.writing.SegmentWriter'>
print(writer.schema)  # <Schema: ['content', 'path', 'title']>
## add_document(**fields): The keyword arguments map field names to the values to index/store
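The excerpt stops at the add_document() doc line; a short continuation using the writer created above (the document values and the query string are illustrative):

# Continuation sketch -- the document values and the query are illustrative.
writer.add_document(title=u"First document", path=u"/a",
                    content=u"This is the first document we've added!")
writer.commit()  # flush the segment to ./tmp/ and release the write lock

from whoosh.qparser import QueryParser
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("first")
    for hit in searcher.search(query):
        print(hit['title'], hit['path'])  # only stored fields are available on a hit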
Example #15
class Indexer:
    SOURCES = ['file', 'internet']

    def __init__(self, source='file'):
        CURRENT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/search_engine"
        self.source = source if source == 'file' else 'internet'
        self.index_dir = "%s/%s_%s" % (CURRENT_DIR, 'indexdir', self.source)
        self.schema = Schema(title=TEXT(phrase=True, sortable=True, stored=True,
                                        field_boost=2.0, spelling=True, analyzer=StemmingAnalyzer()),
                             url=ID(stored=True),
                             body=TEXT(spelling=True, stored=True, analyzer=StemmingAnalyzer()))
        self.writer = None
        self.ix = None
        self.create_or_open_index()

    def add_document(self, doc, commit=True):
        # Reuse a pending writer so commit() can flush batched adds later.
        if self.writer is None:
            self.writer = self.ix.writer()
        self.writer.add_document(title=doc['title'], url=doc['url'], body=doc['body'])
        if commit:
            self.writer.commit()
            self.writer = None

    def commit(self):
        # Commit the pending writer, if any; a fresh writer has nothing to commit.
        if self.writer is not None:
            self.writer.commit()
            self.writer = None

    def get_doc(self, url):
        query = MultifieldParser(["url"], self.ix.schema).parse(url)
        return self.ix.searcher().search(query)

    def get_document_count(self):
        return self.ix.searcher().doc_count_all()

    def get_field_list(self):
        return self.schema.names()

    def get_word_count(self):
        return len(list(self.ix.searcher().lexicon("body")))

    def get_doc_list(self, page_number, pagelen=20):
        result = []
        for i, doc in enumerate(self.ix.searcher().documents()):
            if i in range((page_number - 1) * pagelen, (page_number) * pagelen):
                result.append({'index': i, 'title': doc['title'], 'url': doc['url'], 'body': doc['body'][:100]})
        return result

    def search(self, query_str, page_number):
        query = MultifieldParser(["body", "title"], self.ix.schema).parse(query_str)
        docs = self.ix.searcher().search_page(query, page_number, pagelen=20)
        result = []
        for doc in docs:
            result.append({'title': doc['title'], 'url': doc['url'], 'body': doc.highlights("body")})
        return result

    def clean_index(self):
        if index.exists_in(self.index_dir):
            self.ix = create_in(self.index_dir, self.schema)  # re-creating the index clears it

    def create_or_open_index(self):
        if index.exists_in(self.index_dir):
            self.ix = index.open_dir(self.index_dir)
        else:
            if not os.path.exists(self.index_dir):
                os.mkdir(self.index_dir)
            self.ix = create_in(self.index_dir, self.schema)
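A short usage sketch for this class (the document values and the query are illustrative assumptions):

# Usage sketch -- the document and the query are made-up values.
indexer = Indexer(source='file')
indexer.add_document({'title': 'Hello', 'url': 'http://example.com',
                      'body': 'hello world'})  # commits immediately by default
print(indexer.get_document_count())  # 1
print(indexer.search('hello', page_number=1))  # [{'title': 'Hello', 'url': ..., 'body': ...}]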