class SearchMigrationTest(TestCase, TempDirMixin):
    """Verify fulltext index creation and searching in a fresh temp dir."""

    def setUp(self):
        self.create_temp()
        self.storage = FileStorage(self.tempdir)
        self.storage.create()

    def tearDown(self):
        self.remove_temp()

    def do_test(self):
        """Index one unit in both indices, then search every field for it."""
        ft = Fulltext()
        ft.storage = self.storage

        source_index = ft.get_source_index()
        self.assertIsNotNone(source_index)
        target_index = ft.get_target_index('cs')
        self.assertIsNotNone(target_index)

        w = source_index.writer()
        w.update_document(
            pk=1,
            source="source",
            context="context",
            location="location",
        )
        w.commit()

        w = target_index.writer()
        w.update_document(pk=1, target="target", comment="comment")
        w.commit()

        # Every indexed field must find the single document (pk == 1).
        for field in ('source', 'context', 'location', 'target'):
            self.assertEqual(ft.search(field, ['cs'], {field: True}), {1})

    def test_nonexisting(self):
        self.do_test()

    def test_nonexisting_dir(self):
        # Index creation must cope with the directory being gone entirely.
        shutil.rmtree(self.tempdir)
        self.tempdir = None
        self.do_test()
def test_storage_creation():
    """FileStorage.create builds its directory; destroy removes it again."""
    import tempfile
    import uuid
    from whoosh import fields
    from whoosh.filedb.filestore import FileStorage

    schema = fields.Schema(text=fields.TEXT)
    # A uuid-based path is practically guaranteed not to exist yet.
    dirpath = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    assert not os.path.exists(dirpath)

    storage = FileStorage(dirpath)
    storage.create()
    assert os.path.exists(dirpath)

    index = storage.create_index(schema)
    with index.writer() as writer:
        writer.add_document(text=u("alfa bravo"))
        writer.add_document(text=u("bracho charlie"))

    storage.destroy()
    assert not os.path.exists(dirpath)
def _temp_storage(self, name=None):
    """Return a created FileStorage rooted in a fresh temporary directory.

    ``name`` is accepted for interface compatibility but is not used here.
    """
    return FileStorage(tempfile.mkdtemp()).create()
class SearchMigrationTest(TestCase, TempDirMixin):
    """Exercise fulltext migration against current and historical schemas."""

    def setUp(self):
        self.create_temp()
        # Swap the module-level storage for one rooted in the temp dir,
        # remembering the original so tearDown can restore it.
        self.backup = weblate.trans.search.STORAGE
        self.storage = FileStorage(self.tempdir)
        weblate.trans.search.STORAGE = self.storage
        self.storage.create()

    def tearDown(self):
        self.remove_temp()
        weblate.trans.search.STORAGE = self.backup

    def do_test(self, source, target):
        """Pre-create indices with the given schemas, then write and search."""
        if source is not None:
            self.storage.create_index(source, 'source')
        if target is not None:
            self.storage.create_index(target, 'target-cs')

        source_index = weblate.trans.search.get_source_index()
        self.assertIsNotNone(source_index)
        target_index = weblate.trans.search.get_target_index('cs')
        self.assertIsNotNone(target_index)

        writer = source_index.writer()
        writer.update_document(
            pk=1,
            source="source",
            context="context",
            location="location",
        )
        writer.commit()

        writer = target_index.writer()
        writer.update_document(pk=1, target="target", comment="comment")
        writer.commit()

        # Every indexed field must find the single document (pk == 1).
        for field in ('source', 'context', 'location', 'target'):
            self.assertEqual(fulltext_search(field, ['cs'], {field: True}), {1})

    def test_nonexisting(self):
        self.do_test(None, None)

    def test_nonexisting_dir(self):
        # Index creation must cope with the directory being gone entirely.
        shutil.rmtree(self.tempdir)
        self.tempdir = None
        self.do_test(None, None)

    def test_current(self):
        self.do_test(
            weblate.trans.search.SourceSchema,
            weblate.trans.search.TargetSchema,
        )

    def test_2_4(self):
        source = Schema(
            checksum=ID(stored=True, unique=True),
            source=TEXT(),
            context=TEXT(),
            location=TEXT(),
        )
        target = Schema(
            checksum=ID(stored=True, unique=True),
            target=TEXT(),
            comment=TEXT(),
        )
        self.do_test(source, target)

    def test_2_1(self):
        source = Schema(
            checksum=ID(stored=True, unique=True),
            source=TEXT(),
            context=TEXT(),
        )
        target = Schema(
            checksum=ID(stored=True, unique=True),
            target=TEXT(),
        )
        self.do_test(source, target)
def temp_storage(self, name=None):
    """Return a created FileStorage under the system temp directory.

    When ``name`` is omitted, a random ``*.tmp`` directory name is used.
    """
    target = name or "%s.tmp" % random_name()
    storage = FileStorage(os.path.join(tempfile.gettempdir(), target))
    return storage.create()
def setup_index():
    """Create the 'memory' data-dir storage and return a fresh TM index."""
    store = FileStorage(data_dir('memory'))
    store.create()
    return store.create_index(TMSchema())
class RedisWhooshStore(SAMLStoreBase):  # TODO: This needs a gc mechanism for keys (uuids)
    """SAML metadata store keeping entities in redis and a Whoosh fulltext index on disk.

    Objects (entity XML) and parts (collection bookkeeping) live in redis via
    LRU-proxied dicts; the Whoosh index under ``<directory>/<name>`` is rebuilt
    by :meth:`_reindex`, usually via a scheduler job.
    """

    def json_dict(self, name):
        """Return an LRU-proxied JSON dict stored under ``<name>_<suffix>`` in redis."""
        return LRUProxyDict(JSONDict(key='{}_{}'.format(self._name, name),
                                     redis=self._redis,
                                     writeback=True),
                            maxsize=config.cache_size)

    def xml_dict(self, name):
        """Return an LRU-proxied XML dict stored under ``<name>_<suffix>`` in redis."""
        return LRUProxyDict(XMLDict(key='{}_{}'.format(self._name, name),
                                    redis=self._redis,
                                    writeback=True),
                            maxsize=config.cache_size)

    def __init__(self, *args, **kwargs):
        self._dir = kwargs.pop('directory', '.whoosh')
        clear = bool(kwargs.pop('clear', config.store_clear))
        self._name = kwargs.pop('name', config.store_name)
        self._redis = kwargs.pop('redis', redis())
        if clear:
            # Drop the on-disk whoosh directory; redis keys are cleared below
            # via reset() once _setup() has recreated the dict wrappers.
            shutil.rmtree(self._dir)
        now = datetime.now()
        self._last_index_time = now
        self._last_modified = now
        self._setup()
        if clear:
            self.reset()

    def _setup(self):
        """(Re)create schema, redis-backed dicts and the Whoosh index.

        Also called from __setstate__, so it must tolerate a missing _redis.
        """
        self._redis = getattr(self, '_redis', None)
        if not self._redis:
            self._redis = redis()  # XXX test cases won't get correctly unpicked because of this
        self.schema = Schema(content=NGRAMWORDS(stored=False))
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        self.schema.add('sha1', ID(stored=True, unique=True))
        for a in list(ATTRS.keys()):
            self.schema.add(a, KEYWORD())
        self.objects = self.xml_dict('objects')
        self.parts = self.json_dict('parts')
        self.storage = FileStorage(os.path.join(self._dir, self._name))
        try:
            self.index = self.storage.open_index(schema=self.schema)
        except BaseException as ex:
            # Any failure to open (missing dir, empty index, schema mismatch)
            # falls back to creating a fresh index and repopulating it.
            log.warn(ex)
            self.storage.create()
            self.index = self.storage.create_index(self.schema)
            self._reindex()

    def __getstate__(self):
        # Only pickle plain metadata; redis handles and whoosh objects are
        # rebuilt by _setup() in __setstate__.
        state = dict()
        for p in ('_dir', '_name', '_last_index_time', '_last_modified'):
            state[p] = getattr(self, p)
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        self._setup()

    def __call__(self, *args, **kwargs):
        """Hook into the pyFF pipeline: schedule periodic reindex jobs."""
        watched = kwargs.pop('watched', None)
        scheduler = kwargs.pop('scheduler', None)
        if watched is not None and scheduler is not None:
            super(RedisWhooshStore, self).__call__(watched=watched, scheduler=scheduler)
            log.debug("indexing using {}".format(scheduler))
            if scheduler is not None:  # and self._last_modified > self._last_index_time and :
                scheduler.add_job(RedisWhooshStore._reindex,
                                  args=[self],
                                  max_instances=1,
                                  coalesce=True,
                                  misfire_grace_time=2 * config.update_frequency)

    def _reindex(self):
        """Rebuild the Whoosh index from the objects dict, dropping orphans.

        A ref is "orphaned" when no collection in parts lists it; such refs
        are removed from both redis dicts before the index is rewritten.
        """
        log.debug("indexing the store...")
        self._last_index_time = datetime.now()
        seen = set()
        refs = {b2u(s) for s in self.objects.keys()}
        parts = self.parts.values()
        for ref in refs:
            for part in parts:
                if ref in part['items']:
                    seen.add(ref)

        ix = self.storage.open_index()
        lock = ix.lock("reindex")
        try:
            log.debug("waiting for index lock")
            lock.acquire(True)
            log.debug("got index lock")
            with ix.writer() as writer:
                for ref in refs:
                    if ref not in seen:
                        log.debug("removing unseen ref {}".format(ref))
                        del self.objects[ref]
                        del self.parts[ref]
                log.debug("updating index")
                for e in self.objects.values():
                    info = self._index_prep(entity_simple_info(e))
                    ref = object_id(e)
                    writer.add_document(object_id=ref, **info)
                # CLEAR: commit replaces the entire segment set, so documents
                # not re-added above disappear from the index.
                writer.mergetype = CLEAR
        finally:
            try:
                log.debug("releasing index lock")
                lock.release()
            except ThreadError:
                # Releasing an unheld lock is harmless here (acquire may have failed).
                pass

    def dump(self):
        """Print every indexed document (debugging helper)."""
        ix = self.storage.open_index()
        from whoosh.query import Every
        with ix.searcher() as searcher:
            # BUG FIX: use the managed searcher instead of opening a second,
            # never-closed one via ix.searcher().
            for result in searcher.search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        """Flatten entity info into a dict of indexable field values.

        Entity attributes are lifted to the top level, a free-text 'content'
        field is assembled, and known attributes are lower-cased.
        """
        res = dict()
        if 'entity_attributes' in info:
            for a, v in list(info.pop('entity_attributes').items()):
                info[a] = v
        content = " ".join(
            filter(lambda x: x is not None, [
                info.get(x, '') for x in ('service_name', 'title', 'domain', 'keywords', 'scopes')
            ]))
        res['content'] = content.strip()
        for a, v in info.items():
            k = a
            if a in ATTRS_INV:
                k = ATTRS_INV[a]
            if k in self.schema.names():
                if type(v) in (list, tuple):
                    res[k] = " ".join([vv.lower() for vv in v])
                elif type(v) in six.string_types:
                    res[k] = info[a].lower()
        res['sha1'] = hash_id(info['entity_id'], prefix=False)
        return res

    def update(self, t, tid=None, etag=None, lazy=True):
        """Store an EntityDescriptor or EntitiesDescriptor tree.

        Entries are only rewritten when the etag changed; with lazy=False the
        index is rebuilt immediately instead of waiting for the scheduler.
        """
        relt = root(t)
        assert (relt is not None)
        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            ref = object_id(relt)
            parts = None
            if ref in self.parts:
                parts = self.parts[ref]
            if etag is not None and (parts is None or parts.get('etag', None) != etag):
                self.parts[ref] = {
                    'id': relt.get('entityID'),
                    'etag': etag,
                    'count': 1,
                    'items': [ref]
                }
                self.objects[ref] = relt
                self._last_modified = datetime.now()
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            if etag is None:
                etag = hex_digest(dumptree(t, pretty_print=False), 'sha256')
            parts = None
            if tid in self.parts:
                parts = self.parts[tid]
            if parts is None or parts.get('etag', None) != etag:
                items = set()
                for e in iter_entities(t):
                    ref = object_id(e)
                    items.add(ref)
                    self.objects[ref] = e
                self.parts[tid] = {
                    'id': tid,
                    'count': len(items),
                    'etag': etag,
                    'items': list(items)
                }
                self._last_modified = datetime.now()
        if not lazy:
            self._reindex()

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def collections(self):
        return [b2u(ref) for ref in self.parts.keys()]

    def reset(self):
        """Delete this store's redis keys (parts and objects)."""
        # BUG FIX: delete each key once via the loop variable; previously the
        # loop ignored ``k`` and deleted both keys on every iteration.
        for k in ('{}_{}'.format(self._name, 'parts'), '{}_{}'.format(self._name, 'objects')):
            self._redis.delete(k)

    def size(self, a=None, v=None):
        """Return object count, count of values for attribute ``a``, or match count for a=v."""
        if a is None:
            return len(self.objects.keys())
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        # Yield the external (URI) names of attributes present in the index.
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield b2u(ATTRS[n])

    def attributes(self):
        return b2u(list(self._attributes()))

    def attribute(self, a):
        """Return all indexed values for attribute URI ``a`` ([] if unknown)."""
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return b2u(list(searcher.lexicon(n)))
        else:
            return []

    def _prep_key(self, key):
        """Translate a pyFF lookup expression into Whoosh query syntax."""
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        key = key.replace('-', ' AND NOT ')
        for uri, a in list(ATTRS_INV.items()):
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        # Raw strings: '\S' is an invalid escape in a plain string literal.
        key = re.sub(r"([^=]+)=(\S+)", r"\1:\2", key)
        key = re.sub(r"{([^}]+)}(\S+)", r"\1:\2", key)
        key = key.strip()
        return key

    def _entities(self):
        # All objects reachable from any collection in parts.
        lst = set()
        for ref_data in self.parts.values():
            for ref in ref_data['items']:
                e = self.objects.get(ref, None)
                if e is not None:
                    lst.add(e)
        return b2u(list(lst))

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def lookup(self, key):
        """Resolve ``key`` to entities: 'entities', an object id, a collection id, or a query."""
        if key == 'entities' or key is None:
            return self._entities()
        bkey = six.b(key)
        if bkey in self.objects:
            return [self.objects.get(bkey)]
        if bkey in self.parts:
            res = []
            part = self.parts.get(bkey)
            for item in part['items']:
                res.extend(self.lookup(item))
            return res
        key = self._prep_key(key)
        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                e = self.objects.get(result['object_id'], None)
                if e is not None:
                    lst.add(e)
        return b2u(list(lst))

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def search(self, query=None, path=None, entity_filter=None, related=None):
        """Full-text search over content/domain; returns discojson dicts."""
        if entity_filter:
            query = "{!s} AND {!s}".format(query, entity_filter)
        query = self._prep_key(query)
        qp = MultifieldParser(['content', 'domain'], schema=self.schema)
        q = qp.parse(query)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            log.debug(results)
            for result in results:
                lst.add(result['object_id'])
        res = list()
        for ref in lst:
            e = self.objects.get(ref, None)
            if e is not None:
                res.append(discojson(e))
        return res
class SearchMigrationTest(TestCase, TempDirMixin):
    """Search index migration testing"""

    def setUp(self):
        self.create_temp()
        search = weblate.trans.search
        self.backup = search.STORAGE
        self.storage = FileStorage(self.tempdir)
        search.STORAGE = self.storage
        self.storage.create()

    def tearDown(self):
        self.remove_temp()
        weblate.trans.search.STORAGE = self.backup

    def do_test(self, source, target):
        # Optionally pre-create indices with a (possibly outdated) schema.
        if source is not None:
            self.storage.create_index(source, 'source')
        if target is not None:
            self.storage.create_index(target, 'target-cs')

        sindex = weblate.trans.search.get_source_index()
        tindex = weblate.trans.search.get_target_index('cs')
        self.assertIsNotNone(sindex)
        self.assertIsNotNone(tindex)

        swriter = sindex.writer()
        swriter.update_document(
            pk=1,
            source="source",
            context="context",
            location="location",
        )
        swriter.commit()

        twriter = tindex.writer()
        twriter.update_document(pk=1, target="target", comment="comment")
        twriter.commit()

        for item in ('source', 'context', 'location', 'target'):
            found = fulltext_search(item, ['cs'], {item: True})
            self.assertEqual(found, {1})

    def test_nonexisting(self):
        self.do_test(None, None)

    def test_nonexisting_dir(self):
        shutil.rmtree(self.tempdir)
        self.tempdir = None
        self.do_test(None, None)

    def test_current(self):
        search = weblate.trans.search
        self.do_test(search.SourceSchema, search.TargetSchema)

    def test_2_4(self):
        self.do_test(
            Schema(checksum=ID(stored=True, unique=True),
                   source=TEXT(), context=TEXT(), location=TEXT()),
            Schema(checksum=ID(stored=True, unique=True),
                   target=TEXT(), comment=TEXT()),
        )

    def test_2_1(self):
        self.do_test(
            Schema(checksum=ID(stored=True, unique=True),
                   source=TEXT(), context=TEXT()),
            Schema(checksum=ID(stored=True, unique=True),
                   target=TEXT()),
        )
class DB:
    """Local cache of a mod database backed by a Whoosh full-text index.

    Mod metadata is kept in a JSON file (``mods.json``) under the user cache
    dir; a Whoosh index over that data lives next to it and is rebuilt when
    the downloaded mod list differs from the cached one.
    """

    def __init__(self, config, api):
        # `config` supplies db.update_period; `api` downloads the mod list.
        self.config = config
        self.api = api
        self.cache_dir = user_cache_dir('fac', appauthor=False)
        self.storage = FileStorage(os.path.join(self.cache_dir, 'index'))
        # field_boost weighting: name matches outrank owner, then title,
        # then summary. sort_name/name_id are for sorting and result lookup.
        self.schema = Schema(
            name=TEXT(sortable=True, phrase=True, field_boost=3, analyzer=intraword),
            owner=TEXT(sortable=True, field_boost=2.5, analyzer=intraword),
            title=TEXT(field_boost=2.0, phrase=False),
            summary=TEXT(phrase=True),
            downloads=NUMERIC(sortable=True),
            sort_name=SortColumn(),
            name_id=ID(stored=True),
        )
        try:
            self.index = self.storage.open_index()
        except EmptyIndexError:
            # No index on disk yet; needs_update() will report True and
            # update() will build one.
            self.index = None
        self.db = JSONFile(os.path.join(self.cache_dir, 'mods.json'))

    def maybe_update(self):
        """Refresh the database only when it is missing or stale."""
        if self.needs_update():
            self.update()

    def needs_update(self):
        """Return True when there is no usable index/data or the cache expired."""
        if not self.index or not self.db.get('mods'):
            return True
        last_update = self.db.mtime
        period = int(self.config.get('db', 'update_period'))
        db_age = time.time() - last_update
        return db_age > period

    def update(self):
        """Download the mod list and rebuild the search index if it changed."""
        with ProgressWidget("Downloading mod database...") as progress:
            mods = self.api.get_mods(progress)
        old_mods = self.db.get('mods', {})
        self.db.mods = {mod.name: mod.data for mod in mods}
        if old_mods != self.db['mods']:
            print("Building search index...")
            # create() wipes/recreates the storage dir before indexing.
            self.index = self.storage.create().create_index(self.schema)
            with self.index.writer() as w:
                for mod in mods:
                    # Text fields are lower-cased so searches are
                    # case-insensitive against this index.
                    w.add_document(
                        name_id=mod.name,
                        name=mod.name,
                        sort_name=mod.name.lower(),
                        title=mod.title.lower(),
                        owner=mod.owner.lower(),
                        summary=mod.summary.lower(),
                        downloads=mod.downloads_count
                    )
            self.db.save()
            print("Updated mods database (%d mods)" % len(mods))
        else:
            print("Index is up to date")
            # Touch the mtime so needs_update() stays False for a period.
            self.db.utime()

    def search(self, query, sortedby=None, limit=None):
        """Yield matching mods as JSONDicts (with a .score attribute).

        `query` may be a raw string (parsed over owner/name/title/summary,
        defaulting to all mods) or a pre-built whoosh Query. `sortedby` is a
        comma-separated field list; a leading '-' reverses that field, and a
        sortable 'sort_<field>' column is preferred when the schema has one.
        """
        parser = qparser.MultifieldParser(
            ['owner', 'name', 'title', 'summary'],
            schema=self.schema
        )
        parser.add_plugin(qparser.FuzzyTermPlugin())

        if not isinstance(query, Query):
            query = parser.parse(query or 'name:*')

        with self.index.searcher() as searcher:
            if sortedby:
                facets = []
                for field in sortedby.split(','):
                    reverse = field.startswith('-')
                    if reverse:
                        field = field[1:]
                    if 'sort_' + field in self.schema:
                        field = 'sort_' + field
                    facets.append(FieldFacet(field, reverse=reverse))
                if len(facets) == 1:
                    sortedby = facets[0]
                else:
                    sortedby = MultiFacet(facets)

            # NOTE(review): this is a generator, so the searcher stays open
            # while the caller iterates; results must be consumed promptly.
            for result in searcher.search(
                    query, limit=limit, sortedby=sortedby):
                d = JSONDict(self.db.mods[result['name_id']])
                d.score = result.score
                yield d

    @property
    def mods(self):
        # Raw name -> data mapping from the JSON cache.
        return self.db.mods
class SearchMigrationTest(TestCase):
    """Search index migration testing"""

    def setUp(self):
        self.path = tempfile.mkdtemp()
        self.backup = weblate.trans.search.STORAGE
        self.storage = FileStorage(self.path)
        weblate.trans.search.STORAGE = self.storage
        self.storage.create()

    def tearDown(self):
        if os.path.exists(self.path):
            shutil.rmtree(self.path)
        weblate.trans.search.STORAGE = self.backup

    def do_test(self, source, target):
        # Optionally seed the storage with indices using older schemas.
        if source is not None:
            self.storage.create_index(source, 'source')
        if target is not None:
            self.storage.create_index(target, 'target-cs')
        # Opening must succeed regardless of which schema was on disk.
        self.assertIsNotNone(weblate.trans.search.get_source_index())
        self.assertIsNotNone(weblate.trans.search.get_target_index('cs'))

    def test_nonexisting(self):
        self.do_test(None, None)

    def test_nonexisting_dir(self):
        shutil.rmtree(self.path)
        self.do_test(None, None)

    def test_current(self):
        self.do_test(
            weblate.trans.search.SourceSchema,
            weblate.trans.search.TargetSchema,
        )

    def test_2_4(self):
        self.do_test(
            Schema(checksum=ID(stored=True, unique=True),
                   source=TEXT(), context=TEXT(), location=TEXT()),
            Schema(checksum=ID(stored=True, unique=True),
                   target=TEXT(), comment=TEXT()),
        )

    def test_2_1(self):
        self.do_test(
            Schema(checksum=ID(stored=True, unique=True),
                   source=TEXT(), context=TEXT()),
            Schema(checksum=ID(stored=True, unique=True),
                   target=TEXT()),
        )