def test_normalize_compound():
    """Normalizing a deeply nested Or tree collapses it to its unique leaves."""
    def leaf_query():
        return Or([Term("a", u("a")), Term("a", u("b"))])

    def nested(depth):
        if depth == 0:
            return leaf_query()
        return Or([nested(depth - 1) for _ in range(3)])

    normalized = nested(7).normalize()
    assert_equal(normalized, Or([Term("a", u("a")), Term("a", u("b"))]))
def test_span_or():
    """SpanOr matches documents satisfying either of its subqueries."""
    ix = get_index()
    with ix.searcher() as s:
        near = spans.SpanNear(Term("text", "alfa"), Term("text", "charlie"),
                              slop=2)
        term = Term("text", "bravo")
        matcher = spans.SpanOr([near, term]).matcher(s)
        while matcher.is_active():
            words = s.stored_fields(matcher.id())["text"]
            assert ("alfa" in words and "charlie" in words) or "bravo" in words
            matcher.next()
def get_subitem_revs(self):
    """
    Create a list of subitems of this item.

    Subitems are in the form of storage Revisions.
    """
    base_query = And([Term(WIKINAME, app.cfg.interwikiname),
                      Term(NAMESPACE, self.fqname.namespace)])
    # trick: an item of empty name can be considered as "virtual root item"
    # that has all wiki items as sub items
    if self.names:
        prefix_queries = [Prefix(NAME_EXACT, p) for p in self.subitem_prefixes]
        base_query = And([base_query, Or(prefix_queries)])
    return flaskg.storage.search(base_query, sortedby=NAME_EXACT, limit=None)
def search():
    """Search posts, optionally filtered by author and/or category.

    Query string parameters:
        search: free-text query over title/post_content (fuzzy matching).
        author: exact author name filter.
        category: exact category filter.
        page: 1-based result page number (default 1).

    Returns a JSON Response with the matching posts, the page number and the
    total result count, sorted by date descending, 25 results per page.
    """
    print(request.args)
    search = request.args.get('search')
    author = request.args.get('author')
    category = request.args.get('category')
    # PEP 8: use "is not None" rather than "not ... is None"
    page = int(request.args.get('page')) if request.args.get('page') is not None else 1
    print(search)
    if search is None and author is None and category is None:
        # No criteria at all: match every document.
        myquery = Every()
    else:
        if search is None:
            if author is not None:
                myquery = Term('author', author)
                if category is not None:
                    myquery = myquery & Term('category', category)
            else:
                myquery = Term('category', category)
        else:
            myquery = MultifieldParser(["title", "post_content"], ix.schema,
                                       plugins=[FuzzyTermPlugin()]).parse(search)
            if author is not None:
                myquery = myquery & Term('author', author)
            if category is not None:
                myquery = myquery & Term('category', category)
    with ix.searcher() as searcher:
        results = searcher.search_page(myquery, page, pagelen=25,
                                       sortedby="date", reverse=True)
        print(results.is_last_page())
        results_json = json.dumps(
            {"results": [dict(i) for i in results],
             "page": page,
             "total_results": results.total},
            default=str)
    resp = Response(response=results_json, status=200,
                    mimetype="application/json")
    return resp
def get_subscribers(**meta):
    """
    Get all users that are subscribed to the item

    :param meta: key/value pairs from item metadata - itemid, name, namespace, tags keys
    :return: a set of Subscriber objects
    """
    itemid = meta.get(ITEMID)
    name = meta.get(NAME)
    namespace = meta.get(NAMESPACE)
    fqname = CompositeName(namespace, ITEMID, itemid)
    tags = meta.get(TAGS)
    # Collect exact subscription terms ("ITEMID:x", "NAME:ns:n", "TAGS:ns:t")
    terms = []
    if itemid is not None:
        terms.extend(
            [Term(SUBSCRIPTION_IDS, "{0}:{1}".format(ITEMID, itemid))])
    if namespace is not None:
        if name is not None:
            terms.extend(
                Term(SUBSCRIPTION_IDS, "{0}:{1}:{2}".format(
                    NAME, namespace, name_)) for name_ in name)
        if tags is not None:
            terms.extend(
                Term(SUBSCRIPTION_IDS, "{0}:{1}:{2}".format(
                    TAGS, namespace, tag)) for tag in tags)
    query = Or(terms)
    with flaskg.storage.indexer.ix[LATEST_REVS].searcher() as searcher:
        # Users matching exact subscription ids
        result_iterators = [
            searcher.search(query, limit=None),
        ]
        subscription_patterns = searcher.lexicon(SUBSCRIPTION_PATTERNS)
        # looks like whoosh gives us bytes (not str), decode them:
        subscription_patterns = [
            p if isinstance(p, str) else p.decode()
            for p in subscription_patterns
        ]
        # Add users whose subscription patterns (regex/prefix) match the meta
        patterns = get_matched_subscription_patterns(subscription_patterns,
                                                     **meta)
        result_iterators.extend(
            searcher.documents(subscription_patterns=pattern)
            for pattern in patterns)
        subscribers = set()
        for user in chain.from_iterable(result_iterators):
            email = user.get(EMAIL)
            # only users with an email address and read permission qualify
            if email:
                from moin.user import User
                u = User(uid=user.get(ITEMID))
                if u.may.read(fqname):
                    locale = user.get(LOCALE, DEFAULT_LOCALE)
                    subscribers.add(
                        Subscriber(user[ITEMID], user[NAME][0], email, locale))
    return subscribers
def add_other_versions(searcher, results, user, staff):
    """Annotate each result with the other indexed versions of the same addon.

    Staff users see every version; other users only see versions that are
    public, shared with them directly, or shared with one of their groups.
    """
    access_clauses = []
    if not staff:
        grants = [Term('public', 't'), Term('users', user.username.lower())]
        grants += [Term('groups', group.name.lower())
                   for group in user.groups.all()]
        access_clauses = [Or(grants)]
    for result in results:
        vendor_q = Term('vendor_name',
                        '%s/%s' % (result['vendor'], result['name']))
        hits = searcher.search(And([vendor_q] + access_clauses))
        versions = [hit.fields()['version'] for hit in hits]
        result['others'] = [v for v in versions if v != result['version']]
    return results
def do_show(self, revid):
    """
    Show a blog item and a list of its blog entries below it.

    If tag GET-parameter is defined, the list of blog entries consists
    only of those entries that contain the tag value in their lists of tags.
    """
    # for now it is just one tag=value, later it could be tag=value1&tag=value2&...
    tag = request.values.get('tag')
    prefix = self.name + u'/'
    terms = [
        Term(WIKINAME, app.cfg.interwikiname),
        # Only blog entry itemtypes
        Term(ITEMTYPE, ITEMTYPE_BLOG_ENTRY),
        # Only sub items of this item
        Prefix(NAME_EXACT, prefix),
    ]
    if tag:
        terms.append(Term(TAGS, tag))
    query = And(terms)

    def ptime_sort_key(searcher, docnum):
        """
        Compute the publication time key for blog entries sorting.

        If PTIME is not defined, we use MTIME.
        """
        fields = searcher.stored_fields(docnum)
        return fields.get(PTIME, fields[MTIME])

    # NOTE: removed unused local `current_timestamp = int(time.time())`
    ptime_sort_facet = FunctionFacet(ptime_sort_key)
    revs = flaskg.storage.search(query, sortedby=ptime_sort_facet,
                                 reverse=True, limit=None)
    blog_entry_items = [Item.create(rev.name, rev_id=rev.revid)
                        for rev in revs]
    return render_template(
        'blog/main.html',
        item_name=self.name,
        fqname=split_fqname(self.name),
        blog_item=self,
        blog_entry_items=blog_entry_items,
        tag=tag,
        item=self,
    )
def get_item_last_revisions(app, fqname):
    """
    Get 2 or less most recent item revisions from the index

    :param app: local proxy app
    :param fqname: the fqname of the item
    :return: a list of revisions
    """
    # TODO: Implement AccessDenied or similar error in case the user does not have access to item
    # and to also to handle the case where the item has no revisions
    query = And([Term(WIKINAME, app.cfg.interwikiname),
                 Term(fqname.field, fqname.value)])
    hits = flaskg.storage.search(query, idx_name=ALL_REVS,
                                 sortedby=[MTIME], reverse=True, limit=2)
    return list(hits)
def has_word(self, character_set, key):
    """Return True if *key* is indexed for any requested character set."""
    assert character_set & TRADITIONAL or character_set & SIMPLIFIED
    with self._index.searcher() as searcher:
        # Whoosh docs suggest using the 'in' operator on the searcher for
        # key lookups, but that did not work here, so build an explicit
        # OR query instead.
        query = NullQuery()
        if character_set & TRADITIONAL:
            query |= Term("traditional", key)
        if character_set & SIMPLIFIED:
            query |= Term('simplified', key)
        return len(searcher.search(query)) > 0
def search_addresses(searcher, query):
    """Search the address fields for *query*, excluding drafts and trash."""
    mask = Term("tag", "drafts") | Term("tag", "trash")
    matched = []
    for field in ['to', 'cc', 'bcc', 'sender']:
        parser = QueryParser(field, searcher.schema)
        parsed = parser.parse("*%s* OR *%s*" % (query.title(), query))
        hits = searcher.search(
            parsed, limit=None, mask=mask,
            groupedby=sorting.FieldFacet(field, allow_overlap=True),
            terms=True)
        matched.append(hits.matched_terms())
    return [term[1] for term in flatten(matched)]
def item_acl_report():
    """
    Return a sorted list of all items in the wiki along with the ACL Meta-data.

    Item names are prefixed with the namespace, if there is a non-default
    namespace. If there are multiple names, the first name is used for sorting.
    """
    query = And([
        Term(WIKINAME, app.cfg.interwikiname),
        Not(Term(NAMESPACE, NAMESPACE_USERPROFILES)),
    ])
    all_metas = flaskg.storage.search_meta(query, idx_name=LATEST_REVS,
                                           sortedby=[NAMESPACE, NAME],
                                           limit=None)
    items_acls = []
    for meta in all_metas:
        item_namespace = meta.get(NAMESPACE)
        item_id = meta.get(ITEMID)
        if item_namespace:
            item_name = [item_namespace + '/' + name
                         for name in meta.get(NAME)]
        else:
            item_name = meta.get(NAME)
        item_acl = meta.get(ACL)
        acl_default = item_acl is None
        if acl_default:
            # no explicit ACL on the item: fall back to the namespace default
            for namespace, acl_config in app.cfg.acl_mapping:
                if item_namespace == namespace:
                    item_acl = acl_config['default']
        # NOTE: removed unused local `fqname = fqnames[0]`
        fqnames = gen_fqnames(meta)
        items_acls.append({
            'name': item_name,
            'name_old': meta.get('name_old', []),
            'itemid': item_id,
            'fqnames': fqnames,
            'fqname': fqnames[0],
            'acl': item_acl,
            'acl_default': acl_default,
        })
    # deleted items have no names; this sort places deleted items on top of the report;
    # the display name may be similar to: "9cf939f ~(DeletedItemName)"
    items_acls = sorted(items_acls, key=lambda k: (k['name'], k['name_old']))
    return render_template('admin/item_acl_report.html',
                           title_name=_('Item ACL Report'),
                           number_items=len(items_acls),
                           items_acls=items_acls)
def filter_queryset(self, request, queryset, view):
    """Filter *queryset* by an optional full-text search.

    Supports two query parameters:
      - parent='' restricts to records with a null parent;
      - q=<whoosh query> runs a full-text search via the Whoosh backend and
        restricts the queryset to the matching primary keys, further limited
        to records the requesting user may access when the index supports it.

    :raises NotImplementedError: when the haystack backend is not Whoosh.
    """
    if ('parent' in request.query_params
            and request.query_params['parent'] == ''):
        # Empty string means query for null parent
        queryset = queryset.filter(parent=None)
    if 'q' not in request.query_params:
        return queryset
    queryset_pks = list(queryset.values_list('pk', flat=True))
    if not len(queryset_pks):
        return queryset
    # 'q' means do a full-text search of the document fields, where the
    # critera are given in the Whoosh query language:
    # https://pythonhosted.org/Whoosh/querylang.html
    search_queryset = SearchQuerySet().models(queryset.model)
    search_backend = search_queryset.query.backend
    if not isinstance(search_backend, WhooshSearchBackend):
        raise NotImplementedError(
            'Only the Whoosh search engine is supported at this time')
    if not search_backend.setup_complete:
        search_backend.setup()
    # Parse the user's query
    user_query = QueryParser('text', search_backend.index.schema).parse(
        request.query_params['q'])
    # Construct a query to restrict the search to the appropriate model
    filter_query = Term(DJANGO_CT, get_model_ct(queryset.model))
    # Does the search index for this model have a field that allows
    # filtering by permissions?
    haystack_index = haystack.connections[
        'default'].get_unified_index().get_index(queryset.model)
    if hasattr(haystack_index, 'users_granted_permission'):
        # Also restrict the search to records that the user can access
        filter_query &= Term(
            'users_granted_permission', request.user.username)
    # Use the searcher as a context manager so it is closed when done
    # (the original leaked the searcher and its file handles).
    with search_backend.index.searcher() as searcher:
        results = searcher.search(
            user_query,
            filter=filter_query,
            scored=False,
            sortedby=None,
            limit=None
        )
        pk_type = type(queryset_pks[0])
        results_pks = {
            # Coerce each `django_id` from unicode to the appropriate type,
            # usually `int`
            pk_type((x['django_id'])) for x in results
        }
    filter_pks = results_pks.intersection(queryset_pks)
    return queryset.filter(pk__in=filter_pks)
def itemsize():
    """display a table with item sizes"""
    headings = [_('Size'), _('Item name')]
    query = And([Term(WIKINAME, app.cfg.interwikiname),
                 Not(Term(NAMESPACE, NAMESPACE_USERPROFILES)),
                 Not(Term(TRASH, True))])
    metas = flaskg.storage.search_meta(query, idx_name=LATEST_REVS,
                                       sortedby=[NAME], limit=None)
    # largest items first
    rows = sorted(
        ((meta[SIZE],
          CompositeName(meta[NAMESPACE], NAME_EXACT, meta[NAME][0]))
         for meta in metas),
        reverse=True)
    return render_template('user/itemsize.html',
                           title_name=_("Item Sizes"),
                           headings=headings,
                           rows=rows)
def test_or_nots3():
    """A term OR-ed with its own negation matches every document."""
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           itemtype=fields.ID(stored=True))
    with TempIndex(schema, "ornot") as ix:
        writer = ix.writer()
        for title, itemtype in (("a1", "a"), ("a2", "a"), ("b1", "b")):
            writer.add_document(title=u(title), itemtype=u(itemtype))
        writer.commit()
        q = Term('itemtype', 'a') | Not(Term('itemtype', 'a'))
        with ix.searcher() as s:
            titles = " ".join(hit["title"] for hit in s.search(q))
            assert titles == "a1 a2 b1"
def build_filter_terms(field_name, *, include=None, exclude=None):
    """
    Build Whoosh query terms that may be used to filter a search.

    :param list include: List of values to allow in the search results.
        If `None`, no inclusion term gets produced.
    :param list exclude: List of values to deny from the search results.
        If `None`, no exclusion term gets produced.
    """
    terms = []
    if include:
        terms.append(Or([Term(field_name, value) for value in include]))
    if exclude:
        terms += [Not(Term(field_name, value)) for value in exclude]
    return terms
def user_acl_report(uid):
    """Render a per-item ACL capability report for the user *uid*.

    For every item in the wiki (excluding user profiles), list whether the
    user may read/write/create/admin/destroy it.
    """
    query = And([
        Term(WIKINAME, app.cfg.interwikiname),
        Not(Term(NAMESPACE, NAMESPACE_USERPROFILES))
    ])
    all_items = flaskg.storage.search_meta(query, idx_name=LATEST_REVS,
                                           sortedby=[NAMESPACE, NAME],
                                           limit=None)
    theuser = user.User(uid=uid)
    itemwise_acl = []
    # cache of the previous item's ACL-determining parts and its computed
    # permissions: consecutive items sharing (namespace, parentnames, acl)
    # reuse the result instead of re-evaluating the five checks
    last_item_acl_parts = (None, None, None)
    last_item_result = {
        'read': False,
        'write': False,
        'create': False,
        'admin': False,
        'destroy': False
    }
    for item in all_items:
        if item.meta.get(NAME):
            fqname = CompositeName(item.meta.get(NAMESPACE), NAME_EXACT,
                                   item.meta.get(NAME)[0])
        else:
            # nameless (e.g. deleted) items are addressed by their itemid
            fqname = CompositeName(item.meta.get(NAMESPACE), ITEMID,
                                   item.meta.get(ITEMID))
        this_rev_acl_parts = (item.meta[NAMESPACE],
                              item.meta.get(PARENTNAMES),
                              item.meta.get(ACL))
        name_parts = {
            'name': item.meta.get(NAME),
            'namespace': item.meta.get(NAMESPACE),
            'itemid': item.meta.get(ITEMID),
            'fqname': fqname
        }
        if not last_item_acl_parts == this_rev_acl_parts:
            last_item_acl_parts = this_rev_acl_parts
            last_item_result = {
                'read': theuser.may.read(fqname),
                'write': theuser.may.write(fqname),
                'create': theuser.may.create(fqname),
                'admin': theuser.may.admin(fqname),
                'destroy': theuser.may.destroy(fqname)
            }
        itemwise_acl.append({**name_parts, **last_item_result})
    return render_template('admin/user_acl_report.html',
                           title_name=_('User ACL Report'),
                           user_names=theuser.name,
                           itemwise_acl=itemwise_acl)
def test_span_term():
    """Term matchers report correct spans and visit exactly the right docs."""
    ix = get_index()
    with ix.searcher() as s:
        stored_lists = [doc["text"] for doc in s.all_stored_fields()]
        for word in domain:
            matcher = Term("text", word).matcher(s)
            seen = set()
            while matcher.is_active():
                docnum = matcher.id()
                span_list = matcher.spans()
                seen.add(docnum)
                words = list(s.stored_fields(docnum)["text"])
                assert word in words
                if word != "bravo":
                    assert len(span_list) == 1
                    pos = words.index(word)
                    assert pos == span_list[0].start
                    assert pos == span_list[0].end
                matcher.next()
            for docnum, words in enumerate(stored_lists):
                if word in words:
                    assert docnum in seen
                else:
                    assert docnum not in seen
def search_text(self, groupname: str, field: str, text: str, op: str,
                sortby: str = "path",
                scoring: str = "unscored") -> Iterable[SearchResult]:
    """Search *field* for the words of *text*, combined with *op*.

    :param op: "and" or "or" -- how the parsed terms are combined.
    :param sortby: "path" (sort by stored path) or "score".
    :param scoring: "unscored" or "bm25f".
    :raises ValueError: if *op* is not "and" or "or".
    :return: iterator of (Path, score) pairs.
    """
    assert sortby in ("path", "score")
    assert scoring in ("unscored", "bm25f")
    scored = scoring != "unscored"
    # sorting by score is expressed to whoosh as sortedby=None
    sortedby = {"score": None}.get(sortby, sortby)
    limit = None
    qp = QueryParser(field, self.invindex.ix.schema)
    q = qp.parse(text)
    terms = [Term(fieldname, value)
             for fieldname, value in q.iter_all_terms()]
    if op == "and":
        query = And(terms)
    elif op == "or":
        query = Or(terms)
    else:
        # previously an unknown op fell through and raised a confusing
        # NameError on the unbound `query`; fail fast instead
        raise ValueError("op must be 'and' or 'or', got %r" % (op,))
    with self.searcher() as searcher:
        for hit in searcher.search(query, limit=limit, scored=scored,
                                   sortedby=sortedby):
            yield Path(
                hit["path"]), hit.score  # hit.pos, hit.rank, hit.docnum
def more_like(pk, source, top=5):
    """Find similar units."""
    index = get_source_index()
    with index.searcher() as searcher:
        # Key terms extracted from the source text drive the similarity query
        key_terms = searcher.key_terms_from_text('source', source,
                                                 numterms=10, normalize=False)
        query = Or([Term('source', word, boost=weight)
                    for word, weight in key_terms])
        # Grab fulltext results
        hits = [(hit['pk'], hit.score)
                for hit in searcher.search(query, limit=top)]
        if not hits:
            return [], {}
        # Normalize scores to 0-100
        best = max(score for _, score in hits)
        scores = {unit_pk: score * 100 / best for unit_pk, score in hits}
        # Filter results with score above 30 and not current unit
        similar = [unit_pk for unit_pk, _ in hits
                   if scores[unit_pk] > 30 and unit_pk != pk]
        return similar, scores
def more_like(self, pk, source, top=5):
    """Find similar units."""
    index = self.get_source_index()
    with index.searcher() as searcher:
        # Extract key terms from the source text
        key_terms = searcher.key_terms_from_text('source', source,
                                                 numterms=10, normalize=False)
        # Create an Or query from the key terms
        query = Or([Term('source', word, boost=weight)
                    for word, weight in key_terms])
        LOGGER.debug('more like query: %r', query)
        # Grab fulltext results
        hits = [(hit['pk'], hit.score)
                for hit in searcher.search(query, limit=top)]
        LOGGER.debug('found %d matches', len(hits))
        if not hits:
            return []
        # Drop results scoring below half of the best score
        threshold = max(score for _, score in hits) / 2
        kept = [unit_pk for unit_pk, score in hits if score > threshold]
        LOGGER.debug('filter %d matches over threshold %d',
                     len(kept), threshold)
        return kept
def search(
    self,
    query: str,
    page: int,
    pagesize: int,
    include_private: bool = True,
    extend: bool = False,
):
    """Search the index.

    If `include_private` is true, include also private objects and
    search in private fields.
    """
    if include_private:
        parser = self.query_parser_all
    else:
        parser = self.query_parser_public
    parser.add_plugin(DateParserPlugin())
    # if private objects should not be shown, add a mask
    mask = Term("private", True) if not include_private else None
    parsed = parser.parse(query)
    with self.index().searcher() as searcher:
        results = searcher.search_page(parsed, page, pagesize, mask=mask)
        hits = [self.format_hit(hit) for hit in results]
        return results.total, hits
def _trashed(namespace):
    """Return trashedEntry tuples for all trashed items in *namespace*
    (or in every namespace when NAMESPACE_ALL is given)."""
    trashedEntry = namedtuple('trashedEntry',
                              'fqname oldname revid mtime comment editor')
    query = And([Term(WIKINAME, app.cfg.interwikiname), Term(TRASH, True)])
    if namespace != NAMESPACE_ALL:
        query = And([query, Term(NAMESPACE, namespace)])
    entries = []
    for meta in flaskg.storage.search_meta(query, limit=None):
        fqname = CompositeName(meta[NAMESPACE], ITEMID, meta[ITEMID])
        entries.append(
            trashedEntry(fqname, meta[NAME_OLD], meta[REVID], meta[MTIME],
                         meta[COMMENT], get_editor_info(meta)))
    return entries
def search(self, w):
    """Look up *w* in the card index.

    Resolution order: card alias -> exact/prefix matches from the trie ->
    fuzzy/stemmed/tokenized whoosh queries.

    :return: a SearchResult (exact, prefix_whole_word, other_prefixed, fuzzy)
    """
    if not self.ix.up_to_date():
        self.initialize_trie()  # if the index is not up to date, someone has added cards, so we reinitialize the trie
    # If we searched for an alias, make it the exact hit
    for alias, name in fetcher.card_aliases():
        if w == card.canonicalize(alias):
            return SearchResult(name, None, None, None)
    # take the first token the normalizing analyzer produces for the input
    normalized = list(WhooshConstants.normalized_analyzer(w))[0].text
    # If we get matches by prefix, we return that
    exact, prefix_whole_word, other_prefixed = self.find_matches_by_prefix(
        normalized)
    if exact or len(prefix_whole_word) > 0 or len(other_prefixed) > 0:
        return SearchResult(exact, prefix_whole_word, other_prefixed, None)
    # We try fuzzy and stemmed queries
    query_normalized = fuzzy_term(normalized, self.DIST, "name_normalized")
    query_stemmed = And([
        Term('name_stemmed', q.text)
        for q in WhooshConstants.stem_analyzer(w)
    ])
    query_tokenized = And([
        fuzzy_term(q.text, self.DIST, "name_tokenized")
        for q in WhooshConstants.tokenized_analyzer(w)
    ])
    query = Or([query_normalized, query_tokenized, query_stemmed])
    with self.ix.searcher() as searcher:
        fuzzy = [(r['name'], r.score)
                 for r in searcher.search(query, limit=40)]
        return SearchResult(exact, prefix_whole_word, other_prefixed, fuzzy)
def matcher(self, searcher, context=None):
    """Return a matcher for this phrase query.

    Builds one Term query per phrase word and delegates to an ordered
    SpanNear2 query. Returns a NullMatcher when the field is missing or
    any word is absent; raises QueryError when the field stores no
    positions.
    """
    from whoosh.query import Term, SpanNear2
    fieldname = self.fieldname
    if fieldname not in searcher.schema:
        return matching.NullMatcher()
    field = searcher.schema[fieldname]
    # phrase matching requires term positions in the posting format
    if not field.format or not field.format.supports("positions"):
        raise qcore.QueryError("Phrase search: %r field has no positions"
                               % self.fieldname)
    terms = []
    # Build a list of Term queries from the words in the phrase
    reader = searcher.reader()
    for word in self.words:
        word = field.to_bytes(word)
        if (fieldname, word) not in reader:
            # Shortcut the query if one of the words doesn't exist.
            return matching.NullMatcher()
        terms.append(Term(fieldname, word))
    # Create the equivalent SpanNear2 query from the terms
    q = SpanNear2(terms, slop=self.slop, ordered=True, mindist=1)
    # Get the matcher
    m = q.matcher(searcher, context)
    if self.boost != 1.0:
        m = matching.WrappingMatcher(m, boost=self.boost)
    return m
def get_query(line, ix):
    """Build an Or query over all terms parsed from the post field of *line*.

    :param line: a tab-separated record whose first field is a
        gb18030-encoded post
    :param ix: whoosh index whose schema is used for parsing
    :return: an Or query over the parsed terms
    """
    lines = line.strip().split('\t')
    post = unicode(lines[0], 'gb18030')
    q2 = QueryParser("post", ix.schema).parse(post)
    terms = list(q2.all_terms())
    query = Or([Term(*x) for x in terms])
    return query
    # NOTE: removed the unreachable dead code that followed this return --
    # it referenced an undefined `context` variable and could never run.
def test_simplify():
    """simplify() expands a Prefix into an Or of the matching index terms."""
    schema = fields.Schema(k=fields.ID, v=fields.TEXT)
    ix = RamStorage().create_index(schema)
    writer = ix.writer()
    writer.add_document(k=u("1"), v=u("aardvark apple allan alfa bear bee"))
    writer.add_document(k=u("2"), v=u("brie glue geewhiz goop julia"))
    writer.commit()
    reader = ix.reader()
    original = And([Prefix("v", "b", boost=2.0), Term("v", "juliet")])
    expanded = And([Or([Term('v', u('bear'), boost=2.0),
                        Term('v', u('bee'), boost=2.0),
                        Term('v', u('brie'), boost=2.0)]),
                    Term('v', 'juliet')])
    assert_equal(original.simplify(reader), expanded)
def test_excludematcher():
    """Spans stay correct when deleted documents are excluded from matching."""
    schema = fields.Schema(content=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)
    domain = ("alfa", "bravo", "charlie", "delta")
    for _ in xrange(3):
        writer = ix.writer()
        for perm in permutations(domain):
            writer.add_document(content=u(" ").join(perm))
        writer.commit(merge=False)
    writer = ix.writer()
    for docnum in (5, 10, 28):
        writer.delete_document(docnum)
    writer.commit(merge=False)
    q = Term("content", "bravo")
    with ix.searcher() as s:
        matcher = q.matcher(s)
        while matcher.is_active():
            words = s.stored_fields(matcher.id())["content"].split()
            for span in matcher.spans():
                assert words[span.start] == "bravo"
            matcher.next()
def test_span_near2():
    """Nested SpanNear queries combine into the expected overall span."""
    ana = analysis.SimpleAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, stored=True))
    ix = RamStorage().create_index(schema)
    writer = ix.writer()
    writer.add_document(text=u("The Lucene library is by Doug Cutting and Whoosh " +
                               "was made by Matt Chaput"))
    writer.commit()
    inner = spans.SpanNear(Term("text", "lucene"), Term("text", "doug"),
                           slop=5)
    outer = spans.SpanNear(inner, Term("text", "whoosh"), slop=4)
    with ix.searcher() as s:
        assert outer.matcher(s).spans() == [spans.Span(1, 8)]
def get_query(line, ix):
    """Build an Or query over all terms parsed from the first tab field."""
    fields_ = line.strip().split('\t')
    post = fields_[0].decode('utf-8')
    parsed = QueryParser("post", ix.schema).parse(post)
    return Or([Term(*t) for t in list(parsed.all_terms())])
def test_near_unordered():
    """Unordered SpanNear matches adjacent terms in either order."""
    schema = fields.Schema(text=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)
    writer = ix.writer()
    for text in ("alfa bravo charlie delta echo",
                 "alfa bravo delta echo charlie",
                 "alfa charlie bravo delta echo",
                 "echo delta alfa foxtrot"):
        writer.add_document(text=u(text))
    writer.commit()
    with ix.searcher() as s:
        q = spans.SpanNear(Term("text", "bravo"), Term("text", "charlie"),
                           ordered=False)
        found = sorted(hit["text"] for hit in s.search(q))
        assert found == [u('alfa bravo charlie delta echo'),
                         u('alfa charlie bravo delta echo')]