def search(self, querytext, request, pagenum=1, maxresults=30):
    user_q = querytext and self.parser.parse(querytext) or Every()
    restricted_q = And([user_q, self.restrict_query(request)])
    result = {}
    if pagenum < 1:
        pagenum = 1
    with self.searcher() as searcher:
        hits = searcher.search(restricted_q, limit=(pagenum * maxresults) + 1)
        if querytext and hits.is_empty():
            corrected = searcher.correct_query(user_q, querytext)
            if corrected.query != user_q:
                querytext = corrected.string
                result['corrected_q'] = querytext
                restricted_q = And([corrected.query, self.restrict_query(request)])
                hits = searcher.search(restricted_q, limit=(pagenum * maxresults))
        self.prepare_search_response(result, hits, pagenum, maxresults)
    return result
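# The spelling-correction hook used above is whoosh's Searcher.correct_query.
# A minimal self-contained sketch of the same pattern (in-memory index and
# made-up documents, not part of the original source):
from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

ix = RamStorage().create_index(Schema(body=TEXT(spelling=True)))
with ix.writer() as w:
    w.add_document(body="alpha bravo charlie")

qtext = "bravco"  # misspelled on purpose
q = QueryParser("body", ix.schema).parse(qtext)
with ix.searcher() as s:
    corrected = s.correct_query(q, qtext)
    if corrected.query != q:
        q, qtext = corrected.query, corrected.string  # "did you mean?" text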
def validate_name(meta, itemid):
    """
    Check whether the names are valid.

    Just returns if the names are valid; raises a NameNotValidError if not.
    """
    names = meta.get(NAME)
    current_namespace = meta.get(NAMESPACE)
    if current_namespace is None:
        raise NameNotValidError(L_("No namespace field in the meta."))
    namespaces = [namespace.rstrip('/') for namespace, _ in app.cfg.namespace_mapping]
    if len(names) != len(set(names)):
        raise NameNotValidError(L_("The names in the name list must be unique."))
    # Item names must not start with '@' or '+': '@something' denotes a field,
    # whereas '+something' denotes a view.
    invalid_names = [name for name in names if name.startswith(('@', '+'))]
    if invalid_names:
        raise NameNotValidError(L_("Item names (%(invalid_names)s) must not start with '@' or '+'",
                                   invalid_names=", ".join(invalid_names)))
    # Also don't allow item names to match identifier namespaces.
    namespaces = namespaces + NAMESPACES_IDENTIFIER
    # Item names must not match existing namespaces.
    invalid_names = [name for name in names if name.split('/', 1)[0] in namespaces]
    if invalid_names:
        raise NameNotValidError(L_("Item names (%(invalid_names)s) must not match with existing namespaces.",
                                   invalid_names=", ".join(invalid_names)))
    query = And([Or([Term(NAME, name) for name in names]), Term(NAMESPACE, current_namespace)])
    # No existing item may have the same name; search for items other than the current one.
    if itemid is not None:
        query = And([query, Not(Term(ITEMID, itemid))])
    with flaskg.storage.indexer.ix[LATEST_REVS].searcher() as searcher:
        results = searcher.search(query)
        duplicate_names = {name for result in results for name in result[NAME] if name in names}
        if duplicate_names:
            raise NameNotValidError(L_("Item(s) named %(duplicate_names)s already exist.",
                                       duplicate_names=", ".join(duplicate_names)))
def test_apply():
    def visit(q):
        if isinstance(q, (Term, Variations, FuzzyTerm)):
            q.text = q.text.upper()
            return q
        return q.apply(visit)

    before = And([Not(Term("a", u("b"))), Variations("a", u("c")),
                  Not(FuzzyTerm("a", u("d")))])
    after = visit(before)
    assert_equal(after, And([Not(Term("a", u("B"))), Variations("a", u("C")),
                             Not(FuzzyTerm("a", u("D")))]))

    def term2var(q):
        if isinstance(q, Term):
            return Variations(q.fieldname, q.text)
        else:
            return q.apply(term2var)

    q = And([Term("f", "alfa"),
             Or([Term("f", "bravo"), Not(Term("f", "charlie"))])])
    q = term2var(q)
    assert_equal(q, And([Variations('f', 'alfa'),
                         Or([Variations('f', 'bravo'),
                             Not(Variations('f', 'charlie'))])]))
def search(self, w: str) -> SearchResult:
    if not self.ix.up_to_date():
        # If the index is not up to date, someone has added cards, so we reinitialize the trie.
        self.initialize_trie()
    normalized = list(WhooshConstants.normalized_analyzer(w))[0].text
    # If we get matches by prefix, we return that.
    exact, prefix_whole_word, other_prefixed = self.find_matches_by_prefix(normalized)
    if exact or len(prefix_whole_word) > 0 or len(other_prefixed) > 0:
        return SearchResult(exact, prefix_whole_word, other_prefixed, [])
    # We try fuzzy and stemmed queries.
    query_normalized = fuzzy_term(normalized, self.DIST, 'name_normalized')
    query_stemmed = And([Term('name_stemmed', q.text)
                         for q in WhooshConstants.stem_analyzer(w)])
    query_tokenized = And([fuzzy_term(q.text, self.DIST, 'name_tokenized')
                           for q in WhooshConstants.tokenized_analyzer(w)])
    if len(query_tokenized) == 0:
        # This can be empty because some unicode chars are ignored. See #4988.
        query = Or([query_normalized, query_stemmed])
    else:
        query = Or([query_normalized, query_tokenized, query_stemmed])
    with self.ix.searcher() as searcher:
        fuzzy = [(r['canonical_name'], r.score)
                 for r in searcher.search(query, limit=40)]
    return SearchResult(exact, prefix_whole_word, other_prefixed, fuzzy)
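# `fuzzy_term` is defined elsewhere in this codebase. A plausible
# reconstruction (hypothetical, not the project's actual helper) built on
# whoosh's FuzzyTerm, falling back to an exact Term for very short words
# where edit-distance matching would be too permissive:
from whoosh.query import FuzzyTerm, Term

def fuzzy_term(term, dist, field):
    if len(term) <= 3:
        return Term(field, term)  # too short for fuzzing to be useful
    return FuzzyTerm(field, term, maxdist=dist, prefixlength=1)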
def search(self, w):
    if not self.ix.up_to_date():
        # If the index is not up to date, someone has added cards, so we reinitialize the trie.
        self.initialize_trie()
    # If we searched for an alias, make it the exact hit.
    for alias, name in fetcher.card_aliases():
        if w == card.canonicalize(alias):
            return SearchResult(name, None, None, None)
    normalized = list(WhooshConstants.normalized_analyzer(w))[0].text
    # If we get matches by prefix, we return that.
    exact, prefix_whole_word, other_prefixed = self.find_matches_by_prefix(normalized)
    if exact or len(prefix_whole_word) > 0 or len(other_prefixed) > 0:
        return SearchResult(exact, prefix_whole_word, other_prefixed, None)
    # We try fuzzy and stemmed queries.
    query_normalized = fuzzy_term(normalized, self.DIST, "name_normalized")
    query_stemmed = And([Term('name_stemmed', q.text)
                         for q in WhooshConstants.stem_analyzer(w)])
    query_tokenized = And([fuzzy_term(q.text, self.DIST, "name_tokenized")
                           for q in WhooshConstants.tokenized_analyzer(w)])
    query = Or([query_normalized, query_tokenized, query_stemmed])
    with self.ix.searcher() as searcher:
        fuzzy = [(r['name'], r.score) for r in searcher.search(query, limit=40)]
    return SearchResult(exact, prefix_whole_word, other_prefixed, fuzzy)
def test_intersection():
    schema = fields.Schema(key=fields.ID(stored=True),
                           value=fields.TEXT(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    w.add_document(key=u("a"), value=u("alpha bravo charlie delta"))
    w.add_document(key=u("b"), value=u("echo foxtrot alpha bravo"))
    w.add_document(key=u("c"), value=u("charlie delta golf hotel"))
    w.commit()

    w = ix.writer()
    w.add_document(key=u("d"), value=u("india alpha bravo charlie"))
    w.add_document(key=u("e"), value=u("delta bravo india bravo"))
    w.commit()

    with ix.searcher() as s:
        q = And([Term("value", u("bravo")), Term("value", u("delta"))])
        m = q.matcher(s)
        assert _keys(s, m.all_ids()) == ["a", "e"]

        q = And([Term("value", u("bravo")), Term("value", u("alpha"))])
        m = q.matcher(s)
        assert _keys(s, m.all_ids()) == ["a", "b", "d"]
def build_search_kwargs(user_q, request, types, staff, orderby):
    if not staff:
        user_q = And([
            user_q,
            Or([Term('public', 't'), Term('users', request.user.username)] +
               [Term('groups', group.name) for group in request.user.groups.all()])
        ])
    if types and len(types) > 0:
        user_q = And([
            user_q,
            Or([Term('type', resource_type) for resource_type in types])
        ])
    orderby_f = FieldFacet(orderby.replace('-', ''), reverse=orderby.find('-') > -1)
    search_kwargs = {
        'sortedby': [orderby_f],
        'collapse': FieldFacet('vendor_name'),
        'collapse_limit': 1,
        'collapse_order': FunctionFacet(order_by_version),
    }
    return (user_q, search_kwargs)
def test_accept():
    def boost_phrases(q):
        if isinstance(q, Phrase):
            q.boost *= 2.0
        return q

    before = And([Term("a", u("b")),
                  Or([Term("c", u("d")), Phrase("a", [u("e"), u("f")])]),
                  Phrase("a", [u("g"), u("h")], boost=0.25)])
    after = before.accept(boost_phrases)
    assert_equal(after,
                 And([Term("a", u("b")),
                      Or([Term("c", u("d")),
                          Phrase("a", [u("e"), u("f")], boost=2.0)]),
                      Phrase("a", [u("g"), u("h")], boost=0.5)]))

    before = Phrase("a", [u("b"), u("c")], boost=2.5)
    after = before.accept(boost_phrases)
    assert_equal(after, Phrase("a", [u("b"), u("c")], boost=5.0))
def filter_by_player_and_team(jugador, equipo):
    ix = open_dir("Index_news")
    with ix.searcher() as buscador_noticias:
        list_aux_jugador = []
        list_aux_equipo = []
        for nom_divididio in equipo.split():
            list_aux_equipo.append(Or([
                Term("titulo", str(nom_divididio).lower()),
                Term("desc", str(nom_divididio).lower())
            ]))
        qe = And(list_aux_equipo)
        for nom_divididio in jugador.split():
            list_aux_jugador.append(Or([
                Term("titulo", str(nom_divididio).lower()),
                Term("desc", str(nom_divididio).lower())
            ]))
        qj = And(list_aux_jugador)
        q = And([qe, qj])
        results_whoosh = buscador_noticias.search(q, limit=None)
        results = []
        for result_whoosh in results_whoosh:
            results.append(result_whoosh.fields())
    return results
def _trashed(namespace):
    q = And([Term(WIKINAME, app.cfg.interwikiname), Term(TRASH, True)])
    if namespace != NAMESPACE_ALL:
        q = And([q, Term(NAMESPACE, namespace)])
    trashedEntry = namedtuple('trashedEntry', 'fqname oldname revid mtime comment editor')
    results = []
    for rev in flaskg.storage.search(q, limit=None):
        meta = rev.meta
        results.append(trashedEntry(rev.fqname, meta[NAME_OLD], meta[REVID],
                                    meta[MTIME], meta[COMMENT], get_editor_info(meta)))
    return results
def test_replace():
    q = And([Or([Term("a", "b"), Term("b", "c")], boost=1.2),
             Variations("a", "b", boost=2.0)])
    q = q.replace("a", "b", "BB")
    assert q == And([Or([Term("a", "BB"), Term("b", "c")], boost=1.2),
                     Variations("a", "BB", boost=2.0)])
def _trashed(namespace):
    q = And([Term(WIKINAME, app.cfg.interwikiname), Term(TRASH, True)])
    if namespace != NAMESPACE_ALL:
        q = And([q, Term(NAMESPACE, namespace)])
    trashedEntry = namedtuple('trashedEntry',
                              'fqname oldname revid rev_number mtime comment editor parentid')
    results = []
    for meta in flaskg.storage.search_meta(q, limit=None):
        fqname = CompositeName(meta[NAMESPACE], ITEMID, meta[ITEMID])
        results.append(trashedEntry(fqname, meta[NAME_OLD], meta[REVID], meta[REV_NUMBER],
                                    meta[MTIME], meta[COMMENT], get_editor_info(meta),
                                    meta[PARENTID]))
    return results
def get_subitem_revs(self):
    """
    Create a list of subitems of this item.

    Subitems are in the form of storage Revisions.
    """
    query = And([Term(WIKINAME, app.cfg.interwikiname),
                 Term(NAMESPACE, self.fqname.namespace)])
    # Trick: an item with an empty name can be considered a "virtual root item"
    # that has all wiki items as subitems.
    if self.names:
        query = And([query, Or([Prefix(NAME_EXACT, prefix)
                                for prefix in self.subitem_prefixes])])
    revs = flaskg.storage.search(query, sortedby=NAME_EXACT, limit=None)
    return revs
def test_duplicates():
    q = And([Term("a", u("b")), Term("a", u("b"))])
    assert_equal(q.normalize(), Term("a", u("b")))

    q = And([Prefix("a", u("b")), Prefix("a", u("b"))])
    assert_equal(q.normalize(), Prefix("a", u("b")))

    q = And([Variations("a", u("b")),
             And([Variations("a", u("b")), Term("a", u("b"))])])
    assert_equal(q.normalize(),
                 And([Variations("a", u("b")), Term("a", u("b"))]))

    q = And([Term("a", u("b")), Prefix("a", u("b")),
             Term("a", u("b"), boost=1.1)])
    assert_equal(q.normalize(), q)

    # A Wildcard without * or ? normalizes to Term.
    q = And([Wildcard("a", u("b")),
             And([Wildcard("a", u("b")), Term("a", u("b"))])])
    assert_equal(q.normalize(), Term("a", u("b")))
def test_merge_ranges():
    q = And([TermRange("f1", u("a"), None), TermRange("f1", None, u("z"))])
    assert_equal(q.normalize(), TermRange("f1", u("a"), u("z")))

    q = And([NumericRange("f1", None, u("aaaaa")),
             NumericRange("f1", u("zzzzz"), None)])
    assert_equal(q.normalize(), q)

    q = And([TermRange("f1", u("a"), u("z")), TermRange("f1", "b", "x")])
    assert_equal(q.normalize(), TermRange("f1", u("a"), u("z")))

    q = And([TermRange("f1", u("a"), u("m")), TermRange("f1", u("f"), u("q"))])
    assert_equal(q.normalize(), TermRange("f1", u("f"), u("m")))

    q = Or([TermRange("f1", u("a"), u("m")), TermRange("f1", u("f"), u("q"))])
    assert_equal(q.normalize(), TermRange("f1", u("a"), u("q")))

    q = Or([TermRange("f1", u("m"), None), TermRange("f1", None, u("n"))])
    assert_equal(q.normalize(), Every("f1"))

    q = And([Every("f1"), Term("f1", "a"), Variations("f1", "b")])
    assert_equal(q.normalize(), Every("f1"))

    q = Or([Term("f1", u("q")), TermRange("f1", u("m"), None),
            TermRange("f1", None, u("n"))])
    assert_equal(q.normalize(), Every("f1"))

    q = And([Or([Term("f1", u("a")), Term("f1", u("b"))]), Every("f1")])
    assert_equal(q.normalize(), Every("f1"))

    q = And([Term("f1", u("a")), And([Or([Every("f1")])])])
    assert_equal(q.normalize(), Every("f1"))
def sitemap():
    """
    Google (and others) XML sitemap
    """
    def format_timestamp(t):
        tm = time.gmtime(t)
        return time.strftime("%Y-%m-%dT%H:%M:%S+00:00", tm)

    sitemap = []
    for rev in flaskg.storage.documents(wikiname=app.cfg.interwikiname):
        fqnames = rev.fqnames
        mtime = rev.meta[MTIME]
        # These are the content items:
        changefreq = "daily"
        priority = "0.5"
        sitemap += [(fqname, format_timestamp(mtime), changefreq, priority)
                    for fqname in fqnames]
    # Add entries for root URLs.
    root_mapping = [(namespace, app.cfg.root_mapping.get(namespace, app.cfg.default_root))
                    for namespace, _ in app.cfg.namespace_mapping]
    query = Or([And([Term(NAME_EXACT, root), Term(NAMESPACE, namespace)])
                for namespace, root in root_mapping])
    for rev in flaskg.storage.search(q=query):
        mtime = rev.meta[MTIME]
        sitemap.append((rev.meta[NAMESPACE], format_timestamp(mtime), "hourly", "1.0"))
    sitemap.sort()
    content = render_template('misc/sitemap.xml', sitemap=sitemap)
    return Response(content, mimetype='text/xml')
def build_keywords_query(keywords):
    """
    Build parsers for a query.

    :param MultiDict keywords: The search texts keyed by scope key. If empty,
        the query will match every document.
    """
    queries = []
    if keywords:
        composer = current_app.config['KERKO_COMPOSER']
        text_plugins = [
            plugins.PhrasePlugin(),
            plugins.GroupPlugin(),
            plugins.OperatorsPlugin(
                And=r"(?<=\s)" + re.escape(gettext("AND")) + r"(?=\s)",
                Or=r"(?<=\s)" + re.escape(gettext("OR")) + r"(?=\s)",
                Not=r"(^|(?<=(\s|[()])))" + re.escape(gettext("NOT")) + r"(?=\s)",
                AndNot=None,
                AndMaybe=None,
                Require=None,
            ),
            plugins.BoostPlugin(),
        ]
        for key, value in keywords.items(multi=True):
            fields = [spec.key for spec in composer.fields.values() if key in spec.scopes]
            if not fields:
                raise KeyError  # No known field for that scope key.
            parser = MultifieldParser(fields, schema=composer.schema, plugins=text_plugins)
            queries.append(parser.parse(value))
    else:
        queries.append(Every())
    return And(queries)
def __sub__(self, query):
    """Allows you to use - between query objects to add the right-hand
    query as a "NOT" query.
    """
    from whoosh.query import And, Not
    return And([self, Not(query)]).normalize()
def __and__(self, query):
    """Allows you to use & between query objects to wrap them in an And
    query.
    """
    from whoosh.query import And
    return And([self, query]).normalize()
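# Usage sketch (not part of the original source) for the two operator
# overloads above; assumes only whoosh is installed.
from whoosh.query import Term

q = Term("content", "alpha") & Term("content", "bravo")
# q == And([Term('content', 'alpha'), Term('content', 'bravo')]), already normalized
q = Term("content", "alpha") - Term("content", "bravo")
# q == And([Term('content', 'alpha'), Not(Term('content', 'bravo'))])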
def build_keywords_query(keywords):
    """
    Build parsers for a query.

    :param MultiDict keywords: The search texts keyed by scope key. If empty,
        the query will match every document.
    """
    queries = []
    if keywords:
        composer = current_app.config['KERKO_COMPOSER']
        text_plugins = [PhrasePlugin(), GroupPlugin(), OperatorsPlugin()]
        for key, value in keywords.items(multi=True):
            fields = [spec.key for spec in composer.fields.values() if key in spec.scopes]
            if not fields:
                raise KeyError  # No known field for that scope key.
            parser = MultifieldParser(fields, schema=composer.schema, plugins=text_plugins)
            queries.append(parser.parse(value))
    else:
        queries.append(Every())
    return And(queries)
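# Standalone sketch of the pattern above, without the Kerko composer
# (the schema, field names, and keyword dict here are hypothetical):
from whoosh.fields import Schema, TEXT
from whoosh.qparser import MultifieldParser
from whoosh.query import And, Every

schema = Schema(title=TEXT, abstract=TEXT)
keywords = {"all": "alpha AND bravo"}  # scope key -> search text

queries = [MultifieldParser(["title", "abstract"], schema=schema).parse(text)
           for text in keywords.values()]
query = And(queries) if queries else Every()  # every sub-query must match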
def test_requires():
    a = Term("f", u("a"))
    b = Term("f", u("b"))
    assert_equal(And([a, b]).requires(), set([a, b]))
    assert_equal(Or([a, b]).requires(), set())
    assert_equal(AndMaybe(a, b).requires(), set([a]))
    assert_equal(a.requires(), set([a]))
def group_acl_report(group_name):
    """
    Display a table of items and permissions, where the ACL rule specifies any
    WikiGroup or ConfigGroup name.
    """
    query = And([Term(WIKINAME, app.cfg.interwikiname),
                 Not(Term(NAMESPACE, NAMESPACE_USERPROFILES))])
    all_metas = flaskg.storage.search_meta(query, idx_name=LATEST_REVS,
                                           sortedby=[NAMESPACE, NAME], limit=None)
    group_items = []
    for meta in all_metas:
        acl_iterator = ACLStringIterator(ACL_RIGHTS_CONTENTS, meta.get(ACL, ''))
        for modifier, entries, rights in acl_iterator:
            if group_name in entries:
                fqname = gen_fqnames(meta)
                group_items.append(dict(name=meta.get(NAME),
                                        itemid=meta.get(ITEMID),
                                        namespace=meta.get(NAMESPACE),
                                        fqname=fqname,
                                        rights=rights))
    return render_template('admin/group_acl_report.html',
                           title_name=_('Group ACL Report'),
                           group_items=group_items,
                           group_name=group_name)
def __call__(self, query):
    """search"""
    query = unicode(query)
    query_parser = QueryParser("description", schema=self.ix.schema)
    myquery = query_parser.parse(query)

    # Old code: too strict
    # extendedquery = Or([myquery] +
    #                    [Term(field, query) for field in self.keywords])

    # New code: too permissive
    # extendedquery = [myquery]
    excluded = set(['AND', 'OR', 'NOT'])
    terms = [i for i in query.split() if i not in excluded]
    # for field in self.keywords:
    #     extendedquery.extend([Term(field, term) for term in terms])
    # extendedquery = Or(extendedquery)

    # Code should look something like Or([myquery] + [Or(...)]):
    extendedquery = And([
        Or([myquery] +
           [Term('description', term), Term('name', term)] +
           [Term(field, term) for field in self.keywords])
        for term in terms
    ])

    # Perform the search.
    searcher = self.ix.searcher()
    return [i['name'] for i in searcher.search(extendedquery, limit=None)]
def get_comments(self):
    if self.meta.get(ITEMID) and self.meta.get(NAME):
        refers_to = self.meta[ITEMID]
    else:
        refers_to = self.fqname.value
    query = And([Term(WIKINAME, app.cfg.interwikiname),
                 Term(REFERS_TO, refers_to),
                 Term(ELEMENT, u'comment')])
    revs = flaskg.storage.search(query, sortedby=[MTIME], limit=None)
    comments = dict()
    lookup = dict()
    roots = []
    for rev in revs:
        lookup[rev.meta[ITEMID]] = rev
        comments[rev] = []
    for comment_id, rev in lookup.iteritems():
        if not rev.meta['reply_to']:
            roots.append(rev)
        else:
            parent = lookup[rev.meta['reply_to']]
            if comments.get(parent):
                comments[parent].append(rev)
            else:
                comments[parent] = [rev]
    return comments, roots
def build_filter_query(filters=None):
    """
    Build groupedby and filter queries based on facet specs.

    :param list filters: A list of (name, values) tuples, where values is
        itself a list.

    :return: A tuple with the Facets to perform grouping on, and the terms to
        filter on.
    """
    composer = current_app.config['KERKO_COMPOSER']
    groupedby = Facets()
    for spec in composer.facets.values():
        groupedby.add_field(spec.key, allow_overlap=spec.allow_overlap)
    terms = []
    if filters:
        for filter_key, filter_values in filters:
            spec = composer.get_facet_by_filter_key(filter_key)
            if spec:  # Ensure only valid filters.
                for v in filter_values:
                    if v == '':
                        # Filtering on a missing value: exclude all results
                        # that have any value in the facet field.
                        terms.append(Not(Every(spec.key)))
                    else:
                        v = spec.codec.transform_for_query(v)
                        terms.append(spec.query_class(spec.key, v))
    return groupedby, And(terms)
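# Sketch (not from the original source) of the "missing value" trick above:
# Every(field) matches every document that has any term in `field`, so
# Not(Every(field)) keeps only documents with no value there. Field names
# here are hypothetical.
from whoosh.query import And, Every, Not, Term

filter_q = And([Term("lang", "en"), Not(Every("topic"))])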
def test_simplify():
    s = fields.Schema(k=fields.ID, v=fields.TEXT)
    ix = RamStorage().create_index(s)
    w = ix.writer()
    w.add_document(k=u("1"), v=u("aardvark apple allan alfa bear bee"))
    w.add_document(k=u("2"), v=u("brie glue geewhiz goop julia"))
    w.commit()

    r = ix.reader()
    q1 = And([Prefix("v", "b", boost=2.0), Term("v", "juliet")])
    q2 = And([Or([Term('v', u('bear'), boost=2.0),
                  Term('v', u('bee'), boost=2.0),
                  Term('v', u('brie'), boost=2.0)]),
              Term('v', 'juliet')])
    assert_equal(q1.simplify(r), q2)
def check_itemid(self):
    # Once a ticket has both name and itemid, use itemid.
    if self.meta.get(ITEMID) and self.meta.get(NAME):
        query = And([Term(WIKINAME, app.cfg.interwikiname),
                     Term(REFERS_TO, self.meta[NAME])])
        revs = flaskg.storage.search(query, limit=None)
        prefix = self.meta[NAME][0] + '/'
        for rev in revs:
            # TODO: if this is not dead code, add a comment explaining how to get here.
            old_names = rev.meta[NAME]
            for old_name in old_names:
                file_name = old_name[len(prefix):]
                try:
                    new_name = self.meta[ITEMID] + '/' + file_name
                    item = Item.create(new_name)
                    item.modify({}, rev.meta[CONTENT],
                                refers_to=self.meta[ITEMID], element='file')
                    item = Item.create(old_name)
                    item._save(item.meta, name=old_name, action=ACTION_TRASH)  # delete
                except AccessDenied:
                    abort(403)
def get_comments(self):
    """
    Return a list of roots (comments to the original ticket) and a dict of
    comments (comments to comments).
    """
    refers_to = self.meta[ITEMID]
    query = And([Term(WIKINAME, app.cfg.interwikiname),
                 Term(REFERS_TO, refers_to),
                 Term(ELEMENT, 'comment')])
    revs = flaskg.storage.search(query, sortedby=[MTIME], limit=None)
    comments = dict()  # {rev: [...], ...} comments to a comment
    lookup = dict()  # {itemid: rev, ...}
    roots = []
    revs = list(revs)
    for rev in revs:
        lookup[rev.meta[ITEMID]] = rev
        comments[rev] = []
    for rev in revs:
        if not rev.meta['reply_to']:
            roots.append(rev)
        else:
            parent = lookup[rev.meta['reply_to']]
            comments[parent] = comments.get(parent, []) + [rev]
    return comments, roots
def add_other_versions(searcher, hits, user, staff):
    results = [hit.fields() for hit in hits]
    allow_q = []
    if not staff:
        allow_q = [Or([Term('public', 't'), Term('users', user.username.lower())] +
                      [Term('groups', group.name.lower()) for group in user.groups.all()])]
    for result in results:
        user_q = And([Term('vendor_name', '%s/%s' % (result['vendor'], result['name']))] +
                     allow_q)
        version_results = [h.fields()['version'] for h in searcher.search(user_q)]
        result['others'] = [v for v in version_results if v != result['version']]
    return results
def search_text(self, groupname: str, field: str, text: str, op: str,
                sortby: str = "path",
                scoring: str = "unscored") -> Iterable[SearchResult]:
    assert sortby in ("path", "score")
    assert scoring in ("unscored", "bm25f")
    scored = scoring != "unscored"
    sortedby = {"score": None}.get(sortby, sortby)
    limit = None
    qp = QueryParser(field, self.invindex.ix.schema)
    q = qp.parse(text)
    terms = [Term(fieldname, value) for fieldname, value in q.iter_all_terms()]
    if op == "and":
        query = And(terms)
    elif op == "or":
        query = Or(terms)
    else:
        raise ValueError("op must be 'and' or 'or', got %r" % op)
    with self.searcher() as searcher:
        for hit in searcher.search(query, limit=limit, scored=scored, sortedby=sortedby):
            yield Path(hit["path"]), hit.score  # hit.pos, hit.rank, hit.docnum
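# Standalone sketch (hypothetical schema) of the recombination step above:
# parse the user's text once, then rebuild a strict-AND or loose-OR query
# from the individual terms it produced.
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from whoosh.query import And, Or, Term

schema = Schema(body=TEXT)
q = QueryParser("body", schema).parse("alpha bravo charlie")
terms = [Term(f, v) for f, v in q.iter_all_terms()]
all_of = And(terms)  # every term must match
any_of = Or(terms)   # any term may match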