def search_all(username, field_name="content", key_word="分布式", **kwargs):
    WHOOSH_PATH = '/home/python/Learn/django/Haystack-Whoosh/whoosh_index/%s' % username
    if not os.path.exists(WHOOSH_PATH):
        return []
    # if not id and not keyword and not title and not content:
    #     return []
    index = whoosh_open_idx(WHOOSH_PATH, WHOOSH_SCHEMA)
    searcher = index.searcher()

    args = {
        "limit": None,
    }
    if "sortedby" in kwargs:
        sortedby = kwargs.pop("sortedby")
        if "orderby" in kwargs:
            orderby = kwargs.pop("orderby")
        else:
            orderby = "desc"
        if orderby == "desc":
            facet = sorting.FieldFacet(sortedby, reverse=True)
        else:
            facet = sorting.FieldFacet(sortedby)
        args["sortedby"] = facet
    args.update(kwargs)
    return searcher.find(field_name, key_word, **args)
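# Hypothetical call of search_all above (a sketch only; the username and the
# sort field are assumptions, not part of the original code). "sortedby" and
# "orderby" are consumed by the function itself; any other keyword, such as
# "limit", is passed straight through to searcher.find().
hits = search_all("demo_user", sortedby="id", orderby="desc", limit=20)
for hit in hits:
    print(dict(hit))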
def query_search(indexdir, queries, n=10, function='BM25F'):
    ix = index.open_dir(indexdir)
    search_fields = ['resname', 'categories', 'address', 'city', 'state']  # search fields
    og = qparser.OrGroup.factory(0.9)
    qp = MultifieldParser(search_fields, ix.schema, termclass=query.Variations, group=og)
    qp.add_plugin(DateParserPlugin(free=True))
    q = qp.parse(queries)
    result_index = []
    if function == 'BM25F':
        with ix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2)) as s:
            rates = sorting.FieldFacet('rating', reverse=True)
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=n, sortedby=[scores, rates])
            k = min(len(results), n)
            for i in range(k):
                result_index.append(int(results[i]['ID']))
    if function == 'TF_IDF':
        with ix.searcher(weighting=scoring.TF_IDF()) as s:
            rates = sorting.FieldFacet('rating', reverse=True)
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=n, sortedby=[scores, rates])
            k = min(len(results), n)
            for i in range(k):
                result_index.append(int(results[i]['ID']))
    return result_index
def _paginated_search_mails(self, query, window, page):
    page = int(page) if page is not None and int(page) > 1 else 1
    window = int(window) if window is not None else 25

    with self._index.searcher() as searcher:
        tags_facet = sorting.FieldFacet('tag', allow_overlap=True, maptype=sorting.Count)
        sorting_facet = sorting.FieldFacet('date', reverse=True)
        results = searcher.search_page(query, page, pagelen=window,
                                       groupedby=tags_facet, sortedby=sorting_facet)
        return unique([mail['ident'] for mail in results]), sum(results.results.groups().values())
def test_translate():
    domain = [
        ("alfa", 100, 50),
        ("bravo", 20, 80),
        ("charlie", 10, 10),
        ("delta", 82, 39),
        ("echo", 20, 73),
        ("foxtrot", 81, 59),
        ("golf", 39, 93),
        ("hotel", 57, 48),
        ("india", 84, 75),
    ]

    schema = fields.Schema(name=fields.TEXT(sortable=True),
                           a=fields.NUMERIC(sortable=True),
                           b=fields.NUMERIC(sortable=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for name, a, b in domain:
            w.add_document(name=u(name), a=a, b=b)

    with ix.searcher() as s:
        q = query.Every()

        # Baseline: just sort by a field
        r = s.search(q, sortedby="a")
        assert " ".join([hit["name"] for hit in r]) == "charlie bravo echo golf hotel foxtrot delta india alfa"

        # Sort by reversed name
        target = [x[0] for x in sorted(domain, key=lambda x: x[0][::-1])]
        tf = sorting.TranslateFacet(lambda name: name[::-1], sorting.FieldFacet("name"))
        r = s.search(q, sortedby=tf)
        assert [hit["name"] for hit in r] == target

        # Sort by average of a and b
        def avg(a, b):
            return (a + b) / 2

        target = [x[0] for x in sorted(domain, key=lambda x: (x[1] + x[2]) / 2)]
        af = sorting.FieldFacet("a")
        bf = sorting.FieldFacet("b")
        tf = sorting.TranslateFacet(avg, af, bf)
        r = s.search(q, sortedby=tf)
        assert [hit["name"] for hit in r] == target
def search2(username, id=None, keyword=None, title=None, content=None,
            sortedby="id", orderby="desc", page=1, page_size=10):
    # Each user can have their own index directory
    WHOOSH_PATH = '/home/python/Learn/django/Haystack-Whoosh/whoosh_index/%s' % username
    if not os.path.exists(WHOOSH_PATH):
        return []
    # if not id and not keyword and not title and not content:
    #     return []
    index = whoosh_open_idx(WHOOSH_PATH, WHOOSH_SCHEMA)
    searcher = index.searcher()

    print("-----------2")
    parser = QueryParser("content", index.schema)
    myquery = parser.parse("分布式")
    facet = sorting.FieldFacet("id", reverse=True)  # order the search results by this field
    results = searcher.search(
        myquery, limit=None,
        sortedby=facet)  # limit caps the number of hits (default 10); see the official docs linked at the top of the post
    for result1 in results:
        print(dict(result1))
    print("-----------2")
def test_nocachefield_segments():
    schema = fields.Schema(a=fields.ID(stored=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(a=u("bravo"))
    w.add_document(a=u("echo"))
    w.add_document(a=u("juliet"))
    w.commit()
    w = ix.writer()
    w.add_document(a=u("kilo"))
    w.add_document(a=u("foxtrot"))
    w.add_document(a=u("charlie"))
    w.commit(merge=False)
    w = ix.writer()
    w.delete_by_term("a", u("echo"))
    w.add_document(a=u("alfa"))
    w.add_document(a=u("india"))
    w.add_document(a=u("delta"))
    w.commit(merge=False)

    with ix.searcher() as s:
        q = query.TermRange("a", u("bravo"), u("k"))
        facet = sorting.FieldFacet("a", reverse=True)

        r = s.search(q, sortedby=facet)
        assert [hit["a"] for hit in r] == ["juliet", "india", "foxtrot",
                                           "delta", "charlie", "bravo"]

        mq = query.Or([query.Term("a", u("bravo")), query.Term("a", u("delta"))])
        anq = query.AndNot(q, mq)
        r = s.search(anq, sortedby=facet)
        assert [hit["a"] for hit in r] == ["juliet", "india", "foxtrot", "charlie"]

        mq = query.Or([query.Term("a", u("bravo")), query.Term("a", u("delta"))])
        r = s.search(q, mask=mq, sortedby=facet)
        assert [hit["a"] for hit in r] == ["juliet", "india", "foxtrot", "charlie"]

        fq = query.Or([query.Term("a", u("alfa")), query.Term("a", u("charlie")),
                       query.Term("a", u("echo")), query.Term("a", u("india"))])
        r = s.search(query.Every(), filter=fq, sortedby=facet)
        assert [hit["a"] for hit in r] == ["india", "charlie", "alfa"]

        nq = query.Not(query.Or([query.Term("a", u("alfa")),
                                 query.Term("a", u("india"))]))
        r = s.search(query.Every(), filter=nq, sortedby=facet)
        assert [hit["a"] for hit in r] == ["kilo", "juliet", "foxtrot",
                                           "delta", "charlie", "bravo"]
def test(ix):
    with ix.searcher() as s:
        # Sort by title
        r = s.search(query.Every(), sortedby="title")
        assert [hit["title"] for hit in r] == sorted_titles

        # Sort by reverse title
        facet = sorting.FieldFacet("title", reverse=True)
        r = s.search(query.Every(), sortedby=facet)
        assert [hit["title"] for hit in r] == list(reversed(sorted_titles))

        # Sort by num (-10 to 10) first, and within that, by reverse title
        facet = sorting.MultiFacet()
        facet.add_field("num")
        facet.add_field("title", reverse=True)
        r = s.search(query.Every(), sortedby=facet)
        target = ["Visual and Statistical Thinking",
                  "Cognitive Style of Powerpoint",
                  "Beautiful Evidence",
                  "Visual Explanations",
                  "Visual Display of Quantitative Information, The",
                  "Envisioning Information",
                  ]
        assert [hit["title"] for hit in r] == target
def searchuser():
    q = request.args.get("q", "")  # repr
    offset = int(request.args.get("offset", 0))
    count = int(request.args.get("count", 20))
    with uix.searcher() as searcher:
        query = QueryParser("nickname", uix.schema).parse(
            "nickname:*%s*" % q)  # QueryParser("name", ix.schema).parse("tash*")
        #print query
        user_id = sorting.FieldFacet("user_id", reverse=True)
        results = searcher.search_page(query, max(offset / count, 0) + 1,
                                       pagelen=count, sortedby=user_id)
        print results.offset, count, offset, max(offset / count, 0) + 1
        if results.offset < offset:
            return "[]"
        tmp = hashlib.md5(str(mktime(datetime.datetime.now().timetuple()))
                          ).hexdigest() + "user_search_tmp"
        lua = """local searched = loadstring('return ' .. KEYS[1])()
for i = 1, table.getn(searched) do
    redis.call('sadd', KEYS[2], tostring(searched[i]))
end
local arr = redis.call('sort', KEYS[2], 'GET', 'users:*')
return arr"""
        res = rs.eval(
            lua, 2,
            '{' + ','.join(str(hit['user_id']) for hit in results) + '}',
            tmp)
        rs.delete(tmp)
        r = ",".join(res)
        return "[" + r + "]"
def test_overlapping_lists():
    schema = fields.Schema(id=fields.STORED, tags=fields.KEYWORD)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(id=0, tags=u("alfa bravo charlie"))
        w.add_document(id=1, tags=u("bravo charlie delta"))
        w.add_document(id=2, tags=u("charlie delta echo"))
        w.add_document(id=3, tags=u("delta echo alfa"))
        w.add_document(id=4, tags=u("echo alfa bravo"))

    with ix.searcher() as s:
        of = sorting.FieldFacet("tags", allow_overlap=True)
        cat = of.categorizer(s)
        assert not cat._use_vectors

        r = s.search(query.Every(), groupedby={"tags": of})
        assert r.groups("tags") == {'alfa': [0, 3, 4], 'bravo': [0, 1, 4],
                                    'charlie': [0, 1, 2], 'delta': [1, 2, 3],
                                    'echo': [2, 3, 4]}

        fcts = sorting.Facets()
        fcts.add_field("tags", allow_overlap=True)
        r = s.search(query.Every(), groupedby=fcts)
        assert r.groups("tags") == {'alfa': [0, 3, 4], 'bravo': [0, 1, 4],
                                    'charlie': [0, 1, 2], 'delta': [1, 2, 3],
                                    'echo': [2, 3, 4]}
def find_unique_orgid(self, q, limit):
    facet = sorting.FieldFacet("id", reverse=True)
    jobs = self.ix.searcher().search(q, collapse="orgid", sortedby=facet, limit=limit)
    return jobs
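# A minimal, self-contained sketch of the collapse behaviour used above
# (illustrative only; the field names "orgid"/"id" mirror the method, the data
# is invented and is not from the original project).
from whoosh import fields, query, sorting
from whoosh.filedb.filestore import RamStorage

schema = fields.Schema(orgid=fields.ID(stored=True, sortable=True),
                       id=fields.NUMERIC(stored=True, sortable=True))
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(orgid=u"org1", id=1)
    w.add_document(orgid=u"org1", id=2)
    w.add_document(orgid=u"org2", id=3)

with ix.searcher() as s:
    facet = sorting.FieldFacet("id", reverse=True)
    # collapse="orgid" keeps only the top-ranked hit per orgid in the sorted results
    r = s.search(query.Every(), collapse="orgid", sortedby=facet, limit=10)
    print([hit["id"] for hit in r])  # expected: [3, 2] -- one newest hit per orgid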
def test_sorting():
    from whoosh import sorting

    schema = fields.Schema(id=fields.STORED, name=fields.ID(stored=True),
                           size=fields.NUMERIC)
    ix = RamIndex(schema)
    with ix.writer() as w:
        w.add_document(id=0, name=u("bravo"), size=10)
        w.add_document(id=1, name=u("alfa"), size=9)
        w.add_document(id=2, name=u("delta"), size=8)
        w.add_document(id=3, name=u("charlie"), size=7)

    with ix.searcher() as s:
        q = query.Every()

        r = s.search(q, sortedby="name")
        assert_equal([hit["id"] for hit in r], [1, 0, 3, 2])

        r = s.search(q, sortedby="size")
        assert_equal([hit["id"] for hit in r], [3, 2, 1, 0])

        facet = sorting.FieldFacet("size", reverse=True)
        r = s.search(q, sortedby=facet)
        assert_equal([hit["id"] for hit in r], [0, 1, 2, 3])
def search(self, parameter):
    # Extract the query fields and build a parser
    keys = parameter['keys']
    parser = None
    if len(keys) == 1:
        parser = QueryParser(keys[0], schema=self.index.schema)
    elif len(keys) > 1:
        parser = MultifieldParser(keys, schema=self.index.schema)

    # Search options (sorting, paging)
    # score = sorting.ScoreFacet()  # relevance
    id = sorting.FieldFacet('id', reverse=False)  # sort by the id field
    _limit = None  # paging limit
    if 'page' in parameter and 'pagesize' in parameter:
        page = parameter['page']
        pagesize = parameter['pagesize']
        if page > 0 and pagesize != 0:
            _limit = page * pagesize

    # Run the search
    query = parser.parse(parameter['keywords'])
    result = self.searcher.search(query, limit=_limit, sortedby=[id])

    # Build the response
    res = list()
    for hit in result:
        res.append({
            'title': hit['title'],
            'url': hit['url'],
            'content': re.sub(r'<[^>]+>', ' | ', hit.highlights('content'), flags=re.S)
        })
    return res
def test_add_sortable():
    st = RamStorage()
    schema = fields.Schema(chapter=fields.ID(stored=True), price=fields.NUMERIC)
    ix = st.create_index(schema)
    with ix.writer() as w:
        w.add_document(chapter=u("alfa"), price=100)
        w.add_document(chapter=u("bravo"), price=200)
        w.add_document(chapter=u("charlie"), price=300)
        w.add_document(chapter=u("delta"), price=400)
    with ix.writer() as w:
        w.add_document(chapter=u("bravo"), price=500)
        w.add_document(chapter=u("alfa"), price=600)
        w.add_document(chapter=u("delta"), price=100)
        w.add_document(chapter=u("charlie"), price=200)
        w.merge = False

    with ix.reader() as r:
        assert not r.has_column("chapter")
        assert not r.has_column("price")

    with ix.writer() as w:
        sorting.add_sortable(w, "chapter", sorting.StoredFieldFacet("chapter"))
        sorting.add_sortable(w, "price", sorting.FieldFacet("price"))
        w.schema.test = 100

    with ix.reader() as r:
        assert r.has_column("chapter")
        assert r.has_column("price")

        chapr = r.column_reader("chapter")
        pricer = r.column_reader("price")
        assert chapr[0] == "alfa"
        assert pricer[0] == 100
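# Follow-up sketch (not part of the original test): once add_sortable has built
# the column, the field can be used for sorting even though the original schema
# did not declare it sortable. Reuses ix from the test above.
from whoosh import query, sorting

with ix.searcher() as s:
    r = s.search(query.Every(), sortedby=sorting.FieldFacet("price", reverse=True))
    print([hit["chapter"] for hit in r])  # chapters ordered by descending price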
def bounced_addresses_filter(searcher, contacts):
    query = QueryParser('bounced', searcher.schema).parse('*')
    bounced_addresses = searcher.search(
        query, limit=None,
        groupedby=sorting.FieldFacet('bounced', allow_overlap=True)).groups()
    return set(contacts) - set(flatten([bounced_addresses]))
def search(self, query_string, page="1", limit=20):
    results = []
    query_string = unicode(query_string, 'utf-8')
    with self.index.searcher() as searcher:
        query = QueryParser("content", self.index.schema).parse(query_string)
        scores = sorting.ScoreFacet()
        sortperson = sorting.FieldFacet("person")
        sortcollection = sorting.FieldFacet("collection", reverse=True)
        resultset = searcher.search_page(
            query, int(page), pagelen=int(limit),
            sortedby=[sortcollection, scores, sortperson])
        # NOTE: Need to copy plain dicts out, since once the searcher
        # dies (end of with block), the Hit results lose their reference to
        # the data.
        for hit in resultset[0:]:
            # Grab a copy of the results as a plain dict.
            result = hit.fields()
            # Also grab the surrounding fragment as a highlight.
            # NOTE: This is pretty much the only point we know
            # "where" in the matched document the hit occurs.
            # The raw content we indexed is stored in 'content',
            # so we tell the Hit instance to pull the surrounding
            # text fragments from there.
            # Also:
            # These highlights are pretty much the only reason
            # we need to bother stashing the entire document.
            # Otherwise, the index can be even smaller.
            # Whoosh allows to hunt for the content in the
            # original files, if they're available. But as our
            # text content isn't large -- keeping it in the
            # index seems faster.
            result['highlights'] = hit.highlights('content')
            results.append(result)
    results = {
        'matches': results,
        'matches_returned': resultset.scored_length(),
        'total_matches': len(resultset),
        'query': query_string
    }
    return results
def _search_all_mails(self, query):
    with self._index.searcher() as searcher:
        sorting_facet = sorting.FieldFacet('date', reverse=True)
        results = searcher.search(query, sortedby=sorting_facet, reverse=True, limit=None)
        return unique([mail['ident'] for mail in results])
def test_reverse_collapse():
    from whoosh import sorting

    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT,
                           path=fields.ID(stored=True),
                           tags=fields.KEYWORD,
                           order=fields.NUMERIC(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(title=u"First document", content=u"This is my document!",
                       path=u"/a", tags=u"first", order=20.0)
        w.add_document(title=u"Second document", content=u"This is the second example.",
                       path=u"/b", tags=u"second", order=12.0)
        w.add_document(title=u"Third document", content=u"Examples are many.",
                       path=u"/c", tags=u"third", order=15.0)
        w.add_document(title=u"Thirdish document", content=u"Examples are too many.",
                       path=u"/d", tags=u"third", order=25.0)

    with ix.searcher() as s:
        q = query.Every('content')
        r = s.search(q)
        assert [hit["path"] for hit in r] == ["/a", "/b", "/c", "/d"]

        q = query.Or([query.Term("title", "document"),
                      query.Term("content", "document"),
                      query.Term("tags", "document")])
        cf = sorting.FieldFacet("tags")
        of = sorting.FieldFacet("order", reverse=True)
        r = s.search(q, collapse=cf, collapse_order=of, terms=True)
        assert [hit["path"] for hit in r] == ["/a", "/b", "/d"]
def autocomplete(query_str, results=10):
    query_str = u' '.join([
        t.text for t in _analyzer(query_str)
        if not 'university'.startswith(t.text)
    ])
    q = _query_parser.parse(query_str)
    return [
        _ror_rows[row['ror']]
        for row in _searcher.search_page(
            q, 1, results,
            sortedby=[
                sorting.FieldFacet('citation_score', reverse=True),
                sorting.FieldFacet('num_students', reverse=True),
                sorting.ScoreFacet(),
            ])
    ]
def search(term):
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(term)
        date_sort_facet = sorting.FieldFacet("date", reverse=True)
        results = searcher.search(query, sortedby=date_sort_facet)
        links = []
        for x in results:
            links.append('<li><a href="' + x['path'] + '.html' + '">' +
                         x['title'] + '</a></li>\n')
        return u"".join(links).encode('utf8')
def search_index(query):
    # sizes = sorting.FieldFacet("size")
    # prices = sorting.FieldFacet("price", reverse=True)
    # results = searcher.search(myquery, sortedby=[sizes, prices])
    lec_ids = sorting.FieldFacet("lec_id")
    agreeCounts = sorting.FieldFacet("agreeCount", reverse=True)
    marks = sorting.FieldFacet("mark", reverse=True)
    ix = open_dir("indexdir")
    with ix.searcher() as searcher:
        # query = MultifieldParser(["url", "title", "tags", "note", "article"], ix.schema).parse("使用")
        parser = QueryParser("content", ix.schema)
        myquery = parser.parse(query)
        # This failed at first because the txt files were not UTF-8 encoded, which produced garbled text.
        # results = searcher.search(myquery, limit=None)
        # results = searcher.search_page(myquery, 5)
        results = searcher.search(myquery, limit=None,
                                  sortedby=[lec_ids, agreeCounts, marks])
        print(len(results))
        print(type(results))
        # print(results[:])
        # for i in range(len(results)):
        #     print(results[i]); print('\n')  # IndexError: results[10]: Results only has 10 hits
        print(results)
        count = 0
        client = pymongo.MongoClient("mongodb://address")
        db_opt = client["course_info"]
        b = list()
        for i in results:
            count += 1
            print(i)
            j = dict(i)
            print(j)
            db_opt.result.insert_one({
                'lec_id': j['lec_id'],
                'agreeCount': j['agreeCount'],
                'mark': j['mark'],
                'content': j['content']
            })
            b.append(j['lec_id'])
        seta = set(b)
        print(count)
        return seta
def _search_tag_groups(self, is_filtering_tags):
    seen = None
    query_parser = QueryParser('tag', self._index.schema)
    options = {'limit': None,
               'groupedby': sorting.FieldFacet('tag', allow_overlap=True),
               'maptype': sorting.Count}

    with self._index.searcher() as searcher:
        total = searcher.search(query_parser.parse('*'), **options).groups()
        if not is_filtering_tags:
            seen = searcher.search(query_parser.parse("* AND flags:%s" % Status.SEEN),
                                   **options).groups()
    return seen, total
def contacts(self, query):
    restrict_q = Term("tag", "drafts") | Term("tag", "trash")
    if query:
        to = QueryParser('to', self._index.schema)
        cc = QueryParser('cc', self._index.schema)
        bcc = QueryParser('bcc', self._index.schema)
        sender = QueryParser('sender', self._index.schema)

        with self._index.searcher() as searcher:
            to = searcher.search(to.parse("*%s*" % query), limit=None,
                                 mask=restrict_q,
                                 groupedby=sorting.FieldFacet('to', allow_overlap=True)).groups()
            cc = searcher.search(cc.parse("*%s*" % query), limit=None,
                                 mask=restrict_q,
                                 groupedby=sorting.FieldFacet('cc', allow_overlap=True)).groups()
            bcc = searcher.search(bcc.parse("*%s*" % query), limit=None,
                                  mask=restrict_q,
                                  groupedby=sorting.FieldFacet('bcc', allow_overlap=True)).groups()
            sender = searcher.search(sender.parse("*%s*" % query), limit=None,
                                     mask=restrict_q,
                                     groupedby=sorting.FieldFacet('sender', allow_overlap=True)).groups()

        return flatten([to, cc, bcc, sender])

    return []
def test_compound_sort():
    fspec = fields.KEYWORD(stored=True, sortable=True)
    schema = fields.Schema(a=fspec, b=fspec, c=fspec)
    ix = RamStorage().create_index(schema)

    alist = u("alfa bravo alfa bravo alfa bravo alfa bravo alfa bravo").split()
    blist = u("alfa bravo charlie alfa bravo charlie alfa bravo charlie alfa").split()
    clist = u("alfa bravo charlie delta echo foxtrot golf hotel india juliet").split()
    assert all(len(ls) == 10 for ls in (alist, blist, clist))

    with ix.writer() as w:
        for i in xrange(10):
            w.add_document(a=alist[i], b=blist[i], c=clist[i])

    with ix.searcher() as s:
        q = query.Every()
        sortedby = [sorting.FieldFacet("a"),
                    sorting.FieldFacet("b", reverse=True),
                    sorting.FieldFacet("c")]

        r = s.search(q, sortedby=sortedby)
        output = []
        for hit in r:
            output.append(" ".join((hit["a"], hit["b"], hit["c"])))

        assert output == [
            "alfa charlie charlie",
            "alfa charlie india",
            "alfa bravo echo",
            "alfa alfa alfa",
            "alfa alfa golf",
            "bravo charlie foxtrot",
            "bravo bravo bravo",
            "bravo bravo hotel",
            "bravo alfa delta",
            "bravo alfa juliet",
        ]
def search_addresses(searcher, query):
    restrict_q = Term("tag", "drafts") | Term("tag", "trash")
    results = []
    for field in ['to', 'cc', 'bcc', 'sender']:
        query_parser = QueryParser(field, searcher.schema)
        results.append(
            searcher.search(query_parser.parse("*%s*" % query),
                            limit=None,
                            mask=restrict_q,
                            groupedby=sorting.FieldFacet(field, allow_overlap=True)).groups())
    return flatten(results)
def search(q, filters, query_string, max_facets=5):
    """
    Search for a query term and a set of filters.
    Returns a list of hits and the representation of the facets.
    """
    ix = get_or_create_index()
    hits = []
    active_facets = []
    facets = [sorting.FieldFacet("tags", allow_overlap=True, maptype=sorting.Count)]
    tags = Tag.objects.values('slug', 'title', )

    parser = qparser.QueryParser("text", schema=ix.schema)  # , group=og)
    try:
        q = parser.parse("*" + q + "*")
    except:
        q = None

    if q or filters:
        searcher = ix.searcher()
        for filter_value in filters:
            filter_name = "tags"
            q = q & query.Term(filter_name, filter_value)
        hits = searcher.search(q.normalize(), groupedby=facets)

        sorted_facets = sorted(hits.groups("tags").items(),
                               key=operator.itemgetter(1, 0), reverse=True)
        facets = []
        for facet_slug, facet_value in sorted_facets:
            if not facet_slug:
                continue
            qs = query_string.copy()
            qs["page"] = "1"
            if facet_slug in filters:
                qs.setlist('f', [f for f in filters if f != facet_slug])
                state = "active"
            else:
                qs.appendlist('f', facet_slug)
                state = "available"
            obj = tags.get(slug=facet_slug)
            facet_dict = {
                'slug': facet_slug,
                'title': obj.get("title", ""),
                'count': facet_value,
                'qs': qs.urlencode(),
            }
            if state == 'active':
                active_facets.append(facet_dict)
            else:
                facets.append(facet_dict)

    return {"hits": hits, "facets": facets, "active_facets": active_facets}
def contacts(self, query):
    if query:
        to = QueryParser('to', self._index.schema)
        cc = QueryParser('cc', self._index.schema)
        bcc = QueryParser('bcc', self._index.schema)

        with self._index.searcher() as searcher:
            to = searcher.search(to.parse("*%s*" % query), limit=None,
                                 groupedby=sorting.FieldFacet('to', allow_overlap=True)).groups()
            cc = searcher.search(cc.parse("*%s*" % query), limit=None,
                                 groupedby=sorting.FieldFacet('cc', allow_overlap=True)).groups()
            bcc = searcher.search(bcc.parse("*%s*" % query), limit=None,
                                  groupedby=sorting.FieldFacet('bcc', allow_overlap=True)).groups()

        return flatten([to, cc, bcc])

    return []
def search(self, text: str, sorted_by_count=False, fieldname='body'):
    """returns (dset, item_gen)

    results.docs(): a set of docnums of the results in the index
    results.items(): a generator of (docnum, score) pairs, from highest
        score/count to lowest
    """
    isearcher = self._searcher

    skw = {'limit': None}
    skw['q'] = QueryParser(fieldname, self.ix.schema).parse(text)
    if sorted_by_count:
        skw['sortedby'] = sorting.FieldFacet('count', reverse=True)

    results = isearcher.search(**skw)
    return results.docs(), results.items()
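# Hypothetical usage of the method above (sketch only; "indexer" stands in for
# an instance of the surrounding class, whose schema is assumed to have an
# indexed "body" field and a sortable "count" field).
docnums, ranked = indexer.search(u"whoosh faceting", sorted_by_count=True)
for docnum, score in ranked:
    print(docnum, score)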
def search(humanReadableId):
    query = request.args.get('q', '').strip()
    pagination = None
    suggestion = None
    if query:
        index_base_dir = config().get_path("ZIM", "wikipedia_index_dir")
        index_dir = os.path.join(index_base_dir, humanReadableId)
        page = int(request.args.get('page', 1))

        # Load index so we can query it for which fields exist
        ix = whoosh_open_dir_32_or_64(index_dir)

        # Set a higher value for the title field so it is weighted more
        weighting = scoring.BM25F(title_B=1.0)

        # Sort pages with "Image:" in their title after
        # regular articles
        def image_pages_last(searcher, docnum):
            fields = searcher.stored_fields(docnum)
            if fields['title'].find("Image:") == 0:
                return 1
            else:
                return 0

        # Support older whoosh indexes that do not have a reverse_links field
        if 'reverse_links' in ix.schema.names():
            sortedby = sorting.MultiFacet([
                sorting.FunctionFacet(image_pages_last),
                sorting.ScoreFacet(),
                sorting.FieldFacet("reverse_links", reverse=True),
            ])
        else:
            sortedby = sorting.MultiFacet([
                sorting.FunctionFacet(image_pages_last),
                sorting.ScoreFacet(),
            ])

        (pagination, suggestion) = paginated_search(ix, ["title", "content"], query,
                                                    page, weighting=weighting,
                                                    sort_column=sortedby)
    else:
        flash(_('Please input keyword(s)'), 'error')

    return render_template('zim/search.html',
                           humanReadableId=humanReadableId,
                           pagination=pagination,
                           suggestion=suggestion,
                           keywords=query,
                           endpoint_desc=EndPointDescription(
                               'zim_views.search',
                               {'humanReadableId': humanReadableId}))
def test_group_types():
    schema = fields.Schema(a=fields.STORED, b=fields.TEXT, c=fields.ID)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(a=0, b=u("blah"), c=u("apple"))
        w.add_document(a=1, b=u("blah blah"), c=u("bear"))
        w.add_document(a=2, b=u("blah blah blah"), c=u("apple"))
        w.add_document(a=3, b=u("blah blah blah blah"), c=u("bear"))
        w.add_document(a=4, b=u("blah blah blah blah blah"), c=u("apple"))
        w.add_document(a=5, b=u("blah blah blah blah blah blah"), c=u("bear"))
        w.add_document(a=6, b=u("blah blah blah blah blah blah blah"), c=u("apple"))

    with ix.searcher() as s:
        q = query.Term("b", "blah")

        f = sorting.FieldFacet("c", maptype=sorting.UnorderedList)
        r = s.search(q, groupedby=f)
        gs = r.groups("c")
        assert_equal(gs["apple"], [0, 2, 4, 6])
        assert_equal(gs["bear"], [1, 3, 5])

        f = sorting.FieldFacet("c", maptype=sorting.Count)
        r = s.search(q, groupedby=f)
        gs = r.groups("c")
        assert_equal(gs["apple"], 4)
        assert_equal(gs["bear"], 3)

        f = sorting.FieldFacet("c", maptype=sorting.Best)
        r = s.search(q, groupedby=f)
        gs = r.groups()
        assert_equal(gs["apple"], 6)
        assert_equal(gs["bear"], 5)

        r = s.search(q, groupedby="c", maptype=sorting.Count)
        gs = r.groups()
        assert_equal(gs["apple"], 4)
        assert_equal(gs["bear"], 3)
def get_context(self):
    if self.context:
        return self.context

    self.get_index()

    from whoosh.qparser.dateparse import DateParserPlugin

    datetimes = sorting.FieldFacet("datetime", reverse=True)
    parser = qparser.QueryParser("text", self.index.schema)
    query = parser.parse("*")
    searcher = self.index.searcher()
    restrict_replies = whoosh.query.Term("reply", True)
    results = searcher.search(query, mask=restrict_replies,
                              sortedby=datetimes, limit=MAX_TWEETS_NUMBER)

    click.secho("Adding to context...", fg="green")
    context = set()
    for result in results:
        text = result["text"]
        click.secho(text)
        import re
        # don't want these to influence the context
        # drop usernames
        text = re.sub(r"@[^ ]+", " ", text)
        # drop links
        text = re.sub(r"http(s?)://[^ ]+", " ", text)
        # drop cut-off text from the end of manual rts
        text = re.sub(r"[^ ]+…$", " ", text)
        context.update(text.split(" "))

    # split and discard empty strings
    context = " ".join(filter(None, context))
    click.secho("Processed context:", fg="green")
    click.secho(context)
    self.context = context
    return self.context