def div_translation_count(lang):
    """Return translation counts keyed by division / subdivision uid.

    Runs a count-only search on the ``lang`` index (doc type ``text``) with
    one terms aggregation on ``division`` and one on ``subdivision``.

    Returns ``None`` if the search raises ``NotFoundError``; otherwise a dict
    mapping every bucket key to its ``doc_count``.
    """
    aggregations = {
        agg_name: {"terms": {"field": field_name, "size": 0}}  # size 0: Unlimited
        for agg_name, field_name in (
            ("div_uids", "division"),
            ("subdiv_uids", "subdivision"),
        )
    }

    try:
        response = es.search(
            index=lang,
            doc_type="text",
            search_type="count",
            body={"aggregations": aggregations},
        )
    except elasticsearch.exceptions.NotFoundError:
        return None

    aggregated = response["aggregations"]
    counts = {}
    # Subdivisions are written first and divisions second, so when a key is
    # shared between the two the division value clobbers the subdivision one.
    for agg_name in ("subdiv_uids", "div_uids"):
        for bucket in aggregated[agg_name]["buckets"]:
            counts[bucket["key"]] = bucket["doc_count"]
    return counts
def sutta_search(**kwargs):
    """Search the ``suttas`` index by any combination of structured criteria.

    Recognised keyword arguments:
        mode: query type used for every criterion except ``lang``
            (defaults to ``"wildcard"`` when missing or falsy).
        name, volpage, acronym, division, lang: search values; falsy values
            are skipped.  ``acronym`` is lower-cased and stripped of spaces
            and is matched against the ``uid`` field.
        limit, offset: paging, converted with ``int`` (defaults 25 and 0).

    Returns ``None`` when no criterion was supplied, otherwise the result of
    ``es.search``.
    """
    search_mode = kwargs.get("mode") or "wildcard"

    if "acronym" in kwargs:
        kwargs["acronym"] = kwargs["acronym"].lower().replace(' ', '')

    # (kwarg name, query type, index fields, wrap the clauses in bool/should?)
    criteria = (
        ("name", search_mode, ("name",), False),
        ("volpage", search_mode, ("volpage", "volpage_extra"), True),
        ("acronym", search_mode, ("uid",), False),
        ("division", search_mode, ("division", "subdivision"), True),
        ("lang", "term", ("lang",), False),
    )

    must_clauses = []
    for kwarg_name, query_type, targets, any_of in criteria:
        raw_value = kwargs.get(kwarg_name)
        if not raw_value:
            continue
        text = raw_value.lower()
        clauses = [{query_type: {target: {"value": text}}} for target in targets]
        if any_of:
            # A hit in any one of the listed fields satisfies the criterion.
            must_clauses.append({"bool": {"should": clauses}})
        else:
            must_clauses.append(clauses[0])

    if not must_clauses:
        return None

    page_size = int(kwargs.get("limit", 25))
    first_hit = int(kwargs.get("offset", 0))
    body = {
        "size": page_size,
        "from": first_hit,
        "query": {"bool": {"must": must_clauses}},
        "sort": [
            {"_score": {"order": "desc"}},
            {"ordering": {"order": "asc"}},
        ],
    }
    return es.search(index="suttas", body=body)
def search(uid):
    """Look up Discourse topics related to ``uid``.

    The search term is the lower-cased part of ``uid`` before the first
    ``-``.  Topics are matched through the ``plain`` text of their child
    posts, their ``tags`` and their ``title``; a gaussian decay function on
    ``updated_at`` (origin ``now``) is summed into the score.

    Returns:
        ``None`` when ``discourse_is_available`` is falsy.  Otherwise a dict
        with two keys:

        * ``'topics'``: one entry per hit holding ``topic_id``,
          ``post_number`` (``None`` when no post could be found), ``title``,
          ``category_id`` and ``snippet``.
        * ``'categories'``: category records keyed by id, covering every
          category referenced by a topic and, where set, its parent.
    """
    if not discourse_is_available:
        return None

    uid = uid.lower().split('-')[0]
    body = {
        "query": {
            "function_score": {
                "query": {
                    "bool": {
                        "should": [
                            {
                                "has_child": {
                                    "type": "post",
                                    "score_mode": "sum",
                                    "query": {"match": {"plain": uid}},
                                    "inner_hits": {
                                        "size": 1,
                                        # NOTE(review): a source filter
                                        # ("_source": ["post_number", "id"])
                                        # was disabled here - confirm whether
                                        # the full post source is needed.
                                        "highlight": {"fields": {"plain": {}}},
                                    },
                                }
                            },
                            {"match": {"tags": uid}},
                            {"match": {"title": uid}},
                        ]
                    }
                },
                "boost_mode": "sum",
                "functions": [{
                    "gauss": {
                        "updated_at": {
                            "origin": "now",
                            "scale": "21d",
                            "offset": "7d",
                            "decay": 0.5,
                        }
                    },
                    "weight": 2,
                }],
            }
        }
    }

    result = es.search(discourse_index, doc_type='topic', body=body)
    out = {'topics': [], 'categories': {}}

    for hit in result['hits']['hits']:
        inner_hits = hit['inner_hits']['post']['hits']['hits']
        if inner_hits:
            # A post matched the term: use its highlighted fragments.
            inner_hit = inner_hits[0]['_source']
            snippet = ' … '.join(inner_hits[0]['highlight']['plain'])
        else:
            # The topic matched on tags/title only: fall back to the post
            # with the lowest post_number in that topic.
            first_post_query = {
                "size": 1,
                "query": {"term": {"topic_id": hit['_source']['id']}},
                "sort": [{"post_number": {"order": "asc"}}],
            }
            r = es.search(discourse_index, doc_type='post', body=first_post_query)
            inner_hits = r['hits']['hits']
            if inner_hits:
                inner_hit = inner_hits[0]['_source']
                snippet = make_snippet(inner_hit['plain'])
            else:
                inner_hit = None
                snippet = ''

        source = hit['_source']
        out['topics'].append({
            'topic_id': source['id'],
            'post_number': inner_hit['post_number'] if inner_hit else None,
            'title': source['title'],
            'category_id': source['category_id'],
            'snippet': snippet,
        })

        # Fetch each category (and its parent) only once per call.
        if source['category_id'] not in out['categories']:
            cat = get_category(source['category_id'])
            out['categories'][cat['id']] = cat
            parent_id = cat['parent_category_id']
            if parent_id and parent_id not in out['categories']:
                parent_cat = get_category(parent_id)
                out['categories'][parent_cat['id']] = parent_cat

    return out
def search(uid):
    """Look up Discourse topics related to ``uid``.

    The search term is ``uid`` lower-cased.  Topics are matched through the
    ``plain`` text of their child posts, their ``tags`` and their ``title``;
    a gaussian decay function on ``updated_at`` (origin ``now``) is summed
    into the score.

    Returns:
        ``None`` when ``discourse_is_available`` is falsy.  Otherwise a dict
        with two keys:

        * ``'topics'``: one entry per hit holding ``topic_id``,
          ``post_number`` (``None`` when no post could be found), ``title``,
          ``category_id`` and ``snippet``.
        * ``'categories'``: category records keyed by id, covering every
          category referenced by a topic and, where set, its parent.
    """
    if not discourse_is_available:
        return None

    uid = uid.lower()
    body = {
        "query": {
            "function_score": {
                "query": {
                    "bool": {
                        "should": [
                            {
                                "has_child": {
                                    "type": "post",
                                    "score_mode": "sum",
                                    "query": {"match": {"plain": uid}},
                                    "inner_hits": {
                                        "size": 1,
                                        # Only these two fields of the
                                        # matching post are requested.
                                        "_source": ["post_number", "id"],
                                        "highlight": {"fields": {"plain": {}}},
                                    },
                                }
                            },
                            {"match": {"tags": uid}},
                            {"match": {"title": uid}},
                        ]
                    }
                },
                "boost_mode": "sum",
                "functions": [
                    {
                        "gauss": {
                            "updated_at": {
                                "origin": "now",
                                "scale": "21d",
                                "offset": "7d",
                                "decay": 0.5,
                            }
                        },
                        "weight": 2,
                    }
                ],
            }
        }
    }

    result = es.search(discourse_index, doc_type='topic', body=body)
    out = {'topics': [], 'categories': {}}

    for hit in result['hits']['hits']:
        inner_hits = hit['inner_hits']['post']['hits']['hits']
        if inner_hits:
            # A post matched the term: use its highlighted fragments.
            inner_hit = inner_hits[0]['_source']
            snippet = ' … '.join(inner_hits[0]['highlight']['plain'])
        else:
            # The topic matched on tags/title only: fall back to the post
            # with the lowest post_number in that topic.
            first_post_query = {
                "size": 1,
                "query": {
                    "filtered": {
                        "filter": {"term": {"topic_id": hit['_source']['id']}}
                    }
                },
                "sort": [{"post_number": {"order": "asc"}}],
            }
            r = es.search(discourse_index, doc_type='post', body=first_post_query)
            inner_hits = r['hits']['hits']
            if inner_hits:
                inner_hit = inner_hits[0]['_source']
                snippet = make_snippet(inner_hit['plain'])
            else:
                inner_hit = None
                snippet = ''

        source = hit['_source']
        out['topics'].append({
            'topic_id': source['id'],
            'post_number': inner_hit['post_number'] if inner_hit else None,
            'title': source['title'],
            'category_id': source['category_id'],
            'snippet': snippet,
        })

        # Fetch each category (and its parent) only once per call.
        if source['category_id'] not in out['categories']:
            cat = get_category(source['category_id'])
            out['categories'][cat['id']] = cat
            parent_id = cat['parent_category_id']
            if parent_id and parent_id not in out['categories']:
                parent_cat = get_category(parent_id)
                out['categories'][parent_cat['id']] = parent_cat

    return out
def search(query, highlight=True, offset=0, limit=10, lang=None, define=None, details=None, **kwargs):
    """Run a full-text search and return the raw ``es.search`` response.

    Args:
        query: the user's search string.  Surrounding whitespace is removed.
            If it contains one of ``: " ~ *`` or the upper-case text ``AND``,
            ``OR`` or ``NOT`` it is sent as a ``query_string`` query (with
            ``define:`` rewritten to ``term:``); otherwise as a
            ``multi_match`` query.
        highlight: when truthy, request highlighted ``content`` fragments.
        offset: index of the first hit to return (``from``).
        limit: maximum number of hits to return (``size``).
        lang: when truthy, an index name added to the searched indexes.
        define: when not ``None``, the ``en-dict`` index is searched.
        details: when not ``None``, the ``suttas`` index is searched.
        **kwargs: accepted and ignored.

    When none of ``lang``, ``define`` and ``details`` selects an index, the
    indexes ``en``, ``pi``, ``suttas`` and ``en-dict`` are used.  The list is
    passed through ``get_available_indexes`` before searching (presumably to
    drop indexes that do not exist - confirm against that helper).
    """
    # str.strip() returns a new string; the result must be assigned back or
    # the call has no effect.
    query = query.strip()

    indexes = []
    if details is not None:
        indexes = ["suttas"]
    if define is not None:
        indexes.append("en-dict")
    if lang:
        indexes.append(lang)
    if not indexes:
        indexes = ["en", "pi", "suttas", "en-dict"]

    index_string = ",".join(get_available_indexes(indexes))

    fields = [
        "content",
        "content.*^0.5",
        "term^1.5",
        "term.*^0.5",
        "gloss^1.5",
        "lang^0.5",
        "author^0.5",
        "uid",
        "uid.division^0.7",
        "name^1.25",
        "name.*^0.75",
        "heading.title^0.5",
        "heading.title.plain^0.5",
        "heading.title.shingle^0.5",
    ]

    # NOTE(review): the operator test has no word boundaries, so upper-case
    # words merely containing AND/OR/NOT (e.g. "NOTE") also select the
    # query_string branch - confirm whether that is intended.
    if regex.search(r'[:"~*]', query) or regex.search(r"AND|OR|NOT", query):
        query = query.replace("define:", "term:")
        inner_query = {"query_string": {"fields": fields, "query": query, "use_dis_max": True}}
    else:
        inner_query = {
            "multi_match": {
                "type": "best_fields",
                "tie_breaker": 0.3,
                "fields": fields,
                "query": query,
            }
        }

    body = {
        "from": offset,
        "size": limit,
        "_source": ["uid", "lang", "name", "volpage", "gloss", "term", "heading", "is_root"],
        "timeout": "15s",
        "query": {
            "function_score": {
                "query": inner_query,
                # The individual function scores are multiplied together.
                "functions": [
                    {"boost_factor": "1.2", "filter": {"term": {"lang": "en"}}},
                    {"field_value_factor": {"field": "boost", "factor": 1}},
                    {"boost_factor": "0.25", "filter": {"type": {"value": "definition"}}},
                    # Boost documents whose uid equals the query with its
                    # spaces removed and lower-cased.
                    {"boost_factor": "2", "filter": {"term": {"uid": query.replace(" ", "").lower()}}},
                    {"boost_factor": "1.2", "filter": {"term": {"is_root": True}}},
                ],
                "score_mode": "multiply",
            }
        },
    }

    import json

    # Dumps the request body (before highlighting is added) to stdout.
    print(json.dumps(body, indent=2))

    if highlight:
        body["highlight"] = {
            "pre_tags": ['<strong class="highlight">'],
            "post_tags": ["</strong>"],
            "order": "score",
            "require_field_match": False,
            "fields": {
                "content": {
                    "matched_fields": ["content", "content.folded", "content.stemmed"],
                    "type": "fvh",
                    "fragment_size": 100,
                    "number_of_fragments": 3,
                    "no_match_size": 250,
                }
            },
        }

    return es.search(index=index_string, body=body)
def search(query, highlight=True, offset=0, limit=10, lang=None, define=None, details=None, **kwargs):
    """Run a full-text search and return the raw ``es.search`` response.

    Args:
        query: the user's search string.  Surrounding whitespace is removed.
            If it contains one of ``: " ~ *`` or the upper-case text ``AND``,
            ``OR`` or ``NOT`` it is sent as a ``query_string`` query (with
            ``define:`` rewritten to ``term:``); otherwise as a
            ``multi_match`` query.
        highlight: when truthy, request highlighted ``content`` fragments.
        offset: index of the first hit to return (``from``).
        limit: maximum number of hits to return (``size``).
        lang: when truthy, an index name added to the searched indexes.
        define: when not ``None``, the ``en-dict`` index is searched.
        details: when not ``None``, the ``suttas`` index is searched.
        **kwargs: accepted and ignored.

    When none of ``lang``, ``define`` and ``details`` selects an index, the
    indexes ``en``, ``pi``, ``suttas`` and ``en-dict`` are used.  The list is
    passed through ``get_available_indexes`` before searching (presumably to
    drop indexes that do not exist - confirm against that helper).
    """
    # str.strip() returns a new string; the result must be assigned back or
    # the call has no effect.
    query = query.strip()

    indexes = []
    if details is not None:
        indexes = ['suttas']
    if define is not None:
        indexes.append('en-dict')
    if lang:
        indexes.append(lang)
    if not indexes:
        indexes = ['en', 'pi', 'suttas', 'en-dict']

    index_string = ','.join(get_available_indexes(indexes))

    fields = [
        "content",
        "content.*^0.5",
        "term^1.5",
        "term.*^0.5",
        "gloss^1.5",
        "lang^0.5",
        "author^0.5",
        "uid",
        "uid.division^0.7",
        "name^1.25",
        "name.*^0.75",
        "heading.title^0.5",
        "heading.title.plain^0.5",
        "heading.title.shingle^0.5",
    ]

    # NOTE(review): the operator test has no word boundaries, so upper-case
    # words merely containing AND/OR/NOT (e.g. "NOTE") also select the
    # query_string branch - confirm whether that is intended.
    if regex.search(r'[:"~*]', query) or regex.search(r'AND|OR|NOT', query):
        query = query.replace('define:', 'term:')
        inner_query = {
            "query_string": {
                "fields": fields,
                "query": query,
                "use_dis_max": True,
            }
        }
    else:
        inner_query = {
            "multi_match": {
                "type": "best_fields",
                "tie_breaker": 0.3,
                "fields": fields,
                "query": query,
            }
        }

    body = {
        "from": offset,
        "size": limit,
        "_source": [
            "uid", "lang", "name", "volpage", "gloss", "term", "heading", "is_root"
        ],
        "timeout": "15s",
        "query": {
            "function_score": {
                "query": inner_query,
                # The individual function scores are multiplied together.
                "functions": [
                    {"weight": "1.2", "filter": {"term": {"lang": "en"}}},
                    {
                        "field_value_factor": {
                            "field": "boost",
                            "factor": 1.0,
                            # Documents without a "boost" field count as 1.0.
                            "missing": 1.0,
                        }
                    },
                    {"weight": "0.25", "filter": {"type": {"value": "definition"}}},
                    # Boost documents whose uid equals the query with its
                    # spaces removed and lower-cased.
                    {"weight": "2", "filter": {"term": {"uid": query.replace(' ', '').lower()}}},
                    {"weight": "1.2", "filter": {"term": {"is_root": True}}},
                ],
                "score_mode": "multiply",
            }
        },
    }

    import json

    # Dumps the target indexes and the request body (before highlighting is
    # added) to stdout.
    print('searching index: {}'.format(index_string))
    print(json.dumps(body, indent=2))

    if highlight:
        body["highlight"] = {
            "pre_tags": ["<strong class=\"highlight\">"],
            "post_tags": ["</strong>"],
            "order": "score",
            "require_field_match": False,
            "fields": {
                "content": {
                    "matched_fields": ["content", "content.folded", "content.stemmed"],
                    "type": "fvh",
                    "fragment_size": 100,
                    "number_of_fragments": 3,
                    "no_match_size": 250,
                }
            },
        }

    return es.search(index=index_string, body=body)
def sutta_search(**kwargs):
    """Query the ``suttas`` index using whichever criteria were supplied.

    Keyword arguments read:
        mode: query type for all criteria except ``lang``; ``"wildcard"``
            when absent or falsy.
        name, volpage, acronym, division, lang: values to search for; falsy
            ones are ignored.  ``acronym`` has its spaces removed, is
            lower-cased, and is matched against ``uid``.
        limit, offset: paging values passed through ``int`` (defaults 25, 0).

    Returns ``None`` if no criterion is present, else the ``es.search``
    result.
    """
    def single(kind, target, text):
        # One clause of the form {kind: {field: {"value": text}}}.
        return {kind: {target: {"value": text}}}

    def any_of(kind, targets, text):
        # Matches when at least one of the target fields matches.
        return {"bool": {"should": [single(kind, t, text) for t in targets]}}

    mode = kwargs.get("mode") or "wildcard"

    if "acronym" in kwargs:
        kwargs["acronym"] = kwargs["acronym"].lower().replace(' ', '')

    # Evaluated in this order; each builder receives the lower-cased value.
    builders = (
        ("name", lambda text: single(mode, "name", text)),
        ("volpage", lambda text: any_of(mode, ["volpage", "volpage_extra"], text)),
        ("acronym", lambda text: single(mode, "uid", text)),
        ("division", lambda text: any_of(mode, ["division", "subdivision"], text)),
        ("lang", lambda text: single("term", "lang", text)),
    )
    required = [
        build(kwargs[key].lower())
        for key, build in builders
        if kwargs.get(key)
    ]

    if not required:
        return None

    body = {
        "size": int(kwargs.get("limit", 25)),
        "from": int(kwargs.get("offset", 0)),
        "query": {"bool": {"must": required}},
        "sort": [
            {"_score": {"order": "desc"}},
            {"ordering": {"order": "asc"}},
        ],
    }
    return es.search(index="suttas", body=body)