def test_random_score():
    """Exercise ``random_score`` combined with ``field_value_factor``.

    ``random_score`` yields a number between 0 and 1 and produces a
    reproducible ordering when given the same ``seed``.  It carries no
    filter, so it applies to every document.  Note that indexing new
    matching documents changes the result order regardless of the
    consistent-random seed.
    """
    q = query.Q(
        'function_score',
        functions=[
            query.SF('random_score', seed=10),
            query.SF('field_value_factor',
                     field='likes',
                     modifier="log1p",
                     factor=0.1),
        ],
        score_mode="sum",
        # Cap the functions' contribution: no matter what
        # field_value_factor yields, it never exceeds 1.5.  max_boost
        # bounds the function result only, not the final _score.
        max_boost=1.5,
    )
    s = House.search().query(q)
    print(s.to_dict())
    response = s.execute()
    for h in response:
        print(h.city, h.location)
def apply_search_query(self, search_query, qs):
    """Attach the full-text scoring query to ``qs`` and return it."""
    analyzer = get_locale_analyzer(translation.get_language())

    # The query is a set of should clauses; the higher-boosted ones are
    # called "primary" for convenience.
    primary = self.primary_should_rules(search_query, analyzer)
    secondary = self.secondary_should_rules(search_query, analyzer)

    # Scoring is altered by the "boost" field defined in the mapping
    # (used to boost public addons higher than the rest) and, behind a
    # waffle switch, by whether the add-on is a webextension.
    functions = [query.SF('field_value_factor', field='boost')]
    if waffle.switch_is_active('boost-webextensions-in-search'):
        is_webext = (
            Q('term', **{'current_version.files.is_webextension': True})
            | Q('term', **{
                'current_version.files.is_mozilla_signed_extension': True
            })
        )
        functions.append(
            query.SF({'weight': WEBEXTENSIONS_WEIGHT,
                      'filter': is_webext}))

    # Assemble everything together and return the search "queryset".
    return qs.query(
        'function_score',
        query=query.Bool(should=primary + secondary),
        functions=functions)
def test_function_score_to_dict():
    """A function_score query serializes its query + functions verbatim."""
    fs = query.Q(
        'function_score',
        query=query.Q('match', title='python'),
        functions=[
            query.SF('random_score'),
            query.SF('field_value_factor',
                     field='comment_count',
                     filter=query.Q('term', tags='python')),
        ])
    expected = {
        'function_score': {
            'query': {'match': {'title': 'python'}},
            'functions': [
                {'random_score': {}},
                {
                    'filter': {'term': {'tags': 'python'}},
                    'field_value_factor': {'field': 'comment_count'},
                },
            ],
        }
    }
    assert expected == fs.to_dict()
def test_function_score_to_dict():
    """Round-trip a function_score query (with a filtered SF) to a dict."""
    filtered_sf = query.SF(
        'field_value_factor',
        field='comment_count',
        filter=query.Q('term', tags='python'),
    )
    q = query.Q(
        'function_score',
        query=query.Q('match', title='python'),
        functions=[query.SF('random_score'), filtered_sf],
    )
    assert q.to_dict() == {
        'function_score': {
            'query': {'match': {'title': 'python'}},
            'functions': [
                {'random_score': {}},
                {
                    'filter': {'term': {'tags': 'python'}},
                    'field_value_factor': {'field': 'comment_count'},
                },
            ],
        }
    }
def apply_search_query(self, search_query, qs, sort=None):
    """Score ``qs`` by text rules, popularity, review state and promotion.

    When sorting by relevance (or by default), a rescore pass with more
    expensive phrase rules is added on top.
    """
    lang = translation.get_language()

    # Should clauses for the query; the higher-boosted ones are "primary".
    primary = self.primary_should_rules(search_query, lang)
    secondary = self.secondary_should_rules(search_query, lang)

    # Boost by popularity, by being reviewed & public & non-experimental,
    # and by membership in promoted groups that carry a search bump.
    functions = [
        query.SF('field_value_factor',
                 field='average_daily_users',
                 modifier='log2p'),
        query.SF({
            'weight': 4.0,
            'filter': (
                Q('term', is_experimental=False)
                & Q('terms', status=amo.REVIEWED_STATUSES)
                & Q('exists', field='current_version')
                & Q('term', is_disabled=False)
            ),
        }),
    ]
    grouped_by_bump = amo.utils.sorted_groupby(
        PROMOTED_GROUPS, lambda g: g.search_ranking_bump, reverse=True)
    for bump, groups in grouped_by_bump:
        if not bump:
            continue
        functions.append(query.SF({
            'weight': bump,
            'filter': Q('terms', **{
                'promoted.group_id': [g.id for g in groups]}),
        }))

    # Assemble everything together
    qs = qs.query(
        'function_score',
        query=query.Bool(should=primary + secondary),
        functions=functions)

    if sort is None or sort == 'relevance':
        # Rescore the top 10 (window_size) results per shard with the
        # more expensive match_phrase + slop rules.
        qs = qs.extra(rescore={
            'window_size': 10,
            'query': {
                'rescore_query': query.Bool(
                    should=self.rescore_rules(search_query, lang)
                ).to_dict(),
            },
        })
    return qs
def apply_search_query(self, search_query, qs, sort=None):
    """Build the scored search query, optionally adding a rescore pass."""
    lang = translation.get_language()
    analyzer = get_locale_analyzer(lang)

    # Should clauses; the higher-boosted ones are called "primary".
    primary = self.primary_should_rules(search_query, analyzer)
    secondary = self.secondary_should_rules(search_query, analyzer)

    # Boost popular add-ons and those that are reviewed & public &
    # non-experimental.
    functions = [
        query.SF('field_value_factor',
                 field='average_daily_users',
                 modifier='log2p'),
        query.SF({
            'weight': 4.0,
            'filter': (
                Q('term', is_experimental=False)
                & Q('terms', status=amo.REVIEWED_STATUSES)
                & Q('exists', field='current_version')
                & Q('term', is_disabled=False)
            ),
        }),
    ]
    if switch_is_active('api-recommendations-priority'):
        functions.append(query.SF({
            'weight': 5.0,
            'filter': Q('term', is_recommended=True),
        }))

    # Assemble everything together
    qs = qs.query(
        'function_score',
        query=query.Bool(should=primary + secondary),
        functions=functions)

    if sort is None or sort == 'relevance':
        # When sorting by relevancy, rescore the top 10 (window_size)
        # results per shard with more expensive match_phrase + slop rules.
        qs = qs.extra(rescore={
            'window_size': 10,
            'query': {
                'rescore_query': query.Bool(
                    should=self.rescore_rules(search_query, analyzer)
                ).to_dict(),
            },
        })
    return qs
def filter_queryset(self, request, qs, view):
    """Apply the text search from ``?q=`` to ``qs``; no-op without one."""
    search_query = request.GET.get('q', '').lower()
    if not search_query:
        return qs

    analyzer = get_locale_analyzer(translation.get_language())

    # Should clauses; the higher-boosted ones are called "primary".
    primary = self.primary_should_rules(search_query, analyzer)
    secondary = self.secondary_should_rules(search_query, analyzer)

    # Scoring is altered by the mapping's "boost" field (used to boost
    # public addons higher than the rest).
    boost_fns = [query.SF('field_value_factor', field='boost')]

    # Assemble everything together and return the search "queryset".
    return qs.query(
        'function_score',
        query=query.Bool(should=primary + secondary),
        functions=boost_fns)
def test_function_score_exp():
    """Exponential decay scoring on a date field.

    origin: the centre point / ideal value; documents at the origin
        score a full 1.0.
    scale: the decay rate -- how fast ``_score`` drops as a document
        moves away from the origin (e.g. per 10 EUR or per 100 metres).
    decay: the score a document earns once it is ``scale`` away from the
        origin; defaults to 0.5.
    offset: a non-zero offset widens the single origin point into a
        range; every document within -offset <= origin <= +offset
        scores 1.0.
    """
    decay_fn = query.SF('exp', created_at={
        'origin': '2015-03-01',
        'scale': '10d',
        'offset': '0d',
        'decay': 0.5,
    })
    q = query.Q('function_score', functions=[decay_fn])
    print(q.to_dict())
    s = House.search().query(q)
    response = s.execute()
    for h in response:
        print(h.city, h.created_at)
def filter_queryset(self, qs):
    """Restrict to recommended add-ons, shuffled via ``random_score``."""
    qs = super().filter_queryset(qs)
    qs = qs.query(query.Bool(filter=[Q('term', is_recommended=True)]))
    shuffled = qs.query('function_score',
                        functions=[query.SF('random_score')])
    return shuffled.sort('_score')
def test_function_score_with_no_function_is_boost_factor():
    """An SF given only weight + filter serializes without a function name."""
    fs = query.Q(
        'function_score',
        functions=[query.SF({'weight': 20,
                             'filter': query.Q('term', f=42)})])
    expected = {
        'function_score': {
            'functions': [
                {'filter': {'term': {'f': 42}}, 'weight': 20},
            ],
        }
    }
    assert expected == fs.to_dict()
def filter_queryset(self, request, qs, view):
    """Validate the ``?sort=`` parameter and apply it to the queryset."""
    search_query_param = request.GET.get('q')
    sort_param = request.GET.get('sort')
    order_by = None

    if sort_param is not None:
        split_sort_params = sort_param.split(',')

        # Random sort is special: it can't be combined with other sorts.
        if 'random' in split_sort_params and len(split_sort_params) > 1:
            raise serializers.ValidationError(
                'The "random" "sort" parameter can not be combined.')

        # For perf reasons random is only allowed when the 'featured' or
        # 'recommended' param is present (to limit the number of
        # documents we'll have to apply the random score to) and a search
        # query is absent (to prevent clashing with the score functions
        # coming from a search query).
        if sort_param == 'random':
            featured_or_recommended = (
                AddonFeaturedQueryParam.query_param in request.GET or
                AddonRecommendedQueryParam.query_param in request.GET)
            if featured_or_recommended and not search_query_param:
                qs = qs.query('function_score',
                              functions=[query.SF('random_score')])
            else:
                raise serializers.ValidationError(
                    'The "sort" parameter "random" can only be specified '
                    'when the "featured" or "recommended" parameter is '
                    'also present, and the "q" parameter absent.')

        # Having just recommended sort doesn't make any sense, so ignore it
        if sort_param == 'recommended':
            sort_param = None

    if sort_param is None:
        # The default sort depends on the presence of a query: relevance
        # when searching, otherwise recommended,downloads (behind the
        # waffle switch) or plain downloads.
        recommended_waffle_on = switch_is_active(
            'api-recommendations-priority')
        if search_query_param:
            split_sort_params = ['relevance']
        elif recommended_waffle_on:
            split_sort_params = ['recommended', 'downloads']
        else:
            split_sort_params = ['downloads']

    try:
        order_by = [self.SORTING_PARAMS[name]
                    for name in split_sort_params]
    except KeyError:
        raise serializers.ValidationError('Invalid "sort" parameter.')

    return qs.sort(*order_by)
def test_function_score_with_functions():
    """script_score functions serialize under 'functions'."""
    script = "doc['comment_count'] * _score"
    fs = query.Q('function_score',
                 functions=[query.SF('script_score', script=script)])
    assert fs.to_dict() == {
        'function_score': {
            'functions': [{'script_score': {'script': script}}],
        }
    }
def get_subscribers(targetings, hours_whitelist, volume):
    """Fetch up to ``volume`` randomly-ordered, unsubscribed users from ES.

    targetings -- list of {"field", "operator", "values"} conditions;
        NOTE: this list is mutated in place (unsub / timezone conditions
        are appended).
    hours_whitelist -- local hours during which a timezone is eligible.
    volume -- maximum number of subscribers to return.

    Returns a list of subscriber dicts (each with its ES ``_id`` added),
    or implicitly None when the Elasticsearch query fails.
    """
    logger.debug("get_subscribers: getting subscribers")
    start_time = time.time()
    # Only keep timezones whose current local hour is whitelisted.
    timezones = [tz for tz in pytz.all_timezones
                 if datetime.now(pytz.timezone(tz)).hour in hours_whitelist]
    # Always exclude unsubscribed users.
    targetings.append({
        "field": "unsub",
        "operator": "NOT IN",
        "values": [1, "true"]
    })
    if timezones:
        targetings.append({
            "field": "timezone",
            "operator": "IN",
            "values": timezones
        })
    es_search = Search(using=es, index="users")
    # Map our operators onto bool-query occurrence types.
    operator_mappings = {
        'IN': 'must',
        'NOT IN': 'must_not',
    }
    # Fold every condition into one bool query (Q() is a match_all seed).
    es_query = Q()
    for condition in targetings:
        condition_pair = {condition["field"]: condition["values"]}
        terms_q = Q('terms', **condition_pair)
        bool_operator = operator_mappings[condition['operator']]
        bool_q = Q('bool', **{bool_operator: terms_q})
        es_query += bool_q
    es_search = es_search.query(es_query)
    # Replace the scoring with a pure random score so the returned slice
    # is a random sample rather than the best-matching users.
    es_search.query = dslq.FunctionScore(
        query=es_search.query,
        functions=[dslq.SF('random_score')],
        boost_mode="replace"
    )
    es_search = es_search[:volume]
    try:
        res = es_search.execute()
    except ElasticsearchException as e:
        # NOTE(review): on failure we only log; callers receive None.
        logger.error(f"get_subscribers: Exception {e}")
    else:
        subscribers = []
        for row in res.hits:
            subscriber = row.to_dict()
            subscriber['_id'] = row.meta.id
            subscribers.append(subscriber)
        end_time = time.time()
        logger.debug(f"get_subscribers: finished in "
                     f"{int((end_time - start_time) * 1000)}ms")
        return subscribers
def test_function_score_gauss():
    """Two gauss decay functions (price + geo point) multiplied together."""
    fs = query.Q(
        'function_score',
        query=query.Q('match', city='Sarasota'),
        functions=[
            query.SF('gauss', price={'origin': '0', 'scale': '20'}),
            query.SF('gauss', location={
                'origin': '26.494627, -81.961609',
                'scale': '2km',
                'offset': '0km',
                'decay': 0.33,
            }),
        ],
        score_mode="multiply")
    s = House.search().query(fs)
    print(s.to_dict())
    response = s.execute()
    for h in response:
        print(h.city, h.location)
def search(self, page_id='next_prediction'):
    """Fetch one page document from ES, with its image and detections.

    page_id -- 'next_prediction' selects a random page via random_score;
        any other value is treated as an ES document ``_id``.

    Returns a single-element list with the page metadata, the detected
    objects, and the page image base64-encoded.
    """
    # (Re-)establish the global elasticsearch-dsl connection, signed with
    # AWS auth when configured.
    if self.awsauth is not None:
        connections.create_connection(
            hosts=self.hosts,
            http_auth=self.awsauth,
            use_ssl=True,
            verify_certs=True,
            connection_class=RequestsHttpConnection)
    else:
        connections.create_connection(hosts=self.hosts)
    # q = Q('match', dataset_id='documents')
    if page_id == "next_prediction":
        # Random scoring picks an arbitrary page each call.
        q = Q('function_score', functions=[query.SF("random_score")])
    else:
        q = Q('match', _id=page_id)
    # Only the top hit is needed, hence the [0] slice.
    s = Search(index='page').query(q)[0]
    logger.info("About to execute")
    resp = s.execute().to_dict()
    hit = resp['hits']['hits'][0]['_source']
    # bbox / postprocess_cls / postprocess_score are parallel lists --
    # zip them into per-object dicts.
    objects = zip(hit['bbox'], hit['postprocess_cls'],
                  hit['postprocess_score'])
    pp_detected_objs = []
    for i in objects:
        pp_detected_objs.append({
            "bounding_box": i[0],
            "class": i[1],
            "confidence": i[2],
            "annotated_class": None,
            "obj_id": -1
        })
    image_dir = '/cosmos_tmp/images'
    with open(os.path.join(image_dir,
                           os.path.basename(hit['img_pth'])),
              'rb') as imf:
        imbytes = base64.b64encode(imf.read()).decode('ascii')
    # NOTE(review): pdf_dims is assumed to be [x0, y0, width, height] --
    # confirm against the indexer.
    t = {
        "_id": resp['hits']['hits'][0]['_id'],
        "page_height": hit['pdf_dims'][3],
        "page_width": hit['pdf_dims'][2],
        "pdf_id": -1,
        "pdf_name": hit['pdf_name'],
        "page_num": hit["page_num"],
        "pp_detected_objs": pp_detected_objs,
        "resize_bytes": imbytes
    }
    return [t]
def test_function_score_with_functions():
    """A script_score SF ends up under the 'functions' key."""
    built = query.Q(
        'function_score',
        functions=[
            query.SF('script_score',
                     script="doc['comment_count'] * _score"),
        ])
    expected = {
        'function_score': {
            'functions': [
                {'script_score': {
                    'script': "doc['comment_count'] * _score"}},
            ],
        }
    }
    assert expected == built.to_dict()
def filter_queryset(self, request, qs, view):
    """Apply ``?sort=`` (validating the special "random" sort) to ``qs``."""
    search_query_param = request.GET.get('q')
    sort_param = request.GET.get('sort')
    order_by = None

    if sort_param is not None:
        split_sort_params = sort_param.split(',')
        try:
            order_by = [self.SORTING_PARAMS[name]
                        for name in split_sort_params]
        except KeyError:
            raise serializers.ValidationError('Invalid "sort" parameter.')

        # Random sort is a bit special.
        # First, it can't be combined with other sorts.
        if 'random' in split_sort_params and len(split_sort_params) > 1:
            raise serializers.ValidationError(
                'The "random" "sort" parameter can not be combined.')

        # Second, for perf reasons it's only available when the
        # 'featured' param is present (to limit the number of documents
        # we'll have to apply the random score to) and a search query is
        # absent (to prevent clashing with the score functions coming
        # from a search query).
        if sort_param == 'random':
            can_sort_randomly = (
                AddonFeaturedQueryParam.query_param in request.GET and
                not search_query_param)
            if not can_sort_randomly:
                raise serializers.ValidationError(
                    'The "sort" parameter "random" can only be specified '
                    'when the "featured" parameter is also present, and '
                    'the "q" parameter absent.')
            qs = qs.query('function_score',
                          functions=[query.SF('random_score')])

    # The default sort depends on the presence of a query: we sort by
    # relevance if we have a query, otherwise by downloads.
    if not order_by:
        default = 'relevance' if search_query_param else 'downloads'
        order_by = [self.SORTING_PARAMS[default]]

    return qs.sort(*order_by)
def test_field_value_factor():
    """field_value_factor on 'votes', summed with the text score, capped."""
    fs = query.Q(
        'function_score',
        query=query.Q("multi_match", query='python',
                      fields=['title', 'content']),
        functions=[
            query.SF('field_value_factor', field='votes',
                     modifier='log1p', factor=0.1),
        ],
        score_mode="sum",
        max_boost=1.5)
    s = Post.search().query(fs)
    print(s.to_dict())
    response = s.execute()
    for hit in response:
        print(hit.title)
def filter_queryset(self, request, queryset, view):
    """Full-text search from ``?q=`` with per-field boosts + boost scoring."""
    search_term = view.query_params.get('q')
    if search_term:
        should = [
            Q(query_type,
              **{field: {'query': search_term, 'boost': boost}})
            for query_type, field, boost in self.search_operations
        ]
        queryset = queryset.query(
            'function_score',
            query=query.Bool(should=should),
            functions=[query.SF('field_value_factor', field='boost')],
        )
    if request.user.is_superuser:
        # Superusers get per-hit scoring explanations attached.
        queryset = queryset.extra(explain=True)
    return queryset
def test_function_score_with_no_function_is_boost_factor():
    """weight + filter alone serialize as a plain boost-factor function."""
    built = query.Q(
        'function_score',
        functions=[
            query.SF({'weight': 20, 'filter': query.Q('term', f=42)}),
        ])
    assert built.to_dict() == {
        'function_score': {
            'functions': [
                {'filter': {'term': {'f': 42}}, 'weight': 20},
            ],
        }
    }
def filter_queryset(self, request, queryset, view):
    """Search filter: boosted should-clauses plus field_value_factor."""
    term = view.query_params.get("q")
    if term:
        clauses = []
        for kind, field_name, boost in self.search_operations:
            clauses.append(
                Q(kind, **{field_name: {"query": term, "boost": boost}}))
        queryset = queryset.query(
            "function_score",
            query=query.Bool(should=clauses),
            functions=[query.SF("field_value_factor", field="boost")],
        )
    if request.user.is_superuser:
        # Superusers get per-hit scoring explanations attached.
        queryset = queryset.extra(explain=True)
    return queryset
def filter_queryset(self, request, queryset, view):
    """Apply the search param; explain scoring behind a feature flag."""
    term = request.QUERY_PARAMS.get(self.search_param, None)
    if term:
        should = [
            Q(op, **{field: {'query': term, 'boost': boost}})
            for op, field, boost in self.search_operations
        ]
        queryset = queryset.query(
            'function_score',
            query=query.Bool(should=should),
            functions=[query.SF('field_value_factor', field='boost')],
        )
    if flag_is_active(request, 'search_explanation'):
        queryset = queryset.extra(explain=True)
    return queryset
def apply_search_query(self, search_query, qs, sort=None):
    """Score by should-rules + the boost field; rescore top relevance hits."""
    lang = translation.get_language()
    analyzer = get_locale_analyzer(lang)

    # Should clauses; the higher-boosted ones are called "primary".
    primary = self.primary_should_rules(search_query, analyzer)
    secondary = self.secondary_should_rules(search_query, analyzer)

    # The mapping's "boost" field promotes public add-ons above the rest.
    qs = qs.query(
        'function_score',
        query=query.Bool(should=primary + secondary),
        functions=[query.SF('field_value_factor', field='boost')])

    if sort is None or sort == 'relevance':
        # When sorting by relevancy, rescore the top 10 (window_size)
        # results per shard with more expensive match_phrase + slop rules.
        rescore_query = self.rescore_rules(search_query, analyzer)
        qs = qs.extra(rescore={
            'window_size': 10,
            'query': {
                'rescore_query':
                    query.Bool(should=rescore_query).to_dict(),
            },
        })
    return qs
def filter_queryset(self, request, qs, view):
    """Validate the requested sort parameters and apply them to ``qs``.

    Handles the special "random" sort (only with featured/promoted, no
    query), drops sort params that make no sense in context, and falls
    back to sensible defaults. Raises serializers.ValidationError on
    invalid or forbidden combinations.
    """
    search_query_param = request.GET.get('q')
    split_sort_params = self.get_sort_params(request)

    if split_sort_params:
        # Random sort is a bit special.
        # First, it can't be combined with other sorts.
        if 'random' in split_sort_params and len(split_sort_params) > 1:
            raise serializers.ValidationError(
                'The "random" "sort" parameter can not be combined.'
            )

        # Second, for perf reasons it's only available when the 'featured'
        # or 'promoted' param is present (to limit the number of
        # documents we'll have to apply the random score to) and a search
        # query is absent (to prevent clashing with the score functions
        # coming from a search query).
        if split_sort_params == ['random']:
            is_random_sort_available = (
                AddonFeaturedQueryParam.query_param in request.GET
                or AddonPromotedQueryParam.query_param in request.GET
            ) and not search_query_param
            if is_random_sort_available:
                # We want randomness to change only once every 24 hours, so
                # we use a seed that depends on the date.
                qs = qs.query(
                    'function_score',
                    functions=[
                        query.SF(
                            'random_score',
                            seed=date.today().toordinal(),
                        )
                    ],
                )
            else:
                raise serializers.ValidationError(
                    'The "sort" parameter "random" can only be specified '
                    'when the "featured" or "promoted" parameter is '
                    'also present, and the "q" parameter absent.'
                )

        # Sorting by relevance only makes sense with a query string
        if not search_query_param and 'relevance' in split_sort_params:
            # BUGFIX: was `if not 'relevance'`, a constant-falsy test
            # that emptied the list instead of removing 'relevance'.
            split_sort_params = [
                param for param in split_sort_params
                if param != 'relevance'
            ]

        # Having just recommended sort doesn't make any sense, so ignore it
        if split_sort_params == ['recommended']:
            split_sort_params = None
        # relevance already takes into account recommended so ignore it too
        elif (
            'recommended' in split_sort_params
            and 'relevance' in split_sort_params
        ):
            # BUGFIX: was `if not 'recommended'` (always falsy), which
            # dropped every sort param instead of just 'recommended'.
            split_sort_params = [
                param for param in split_sort_params
                if param != 'recommended'
            ]

    if not split_sort_params:
        # The default sort depends on the presence of a query: we sort by
        # relevance if we have a query, otherwise by recommended,downloads.
        split_sort_params = (
            ['relevance'] if search_query_param else ['recommended', 'users']
        )

    try:
        order_by = [self.SORTING_PARAMS[name] for name in split_sort_params]
    except KeyError:
        raise serializers.ValidationError('Invalid "sort" parameter.')

    return qs.sort(*order_by)
def get_app_filter(cls, request, additional_data=None, sq=None,
                   app_ids=None, no_filter=False):
    """
    THE grand, consolidated ES filter for Webapps. By default:
    - Excludes non-public apps.
    - Excludes disabled apps (whether by reviewer or by developer).
    - Excludes based on region exclusions.
    - TODO: Excludes based on device and platform support.

    additional_data -- an object with more data to allow more filtering.
    sq -- if you have an existing search object to filter off of.
    app_ids -- if you want to filter by a list of app IDs.
    no_filter -- doesn't apply the consumer-side excludes (public/region).

    Returns the filtered search object.
    """
    # Imported locally, presumably to avoid circular imports -- confirm.
    from mkt.api.base import get_region_from_request
    from mkt.search.views import name_query

    sq = sq or cls.search()
    additional_data = additional_data or {}
    app_ids = app_ids or []

    # Defaults for every supported filter key; additional_data may
    # override any of them.
    data = {
        'app_type': [],
        'author.raw': None,
        'category': None,  # Slug.
        'device': None,  # ID.
        'gaia': getattr(request, 'GAIA', False),
        'is_offline': None,
        'manifest_url': '',
        'mobile': getattr(request, 'MOBILE', False),
        'premium_type': [],
        'profile': get_feature_profile(request),
        'q': '',
        'region': getattr(get_region_from_request(request), 'id', None),
        'status': None,
        'supported_locales': [],
        'tablet': getattr(request, 'TABLET', False),
        'tags': '',
    }
    data.update(additional_data)

    # Fields that will be filtered with a term query.
    term_fields = ('author.raw', 'device', 'manifest_url', 'status',
                   'tags')
    # Fields that will be filtered with a terms query.
    terms_fields = ('category', 'premium_type', 'app_type',
                    'supported_locales')

    # QUERY.
    if data['q']:
        # Function score for popularity boosting (defaults to multiply).
        sq = sq.query(
            'function_score',
            query=name_query(data['q'].lower()),
            functions=[query.SF('field_value_factor', field='boost')])

    # MUST.
    must = [
        F('term', status=amo.STATUS_PUBLIC),
        F('term', is_disabled=False),
    ] if not no_filter else []

    for field in term_fields + terms_fields:
        # Term filters.
        if data[field]:
            filter_type = 'term' if field in term_fields else 'terms'
            must.append(F(filter_type, **{field: data[field]}))

    if not no_filter:
        if data['profile']:
            # Feature filters.
            profile = data['profile']
            for k, v in profile.to_kwargs(prefix='features.has_').items():
                must.append(F('term', **{k: v}))
        if data['mobile'] or data['gaia']:
            # Uses flash.
            must.append(F('term', uses_flash=False))
        if data['is_offline'] is not None:
            must.append(F('term', is_offline=data['is_offline']))

    # SHOULD.
    should = []
    if app_ids:
        should = [es_filter.Terms(id=list(set(app_ids)))]
        # Cap the result window to the number of distinct requested IDs.
        sq = sq[0:len(set(app_ids))]

    # FILTER.
    if must or should:
        sq = sq.filter(es_filter.Bool(must=must, should=should))

    if data['region'] and not no_filter:
        # Region exclusions.
        sq = sq.filter(~F('term', region_exclusions=data['region']))

    return sq
def filter_queryset(self, request, queryset, view):
    """Build the full-text should-clause search for ``?q=``.

    Applies per-field match/prefix/fuzzy rules, exact-match boosts on
    raw fields, locale-specific analyzers, and a function_score that
    boosts by the mapping's "boost" field and the preferred region.
    Returns ``queryset`` untouched when no query is given.
    """
    q = request.GET.get('q', '').lower()
    lang = translation.get_language()
    analyzer = self._get_locale_analyzer(lang)

    if not q:
        return queryset

    should = []
    # (query class, parameters) pairs applied to each base field below.
    rules = [
        (query.Match, {'query': q, 'boost': 3, 'analyzer': 'standard'}),
        (query.Match, {'query': q, 'boost': 4, 'type': 'phrase',
                       'slop': 1}),
        (query.Prefix, {'value': q, 'boost': 1.5}),
    ]

    # Only add fuzzy queries if q is a single word. It doesn't make sense
    # to do a fuzzy query for multi-word queries.
    if ' ' not in q:
        rules.append((query.Fuzzy, {'value': q, 'boost': 2,
                                    'prefix_length': 1}))

    # Apply rules to search on few base fields. Some might not be present
    # in every document type / indexes.
    for k, v in rules:
        for field in ('app_slug', 'author', 'name', 'short_name', 'slug',
                      'title', 'url_tokenized'):
            should.append(k(**{field: v}))

    # Exact matches need to be queried against a non-analyzed field. Let's
    # do a term query on `name.raw` for an exact match against the item
    # name and give it a good boost since this is likely what the user
    # wants.
    # FIXME: we should also do that on translations and slug/app_slug, but
    # we don't store a raw version for them at the moment.
    should.append(query.Term(**{'name.raw': {'value': q, 'boost': 10}}))
    # Do the same for GUID searches.
    should.append(query.Term(**{'guid': {'value': q, 'boost': 10}}))

    # If query is numeric, check if it is an ID.
    if q.isnumeric():
        should.append(query.Term(**{'id': {'value': q, 'boost': 10}}))

    # Locale-specific analyzed name fields, when an analyzer exists.
    if analyzer:
        should.append(
            query.Match(
                **{'name_l10n_%s' % analyzer: {
                    'query': q,
                    'boost': 2.5
                }}))
        should.append(
            query.Match(**{
                'short_name_l10n_%s' % analyzer: {
                    'query': q,
                    'boost': 2.5
                }
            }))

    # Add searches on the description field.
    should.append(
        query.Match(description={
            'query': q,
            'boost': 0.8,
            'type': 'phrase'
        }))

    if analyzer:
        desc_field = 'description_l10n_%s' % analyzer
        # Stemmed languages get their dedicated analyzer variant.
        desc_analyzer = ('%s_analyzer' % analyzer
                         if analyzer in mkt.STEMMER_MAP else analyzer)
        should.append(
            query.Match(
                **{
                    desc_field: {
                        'query': q,
                        'boost': 0.6,
                        'type': 'phrase',
                        'analyzer': desc_analyzer
                    }
                }))

    # Add searches on tag field.
    should.append(query.Term(tags={'value': q}))
    if ' ' not in q:
        should.append(query.Fuzzy(tags={'value': q, 'prefix_length': 1}))

    # The list of functions applied to our `function_score` query.
    functions = [
        query.SF('field_value_factor', field='boost'),
    ]

    # Add a boost for the preferred region, if it exists.
    region = get_region_from_request(request)
    if region:
        functions.append({
            'filter': {
                'term': {
                    'preferred_regions': region.id
                }
            },
            # TODO: When we upgrade to Elasticsearch 1.4, change this
            # to 'weight'.
            'boost_factor': 4,
        })

    return queryset.query('function_score',
                          query=query.Bool(should=should),
                          functions=functions)
def _find(params, total_only=False, make_suggestions=False,
          min_suggestion_score=0.8):
    """Run a document search against the configured search index.

    params -- dict with "query", "locales", "archive", "slug_prefixes",
        "sort", "size" and "page".
    total_only -- when True, return just ``response.hits.total`` (used
        internally to cheaply validate spelling suggestions).
    make_suggestions -- when True, attach term suggesters on title/body
        and evaluate the returned spelling suggestions.
    min_suggestion_score -- minimum suggester score a candidate needs
        before we check whether it would actually yield results.

    Returns a dict with "documents", "metadata" and "suggestions".
    """
    search_query = Search(index=settings.SEARCH_INDEX_NAME)

    if make_suggestions:
        # XXX research if it it's better to use phrase suggesters and if
        # that works
        # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-suggesters.html#phrase-suggester
        search_query = search_query.suggest(
            "title_suggestions", params["query"], term={"field": "title"})
        search_query = search_query.suggest(
            "body_suggestions", params["query"], term={"field": "body"})

    # Match on title and body; phrase matches (multi-word queries only)
    # get a much larger boost.
    sub_queries = [
        Q("match", title={"query": params["query"], "boost": 2.0}),
        Q("match", body={"query": params["query"], "boost": 1.0}),
    ]
    if " " in params["query"]:
        sub_queries.append(
            Q("match_phrase",
              title={"query": params["query"], "boost": 10.0}))
        sub_queries.append(
            Q("match_phrase",
              body={"query": params["query"], "boost": 5.0}))
    sub_query = query.Bool(should=sub_queries)

    if params["locales"]:
        search_query = search_query.filter("terms", locale=params["locales"])
    if params["archive"] == "exclude":
        search_query = search_query.filter("term", archived=False)
    elif params["archive"] == "only":
        search_query = search_query.filter("term", archived=True)

    if params["slug_prefixes"]:
        prefix_queries = [Q("prefix", slug=x)
                          for x in params["slug_prefixes"]]
        search_query = search_query.query(query.Bool(should=prefix_queries))

    search_query = search_query.highlight_options(
        pre_tags=["<mark>"],
        post_tags=["</mark>"],
        number_of_fragments=3,
        fragment_size=120,
        encoder="html",
    )
    search_query = search_query.highlight("title", "body")

    if params["sort"] == "relevance":
        search_query = search_query.sort("_score", "-popularity")
        search_query = search_query.query(sub_query)
    elif params["sort"] == "popularity":
        search_query = search_query.sort("-popularity", "_score")
        search_query = search_query.query(sub_query)
    else:
        # Default: blend text relevance with document popularity.
        popularity_factor = 10.0
        boost_mode = "sum"
        score_mode = "max"
        search_query = search_query.query(
            "function_score",
            query=sub_query,
            functions=[
                query.SF(
                    "field_value_factor",
                    field="popularity",
                    factor=popularity_factor,
                    missing=0.0,
                )
            ],
            boost_mode=boost_mode,
            score_mode=score_mode,
        )

    # The (large) body is only needed for highlighting, never returned.
    search_query = search_query.source(excludes=["body"])
    search_query = search_query[
        params["size"] * (params["page"] - 1):
        params["size"] * params["page"]]

    retry_options = {
        "retry_exceptions": (
            # This is the standard operational exception.
            exceptions.ConnectionError,
            # This can happen if the search happened right as the index had
            # just been deleted due to a fresh re-indexing happening in Yari.
            exceptions.NotFoundError,
            # This can happen when the index simply isn't ready yet.
            exceptions.TransportError,
        ),
        # The default in redo is 60 seconds. Let's tone that down.
        "sleeptime": settings.ES_RETRY_SLEEPTIME,
        "attempts": settings.ES_RETRY_ATTEMPTS,
        "jitter": settings.ES_RETRY_JITTER,
    }
    with retrying(search_query.execute, **retry_options) as retrying_function:
        response = retrying_function()

    if total_only:
        return response.hits.total

    metadata = {
        "took_ms": response.took,
        "total": {
            # The `response.hits.total` is a `elasticsearch_dsl.utils.AttrDict`
            # instance. Pluck only the exact data needed.
            "value": response.hits.total.value,
            "relation": response.hits.total.relation,
        },
        "size": params["size"],
        "page": params["page"],
    }

    documents = []
    for hit in response:
        try:
            body_highlight = list(hit.meta.highlight.body)
        except AttributeError:
            body_highlight = []
        try:
            title_highlight = list(hit.meta.highlight.title)
        except AttributeError:
            title_highlight = []
        documents.append({
            "mdn_url": hit.meta.id,
            "score": hit.meta.score,
            "title": hit.title,
            "locale": hit.locale,
            "slug": hit.slug,
            "popularity": hit.popularity,
            "archived": hit.archived,
            "summary": hit.summary,
            "highlight": {
                "body": body_highlight,
                "title": title_highlight,
            },
        })

    try:
        suggest = getattr(response, "suggest")
    except AttributeError:
        suggest = None

    suggestions = []
    if suggest:
        suggestion_strings = _unpack_suggestions(
            params["query"],
            response.suggest,
            ("body_suggestions", "title_suggestions"),
        )
        for score, string in suggestion_strings:
            # BUGFIX: the condition used to be
            # `score > min_suggestion_score or 1`, which is always true
            # and made min_suggestion_score a dead parameter.
            if score > min_suggestion_score:
                # Sure, this is different way to spell, but what will it yield
                # if you actually search it?
                total = _find(dict(params, query=string), total_only=True)
                if total["value"] > 0:
                    suggestions.append({
                        "text": string,
                        "total": {
                            # This 'total' is an `AttrDict` instance.
                            "value": total.value,
                            "relation": total.relation,
                        },
                    })
                    # Since they're sorted by score, it's usually never
                    # useful to suggestion more than exactly 1 good
                    # suggestion.
                    break

    return {
        "documents": documents,
        "metadata": metadata,
        "suggestions": suggestions,
    }