def analyze_match_query(index, field, query, results):
    """Print the Elasticsearch scoring explanation for each result of a match query.

    :param index: name of the index to run the explanation against
    :param field: document field targeted by the match query
    :param query: the query text
    :param results: iterable of result dicts, each carrying a 'uuid' document id
    """
    es_client = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    match_body = {"query": {"match": {field: {"query": query}}}}
    for hit in results:
        explanation = es_client.explain(index=index,
                                        body=match_body,
                                        doc_type='doc',
                                        id=hit['uuid'])
        print('explain response --> ', explanation)
class ElasticRetrieval(BaseRetrieval):
    """Interfaces with the Elasticsearch API."""

    def __init__(self, index_name, method, logger=None, use_default_similarity=True,
                 max_results=None, es_instance=None, save_terms=True):
        """
        :param index_name: name of the ES index to query
        :param method: retrieval method label (stored but never used?)
        :param logger: optional logger instance
        :param use_default_similarity: unused here; kept for interface compatibility
        :param max_results: cap on hits returned; defaults to MAX_RESULTS_RECALL
        :param es_instance: pre-built Elasticsearch client to reuse
        :param save_terms: whether formulas extracted from explanations keep terms
        """
        self.index_name = index_name
        if es_instance:
            self.es = es_instance
        else:
            # reuse the corpus-wide client when the corpus is ES-backed
            if cp.Corpus.__class__.__name__ == "ElasticCorpus":
                self.es = cp.Corpus.es
            else:
                self.es = Elasticsearch()

        self.max_results = max_results if max_results else MAX_RESULTS_RECALL
        self.method = method  # never used?
        self.logger = logger
        self.last_query = {}
        self.save_terms = save_terms
        self.default_field = "text"
        self.tie_breaker = 0
        self.multi_match_type = "best_fields"

    def rewriteQueryAsDSL(self, structured_query, parameters):
        """Creates a multi_match DSL query for elasticsearch.

        :param structured_query: a StructuredQuery dict, optionally under the
            key "structured_query"
        :param parameters: dict of [field]=weight to replace in the query
        :returns: DSL query dict, or None for an empty query
        """
        if "structured_query" in structured_query:
            structured_query = structured_query["structured_query"]

        if not isinstance(structured_query, StructuredQuery):
            structured_query = StructuredQuery(structured_query)

        if not structured_query or len(structured_query) == 0:
            return None

        self.last_query = structured_query

        lucene_query = ""
        for token in structured_query:
            # TODO proper computing of the boost formula. Different methods?
            boost = token.boost * token.count
            bool_val = token.bool or ""
            lucene_query += "%s%s" % (bool_val, token.token)
            # BUG FIX: the original multiplied a str by a str (TypeError) and
            # *assigned* the result to lucene_query, discarding every token
            # accumulated so far. Emulate boosting by repeating the token so it
            # appears int(boost) times in total.
            if boost > 1:
                lucene_query += (" " + token.token) * (int(boost) - 1)
            lucene_query += " "

        fields = []
        for param in parameters:
            fields.append(param + "^" + str(parameters[param]))

        dsl_query = {
            "multi_match": {
                "query": lucene_query,
                "type": self.multi_match_type,
                "fields": fields,
                "operator": "or",
            }
        }
        if self.tie_breaker:
            dsl_query["multi_match"]["tie_breaker"] = self.tie_breaker

        return dsl_query

    def rewriteQueryAsDSL_new(self, structured_query, parameters):
        """Creates a multi_match DSL query for elasticsearch.

        :param structured_query: a StructuredQuery dict, optionally under the
            key "structured_query"
        :param parameters: dict of [field]=weight to replace in the query
        :returns: DSL query dict, or None for an empty query
        """
        if "structured_query" in structured_query:
            structured_query = structured_query["structured_query"]

        if not isinstance(structured_query, StructuredQuery):
            structured_query = StructuredQuery(structured_query)

        if not structured_query or len(structured_query) == 0:
            return None

        self.last_query = structured_query

        lucene_query = ""
        for token in structured_query:
            # TODO proper computing of the boost formula. Different methods?
            boost = token.boost * token.count
            bool_val = token.bool or ""
            lucene_query += "%s%s" % (bool_val, token.token)
            # BUG FIX: same str-times-str / query-clobbering defect as in
            # rewriteQueryAsDSL; repeat the token int(boost) times in total.
            if boost > 1:
                lucene_query += (" " + token.token) * (int(boost) - 1)
            lucene_query += " "

        fields = []
        for param in parameters:
            fields.append(param + "^" + str(parameters[param]))

        dsl_query = {
            "multi_match": {
                "query": lucene_query,
                "type": self.multi_match_type,
                "fields": fields,
                "operator": "or",
            }
        }
        if self.tie_breaker:
            dsl_query["multi_match"]["tie_breaker"] = self.tie_breaker

        return dsl_query

    def runQuery(self, structured_query, max_results=None):
        """Interfaces with the elasticsearch query API.

        :param structured_query: dict with a "structured_query" key
        :param max_results: optional per-call override of self.max_results
        :returns: list of (score, metadata) tuples, best hits first
        """
        if not structured_query or len(structured_query) == 0:
            return []

        if not max_results:
            max_results = self.max_results

        self.last_query = dict(structured_query)
        # BUG FIX: rewriteQueryAsDSL indexes `parameters` by field name, so it
        # needs a {field: weight} dict -- the original passed a list, which
        # raised a TypeError on the first query.
        dsl_query = self.rewriteQueryAsDSL(structured_query["structured_query"],
                                           {self.default_field: 1})

        res = self.es.search(
            body={"query": dsl_query},
            size=max_results,
            index=self.index_name,
            doc_type=ES_TYPE_DOC,
            request_timeout=QUERY_TIMEOUT,
        )

        structured_query["dsl_query"] = dsl_query
        hits = res["hits"]["hits"]
        result = []
        for hit in hits:
            metadata = hit["_source"]["metadata"]
            result.append((hit["_score"], metadata))
        return result

    def formulaFromExplanation(self, query, doc_id):
        """Runs .explain() for one query/doc pair, generates and returns a \
        StoredFormula instance from it.

        :param query: StructuredQuery dict, with a "dsl_query" key
        :param doc_id: id of document to run .explain() for
        :returns: StoredFormula (empty if the explanation could not be fetched)
        """
        explanation = None
        retries = 0
        # best-effort: retry once on transient transport errors
        while retries < 2:
            try:
                explanation = self.es.explain(
                    index=self.index_name,
                    doc_type=ES_TYPE_DOC,
                    body={"query": query["dsl_query"]},
                    id=doc_id,
                    request_timeout=QUERY_TIMEOUT,
                )
                break
            except Exception:
                logging.exception("Exception, retrying...")
                retries += 1

        formula = StoredFormula()
        if explanation:
            formula.fromElasticExplanation(explanation, self.save_terms)
        return formula
def main():
    """Walk through Elasticsearch analyzers, custom mappings, and queries using
    a local ES instance and the Yelp reviews dataset.

    Analysis background:
    https://www.elastic.co/guide/en/elasticsearch/guide/current/custom-analyzers.html
    An analyzer is, in order: optional character filters, a tokenizer, token
    filters. The default analyzer for string fields (if unspecified!) is
    "standard". Prevent analysis by marking a field "not_analyzed" (the
    "keyword" analyzer is sometimes suggested but may do simple analysis).
    Analyzer resolution at index time: field mapping -> index "default" ->
    "standard"; search time additionally checks the query itself and
    search_analyzer/default_search first:
    https://www.elastic.co/guide/en/elasticsearch/guide/current/_controlling_analysis.html
    """
    # es = Elasticsearch(hosts=[{'host': 'elasticsearch.aws.blahblah.com', 'port': '9200'}])
    local_es = Elasticsearch()
    local_client = client.IndicesClient(local_es)

    # --- Inspect analysis with the "analyze" function ("_analyze" in curl style) ---
    if local_es.indices.exists('my_index'):
        local_es.indices.delete(index='my_index')
    local_es.indices.create(index='my_index')

    # this is the default analyzer ES will use if you don't specify one! Specify one!
    print(local_client.analyze(index='my_index', analyzer='standard',
                               text='My kitty-cat is adorable.'))

    def get_analyzer_tokens(result):
        '''Utility to combine tokens in an analyzer result.'''
        tokens = result[u'tokens']
        print(tokens)
        return ' '.join([token['token'] for token in tokens])

    get_analyzer_tokens(
        local_client.analyze(index='my_index', analyzer="standard",
                             text='My kitty-cat\'s a pain in the neck.'))

    # "keyword" prevents analysis, but then you must match EXACT field contents.
    # Keep an analyzed copy too if it's meant to be English-searchable text.
    get_analyzer_tokens(
        local_client.analyze(index='my_index', analyzer='keyword',
                             text='My kitty-cat\'s a pain in the neck.'))

    # Built-in "english" analyzer: standard tokenizer + English possessive,
    # lowercase, stop, keyword-marker, and Porter-stemmer token filters.
    # https://www.elastic.co/guide/en/elasticsearch/guide/current/language-intro.html
    # Customizing it: https://www.elastic.co/guide/en/elasticsearch/guide/current/configuring-language-analyzers.html
    get_analyzer_tokens(
        local_client.analyze(index='my_index', analyzer='english',
                             text='My kitty-cat\'s a pain in the neck.'))

    # --- Custom analyzers: define components in index settings... ---
    MY_SETTINGS = {
        "settings": {
            "analysis": {
                "char_filter": {
                    "&_to_and": {
                        "type": "mapping",
                        "mappings": ["&=> and "]
                    }
                },
                "filter": {
                    "my_stopwords": {
                        "type": "stop",
                        "stopwords": ["the", "a"]
                    }
                },
                "analyzer": {
                    "my_analyzer": {
                        "type": "custom",
                        "char_filter": ["html_strip", "&_to_and"],
                        "tokenizer": "standard",
                        "filter": ["lowercase", "my_stopwords"]
                    }
                }
            }
        }
    }

    # ...then USE the analyzer by referring to it in a field mapping.
    # If you don't assign it to a field in a mapping, you aren't using it.
    MAPPING = {
        "my_doc_type": {
            "properties": {
                "title": {
                    "type": "string",
                    "analyzer": "my_analyzer"
                }
            }
        }
    }

    # Stopwords note: the default list is referenced as "_english_", so you can
    # combine it with a custom stopword list if you want.
    if local_es.indices.exists('my_index'):
        local_es.indices.delete(index='my_index')
    local_es.indices.create(index='my_index', body=json.dumps(MY_SETTINGS))
    local_es.indices.put_mapping(index='my_index', doc_type="my_doc_type",
                                 body=json.dumps(MAPPING))

    # Check that your mapping looks right!
    print(local_client.get_mapping(index='my_index'))

    res = local_client.analyze(index='my_index', analyzer='my_analyzer',
                               text="<p>This is the title & a Capitalized Word!</p>")
    get_analyzer_tokens(res)

    # --- Tokenizers vs. Analyzers - Be Careful ---
    # There is a "standard" ANALYZER and a "standard" TOKENIZER; the difference
    # is subtle but there.
    # https://www.elastic.co/guide/en/elasticsearch/guide/current/standard-tokenizer.html#standard-tokenizer
    get_analyzer_tokens(
        local_client.analyze(index='my_index', analyzer='standard',
                             text='My kitty-cat\'s not a pain in the \'neck\'!'))
    get_analyzer_tokens(
        local_client.analyze(index='my_index', tokenizer="standard",
                             text='My kitty-cat\'s not a pain in the \'neck\'!'))
    # The english analyzer overrides the tokenizer choice: it lowercases and
    # also removes the negation, because "not" is in the stopwords list.
    get_analyzer_tokens(
        local_client.analyze(index='my_index', analyzer="english", tokenizer="standard",
                             text='My kitty-cat\'s not a pain in the \'neck\'!'))

    # --- Indexing Yelp Data ---
    df = pd.read_msgpack("./data/yelp_df_forES.msg")
    print(df.head())

    # test with a small sample if you want
    dfshort = df.query('stars >= 5 and net_sentiment > 35')
    print(len(dfshort))
    print(dfshort.head())

    # filter out any rows with a nan for sent_per_token, which breaks bulk load:
    df = df[~df.sent_per_token.isnull()]

    MAPPING = {
        'review': {
            'properties': {
                'business_id': {'index': 'not_analyzed', 'type': 'string'},
                'date': {'index': 'not_analyzed', 'format': 'dateOptionalTime',
                         'type': 'date'},
                'review_id': {'index': 'not_analyzed', 'type': 'string'},
                'stars': {'index': 'not_analyzed', 'type': 'integer'},
                'text': {
                    'index': 'analyzed',
                    'analyzer': 'english',
                    'store': 'yes',
                    "term_vector": "with_positions_offsets_payloads",
                    'type': 'string'
                },
                'fake_name': {'index': 'not_analyzed', 'type': 'string'},
                'text_orig': {'index': 'not_analyzed', 'type': 'string'},
                'user_id': {'index': 'not_analyzed', 'type': 'string'},
                'net_sentiment': {'index': 'not_analyzed', 'type': 'integer'},
                'sent_per_token': {'index': 'not_analyzed', 'type': 'float'}
            }
        }
    }

    if local_es.indices.exists('yelp'):
        local_es.indices.delete(index='yelp')
    local_es.indices.create(index='yelp')
    local_es.indices.put_mapping(index='yelp', doc_type='review',
                                 body=json.dumps(MAPPING))

    # Bulk data is structured as alternating op_dict and data dicts.
    bulk_data = []
    for index, row in df.iterrows():
        data_dict = {}
        data_dict['text_orig'] = row['text']
        data_dict['text'] = row['text']
        data_dict['net_sentiment'] = row['net_sentiment']
        data_dict['sent_per_token'] = row['sent_per_token']
        data_dict['stars'] = row['stars']
        data_dict['fake_name'] = row['fake_name']
        data_dict['user_id'] = row['user_id']
        data_dict['business_id'] = row['business_id']
        data_dict['date'] = row['date']
        data_dict['review_id'] = row['review_id']
        op_dict = {
            "index": {
                "_index": 'yelp',
                "_type": 'review',
                "_id": row['review_id']
            }
        }
        bulk_data.append(op_dict)
        bulk_data.append(data_dict)

    pprint(bulk_data[0])
    pprint(bulk_data[1])
    print(len(bulk_data))

    # May time out with a large bulk_data bump or error and fail without any
    # reason. Mine did, so see below.
    # res = local_es.bulk(index = 'yelp', body = bulk_data)

    # In order to find the error, do them one-by-one, with a try.
    for ind, obj in enumerate(bulk_data):
        # every other one is the data, so use those to do it one by one
        if ind % 2 != 0:
            try:
                local_es.index(index='yelp', doc_type='review',
                               id=obj['review_id'], body=json.dumps(obj))
            except Exception:
                # BUG FIX: was a bare `except:`, which also swallowed
                # SystemExit/KeyboardInterrupt; print the failing doc either way.
                print(obj)

    local_es.search(index='yelp', doc_type='review', q='pizza-cookie')

    # Score relevancy is based on the indexed TF-IDF for the doc and docs:
    # https://www.elastic.co/guide/en/elasticsearch/guide/current/relevance-intro.html
    # Want to explain why something matched? You need the id of the matched doc.
    local_es.explain(index='yelp', doc_type='review', q='pizza-cookie',
                     id=u'fmn5yGrPChOYMR2vGOIrYA')

    # --- More Like This ---
    # Options include term counts and custom stop words:
    # https://www.elastic.co/guide/en/elasticsearch/reference/2.3/query-dsl-mlt-query.html
    text = df.iloc[0].text
    print(text)

    QUERY = {
        "query": {
            "more_like_this": {
                "fields": ["text"],
                "like_text": text,
                "analyzer": "english",
                "min_term_freq": 2
            }
        }
    }
    # Result is not brilliant, though. You could limit the hits unless a score
    # threshold is hit.
    pprint(local_es.search(index='yelp', doc_type='review', body=json.dumps(QUERY)))

    # --- Suggestions: For Misspellings ---
    # Can be added to queries too; still in development:
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-suggesters.html#search-suggesters
    SUGGESTION = {
        "my-suggestion": {
            "text": "cheese piza",
            "term": {
                "field": "text"
            }
        }
    }
    # I don't love the results, tbh. Fail on cheese.
    pprint(local_es.suggest(index='yelp', body=SUGGESTION))

    # Reminders: check your mapping on your fields; check your analyzer results
    # (misconfiguration silently falls back to defaults); check your document
    # tokenization; use multi-fields for matches that may need stopwords too.

    # --- Let's Index the Businesses too ---
    biz = pd.read_msgpack("data/biz_stats_df.msg")
    print(len(biz))
    pprint(biz[0:2])

    B_MAPPING = {
        'business': {
            'properties': {
                'business_id': {'index': 'not_analyzed', 'type': 'string'},
                'reviews': {'index': 'not_analyzed', 'type': 'integer'},
                'stars_median': {'index': 'not_analyzed', 'type': 'float'},
                'stars_mean': {'index': 'not_analyzed', 'type': 'float'},
                'text_length_median': {'index': 'not_analyzed', 'type': 'float'},
                'fake_name': {'index': 'not_analyzed', 'type': 'string'},
                'net_sentiment_median': {'index': 'not_analyzed', 'type': 'float'},
                'sent_per_token_median': {'index': 'not_analyzed', 'type': 'float'}
            }
        }
    }

    # local_es.indices.delete(index='yelp')  # nb: this errors the first time you run it. comment out.
    # local_es.indices.create(index='yelp')  # do not do this if you already made the reviews!
    local_es.indices.put_mapping(index='yelp', doc_type='business',
                                 body=json.dumps(B_MAPPING))

    bulk_data = []
    for index, row in biz.iterrows():
        data_dict = {}
        data_dict['net_sentiment_median'] = row['net_sentiment_median']
        data_dict['sent_per_token_median'] = row['sent_per_token_median']
        data_dict['stars_median'] = row['stars_median']
        data_dict['stars_mean'] = row['stars_mean']
        data_dict['fake_name'] = row['fake_name']
        data_dict['text_length_median'] = row['text_length_median']
        data_dict['business_id'] = row['business_id']
        data_dict['reviews'] = row['reviews']
        op_dict = {
            "index": {
                "_index": 'yelp',
                "_type": 'business',
                "_id": row['business_id']
            }
        }
        bulk_data.append(op_dict)
        bulk_data.append(data_dict)

    # May time out with a large bulk_data bump or error and fail without any reason.
    res = local_es.bulk(index='yelp', body=bulk_data)
    print(local_es.search(index='yelp', doc_type='business',
                          q='JokKtdXU7zXHcr20Lrk29A'))

    # --- Aggregate Queries to get Business IDs and More ---
    # Operator "and" makes sure all words in the search match; the aggregation
    # returns counts of matching business ids.
    QUERY = {
        "query": {
            "match": {
                "text": {
                    "query": "good pizza",
                    "operator": "and"
                }
            }
        },
        "aggs": {
            "businesses": {
                "terms": {
                    "field": "business_id"
                }
            }
        }
    }
    pprint(local_es.search(index="yelp", doc_type="review", body=json.dumps(QUERY)))

    # Exact match on a field (requires a not-analyzed field):
    # https://www.elastic.co/guide/en/elasticsearch/guide/master/_finding_exact_values.html
    QUERY = {
        "query": {
            "constant_score": {
                "filter": {
                    "term": {
                        "business_id": "VVeogjZya58oiTxK7qUjAQ"
                    }
                }
            }
        }
    }
    pprint(local_es.search(index="yelp", doc_type="business", body=json.dumps(QUERY)))
def view(self, request, content_type, pk): """The view for showing the results of a single item in the Haystack index. :param request: the current request. :type request: WSGIRequest :param content_type: ``app_label`` and ``model_name`` as stored in Haystack, separated by "." :type content_type: string. :param pk: the object identifier stored in Haystack :type pk: string. :return: A template rendered into an HttpReponse """ if not self.has_change_permission(request, None): raise PermissionDenied("Not a superuser") query = {DJANGO_ID: pk, DJANGO_CT: content_type} try: raw_sqs = SearchQuerySet().filter(**query)[:1] wrapped_sqs = self.get_wrapped_search_results(raw_sqs) sqs = wrapped_sqs[0] except IndexError: raise Search404( "Search result using query {q!r} does not exist".format( q=query)) except SearchBackendError as e: raise Search404("{exc!r} while trying query {q!r}".format(q=query, exc=e)) more_like_this = () # the model may no longer be in the database, instead being only backed # by the search backend. model_instance = sqs.object.object if model_instance is not None: # Refs #GH-15 - elasticsearch-py 2.x does not implement a .mlt # method, but currently there's nothing in haystack-proper which # prevents using the 2.x series with the haystack-es1 backend. # At some point haystack will have a separate es backend ... # and I have no idea if/how I'm going to support that. try: raw_mlt = SearchQuerySet().more_like_this(model_instance)[:5] except AttributeError as e: logger.debug( "Support for 'more like this' functionality was " "not found, possibly because you're using " "the elasticsearch-py 2.x series with haystack's " "ES1.x backend", exc_info=1, extra={'request': request}) raw_mlt = () more_like_this = self.get_wrapped_search_results(raw_mlt) form = PreSelectedModelSearchForm(request.GET or None, load_all=False) form_valid = form.is_valid() es = Elasticsearch() term_vectors = es.termvectors(index="haystack", doc_type="modelresult", id=content_type + "." 
+ str(pk))["term_vectors"] query = request.GET.get("q", None) content_field = request.GET.get('content_field', 'content') query_string = content_field + ':(' + query + ")" if request.GET.get("search_type", 0) == "1": query_string = content_field + ':"' + query + '"' query_field = content_field if content_field else 'content' field_mapping = self.get_field_mapping(query_field) query_analysis = self.get_query_analysis(query, field_mapping, content_field, query_field) analyzer = self.get_query_analzyer(field_mapping, content_field) query_explanation = None # wasn't easy to understand but may be useful at some point if query: query_explanation = es.explain( index="haystack", doc_type="modelresult", id=content_type + "." + str(pk), q=query_string, analyzer=analyzer, default_operator=getattr(settings, "HAYSTACK_DEFAULT_OPERATOR", "OR")) context = { 'original': sqs, 'title': _('View stored data for this %s') % force_text(sqs.verbose_name), 'app_label': self.model._meta.app_label, 'module_name': force_text(self.model._meta.verbose_name_plural), 'haystack_settings': self.get_settings(), 'has_change_permission': self.has_change_permission(request, sqs), 'similar_objects': more_like_this, 'haystack_version': _haystack_version, 'term_vectors': term_vectors, "query_explanation": query_explanation, 'query_analysis': query_analysis, 'query_string': query_string, 'analyzer': analyzer, 'form': form, 'form_valid': form_valid, } # Update the context with variables that should be available to every page context.update(self.each_context_compat(request)) return self.do_render(request=request, template_name='admin/haystackbrowser/view.html', context=context)
class ElasticRetrieval(BaseRetrieval):
    """Interfaces with the Elasticsearch API."""

    def __init__(self, index_name, method, logger=None, use_default_similarity=True,
                 max_results=None, es_instance=None, save_terms=False,
                 multi_match_type=None):
        """
        :param index_name: name of the ES index to query
        :param method: retrieval method label (stored but never used!)
        :param logger: optional logger instance
        :param use_default_similarity: unused here; kept for interface compatibility
        :param max_results: cap on hits returned; defaults to MAX_RESULTS_RECALL
        :param es_instance: pre-built Elasticsearch client to reuse
        :param save_terms: whether formulas extracted from explanations keep terms
        :param multi_match_type: ES multi_match "type"; defaults to "best_fields"
        """
        self.index_name = index_name
        if es_instance:
            self.es = es_instance
        else:
            # reuse the corpus-wide client when the corpus is ES-backed
            if cp.Corpus.__class__.__name__ == "ElasticCorpus":
                self.es = cp.Corpus.es
            else:
                self.es = Elasticsearch(timeout=QUERY_TIMEOUT)

        # make sure the index is open before querying it; best-effort only
        if not cp.Corpus.isIndexOpen(self.index_name):
            try:
                self.es.indices.open(self.index_name)
                time.sleep(10)  # give ES time to finish opening the index
            except TransportError as e:
                print(e)

        self.max_results = max_results if max_results else MAX_RESULTS_RECALL
        self.method = method  # never used!
        self.logger = logger
        self.last_query = {}
        self.save_terms = save_terms
        self.default_field = "text"
        self.tie_breaker = 0
        self.multi_match_type = multi_match_type if multi_match_type else "best_fields"

    def rewriteQueryAsDSL1(self, structured_query, parameters):
        """Creates a multi_match DSL query for elasticsearch.

        :param structured_query: a StructuredQuery dict, optionally under the
            key "structured_query"
        :param parameters: dict of [field]=weight to replace in the query
        :returns: DSL query dict, or None for an empty query
        """
        if "structured_query" in structured_query:
            structured_query = structured_query["structured_query"]

        if not isinstance(structured_query, StructuredQuery):
            structured_query = StructuredQuery(structured_query)

        if not structured_query or len(structured_query) == 0:
            return None

        self.last_query = structured_query

        lucene_query = ""
        for token in structured_query:
            # TODO proper computing of the boost formula. Different methods?
            t_boost = token.boost
            t_count = token.count
            if t_boost is None:
                print("NULL! ")
                print(token, token.boost, token.count)
                t_boost = 0
            if t_count is None:
                print("NULL! ")
                print(token, token.boost, token.count)
                t_count = 0

            boost = t_boost * t_count
            if boost == 0.0:
                continue

            bool_val = token.bool or ""
            token_text = token.token
            if " " in token_text:  # if token is a phrase
                token_text = "\"" + token_text + "\""

            lucene_query += "%s%s " % (bool_val, token_text)
            # emulate boosting by repeating the token int(boost) times in total
            if boost != 1:
                token_str = token_text + " "
                lucene_query += bool_val + (token_str * int(boost - 1))
            lucene_query = lucene_query.strip()
            lucene_query += " "

        # BUG FIX: the original replaced a single space with a single space,
        # a no-op; the intent was to collapse accidental double spaces.
        lucene_query = lucene_query.replace("  ", " ")

        fields = []
        for param in parameters:
            fields.append(param + "^" + str(parameters[param]))

        dsl_query = {
            "multi_match": {
                "query": lucene_query,
                "type": self.multi_match_type,
                "fields": fields,
                "operator": "or",
            }
        }
        if self.tie_breaker:
            dsl_query["multi_match"]["tie_breaker"] = self.tie_breaker

        return dsl_query

    def rewriteQueryAsDSL2(self, structured_query, parameters):
        """Creates a multi_match DSL query for elasticsearch. Version 2.

        :param structured_query: a StructuredQuery dict, optionally under the
            key "structured_query"
        :param parameters: dict of [field]=weight to replace in the query
        :returns: DSL query dict, or None for an empty query
        """
        if "structured_query" in structured_query:
            structured_query = structured_query["structured_query"]

        if not isinstance(structured_query, StructuredQuery):
            structured_query = StructuredQuery(structured_query)

        if not structured_query or len(structured_query) == 0:
            return None

        self.last_query = structured_query

        lucene_query = ""
        for token in structured_query:
            boost = token.boost * token.count
            bool_val = token.bool or ""
            token_text = token.token
            if " " in token_text:  # if token is a phrase
                token_text = "\"" + token_text + "\""

            lucene_query += "%s%s " % (bool_val, token_text)
            # emulate boosting by repeating the token int(boost) times in total
            if boost != 1:
                token_str = token_text + " "
                lucene_query += bool_val + (token_str * int(boost - 1))
            lucene_query = lucene_query.strip()
            lucene_query += " "

        # (removed the dead `elastic_query` local the original never used)
        fields = []
        for param in parameters:
            fields.append(param + "^" + str(parameters[param]))

        dsl_query = {
            "multi_match": {
                "query": lucene_query,
                "type": self.multi_match_type,
                "fields": fields,
                "operator": "or",
            }
        }
        if self.tie_breaker:
            dsl_query["multi_match"]["tie_breaker"] = self.tie_breaker

        return dsl_query

    def rewriteQueryAsDSL(self, structured_query, parameters):
        """Creates a DSL query for elasticsearch. Version 3, uses individual
        "term" and "match_phrase" queries inside a bool/should.

        :param structured_query: a StructuredQuery dict, optionally under the
            key "structured_query"
        :param parameters: dict of [field]=weight to replace in the query
        :returns: DSL query dict, or None for an empty query
        """
        if isinstance(structured_query, dict) and "structured_query" in structured_query:
            structured_query = structured_query["structured_query"]

        if not isinstance(structured_query, StructuredQuery):
            structured_query = StructuredQuery(structured_query)

        if not structured_query or len(structured_query) == 0:
            return None

        self.last_query = structured_query

        field_dicts = []
        for token in structured_query:
            # TODO proper computing of the boost formula. Different methods?
            boost = token.boost * token.count
            token_text = token.token
            if boost == 0.0:
                continue

            for field in parameters:
                if " " in token_text:
                    # phrases need a match_phrase query
                    new_dict = {
                        "match_phrase": {
                            field: {
                                "query": token_text,
                                "boost": parameters[field] * boost
                            },
                        }
                    }
                else:
                    new_dict = {
                        "term": {
                            field: {
                                "value": token_text,
                                "boost": parameters[field] * boost
                            },
                        }
                    }
                field_dicts.append(new_dict)

        # (removed the unused `fields` list the original built and discarded)
        dsl_query = {"bool": {"should": field_dicts}}
        return dsl_query

    def runQuery(self, structured_query, max_results=None):
        """Interfaces with the elasticsearch query API.

        :param structured_query: dict with a "structured_query" key
        :param max_results: optional per-call override of self.max_results
        :returns: list of (score, metadata) tuples, best hits first
        """
        if not structured_query or len(structured_query) == 0:
            return []

        if not max_results:
            max_results = self.max_results

        self.last_query = dict(structured_query)
        # BUG FIX: rewriteQueryAsDSL indexes `parameters` by field name, so it
        # needs a {field: weight} dict -- the original passed a list, which
        # raised a TypeError on the first query.
        dsl_query = self.rewriteQueryAsDSL(
            structured_query["structured_query"], {self.default_field: 1})

        res = self.es.search(
            body={"query": dsl_query},
            size=max_results,
            index=self.index_name,
            doc_type=ES_TYPE_DOC,
            request_timeout=QUERY_TIMEOUT,
        )

        structured_query["dsl_query"] = dsl_query
        hits = res["hits"]["hits"]
        result = []
        for hit in hits:
            metadata = hit["_source"]["metadata"]
            result.append((hit["_score"], metadata))
        return result

    def formulaFromExplanation(self, query, doc_id):
        """Runs .explain() for one query/doc pair, generates and returns a \
        StoredFormula instance from it.

        :param query: StructuredQuery dict, with a "dsl_query" key
        :param doc_id: id of document to run .explain() for
        :returns: StoredFormula (empty if the explanation could not be fetched)
        """
        explanation = None
        retries = 0
        while retries < 1:
            try:
                explanation = self.es.explain(
                    index=self.index_name,
                    doc_type=ES_TYPE_DOC,
                    body={"query": query["dsl_query"]},
                    id=doc_id,
                    request_timeout=QUERY_TIMEOUT,
                )
                break
            except Exception:
                ## logging.error("Exception, retrying...")
                retries += 1

        # BUG FIX: the original logged "failed to retrieve" on every retry and
        # its success-after-retry branch was unreachable; report based on
        # whether an explanation was actually obtained.
        if retries > 0:
            if explanation is None:
                logging.error("Retried %d times, failed to retrieve." % retries)
            else:
                logging.warning("Retried %d times, retrieved successfully." % retries)

        formula = StoredFormula()
        if explanation:
            formula.fromElasticExplanation(explanation, self.save_terms)
        return formula