Example No. 1
from elasticsearch import Elasticsearch


def analyze_match_query(index, field, query, results):
    body = {"query": {"match": {field: {"query": query}}}}
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    for result in results:
        response = es.explain(index=index,
                              body=body,
                              doc_type='doc',
                              id=result['uuid'])
        print('explain response --> ', response)
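

# A minimal usage sketch: the index name, field, query and result dicts below are
# hypothetical, and assume a local Elasticsearch with documents whose ids match the
# 'uuid' values, as the function above expects.
if __name__ == '__main__':
    fake_results = [{'uuid': 'doc-1'}, {'uuid': 'doc-2'}]
    analyze_match_query('my_index', 'text', 'cheese pizza', fake_results)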
Example No. 2
class ElasticRetrieval(BaseRetrieval):
    """
        Interfaces with the Elasticsearch API
    """
    def __init__(self, index_name, method, logger=None, use_default_similarity=True, max_results=None, es_instance=None, save_terms=True):
        self.index_name=index_name
        if es_instance:
            self.es=es_instance
        else:
            if cp.Corpus.__class__.__name__ == "ElasticCorpus":
                self.es=cp.Corpus.es
            else:
                self.es=Elasticsearch()

        if max_results:
            self.max_results=max_results
        else:
            self.max_results=MAX_RESULTS_RECALL

        self.method=method # never used?
        self.logger=logger
        self.last_query={}
        self.save_terms=save_terms
        self.default_field="text"
        self.tie_breaker=0
        self.multi_match_type="best_fields"

    def rewriteQueryAsDSL(self, structured_query, parameters):
        """
            Creates a multi_match DSL query for elasticsearch.

            :param structured_query: a StructuredQuery dict, optionally under the
                key "structured_query"
            :param parameters: dict of [field]=weight to replace in the query
        """
        if "structured_query" in structured_query:
            structured_query=structured_query["structured_query"]

        if not isinstance(structured_query,StructuredQuery):
            structured_query=StructuredQuery(structured_query)

        if not structured_query or len(structured_query) == 0:
            return None

        self.last_query=structured_query

        lucene_query=""

        for  token in structured_query:
            # TODO proper computing of the boost formula. Different methods?
##            boost=token["boost"]*token["count"]
            boost=token.boost*token.count
##            bool_val=token.get("bool", None) or ""
            bool_val=token.bool or ""

##            lucene_query+="%s%s" % (bool_val,token["token"])
            lucene_query+="%s%s" % (bool_val,token.token)
##            if boost != 1:
##                lucene_query+="^%s" %str(boost)
            if boost > 1:
                # emulate boosting by repeating the token so it occurs int(boost) times in total
                token_str=token.token+" "
                lucene_query+=" "+bool_val+(token_str * (int(boost) - 1)).strip()
            lucene_query+=" "

        fields=[]
        for param in parameters:
            fields.append(param+"^"+str(parameters[param]))

        dsl_query={
          "multi_match" : {
            "query": lucene_query,
            "type":  self.multi_match_type,
            "fields": fields,
            "operator": "or",
          }
        }

        if self.tie_breaker:
            dsl_query["multi_match"]["tie_breaker"]=self.tie_breaker

        return dsl_query

    def rewriteQueryAsDSL_new(self, structured_query, parameters):
        """
            Creates a multi_match DSL query for elasticsearch.

            :param structured_query: a StructuredQuery dict, optionally under the
                key "structured_query"
            :param parameters: dict of [field]=weight to replace in the query
        """
        if "structured_query" in structured_query:
            structured_query=structured_query["structured_query"]

        if not isinstance(structured_query,StructuredQuery):
            structured_query=StructuredQuery(structured_query)

        if not structured_query or len(structured_query) == 0:
            return None

        self.last_query=structured_query

        lucene_query=""

        for  token in structured_query:
            # TODO proper computing of the boost formula. Different methods?
##            boost=token["boost"]*token["count"]
            boost=token.boost*token.count
##            bool_val=token.get("bool", None) or ""
            bool_val=token.bool or ""

##            lucene_query+="%s%s" % (bool_val,token["token"])
            lucene_query+="%s%s" % (bool_val,token.token)
##            if boost != 1:
##                lucene_query+="^%s" %str(boost)
            if boost > 1:
                # emulate boosting by repeating the token so it occurs int(boost) times in total
                token_str=token.token+" "
                lucene_query+=" "+bool_val+(token_str * (int(boost) - 1)).strip()
            lucene_query+=" "

        fields=[]
        for param in parameters:
            fields.append(param+"^"+str(parameters[param]))

        dsl_query={
          "multi_match" : {
            "query": lucene_query,
            "type":  self.multi_match_type,
            "fields": fields,
            "operator": "or",
          }
        }

        if self.tie_breaker:
            dsl_query["multi_match"]["tie_breaker"]=self.tie_breaker

        return dsl_query



    def runQuery(self, structured_query, max_results=None):
        """
            Interfaces with the elasticsearch query API
        """
        if not structured_query or len(structured_query) == 0 :
            return []

        if not max_results:
            max_results=self.max_results

        self.last_query=dict(structured_query)
        # rewriteQueryAsDSL expects a dict of field -> weight
        dsl_query=self.rewriteQueryAsDSL(structured_query["structured_query"], {self.default_field: 1})

        res=self.es.search(
            body={"query":dsl_query},
            size=max_results,
            index=self.index_name,
            doc_type=ES_TYPE_DOC,
            request_timeout=QUERY_TIMEOUT,
            )

        structured_query["dsl_query"]=dsl_query
        hits=res["hits"]["hits"]
##        print("Found %d document(s) that matched query '%s':" % (res['hits']['total'], query))

##        if len(hits.scoreDocs) ==0:
##            print "Original query:",original_query
##            print "Query:", query
        result=[]
        for hit in hits:
            metadata= hit["_source"]["metadata"]
            result.append((hit["_score"],metadata))
        return result

    def formulaFromExplanation(self, query, doc_id):
        """
            Runs .explain() for one query/doc pair, generates and returns a \
            StoredFormula instance from it

            :param query: StructuredQuery dict, with a "dsl_query" key
            :param doc_id: id of document to run .explain() for
            :returns: a StoredFormula instance (empty if the explain call failed)
        """
        explanation=None
        retries=0
        while retries < 2:
            try:
                explanation=self.es.explain(
                    index=self.index_name,
                    doc_type=ES_TYPE_DOC,
                    body={"query":query["dsl_query"]},
                    id=doc_id,
                    request_timeout=QUERY_TIMEOUT,
                    )
                break
            except Exception as e:
                logging.exception("Exception, retrying...")
                retries+=1

        formula=StoredFormula()
        if explanation:
            formula.fromElasticExplanation(explanation, self.save_terms)
        return formula
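
# A minimal usage sketch, all commented out: the index name, method name, field weights
# and the StructuredQuery contents are hypothetical, and the surrounding module is
# assumed to provide cp, StructuredQuery and the ES_*/QUERY_TIMEOUT constants used above.
#
#     retrieval = ElasticRetrieval("papers_index", method="standard_multi")
#     dsl = retrieval.rewriteQueryAsDSL(my_structured_query, {"title": 2, "text": 1})
#     hits = retrieval.runQuery({"structured_query": my_structured_query}, max_results=10)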
Example No. 3
import json
from pprint import pprint

import pandas as pd
from elasticsearch import Elasticsearch, client


def main():
    # es = Elasticsearch(hosts=[{'host': 'elasticsearch.aws.blahblah.com', 'port': '9200'}])
    local_es = Elasticsearch()
    local_client = client.IndicesClient(local_es)

    # ### Analyzers, Defaults, and Preventing Analysis
    #
    # Analysis is the process of chopping up your text and storing it in a form that can be searched against efficiently.
    #
    # #### Read this:
    #
    # https://www.elastic.co/guide/en/elasticsearch/guide/current/custom-analyzers.html
    #
    # An analyzer is, in order, a sequence of
    # * character filters (zero or more)
    # * a tokenizer (exactly one)
    # * token filters (zero or more)
    #
    # To prevent analysis, you can specify "not_analyzed" on the field in the mapping.  The Interwebs also suggest "keyword" as the analyzer for a field, but some folks claim it does some simple analysis.
    #
    # The default analyzer (if unspecified!) for string fields is "standard."  In a custom analyzer, it would be defined:
    #
    #     {
    #         "type":      "custom",
    #         "tokenizer": "standard",
    #         "filter":  [ "lowercase", "stop" ]
    #     }
    #
    # More on default analysis from the docs (https://www.elastic.co/guide/en/elasticsearch/guide/current/_controlling_analysis.html):
    #
    # >While we can specify an analyzer at the field level, how do we determine which analyzer is used for a field if none is specified at the field level?
    # >
    # >Analyzers can be specified at several levels. Elasticsearch works through each level until it finds an analyzer that it can use. At index time, the order is as follows:
    # >
    # >1. The analyzer defined in the field mapping, else
    # >2. The analyzer named default in the index settings, which defaults to
    # >3. The standard analyzer
    # >
    # >...At search time, the sequence is slightly different:...
    # >
    # >1. The analyzer defined in the query itself, else
    # >2. The search_analyzer defined in the field mapping, else
    # >3. The analyzer defined in the field mapping, else
    # >4. The analyzer named default_search in the index settings, which defaults to
    # >5. The analyzer named default in the index settings, which defaults to
    # >6. The standard analyzer
    #
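    # For example (a sketch only; these settings are not applied anywhere in this
    # script), an index-wide default is just an analyzer named "default" in the
    # index settings:
    #
    #     PUT /my_index
    #     {
    #         "settings": {
    #             "analysis": {
    #                 "analyzer": {
    #                     "default": { "type": "english" }
    #                 }
    #             }
    #         }
    #     }
    #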
    # #### We can inspect analysis with the "analyze" function (or "_analyze" in the curl style).
    if local_es.indices.exists('my_index'):
        local_es.indices.delete(index='my_index')
    local_es.indices.create(index='my_index')

    # this is the default analyzer ES will use if you don't specify one! Specify one!
    print(
        local_client.analyze(index='my_index',
                             analyzer='standard',
                             text='My kitty-cat is adorable.'))

    # A utility to make analysis results easier to read:
    def get_analyzer_tokens(result):
        ''' Utility to combine tokens in an analyzer result. '''
        tokens = result[u'tokens']
        print(tokens)
        return ' '.join([token['token'] for token in tokens])

    get_analyzer_tokens(
        local_client.analyze(index='my_index',
                             analyzer="standard",
                             text='My kitty-cat\'s a pain in the neck.'))

    # **NB: Prevent analysis with the "keyword" analyzer, or set the field as "not_analyzed" in the mapping.**
    #
    # But if you do this, you need to match on EXACT field contents to search for it.  Best to keep an analyzed copy too, if it's meant to be English-searchable text (see the multi-field sketch below).
    get_analyzer_tokens(
        local_client.analyze(index='my_index',
                             analyzer='keyword',
                             text='My kitty-cat\'s a pain in the neck.'))
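
    # A sketch of keeping both copies via multi-fields (ES 1.x/2.x-style string mapping;
    # the field and sub-field names are made up and this mapping is not applied here):
    #
    #     "title": {
    #         "type":     "string",
    #         "analyzer": "english",
    #         "fields": {
    #             "raw": { "type": "string", "index": "not_analyzed" }
    #         }
    #     }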

    # ## The Built-In ES "English" Analyzer:
    # ### A useful analyzer for text is the built-in English one, which does this, approximately:
    #
    # https://www.elastic.co/guide/en/elasticsearch/guide/current/language-intro.html
    #
    # See:
    # https://simpsora.wordpress.com/2014/05/02/customizing-elasticsearch-english-analyzer/
    #
    # >Tokenizer: Standard tokenizer
    #
    # >TokenFilters:
    # >* Standard token filter
    # >* English possessive filter, which removes trailing 's from words
    # >* Lowercase token filter
    # >* Stop token filter
    # >* Keyword marker filter, which protects certain tokens from modification by stemmers
    # >* Porter stemmer filter, which reduces words down to a base form (“stem”)
    #
    #
    # These are the stop-words defined for English:
    #
    #     a, an, and, are, as, at, be, but, by, for, if, in, into, is, it,
    #     no, not, of, on, or, such, that, the, their, then, there, these,
    #     they, this, to, was, will, with
    #
    # If you want to customize you can create a new filter yourself or use a file in your config directory for ES.
    # Try it on some text and see...
    get_analyzer_tokens(
        local_client.analyze(index='my_index',
                             analyzer='english',
                             text='My kitty-cat\'s a pain in the neck.'))

    # If you wanted to customize the 'english' analyzer with your own special rules (extra stopwords etc), see here: https://www.elastic.co/guide/en/elasticsearch/guide/current/configuring-language-analyzers.html
    #
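    # A sketch in the spirit of that guide (not applied to any index in this script):
    # an 'english'-type analyzer with its stopword list swapped for a custom one.
    # The analyzer name and word list are made up.
    #
    #     "analyzer": {
    #         "my_english": {
    #             "type":      "english",
    #             "stopwords": [ "a", "an", "the", "kitty" ]
    #         }
    #     }
    #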

    # ## Analyzers and Custom Analyzers

    # You want to make sure you are explicit about types in your data, so that ES doesn't just guess and maybe get it wrong. Also, this is how you set explicit analysis.

    #
    #
    # Create a setting for the index:
    #
    #     PUT /my_index
    #     {
    #         "settings": {
    #             "analysis": {
    #                 "char_filter": { ... custom character filters ... },
    #                 "tokenizer":   { ...    custom tokenizers     ... },
    #                 "filter":      { ...   custom token filters   ... },
    #                 "analyzer":    { ...    custom analyzers referring to the definitions above ... }
    #             }
    #         }
    #     }
    #
    # For example - this saves a bunch of analysis components into an analyzer called 'my_analyzer':
    #
    #     PUT /my_index
    #     {
    #         "settings": {
    #             "analysis": {
    #                 "char_filter": {
    #                     "&_to_and": {
    #                         "type":       "mapping",
    #                         "mappings": [ "&=> and "]
    #                 }},
    #                 "filter": {
    #                     "my_stopwords": {
    #                         "type":       "stop",
    #                         "stopwords": [ "the", "a" ]
    #                 }},
    #                 "analyzer": {
    #                     "my_analyzer": {
    #                         "type":         "custom",
    #                         "char_filter":  [ "html_strip", "&_to_and" ],
    #                         "tokenizer":    "standard",
    #                         "filter":       [ "lowercase", "my_stopwords" ]
    #                 }}
    #     }}}
    #
    #  Then you **use it** by referring to it in a mapping for a document in this index:
    #
    #     PUT /my_index/_mapping/my_type
    #     {
    #         "properties": {
    #             "title": {
    #                 "type":      "string",
    #                 "analyzer":  "my_analyzer"
    #             }
    #         }
    #     }
    #
    # #### Remember: If you don't assign it to a field in a mapping, you aren't using it.
    #
    # In Python:

    MY_SETTINGS = {
        "settings": {
            "analysis": {
                "char_filter": {
                    "&_to_and": {
                        "type": "mapping",
                        "mappings": ["&=> and "]
                    }
                },
                "filter": {
                    "my_stopwords": {
                        "type": "stop",
                        "stopwords": ["the", "a"]
                    }
                },
                "analyzer": {
                    "my_analyzer": {
                        "type": "custom",
                        "char_filter": ["html_strip", "&_to_and"],
                        "tokenizer": "standard",
                        "filter": ["lowercase", "my_stopwords"]
                    }
                }
            }
        }
    }

    MAPPING = {
        "my_doc_type": {
            "properties": {
                "title": {
                    "type": "string",
                    "analyzer": "my_analyzer"
                }
            }
        }
    }

    # ## Stopwords Note
    #
    # The default list of stopwords is indicated thusly:
    #
    # >"stopwords": "\_english\_"
    #
    # So you can specify both that filter and a custom stopwords list, if you want.
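    #
    # For instance, a custom analyzer could chain the built-in English list with a few
    # extra words of your own (a sketch only; the filter names and words are made up,
    # and these settings are not applied below):
    #
    #     "filter": {
    #         "english_stop":  { "type": "stop", "stopwords": "_english_" },
    #         "my_extra_stop": { "type": "stop", "stopwords": [ "kitty", "neck" ] }
    #     },
    #     "analyzer": {
    #         "my_stoppy_analyzer": {
    #             "type":      "custom",
    #             "tokenizer": "standard",
    #             "filter":    [ "lowercase", "english_stop", "my_extra_stop" ]
    #         }
    #     }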
    if local_es.indices.exists('my_index'):
        local_es.indices.delete(index='my_index')
    local_es.indices.create(index='my_index', body=json.dumps(MY_SETTINGS))
    local_es.indices.put_mapping(index='my_index',
                                 doc_type="my_doc_type",
                                 body=json.dumps(MAPPING))

    # Check that your mapping looks right!
    print(local_client.get_mapping(index='my_index'))

    res = local_client.analyze(
        index='my_index',
        analyzer='my_analyzer',
        text="<p>This is the title & a Capitalized Word!</p>")
    get_analyzer_tokens(res)

    # ## Tokenizers vs. Analyzers - Be Careful.
    #
    # Some of the names in ES are confusing.  There is a **"standard" analyzer** and a **"standard" tokenizer**. https://www.elastic.co/guide/en/elasticsearch/guide/current/standard-tokenizer.html#standard-tokenizer
    #
    # Check them out:
    get_analyzer_tokens(
        local_client.analyze(
            index='my_index',
            analyzer='standard',
            text='My kitty-cat\'s not a pain in the \'neck\'!'))

    #  The difference is subtle but there.
    get_analyzer_tokens(
        local_client.analyze(
            index='my_index',
            tokenizer="standard",
            text='My kitty-cat\'s not a pain in the \'neck\'!'))

    # However, if you also pass the english analyzer, it takes precedence over the tokenizer
    # argument: the text is lowercased and stemmed, and the negation disappears, because "not" is in the stopwords list:
    get_analyzer_tokens(
        local_client.analyze(
            index='my_index',
            analyzer="english",
            tokenizer="standard",
            text='My kitty-cat\'s not a pain in the \'neck\'!'))

    # ## Indexing Yelp Data
    df = pd.read_msgpack("./data/yelp_df_forES.msg")
    print(df.head())

    # test with a small sample if you want
    dfshort = df.query('stars >= 5 and net_sentiment > 35')
    print(len(dfshort))
    print(dfshort.head())

    # filter out any rows with a nan for sent_per_token, which breaks bulk load:
    df = df[df.sent_per_token.notnull()]

    MAPPING = {
        'review': {
            'properties': {
                'business_id': {
                    'index': 'not_analyzed',
                    'type': 'string'
                },
                'date': {
                    'index': 'not_analyzed',
                    'format': 'dateOptionalTime',
                    'type': 'date'
                },
                'review_id': {
                    'index': 'not_analyzed',
                    'type': 'string'
                },
                'stars': {
                    'index': 'not_analyzed',
                    'type': 'integer'
                },
                'text': {
                    'index': 'analyzed',
                    'analyzer': 'english',
                    'store': 'yes',
                    "term_vector": "with_positions_offsets_payloads",
                    'type': 'string'
                },
                'fake_name': {
                    'index': 'not_analyzed',
                    'type': 'string'
                },
                'text_orig': {
                    'index': 'not_analyzed',
                    'type': 'string'
                },
                'user_id': {
                    'index': 'not_analyzed',
                    'type': 'string'
                },
                'net_sentiment': {
                    'index': 'not_analyzed',
                    'type': 'integer'
                },
                'sent_per_token': {
                    'index': 'not_analyzed',
                    'type': 'float'
                }
            }
        }
    }

    if local_es.indices.exists('yelp'):
        local_es.indices.delete(index='yelp')
    local_es.indices.create(index='yelp')
    local_es.indices.put_mapping(index='yelp',
                                 doc_type='review',
                                 body=json.dumps(MAPPING))

    # Bulk data is structured as alternating op_dict and data_dict entries.
    bulk_data = []

    for index, row in df.iterrows():
        data_dict = {}
        data_dict['text_orig'] = row['text']
        data_dict['text'] = row['text']
        data_dict['net_sentiment'] = row['net_sentiment']
        data_dict['sent_per_token'] = row['sent_per_token']
        data_dict['stars'] = row['stars']
        data_dict['fake_name'] = row['fake_name']
        data_dict['user_id'] = row['user_id']
        data_dict['business_id'] = row['business_id']
        data_dict['date'] = row['date']
        data_dict['review_id'] = row['review_id']
        op_dict = {
            "index": {
                "_index": 'yelp',
                "_type": 'review',
                "_id": row['review_id']
            }
        }
        bulk_data.append(op_dict)
        bulk_data.append(data_dict)

    pprint(bulk_data[0])
    pprint(bulk_data[1])
    print(len(bulk_data))

    # May time out with a large bulk_data, or error out and fail without a clear reason.  Mine did, so see below.
    # res = local_es.bulk(index = 'yelp', body = bulk_data)

    # In order to find the error, I did them one-by-one, with a try.
    for ind, obj in enumerate(bulk_data):
        # every other one is the data, so use those to do it one by one
        if ind % 2 != 0:
            try:
                local_es.index(index='yelp',
                               doc_type='review',
                               id=obj['review_id'],
                               body=json.dumps(obj))
            except Exception:
                print(obj)
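
    # An alternative sketch (commented out, not run here): elasticsearch.helpers.bulk
    # can send the same alternating op/data pairs in managed chunks, avoiding one giant
    # request body. The chunk_size value is arbitrary.
    #
    # from elasticsearch import helpers
    # actions = ({"_index": "yelp", "_type": "review", "_id": d["review_id"], "_source": d}
    #            for d in bulk_data[1::2])
    # helpers.bulk(local_es, actions, chunk_size=500)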

    local_es.search(index='yelp', doc_type='review', q='pizza-cookie')

    # Remember that relevancy scores are based on the TF-IDF statistics of the indexed doc and the rest of the index:
    #     https://www.elastic.co/guide/en/elasticsearch/guide/current/relevance-intro.html

    # Want to explain why something matched?  You need the id of the matched doc.
    local_es.explain(index='yelp',
                     doc_type='review',
                     q='pizza-cookie',
                     id=u'fmn5yGrPChOYMR2vGOIrYA')

    # ### More Like This
    #
    # A variety of options for finding similar documents, including term counts and custom stop words:
    # https://www.elastic.co/guide/en/elasticsearch/reference/2.3/query-dsl-mlt-query.html
    #
    #
    text = df.iloc[0].text
    print(text)

    QUERY = {
        "query": {
            "more_like_this": {
                "fields": ["text"],
                "like_text": text,
                "analyzer": "english",
                "min_term_freq": 2
            }
        }
    }

    # The results are not brilliant, though.  You could drop hits that fall below a score threshold.
    pprint(
        local_es.search(index='yelp',
                        doc_type='review',
                        body=json.dumps(QUERY)))

    # ### Suggestions: For Misspellings
    #
    # Can be added to queries too, to help if there are no matches.  Still in development, though. See: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-suggesters.html#search-suggesters
    SUGGESTION = {
        "my-suggestion": {
            "text": "cheese piza",
            "term": {
                "field": "text"
            }
        }
    }

    # I don't love the results, tbh.  Fail on cheese.
    pprint(local_es.suggest(index='yelp', body=SUGGESTION))

    # ## Reminders:
    # * check your mapping on your fields
    # * check your analyzer results - they can be mysterious and hidden; if you configure wrong, it will use defaults...
    # * check your document tokenization
    # * use multi-fields to be sure of matches that may need stopwords too

    # ## Let's Index the Businesses too
    biz = pd.read_msgpack("data/biz_stats_df.msg")
    print(len(biz))
    pprint(biz[0:2])

    B_MAPPING = {
        'business': {
            'properties': {
                'business_id': {
                    'index': 'not_analyzed',
                    'type': 'string'
                },
                'reviews': {
                    'index': 'not_analyzed',
                    'type': 'integer'
                },
                'stars_median': {
                    'index': 'not_analyzed',
                    'type': 'float'
                },
                'stars_mean': {
                    'index': 'not_analyzed',
                    'type': 'float'
                },
                'text_length_median': {
                    'index': 'not_analyzed',
                    'type': 'float'
                },
                'fake_name': {
                    'index': 'not_analyzed',
                    'type': 'string'
                },
                'net_sentiment_median': {
                    'index': 'not_analyzed',
                    'type': 'float'
                },
                'sent_per_token_median': {
                    'index': 'not_analyzed',
                    'type': 'float'
                }
            }
        }
    }

    # local_es.indices.delete(index='yelp')  # nb: this errors the first time you run it. comment out.
    # local_es.indices.create(index='yelp')  # do not do this if you already made the reviews!
    local_es.indices.put_mapping(index='yelp',
                                 doc_type='business',
                                 body=json.dumps(B_MAPPING))

    bulk_data = []

    for index, row in biz.iterrows():
        data_dict = {}
        data_dict['net_sentiment_median'] = row['net_sentiment_median']
        data_dict['sent_per_token_median'] = row['sent_per_token_median']
        data_dict['stars_median'] = row['stars_median']
        data_dict['stars_mean'] = row['stars_mean']
        data_dict['fake_name'] = row['fake_name']
        data_dict['text_length_median'] = row['text_length_median']
        data_dict['business_id'] = row['business_id']
        data_dict['reviews'] = row['reviews']
        op_dict = {
            "index": {
                "_index": 'yelp',
                "_type": 'business',
                "_id": row['business_id']
            }
        }
        bulk_data.append(op_dict)
        bulk_data.append(data_dict)

    # May time out with a large bulk_data, or error out and fail without a clear reason (see the one-by-one fallback above).
    res = local_es.bulk(index='yelp', body=bulk_data)

    print(
        local_es.search(index='yelp',
                        doc_type='business',
                        q='JokKtdXU7zXHcr20Lrk29A'))

    # ## Aggregate Queries to get Business IDs and More
    #
    #
    # Here we are using the operator "and" to make sure all words in the search match, and then getting counts of matching business IDs.
    QUERY = {
        "query": {
            "match": {
                "text": {
                    "query": "good pizza",
                    "operator": "and"
                }
            }
        },
        "aggs": {
            "businesses": {
                "terms": {
                    "field": "business_id"
                }
            }
        }
    }

    pprint(
        local_es.search(index="yelp",
                        doc_type="review",
                        body=json.dumps(QUERY)))

    # exact match on a field: https://www.elastic.co/guide/en/elasticsearch/guide/master/_finding_exact_values.html
    # this works best against a not_analyzed field, so the term filter can match the exact stored value
    QUERY = {
        "query": {
            "constant_score": {
                "filter": {
                    "term": {
                        "business_id": "VVeogjZya58oiTxK7qUjAQ"
                    }
                }
            }
        }
    }

    pprint(
        local_es.search(index="yelp",
                        doc_type="business",
                        body=json.dumps(QUERY)))
Example No. 4
    def view(self, request, content_type, pk):
        """The view for showing the results of a single item in the Haystack index.

        :param request: the current request.
        :type request: WSGIRequest
        :param content_type: ``app_label`` and ``model_name`` as stored in Haystack, separated by "."
        :type content_type: string.
        :param pk: the object identifier stored in Haystack
        :type pk: string.

        :return: A template rendered into an HttpResponse
        """
        if not self.has_change_permission(request, None):
            raise PermissionDenied("Not a superuser")

        query = {DJANGO_ID: pk, DJANGO_CT: content_type}
        try:
            raw_sqs = SearchQuerySet().filter(**query)[:1]
            wrapped_sqs = self.get_wrapped_search_results(raw_sqs)
            sqs = wrapped_sqs[0]
        except IndexError:
            raise Search404(
                "Search result using query {q!r} does not exist".format(
                    q=query))
        except SearchBackendError as e:
            raise Search404("{exc!r} while trying query {q!r}".format(q=query,
                                                                      exc=e))

        more_like_this = ()
        # the model may no longer be in the database, instead being only backed
        # by the search backend.
        model_instance = sqs.object.object
        if model_instance is not None:
            # Refs #GH-15 - elasticsearch-py 2.x does not implement a .mlt
            # method, but currently there's nothing in haystack-proper which
            # prevents using the 2.x series with the haystack-es1 backend.
            # At some point haystack will have a separate es backend ...
            # and I have no idea if/how I'm going to support that.
            try:
                raw_mlt = SearchQuerySet().more_like_this(model_instance)[:5]
            except AttributeError as e:
                logger.debug(
                    "Support for 'more like this' functionality was "
                    "not found, possibly because you're using "
                    "the elasticsearch-py 2.x series with haystack's "
                    "ES1.x backend",
                    exc_info=1,
                    extra={'request': request})
                raw_mlt = ()
            more_like_this = self.get_wrapped_search_results(raw_mlt)

        form = PreSelectedModelSearchForm(request.GET or None, load_all=False)
        form_valid = form.is_valid()

        es = Elasticsearch()
        term_vectors = es.termvectors(index="haystack",
                                      doc_type="modelresult",
                                      id=content_type + "." +
                                      str(pk))["term_vectors"]

        query = request.GET.get("q", None)
        content_field = request.GET.get('content_field', 'content')

        query_string = content_field + ':(' + query + ")"
        if request.GET.get("search_type", 0) == "1":
            query_string = content_field + ':"' + query + '"'

        query_field = content_field if content_field else 'content'
        field_mapping = self.get_field_mapping(query_field)
        query_analysis = self.get_query_analysis(query, field_mapping,
                                                 content_field, query_field)
        analyzer = self.get_query_analzyer(field_mapping, content_field)

        query_explanation = None

        # wasn't easy to understand but may be useful at some point
        if query:
            query_explanation = es.explain(
                index="haystack",
                doc_type="modelresult",
                id=content_type + "." + str(pk),
                q=query_string,
                analyzer=analyzer,
                default_operator=getattr(settings, "HAYSTACK_DEFAULT_OPERATOR",
                                         "OR"))

        context = {
            'original':
            sqs,
            'title':
            _('View stored data for this %s') % force_text(sqs.verbose_name),
            'app_label':
            self.model._meta.app_label,
            'module_name':
            force_text(self.model._meta.verbose_name_plural),
            'haystack_settings':
            self.get_settings(),
            'has_change_permission':
            self.has_change_permission(request, sqs),
            'similar_objects':
            more_like_this,
            'haystack_version':
            _haystack_version,
            'term_vectors':
            term_vectors,
            "query_explanation":
            query_explanation,
            'query_analysis':
            query_analysis,
            'query_string':
            query_string,
            'analyzer':
            analyzer,
            'form':
            form,
            'form_valid':
            form_valid,
        }
        # Update the context with variables that should be available to every page
        context.update(self.each_context_compat(request))
        return self.do_render(request=request,
                              template_name='admin/haystackbrowser/view.html',
                              context=context)
Example No. 5
class ElasticRetrieval(BaseRetrieval):
    """
        Interfaces with the Elasticsearch API
    """
    def __init__(self,
                 index_name,
                 method,
                 logger=None,
                 use_default_similarity=True,
                 max_results=None,
                 es_instance=None,
                 save_terms=False,
                 multi_match_type=None):
        self.index_name = index_name
        if es_instance:
            self.es = es_instance
        else:
            if cp.Corpus.__class__.__name__ == "ElasticCorpus":
                self.es = cp.Corpus.es
            else:
                self.es = Elasticsearch(timeout=QUERY_TIMEOUT)

        if not cp.Corpus.isIndexOpen(self.index_name):
            try:
                self.es.indices.open(self.index_name)
                time.sleep(10)
            except TransportError as e:
                print(e)

        if max_results:
            self.max_results = max_results
        else:
            self.max_results = MAX_RESULTS_RECALL

        self.method = method  # never used!
        self.logger = logger
        self.last_query = {}
        self.save_terms = save_terms
        self.default_field = "text"
        self.tie_breaker = 0
        if not multi_match_type:
            self.multi_match_type = "best_fields"
        else:
            self.multi_match_type = multi_match_type

    def rewriteQueryAsDSL1(self, structured_query, parameters):
        """
            Creates a multi_match DSL query for elasticsearch.

            :param structured_query: a StructuredQuery dict, optionally under the
                key "structured_query"
            :param parameters: dict of [field]=weight to replace in the query
        """
        if "structured_query" in structured_query:
            structured_query = structured_query["structured_query"]

        if not isinstance(structured_query, StructuredQuery):
            structured_query = StructuredQuery(structured_query)

        if not structured_query or len(structured_query) == 0:
            return None

        self.last_query = structured_query

        lucene_query = ""

        for token in structured_query:
            # TODO proper computing of the boost formula. Different methods?
            t_boost = token.boost
            t_count = token.count

            if t_boost is None:
                print("NULL! ")
                print(token, token.boost, token.count)
                t_boost = 0
            if t_count is None:
                print("NULL! ")
                print(token, token.boost, token.count)
                t_count = 0

            boost = t_boost * t_count

            if boost == 0.0:
                continue

            bool_val = token.bool or ""

            token_text = token.token
            if " " in token_text:  # if token is a phrase
                token_text = "\"" + token_text + "\""

            lucene_query += "%s%s " % (bool_val, token_text)
            ##            if boost != 1:
            ##                lucene_query+="^%s" %str(boost)

            if boost != 1:
                token_str = token_text + " "
                lucene_query += bool_val + (token_str * int(boost - 1))

            lucene_query = lucene_query.strip()
            lucene_query += " "

        lucene_query = lucene_query.replace("  ", " ")

        fields = []
        for param in parameters:
            fields.append(param + "^" + str(parameters[param]))

        dsl_query = {
            "multi_match": {
                "query": lucene_query,
                "type": self.multi_match_type,
                "fields": fields,
                "operator": "or",
            }
        }

        ##        print(dsl_query)

        if self.tie_breaker:
            dsl_query["multi_match"]["tie_breaker"] = self.tie_breaker

        return dsl_query

    def rewriteQueryAsDSL2(self, structured_query, parameters):
        """
            Creates a multi_match DSL query for elasticsearch. Version 2

            :param structured_query: a StructuredQuery dict, optionally under the
                key "structured_query"
            :param parameters: dict of [field]=weight to replace in the query
        """
        if "structured_query" in structured_query:
            structured_query = structured_query["structured_query"]

        if not isinstance(structured_query, StructuredQuery):
            structured_query = StructuredQuery(structured_query)

        if not structured_query or len(structured_query) == 0:
            return None

        self.last_query = structured_query

        lucene_query = ""

        for token in structured_query:
            boost = token.boost * token.count
            bool_val = token.bool or ""

            token_text = token.token
            if " " in token_text:  # if token is a phrase
                token_text = "\"" + token_text + "\""

            lucene_query += "%s%s " % (bool_val, token_text)

            if boost != 1:
                token_str = token_text + " "
                lucene_query += bool_val + (token_str * int(boost - 1))

            lucene_query = lucene_query.strip()
            lucene_query += " "

        elastic_query = {"bool": {"should": []}}

        fields = []
        for param in parameters:
            fields.append(param + "^" + str(parameters[param]))

        dsl_query = {
            "multi_match": {
                "query": lucene_query,
                "type": self.multi_match_type,
                "fields": fields,
                "operator": "or",
            }
        }

        ##        print(dsl_query)

        if self.tie_breaker:
            dsl_query["multi_match"]["tie_breaker"] = self.tie_breaker

        return dsl_query

    def rewriteQueryAsDSL(self, structured_query, parameters):
        """
            Creates a DSL query for elasticsearch. Version 3, uses individual "term" and "match" queries

            :param structured_query: a StructuredQuery dict, optionally under the
                key "structured_query"
            :param parameters: dict of [field]=weight to replace in the query
        """
        if isinstance(structured_query,
                      dict) and "structured_query" in structured_query:
            structured_query = structured_query["structured_query"]

        if not isinstance(structured_query, StructuredQuery):
            structured_query = StructuredQuery(structured_query)

        if not structured_query or len(structured_query) == 0:
            return None

        self.last_query = structured_query

        field_dicts = []

        for token in structured_query:
            # TODO proper computing of the boost formula. Different methods?
            boost = token.boost * token.count
            bool_val = token.bool or ""

            token_text = token.token
            # if " " in token_text:  # if token is a phrase
            #     token_text = "\"" + token_text + "\""

            if boost == 0.0:
                continue

            for field in parameters:
                if " " in token_text:
                    new_dict = {
                        "match_phrase": {
                            field: {
                                "query": token_text,
                                "boost": parameters[field] * boost
                            },
                        }
                    }

                else:
                    new_dict = {
                        "term": {
                            field: {
                                "value": token_text,
                                "boost": parameters[field] * boost
                            },
                        }
                    }

                field_dicts.append(new_dict)

        fields = []
        for param in parameters:
            fields.append(param + "^" + str(parameters[param]))

        dsl_query = {"bool": {"should": field_dicts}}

        return dsl_query

    def runQuery(self, structured_query, max_results=None):
        """
            Interfaces with the elasticsearch query API
        """
        if not structured_query or len(structured_query) == 0:
            return []

        if not max_results:
            max_results = self.max_results

        self.last_query = dict(structured_query)
        # rewriteQueryAsDSL expects a dict of field -> weight
        dsl_query = self.rewriteQueryAsDSL(
            structured_query["structured_query"], {self.default_field: 1})

        res = self.es.search(
            body={"query": dsl_query},
            size=max_results,
            index=self.index_name,
            doc_type=ES_TYPE_DOC,
            request_timeout=QUERY_TIMEOUT,
        )

        structured_query["dsl_query"] = dsl_query
        hits = res["hits"]["hits"]
        ##        print("Found %d document(s) that matched query '%s':" % (res['hits']['total'], query))

        ##        if len(hits.scoreDocs) ==0:
        ##            print "Original query:",original_query
        ##            print "Query:", query
        result = []
        for hit in hits:
            metadata = hit["_source"]["metadata"]
            result.append((hit["_score"], metadata))
        return result

    def formulaFromExplanation(self, query, doc_id):
        """
            Runs .explain() for one query/doc pair, generates and returns a \
            StoredFormula instance from it

            :param query: StructuredQuery dict, with a "dsl_query" key
            :param doc_id: id of document to run .explain() for
            :returns: a StoredFormula instance (empty if the explain call failed)
        """
        explanation = None
        retries = 0
        while retries < 2:
            try:
                explanation = self.es.explain(
                    index=self.index_name,
                    doc_type=ES_TYPE_DOC,
                    body={"query": query["dsl_query"]},
                    id=doc_id,
                    request_timeout=QUERY_TIMEOUT,
                )
                break
            except Exception as e:
                ##                logging.error("Exception, retrying...")
                retries += 1

        if retries > 0:
            if explanation is None:
                logging.error("Failed to retrieve explanation after %d attempts." % retries)
            else:
                logging.warning("Retried %d time(s), retrieved successfully." % retries)

        formula = StoredFormula()
        if explanation:
            formula.fromElasticExplanation(explanation, self.save_terms)
        return formula