def visit_exact_match_value(self, node, fieldnames=None):
        """Build an exact-match (``term``) query for the value held by ``node``.

        Args:
            node: Value node; only ``node.value`` is read here.
            fieldnames: A single fieldname or a list of fieldnames. When falsy,
                the query targets the ``_all`` field.

        Returns:
            The result of the dedicated generator for the ``exact-author``,
            ``type-code`` and ``journal`` keywords; otherwise the per-field
            ``term`` queries, combined by
            ``wrap_queries_in_bool_clauses_if_more_than_one`` with
            ``use_must_clause=False``.
        """
        keyword_to_field = ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME
        fieldnames = force_list(fieldnames) if fieldnames else ['_all']

        # Keywords that have their own query generator.
        if keyword_to_field['exact-author'] == fieldnames[0]:
            return self._generate_exact_author_query(node.value)
        if keyword_to_field['type-code'] == fieldnames[0]:
            return self._generate_type_code_query(node.value)
        # NOTE: unlike the two checks above, this one compares against the
        # whole fieldnames list, not only its first element.
        if keyword_to_field['journal'] == fieldnames:
            return self._generate_journal_nested_queries(node.value)

        bai_fieldnames = self._generate_fieldnames_if_bai_query(
            node.value,
            bai_field_variation=FieldVariations.raw,
            query_bai_field_if_dots_in_name=False)

        if keyword_to_field['date'] == fieldnames:

            def _date_term_query(date_field):
                # The date value is truncated per date field before querying.
                truncated_date = _truncate_date_value_according_on_date_field(
                    date_field, node.value)
                date_term = {'term': {date_field: truncated_date.dumps()}}
                if date_field in ElasticSearchVisitor.DATE_NESTED_FIELDS:
                    return generate_nested_query(
                        ElasticSearchVisitor.DATE_NESTED_QUERY_PATH, date_term)
                return date_term

            term_queries = [
                _date_term_query(date_field) for date_field in fieldnames
            ]
        elif keyword_to_field['author'] in fieldnames:
            # Author terms are wrapped in a nested query on the authors path;
            # BAI fieldnames, when available, replace the given fieldnames.
            term_queries = [
                generate_nested_query(
                    ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH,
                    {'term': {target_field: node.value}})
                for target_field in (bai_fieldnames or fieldnames)
            ]
        else:
            term_queries = [
                {'term': {target_field: node.value}}
                for target_field in (bai_fieldnames or fieldnames)
            ]

        return wrap_queries_in_bool_clauses_if_more_than_one(
            term_queries, use_must_clause=False)
    def _generate_exact_author_query(self, author_name_or_bai):
        """Build a nested ``term`` query for an exact author name or a BAI.

        Notes:
            If the value matches ``ElasticSearchVisitor.BAI_REGEX``, it is
            lowercased and looked up in the ``FieldVariations.search``
            variation of ``ElasticSearchVisitor.AUTHORS_BAI_FIELD``.

            Otherwise the value is passed through ``normalize_name``, then NFKC
            normalization, then lowercased, and looked up in the field mapped
            to the ``exact-author`` keyword.
            NOTE(review): presumably this mirrors how the value is normalized
            at index time -- confirm against the index mapping.
        """
        if ElasticSearchVisitor.BAI_REGEX.match(author_name_or_bai):
            term_value = author_name_or_bai.lower()
            term_field = '.'.join((ElasticSearchVisitor.AUTHORS_BAI_FIELD,
                                   FieldVariations.search))
        else:
            normalized_name = normalize_name(author_name_or_bai)
            term_value = normalize('NFKC', normalized_name).lower()
            term_field = ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME[
                'exact-author']

        term_query = self._generate_term_query(term_field, term_value)
        return generate_nested_query(
            ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH, term_query)
    def _generate_author_query(self, author_name):
        """Build a nested query dedicated to author searches.

        Notes:
            The ``must`` clause is a plain ``match`` on the author field, which
            is generic enough to return many results. The ``filter`` clause
            narrows these down using the lowercased minimal name variations of
            ``author_name``, e.g. so that searching for ``Smith, John`` does
            not return papers of ``Smith, Bob``.

            When ``author_name`` contains full names, every filter alternative
            additionally requires a ``match`` built with
            ``with_operator_and=True`` on the author field.
        """
        author_field = ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME['author']
        variations_field = ElasticSearchVisitor.AUTHORS_NAME_VARIATIONS_FIELD

        name_variations = [
            variation.lower()
            for variation in generate_minimal_name_variations(author_name)
        ]

        if author_name_contains_fullnames(author_name):
            # With full names (e.g. ``Mele, Salvatore`` rather than ``Mele, S``
            # or ``Mele``) pair every variation with every variation, so that
            # authors who merely share a first-name prefix (e.g.
            # ``Mele, Samuele``) are filtered out.
            specialized_author_filter = [
                {
                    'bool': {
                        'must': [
                            {'term': {variations_field: term_variation}},
                            generate_match_query(
                                author_field,
                                match_variation,
                                with_operator_and=True),
                        ]
                    }
                }
                for term_variation in name_variations
                for match_variation in name_variations
            ]
        else:
            # Initials or a single last name: filter on the variations only.
            specialized_author_filter = [
                {'term': {variations_field: variation}}
                for variation in name_variations
            ]

        author_query = {
            'bool': {
                'filter': {'bool': {'should': specialized_author_filter}},
                'must': {'match': {author_field: author_name}},
            }
        }

        return generate_nested_query(
            ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH, author_query)
示例#4
0
    def visit_regex_value(self, node, fieldname):
        """Build a ``regexp`` query on ``fieldname`` for ``node.value``.

        The query is wrapped in a nested query on the authors path when
        ``fieldname`` is the field mapped to the ``author`` keyword.
        """
        regexp_query = {'regexp': {fieldname: node.value}}

        targets_author_field = \
            self.KEYWORD_TO_ES_FIELDNAME['author'] == fieldname
        if not targets_author_field:
            return regexp_query

        return generate_nested_query(
            self.AUTHORS_NESTED_QUERY_PATH, regexp_query)
def test_generate_nested_query_returns_empty_dict_on_falsy_query():
    """An empty (falsy) query is expected to come back as an empty dict."""
    empty_query = {}

    assert generate_nested_query('journal', empty_query) == {}
    def _generate_journal_nested_queries(self, value):
        """Build the nested ElasticSearch query for a journal search.

        Args:
            value (string): The journal title, journal volume and artid or
                start page, separated by commas.

        Notes:
            The value is preprocessed by
            ``ElasticSearchVisitor._preprocess_journal_query_value`` into a
            mapping keyed by journal field. A title is always expected (a
            query without one is considered malformed and should not reach
            this method); volume and the third item are optional.

            The third item may be either an artid or a start page, so it is
            matched against both fields and either match is accepted.
        """
        es_visitor = ElasticSearchVisitor

        def _match_on(journal_key, text):
            return generate_match_query(
                es_visitor.JOURNAL_FIELDS_MAPPING[journal_key],
                text,
                with_operator_and=False)

        # The third item is stored under the page-start key regardless of
        # whether it really is a page start or an artid.
        third_item_key = es_visitor.JOURNAL_PAGE_START
        publication_info = es_visitor._preprocess_journal_query_value(
            third_item_key, value)

        per_field_queries = [
            _match_on(es_visitor.JOURNAL_TITLE,
                      publication_info[es_visitor.JOURNAL_TITLE])
        ]

        if es_visitor.JOURNAL_VOLUME in publication_info:
            per_field_queries.append(
                _match_on(es_visitor.JOURNAL_VOLUME,
                          publication_info[es_visitor.JOURNAL_VOLUME]))

        if third_item_key in publication_info:
            third_item = publication_info[third_item_key]
            either_field_queries = [
                _match_on(candidate_key, third_item)
                for candidate_key in (es_visitor.JOURNAL_PAGE_START,
                                      es_visitor.JOURNAL_ART_ID)
            ]
            per_field_queries.append(
                wrap_queries_in_bool_clauses_if_more_than_one(
                    either_field_queries, use_must_clause=False))

        # Every provided journal item has to match.
        combined_query = wrap_queries_in_bool_clauses_if_more_than_one(
            per_field_queries, use_must_clause=True)

        return generate_nested_query(
            es_visitor.JOURNAL_FIELDS_PREFIX, combined_query)
    def _generate_range_queries(self, fieldnames, operator_value_pairs):
        """Build ElasticSearch ``range`` queries, one per fieldname.

        Args:
            fieldnames (list): The fieldnames the range query is targeted on.
            operator_value_pairs (dict): Maps a range operator to its value.
                The operator should be one supported by ElasticSearch (e.g.
                'gt', 'gte', 'lt', 'lte').

        Notes:
            The per-field queries are combined with ``use_must_clause=False``,
            so that a document missing some of the fields can still match.

            For the 'date' keyword, the values are first normalized per field
            with ``update_date_value_in_operator_value_pairs_for_fieldname``.
            If that yields a falsy result (malformed date), no further fields
            are processed and only the queries built so far are combined.
            Date fields listed in ``DATE_NESTED_FIELDS`` get their range query
            wrapped in a nested query.
        """
        is_date_query = \
            ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME['date'] == fieldnames

        if not is_date_query:
            plain_range_queries = [
                {'range': {target_field: operator_value_pairs}}
                for target_field in fieldnames
            ]
            return wrap_queries_in_bool_clauses_if_more_than_one(
                plain_range_queries, use_must_clause=False)

        date_range_queries = []
        for date_field in fieldnames:
            normalized_pairs = \
                update_date_value_in_operator_value_pairs_for_fieldname(
                    date_field, operator_value_pairs)
            if not normalized_pairs:
                break  # Malformed date

            date_range_query = {'range': {date_field: normalized_pairs}}
            if date_field in ElasticSearchVisitor.DATE_NESTED_FIELDS:
                date_range_query = generate_nested_query(
                    ElasticSearchVisitor.DATE_NESTED_QUERY_PATH,
                    date_range_query)
            date_range_queries.append(date_range_query)

        return wrap_queries_in_bool_clauses_if_more_than_one(
            date_range_queries, use_must_clause=False)
    def visit_partial_match_value(self, node, fieldnames=None):
        """Build a query looking for ``node.value`` as a substring of the fields.

        Date, ``exact-author``, ``type-code`` and ``journal`` searches are
        delegated to their dedicated generators. Everything else becomes a
        query-string query on the value surrounded by wildcards, wrapped in a
        nested authors query when the author field is among the targets.
        """
        keyword_to_field = ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME

        if keyword_to_field['date'] == fieldnames:
            # Partial date values become range queries between the given date
            # and the next one, according to the granularity of the value.
            if node.contains_wildcard:
                return self._generate_date_with_wildcard_query(node.value)

            return self._generate_range_queries(
                force_list(fieldnames), {ES_RANGE_EQ_OPERATOR: node.value})

        if keyword_to_field['exact-author'] == fieldnames:
            return self._generate_exact_author_query(node.value)

        if keyword_to_field['type-code'] == fieldnames:
            return self._generate_type_code_query(node.value)

        if keyword_to_field['journal'] == fieldnames:
            return self._generate_journal_nested_queries(node.value)

        # Surround the value with wildcards, unless it already has them.
        wildcard_token = ast.GenericValue.WILDCARD_TOKEN
        leading = '' if node.value.startswith(wildcard_token) else '*'
        trailing = '' if node.value.endswith(wildcard_token) else '*'
        value = leading + node.value + trailing

        bai_fieldnames = self._generate_fieldnames_if_bai_query(
            node.value,
            bai_field_variation=FieldVariations.search,
            query_bai_field_if_dots_in_name=True)

        query = self._generate_query_string_query(
            value,
            fieldnames=bai_fieldnames or fieldnames,
            analyze_wildcard=True)

        author_field = keyword_to_field['author']
        targets_author_field = any(
            candidates and author_field in candidates
            for candidates in (bai_fieldnames, fieldnames))
        if not targets_author_field:
            return query

        return generate_nested_query(
            ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH, query)
def test_generate_nested_query():
    """A non-empty query is expected to be wrapped in a ``nested`` clause."""

    def _journal_bool_query():
        # A fresh dict on every call, so that the input and the expectation
        # never share state.
        return {
            'bool': {
                'must': [
                    {'match': {'journal.title': 'Phys.Rev'}},
                    {'match': {'journal.volume': 'D42'}},
                ]
            }
        }

    expected_query = {
        'nested': {
            'path': 'journal',
            'query': _journal_bool_query(),
        }
    }

    generated_query = generate_nested_query('journal', _journal_bool_query())

    assert generated_query == expected_query
    def visit_value(self, node, fieldnames=None):
        """Build the query for a simple value, dispatching on the fieldnames.

        Args:
            node: Value node; ``node.value`` and ``node.contains_wildcard`` are
                read here.
            fieldnames: A fieldname (string) or a list of fieldnames. When
                falsy, the ``_all`` field is used.

        Notes:
            Three cases are handled in order:

            1) Values with wildcards: a wildcard date query for the 'date'
               keyword, otherwise a query-string query (nested for authors).
            2) A list of fieldnames: dedicated queries for 'date' and
               'journal', otherwise a ``multi_match`` on the fields.
            3) A single fieldname: dedicated queries for the known keywords;
               for a fieldname that is not a known ElasticSearch field, the
               ``fieldname:value`` text is additionally searched as a texkey
               and in ``_all``; otherwise a plain ``match`` query.
        """
        keyword_to_field = ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME

        if not fieldnames:
            fieldnames = '_all'

        # 1) Values containing wildcards.
        if node.contains_wildcard:
            if keyword_to_field['date'] == fieldnames:
                return self._generate_date_with_wildcard_query(node.value)

            bai_fieldnames = self._generate_fieldnames_if_bai_query(
                node.value,
                bai_field_variation=FieldVariations.search,
                query_bai_field_if_dots_in_name=True)

            wildcard_query = self._generate_query_string_query(
                node.value,
                fieldnames=bai_fieldnames or fieldnames,
                analyze_wildcard=True)

            if keyword_to_field['author'] == fieldnames:
                return generate_nested_query(
                    ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH,
                    wildcard_query)
            return wildcard_query

        # 2) Several fieldnames at once.
        if isinstance(fieldnames, list):
            if keyword_to_field['date'] == fieldnames:
                # Simple date values become range queries between the given
                # date and the next one, according to the value granularity.
                return self._generate_range_queries(
                    force_list(fieldnames),
                    {ES_RANGE_EQ_OPERATOR: node.value})

            if keyword_to_field['journal'] == fieldnames:
                return self._generate_journal_nested_queries(node.value)

            return {
                'multi_match': {
                    'fields': fieldnames,
                    'query': node.value,
                }
            }

        # 3) A single fieldname.
        if keyword_to_field['author'] == fieldnames:
            bai_fieldnames = self._generate_fieldnames_if_bai_query(
                node.value,
                bai_field_variation=FieldVariations.search,
                query_bai_field_if_dots_in_name=True)

            if not bai_fieldnames:
                return self._generate_author_query(node.value)

            if len(bai_fieldnames) == 1:
                bai_match_query = {"match": {bai_fieldnames[0]: node.value}}
                return generate_nested_query(
                    ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH,
                    bai_match_query)

            # Not an exact BAI pattern match, but the value looks like a BAI
            # (no spaces and dots), e.g. `S.Mele`: generate a partial match.
            return self.visit_partial_match_value(node, bai_fieldnames)

        if keyword_to_field['exact-author'] == fieldnames:
            return self._generate_exact_author_query(node.value)

        if keyword_to_field['irn'] == fieldnames:
            return {'term': {fieldnames: 'SPIRES-' + node.value}}

        if keyword_to_field['title'] == fieldnames:
            return self._generate_title_queries(node.value)

        if keyword_to_field['type-code'] == fieldnames:
            return self._generate_type_code_query(node.value)

        if fieldnames not in keyword_to_field.values():
            # Unknown fieldname: besides the field itself, look for the
            # `fieldname:value` text as a texkey and in the `_all` field.
            colon_value = ':'.join([fieldnames, node.value])
            alternative_queries = [
                generate_match_query(
                    fieldnames, node.value, with_operator_and=True),
                self._generate_term_query(
                    'texkeys.raw', colon_value, boost=2.0),
                generate_match_query(
                    '_all', colon_value, with_operator_and=True),
            ]
            return wrap_queries_in_bool_clauses_if_more_than_one(
                alternative_queries, use_must_clause=False)

        return generate_match_query(
            fieldnames, node.value, with_operator_and=True)
示例#11
0
    def _generate_author_query(self, author_name):
        """Build a nested authors query from a free-form author name.

        Notes:
            * A name that ``ParsedName`` reports as having a single part is
              treated as a last name and matched against the field mapped to
              ``author_last_name_raw``.
            * Otherwise the name is split into a last name and first-name
              tokens: ``Last, First ...`` when a comma is present, else
              ``First ... Last``.
            * A first-name token that looks like an initial (a single
              character, or containing a dot) becomes a ``match`` on
              ``authors.first_name``; any other token becomes a lowercased
              ``prefix`` on ``authors.first_name.raw``.
            * Tokens are split on runs of whitespace, so repeated or trailing
              spaces and the dot-to-space replacement (e.g. ``Smith, J.``)
              never produce empty tokens, and therefore never produce empty
              ``match``/``prefix`` clauses or an empty last name.
        """
        parsed_name = ParsedName(author_name)

        def _is_initial(name_part):
            return len(name_part) == 1 or u'.' in name_part

        # A single name part is treated just like a last name.
        if len(parsed_name) == 1:
            query = {
                'bool': {
                    'must': {
                        'match': {
                            ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME['author_last_name_raw']:
                            author_name
                        }
                    }
                }
            }
            return generate_nested_query(
                ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH, query)

        if ',' not in author_name:
            # ``First ... Last``: the last whitespace-separated token is the
            # last name, everything before it is a first name.
            parts = author_name.split()
            lastname = parts[-1] if parts else ''
            firstnames = parts[:-1]
        else:
            # ``Last, First ...``: dots are turned into separators, so that
            # ``J.D.`` yields the two initials ``J`` and ``D``.
            parts = author_name.split(',')
            lastname = parts[0]
            firstnames = parts[1].replace('.', ' ').split()

        # One clause per first-name token; all of them have to match.
        query_build = []
        for name in firstnames:
            if _is_initial(name):
                query_build.append(
                    {"match": {
                        "authors.first_name": name
                    }})
            else:
                query_build.append({
                    "prefix": {
                        "authors.first_name.raw": name.lower()
                    }
                })

        query = {
            'bool': {
                'must': [{
                    "match": {
                        "authors.last_name.raw": {
                            "query": lastname.strip(),
                            "operator": "AND"
                        }
                    }
                }, {
                    "bool": {
                        "must": query_build
                    }
                }]
            }
        }
        return generate_nested_query(
            ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH, query)
示例#12
0
    def _generate_author_query(self, author_name):
        """Build a nested query dedicated to author searches.

        Notes:
            There are three main cases:

            1) ``a Smith``
            A single name part without dots is considered a last name and
            yields a ``match`` query on the last-name field only.

            2) ``a John Smith``
            A ``match`` query on the last name, plus, for the first name,
            either a ``match_phrase_prefix`` query or a ``match`` query using
            the ``names_initials_analyzer`` on the first-name field.

            3) ``a J Smith``
            A ``match`` query on the last name, plus a ``match`` query using
            the ``names_initials_analyzer`` on the first-name initials field.

            First names are retokenized with ``retokenize_first_names`` before
            the per-name queries are built; all per-name queries have to
            match.
        """
        es_fieldname = self.KEYWORD_TO_ES_FIELDNAME
        parsed_name = ParsedName(author_name)

        def _initials_match(keyword, text):
            return {
                "match": {
                    es_fieldname[keyword]: {
                        "query": text,
                        'operator': 'AND',
                        "analyzer": "names_initials_analyzer"
                    }
                }
            }

        def _all_terms_match(keyword, text):
            return {
                'match': {
                    es_fieldname[keyword]: {
                        'query': text,
                        'operator': 'AND'
                    }
                }
            }

        def _phrase_prefix(keyword, text):
            return {
                "match_phrase_prefix": {
                    es_fieldname[keyword]: {
                        "query": text,
                        "analyzer": "names_analyzer"
                    }
                }
            }

        if len(parsed_name) == 1 and '.' not in parsed_name.first:
            # ParsedName stores a lone name (e.g. `Smith`) as the first name;
            # here it is considered a last name.
            lone_name_query = _all_terms_match(
                "author_last_name", parsed_name.first)
            return generate_nested_query(
                self.AUTHORS_NESTED_QUERY_PATH, lone_name_query)

        last_name_query = _all_terms_match(
            "author_last_name", parsed_name.last)

        per_name_queries = []
        for first_name in retokenize_first_names(parsed_name.first_list):
            if is_initial_of_a_name(first_name):
                alternatives = [
                    _initials_match("author_first_name_initials", first_name)
                ]
            else:
                alternatives = [
                    _phrase_prefix("author_first_name", first_name),
                    _initials_match("author_first_name", first_name),
                ]
            # Any of the alternatives is enough for a given first name.
            per_name_queries.append(
                wrap_queries_in_bool_clauses_if_more_than_one(
                    alternatives, use_must_clause=False))

        first_names_query = wrap_queries_in_bool_clauses_if_more_than_one(
            per_name_queries, use_must_clause=True)

        query = wrap_queries_in_bool_clauses_if_more_than_one(
            [last_name_query, first_names_query], use_must_clause=True)
        return generate_nested_query(self.AUTHORS_NESTED_QUERY_PATH, query)