def _generate_journal_nested_queries(self, value):
        """Build the nested ElasticSearch query for a journal search value.

        Args:
            value (string): Comma separated journal information: the journal title, optionally followed by the
                            journal volume and then by either the article id or the starting page.

        Returns:
            The nested query, on the journal fields' prefix, combining the queries generated for each given item.

        Notes:
            The value is preprocessed into a mapping from journal item to its (stripped) text. The title is
            mandatory, since a value without it is considered malformed and should never reach this method.
            The third item cannot be told apart (article id or starting page), hence it is searched in both the
            corresponding ES fields, and matching either of them is enough.
        """
        # The preprocessing stores the third item under the page start key, regardless of what it actually is.
        third_item_key = ElasticSearchVisitor.JOURNAL_PAGE_START
        publication_info = ElasticSearchVisitor._preprocess_journal_query_value(third_item_key, value)

        es_field_for = ElasticSearchVisitor.JOURNAL_FIELDS_MAPPING
        title_key = ElasticSearchVisitor.JOURNAL_TITLE
        volume_key = ElasticSearchVisitor.JOURNAL_VOLUME

        # Title is looked up unconditionally: its absence means a malformed query.
        must_clauses = [
            generate_match_query(es_field_for[title_key], publication_info[title_key], with_operator_and=False),
        ]

        if volume_key in publication_info:
            must_clauses.append(
                generate_match_query(es_field_for[volume_key], publication_info[volume_key], with_operator_and=False))

        if third_item_key in publication_info:
            third_item_value = publication_info[third_item_key]

            either_field_clauses = []
            for candidate_key in (ElasticSearchVisitor.JOURNAL_PAGE_START, ElasticSearchVisitor.JOURNAL_ART_ID):
                either_field_clauses.append(
                    generate_match_query(es_field_for[candidate_key], third_item_value, with_operator_and=False))

            must_clauses.append(
                wrap_queries_in_bool_clauses_if_more_than_one(either_field_clauses, use_must_clause=False))

        combined_query = wrap_queries_in_bool_clauses_if_more_than_one(must_clauses, use_must_clause=True)
        return generate_nested_query(ElasticSearchVisitor.JOURNAL_FIELDS_PREFIX, combined_query)
def test_wrap_queries_in_bool_clauses_if_more_than_one_with_one_query_generates_should_clause():
    """A single query keeps its ``should`` wrapper when bool semantics are asked to be preserved."""
    single_query = {'match': {'title': 'collider'}}

    result = wrap_queries_in_bool_clauses_if_more_than_one(
        [single_query],
        use_must_clause=False,
        preserve_bool_semantics_if_one_clause=True)

    assert result == {'bool': {'should': [{'match': {'title': 'collider'}}]}}
    def _generate_queries_for_title_symbols(title_field, query_value):
        """Generate queries for the symbol-like tokens of a title value.

        Args:
            title_field: The ES title field; its ``FieldVariations.search`` variation is the one being queried.
            query_value: The title value given by the user.

        Returns:
            (dict): A single match query, or several of them combined with a ``must`` clause. When no token looks
                    like a symbol, the result of wrapping an empty query list (an empty dict).

        Notes:
            The value is split on whitespace and every token is checked, heuristically, for any of the
            symbol-indicating-characters (examples of such tokens are "g-2", "SU(2)").
        """
        indicating_characters = ElasticSearchVisitor.TITLE_SYMBOL_INDICATING_CHARACTER

        def _looks_like_symbol(token):
            # A single symbol-indicating-character is enough to treat the whole token as a symbol.
            for character in indicating_characters:
                if character in token:
                    return True
            return False

        symbol_queries = [
            generate_match_query('.'.join([title_field, FieldVariations.search]), token, with_operator_and=False)
            for token in query_value.split()
            if _looks_like_symbol(token)
        ]

        return wrap_queries_in_bool_clauses_if_more_than_one(symbol_queries, use_must_clause=True)
def test_wrap_queries_in_bool_clauses_if_more_than_one_with_one_query_preserves_bool_clause_with_flag_enabled():
    """A single query keeps its ``must`` wrapper when bool semantics are asked to be preserved."""
    single_query = {'match': {'title': 'collider'}}

    result = wrap_queries_in_bool_clauses_if_more_than_one(
        [single_query],
        use_must_clause=True,
        preserve_bool_semantics_if_one_clause=True)

    assert result == {'bool': {'must': [{'match': {'title': 'collider'}}]}}
def test_wrap_queries_in_bool_clauses_if_more_than_one_with_two_queries():
    """Two queries get combined, in the given order, under a ``must`` clause."""
    title_query = {'match': {'title': 'collider'}}
    subject_query = {'match': {'subject': 'hep'}}

    result = wrap_queries_in_bool_clauses_if_more_than_one(
        [title_query, subject_query], use_must_clause=True)

    assert result == {
        'bool': {
            'must': [
                {'match': {'title': 'collider'}},
                {'match': {'subject': 'hep'}},
            ]
        }
    }
    def _generate_title_queries(self, value):
        """Build the query for a title search value.

        An AND-operator match query on the title field is combined, under a ``must`` clause, with the queries
        generated for the symbol-like tokens of the value. Empty queries are left out before combining.
        """
        title_field = ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME['title']

        candidate_queries = (
            generate_match_query(title_field, value, with_operator_and=True),
            ElasticSearchVisitor._generate_queries_for_title_symbols(title_field, value),
        )
        non_empty_queries = list(filter(None, candidate_queries))

        return wrap_queries_in_bool_clauses_if_more_than_one(non_empty_queries, use_must_clause=True)
def test_wrap_queries_in_bool_clauses_if_more_than_one_with_no_query_returns_empty_dict():
    """Wrapping no queries at all yields an empty dict."""
    result = wrap_queries_in_bool_clauses_if_more_than_one([], use_must_clause=True)

    assert result == {}
    def _generate_boolean_query(self, node):
        """Build the bool query for a binary boolean operator node.

        Both operands are visited (left one first) and the empty results are dropped. The remaining ones are put
        under a ``must`` clause when the node is an ``ast.AndOp`` and under a ``should`` clause otherwise. The bool
        wrapper is asked to be preserved even when only one operand query remains.
        """
        left_query = node.left.accept(self)
        right_query = node.right.accept(self)

        operand_queries = list(filter(None, (left_query, right_query)))
        is_conjunction = isinstance(node, ast.AndOp)

        return wrap_queries_in_bool_clauses_if_more_than_one(
            operand_queries,
            use_must_clause=is_conjunction,
            preserve_bool_semantics_if_one_clause=True)
    def visit_exact_match_value(self, node, fieldnames=None):
        """Generates a term query (exact search in ElasticSearch).

        Args:
            node: The exact match value node; only its ``value`` is used.
            fieldnames: The field name(s) to search on. When missing, ``_all`` is searched.

        Returns:
            The dedicated query for exact-author, type-code and journal searches. Otherwise one term query per
            field (nested for author fields and for the nested date fields), combined with a ``should`` clause
            when there are several.
        """
        fieldnames = force_list(fieldnames) if fieldnames else ['_all']

        es_fieldname_of = ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME
        first_fieldname = fieldnames[0]

        # Keywords having their own dedicated query generators.
        if es_fieldname_of['exact-author'] == first_fieldname:
            return self._generate_exact_author_query(node.value)

        if es_fieldname_of['type-code'] == first_fieldname:
            return self._generate_type_code_query(node.value)

        if es_fieldname_of['journal'] == fieldnames:
            return self._generate_journal_nested_queries(node.value)

        bai_fieldnames = self._generate_fieldnames_if_bai_query(
            node.value,
            bai_field_variation=FieldVariations.raw,
            query_bai_field_if_dots_in_name=False)

        term_queries = []
        if es_fieldname_of['date'] == fieldnames:
            for field in fieldnames:
                # The date value is truncated according to the date field it is searched on.
                date_term_query = {
                    'term': {field: _truncate_date_value_according_on_date_field(field, node.value).dumps()}
                }
                if field in ElasticSearchVisitor.DATE_NESTED_FIELDS:
                    date_term_query = generate_nested_query(
                        ElasticSearchVisitor.DATE_NESTED_QUERY_PATH, date_term_query)
                term_queries.append(date_term_query)
        else:
            is_author_search = es_fieldname_of['author'] in fieldnames
            # BAI field names, when generated, take precedence over the given ones.
            for field in (bai_fieldnames or fieldnames):
                term_query = {'term': {field: node.value}}
                if is_author_search:
                    term_query = generate_nested_query(ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH, term_query)
                term_queries.append(term_query)

        return wrap_queries_in_bool_clauses_if_more_than_one(term_queries, use_must_clause=False)
    def _generate_range_queries(self, fieldnames, operator_value_pairs):
        """Generates ElasticSearch range queries.

        Args:
            fieldnames (list): The fieldnames the range query is targeted on.
            operator_value_pairs (dict): Contains (range_operator, value) pairs.
                The range_operator should be one of those supported by ElasticSearch (e.g. 'gt', 'gte', 'lt', 'lte').
                The value should be of type int or string.

        Returns:
            A single range query, or a bool ``should`` query with one range sub-query per field.

        Notes:
            A bool should query with multiple range sub-queries is generated so that even if one of the multiple
            fields is missing from a document, ElasticSearch will be able to match some records.

            In the case of a 'date' keyword query, the date values are normalized per field by using
            :meth:`inspire_query_parser.utils.visitor_utils.update_date_value_in_operator_value_pairs_for_fieldname`
            and the range queries on the nested date fields are wrapped in nested queries.
            Generation stops at the first field for which the normalization yields nothing (malformed date); when
            that is the very first field, no range query exists and the wrapping of an empty list is returned.
        """
        if ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME['date'] == fieldnames:
            range_queries = []
            for date_field in fieldnames:
                normalized_pairs = update_date_value_in_operator_value_pairs_for_fieldname(
                    date_field, operator_value_pairs)
                if not normalized_pairs:
                    # Malformed date: give up on the remaining fields too.
                    break

                date_range_query = {'range': {date_field: normalized_pairs}}
                if date_field in ElasticSearchVisitor.DATE_NESTED_FIELDS:
                    date_range_query = generate_nested_query(
                        ElasticSearchVisitor.DATE_NESTED_QUERY_PATH, date_range_query)
                range_queries.append(date_range_query)
        else:
            range_queries = []
            for fieldname in fieldnames:
                range_queries.append({'range': {fieldname: operator_value_pairs}})

        return wrap_queries_in_bool_clauses_if_more_than_one(range_queries, use_must_clause=False)
def test_wrap_queries_in_bool_clauses_if_more_than_one_with_one_query_drops_bool_clause_with_flag_disabled():
    """Without the preserve flag, a single query is returned as is, with no bool wrapper."""
    single_query = {'match': {'title': 'collider'}}

    result = wrap_queries_in_bool_clauses_if_more_than_one([single_query], use_must_clause=True)

    assert result == {'match': {'title': 'collider'}}
    def visit_value(self, node, fieldnames=None):
        """Generate the ElasticSearch query for a simple value node.

        Args:
            node: The value node; its ``value`` and ``contains_wildcard`` attributes are used.
            fieldnames: The ES field name (string) or field names (list) to search on. When missing, ``_all`` is
                searched.

        Notes:
            The query is chosen in this order:

            1) Values with wildcards: a dedicated date query, or a query string query (nested for authors).
            2) A list of fields: range queries for dates, nested queries for journals, ``multi_match`` otherwise.
            3) A single field: dedicated queries for author, exact-author, irn, title and type-code. A field that
               is not one of the known ES fields additionally gets searched as ``field:value`` in the texkeys and
               in ``_all``. Any other field gets an AND-operator match query.
        """
        if not fieldnames:
            fieldnames = '_all'

        es_fieldname_of = ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME

        # 1) Values containing wildcards.
        if node.contains_wildcard:
            if es_fieldname_of['date'] == fieldnames:
                return self._generate_date_with_wildcard_query(node.value)

            bai_fieldnames = self._generate_fieldnames_if_bai_query(
                node.value,
                bai_field_variation=FieldVariations.search,
                query_bai_field_if_dots_in_name=True)

            wildcard_query = self._generate_query_string_query(
                node.value,
                fieldnames=bai_fieldnames or fieldnames,
                analyze_wildcard=True)

            if es_fieldname_of['author'] == fieldnames:
                return generate_nested_query(ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH, wildcard_query)
            return wildcard_query

        # 2) Keywords that map to several ES fields.
        if isinstance(fieldnames, list):
            if es_fieldname_of['date'] == fieldnames:
                # Date queries with simple values are transformed into range queries, among the given and the exact
                # next date, according to the granularity of the given date.
                return self._generate_range_queries(force_list(fieldnames), {ES_RANGE_EQ_OPERATOR: node.value})

            if es_fieldname_of['journal'] == fieldnames:
                return self._generate_journal_nested_queries(node.value)

            return {'multi_match': {'fields': fieldnames, 'query': node.value}}

        # 3) A single ES field.
        if es_fieldname_of['author'] == fieldnames:
            bai_fieldnames = self._generate_fieldnames_if_bai_query(
                node.value,
                bai_field_variation=FieldVariations.search,
                query_bai_field_if_dots_in_name=True)

            if not bai_fieldnames:
                return self._generate_author_query(node.value)

            if len(bai_fieldnames) == 1:
                bai_query = {"match": {bai_fieldnames[0]: node.value}}
                return generate_nested_query(ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH, bai_query)

            # Not an exact BAI pattern match, but node's value looks like BAI (no spaces and dots),
            # e.g. `S.Mele`. In this case generate a partial match query.
            return self.visit_partial_match_value(node, bai_fieldnames)

        if es_fieldname_of['exact-author'] == fieldnames:
            return self._generate_exact_author_query(node.value)

        if es_fieldname_of['irn'] == fieldnames:
            return {'term': {fieldnames: ''.join(('SPIRES-', node.value))}}

        if es_fieldname_of['title'] == fieldnames:
            return self._generate_title_queries(node.value)

        if es_fieldname_of['type-code'] == fieldnames:
            return self._generate_type_code_query(node.value)

        if fieldnames not in es_fieldname_of.values():
            # Unknown field: besides the field itself, also try the whole `field:value` text as a texkey and in
            # the `_all` field.
            colon_value = ':'.join([fieldnames, node.value])
            given_field_query = generate_match_query(fieldnames, node.value, with_operator_and=True)
            texkey_query = self._generate_term_query('texkeys.raw', colon_value, boost=2.0)
            _all_field_query = generate_match_query('_all', colon_value, with_operator_and=True)
            return wrap_queries_in_bool_clauses_if_more_than_one(
                [given_field_query, texkey_query, _all_field_query],
                use_must_clause=False)

        return generate_match_query(fieldnames, node.value, with_operator_and=True)
예제 #13
0
    def _generate_author_query(self, author_name):
        """Generates a query handling specifically authors.

        Args:
            author_name: The author name as given in the query; it gets parsed with ``ParsedName``.

        Returns:
            A nested query, on the authors' path, with the name queries described below.

        Notes:
            There are three main cases:

            1) ``a Smith``
            This generates just a ``match`` query on ``last_name``.

            2) ``a John Smith``
            This generates a ``match`` query on ``last_name``, plus, for the first name, either a ``prefix``
            query on ``first_name`` or a ``match`` query on it using the initials analyzer. This will return
            results from ``Smith, John`` and ``Smith, J`` but not from ``Smith, Jane``.

            3) ``a J Smith``
            This generates a ``match`` query on ``last_name`` and a match query on ``first_name.initials``.

            Please note, cases such as ``J.D.`` have been properly handled by the tokenizer.

            When no first name comes out of the parsed name, only the ``last_name`` query is generated, instead of
            also putting an empty query next to it.
        """
        parsed_name = ParsedName(author_name)

        def _match_query_with_names_initials_analyzer_with_and_operator(
                field, value):
            return {
                "match": {
                    self.KEYWORD_TO_ES_FIELDNAME[field]: {
                        "query": value,
                        'operator': 'AND',
                        "analyzer": "names_initials_analyzer"
                    }
                }
            }

        def _match_query_with_and_operator(field, value):
            return {
                'match': {
                    self.KEYWORD_TO_ES_FIELDNAME[field]: {
                        'query': value,
                        'operator': 'AND'
                    }
                }
            }

        def _match_phrase_prefix_query(field, value):
            return {
                "match_phrase_prefix": {
                    self.KEYWORD_TO_ES_FIELDNAME[field]: {
                        "query": value,
                        "analyzer": "names_analyzer"
                    }
                }
            }

        if len(parsed_name) == 1 and '.' not in parsed_name.first:
            # ParsedName returns first name if there is only one name i.e. `Smith`
            # in our case we consider it as a lastname
            last_name = parsed_name.first
            query = _match_query_with_and_operator("author_last_name",
                                                   last_name)
            return generate_nested_query(self.AUTHORS_NESTED_QUERY_PATH, query)

        must_queries = [
            _match_query_with_and_operator("author_last_name",
                                           parsed_name.last)
        ]

        # One query per first name: either of its alternatives may match, but all of the first names have to.
        first_name_queries = []
        for name in retokenize_first_names(parsed_name.first_list):
            if is_initial_of_a_name(name):
                alternatives = [
                    _match_query_with_names_initials_analyzer_with_and_operator(
                        "author_first_name_initials", name)
                ]
            else:
                alternatives = [
                    _match_phrase_prefix_query("author_first_name", name),
                    _match_query_with_names_initials_analyzer_with_and_operator(
                        "author_first_name", name)
                ]
            first_name_queries.append(
                wrap_queries_in_bool_clauses_if_more_than_one(
                    alternatives, use_must_clause=False))

        first_names_query = wrap_queries_in_bool_clauses_if_more_than_one(
            first_name_queries, use_must_clause=True)
        # Wrapping no queries gives back an empty query, which must not end up inside the ``must`` clause.
        if first_names_query:
            must_queries.append(first_names_query)

        query = wrap_queries_in_bool_clauses_if_more_than_one(
            must_queries, use_must_clause=True)
        return generate_nested_query(self.AUTHORS_NESTED_QUERY_PATH, query)