def _generate_journal_nested_queries(self, value):
    """Build the nested ElasticSearch query for a journal search.

    Args:
        value (string): The journal_title, journal_volume and artid or start_page, separated by commas.

    Notes:
        Between one and three of the items above are expected, always in that order. Turning ``value``
        into a per-field mapping is delegated to ``_preprocess_journal_query_value``.

        The third item may be either an article id or a start page, so when it is present it is matched
        against both corresponding ES fields as alternatives (``use_must_clause=False``). The per-field
        queries themselves are combined with ``use_must_clause=True`` and wrapped in a nested query on
        the journal fields prefix.
    """
    visitor = ElasticSearchVisitor
    fields_mapping = visitor.JOURNAL_FIELDS_MAPPING

    # Which of the two the third item really is does not matter here, only whether it exists.
    third_key = visitor.JOURNAL_PAGE_START
    publication_info = visitor._preprocess_journal_query_value(third_key, value)

    # The title is mandatory: a query without it is considered malformed and never reaches this method.
    per_field_queries = [
        generate_match_query(
            fields_mapping[visitor.JOURNAL_TITLE],
            publication_info[visitor.JOURNAL_TITLE],
            with_operator_and=False,
        )
    ]

    if visitor.JOURNAL_VOLUME in publication_info:
        volume_query = generate_match_query(
            fields_mapping[visitor.JOURNAL_VOLUME],
            publication_info[visitor.JOURNAL_VOLUME],
            with_operator_and=False,
        )
        per_field_queries.append(volume_query)

    if third_key in publication_info:
        third_value = publication_info[third_key]
        alternatives = []
        for candidate in (visitor.JOURNAL_PAGE_START, visitor.JOURNAL_ART_ID):
            alternatives.append(
                generate_match_query(
                    fields_mapping[candidate],
                    third_value,
                    with_operator_and=False,
                )
            )
        per_field_queries.append(
            wrap_queries_in_bool_clauses_if_more_than_one(
                alternatives, use_must_clause=False))

    combined = wrap_queries_in_bool_clauses_if_more_than_one(
        per_field_queries, use_must_clause=True)
    return generate_nested_query(visitor.JOURNAL_FIELDS_PREFIX, combined)
def test_wrap_queries_in_bool_clauses_if_more_than_one_with_one_query_generates_should_clause():
    """A lone query stays inside a ``should`` clause when bool semantics must be preserved."""
    result = wrap_queries_in_bool_clauses_if_more_than_one(
        [{'match': {'title': 'collider'}}],
        use_must_clause=False,
        preserve_bool_semantics_if_one_clause=True,
    )

    assert result == {
        'bool': {
            'should': [
                {'match': {'title': 'collider'}},
            ]
        }
    }
def _generate_queries_for_title_symbols(title_field, query_value):
    """Query the whitespace tokenized title field for every symbol-like token of the value.

    Args:
        title_field (string): Name of the title field; the ``FieldVariations.search`` variation of it is queried.
        query_value (string): The title value given by the user.

    Returns:
        (dict): Whatever ``wrap_queries_in_bool_clauses_if_more_than_one`` produces for the collected
            match queries with ``use_must_clause=True``.

    Notes:
        The value is split on whitespace. A token is treated as a symbol (e.g. "g-2", "SU(2)") when it
        contains at least one of ``ElasticSearchVisitor.TITLE_SYMBOL_INDICATING_CHARACTER``; this is a
        heuristic. Only such tokens produce a query.
    """
    def _looks_like_symbol(token):
        # Heuristic: a single symbol-indicating character is enough to flag the whole token.
        for character in ElasticSearchVisitor.TITLE_SYMBOL_INDICATING_CHARACTER:
            if character in token:
                return True
        return False

    queries_for_symbols = [
        generate_match_query(
            '.'.join([title_field, FieldVariations.search]),
            token,
            with_operator_and=False,
        )
        for token in query_value.split()
        if _looks_like_symbol(token)
    ]

    return wrap_queries_in_bool_clauses_if_more_than_one(
        queries_for_symbols, use_must_clause=True)
def test_wrap_queries_in_bool_clauses_if_more_than_one_with_one_query_preserves_bool_clause_with_flag_enabled():
    """With the preserve flag set, a lone query is still wrapped in a ``must`` clause."""
    result = wrap_queries_in_bool_clauses_if_more_than_one(
        [{'match': {'title': 'collider'}}],
        use_must_clause=True,
        preserve_bool_semantics_if_one_clause=True,
    )

    assert result == {
        'bool': {
            'must': [
                {'match': {'title': 'collider'}},
            ]
        }
    }
def test_wrap_queries_in_bool_clauses_if_more_than_one_with_two_queries():
    """Two queries end up side by side, in order, inside a ``must`` clause."""
    title_match = {'match': {'title': 'collider'}}
    subject_match = {'match': {'subject': 'hep'}}

    result = wrap_queries_in_bool_clauses_if_more_than_one(
        [title_match, subject_match], use_must_clause=True)

    assert result == {
        'bool': {
            'must': [
                {'match': {'title': 'collider'}},
                {'match': {'subject': 'hep'}},
            ]
        }
    }
def _generate_title_queries(self, value):
    """Combine the regular title match query with the queries for symbol-like tokens.

    Args:
        value (string): The title value given by the user.

    Notes:
        The title field is always queried with the AND operator. The output of
        ``_generate_queries_for_title_symbols`` is added only when it is non-empty, and the surviving
        queries are combined with ``use_must_clause=True``.
    """
    title_field = ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME['title']

    candidates = (
        generate_match_query(title_field, value, with_operator_and=True),
        ElasticSearchVisitor._generate_queries_for_title_symbols(title_field, value),
    )
    # Drop empty results so that they don't end up as clauses of the bool query.
    non_empty_queries = list(filter(None, candidates))

    return wrap_queries_in_bool_clauses_if_more_than_one(
        non_empty_queries, use_must_clause=True)
def test_wrap_queries_in_bool_clauses_if_more_than_one_with_no_query_returns_empty_dict():
    """No queries at all must yield an empty dict rather than an empty bool clause."""
    result = wrap_queries_in_bool_clauses_if_more_than_one(
        [], use_must_clause=True)

    assert result == {}
def _generate_boolean_query(self, node):
    """Generate the bool query for a binary boolean node.

    Args:
        node: A node exposing ``left`` and ``right`` children; both are visited with this visitor.

    Notes:
        Children whose visit yields a falsy result are left out. A ``must`` clause is requested when
        ``node`` is an ``ast.AndOp``; for any other node type ``use_must_clause`` is False. The bool
        semantics are explicitly requested to be preserved even if only one child query remains.
    """
    left_query = node.left.accept(self)
    right_query = node.right.accept(self)

    operands = list(filter(None, (left_query, right_query)))
    is_conjunction = isinstance(node, ast.AndOp)

    return wrap_queries_in_bool_clauses_if_more_than_one(
        operands,
        use_must_clause=is_conjunction,
        preserve_bool_semantics_if_one_clause=True,
    )
def visit_exact_match_value(self, node, fieldnames=None):
    """Generate term queries (exact search in ElasticSearch) for the value of ``node``.

    Args:
        node: The exact match value node; only its ``value`` is read here.
        fieldnames: A single fieldname or a list of them. Falls back to ``['_all']`` when nothing is given.

    Notes:
        The ``exact-author``, ``type-code`` and ``journal`` keywords are handed over to their dedicated
        generators. For everything else one term query per field is generated (nested for the date
        fields listed in ``DATE_NESTED_FIELDS`` and for author searches) and the term queries are
        combined with ``use_must_clause=False``.
    """
    keyword_fields = ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME
    fieldnames = force_list(fieldnames) if fieldnames else ['_all']

    # Keywords with a dedicated query generator take precedence over the generic term queries.
    if keyword_fields['exact-author'] == fieldnames[0]:
        return self._generate_exact_author_query(node.value)
    if keyword_fields['type-code'] == fieldnames[0]:
        return self._generate_type_code_query(node.value)
    if keyword_fields['journal'] == fieldnames:
        return self._generate_journal_nested_queries(node.value)

    bai_fieldnames = self._generate_fieldnames_if_bai_query(
        node.value,
        bai_field_variation=FieldVariations.raw,
        query_bai_field_if_dots_in_name=False,
    )

    if keyword_fields['date'] == fieldnames:
        term_queries = []
        for field in fieldnames:
            # The date value is truncated per field before being dumped into the term query.
            truncated_date = _truncate_date_value_according_on_date_field(field, node.value)
            date_query = {'term': {field: truncated_date.dumps()}}
            if field in ElasticSearchVisitor.DATE_NESTED_FIELDS:
                date_query = generate_nested_query(
                    ElasticSearchVisitor.DATE_NESTED_QUERY_PATH, date_query)
            term_queries.append(date_query)
    else:
        # The BAI specific fields, when there are any, replace the given fieldnames.
        target_fields = bai_fieldnames or fieldnames
        if keyword_fields['author'] in fieldnames:
            term_queries = [
                generate_nested_query(
                    ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH,
                    {'term': {field: node.value}},
                )
                for field in target_fields
            ]
        else:
            term_queries = [{'term': {field: node.value}} for field in target_fields]

    return wrap_queries_in_bool_clauses_if_more_than_one(
        term_queries, use_must_clause=False)
def _generate_range_queries(self, fieldnames, operator_value_pairs):
    """Generate ElasticSearch range queries.

    Args:
        fieldnames (list): The fieldnames that the range query targets.
        operator_value_pairs (dict): Contains (range_operator, value) pairs. The range_operator should be
            one of those supported by ElasticSearch (e.g. 'gt', 'gte', 'lt', 'lte'). The value should be
            of type int or string.

    Notes:
        One range query per field is generated and the queries are combined with
        ``use_must_clause=False``, so that ElasticSearch is able to match records even if some of the
        fields are missing from a document.

        For the 'date' keyword, the date values are first normalized per field by using
        :meth:`inspire_query_parser.utils.visitor_utils.update_date_value_in_operator_value_pairs_for_fieldname`
        and the queries on ``DATE_NESTED_FIELDS`` are wrapped in a nested query. When the normalization
        yields nothing for a field (malformed date), no further fields are processed; if that happens
        on the very first field, no range query exists and the result is expected to be an empty
        dictionary.
    """
    def _range(fieldname, pairs):
        return {'range': {fieldname: pairs}}

    if ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME['date'] == fieldnames:
        range_queries = []
        for fieldname in fieldnames:
            normalized_pairs = update_date_value_in_operator_value_pairs_for_fieldname(
                fieldname, operator_value_pairs)
            if not normalized_pairs:
                # Malformed date: stop here, keeping only what has been collected so far.
                break

            date_range_query = _range(fieldname, normalized_pairs)
            if fieldname in ElasticSearchVisitor.DATE_NESTED_FIELDS:
                date_range_query = generate_nested_query(
                    ElasticSearchVisitor.DATE_NESTED_QUERY_PATH, date_range_query)
            range_queries.append(date_range_query)
    else:
        range_queries = [_range(fieldname, operator_value_pairs) for fieldname in fieldnames]

    return wrap_queries_in_bool_clauses_if_more_than_one(
        range_queries, use_must_clause=False)
def test_wrap_queries_in_bool_clauses_if_more_than_one_with_one_query_drops_bool_clause_with_flag_disabled():
    """Without the preserve flag, a lone query is returned as is, with no bool clause around it."""
    result = wrap_queries_in_bool_clauses_if_more_than_one(
        [{'match': {'title': 'collider'}}], use_must_clause=True)

    assert result == {'match': {'title': 'collider'}}
def visit_value(self, node, fieldnames=None):
    """Generate the query for a plain value node, depending on the fields it targets.

    Args:
        node: The value node; its ``value`` and ``contains_wildcard`` attributes are read.
        fieldnames: Either a single fieldname (string) or a list of fieldnames. Falls back to ``'_all'``.

    Notes:
        Three cases are handled, in this order:

        1) The value contains a wildcard: a dedicated query for dates, otherwise a query string query
           (nested for authors).
        2) A list of fieldnames: range queries for dates, nested queries for journals, otherwise a
           ``multi_match`` query on all the fields.
        3) A single fieldname: keyword specific handling for ``author``, ``exact-author``, ``irn``,
           ``title`` and ``type-code``. A fieldname that is not among the values of
           ``KEYWORD_TO_ES_FIELDNAME`` additionally has ``fieldname:value`` searched as a texkey and in
           ``_all``. Anything else gets a match query with the AND operator.
    """
    if not fieldnames:
        fieldnames = '_all'

    keyword_fields = ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME
    authors_path = ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH

    # Case 1: wildcard values.
    if node.contains_wildcard:
        if keyword_fields['date'] == fieldnames:
            return self._generate_date_with_wildcard_query(node.value)

        bai_fieldnames = self._generate_fieldnames_if_bai_query(
            node.value,
            bai_field_variation=FieldVariations.search,
            query_bai_field_if_dots_in_name=True,
        )
        wildcard_query = self._generate_query_string_query(
            node.value,
            fieldnames=bai_fieldnames or fieldnames,
            analyze_wildcard=True,
        )
        if keyword_fields['author'] == fieldnames:
            return generate_nested_query(authors_path, wildcard_query)
        return wildcard_query

    # Case 2: several fieldnames at once.
    if isinstance(fieldnames, list):
        if keyword_fields['date'] == fieldnames:
            # Date queries with simple values are transformed into range queries, among the given and the exact
            # next date, according to the granularity of the given date.
            return self._generate_range_queries(
                force_list(fieldnames), {ES_RANGE_EQ_OPERATOR: node.value})

        if keyword_fields['journal'] == fieldnames:
            return self._generate_journal_nested_queries(node.value)

        return {
            'multi_match': {
                'fields': fieldnames,
                'query': node.value,
            }
        }

    # Case 3: a single fieldname.
    if keyword_fields['author'] == fieldnames:
        bai_fieldnames = self._generate_fieldnames_if_bai_query(
            node.value,
            bai_field_variation=FieldVariations.search,
            query_bai_field_if_dots_in_name=True,
        )
        if bai_fieldnames:
            if len(bai_fieldnames) != 1:
                # Not an exact BAI pattern match, but node's value looks like BAI (no spaces and dots),
                # e.g. `S.Mele`. In this case generate a partial match query.
                return self.visit_partial_match_value(node, bai_fieldnames)

            bai_query = {"match": {bai_fieldnames[0]: node.value}}
            return generate_nested_query(authors_path, bai_query)

        return self._generate_author_query(node.value)

    if keyword_fields['exact-author'] == fieldnames:
        return self._generate_exact_author_query(node.value)

    if keyword_fields['irn'] == fieldnames:
        return {'term': {fieldnames: ''.join(('SPIRES-', node.value))}}

    if keyword_fields['title'] == fieldnames:
        return self._generate_title_queries(node.value)

    if keyword_fields['type-code'] == fieldnames:
        return self._generate_type_code_query(node.value)

    if fieldnames not in keyword_fields.values():
        # The fieldname is not one that keywords map to: besides querying it as a field, search for
        # the whole `fieldname:value` string as a (boosted) texkey and in `_all`.
        colon_value = ':'.join([fieldnames, node.value])
        alternatives = [
            generate_match_query(fieldnames, node.value, with_operator_and=True),
            self._generate_term_query('texkeys.raw', colon_value, boost=2.0),
            generate_match_query('_all', colon_value, with_operator_and=True),
        ]
        return wrap_queries_in_bool_clauses_if_more_than_one(
            alternatives, use_must_clause=False)

    return generate_match_query(fieldnames, node.value, with_operator_and=True)
def _generate_author_query(self, author_name):
    """Generate the nested query used for searching authors by name.

    Args:
        author_name (string): The author name as given by the user; it is parsed with ``ParsedName``.

    Notes:
        There are three main cases:

        1) ``a Smith``
           A single name without a dot is treated as a last name: only a ``match`` query on the last
           name field is generated.

        2) ``a John Smith``
           A ``match`` query on the last name, plus, for the full first name, a
           ``match_phrase_prefix`` query and a ``match`` query using the initials analyzer as
           alternatives. This is meant to return results from ``Smith, John`` and ``Smith, J`` but
           not from ``Smith, Jane``.

        3) ``a J Smith``
           A ``match`` query on the last name and a ``match`` query on the first name initials field.

        Every first name of the (retokenized) first names list contributes its own clause, and all the
        clauses are combined with ``use_must_clause=True``.
    """
    parsed_name = ParsedName(author_name)
    field_for = self.KEYWORD_TO_ES_FIELDNAME
    authors_path = self.AUTHORS_NESTED_QUERY_PATH

    def _match_all_terms(keyword, text):
        return {
            'match': {
                field_for[keyword]: {
                    'query': text,
                    'operator': 'AND'
                }
            }
        }

    def _match_all_terms_as_initials(keyword, text):
        return {
            "match": {
                field_for[keyword]: {
                    "query": text,
                    'operator': 'AND',
                    "analyzer": "names_initials_analyzer"
                }
            }
        }

    def _match_name_prefix(keyword, text):
        return {
            "match_phrase_prefix": {
                field_for[keyword]: {
                    "query": text,
                    "analyzer": "names_analyzer"
                }
            }
        }

    def _first_name_clause(name):
        # An initial is only looked up in the initials field, while a full name may match either as
        # a prefix of the first name or through the initials analyzer.
        if is_initial_of_a_name(name):
            alternatives = [
                _match_all_terms_as_initials("author_first_name_initials", name),
            ]
        else:
            alternatives = [
                _match_name_prefix("author_first_name", name),
                _match_all_terms_as_initials("author_first_name", name),
            ]
        return wrap_queries_in_bool_clauses_if_more_than_one(
            alternatives, use_must_clause=False)

    if len(parsed_name) == 1 and '.' not in parsed_name.first:
        # ParsedName returns first name if there is only one name i.e. `Smith`
        # in our case we consider it as a lastname
        only_last_name_query = _match_all_terms("author_last_name", parsed_name.first)
        return generate_nested_query(authors_path, only_last_name_query)

    name_clauses = [_match_all_terms("author_last_name", parsed_name.last)]

    first_names = retokenize_first_names(parsed_name.first_list)
    first_name_clauses = [_first_name_clause(name) for name in first_names]
    name_clauses.append(
        wrap_queries_in_bool_clauses_if_more_than_one(
            first_name_clauses, use_must_clause=True))

    full_name_query = wrap_queries_in_bool_clauses_if_more_than_one(
        name_clauses, use_must_clause=True)
    return generate_nested_query(authors_path, full_name_query)