def fuzzy_match(obj, eng):
    """Return ``True`` if a similar record is found in the system.

    Uses a custom configuration for ``inspire-matcher`` to find records
    similar to the current workflow object's payload in the system.

    Also sets the ``matches.fuzzy`` property in ``extra_data`` to the list of
    briefs of the first 5 records that matched.

    Arguments:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        bool: ``True`` if the workflow object has a duplicate in the system,
        ``False`` otherwise.

    """
    if not current_app.config.get('FEATURE_FLAG_ENABLE_FUZZY_MATCHER'):
        return False

    fuzzy_match_config = current_app.config['FUZZY_MATCH']
    matches = dedupe_list(match(obj.data, fuzzy_match_config))
    record_ids = [_get_hep_record_brief(el['_source']) for el in matches]
    obj.extra_data.setdefault('matches', {})['fuzzy'] = record_ids[0:5]

    return bool(record_ids)
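# The helper ``_get_hep_record_brief`` used above is not shown in this section.
# A minimal sketch of what it could look like, assuming it condenses a matched
# record's ``_source`` document into a small summary; the exact fields it
# returns are an assumption.
def _get_hep_record_brief(hep_record):
    brief = {
        'control_number': hep_record.get('control_number'),
        'title': (hep_record.get('titles') or [{}])[0].get('title'),
    }
    abstract = (hep_record.get('abstracts') or [{}])[0].get('abstract')
    if abstract:
        brief['abstract'] = abstract
    return brief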
def test_match_raises_if_inner_hits_param_has_wrong_config():
    config = {
        'algorithm': [
            {
                "queries": [
                    {
                        "paths": ["first_name", "last_name"],
                        "search_paths": ["authors.first_name", "authors.last_name"],
                        "type": "nested",
                        "inner_hits": {
                            "not_existing_argument": ["authors.record"],
                        },
                    },
                ],
            },
        ],
        'doc_type': 'hep',
        'index': 'records-hep',
    }

    with pytest.raises(ValueError) as excinfo:
        list(match(None, config))
    assert 'Malformed query' in str(excinfo.value)
def article_exists(obj, eng):
    """Return ``True`` if the record is already present in the system.

    Uses the default configuration of the ``inspire-matcher`` to find
    duplicates of the current workflow object in the system.

    Also sets the ``record_matches`` property in ``extra_data`` to the list of
    control numbers that matched.

    Arguments:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        bool: ``True`` if the workflow object has a duplicate in the system,
        ``False`` otherwise.

    """
    matches = dedupe_list(match(obj.data))
    record_ids = [el['_source']['control_number'] for el in matches]
    if record_ids:
        obj.extra_data['record_matches'] = record_ids
        return True

    obj.extra_data['record_matches'] = []
    return False
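# ``article_exists`` only reads ``_source.control_number`` from each hit. A
# minimal, made-up example of the Elasticsearch hit shape it consumes:
example_hit = {
    '_id': '42',  # hypothetical Elasticsearch document id
    '_source': {
        'control_number': 123456,  # the only field article_exists looks at
    },
}

record_ids = [el['_source']['control_number'] for el in [example_hit]]
assert record_ids == [123456]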
def match_literature_author_with_config(author_data, matcher_config):
    matched_records = [
        matched_record
        for matched_record in match(author_data, matcher_config)
        if get_value(
            matched_record, "inner_hits.authors.hits.hits[0]._source.record.$ref"
        )
    ]
    return matched_records
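# Illustrative shape of a hit that passes the ``inner_hits`` filter above: the
# nested ``inner_hits.authors`` block must expose a ``record.$ref`` for the
# best-matching author. All values below are made up.
example_author_hit = {
    '_id': 'abc123',
    'inner_hits': {
        'authors': {
            'hits': {
                'hits': [
                    {'_source': {'record': {'$ref': 'https://example.org/api/authors/999'}}},
                ],
            },
        },
    },
}

ref = example_author_hit['inner_hits']['authors']['hits']['hits'][0]['_source']['record']['$ref']
assert ref == 'https://example.org/api/authors/999'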
def test_match_raises_if_the_configuration_does_not_have_all_the_keys():
    config = {
        'doc_type': 'hep',
        'index': 'records-hep',
    }

    with pytest.raises(KeyError) as excinfo:
        list(match(None, config))
    assert 'Malformed configuration' in str(excinfo.value)
def match_author(author):
    matched_authors = match(author, current_app.config["AUTHOR_MATCHER_EXACT_CONFIG"])
    matched_refs_ids = {
        matched_author["_source"]["self"]["$ref"]: matched_author["_source"].get(
            "ids", []
        )
        for matched_author in matched_authors
    }
    matched_author_data = get_reference_and_bai_if_unambiguous_match(matched_refs_ids)
    return matched_author_data
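# ``get_reference_and_bai_if_unambiguous_match`` is not shown here. A minimal
# sketch of one plausible implementation, assuming it only accepts the result
# when exactly one author record matched and that BAIs live in the ``ids``
# list under the ``INSPIRE BAI`` schema; the return shape is an assumption.
def get_reference_and_bai_if_unambiguous_match(matched_refs_ids):
    if len(matched_refs_ids) != 1:
        return None
    ref, ids = next(iter(matched_refs_ids.items()))
    bai = next(
        (id_['value'] for id_ in ids if id_.get('schema') == 'INSPIRE BAI'),
        None,
    )
    return {'reference': ref, 'bai': bai}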
def test_match_raises_if_one_step_of_the_algorithm_has_no_queries():
    config = {
        'algorithm': [
            {'validator': 'inspire_matcher.validators:default_validator'},
        ],
        'doc_type': 'hep',
        'index': 'records-hep',
    }

    with pytest.raises(KeyError) as excinfo:
        list(match(None, config))
    assert 'Malformed algorithm' in str(excinfo.value)
def pending_in_holding_pen(obj, eng):
    """Return ``True`` if the record is already present in the Holding Pen.

    Uses a custom configuration of the ``inspire-matcher`` to find duplicates
    of the current workflow object in the Holding Pen.

    Also sets ``holdingpen_matches`` in ``extra_data`` to the list of ids that
    matched.

    Arguments:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        bool: ``True`` if the workflow object has a duplicate in the Holding
        Pen, ``False`` otherwise.

    """
    config = {
        'algorithm': [
            {
                'queries': [
                    {
                        'path': 'arxiv_eprints.value',
                        'search_path': 'metadata.arxiv_eprints.value.raw',
                        'type': 'exact',
                    },
                    {
                        'path': 'dois.value',
                        'search_path': 'metadata.dois.value.raw',
                        'type': 'exact',
                    },
                ],
            },
        ],
        'doc_type': 'hep',
        'index': 'holdingpen-hep',
    }
    matches = dedupe_list(match(obj.data, config))

    holdingpen_ids = [int(el['_id']) for el in matches if int(el['_id']) != obj.id]

    if holdingpen_ids:
        obj.extra_data['holdingpen_matches'] = holdingpen_ids
        return True

    return False
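# The comprehension above drops the workflow's own Holding Pen entry by
# comparing Elasticsearch ``_id`` values (strings) against the workflow id
# (an int). A small, made-up illustration of that filter:
sample_matches = [{'_id': '17'}, {'_id': '23'}]
current_workflow_id = 17
holdingpen_ids = [
    int(el['_id']) for el in sample_matches if int(el['_id']) != current_workflow_id
]
assert holdingpen_ids == [23]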
def duplicated_validator(property_name, property_value):
    def _is_not_deleted(base_record, match_result):
        return not get_value(match_result, '_source.deleted', default=False)

    config = {
        'algorithm': [
            {
                'queries': [
                    {
                        'path': 'arxiv_id',
                        'search_path': 'arxiv_eprints.value.raw',
                        'type': 'exact',
                    },
                    {
                        'path': 'doi',
                        'search_path': 'dois.value.raw',
                        'type': 'exact',
                    },
                ],
                'validator': _is_not_deleted,
            },
        ],
        'doc_type': 'hep',
        'index': 'records-hep',
    }

    if property_name == 'arXiv ID':
        data = {
            'arxiv_id': property_value,
        }
    if property_name == 'DOI':
        data = {
            'doi': property_value,
        }

    matches = dedupe_list(match(data, config))
    matched_ids = [int(el['_source']['control_number']) for el in matches]

    if matched_ids:
        url = url_for(
            'invenio_records_ui.literature',
            pid_value=matched_ids[0],
        )
        raise ValidationError(
            'There exists already an item with the same %s. '
            '<a target="_blank" href="%s">See the record.</a>'
            % (property_name, url)
        )
def test_match_raises_if_one_query_does_not_have_a_type():
    config = {
        'algorithm': [
            {
                'queries': [
                    {},
                ],
            },
        ],
        'doc_type': 'hep',
        'index': 'records-hep',
    }

    with pytest.raises(ValueError) as excinfo:
        list(match(None, config))
    assert 'Malformed query' in str(excinfo.value)
def test_match_raises_if_one_query_type_is_not_supported():
    config = {
        'algorithm': [
            {
                'queries': [
                    {'type': 'not-supported'},
                ],
            },
        ],
        'doc_type': 'records',
        'index': 'records-hep',
    }

    with pytest.raises(ValueError) as excinfo:
        list(match(None, config))
    assert (
        'Malformed query. Query 0 of step 0 does not compile: '
        'type "not-supported" is not supported.' in str(excinfo.value)
    )
def update_references_pointing_to_merged_record(
    refs_to_schema, merged_record_uri, new_record_uri
):
    for index, path in refs_to_schema:
        config = get_config_for_given_path(index, path)
        matched_records = match({"$ref": merged_record_uri}, config)
        for matched_record in matched_records:
            matched_inspire_record = InspireRecord.get_record(
                matched_record["_id"], with_deleted=True
            )
            referenced_records_in_path = flatten_list(
                get_value(matched_inspire_record, path[:-len(".$ref")], [])
            )
            for referenced_record in referenced_records_in_path:
                update_reference_if_reference_uri_matches(
                    referenced_record, merged_record_uri, new_record_uri
                )
            matched_inspire_record.update(dict(matched_inspire_record))
            LOGGER.info(
                "Updated reference for record", uuid=str(matched_inspire_record.id)
            )
    db.session.commit()
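# ``get_config_for_given_path`` is not shown in this section. A minimal sketch
# of what it could return, assuming it builds an ``inspire-matcher`` config
# that looks for records whose field at ``path`` holds the given ``$ref`` URI;
# the ``doc_type`` value is an assumption.
def get_config_for_given_path(index, path):
    return {
        'algorithm': [
            {
                'queries': [
                    {
                        'path': '$ref',
                        'search_path': path,
                        'type': 'exact',
                    },
                ],
            },
        ],
        'doc_type': 'hep',
        'index': index,
    }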
def test_validator_list(es_mock):
    es_mock.search.return_value = {
        'hits': {
            'hits': {
                'dummy result',
            },
        },
    }

    dummy_validator_1 = mock.Mock()
    dummy_validator_1.return_value = True
    dummy_validator_2 = mock.Mock()
    dummy_validator_2.return_value = True

    config = {
        'algorithm': [
            {
                'queries': [
                    {
                        'type': 'exact',
                        'path': 'dummy.path',
                        'search_path': 'dummy.search.path',
                    },
                ],
                'validator': [dummy_validator_1, dummy_validator_2],
            },
        ],
        'doc_type': 'hep',
        'index': 'records-hep',
    }
    record = {
        'dummy': {
            'path': 'Non empty value',
        },
    }

    result = list(match(record, config))

    assert 'dummy result' in result
    dummy_validator_1.assert_called_with(record, 'dummy result')
    dummy_validator_2.assert_called_with(record, 'dummy result')
def test_match_raises_if_an_exact_query_does_not_have_all_the_keys():
    config = {
        'algorithm': [
            {
                'queries': [
                    {
                        'search_path': 'arxiv_eprints.value.raw',
                        'type': 'exact',
                    },
                ],
            },
        ],
        'doc_type': 'hep',
        'index': 'records-hep',
    }

    with pytest.raises(ValueError) as excinfo:
        list(match(None, config))
    assert 'Malformed query' in str(excinfo.value)
def _pending_in_holding_pen(obj, validation_func):
    """Return the list of matching workflows in the holdingpen.

    Matches the holdingpen records by their ``arxiv_eprint``, their ``doi``,
    and by a custom validator function.

    Args:
        obj: a workflow object.
        validation_func: a function used to filter the matched records.

    Returns:
        (list): the ids matching the current ``obj`` that satisfy
        ``validation_func``.

    """
    config = {
        'algorithm': [
            {
                'queries': [
                    {
                        'path': 'arxiv_eprints.value',
                        'search_path': 'metadata.arxiv_eprints.value.raw',
                        'type': 'exact',
                    },
                    {
                        'path': 'dois.value',
                        'search_path': 'metadata.dois.value.raw',
                        'type': 'exact',
                    },
                ],
                'validator': validation_func,
            },
        ],
        'doc_type': 'hep',
        'index': 'holdingpen-hep',
    }
    matches = dedupe_list(match(obj.data, config))
    return [int(el['_id']) for el in matches if int(el['_id']) != obj.id]
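# A hypothetical ``validation_func`` that could be passed to
# ``_pending_in_holding_pen``. It assumes Holding Pen hits expose a workflow
# status under ``_source._workflow.status``, which is not shown in this
# section; the validator signature (base record, match result) follows the
# other validators above.
def _not_completed(base_record, match_result):
    status = match_result.get('_source', {}).get('_workflow', {}).get('status', '')
    return str(status).upper() != 'COMPLETED'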
def test_match_raises_on_invalid_collections():
    config = {
        'algorithm': [
            {
                'queries': [
                    {
                        'search_path': 'arxiv_eprints.value.raw',
                        'path': 'arxiv_eprints.value',
                        'type': 'exact',
                    },
                ],
            },
        ],
        'doc_type': 'hep',
        'index': 'records-hep',
        'collections': 'Literature',
    }

    with pytest.raises(ValueError) as excinfo:
        list(match(None, config))
    assert 'Malformed collections' in str(excinfo.value)
def test_match_uses_the_given_validator_callable(es_mock):
    es_mock.search.return_value = {
        'hits': {
            'hits': {
                'dummy result',
            },
        },
    }

    dummy_validator = mock.Mock()
    dummy_validator.return_value = False
    config = {
        'algorithm': [
            {
                'queries': [
                    {
                        'type': 'exact',
                        'path': 'dummy.path',
                        'search_path': 'dummy.search.path',
                    },
                ],
                'validator': dummy_validator,
            },
        ],
        'doc_type': 'hep',
        'index': 'records-hep',
    }
    record = {
        'dummy': {
            'path': 'Non empty value',
        },
    }

    result = list(match(record, config))

    assert not result
    dummy_validator.assert_called_with(record, 'dummy result')
def exact_match(obj, eng):
    """Return ``True`` if the record is already present in the system.

    Uses the default configuration of the ``inspire-matcher`` to find
    duplicates of the current workflow object in the system.

    Also sets the ``matches.exact`` property in ``extra_data`` to the list of
    control numbers that matched.

    Arguments:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        bool: ``True`` if the workflow object has a duplicate in the system,
        ``False`` otherwise.

    """
    exact_match_config = current_app.config['EXACT_MATCH']
    matches = dedupe_list(match(obj.data, exact_match_config))
    record_ids = [el['_source']['control_number'] for el in matches]
    obj.extra_data.setdefault('matches', {})['exact'] = record_ids
    return bool(record_ids)
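# The ``EXACT_MATCH`` setting itself is not shown in this section. A plausible
# shape, mirroring the exact-query configs used elsewhere in this module; the
# real value lives in the application config and may differ.
EXACT_MATCH = {
    'algorithm': [
        {
            'queries': [
                {
                    'path': 'arxiv_eprints.value',
                    'search_path': 'arxiv_eprints.value.raw',
                    'type': 'exact',
                },
                {
                    'path': 'dois.value',
                    'search_path': 'dois.value.raw',
                    'type': 'exact',
                },
            ],
        },
    ],
    'doc_type': 'hep',
    'index': 'records-hep',
}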