def test_match_references_finds_match_when_repeated_record_with_different_scores(
    mocked_inspire_matcher_match, inspire_app
):
    """Check that a single reference ends up linked to literature record 1.

    The matcher output comes from the ``mocked_inspire_matcher_match`` fixture,
    which is defined outside this view (presumably it yields the same record
    several times with different scores, as the test name says -- confirm
    against the fixture).
    """
    publication_info = {
        "artid": "045",
        "journal_title": "JHEP",
        "journal_volume": "06",
        "page_start": "045",
        "year": 2007,
    }
    input_references = [{"reference": {"publication_info": publication_info}}]

    references_schema = load_schema("hep")["properties"]["references"]
    # The input has to pass validation before it is handed to the matcher.
    assert validate(input_references, references_schema) is None

    outcome = match_references(input_references)
    matched = outcome["matched_references"]

    assert len(matched) == 1
    expected_ref = "http://localhost:5000/api/literature/1"
    assert matched[0]["record"]["$ref"] == expected_ref
    # The matched output has to pass validation as well.
    assert validate(matched, references_schema) is None

    assert outcome["any_link_modified"]
    assert outcome["added_recids"] == [1]
    assert outcome["removed_recids"] == []
def refextract_url():
    """Run refextract on a URL.

    Reads ``url`` from the JSON request body. When the
    ``FEATURE_FLAG_ENABLE_REFEXTRACT_SERVICE`` config flag is truthy, the
    extraction is delegated to the HTTP service configured under
    ``REFEXTRACT_SERVICE_URL``; otherwise ``extract_references_from_url`` is
    called in-process. The extracted references are then deduplicated, mapped
    to the schema and matched.

    Returns:
        A JSON response with the matched references, or a JSON error message
        with HTTP status 500 when the refextract service does not answer 200.
    """
    if current_app.config.get("FEATURE_FLAG_ENABLE_REFEXTRACT_SERVICE"):
        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json",
        }
        data = {
            "journal_kb_data": create_journal_dict(),
            "url": request.json["url"],
        }
        # NOTE(review): no timeout is passed, so a hung service blocks this
        # request indefinitely -- consider adding one.
        response = requests.post(
            f"{current_app.config['REFEXTRACT_SERVICE_URL']}/extract_references_from_url",
            headers=headers,
            data=orjson.dumps(data),
        )
        if response.status_code != 200:
            # The status code must be returned next to the response, not
            # passed to jsonify: jsonify(a, b) serializes the list [a, b] and
            # answers with status 200.
            return jsonify({"message": "Can not extract references"}), 500
        extracted_references = response.json()["extracted_references"]
    else:
        extracted_references = extract_references_from_url(
            request.json["url"],
            override_kbs_files={"journals": create_journal_dict()},
            reference_format="{title},{volume},{page}",
        )

    deduplicated_extracted_references = dedupe_list(extracted_references)
    references = map_refextract_to_schema(deduplicated_extracted_references)
    match_result = match_references(references)
    return jsonify(match_result.get("matched_references"))
def refextract_url():
    """Run refextract on a URL.

    Takes ``url`` from the JSON request body, extracts references from it
    in-process with the journal knowledge base from ``create_journal_dict``,
    maps them to the schema and matches them.

    Returns:
        A JSON response with the ``matched_references`` entry of the match
        result.
    """
    url = request.json["url"]
    journal_kb = create_journal_dict()
    raw_references = extract_references_from_url(
        url,
        override_kbs_files={"journals": journal_kb},
        reference_format="{title},{volume},{page}",
    )
    outcome = match_references(map_refextract_to_schema(raw_references))
    return jsonify(outcome.get("matched_references"))
def refextract_url():
    """Run refextract on a URL.

    Takes ``url`` from the JSON request body and extracts references from it,
    using the knowledge-base path yielded by ``local_refextract_kbs_path``.
    The references are then mapped to the schema and matched.

    Returns:
        A JSON response with the ``matched_references`` entry of the match
        result.
    """
    # The knowledge-base path is only needed for the extraction call itself.
    with local_refextract_kbs_path() as kbs_path:
        url = request.json["url"]
        raw_references = extract_references_from_url(
            url,
            override_kbs_files=kbs_path,
            reference_format="{title},{volume},{page}",
        )

    schema_references = map_refextract_to_schema(raw_references)
    outcome = match_references(schema_references)
    return jsonify(outcome.get("matched_references"))
def refextract_text():
    """Run refextract on a piece of text.

    Takes ``text`` from the JSON request body and extracts references from it,
    using the knowledge-base path yielded by ``local_refextract_kbs_path``.
    The references are then mapped to the schema and matched.

    Returns:
        A JSON response with whatever ``match_references`` returns.
    """
    # The knowledge-base path is only needed for the extraction call itself.
    with local_refextract_kbs_path() as kbs_path:
        text = request.json["text"]
        raw_references = extract_references_from_string(
            text,
            override_kbs_files=kbs_path,
            reference_format="{title},{volume},{page}",
        )

    schema_references = map_refextract_to_schema(raw_references)
    return jsonify(match_references(schema_references))
def test_match_references_doesnt_use_relaxed_title_matching(inspire_app):
    """Check that a reference is linked to the record sharing its texkey.

    Record 1 carries publication info whose journal title ("Phys. Rev. B.")
    differs from the one in the reference ("Phys. Rev."); record 2 carries the
    reference's texkey. The reference must be linked to record 2.
    """
    schema_url = "http://localhost:5000/schemas/records/hep.json"
    shared_title = "The Strongly-Interacting Light Higgs"
    texkey = "Shaikh:2022ynt"

    # Same volume and page as the reference, but a different journal title.
    record_with_pub_info = {
        "$schema": schema_url,
        "_collections": ["Literature"],
        "control_number": 1,
        "document_type": ["article"],
        "publication_info": [
            {
                "artid": "101",
                "journal_title": "Phys. Rev. B.",
                "journal_volume": "100",
                "page_start": "100",
                "year": 2020,
            }
        ],
        "titles": [{"title": shared_title}],
    }
    create_record("lit", record_with_pub_info)

    # The record the reference actually cites, identified by its texkey.
    record_with_texkey = {
        "$schema": schema_url,
        "_collections": ["Literature"],
        "control_number": 2,
        "document_type": ["article"],
        "texkeys": [texkey],
        "titles": [{"title": shared_title}],
    }
    create_record("lit", record_with_texkey)

    reference = {
        "publication_info": {
            "journal_title": "Phys. Rev.",
            "journal_volume": "100",
            "page_start": "100",
        },
        "texkey": texkey,
    }
    expected_ref = {"$ref": "http://localhost:5000/api/literature/2"}

    outcome = match_references([{"reference": reference}])

    assert expected_ref == outcome["matched_references"][0]["record"]
def test_match_references_no_match_when_multiple_match_different_from_previous(
    inspire_app,
):
    """Check that an ambiguous reference is left without a linked record.

    Two records share the erratum publication info cited by the single
    reference and no earlier reference has been matched, so the result must
    not contain a ``record`` entry.
    """
    schema_url = "http://localhost:5000/schemas/records/hep.json"

    # Original article, which also lists the erratum publication info.
    original_record = {
        "$schema": schema_url,
        "_collections": ["Literature"],
        "control_number": 1,
        "document_type": ["article"],
        "publication_info": [
            {
                "artid": "159",
                "journal_title": "JHEP",
                "journal_volume": "03",
                "page_start": "159",
                "year": 2016,
            },
            {
                "artid": "074",
                "journal_title": "JHEP",
                "journal_volume": "05",
                "material": "erratum",
                "page_start": "074",
                "year": 2017,
            },
        ],
    }
    # Separate record holding only the erratum publication info.
    erratum_record = {
        "$schema": schema_url,
        "_collections": ["Literature"],
        "control_number": 2,
        "document_type": ["article"],
        "publication_info": [
            {
                "artid": "074",
                "journal_title": "JHEP",
                "journal_volume": "05",
                "material": "erratum",
                "page_start": "074",
                "year": 2017,
            }
        ],
    }
    for record_json in (original_record, erratum_record):
        create_record("lit", data=record_json)

    erratum_citation = {
        "artid": "074",
        "journal_title": "JHEP",
        "journal_volume": "05",
        "page_start": "074",
        "year": 2017,
    }
    input_references = [{"reference": {"publication_info": erratum_citation}}]

    references_schema = load_schema("hep")["properties"]["references"]
    assert validate(input_references, references_schema) is None

    matched = match_references(input_references)

    assert get_value(matched[0], "record") is None
    assert validate(matched, references_schema) is None
def test_match_references_matches_when_multiple_match_if_same_as_previous(
    inspire_app
):
    """Check that an ambiguous reference resolves to a record matched earlier.

    The first reference cites the original article (record 1). The second
    cites the erratum publication info, which both records carry, and must be
    linked to record 1 as well.
    """
    schema_url = "http://localhost:5000/schemas/records/hep.json"

    # Original article, which also lists the erratum publication info.
    original_record = {
        "$schema": schema_url,
        "_collections": ["Literature"],
        "control_number": 1,
        "document_type": ["article"],
        "publication_info": [
            {
                "artid": "159",
                "journal_title": "JHEP",
                "journal_volume": "03",
                "page_start": "159",
                "year": 2016,
            },
            {
                "artid": "074",
                "journal_title": "JHEP",
                "journal_volume": "05",
                "material": "erratum",
                "page_start": "074",
                "year": 2017,
            },
        ],
    }
    # Separate record holding only the erratum publication info.
    erratum_record = {
        "$schema": schema_url,
        "_collections": ["Literature"],
        "control_number": 2,
        "document_type": ["article"],
        "publication_info": [
            {
                "artid": "074",
                "journal_title": "JHEP",
                "journal_volume": "05",
                "material": "erratum",
                "page_start": "074",
                "year": 2017,
            }
        ],
    }
    for record_json in (original_record, erratum_record):
        create_record("lit", data=record_json)

    article_citation = {
        "artid": "159",
        "journal_title": "JHEP",
        "journal_volume": "03",
        "page_start": "159",
        "year": 2016,
    }
    erratum_citation = {
        "artid": "074",
        "journal_title": "JHEP",
        "journal_volume": "05",
        "page_start": "074",
        "year": 2017,
    }
    input_references = [
        {"reference": {"publication_info": article_citation}},
        {"reference": {"publication_info": erratum_citation}},
    ]

    references_schema = load_schema("hep")["properties"]["references"]
    assert validate(input_references, references_schema) is None

    outcome = match_references(input_references)
    matched = outcome["matched_references"]

    expected_ref = "http://localhost:5000/api/literature/1"
    assert matched[1]["record"]["$ref"] == expected_ref
    assert validate(matched, references_schema) is None

    assert outcome["any_link_modified"]
    assert outcome["added_recids"] == [1, 1]
    assert outcome["removed_recids"] == []
def get_linked_refs():
    """Match the references sent in the request body.

    Reads ``references`` from the JSON request body and passes it to
    ``match_references``.

    Returns:
        A JSON response of the form ``{"references": ...}`` holding the
        ``matched_references`` entry of the match result.
    """
    payload = request.json
    matched = match_references(payload["references"]).get("matched_references")
    return jsonify({"references": matched})