def test_merging_full_name_field_keeps_longest_name():
    """Merge three spellings of one author name and expect head's version.

    Head carries the longest ``full_name`` of the three inputs; the merged
    record is expected to equal head, with no conflicts reported.
    """
    root = {'authors': [{'full_name': 'Pitts Kevin'}]}
    head = {'authors': [{'full_name': 'Pitts, Kevin John'}]}
    update = {'authors': [{'full_name': 'Pitts, Kevin'}]}

    result, conflicts = merge(root, head, update, head_source='arxiv')

    expected_merged = head
    expected_conflicts = []
    assert result == expected_merged
    assert_ordered_conflicts(conflicts, expected_conflicts)
    validate_subschema(result)
def test_comparing_publication_info():
    """Merge a publication_info entry with one that adds an ``artid``.

    Head and update share journal title and volume; update additionally has
    an ``artid``. The merged record is expected to equal update, with no
    conflicts.
    """
    head_entry = {
        'journal_title': 'J. Testing',
        'journal_volume': '42',
    }
    update_entry = {
        'journal_title': 'J. Testing',
        'journal_volume': '42',
        'artid': 'foo',
    }

    root = {}
    head = {'publication_info': [head_entry]}
    update = {'publication_info': [update_entry]}
    expected_conflicts = []
    expected_merged = update

    root, head, update, expected_merged = add_arxiv_source(
        root, head, update, expected_merged
    )
    result, conflicts = merge(root, head, update, head_source='arxiv')
    result = add_arxiv_source(result)

    assert result == expected_merged
    assert_ordered_conflicts(conflicts, expected_conflicts)
    validate_subschema(result)
def test_merging_inspire_categories_field():
    """Merge ``inspire_categories`` from three different sources.

    Root has an INSPIRE-sourced term, head has curator-sourced terms and
    update has arxiv-sourced terms. The merged record is expected to equal
    head, with no conflicts.
    """
    root = {
        'inspire_categories': [{'source': 'INSPIRE', 'term': 'Theory-HEP'}],
    }
    head = {
        'inspire_categories': [
            {'source': 'curator', 'term': term}
            for term in ('Theory-HEP', 'Theory-Nucl')
        ],
    }
    update = {
        'inspire_categories': [
            {'source': 'arxiv', 'term': term}
            for term in ('Computing', 'Other')
        ],
    }

    result, conflicts = merge(root, head, update, head_source='arxiv')

    expected_merged = head
    expected_conflicts = []
    assert result == expected_merged
    assert_ordered_conflicts(conflicts, expected_conflicts)
    validate_subschema(result)
def test_comparing_authors_unicode_name():
    """Merge an accented author name with its unaccented spelling.

    Head has the accented spelling and update the plain one. The merged
    record is expected to equal head, with no conflicts.
    """
    accented_author = {'full_name': 'Ortín, Tomás'}
    plain_author = {'full_name': 'Ortin, Tomas'}

    root = {}
    head = {'authors': [accented_author]}
    update = {'authors': [plain_author]}
    expected_conflicts = []
    expected_merged = head

    root, head, update, expected_merged = add_arxiv_source(
        root, head, update, expected_merged
    )
    result, conflicts = merge(root, head, update, head_source='arxiv')
    result = add_arxiv_source(result)

    assert result == expected_merged
    assert_ordered_conflicts(conflicts, expected_conflicts)
    validate_subschema(result)
def test_comparing_publication_info_with_cnum():
    """Merge publication_info entries that share ``cnum`` and ``artid``.

    Update differs from head only by an extra ``conference_record``. The
    merged record is expected to equal update, with no conflicts.
    """
    head_entry = {
        'artid': 'WEPAB127',
        'cnum': 'C21-05-24.3',
        'conf_acronym': 'IPAC2021',
        'year': 2021,
    }
    update_entry = {
        'artid': 'WEPAB127',
        'cnum': 'C21-05-24.3',
        'conf_acronym': 'IPAC2021',
        'conference_record': {
            '$ref': 'https://inspirehep.net/api/conferences/1853162',
        },
        'year': 2021,
    }

    root = {}
    head = {'publication_info': [head_entry]}
    update = {'publication_info': [update_entry]}
    expected_conflicts = []
    expected_merged = update

    root, head, update, expected_merged = add_arxiv_source(
        root, head, update, expected_merged
    )
    result, conflicts = merge(root, head, update, head_source='arxiv')
    result = add_arxiv_source(result)

    assert result == expected_merged
    assert_ordered_conflicts(conflicts, expected_conflicts)
    validate_subschema(result)
def test_merging_same_documents_arxiv_on_arxiv(fake_get_config):
    """Merge a record with itself when it only holds arXiv documents.

    Root, head and update are deliberately the very same object here; the
    merged record is expected to equal it, with no conflicts.
    """
    pdf_document = {
        'key': 'pdf1.pdf',
        'description': 'paper',
        'source': 'arXiv',
        'fulltext': True,
        'url': 'http://example.com/files/1234-1234-1234-1234/pdf1.pdf',
    }
    latex_document = {
        'key': 'pdf.tex',
        'description': 'latex version',
        'source': 'arXiv',
        'url': 'http://example.com/files/1234-1234-1234-1234/pdf.tex',
    }

    # All three inputs alias one dict, exactly one record on every side.
    root = {'documents': [pdf_document, latex_document]}
    head = root
    update = root

    expected_merged = head
    expected_conflicts = []

    result, conflicts = merge(root, head, update)

    assert result == expected_merged
    assert_ordered_conflicts(conflicts, expected_conflicts)
    validate_subschema(result)
def test_merging_acquisition_source_publisher_on_arxiv(fake_get_config):
    """Merge an update whose ``acquisition_source`` differs from root and head.

    Root and head hold equal (but separately built) arXiv acquisition
    sources; update comes from another source. The merged record is expected
    to equal update, with no conflicts.
    """
    root = {
        'acquisition_source': {
            'datetime': '2021-05-11T02:35:43.387350',
            'method': 'hepcrawl',
            'source': 'arXiv',
            'submission_number': 'c8a0e3e0b20011eb8d930a580a6402c0',
        },
    }
    # Same content as root, kept as an independent object.
    head = {
        'acquisition_source': {
            'datetime': '2021-05-11T02:35:43.387350',
            'method': 'hepcrawl',
            'source': 'arXiv',
            'submission_number': 'c8a0e3e0b20011eb8d930a580a6402c0',
        },
    }
    update = {
        'acquisition_source': {
            'datetime': '2021-05-12T02:35:43.387350',
            'method': 'beard',
            'source': 'other source',
            'submission_number': 'c8a0e3e0b20011eb8d930a580a6402c1',
        },
    }

    expected_merged = update
    expected_conflicts = []

    result, conflicts = merge(root, head, update)

    assert result == expected_merged
    assert_ordered_conflicts(conflicts, expected_conflicts)
    validate_subschema(result)
def test_merging_dois_field_handles_repeated_values():
    """Merge ``dois`` lists where the same DOI values recur with other metadata.

    The merged record is expected to list head's two DOIs followed by
    update's two DOIs, with no conflicts.
    """
    doi_a = '10.1023/A:1026654312961'
    doi_b = '10.1023/B:1026654312961'

    root = {
        'dois': [{'material': 'preprint', 'value': doi_a}],
    }
    head = {
        'dois': [
            {'material': 'publication', 'value': doi_a},
            {'source': 'nowhere', 'value': doi_b},
        ],
    }
    update = {
        'dois': [
            {'material': 'erratum', 'value': doi_a},
            {'material': 'erratum', 'source': 'nowhere', 'value': doi_b},
        ],
    }

    expected_merged = {
        'dois': [
            {'material': 'publication', 'value': doi_a},
            {'source': 'nowhere', 'value': doi_b},
            {'material': 'erratum', 'value': doi_a},
            {'material': 'erratum', 'source': 'nowhere', 'value': doi_b},
        ],
    }
    expected_conflicts = []

    result, conflicts = merge(root, head, update, head_source='arxiv')

    assert result == expected_merged
    assert_ordered_conflicts(conflicts, expected_conflicts)
    validate_subschema(result)
def test_merging_titles_field():
    """Merge ``titles`` when head adds a subtitle and update changes the title.

    The merged record is expected to keep head's title with its subtitle,
    and update's differing title is expected to surface as a single INSERT
    conflict at ``/titles/0``.
    """
    antares_title = (
        'ANTARES: An observatory at the seabed '
        'to the confines of the Universe'
    )
    curator_subtitle = 'this subtitle has been added by a curator'
    updated_title = 'ANTARES: Un osservatorio foo bar'

    # record: 1519935
    root = {
        'titles': [{'source': 'arXiv', 'title': antares_title}],
    }
    head = {
        'titles': [
            {
                'source': 'arXiv',
                'subtitle': curator_subtitle,
                'title': antares_title,
            },
        ],
    }
    update = {
        'titles': [{'source': 'arXiv', 'title': updated_title}],
    }

    expected_merged = {
        'titles': [
            {
                'source': 'arXiv',
                'subtitle': curator_subtitle,
                'title': antares_title,
            },
        ],
    }
    expected_conflicts = [
        {
            'path': '/titles/0',
            'op': 'add',
            'value': {'source': 'arXiv', 'title': updated_title},
            '$type': 'INSERT',
        },
    ]

    result, conflicts = merge(root, head, update, head_source='arxiv')

    assert result == expected_merged
    assert_ordered_conflicts(conflicts, expected_conflicts)
    validate_subschema(result)
def test_comparing_references_field_different_dois():
    """Merge references whose DOIs differ between head and update.

    The merged record is expected to contain head's reference followed by
    update's reference as two separate entries, with no conflicts.
    """
    root = {}
    head = {
        'references': [
            {'reference': {'dois': ['10.1099/bar']}},
        ],
    }
    update = {
        'references': [
            {
                'reference': {
                    'dois': ['10.1099/foo'],
                    'document_type': 'article',
                },
            },
        ],
    }

    expected_conflicts = []
    expected_merged = {
        'references': [
            {'reference': {'dois': ['10.1099/bar']}},
            {
                'reference': {
                    'dois': ['10.1099/foo'],
                    'document_type': 'article',
                },
            },
        ],
    }

    root, head, update, expected_merged = add_arxiv_source(
        root, head, update, expected_merged
    )
    result, conflicts = merge(root, head, update, head_source='arxiv')
    result = add_arxiv_source(result)

    assert result == expected_merged
    assert_ordered_conflicts(conflicts, expected_conflicts)
    validate_subschema(result)
def test_figures():
    """Merge two arXiv ``figures`` lists whose keys and URLs all differ.

    Head and update each hold two arXiv-sourced figures with the same
    captions but different keys and URLs. The merged record is expected to
    equal update, with no conflicts.
    """
    root = {}
    head = {
        'figures': [
            {
                'key': 'figure1.png',
                'caption': 'Figure 1',
                'source': 'arXiv',
                # Fixed fixture typo: the "/" between host and path was
                # missing ("http://example.comfiles/...").
                'url': 'http://example.com/files/1234-1234-1234-1234/figure1.png',
            },
            {
                'key': 'figure2.png',
                'caption': 'Figure 2',
                'source': 'arXiv',
                'url': 'http://example.com/files/1234-1234-1234-1234/figure2.png',
            },
        ],
    }
    update = {
        'figures': [
            {
                'key': 'new_figure1.png',
                'caption': 'Figure 1',
                'source': 'arXiv',
                'url': 'http://example.com/files/5678-5678-5678-5678/figure1.png',
            },
            {
                'key': 'new_figure2.png',
                'caption': 'Figure 2',
                'source': 'arXiv',
                'url': 'http://example.com/files/5678-5678-5678-5678/figure2.png',
            },
        ],
    }

    expected_merged = update
    expected_conflict = []

    merged, conflict = merge(root, head, update, head_source='arxiv')

    assert merged == expected_merged
    assert_ordered_conflicts(conflict, expected_conflict)
    validate_subschema(merged)
def test_ordering_conflicts():
    """Merge JSON fixtures and compare authors (by uuid) and conflicts."""
    # This test is actually for broken input,
    # where authors are duplicated.
    root = load_test_data("test_data/root.json")
    head = load_test_data("test_data/head.json")
    update = load_test_data("test_data/update.json")
    expected_conflicts = load_test_data("test_data/conflicts.json")
    expected_merged = load_test_data("test_data/merged.json")

    result, conflicts = merge(root, head, update)

    # Author order is not asserted: both sides are sorted by uuid first.
    by_uuid = itemgetter('uuid')
    actual_authors = sorted(result['authors'], key=by_uuid)
    wanted_authors = sorted(expected_merged['authors'], key=by_uuid)

    assert actual_authors == wanted_authors
    assert_ordered_conflicts(conflicts, expected_conflicts)
def test_documents():
    """Merge two arXiv ``documents`` lists that have no keys in common.

    The merged record is expected to equal update, with no conflicts.
    """
    head_documents = [
        {
            'key': 'pdf1.pdf',
            'description': 'paper',
            'source': 'arXiv',
            'fulltext': True,
            'url': 'http://example.com/files/1234-1234-1234-1234/pdf1.pdf',
        },
        {
            'key': 'pdf.tex',
            'description': 'latex version',
            'source': 'arXiv',
            'url': 'http://example.com/files/1234-1234-1234-1234/pdf.tex',
        },
    ]
    update_documents = [
        {
            'key': 'pdf.pdf',
            'description': 'paper',
            'source': 'arXiv',
            'url': 'http://example.com/files/5678-5678-5678-5678/pdf.pdf',
        },
        {
            'key': 'foo.xml',
            'description': 'some xml files',
            'source': 'arXiv',
            'url': 'http://example.com/files/5678-5678-5678-5678/foo.xml',
        },
    ]

    root = {}
    head = {'documents': head_documents}
    update = {'documents': update_documents}

    expected_merged = update
    expected_conflicts = []

    result, conflicts = merge(root, head, update, head_source='arxiv')

    assert result == expected_merged
    assert_ordered_conflicts(conflicts, expected_conflicts)
    validate_subschema(result)
def test_merging_acquisition_source_field():
    """Merge ``acquisition_source`` values that differ only in ``method``.

    The merged record is expected to equal update, with no conflicts.
    """
    root = {}
    # record_id: 1517095
    head = {
        'acquisition_source': {
            'method': 'submitter',
            'source': 'arxiv',
        },
    }
    update = {
        'acquisition_source': {
            'method': 'batchuploader',
            'source': 'arxiv',
        },
    }

    expected_merged = update
    expected_conflicts = []

    result, conflicts = merge(root, head, update, head_source='arxiv')

    assert result == expected_merged
    assert_ordered_conflicts(conflicts, expected_conflicts)
    validate_subschema(result)
def test_comparing_keywords():
    """Merge ``keywords`` lists that overlap on one value/schema pair.

    The merged record is expected to hold update's two keywords followed by
    the keyword only head has, with no conflicts.
    """
    root = {}
    head = {
        'keywords': [
            {'value': 'shielding', 'schema': 'JACOW'},
            {'value': 'test', 'schema': 'JACOW'},
        ],
    }
    update = {
        'keywords': [
            {'value': 'shielding', 'schema': 'INSPIRE'},
            {'value': 'shielding', 'schema': 'JACOW'},
        ],
    }

    expected_conflicts = []
    expected_merged = {
        'keywords': [
            {'value': 'shielding', 'schema': 'INSPIRE'},
            {'value': 'shielding', 'schema': 'JACOW'},
            {'value': 'test', 'schema': 'JACOW'},
        ],
    }

    result, conflicts = merge(root, head, update, head_source='arxiv')

    assert result == expected_merged
    assert_ordered_conflicts(conflicts, expected_conflicts)
    validate_subschema(result)
def test_merging_license_field():
    """Merge ``license`` lists where update rewrites the Elsevier license text.

    Head adds an arXiv license on top of root's Elsevier one; update changes
    the Elsevier license text. The merged record is expected to hold the
    updated Elsevier entry followed by head's arXiv entry, with no conflicts.
    """
    cc_by_url = 'http://creativecommons.org/licenses/by/4.0/'

    root = {
        'license': [
            {
                'imposing': 'Elsevier',
                'url': cc_by_url,
                'license': 'elsevier foo bar',
            },
        ],
    }
    head = {
        'license': [
            {
                'imposing': 'Elsevier',
                'url': cc_by_url,
                'license': 'elsevier foo bar',
            },
            {
                'imposing': 'arXiv',
                'url': cc_by_url,
                'license': 'arxiv foo bar',
            },
        ],
    }
    update = {
        'license': [
            {
                'imposing': 'Elsevier',
                'url': cc_by_url,
                'license': 'elsevier foo bar updated!',
            },
        ],
    }

    expected_merged = {
        'license': [
            {
                'imposing': 'Elsevier',
                'url': cc_by_url,
                'license': 'elsevier foo bar updated!',
            },
            {
                'imposing': 'arXiv',
                'url': cc_by_url,
                'license': 'arxiv foo bar',
            },
        ],
    }
    expected_conflicts = []

    result, conflicts = merge(root, head, update, head_source='arxiv')

    assert result == expected_merged
    assert_ordered_conflicts(conflicts, expected_conflicts)
    validate_subschema(result)
def test_merging_report_numbers_field_repeated_values():
    """Merge ``report_numbers`` when head repeats one value in two entries.

    Root and update hold the same single arXiv report number; head holds it
    as a hidden arXiv entry plus a source-less entry. The merged record is
    expected to equal head, with no conflicts.
    """
    report_number = 'CERN-CMS-2018-001'

    root = {
        'report_numbers': [
            {'source': 'arXiv', 'value': report_number},
        ],
    }
    # record: 1598022
    head = {
        'report_numbers': [
            {'hidden': True, 'source': 'arXiv', 'value': report_number},
            {'value': report_number},
        ],
    }
    update = {
        'report_numbers': [
            {'source': 'arXiv', 'value': report_number},
        ],
    }

    expected_merged = head
    expected_conflicts = []

    result, conflicts = merge(root, head, update, head_source='arxiv')

    assert result == expected_merged
    assert_ordered_conflicts(conflicts, expected_conflicts)
    validate_subschema(result)
def test_merging_raw_affiliations_field():
    """Merge one author's ``raw_affiliations`` when update rewrites them.

    Head and update describe the same author name; update alters the
    affiliation text and adds a second affiliation. The merged record is
    expected to equal update, with no conflicts.
    """
    author_name = 'Pitts, Kevin T'

    head_affiliations = [
        {
            'source': 'arxiv',
            'value': 'Department of Physics, Indiana University, Bloomington, IN 47405, USA',
        },
    ]
    update_affiliations = [
        {
            'source': 'arxiv',
            'value': 'Department of Physics, Indiana University, Bloomington, IN 47405, US',
        },
        {
            'source': 'arxiv',
            'value': 'Padua U',
        },
    ]

    root = {}
    head = {
        'authors': [
            {'full_name': author_name, 'raw_affiliations': head_affiliations},
        ],
    }
    update = {
        'authors': [
            {'full_name': author_name, 'raw_affiliations': update_affiliations},
        ],
    }

    expected_merged = update
    expected_conflicts = []

    result, conflicts = merge(root, head, update, head_source='arxiv')

    assert result == expected_merged
    assert_ordered_conflicts(conflicts, expected_conflicts)
    validate_subschema(result)
def test_merging_publication_info_field():
    """Merge a curated ``publication_info`` entry with a richer update.

    Head adds a ``journal_record`` to root's entry; update supplies many
    additional fields for the same journal, volume and year. The merged
    record is expected to be a single entry carrying update's fields plus
    head's ``journal_record`` and no ``hidden`` flag, with no conflicts.
    """
    journal = 'Adv.Theor.Math.Phys.'
    journal_ref = {'$ref': 'http://labs.inspirehep.net/api/journals/1212914'}

    root = {
        'publication_info': [
            {
                'hidden': True,
                'journal_title': journal,
                'journal_volume': '12',
                'page_end': '979',
                'page_start': '948',
                'year': 2008,
            },
        ],
    }
    # record 697133
    head = {
        'publication_info': [
            {
                'hidden': True,
                'journal_title': journal,
                'journal_record': journal_ref,
                'journal_volume': '12',
                'page_end': '979',
                'page_start': '948',
                'year': 2008,
            },
        ],
    }
    update = {
        'publication_info': [
            {
                'artid': '948-979',
                'curated_relation': True,
                'journal_issue': '1',
                'journal_title': journal,
                'journal_volume': '12',
                'year': 2008,
                'cnum': 'C12-03-10',
                'material': 'erratum',
                'page_end': '042',
                'page_start': '032',
                'parent_isbn': '9780521467025',
                'parent_report_number': 'CERN-PH-TH-2012-115',
            },
        ],
    }

    expected_merged = {
        'publication_info': [
            {
                'artid': '948-979',
                'cnum': 'C12-03-10',
                'curated_relation': True,
                'journal_title': journal,
                'journal_volume': '12',
                'journal_issue': '1',
                # Separate dict with the same content as head's reference.
                'journal_record': dict(journal_ref),
                'material': 'erratum',
                'page_end': '042',
                'page_start': '032',
                'parent_isbn': '9780521467025',
                'parent_report_number': 'CERN-PH-TH-2012-115',
                'year': 2008,
            },
        ],
    }
    expected_conflicts = []

    result, conflicts = merge(root, head, update, head_source='arxiv')

    assert result == expected_merged
    assert_ordered_conflicts(conflicts, expected_conflicts)
    validate_subschema(result)