def test_populate_inspire_document_type_from_refereed():
    """A refereed article gets 'peer reviewed' in ``facet_inspire_doc_type``."""
    hep_properties = load_schema('hep')['properties']
    doc_type_subschema = hep_properties['document_type']
    refereed_subschema = hep_properties['refereed']

    raw_record = {
        '$schema': 'http://localhost:5000/records/schemas/hep.json',
        'document_type': ['article'],
        'refereed': True,
    }
    record = InspireRecord(raw_record, model=RecordMetadata)

    # The input fields must be schema-valid before the facet is computed.
    assert validate(record['document_type'], doc_type_subschema) is None
    assert validate(record['refereed'], refereed_subschema) is None

    populate_inspire_document_type(record)

    facet = record['facet_inspire_doc_type']
    assert ['article', 'peer reviewed'] == facet
def test_populate_bookautocomplete_from_authors():
    """For a book record, author full names end up in ``bookautocomplete``."""
    hep_properties = load_schema('hep')['properties']
    authors_subschema = hep_properties['authors']
    doc_type_subschema = hep_properties['document_type']
    self_subschema = hep_properties['self']

    raw_record = {
        '$schema': 'http://localhost:5000/records/schemas/hep.json',
        'authors': [
            {'full_name': 'Rafelski, Johann'},
        ],
        'document_type': ['book'],
        'self': {'$ref': 'http://localhost:5000/api/literature/1519486'},
    }
    record = InspireRecord(raw_record, model=RecordMetadata)

    assert validate(record['authors'], authors_subschema) is None
    assert validate(record['document_type'], doc_type_subschema) is None
    assert validate(record['self'], self_subschema) is None

    populate_bookautocomplete(record)

    suggestions = record['bookautocomplete']
    assert {'input': ['Rafelski, Johann']} == suggestions
def test_arxiv_derive_inspire_categories():
    """The arXiv category 'nucl-th' yields the INSPIRE category 'Theory-Nucl'."""
    hep_properties = load_schema('hep')['properties']
    eprints_subschema = hep_properties['arxiv_eprints']
    categories_subschema = hep_properties['inspire_categories']

    # literature/1458300
    data = {
        'arxiv_eprints': [
            {
                'categories': ['nucl-th'],
                'value': '1605.03898',
            },
        ],
    }
    assert validate(data['arxiv_eprints'], eprints_subschema) is None

    workflow_obj = MockObj(data, {})
    engine = MockEng()

    assert arxiv_derive_inspire_categories(workflow_obj, engine) is None

    derived = workflow_obj.data['inspire_categories']
    assert validate(derived, categories_subschema) is None
    assert [{'source': 'arxiv', 'term': 'Theory-Nucl'}] == derived
def test_extract_journal_info_handles_year_an_empty_string():
    """Extraction from an ApJ free-text reference yields no 'year' key."""
    pubinfo_subschema = load_schema('hep')['properties']['publication_info']
    freetext = 'The Astrophysical Journal, 838:134 (16pp), 2017 April 1'

    data = {'publication_info': [{'pubinfo_freetext': freetext}]}
    assert validate(data['publication_info'], pubinfo_subschema) is None

    workflow_obj = MockObj(data, {})
    engine = MockEng()

    assert extract_journal_info(workflow_obj, engine) is None

    extracted = workflow_obj.data['publication_info']
    assert validate(extracted, pubinfo_subschema) is None
    assert [
        {
            'artid': '134',
            'journal_title': 'Astrophys. J.',
            'journal_volume': '838',
            'page_start': '134',
            'pubinfo_freetext': freetext,
        },
    ] == extracted
def test_extract_journal_info_handles_the_journal_split():
    """'Phys. Rev. D 96, ...' keeps 'Phys. Rev. D' as title and '96' as volume."""
    pubinfo_subschema = load_schema('hep')['properties']['publication_info']
    freetext = 'Phys. Rev. D 96, 076008. 2017'

    data = {'publication_info': [{'pubinfo_freetext': freetext}]}
    assert validate(data['publication_info'], pubinfo_subschema) is None

    workflow_obj = MockObj(data, {})
    engine = MockEng()

    assert extract_journal_info(workflow_obj, engine) is None

    extracted = workflow_obj.data['publication_info']
    assert validate(extracted, pubinfo_subschema) is None
    assert [
        {
            'artid': '076008',
            'journal_title': 'Phys. Rev. D',
            'journal_volume': '96',
            'pubinfo_freetext': freetext,
        },
    ] == extracted
def test_populate_inspire_document_type_from_publication_type():
    """Publication types are appended to ``facet_inspire_doc_type``."""
    hep_properties = load_schema('hep')['properties']
    doc_type_subschema = hep_properties['document_type']
    pub_type_subschema = hep_properties['publication_type']

    record = {
        '$schema': 'http://localhost:5000/records/schemas/hep.json',
        'document_type': ['article'],
        'publication_type': ['introductory'],
    }
    assert validate(record['document_type'], doc_type_subschema) is None
    assert validate(record['publication_type'], pub_type_subschema) is None

    # Called as a signal receiver: the first (sender) argument is None here.
    populate_inspire_document_type(None, record)

    facet = record['facet_inspire_doc_type']
    assert ['article', 'introductory'] == facet
def test_populate_bookautocomplete_does_nothing_if_record_is_not_a_book():
    """An 'article' record must not receive a ``bookautocomplete`` key."""
    hep_properties = load_schema('hep')['properties']
    authors_subschema = hep_properties['authors']
    doc_type_subschema = hep_properties['document_type']
    self_subschema = hep_properties['self']

    record = {
        '$schema': 'http://localhost:5000/records/schemas/hep.json',
        'authors': [
            {'full_name': 'Mohayai, Tanaz Angelina'},
        ],
        'document_type': ['article'],
        'self': {'$ref': 'http://localhost:5000/api/literature/1520027'},
    }
    assert validate(record['authors'], authors_subschema) is None
    assert validate(record['document_type'], doc_type_subschema) is None
    assert validate(record['self'], self_subschema) is None

    populate_bookautocomplete(None, record)

    assert 'bookautocomplete' not in record
def test_set_refereed_and_fix_document_type_sets_refereed_to_false_if_all_journals_are_not_refereed(mock_replace_refs):
    """With only non-refereed journals resolved, ``refereed`` is set to False."""
    journal_refereed_subschema = load_schema('journals')['properties']['refereed']
    resolved_journals = [{'refereed': False}]
    assert validate(resolved_journals[0]['refereed'], journal_refereed_subschema) is None

    # The mocked reference resolver hands back the journal records above.
    mock_replace_refs.return_value = resolved_journals

    hep_refereed_subschema = load_schema('hep')['properties']['refereed']

    workflow_obj = MockObj({'document_type': ['article']}, {})
    engine = MockEng()

    assert set_refereed_and_fix_document_type(workflow_obj, engine) is None

    refereed = workflow_obj.data['refereed']
    assert validate(refereed, hep_refereed_subschema) is None
    assert False == refereed  # noqa: E712 - equality (not identity) is what is checked
def test_set_refereed_and_fix_document_type_replaces_article_with_conference_paper_if_needed(mock_replace_refs):
    """A proceedings journal turns 'article' into 'conference paper'."""
    proceedings_subschema = load_schema('journals')['properties']['proceedings']
    resolved_journals = [{'proceedings': True}]
    assert validate(resolved_journals[0]['proceedings'], proceedings_subschema) is None

    # The mocked reference resolver hands back the journal records above.
    mock_replace_refs.return_value = resolved_journals

    doc_type_subschema = load_schema('hep')['properties']['document_type']

    workflow_obj = MockObj({'document_type': ['article']}, {})
    engine = MockEng()

    assert set_refereed_and_fix_document_type(workflow_obj, engine) is None

    document_type = workflow_obj.data['document_type']
    assert validate(document_type, doc_type_subschema) is None
    assert ['conference paper'] == document_type
def test_match_references_finds_match_when_repeated_record_with_different_scores(
    mocked_inspire_matcher_match,
    isolated_app,
):
    """``match_references`` links the single reference to literature record 1.

    NOTE(review): ``mocked_inspire_matcher_match`` is not configured in this
    body; presumably its return value is set where it is patched - confirm.
    """
    references_subschema = load_schema('hep')['properties']['references']

    unmatched = [
        {
            'reference': {
                'publication_info': {
                    'artid': '045',
                    'journal_title': 'JHEP',
                    'journal_volume': '06',
                    'page_start': '045',
                    'year': 2007,
                },
            },
        },
    ]
    assert validate(unmatched, references_subschema) is None

    matched = match_references(unmatched)

    assert len(matched) == 1
    assert matched[0]['record']['$ref'] == 'http://localhost:5000/api/literature/1'
    assert validate(matched, references_subschema) is None
def test_match_reference_on_texkey(isolated_app):
    """A reference carrying only a texkey is matched to the record owning it."""
    texkey = 'Giudice:2007fh'

    # Record stored in the 'records-hep' index that the reference should hit.
    TestRecordMetadata.create_from_kwargs(
        json={
            '$schema': 'http://localhost:5000/schemas/records/hep.json',
            '_collections': ['Literature'],
            'control_number': 1,
            'document_type': ['article'],
            'texkeys': [texkey],
            'titles': [
                {'title': 'The Strongly-Interacting Light Higgs'},
            ],
        },
        index_name='records-hep',
    )

    reference = {'reference': {'texkey': texkey}}

    references_subschema = load_schema('hep')['properties']['references']
    assert validate([reference], references_subschema) is None

    matched = match_reference(reference)

    assert matched['record']['$ref'] == 'http://localhost:5000/api/literature/1'
    assert validate([matched], references_subschema) is None
def test_is_arxiv_paper_for_submission():
    """A 'submitter' record that has arXiv eprints counts as an arXiv paper."""
    hep_properties = load_schema('hep')['properties']
    acq_source_subschema = hep_properties['acquisition_source']
    eprints_subschema = hep_properties['arxiv_eprints']

    data = {
        'acquisition_source': {'method': 'submitter'},
        'arxiv_eprints': [
            {
                'categories': ['hep-th'],
                'value': '0801.4782',
            },
        ],
    }
    assert validate(data['acquisition_source'], acq_source_subschema) is None
    assert validate(data['arxiv_eprints'], eprints_subschema) is None

    workflow_obj = MockObj(data, {})
    engine = MockEng()

    assert is_arxiv_paper(workflow_obj, engine)
def test_is_arxiv_paper_returns_false_if_method_is_not_hepcrawl_or_arxiv():
    """A 'batchuploader' record is not an arXiv paper, even with source 'arxiv'."""
    hep_properties = load_schema('hep')['properties']
    acq_source_subschema = hep_properties['acquisition_source']
    eprints_subschema = hep_properties['arxiv_eprints']

    data = {
        'acquisition_source': {
            'method': 'batchuploader',
            'source': 'arxiv',
        },
        'arxiv_eprints': [
            {
                'categories': ['hep-th'],
                'value': '0801.4782',
            },
        ],
    }
    assert validate(data['acquisition_source'], acq_source_subschema) is None
    assert validate(data['arxiv_eprints'], eprints_subschema) is None

    workflow_obj = MockObj(data, {})
    engine = MockEng()

    assert not is_arxiv_paper(workflow_obj, engine)
def test_get_conference_record(replace_refs):
    """``get_conference_record`` returns what the (mocked) ref resolver yields."""
    hep_properties = load_schema('hep')['properties']
    control_number_subschema = hep_properties['control_number']
    pubinfo_subschema = hep_properties['publication_info']

    conference = {'control_number': 972464}
    assert validate(conference['control_number'], control_number_subschema) is None

    record = {
        'publication_info': [
            {
                'conference_record': {
                    '$ref': 'http://localhost:5000/api/conferences/972464',
                },
            },
        ],
    }
    assert validate(record['publication_info'], pubinfo_subschema) is None

    replace_refs.return_value = conference

    fetched = get_conference_record(record)

    assert 972464 == fetched['control_number']
def test_formdata_to_model_only_chapter(mock_validate_record):
    """Chapter form data maps to ``book_series`` and ``publication_info``."""
    hep_properties = load_schema('hep')['properties']
    book_series_subschema = hep_properties['book_series']
    pubinfo_subschema = hep_properties['publication_info']

    workflow_obj = MockObj({}, {})

    formdata = {
        'end_page': '1200',
        'parent_book': 'http://localhost:5000/api/literature/1373790',
        'series_title': 'Astrophysics',
        'start_page': '150',
        'type_of_doc': 'chapter',
    }

    model = formdata_to_model(workflow_obj, formdata)

    book_series = model['book_series']
    assert validate(book_series, book_series_subschema) is None
    assert [{'title': 'Astrophysics'}] == book_series

    publication_info = model['publication_info']
    assert validate(publication_info, pubinfo_subschema) is None
    assert [
        {
            'page_end': '1200',
            'page_start': '150',
            'parent_record': {
                '$ref': 'http://localhost:5000/api/literature/1373790',
            },
        },
    ] == publication_info
def test_fix_submission_number():
    """For method 'hepcrawl' the submission number is rewritten to '1'."""
    acq_source_subschema = load_schema('hep')['properties']['acquisition_source']

    data = {
        'acquisition_source': {
            'method': 'hepcrawl',
            'submission_number': '751e374a017311e896d6fa163ec92c6a',
        },
    }
    assert validate(data['acquisition_source'], acq_source_subschema) is None

    workflow_obj = MockObj(data, {})
    engine = MockEng()

    fix_submission_number(workflow_obj, engine)

    acquisition_source = workflow_obj.data['acquisition_source']
    assert validate(acquisition_source, acq_source_subschema) is None
    assert {
        'method': 'hepcrawl',
        'submission_number': '1',
    } == acquisition_source
def test_populate_title_suggest_with_all_inputs():
    """Journal, short and variant titles all feed ``title_suggest``."""
    journal_properties = load_schema('journals')['properties']
    journal_title_subschema = journal_properties['journal_title']
    short_title_subschema = journal_properties['short_title']
    title_variants_subschema = journal_properties['title_variants']

    record = {
        '$schema': 'http://localhost:5000/schemas/records/journals.json',
        'journal_title': {'title': 'The Journal of High Energy Physics (JHEP)'},
        'short_title': 'JHEP',
        'title_variants': ['JOURNAL OF HIGH ENERGY PHYSICS'],
    }
    assert validate(record['journal_title'], journal_title_subschema) is None
    assert validate(record['short_title'], short_title_subschema) is None
    assert validate(record['title_variants'], title_variants_subschema) is None

    populate_title_suggest(None, record)

    suggest = record['title_suggest']
    assert {
        'input': [
            'The Journal of High Energy Physics (JHEP)',
            'JHEP',
            'JOURNAL OF HIGH ENERGY PHYSICS',
        ],
        'output': 'JHEP',
        'payload': {
            'full_title': 'The Journal of High Energy Physics (JHEP)',
        },
    } == suggest
def test_fix_submission_number_does_nothing_if_method_is_not_hepcrawl():
    """For method 'submitter' the acquisition source is left untouched.

    This function previously had no ``test_`` prefix, so pytest's default
    discovery rules never collected it and the check silently did not run.
    """
    subschema = load_schema('hep')['properties']['acquisition_source']

    data = {
        'acquisition_source': {
            'method': 'submitter',
            'submission_number': '869215',
        },
    }
    assert validate(data['acquisition_source'], subschema) is None

    obj = MockObj(data, {})
    eng = MockEng()

    fix_submission_number(obj, eng)

    expected = {
        'method': 'submitter',
        'submission_number': '869215',
    }
    result = obj.data['acquisition_source']

    assert validate(result, subschema) is None
    assert expected == result


# Backward-compatible alias for the old name, which pytest does not collect.
fix_submission_number_does_nothing_if_method_is_not_hepcrawl = (
    test_fix_submission_number_does_nothing_if_method_is_not_hepcrawl
)
def test_assign_uuid_does_not_touch_existing_uuids(mock_uuid4):
    """An author that already has a uuid keeps it after ``assign_uuid``."""
    # If a new uuid were generated it would be this one, not the existing one.
    mock_uuid4.return_value = UUID('727238f3-8ed6-40b6-97d2-dc3cd1429131')

    authors_subschema = load_schema('hep')['properties']['authors']
    existing_uuid = 'e14955b0-7e57-41a0-90a8-f4c64eb8f4e9'

    record = {
        '$schema': 'http://localhost:5000/records/schemas/hep.json',
        'authors': [
            {
                'full_name': 'Ellis, John Richard',
                'uuid': existing_uuid,
            },
        ],
    }
    assert validate(record['authors'], authors_subschema) is None

    assign_uuid(None, record)

    authors = record['authors']
    assert validate(authors, authors_subschema) is None
    assert [
        {
            'full_name': 'Ellis, John Richard',
            'uuid': existing_uuid,
        },
    ] == authors
def test_populate_journal_coverage_writes_partial_if_all_coverages_are_partial(mock_replace_refs):
    """Only 'partial' journal coverages give ``journal_coverage == 'partial'``."""
    harvesting_subschema = load_schema('journals')['properties']['_harvesting_info']
    resolved_journals = [{'_harvesting_info': {'coverage': 'partial'}}]
    assert validate(resolved_journals[0]['_harvesting_info'], harvesting_subschema) is None

    # The mocked reference resolver hands back the journal records above.
    mock_replace_refs.return_value = resolved_journals

    pubinfo_subschema = load_schema('hep')['properties']['publication_info']

    data = {
        'publication_info': [
            {'journal_record': {'$ref': 'http://localhost:/api/journals/1212337'}},
        ],
    }
    assert validate(data['publication_info'], pubinfo_subschema) is None

    workflow_obj = MockObj(data, {})
    engine = MockEng()

    assert populate_journal_coverage(workflow_obj, engine) is None

    coverage = workflow_obj.extra_data['journal_coverage']
    assert 'partial' == coverage
def test_record_with_non_valid_content_is_cleaned_and_created_properly(
        isolated_app):
    """An invalid JSON payload validates once created via ``InspireRecord.create``."""
    # record/1628455/export/xme -- with some modification
    record_json = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'control_number': 1,
        'document_type': ['article'],
        'titles': [
            {'title': 'foo'},
        ],
        '_collections': ['Literature'],
        # these two fields make the record not valid
        'documents': [],
        'urls': [
            {'url': ''},
        ],
    }

    # The raw payload must be rejected by the schema validator.
    try:
        validate(record_json)
    except ValidationError:
        rejected = True
    else:
        rejected = False
    assert rejected

    # Creation must leave a record that validates without raising.
    created = InspireRecord.create(record_json)
    validate(created)
def test_set_refereed_and_fix_document_type_handles_journals_that_publish_mixed_content(mock_replace_refs):
    """A journal that is both proceedings and refereed yields ``refereed`` True."""
    journal_properties = load_schema('journals')['properties']
    proceedings_subschema = journal_properties['proceedings']
    journal_refereed_subschema = journal_properties['refereed']

    resolved_journals = [{'proceedings': True, 'refereed': True}]
    assert validate(resolved_journals[0]['proceedings'], proceedings_subschema) is None
    assert validate(resolved_journals[0]['refereed'], journal_refereed_subschema) is None

    # The mocked reference resolver hands back the journal records above.
    mock_replace_refs.return_value = resolved_journals

    hep_refereed_subschema = load_schema('hep')['properties']['refereed']

    workflow_obj = MockObj({'document_type': ['article']}, {})
    engine = MockEng()

    assert set_refereed_and_fix_document_type(workflow_obj, engine) is None

    refereed = workflow_obj.data['refereed']
    assert validate(refereed, hep_refereed_subschema) is None
    assert True == refereed  # noqa: E712 - equality (not identity) is what is checked
def test_extract_journal_info():
    """Free text 'J. Math. Phys. 55, 082102 (2014)' is split into its fields."""
    pubinfo_subschema = load_schema('hep')['properties']['publication_info']
    freetext = 'J. Math. Phys. 55, 082102 (2014)'

    data = {'publication_info': [{'pubinfo_freetext': freetext}]}
    assert validate(data['publication_info'], pubinfo_subschema) is None

    workflow_obj = MockObj(data, {})
    engine = MockEng()

    assert extract_journal_info(workflow_obj, engine) is None

    extracted = workflow_obj.data['publication_info']
    assert validate(extracted, pubinfo_subschema) is None
    assert [
        {
            'artid': '082102',
            'journal_title': 'J. Math. Phys.',
            'journal_volume': '55',
            'pubinfo_freetext': freetext,
            'year': 2014,
        },
    ] == extracted
def test_populate_bookautocomplete_from_isbns_values():
    """For a book record, ISBN values end up in ``bookautocomplete``."""
    hep_properties = load_schema('hep')['properties']
    doc_type_subschema = hep_properties['document_type']
    self_subschema = hep_properties['self']
    isbns_subschema = hep_properties['isbns']

    raw_record = {
        '$schema': 'http://localhost:5000/records/schemas/hep.json',
        'document_type': ['book'],
        'isbns': [
            {'value': '0201021153'},
        ],
        'self': {'$ref': 'http://localhost:5000/api/literature/1519486'},
    }
    record = InspireRecord(raw_record, model=RecordMetadata)

    assert validate(record['document_type'], doc_type_subschema) is None
    assert validate(record['isbns'], isbns_subschema) is None
    assert validate(record['self'], self_subschema) is None

    populate_bookautocomplete(record)

    suggestions = record['bookautocomplete']
    assert {'input': ['0201021153']} == suggestions
def test_prepare_keywords_does_nothing_if_no_keywords_were_predicted():
    """With empty ``extra_data`` the existing keywords are left unchanged."""
    keywords_subschema = load_schema('hep')['properties']['keywords']

    data = {
        'keywords': [
            {
                'schema': 'INSPIRE',
                'value': 'field theory: conformal',
            },
        ],
    }
    assert validate(data['keywords'], keywords_subschema) is None

    workflow_obj = MockObj(data, {})
    engine = MockEng()

    assert prepare_keywords(workflow_obj, engine) is None

    after = workflow_obj.data
    assert validate(after['keywords'], keywords_subschema) is None
    assert [
        {
            'schema': 'INSPIRE',
            'value': 'field theory: conformal',
        },
    ] == after['keywords']
def test_arxiv_author_list_handles_multiple_author_xml_files():
    """Authors are read from a tarball fixture named '...multiple_author_lists'."""
    hep_properties = load_schema('hep')['properties']
    eprints_subschema = hep_properties['arxiv_eprints']

    fixture_path = pkg_resources.resource_filename(
        __name__,
        os.path.join('fixtures', '1703.09986.multiple_author_lists.tar.gz'),
    )

    # record/1519995
    data = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': [
            {
                'categories': ['hep-ex'],
                'value': '1703.09986',
            },
        ],
    }
    validate(data['arxiv_eprints'], eprints_subschema)

    tarball_entry = AttrDict({'file': AttrDict({'uri': fixture_path})})
    files = MockFiles({'1703.09986.tar.gz': tarball_entry})

    workflow_obj = MockObj(data, {}, files=files)
    engine = MockEng()

    # arxiv_author_list() is a factory; the task it returns does the work.
    task = arxiv_author_list()
    task(workflow_obj, engine)

    authors_subschema = hep_properties['authors']
    expected_authors = [
        {
            'affiliations': [{'value': 'Yerevan Phys. Inst.'}],
            'ids': [
                {'value': 'INSPIRE-00312131', 'schema': 'INSPIRE ID'},
                {'value': 'CERN-432142', 'schema': 'CERN'},
            ],
            'full_name': 'Sirunyan, Albert M',
        },
        {
            'affiliations': [{'value': 'Yerevan Phys. Inst.'}],
            'ids': [
                {'value': 'INSPIRE-00312132', 'schema': 'INSPIRE ID'},
                {'value': 'CERN-432143', 'schema': 'CERN'},
            ],
            'full_name': 'Weary, Jake',
        },
    ]
    validate(expected_authors, authors_subschema)

    assert workflow_obj.data.get('authors') == expected_authors
def test_fuzzy_match_returns_true_if_something_matched_with_publication_info(mock_match, enable_fuzzy_matcher):
    """A fuzzy match stores the control number, title and publication info."""
    hep_properties = load_schema('hep')['properties']
    pubinfo_subschema = hep_properties['publication_info']
    titles_subschema = hep_properties['titles']

    matched_record = {
        'control_number': 1472986,
        'titles': [
            {'title': 'title'},
        ],
        'publication_info': [
            {
                'artid': '054021',
                'journal_issue': '5',
                'journal_title': 'Phys.Rev.D',
                'journal_volume': '94',
                'pubinfo_freetext': 'Phys. Rev. D94 (2016) 054021',
                'year': 2016,
            },
        ],
    }
    assert validate(matched_record['titles'], titles_subschema) is None
    assert validate(matched_record['publication_info'], pubinfo_subschema) is None

    # The mocked matcher yields exactly one hit wrapping the record above.
    mock_match.return_value = iter([{'_source': matched_record}])

    workflow_obj = MockObj({}, {})
    engine = MockEng()

    assert fuzzy_match(workflow_obj, engine)
    assert 'matches' in workflow_obj.extra_data

    fuzzy_matches = get_value(workflow_obj.extra_data, 'matches.fuzzy')
    assert [
        {
            'control_number': 1472986,
            'title': 'title',
            'publication_info': [
                {
                    'artid': '054021',
                    'journal_issue': '5',
                    'journal_title': 'Phys.Rev.D',
                    'journal_volume': '94',
                    'pubinfo_freetext': 'Phys. Rev. D94 (2016) 054021',
                    'year': 2016,
                },
            ],
        },
    ] == fuzzy_matches
def test_fuzzy_match_returns_true_if_something_matched_with_more_than_1_public_notes(mock_match, enable_fuzzy_matcher):
    """Stored public notes keep only their 'value'; 'source' is not expected."""
    hep_properties = load_schema('hep')['properties']
    public_notes_subschema = hep_properties['public_notes']
    titles_subschema = hep_properties['titles']

    matched_record = {
        'control_number': 1472986,
        'titles': [
            {'title': 'title'},
        ],
        'public_notes': [
            {
                'source': 'arXiv',
                'value': '4 pages, 4 figures',
            },
            {
                'source': 'arXiv',
                'value': 'Some other public note',
            },
        ],
    }
    assert validate(matched_record['titles'], titles_subschema) is None
    assert validate(matched_record['public_notes'], public_notes_subschema) is None

    # The mocked matcher yields exactly one hit wrapping the record above.
    mock_match.return_value = iter([{'_source': matched_record}])

    workflow_obj = MockObj({}, {})
    engine = MockEng()

    assert fuzzy_match(workflow_obj, engine)
    assert 'matches' in workflow_obj.extra_data

    fuzzy_matches = get_value(workflow_obj.extra_data, 'matches.fuzzy')
    assert [
        {
            'control_number': 1472986,
            'title': 'title',
            'public_notes': [
                {'value': '4 pages, 4 figures'},
                {'value': 'Some other public note'},
            ],
        },
    ] == fuzzy_matches
def test_arxiv_plot_extract_logs_when_images_are_invalid(mock_process_tarball):
    """A ``DelegateError`` from tarball processing is logged as an error."""
    mock_process_tarball.side_effect = DelegateError

    eprints_subschema = load_schema('hep')['properties']['arxiv_eprints']

    # synthetic data
    data = {
        'arxiv_eprints': [
            {
                'categories': ['physics.ins-det'],
                'value': '1612.00624',
            },
        ],
    }
    tarball_entry = AttrDict({
        'file': AttrDict({
            'uri': 'http://export.arxiv.org/e-print/1612.00624',
        }),
    })
    files = MockFiles({'1612.00624.tar.gz': tarball_entry})

    assert validate(data['arxiv_eprints'], eprints_subschema) is None

    workflow_obj = MockObj(data, {}, files=files)
    engine = MockEng()

    assert arxiv_plot_extract(workflow_obj, engine) is None

    logged = workflow_obj.log._error.getvalue()
    assert 'Error extracting plots for 1612.00624. Report and skip.' == logged
def test_arxiv_plot_extract_logs_when_tarball_is_invalid(mock_process_tarball):
    """An ``InvalidTarball`` from tarball processing is logged at info level."""
    mock_process_tarball.side_effect = InvalidTarball

    eprints_subschema = load_schema('hep')['properties']['arxiv_eprints']

    # synthetic data
    data = {
        'arxiv_eprints': [
            {
                'categories': ['physics.ins-det'],
                'value': '1612.00626',
            },
        ],
    }
    tarball_entry = AttrDict({
        'file': AttrDict({
            'uri': 'http://export.arxiv.org/e-print/1612.00626',
        }),
    })
    files = MockFiles({'1612.00626.tar.gz': tarball_entry})

    assert validate(data['arxiv_eprints'], eprints_subschema) is None

    workflow_obj = MockObj(data, {}, files=files)
    engine = MockEng()

    assert arxiv_plot_extract(workflow_obj, engine) is None

    logged = workflow_obj.log._info.getvalue()
    assert 'Invalid tarball http://export.arxiv.org/e-print/1612.00626 for arxiv_id 1612.00626' == logged
def test_references_from_999C50_h_m_o_r_y():
    """Round-trip a 999C5 datafield (0/h/m/o/r/y) through ``hep`` and ``hep2marc``."""
    references_subschema = load_schema('hep')['properties']['references']

    # record/1478478
    snippet = (
        '<datafield tag="999" ind1="C" ind2="5">'
        ' <subfield code="0">701721</subfield>'
        ' <subfield code="h">A. Ferrari, P.R. Sala, A. Fasso, and J. Ranft</subfield>'
        ' <subfield code="m">FLUKA: a multi-particle transport code, CERN-10 , INFN/TC_05/11</subfield>'
        ' <subfield code="o">13</subfield>'
        ' <subfield code="r">SLAC-R-773</subfield>'
        ' <subfield code="y">2005</subfield>'
        '</datafield>'
    )

    # MARC -> JSON
    expected_json = [
        {
            'curated_relation': False,
            'record': {
                '$ref': 'http://localhost:5000/api/literature/701721',
            },
            'reference': {
                'authors': [
                    {'full_name': 'Ferrari, A.'},
                    {'full_name': 'Sala, P.R.'},
                    {'full_name': 'Fasso, A.'},
                    {'full_name': 'Ranft, J.'},
                ],
                'label': '13',
                'misc': [
                    'FLUKA: a multi-particle transport code, CERN-10 , INFN/TC_05/11',
                ],
                'publication_info': {'year': 2005},
                'report_numbers': [
                    'SLAC-R-773',
                ],
            },
        },
    ]
    json_record = hep.do(create_record(snippet))

    assert validate(json_record['references'], references_subschema) is None
    assert expected_json == json_record['references']

    # JSON -> MARC
    expected_marc = [
        {
            '0': 701721,
            'h': [
                'Ferrari, A.',
                'Sala, P.R.',
                'Fasso, A.',
                'Ranft, J.',
            ],
            'm': 'FLUKA: a multi-particle transport code, CERN-10 , INFN/TC_05/11',
            'r': [
                'SLAC-R-773',
            ],
            'o': '13',
            'y': 2005,
            'z': 0,
        },
    ]
    marc_record = hep2marc.do(json_record)

    assert expected_marc == marc_record['999C5']
def test_schemas_validate(schema_name):
    """The example document loaded for ``schema_name`` validates against it."""
    api.validate(data=load_example(schema_name), schema_name=schema_name)
def _get_processed_item(item, spider):
    """Run ``item`` through the pipeline and return the validated result.

    The processed record is validated against 'hep' and asserted to be truthy
    before it is returned.
    """
    processed = pipeline.process_item(item, spider)
    validate(processed, 'hep')
    assert processed
    return processed
def xtest_populate_facet_author_name(mocked_get_linked_records_in_field):
    """``facet_author_name`` combines each author's BAI with a display name.

    NOTE(review): the ``x`` prefix keeps pytest from collecting this function;
    this looks like a deliberate disable - confirm before renaming.
    """
    authors_schema_url = 'http://localhost:5000/records/schemas/authors.json'

    # Author records the mocked linked-record lookup will yield.
    linked_authors = [
        {
            '$schema': authors_schema_url,
            'name': {'value': 'Silk, James Brian'},
            '_collections': ['Authors'],
            'ids': [
                {'schema': 'INSPIRE BAI', 'value': 'James.Brian.1'},
            ],
            'control_number': 111,
        },
        {
            '$schema': authors_schema_url,
            'name': {'value': 'Doe, John', 'preferred_name': 'J Doe'},
            '_collections': ['Authors'],
            'ids': [
                {'schema': 'INSPIRE BAI', 'value': 'John.Doe.1'},
            ],
            'control_number': 222,
        },
    ]
    mocked_get_linked_records_in_field.return_value = iter(linked_authors)

    authors_subschema = load_schema('hep')['properties']['authors']

    raw_record = {
        '$schema': 'http://localhost:5000/records/schemas/hep.json',
        'authors': [
            {
                'full_name': 'Silk, James Brian',
                'record': {
                    '$ref': 'https://labs.inspirehep.net/api/literature/111',
                },
            },
            {
                'full_name': 'Doe, John',
                'record': {
                    '$ref': 'https://labs.inspirehep.net/api/literature/222',
                },
            },
            {
                'full_name': 'Rohan, George',
            },
        ],
    }
    record = InspireRecord(raw_record, model=RecordMetadata)

    expected_facet = [
        u'James.Brian.1_James Brian Silk',
        u'John.Doe.1_J Doe',
        u'BAI_George Rohan',
    ]

    assert validate(record['authors'], authors_subschema) is None

    populate_facet_author_name(record)

    assert record['facet_author_name'] == expected_facet
def test_fuzzy_match_returns_true_if_something_matched_with_4_authors(mock_match, enable_fuzzy_matcher):
    """Of four matched authors only the first three are expected in the match."""
    hep_properties = load_schema('hep')['properties']
    authors_subschema = hep_properties['authors']
    titles_subschema = hep_properties['titles']

    matched_record = {
        'control_number': 4328,
        'titles': [
            {'title': 'title'},
        ],
        'authors': [
            {'full_name': 'Author 1'},
            {'full_name': 'Author, 2'},
            {'full_name': 'Author, 3'},
            {'full_name': 'Author, 4'},
        ],
        'authors_count': 4,
    }
    assert validate(matched_record['titles'], titles_subschema) is None
    assert validate(matched_record['authors'], authors_subschema) is None

    # The mocked matcher yields exactly one hit wrapping the record above.
    mock_match.return_value = iter([{'_source': matched_record}])

    workflow_obj = MockObj({}, {})
    engine = MockEng()

    assert fuzzy_match(workflow_obj, engine)
    assert 'matches' in workflow_obj.extra_data

    fuzzy_matches = get_value(workflow_obj.extra_data, 'matches.fuzzy')
    assert [
        {
            'control_number': 4328,
            'title': 'title',
            'authors': [
                {'full_name': 'Author 1'},
                {'full_name': 'Author, 2'},
                {'full_name': 'Author, 3'},
            ],
            'authors_count': 4,
        },
    ] == fuzzy_matches
def test_load_author_advisors():
    """Loading advisors normalizes the name and adds ``curated_relation``."""
    form_data = {
        'advisors': [
            {
                'degree_type': 'bachelor',
                'ids': [
                    {'schema': 'DESY', 'value': 'DESY-55924820881'},
                    {'schema': 'SCOPUS', 'value': '7039712595'},
                    {'schema': 'SCOPUS', 'value': '8752067273'},
                ],
                'name': 'occaecat qui sint in id',
                'record': {'$ref': 'http://1js40iZ'},
            },
        ],
    }

    advisors_subschema = load_schema('authors')['properties']['advisors']

    loaded = Author().load(form_data).data

    expected = {
        '_collections': ['Authors'],
        'advisors': [
            {
                'curated_relation': False,
                'degree_type': 'bachelor',
                'name': 'Id, Occaecat Qui Sint In',
                'ids': [
                    {'schema': 'DESY', 'value': 'DESY-55924820881'},
                    {'schema': 'SCOPUS', 'value': '7039712595'},
                    {'schema': 'SCOPUS', 'value': '8752067273'},
                ],
                'record': {'$ref': 'http://1js40iZ'},
            },
        ],
    }

    assert validate(loaded['advisors'], advisors_subschema) is None
    assert expected == loaded
def test_report_numbers_and_document_type_from_multiple_088__a():
    """Two CDS 088__a fields become report numbers and a 'note' document type.

    The record goes through ``cds2hep_marc`` first (checking ``037__`` and
    ``980__``) and the result is then fed to ``hep`` (checking
    ``report_numbers`` and ``document_type``).
    """
    schema = load_schema('hep')
    subschema_report_numbers = schema['properties']['report_numbers']
    subschema_document_type = schema['properties']['document_type']

    # The snippet used to end with a second opening '<record>' tag, which made
    # the fixture malformed XML; it is now closed with '</record>'.
    snippet = (
        '<record>'
        ' <datafield tag="088" ind1=" " ind2=" ">'
        ' <subfield code="a">ATL-PHYS-CONF-2008-015</subfield>'
        ' </datafield>'
        ' <datafield tag="088" ind1=" " ind2=" ">'
        ' <subfield code="a">ATL-COM-PHYS-2008-052</subfield>'
        ' </datafield>'
        '</record>'
    )  # cds.cern.ch/record/2275456

    expected = {
        '037__': [
            {
                '9': 'CDS',
                'a': 'ATL-PHYS-CONF-2008-015',
            },
            {
                '9': 'CDS',
                'a': 'ATL-COM-PHYS-2008-052',
            },
        ],
        '980__': [
            {
                'a': 'NOTE',
            },
            {
                'a': 'HEP',
            },
            {
                'a': 'CORE',
            },
        ],
    }
    result = cds2hep_marc.do(create_record(snippet))

    assert expected['037__'] == result['037__']
    assert expected['980__'] == result['980__']

    # NOTE(review): 'public_notes' is listed here but never asserted below.
    expected = {
        'document_type': [
            'note',
        ],
        'public_notes': [
            {
                'source': 'CDS',
                'value': 'Preliminary results',
            },
        ],
        'report_numbers': [
            {
                'source': 'CDS',
                'value': 'ATL-PHYS-CONF-2008-015',
            },
            {
                'source': 'CDS',
                'value': 'ATL-COM-PHYS-2008-052',
            },
        ],
    }
    result = hep.do(create_record_from_dict(result))

    assert validate(result['report_numbers'], subschema_report_numbers) is None
    assert validate(result['document_type'], subschema_document_type) is None
    assert expected['report_numbers'] == result['report_numbers']
    assert expected['document_type'] == result['document_type']
def validate_subschema(obj):
    """Assert that the value under ``obj``'s first key validates in 'hep'.

    Only the first key of ``obj`` is checked, against the same-named entry of
    the 'hep' schema properties.
    """
    properties = load_schema('hep')['properties']
    first_key = list(obj.keys())[0]  # python 3 compatibility
    subschema = properties.get(first_key)

    assert validate(obj.get(first_key), subschema) is None
def test_arxiv_author_list_handles_multiple_author_xml_files():
    """Authors are read from a tarball fixture named '...multiple_author_lists'.

    NOTE(review): a test with this exact name also appears earlier in this
    file; inside one module the later definition shadows the earlier one.
    """
    hep_properties = load_schema('hep')['properties']
    eprints_subschema = hep_properties['arxiv_eprints']

    fixture_name = os.path.join('fixtures', '1703.09986.multiple_author_lists.tar.gz')
    fixture_path = pkg_resources.resource_filename(__name__, fixture_name)

    # record/1519995
    eprint = {
        'categories': ['hep-ex'],
        'value': '1703.09986',
    }
    data = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': [eprint],
    }
    validate(data['arxiv_eprints'], eprints_subschema)

    files = MockFiles({
        '1703.09986.tar.gz': AttrDict({
            'file': AttrDict({'uri': fixture_path}),
        }),
    })

    workflow_obj = MockObj(data, {}, files=files)
    engine = MockEng()

    # arxiv_author_list() is a factory; the task it returns does the work.
    extract_authors = arxiv_author_list()
    extract_authors(workflow_obj, engine)

    authors_subschema = hep_properties['authors']
    expected_authors = [
        {
            'affiliations': [{'value': 'Yerevan Phys. Inst.'}],
            'ids': [
                {'value': 'INSPIRE-00312131', 'schema': 'INSPIRE ID'},
                {'value': 'CERN-432142', 'schema': 'CERN'},
            ],
            'full_name': 'Sirunyan, Albert M',
        },
        {
            'affiliations': [{'value': 'Yerevan Phys. Inst.'}],
            'ids': [
                {'value': 'INSPIRE-00312132', 'schema': 'INSPIRE ID'},
                {'value': 'CERN-432143', 'schema': 'CERN'},
            ],
            'full_name': 'Weary, Jake',
        },
    ]
    validate(expected_authors, authors_subschema)

    assert workflow_obj.data.get('authors') == expected_authors
def test_match_reference_on_texkey_has_lower_priority_than_pub_info(isolated_app):
    """When texkey and publication info hit different records, pub info wins."""
    hep_schema_url = 'http://localhost:5000/schemas/records/hep.json'
    shared_titles = [
        {'title': 'The Strongly-Interacting Light Higgs'},
    ]

    # Record 1 is reachable through the texkey only.
    TestRecordMetadata.create_from_kwargs(
        json={
            '$schema': hep_schema_url,
            '_collections': ['Literature'],
            'control_number': 1,
            'document_type': ['article'],
            'texkeys': [
                'MyTexKey:2008fh',
            ],
            'titles': list(shared_titles),
        },
        index_name='records-hep',
    )

    # Record 2 is reachable through the publication info only.
    TestRecordMetadata.create_from_kwargs(
        json={
            '$schema': hep_schema_url,
            '_collections': ['Literature'],
            'control_number': 2,
            'document_type': ['article'],
            'publication_info': [
                {
                    'artid': '100',
                    'journal_title': 'JHEP',
                    'journal_volume': '100',
                    'page_start': '100',
                    'year': 2020,
                },
            ],
            'titles': list(shared_titles),
        },
        index_name='records-hep',
    )

    reference = {
        'reference': {
            'texkey': 'MyTexKey:2008fh',
            'publication_info': {
                'artid': '100',
                'journal_title': 'JHEP',
                'journal_volume': '100',
                'page_start': '100',
                'year': 2020,
            },
        },
    }

    references_subschema = load_schema('hep')['properties']['references']
    assert validate([reference], references_subschema) is None

    matched = match_reference(reference)

    assert matched['record']['$ref'] == 'http://localhost:5000/api/literature/2'
    assert validate([matched], references_subschema) is None
def test_match_references_no_match_when_multiple_match_different_from_previous(isolated_app):
    """Test reference matcher for when inspire-matcher returns multiple matches
    where the matched record id is not the same as the previous matched record id.

    Both stored records carry the erratum publication info, and the single
    reference cites exactly that info; no ``record`` link is expected.
    """
    schema_url = 'http://localhost:5000/schemas/records/hep.json'

    original_record = {
        '$schema': schema_url,
        '_collections': ['Literature'],
        'control_number': 1,
        'document_type': ['article'],
        'publication_info': [
            {
                'artid': '159',
                'journal_title': 'JHEP',
                'journal_volume': '03',
                'page_start': '159',
                'year': 2016,
            },
            {
                'artid': '074',
                'journal_title': 'JHEP',
                'journal_volume': '05',
                'material': 'erratum',
                'page_start': '074',
                'year': 2017,
            },
        ],
    }
    erratum_record = {
        '$schema': schema_url,
        '_collections': ['Literature'],
        'control_number': 2,
        'document_type': ['article'],
        'publication_info': [
            {
                'artid': '074',
                'journal_title': 'JHEP',
                'journal_volume': '05',
                'material': 'erratum',
                'page_start': '074',
                'year': 2017,
            },
        ],
    }

    # Creation order is kept: original first, erratum second.
    for record_json in (original_record, erratum_record):
        TestRecordMetadata.create_from_kwargs(
            json=record_json, index_name='records-hep')

    references = [
        {
            'reference': {
                'publication_info': {
                    'artid': '074',
                    'journal_title': 'JHEP',
                    'journal_volume': '05',
                    'page_start': '074',
                    'year': 2017,
                },
            },
        },
    ]

    references_subschema = load_schema('hep')['properties']['references']
    assert validate(references, references_subschema) is None

    references = match_references(references)

    assert get_value(references[0], 'record') is None
    assert validate(references, references_subschema) is None
def test_add_institution_sorts_by_rank():
    """Positions added in arbitrary rank order come out in the expected order.

    One position is added without a rank; it is expected last.
    """
    positions_subschema = load_schema('authors')['properties']['positions']

    author = AuthorBuilder()

    # Insertion order is deliberately scrambled relative to the expected one.
    for rank in ('MASTER', 'PHD', 'VISITOR', 'STAFF', 'SENIOR', 'OTHER',
                 'UNDERGRADUATE'):
        author.add_institution(institution='Colgate University', rank=rank)
    author.add_institution(institution='Colgate University')
    for rank in ('POSTDOC', 'JUNIOR'):
        author.add_institution(institution='Colgate University', rank=rank)

    expected_rank_order = (
        'STAFF',
        'SENIOR',
        'JUNIOR',
        'VISITOR',
        'POSTDOC',
        'PHD',
        'MASTER',
        'UNDERGRADUATE',
        'OTHER',
    )
    expected = [
        {
            "institution": 'Colgate University',
            "rank": rank,
            "curated_relation": False,
            "current": False,
        }
        for rank in expected_rank_order
    ]
    expected.append({
        "institution": 'Colgate University',
        "curated_relation": False,
        "current": False,
    })

    result = author.obj['positions']

    assert validate(result, positions_subschema) is None
    assert expected == result
def test_authors_from_100__a_0_u_m_and_700__a_0_u_m():
    """Convert CDS 100/700 author fields to HEP MARC, then to ``authors``."""
    authors_subschema = load_schema('hep')['properties']['authors']

    snippet = (
        '<record>'
        ' <datafield tag="100" ind1=" " ind2=" ">'
        ' <subfield code="a">Joram, Christian</subfield>'
        ' <subfield code="0">AUTHOR|(INSPIRE)INSPIRE-00093928</subfield>'
        ' <subfield code="0">AUTHOR|(SzGeCERN)403463</subfield>'
        ' <subfield code="0">AUTHOR|(CDS)2068232</subfield>'
        ' <subfield code="u">CERN</subfield>'
        ' <subfield code="m">[email protected]</subfield>'
        ' </datafield>'
        ' <datafield tag="700" ind1=" " ind2=" ">'
        ' <subfield code="a">Pons, Xavier</subfield>'
        ' <subfield code="0">AUTHOR|(CDS)2067681</subfield>'
        ' <subfield code="0">AUTHOR|(SzGeCERN)531402</subfield>'
        ' <subfield code="u">CERN</subfield>'
        ' <subfield code="m">[email protected]</subfield>'
        ' </datafield>'
        '</record>'
    )  # record/2295263

    # Step 1: CDS MARC -> HEP MARC.
    expected_marc = {
        '100__': [
            {
                'a': 'Joram, Christian',
                'i': ['INSPIRE-00093928'],
                'j': ['CCID-403463'],
                'u': 'CERN',
                'm': '*****@*****.**',
            },
        ],
        '700__': [
            {
                'a': 'Pons, Xavier',
                'j': ['CCID-531402'],
                'u': 'CERN',
                'm': '*****@*****.**',
            },
        ],
    }
    marc_result = cds2hep_marc.do(create_record(snippet))

    assert expected_marc['100__'] == marc_result['100__']
    assert expected_marc['700__'] == marc_result['700__']

    # Step 2: HEP MARC -> HEP JSON.
    expected_authors = [
        {
            'full_name': 'Joram, Christian',
            'ids': [
                {
                    'schema': 'INSPIRE ID',
                    'value': 'INSPIRE-00093928',
                },
                {
                    'schema': 'CERN',
                    'value': 'CERN-403463',
                },
            ],
            'affiliations': [{'value': 'CERN'}],
            'emails': ['*****@*****.**'],
        },
        {
            'full_name': 'Pons, Xavier',
            'ids': [
                {
                    'schema': 'CERN',
                    'value': 'CERN-531402',
                },
            ],
            'affiliations': [{'value': 'CERN'}],
            'emails': ['*****@*****.**'],
        },
    ]
    hep_result = hep.do(create_record_from_dict(marc_result))

    assert validate(hep_result['authors'], authors_subschema) is None
    assert expected_authors == hep_result['authors']
def validate(self):
    """Validate the record, also ensuring format compliance.

    Passes ``self`` as the only argument to the ``validate`` callable that
    is in scope for this function body; nothing is returned explicitly.
    """
    # NOTE(review): presumably this is a method, in which case the bare name
    # ``validate`` below resolves to a module-level function rather than to
    # this method -- confirm against the enclosing file.
    validate(self)
def test_report_numbers_and_document_type_and_publicate_notes_from_037__a():
    """A CDS report-number field yields report number, public note and doc type.

    The snippet is first converted to HEP MARC (037/500/980 fields are
    checked), then to HEP JSON, where the three resulting keys are validated
    and compared.
    """
    properties = load_schema('hep')['properties']
    subschema_report_numbers = properties['report_numbers']
    subschema_document_type = properties['document_type']
    subschema_public_notes = properties['public_notes']

    snippet = (
        '<datafield tag="088" ind1=" " ind2=" ">'
        ' <subfield code="a">CMS-PAS-SMP-15-001</subfield>'
        '</datafield>'
    )  # cds.cern.ch/record/2202807

    expected_marc = {
        '037__': [
            {
                '9': 'CDS',
                'a': 'CMS-PAS-SMP-15-001',
            },
        ],
        '500__': [
            {
                '9': 'CDS',
                'a': 'Preliminary results',
            },
        ],
        '980__': [
            {'a': 'NOTE'},
            {'a': 'HEP'},
            {'a': 'CORE'},
        ],
    }
    marc_result = cds2hep_marc.do(create_record(snippet))

    assert expected_marc['037__'] == marc_result['037__']
    assert expected_marc['500__'] == marc_result['500__']
    assert expected_marc['980__'] == marc_result['980__']

    expected_hep = {
        'document_type': [
            'note',
        ],
        'public_notes': [
            {
                'source': 'CDS',
                'value': 'Preliminary results',
            },
        ],
        'report_numbers': [
            {
                'source': 'CDS',
                'value': 'CMS-PAS-SMP-15-001',
            },
        ],
    }
    hep_result = hep.do(create_record_from_dict(marc_result))

    assert validate(hep_result['report_numbers'], subschema_report_numbers) is None
    assert validate(hep_result['public_notes'], subschema_public_notes) is None
    assert validate(hep_result['document_type'], subschema_document_type) is None

    assert expected_hep['report_numbers'] == hep_result['report_numbers']
    assert expected_hep['public_notes'] == hep_result['public_notes']
    assert expected_hep['document_type'] == hep_result['document_type']
def test_match_references_matches_when_multiple_match_if_same_as_previous(
        inspire_app):
    """Test reference matcher for when inspire-matcher returns multiple matches
    where the matched record id is one of the previous matched record id as well.

    The second reference cites the erratum info shared by both records and is
    expected to link to record 1, like the first reference.
    """
    schema_url = "http://localhost:5000/schemas/records/hep.json"

    original_record = {
        "$schema": schema_url,
        "_collections": ["Literature"],
        "control_number": 1,
        "document_type": ["article"],
        "publication_info": [
            {
                "artid": "159",
                "journal_title": "JHEP",
                "journal_volume": "03",
                "page_start": "159",
                "year": 2016,
            },
            {
                "artid": "074",
                "journal_title": "JHEP",
                "journal_volume": "05",
                "material": "erratum",
                "page_start": "074",
                "year": 2017,
            },
        ],
    }
    erratum_record = {
        "$schema": schema_url,
        "_collections": ["Literature"],
        "control_number": 2,
        "document_type": ["article"],
        "publication_info": [
            {
                "artid": "074",
                "journal_title": "JHEP",
                "journal_volume": "05",
                "material": "erratum",
                "page_start": "074",
                "year": 2017,
            },
        ],
    }

    create_record("lit", data=original_record)
    create_record("lit", data=erratum_record)

    cites_original = {
        "reference": {
            "publication_info": {
                "artid": "159",
                "journal_title": "JHEP",
                "journal_volume": "03",
                "page_start": "159",
                "year": 2016,
            }
        }
    }
    cites_erratum = {
        "reference": {
            "publication_info": {
                "artid": "074",
                "journal_title": "JHEP",
                "journal_volume": "05",
                "page_start": "074",
                "year": 2017,
            }
        }
    }
    references = [cites_original, cites_erratum]

    references_subschema = load_schema("hep")["properties"]["references"]
    assert validate(references, references_subschema) is None

    match_result = match_references(references)
    matched_references = match_result["matched_references"]

    second_ref_target = matched_references[1]["record"]["$ref"]
    assert second_ref_target == "http://localhost:5000/api/literature/1"
    assert validate(matched_references, references_subschema) is None

    assert match_result["any_link_modified"]
    assert match_result["added_recids"] == [1, 1]
    assert match_result["removed_recids"] == []
def test_populate_experiment_suggest():
    """``populate_experiment_suggest`` fills ``experiment_suggest`` inputs.

    The expected ``input`` list gathers the accelerator, collaboration,
    experiment names, institution, legacy name, long name and name variants
    of the record, in that order.
    """
    properties = load_schema('experiments')['properties']

    record = {
        '$schema': 'http://foo/experiments.json',
        'self': {
            '$ref': 'https://localhost:5000/api/experiments/bar',
        },
        'legacy_name': 'foo',
        'long_name': 'foobarbaz',
        'name_variants': [
            'bar',
            'baz',
        ],
        'collaboration': {
            'value': 'D0',
        },
        'accelerator': {
            'value': 'LHC',
        },
        'experiment': {
            'short_name': 'SHINE',
            'value': 'NA61',
        },
        'institutions': [
            {
                'value': 'ICN',
            },
        ],
    }
    record = InspireRecord(record, model=RecordMetadata)

    # Same fields, same order as the individual assertions they replace.
    validated_fields = (
        'legacy_name',
        'long_name',
        'name_variants',
        'collaboration',
        'accelerator',
        'institutions',
        'experiment',
    )
    for field in validated_fields:
        assert validate(record[field], properties[field]) is None

    populate_experiment_suggest(record)

    expected = {
        'input': [
            'LHC',
            'D0',
            'SHINE',
            'NA61',
            'ICN',
            'foo',
            'foobarbaz',
            'bar',
            'baz',
        ],
    }
    result = record['experiment_suggest']

    assert expected == result
def test_match_reference_on_texkey_has_lower_priority_than_pub_info(
        inspire_app):
    """A reference with both a texkey and pub info links to the pub-info record.

    Record 1 carries the texkey, record 2 the publication info. The matched
    reference must point at record 2, while the control-number matcher is
    expected to return both records (order-insensitive, no duplicates).
    """
    texkey_record = {
        "$schema": "http://localhost:5000/schemas/records/hep.json",
        "_collections": ["Literature"],
        "control_number": 1,
        "document_type": ["article"],
        "texkeys": ["MyTexKey:2008fh"],
        "titles": [{"title": "The Strongly-Interacting Light Higgs"}],
    }
    create_record("lit", texkey_record)

    pub_info_record = {
        "$schema": "http://localhost:5000/schemas/records/hep.json",
        "_collections": ["Literature"],
        "control_number": 2,
        "document_type": ["article"],
        "publication_info": [
            {
                "artid": "100",
                "journal_title": "JHEP",
                "journal_volume": "100",
                "page_start": "100",
                "year": 2020,
            }
        ],
        "titles": [{"title": "The Strongly-Interacting Light Higgs"}],
    }
    create_record("lit", pub_info_record)

    reference = {
        "reference": {
            "texkey": "MyTexKey:2008fh",
            "publication_info": {
                "artid": "100",
                "journal_title": "JHEP",
                "journal_volume": "100",
                "page_start": "100",
                "year": 2020,
            },
        }
    }

    references_subschema = load_schema("hep")["properties"]["references"]
    assert validate([reference], references_subschema) is None

    reference = match_reference(reference)

    linked_ref = reference["record"]["$ref"]
    assert linked_ref == "http://localhost:5000/api/literature/2"
    assert validate([reference], references_subschema) is None

    expected_control_numbers = [2, 1]
    result_control_numbers = match_reference_control_numbers(reference)

    assert set(expected_control_numbers) == set(result_control_numbers)
    assert len(expected_control_numbers) == len(result_control_numbers)
def test_ids_from_035__a_9_with_cern_malformed():
    """Malformed CERN ids in 035 fields are normalised to ``CERN-<number>``.

    The normalised ids must validate, and converting back to MARC must emit
    the normalised values.
    """
    ids_subschema = load_schema('authors')['properties']['ids']

    snippet = (
        '<record>'
        ' <datafield tag="035" ind1=" " ind2=" ">'
        ' <subfield code="9">CERN</subfield>'
        ' <subfield code="a">CERN-CERN-645257</subfield>'
        ' </datafield>'  # record/1030771
        ' <datafield tag="035" ind1=" " ind2=" ">'
        ' <subfield code="9">CERN</subfield>'
        ' <subfield code="a">cern-783683</subfield>'
        ' </datafield>'  # record/1408145
        ' <datafield tag="035" ind1=" " ind2=" ">'
        ' <subfield code="9">CERN</subfield>'
        ' <subfield code="a">CERM-724319</subfield>'
        ' </datafield>'  # record/1244430
        ' <datafield tag="035" ind1=" " ind2=" ">'
        ' <subfield code="9">CERN</subfield>'
        ' <subfield code="a">CNER-727986</subfield>'
        ' </datafield>'  # record/1068077
        ' <datafield tag="035" ind1=" " ind2=" ">'
        ' <subfield code="9">CERN</subfield>'
        ' <subfield code="a">CVERN-765559</subfield>'
        ' </datafield>'  # record/1340631
        '</record>'
    )

    normalised_values = (
        'CERN-645257',
        'CERN-783683',
        'CERN-724319',
        'CERN-727986',
        'CERN-765559',
    )

    expected_ids = [
        {'schema': 'CERN', 'value': value} for value in normalised_values
    ]
    json_result = hepnames.do(create_record(snippet))

    assert validate(json_result['ids'], ids_subschema) is None
    assert expected_ids == json_result['ids']

    expected_marc = [{'9': 'CERN', 'a': value} for value in normalised_values]
    marc_result = hepnames2marc.do(json_result)

    assert expected_marc == marc_result['035']
def test_match_references_no_match_when_multiple_match_different_from_previous(
        inspire_app,
):
    """Test reference matcher for when inspire-matcher returns multiple matches
    where the matched record id is not the same as the previous matched record id.

    Both stored records carry the erratum publication info and the only
    reference cites it; no ``record`` link is expected on the result.
    """
    schema_url = "http://localhost:5000/schemas/records/hep.json"

    original_record = {
        "$schema": schema_url,
        "_collections": ["Literature"],
        "control_number": 1,
        "document_type": ["article"],
        "publication_info": [
            {
                "artid": "159",
                "journal_title": "JHEP",
                "journal_volume": "03",
                "page_start": "159",
                "year": 2016,
            },
            {
                "artid": "074",
                "journal_title": "JHEP",
                "journal_volume": "05",
                "material": "erratum",
                "page_start": "074",
                "year": 2017,
            },
        ],
    }
    erratum_record = {
        "$schema": schema_url,
        "_collections": ["Literature"],
        "control_number": 2,
        "document_type": ["article"],
        "publication_info": [
            {
                "artid": "074",
                "journal_title": "JHEP",
                "journal_volume": "05",
                "material": "erratum",
                "page_start": "074",
                "year": 2017,
            },
        ],
    }

    # Creation order is kept: original first, erratum second.
    for record_json in (original_record, erratum_record):
        create_record("lit", data=record_json)

    references = [
        {
            "reference": {
                "publication_info": {
                    "artid": "074",
                    "journal_title": "JHEP",
                    "journal_volume": "05",
                    "page_start": "074",
                    "year": 2017,
                }
            }
        },
    ]

    references_subschema = load_schema("hep")["properties"]["references"]
    assert validate(references, references_subschema) is None

    references = match_references(references)

    assert get_value(references[0], "record") is None
    assert validate(references, references_subschema) is None
def test_schemas_validate_negative(schema_name):
    """An example altered by ``change_something`` must fail validation."""
    altered_example = change_something(load_example(schema_name))

    with pytest.raises(jsonschema.ValidationError):
        api.validate(data=altered_example, schema_name=schema_name)
def test_collaborations_from_multiple_710__g_0_and_710__g():
    """Several 710 fields, with and without ``$0``, become collaborations.

    Fields with a ``$0`` subfield get a ``record`` reference; converting back
    to MARC is expected to emit only the ``g`` subfield.
    """
    collaborations_subschema = load_schema('hep')['properties']['collaborations']

    snippet = (
        '<record>'
        ' <datafield tag="710" ind1=" " ind2=" ">'
        ' <subfield code="g">ANTARES</subfield>'
        ' <subfield code="0">1110619</subfield>'
        ' </datafield>'
        ' <datafield tag="710" ind1=" " ind2=" ">'
        ' <subfield code="g">IceCube</subfield>'
        ' <subfield code="0">1108514</subfield>'
        ' </datafield>'
        ' <datafield tag="710" ind1=" " ind2=" ">'
        ' <subfield code="g">LIGO Scientific</subfield>'
        ' </datafield>'
        ' <datafield tag="710" ind1=" " ind2=" ">'
        ' <subfield code="g">Virgo</subfield>'
        ' <subfield code="0">1110601</subfield>'
        ' </datafield>'
        '</record>'
    )  # record/1422032

    expected_json = [
        {
            'record': {
                '$ref': 'http://localhost:5000/api/experiments/1110619',
            },
            'value': 'ANTARES',
        },
        {
            'record': {
                '$ref': 'http://localhost:5000/api/experiments/1108514',
            },
            'value': 'IceCube',
        },
        {
            'value': 'LIGO Scientific',
        },
        {
            'record': {
                '$ref': 'http://localhost:5000/api/experiments/1110601',
            },
            'value': 'Virgo',
        },
    ]
    json_result = hep.do(create_record(snippet))

    assert validate(json_result['collaborations'], collaborations_subschema) is None
    assert expected_json == json_result['collaborations']

    expected_marc = [
        {'g': name}
        for name in ('ANTARES', 'IceCube', 'LIGO Scientific', 'Virgo')
    ]
    marc_result = hep2marc.do(json_result)

    assert expected_marc == marc_result['710']
def test_addresses_from_371__triple_a_b_d_e_g_and_371__triple_a_b_d_e_g_x():
    """Two 371 fields with three ``$a`` lines each become two addresses."""
    addresses_subschema = load_schema('institutions')['properties']['addresses']

    snippet = (
        '<record>'
        ' <datafield tag="371" ind1=" " ind2=" ">'
        ' <subfield code="a">Université Libre de Bruxelles (ULB)</subfield>'
        ' <subfield code="a">Boulevard du Triomphe, 2</subfield>'
        ' <subfield code="a">B-1050 Bruxelles</subfield>'
        ' <subfield code="b">Brussels</subfield>'
        ' <subfield code="d">Belgium</subfield>'
        ' <subfield code="e">1050</subfield>'
        ' <subfield code="g">BE</subfield>'
        ' </datafield>'
        ' <datafield tag="371" ind1=" " ind2=" ">'
        ' <subfield code="a">Vrije Universiteit VUB</subfield>'
        ' <subfield code="a">Pleinlaan 2</subfield>'
        ' <subfield code="a">B-1050 Brussel</subfield>'
        ' <subfield code="b">Brussels</subfield>'
        ' <subfield code="d">Belgium</subfield>'
        ' <subfield code="e">1050</subfield>'
        ' <subfield code="g">BE</subfield>'
        ' <subfield code="x">secondary</subfield>'
        ' </datafield>'
        '</record>'
    )  # record/902696

    ulb_address = {
        'cities': [
            'Brussels',
        ],
        'country_code': 'BE',
        'postal_address': [
            u'Université Libre de Bruxelles (ULB)',
            'Boulevard du Triomphe, 2',
            'B-1050 Bruxelles',
        ],
        'postal_code': '1050',
    }
    vub_address = {
        'cities': [
            'Brussels',
        ],
        'country_code': 'BE',
        'postal_address': [
            'Vrije Universiteit VUB',
            'Pleinlaan 2',
            'B-1050 Brussel',
        ],
        'postal_code': '1050',
    }
    expected = [ulb_address, vub_address]

    result = institutions.do(create_record(snippet))

    assert validate(result['addresses'], addresses_subschema) is None
    assert expected == result['addresses']
def test_figures_order_from_FFT():
    """Figures built from FFT fields are ordered FIG10, FIG11, FIG12.

    The FFT fields appear in the snippet as FIG11, FIG10, FIG12; the expected
    captions drop the leading numeric prefix of the ``$d`` subfield, and no
    ``documents`` key may be produced.
    """
    schema = load_schema('hep')
    subschema = schema['properties']['figures']

    # Literals containing ``\mathrm`` are raw strings: ``\m`` is not a valid
    # escape sequence, and raw strings keep the exact same value without
    # relying on the deprecated fall-through behaviour.
    snippet = (
        '<record>'
        ' <datafield tag="FFT" ind1=" " ind2=" ">'
        ' <subfield code="a">/opt/cds-invenio/var/data/files/g151/3037400/content.png;1</subfield>'
        r' <subfield code="d">00010 Co-simulation results, at $50~\mathrm{ms}$...</subfield>'
        ' <subfield code="f">.png</subfield>'
        ' <subfield code="n">FIG11</subfield>'
        ' <subfield code="r"/>'
        ' <subfield code="s">2017-10-04 07:54:54</subfield>'
        ' <subfield code="t">Main</subfield>'
        ' <subfield code="v">1</subfield>'
        ' <subfield code="z"/>'
        ' </datafield>'
        ' <datafield tag="FFT" ind1=" " ind2=" ">'
        ' <subfield code="a">/opt/cds-invenio/var/data/files/g151/3037399/content.png;1</subfield>'
        r' <subfield code="d">00009 Co-simulation results, at $50~\mathrm{ms}$...</subfield>'
        ' <subfield code="f">.png</subfield>'
        ' <subfield code="n">FIG10</subfield>'
        ' <subfield code="r"/>'
        ' <subfield code="s">2017-10-04 07:54:54</subfield>'
        ' <subfield code="t">Main</subfield>'
        ' <subfield code="v">1</subfield>'
        ' <subfield code="z"/>'
        ' </datafield>'
        ' <datafield tag="FFT" ind1=" " ind2=" ">'
        ' <subfield code="a">/opt/cds-invenio/var/data/files/g151/3037401/content.png;1</subfield>'
        r' <subfield code="d">00011 Co-simulation results, at $50~\mathrm{ms}$...</subfield>'
        ' <subfield code="f">.png</subfield>'
        ' <subfield code="n">FIG12</subfield>'
        ' <subfield code="r"/>'
        ' <subfield code="s">2017-10-04 07:54:54</subfield>'
        ' <subfield code="t">Main</subfield>'
        ' <subfield code="v">1</subfield>'
        ' <subfield code="z"/>'
        ' </datafield>'
        '</record>'
    )  # record/1628455

    expected = [
        {
            'key': 'FIG10.png',
            'caption': r'Co-simulation results, at $50~\mathrm{ms}$...',
            'url': 'file:///afs/cern.ch/project/inspire/PROD/var/data/files/g151/3037399/content.png%3B1',
        },
        {
            'key': 'FIG11.png',
            'caption': r'Co-simulation results, at $50~\mathrm{ms}$...',
            'url': 'file:///afs/cern.ch/project/inspire/PROD/var/data/files/g151/3037400/content.png%3B1',
        },
        {
            'key': 'FIG12.png',
            'caption': r'Co-simulation results, at $50~\mathrm{ms}$...',
            'url': 'file:///afs/cern.ch/project/inspire/PROD/var/data/files/g151/3037401/content.png%3B1',
        },
    ]
    result = hep.do(create_record(snippet))

    assert validate(result['figures'], subschema) is None
    assert expected == result['figures']
    assert 'documents' not in result
def test_references_from_999C50_9_r_u_h_m_o():
    """A curator-marked 999C5 field round-trips through HEP JSON.

    The ``$9 CURATOR`` subfield is expected to yield ``legacy_curated: True``
    and the ``$0`` subfield a ``record`` reference; the way back to MARC must
    reproduce the subfields plus ``z: 0``.
    """
    references_subschema = load_schema('hep')['properties']['references']

    snippet = (
        '<datafield tag="999" ind1="C" ind2="5">'
        ' <subfield code="0">1511470</subfield>'
        ' <subfield code="9">CURATOR</subfield>'
        ' <subfield code="r">urn:nbn:de:hebis:77-diss-1000009520</subfield>'
        ' <subfield code="u">http://www.diss.fu-berlin.de/diss/receive/FUDISS_thesis_000000094316</subfield>'
        ' <subfield code="h">K. Wiebe</subfield>'
        ' <subfield code="m">Ph.D. thesis, University of Mainz, in preparation</subfield>'
        ' <subfield code="o">51</subfield>'
        '</datafield>'
    )  # record/1504897

    expected_json = [
        {
            'curated_relation': False,
            'legacy_curated': True,
            'record': {
                '$ref': 'http://localhost:5000/api/literature/1511470',
            },
            'reference': {
                'authors': [
                    {'full_name': 'Wiebe, K.'},
                ],
                'label': '51',
                'misc': [
                    'Ph.D. thesis, University of Mainz, in preparation',
                ],
                'report_numbers': [
                    'urn:nbn:de:hebis:77-diss-1000009520',
                ],
                'urls': [
                    {'value': 'http://www.diss.fu-berlin.de/diss/receive/FUDISS_thesis_000000094316'},
                ],
            },
        },
    ]
    json_result = hep.do(create_record(snippet))

    assert validate(json_result['references'], references_subschema) is None
    assert expected_json == json_result['references']

    expected_marc = [
        {
            '0': 1511470,
            '9': 'CURATOR',
            'h': [
                'Wiebe, K.',
            ],
            'r': [
                'urn:nbn:de:hebis:77-diss-1000009520',
            ],
            'm': 'Ph.D. thesis, University of Mainz, in preparation',
            'o': '51',
            'u': [
                'http://www.diss.fu-berlin.de/diss/receive/FUDISS_thesis_000000094316',
            ],
            'z': 0,
        },
    ]
    marc_result = hep2marc.do(json_result)

    assert expected_marc == marc_result['999C5']
def test_authors_from_100__a_u_and_multiple_700__a_u_e():
    """700 fields with ``$e dir.`` are moved to 701 and become supervisors."""
    authors_subschema = load_schema('hep')['properties']['authors']

    snippet = (
        '<record>'
        ' <datafield tag="100" ind1=" " ind2=" ">'
        ' <subfield code="a">Aichinger, Ida</subfield>'
        ' <subfield code="u">Linz U.</subfield>'
        ' </datafield>'
        ' <datafield tag="700" ind1=" " ind2=" ">'
        ' <subfield code="a">Larcher, Gerhard</subfield>'
        ' <subfield code="u">Linz U.</subfield>'
        ' <subfield code="e">dir.</subfield>'
        ' </datafield>'
        ' <datafield tag="700" ind1=" " ind2=" ">'
        ' <subfield code="a">Kersevan, Roberto</subfield>'
        ' <subfield code="u">Linz U.</subfield>'
        ' <subfield code="e">dir.</subfield>'
        ' </datafield>'
        '</record>'
    )  # record/2295265

    # Step 1: CDS MARC -> HEP MARC.
    expected_marc = {
        '100__': [
            {
                'a': 'Aichinger, Ida',
                'u': 'Linz U.',
            },
        ],
        '701__': [
            {
                'a': 'Larcher, Gerhard',
                'e': 'dir.',
                'u': 'Linz U.',
            },
            {
                'a': 'Kersevan, Roberto',
                'e': 'dir.',
                'u': 'Linz U.',
            },
        ],
    }
    marc_result = cds2hep_marc.do(create_record(snippet))

    assert expected_marc['100__'] == marc_result['100__']
    assert expected_marc['701__'] == marc_result['701__']

    # Step 2: HEP MARC -> HEP JSON.
    expected_authors = [
        {
            'full_name': 'Aichinger, Ida',
            'affiliations': [{'value': 'Linz U.'}],
        },
        {
            'full_name': 'Larcher, Gerhard',
            'inspire_roles': ['supervisor'],
            'affiliations': [{'value': 'Linz U.'}],
        },
        {
            'full_name': 'Kersevan, Roberto',
            'inspire_roles': ['supervisor'],
            'affiliations': [{'value': 'Linz U.'}],
        },
    ]
    hep_result = hep.do(create_record_from_dict(marc_result))

    assert validate(hep_result['authors'], authors_subschema) is None
    assert expected_authors == hep_result['authors']
def test_validate_raises_if_no_schema_key():
    """``api.validate`` raises ``SchemaKeyNotFound`` for empty data and no schema name."""
    empty_data = {}

    with pytest.raises(errors.SchemaKeyNotFound):
        api.validate(data=empty_data)
def reporterrors(output):
    """Reports in a friendly way all failed records and corresponding motivation.

    Every ``InspireProdRecords`` row flagged as not valid is converted again
    with ``marcxml2record`` and validated. Failures are grouped by
    ``(collection, stage, error)``, where ``stage`` is ``'dojson'`` for
    conversion errors and ``'validation'`` for schema errors, and the groups
    are written to ``output`` as CSV. Records whose collection is ``DELETED``
    are skipped.

    Args:
        output: path of the CSV file to (over)write.
    """
    def get_collection(marc_record):
        """Return the collection name derived from the record's 980 fields."""
        collections = set()
        for field in force_list(marc_record.get('980__')):
            for v in field.values():
                for e in force_list(v):
                    collections.add(e.upper().strip())
        if 'DELETED' in collections:
            return 'DELETED'
        for collection in collections:
            if collection in REAL_COLLECTIONS:
                return collection
        return 'HEP'

    click.echo("Reporting broken records into {0}".format(output))
    errors = {}
    results = InspireProdRecords.query.filter(InspireProdRecords.valid == False)  # noqa: ignore=F712
    results_length = results.count()
    with click.progressbar(results.yield_per(100), length=results_length) as bar:
        for obj in bar:
            marc_record = create_record(obj.marcxml, keep_singletons=False)
            collection = get_collection(marc_record)
            if 'DELETED' in collection:
                continue
            recid = int(marc_record['001'])

            try:
                json_record = marcxml2record(obj.marcxml)
            except Exception:
                # Deliberately broad: any conversion failure is reported,
                # keyed by its traceback, instead of aborting the run.
                tb = u''.join(traceback.format_tb(sys.exc_info()[2]))
                errors.setdefault((collection, 'dojson', tb), []).append(recid)
                continue

            try:
                validate(json_record)
            except jsonschema.exceptions.ValidationError as err:
                message_lines = str(err).splitlines()
                # Fall back to the first line of the message when there is no
                # "Failed validating" row, rather than raising IndexError and
                # losing the whole report.
                exc = next(
                    (
                        row for row in message_lines
                        if row.startswith('Failed validating')
                    ),
                    message_lines[0] if message_lines else '',
                )
                details = u'\n'.join(
                    dropwhile(
                        lambda x: not x.startswith('On instance'),
                        message_lines
                    )
                )
                errors.setdefault(
                    (collection, 'validation', exc), []
                ).append((recid, details))

    with open(output, "w") as out:
        csv_writer = csv.writer(out)
        # ``items()`` exists on both Python 2 and 3, unlike ``iteritems()``.
        for (collection, stage, error), elements in errors.items():
            if stage == 'dojson':
                # One row per error, listing every affected record URL.
                csv_writer.writerow((
                    collection,
                    stage,
                    error,
                    '\n'.join(
                        'http://inspirehep.net/record/{}'.format(recid)
                        for recid in elements
                    )
                ))
            else:
                # One row per affected record, with its own details.
                for recid, details in elements:
                    csv_writer.writerow((
                        collection,
                        stage,
                        error,
                        'http://inspirehep.net/record/{}'.format(recid),
                        details
                    ))
    click.echo("Dumped errors into {}".format(output))
def _get_record_from_processed_item(item, spider):
    """Run ``item`` through the pipeline and return its validated record.

    Args:
        item: the item handed to ``pipeline.process_item``.
        spider: the spider handed to ``pipeline.process_item``.

    Returns:
        The ``'record'`` entry of the pipeline result, after it has been
        validated with ``validate(..., 'hep')``.
    """
    crawl_result = pipeline.process_item(item, spider)
    # Check for an empty result before subscripting it; after the subscript
    # the assertion could never be the one to fail.
    assert crawl_result
    validate(crawl_result['record'], 'hep')
    return crawl_result['record']
def test_references_from_999C5a_h_o_s_x_y_0():
    """A 999C5 field with DOI, author, pubnote and raw ref round-trips.

    The ``$s`` pubnote is expected to be split into ``publication_info`` and
    the ``$x`` subfield kept as a ``text`` raw reference; the way back to
    MARC must reproduce the subfields plus ``z: 0``.
    """
    references_subschema = load_schema('hep')['properties']['references']

    snippet = (
        '<datafield tag="999" ind1="C" ind2="5">'
        ' <subfield code="a">doi:10.1142/S0217751X0804055X</subfield>'
        ' <subfield code="h">G.K. Leontaris</subfield>'
        ' <subfield code="o">15</subfield>'
        ' <subfield code="s">Int.J.Mod.Phys.,A23,2055</subfield>'
        ' <subfield code="x">Int. J. Mod. Phys. A 23 (doi:10.1142/S0217751X0804055X)</subfield>'
        ' <subfield code="y">2008</subfield>'
        ' <subfield code="0">780399</subfield>'
        '</datafield>'
    )  # record/1478478

    expected_json = [
        {
            'curated_relation': False,
            'record': {
                '$ref': 'http://localhost:5000/api/literature/780399',
            },
            'raw_refs': [
                {
                    'value': 'Int. J. Mod. Phys. A 23 (doi:10.1142/S0217751X0804055X)',
                    'schema': 'text',
                },
            ],
            'reference': {
                'dois': ['10.1142/S0217751X0804055X'],
                'authors': [
                    {'full_name': u'Leontaris, G.K.'},
                ],
                'label': '15',
                'publication_info': {
                    'artid': '2055',
                    'journal_title': 'Int.J.Mod.Phys.A',
                    'journal_volume': '23',
                    'page_start': '2055',
                    'year': 2008,
                },
            },
        },
    ]
    json_result = hep.do(create_record(snippet))

    assert validate(json_result['references'], references_subschema) is None
    assert expected_json == json_result['references']

    expected_marc = [
        {
            'a': [
                'doi:10.1142/S0217751X0804055X',
            ],
            'h': [
                'Leontaris, G.K.',
            ],
            'o': '15',
            's': 'Int.J.Mod.Phys.,A23,2055',
            'x': [
                'Int. J. Mod. Phys. A 23 (doi:10.1142/S0217751X0804055X)',
            ],
            'y': 2008,
            'z': 0,
            '0': 780399,
        },
    ]
    marc_result = hep2marc.do(json_result)

    assert expected_marc == marc_result['999C5']