def test_jlab_ticket_needed_returns_false():
    """No JLab ticket is needed when no category is in JLAB_ARXIV_CATEGORIES."""
    config = {'JLAB_ARXIV_CATEGORIES': ['nucl-th']}

    with patch.dict(current_app.config, config):
        record = {
            'arxiv_eprints': [
                {
                    'categories': ['math.DG'],
                    'value': '1806.03979',
                },
            ],
        }
        obj = MockObj(record, {})
        eng = MockEng()

        assert jlab_ticket_needed(obj, eng) is False
def test_is_arxiv_paper_returns_false_if_source_is_not_present_for_hepcrawl():
    """A hepcrawl record lacking ``source`` is not considered an arXiv paper."""
    schema = load_schema('hep')
    subschema = schema['properties']['acquisition_source']

    record = {
        'acquisition_source': {
            'method': 'hepcrawl',
        },
    }
    assert validate(record['acquisition_source'], subschema) is None

    obj = MockObj(record, {})
    eng = MockEng()

    assert not is_arxiv_paper(obj, eng)
def test_article_exists_returns_true_if_something_matched(mock_match):
    """A match stores the matched control numbers under ``record_matches``."""
    mock_match.return_value = iter([{'_source': {'control_number': 4328}}])

    obj = MockObj({}, {})
    eng = MockEng()

    assert article_exists(obj, eng)
    assert 'record_matches' in obj.extra_data
    assert obj.extra_data['record_matches'] == [4328]
def test_article_exists_returns_false_if_nothing_matched(mock_match):
    """No match still records an empty ``record_matches`` list."""
    mock_match.return_value = iter([])

    obj = MockObj({}, {})
    eng = MockEng()

    assert not article_exists(obj, eng)
    assert 'record_matches' in obj.extra_data
    assert obj.extra_data['record_matches'] == []
def test_exact_match_returns_true_if_something_matched(mock_match):
    """An exact match stores control numbers under ``matches.exact``."""
    mock_match.return_value = iter([{'_source': {'control_number': 4328}}])

    obj = MockObj({}, {})
    eng = MockEng()

    assert exact_match(obj, eng)
    assert 'matches' in obj.extra_data
    assert get_value(obj.extra_data, 'matches.exact') == [4328]
def test_fuzzy_match_returns_false_if_nothing_matched(mock_match, enable_fuzzy_matcher):
    """No fuzzy match still records an empty ``matches.fuzzy`` list."""
    mock_match.return_value = iter([])

    obj = MockObj({}, {})
    eng = MockEng()

    assert not fuzzy_match(obj, eng)
    assert 'matches' in obj.extra_data
    assert get_value(obj.extra_data, 'matches.fuzzy') == []
def test_wait_webcoll_halts_the_workflow_engine_when_in_production_mode():
    """In production mode the engine is halted with a webcoll wait message."""
    with patch.dict(current_app.config, {'PRODUCTION_MODE': True}):
        obj = MockObj({}, {})
        eng = MockEng()

        assert wait_webcoll(obj, eng) is None
        assert eng.msg == 'Waiting for webcoll.'
def test_refextract_from_pdf(mock_get_pdf_in_workflow):
    """References extracted from the PDF carry the record's source."""
    mock_get_pdf_in_workflow.return_value = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1704.00452.pdf'))

    schema = load_schema('hep')
    subschema = schema['properties']['acquisition_source']

    record = {'acquisition_source': {'source': 'arXiv'}}
    assert validate(record['acquisition_source'], subschema) is None

    obj = MockObj(record, {})
    eng = MockEng()

    assert refextract(obj, eng) is None
    assert obj.data['references'][0]['raw_refs'][0]['source'] == 'arXiv'
def test_populate_submission_document_does_not_duplicate_documents():
    """Running the task twice must leave exactly one submission document."""
    with requests_mock.Mocker() as requests_mocker:
        requests_mocker.register_uri(
            'GET', 'http://export.arxiv.org/pdf/1605.03844',
            content=pkg_resources.resource_string(
                __name__, os.path.join('fixtures', '1605.03844.pdf')),
        )

        schema = load_schema('hep')
        subschema = schema['properties']['acquisition_source']

        data = {
            'acquisition_source': {
                'datetime': '2017-11-30T16:38:43.352370',
                'email': '*****@*****.**',
                'internal_uid': 54252,
                'method': 'submitter',
                'orcid': '0000-0002-2174-4493',
                'source': 'submitter',
                'submission_number': '1',
            },
        }
        assert validate(data['acquisition_source'], subschema) is None

        extra_data = {
            'submission_pdf': 'http://export.arxiv.org/pdf/1605.03844',
        }
        obj = MockObj(data, extra_data, files=MockFiles({}))
        eng = MockEng()

        # Run twice: the second call must not append a second document.
        assert populate_submission_document(obj, eng) is None
        assert populate_submission_document(obj, eng) is None

        expected = [
            {
                'fulltext': True,
                'key': 'fulltext.pdf',
                'original_url': 'http://export.arxiv.org/pdf/1605.03844',
                'source': 'submitter',
                'url': 'http://export.arxiv.org/pdf/1605.03844',
            },
        ]
        assert obj.data['documents'] == expected
def test_fuzzy_match_returns_true_if_something_matched_with_1_author(mock_match, enable_fuzzy_matcher):
    """A fuzzy match with one author is summarized under ``matches.fuzzy``."""
    schema = load_schema('hep')
    authors_schema = schema['properties']['authors']
    titles_schema = schema['properties']['titles']

    matched_record = {
        'control_number': 4328,
        'titles': [
            {'title': 'title'},
        ],
        'authors': [
            {'full_name': 'Author 1'},
        ],
        'authors_count': 1,
    }
    assert validate(matched_record['titles'], titles_schema) is None
    assert validate(matched_record['authors'], authors_schema) is None

    mock_match.return_value = iter([{'_source': matched_record}])

    obj = MockObj({}, {})
    eng = MockEng()

    assert fuzzy_match(obj, eng)
    assert 'matches' in obj.extra_data

    expected = [{
        'control_number': 4328,
        'title': 'title',
        'authors': [
            {'full_name': 'Author 1'},
        ],
        'authors_count': 1,
    }]
    assert get_value(obj.extra_data, 'matches.fuzzy') == expected
def test_populate_arxiv_document_does_not_duplicate_files_if_called_multiple_times():
    """Running the task twice must leave exactly one arXiv document entry."""
    with requests_mock.Mocker() as requests_mocker:
        requests_mocker.register_uri(
            'GET', 'http://export.arxiv.org/pdf/1605.03844',
            content=pkg_resources.resource_string(
                __name__, os.path.join('fixtures', '1605.03844.pdf')),
        )

        schema = load_schema('hep')
        subschema = schema['properties']['arxiv_eprints']

        data = {
            'arxiv_eprints': [
                {
                    'categories': ['physics.ins-det'],
                    'value': '1605.03844',
                },
            ],
        }  # literature/1458302
        assert validate(data['arxiv_eprints'], subschema) is None

        obj = MockObj(data, {}, files=MockFiles({}))
        eng = MockEng()

        # Run twice: the second call must not append a second document.
        assert populate_arxiv_document(obj, eng) is None
        assert populate_arxiv_document(obj, eng) is None

        expected = [
            {
                'key': '1605.03844.pdf',
                'fulltext': True,
                'hidden': True,
                'material': 'preprint',
                'original_url': 'http://export.arxiv.org/pdf/1605.03844',
                'url': 'http://export.arxiv.org/pdf/1605.03844',
                'source': 'arxiv',
            },
        ]
        assert obj.data['documents'] == expected
def test_arxiv_author_list_does_not_produce_latex():
    """Authors extracted from the tarball keep plain-text (non-LaTeX) names.

    Fix: the two ``validate(...)`` calls previously discarded their result,
    unlike every other test in this suite which asserts ``validate(...) is
    None`` — a schema violation in the fixtures would have gone unnoticed.
    """
    schema = load_schema('hep')
    filename = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1802.03388.tar.gz'))
    eprints_subschema = schema['properties']['arxiv_eprints']

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'hep-ex',
                ],
                'value': '1802.03388',
            },
        ],
    }
    assert validate(data['arxiv_eprints'], eprints_subschema) is None

    extra_data = {}
    files = MockFiles({
        '1802.03388.tar.gz': AttrDict({'file': AttrDict({'uri': filename})})
    })

    authors_subschema = schema['properties']['authors']
    expected_authors = [
        {
            'affiliations': [{'value': 'Lund U.'}],
            'ids': [
                {
                    'value': 'INSPIRE-00061248',
                    'schema': 'INSPIRE ID'
                }
            ],
            'full_name': u'Åkesson, Torsten Paul Ake'
        },
    ]
    assert validate(expected_authors, authors_subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    default_arxiv_author_list = arxiv_author_list()

    assert default_arxiv_author_list(obj, eng) is None
    assert obj.data.get('authors') == expected_authors
def test_is_arxiv_paper_ignores_case_for_hepcrawl():
    """The hepcrawl ``source`` comparison is case-insensitive ('arXiv')."""
    schema = load_schema('hep')
    subschema = schema['properties']['acquisition_source']

    record = {
        'acquisition_source': {
            'method': 'hepcrawl',
            'source': 'arXiv',
        },
    }
    assert validate(record['acquisition_source'], subschema) is None

    obj = MockObj(record, {})
    eng = MockEng()

    assert is_arxiv_paper(obj, eng)
def test_reply_ticket_calls_tickets_reply_when_template_is_not_set(
        mock_reply_ticket, mock_user):
    """Without a template, reply_ticket forwards the ticket id and reason."""
    mock_user.query.get.return_value = MockUser('*****@*****.**')

    record = {
        'titles': [
            {'title': 'Partial Symmetries of Weak Interactions'},
        ],
    }
    extra_data = {'ticket_id': 1, 'reason': 'reply reason'}

    obj = MockObj(record, extra_data)
    eng = MockEng()

    _reply_ticket = reply_ticket()
    _reply_ticket(obj, eng)

    mock_reply_ticket.assert_called_with(
        extra_data['ticket_id'], extra_data['reason'], False)
def test_arxiv_fulltext_download_retries_on_error():
    """A 500 from arXiv is retried; the second (200) response succeeds."""
    with requests_mock.Mocker() as requests_mocker:
        # First response fails with a 500, second succeeds with the PDF.
        requests_mocker.register_uri(
            'GET', 'http://export.arxiv.org/pdf/1605.03814',
            [
                {
                    'content': '',
                    'status_code': 500,
                },
                {
                    'content': pkg_resources.resource_string(
                        __name__, os.path.join('fixtures', '1605.03814.pdf')),
                    'status_code': 200,
                },
            ],
        )

        schema = load_schema('hep')
        subschema = schema['properties']['arxiv_eprints']

        data = {
            'arxiv_eprints': [
                {
                    'categories': ['hep-ex'],
                    'value': '1605.03814',
                },
            ],
        }  # literature/1458270
        assert validate(data['arxiv_eprints'], subschema) is None

        obj = MockObj(data, {}, files=MockFiles({}))
        eng = MockEng()

        assert arxiv_fulltext_download(obj, eng) is None
        assert obj.log._info.getvalue() == 'PDF retrieved from arXiv for 1605.03814'
def test_send_robotupload_removes_references_if_feature_flag_disabled():
    """record2marcxml receives the record stripped of its references.

    Fix: the original ended with ``assert mock_record2marcxml.called_with(...)``.
    ``called_with`` is not a Mock assertion method — attribute access on a Mock
    auto-creates a child Mock, and calling it returns another (truthy) Mock, so
    the assert could never fail. Use the real ``assert_called_with``.
    """
    with requests_mock.Mocker() as requests_mocker:
        requests_mocker.register_uri(
            'POST', 'http://inspirehep.net/batchuploader/robotupload/insert',
            text='[INFO] foo bar baz')

        schema = load_schema('hep')
        subschema = schema['properties']['references']

        config = {
            'LEGACY_ROBOTUPLOAD_URL': 'http://inspirehep.net',
            'PRODUCTION_MODE': True,
        }
        with patch.dict(current_app.config, config), \
                patch('inspirehep.modules.workflows.tasks.submission.record2marcxml') as mock_record2marcxml:
            data = {
                '$schema': 'http://localhost:5000/schemas/records/hep.json',
                'references': [
                    {
                        'raw_refs': [
                            {
                                'schema': 'text',
                                'value': '[1] J. Maldacena and A. Strominger, hep-th/9710014.',
                            },
                        ],
                    },
                ]
            }
            data_without_references = {
                '$schema': 'http://localhost:5000/schemas/records/hep.json',
            }
            extra_data = {}
            assert validate(data['references'], subschema) is None

            obj = MockObj(data, extra_data)
            eng = MockEng()

            _send_robotupload = send_robotupload(mode='insert', )

            assert _send_robotupload(obj, eng) is None
            mock_record2marcxml.assert_called_with(data_without_references)
def test_download_documents_with_multiple_documents():
    """Both document URLs are rewritten to local file-API URLs."""
    with requests_mock.Mocker() as requests_mocker:
        for eprint in ('1605.03844', '1605.03845'):
            requests_mocker.register_uri(
                'GET', 'http://export.arxiv.org/pdf/' + eprint,
                content=pkg_resources.resource_string(
                    __name__, os.path.join('fixtures', '1605.03844.pdf')),
            )

        schema = load_schema('hep')
        subschema = schema['properties']['documents']

        data = {
            'documents': [
                {
                    'key': '1605.03844.pdf',
                    'url': 'http://export.arxiv.org/pdf/1605.03844'
                },
                {
                    'key': '1605.03845.pdf',
                    'url': 'http://export.arxiv.org/pdf/1605.03845'
                },
            ],
        }  # literature/1458302
        assert validate(data['documents'], subschema) is None

        obj = MockObj(data, {}, files=MockFiles({}))
        eng = MockEng()

        assert download_documents(obj, eng) is None

        documents = obj.data['documents']
        assert len(documents) == 2
        assert documents[0]['url'] == (
            '/api/files/0b9dd5d1-feae-4ba5-809d-3a029b0bc110/1605.03844.pdf')
        assert documents[1]['url'] == (
            '/api/files/0b9dd5d1-feae-4ba5-809d-3a029b0bc110/1605.03845.pdf')
def test_arxiv_derive_inspire_categories_appends_categories_with_different_source():
    """An arXiv-derived category is appended even if the term already exists
    with a different source."""
    schema = load_schema('hep')
    arxiv_eprints_schema = schema['properties']['arxiv_eprints']
    inspire_categories_schema = schema['properties']['inspire_categories']

    data = {
        'arxiv_eprints': [
            {
                'categories': ['nucl-th'],
                'value': '1605.03898',
            },
        ],
        'inspire_categories': [
            {
                'source': 'undefined',
                'term': 'Theory-Nucl',
            },
        ],
    }  # literature/1458300
    assert validate(data['arxiv_eprints'], arxiv_eprints_schema) is None
    assert validate(data['inspire_categories'], inspire_categories_schema) is None

    obj = MockObj(data, {})
    eng = MockEng()

    assert arxiv_derive_inspire_categories(obj, eng) is None

    expected = [
        {
            'source': 'undefined',
            'term': 'Theory-Nucl',
        },
        {
            'source': 'arxiv',
            'term': 'Theory-Nucl',
        },
    ]
    result = obj.data['inspire_categories']

    assert validate(result, inspire_categories_schema) is None
    assert result == expected
def test_set_schema_adds_a_schema_from_the_eng_data_type():
    """When the record has no $schema, it is derived from the engine's data type."""
    schema = load_schema('hep')
    subschema = schema['properties']['$schema']

    obj = MockObj({}, {})
    eng = MockEng(data_type='hep')

    assert set_schema(obj, eng) is None

    result = obj.data
    assert validate(result['$schema'], subschema) is None
    assert result['$schema'] == 'http://localhost:5000/schemas/records/hep.json'
def test_send_robotupload_works_with_hepnames2marc_and_mode_insert():
    """robotupload insert works with the hepnames2marc processor."""
    httpretty.HTTPretty.allow_net_connect = False
    httpretty.register_uri(
        httpretty.POST,
        'http://inspirehep.net/batchuploader/robotupload/insert',
        body='[INFO] foo bar baz')

    schema = load_schema('authors')
    subschema = schema['properties']['arxiv_categories']

    config = {
        'LEGACY_ROBOTUPLOAD_URL': 'http://inspirehep.net',
        'PRODUCTION_MODE': True,
    }
    with patch.dict(current_app.config, config):
        data = {
            'arxiv_categories': ['hep-th'],
        }
        assert validate(data['arxiv_categories'], subschema) is None

        obj = MockObj(data, {})
        eng = MockEng()

        _send_robotupload = send_robotupload(
            marcxml_processor=hepnames2marc,
            mode='insert',
        )

        assert _send_robotupload(obj, eng) is None

        expected_log = (
            'Robotupload sent!'
            '[INFO] foo bar baz'
            'end of upload'
        )
        assert obj.log._info.getvalue() == expected_log
        assert eng.msg == 'Waiting for robotupload: [INFO] foo bar baz'

    httpretty.HTTPretty.allow_net_connect = True
def test_arxiv_author_list_logs_on_error(mock_os, mock_untar):
    """An InvalidTarball from untar is logged, not raised.

    Fix: ``mkdtemp()`` used to be called inside the ``try`` block; if it
    raised, the ``finally`` clause would hit a NameError on
    ``temporary_dir``, masking the original error. It is now created
    before entering the ``try``.
    """
    mock_untar.side_effect = InvalidTarball

    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'hep-th',
                ],
                'value': '1605.07707',
            },
        ],
    }  # synthethic data
    extra_data = {}
    files = MockFiles({
        '1605.07707.tar.gz': AttrDict({
            'file': AttrDict({
                'uri': 'http://export.arxiv.org/e-print/1605.07707',
            })
        })
    })
    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    default_arxiv_author_list = arxiv_author_list()

    temporary_dir = mkdtemp()
    try:
        mock_os.path.abspath.return_value = temporary_dir

        assert default_arxiv_author_list(obj, eng) is None

        expected = 'Invalid tarball http://export.arxiv.org/e-print/1605.07707 for arxiv_id 1605.07707'
        result = obj.log._error.getvalue()

        assert expected == result
    finally:
        rmtree(temporary_dir)
def test_set_schema_does_nothing_when_the_schema_url_is_already_full():
    """An already-absolute $schema URL is left untouched."""
    schema = load_schema('hep')
    subschema = schema['properties']['$schema']

    record = {'$schema': 'http://localhost:5000/schemas/records/hep.json'}
    assert validate(record['$schema'], subschema) is None

    obj = MockObj(record, {})
    eng = MockEng()

    assert set_schema(obj, eng) is None

    result = obj.data
    assert validate(result['$schema'], subschema) is None
    assert result['$schema'] == 'http://localhost:5000/schemas/records/hep.json'
def test_reply_ticket_calls_tickets_reply_with_template_when_template_is_set(
        mock_reply_ticket_with_template, mock_user):
    """With a template, reply_ticket forwards the template path instead of a reason."""
    mock_user.query.get.return_value = MockUser('*****@*****.**')

    record = {
        'titles': [
            {'title': 'Partial Symmetries of Weak Interactions'},
        ],
    }
    extra_data = {'ticket_id': 1}
    template = 'template_path'

    obj = MockObj(record, extra_data)
    eng = MockEng()

    _reply_ticket = reply_ticket(template=template)
    _reply_ticket(obj, eng)

    mock_reply_ticket_with_template.assert_called_with(
        extra_data['ticket_id'], template, {}, False)
def test_create_ticket_calls_tickets_create_with_template(
        mock_create_ticket_with_template, mock_user):
    """create_ticket forwards queue, email, template, context and recid."""
    mock_user.query.get.return_value = MockUser('*****@*****.**')

    record = {
        'titles': [
            {'title': 'Partial Symmetries of Weak Interactions'},
        ],
    }
    template = 'template_path'
    extra_data = {'recid': '1'}

    obj = MockObj(record, extra_data)
    eng = MockEng()

    _create_ticket = create_ticket(template=template)
    _create_ticket(obj, eng)

    mock_create_ticket_with_template.assert_called_with(
        'Test', '*****@*****.**', template, {}, None, extra_data['recid'])
def test_classify_paper_does_not_raise_on_unprintable_keywords(
        get_document_in_workflow, higgs_ontology):
    """classify_paper copes with keywords containing unprintable characters."""
    fixture_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1802.08709.pdf'))

    get_document_in_workflow.return_value.__enter__.return_value = fixture_path
    get_document_in_workflow.return_value.__exit__.return_value = None

    obj = MockObj({}, {})
    eng = MockEng()

    classify_paper(
        taxonomy=higgs_ontology,
        only_core_tags=False,
        spires=True,
        with_author_keywords=True,
        no_cache=True,
    )(obj, eng)  # Does not raise.
def test_is_experimental_paper_does_not_raise_if_obj_has_no_arxiv_category():
    """An eprint without ``categories`` is handled gracefully (not experimental)."""
    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    record = {
        'arxiv_eprints': [
            {'value': '1712.02280'},
        ],
    }
    assert validate(record['arxiv_eprints'], subschema) is None

    obj = MockObj(record, {})
    eng = MockEng()

    assert not is_experimental_paper(obj, eng)
def test_populate_arxiv_document_logs_on_pdf_not_existing():
    """When arXiv serves an HTML error page instead of a PDF, the task logs it."""
    response500 = {'content': '', 'status_code': 500}
    response200 = {
        'content': pkg_resources.resource_string(
            __name__, os.path.join('fixtures', '1707.02785.html')),
        'status_code': 200,
    }
    with requests_mock.Mocker() as requests_mocker:
        requests_mocker.get(
            'http://export.arxiv.org/pdf/1707.02785',
            (response200, ),
        )
        requests_mocker.get(
            'http://arxiv.org/pdf/1707.02785',
            (response500, ),
        )

        schema = load_schema('hep')
        subschema = schema['properties']['arxiv_eprints']

        data = {
            'arxiv_eprints': [
                {
                    'categories': ['cs.CV'],
                    'value': '1707.02785',
                },
            ],
        }  # literature/1458302
        assert validate(data['arxiv_eprints'], subschema) is None

        obj = MockObj(data, {}, files=MockFiles({}))
        eng = MockEng()

        assert populate_arxiv_document(obj, eng) is None
        assert obj.log._info.getvalue() == 'No PDF is available for 1707.02785'
def test_fuzzy_match_returns_true_if_something_matched_with_arxiv_eprints(mock_match, enable_fuzzy_matcher):
    """A fuzzy match exposes the matched record's arXiv eprint in the summary."""
    schema = load_schema('hep')
    arxiv_eprints_schema = schema['properties']['arxiv_eprints']
    titles_schema = schema['properties']['titles']

    matched_record = {
        'control_number': 1472986,
        'titles': [
            {'title': 'title'},
        ],
        'arxiv_eprints': [
            {
                'categories': ['hep-ph'],
                'value': '1606.09129',
            },
        ],
    }
    assert validate(matched_record['titles'], titles_schema) is None
    assert validate(matched_record['arxiv_eprints'], arxiv_eprints_schema) is None

    mock_match.return_value = iter([{'_source': matched_record}])

    obj = MockObj({}, {})
    eng = MockEng()

    assert fuzzy_match(obj, eng)
    assert 'matches' in obj.extra_data

    expected = [{
        'control_number': 1472986,
        'title': 'title',
        'arxiv_eprint': '1606.09129',
    }]
    assert get_value(obj.extra_data, 'matches.fuzzy') == expected
def test_send_robotupload_works_with_mode_insert_on_authors():
    """robotupload insert works for author records in production mode."""
    with requests_mock.Mocker() as requests_mocker:
        requests_mocker.register_uri(
            'POST', 'http://inspirehep.net/batchuploader/robotupload/insert',
            text='[INFO] foo bar baz')

        schema = load_schema('authors')
        subschema = schema['properties']['arxiv_categories']

        config = {
            'LEGACY_ROBOTUPLOAD_URL': 'http://inspirehep.net',
            'PRODUCTION_MODE': True,
        }
        with patch.dict(current_app.config, config):
            data = {
                '$schema': 'http://localhost:5000/schemas/records/authors.json',
                'arxiv_categories': ['hep-th'],
            }
            assert validate(data['arxiv_categories'], subschema) is None

            obj = MockObj(data, {})
            eng = MockEng()

            _send_robotupload = send_robotupload(mode='insert', )

            assert _send_robotupload(obj, eng) is None

            expected_log = (
                'Robotupload sent!'
                '[INFO] foo bar baz'
                'end of upload'
            )
            assert obj.log._info.getvalue() == expected_log
            assert eng.msg == 'Waiting for robotupload: [INFO] foo bar baz'
def test_arxiv_fulltext_download_polulates_documents():
    """A successful download adds the PDF as a hidden preprint document."""
    with requests_mock.Mocker() as requests_mocker:
        requests_mocker.register_uri(
            'GET', 'http://export.arxiv.org/pdf/1605.03844',
            content=pkg_resources.resource_string(
                __name__, os.path.join('fixtures', '1605.03844.pdf')),
        )

        schema = load_schema('hep')
        subschema = schema['properties']['arxiv_eprints']

        data = {
            'arxiv_eprints': [
                {
                    'categories': ['physics.ins-det'],
                    'value': '1605.03844',
                },
            ],
        }  # literature/1458302
        assert validate(data['arxiv_eprints'], subschema) is None

        obj = MockObj(data, {}, files=MockFiles({}))
        eng = MockEng()

        assert arxiv_fulltext_download(obj, eng) is None

        expected = [{
            'fulltext': True,
            'original_url': 'http://export.arxiv.org/pdf/1605.03844',
            'url': '/api/files/0b9dd5d1-feae-4ba5-809d-3a029b0bc110/1605.03844.pdf',
            'material': 'preprint',
            'source': 'arxiv',
            'key': '1605.03844.pdf',
            'hidden': True
        }]
        assert obj.data['documents'] == expected