def test_populate_arxiv_document():
    """Check that ``populate_arxiv_document`` records the arXiv PDF.

    The arXiv PDF URL is served from a local fixture through
    ``requests_mock``. After the task runs, ``obj.data['documents']`` must
    contain exactly one hidden fulltext preprint entry whose URLs point at
    the arXiv PDF.
    """
    pdf_url = 'http://export.arxiv.org/pdf/1605.03844'
    pdf_fixture = os.path.join('fixtures', '1605.03844.pdf')

    with requests_mock.Mocker() as http_mock:
        http_mock.register_uri(
            'GET',
            pdf_url,
            content=pkg_resources.resource_string(__name__, pdf_fixture),
        )

        eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
        record = {
            'arxiv_eprints': [
                {
                    'categories': ['physics.ins-det'],
                    'value': '1605.03844',
                },
            ],
        }  # literature/1458302
        assert validate(record['arxiv_eprints'], eprints_schema) is None

        workflow_obj = MockObj(record, {}, files=MockFiles({}))
        engine = MockEng()

        # The task reports nothing back; its effect is on the object's data.
        assert populate_arxiv_document(workflow_obj, engine) is None

        expected_documents = [
            {
                'key': '1605.03844.pdf',
                'fulltext': True,
                'hidden': True,
                'material': 'preprint',
                'original_url': pdf_url,
                'url': pdf_url,
                'source': 'arxiv',
            },
        ]
        assert expected_documents == workflow_obj.data['documents']
def test_populate_submission_document():
    """Check that ``populate_submission_document`` records the submitted PDF.

    ``extra_data['submission_pdf']`` carries the URL of the submitted PDF,
    which is served from a local fixture. The task must add a single
    fulltext document keyed ``fulltext.pdf`` with source ``submitter``.
    """
    pdf_url = 'http://export.arxiv.org/pdf/1605.03844'
    pdf_fixture = os.path.join('fixtures', '1605.03844.pdf')

    with requests_mock.Mocker() as http_mock:
        http_mock.register_uri(
            'GET',
            pdf_url,
            content=pkg_resources.resource_string(__name__, pdf_fixture),
        )

        source_schema = load_schema('hep')['properties']['acquisition_source']
        record = {
            'acquisition_source': {
                'datetime': '2017-11-30T16:38:43.352370',
                'email': '*****@*****.**',
                'internal_uid': 54252,
                'method': 'submitter',
                'orcid': '0000-0002-2174-4493',
                'source': 'submitter',
                'submission_number': '1',
            },
        }
        submission_extra = {'submission_pdf': pdf_url}
        assert validate(record['acquisition_source'], source_schema) is None

        workflow_obj = MockObj(record, submission_extra, files=MockFiles({}))
        engine = MockEng()

        assert populate_submission_document(workflow_obj, engine) is None

        expected_documents = [
            {
                'fulltext': True,
                'key': 'fulltext.pdf',
                'original_url': pdf_url,
                'source': 'submitter',
                'url': pdf_url,
            },
        ]
        assert expected_documents == workflow_obj.data['documents']
def test_arxiv_author_list_logs_on_error(mock_os, mock_untar):
    """Check that an invalid tarball is reported on the error log.

    ``mock_untar`` is made to raise ``InvalidTarball``; the author-list task
    must still return ``None`` and write the "Invalid tarball ..." message
    to ``obj.log._error``.

    NOTE(review): another function with this exact name is defined later in
    this module. Python rebinds the name to the later definition, so pytest
    will only collect that one and this test is silently skipped. One of the
    two should be renamed.

    :param mock_os: mock supplied by a patch decorator that is not visible
        here -- presumably the ``os`` module used by the task; confirm.
    :param mock_untar: mock supplied by a patch decorator that is not
        visible here -- presumably the tarball extraction helper; confirm.
    """
    mock_untar.side_effect = InvalidTarball

    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'hep-th',
                ],
                'value': '1605.07707',
            },
        ],
    }  # synthetic data
    extra_data = {}
    files = MockFiles({
        '1605.07707.tar.gz': AttrDict({
            'file': AttrDict({
                'uri': 'http://export.arxiv.org/e-print/1605.07707',
            })
        })
    })
    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    default_arxiv_author_list = arxiv_author_list()

    # Create the scratch directory *before* entering ``try``: if ``mkdtemp``
    # itself fails, ``finally`` must not run with ``temporary_dir`` unbound
    # (that would raise NameError and hide the real error).
    temporary_dir = mkdtemp()
    try:
        mock_os.path.abspath.return_value = temporary_dir

        assert default_arxiv_author_list(obj, eng) is None

        expected = 'Invalid tarball http://export.arxiv.org/e-print/1605.07707 for arxiv_id 1605.07707'
        result = obj.log._error.getvalue()

        assert expected == result
    finally:
        rmtree(temporary_dir)
def test_arxiv_author_list_does_not_produce_latex():
    """Check that extracted author names are not LaTeX-encoded.

    The author list is read from the ``1802.03388.tar.gz`` fixture and the
    resulting ``authors`` must equal an entry whose ``full_name`` is a
    unicode literal containing a non-ASCII character.
    """
    hep_schema = load_schema('hep')
    properties = hep_schema['properties']

    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1802.03388.tar.gz'))

    record = {
        'arxiv_eprints': [
            {
                'categories': ['hep-ex'],
                'value': '1802.03388',
            },
        ],
    }
    # ``validate`` raises on an invalid instance, so no assert is needed.
    validate(record['arxiv_eprints'], properties['arxiv_eprints'])

    attached_files = MockFiles({
        '1802.03388.tar.gz': AttrDict({
            'file': AttrDict({'uri': tarball_path}),
        }),
    })

    expected_authors = [
        {
            'affiliations': [{'value': 'Lund U.'}],
            'ids': [{
                'value': 'INSPIRE-00061248',
                'schema': 'INSPIRE ID',
            }],
            'full_name': u'Åkesson, Torsten Paul Ake',
        },
    ]
    validate(expected_authors, properties['authors'])

    workflow_obj = MockObj(record, {}, files=attached_files)
    engine = MockEng()

    author_list_task = arxiv_author_list()
    assert author_list_task(workflow_obj, engine) is None

    assert workflow_obj.data.get('authors') == expected_authors
def test_download_documents_with_multiple_documents():
    """Check that ``download_documents`` rewrites the URL of every document.

    Two remote PDF URLs are served from the same local fixture. After the
    task runs, both documents must still be present and each ``url`` must
    be the ``/api/files/...`` location asserted below.
    """
    first_url = 'http://export.arxiv.org/pdf/1605.03844'
    second_url = 'http://export.arxiv.org/pdf/1605.03845'
    pdf_fixture = os.path.join('fixtures', '1605.03844.pdf')

    with requests_mock.Mocker() as http_mock:
        # Both URLs serve the same fixture; only the keys differ.
        for remote_url in (first_url, second_url):
            http_mock.register_uri(
                'GET',
                remote_url,
                content=pkg_resources.resource_string(__name__, pdf_fixture),
            )

        documents_schema = load_schema('hep')['properties']['documents']
        record = {
            'documents': [
                {'key': '1605.03844.pdf', 'url': first_url},
                {'key': '1605.03845.pdf', 'url': second_url},
            ],
        }  # literature/1458302
        assert validate(record['documents'], documents_schema) is None

        workflow_obj = MockObj(record, {}, files=MockFiles({}))
        engine = MockEng()

        assert download_documents(workflow_obj, engine) is None

        downloaded = workflow_obj.data['documents']
        expected_local_urls = (
            '/api/files/0b9dd5d1-feae-4ba5-809d-3a029b0bc110/1605.03844.pdf',
            '/api/files/0b9dd5d1-feae-4ba5-809d-3a029b0bc110/1605.03845.pdf',
        )

        assert 2 == len(downloaded)
        assert expected_local_urls[0] == downloaded[0]['url']
        assert expected_local_urls[1] == downloaded[1]['url']
def test_populate_arxiv_document_logs_on_pdf_not_existing():
    """Check the info log when arXiv has no PDF for the eprint.

    ``export.arxiv.org`` answers 200 with the ``1707.02785.html`` fixture as
    its body and ``arxiv.org`` answers 500. The task must return ``None``
    and log "No PDF is available for 1707.02785" at info level.
    """
    html_body = pkg_resources.resource_string(
        __name__, os.path.join('fixtures', '1707.02785.html'))
    ok_with_html = {'content': html_body, 'status_code': 200}
    server_error = {'content': '', 'status_code': 500}

    with requests_mock.Mocker() as http_mock:
        # Each URL gets a one-element response list.
        http_mock.get(
            'http://export.arxiv.org/pdf/1707.02785',
            (ok_with_html, ),
        )
        http_mock.get(
            'http://arxiv.org/pdf/1707.02785',
            (server_error, ),
        )

        eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
        record = {
            'arxiv_eprints': [
                {
                    'categories': ['cs.CV'],
                    'value': '1707.02785',
                },
            ],
        }  # literature/1458302
        assert validate(record['arxiv_eprints'], eprints_schema) is None

        workflow_obj = MockObj(record, {}, files=MockFiles({}))
        engine = MockEng()

        assert populate_arxiv_document(workflow_obj, engine) is None

        expected_message = 'No PDF is available for 1707.02785'
        assert expected_message == workflow_obj.log._info.getvalue()
def test_prepare_files_does_nothing_when_obj_has_no_files():
    """Check that ``prepare_files`` is a no-op for an object without files.

    Both the object's data and its info log must stay empty.
    """
    workflow_obj = MockObj({}, {}, files=MockFiles({}))
    engine = MockEng()

    assert prepare_files(workflow_obj, engine) is None

    # Nothing was added to the record ...
    expected_data = {}
    assert expected_data == workflow_obj.data

    # ... and nothing was logged.
    expected_log = ''
    assert expected_log == workflow_obj.log._info.getvalue()
def test_get_document_in_workflow_prefers_fulltext():
    """Check that the document flagged ``fulltext`` wins over an earlier one.

    Two documents are attached; only the second has ``'fulltext': True``.
    The context manager must yield the URI of that second file.
    """
    record = {
        'documents': [
            {'key': 'table_of_contents.pdf'},
            {'key': 'fulltext.xml', 'fulltext': True},
        ],
    }

    attached_files = MockFiles({})
    # The assigned value is irrelevant here; the test only reads back
    # ``attached_files[key].file.uri`` afterwards.
    for file_key in ('fulltext.xml', 'table_of_contents.pdf'):
        attached_files[file_key] = None

    workflow_obj = MockObj(record, {}, files=attached_files)

    with get_document_in_workflow(workflow_obj) as local_file:
        assert local_file == attached_files['fulltext.xml'].file.uri
def test_arxiv_plot_extract_handles_duplicate_plot_names(mock_os):
    """Check plot extraction on the ``1711.10662.tar.gz`` fixture.

    After extraction the record must hold 66 figures and the object must
    hold 67 files (the test name indicates the tarball contains plots with
    clashing names).

    :param mock_os: mock supplied by a patch decorator that is not visible
        here; its ``path.abspath`` is pointed at a scratch directory.
    """
    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    filename = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1711.10662.tar.gz'))

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'cs.CV',
                ],
                'value': '1711.10662',
            },
        ],
    }  # holdingpen/807096
    extra_data = {}
    files = MockFiles({
        '1711.10662.tar.gz': AttrDict({
            'file': AttrDict({
                'uri': filename,
            }),
        }),
    })
    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    # Create the scratch directory *before* entering ``try``: if ``mkdtemp``
    # itself fails, ``finally`` must not run with ``temporary_dir`` unbound
    # (that would raise NameError and hide the real error).
    temporary_dir = mkdtemp()
    try:
        mock_os.path.abspath.return_value = temporary_dir

        assert arxiv_plot_extract(obj, eng) is None

        assert len(obj.data['figures']) == 66
        assert len(obj.files.keys) == 67
    finally:
        rmtree(temporary_dir)
def test_prepare_files_skips_empty_files():
    """Check that ``prepare_files`` ignores a file entry with no content.

    ``foo.pdf`` is attached as an empty ``AttrDict``; the task must leave
    the record data and the info log empty.
    """
    attached_files = MockFiles({
        'foo.pdf': AttrDict({}),
    })
    workflow_obj = MockObj({}, {}, files=attached_files)
    engine = MockEng()

    assert prepare_files(workflow_obj, engine) is None

    # Nothing was added to the record ...
    expected_data = {}
    assert expected_data == workflow_obj.data

    # ... and nothing was logged.
    expected_log = ''
    assert expected_log == workflow_obj.log._info.getvalue()
def test_get_document_in_workflow_takes_first_among_equals():
    """Check the tie-break when no document is flagged as fulltext.

    With two plain documents the first listed one must be yielded, and an
    error-log entry must mention that more than one document was found.
    """
    record = {
        'documents': [
            {'key': 'table_of_contents.pdf'},
            {'key': 'document.pdf'},
        ],
    }

    attached_files = MockFiles({})
    # The assigned value is irrelevant here; the test only reads back
    # ``attached_files[key].file.uri`` afterwards.
    for file_key in ('document.pdf', 'table_of_contents.pdf'):
        attached_files[file_key] = None

    workflow_obj = MockObj(record, {}, files=attached_files)

    with get_document_in_workflow(workflow_obj) as local_file:
        assert local_file == attached_files['table_of_contents.pdf'].file.uri
        logged_errors = workflow_obj.log._error.getvalue()
        assert 'More than one document in workflow, first one used' in logged_errors
def test_arxiv_plot_extract_retries_on_io_error(mock_os, tmpdir):
    """Check that plot extraction retries opening files on ``IOError``.

    ``open`` inside the arxiv tasks module is patched with
    ``side_effect_open``; the task is expected to finally raise ``IOError``
    after ``open`` has been called five times.

    :param mock_os: mock supplied by a patch decorator that is not visible
        here; its ``path.abspath`` is pointed at a scratch directory.
    :param tmpdir: pytest's built-in temporary directory fixture.
    """
    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    filename = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1711.10662.tar.gz'))

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'cs.CV',
                ],
                'value': '1711.10662',
            },
        ],
    }  # holdingpen/807096
    extra_data = {}
    files = MockFiles({
        '1711.10662.tar.gz': AttrDict({
            'file': AttrDict({
                'uri': filename,
            }),
        }),
    })
    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    temporary_dir = tmpdir.mkdir('plots')
    mock_os.path.abspath.return_value = str(temporary_dir)

    with patch('inspirehep.modules.workflows.tasks.arxiv.open') as mock_open:
        mock_open.side_effect = side_effect_open
        with pytest.raises(IOError):
            arxiv_plot_extract(obj, eng)

    # This check has to live outside ``pytest.raises``: any statement placed
    # after the raising call inside that block is never executed, which
    # would turn the retry-count assertion into dead code.
    assert mock_open.call_count == 5
def test_prepare_files():
    """Check that ``prepare_files`` turns an attached PDF into an FFT entry.

    A single ``foo.pdf`` with URI ``/data/foo.pdf`` is attached. The task
    must write a schema-valid ``_fft`` list describing it and log that
    non-user PDF files were added.
    """
    fft_schema = load_schema('hep')['properties']['_fft']

    attached_files = MockFiles({
        'foo.pdf': AttrDict({
            'obj': AttrDict({
                'file': AttrDict({
                    'uri': '/data/foo.pdf',
                }),
            }),
        }),
    })
    workflow_obj = MockObj({}, {}, files=attached_files)
    engine = MockEng()

    assert prepare_files(workflow_obj, engine) is None

    expected_fft = [
        {
            'path': '/data/foo.pdf',
            'type': 'INSPIRE-PUBLIC',
            'filename': 'foo',
            'format': '.pdf',
        },
    ]
    produced = workflow_obj.data

    assert validate(produced['_fft'], fft_schema) is None
    assert expected_fft == produced['_fft']

    expected_message = 'Non-user PDF files added to FFT.'
    assert expected_message == workflow_obj.log._info.getvalue()
def test_arxiv_author_list_only_overrides_authors():
    """Check that the author-list task leaves unrelated keys untouched.

    After running against the ``1703.09986.tar.gz`` fixture, the record's
    ``arxiv_eprints`` and ``$schema`` must still be present and unchanged.
    """
    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1703.09986.tar.gz'))

    record = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': [
            {
                'categories': ['hep-ex'],
                'value': '1703.09986',
            },
        ],
    }  # record/1519995
    # ``validate`` raises on an invalid instance, so no assert is needed.
    validate(record['arxiv_eprints'], eprints_schema)

    attached_files = MockFiles({
        '1703.09986.tar.gz': AttrDict({
            'file': AttrDict({'uri': tarball_path}),
        }),
    })
    workflow_obj = MockObj(record, {}, files=attached_files)
    engine = MockEng()

    author_list_task = arxiv_author_list()
    author_list_task(workflow_obj, engine)

    for preserved_key in ('arxiv_eprints', '$schema'):
        assert preserved_key in workflow_obj.data
        assert workflow_obj.data[preserved_key] == record[preserved_key]
def test_arxiv_author_list_handles_auto_ignore_comment(mock_os):
    """Check that the author-list task completes on the ``1703.09986`` tarball.

    The only expectation is that the task returns ``None`` without raising
    (the test name refers to an auto-ignore comment in the fixture).

    :param mock_os: mock supplied by a patch decorator that is not visible
        here; its ``path.abspath`` is pointed at a scratch directory.
    """
    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    filename = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1703.09986.tar.gz'))

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'hep-ex',
                ],
                'value': '1703.09986',
            },
        ],
    }  # record/1519995
    extra_data = {}
    files = MockFiles({
        '1703.09986.tar.gz': AttrDict({'file': AttrDict({
            'uri': filename,
        })})
    })
    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    default_arxiv_author_list = arxiv_author_list()

    # Create the scratch directory *before* entering ``try``: if ``mkdtemp``
    # itself fails, ``finally`` must not run with ``temporary_dir`` unbound
    # (that would raise NameError and hide the real error).
    temporary_dir = mkdtemp()
    try:
        mock_os.path.abspath.return_value = temporary_dir

        assert default_arxiv_author_list(obj, eng) is None
    finally:
        rmtree(temporary_dir)
def test_arxiv_package_download_logs_on_success():
    """Check the info log after a successful tarball download.

    The arXiv e-print URL is served from a local fixture; the task must
    return ``None`` and log "Tarball retrieved from arXiv for 1605.03959".
    """
    tarball_fixture = os.path.join('fixtures', '1605.03959.tar.gz')

    with requests_mock.Mocker() as http_mock:
        http_mock.register_uri(
            'GET',
            'http://export.arxiv.org/e-print/1605.03959',
            content=pkg_resources.resource_string(__name__, tarball_fixture),
        )

        eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
        record = {
            'arxiv_eprints': [
                {
                    'categories': [
                        'hep-th',
                        'cond-mat.stat-mech',
                        'cond-mat.str-el',
                    ],
                    'value': '1605.03959',
                },
            ],
        }  # literature/1458968
        assert validate(record['arxiv_eprints'], eprints_schema) is None

        workflow_obj = MockObj(record, {}, files=MockFiles({}))
        engine = MockEng()

        assert arxiv_package_download(workflow_obj, engine) is None

        expected_message = 'Tarball retrieved from arXiv for 1605.03959'
        assert expected_message == workflow_obj.log._info.getvalue()
def test_arxiv_plot_extract_logs_when_images_are_invalid(mock_process_tarball):
    """Check the error log when plot processing raises ``DelegateError``.

    The task must swallow the error, return ``None`` and log
    "Error extracting plots for 1612.00624. Report and skip." as an error.

    :param mock_process_tarball: mock supplied by a patch decorator that is
        not visible here; configured to raise ``DelegateError``.
    """
    mock_process_tarball.side_effect = DelegateError

    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
    record = {
        'arxiv_eprints': [
            {
                'categories': ['physics.ins-det'],
                'value': '1612.00624',
            },
        ],
    }  # synthetic data
    attached_files = MockFiles({
        '1612.00624.tar.gz': AttrDict({
            'file': AttrDict({
                'uri': 'http://export.arxiv.org/e-print/1612.00624',
            }),
        }),
    })
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    workflow_obj = MockObj(record, {}, files=attached_files)
    engine = MockEng()

    assert arxiv_plot_extract(workflow_obj, engine) is None

    expected_message = 'Error extracting plots for 1612.00624. Report and skip.'
    assert expected_message == workflow_obj.log._error.getvalue()
def test_arxiv_plot_extract_logs_when_tarball_is_invalid(mock_process_tarball):
    """Check the info log when plot processing raises ``InvalidTarball``.

    The task must swallow the error, return ``None`` and write the
    "Invalid tarball ..." message to the info log.

    :param mock_process_tarball: mock supplied by a patch decorator that is
        not visible here; configured to raise ``InvalidTarball``.
    """
    mock_process_tarball.side_effect = InvalidTarball

    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
    record = {
        'arxiv_eprints': [
            {
                'categories': ['physics.ins-det'],
                'value': '1612.00626',
            },
        ],
    }  # synthetic data
    attached_files = MockFiles({
        '1612.00626.tar.gz': AttrDict({
            'file': AttrDict({
                'uri': 'http://export.arxiv.org/e-print/1612.00626',
            }),
        }),
    })
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    workflow_obj = MockObj(record, {}, files=attached_files)
    engine = MockEng()

    assert arxiv_plot_extract(workflow_obj, engine) is None

    expected_message = 'Invalid tarball http://export.arxiv.org/e-print/1612.00626 for arxiv_id 1612.00626'
    assert expected_message == workflow_obj.log._info.getvalue()
def test_download_documents_with_local_file(mock_fsopen):
    """Check that ``file://`` document URLs are opened through ``fsopen``.

    One document is remote (served from a fixture) and one is a local
    ``file://`` URL whose ``;`` is percent-encoded. The local one must be
    opened exactly once, in binary mode, with the decoded URL.

    :param mock_fsopen: mock supplied by a patch decorator that is not
        visible here; it stands in for the file-opening helper.
    """
    remote_url = 'http://export.arxiv.org/pdf/1605.03844'
    pdf_fixture = os.path.join('fixtures', '1605.03844.pdf')

    with requests_mock.Mocker() as http_mock:
        http_mock.register_uri(
            'GET',
            remote_url,
            content=pkg_resources.resource_string(__name__, pdf_fixture),
        )
        mock_fsopen.return_value = 'jessica jones'

        documents_schema = load_schema('hep')['properties']['documents']
        record = {
            'documents': [
                {
                    'key': '1605.03844.pdf',
                    'url': remote_url,
                },
                {
                    'key': 'jessicajones.pdf;1',
                    'url': 'file://jessicajones.pdf%3B1',
                },
            ],
        }
        assert validate(record['documents'], documents_schema) is None

        workflow_obj = MockObj(record, {}, files=MockFiles({}))
        engine = MockEng()

        assert download_documents(workflow_obj, engine) is None

        downloaded = workflow_obj.data['documents']
        assert 2 == len(downloaded)

        # ``%3B`` in the stored URL must reach the opener decoded as ``;``.
        mock_fsopen.assert_called_once_with(
            'file://jessicajones.pdf;1', mode='rb')
def test_arxiv_fulltext_download_retries_on_error():
    """Check that the fulltext download retries after a server error.

    The mocked PDF URL answers 500 first and then serves the PDF fixture.
    The task must return ``None`` and log
    "PDF retrieved from arXiv for 1605.03814" at info level.
    """
    pdf_body = pkg_resources.resource_string(
        __name__, os.path.join('fixtures', '1605.03814.pdf'))

    # Responses are listed in the order they should be served: the failure
    # comes first, then the successful download.
    queued_responses = [
        httpretty.Response(body='', status=500),
        httpretty.Response(body=pdf_body, status='200'),
    ]
    httpretty.register_uri(
        httpretty.GET,
        'http://export.arxiv.org/pdf/1605.03814',
        responses=queued_responses)

    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
    record = {
        'arxiv_eprints': [
            {
                'categories': ['hep-ex'],
                'value': '1605.03814',
            },
        ],
    }  # literature/1458270
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    workflow_obj = MockObj(record, {}, files=MockFiles({}))
    engine = MockEng()

    assert arxiv_fulltext_download(workflow_obj, engine) is None

    expected_message = 'PDF retrieved from arXiv for 1605.03814'
    assert expected_message == workflow_obj.log._info.getvalue()
def test_arxiv_author_list_logs_on_error(mock_untar):
    """Check that an invalid tarball is mentioned in the info log.

    ``mock_untar`` raises ``InvalidTarball``; the author-list task must
    return ``None`` and the info log must contain the arXiv identifier.

    NOTE(review): a function with this exact name is also defined earlier
    in this module. This later definition rebinds the name, so the earlier
    test is never collected by pytest. One of the two should be renamed.

    :param mock_untar: mock supplied by a patch decorator that is not
        visible here -- presumably the tarball extraction helper; confirm.
    """
    mock_untar.side_effect = InvalidTarball

    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1605.07707'))

    record = {
        'arxiv_eprints': [
            {
                'categories': ['hep-th'],
                'value': '1605.07707',
            },
        ],
    }  # synthetic data
    attached_files = MockFiles({
        '1605.07707.tar.gz': AttrDict({
            'file': AttrDict({'uri': tarball_path}),
        }),
    })
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    workflow_obj = MockObj(record, {}, files=attached_files)
    engine = MockEng()

    author_list_task = arxiv_author_list()
    assert author_list_task(workflow_obj, engine) is None

    assert '1605.07707' in workflow_obj.log._info.getvalue()
def test_arxiv_plot_extract_no_file(mock_process_tarball):
    """Check that plot extraction bails out when no tarball is attached.

    With an empty file store the task must return ``None``, log a
    "No file named=" message at info level and never call the tarball
    processor.

    :param mock_process_tarball: mock supplied by a patch decorator that is
        not visible here; used only to verify that it is not called.
    """
    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
    record = {
        'arxiv_eprints': [
            {
                'categories': ['physics.ins-det'],
                'value': '1612.00626',
            },
        ],
    }  # synthetic data
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    workflow_obj = MockObj(record, {}, files=MockFiles({}))
    engine = MockEng()

    assert arxiv_plot_extract(workflow_obj, engine) is None

    assert 'No file named=' in workflow_obj.log._info.getvalue()
    mock_process_tarball.assert_not_called()
def test_get_document_in_workflow_returns_None_when_no_documents():
    """Check that the context manager yields ``None`` for an empty record."""
    workflow_obj = MockObj({}, {}, files=MockFiles({}))

    with get_document_in_workflow(workflow_obj) as local_file:
        assert local_file is None
def test_arxiv_author_list_handles_multiple_author_xml_files():
    """Check author extraction from a tarball with several author lists.

    The ``1703.09986.multiple_author_lists.tar.gz`` fixture is attached
    under the regular ``1703.09986.tar.gz`` key. After the task runs, the
    record's ``authors`` must equal the two schema-valid entries below.
    """
    properties = load_schema('hep')['properties']

    tarball_path = pkg_resources.resource_filename(
        __name__,
        os.path.join('fixtures', '1703.09986.multiple_author_lists.tar.gz'))

    record = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': [
            {
                'categories': ['hep-ex'],
                'value': '1703.09986',
            },
        ],
    }  # record/1519995
    # ``validate`` raises on an invalid instance, so no assert is needed.
    validate(record['arxiv_eprints'], properties['arxiv_eprints'])

    attached_files = MockFiles({
        '1703.09986.tar.gz': AttrDict({
            'file': AttrDict({'uri': tarball_path}),
        }),
    })
    workflow_obj = MockObj(record, {}, files=attached_files)
    engine = MockEng()

    author_list_task = arxiv_author_list()
    author_list_task(workflow_obj, engine)

    expected_authors = [
        {
            'affiliations': [{'value': 'Yerevan Phys. Inst.'}],
            'ids': [
                {'value': 'INSPIRE-00312131', 'schema': 'INSPIRE ID'},
                {'value': 'CERN-432142', 'schema': 'CERN'},
            ],
            'full_name': 'Sirunyan, Albert M',
        },
        {
            'affiliations': [{'value': 'Yerevan Phys. Inst.'}],
            'ids': [
                {'value': 'INSPIRE-00312132', 'schema': 'INSPIRE ID'},
                {'value': 'CERN-432143', 'schema': 'CERN'},
            ],
            'full_name': 'Weary, Jake',
        },
    ]
    validate(expected_authors, properties['authors'])

    assert workflow_obj.data.get('authors') == expected_authors