def test_arxiv_plot_extract_logs_when_images_are_invalid(mock_process_tarball):
    """Check that a ``DelegateError`` during plot extraction gets logged.

    ``mock_process_tarball`` is configured to raise ``DelegateError``. The
    task is still expected to return ``None`` and the arXiv identifier must
    appear in the error log of the workflow object.
    """
    mock_process_tarball.side_effect = DelegateError

    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1612.00624'))

    # synthetic data
    record = {
        'arxiv_eprints': [
            {
                'categories': ['physics.ins-det'],
                'value': '1612.00624',
            },
        ],
    }
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    tarball = AttrDict({'file': AttrDict({'uri': tarball_path})})
    attached = MockFiles({'1612.00624.tar.gz': tarball})

    obj = MockObj(record, {}, files=attached)
    eng = MockEng()

    assert arxiv_plot_extract(obj, eng) is None

    error_log = obj.log._error.getvalue()
    assert '1612.00624' in error_log
def test_arxiv_author_list_handles_auto_ignore_comment():
    """Run the author-list task on the ``1703.09986.tar.gz`` fixture.

    The only expectation is that the task built by ``arxiv_author_list()``
    completes and returns ``None`` for this tarball.
    """
    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1703.09986.tar.gz'))

    # record/1519995
    record = {
        'arxiv_eprints': [
            {
                'categories': ['hep-ex'],
                'value': '1703.09986',
            },
        ],
    }
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    tarball = AttrDict({'file': AttrDict({'uri': tarball_path})})
    attached = MockFiles({'1703.09986.tar.gz': tarball})

    obj = MockObj(record, {}, files=attached)
    eng = MockEng()

    task = arxiv_author_list()
    assert task(obj, eng) is None
def test_arxiv_author_list_with_missing_tarball():
    """Check the info message logged when no matching tarball is attached.

    The only attached file is keyed ``jessica.jones.tar.gz``, so there is no
    ``1703.09986.tar.gz`` entry for the eprint; the task must return ``None``
    and log the "Skipping author list extraction" message.
    """
    schema = load_schema('hep')

    # record/1519995
    record = {
        'arxiv_eprints': [
            {
                'categories': ['hep-ex'],
                'value': '1703.09986',
            },
        ],
    }
    # ``validate`` raises on an invalid payload, so no assert is needed here.
    validate(record['arxiv_eprints'], schema['properties']['arxiv_eprints'])

    unrelated = AttrDict({'file': AttrDict({'uri': 'alias.investigations'})})
    attached = MockFiles({'jessica.jones.tar.gz': unrelated})

    obj = MockObj(record, {}, files=attached)
    eng = MockEng()

    task = arxiv_author_list()
    assert task(obj, eng) is None

    skip_message = (
        'Skipping author list extraction, '
        'no tarball with name "1703.09986.tar.gz" found'
    )
    assert skip_message in obj.log._info.getvalue()
def test_arxiv_plot_extract_is_safe_to_rerun(mock_os):
    """Run plot extraction twice and check the outcome after every run.

    ``mock_os.path.abspath`` is pointed at a scratch directory. After each of
    the two runs the record must hold exactly one figure and the attached
    files must be the tarball plus ``figure1.png``.
    """
    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    filename = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '0804.1873.tar.gz'))

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'nucl-ex',
                ],
                'value': '0804.1873',
            },
        ],
    }  # literature/783246
    extra_data = {}
    files = MockFiles({
        '0804.1873.tar.gz': AttrDict({
            'file': AttrDict({
                'uri': filename,
            }),
        }),
    })
    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    expected_figures = [{
        'url': '/api/files/0b9dd5d1-feae-4ba5-809d-3a029b0bc110/figure1.png',
        'source': 'arxiv',
        'material': 'preprint',
        'key': 'figure1.png',
        'caption': 'Difference (in MeV) between the theoretical and experimental masses for the 2027 selected nuclei as a function of the mass number.'
    }]
    expected_files = ['0804.1873.tar.gz', 'figure1.png']

    # Create the directory before entering ``try``: if ``mkdtemp`` itself
    # fails there is nothing to clean up, and ``finally`` must not hide the
    # real error behind an unbound-variable exception.
    temporary_dir = mkdtemp()
    try:
        mock_os.path.abspath.return_value = temporary_dir

        for _ in range(2):
            assert arxiv_plot_extract(obj, eng) is None

            # Verify after every run so a faulty rerun cannot go unnoticed.
            result = obj.data['figures']
            assert expected_figures == result
            assert expected_files == obj.files.keys
    finally:
        rmtree(temporary_dir)
def test_prepare_files_ignores_keys_not_ending_with_pdf():
    """Check that a file keyed ``foo.bar`` leaves the record and log empty.

    The stored URI ends in ``.pdf`` but the key does not; ``prepare_files``
    must return ``None`` without adding data or writing to the info log.
    """
    stored = AttrDict({
        'obj': AttrDict({
            'file': AttrDict({'uri': '/data/foo.pdf'}),
        }),
    })
    attached = MockFiles({'foo.bar': stored})

    obj = MockObj({}, {}, files=attached)
    eng = MockEng()

    assert prepare_files(obj, eng) is None

    # Nothing may have been added to the record ...
    assert {} == obj.data
    # ... and nothing may have been logged at info level.
    assert '' == obj.log._info.getvalue()
def test_prepare_files_annotates_files_from_arxiv():
    """Check the ``_fft`` entry produced for a PDF on a record with an eprint.

    The record carries an ``arxiv_eprints`` entry and one attached
    ``foo.pdf``. After ``prepare_files`` the ``_fft`` entry must be typed
    ``arXiv`` with filename ``arxiv:foo``, the eprints must be unchanged and
    the info log must contain exactly the FFT message.
    """
    properties = load_schema('hep')['properties']
    fft_schema = properties['_fft']
    eprints_schema = properties['arxiv_eprints']

    record = {
        'arxiv_eprints': [
            {
                'categories': ['hep-th'],
                'value': 'hep-th/9711200',
            },
        ],
    }
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    stored = AttrDict({
        'obj': AttrDict({
            'file': AttrDict({'uri': '/data/foo.pdf'}),
        }),
    })
    obj = MockObj(record, {}, files=MockFiles({'foo.pdf': stored}))
    eng = MockEng()

    assert prepare_files(obj, eng) is None

    updated = obj.data

    wanted_fft = [
        {
            'path': '/data/foo.pdf',
            'type': 'arXiv',
            'filename': 'arxiv:foo',
            'format': '.pdf',
        },
    ]
    assert validate(updated['_fft'], fft_schema) is None
    assert wanted_fft == updated['_fft']

    wanted_eprints = [
        {
            'categories': ['hep-th'],
            'value': 'hep-th/9711200',
        },
    ]
    assert validate(updated['arxiv_eprints'], eprints_schema) is None
    assert wanted_eprints == updated['arxiv_eprints']

    assert 'Non-user PDF files added to FFT.' == obj.log._info.getvalue()
def test_arxiv_author_list_logs_on_error(mock_untar):
    """Check that an ``InvalidTarball`` from untarring is reported in the log.

    ``mock_untar`` is configured to raise ``InvalidTarball``. The task must
    return ``None`` and the arXiv identifier must appear in the info log.
    """
    mock_untar.side_effect = InvalidTarball

    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1605.07707'))

    # synthetic data
    record = {
        'arxiv_eprints': [
            {
                'categories': ['hep-th'],
                'value': '1605.07707',
            },
        ],
    }
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    tarball = AttrDict({'file': AttrDict({'uri': tarball_path})})
    attached = MockFiles({'1605.07707.tar.gz': tarball})

    obj = MockObj(record, {}, files=attached)
    eng = MockEng()

    task = arxiv_author_list()
    assert task(obj, eng) is None

    info_log = obj.log._info.getvalue()
    assert '1605.07707' in info_log
def test_arxiv_author_list_handles_multiple_author_xml_files():
    """Check the authors extracted from the ``multiple_author_lists`` fixture.

    After running the author-list task on that tarball the record must
    contain exactly the two authors listed below.
    """
    properties = load_schema('hep')['properties']
    tarball_path = pkg_resources.resource_filename(
        __name__,
        os.path.join('fixtures', '1703.09986.multiple_author_lists.tar.gz'))

    # record/1519995
    record = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': [
            {
                'categories': ['hep-ex'],
                'value': '1703.09986',
            },
        ],
    }
    # ``validate`` raises on an invalid payload, so no assert is needed here.
    validate(record['arxiv_eprints'], properties['arxiv_eprints'])

    tarball = AttrDict({'file': AttrDict({'uri': tarball_path})})
    attached = MockFiles({'1703.09986.tar.gz': tarball})

    obj = MockObj(record, {}, files=attached)
    eng = MockEng()

    task = arxiv_author_list()
    task(obj, eng)

    wanted_authors = [
        {
            'affiliations': [{'value': 'Yerevan Phys. Inst.'}],
            'ids': [
                {'value': 'INSPIRE-00312131', 'schema': 'INSPIRE ID'},
                {'value': 'CERN-432142', 'schema': 'CERN'},
            ],
            'full_name': 'Sirunyan, Albert M',
        },
        {
            'affiliations': [{'value': 'Yerevan Phys. Inst.'}],
            'ids': [
                {'value': 'INSPIRE-00312132', 'schema': 'INSPIRE ID'},
                {'value': 'CERN-432143', 'schema': 'CERN'},
            ],
            'full_name': 'Weary, Jake',
        },
    ]
    # Make sure the expectation itself is schema-valid before comparing.
    validate(wanted_authors, properties['authors'])

    assert obj.data.get('authors') == wanted_authors
def test_arxiv_plot_extract_populates_files_with_plots(mock_os, tmpdir):
    """Check the figure metadata and log line produced by plot extraction.

    ``mock_os.path.abspath`` is pointed at a ``plots`` directory created
    under ``tmpdir``. One figure must end up in the record and the info log
    must read ``Added 1 plots.``.
    """
    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '0804.1873.tar.gz'))

    # literature/783246
    record = {
        'arxiv_eprints': [
            {
                'categories': ['nucl-ex'],
                'value': '0804.1873',
            },
        ],
    }
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    tarball = AttrDict({'file': AttrDict({'uri': tarball_path})})
    attached = MockFiles({'0804.1873.tar.gz': tarball})

    obj = MockObj(record, {}, files=attached)
    eng = MockEng()

    plots_dir = tmpdir.mkdir('plots')
    mock_os.path.abspath.return_value = str(plots_dir)

    assert arxiv_plot_extract(obj, eng) is None

    wanted_figures = [{
        'url': '/api/files/0b9dd5d1-feae-4ba5-809d-3a029b0bc110/figure1.png',
        'source': 'arxiv',
        'material': 'preprint',
        'key': 'figure1.png',
        'caption': (
            'Difference (in MeV) between the theoretical and experimental '
            'masses for the 2027 selected nuclei as a function of the mass '
            'number.'
        ),
    }]
    assert wanted_figures == obj.data['figures']

    assert 'Added 1 plots.' == obj.log._info.getvalue()
def test_arxiv_plot_extract_populates_files_with_plots(mock_os):
    """Check the description stored on the extracted plot and the log line.

    ``mock_os.path.abspath`` is pointed at a scratch directory. After the
    task has run, ``obj.files['figure1']['description']`` must hold the
    numbered caption and the info log must read ``Added 1 plots.``.
    """
    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    filename = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '0804.1873.tar.gz'))

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'nucl-ex',
                ],
                'value': '0804.1873',
            },
        ],
    }  # literature/783246
    extra_data = {}
    files = MockFiles({
        '0804.1873.tar.gz': AttrDict({
            'file': AttrDict({
                'uri': filename,
            }),
        }),
    })
    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    # Create the directory before entering ``try``: if ``mkdtemp`` itself
    # fails there is nothing to clean up, and ``finally`` must not hide the
    # real error behind an unbound-variable exception.
    temporary_dir = mkdtemp()
    try:
        mock_os.path.abspath.return_value = temporary_dir

        assert arxiv_plot_extract(obj, eng) is None

        # ``expected`` is the literal we want, ``result`` what the task
        # produced (the original had the two names the wrong way round).
        expected = ('00000 Difference (in MeV) between the theoretical and '
                    'experimental masses for the 2027 selected nuclei as a '
                    'function of the mass number.')
        result = obj.files['figure1']['description']

        assert expected == result

        expected = 'Added 1 plots.'
        result = obj.log._info.getvalue()

        assert expected == result
    finally:
        rmtree(temporary_dir)
def test_arxiv_author_list_does_not_produce_latex():
    """Check the author extracted from the ``1802.03388.tar.gz`` fixture.

    The author's full name is expected as a unicode string containing the
    literal ``Å`` character.
    """
    properties = load_schema('hep')['properties']
    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1802.03388.tar.gz'))

    record = {
        'arxiv_eprints': [
            {
                'categories': ['hep-ex'],
                'value': '1802.03388',
            },
        ],
    }
    # ``validate`` raises on an invalid payload, so no assert is needed here.
    validate(record['arxiv_eprints'], properties['arxiv_eprints'])

    wanted_authors = [
        {
            'affiliations': [{'value': 'Lund U.'}],
            'ids': [
                {'value': 'INSPIRE-00061248', 'schema': 'INSPIRE ID'},
            ],
            'full_name': u'Åkesson, Torsten Paul Ake',
        },
    ]
    # Make sure the expectation itself is schema-valid before comparing.
    validate(wanted_authors, properties['authors'])

    tarball = AttrDict({'file': AttrDict({'uri': tarball_path})})
    attached = MockFiles({'1802.03388.tar.gz': tarball})

    obj = MockObj(record, {}, files=attached)
    eng = MockEng()

    task = arxiv_author_list()
    assert task(obj, eng) is None
    assert obj.data.get('authors') == wanted_authors
def test_arxiv_author_list_logs_on_error(mock_os, mock_untar):
    """Check the exact error message logged when untarring fails.

    ``mock_untar`` is configured to raise ``InvalidTarball`` and
    ``mock_os.path.abspath`` is pointed at a scratch directory. The task must
    return ``None`` and the error log must consist of the "Invalid tarball"
    message naming both the URI and the arXiv identifier.
    """
    mock_untar.side_effect = InvalidTarball

    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'hep-th',
                ],
                'value': '1605.07707',
            },
        ],
    }  # synthetic data
    extra_data = {}
    files = MockFiles({
        '1605.07707.tar.gz': AttrDict({
            'file': AttrDict({
                'uri': 'http://export.arxiv.org/e-print/1605.07707',
            })
        })
    })
    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    default_arxiv_author_list = arxiv_author_list()

    # Create the directory before entering ``try``: if ``mkdtemp`` itself
    # fails there is nothing to clean up, and ``finally`` must not hide the
    # real error behind an unbound-variable exception.
    temporary_dir = mkdtemp()
    try:
        mock_os.path.abspath.return_value = temporary_dir

        assert default_arxiv_author_list(obj, eng) is None

        expected = 'Invalid tarball http://export.arxiv.org/e-print/1605.07707 for arxiv_id 1605.07707'
        result = obj.log._error.getvalue()

        assert expected == result
    finally:
        rmtree(temporary_dir)
def test_arxiv_plot_extract_handles_duplicate_plot_names(mock_os):
    """Check the figure and file counts for the ``1711.10662`` fixture.

    ``mock_os.path.abspath`` is pointed at a scratch directory. After the
    task has run, the record must list 66 figures and the workflow object
    must hold 67 file keys.
    """
    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    filename = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1711.10662.tar.gz'))

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'cs.CV',
                ],
                'value': '1711.10662',
            },
        ],
    }  # holdingpen/807096
    extra_data = {}
    files = MockFiles({
        '1711.10662.tar.gz': AttrDict({
            'file': AttrDict({
                'uri': filename,
            }),
        }),
    })
    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    # Create the directory before entering ``try``: if ``mkdtemp`` itself
    # fails there is nothing to clean up, and ``finally`` must not hide the
    # real error behind an unbound-variable exception.
    temporary_dir = mkdtemp()
    try:
        mock_os.path.abspath.return_value = temporary_dir

        assert arxiv_plot_extract(obj, eng) is None

        assert len(obj.data['figures']) == 66
        assert len(obj.files.keys) == 67
    finally:
        rmtree(temporary_dir)
def test_prepare_files():
    """Check the ``_fft`` entry produced for a PDF on a record with no eprint.

    With an empty record and one attached ``foo.pdf``, ``prepare_files`` must
    add an ``INSPIRE-PUBLIC`` entry named ``foo`` and log the FFT message.
    """
    fft_schema = load_schema('hep')['properties']['_fft']

    stored = AttrDict({
        'obj': AttrDict({
            'file': AttrDict({'uri': '/data/foo.pdf'}),
        }),
    })
    obj = MockObj({}, {}, files=MockFiles({'foo.pdf': stored}))
    eng = MockEng()

    assert prepare_files(obj, eng) is None

    wanted_fft = [
        {
            'path': '/data/foo.pdf',
            'type': 'INSPIRE-PUBLIC',
            'filename': 'foo',
            'format': '.pdf',
        },
    ]
    produced_fft = obj.data['_fft']
    assert validate(produced_fft, fft_schema) is None
    assert wanted_fft == produced_fft

    assert 'Non-user PDF files added to FFT.' == obj.log._info.getvalue()
def test_arxiv_plot_extract_retries_on_io_error(mock_os, tmpdir):
    """Check that plot extraction raises ``IOError`` after five ``open`` calls.

    ``open`` inside the arxiv tasks module is patched with
    ``side_effect_open`` and ``mock_os.path.abspath`` is pointed at a
    ``plots`` directory under ``tmpdir``. The task is expected to end up
    raising ``IOError`` once ``open`` has been called five times.
    """
    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    filename = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1711.10662.tar.gz'))

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'cs.CV',
                ],
                'value': '1711.10662',
            },
        ],
    }  # holdingpen/807096
    extra_data = {}
    files = MockFiles({
        '1711.10662.tar.gz': AttrDict({
            'file': AttrDict({
                'uri': filename,
            }),
        }),
    })
    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    temporary_dir = tmpdir.mkdir('plots')
    mock_os.path.abspath.return_value = str(temporary_dir)

    with pytest.raises(IOError):
        with patch(
                'inspirehep.modules.workflows.tasks.arxiv.open') as mock_open:
            mock_open.side_effect = side_effect_open
            arxiv_plot_extract(obj, eng)

    # This check has to live outside the ``pytest.raises`` block: statements
    # placed after the raising call inside it are never executed. The mock
    # keeps its call records after the patch has been undone.
    assert mock_open.call_count == 5
def test_arxiv_author_list_only_overrides_authors():
    """Check that ``arxiv_eprints`` and ``$schema`` survive the author task.

    After running the author-list task on the ``1703.09986.tar.gz`` fixture,
    both keys must still be present on the record with the values of the
    input payload.
    """
    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1703.09986.tar.gz'))

    # record/1519995
    record = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': [
            {
                'categories': ['hep-ex'],
                'value': '1703.09986',
            },
        ],
    }
    # ``validate`` raises on an invalid payload, so no assert is needed here.
    validate(record['arxiv_eprints'], eprints_schema)

    tarball = AttrDict({'file': AttrDict({'uri': tarball_path})})
    attached = MockFiles({'1703.09986.tar.gz': tarball})

    obj = MockObj(record, {}, files=attached)
    eng = MockEng()

    task = arxiv_author_list()
    task(obj, eng)

    for untouched_key in ('arxiv_eprints', '$schema'):
        assert untouched_key in obj.data
        assert obj.data[untouched_key] == record[untouched_key]
def test_arxiv_plot_extract_logs_when_images_are_invalid(mock_process_tarball):
    """Check the exact error message logged on a ``DelegateError``.

    ``mock_process_tarball`` is configured to raise ``DelegateError``. The
    task must return ``None`` and the error log must consist of the
    "Error extracting plots" message for the eprint.
    """
    mock_process_tarball.side_effect = DelegateError

    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']

    # synthetic data
    record = {
        'arxiv_eprints': [
            {
                'categories': ['physics.ins-det'],
                'value': '1612.00624',
            },
        ],
    }
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    tarball = AttrDict({
        'file': AttrDict({
            'uri': 'http://export.arxiv.org/e-print/1612.00624',
        }),
    })
    attached = MockFiles({'1612.00624.tar.gz': tarball})

    obj = MockObj(record, {}, files=attached)
    eng = MockEng()

    assert arxiv_plot_extract(obj, eng) is None

    wanted_message = 'Error extracting plots for 1612.00624. Report and skip.'
    assert wanted_message == obj.log._error.getvalue()
def test_arxiv_plot_extract_logs_when_tarball_is_invalid(mock_process_tarball):
    """Check the exact info message logged on an ``InvalidTarball``.

    ``mock_process_tarball`` is configured to raise ``InvalidTarball``. The
    task must return ``None`` and the info log must consist of the
    "Invalid tarball" message naming both the URI and the arXiv identifier.
    """
    mock_process_tarball.side_effect = InvalidTarball

    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']

    # synthetic data
    record = {
        'arxiv_eprints': [
            {
                'categories': ['physics.ins-det'],
                'value': '1612.00626',
            },
        ],
    }
    assert validate(record['arxiv_eprints'], eprints_schema) is None

    tarball = AttrDict({
        'file': AttrDict({
            'uri': 'http://export.arxiv.org/e-print/1612.00626',
        }),
    })
    attached = MockFiles({'1612.00626.tar.gz': tarball})

    obj = MockObj(record, {}, files=attached)
    eng = MockEng()

    assert arxiv_plot_extract(obj, eng) is None

    wanted_message = (
        'Invalid tarball http://export.arxiv.org/e-print/1612.00626 '
        'for arxiv_id 1612.00626'
    )
    assert wanted_message == obj.log._info.getvalue()
def test_prepare_files_skips_empty_files():
    """Check that an empty ``foo.pdf`` entry leaves the record and log empty.

    The attached file is an ``AttrDict`` with no content; ``prepare_files``
    must return ``None`` without adding data or writing to the info log.
    """
    attached = MockFiles({'foo.pdf': AttrDict({})})

    obj = MockObj({}, {}, files=attached)
    eng = MockEng()

    assert prepare_files(obj, eng) is None

    # Nothing may have been added to the record ...
    assert {} == obj.data
    # ... and nothing may have been logged at info level.
    assert '' == obj.log._info.getvalue()