示例#1
0
def test_populate_arxiv_document():
    """``populate_arxiv_document`` adds one hidden fulltext document for the eprint.

    The arXiv PDF URL is served from a local fixture through ``requests_mock``.
    """
    pdf_url = 'http://export.arxiv.org/pdf/1605.03844'

    with requests_mock.Mocker() as requests_mocker:
        pdf_bytes = pkg_resources.resource_string(
            __name__, os.path.join('fixtures', '1605.03844.pdf'))
        requests_mocker.register_uri('GET', pdf_url, content=pdf_bytes)

        eprints = [
            {
                'categories': ['physics.ins-det'],
                'value': '1605.03844',
            },
        ]  # literature/1458302
        eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
        assert validate(eprints, eprints_schema) is None

        obj = MockObj({'arxiv_eprints': eprints}, {}, files=MockFiles({}))
        eng = MockEng()

        assert populate_arxiv_document(obj, eng) is None

        expected_documents = [
            {
                'key': '1605.03844.pdf',
                'fulltext': True,
                'hidden': True,
                'material': 'preprint',
                'original_url': pdf_url,
                'url': pdf_url,
                'source': 'arxiv',
            },
        ]

        assert expected_documents == obj.data['documents']
def test_populate_submission_document():
    """``populate_submission_document`` turns ``submission_pdf`` into a fulltext document.

    The submitted PDF URL is served from a local fixture through
    ``requests_mock``.
    """
    submission_url = 'http://export.arxiv.org/pdf/1605.03844'

    with requests_mock.Mocker() as requests_mocker:
        requests_mocker.register_uri(
            'GET',
            submission_url,
            content=pkg_resources.resource_string(
                __name__, os.path.join('fixtures', '1605.03844.pdf')),
        )

        acquisition_source = {
            'datetime': '2017-11-30T16:38:43.352370',
            'email': '*****@*****.**',
            'internal_uid': 54252,
            'method': 'submitter',
            'orcid': '0000-0002-2174-4493',
            'source': 'submitter',
            'submission_number': '1',
        }
        source_schema = load_schema('hep')['properties']['acquisition_source']
        assert validate(acquisition_source, source_schema) is None

        obj = MockObj(
            {'acquisition_source': acquisition_source},
            {'submission_pdf': submission_url},
            files=MockFiles({}),
        )
        eng = MockEng()

        assert populate_submission_document(obj, eng) is None

        expected_documents = [
            {
                'fulltext': True,
                'key': 'fulltext.pdf',
                'original_url': submission_url,
                'source': 'submitter',
                'url': submission_url,
            },
        ]

        assert expected_documents == obj.data['documents']
示例#3
0
def test_arxiv_author_list_logs_on_error(mock_os, mock_untar):
    """An ``InvalidTarball`` raised through ``mock_untar`` is written to the error log.

    Args:
        mock_os: mock whose ``path.abspath`` is pointed at a scratch directory
            (presumably injected by a ``patch`` decorator not visible here).
        mock_untar: mock configured below to raise ``InvalidTarball``.
    """
    mock_untar.side_effect = InvalidTarball

    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'hep-th',
                ],
                'value': '1605.07707',
            },
        ],
    }  # synthetic data
    extra_data = {}
    files = MockFiles({
        '1605.07707.tar.gz':
        AttrDict({
            'file':
            AttrDict({
                'uri': 'http://export.arxiv.org/e-print/1605.07707',
            })
        })
    })
    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    default_arxiv_author_list = arxiv_author_list()

    # Create the scratch directory before entering ``try``: if ``mkdtemp``
    # itself failed inside the block, ``finally`` would raise a NameError on
    # ``temporary_dir`` and hide the real error.
    temporary_dir = mkdtemp()
    try:
        mock_os.path.abspath.return_value = temporary_dir

        assert default_arxiv_author_list(obj, eng) is None

        expected = 'Invalid tarball http://export.arxiv.org/e-print/1605.07707 for arxiv_id 1605.07707'
        result = obj.log._error.getvalue()

        assert expected == result
    finally:
        rmtree(temporary_dir)
示例#4
0
def test_arxiv_author_list_does_not_produce_latex():
    """Authors read from the 1802.03388 fixture tarball match the expected list.

    The expected ``full_name`` is a plain unicode string containing a
    non-ASCII character.
    """
    hep_schema = load_schema('hep')
    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1802.03388.tar.gz'))

    eprints = [
        {
            'categories': ['hep-ex'],
            'value': '1802.03388',
        },
    ]
    # ``validate`` is called for its side effect only; its result is not asserted.
    validate(eprints, hep_schema['properties']['arxiv_eprints'])

    expected_authors = [
        {
            'affiliations': [{'value': 'Lund U.'}],
            'ids': [{'value': 'INSPIRE-00061248', 'schema': 'INSPIRE ID'}],
            'full_name': u'Åkesson, Torsten Paul Ake',
        },
    ]
    validate(expected_authors, hep_schema['properties']['authors'])

    tarball_entry = AttrDict({'file': AttrDict({'uri': tarball_path})})
    obj = MockObj(
        {'arxiv_eprints': eprints},
        {},
        files=MockFiles({'1802.03388.tar.gz': tarball_entry}),
    )
    eng = MockEng()

    extract_authors = arxiv_author_list()

    assert extract_authors(obj, eng) is None
    assert obj.data.get('authors') == expected_authors
def test_download_documents_with_multiple_documents():
    """``download_documents`` replaces the URL of each of the two documents.

    Both remote URLs are mocked and serve the same fixture PDF.
    """
    first_url = 'http://export.arxiv.org/pdf/1605.03844'
    second_url = 'http://export.arxiv.org/pdf/1605.03845'

    with requests_mock.Mocker() as requests_mocker:
        for remote_url in (first_url, second_url):
            requests_mocker.register_uri(
                'GET',
                remote_url,
                content=pkg_resources.resource_string(
                    __name__, os.path.join('fixtures', '1605.03844.pdf')),
            )

        incoming_documents = [
            {'key': '1605.03844.pdf', 'url': first_url},
            {'key': '1605.03845.pdf', 'url': second_url},
        ]  # literature/1458302
        documents_schema = load_schema('hep')['properties']['documents']
        assert validate(incoming_documents, documents_schema) is None

        obj = MockObj({'documents': incoming_documents}, {}, files=MockFiles({}))
        eng = MockEng()

        assert download_documents(obj, eng) is None

        documents = obj.data['documents']

        assert 2 == len(documents)
        assert '/api/files/0b9dd5d1-feae-4ba5-809d-3a029b0bc110/1605.03844.pdf' == documents[0]['url']
        assert '/api/files/0b9dd5d1-feae-4ba5-809d-3a029b0bc110/1605.03845.pdf' == documents[1]['url']
示例#6
0
def test_populate_arxiv_document_logs_on_pdf_not_existing():
    """``populate_arxiv_document`` logs that no PDF is available.

    ``export.arxiv.org`` is mocked to answer 200 with an HTML fixture and
    ``arxiv.org`` to answer 500; the task must return ``None`` and leave
    ``'No PDF is available for 1707.02785'`` on the info log.
    """
    # requests_mock expects ``content`` to be a byte string, so the empty body
    # is spelled ``b''`` (identical to ``''`` on Python 2, still bytes on
    # Python 3).
    response500 = {'content': b'', 'status_code': 500}
    response200 = {
        'content': pkg_resources.resource_string(
            __name__, os.path.join('fixtures', '1707.02785.html')),
        'status_code': 200,
    }
    with requests_mock.Mocker() as requests_mocker:
        requests_mocker.get(
            'http://export.arxiv.org/pdf/1707.02785',
            (response200, ),
        )
        requests_mocker.get(
            'http://arxiv.org/pdf/1707.02785',
            (response500, ),
        )
        schema = load_schema('hep')
        subschema = schema['properties']['arxiv_eprints']

        data = {
            'arxiv_eprints': [
                {
                    'categories': [
                        'cs.CV',
                    ],
                    'value': '1707.02785',
                },
            ],
        }  # literature/1458302
        extra_data = {}
        files = MockFiles({})
        assert validate(data['arxiv_eprints'], subschema) is None

        obj = MockObj(data, extra_data, files=files)
        eng = MockEng()

        assert populate_arxiv_document(obj, eng) is None

        expected = 'No PDF is available for 1707.02785'
        result = obj.log._info.getvalue()

        assert expected == result
def test_prepare_files_does_nothing_when_obj_has_no_files():
    """With no files, ``prepare_files`` leaves the data and the info log empty."""
    obj = MockObj({}, {}, files=MockFiles({}))
    eng = MockEng()

    assert prepare_files(obj, eng) is None

    # Neither the record data nor the info log may have been touched.
    assert {} == obj.data
    assert '' == obj.log._info.getvalue()
示例#8
0
def test_get_document_in_workflow_prefers_fulltext():
    """The document flagged ``fulltext`` is chosen even though it is listed second."""
    documents = [
        {'key': 'table_of_contents.pdf'},
        {'key': 'fulltext.xml', 'fulltext': True},
    ]

    files = MockFiles({})
    for file_key in ('fulltext.xml', 'table_of_contents.pdf'):
        files[file_key] = None

    obj = MockObj({'documents': documents}, {}, files=files)

    with get_document_in_workflow(obj) as local_file:
        assert local_file == files['fulltext.xml'].file.uri
def test_arxiv_plot_extract_handles_duplicate_plot_names(mock_os):
    """``arxiv_plot_extract`` on the 1711.10662 fixture yields 66 figures and 67 file keys.

    Args:
        mock_os: mock whose ``path.abspath`` is pointed at a scratch directory
            (presumably injected by a ``patch`` decorator not visible here).
    """
    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    filename = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1711.10662.tar.gz'))

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'cs.CV',
                ],
                'value': '1711.10662',
            },
        ],
    }  # holdingpen/807096
    extra_data = {}
    files = MockFiles({
        '1711.10662.tar.gz':
        AttrDict({
            'file': AttrDict({
                'uri': filename,
            }),
        }),
    })
    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    # Create the scratch directory before entering ``try``: if ``mkdtemp``
    # itself failed inside the block, ``finally`` would raise a NameError on
    # ``temporary_dir`` and hide the real error.
    temporary_dir = mkdtemp()
    try:
        mock_os.path.abspath.return_value = temporary_dir

        assert arxiv_plot_extract(obj, eng) is None

        assert len(obj.data['figures']) == 66
        assert len(obj.files.keys) == 67
    finally:
        rmtree(temporary_dir)
def test_prepare_files_skips_empty_files():
    """A file entry that is an empty ``AttrDict`` adds nothing to the data or the info log."""
    empty_entry = AttrDict({})
    obj = MockObj({}, {}, files=MockFiles({'foo.pdf': empty_entry}))
    eng = MockEng()

    assert prepare_files(obj, eng) is None

    # Neither the record data nor the info log may have been touched.
    assert {} == obj.data
    assert '' == obj.log._info.getvalue()
示例#11
0
def test_get_document_in_workflow_takes_first_among_equals():
    """With no ``fulltext`` flag the first listed document is used and an error is logged."""
    documents = [
        {'key': 'table_of_contents.pdf'},
        {'key': 'document.pdf'},
    ]

    files = MockFiles({})
    for file_key in ('document.pdf', 'table_of_contents.pdf'):
        files[file_key] = None

    obj = MockObj({'documents': documents}, {}, files=files)

    with get_document_in_workflow(obj) as local_file:
        assert local_file == files['table_of_contents.pdf'].file.uri

    logged_errors = obj.log._error.getvalue()
    assert 'More than one document in workflow, first one used' in logged_errors
示例#12
0
def test_arxiv_plot_extract_retries_on_io_error(mock_os, tmpdir):
    """``arxiv_plot_extract`` raises ``IOError`` after ``open`` was called five times.

    Args:
        mock_os: mock whose ``path.abspath`` is pointed at the scratch directory
            (presumably injected by a ``patch`` decorator not visible here).
        tmpdir: object providing ``mkdir`` (presumably pytest's ``tmpdir``
            fixture).
    """
    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    filename = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1711.10662.tar.gz'))

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'cs.CV',
                ],
                'value': '1711.10662',
            },
        ],
    }  # holdingpen/807096
    extra_data = {}
    files = MockFiles({
        '1711.10662.tar.gz':
        AttrDict({
            'file': AttrDict({
                'uri': filename,
            }),
        }),
    })
    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    temporary_dir = tmpdir.mkdir('plots')
    mock_os.path.abspath.return_value = str(temporary_dir)

    with patch(
            'inspirehep.modules.workflows.tasks.arxiv.open') as mock_open:
        mock_open.side_effect = side_effect_open

        # Only the call expected to raise lives inside ``pytest.raises``; any
        # statement placed after it in that block would never execute.
        with pytest.raises(IOError):
            arxiv_plot_extract(obj, eng)

        # Checked after the exception has been caught so the retry count is
        # actually verified.
        assert mock_open.call_count == 5
def test_prepare_files():
    """``prepare_files`` writes a schema-valid ``_fft`` entry for ``foo.pdf`` and logs it."""
    fft_schema = load_schema('hep')['properties']['_fft']

    stored_file = AttrDict({
        'obj': AttrDict({
            'file': AttrDict({'uri': '/data/foo.pdf'}),
        }),
    })
    obj = MockObj({}, {}, files=MockFiles({'foo.pdf': stored_file}))
    eng = MockEng()

    assert prepare_files(obj, eng) is None

    expected_fft = [
        {
            'path': '/data/foo.pdf',
            'type': 'INSPIRE-PUBLIC',
            'filename': 'foo',
            'format': '.pdf',
        },
    ]
    record = obj.data

    assert validate(record['_fft'], fft_schema) is None
    assert expected_fft == record['_fft']

    assert 'Non-user PDF files added to FFT.' == obj.log._info.getvalue()
示例#14
0
def test_arxiv_author_list_only_overrides_authors():
    """Running the author-list task leaves ``$schema`` and ``arxiv_eprints`` unchanged.

    The expected values are independent literals rather than entries of the
    ``data`` dict passed to ``MockObj``: if ``MockObj`` keeps a reference to
    that dict (not visible here), comparing ``obj.data`` with ``data`` would
    compare an object with itself and could never fail.
    """
    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    filename = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1703.09986.tar.gz'))

    expected_schema_url = 'http://localhost:5000/hep.json'
    expected_eprints = [
        {
            'categories': [
                'hep-ex',
            ],
            'value': '1703.09986',
        },
    ]

    data = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': [
            {
                'categories': [
                    'hep-ex',
                ],
                'value': '1703.09986',
            },
        ],
    }  # record/1519995
    validate(data['arxiv_eprints'], subschema)

    extra_data = {}
    files = MockFiles({
        '1703.09986.tar.gz': AttrDict({
            'file': AttrDict({
                'uri': filename,
            })
        })
    })

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    default_arxiv_author_list = arxiv_author_list()
    default_arxiv_author_list(obj, eng)

    assert 'arxiv_eprints' in obj.data
    assert obj.data['arxiv_eprints'] == expected_eprints
    assert '$schema' in obj.data
    assert obj.data['$schema'] == expected_schema_url
示例#15
0
def test_arxiv_author_list_handles_auto_ignore_comment(mock_os):
    """The author-list task returns ``None`` for the 1703.09986 fixture tarball.

    Args:
        mock_os: mock whose ``path.abspath`` is pointed at a scratch directory
            (presumably injected by a ``patch`` decorator not visible here).
    """
    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    filename = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1703.09986.tar.gz'))

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'hep-ex',
                ],
                'value': '1703.09986',
            },
        ],
    }  # record/1519995
    extra_data = {}
    files = MockFiles({
        '1703.09986.tar.gz':
        AttrDict({'file': AttrDict({
            'uri': filename,
        })})
    })
    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    default_arxiv_author_list = arxiv_author_list()

    # Create the scratch directory before entering ``try``: if ``mkdtemp``
    # itself failed inside the block, ``finally`` would raise a NameError on
    # ``temporary_dir`` and hide the real error.
    temporary_dir = mkdtemp()
    try:
        mock_os.path.abspath.return_value = temporary_dir

        assert default_arxiv_author_list(obj, eng) is None
    finally:
        rmtree(temporary_dir)
示例#16
0
def test_arxiv_package_download_logs_on_success():
    """A successful ``arxiv_package_download`` is reported on the info log.

    The e-print URL is served from a local fixture through ``requests_mock``.
    """
    with requests_mock.Mocker() as requests_mocker:
        tarball_bytes = pkg_resources.resource_string(
            __name__, os.path.join('fixtures', '1605.03959.tar.gz'))
        requests_mocker.register_uri(
            'GET',
            'http://export.arxiv.org/e-print/1605.03959',
            content=tarball_bytes,
        )

        eprints = [
            {
                'categories': [
                    'hep-th',
                    'cond-mat.stat-mech',
                    'cond-mat.str-el',
                ],
                'value': '1605.03959',
            },
        ]  # literature/1458968
        eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
        assert validate(eprints, eprints_schema) is None

        obj = MockObj({'arxiv_eprints': eprints}, {}, files=MockFiles({}))
        eng = MockEng()

        assert arxiv_package_download(obj, eng) is None

        logged_info = obj.log._info.getvalue()
        assert 'Tarball retrieved from arXiv for 1605.03959' == logged_info
示例#17
0
def test_arxiv_plot_extract_logs_when_images_are_invalid(mock_process_tarball):
    """A ``DelegateError`` raised through ``mock_process_tarball`` is logged as an error.

    The task itself must still return ``None``.
    """
    mock_process_tarball.side_effect = DelegateError

    eprints = [
        {
            'categories': ['physics.ins-det'],
            'value': '1612.00624',
        },
    ]  # synthetic data
    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']

    tarball_entry = AttrDict({
        'file': AttrDict({
            'uri': 'http://export.arxiv.org/e-print/1612.00624',
        }),
    })
    files = MockFiles({'1612.00624.tar.gz': tarball_entry})
    assert validate(eprints, eprints_schema) is None

    obj = MockObj({'arxiv_eprints': eprints}, {}, files=files)
    eng = MockEng()

    assert arxiv_plot_extract(obj, eng) is None

    logged_errors = obj.log._error.getvalue()
    assert 'Error extracting plots for 1612.00624. Report and skip.' == logged_errors
示例#18
0
def test_arxiv_plot_extract_logs_when_tarball_is_invalid(mock_process_tarball):
    """An ``InvalidTarball`` raised through ``mock_process_tarball`` is written to the info log.

    The task itself must still return ``None``.
    """
    mock_process_tarball.side_effect = InvalidTarball

    eprints = [
        {
            'categories': ['physics.ins-det'],
            'value': '1612.00626',
        },
    ]  # synthetic data
    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']

    tarball_entry = AttrDict({
        'file': AttrDict({
            'uri': 'http://export.arxiv.org/e-print/1612.00626',
        }),
    })
    files = MockFiles({'1612.00626.tar.gz': tarball_entry})
    assert validate(eprints, eprints_schema) is None

    obj = MockObj({'arxiv_eprints': eprints}, {}, files=files)
    eng = MockEng()

    assert arxiv_plot_extract(obj, eng) is None

    logged_info = obj.log._info.getvalue()
    assert 'Invalid tarball http://export.arxiv.org/e-print/1612.00626 for arxiv_id 1612.00626' == logged_info
示例#19
0
def test_download_documents_with_local_file(mock_fsopen):
    """A ``file://`` document is opened exactly once, with ``%3B`` decoded to ``;``.

    Args:
        mock_fsopen: mock standing in for the opener of local files; its
            return value is set to a dummy string.
    """
    remote_url = 'http://export.arxiv.org/pdf/1605.03844'

    with requests_mock.Mocker() as requests_mocker:
        requests_mocker.register_uri(
            'GET',
            remote_url,
            content=pkg_resources.resource_string(
                __name__, os.path.join('fixtures', '1605.03844.pdf')),
        )
        mock_fsopen.return_value = 'jessica jones'

        incoming_documents = [
            {'key': '1605.03844.pdf', 'url': remote_url},
            {'key': 'jessicajones.pdf;1', 'url': 'file://jessicajones.pdf%3B1'},
        ]
        documents_schema = load_schema('hep')['properties']['documents']
        assert validate(incoming_documents, documents_schema) is None

        obj = MockObj({'documents': incoming_documents}, {}, files=MockFiles({}))
        eng = MockEng()

        assert download_documents(obj, eng) is None

        assert 2 == len(obj.data['documents'])
        mock_fsopen.assert_called_once_with(
            'file://jessicajones.pdf;1', mode='rb')
示例#20
0
def test_arxiv_fulltext_download_retries_on_error():
    """``arxiv_fulltext_download`` succeeds when a 500 is followed by a 200.

    The PDF URL is registered with two consecutive responses: an empty 500,
    then the fixture PDF with status 200. The task must return ``None`` and
    log that the PDF was retrieved.
    """
    httpretty.register_uri(
        httpretty.GET,
        'http://export.arxiv.org/pdf/1605.03814',
        responses=[
            httpretty.Response(body='', status=500),
            # Status given as an int, matching the 500 response above.
            httpretty.Response(
                body=pkg_resources.resource_string(
                    __name__, os.path.join('fixtures', '1605.03814.pdf')),
                status=200,
            ),
        ])

    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    data = {
        'arxiv_eprints': [
            {
                'categories': [
                    'hep-ex',
                ],
                'value': '1605.03814',
            },
        ],
    }  # literature/1458270
    extra_data = {}
    files = MockFiles({})
    assert validate(data['arxiv_eprints'], subschema) is None

    obj = MockObj(data, extra_data, files=files)
    eng = MockEng()

    assert arxiv_fulltext_download(obj, eng) is None

    expected = 'PDF retrieved from arXiv for 1605.03814'
    result = obj.log._info.getvalue()

    assert expected == result
示例#21
0
def test_arxiv_author_list_logs_on_error(mock_untar):
    """With ``mock_untar`` raising ``InvalidTarball`` the arXiv id appears on the info log.

    The task itself must still return ``None``.
    """
    mock_untar.side_effect = InvalidTarball

    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
    fixture_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '1605.07707'))

    eprints = [
        {
            'categories': ['hep-th'],
            'value': '1605.07707',
        },
    ]  # synthetic data
    tarball_entry = AttrDict({'file': AttrDict({'uri': fixture_path})})
    files = MockFiles({'1605.07707.tar.gz': tarball_entry})
    assert validate(eprints, eprints_schema) is None

    obj = MockObj({'arxiv_eprints': eprints}, {}, files=files)
    eng = MockEng()

    extract_authors = arxiv_author_list()

    assert extract_authors(obj, eng) is None

    logged_info = obj.log._info.getvalue()
    assert '1605.07707' in logged_info
示例#22
0
def test_arxiv_plot_extract_no_file(mock_process_tarball):
    """With no files attached the task logs ``'No file named='`` and never calls ``mock_process_tarball``."""
    eprints = [
        {
            'categories': ['physics.ins-det'],
            'value': '1612.00626',
        },
    ]  # synthetic data
    eprints_schema = load_schema('hep')['properties']['arxiv_eprints']
    assert validate(eprints, eprints_schema) is None

    obj = MockObj({'arxiv_eprints': eprints}, {}, files=MockFiles({}))
    eng = MockEng()

    assert arxiv_plot_extract(obj, eng) is None

    logged_info = obj.log._info.getvalue()
    assert 'No file named=' in logged_info
    mock_process_tarball.assert_not_called()
示例#23
0
def test_get_document_in_workflow_returns_None_when_no_documents():
    """With empty data and no files the context manager yields ``None``."""
    obj = MockObj({}, {}, files=MockFiles({}))

    with get_document_in_workflow(obj) as local_file:
        assert local_file is None
示例#24
0
def test_arxiv_author_list_handles_multiple_author_xml_files():
    """Authors read from the ``multiple_author_lists`` fixture tarball match the two expected entries."""
    hep_schema = load_schema('hep')

    tarball_path = pkg_resources.resource_filename(
        __name__,
        os.path.join('fixtures', '1703.09986.multiple_author_lists.tar.gz'))

    data = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': [
            {
                'categories': ['hep-ex'],
                'value': '1703.09986',
            },
        ],
    }  # record/1519995
    # ``validate`` is called for its side effect only; its result is not asserted.
    validate(data['arxiv_eprints'], hep_schema['properties']['arxiv_eprints'])

    tarball_entry = AttrDict({'file': AttrDict({'uri': tarball_path})})
    obj = MockObj(
        data, {}, files=MockFiles({'1703.09986.tar.gz': tarball_entry}))
    eng = MockEng()

    extract_authors = arxiv_author_list()
    extract_authors(obj, eng)

    yerevan = 'Yerevan Phys. Inst.'
    expected_authors = [
        {
            'affiliations': [{'value': yerevan}],
            'ids': [
                {'value': 'INSPIRE-00312131', 'schema': 'INSPIRE ID'},
                {'value': 'CERN-432142', 'schema': 'CERN'},
            ],
            'full_name': 'Sirunyan, Albert M',
        },
        {
            'affiliations': [{'value': yerevan}],
            'ids': [
                {'value': 'INSPIRE-00312132', 'schema': 'INSPIRE ID'},
                {'value': 'CERN-432143', 'schema': 'CERN'},
            ],
            'full_name': 'Weary, Jake',
        },
    ]
    validate(expected_authors, hep_schema['properties']['authors'])

    assert obj.data.get('authors') == expected_authors