Exemplo n.º 1
0
def test_process_raw(app, mock_grobid_response, pdf_file):
    """Check that ``process_raw`` accepts PDF content passed as bytes.

    The fixture PDF is read fully into memory and handed to the extractor;
    the returned output must contain a ``teiHeader`` entry.
    """
    # Binary mode: the extractor is given the raw bytes, not decoded text.
    with open(pdf_file, 'rb') as pdf_handle:
        raw_bytes = pdf_handle.read()

    result = PDFExtractor().process_raw(raw_bytes)
    assert 'teiHeader' in result
Exemplo n.º 2
0
def test_extract_metadata_api_error_response(app, mock_grobid_error_response,
                                             pdf_file):
    """Check that extraction fails with a clear message on an API error.

    NOTE(review): ``mock_grobid_error_response`` presumably makes the mocked
    API answer with an error status - confirm against the fixture.
    """
    extractor = PDFExtractor()

    # The call must raise, and the message must be the generic failure text.
    with pytest.raises(Exception) as raised:
        extractor.extract_metadata(pdf_file)

    assert str(raised.value) == 'Metadata extraction failed'
Exemplo n.º 3
0
def test_api_is_alive(app, monkeypatch):
    """Test that ``api_is_alive`` reports the API status correctly.

    Three situations are covered: the API answers normally, the API answers
    with a 503 status, and the request itself raises an exception.
    """
    def _raise_on_request(*args):
        """Stand in for ``do_request`` and fail as an unreachable API would."""
        raise Exception('API request failed')

    pdf_extractor = PDFExtractor()

    # Test API is up
    assert pdf_extractor.api_is_alive()

    # Test API is down
    monkeypatch.setattr(PDFExtractor, 'do_request', lambda *args: ('', 503))
    assert not pdf_extractor.api_is_alive()

    # Test API raise exception. The stub must actually raise: a lambda that
    # merely returns the ``Exception`` class never exercises this path.
    monkeypatch.setattr(PDFExtractor, 'do_request', _raise_on_request)
    assert not pdf_extractor.api_is_alive()
Exemplo n.º 4
0
def metadata():
    """Return the metadata extracted from an uploaded PDF as JSON.

    The PDF is expected in the ``file`` entry of the request's uploaded
    files. Any exception raised along the way (including a missing upload)
    is turned into a JSON body ``{"error": <message>}`` with status 400.
    """
    try:
        uploads = request.files
        if 'file' not in uploads:
            raise Exception('File not found')

        uploaded_pdf = uploads['file']
        extractor = PDFExtractor()

        # Hand the raw uploaded bytes to the extractor and serialize its
        # output directly.
        extracted = extractor.process_raw(uploaded_pdf.read())
        return jsonify(extracted)
    except Exception as error:
        return jsonify(dict(error=str(error))), 400
Exemplo n.º 5
0
def extract_metadata(pid=None):
    """Extract metadata from the main PDF file of a deposit.

    The deposit is looked up by ``pid``. Its files are filtered down to
    those whose category is ``main`` and whose MIME type is
    ``application/pdf``; the first match is read and passed to the PDF
    extractor, and the formatted result is returned as a JSON response.

    Aborts with 400 when no deposit is found for ``pid`` and with 500 when
    the deposit has no main PDF file.
    """
    deposit = DepositRecord.get_record_by_pid(pid)
    if not deposit:
        abort(400)

    # Collect every file flagged as the main document and stored as PDF.
    pdf_candidates = []
    for item in deposit.files:
        if item['category'] == 'main' and item.mimetype == 'application/pdf':
            pdf_candidates.append(item)

    if not pdf_candidates:
        abort(500)

    # Read the whole content of the first matching file from its storage.
    storage = pdf_candidates[0].file.storage()
    with storage.open() as stream:
        raw_pdf = stream.read()

    extractor = PDFExtractor()
    extracted = extractor.process_raw(raw_pdf)
    formatted = format_extracted_data(extracted)

    return make_response(jsonify(formatted))
Exemplo n.º 6
0
def test_extract_metadata(app, mock_grobid_response, pdf_file):
    """Check ``extract_metadata`` on valid, missing and non-PDF inputs."""
    extractor = PDFExtractor()

    # A valid PDF yields an XML document.
    xml_output = extractor.extract_metadata(pdf_file)
    assert xml_output.startswith('<?xml version="1.0" encoding="UTF-8"?>')

    # A path that does not exist is rejected.
    with pytest.raises(ValueError) as missing_error:
        extractor.extract_metadata('not_existing_file.pdf')
    assert str(missing_error.value) == 'Input file does not exist'

    # An existing file that is not a PDF is rejected as well; the sample
    # document lives in the ``data`` directory next to this test module.
    tests_dir = os.path.dirname(os.path.abspath(__file__))
    doc_path = tests_dir + '/data/test.doc'
    with pytest.raises(ValueError) as invalid_error:
        extractor.extract_metadata(doc_path)
    assert str(invalid_error.value) == 'Input file is not a valid PDF file'
Exemplo n.º 7
0
def test_do_request(app, mock_grobid_response, pdf_file):
    """Test requests sent to the API through ``do_request``.

    Covers a valid GET call, an unknown endpoint, an invalid request type
    and a POST call uploading a PDF file.
    """
    pdf_extractor = PDFExtractor()
    # Test valid call
    assert pdf_extractor.do_request('isalive', 'get') == (b'true', 200)

    # Test unexisting endpoint
    assert pdf_extractor.do_request('unexisting', 'get')[1] == 404

    # Test invalid request type
    with pytest.raises(ValueError):
        pdf_extractor.do_request('isalive', 'invalid')

    # Test post request. The PDF is opened in a context manager so the file
    # handle is closed once the request has been made.
    with open(pdf_file, 'rb') as pdf_handle:
        response = pdf_extractor.do_request(
            'processFulltextDocument',
            'post',
            files={'input': (pdf_file, pdf_handle, 'application/pdf')})
    assert response[1] == 200
Exemplo n.º 8
0
def test_process(app, mock_grobid_response, pdf_file):
    """Check the output modes of ``process``: XML, dictionary and file."""
    xml_prolog = '<?xml version="1.0" encoding="UTF-8"?>'
    extractor = PDFExtractor()

    # XML output is requested by disabling the dictionary output.
    as_xml = extractor.process(pdf_file, dict_output=False)
    assert as_xml.startswith(xml_prolog)

    # Dictionary output must contain the TEI header entry.
    as_dict = extractor.process(pdf_file, dict_output=True)
    assert 'teiHeader' in as_dict

    # With an output file given, the XML is written to that file.
    with tempfile.NamedTemporaryFile(mode='w+b', suffix=".pdf") as target:
        extractor.process(pdf_file, output_file=target.name)

        with open(target.name, 'r') as written:
            file_content = written.read()
        assert file_content.startswith(xml_prolog)
Exemplo n.º 9
0
def test_load_config(app, monkeypatch):
    """Check that creating an extractor fails when the API is not alive."""
    def _api_down(*args):
        """Replacement for ``api_is_alive`` that always reports failure."""
        return False

    monkeypatch.setattr(PDFExtractor, 'api_is_alive', _api_down)

    # Instantiation itself is expected to raise.
    with pytest.raises(ConnectionRefusedError):
        PDFExtractor()