Exemplo n.º 1
0
def test_document_find_simple(simple_document_path):
    """Search for 'Multivio' and compare the hits with the expected ones."""
    title_hit = {
        'page': 1,
        'text': "Multivio: Project description",
        'BBox': {
            'x1': 196.189,
            'y1': 165.65651239999994,
            'x2': 254.84358934,
            'y2': 180.96100299999992,
        },
    }
    paragraph_hit = {
        'page': 1,
        'text': (
            'Multivio is an Internet-based application '
            'for browsing and accessing digital doc-'
        ),
        'BBox': {
            'x1': 124.80199999999999,
            'y1': 287.6239556,
            'x2': 161.88678224000006,
            'y2': 296.4707444,
        },
    }

    document = PDF(simple_document_path, 1)
    document.load()
    # The hits are compared as an ordered list: title first, then paragraph.
    assert document.find_text_page('Multivio') == [title_hit, paragraph_hit]
Exemplo n.º 2
0
def extract_text(file, outfilename=None):
    """Extract the full text of a given PDF file.

    The text is obtained from ``PDF.get_text_page()``. Any exception raised
    while creating, loading or reading the document is reported through
    ``error`` and the function carries on with an empty result.

    :param file: path of the PDF, handed to ``PDF(path=...)``.
    :param outfilename: optional path of a file to store the text in.
    :returns: the extracted text when ``outfilename`` is not given or when
        nothing was extracted (a warning is emitted in the latter case);
        otherwise the value returned by ``write()`` after storing the
        space-joined text as UTF-8 encoded bytes.
    """
    from invenio_multivio.pdf.api import PDF

    extracted = []
    try:
        document = PDF(path=file)
        document.load()
        extracted = document.get_text_page()
    except Exception:
        # Best effort: a failure is logged and handled as "no text" below.
        error('text generation failed')

    if not extracted:
        warning('%s: do not contains text' % file)
        return extracted
    if not outfilename:
        return extracted

    with open(outfilename, 'wb') as stream:
        return stream.write(" ".join(extracted).encode('utf-8'))
Exemplo n.º 3
0
def test_document_no_toc(document_no_toc_path, json_toc_res):
    """Check that ``get_toc()`` returns ``None`` for the no-TOC document.

    ``json_toc_res`` is not used by this test; the parameter is kept so the
    test signature stays unchanged.
    """
    pdf = PDF(document_no_toc_path)
    pdf.load()
    # A direct assertion lets pytest show the unexpected value on failure,
    # which a bare ``assert False`` cannot do.
    assert pdf.get_toc() is None
Exemplo n.º 4
0
def test_document_metadata(simple_document_path):
    """Compare the metadata of the sample document with the known values."""
    expected_metadata = {
        'title': 'Multivio: Project description',
        'creator': 'Miguel Moreira',
        'mime': 'application/pdf',
        'fileSize': 70909,
        'nPages': 3,
        'nativeSize': ((595.2760000000001, 841.89), {}),
    }

    document = PDF(simple_document_path)
    document.load()
    assert document.get_metadata() == expected_metadata
Exemplo n.º 5
0
def test_document_render(simple_document_path):
    """Render a page at its own size, then rotate it by 90 degrees."""
    document = PDF(simple_document_path, 2)
    document.load()
    document.render_page(document.get_width(), document.get_height())

    assert document
    assert round(document.get_scale(), 2) == 1
    assert document.get_width() == 595
    assert document.get_height() == 841

    # The exact JPEG size cannot be checked as it depends on the poppler
    # version; only a lower bound is asserted.
    jpeg_data = document.jpeg.read()
    assert len(jpeg_data) > 100

    # After the rotation the image width and height are expected to swap.
    document.rotate(90)
    rotated_size = document.pil_image.size
    assert rotated_size[0] == 841
    assert rotated_size[1] == 595
Exemplo n.º 6
0
def test_document_find(simple_document_path):
    """Search for the phrase 'Multivio is' and compare the single hit."""
    expected_hit = {
        'page': 1,
        'text': (
            'Multivio is an Internet-based '
            'application for browsing and accessing digital doc-'
        ),
        'BBox': {
            'x1': 124.80199999999999,
            'y1': 287.6239556,
            'x2': 171.2336935600001,
            'y2': 296.4707444,
        },
    }

    document = PDF(simple_document_path, 1)
    document.load()
    hits = document.find_text_page(str('Multivio is'))
    assert hits == [expected_hit]
Exemplo n.º 7
0
def generate_thumbnail(filename, outfilename=None):
    """Generate a thumbnail for a given PDF file.

    The document is opened with ``page_nr=0`` and rendered with a maximum
    width and height of 80.

    :param filename: path of the PDF, handed to ``PDF(path=...)``.
    :param outfilename: optional path to save the rendered image to.
    :returns: ``None`` when the rendering raised an exception (the failure
        is reported through ``error``); the rendered image when
        ``outfilename`` is not given; otherwise whatever the image's
        ``save(filename=...)`` call returns.
    """
    from invenio_multivio.pdf.api import PDF

    try:
        document = PDF(path=filename, page_nr=0)
        document.load()
        thumbnail = document.render_page(max_width=80, max_height=80)
    except Exception:
        error('image generation failed')
        return None

    if not outfilename:
        return thumbnail
    return thumbnail.save(filename=outfilename)
Exemplo n.º 8
0
def test_document_indexing(simple_document_path):
    """Check the value currently returned by ``get_indexing()``."""
    document = PDF(simple_document_path, 1)
    document.load()
    # The test pins the literal string, not the ``NotImplemented`` constant.
    assert document.get_indexing() == "NotImplemented"
Exemplo n.º 9
0
def test_document_sizes(simple_document_path):
    """Compare the sizes reported by ``get_sizes()`` with the known values."""
    expected_sizes = {'width': 595, 'height': 841}

    document = PDF(simple_document_path, 1)
    document.load()
    assert document.get_sizes() == expected_sizes
Exemplo n.º 10
0
def test_document_toc(simple_document_path, json_toc_res):
    """Compare the table of contents with the ``json_toc_res`` fixture."""
    document = PDF(simple_document_path)
    document.load()
    table_of_contents = document.get_toc()
    assert table_of_contents == json_toc_res
Exemplo n.º 11
0
def test_document_find_max_result(simple_document_path):
    """Check the number of hits returned when searching for 'a'."""
    document = PDF(simple_document_path, 1)
    document.load()
    hits = document.find_text_page('a')
    assert len(hits) == 127
Exemplo n.º 12
0
def test_document_text(simple_document_path, text_page):
    """Compare the extracted page text with the ``text_page`` fixture."""
    document = PDF(simple_document_path, 1)
    document.load()
    page_text = document.get_text_page()
    assert page_text == text_page