Пример #1
0
def test_get_pdf_file_local():
    '''Check that a local PDF path is opened as an io.BufferedReader.

    The sample file is looked up under ``pdf2textbox/data`` relative to the
    current working directory, so the test expects to be started from the
    directory that contains ``pdf2textbox``.
    '''

    cwd = os.getcwd()
    loc = '{}/pdf2textbox/data'.format(cwd)
    pdf_loc = '{}/Id=MMP15%2F57_5694_5696.pdf'.format(loc)

    pdf = _get_pdf_file(pdf_loc, verbose=False)
    try:
        assert isinstance(pdf, io.BufferedReader)
    finally:
        # The handle is not used after the type check; close it so the test
        # does not leave an open file behind (ResourceWarning at teardown).
        # Guarded: if the type check failed, the object may have no close().
        if hasattr(pdf, 'close'):
            pdf.close()
Пример #2
0
def test_get_pdf_file_url():
    '''Check that a PDF requested by URL comes back as an io.BytesIO.

    The URL is printed before the call, so it shows up in the captured
    output of the test run.
    NOTE(review): presumably requires network access to landtag.nrw.de --
    confirm against _get_pdf_file.
    '''

    url = ''.join((
        'https://www.landtag.nrw.de/portal/WWW/dokumentenarchiv/Dokument?',
        'Id=MMP16%2F11|542|544',
    ))
    print('url', url)

    fetched = _get_pdf_file(url, verbose=False)
    assert isinstance(fetched, io.BytesIO)
Пример #3
0
def test_convert_text_only():
    '''Check that pages 1 and 2 of the text-only sample have a 'column'.

    Converts ``01a_only_text.pdf`` (resolved relative to the current working
    directory) and asserts that the 'column' entry of both pages is truthy.
    '''

    data_dir = os.getcwd() + '/pdf2textbox/data'
    pdf_path = data_dir + '/01a_only_text.pdf'

    source = _get_pdf_file(pdf_path, verbose=False)
    pages = _pdf_to_text_all(source, verbose=False)

    # Same order as before: page 1 first, then page 2.
    for page_no in (1, 2):
        assert pages[page_no]['column']
Пример #4
0
def test_convert_two_cols():
    '''Check the column layout detected for the two-column sample.

    Converts ``02a_two_cols.pdf`` (resolved relative to the current working
    directory) and asserts that page 1 has truthy 'left_column' and
    'right_column' entries and page 2 a truthy 'column' entry.
    '''

    pdf_path = '{}/pdf2textbox/data/{}'.format(os.getcwd(), '02a_two_cols.pdf')

    pages = _pdf_to_text_all(
        _get_pdf_file(pdf_path, verbose=False),
        verbose=False,
    )

    first_page = pages[1]
    assert first_page['left_column']
    assert first_page['right_column']

    second_page = pages[2]
    assert second_page['column']
Пример #5
0
def test_pdf_to_text_slice(cmd_option_slice):
    '''Check that converting a page range of a PDF yields a dict.

    The page range is obtained from _get_pages for the same path and passed
    on to _pdf_to_text_slice.

    cmd_option_slice is not referenced in the body.
    NOTE(review): presumably a pytest fixture requested for its side
    effects -- confirm where it is defined.
    '''

    data_dir = '{}/pdf2textbox/data'.format(os.getcwd())
    pdf_path = '{}/{}'.format(data_dir, 'Id=MMP16%2F139_14622_14624.pdf')

    source = _get_pdf_file(pdf_path, verbose=False)
    first_page, last_page = _get_pages(pdf_path, verbose=False)

    result = _pdf_to_text_slice(source, first_page, last_page, verbose=False)
    assert isinstance(result, dict)
Пример #6
0
def test_convert_three_cols_and_header():
    '''Check header and column entries for the three-column sample.

    Converts ``06a_three_cols_and_header.pdf`` (resolved relative to the
    current working directory). Page 1 must have truthy 'header',
    'left_column', 'center_column' and 'right_column' entries; page 2 must
    have truthy 'header', 'left_column' and 'right_column' entries.
    '''

    data_dir = os.getcwd() + '/pdf2textbox/data'
    pdf_path = data_dir + '/06a_three_cols_and_header.pdf'

    source = _get_pdf_file(pdf_path, verbose=False)
    pages = _pdf_to_text_all(source, verbose=False)

    # Keys are checked page by page, in the same order as the original
    # sequence of assert statements.
    for key in ('header', 'left_column', 'center_column', 'right_column'):
        assert pages[1][key]
    for key in ('header', 'left_column', 'right_column'):
        assert pages[2][key]