def test_get_pdf_file_local():
    '''Check that a local PDF path is returned as an io.BufferedReader.

    The fixture path is resolved against the current working directory,
    so the test has to be started from the directory that contains
    ``pdf2textbox/data``.
    '''
    pdf_loc = '{}/pdf2textbox/data/{}'.format(
        os.getcwd(), 'Id=MMP15%2F57_5694_5696.pdf')

    pdf = _get_pdf_file(pdf_loc, verbose=False)
    try:
        assert isinstance(pdf, io.BufferedReader)
    finally:
        # A BufferedReader wraps an open OS file handle; release it even
        # when the assertion fails. The getattr guard keeps a wrong return
        # type (e.g. None) reported as an AssertionError, not masked by an
        # AttributeError raised during cleanup.
        close = getattr(pdf, 'close', None)
        if callable(close):
            close()
def test_get_pdf_file_url():
    '''Check that a PDF addressed by URL is returned as an io.BytesIO.

    NOTE(review): ``_get_pdf_file`` is handed a real https URL, so this
    test presumably needs network access -- confirm before running it in
    an offline CI environment.
    '''
    query = 'Id=MMP16%2F11|542|544'
    base = 'https://www.landtag.nrw.de/portal/WWW/dokumentenarchiv/Dokument?'
    url = ''.join((base, query))
    print('url', url)

    stream = _get_pdf_file(url, verbose=False)

    assert isinstance(stream, io.BytesIO)
def test_convert_text_only():
    '''Check that the text-only sample yields a single 'column' entry.

    Asserts a truthy ``'column'`` value under keys 1 and 2 of the result
    (presumably page numbers -- confirm against ``_pdf_to_text_all``).
    The absence of a header is not asserted here.
    '''
    pdf_loc = '{}/pdf2textbox/data/{}'.format(
        os.getcwd(), '01a_only_text.pdf')

    # The with-block closes the PDF handle even if the conversion raises;
    # closing a handle the converter already closed is a no-op.
    with _get_pdf_file(pdf_loc, verbose=False) as pdf:
        boxes = _pdf_to_text_all(pdf, verbose=False)

        assert boxes[1]['column']
        assert boxes[2]['column']
def test_convert_two_cols():
    '''Check that the two-column sample is split into left/right columns.

    Asserts truthy ``'left_column'`` and ``'right_column'`` values under
    key 1 and a truthy ``'column'`` value under key 2 (keys are
    presumably page numbers -- confirm against ``_pdf_to_text_all``).
    '''
    pdf_loc = '{}/pdf2textbox/data/{}'.format(
        os.getcwd(), '02a_two_cols.pdf')

    # The with-block closes the PDF handle even if the conversion raises;
    # closing a handle the converter already closed is a no-op.
    with _get_pdf_file(pdf_loc, verbose=False) as pdf:
        boxes = _pdf_to_text_all(pdf, verbose=False)

        assert boxes[1]['left_column']
        assert boxes[1]['right_column']
        assert boxes[2]['column']
def test_pdf_to_text_slice(cmd_option_slice):
    '''Check that converting a page slice of a PDF returns a dictionary.

    ``cmd_option_slice`` is not referenced in the body; presumably it is
    a pytest fixture requested for its side effect of supplying the slice
    command-line option -- confirm where it is defined.
    '''
    pdf_loc = '{}/pdf2textbox/data/{}'.format(
        os.getcwd(), 'Id=MMP16%2F139_14622_14624.pdf')

    # The with-block closes the PDF handle even if a helper raises;
    # closing a handle the converter already closed is a no-op.
    with _get_pdf_file(pdf_loc, verbose=False) as pdf:
        page_from, page_to = _get_pages(pdf_loc, verbose=False)
        boxes = _pdf_to_text_slice(pdf, page_from, page_to, verbose=False)

        assert isinstance(boxes, dict)
def test_convert_three_cols_and_header():
    '''Check that the three-column sample yields a header and its columns.

    Under key 1 the result must hold truthy ``'header'``,
    ``'left_column'``, ``'center_column'`` and ``'right_column'`` values;
    under key 2 truthy ``'header'``, ``'left_column'`` and
    ``'right_column'`` values (keys are presumably page numbers --
    confirm against ``_pdf_to_text_all``).
    '''
    pdf_loc = '{}/pdf2textbox/data/{}'.format(
        os.getcwd(), '06a_three_cols_and_header.pdf')

    # The with-block closes the PDF handle even if the conversion raises;
    # closing a handle the converter already closed is a no-op.
    with _get_pdf_file(pdf_loc, verbose=False) as pdf:
        boxes = _pdf_to_text_all(pdf, verbose=False)

        expected_keys = {
            1: ('header', 'left_column', 'center_column', 'right_column'),
            2: ('header', 'left_column', 'right_column'),
        }
        # Same assertions, in the same order, as a flat list of asserts.
        for page, keys in expected_keys.items():
            for key in keys:
                assert boxes[page][key]