def test_get_plaintext_document_body(tmpdir):
    """Check plain-text extraction and the rejection of an HTML document.

    A two-line text file is written to ``plain.txt`` and must be returned
    as the same list of lines. An HTML file written to ``page.html`` must
    make ``get_plaintext_document_body`` raise
    ``UnknownDocumentTypeError`` with ``'text/html'`` among its args.
    """
    expected_lines = [u"Some text\n", u"on multiple lines\n"]
    plain_file = tmpdir.join("plain.txt")
    plain_file.write("".join(expected_lines))
    assert expected_lines == get_plaintext_document_body(str(plain_file))

    # Set up the HTML fixture outside ``pytest.raises`` so that only the
    # call under test is allowed to raise the expected exception.
    html = "<html><body>Some page</body></html>"
    html_file = tmpdir.join("page.html")
    html_file.write(html)
    with pytest.raises(UnknownDocumentTypeError) as excinfo:
        get_plaintext_document_body(str(html_file))

    # Checked after the ``with`` block: any statement placed after the
    # raising call inside the block would never execute.
    assert 'text/html' in excinfo.value.args
def test_clean_pdf_before_run(tmp_path, pdf_files):
    """Check text extraction from a PDF copied into the temp directory.

    The eighth entry of the ``pdf_files`` fixture is copied byte-for-byte
    to ``packed.pdf`` under ``tmp_path`` and then passed to
    ``get_plaintext_document_body``; the extracted lines must be
    ``['Test\\n', '\\x0c']``.
    """
    tmp_file_path = tmp_path / "packed.pdf"
    # NOTE(review): index 7 is presumably the "packed" PDF sample, judging
    # by the target file name -- confirm against the ``pdf_files`` fixture.
    source_pdf = pdf_files[7]

    with open(source_pdf, 'rb') as pdf_in, open(tmp_file_path, 'wb') as pdf_out:
        pdf_out.write(pdf_in.read())

    # Extract only after both handles are closed so the copy is fully
    # flushed to disk before it is read back.
    text = get_plaintext_document_body(tmp_file_path.as_posix())

    # The second expected element is a lone form-feed character.
    assert text == ['Test\n', '\x0c']