def _get_pdf_content(self, document_str):
    """
    Extract the text of a PDF document and break it into words.

    :param document_str: The PDF document, handed unchanged to pdf_to_text
    :return: A list containing the words in the PDF
    """
    # split() without arguments separates on any run of whitespace, so the
    # result never contains empty strings.
    return pdf_to_text(document_str).split()
def test_pdf_to_text_no_pdf(self):
    # Input that is not a PDF document is expected to produce an empty
    # string from pdf_to_text.
    self.assertEqual('', pdf_to_text('hello world'))
def test_pdf_to_text(self):
    # Read the sample PDF as raw bytes: PDF is a binary format, and text
    # mode may translate line endings on some platforms. open() replaces
    # the Python-2-only file() builtin, and the with-block guarantees the
    # handle is closed even when an assertion below fails.
    with open(self.SIMPLE_SAMPLE, 'rb') as sample_pdf:
        text = pdf_to_text(sample_pdf.read())

    # The extracted text is expected to contain both of these words.
    self.assertIn('Hello', text)
    self.assertIn('World', text)