def extractTextFromStructuredDoc(self):
    """
    Hand the document to PdfToText, targeting the "text" subdirectory
    of self.outputDir.

    The source file (self.filePath) and page count (self.totalPages) are
    passed through unchanged. Nothing in this method creates the
    directory itself; presumably PdfToText takes care of that (confirm
    against its implementation).
    """
    textDir = os.path.join(self.outputDir, 'text')
    self.logger.info('Calling Pdftotext: Dumping text pages at %s', textDir)
    PdfToText(self.filePath, self.totalPages, textDir).extractPages()
def testStructuredPdfAllPagewise(self):
    """
    Run PdfToText on tests/sample.pdf (5 passed as its second argument)
    and check that 1.txt through 5.txt exist in self.outdir afterwards.
    """
    # NOTE(review): another definition with this exact name is visible
    # nearby; if both belong to the same class, only the later one runs.
    extractor = PdfToText('tests/sample.pdf', 5, self.outdir)
    extractor.extractPages()

    # One output file is expected per page, named by page number.
    for pageFile in ("1.txt", "2.txt", "3.txt", "4.txt", "5.txt"):
        self.assertTrue(os.path.isfile(os.path.join(self.outdir, pageFile)))
def testStructuredPdfAllPagewise(self):
    """
    Extract tests/sample.pdf with PdfToText (second argument 5) into
    self.outdir, then assert that each of 1.txt .. 5.txt is a regular file.
    """
    # NOTE(review): a same-named definition is visible just before this
    # one; if they share a class, this one replaces it. Confirm intent.
    converter = PdfToText('tests/sample.pdf', 5, self.outdir)
    converter.extractPages()

    expectedFiles = ("1.txt", "2.txt", "3.txt", "4.txt", "5.txt")
    for fileName in expectedFiles:
        outputPath = os.path.join(self.outdir, fileName)
        self.assertTrue(os.path.isfile(outputPath))