def extractTextFromStructuredDoc(self):
    """
    Extract the document's pages into the "text" sub-directory of the
    output directory, logging the destination first.
    """
    textDir = os.path.join(self.outputDir, 'text')
    self.logger.info('Calling Pdftotext: Dumping text pages at %s', textDir)
    # Extraction itself is delegated to PdfToText for the whole page count.
    extractor = PdfToText(self.filePath, self.totalPages, textDir)
    extractor.extractPages()
def testStructuredPdfAllPagewise(self):
    # Extracting all pages of the structured sample must leave one text
    # file per page (1.txt .. 5.txt) in the output directory.
    extractor = PdfToText('tests/sample.pdf', 5, self.outdir)
    extractor.extractPages()
    for pageFile in ("1.txt", "2.txt", "3.txt", "4.txt", "5.txt"):
        expectedPath = os.path.join(self.outdir, pageFile)
        self.assertTrue(os.path.isfile(expectedPath))
def testStructuredPdfAllPagewise(self):
    # After a page-wise extraction of the 5-page structured sample, every
    # per-page text file is expected to exist in the output directory.
    converter = PdfToText('tests/sample.pdf', 5, self.outdir)
    converter.extractPages()
    expectedFiles = ("1.txt", "2.txt", "3.txt", "4.txt", "5.txt")
    for fileName in expectedFiles:
        self.assertTrue(os.path.isfile(os.path.join(self.outdir, fileName)))
def processToCheckStructured(self):
    """
    Dump the entire pdf to text and add the size of the dumped file to
    the accumulated text content size, then log the size and the
    structured check result.
    """
    dumper = PdfToText(self.filePath, self.totalPages, self.outputDir)
    dumper.dumpPages()
    # The dump's on-disk size is what feeds the structured check.
    dumpedPath = dumper.dumpedTextFilepath
    self.textContentSize += os.path.getsize(dumpedPath)
    self.logger.info('Text content size: %d bytes', self.textContentSize)
    structured = self.isStructured()
    self.logger.info('Structured? %s', structured)
def testStructuredPdfAllPagesDump(self):
    # Dumping the structured sample should leave "sample.txt" in the
    # output directory.
    converter = PdfToText('tests/sample.pdf', 5, self.outdir)
    converter.dumpPages()
    dumpedFile = os.path.join(self.outdir, "sample.txt")
    self.assertTrue(os.path.isfile(dumpedFile))
def testScannedPdfPage(self):
    # Extracting page 2 of the scanned sample should produce "2.txt" in
    # the output directory.
    converter = PdfToText('tests/sample-scanned.pdf', 5, self.outdir)
    converter.extractPage(2)
    pageFile = os.path.join(self.outdir, "2.txt")
    self.assertTrue(os.path.isfile(pageFile))
def testStructuredPdfPage(self):
    # Extracting page 1 of the structured sample should produce "1.txt"
    # in the output directory.
    converter = PdfToText('tests/sample.pdf', 5, self.outdir)
    converter.extractPage(1)
    pageFile = os.path.join(self.outdir, "1.txt")
    self.assertTrue(os.path.isfile(pageFile))
def testStructuredPdfAllPagesDump(self):
    # A full dump of the structured sample is expected to yield a single
    # "sample.txt" file in the output directory.
    dumper = PdfToText('tests/sample.pdf', 5, self.outdir)
    dumper.dumpPages()
    expectedPath = os.path.join(self.outdir, "sample.txt")
    dumpExists = os.path.isfile(expectedPath)
    self.assertTrue(dumpExists)
def testScannedPdfPage(self):
    # Single-page extraction (page 2) on the scanned sample is expected to
    # yield "2.txt" in the output directory.
    extractor = PdfToText('tests/sample-scanned.pdf', 5, self.outdir)
    extractor.extractPage(2)
    expectedPath = os.path.join(self.outdir, "2.txt")
    pageExists = os.path.isfile(expectedPath)
    self.assertTrue(pageExists)
def testStructuredPdfPage(self):
    # Single-page extraction (page 1) on the structured sample is expected
    # to yield "1.txt" in the output directory.
    extractor = PdfToText('tests/sample.pdf', 5, self.outdir)
    extractor.extractPage(1)
    expectedPath = os.path.join(self.outdir, "1.txt")
    pageExists = os.path.isfile(expectedPath)
    self.assertTrue(pageExists)