Пример #1
0
 def extractTextFromStructuredDoc(self):
     """
     Run page-wise text extraction into the "text" subdirectory of outputDir.

     Logs the target directory, then builds a PdfToText for filePath and
     totalPages pointed at that directory and calls its extractPages().
     NOTE(review): creating the "text" directory is presumably handled by
     PdfToText -- not visible from this method.
     """
     textDir = os.path.join(self.outputDir, 'text')
     self.logger.info('Calling Pdftotext: Dumping text pages at %s', textDir)
     extractor = PdfToText(self.filePath, self.totalPages, textDir)
     extractor.extractPages()
Пример #2
0
 def testStructuredPdfAllPagewise(self):
     """
     extractPages() on tests/sample.pdf (5 pages) must leave 1.txt .. 5.txt
     in outdir.
     """
     extractor = PdfToText('tests/sample.pdf', 5, self.outdir)
     extractor.extractPages()
     # Checked in page order so the first missing file fails the test.
     for pageFile in ("1.txt", "2.txt", "3.txt", "4.txt", "5.txt"):
         expectedPath = os.path.join(self.outdir, pageFile)
         self.assertTrue(os.path.isfile(expectedPath))
Пример #3
0
 def testStructuredPdfAllPagewise(self):
     """
     Page-wise extraction of tests/sample.pdf: one text file per page
     (1.txt through 5.txt) is expected in outdir afterwards.
     """
     def pageFileExists(fileName):
         # True when the named file exists directly under outdir.
         return os.path.isfile(os.path.join(self.outdir, fileName))

     converter = PdfToText('tests/sample.pdf', 5, self.outdir)
     converter.extractPages()
     for fileName in ("1.txt", "2.txt", "3.txt", "4.txt", "5.txt"):
         self.assertTrue(pageFileExists(fileName))
Пример #4
0
 def processToCheckStructured(self):
     """
     Dump the whole PDF to text and record how much text it contained.

     Calls PdfToText.dumpPages(), adds the on-disk size of the file at
     dumpedTextFilepath to textContentSize, then logs the running size and
     the result of isStructured().
     """
     dumper = PdfToText(self.filePath, self.totalPages, self.outputDir)
     dumper.dumpPages()
     dumpedBytes = os.path.getsize(dumper.dumpedTextFilepath)
     self.textContentSize += dumpedBytes
     self.logger.info('Text content size: %d bytes', self.textContentSize)
     self.logger.info('Structured? %s', self.isStructured())
Пример #5
0
 def testStructuredPdfAllPagesDump(self):
     """
     dumpPages() on tests/sample.pdf must produce sample.txt in outdir.
     """
     dumper = PdfToText('tests/sample.pdf', 5, self.outdir)
     dumper.dumpPages()
     dumpedFile = os.path.join(self.outdir, "sample.txt")
     self.assertTrue(os.path.isfile(dumpedFile))
Пример #6
0
 def testScannedPdfPage(self):
     """
     extractPage(2) on tests/sample-scanned.pdf must produce 2.txt in outdir.
     """
     extractor = PdfToText('tests/sample-scanned.pdf', 5, self.outdir)
     extractor.extractPage(2)
     pageFile = os.path.join(self.outdir, "2.txt")
     self.assertTrue(os.path.isfile(pageFile))
Пример #7
0
 def testStructuredPdfPage(self):
     """
     extractPage(1) on tests/sample.pdf must produce 1.txt in outdir.
     """
     extractor = PdfToText('tests/sample.pdf', 5, self.outdir)
     extractor.extractPage(1)
     pageFile = os.path.join(self.outdir, "1.txt")
     wasWritten = os.path.isfile(pageFile)
     self.assertTrue(wasWritten)
Пример #8
0
 def testStructuredPdfAllPagesDump(self):
     """
     Dumping every page of tests/sample.pdf in one go is expected to leave
     a single sample.txt file in outdir.
     """
     converter = PdfToText('tests/sample.pdf', 5, self.outdir)
     converter.dumpPages()
     expectedDump = os.path.join(self.outdir, "sample.txt")
     dumpExists = os.path.isfile(expectedDump)
     self.assertTrue(dumpExists)
Пример #9
0
 def testScannedPdfPage(self):
     """
     Extracting only page 2 of tests/sample-scanned.pdf is expected to
     write 2.txt into outdir.
     """
     converter = PdfToText('tests/sample-scanned.pdf', 5, self.outdir)
     converter.extractPage(2)
     expectedPage = os.path.join(self.outdir, "2.txt")
     pageExists = os.path.isfile(expectedPage)
     self.assertTrue(pageExists)
Пример #10
0
 def testStructuredPdfPage(self):
     """
     Extracting only page 1 of tests/sample.pdf is expected to write 1.txt
     into outdir.
     """
     converter = PdfToText('tests/sample.pdf', 5, self.outdir)
     converter.extractPage(1)
     expectedPage = os.path.join(self.outdir, "1.txt")
     self.assertTrue(os.path.isfile(expectedPage))