def testScannedPdfPageForUnauthorisec(self):
    """Require a 401 ``HTTPError`` when OCR-ing with bogus credentials.

    Runs ``PdfSeparate`` on ``tests/sample-scanned-1.pdf`` with
    ``self.indir`` and asserts that ``1.pdf`` exists there. It then calls
    ``processPdfPage(1)`` using the credentials ``nouser``/``nopassword``
    and asserts the raised ``HTTPError`` has ``code == 401`` and
    ``reason == "Unauthorized"``.
    """
    pdfSeparate = PdfSeparate("tests/sample-scanned-1.pdf", self.indir)
    pdfSeparate.extractPages()
    self.assertTrue(os.path.isfile(os.path.join(self.indir, "1.pdf")))

    # assertRaises (instead of try/except) so the test FAILS when no
    # HTTPError is raised; with a plain except clause the assertions were
    # skipped entirely in that case and the test passed vacuously.
    # The guarded statements are the same three the try block covered.
    with self.assertRaises(HTTPError) as raised:
        abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 1, "english")
        abbyyPdf.setApplicationCredentials("nouser", "nopassword")
        abbyyPdf.processPdfPage(1)

    self.assertEqual(raised.exception.code, 401)
    self.assertEqual(raised.exception.reason, "Unauthorized")
def testScanned44PdfPageForNetwork(self):
    """OCR the 44-page scanned sample and check the first and last outputs.

    Runs ``PdfSeparate`` on ``tests/sample-scanned-44pages.pdf`` with
    ``self.indir`` and asserts that ``1.pdf`` exists there. It then calls
    ``extractPages()`` on an ``AbbyyPdfTextExtractor`` configured with the
    ``appid``/``password`` options of the ``abbyy`` config section, and
    asserts that ``1.txt`` and ``44.txt`` exist in ``self.outdir``.

    Any exception raised while configuring or calling the extractor skips
    the test rather than failing it.
    """
    pdfSeparate = PdfSeparate('tests/sample-scanned-44pages.pdf', self.indir)
    pdfSeparate.extractPages()
    self.assertTrue(os.path.isfile(os.path.join(self.indir, "1.pdf")))

    try:
        abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 44, "english")
        abbyyPdf.setApplicationCredentials(
            self.configParser.get('abbyy', 'appid'),
            self.configParser.get('abbyy', 'password'),
        )
        abbyyPdf.extractPages()
    except Exception as exc:
        # Stay best-effort about the external service, but report it as a
        # skip instead of silently passing.
        self.skipTest("Abbyy OCR call failed: %r" % (exc,))

    # Outside the try block on purpose: a blanket `except Exception` also
    # catches AssertionError, which made these checks impossible to fail.
    self.assertTrue(os.path.isfile(os.path.join(self.outdir, "1.txt")))
    self.assertTrue(os.path.isfile(os.path.join(self.outdir, "44.txt")))
def testScannedPdfPage(self):
    """OCR page 1 of the one-page scanned sample and check its text output.

    Runs ``PdfSeparate`` on ``tests/sample-scanned-1.pdf`` with
    ``self.indir`` and asserts that ``1.pdf`` exists there. It then calls
    ``processPdfPage(1)`` on an ``AbbyyPdfTextExtractor`` configured with
    the ``appid``/``password`` options of the ``abbyy`` config section, and
    asserts that ``1.txt`` exists in ``self.outdir``.

    Any exception raised while configuring or calling the extractor skips
    the test rather than failing it.
    """
    pdfSeparate = PdfSeparate("tests/sample-scanned-1.pdf", self.indir)
    pdfSeparate.extractPages()
    self.assertTrue(os.path.isfile(os.path.join(self.indir, "1.pdf")))

    try:
        abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 1, "english")
        abbyyPdf.setApplicationCredentials(
            self.configParser.get("abbyy", "appid"),
            self.configParser.get("abbyy", "password"),
        )
        abbyyPdf.processPdfPage(1)
    except Exception as exc:
        # Stay best-effort about the external service, but report it as a
        # skip instead of silently passing.
        self.skipTest("Abbyy OCR call failed: %r" % (exc,))

    # Outside the try block on purpose: a blanket `except Exception` also
    # catches AssertionError, which made this check impossible to fail.
    self.assertTrue(os.path.isfile(os.path.join(self.outdir, "1.txt")))
def extractTextFromScannedDoc(self):
    """Run the Abbyy extractor over this document's page files.

    Logs the page count and the ``text`` directory under ``self.outputDir``,
    builds an ``AbbyyPdfTextExtractor`` from the ``pages`` and ``text``
    directories, ``self.totalPages`` and ``self.language``, sets the
    ``appid``/``password`` options read from the ``abbyy`` section of
    ``self.configParser`` as its credentials, and calls ``extractPages()``.
    """
    textDir = os.path.join(self.outputDir, 'text')
    self.logger.info('Calling Abbyy: OCR-ing %d pages at %s', self.totalPages, textDir)

    pagesDir = os.path.join(self.outputDir, 'pages')
    extractor = AbbyyPdfTextExtractor(pagesDir, textDir, self.totalPages, self.language)

    appId = self.configParser.get('abbyy', 'appid')
    appPassword = self.configParser.get('abbyy', 'password')
    extractor.setApplicationCredentials(appId, appPassword)

    extractor.extractPages()
def testScannedPdfPageForUnauthorisec(self):
    """Require a 401 ``HTTPError`` when OCR-ing with bogus credentials.

    Runs ``PdfSeparate`` on ``tests/sample-scanned-1.pdf`` with
    ``self.indir`` and asserts that ``1.pdf`` exists there. It then calls
    ``processPdfPage(1)`` using the credentials ``nouser``/``nopassword``
    and asserts the raised ``HTTPError`` has ``code == 401`` and
    ``reason == "Unauthorized"``.
    """
    pdfSeparate = PdfSeparate('tests/sample-scanned-1.pdf', self.indir)
    pdfSeparate.extractPages()
    self.assertTrue(os.path.isfile(os.path.join(self.indir, "1.pdf")))

    # assertRaises (instead of try/except) so the test FAILS when no
    # HTTPError is raised; with a plain except clause the assertions were
    # skipped entirely in that case and the test passed vacuously.
    # The guarded statements are the same three the try block covered.
    with self.assertRaises(HTTPError) as raised:
        abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 1, "english")
        abbyyPdf.setApplicationCredentials('nouser', 'nopassword')
        abbyyPdf.processPdfPage(1)

    self.assertEqual(raised.exception.code, 401)
    self.assertEqual(raised.exception.reason, "Unauthorized")
def testScannedPdfPage(self):
    """OCR page 1 of the one-page scanned sample and check its text output.

    Runs ``PdfSeparate`` on ``tests/sample-scanned-1.pdf`` with
    ``self.indir`` and asserts that ``1.pdf`` exists there. It then calls
    ``processPdfPage(1)`` on an ``AbbyyPdfTextExtractor`` configured with
    the ``appid``/``password`` options of the ``abbyy`` config section, and
    asserts that ``1.txt`` exists in ``self.outdir``.

    Any exception raised while configuring or calling the extractor skips
    the test rather than failing it.
    """
    pdfSeparate = PdfSeparate('tests/sample-scanned-1.pdf', self.indir)
    pdfSeparate.extractPages()
    self.assertTrue(os.path.isfile(os.path.join(self.indir, "1.pdf")))

    try:
        abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 1, "english")
        abbyyPdf.setApplicationCredentials(
            self.configParser.get('abbyy', 'appid'),
            self.configParser.get('abbyy', 'password'),
        )
        abbyyPdf.processPdfPage(1)
    except Exception as exc:
        # Stay best-effort about the external service, but report it as a
        # skip instead of silently passing.
        self.skipTest("Abbyy OCR call failed: %r" % (exc,))

    # Outside the try block on purpose: a blanket `except Exception` also
    # catches AssertionError, which made this check impossible to fail.
    self.assertTrue(os.path.isfile(os.path.join(self.outdir, "1.txt")))