Пример #1
0
 def extractTextFromScannedDoc(self):
     """
     Run the Abbyy text extractor over the pages of a scanned document.

     Logs the page count and the 'text' sub-directory of self.outputDir,
     builds an AbbyyPdfTextExtractor from the 'pages' and 'text'
     sub-directories of self.outputDir together with self.totalPages and
     self.language, hands it the 'appid' / 'password' values from the
     'abbyy' section of self.configParser, and calls extractPages() on it.
     """
     textDir = os.path.join(self.outputDir, 'text')
     self.logger.info('Calling Abbyy: OCR-ing %d pages at %s', self.totalPages, textDir)

     pagesDir = os.path.join(self.outputDir, 'pages')
     extractor = AbbyyPdfTextExtractor(pagesDir, textDir, self.totalPages, self.language)

     # Credentials come from the [abbyy] section of the configuration.
     appId = self.configParser.get('abbyy', 'appid')
     appPassword = self.configParser.get('abbyy', 'password')
     extractor.setApplicationCredentials(appId, appPassword)

     extractor.extractPages()
    def testScanned44PdfPageForNetwork(self):
        """
        Split the 44-page scanned sample and run the Abbyy extractor on it.

        The split step is asserted unconditionally. Setting up and running
        the extractor needs the 'abbyy' credentials from self.configParser
        (and, judging by the test name, presumably network access), so any
        error raised there skips the test with the reason attached instead
        of being silently discarded. The output assertions run outside the
        ``try`` block so that a missing text file actually fails the test.
        """
        pdfSeparate = PdfSeparate('tests/sample-scanned-44pages.pdf', self.indir)
        pdfSeparate.extractPages()
        self.assertTrue(os.path.isfile(os.path.join(self.indir, "1.pdf")))

        try:
            abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 44, "english")
            abbyyPdf.setApplicationCredentials(
                self.configParser.get('abbyy', 'appid'),
                self.configParser.get('abbyy', 'password'))
            abbyyPdf.extractPages()
        except Exception as exc:
            # Keep the best-effort nature of the test (no hard failure when
            # the extractor cannot run) but report it as skipped rather than
            # as a pass.
            self.skipTest("Abbyy extraction could not be run: %s" % exc)

        # Deliberately outside the try: AssertionError is an Exception, so
        # asserting inside it would make these checks impossible to fail.
        self.assertTrue(os.path.isfile(os.path.join(self.outdir, "1.txt")))
        self.assertTrue(os.path.isfile(os.path.join(self.outdir, "44.txt")))
    def testScannedPdfPages(self):
        """
        Split the two-page scanned sample and run the Abbyy extractor on it.

        The split step is asserted unconditionally. Any error raised while
        configuring or running the extractor skips the test with the reason
        attached instead of being silently discarded. The output assertions
        run outside the ``try`` block so that a missing text file actually
        fails the test.
        """
        pdfSeparate = PdfSeparate("tests/sample-scanned.pdf", self.indir)
        pdfSeparate.extractPages()
        self.assertTrue(os.path.isfile(os.path.join(self.indir, "1.pdf")))
        self.assertTrue(os.path.isfile(os.path.join(self.indir, "2.pdf")))

        try:
            abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 2, "english")
            abbyyPdf.setApplicationCredentials(
                self.configParser.get("abbyy", "appid"), self.configParser.get("abbyy", "password")
            )
            abbyyPdf.extractPages()
        except Exception as exc:
            # Keep the best-effort nature of the test (no hard failure when
            # the extractor cannot run) but report it as skipped rather than
            # as a pass.
            self.skipTest("Abbyy extraction could not be run: %s" % exc)

        # Deliberately outside the try: AssertionError is an Exception, so
        # asserting inside it would make these checks impossible to fail.
        self.assertTrue(os.path.isfile(os.path.join(self.outdir, "1.txt")))
        self.assertTrue(os.path.isfile(os.path.join(self.outdir, "2.txt")))
    def testScannedPdfPages(self):
        """
        Split the two-page scanned sample and run the Abbyy extractor on it.

        The split step is asserted unconditionally. Any error raised while
        configuring or running the extractor skips the test with the reason
        attached instead of being silently discarded. The output assertions
        run outside the ``try`` block so that a missing text file actually
        fails the test.
        """
        pdfSeparate = PdfSeparate('tests/sample-scanned.pdf', self.indir)
        pdfSeparate.extractPages()
        self.assertTrue(os.path.isfile(os.path.join(self.indir, "1.pdf")))
        self.assertTrue(os.path.isfile(os.path.join(self.indir, "2.pdf")))

        try:
            abbyyPdf = AbbyyPdfTextExtractor(self.indir, self.outdir, 2,
                                             "english")
            abbyyPdf.setApplicationCredentials(
                self.configParser.get('abbyy', 'appid'),
                self.configParser.get('abbyy', 'password'))
            abbyyPdf.extractPages()
        except Exception as exc:
            # Keep the best-effort nature of the test (no hard failure when
            # the extractor cannot run) but report it as skipped rather than
            # as a pass.
            self.skipTest("Abbyy extraction could not be run: %s" % exc)

        # Deliberately outside the try: AssertionError is an Exception, so
        # asserting inside it would make these checks impossible to fail.
        self.assertTrue(os.path.isfile(os.path.join(self.outdir, "1.txt")))
        self.assertTrue(os.path.isfile(os.path.join(self.outdir, "2.txt")))