예제 #1
0
    def __init__(self, TikaCallTimeoutSeconds):
        """Create the OCR proxy, bind the Java classes and build the Tika parser.

        TikaCallTimeoutSeconds is accepted for the caller's benefit but is not
        read anywhere in this constructor.
        """
        self.ocrProxy = OCRProxy()

        # (attribute name, fully qualified Java class) pairs, bound in this order.
        javaBindings = (
            ('ByteArrayInputStream', 'java.io.ByteArrayInputStream'),
            ('Metadata', 'org.apache.tika.metadata.Metadata'),
            ('AutoDetectParser', 'org.apache.tika.parser.AutoDetectParser'),
            ('BodyContentHandler', 'org.apache.tika.sax.BodyContentHandler'),
            ('TikaConfig', 'org.apache.tika.config.TikaConfig'),
        )
        for attributeName, javaClassName in javaBindings:
            setattr(self, attributeName, autoclass(javaClassName))

        # The parser is built from the configuration loaded from this fixed path.
        tikaConfig = self.TikaConfig('/tika-config.xml')
        self.config = tikaConfig
        self.parser = self.AutoDetectParser(tikaConfig)
예제 #2
0
 def __init__(self, OcrSymbolsPerPageThreshold, OcrMaxPageCount, ParserCallTimeoutSeconds):
     """Store the OCR/timeout settings and bind the Java classes used for PDF work.

     The three arguments are kept verbatim on the instance; nothing in this
     constructor interprets them.
     """
     self.ocrProxy = OCRProxy()

     # Plain configuration values supplied by the caller.
     self.parserCallTimeoutSeconds = ParserCallTimeoutSeconds
     self.ocrSymbolsPerPageThreshold = OcrSymbolsPerPageThreshold
     self.ocrMaxPageCount = OcrMaxPageCount

     # (attribute name, fully qualified Java class) pairs, bound in this order.
     javaBindings = (
         ('ByteArrayInputStream', 'java.io.ByteArrayInputStream'),
         ('ByteArrayOutputStream', 'java.io.ByteArrayOutputStream'),
         ('PDDocument', 'org.apache.pdfbox.pdmodel.PDDocument'),
         ('PDPage', 'org.apache.pdfbox.pdmodel.PDPage'),
         ('PDAnnotation', 'org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation'),
         ('PDDocumentInformation', 'org.apache.pdfbox.pdmodel.PDDocumentInformation'),
         ('PDFRenderer', 'org.apache.pdfbox.rendering.PDFRenderer'),
         ('PDFTextStripper', 'org.apache.pdfbox.text.PDFTextStripper'),
         ('ImageType', 'org.apache.pdfbox.rendering.ImageType'),
         ('BufferedImage', 'java.awt.image.BufferedImage'),
         ('ImageIO', 'javax.imageio.ImageIO'),
         ('MemoryCacheImageOutputStream', 'javax.imageio.stream.MemoryCacheImageOutputStream'),
         ('System', 'java.lang.System'),
     )
     for attributeName, javaClassName in javaBindings:
         setattr(self, attributeName, autoclass(javaClassName))

     # JVM-wide system property set once the System class is bound.
     self.System.setProperty('org.apache.pdfbox.rendering.UsePureJavaCMYKConversion', 'true')
예제 #3
0
파일: pdfparser.py 프로젝트: zmijata/ambar
class PDFParser:
    """Extract text, document info and a first-page thumbnail from PDF bytes.

    All PDF and image work is delegated to Java classes (PDFBox, ImageIO)
    bound with ``autoclass``. Pages that yield little extractable text are
    additionally rendered to an image and sent to ``OCRProxy``.
    """

    # Resolutions handed to PDFRenderer.renderImageWithDPI.
    _THUMBNAIL_DPI = 75
    _OCR_DPI = 200

    def __init__(self, Logger, OcrSymbolsPerPageThreshold, OcrMaxPageCount, ParserCallTimeoutSeconds):
        """Store settings and bind the Java classes used by the parser.

        Args:
            Logger: object exposing ``LogMessage(level, message)``.
            OcrSymbolsPerPageThreshold: a page is OCR'ed when its letter count
                is below this value; ``-1`` means every page qualifies.
            OcrMaxPageCount: only pages with a zero-based index below this
                value are OCR'ed; ``-1`` removes the limit.
            ParserCallTimeoutSeconds: stored on the instance; not read by any
                method of this class.
        """
        self.logger = Logger
        self.ocrProxy = OCRProxy()
        self.parserCallTimeoutSeconds = ParserCallTimeoutSeconds
        self.ocrSymbolsPerPageThreshold = OcrSymbolsPerPageThreshold
        self.ocrMaxPageCount = OcrMaxPageCount
        self.ByteArrayInputStream = autoclass('java.io.ByteArrayInputStream')
        self.ByteArrayOutputStream = autoclass('java.io.ByteArrayOutputStream')
        self.PDDocument = autoclass('org.apache.pdfbox.pdmodel.PDDocument')
        self.PDPage = autoclass('org.apache.pdfbox.pdmodel.PDPage')
        self.PDAnnotation = autoclass('org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation')
        self.PDDocumentInformation = autoclass('org.apache.pdfbox.pdmodel.PDDocumentInformation')
        self.PDFRenderer = autoclass('org.apache.pdfbox.rendering.PDFRenderer')
        self.PDFTextStripper = autoclass('org.apache.pdfbox.text.PDFTextStripper')
        self.ImageType = autoclass('org.apache.pdfbox.rendering.ImageType')
        self.BufferedImage = autoclass('java.awt.image.BufferedImage')
        self.ImageIO = autoclass('javax.imageio.ImageIO')
        self.MemoryCacheImageOutputStream = autoclass('javax.imageio.stream.MemoryCacheImageOutputStream')
        self.System = autoclass('java.lang.System')
        self.System.setProperty('org.apache.pdfbox.rendering.UsePureJavaCMYKConversion', 'true')

    def Parse(self, FileName, FileData):
        """Parse a PDF held in memory.

        Args:
            FileName: name used only in log messages.
            FileData: raw PDF bytes.

        Returns:
            A ``FileParserResponse`` with ``meta``, ``thumbnail``, ``text`` and
            ``ocrPerformed`` filled in. ``success`` is True when parsing
            finished; on any exception ``success`` is False and ``message``
            holds the exception text. This method does not raise for parsing
            errors.
        """
        resp = FileParserResponse()
        document = None

        try:
            inputStream = self.ByteArrayInputStream(FileData)
            document = self.PDDocument.load(inputStream)
            metadata = document.getDocumentInformation()
            resp.meta['Author'] = metadata.getAuthor()
            resp.meta['title'] = metadata.getTitle()
            resp.meta['Content-Type'] = 'application/pdf'
            # len() is the payload size; sys.getsizeof() would also count the
            # Python object header.
            resp.meta['Content-Length'] = len(FileData)

            pageCount = document.getNumberOfPages()
            if pageCount == 0:
                resp.success = True
                return resp

            ## generating thumbnail
            resp.thumbnail = self.GenerateThumbnail(document)

            ## parsing text
            pdfStripper = self.PDFTextStripper()

            for pageNumber in range(pageCount):
                parsedText = self._ExtractPageText(pdfStripper, document, pageNumber)

                if self._ShouldOcrPage(pageNumber, parsedText):
                    self.logger.LogMessage('info','performing ocr on page {0} of pdf {1}'.format(pageNumber + 1, FileName))
                    ocrResp = self.PerformOCROnPage(document, pageNumber)

                    if ocrResp.success:
                        parsedText = '{0}\r\n{1}'.format(parsedText, ocrResp.text)
                        resp.ocrPerformed = True
                    else:
                        self.logger.LogMessage('info','could not perform ocr on page {0} of pdf {1} {2}'.format(pageNumber + 1, FileName, ocrResp.message))

                ## parsing annotations
                annotationsText = self._ExtractAnnotationsText(document, pageNumber, FileName)
                if annotationsText:
                    parsedText = '{0}\r\n----Annotations start----\r\n{1}----Annotations end----'.format(parsedText, annotationsText)

                parsedText = self.NormalizeText(parsedText)
                resp.text = '{0}\r\n{1}'.format(resp.text, parsedText)

            resp.success = True
        except Exception as ex:
            resp.message = str(ex)
            resp.success = False
        finally:
            # Runs on the early return and on errors too, so the PDFBox
            # document is always released.
            self._CloseDocument(document, FileName)
            self.System.gc()

        return resp

    def _ExtractPageText(self, pdfStripper, document, pageNumber):
        """Return the stripped text of one page (pageNumber is zero-based)."""
        # PDFTextStripper page numbers are one-based.
        pdfStripper.setStartPage(pageNumber + 1)
        pdfStripper.setEndPage(pageNumber + 1)

        try:
            return pdfStripper.getText(document)
        except Exception as convEx:
            # Only exceptions that carry the undecodable payload in `.object`
            # (e.g. UnicodeDecodeError) can be recovered; anything else is
            # re-raised unchanged so the real error reaches the caller.
            rawData = getattr(convEx, 'object', None)
            if rawData is None:
                raise
            return BinaryStringParser.Parse(rawData)

    def _ShouldOcrPage(self, pageNumber, parsedText):
        """Tell whether a page is within the OCR page limit and below the text threshold."""
        withinPageLimit = (pageNumber < self.ocrMaxPageCount) or (self.ocrMaxPageCount == -1)
        if not withinPageLimit:
            return False
        return (self.GetSymbolsCount(parsedText) < self.ocrSymbolsPerPageThreshold) or (self.ocrSymbolsPerPageThreshold == -1)

    def _ExtractAnnotationsText(self, document, pageNumber, FileName):
        """Return the page's non-empty annotation contents, or '' if none or on error.

        Entries are separated by a '----' line and the result ends with CRLF.
        """
        try:
            pdfAnnotations = document.getPage(pageNumber).getAnnotations()
            contents = []
            for annotationNumber in range(pdfAnnotations.size()):
                annotationContents = pdfAnnotations.get(annotationNumber).getContents()
                if annotationContents:
                    contents.append(str(annotationContents))
        except Exception:
            self.logger.LogMessage('info','could not extract annotations from page {0} of pdf {1}'.format(pageNumber + 1, FileName))
            return ''

        if not contents:
            return ''
        return '\r\n----\r\n'.join(contents) + '\r\n'

    def _CloseDocument(self, document, FileName):
        """Close the PDFBox document if one was opened; log instead of raising."""
        if document is None:
            return
        try:
            document.close()
        except Exception as ex:
            self.logger.LogMessage('info','could not close pdf {0} {1}'.format(FileName, str(ex)))

    def _RenderPageAsJpeg(self, document, pageNumber, dpi):
        """Render one page (zero-based) as RGB at `dpi` and return the JPEG bytes."""
        pdfRenderer = self.PDFRenderer(document)
        bufferedImage = pdfRenderer.renderImageWithDPI(pageNumber, dpi, self.ImageType.RGB)
        byteStream = self.ByteArrayOutputStream()
        imageStream = self.MemoryCacheImageOutputStream(byteStream)
        try:
            self.ImageIO.write(bufferedImage, "jpg", imageStream)
        finally:
            # ImageIO.write leaves the stream open; close() flushes any data
            # still cached in it into byteStream.
            imageStream.close()
        return bytearray(byteStream.toByteArray())

    def GenerateThumbnail(self, document):
        """Render the first page as a JPEG thumbnail.

        Returns:
            ``(bytearray, 'image/jpeg')`` on success, ``None`` if rendering
            failed (the failure is logged).
        """
        try:
            imageData = self._RenderPageAsJpeg(document, 0, self._THUMBNAIL_DPI)
            self.System.gc()

            return (imageData, 'image/jpeg')
        except Exception as ex:
            self.logger.LogMessage('info','unable to generate thumbnail for pdf {0}'.format(str(ex)))
            return None

    def PerformOCROnPage(self, document, pageNumber):
        """Render a page (zero-based) and run it through the OCR proxy.

        Returns:
            The proxy's response, or an ``OCRProxyResponse`` with
            ``success = False`` and ``message`` set if rendering or the OCR
            call raised.
        """
        ocrResp = OCRProxyResponse()

        try:
            imageData = self._RenderPageAsJpeg(document, pageNumber, self._OCR_DPI)
            ocrResp = self.ocrProxy.PerformOCR(imageData)
            self.System.gc()
        except Exception as ex:
            ocrResp.success = False
            ocrResp.message = str(ex)

        return ocrResp

    def NormalizeText(self, Text):
        """Collapse runs of two or more (possibly whitespace-only) line ends into one CRLF."""
        return re.sub(r'([\s]*[\r]*\n){2,}', '\r\n', Text)

    def GetSymbolsCount(self, Text):
        """Count the Latin (a-z) and Cyrillic letters in Text, ignoring case."""
        lettersOnly = re.sub(r'[^a-zа-яёй]+', '', Text, flags=re.I)
        return len(lettersOnly)
예제 #4
0
class TikaParser:
    """Extract text and metadata from arbitrary file bytes with Apache Tika.

    Tika classes are bound with ``autoclass``. Files whose detected content
    type is an image are additionally sent to ``OCRProxy`` and get a JPEG
    thumbnail.
    """

    def __init__(self, TikaCallTimeoutSeconds):
        """Bind the Java classes and build the parser from '/tika-config.xml'.

        Args:
            TikaCallTimeoutSeconds: stored on the instance; not read by any
                method of this class.
        """
        self.ocrProxy = OCRProxy()
        # Kept so the value is not silently discarded.
        self.tikaCallTimeoutSeconds = TikaCallTimeoutSeconds
        self.ByteArrayInputStream = autoclass('java.io.ByteArrayInputStream')
        self.Metadata = autoclass('org.apache.tika.metadata.Metadata')
        self.AutoDetectParser = autoclass('org.apache.tika.parser.AutoDetectParser')
        self.BodyContentHandler = autoclass('org.apache.tika.sax.BodyContentHandler')
        self.TikaConfig = autoclass('org.apache.tika.config.TikaConfig')

        self.config = self.TikaConfig('/tika-config.xml')
        self.parser = self.AutoDetectParser(self.config)

    def Parse(self, FileName, FileData):
        """Parse a file held in memory.

        Args:
            FileName: passed to Tika as the resource name when non-empty, and
                used in log output.
            FileData: raw file bytes.

        Returns:
            A ``FileParserResponse`` with ``text`` and ``meta`` filled in, plus
            ``thumbnail`` and ``ocrPerformed`` for images. ``success`` is True
            when parsing finished; on any exception ``success`` is False and
            ``message`` holds the exception text.
        """
        resp = FileParserResponse()

        try:
            meta = self.Metadata()
            if FileName:
                meta.set(self.Metadata.RESOURCE_NAME_KEY, FileName)
            # -1 is handed to BodyContentHandler as its write limit argument.
            contentHandler = self.BodyContentHandler(-1)
            inputStream = self.ByteArrayInputStream(FileData)
            self.parser.parse(inputStream, contentHandler, meta)

            try:
                resp.text = contentHandler.toString()
            except Exception as convEx:
                # Only exceptions that carry the undecodable payload in
                # `.object` (e.g. UnicodeDecodeError) can be recovered;
                # anything else is re-raised so the real error is reported.
                rawData = getattr(convEx, 'object', None)
                if rawData is None:
                    raise
                resp.text = BinaryStringParser.Parse(rawData)

            for name in meta.names():
                try:
                    resp.meta[name] = meta.get(name)
                except Exception:
                    # A value that cannot be read is stored as an empty string.
                    resp.meta[name] = ''

            inputStream = None
            contentHandler = None

            if 'Content-Type' in resp.meta and ContentTypeAnalyzer.IsImageByContentType(resp.meta['Content-Type']):
                print('info','performing ocr on {0}'.format(FileName))
                ocrResp = self.ocrProxy.PerformOCR(FileData)

                if ocrResp.success:
                    resp.text = self.NormalizeText('{0}{1}'.format(resp.text, ocrResp.text))
                    resp.ocrPerformed = True
                else:
                    # This call used an undefined name, so a failed OCR raised
                    # NameError and the whole parse was reported as failed.
                    print('info','could not perform ocr on {0} {1}'.format(FileName, ocrResp.message))

                resp.thumbnail = self.GenerateThumbnail(FileData)

            resp.success = True
        except Exception as ex:
            resp.success = False
            resp.message = str(ex)

        return resp

    def GenerateThumbnail(self, ImageData, MaxWidth = 1000, MaxHeigh = 5000, Quality = 70, Dpi = 50):
        """Build a JPEG thumbnail from encoded image bytes.

        Args:
            ImageData: encoded image bytes.
            MaxWidth: maximum thumbnail width passed to ``Image.thumbnail``.
            MaxHeigh: maximum thumbnail height passed to ``Image.thumbnail``.
            Quality: JPEG quality used when saving.
            Dpi: accepted for interface compatibility; not used.

        Returns:
            ``(bytes, 'image/jpeg')`` on success; ``None`` for images whose
            ``compression`` info is ``'tiff_jpeg'`` or when anything fails.
        """
        try:
            image = Image.open(io.BytesIO(ImageData))

            if image.info.get('compression') == 'tiff_jpeg':
                return None

            image.thumbnail((MaxWidth, MaxHeigh))
            bytesIO = io.BytesIO()
            image.convert('RGB').save(bytesIO, format='JPEG', quality=Quality)
            return (bytesIO.getvalue(), 'image/jpeg')
        except Exception as ex:
            # Best effort: a missing thumbnail must not fail the parse.
            print('info','unable to generate thumbnail {0}'.format(str(ex)))
            return None

    def NormalizeText(self, Text):
        """Collapse runs of two or more (possibly whitespace-only) line ends into one CRLF."""
        return re.sub(r'([\s]*[\r]*\n){2,}', '\r\n', Text)