def __init__(self, TikaCallTimeoutSeconds):
    """Create the OCR proxy and a Tika auto-detect parser.

    Each Java class is resolved through ``autoclass`` and stored on the
    instance under its simple class name. The parser is configured from
    ``/tika-config.xml``.

    Args:
        TikaCallTimeoutSeconds: accepted but not read by this initializer.
    """
    self.ocrProxy = OCRProxy()

    # Attribute name == simple class name, so it is derived from the
    # fully qualified name; resolution order matches the listing order.
    qualifiedClassNames = (
        'java.io.ByteArrayInputStream',
        'org.apache.tika.metadata.Metadata',
        'org.apache.tika.parser.AutoDetectParser',
        'org.apache.tika.sax.BodyContentHandler',
        'org.apache.tika.config.TikaConfig',
    )
    for qualifiedName in qualifiedClassNames:
        setattr(self, qualifiedName.rsplit('.', 1)[-1], autoclass(qualifiedName))

    self.config = self.TikaConfig('/tika-config.xml')
    self.parser = self.AutoDetectParser(self.config)
def __init__(self, OcrSymbolsPerPageThreshold, OcrMaxPageCount, ParserCallTimeoutSeconds):
    """Store the OCR settings and resolve the Java classes used for PDF work.

    Each Java class is resolved through ``autoclass`` and stored on the
    instance under its simple class name.

    Args:
        OcrSymbolsPerPageThreshold: stored as ``ocrSymbolsPerPageThreshold``.
        OcrMaxPageCount: stored as ``ocrMaxPageCount``.
        ParserCallTimeoutSeconds: stored as ``parserCallTimeoutSeconds``.
    """
    self.ocrProxy = OCRProxy()
    self.parserCallTimeoutSeconds = ParserCallTimeoutSeconds
    self.ocrSymbolsPerPageThreshold = OcrSymbolsPerPageThreshold
    self.ocrMaxPageCount = OcrMaxPageCount

    # Attribute name == simple class name, so it is derived from the
    # fully qualified name; resolution order matches the listing order.
    qualifiedClassNames = (
        'java.io.ByteArrayInputStream',
        'java.io.ByteArrayOutputStream',
        'org.apache.pdfbox.pdmodel.PDDocument',
        'org.apache.pdfbox.pdmodel.PDPage',
        'org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation',
        'org.apache.pdfbox.pdmodel.PDDocumentInformation',
        'org.apache.pdfbox.rendering.PDFRenderer',
        'org.apache.pdfbox.text.PDFTextStripper',
        'org.apache.pdfbox.rendering.ImageType',
        'java.awt.image.BufferedImage',
        'javax.imageio.ImageIO',
        'javax.imageio.stream.MemoryCacheImageOutputStream',
        'java.lang.System',
    )
    for qualifiedName in qualifiedClassNames:
        setattr(self, qualifiedName.rsplit('.', 1)[-1], autoclass(qualifiedName))

    # JVM-wide switch read by PDFBox's renderer.
    self.System.setProperty('org.apache.pdfbox.rendering.UsePureJavaCMYKConversion', 'true')
class PDFParser:
    """Extract metadata, text, annotations and a thumbnail from PDF bytes.

    Parsing is done with PDFBox classes resolved through ``autoclass``.
    Pages that pass the OCR checks are additionally rendered to an image
    and sent to the OCR proxy.
    """

    def __init__(self, Logger, OcrSymbolsPerPageThreshold, OcrMaxPageCount, ParserCallTimeoutSeconds):
        """Store the configuration and resolve the Java classes that are used.

        Args:
            Logger: object exposing ``LogMessage(level, message)``.
            OcrSymbolsPerPageThreshold: a page is sent to OCR when it holds
                fewer letters than this value; ``-1`` sends every page.
            OcrMaxPageCount: only pages whose zero-based index is below this
                value are sent to OCR; ``-1`` removes the limit.
            ParserCallTimeoutSeconds: stored as ``parserCallTimeoutSeconds``;
                not read anywhere else in this class.
        """
        self.logger = Logger
        self.ocrProxy = OCRProxy()
        self.parserCallTimeoutSeconds = ParserCallTimeoutSeconds
        self.ocrSymbolsPerPageThreshold = OcrSymbolsPerPageThreshold
        self.ocrMaxPageCount = OcrMaxPageCount
        self.ByteArrayInputStream = autoclass('java.io.ByteArrayInputStream')
        self.ByteArrayOutputStream = autoclass('java.io.ByteArrayOutputStream')
        self.PDDocument = autoclass('org.apache.pdfbox.pdmodel.PDDocument')
        self.PDPage = autoclass('org.apache.pdfbox.pdmodel.PDPage')
        self.PDAnnotation = autoclass('org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation')
        self.PDDocumentInformation = autoclass('org.apache.pdfbox.pdmodel.PDDocumentInformation')
        self.PDFRenderer = autoclass('org.apache.pdfbox.rendering.PDFRenderer')
        self.PDFTextStripper = autoclass('org.apache.pdfbox.text.PDFTextStripper')
        self.ImageType = autoclass('org.apache.pdfbox.rendering.ImageType')
        self.BufferedImage = autoclass('java.awt.image.BufferedImage')
        self.ImageIO = autoclass('javax.imageio.ImageIO')
        self.MemoryCacheImageOutputStream = autoclass('javax.imageio.stream.MemoryCacheImageOutputStream')
        self.System = autoclass('java.lang.System')
        # JVM-wide switch read by PDFBox's renderer.
        self.System.setProperty('org.apache.pdfbox.rendering.UsePureJavaCMYKConversion', 'true')

    def Parse(self, FileName, FileData):
        """Parse a PDF held in memory.

        Args:
            FileName: name used in log messages only.
            FileData: raw PDF bytes.

        Returns:
            A ``FileParserResponse``. On success ``success`` is True and
            ``meta``, ``thumbnail``, ``text`` and ``ocrPerformed`` are filled
            in. On any error ``success`` is False and ``message`` holds the
            exception text. A document without pages is a success with only
            ``meta`` populated.
        """
        resp = FileParserResponse()
        document = None
        try:
            inputStream = self.ByteArrayInputStream(FileData)
            document = self.PDDocument.load(inputStream)

            metadata = document.getDocumentInformation()
            resp.meta['Author'] = metadata.getAuthor()
            resp.meta['title'] = metadata.getTitle()
            resp.meta['Content-Type'] = 'application/pdf'
            # Payload length in bytes (sys.getsizeof would add the Python
            # object overhead on top of it).
            resp.meta['Content-Length'] = len(FileData)

            pageCount = document.getNumberOfPages()
            if pageCount == 0:
                resp.success = True
                return resp

            ## generating thumbnail
            resp.thumbnail = self.GenerateThumbnail(document)

            ## parsing text
            pdfStripper = self.PDFTextStripper()
            for pageNumber in range(pageCount):
                parsedText = self._ExtractPageText(pdfStripper, document, pageNumber)

                if self._ShouldPerformOcr(pageNumber, parsedText):
                    self.logger.LogMessage('info','performing ocr on page {0} of pdf {1}'.format(pageNumber + 1, FileName))
                    ocrResp = self.PerformOCROnPage(document, pageNumber)
                    if ocrResp.success:
                        parsedText = '{0}\r\n{1}'.format(parsedText, ocrResp.text)
                        resp.ocrPerformed = True
                    else:
                        self.logger.LogMessage('info','could not perform ocr on page {0} of pdf {1} {2}'.format(pageNumber + 1, FileName, ocrResp.message))

                ##parsing annotations
                parsedText = self._AppendAnnotations(document, pageNumber, parsedText, FileName)

                parsedText = self.NormalizeText(parsedText)
                resp.text = '{0}\r\n{1}'.format(resp.text, parsedText)

            resp.success = True
        except Exception as ex:
            resp.message = str(ex)
            resp.success = False
        finally:
            # Runs on success, failure and the early return alike, so the
            # PDFBox document is always released.
            if document is not None:
                self._CloseDocument(document, FileName)
                inputStream = None
                document = None
                self.System.gc()
        return resp

    def _ExtractPageText(self, pdfStripper, document, pageNumber):
        """Return the text of one page (``pageNumber`` is zero-based)."""
        # PDFTextStripper page numbers are one-based.
        pdfStripper.setStartPage(pageNumber + 1)
        pdfStripper.setEndPage(pageNumber + 1)
        try:
            return pdfStripper.getText(document)
        except Exception as convEx:
            # Only exceptions carrying the raw payload in ``object`` can be
            # recovered; anything else is re-raised unchanged instead of being
            # masked by an AttributeError.
            rawText = getattr(convEx, 'object', None)
            if rawText is None:
                raise
            return BinaryStringParser.Parse(rawText)

    def _ShouldPerformOcr(self, pageNumber, parsedText):
        """Tell whether the page passes both the page-count and letter-count checks."""
        withinPageLimit = (pageNumber < self.ocrMaxPageCount) or (self.ocrMaxPageCount == -1)
        if not withinPageLimit:
            return False
        return (self.GetSymbolsCount(parsedText) < self.ocrSymbolsPerPageThreshold) or (self.ocrSymbolsPerPageThreshold == -1)

    def _AppendAnnotations(self, document, pageNumber, parsedText, FileName):
        """Return ``parsedText`` with the page's non-empty annotation contents appended.

        Failures are logged and the text is returned unchanged.
        """
        try:
            pdfAnnotations = document.getPage(pageNumber).getAnnotations()
            contents = []
            for annotationNumber in range(pdfAnnotations.size()):
                annotationContents = pdfAnnotations.get(annotationNumber).getContents()
                if annotationContents:
                    contents.append(annotationContents)
            if contents:
                # Entries are separated by a dashed line; the last entry is
                # followed by a bare line break before the end marker.
                annotationsText = '\r\n----\r\n'.join(contents) + '\r\n'
                parsedText = '{0}\r\n----Annotations start----\r\n{1}----Annotations end----'.format(parsedText, annotationsText)
        except Exception as ex:
            self.logger.LogMessage('info','could not extract annotations from page {0} of pdf {1} {2}'.format(pageNumber + 1, FileName, str(ex)))
        return parsedText

    def _CloseDocument(self, document, FileName):
        """Close the PDFBox document, logging instead of raising on failure."""
        try:
            document.close()
        except Exception as ex:
            self.logger.LogMessage('info','could not close pdf {0} {1}'.format(FileName, str(ex)))

    def _RenderPageAsJpeg(self, document, pageNumber, dpi):
        """Render one page (zero-based index) to JPEG and return the bytes as a bytearray."""
        pdfRenderer = self.PDFRenderer(document)
        bufferedImage = pdfRenderer.renderImageWithDPI(pageNumber, dpi, self.ImageType.RGB)
        byteStream = self.ByteArrayOutputStream()
        imageStream = self.MemoryCacheImageOutputStream(byteStream)
        try:
            self.ImageIO.write(bufferedImage, "jpg", imageStream)
        finally:
            # Closing pushes any data still cached in the image stream into
            # byteStream and releases the cache.
            imageStream.close()
        imageData = bytearray(byteStream.toByteArray())
        # Drop the Java references before asking the JVM to collect.
        pdfRenderer = None
        bufferedImage = None
        byteStream = None
        imageStream = None
        self.System.gc()
        return imageData

    def GenerateThumbnail(self, document):
        """Render the first page at 75 DPI.

        Returns:
            ``(jpeg_bytes, 'image/jpeg')``, or None when rendering fails
            (the failure is logged).
        """
        try:
            return (self._RenderPageAsJpeg(document, 0, 75), 'image/jpeg')
        except Exception as ex:
            self.logger.LogMessage('info','unable to generate thumbnail for pdf {0}'.format(str(ex)))
            return None

    def PerformOCROnPage(self, document, pageNumber):
        """Render the page at 200 DPI and pass the JPEG to the OCR proxy.

        Returns:
            The proxy's response, or an ``OCRProxyResponse`` with
            ``success`` False and ``message`` set when rendering or the
            proxy call raises.
        """
        ocrResp = OCRProxyResponse()
        try:
            imageData = self._RenderPageAsJpeg(document, pageNumber, 200)
            ocrResp = self.ocrProxy.PerformOCR(imageData)
        except Exception as ex:
            ocrResp.success = False
            ocrResp.message = str(ex)
        return ocrResp

    def NormalizeText(self, Text):
        """Collapse every run of two or more line breaks (with any whitespace between them) into one CRLF."""
        regex = re.compile(r'([\s]*[\r]*\n){2,}')
        return regex.sub('\r\n', Text)

    def GetSymbolsCount(self, Text):
        """Count the Latin and Cyrillic letters in ``Text``, ignoring case and every other character."""
        regex = re.compile(r'[^a-zа-яёй]+', re.I)
        return len(regex.sub('', Text))
class TikaParser:
    """Extract text and metadata from arbitrary file bytes with Apache Tika.

    Files whose detected content type is an image are additionally sent to
    the OCR proxy and get a JPEG thumbnail.
    """

    def __init__(self, TikaCallTimeoutSeconds):
        """Resolve the Java classes and build the Tika parser.

        The parser is configured from ``/tika-config.xml``.

        Args:
            TikaCallTimeoutSeconds: accepted but not read by this class.
        """
        self.ocrProxy = OCRProxy()
        self.ByteArrayInputStream = autoclass('java.io.ByteArrayInputStream')
        self.Metadata = autoclass('org.apache.tika.metadata.Metadata')
        self.AutoDetectParser = autoclass('org.apache.tika.parser.AutoDetectParser')
        self.BodyContentHandler = autoclass('org.apache.tika.sax.BodyContentHandler')
        self.TikaConfig = autoclass('org.apache.tika.config.TikaConfig')
        self.config = self.TikaConfig('/tika-config.xml')
        self.parser = self.AutoDetectParser(self.config)

    def Parse(self, FileName, FileData):
        """Parse a file held in memory.

        Args:
            FileName: passed to Tika as the resource name when non-empty,
                and used in log output.
            FileData: raw file bytes.

        Returns:
            A ``FileParserResponse``. On success ``success`` is True and
            ``text`` and ``meta`` are filled in (plus ``ocrPerformed`` and
            ``thumbnail`` for images). On any error ``success`` is False and
            ``message`` holds the exception text.
        """
        resp = FileParserResponse()
        try:
            meta = self.Metadata()
            if FileName:
                meta.set(self.Metadata.RESOURCE_NAME_KEY, FileName)

            # -1 is passed as the handler's write limit.
            contentHandler = self.BodyContentHandler(-1)
            inputStream = self.ByteArrayInputStream(FileData)
            self.parser.parse(inputStream, contentHandler, meta)

            try:
                resp.text = contentHandler.toString()
            except Exception as convEx:
                # Only exceptions carrying the raw payload in ``object`` can
                # be recovered; anything else is re-raised unchanged instead
                # of being masked by an AttributeError.
                rawText = getattr(convEx, 'object', None)
                if rawText is None:
                    raise
                resp.text = BinaryStringParser.Parse(rawText)

            for name in meta.names():
                try:
                    resp.meta[name] = meta.get(name)
                except Exception:
                    # A value that cannot be read is recorded as empty.
                    resp.meta[name] = ''

            inputStream = None
            contentHandler = None

            if 'Content-Type' in resp.meta and ContentTypeAnalyzer.IsImageByContentType(resp.meta['Content-Type']):
                print('info','performing ocr on {0}'.format(FileName))
                ocrResp = self.ocrProxy.PerformOCR(FileData)
                if ocrResp.success:
                    resp.text = self.NormalizeText('{0}{1}'.format(resp.text, ocrResp.text))
                    resp.ocrPerformed = True
                else:
                    # Was a call to the undefined name `sprint`, which made a
                    # failed OCR abort the whole parse with a NameError.
                    print('info','could not perform ocr on {0} {1}'.format(FileName, ocrResp.message))
                resp.thumbnail = self.GenerateThumbnail(FileData)

            resp.success = True
        except Exception as ex:
            resp.success = False
            resp.message = str(ex)
        return resp

    def GenerateThumbnail(self, ImageData, MaxWidth=1000, MaxHeigh=5000, Quality=70, Dpi=50):
        """Build a JPEG thumbnail from encoded image bytes.

        Args:
            ImageData: encoded image bytes.
            MaxWidth: maximum thumbnail width in pixels.
            MaxHeigh: maximum thumbnail height in pixels.
            Quality: JPEG quality passed to the encoder.
            Dpi: accepted but not read by this method.

        Returns:
            ``(jpeg_bytes, 'image/jpeg')``, or None when the image is a
            JPEG-compressed TIFF or cannot be processed.
        """
        try:
            image = Image.open(io.BytesIO(ImageData))
            # JPEG-compressed TIFFs are deliberately skipped.
            if image.info.get('compression') == 'tiff_jpeg':
                return None
            image.thumbnail((MaxWidth, MaxHeigh))
            bytesIO = io.BytesIO()
            image.convert('RGB').save(bytesIO, format='JPEG', quality=Quality)
            return (bytesIO.getvalue(), 'image/jpeg')
        except Exception:
            # Best effort: a file without a thumbnail is still a valid result.
            return None

    def NormalizeText(self, Text):
        """Collapse every run of two or more line breaks (with any whitespace between them) into one CRLF."""
        regex = re.compile(r'([\s]*[\r]*\n){2,}')
        return regex.sub('\r\n', Text)