示例#1
0
    def extract_text(self, data, languages=None):
        """Extract text from a binary string of data.

        The bytes are opened as an image with Pillow and handed to the
        engine returned by ``self.configure_engine`` while
        ``TESSERACT_LOCALE`` is temporarily active.

        :param data: raw bytes of an image.
        :param languages: languages used to pick the OCR models; they are
            normalised through ``self.language_list`` first.
        :return: the recognised text, or ``''`` when the image cannot be
            opened or the recognition step raises.
        """
        try:
            image = Image.open(BytesIO(data))
            # Load the pixel data right away so that unreadable input is
            # reported here rather than inside the OCR engine.
            image.load()
        except Exception:
            log.exception("Cannot open image data using Pillow")
            return ''

        with temp_locale(TESSERACT_LOCALE):
            languages = self.language_list(languages)
            api = self.configure_engine(languages)
            try:
                # TODO: play with contrast and sharpening the images.
                # perf_counter() is monotonic, so the logged duration cannot
                # be skewed (or go negative) when the system clock changes.
                started = time.perf_counter()
                api.SetImage(image)
                text = api.GetUTF8Text()
                confidence = api.MeanTextConf()
                duration = time.perf_counter() - started
                log.info("w: %s, h: %s, l: %s, c: %s, took: %.5f",
                         image.width, image.height, languages,
                         confidence, duration)
                return text
            except Exception as exc:
                # Best effort: a failed recognition yields no text instead
                # of propagating the error to the caller.
                log.warning("OCR error: %s", exc)
                return ''
            finally:
                # Always reset the engine, whether recognition worked or not.
                api.Clear()
示例#2
0
 def language_list(self, languages):
     """Return the '+'-joined, sorted model names to use for ``languages``.

     Codes produced by ``alpha3(languages)`` are kept only if they appear
     in ``settings.ocr_supported``; at most ``self.MAX_MODELS`` of them
     are retained, and 'eng' is always added to the result.
     """
     if not hasattr(settings, 'ocr_supported'):
         # First use: query tesserocr once and cache the second element
         # of its answer on the settings object.
         with temp_locale(TESSERACT_LOCALE):
             from tesserocr import get_languages
             _, supported = get_languages()
             settings.ocr_supported = supported

     selected = []
     for code in alpha3(languages):
         if code in settings.ocr_supported:
             selected.append(code)

     limit = self.MAX_MODELS
     if len(selected) > limit:
         log.warning("Too many models, limit: %s", limit)
         selected = selected[:limit]

     # 'eng' is added after the cut-off, so it is present in every result.
     unique = set(selected)
     unique.add('eng')
     return '+'.join(sorted(unique))