def extract_text(self, data, languages=None):
    """Run OCR over raw image bytes and return the recognised text.

    ``data`` is decoded with Pillow; ``languages`` is handed to
    ``self.language_list`` to obtain the model string used to configure
    the engine. An empty string is returned when the image cannot be
    opened or when any step of the recognition raises.
    """
    try:
        stream = BytesIO(data)
        image = Image.open(stream)
        # Force the pixel data to be decoded now so that broken images
        # fail here rather than inside the OCR engine.
        image.load()
    except Exception:
        log.exception("Cannot open image data using Pillow")
        return ''

    result = ''
    with temp_locale(TESSERACT_LOCALE):
        models = self.language_list(languages)
        api = self.configure_engine(models)
        try:
            # TODO: play with contrast and sharpening the images.
            began = time.time()
            api.SetImage(image)
            recognised = api.GetUTF8Text()
            confidence = api.MeanTextConf()
            elapsed = time.time() - began
            log.info("w: %s, h: %s, l: %s, c: %s, took: %.5f",
                     image.width, image.height, models,
                     confidence, elapsed)
            # Only publish the text once every step above has succeeded.
            result = recognised
        except Exception as exc:
            log.warning("OCR error: %s", exc)
        finally:
            # Always reset the engine, whether recognition worked or not.
            api.Clear()
        return result
def language_list(self, languages):
    """Return the '+'-joined, sorted model string for ``languages``.

    The supported model names are fetched from tesserocr once and cached
    on ``settings.ocr_supported``. Codes produced by ``alpha3(languages)``
    that are not in that collection are dropped, and the remainder is cut
    down to ``self.MAX_MODELS`` entries. ``'eng'`` is always added after
    the cut, so the result can contain one model more than the limit.
    """
    if not hasattr(settings, 'ocr_supported'):
        with temp_locale(TESSERACT_LOCALE):
            # Imported lazily, inside the temporary locale.
            from tesserocr import get_languages
            # Only the second element of the returned pair is kept.
            _unused, settings.ocr_supported = get_languages()

    selected = []
    for code in alpha3(languages):
        if code in settings.ocr_supported:
            selected.append(code)

    limit = self.MAX_MODELS
    if len(selected) > limit:
        log.warning("Too many models, limit: %s", limit)
        del selected[limit:]

    # De-duplicate and make sure English is always part of the set.
    unique = set(selected)
    unique.add('eng')
    return '+'.join(sorted(unique))