def extract_image_data(data, languages=None):
    """Extract text from a binary string of data.

    The OCR cache is consulted first (``Cache.get_ocr``) and updated with
    the freshly recognised text (``Cache.set_ocr``) on a miss.

    :param data: raw bytes of the image to run OCR on.
    :param languages: language hints; normalised through
        ``get_languages_iso3`` before being used for the cache lookup and
        handed to Tesseract.
    :returns: the recognised text, or ``None`` when the image cannot be
        opened (unknown format or flagged as too large).
    :raises IngestorException: if ``TESSDATA_PREFIX`` is not configured.
    """
    tessdata_prefix = get_config('TESSDATA_PREFIX')
    if tessdata_prefix is None:
        raise IngestorException("TESSDATA_PREFIX is not set, OCR won't work.")

    languages = get_languages_iso3(languages)
    text = Cache.get_ocr(data, languages)
    if text is not None:
        return text

    try:
        img = Image.open(StringIO(data))
    except DecompressionBombWarning as dce:
        # NOTE(review): this handler presumably only fires when warnings are
        # escalated to errors elsewhere -- confirm the warnings filter setup.
        # The format directive used to be a bare "%", which made the logging
        # call itself fail while formatting the message.
        log.debug("Image too large: %r", dce)
        return None
    except IOError as ioe:
        log.info("Unknown image format: %r", ioe)
        return None

    # TODO: play with contrast and sharpening the images.
    extractor = Tesseract(tessdata_prefix, lang=languages)
    extractor.set_image(img)
    extractor.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
    # Fall back to an empty string so the decode below always has a value.
    text = extractor.get_text() or ''
    text = text.decode(encoding="UTF-8")
    # extractor.clear()
    log.debug('OCR done: %s, %s characters extracted', languages, len(text))
    Cache.set_ocr(data, languages, text)
    return text
def extract_text(self, data, languages=None):
    """Recognise the text in an image through the remote gRPC OCR service.

    Results are cached under the SHA-1 hex digest of the *original* ``data``
    (the key is computed before ``ensure_size`` is applied).

    :param data: raw image bytes.
    :param languages: language hints; normalised with ``ensure_list`` and
        sent along with the image.
    :returns: the recognised (or cached) text. ``None`` if ``ensure_size``
        rejects the data or if all 10 attempts fail with ``self.Error``.
    """
    key = sha1(data).hexdigest()
    text = Cache.get_cache(key)
    if text is not None:
        log.info('OCR: %s chars cached', len(text))
        return text

    # log.info("Size: %s", len(data))
    data = self.ensure_size(data)
    if data is None:
        return

    # Loop-invariant: normalise the language list once, not on every retry.
    languages = ensure_list(languages)
    for attempt in range(10):
        try:
            # The stub is rebuilt per attempt because the channel may have
            # been reset by a previous failure.
            service = RecognizeTextStub(self.channel)
            image = RPCImage(data=data, languages=languages)
            response = service.Recognize(image)
            if response.text is not None:
                # Only measure and cache real text; calling len() before
                # this check would raise TypeError on a None result.
                log.info('OCR: %s chars recognized', len(response.text))
                Cache.set_cache(key, response.text)
            return response.text
        except self.Error:
            log.exception("gRPC Error: %s", self.SERVICE)
            self.reset_channel()
            backoff(failures=attempt)
def extract_image_data(data, languages=None):
    """Extract text from a binary string of data.

    The OCR cache is consulted first (``Cache.get_ocr``) and updated with
    the freshly recognised text (``Cache.set_ocr``) on a miss.

    :param data: raw bytes of the image to run OCR on.
    :param languages: language hints; normalised through
        ``get_languages_iso3`` before being used for the cache lookup and
        handed to Tesseract.
    :returns: the recognised text, or ``None`` when the image cannot be
        opened (unknown format or flagged as too large).
    :raises IngestorException: if ``TESSDATA_PREFIX`` is not configured.
    """
    tessdata_prefix = get_config('TESSDATA_PREFIX')
    if tessdata_prefix is None:
        raise IngestorException("TESSDATA_PREFIX is not set, OCR won't work.")

    languages = get_languages_iso3(languages)
    text = Cache.get_ocr(data, languages)
    if text is not None:
        return text

    try:
        img = Image.open(StringIO(data))
    except DecompressionBombWarning as dce:
        # NOTE(review): this handler presumably only fires when warnings are
        # escalated to errors elsewhere -- confirm the warnings filter setup.
        # The format directive used to be a bare "%", which made the logging
        # call itself fail while formatting the message.
        log.debug("Image too large: %r", dce)
        return None
    except IOError as ioe:
        log.info("Unknown image format: %r", ioe)
        return None

    # TODO: play with contrast and sharpening the images.
    extractor = Tesseract(tessdata_prefix, lang=languages)
    try:
        extractor.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
        text = extractor.ocr_image(img)
    finally:
        # Always clear the extractor, even when OCR itself raises.
        extractor.clear()
    log.debug('OCR done: %s, %s characters extracted', languages, len(text))
    Cache.set_ocr(data, languages, text)
    return text
def extract_text(self, data, languages=None):
    """Recognise the text in an image through the remote gRPC OCR service.

    Results are cached under the SHA-1 hex digest of the *original* ``data``
    (the key is computed before ``ensure_size`` is applied).

    :param data: raw image bytes.
    :param languages: language hints; normalised with ``ensure_list`` and
        sent along with the image.
    :returns: the recognised (or cached) text. ``None`` if ``ensure_size``
        rejects the data or if every one of the 1000 attempts fails with
        ``self.Error``.
    """
    key = sha1(data).hexdigest()
    text = Cache.get_cache(key)
    if text is not None:
        # log.info('%s chars cached', len(text))
        return text

    data = self.ensure_size(data)
    if data is None:
        return

    # Loop-invariant: normalise the language list once, not on every retry.
    languages = ensure_list(languages)
    for attempt in range(1000):
        try:
            # The stub is rebuilt per attempt because the channel may have
            # been reset by a previous failure.
            service = RecognizeTextStub(self.channel)
            image = Image(data=data, languages=languages)
            response = service.Recognize(image)
            if response.text is not None:
                # Only measure and cache real text; calling len() before
                # this check would raise TypeError on a None result.
                log.info('OCR: %s chars', len(response.text))
                Cache.set_cache(key, response.text)
            return response.text
        except self.Error as e:
            if e.code() == self.Status.RESOURCE_EXHAUSTED:
                # Retried immediately: no warning, no backoff and no
                # channel reset for this status code.
                continue
            log.warning("gRPC [%s]: %s", e.code(), e.details())
            backoff(failures=attempt)
            self.reset_channel()
def test_cache_basic(self):
    """Check get/set round-trips and that re-setting a key overwrites it.

    The key starts out absent with no rows stored; each subsequent
    ``set_cache`` on the same key must be readable back and must leave
    exactly one row in the table.
    """
    def row_count():
        # Number of Cache rows currently visible to the session.
        return db.session.query(Cache).count()

    assert Cache.get_cache('foo') is None, Cache.get_cache('foo')
    assert row_count() == 0

    for value in ('bar', 'quuux'):
        Cache.set_cache('foo', value)
        assert Cache.get_cache('foo') == value, Cache.get_cache('foo')
        assert row_count() == 1
def extract_text(self, data, languages=None):
    """Recognise the text in an image using the Vision API client.

    Results are cached under the SHA-1 hex digest of the original ``data``
    (computed before ``ensure_size`` is applied). ``languages`` is accepted
    but not used by this implementation.

    :returns: the cached or newly recognised text, or ``None`` when
        ``ensure_size`` returns ``None``.
    """
    cache_key = sha1(data).hexdigest()
    cached = Cache.get_cache(cache_key)
    if cached is not None:
        log.info('Vision API: %s chars cached', len(cached))
        return cached

    data = self.ensure_size(data)
    if data is None:
        return None

    response = self.client.document_text_detection(types.Image(content=data))
    annotation = response.full_text_annotation
    log.info('Vision API: %s chars recognized', len(annotation.text))
    Cache.set_cache(cache_key, annotation.text)
    return annotation.text
def extract_image_data(data, languages=None):
    """Run Tesseract OCR over raw image bytes and return the text.

    A cached result from ``Cache.get_ocr`` is returned when available;
    otherwise the image is opened, recognised, and the result is stored
    with ``Cache.set_ocr``. Errors from opening the image are not handled
    here and propagate to the caller.

    :raises IngestorException: if ``TESSDATA_PREFIX`` is not configured.
    """
    prefix = get_config('TESSDATA_PREFIX')
    if prefix is None:
        raise IngestorException("TESSDATA_PREFIX is not set, OCR won't work.")

    languages = get_languages_iso3(languages)
    cached = Cache.get_ocr(data, languages)
    if cached is not None:
        return cached

    image = Image.open(StringIO(data))

    # TODO: play with contrast and sharpening the images.
    ocr = Tesseract(prefix, lang=languages)
    ocr.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
    recognized = ocr.ocr_image(image)

    log.debug('OCR done: %s, %s characters extracted',
              languages, len(recognized))
    Cache.set_ocr(data, languages, recognized)
    return recognized
def set_cache(self, key, value):
    """Store ``value`` under ``key`` by delegating to ``Cache.set_cache``."""
    Cache.set_cache(key, value)
def get_cache(self, key):
    """Return whatever ``Cache.get_cache`` yields for ``key``."""
    return Cache.get_cache(key)