def extract_image_data(data, languages=None):
    """Run OCR over the raw bytes of an image and return the text found.

    The cache is consulted first via ``get_cache``; a non-None cached text
    is returned immediately. Successful OCR output is stored with
    ``set_cache`` under the key that ``get_cache`` returned.

    Returns an empty string (without caching it) when the image cannot be
    opened or when OCR raises.

    Raises ``ValueError`` when ``TESSDATA_PREFIX`` is None.
    """
    if TESSDATA_PREFIX is None:
        raise ValueError("Env TESSDATA_PREFIX is not set, OCR will not work.")

    cache_key, cached = get_cache(data)
    if cached is not None:
        return cached

    # Unreadable input is a debug-level event, not an error.
    try:
        buf = StringIO(data)
        img = Image.open(buf)
    except Exception as err:
        log.debug("Failed to parse image internally: %r", err)
        return ""

    # TODO: play with contrast and sharpening the images.
    try:
        langs = _get_languages(languages)
        ocr = Tesseract(TESSDATA_PREFIX, lang=langs)
        ocr.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
        result = ocr.ocr_image(img)
        log.debug("OCR done: %s, %s characters extracted", langs, len(result))
        set_cache(cache_key, result)
    except Exception as err:
        # Best effort: record the traceback and report "no text".
        log.exception(err)
        return ""
    return result
def extract_image_data(data, languages=None):
    """Return the text recognised by OCR in a binary image string.

    A non-None text from ``get_cache(data)`` short-circuits the OCR run.
    Otherwise the image is opened, passed through a new ``Tesseract``
    instance configured for the resolved languages, and the resulting text
    is cached with ``set_cache`` and returned.

    An empty string is returned, and nothing is cached, if the image
    cannot be opened or the OCR step raises. ``ValueError`` is raised when
    ``TESSDATA_PREFIX`` is None.
    """
    if TESSDATA_PREFIX is None:
        raise ValueError('Env TESSDATA_PREFIX is not set, OCR will not work.')

    lookup = get_cache(data)
    cache_key, hit = lookup
    if hit is not None:
        return hit

    try:
        image = Image.open(StringIO(data))
    except Exception as open_error:
        log.debug('Failed to parse image internally: %r', open_error)
        return ''

    # TODO: play with contrast and sharpening the images.
    try:
        resolved = _get_languages(languages)
        engine = Tesseract(TESSDATA_PREFIX, lang=resolved)
        engine.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
        recognised = engine.ocr_image(image)
        log.debug('OCR done: %s, %s characters extracted',
                  resolved, len(recognised))
        set_cache(cache_key, recognised)
    except Exception as ocr_error:
        # Failures are logged with traceback and reported as empty text.
        log.exception(ocr_error)
        return ''
    return recognised
def extract_image_data(data, languages=None):
    """
    Extract text from a binary string of data containing an image in a
    commonly-used format.

    A non-None text from ``get_cache(data)`` is returned without running
    OCR. Otherwise the image is handed to the extractor obtained from
    ``_get_tesseract()`` and the recognised text is cached and returned.
    If the OCR step raises, the exception is logged, an empty string is
    cached under the same key, and an empty string is returned.

    ``languages`` is accepted but not used by this implementation.

    Raises ``ValueError`` when ``TESSDATA_PREFIX`` is None. Errors from
    opening the image are not caught here and propagate to the caller.
    """
    if TESSDATA_PREFIX is None:
        raise ValueError('Env TESSDATA_PREFIX is not set, OCR will not work.')

    cache_key, cached = get_cache(data)
    if cached is not None:
        return cached

    stream = StringIO(data)
    image = Image.open(stream)

    # TODO: play with contrast and sharpening the images.
    try:
        ocr = _get_tesseract()
        ocr.set_image(image)
        recognised = ocr.get_utf8_text()
        ocr.clear()
        set_cache(cache_key, recognised)
    except Exception as err:
        log.exception(err)
        # The failure itself is cached as an empty result.
        set_cache(cache_key, '')
        return ''
    return recognised
def crawl_file(file_path):
    """OCR one file from disk and log the running average time per image.

    Files whose extension (stripped and lower-cased) is not in
    ``FILE_EXTENSIONS`` are ignored, as are files whose contents already
    have a non-None entry in the cache. For every other file the contents
    are passed to ``extract_image_data``, the module-level ``processed``
    counter is incremented under ``counter_lock``, and a summary line is
    logged.

    Always returns None.
    """
    global processed

    _, ext = os.path.splitext(file_path)
    if ext.strip().lower() not in FILE_EXTENSIONS:
        return

    with open(file_path, 'rb') as fh:
        data = fh.read()

    # Already-cached files are skipped entirely and not counted.
    _, cached = get_cache(data)
    if cached is not None:
        return

    text = extract_image_data(data)

    # The counter is shared, so update it and derive the average while
    # holding the lock; the lock is released even if this block raises.
    with counter_lock:
        processed += 1
        secs_per_img = (time.time() - START_TIME) / processed

    log.info('Extracted: %s (%d characters of text), %.3fs/img',
             file_path, len(text), secs_per_img)