Пример #1
0
def extract_image_data(data, languages=None):
    """Run OCR over raw image bytes and return the recognised text.

    A previously cached result for ``data`` is returned as-is when
    ``get_cache`` yields one. Otherwise the bytes are opened as an image,
    passed through Tesseract, and the recognised text is stored via
    ``set_cache`` before being returned.

    :param data: binary string holding the image.
    :param languages: language selection, resolved by ``_get_languages``.
    :returns: the extracted text, or an empty string when the image
        cannot be opened or any part of the OCR step raises.
    :raises ValueError: if ``TESSDATA_PREFIX`` is ``None``.
    """
    if TESSDATA_PREFIX is None:
        raise ValueError("Env TESSDATA_PREFIX is not set, OCR will not work.")

    key, cached = get_cache(data)
    if cached is not None:
        return cached

    # Unreadable images are not an error for the caller: log and give up.
    try:
        image = Image.open(StringIO(data))
    except Exception as err:
        log.debug("Failed to parse image internally: %r", err)
        return ""

    # TODO: play with contrast and sharpening the images.
    try:
        langs = _get_languages(languages)
        engine = Tesseract(TESSDATA_PREFIX, lang=langs)
        engine.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
        recognised = engine.ocr_image(image)
        log.debug("OCR done: %s, %s characters extracted", langs, len(recognised))
        # Caching stays inside the try so a cache failure also yields "".
        set_cache(key, recognised)
    except Exception as err:
        log.exception(err)
        return ""
    return recognised
Пример #2
0
def extract_image_data(data, languages=None):
    """OCR the image held in ``data`` and hand back its text.

    The cache is consulted first through ``get_cache``; a non-``None``
    cached text short-circuits the whole function. On a miss the image
    is decoded, recognised with Tesseract and the text is written back
    with ``set_cache``.

    :param data: binary string holding the image.
    :param languages: language selection, resolved by ``_get_languages``.
    :returns: the recognised text; ``''`` if decoding the image fails or
        anything in the OCR step raises.
    :raises ValueError: if ``TESSDATA_PREFIX`` is ``None``.
    """
    if TESSDATA_PREFIX is None:
        raise ValueError('Env TESSDATA_PREFIX is not set, OCR will not work.')

    key, cached_text = get_cache(data)
    if cached_text is not None:
        return cached_text

    try:
        picture = Image.open(StringIO(data))
    except Exception as exc:
        log.debug('Failed to parse image internally: %r', exc)
        return ''

    # TODO: play with contrast and sharpening the images.
    result = ''
    try:
        resolved = _get_languages(languages)
        ocr = Tesseract(TESSDATA_PREFIX, lang=resolved)
        ocr.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
        ocr_text = ocr.ocr_image(picture)
        log.debug('OCR done: %s, %s characters extracted', resolved,
                  len(ocr_text))
        set_cache(key, ocr_text)
        # Only promote the text to the result once it has been cached, so
        # a failure anywhere above still returns the empty string.
        result = ocr_text
    except Exception as exc:
        log.exception(exc)
    return result
Пример #3
0
def extract_image_data(data, languages=None):
    """ Extract text from a binary string of data containing an image in
    a commonly-used format.

    A cached result from ``get_cache`` is returned when available.
    Otherwise the image is decoded and passed to the extractor obtained
    from ``_get_tesseract()``; the outcome (the text, or ``''`` when OCR
    raises) is stored with ``set_cache``.

    :param data: binary string holding the image.
    :param languages: accepted for interface compatibility; this
        implementation does not use it.
    :returns: the extracted text, or ``''`` if the image cannot be parsed
        or OCR fails.
    :raises ValueError: if ``TESSDATA_PREFIX`` is ``None``.
    """
    if TESSDATA_PREFIX is None:
        raise ValueError('Env TESSDATA_PREFIX is not set, OCR will not work.')
    key, text = get_cache(data)
    if text is not None:
        return text
    # An unparseable image must not crash the caller; report "no text"
    # instead of letting the decoder's exception propagate.
    try:
        img = Image.open(StringIO(data))
    except Exception as ex:
        log.debug('Failed to parse image internally: %r', ex)
        return ''
    # TODO: play with contrast and sharpening the images.
    try:
        extractor = _get_tesseract()
        try:
            extractor.set_image(img)
            text = extractor.get_utf8_text()
        finally:
            # Always clear the extractor, even when recognition fails.
            # NOTE(review): presumably _get_tesseract() returns a reused
            # instance, so skipping this would leave the failed image
            # attached to it - confirm.
            extractor.clear()
        set_cache(key, text)
        return text
    except Exception as ex:
        log.exception(ex)
        # Remember the failure so the same data is not retried.
        set_cache(key, '')
        return ''
Пример #4
0
def crawl_file(file_path):
    """Run OCR on one file and log the running throughput.

    Files whose lower-cased extension is not in ``FILE_EXTENSIONS`` are
    skipped, as are files whose contents already have a cached result
    according to ``get_cache``. For every other file the contents are
    passed to ``extract_image_data``, the module-level ``processed``
    counter is incremented under ``counter_lock``, and the average time
    per image since ``START_TIME`` is logged.

    :param file_path: path of the file to process.
    :returns: ``None``.
    """
    global processed
    _, ext = os.path.splitext(file_path)
    if ext.strip().lower() not in FILE_EXTENSIONS:
        return
    with open(file_path, 'rb') as fh:
        data = fh.read()
    # Only the cached text matters here; the cache key is not needed.
    _, cached = get_cache(data)
    if cached is not None:
        return
    text = extract_image_data(data)
    # Update the shared counter and derive the average under the lock so
    # both are computed from the same value of ``processed``.
    with counter_lock:
        processed += 1
        secs_per_img = (time.time() - START_TIME) / processed
    log.info('Extracted: %s (%d characters of text), %.3fs/img', file_path,
             len(text), secs_per_img)