Пример #1
0
def _convert_page(layout, path):
    """Gather the text of one page layout and merge it with ``text_fragments``.

    Each object yielded by ``_find_objects`` for the ``LTTextBox`` and
    ``LTTextLine`` types contributes the result of its ``get_text()`` call,
    in iteration order.

    ``path`` is accepted but not used by the active code.
    """
    fragments = [
        found.get_text()
        for found in _find_objects(layout._objs, (LTTextBox, LTTextLine))
    ]

    # NOTE: an image fallback was previously sketched here and is disabled.
    # It was meant to trigger when the merged text was shorter than 2 and the
    # layout contained at least one LTImage, calling _extract_image_page with
    # `path`, `page_no` and `languages`. The last two are not available in
    # this function, so the fallback cannot simply be re-enabled as written.
    return text_fragments(fragments)
Пример #2
0
def _convert_page(layout, path):
    """Return the merged text of a page layout.

    The ``get_text()`` output of every ``LTTextBox`` / ``LTTextLine`` object
    that ``_find_objects`` yields from ``layout._objs`` is collected in order
    and handed to ``text_fragments``; that function's result is returned.

    The ``path`` argument is currently unused.
    """
    text_types = (LTTextBox, LTTextLine)
    pieces = [node.get_text() for node in _find_objects(layout._objs, text_types)]
    merged = text_fragments(pieces)

    # A disabled fallback used to follow: for merged text shorter than 2 on a
    # layout holding LTImage objects, it logged "Defaulting to OCR" and called
    # _extract_image_page(path, page_no, languages). Neither `page_no` nor
    # `languages` is defined in this scope.
    return merged
Пример #3
0
def _convert_page(layout, languages):
    """Return the merged text of a page layout, consulting images if needed.

    The ``get_text()`` results of all ``LTTextBox`` / ``LTTextLine`` objects
    found in ``layout._objs`` are merged with ``text_fragments``. If the
    merged text has a length greater than 3 it is returned as is.

    Otherwise every ``LTImage`` object in the layout whose width and height
    are not below ``OCR_MIN_WIDTH`` / ``OCR_MIN_HEIGHT`` has its raw stream
    data passed to ``extract_image_data`` (with ``languages`` forwarded as a
    keyword argument), and the result is added to the collected fragments.
    Any exception raised while handling a single image is logged at debug
    level and that image is skipped. The fragments, including whatever the
    images produced, are then merged again and returned.
    """
    fragments = [
        found.get_text()
        for found in _find_objects(layout._objs, (LTTextBox, LTTextLine))
    ]

    page_text = text_fragments(fragments)
    # TODO: invent a smarter way to decide whether to do OCR.
    if len(page_text) > 3:
        return page_text

    for image in _find_objects(layout._objs, LTImage):
        try:
            too_small = (
                image.width < OCR_MIN_WIDTH
                or image.height < OCR_MIN_HEIGHT
            )
            if not too_small:
                raw = image.stream.get_rawdata()
                extracted = extract_image_data(raw, languages=languages)
                fragments.append(extracted)
        except Exception as ex:
            # Best effort: one unreadable image must not abort the page.
            log.debug(ex)

    return text_fragments(fragments)