def add_kv_ocr_confidence(t_document: t2.TDocument) -> t2.TDocument:
    """
    adds custom attribute to each KEY_VALUE_SET in the form of
    "Custom":{"OCRConfidence": {'mean': 98.2, 'min': 95.1}}

    The mean and min are computed over the confidence values of the blocks
    returned by get_child_relations for the key/value block. A child whose
    confidence is None is ignored; a confidence of 0 is a valid value and is
    included.
    If no CHILD relationships exist for a KEY or VALUE (or none of the
    children carries a confidence), no confidence score will be added.

    :param t_document: document whose key/value blocks are updated in place
    :return: the same t_document instance
    """
    for idx, page_block in enumerate(t_document.pages):
        logger.debug(f"page: {idx}")
        key_value_blocks = t_document.forms(page=page_block)
        logger.debug(f"len(key_value_blocks): {len(key_value_blocks)}")
        for key_value_block in key_value_blocks:
            logger.debug(f"key_value_block.id: {key_value_block.id}")
            ocr_blocks = t_document.get_child_relations(key_value_block)
            if not ocr_blocks:
                continue
            logger.debug(f"len(child-relations: {len(ocr_blocks)}")
            # Compare against None instead of relying on truthiness so that
            # a confidence of exactly 0 is not silently dropped.
            confidence_list: List[float] = [
                float(x.confidence) for x in ocr_blocks
                if x.confidence is not None
            ]
            if not confidence_list:
                continue
            ocr_confidence = {
                'mean': statistics.mean(confidence_list),
                'min': min(confidence_list),
            }
            # Keep existing custom attributes; only create the dict when
            # the block has none yet.
            if key_value_block.custom:
                key_value_block.custom['OCRConfidence'] = ocr_confidence
            else:
                key_value_block.custom = {'OCRConfidence': ocr_confidence}
    return t_document
def order_blocks_by_geo(t_document: t2.TDocument) -> t2.TDocument:
    """
    takes in a Textract JSON response and outputs a Textract JSON response
    schema which has the elements sorted by geometry (top coordinate of
    bounding box)

    The ordering is done page by page: each page block is emitted first,
    followed by the blocks returned by relationships_recursive for that page,
    sorted ascending by the top coordinate of their bounding box. Blocks
    without geometry or bounding box get the sort value 1. The result
    replaces t_document.blocks, so blocks not returned for any page are not
    part of the new list.

    :param t_document: document to reorder in place
    :return: the same t_document instance
    """

    def _top_of(block):
        # NOTE(review): text_type is compared against "PAGE" here; confirm
        # whether the block type was meant instead.
        if block.text_type == "PAGE":
            return 1
        if block.geometry and block.geometry.bounding_box:
            return block.geometry.bounding_box.top
        return 1

    reordered: List[t2.TBlock] = []
    for page in t_document.pages:
        reordered.append(page)
        related = t_document.relationships_recursive(page)
        descendants = list(related) if related else []
        # list.sort is stable, same as sorted(): ties keep their input order
        descendants.sort(key=_top_of)
        reordered.extend(descendants)
    t_document.blocks = reordered
    return t_document
def rotate_points_to_page_orientation(
        t_document: t2.TDocument) -> t2.TDocument:
    """
    Rotates each page by the negative of its stored 'Orientation' value.

    Pages without custom data are skipped. For every other page the value
    page.custom['Orientation'] is negated, t_document.rotate is called for
    that page around the point (0.5, 0.5), and the applied rotation is
    recorded as page.custom['Rotation'] with the keys 'Degrees',
    'RotationPointX' and 'RotationPointY'.

    A page whose custom data has no 'Orientation' entry is not handled here;
    the lookup is done without a fallback.

    :param t_document: document to rotate in place
    :return: the same t_document instance
    """
    # TODO add rotation information to document (degree and center)
    logger.debug("rotate_points_to_page_orientation")
    center_x, center_y = 0.5, 0.5
    for page in t_document.pages:
        logger.debug(page)
        if not page.custom:
            continue
        logger.debug("page.custom")
        page_rotation = -page.custom['Orientation']
        logger.debug(f"page_rotation: {page_rotation}")
        t_document.rotate(page=page,
                          origin=t2.TPoint(center_x, center_y),
                          degrees=float(page_rotation))
        page.custom['Rotation'] = {
            'Degrees': page_rotation,
            'RotationPointX': center_x,
            'RotationPointY': center_y,
        }
    return t_document
def pipeline_merge_tables(
        t_document: t2.TDocument,
        merge_options: MergeOptions = MergeOptions.MERGE,
        customer_function: Callable = None,
        header_footer_type: HeaderFooterType = HeaderFooterType.NONE,
        accuracy_percentage: float = 99) -> t2.TDocument:
    """
    Determines which tables belong together and merges or links them.

    :param t_document: document whose tables are processed in place
    :param merge_options: MergeOptions.MERGE calls t_document.merge_tables,
        MergeOptions.LINK calls t_document.link_tables; for any other value
        the document is returned without merging or linking
    :param customer_function: optional callable that is given t_document and
        returns the table ids to merge (annotated as List[List[str]]); when
        it is not set, ExecuteTableValidations is used instead
    :param header_footer_type: passed on to ExecuteTableValidations
    :param accuracy_percentage: passed on to ExecuteTableValidations
    :return: the same t_document instance
    """
    # A caller-supplied function takes precedence over the built-in check.
    tables_merge_ids: List[List[str]] = (
        customer_function(t_document) if customer_function else
        ExecuteTableValidations(t_document, header_footer_type,
                                accuracy_percentage))

    if merge_options == MergeOptions.MERGE:
        t_document.merge_tables(tables_merge_ids)
    elif merge_options == MergeOptions.LINK:
        t_document.link_tables(tables_merge_ids)
    return t_document
def add_page_orientation(t_document: t2.TDocument) -> t2.TDocument:
    """
    Adds the page orientation as Custom attribute ('Orientation') to each
    page of the Textract schema.

    For every page the WORD and LINE blocks among its child relations are
    collected, the degree of each block's polygon is rounded to an integer,
    and the most common value becomes the page orientation. A page without
    any WORD or LINE block gets the orientation 0, because no orientation can
    be derived from it and statistics.mode would raise on empty data.

    Existing custom attributes of a page are kept; the custom dict is only
    created when the page has none yet.

    :param t_document: document whose pages are updated in place
    :return: the same t_document instance
    """
    for page in t_document.pages:
        text_blocks = t2.TDocument.filter_blocks_by_type(
            block_list=t_document.get_child_relations(page=page),
            textract_block_type=[
                t2.TextractBlockTypes.WORD, t2.TextractBlockTypes.LINE
            ])
        degrees = [
            round(__get_degree_from_polygon(block.geometry.polygon))
            for block in text_blocks
        ]
        # statistics.mode raises StatisticsError for an empty list, which
        # would abort the whole document on a single page without text.
        orientation = statistics.mode(degrees) if degrees else 0
        if page.custom:
            page.custom['Orientation'] = orientation
        else:
            page.custom = {'Orientation': orientation}
    return t_document
def order_blocks_by_geo(t_document: t2.TDocument) -> t2.TDocument:
    """
    takes in a Textract JSON response and outputs a Textract JSON response
    schema which has the elements sorted by geometry (top coordinate of
    bounding box)

    All blocks of the document are sorted as one single list, ascending by
    the top coordinate of their bounding box. Blocks without geometry or
    without bounding box get the sort value 1 instead of raising an
    AttributeError. The sort is stable, so blocks with equal sort value keep
    their previous relative order.

    :param t_document: document to reorder in place
    :return: the same t_document instance
    """

    def _top_of(block):
        # Not every block necessarily carries geometry information; fall
        # back to 1 for blocks that do not.
        geometry = block.geometry
        if geometry and geometry.bounding_box:
            return geometry.bounding_box.top
        return 1

    t_document.blocks = sorted(t_document.blocks, key=_top_of)
    return t_document