def merge_text_fields(paddle_t_b: List[TextField],
                      poppler_t_b: List[TextField]) -> List[TextField]:
    """Combine Paddle-detected boxes with Poppler text: a Poppler field is merged
    into every Paddle box it overlaps; unmatched fields from both sides are kept."""
    not_matched = []
    merged_t_b = []
    for pop_t_b in poppler_t_b:
        merged = False
        for pad_t_b in paddle_t_b:
            if pop_t_b.bbox.box_is_inside_another(pad_t_b.bbox, threshold=0.00):
                merged_t_b.append(
                    TextField(bbox=pad_t_b.bbox.merge(pop_t_b.bbox),
                              text=pop_t_b.text))
                merged = True
        if not merged:
            not_matched.append(pop_t_b)
    for pad_t_b in paddle_t_b:
        exists = False
        for mer_t_b in merged_t_b:
            if mer_t_b.bbox.box_is_inside_another(pad_t_b.bbox, threshold=0.0):
                exists = True
        if not exists:
            not_matched.append(pad_t_b)
    merged_t_b.extend(not_matched)
    return merged_t_b
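# merge_text_fields assumes BorderBox exposes box_is_inside_another(other, threshold)
# and merge(other). The project's own implementation is not shown in this section;
# the sketch below uses a hypothetical _Box dataclass to illustrate the
# overlap test and the bounding-box union that the merge step relies on.
from dataclasses import dataclass


@dataclass
class _Box:  # hypothetical stand-in for BorderBox, for illustration only
    x1: int
    y1: int
    x2: int
    y2: int

    @property
    def area(self) -> int:
        return max(0, self.x2 - self.x1) * max(0, self.y2 - self.y1)

    def box_is_inside_another(self, other: "_Box", threshold: float = 0.9) -> bool:
        # Overlap measured against the smaller of the two boxes; with
        # threshold=0.0 (as used above) any non-empty intersection matches.
        ix1, iy1 = max(self.x1, other.x1), max(self.y1, other.y1)
        ix2, iy2 = min(self.x2, other.x2), min(self.y2, other.y2)
        inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
        smaller = min(self.area, other.area)
        return smaller > 0 and inter / smaller > threshold

    def merge(self, other: "_Box") -> "_Box":
        # Smallest box that covers both operands.
        return _Box(min(self.x1, other.x1), min(self.y1, other.y1),
                    max(self.x2, other.x2), max(self.y2, other.y2))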
def convert_cells(cells: dict) -> list:
    converted_cells = []
    for coords, params in cells.items():
        coords_in_px = params[0]
        text_boxes = TextField(
            bbox=BorderBox(
                coords_in_px["top_left"][0],
                coords_in_px["top_left"][1],
                coords_in_px["bottom_right"][0],
                coords_in_px["bottom_right"][1],
            ),
            text=params[-2],
        )
        new_cell = CellLinked(
            coords_in_px["top_left"][0],
            coords_in_px["top_left"][1],
            coords_in_px["bottom_right"][0],
            coords_in_px["bottom_right"][1],
            text_boxes=[text_boxes],
            col=coords[0],
            row=coords[1],
            col_span=params[1],
            row_span=params[2],
        )
        converted_cells.append(new_cell)
    return converted_cells
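# The shape of `cells` is implied by the indexing in convert_cells: keys are
# (col, row) pairs and values are sequences where params[0] holds pixel
# coordinates, params[1]/params[2] the spans, and params[-2] the text. A hedged
# example input; any field beyond what the function reads is an assumption:
example_cells = {
    (0, 0): (
        {"top_left": (10, 12), "bottom_right": (110, 40)},  # params[0]: pixel coords
        2,           # params[1]: col_span
        1,           # params[2]: row_span
        "Revenue",   # params[-2]: cell text
        None,        # params[-1]: not used by convert_cells
    ),
}
# converted = convert_cells(example_cells)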
def merge_closest_text_fields(text_fields: List[TextField]) -> List[TextField]:
    merged_fields: List[TextField] = []
    curr_field: TextField = None
    for text_field in sorted(text_fields,
                             key=lambda x: (x.bbox.top_left_y, x.bbox.top_left_x)):
        if not curr_field:
            curr_field = text_field
            continue
        # Merge fields that sit on the same line (within 10 px vertically)
        # and are at most 20 px apart horizontally.
        if (20 > text_field.bbox.top_left_x - curr_field.bbox.bottom_right_x > -20
                and curr_field.bbox.top_left_y - 10
                < text_field.bbox.top_left_y
                < curr_field.bbox.top_left_y + 10):
            curr_field = TextField(
                bbox=curr_field.bbox.merge(text_field.bbox),
                text=curr_field.text + " " + text_field.text,
            )
        else:
            merged_fields.append(curr_field)
            curr_field = text_field
    if curr_field:
        merged_fields.append(curr_field)
    return merged_fields
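# Quick illustration of the merge rule above: two fields on the same line and
# within 20 px of each other are combined into one field with a merged bbox and
# concatenated text; a distant field on the same line stays separate.
# Coordinates are made up for the example.
_line = [
    TextField(bbox=BorderBox(10, 100, 60, 115), text="Total"),
    TextField(bbox=BorderBox(70, 101, 130, 116), text="amount"),  # 10 px gap -> merged
    TextField(bbox=BorderBox(300, 101, 360, 116), text="1,250"),  # far away -> kept separate
]
# merge_closest_text_fields(_line) -> [TextField("Total amount"), TextField("1,250")]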
def actualize_text(table: StructuredTable, image_path: Path):
    with TextExtractor(str(image_path.absolute())) as te:
        for cell in table.cells:
            if not cell.text_boxes or any(
                    not text_box.text for text_box in cell.text_boxes):
                text, _ = te.extract(cell.top_left_x, cell.top_left_y,
                                     cell.width, cell.height)
                cell.text_boxes.append(TextField(bbox=cell, text=text))
def extract_table_text(self, img: numpy.ndarray,
                       border_box: BorderBox) -> List[TextField]:
    x1, y1, x2, y2 = border_box.box
    dt_boxes, _ = self.text_detector(img[y1:y2, x1:x2])
    bboxes = paddle_result_to_bboxes(dt_boxes)
    return [
        TextField(bbox=cell, text="")
        for cell in (BorderBox(b[0] + x1, b[1] + y1, b[2] + x1, b[3] + y1)
                     for b in bboxes)
    ]
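# PaddleOCR's detector returns quadrilaterals (four corner points per text
# region); paddle_result_to_bboxes is assumed to reduce each quad to an
# axis-aligned (x1, y1, x2, y2) tuple in crop coordinates, which the code above
# then offsets back into page coordinates. A hedged sketch under that assumption:
def _paddle_result_to_bboxes_sketch(dt_boxes) -> List[Tuple[int, int, int, int]]:
    bboxes = []
    for quad in dt_boxes:  # quad: 4 x 2 array of corner coordinates
        xs = [int(p[0]) for p in quad]
        ys = [int(p[1]) for p in quad]
        bboxes.append((min(xs), min(ys), max(xs), max(ys)))
    return bboxes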
def poppler_text_field_to_text_field(pt_field: PopplerTextField, scale: float):
    return TextField(
        bbox=bounding_box_to_bbox(pt_field.bbox, scale),
        text=pt_field.text,
    )
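# Poppler reports text boxes in PDF units while the rest of the pipeline works
# in image pixels, hence the `scale` factor. A minimal sketch of what
# bounding_box_to_bbox might do, assuming the Poppler rectangle exposes
# x1/y1/x2/y2 (the project's real helper is not shown in this section):
def _bounding_box_to_bbox_sketch(rect, scale: float) -> "BorderBox":
    return BorderBox(
        top_left_x=int(rect.x1 * scale),
        top_left_y=int(rect.y1 * scale),
        bottom_right_x=int(rect.x2 * scale),
        bottom_right_y=int(rect.y2 * scale),
    )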
def match_inf_res(xlsx_path: Path, images_dir: Path):
    LOGGER.info(
        "Initializing CascadeMaskRCNN with config: %s and model: %s",
        CASCADE_CONFIG_PATH,
        CASCADE_MODEL_PATH,
    )
    cascade_rcnn_detector = CascadeRCNNInferenceService(
        CASCADE_CONFIG_PATH, CASCADE_MODEL_PATH, True)
    pages = []
    workbook = load_workbook(str(xlsx_path.absolute()), data_only=True)
    for page_num, worksheet in enumerate(workbook.worksheets):
        # Find the last non-empty row.
        row_fill = {}
        for row_id in range(1, worksheet.max_row + 1):
            row_fill[row_id] = False
            for col_id in range(1, worksheet.max_column + 1):
                if worksheet.cell(row_id, col_id).value:
                    row_fill[row_id] = True
                    break
        last_row = worksheet.max_row
        for row_id, not_empty in sorted(
                [(row_id, not_empty) for row_id, not_empty in row_fill.items()],
                reverse=True,
                key=lambda x: x[0],
        ):
            if not_empty:
                if last_row == worksheet.max_row:
                    last_row += 1
                break
            last_row = row_id
        # Find the last non-empty column.
        col_fill = {}
        for col_id in range(1, worksheet.max_column + 1):
            col_fill[col_id] = False
            for row_id in range(1, worksheet.max_row + 1):
                if worksheet.cell(row_id, col_id).value:
                    col_fill[col_id] = True
                    break
        last_col = worksheet.max_column
        for col_id, not_empty in sorted(
                [(col_id, not_empty) for col_id, not_empty in col_fill.items()],
                reverse=True,
                key=lambda x: x[0],
        ):
            if not_empty:
                if last_col == worksheet.max_column:
                    last_col += 1
                break
            last_col = col_id
        # Accumulate the sheet's height and width in spreadsheet units.
        height = 0
        for row_id in range(1, last_row):
            if worksheet.row_dimensions[row_id].height:
                height += worksheet.row_dimensions[row_id].height
            else:
                height += DEFAULT_HEIGHT
        width = 0
        for col_id in range(1, last_col):
            if worksheet.column_dimensions[get_column_letter(col_id)].width:
                width += worksheet.column_dimensions[get_column_letter(col_id)].width
            else:
                width += DEFAULT_WIDTH
        if height == 0 or width == 0:
            continue
        img = cv2.imread(str((images_dir / f"{page_num}.png").absolute()))
        if img is None:
            LOGGER.warning(
                "Image is empty or none, skipping processing on page %s",
                page_num)
            continue
        img_shape = img.shape[:2]
        tables_proposals = clust_tables(worksheet, last_row, last_col)
        row_dim, col_dim = get_grid(worksheet, last_row, last_col)
        # Scale spreadsheet coordinates to image pixels.
        y_scale = img_shape[0] / height
        x_scale = img_shape[1] / width
        row_dim = [dim * y_scale for dim in row_dim]
        col_dim = [dim * x_scale for dim in col_dim]
        headers = []
        if not any(s > 10000 for s in img_shape) and last_row < 1000:
            _, headers = cascade_rcnn_detector.inference_image(
                images_dir / f"{page_num}.png", padding=200)
        tables = [
            comp_table(worksheet, row_dim, col_dim, (prop[0], prop[1]),
                       (prop[2], prop[3]), headers)
            for prop in tables_proposals
        ]
        tables = [
            table for table in tables
            if len(table.cells) + sum(len(h) for h in table.header) > 3
        ]
        blocks = []
        blocks.extend(tables)
        # Emit standalone text fields for cells that are outside every table proposal.
        prev_row_coord = 0
        for row_id in range(1, last_row):
            row_coord = prev_row_coord + (
                worksheet.row_dimensions[row_id].height
                if worksheet.row_dimensions[row_id].height else DEFAULT_HEIGHT)
            prev_col_coord = 0
            for col_id in range(1, last_col):
                col_coord = prev_col_coord + (
                    worksheet.column_dimensions[get_column_letter(col_id)].width
                    if worksheet.column_dimensions[get_column_letter(col_id)].width
                    else DEFAULT_WIDTH)
                if worksheet.cell(row_id, col_id).value and not any(
                        y1 <= row_id <= y2 and x1 <= col_id <= x2
                        for y1, x1, y2, x2 in tables_proposals):
                    text_field = TextField(
                        bbox=BorderBox(
                            top_left_x=prev_col_coord * x_scale,
                            top_left_y=prev_row_coord * y_scale,
                            bottom_right_x=col_coord * x_scale,
                            bottom_right_y=row_coord * y_scale,
                        ),
                        text=extract_cell_value(worksheet.cell(row_id, col_id)),
                    )
                    blocks.append(text_field)
                prev_col_coord = col_coord
            prev_row_coord = row_coord
        pages.append(
            page_to_dict(
                Page(
                    page_num=page_num,
                    bbox=BorderBox(
                        top_left_x=0,
                        top_left_y=0,
                        bottom_right_x=img_shape[1],
                        bottom_right_y=img_shape[0],
                    ),
                    tables=blocks,
                )))
    workbook.save(str(xlsx_path.absolute()))
    workbook.close()
    return pages
def comp_table(worksheet: Worksheet, row_dim: List[float],
               col_dim: List[float], s_cell: Tuple[int, int],
               e_cell: Tuple[int, int], headers: List[Cell]):
    m_ranges = []
    for m_range in worksheet.merged_cells.ranges:
        m_ranges.append(m_range)
    s_row, s_col = s_cell
    e_row, e_col = e_cell
    e_row = min(e_row, len(row_dim) - 1)
    e_col = min(e_col, len(col_dim) - 1)
    cells = []
    m_range_included = []
    for row in range(s_row, e_row + 1):
        for col in range(s_col, e_col + 1):
            is_in_merged = False
            cur_m_range = None
            for m_range in m_ranges:
                if (row, col) in list(m_range.cells):
                    is_in_merged = True
                    cur_m_range = m_range
                    break
            skip = False
            if is_in_merged:
                for m_range in m_range_included:
                    if (row, col) in list(m_range.cells):
                        skip = True
                        break
            if skip:
                continue
            if is_in_merged and cur_m_range:
                m_range_included.append(cur_m_range)
                cells.append(
                    CellLinked(
                        top_left_y=int(row_dim[cur_m_range.min_row - 1]),
                        top_left_x=int(col_dim[cur_m_range.min_col - 1]),
                        bottom_right_y=int(
                            row_dim[min(cur_m_range.max_row, len(row_dim) - 1)]),
                        bottom_right_x=int(
                            col_dim[min(cur_m_range.max_col, len(col_dim) - 1)]),
                        col=col - 1,
                        row=row - 1,
                        col_span=cur_m_range.max_col - cur_m_range.min_col + 1,
                        row_span=cur_m_range.max_row - cur_m_range.min_row + 1,
                        text_boxes=[
                            TextField(
                                bbox=BorderBox(
                                    top_left_y=int(
                                        row_dim[cur_m_range.min_row - 1]),
                                    top_left_x=int(
                                        col_dim[cur_m_range.min_col - 1]),
                                    bottom_right_y=int(row_dim[min(
                                        cur_m_range.max_row, len(row_dim) - 1)]),
                                    bottom_right_x=int(col_dim[min(
                                        cur_m_range.max_col, len(col_dim) - 1)]),
                                ),
                                text=extract_cell_value(cur_m_range.start_cell),
                            )
                        ],
                    ))
            else:
                cells.append(
                    CellLinked(
                        top_left_y=int(row_dim[row - 1]),
                        top_left_x=int(col_dim[col - 1]),
                        bottom_right_y=int(row_dim[row]),
                        bottom_right_x=int(col_dim[col]),
                        col=col - 1,
                        row=row - 1,
                        col_span=1,
                        row_span=1,
                        text_boxes=[
                            TextField(
                                bbox=BorderBox(
                                    top_left_y=int(row_dim[row - 1]),
                                    top_left_x=int(col_dim[col - 1]),
                                    bottom_right_y=int(row_dim[row]),
                                    bottom_right_x=int(col_dim[col]),
                                ),
                                text=extract_cell_value(worksheet.cell(row, col)),
                            )
                        ],
                    ))
    struct_table = StructuredTable(
        bbox=BorderBox(
            top_left_y=int(row_dim[s_row - 1]),
            top_left_x=int(col_dim[s_col - 1]),
            bottom_right_y=int(row_dim[e_row]),
            bottom_right_x=int(col_dim[e_col]),
        ),
        cells=cells,
    )
    struct_table_headered = get_headers_using_structured(struct_table, headers)
    if len(struct_table_headered.cells) + sum(
            len(h) for h in struct_table_headered.header) > 3:
        head_cells = []
        for pack in struct_table_headered.header:
            head_cells.extend(pack)
        for cell in head_cells:
            col = cell.col + 1
            row = cell.row + 1
            col_span = cell.col_span
            row_span = cell.row_span
            for r in range(row, row + row_span):
                for c in range(col, col + col_span):
                    worksheet.cell(r, c).fill = HEADER_FILL
        for cell in struct_table_headered.cells:
            col = cell.col + 1
            row = cell.row + 1
            col_span = cell.col_span
            row_span = cell.row_span
            for r in range(row, row + row_span):
                for c in range(col, col + col_span):
                    worksheet.cell(r, c).fill = PatternFill(
                        start_color="CC55BB",
                        end_color="CC55BB",
                        fill_type="solid")
    return struct_table_headered
def process_page(self, image_path: Path, output_path: Path,
                 poppler_page) -> Dict[str, Any]:
    img = cv2.imread(str(image_path.absolute()))
    page = Page(page_num=int(image_path.name.split(".")[0]),
                bbox=BorderBox(top_left_x=0,
                               top_left_y=0,
                               bottom_right_x=img.shape[1],
                               bottom_right_y=img.shape[0]))
    text_fields = self._scale_poppler_result(img, output_path, poppler_page,
                                              image_path)
    inference_tables, headers = self.inference_service.inference_image(
        image_path)
    if not inference_tables:
        return page_to_dict(page)
    has_bordered = any(i_tab.label == 'Bordered' for i_tab in inference_tables)
    self.visualizer.draw_object_and_save(
        img, inference_tables,
        Path(f"{output_path}/inference_result/{image_path.name}"))

    text_fields_to_match = text_fields
    semi_bordered_tables = []
    detected_tables = []
    for inf_table in inference_tables:
        in_inf_table, text_fields_to_match = match_table_text(
            inf_table, text_fields_to_match)
        paddle_fields = self.text_detector.extract_table_text(
            img, inf_table.bbox)
        if paddle_fields:
            in_inf_table = merge_text_fields(paddle_fields, in_inf_table)
        mask_rcnn_count_matches, not_matched = match_cells_text_fields(
            inf_table.tags, in_inf_table)
        if inf_table.label == 'Borderless':
            semi_border = semi_bordered(img, inf_table)
            if semi_border:
                semi_bordered_tables.append(semi_border)
                semi_border_score = match_cells_table(in_inf_table, semi_border)
                if (semi_border_score >= mask_rcnn_count_matches
                        and semi_border.count_cells() > len(inf_table.tags)):
                    struct_table = semi_border_to_struct(semi_border, img.shape)
                    if struct_table:
                        detected_tables.append(
                            (semi_border_score, struct_table))
                        continue
        struct = self.extract_table_from_inference(img, inf_table, not_matched,
                                                   img.shape, image_path)
        if struct:
            detected_tables.append((mask_rcnn_count_matches, struct))

    # Fall back to classic bordered-table detection when the inference result
    # is bordered or poorly matched by text fields.
    if has_bordered or any(score < 0.2 * len(table.cells)
                           for score, table in detected_tables):
        image = detect_tables_on_page(
            image_path, draw=self.visualizer.should_visualize)
        if image.tables:
            text_fields_to_match = text_fields
            for bordered_table in image.tables:
                matched = False
                for score, inf_table in detected_tables:
                    if inf_table.bbox.box_is_inside_another(bordered_table.bbox):
                        in_table, text_fields_to_match = match_table_text(
                            inf_table, text_fields_to_match)
                        paddle_fields = self.text_detector.extract_table_text(
                            img, inf_table.bbox)
                        if paddle_fields:
                            in_table = merge_text_fields(paddle_fields, in_table)
                        bordered_score = match_cells_table(in_table,
                                                           bordered_table)
                        if bordered_score >= score * 0.5 \
                                and bordered_table.count_cells() >= len(inf_table.cells) * 0.5:
                            struct_table = semi_border_to_struct(
                                bordered_table, img.shape)
                            if struct_table:
                                page.tables.append(struct_table)
                        else:
                            page.tables.append(inf_table)
                        detected_tables.remove((score, inf_table))
                        matched = True
                        break
                if not matched:
                    in_table, text_fields_to_match = match_table_text(
                        bordered_table, text_fields_to_match)
                    _ = match_cells_table(in_table, bordered_table)
                    struct_table = semi_border_to_struct(bordered_table,
                                                         img.shape)
                    if struct_table:
                        page.tables.append(struct_table)
            if detected_tables:
                page.tables.extend(
                    [inf_table for _, inf_table in detected_tables])
        else:
            page.tables.extend([tab for _, tab in detected_tables])
    else:
        page.tables.extend([tab for _, tab in detected_tables])

    for table in page.tables:
        actualize_text(table, image_path)

    # TODO: Headers should be created only once
    cell_header_scores = []
    for table in page.tables:
        cell_header_scores.extend(
            self.header_checker.get_cell_scores(table.cells))

    self.visualizer.draw_object_and_save(
        img, cell_header_scores,
        output_path / 'cells_header' / f"{page.page_num}.png")

    tables_with_header = []
    for table in page.tables:
        header_rows = self.create_header(table.rows, headers, 6)
        table_with_header = StructuredTableHeadered.from_structured_and_rows(
            table, header_rows)
        header_cols = self.create_header(table.cols, headers, 5)
        # TODO: Cells should be actualized only once
        table_with_header.actualize_header_with_cols(header_cols)
        tables_with_header.append(table_with_header)
    page.tables = tables_with_header

    # Extract free text from the page regions between detected tables.
    with TextExtractor(str(image_path.absolute()),
                       seg_mode=PSM.SPARSE_TEXT) as extractor:
        text_borders = [1]
        for table in page.tables:
            _, y, _, y2 = table.bbox.box
            text_borders.extend([y, y2])
        text_borders.append(img.shape[0])
        text_candidate_boxes: List[BorderBox] = []
        for i in range(len(text_borders) // 2):
            if text_borders[i * 2 + 1] - text_borders[i * 2] > 3:
                text_candidate_boxes.append(
                    BorderBox(
                        top_left_x=1,
                        top_left_y=text_borders[i * 2],
                        bottom_right_x=img.shape[1],
                        bottom_right_y=text_borders[i * 2 + 1],
                    ))
        for box in text_candidate_boxes:
            text, _ = extractor.extract(box.top_left_x, box.top_left_y,
                                        box.width, box.height)
            if text:
                page.text.append(TextField(box, text))

    self.visualizer.draw_object_and_save(
        img, semi_bordered_tables,
        output_path.joinpath('semi_bordered_tables').joinpath(image_path.name))
    self.visualizer.draw_object_and_save(
        img, page.tables,
        output_path.joinpath('tables').joinpath(image_path.name))

    page_dict = page_to_dict(page)
    if self.visualizer.should_visualize:
        save_page(page_dict, output_path / 'pages' / f"{page.page_num}.json")
    return page_dict
def process_page(self, image_path: Path, output_path: Path,
                 poppler_page) -> Dict[str, Any]:
    img = cv2.imread(str(image_path.absolute()))
    page = Page(
        page_num=int(image_path.name.split(".")[0]),
        bbox=BorderBox(
            top_left_x=0,
            top_left_y=0,
            bottom_right_x=img.shape[1],
            bottom_right_y=img.shape[0],
        ),
    )
    text_fields = self._scale_poppler_result(img, output_path, poppler_page,
                                              image_path)

    logger.info("Start inference")
    inference_tables, headers = self.inference_service.inference_image(
        image_path)
    logger.info("End inference")
    self.visualizer.draw_object_and_save(
        img,
        inference_tables,
        Path(f"{output_path}/inference_result/{image_path.name}"),
        headers=headers,
    )

    if inference_tables:
        logger.info("Start bordered")
        image = detect_tables_on_page(
            image_path, draw=self.visualizer.should_visualize)
        logger.info("End bordered")
        text_fields_to_match = text_fields
        bordered_tables = []
        if image.tables:
            for bordered_table in image.tables:
                in_table, text_fields_to_match = match_table_text(
                    bordered_table, text_fields_to_match)
                _ = match_cells_table(in_table, bordered_table)
                bordered_tables.append(
                    semi_border_to_struct(bordered_table, img.shape))

        # Prefer the bordered detection when it covers an inference table well;
        # the remaining inference tables go through the detection pipeline.
        inf_tables_to_detect = []
        for inf_table in inference_tables:
            matched = False
            if image.tables:
                for bordered_table in bordered_tables:
                    if (inf_table.bbox.box_is_inside_another(
                            bordered_table.bbox, 0.8)
                            and inf_table.label == "Bordered"
                            and len(bordered_table.cells)
                            > len(inf_table.tags) * 0.5):
                        matched = True
                        page.tables.append(bordered_table)
            if not matched:
                inf_tables_to_detect.append(inf_table)

        semi_bordered_tables = []
        for inf_table in inf_tables_to_detect:
            in_inf_table, text_fields_to_match = match_table_text(
                inf_table, text_fields_to_match)

            logger.info("Start paddle")
            paddle_fields = self.text_detector.extract_table_text(
                img, inf_table.bbox)
            logger.info("End paddle")
            if paddle_fields:
                in_inf_table = merge_text_fields(paddle_fields, in_inf_table)
            mask_rcnn_count_matches, not_matched = match_cells_text_fields(
                inf_table.tags, in_inf_table)

            if inf_table.label == "Borderless" and False:  # branch intentionally disabled
                semi_border = semi_bordered(img, inf_table)
                if semi_border:
                    semi_bordered_tables.append(semi_border)
                    semi_border_score = match_cells_table(in_inf_table,
                                                          semi_border)
                    if (semi_border_score >= mask_rcnn_count_matches
                            and semi_border.count_cells()
                            > len(inf_table.tags)):
                        struct_table = semi_border_to_struct(
                            semi_border, img.shape)
                        if struct_table:
                            page.tables.append(struct_table)
                            continue
            struct = self.extract_table_from_inference(
                img, inf_table, not_matched, img.shape, image_path)
            if struct:
                page.tables.append(struct)

        for table in page.tables:
            actualize_text(table, image_path, img.shape[:2])

        # TODO: Headers should be created only once
        cell_header_scores = []
        for table in page.tables:
            cell_header_scores.extend(
                self.header_checker.get_cell_scores(table.cells))

        self.visualizer.draw_object_and_save(
            img,
            cell_header_scores,
            output_path / "cells_header" / f"{page.page_num}.png",
        )

        tables_with_header = []
        for table in page.tables:
            header_rows = self.create_header(table.rows, headers, 5)
            table_with_header = (
                StructuredTableHeadered.from_structured_and_rows(
                    table, header_rows))
            header_cols = self.create_header(table.cols, headers, 1)
            # TODO: Cells should be actualized only once
            table_with_header.actualize_header_with_cols(header_cols)
            tables_with_header.append(table_with_header)
        page.tables = tables_with_header

        self.visualizer.draw_object_and_save(
            img,
            semi_bordered_tables,
            output_path.joinpath("semi_bordered_tables").joinpath(
                image_path.name),
        )
        self.visualizer.draw_object_and_save(
            img,
            page.tables,
            output_path.joinpath("tables").joinpath(image_path.name),
        )

    logger.info("Start text extraction")
    with TextExtractor(str(image_path.absolute()),
                       seg_mode=PSM.SPARSE_TEXT) as extractor:
        text_borders = [1]
        for table in page.tables:
            _, y, _, y2 = table.bbox.box
            text_borders.extend([y, y2])
        text_borders.append(img.shape[0])
        text_candidate_boxes: List[BorderBox] = []
        for i in range(len(text_borders) // 2):
            if text_borders[i * 2 + 1] - text_borders[i * 2] > 3:
                text_candidate_boxes.append(
                    BorderBox(
                        top_left_x=1,
                        top_left_y=text_borders[i * 2],
                        bottom_right_x=img.shape[1],
                        bottom_right_y=text_borders[i * 2 + 1],
                    ))
        for box in text_candidate_boxes:
            text, _ = extractor.extract(box.top_left_x, box.top_left_y,
                                        box.width, box.height)
            if text:
                page.text.append(TextField(box, text))
    logger.info("End text extraction")

    page_dict = page_to_dict(page)
    if self.visualizer.should_visualize:
        save_page(page_dict, output_path / "pages" / f"{page.page_num}.json")
    return page_dict