def inference_result_to_boxes(
    inference_page_result: List[Dict[str, Any]]
) -> Tuple[List[InferenceTable], List[Cell], List[BorderBox]]:
    """Split raw inference tags into tables, header cells and leftover cells.

    Tags labelled with one of ``TABLE_TAGS`` become ``InferenceTable``
    objects (passed through ``_filter_double_detection``), tags labelled
    ``"Header"`` become header cells and tags labelled ``CELL_TAG`` become
    cells that are handed to ``match_cells_and_tables``.

    A synthetic ``"Borderless"`` table (confidence 0.5, bbox padded by 50 on
    every side) is appended when more than 20 cells were found and no table
    was detected at all, and again when more than 20 cells are returned by
    ``match_cells_and_tables``.

    Returns:
        ``(tables, header_cells, not_matched_cells)``.
    """

    def _borderless_around(cells):
        # Bounding box of all cells, grown by a fixed margin on each side.
        margin = 50
        return InferenceTable(
            bbox=BorderBox(
                top_left_y=min(c.top_left_y for c in cells) - margin,
                top_left_x=min(c.top_left_x for c in cells) - margin,
                bottom_right_y=max(c.bottom_right_y for c in cells) + margin,
                bottom_right_x=max(c.bottom_right_x for c in cells) + margin,
            ),
            confidence=0.5,
            label="Borderless",
            tags=cells,
        )

    table_tags = [
        tag for tag in inference_page_result if tag["label"] in TABLE_TAGS
    ]
    raw_headers = [
        _raw_to_cell(tag)
        for tag in inference_page_result
        if tag["label"] == "Header"
    ]
    inference_tables: List[InferenceTable] = []
    for table_tag in table_tags:
        inference_tables.append(_raw_to_table(table_tag))
    filtered = _filter_double_detection(inference_tables)

    raw_cells = [
        _raw_to_cell(tag)
        for tag in inference_page_result
        if tag["label"] == CELL_TAG
    ]
    not_matched = match_cells_and_tables(raw_cells, filtered)

    # Many cells but no detected table at all: treat them as one table.
    if not inference_tables and len(raw_cells) > 20:
        filtered.append(_borderless_around(raw_cells))
    # Many cells left over after matching: group them into one more table.
    if len(not_matched) > 20:
        filtered.append(_borderless_around(not_matched))

    return filtered, raw_headers, not_matched
def convert_cells(cells: dict) -> list:
    """Turn a mapping of cell descriptions into a list of ``CellLinked``.

    Each key is indexed as ``(col, row)``.  Each value is a sequence whose
    item 0 is a mapping with ``"top_left"`` and ``"bottom_right"`` coordinate
    pairs, items 1 and 2 are the column span and row span, and whose
    second-to-last item is the cell text.
    """
    result = []
    for position, params in cells.items():
        corners = params[0]
        upper_left = corners["top_left"]
        lower_right = corners["bottom_right"]
        box_args = (
            upper_left[0],
            upper_left[1],
            lower_right[0],
            lower_right[1],
        )
        # The single text box covers exactly the same area as the cell.
        text_field = TextField(bbox=BorderBox(*box_args), text=params[-2])
        result.append(
            CellLinked(
                *box_args,
                text_boxes=[text_field],
                col=position[0],
                row=position[1],
                col_span=params[1],
                row_span=params[2],
            ))
    return result
def bounding_box_to_bbox(bounding_box: PopplerBoundingBox, scale: float):
    """Convert an ``x``/``y``/``width``/``height`` box into a ``BorderBox``.

    Every edge coordinate is multiplied by ``scale`` and truncated with
    ``int``.
    """
    left = bounding_box.x
    top = bounding_box.y
    right = left + bounding_box.width
    bottom = top + bounding_box.height
    return BorderBox(
        top_left_x=int(left * scale),
        top_left_y=int(top * scale),
        bottom_right_x=int(right * scale),
        bottom_right_y=int(bottom * scale),
    )
def find_tables_in_boxes(self, min_rows=2) -> Optional[List[Table]]:
    """Group ``self.objs`` into tables made of rows and columns.

    Boxes are visited in ``(top_left_x, top_left_y)`` order.  A box that no
    existing table accepts (``Table.is_box_from_table``) starts a new table
    with that box as its bbox.  Any other box is filed, inside the first
    table that accepts it, into a row keyed by ``box[1]`` and a column keyed
    by ``box[0]``.

    Returns:
        The tables that have at least ``min_rows`` rows, or ``None`` when
        no table qualifies.
    """
    tables = []
    rows_by_key = {}
    cols_by_key = {}
    ordered = sorted(self.objs, key=lambda b: (b.top_left_x, b.top_left_y))
    for box in ordered:
        owner = next((t for t in tables if t.is_box_from_table(box)), None)
        if owner is None:
            # The seeding box only defines the table; it joins no row/column.
            tables.append(Table(bbox=box, table_id=len(tables)))
            continue

        row_key = box[1]
        row = rows_by_key.get(row_key)
        if row is not None and row.table_id == owner.table_id:
            row.add(box)
        else:
            # New row: spans from the box to the table's right edge.
            row = Row(
                bbox=BorderBox(box[0], box[1], owner.bbox[2], box[3]),
                table_id=owner.table_id,
            )
            row.add(box)
            owner.rows.append(row)
            rows_by_key[row_key] = row

        col_key = box[0]
        col = cols_by_key.get(col_key)
        if col is not None and col.table_id == owner.table_id:
            col.add(box)
        else:
            # New column: spans from the box to the table's bottom edge.
            col = Column(
                bbox=BorderBox(box[0], box[1], box[2], owner.bbox[3]),
                table_id=owner.table_id,
            )
            col.add(box)
            owner.cols.append(col)
            cols_by_key[col_key] = col

    kept = [t for t in tables if len(t.rows) >= min_rows]
    return kept or None
def extract_table_text(self, img: numpy.ndarray,
                       border_box: BorderBox) -> List[TextField]:
    """Detect text regions inside ``border_box`` of ``img``.

    The image is cropped to the box, passed to ``self.text_detector`` and
    the detections are converted with ``paddle_result_to_bboxes``.  Each
    resulting box is shifted back by the crop offset so it is expressed in
    the coordinates of ``img``.

    Returns:
        One ``TextField`` with empty text per detected box.
    """
    left, top, right, bottom = border_box.box
    crop = img[top:bottom, left:right]
    detections, _elapsed = self.text_detector(crop)

    fields = []
    for found in paddle_result_to_bboxes(detections):
        shifted = BorderBox(
            found[0] + left,
            found[1] + top,
            found[2] + left,
            found[3] + top,
        )
        fields.append(TextField(bbox=shifted, text=""))
    return fields
def _raw_to_table(raw_table: Dict[str, Any]) -> InferenceTable:
    """Build an ``InferenceTable`` from one raw detection dict.

    ``raw_table['bbox']`` is unpacked as
    ``(top_left_x, top_left_y, bottom_right_x, bottom_right_y)``; the
    ``'score'`` and ``'label'`` entries become confidence and label.
    """
    left, top, right, bottom = raw_table['bbox']
    box = BorderBox(
        top_left_y=top,
        top_left_x=left,
        bottom_right_y=bottom,
        bottom_right_x=right,
    )
    return InferenceTable(
        bbox=box,
        confidence=raw_table['score'],
        label=raw_table['label'],
    )
def _raw_to_table(raw_table: Dict[str, Any]) -> InferenceTable:
    """Convert a raw detection dictionary into an ``InferenceTable``.

    The ``"bbox"`` entry must unpack into four values ordered as
    ``top_left_x, top_left_y, bottom_right_x, bottom_right_y``.  The
    ``"score"`` entry is used as confidence and ``"label"`` as label.
    """
    x_min, y_min, x_max, y_max = raw_table["bbox"]
    bbox = BorderBox(
        top_left_y=y_min,
        top_left_x=x_min,
        bottom_right_y=y_max,
        bottom_right_x=x_max,
    )
    confidence = raw_table["score"]
    label = raw_table["label"]
    return InferenceTable(bbox=bbox, confidence=confidence, label=label)
def excel_to_structured(excel_table: dict) -> StructuredTable:
    """
    Converts data from excel to structured table.

    Cells come from ``convert_cells(excel_table['cells'])``.  The table bbox
    is taken from the ``'top_left'`` pair of ``excel_table['dimensions'][0]``
    and the ``'bottom_right'`` pair of ``excel_table['dimensions'][1]``.
    """
    converted = convert_cells(excel_table['cells'])
    dimensions = excel_table['dimensions']
    upper_left = dimensions[0]['top_left']
    lower_right = dimensions[1]['bottom_right']
    bbox = BorderBox(
        upper_left[0],
        upper_left[1],
        lower_right[0],
        lower_right[1],
    )
    return StructuredTable(cells=converted, bbox=bbox)
def excel_to_structured(excel_table: dict) -> StructuredTable:
    """
    Converts data from excel to structured table.

    ``excel_table["cells"]`` is converted with ``convert_cells``.  The bbox
    uses ``excel_table["dimensions"][0]["top_left"]`` for the first two
    coordinates and ``excel_table["dimensions"][1]["bottom_right"]`` for the
    last two.
    """
    cells = convert_cells(excel_table["cells"])
    first_dim, second_dim = (
        excel_table["dimensions"][0],
        excel_table["dimensions"][1],
    )
    start = first_dim["top_left"]
    end = second_dim["bottom_right"]
    return StructuredTable(
        cells=cells,
        bbox=BorderBox(start[0], start[1], end[0], end[1]),
    )
def construct_rows_from_boxes(cells: List[Cell], x_max) -> List[Row]:
    """Group cells into rows by their ``box[1]`` coordinate.

    Cells are processed in ``(top_left_x, top_left_y)`` order.  The first
    cell seen for a given ``box[1]`` creates a ``Row`` (``table_id=1``) that
    spans from that cell to ``x_max``; later cells with the same key are
    added to it.

    Returns:
        The rows in order of creation.
    """
    rows_by_top = {}
    ordered = sorted(cells, key=lambda c: (c.top_left_x, c.top_left_y))
    for box in ordered:
        top = box[1]
        row = rows_by_top.get(top)
        if row is None:
            row = Row(
                bbox=BorderBox(box[0], box[1], x_max, box[3]),
                table_id=1,
            )
            rows_by_top[top] = row
        row.add(box)
    return list(rows_by_top.values())
def write(self):
    """Write all tables into the workbook and return per-sheet page dicts.

    ``self.tables_with_headers`` maps a sheet name to its tables.  The first
    entry is written to the workbook's active sheet; every later entry gets
    a sheet created with the mapping key as its name.  Header cells are
    written with ``HEADER_FILL`` / ``HEADER_FONT``, body cells without extra
    styling.  The workbook is saved to ``self.outpath`` at the end.

    A cell without text boxes is written as an empty value, and a sheet
    without tables produces a page whose bbox is ``(0, 0, 0, 0)``; both
    cases previously raised (``IndexError`` / ``ValueError``).

    Returns:
        A list with one ``page_to_dict`` result per sheet, in mapping order.
    """

    def _text_of(cell):
        # Cells may legitimately carry no text boxes (empty grid cells).
        return cell.text_boxes[0].text if cell.text_boxes else None

    pages = []
    for i, (sheet, tables) in enumerate(self.tables_with_headers.items()):
        ws = self.wb.create_sheet(sheet) if i else self.wb.active
        for table in tables:
            for header_cells in table.header:
                for cell in header_cells:
                    added_cell = ws.cell(
                        row=cell.row,
                        column=cell.col,
                        value=_text_of(cell),
                    )
                    added_cell.fill = HEADER_FILL
                    added_cell.font = HEADER_FONT
            for cell in table.cells:
                ws.cell(
                    row=cell.row,
                    column=cell.col,
                    value=_text_of(cell),
                )
        # The page spans from the origin to the furthest table corner;
        # ``default=0`` keeps an empty sheet from raising ValueError.
        page_bbox = BorderBox(
            top_left_x=0,
            top_left_y=0,
            bottom_right_x=max(
                (table.bbox.bottom_right_x for table in tables), default=0),
            bottom_right_y=max(
                (table.bbox.bottom_right_y for table in tables), default=0),
        )
        pages.append(
            page_to_dict(Page(page_num=i, bbox=page_bbox, tables=tables)))
    self.wb.save(self.outpath)
    return pages
def process_page(self, image_path: Path, output_path: Path,
                 poppler_page) -> Dict[str, Any]:
    """Extract tables and free text from one page image.

    The page number is parsed from the image file name (the part before the
    first dot).  Inferred tables are either replaced by a line-detected
    ("bordered") table that matches them or reconstructed through
    ``self.extract_table_from_inference``; headers are then attached and the
    areas between tables are passed to ``TextExtractor``.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source; in particular the extent of ``if inference_tables:`` and the
    position of ``continue`` should be confirmed against the original file.

    Returns:
        The result of ``page_to_dict`` for the processed page.
    """
    img = cv2.imread(str(image_path.absolute()))
    page = Page(
        page_num=int(image_path.name.split(".")[0]),
        bbox=BorderBox(
            top_left_x=0,
            top_left_y=0,
            bottom_right_x=img.shape[1],
            bottom_right_y=img.shape[0],
        ),
    )
    text_fields = self._scale_poppler_result(img, output_path, poppler_page,
                                             image_path)
    logger.info("Start inference")
    inference_tables, headers = self.inference_service.inference_image(
        image_path)
    logger.info("End inference")
    self.visualizer.draw_object_and_save(
        img,
        inference_tables,
        Path(f"{output_path}/inference_result/{image_path.name}"),
        headers=headers,
    )
    if inference_tables:
        logger.info("Start bordered")
        image = detect_tables_on_page(
            image_path, draw=self.visualizer.should_visualize)
        logger.info("End bordered")
        text_fields_to_match = text_fields
        bordered_tables = []
        if image.tables:
            # Convert every line-detected table up front; each call to
            # match_table_text hands back the text fields still unassigned.
            for bordered_table in image.tables:
                in_table, text_fields_to_match = match_table_text(
                    bordered_table, text_fields_to_match)
                _ = match_cells_table(in_table, bordered_table)
                bordered_tables.append(
                    semi_border_to_struct(bordered_table, img.shape))
        inf_tables_to_detect = []
        for inf_table in inference_tables:
            matched = False
            if image.tables:
                for bordered_table in bordered_tables:
                    # Accept the bordered table for an inferred "Bordered"
                    # table when the boxes match (0.8 threshold) and it has
                    # more than half as many cells as the inferred tags.
                    # There is no break, so several bordered tables may be
                    # appended for one inferred table.
                    if (inf_table.bbox.box_is_inside_another(
                            bordered_table.bbox, 0.8)
                            and inf_table.label == "Bordered"
                            and len(bordered_table.cells) >
                            len(inf_table.tags) * 0.5):
                        matched = True
                        page.tables.append(bordered_table)
            if not matched:
                inf_tables_to_detect.append(inf_table)
        semi_bordered_tables = []
        for inf_table in inf_tables_to_detect:
            in_inf_table, text_fields_to_match = match_table_text(
                inf_table, text_fields_to_match)
            logger.info("Start paddle")
            paddle_fields = self.text_detector.extract_table_text(
                img, inf_table.bbox)
            logger.info("End paddle")
            if paddle_fields:
                in_inf_table = merge_text_fields(paddle_fields, in_inf_table)
            mask_rcnn_count_matches, not_matched = match_cells_text_fields(
                inf_table.tags, in_inf_table)
            # The trailing ``and False`` disables this branch: it never runs,
            # so semi_bordered_tables always stays empty.
            if inf_table.label == "Borderless" and False:
                semi_border = semi_bordered(img, inf_table)
                if semi_border:
                    semi_bordered_tables.append(semi_border)
                    semi_border_score = match_cells_table(
                        in_inf_table, semi_border)
                    if (semi_border_score >= mask_rcnn_count_matches
                            and semi_border.count_cells() > len(
                                inf_table.tags)):
                        struct_table = semi_border_to_struct(
                            semi_border, img.shape)
                        if struct_table:
                            page.tables.append(struct_table)
                            continue
            struct = self.extract_table_from_inference(
                img, inf_table, not_matched, img.shape, image_path)
            if struct:
                page.tables.append(struct)
        for table in page.tables:
            actualize_text(table, image_path, img.shape[:2])
        # TODO: Headers should be created only once
        cell_header_scores = []
        for table in page.tables:
            cell_header_scores.extend(
                self.header_checker.get_cell_scores(table.cells))
        self.visualizer.draw_object_and_save(
            img,
            cell_header_scores,
            output_path / "cells_header" / f"{page.page_num}.png",
        )
        tables_with_header = []
        for table in page.tables:
            header_rows = self.create_header(table.rows, headers, 5)
            table_with_header = (
                StructuredTableHeadered.from_structured_and_rows(
                    table, header_rows))
            header_cols = self.create_header(table.cols, headers, 1)
            # TODO: Cells should be actualized only once
            table_with_header.actualize_header_with_cols(header_cols)
            tables_with_header.append(table_with_header)
        page.tables = tables_with_header
        self.visualizer.draw_object_and_save(
            img,
            semi_bordered_tables,
            output_path.joinpath("semi_bordered_tables").joinpath(
                image_path.name),
        )
        self.visualizer.draw_object_and_save(
            img,
            page.tables,
            output_path.joinpath("tables").joinpath(image_path.name),
        )
    logger.info("Start text extraction")
    with TextExtractor(str(image_path.absolute()),
                       seg_mode=PSM.SPARSE_TEXT) as extractor:
        # text_borders is a flat list [1, y, y2, y, y2, ..., image_height];
        # consecutive pairs (2i, 2i + 1) delimit the strips between tables.
        # NOTE(review): this presumably relies on page.tables being ordered
        # top to bottom without vertical overlap - confirm.
        text_borders = [1]
        for table in page.tables:
            _, y, _, y2 = table.bbox.box
            text_borders.extend([y, y2])
        text_borders.append(img.shape[0])
        text_candidate_boxes: List[BorderBox] = []
        for i in range(len(text_borders) // 2):
            # Strips of 3 or fewer units in height are skipped.
            if text_borders[i * 2 + 1] - text_borders[i * 2] > 3:
                text_candidate_boxes.append(
                    BorderBox(
                        top_left_x=1,
                        top_left_y=text_borders[i * 2],
                        bottom_right_x=img.shape[1],
                        bottom_right_y=text_borders[i * 2 + 1],
                    ))
        for box in text_candidate_boxes:
            text, _ = extractor.extract(box.top_left_x, box.top_left_y,
                                        box.width, box.height)
            if text:
                page.text.append(TextField(box, text))
    logger.info("End text extraction")
    page_dict = page_to_dict(page)
    if self.visualizer.should_visualize:
        save_page(page_dict,
                  output_path / "pages" / f"{page.page_num}.json")
    return page_dict
def reconstruct_table_from_grid(
        grid_table: GridTable,
        cells: List[Cell]) -> Tuple[Optional[StructuredTable], List[Cell]]:
    """Snap detected cells onto a grid and build a ``StructuredTable``.

    For every cell, the grid rows and columns for which
    ``box_is_inside_another(cell, 0.0)`` holds are collected.  A cell that
    hits at least one row and one column becomes a ``CellLinked`` spanning
    from the first to the last hit row/column; the grid cells it covers are
    marked as used.  Cells that hit nothing are reported as not matched.
    Every grid cell left unused is added as an empty 1x1 ``CellLinked``.

    Returns:
        ``(table, not_matched)``; or ``(None, cells)`` when the grid has no
        columns, no rows or no cells.
    """
    n_cols = len(grid_table.cols)
    # Grid cells not yet covered by a detected cell, keyed by flat index.
    unused_grid_cells = {
        g_cell.row * n_cols + g_cell.col: g_cell
        for g_cell in grid_table.cells
    }

    linked_cells = []
    not_matched = []
    for cell in cells:
        hit_rows = [(r_idx, row)
                    for r_idx, row in enumerate(grid_table.rows)
                    if row.box_is_inside_another(cell, 0.0)]
        hit_cols = [(c_idx, col)
                    for c_idx, col in enumerate(grid_table.cols)
                    if col.box_is_inside_another(cell, 0.0)]
        if not (hit_rows and hit_cols):
            not_matched.append(cell)
            continue

        first_row_idx, first_row = hit_rows[0]
        first_col_idx, first_col = hit_cols[0]
        last_row = hit_rows[-1][1]
        last_col = hit_cols[-1][1]
        linked_cells.append(
            CellLinked(
                top_left_y=first_row.top_left_y,
                top_left_x=first_col.top_left_x,
                bottom_right_y=last_row.bottom_right_y,
                bottom_right_x=last_col.bottom_right_x,
                row=first_row_idx,
                col=first_col_idx,
                row_span=len(hit_rows),
                col_span=len(hit_cols),
                text_boxes=cell.text_boxes,
            ))
        for r_idx, _row in hit_rows:
            for c_idx, _col in hit_cols:
                flat_index = r_idx * n_cols + c_idx
                if unused_grid_cells.get(flat_index):
                    del unused_grid_cells[flat_index]

    for g_cell in unused_grid_cells.values():
        linked_cells.append(
            CellLinked(
                top_left_y=g_cell.top_left_y,
                top_left_x=g_cell.top_left_x,
                bottom_right_y=g_cell.bottom_right_y,
                bottom_right_x=g_cell.bottom_right_x,
                row=g_cell.row,
                col=g_cell.col,
                row_span=1,
                col_span=1,
                text_boxes=[],
            ))

    grid_is_usable = (grid_table.cols and grid_table.rows
                      and grid_table.cells)
    if not grid_is_usable:
        return None, cells

    bbox = BorderBox(
        top_left_y=grid_table.rows[0].top_left_y,
        top_left_x=grid_table.cols[0].top_left_x,
        bottom_right_y=grid_table.rows[-1].bottom_right_y,
        bottom_right_x=grid_table.cols[-1].bottom_right_x,
    )
    return StructuredTable(bbox=bbox, cells=linked_cells), not_matched
def process_page(self, image_path: Path, output_path: Path,
                 poppler_page) -> Dict[str, Any]:
    """Extract tables and free text from one page image.

    Each inferred table is reconstructed (semi-bordered or inference based)
    and stored with a match score.  When an inferred table is labelled
    'Bordered', or any reconstruction scored below 20% of its cell count,
    line-based table detection is run as well and its tables are compared
    with the reconstructed ones.  Headers are then attached and the areas
    between tables are passed to ``TextExtractor``.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source; the position of ``continue`` and the ``if`` each ``else`` belongs
    to should be confirmed against the original file.

    Returns:
        The result of ``page_to_dict`` for the page; returned early, without
        tables or text, when inference finds no tables.
    """
    img = cv2.imread(str(image_path.absolute()))
    page = Page(page_num=int(image_path.name.split(".")[0]),
                bbox=BorderBox(top_left_x=0,
                               top_left_y=0,
                               bottom_right_x=img.shape[1],
                               bottom_right_y=img.shape[0]))
    text_fields = self._scale_poppler_result(img, output_path, poppler_page,
                                             image_path)
    inference_tables, headers = self.inference_service.inference_image(
        image_path)
    if not inference_tables:
        return page_to_dict(page)
    has_bordered = any(
        [i_tab.label == 'Bordered' for i_tab in inference_tables])
    self.visualizer.draw_object_and_save(
        img, inference_tables,
        Path(f"{output_path}/inference_result/{image_path.name}"))
    text_fields_to_match = text_fields
    semi_bordered_tables = []
    # (score, structured table) pairs produced from the inferred tables.
    detected_tables = []
    for inf_table in inference_tables:
        in_inf_table, text_fields_to_match = match_table_text(
            inf_table, text_fields_to_match)
        paddle_fields = self.text_detector.extract_table_text(
            img, inf_table.bbox)
        if paddle_fields:
            in_inf_table = merge_text_fields(paddle_fields, in_inf_table)
        mask_rcnn_count_matches, not_matched = match_cells_text_fields(
            inf_table.tags, in_inf_table)
        if inf_table.label == 'Borderless':
            semi_border = semi_bordered(img, inf_table)
            if semi_border:
                semi_bordered_tables.append(semi_border)
                semi_border_score = match_cells_table(
                    in_inf_table, semi_border)
                # Prefer the semi-bordered result when it scores at least as
                # well and has more cells than the inferred tags.
                if semi_border_score >= mask_rcnn_count_matches and semi_border.count_cells(
                ) > len(inf_table.tags):
                    struct_table = semi_border_to_struct(
                        semi_border, img.shape)
                    if struct_table:
                        detected_tables.append(
                            (semi_border_score, struct_table))
                        continue
        struct = self.extract_table_from_inference(img, inf_table,
                                                   not_matched, img.shape,
                                                   image_path)
        if struct:
            detected_tables.append((mask_rcnn_count_matches, struct))
    if has_bordered or any(score < 0.2 * len(table.cells)
                           for score, table in detected_tables):
        image = detect_tables_on_page(
            image_path, draw=self.visualizer.should_visualize)
        if image.tables:
            # Text matching starts over with the full set of text fields.
            text_fields_to_match = text_fields
            for bordered_table in image.tables:
                matched = False
                # Here ``inf_table`` is a structured table taken from
                # detected_tables, not a raw inference table.
                for score, inf_table in detected_tables:
                    if inf_table.bbox.box_is_inside_another(
                            bordered_table.bbox):
                        in_table, text_fields_to_match = match_table_text(
                            inf_table, text_fields_to_match)
                        paddle_fields = self.text_detector.extract_table_text(
                            img, inf_table.bbox)
                        if paddle_fields:
                            in_table = merge_text_fields(
                                paddle_fields, in_table)
                        bordered_score = match_cells_table(
                            in_table, bordered_table)
                        # The bordered table wins when it reaches half of
                        # the earlier score and half of the cell count.
                        if bordered_score >= score * 0.5 \
                                and bordered_table.count_cells() >= len(inf_table.cells) * 0.5:
                            struct_table = semi_border_to_struct(
                                bordered_table, img.shape)
                            if struct_table:
                                page.tables.append(struct_table)
                        else:
                            page.tables.append(inf_table)
                        # Removing while iterating is safe only because the
                        # loop is left immediately afterwards.
                        detected_tables.remove((score, inf_table))
                        matched = True
                        break
                if not matched:
                    in_table, text_fields_to_match = match_table_text(
                        bordered_table, text_fields_to_match)
                    _ = match_cells_table(in_table, bordered_table)
                    struct_table = semi_border_to_struct(
                        bordered_table, img.shape)
                    if struct_table:
                        page.tables.append(struct_table)
            # Keep the reconstructed tables no bordered table claimed.
            if detected_tables:
                page.tables.extend(
                    [inf_table for _, inf_table in detected_tables])
        else:
            page.tables.extend([tab for _, tab in detected_tables])
    else:
        page.tables.extend([tab for _, tab in detected_tables])
    for table in page.tables:
        actualize_text(table, image_path)
    # TODO: Headers should be created only once
    cell_header_scores = []
    for table in page.tables:
        cell_header_scores.extend(
            self.header_checker.get_cell_scores(table.cells))
    self.visualizer.draw_object_and_save(
        img, cell_header_scores,
        output_path / 'cells_header' / f"{page.page_num}.png")
    tables_with_header = []
    for table in page.tables:
        header_rows = self.create_header(table.rows, headers, 6)
        table_with_header = StructuredTableHeadered.from_structured_and_rows(
            table, header_rows)
        header_cols = self.create_header(table.cols, headers, 5)
        # TODO: Cells should be actualized only once
        table_with_header.actualize_header_with_cols(header_cols)
        tables_with_header.append(table_with_header)
    page.tables = tables_with_header
    with TextExtractor(str(image_path.absolute()),
                       seg_mode=PSM.SPARSE_TEXT) as extractor:
        # text_borders is a flat list [1, y, y2, y, y2, ..., image_height];
        # consecutive pairs (2i, 2i + 1) delimit the strips between tables.
        # NOTE(review): this presumably relies on page.tables being ordered
        # top to bottom without vertical overlap - confirm.
        text_borders = [1]
        for table in page.tables:
            _, y, _, y2 = table.bbox.box
            text_borders.extend([y, y2])
        text_borders.append(img.shape[0])
        text_candidate_boxes: List[BorderBox] = []
        for i in range(len(text_borders) // 2):
            # Strips of 3 or fewer units in height are skipped.
            if text_borders[i * 2 + 1] - text_borders[i * 2] > 3:
                text_candidate_boxes.append(
                    BorderBox(
                        top_left_x=1,
                        top_left_y=text_borders[i * 2],
                        bottom_right_x=img.shape[1],
                        bottom_right_y=text_borders[i * 2 + 1],
                    ))
        for box in text_candidate_boxes:
            text, _ = extractor.extract(box.top_left_x, box.top_left_y,
                                        box.width, box.height)
            if text:
                page.text.append(TextField(box, text))
    self.visualizer.draw_object_and_save(
        img, semi_bordered_tables,
        output_path.joinpath('semi_bordered_tables').joinpath(
            image_path.name))
    self.visualizer.draw_object_and_save(
        img, page.tables,
        output_path.joinpath('tables').joinpath(image_path.name))
    page_dict = page_to_dict(page)
    if self.visualizer.should_visualize:
        save_page(page_dict,
                  output_path / 'pages' / f"{page.page_num}.json")
    return page_dict
def _actualize_line_separators(
        table: GridTable,
        image_shape: Tuple[int, int]) -> Tuple[List[int], List[int]]:
    """Propose extra separator lines for grid cells that hold several cells.

    Grid cells containing more than one cell mark their column and row as
    candidates.  For each candidate column (then each candidate row) the
    lines reported by ``_find_lines`` for every grid cell are collected and
    then greedily merged across neighbouring grid cells for as long as the
    merged zone yields at least as many lines as the starting grid cell.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source; in particular whether ``if len(v) >= ...`` sits inside ``if v:``
    should be confirmed against the original file.

    Returns:
        ``(vertical_lines, horizontal_lines)`` with duplicates removed, or
        ``([], [])`` when no grid cell holds more than one cell.
    """
    span_candidates: Dict[int, GridCell] = {}
    for g_cell in table.cells:
        if len(g_cell.cells) > 1:
            span_candidates[len(table.cols) * g_cell.row +
                            g_cell.col] = g_cell
    if not span_candidates:
        return [], []
    col_candidates = {}
    for g_cell in span_candidates.values():
        col_candidates[g_cell.col] = table.cols[g_cell.col]
    row_candidates = {}
    for g_cell in span_candidates.values():
        row_candidates[g_cell.row] = table.rows[g_cell.row]
    v_lines_to_add = []
    h_lines_to_add = []
    for cand_col in col_candidates.values():
        # One entry per grid cell of the column, so v_lines can be zipped
        # with cand_col.g_cells below.
        v_lines = []
        for g_cell in cand_col.g_cells:
            _, v_cell_lines = _find_lines(g_cell, g_cell.cells, image_shape)
            if v_cell_lines:
                # Keep only lines strictly between the outermost cell edges.
                min_v_cells = min([cell.top_left_x for cell in g_cell.cells])
                max_v_cells = max(
                    [cell.bottom_right_x for cell in g_cell.cells])
                v_cell_lines = list(
                    filter(
                        lambda line: min_v_cells < line < max_v_cells,
                        v_cell_lines,
                    ))
            v_lines.append(v_cell_lines)
        g_cell_v_line = list(zip(cand_col.g_cells, v_lines))
        # Tuples of (index, line count, grid cell, lines), sorted by line
        # count and then top_left_y; entries without lines are dropped.
        cand_v_sort = list(
            filter(
                lambda x: x[3],
                sorted(
                    [(idx, len(v_cell_lines), g_cell, v_cell_lines)
                     for idx, (g_cell,
                               v_cell_lines) in enumerate(g_cell_v_line)],
                    key=lambda x: (x[1], x[2].top_left_y),
                ),
            ))
        i = 0
        while i < len(cand_v_sort):
            idx, l, g_cell, v_cell_lines = cand_v_sort[i]
            if not l:
                i += 1
                continue
            cand_g_cells = g_cell.cells.copy()
            new_v_lines = v_cell_lines
            count_not_broken = 0
            for j in range(i + 1, len(cand_v_sort)):
                jdx, _, cand_j, v_lines_j = cand_v_sort[j]
                # Try to compute v_lines common to the accumulated cells and
                # the next candidate.
                cells_to_check = cand_g_cells.copy()
                cells_to_check.extend(cand_j.cells)
                zone = BorderBox(
                    top_left_x=g_cell.top_left_x,
                    top_left_y=g_cell.top_left_y,
                    bottom_right_x=g_cell.bottom_right_x,
                    bottom_right_y=cand_j.bottom_right_y,
                )
                _, v = _find_lines(zone, cells_to_check, image_shape)
                if v:
                    min_v_cells = min(
                        [cell.top_left_x for cell in cells_to_check])
                    max_v_cells = max(
                        [cell.bottom_right_x for cell in cells_to_check])
                    v = list(
                        filter(lambda line: min_v_cells < line < max_v_cells,
                               v))
                # Extend the merge only while no line is lost compared with
                # the starting grid cell; otherwise stop merging.
                if len(v) >= len(v_cell_lines):
                    cand_g_cells = cells_to_check
                    new_v_lines = v
                    count_not_broken += 1
                else:
                    break
            # Skip the candidates that were merged into this one.
            i += count_not_broken + 1
            v_lines_to_add.extend(new_v_lines)
    for cand_row in row_candidates.values():
        h_lines = []
        for g_cell in cand_row.g_cells:
            h_cell_lines, _ = _find_lines(g_cell, g_cell.cells, image_shape)
            if h_cell_lines:
                # Keep only lines strictly between the outermost cell edges.
                min_h_cells = min([cell.top_left_y for cell in g_cell.cells])
                max_h_cells = max(
                    [cell.bottom_right_y for cell in g_cell.cells])
                h_cell_lines = list(
                    filter(
                        lambda line: min_h_cells < line < max_h_cells,
                        h_cell_lines,
                    ))
            h_lines.append(h_cell_lines)
        g_cell_h_line = list(zip(cand_row.g_cells, h_lines))
        # Unlike the column pass, empty entries are not filtered out here;
        # they are skipped by the ``if not l`` check in the loop below.
        # NOTE(review): the sort key (top_left_y) and the zone below (which
        # grows in y only) are identical to the column pass; for rows one
        # would expect the x axis - confirm whether this is intended.
        cand_h_sort = sorted(
            [(idx, len(h_cell_lines), g_cell, h_cell_lines)
             for idx, (g_cell, h_cell_lines) in enumerate(g_cell_h_line)],
            key=lambda x: (x[1], x[2].top_left_y),
        )
        i = 0
        while i < len(cand_h_sort):
            idx, l, g_cell, h_cell_lines = cand_h_sort[i]
            if not l:
                i += 1
                continue
            cand_g_cells = g_cell.cells.copy()
            new_h_lines = h_cell_lines
            count_not_broken = 0
            for j in range(i + 1, len(cand_h_sort)):
                jdx, _, cand_j, h_lines_j = cand_h_sort[j]
                # Try to compute h_lines common to the accumulated cells and
                # the next candidate.
                cells_to_check = cand_g_cells.copy()
                cells_to_check.extend(cand_j.cells)
                zone = BorderBox(
                    top_left_x=g_cell.top_left_x,
                    top_left_y=g_cell.top_left_y,
                    bottom_right_x=g_cell.bottom_right_x,
                    bottom_right_y=cand_j.bottom_right_y,
                )
                h, _ = _find_lines(zone, cells_to_check, image_shape)
                if h:
                    min_h_cells = min(
                        [cell.top_left_y for cell in cells_to_check])
                    max_h_cells = max(
                        [cell.bottom_right_y for cell in cells_to_check])
                    h = list(
                        filter(lambda line: min_h_cells < line < max_h_cells,
                               h))
                if len(h) >= len(h_cell_lines):
                    cand_g_cells = cells_to_check
                    new_h_lines = h
                    count_not_broken += 1
                else:
                    break
            i += count_not_broken + 1
            h_lines_to_add.extend(new_h_lines)
    return list(set(v_lines_to_add)), list(set(h_lines_to_add))
def comp_table(worksheet: Worksheet, row_dim: List[float],
               col_dim: List[float], s_cell: Tuple[int, int],
               e_cell: Tuple[int, int], headers: List[Cell]):
    """Build a headered structured table from a rectangle of a worksheet.

    Args:
        worksheet: Sheet to read values and merged ranges from; cell fills
            are modified for tables that are large enough (see below).
        row_dim: Row edge coordinates; worksheet row ``r`` spans
            ``row_dim[r - 1]`` .. ``row_dim[r]``.
        col_dim: Column edge coordinates, indexed like ``row_dim``.
        s_cell: ``(row, col)`` of the first worksheet cell of the table.
        e_cell: ``(row, col)`` of the last worksheet cell; clamped to
            ``len(row_dim) - 1`` / ``len(col_dim) - 1``.
        headers: Header boxes passed to ``get_headers_using_structured``.

    Every merged range is emitted once, at the first of its cells met while
    scanning row by row, sized to the whole range and carrying the value of
    the range's start cell.  All other cells are emitted as 1x1 cells.

    When the resulting table has more than 3 cells (header cells included),
    header cells are filled with ``HEADER_FILL`` and body cells with a solid
    "CC55BB" fill.

    Returns:
        The value returned by ``get_headers_using_structured``.
    """

    def _covers(m_range, row, col):
        # Bounds test instead of materializing ``m_range.cells`` each time.
        return (m_range.min_row <= row <= m_range.max_row
                and m_range.min_col <= col <= m_range.max_col)

    def _paint(cell, fill):
        # Apply ``fill`` to every worksheet cell covered by ``cell``;
        # table indices are 0-based, worksheet indices are 1-based.
        first_row = cell.row + 1
        first_col = cell.col + 1
        for r in range(first_row, first_row + cell.row_span):
            for c in range(first_col, first_col + cell.col_span):
                worksheet.cell(r, c).fill = fill

    m_ranges = list(worksheet.merged_cells.ranges)
    s_row, s_col = s_cell
    e_row, e_col = e_cell
    max_row_idx = len(row_dim) - 1
    max_col_idx = len(col_dim) - 1
    e_row = min(e_row, max_row_idx)
    e_col = min(e_col, max_col_idx)

    cells = []
    m_range_included = []
    for row in range(s_row, e_row + 1):
        for col in range(s_col, e_col + 1):
            cur_m_range = next(
                (m for m in m_ranges if _covers(m, row, col)), None)
            if cur_m_range is not None:
                # A merged range is emitted only once.
                if any(_covers(m, row, col) for m in m_range_included):
                    continue
                m_range_included.append(cur_m_range)
                top = int(row_dim[cur_m_range.min_row - 1])
                left = int(col_dim[cur_m_range.min_col - 1])
                bottom = int(row_dim[min(cur_m_range.max_row, max_row_idx)])
                right = int(col_dim[min(cur_m_range.max_col, max_col_idx)])
                col_span = cur_m_range.max_col - cur_m_range.min_col + 1
                row_span = cur_m_range.max_row - cur_m_range.min_row + 1
                text = extract_cell_value(cur_m_range.start_cell)
            else:
                top = int(row_dim[row - 1])
                left = int(col_dim[col - 1])
                bottom = int(row_dim[row])
                right = int(col_dim[col])
                col_span = 1
                row_span = 1
                text = extract_cell_value(worksheet.cell(row, col))

            cells.append(
                CellLinked(
                    top_left_y=top,
                    top_left_x=left,
                    bottom_right_y=bottom,
                    bottom_right_x=right,
                    col=col - 1,
                    row=row - 1,
                    col_span=col_span,
                    row_span=row_span,
                    text_boxes=[
                        TextField(
                            bbox=BorderBox(
                                top_left_y=top,
                                top_left_x=left,
                                bottom_right_y=bottom,
                                bottom_right_x=right,
                            ),
                            text=text,
                        )
                    ],
                ))

    struct_table = StructuredTable(
        bbox=BorderBox(
            top_left_y=int(row_dim[s_row - 1]),
            top_left_x=int(col_dim[s_col - 1]),
            bottom_right_y=int(row_dim[e_row]),
            bottom_right_x=int(col_dim[e_col]),
        ),
        cells=cells,
    )
    struct_table_headered = get_headers_using_structured(
        struct_table, headers)

    total_cells = len(struct_table_headered.cells) + sum(
        len(h) for h in struct_table_headered.header)
    if total_cells > 3:
        for pack in struct_table_headered.header:
            for cell in pack:
                _paint(cell, HEADER_FILL)
        body_fill = PatternFill(start_color="CC55BB",
                                end_color="CC55BB",
                                fill_type="solid")
        for cell in struct_table_headered.cells:
            _paint(cell, body_fill)
    return struct_table_headered
def check_inside_and_put(inf_table: InferenceTable, inf_header: BorderBox):
    """Attach ``inf_header`` to ``inf_table`` when it lies inside the table.

    The check is ``inf_header.box_is_inside_another(inf_table.bbox)``.  On
    success the header is appended to ``inf_table.header_boxes``.

    Returns:
        ``True`` if the header was appended, ``False`` otherwise.
    """
    is_inside = inf_header.box_is_inside_another(inf_table.bbox)
    if not is_inside:
        return False
    inf_table.header_boxes.append(inf_header)
    return True
def check_inside_and_put(inf_table: InferenceTable, inf_cell: BorderBox):
    """Attach ``inf_cell`` to ``inf_table`` when it lies inside the table.

    The check is ``inf_cell.box_is_inside_another(inf_table.bbox)``.  On
    success the cell is appended to ``inf_table.tags``.

    Returns:
        ``True`` if the cell was appended, ``False`` otherwise.
    """
    belongs_to_table = inf_cell.box_is_inside_another(inf_table.bbox)
    if belongs_to_table:
        inf_table.tags.append(inf_cell)
    return True if belongs_to_table else False
def match_inf_res(xlsx_path: Path, images_dir: Path):
    """Build page dictionaries for every worksheet of an xlsx workbook.

    For each worksheet the used extent is determined, the row/column grid is
    scaled to the size of the image ``images_dir / "<page_num>.png"``, tables
    are built with ``comp_table`` from the proposals of ``clust_tables`` and
    every non-empty cell outside all proposals becomes a ``TextField``.
    Worksheets with zero total height or width, or without a readable image,
    are skipped.  The workbook is saved back to ``xlsx_path`` at the end.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source and should be confirmed against the original file.

    Returns:
        A list of ``page_to_dict`` results, one per processed worksheet.
    """
    LOGGER.info(
        "Initializing CascadeMaskRCNN with config: %s and model: %s",
        CASCADE_CONFIG_PATH,
        CASCADE_MODEL_PATH,
    )
    cascade_rcnn_detector = CascadeRCNNInferenceService(
        CASCADE_CONFIG_PATH, CASCADE_MODEL_PATH, True)
    pages = []
    workbook = load_workbook(str(xlsx_path.absolute()), data_only=True)
    for page_num, worksheet in enumerate(workbook.worksheets):
        # row_fill[row_id] is True when the row has at least one truthy
        # cell value.
        row_fill = {}
        for row_id in range(1, worksheet.max_row + 1):
            row_fill[row_id] = False
            for col_id in range(1, worksheet.max_column + 1):
                if worksheet.cell(row_id, col_id).value:
                    row_fill[row_id] = True
                    break
        # Walk the rows bottom-up until the first filled row.  last_row is
        # used below as an exclusive bound (``range(1, last_row)``).
        last_row = worksheet.max_row
        for row_id, not_empty in sorted(
            [(row_id, not_empty) for row_id, not_empty in row_fill.items()],
                reverse=True,
                key=lambda x: x[0],
        ):
            if not_empty:
                if last_row == worksheet.max_row:
                    last_row += 1
                break
            last_row = row_id
        # Same procedure for columns.
        col_fill = {}
        for col_id in range(1, worksheet.max_column + 1):
            col_fill[col_id] = False
            for row_id in range(1, worksheet.max_row + 1):
                if worksheet.cell(row_id, col_id).value:
                    col_fill[col_id] = True
                    break
        last_col = worksheet.max_column
        for col_id, not_empty in sorted(
            [(col_id, not_empty) for col_id, not_empty in col_fill.items()],
                reverse=True,
                key=lambda x: x[0],
        ):
            if not_empty:
                if last_col == worksheet.max_column:
                    last_col += 1
                break
            last_col = col_id
        # Total sheet size in worksheet units; rows/columns without an
        # explicit dimension fall back to DEFAULT_HEIGHT / DEFAULT_WIDTH.
        height = 0
        for row_id in range(1, last_row):
            if worksheet.row_dimensions[row_id].height:
                height += worksheet.row_dimensions[row_id].height
            else:
                height += DEFAULT_HEIGHT
        width = 0
        for col_id in range(1, last_col):
            if worksheet.column_dimensions[get_column_letter(col_id)].width:
                width += worksheet.column_dimensions[get_column_letter(
                    col_id)].width
            else:
                width += DEFAULT_WIDTH
        if height == 0 or width == 0:
            continue
        img = cv2.imread(str((images_dir / f"{page_num}.png").absolute()))
        if img is None:
            LOGGER.warning(
                "Image is empty or none, skipping processing on page %s",
                page_num)
            continue
        img_shape = img.shape[:2]
        tables_proposals = clust_tables(worksheet, last_row, last_col)
        row_dim, col_dim = get_grid(worksheet, last_row, last_col)
        # Scale the worksheet grid to image coordinates.
        y_scale = img_shape[0] / height
        x_scale = img_shape[1] / width
        row_dim = [dim * y_scale for dim in row_dim]
        col_dim = [dim * x_scale for dim in col_dim]
        headers = []
        # Header inference is skipped for images with a side above 10000
        # and for sheets where last_row is 1000 or more.
        if not any([s > 10000 for s in img_shape]) and last_row < 1000:
            _, headers = cascade_rcnn_detector.inference_image(
                images_dir / f"{page_num}.png", padding=200)
        tables = [
            comp_table(worksheet, row_dim, col_dim, (prop[0], prop[1]),
                       (prop[2], prop[3]), headers)
            for prop in tables_proposals
        ]
        # Keep only tables with more than 3 cells, header cells included.
        tables = [
            table for table in tables
            if len(table.cells) + sum([len(h) for h in table.header]) > 3
        ]
        blocks = []
        blocks.extend(tables)
        # Walk the grid again, accumulating unscaled cell edges, and emit a
        # text block for each filled cell that lies outside every proposal
        # (proposals are unpacked as (y1, x1, y2, x2) in row/column ids).
        prev_row_coord = 0
        for row_id in range(1, last_row):
            row_coord = prev_row_coord + (
                worksheet.row_dimensions[row_id].height
                if worksheet.row_dimensions[row_id].height else DEFAULT_HEIGHT)
            prev_col_coord = 0
            for col_id in range(1, last_col):
                col_coord = prev_col_coord + (
                    worksheet.column_dimensions[get_column_letter(col_id)].
                    width if worksheet.column_dimensions[get_column_letter(
                        col_id)].width else DEFAULT_WIDTH)
                if worksheet.cell(row_id, col_id).value and not any([
                        y1 <= row_id <= y2 and x1 <= col_id <= x2
                        for y1, x1, y2, x2 in tables_proposals
                ]):
                    text_field = TextField(
                        bbox=BorderBox(
                            top_left_x=prev_col_coord * x_scale,
                            top_left_y=prev_row_coord * y_scale,
                            bottom_right_x=col_coord * x_scale,
                            bottom_right_y=row_coord * y_scale,
                        ),
                        text=extract_cell_value(worksheet.cell(row_id,
                                                               col_id)),
                    )
                    blocks.append(text_field)
                prev_col_coord = col_coord
            prev_row_coord = row_coord
        pages.append(
            page_to_dict(
                Page(
                    page_num=page_num,
                    bbox=BorderBox(
                        top_left_x=0,
                        top_left_y=0,
                        bottom_right_x=img_shape[1],
                        bottom_right_y=img_shape[0],
                    ),
                    tables=blocks,
                )))
    workbook.save(str(xlsx_path.absolute()))
    workbook.close()
    return pages