示例#1
0
def merge_text_fields(paddle_t_b: List[TextField],
                      poppler_t_b: List[TextField]) -> List[TextField]:
    """Combine Paddle-detected boxes with Poppler text fields.

    For every (Poppler, Paddle) pair where
    ``poppler.bbox.box_is_inside_another(paddle.bbox, threshold=0.0)`` is
    truthy, one new ``TextField`` is produced whose box is the Paddle box
    merged with the Poppler box and whose text is the Poppler text.  A Poppler
    field may therefore contribute several merged fields.

    The result lists the merged fields first, followed by the Poppler fields
    that matched nothing, followed by the Paddle fields for which no merged
    field passes the same ``box_is_inside_another`` check.
    """
    combined: List[TextField] = []
    leftovers: List[TextField] = []

    for poppler_field in poppler_t_b:
        # Remember the size so we can tell afterwards whether anything matched.
        size_before = len(combined)
        for paddle_field in paddle_t_b:
            if not poppler_field.bbox.box_is_inside_another(
                    paddle_field.bbox, threshold=0.00):
                continue
            combined.append(
                TextField(
                    bbox=paddle_field.bbox.merge(poppler_field.bbox),
                    text=poppler_field.text,
                ))
        if len(combined) == size_before:
            leftovers.append(poppler_field)

    for paddle_field in paddle_t_b:
        # Every merged field is checked, exactly as many calls as a plain loop.
        covered = any([
            merged_field.bbox.box_is_inside_another(paddle_field.bbox,
                                                    threshold=0.0)
            for merged_field in combined
        ])
        if not covered:
            leftovers.append(paddle_field)

    return combined + leftovers
示例#2
0
def convert_cells(cells: dict) -> list:
    """Convert a ``{coords: params}`` mapping into a list of ``CellLinked``.

    ``coords`` is indexed as ``coords[0]`` (column) and ``coords[1]`` (row).
    ``params`` is indexed as ``params[0]`` (a dict with ``"top_left"`` and
    ``"bottom_right"`` pixel corners), ``params[1]`` (column span),
    ``params[2]`` (row span) and ``params[-2]`` (cell text).  Each resulting
    cell carries a single ``TextField`` covering the whole cell.
    """

    def _build_cell(grid_pos, params):
        corners = params[0]
        left = corners["top_left"][0]
        top = corners["top_left"][1]
        right = corners["bottom_right"][0]
        bottom = corners["bottom_right"][1]
        whole_cell_text = TextField(
            bbox=BorderBox(left, top, right, bottom),
            text=params[-2],
        )
        return CellLinked(
            left,
            top,
            right,
            bottom,
            text_boxes=[whole_cell_text],
            col=grid_pos[0],
            row=grid_pos[1],
            col_span=params[1],
            row_span=params[2],
        )

    return [_build_cell(grid_pos, params)
            for grid_pos, params in cells.items()]
示例#3
0
def merge_closest_text_fields(text_fields: List[TextField],
                              *,
                              max_x_gap: float = 20,
                              max_y_offset: float = 10) -> List[TextField]:
    """Join horizontally adjacent text fields that sit on the same line.

    Fields are visited in ``(top_left_y, top_left_x)`` order.  A field is
    glued onto the run being built when both conditions hold:

    * the distance from the run's right edge to the field's left edge is
      strictly between ``-max_x_gap`` and ``max_x_gap``;
    * the field's top edge is strictly within ``max_y_offset`` of the run's
      top edge.

    Glued fields get a merged box and their texts joined with one space.

    Args:
        text_fields: fields to merge; the input list is not modified.
        max_x_gap: horizontal tolerance (defaults to the previous hard-coded
            value of 20).
        max_y_offset: vertical tolerance (defaults to the previous hard-coded
            value of 10).

    Returns:
        The merged fields in visiting order; an empty list for empty input.
    """
    merged_fields: List[TextField] = []
    current = None  # run currently being extended, None before the first field
    ordered = sorted(text_fields,
                     key=lambda x: (x.bbox.top_left_y, x.bbox.top_left_x))
    for text_field in ordered:
        if current is None:
            current = text_field
            continue

        x_gap = text_field.bbox.top_left_x - current.bbox.bottom_right_x
        same_line = (current.bbox.top_left_y - max_y_offset <
                     text_field.bbox.top_left_y <
                     current.bbox.top_left_y + max_y_offset)
        if max_x_gap > x_gap > -max_x_gap and same_line:
            current = TextField(
                bbox=current.bbox.merge(text_field.bbox),
                text=current.text + " " + text_field.text,
            )
        else:
            # Too far away: close the current run and start a new one.
            merged_fields.append(current)
            current = text_field

    if current is not None:
        merged_fields.append(current)

    return merged_fields
示例#4
0
def actualize_text(table: StructuredTable, image_path: Path):
    """Fill in missing cell text by running ``TextExtractor`` on the image.

    A cell is re-read when it has no text boxes at all or when at least one
    of its text boxes has empty text.  The extracted text is appended to the
    cell as a new ``TextField`` whose box is the cell itself; existing text
    boxes are left in place.
    """
    image_file = str(image_path.absolute())
    with TextExtractor(image_file) as extractor:
        for cell in table.cells:
            fully_populated = cell.text_boxes and all(
                box.text for box in cell.text_boxes)
            if fully_populated:
                continue
            extracted = extractor.extract(cell.top_left_x, cell.top_left_y,
                                          cell.width, cell.height)
            # extract() returns a pair; only the first element is used here.
            recognised_text, _ = extracted
            cell.text_boxes.append(
                TextField(bbox=cell, text=recognised_text))
示例#5
0
 def extract_table_text(self, img: numpy.ndarray,
                        border_box: BorderBox) -> List[TextField]:
     """Detect text boxes inside ``border_box`` and return them as fields.

     The image is cropped to the box, passed to ``self.text_detector``, and
     the detections are converted with ``paddle_result_to_bboxes``.  Each
     resulting box is shifted by the crop origin so that it is expressed in
     the coordinates of the full image.  Returned fields have empty text.
     """
     left, top, right, bottom = border_box.box
     cropped = img[top:bottom, left:right]
     # The detector returns a pair; the second element is not used.
     detections, _ = self.text_detector(cropped)

     fields = []
     for rel in paddle_result_to_bboxes(detections):
         absolute_box = BorderBox(rel[0] + left, rel[1] + top,
                                  rel[2] + left, rel[3] + top)
         fields.append(TextField(bbox=absolute_box, text=""))
     return fields
def poppler_text_field_to_text_field(pt_field: PopplerTextField, scale: float):
    """Build a ``TextField`` from a Poppler text field.

    The box is produced by ``bounding_box_to_bbox(pt_field.bbox, scale)``;
    the text is copied unchanged.
    """
    converted_box = bounding_box_to_bbox(pt_field.bbox, scale)
    return TextField(bbox=converted_box, text=pt_field.text)
示例#7
0
def match_inf_res(xlsx_path: Path, images_dir: Path):
    """Build one page dictionary per worksheet of an XLSX workbook.

    For every worksheet the function determines the used area, loads the
    rendered image ``<images_dir>/<page_num>.png`` (``page_num`` is the
    zero-based worksheet index), scales the sheet grid to image pixels,
    builds tables from ``clust_tables`` proposals via ``comp_table`` and
    collects every remaining non-empty cell as a standalone ``TextField``.

    Worksheets whose used area has zero height or width, or whose image
    cannot be read, are skipped.

    Args:
        xlsx_path: workbook to read; it is also saved back to this path
            before returning.
        images_dir: directory holding one ``<page_num>.png`` per worksheet.

    Returns:
        A list with the ``page_to_dict`` result of every processed worksheet.
    """
    LOGGER.info(
        "Initializing CascadeMaskRCNN with config: %s and model: %s",
        CASCADE_CONFIG_PATH,
        CASCADE_MODEL_PATH,
    )
    cascade_rcnn_detector = CascadeRCNNInferenceService(
        CASCADE_CONFIG_PATH, CASCADE_MODEL_PATH, True)
    pages = []
    workbook = load_workbook(str(xlsx_path.absolute()), data_only=True)
    for page_num, worksheet in enumerate(workbook.worksheets):
        # row_fill[r] is True when row r has at least one truthy cell value.
        row_fill = {}
        for row_id in range(1, worksheet.max_row + 1):
            row_fill[row_id] = False
            for col_id in range(1, worksheet.max_column + 1):
                if worksheet.cell(row_id, col_id).value:
                    row_fill[row_id] = True
                    break
        # Walk rows bottom-up to find last_row, used below as the exclusive
        # upper bound of range(1, last_row).  Trailing empty rows pull
        # last_row down; the first non-empty row stops the walk.
        # NOTE(review): `last_row == worksheet.max_row` is true both when no
        # trailing empty row was seen and when only the final row is empty,
        # so in the latter case the empty final row stays inside the range.
        # If every row is empty, last_row ends at 1 and the sheet is skipped
        # by the zero-height check below.
        last_row = worksheet.max_row
        for row_id, not_empty in sorted(
            [(row_id, not_empty) for row_id, not_empty in row_fill.items()],
                reverse=True,
                key=lambda x: x[0],
        ):
            if not_empty:
                if last_row == worksheet.max_row:
                    last_row += 1
                break
            last_row = row_id

        # Same procedure for columns: col_fill[c] is True when column c has
        # at least one truthy cell value, last_col is the exclusive bound.
        col_fill = {}
        for col_id in range(1, worksheet.max_column + 1):
            col_fill[col_id] = False
            for row_id in range(1, worksheet.max_row + 1):
                if worksheet.cell(row_id, col_id).value:
                    col_fill[col_id] = True
                    break
        last_col = worksheet.max_column
        for col_id, not_empty in sorted(
            [(col_id, not_empty) for col_id, not_empty in col_fill.items()],
                reverse=True,
                key=lambda x: x[0],
        ):
            if not_empty:
                if last_col == worksheet.max_column:
                    last_col += 1
                break
            last_col = col_id

        # Total sheet height/width of the used area in worksheet units;
        # rows/columns without an explicit dimension use the defaults.
        height = 0
        for row_id in range(1, last_row):
            if worksheet.row_dimensions[row_id].height:
                height += worksheet.row_dimensions[row_id].height
            else:
                height += DEFAULT_HEIGHT
        width = 0
        for col_id in range(1, last_col):
            if worksheet.column_dimensions[get_column_letter(col_id)].width:
                width += worksheet.column_dimensions[get_column_letter(
                    col_id)].width
            else:
                width += DEFAULT_WIDTH
        if height == 0 or width == 0:
            continue

        img = cv2.imread(str((images_dir / f"{page_num}.png").absolute()))
        if img is None:
            LOGGER.warning(
                "Image is empty or none, skipping processing on page %s",
                page_num)
            continue
        # First two entries of the image shape: (rows, columns) in pixels.
        img_shape = img.shape[:2]

        tables_proposals = clust_tables(worksheet, last_row, last_col)
        row_dim, col_dim = get_grid(worksheet, last_row, last_col)
        # Scale factors from worksheet units to image pixels.
        y_scale = img_shape[0] / height
        x_scale = img_shape[1] / width
        row_dim = [dim * y_scale for dim in row_dim]
        col_dim = [dim * x_scale for dim in col_dim]

        # Header detection is skipped for very large images (any side above
        # 10000 px) and for sheets with 1000 or more rows in the used area.
        headers = []
        if not any([s > 10000 for s in img_shape]) and last_row < 1000:
            _, headers = cascade_rcnn_detector.inference_image(
                images_dir / f"{page_num}.png", padding=200)
        # Each proposal is a 4-sequence; the first two entries are passed as
        # the start cell and the last two as the end cell.
        tables = [
            comp_table(worksheet, row_dim, col_dim, (prop[0], prop[1]),
                       (prop[2], prop[3]), headers)
            for prop in tables_proposals
        ]

        # Keep only tables with more than 3 cells (body + header combined).
        tables = [
            table for table in tables
            if len(table.cells) + sum([len(h) for h in table.header]) > 3
        ]

        blocks = []
        blocks.extend(tables)
        # Walk the grid accumulating unscaled row/column offsets, and emit a
        # TextField for each non-empty cell that lies outside every proposal.
        prev_row_coord = 0
        for row_id in range(1, last_row):
            row_coord = prev_row_coord + (
                worksheet.row_dimensions[row_id].height
                if worksheet.row_dimensions[row_id].height else DEFAULT_HEIGHT)
            prev_col_coord = 0
            for col_id in range(1, last_col):
                col_coord = prev_col_coord + (
                    worksheet.column_dimensions[get_column_letter(col_id)].
                    width if worksheet.column_dimensions[get_column_letter(
                        col_id)].width else DEFAULT_WIDTH)
                # Proposals are unpacked as (y1, x1, y2, x2) with inclusive
                # row (y) and column (x) bounds.  Cells of proposals that
                # were filtered out above are still excluded here.
                if worksheet.cell(row_id, col_id).value and not any([
                        y1 <= row_id <= y2 and x1 <= col_id <= x2
                        for y1, x1, y2, x2 in tables_proposals
                ]):
                    text_field = TextField(
                        bbox=BorderBox(
                            top_left_x=prev_col_coord * x_scale,
                            top_left_y=prev_row_coord * y_scale,
                            bottom_right_x=col_coord * x_scale,
                            bottom_right_y=row_coord * y_scale,
                        ),
                        text=extract_cell_value(worksheet.cell(row_id,
                                                               col_id)),
                    )
                    blocks.append(text_field)
                prev_col_coord = col_coord
            prev_row_coord = row_coord

        # Tables and standalone text fields are stored together under
        # `tables`; the page box spans the whole image.
        pages.append(
            page_to_dict(
                Page(
                    page_num=page_num,
                    bbox=BorderBox(
                        top_left_x=0,
                        top_left_y=0,
                        bottom_right_x=img_shape[1],
                        bottom_right_y=img_shape[0],
                    ),
                    tables=blocks,
                )))
    # The workbook is written back to its source path.
    # NOTE(review): presumably this persists changes made to the worksheets
    # while building the tables, and since the workbook was loaded with
    # data_only=True the saved file would hold values, not formulas - confirm.
    workbook.save(str(xlsx_path.absolute()))
    workbook.close()
    return pages
示例#8
0
def comp_table(worksheet: Worksheet, row_dim: List[float],
               col_dim: List[float], s_cell: Tuple[int, int],
               e_cell: Tuple[int, int], headers: List[Cell]):
    """Build a headered structured table from a rectangular worksheet area.

    Args:
        worksheet: sheet to read cell values and merged ranges from; the
            fill of cells belonging to the table may be modified (see below).
        row_dim: row boundary coordinates, indexed so that ``row_dim[r - 1]``
            and ``row_dim[r]`` are the top and bottom of 1-based row ``r``.
        col_dim: column boundary coordinates, indexed the same way.
        s_cell: ``(row, col)`` of the first cell of the area (1-based).
        e_cell: ``(row, col)`` of the last cell of the area (inclusive);
            clamped to ``len(row_dim) - 1`` / ``len(col_dim) - 1``.
        headers: passed through to ``get_headers_using_structured``.

    Returns:
        The result of ``get_headers_using_structured`` for the built table.

    Side effects:
        When the table has more than 3 cells (header and body combined),
        header cells are filled with ``HEADER_FILL`` and body cells with a
        solid ``CC55BB`` fill in ``worksheet``.
    """
    m_ranges = []
    for m_range in worksheet.merged_cells.ranges:
        m_ranges.append(m_range)
    s_row, s_col = s_cell
    e_row, e_col = e_cell
    # Clamp the end cell so that row_dim[e_row] / col_dim[e_col] exist.
    e_row = min(e_row, len(row_dim) - 1)
    e_col = min(e_col, len(col_dim) - 1)

    cells = []
    # Merged ranges that already produced a cell; used to emit one cell per
    # range instead of one per covered grid position.
    m_range_included = []
    for row in range(s_row, e_row + 1):
        for col in range(s_col, e_col + 1):
            # Find the first merged range containing this position.  The
            # membership test materialises the range's cells on every check.
            is_in_merged = False
            cur_m_range = None
            for m_range in m_ranges:
                if (row, col) in list(m_range.cells):
                    is_in_merged = True
                    cur_m_range = m_range
                    break
            # Skip positions covered by a range that was already emitted.
            skip = False
            if is_in_merged:
                for m_range in m_range_included:
                    if (row, col) in list(m_range.cells):
                        skip = True
                        break
            if skip:
                continue
            if is_in_merged and cur_m_range:
                m_range_included.append(cur_m_range)
                # The pixel box comes from the range's own bounds (with the
                # far edge clamped to the grid), while col/row record the
                # position at which the range was first met in this scan.
                cells.append(
                    CellLinked(
                        top_left_y=int(row_dim[cur_m_range.min_row - 1]),
                        top_left_x=int(col_dim[cur_m_range.min_col - 1]),
                        bottom_right_y=int(row_dim[min(cur_m_range.max_row,
                                                       len(row_dim) - 1)]),
                        bottom_right_x=int(col_dim[min(cur_m_range.max_col,
                                                       len(col_dim) - 1)]),
                        col=col - 1,
                        row=row - 1,
                        col_span=cur_m_range.max_col - cur_m_range.min_col + 1,
                        row_span=cur_m_range.max_row - cur_m_range.min_row + 1,
                        text_boxes=[
                            TextField(
                                bbox=BorderBox(
                                    top_left_y=int(
                                        row_dim[cur_m_range.min_row - 1]),
                                    top_left_x=int(
                                        col_dim[cur_m_range.min_col - 1]),
                                    bottom_right_y=int(row_dim[min(
                                        cur_m_range.max_row,
                                        len(row_dim) - 1)]),
                                    bottom_right_x=int(col_dim[min(
                                        cur_m_range.max_col,
                                        len(col_dim) - 1)]),
                                ),
                                text=extract_cell_value(
                                    cur_m_range.start_cell),
                            )
                        ],
                    ))
            else:
                # Plain 1x1 cell; col/row are stored zero-based.
                cells.append(
                    CellLinked(
                        top_left_y=int(row_dim[row - 1]),
                        top_left_x=int(col_dim[col - 1]),
                        bottom_right_y=int(row_dim[row]),
                        bottom_right_x=int(col_dim[col]),
                        col=col - 1,
                        row=row - 1,
                        col_span=1,
                        row_span=1,
                        text_boxes=[
                            TextField(
                                bbox=BorderBox(
                                    top_left_y=int(row_dim[row - 1]),
                                    top_left_x=int(col_dim[col - 1]),
                                    bottom_right_y=int(row_dim[row]),
                                    bottom_right_x=int(col_dim[col]),
                                ),
                                text=extract_cell_value(
                                    worksheet.cell(row, col)),
                            )
                        ],
                    ))
    # The table box spans from the start cell's top-left boundary to the
    # (clamped) end cell's bottom-right boundary.
    struct_table = StructuredTable(
        bbox=BorderBox(
            top_left_y=int(row_dim[s_row - 1]),
            top_left_x=int(col_dim[s_col - 1]),
            bottom_right_y=int(row_dim[e_row]),
            bottom_right_x=int(col_dim[e_col]),
        ),
        cells=cells,
    )
    struct_table_headered = get_headers_using_structured(struct_table, headers)
    # Only tables with more than 3 cells in total get coloured in the sheet.
    if len(struct_table_headered.cells) + sum(
        [len(h) for h in struct_table_headered.header]) > 3:
        # header is a sequence of cell groups; flatten it first.
        head_cells = []
        for pack in struct_table_headered.header:
            head_cells.extend(pack)
        for cell in head_cells:
            # Convert the zero-based cell position back to 1-based sheet
            # coordinates and paint every position covered by its span.
            col = cell.col + 1
            row = cell.row + 1
            col_span = cell.col_span
            row_span = cell.row_span
            for r in range(row, row + row_span):
                for c in range(col, col + col_span):
                    worksheet.cell(r, c).fill = HEADER_FILL
        for cell in struct_table_headered.cells:
            col = cell.col + 1
            row = cell.row + 1
            col_span = cell.col_span
            row_span = cell.row_span
            for r in range(row, row + row_span):
                for c in range(col, col + col_span):
                    worksheet.cell(r,
                                   c).fill = PatternFill(start_color="CC55BB",
                                                         end_color="CC55BB",
                                                         fill_type="solid")
    return struct_table_headered
示例#9
0
    def process_page(self, image_path: Path, output_path: Path,
                     poppler_page) -> Dict[str, Any]:
        """Detect tables and free text on one page image.

        Steps, in order: run the inference service; build a structured table
        for every inferred table (optionally via the semi-bordered detector
        for 'Borderless' ones); optionally reconcile the result with
        ``detect_tables_on_page``; fill missing cell text; attach row/column
        headers; extract text from the horizontal bands between tables; save
        visualizations.

        Args:
            image_path: page image; the part of the file name before the
                first ``.`` must be an integer and becomes the page number.
            output_path: base directory for visualizations and page JSON.
            poppler_page: forwarded to ``self._scale_poppler_result``.

        Returns:
            ``page_to_dict(page)``.  When inference yields no tables the page
            is returned right away without tables or extracted text.
        """
        # NOTE(review): cv2.imread returns None for an unreadable file, in
        # which case `img.shape` below would fail - no check is made here.
        img = cv2.imread(str(image_path.absolute()))
        page = Page(page_num=int(image_path.name.split(".")[0]),
                    bbox=BorderBox(top_left_x=0,
                                   top_left_y=0,
                                   bottom_right_x=img.shape[1],
                                   bottom_right_y=img.shape[0]))
        text_fields = self._scale_poppler_result(img, output_path,
                                                 poppler_page, image_path)

        inference_tables, headers = self.inference_service.inference_image(
            image_path)
        if not inference_tables:
            return page_to_dict(page)

        has_bordered = any(
            [i_tab.label == 'Bordered' for i_tab in inference_tables])

        self.visualizer.draw_object_and_save(
            img, inference_tables,
            Path(f"{output_path}/inference_result/{image_path.name}"))

        # Text fields not yet assigned to a table; each match_table_text call
        # returns the fields for that table and the remaining ones.
        text_fields_to_match = text_fields

        semi_bordered_tables = []
        # List of (score, structured table) pairs.
        detected_tables = []
        for inf_table in inference_tables:
            in_inf_table, text_fields_to_match = match_table_text(
                inf_table, text_fields_to_match)
            paddle_fields = self.text_detector.extract_table_text(
                img, inf_table.bbox)
            if paddle_fields:
                in_inf_table = merge_text_fields(paddle_fields, in_inf_table)

            mask_rcnn_count_matches, not_matched = match_cells_text_fields(
                inf_table.tags, in_inf_table)

            if inf_table.label == 'Borderless':
                semi_border = semi_bordered(img, inf_table)
                if semi_border:
                    semi_bordered_tables.append(semi_border)
                    semi_border_score = match_cells_table(
                        in_inf_table, semi_border)
                    # Prefer the semi-bordered result when it scores at least
                    # as well and has more cells than the inference tags.
                    if semi_border_score >= mask_rcnn_count_matches and semi_border.count_cells(
                    ) > len(inf_table.tags):
                        struct_table = semi_border_to_struct(
                            semi_border, img.shape)
                        if struct_table:
                            detected_tables.append(
                                (semi_border_score, struct_table))
                        # The inference-based extraction is skipped even when
                        # struct_table is falsy.
                        continue
            struct = self.extract_table_from_inference(img, inf_table,
                                                       not_matched, img.shape,
                                                       image_path)
            if struct:
                detected_tables.append((mask_rcnn_count_matches, struct))

        # Run the bordered-table detector when inference reported a
        # 'Bordered' table or any detected table scored below 20% of its
        # cell count.
        if has_bordered or any(score < 0.2 * len(table.cells)
                               for score, table in detected_tables):
            image = detect_tables_on_page(
                image_path, draw=self.visualizer.should_visualize)
            if image.tables:
                # Start matching text again from the full set of fields.
                text_fields_to_match = text_fields
                for bordered_table in image.tables:
                    matched = False
                    # `inf_table` here is a structured table taken from
                    # detected_tables, not a raw inference table.
                    for score, inf_table in detected_tables:
                        if inf_table.bbox.box_is_inside_another(
                                bordered_table.bbox):
                            in_table, text_fields_to_match = match_table_text(
                                inf_table, text_fields_to_match)
                            paddle_fields = self.text_detector.extract_table_text(
                                img, inf_table.bbox)
                            if paddle_fields:
                                in_table = merge_text_fields(
                                    paddle_fields, in_table)

                            bordered_score = match_cells_table(
                                in_table, bordered_table)
                            # Use the bordered table when it reaches half the
                            # earlier score and half the cell count;
                            # otherwise keep the earlier table.
                            if bordered_score >= score * 0.5 \
                                    and bordered_table.count_cells() >= len(inf_table.cells) * 0.5:
                                struct_table = semi_border_to_struct(
                                    bordered_table, img.shape)
                                if struct_table:
                                    page.tables.append(struct_table)
                            else:
                                page.tables.append(inf_table)
                            # Removing while iterating is safe only because
                            # the loop is left immediately afterwards.
                            detected_tables.remove((score, inf_table))
                            matched = True
                            break
                    if not matched:
                        # Bordered table with no counterpart: add it as is.
                        in_table, text_fields_to_match = match_table_text(
                            bordered_table, text_fields_to_match)
                        _ = match_cells_table(in_table, bordered_table)
                        struct_table = semi_border_to_struct(
                            bordered_table, img.shape)
                        if struct_table:
                            page.tables.append(struct_table)
                # Detected tables that matched no bordered table are kept.
                if detected_tables:
                    page.tables.extend(
                        [inf_table for _, inf_table in detected_tables])
            else:
                page.tables.extend([tab for _, tab in detected_tables])
        else:
            page.tables.extend([tab for _, tab in detected_tables])
        for table in page.tables:
            actualize_text(table, image_path)

        # TODO: Headers should be created only once
        cell_header_scores = []
        for table in page.tables:
            cell_header_scores.extend(
                self.header_checker.get_cell_scores(table.cells))

        self.visualizer.draw_object_and_save(
            img, cell_header_scores,
            output_path / 'cells_header' / f"{page.page_num}.png")

        # Replace every table by its headered counterpart.  The third
        # argument of create_header differs for rows (6) and columns (5).
        tables_with_header = []
        for table in page.tables:
            header_rows = self.create_header(table.rows, headers, 6)
            table_with_header = StructuredTableHeadered.from_structured_and_rows(
                table, header_rows)
            header_cols = self.create_header(table.cols, headers, 5)
            # TODO: Cells should be actualized only once
            table_with_header.actualize_header_with_cols(header_cols)
            tables_with_header.append(table_with_header)
        page.tables = tables_with_header

        with TextExtractor(str(image_path.absolute()),
                           seg_mode=PSM.SPARSE_TEXT) as extractor:
            # text_borders is [1, y, y2, y, y2, ..., image height], where
            # y/y2 are the 2nd and 4th entries of each table's bbox.box.
            # Consecutive pairs are then read as full-width bands outside
            # the tables.
            # NOTE(review): this appears to assume page.tables is ordered
            # top to bottom without vertical overlap - confirm.
            text_borders = [1]
            for table in page.tables:
                _, y, _, y2 = table.bbox.box
                text_borders.extend([y, y2])
            text_borders.append(img.shape[0])
            text_candidate_boxes: List[BorderBox] = []
            for i in range(len(text_borders) // 2):
                # Bands of 3 units or less are ignored.
                if text_borders[i * 2 + 1] - text_borders[i * 2] > 3:
                    text_candidate_boxes.append(
                        BorderBox(
                            top_left_x=1,
                            top_left_y=text_borders[i * 2],
                            bottom_right_x=img.shape[1],
                            bottom_right_y=text_borders[i * 2 + 1],
                        ))
            for box in text_candidate_boxes:
                text, _ = extractor.extract(box.top_left_x, box.top_left_y,
                                            box.width, box.height)
                if text:
                    page.text.append(TextField(box, text))

        self.visualizer.draw_object_and_save(
            img, semi_bordered_tables,
            output_path.joinpath('semi_bordered_tables').joinpath(
                image_path.name))
        self.visualizer.draw_object_and_save(
            img, page.tables,
            output_path.joinpath('tables').joinpath(image_path.name))
        page_dict = page_to_dict(page)
        # The page JSON is written only when visualization is enabled.
        if self.visualizer.should_visualize:
            save_page(page_dict,
                      output_path / 'pages' / f"{page.page_num}.json")

        return page_dict
示例#10
0
    def process_page(self, image_path: Path, output_path: Path,
                     poppler_page) -> Dict[str, Any]:
        """Detect tables and free text on one page image.

        When the inference service reports tables, the bordered-table
        detector is run first; inferred 'Bordered' tables that match one of
        its results are replaced by that result, and all other inferred
        tables are built from the inference output.  Cell text is then
        filled in and headers attached.  Text extraction from the bands
        between tables runs in every case, including when inference finds
        no tables.

        Args:
            image_path: page image; the part of the file name before the
                first ``.`` must be an integer and becomes the page number.
            output_path: base directory for visualizations and page JSON.
            poppler_page: forwarded to ``self._scale_poppler_result``.

        Returns:
            ``page_to_dict(page)``.
        """
        # NOTE(review): cv2.imread returns None for an unreadable file, in
        # which case `img.shape` below would fail - no check is made here.
        img = cv2.imread(str(image_path.absolute()))
        page = Page(
            page_num=int(image_path.name.split(".")[0]),
            bbox=BorderBox(
                top_left_x=0,
                top_left_y=0,
                bottom_right_x=img.shape[1],
                bottom_right_y=img.shape[0],
            ),
        )
        text_fields = self._scale_poppler_result(img, output_path,
                                                 poppler_page, image_path)

        logger.info("Start inference")
        inference_tables, headers = self.inference_service.inference_image(
            image_path)
        logger.info("End inference")
        self.visualizer.draw_object_and_save(
            img,
            inference_tables,
            Path(f"{output_path}/inference_result/{image_path.name}"),
            headers=headers,
        )

        if inference_tables:
            logger.info("Start bordered")
            image = detect_tables_on_page(
                image_path, draw=self.visualizer.should_visualize)
            logger.info("End bordered")
            # Text fields not yet assigned to a table; each match_table_text
            # call returns the fields for that table and the remaining ones.
            text_fields_to_match = text_fields
            bordered_tables = []
            if image.tables:
                for bordered_table in image.tables:
                    in_table, text_fields_to_match = match_table_text(
                        bordered_table, text_fields_to_match)
                    _ = match_cells_table(in_table, bordered_table)
                    # NOTE(review): the result of semi_border_to_struct is
                    # appended without a check; if it can be None, the
                    # `.bbox` access in the matching loop below would fail.
                    bordered_tables.append(
                        semi_border_to_struct(bordered_table, img.shape))

            # Split inferred tables into those replaced by a bordered table
            # and those that still need inference-based extraction.
            inf_tables_to_detect = []
            for inf_table in inference_tables:
                matched = False
                if image.tables:
                    for bordered_table in bordered_tables:
                        # A match needs the 0.8-threshold box check, the
                        # 'Bordered' label and a bordered table with more
                        # than half as many cells as the inference tags.
                        # There is no break, so one inferred table can add
                        # several bordered tables to the page.
                        if (inf_table.bbox.box_is_inside_another(
                                bordered_table.bbox, 0.8)
                                and inf_table.label == "Bordered"
                                and len(bordered_table.cells) >
                                len(inf_table.tags) * 0.5):
                            matched = True
                            page.tables.append(bordered_table)
                if not matched:
                    inf_tables_to_detect.append(inf_table)

            semi_bordered_tables = []
            for inf_table in inf_tables_to_detect:
                in_inf_table, text_fields_to_match = match_table_text(
                    inf_table, text_fields_to_match)
                logger.info("Start paddle")
                paddle_fields = self.text_detector.extract_table_text(
                    img, inf_table.bbox)
                logger.info("End paddle")
                if paddle_fields:
                    in_inf_table = merge_text_fields(paddle_fields,
                                                     in_inf_table)

                mask_rcnn_count_matches, not_matched = match_cells_text_fields(
                    inf_table.tags, in_inf_table)

                # `and False` makes this branch unreachable, so the
                # semi-bordered path never runs and semi_bordered_tables
                # stays empty.
                if inf_table.label == "Borderless" and False:
                    semi_border = semi_bordered(img, inf_table)
                    if semi_border:
                        semi_bordered_tables.append(semi_border)
                        semi_border_score = match_cells_table(
                            in_inf_table, semi_border)
                        if (semi_border_score >= mask_rcnn_count_matches
                                and semi_border.count_cells() > len(
                                    inf_table.tags)):
                            struct_table = semi_border_to_struct(
                                semi_border, img.shape)
                            if struct_table:
                                page.tables.append(struct_table)
                            continue
                struct = self.extract_table_from_inference(
                    img, inf_table, not_matched, img.shape, image_path)
                if struct:
                    page.tables.append(struct)

            for table in page.tables:
                actualize_text(table, image_path, img.shape[:2])

            # TODO: Headers should be created only once
            cell_header_scores = []
            for table in page.tables:
                cell_header_scores.extend(
                    self.header_checker.get_cell_scores(table.cells))

            self.visualizer.draw_object_and_save(
                img,
                cell_header_scores,
                output_path / "cells_header" / f"{page.page_num}.png",
            )

            # Replace every table by its headered counterpart.  The third
            # argument of create_header differs for rows (5) and columns (1).
            tables_with_header = []
            for table in page.tables:
                header_rows = self.create_header(table.rows, headers, 5)
                table_with_header = (
                    StructuredTableHeadered.from_structured_and_rows(
                        table, header_rows))
                header_cols = self.create_header(table.cols, headers, 1)
                # TODO: Cells should be actualized only once
                table_with_header.actualize_header_with_cols(header_cols)
                tables_with_header.append(table_with_header)
            page.tables = tables_with_header

            self.visualizer.draw_object_and_save(
                img,
                semi_bordered_tables,
                output_path.joinpath("semi_bordered_tables").joinpath(
                    image_path.name),
            )
            self.visualizer.draw_object_and_save(
                img,
                page.tables,
                output_path.joinpath("tables").joinpath(image_path.name),
            )
        logger.info("Start text extraction")
        with TextExtractor(str(image_path.absolute()),
                           seg_mode=PSM.SPARSE_TEXT) as extractor:
            # text_borders is [1, y, y2, y, y2, ..., image height], where
            # y/y2 are the 2nd and 4th entries of each table's bbox.box.
            # Consecutive pairs are then read as full-width bands outside
            # the tables.
            # NOTE(review): this appears to assume page.tables is ordered
            # top to bottom without vertical overlap - confirm.
            text_borders = [1]
            for table in page.tables:
                _, y, _, y2 = table.bbox.box
                text_borders.extend([y, y2])
            text_borders.append(img.shape[0])
            text_candidate_boxes: List[BorderBox] = []
            for i in range(len(text_borders) // 2):
                # Bands of 3 units or less are ignored.
                if text_borders[i * 2 + 1] - text_borders[i * 2] > 3:
                    text_candidate_boxes.append(
                        BorderBox(
                            top_left_x=1,
                            top_left_y=text_borders[i * 2],
                            bottom_right_x=img.shape[1],
                            bottom_right_y=text_borders[i * 2 + 1],
                        ))
            for box in text_candidate_boxes:
                text, _ = extractor.extract(box.top_left_x, box.top_left_y,
                                            box.width, box.height)
                if text:
                    page.text.append(TextField(box, text))
        logger.info("End text extraction")
        page_dict = page_to_dict(page)
        # The page JSON is written only when visualization is enabled.
        if self.visualizer.should_visualize:
            save_page(page_dict,
                      output_path / "pages" / f"{page.page_num}.json")

        return page_dict