Пример #1
0
def recursive_xy_divide(elems, avg_font_size):
    """
    Recursively group/divide the document by white stripes
    by projecting elements onto alternating axes as intervals.

    elems: page container; this function reads ``elems.mentions``,
    ``elems.segments``, ``elems.layout.width`` and
    ``elems.layout.height``.
    avg_font_size: the minimum gap size between elements below
    which we consider interval continuous.

    Returns a ``(bboxes, tree)`` pair. ``bboxes`` holds one bounding
    box per leaf group (a group that could not be split along either
    axis). ``tree`` is a nested list mirroring the recursive splits,
    whose leaves are the lists of objects in each leaf group. Both are
    empty when no object lies inside the page.
    """
    log = logging.getLogger(__name__)
    log.info(avg_font_size)
    objects = list(elems.mentions)
    objects.extend(elems.segments)
    # Filled in by divide() as a side effect, one entry per leaf group
    bboxes = []

    # A tree that is a list of its children
    # bboxes can be recursively reconstructed from
    # the leaves
    def divide(objs, bbox, h_split=True, is_single=False):
        """
        Recursive wrapper for splitting a list of objects
        with bounding boxes.
        objs: the objects to split.
        bbox: bounding box enclosing ``objs``, indexed as
        (x0, y0, x1, y1).
        h_split: whether to split along y axis, otherwise
        we split along x axis.
        is_single: whether the parent call, which split along the
        other axis, produced only a single group.
        """
        if not objs:
            return []

        # range start/end indices
        axis = 1 if h_split else 0

        intervals, groups = project_onto(objs, axis, avg_font_size)

        # base case where we can not actually divide
        single_child = len(groups) == 1

        # Can not divide in both X and Y, stop
        if is_single and single_child:
            bboxes.append(bbox)
            return objs

        children = []
        for interval, group in zip(intervals, groups):
            # Create the bbox for the subgroup: copy the parent box and
            # overwrite the start/end coordinates of the split axis
            # NOTE(review): np.array() of an all-int page bbox yields an
            # integer array, which would truncate float intervals on
            # assignment -- confirm the type of layout.width/height.
            sub_bbox = np.array(bbox)
            sub_bbox[[axis, axis + 2]] = interval
            # Append the sub-document tree, alternating the split axis
            child = divide(group, sub_bbox, not h_split, single_child)
            children.append(child)
        return children

    full_page_bbox = (0, 0, elems.layout.width, elems.layout.height)
    # Filter out invalid objects
    objects = [o for o in objects if inside(full_page_bbox, o.bbox)]
    # The value must be bound to a %-placeholder; passing it as a bare
    # extra argument makes the logging module fail while formatting.
    log.info("avg_font_size for dividing: %s", avg_font_size)
    tree = divide(objects, full_page_bbox) if objects else []
    return bboxes, tree
Пример #2
0
    def __init__(self, mentions, lines, region, min_cell_size=6.0):
        """
        Build the grid from ruling lines and assign mentions to cells.

        mentions: text objects to distribute among the cells; each must
        expose ``xc`` and ``yc`` center coordinates.
        lines: ruling lines, separated into vertical and horizontal
        ones by ``_split_vlines_hlines``.
        region: the table area; its ``x0``, ``x1``, ``y0``, ``y1`` and
        ``bbox`` attributes are read here.
        min_cell_size: spacing threshold handed to
        ``_retain_centroids`` when collapsing nearby line coordinates.
        """
        self.min_cell_size = min_cell_size
        vertical, horizontal = _split_vlines_hlines(lines)

        x_centers = [line.xc for line in vertical]
        y_centers = [line.yc for line in horizontal]

        # Collapse closely clustered lines. The region edges are always
        # included so that the table has at least one column and row.
        self.xs = _retain_centroids(x_centers + [region.x0, region.x1],
                                    min_cell_size)
        self.ys = _retain_centroids(y_centers + [region.y0, region.y1],
                                    min_cell_size)

        # Consecutive coordinates delimit one column / one row each
        self.xranges = list(zip(self.xs, self.xs[1:]))
        self.yranges = list(zip(self.ys, self.ys[1:]))

        self.num_cols = len(self.xranges)
        self.num_rows = len(self.yranges)

        # Grid contents: one slot per (row, col), initially empty
        self._grid = np.full((self.num_rows, self.num_cols),
                             None,
                             dtype=object)

        # Record whether a particular cell boundary is present
        ruling_plane = Plane(region.bbox)
        ruling_plane.extend(lines)
        vbars, hbars = self._mark_grid_bounds(ruling_plane, region)

        # Establish cell regions, scanning row by row
        cells = []
        for row in range(self.num_rows):
            for col in range(self.num_cols):
                # Skip already marked cells
                if self._grid[row, col]:
                    continue

                # No bar recorded towards the row above: extend that cell
                if row > 0 and not hbars[row, col]:
                    upper = self._grid[row - 1, col]
                    self._grid[row, col] = upper
                    upper.rowend = row + 1
                    continue

                # No bar recorded towards the left: extend that cell
                if col > 0 and not vbars[row, col]:
                    left_cell = self._grid[row, col - 1]
                    self._grid[row, col] = left_cell
                    left_cell.colend = col + 1
                    continue

                # Otherwise this slot starts a new cell
                fresh = Cell([row, col])
                self._grid[row, col] = fresh
                cells.append(fresh)

        # Now get each cell's contents by using its boundary
        mention_plane = Plane(region.bbox)
        mention_plane.extend(mentions)

        for cell in cells:
            left, right = self.xs[cell.colstart], self.xs[cell.colend]
            top, bottom = self.ys[cell.rowstart], self.ys[cell.rowend]
            bounds = (left, top, right, bottom)
            # Keep mentions whose centers are inside the cell; the center
            # point is passed to inside() as a degenerate box.
            kept = []
            for mention in mention_plane.find(bounds):
                center = (mention.xc, mention.yc)
                if inside(bounds, center + center):
                    kept.append(mention)
            cell.texts = kept

        # TODO: provide HTML conversion here

        self.get_normalized_grid()