示例#1
0
 def find_neighbors_with_rsrcmgr(
         self, plane: Plane, ratio: float,
         rsrcmgr: PaperResourceManager) -> List[Union[LTItem, LTText]]:
     """Collect nearby horizontal text lines classified like this one.

     The search box is this line's bounding box grown vertically by
     ``ratio * self.height`` on both sides. A candidate returned by
     ``plane.find`` is kept when all of the following hold:

     * it is an ``LTTextLineHorizontalExtended``;
     * ``maybe_classify(rsrcmgr)`` yields the same result for it and for
       ``self``;
     * either its height differs from ours by less than the margin and it
       passes ``is_font_similar`` and ``is_x_similar``, or the shared
       classification is one of ``LTAuthor``, ``LTPageMargin``,
       ``LTCitationBox`` or ``LTFooter``.

     :param plane: spatial index queried for candidate objects.
     :param ratio: fraction of ``self.height`` used as the search margin.
     :param rsrcmgr: resource manager forwarded to ``maybe_classify``.
     :return: the candidates that passed every check, in ``plane.find``
         iteration order.
     """
     margin = ratio * self.height
     candidates = plane.find(
         (self.x0, self.y0 - margin, self.x1, self.y1 + margin))
     own_class = self.maybe_classify(rsrcmgr)

     neighbors = []
     for candidate in candidates:
         if not isinstance(candidate, LTTextLineHorizontalExtended):
             continue
         if not (own_class == candidate.maybe_classify(rsrcmgr)):
             continue
         # Geometry/font agreement is checked first; the class whitelist is
         # only consulted when that fails.
         looks_alike = (abs(candidate.height - self.height) < margin
                        and self.is_font_similar(candidate)
                        and self.is_x_similar(candidate, margin))
         if looks_alike or own_class in [
                 LTAuthor, LTPageMargin, LTCitationBox, LTFooter]:
             neighbors.append(candidate)
     return neighbors
示例#2
0
文件: grid.py 项目: zviri/pdftotree
    def __init__(self, mentions, lines, region, min_cell_size=6.0):
        """
        Build the grid of cells for a table region.

        Steps performed:

        1. Split ``lines`` into vertical and horizontal lines and take their
           centre coordinates, together with the region's own edges, as
           candidate column/row boundaries (thinned by
           ``_retain_centroids`` using ``min_cell_size``).
        2. Walk the grid row by row; a slot with no bar above it reuses the
           cell above, otherwise a slot with no bar to its left reuses the
           cell on the left, otherwise a new ``Cell`` is created.
        3. Attach to each created cell the mentions whose centre point lies
           inside the cell's bounding box.
        4. Call ``self.get_normalized_grid()``.

        :param mentions: text objects to distribute over the cells.
        :param lines: line objects that delimit the cells.
        :param region: table region; provides ``x0``, ``y0``, ``x1``, ``y1``
            and ``bbox``.
        :param min_cell_size: spacing passed to ``_retain_centroids``.
        """
        self.min_cell_size = min_cell_size
        vertical_lines, horizontal_lines = _split_vlines_hlines(lines)

        line_xs = [vline.xc for vline in vertical_lines]
        line_ys = [hline.yc for hline in horizontal_lines]

        # Adding the region edges guarantees at least one column/row range;
        # _retain_centroids is given min_cell_size to thin out close values.
        self.xs = _retain_centroids(line_xs + [region.x0, region.x1],
                                    min_cell_size)
        self.ys = _retain_centroids(line_ys + [region.y0, region.y1],
                                    min_cell_size)

        # Consecutive boundary pairs form the column and row extents.
        self.xranges = list(zip(self.xs, self.xs[1:]))
        self.yranges = list(zip(self.ys, self.ys[1:]))

        self.num_cols = len(self.xranges)
        self.num_rows = len(self.yranges)

        # Object array holding the Cell that owns each grid slot.
        self._grid = np.full([self.num_rows, self.num_cols],
                             None,
                             dtype=np.dtype(object))
        slots = self._grid

        # Find out which cell boundaries are actually drawn.
        line_plane = Plane(region.bbox)
        line_plane.extend(lines)
        vbars, hbars = self._mark_grid_bounds(line_plane, region)

        created_cells = []
        for row in range(self.num_rows):
            for col in range(self.num_cols):
                if slots[row, col]:
                    continue  # slot already owned by a cell

                if row > 0 and not hbars[row, col]:
                    # No bar above: extend the cell above downwards.
                    owner = slots[row - 1, col]
                    slots[row, col] = owner
                    owner.rowend = row + 1
                elif col > 0 and not vbars[row, col]:
                    # No bar on the left: extend the left cell rightwards.
                    owner = slots[row, col - 1]
                    slots[row, col] = owner
                    owner.colend = col + 1
                else:
                    # Bounded on both sides: start a new cell here.
                    owner = Cell([row, col])
                    slots[row, col] = owner
                    created_cells.append(owner)

        # Fill each cell with the mentions located within its boundary.
        text_plane = Plane(region.bbox)
        text_plane.extend(mentions)

        for owner in created_cells:
            left, right = self.xs[owner.colstart], self.xs[owner.colend]
            low, high = self.ys[owner.rowstart], self.ys[owner.rowend]
            bbox = (left, low, right, high)

            # A mention belongs to the cell when its centre, expressed as a
            # degenerate box (xc, yc, xc, yc), is inside the cell's bbox.
            contained = []
            for mention in text_plane.find(bbox):
                center = (mention.xc, mention.yc)
                if inside(bbox, center + center):
                    contained.append(mention)
            owner.texts = contained

        # TODO: provide HTML conversion here

        self.get_normalized_grid()
示例#3
0
class Sheet1(object):
    """Accumulate text and ruling lines of a page and cut them into rows.

    Layout objects are fed in through :meth:`add_ltcontainer`; the
    coordinates of lines and rectangles are tallied as row/column edges and
    text lines are stored in a ``Plane``. :meth:`extract_rows` then slices
    the stored text along those edges.
    """

    # Class-level placeholders; real containers are created per instance in
    # __init__.
    cells = None
    text_layout = None
    column_edges = None
    row_edges = None

    def __init__(self):
        self.cells = Plane()
        self.text_layout = Plane()
        # Map of rounded coordinate -> number of times it was seen.
        self.row_edges = {}
        self.column_edges = {}

    def add_cell(self, cell):
        """Store ``cell`` in the ``cells`` plane."""
        self.cells.add(cell)

    def add_text(self, cell_text):
        """Store ``cell_text`` in the ``text_layout`` plane."""
        self.text_layout.add(cell_text)

    def add_column_edge(self, x_value):
        """Count one more occurrence of the x coordinate (rounded to 2 dp)."""
        x = round(x_value, 2)
        self.column_edges[x] = 1 + self.column_edges.get(x, 0)

    def add_row_edge(self, y_value):
        """Count one more occurrence of the y coordinate (rounded to 2 dp)."""
        y = round(y_value, 2)
        self.row_edges[y] = 1 + self.row_edges.get(y, 0)

    def add_line(self, bbox):
        """Register a line as a column edge (vertical) or row edge (horizontal).

        ``bbox`` is ``(x0, y0, x1, y1)``. Lines that are neither vertical nor
        horizontal are reported with a warning on stdout and ignored.
        """
        if bbox[0] == bbox[2]:
            # vertical line
            self.add_column_edge(bbox[0])
        elif bbox[1] == bbox[3]:
            # horizontal line
            self.add_row_edge(bbox[1])
        else:
            # Wrap bbox in a 1-tuple: formatting a bare 4-tuple with a single
            # %s raises TypeError instead of printing the warning.
            print('WARNING: non-orthogonal line found: %s' % (bbox,))

    def add_rect(self, bbox):
        """Register all four sides of a rectangle as column/row edges."""
        self.add_column_edge(bbox[0])
        self.add_column_edge(bbox[2])
        self.add_row_edge(bbox[1])
        self.add_row_edge(bbox[3])

    def add_ltcontainer(self, obj, page_y_offset):
        """Add a layout object, recursing into containers.

        The object's y coordinates are shifted by ``page_y_offset`` and
        negated (and y0/y1 swapped) before use, so all stored row
        coordinates are negative. Text lines become ``CellText`` entries,
        lines and rectangles contribute edges, other containers are walked
        child by child, and anything else is ignored.
        """
        # NB: row indexes (y axis) are negative!
        bbox = (
            round(obj.x0, 2),
            round(-(obj.y1 + page_y_offset), 2),
            round(obj.x1, 2),
            round(-(obj.y0 + page_y_offset), 2),
        )

        if isinstance(obj, LTTextLine):
            self.add_text(CellText(bbox, obj.get_text()))
        elif isinstance(obj, LTLine):
            self.add_line(bbox)
        elif isinstance(obj, LTRect):
            self.add_rect(bbox)
        elif isinstance(obj, LTContainer):
            for child in obj:
                self.add_ltcontainer(child, page_y_offset)

    def extract_rows(self):
        """Return the stored text as a list of rows of cell strings.

        Row and column boundaries are the sorted edge coordinates. A
        boundary closer than 1 unit to the previous one is skipped. The
        first row starts 1 unit before the first row edge and the first
        column starts at x = 0. For every cell the text lines found by
        ``text_layout.find`` are ordered by ``y0``; those whose left edge
        lies within the cell's x range are stripped and joined with spaces.
        """
        row_bounds = sorted(self.row_edges)
        col_bounds = sorted(self.column_edges)

        rows = []

        r0 = row_bounds[0] - 1 if row_bounds else 0

        # NB: row indexes (y axis) are negative!
        for r1 in row_bounds:
            if r1 - r0 < 1:
                continue

            row = []
            c0 = 0
            for c1 in col_bounds:
                if c1 - c0 < 1:
                    continue

                # all text lines that intersect this cell, top to bottom
                lines = sorted(self.text_layout.find((c0, r0, c1, r1)),
                               key=lambda line: line.y0)

                # drop anything whose left edge is outside the cell and
                # concatenate the rest
                row.append(' '.join(
                    t.text.strip() for t in lines if c0 <= t.x0 <= c1))
                c0 = c1
            rows.append(row)
            r0 = r1
        return rows