def given_plane_with_one_object(object_size=50, gridsize=50):
    """Build a 100x100 plane that contains a single square component.

    Args:
        object_size: Side length of the component; its bounding box is
            ``(0, 0, object_size, object_size)``.
        gridsize: Grid size handed to ``Plane`` as its second argument.

    Returns:
        A ``(plane, component)`` tuple, where the component has already been
        added to the plane.
    """
    plane_with_object = Plane((0, 0, 100, 100), gridsize)
    component = LTComponent((0, 0, object_size, object_size))
    plane_with_object.add(component)
    return plane_with_object, component
def test_find_neighbors_vertical(self):
    """Check which vertical text lines ``find_neighbors`` reports for a line.

    Six vertical lines are placed on a 50x50 plane. The reference line is
    expected to get back itself plus the three lines named ``*_aligned_*``;
    ``not_aligned`` and ``wrong_width`` must not be returned.
    """
    laparams = LAParams()
    plane = Plane((0, 0, 50, 50))

    def place(bbox):
        # Create a vertical text line with the given bbox and register it
        # on the shared plane.
        text_line = LTTextLineVertical(laparams.word_margin)
        text_line.set_bbox(bbox)
        plane.add(text_line)
        return text_line

    line = place((4, 10, 6, 20))
    bottom_aligned_right = place((6, 10, 8, 15))
    top_aligned_left = place((2, 15, 4, 20))
    centrally_aligned_overlapping = place((5, 13, 7, 17))
    # These two are on the plane only as candidates that must be rejected.
    place((6, 0, 8, 5))  # not_aligned
    place((6, 10, 10, 15))  # wrong_width

    expected = [
        line,
        bottom_aligned_right,
        top_aligned_left,
        centrally_aligned_overlapping,
    ]
    self.assertCountEqual(
        line.find_neighbors(plane, laparams.line_margin),
        expected,
    )
class Sheet1(object):
    """Accumulate table edges and text from layout objects and rebuild rows.

    Text items are stored in a ``Plane`` so they can be looked up by bounding
    box. Line and rectangle edges are tallied by coordinate (rounded to two
    decimals). ``extract_rows`` then walks the grid formed by those edges and
    joins the text found in each cell.

    NB: row (y axis) coordinates are stored negated, see ``add_ltcontainer``.
    """

    # Class-level placeholders; __init__ rebinds all four on every instance.
    cells = None
    text_layout = None
    column_edges = None
    row_edges = None

    def __init__(self):
        self.cells = Plane()
        self.text_layout = Plane()
        # Map of rounded coordinate -> number of times that edge was seen.
        self.row_edges = {}
        self.column_edges = {}

    def add_cell(self, cell):
        """Add ``cell`` to the plane of cells."""
        self.cells.add(cell)

    def add_text(self, cell_text):
        """Add ``cell_text`` to the plane that ``extract_rows`` searches."""
        self.text_layout.add(cell_text)

    def add_column_edge(self, x_value):
        """Count one more column edge at ``x_value`` (rounded to 2 decimals)."""
        x = round(x_value, 2)
        self.column_edges[x] = 1 + self.column_edges.get(x, 0)

    def add_row_edge(self, y_value):
        """Count one more row edge at ``y_value`` (rounded to 2 decimals)."""
        y = round(y_value, 2)
        self.row_edges[y] = 1 + self.row_edges.get(y, 0)

    def add_line(self, bbox):
        """Register a line as a column edge (vertical) or row edge (horizontal).

        Args:
            bbox: ``(x0, y0, x1, y1)`` of the line. A line that is neither
                vertical nor horizontal is reported with a printed warning
                and otherwise ignored.
        """
        if bbox[0] == bbox[2]:  # vertical line
            self.add_column_edge(bbox[0])
        elif bbox[1] == bbox[3]:  # horizontal line
            self.add_row_edge(bbox[1])
        else:
            # Wrap bbox in a 1-tuple: passing the 4-tuple directly as the
            # format argument raises TypeError ("not all arguments
            # converted") instead of printing the warning.
            print('WARNING: non-orthogonal line found: %s' % (bbox,))

    def add_rect(self, bbox):
        """Register both vertical and both horizontal sides of a rectangle."""
        self.add_column_edge(bbox[0])
        self.add_column_edge(bbox[2])
        self.add_row_edge(bbox[1])
        self.add_row_edge(bbox[3])

    def add_ltcontainer(self, obj, page_y_offset):
        """Dispatch a layout object (recursing into containers) to the sheet.

        Args:
            obj: Layout object exposing ``x0``, ``y0``, ``x1``, ``y1``.
                Text lines become ``CellText`` entries, lines and rects
                contribute edges, other containers are walked child by
                child, and anything else is ignored.
            page_y_offset: Value added to the y coordinates before negation.
        """
        # NB: row indexes (y axis) are negative! y0/y1 swap places so the
        # negated box still has its smaller y first.
        bbox = (
            round(obj.x0, 2),
            round(-(obj.y1 + page_y_offset), 2),
            round(obj.x1, 2),
            round(-(obj.y0 + page_y_offset), 2),
        )

        # Order matters: the text-line check must run before the generic
        # container check at the end of the chain.
        if isinstance(obj, LTTextLine):
            self.add_text(CellText(bbox, obj.get_text()))
        elif isinstance(obj, LTLine):
            self.add_line(bbox)
        elif isinstance(obj, LTRect):
            self.add_rect(bbox)
        elif isinstance(obj, LTContainer):
            for child in obj:
                self.add_ltcontainer(child, page_y_offset)

    def extract_rows(self):
        """Return the table as a list of rows, each a list of cell strings.

        Row and column edges are visited in ascending order. An edge closer
        than 1 unit to the previously accepted edge is skipped, so
        near-duplicate edges do not create empty slivers. For each remaining
        cell, the text items found by ``text_layout.find`` whose left edge
        (``x0``) lies within the cell's column bounds are stripped and joined
        with single spaces.

        Returns:
            ``[]`` when there are no row edges; otherwise one list per row
            band (empty lists when there are no column edges).
        """
        row_bounds = sorted(self.row_edges)
        col_bounds = sorted(self.column_edges)

        rows = []
        # NB: row indexes (y axis) are negative!
        # Start one unit before the first edge so the first edge itself
        # is not rejected by the "< 1" spacing check below.
        r0 = row_bounds[0] - 1 if row_bounds else 0
        for r1 in row_bounds:
            if r1 - r0 < 1:
                continue
            row = []
            c0 = 0
            for c1 in col_bounds:
                if c1 - c0 < 1:
                    continue
                # Get all text lines that intersect the bounds of this cell,
                # sorted from top to bottom.
                lines = sorted(
                    self.text_layout.find((c0, r0, c1, r1)),
                    key=lambda line: line.y0,
                )
                # Remove anything where the left edge is not inside the cell
                # and concatenate the rest.
                row.append(
                    ' '.join(t.text.strip() for t in lines if c0 <= t.x0 <= c1)
                )
                c0 = c1
            rows.append(row)
            r0 = r1
        return rows
def test_find_neighbors_horizontal(self):
    """Check which horizontal text lines ``find_neighbors`` reports for a line.

    Six horizontal lines are placed on a 50x50 plane. The reference line is
    expected to get back itself plus the three lines named ``*_aligned_*``;
    ``not_aligned`` and ``wrong_height`` must not be returned.
    """
    laparams = LAParams()
    plane = Plane((0, 0, 50, 50))

    def place(bbox):
        # Create a horizontal text line with the given bbox and register it
        # on the shared plane.
        text_line = LTTextLineHorizontal(laparams.word_margin)
        text_line.set_bbox(bbox)
        plane.add(text_line)
        return text_line

    line = place((10, 4, 20, 6))
    left_aligned_above = place((10, 6, 15, 8))
    right_aligned_below = place((15, 2, 20, 4))
    centrally_aligned_overlapping = place((13, 5, 17, 7))
    # These two are on the plane only as candidates that must be rejected.
    place((0, 6, 5, 8))  # not_aligned
    place((10, 6, 15, 10))  # wrong_height

    expected = [
        line,
        left_aligned_above,
        right_aligned_below,
        centrally_aligned_overlapping,
    ]
    self.assertCountEqual(
        line.find_neighbors(plane, laparams.line_margin),
        expected,
    )