def identify_scanned_page(self, boxes, page_bbox, page_width, page_height):
    """Return True when ``boxes`` form a single page-sized stack of strips.

    Boxes are clustered together when they share the same left and right
    edges (``bbox[0]`` and ``bbox[2]``) and the top of one (``bbox[3]``)
    rounds to the bottom of the other (``bbox[1]``).  The page is reported
    as scanned only when exactly one cluster results and the lowest-indexed
    box of that cluster starts left of the origin, at or below ``y == 0``,
    and ends within 5 units of ``page_width`` / ``page_height``.

    Args:
        boxes: sequence of objects exposing an indexable ``bbox`` with
            ``(x0, y0, x1, y1)`` in positions 0-3.
        page_bbox: accepted for interface compatibility; not used.
        page_width: page width compared against ``bbox[2]``.
        page_height: page height compared against ``bbox[3]``.

    Returns:
        bool: True if the single-cluster, page-sized condition holds.
    """
    count = len(boxes)
    cid2obj = [{i} for i in range(count)]  # cluster id -> member indices
    obj2cid = list(range(count))  # box index -> cluster id

    # One pass over all ordered pairs is enough: every merge relabels all
    # members of the absorbed cluster, so clusters are transitively closed
    # as soon as each matching pair has been visited once.
    for i1, b1 in enumerate(boxes):
        box1 = b1.bbox
        for i2, b2 in enumerate(boxes):
            cid1 = obj2cid[min(i1, i2)]
            cid2 = obj2cid[max(i1, i2)]
            if cid1 == cid2:
                # Already together (or the same box).  Merging a cluster
                # into itself would empty it, so skip.
                continue
            box2 = b2.bbox
            if (box1[0] == box2[0] and box1[2] == box2[2]
                    and round(box1[3]) == round(box2[1])):
                for member in cid2obj[cid2]:
                    obj2cid[member] = cid1
                cid2obj[cid1] |= cid2obj[cid2]
                cid2obj[cid2] = set()

    # sorted() makes "the first box of the cluster" deterministic
    # (lowest index) instead of depending on set iteration order.
    clusters = [[boxes[i] for i in sorted(members)]
                for members in cid2obj if members]
    if len(clusters) != 1:
        return False

    first = clusters[0][0].bbox
    return bool(first[0] < 0 and first[1] <= 0
                and abs(first[2] - page_width) <= 5
                and abs(first[3] - page_height) <= 5)
def group_textlines(self, laparams, lines):
    """Group neighbouring text lines into text boxes, ignoring blank lines.

    Patched variant: lines whose text is only whitespace are never grouped,
    and the line margin is tightened per line to the smallest (scaled)
    vertical gap to any neighbour found with ``laparams.line_margin``.

    Yields each non-empty box once, in the order its first line appears
    in ``lines``.
    """
    plane = Plane(self.bbox)
    plane.extend(lines)
    line_to_box = {}

    for line in lines:
        nearby = line.find_neighbors(plane, laparams.line_margin)
        if line not in nearby or not line.get_text().strip():
            continue

        # Paragraph-specific margin: never larger than the configured one.
        margin_candidates = [laparams.line_margin]
        for other in nearby:
            if other is line:
                continue
            gap = min(abs(other.y0 - line.y1), abs(other.y1 - line.y0))
            margin_candidates.append(gap * 1.05 / line.height)
        tight_margin = min(margin_candidates)

        nearby = line.find_neighbors(plane, tight_margin)
        if line not in nearby:
            continue

        members = []
        for other in nearby:
            if other.get_text().strip():
                members.append(other)
                if other in line_to_box:
                    # Absorb the box this neighbour already belonged to.
                    members.extend(line_to_box.pop(other))

        if isinstance(line, LTTextLineHorizontal):
            box = LTTextBoxHorizontal()
        else:
            box = LTTextBoxVertical()
        for member in uniq(members):
            box.add(member)
            line_to_box[member] = box

    emitted = set()
    for line in lines:
        if line in line_to_box:
            box = line_to_box[line]
            if box not in emitted:
                emitted.add(box)
                if not box.is_empty():
                    yield box
def group_textlines(self, laparams: LAParams,
                    lines: List[LTTextContainer]) -> Generator:
    """Group text lines into boxes, using the resource manager's classes.

    Horizontal extended lines are classified through ``self.rsrcmgr`` (when
    set); the classification both picks the box type and updates the
    ``after_title`` / ``after_abstract`` / ``after_ref`` flags on the
    resource manager.  Title and section-header boxes contain only their
    own line; every other line is grouped with the neighbours returned by
    ``find_neighbors_with_rsrcmgr``.

    Yields each non-empty box once, in the order its first line appears
    in ``lines``.
    """
    plane = Plane(self.bbox)
    plane.extend(lines)
    line_to_box: Dict[LTText, LTTextBox] = {}

    for line in lines:
        if not isinstance(line, LTTextLineHorizontalExtended):
            box = LTTextBoxVertical()
        else:
            box = LTTextBoxHorizontal()
            if self.rsrcmgr:
                klass = line.maybe_classify(self.rsrcmgr)
                if klass == LTTitle:
                    self.rsrcmgr.after_title = True
                elif not self.rsrcmgr.after_abstract \
                        and klass == LTSectionHeader:
                    # First section header seen marks the end of the abstract.
                    self.rsrcmgr.after_abstract = True
                elif klass == LTSectionHeader \
                        and 'references' in line.get_text().lower():
                    self.rsrcmgr.after_ref = True
                box = klass()

        if isinstance(box, (LTTitle, LTSectionHeader)):
            # Headings are never merged with surrounding lines.
            neighbors = [line]
        else:
            neighbors = line.find_neighbors_with_rsrcmgr(
                plane, laparams.line_margin, self.rsrcmgr)
            if line not in neighbors:
                continue

        members = []
        for neighbor in neighbors:
            members.append(neighbor)
            if neighbor in line_to_box:
                members.extend(line_to_box.pop(neighbor))
        for member in uniq(members):
            box.add(member)
            line_to_box[member] = box

    emitted: Set[LTTextBox] = set()
    for line in lines:
        if line in line_to_box:
            box = line_to_box[line]
            if box not in emitted:
                emitted.add(box)
                if not box.is_empty():
                    yield box
def find_neighbors_with_rsrcmgr(
        self, plane: Plane, ratio: float,
        rsrcmgr: PaperResourceManager) -> List[Union[LTItem, LTText]]:
    """Return the lines in ``plane`` that should be grouped with this line.

    The search window is this line's bbox grown vertically by
    ``ratio * self.height``.  A candidate is kept when it is an
    ``LTTextLineHorizontalExtended`` with the same classification as this
    line and either (a) its height differs by less than the margin, its
    font is similar and its x position is similar, or (b) the shared
    classification is one of author / page margin / citation box / footer.
    """
    d = ratio * self.height
    candidates = plane.find((self.x0, self.y0 - d, self.x1, self.y1 + d))
    own_class = self.maybe_classify(rsrcmgr)

    kept = []
    for obj in candidates:
        if not isinstance(obj, LTTextLineHorizontalExtended):
            continue
        if not (own_class == obj.maybe_classify(rsrcmgr)):
            continue
        looks_alike = (abs(obj.height - self.height) < d
                       and self.is_font_similar(obj)
                       and self.is_x_similar(obj, d))
        if looks_alike or own_class in [LTAuthor, LTPageMargin,
                                        LTCitationBox, LTFooter]:
            kept.append(obj)
    return kept
def given_plane_with_one_object(object_size=50, gridsize=50):
    """Build a 100x100 plane holding one square component at the origin.

    Returns:
        A ``(plane, component)`` tuple; the component spans
        ``(0, 0, object_size, object_size)`` and has been added to the plane.
    """
    plane = Plane((0, 0, 100, 100), gridsize)
    component = LTComponent((0, 0, object_size, object_size))
    plane.add(component)
    return plane, component
def __init__(self, mentions, lines, region, min_cell_size=6.0):
    """
    Build the cell grid of a table region and assign mentions to cells.

    Grid coordinates come from the centres of the vertical / horizontal
    ``lines`` plus the region's own edges, thinned by ``_retain_centroids``
    with ``min_cell_size``.  Adjacent grid slots that are not separated by
    a bar (per ``_mark_grid_bounds``) share one ``Cell``.  Each cell then
    receives the ``mentions`` whose centre lies inside its bounding box.
    """
    self.min_cell_size = min_cell_size

    vlines, hlines = _split_vlines_hlines(lines)
    x_centers = [v.xc for v in vlines]
    y_centers = [h.yc for h in hlines]

    # Drop closely clustered coordinates; adding the region edges
    # guarantees at least one column and one row span.
    self.xs = _retain_centroids(x_centers + [region.x0, region.x1],
                                min_cell_size)
    self.ys = _retain_centroids(y_centers + [region.y0, region.y1],
                                min_cell_size)

    self.xranges = list(zip(self.xs, self.xs[1:]))
    self.yranges = list(zip(self.ys, self.ys[1:]))
    self.num_cols = len(self.xranges)
    self.num_rows = len(self.yranges)

    # One object slot per (row, col); filled with Cell instances below.
    self._grid = np.full([self.num_rows, self.num_cols],
                         None,
                         dtype=np.dtype(object))
    grid = self._grid

    # Which slot boundaries are backed by an actual drawn line.
    line_plane = Plane(region.bbox)
    line_plane.extend(lines)
    vbars, hbars = self._mark_grid_bounds(line_plane, region)

    cells = []
    for row in range(self.num_rows):
        for col in range(self.num_cols):
            if grid[row, col]:
                continue  # slot already owned by a cell
            if row > 0 and not hbars[row, col]:
                # No bar above: extend the cell above downwards.
                cell = grid[row - 1, col]
                grid[row, col] = cell
                cell.rowend = row + 1
            elif col > 0 and not vbars[row, col]:
                # No bar to the left: extend the left cell rightwards.
                cell = grid[row, col - 1]
                grid[row, col] = cell
                cell.colend = col + 1
            else:
                cell = Cell([row, col])
                grid[row, col] = cell
                cells.append(cell)

    # Fill each cell with the mentions centred inside its boundary.
    text_plane = Plane(region.bbox)
    text_plane.extend(mentions)
    for cell in cells:
        left = self.xs[cell.colstart]
        right = self.xs[cell.colend]
        bottom = self.ys[cell.rowstart]
        top = self.ys[cell.rowend]
        bbox = (left, bottom, right, top)
        cell.texts = [
            m for m in text_plane.find(bbox)
            if inside(bbox, (m.xc, m.yc) * 2)
        ]

    # TODO: provide HTML conversion here
    self.get_normalized_grid()
def __init__(self):
    """Create an empty sheet: two empty planes and two empty edge maps."""
    self.cells, self.text_layout = Plane(), Plane()
    self.row_edges, self.column_edges = {}, {}
class Sheet1(object):
    """Collects text and ruling lines of a page and rebuilds table rows.

    Column / row edges are stored as ``{coordinate rounded to 2 dp: count}``.
    Objects added through ``add_ltcontainer`` have their y axis negated, so
    row coordinates are negative and sort from the top of the page down.
    """

    cells = None
    text_layout = None
    column_edges = None
    row_edges = None

    def __init__(self):
        self.cells = Plane()
        self.text_layout = Plane()
        self.row_edges = {}
        self.column_edges = {}

    def add_cell(self, cell):
        """Add ``cell`` to the cell plane."""
        self.cells.add(cell)

    def add_text(self, cell_text):
        """Add a text item to the text-layout plane."""
        self.text_layout.add(cell_text)

    def add_column_edge(self, x_value):
        """Count one more occurrence of a column edge at ``x_value``."""
        x = round(x_value, 2)
        self.column_edges[x] = 1 + self.column_edges.get(x, 0)

    def add_row_edge(self, y_value):
        """Count one more occurrence of a row edge at ``y_value``."""
        y = round(y_value, 2)
        self.row_edges[y] = 1 + self.row_edges.get(y, 0)

    def add_line(self, bbox):
        """Register a vertical or horizontal line as a column / row edge.

        Lines that are neither vertical nor horizontal are reported on
        stdout and otherwise ignored.
        """
        if bbox[0] == bbox[2]:  # vertical line
            self.add_column_edge(bbox[0])
        elif bbox[1] == bbox[3]:  # horizontal line
            self.add_row_edge(bbox[1])
        else:
            # Wrap in a 1-tuple: ``'%s' % bbox`` with a 4-tuple bbox would
            # raise TypeError instead of printing the warning.
            print('WARNING: non-orthogonal line found: %s' % (bbox,))

    def add_rect(self, bbox):
        """Register all four sides of a rectangle as edges."""
        self.add_column_edge(bbox[0])
        self.add_column_edge(bbox[2])
        self.add_row_edge(bbox[1])
        self.add_row_edge(bbox[3])

    def add_ltcontainer(self, obj, page_y_offset):
        """Recursively add a layout object's text lines, lines and rects.

        The y axis is negated (after adding ``page_y_offset``), so row
        indexes are negative.
        """
        bbox = (
            round(obj.x0, 2),
            round(-(obj.y1 + page_y_offset), 2),
            round(obj.x1, 2),
            round(-(obj.y0 + page_y_offset), 2),
        )

        if isinstance(obj, LTTextLine):
            self.add_text(CellText(bbox, obj.get_text()))
        elif isinstance(obj, LTLine):
            self.add_line(bbox)
        elif isinstance(obj, LTRect):
            self.add_rect(bbox)
        elif isinstance(obj, LTContainer):
            for child in obj:
                self.add_ltcontainer(child, page_y_offset)

    def extract_rows(self):
        """Return the table as a list of rows of cell strings.

        Consecutive row / column edges closer than 1 unit to the previous
        accepted edge are skipped.  Each cell is the space-joined, stripped
        text of the items found in its bounds whose left edge lies inside
        the cell's x range, ordered by ``y0``.
        """
        row_bounds = sorted(self.row_edges)
        col_bounds = sorted(self.column_edges)

        rows = []
        # Start just above the first edge (row indexes are negative).
        r0 = row_bounds[0] - 1 if row_bounds else 0
        for r1 in row_bounds:
            if r1 - r0 < 1:
                continue
            row = []
            c0 = 0
            for c1 in col_bounds:
                if c1 - c0 < 1:
                    continue
                # Text items intersecting the cell, top to bottom.
                found = sorted(self.text_layout.find((c0, r0, c1, r1)),
                               key=lambda item: item.y0)
                # Keep only items whose left edge is inside the cell.
                row.append(' '.join(
                    t.text.strip() for t in found if c0 <= t.x0 <= c1))
                c0 = c1
            rows.append(row)
            r0 = r1
        return rows
def test_find_neighbors_vertical(self):
    """find_neighbors on a vertical line keeps aligned, same-width lines.

    The reference line, a bottom-aligned line to its right, a top-aligned
    line to its left and a centrally overlapping line are expected; the
    line far below and the one with a different width are not.
    """
    laparams = LAParams()
    plane = Plane((0, 0, 50, 50))

    def place(bbox):
        # Create a vertical line with the given bbox and register it.
        item = LTTextLineVertical(laparams.word_margin)
        item.set_bbox(bbox)
        plane.add(item)
        return item

    line = place((4, 10, 6, 20))
    bottom_aligned_right = place((6, 10, 8, 15))
    top_aligned_left = place((2, 15, 4, 20))
    centrally_aligned_overlapping = place((5, 13, 7, 17))
    place((6, 0, 8, 5))  # not aligned
    place((6, 10, 10, 15))  # wrong width

    expected = [
        line,
        bottom_aligned_right,
        top_aligned_left,
        centrally_aligned_overlapping,
    ]
    neighbors = line.find_neighbors(plane, laparams.line_margin)
    self.assertCountEqual(neighbors, expected)
def test_find_neighbors_horizontal(self):
    """find_neighbors on a horizontal line keeps aligned, same-height lines.

    The reference line, a left-aligned line above it, a right-aligned line
    below it and a centrally overlapping line are expected; the line far
    to the left and the one with a different height are not.
    """
    laparams = LAParams()
    plane = Plane((0, 0, 50, 50))

    def place(bbox):
        # Create a horizontal line with the given bbox and register it.
        item = LTTextLineHorizontal(laparams.word_margin)
        item.set_bbox(bbox)
        plane.add(item)
        return item

    line = place((10, 4, 20, 6))
    left_aligned_above = place((10, 6, 15, 8))
    right_aligned_below = place((15, 2, 20, 4))
    centrally_aligned_overlapping = place((13, 5, 17, 7))
    place((0, 6, 5, 8))  # not aligned
    place((10, 6, 15, 10))  # wrong height

    expected = [
        line,
        left_aligned_above,
        right_aligned_below,
        centrally_aligned_overlapping,
    ]
    neighbors = line.find_neighbors(plane, laparams.line_margin)
    self.assertCountEqual(neighbors, expected)
def cluster_vertically_aligned_boxes(boxes, page_bbox, avg_font_pts, width,
                                     char_width, boxes_segments, boxes_curves,
                                     boxes_figures, page_width, combine):
    """Cluster text boxes into candidate tables and compute 17 features.

    Stages, in order:
      1. cluster boxes that are vertically close and horizontally
         aligned/overlapping (used only to build ``not_merge``);
      2. cluster boxes into rows (``rid``) by similar y coordinates;
      3. build ``not_merge``: same-column pairs in rows whose total text
         widths differ by more than 10% of ``width``;
      4. re-cluster from scratch: first boxes close together within a row,
         then vertically aligned boxes (skipping ``not_merge`` pairs);
      5. blacklist clusters of certain widths, merge clusters whose spans
         are aligned with nothing in between, then merge clusters that
         share rows ("columns of a table");
      6. wrap clusters in ``Node`` objects, let ``merge_nodes`` merge them,
         and accumulate the per-cluster feature counters.

    Args:
        boxes: sequence of objects with an indexable ``bbox``
            (x0, y0, x1, y1 in positions 0-3).
        page_bbox: bounding box handed to ``Plane``.
        avg_font_pts: scale for the vertical proximity thresholds.
        width: width used to normalise row text widths and cluster spans.
        char_width: scale for the horizontal proximity thresholds.
        boxes_segments, boxes_curves, boxes_figures: objects with ``bbox``,
            counted per node when they intersect the node's bbox.
        page_width: not referenced in the body.
        combine: when ``== True`` return page-level summed features.

    Returns:
        ``[]`` if ``boxes`` is empty or has more than 3500 entries;
        ``([], node_features)`` (one summed 17-element list) if ``combine``;
        otherwise ``(tables, table_features)`` with one 17-element list per
        table node.

    NOTE(review): the early ``return []`` is not a 2-tuple like the other
    returns -- confirm callers handle it.
    NOTE(review): uses ``xrange`` and assigns into the result of
    ``range(...)``, so this only runs on Python 2 as written.
    NOTE(review): block nesting was reconstructed from whitespace-collapsed
    source; verify against version control before relying on it.
    """
    # Too many "." in the Table of Content pages
    if (len(boxes) == 0 or len(boxes) > 3500):
        return []
    plane = Plane(page_bbox)
    plane.extend(boxes)
    cid2obj = [set([i]) for i in xrange(len(boxes))]  # initialize clusters
    obj2cid = range(
        len(boxes))  # default object map to cluster with its own index
    # NOTE(review): prev_clusters is the same object as obj2cid (no copy),
    # so the equality check at the bottom of the loop is always true and
    # each `while (True)` in this function performs exactly one pass.
    prev_clusters = obj2cid
    while (True):
        for i1, b1 in enumerate(boxes):
            for i2, b2 in enumerate(boxes):
                if ((i1 == i2) or (obj2cid[i1] == obj2cid[i2])):
                    continue
                # Order the pair so that box1 has the smaller y0.
                if (b1.bbox[1] < b2.bbox[1]):
                    box1 = b1.bbox
                    box2 = b2.bbox
                elif (b2.bbox[1] < b1.bbox[1]):
                    box1 = b2.bbox
                    box2 = b1.bbox
                else:
                    # horizontally aligned
                    continue
                if (
                        box2[1] < box1[3] or
                    (box2[1] - box1[1] < 1.5 * avg_font_pts) or
                    (box2[3] - box1[3] < 1.5 * avg_font_pts)
                ):  # can probably do better if we find the average space between words
                    if (abs(box1[0] - box2[0]) < 3 or
                            abs(box1[2] - box2[2]) < 3 or
                        (((box1[0] + box1[2]) / 2) == ((box2[0] + box2[2]) / 2))
                            or ((box1[0] < box2[0]) and (box1[2] > box2[0])) or
                        ((box1[0] > box2[0]) and
                         (box2[2] > box1[0]))):  # added center alignment
                        min_i = min(i1, i2)
                        max_i = max(i1, i2)
                        cid1 = obj2cid[min_i]
                        cid2 = obj2cid[max_i]
                        # move all objects from cluster cid2 to cid1
                        # reassign cluster ids for all such objects as well
                        for obj_iter in cid2obj[cid2]:
                            cid2obj[cid1].add(obj_iter)
                            obj2cid[obj_iter] = cid1
                        cid2obj[cid2] = set()
        if (prev_clusters == obj2cid):
            break
        prev_clusters = obj2cid
    # NOTE(review): this value is overwritten before it is read.
    clusters = [[boxes[i] for i in cluster]
                for cluster in filter(bool, cid2obj)]

    # Row clustering: boxes with near-equal y0, y1 or rounded vertical
    # centre share a row id.
    rid2obj = [set([i]) for i in xrange(len(boxes))]  # initialize clusters
    obj2rid = range(
        len(boxes))  # default object map to cluster with its own index
    prev_clusters = obj2rid
    while (True):
        for i1, b1 in enumerate(boxes):
            for i2, b2 in enumerate(boxes):
                if ((i1 == i2) or (obj2rid[i1] == obj2rid[i2])):
                    continue
                box1 = b1.bbox
                box2 = b2.bbox
                if ((abs(box1[1] - box2[1]) < 0.11 * avg_font_pts) or
                    ((abs(box1[3] - box2[3]) < 0.11 * avg_font_pts)) or
                    (round((box1[1] + box1[3]) / 2) == round(
                        (box2[1] + box2[3]) / 2))):
                    min_i = min(i1, i2)
                    max_i = max(i1, i2)
                    rid1 = obj2rid[min_i]
                    rid2 = obj2rid[max_i]
                    for obj_iter in rid2obj[rid2]:
                        rid2obj[rid1].add(obj_iter)
                        obj2rid[obj_iter] = rid1
                    rid2obj[rid2] = set()
        if (prev_clusters == obj2rid):
            break
        prev_clusters = obj2rid

    # Pairs from the same first-stage cluster but different rows whose rows
    # carry very different amounts of text must not be merged later.
    not_merge = set()
    for i1, b1 in enumerate(boxes):
        for i2 in cid2obj[obj2cid[i1]]:
            if (i1 == i2):
                continue
            row1 = obj2rid[i1]
            row2 = obj2rid[i2]
            if (row1 == row2):
                continue
            # NOTE(review): b2 is not bound by this loop; it still holds the
            # last value from the row-clustering loop above (boxes[-1]).
            # Probably boxes[i2] was intended -- confirm.  box1/box2 are not
            # read afterwards; only the `continue` branch has an effect.
            if (b1.bbox[1] < b2.bbox[1]):
                box1 = b1.bbox
                box2 = b2.bbox
            elif (b2.bbox[1] < b1.bbox[1]):
                box1 = b2.bbox
                box2 = b1.bbox
            else:
                # horizontally aligned
                continue
            # Total box width per row.
            text_1 = 0.0
            for obj in rid2obj[row1]:
                text_1 += boxes[obj].bbox[2] - boxes[obj].bbox[0]
            text_2 = 0.0
            for obj in rid2obj[row2]:
                text_2 += boxes[obj].bbox[2] - boxes[obj].bbox[0]
            if (abs(text_1 - text_2) / width > 0.1):
                min_i = min(i1, i2)
                max_i = max(i1, i2)
                not_merge.add((min_i, max_i))

    # Alignment Features
    # If text boxes are very close in a row
    if_row_connected = defaultdict(int)
    num_row_connected = defaultdict(lambda: 1)
    # If text is merged using span code in adjacent rows, this feature tells
    # the number of times the cluster went through span based clustering
    if_connected_by_span = defaultdict(int)
    num_connected_by_span = defaultdict(lambda: 1)
    # If columns were merged using cluster alignment
    if_connected_by_align = defaultdict(int)
    num_connected_by_align = defaultdict(lambda: 1)
    # If vertical columns were merged
    if_vertical_columns_merged = defaultdict(int)
    num_vertical_columns_merged = defaultdict(lambda: 1)
    # Number of Line Segments, Curves and Figures
    num_segments = defaultdict(int)
    num_curves = defaultdict(int)
    num_figures = defaultdict(int)
    # Average Word Space
    total_word_space = defaultdict(float)
    avg_word_space = defaultdict(float)
    avg_word_space_norm = defaultdict(float)
    node_space = defaultdict(float)
    avg_node_space = defaultdict(float)
    avg_node_space_norm = defaultdict(float)

    # Clustering restarts from singletons here; the first-stage clusters
    # were only needed for not_merge.
    cid2obj = [set([i]) for i in xrange(len(boxes))]  # initialize clusters
    obj2cid = range(
        len(boxes))  # default object map to cluster with its own index
    prev_clusters = obj2cid
    # add the code for merging close text boxes in particular row
    while (True):
        for i1, b1 in enumerate(boxes):
            for i2, b2 in enumerate(boxes):
                if ((i1 == i2) or (obj2cid[i1] == obj2cid[i2])):
                    continue
                box1 = b1.bbox
                box2 = b2.bbox
                if (obj2rid[i1] == obj2rid[i2]):
                    # Same row and horizontal gap of at most two characters.
                    if (((b1.bbox[0] < b2.bbox[0]) and
                         ((b2.bbox[0] - b1.bbox[2]) <= 2 * char_width)) or
                        ((b2.bbox[0] < b1.bbox[0]) and
                         ((b1.bbox[0] - b2.bbox[2]) <= 2 * char_width))):
                        min_i = min(i1, i2)
                        max_i = max(i1, i2)
                        cid1 = obj2cid[min_i]
                        cid2 = obj2cid[max_i]
                        for obj_iter in cid2obj[cid2]:
                            cid2obj[cid1].add(obj_iter)
                            obj2cid[obj_iter] = cid1
                        cid2obj[cid2] = set()
                        # Features
                        if_row_connected[cid1] = 1
                        if_row_connected[cid2] = 0
                        num_row_connected[cid1] += num_row_connected[cid2]
                        num_row_connected[cid2] = 0
        if (prev_clusters == obj2cid):
            break
        prev_clusters = obj2cid
    # vertical alignment code
    while (True):
        for i1, b1 in enumerate(boxes):
            for i2, b2 in enumerate(boxes):
                if ((i1 == i2) or (obj2cid[i1] == obj2cid[i2])):
                    continue
                if (b1.bbox[1] < b2.bbox[1]):
                    box1 = b1.bbox
                    box2 = b2.bbox
                elif (b2.bbox[1] < b1.bbox[1]):
                    box1 = b2.bbox
                    box2 = b1.bbox
                else:
                    # horizontally aligned
                    continue
                if (
                        box2[1] < box1[3] or
                    (box2[1] - box1[1] < 1.5 * avg_font_pts) or
                    (box2[3] - box1[3] < 1.5 * avg_font_pts)
                ):  # can probably do better if we find the average space between words
                    if (
                            abs(box1[0] - box2[0]) < 3 or
                            abs(box1[2] - box2[2]) < 3 or
                        (((box1[0] + box1[2]) / 2) == ((box2[0] + box2[2]) / 2))
                    ):  # unlike the first stage, horizontal overlap alone does not qualify here
                        min_i = min(i1, i2)
                        max_i = max(i1, i2)
                        if ((min_i, max_i) not in not_merge):
                            cid1 = obj2cid[min_i]
                            cid2 = obj2cid[max_i]
                            # move all objects from cluster cid2 to cid1
                            # reassign cluster ids for all such objects as well
                            for obj_iter in cid2obj[cid2]:
                                cid2obj[cid1].add(obj_iter)
                                obj2cid[obj_iter] = cid1
                            cid2obj[cid2] = set()
                            # Features
                            if_connected_by_span[cid1] = 1
                            if_connected_by_span[cid2] = 0
                            if (if_row_connected[cid1] == 1 or
                                    if_row_connected[cid2] == 1):
                                if_row_connected[cid1] = 1
                                num_row_connected[cid1] += num_row_connected[
                                    cid2]
                                num_row_connected[cid2] = 0
                                if_row_connected[cid2] = 0
                            num_connected_by_span[
                                cid1] = num_connected_by_span[
                                    cid1] + num_connected_by_span[cid2]
                            num_connected_by_span[cid2] = 0
        if (prev_clusters == obj2cid):
            break
        prev_clusters = obj2cid
    # blacklist nearly half-page wide clusters before horizontal merging
    # (also clusters wider than 90% of `width`)
    cid2obj2 = cid2obj[:]
    obj2cid2 = obj2cid[:]
    blacklist = set()
    blacklist_obj = set()
    for cid_iter in range(len(cid2obj2)):
        cid = cid2obj2[cid_iter]
        xmin = float("Inf")
        xmax = float("-Inf")
        for obj in cid:
            xmin = min(xmin, boxes[obj].bbox[0])
            xmax = max(xmax, boxes[obj].bbox[2])
        if (((xmax - xmin) > width / 2.75 and (xmax - xmin) < width / 2) or
            ((xmax - xmin) > 0.9 * width)):
            blacklist.add(cid_iter)
            for obj in cid:
                blacklist_obj.add(obj)
                # Also blacklist boxes of the same row that lie within the
                # cluster's horizontal extent.
                for obj_iter in rid2obj[obj2rid[obj]]:
                    if (boxes[obj_iter].bbox[0] >= xmin and
                            boxes[obj_iter].bbox[2] <= xmax):
                        blacklist_obj.add(obj_iter)
    # create a cluster span
    cid2span = {}
    for cid in range(len(cid2obj)):
        cid2span[cid] = {}
        cid2span[cid]["min_x"] = float("Inf")
        cid2span[cid]["min_y"] = float("Inf")
        cid2span[cid]["max_x"] = float("-Inf")
        cid2span[cid]["max_y"] = float("-Inf")
        for obj in cid2obj[cid]:
            cid2span[cid]["min_x"] = min(cid2span[cid]["min_x"],
                                         boxes[obj].bbox[0])
            cid2span[cid]["max_x"] = max(cid2span[cid]["max_x"],
                                         boxes[obj].bbox[2])
            cid2span[cid]["min_y"] = min(cid2span[cid]["min_y"],
                                         boxes[obj].bbox[1])
            cid2span[cid]["max_y"] = max(cid2span[cid]["max_y"],
                                         boxes[obj].bbox[3])
    # Alignment-based merging of whole clusters: cid2cid2[c] ends up holding
    # the cluster id that c should be merged into.
    cid2cid = {}
    cid_pair_compared = set()
    cid2cid2 = [cid for cid in range(len(cid2obj))]
    for i1, b1 in enumerate(boxes):
        for i2, b2 in enumerate(boxes):
            if (i1 == i2):
                continue
            if (i1 in blacklist_obj or i2 in blacklist_obj):
                continue
            cid1 = obj2cid[i1]
            cid2 = obj2cid[i2]
            if ((min(cid1, cid2), max(cid1, cid2)) in cid_pair_compared):
                continue
            if (cid1 == cid2):
                continue
            if (obj2rid[i1] == obj2rid[i2]):
                continue
            if (cid1 not in cid2cid):
                cid2cid[cid1] = set()
            if (cid2 not in cid2cid):
                cid2cid[cid2] = set()
            # box1 is the span of the cluster with the smaller min_y.
            if (cid2span[cid1]["min_y"] < cid2span[cid2]["min_y"]):
                box1 = [
                    cid2span[cid1]["min_x"], cid2span[cid1]["min_y"],
                    cid2span[cid1]["max_x"], cid2span[cid1]["max_y"]
                ]
                box2 = [
                    cid2span[cid2]["min_x"], cid2span[cid2]["min_y"],
                    cid2span[cid2]["max_x"], cid2span[cid2]["max_y"]
                ]
            else:
                box1 = [
                    cid2span[cid2]["min_x"], cid2span[cid2]["min_y"],
                    cid2span[cid2]["max_x"], cid2span[cid2]["max_y"]
                ]
                box2 = [
                    cid2span[cid1]["min_x"], cid2span[cid1]["min_y"],
                    cid2span[cid1]["max_x"], cid2span[cid1]["max_y"]
                ]
            # Skip cluster pairs whose spans overlap vertically.
            if (((box1[1] < box2[1]) and (box1[3] > box2[1])) or
                ((box1[1] > box2[1]) and (box1[1] < box2[3]))):
                continue
            cid_pair_compared.add((min(cid1, cid2), max(cid1, cid2)))
            query_rect = (min(box1[0], box2[0]), min(box1[1], box2[1]),
                          max(box1[2], box2[2]), max(box1[3], box2[3]))
            # "connected" = no box from a third cluster intersects the
            # rectangle enclosing both spans.
            connected = True
            for i3, b3 in enumerate(boxes):
                if ((i3 == i1) or (i3 == i2)):
                    continue
                if (obj2cid[i1] == obj2cid[i3] or obj2cid[i2] == obj2cid[i3]):
                    continue
                box3 = b3.bbox
                if (intersect(query_rect, box3)):
                    connected = False
                    break
            if (
                ((round(box1[0]) == round(box2[0]) or
                  round(box1[2]) == round(box2[2])) and connected) or
                (round((box1[0] + box1[2]) / 2) == round(
                    (box2[0] + box2[2]) / 2) and connected)
            ):  # left, right or centre alignment (rounded), with nothing in between
                cid2cid[min(cid1, cid2)].add(max(cid1, cid2))
                min_cid = min(cid1, cid2)
                max_cid = max(cid1, cid2)
                # NOTE(review): cid_iter iterates the *values* of cid2cid2
                # and is then used as an index into it -- confirm this is
                # the intended relabelling.
                for cid_iter in cid2cid2:
                    if (cid2cid2[cid_iter] == cid2cid2[max_cid]):
                        cid2cid2[cid_iter] = cid2cid2[min_cid]
    # post-process cid2cid
    # The slice copies are shallow: cid2obj2 shares its set objects with
    # cid2obj, so .add() below also changes the sets reachable via cid2obj.
    cid2obj2 = cid2obj[:]
    obj2cid2 = obj2cid[:]
    for cid in range(len(cid2cid2)):
        cid_merge = cid2cid2[cid]
        if (cid != cid_merge):
            for obj_iter in cid2obj2[cid]:
                cid2obj2[cid_merge].add(obj_iter)
                obj2cid2[obj_iter] = cid_merge
            cid2obj2[cid] = set()
            # Features
            if_connected_by_align[cid_merge] = 1
            if_connected_by_align[cid] = 0
            if (if_row_connected[cid_merge] == 1 or
                    if_row_connected[cid] == 1):
                if_row_connected[cid_merge] = 1
                num_row_connected[cid_merge] += num_row_connected[cid]
                num_row_connected[cid] = 0
                # NOTE(review): cid2 is left over from the loop above; the
                # parallel blocks reset the merged-away id (`cid`) -- confirm.
                if_row_connected[cid2] = 0
            if (if_connected_by_span[cid_merge] == 1 or
                    if_connected_by_span[cid] == 1):
                if_connected_by_span[cid_merge] = 1
                num_connected_by_span[cid_merge] += num_connected_by_span[cid]
                num_connected_by_span[cid] = 0
                if_connected_by_span[cid] = 0
            num_connected_by_align[cid_merge] += num_connected_by_align[cid]
            num_connected_by_align[cid] = 0
    # code to merge columns for table
    # Two clusters are merged when each has a box in a row where the other
    # also has a box (rows rid1 and rid2 below).
    prev_clusters = obj2cid2
    while (True):
        for obj1, b1 in enumerate(boxes):
            cid1 = obj2cid2[obj1]
            rid1 = obj2rid[obj1]
            if (cid1 in blacklist):
                continue
            if (obj1 in blacklist_obj):
                continue
            for obj2, b2 in enumerate(boxes):
                if (obj1 == obj2):
                    continue
                if (obj2cid2[obj2] == cid1):
                    rid2 = obj2rid[obj2]
                    if (rid1 == rid2):
                        continue
                    for obj3 in rid2obj[rid2]:
                        cid3 = obj2cid2[obj3]
                        if (obj3 in blacklist_obj):
                            continue
                        if (cid1 != cid3):
                            for obj4 in cid2obj2[cid3]:
                                if (obj4 == obj3):
                                    continue
                                if (obj2rid[obj4] == rid1):
                                    min_cid = min(cid1, cid3)
                                    max_cid = max(cid1, cid3)
                                    for obj_iter in cid2obj2[max_cid]:
                                        cid2obj2[min_cid].add(obj_iter)
                                        obj2cid2[obj_iter] = min_cid
                                    cid2obj2[max_cid] = set()
                                    # Features
                                    if_vertical_columns_merged[min_cid] = 1
                                    if_vertical_columns_merged[max_cid] = 0
                                    num_vertical_columns_merged[
                                        min_cid] += num_vertical_columns_merged[
                                            max_cid]
                                    num_vertical_columns_merged[max_cid] = 0
                                    if (if_row_connected[min_cid] == 1 or
                                            if_row_connected[max_cid] == 1):
                                        if_row_connected[min_cid] = 1
                                        num_row_connected[
                                            min_cid] += num_row_connected[
                                                max_cid]
                                        num_row_connected[max_cid] = 0
                                        if_row_connected[max_cid] = 0
                                    if (if_connected_by_span[min_cid] == 1 or
                                            if_connected_by_span[max_cid] == 1):
                                        if_connected_by_span[min_cid] = 1
                                        num_connected_by_span[
                                            min_cid] += num_connected_by_span[
                                                max_cid]
                                        num_connected_by_span[max_cid] = 0
                                        if_connected_by_span[max_cid] = 0
                                    if (if_connected_by_align[min_cid] == 1 or
                                            if_connected_by_align[max_cid] == 1):
                                        if_connected_by_align[min_cid] = 1
                                        num_connected_by_align[
                                            min_cid] += num_connected_by_align[
                                                max_cid]
                                        num_connected_by_align[max_cid] = 0
                                        if_connected_by_align[max_cid] = 0
                                    # Stop iterating the set that was just
                                    # merged/modified.
                                    break
        if (prev_clusters == obj2cid2):
            break
        prev_clusters = obj2cid2
    clusters = [[boxes[i] for i in cluster]
                for cluster in filter(bool, cid2obj2)]
    nodes = [Node(elems) for elems in clusters]
    # node_indices[k] is the cluster id behind nodes[k].
    node_indices = [i for i, x in enumerate(cid2obj2) if x]
    # for idx in range(len(nodes)):
    #     print idx, node_indices[idx], nodes[idx]
    merge_indices = [i for i in range(len(node_indices))]
    page_stat = Node(boxes)
    nodes, merge_indices = merge_nodes(nodes, plane, page_stat, merge_indices)
    # Features
    # Fold the counters of every node that merge_nodes merged away
    # (merge_indices[idx] != idx) into the node it was merged into.
    for idx in range(len(merge_indices)):
        if (merge_indices[idx] != idx):
            cid1 = node_indices[merge_indices[idx]]
            cid2 = node_indices[idx]
            if (if_row_connected[cid1] == 1 or if_row_connected[cid2] == 1):
                if_row_connected[cid1] = 1
                num_row_connected[cid1] += num_row_connected[cid2]
                num_row_connected[cid2] = 0
                if_row_connected[cid2] = 0
            if (if_connected_by_span[cid1] == 1 or
                    if_connected_by_span[cid2] == 1):
                if_connected_by_span[cid1] = 1
                num_connected_by_span[cid1] += num_connected_by_span[cid2]
                num_connected_by_span[cid2] = 0
                if_connected_by_span[cid2] = 0
            if (if_connected_by_align[cid1] == 1 or
                    if_connected_by_align[cid2] == 1):
                if_connected_by_align[cid1] = 1
                num_connected_by_align[cid1] += num_connected_by_align[cid2]
                num_connected_by_align[cid2] = 0
                if_connected_by_align[cid2] = 0
            if (if_vertical_columns_merged[cid1] == 1 or
                    if_vertical_columns_merged[cid2] == 1):
                if_vertical_columns_merged[cid1] = 1
                num_vertical_columns_merged[
                    cid1] += num_vertical_columns_merged[cid2]
                num_vertical_columns_merged[cid2] = 0
                if_vertical_columns_merged[cid2] = 0
    # Get Word Spacing Features
    rid2space = defaultdict(float)
    rid2space_norm = defaultdict(float)
    row_indices = [i for i, x in enumerate(rid2obj) if x]
    for rid in row_indices:
        obj_list = list(rid2obj[rid])
        if (len(obj_list) == 1):
            rid2space[rid] = 0
            continue
        obj_boxes = [boxes[obj].bbox[0] for obj in obj_list]
        # Positions in obj_list ordered by x0.
        sorted_obj_idx = [
            i[0] for i in sorted(enumerate(obj_boxes), key=lambda x: x[1])
        ]
        # NOTE(review): this adds (next box x1 - current box x0), i.e. the
        # span of both boxes, not the gap between them -- confirm intent.
        for obj_idx in range(len(sorted_obj_idx) - 1):
            rid2space[rid] += boxes[obj_list[sorted_obj_idx[obj_idx + 1]]].bbox[2] - \
                boxes[obj_list[sorted_obj_idx[obj_idx]]].bbox[0]
        rid2space_norm[rid] = rid2space[rid] / (len(obj_list) - 1)
    for idx, node in enumerate(nodes):
        node_idx = node_indices[idx]
        if (merge_indices[idx] == idx):
            # All box indices of the clusters merged into this node.
            obj_list = []
            for idx_iter in range(len(merge_indices)):
                if (merge_indices[idx_iter] == idx):
                    obj_list += list(cid2obj2[node_indices[idx_iter]])
            obj_list = list(set(obj_list))
            rid_list = list(set([obj2rid[obj] for obj in obj_list]))
            for rid in rid_list:
                total_word_space[node_idx] += rid2space[rid]
                avg_word_space_norm[node_idx] += rid2space_norm[rid]
                # NOTE(review): rid2obj is the list of row sets, so `obj`
                # is a set here, not a box index; rid2obj[rid] was probably
                # intended.  As written the membership test compares a set
                # against integer members -- verify what this yields.
                obj_boxes = [
                    boxes[obj].bbox[0] for obj in rid2obj
                    if obj in cid2obj2[node_idx]
                ]
                sorted_obj_idx = [
                    i[0]
                    for i in sorted(enumerate(obj_boxes), key=lambda x: x[1])
                ]
                for obj_idx in range(len(sorted_obj_idx) - 1):
                    node_space[node_idx] += boxes[obj_list[sorted_obj_idx[obj_idx + 1]]].bbox[2] - \
                        boxes[obj_list[sorted_obj_idx[obj_idx]]].bbox[0]
                avg_node_space_norm[node_idx] += node_space[node_idx] / (
                    len(obj_boxes) - 1)
            avg_word_space[node_idx] = total_word_space[node_idx] / len(
                rid_list)
            avg_word_space_norm[node_idx] /= len(rid_list)
            avg_node_space[node_idx] = node_space[node_idx] / len(rid_list)
            avg_node_space_norm[node_idx] /= len(rid_list)
    # Keep only the nodes that survived merge_nodes.
    new_nodes = []
    new_node_indices = []
    for idx in range(len(merge_indices)):
        if (merge_indices[idx] == idx):
            new_nodes.append(nodes[idx])
            new_node_indices.append(node_indices[idx])
    nodes = new_nodes
    node_indices = new_node_indices
    # Features
    # Count segments, figures and curves intersecting each node's bbox.
    for idx, node in enumerate(nodes):
        node_idx = node_indices[idx]
        node_bbox = (node.x0, node.y0, node.x1, node.y1)
        for i1, b1 in enumerate(boxes_segments):
            if (intersect(node_bbox, b1.bbox)):
                num_segments[node_idx] += 1
        for i1, b1 in enumerate(boxes_figures):
            if (intersect(node_bbox, b1.bbox)):
                num_figures[node_idx] += 1
        for i1, b1 in enumerate(boxes_curves):
            if (intersect(node_bbox, b1.bbox)):
                num_curves[node_idx] += 1
    tables = []
    table_indices = []
    for idx, node in enumerate(nodes):
        node_idx = node_indices[idx]
        isTable = True
        if node.is_table():
            for elem in node.elems:
                if ("table" in elem.get_text().lower()):
                    continue
                # Reject the node if an element (other than one mentioning
                # "table") is nearly as wide as the node itself.
                if ((node.width - elem.bbox[2] + elem.bbox[0]) <
                        2 * char_width):
                    isTable = False
            if (isTable):
                tables.append(node)
                table_indices.append(node_idx)
    if (combine == True):
        # Page-level features: element-wise sum over all nodes.
        node_features = [0] * 17
        for idx, node in enumerate(nodes):
            node_idx = node_indices[idx]
            node_features = [
                sum(x) for x in zip(node_features, [
                    if_row_connected[node_idx], num_row_connected[node_idx],
                    if_connected_by_span[node_idx],
                    num_connected_by_span[node_idx],
                    if_connected_by_align[node_idx],
                    num_connected_by_align[node_idx],
                    if_vertical_columns_merged[node_idx],
                    num_vertical_columns_merged[node_idx],
                    num_segments[node_idx], num_curves[node_idx],
                    num_figures[node_idx], total_word_space[node_idx],
                    avg_word_space[node_idx], avg_word_space_norm[node_idx],
                    node_space[node_idx], avg_node_space[node_idx],
                    avg_node_space_norm[node_idx]
                ])
            ]
        return [], node_features
    else:
        # One 17-element feature list per detected table.
        table_features = []
        for idx, table in enumerate(tables):
            table_idx = table_indices[idx]
            table_features += [[
                if_row_connected[table_idx], num_row_connected[table_idx],
                if_connected_by_span[table_idx],
                num_connected_by_span[table_idx],
                if_connected_by_align[table_idx],
                num_connected_by_align[table_idx],
                if_vertical_columns_merged[table_idx],
                num_vertical_columns_merged[table_idx],
                num_segments[table_idx], num_curves[table_idx],
                num_figures[table_idx], total_word_space[table_idx],
                avg_word_space[table_idx], avg_word_space_norm[table_idx],
                node_space[table_idx], avg_node_space[table_idx],
                avg_node_space_norm[table_idx]
            ]]
        return tables, table_features