def collect_table_content(table_bboxes, elems):
    """
    Group elements by the table bounding box they intersect.

    Returns a list with one sub-list per entry of ``table_bboxes`` (same
    order). Each sub-list holds, in input order, the elements of ``elems``
    that were assigned to that bbox.
    """
    buckets = [[] for _ in table_bboxes]
    active_bbox = None
    active_bucket = None

    for elem in elems:
        if isinstance(elem, LTAnno):
            # An annotation is never matched on its own: it is attached to
            # the most recently matched table, or dropped if there is none.
            if active_bucket is not None:
                active_bucket.append(elem)
            continue

        # Table contents usually arrive sequentially, so the table matched
        # last is tried first to avoid scanning every bbox. Intersection
        # (rather than containment) is the membership criterion because
        # font formatting can push text slightly outside its table bbox.
        if active_bbox is not None and intersect(active_bbox, elem.bbox):
            active_bucket.append(elem)
            continue

        # The element left the active table: look through all tables and
        # make the first one it intersects the new active table.
        for bucket, candidate_bbox in zip(buckets, table_bboxes):
            if intersect(candidate_bbox, elem.bbox):
                active_bbox = candidate_bbox
                active_bucket = bucket
                bucket.append(elem)
                break

    return buckets
def get_alignment_features(line_bboxes, elems, font_stat):
    """
    Compute one alignment-feature entry for every bbox in ``line_bboxes``.

    For each line bbox, the mentions, segments, figures and curves of
    ``elems`` that intersect it are gathered and handed to
    ``cluster_vertically_aligned_boxes``. The entry for that line is the
    node-features value returned by the clustering, or a list of 17 zeros
    when no mention intersects the line or the clustering returns no
    features.

    Side effect: every intersecting mention and every element of
    ``elems.figures`` gets ``id``, ``feats``, ``xc``, ``yc`` and the
    ``x0_grid``/``x1_grid``/``xc_grid``/``yc_grid`` attributes (re)assigned.

    Returns the list of per-line entries, in the order of ``line_bboxes``.
    """

    def _touching(bbox, candidates):
        # Keep only the candidates whose bbox intersects ``bbox``.
        return [item for item in candidates if intersect(bbox, item.bbox)]

    features_per_line = []
    for raw_bbox in line_bboxes:
        # Re-order the coordinates stored at indices 3..6 of the line
        # record into the tuple layout that is passed to ``intersect``.
        query_bbox = (raw_bbox[4], raw_bbox[3], raw_bbox[6], raw_bbox[5])

        boxes = _touching(query_bbox, elems.mentions)
        boxes_segments = _touching(query_bbox, elems.segments)
        boxes_figures = _touching(query_bbox, elems.figures)
        boxes_curves = _touching(query_bbox, elems.curves)

        page_width = elems.layout.width
        avg_font_pts = get_most_common_font_pts(elems.mentions, font_stat)
        width = get_page_width(
            boxes + boxes_segments + boxes_figures + boxes_curves
        )

        if not boxes:
            # No text on this line: emit the all-zero placeholder entry.
            features_per_line.append([0] * 17)
            continue

        char_width = get_char_width(boxes)
        grid_size = avg_font_pts / 2.0

        # Note: all figures of ``elems`` are annotated here, not only the
        # ones intersecting the current line.
        for index, box in enumerate(boxes + elems.figures):
            box.id = index
            box.feats = defaultdict(bool)
            has_font = isinstance(box, LTTextLine) and box.font_name
            prefix = (
                box.font_name + "-" + str(box.font_size) + "-" if has_font else ""
            )
            box.xc = (box.x0 + box.x1) / 2.0
            box.yc = (box.y0 + box.y1) / 2.0
            # Snap each coordinate to the grid and record it both as a
            # (font-prefixed) feature and as a ``<coord>_grid`` attribute.
            for coord in ("x0", "x1", "xc", "yc"):
                cell = getattr(box, coord) // grid_size
                box.feats[prefix + coord] = cell
                setattr(box, coord + "_grid", cell)

        _, nodes_features = cluster_vertically_aligned_boxes(
            boxes,
            elems.layout.bbox,
            avg_font_pts,
            width,
            char_width,
            boxes_segments,
            boxes_curves,
            boxes_figures,
            page_width,
            True,
        )

        if len(nodes_features) == 0:
            features_per_line.append([0] * 17)
        else:
            features_per_line.append(nodes_features)

    return features_per_line