예제 #1
0
def collect_table_content(table_bboxes, elems):
    """
    Bucket layout elements by the table bbox they intersect.

    Returns one list per entry of ``table_bboxes``, in the same order. Each
    list holds the elements of ``elems`` assigned to that bbox, in input
    order. An element is assigned to at most one bbox; non-annotation
    elements that intersect no bbox are left out of the result.
    """
    contents_per_table = [[] for _ in table_bboxes]
    current_bbox = None
    current_bucket = None
    for elem in elems:
        # LTAnno items are not tested against any bbox: they simply follow
        # the bucket that is currently active (or are skipped when no table
        # has been matched yet).
        # NOTE(review): the active bucket is not reset when an element falls
        # outside every table, so annotations that come after such an element
        # still land in the last matched table -- confirm this is intended.
        if isinstance(elem, LTAnno):
            if current_bucket is not None:
                current_bucket.append(elem)
            continue

        # Fast path: table content usually arrives sequentially, so try the
        # active table first. Intersection (not containment) is the test
        # because font formatting can push text slightly outside the bbox.
        if current_bbox is None or not intersect(current_bbox, elem.bbox):
            # Slow path: scan every table and switch to the first one hit.
            for candidate_bbox, candidate_bucket in zip(
                table_bboxes, contents_per_table
            ):
                if intersect(candidate_bbox, elem.bbox):
                    current_bbox = candidate_bbox
                    current_bucket = candidate_bucket
                    break
            else:
                # No table intersects this element; keep the active table
                # unchanged and move on.
                continue

        current_bucket.append(elem)
    return contents_per_table
예제 #2
0
def _elems_intersecting(bbox, candidates):
    """Return the members of ``candidates`` whose ``bbox`` intersects ``bbox``."""
    return [
        candidate for candidate in candidates if intersect(bbox, candidate.bbox)
    ]


def get_alignment_features(line_bboxes, elems, font_stat):
    """
    Build one alignment-feature entry per line bbox.

    For every entry of ``line_bboxes`` the mentions, segments, figures and
    curves of ``elems`` that intersect it are collected and handed to
    ``cluster_vertically_aligned_boxes``; the features it returns become the
    entry for that line. A row of 17 zeros is used instead when no mention
    intersects the line or when the clustering returns no features.

    Side effect: the intersecting mentions and every element of
    ``elems.figures`` get ``id``, ``feats``, ``xc``, ``yc`` and the
    ``x0_grid`` / ``x1_grid`` / ``xc_grid`` / ``yc_grid`` attributes
    (re)assigned for each line that has at least one intersecting mention.

    Returns a list with one entry per element of ``line_bboxes``.
    """
    features_per_line = []
    for line_bbox in line_bboxes:
        # Only fields 3..6 of the line record are used, pairwise swapped.
        # NOTE(review): presumably this converts (y, x) pairs into the
        # (x0, y0, x1, y1) order that intersect() expects -- confirm.
        region = (line_bbox[4], line_bbox[3], line_bbox[6], line_bbox[5])
        line_mentions = _elems_intersecting(region, elems.mentions)
        line_segments = _elems_intersecting(region, elems.segments)
        line_figures = _elems_intersecting(region, elems.figures)
        line_curves = _elems_intersecting(region, elems.curves)

        page_width = elems.layout.width
        common_font_pts = get_most_common_font_pts(elems.mentions, font_stat)
        content_width = get_page_width(
            line_mentions + line_segments + line_figures + line_curves
        )
        if not line_mentions:
            # NOTE(review): 17 presumably matches the length of the feature
            # vector produced by the clustering step -- confirm.
            features_per_line.append([0] * 17)
            continue

        char_width = get_char_width(line_mentions)
        grid_size = common_font_pts / 2.0
        # NOTE(review): this annotates every figure in ``elems.figures``, not
        # only the ones intersecting the current line -- confirm intended.
        for index, m in enumerate(line_mentions + elems.figures):
            m.id = index
            m.feats = defaultdict(bool)
            # Text lines with a font name get font-specific feature keys.
            if isinstance(m, LTTextLine) and m.font_name:
                prefix = m.font_name + "-" + str(m.font_size) + "-"
            else:
                prefix = ""
            m.xc = (m.x0 + m.x1) / 2.0
            m.yc = (m.y0 + m.y1) / 2.0
            # Snap each coordinate to a grid of half the common font size;
            # the cell index is stored both as a feature and as the
            # x0_grid / x1_grid / xc_grid / yc_grid attribute.
            for coord in ("x0", "x1", "xc", "yc"):
                cell = getattr(m, coord) // grid_size
                m.feats[prefix + coord] = cell
                setattr(m, coord + "_grid", cell)

        nodes, nodes_features = cluster_vertically_aligned_boxes(
            line_mentions,
            elems.layout.bbox,
            common_font_pts,
            content_width,
            char_width,
            line_segments,
            line_curves,
            line_figures,
            page_width,
            True,
        )
        if len(nodes_features) == 0:
            features_per_line.append([0] * 17)
        else:
            features_per_line.append(nodes_features)
    return features_per_line