Exemplo n.º 1
0
 def identify_scanned_page(self, boxes, page_bbox, page_width, page_height):
     """Report whether ``boxes`` collapse into one page-sized column.

     Two boxes are linked when they share the same left and right edges and
     the rounded top of one equals the rounded bottom of the other.  Linked
     boxes are merged into clusters.  The page counts as scanned when exactly
     one cluster remains and the first member of that cluster (in set
     iteration order) starts left of x=0, at or below y=0, and ends within
     5 units of ``page_width`` and ``page_height``.

     :param boxes: sequence of objects exposing ``bbox`` as
         ``(x0, y0, x1, y1)``.
     :param page_bbox: not used by the computation; kept so existing callers
         keep working.
     :param page_width: expected right edge of a page-sized box.
     :param page_height: expected top edge of a page-sized box.
     :return: ``True`` when the page looks scanned, ``False`` otherwise.
     """
     # Every box starts in its own cluster, identified by its index.
     cid2obj = [{i} for i in range(len(boxes))]
     obj2cid = list(range(len(boxes)))

     # One sweep over all ordered pairs is enough: the link test depends only
     # on the (fixed) boxes, and every linked pair is unioned when visited.
     for i1, b1 in enumerate(boxes):
         box1 = b1.bbox
         for i2, b2 in enumerate(boxes):
             box2 = b2.bbox
             if not (box1[0] == box2[0] and box1[2] == box2[2]
                     and round(box1[3]) == round(box2[1])):
                 continue
             cid1 = obj2cid[min(i1, i2)]
             cid2 = obj2cid[max(i1, i2)]
             if cid1 == cid2:
                 # Already together (or a box paired with itself); merging
                 # would empty the cluster and lose its members.
                 continue
             # Move every member of cid2 into cid1 and re-point its id.
             for obj_iter in cid2obj[cid2]:
                 cid2obj[cid1].add(obj_iter)
                 obj2cid[obj_iter] = cid1
             cid2obj[cid2] = set()

     clusters = [[boxes[i] for i in cluster] for cluster in cid2obj if cluster]
     if len(clusters) != 1:
         return False

     first_bbox = clusters[0][0].bbox
     # NOTE(review): "< -0.0" is the same as "< 0", so a box starting exactly
     # at x == 0 is rejected while y == 0 is accepted -- confirm this is
     # intended.
     if (first_bbox[0] < -0.0
             and first_bbox[1] <= 0
             and abs(first_bbox[2] - page_width) <= 5
             and abs(first_bbox[3] - page_height) <= 5):
         return True
     return False
Exemplo n.º 2
0
def group_textlines(self, laparams, lines):
    """Group neighbouring text lines into text boxes and yield each box once.

    Patched class method: lines whose text is blank are never aggregated,
    and the line margin is tightened per line, at run time, from the gaps
    between the line and its initial neighbours.
    """
    plane = Plane(self.bbox)
    plane.extend(lines)
    line_to_box = {}
    for line in lines:
        candidates = line.find_neighbors(plane, laparams.line_margin)
        if not (line in candidates and line.get_text().strip()):
            continue

        # Tighten the margin to the smallest (slightly padded) vertical gap
        # to any other candidate, expressed relative to this line's height.
        gap_ratios = [
            min(abs(other.y0 - line.y1), abs(other.y1 - line.y0))
            * 1.05 / line.height
            for other in candidates
            if other is not line
        ]
        true_margin = min([laparams.line_margin] + gap_ratios)

        neighbors = line.find_neighbors(plane, true_margin)
        if line not in neighbors:
            continue

        # Collect the non-blank neighbours, absorbing any box they already
        # belong to so that overlapping groups end up merged.
        members = []
        for other in neighbors:
            if other.get_text().strip():
                members.append(other)
                if other in line_to_box:
                    members.extend(line_to_box.pop(other))

        if isinstance(line, LTTextLineHorizontal):
            box = LTTextBoxHorizontal()
        else:
            box = LTTextBoxVertical()
        for member in uniq(members):
            box.add(member)
            line_to_box[member] = box

    # Emit every distinct, non-empty box in the order its first line appears.
    emitted = set()
    for line in lines:
        if line in line_to_box:
            box = line_to_box[line]
            if box not in emitted:
                emitted.add(box)
                if not box.is_empty():
                    yield box
Exemplo n.º 3
0
 def group_textlines(self, laparams: LAParams,
                     lines: List[LTTextContainer]) -> Generator:
     """Group text lines into boxes, classifying horizontal lines on the way.

     When a resource manager is set, each horizontal line is classified and
     its box is an instance of the returned class; the manager's
     ``after_title`` / ``after_abstract`` / ``after_ref`` flags are raised as
     the matching lines are met.  Title and section-header boxes hold only
     their own line; every other box is built from the line's neighbours.
     Each distinct non-empty box is yielded once.
     """
     plane = Plane(self.bbox)
     plane.extend(lines)
     line_to_box: Dict[LTText, LTTextBox] = {}
     for line in lines:
         if not isinstance(line, LTTextLineHorizontalExtended):
             box = LTTextBoxVertical()
         else:
             box = LTTextBoxHorizontal()
             if self.rsrcmgr:
                 klass = line.maybe_classify(self.rsrcmgr)
                 if klass == LTTitle:
                     self.rsrcmgr.after_title = True
                 elif (not self.rsrcmgr.after_abstract
                       and klass == LTSectionHeader):
                     self.rsrcmgr.after_abstract = True
                 elif (klass == LTSectionHeader
                       and 'references' in line.get_text().lower()):
                     self.rsrcmgr.after_ref = True
                 # The classified type replaces the default horizontal box.
                 box = klass()

         if isinstance(box, (LTTitle, LTSectionHeader)):
             # Titles and section headers never absorb neighbouring lines.
             neighbors = [line]
         else:
             neighbors = line.find_neighbors_with_rsrcmgr(
                 plane, laparams.line_margin, self.rsrcmgr)
             if line not in neighbors:
                 continue

         # Pull in the members of any box a neighbour already belongs to.
         members = []
         for neighbor in neighbors:
             members.append(neighbor)
             if neighbor in line_to_box:
                 members.extend(line_to_box.pop(neighbor))
         for member in uniq(members):
             box.add(member)
             line_to_box[member] = box

     emitted: Set[LTTextBox] = set()
     for line in lines:
         if line in line_to_box:
             box = line_to_box[line]
             if box not in emitted:
                 emitted.add(box)
                 if not box.is_empty():
                     yield box
Exemplo n.º 4
0
 def find_neighbors_with_rsrcmgr(
         self, plane: Plane, ratio: float,
         rsrcmgr: PaperResourceManager) -> List[Union[LTItem, LTText]]:
     """Return the lines in ``plane`` that may be grouped with this line.

     The search window is this line's bbox grown vertically by
     ``ratio * self.height``.  A candidate is kept when it is an extended
     horizontal text line with the same classification as this line and
     either matches in height, font and x position, or the shared
     classification is author, page margin, citation box or footer.
     """
     d = ratio * self.height
     search_box = (self.x0, self.y0 - d, self.x1, self.y1 + d)
     candidates = plane.find(search_box)
     classification = self.maybe_classify(rsrcmgr)

     neighbors = []
     for obj in candidates:
         if not isinstance(obj, LTTextLineHorizontalExtended):
             continue
         if not (classification == obj.maybe_classify(rsrcmgr)):
             continue
         looks_alike = (abs(obj.height - self.height) < d
                        and self.is_font_similar(obj)
                        and self.is_x_similar(obj, d))
         # These classes are grouped on classification alone.
         if looks_alike or classification in [
                 LTAuthor, LTPageMargin, LTCitationBox, LTFooter]:
             neighbors.append(obj)
     return neighbors
Exemplo n.º 5
0
 def given_plane_with_one_object(object_size=50, gridsize=50):
     """Build a (0, 0, 100, 100) plane holding one square component.

     The component spans (0, 0) to (object_size, object_size).  Returns the
     ``(plane, component)`` pair.
     """
     obj = LTComponent((0, 0, object_size, object_size))
     plane = Plane((0, 0, 100, 100), gridsize)
     plane.add(obj)
     return plane, obj
Exemplo n.º 6
0
    def __init__(self, mentions, lines, region, min_cell_size=6.0):
        """
        Build the grid from ruling lines and assign mentions to its cells.

        The centres of the vertical/horizontal lines, together with the
        region's own edges, become the column/row boundaries.  Adjacent grid
        slots with no bar between them share one cell, and each cell then
        receives the mentions whose centre lies inside its bounds.
        """
        self.min_cell_size = min_cell_size
        vertical, horizontal = _split_vlines_hlines(lines)

        # Boundary candidates: line centres plus the region edges, so there
        # is always at least one column and one row spanning the table.
        x_candidates = [v.xc for v in vertical] + [region.x0, region.x1]
        y_candidates = [h.yc for h in horizontal] + [region.y0, region.y1]

        # Collapse closely clustered boundaries.
        self.xs = _retain_centroids(x_candidates, min_cell_size)
        self.ys = _retain_centroids(y_candidates, min_cell_size)

        self.xranges = list(zip(self.xs, self.xs[1:]))
        self.yranges = list(zip(self.ys, self.ys[1:]))

        self.num_cols = len(self.xranges)
        self.num_rows = len(self.yranges)

        # Grid contents: one slot per (row, column), initially empty.
        grid = self._grid = np.full([self.num_rows, self.num_cols],
                                    None,
                                    dtype=np.dtype(object))

        # Record which cell boundaries are actually drawn.
        line_plane = Plane(region.bbox)
        line_plane.extend(lines)
        vbars, hbars = self._mark_grid_bounds(line_plane, region)

        # Establish cell regions, scanning row by row.
        cells = []
        for i in range(self.num_rows):
            for j in range(self.num_cols):
                if grid[i, j]:
                    continue  # slot already belongs to a cell
                if i > 0 and not hbars[i, j]:
                    # No bar above: extend the cell above downwards.
                    cell = grid[i - 1, j]
                    cell.rowend = i + 1
                elif j > 0 and not vbars[i, j]:
                    # No bar to the left: extend the left cell rightwards.
                    cell = grid[i, j - 1]
                    cell.colend = j + 1
                else:
                    cell = Cell([i, j])
                    cells.append(cell)
                grid[i, j] = cell

        # Fill each cell with the mentions found inside its boundary.
        text_plane = Plane(region.bbox)
        text_plane.extend(mentions)

        for cell in cells:
            bbox = (self.xs[cell.colstart], self.ys[cell.rowstart],
                    self.xs[cell.colend], self.ys[cell.rowend])
            texts = []
            for m in text_plane.find(bbox):
                # Keep a mention only when its centre point is in the cell.
                if inside(bbox, (m.xc, m.yc, m.xc, m.yc)):
                    texts.append(m)
            cell.texts = texts

        # TODO: provide HTML conversion here

        self.get_normalized_grid()
Exemplo n.º 7
0
 def __init__(self):
     """Start with two empty planes (cells, text) and two empty edge dicts."""
     self.cells, self.text_layout = Plane(), Plane()
     self.row_edges, self.column_edges = {}, {}
Exemplo n.º 8
0
class Sheet1(object):
    """Accumulates table geometry and text, then slices the text into rows.

    Lines and rectangles contribute row/column edge positions (rounded to
    two decimals and counted per position); text lines are stored in a
    plane.  ``extract_rows`` cuts the text into cells using the sorted edges.
    """

    # Class-level placeholders; each instance replaces them in __init__.
    cells = None
    text_layout = None
    column_edges = None
    row_edges = None

    def __init__(self):
        self.cells = Plane()
        self.text_layout = Plane()
        self.row_edges = {}
        self.column_edges = {}

    def add_cell(self, cell):
        """Store ``cell`` in the cell plane."""
        self.cells.add(cell)

    def add_text(self, cell_text):
        """Store ``cell_text`` in the text plane."""
        self.text_layout.add(cell_text)

    def add_column_edge(self, x_value):
        """Count one more column edge at ``x_value`` rounded to 2 decimals."""
        x = round(x_value, 2)
        self.column_edges[x] = 1 + self.column_edges.get(x, 0)

    def add_row_edge(self, y_value):
        """Count one more row edge at ``y_value`` rounded to 2 decimals."""
        y = round(y_value, 2)
        self.row_edges[y] = 1 + self.row_edges.get(y, 0)

    def add_line(self, bbox):
        """Register a vertical line as a column edge or a horizontal line as
        a row edge; any other line only produces a printed warning."""
        if bbox[0] == bbox[2]:
            # vertical line
            self.add_column_edge(bbox[0])
        elif bbox[1] == bbox[3]:
            # horizontal line
            self.add_row_edge(bbox[1])
        else:
            # Wrap bbox in a 1-tuple: formatting a 4-tuple directly against a
            # single %s raises TypeError instead of printing the warning.
            print('WARNING: non-orthogonal line found: %s' % (bbox,))

    def add_rect(self, bbox):
        """Register all four sides of a rectangle as edges."""
        self.add_column_edge(bbox[0])
        self.add_column_edge(bbox[2])
        self.add_row_edge(bbox[1])
        self.add_row_edge(bbox[3])

    def add_ltcontainer(self, obj, page_y_offset):
        """Add a layout object, recursing into containers.

        Text lines become ``CellText`` entries, lines and rectangles become
        edges, other containers are walked child by child, and anything else
        is ignored.
        """
        # NB: row indexes (y axis) are negative!
        bbox = (
            round(obj.x0, 2),
            round(-(obj.y1 + page_y_offset), 2),
            round(obj.x1, 2),
            round(-(obj.y0 + page_y_offset), 2),
        )

        if isinstance(obj, LTTextLine):
            self.add_text(CellText(bbox, obj.get_text()))
        elif isinstance(obj, LTLine):
            self.add_line(bbox)
        elif isinstance(obj, LTRect):
            self.add_rect(bbox)
        elif isinstance(obj, LTContainer):
            for child in obj:
                self.add_ltcontainer(child, page_y_offset)

    def extract_rows(self):
        """Return the table as a list of rows, each a list of cell strings.

        Row and column bands are the gaps between consecutive sorted edges;
        bands narrower than 1 unit are skipped.  The first row band starts
        one unit before the lowest row edge and the first column band starts
        at 0.  A cell's text is the space-joined, stripped text of the lines
        found in its band whose left edge lies within the column band.
        """
        row_bounds = sorted(self.row_edges)
        col_bounds = sorted(self.column_edges)

        rows = []
        r0 = row_bounds[0] - 1 if row_bounds else 0

        # NB: row indexes (y axis) are negative!
        for r1 in row_bounds:
            if r1 - r0 < 1:
                continue

            row = []
            c0 = 0
            for c1 in col_bounds:
                if c1 - c0 < 1:
                    continue

                # All text lines intersecting this cell, top to bottom.
                lines = sorted(self.text_layout.find((c0, r0, c1, r1)),
                               key=lambda line: line.y0)

                # Drop anything whose left edge is outside the cell and
                # concatenate the rest.
                row.append(' '.join(
                    t.text.strip() for t in lines if c0 <= t.x0 <= c1))
                c0 = c1
            rows.append(row)
            r0 = r1
        return rows
Exemplo n.º 9
0
    def test_find_neighbors_vertical(self):
        """A vertical line's neighbours are itself plus the three lines
        named below; the last two lines added must not be returned."""
        laparams = LAParams()
        plane = Plane((0, 0, 50, 50))

        def place(bbox):
            # Create a vertical text line with the given bbox on the plane.
            item = LTTextLineVertical(laparams.word_margin)
            item.set_bbox(bbox)
            plane.add(item)
            return item

        line = place((4, 10, 6, 20))
        bottom_aligned_right = place((6, 10, 8, 15))
        top_aligned_left = place((2, 15, 4, 20))
        centrally_aligned_overlapping = place((5, 13, 7, 17))
        place((6, 0, 8, 5))  # not aligned
        place((6, 10, 10, 15))  # wrong width

        expected = [
            line,
            bottom_aligned_right,
            top_aligned_left,
            centrally_aligned_overlapping,
        ]
        neighbors = line.find_neighbors(plane, laparams.line_margin)
        self.assertCountEqual(neighbors, expected)
Exemplo n.º 10
0
    def test_find_neighbors_horizontal(self):
        """A horizontal line's neighbours are itself plus the three lines
        named below; the last two lines added must not be returned."""
        laparams = LAParams()
        plane = Plane((0, 0, 50, 50))

        def place(bbox):
            # Create a horizontal text line with the given bbox on the plane.
            item = LTTextLineHorizontal(laparams.word_margin)
            item.set_bbox(bbox)
            plane.add(item)
            return item

        line = place((10, 4, 20, 6))
        left_aligned_above = place((10, 6, 15, 8))
        right_aligned_below = place((15, 2, 20, 4))
        centrally_aligned_overlapping = place((13, 5, 17, 7))
        place((0, 6, 5, 8))  # not aligned
        place((10, 6, 15, 10))  # wrong height

        expected = [
            line,
            left_aligned_above,
            right_aligned_below,
            centrally_aligned_overlapping,
        ]
        neighbors = line.find_neighbors(plane, laparams.line_margin)
        self.assertCountEqual(neighbors, expected)
Exemplo n.º 11
0
def cluster_vertically_aligned_boxes(boxes, page_bbox, avg_font_pts, width,
                                     char_width, boxes_segments, boxes_curves,
                                     boxes_figures, page_width, combine):
    # Too many "." in the Table of Content pages
    if (len(boxes) == 0 or len(boxes) > 3500):
        return []
    plane = Plane(page_bbox)
    plane.extend(boxes)
    cid2obj = [set([i]) for i in xrange(len(boxes))]  # initialize clusters
    obj2cid = range(
        len(boxes))  # default object map to cluster with its own index
    prev_clusters = obj2cid
    while (True):
        for i1, b1 in enumerate(boxes):
            for i2, b2 in enumerate(boxes):
                if ((i1 == i2) or (obj2cid[i1] == obj2cid[i2])):
                    continue
                if (b1.bbox[1] < b2.bbox[1]):
                    box1 = b1.bbox
                    box2 = b2.bbox
                elif (b2.bbox[1] < b1.bbox[1]):
                    box1 = b2.bbox
                    box2 = b1.bbox
                else:
                    # horizontally aligned
                    continue
                if (
                        box2[1] < box1[3]
                        or (box2[1] - box1[1] < 1.5 * avg_font_pts)
                        or (box2[3] - box1[3] < 1.5 * avg_font_pts)
                ):  # can probably do better if we find the average space between words
                    if (abs(box1[0] - box2[0]) < 3
                            or abs(box1[2] - box2[2]) < 3
                            or (((box1[0] + box1[2]) / 2)
                                == ((box2[0] + box2[2]) / 2))
                            or ((box1[0] < box2[0]) and (box1[2] > box2[0])) or
                        ((box1[0] > box2[0]) and
                         (box2[2] > box1[0]))):  # added center alignemnt
                        min_i = min(i1, i2)
                        max_i = max(i1, i2)
                        cid1 = obj2cid[min_i]
                        cid2 = obj2cid[max_i]
                        # move all objects from cluster cid2 to cid1
                        # reassign cluster ids for all such objects as well
                        for obj_iter in cid2obj[cid2]:
                            cid2obj[cid1].add(obj_iter)
                            obj2cid[obj_iter] = cid1
                        cid2obj[cid2] = set()
        if (prev_clusters == obj2cid):
            break
        prev_clusters = obj2cid
    clusters = [[boxes[i] for i in cluster]
                for cluster in filter(bool, cid2obj)]

    rid2obj = [set([i]) for i in xrange(len(boxes))]  # initialize clusters
    obj2rid = range(
        len(boxes))  # default object map to cluster with its own index
    prev_clusters = obj2rid
    while (True):
        for i1, b1 in enumerate(boxes):
            for i2, b2 in enumerate(boxes):
                if ((i1 == i2) or (obj2rid[i1] == obj2rid[i2])):
                    continue
                box1 = b1.bbox
                box2 = b2.bbox
                if ((abs(box1[1] - box2[1]) < 0.11 * avg_font_pts)
                        or ((abs(box1[3] - box2[3]) < 0.11 * avg_font_pts))
                        or (round((box1[1] + box1[3]) / 2) == round(
                            (box2[1] + box2[3]) / 2))):
                    min_i = min(i1, i2)
                    max_i = max(i1, i2)
                    rid1 = obj2rid[min_i]
                    rid2 = obj2rid[max_i]
                    for obj_iter in rid2obj[rid2]:
                        rid2obj[rid1].add(obj_iter)
                        obj2rid[obj_iter] = rid1
                    rid2obj[rid2] = set()
        if (prev_clusters == obj2rid):
            break
        prev_clusters = obj2rid

    not_merge = set()
    for i1, b1 in enumerate(boxes):
        for i2 in cid2obj[obj2cid[i1]]:
            if (i1 == i2):
                continue
            row1 = obj2rid[i1]
            row2 = obj2rid[i2]
            if (row1 == row2):
                continue
            if (b1.bbox[1] < b2.bbox[1]):
                box1 = b1.bbox
                box2 = b2.bbox
            elif (b2.bbox[1] < b1.bbox[1]):
                box1 = b2.bbox
                box2 = b1.bbox
            else:
                # horizontally aligned
                continue
            text_1 = 0.0
            for obj in rid2obj[row1]:
                text_1 += boxes[obj].bbox[2] - boxes[obj].bbox[0]
            text_2 = 0.0
            for obj in rid2obj[row2]:
                text_2 += boxes[obj].bbox[2] - boxes[obj].bbox[0]
            if (abs(text_1 - text_2) / width > 0.1):
                min_i = min(i1, i2)
                max_i = max(i1, i2)
                not_merge.add((min_i, max_i))

    # Alignment Features
    # If text boxes are very close in a row
    if_row_connected = defaultdict(int)
    num_row_connected = defaultdict(lambda: 1)
    # If text is merged using span code in adjacent rows, this feature tells the number of times the cluster went through span based clustering
    if_connected_by_span = defaultdict(int)
    num_connected_by_span = defaultdict(lambda: 1)
    # If columns were merged using cluster alignment
    if_connected_by_align = defaultdict(int)
    num_connected_by_align = defaultdict(lambda: 1)
    # If vertical columns were merged
    if_vertical_columns_merged = defaultdict(int)
    num_vertical_columns_merged = defaultdict(lambda: 1)
    # Number of Line Segments, Curves and Figures
    num_segments = defaultdict(int)
    num_curves = defaultdict(int)
    num_figures = defaultdict(int)
    # Average Word Space
    total_word_space = defaultdict(float)
    avg_word_space = defaultdict(float)
    avg_word_space_norm = defaultdict(float)
    node_space = defaultdict(float)
    avg_node_space = defaultdict(float)
    avg_node_space_norm = defaultdict(float)

    cid2obj = [set([i]) for i in xrange(len(boxes))]  # initialize clusters
    obj2cid = range(
        len(boxes))  # default object map to cluster with its own index
    prev_clusters = obj2cid
    # add the code for merging close text boxes in particular row
    while (True):
        for i1, b1 in enumerate(boxes):
            for i2, b2 in enumerate(boxes):
                if ((i1 == i2) or (obj2cid[i1] == obj2cid[i2])):
                    continue
                box1 = b1.bbox
                box2 = b2.bbox
                if (obj2rid[i1] == obj2rid[i2]):
                    if (((b1.bbox[0] < b2.bbox[0]) and
                         ((b2.bbox[0] - b1.bbox[2]) <= 2 * char_width)) or
                        ((b2.bbox[0] < b1.bbox[0]) and
                         ((b1.bbox[0] - b2.bbox[2]) <= 2 * char_width))):
                        min_i = min(i1, i2)
                        max_i = max(i1, i2)
                        cid1 = obj2cid[min_i]
                        cid2 = obj2cid[max_i]
                        for obj_iter in cid2obj[cid2]:
                            cid2obj[cid1].add(obj_iter)
                            obj2cid[obj_iter] = cid1
                        cid2obj[cid2] = set()
                        # Features
                        if_row_connected[cid1] = 1
                        if_row_connected[cid2] = 0
                        num_row_connected[cid1] += num_row_connected[cid2]
                        num_row_connected[cid2] = 0
        if (prev_clusters == obj2cid):
            break
        prev_clusters = obj2cid

    # vertical alignment code
    while (True):
        for i1, b1 in enumerate(boxes):
            for i2, b2 in enumerate(boxes):
                if ((i1 == i2) or (obj2cid[i1] == obj2cid[i2])):
                    continue
                if (b1.bbox[1] < b2.bbox[1]):
                    box1 = b1.bbox
                    box2 = b2.bbox
                elif (b2.bbox[1] < b1.bbox[1]):
                    box1 = b2.bbox
                    box2 = b1.bbox
                else:
                    # horizontally aligned
                    continue
                if (
                        box2[1] < box1[3]
                        or (box2[1] - box1[1] < 1.5 * avg_font_pts)
                        or (box2[3] - box1[3] < 1.5 * avg_font_pts)
                ):  # can probably do better if we find the average space between words
                    if (
                            abs(box1[0] - box2[0]) < 3
                            or abs(box1[2] - box2[2]) < 3
                            or (((box1[0] + box1[2]) / 2)
                                == ((box2[0] + box2[2]) / 2))
                    ):  # or ((box1[0]<box2[0]) and (box1[2]>box2[0])) or ((box1[0]>box2[0]) and (box2[2]>box1[0]))): #added center alignemnt
                        min_i = min(i1, i2)
                        max_i = max(i1, i2)
                        if ((min_i, max_i) not in not_merge):
                            cid1 = obj2cid[min_i]
                            cid2 = obj2cid[max_i]
                            # move all objects from cluster cid2 to cid1
                            # reassign cluster ids for all such objects as well
                            for obj_iter in cid2obj[cid2]:
                                cid2obj[cid1].add(obj_iter)
                                obj2cid[obj_iter] = cid1
                            cid2obj[cid2] = set()
                            # Features
                            if_connected_by_span[cid1] = 1
                            if_connected_by_span[cid2] = 0
                            if (if_row_connected[cid1] == 1
                                    or if_row_connected[cid2] == 1):
                                if_row_connected[cid1] = 1
                                num_row_connected[cid1] += num_row_connected[
                                    cid2]
                                num_row_connected[cid2] = 0
                                if_row_connected[cid2] = 0
                            num_connected_by_span[
                                cid1] = num_connected_by_span[
                                    cid1] + num_connected_by_span[cid2]
                            num_connected_by_span[cid2] = 0
        if (prev_clusters == obj2cid):
            break
        prev_clusters = obj2cid

    # blacklist nearly half-page wide clusters before horizontal merging
    cid2obj2 = cid2obj[:]
    obj2cid2 = obj2cid[:]
    blacklist = set()
    blacklist_obj = set()
    for cid_iter in range(len(cid2obj2)):
        cid = cid2obj2[cid_iter]
        xmin = float("Inf")
        xmax = float("-Inf")
        for obj in cid:
            xmin = min(xmin, boxes[obj].bbox[0])
            xmax = max(xmax, boxes[obj].bbox[2])
        if (((xmax - xmin) > width / 2.75 and (xmax - xmin) < width / 2)
                or ((xmax - xmin) > 0.9 * width)):
            blacklist.add(cid_iter)
            for obj in cid:
                blacklist_obj.add(obj)
                for obj_iter in rid2obj[obj2rid[obj]]:
                    if (boxes[obj_iter].bbox[0] >= xmin
                            and boxes[obj_iter].bbox[2] <= xmax):
                        blacklist_obj.add(obj_iter)

    # create a cluster span
    cid2span = {}
    for cid in range(len(cid2obj)):
        cid2span[cid] = {}
        cid2span[cid]["min_x"] = float("Inf")
        cid2span[cid]["min_y"] = float("Inf")
        cid2span[cid]["max_x"] = float("-Inf")
        cid2span[cid]["max_y"] = float("-Inf")
        for obj in cid2obj[cid]:
            cid2span[cid]["min_x"] = min(cid2span[cid]["min_x"],
                                         boxes[obj].bbox[0])
            cid2span[cid]["max_x"] = max(cid2span[cid]["max_x"],
                                         boxes[obj].bbox[2])
            cid2span[cid]["min_y"] = min(cid2span[cid]["min_y"],
                                         boxes[obj].bbox[1])
            cid2span[cid]["max_y"] = max(cid2span[cid]["max_y"],
                                         boxes[obj].bbox[3])

    cid2cid = {}
    cid_pair_compared = set()
    cid2cid2 = [cid for cid in range(len(cid2obj))]
    for i1, b1 in enumerate(boxes):
        for i2, b2 in enumerate(boxes):
            if (i1 == i2):
                continue
            if (i1 in blacklist_obj or i2 in blacklist_obj):
                continue
            cid1 = obj2cid[i1]
            cid2 = obj2cid[i2]
            if ((min(cid1, cid2), max(cid1, cid2)) in cid_pair_compared):
                continue
            if (cid1 == cid2):
                continue
            if (obj2rid[i1] == obj2rid[i2]):
                continue
            if (cid1 not in cid2cid):
                cid2cid[cid1] = set()
            if (cid2 not in cid2cid):
                cid2cid[cid2] = set()
            if (cid2span[cid1]["min_y"] < cid2span[cid2]["min_y"]):
                box1 = [
                    cid2span[cid1]["min_x"], cid2span[cid1]["min_y"],
                    cid2span[cid1]["max_x"], cid2span[cid1]["max_y"]
                ]
                box2 = [
                    cid2span[cid2]["min_x"], cid2span[cid2]["min_y"],
                    cid2span[cid2]["max_x"], cid2span[cid2]["max_y"]
                ]
            else:
                box1 = [
                    cid2span[cid2]["min_x"], cid2span[cid2]["min_y"],
                    cid2span[cid2]["max_x"], cid2span[cid2]["max_y"]
                ]
                box2 = [
                    cid2span[cid1]["min_x"], cid2span[cid1]["min_y"],
                    cid2span[cid1]["max_x"], cid2span[cid1]["max_y"]
                ]
            if (((box1[1] < box2[1]) and (box1[3] > box2[1]))
                    or ((box1[1] > box2[1]) and (box1[1] < box2[3]))):
                continue
            cid_pair_compared.add((min(cid1, cid2), max(cid1, cid2)))
            query_rect = (min(box1[0], box2[0]), min(box1[1], box2[1]),
                          max(box1[2], box2[2]), max(box1[3], box2[3]))
            connected = True
            for i3, b3 in enumerate(boxes):
                if ((i3 == i1) or (i3 == i2)):
                    continue
                if (obj2cid[i1] == obj2cid[i3] or obj2cid[i2] == obj2cid[i3]):
                    continue
                box3 = b3.bbox
                if (intersect(query_rect, box3)):
                    connected = False
                    break
            if (
                ((round(box1[0]) == round(box2[0])
                  or round(box1[2]) == round(box2[2])) and connected)
                    or (round((box1[0] + box1[2]) / 2) == round(
                        (box2[0] + box2[2]) / 2) and connected)
            ):  # or (abs((box1[0]+box1[2])/2-(box2[0]+box2[2])/2)<0.1*char_width and connected)):# or ((box1[0]<box2[0]) and (box1[2]>box2[0])) or ((box1[0]>box2[0]) and (box2[2]>box1[0]))): #added center alignemnt
                cid2cid[min(cid1, cid2)].add(max(cid1, cid2))
                min_cid = min(cid1, cid2)
                max_cid = max(cid1, cid2)
                for cid_iter in cid2cid2:
                    if (cid2cid2[cid_iter] == cid2cid2[max_cid]):
                        cid2cid2[cid_iter] = cid2cid2[min_cid]

    # post-process cid2cid
    cid2obj2 = cid2obj[:]
    obj2cid2 = obj2cid[:]
    for cid in range(len(cid2cid2)):
        cid_merge = cid2cid2[cid]
        if (cid != cid_merge):
            for obj_iter in cid2obj2[cid]:
                cid2obj2[cid_merge].add(obj_iter)
                obj2cid2[obj_iter] = cid_merge
            cid2obj2[cid] = set()
            # Features
            if_connected_by_align[cid_merge] = 1
            if_connected_by_align[cid] = 0
            if (if_row_connected[cid_merge] == 1
                    or if_row_connected[cid] == 1):
                if_row_connected[cid_merge] = 1
                num_row_connected[cid_merge] += num_row_connected[cid]
                num_row_connected[cid] = 0
                if_row_connected[cid2] = 0
            if (if_connected_by_span[cid_merge] == 1
                    or if_connected_by_span[cid] == 1):
                if_connected_by_span[cid_merge] = 1
                num_connected_by_span[cid_merge] += num_connected_by_span[cid]
                num_connected_by_span[cid] = 0
                if_connected_by_span[cid] = 0
            num_connected_by_align[cid_merge] += num_connected_by_align[cid]
            num_connected_by_align[cid] = 0

            # code to merge columns for table
    prev_clusters = obj2cid2
    while (True):
        for obj1, b1 in enumerate(boxes):
            cid1 = obj2cid2[obj1]
            rid1 = obj2rid[obj1]
            if (cid1 in blacklist):
                continue
            if (obj1 in blacklist_obj):
                continue
            for obj2, b2 in enumerate(boxes):
                if (obj1 == obj2):
                    continue
                if (obj2cid2[obj2] == cid1):
                    rid2 = obj2rid[obj2]
                    if (rid1 == rid2):
                        continue
                    for obj3 in rid2obj[rid2]:
                        cid3 = obj2cid2[obj3]
                        if (obj3 in blacklist_obj):
                            continue
                        if (cid1 != cid3):
                            for obj4 in cid2obj2[cid3]:
                                if (obj4 == obj3):
                                    continue
                                if (obj2rid[obj4] == rid1):
                                    min_cid = min(cid1, cid3)
                                    max_cid = max(cid1, cid3)
                                    for obj_iter in cid2obj2[max_cid]:
                                        cid2obj2[min_cid].add(obj_iter)
                                        obj2cid2[obj_iter] = min_cid
                                    cid2obj2[max_cid] = set()
                                    # Features
                                    if_vertical_columns_merged[min_cid] = 1
                                    if_vertical_columns_merged[max_cid] = 0
                                    num_vertical_columns_merged[
                                        min_cid] += num_vertical_columns_merged[
                                            max_cid]
                                    num_vertical_columns_merged[max_cid] = 0
                                    if (if_row_connected[min_cid] == 1
                                            or if_row_connected[max_cid] == 1):
                                        if_row_connected[min_cid] = 1
                                        num_row_connected[
                                            min_cid] += num_row_connected[
                                                max_cid]
                                        num_row_connected[max_cid] = 0
                                        if_row_connected[max_cid] = 0
                                    if (if_connected_by_span[min_cid] == 1
                                            or if_connected_by_span[max_cid]
                                            == 1):
                                        if_connected_by_span[min_cid] = 1
                                        num_connected_by_span[
                                            min_cid] += num_connected_by_span[
                                                max_cid]
                                        num_connected_by_span[max_cid] = 0
                                        if_connected_by_span[max_cid] = 0
                                    if (if_connected_by_align[min_cid] == 1
                                            or if_connected_by_align[max_cid]
                                            == 1):
                                        if_connected_by_align[min_cid] = 1
                                        num_connected_by_align[
                                            min_cid] += num_connected_by_align[
                                                max_cid]
                                        num_connected_by_align[max_cid] = 0
                                        if_connected_by_align[max_cid] = 0
                                    break
        if (prev_clusters == obj2cid2):
            break
        prev_clusters = obj2cid2

    clusters = [[boxes[i] for i in cluster]
                for cluster in filter(bool, cid2obj2)]
    nodes = [Node(elems) for elems in clusters]
    node_indices = [i for i, x in enumerate(cid2obj2) if x]
    # for idx in range(len(nodes)):
    #     print idx, node_indices[idx], nodes[idx]
    merge_indices = [i for i in range(len(node_indices))]
    page_stat = Node(boxes)
    nodes, merge_indices = merge_nodes(nodes, plane, page_stat, merge_indices)
    # Features
    for idx in range(len(merge_indices)):
        if (merge_indices[idx] != idx):
            cid1 = node_indices[merge_indices[idx]]
            cid2 = node_indices[idx]
            if (if_row_connected[cid1] == 1 or if_row_connected[cid2] == 1):
                if_row_connected[cid1] = 1
                num_row_connected[cid1] += num_row_connected[cid2]
                num_row_connected[cid2] = 0
                if_row_connected[cid2] = 0
            if (if_connected_by_span[cid1] == 1
                    or if_connected_by_span[cid2] == 1):
                if_connected_by_span[cid1] = 1
                num_connected_by_span[cid1] += num_connected_by_span[cid2]
                num_connected_by_span[cid2] = 0
                if_connected_by_span[cid2] = 0
            if (if_connected_by_align[cid1] == 1
                    or if_connected_by_align[cid2] == 1):
                if_connected_by_align[cid1] = 1
                num_connected_by_align[cid1] += num_connected_by_align[cid2]
                num_connected_by_align[cid2] = 0
                if_connected_by_align[cid2] = 0
            if (if_vertical_columns_merged[cid1] == 1
                    or if_vertical_columns_merged[cid2] == 1):
                if_vertical_columns_merged[cid1] = 1
                num_vertical_columns_merged[
                    cid1] += num_vertical_columns_merged[cid2]
                num_vertical_columns_merged[cid2] = 0
                if_vertical_columns_merged[cid2] = 0

    # Get Word Spacing Features
    rid2space = defaultdict(float)
    rid2space_norm = defaultdict(float)
    row_indices = [i for i, x in enumerate(rid2obj) if x]
    for rid in row_indices:
        obj_list = list(rid2obj[rid])
        if (len(obj_list) == 1):
            rid2space[rid] = 0
            continue
        obj_boxes = [boxes[obj].bbox[0] for obj in obj_list]
        sorted_obj_idx = [
            i[0] for i in sorted(enumerate(obj_boxes), key=lambda x: x[1])
        ]
        for obj_idx in range(len(sorted_obj_idx) - 1):
            rid2space[rid] += boxes[obj_list[sorted_obj_idx[obj_idx + 1]]].bbox[2] - \
                              boxes[obj_list[sorted_obj_idx[obj_idx]]].bbox[0]
        rid2space_norm[rid] = rid2space[rid] / (len(obj_list) - 1)

    for idx, node in enumerate(nodes):
        node_idx = node_indices[idx]
        if (merge_indices[idx] == idx):
            obj_list = []
            for idx_iter in range(len(merge_indices)):
                if (merge_indices[idx_iter] == idx):
                    obj_list += list(cid2obj2[node_indices[idx_iter]])
            obj_list = list(set(obj_list))
            rid_list = list(set([obj2rid[obj] for obj in obj_list]))
            for rid in rid_list:
                total_word_space[node_idx] += rid2space[rid]
                avg_word_space_norm[node_idx] += rid2space_norm[rid]
                obj_boxes = [
                    boxes[obj].bbox[0] for obj in rid2obj
                    if obj in cid2obj2[node_idx]
                ]
                sorted_obj_idx = [
                    i[0]
                    for i in sorted(enumerate(obj_boxes), key=lambda x: x[1])
                ]
                for obj_idx in range(len(sorted_obj_idx) - 1):
                    node_space[node_idx] += boxes[obj_list[sorted_obj_idx[obj_idx + 1]]].bbox[2] - \
                                            boxes[obj_list[sorted_obj_idx[obj_idx]]].bbox[0]
                avg_node_space_norm[node_idx] += node_space[node_idx] / (
                    len(obj_boxes) - 1)
            avg_word_space[node_idx] = total_word_space[node_idx] / len(
                rid_list)
            avg_word_space_norm[node_idx] /= len(rid_list)
            avg_node_space[node_idx] = node_space[node_idx] / len(rid_list)
            avg_node_space_norm[node_idx] /= len(rid_list)

    new_nodes = []
    new_node_indices = []
    for idx in range(len(merge_indices)):
        if (merge_indices[idx] == idx):
            new_nodes.append(nodes[idx])
            new_node_indices.append(node_indices[idx])

    nodes = new_nodes
    node_indices = new_node_indices
    # Features
    for idx, node in enumerate(nodes):
        node_idx = node_indices[idx]
        node_bbox = (node.x0, node.y0, node.x1, node.y1)
        for i1, b1 in enumerate(boxes_segments):
            if (intersect(node_bbox, b1.bbox)):
                num_segments[node_idx] += 1
        for i1, b1 in enumerate(boxes_figures):
            if (intersect(node_bbox, b1.bbox)):
                num_figures[node_idx] += 1
        for i1, b1 in enumerate(boxes_curves):
            if (intersect(node_bbox, b1.bbox)):
                num_curves[node_idx] += 1

    tables = []
    table_indices = []
    for idx, node in enumerate(nodes):
        node_idx = node_indices[idx]
        isTable = True
        if node.is_table():
            for elem in node.elems:
                if ("table" in elem.get_text().lower()):
                    continue
                if ((node.width - elem.bbox[2] + elem.bbox[0]) <
                        2 * char_width):
                    isTable = False
            if (isTable):
                tables.append(node)
                table_indices.append(node_idx)

    if (combine == True):
        node_features = [0] * 17
        for idx, node in enumerate(nodes):
            node_idx = node_indices[idx]
            node_features = [
                sum(x) for x in zip(node_features, [
                    if_row_connected[node_idx], num_row_connected[node_idx],
                    if_connected_by_span[node_idx],
                    num_connected_by_span[node_idx],
                    if_connected_by_align[node_idx],
                    num_connected_by_align[node_idx],
                    if_vertical_columns_merged[node_idx],
                    num_vertical_columns_merged[node_idx],
                    num_segments[node_idx], num_curves[node_idx],
                    num_figures[node_idx], total_word_space[node_idx],
                    avg_word_space[node_idx], avg_word_space_norm[node_idx],
                    node_space[node_idx], avg_node_space[node_idx],
                    avg_node_space_norm[node_idx]
                ])
            ]
        return [], node_features
    else:
        table_features = []
        for idx, table in enumerate(tables):
            table_idx = table_indices[idx]
            table_features += [[
                if_row_connected[table_idx], num_row_connected[table_idx],
                if_connected_by_span[table_idx],
                num_connected_by_span[table_idx],
                if_connected_by_align[table_idx],
                num_connected_by_align[table_idx],
                if_vertical_columns_merged[table_idx],
                num_vertical_columns_merged[table_idx],
                num_segments[table_idx], num_curves[table_idx],
                num_figures[table_idx], total_word_space[table_idx],
                avg_word_space[table_idx], avg_word_space_norm[table_idx],
                node_space[table_idx], avg_node_space[table_idx],
                avg_node_space_norm[table_idx]
            ]]
        return tables, table_features