Python Plane.Plane примеры использования

Язык программирования: Python

Пространство имен/Пакет: pdfminer.utils

Класс/Тип: Plane

Метод/Функция: Plane

Примеров на hotexamples.com: 8

Python Plane.Plane — найдено 8 примеров. Это лучшие примеры кода на Python для pdfminer.utils.Plane.Plane, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

Plane(8)

extend(5)

add(3)

find(2)

Основные методы

Plane (8)

extend (5)

add (3)

find (2)

Пример #1

Показать файл

Файл: TableExtractML.py Проект: wabbitml/pdftotree

 def identify_scanned_page(self, boxes, page_bbox, page_width, page_height):
     """Report whether the page's boxes collapse into one page-sized cluster.

     Two boxes are merged into the same cluster when they have identical
     ``bbox[0]`` and ``bbox[2]`` and the rounded ``bbox[3]`` of one equals
     the rounded ``bbox[1]`` of the other.

     Args:
         boxes: Sequence of objects exposing a 4-item ``bbox``.
         page_bbox: Not used by the computation; kept so existing callers
             keep working.
         page_width: Width compared against ``bbox[2]`` of the first box.
         page_height: Height compared against ``bbox[3]`` of the first box.

     Returns:
         True when exactly one cluster remains and the first box of that
         cluster has ``bbox[0] < 0``, ``bbox[1] <= 0`` and ``bbox[2]`` /
         ``bbox[3]`` within 5 units of the page width / height; otherwise
         False.
     """
     # Every box starts out as its own cluster.
     cid2obj = [{i} for i in range(len(boxes))]
     obj2cid = list(range(len(boxes)))

     # A single sweep over all ordered pairs is enough: each matching pair is
     # unioned when visited and clusters are never split afterwards, so no
     # fixed-point loop is needed.
     for i1, b1 in enumerate(boxes):
         box1 = b1.bbox
         for i2, b2 in enumerate(boxes):
             box2 = b2.bbox
             stacked = (box1[0] == box2[0] and box1[2] == box2[2]
                        and round(box1[3]) == round(box2[1]))
             if not stacked:
                 continue
             cid1 = obj2cid[min(i1, i2)]
             cid2 = obj2cid[max(i1, i2)]
             if cid1 == cid2:
                 # Already in the same cluster (duplicate boxes, or a very
                 # thin box matching itself).  Merging a cluster into itself
                 # and then clearing it would drop every member.
                 continue
             for obj_iter in cid2obj[cid2]:
                 cid2obj[cid1].add(obj_iter)
                 obj2cid[obj_iter] = cid1
             cid2obj[cid2] = set()

     clusters = [[boxes[i] for i in cluster] for cluster in cid2obj if cluster]
     if len(clusters) != 1:
         return False
     first = clusters[0][0].bbox
     return bool(first[0] < 0
                 and first[1] <= 0
                 and abs(first[2] - page_width) <= 5
                 and abs(first[3] - page_height) <= 5)

Пример #2

Показать файл

def group_textlines(self, laparams, lines):
    """Group text lines into text boxes, ignoring blank lines.

    Patched class method that fixes empty line aggregation, and allows
    run-time line margin detection.

    For every non-blank line the neighbours are first looked up with
    ``laparams.line_margin``.  The margin is then reduced to the smallest
    height-relative gap (scaled by 1.05) to any of those neighbours, and the
    neighbours are looked up again with that reduced margin.  Non-blank
    neighbours, plus the members of any box they already belong to, are put
    into one new box.

    Yields:
        Each distinct non-empty box, in the order its first line appears in
        ``lines``.
    """
    plane = Plane(self.bbox)
    plane.extend(lines)
    boxes = {}
    for line in lines:
        base_margin = laparams.line_margin
        nearby = line.find_neighbors(plane, base_margin)
        if line not in nearby or not line.get_text().strip():
            continue

        # Shrink the margin to the tightest spacing seen around this line.
        own_margin = min(
            [base_margin] + [
                min(abs(other.y0 - line.y1), abs(other.y1 - line.y0))
                * 1.05 / line.height
                for other in nearby if other is not line
            ])

        nearby = line.find_neighbors(plane, own_margin)
        if line not in nearby:
            continue

        members = []
        for other in nearby:
            if other.get_text().strip():
                members.append(other)
                if other in boxes:
                    # Absorb the lines of the box this neighbour was in.
                    members.extend(boxes.pop(other))

        box_type = (LTTextBoxHorizontal
                    if isinstance(line, LTTextLineHorizontal)
                    else LTTextBoxVertical)
        box = box_type()
        for member in uniq(members):
            box.add(member)
            boxes[member] = box

    emitted = set()
    for line in lines:
        box = boxes.get(line)
        if box is None or box in emitted:
            continue
        emitted.add(box)
        if not box.is_empty():
            yield box

Пример #3

Показать файл

Файл: layout.py Проект: worldwise001/paperminer

 def group_textlines(self, laparams: LAParams,
                     lines: List[LTTextContainer]) -> Generator:
     """Group text lines into text boxes, classifying lines when possible.

     Lines that are ``LTTextLineHorizontalExtended`` start as a horizontal
     box; when ``self.rsrcmgr`` is set, the line is classified through
     ``maybe_classify`` and the resulting class is instantiated as the box
     instead, updating the ``after_title`` / ``after_abstract`` /
     ``after_ref`` flags on the resource manager along the way.  All other
     lines get a vertical box.

     Title and section-header boxes take only their own line as neighbour;
     every other box collects the neighbours returned by
     ``find_neighbors_with_rsrcmgr``.

     Yields:
         Each distinct non-empty box, in the order its first line appears
         in ``lines``.
     """
     plane = Plane(self.bbox)
     plane.extend(lines)
     boxes: Dict[LTText, LTTextBox] = {}
     for line in lines:
         if isinstance(line, LTTextLineHorizontalExtended):
             box = LTTextBoxHorizontal()
             if self.rsrcmgr:
                 klass = line.maybe_classify(self.rsrcmgr)
                 if klass == LTTitle:
                     self.rsrcmgr.after_title = True
                 elif (not self.rsrcmgr.after_abstract
                       and klass == LTSectionHeader):
                     self.rsrcmgr.after_abstract = True
                 elif (klass == LTSectionHeader
                       and 'references' in line.get_text().lower()):
                     self.rsrcmgr.after_ref = True
                 box = klass()
         else:
             box = LTTextBoxVertical()

         if isinstance(box, (LTTitle, LTSectionHeader)):
             # Titles and headers are not grouped with surrounding lines.
             neighbors = [line]
         else:
             neighbors = line.find_neighbors_with_rsrcmgr(
                 plane, laparams.line_margin, self.rsrcmgr)
             if line not in neighbors:
                 continue

         members = []
         for neighbor in neighbors:
             members.append(neighbor)
             if neighbor in boxes:
                 # Absorb the lines of the box this neighbour was in.
                 members.extend(boxes.pop(neighbor))
         for member in uniq(members):
             box.add(member)
             boxes[member] = box

     emitted: Set[LTTextBox] = set()
     for line in lines:
         box = boxes.get(line)
         if box is None or box in emitted:
             continue
         emitted.add(box)
         if not box.is_empty():
             yield box

Пример #4

Показать файл

    def test_find_neighbors_vertical(self):
        """A vertical line's neighbours are itself and the three aligned lines.

        Six vertical lines are placed on a 50x50 plane; the lines named
        ``not_aligned`` and ``wrong_width`` must not be reported.
        """
        laparams = LAParams()
        plane = Plane((0, 0, 50, 50))

        def place(bbox):
            # Build a vertical text line with the given bbox and register it.
            item = LTTextLineVertical(laparams.word_margin)
            item.set_bbox(bbox)
            plane.add(item)
            return item

        line = place((4, 10, 6, 20))
        bottom_aligned_right = place((6, 10, 8, 15))
        top_aligned_left = place((2, 15, 4, 20))
        centrally_aligned_overlapping = place((5, 13, 7, 17))
        not_aligned = place((6, 0, 8, 5))
        wrong_width = place((6, 10, 10, 15))

        expected = [
            line,
            bottom_aligned_right,
            top_aligned_left,
            centrally_aligned_overlapping,
        ]
        found = line.find_neighbors(plane, laparams.line_margin)
        self.assertCountEqual(found, expected)

Пример #5

Показать файл

    def test_find_neighbors_horizontal(self):
        """A horizontal line's neighbours are itself and the three aligned lines.

        Six horizontal lines are placed on a 50x50 plane; the lines named
        ``not_aligned`` and ``wrong_height`` must not be reported.
        """
        laparams = LAParams()
        plane = Plane((0, 0, 50, 50))

        def place(bbox):
            # Build a horizontal text line with the given bbox and register it.
            item = LTTextLineHorizontal(laparams.word_margin)
            item.set_bbox(bbox)
            plane.add(item)
            return item

        line = place((10, 4, 20, 6))
        left_aligned_above = place((10, 6, 15, 8))
        right_aligned_below = place((15, 2, 20, 4))
        centrally_aligned_overlapping = place((13, 5, 17, 7))
        not_aligned = place((0, 6, 5, 8))
        wrong_height = place((10, 6, 15, 10))

        expected = [
            line,
            left_aligned_above,
            right_aligned_below,
            centrally_aligned_overlapping,
        ]
        found = line.find_neighbors(plane, laparams.line_margin)
        self.assertCountEqual(found, expected)

Пример #6

Показать файл

 def given_plane_with_one_object(object_size=50, gridsize=50):
     """Build a 100x100 plane holding one square component at the origin.

     Args:
         object_size: Side length of the component's bounding box.
         gridsize: Grid size passed to ``Plane``.

     Returns:
         A ``(plane, component)`` tuple.
     """
     plane = Plane((0, 0, 100, 100), gridsize)
     component = LTComponent((0, 0, object_size, object_size))
     plane.add(component)
     return plane, component

Пример #7

Показать файл

Файл: grid.py Проект: zviri/pdftotree

    def __init__(self, mentions, lines, region, min_cell_size=6.0):
        """
        Build the grid of cells for ``region`` and assign mentions to cells.

        :param mentions: objects exposing ``xc`` / ``yc``; they are indexed in
            a ``Plane`` and distributed to cells by their centre point.
        :param lines: line objects; split into vertical and horizontal lines
            whose ``xc`` / ``yc`` give the candidate grid coordinates.
        :param region: object providing ``x0``, ``x1``, ``y0``, ``y1`` and
            ``bbox``; its edges are always added as grid coordinates.
        :param min_cell_size: value handed to ``_retain_centroids`` when the
            grid coordinates are reduced.
        """
        self.min_cell_size = min_cell_size
        vlines, hlines = _split_vlines_hlines(lines)

        # Candidate column / row coordinates taken from the line centres
        self.xs = [v.xc for v in vlines]
        self.ys = [h.yc for h in hlines]

        # Remove closely clustered lines
        # Also make sure there is at least 1 mega column for the table
        self.xs = _retain_centroids(self.xs + [region.x0, region.x1],
                                    min_cell_size)
        self.ys = _retain_centroids(self.ys + [region.y0, region.y1],
                                    min_cell_size)

        # Consecutive coordinate pairs form the column and row intervals
        self.xranges = list(zip(self.xs, self.xs[1:]))
        self.yranges = list(zip(self.ys, self.ys[1:]))

        self.num_cols = len(self.xranges)
        self.num_rows = len(self.yranges)

        # Grid contents: a num_rows x num_cols object array, initially None
        self._grid = np.full([self.num_rows, self.num_cols],
                             None,
                             dtype=np.dtype(object))
        grid = self._grid

        # Record whether a particular cell boundary is present
        line_plane = Plane(region.bbox)
        line_plane.extend(lines)
        # NOTE(review): vbars / hbars are indexed [row, col] below and are
        # presumably truthy where a bounding line exists - confirm against
        # _mark_grid_bounds.
        vbars, hbars = self._mark_grid_bounds(line_plane, region)
        cells = []
        # Establish cell regions (row-major scan)
        for i in range(self.num_rows):
            for j in range(self.num_cols):
                if grid[i, j]:
                    continue  # Skip already marked cells
                # Merge with cell above
                if i > 0 and not hbars[i, j]:
                    grid[i, j] = cell = grid[i - 1, j]
                    cell.rowend = i + 1
                # Merge with cell left
                elif j > 0 and not vbars[i, j]:
                    grid[i, j] = cell = grid[i, j - 1]
                    cell.colend = j + 1
                # Create new cell otherwise
                else:
                    grid[i, j] = cell = Cell([i, j])
                    cells.append(cell)

        # Now get the cell's contents by using its boundary
        text_plane = Plane(region.bbox)
        text_plane.extend(mentions)

        for cell in cells:
            # Translate the cell's row/column span into coordinates
            x0 = self.xs[cell.colstart]
            x1 = self.xs[cell.colend]
            y0 = self.ys[cell.rowstart]
            y1 = self.ys[cell.rowend]
            bbox = (x0, y0, x1, y1)
            # Keep mentions whose centers are inside the cell
            # ((xc, yc) * 2 repeats the centre to form a 4-tuple for inside())
            cell.texts = [
                m for m in text_plane.find(bbox)
                if inside(bbox, (m.xc, m.yc) * 2)
            ]

        # TODO: provide HTML conversion here

        self.get_normalized_grid()

Пример #8

Показать файл

def cluster_vertically_aligned_boxes(boxes, page_bbox, avg_font_pts, width,
                                     char_width, boxes_segments, boxes_curves,
                                     boxes_figures, page_width, combine):
    # Too many "." in the Table of Content pages
    if (len(boxes) == 0 or len(boxes) > 3500):
        return []
    plane = Plane(page_bbox)
    plane.extend(boxes)
    cid2obj = [set([i]) for i in xrange(len(boxes))]  # initialize clusters
    obj2cid = range(
        len(boxes))  # default object map to cluster with its own index
    prev_clusters = obj2cid
    while (True):
        for i1, b1 in enumerate(boxes):
            for i2, b2 in enumerate(boxes):
                if ((i1 == i2) or (obj2cid[i1] == obj2cid[i2])):
                    continue
                if (b1.bbox[1] < b2.bbox[1]):
                    box1 = b1.bbox
                    box2 = b2.bbox
                elif (b2.bbox[1] < b1.bbox[1]):
                    box1 = b2.bbox
                    box2 = b1.bbox
                else:
                    # horizontally aligned
                    continue
                if (
                        box2[1] < box1[3]
                        or (box2[1] - box1[1] < 1.5 * avg_font_pts)
                        or (box2[3] - box1[3] < 1.5 * avg_font_pts)
                ):  # can probably do better if we find the average space between words
                    if (abs(box1[0] - box2[0]) < 3
                            or abs(box1[2] - box2[2]) < 3
                            or (((box1[0] + box1[2]) / 2)
                                == ((box2[0] + box2[2]) / 2))
                            or ((box1[0] < box2[0]) and (box1[2] > box2[0])) or
                        ((box1[0] > box2[0]) and
                         (box2[2] > box1[0]))):  # added center alignemnt
                        min_i = min(i1, i2)
                        max_i = max(i1, i2)
                        cid1 = obj2cid[min_i]
                        cid2 = obj2cid[max_i]
                        # move all objects from cluster cid2 to cid1
                        # reassign cluster ids for all such objects as well
                        for obj_iter in cid2obj[cid2]:
                            cid2obj[cid1].add(obj_iter)
                            obj2cid[obj_iter] = cid1
                        cid2obj[cid2] = set()
        if (prev_clusters == obj2cid):
            break
        prev_clusters = obj2cid
    clusters = [[boxes[i] for i in cluster]
                for cluster in filter(bool, cid2obj)]

    rid2obj = [set([i]) for i in xrange(len(boxes))]  # initialize clusters
    obj2rid = range(
        len(boxes))  # default object map to cluster with its own index
    prev_clusters = obj2rid
    while (True):
        for i1, b1 in enumerate(boxes):
            for i2, b2 in enumerate(boxes):
                if ((i1 == i2) or (obj2rid[i1] == obj2rid[i2])):
                    continue
                box1 = b1.bbox
                box2 = b2.bbox
                if ((abs(box1[1] - box2[1]) < 0.11 * avg_font_pts)
                        or ((abs(box1[3] - box2[3]) < 0.11 * avg_font_pts))
                        or (round((box1[1] + box1[3]) / 2) == round(
                            (box2[1] + box2[3]) / 2))):
                    min_i = min(i1, i2)
                    max_i = max(i1, i2)
                    rid1 = obj2rid[min_i]
                    rid2 = obj2rid[max_i]
                    for obj_iter in rid2obj[rid2]:
                        rid2obj[rid1].add(obj_iter)
                        obj2rid[obj_iter] = rid1
                    rid2obj[rid2] = set()
        if (prev_clusters == obj2rid):
            break
        prev_clusters = obj2rid

    not_merge = set()
    for i1, b1 in enumerate(boxes):
        for i2 in cid2obj[obj2cid[i1]]:
            if (i1 == i2):
                continue
            row1 = obj2rid[i1]
            row2 = obj2rid[i2]
            if (row1 == row2):
                continue
            if (b1.bbox[1] < b2.bbox[1]):
                box1 = b1.bbox
                box2 = b2.bbox
            elif (b2.bbox[1] < b1.bbox[1]):
                box1 = b2.bbox
                box2 = b1.bbox
            else:
                # horizontally aligned
                continue
            text_1 = 0.0
            for obj in rid2obj[row1]:
                text_1 += boxes[obj].bbox[2] - boxes[obj].bbox[0]
            text_2 = 0.0
            for obj in rid2obj[row2]:
                text_2 += boxes[obj].bbox[2] - boxes[obj].bbox[0]
            if (abs(text_1 - text_2) / width > 0.1):
                min_i = min(i1, i2)
                max_i = max(i1, i2)
                not_merge.add((min_i, max_i))

    # Alignment Features
    # If text boxes are very close in a row
    if_row_connected = defaultdict(int)
    num_row_connected = defaultdict(lambda: 1)
    # If text is merged using span code in adjacent rows, this feature tells the number of times the cluster went through span based clustering
    if_connected_by_span = defaultdict(int)
    num_connected_by_span = defaultdict(lambda: 1)
    # If columns were merged using cluster alignment
    if_connected_by_align = defaultdict(int)
    num_connected_by_align = defaultdict(lambda: 1)
    # If vertical columns were merged
    if_vertical_columns_merged = defaultdict(int)
    num_vertical_columns_merged = defaultdict(lambda: 1)
    # Number of Line Segments, Curves and Figures
    num_segments = defaultdict(int)
    num_curves = defaultdict(int)
    num_figures = defaultdict(int)
    # Average Word Space
    total_word_space = defaultdict(float)
    avg_word_space = defaultdict(float)
    avg_word_space_norm = defaultdict(float)
    node_space = defaultdict(float)
    avg_node_space = defaultdict(float)
    avg_node_space_norm = defaultdict(float)

    cid2obj = [set([i]) for i in xrange(len(boxes))]  # initialize clusters
    obj2cid = range(
        len(boxes))  # default object map to cluster with its own index
    prev_clusters = obj2cid
    # add the code for merging close text boxes in particular row
    while (True):
        for i1, b1 in enumerate(boxes):
            for i2, b2 in enumerate(boxes):
                if ((i1 == i2) or (obj2cid[i1] == obj2cid[i2])):
                    continue
                box1 = b1.bbox
                box2 = b2.bbox
                if (obj2rid[i1] == obj2rid[i2]):
                    if (((b1.bbox[0] < b2.bbox[0]) and
                         ((b2.bbox[0] - b1.bbox[2]) <= 2 * char_width)) or
                        ((b2.bbox[0] < b1.bbox[0]) and
                         ((b1.bbox[0] - b2.bbox[2]) <= 2 * char_width))):
                        min_i = min(i1, i2)
                        max_i = max(i1, i2)
                        cid1 = obj2cid[min_i]
                        cid2 = obj2cid[max_i]
                        for obj_iter in cid2obj[cid2]:
                            cid2obj[cid1].add(obj_iter)
                            obj2cid[obj_iter] = cid1
                        cid2obj[cid2] = set()
                        # Features
                        if_row_connected[cid1] = 1
                        if_row_connected[cid2] = 0
                        num_row_connected[cid1] += num_row_connected[cid2]
                        num_row_connected[cid2] = 0
        if (prev_clusters == obj2cid):
            break
        prev_clusters = obj2cid

    # vertical alignment code
    while (True):
        for i1, b1 in enumerate(boxes):
            for i2, b2 in enumerate(boxes):
                if ((i1 == i2) or (obj2cid[i1] == obj2cid[i2])):
                    continue
                if (b1.bbox[1] < b2.bbox[1]):
                    box1 = b1.bbox
                    box2 = b2.bbox
                elif (b2.bbox[1] < b1.bbox[1]):
                    box1 = b2.bbox
                    box2 = b1.bbox
                else:
                    # horizontally aligned
                    continue
                if (
                        box2[1] < box1[3]
                        or (box2[1] - box1[1] < 1.5 * avg_font_pts)
                        or (box2[3] - box1[3] < 1.5 * avg_font_pts)
                ):  # can probably do better if we find the average space between words
                    if (
                            abs(box1[0] - box2[0]) < 3
                            or abs(box1[2] - box2[2]) < 3
                            or (((box1[0] + box1[2]) / 2)
                                == ((box2[0] + box2[2]) / 2))
                    ):  # or ((box1[0]<box2[0]) and (box1[2]>box2[0])) or ((box1[0]>box2[0]) and (box2[2]>box1[0]))): #added center alignemnt
                        min_i = min(i1, i2)
                        max_i = max(i1, i2)
                        if ((min_i, max_i) not in not_merge):
                            cid1 = obj2cid[min_i]
                            cid2 = obj2cid[max_i]
                            # move all objects from cluster cid2 to cid1
                            # reassign cluster ids for all such objects as well
                            for obj_iter in cid2obj[cid2]:
                                cid2obj[cid1].add(obj_iter)
                                obj2cid[obj_iter] = cid1
                            cid2obj[cid2] = set()
                            # Features
                            if_connected_by_span[cid1] = 1
                            if_connected_by_span[cid2] = 0
                            if (if_row_connected[cid1] == 1
                                    or if_row_connected[cid2] == 1):
                                if_row_connected[cid1] = 1
                                num_row_connected[cid1] += num_row_connected[
                                    cid2]
                                num_row_connected[cid2] = 0
                                if_row_connected[cid2] = 0
                            num_connected_by_span[
                                cid1] = num_connected_by_span[
                                    cid1] + num_connected_by_span[cid2]
                            num_connected_by_span[cid2] = 0
        if (prev_clusters == obj2cid):
            break
        prev_clusters = obj2cid

    # blacklist nearly half-page wide clusters before horizontal merging
    cid2obj2 = cid2obj[:]
    obj2cid2 = obj2cid[:]
    blacklist = set()
    blacklist_obj = set()
    for cid_iter in range(len(cid2obj2)):
        cid = cid2obj2[cid_iter]
        xmin = float("Inf")
        xmax = float("-Inf")
        for obj in cid:
            xmin = min(xmin, boxes[obj].bbox[0])
            xmax = max(xmax, boxes[obj].bbox[2])
        if (((xmax - xmin) > width / 2.75 and (xmax - xmin) < width / 2)
                or ((xmax - xmin) > 0.9 * width)):
            blacklist.add(cid_iter)
            for obj in cid:
                blacklist_obj.add(obj)
                for obj_iter in rid2obj[obj2rid[obj]]:
                    if (boxes[obj_iter].bbox[0] >= xmin
                            and boxes[obj_iter].bbox[2] <= xmax):
                        blacklist_obj.add(obj_iter)

    # create a cluster span
    cid2span = {}
    for cid in range(len(cid2obj)):
        cid2span[cid] = {}
        cid2span[cid]["min_x"] = float("Inf")
        cid2span[cid]["min_y"] = float("Inf")
        cid2span[cid]["max_x"] = float("-Inf")
        cid2span[cid]["max_y"] = float("-Inf")
        for obj in cid2obj[cid]:
            cid2span[cid]["min_x"] = min(cid2span[cid]["min_x"],
                                         boxes[obj].bbox[0])
            cid2span[cid]["max_x"] = max(cid2span[cid]["max_x"],
                                         boxes[obj].bbox[2])
            cid2span[cid]["min_y"] = min(cid2span[cid]["min_y"],
                                         boxes[obj].bbox[1])
            cid2span[cid]["max_y"] = max(cid2span[cid]["max_y"],
                                         boxes[obj].bbox[3])

    cid2cid = {}
    cid_pair_compared = set()
    cid2cid2 = [cid for cid in range(len(cid2obj))]
    for i1, b1 in enumerate(boxes):
        for i2, b2 in enumerate(boxes):
            if (i1 == i2):
                continue
            if (i1 in blacklist_obj or i2 in blacklist_obj):
                continue
            cid1 = obj2cid[i1]
            cid2 = obj2cid[i2]
            if ((min(cid1, cid2), max(cid1, cid2)) in cid_pair_compared):
                continue
            if (cid1 == cid2):
                continue
            if (obj2rid[i1] == obj2rid[i2]):
                continue
            if (cid1 not in cid2cid):
                cid2cid[cid1] = set()
            if (cid2 not in cid2cid):
                cid2cid[cid2] = set()
            if (cid2span[cid1]["min_y"] < cid2span[cid2]["min_y"]):
                box1 = [
                    cid2span[cid1]["min_x"], cid2span[cid1]["min_y"],
                    cid2span[cid1]["max_x"], cid2span[cid1]["max_y"]
                ]
                box2 = [
                    cid2span[cid2]["min_x"], cid2span[cid2]["min_y"],
                    cid2span[cid2]["max_x"], cid2span[cid2]["max_y"]
                ]
            else:
                box1 = [
                    cid2span[cid2]["min_x"], cid2span[cid2]["min_y"],
                    cid2span[cid2]["max_x"], cid2span[cid2]["max_y"]
                ]
                box2 = [
                    cid2span[cid1]["min_x"], cid2span[cid1]["min_y"],
                    cid2span[cid1]["max_x"], cid2span[cid1]["max_y"]
                ]
            if (((box1[1] < box2[1]) and (box1[3] > box2[1]))
                    or ((box1[1] > box2[1]) and (box1[1] < box2[3]))):
                continue
            cid_pair_compared.add((min(cid1, cid2), max(cid1, cid2)))
            query_rect = (min(box1[0], box2[0]), min(box1[1], box2[1]),
                          max(box1[2], box2[2]), max(box1[3], box2[3]))
            connected = True
            for i3, b3 in enumerate(boxes):
                if ((i3 == i1) or (i3 == i2)):
                    continue
                if (obj2cid[i1] == obj2cid[i3] or obj2cid[i2] == obj2cid[i3]):
                    continue
                box3 = b3.bbox
                if (intersect(query_rect, box3)):
                    connected = False
                    break
            if (
                ((round(box1[0]) == round(box2[0])
                  or round(box1[2]) == round(box2[2])) and connected)
                    or (round((box1[0] + box1[2]) / 2) == round(
                        (box2[0] + box2[2]) / 2) and connected)
            ):  # or (abs((box1[0]+box1[2])/2-(box2[0]+box2[2])/2)<0.1*char_width and connected)):# or ((box1[0]<box2[0]) and (box1[2]>box2[0])) or ((box1[0]>box2[0]) and (box2[2]>box1[0]))): #added center alignemnt
                cid2cid[min(cid1, cid2)].add(max(cid1, cid2))
                min_cid = min(cid1, cid2)
                max_cid = max(cid1, cid2)
                for cid_iter in cid2cid2:
                    if (cid2cid2[cid_iter] == cid2cid2[max_cid]):
                        cid2cid2[cid_iter] = cid2cid2[min_cid]

    # post-process cid2cid
    cid2obj2 = cid2obj[:]
    obj2cid2 = obj2cid[:]
    for cid in range(len(cid2cid2)):
        cid_merge = cid2cid2[cid]
        if (cid != cid_merge):
            for obj_iter in cid2obj2[cid]:
                cid2obj2[cid_merge].add(obj_iter)
                obj2cid2[obj_iter] = cid_merge
            cid2obj2[cid] = set()
            # Features
            if_connected_by_align[cid_merge] = 1
            if_connected_by_align[cid] = 0
            if (if_row_connected[cid_merge] == 1
                    or if_row_connected[cid] == 1):
                if_row_connected[cid_merge] = 1
                num_row_connected[cid_merge] += num_row_connected[cid]
                num_row_connected[cid] = 0
                if_row_connected[cid2] = 0
            if (if_connected_by_span[cid_merge] == 1
                    or if_connected_by_span[cid] == 1):
                if_connected_by_span[cid_merge] = 1
                num_connected_by_span[cid_merge] += num_connected_by_span[cid]
                num_connected_by_span[cid] = 0
                if_connected_by_span[cid] = 0
            num_connected_by_align[cid_merge] += num_connected_by_align[cid]
            num_connected_by_align[cid] = 0

            # code to merge columns for table
    prev_clusters = obj2cid2
    while (True):
        for obj1, b1 in enumerate(boxes):
            cid1 = obj2cid2[obj1]
            rid1 = obj2rid[obj1]
            if (cid1 in blacklist):
                continue
            if (obj1 in blacklist_obj):
                continue
            for obj2, b2 in enumerate(boxes):
                if (obj1 == obj2):
                    continue
                if (obj2cid2[obj2] == cid1):
                    rid2 = obj2rid[obj2]
                    if (rid1 == rid2):
                        continue
                    for obj3 in rid2obj[rid2]:
                        cid3 = obj2cid2[obj3]
                        if (obj3 in blacklist_obj):
                            continue
                        if (cid1 != cid3):
                            for obj4 in cid2obj2[cid3]:
                                if (obj4 == obj3):
                                    continue
                                if (obj2rid[obj4] == rid1):
                                    min_cid = min(cid1, cid3)
                                    max_cid = max(cid1, cid3)
                                    for obj_iter in cid2obj2[max_cid]:
                                        cid2obj2[min_cid].add(obj_iter)
                                        obj2cid2[obj_iter] = min_cid
                                    cid2obj2[max_cid] = set()
                                    # Features
                                    if_vertical_columns_merged[min_cid] = 1
                                    if_vertical_columns_merged[max_cid] = 0
                                    num_vertical_columns_merged[
                                        min_cid] += num_vertical_columns_merged[
                                            max_cid]
                                    num_vertical_columns_merged[max_cid] = 0
                                    if (if_row_connected[min_cid] == 1
                                            or if_row_connected[max_cid] == 1):
                                        if_row_connected[min_cid] = 1
                                        num_row_connected[
                                            min_cid] += num_row_connected[
                                                max_cid]
                                        num_row_connected[max_cid] = 0
                                        if_row_connected[max_cid] = 0
                                    if (if_connected_by_span[min_cid] == 1
                                            or if_connected_by_span[max_cid]
                                            == 1):
                                        if_connected_by_span[min_cid] = 1
                                        num_connected_by_span[
                                            min_cid] += num_connected_by_span[
                                                max_cid]
                                        num_connected_by_span[max_cid] = 0
                                        if_connected_by_span[max_cid] = 0
                                    if (if_connected_by_align[min_cid] == 1
                                            or if_connected_by_align[max_cid]
                                            == 1):
                                        if_connected_by_align[min_cid] = 1
                                        num_connected_by_align[
                                            min_cid] += num_connected_by_align[
                                                max_cid]
                                        num_connected_by_align[max_cid] = 0
                                        if_connected_by_align[max_cid] = 0
                                    break
        if (prev_clusters == obj2cid2):
            break
        prev_clusters = obj2cid2

    clusters = [[boxes[i] for i in cluster]
                for cluster in filter(bool, cid2obj2)]
    nodes = [Node(elems) for elems in clusters]
    node_indices = [i for i, x in enumerate(cid2obj2) if x]
    # for idx in range(len(nodes)):
    #     print idx, node_indices[idx], nodes[idx]
    merge_indices = [i for i in range(len(node_indices))]
    page_stat = Node(boxes)
    nodes, merge_indices = merge_nodes(nodes, plane, page_stat, merge_indices)
    # Features
    for idx in range(len(merge_indices)):
        if (merge_indices[idx] != idx):
            cid1 = node_indices[merge_indices[idx]]
            cid2 = node_indices[idx]
            if (if_row_connected[cid1] == 1 or if_row_connected[cid2] == 1):
                if_row_connected[cid1] = 1
                num_row_connected[cid1] += num_row_connected[cid2]
                num_row_connected[cid2] = 0
                if_row_connected[cid2] = 0
            if (if_connected_by_span[cid1] == 1
                    or if_connected_by_span[cid2] == 1):
                if_connected_by_span[cid1] = 1
                num_connected_by_span[cid1] += num_connected_by_span[cid2]
                num_connected_by_span[cid2] = 0
                if_connected_by_span[cid2] = 0
            if (if_connected_by_align[cid1] == 1
                    or if_connected_by_align[cid2] == 1):
                if_connected_by_align[cid1] = 1
                num_connected_by_align[cid1] += num_connected_by_align[cid2]
                num_connected_by_align[cid2] = 0
                if_connected_by_align[cid2] = 0
            if (if_vertical_columns_merged[cid1] == 1
                    or if_vertical_columns_merged[cid2] == 1):
                if_vertical_columns_merged[cid1] = 1
                num_vertical_columns_merged[
                    cid1] += num_vertical_columns_merged[cid2]
                num_vertical_columns_merged[cid2] = 0
                if_vertical_columns_merged[cid2] = 0

    # Get Word Spacing Features
    rid2space = defaultdict(float)
    rid2space_norm = defaultdict(float)
    row_indices = [i for i, x in enumerate(rid2obj) if x]
    for rid in row_indices:
        obj_list = list(rid2obj[rid])
        if (len(obj_list) == 1):
            rid2space[rid] = 0
            continue
        obj_boxes = [boxes[obj].bbox[0] for obj in obj_list]
        sorted_obj_idx = [
            i[0] for i in sorted(enumerate(obj_boxes), key=lambda x: x[1])
        ]
        for obj_idx in range(len(sorted_obj_idx) - 1):
            rid2space[rid] += boxes[obj_list[sorted_obj_idx[obj_idx + 1]]].bbox[2] - \
                              boxes[obj_list[sorted_obj_idx[obj_idx]]].bbox[0]
        rid2space_norm[rid] = rid2space[rid] / (len(obj_list) - 1)

    for idx, node in enumerate(nodes):
        node_idx = node_indices[idx]
        if (merge_indices[idx] == idx):
            obj_list = []
            for idx_iter in range(len(merge_indices)):
                if (merge_indices[idx_iter] == idx):
                    obj_list += list(cid2obj2[node_indices[idx_iter]])
            obj_list = list(set(obj_list))
            rid_list = list(set([obj2rid[obj] for obj in obj_list]))
            for rid in rid_list:
                total_word_space[node_idx] += rid2space[rid]
                avg_word_space_norm[node_idx] += rid2space_norm[rid]
                obj_boxes = [
                    boxes[obj].bbox[0] for obj in rid2obj
                    if obj in cid2obj2[node_idx]
                ]
                sorted_obj_idx = [
                    i[0]
                    for i in sorted(enumerate(obj_boxes), key=lambda x: x[1])
                ]
                for obj_idx in range(len(sorted_obj_idx) - 1):
                    node_space[node_idx] += boxes[obj_list[sorted_obj_idx[obj_idx + 1]]].bbox[2] - \
                                            boxes[obj_list[sorted_obj_idx[obj_idx]]].bbox[0]
                avg_node_space_norm[node_idx] += node_space[node_idx] / (
                    len(obj_boxes) - 1)
            avg_word_space[node_idx] = total_word_space[node_idx] / len(
                rid_list)
            avg_word_space_norm[node_idx] /= len(rid_list)
            avg_node_space[node_idx] = node_space[node_idx] / len(rid_list)
            avg_node_space_norm[node_idx] /= len(rid_list)

    new_nodes = []
    new_node_indices = []
    for idx in range(len(merge_indices)):
        if (merge_indices[idx] == idx):
            new_nodes.append(nodes[idx])
            new_node_indices.append(node_indices[idx])

    nodes = new_nodes
    node_indices = new_node_indices
    # Features
    for idx, node in enumerate(nodes):
        node_idx = node_indices[idx]
        node_bbox = (node.x0, node.y0, node.x1, node.y1)
        for i1, b1 in enumerate(boxes_segments):
            if (intersect(node_bbox, b1.bbox)):
                num_segments[node_idx] += 1
        for i1, b1 in enumerate(boxes_figures):
            if (intersect(node_bbox, b1.bbox)):
                num_figures[node_idx] += 1
        for i1, b1 in enumerate(boxes_curves):
            if (intersect(node_bbox, b1.bbox)):
                num_curves[node_idx] += 1

    tables = []
    table_indices = []
    for idx, node in enumerate(nodes):
        node_idx = node_indices[idx]
        isTable = True
        if node.is_table():
            for elem in node.elems:
                if ("table" in elem.get_text().lower()):
                    continue
                if ((node.width - elem.bbox[2] + elem.bbox[0]) <
                        2 * char_width):
                    isTable = False
            if (isTable):
                tables.append(node)
                table_indices.append(node_idx)

    if (combine == True):
        node_features = [0] * 17
        for idx, node in enumerate(nodes):
            node_idx = node_indices[idx]
            node_features = [
                sum(x) for x in zip(node_features, [
                    if_row_connected[node_idx], num_row_connected[node_idx],
                    if_connected_by_span[node_idx],
                    num_connected_by_span[node_idx],
                    if_connected_by_align[node_idx],
                    num_connected_by_align[node_idx],
                    if_vertical_columns_merged[node_idx],
                    num_vertical_columns_merged[node_idx],
                    num_segments[node_idx], num_curves[node_idx],
                    num_figures[node_idx], total_word_space[node_idx],
                    avg_word_space[node_idx], avg_word_space_norm[node_idx],
                    node_space[node_idx], avg_node_space[node_idx],
                    avg_node_space_norm[node_idx]
                ])
            ]
        return [], node_features
    else:
        table_features = []
        for idx, table in enumerate(tables):
            table_idx = table_indices[idx]
            table_features += [[
                if_row_connected[table_idx], num_row_connected[table_idx],
                if_connected_by_span[table_idx],
                num_connected_by_span[table_idx],
                if_connected_by_align[table_idx],
                num_connected_by_align[table_idx],
                if_vertical_columns_merged[table_idx],
                num_vertical_columns_merged[table_idx],
                num_segments[table_idx], num_curves[table_idx],
                num_figures[table_idx], total_word_space[table_idx],
                avg_word_space[table_idx], avg_word_space_norm[table_idx],
                node_space[table_idx], avg_node_space[table_idx],
                avg_node_space_norm[table_idx]
            ]]
        return tables, table_features