예제 #1
0
    def to_tables7(self):
        """Group text lines into rows and spread each row over the columns.

        Every line of every box in ``self.texts`` is considered, except for
        boxes that appear in ``self.multi_col_boxes``.  A line is attached to
        each existing row for which ``voverlap(line) / height`` of the row's
        container lies strictly between 0.9 and 1.1; the container is then
        widened horizontally to cover the line.  A line that matches no row
        starts a new row whose container is built from the line's bbox.

        Rows are sorted by the ``y0`` of their container.  Each row then gets
        a ``'cells'`` list with one slot per entry of ``self.finalcols2``;
        every text of the row is stored in the first slot that is still
        empty and whose column container horizontally overlaps the text.

        The resulting list of row dicts (keys ``'contain'``, ``'texts'`` and
        ``'cells'``) is stored in ``self.lines``.
        """
        rows = []
        for box in self.texts:
            if box in self.multi_col_boxes:
                continue
            for text in box:
                matched = False
                for row in rows:
                    frame = row['contain']
                    ratio = frame.voverlap(text) / frame.height
                    if 0.9 < ratio < 1.1:
                        row['texts'].append(text)
                        # Only the horizontal extent grows; y0/y1 are kept.
                        widened = (min(frame.x0, text.x0), frame.y0,
                                   max(frame.x1, text.x1), frame.y1)
                        frame.set_bbox(widened)
                        matched = True
                if not matched:
                    rows.append({
                        'contain': LTComponent(text.bbox),
                        'texts': [text]
                    })
        rows.sort(key=lambda row: row['contain'].y0)

        for row in rows:
            cells = [None] * len(self.finalcols2)
            row['cells'] = cells
            for text in row['texts']:
                for idx, column in enumerate(self.finalcols2):
                    # An already occupied slot is left alone (the parser
                    # library sometimes seems to duplicate text) and the
                    # search continues with the next column.
                    if column['contain'].hoverlap(text) and cells[idx] is None:
                        cells[idx] = text
                        break
        self.lines = rows
예제 #2
0
 def __init__(self, annoObj, uri, pos, pageid):
     """Initialise the record from one annotation object.

     ``annoObj`` becomes the first entry of ``origObjs`` and ``uri`` is
     stored as ``gotoLoc``.  ``pos`` is wrapped in an ``LTComponent`` and
     paired with ``pageid`` as the first entry of ``positions``.  All other
     attributes start out empty (``""``, ``[""]``, ``0`` or ``None``).
     """
     self.origObjs = [annoObj]
     self.gotoLoc = uri
     self.assocText = [""]
     self.assocTextIn = 0
     # Each position is a two-element list: [LTComponent, pageid].
     first_position = [LTComponent(pos), pageid]
     self.positions = [first_position]
     self.destPage = None
     self.unparseCite = self.finalCiteStr = ""
     self.papObj = None
     self.papLink = self.author = self.year = ""
예제 #3
0
 def to_tables3(self):
     """Partition ``self.columns`` into groups of mutually overlapping columns.

     Each column joins the first existing group for which ``is_hoverlap``
     holds between its container and the container of *every* member of
     that group; a column that fits no group starts a new one.

     The groups are stored in ``self.colgroups`` as dicts with the keys
     ``'contain'`` (an ``LTComponent`` built from the bbox of the group's
     first column) and ``'cols'`` (the member columns, in the order they
     were added).  The number of groups is logged at info level.
     """
     colgroups = []
     for col in self.columns:
         for group in colgroups:
             if all(col['contain'].is_hoverlap(member['contain'])
                    for member in group['cols']):
                 group['cols'].append(col)
                 break
         else:
             # No existing group accepted the column.
             # NOTE: 'contain' is a snapshot of this first column's bbox and
             # is not updated when further columns join the group.
             colgroups.append({
                 'contain': LTComponent(col['contain'].bbox),
                 'cols': [col]
             })
     logger.info('{x} colgroups'.format(x=len(colgroups)))
     self.colgroups = colgroups
예제 #4
0
 def given_plane_with_one_object(object_size=50, gridsize=50):
     """Build a ``Plane`` over ``(0, 0, 100, 100)`` holding one square object.

     The object is an ``LTComponent`` spanning ``(0, 0)`` to
     ``(object_size, object_size)``; the plane is created with the given
     ``gridsize``.  Returns the ``(plane, obj)`` pair.
     """
     plane = Plane((0, 0, 100, 100), gridsize)
     square = LTComponent((0, 0, object_size, object_size))
     plane.add(square)
     return plane, square
예제 #5
0
 def to_tables2(self):
     """Cluster the horizontal text boxes of ``self.layout`` into columns.

     Only ``LTTextBoxHorizontal`` elements are considered.  Each box is
     compared against the columns found so far, which are kept sorted by
     ascending container width.  The first column for which one of the
     following holds decides the outcome:

     * The box lies strictly inside the column horizontally.  If it is at
       least 80% as wide as the column it joins it and the column grows
       vertically to cover it; otherwise the search stops and the box
       starts a column of its own.
     * ``hoverlap(box) / width`` of the column lies strictly between 0.9
       and 1.1.  The box joins the column, which grows in both directions
       to cover it.

     A box for which no column matches also starts a new column.

     The result is stored in ``self.columns`` as a list of dicts with the
     keys ``'contain'`` (an ``LTComponent`` bounding the column),
     ``'boxes'`` (the member text boxes) and ``'color'`` (a random key of
     ``PIL.ImageColor.colormap``).
     """
     columns = []
     for e in self.layout:
         if not isinstance(e, LTTextBoxHorizontal):
             continue
         logger.info('Finding a column for box {i}'.format(i=e.index))
         col = None
         for c in columns:
             contain = c['contain']
             if (e.x1 < contain.x1) and (e.x0 > contain.x0):
                 if (e.width / contain.width) < 0.8:
                     # Leave ``col`` unset so that a new column is created.
                     logger.info(
                         'Item too small, column may be several columns wide'
                     )
                 else:
                     logger.info('Item totally contained in column')
                     logger.info(
                         '{ex1} < {cx1} and {ex0} > {cx0}'.format(
                             ex1=e.x1,
                             cx1=contain.x1,
                             ex0=e.x0,
                             cx0=contain.x0))
                     col = c
                     col['boxes'].append(e)
                     # Horizontal extent is unchanged: the box is inside.
                     contain.set_bbox(
                         (contain.x0, min(contain.y0, e.y0),
                          contain.x1, max(contain.y1, e.y1)))
                 break

             # Evaluate the overlap once; it is needed for the test and
             # for the log message.
             overlap = contain.hoverlap(e)
             if 0.9 < overlap / contain.width < 1.1:
                 logger.info('Item is within 10% of current col width')
                 logger.info(
                     'Overlap of {hdist}, column width of {width}'.
                     format(hdist=overlap, width=contain.width))
                 col = c
                 col['boxes'].append(e)
                 contain.set_bbox(
                     (min(contain.x0, e.x0), min(contain.y0, e.y0),
                      max(contain.x1, e.x1), max(contain.y1, e.y1)))
                 break
         if col is None:
             logger.info('Creating new column')
             col = {
                 'contain': LTComponent(e.bbox),
                 # Store the box itself, matching what ``append(e)`` adds
                 # when a box joins an existing column.
                 'boxes': [e],
                 # Not read in this method; presumably meant for debug
                 # rendering of the columns -- confirm before removing.
                 'color':
                 random.choice(list(PIL.ImageColor.colormap.keys()))
             }
             columns.append(col)
             # Keep narrow columns first so they are tried before wide ones.
             columns.sort(key=lambda x: x['contain'].width)
     self.columns = columns
예제 #6
0
 def addPosition(self, pos, pageid):
     """Append another ``[LTComponent(pos), pageid]`` entry to ``self.positions``."""
     entry = [LTComponent(pos), pageid]
     self.positions.append(entry)
예제 #7
0
    def _create_cells(network):
        """
        Creates cells from the network and returns them
        as a set of LTComponents.

        ``network`` is iterated to obtain its points (objects with ``x`` and
        ``y``), and ``network.links[p]`` must be the set of points linked to
        ``p``.

        Points are visited in ``(x, y)`` order.  For each point, every pair
        of perpendicular links ``(l1, l2)`` leaving it is tried, nearest
        links first (city-block distance).  The points linked to both ``l1``
        and ``l2`` -- other than the starting point -- are the candidates
        for the fourth corner; the one with the smallest signed product
        ``(dx * dy)`` relative to the starting point is used.  A cell is
        emitted only if none of the four corners has already used the
        quadrant in which the cell lies; the search over ``l2`` stops for
        the current ``l1`` after the first cell is emitted.
        """
        # For each point, the quadrants (sign pairs) already used by a cell.
        squares_taken = defaultdict(set)
        cells = set()

        def city_distance(point, point_prime):
            return abs(point.x - point_prime.x) + abs(point.y - point_prime.y)

        def is_perpendicular(v1_x, v1_y, v2_x, v2_y):
            return v1_x*v2_x + v1_y*v2_y == 0

        for point in sorted(network, key=lambda p: (p.x, p.y)):
            for l1 in sorted(network.links[point],
                             key=lambda p: city_distance(p, point)):
                valid_links = [
                    link for link in network.links[point] if link != l1 and
                    is_perpendicular(link.x - point.x, link.y - point.y,
                                     l1.x - point.x, l1.y - point.y)]

                for l2 in sorted(valid_links,
                                 key=lambda p: city_distance(p, point)):
                    inter = network.links[l2].intersection(network.links[l1])
                    intersection = list(inter)

                    # remove initial point (raises ValueError if the point
                    # is not linked back from both l1 and l2)
                    intersection.remove(point)

                    if not intersection:
                        continue

                    # sort by signed area relative to the initial point:
                    # smallest first
                    intersection.sort(
                        key=lambda p: (p.x - point.x)*(p.y - point.y))
                    last_point = intersection[0]

                    # square is formed by [point, l1, l2, last_point], in this
                    # order.
                    points = [point, l1, l2, last_point]

                    # compute middle position of the square
                    middle_x = sum(corner.x for corner in points)/4.
                    middle_y = sum(corner.y for corner in points)/4.

                    # check if any point already has one of its squares
                    # (at most 4) used.
                    is_taken = False
                    # A real list: a Python 3 ``range`` object does not
                    # support item assignment.
                    square = [None] * 4
                    for i, corner in enumerate(points):
                        # compute the position of the point in relation to the
                        # middle corresponding to one of the following squares
                        # position: [(1,1), (-1,1), (1,-1), (-1,-1)]
                        vx = middle_x - corner.x
                        vy = middle_y - corner.y

                        # NOTE: divides by zero when the middle shares an x
                        # or y coordinate with a corner (degenerate square).
                        square[i] = (int(vx/abs(vx)), int(vy/abs(vy)))

                        belongs = square[i] in squares_taken[corner]

                        is_taken = is_taken or belongs

                    if not is_taken:

                        cell = LTComponent((point.x, point.y,
                                            last_point.x, last_point.y))

                        cells.add(cell)

                        for corner, quadrant in zip(points, square):
                            squares_taken[corner].add(quadrant)
                        break

        return cells