def __init__(self, baseline, font):
    """Set up an empty component associated with a baseline and a font.

    The base ``LTComponent`` is initialised with the bounding box
    ``(+INF, +INF, -INF, -INF)``.
    NOTE(review): presumably an "empty" box meant to be grown later as
    content is added -- confirm against the code that extends it.

    :param baseline: stored unchanged as ``self.baseline``.
    :param font: stored unchanged as ``self.font``.
    """
    LTComponent.__init__(self, (+INF, +INF, -INF, -INF))

    # Caller-supplied description of this component.
    self.baseline = baseline
    self.font = font
    self.type = None

    # Nothing has been collected yet.
    self._chars = []
    self._text = []
def __init__(self, baseline, font):
    """Create a component with no characters or text, bound to a baseline.

    The parent ``LTComponent`` receives ``(+INF, +INF, -INF, -INF)`` as its
    bounding box.
    NOTE(review): this looks like a placeholder box that later additions
    are expected to shrink/grow into a real one -- confirm.

    :param baseline: kept on the instance as ``self.baseline``.
    :param font: kept on the instance as ``self.font``.
    """
    initial_bbox = (+INF, +INF, -INF, -INF)
    LTComponent.__init__(self, initial_bbox)

    self._chars, self._text = [], []
    self.baseline, self.font = baseline, font
    self.type = None
def __init__(self, bbox):
    """Create a component covering ``bbox`` with empty content collections.

    :param bbox: bounding box forwarded unchanged to ``LTComponent``.
    """
    LTComponent.__init__(self, bbox)

    # Separate, initially empty containers for each kind of content.
    self._chars, self._figs, self._curves = [], [], []

    # Neither a type nor a font is known at construction time.
    self.type = self.font = None
def to_tables7(self):
    """Group text lines into rows and assign each row's texts to columns.

    Reads ``self.texts`` (skipping members of ``self.multi_col_boxes``) and
    ``self.finalcols2``; stores the result in ``self.lines``.

    Each resulting row is a dict with:
      * ``'contain'`` -- an ``LTComponent`` spanning the row,
      * ``'texts'``   -- the text lines placed in the row,
      * ``'cells'``   -- one slot per entry of ``self.finalcols2`` holding
        the text assigned to that column, or ``None``.

    Rows are sorted in ascending order of their container's ``y0``.
    """
    rows = []
    single_col_boxes = (
        box for box in self.texts if box not in self.multi_col_boxes
    )
    for box in single_col_boxes:
        for text_line in box:
            matched = False
            for row in rows:
                extent = row['contain']
                # Vertical overlap relative to the row height must lie
                # strictly between 0.9 and 1.1 for the line to join the row.
                ratio = extent.voverlap(text_line) / extent.height
                if 0.9 < ratio < 1.1:
                    row['texts'].append(text_line)
                    # Widen the row horizontally; its vertical extent stays.
                    extent.set_bbox((
                        min(extent.x0, text_line.x0),
                        extent.y0,
                        max(extent.x1, text_line.x1),
                        extent.y1,
                    ))
                    # No break: the line joins every row it matches.
                    matched = True
            if not matched:
                rows.append({
                    'contain': LTComponent(text_line.bbox),
                    'texts': [text_line],
                })

    rows.sort(key=lambda row: row['contain'].y0)

    for row in rows:
        cells = [None] * len(self.finalcols2)
        row['cells'] = cells
        for text in row['texts']:
            for idx, column in enumerate(self.finalcols2):
                # An already occupied slot is left alone and the search
                # moves on: the parser library sometimes seems to
                # duplicate text (possible bug).
                if column['contain'].hoverlap(text) and cells[idx] is None:
                    cells[idx] = text
                    break

    self.lines = rows
def __init__(self, network):
    """Build the table from a network of points.

    :param network: object supporting ``len()`` and exposing ``points``,
        an iterable of objects with ``x`` and ``y`` attributes.
    :raises self.EmptyTableError: when ``len(network) <= 2``.
    """
    if len(network) <= 2:
        raise self.EmptyTableError

    # Distinct y values are the row borders, distinct x values the column
    # borders; both are kept in ascending order.
    self._rows_borders = sorted({point.y for point in network.points})
    self._columns_borders = sorted({point.x for point in network.points})

    # The component's bounding box spans the extreme borders.
    x_min, x_max = self._columns_borders[0], self._columns_borders[-1]
    y_min, y_max = self._rows_borders[0], self._rows_borders[-1]
    LTComponent.__init__(self, (x_min, y_min, x_max, y_max))

    self._cells = self._create_cells(network)
    self._elements = self._build_elements(self._cells)
def newLTCharInit(self, matrix, font, fontsize, scaling, rise, text,
                  textwidth, textdisp):
    """Initialise a character object, also keeping its font and font size.

    Stores the text, transformation matrix, font name and advance
    (``textwidth * fontsize * scaling``), derives the character's bounding
    rectangle from the font metrics, maps it through ``matrix`` and hands
    the normalised box to ``LTComponent``.

    :param matrix: 6-element transformation ``(a, b, c, d, e, f)``.
    :param font: font object providing ``fontname``, ``is_vertical()`` and
        the width/height/descent getters used below.
    :param fontsize: multiplier applied to the font metrics.
    :param scaling: multiplier applied to the advance; also used in the
        ``upright`` test.
    :param rise: offset added to the vertical baseline position.
    :param text: stored as ``self._text``.
    :param textwidth: unscaled advance of the character.
    :param textdisp: ``(vx, vy)`` pair, only unpacked for vertical fonts;
        ``vx`` may be ``None``.
    """
    LTText.__init__(self)
    # Patched in line
    # NOTE(review): presumably the font/fontsize attributes are the
    # additions this replacement initialiser exists for -- confirm.
    self.font = font
    self.fontsize = fontsize
    self._text = text
    self.matrix = matrix
    self.fontname = font.fontname
    self.adv = textwidth * fontsize * scaling

    # Compute the boundary rectangle (two opposite corners, untransformed).
    if font.is_vertical():
        glyph_width = font.get_width() * fontsize
        disp_x, disp_y = textdisp
        # A missing horizontal displacement falls back to half the width.
        disp_x = (glyph_width // 2 if disp_x is None
                  else disp_x * fontsize * .001)
        disp_y = (1000 - disp_y) * fontsize * .001
        origin_x = -disp_x
        origin_y = disp_y + rise
        corner_a = (origin_x, origin_y + self.adv)
        corner_b = (origin_x + glyph_width, origin_y)
    else:
        glyph_height = font.get_height() * fontsize
        glyph_descent = font.get_descent() * fontsize
        origin_y = glyph_descent + rise
        corner_a = (0, origin_y)
        corner_b = (self.adv, origin_y + glyph_height)

    a, b, c, d, _e, _f = self.matrix
    self.upright = a * d * scaling > 0 and b * c <= 0

    x0, y0 = apply_matrix_pt(self.matrix, corner_a)
    x1, y1 = apply_matrix_pt(self.matrix, corner_b)
    # Normalise so that (x0, y0) is the lower-left corner.
    if x1 < x0:
        x0, x1 = x1, x0
    if y1 < y0:
        y0, y1 = y1, y0
    LTComponent.__init__(self, (x0, y0, x1, y1))

    # Size is measured across the writing direction.
    self.size = self.width if font.is_vertical() else self.height
def __init__(self, annoObj, uri, pos, pageid):
    """Create a record for one annotation and its first position.

    :param annoObj: becomes the first entry of ``self.origObjs``.
    :param uri: stored as ``self.gotoLoc``.
    :param pos: bounding box wrapped in an ``LTComponent``.
    :param pageid: stored next to that component in ``self.positions``.
    """
    # Source annotation and its target.
    self.origObjs = [annoObj]
    self.gotoLoc = uri
    self.destPage = None

    # Each position is a [component, page id] pair.
    self.positions = [[LTComponent(pos), pageid]]

    # Associated text starts as a single empty string.
    self.assocText = [""]
    self.assocTextIn = 0

    # Remaining fields start out blank.
    self.unparseCite = ""
    self.finalCiteStr = ""
    self.papLink = ""
    self.author = ""
    self.year = ""
    self.papObj = None
def to_tables3(self):
    """Cluster ``self.columns`` into groups of horizontally overlapping columns.

    A column joins the first existing group in which it horizontally
    overlaps every member; otherwise it starts a new group whose
    ``'contain'`` is a fresh ``LTComponent`` built from the column's bbox.
    The number of groups is logged and the groups are stored in
    ``self.colgroups``.
    """
    groups = []
    for column in self.columns:
        for group in groups:
            if column is group:
                continue
            overlaps_all = all(
                column['contain'].is_hoverlap(member['contain'])
                for member in group['cols']
            )
            if overlaps_all:
                group['cols'].append(column)
                break
        else:
            # No existing group accepted the column.
            groups.append({
                'contain': LTComponent(column['contain'].bbox),
                'cols': [column],
            })

    logger.info('{x} colgroups'.format(x=len(groups)))
    self.colgroups = groups
def given_plane_with_one_object(object_size=50, gridsize=50):
    """Return a ``Plane`` over (0, 0, 100, 100) holding one square object.

    :param object_size: side of the object, whose bbox is
        ``(0, 0, object_size, object_size)``.
    :param gridsize: passed through as the second ``Plane`` argument.
    :returns: ``(plane, obj)`` -- the plane and the object added to it.
    """
    plane = Plane((0, 0, 100, 100), gridsize)
    square_bbox = (0, 0, object_size, object_size)
    obj = LTComponent(square_bbox)
    plane.add(obj)
    return (plane, obj)
def __init__(self, bbox):
    """Create a component covering ``bbox`` with no text lines yet.

    :param bbox: bounding box forwarded unchanged to ``LTComponent``.
    """
    LTComponent.__init__(self, bbox)
    self.text_lines = list()
def __init__(self, bbox, text):
    """Create a component covering ``bbox`` that carries ``text``.

    :param bbox: bounding box forwarded unchanged to ``LTComponent``.
    :param text: stored unchanged as ``self.text``.
    """
    LTComponent.__init__(self, bbox)
    self.text = text
def to_tables2(self):
    """Assign every horizontal text box of ``self.layout`` to a column.

    Each column is a dict with a ``'contain'`` LTComponent, a ``'boxes'``
    list and a ``'color'`` name. A box joins the first column (columns are
    visited in the list's current order) that either strictly contains it
    horizontally while the box is at least 0.8 of the column's width, or
    whose horizontal overlap with it is between 0.9 and 1.1 of the column's
    width. Otherwise a new column is created from the box. The resulting
    list is stored in ``self.columns``.
    """
    columns = []
    # texts.sort(key=lambda x: x.width)
    for e in self.layout:
        if isinstance(e, LTTextBoxHorizontal):
            logger.info('Finding a column for box {i}'.format(i=e.index))
            ##im2 = im.copy()
            ##d = ImageDraw.Draw(im2)
            col = None
            for c in columns:
                # Case 1: the box lies strictly inside the column's x-range.
                if (e.x1 < c['contain'].x1) and (e.x0 > c['contain'].x0):
                    if (e.width / c['contain'].width) < 0.8:
                        # Much narrower than the column: do not attach it
                        # here, keep looking at the remaining columns.
                        logger.info(
                            'Item too small, column may be several columns wide'
                        )
                    else:
                        logger.info('Item totally contained in column')
                        logger.info(
                            '{ex1} < {cx1} and {ex0} > {cx0}'.format(
                                ex1=e.x1,
                                cx1=c['contain'].x1,
                                ex0=e.x0,
                                cx0=c['contain'].x0))
                        col = c
                        col['boxes'].append(e)
                        # Only the vertical extent grows; x-range is kept.
                        col['contain'].set_bbox(
                            (c['contain'].x0, min(c['contain'].y0, e.y0),
                             c['contain'].x1, max(c['contain'].y1, e.y1)))
                        ##d.rectangle([int(c['contain'].bbox[0] * scale_factor), h - int(c['contain'].bbox[3] * scale_factor), int(c['contain'].bbox[2] * scale_factor), h - int(c['contain'].bbox[1] * scale_factor)], fill=None, outline=c['color'], width=5 * scale_factor)
                        break
                # Case 2: horizontal overlap is within 10% of the column width.
                elif ((c['contain'].hoverlap(e) / c['contain'].width) >
                      0.9) and ((c['contain'].hoverlap(e) /
                                 c['contain'].width) < 1.1):
                    logger.info('Item is within 10% of current col width')
                    logger.info(
                        'Overlap of {hdist}, column width of {width}'.
                        format(hdist=c['contain'].hoverlap(e),
                               width=c['contain'].width))
                    col = c
                    col['boxes'].append(e)
                    # Grow the column box in both directions to cover e.
                    col['contain'].set_bbox(
                        (min(c['contain'].x0, e.x0),
                         min(c['contain'].y0, e.y0),
                         max(c['contain'].x1, e.x1),
                         max(c['contain'].y1, e.y1)))
                    ##d.rectangle([int(c['contain'].bbox[0] * scale_factor), h - int(c['contain'].bbox[3] * scale_factor), int(c['contain'].bbox[2] * scale_factor), h - int(c['contain'].bbox[1] * scale_factor)], fill=None, outline=c['color'], width=5 * scale_factor)
                    break
            if not col:
                logger.info('Creating new column')
                # NOTE(review): 'boxes' is seeded with list(e), i.e. the
                # items obtained by iterating e, whereas the branches above
                # append e itself -- confirm this mix is intended.
                col = {
                    'contain': LTComponent(e.bbox),
                    'boxes': list(e),
                    'color': random.choice(list(PIL.ImageColor.colormap.keys()))
                }
                columns.append(col)
                # Keep columns ordered narrowest first for the next box.
                columns.sort(key=lambda x: x['contain'].width)
                ##d.rectangle([int(col['contain'].bbox[0] * scale_factor), h - int(col['contain'].bbox[3] * scale_factor), int(col['contain'].bbox[2] * scale_factor), h - int(col['contain'].bbox[1] * scale_factor)], fill=None, outline=col['color'], width=5 * scale_factor)
            ##im2.save('page0.{x}.png'.format(x=e.index), "PNG")
    self.columns = columns
def addPosition(self, pos, pageid):
    """Append one more ``[LTComponent(pos), pageid]`` pair to ``self.positions``.

    :param pos: bounding box wrapped in a new ``LTComponent``.
    :param pageid: stored alongside that component.
    """
    located = [LTComponent(pos), pageid]
    self.positions.append(located)
def __init__(self, ltimage):
    """Wrap an ``LTImage``, copying its bounding box, name and stream.

    :param ltimage: must be an ``LTImage`` instance (checked with an
        ``assert``, so the check disappears under ``python -O``).
    """
    assert isinstance(ltimage, LTImage)
    LTComponent.__init__(self, ltimage.bbox)
    self._name = ltimage.name
    self._stream = ltimage.stream
def _create_cells(network):
    """
    Creates cells from the network and returns them as LTComponents.

    For every point (visited in ascending ``(x, y)`` order) the function
    looks for two perpendicular links ``l1`` and ``l2`` and a fourth point
    linked to both; these four points form a candidate cell. Each point
    can contribute to at most one cell per quadrant around it; a candidate
    is rejected when any of its corners already has that quadrant used.

    :param network: iterable of points (objects with ``x`` and ``y``)
        exposing ``links``, a mapping from a point to a set of the points
        linked to it.
    :returns: a ``set`` of ``LTComponent``, one per cell, each built from
        the starting point and the opposite corner.

    NOTE(review): the function takes no ``self``; presumably it is wrapped
    as a static method outside this view -- confirm.
    """
    # For each point, the quadrants (sign pairs) already claimed by a cell.
    squares_taken = defaultdict(set)
    cells = set()

    def city_distance(point, point_prime):
        # Manhattan (taxicab) distance between two points.
        return abs(point.x - point_prime.x) + abs(point.y - point_prime.y)

    def is_perpendicular(v1_x, v1_y, v2_x, v2_y):
        # Two vectors are perpendicular when their dot product is zero.
        return v1_x*v2_x + v1_y*v2_y == 0

    for point in sorted(network, key=lambda p: (p.x, p.y)):
        for l1 in sorted(network.links[point],
                         key=lambda p: city_distance(p, point)):
            valid_links = [
                link for link in network.links[point]
                if link != l1 and
                is_perpendicular(link.x - point.x, link.y - point.y,
                                 l1.x - point.x, l1.y - point.y)]

            for l2 in sorted(valid_links,
                             key=lambda p: city_distance(p, point)):
                # Points linked to both l1 and l2 are candidate opposite
                # corners.
                intersection = list(
                    network.links[l2].intersection(network.links[l1]))

                # remove initial point
                intersection.remove(point)

                if not intersection:
                    continue

                # sort by areas: smallest area first
                # NOTE(review): the area is signed, so a corner below/left
                # of `point` sorts by its negative value -- confirm that
                # this ordering is intended.
                intersection.sort(
                    key=lambda p: (p.x - point.x)*(p.y - point.y))
                opposite = intersection[0]

                # square is formed by [point, l1, l2, last_point], in this
                # order.
                points = [point, l1, l2, opposite]

                # compute middle position of the square
                middle_x = sum(p.x for p in points)/4.
                middle_y = sum(p.y for p in points)/4.

                # check if any point already has one of its squares
                # (at most 4) used.
                is_taken = False
                # Must be a real list: on Python 3 `range(4)` is immutable
                # and item assignment on it raises TypeError.
                square = [None] * 4
                for i, corner in enumerate(points):
                    # compute the position of the point in relation to the
                    # middle corresponding to one of the following squares
                    # position: [(1,1), (-1,1), (1,-1), (-1,-1)]
                    vx = middle_x - corner.x
                    vy = middle_y - corner.y

                    square[i] = (int(vx/abs(vx)), int(vy/abs(vy)))

                    if square[i] in squares_taken[corner]:
                        is_taken = True

                if not is_taken:
                    cell = LTComponent((point.x, point.y,
                                        opposite.x, opposite.y))
                    cells.add(cell)

                    # Mark the quadrant of every corner as used.
                    for corner, quadrant in zip(points, square):
                        squares_taken[corner].add(quadrant)
                    # One cell per l1 direction: move on to the next l1.
                    break

    return cells