def ajustRectangleShapes(pageNd, lClusterPoly):
    """
    Reduce each cluster polygon to the area that does not overlap the
    polygons that follow it in the list.

    For every polygon, each later polygon whose intersection with it is
    larger than 1% of that later polygon's area is cut away. When a cut
    leaves a multi-part geometry, only its largest part (by area) is kept.

    A 'COLX' element is appended to pageNd for every input polygon; its
    'points' attribute is set only when the reduced polygon is not empty.

    pageNd       -- XML node that receives the 'COLX' elements
    lClusterPoly -- list of shapely geometries (not modified)

    Return the list of reduced geometries, in the same order and with the
    same length as lClusterPoly (entries may be empty geometries).
    """
    lNewPoly = []
    for i, pol1 in enumerate(lClusterPoly):
        # only compare with the polygons located after this one in the list
        for pol2 in lClusterPoly[i + 1:]:
            # ignore negligible overlaps
            if pol1.intersection(pol2).area > pol2.area * 0.01:
                pol1 = pol1.symmetric_difference(pol2).difference(pol2)
                # the cut may split the shape: keep its largest part
                if not pol1.is_empty and isinstance(
                        pol1, (MultiPolygon, GeometryCollection)):
                    pol1 = max(pol1.geoms, key=lambda p: p.area)

        cellNd = etree.Element('COLX')
        pageNd.append(cellNd)
        if not pol1.is_empty:
            cellNd.set('points', ShapeLoader.getCoordsString(pol1))
        lNewPoly.append(pol1)
    return lNewPoly
def processRegions(ndPage, bVerbose=False):
    """
    Delete the empty text regions of a page and resize the other ones.

    A TextRegion without any TextLine descendant is removed from its parent.
    Any other TextRegion gets the 'points' attribute of its first 'Coords'
    child replaced by the convex hull of its TextLine nodes.

    ndPage   -- page node, queried with XPath using the dNS namespaces
    bVerbose -- when true, trace the number of deleted and updated regions

    Return None; the DOM is modified in place.
    """
    lndRegions = ndPage.xpath(".//pg:TextRegion", namespaces=dNS)
    lDel = []
    for ndRegion in lndRegions:
        lTL = ndRegion.xpath(".//pg:TextLine", namespaces=dNS)
        if not lTL:
            # empty region: removal is deferred until all regions were seen
            lDel.append(ndRegion)
            continue
        # resize the region to the hull of its text lines
        oHull = ShapeLoader.convex_hull(lTL, bShapelyObject=True)
        ndCoords = PageXml.getChildByName(ndRegion, 'Coords')[0]
        ndCoords.set("points",
                     ShapeLoader.getCoordsString(oHull, bFailSafe=True))

    # delete empty regions
    for ndRegion in lDel:
        ndRegion.getparent().remove(ndRegion)

    if bVerbose:
        traceln(" - %d regions deleted" % (len(lDel)))
        traceln(" - %d regions updated" % (len(lndRegions) - len(lDel)))
def tableCreation(ndPage, dCellsIJ, dCellTxt, dCoordCell):
    """
    Find the TABLE node of the page and add one CELL child per cell.

    The last node matching './/TABLE' under ndPage is used as the table.
    Each CELL gets the attributes row, col, x, y, height, width (from the
    bounds of the cell geometry), points, colSpan and rowSpan (both "1").
    The text nodes of the cell are sorted in place by increasing Y and then
    appended to the CELL node.

    ndPage     -- page node supporting xpath()
    dCellsIJ   -- cell key -> (row, col)
    dCellTxt   -- cell key -> list of text nodes (sorted in place)
    dCoordCell -- cell key -> shapely geometry of the cell

    Return None. Raise IndexError when ndPage contains no TABLE node.
    """
    # several TABLE nodes may exist: the last one is the real table
    tableNd = ndPage.xpath(".//TABLE")[-1]

    for cellwkt, (i, j) in dCellsIJ.items():
        shape = dCoordCell[cellwkt]
        x1, y1, x2, y2 = shape.bounds

        cellNd = etree.Element('CELL')
        cellNd.set('row', f'{i}')
        cellNd.set('col', f'{j}')
        cellNd.set('x', str(x1))
        cellNd.set('y', str(y1))
        cellNd.set('height', str(abs(y2 - y1)))
        cellNd.set('width', str(abs(x2 - x1)))

        try:
            cellNd.set('points', ShapeLoader.getCoordsString(shape))
        except Exception:
            # The geometry could not be serialised as is (presumably a
            # multi-part geometry, since .geoms is used): take its largest
            # part. 'Exception' rather than a bare except, so that
            # KeyboardInterrupt and SystemExit are not swallowed.
            pol = max(shape.geoms, key=lambda p: p.area)
            cellNd.set('points', ShapeLoader.getCoordsString(pol))

        # populate the cell with its text nodes, sorted by increasing Y
        lCellTxt = dCellTxt[cellwkt]
        lCellTxt.sort(key=lambda x: ShapeLoader.node_to_Point(x).bounds[1])
        for t in lCellTxt:
            cellNd.append(t)

        cellNd.set('colSpan', "1")
        cellNd.set('rowSpan', "1")
        tableNd.append(cellNd)
def shakeRectangleShapes(pageNd, lClusterPoly, dCellCndPerRow, dTextInCells):
    """
    Recompute the shape of each row while ignoring its first cell.

    For every key i of dCellCndPerRow (in sorted order), the text nodes of
    all cells but the first one are gathered and their minimum rotated
    rectangle is computed.
     - If that rectangle is not empty and is less high (Y extent of its
       bounds) than lClusterPoly[i], it is scaled by 2 along X, a 'COL'
       element carrying its points is appended to pageNd, and the rectangle
       is the new shape of the row.
     - Otherwise lClusterPoly[i] is scaled by 1.5 along X, stored back in
       lClusterPoly (the caller's list is modified) and used as the new
       shape of the row.

    Return the list of new shapes, one per key of dCellCndPerRow.
    """
    lNewPoly = []
    for i in sorted(dCellCndPerRow):
        # text nodes of the row, first cell skipped
        lRowTL = []
        for cell in dCellCndPerRow[i][1:]:
            if cell.wkt not in dTextInCells:
                continue
            for tl in dTextInCells[cell.wkt]:
                assert ShapeLoader.node_to_Polygon(tl).is_valid
                lRowTL.append(tl)

        rectangle = ShapeLoader.minimum_rotated_rectangle(
            lRowTL, bShapelyObject=True)

        # the cluster bounds are only read when the rectangle is not empty
        bUseRectangle = (
            not rectangle.is_empty
            and abs(rectangle.bounds[1] - rectangle.bounds[3])
            < abs(lClusterPoly[i].bounds[1] - lClusterPoly[i].bounds[3]))

        if bUseRectangle:
            colNd = etree.Element('COL')
            pageNd.append(colNd)
            rectangle = scale(rectangle, xfact=2)
            lNewPoly.append(rectangle)
            colNd.set('points', ShapeLoader.getCoordsString(rectangle))
        else:
            # keep the original cluster shape, widened
            widened = scale(lClusterPoly[i], xfact=1.5)
            lClusterPoly[i] = widened
            lNewPoly.append(widened)

    return lNewPoly
def makeClusterNode(self, sAlgo):
    """
    Build the PageXml 'Cluster' node describing this cluster.

    The node carries the attributes 'name' (self.name), 'algo' (sAlgo) and
    'content' (the ids of self.setID joined by single spaces), and has one
    'Coords' child whose 'points' attribute is the coordinate string of
    self.shape, or an empty string when self.shape is None.

    Return the new node (its tail is a newline).
    """
    ndCluster = PageXml.createPageXmlNode('Cluster')
    for sAttr, sValue in (("name", self.name),
                          ("algo", sAlgo),
                          # space separated list of node ids
                          ("content", " ".join(self.setID))):
        ndCluster.set(sAttr, sValue)

    ndCoords = PageXml.createPageXmlNode('Coords')
    ndCluster.append(ndCoords)
    sPoints = "" if self.shape is None else ShapeLoader.getCoordsString(
        self.shape)
    ndCoords.set('points', sPoints)

    ndCluster.tail = "\n"
    return ndCluster
def transformMinima2envelop(pageNd, dClusterWithTL):
    """
    Compute the convex hull of each cluster and draw it on the page.

    For every key of dClusterWithTL, the convex hull of the contour object
    built from the cluster content is computed. Each non-empty hull is
    materialised as a 'LINE' element (with a 'points' attribute) appended
    to pageNd.

    Return the list of hulls, one per key, empty hulls included.
    """
    lClustersPoly = []
    for key in dClusterWithTL:
        hull = ShapeLoader.contourObject(dClusterWithTL[key]).convex_hull
        lClustersPoly.append(hull)
        if hull.is_empty:
            # nothing to draw for this cluster
            continue
        lineNd = etree.Element('LINE')
        pageNd.append(lineNd)
        lineNd.set('points', ShapeLoader.getCoordsString(hull))
    return lClustersPoly
def makeTableNode(self):
    """
    Build the PageXml 'TableRegion' DOM tree of this table.

    The keys of self._dCellNd are (row, col) pairs. Every combination of
    the row and column indices found in those keys is looked up; a
    'TableCell' node is created for each combination holding text nodes.
    A cell contains a 'Coords' node (convex hull of its text nodes), a
    'CornerPts' node and then the text nodes themselves.
    The 'Coords' of the table is the minimum rotated rectangle of the union
    of the cell hulls (an invalid hull is replaced by its convex hull).

    NOTE(review): self._dCellNd[(row, col)] is evaluated for combinations
    that may be absent from the keys - presumably a defaultdict; confirm.

    Return the TableRegion node.
    """
    cellKeys = self._dCellNd.keys()
    lRow = sorted({r for r, _ in cellKeys})
    lCol = sorted({c for _, c in cellKeys})

    ndTable = PageXml.createPageXmlNode("TableRegion")
    ndTable.set("id", "p%s_%s" % (self.pagenum, self.tablenum))
    ndTable.tail = "\n"

    lCellShape = []  # cell hulls, kept to compute the table contour
    lNdCell = []
    for row in lRow:
        for col in lCol:
            lNdText = self._dCellNd[(row, col)]
            if not lNdText:
                continue

            ndCell = PageXml.createPageXmlNode("TableCell")
            sCellId = "p%s_t%s_r%s_c%s" % (self.pagenum, self.tablenum,
                                           row, col)
            ndCell.set("id", sCellId)

            # shape of the cell
            oHull = ShapeLoader.convex_hull(lNdText, bShapelyObject=True)
            lCellShape.append(oHull)

            # Coords sub-element
            ndCellCoords = PageXml.createPageXmlNode("Coords")
            ndCellCoords.set(
                "points", ShapeLoader.getCoordsString(oHull, bFailSafe=True))
            ndCellCoords.tail = "\n"
            ndCell.append(ndCellCoords)

            # position of the cell in the grid (attribute order preserved)
            for sAttr, sValue in (("row", str(row)), ("rowSpan", "1"),
                                  ("col", str(col)), ("colSpan", "1")):
                ndCell.set(sAttr, sValue)
            ndCell.tail = "\n"

            # corner points
            ndCorner = PageXml.createPageXmlNode("CornerPts")
            ndCorner.text = "0 1 2 3"
            ndCell.append(ndCorner)

            for ndText in lNdText:
                ndCell.append(ndText)
            lNdCell.append(ndCell)

    # Table geometry
    ndTableCoords = PageXml.createPageXmlNode("Coords")
    lValidShape = [p if p.is_valid else p.convex_hull for p in lCellShape]
    oTableRect = cascaded_union(lValidShape).minimum_rotated_rectangle
    ndTableCoords.set(
        "points", ShapeLoader.getCoordsString(oTableRect, bFailSafe=True))
    ndTableCoords.tail = "\n"
    ndTable.append(ndTableCoords)

    for ndCell in lNdCell:
        ndTable.append(ndCell)
    return ndTable
def checkClusteringPerRow(pageNd, dCellCndPerRow, dTextInCells, dClusterWithTL, dEdges):
    """
    Recompute the clusters of each row and split the rows which seem to
    contain several rows.

    For each row, the text lines of every non-empty cell are clustered with
    connected_components(dEdges, ids, TH=0.5). When more than two cells of
    the row hold text and the most frequent number of clusters per cell
    (nbRows) is not 1, the row is marked for deletion and replaced by
    nbRows new sub-rows.

    pageNd         -- XML node that receives one 'LINE' element per new row
    dCellCndPerRow -- row id -> list of cell geometries (their .wkt is used
                      as key of dTextInCells)
    dTextInCells   -- cell wkt -> list of text line nodes
    dClusterWithTL -- row id -> list of text line nodes of the row
    dEdges         -- edge data handed over to connected_components
                      (structure not visible here)

    Return a 5-tuple:
        lRowtodel         -- ids of the rows that were split
        lNewPoly          -- shape of each new row (envelope, scaled by 2
                             along X)
        lNewClusterWithTL -- per new row, the flat list of its text lines
        lNewClusterWithCC -- per new row, its text lines grouped per column
        lTLtoBeAdded      -- text lines of the columns whose number of
                             clusters differs from nbRows
    """
    lNewPoly = []
    lNewClusterWithTL = []
    lNewClusterWithCC = []
    lRowtodel = []
    lTLtoBeAdded = []
    for rowid, rowCells in dCellCndPerRow.items():
        lNbClusterPerCol = []  # number of clusters found in each non-empty cell
        lClusterPerCol = []  # the clusters themselves, same order
        for cell in rowCells:
            # cells without text are ignored
            if cell.wkt in dTextInCells.keys():
                lIds = [tl.get('id') for tl in dTextInCells[cell.wkt]]
                # presumably groups the ids linked by edges scoring above TH
                # - TODO confirm against connected_components
                lClusterCells = connected_components(dEdges, lIds, TH=0.5)
                lNbClusterPerCol.append(len(lClusterCells))
                lClusterPerCol.append(lClusterCells)

        # a decision is taken only with more than two non-empty cells
        if len(lNbClusterPerCol) > 2:
            try:
                nbRows = mode(lNbClusterPerCol)
            except statistics.StatisticsError:
                # no unique mode: assume the row must be cut in two
                # NOTE(review): if mode is statistics.mode, Python >= 3.8
                # no longer raises for multimodal data - confirm this
                # fallback is still reachable
                nbRows = 2
            if nbRows != 1:
                print('WARNING CUT ', rowid, "mode", nbRows)
                lRowtodel.append(rowid)
                # index of the new sub-row -> list (one entry per column) of
                # lists of text lines
                dClusterCells = defaultdict(list)
                for colcluster in lClusterPerCol:
                    if len(colcluster) == nbRows:
                        # get the text line nodes instead of their ids
                        colclustertxt = []
                        for cc in colcluster:
                            colclustertxt.append([
                                tl for tl in dClusterWithTL[rowid]
                                if tl.get('id') in cc
                            ])
                        # order the clusters of the column from top to bottom;
                        # only the first text line of each cluster is used to
                        # compare positions
                        # NOTE(review): x[0] raises IndexError if no text line
                        # of dClusterWithTL[rowid] belongs to a cluster
                        sortedC = sorted(colclustertxt,
                                         key=lambda x: ShapeLoader.
                                         node_to_Polygon(x[0]).centroid.y)
                        # the i-th cluster of the column goes to the i-th
                        # sub-row
                        for i, cc in enumerate(sortedC):
                            dClusterCells[i].append(cc)
                    else:
                        # column not consistent with nbRows: its text lines
                        # are set aside and returned in lTLtoBeAdded
                        for cc in colcluster:
                            lTLtoBeAdded.extend([
                                tl for tl in dClusterWithTL[rowid]
                                if tl.get('id') in cc
                            ])
                for i, lcc in dClusterCells.items():
                    # bounding box of all the text lines of the new sub-row
                    rect = ShapeLoader.contourObject([
                        x for cc in lcc for x in cc
                    ]).envelope  # minimal_rectangle?
                    rect = scale(rect, xfact=2)
                    lNewPoly.append(rect)
                    lNewClusterWithCC.append(lcc)
                    lNewClusterWithTL.append([x for cc in lcc for x in cc])
                    # need also to create a cluster with list of ids!!

                    # materialise the new row in the page
                    rNd = etree.Element('LINE')
                    rNd.set('points', ShapeLoader.getCoordsString(rect))
                    pageNd.append(rNd)
                # final check: if overlap between rectangle: top rectangle is the cut

    # return the updated lists of clusters
    return lRowtodel, lNewPoly, lNewClusterWithTL, lNewClusterWithCC, lTLtoBeAdded