def processRegions(ndPage, bVerbose=False):
    """
    Tidy the TextRegion elements found under ndPage.

    A region that contains no TextLine is removed from the DOM. Any other
    region gets its Coords/@points overwritten with the convex hull of its
    TextLine descendants.

    ndPage: DOM node to search (queried with the 'pg' namespace prefix)
    bVerbose: when true, trace the number of deleted and updated regions
    """
    lndRegions = ndPage.xpath(".//pg:TextRegion", namespaces=dNS)
    lndEmpty = []
    for ndRegion in lndRegions:
        lndLine = ndRegion.xpath(".//pg:TextLine", namespaces=dNS)
        if not lndLine:
            # removal is deferred until the scan is over
            lndEmpty.append(ndRegion)
            continue
        # non-empty region: make its contour wrap its text lines
        oHull = ShapeLoader.convex_hull(lndLine, bShapelyObject=True)
        ndCoords = PageXml.getChildByName(ndRegion, 'Coords')[0]
        ndCoords.set("points",
                     ShapeLoader.getCoordsString(oHull, bFailSafe=True))

    for ndRegion in lndEmpty:
        ndRegion.getparent().remove(ndRegion)

    if bVerbose:
        nDeleted = len(lndEmpty)
        traceln(" - %d regions deleted" % nDeleted)
        traceln(" - %d regions updated" % (len(lndRegions) - nDeleted))
def shakeRectangleShapes(pageNd, lClusterPoly, dCellCndPerRow, dTextInCells):
    """
    Recompute one polygon per row, ignoring the first cell of each row.

    For each key i of dCellCndPerRow (in sorted order), the text nodes of
    all cells but the first are collected (via dTextInCells, keyed by cell
    WKT) and their minimum rotated rectangle is computed.
      - If that rectangle is not empty and less high (bounds[3]-bounds[1])
        than lClusterPoly[i], it is stretched by 2 along X, recorded, and a
        'COL' element carrying its points is appended to pageNd.
      - Otherwise lClusterPoly[i] is stretched by 1.5 along X, stored back
        into lClusterPoly (the list is modified in place) and recorded.

    return the list of recorded polygons, one per key
    """
    lNewPoly = []
    for iRow in sorted(dCellCndPerRow):
        # text nodes of every cell of the row, except the first cell
        lndText = []
        for oCell in dCellCndPerRow[iRow][1:]:
            for ndText in dTextInCells.get(oCell.wkt, ()):
                assert ShapeLoader.node_to_Polygon(ndText).is_valid
                lndText.append(ndText)

        oRect = ShapeLoader.minimum_rotated_rectangle(lndText,
                                                      bShapelyObject=True)

        bUseRect = False
        if not oRect.is_empty:
            fRectHeight = abs(oRect.bounds[1] - oRect.bounds[3])
            fClusterHeight = abs(lClusterPoly[iRow].bounds[1]
                                 - lClusterPoly[iRow].bounds[3])
            bUseRect = fRectHeight < fClusterHeight

        if bUseRect:
            ndCol = etree.Element('COL')
            pageNd.append(ndCol)
            oRect = scale(oRect, xfact=2)
            lNewPoly.append(oRect)
            ndCol.set('points', ShapeLoader.getCoordsString(oRect))
        else:
            # keep the original cluster shape, only widened
            lClusterPoly[iRow] = scale(lClusterPoly[iRow], xfact=1.5)
            lNewPoly.append(lClusterPoly[iRow])
    return lNewPoly
def computeShape(cls, ndPage, setID, bConvexHull=False):
    """
    Build the shape of a cluster from the DOM nodes it refers to.

    Each id of setID is looked up under ndPage (first node with that @id).
    The shape is the minimum rotated rectangle of those nodes, or their
    convex hull when bConvexHull is true.

    An IndexError is raised if some id matches no node.
    """
    lNode = []
    for _id in setID:
        lMatch = ndPage.xpath(".//*[@id='%s']" % _id, namespaces=dNS)
        lNode.append(lMatch[0])

    if bConvexHull:
        return ShapeLoader.convex_hull(lNode, bShapelyObject=True)
    return ShapeLoader.minimum_rotated_rectangle(lNode, bShapelyObject=True)
def ajustRectangleShapes(pageNd, lClusterPoly):
    """
    Trim each cluster polygon so that it no longer overlaps the polygons
    that come after it in lClusterPoly.

    A following polygon is subtracted only when the overlap exceeds 1% of
    that polygon's area. When the subtraction leaves a MultiPolygon or a
    GeometryCollection, only its largest part (by area) is kept.

    For every input polygon a 'COLX' element is appended to pageNd; its
    @points is set only when the resulting shape is not empty.

    return the list of resulting shapes, aligned with lClusterPoly
    """
    lNewPoly = []
    for iPoly, oShape in enumerate(lClusterPoly):
        for oOther in lClusterPoly[iPoly + 1:]:
            if not (oShape.intersection(oOther).area > oOther.area * 0.01):
                continue    # negligible overlap: leave the shape alone
            oShape = oShape.symmetric_difference(oOther).difference(oOther)
            bMultiPart = type(oShape) in (MultiPolygon, GeometryCollection)
            if not oShape.is_empty and bMultiPart:
                oShape = max(oShape.geoms, key=lambda p: p.area)

        ndCol = etree.Element('COLX')
        pageNd.append(ndCol)
        if not oShape.is_empty:
            ndCol.set('points', ShapeLoader.getCoordsString(oShape))
        lNewPoly.append(oShape)
    return lNewPoly
def tableCreation(ndPage, dCellsIJ, dCellTxt, dCoordCell):
    """
    Add one CELL element per cell to the last TABLE element under ndPage.

    ndPage: DOM node under which TABLE elements are searched
    dCellsIJ: cell key -> (row, col)
    dCellTxt: cell key -> list of text nodes; each list is sorted in place
              by increasing Y and its nodes are moved into the CELL
    dCoordCell: cell key -> shape of the cell

    Each CELL gets row, col, x, y, height, width (from the shape bounds),
    points, colSpan="1" and rowSpan="1".

    An IndexError is raised if ndPage contains no TABLE element.
    """
    # when several TABLE exist, the last one is the real one
    tableNd = ndPage.xpath(".//TABLE")[-1]

    for cellwkt, (i, j) in dCellsIJ.items():
        oCellShape = dCoordCell[cellwkt]
        x1, y1, x2, y2 = oCellShape.bounds

        cellNd = etree.Element('CELL')
        cellNd.set('row', f'{i}')
        cellNd.set('col', f'{j}')
        cellNd.set('x', str(x1))
        cellNd.set('y', str(y1))
        cellNd.set('height', str(abs(y2 - y1)))
        cellNd.set('width', str(abs(x2 - x1)))

        try:
            cellNd.set('points', ShapeLoader.getCoordsString(oCellShape))
        except Exception:
            # presumably a multi-part geometry: fall back on its largest part
            # (was a bare except, which also trapped KeyboardInterrupt)
            pol = max(oCellShape.geoms, key=lambda p: p.area)
            cellNd.set('points', ShapeLoader.getCoordsString(pol))

        # populate the cell with its text nodes, top to bottom
        dCellTxt[cellwkt].sort(
            key=lambda x: ShapeLoader.node_to_Point(x).bounds[1])
        for ndText in dCellTxt[cellwkt]:
            cellNd.append(ndText)

        cellNd.set('colSpan', "1")
        cellNd.set('rowSpan', "1")
        tableNd.append(cellNd)
    return
def isBaselineHorizontal(ndText):
    """
    Tell whether the first Baseline of ndText is at least as wide as high.

    return True also when ndText has no Baseline, or when its Baseline
    cannot be turned into a LineString (best effort: assume horizontal).
    """
    lNdBaseline = MultiPageXml.getChildByName(ndText, 'Baseline')
    if not lNdBaseline:
        return True

    try:
        o = ShapeLoader.node_to_LineString(lNdBaseline[0])
    except Exception:
        # unusable baseline; was a bare except, which would also have
        # swallowed KeyboardInterrupt / SystemExit
        return True

    (minx, miny, maxx, maxy) = o.bounds
    return bool(maxx - minx >= maxy - miny)
def transformMinima2envelop(pageNd, dClusterWithTL):
    """
    Compute the convex hull of each cluster of dClusterWithTL.

    For every non-empty hull, a 'LINE' element carrying its points is
    appended to pageNd.

    return the list of hulls (empty ones included), in dictionary order
    """
    lClustersPoly = []
    for lContent in dClusterWithTL.values():
        oHull = ShapeLoader.contourObject(lContent).convex_hull
        lClustersPoly.append(oHull)
        if oHull.is_empty:
            continue
        ndLine = etree.Element('LINE')
        pageNd.append(ndLine)
        ndLine.set('points', ShapeLoader.getCoordsString(oHull))
    return lClustersPoly
def parseDocNodeLabel(self, graph_node, defaultCls=None):
    """
    Read the label of a graph node from its DOM node and return the
    corresponding entry of self.dXmlLabel2Label.

    The XML label is read from the attribute self.sLabelAttr, or forced to
    'CH' when self.bColumnHeader is set and @DU_header is 'CH'. It is then
    mapped through self.dConverter. When the converter yields None, the
    label is derived from the position of the text centroid in its parent
    polygon: "Sm" if inside the middle third (vertically), else "St" if
    above the parent centroid, else "Sb".

    raise a ValueError if the resulting label is not in dXmlLabel2Label
    (defaultCls is not used)
    """
    ndDom = graph_node.node

    sRaw = ndDom.get(self.sLabelAttr)
    if self.bColumnHeader and 'CH' == ndDom.get("DU_header"):
        # in case we also deal with column headers
        sRaw = 'CH'
    sXmlLabel = self.dConverter[sRaw]

    if sXmlLabel is None:
        # singleton: label it by its vertical position in the parent shape
        ptTxt = ShapeLoader.node_to_Polygon(ndDom).centroid
        plgCell = ShapeLoader.node_to_Polygon(ndDom.getparent())
        plgMiddle = shapely.affinity.scale(plgCell, 1, 0.333, 1, 'centroid')
        if plgMiddle.contains(ptTxt):
            sXmlLabel = "Sm"
        elif ptTxt.y < plgCell.centroid.y:
            sXmlLabel = "St"
        else:
            sXmlLabel = "Sb"

    try:
        return self.dXmlLabel2Label[sXmlLabel]
    except KeyError:
        raise ValueError("Invalid label '%s'"
                         " (from @%s or @%s) in node %s" %
                         (sXmlLabel, self.sLabelAttr, self.sDefaultLabel,
                          etree.tostring(ndDom)))
def addClusterToDom(self, dCluster, bMoveContent=False):
    """
    Add one Cluster element per entry of dCluster to the Page DOM node.

    dCluster: mapping whose values are sequences of indices into self.lNode
    bMoveContent: when true, the DOM nodes of the content are moved under
                  their Cluster element

    The page node is the page of the first node of the first cluster; a
    comment is appended to it before the first Cluster. Each Cluster gets
    a @content (space-separated node ids) and a Coords child whose @points
    is the minimum rotated rectangle of the union of the node polygons, or
    "" when that rectangle cannot be serialised.
    """
    pageNode = None
    for lnidx in dCluster.values():
        if pageNode is None:
            pageNode = self.lNode[lnidx[0]].page.node
            pageNode.append(
                etree.Comment("Clusters created by the conjugate graph"))

        # robust to bad data: nodes without a valid polygon are ignored
        lp = []
        for _i in lnidx:
            try:
                lp.append(ShapeLoader.node_to_Polygon(self.lNode[_i].node))
            except ValueError:
                continue
        contour = cascaded_union(
            [p if p.is_valid else p.convex_hull for p in lp])

        # was a pair of bare excepts; Exception keeps the best-effort
        # fallback without trapping KeyboardInterrupt / SystemExit
        try:
            spoints = ' '.join(
                "%s,%s" % (int(pt[0]), int(pt[1]))
                for pt in contour.minimum_rotated_rectangle.exterior.coords)
        except Exception:
            try:
                # the rectangle may have no exterior
                spoints = ' '.join(
                    "%s,%s" % (int(pt[0]), int(pt[1]))
                    for pt in contour.minimum_rotated_rectangle.coords)
            except Exception:
                # JL got once a: NotImplementedError: Multi-part geometries
                # do not provide a coordinate sequence
                spoints = ""

        ndCluster = PageXml.createPageXmlNode('Cluster')
        # the space separated list of node ids
        ndCluster.set(
            "content",
            " ".join(self.lNode[_i].node.get("id") for _i in lnidx))
        coords = PageXml.createPageXmlNode('Coords')
        ndCluster.append(coords)
        coords.set('points', spoints)
        pageNode.append(ndCluster)

        if bMoveContent:
            # move the DOM node of the content to the cluster
            for _i in lnidx:
                ndCluster.append(self.lNode[_i].node)
    return
def getClusterCoords(lElts):
    """
    Return the points string ("x,y x,y ...", integer coordinates) of the
    convex hull of the union of the polygons of the given DOM nodes.

    Nodes whose polygon cannot be loaded (ValueError) are ignored, and
    invalid polygons are replaced by their convex hull.
    return "" when the hull cannot be serialised.
    """
    lp = []
    for e in lElts:
        try:
            lp.append(ShapeLoader.node_to_Polygon(e))
        except ValueError:
            continue
    contour = cascaded_union([p if p.is_valid else p.convex_hull for p in lp])

    # was a pair of bare excepts; Exception keeps the best-effort fallback
    # without trapping KeyboardInterrupt / SystemExit
    try:
        spoints = ' '.join("%s,%s" % (int(pt[0]), int(pt[1]))
                           for pt in contour.convex_hull.exterior.coords)
    except Exception:
        try:
            # the hull may have no exterior
            spoints = ' '.join("%s,%s" % (int(pt[0]), int(pt[1]))
                               for pt in contour.convex_hull.coords)
        except Exception:
            # JL got once a: NotImplementedError: Multi-part geometries do
            # not provide a coordinate sequence
            spoints = ""
    return spoints
def assignConCompToCells(lCC, lCells):
    """
    Best-assign each connected component of lCC to one of the shapes of
    lCells.

    The shapes obtained from ShapeLoader.contourObject(lCC) are matched
    with bestRegionsAssignment; empty shapes and shapes with no matching
    cell are skipped.

    return a dictionary: cell WKT -> list of the items of lCC assigned to it
    """
    dCellCC = {}
    for i, oShape in enumerate(ShapeLoader.contourObject(lCC)):
        if oShape.is_empty:
            continue
        oCell = bestRegionsAssignment(oShape, lCells)
        if oCell:
            dCellCC.setdefault(oCell.wkt, []).append(lCC[i])
    return dCellCC
def makeClusterNode(self, sAlgo):
    """
    Build and return the 'Cluster' XML node describing this cluster.

    The node carries @name, @algo (sAlgo) and @content (space-separated
    ids of self.setID), plus a Coords child whose @points is the points
    string of self.shape, or "" when the cluster has no shape.
    """
    ndCluster = PageXml.createPageXmlNode('Cluster')
    for sAttr, sValue in (("name", self.name),
                          ("algo", sAlgo),
                          ("content", " ".join(self.setID))):
        ndCluster.set(sAttr, sValue)

    ndCoords = PageXml.createPageXmlNode('Coords')
    ndCluster.append(ndCoords)
    sPoints = "" if self.shape is None \
        else ShapeLoader.getCoordsString(self.shape)
    ndCoords.set('points', sPoints)

    ndCluster.tail = "\n"
    return ndCluster
def assignTextLinesToCells(lTL, lCells):
    """
    Best-assign each text line node of lTL to one of the shapes of lCells.

    Each node is turned into a polygon and matched with
    bestRegionsAssignment; empty polygons and polygons with no matching
    cell are skipped.

    return a dictionary: cell WKT -> list of the nodes assigned to it
    """
    # all polygons are computed first, then assigned
    lTxtShape = [ShapeLoader.node_to_Polygon(nd) for nd in lTL]

    dCellTL = {}
    for ndTL, oTxt in zip(lTL, lTxtShape):
        if oTxt.is_empty:
            continue
        oCell = bestRegionsAssignment(oTxt, lCells)
        if oCell:
            dCellTL.setdefault(oCell.wkt, []).append(ndTL)
    return dCellTL
def _iter_GraphNode(self, doc, domNdPage, page):
    """
    Iterate over the graph nodes of the parent class, adding to each the
    shape object reflecting its baseline.

    Each yielded block gets:
      - blk.shape: the LineString of its first pc:Baseline descendant
      - blk.du_index: the integer value of the Baseline @du_index

    Blocks without a usable Baseline or @du_index are silently skipped; an
    invalid Baseline (ValueError) is traced and skipped.
    """
    for blk in super()._iter_GraphNode(doc, domNdPage, page):
        try:
            ndBaseline = blk.node.xpath(".//pc:Baseline",
                                        namespaces=self.dNS)[0]
            try:
                o = ShapeLoader.node_to_LineString(ndBaseline)
            except ValueError:
                traceln("SKIPPING INVALID Baseline: ",
                        etree.tostring(ndBaseline))
                continue
            blk.shape = o
            blk.du_index = int(ndBaseline.get("du_index"))
        except Exception:
            # best effort, as before: e.g. no Baseline, or no du_index
            continue
        # The yield must stay outside the try: the former bare except
        # wrapped it, and so swallowed GeneratorExit (raised here when the
        # generator is closed) as well as any exception thrown into the
        # generator by its consumer.
        yield blk
def addSeparatorFeature(self):
    """
    Flag each edge that crosses a graphical separator.

    The SeparatorRegion elements of the document of the first node are
    loaded as LineString objects and indexed in an rtree. Each edge of
    self.lEdge, seen as the segment from (A.x1, A.y1) to (B.x1, B.y1),
    gets a boolean attribute bCrossingSep telling whether it intersects at
    least one separator.
    """
    from xml_formats.PageXml import PageXml
    dNS = {"pc": PageXml.NS_PAGE_XML}

    # graphical separators, as shapely objects
    ndRoot = self.lNode[0].node.getroottree()
    loSep = [ShapeLoader.node_to_LineString(_nd)
             for _nd in ndRoot.xpath(".//pc:SeparatorRegion", namespaces=dNS)]
    if self.bVerbose:
        traceln(" %d graphical separators" % len(loSep))

    # spatial index over the separator bounding boxes
    idx = index.Index()
    for iSep, oSep in enumerate(loSep):
        idx.insert(iSep, oSep.bounds)

    nCrossing = 0
    for edge in self.lEdge:
        oEdge = geom.LineString([(edge.A.x1, edge.A.y1),
                                 (edge.B.x1, edge.B.y1)])
        prepO = prep(oEdge)
        # the index gives candidates; stop at the first real intersection
        bCrossing = any(prepO.intersects(loSep[iSep])
                        for iSep in idx.intersection(oEdge.bounds))
        if bCrossing:
            nCrossing += 1
        edge.bCrossingSep = bCrossing

    if self.bVerbose:
        traceln(
            " %d (/ %d) edges crossing at least one graphical separator" %
            (nCrossing, len(self.lEdge)))
def addClusterToDom(self, lCluster, bMoveContent=False, sAlgo="", pageNode=None):
    """
    Add one Cluster element per cluster to the Page DOM node.

    lCluster: list of collections of indices into self.lNode; the position
              of a cluster in the list becomes its @name
    bMoveContent: when true, the content DOM nodes are moved under their
                  Cluster element
    sAlgo: value of the @algo attribute
    pageNode: where to add the clusters; when None, the page of some node
              of the first cluster is used and a comment is appended to it

    return the list of created Cluster nodes
    """
    lNdCluster = []
    for iCluster, lnidx in enumerate(lCluster):
        if pageNode is None:
            # take the page of any node of this cluster
            idxAny = next(iter(lnidx), None)
            if idxAny is not None:
                pageNode = self.lNode[idxAny].page.node
            pageNode.append(
                etree.Comment(
                    "\nClusters created by the conjugate graph\n"))

        ndCluster = PageXml.createPageXmlNode('Cluster')
        ndCluster.set("name", str(iCluster))
        ndCluster.set("algo", sAlgo)
        # the space separated list of node ids
        sContent = " ".join(self.lNode[_i].node.get("id") for _i in lnidx)
        ndCluster.set("content", sContent)

        ndCoords = PageXml.createPageXmlNode('Coords')
        ndCluster.append(ndCoords)
        ndCoords.set('points',
                     ShapeLoader.minimum_rotated_rectangle(
                         [self.lNode[_i].node for _i in lnidx]))

        pageNode.append(ndCluster)
        ndCluster.tail = "\n"

        if bMoveContent:
            for _i in lnidx:
                ndCluster.append(self.lNode[_i].node)

        lNdCluster.append(ndCluster)
    return lNdCluster
def loadClusterNode(cls, ndPage, nd, sAlgo, bComputeShape=True):
    """
    Load a cluster from its XML node.

    ndPage: page DOM node, used to compute the shape from the content
    nd: the cluster XML node; @content is a space-separated list of ids
    sAlgo: prefix used to generate a name when the node has no @name; the
           generated name ("<sAlgo>_<cls.cnt>") is stored on the node and
           cls.cnt is incremented
    bComputeShape: when the node provides no shape (IndexError while
           loading its polygon), compute it with cls.computeShape if true,
           otherwise leave it to None

    return a cls instance, or None when the cluster has no content (empty
    or missing @content)
    """
    name = nd.get("name")
    if name is None:
        name = "%s_%d" % (sAlgo, cls.cnt)
        cls.cnt += 1
        nd.set("name", name)

    # a missing @content used to crash on None.split(); it is now handled
    # like an empty one
    sContent = nd.get("content")
    setID = set(sContent.split()) if sContent else set()
    if not setID:
        return None

    try:
        shape = ShapeLoader.node_to_Polygon(nd)
    except IndexError:
        shape = cls.computeShape(ndPage, setID) if bComputeShape else None
    return cls(name, setID, shape)
def normaliseElement(nd, iNorm):
    """
    Reset the Coords of an element from its baseline.

    The new contour goes along the baseline, then back along the baseline
    translated by -iNorm in Y, in reverse order. It is written, with
    integer coordinates, into @points of the first pg:Coords descendant.

    raise a NormaliseException if the element has no Baseline, or if the
    Baseline cannot be loaded as a LineString (ValueError)
    """
    def _serialise(lPoint):
        return ' '.join("%s,%s" % (int(x[0]), int(x[1])) for x in lPoint)

    try:
        ndBaseline = nd.xpath(xpBASELINE, namespaces=dNS)[0]
    except IndexError:
        raise NormaliseException("WARNING: skipped element normalisation: no Baseline: %s" % etree.tostring(nd))

    try:
        line = ShapeLoader.node_to_LineString(ndBaseline)
    except ValueError:
        raise NormaliseException("WARNING: skipped element normalisation: invalid Coords: %s" % etree.tostring(nd))

    topline = translate(line, yoff=-iNorm)

    # both lines in circular sequence: baseline forward, shifted line backward
    sBottom = _serialise(line.coords)
    sTop = _serialise(list(topline.coords)[::-1])
    spoints = sBottom + ' ' + sTop

    # ad-hoc way of setting the element coordinates
    nd.xpath(".//pg:Coords", namespaces=dNS)[0].set("points", spoints)
    return
def check(self, sFilename, scale, bVerbose=False):
    """
    Check the SeparatorRegion elements of a PageXml file against its
    TextLine elements (first Page only).

    Each separator whose @orient does not start with "vertical" gives one
    ground-truth value and one predicted value:
      - ground truth: 0 if @orient starts with "horizontal", else 2
      - prediction: 2 if the separator crosses at least one TextLine
        shape, otherwise the ground-truth value
    Separators whose @orient starts with "vertical" are skipped.

    sFilename: file to parse (must exist), parsed with self.parser
    scale: if not None, Y scaling factor applied to the TextLine shapes
    bVerbose: print details on stdout

    return a TestReport; the accuracy and the classification report are
    printed when the score is below 0.999
    """
    assert os.path.isfile(sFilename), sFilename

    root = etree.parse(sFilename, self.parser).getroot()
    ndPage = MultiPageXml.getChildByName(root, 'Page')[0]

    def storeNode(oShape, nd):
        # keep track of the DOM node on the shape itself
        oShape.duNode = nd

    # TextLine contours (the Baseline, as LineString, is an alternative)
    loTxt = ShapeLoader.children_to_LinearRing(ndPage, 'TextLine', storeNode)
    if scale is not None:
        loScaled = []
        for oTxt in loTxt:
            oScaled = shapely.affinity.scale(oTxt, 1.0, scale)
            oScaled.duNode = oTxt.duNode
            loScaled.append(oScaled)
        loTxt = loScaled
    if bVerbose:
        print("%d TextLines" % len(loTxt))

    loSep = ShapeLoader.children_to_LineString(ndPage, 'SeparatorRegion',
                                               storeNode)
    if bVerbose:
        print("%d SeparatorRegion" % len(loSep))

    # brute force: each separator against each text line
    lY, lYGT = [], []
    for oSep in loSep:
        ndSep = oSep.duNode
        if bVerbose:
            print("%35s %4s %4s %s" % (oSep, ndSep.get("row"),
                                       ndSep.get("col"), ndSep.get("orient")))
        sInfo = ndSep.get("orient")
        if sInfo.startswith("horizontal"):
            YGT = 0
        elif sInfo.startswith("vertical"):
            continue    # vertical separators are not evaluated
        else:
            YGT = 2

        bCrossed = False
        for oTxt in loTxt:
            if not oSep.crosses(oTxt):
                continue
            bCrossed = True
            if bVerbose:
                ndTxt = oTxt.duNode
                ndCell = ndTxt.getparent()
                print("\tCrossing %s row=%s col=%s" %
                      (ndTxt.get("id"), ndCell.get("row"), ndCell.get("col")))

        lY.append(2 if bCrossed else YGT)
        lYGT.append(YGT)

    oTstRpt = TestReport("SeparatorChecker", [lY], [lYGT], self.lsClassName,
                         [sFilename])
    fScore, sClassificationReport = oTstRpt.getClassificationReport()
    if fScore < 0.999:
        print("\t *** Accuracy score = %f" % fScore)
        print(sFilename, sClassificationReport)

    return oTstRpt
def loadPageCol(self, ndPage, fRatio, shaper_fun=ShapeLoader.node_to_Point, funIndex=lambda x: x._du_index):
    """
    Load the Baseline elements of a page as shapely objects, grouped by
    table column.

    ndPage: the page DOM node
    fRatio: scaling factor applied to the shapes
    shaper_fun: function turning a Baseline node into a shapely object
    funIndex: function giving the key stored in the per-column sets

    Each Baseline gets a unique @du_index (a counter starting at 0), and
    its shape gets the attributes _du_index, _du_nd (the Baseline node) and
    _dom_id (the @id of the parent of the Baseline).
    Baselines inside a TableCell are processed first, then those whose
    parent was not seen in any TableCell.

    return a tuple:
      - the list of shapes
      - a dict: col -> set of keys of the shapes in that column
      - same, restricted to cells with row + rowSpan > maxHeaderRowSpan
      - maxHeaderRowSpan, as given by computeMaxRowSpan
    """
    loBaseline = []         # list of Baseline shapes
    i = 0                   # next du_index
    dsetTableByCol = defaultdict(set)       # sets of object ids, by col
    dsetTableDataByCol = defaultdict(set)   # same, header rows excluded
    dNodeSeen = {}          # id of the parent of each Baseline seen in a cell
    lshapeCell = []

    lCells = MultiPageXml.getChildByName(ndPage, "TableCell")
    maxHeaderRowSpan = computeMaxRowSpan(lCells)
    traceln(" - maxHeaderRowSpan=", maxHeaderRowSpan)

    # first associate a unique id to each baseline of the table cells
    for ndCell in lCells:
        row, col = int(ndCell.get("row")), int(ndCell.get("col"))
        rowSpan = int(ndCell.get("rowSpan"))

        # NOTE(review): the annotated cell shapes are collected but not
        # read anywhere in this function; kept as-is because loading the
        # polygon may raise on a bad cell
        plg = ShapeLoader.node_to_Polygon(ndCell)
        lx = [_x for _x, _y in plg.exterior.coords]
        xmin, xmax = min(lx), max(lx)
        plg._row = row
        plg._col = col
        plg._xmin, plg._xmax = xmin, xmax
        lshapeCell.append(plg)

        for nd in MultiPageXml.getChildByName(ndCell, "Baseline"):
            nd.set("du_index", "%d" % i)
            ndParent = nd.getparent()
            dNodeSeen[ndParent.get('id')] = True

            # Baseline as a shapely object
            try:
                o = shaper_fun(nd)
            except Exception as e:
                traceln("ERROR: id=", nd.getparent().get("id"))
                raise e
            # scale the objects, as done when cutting!!
            o = shapely.affinity.scale(o, xfact=fRatio, yfact=fRatio)

            o._du_index = i
            o._du_nd = nd
            o._dom_id = nd.getparent().get("id")
            loBaseline.append(o)

            # no check that the object is in the correct column
            dsetTableByCol[col].add(funIndex(o))
            if (row + rowSpan) > maxHeaderRowSpan:
                dsetTableDataByCol[col].add(funIndex(o))

            i += 1

    # and (UGLY) process all Baseline outside any TableCell...
    for nd in MultiPageXml.getChildByName(ndPage, "Baseline"):
        # explicit membership test; this used to be a dictionary lookup
        # guarded by a bare except
        if nd.getparent().get('id') in dNodeSeen:
            continue

        nd.set("du_index", "%d" % i)
        # Baseline as a shapely object
        o = shaper_fun(nd)
        # NOTE(review): only X is scaled here, whereas X and Y are scaled
        # for the baselines of the table cells - confirm this is intended
        o = shapely.affinity.scale(o, xfact=fRatio)

        o._du_index = i
        o._du_nd = nd
        o._dom_id = nd.getparent().get("id")
        loBaseline.append(o)

        i += 1

    return loBaseline, dsetTableByCol, dsetTableDataByCol, maxHeaderRowSpan
def addSeparatorFeature(self):
    """
    Compute, for each edge, features about the graphical separators it
    crosses.

    The SeparatorRegion elements of the page of the first node are loaded
    as LineString objects and indexed in an rtree. Each edge of self.lEdge
    is seen as the segment from (A.x1, A.y1) to (B.x1, B.y1). Only the
    intersections that are a single Point are counted as crossing points.

    Attributes set on each edge:
      - bCrossingSep: at least one crossing point
      - sep_NbCrossing: number of crossing points
      - sep_SpanLen: width + height of the bounding box of the crossing
        points
      - sep_AvgSpanSgmt: sep_SpanLen divided by the number of crossing
        points
      - sep_AvgSepLen: total length of the intersected separators divided
        by the number of crossing points
    All are 0 (False for the boolean) when there is no crossing point.
    """
    dNS = {"pc": PageXml.NS_PAGE_XML}

    # graphical separators of the page, as shapely objects
    ndPage = self.lNode[0].node.xpath("ancestor::pc:Page", namespaces=dNS)[0]
    loSep = [ShapeLoader.node_to_LineString(_nd)
             for _nd in ndPage.xpath(".//pc:SeparatorRegion", namespaces=dNS)]
    if self.bVerbose:
        traceln(" %d graphical separators" % len(loSep))

    # spatial index over the separator bounding boxes
    idx = index.Index()
    for iSep, oSep in enumerate(loSep):
        idx.insert(iSep, oSep.bounds)

    nCrossing = 0
    for edge in self.lEdge:
        oEdge = geom.LineString([(edge.A.x1, edge.A.y1),
                                 (edge.B.x1, edge.B.y1)])
        prepO = prep(oEdge)

        lCrossingPoints = []
        fSepTotalLen = 0
        for iSep in idx.intersection(oEdge.bounds):
            oSep = loSep[iSep]
            if not prepO.intersects(oSep):
                continue    # false positive of the index
            fSepTotalLen += oSep.length
            oPt = oEdge.intersection(oSep)
            if type(oPt) != geom.Point:
                traceln('Intersection in not a point: skipping it')
            else:
                lCrossingPoints.append(oPt)

        nPoint = len(lCrossingPoints)
        if nPoint:
            nCrossing += 1
            minx, miny, maxx, maxy = geom.MultiPoint(lCrossingPoints).bounds
            edge.bCrossingSep = True
            edge.sep_NbCrossing = nPoint
            edge.sep_SpanLen = abs(minx - maxx) + abs(miny - maxy)
            edge.sep_AvgSpanSgmt = edge.sep_SpanLen / nPoint
            edge.sep_AvgSepLen = fSepTotalLen / nPoint
        else:
            edge.bCrossingSep = False
            edge.sep_NbCrossing = 0
            edge.sep_SpanLen = 0
            edge.sep_AvgSpanSgmt = 0
            edge.sep_AvgSepLen = 0

    if self.bVerbose:
        traceln(
            " %d (/ %d) edges crossing at least one graphical separator" %
            (nCrossing, len(self.lEdge)))
def addClusterToDom(self, lCluster, sAlgo=""):
    """
    Turn the predicted clusters into new TextRegion elements.

    When options.bEvalRegion is set, the work is delegated to the parent
    class so that the original TextRegion elements are kept next to the
    Cluster elements. Otherwise:
      - the @type of each existing TextRegion is copied as @type_gt onto
        its TextLines
      - the ReadingOrder of the page is replaced by a new one, with one
        RegionRefIndexed per cluster
      - one TextRegion "cluster_<n>" is created per cluster; its TextLines
        are moved into it, and it is typed with the most common @type of
        its content, if any
      - the original TextRegion elements are removed

    lCluster: list of collections of indices into self.lNode
    return the list of created TextRegion nodes
    """
    if hasattr(options, "bEvalRegion") and options.bEvalRegion:
        # evaluation needs both the original TextRegion and the Clusters
        return super(My_ConjugateSegmenterGraph_MultiSinglePageXml,
                     self).addClusterToDom(lCluster, sAlgo=sAlgo)

    dNS = {
        "pc":
        "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"
    }
    # the graph has been constructed for a certain page
    pageNode = self.lNode[0].page.node

    # TextRegion to get rid of at the end; their type is saved on the lines
    lNdOldTextRegion = pageNode.xpath(".//pc:TextRegion", namespaces=dNS)
    for ndOldRegion in lNdOldTextRegion:
        sOldType = str(ndOldRegion.get("type"))
        for ndLine in ndOldRegion.xpath(".//pc:TextLine", namespaces=dNS):
            ndLine.set("type_gt", sOldType)

    # a brand new ReadingOrder section
    pageNode.remove(pageNode.xpath(".//pc:ReadingOrder", namespaces=dNS)[0])
    ndReadingOrder = PageXml.createPageXmlNode('ReadingOrder')
    ndOrderedGroup = PageXml.createPageXmlNode('OrderedGroup')
    ndOrderedGroup.set("id", "ro_1")
    ndOrderedGroup.set("caption", "Regions reading order")
    ndReadingOrder.append(ndOrderedGroup)
    pageNode.append(ndReadingOrder)

    lNdCluster = []
    for ic, c in enumerate(lCluster):
        scid = "cluster_%d" % (ic + 1)
        ndRegion = PageXml.createPageXmlNode('TextRegion')
        ndRegion.set("id", scid)
        ndRegion.set("custom", "readingOrder {index:%d;}" % ic)

        # TextRegion bounding box
        ndCoords = PageXml.createPageXmlNode('Coords')
        ndRegion.append(ndCoords)
        ndCoords.set('points',
                     ShapeLoader.minimum_rotated_rectangle(
                         [self.lNode[_i].node for _i in c]))

        # vote: the region takes the most frequent type of its TextLines
        cntType = Counter(sType
                          for sType in (self.lNode[_i].node.get('type')
                                        for _i in c)
                          if sType is not None)
        lWinner = cntType.most_common(1)
        if lWinner:
            sXmlLabel = lWinner[0][0]
            ndRegion.set("type", sXmlLabel)
            PageXml.setCustomAttr(ndRegion, "structure", "type", sXmlLabel)

        # move the TextLine DOM nodes into the new region
        for _i in c:
            ndRegion.append(self.lNode[_i].node)

        pageNode.append(ndRegion)
        ndRegion.tail = "\n"
        lNdCluster.append(ndRegion)

        ndRef = PageXml.createPageXmlNode('RegionRefIndexed')
        ndRef.set("index", str(ic))
        ndRef.set("regionRef", scid)
        ndRef.tail = "\n"
        ndOrderedGroup.append(ndRef)

    # remove the old TextRegion (renaming them TextRegion_GT is the
    # alternative that was considered)
    for ndOldRegion in lNdOldTextRegion:
        ndOldRegion.getparent().remove(ndOldRegion)

    return lNdCluster
def makeTableNode(self):
    """
    Make the TableRegion DOM tree of this table.

    self._dCellNd maps (row, col) to the list of text nodes of the cell.
    For each (row, col) combination holding some nodes, a TableCell is
    created (spans of 1) with a Coords child (convex hull of its nodes), a
    CornerPts child, and the text nodes themselves, which are moved into
    it. The Coords of the table is the minimum rotated rectangle of the
    union of the cell hulls.

    return the TableRegion node
    """
    lK = self._dCellNd.keys()
    lRow = sorted({_row for _row, _col in lK})
    lCol = sorted({_col for _row, _col in lK})

    ndTable = PageXml.createPageXmlNode("TableRegion")
    ndTable.set("id", "p%s_%s" % (self.pagenum, self.tablenum))
    ndTable.tail = "\n"

    lCellShape = []     # hulls of the cells, for the table contour
    lNdCell = []
    for row in lRow:
        for col in lCol:
            lNdText = self._dCellNd[(row, col)]
            if not lNdText:
                continue    # no TableCell for an empty cell

            ndCell = PageXml.createPageXmlNode("TableCell")
            ndCell.set(
                "id", "p%s_t%s_r%s_c%s" %
                (self.pagenum, self.tablenum, row, col))

            # shape of the cell, and its Coords sub-element
            oHull = ShapeLoader.convex_hull(lNdText, bShapelyObject=True)
            lCellShape.append(oHull)
            ndCellCoords = PageXml.createPageXmlNode("Coords")
            ndCellCoords.set(
                "points", ShapeLoader.getCoordsString(oHull, bFailSafe=True))
            ndCellCoords.tail = "\n"
            ndCell.append(ndCellCoords)

            ndCell.set("row", str(row))
            ndCell.set("rowSpan", "1")
            ndCell.set("col", str(col))
            ndCell.set("colSpan", "1")
            ndCell.tail = "\n"

            # corners
            ndCorner = PageXml.createPageXmlNode("CornerPts")
            ndCorner.text = "0 1 2 3"
            ndCell.append(ndCorner)

            for ndText in lNdText:
                ndCell.append(ndText)

            lNdCell.append(ndCell)

    # table geometry comes first, then the cells
    ndTableCoords = PageXml.createPageXmlNode("Coords")
    contour = cascaded_union(
        [p if p.is_valid else p.convex_hull for p in lCellShape])
    ndTableCoords.set(
        "points",
        ShapeLoader.getCoordsString(contour.minimum_rotated_rectangle,
                                    bFailSafe=True))
    ndTableCoords.tail = "\n"
    ndTable.append(ndTableCoords)

    for ndCell in lNdCell:
        ndTable.append(ndCell)

    return ndTable
def project_Elt_to_GT(gtdoc, doc, xpElement1, xpElement2, xpArea2, bSep, lsRmId, bEval, fTH=0.5):
    """
    Take the elements out of the production document and put them in the
    GT document, page by page.

    gtdoc, doc: the GT and the production documents (same number of pages)
    xpElement1: xpath of the GT elements to remove along with their content
                (skipped when equal to xpArea2)
    xpElement2: xpath of the elements to remove from the GT, and to
                project from the production document
    xpArea2: xpath of the GT areas receiving the projected elements
    bSep: if true, the SeparatorRegion of the GT are replaced by those of
          the production document
    lsRmId: ids of GT elements to remove beforehand (may be empty or None)
    bEval: if true, the original GT placement of the elements is recorded
           and compared with the projection; a summary is traced
    fTH: an element goes to the area it overlaps most, provided the
         overlap covers more than this ratio of the element area; failing
         that it is added to the GT page in a new TextRegion of its own

    raise GTProjectionException if the page counts differ
    raise ProjectException if no area was found in the GT

    return the GT document, modified in place
    """
    def _traceFailure(ndElt):
        # report a misplaced element, with its text if it has any
        try:
            traceln('FAILED:in table: id="%s" "%s"' % (ndElt.get("id"), ndElt.xpath(".//pg:Unicode", namespaces=dNS)[0].text))
        except IndexError:
            traceln('FAILED:in table: id="%s" NOTEXT"' % (ndElt.get("id")))

    gtroot = gtdoc.getroot()

    # Evaluation: GT placement of the elements
    # table_id -> row -> col -> list of element id
    dTable = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    nOk, nTot = 0, 0

    if lsRmId:
        nbEltRemoved = 0
        for sRmId in lsRmId:
            for _nd in gtroot.xpath('//*[@id="%s"]' % sRmId):
                _nd.getparent().remove(_nd)
                nbEltRemoved += 1
        trace(" (Rm by ID: %d elements removed)" % nbEltRemoved)

    # remove all elements of interest from GT
    # inside TableRegion, we have TextLine, outside we have TextRegion
    if xpElement1 != xpArea2:
        for ndElt in gtroot.xpath(xpElement1, namespaces=dNS):
            if bEval:
                for ndElt2 in ndElt.xpath(xpElement2, namespaces=dNS):
                    dTable[None][None][None].append(ndElt2.get("id"))
            ndElt.getparent().remove(ndElt)
    for ndElt in gtroot.xpath(xpElement2, namespaces=dNS):
        ndCell = ndElt.getparent()
        if bEval:
            dTable[ndCell.getparent().get("id")][ndCell.get("row")][ndCell.get("col")].append(ndElt.get("id"))
        ndCell.remove(ndElt)
    if bEval:
        traceln("\npEvaluation mode")

    if bSep:
        nbSepRemoved, nbSepAdded = 0, 0
        for _nd in gtroot.xpath('//pg:SeparatorRegion', namespaces=dNS):
            _nd.getparent().remove(_nd)
            nbSepRemoved += 1
        trace(" (Separators: %d removed" % nbSepRemoved)

    # project the GT areas, page by page
    lNdPage = doc.getroot().xpath("//pg:Page", namespaces=dNS)
    lNdPageGT = gtroot.xpath("//pg:Page", namespaces=dNS)
    if len(lNdPage) != len(lNdPageGT):
        raise GTProjectionException("GT and input have different numbers of pages")
    assert len(lNdPage) > 0, "No page??"

    uniqID = 1
    nNdArea2 = 0
    for ndPage, ndPageGT in zip(lNdPage, lNdPageGT):
        lNdArea2 = ndPageGT.xpath(xpArea2, namespaces=dNS)
        loArea2 = [ShapeLoader.node_to_Polygon(nd) for nd in lNdArea2]
        nNdArea2 += len(lNdArea2)

        for ndElt in ndPage.xpath(xpElement2, namespaces=dNS):
            oElt = ShapeLoader.node_to_Polygon(ndElt)

            # area with the largest overlap, if any
            lOvrl = [oElt.intersection(o).area for o in loArea2]
            iMax = argmax(lOvrl) if lOvrl else None
            vMax = -1 if iMax is None else lOvrl[iMax]

            if vMax > 0 and vMax / oElt.area > fTH:
                # a match: add it directly to the area
                ndCell = lNdArea2[iMax]
                ndCell.append(deepcopy(ndElt))
                if bEval:
                    if ndElt.get("id") in dTable[ndCell.getparent().get("id")][ndCell.get("row")][ndCell.get("col")]:
                        nOk += 1
                    else:
                        _traceFailure(ndElt)
            else:
                # outside of any area: add it to the page, in its own
                # TextRegion
                ndTR = etree.Element("TextRegion")
                ndTR.set("id", "prjct_region_%d" % uniqID)
                uniqID += 1
                ndTR.set("custom", "")
                ndTR.append(deepcopy(ndElt.xpath("./pg:Coords", namespaces=dNS)[0]))
                ndTR.append(deepcopy(ndElt))
                ndPageGT.append(ndTR)
                if bEval:
                    if ndElt.get("id") in dTable[None][None][None]:
                        nOk += 1
                    else:
                        _traceFailure(ndElt)
            nTot += 1

        if bSep:
            # './/' restricts the search to this page: the former '//' is
            # an absolute path, which selected the separators of every
            # page of the document and copied them into each GT page
            for _nd in ndPage.xpath('.//pg:SeparatorRegion', namespaces=dNS):
                ndPageGT.append(deepcopy(_nd))
                nbSepAdded += 1

    if bSep:
        trace(", %d added.) " % nbSepAdded)

    if bEval:
        traceln("-"*40)
        trace(" - evaluation: %d ok out of %d = %.2f%%\n" % (nOk, nTot, 100*nOk / (nTot+0.0001)))

    if nNdArea2 == 0:
        # NOTE(review): a different exception class than the
        # GTProjectionException used for the page count - confirm that
        # ProjectException exists and is the intended one
        raise ProjectException("Empty GT")

    return gtdoc
def map_to_rows(cls, ndPage, maxRow, lCluster):
    """
    Find horizontal separators separating the rows, and add them to the page.

    For each cluster, the extremities of the Baseline of each of its ids
    (c.getSetID()) are used to set c.minY and c.maxY. Then c.minrow may be
    raised, and c.row1 / c.row2 are set.
    Any SeparatorRegion of the page having an @algo attribute is removed, and
    one SeparatorRegion (algo="tabulate_rows") is appended to the page for
    each row in 0..maxRow, spanning x=0 to the page imageWidth, 20 above the
    smallest baseline y seen for that row.

    Return None.
    """
    # smallest baseline-extremity y per row; rows never seen keep the default
    dMinYByRow = defaultdict(lambda: 9999999999)
    for c in lCluster:
        c.maxY = -1
        c.minY = 9999999999
        for _id in c.getSetID():
            # the id is the one of an element holding a Baseline, e.g.
            #   <TextLine id="r1l5" ... row="0" rowSpan="1" col="0" colSpan="1">
            #       <Coords points="217,688 245,685 ... 217,645"/>
            #       <Baseline points="217,688 245,685 ... 358,689"/>
            #   </TextLine>
            nd = ndPage.xpath(".//*[@id='%s']/pg:Baseline" % _id, namespaces=dNS)[0]
            ls = ShapeLoader.node_to_LineString(nd)
            pA, pB = ls.boundary.geoms      # the two extremities of the baseline
            minY = min(pA.y, pB.y)
            c.minY = min(c.minY, minY)
            c.maxY = max(c.maxY, pA.y, pB.y)
            dMinYByRow[c.minrow] = min(dMinYByRow[c.minrow], minY)

    # check consistency: a cluster starting below the top of a further row
    # is moved to that row  (how possible??? fix!!)
    for c in lCluster:
        for i in range(maxRow, c.minrow, -1):
            if c.minY > dMinYByRow[i]:
                c.minrow = i
                break

    # compute row1 and row2
    for c in lCluster:
        c.row1 = c.minrow
        c.row2 = c.minrow
        for i in range(0, maxRow+1):
            if c.maxY > dMinYByRow[i]:
                c.row2 = i
            else:
                break

    # now generate the separators
    w = float(ndPage.get("imageWidth"))

    # remove the separators generated by a previous run. They are removed
    # through their own parent, since the search is done at any depth
    # (ndPage.remove would fail on a node that is not a direct child).
    for nd in ndPage.xpath(".//pg:SeparatorRegion[@algo]", namespaces=dNS):
        nd.getparent().remove(nd)

    for row in range(maxRow+1):
        Y0 = dMinYByRow[row] - 20
        Yw = Y0     # horizontal line
        ndSep = PageXml.createPageXmlNode("SeparatorRegion")
        ndSep.set("algo", "tabulate_rows")
        ndCoords = PageXml.createPageXmlNode("Coords")
        ndCoords.set("points", "%d,%d %d,%d" % (0, Y0, w, Yw))
        ndSep.append(ndCoords)
        ndSep.tail = "\n"
        ndPage.append(ndSep)

    return
def addSeparatorFeature(self):
    """
    We load the graphical separators of the page and turn them into a list of
    shapely objects, indexed in an rtree.
    In turn, for each edge, we compute its intersection with the separators.

    Each edge is reflected by a segment between its two nodes A and B:
    - if the boxes do not overlap horizontally: a horizontal segment, at the
      middle of their common vertical range, joining the two facing sides
    - otherwise (vertical edge or box overlap): a vertical segment, at the
      middle of their common horizontal range

    The edge attributes set here are:
    - bCrossingSep   : boolean, at least one crossing point with a separator
    - sep_NbCrossing : number of crossing points
    - sep_SpanLen    : width + height of the bounding box of the crossing points
    - sep_AvgSpanSgmt: sep_SpanLen divided by the number of crossing points
    - sep_AvgSepLen  : total length of the intersecting separators divided
                       by the number of crossing points

    The separators and their index are cached on PageXmlSeparatorRegion, and
    reused when the method is called again for the same object.
    """
    if self._cachedSelf is self:
        # same call... (by reified edge code maybe)
        (loSep, idx) = PageXmlSeparatorRegion._cached
    else:
        # graphical separators
        dNS = {"pc": PageXml.NS_PAGE_XML}
        someNode = self.lNode[0]
        lndPage = someNode.node.xpath("//pc:Page", namespaces=dNS)
        assert len(lndPage) == 1, "INTERNAL ERROR: CODE NOT READY FOR MULTIPAGE..."
        ndPage = someNode.node.xpath("ancestor::pc:Page", namespaces=dNS)[0]
        lNdSep = ndPage.xpath(".//pc:SeparatorRegion", namespaces=dNS)
        loSep = [ShapeLoader.node_to_LineString(_nd) for _nd in lNdSep]

        if self.bVerbose:
            traceln("\t\t\t%d graphical separators" % len(loSep))

        # make an indexed rtree
        idx = index.Index()
        for i, oSep in enumerate(loSep):
            idx.insert(i, oSep.bounds)

        PageXmlSeparatorRegion._cachedSelf = self
        PageXmlSeparatorRegion._cached = (loSep, idx)

    # take each edge in turn and list the separators it crosses
    nCrossing = 0
    for edge in self.lEdge:
        xLeft, xRight = max(edge.A.x1, edge.B.x1), min(edge.A.x2, edge.B.x2)
        yTop, yBottom = max(edge.A.y1, edge.B.y1), min(edge.A.y2, edge.B.y2)
        if xLeft > xRight:
            # horizontal edge
            yMiddle = (yTop + yBottom) / 2.0
            oEdge = geom.LineString([(xRight, yMiddle), (xLeft, yMiddle)])
        else:
            # vertical edge or box overlap
            xMiddle = (xLeft + xRight) / 2.0
            oEdge = geom.LineString([(xMiddle, yBottom), (xMiddle, yTop)])

        prepO = prep(oEdge)
        lCrossingPoints = []
        fSepTotalLen = 0
        # the rtree gives candidates (bounding boxes overlap), each candidate
        # is then checked in turn
        for i in idx.intersection(oEdge.bounds):
            oSep = loSep[i]
            if not prepO.intersects(oSep):
                continue
            fSepTotalLen += oSep.length
            oPt = oEdge.intersection(oSep)
            if isinstance(oPt, geom.Point):
                lCrossingPoints.append(oPt)
            elif isinstance(oPt, geom.MultiPoint):
                # a multi-part geometry is iterated via .geoms (direct
                # iteration is no longer supported by recent Shapely)
                lCrossingPoints.extend(geom.Point((p.x, p.y)) for p in oPt.geoms)
            else:
                traceln('\t\t\tIntersection in not a point: skipping it')

        if lCrossingPoints:
            nCrossing += 1
            edge.bCrossingSep = True
            edge.sep_NbCrossing = len(lCrossingPoints)
            minx, miny, maxx, maxy = geom.MultiPoint(lCrossingPoints).bounds
            edge.sep_SpanLen = abs(minx-maxx) + abs(miny-maxy)
            edge.sep_AvgSpanSgmt = edge.sep_SpanLen / len(lCrossingPoints)
            edge.sep_AvgSepLen = fSepTotalLen / len(lCrossingPoints)
        else:
            edge.bCrossingSep = False
            edge.sep_NbCrossing = 0
            edge.sep_SpanLen = 0
            edge.sep_AvgSpanSgmt = 0
            edge.sep_AvgSepLen = 0

    if self.bVerbose:
        traceln("\t\t\t%d (/ %d) edges crossing at least one graphical separator" % (nCrossing, len(self.lEdge)))
def loadPage(self, ndPage, shaper_fun=ShapeLoader.node_to_Point, funIndex=lambda x: x._du_index, bIgnoreHeader=False):
    """
    Load the page, looking for Baseline elements.

    ndPage        : the page node
    shaper_fun    : function making a shapely object from a Baseline node
    funIndex      : function giving, from a shape, the value stored in the
                    per-row sets (by default its _du_index)
    bIgnoreHeader : if true, a Baseline whose parent has DU_header="CH" is
                    skipped, and so is a cell left without any loaded Baseline

    Each Baseline node gets a "du_index" attribute, and each shape gets the
    attributes _du_index, _du_DU_row (the DU_row of the parent, can be None),
    _du_nd (the Baseline node) and _dom_id (the id of the parent).
    Shapes of Baseline found outside any TableCell also get _du_row = None.

    GT BUG: some Baseline are assigned to the wrong Cell
        => we also fix this here....

    Return a list of shapely objects
         , a dict: row -> set of funIndex(shape)
    """
    loBaseline = []                     # list of Baseline shapes
    i = 0

    dsetTableByRow = defaultdict(set)   # sets of object ids, by row

    setIdSeen = set()   # ids of the parents of the Baseline found in a TableCell
    # first associate a unique id to each baseline and list them
    lshapeCell = []
    lOrphanBaselineShape = []
    for ndCell in MultiPageXml.getChildByName(ndPage, "TableCell"):
        row, col = ndCell.get("row"), ndCell.get("col")
        plg = ShapeLoader.node_to_Polygon(ndCell)
        # ymin, ymax of polygon
        ly = [_y for _x, _y in plg.exterior.coords]
        ymin, ymax = min(ly), max(ly)
        plg._row = int(row)
        plg._col = int(col)
        plg._ymin, plg._ymax = ymin, ymax

        i0 = i
        for nd in MultiPageXml.getChildByName(ndCell, "Baseline"):
            nd.set("du_index", "%d" % i)
            ndParent = nd.getparent()
            setIdSeen.add(ndParent.get('id'))
            if bIgnoreHeader and ndParent.get("DU_header") == "CH":
                continue
            row_lbl = ndParent.get("DU_row")

            # Baseline as a shapely object
            try:
                o = shaper_fun(nd)
            except Exception:
                traceln("ERROR: id=", ndParent.get("id"))
                raise   # re-raised as is, keeping the original traceback

            o._du_index = i
            o._du_DU_row = row_lbl      # can be None
            o._du_nd = nd
            o._dom_id = ndParent.get("id")
            loBaseline.append(o)

            # is this object in the correct cell???
            # We must use the centroid of the text box, otherwise a baseline
            # may be assigned to the next row
            y = ShapeLoader.node_to_Polygon(ndParent).centroid.y
            # we allow the content of a cell to overlap the cell lower border
            # (so y is not compared to ymax)
            if ymin <= y:
                dsetTableByRow[int(row)].add(funIndex(o))
            else:
                # this is an orphan!
                o._bad_cell = plg
                lOrphanBaselineShape.append(o)

            i += 1

        if bIgnoreHeader and i0 == i:
            continue    # empty cells, certainly due to headers, ignore it.

        lshapeCell.append(plg)

    if lOrphanBaselineShape:
        traceln(" *** error: %d Baseline in incorrect row - fixing this..." % len(lOrphanBaselineShape))
        for o in lOrphanBaselineShape:
            bestrow, bestdeltacol = 0, 9999
            try:
                y = o.y
            except AttributeError:
                # not a point-like shape (no .y attribute)
                y = o.centroid.y
            # look for a cell at that height, ideally in the same column,
            # otherwise in the closest column
            for plg in lshapeCell:
                if plg._ymin <= y <= plg._ymax:
                    # sounds good
                    deltacol = abs(o._bad_cell._col - plg._col)
                    if deltacol == 0:
                        # same column, ok it is that one
                        bestrow = plg._row
                        break
                    if bestdeltacol > deltacol:
                        bestdeltacol = deltacol
                        bestrow = plg._row
            traceln("\t id=%s misplaced in row=%s instead of row=%s" % (
                o._du_nd.getparent().get("id"),
                o._bad_cell._row,
                bestrow))
            dsetTableByRow[bestrow].add(funIndex(o))
            del o._bad_cell

    # and (UGLY) process all Baseline outside any TableCell...
    for nd in MultiPageXml.getChildByName(ndPage, "Baseline"):
        ndPrnt = nd.getparent()
        if ndPrnt.get('id') in setIdSeen:
            continue    # already processed, it is in a TableCell

        nd.set("du_index", "%d" % i)
        row_lbl = ndPrnt.get("DU_row")

        # Baseline as a shapely object
        o = shaper_fun(nd)

        o._du_index = i
        o._du_row = None            # Must be None
        o._du_DU_row = row_lbl      # can be None
        o._du_nd = nd
        o._dom_id = ndPrnt.get("id")
        loBaseline.append(o)

        i += 1

    return loBaseline, dsetTableByRow
def _uniqueModeOrDefault(lValues, default):
    """
    Return the most frequent value of lValues, or default if lValues is empty
    or if several values are equally the most frequent.
    """
    from collections import Counter

    lTop = Counter(lValues).most_common(2)
    if not lTop:
        return default
    if len(lTop) == 2 and lTop[0][1] == lTop[1][1]:
        return default  # tie: no unique mode
    return lTop[0][0]


def checkClusteringPerRow(pageNd, dCellCndPerRow, dTextInCells, dClusterWithTL, dEdges):
    """
    Recompute the clusters for the rows: detect the rows that merge several
    actual rows, and cut them.

    pageNd         : page node; one 'LINE' element is appended to it per new
                     row rectangle
    dCellCndPerRow : row id -> list of cells of the row (objects with a .wkt)
    dTextInCells   : cell.wkt -> list of text nodes of the cell
    dClusterWithTL : row id -> list of text nodes of the row cluster
    dEdges         : passed as is to connected_components

    For each row having more than 2 non-empty cells, the text of each cell is
    split into connected components. The number of actual rows is the most
    frequent number of components per cell, or 2 when there is no unique most
    frequent value. If it is not 1, the row is to be deleted and replaced by
    new rows:
    - for the cells having exactly that number of components, the components
      are sorted top-down and the i-th one goes to the i-th new row
    - the text nodes of the other cells are returned apart (lTLtoBeAdded)

    Return: list of row ids to delete
          , list of polygons of the new rows (envelope of the text, x-scaled by 2)
          , list of text nodes, per new row
          , list of components (lists of text nodes), per new row
          , list of text nodes not assigned to any new row
    """
    lNewPoly = []
    lNewClusterWithTL = []
    lNewClusterWithCC = []
    lRowtodel = []
    lTLtoBeAdded = []
    for rowid, rowCells in dCellCndPerRow.items():
        # connected components of the text of each non-empty cell of the row
        lClusterPerCol = []
        for cell in rowCells:
            if cell.wkt in dTextInCells:
                lIds = [tl.get('id') for tl in dTextInCells[cell.wkt]]
                lClusterPerCol.append(connected_components(dEdges, lIds, TH=0.5))
            # else empty cell
        lNbClusterPerCol = [len(lcc) for lcc in lClusterPerCol]
        if len(lNbClusterPerCol) <= 2:
            continue

        # statistics.mode is not used: since Python 3.8 it no longer raises
        # in case of a tie, so the fallback to 2 would depend on the Python
        # version.
        nbRows = _uniqueModeOrDefault(lNbClusterPerCol, 2)
        if nbRows == 1:
            continue

        print('WARNING CUT ', rowid, "mode", nbRows)
        lRowtodel.append(rowid)
        dClusterCells = defaultdict(list)   # new row index -> list of components
        for colcluster in lClusterPerCol:
            # get tl instead of ids
            colclustertxt = [[tl for tl in dClusterWithTL[rowid] if tl.get('id') in cc]
                             for cc in colcluster]
            if len(colcluster) == nbRows:
                # take the first element only for comparing position
                # NOTE(review): fails with IndexError if a component has no
                # text node in dClusterWithTL[rowid]
                sortedC = sorted(colclustertxt,
                                 key=lambda x: ShapeLoader.node_to_Polygon(x[0]).centroid.y)
                for i, cc in enumerate(sortedC):
                    dClusterCells[i].append(cc)
            else:
                for ltl in colclustertxt:
                    lTLtoBeAdded.extend(ltl)

        for i, lcc in dClusterCells.items():
            lTL = [x for cc in lcc for x in cc]
            rect = ShapeLoader.contourObject(lTL).envelope  # minimal_rectangle?
            rect = scale(rect, xfact=2)
            lNewPoly.append(rect)
            lNewClusterWithCC.append(lcc)
            lNewClusterWithTL.append(lTL)
            # need also to create a cluster with list of ids!!
            rNd = etree.Element('LINE')
            rNd.set('points', ShapeLoader.getCoordsString(rect))
            pageNd.append(rNd)
            # final check: if overlap between rectangle: top rectangle is the cut

    # return updated list of clusters
    return lRowtodel, lNewPoly, lNewClusterWithTL, lNewClusterWithCC, lTLtoBeAdded
def _loadCellTextPoints(lNdCells):
    """
    Reflect each TextLine of the given TableCell nodes by a shapely Point,
    augmented with the table attributes of its cell (_minRow, _maxRow,
    _minCol, _maxCol). Also set the "Horizontal" attribute of each TextLine.
    Return the list of points.
    """
    loText = []
    for ndCell in lNdCells:
        minRow = int(ndCell.get('row'))
        minCol = int(ndCell.get('col'))
        maxRow = minRow + int(ndCell.get('rowSpan')) - 1
        maxCol = minCol + int(ndCell.get('colSpan')) - 1
        for ndText in MultiPageXml.getChildByName(ndCell, 'TextLine'):
            try:
                oText = ShapeLoader.node_to_Polygon(ndText)
            except Exception:
                traceln("WARNING: SKIPPING 1 TExtLine: cannot make a polygon from: %s" % etree.tostring(ndText))
                continue
            # reflecting the textbox as a single point.
            # Using the centroid of the box does not work well due to
            # loooong texts, so the point is placed near the start of the box.
            (minx, miny, maxx, maxy) = oText.bounds
            fDelta = min((maxx-minx) / 2.0, (maxy-miny) / 2.0)

            # is the baseline horizontal or vertical??
            if isBaselineHorizontal(ndText):
                # supposed Horizontal text
                oText = geom.Point(minx + fDelta, (maxy + miny)/2.0)
                ndText.set("Horizontal", "TRUE")
            else:
                ndText.set("Horizontal", "nope")
                oText = geom.Point((minx + maxx)/2.0, miny + fDelta)

            oText._minRow, oText._minCol = minRow, minCol
            oText._maxRow, oText._maxCol = maxRow, maxCol
            if DEBUG:
                oText._domnd = ndText
            loText.append(oText)
    return loText


def _flagInconsistentSeparators(loText, loSep):
    """
    Check in turn each separator for table-consistency with the given text
    points, and set _bConsistent to False on the inconsistent ones:
    - horizontal separator having above it a text whose last row is not
      before the first row of some text below it
    - same for a vertical separator, with left/right and columns
    """
    sp = ShapePartition(loText)

    for oSep in loSep:
        (minx, miny, maxx, maxy) = oSep.bounds
        if maxx - minx >= maxy - miny:
            # supposed Horizontal
            lAbove = sp.getObjectAboveLine(oSep)
            if lAbove:
                maxRowBefore = max(_o._maxRow for _o in lAbove)
                lBelow = sp.getObjectBelowLine(oSep)
                if lBelow:
                    minRowAfter = min(_o._minRow for _o in lBelow)
                    if maxRowBefore >= minRowAfter:
                        oSep._bConsistent = False
        else:
            l1 = sp.getObjectOnLeftOfLine(oSep)
            if l1:
                maxColBefore = max(_o._maxCol for _o in l1)
                l2 = sp.getObjectOnRightOfLine(oSep)
                if l2:
                    minColAfter = min(_o._minCol for _o in l2)
                    if maxColBefore >= minColAfter:
                        oSep._bConsistent = False
                        if DEBUG:
                            for o in l1:
                                if o._maxCol >= minColAfter:
                                    print("too much on right", etree.tostring(o._domnd))
                            for o in l2:
                                if o._minCol <= maxColBefore:
                                    print("too much on left", etree.tostring(o._domnd))


def main(lsFilename, lsOutFilename):
    """
    Annotate the separators of each input file with DU_Sep="S" when they are
    consistent with the tables of the file, or DU_Sep="I" otherwise, and write
    the result in the corresponding output file.

    lsFilename    : list of input filenames
    lsOutFilename : list of output filenames, in the same order

    The proportion of consistent separators is traced per file and globally.
    """
    # for the pretty printer to format better...
    parser = etree.XMLParser(remove_blank_text=True)
    cnt, cntS = 0, 0    # separators, consistent separators, over all files
    nbFile = 0
    for sFilename, sOutFilename in zip(lsFilename, lsOutFilename):
        cntDoc, cntDocS = 0, 0

        doc = etree.parse(sFilename, parser)
        root = doc.getroot()

        # Separators are not under tableRegion... :-/
        lNdSep = MultiPageXml.getChildByName(root, 'SeparatorRegion')
        loSep = [ShapeLoader.node_to_LineString(ndSep) for ndSep in lNdSep]
        for _o in loSep:
            _o._bConsistent = True

        if not lNdSep:
            traceln("Warning: no separator in %s" % sFilename)
        else:
            traceln("%25s %d separators" % (sFilename, len(lNdSep)))
            for ndTR in MultiPageXml.getChildByName(root, 'TableRegion'):
                lNdCells = MultiPageXml.getChildByName(ndTR, 'TableCell')
                if not lNdCells:
                    continue

                # build a list of Shapely objects augmented with our own table attributes
                loText = _loadCellTextPoints(lNdCells)
                traceln(" TableRegion %d texts" % (len(loText)))

                if loText:
                    _flagInconsistentSeparators(loText, loSep)
                # end of TableRegion
            # end of document
            for ndSep, oSep in zip(lNdSep, loSep):
                if oSep._bConsistent:
                    ndSep.set("DU_Sep", "S")
                    cntDocS += 1
                else:
                    ndSep.set("DU_Sep", "I")
                cntDoc += 1

        doc.write(sOutFilename, encoding='utf-8', pretty_print=True, xml_declaration=True)
        # the small constant avoids a division by zero when there is no separator
        traceln('%.2f%% consistent separators - annotation done for %s --> %s' % (100*float(cntDocS)/(cntDoc+0.000001), sFilename, sOutFilename))

        del doc
        nbFile += 1
        cnt, cntS = cnt+cntDoc, cntS+cntDocS
    # report the number of files (cnt is the number of separators)
    traceln('%.2f%% consistent separators - annotation done for %d files' % (100*float(cntS)/(cnt+0.000001), nbFile))
def createCellsFromZoneClusters(pageNd, dIds, lZones, lClusters, dEdges):
    """
    TOP DOWN: create a grid table and refine it.

    pageNd    : page node
    dIds      : passed as is to getClusterTL
    lZones    : list of zone nodes, for the columns (sorted here left to right)
    lClusters : list of cluster NODES!! (sorted here top-down), for the rows
    dEdges    : passed as is to checkClusteringPerRow

    Cells are created by intersecting the row rectangles with the column
    zones. Steps:
    1. build the row rectangles from the text of the clusters, create the
       cells and assign the text to the cells
    2. shake and adjust the row rectangles, and redo the cells and the text
       assignment
    3. check if some rows are merges: if new rows are proposed, rebuild the
       list of rows (kept clusters + new rows), then redo the cells and the
       text assignment, including the text left aside by the check
    4. compute the profile

    Return dUpdateCells, dNewCells (both from computeProfile) and dShapeCells.
    """
    def centroidOf(nd):
        return ShapeLoader.node_to_Polygon(nd).centroid

    def textInCells(dTLByRow, dCellsByRow, nbRow):
        # assign the text of each of the first nbRow rows to the cells of
        # that row; key = cell.wkt
        dText = {}
        for iRow in range(nbRow):
            dText.update(assignTextLinesToCells(dTLByRow[iRow], dCellsByRow[iRow]))
        return dText

    lClusters = sorted(lClusters, key=lambda nd: centroidOf(nd).y)
    lZones = sorted(lZones, key=lambda nd: centroidOf(nd).x)

    dClusterWithTL = getClusterTL(pageNd, lClusters, dIds)
    lZonePoly = [ShapeLoader.node_to_Polygon(ndZone) for ndZone in lZones]
    ## if one clusterpoly is contained in another: merge
    lClusterPoly = transformMinima2envelop(pageNd, dClusterWithTL)

    nbCluster = len(lClusters)

    # first grid
    (dCells, dShapeCells,
     dCellCndPerRow, dCellCndPerCol) = createCellZoneFromIntersection(lClusterPoly, lZonePoly)
    dTextInCells = textInCells(dClusterWithTL, dCellCndPerRow, nbCluster)

    # refine the row rectangles, and again recompute cell zones
    lClusterPoly = ajustRectangleShapes(
        pageNd,
        shakeRectangleShapes(pageNd, lClusterPoly, dCellCndPerRow, dTextInCells))
    (dCells, dShapeCells,
     dCellCndPerRow, dCellCndPerCol) = createCellZoneFromIntersection(lClusterPoly, lZonePoly)
    dTextInCells = textInCells(dClusterWithTL, dCellCndPerRow, nbCluster)

    # do it first ??
    # check if row is a merge
    (lRowtobeDel, lNewRowCluster,
     lNewClusterTL, lNewClusterCC, lTLtoBeAdded) = checkClusteringPerRow(
        pageNd, dCellCndPerRow, dTextInCells, dClusterWithTL, dEdges)

    if lNewRowCluster:
        # the rows are now: the clusters that are kept, then the new rows
        lClusterPoly = []
        lKeptCluster = []
        for iCluster, ndCluster in enumerate(lClusters):
            if iCluster in lRowtobeDel:
                continue
            lClusterPoly.append(ShapeLoader.node_to_Polygon(ndCluster))
            lKeptCluster.append(ndCluster)
        dClusterWithTL = getClusterTL(pageNd, lKeptCluster, dIds)
        nbKept = len(lClusterPoly)
        for iNew, lTL in enumerate(lNewClusterTL):
            dClusterWithTL[nbKept + iNew] = lTL
        lClusterPoly.extend(lNewRowCluster)

        (dCells, dShapeCells,
         dCellCndPerRow, dCellCndPerCol) = createCellZoneFromIntersection(lClusterPoly, lZonePoly)
        nbRow = len(lClusterPoly)
        dTextInCells = textInCells(dClusterWithTL, dCellCndPerRow, nbRow)
        ## populate elements which are not in the grid/cells : at cc level or at tl level ?
        for iRow in range(nbRow):
            dTextInCells.update(assignTextLinesToCells(lTLtoBeAdded, dCellCndPerRow[iRow]))

    ## oversegmentation : test links at cell level if compatible! see 0068

    # COMPUTE PROFILE
    dUpdateCells, dNewCells = computeProfile(dCells, dTextInCells, dCellCndPerRow, dCellCndPerCol)

    # TABLE CREATION
    return dUpdateCells, dNewCells, dShapeCells