def processRegions(ndPage, bVerbose=False):
    """
    Clean up the TextRegion elements found under ndPage.

    - a region without any TextLine descendant is removed from its parent;
    - any other region gets the "points" attribute of its first Coords
      element replaced by the coordinate string of the convex hull of its
      text lines.

    ndPage   -- node queried with the ".//pg:TextRegion" XPath
    bVerbose -- when true, trace the number of deleted and updated regions
    """
    lndAllRegions = ndPage.xpath(".//pg:TextRegion", namespaces=dNS)
    lndEmpty = []

    for ndCur in lndAllRegions:
        lndLines = ndCur.xpath(".//pg:TextLine", namespaces=dNS)
        if not lndLines:
            # removal is deferred until the scan is over
            lndEmpty.append(ndCur)
            continue
        # resize the region around its text lines
        oShape = ShapeLoader.convex_hull(lndLines, bShapelyObject=True)
        ndCoords = PageXml.getChildByName(ndCur, 'Coords')[0]
        ndCoords.set("points",
                     ShapeLoader.getCoordsString(oShape, bFailSafe=True))

    # delete empty regions
    for ndCur in lndEmpty:
        ndCur.getparent().remove(ndCur)

    if bVerbose:
        nDeleted = len(lndEmpty)
        traceln(" - %d regions deleted" % (nDeleted))
        traceln(" - %d regions updated" % (len(lndAllRegions) - nDeleted))
def findTemplate(self, doc):
    """
    Extract the page holding the first TableRegion of doc.

    The parent node of the first TableRegion found under the root element
    is unlinked from doc and installed as root element of a newly created
    PageXml document.  The four border-visibility properties of every
    TableCell of that new document are then set to 'true'.

    Returns the new document, or None when doc has no TableRegion.
    """
    lTables = PageXml.getChildByName(doc.getRootElement(), 'TableRegion')
    if not lTables:
        return None

    # only the first table is considered
    ndPage = lTables[0].parent
    newDoc, _fakePage = PageXml.createPageXmlDocument('XRCE', '', 0, 0)
    ndPage.unlinkNode()
    # NOTE: the namespace still has to be added to the new root
    newDoc.setRootElement(ndPage)

    # Borders must be visible
    lsBorders = ("leftBorderVisible", "rightBorderVisible",
                 "topBorderVisible", "bottomBorderVisible")
    for ndCell in PageXml.getChildByName(newDoc.getRootElement(), 'TableCell'):
        for sBorder in lsBorders:
            ndCell.setProp(sBorder, 'true')
    return newDoc
def regularTextLines(self, doc):
    """
    From a baseline: create a regular TextLine (rectangle along the baseline).

    For every TextLine under the root element of doc, the "points" property
    of its first child node is overwritten with the four corners
    (x1,y1-iHeight) (x2,y1-iHeight) (x2,y2) (x1,y2), where (x1,y1,x2,y2) is
    the bounding box of the points read from the following sibling node.

    A point pair that cannot be parsed as "int,int" is skipped and the text
    line is printed.
    """
    iHeight = 50  # not points!! 300 dpi : 62.5
    lTextLines = PageXml.getChildByName(doc.getRootElement(), 'TextLine')
    for tl in lTextLines:
        # NOTE(review): relies on the first child being Coords and its next
        # sibling being Baseline (no interleaved text nodes) - confirm.
        coord = tl.children
        baseline = tl.children.next
        sPoints = baseline.prop('points')
        lXY = list()
        for sPair in sPoints.split(' '):
            try:
                (sx, sy) = sPair.split(',')
                lXY.append((int(sx), int(sy)))
            except ValueError:
                # function-call form: valid on Python 2 and Python 3,
                # like the other print() calls of this file
                print(tl)
        plg = Polygon(lXY)
        x1, y1, x2, y2 = plg.getBoundingBox()
        coord.setProp(
            'points', "%d,%d %d,%d %d,%d %d,%d" %
            (x1, y1 - iHeight, x2, y1 - iHeight, x2, y2, x1, y2))
def reinitPage(self, doc):
    """
    Empty the document: detach every Page element found under its root.

    doc -- parsed tree whose root is obtained with getroot()
    """
    lNodes = PageXml.getChildByName(doc.getroot(), 'Page')
    for node in lNodes:
        # The tree is accessed through getroot(), i.e. the ElementTree-style
        # API; its elements are detached through their parent (there is no
        # libxml2-style unlinkNode() on them).
        parent = node.getparent()
        if parent is not None:
            parent.remove(node)
def unLinkTable(self, doc):
    """
    Delete the tables of doc.

    Every TableRegion node found under the root element is unlinked from
    the tree and then freed.  Returns doc itself.
    """
    for ndTable in PageXml.getChildByName(doc.getRootElement(), 'TableRegion'):
        ndTable.unlinkNode()
        ndTable.freeNode()
    return doc
def unLinkTextLines(self, doc):
    """
    Delete the text lines (and the baselines they contain) of doc.

    Every TextLine node found under the root element is unlinked from the
    tree and then freed.  Returns doc itself.
    """
    for ndLine in PageXml.getChildByName(doc.getRootElement(), 'TextLine'):
        ndLine.unlinkNode()
        ndLine.freeNode()
    return doc
def extractFileNamesFromMPXML(self, doc):
    """
    List the per-page XML file paths in the page order of doc
    (to insure correct file order!).

    One path is produced per Page element under the root of doc:
    <coldir>/col/<docid>/<imageFilename minus its last 4 characters>.xml,
    the directory part being made absolute.

    Returns a list of strings.
    """
    sDocDir = os.path.abspath(
        "%s%s%s%s%s" % (self.coldir, os.sep, 'col', os.sep, self.docid))
    lndPages = PageXml.getChildByName(doc.getroot(), 'Page')
    # [:-4] drops the image extension (a dot plus three characters is assumed)
    return [
        "%s%s%s.xml" % (sDocDir, os.sep, ndPage.get('imageFilename')[:-4])
        for ndPage in lndPages
    ]
def extractFileNamesFromMPXML(self, mpxmldoc):
    """
    List the per-page XML file paths in the page order of mpxmldoc
    (to insure correct file order! duplicated from performCVLLA.py).

    One path is produced per Page element under the root element:
    <coldir>/<sCOL>/<docid>/<imageFilename minus its last 4 characters>.xml,
    the directory part being made absolute.

    Returns a list of strings.  A real list (not a lazy map object) is
    returned so that the result can be measured with len() and traversed
    several times on Python 3 as well.
    """
    xmlpath = os.path.abspath(os.path.join(self.coldir, sCOL, self.docid))
    lNd = PageXml.getChildByName(mpxmldoc.getRootElement(), 'Page')
    # [:-4] drops the image extension (a dot plus three characters is assumed)
    return [
        "%s%s%s.xml" % (xmlpath, os.sep, nd.prop('imageFilename')[:-4])
        for nd in lNd
    ]
def regularTextLines(self, doc):
    """
    From a baseline: create a regular TextLine outline.

    For every TextLine under the root of doc, the "points" attribute of its
    Coords child is replaced by an outline made of the Baseline points
    followed, in reverse order, by the points of the same line translated
    with yoff=-40 and simplified with a tolerance of 10.

    Text lines that are skipped (left untouched):
      - no Coords child: the text line is printed;
      - no Baseline child;
      - fewer than two usable baseline points, or a line shapely refuses.
    A point pair that cannot be parsed as "int,int" is ignored and the text
    line is printed.

    Side effect: sets self.xmlns to the PAGE 2013-07-15 namespace URI.
    """
    from shapely.geometry import LineString
    from shapely.affinity import translate

    self.xmlns = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
    dNsMap = {"a": self.xmlns}

    lTextLines = PageXml.getChildByName(doc.getroot(), 'TextLine')
    for tl in lTextLines:
        # get Coords; without it there is nothing to update
        lCoords = tl.xpath("./a:%s" % ("Coords"), namespaces=dNsMap)
        if not lCoords:
            print(tl)
            continue
        coord = lCoords[0]

        lBL = tl.xpath("./a:%s" % ("Baseline"), namespaces=dNsMap)
        if not lBL:
            continue
        sPoints = lBL[0].get('points')

        lXY = list()
        for sPair in sPoints.split(' '):
            try:
                (sx, sy) = sPair.split(',')
                lXY.append((int(sx), int(sy)))
            except ValueError:
                print(tl)

        # LineStrings must have at least 2 coordinate tuples: test it here
        # rather than depend on the exception type raised by shapely
        if len(lXY) < 2:
            continue
        try:
            line = LineString(lXY)
        except ValueError:
            continue

        topline = translate(line, yoff=-40).simplify(10)
        lTop = list(topline.coords)
        lTop.reverse()
        # baseline left-to-right, then the shifted line back
        spoints = ' '.join("%s,%s" % (int(x[0]), int(x[1])) for x in line.coords)
        spoints = spoints + ' ' + ' '.join(
            "%s,%s" % (int(x[0]), int(x[1])) for x in lTop)
        coord.set('points', spoints)
def storeMPXML(self, lFiles):
    """
    Store the files of lFiles as one multi-page XML file.

    The multi-page document is built from lFiles, a CornerPts child node
    (content "0 1 2 3", namespace of the root element) is added to every
    TableCell, and the document is saved, UTF-8 encoded and formatted, as
    <coldir>/col/<docid>.mpxml.

    Returns the tuple (document, path of the saved file).
    """
    sBase = os.path.join(self.coldir + os.sep + 'col', self.docid)
    mpDoc = MultiPageXml.makeMultiPageXml(lFiles)

    # add cornerNode
    for ndCell in PageXml.getChildByName(mpDoc.getRootElement(), 'TableCell'):
        ndCorner = libxml2.newNode('CornerPts')
        ndCorner.setContent("0 1 2 3")
        ndCorner.setNs(mpDoc.getRootElement().ns())
        ndCell.addChild(ndCorner)

    sTarget = sBase + ".mpxml"
    mpDoc.saveFormatFileEnc(sTarget, "UTF-8", True)
    return mpDoc, sTarget
def regularTextLinesold(self, doc):
    """
    From a baseline: create a regular TextLine (rectangle), older variant.

    For every TextLine under the root of doc, the "points" attribute of its
    Coords child is overwritten with the four corners
    (x1,y1-iHeight) (x2,y1-iHeight) (x2,y2) (x1,y2), where (x1,y1,x2,y2) is
    the bounding box of the Baseline points.  Slanted baselines therefore
    also get an axis-aligned rectangle.

    A text line without a Coords child is printed and skipped; one without
    a Baseline child is skipped.  A point pair that cannot be parsed as
    "int,int" is ignored and the text line is printed.

    Side effect: sets self.xmlns to the PAGE 2013-07-15 namespace URI.
    """
    self.xmlns = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
    dNsMap = {"a": self.xmlns}
    # 50 seems to large: the manual GT is 30 ? not always!
    iHeight = 30  # in pixel

    lTextLines = PageXml.getChildByName(doc.getroot(), 'TextLine')
    for tl in lTextLines:
        # get Coords; without it there is nothing to update
        lCoords = tl.xpath("./a:%s" % ("Coords"), namespaces=dNsMap)
        if not lCoords:
            print(tl)
            continue
        coord = lCoords[0]

        lBL = tl.xpath("./a:%s" % ("Baseline"), namespaces=dNsMap)
        if not lBL:
            continue
        sPoints = lBL[0].get('points')

        lXY = list()
        for sPair in sPoints.split(' '):
            try:
                (sx, sy) = sPair.split(',')
                lXY.append((int(sx), int(sy)))
            except ValueError:
                print(tl)

        # only the bounding box is needed: no shapely geometry is built
        plg = Polygon(lXY)
        x1, y1, x2, y2 = plg.getBoundingBox()
        coord.set(
            'points', "%d,%d %d,%d %d,%d %d,%d" %
            (x1, y1 - iHeight, x2, y1 - iHeight, x2, y2, x1, y2))
def findTemplate(self, doc):
    """
    Extract the first TableRegion of doc into a new one-page document.

    A new PageXml document is created; its page receives the
    imageFilename, imageWidth and imageHeight attributes of the page that
    holds the first TableRegion, then a deep copy of that table.
    The original page is removed from doc.

    Returns the new document, or None when doc has no TableRegion.
    """
    from copy import deepcopy

    lTables = PageXml.getChildByName(doc.getroot(), 'TableRegion')
    if not lTables:
        return None

    # only the first table is considered
    ndTable = lTables[0]
    newDoc, fakepage = PageXml.createPageXmlDocument('NLE', '', 0, 0)

    ndPage = ndTable.getparent()
    for sAttr in ("imageFilename", "imageWidth", "imageHeight"):
        fakepage.set(sAttr, ndPage.get(sAttr))
    ndPage.getparent().remove(ndPage)

    # add table
    fakepage.append(deepcopy(ndTable))
    return newDoc
def performLA(self, doc):
    """
    Run the enabled layout-analysis steps on document doc.

    Steps, each controlled by a flag of self:
      - bTemplate / bBaseLine / bSeparator (any of them): list the page
        files in MPXML order, delete the .xml files of <coldir>/col/<docid>
        and recreate them from the .pxml files of the same directory;
      - bKeepTL: remember the TextLine nodes of every page;
      - bTemplate: find the page where the template is (or use
        self.sTemplateFile), store it as <coldir>/<docid>.templ.xml,
        generate the registration profile and execute it;
      - bSeparator: create the lines profile and execute it;
      - bBaseLine: create the LA profile and execute it;
      - then rebuild the multi-page document, optionally reintegrate the
        remembered text lines (bKeepTL) and regularise the text lines
        (bRegularTextLine), and write the document.

    Returns the tuple (doc, nbPages).
    """
    # NOTE(review): nbPages, lFullPathXMLNames and (further down) sMPXML are
    # only assigned when at least one of these three flags is set, but
    # sMPXML and nbPages are used unconditionally at the end - confirm the
    # method is never called with all three flags off.
    if self.bTemplate or self.bBaseLine or self.bSeparator:
        # extract list of files sorted as in MPXML
        lFullPathXMLNames = self.extractFileNamesFromMPXML(doc)
        nbPages = len(lFullPathXMLNames)

        ## 1 generate xml files if only pxml are there
        xmlpath = os.path.abspath(
            os.path.join(self.coldir, 'col', self.docid))
        lXMLNames = [
            "%s%s%s" % (xmlpath, os.sep, name)
            for name in os.listdir(xmlpath)
            if os.path.basename(name)[-4:] == '.xml'
        ]
        isXml = [] != lXMLNames
        if isXml:
            # existing .xml files are deleted (list built only for its side effect)
            [
                os.remove("%s%s%s" % (xmlpath, os.sep, name))
                for name in os.listdir(xmlpath)
                if os.path.basename(name)[-4:] == '.xml'
            ]
            isXml = False
        isPXml = [] != [
            name for name in os.listdir(xmlpath)
            if os.path.basename(name)[-5:] == '.pxml'
        ]
        # isXml is always False here, so this only checks that .pxml files exist
        assert not isXml and isPXml

        # recreate doc? (mpxml)
        lPXMLNames = [
            name for name in os.listdir(xmlpath)
            if os.path.basename(name)[-5:] == '.pxml'
        ]
        if not isXml:
            # copy pxml in xml (parsed and re-serialised, pretty-printed UTF-8)
            for name in lPXMLNames:
                oldname = "%s%s%s" % (xmlpath, os.sep, name)
                newname = "%s%s%s" % (xmlpath, os.sep, name)
                newname = newname[:-5] + '.xml'
                tmpdoc = etree.parse(oldname)
                tmpdoc.write(newname,
                             encoding="UTF-8",
                             pretty_print=True,
                             xml_declaration=True)

    if self.bKeepTL:
        # keep the text lines: one list of TextLine nodes per page
        lTextLines = []
        lPages = PageXml.getChildByName(doc.getroot(), 'Page')
        for page in lPages:
            lTextLines.append(PageXml.getChildByName(page, 'TextLine'))

    ## Table registration
    if self.bTemplate:
        if self.sTemplateFile is None:
            templatePage = self.findTemplate(doc)
            if templatePage is None:
                # NOTE(review): prnregfilename is not assigned on this path
                # but is used right after the if/else - confirm.
                traceln("No table found in this document: %s" % self.docid)
            else:
                # outputFileName is redirected only while the template is written
                oldOut = self.outputFileName
                self.outputFileName = "%s%s%s.templ.xml" % (
                    self.coldir, os.sep, self.docid)
                stemplatefile = "%s%s%s.templ.xml" % (self.coldir, os.sep,
                                                      self.docid)
                print(stemplatefile)
                self.writeDom(templatePage, True)
                self.outputFileName = oldOut
                prnregfilename = self.createRegistrationProfile(
                    stemplatefile)
        else:
            # raise Exception, 'file template stuff: to be done'
            prnregfilename = self.createRegistrationProfile(
                self.sTemplateFile)

        # the command line is built by concatenation and run through the shell
        job = LAProcessor.cNomacs + " --batch %s" % (prnregfilename)
        os.system(job)
        traceln('table registration done: %s' % prnregfilename)

    ## separator detection
    if self.bSeparator:
        prnglfilename = self.createLinesProfile()
        job = LAProcessor.cNomacs + " --batch %s" % (prnglfilename)
        os.system(job)
        traceln('GL done: %s' % prnglfilename)

    ## baseline detection
    if self.bBaseLine:
        prnlafilename = self.createLAProfile()
        # job = LAProcessor.cNomacs+ " --batch %s"%(prnlafilename)
        # this step uses LAProcessor.cNomacsold instead of cNomacs
        job = LAProcessor.cNomacsold + " --batch %s" % (prnlafilename)
        os.system(job)
        traceln('LA done: %s' % prnlafilename)

    if self.bTemplate or self.bBaseLine or self.bSeparator:
        # doc is replaced by the document rebuilt from the page files
        doc, sMPXML = self.storeMPXML(lFullPathXMLNames)

    # Does not work with URO LA!
    if self.bKeepTL:
        self.reintegrateTextIntoCells(doc, lTextLines)

    ## text rectangles as textline region
    if self.bRegularTextLine:
        self.regularTextLines(doc)

    doc.write(sMPXML,
              encoding="UTF-8",
              pretty_print=True,
              xml_declaration=True)
    return doc, nbPages
def reintegrateTextIntoCells(self, doc, lLTextLines=[]):
    """
    Copy text lines into the table cell they overlap most (from XMLDSTABLE).

    For every Page of doc, each text line is compared, bounding box against
    bounding box, with every TableCell of the page; a copy of the line is
    appended to the cell with the highest overlap ratio, if that ratio is
    not 0.

    doc         -- document whose root is obtained with getroot()
    lLTextLines -- optional list holding, per page (same order as the Page
                   elements), the list of text line nodes to use; when left
                   empty the TextLine nodes of each page are used.
                   The default list is only read, never modified.

    Reads self.xmlns (namespace used for the Coords lookups).
    """
    def overlapX(zone1, zone2):
        # True when the intervals [a1, a2] and [b1, b2] intersect
        [a1, a2] = zone1  #self.getX(),self.getX()+ self.getWidth()
        [b1, b2] = zone2  #zone.getX(),zone.getX()+ zone.getWidth()
        return min(a2, b2) >= max(a1, b1)

    def overlapY(zone1, zone2):
        # same test as overlapX, applied to vertical intervals
        [a1, a2] = zone1  #self.getY(),self.getY() + self.getHeight()
        [b1, b2] = zone2  #zone.getY(),zone.getY() + zone.getHeight()
        return min(a2, b2) >= max(a1, b1)

    def signedRatioOverlap(zone1, zone2):
        """
        overlap zone1 and zone2
        return the surface of their intersection divided by the surface
        of zone1 (0.0 when they do not overlap)
        """
        # both zones are unpacked as two corners: widths and heights are
        # computed below as coordinate differences
        [x1, y1, x12, y12] = zone1
        [x2, y2, x22, y22] = zone2
        w1, h1 = x12 - x1, y12 - y1
        w2, h2 = x22 - x2, y22 - y2
        fOverlap = 0.0
        # print (x1,x12),(x2,x22)
        # print overlapX((x1,x12),(x2,x22))
        # print (y1,y12),(y2,y22)
        # print overlapY((y1,y12),(y2,y22))
        # if overlapX((x1,w1),(x2,w2)) and overlapY((y1,h1),(y2,h2)):
        if overlapX((x1, x12), (x2, x22)) and overlapY((y1, y12), (y2, y22)):
            [x11, y11, x12, y12] = [x1, y1, x1 + w1, y1 + h1]
            [x21, y21, x22, y22] = [x2, y2, x2 + w2, y2 + h2]
            s1 = w1 * h1
            # possible ?  (avoids a division by zero for a flat zone1)
            if s1 == 0:
                s1 = 1.0
            #intersection
            nx1 = max(x11, x21)
            nx2 = min(x12, x22)
            ny1 = max(y11, y21)
            ny2 = min(y12, y22)
            # h holds the horizontal extent and w the vertical one; only
            # their product is used
            h = abs(nx2 - nx1)
            w = abs(ny2 - ny1)
            inter = h * w
            if inter > 0:
                # NOTE(review): with int coordinates this is an integer
                # division under Python 2 - confirm the target interpreter.
                fOverlap = inter / s1
            else:
                # if overX and Y this is not possible !
                fOverlap = 0.0
        return fOverlap

    def bestRegionsAssignment(plgtl, lRegions):
        """
        find the best region for plgtl: the (node, polygon) pair of
        lRegions whose polygon has the highest overlap ratio with plgtl;
        None when the highest ratio is 0
        """
        lOverlap = []
        for _, plg in lRegions:
            lOverlap.append(
                signedRatioOverlap(plgtl.getBoundingBox(),
                                   plg.getBoundingBox()))
        # print plgtl.getBoundingBox(), lOverlap
        # max() raises ValueError when lRegions is empty (page without cells)
        if max(lOverlap) == 0:
            return None
        return lRegions[lOverlap.index(max(lOverlap))]

    lPages = PageXml.getChildByName(doc.getroot(), 'Page')
    lRegionsToBeDeleted = []
    for i, page in enumerate(lPages):
        if lLTextLines == []:
            lTextLines = PageXml.getChildByName(page, 'TextLine')
        else:
            lTextLines = lLTextLines[i]

        lCells = PageXml.getChildByName(page, 'TableCell')
        # print len(lCells),len(lTextLines)
        # (cell node, polygon of its Coords) for every cell of the page
        lOCells = []
        for cell in lCells:
            #get Coords
            xpath = "./a:%s" % ("Coords")
            lCoords = cell.xpath(xpath, namespaces={"a": self.xmlns})
            coord = lCoords[0]
            sPoints = coord.get('points')
            lsPair = sPoints.split(' ')
            lXY = list()
            for sPair in lsPair:
                (sx, sy) = sPair.split(',')
                lXY.append((int(sx), int(sy)))
            plg = Polygon(lXY)
            lOCells.append((cell, plg))

        # find the best assignment of each text
        for tl in lTextLines:
            #get Coords
            xpath = "./a:%s" % ("Coords")
            lCoords = tl.xpath(xpath, namespaces={"a": self.xmlns})
            coord = lCoords[0]
            sPoints = coord.get('points')
            lsPair = sPoints.split(' ')
            lXY = list()
            for sPair in lsPair:
                (sx, sy) = sPair.split(',')
                lXY.append((int(sx), int(sy)))
            plg = Polygon(lXY)
            cell = bestRegionsAssignment(plg, lOCells)
            if cell:
                c, _ = cell
                # NOTE(review): `.parent`, `docCopyNode` and `.doc` look like
                # libxml2-style calls, while the rest of this function uses
                # the getroot()/xpath(namespaces=)/get()/append() style -
                # confirm which DOM API the nodes really expose.
                # NOTE(review): what is queued for deletion is the parent of
                # the *cell*, not of the text line, and the same node may be
                # queued several times - confirm the intent.
                lRegionsToBeDeleted.append(c.parent)
                ## what about parent TextRegion delete at least TextRegion/TextEquiv
                # tl.unlinkNode()
                tlcp = tl.docCopyNode(c.doc, True)
                # tlcp.unlinkNode()
                c.append(tlcp)
                # print c

    for region in lRegionsToBeDeleted:
        # NOTE(review): spelled getParent() here, getparent() elsewhere in
        # this file - confirm.
        region.getParent().remove(region)
def mergeBaselineCells(self, coldir, colid, docid):
    """
    Merge Transkribus text lines into the CVL template-tool tables.

    Take the pages (.pxml) with the text lines processed on Transkribus,
    take the CVL template tool pages (.xml) of the same directory
    <coldir>/<sCOL>/<docid>, merge them page by page and regenerate
    <coldir>/<sCOL>/<docid>.mpxml:

      - the TextRegions of each .xml page are removed;
      - the Coords of each .pxml text line are normalised to the bounding
        box of its Baseline, extended by 30 pixels (y1 - 30);
      - each text line is moved into the table cell (first TableRegion of
        the page) with the highest self.signedOverlap value for its
        baseline, or into a new TextRegion appended to the page when no
        cell overlaps it.

    coldir, docid -- locate the document directory and the .mpxml file
    colid         -- not used

    Raises:
      ValueError     -- a .xml page has no TableRegion
      AssertionError -- the .xml and .pxml page counts differ
    """
    iHeight = 30  # in pixel

    xmlpath = os.path.abspath(os.path.join(coldir, sCOL, docid))
    mpxml = xmlpath + ".mpxml"
    # The parsed tree is not used; the call is kept because it makes the
    # method fail early when the existing .mpxml is missing or unparsable.
    # NOTE(review): confirm this check is wanted.
    etree.parse(mpxml)

    # NOTE(review): pages of both documents are paired by position below;
    # this relies on the two glob() calls listing the pages in matching
    # order - confirm.
    lxml = glob.glob(os.path.join(xmlpath, "*.xml"))
    cvldoc = MultiPageXml.makeMultiPageXml(lxml)
    lhtrxml = glob.glob(os.path.join(xmlpath, "*.pxml"))
    htrdoc = MultiPageXml.makeMultiPageXml(lhtrxml)

    lPXMLPage = PageXml.getChildByName(htrdoc.getroot(), 'Page')
    lXMLPage = PageXml.getChildByName(cvldoc.getroot(), 'Page')
    assert len(lXMLPage) == len(lPXMLPage)

    for iPage, cvlpage in enumerate(lXMLPage):
        ## remove TextRegion from cvlpage
        for tr in PageXml.getChildByName(cvlpage, 'TextRegion'):
            tr.getparent().remove(tr)

        # text lines of the matching Transkribus page
        pxmlpage = lPXMLPage[iPage]
        lTL = []
        for ndRegion in PageXml.getChildByName(pxmlpage, 'TextRegion'):
            lTL.extend(PageXml.getChildByName(ndRegion, 'TextLine'))

        ltable = PageXml.getChildByName(cvlpage, 'TableRegion')
        if len(ltable) == 0:
            # a string cannot be raised: use a real exception
            raise ValueError("NO TABLE")
        lCells = PageXml.getChildByName(ltable[0], 'TableCell')
        lC = [Polygon(PageXml.getPointList(c)) for c in lCells]

        for ndTL in lTL:
            ## normalization: Coords become a rectangle over the baseline
            coord = PageXml.getChildByName(ndTL, 'Coords')[0]
            coordB = PageXml.getChildByName(ndTL, 'Baseline')[0]
            plgBaseline = Polygon(PageXml.getPointList(coordB))
            x1, y1, x2, y2 = plgBaseline.getBoundingBox()
            coord.set(
                'points', "%d,%d %d,%d %d,%d %d,%d" %
                (x1, y1 - iHeight, x2, y1 - iHeight, x2, y2, x1, y2))

            # overlap of the baseline with every cell of the table
            lOverlap = [self.signedOverlap(c, plgBaseline) for c in lC]
            fBest = max(lOverlap) if lOverlap else 0
            if fBest == 0:
                # no cell (or no overlapping cell): the line goes into a
                # TextRegion of its own, appended to the page
                region = PageXml.createPageXmlNode('TextRegion')
                cvlpage.append(region)
                region.append(ndTL)
            else:
                lCells[lOverlap.index(fBest)].append(ndTL)

    cvldoc.write(mpxml)