예제 #1
0
def processRegions(ndPage, bVerbose=False):
    """
    Clean up the TextRegion elements found below ndPage.

    A region without any TextLine descendant is removed from the tree.
    Every other region gets the "points" attribute of its first Coords
    node overwritten with the shape returned by ShapeLoader.convex_hull
    for its text lines.

    ndPage   -- node whose descendants are searched for pg:TextRegion
    bVerbose -- when true, trace how many regions were deleted / updated
    """
    lndRegions = ndPage.xpath(".//pg:TextRegion", namespaces=dNS)
    lndEmpty = []
    for ndRegion in lndRegions:
        lndLines = ndRegion.xpath(".//pg:TextLine", namespaces=dNS)
        if not lndLines:
            # removal is postponed until the scan is over
            lndEmpty.append(ndRegion)
            continue

        # resize the region around its text lines
        oHull = ShapeLoader.convex_hull(lndLines, bShapelyObject=True)
        ndCoords = PageXml.getChildByName(ndRegion, 'Coords')[0]
        ndCoords.set("points",
                     ShapeLoader.getCoordsString(oHull, bFailSafe=True))

    # delete empty regions
    for ndEmpty in lndEmpty:
        ndEmpty.getparent().remove(ndEmpty)

    if bVerbose:
        nDeleted = len(lndEmpty)
        traceln(" - %d regions deleted" % nDeleted)
        traceln(" - %d regions updated" % (len(lndRegions) - nDeleted))
예제 #2
0
    def findTemplate(self,doc):
        """
            Extract the page holding the first TableRegion of doc.

            The parent node of the first TableRegion is unlinked from doc and
            installed as root element of a newly created PageXml document.
            All four border attributes of every TableCell of that document
            are then set to 'true'.

            Returns the new document, or None when doc has no TableRegion.
        """
        lTables = PageXml.getChildByName(doc.getRootElement(),'TableRegion')
        if not lTables:
            return None

        # the direct parent of the table is taken as its page
        ndPage = lTables[0].parent
        newDoc, _unused = PageXml.createPageXmlDocument('XRCE', '', 0,0)
        ndPage.unlinkNode()
        newDoc.setRootElement(ndPage)
        ### need to add the ns!!

        # borders must be visible
        tBorderAttrs = ("leftBorderVisible", "rightBorderVisible",
                        "topBorderVisible", "bottomBorderVisible")
        for ndCell in PageXml.getChildByName(newDoc.getRootElement(),'TableCell'):
            for sAttr in tBorderAttrs:
                ndCell.setProp(sAttr,'true')

        return newDoc
예제 #3
0
    def regularTextLines(self, doc):
        """
            Replace the coordinates of every TextLine by a rectangle built
            along its baseline.

            For each TextLine of doc, the 'points' property of the node that
            follows the first child is parsed as a list of "x,y" pairs. The
            'points' property of the first child is then overwritten with a
            rectangle covering the bounding box of those points, extended
            upwards (smaller y) by a fixed height.

            NOTE(review): the first child is assumed to be Coords and its next
            sibling Baseline -- confirm against the input files.
        """
        iHeight = 50  # not points!!  300 dpi : 62.5

        for tl in PageXml.getChildByName(doc.getRootElement(), 'TextLine'):
            coord = tl.children
            baseline = tl.children.next

            lXY = []
            for sPair in baseline.prop('points').split(' '):
                try:
                    sx, sy = sPair.split(',')
                    lXY.append((int(sx), int(sy)))
                except ValueError:
                    # malformed pair: report the text line, ignore the pair.
                    # print() with a single argument behaves the same on
                    # Python 2 and Python 3.
                    print(tl)

            x1, y1, x2, y2 = Polygon(lXY).getBoundingBox()
            coord.setProp(
                'points', "%d,%d %d,%d %d,%d %d,%d" %
                (x1, y1 - iHeight, x2, y1 - iHeight, x2, y2, x1, y2))
예제 #4
0
    def reinitPage(self, doc):
        """
         Empty the document: detach every Page element from doc.

         doc is read through getroot(), i.e. the ElementTree-style API, whose
         elements have no unlinkNode() method; each Page is therefore removed
         through its parent.
        """
        for ndPage in PageXml.getChildByName(doc.getroot(), 'Page'):
            ndParent = ndPage.getparent()
            # a node without parent is not attached to anything: nothing to do
            if ndParent is not None:
                ndParent.remove(ndPage)
예제 #5
0
 def unLinkTable(self,doc):
     """
         Remove every TableRegion from doc.

         Each table node is unlinked from the tree and then freed.
         Returns doc itself (modified in place).
     """
     # an empty result simply means the loop body never runs
     for ndTable in PageXml.getChildByName(doc.getRootElement(),'TableRegion'):
         ndTable.unlinkNode()
         ndTable.freeNode()
     return doc
예제 #6
0
 def unLinkTextLines(self,doc):
     """
         Remove every TextLine (and therefore whatever it contains) from doc.

         Each text line node is unlinked from the tree and then freed.
         Returns doc itself (modified in place).
     """
     # an empty result simply means the loop body never runs
     for ndLine in PageXml.getChildByName(doc.getRootElement(),'TextLine'):
         ndLine.unlinkNode()
         ndLine.freeNode()
     return doc
예제 #7
0
    def extractFileNamesFromMPXML(self, doc):
        """
            Return the list of per-page .xml paths, in the order in which the
            Page elements appear in doc (this is what guarantees the file
            order).

            Each path is <abspath of coldir/col/docid>/<name>.xml, where
            <name> is the 'imageFilename' attribute of the page without its
            last four characters.
        """
        sDocDir = os.path.abspath(
            "%s%s%s%s%s" % (self.coldir, os.sep, 'col', os.sep, self.docid))

        lsPaths = []
        for ndPage in PageXml.getChildByName(doc.getroot(), 'Page'):
            # presumably strips a 3-letter image extension such as ".jpg"
            sBase = ndPage.get('imageFilename')[:-4]
            lsPaths.append("%s%s%s.xml" % (sDocDir, os.sep, sBase))
        return lsPaths
예제 #8
0
    def extractFileNamesFromMPXML(self, mpxmldoc):
        """
            Map every Page element of mpxmldoc, in document order, to the
            path of its .xml file (this is what guarantees the file order).

            Each path is <abspath of coldir/sCOL/docid>/<name>.xml, where
            <name> is the 'imageFilename' property of the page without its
            last four characters.

            duplicated from performCVLLA.py
        """
        xmlpath = os.path.abspath(os.path.join(self.coldir, sCOL, self.docid))

        def _pageToXmlPath(ndPage):
            # presumably strips a 3-letter image extension such as ".jpg"
            sBase = ndPage.prop('imageFilename')[:-4]
            return "%s%s%s.xml" % (xmlpath, os.sep, sBase)

        lndPages = PageXml.getChildByName(mpxmldoc.getRootElement(), 'Page')
        # map() is kept so that the type of the returned object is unchanged
        return map(_pageToXmlPath, lndPages)
예제 #9
0
    def regularTextLines(self, doc):
        """
            Rebuild the Coords polygon of every TextLine of doc from its
            Baseline.

            The new outline is made of the baseline points followed, in
            reverse order, by the points of a copy of the baseline shifted by
            -40 along y and simplified with a tolerance of 10.

            A TextLine is left untouched when it has no Baseline child or when
            fewer than two valid "x,y" pairs can be read from its baseline.
            A TextLine without Coords child is printed and skipped.

            Side effect: self.xmlns is set to the PAGE 2013-07-15 namespace.
        """
        from shapely.geometry import LineString
        from shapely.affinity import translate
        self.xmlns = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
        dNs = {"a": self.xmlns}

        for tl in PageXml.getChildByName(doc.getroot(), 'TextLine'):
            lCoords = tl.xpath("./a:Coords", namespaces=dNs)
            lBL = tl.xpath("./a:Baseline", namespaces=dNs)
            if not lBL:
                # no baseline: nothing to build the outline from
                continue
            if not lCoords:
                # no Coords node to update: report the line instead of
                # failing with an IndexError
                print(tl)
                continue
            coord = lCoords[0]

            lXY = []
            for sPair in lBL[0].get('points').split(' '):
                try:
                    sx, sy = sPair.split(',')
                    lXY.append((int(sx), int(sy)))
                except ValueError:
                    # malformed pair: report the text line, ignore the pair
                    print(tl)

            # a line needs at least 2 points; checked explicitly so that the
            # skip does not depend on which exception LineString raises
            if len(lXY) < 2:
                continue
            try:
                line = LineString(lXY)
            except ValueError:
                continue
            topline = translate(line, yoff=-40).simplify(10)

            lsBottom = ["%s,%s" % (int(p[0]), int(p[1])) for p in line.coords]
            # the shifted line is walked backwards so that the outline is closed
            lsTop = ["%s,%s" % (int(p[0]), int(p[1]))
                     for p in reversed(list(topline.coords))]
            coord.set('points', ' '.join(lsBottom) + ' ' + ' '.join(lsTop))
예제 #10
0
    def storeMPXML(self, lFiles):
        """
            Merge the files listed in lFiles into one multi-page document and
            save it as <coldir>/col/<docid>.mpxml (UTF-8, formatted).

            Before saving, a 'CornerPts' child with content "0 1 2 3", in the
            namespace of the root element, is added to every TableCell.

            Returns the tuple (document, path of the saved file).
        """
        sDocBase = os.path.join(self.coldir + os.sep + 'col', self.docid)
        doc = MultiPageXml.makeMultiPageXml(lFiles)

        ## add cornerNode
        for ndCell in PageXml.getChildByName(doc.getRootElement(), 'TableCell'):
            ndCorner = libxml2.newNode('CornerPts')
            ndCorner.setContent("0 1 2 3")
            ndCorner.setNs(doc.getRootElement().ns())
            ndCell.addChild(ndCorner)

        sMPXML = sDocBase + ".mpxml"
        doc.saveFormatFileEnc(sMPXML, "UTF-8", True)
        return doc, sMPXML
예제 #11
0
    def regularTextLinesold(self, doc):
        """
            Older variant: replace the Coords of every TextLine of doc by a
            rectangle built on its Baseline.

            The rectangle covers the bounding box of the baseline points,
            extended by 30 units towards smaller y.

            Side effect: self.xmlns is set to the PAGE 2013-07-15 namespace.
        """
        from shapely.geometry import LineString
        from shapely.affinity import translate
        self.xmlns = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
        dNs = {"a": self.xmlns}

        # 50 seems too large: the manual GT is 30 ? not always!
        iHeight = 30  # in pixel

        for tl in PageXml.getChildByName(doc.getroot(), 'TextLine'):
            coord = tl.xpath("./a:Coords", namespaces=dNs)[0]
            baseline = tl.xpath("./a:Baseline", namespaces=dNs)[0]

            lXY = []
            for sPair in baseline.get('points').split(' '):
                try:
                    sx, sy = sPair.split(',')
                except ValueError:
                    print(tl)
                    continue
                try:
                    lXY.append((int(sx), int(sy)))
                except ValueError:
                    print(tl)

            plg = Polygon(lXY)
            # the result is not used, but the construction is kept since it
            # may raise on an unusable point list
            LineString(lXY)
            x1, y1, x2, y2 = plg.getBoundingBox()

            if coord is None:
                print(tl)
                continue
            sRect = "%d,%d %d,%d %d,%d %d,%d" % (
                x1, y1 - iHeight, x2, y1 - iHeight, x2, y2, x1, y2)
            coord.set('points', sRect)
예제 #12
0
    def findTemplate(self, doc):
        """
            Build a one-page PageXml document holding a copy of the first
            TableRegion of doc.

            The new page receives the imageFilename, imageWidth and
            imageHeight attributes of the parent of that table. That parent
            is removed from doc, and a deep copy of the table is appended to
            the new page.

            Returns the new document, or None when doc has no TableRegion.
        """
        from copy import deepcopy

        lTables = PageXml.getChildByName(doc.getroot(), 'TableRegion')
        if not lTables:
            return None
        ndTable = lTables[0]

        newDoc, ndNewPage = PageXml.createPageXmlDocument('NLE', '', 0, 0)

        # the direct parent of the table is taken as its page
        ndOldPage = ndTable.getparent()
        for sAttr in ("imageFilename", "imageWidth", "imageHeight"):
            ndNewPage.set(sAttr, ndOldPage.get(sAttr))
        ndOldPage.getparent().remove(ndOldPage)

        # add table
        ndNewPage.append(deepcopy(ndTable))
        return newDoc
예제 #13
0
    def performLA(self, doc):
        """
            Run the layout-analysis steps enabled on this instance for doc.

            Depending on the flags of the instance:
              - bTemplate / bBaseLine / bSeparator: the .xml files of the
                document folder (<coldir>/col/<docid>) are deleted and
                regenerated from its .pxml files; after the external tool has
                run, the pages are merged again with storeMPXML();
              - bTemplate: table registration, using sTemplateFile or, when it
                is None, the template extracted from doc by findTemplate().
                When no table is found the registration is skipped;
              - bSeparator: separator detection;
              - bBaseLine: baseline detection;
              - bKeepTL: the text lines collected before processing are passed
                to reintegrateTextIntoCells() afterwards;
              - bRegularTextLine: regularTextLines() is applied.

            The document is finally written to the .mpxml path returned by
            storeMPXML().

            Returns the tuple (doc, number of pages).
            Raises AssertionError when the document folder has no .pxml file.
        """
        if self.bTemplate or self.bBaseLine or self.bSeparator:
            # extract list of files sorted as in MPXML
            lFullPathXMLNames = self.extractFileNamesFromMPXML(doc)
            nbPages = len(lFullPathXMLNames)

            xmlpath = os.path.abspath(
                os.path.join(self.coldir, 'col', self.docid))

            # existing .xml files are discarded: they are regenerated below
            # from the .pxml files
            for name in os.listdir(xmlpath):
                if name.endswith('.xml'):
                    os.remove("%s%s%s" % (xmlpath, os.sep, name))

            lPXMLNames = [
                name for name in os.listdir(xmlpath)
                if name.endswith('.pxml')
            ]
            assert lPXMLNames, "no .pxml file in %s" % xmlpath

            # copy pxml in xml
            for name in lPXMLNames:
                oldname = "%s%s%s" % (xmlpath, os.sep, name)
                newname = oldname[:-5] + '.xml'
                tmpdoc = etree.parse(oldname)
                tmpdoc.write(newname,
                             encoding="UTF-8",
                             pretty_print=True,
                             xml_declaration=True)

        if self.bKeepTL:
            # keep the text lines of each page, one list per page
            lTextLines = [
                PageXml.getChildByName(page, 'TextLine')
                for page in PageXml.getChildByName(doc.getroot(), 'Page')
            ]

        ## Table registration
        if self.bTemplate:
            # stays None when the document has no table to register against
            prnregfilename = None
            if self.sTemplateFile is None:
                templatePage = self.findTemplate(doc)
                if templatePage is None:
                    traceln("No table found in this document: %s" % self.docid)
                else:
                    stemplatefile = "%s%s%s.templ.xml" % (self.coldir, os.sep,
                                                          self.docid)
                    print(stemplatefile)
                    # writeDom() writes to self.outputFileName: redirect it
                    # for this call only, and restore it even on failure
                    oldOut = self.outputFileName
                    self.outputFileName = stemplatefile
                    try:
                        self.writeDom(templatePage, True)
                    finally:
                        self.outputFileName = oldOut
                    prnregfilename = self.createRegistrationProfile(
                        stemplatefile)
            else:
                prnregfilename = self.createRegistrationProfile(
                    self.sTemplateFile)

            if prnregfilename is not None:
                job = LAProcessor.cNomacs + " --batch %s" % (prnregfilename)
                os.system(job)
                traceln('table registration done: %s' % prnregfilename)

        ## separator detection
        if self.bSeparator:
            prnglfilename = self.createLinesProfile()
            job = LAProcessor.cNomacs + " --batch %s" % (prnglfilename)
            os.system(job)
            traceln('GL done: %s' % prnglfilename)

        ## baseline detection
        if self.bBaseLine:
            prnlafilename = self.createLAProfile()
            job = LAProcessor.cNomacsold + " --batch %s" % (prnlafilename)
            os.system(job)
            traceln('LA done: %s' % prnlafilename)

        if self.bTemplate or self.bBaseLine or self.bSeparator:
            doc, sMPXML = self.storeMPXML(lFullPathXMLNames)

        # Does not work with URO LA!
        if self.bKeepTL:
            self.reintegrateTextIntoCells(doc, lTextLines)

        ## text rectangles as textline region
        if self.bRegularTextLine:
            self.regularTextLines(doc)

        # NOTE(review): sMPXML and nbPages are only bound when at least one of
        # bTemplate / bBaseLine / bSeparator is set; with none of them set the
        # statements below fail. The intended output path for that case is not
        # visible here -- to be confirmed.
        doc.write(sMPXML,
                  encoding="UTF-8",
                  pretty_print=True,
                  xml_declaration=True)

        return doc, nbPages
예제 #14
0
    def reintegrateTextIntoCells(self, doc, lLTextLines=[]):
        """
        Attach a copy of each text line to the table cell it overlaps most.

        For every Page of doc, the bounding box of each text line is compared
        with the bounding box of each TableCell of the page. The text line is
        copied into the cell for which the ratio (intersection area / text
        line box area) is the largest, provided that ratio is not 0.

        doc         -- document whose Page elements are processed
        lLTextLines -- optional list holding, for each page (same order as the
                       Page elements), the text lines to use; when empty, the
                       TextLine elements found under each page are used.
                       The default list is only read, never modified here.

        self.xmlns is read by this method but not set in it.

        (from XMLDSTABLE)

        NOTE(review): the node handling mixes call styles: xpath(...,
        namespaces=...), get() and append() on one side, and .parent,
        docCopyNode(), .doc and getParent() on the other. To be checked
        against the XML library actually used.
        """
        def overlapX(zone1, zone2):
            # True when the intervals [a1, a2] and [b1, b2] intersect
            # (intervals that merely touch count as overlapping)
            [a1, a2] = zone1  #self.getX(),self.getX()+ self.getWidth()
            [b1, b2] = zone2  #zone.getX(),zone.getX()+ zone.getWidth()
            return min(a2, b2) >= max(a1, b1)

        def overlapY(zone1, zone2):
            # same test as overlapX, applied to the vertical intervals
            [a1, a2] = zone1  #self.getY(),self.getY() + self.getHeight()
            [b1, b2] = zone2  #zone.getY(),zone.getY() + zone.getHeight()
            return min(a2, b2) >= max(a1, b1)

        def signedRatioOverlap(zone1, zone2):
            """
             Return the area of the intersection of zone1 and zone2 divided
             by the area of zone1 (0.0 when they do not intersect).

             Both zones are unpacked as (x1, y1, x2, y2) corner values.
            """

            [
                x1, y1, x12, y12
            ] = zone1  # corners of the first box
            [
                x2, y2, x22, y22
            ] = zone2  # corners of the second box

            w1, h1 = x12 - x1, y12 - y1
            w2, h2 = x22 - x2, y22 - y2
            fOverlap = 0.0

            if overlapX((x1, x12), (x2, x22)) and overlapY((y1, y12),
                                                           (y2, y22)):
                # rebuilds the same corner values from origin + size
                [x11, y11, x12, y12] = [x1, y1, x1 + w1, y1 + h1]
                [x21, y21, x22, y22] = [x2, y2, x2 + w2, y2 + h2]

                s1 = w1 * h1

                # degenerate box: avoid a division by zero
                if s1 == 0: s1 = 1.0

                #intersection
                nx1 = max(x11, x21)
                nx2 = min(x12, x22)
                ny1 = max(y11, y21)
                ny2 = min(y12, y22)
                # h holds the horizontal extent and w the vertical one; only
                # their product is used
                h = abs(nx2 - nx1)
                w = abs(ny2 - ny1)

                inter = h * w
                if inter > 0:
                    # NOTE(review): with integer coordinates this is an
                    # integer division on Python 2 -- confirm the target
                    # interpreter
                    fOverlap = inter / s1
                else:
                    # reached when the boxes only touch (zero-area
                    # intersection)
                    fOverlap = 0.0

            return fOverlap

        def bestRegionsAssignment(plgtl, lRegions):
            """
                Return the (node, polygon) entry of lRegions whose bounding
                box has the largest overlap ratio with the bounding box of
                plgtl, or None when the largest ratio is 0.

                NOTE(review): max() raises ValueError when lRegions is empty.
            """

            lOverlap = []
            for _, plg in lRegions:
                lOverlap.append(
                    signedRatioOverlap(plgtl.getBoundingBox(),
                                       plg.getBoundingBox()))
            if max(lOverlap) == 0: return None
            return lRegions[lOverlap.index(max(lOverlap))]

        lPages = PageXml.getChildByName(doc.getroot(), 'Page')
        lRegionsToBeDeleted = []
        for i, page in enumerate(lPages):
            # text lines given by the caller take precedence over those
            # currently found in the page
            if lLTextLines == []:
                lTextLines = PageXml.getChildByName(page, 'TextLine')
            else:
                lTextLines = lLTextLines[i]

            lCells = PageXml.getChildByName(page, 'TableCell')

            # (cell node, polygon built from its Coords points) for each cell
            lOCells = []
            for cell in lCells:
                #get Coords
                xpath = "./a:%s" % ("Coords")
                lCoords = cell.xpath(xpath, namespaces={"a": self.xmlns})
                coord = lCoords[0]
                sPoints = coord.get('points')
                lsPair = sPoints.split(' ')
                lXY = list()
                for sPair in lsPair:
                    (sx, sy) = sPair.split(',')
                    lXY.append((int(sx), int(sy)))
                plg = Polygon(lXY)
                lOCells.append((cell, plg))

            # find the best assignment of each text
            for tl in lTextLines:
                #get Coords
                xpath = "./a:%s" % ("Coords")
                lCoords = tl.xpath(xpath, namespaces={"a": self.xmlns})
                coord = lCoords[0]
                sPoints = coord.get('points')
                lsPair = sPoints.split(' ')
                lXY = list()
                for sPair in lsPair:
                    (sx, sy) = sPair.split(',')
                    lXY.append((int(sx), int(sy)))
                plg = Polygon(lXY)
                cell = bestRegionsAssignment(plg, lOCells)
                if cell:
                    c, _ = cell
                    # NOTE(review): this records the parent of the *cell*,
                    # once per assigned text line (so possibly several times);
                    # the comment below suggests the parent TextRegion of the
                    # text line was meant -- confirm
                    lRegionsToBeDeleted.append(c.parent)
                    ## what about parent  TextRegion  delete at least TextRegion/TextEquiv
                    # the cell receives a copy; the original text line stays
                    # where it is
                    tlcp = tl.docCopyNode(c.doc, True)
                    c.append(tlcp)

        for region in lRegionsToBeDeleted:
            region.getParent().remove(region)
예제 #15
0
    def mergeBaselineCells(self, coldir, colid, docid):
        """
            Merge the text lines of the .pxml pages into the table cells of
            the .xml pages of a document, and regenerate its mpxml.

            The .pxml files (stuff processed on Transkribus) and the .xml
            files (CVL template tool output) of <coldir>/sCOL/<docid> are each
            merged into a multi-page document and paired page by page. For
            every page:
              - all TextRegion elements of the .xml page are removed;
              - the Coords of every TextLine of the .pxml page are replaced by
                a rectangle covering the bounding box of its Baseline,
                extended by 30 units towards smaller y;
              - the TextLine is moved into the TableCell (of the first
                TableRegion of the .xml page) for which signedOverlap() with
                the baseline is the largest, or into a new TextRegion
                appended to the page when no cell overlaps.
            The result is written to <coldir>/sCOL/<docid>.mpxml.

            coldir -- collection folder
            colid  -- not used
            docid  -- document identifier (folder name)

            Raises ValueError when a page has no TableRegion, and
            AssertionError when the numbers of .xml and .pxml pages differ.
        """
        iHeight = 30  # in pixel

        xmlpath = os.path.abspath(os.path.join(coldir, sCOL, docid))
        mpxml = xmlpath + ".mpxml"

        # glob returns names in arbitrary order: both lists are sorted so that
        # the i-th .xml page and the i-th .pxml page belong together
        lxml = sorted(glob.glob(os.path.join(xmlpath, "*.xml")))
        pxmldoc = MultiPageXml.makeMultiPageXml(lxml)

        lhtrxml = sorted(glob.glob(os.path.join(xmlpath, "*.pxml")))
        mpxmldoc = MultiPageXml.makeMultiPageXml(lhtrxml)

        lPXMLPage = PageXml.getChildByName(mpxmldoc.getroot(), 'Page')
        lXMLPage = PageXml.getChildByName(pxmldoc.getroot(), 'Page')

        assert len(lXMLPage) == len(lPXMLPage)
        for cvlpage, pxmlpage in zip(lXMLPage, lPXMLPage):
            ## remove TextRegion from cvlpage
            for tr in PageXml.getChildByName(cvlpage, 'TextRegion'):
                tr.getparent().remove(tr)

            # all the text lines of the Transkribus page
            lTL = []
            for ndRegion in PageXml.getChildByName(pxmlpage, 'TextRegion'):
                lTL.extend(PageXml.getChildByName(ndRegion, 'TextLine'))

            ltable = PageXml.getChildByName(cvlpage, 'TableRegion')
            if len(ltable) == 0:
                raise ValueError("NO TABLE")
            lCells = PageXml.getChildByName(ltable[0], 'TableCell')
            lC = [Polygon(PageXml.getPointList(c)) for c in lCells]

            for ndTL in lTL:
                ## normalization
                lCoordsPoints = PageXml.getChildByName(ndTL, 'Coords')
                lCoordsB = PageXml.getChildByName(ndTL, 'Baseline')
                coordB = lCoordsB[0]
                coord = lCoordsPoints[0]

                # the baseline is used both to rebuild the Coords rectangle
                # and to select the cell
                plgBaseline = Polygon(PageXml.getPointList(coordB))
                x1, y1, x2, y2 = plgBaseline.getBoundingBox()
                coord.set(
                    'points', "%d,%d %d,%d %d,%d %d,%d" %
                    (x1, y1 - iHeight, x2, y1 - iHeight, x2, y2, x1, y2))

                lOverlap = [self.signedOverlap(c, plgBaseline) for c in lC]
                if not lOverlap or max(lOverlap) == 0:
                    # no cell (or no overlapping cell): the line gets a
                    # region of its own, directly under the page
                    region = PageXml.createPageXmlNode('TextRegion')
                    cvlpage.append(region)
                    region.append(ndTL)
                else:
                    cell = lCells[lOverlap.index(max(lOverlap))]
                    cell.append(ndTL)

        pxmldoc.write(mpxml)