Exemplo n.º 1
0
    def run(self, doc):
        """Load *doc* as an XMLDS document and mine its tables column-wise.

        Applies the manual template when ``self.bManual`` is set; otherwise
        runs :meth:`columnMining` on every table of every selected page.
        Returns the (tagged) input document.
        """
        self.doc = doc
        # use the lite version
        self.ODoc = XMLDSDocument()
        pageRange = range(self.firstPage, self.lastPage + 1)
        self.ODoc.loadFromDom(self.doc, listPages=pageRange)
        self.lPages = self.ODoc.getPages()

        if self.bManual:
            self.processWithTemplate(self.manualPattern, self.lPages)
        else:
            for page in self.lPages:
                print("page")
                for table in page.getAllNamedObjects(XMLDSTABLEClass):
                    self.columnMining(table)

        self.addTagProcessToMetadata(self.doc)

        return self.doc
Exemplo n.º 2
0
    def run(self, domDoc):
        """Convert an XMLDS DOM document into one PageXml document per page.

        :param domDoc: the XMLDS DOM to convert
        :return: list of ``(pageXmlDoc, imageFilename)`` tuples, one per page
        """
        ODoc = XMLDSDocument()
        ODoc.loadFromDom(domDoc)
        lPageXmlDoc = []
        lPages = ODoc.getPages()
        for page in lPages:
            # Best-effort: fall back to a placeholder name when the page has
            # no usable 'imageFilename' (missing attribute -> None ->
            # TypeError in basename).  Narrowed from a bare 'except:' so that
            # KeyboardInterrupt/SystemExit still propagate.
            try:
                filename = os.path.basename(page.getAttribute('imageFilename'))
            except Exception:
                filename = "fakename"
            pageXmlDoc, pageNode = PageXml.createPageXmlDocument(
                creatorName='NLE',
                filename=filename,
                imgW=convertDot2Pixel(self.dpi, page.getWidth()),
                imgH=convertDot2Pixel(self.dpi, page.getHeight()))
            # Remember the PageXml namespace for later element creation.
            self.pageXmlNS = etree.QName(pageXmlDoc.getroot()).namespace
            if self.bRegionOnly:
                self.convertOnlyRegion(page, pageNode)
            else:
                self.convertDSPage(page, pageNode)
            lPageXmlDoc.append(
                (pageXmlDoc, page.getAttribute('imageFilename')))

        return lPageXmlDoc
Exemplo n.º 3
0
    def run(self):
        """Mine the page lines of the loaded document.

        Loads ``self.doc`` as an XMLDS document restricted to the configured
        page range, then either applies the manual template or runs the
        automatic line mining.  Returns the (tagged) input document.
        """
        # use the lite version
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(
            self.doc,
            listPages=range(self.firstPage, self.lastPage + 1))

        self.lPages = self.ODoc.getPages()

        if self.bManual:
            self.processWithTemplate(self.manualPattern, self.lPages)
        else:
            self.mainLineMining(self.lPages)

        self.addTagProcessToMetadata(self.doc)

        return self.doc
Exemplo n.º 4
0
    def run(self, doc):
        """Process the document tables with the record-type-specific template.

        main issue: how to select the template: to be done by CVL,
        assuming IE and htr info are stored in the template.
        """
        if self.page2DS:
            dsconv = primaAnalysis()
            self.doc = dsconv.convert2DS(doc, self.docid)
        else:
            self.doc = doc
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(self.doc,
                              listPages=range(self.firstPage,
                                              self.lastPage + 1))

        self.lPages = self.ODoc.getPages()
        # Select the record model and processing routine for the record type.
        if self.recordType == 'D':
            record = deathRecord(self.sModelName, self.sModelDir)
            foo = self.processDeathWithTemplate
        elif self.recordType == 'W':
            record = weddingRecord(self.sModelName, self.sModelDir)
            foo = self.processWeddingWithTemplate
        elif self.recordType == 'B':
            record = birthRecord(self.sModelName, self.sModelDir)
            foo = self.processBirthWithTemplate
        else:
            # Previously fell through with 'record'/'foo' unbound, crashing
            # later with a NameError; stop cleanly instead.
            print(f'record type not supported: {self.recordType}')
            return

        for page in self.lPages:
            print("page: ", page.getNumber())
            lTables = page.getAllNamedObjects(XMLDSTABLEClass)

            for table in lTables:
                # Tables with fewer than 2 rows are most likely false positives.
                if table.getNbRows() < 2:
                    if self.bDebug:
                        print("page: %s : not a table? %d/%d" %
                              (page.getNumber(), table.getNbRows(),
                               table.getNbColumns()))
                    continue
                if self.BuseStoredTemplate:
                    if self.bDebug:
                        print("page: %s : table %d/%d" %
                              (page.getNumber(), table.getNbRows(),
                               table.getNbColumns()))
                    foo(table, record)
                else:
                    self.mineTable(table, record)

        self.evalData = record.generateOutput(self.evalData)
Exemplo n.º 5
0
    def createRefPerPage(self, doc):
        """Create one reference file per page (DAS 2018: one ref per graph/page).

        Each page of *doc* yields its own single-page DOCUMENT tree, written
        to ``<imageFilename-stem>ref``; the tree of the last page is returned.
        """
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(doc,
                              listPages=range(self.firstPage,
                                              self.lastPage + 1))

        for page in self.ODoc.getPages():
            pageNode = etree.Element('PAGE')
            # SINGLE PAGE output: the page number is always 1.
            pageNode.set("number", '1')

            pageNode.set("imageFilename", page.getAttribute('imageFilename'))
            pageNode.set("width", page.getAttribute('width'))
            pageNode.set("height", page.getAttribute('height'))

            root = etree.Element("DOCUMENT")
            refdoc = etree.ElementTree(root)
            root.append(pageNode)

            lTables = page.getAllNamedObjects(XMLDSTABLEClass)
            for table in lTables:
                tableNode = etree.Element('TABLE')
                tableNode.set("x", table.getAttribute('x'))
                tableNode.set("y", table.getAttribute('y'))
                tableNode.set("width", table.getAttribute('width'))
                tableNode.set("height", table.getAttribute('height'))
                pageNode.append(tableNode)

                # Group cells by row index, scoped PER TABLE.  The dict was
                # previously created once for the whole method, so cells from
                # earlier tables and pages leaked into later row computations
                # (the sibling createRef correctly scopes its dict per table).
                dRows = {}
                for cell in table.getAllNamedObjects(XMLDSTABLECELLClass):
                    try:
                        dRows[int(cell.getAttribute("row"))].append(cell)
                    except KeyError:
                        dRows[int(cell.getAttribute("row"))] = [cell]

                # One Y cut per row: the topmost cell of that row.
                lYcuts = []
                for rowid in sorted(dRows.keys()):
                    lYcuts.append(
                        min(list(map(lambda x: x.getY(), dRows[rowid]))))
                self.createRowsWithCuts(lYcuts, table, tableNode)

            self.outputFileName = os.path.basename(
                page.getAttribute('imageFilename')[:-3] + 'ref')
            print(self.outputFileName)
            self.writeDom(refdoc, bIndent=True)

        return refdoc
Exemplo n.º 6
0
    def run(self, doc):
        """Process death records: apply the stored template table by table.

        main issue: how to select the template: to be done by CVL,
        assuming IE and htr info are stored in the template.
        """
        if self.page2DS:
            dsconv = primaAnalysis()
            self.doc = dsconv.convert2DS(doc, self.docid)
        else:
            self.doc = doc
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(self.doc,
                              listPages=range(self.firstPage,
                                              self.lastPage + 1))

        self.lPages = self.ODoc.getPages()
        dr = deathRecord(self.sModelName, self.sModelDir)

        for page in self.lPages:
            print("page: ", page.getNumber())
            lTables = page.getAllNamedObjects(XMLDSTABLEClass)

            for table in lTables:
                # Tables with fewer than 2 rows are most likely false positives.
                if table.getNbRows() < 2:
                    print("page: %s : not a table? %d/%d" %
                          (page.getNumber(), table.getNbRows(),
                           table.getNbColumns()))
                    continue
                if self.BuseStoredTemplate:
                    # Best-effort per table: a failing table must not abort
                    # the whole run.  Narrowed from a bare 'except:' so that
                    # KeyboardInterrupt/SystemExit still propagate.
                    try:
                        self.processWithTemplate(table, dr)
                    except Exception:
                        print('issue with page %s' % page)
                else:
                    self.mineTable(table, dr)

        self.evalData = dr.generateOutput(self.evalData)
Exemplo n.º 7
0
    def run(self, doc):
        """Load the DOM, detect columns, and optionally mine the document.

        When ``self.bCreateRef`` is set, only the reference document is built
        and returned.  Otherwise the input is (optionally) converted to DS,
        columns are found, mining runs when ``self.bMining`` is enabled, and
        the DS document is returned.
        """
        # Reference-creation mode: convert if needed, build the ref, and stop.
        # (The original also carried unreachable per-page ref code after the
        # return; it has been removed.)
        if self.bCreateRef:
            if self.do2DS:
                dsconv = primaAnalysis()
                doc = dsconv.convert2DS(doc, self.docid)
            return self.createRef(doc)

        # convert to DS if needed
        if self.do2DS:
            dsconv = primaAnalysis()
            self.doc = dsconv.convert2DS(doc, self.docid)
        else:
            self.doc = doc
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(self.doc,
                              listPages=range(self.firstPage,
                                              self.lastPage + 1))
        self.lPages = self.ODoc.getPages()

        self.checkInputFormat(self.lPages)
        self.findColumnsInDoc(self.lPages)

        if self.bMining:
            self.documentMining(self.lPages)

        return self.doc
Exemplo n.º 8
0
    def createRef(self, doc):
        """Build a reference DOM (one column-cut set per table) from *doc*.

        Returns an lxml ElementTree rooted at a DOCUMENT element containing
        one PAGE node per selected page and one TABLE node per table.
        """
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(doc,
                              listPages=range(self.firstPage,
                                              self.lastPage + 1))

        rootNode = etree.Element("DOCUMENT")
        refdoc = etree.ElementTree(rootNode)

        for page in self.ODoc.getPages():
            pageNode = etree.Element('PAGE')
            pageNode.set("number", page.getAttribute('number'))
            pageNode.set("pagekey",
                         os.path.basename(page.getAttribute('imageFilename')))
            pageNode.set("width", page.getAttribute('width'))
            pageNode.set("height", page.getAttribute('height'))
            rootNode.append(pageNode)

            for table in page.getAllNamedObjects(XMLDSTABLEClass):
                tableNode = etree.Element('TABLE')
                for attrName in ("x", "y", "width", "height"):
                    tableNode.set(attrName, table.getAttribute(attrName))
                pageNode.append(tableNode)

                # Group the table cells by their column index.
                dColCells = {}
                for cell in table.getAllNamedObjects(XMLDSTABLECELLClass):
                    colIndex = int(cell.getAttribute("col"))
                    dColCells.setdefault(colIndex, []).append(cell)

                # One X cut per column: the leftmost cell of that column.
                lXcuts = [min(c.getX() for c in dColCells[colid])
                          for colid in sorted(dColCells)]
                self.createColumnsWithCuts(lXcuts, table, tableNode)

        return refdoc
Exemplo n.º 9
0
    def run(self, doc):
        """Load the DOM, find the columns, and build the reference document.

        When ``self.bCreateRef`` is set, only the reference document is built
        and returned.  Otherwise columns are detected; if the input was
        converted from PageXml (``self.do2DS``) the result is stored back as
        multi-page PageXml and None is returned, else the DS doc is returned.
        """
        # Reference-creation mode: convert if needed, build the ref, and stop.
        # (The original also carried unreachable per-page ref code after the
        # return; it has been removed.)
        if self.bCreateRef:
            if self.do2DS:
                dsconv = primaAnalysis()
                doc = dsconv.convert2DS(doc, self.docid)
            return self.createRef(doc)

        # convert to DS if needed
        if self.do2DS:
            dsconv = primaAnalysis()
            self.doc = dsconv.convert2DS(doc, self.docid)
        else:
            self.doc = doc
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(self.doc,
                              listPages=range(self.firstPage,
                                              self.lastPage + 1))

        self.findColumnsInDoc(self.ODoc)
        # createRef is run for its side effects; its return value is unused.
        refdoc = self.createRef(self.doc)

        if self.do2DS:
            # back to PageXml
            conv = DS2PageXMLConvertor()
            lPageXDoc = conv.run(self.doc)
            conv.storeMultiPageXml(lPageXDoc, self.getOutputFileName())
            return None
        return self.doc
Exemplo n.º 10
0
    def processDocument(self, coldir, colid, docid, dom=None):
        """
            Process a single document through the Transkribus table workflow.

            The full (reference) pipeline, as shell commands:

            1 python ../../src/xml_formats/PageXml.py trnskrbs_5400/col/17442 --ext=pxml
            2 python ../../src/tasks/performCVLLA.py  --coldir=trnskrbs_5400/  --docid=17442 -i trnskrbs_5400/col/17442.mpxml  --bl --regTL --form
            3 python ../../src/tasks/DU_ABPTable_T.py modelMultiType tableRow2 --run=trnskrbs_5400
            4 python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442_du.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
            5 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml
            6 python ../../../TranskribusPyClient/src/TranskribusCommands/TranskribusDU_transcriptUploader.py --nodu trnskrbs_5400 5400 17442
            7 python ../../../TranskribusPyClient/src/TranskribusCommands/do_htrRnn.py  <model-name> <dictionary-name> 5400 17442
            
            wait
            8  python ../../../TranskribusPyClient/src/TranskribusCommands/Transkribus_downloader.py 5400 --force 
             #covnert to ds
            9  python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
            10 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml  --doie --usetemplate  

            :param coldir: collection directory on disk
            :param colid: Transkribus collection id
            :param docid: Transkribus document id
            :param dom: optional pre-loaded mpxml DOM; when None the document
                is loaded from coldir/docid

            NOTE(review): execution currently stops right after launching the
            layout-analysis job (first bare ``return`` below); the remaining
            stages are kept as a disabled reference implementation.
        """

        # create the Transkribus client
        self.myTrKCient = TranskribusClient(sServerUrl=self.server,
                                            proxies={},
                                            loggingLevel=logging.WARN)
        # login (credentials handled by self.login)
        _ = self.login(self.myTrKCient, trace=trace, traceln=traceln)

        #         self.downloadCollection(colid,coldir,docid,bNoImg=False,bForce=True)

        ## load the mpxml DOM, either from disk or from the provided `dom`
        if dom is None:
            self.inputFileName = os.path.abspath(
                os.path.join(coldir, TableProcessing.sCOL,
                             docid + TableProcessing.sMPXMLExtension))
            mpxml_doc = self.loadDom()
            nbPages = MultiPageXml.getNBPages(mpxml_doc)
        else:
            # load provided mpxml
            mpxml_doc = dom
            nbPages = MultiPageXml.getNBPages(mpxml_doc)

#         ### table registration: need to compute/select???   the template
#         # perform LA  separator, table registration, baseline with normalization
#         #python ../../src/tasks/performCVLLA.py  --coldir=trnskrbs_5400/  --docid=17442 -i trnskrbs_5400/col/17442.mpxml  --bl --regTL --form
#         tableregtool= LAProcessor()
# #         latool.setParams(dParams)
#         tableregtool.coldir = coldir
#         tableregtool.docid = docid
#         tableregtool.bTemplate, tableregtool.bSeparator , tableregtool.bBaseLine , tableregtool.bRegularTextLine = True,False,False,False
#         # creates xml and a new mpxml
#         mpxml_doc,nbPages = tableregtool.performLA(mpxml_doc)
#
#

#         self.upLoadDocument(colid, coldir,docid,sNote='NLE workflow;table reg done')

        # Launch the layout-analysis (LA) job on the server.
        lJobIDs = self.applyLA_URO(colid, docid, nbPages)
        # Deliberate early stop: only the LA job is triggered for now.
        # Everything below this point is dead code kept as a reference.
        return

        bWait = True
        assert lJobIDs != []
        jobid = lJobIDs[-1]
        traceln("waiting for job %s" % jobid)
        # NOTE(review): busy-polls the job status without a sleep — would
        # hammer the server if re-enabled; consider adding a delay.
        while bWait:
            dInfo = self.myTrKCient.getJobStatus(jobid)
            bWait = dInfo['state'] not in ['FINISHED', 'FAILED']

        ## coldir???
        self.downloadCollection(colid, coldir, docid, bNoImg=True, bForce=True)

        ##STOP HERE FOR DAS newx testset:
        return

        # tag text for BIES cell
        #python ../../src/tasks/DU_ABPTable_T.py modelMultiType tableRow2 --run=trnskrbs_5400
        """ 
            needed : doer = DU_ABPTable_TypedCRF(sModelName, sModelDir,
        """
        doer = DU_ABPTable_TypedCRF(self.sRowModelName, self.sRowModelDir)
        doer.load()
        ## needed predict at file level, and do not store dom, but return it
        rowpath = os.path.join(coldir, "col")
        BIESFiles = doer.predict([rowpath], docid)
        BIESDom = self.loadDom(BIESFiles[0])
        #         res= BIESDom.saveFormatFileEnc('test.mpxml', "UTF-8",True)

        # MPXML2DS: convert the BIES-tagged PageXml back to DS format
        #python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442_du.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
        dsconv = primaAnalysis()
        DSBIESdoc = dsconv.convert2DS(BIESDom, self.docid)

        # create XMLDOC object
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(
            DSBIESdoc)  #,listPages = range(self.firstPage,self.lastPage+1))
        # create rows
        #python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml
        rdc = RowDetection()
        rdc.findRowsInDoc(self.ODoc)

        #python ../../src/xml_formats/DS2PageXml.py -i trnskrbs_5400/out/17442.ds_xml --multi
        # DS2MPXML: convert back to (multi-page) PageXml and store it
        DS2MPXML = DS2PageXMLConvertor()
        lPageXml = DS2MPXML.run(self.ODoc.getDom())
        if lPageXml != []:
            #             if DS2MPXML.bMultiPages:
            newDoc = MultiPageXml.makeMultiPageXmlMemory(
                map(lambda xy: xy[0], lPageXml))
            outputFileName = os.path.join(
                self.coldir, sCOL,
                self.docid + TableProcessing.sMPXMLExtension)
            newDoc.write(outputFileName,
                         xml_declaration=True,
                         encoding="UTF-8",
                         pretty_print=True)


#             else:
#                 DS2MPXML.storePageXmlSetofFiles(lPageXml)

        return

        # upload the new transcript
        # python ../../../TranskribusPyClient/src/TranskribusCommands/TranskribusDU_transcriptUploader.py --nodu trnskrbs_5400 5400 17442
        self.upLoadDocument(colid,
                            coldir,
                            docid,
                            sNote='NLE workflow;table row done')

        ## apply HTR
        ## how to deal with specific dictionaries?

        ## here need to know the ontology and the template

        nbPages = 1
        jobid = self.applyHTR(colid, docid, nbPages, self.sHTRmodel,
                              self.sDictName)
        bWait = True
        traceln("waiting for job %s" % jobid)
        # NOTE(review): same busy-poll pattern as above.
        while bWait:
            dInfo = self.myTrKCient.getJobStatus(jobid)
            bWait = dInfo['state'] not in ['FINISHED', 'FAILED', 'CANCELED']

        # download  where???
        # python ../../../TranskribusPyClient/src/TranskribusCommands/Transkribus_downloader.py 5400 --force
        #   coldir is not right!! coldir must refer to the parent folder!
        self.downloadCollection(colid, coldir, docid, bNoImg=True, bForce=True)
Exemplo n.º 11
0
class lineMiner(Component.Component):
    """
        lineMiner class: a component to mine column-like page layout
    """
    
    
    #DEFINE the version, usage and description of this particular component
    usage = "" 
    version = "v.01"
    description = "description: line miner "

    
    #--- INIT -------------------------------------------------------------------------------------------------------------    
    def __init__(self):
        """
        Always call first the Component constructor.
        """
        Component.Component.__init__(self, "lineMiner", self.usage, self.version, self.description) 
        
        
        # TH for comparing numerical features for X
        self.THNUMERICAL= 20
        # use for evaluation
        self.THCOMP = 10
        # evaluation data, filled by the run/eval methods
        self.evalData= None
        
        # True when a manual pattern is supplied via setParams("pattern")
        self.bManual = False
        
    def setParams(self, dParams):
        """
        Always call first the Component setParams
        Here, we set our internal attribute according to a possibly specified value (otherwise it stays at its default value)
        """
        Component.Component.setParams(self, dParams)
        if "pattern" in dParams: 
            # SECURITY NOTE(review): eval() on an externally supplied string
            # executes arbitrary code; only safe if "pattern" always comes
            # from a trusted caller — consider ast.literal_eval instead.
            self.manualPattern = eval( dParams["pattern"])
            self.bManual=True  



    def mainLineMining(self, lPages):
        """Mine each page with incrementally longer sequence patterns.

        For every page: link each text element to its vertical neighbours,
        compute layout features (x, x2, xc, text), then run the sequence
        miner with pattern lengths 1 and 2 via :meth:`mineLineFeature`.

        :param lPages: list of XMLDS page objects to mine
        """
        import util.TwoDNeighbourhood as TwoDRel

        lLElts = [[] for _ in range(len(lPages))]
        for i, page in enumerate(lPages):
            lElts = page.getAllNamedObjects(XMLDSTEXTClass)
            for e in lElts:
                e.lnext = []
            lElts.sort(key=lambda x: x.getY())
            lLElts[i] = lElts
            # Link each element to its vertical neighbours (used as 'next').
            lVEdge = TwoDRel.findVerticalNeighborEdges(lElts)
            for a, b in lVEdge:
                a.lnext.append(b)

        for i, page in enumerate(lPages):
            lElts = lLElts[i]
            for elt in lElts:
                ### need of more abstract features: justified, center, left, right + numerical
                # 'x2' (right edge) was mistyped as '2'; the sibling
                # mineLineFeatureBasic uses ['x','x2','xc','text'].
                elt.setFeatureFunction(elt.getSetOfListedAttributes,
                                       self.THNUMERICAL,
                                       lFeatureList=['x', 'x2', 'xc', 'text'],
                                       myLevel=XMLDSTEXTClass)
                elt.computeSetofFeatures()
            seqGen = sequenceMiner()
            seqGen.setMaxSequenceLength(1)
            seqGen.setSDC(0.7)  # related to noise level AND STRUCTURES (if many columns)
            _ = seqGen.featureGeneration(lElts, 2)
            seqGen.setObjectLevel(XMLDSTEXTClass)

            # for registration: needs to be replaced by ._lRegValues
            print("sequence of elements and their features:")

            for elt in lElts:
                elt.lFeatureForParsing = elt.getSetofFeatures()

            lTerminalTemplates = []
            lCurList = lElts[:]
            # Incrementally mine with pattern length 1 then 2.
            for iLen in range(1, 3):
                lCurList, lTerminalTemplates = self.mineLineFeature(
                    seqGen, lCurList, lTerminalTemplates, iLen)

            del seqGen

    def mineLineFeature(self,seqGen,lCurList,lTerminalTemplates,iLen):
        """
            Run one mining pass with pattern length *iLen* over lCurList.

            Steps: (re)compute features for each element ('virtual' features
            for elements produced by an earlier parsing pass), mine frequent
            sequential patterns with prefix-scan, derive sequential rules and
            candidate tree templates, then parse lCurList with the best
            template (bReplace=True), which may rewrite the list in terms of
            template instances.

            Returns the (possibly rewritten) element list and the terminal
            templates accumulated so far.
        """ 
        seqGen.setMinSequenceLength(iLen)
        seqGen.setMaxSequenceLength(iLen)
        
        print( '***'*20, iLen)
        seqGen.bDebug = False
        # Refresh features: elements created by a previous parse have no
        # feature set yet and get the 'virtual' feature instead.
        for elt in lCurList:
            if elt.getSetofFeatures() is None:
                elt.resetFeatures()
                elt.setFeatureFunction(elt.getSetOfListedAttributes,self.THNUMERICAL,lFeatureList=['virtual'],myLevel=XMLDSTEXTClass)
                elt.computeSetofFeatures()
                elt.lFeatureForParsing=elt.getSetofFeatures()
            else:
                elt.setSequenceOfFeatures(elt.lFeatureForParsing)
#                 print elt, elt.getSetofFeatures()
        lSortedFeatures  = seqGen.featureGeneration(lCurList,2)
        # Weight each canonical feature by its number of supporting nodes.
        for cf in lSortedFeatures:
            cf.setWeight(len((cf.getNodes())))
#         for x in lCurList:
#             print( x, x.getCanonicalFeatures())
        lmaxSequence = seqGen.generateItemsets(lCurList)
        seqGen.bDebug = False
        
        # Mine frequent sequential patterns (prefix-scan, min support 0.01).
        lSeq, _ = seqGen.generateMSPSData(lmaxSequence,lSortedFeatures + lTerminalTemplates,mis = 0.01)
        lPatterns = seqGen.miningSequencePrefixScan(lSeq)
#         model = PrefixSpan.train(lSeq, minSupport = 0.1, maxPatternLength = 10)
#         result = model.freqSequences().collect()
#         result.sort(key=lambda x:x.freq)
#         lPatterns=[]
#         for fs in result:
#             if fs.freq > 1:
# #                 for i in fs.sequence:
# #                     for x in i:
# #                         print(x,x.getValue())
# #                     i.sort(key=lambda x:x.getValue())
#                 lPatterns.append((fs.sequence,fs.freq))
#         lPatterns = list(filter(lambda p_s:len(p_s[0]) >= seqGen.getMinSequenceLength() and len(p_s[0]) <= seqGen.getMaxSequenceLength(),lPatterns))
            
#         lPatterns = seqGen.beginMiningSequences(lSeq,lSortedFeatures,lMIS)
        # No patterns found: return the input unchanged.
        if lPatterns is None:
            return lCurList, lTerminalTemplates
                    
        # Most frequent patterns first.
        lPatterns.sort(key=lambda xy:xy[1], reverse=True)
        
        print ("List of patterns and their support:")
        for p,support  in lPatterns:
            if support >= 1: 
                print (p, support)
        seqGen.THRULES = 0.95
        lSeqRules = seqGen.generateSequentialRules(lPatterns)
        
        " here store features which are redundant and consider only the core feature"
        _,dCP = self.getPatternGraph(lSeqRules)
        
        dTemplatesCnd = self.analyzeListOfPatterns(lPatterns,dCP)
        lFullTemplates,lTerminalTemplates,tranprob = seqGen.testTreeKleeneageTemplates(dTemplatesCnd, lCurList)
        
        ## here we have a graph; second : is it useful here to correct noise??
        ## allows for selecting templates ?
        self.selectFinalTemplates(lTerminalTemplates,tranprob,lCurList)
        
        ## store parsed sequences in mytemplate
        ### patterns are competing: generate a set of parsing ??? 
        # Only the single best template is applied (lFullTemplates[:1]).
        for mytemplate in lFullTemplates[:1]: # dTemplatesCnd.keys():
#                     for _,_, mytemplate in dTemplatesCnd[templateType][:1]:
                mytemplate.print_()
#                         print '___'*30
#                         print lCurList
                # bReplace=True: matched sub-sequences are replaced by
                # template instances, so lCurList may shrink/change here.
                isKleenePlus,_,lCurList = seqGen.parseWithTreeTemplate(mytemplate,lCurList,bReplace=True)
                for elt in lCurList:
                    if elt.getSetofFeatures() is None:
                        elt.resetFeatures()
                        elt.setFeatureFunction(elt.getSetOfListedAttributes,self.THNUMERICAL,lFeatureList=['virtual'],myLevel=XMLDSTEXTClass)
                        elt.computeSetofFeatures()
                        elt.lFeatureForParsing=elt.getSetofFeatures()
                    else:
                        elt.setSequenceOfFeatures(elt.lFeatureForParsing)                        
#                 self.printTreeView(lCurList)        
                            
#                 print 'curList:',lCurList
#                 print len(lCurList)
        print( "final hierarchy")
        self.printTreeView(lCurList)
#             lRegions= self.getRegionsFromStructure(page,lCurList)
                # store all interation
#             lPageRegions.append((page,lRegions,lCurList))
                
        return lCurList, lTerminalTemplates
    def mineLineFeatureBasic(self,lPages):
        """
            get a set of lines and mine them
        """ 
        
        import util.TwoDNeighbourhood as TwoDRel
        
        lPageRegions = []
        
        lVEdge = []
        lLElts=[ [] for i in range(0,len(lPages))]
        for i,page in enumerate(lPages):
            lElts= page.getAllNamedObjects(XMLDSTEXTClass)
            for e in lElts:
                e.lnext=[]
            ## filter elements!!!
            lElts = list(filter(lambda x:min(x.getHeight(),x.getWidth()) > 10,lElts))
#             lElts = filter(lambda x:x.getHeight() > 10,lElts)
#             lElts = list(filter(lambda x:x.getX() > 100,lElts))

            lElts.sort(key=lambda x:x.getY())
            lLElts[i]=lElts
            lVEdge = TwoDRel.findVerticalNeighborEdges(lElts)
            for  a,b in lVEdge:
                a.lnext.append( b )

        for i,page, in enumerate(lPages):
            lElts= lLElts[i]
            for elt in lElts:
                elt.setFeatureFunction(elt.getSetOfListedAttributes,self.THNUMERICAL,lFeatureList=['x','x2','xc','text'],myLevel=XMLDSTEXTClass)
                elt.computeSetofFeatures()
#                 print elt.getSetofFeatures()
            seqGen = sequenceMiner()
            seqGen.setMaxSequenceLength(1)
            seqGen.setSDC(0.5) # related to noise level       AND STRUCTURES (if many columns)          
            lSortedFeatures  = seqGen.featureGeneration(lElts,2)
            seqGen.setObjectLevel(XMLDSTEXTClass)

            # for registration: needs to be replaced by ._lRegValues
            print( "sequence of elements and their features:")

            for elt in lElts:
                elt.lFeatureForParsing=elt.getSetofFeatures()
                print( elt, elt.lFeatureForParsing)
            icpt=0
            lTerminalTemplates = []
            
            lCurList= lElts
#             print len(lCurList)
            while icpt < 2:
    #             seqGen.bDebug = False
                ## generate sequences
                if icpt > 0:
                    seqGen.setMinSequenceLength(2)
                    seqGen.setMaxSequenceLength(2)
                    
                    print( '***'*20, icpt)
                    seqGen.bDebug = False
                    for elt in lCurList:
                        if elt.getSetofFeatures() is None:
                            elt.resetFeatures()
                            elt.setFeatureFunction(elt.getSetOfListedAttributes,self.THNUMERICAL,lFeatureList=['virtual'],myLevel=XMLDSTEXTClass)
                            elt.computeSetofFeatures()
                            elt.lFeatureForParsing=elt.getSetofFeatures()
                        else:
                            elt.setSequenceOfFeatures(elt.lFeatureForParsing)
                    lSortedFeatures  = seqGen.featureGeneration(lCurList,2)
                    
                lmaxSequence = seqGen.generateItemsets(lCurList)
                seqGen.bDebug = False
                
                lSeq, lMIS = seqGen.generateMSPSData(lmaxSequence,lSortedFeatures + lTerminalTemplates,mis = 0.2)
                lPatterns  = seqGen.miningSequencePrefixScan(lSeq)
                if lPatterns is None:
                    return None
#                 lPatterns = seqGen.beginMiningSequences(lSeq,lSortedFeatures,lMIS)            
                lPatterns.sort(key=lambda xy:xy[1], reverse=True)
                
                print( "List of patterns and their support:")
                for p,support  in lPatterns:
                    if support >= 1: 
                        print (p, support)
                seqGen.THRULES = 0.95
                lSeqRules = seqGen.generateSequentialRules(lPatterns)
                
                " here store features which are redundant and consider only the core feature"
                _,dCP = self.getPatternGraph(lSeqRules)
                
                dTemplatesCnd = self.analyzeListOfPatterns(lPatterns,dCP,icpt)
                lFullTemplates,lTerminalTemplates,tranprob = seqGen.testTreeKleeneageTemplates(dTemplatesCnd, lCurList)
#                 lFullTemplates = seqGen.testTreeKleeneageTemplates(dTemplatesCnd, lCurList)
#                 print tranprob
#                 print lTerminalTemplates
                
                ## here we have a graph; second : is it useful here to correct noise??
                ## allows for selecting templates ?
                self.selectFinalTemplates(lTerminalTemplates,tranprob,lCurList)
                
                ## store parsed sequences in mytemplate
                ### patterns are competing: generate a set of parsing ??? 
                for mytemplate in lFullTemplates[:10]: # dTemplatesCnd.keys():
#                     for _,_, mytemplate in dTemplatesCnd[templateType][:1]:
                        mytemplate.print_()
#                         print '___'*30
#                         print lCurList
                        isKleenePlus,_,lCurList = seqGen.parseWithTreeTemplate(mytemplate,lCurList,bReplace=True)
                        for elt in lCurList:
                            if elt.getSetofFeatures() is None:
                                elt.resetFeatures()
                                elt.setFeatureFunction(elt.getSetOfListedAttributes,self.THNUMERICAL,lFeatureList=['virtual'],myLevel=XMLDSTEXTClass)
                                elt.computeSetofFeatures()
                                elt.lFeatureForParsing = elt.getSetofFeatures()
                            else:
                                elt.setSequenceOfFeatures(elt.lFeatureForParsing)                        
#                 self.printTreeView(lCurList)        
                                
                icpt +=1
#                 print 'curList:',lCurList
#                 print len(lCurList)
            print ("final hierarchy")
            self.printTreeView(lCurList)
#             lRegions= self.getRegionsFromStructure(page,lCurList)
                # store all interation
#             lPageRegions.append((page,lRegions,lCurList))
                
        return lPageRegions
    
            
    def selectFinalTemplates(self,lTemplates,transProb,lElts):
        """
            apply viterbi to select best sequence of templates

            Decode, over the element sequence, the best assignment of one
            template (or the extra "no template" state) per element:
              - transProb: template-to-template transition matrix (numpy)
              - observations: per (template, element) registration scores

            Side effects: appends None to lTemplates (the empty-template state)
            and, for each element, resets its template and adds the winning one.

            :param lTemplates: candidate templates; emptied list -> returns None
            :param transProb: transition probability matrix
            :param lElts: sequence of elements to label
            :return: the Viterbi path score, or None if no template
        """
        import spm.viterbi as viterbi        

        if  lTemplates == []:
            return None
        
        def buildObs(lTemplates,lElts):
            """
                build observation prob

                One row per template plus a final row for "no template".
                Scores come from template.registration(elt); the matrix is
                normalized by its maximum before being returned.
            """
            # +1 state for the "no template" hypothesis (its row keeps the epsilon)
            N = len(lTemplates) + 1
            # small epsilon so no observation probability is exactly zero
            obs = np.zeros((N,len(lElts)), dtype=np.float16) +10e-3
            for i,temp in enumerate(lTemplates):
                for j,elt in enumerate(lElts):
                    # how to dela with virtual nodes
                    # NOTE(review): a failed registration defaults the score to 1 — confirm intended
                    try:
                        _, _, score= temp.registration(elt)
                    except : score =1
                    # registration returns -1 for "no match": map it to 0 probability
                    if score == -1:
                        score= 0.0
                    obs[i,j]= score
                    # float16 can overflow to inf; clamp to a large finite value
                    if np.isinf(obs[i,j]):
                        obs[i,j] = 64000
                    if np.isnan(obs[i,j]):
                        obs[i,j] = 0.0                        
#                     print i,j,elt,elt.lX, temp,score 
                #add no-template:-1
            return obs / np.amax(obs)

        
        N= len(lTemplates) + 1
        
        # uniform initial distribution over all states
        initialProb = np.ones(N) 
        initialProb = np.reshape(initialProb,(N,1))
        obs = buildObs(lTemplates,lElts)
        
        np.set_printoptions(precision= 3, linewidth =1000)
#         print "transProb"
#         print transProb
#         print 
#         print obs
        
        d = viterbi.Decoder(initialProb, transProb, obs)
        states,score =  d.Decode(np.arange(len(lElts)))

        # add empty template (last one in state)
        lTemplates.append(None)
        print (states, score)

        #assign to each elt the template assigned by viterbi
        for i,elt, in enumerate(lElts):
#             try: print elt,elt.lX,  lTemplates[states[i]]
#             except: print elt, elt.lX, 'no template'
            mytemplate= lTemplates[states[i]]
            elt.resetTemplate()
            if mytemplate is not None:
                elt.addTemplate(mytemplate)
                try:
                    registeredPoints, lMissing, score= mytemplate.registration(elt)
                except:
                    registeredPoints = None
                if registeredPoints:
#                     print registeredPoints, lMissing , score
                    # pair each missing cut with itself so sorting stays uniform
                    if lMissing != []:
                        registeredPoints.extend(zip(lMissing,lMissing))
                    registeredPoints.sort(key=lambda xy:xy[1].getValue())
                    # NOTE(review): lcuts is a lazy map object and is never consumed below
                    lcuts = map(lambda refcut:refcut[1],registeredPoints)
                    ## store features for the final parsing!!!
#                     print elt, lcuts
#                     elt.addVSeparator(mytemplate,lcuts)

        # return the new list with kleenePlus elts for next iteration
        ## reparse ?? YES using the featureSet given by viterbi  -> create an objectClass per kleenePlus element: objects: sub tree
#         print lTemplates[0]
#         self.parseWithTemplate(lTemplates[0], lElts)
        # elt = template
        
        return score     
    
    
    def createItemSetFromNext(self,lElts,iLen):
        """
            create itemset of length iLen using .lnext structures 

            NOTE: stub — not implemented yet; implicitly returns None.
        """
    
    def getKleenePlusFeatures(self,lElts):
        """
            select KleenePlus elements based on .next (only possible for unigrams)

            A feature is considered "Kleene-plus" (repeatable) when it recurs
            between consecutive elements (elt -> elt.next) at least half as
            often as it occurs overall.

            :param lElts: elements exposing getSetofFeatures() and a .next list
            :return: list of Kleene-plus features (in first-seen order)
        """
        dFreqFeatures = {}        # feature -> total occurrence count
        dKleenePlusFeatures = {}  # feature -> list of (elt, nextElt) repetition pairs

        lKleenePlus = []
        for elt in lElts:
            for fea in elt.getSetofFeatures():
                dFreqFeatures[fea] = dFreqFeatures.get(fea, 0) + 1
                for nextE in elt.next:
                    if fea in nextE.getSetofFeatures():
                        dKleenePlusFeatures.setdefault(fea, []).append((elt, nextE))
        # keep features whose repetition count reaches 50% of their frequency
        for fea in dFreqFeatures:
            if fea in dKleenePlusFeatures and len(dKleenePlusFeatures[fea]) >= 0.5 * dFreqFeatures[fea]:
                lKleenePlus.append(fea)
        return lKleenePlus
    
    def computePatternScore(self,pattern):
        """
            consider the frequency of the pattern and the weights of the features

            Recursively accumulate the canonical weight of every terminal
            feature contained in the (possibly nested) pattern structure.
        """
        # terminal feature: its canonical weight is the whole score
        if not isinstance(pattern, list):
            return pattern.getCanonical().getWeight()
        # composite pattern: sum the scores of all sub-patterns
        return sum(self.computePatternScore(sub) for sub in pattern)
    
    def analyzeListOfPatterns(self,lPatterns,dCA,step=0):
        """
            select patterns with no ancestor
            other criteria ?

            Build a treeTemplateClass for each frequent pattern (support > 1)
            that has no ancestor entry in dCA.

            :param lPatterns: list of (pattern, support) pairs
            :param dCA: child->parent map (keys are str(pattern)); patterns
                        that have an ancestor are skipped
            :param step: mining iteration index (the caller passes icpt);
                         from step 1 on, unigram patterns are skipped since
                         they were already covered at step 0. Default 0 keeps
                         the previous two-argument behavior.
            :return: dict: template class name -> list of (pattern, support, template)
        """
        dTemplatesTypes = {}
        for pattern, support in filter(lambda xy: xy[1] > 1, lPatterns):
            # skip patterns subsumed by a larger (ancestor) pattern
            bSkip = str(pattern) in dCA
            # after the first iteration, unigrams bring nothing new
            if step > 0 and len(pattern) == 1:
                bSkip = True
            if not bSkip:
                template = treeTemplateClass()
                template.setPattern(pattern)
                template.buildTreeFromPattern(pattern)
                template.setType('lineTemplate')
                dTemplatesTypes.setdefault(template.__class__.__name__, []).append((pattern, support, template))

        return dTemplatesTypes
        
        
    
    def processWithTemplate(self,lPattern,lPages):
        """
            apply a known pattern 

            Convert a manually supplied (possibly nested) pattern of strings
            into feature objects, build a tree template from it, and parse
            the text elements of every page with that template.

            :param lPattern: nested lists of strings; numeric strings become
                             NUMERICAL x-features, others EDITDISTANCE features
            :param lPages: pages whose XMLDSTEXTClass objects are parsed
        """
        
        def convertStringtoPattern(xcur):
            ## need to integrate the 'virtual level'
            # Recursively mirror the nesting of xcur, converting each leaf
            # string into a featureObject.
            lRes=[]
            for elt in xcur:
                if isinstance(elt,list):
                    lRes.extend([convertStringtoPattern(elt)])
                else:
                    # numeric leaf -> NUMERICAL "x" feature with the numeric threshold
                    try:
                        float(elt)
                        f= featureObject()
                        f.setName("x")
                        f.setType(featureObject.NUMERICAL)
                        f.setValue(elt)
                        f.setObjectName(elt)
                        f.setWeight(1)
                        f.setTH(self.THNUMERICAL)                
                    except:
                        # non-numeric leaf -> edit-distance feature
                        f= featureObject() 
                        f.setName("f")
                        f.setType(featureObject.EDITDISTANCE)
                        f.setValue(elt)
                        f.setTH(100.0)                
                    lRes.append(f)                    
            
            return lRes
    
        lfPattern =convertStringtoPattern(lPattern)
#         print lfPattern
        # create a template from lfPattern!
        mytemplate = treeTemplateClass()
#         mytemplate.setPattern(lfPattern)
        mytemplate.buildTreeFromPattern(lfPattern)
        mytemplate.print_()
        for page in lPages:
            seqGen = sequenceMiner()
            seqGen.setObjectLevel(XMLDSTEXTClass)

            lElts= page.getAllNamedObjects(XMLDSTEXTClass)
            # compute x/x2/text features for every text element before parsing
            for elt in lElts:
                elt.setFeatureFunction(elt.getSetOfListedAttributes,self.THNUMERICAL,lFeatureList=['x','x2','text'],myLevel=XMLDSTEXTClass)
                elt.computeSetofFeatures()
                elt.lFeatureForParsing=elt.getSetofFeatures()    
            
            lParsing,lNewSeq = seqGen.parseWithTreeTemplate(mytemplate,lElts,bReplace=True)
            
            del  seqGen
            self.printTreeView(lNewSeq)
            # process lNewSeq: create the output data structure?
    
    def printTreeView(self,lElts,level=0):
        """
            move to structuralMining? 

            Dump the element hierarchy to stdout, indenting two spaces per
            nesting level; virtual nodes recurse into their children, leaves
            print their content.
        """
        indent = "  " * level
        for node in lElts:
            virtual = node.getAttribute('virtual')
            if virtual:
                print(indent, 'Node', virtual)
                self.printTreeView(node.getObjects(), level + 1)
            else:
                print(indent, node.getContent())
    
    def getPatternGraph(self,lRules):
        """
            create an graph which links expanded patterns
            (a) -> (ab)
            (abc) -> (abcd)

            Each rule is (newPattern, item, itemsetIndex, fullpattern, fConfidence);
            e.g. RULE: [['x=19.0', 'x=48.0']] => 'x=464.0'[0] (conf 0.86)

            Can be used for tagging: walk up until a pattern has no parent.

            :return: (dParentChild, dChildParent) — both keyed by str(pattern)
        """
        dParentChild = {}
        dChildParent = {}
        for lhs, _rhs, _itemsetIndex, fullpattern, _fConfidence in lRules:
            dParentChild.setdefault(str(fullpattern), []).append(lhs)
            dChildParent.setdefault(str(lhs), []).append(fullpattern)

        return dParentChild, dChildParent
        
        
    
    def generateTestOutput(self,lPages):
        """
            create a run XML file

            Serialize the detected vertical separators of each page into an
            evaluation tree: DOCUMENT / PAGE[@number] / SeparatorRegion[@x].
            Stores the tree in self.evalData and returns it.
        """
        root = etree.Element('DOCUMENT')
        self.evalData = etree.ElementTree(root)
        for page in lPages:
            pageNode = etree.Element('PAGE')
            pageNode.set('number', page.getAttribute('number'))
            root.append(pageNode)
            for sep in page.lVSeparator:
                sepNode = etree.Element('SeparatorRegion')
                pageNode.append(sepNode)
                sepNode.set('x', str(sep[0]))

        return self.evalData
        
    def getRegionsFromStructure(self,page,lTree):
        """
            tag the dom with what?

            Recursively compute the bounding box of a (possibly nested)
            element tree: returns [xMin, yMin, xMax, yMax, lSubZones], where
            lSubZones lists the boxes of the virtual sub-structures (empty
            when no virtual child contributed coordinates).
        """
        lLeaves = []
        lSubZone = []
        # bounds contributed by virtual (recursive) children only
        vMinX = vMinY = 9e9
        vMaxX = vMaxY = 0
        for node in lTree:
            if node.getAttribute('virtual'):
                rx, ry, rx2, ry2, lsub = self.getRegionsFromStructure(page, node.getObjects())
                vMinX = min(vMinX, rx)
                vMinY = min(vMinY, ry)
                vMaxX = max(vMaxX, rx2)
                vMaxY = max(vMaxY, ry2)
                lSubZone.append([rx, ry, rx2, ry2, lsub])
            else:
                lLeaves.append(node)

        # grow the box with the leaf elements
        fMinX, fMinY = vMinX, vMinY
        fMaxX, fMaxY = vMaxX, vMaxY
        for leaf in lLeaves:
            fMaxX = max(fMaxX, leaf.getX2())
            fMaxY = max(fMaxY, leaf.getY2())
            fMinX = min(fMinX, leaf.getX())
            fMinY = min(fMinY, leaf.getY())

        # report sub-structures only when a virtual child contributed bounds
        if vMinX != 9e9:
            return [fMinX, fMinY, fMaxX, fMaxY, lSubZone]
        return [fMinX, fMinY, fMaxX, fMaxY, []]

    def tagDom(self,page,region):
        """
            tag page with region (x,y,x2,y2)

            Append a REGION node to the page's DOM node for the given region
            [xMin, yMin, xMax, yMax, lSubRegions], then recurse on the
            sub-regions. Returns the created node.
        """
        fMinX, fMinY, fMaxX, fMaxY, lChildren = region
        # new region node under the page
        regionNode = etree.Element('REGION')
        page.getNode().append(regionNode)
        regionNode.set('x', str(fMinX))
        regionNode.set('y', str(fMinY))
        regionNode.set('height', str(fMaxY - fMinY))
        regionNode.set('width', str(fMaxX - fMinX))
        # debug trace
        print()
        print(region)
        print(regionNode)
        for child in lChildren:
            self.tagDom(page, child)

        return regionNode
            
    #--- RUN ---------------------------------------------------------------------------------------------------------------    
    def run(self):
        """
            take a set of line in a page and mine it

            Load self.doc (pages firstPage..lastPage) into an XMLDSDocument,
            then either apply the manual pattern (bManual) or run the
            automatic line mining. Tags the processing step into the
            document metadata and returns the DOM.
        """
        # use the lite version
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(self.doc,listPages=range(self.firstPage,self.lastPage+1))        

        self.lPages= self.ODoc.getPages()   
        
        if self.bManual:
            # user-provided pattern: skip mining entirely
            self.processWithTemplate(self.manualPattern,self.lPages)
        else:
            
            self.mainLineMining(self.lPages)
#             lRes = self.mineLineFeature(self.lPages)
#             print lRes
            # returns the hierarchical set of elements (a list)
#             for page , region, tree in lRes:
#                 self.tagDom(page, region)
#                 return 
        
        self.addTagProcessToMetadata(self.doc)
        
        return self.doc 

    #--- TESTS -------------------------------------------------------------------------------------------------------------    
    #
    # Here we have the code used to test this component on a prepared testset (see under <ROOT>/test/common)
    # Do: python ../../src/common/TypicalComponent.py --test REF_TypicalComponent/
    #
    def testeval(self,srefData,srunData, bVisual):
        """
            Test found reftemplate and reftemplate

            Compare the 'reftemplate' attributes of PAGE nodes in the run
            output against the reference, pairing each run element with an
            uncovered reference element via testComparePageVertical.

            :param srefData: reference XML as a string
            :param srunData: run XML as a string
            :param bVisual: print FOUND/ERROR lines when True
            :return: (cntOk, cntErr, cntMissed, ltisRefsRunbErrbMiss)
        """ 
        cntOk = cntErr = cntMissed = 0
        RefData = etree.XML(srefData.strip("\n").encode('utf-8'))
        RunData = etree.XML(srunData.strip("\n").encode('utf-8'))        
         
        # collect run templates per page
        # NOTE(review): eval() on document attributes — only safe on trusted data
        lRun = []
        if RunData is not None:
            lpages = RunData.xpath('//%s' % ('PAGE'))
            for page in lpages:
                if page.get('reftemplate'):
                    lRun.append(eval(page.get('reftemplate')))

        # collect reference templates per page (empty list when absent)
        lRef = []
        lPages = RefData.xpath('//%s' % ('PAGE'))
        for page in lPages:
            if page.get('reftemplate'):
                lRef.append(eval(page.get('reftemplate')))
            else: lRef.append([])
 
        ltisRefsRunbErrbMiss= list()
        for i in range(0,len(lRef)):
            lRefCovered = []
            # a template is stored as [pos, ..., elements]: element list sits at index pos+1
            if lRun[i] ==[]:
                runLen=0
            else:
                posrun = lRun[i][0]
                runLen = len(lRun[i][posrun+1])
            if lRef[i]==[]:
                refLen=0
                refElt=None
                posref=None
            else:
                posref=lRef[i][0]
                refLen= len(lRef[i][posref+1])
            curRun = curRef = 0
            while curRun <= runLen - 1:  # or curRef <= refLen -1:
                bErr, bMiss = False, False
                try:
                    runElt = lRun[i][posrun+1][curRun]
                except IndexError: runElt = None
    #             print '___',curRun,runElt
                # scan the reference list for the first uncovered match
                curRef = 0
                bFound = False
                while not bFound and curRef <= refLen - 1:
                    try: refElt = lRef[i][posref+1][curRef]
                    except IndexError: refElt = None
    #                 self.compareString(runElt,runElt)
                    if runElt and refElt not in lRefCovered and self.testComparePageVertical(runElt, refElt):
                        bFound = True
                        lRefCovered.append(refElt)
                        resRef=refElt
                    else:
                        curRef += 1
                if bFound:
                    if bVisual:print ("FOUND:", runElt, ' -- ', lRefCovered[-1])
                    cntOk += 1
                    curRun += 1
                else:
                    # run element with no reference match -> error
                    resRef=''
                    curRun += 1
                    cntErr += 1
                    bErr = True
#                     bMiss = True
                    if bVisual:print ("ERROR:", runElt)
                ltisRefsRunbErrbMiss.append( (i, resRef, runElt,bErr, bMiss) )
             
            # any reference element never covered counts as missed
            if posref is not None:
                for ref in lRef[i][posref+1]:
                    if ref not in lRefCovered:
                        ltisRefsRunbErrbMiss.append( (i, ref, '',False, True) )
                        # add missed elements!
#                         print 'missed', len(lRef[i][posref+1]) , len(lRefCovered), lRef[i][posref+1], lRefCovered
                        cntMissed += 1#(len(lRef[i][posref+1]) - len(lRefCovered))
 
        # sort results by page index
        ltisRefsRunbErrbMiss.sort(key=lambda x_y_z_t_u:x_y_z_t_u[0])

        return (cntOk, cntErr, cntMissed,ltisRefsRunbErrbMiss)              
    def testRun(self, filename, outFile=None):
        """
        testRun is responsible for running the component on this file and returning a string that reflects the result in a way
        that is understandable to a human and to a program. Nicely serialized Python data or XML is fine

        :param filename: input document to load and process
        :param outFile: when set, the processed DOM is also written out
        :return: the evaluation XML serialized as a unicode string
        """
        
        doc = self.loadDom(filename)
        self.doc= doc
        self.run()
#         doc.freeDoc()
        # build self.evalData from the processed pages
        self.generateTestOutput(self.lPages)

        if outFile: self.writeDom(doc)
        return etree.tostring(self.evalData,encoding='unicode')
        
    def testCompare(self, srefData, srunData, bVisual=False):
        """
        Our comparison is very simple: same or different. N
        We anyway return this in term of precision/recall
        If we want to compute the error differently, we must define out own testInit testRecord, testReport
        """
        dicTestByTask = dict()
        dicTestByTask['EVAL']= self.testeval(srefData, srunData,bVisual)
        return dicTestByTask
Exemplo n.º 12
0
class columnDetection(Component.Component):
    """
        build table column 
    """
    usage = ""
    version = "v.01"
    description = "description: column Detection"

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self):
        """
        Always call first the Component constructor.
        """
        Component.Component.__init__(self, "columnDetection", self.usage,
                                     self.version, self.description)

        self.colname = None  # collection name (coldir handling is commented out in setParams)
        self.docid = None  # document identifier, settable via the 'docid' param

        self.do2DS = False  # when True, convert input to DS format before processing

        # for --test
        self.bCreateRef = False  # when True, run() only builds the reference output
        self.evalData = None  # evaluation XML produced by testRun()

    def setParams(self, dParams):
        """
        Always call first the Component setParams
        Here, we set our internal attribute according to a possibly specified value (otherwise it stays at its default value)

        Recognized keys: 'docid', 'dsconv', 'createref'.
        """
        Component.Component.setParams(self, dParams)
        #         if dParams.has_key("coldir"):
        #             self.colname = dParams["coldir"]
        if "docid" in dParams:
            self.docid = dParams["docid"]
        # 'dsconv': request PageXml -> DS conversion
        if "dsconv" in dParams:
            self.do2DS = dParams["dsconv"]

        # 'createref': reference-generation mode for evaluation
        if "createref" in dParams:
            self.bCreateRef = dParams["createref"]

#     def createCells(self, table):
#         """
#             create new cells using BIESO tags
#             @input: tableObeject with old cells
#             @return: tableObject with BIES cells
#             @precondition: requires columns
#
#         """
#         for col in table.getColumns():
#             lNewCells=[]
#             # keep original positions
#             col.resizeMe(XMLDSTABLECELLClass)
#             for cell in col.getCells():
# #                 print cell
#                 curChunk=[]
#                 lChunks = []
# #                 print map(lambda x:x.getAttribute('type'),cell.getObjects())
# #                 print map(lambda x:x.getID(),cell.getObjects())
#                 cell.getObjects().sort(key=lambda x:x.getY())
#                 for txt in cell.getObjects():
# #                     print txt.getAttribute("type")
#                     if txt.getAttribute("type") == 'RS':
#                         if curChunk != []:
#                             lChunks.append(curChunk)
#                             curChunk=[]
#                         lChunks.append([txt])
#                     elif txt.getAttribute("type") in ['RI', 'RE']:
#                         curChunk.append(txt)
#                     elif txt.getAttribute("type") == 'RB':
#                         if curChunk != []:
#                             lChunks.append(curChunk)
#                         curChunk=[txt]
#                     elif txt.getAttribute("type") == 'RO':
#                         ## add Other as well???
#                         curChunk.append(txt)
#
#                 if curChunk != []:
#                     lChunks.append(curChunk)
#
#                 if lChunks != []:
#                     # create new cells
#                     table.delCell(cell)
#                     irow= cell.getIndex()[0]
#                     for i,c in enumerate(lChunks):
# #                         print map(lambda x:x.getAttribute('type'),c)
#                         #create a new cell per chunk and replace 'cell'
#                         newCell = XMLDSTABLECELLClass()
#                         newCell.setPage(cell.getPage())
#                         newCell.setParent(table)
#                         newCell.setName(ds_xml.sCELL)
#                         newCell.setIndex(irow+i,cell.getIndex()[1])
#                         newCell.setObjectsList(c)
#                         newCell.resizeMe(XMLDSTEXTClass)
#                         newCell.tagMe2()
#                         for o in newCell.getObjects():
#                             o.setParent(newCell)
#                             o.tagMe()
# #                         table.addCell(newCell)
#                         lNewCells.append(newCell)
#                     cell.getNode().getparent().remove(cell.getNode())
#                     del(cell)
#             col.setObjectsList(lNewCells[:])
#             [table.addCell(c) for c in lNewCells]
#
# #             print col.tagMe()

    def createTable(self, page):
        """
            BB of all elements?

            NOTE: stub — not implemented yet; implicitly returns None.
        """

    def processPage(self, page):
        """
        Detect candidate column cuts on one page by merging the x-projections
        of its text elements.

        NOTE(review): the +20 widens each element's x-interval — presumably a
        shrink/tolerance in page units; confirm against mergeSegments.
        """
        from util.XYcut import mergeSegments

        ### shrinking to be done:
        lCuts, x1, x2 = mergeSegments(
            [(x.getX(), x.getX() + 20, x)
             for x in page.getAllNamedObjects(XMLDSTEXTClass)], 0)
        # each cut carries the set of elements it groups; trace their count
        for x, y, cut in lCuts:
            ll = list(cut)
            ll.sort(key=lambda x: x.getY())
            traceln(len(ll))
#             traceln (list(map(lambda x:x.getContent(),ll)))

    def findColumnsInDoc(self, ODoc):
        """
        find columns for each table in ODoc

        Iterate the document pages, tracing each page number and delegating
        the actual column detection to processPage().
        """
        lPages = ODoc.getPages()
        self.lPages = lPages

        # not always?
        #         self.mergeLineAndCells(self.lPages)

        for pg in lPages:
            traceln("page: %d" % pg.getNumber())
            self.processPage(pg)

    def run(self, doc):
        """
           load dom and find rows 

           :param doc: input DOM (PageXml, or DS when do2DS is False)
           :return: the processed DS document; a reference document when
                    bCreateRef is set; None when the result was converted
                    back to multi-page PageXml and stored on disk.
        """
        # reference-creation mode: optionally convert to DS, build the ref and stop
        if self.bCreateRef:
            if self.do2DS:
                dsconv = primaAnalysis()
                doc = dsconv.convert2DS(doc, self.docid)

            refdoc = self.createRef(doc)
            return refdoc
            # (dead code removed: a per-page reference variant
            #  self.createRefPerPage(doc) was unreachable after the return)

        # normal mode: make sure we work on a DS document
        if self.do2DS:
            dsconv = primaAnalysis()
            self.doc = dsconv.convert2DS(doc, self.docid)
        else:
            self.doc = doc
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(self.doc,
                              listPages=range(self.firstPage,
                                              self.lastPage + 1))
        #         self.ODoc.loadFromDom(self.doc,listPages = range(30,31))

        self.findColumnsInDoc(self.ODoc)
        # build the reference for evaluation (result not needed here)
        self.createRef(self.doc)

        if self.do2DS:
            # back to PageXml
            conv = DS2PageXMLConvertor()
            lPageXDoc = conv.run(self.doc)
            conv.storeMultiPageXml(lPageXDoc, self.getOutputFileName())
            return None
        return self.doc

    ################ TEST ##################

    def testRun(self, filename, outFile=None):
        """
        evaluate using ABP new table dataset with tablecell

        :param filename: input document to load and process
        :param outFile: when set, the processed DOM is also written out
        :return: the reference XML serialized as a unicode string
        """

        self.evalData = None
        doc = self.loadDom(filename)
        doc = self.run(doc)
        # the evaluation data is the reference built from the processed doc
        self.evalData = self.createRef(doc)
        if outFile: self.writeDom(doc)
        #         return self.evalData.serialize('utf-8',1)
        return etree.tostring(self.evalData,
                              encoding='unicode',
                              pretty_print=True)

    def overlapX(self, zone):
        """True when self and zone overlap horizontally (inclusive bounds)."""
        selfStart, selfEnd = self.getX(), self.getX() + self.getWidth()
        zoneStart, zoneEnd = zone.getX(), zone.getX() + zone.getWidth()
        # intervals overlap iff each one starts no later than the other ends
        return min(selfEnd, zoneEnd) >= max(selfStart, zoneStart)

    def overlapY(self, zone):
        """True when self and zone overlap vertically (inclusive bounds)."""
        selfStart, selfEnd = self.getY(), self.getY() + self.getHeight()
        zoneStart, zoneEnd = zone.getY(), zone.getY() + zone.getHeight()
        # intervals overlap iff each one starts no later than the other ends
        return min(selfEnd, zoneEnd) >= max(selfStart, zoneStart)

    def signedRatioOverlap(self, z1, z2):
        """
         overlap self and zone
         return surface of self in zone 

         Fraction of z1's surface covered by the intersection with z2
         (0.0 when they do not overlap).

         NOTE(review): the quick-rejection test uses self.overlapX/overlapY
         against z2, i.e. it implicitly assumes z1 is self — confirm callers.
        """
        [x1, y1, h1, w1] = z1.getX(), z1.getY(), z1.getHeight(), z1.getWidth()
        [x2, y2, h2, w2] = z2.getX(), z2.getY(), z2.getHeight(), z2.getWidth()

        fOverlap = 0.0

        if self.overlapX(z2) and self.overlapY(z2):
            # corner coordinates of both boxes
            [x11, y11, x12, y12] = [x1, y1, x1 + w1, y1 + h1]
            [x21, y21, x22, y22] = [x2, y2, x2 + w2, y2 + h2]

            s1 = w1 * h1

            # possible ?
            if s1 == 0: s1 = 1.0

            #intersection
            nx1 = max(x11, x21)
            nx2 = min(x12, x22)
            ny1 = max(y11, y21)
            ny2 = min(y12, y22)
            # NOTE(review): names are swapped (h spans x, w spans y) but only
            # their product is used, so the area is still correct
            h = abs(nx2 - nx1)
            w = abs(ny2 - ny1)

            inter = h * w
            if inter > 0:
                fOverlap = inter / s1
            else:
                # if overX and Y this is not possible !
                fOverlap = 0.0

        return fOverlap

    def findSignificantOverlap(self, TOverlap, ref, run):
        """
            True when the ref and run rows lie on the same page and overlap
            by at least TOverlap.

            ref/run are (pageNumber, rowObject) pairs.
        """
        refPage, refRow = ref
        runPage, runRow = run
        # rows on different pages can never match
        if refPage != runPage:
            return False
        return refRow.ratioOverlap(runRow) >= TOverlap

    def testCPOUM(self, TOverlap, srefData, srunData, bVisual=False):
        """
            TOverlap: Threshols used for comparing two surfaces
            
            
            Correct Detections:
            under and over segmentation?

            Greedily match run ROW regions to reference ROW regions with at
            least TOverlap overlap on the same page.

            :param TOverlap: minimum overlap ratio for a match
            :param srefData: reference XML as a string
            :param srunData: run XML as a string
            :param bVisual: print FOUND/ERROR/MISSED lines when True
            :return: (cntOk, cntErr, cntMissed, ltisRefsRunbErrbMiss)
        """

        cntOk = cntErr = cntMissed = 0

        RefData = etree.XML(srefData.strip("\n").encode('utf-8'))
        RunData = etree.XML(srunData.strip("\n").encode('utf-8'))
        #         try:
        #             RunData = libxml2.parseMemory(srunData.strip("\n"), len(srunData.strip("\n")))
        #         except:
        #             RunData = None
        #             return (cntOk, cntErr, cntMissed)
        # collect (pageNumber, row) pairs from the run output
        lRun = []
        if RunData:
            lpages = RunData.xpath('//%s' % ('PAGE'))
            for page in lpages:
                pnum = page.get('number')
                #record level!
                lRows = page.xpath(".//%s" % ("ROW"))
                lORows = map(lambda x: XMLDSTABLEROWClass(0, x), lRows)
                for row in lORows:
                    row.fromDom(row._domNode)
                    row.setIndex(row.getAttribute('id'))
                    lRun.append((pnum, row))
        # NOTE(review): debug print left in — consider removing
        print(lRun)

        # collect (pageNumber, row) pairs from the reference
        lRef = []
        lPages = RefData.xpath('//%s' % ('PAGE'))
        for page in lPages:
            pnum = page.get('number')
            lRows = page.xpath(".//%s" % ("ROW"))
            lORows = map(lambda x: XMLDSTABLEROWClass(0, x), lRows)
            for row in lORows:
                row.fromDom(row._domNode)
                row.setIndex(row.getAttribute('id'))
                lRef.append((pnum, row))

        refLen = len(lRef)
        #         bVisual = True
        ltisRefsRunbErrbMiss = list()
        lRefCovered = []
        # greedy first-match assignment of run rows to uncovered ref rows
        for i in range(0, len(lRun)):
            iRef = 0
            bFound = False
            bErr, bMiss = False, False
            runElt = lRun[i]
            #             print '\t\t===',runElt
            while not bFound and iRef <= refLen - 1:
                curRef = lRef[iRef]
                if runElt and curRef not in lRefCovered and self.findSignificantOverlap(
                        TOverlap, runElt, curRef):
                    bFound = True
                    lRefCovered.append(curRef)
                iRef += 1
            if bFound:
                if bVisual: print("FOUND:", runElt, ' -- ', lRefCovered[-1])
                cntOk += 1
            else:
                # run row with no reference match -> error
                curRef = ''
                cntErr += 1
                bErr = True
                if bVisual: print("ERROR:", runElt)
            if bFound or bErr:
                ltisRefsRunbErrbMiss.append(
                    (int(runElt[0]), curRef, runElt, bErr, bMiss))

        # any reference row never covered counts as missed
        for i, curRef in enumerate(lRef):
            if curRef not in lRefCovered:
                if bVisual: print("MISSED:", curRef)
                ltisRefsRunbErrbMiss.append(
                    (int(curRef[0]), curRef, '', False, True))
                cntMissed += 1
        # sort results by page number
        ltisRefsRunbErrbMiss.sort(key=lambda xyztu: xyztu[0])

        #         print cntOk, cntErr, cntMissed,ltisRefsRunbErrbMiss
        return (cntOk, cntErr, cntMissed, ltisRefsRunbErrbMiss)

    def testCompare(self, srefData, srunData, bVisual=False):
        """
        Evaluate run data against reference data, as in Shahad et al., DAS 2010.

        Reported categories:
            Correct Detections, Partial Detections, Over-Segmented,
            Under-Segmented, Missed, False Positive

        Returns a dict mapping each task label to its evaluation tuple.
        """
        dicTestByTask = {
            'T50': self.testCPOUM(0.50, srefData, srunData, bVisual),
        }
        # Additional thresholds / record-level tests, currently disabled:
        #   dicTestByTask['T75'] = self.testCPOUM(0.750, srefData, srunData, bVisual)
        #   dicTestByTask['T100'] = self.testCPOUM(0.50, srefData, srunData, bVisual)
        #   dicTestByTask['FirstName'] = self.testFirstNameRecord(srefData, srunData, bVisual)
        #   dicTestByTask['Year'] = self.testYear(srefData, srunData, bVisual)
        return dicTestByTask

    def createColumnsWithCuts(self, lXCuts, table, tableNode, bTagDoc=False):
        """
        Append one <COL> child to tableNode per column delimited by lXCuts.

        lXCuts: x coordinates of the left edge of each column (sorted in place).
        table: object providing getX2(), getY(), getHeight() for the table bounds.
        tableNode: XML element receiving the <COL> children.
        bTagDoc: unused, kept for interface compatibility.

        N cuts yield N columns: one per consecutive pair of cuts, plus a last
        column from the final cut to table.getX2().

        Bug fix: an empty lXCuts is now a no-op (the original raised NameError
        because the loop variable 'index' was never bound).
        """
        if not lXCuts:
            return

        lXCuts.sort()
        prevCut = None
        for index, cut in enumerate(lXCuts):
            # the first cut only opens the first column; nothing is emitted yet
            if prevCut is not None:
                colNode = etree.Element("COL")
                tableNode.append(colNode)
                colNode.set('x', str(prevCut))
                colNode.set('width', "{:.2f}".format(cut - prevCut))
                colNode.set('y', str(table.getY()))
                colNode.set('height', str(table.getHeight()))
                colNode.set('id', str(index - 1))
            prevCut = cut

        # last column: from the last cut to the right edge of the table
        cut = table.getX2()
        colNode = etree.Element("COL")
        tableNode.append(colNode)
        colNode.set('x', "{:.2f}".format(prevCut))
        colNode.set('width', "{:.2f}".format(cut - prevCut))
        colNode.set('y', str(table.getY()))
        colNode.set('height', str(table.getHeight()))
        colNode.set('id', str(index))

    def createRef(self, doc):
        """
        Build a reference DOM (DOCUMENT/PAGE/TABLE/COL) from the DS xml document.

        For every table, the x cut of each column is the minimum x of the
        cells assigned to that column index.
        """
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(doc,
                              listPages=range(self.firstPage,
                                              self.lastPage + 1))

        root = etree.Element("DOCUMENT")
        refdoc = etree.ElementTree(root)

        for page in self.ODoc.getPages():
            pageNode = etree.Element('PAGE')
            pageNode.set("number", page.getAttribute('number'))
            pageNode.set("pagekey",
                         os.path.basename(page.getAttribute('imageFilename')))
            pageNode.set("width", page.getAttribute('width'))
            pageNode.set("height", page.getAttribute('height'))
            root.append(pageNode)

            for table in page.getAllNamedObjects(XMLDSTABLEClass):
                tableNode = etree.Element('TABLE')
                # copy the table geometry over verbatim
                for attr in ('x', 'y', 'width', 'height'):
                    tableNode.set(attr, table.getAttribute(attr))
                pageNode.append(tableNode)

                # group the table's cells by their column index
                dCol = {}
                for cell in table.getAllNamedObjects(XMLDSTABLECELLClass):
                    dCol.setdefault(int(cell.getAttribute("col")),
                                    []).append(cell)

                # one x cut per column: the leftmost cell x
                lXcuts = [min(c.getX() for c in dCol[colid])
                          for colid in sorted(dCol)]
                self.createColumnsWithCuts(lXcuts, table, tableNode)

        return refdoc

    def createRefPerPage(self, doc):
        """
        Create one reference file per page (DAS 2018 setup: one graph per page).

        Each written DOCUMENT holds a single PAGE (renumbered to '1') with its
        tables; the y cut of each row is the minimum y of the cells assigned
        to that row index.

        Bug fix: dRows is now reset for each table, mirroring dCol in
        createRef. It was previously a single dict shared across all tables
        and pages, so the row cuts of every page accumulated the cells of all
        previously processed pages.

        Returns the ElementTree of the last page processed (None if no page).
        """
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(doc,
                              listPages=range(self.firstPage,
                                              self.lastPage + 1))

        refdoc = None
        for page in self.ODoc.getPages():
            pageNode = etree.Element('PAGE')
            # single-page ref: the page is always numbered 1
            pageNode.set("number", '1')
            pageNode.set("imageFilename", page.getAttribute('imageFilename'))
            pageNode.set("width", page.getAttribute('width'))
            pageNode.set("height", page.getAttribute('height'))

            root = etree.Element("DOCUMENT")
            refdoc = etree.ElementTree(root)
            root.append(pageNode)

            for table in page.getAllNamedObjects(XMLDSTABLEClass):
                tableNode = etree.Element('TABLE')
                tableNode.set("x", table.getAttribute('x'))
                tableNode.set("y", table.getAttribute('y'))
                tableNode.set("width", table.getAttribute('width'))
                tableNode.set("height", table.getAttribute('height'))
                pageNode.append(tableNode)

                # group THIS table's cells by their row index
                dRows = {}
                for cell in table.getAllNamedObjects(XMLDSTABLECELLClass):
                    dRows.setdefault(int(cell.getAttribute("row")),
                                     []).append(cell)

                # one y cut per row: the topmost cell y
                lYcuts = [min(c.getY() for c in dRows[rowid])
                          for rowid in sorted(dRows)]
                self.createRowsWithCuts(lYcuts, table, tableNode)

            # one output file per page: <image basename minus extension>ref
            self.outputFileName = os.path.basename(
                page.getAttribute('imageFilename')[:-3] + 'ref')
            print(self.outputFileName)
            self.writeDom(refdoc, bIndent=True)

        return refdoc
# --- Exemplo n.º 13 (Example no. 13, score 0) ---
class TableProcessing(Component.Component):
    """
    Table layout analysis workflow driver built on Transkribus.

    Orchestrates: collection download, layout analysis (CITlab line finding),
    BIES row tagging, DS conversion, row detection, PageXml regeneration,
    transcript upload and HTR decoding. Several stages of processDocument
    are currently disabled by early 'return' statements.
    """
    usage = ""
    version = "v.01"
    description = "description: table layout analysis based on template"

    # sub-folder of a collection directory that holds the transcripts
    sCOL = "col"
    sMPXMLExtension = ".mpxml"

    def __init__(self):
        """
        Always call first the Component constructor.
        """
        Component.Component.__init__(self, "TableProcessing", self.usage,
                                     self.version, self.description)

        self.colid = None  # Transkribus collection id
        self.docid = None  # Transkribus document id (None = whole collection)

        self.bFullCol = False
        # generate MPXML using Ext
        self.useExtForMPXML = False

        self.bRegenerateMPXML = False

        # row-tagging CRF model (name + directory)
        self.sRowModelName = None
        self.sRowModelDir = None

        # HTR model and its dictionary
        self.sHTRmodel = None
        self.sDictName = None

    def setParams(self, dParams):
        """
        Always call first the Component setParams
        Here, we set our internal attribute according to a possibly specified
        value (otherwise it stays at its default value)
        """
        Component.Component.setParams(self, dParams)
        if "coldir" in dParams:
            self.coldir = dParams["coldir"]
        if "colid" in dParams:
            self.colid = dParams["colid"]
        # NOTE(review): the key tested is "colid" but the value read is
        # "docid" -- kept as-is since callers appear to always pass both;
        # confirm whether the test should be "docid" in dParams.
        if "colid" in dParams:
            self.docid = dParams["docid"]
        if "useExt" in dParams:
            self.useExtForMPXML = dParams["useExt"]

        # presence of the key is enough: the flag value itself is ignored
        if 'regMPXML' in dParams:
            self.bRegenerateMPXML = True

        if "rowmodelname" in dParams:
            self.sRowModelName = dParams["rowmodelname"]
        if "rowmodeldir" in dParams:
            self.sRowModelDir = dParams["rowmodeldir"]

        if "htrmodel" in dParams:
            self.sHTRmodel = dParams["htrmodel"]
        if "dictname" in dParams:
            self.sDictName = dParams["dictname"]

        # Connection to Transkribus
        self.myTrKCient = None
        self.persist = False
        self.loginInfo = False
        # Python-3 fix: dict.has_key() no longer exists -- use 'in'
        if "server" in dParams:
            self.server = dParams["server"]
        if "persist" in dParams:
            self.persist = dParams["persist"]
        if "login" in dParams:
            self.loginInfo = dParams["login"]

    def login(self, trnskrbs_client, trace=None, traceln=None):
        """
        deal with the complicated login variants...
            -trace and traceln are optional print methods
        return True or raises an exception
        """
        DEBUG = True
        bOk = False
        if self.persist:
            # try getting some persistent session token
            if DEBUG and trace:
                trace("  ---login--- Try reusing persistent session ... ")
            try:
                bOk = trnskrbs_client.reusePersistentSession()
                if DEBUG and traceln: traceln("OK!")
            except Exception:
                # best-effort: fall through to a fresh login below
                if DEBUG and traceln: traceln("Failed")

        if not bOk:
            if self.loginInfo:
                # NOTE(review): self.pwd is not set anywhere in this class --
                # presumably provided by the CLI option handling; confirm.
                login, pwd = self.loginInfo, self.pwd
            else:
                if trace:
                    DEBUG and trace(
                        "  ---login--- no login provided, looking for stored credentials... "
                    )
                login, pwd = trnskrbs_client.getStoredCredentials(bAsk=False)
                if DEBUG and traceln: traceln("OK")

            # bug fix: guard on the method actually called (trace, not traceln)
            if DEBUG and trace:
                trace("  ---login--- logging onto Transkribus as %s " % login)
            trnskrbs_client.auth_login(login, pwd)
            if DEBUG and traceln: traceln("OK")
            bOk = True

        return bOk

    def downloadCollection(self,
                           colid,
                           destDir,
                           docid,
                           bNoImg=True,
                           bForce=False):
        """
            download colID

            Downloads the collection (or the single docid) into the current
            directory, records the connection settings in config.txt and
            regenerates the multi-page xml files.

            replace destDir by '.'  ?
        """
        # NOTE(review): destDir is deliberately overridden -- the caller's
        # value is ignored; confirm this is still wanted.
        destDir = "."
        downloader = TranskribusDownloader(self.myTrKCient.getServerUrl(),
                                           self.myTrKCient.getProxies())
        downloader.setSessionId(self.myTrKCient.getSessionId())
        traceln("- Downloading collection %s to folder %s" %
                (colid, os.path.abspath(destDir)))
        col_ts, colDir, ldocids, dFileListPerDoc = downloader.downloadCollection(
            colid, destDir, bForce=bForce, bNoImage=bNoImg, sDocId=docid)
        traceln("- Done")

        # keep a trace of the connection settings used for this download
        with open(os.path.join(colDir, "config.txt"), "w") as fd:
            fd.write("server=%s\nforce=%s\nstrict=%s\n" %
                     (self.server, True, False))

        downloader.generateCollectionMultiPageXml(
            os.path.join(colDir, TableProcessing.sCOL), dFileListPerDoc, False)

        traceln('- Done, see in %s' % colDir)

        return ldocids

    def upLoadDocument(self,
                       colid,
                       coldir,
                       docid,
                       sNote="",
                       sTranscripExt='.mpxml'):
        """
            upload the transcripts of docid to collection colid
        """
        uploader = TranskribusDUTranscriptUploader(
            self.myTrKCient.getServerUrl(), self.myTrKCient.getProxies())
        uploader.setSessionId(self.myTrKCient.getSessionId())
        traceln("- uploading document %s to collection %s" % (docid, colid))
        # bug fix: bare 'sCOL' was an unresolved name -- use the class constant
        uploader.uploadDocumentTranscript(colid,
                                          docid,
                                          os.path.join(
                                              coldir, TableProcessing.sCOL),
                                          sNote,
                                          'NLE Table',
                                          sTranscripExt,
                                          iVerbose=False)
        traceln("- Done")
        return

    def applyLA_URO(self, colid, docid, nbpages):
        """
        apply textline finder (CITlabAdvancedLaJob), one job per page

        Returns the list of job ids launched.
        """
        traceln('process %s pages...' % nbpages)
        lretJobIDs = []
        for i in range(1, nbpages + 1):
            LA = DoLAbatch(self.myTrKCient.getServerUrl(),
                           self.myTrKCient.getProxies())
            LA._trpMng.setSessionId(self.myTrKCient.getSessionId())
            LA.setSessionId(self.myTrKCient.getSessionId())
            _, sPageDesc = LA.buildDescription(colid, "%s/%s" % (docid, i))
            sPageDesc = LA.jsonToXMLDescription(sPageDesc)
            _, lJobIDs = LA.run(colid, sPageDesc, "CITlabAdvancedLaJob", False)
            traceln(lJobIDs)
            lretJobIDs.extend(lJobIDs)
            traceln("- LA running for page %d job:%s" % (i, lJobIDs))
        return lretJobIDs

    def applyHTRForRegions(self, colid, docid, nbpages, modelname, dictionary):
        """
            apply an htr model at region level

            The original body was a verbatim copy of applyHTR: delegate to it.
        """
        return self.applyHTR(colid, docid, nbpages, modelname, dictionary)

    def applyHTR(self, colid, docid, nbpages, modelname, dictionary):
        """
            apply HTR on docid

            The model id is resolved by matching modelname against the
            collection's RNN model list; raises if no model matches.
        """
        htrComp = DoHtrRnn(self.myTrKCient.getServerUrl(),
                           self.myTrKCient.getProxies())
        htrComp._trpMng.setSessionId(self.myTrKCient.getSessionId())
        htrComp.setSessionId(self.myTrKCient.getSessionId())

        _, sPageDesc = htrComp.buildDescription(colid,
                                                "%s/%s" % (docid, nbpages))

        sModelID = None
        # resolve the model id among the collection's models
        for model in self.myTrKCient.listRnns(colid):
            if str(model['htrId']) == str(modelname):
                sModelID = model['htrId']
                traceln('model id = %s' % sModelID)
                # some old? models do not have params field
        if sModelID is None:
            # Python-3 fix: 'raise Exception, msg' is Python-2-only syntax
            raise Exception("no model ID found for %s" % modelname)
        ret = htrComp.htrRnnDecode(colid,
                                   sModelID,
                                   dictionary,
                                   docid,
                                   sPageDesc,
                                   bDictTemp=False)
        traceln(ret)
        return ret

    def extractFileNamesFromMPXML(self, mpxmldoc):
        """
            to insure correct file order !

            duplicated from performCVLLA.py
        """
        # bug fix: bare 'sCOL' was an unresolved name -- use the class constant
        xmlpath = os.path.abspath(
            os.path.join(self.coldir, TableProcessing.sCOL, self.docid))

        # NOTE(review): getRootElement()/x.prop() are the old libxml2 API
        # while the rest of the file uses lxml (getroot()/get()); confirm
        # this method is still called before porting it.
        lNd = PageXml.getChildByName(mpxmldoc.getRootElement(), 'Page')
        return map(
            lambda x: "%s%s%s.xml" %
            (xmlpath, os.sep, x.prop('imageFilename')[:-4]), lNd)

    def processDocument(self, coldir, colid, docid, dom=None):
        """
            process a single document

            Full intended pipeline (later stages are currently disabled by
            early 'return' statements below):

            1 python ../../src/xml_formats/PageXml.py trnskrbs_5400/col/17442 --ext=pxml
            2 python ../../src/tasks/performCVLLA.py  --coldir=trnskrbs_5400/  --docid=17442 -i trnskrbs_5400/col/17442.mpxml  --bl --regTL --form
            3 python ../../src/tasks/DU_ABPTable_T.py modelMultiType tableRow2 --run=trnskrbs_5400
            4 python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442_du.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
            5 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml
            6 python ../../../TranskribusPyClient/src/TranskribusCommands/TranskribusDU_transcriptUploader.py --nodu trnskrbs_5400 5400 17442
            7 python ../../../TranskribusPyClient/src/TranskribusCommands/do_htrRnn.py  <model-name> <dictionary-name> 5400 17442

            wait
            8  python ../../../TranskribusPyClient/src/TranskribusCommands/Transkribus_downloader.py 5400 --force
             #convert to ds
            9  python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
            10 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml  --doie --usetemplate
        """
        # create Transkribus client and log in
        self.myTrKCient = TranskribusClient(sServerUrl=self.server,
                                            proxies={},
                                            loggingLevel=logging.WARN)
        _ = self.login(self.myTrKCient, trace=trace, traceln=traceln)

        #         self.downloadCollection(colid,coldir,docid,bNoImg=False,bForce=True)

        ## load dom: either from disk (coldir/col/<docid>.mpxml) or as provided
        if dom is None:
            self.inputFileName = os.path.abspath(
                os.path.join(coldir, TableProcessing.sCOL,
                             docid + TableProcessing.sMPXMLExtension))
            mpxml_doc = self.loadDom()
        else:
            mpxml_doc = dom
        nbPages = MultiPageXml.getNBPages(mpxml_doc)

        lJobIDs = self.applyLA_URO(colid, docid, nbPages)
        # NOTE(review): deliberate early exit -- the stages below are
        # currently disabled (kept for reference / future reactivation).
        return

        bWait = True
        assert lJobIDs != []
        jobid = lJobIDs[-1]
        traceln("waiting for job %s" % jobid)
        while bWait:
            dInfo = self.myTrKCient.getJobStatus(jobid)
            bWait = dInfo['state'] not in ['FINISHED', 'FAILED']

        ## coldir???
        self.downloadCollection(colid, coldir, docid, bNoImg=True, bForce=True)

        ## STOP HERE FOR DAS new testset:
        return

        # tag text for BIES cell
        doer = DU_ABPTable_TypedCRF(self.sRowModelName, self.sRowModelDir)
        doer.load()
        ## predict at file level; returns the tagged files
        rowpath = os.path.join(coldir, "col")
        BIESFiles = doer.predict([rowpath], docid)
        BIESDom = self.loadDom(BIESFiles[0])

        # MPXML2DS conversion
        dsconv = primaAnalysis()
        DSBIESdoc = dsconv.convert2DS(BIESDom, self.docid)

        # create XMLDSDocument object and detect rows
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(DSBIESdoc)
        rdc = RowDetection()
        rdc.findRowsInDoc(self.ODoc)

        # DS2MPXML: back to (multi-)page xml
        DS2MPXML = DS2PageXMLConvertor()
        lPageXml = DS2MPXML.run(self.ODoc.getDom())
        if lPageXml != []:
            newDoc = MultiPageXml.makeMultiPageXmlMemory(
                map(lambda xy: xy[0], lPageXml))
            # bug fix: bare 'sCOL' was an unresolved name
            outputFileName = os.path.join(
                self.coldir, TableProcessing.sCOL,
                self.docid + TableProcessing.sMPXMLExtension)
            newDoc.write(outputFileName,
                         xml_declaration=True,
                         encoding="UTF-8",
                         pretty_print=True)

        return

        # upload the row-tagged transcripts
        self.upLoadDocument(colid,
                            coldir,
                            docid,
                            sNote='NLE workflow;table row done')

        ## apply HTR (specific dictionaries / ontology still open questions)
        nbPages = 1
        jobid = self.applyHTR(colid, docid, nbPages, self.sHTRmodel,
                              self.sDictName)
        bWait = True
        traceln("waiting for job %s" % jobid)
        while bWait:
            dInfo = self.myTrKCient.getJobStatus(jobid)
            bWait = dInfo['state'] not in ['FINISHED', 'FAILED', 'CANCELED']

        # download the decoded transcripts
        self.downloadCollection(colid, coldir, docid, bNoImg=True, bForce=True)

        # IE extraction is use-case specific: not done here

    def processCollection(self, coldir):
        """
            process all documents (mpxml files) found in a collection folder
        """
        lsDocFilename = sorted(
            glob.iglob(
                os.path.join(coldir, "*" + TableProcessing.sMPXMLExtension)))
        lDocId = []
        for sDocFilename in lsDocFilename:
            sDocId = os.path.basename(
                sDocFilename)[:-len(TableProcessing.sMPXMLExtension)]
            try:
                lDocId.append(int(sDocId))
            except ValueError:
                traceln("Warning: folder %s : %s invalid docid, IGNORING IT" %
                        (self.coldir, sDocId))

        # process each document
        for docid in lDocId:
            traceln("Processing %s : %s " % (self.coldir, docid))
            # bug fix: processDocument expects (coldir, colid, docid)
            self.processDocument(self.coldir, self.colid, docid)
            # bug fix: the format string had one %s for two arguments
            traceln("\tProcessing done for %s : %s " % (self.coldir, docid))

    def processParameters(self):
        """
            what to do with the parameters provided by the command line

            Returns the regenerated MPXML document when --regMPXML was given
            together with a docid, else None.
        """
        if self.colid is None:
            print('collection id missing!')
            sys.exit(1)

        # True when a specific document id was provided
        self.bFullCol = self.docid is not None

        if self.bRegenerateMPXML and self.docid is not None:
            # bug fix: bare 'sCOL' was an unresolved name
            l = glob.glob(
                os.path.join(self.coldir, TableProcessing.sCOL, self.docid,
                             "*.pxml"))
            doc = MultiPageXml.makeMultiPageXml(l)
            outputFileName = os.path.join(
                self.coldir, TableProcessing.sCOL,
                self.docid + TableProcessing.sMPXMLExtension)
            doc.write(outputFileName,
                      xml_declaration=True,
                      encoding="UTF-8",
                      pretty_print=True)
            return doc
        return None

    def run(self):
        """
            process at collection level or document level
        """
        newMPXML = self.processParameters()
        # bug fix: bFullCol is a boolean (True when a docid was given);
        # the original 'is None' test was never true, so the collection
        # branch was unreachable. Also pass the folder, not the colid.
        if not self.bFullCol:
            self.processCollection(self.coldir)
        else:
            self.processDocument(self.coldir, self.colid, self.docid, newMPXML)
# --- Exemplo n.º 14 (Example no. 14, score 0) ---
class TableProcessing(Component.Component):
    # Component metadata (displayed by the command-line framework)
    usage = ""
    version = "v.01"
    description = "description: table layout analysis based on template"

    # sub-folder of a collection folder holding the per-document files
    sCOL = "col"
    # extension of the multi-page PageXml files this tool consumes/produces
    sMPXMLExtension = ".mpxml"

    def __init__(self):
        """
        Always call first the Component constructor.
        """
        Component.Component.__init__(self, "TableProcessing", self.usage,
                                     self.version, self.description)

        # Transkribus collection / document identifiers (set via setParams)
        self.colid = None
        self.docid = None

        # True when a docid was supplied, i.e. single-document mode
        # (set in processParameters)
        self.bFullCol = False
        # generate MPXML using Ext
        self.useExtForMPXML = False

        # regenerate the .mpxml file from the per-page .pxml files
        self.bRegenerateMPXML = False

        # row-detection model name and folder (used by processDocument)
        self.sRowModelName = None
        self.sRowModelDir = None

        # HTR model and dictionary names (Transkribus server side)
        self.sHTRmodel = None
        self.sDictName = None

    def setParams(self, dParams):
        """
        Always call first the Component setParams.

        Copy each recognised command-line parameter into the matching
        internal attribute; absent keys leave the default value untouched.
        """
        Component.Component.setParams(self, dParams)
        if "coldir" in dParams:
            self.coldir = dParams["coldir"]
        if "colid" in dParams:
            self.colid = dParams["colid"]
        # BUG FIX: the original tested "colid" here as well, so self.docid was
        # read from dParams["docid"] only when a colid was present (and raised
        # KeyError when "docid" was missing from the dict).
        if "docid" in dParams:
            self.docid = dParams["docid"]
        if "useExt" in dParams:
            self.useExtForMPXML = dParams["useExt"]

        if 'mergeTLC' in dParams:
            self.bUROCVLMerge = dParams["mergeTLC"]

        if 'regMPXML' in dParams:
            self.bRegenerateMPXML = dParams["regMPXML"]

        if "rowmodelname" in dParams:
            self.sRowModelName = dParams["rowmodelname"]
        if "rowmodeldir" in dParams:
            self.sRowModelDir = dParams["rowmodeldir"]

        if "htrmodel" in dParams:
            self.sHTRmodel = dParams["htrmodel"]
        if "dictname" in dParams:
            self.sDictName = dParams["dictname"]

        # Connection to Transkribus
        self.myTrKCient = None
        self.persist = False
        self.loginInfo = False
        if "server" in dParams:
            self.server = dParams["server"]
        if "persist" in dParams:
            self.persist = dParams["persist"]
        if "login" in dParams:
            self.loginInfo = dParams["login"]

    def login(self, trnskrbs_client, trace=None, traceln=None):
        """
        Deal with the complicated login variants:
        reuse a persistent session, then explicit credentials, then stored
        credentials.

        :param trnskrbs_client: a TranskribusClient-like object
        :param trace, traceln: optional print methods
        :return: True, or raises an exception from auth_login
        """
        DEBUG = True
        bOk = False
        if self.persist:
            # try getting some persistent session token
            if DEBUG and trace:
                trace("  ---login--- Try reusing persistent session ... ")
            try:
                bOk = trnskrbs_client.reusePersistentSession()
                if DEBUG and traceln: traceln("OK!")
            # deliberate best effort: fall through to a fresh login below
            # (narrowed from a bare "except:")
            except Exception:
                if DEBUG and traceln: traceln("Failed")

        if not bOk:
            if self.loginInfo:
                # NOTE(review): self.pwd is not set anywhere in this class —
                # presumably provided by the command-line layer; confirm.
                login, pwd = self.loginInfo, self.pwd
            else:
                if trace:
                    DEBUG and trace(
                        "  ---login--- no login provided, looking for stored credentials... "
                    )
                login, pwd = trnskrbs_client.getStoredCredentials(bAsk=False)
                if DEBUG and traceln: traceln("OK")

            # BUG FIX: the original guard tested traceln but then called
            # trace, crashing when only traceln was supplied
            if DEBUG and trace:
                trace("  ---login--- logging onto Transkribus as %s " % login)
            trnskrbs_client.auth_login(login, pwd)
            if DEBUG and traceln: traceln("OK")
            bOk = True

        return bOk

    def downloadCollection(self,
                           colid,
                           destDir,
                           docid,
                           bNoImg=True,
                           bForce=False):
        """
        Download collection ``colid`` (optionally restricted to ``docid``)
        from Transkribus and generate the per-document multi-page PageXml.

        NOTE: ``destDir`` is deliberately overridden to '.' below, so the
        download always lands in the current working directory.

        :param colid: Transkribus collection id
        :param destDir: ignored (overridden to '.')
        :param docid: restrict the download to this document id (or None)
        :param bNoImg: skip image download when True
        :param bForce: force re-download when True
        :return: list of downloaded document ids
        """
        destDir = "."
        #         options.server, proxies, loggingLevel=logging.WARN)
        #download
        downloader = TranskribusDownloader(self.myTrKCient.getServerUrl(),
                                           self.myTrKCient.getProxies())
        downloader.setSessionId(self.myTrKCient.getSessionId())
        traceln("- Downloading collection %s to folder %s" %
                (colid, os.path.abspath(destDir)))
        #         col_ts, colDir = downloader.downloadCollection(colid, destDir, bForce=options.bForce, bNoImage=options.bNoImage)
        col_ts, colDir, ldocids, dFileListPerDoc = downloader.downloadCollection(
            colid, destDir, bForce=bForce, bNoImage=bNoImg, sDocId=docid)
        traceln("- Done")

        # record the download settings next to the downloaded data
        with open(os.path.join(colDir, "config.txt"), "w") as fd:
            fd.write("server=%s\nforce=%s\nstrict=%s\n" %
                     (self.server, True, False))

        # build one multi-page PageXml per downloaded document
        downloader.generateCollectionMultiPageXml(
            os.path.join(colDir, TableProcessing.sCOL), dFileListPerDoc, False)

        traceln('- Done, see in %s' % colDir)

        return ldocids

    def upLoadDocument(self,
                       colid,
                       coldir,
                       docid,
                       sNote="",
                       sTranscripExt='.mpxml'):
        """
        Upload the local transcript of ``docid`` (the file with extension
        ``sTranscripExt`` under ``coldir``/col) to collection ``colid`` on
        the Transkribus server.

        :param colid: Transkribus collection id
        :param coldir: local collection folder
        :param docid: document id to upload
        :param sNote: note attached to the uploaded transcript
        :param sTranscripExt: extension of the transcript file to upload
        """

        #         options.server, proxies, loggingLevel=logging.WARN)
        #download
        #         uploader = TranskribusTranscriptUploader(self.server,self.proxies)
        uploader = TranskribusDUTranscriptUploader(
            self.myTrKCient.getServerUrl(), self.myTrKCient.getProxies())
        uploader.setSessionId(self.myTrKCient.getSessionId())
        traceln("- uploading document %s to collection %s" % (docid, colid))
        # NOTE(review): bare sCOL refers to a module-level global, not
        # TableProcessing.sCOL — confirm it is defined at module scope.
        uploader.uploadDocumentTranscript(colid,
                                          docid,
                                          os.path.join(coldir, sCOL),
                                          sNote,
                                          'NLE Table',
                                          sTranscripExt,
                                          iVerbose=False)
        traceln("- Done")
        return

    def applyLA_URO(self, colid, docid, nbpages):
        """
        Apply the textline finder: launch one CITlab advanced LA job per
        page of ``docid`` and collect the server-side job ids.

        :return: list of all launched job ids
        """
        traceln('process %s pages...' % nbpages)
        lAllJobIDs = []
        for iPage in range(1, nbpages + 1):
            # one fresh batch object per page, bound to the current session
            batch = DoLAbatch(self.myTrKCient.getServerUrl(),
                              self.myTrKCient.getProxies())
            batch._trpMng.setSessionId(self.myTrKCient.getSessionId())
            batch.setSessionId(self.myTrKCient.getSessionId())
            # describe the single page to process, then launch the job
            _, sPageDesc = batch.buildDescription(colid,
                                                  "%s/%s" % (docid, iPage))
            sPageDesc = batch.jsonToXMLDescription(sPageDesc)
            _, lJobIDs = batch.run(colid, sPageDesc, "CITlabAdvancedLaJob",
                                   False)
            traceln(lJobIDs)
            lAllJobIDs.extend(lJobIDs)
            traceln("- LA running for page %d job:%s" % (iPage, lJobIDs))
        return lAllJobIDs

    def applyHTRForRegions(self, colid, docid, nbpages, modelname, dictionary):
        """
        Apply an HTR model at region level on pages 1..nbpages of ``docid``.

        :param modelname: HTR model id (matched against the collection models)
        :param dictionary: dictionary name used for decoding
        :return: server response of htrRnnDecode
        :raises Exception: when no model with that id exists in the collection
        """

        htrComp = DoHtrRnn(self.myTrKCient.getServerUrl(),
                           self.myTrKCient.getProxies())
        htrComp._trpMng.setSessionId(self.myTrKCient.getSessionId())
        htrComp.setSessionId(self.myTrKCient.getSessionId())

        _, sPageDesc = htrComp.buildDescription(colid,
                                                "%s/%s" % (docid, nbpages))

        sModelID = None
        # resolve the model id among the collection's models
        lColModels = self.myTrKCient.listRnns(colid)
        for model in lColModels:
            if str(model['htrId']) == str(modelname):
                sModelID = model['htrId']
                traceln('model id = %s' % sModelID)
        # PEP 8: identity comparison with None (was "== None")
        if sModelID is None:
            raise Exception("no model ID found for %s" % (modelname))
        ret = htrComp.htrRnnDecode(colid,
                                   sModelID,
                                   dictionary,
                                   docid,
                                   sPageDesc,
                                   bDictTemp=False)
        traceln(ret)
        return ret

    def applyHTR(self, colid, docid, nbpages, modelname, dictionary):
        """
        Apply HTR on ``docid`` (pages 1..nbpages).

        The HTR model id is needed: ``modelname`` is matched against the
        models registered in the collection.

        :param dictionary: dictionary name used for decoding
        :return: server response of htrRnnDecode
        :raises Exception: when no model with that id exists in the collection
        """
        htrComp = DoHtrRnn(self.myTrKCient.getServerUrl(),
                           self.myTrKCient.getProxies())
        htrComp._trpMng.setSessionId(self.myTrKCient.getSessionId())
        htrComp.setSessionId(self.myTrKCient.getSessionId())

        _, sPageDesc = htrComp.buildDescription(colid,
                                                "%s/%s" % (docid, nbpages))

        sModelID = None
        # resolve the model id among the collection's models
        lColModels = self.myTrKCient.listRnns(colid)
        for model in lColModels:
            if str(model['htrId']) == str(modelname):
                sModelID = model['htrId']
                traceln('model id = %s' % sModelID)
        # PEP 8: identity comparison with None (was "== None")
        if sModelID is None:
            raise Exception("no model ID found for %s" % (modelname))
        ret = htrComp.htrRnnDecode(colid,
                                   sModelID,
                                   dictionary,
                                   docid,
                                   sPageDesc,
                                   bDictTemp=False)
        traceln(ret)
        return ret

    def overlapX(self, zoneA, zoneB):
        """True iff the horizontal extents of the two bounding boxes
        intersect (touching counts as overlapping)."""
        xA1, _yA1, xA2, _yA2 = zoneA.getBoundingBox()
        xB1, _yB1, xB2, _yB2 = zoneB.getBoundingBox()
        # intervals [xA1, xA2] and [xB1, xB2] intersect iff the smaller right
        # edge is not left of the larger left edge
        return min(xA2, xB2) >= max(xA1, xB1)

    def overlapY(self, zoneA, zoneB):
        """True iff the vertical extents of the two bounding boxes
        intersect (touching counts as overlapping)."""
        [x11, y11, x12, y12] = zoneA.getBoundingBox(
        )  #self.getX(),self.getY(),self.getHeight(),self.getWidth()
        [x21, y21, x22, y22] = zoneB.getBoundingBox()
        [a1, a2] = y11, y12
        # BUG FIX: was "y22, y22", which collapsed zoneB to a zero-height
        # band at its bottom edge and made the test fail for most overlaps
        [b1, b2] = y21, y22
        return min(a2, b2) >= max(a1, b1)

    def signedOverlap(self, zoneA, zoneB):
        """
        Fraction of zoneA's bounding-box surface covered by the
        intersection of zoneA and zoneB (0.0 when the boxes do not
        intersect on both axes).
        """

        [x11, y11, x12, y12] = zoneA.getBoundingBox(
        )  #self.getX(),self.getY(),self.getHeight(),self.getWidth()
        [x21, y21, x22, y22] = zoneB.getBoundingBox(
        )  #.getX(),zone.getY(),zone.getHeight(),zone.getWidth()
        w1 = x12 - x11
        h1 = y12 - y11
        fOverlap = 0.0

        if self.overlapX(zoneA, zoneB) and self.overlapY(zoneA, zoneB):
            s1 = w1 * h1

            # possible ?
            if s1 == 0: s1 = 1.0
            #intersection
            nx1 = max(x11, x21)
            nx2 = min(x12, x22)
            ny1 = max(y11, y21)
            ny2 = min(y12, y22)
            # NOTE(review): names are swapped (h is the x-extent, w the
            # y-extent) but the product h*w below is unaffected
            h = abs(nx2 - nx1)
            w = abs(ny2 - ny1)

            inter = h * w
            if inter > 0:
                fOverlap = inter / s1
            else:
                # if overX and Y this is not possible !
                fOverlap = 0.0

        return fOverlap

    def mergeBaselineCells(self, coldir, colid, docid):
        """
        Merge textlines produced on Transkribus (.pxml files) into the
        table cells of the CVL template tool output (.xml files), then
        regenerate the multi-page .mpxml file.

        Each textline is assigned to the table cell whose bounding box it
        overlaps most; lines overlapping no cell get their own TextRegion.

        :raises ValueError: when a CVL page contains no TableRegion
        """

        xmlpath = os.path.abspath(os.path.join(coldir, sCOL, docid))
        #         print (xmlpath)

        mpxml = xmlpath + ".mpxml"
        # NOTE(review): this parse result is overwritten below; kept only for
        # its side effect of failing early when the .mpxml file is missing
        mpxmldoc = etree.parse(mpxml)

        # CVL template tool output (one .xml per page)
        lxml = glob.glob(os.path.join(xmlpath, "*.xml"))
        pxmldoc = MultiPageXml.makeMultiPageXml(lxml)

        # Transkribus HTR output (one .pxml per page)
        lhtrxml = glob.glob(os.path.join(xmlpath, "*.pxml"))
        mpxmldoc = MultiPageXml.makeMultiPageXml(lhtrxml)

        lPXMLPage = PageXml.getChildByName(mpxmldoc.getroot(), 'Page')
        lXMLPage = PageXml.getChildByName(pxmldoc.getroot(), 'Page')

        assert len(lXMLPage) == len(lPXMLPage)
        for i, cvlpage in enumerate(lXMLPage):
            ## remove TextRegion from cvlpage
            lTextRegions = PageXml.getChildByName(cvlpage, 'TextRegion')
            for tr in lTextRegions:
                tr.getparent().remove(tr)

            # collect the textlines of the matching pxml page
            pxmlpage = lPXMLPage[i]
            lTL = []
            lTextRegions = PageXml.getChildByName(pxmlpage, 'TextRegion')
            for x in lTextRegions:
                lTL.extend(PageXml.getChildByName(x, 'TextLine'))

            ltable = PageXml.getChildByName(cvlpage, 'TableRegion')
            if len(ltable) == 0:
                # BUG FIX: "raise <str>" is a TypeError in Python 3
                raise ValueError("NO TABLE")
            lCells = PageXml.getChildByName(ltable[0], 'TableCell')

            lC = [Polygon(PageXml.getPointList(c)) for c in lCells]
            lT = [Polygon(PageXml.getPointList(t)) for t in lTL]
            # (renamed from "i" to avoid shadowing the page index above)
            for j, tl in enumerate(lT):
                ## normalize the TextLine polygon from its baseline
                lCoordsPoints = PageXml.getChildByName(lTL[j], 'Coords')
                lCoordsB = PageXml.getChildByName(lTL[j], 'Baseline')
                coordB = lCoordsB[0]
                coord = lCoordsPoints[0]
                iHeight = 30  # in pixel
                x1, y1, x2, y2 = Polygon(
                    PageXml.getPointList(coordB)).getBoundingBox()
                if coord is not None:
                    # rebuild the line polygon as a band above the baseline
                    coord.set(
                        'points', "%d,%d %d,%d %d,%d %d,%d" %
                        (x1, y1 - iHeight, x2, y1 - iHeight, x2, y2, x1, y2))
                    tl = Polygon(PageXml.getPointList(coordB))
                # overlap of the line with every cell
                lOverlap = []
                for _, c in enumerate(lC):
                    lOverlap.append(self.signedOverlap(c, tl))
                ## assign the line to the best-overlapping cell, or to a
                ## fresh TextRegion when it overlaps no cell at all
                if max(lOverlap) == 0:
                    region = PageXml.createPageXmlNode('TextRegion')
                    cvlpage.append(region)
                    region.append(lTL[j])
                else:
                    cell = lCells[lOverlap.index(max(lOverlap))]
                    cell.append(lTL[j])

        pxmldoc.write(mpxml)

    """                        
        lOverlap=[]        
        for region in lRegions:
            lOverlap.append(self.signedRatioOverlap(region))
        
        if max(lOverlap) == 0: return None
        return lRegions[lOverlap.index(max(lOverlap))]
    """
    """
            fOverlap = 0.0
        
        if self.overlapX(zone) and self.overlapY(zone):
            [x11,y11,x12,y12] = [x1,y1,x1+w1,y1+h1]
            [x21,y21,x22,y22] = [x2,y2,x2+w2,y2+h2]
            
            s1 = w1 * h1
            
            # possible ?
            if s1 == 0: s1 = 1.0
            
            #intersection
            nx1 = max(x11,x21)
            nx2 = min(x12,x22)
            ny1 = max(y11,y21)
            ny2 = min(y12,y22)
            h = abs(nx2 - nx1)
            w = abs(ny2 - ny1)
            
            inter = h * w
            if inter > 0 :
                fOverlap = inter/s1
            else:
                # if overX and Y this is not possible !
                fOverlap = 0.0
            
        return  fOverlap   
    
    """
    def extractFileNamesFromMPXML(self, mpxmldoc):
        """
        Return the per-page XML filenames, in page order, for the pages of
        an .mpxml document (to insure correct file order!).

        Duplicated from performCVLLA.py.

        :param mpxmldoc: an lxml ElementTree of the .mpxml document
        :return: list of absolute .xml filenames, one per Page element
        """
        xmlpath = os.path.abspath(os.path.join(self.coldir, sCOL, self.docid))

        # BUG FIX: the file uses lxml everywhere else; the libxml2 API
        # (getRootElement / .prop) does not exist on lxml trees — use
        # getroot() / .get() instead, and return a concrete list rather
        # than a one-shot map object.
        lNd = PageXml.getChildByName(mpxmldoc.getroot(), 'Page')
        return [
            "%s%s%s.xml" % (xmlpath, os.sep, nd.get('imageFilename')[:-4])
            for nd in lNd
        ]

    def processDocument(self, coldir, colid, docid, dom=None):
        """
            process a single document
            
            1 python ../../src/xml_formats/PageXml.py trnskrbs_5400/col/17442 --ext=pxml
            2 python ../../src/tasks/performCVLLA.py  --coldir=trnskrbs_5400/  --docid=17442 -i trnskrbs_5400/col/17442.mpxml  --bl --regTL --form
            3 python ../../src/tasks/DU_ABPTable_T.py modelMultiType tableRow2 --run=trnskrbs_5400
            4 python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442_du.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
            5 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml
            6 python ../../../TranskribusPyClient/src/TranskribusCommands/TranskribusDU_transcriptUploader.py --nodu trnskrbs_5400 5400 17442
            7 python ../../../TranskribusPyClient/src/TranskribusCommands/do_htrRnn.py  <model-name> <dictionary-name> 5400 17442
            
            wait
            8  python ../../../TranskribusPyClient/src/TranskribusCommands/Transkribus_downloader.py 5400 --force 
             #covnert to ds
            9  python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
            10 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml  --doie --usetemplate  
            
        """

        #create Transkribus client
        self.myTrKCient = TranskribusClient(sServerUrl=self.server,
                                            proxies={},
                                            loggingLevel=logging.WARN)
        #login
        _ = self.login(self.myTrKCient, trace=trace, traceln=traceln)

        #         self.downloadCollection(colid,coldir,docid,bNoImg=False,bForce=True)

        ## load dom
        if dom is None:
            self.inputFileName = os.path.abspath(
                os.path.join(coldir, TableProcessing.sCOL,
                             docid + TableProcessing.sMPXMLExtension))
            mpxml_doc = self.loadDom()
            nbPages = MultiPageXml.getNBPages(mpxml_doc)
        else:
            # load provided mpxml
            mpxml_doc = dom
            nbPages = MultiPageXml.getNBPages(mpxml_doc)

        # BUG FIX: the class defines applyLA_URO, not apply_URO — the
        # original call raised AttributeError at runtime.
        lJobIDs = self.applyLA_URO(colid, docid, nbPages)
        # NOTE: everything below this return is currently disabled on purpose
        # (the workflow stops after launching the layout-analysis jobs).
        return

        bWait = True
        assert lJobIDs != []
        jobid = lJobIDs[-1]
        traceln("waiting for job %s" % jobid)
        # NOTE(review): busy-wait without a sleep hammers the server —
        # add time.sleep() between polls if this branch is re-enabled
        while bWait:
            dInfo = self.myTrKCient.getJobStatus(jobid)
            bWait = dInfo['state'] not in ['FINISHED', 'FAILED']

        ## coldir???
        self.downloadCollection(colid, coldir, docid, bNoImg=True, bForce=True)

        ##STOP HERE FOR DAS newx testset:
        return

        # tag text for BIES cell
        #python ../../src/tasks/DU_ABPTable_T.py modelMultiType tableRow2 --run=trnskrbs_5400
        doer = DU_ABPTable_TypedCRF(self.sRowModelName, self.sRowModelDir)
        doer.load()
        ## predict at file level; returns the result files rather than
        ## storing a dom
        rowpath = os.path.join(coldir, "col")
        BIESFiles = doer.predict([rowpath], docid)
        BIESDom = self.loadDom(BIESFiles[0])

        # MPXML2DS conversion
        #python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442_du.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
        dsconv = primaAnalysis()
        DSBIESdoc = dsconv.convert2DS(BIESDom, self.docid)

        # create XMLDSDocument object and detect table rows
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(DSBIESdoc)
        rdc = RowDetection()
        rdc.findRowsInDoc(self.ODoc)

        # DS2MPXML: convert back to multi-page PageXml and store it
        #python ../../src/xml_formats/DS2PageXml.py -i trnskrbs_5400/out/17442.ds_xml --multi
        DS2MPXML = DS2PageXMLConvertor()
        lPageXml = DS2MPXML.run(self.ODoc.getDom())
        if lPageXml != []:
            newDoc = MultiPageXml.makeMultiPageXmlMemory(
                map(lambda xy: xy[0], lPageXml))
            outputFileName = os.path.join(
                self.coldir, sCOL,
                self.docid + TableProcessing.sMPXMLExtension)
            newDoc.write(outputFileName,
                         xml_declaration=True,
                         encoding="UTF-8",
                         pretty_print=True)

        return

        #upload the transcripts
        self.upLoadDocument(colid,
                            coldir,
                            docid,
                            sNote='NLE workflow;table row done')

        ## apply HTR
        ## how to deal with specific dictionaries?
        ## here need to know the ontology and the template
        ## OPTION: put it after LA on server  (just one download needed )
        nbPages = 1
        jobid = self.applyHTR(colid, docid, nbPages, self.sHTRmodel,
                              self.sDictName)
        bWait = True
        traceln("waiting for job %s" % jobid)
        while bWait:
            dInfo = self.myTrKCient.getJobStatus(jobid)
            bWait = dInfo['state'] not in ['FINISHED', 'FAILED', 'CANCELED']

        # download the HTR results
        #   coldir is not right!! coldir must refer to the parent folder!
        self.downloadCollection(colid, coldir, docid, bNoImg=True, bForce=True)

        #done!!

    def processCollection(self, coldir):
        """
        Process all documents of a collection folder; needs the .mpxml
        files to be present in ``coldir``.
        """
        lsDocFilename = sorted(
            glob.iglob(
                os.path.join(coldir, "*" + TableProcessing.sMPXMLExtension)))
        lDocId = []
        for sDocFilename in lsDocFilename:
            sDocId = os.path.basename(
                sDocFilename)[:-len(TableProcessing.sMPXMLExtension)]
            # document folders are named by their numeric Transkribus id
            try:
                docid = int(sDocId)
                lDocId.append(docid)
            except ValueError:
                traceln("Warning: folder %s : %s invalid docid, IGNORING IT" %
                        (self.coldir, sDocId))
                continue

        # process each document
        for docid in lDocId:
            # BUG FIX: the original logged the stale sDocId left over from the
            # filename loop instead of the document being processed, called
            # processDocument without its coldir argument, and the "done"
            # message had one %s placeholder for a 2-tuple (TypeError).
            traceln("Processing %s : %s " % (self.coldir, docid))
            self.processDocument(self.coldir, self.colid, docid)
            traceln("\tProcessing done for %s : %s " % (self.coldir, docid))

    def processParameters(self):
        """
        Validate the command-line parameters and optionally regenerate the
        .mpxml document.

        Exits the process when no collection id was given.  Sets
        ``self.bFullCol`` to True when a document id was supplied
        (single-document mode).

        :return: the regenerated .mpxml document when --regMPXML and a docid
                 were given, else None
        """
        if self.colid is None:
            print('collection id missing!')
            sys.exit(1)

        # PEP 8: identity comparison with None (was "!= None")
        self.bFullCol = self.docid is not None

        if self.bRegenerateMPXML and self.docid is not None:
            # rebuild the multi-page document from the per-page .pxml files
            lPxml = glob.glob(os.path.join(self.coldir, sCOL, self.docid,
                                           "*.pxml"))
            doc = MultiPageXml.makeMultiPageXml(lPxml)
            outputFileName = os.path.join(
                self.coldir, sCOL,
                self.docid + TableProcessing.sMPXMLExtension)
            doc.write(outputFileName,
                      xml_declaration=True,
                      encoding="UTF-8",
                      pretty_print=True)
            return doc
        return None

    def run(self):
        """
        Process at collection level or at single-document level, or merge
        Transkribus baselines into CVL table cells when --mergeTLC is set.
        """
        newMPXML = self.processParameters()
        # BUG FIX: bFullCol is a boolean (see processParameters), never None,
        # so the original "if self.bFullCol is None" branch was unreachable
        # and collection mode could never be selected.
        if not self.bFullCol:
            # NOTE(review): processCollection expects a folder path; passing
            # the collection id looks suspicious — confirm against callers.
            self.processCollection(self.colid)
        else:
            # bUROCVLMerge is only assigned by setParams when 'mergeTLC' is
            # given; default to False to avoid AttributeError.
            if getattr(self, 'bUROCVLMerge', False):
                self.mergeBaselineCells(self.coldir, self.colid, self.docid)
                return
            self.processDocument(self.coldir, self.colid, self.docid, newMPXML)
Exemplo n.º 15
0
class IETest(Component.Component):
    """
        
    """
    usage = ""
    version = "v.01"
    description = "description: Information Extraction Tool for the ABP collection (READ project)"

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self):
        """
        Always call first the Component constructor.
        """
        Component.Component.__init__(self, "ABP_IE", self.usage, self.version,
                                     self.description)
        # BUGFIX: was `self.usage = self.usage = ...` (redundant double assignment)
        self.usage = "python %prog" + self.usageComponent
        self.colname = None            # collection name (coldir parameter)
        self.docid = None              # document id to process

        self.sTemplate = None          # explicit table template, if any
        self.BuseStoredTemplate = False  # use the template stored with the doc

        # HTR model id
        self.htrModelID = None

        # IE model
        self.sModelDir = None          # directory holding the IE model
        self.sModelName = None         # IE model name

        self.lcsTH = 75                # LCS similarity threshold (percent)
        self.page2DS = False           # convert PageXml input to DS first
        # for --test
        self.evalData = None

    def setParams(self, dParams):
        """
        Always call first the Component setParams.

        Copy every recognised command-line parameter into the matching
        internal attribute; keys absent from dParams leave the defaults
        untouched.
        """
        Component.Component.setParams(self, dParams)

        # command-line key -> internal attribute name
        dKey2Attr = {
            "coldir": "colname",
            "docid": "docid",
            "htrid": "htrModelID",
            "template": "sTemplate",
            "UseStoredTemplate": "BuseStoredTemplate",
            "modelName": "sModelName",
            "modelDir": "sModelDir",
            "2DS": "page2DS",
            "LCSTH": "lcsTH",
        }
        for sKey, sAttr in dKey2Attr.items():
            if sKey in dParams:
                setattr(self, sAttr, dParams[sKey])

    def labelTable(self, table):
        """
        Toy example: label the first column of the table with tags.
        """
        firstColumn = table.getColumns()[0]
        firstColumn.label()

    def findNameColumn(self, table):
        """
        Find the index of the column that holds the people names.

        Taggers are applied to the cells of the first 5 columns; the column
        index where the 'firstname' field fires most often is returned.

        Returns:
            int: most frequent column index for the 'firstname' field.

        Raises:
            KeyError: when no 'firstname' field was extracted at all.
        """
        self.bDebug = False
        # column index -> list of field names found in that column
        lColPos = {}
        # field name -> list of column indices where that field was found
        lColInvName = {}
        for cell in table.getCells():
            colIdx = cell.getIndex()[1]
            # IDIOM: setdefault instead of bare try/except key probing
            lColPos.setdefault(colIdx, [])
            if colIdx < 5:
                for field in cell.getFields():
                    if field is not None:
                        res = field.applyTaggers(cell)
                        # res: [ (token, label, score) ... ]
                        extractedValues = field.extractLabel(res)
                        if extractedValues != []:
                            # keep only (value, score) pairs
                            extractedValues = list(
                                map(lambda x: (x[1], x[3]), extractedValues))
                            field.setOffset(res[0])
                            field.setValue(extractedValues)
                            lColPos[colIdx].append(field.getName())
                            lColInvName.setdefault(field.getName(),
                                                   []).append(colIdx)
                            if self.bDebug:
                                print('foundXX:', field.getName(),
                                      field.getValue())
            cell.resetField()
        # the column where 'firstname' fired most often
        return max(lColInvName['firstname'],
                   key=lColInvName['firstname'].count)

    def extractData(self, table, myRecord, lTemplate):
        """
        Label the table with the given template, run the field taggers on
        every labelled cell, then collect record candidates row by row.

        Args:
            table: table object (XMLDSTABLEClass-like).
            myRecord: record object (e.g. deathRecord) collecting candidates.
            lTemplate: template pattern list, or None.

        Returns:
            None when lTemplate is None; otherwise works by side effect on
            the fields of the table cells and on myRecord (candidates added
            and ranked).
        """
        #         self.bDebug = False
        #         table.buildNDARRAY()
        if lTemplate is not None:
            # convert string to tableTemplateObject
            template = tableTemplateClass()
            template.buildFromPattern(lTemplate)
            template.labelTable(table)
        else:
            return None
        #         firstNameColIndex =self.findNameColumn(table)

        # create a batch for the full page

        #tag fields with template
        for cell in table.getCells():
            if cell.getFields() != []:
                if self.bDebug:
                    print(table.getPage(), cell.getIndex(), cell.getFields(),
                          cell.getContent())
            for field in cell.getFields():
                if field is not None:
                    res = field.applyTaggers(cell)
                    # res [ (token,label,score) ...]
                    extractedValues = field.extractLabel(res)
                    if extractedValues != []:
                        #                         extractedValues = map(lambda offset,value,label,score:(value,score),extractedValues)
                        # keep only (value, score) pairs
                        extractedValues = list(
                            map(lambda x: (x[1], x[3]), extractedValues))
                        field.setOffset(res[0])
                        field.setValue(extractedValues)
                        #                         field.addValue(extractedValues)
                        if self.bDebug:
                            print('found:', field, field.getValue())

        ### now at record level ?
        ### scope = propagation using only docObject (hardcoded ?)
        ### where to put the propagation mechanism?
#         myRecord.propagate(table)

## 'backpropagation:  select the rows, and collection subobjects with fields  (cells)

        # every row becomes a record candidate
        for row in table.getRows():
            #if not row.isHeaders():
            myRecord.addCandidate(row)

#         #for each cell: take the record and
#         ### FR NOW: TAKE THE FIRST COLUMN
#         firstCol = table.getColumns()[0]
#         for cell in firstCol.getCells():
#             myRecord.addCandidate(cell)

        myRecord.rankCandidates()

        # NOTE(review): lcand is unused; kept for the commented debug prints
        lcand = myRecord.getCandidates()
#         print lcand
#         myRecord.display()

    def mergeLineAndCells(self, lPages):
        """
        Assign text lines (XMLDSTEXTClass) to the table cell they overlap most.

        For each page, each line is attached to the single cell with the
        highest overlap ratio; lines overlapping no cell stay unassigned.
        Each cell's objects are then re-sorted top-to-bottom by Y.
        """
        for page in lPages:
            lLines = page.getAllNamedObjects(XMLDSTEXTClass)
            lCells = page.getAllNamedObjects(XMLDSTABLECELLClass)
            dAssign = {}
            for line in lLines:
                bestscore = 0.0
                for cell in lCells:
                    ratio = line.ratioOverlap(cell)
                    if ratio > bestscore:
                        bestscore = ratio
                        dAssign[line] = cell

            # IDIOM: plain loops instead of side-effecting list comprehensions
            for line, cell in dAssign.items():
                cell.addObject(line)
            for cell in lCells:
                cell.getObjects().sort(key=lambda x: x.getY())

    def testGTText(self, page):
        """
            extract region text and parse it 
        """
        from contentProcessing.taggerTrainKeras import DeepTagger

        myTagger = DeepTagger()
        myTagger.bPredict = True
        myTagger.sModelName = '2mix_cm'
        myTagger.dirName = 'IEdata/model/'
        myTagger.loadModels()

        for region in page.getObjects():
            #             print region.getContent().encode('utf-8')
            res = myTagger.predict([region.getContent()])
            try:
                res = myTagger.predict([region.getContent()])
#                 print res
            except:
                print('SENT WITH ISSUES : [%s]' %
                      (region.getContent().encode('utf-8')))

    def mineTable(self, tabel, dr):
        """
        Mine a table without a template (stub — not implemented yet).

        From the current HTR output, find the categories present in each
        column once NER has been applied.

        NOTE(review): parameter name 'tabel' is a typo for 'table'; kept as-is
        to preserve the call interface.
        """

    def selectTemplat(self, lTemplates):
        """
        Select the best template from a list (stub — not implemented yet).

        Intended behaviour: take a couple of pages and perform IE with each
        template, then simply sum up the tagger scores to pick the winner.

        NOTE(review): method name 'selectTemplat' is a typo for
        'selectTemplate'; kept as-is to preserve the call interface.
        """

    def processWithTemplate(self, table, dr):
        """
        Apply the column template matching the table layout and extract data.

        A first fuzzy "calibration" template tags the leftmost columns so
        findNameColumn can locate the name column; the real template is then
        built relative to that column index (iRef) and handed to extractData.

        Args:
            table: the table to process (buildNDARRAY is called on it).
            dr: the record object (e.g. deathRecord) providing the fields.

        Returns:
            dr, filled by extractData as a side effect.
        """
        # selection of the dictionaries per columns
        # template 5,10: first col = numbering

        # find calibration column: abp_names
        table.buildNDARRAY()
        #         print (self.findNameColumn(table))
        #         lTemplateIE2 = [
        #              ((slice(1,None),slice(0,1))  ,[ 'numbering'],[ dr.getFieldByName('numbering') ])
        #             , ((slice(1,None),slice(1,2))  ,[ 'abp_names', 'names_aux','numbering','religion'],[ dr.getFieldByName('lastname'), dr.getFieldByName('firstname'),dr.getFieldByName('religion')  ])
        #             , ((slice(1,None),slice(2,3)) ,[ 'abp_profession','religion' ]        ,[ dr.getFieldByName('occupation'), dr.getFieldByName('religion') ])
        #             , ((slice(1,None),slice(3,4))  ,[ 'abp_location' ]                    ,[ dr.getFieldByName('location') ])
        #             , ((slice(1,None),slice(4,5)) ,[ 'abp_family' ]                       ,[ dr.getFieldByName('situation') ])
        #              ,((slice(1,None),slice(5,6)) ,[ 'deathreason','artz']                ,[ dr.getFieldByName('deathreason'),dr.getFieldByName('doktor')])
        #             , ((slice(1,None),slice(6,7)) ,[]                                     , [ ])  #binding
        #             , ((slice(1,None),slice(7,8)) ,['abp_dates', 'abp_dates' ,'abp_year']                        ,[,dr.getFieldByName('deathDate'),dr.getFieldByName('deathYear') ])
        #             , ((slice(1,None),slice(8,9)) ,[ 'abp_dates','abp_location' ]         ,[ dr.getFieldByName('burialDate'),dr.getFieldByName('burialLocation') ])
        #             , ((slice(1,None),slice(9,10)) ,[ 'abp_age','abp_ageunit']                           ,[ dr.getFieldByName('age'), dr.getFieldByName('ageUnit')])
        # #            , ((slice(1,None),slice(9,10)) ,[ dr.getFieldByName('priester')])
        # #            , ((slice(1,None),slice(10,11)),[ dr.getFieldByName('notes')])
        #            ]

        #fuzzy calibration template: wide slices over the first columns,
        # each entry is ((row slice, col slice), [dictionaries], [fields])
        lTemplateIECAL = [
            ((slice(1, None), slice(0, 4)),
             ['abp_names', 'names_aux', 'numbering', 'religion'], [
                 dr.getFieldByName('lastname'),
                 dr.getFieldByName('firstname'),
                 dr.getFieldByName('religion')
             ]),
            ((slice(1, None), slice(1, 4)), ['abp_profession', 'religion'],
             [dr.getFieldByName('occupation'),
              dr.getFieldByName('religion')])
        ]

        #detect empty left columns ?
        template = tableTemplateClass()
        template.buildFromPattern(lTemplateIECAL)
        template.labelTable(table)

        # anchor the real template on the detected name column
        iRef = self.findNameColumn(table)
        lTemplateIE = [
            ((slice(1, None), slice(iRef, iRef + 1)),
             ['abp_names', 'names_aux', 'numbering', 'religion'], [
                 dr.getFieldByName('lastname'),
                 dr.getFieldByName('firstname'),
                 dr.getFieldByName('religion')
             ]),
            ((slice(1, None), slice(iRef + 1,
                                    iRef + 2)), ['abp_profession', 'religion'],
             [dr.getFieldByName('occupation'),
              dr.getFieldByName('religion')]),
            ((slice(1, None), slice(iRef + 2, iRef + 3)), ['abp_location'],
             [dr.getFieldByName('location')]),
            ((slice(1, None), slice(iRef + 3, iRef + 4)), ['abp_family'],
             [dr.getFieldByName('situation')])
            #[] binding
            ,
            ((slice(1, None), slice(iRef + 4,
                                    iRef + 6)), ['abp_deathreason', 'artz'],
             [dr.getFieldByName('deathreason'),
              dr.getFieldByName('doktor')]),
            ((slice(1, None), slice(iRef + 5, iRef + 7)),
             ['abp_dates', 'abp_year'], [
                 dr.getFieldByName('MonthDayDateGenerator'),
                 dr.getFieldByName('deathDate'),
                 dr.getFieldByName('deathYear')
             ]),
            ((slice(1, None), slice(iRef + 6, iRef + 8)),
             ['abp_dates', 'abp_year', 'abp_location'], [
                 dr.getFieldByName('burialDate'),
                 dr.getFieldByName('deathYear'),
                 dr.getFieldByName('burialLocation')
             ]),
            ((slice(1, None), slice(iRef + 8,
                                    iRef + 10)), ['abp_age', 'abp_ageunit'],
             [dr.getFieldByName('age'),
              dr.getFieldByName('ageUnit')])
            #            , ((slice(1,None),slice(9,10)) ,[ dr.getFieldByName('priester')])
            #            , ((slice(1,None),slice(10,11)),[ dr.getFieldByName('notes')])
        ]
        # recalibrate template

        # #         lTemplate = lTemplateIE
        #         if table.getNbColumns() >= 12:
        #             lTemplate = lTemplateIE2
        #         else:
        #             lTemplate = lTemplateIE

        self.extractData(table, dr, lTemplateIE)

        # select best solutions
        # store in the proper final format
        return dr

    def run(self, doc):
        """
        Main entry point: load the document and apply IE to every table.

        main issue: how to select the template — to be done by CVL, assuming
        IE and HTR info are stored in the template.

        Args:
            doc: input document (PageXml when self.page2DS is set, DS
                 otherwise).
        """
        if self.page2DS:
            # convert PageXml to DS format first
            dsconv = primaAnalysis()
            self.doc = dsconv.convert2DS(doc, self.docid)
        else:
            self.doc = doc
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(self.doc,
                              listPages=range(self.firstPage,
                                              self.lastPage + 1))

        self.lPages = self.ODoc.getPages()
        dr = deathRecord(self.sModelName, self.sModelDir)

        ## selection of the templates first with X tables

        for page in self.lPages:
            print("page: ", page.getNumber())
            lTables = page.getAllNamedObjects(XMLDSTABLEClass)

            for table in lTables:
                # tables with fewer than 2 rows are likely detection noise
                if table.getNbRows() < 2:
                    print("page: %s : not a table? %d/%d" %
                          (page.getNumber(), table.getNbRows(),
                           table.getNbColumns()))
                    continue
                if self.BuseStoredTemplate:
                    try:
                        self.processWithTemplate(table, dr)
                    # BUGFIX: narrowed the bare `except:` so KeyboardInterrupt
                    # and SystemExit are no longer swallowed
                    except Exception:
                        print('issue with page %s' % page)
                else:
                    self.mineTable(table, dr)

        self.evalData = dr.generateOutput(self.evalData)

    def generateTestOutput(self):
        """
        Build the evaluation tree: one <PAGE> element per loaded page under a
        <DOCUMENT> root, carrying number/pagenum and the (ref)template stored
        on the page node.

        Returns:
            the ElementTree also stored in self.evalData.
        """
        root = etree.Element('DOCUMENT')
        self.evalData = etree.ElementTree(root)
        # BUGFIX: was `for page in lPages:` (undefined name — NameError) and
        # `etree.Elemen(...)` (typo for etree.Element).
        for page in self.lPages:

            domp = etree.Element('PAGE')
            domp.set('number', page.getAttribute('number'))
            # pagenum = image file name without its extension
            domp.set('pagenum',
                     os.path.basename(page.getAttribute('imageFilename'))[:-4])
            root.append(domp)
            # NOTE(review): .prop() is the old libxml2 API; with lxml nodes
            # this would need .get() — confirm the node type used here.
            domp.set('template', page.getNode().prop('template'))
            domp.set('reftemplate', page.getNode().prop('reftemplate'))

        return self.evalData

    def testFirstNameLastNameRecord(self, srefData, srunData, bVisual):
        """
        Evaluate extracted (firstname, lastname) pairs against the reference,
        page by page, via a Hungarian assignment over LCS-based distances.

        Args:
            srefData: reference XML (DOCUMENT/PAGE/RECORD) as a string.
            srunData: run XML as a string.
            bVisual: verbose flag (unused in the current code path).

        Returns:
            (cntOk, cntErr, cntMissed, ltisRefsRunbErrbMiss)

        NOTE(review): this method cannot run as written — it iterates
        `lRunPerPage`, which is never defined here (only lRun/lRefKeys are
        built); `lRefKeys[key]` is set to the flag 1 but later len()'d and
        indexed like a list; and `runElt` inside the result appends is stale
        from the last inner-loop iteration. testRecordField below is the
        working variant of this logic.
        """

        cntOk = cntErr = cntMissed = 0
        #         srefData = srefData.decode('utf-8')
        #.strip("\n")

        RefData = etree.XML(srefData.strip("\n").encode('utf-8'))
        RunData = etree.XML(srunData.strip("\n").encode('utf-8'))

        lRef = []
        lPages = RefData.xpath('//%s' % ('PAGE[@number]'))

        lRefKeys = {}
        for page in lPages:
            pnum = page.get('number')
            key = page.get('pagenum')
            # NOTE(review): stored as flag 1, but treated as a list further down
            lRefKeys[key] = 1
            xpath = "./%s" % ("RECORD")
            lrecord = page.xpath(xpath)
            if len(lrecord) == 0:
                lRef.append([])
            else:
                for record in lrecord:
                    xpath = "./%s" % ("./@firstname")
                    lf = record.xpath(xpath)
                    xpath = "./%s" % ("./@lastname")
                    ln = record.xpath(xpath)
                    if len(lf) > 0:
                        lRef.append((pnum, key, lf[0], ln[0]))

        lPageMapping = {}
        lRun = []
        if RunData is not None:
            lpages = RunData.xpath('//%s' % ('PAGE[@number]'))
            for page in lpages:
                pnum = page.get('number')
                key = page.get('pagenum')
                #                 key= page.get('number')
                # only keep run pages that exist in the reference
                if key in lRefKeys.keys():
                    lPageMapping[key] = pnum

                    #record level!
                    xpath = "./%s" % ("RECORD[@firstname and @lastname]")
                    lrecord = page.xpath(xpath)
                    if len(lrecord) == 0:
                        pass
                    else:
                        for record in lrecord:
                            xpath = "./%s" % ("./@firstname")
                            lf = record.xpath(xpath)
                            xpath = "./%s" % ("./@lastname")
                            ln = record.xpath(xpath)
                            if len(
                                    lf
                            ) > 0:  # and lf[0].getContent() != ln[0].getContent():
                                lRun.append((pnum, key, lf[0], ln[0]))

        ltisRefsRunbErrbMiss = list()
        # NOTE(review): lRunPerPage is never defined in this method —
        # this loop raises NameError at runtime.
        for key in lRunPerPage:
            #         for key in ['Neuoetting_009_05_0150']:
            lRun = lRunPerPage[key]
            lRef = lRefKeys[key]
            runLen = len(lRunPerPage[key])
            refLen = len(lRefKeys[key])

            # put the shorter list on the rows of the cost matrix; bT records
            # whether ref/run were transposed
            bT = False
            if refLen <= runLen:
                rows = lRef
                cols = lRun
            else:
                rows = lRun
                cols = lRef
                bT = True
            cost_matrix = np.zeros((len(rows), len(cols)), dtype=float)
            for a, i in enumerate(rows):
                curRef = i
                for b, j in enumerate(cols):
                    runElt = j
                    ret, val = self.testCompareRecordField(curRef, runElt)
                    val /= 100
                    # cost = inverse similarity; 10 is the "no match" penalty
                    if val == 0:
                        dist = 10
                    else:
                        dist = 1 / val
                    cost_matrix[a, b] = dist
            m = linear_sum_assignment(cost_matrix)
            r1, r2 = m
            #             print (bT,r1,r2)
            #             print (list(x[2] for x in  rows))
            #             print (list(x[2] for x in cols))
            lcsTH = self.lcsTH / 100
            lCovered = []
            # NOTE(review): runElt below is stale — it holds whatever the last
            # cost-matrix iteration left, not the matched element
            for a, i in enumerate(r2):
                #                 print (key,a,r1[a],i,rows[r1[a]][2],cols[i][2], 1/cost_matrix[r1[a],i])
                if 1 / cost_matrix[r1[a], i] > lcsTH:
                    cntOk += 1
                    if bT:
                        ltisRefsRunbErrbMiss.append(
                            (runElt[1], int(runElt[0]), cols[i], rows[r1[a]],
                             False, False))
                    else:
                        ltisRefsRunbErrbMiss.append(
                            (runElt[1], int(runElt[0]), rows[r1[a]], cols[i],
                             False, False))
                else:
                    #too distant: false
                    if bT:
                        lCovered.append(i)
                        ltisRefsRunbErrbMiss.append(
                            (runElt[1], int(runElt[0]), "", rows[r1[a]], True,
                             False))
                    else:
                        lCovered.append(r1[a])
                        ltisRefsRunbErrbMiss.append((runElt[1], int(runElt[0]),
                                                     "", cols[i], True, False))

                    cntErr += 1
            # references never assigned by the Hungarian matching are misses
            for iref in r1:
                if iref not in r2:
                    ltisRefsRunbErrbMiss.append((runElt[1], int(runElt[0]),
                                                 lRef[iref], '', False, True))
                    cntMissed += 1
            for iref in lCovered:
                ltisRefsRunbErrbMiss.append(
                    (runElt[1], int(runElt[0]), lRef[iref], '', False, True))
                cntMissed += 1
        ltisRefsRunbErrbMiss.sort(key=lambda x: x[0])

        #         runLen = len(lRun)
        #         refLen = len(lRef)
        # #         bVisual = True
        #         ltisRefsRunbErrbMiss= list()
        #         lRefCovered = []
        #         for i in range(0,len(lRun)):
        #             iRef =  0
        #             bFound = False
        #             bErr , bMiss= False, False
        #             runElt = lRun[i]
        # #             print '\t\t===',runElt
        #             while not bFound and iRef <= refLen - 1:
        #                 curRef = lRef[iRef]
        #                 if runElt and curRef not in lRefCovered and self.testCompareRecordFirstNameLastName(curRef,runElt):
        #                     bFound = True
        #                     lRefCovered.append(curRef)
        #                 iRef+=1
        #             if bFound:
        #                 if bVisual:print("FOUND:", runElt, ' -- ', lRefCovered[-1])
        #                 cntOk += 1
        #             else:
        #                 curRef=''
        #                 cntErr += 1
        #                 bErr = True
        #                 if bVisual:print("ERROR:", runElt)
        #             if bFound or bErr:
        #                 ltisRefsRunbErrbMiss.append( (runElt[1],int(runElt[0]), curRef, runElt,bErr, bMiss) )
        #         for i,curRef in enumerate(lRef):
        #             if curRef not in lRefCovered:
        #                 if bVisual:print("MISSED:", curRef)
        #                 ltisRefsRunbErrbMiss.append( (curRef[1],int(lPageMapping[curRef[1]]), curRef, '',False, True) )
        #                 cntMissed+=1
        #
        #         ltisRefsRunbErrbMiss.sort(key=lambda x:x[0])

        return (cntOk, cntErr, cntMissed, ltisRefsRunbErrbMiss)

    def testRecordField(self, lfieldName, lfieldInRef, srefData, srunData,
                        bVisual):
        """
        Evaluate the given record fields against the reference, page by page,
        using a Hungarian assignment over (100 - LCS score) distances.

        Args:
            lfieldName: field attribute names expected in the run RECORDs.
            lfieldInRef: matching attribute names in the reference RECORDs;
                         None entries default to the run field name.
            srefData: reference XML (DOCUMENT/PAGE/RECORD) as a string.
            srunData: run XML as a string.
            bVisual: verbose flag (unused in the current code path).

        Returns:
            (cntOk, cntErr, cntMissed, ltisRefsRunbErrbMiss)
        """
        assert len(lfieldName) == len((lfieldInRef))

        # None in lfieldInRef means "same attribute name as in the run data"
        for i, f in enumerate(lfieldName):
            if lfieldInRef[i] is None: lfieldInRef[i] = f
        cntOk = cntErr = cntMissed = 0
        #         srefData = srefData.decode('utf-8')
        #.strip("\n")

        RefData = etree.XML(srefData.strip("\n").encode('utf-8'))
        RunData = etree.XML(srunData.strip("\n").encode('utf-8'))

        lPages = RefData.xpath('//%s' % ('PAGE[@number]'))
        # page key -> list of (pnum, key, field-values) reference records
        lRefKeys = {}
        for page in lPages:
            pnum = page.get('number')
            key = page.get('pagenum')
            xpath = "./%s" % ("RECORD")
            lrecord = page.xpath(xpath)
            if len(lrecord) == 0:
                # NOTE(review): lRef is never defined in this method —
                # a reference page with zero RECORDs raises NameError here
                lRef.append([])
            else:
                for record in lrecord:
                    lf = []
                    for fieldInRef in lfieldInRef:
                        xpath = "./%s" % ("./@%s" % fieldInRef)
                        ln = record.xpath(xpath)
                        if ln and len(ln[0]) > 0:
                            lf.append(ln[0])

                    if lf != []:
                        try:
                            #                         if (pnum,key,lf) in lRefKeys[key]:
                            #                             print ('duplicated',(pnum,key,lf))
                            #                         else:
                            #                             lRefKeys[key].append((pnum,key,lf))
                            lRefKeys[key].append((pnum, key, lf))
                        except KeyError:
                            lRefKeys[key] = [(pnum, key, lf)]

        # page key -> list of (pnum, key, field-values) run records
        lRunPerPage = {}
        lPageMapping = {}
        if RunData:
            lpages = RunData.xpath('//%s' % ('PAGE[@number]'))
            for page in lpages:
                pnum = page.get('number')
                key = page.get('pagenum')
                lPageMapping[key] = pnum
                if key in lRefKeys:
                    #record level!
                    xpath = "./%s" % ("RECORD")
                    lrecord = page.xpath(xpath)
                    if len(lrecord) == 0:
                        pass

    #                     lRun.append([])
                    else:
                        for record in lrecord:
                            lf = []
                            for fieldName in lfieldName:
                                xpath = "./%s" % ("./@%s" % fieldName)
                                ln = record.xpath(xpath)
                                if len(ln) > 0 and len(ln[0]) > 0:
                                    lf.append(ln[0])
                            # keep only records where every field is present
                            if len(lf) == len(lfieldName):
                                try:
                                    lRunPerPage[key].append((pnum, key, lf))
                                except KeyError:
                                    lRunPerPage[key] = [(pnum, key, lf)]

        ltisRefsRunbErrbMiss = list()
        for key in lRunPerPage:
            #         for key in ['Neuoetting_008_03_0032']:
            lRun = lRunPerPage[key]
            lRef = lRefKeys[key]
            runLen = len(lRunPerPage[key])
            refLen = len(lRefKeys[key])

            # put the shorter list on the rows of the cost matrix; bT records
            # whether ref/run were transposed
            bT = False
            if refLen <= runLen:
                rows = lRef
                cols = lRun
            else:
                rows = lRun
                cols = lRef
                bT = True
            cost_matrix = np.zeros((len(rows), len(cols)), dtype=float)
            for a, i in enumerate(rows):
                curRef = i
                for b, j in enumerate(cols):
                    runElt = j
                    ret, val = self.testCompareRecordField(curRef, runElt)
                    # val is the LCS similarity in [0, 100]
                    dist = 100 - val
                    cost_matrix[a, b] = dist
#                     print (curRef,runElt,val,dist)
            m = linear_sum_assignment(cost_matrix)
            r1, r2 = m
            if False:
                print(len(lRef), lRef)
                print(len(lRun), lRun)
                print(bT, r1, r2)
            lcsTH = self.lcsTH
            lCovered = []
            lMatched = []
            for a, i in enumerate(r2):
                #                 print (key,a,r1[a],i,rows[r1[a]][2],cols[i][2], 100-cost_matrix[r1[a],i])
                # NOTE(review): r1[a, ] indexes the ndarray with a 1-tuple —
                # equivalent to r1[a]
                if 100 - cost_matrix[r1[a, ], i] > lcsTH:
                    cntOk += 1
                    if bT:
                        ltisRefsRunbErrbMiss.append(
                            (rows[r1[a]][1], int(rows[r1[a]][0]), cols[i][2],
                             rows[r1[a]][2], False, False))
                        lMatched.append(i)
                    else:
                        ltisRefsRunbErrbMiss.append(
                            (cols[i][1], int(cols[i][0]), rows[r1[a]][2],
                             cols[i][2], False, False))
                        lMatched.append(r1[a])
                else:
                    #too distant: false
                    if bT:
                        lCovered.append(i)
                        ltisRefsRunbErrbMiss.append(
                            (rows[r1[a]][1], int(rows[r1[a]][0]), "",
                             rows[r1[a]][2], True, False))
                    else:
                        lCovered.append(r1[a])
                        ltisRefsRunbErrbMiss.append(
                            (cols[i][1], int(cols[i][0]), "", cols[i][2], True,
                             False))

                    cntErr += 1
#             print ('matched',lMatched)
            # every reference record not matched above is a miss
            for i, iref in enumerate(lRef):
                if i not in lMatched:
                    #                     print ('not mathced',i,iref)
                    ltisRefsRunbErrbMiss.append(
                        (lRef[i][1], int(lPageMapping[lRef[i][1]]), lRef[i][2],
                         '', False, True))
                    cntMissed += 1


#                 else:print('machtg!',i,lRef[i])

        ltisRefsRunbErrbMiss.sort(key=lambda x: x[0])

        #         for x in ltisRefsRunbErrbMiss:
        #             print (x)

        return (cntOk, cntErr, cntMissed, ltisRefsRunbErrbMiss)

    def testCompareRecordFirstNameLastName(self,
                                           refdata,
                                           rundata,
                                           bVisual=False):
        """
        Fuzzy-compare two (pnum, key, firstname, lastname) tuples via LCS over
        the concatenated, lower-cased names.

        Returns:
            False when the records are not on the same page, otherwise the
            result of matchLCS on the concatenated names.
        """
        # records must come from the same page
        if refdata[1] != rundata[1]: return False

        refall = refdata[2].lower() + refdata[3].lower()
        reflen = len(refdata[2]) + len(refdata[3])
        runall = rundata[2].lower() + rundata[3].lower()
        runlen = len(rundata[2]) + len(rundata[3])
        # BUGFIX: str.replace returns a new string — the results were being
        # discarded. Normalise the macron abbreviations (n̄ -> nn, m̄ -> mm).
        runall = runall.replace('n̄', 'nn')
        runall = runall.replace('m̄', 'mm')

        return matchLCS(0, (refall, reflen), (runall, runlen))

    def testCompareRecordField(self, refdata, rundata, bVisual=False):
        """
        Fuzzy-compare the field values of two (pnum, key, lvalues) records.

        Returns:
            the (bool, score) pair from matchLCS, or (False, 0) when the
            records are on different pages or either value list is empty.
        """
        # records must come from the same page
        if refdata[1] != rundata[1]: return False, 0
        if rundata[2] == []: return False, 0
        if refdata[2] == []: return False, 0
        runall = " ".join(rundata[2]).strip().lower()
        refall = " ".join(refdata[2]).strip().lower()

        # BUGFIX: removed the unreachable `return res, val` that followed this
        # return (res/val were undefined names anyway)
        return matchLCS(0, (refall, len(refall)), (runall, len(runall)))

    def testCompareFullRecord(self, refdata, rundata, bVisual=False):
        bOK = True
        for i, attr in enumerate(refdata):
            bOK = bOK and refdata[i] == rundata[i]
        return bOK

    ################ TEST ##################

    def createFakeData(self):
        """
            for testing purpose
  <PAGE number="1" pagenum="1" nbrecords="10" years="0">
    <RECORD lastname="Riesinger" firstname="Korona" role="Verstorbener" location="Neuhofen" occupation="" year="0" month="0" day="0"/>
    <RECORD lastname="Ringseisen" firstname="Georg" role="Verstorbener" location="Etzing" occupation="" year="0" month="0" day="0"/>
    <RECORD lastname="Nebel" firstname="Theresia" role="Verstorbener" location="Sandbach" occupation="" year="0" month="0" day="0"/>
    <RECORD lastname="Schlögl" firstname="Cäcilia" role="Verstorbener" location="Stampfing" occupation="" year="0" month="0" day="0"/>
    <RECORD lastname="Riedinger" firstname="Theresia" role="Verstorbener" location="Lapperding" occupation="Austragsbäuerin" year="0" month="0" day="0"/>
    <RECORD lastname="Wührer" firstname="Joseph" role="Verstorbener" location="Haizing" occupation="" year="0" month="0" day="0"/>
    <RECORD lastname="Wilmerdinger" firstname="Theresia" role="Verstorbener" location="Hausmanning" occupation="" year="0" month="0" day="0"/>
    <RECORD lastname="Ratzinger" firstname="Mathias" role="Verstorbener" location="Kafferding" occupation="Bauer" year="0" month="0" day="0"/>
    <RECORD lastname="Deixelberger" firstname="Joseph" role="Verstorbener" location="Gaishofen" occupation="Inwohner" year="0" month="0" day="0"/>
    <RECORD lastname="Beham" firstname="Martin" role="Verstorbener" location="Socking" occupation="Austragsbauer" year="0" month="0" day="0"/>
  </PAGE>            
                       
        """
        self.evalData = libxml2.newDoc('1.0')
        root = libxml2.newNode('DOCUMENT')
        self.evalData.setRootElement(root)
        domp = libxml2.newNode('PAGE')
        domp.setProp('number', '182')
        domp.setProp('nbrecords', 'None')
        domp.setProp('years', '1876;1877')

        root.addChild(domp)
        record = libxml2.newNode('RECORD')
        domp.addChild(record)
        record.setProp('lastname', 'Riedinger')
        record.setProp('firstname', 'Theresia')

        print(
            etree.tostring(self.evalData,
                           encoding='unicode',
                           pretty_print=True))
        return self.evalData

    def testRun(self, filename, outFile=None):
        """
        Run the component on *filename* and return a serialization of the
        result that is readable both by a human and by a program (nicely
        serialized Python data or XML).
        """
        self.evalData = None
        dom = self.loadDom(filename)
        self.run(dom)
        if outFile:
            self.writeDom(dom)
        # serialize the evaluation data as a unicode string
        sResult = etree.tostring(self.evalData,
                                 encoding='unicode',
                                 pretty_print=True)
        return sResult

    def testCompare(self, srefData, srunData, bVisual=False):
        """
        Our comparison is very simple: same or different. N
        We anyway return this in term of precision/recall
        If we want to compute the error differently, we must define out own testInit testRecord, testReport
        """
        # (task name, reference fields, alternative run fields) -- order
        # matters: it fixes the insertion order of the result dict
        lTaskSpec = [
            ('lastname', ['lastname'], [None]),
            ('firstname', ['firstname'], [None]),
            ('occupation', ['occupation'], [None]),
            ('location', ['location'], [None]),
            ('deathreason', ['deathreason'], [None]),
            ('names', ['firstname', 'lastname'], [None, None]),
            ('doktor', ['doktor'], ['helfer_name']),
            ('situation', ['situation'], ['family']),
        ]
        dicTestByTask = dict()
        for sTask, lFields, lAltFields in lTaskSpec:
            dicTestByTask[sTask] = self.testRecordField(lFields, lAltFields,
                                                        srefData, srunData,
                                                        bVisual)
        return dicTestByTask

    def testRecordHtml(self, filename, data, nOk, nErr, nMiss):
        """
        Write an HTML evaluation report for one file.

        filename: the evaluated file (used to derive the report path/titles)
        data: when nOk/nErr/nMiss are given, the ltisRefsRunbErrbMiss list of
              one task; when they are all None, a list of
              (taskName, nOk, nErr, nMiss, ltisRefsRunbErrbMiss) tuples
        nOk, nErr, nMiss: per-task counts, or all None for multi-task mode
        """
        if nOk == None:
            assert nErr == None and nMiss == None, "INTERNAL ERROR"
            #we are reporting on multiple tasks!!
            lltisRefsRunbErrbMiss = data  #this is a list of (taskName, nOk, nErr, nMiss, ltisRefsRunbErrbMiss)
        else:
            # single task: normalize to the multi-task shape
            lltisRefsRunbErrbMiss = [(None, nOk, nErr, nMiss, data)]

        #let's produce an HTML report!!
        sCollecDir = os.path.dirname(self.testDirXML)
        sCollec = os.path.basename(sCollecDir)
        sFile = os.path.basename(self.getRefFileName(filename))[:-4]
        sViewBaseUrl = "http://"  #+ sHttpHost

        fHtml = open(self.getHtmRunFileName(filename), "w", encoding='utf-8')

        # one CSS class per result category (used as <tr class="...">)
        sCss = """
<style type="text/css">
.OK {
color: green;
}
.Error {
color: red;
}
.Error\+Miss {
color: darkred;
}
.Miss {
color: orange;
}
</style>       
"""
        sRpt = self.makeHTMLReportHeader(sViewBaseUrl, "dla_pdf", sCss,
                                         sCollec + " - " + sFile,
                                         sCollec + " - " + sFile)

        fHtml.write(sRpt)

        #sRpt += " Doc Prec. Recall F1\t   nOk\t  nErr\t nMiss\tFilename\n"
        # one table per task
        for taskName, nOk, nErr, nMiss, ltisRefsRunbErrbMiss in lltisRefsRunbErrbMiss:
            if taskName == None: taskName = ""
            sRpt = """
            <hr/>
            <h2>%s</h2>
<table>
    <tr align="left">
        <th></th>
        <th>Page</th>
        <th>Reference</th>
        <th>Run</th>
        <th></th>
    </tr>
""" % taskName
            fHtml.write(sRpt)
            ipnum_prev = -1
            key_prev = -1
            for (key, ipnum, sRef, sRun, bErr, bMiss) in ltisRefsRunbErrbMiss:
                # classify the result row for CSS styling
                if bErr and bMiss:
                    sRptType = "Error+Miss"
                else:
                    if bErr:
                        sRptType = "Error"
                    elif bMiss:
                        sRptType = "Miss"
                    else:
                        sRptType = "OK"

                sPfFile = sCollec + "/" + sFile + "/" + "pf%06d" % ipnum

                srefenc = " ".join(x for x in sRef)
                srun = " ".join(x for x in sRun)
                #                 if ipnum > ipnum_prev: #a new page
                # new key: emit a header row for this page/key first
                if key != key_prev:
                    fHtml.write('<tr ><td>%s (%s)</td></tr>' % (key, ipnum))
                    fHtml.write(
                        '<tr class="%s"><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>\n'
                        % (
                            sRptType,
                            sRptType,
                            ""  #ipnum
                            ,
                            srefenc  #sRef
                            ,
                            srun  #sRun
                            #                                 , " - ".join(lsViews)
                        ))
                else:  #some more results for the same pafe
                    fHtml.write(
                        '<tr class="%s"><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>\n'
                        % (sRptType, sRptType, "", srefenc, srun, ""))
                ipnum_prev = ipnum
                key_prev = key

            fHtml.write('</table>')
            fHtml.write('<p/>')
            # per-task summary table (counts only)
            fHtml.write(
                self.genHtmlTableReport(sCollec, None,
                                        [(filename, nOk, nErr, nMiss, None)]))

        fHtml.write('<hr>')

        fHtml.close()

        return
# Exemplo n.º 16
# 0
class columnDetection(Component.Component):
    """
        build table column 
    """
    usage = ""
    version = "v.01"
    description = "description: column Detection"

    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self):
        """
        Always call first the Component constructor.
        """
        Component.Component.__init__(self, "columnDetection", self.usage,
                                     self.version, self.description)

        # collection name / document id (filled by setParams)
        self.colname = None
        self.docid = None

        # convert the input to DS format before processing?
        self.do2DS = False

        # minimal support threshold for the document-level column mining
        self.THHighSupport = 0.75
        # for --test
        self.bCreateRef = False
        # evaluation data built by --test runs
        self.evalData = None

    def setParams(self, dParams):
        """
        Always call first the Component setParams
        Here, we set our internal attribute according to a possibly specified value (otherwise it stays at its default value)
        """
        Component.Component.setParams(self, dParams)
        #         if dParams.has_key("coldir"):
        #             self.colname = dParams["coldir"]
        if "docid" in dParams:
            self.docid = dParams["docid"]
        if "dsconv" in dParams:
            self.do2DS = dParams["dsconv"]

        # FIX: guard like every other option -- a missing "mining" key used to
        # raise KeyError; default to False (no mining)
        self.bMining = dParams.get("mining", False)

        if "createref" in dParams:
            self.bCreateRef = dParams["createref"]

    def createTable(self, page):
        """
        Build one table object covering the bounding box of all the page's
        elements (todo: ignore O!).
        Returns the table, or None when the page holds no element.
        """
        xMin, yMin, xMax, yMax = self.getBounbingBox(page)
        if xMin is None:
            return None

        table = XMLDSTABLEClass()
        table.setX(xMin)
        table.setY(yMin)
        table.setWidth(xMax - xMin)
        table.setHeight(yMax - yMin)

        page.addObject(table)
        return table

    def processPage(self, page, emptyTable):
        """
        Segment the page along X (merging the left quarter of each text box)
        and add one column per resulting cut interval to emptyTable.

        page: page whose text objects are clustered
        emptyTable: table object (already positioned) receiving the columns
        """
        from util.XYcut import mergeSegments

        ### skrinking to be done: use center ?
        #         lCuts, _, _ = mergeSegments([(x.getX(),x.getX2(),x) for x in page.getAllNamedObjects(XMLDSTEXTClass)],0)
        # only the leftmost quarter of each text box contributes to the cuts
        lCuts, _, _ = mergeSegments(
            [(x.getX(), x.getX() + 0.25 * x.getWidth(), x)
             for x in page.getAllNamedObjects(XMLDSTEXTClass)], 0)
        #         lCuts, _, _ = mergeSegments([(x.getX()+0.5*x.getWidth()-0.25*x.getWidth(),x.getX()+0.5*x.getWidth()+0.25*x.getWidth(),x) for x in page.getAllNamedObjects(XMLDSTEXTClass)],0)

        for i, (x, _, cut) in enumerate(lCuts):
            ll = list(cut)
            ll.sort(key=lambda x: x.getY())
            #add column
            myCol = XMLDSTABLECOLUMNClass(i)
            myCol.setPage(page)
            myCol.setParent(emptyTable)
            emptyTable.addObject(myCol)
            myCol.setX(x)
            myCol.setY(emptyTable.getY())
            myCol.setHeight(emptyTable.getHeight())
            # column width: up to the next cut, or to the table's right edge
            if i + 1 < len(lCuts):
                myCol.setWidth(lCuts[i + 1][0] - x)
            else:  # use table
                myCol.setWidth(emptyTable.getX2() - x)
            emptyTable.addColumn(myCol)
            if not self.bMining:
                myCol.tagMe(ds_xml.sCOL)

    def getBounbingBox(self, page):
        """
        Return (x1, y1, x2, y2), the bounding box of all text objects of the
        page, or (None, None, None, None) for an empty page.
        (method name kept as-is: it is the public interface used by callers)
        """
        lElts = page.getAllNamedObjects(XMLDSTEXTClass)
        if not lElts:
            return None, None, None, None

        xMin = min(e.getX() for e in lElts)
        yMin = min(e.getY() for e in lElts)
        xMax = max(e.getX2() for e in lElts)
        yMax = max(e.getY2() for e in lElts)
        return xMin, yMin, xMax, yMax

    def findColumnsInDoc(self, lPages):
        """
        find columns for each table in ODoc

        Currently only builds horizontal/vertical text clusters per page;
        the table creation/processing below is disabled.
        """

        for page in lPages:
            traceln("page: %d" % page.getNumber())
            lch, lcv = self.mergeHorVerClusters(page)
#             table = self.createTable(page)
#             if table is not None:
# #                 table.tagMe(ds_xml.sTABLE)
#                 self.processPage(page,table)

    def createContourFromListOfElements(self, lElts):
        """
            create a polyline from a list of elements
            input : list of elements
            output: Polygon object (union of the elements' shapes), or None
                    when the union fails
        """
        from shapely.geometry import Polygon as pp
        from shapely.ops import cascaded_union

        lPolygons = []
        for elt in lElts:
            sPoints = elt.getAttribute('points')
            if sPoints is None:
                # no polyline stored: fall back to the element's bounding box
                x1, y1 = elt.getX(), elt.getY()
                x2, y2 = elt.getX2(), elt.getY2()
                lPolygons.append(pp([(x1, y1), (x1, y2), (x2, y2), (x2, y1)]))
            else:
                # "x1,y1,x2,y2,..." -> [(x1,y1), (x2,y2), ...]
                lCoords = iter(sPoints.split(','))
                lPolygons.append(
                    pp([(float(x), float(y))
                        for x, y in zip(*[lCoords] * 2)]))
        try:
            shape = cascaded_union(lPolygons)
        except ValueError:
            print(lElts, lPolygons)
            return None

        return shape  #list(ss.convex_hull.exterior.coords)

    def mergeHorVerClusters(self, page):
        """
            build Horizontal and vertical clusters

            Computes left/right and top/bottom neighbor links for every text
            object, then chains neighbors greedily into horizontal and
            vertical clusters and builds a contour for each.
            Returns (lHClusters, lVClusters), lists of (elements, contour).
        """
        from util import TwoDNeighbourhood as TwoDRel
        lTexts = page.getAllNamedObjects(XMLDSTEXTClass)

        # reset the per-element neighbor links
        for e in lTexts:
            e.lright = []
            e.lleft = []
            e.ltop = []
            e.lbottom = []
        # vertical neighbors (top/bottom links)
        lVEdge = TwoDRel.findVerticalNeighborEdges(lTexts)
        for a, b in lVEdge:
            a.lbottom.append(b)
            b.ltop.append(a)
        # horizontal neighbors: shrink the boxes slightly, rotate the page
        # -90 degrees and reuse the vertical-neighbor search
        for elt in lTexts:
            # dirty!
            elt.setHeight(max(5, elt.getHeight() - 3))
            elt.setWidth(max(5, elt.getWidth() - 3))
            TwoDRel.rotateMinus90degOLD(elt)
        lHEdge = TwoDRel.findVerticalNeighborEdges(lTexts)
        for elt in lTexts:
            #             elt.tagMe()
            TwoDRel.rotatePlus90degOLD(elt)
#         return
        for a, b in lHEdge:
            a.lright.append(b)
            b.lleft.append(a)
#         ss
        # order the links; ambiguous links (several candidates) are dropped
        for elt in lTexts:
            elt.lleft.sort(key=lambda x: x.getX(), reverse=True)
            #             elt.lright.sort(key = lambda x:x.getX())
            if len(elt.lright) > 1:
                elt.lright = []
            elt.lright.sort(key=lambda x: elt.signedRatioOverlapY(x),
                            reverse=True)
            #             print (elt, elt.getY(), elt.lright)
            elt.ltop.sort(key=lambda x: x.getY())
            if len(elt.lbottom) > 1:
                elt.lbottom = []
            elt.lbottom.sort(key=lambda x: elt.signedRatioOverlapX(x),
                             reverse=True)

        lHClusters = []
        # Horizontal: chain each uncovered text through its right neighbors
        lTexts.sort(key=lambda x: x.getX())
        lcovered = []
        for text in lTexts:
            if text not in lcovered:
                #                 print ('START :', text, text.getContent())
                lcovered.append(text)
                lcurRow = [text]
                curText = text
                while curText is not None:
                    try:
                        nextT = curText.lright[0]
                        #                         print ('\t',[(x,curText.signedRatioOverlapY(x)) for x in curText.lright])
                        if nextT not in lcovered:
                            lcurRow.append(nextT)
                            lcovered.append(nextT)
                        curText = nextT
                    except IndexError:
                        # no right neighbor: the chain ends here
                        curText = None
#                 lHClusters.append(lcurRow)
#                 print ("FINAL", list(map(lambda x:(x,x.getContent()),lcurRow)) )
                if len(lcurRow) > 0:
                    #                     # create a contour for visualization
                    #                     # order by col: get top and  bottom polylines for them
                    contour = self.createContourFromListOfElements(lcurRow)
                    lHClusters.append((lcurRow, contour))

        # Vertical: same chaining through the bottom neighbors
        lVClusters = []
        lTexts.sort(key=lambda x: x.getY())
        lcovered = []
        for text in lTexts:
            if text not in lcovered:
                #                 print ('START :', text, text.getContent())
                lcovered.append(text)
                lcurCol = [text]
                curText = text
                while curText is not None:
                    try:
                        nextT = curText.lbottom[0]
                        #                         print ('\t',[(x,curText.signedRatioOverlapY(x)) for x in curText.lright])
                        if nextT not in lcovered and len(nextT.lbottom) == 1:
                            lcurCol.append(nextT)
                            lcovered.append(nextT)
                        curText = nextT
                    except IndexError:
                        curText = None

# #                 print ("FINAL", list(map(lambda x:(x,x.getContent()),lcurCol)) )
                if len(lcurCol) > 0:
                    contour = self.createContourFromListOfElements(lcurCol)
                    lVClusters.append((lcurCol, contour))
                    # materialize the vertical cluster as a 'cc' page object
                    if contour:
                        #                         print (contour.bounds)
                        r = XMLDSObjectClass()
                        r.setName('cc')
                        r.setParent(page)
                        #                         r.addAttribute('points',spoints)
                        x1, y1, x2, y2 = contour.bounds
                        r.setXYHW(x1, y1, y2 - y1, x2 - x1)
                        page.addObject(r)
#                         r.tagMe('BLOCK')

        print(page.getAllNamedObjects('cc'))
        return lHClusters, lVClusters

    def documentMining(self, lPages):
        """
        Mine the column structure over the whole set of pages.
        (REGION nodes need to be cleaned up beforehand.)
        """
        miner = tableColumnMiner()
        miner.columnMining(lPages, self.THHighSupport, sTag=ds_xml.sCOL)

    def checkInputFormat(self, lPages):
        """
            delete regions : copy regions elements at page object
            unlink subnodes
        """
        # FIX: replaced list comprehensions used purely for their side
        # effects with plain loops (idiom; no behavior change)
        for page in lPages:
            lRegions = page.getAllNamedObjects("REGION")
            # collect the regions' children first, then re-attach them to the
            # page, then drop the (now redundant) regions from page and DOM
            lElts = []
            for region in lRegions:
                lElts.extend(region.getObjects())
            for elt in lElts:
                page.addObject(elt, bDom=True)
            for region in lRegions:
                page.removeObject(region, bDom=True)

    def run(self, doc):
        """
           load dom and find rows

           In --createref mode, convert if requested and return the reference
           document; otherwise normalize the input, detect columns, and
           optionally mine the whole document.
        """
        # convert to DS if needed
        if self.bCreateRef:
            if self.do2DS:
                dsconv = primaAnalysis()
                doc = dsconv.convert2DS(doc, self.docid)
            # FIX: removed the unreachable createRefPerPage call that
            # followed this return in the original
            return self.createRef(doc)

        if self.do2DS:
            dsconv = primaAnalysis()
            self.doc = dsconv.convert2DS(doc, self.docid)
        else:
            self.doc = doc
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(self.doc,
                              listPages=range(self.firstPage,
                                              self.lastPage + 1))
        self.lPages = self.ODoc.getPages()

        self.checkInputFormat(self.lPages)
        self.findColumnsInDoc(self.lPages)

        if self.bMining:
            self.documentMining(self.lPages)

        # FIX: removed a second bCreateRef branch here -- it was unreachable
        # (the mode returns at the top of the method)
        return self.doc

    ################ TEST ##################

    def testRun(self, filename, outFile=None):
        """
        evaluate using ABP new table dataset with tablecell
        """
        self.evalData = None
        dom = self.loadDom(filename)
        dom = self.run(dom)
        # the reference built from the processed document is the eval data
        self.evalData = self.createRef(dom)
        if outFile:
            self.writeDom(dom)
        sResult = etree.tostring(self.evalData,
                                 encoding='unicode',
                                 pretty_print=True)
        return sResult

    def overlapX(self, zone):
        """True when self and zone overlap on the horizontal axis."""
        selfLeft = self.getX()
        selfRight = selfLeft + self.getWidth()
        zoneLeft = zone.getX()
        zoneRight = zoneLeft + zone.getWidth()
        return min(selfRight, zoneRight) >= max(selfLeft, zoneLeft)

    def overlapY(self, zone):
        """True when self and zone overlap on the vertical axis."""
        selfTop = self.getY()
        selfBottom = selfTop + self.getHeight()
        zoneTop = zone.getY()
        zoneBottom = zoneTop + zone.getHeight()
        return min(selfBottom, zoneBottom) >= max(selfTop, zoneTop)

    def signedRatioOverlap(self, z1, z2):
        """
         overlap self and zone
         return surface of self in zone

         Returns the intersection area of z1 and z2 divided by z1's area
         (0.0 when there is no overlap).
         NOTE(review): the overlap test below uses self.overlapX/Y(z2), not
         z1 -- presumably z1 is always self at the call sites; confirm
         before reusing with z1 != self.
        """
        [x1, y1, h1, w1] = z1.getX(), z1.getY(), z1.getHeight(), z1.getWidth()
        [x2, y2, h2, w2] = z2.getX(), z2.getY(), z2.getHeight(), z2.getWidth()

        fOverlap = 0.0

        if self.overlapX(z2) and self.overlapY(z2):
            # corner coordinates of both rectangles
            [x11, y11, x12, y12] = [x1, y1, x1 + w1, y1 + h1]
            [x21, y21, x22, y22] = [x2, y2, x2 + w2, y2 + h2]

            # area of z1 (the denominator)
            s1 = w1 * h1

            # possible ?
            if s1 == 0: s1 = 1.0

            #intersection
            nx1 = max(x11, x21)
            nx2 = min(x12, x22)
            ny1 = max(y11, y21)
            ny2 = min(y12, y22)
            # NOTE(review): h below is a horizontal extent and w a vertical
            # one -- the names look swapped, but the product (area) is the
            # same either way.
            h = abs(nx2 - nx1)
            w = abs(ny2 - ny1)

            inter = h * w
            if inter > 0:
                fOverlap = inter / s1
            else:
                # if overX and Y this is not possible !
                fOverlap = 0.0

        return fOverlap

    def findSignificantOverlap(self, TOverlap, ref, run):
        """
        True when ref and run are on the same page and their overlap ratio
        reaches the TOverlap threshold.
        ref/run: (page, row) pairs.
        """
        refPage, refRow = ref
        runPage, runRow = run
        if refPage != runPage:
            return False
        return refRow.ratioOverlap(runRow) >= TOverlap

    def testCPOUM(self, TOverlap, srefData, srunData, bVisual=False):
        """
            TOverlap: Threshols used for comparing two surfaces


            Correct Detections:
            under and over segmentation?

            Greedily matches each run row to the first uncovered reference
            row with sufficient overlap on the same page; returns
            (cntOk, cntErr, cntMissed, ltisRefsRunbErrbMiss).
        """

        cntOk = cntErr = cntMissed = 0

        RefData = etree.XML(srefData.strip("\n").encode('utf-8'))
        RunData = etree.XML(srunData.strip("\n").encode('utf-8'))
        #         try:
        #             RunData = libxml2.parseMemory(srunData.strip("\n"), len(srunData.strip("\n")))
        #         except:
        #             RunData = None
        #             return (cntOk, cntErr, cntMissed)
        # collect (pagenumber, row) pairs from the run output
        lRun = []
        if RunData:
            lpages = RunData.xpath('//%s' % ('PAGE'))
            for page in lpages:
                pnum = page.get('number')
                #record level!
                lRows = page.xpath(".//%s" % ("ROW"))
                lORows = map(lambda x: XMLDSTABLECOLUMNClass(0, x), lRows)
                for row in lORows:
                    row.fromDom(row._domNode)
                    row.setIndex(row.getAttribute('id'))
                    lRun.append((pnum, row))
        print(lRun)

        # collect (pagenumber, row) pairs from the reference
        lRef = []
        lPages = RefData.xpath('//%s' % ('PAGE'))
        for page in lPages:
            pnum = page.get('number')
            lRows = page.xpath(".//%s" % ("ROW"))
            lORows = map(lambda x: XMLDSTABLECOLUMNClass(0, x), lRows)
            for row in lORows:
                row.fromDom(row._domNode)
                row.setIndex(row.getAttribute('id'))
                lRef.append((pnum, row))

        refLen = len(lRef)
        #         bVisual = True
        ltisRefsRunbErrbMiss = list()
        lRefCovered = []
        # greedy matching: first sufficiently-overlapping uncovered ref wins
        for i in range(0, len(lRun)):
            iRef = 0
            bFound = False
            bErr, bMiss = False, False
            runElt = lRun[i]
            #             print '\t\t===',runElt
            while not bFound and iRef <= refLen - 1:
                curRef = lRef[iRef]
                if runElt and curRef not in lRefCovered and self.findSignificantOverlap(
                        TOverlap, runElt, curRef):
                    bFound = True
                    lRefCovered.append(curRef)
                iRef += 1
            if bFound:
                if bVisual: print("FOUND:", runElt, ' -- ', lRefCovered[-1])
                cntOk += 1
            else:
                curRef = ''
                cntErr += 1
                bErr = True
                if bVisual: print("ERROR:", runElt)
            if bFound or bErr:
                ltisRefsRunbErrbMiss.append(
                    (int(runElt[0]), curRef, runElt, bErr, bMiss))

        # any reference row never covered by a run row is a miss
        for i, curRef in enumerate(lRef):
            if curRef not in lRefCovered:
                if bVisual: print("MISSED:", curRef)
                ltisRefsRunbErrbMiss.append(
                    (int(curRef[0]), curRef, '', False, True))
                cntMissed += 1
        ltisRefsRunbErrbMiss.sort(key=lambda xyztu: xyztu[0])

        #         print cntOk, cntErr, cntMissed,ltisRefsRunbErrbMiss
        return (cntOk, cntErr, cntMissed, ltisRefsRunbErrbMiss)

    def testCompare(self, srefData, srunData, bVisual=False):
        """
        as in Shahad et al, DAS 2010

        Correct Detections 
        Partial Detections 
        Over-Segmented 
        Under-Segmented 
        Missed        
        False Positive
                
        """
        # single task evaluated at a 0.50 overlap threshold
        dicTestByTask = {
            'T50': self.testCPOUM(0.50, srefData, srunData, bVisual),
        }
        return dicTestByTask

    def createColumnsWithCuts(self, lXCuts, table, tableNode, bTagDoc=False):
        """
            create column dom node

            lXCuts: x coordinates of the column separators (sorted in place)
            table: table object (provides y, height and right edge x2)
            tableNode: parent node receiving the COL elements
        """
        # FIX: an empty cut list used to crash on the trailing column
        # (prevCut None in the format, index unbound); nothing to create then
        if not lXCuts:
            return

        prevCut = None
        lXCuts.sort()
        for index, cut in enumerate(lXCuts):
            # first correspond to the table: no rpw
            if prevCut is not None:
                colNode = etree.Element("COL")
                tableNode.append(colNode)
                colNode.set('x', str(prevCut))
                colNode.set('width', "{:.2f}".format(cut - prevCut))
                colNode.set('y', str(table.getY()))
                colNode.set('height', str(table.getHeight()))
                colNode.set('id', str(index - 1))
            prevCut = cut

        #last column: from the last cut to the right edge of the table
        cut = table.getX2()
        colNode = etree.Element("COL")
        tableNode.append(colNode)
        colNode.set('x', "{:.2f}".format(prevCut))
        colNode.set('width', "{:.2f}".format(cut - prevCut))
        colNode.set('y', str(table.getY()))
        colNode.set('height', str(table.getHeight()))
        colNode.set('id', str(index))

    def createRef(self, doc):
        """
            create a ref file from the xml one

            One PAGE node per page; for each table, one COL per column index,
            the cuts being the leftmost x of the cells sharing that index.
            Returns the reference document as an lxml ElementTree.
        """
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(doc,
                              listPages=range(self.firstPage,
                                              self.lastPage + 1))

        root = etree.Element("DOCUMENT")
        refdoc = etree.ElementTree(root)

        for page in self.ODoc.getPages():
            #imageFilename="..\col\30275\S_Freyung_021_0001.jpg" width="977.52" height="780.0">
            pageNode = etree.Element('PAGE')
            pageNode.set("number", page.getAttribute('number'))
            pageNode.set("pagekey",
                         os.path.basename(page.getAttribute('imageFilename')))
            pageNode.set("width", page.getAttribute('width'))
            pageNode.set("height", page.getAttribute('height'))

            root.append(pageNode)
            lTables = page.getAllNamedObjects(XMLDSTABLEClass)
            for table in lTables:
                # group the table's cells by column index
                dCol = {}
                tableNode = etree.Element('TABLE')
                tableNode.set("x", table.getAttribute('x'))
                tableNode.set("y", table.getAttribute('y'))
                tableNode.set("width", table.getAttribute('width'))
                tableNode.set("height", table.getAttribute('height'))
                pageNode.append(tableNode)
                for cell in table.getAllNamedObjects(XMLDSTABLECELLClass):
                    try:
                        dCol[int(cell.getAttribute("col"))].append(cell)
                    except KeyError:
                        dCol[int(cell.getAttribute("col"))] = [cell]

                # one x-cut per column: the leftmost cell x of that column
                lXcuts = []
                for colid in sorted(dCol.keys()):
                    lXcuts.append(
                        min(list(map(lambda x: x.getX(), dCol[colid]))))
                self.createColumnsWithCuts(lXcuts, table, tableNode)

        return refdoc
# Exemplo n.º 17
# 0
class tableRowMiner(Component.Component):
    """
    tableRowMiner: a component that mines tables to find horizontal cuts
    (and hence candidate rows).
    """

    # version, usage and description of this particular component
    usage = ""
    version = "v.01"
    description = "description: table row miner "
    #--- INIT -------------------------------------------------------------------------------------------------------------
    def __init__(self):
        """
        Create the tableRowMiner component; the base Component constructor
        is always invoked first.
        """
        Component.Component.__init__(self, "tableRowMiner", self.usage,
                                     self.version, self.description)

        # threshold used when comparing numerical features (y positions)
        # NOTE(review): should probably scale with the text leading -- confirm
        self.THNUMERICAL = 25
        # tolerance used during evaluation
        self.THCOMP = 10
        # minimal support ratio for a feature to count as "high support"
        self.THHighSupport = 0.33
        # manual-pattern mode is off by default
        self.bManual = False
        # evaluation data (an XML tree) built later
        self.evalData = None

    def setParams(self, dParams):
        """
        Set component parameters.

        Always calls the base Component.setParams first, then reads the
        optional keys:
          - "pattern":       string evaluated into self.manualPattern;
                             also switches the component to manual mode
          - "thhighsupport": high-support threshold given as a percentage,
                             stored as a ratio in self.THHighSupport
        """
        Component.Component.setParams(self, dParams)
        if "pattern" in dParams:
            # SECURITY NOTE(review): eval() on an externally supplied string
            # executes arbitrary code -- acceptable only for trusted CLI input
            self.manualPattern = eval(dParams["pattern"])
            self.bManual = True
        if "thhighsupport" in dParams:
            # the value is given in percent; store it as a ratio
            self.THHighSupport = dParams["thhighsupport"] * 0.01

    def testHighSupport(self, sequences, th):
        """
            compute unigram support
        """
        # from mssp
        from collections import Counter
        import itertools

        sequence_count = len(sequences)

        flattened_sequences = [
            list(set(itertools.chain(*sequence))) for sequence in sequences
        ]
        support_counts = dict(
            Counter(item for flattened_sequence in flattened_sequences
                    for item in flattened_sequence))
        actual_supports = {
            item: support_counts.get(item) / float(sequence_count)
            for item in support_counts.keys()
        }
        #         lOneSupport= [k for k,v in actual_supports.iteritems() if v >= 0.5 ]
        lOneSupport = [k for k, v in actual_supports.items() if v >= th]
        #         print(actual_supports.items() )
        #         print (th,lOneSupport)
        return lOneSupport

    def createFeatureFromValue(self, elt, value, name):
        """
        Build a NUMERICAL featureObject called *name* holding float(value),
        attached to *elt* and using self.THNUMERICAL as comparison threshold.
        """
        fea = featureObject()
        fea.setName(name)
        fea.setType(featureObject.NUMERICAL)
        fea.setValue(float(value))
        fea.setTH(self.THNUMERICAL)
        fea.addNode(elt)
        # NOTE(review): setObjectName receives the element itself, not a
        # name string -- confirm this is intended
        fea.setObjectName(elt)
        return fea

    def columnMining(self, table, thnum=None, th=None, predefinedCuts=None):
        """
        Mine one table for horizontal cuts (candidate row separators).

        Each column is treated as an itemset whose items are the cells' y
        positions; y values shared by a high enough proportion of columns
        (support >= th) are returned as row-cut features.

        table          : XMLDSTABLEClass object to mine
        thnum          : numerical comparison threshold; overrides
                         self.THNUMERICAL when not None
        th             : minimal support ratio; defaults to
                         self.THHighSupport when None
        predefinedCuts : optional y values injected as extra 'y' features
        Returns the list of high-support features sorted by value.
        """
        # BUGFIX: thnum and th used to be mandatory, but run() calls
        # columnMining(table) with no thresholds, raising a TypeError.
        # Give them backward-compatible defaults.
        if thnum is not None:
            self.THNUMERICAL = thnum
        if th is None:
            th = self.THHighSupport
        # avoid the shared mutable default argument
        if predefinedCuts is None:
            predefinedCuts = []

        lElts = table.getColumns()
        for elt in lElts:
            elt.resetFeatures()
            elt.setFeatureFunction(elt.getSetOfListedAttributes,
                                   self.THNUMERICAL,
                                   lFeatureList=['y'],
                                   myLevel=XMLDSTABLECELLClass)
            elt.computeSetofFeatures()
            # inject the user-provided cuts as additional candidate features
            for prevFea in predefinedCuts:
                f = self.createFeatureFromValue(elt, round(prevFea), 'y')
                elt.addFeature(f)

        seqGen = sequenceMiner()
        seqGen.bDebug = False
        seqGen.setMaxSequenceLength(1)
        seqGen.setObjectLevel(XMLDSTABLECELLClass)
        for elt in lElts:
            elt.lFeatureForParsing = elt.getSetofFeatures()
            elt.lFeatureForParsing.sort(key=lambda x: x.getValue())

        lSortedFeatures = seqGen.featureGeneration(lElts, 2)
        lmaxSequence = seqGen.generateItemsets(lElts)
        lSeq, _ = seqGen.generateMSPSData(lmaxSequence,
                                          lSortedFeatures,
                                          mis=0.5)
        # keep only the y positions supported by enough columns
        lOneSupport = self.testHighSupport(lSeq, th)
        lOneSupport.sort(key=lambda x: x.getValue())
        return lOneSupport

#         lTerminalTemplates=[]
#         lCurList=lElts[:]
#         lCurList,lTerminalTemplates = self.mineLineFeature(seqGen,lCurList,lTerminalTemplates)
#         print lTerminalTemplates
#         return

    def mainMining(self, lPages):
        """
        Mine pages with incremental sequence length.

        Pass 1: per page, drop tiny cells and link each remaining cell to
        its horizontal neighbours (found by rotating 90 degrees and running
        a vertical-neighbour search).
        Pass 2: per page, compute y-features, mine line patterns and store
        the resulting vertical templates/separators on the page.
        """
        import util.TwoDNeighbourhood as TwoDRel

        lLElts = [[] for _ in range(len(lPages))]
        for i, page in enumerate(lPages):
            lElts = page.getAllNamedObjects(XMLDSTABLECELLClass)
            for e in lElts:
                e.lnext = []
            # BUGFIX: filter() returns an iterator in Python 3, which has no
            # .sort(); materialize the filtered elements as a list.
            # (min(h, w) > 10 is equivalent to h > 10 and w > 10, so the
            # original second filter was redundant.)
            lElts = [x for x in lElts
                     if x.getHeight() > 10 and x.getWidth() > 10]
            lElts.sort(key=lambda x: x.getY())
            lLElts[i] = lElts
            # rotate by 90 degrees so a vertical-neighbour search finds
            # horizontal neighbours, then rotate back
            for elt in lElts:
                TwoDRel.rotateMinus90deg(elt)
            lHEdge = TwoDRel.findVerticalNeighborEdges(lElts)
            for elt in lElts:
                TwoDRel.rotatePlus90deg(elt)
            for a, b in lHEdge:
                a.lnext.append(b)

        for i, page in enumerate(lPages):
            lElts = lLElts[i]
            for elt in lElts:
                elt.setFeatureFunction(elt.getSetOfListedAttributes,
                                       self.THNUMERICAL,
                                       lFeatureList=['y'],
                                       myLevel=XMLDSTABLECELLClass)
                elt.computeSetofFeatures()
            seqGen = sequenceMiner()
            seqGen.setMaxSequenceLength(1)
            # SDC is related to the noise level and structures (many columns)
            seqGen.setSDC(0.7)
            # more at token level, but at text level: freq=2
            _ = seqGen.featureGeneration(lElts, 2)
            seqGen.setObjectLevel(XMLDSTABLECELLClass)

            for elt in lElts:
                elt.lFeatureForParsing = elt.getSetofFeatures()

            lTerminalTemplates = []
            lCurList = lElts[:]
            lCurList, lTerminalTemplates = self.mineLineFeature(
                seqGen, lCurList, lTerminalTemplates)
            # store the mined templates and their cuts on the page
            for mytemplate in lTerminalTemplates:
                page.addVerticalTemplate(mytemplate)
                page.addVSeparator(mytemplate, mytemplate.getPattern())
            del seqGen

    def mineLineFeature(self, seqGen, lCurList, lTerminalTemplates):
        """
        Run one mining iteration over a set of line elements.

        seqGen             : the sequenceMiner to use
        lCurList           : current list of elements to mine
        lTerminalTemplates : templates found so far (fed back as features)

        Returns the (unchanged) element list and the new list of terminal
        templates produced by testTreeKleeneageTemplates.
        """
        # mine unigram sequences only
        seqGen.setMinSequenceLength(1)
        seqGen.setMaxSequenceLength(1)

        print('***' * 20)

        seqGen.bDebug = False
        # make sure every element carries a feature set; elements without
        # one get a 'virtual' feature so they still take part in the mining
        for elt in lCurList:
            if elt.getSetofFeatures() is None:
                elt.resetFeatures()
                elt.setFeatureFunction(elt.getSetOfListedAttributes,
                                       self.THNUMERICAL,
                                       lFeatureList=['virtual'],
                                       myLevel=XMLDSTABLECELLClass)
                elt.computeSetofFeatures()
                elt.lFeatureForParsing = elt.getSetofFeatures()
            else:
                elt.setSequenceOfFeatures(elt.lFeatureForParsing)
        lSortedFeatures = seqGen.featureGeneration(lCurList, 2)
        # weight each canonical feature by the number of elements carrying it
        for cf in lSortedFeatures:
            cf.setWeight(len((cf.getNodes())))

        lmaxSequence = seqGen.generateItemsets(lCurList)
        seqGen.bDebug = False

        # previously found templates are injected as candidate features
        lSeq, _ = seqGen.generateMSPSData(lmaxSequence,
                                          lSortedFeatures + lTerminalTemplates,
                                          mis=0.01)
        lPatterns = seqGen.miningSequencePrefixScan(lSeq)
        if lPatterns is None:
            return lCurList, lTerminalTemplates

        # NOTE(review): this sorts by the pattern itself (xy[0]), not by the
        # support value (xy[1]) -- confirm this is intended
        lPatterns.sort(key=lambda xy: xy[0], reverse=True)

        print("List of patterns and their support:")
        for p, support in lPatterns:
            if support >= 1:
                print(p, support)
        seqGen.THRULES = 0.95
        # NOTE(review): lSeqRules is computed but never used afterwards
        lSeqRules = seqGen.generateSequentialRules(lPatterns)

        " here store features which are redundant and consider only the core feature"

        # build candidate templates and keep only the terminal ones
        # NOTE(review): lFullTemplates and tranprob are discarded here
        dTemplatesCnd = self.analyzeListOfPatterns(lPatterns, {})
        lFullTemplates, lTerminalTemplates, tranprob = seqGen.testTreeKleeneageTemplates(
            dTemplatesCnd, lCurList, iterMax=40)

        return lCurList, lTerminalTemplates

    def selectFinalTemplates(self, lTemplates, transProb, lElts):
        """
        Apply Viterbi decoding to select the best sequence of templates
        for the elements.

        lTemplates : candidate templates (a 'no template' state is appended)
        transProb  : transition probability matrix between templates
        lElts      : elements to which the selected templates are assigned

        Returns the Viterbi score, or None when lTemplates is empty.
        """
        import spm.viterbi as viterbi

        if lTemplates == []:
            return None

        def buildObs(lTemplates, lElts):
            """
            Build the observation probability matrix: one row per template
            (plus one 'no template' row), one column per element, filled
            with the registration score and normalized by its maximum.
            """
            N = len(lTemplates) + 1
            # small epsilon avoids zero probabilities
            obs = np.zeros((N, len(lElts)), dtype=np.float16) + 10e-3
            for i, temp in enumerate(lTemplates):
                for j, elt in enumerate(lElts):
                    # how to deal with virtual nodes
                    # NOTE(review): a bare except defaults the score to 1
                    # when registration fails -- confirm this is wanted
                    try:
                        _, _, score = temp.registration(elt)
                    except:
                        score = 1
                    if score == -1:
                        score = 0.0
                    obs[i, j] = score
                    # clamp non-finite values produced by registration
                    if np.isinf(obs[i, j]):
                        obs[i, j] = 64000
                    if np.isnan(obs[i, j]):
                        obs[i, j] = 0.0
            return obs / np.amax(obs)

        N = len(lTemplates) + 1

        # uniform initial probabilities
        initialProb = np.ones(N)
        initialProb = np.reshape(initialProb, (N, 1))
        obs = buildObs(lTemplates, lElts)

        np.set_printoptions(precision=3, linewidth=1000)

        d = viterbi.Decoder(initialProb, transProb, obs)
        states, score = d.Decode(np.arange(len(lElts)))

        # add empty template (last state index maps to 'no template')
        lTemplates.append(None)
        print(states, score)

        # assign to each elt the template selected by viterbi
        for i, elt, in enumerate(lElts):
            mytemplate = lTemplates[states[i]]
            elt.resetTemplate()
            if mytemplate is not None:
                elt.addTemplate(mytemplate)
                try:
                    registeredPoints, lMissing, score = mytemplate.registration(
                        elt)
                except:
                    registeredPoints = None
                if registeredPoints:
                    if lMissing != []:
                        registeredPoints.extend(zip(lMissing, lMissing))
                    registeredPoints.sort(key=lambda xy: xy[0].getValue())
                    # NOTE(review): lcuts is a lazy map and is never
                    # consumed (the follow-up addVSeparator is commented
                    # out) -- dead code kept as in the original
                    lcuts = map(lambda refcut: refcut[1], registeredPoints)

        return score

    def getKleenePlusFeatures(self, lElts):
        """
            select KleenePlus elements based on .next (only possible for unigrams)
        """
        dFreqFeatures = {}
        dKleenePlusFeatures = {}

        lKleenePlus = []
        for elt in lElts:
            for fea in elt.getSetofFeatures():
                try:
                    dFreqFeatures[fea] += 1
                except KeyError:
                    dFreqFeatures[fea] = 1
                for nextE in elt.lnext:
                    if fea in nextE.getSetofFeatures():
                        try:
                            dKleenePlusFeatures[fea].append((elt, nextE))
                        except KeyError:
                            dKleenePlusFeatures[fea] = [(elt, nextE)]
        for fea in dFreqFeatures:
            try:
                dKleenePlusFeatures[fea]
                if len(dKleenePlusFeatures[fea]) >= 0.5 * dFreqFeatures[fea]:
                    lKleenePlus.append(fea)
            except:
                pass
        return lKleenePlus

    def computePatternScore(self, pattern):
        """
            consider the frequency of the pattern and the weights of the features
        """
        fScore = 0
        #terminal
        if not isinstance(pattern, list):
            fScore += pattern.getCanonical().getWeight()
        else:
            for child in pattern:
                fScore += self.computePatternScore(child)
#         print 'score:',pattern ,fScore
        return fScore

    def analyzeListOfPatterns(
        self,
        lPatterns,
        dCA,
    ):
        """
            select patterns with no ancestor
            other criteria ?
            
            
            if many with similar frequency:  sort using  computePatternScore?
        """
        # reorder lPatterns considering feature weights and # of elements (for equally frequent patterns)
        #         lPatterns.sort(key=lambda (x,y):self.computePatternScore(x),reverse=True)
        #         for x,y in lPatterns:
        #             print x,y,self.computePatternScore(x)

        dTemplatesTypes = {}
        for pattern, support in filter(lambda xy: xy[1] > 1, lPatterns):
            try:
                dCA[str(pattern)]
                bSkip = True
            except KeyError:
                bSkip = False
            #             if step > 0 and len(pattern) == 1:
            #                 bSkip=True
            if not bSkip:
                template = treeTemplateClass()
                template.setPattern(pattern)
                template.buildTreeFromPattern(pattern)
                template.setType('lineTemplate')
                try:
                    dTemplatesTypes[template.__class__.__name__].append(
                        (pattern, support, template))
                except KeyError:
                    dTemplatesTypes[template.__class__.__name__] = [
                        (pattern, support, template)
                    ]

        return dTemplatesTypes

    def processWithTemplate(self, lPattern, lPages):
        """
        Apply a user-supplied (manual) pattern to a sequence of pages and
        tag the matched cuts as regions.

        lPattern : list of itemsets of x values (the manual pattern)
        lPages   : pages to process
        Returns 1.
        """
        # turn the raw numeric pattern into feature objects
        lfPattern = []
        for itemset in lPattern:
            fItemset = []
            for item in itemset:
                f = featureObject()
                f.setName("x")
                f.setType(featureObject.NUMERICAL)
                f.setValue(item)
                f.setTH(self.THNUMERICAL)
                fItemset.append(f)
            lfPattern.append(fItemset)

        pattern = lfPattern

        print(pattern)

        ### in prod: mytemplate given by page.getVerticalTemplates()
        mytemplate = treeTemplateClass()
        mytemplate.setPattern(pattern[0])

        # registration provides the best matching; from the matched points
        # select the final cuts
        for i, p in enumerate(lPages):
            p.lFeatureForParsing = p.lf_XCut
            sys.stdout.flush()
            registeredPoints1, lMissing1, score1 = mytemplate.registration(p)
            if score1 >= 0:
                # BUGFIX: materialize the cuts as a list; the lazy map()
                # object stored by addVSeparator would later break
                # tagAsRegion, which sorts the stored separators.
                lfinalCuts = [xy[1] for xy in registeredPoints1
                              if xy[0] != 'EMPTY']
                print(p, 'final1:', lfinalCuts, lMissing1)
                p.addVerticalTemplate(mytemplate)
                p.addVSeparator(mytemplate, lfinalCuts)
            else:
                print('NO REGISTRATION')

        self.tagAsRegion(lPages)

        return 1

    def tagAsRegion(self, lPages):
        """
            create regions
        """
        for page in lPages:
            if page.getNode():
                # if several template ???
                for template in page.getVerticalTemplates():
                    page.getdVSeparator(template).sort(
                        key=lambda x: x.getValue())
                    print(page.getdVSeparator(template))
                    page.getNode().setProp('template',
                                           str(page.getdVSeparator(template)))
                    XMinus = 1
                    prevcut = 10
                    #                     print page, page.getdVSeparator(template)
                    for cut in page.getdVSeparator(template):
                        cellNode = etree.Element('REGION')
                        cellNode.set("y", str(prevcut))
                        ## it is better to avoid
                        YMinus = 10
                        cellNode.set("x", str(XMinus))
                        cellNode.set("width",
                                     str(page.getWidth() - 2 * YMinus))
                        cellNode.set("height", str(cut.getValue() - prevcut))
                        page.getNode().addChild(cellNode)
                        prevcut = cut.getValue()

    def generateTestOutput(self, lPages):
        """
            create a run XML file
        """
        root = etree.Element('DOCUMENT')
        self.evalData = etree.ElementTree(root)
        for page in lPages:
            domp = etree.Element('PAGE')
            domp.set('number', page.getAttribute('number'))
            root.addChild(domp)
            for sep in page.lVSeparator:
                print(page.lVSeparator)
                domsep = etree.Element('SeparatorRegion')
                domp.append(domsep)
                #                 domsep.setProp('x', str(sep[0].getValue()))
                domsep.set('x', str(sep[0]))

    #--- RUN ---------------------------------------------------------------------------------------------------------------
    def run(self, doc):
        """
        Entry point: load the DS document and mine every table of every
        selected page for row cuts, or apply the manual pattern when
        bManual is set.

        doc : the input DOM document
        Returns the (annotated) input document.
        """
        self.doc = doc
        # lite DS representation of the selected page range
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(
            self.doc,
            listPages=range(self.firstPage, self.lastPage + 1))
        self.lPages = self.ODoc.getPages()

        if self.bManual:
            self.processWithTemplate(self.manualPattern, self.lPages)
        else:
            # NOTE(review): columnMining is called with a single argument
            # here -- verify its signature provides defaults for the others
            for page in self.lPages:
                print("page")
                for table in page.getAllNamedObjects(XMLDSTABLEClass):
                    self.columnMining(table)

        self.addTagProcessToMetadata(self.doc)
        return self.doc