Exemplo n.º 1
0
    def run(self, doc):
        """
        Load the DOM and find the columns of the selected pages.

        In reference mode (``self.bCreateRef``) no analysis is done: the
        document, converted to DS first when ``self.do2DS`` is set, is handed
        to ``self.createRef`` and that result is returned.  ``self.doc`` is
        left untouched in that mode.

        Otherwise the pages ``self.firstPage`` .. ``self.lastPage``
        (inclusive) are loaded into ``self.ODoc``, checked with
        ``checkInputFormat``, passed to ``findColumnsInDoc`` and, when
        ``self.bMining`` is set, to ``documentMining``.

        Args:
            doc: the input document DOM.

        Returns:
            The result of ``self.createRef(doc)`` in reference mode,
            otherwise the (possibly converted) document ``self.doc``.
        """
        # convert to DS if needed
        if self.do2DS:
            doc = primaAnalysis().convert2DS(doc, self.docid)

        # Reference mode short-circuits everything else.  The statements that
        # used to follow this return (a per-page variant calling
        # createRefPerPage, and a second bCreateRef test further down) could
        # never execute and have been removed.
        if self.bCreateRef:
            return self.createRef(doc)

        self.doc = doc
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(self.doc,
                              listPages=range(self.firstPage,
                                              self.lastPage + 1))
        self.lPages = self.ODoc.getPages()

        self.checkInputFormat(self.lPages)
        self.findColumnsInDoc(self.lPages)

        if self.bMining:
            self.documentMining(self.lPages)

        return self.doc
Exemplo n.º 2
0
    def run(self, doc):
        """
        Extract records from every table found on the selected pages.

        Main open issue: how to select the template (to be done by CVL),
        assuming IE and HTR info are stored in the template.

        ``self.recordType`` selects the record class and the template
        handler: ``'D'`` (death), ``'W'`` (wedding) or ``'B'`` (birth).  Any
        other value is reported on stdout and the method returns without
        doing anything; previously it carried on and failed later with an
        ``UnboundLocalError`` because no record had been created.

        Args:
            doc: the input document DOM; converted to DS first when
                ``self.page2DS`` is set.

        Returns:
            None.  The outcome is stored in ``self.evalData`` via
            ``record.generateOutput``.
        """
        # Fail fast: nothing below can work without a record and a handler.
        if self.recordType not in ('D', 'W', 'B'):
            print(f'record type not supported: {self.recordType}')
            return

        if self.page2DS:
            dsconv = primaAnalysis()
            self.doc = dsconv.convert2DS(doc, self.docid)
        else:
            self.doc = doc
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(self.doc,
                              listPages=range(self.firstPage,
                                              self.lastPage + 1))

        self.lPages = self.ODoc.getPages()
        if self.recordType == 'D':
            record = deathRecord(self.sModelName, self.sModelDir)
            processTable = self.processDeathWithTemplate
        elif self.recordType == 'W':
            record = weddingRecord(self.sModelName, self.sModelDir)
            processTable = self.processWeddingWithTemplate
        else:  # 'B', guaranteed by the check at the top
            record = birthRecord(self.sModelName, self.sModelDir)
            processTable = self.processBirthWithTemplate

        for page in self.lPages:
            print("page: ", page.getNumber())
            for table in page.getAllNamedObjects(XMLDSTABLEClass):
                # tables with fewer than two rows are skipped
                if table.getNbRows() < 2:
                    if self.bDebug:
                        print("page: %s : not a table? %d/%d" %
                              (page.getNumber(), table.getNbRows(),
                               table.getNbColumns()))
                    continue
                if not self.BuseStoredTemplate:
                    self.mineTable(table, record)
                    continue
                if self.bDebug:
                    print("page: %s : table %d/%d" %
                          (page.getNumber(), table.getNbRows(),
                           table.getNbColumns()))
                processTable(table, record)

        self.evalData = record.generateOutput(self.evalData)
Exemplo n.º 3
0
    def run(self, doc):
        """
        Load the DOM, find the columns and optionally write PageXml back.

        In reference mode (``self.bCreateRef``) the document, converted to DS
        first when ``self.do2DS`` is set, is handed to ``self.createRef`` and
        that result is returned; ``self.doc`` is left untouched.

        Otherwise the pages ``self.firstPage`` .. ``self.lastPage``
        (inclusive) are loaded into ``self.ODoc`` and passed to
        ``findColumnsInDoc``.  When ``self.do2DS`` is set the result is
        converted back with ``DS2PageXMLConvertor`` and stored under
        ``self.getOutputFileName()``.

        Args:
            doc: the input document DOM.

        Returns:
            The result of ``self.createRef(doc)`` in reference mode; None
            when the result was written back as multi-page PageXml;
            otherwise ``self.doc``.
        """
        # convert to DS if needed
        if self.do2DS:
            doc = primaAnalysis().convert2DS(doc, self.docid)

        # Reference mode short-circuits everything else.  The per-page
        # variant (createRefPerPage) that used to follow this return was
        # unreachable and has been removed.
        if self.bCreateRef:
            return self.createRef(doc)

        self.doc = doc
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(self.doc,
                              listPages=range(self.firstPage,
                                              self.lastPage + 1))

        self.findColumnsInDoc(self.ODoc)
        # The return value was never used.  The call is kept because it is
        # not visible here whether createRef has side effects -- TODO confirm
        # and drop if it has none.
        self.createRef(self.doc)

        if self.do2DS:
            # back to PageXml
            conv = DS2PageXMLConvertor()
            lPageXDoc = conv.run(self.doc)
            conv.storeMultiPageXml(lPageXDoc, self.getOutputFileName())
            return None
        return self.doc
Exemplo n.º 4
0
    def run(self, doc):
        """
        Extract death records from every table found on the selected pages.

        Main open issue: how to select the template (to be done by CVL),
        assuming IE and HTR info are stored in the template.

        Each table with at least two rows is either processed with the
        stored template (``self.BuseStoredTemplate``) or mined with
        ``mineTable``.  A failure while applying the template to one table is
        reported on stdout and does not stop the remaining tables.

        Args:
            doc: the input document DOM; converted to DS first when
                ``self.page2DS`` is set.

        Returns:
            None.  The outcome is stored in ``self.evalData`` via
            ``dr.generateOutput``.
        """
        if self.page2DS:
            dsconv = primaAnalysis()
            self.doc = dsconv.convert2DS(doc, self.docid)
        else:
            self.doc = doc
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(self.doc,
                              listPages=range(self.firstPage,
                                              self.lastPage + 1))

        self.lPages = self.ODoc.getPages()
        dr = deathRecord(self.sModelName, self.sModelDir)

        for page in self.lPages:
            print("page: ", page.getNumber())
            for table in page.getAllNamedObjects(XMLDSTABLEClass):
                # tables with fewer than two rows are skipped
                if table.getNbRows() < 2:
                    print("page: %s : not a table? %d/%d" %
                          (page.getNumber(), table.getNbRows(),
                           table.getNbColumns()))
                    continue
                if not self.BuseStoredTemplate:
                    self.mineTable(table, dr)
                    continue
                try:
                    self.processWithTemplate(table, dr)
                except Exception:
                    # Best effort per table.  Narrowed from a bare
                    # ``except:`` so that KeyboardInterrupt and SystemExit
                    # are no longer swallowed.
                    print('issue with page %s' % page)

        self.evalData = dr.generateOutput(self.evalData)
Exemplo n.º 5
0
    def processDocument(self, coldir, colid, docid, dom=None):
        """
        Process a single document.

        Only the first stage is currently executed: a Transkribus client is
        created and logged in, the multi-page PageXml is obtained (from
        ``dom`` or from disk), its pages are counted and ``applyLA_URO`` is
        called for the document.

        The author's notes describe the complete intended workflow as the
        following command sequence:

            1 python ../../src/xml_formats/PageXml.py trnskrbs_5400/col/17442 --ext=pxml
            2 python ../../src/tasks/performCVLLA.py  --coldir=trnskrbs_5400/  --docid=17442 -i trnskrbs_5400/col/17442.mpxml  --bl --regTL --form
            3 python ../../src/tasks/DU_ABPTable_T.py modelMultiType tableRow2 --run=trnskrbs_5400
            4 python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442_du.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
            5 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml
            6 python ../../../TranskribusPyClient/src/TranskribusCommands/TranskribusDU_transcriptUploader.py --nodu trnskrbs_5400 5400 17442
            7 python ../../../TranskribusPyClient/src/TranskribusCommands/do_htrRnn.py  <model-name> <dictionary-name> 5400 17442

            wait
            8  python ../../../TranskribusPyClient/src/TranskribusCommands/Transkribus_downloader.py 5400 --force
             # convert to ds
            9  python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
            10 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml  --doie --usetemplate

        Args:
            coldir: collection folder.  When ``dom`` is None the input file
                is ``coldir/TableProcessing.sCOL/`` followed by ``docid`` and
                ``TableProcessing.sMPXMLExtension``.
            colid: collection id, forwarded to ``applyLA_URO``.
            docid: document id; concatenated with the file extension, so it
                is expected to be a string.
            dom: optional already-loaded multi-page PageXml document; when
                given, nothing is read from disk.

        Returns:
            None.
        """
        # create Transkribus client
        self.myTrKCient = TranskribusClient(sServerUrl=self.server,
                                            proxies={},
                                            loggingLevel=logging.WARN)
        # login
        self.login(self.myTrKCient, trace=trace, traceln=traceln)

        # load dom
        if dom is None:
            self.inputFileName = os.path.abspath(
                os.path.join(coldir, TableProcessing.sCOL,
                             docid + TableProcessing.sMPXMLExtension))
            # presumably loadDom() reads self.inputFileName -- TODO confirm
            mpxml_doc = self.loadDom()
        else:
            # use the provided mpxml
            mpxml_doc = dom
        nbPages = MultiPageXml.getNBPages(mpxml_doc)

        self.applyLA_URO(colid, docid, nbPages)

        # NOTE: the method has always returned here.  The stages that used to
        # follow (waiting for the job, re-downloading the collection, BIES
        # row tagging, DS conversion, row detection, PageXml write-back,
        # upload and HTR) were unreachable behind this return and have been
        # removed; recover them from version control when the workflow is
        # resumed.  The job-wait loops in that code polled getJobStatus with
        # no delay between requests and would need one.
        return