예제 #1
0
 def storeMultiPageXml(self,lListDocs,outputFileName=None):
     """
     Merge per-page PageXml documents into one multi-page document and
     write it to disk.

     lListDocs: iterable of indexable entries; only element [0] of each
         entry (the page document) is passed on to the merge.
     outputFileName: destination path. When None, the path is derived from
         self.inputFileName: the sibling "col" directory of the input
         file's directory, and the input base name with its last 7
         characters removed plus "_du.mpxml".
         NOTE(review): the 7-character cut presumably strips a suffix such
         as ".ds.xml" / ".ds_xml" -- confirm against callers.

     Returns None; the result of the underlying write() is not propagated.
     """
     from xml_formats.PageXml import MultiPageXml
     mp = MultiPageXml()
     lPageDocs = [entry[0] for entry in lListDocs]
     newDoc = mp.makeMultiPageXmlMemory(lPageDocs)
     if outputFileName is None:
         sInputDir = os.path.dirname(self.inputFileName)
         sBaseName = os.path.basename(self.inputFileName)[:-7]
         # os.path.join rather than "+ os.sep +" concatenation: when the
         # input name has no directory part, dirname() is "" and plain
         # concatenation would produce a path starting with os.sep, i.e.
         # rooted at the filesystem root instead of the working directory.
         outputFileName = os.path.join(sInputDir, "..", "col",
                                       sBaseName + "_du.mpxml")
     newDoc.write(outputFileName, encoding="UTF-8", pretty_print=True,
                  xml_declaration=True)
예제 #2
0
def test_DS2PageXmlConversion():
    """
    Convert the RRB_MM_01_033_Jahr_1810 DS sample to a multi-page PageXml file.

    The DS XML is loaded from the tests directory, run through the
    DS-to-PageXml convertor, and the resulting page documents are merged
    and written as an .mpxml file in the same sample folder.
    No assertion is made: the test passes as long as nothing raises.
    """
    sInputPath = os.path.join(
        sTESTS_DIR, 'testDS2PageXml/RRB_MM_01_033_Jahr_1810.ds.xml')
    sOutputPath = os.path.join(
        sTESTS_DIR, "testDS2PageXml/RRB_MM_01_033_Jahr_1810.mpxml")

    convertor = DS2PageXMLConvertor()
    convertor.inputFileName = sInputPath
    lPagePairs = convertor.run(convertor.loadDom(sInputPath))

    multiPage = MultiPageXml()
    # Each entry is a pair; only its first member (the page document)
    # takes part in the merge.
    lPages = []
    for pageDoc, _ignored in lPagePairs:
        lPages.append(pageDoc)

    mergedDoc = multiPage.makeMultiPageXmlMemory(lPages)
    mergedDoc.write(sOutputPath,
                    encoding="UTF-8",
                    pretty_print=True,
                    xml_declaration=True)
예제 #3
0
    def processDocument(self, coldir, colid, docid, dom=None):
        """
        Run the table-processing workflow for a single Transkribus document.

        As the code currently stands, only the first stage executes: a
        Transkribus client is created and logged in, the multi-page PageXml
        document is obtained (loaded from disk, or taken from ``dom``), its
        page count is computed, and ``applyLA_URO`` is called. The method
        then returns. Every later stage sits below an unconditional
        ``return`` and is therefore unreachable; it is kept as a record of
        the intended pipeline.

        Parameters:
            coldir: collection directory; when ``dom`` is None the document
                is read from <coldir>/<TableProcessing.sCOL>/.
            colid: collection id, forwarded to the Transkribus calls.
            docid: document id; it is concatenated with
                TableProcessing.sMPXMLExtension to build the file name
                (presumably both are strings -- confirm).
            dom: optional, already loaded multi-page PageXml document. When
                None, self.inputFileName is set and the document is loaded
                through self.loadDom().

        Returns:
            None on every path.

        Attributes written by the reachable part: self.myTrKCient, and
        self.inputFileName when ``dom`` is None.

        Equivalent command-line pipeline, kept for reference:

            1 python ../../src/xml_formats/PageXml.py trnskrbs_5400/col/17442 --ext=pxml
            2 python ../../src/tasks/performCVLLA.py  --coldir=trnskrbs_5400/  --docid=17442 -i trnskrbs_5400/col/17442.mpxml  --bl --regTL --form
            3 python ../../src/tasks/DU_ABPTable_T.py modelMultiType tableRow2 --run=trnskrbs_5400
            4 python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442_du.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
            5 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml
            6 python ../../../TranskribusPyClient/src/TranskribusCommands/TranskribusDU_transcriptUploader.py --nodu trnskrbs_5400 5400 17442
            7 python ../../../TranskribusPyClient/src/TranskribusCommands/do_htrRnn.py  <model-name> <dictionary-name> 5400 17442

            (wait for the HTR job to finish)
            8  python ../../../TranskribusPyClient/src/TranskribusCommands/Transkribus_downloader.py 5400 --force
               # convert to DS
            9  python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
            10 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml  --doie --usetemplate
        """

        # Create the Transkribus client; it is stored on the instance because
        # the (currently unreachable) job-polling code below reads it back.
        self.myTrKCient = TranskribusClient(sServerUrl=self.server,
                                            proxies={},
                                            loggingLevel=logging.WARN)
        # Log in. The return value is deliberately discarded.
        # NOTE(review): trace / traceln are not defined in this method;
        # presumably module-level logging helpers -- confirm.
        _ = self.login(self.myTrKCient, trace=trace, traceln=traceln)

        # Disabled: download of the collection before processing.
        #         self.downloadCollection(colid,coldir,docid,bNoImg=False,bForce=True)

        ## Obtain the multi-page PageXml document and its page count.
        if dom is None:
            # No document supplied: derive the file path from coldir/docid.
            self.inputFileName = os.path.abspath(
                os.path.join(coldir, TableProcessing.sCOL,
                             docid + TableProcessing.sMPXMLExtension))
            # NOTE(review): loadDom() is called without an argument here;
            # presumably it falls back to self.inputFileName -- confirm.
            mpxml_doc = self.loadDom()
            nbPages = MultiPageXml.getNBPages(mpxml_doc)
        else:
            # Use the document provided by the caller as-is.
            mpxml_doc = dom
            nbPages = MultiPageXml.getNBPages(mpxml_doc)

# Disabled stage: table registration (layout analysis with a template).
#         ### table registration: need to compute/select???   the template
#         # perform LA  separator, table registration, baseline with normalization
#         #python ../../src/tasks/performCVLLA.py  --coldir=trnskrbs_5400/  --docid=17442 -i trnskrbs_5400/col/17442.mpxml  --bl --regTL --form
#         tableregtool= LAProcessor()
# #         latool.setParams(dParams)
#         tableregtool.coldir = coldir
#         tableregtool.docid = docid
#         tableregtool.bTemplate, tableregtool.bSeparator , tableregtool.bBaseLine , tableregtool.bRegularTextLine = True,False,False,False
#         # creates xml and a new mpxml
#         mpxml_doc,nbPages = tableregtool.performLA(mpxml_doc)
#
#

# Disabled: upload of the document after table registration.
#         self.upLoadDocument(colid, coldir,docid,sNote='NLE workflow;table reg done')

        # Launch the layout-analysis job(s). The returned job ids are only
        # consumed by the unreachable polling code below.
        lJobIDs = self.applyLA_URO(colid, docid, nbPages)
        # NOTE(review): unconditional return -- everything from here to the
        # end of the method is dead code. Confirm whether this is a leftover
        # debugging stop or the intended end of the workflow.
        return

        # ---- unreachable: wait for the last layout-analysis job ----
        bWait = True
        assert lJobIDs != []
        jobid = lJobIDs[-1]
        traceln("waiting for job %s" % jobid)
        # NOTE(review): polls the job status in a tight loop with no delay
        # between requests, and unlike the HTR wait loop further down it does
        # not treat 'CANCELED' as a terminal state.
        while bWait:
            dInfo = self.myTrKCient.getJobStatus(jobid)
            bWait = dInfo['state'] not in ['FINISHED', 'FAILED']

        ## Re-download the collection (author's open question: is coldir the
        ## right target folder?).
        self.downloadCollection(colid, coldir, docid, bNoImg=True, bForce=True)

        ## Second early stop, marked by the author as intended for the
        ## "DAS newx testset".
        return

        # ---- unreachable: tag text with BIES cell labels ----
        #python ../../src/tasks/DU_ABPTable_T.py modelMultiType tableRow2 --run=trnskrbs_5400
        # The bare string below is a no-op expression statement left by the
        # author as a reminder; it is not a docstring.
        """ 
            needed : doer = DU_ABPTable_TypedCRF(sModelName, sModelDir,
        """
        doer = DU_ABPTable_TypedCRF(self.sRowModelName, self.sRowModelDir)
        doer.load()
        ## Author's wish: predict at file level and return the dom instead of
        ## storing it. Here the prediction output is re-read from its first
        ## returned file name.
        # NOTE(review): the "col" folder name is hard-coded here, while the
        # loading code above uses TableProcessing.sCOL.
        rowpath = os.path.join(coldir, "col")
        BIESFiles = doer.predict([rowpath], docid)
        BIESDom = self.loadDom(BIESFiles[0])
        #         res= BIESDom.saveFormatFileEnc('test.mpxml', "UTF-8",True)

        # ---- unreachable: convert the multi-page PageXml to DS ----
        #python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442_du.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
        dsconv = primaAnalysis()
        # NOTE(review): uses self.docid rather than the docid parameter.
        DSBIESdoc = dsconv.convert2DS(BIESDom, self.docid)

        # Wrap the DS dom in an XMLDSDocument object.
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(
            DSBIESdoc)  #,listPages = range(self.firstPage,self.lastPage+1))
        # ---- unreachable: row detection ----
        #python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml
        rdc = RowDetection()
        rdc.findRowsInDoc(self.ODoc)

        # ---- unreachable: convert DS back to multi-page PageXml ----
        #python ../../src/xml_formats/DS2PageXml.py -i trnskrbs_5400/out/17442.ds_xml --multi
        DS2MPXML = DS2PageXMLConvertor()
        lPageXml = DS2MPXML.run(self.ODoc.getDom())
        if lPageXml != []:
            #             if DS2MPXML.bMultiPages:
            # Only element [0] of each entry is merged.
            # NOTE(review): makeMultiPageXmlMemory is called on the class
            # (no instance) and receives a map object rather than a list;
            # self.coldir / self.docid and a bare sCOL are used instead of
            # the coldir / docid parameters and TableProcessing.sCOL.
            # Verify all of these before re-enabling this stage.
            newDoc = MultiPageXml.makeMultiPageXmlMemory(
                map(lambda xy: xy[0], lPageXml))
            outputFileName = os.path.join(
                self.coldir, sCOL,
                self.docid + TableProcessing.sMPXMLExtension)
            newDoc.write(outputFileName,
                         xml_declaration=True,
                         encoding="UTF-8",
                         pretty_print=True)


# Disabled alternative: store the pages as a set of single PageXml files.
#             else:
#                 DS2MPXML.storePageXmlSetofFiles(lPageXml)

        # Third early stop: upload and HTR below never run even if the two
        # earlier returns are removed.
        return

        # ---- unreachable: upload the row-annotated document ----
        # python ../../../TranskribusPyClient/src/TranskribusCommands/TranskribusDU_transcriptUploader.py --nodu trnskrbs_5400 5400 17442
        self.upLoadDocument(colid,
                            coldir,
                            docid,
                            sNote='NLE workflow;table row done')

        ## ---- unreachable: apply HTR ----
        ## Open questions from the author: how to deal with specific
        ## dictionaries; the ontology and the template must be known here.

        # NOTE(review): the page count is overwritten with a hard-coded 1,
        # discarding the value computed from the document above.
        nbPages = 1
        jobid = self.applyHTR(colid, docid, nbPages, self.sHTRmodel,
                              self.sDictName)
        bWait = True
        traceln("waiting for job %s" % jobid)
        # Same delay-free polling as above; here 'CANCELED' also ends the wait.
        while bWait:
            dInfo = self.myTrKCient.getJobStatus(jobid)
            bWait = dInfo['state'] not in ['FINISHED', 'FAILED', 'CANCELED']

        # Download the HTR result.
        # python ../../../TranskribusPyClient/src/TranskribusCommands/Transkribus_downloader.py 5400 --force
        # Author's warning: coldir is not right here -- it must refer to the
        # parent folder.
        self.downloadCollection(colid, coldir, docid, bNoImg=True, bForce=True)