def run(self, doc):
    """
    Load the DOM and find columns, or only build the reference document.

    Two modes, selected by ``self.bCreateRef``:

    * reference mode (``self.bCreateRef`` true): ``doc`` is converted to DS
      first when ``self.do2DS`` is set, then ``self.createRef(doc)`` is
      returned. Nothing else is executed and no attribute is set.
    * detection mode: the (optionally DS-converted) document is stored in
      ``self.doc``, pages ``self.firstPage``..``self.lastPage`` are loaded
      into ``self.ODoc`` / ``self.lPages``, the input format is checked,
      columns are searched with ``self.findColumnsInDoc`` and, when
      ``self.bMining`` is set, ``self.documentMining`` is run.

    :param doc: input document DOM
    :return: the reference document in reference mode, otherwise ``self.doc``
    """
    # Reference mode: convert to DS if needed, build the reference, stop here.
    # (The original carried an unreachable "single ref per page" variant
    # after this return, and a second, equally unreachable bCreateRef test
    # further down; both were dead code and have been removed.)
    if self.bCreateRef:
        if self.do2DS:
            doc = primaAnalysis().convert2DS(doc, self.docid)
        return self.createRef(doc)

    if self.do2DS:
        self.doc = primaAnalysis().convert2DS(doc, self.docid)
    else:
        self.doc = doc

    self.ODoc = XMLDSDocument()
    self.ODoc.loadFromDom(
        self.doc, listPages=range(self.firstPage, self.lastPage + 1))
    self.lPages = self.ODoc.getPages()

    self.checkInputFormat(self.lPages)
    self.findColumnsInDoc(self.lPages)

    if self.bMining:
        self.documentMining(self.lPages)

    # The DS document is returned as-is (no conversion back to PageXml).
    return self.doc
def run(self, doc):
    """
    Run record extraction on every table of the selected pages.

    Open issue noted by the original author: how to select the template;
    this is to be done by CVL, assuming IE and HTR info are stored in the
    template.

    Steps:

    * convert ``doc`` to DS when ``self.page2DS`` is set and store the
      result in ``self.doc``;
    * load pages ``self.firstPage``..``self.lastPage`` into ``self.ODoc``
      and ``self.lPages``;
    * build the record object selected by ``self.recordType``
      (``'D'`` -> ``deathRecord``, ``'W'`` -> ``weddingRecord``,
      ``'B'`` -> ``birthRecord``);
    * for every table with at least two rows, either apply the matching
      ``process*WithTemplate`` method (``self.BuseStoredTemplate``) or
      ``self.mineTable``;
    * store ``record.generateOutput(self.evalData)`` in ``self.evalData``.

    :param doc: input document DOM
    :return: None. For an unsupported ``self.recordType`` a message is
        printed and the method returns without processing any table
        (the original went on and failed with UnboundLocalError because
        neither the record nor the processing method had been bound).
    """
    if self.page2DS:
        self.doc = primaAnalysis().convert2DS(doc, self.docid)
    else:
        self.doc = doc

    self.ODoc = XMLDSDocument()
    self.ODoc.loadFromDom(
        self.doc, listPages=range(self.firstPage, self.lastPage + 1))
    self.lPages = self.ODoc.getPages()

    if self.recordType == 'D':
        record = deathRecord(self.sModelName, self.sModelDir)
        processWithTemplate = self.processDeathWithTemplate
    elif self.recordType == 'W':
        record = weddingRecord(self.sModelName, self.sModelDir)
        processWithTemplate = self.processWeddingWithTemplate
    elif self.recordType == 'B':
        record = birthRecord(self.sModelName, self.sModelDir)
        processWithTemplate = self.processBirthWithTemplate
    else:
        print(f'record type not supported: {self.recordType}')
        # Nothing can be processed without a record object: stop here
        # instead of crashing later on an unbound local.
        return

    for page in self.lPages:
        print("page: ", page.getNumber())
        for table in page.getAllNamedObjects(XMLDSTABLEClass):
            if table.getNbRows() < 2:
                # Fewer than two rows: skipped, reported in debug mode only.
                if self.bDebug:
                    print("page: %s : not a table? %d/%d" % (page.getNumber(), table.getNbRows(), table.getNbColumns()))
                continue
            if self.BuseStoredTemplate:
                if self.bDebug:
                    print("page: %s : table %d/%d" % (page.getNumber(), table.getNbRows(), table.getNbColumns()))
                processWithTemplate(table, record)
            else:
                self.mineTable(table, record)

    # Output is generated once, from the record filled by all tables.
    self.evalData = record.generateOutput(self.evalData)
def run(self, doc):
    """
    Load the DOM, find columns and build the reference document.

    Two modes, selected by ``self.bCreateRef``:

    * reference mode (``self.bCreateRef`` true): ``doc`` is converted to DS
      first when ``self.do2DS`` is set, then ``self.createRef(doc)`` is
      returned. Nothing else is executed and no attribute is set.
    * detection mode: the (optionally DS-converted) document is stored in
      ``self.doc``, pages ``self.firstPage``..``self.lastPage`` are loaded
      into ``self.ODoc``, ``self.findColumnsInDoc(self.ODoc)`` and
      ``self.createRef(self.doc)`` are run, and, when ``self.do2DS`` is
      set, the document is converted back to PageXml and written to
      ``self.getOutputFileName()``.

    :param doc: input document DOM
    :return: the reference document in reference mode; in detection mode
        ``None`` when the result was written as multi-page PageXml
        (``self.do2DS``), otherwise ``self.doc``
    """
    # Reference mode: convert to DS if needed, build the reference, stop here.
    # (The original carried an unreachable "single ref per page" variant
    # after this return; it was dead code and has been removed.)
    if self.bCreateRef:
        if self.do2DS:
            doc = primaAnalysis().convert2DS(doc, self.docid)
        return self.createRef(doc)

    if self.do2DS:
        self.doc = primaAnalysis().convert2DS(doc, self.docid)
    else:
        self.doc = doc

    self.ODoc = XMLDSDocument()
    self.ODoc.loadFromDom(
        self.doc, listPages=range(self.firstPage, self.lastPage + 1))

    self.findColumnsInDoc(self.ODoc)
    # NOTE(review): the returned reference document was never used by the
    # original code; the call is kept in case createRef has side effects
    # callers rely on -- confirm and drop it if it has none.
    self.createRef(self.doc)

    if self.do2DS:
        # Back to PageXml: the result is stored on disk, nothing is returned.
        conv = DS2PageXMLConvertor()
        lPageXDoc = conv.run(self.doc)
        conv.storeMultiPageXml(lPageXDoc, self.getOutputFileName())
        return None
    return self.doc
def run(self, doc):
    """
    Run death-record extraction on every table of the selected pages.

    Open issue noted by the original author: how to select the template;
    this is to be done by CVL, assuming IE and HTR info are stored in the
    template.

    Steps:

    * convert ``doc`` to DS when ``self.page2DS`` is set and store the
      result in ``self.doc``;
    * load pages ``self.firstPage``..``self.lastPage`` into ``self.ODoc``
      and ``self.lPages``;
    * create a ``deathRecord`` from ``self.sModelName`` / ``self.sModelDir``;
    * for every table with at least two rows, call either
      ``self.processWithTemplate`` (``self.BuseStoredTemplate``) or
      ``self.mineTable``;
    * store ``dr.generateOutput(self.evalData)`` in ``self.evalData``.

    Template processing is best effort: an error on one table is reported
    on stdout and the remaining tables are still processed.

    :param doc: input document DOM
    :return: None
    """
    if self.page2DS:
        self.doc = primaAnalysis().convert2DS(doc, self.docid)
    else:
        self.doc = doc

    self.ODoc = XMLDSDocument()
    self.ODoc.loadFromDom(
        self.doc, listPages=range(self.firstPage, self.lastPage + 1))
    self.lPages = self.ODoc.getPages()

    dr = deathRecord(self.sModelName, self.sModelDir)

    for page in self.lPages:
        print("page: ", page.getNumber())
        for table in page.getAllNamedObjects(XMLDSTABLEClass):
            if table.getNbRows() < 2:
                print("page: %s : not a table? %d/%d" % (page.getNumber(), table.getNbRows(), table.getNbColumns()))
                continue
            if self.BuseStoredTemplate:
                try:
                    self.processWithTemplate(table, dr)
                except Exception:
                    # Was a bare ``except:``, which also swallowed
                    # KeyboardInterrupt and SystemExit and made a long
                    # run impossible to interrupt cleanly.
                    print('issue with page %s' % page)
            else:
                self.mineTable(table, dr)

    # Output is generated once, from the record filled by all tables.
    self.evalData = dr.generateOutput(self.evalData)
def processDocument(self, coldir, colid, docid, dom=None):
    """
    Start the layout-analysis stage of the table workflow for one document.

    What this method actually does:

    * creates a ``TranskribusClient`` (stored in ``self.myTrKCient``) and
      logs in;
    * loads the multi-page PageXml of the document, from
      ``<coldir>/<TableProcessing.sCOL>/<docid><TableProcessing.sMPXMLExtension>``
      unless ``dom`` is given;
    * submits layout analysis with
      ``self.applyLA_URO(colid, docid, nbPages)`` and returns.

    The complete workflow this method was written for, as command lines
    (only the layout-analysis submission is executed here):

        1 python ../../src/xml_formats/PageXml.py trnskrbs_5400/col/17442 --ext=pxml
        2 python ../../src/tasks/performCVLLA.py --coldir=trnskrbs_5400/ --docid=17442 -i trnskrbs_5400/col/17442.mpxml --bl --regTL --form
        3 python ../../src/tasks/DU_ABPTable_T.py modelMultiType tableRow2 --run=trnskrbs_5400
        4 python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442_du.mpxml -o trnskrbs_5400/xml/17442.ds_xml --docid=17442
        5 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml
        6 python ../../../TranskribusPyClient/src/TranskribusCommands/TranskribusDU_transcriptUploader.py --nodu trnskrbs_5400 5400 17442
        7 python ../../../TranskribusPyClient/src/TranskribusCommands/do_htrRnn.py <model-name> <dictionary-name> 5400 17442
          wait
        8 python ../../../TranskribusPyClient/src/TranskribusCommands/Transkribus_downloader.py 5400 --force
          convert to ds
        9 python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442.mpxml -o trnskrbs_5400/xml/17442.ds_xml --docid=17442
        10 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml --doie --usetemplate

    :param coldir: collection folder on disk (parent of the ``sCOL`` folder)
    :param colid: Transkribus collection id
    :param docid: Transkribus document id (string; used to build the file name)
    :param dom: already loaded multi-page PageXml; loaded from disk when None
    :return: None
    """
    # Create the Transkribus client and log in.
    self.myTrKCient = TranskribusClient(
        sServerUrl=self.server, proxies={}, loggingLevel=logging.WARN)
    self.login(self.myTrKCient, trace=trace, traceln=traceln)

    # Load the multi-page PageXml; it is only needed for its page count.
    if dom is None:
        self.inputFileName = os.path.abspath(
            os.path.join(coldir, TableProcessing.sCOL,
                         docid + TableProcessing.sMPXMLExtension))
        mpxml_doc = self.loadDom()
    else:
        mpxml_doc = dom
    nbPages = MultiPageXml.getNBPages(mpxml_doc)

    # Submit layout analysis; the job ids are not used any further.
    self.applyLA_URO(colid, docid, nbPages)

    # The original body had a bare ``return`` at this point, which made all
    # the code that followed unreachable. That dead code has been removed;
    # the stages it sketched were, in order:
    #   - poll getJobStatus() of the last LA job until FINISHED/FAILED,
    #     then download the collection again (bNoImg=True, bForce=True);
    #   - tag text with DU_ABPTable_TypedCRF(self.sRowModelName,
    #     self.sRowModelDir) and load the predicted file;
    #   - convert it to DS (primaAnalysis.convert2DS), load it into an
    #     XMLDSDocument and run RowDetection.findRowsInDoc;
    #   - convert back with DS2PageXMLConvertor and write the multi-page
    #     PageXml;
    #   - upload the document, apply HTR (self.sHTRmodel, self.sDictName),
    #     poll the job and download the collection.
    # NOTE(review): if these stages are revived, the polling loops need a
    # sleep between status requests (they were busy-waiting).
    return