def parseDomNodeLabel(self, domnode, defaultCls=None):
    """
    Read the label annotated on a DOM node and map it to an internal label.

    The XML label is read from the node's custom attribute
    (self.sCustAttr_STRUCTURE / self.sCustAttr2_TYPE) and translated through
    self.dXmlLabel2Label.

    domnode    -- the DOM node carrying the annotation
    defaultCls -- not used by this method

    Return the mapped label. self.sDefaultLabel is returned instead when
      - the node has no label and self.bOther is true, or
      - the XML label is not in self.dXmlLabel2Label but self.checkIsIgnored
        accepts it (i.e. does not raise).

    Raise PageXmlException (re-raised unchanged) if the node has no label and
    self.bOther is false.
    Raise ValueError if the XML label is neither mapped nor accepted by
    self.checkIsIgnored, or if a KeyError escapes the attribute lookup while
    self.sDefaultLabel is empty.
    """
    sLabel = self.sDefaultLabel
    try:
        try:
            sXmlLabel = PageXml.getCustomAttr(domnode, self.sCustAttr_STRUCTURE, self.sCustAttr2_TYPE)
        except PageXmlException:
            if self.bOther:
                # absence of label, but bOther is set: fall back on the default
                return self.sDefaultLabel
            raise

        try:
            sLabel = self.dXmlLabel2Label[sXmlLabel]
        except KeyError:
            # not a label of interest: it must at least be an ignored one
            try:
                self.checkIsIgnored(sXmlLabel)
            except Exception:
                # 'Exception' rather than a bare except, so that
                # KeyboardInterrupt/SystemExit are not reported as a bad label
                raise ValueError("Invalid label '%s' in node %s" % (sXmlLabel, etree.tostring(domnode)))
    except KeyError:
        # no label at all
        # NOTE(review): only a KeyError raised by the attribute lookup itself
        # can reach this handler - confirm that getCustomAttr may raise one.
        if not self.sDefaultLabel:
            raise ValueError("Missing label in node %s" % etree.tostring(domnode))

    return sLabel
def _convertPageAnnotation(self, pnum, page, domNdPage):
    """
    Convert the human annotation of each text region of a page into DU attributes.

    Regions are visited in the order given by
    self._iter_TextRegionNodeTop2Bottom(domNdPage, page). For each region:
      - self.sSemAttr receives the semantic label (self.dAnnotMapping applied
        to the human label, or self.sOther when the region has no annotation);
      - self.sSgmAttr receives the segmentation label, which is switched
        whenever a new resolution starts (a heading following a numbered
        part, or a change of resolution number);
      - self.sNumAttr receives the resolution number, only when the human
        label carries one.

    pnum is not used by this method.

    Raise ValueError if a label is neither one of the fixed labels nor
    matched by self.creResolutionHumanLabel, or if it lacks a resolution number.
    """
    for nd in self._iter_TextRegionNodeTop2Bottom(domNdPage, page):
        sResoNum = None
        try:
            lbl = PageXml.getCustomAttr(nd, "structure", "type")

            if lbl == "heading":
                semLabel = self.dAnnotMapping[lbl]
                # a heading may indicate a new resolution!
                if self.prevResolutionNumber is None:
                    # for instance 2 consecutive headings: no second switch
                    sgmLabel = self._getCurrentSegmentationLabel()
                else:
                    sgmLabel = self._switchSegmentationLabel()
                    # so that the next number does not switch the label again
                    self.prevResolutionNumber = None
            elif lbl in ("header", "page-number", "marginalia"):
                # continuation of a resolution
                semLabel = self.dAnnotMapping[lbl]
                sgmLabel = self._getCurrentSegmentationLabel()
            else:
                o = self.creResolutionHumanLabel.match(lbl)
                if not o:
                    raise ValueError("%s is not a valid human annotation" % lbl)
                semLabel = self.dAnnotMapping[o.group(1)]  # "" for the resolution number

                # here we have a resolution number!
                sResoNum = o.group(2)
                if not sResoNum:
                    raise ValueError("%s is not a valid human annotation - missing resolution number" % lbl)

                if self.prevResolutionNumber is not None and self.prevResolutionNumber != sResoNum:
                    # we got a new number, so switching segmentation label!
                    sgmLabel = self._switchSegmentationLabel()
                else:
                    # either same number, or switching already done due to a heading
                    sgmLabel = self._getCurrentSegmentationLabel()
                self.prevResolutionNumber = sResoNum
        except PageXmlException:
            # no annotation on this region: keep the current segment
            semLabel = self.sOther
            sgmLabel = self._getCurrentSegmentationLabel()

        nd.set(self.sSemAttr, semLabel)
        nd.set(self.sSgmAttr, sgmLabel)
        if sResoNum:
            # only when the number is part of the human annotation!
            nd.set(self.sNumAttr, sResoNum)
def _convertPageAnnotation(self, pnum, page, domNdPage):
    """
    Tag every text region of a page with its DU semantic/segmentation labels.

    The segmentation label is re-initialised at the start of each page
    (self._initSegmentationLabel). Regions come from
    self._iter_TextRegionNodeTop2Bottom(domNdPage, page):
      - a region without annotation gets self.sOther for both attributes;
      - headings, headers, page numbers and marginalia get their mapped
        semantic label and self.sOther as segmentation label;
      - any other label must match self.creResolutionHumanLabel; its
        segmentation label is the previous one when the resolution number is
        unchanged, else the one given by self._getNextSegmentationLabel.

    pnum is not used by this method.

    Raise ValueError on a label that is not a valid human annotation or that
    lacks a resolution number.
    """
    # change: on each page we start by Heigh (hard-wired switch)
    bRestartAtEachPageWithHeigh = True
    if bRestartAtEachPageWithHeigh:
        self._initSegmentationLabel()

    tOutsideResolution = ("heading", "header", "page-number", "marginalia")

    for ndRegion in self._iter_TextRegionNodeTop2Bottom(domNdPage, page):
        try:
            sHumanLabel = PageXml.getCustomAttr(ndRegion, "structure", "type")
        except PageXmlException:
            # this node has no annotation whatsoever
            ndRegion.set(self.sSemAttr, self.sOther)
            ndRegion.set(self.sSgmAttr, self.sOther)
            continue

        if sHumanLabel in tOutsideResolution:
            # those elements are not part of a resolution
            sSemKey, sSegment, sNumber = sHumanLabel, self.sOther, None
        else:
            oMatch = self.creResolutionHumanLabel.match(sHumanLabel)
            if not oMatch:
                raise ValueError("%s is not a valid human annotation" % sHumanLabel)
            sSemKey = oMatch.group(1)  # "" for the resolution number

            sNumber = oMatch.group(2)
            if not sNumber:
                raise ValueError("%s is not a valid human annotation - missing resolution number" % sHumanLabel)

            # now decide on the segmentation label
            if self.prevResolutionNumber == sNumber:
                sSegment = self.prevSgmLbl
            else:
                sSegment = self._getNextSegmentationLabel(self.prevSgmLbl)
                # NOTE(review): the contiguity check and the bookkeeping are
                # assumed to belong to the "new number" branch only - confirm.
                assert bRestartAtEachPageWithHeigh or sNumber not in self.lSeenResoNum, "ERROR: the ordering of the block has not preserved resolution number contiguity"
                self.lSeenResoNum.append(sNumber)
            self.prevResolutionNumber = sNumber
            self.prevSgmLbl = sSegment

        # always have a semantic label
        sMappedSemLabel = self.dAnnotMapping[sSemKey]
        assert sMappedSemLabel
        ndRegion.set(self.sSemAttr, sMappedSemLabel)  # DU annotation

        # every annotated region also has a segmentation label...
        assert sSegment
        ndRegion.set(self.sSgmAttr, sSegment)  # DU annotation

        # ...and resolution parts have a resolution number as well
        if sNumber:
            ndRegion.set(self.sNumAttr, sNumber)
def parseDocNodeLabel(self, graph_node, defaultCls=None):
    """
    Build the label of a graph node from the annotation of its parent DOM node.

    The label is "<self.sLabelAttr>_<type>", where <type> is the
    'structure'/'type' custom attribute of the parent of graph_node.node.

    graph_node -- object whose .node attribute offers getparent()
    defaultCls -- not used by this method

    Return the label string, or the literal 'type_None' when the label cannot
    be built (whatever the reason: missing annotation, missing parent, ...).
    No exception is raised for a missing label.
    """
    ndParent = graph_node.node.getparent()
    try:
        sLabel = "%s_%s" % (self.sLabelAttr,
                            PageXml.getCustomAttr(ndParent, 'structure', 'type'))
    except Exception:
        # best effort: any failure means "no label". 'Exception' rather than a
        # bare except, so that KeyboardInterrupt/SystemExit still propagate.
        # NOTE(review): the fallback prefix is hard-coded to 'type' and does
        # not follow self.sLabelAttr - confirm this is intended.
        sLabel = 'type_None'
    return sLabel
def test_getsetCustomAttr():
    """
    Check PageXml.getCustomAttr / PageXml.setCustomAttr on a TextRegion node:
    reading a value, overwriting it, reading a whole attribute group, adding a
    new key, and failing with PageXmlException on unknown names.
    """
    sXml = b""" <TextRegion type="page-number" id="p1_region_1471502505726_2" custom="readingOrder {index:9;} structure {type:page-number;}"> <Coords points="972,43 1039,43 1039,104 972,104"/> </TextRegion> """
    ndRegion = etree.parse(BytesIO(sXml)).getroot()

    # read, then overwrite, an existing value (values are read back as strings)
    assert PageXml.getCustomAttr(ndRegion, "readingOrder", "index") == '9'
    assert PageXml.setCustomAttr(ndRegion, "readingOrder", "index", 99) == 99
    assert PageXml.getCustomAttr(ndRegion, "readingOrder", "index") == '99'

    # without a sub-attribute name, the whole group is returned
    assert PageXml.getCustomAttr(ndRegion, "readingOrder") == {'index': '99'}

    # add a new key to an existing group
    assert PageXml.setCustomAttr(ndRegion, "readingOrder", "toto", "zou") == "zou"
    assert PageXml.getCustomAttr(ndRegion, "readingOrder", "toto") == 'zou'

    # unknown sub-attribute, then unknown attribute group
    for tMissing in (("readingOrder", "axiste_pas"),
                     ("axiste_pas_non_plus", "axiste_pas")):
        with pytest.raises(PageXmlException):
            PageXml.getCustomAttr(ndRegion, *tMissing)
def parsePage(self, doc, ctxtNd, name):
    """
    Record, for one page, the tags seen and the label carried by each of them.

    For each tag of self.lTag, every matching descendant of ctxtNd (XPath
    ".//pg:<tag>" resolved with self.dNS) is reported via self.seenDocTag.
    Its label is then read from:
      - the 'type' attribute                          if self.sCustom is None
      - the 'structure'/'type' custom attribute       if self.sCustom == ""
        ('' when it cannot be read)
      - the attribute named self.sCustom              otherwise
    A non-empty label is rewritten by each (compiled regex, replacement) pair
    of self.ltCRES, in order, and reported via self.seenTagLabel.

    doc    -- not used by this method
    ctxtNd -- context node, must offer xpath(expr, namespaces=...)
    name   -- document name, passed through to self.seenDocTag
    """
    for tag in self.lTag:
        lNdTag = ctxtNd.xpath(".//pg:%s" % tag, namespaces=self.dNS)
        for nd in lNdTag:
            self.seenDocTag(name, tag)

            if self.sCustom is None:
                lbl = nd.get("type")
            elif self.sCustom == "":
                try:
                    lbl = PageXml.getCustomAttr(nd, "structure", "type")
                except Exception:
                    # best effort: an unreadable annotation counts as no label.
                    # 'Exception' rather than a bare except, so that
                    # KeyboardInterrupt/SystemExit still propagate.
                    lbl = ''
            else:
                lbl = nd.get(self.sCustom)

            if lbl:
                for cre, sRepl in self.ltCRES:
                    lbl = cre.sub(sRepl, lbl)  # pattern processing
                # NOTE(review): assumed to be reported only for non-empty
                # labels - confirm against the callers' statistics.
                self.seenTagLabel(tag, lbl)
def _convertPageAnnotation(self, pnum, page, domNdPage):
    """
    Convert the human annotation of each text region of a page into a semantic
    label plus a BIES-style segmentation label.

    Regions are visited in the order given by
    self._iter_TextRegionNodeTop2Bottom(domNdPage, page). The semantic label
    (self.sSemAttr) of a region is set immediately. Its segmentation label
    (self.sSgmAttr) is only known once the NEXT region has been seen, so each
    iteration finalises the label of the previous region (self._prevNd):
      - current starts a resolution:  previous becomes self.S if it was itself
        a start, else self.E
      - otherwise:                    previous becomes self.B if it was a
        start, else self.I
    After the loop, the last region is closed as self.S or self.E.

    State carried across regions (and across calls): self._prevNd,
    self._prevIsB and self._prevNum (False right after a heading that started
    a resolution).

    pnum is not used by this method.

    Raise ValueError if a label matches self.creResolutionHumanLabel but
    carries no resolution number. A label that does not match at all is only
    reported by a printed warning.
    """
    for nd in self._iter_TextRegionNodeTop2Bottom(domNdPage, page):
        sResoNum = None
        bCurrentIsAStart = None
        try:
            lbl = PageXml.getCustomAttr(nd, "structure", "type")

            if lbl == "heading":
                semLabel = self.dAnnotMapping[lbl]
                # a heading indicates the start of a new resolution, unless
                # the previous node is already a start!
                if self._prevIsB:
                    bCurrentIsAStart = False
                else:
                    bCurrentIsAStart = True
                    # to prevent starting again when finding the resolution number
                    # NOTE(review): this reset is assumed to belong to the
                    # "heading starts a resolution" branch only - confirm.
                    self._prevNum = False
            elif lbl in ("header", "page-number", "marginalia"):
                semLabel = self.dAnnotMapping[lbl]
                # continuation of a resolution, except at the very beginning
                # (first node)
                bCurrentIsAStart = self._prevNd is None
            else:
                o = self.creResolutionHumanLabel.match(lbl)
                if not o:
                    # relaxed mode: warn and carry on with the previous number
                    # (a strict mode would raise ValueError here instead)
                    print(" ** WARNING ** strange annotation on node id=%s : '%s'"%(nd.get("id"), lbl))
                    semLabel = self.dAnnotMapping[None]
                    sResoNum = self._prevNum
                    # same number as before, hence never a start
                    bCurrentIsAStart = False
                else:
                    semLabel = self.dAnnotMapping[o.group(1)]  # "" for the resolution number
                    # here we have a resolution number!
                    sResoNum = o.group(2)
                    if not sResoNum:
                        raise ValueError("%s is not a valid human annotation - missing resolution number" % lbl)

                    # '!= False' kept as-is: _prevNum holds False as a marker
                    if self._prevNum != False and self._prevNum != sResoNum:
                        # we got a new number, so a new resolution starts!
                        bCurrentIsAStart = True
                    else:
                        # either same number, or start already done due to a heading
                        bCurrentIsAStart = False
                    self._prevNum = sResoNum
        except PageXmlException:
            # no annotation on this region
            semLabel = self.sOther
            bCurrentIsAStart = False

        # Now tagging!!
        # Semantic (easy)
        nd.set(self.sSemAttr, semLabel)

        # BIES, tough... the previous node can now be finalised.
        # 'is not None' on purpose: truth-testing an XML element reflects its
        # number of children, so a childless node would be silently skipped.
        if self._prevNd is not None:
            if bCurrentIsAStart:
                # previous was a singleton, or the end of its resolution
                sPrevSgmLabel = self.S if self._prevIsB else self.E
            else:
                # previous is confirmed as a B, or as an I
                sPrevSgmLabel = self.B if self._prevIsB else self.I
            self._prevNd.set(self.sSgmAttr, sPrevSgmLabel)
        self._prevIsB = bool(bCurrentIsAStart)  # for next cycle!

        if sResoNum:
            # only when the number is part of the human annotation!
            nd.set(self.sNumAttr, sResoNum)
        self._prevNd = nd  # for next cycle!
    # end for

    # close the last node seen: a singleton if it was a start, else an End
    if self._prevNd is not None:
        self._prevNd.set(self.sSgmAttr, self.S if self._prevIsB else self.E)
    return