def _iter_GraphNode(self, doc, domNdPage, page):
    """
    Iterate over the graph nodes of one page: yield one Block per DOM node.

    The DOM nodes are enumerated by evaluating the XPath expression
    self.sxpNode on domNdPage, with the namespaces self.dNS.

    doc       -- the DOM (passed on to self._get_GraphNodeText)
    domNdPage -- the DOM page node, on which the XPath is evaluated
    page      -- the page object, given to each created Block

    A node whose point list is empty is skipped.
    A node without text gets an empty text, and a warning is traced.
    """
    #--- XPATH contexts
    assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"
    lNdBlock = domNdPage.xpath(self.sxpNode, namespaces=self.dNS)  #all relevant nodes of the page

    for ndBlock in lNdBlock:
        domid = ndBlock.get("id")
        sText = self._get_GraphNodeText(doc, domNdPage, ndBlock)
        if sText is None:
            sText = ""
            traceln("Warning: no text in node %s"%domid)

        #now we need to infer the bounding box of that object
        lXY = PageXml.getPointList(ndBlock)  #the polygon
        if lXY == []:
            continue

        plg = Polygon(lXY)
        try:
            x1,y1, x2,y2 = plg.fitRectangle()
        except (ZeroDivisionError, ValueError):
            #no rectangle could be fitted: fall back on the bounding box
            x1,y1,x2,y2 = plg.getBoundingBox()

        #we reduce a bit this rectangle, to avoid overlap
        if self.BBoxDeltaFun is not None:
            w,h = x2-x1, y2-y1
            dx = self.BBoxDeltaFun(w)
            dy = self.BBoxDeltaFun(h)
            x1,y1, x2,y2 = [ int(round(v)) for v in [x1+dx,y1+dy, x2-dx,y2-dy] ]

        #TODO
        orientation = 0  #no meaning for PageXml
        classIndex = 0   #is computed later on

        #and create a Block
        blk = Block(page, (x1, y1, x2-x1, y2-y1), sText, orientation, classIndex, self, ndBlock, domid=domid)

        yield blk

    #A generator ends by returning: since Python 3.7 (PEP 479), raising
    #StopIteration here would surface as a RuntimeError in the caller.
    return
def convertTR2Sep(filename):
    """
    Convert the TextRegion elements marked as separator into SeparatorRegion.

    The file is parsed, and each TextRegion that has a 'custom' attribute
    containing "separator" is renamed to SeparatorRegion. Its first Coords
    descendant is then replaced by a 2-point line: the fitted rectangle (or
    the bounding box, if fitting raises ValueError) is flattened along its
    longest side, at the middle of the other side.

    filename -- the file to parse (it is also printed)

    Return the parsed tree, modified in place. Nothing is written to disk.
    """
    print (filename)

    sTagName = 'TextRegion'
    tree = etree.parse(filename)
    lNdRegion = tree.getroot().findall(f".//pc:{sTagName}[@custom]", {"pc":"http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"})

    for ndRegion in lNdRegion:
        if "separator" not in ndRegion.get('custom'):
            continue
        ndRegion.tag = 'SeparatorRegion'

        # the polygon of the region, to be turned into a line
        lXY = PageXml.getPointList(ndRegion)
        assert lXY, "Separator without Coord??"

        plg = Polygon(lXY)
        try:
            x1, y1, x2, y2 = plg.fitRectangle()
        except ValueError:
            print("Warning: Coords might be bad, taking bounding box: ", lXY)
            x1, y1, x2, y2 = plg.getBoundingBox()

        if abs(x2 - x1) > abs(y2 - y1):
            # horizontal: keep the x extent, at mid-height
            y1 = y2 = (y1 + y2) / 2
        else:
            # vertical: keep the y extent, at mid-width
            x1 = x2 = (x1 + x2) / 2

        lNdCoord = ndRegion.xpath(".//pc:Coords", namespaces={"pc": PageXml.NS_PAGE_XML})
        PageXml.setPoints(lNdCoord[0], [(x1, y1), (x2, y2)])

    return tree
def resizeCell(self, cell, ns):
    """
    Replace the cell region by a BB for textlines: better for transcriber.

    The new box is the union of the bounding boxes of the TextLine children
    of the cell, enlarged by self.vpadding (top and bottom) and by
    self.hpadding (left and right). It is written as 4 points in the
    'points' attribute of the first child of the cell (cell[0]).

    cell -- the DOM node of the cell
    ns   -- not used by this method

    Return True, leaving the cell untouched, when the cell has no TextLine
    or when no TextLine gives a usable box. Return None once resized.
    """
    lTextLines = cell.xpath("./a:TextLine", namespaces={'a': PageXml.NS_PAGE_XML})
    if lTextLines == []:
        return True

    # The polygon of the cell itself is not needed: only the text lines
    # determine the new box.

    ## get all the textlines of the cell
    minx, miny, maxx, maxy = 9e9, 9e9, 0, 0
    bFound = False
    for line in lTextLines:
        lXY = PageXml.getPointList(line)  #the polygon
        # in case empty cell
        if lXY == []:
            continue
        plg = Polygon(lXY)
        try:
            # the fitted rectangle is not used: this call only serves to
            # reject the polygons on which the fitting divides by zero
            plg.fitRectangle()
        except ZeroDivisionError:
            continue
        x1, y1, x2, y2 = plg.getBoundingBox()
        minx = min(minx, x1)
        miny = min(miny, y1)
        maxx = max(maxx, x2)
        maxy = max(maxy, y2)
        bFound = True

    if not bFound:
        # every text line was skipped: without this guard, the initial
        # 9e9 / 0 values would be written as the coordinates of the cell
        return True

    # finally: simply use the BB of the textlines + padding
    miny -= self.vpadding  # vertical padding (top)
    maxy += self.vpadding  # vertical padding (bottom)
    minx -= self.hpadding  # horizontal padding
    maxx += self.hpadding  # horizontal padding

    corner = cell[0]
    corner.set('points', "%d,%d %d,%d %d,%d %d,%d" % (minx, miny, maxx, miny, maxx, maxy, minx, maxy))
def _iter_GraphNode(self, doc, sFilename, page=None):
    """
    Iterate over the areas of a json dict: yield one Block per area.

    doc       -- the json dict; the nodes are doc['GlynchResults']['Areas']
    sFilename -- not used by this method
    page      -- ignored: every Block is attached to a new Page(1, 1, 1, 1)

    The text of a node is its 'label' entry (empty text, with a traced
    warning, when the entry is missing or None). A node whose point list
    is empty is skipped. Each Block gets geom.Polygon(lXY) as shape.
    """
    lArea = doc['GlynchResults']['Areas']

    # the given page is replaced by a placeholder page
    page = Page(1, 1, 1, 1)

    for ndArea in lArea:
        try:
            sText = ndArea['label']
        except KeyError:
            sText = ""
            traceln("WARNING: no 'label' in : %s" % ndArea)
        if sText == None:
            sText = ""
            traceln("Warning: no text in node")

        # the polygon of the area, from which the bounding box is inferred
        lXY = self.getPointList(ndArea)
        if lXY == []:
            continue

        plg = Polygon(lXY)
        try:
            x1, y1, x2, y2 = plg.fitRectangle(bPreserveWidth=self.bPreserveWidth)
        except (ZeroDivisionError, ValueError):
            # no fitted rectangle: use the bounding box instead
            x1, y1, x2, y2 = plg.getBoundingBox()

        # shrink the rectangle a bit, to avoid overlaps
        if self.BBoxDeltaFun is not None:
            if type(self.BBoxDeltaFun) is tuple and len(self.BBoxDeltaFun) == 2:
                # one function per axis, each of them possibly None
                fnX, fnY = self.BBoxDeltaFun
                if fnX is not None:
                    dx = fnX(x2 - x1)
                    x1 = int(round(x1 + dx))
                    x2 = int(round(x2 - dx))
                if fnY is not None:
                    dy = fnY(y2 - y1)
                    y1 = int(round(y1 + dy))
                    y2 = int(round(y2 - dy))
            else:
                # historical code: the same function for both axes
                dx = self.BBoxDeltaFun(x2 - x1)
                dy = self.BBoxDeltaFun(y2 - y1)
                x1 = int(round(x1 + dx))
                y1 = int(round(y1 + dy))
                x2 = int(round(x2 - dx))
                y2 = int(round(y2 - dy))

        orientation = 0  # no meaning for PageXml
        classIndex = 0   # is computed later on

        blk = Block(page, (x1, y1, x2 - x1, y2 - y1), sText,
                    orientation, classIndex, self, ndArea, domid=None)
        blk.setShape(geom.Polygon(lXY))
        yield blk
def _iter_GraphNode(self, doc, domNdPage, page):
    """
    Iterate over the graph nodes of one page: yield one Block per DOM node.

    The DOM nodes are enumerated by evaluating the XPath expression
    self.sxpNode on domNdPage, with the namespaces self.dNS.

    doc       -- the DOM (passed on to self._get_GraphNodeText)
    domNdPage -- the DOM page node, on which the XPath is evaluated
    page      -- the page object, given to each created Block

    A node without text gets an empty text; the warning is traced for the
    first 33 occurrences only, counted in NodeType_PageXml.nbNoTextWarning.
    A node whose point list is empty is skipped.
    The final rectangle is also stored on the DOM node, as 4 corners in
    its "DU_points" attribute.
    """
    assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"

    # all relevant nodes of the page
    for ndBlock in domNdPage.xpath(self.sxpNode, namespaces=self.dNS):
        domid = ndBlock.get("id")
        sText = self._get_GraphNodeText(doc, domNdPage, ndBlock)
        if sText == None:
            sText = ""
            # warn, but stop warning at the 33rd occurrence
            NodeType_PageXml.nbNoTextWarning += 1
            if NodeType_PageXml.nbNoTextWarning < 33:
                traceln("Warning: no text in node %s" % domid)
            elif NodeType_PageXml.nbNoTextWarning == 33:
                traceln("Warning: no text in node %s - *** %d repetition : I STOP WARNING ***"
                        % (domid, NodeType_PageXml.nbNoTextWarning))

        # the polygon of the node, from which the bounding box is inferred
        lXY = PageXml.getPointList(ndBlock)
        if lXY == []:
            continue

        plg = Polygon(lXY)
        try:
            x1, y1, x2, y2 = plg.fitRectangle(bPreserveWidth=self.bPreserveWidth)
        except (ZeroDivisionError, ValueError):
            # no fitted rectangle: use the bounding box instead
            x1, y1, x2, y2 = plg.getBoundingBox()

        # shrink the rectangle a bit, to avoid overlaps
        if self.BBoxDeltaFun is not None:
            if type(self.BBoxDeltaFun) is tuple and len(self.BBoxDeltaFun) == 2:
                # one function per axis, each of them possibly None
                fnX, fnY = self.BBoxDeltaFun
                if fnX is not None:
                    dx = fnX(x2 - x1)
                    x1 = int(round(x1 + dx))
                    x2 = int(round(x2 - dx))
                if fnY is not None:
                    dy = fnY(y2 - y1)
                    y1 = int(round(y1 + dy))
                    y2 = int(round(y2 - dy))
            else:
                # historical code: the same function for both axes
                dx = self.BBoxDeltaFun(x2 - x1)
                dy = self.BBoxDeltaFun(y2 - y1)
                x1 = int(round(x1 + dx))
                y1 = int(round(y1 + dy))
                x2 = int(round(x2 - dx))
                y2 = int(round(y2 - dy))

        # store the rectangle, as its 4 corners
        lCorner = [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]
        sPoints = " ".join("%d,%d" % (int(x), int(y)) for x, y in lCorner)
        ndBlock.set("DU_points", sPoints)

        orientation = 0  # no meaning for PageXml
        classIndex = 0   # is computed later on

        blk = Block(page, (x1, y1, x2 - x1, y2 - y1), sText,
                    orientation, classIndex, self, ndBlock, domid=domid)
        yield blk