예제 #1
0
    def _iter_GraphNode(self, doc, domNdPage, page):
        """
        Get the DOM, the DOM page node, the page object

        iterator on the DOM, that returns nodes  (of class Block)
        """    
        #--- XPATH contexts
        assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"
        lNdBlock = domNdPage.xpath(self.sxpNode, namespaces=self.dNS) #all relevant nodes of the page

        for ndBlock in lNdBlock:
            domid = ndBlock.get("id")
            sText = self._get_GraphNodeText(doc, domNdPage, ndBlock)
            if sText == None:
                sText = ""
                traceln("Warning: no text in node %s"%domid) 
                #raise ValueError, "No text in node: %s"%ndBlock 
            
            #now we need to infer the bounding box of that object
            lXY = PageXml.getPointList(ndBlock)  #the polygon
            if lXY == []:
                continue
            
            plg = Polygon(lXY)
            try:
                x1,y1, x2,y2 = plg.fitRectangle()
            except ZeroDivisionError:
#                 traceln("Warning: ignoring invalid polygon id=%s page=%s"%(ndBlock.prop("id"), page.pnum))
#                 continue
#             if True:
#                 #we reduce a bit this rectangle, to ovoid overlap
#                 w,h = x2-x1, y2-y1
#                 dx = max(w * 0.066, min(20, w/3))  #we make sure that at least 1/"rd of te width will remain!
#                 dy = max(h * 0.066, min(20, w/3))
#                 x1,y1, x2,y2 = [ int(round(v)) for v in [x1+dx,y1+dy, x2-dx,y2-dy] ]

                x1,y1,x2,y2 = plg.getBoundingBox()
            except ValueError:
                x1,y1,x2,y2 = plg.getBoundingBox()
                
                
            #we reduce a bit this rectangle, to ovoid overlap
            if not(self.BBoxDeltaFun is None):
                w,h = x2-x1, y2-y1
                dx = self.BBoxDeltaFun(w)
                dy = self.BBoxDeltaFun(h)
                x1,y1, x2,y2 = [ int(round(v)) for v in [x1+dx,y1+dy, x2-dx,y2-dy] ]
                
            
            #TODO
            orientation = 0  #no meaning for PageXml
            classIndex = 0   #is computed later on

            #and create a Block
            blk = Block(page, (x1, y1, x2-x1, y2-y1), sText, orientation, classIndex, self, ndBlock, domid=domid)
            
            yield blk
            
        raise StopIteration()        
def convertTR2Sep(filename):
    """
    Turn the TextRegion elements marked as separators into SeparatorRegion lines.

    The file is parsed with etree; every TextRegion that has a 'custom'
    attribute containing "separator" is renamed to 'SeparatorRegion' and its
    Coords are replaced by a 2-point line: the horizontal or vertical median
    of the rectangle fitted on (or, failing that, bounding) its polygon.

    filename -- the file to parse

    Returns the parsed (and modified in place) tree. Nothing is written to disk.

    Raises AssertionError if a separator has no point at all.
    """
    print (filename)
    tagname='TextRegion'
    xml = etree.parse(filename)
    ltextsep = xml.getroot().findall(f".//pc:{tagname}[@custom]", {"pc":"http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"})

    for ndRegion in ltextsep:
        #the xpath predicate guarantees that the 'custom' attribute exists
        if "separator" not in ndRegion.get('custom'):
            continue

        # NOTE(review): the new tag carries no namespace, unlike the tag it replaces - confirm this is intended
        ndRegion.tag = 'SeparatorRegion'

        #now we need to convert that object to a line
        lXY = PageXml.getPointList(ndRegion)  #the polygon
        assert lXY, "Separator without Coord??"

        plg = Polygon(lXY)
        try:
            x1,y1, x2,y2 = plg.fitRectangle()
        except (ValueError, ZeroDivisionError):
            #ZeroDivisionError is handled like ValueError, as done around the
            #other calls to fitRectangle: fall back on the bounding box
            print("Warning: Coords might be bad, taking bounding box: ", lXY)
            x1,y1, x2,y2 = plg.getBoundingBox()

        if abs(x2-x1) > abs(y2-y1): # horizontal: keep the width, collapse the height
            y1 = (y1+y2)/2
            y2 = y1
        else:                       # vertical: keep the height, collapse the width
            x1 = (x1+x2)/2
            x2 = x1

        ndCoord = ndRegion.xpath(".//pc:Coords", namespaces={"pc":PageXml.NS_PAGE_XML})[0]
        PageXml.setPoints(ndCoord, [(x1,y1), (x2,y2)])

    return xml
예제 #3
0
    def resizeCell(self, cell, ns):
        """
            replace the cell region by a BB for textlines: better for transcriber

            The bounding boxes of the TextLine children of the cell are merged,
            enlarged by self.hpadding (left and right) and self.vpadding (top and
            bottom), and written as a 4-point rectangle in the 'points' attribute
            of the first child of the cell.

            cell -- the cell element
            ns   -- not used by this method

            Returns True when the cell is left untouched (no TextLine child, or
            no TextLine with a usable polygon), and None after a rewrite.
        """
        lTextLines = cell.xpath("./a:TextLine", namespaces={'a': PageXml.NS_PAGE_XML})
        if lTextLines == []:
            return True

        ## merge the bounding boxes of all the textlines of the cell
        minx, miny, maxx, maxy = 9e9, 9e9, 0, 0
        bFoundLine = False
        for line in lTextLines:
            lXY = PageXml.getPointList(line)  #the polygon
            # in case empty cell
            if lXY == []:
                continue
            plg = Polygon(lXY)
            try:
                # only called to detect unusable polygons, its result is not used
                plg.fitRectangle()
            except ZeroDivisionError:
                continue
            except ValueError:
                # as done around the other calls to fitRectangle: the bounding box is still usable
                pass
            x1, y1, x2, y2 = plg.getBoundingBox()
            minx = min(minx, x1)
            miny = min(miny, y1)
            maxx = max(maxx, x2)
            maxy = max(maxy, y2)
            bFoundLine = True

        if not bFoundLine:
            # no usable textline: do not overwrite the cell with the 9e9/0 sentinels
            return True

        # finally: simply use the BB of the textlines + padding
        miny -= self.vpadding  # vertical padding (top)
        maxy += self.vpadding  # vertical padding (bottom)
        minx -= self.hpadding  # horizontal padding (left)
        maxx += self.hpadding  # horizontal padding (right)

        # presumably the first child of the cell is its Coords element - TODO confirm
        corner = cell[0]
        corner.set(
            'points', "%d,%d %d,%d %d,%d %d,%d" %
            (minx, miny, maxx, miny, maxx, maxy, minx, maxy))
예제 #4
0
    def _iter_GraphNode(self, doc, sFilename, page=None):
        """
        Iterate over the areas of a json dict, yielding Block objects.

        The areas are read from doc['GlynchResults']['Areas']; one Block is
        yielded per area whose point list is not empty. The text of a Block
        is the 'label' of the area, or "" if it is missing or None.

        doc       -- the json dict
        sFilename -- not used by this method
        page      -- not used: a fresh Page(1, 1, 1, 1) is given to every Block

        self.BBoxDeltaFun, when not None, shrinks the rectangle of each Block.
        It is either a pair (xFun, yFun) of functions (each possibly None),
        applied to the width and the height respectively, or a single
        function applied to both.
        """
        lNdBlock = doc['GlynchResults']['Areas']
        # NOTE(review): the 'page' argument is overwritten here - confirm this is intended
        page = Page(1, 1, 1, 1)
        for ndBlock in lNdBlock:
            try:
                sText = ndBlock['label']
            except KeyError:
                sText = ""
                traceln("WARNING: no 'label' in : %s" % ndBlock)
            if sText is None:
                sText = ""
                traceln("Warning: no text in node")

            # now we need to infer the bounding box of that object
            lXY = self.getPointList(ndBlock)  # the polygon
            if lXY == []:
                continue

            plg = Polygon(lXY)
            try:
                x1, y1, x2, y2 = plg.fitRectangle(
                    bPreserveWidth=self.bPreserveWidth)
            except (ZeroDivisionError, ValueError):
                # the rectangle fitting failed: use the plain bounding box
                x1, y1, x2, y2 = plg.getBoundingBox()

            # we reduce a bit this rectangle, to avoid overlap
            deltaFun = self.BBoxDeltaFun
            if deltaFun is not None:
                if isinstance(deltaFun, tuple) and len(deltaFun) == 2:
                    # one function per axis, each of them being optional
                    xFun, yFun = deltaFun
                    if xFun is not None:
                        dx = xFun(x2 - x1)
                        x1, x2 = int(round(x1 + dx)), int(round(x2 - dx))
                    if yFun is not None:
                        dy = yFun(y2 - y1)
                        y1, y2 = int(round(y1 + dy)), int(round(y2 - dy))
                else:
                    # historical code: the same function for both axes
                    dx = deltaFun(x2 - x1)
                    dy = deltaFun(y2 - y1)
                    x1, y1, x2, y2 = [
                        int(round(v))
                        for v in [x1 + dx, y1 + dy, x2 - dx, y2 - dy]
                    ]

            # TODO
            orientation = 0  # no meaning for PageXml
            classIndex = 0  # is computed later on
            # and create a Block, which also keeps the original polygon as shape
            blk = Block(page, (x1, y1, x2 - x1, y2 - y1),
                        sText,
                        orientation,
                        classIndex,
                        self,
                        ndBlock,
                        domid=None)
            blk.setShape(geom.Polygon(lXY))
            yield blk
예제 #5
0
    def _iter_GraphNode(self, doc, domNdPage, page):
        """
        Get the DOM, the DOM page node, the page object

        iterator on the DOM, that returns nodes  (of class Block)
        """
        #--- XPATH contexts
        assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"
        lNdBlock = domNdPage.xpath(
            self.sxpNode, namespaces=self.dNS)  #all relevant nodes of the page

        for ndBlock in lNdBlock:
            domid = ndBlock.get("id")
            sText = self._get_GraphNodeText(doc, domNdPage, ndBlock)
            if sText == None:
                sText = ""
                NodeType_PageXml.nbNoTextWarning += 1
                if NodeType_PageXml.nbNoTextWarning < 33:
                    traceln("Warning: no text in node %s" % domid)
                elif NodeType_PageXml.nbNoTextWarning == 33:
                    traceln(
                        "Warning: no text in node %s  - *** %d repetition : I STOP WARNING ***"
                        % (domid, NodeType_PageXml.nbNoTextWarning))
                #raise ValueError, "No text in node: %s"%ndBlock

            #now we need to infer the bounding box of that object
            lXY = PageXml.getPointList(ndBlock)  #the polygon
            if lXY == []:
                continue

            plg = Polygon(lXY)
            try:
                x1, y1, x2, y2 = plg.fitRectangle(
                    bPreserveWidth=self.bPreserveWidth)
            except ZeroDivisionError:
                #                 traceln("Warning: ignoring invalid polygon id=%s page=%s"%(ndBlock.prop("id"), page.pnum))
                #                 continue
                #             if True:
                #                 #we reduce a bit this rectangle, to ovoid overlap
                #                 w,h = x2-x1, y2-y1
                #                 dx = max(w * 0.066, min(20, w/3))  #we make sure that at least 1/"rd of te width will remain!
                #                 dy = max(h * 0.066, min(20, w/3))
                #                 x1,y1, x2,y2 = [ int(round(v)) for v in [x1+dx,y1+dy, x2-dx,y2-dy] ]

                x1, y1, x2, y2 = plg.getBoundingBox()
            except ValueError:
                x1, y1, x2, y2 = plg.getBoundingBox()

            #we reduce a bit this rectangle, to ovoid overlap
            if not (self.BBoxDeltaFun is None):
                if type(self.BBoxDeltaFun) is tuple and len(
                        self.BBoxDeltaFun) == 2:
                    xFun, yFun = self.BBoxDeltaFun
                    if xFun is not None:
                        dx = xFun(x2 - x1)
                        x1, x2 = int(round(x1 + dx)), int(round(x2 - dx))
                    if yFun is not None:
                        dy = yFun(y2 - y1)
                        y1, y2 = int(round(y1 + dy)), int(round(y2 - dy))
                else:
                    # historical code
                    w, h = x2 - x1, y2 - y1
                    dx = self.BBoxDeltaFun(w)
                    dy = self.BBoxDeltaFun(h)
                    x1, y1, x2, y2 = [
                        int(round(v))
                        for v in [x1 + dx, y1 + dy, x2 - dx, y2 - dy]
                    ]

            # store the rectangle"
            ndBlock.set(
                "DU_points", " ".join([
                    "%d,%d" % (int(x), int(y))
                    for x, y in [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]
                ]))

            #TODO
            orientation = 0  #no meaning for PageXml
            classIndex = 0  #is computed later on

            #and create a Block
            blk = Block(page, (x1, y1, x2 - x1, y2 - y1),
                        sText,
                        orientation,
                        classIndex,
                        self,
                        ndBlock,
                        domid=domid)

            yield blk

        return