Пример #1
0
def processRegions(ndPage, bVerbose=False):
    """
        Delete the empty TextRegion elements of a page and resize the others.

        A region is empty when it contains no TextLine. Any other region gets
        the "points" attribute of the first 'Coords' node returned by
        PageXml.getChildByName replaced by the convex hull of its text lines.

        ndPage   -- page DOM node, queried with XPath
        bVerbose -- when True, trace the number of deleted and updated regions
    """
    lndRegions = ndPage.xpath(".//pg:TextRegion", namespaces=dNS)
    lDel = []
    for ndRegion in lndRegions:
        lTL = ndRegion.xpath(".//pg:TextLine", namespaces=dNS)
        if not lTL:
            # removal is deferred until every region has been examined
            lDel.append(ndRegion)
            continue

        # resize the region to the convex hull of its text lines
        oHull = ShapeLoader.convex_hull(lTL, bShapelyObject=True)
        ndCoords = PageXml.getChildByName(ndRegion, 'Coords')[0]
        ndCoords.set("points",
                     ShapeLoader.getCoordsString(oHull, bFailSafe=True))

    # delete empty regions
    for ndRegion in lDel:
        ndRegion.getparent().remove(ndRegion)

    if bVerbose:
        traceln(" - %d regions deleted" % (len(lDel)))
        traceln(" - %d regions updated" % (len(lndRegions) - len(lDel)))
Пример #2
0
def shakeRectangleShapes(pageNd, lClusterPoly, dCellCndPerRow, dTextInCells):
    """
        Build one shape per row of dCellCndPerRow.

        For each row key (in increasing order) the text lines of all its cells
        but the first are gathered and their minimum rotated rectangle is
        computed. When that rectangle is not empty and its y extent (from its
        bounds) is smaller than the one of the cluster polygon of the row, a
        'COL' element carrying the rectangle scaled with xfact=2 is appended
        to pageNd and the scaled rectangle is kept. Otherwise the cluster
        polygon, scaled with xfact=1.5, is stored back in lClusterPoly and kept.

        pageNd         -- DOM node receiving the 'COL' elements
        lClusterPoly   -- cluster polygons, indexed by row key; modified in place
        dCellCndPerRow -- row key -> list of cell shapes
        dTextInCells   -- cell WKT -> list of text line nodes

        return the list of kept shapes, one per row
    """
    lNewPoly = []
    for iRow in sorted(dCellCndPerRow):
        # text lines of every cell of the row, the first cell excepted
        lTextLine = []
        for cell in dCellCndPerRow[iRow][1:]:
            sKey = cell.wkt
            if sKey not in dTextInCells:
                continue
            for tl in dTextInCells[sKey]:
                assert ShapeLoader.node_to_Polygon(tl).is_valid
                lTextLine.append(tl)

        rectangle = ShapeLoader.minimum_rotated_rectangle(
            lTextLine, bShapelyObject=True)

        # the rectangle wins only when it exists and is less high than the cluster
        bUseRectangle = (
            not rectangle.is_empty
            and abs(rectangle.bounds[1] - rectangle.bounds[3])
            < abs(lClusterPoly[iRow].bounds[1] - lClusterPoly[iRow].bounds[3]))

        if bUseRectangle:
            cellNd = etree.Element('COL')
            pageNd.append(cellNd)
            rectangle = scale(rectangle, xfact=2)
            lNewPoly.append(rectangle)
            cellNd.set('points', ShapeLoader.getCoordsString(rectangle))
        else:
            lClusterPoly[iRow] = scale(lClusterPoly[iRow], xfact=1.5)
            lNewPoly.append(lClusterPoly[iRow])

    return lNewPoly
Пример #3
0
 def computeShape(cls, ndPage, setID, bConvexHull=False):
     """
     Compute the shape of a cluster from the nodes it contains.

     Each id of setID is resolved, under ndPage, to the first node having
     that @id. The shape is the minimum rotated rectangle of those nodes,
     or their convex hull when bConvexHull is True.
     """
     lNode = []
     for _id in setID:
         lNode.append(
             ndPage.xpath(".//*[@id='%s']" % _id, namespaces=dNS)[0])

     if bConvexHull:
         return ShapeLoader.convex_hull(lNode, bShapelyObject=True)
     return ShapeLoader.minimum_rotated_rectangle(lNode, bShapelyObject=True)
Пример #4
0
def ajustRectangleShapes(pageNd, lClusterPoly):
    """
        Reduce each cluster polygon to the area that does not overlap the
        polygons that follow it in the list.

        A following polygon is subtracted only when the intersection covers
        more than 1% of that following polygon. When the subtraction yields a
        multi-part geometry, only its largest part (by area) is kept.

        For every input polygon a 'COLX' element is appended to pageNd; its
        "points" attribute is set unless the resulting shape is empty.

        pageNd       -- DOM node receiving the 'COLX' elements
        lClusterPoly -- list of shapely polygons (not modified)

        return the list of resulting shapes, one per input polygon
    """
    lNewPoly = []
    for i, pol1 in enumerate(lClusterPoly):
        for pol2 in lClusterPoly[i + 1:]:
            if pol1.intersection(pol2).area > pol2.area * 0.01:
                pol1 = pol1.symmetric_difference(pol2).difference(pol2)
                if not pol1.is_empty and isinstance(
                        pol1, (MultiPolygon, GeometryCollection)):
                    # keep the largest part only
                    pol1 = max(pol1.geoms, key=lambda p: p.area)

        cellNd = etree.Element('COLX')
        pageNd.append(cellNd)
        if not pol1.is_empty:
            cellNd.set('points', ShapeLoader.getCoordsString(pol1))
        lNewPoly.append(pol1)

    return lNewPoly
Пример #5
0
def tableCreation(ndPage, dCellsIJ, dCellTxt, dCoordCell):
    """
        Add one CELL element per cell to the last TABLE element of the page.

        ndPage     -- page DOM node; must contain at least one TABLE element
                      (IndexError otherwise)
        dCellsIJ   -- cell WKT -> (row, col)
        dCellTxt   -- cell WKT -> list of text nodes; each list is sorted in
                      place by increasing y, then its nodes are moved into
                      the CELL element
        dCoordCell -- cell WKT -> shape of the cell

        Each CELL gets the attributes row, col, x, y, height, width, points,
        colSpan and rowSpan (both spans are always "1").
    """
    # several TABLE nodes may exist: the last one is the one to populate
    tableNd = ndPage.xpath(".//TABLE")[-1]

    for cellwkt, (i, j) in dCellsIJ.items():
        oShape = dCoordCell[cellwkt]
        x1, y1, x2, y2 = oShape.bounds

        cellNd = etree.Element('CELL')
        cellNd.set('row', str(i))
        cellNd.set('col', str(j))
        cellNd.set('x', str(x1))
        cellNd.set('y', str(y1))
        cellNd.set('height', str(abs(y2 - y1)))
        cellNd.set('width', str(abs(x2 - x1)))

        try:
            sPoints = ShapeLoader.getCoordsString(oShape)
        except Exception:
            # presumably a multi-part geometry: use its largest part
            oLargest = max(oShape.geoms, key=lambda p: p.area)
            sPoints = ShapeLoader.getCoordsString(oLargest)
        cellNd.set('points', sPoints)

        # populate with the text nodes, sorted by increasing y
        lTxt = dCellTxt[cellwkt]
        lTxt.sort(key=lambda x: ShapeLoader.node_to_Point(x).bounds[1])
        for ndTxt in lTxt:
            cellNd.append(ndTxt)

        cellNd.set('colSpan', "1")
        cellNd.set('rowSpan', "1")
        tableNd.append(cellNd)
def isBaselineHorizontal(ndText):
    """
    Tell whether the baseline of a text node is horizontal.

    The first 'Baseline' node returned by MultiPageXml.getChildByName is
    loaded as a line string; it is horizontal when its bounding box is at
    least as wide as it is high.

    Return True as well when there is no Baseline or when it cannot be loaded.
    """
    lNdBaseline = MultiPageXml.getChildByName(ndText, 'Baseline')
    if not lNdBaseline:
        return True

    try:
        o = ShapeLoader.node_to_LineString(lNdBaseline[0])
    except Exception:
        # a baseline that cannot be loaded is considered horizontal
        return True

    (minx, miny, maxx, maxy) = o.bounds
    return bool(maxx - minx >= maxy - miny)
Пример #7
0
def transformMinima2envelop(pageNd, dClusterWithTL):
    """
        Compute the convex hull of the content of each cluster.

        For each value of dClusterWithTL, the convex hull of the object
        returned by ShapeLoader.contourObject is computed. Each non-empty
        hull is also stored as the "points" attribute of a new 'LINE'
        element appended to pageNd.

        return the list of convex hulls (empty ones included), one per cluster
    """
    lClustersPoly = []
    for lContent in dClusterWithTL.values():
        oHull = ShapeLoader.contourObject(lContent).convex_hull
        lClustersPoly.append(oHull)
        if oHull.is_empty:
            continue
        ndLine = etree.Element('LINE')
        pageNd.append(ndLine)
        ndLine.set('points', ShapeLoader.getCoordsString(oHull))

    return lClustersPoly
    def parseDocNodeLabel(self, graph_node, defaultCls=None):
        """
        Parse the label of a graph node and return the corresponding entry
        of self.dXmlLabel2Label.

        The XML label is read from the attribute self.sLabelAttr of the DOM
        node, or forced to 'CH' when column headers are handled and the node
        has DU_header="CH". It is then converted through self.dConverter (a
        label missing from that mapping raises a KeyError).

        When the converted label is None, the node is a singleton and is
        labelled from the position of its centroid in the parent shape:
        "Sm" inside the middle band of the parent, otherwise "St" or "Sb"
        depending on which side of the parent centroid it is, along y.

        raise a ValueError if the final label is not in self.dXmlLabel2Label
        """
        domnode = graph_node.node
        sXmlLabel = domnode.get(self.sLabelAttr)

        # in case we also deal with column headers
        if self.bColumnHeader and domnode.get("DU_header") == 'CH':
            sXmlLabel = 'CH'

        sXmlLabel = self.dConverter[sXmlLabel]

        if sXmlLabel is None:
            # special processing for singletons
            ptTxt = ShapeLoader.node_to_Polygon(domnode).centroid
            plgCell = ShapeLoader.node_to_Polygon(domnode.getparent())
            # the parent shape shrunk along y, around its centroid
            plgMiddle = shapely.affinity.scale(plgCell,
                                               xfact=1,
                                               yfact=0.333,
                                               zfact=1,
                                               origin='centroid')
            if plgMiddle.contains(ptTxt):
                sXmlLabel = "Sm"
            elif ptTxt.y < plgCell.centroid.y:
                sXmlLabel = "St"
            else:
                sXmlLabel = "Sb"

        try:
            return self.dXmlLabel2Label[sXmlLabel]
        except KeyError:
            raise ValueError("Invalid label '%s'"
                             " (from @%s or @%s) in node %s" %
                             (sXmlLabel, self.sLabelAttr, self.sDefaultLabel,
                              etree.tostring(domnode)))
Пример #9
0
    def addClusterToDom(self, dCluster, bMoveContent=False):
        """
        Add Cluster elements to the Page DOM node

        dCluster     -- dict whose values are the lists of node indices
                        (in self.lNode) of each cluster; keys are not used
        bMoveContent -- when True, the DOM nodes of the content are moved
                        under their Cluster element

        The page node is the one of the first node of the first cluster. A
        comment is appended to it, then one Cluster element per cluster, with:
        - @content: the space-separated ids of the cluster nodes
        - a Coords child whose @points is the minimum rotated rectangle of
          the union of the node polygons, or "" if it cannot be computed
        """
        pageNode = None
        for lnidx in dCluster.values():
            if pageNode is None:
                pageNode = self.lNode[lnidx[0]].page.node
                pageNode.append(
                    etree.Comment("Clusters created by the conjugate graph"))

            # Make it robust to bad data: unloadable polygons are ignored
            lp = []
            for _i in lnidx:
                try:
                    lp.append(ShapeLoader.node_to_Polygon(self.lNode[_i].node))
                except ValueError:
                    pass
            contour = cascaded_union(
                [p if p.is_valid else p.convex_hull for p in lp])

            try:
                spoints = ' '.join(
                    "%s,%s" % (int(pt[0]), int(pt[1]))
                    for pt in contour.minimum_rotated_rectangle.exterior.coords)
            except Exception:
                # the rectangle may degenerate into a shape without exterior
                try:
                    spoints = ' '.join(
                        "%s,%s" % (int(pt[0]), int(pt[1]))
                        for pt in contour.minimum_rotated_rectangle.coords)
                except Exception:
                    # JL got once a: NotImplementedError: Multi-part
                    # geometries do not provide a coordinate sequence
                    spoints = ""

            ndCluster = PageXml.createPageXmlNode('Cluster')
            # add the space separated list of node ids
            ndCluster.set(
                "content",
                " ".join(self.lNode[_i].node.get("id") for _i in lnidx))
            coords = PageXml.createPageXmlNode('Coords')
            ndCluster.append(coords)
            coords.set('points', spoints)
            pageNode.append(ndCluster)

            if bMoveContent:
                # move the DOM node of the content to the cluster
                for _i in lnidx:
                    ndCluster.append(self.lNode[_i].node)

        return
Пример #10
0
def getClusterCoords(lElts):
    """
    Return the "x,y x,y ..." string of the convex hull of a list of elements.

    Each element is loaded as a polygon; the ones raising a ValueError are
    ignored, and invalid polygons are replaced by their convex hull. The
    coordinates of the convex hull of their union are truncated to integers.

    Return "" when no coordinate sequence can be obtained.
    """
    lp = []
    for e in lElts:
        try:
            lp.append(ShapeLoader.node_to_Polygon(e))
        except ValueError:
            # robust to bad data: the element is ignored
            pass
    contour = cascaded_union([p if p.is_valid else p.convex_hull for p in lp])

    try:
        spoints = ' '.join("%s,%s" % (int(pt[0]), int(pt[1]))
                           for pt in contour.convex_hull.exterior.coords)
    except Exception:
        # the hull may degenerate into a shape without exterior
        try:
            spoints = ' '.join("%s,%s" % (int(pt[0]), int(pt[1]))
                               for pt in contour.convex_hull.coords)
        except Exception:
            # JL got once a: NotImplementedError: Multi-part geometries
            # do not provide a coordinate sequence
            spoints = ""
    return spoints
Пример #11
0
def assignConCompToCells(lCC, lCells):
    """
        Assign each connected component to its best cell.

        The shapes obtained by iterating ShapeLoader.contourObject(lCC) are
        matched, by position, with the items of lCC. Each non-empty shape is
        given to bestRegionsAssignment together with lCells; when a cell shape
        is returned, the matching item of lCC is filed under its WKT.

        return a dict: cell WKT -> list of items of lCC
    """
    dCellCC = {}
    for i, cc in enumerate(ShapeLoader.contourObject(lCC)):
        if cc.is_empty:
            continue
        cellshape = bestRegionsAssignment(cc, lCells)
        if not cellshape:
            continue
        dCellCC.setdefault(cellshape.wkt, []).append(lCC[i])

    return dCellCC
Пример #12
0
 def makeClusterNode(self, sAlgo):
     """
     Build and return the XML 'Cluster' node of this cluster.

     The node carries the attributes name, algo and content (the
     space-separated ids of self.setID), and a 'Coords' child whose
     "points" attribute is "" when the cluster has no shape.
     """
     ndCluster = PageXml.createPageXmlNode('Cluster')
     lAttr = (
         ("name", self.name),
         ("algo", sAlgo),
         # the space separated list of node ids
         ("content", " ".join(self.setID)),
     )
     for sAttr, sValue in lAttr:
         ndCluster.set(sAttr, sValue)

     if self.shape is None:
         sPoints = ""
     else:
         sPoints = ShapeLoader.getCoordsString(self.shape)
     ndCoords = PageXml.createPageXmlNode('Coords')
     ndCoords.set('points', sPoints)
     ndCluster.append(ndCoords)

     ndCluster.tail = "\n"
     return ndCluster
Пример #13
0
def assignTextLinesToCells(lTL, lCells):
    """
        Assign each text line to its best cell.

        Every node of lTL is first loaded as a polygon. Each non-empty polygon
        is given to bestRegionsAssignment together with lCells; when a cell
        shape is returned, the text line node is filed under its WKT.

        return a dict: cell WKT -> list of text line nodes
    """
    dCellTL = {}
    # all polygons are loaded before any assignment takes place
    lShape = [ShapeLoader.node_to_Polygon(nd) for nd in lTL]
    for ndTL, txt in zip(lTL, lShape):
        if txt.is_empty:
            continue
        cellshape = bestRegionsAssignment(txt, lCells)
        if not cellshape:
            continue
        dCellTL.setdefault(cellshape.wkt, []).append(ndTL)
    return dCellTL
 def _iter_GraphNode(self, doc, domNdPage, page):
     """
     Iterate over the blocks of the parent class, adding to each of them:
     - shape   : the shape object reflecting its first Baseline
     - du_index: the integer value of the @du_index of that Baseline

     A block is skipped when this information cannot be obtained (an invalid
     Baseline is traced; any other failure is silent).
     """
     for blk in super()._iter_GraphNode(doc, domNdPage, page):
         # The yield is kept outside the try block: a handler around it would
         # also catch GeneratorExit, or any exception thrown by the consumer.
         try:
             ndBaseline = blk.node.xpath(".//pc:Baseline", namespaces=self.dNS)[0]
             try:
                 o = ShapeLoader.node_to_LineString(ndBaseline)
             except ValueError:
                 traceln("SKIPPING INVALID Baseline: ", etree.tostring(ndBaseline))
                 continue
             blk.shape = o
             blk.du_index = int(ndBaseline.get("du_index"))
         except Exception:
             # best effort: e.g. no Baseline, or no valid @du_index
             continue
         yield blk
Пример #15
0
    def addSeparatorFeature(self):
        """
        Flag the edges that cross a graphical separator.

        The SeparatorRegion elements found from the root tree of the first
        node are loaded as line strings and indexed by bounding box. Each edge
        is then turned into a segment from (A.x1, A.y1) to (B.x1, B.y1), and
        its attribute bCrossingSep tells whether that segment intersects at
        least one separator.
        """
        # graphical separators
        from xml_formats.PageXml import PageXml
        dNS = {"pc": PageXml.NS_PAGE_XML}
        ndRoot = self.lNode[0].node.getroottree()
        loSep = [
            ShapeLoader.node_to_LineString(_nd)
            for _nd in ndRoot.xpath(".//pc:SeparatorRegion", namespaces=dNS)
        ]

        if self.bVerbose: traceln(" %d graphical separators" % len(loSep))

        # spatial index over the bounding boxes of the separators
        idx = index.Index()
        for iSep, oSep in enumerate(loSep):
            idx.insert(iSep, oSep.bounds)

        nCrossing = 0
        for edge in self.lEdge:
            oEdge = geom.LineString([(edge.A.x1, edge.A.y1),
                                     (edge.B.x1, edge.B.y1)])
            prepO = prep(oEdge)
            # the index gives candidates; stop at the first real intersection
            bCrossing = any(
                prepO.intersects(loSep[iSep])
                for iSep in idx.intersection(oEdge.bounds))
            if bCrossing:
                nCrossing += 1
            edge.bCrossingSep = bCrossing

        if self.bVerbose:
            traceln(
                " %d (/ %d) edges crossing at least one graphical separator" %
                (nCrossing, len(self.lEdge)))
    def addClusterToDom(self,
                        lCluster,
                        bMoveContent=False,
                        sAlgo="",
                        pageNode=None):
        """
        Create one Cluster element per cluster, under the Page DOM node.

        lCluster     -- the clusters, each an iterable of indices in self.lNode
        bMoveContent -- when True, move the DOM nodes of the content under
                        their Cluster element
        sAlgo        -- value of the "algo" attribute of the Cluster elements
        pageNode     -- the page node; when None, the page of the first node
                        of the first cluster is used, and a comment is
                        appended to it

        Each Cluster is named after its position in lCluster.
        Return the list of created Cluster nodes.
        """
        lNdCluster = []
        for iCluster, lnidx in enumerate(lCluster):
            if pageNode is None:
                for iFirst in lnidx:
                    pageNode = self.lNode[iFirst].page.node
                    break
                pageNode.append(
                    etree.Comment(
                        "\nClusters created by the conjugate graph\n"))

            lContentNode = [self.lNode[_i].node for _i in lnidx]

            ndCluster = PageXml.createPageXmlNode('Cluster')
            ndCluster.set("name", str(iCluster))
            ndCluster.set("algo", sAlgo)
            # the space separated list of node ids
            ndCluster.set("content",
                          " ".join(nd.get("id") for nd in lContentNode))

            ndCoords = PageXml.createPageXmlNode('Coords')
            ndCluster.append(ndCoords)
            ndCoords.set(
                'points',
                ShapeLoader.minimum_rotated_rectangle(lContentNode))

            pageNode.append(ndCluster)
            ndCluster.tail = "\n"

            if bMoveContent:
                # move the DOM node of the content to the cluster
                for nd in lContentNode:
                    ndCluster.append(nd)
            lNdCluster.append(ndCluster)

        return lNdCluster
Пример #17
0
 def loadClusterNode(cls, ndPage, nd, sAlgo, bComputeShape=True):
     """
     Load a cluster from its XML node.

     A node without @name is given one, made of sAlgo and of the counter
     cls.cnt, which is then incremented.
     The shape is read from the node. If reading it raises an IndexError,
     the shape is computed by cls.computeShape when bComputeShape is True,
     and is None otherwise.

     Return None when the @content of the node lists no id.
     """
     name = nd.get("name")
     if name is None:
         name = "%s_%d" % (sAlgo, cls.cnt)
         cls.cnt += 1
         nd.set("name", name)

     setID = set(nd.get("content").split())
     if not setID:
         return None

     try:
         shape = ShapeLoader.node_to_Polygon(nd)
     except IndexError:
         shape = cls.computeShape(ndPage, setID) if bComputeShape else None
     return cls(name, setID, shape)
def normaliseElement(nd, iNorm):
    """
    Replace the coordinates of an element by a band lying on its baseline.

    The band is made of the points of the baseline, followed by the points
    of the same line translated by -iNorm along y, in reverse order. It is
    stored in the "points" attribute of the first Coords found under nd.

    raise a NormaliseException if nd has no Baseline, or if its Baseline
    cannot be loaded as a line string
    """
    def _serialise(lXY):
        # "x,y x,y ..." with coordinates truncated to integers
        return ' '.join("%s,%s" % (int(pt[0]), int(pt[1])) for pt in lXY)

    try:
        ndBaseline = nd.xpath(xpBASELINE, namespaces=dNS)[0]
    except IndexError:
        raise NormaliseException("WARNING: skipped element normalisation: no Baseline: %s" % etree.tostring(nd))

    try:
        line = ShapeLoader.node_to_LineString(ndBaseline)
    except ValueError:
        raise NormaliseException("WARNING: skipped element normalisation: invalid Coords: %s" % etree.tostring(nd))
    topline = translate(line, yoff=-iNorm)

    # serialise both lines as one circular sequence
    spoints = _serialise(line.coords)
    spoints = spoints + ' ' + _serialise(list(topline.coords)[::-1])

    # ad-hoc way of setting the element coordinates
    nd.xpath(".//pg:Coords", namespaces=dNS)[0].set("points", spoints)

    return
    def check(self, sFilename, scale, bVerbose=False):
        """
        Check the separators of the first Page of a file against its text
        lines, and return a TestReport.

        sFilename -- path of an existing XML file, parsed with self.parser
        scale     -- if not None, y scaling factor applied to each text shape
        bVerbose  -- when True, print details on the loaded objects

        For each SeparatorRegion, the ground truth class is read from its
        @orient attribute: 0 if it starts with "horizontal", 2 if it starts
        with neither "horizontal" nor "vertical". Separators whose @orient
        starts with "vertical" are skipped.
        The predicted class is 2 when the separator crosses at least one text
        shape; otherwise it is the ground truth class.

        A message is printed when the score of the report is below 0.999.
        """
        lY, lYGT = [], []

        # fail early, showing the file name, if the file does not exist
        assert os.path.isfile(sFilename), sFilename
        doc = etree.parse(sFilename, self.parser)
        root = doc.getroot()

        # only the first Page of the document is checked
        ndPage = MultiPageXml.getChildByName(root, 'Page')[0]

        # callback given to the ShapeLoader: each shape remembers its DOM node
        def storeNode(oShape, nd):
            oShape.duNode = nd

        # text is loaded from the TextLine elements; the Baseline-based
        # alternative below is disabled
        if True:
            loTxt = ShapeLoader.children_to_LinearRing(ndPage, 'TextLine',
                                                       storeNode)
        else:
            loTxt = ShapeLoader.children_to_LineString(ndPage, 'Baseline',
                                                       storeNode)

        if not scale is None:
            # scale along y only; duNode is copied onto the scaled shape
            scaled_loTxt = []
            for o in loTxt:
                scaled_o = shapely.affinity.scale(o, 1.0, scale)
                scaled_o.duNode = o.duNode
                scaled_loTxt.append(scaled_o)
            loTxt = scaled_loTxt

        if bVerbose: print("%d TextLines" % len(loTxt))
        loSep = ShapeLoader.children_to_LineString(ndPage, 'SeparatorRegion',
                                                   storeNode)
        if bVerbose: print("%d SeparatorRegion" % len(loSep))

        if True:
            # brute-force code: each separator is tested against each text shape
            for oSep in loSep:
                if bVerbose:
                    print("%35s %4s %4s %s" %
                          (oSep, oSep.duNode.get("row"),
                           oSep.duNode.get("col"), oSep.duNode.get("orient")))
                sInfo = oSep.duNode.get("orient")
                if sInfo.startswith("horizontal"):
                    YGT = 0
                elif sInfo.startswith("vertical"):
                    YGT = 1
                    # NOTE(review): vertical separators are not reported at
                    # all - confirm this is intended
                    continue
                else:
                    YGT = 2
                Y = None
                for oTxt in loTxt:
                    if oSep.crosses(oTxt):
                        Y = 2
                        nd = oTxt.duNode
                        sid = nd.get("id")
                        ndCell = nd.getparent()
                        if bVerbose:
                            print("\tCrossing %s row=%s col=%s" %
                                  (sid, ndCell.get("row"), ndCell.get("col")))
                # no crossing found: the prediction is the ground truth class
                if Y is None: Y = YGT
                lY.append(Y)
                lYGT.append(YGT)
        else:
            # Collection-based code (disabled)
            moTxt = shapely.geometry.MultiLineString(loTxt)
            for oSep in loSep:
                print("%40s %4s %4s %s" %
                      (oSep, oSep.duNode.get("row"), oSep.duNode.get("col"),
                       oSep.duNode.get("orient")))
                if oSep.crosses(moTxt):
                    print("NOO")
                lYGT.append("")
        oTstRpt = TestReport("SeparatorChecker", [lY], [lYGT],
                             self.lsClassName, [sFilename])
        fScore, sClassificationReport = oTstRpt.getClassificationReport()
        if fScore < 0.999: print("\t *** Accuracy score = %f" % fScore)
        return oTstRpt
    def loadPageCol(self,
                    ndPage,
                    fRatio,
                    shaper_fun=ShapeLoader.node_to_Point,
                    funIndex=lambda x: x._du_index):
        """
        Load the Baseline elements of a page and index them by table column.

        ndPage     -- the page DOM node
        fRatio     -- scaling factor applied to the shapes
        shaper_fun -- function building a shapely object from a Baseline node
        funIndex   -- function giving the value stored in the sets, for a shape

        The Baseline found in the TableCell elements are processed first, then
        those whose parent was not met in any TableCell. Each of them is
        numbered, in that order, by a counter starting at 0, which is stored:
        - in the @du_index attribute of the Baseline node
        - in the _du_index attribute of the shape
        Each shape also gets _du_nd (its Baseline node) and _dom_id (the @id
        of the parent of that node).

        return a tuple:
        - the list of shapes, in numbering order
        - a dict: col -> set of funIndex(shape), for the Baseline of the
          TableCell of that column
        - the same, restricted to the TableCell for which
          row + rowSpan > maxHeaderRowSpan
        - maxHeaderRowSpan, as returned by computeMaxRowSpan
        """
        loBaseline = []  # list of Baseline shapes
        i = 0

        dsetTableByCol = defaultdict(set)  # sets of object ids, by col
        dsetTableDataByCol = defaultdict(set)  # same, without the header rows

        # @id of the parent of each Baseline found in a TableCell
        setSeenParentId = set()
        lshapeCell = []

        lCells = MultiPageXml.getChildByName(ndPage, "TableCell")
        maxHeaderRowSpan = computeMaxRowSpan(lCells)
        traceln("   - maxHeaderRowSpan=", maxHeaderRowSpan)
        for ndCell in lCells:
            row, col = int(ndCell.get("row")), int(ndCell.get("col"))
            rowSpan = int(ndCell.get("rowSpan"))

            # NOTE(review): the cell shapes are not used by the rest of this
            # method; kept so that loading a cell behaves as before
            plg = ShapeLoader.node_to_Polygon(ndCell)
            lx = [_x for _x, _y in plg.exterior.coords]
            plg._row = row
            plg._col = col
            plg._xmin, plg._xmax = min(lx), max(lx)
            lshapeCell.append(plg)

            for nd in MultiPageXml.getChildByName(ndCell, "Baseline"):
                nd.set("du_index", "%d" % i)
                ndParent = nd.getparent()
                setSeenParentId.add(ndParent.get('id'))

                # Baseline as a shapely object
                try:
                    o = shaper_fun(nd)
                except Exception:
                    traceln("ERROR: id=", ndParent.get("id"))
                    raise
                # scale the objects, as done when cutting!!
                o = shapely.affinity.scale(o, xfact=fRatio, yfact=fRatio)

                o._du_index = i
                o._du_nd = nd
                o._dom_id = ndParent.get("id")
                loBaseline.append(o)

                # NO CHECK that the Baseline is in the correct cell
                dsetTableByCol[col].add(funIndex(o))

                if (row + rowSpan) > maxHeaderRowSpan:
                    dsetTableDataByCol[col].add(funIndex(o))

                i += 1

        # and process all Baseline outside any TableCell...
        for nd in MultiPageXml.getChildByName(ndPage, "Baseline"):
            ndParent = nd.getparent()
            if ndParent.get('id') in setSeenParentId:
                continue

            nd.set("du_index", "%d" % i)

            # Baseline as a shapely object
            o = shaper_fun(nd)

            # NOTE(review): scaled along x only here, while the Baseline of
            # the table cells are scaled along both axes - confirm intended
            o = shapely.affinity.scale(o, xfact=fRatio)

            o._du_index = i
            o._du_nd = nd
            o._dom_id = ndParent.get("id")
            loBaseline.append(o)

            i += 1

        return loBaseline, dsetTableByCol, dsetTableDataByCol, maxHeaderRowSpan
    def addSeparatorFeature(self):
        """
        We load the graphical separators of the page of the first node,
        compute a set of shapely objects, and, for each edge, compute the
        intersection of its segment, from (A.x1, A.y1) to (B.x1, B.y1), with
        all separators.

        Only the intersections that are a single point are counted; the other
        ones are traced and skipped, and so is the length of their separator.

        The edge features are:
        - bCrossingSep   : boolean, at least one crossing point
        - sep_NbCrossing : number of crossing points
        - sep_SpanLen    : width + height of the bounding box of the crossing points
        - sep_AvgSpanSgmt: sep_SpanLen divided by the number of crossing points
        - sep_AvgSepLen  : average length of the crossed separators
        All numerical features are 0 when there is no crossing point.
        """
        # graphical separators
        dNS = {"pc": PageXml.NS_PAGE_XML}
        someNode = self.lNode[0]
        ndPage = someNode.node.xpath("ancestor::pc:Page", namespaces=dNS)[0]
        lNdSep = ndPage.xpath(".//pc:SeparatorRegion", namespaces=dNS)
        loSep = [ShapeLoader.node_to_LineString(_nd) for _nd in lNdSep]

        if self.bVerbose: traceln(" %d graphical separators" % len(loSep))

        # make an indexed rtree
        idx = index.Index()
        for i, oSep in enumerate(loSep):
            idx.insert(i, oSep.bounds)

        # take each edge in turn and list the separators it crosses
        nCrossing = 0
        for edge in self.lEdge:
            oEdge = geom.LineString([(edge.A.x1, edge.A.y1),
                                     (edge.B.x1, edge.B.y1)])
            prepO = prep(oEdge)
            lCrossingPoints = []
            fSepTotalLen = 0
            for i in idx.intersection(oEdge.bounds):
                # check each candidate in turn
                oSep = loSep[i]
                if not prepO.intersects(oSep):
                    continue
                oPt = oEdge.intersection(oSep)
                if not isinstance(oPt, geom.Point):
                    traceln('Intersection in not a point: skipping it')
                    continue
                lCrossingPoints.append(oPt)
                # summed only for counted crossings, to keep the average right
                fSepTotalLen += oSep.length

            nPoint = len(lCrossingPoints)
            if nPoint:
                nCrossing += 1
                edge.bCrossingSep = True
                edge.sep_NbCrossing = nPoint
                minx, miny, maxx, maxy = geom.MultiPoint(
                    lCrossingPoints).bounds
                edge.sep_SpanLen = abs(minx - maxx) + abs(miny - maxy)
                edge.sep_AvgSpanSgmt = edge.sep_SpanLen / nPoint
                edge.sep_AvgSepLen = fSepTotalLen / nPoint
            else:
                edge.bCrossingSep = False
                edge.sep_NbCrossing = 0
                edge.sep_SpanLen = 0
                edge.sep_AvgSpanSgmt = 0
                edge.sep_AvgSepLen = 0

        if self.bVerbose:
            traceln(
                " %d (/ %d) edges crossing at least one graphical separator" %
                (nCrossing, len(self.lEdge)))
Пример #22
0
    def addClusterToDom(self, lCluster, sAlgo=""):
        """
        From the predicted clusters, create new TextRegion elements in the page.

        Each cluster becomes a TextRegion with id "cluster_<n>" (n starting
        at 1) that receives the DOM nodes of its members, a Coords element
        computed by ShapeLoader.minimum_rotated_rectangle, and one
        RegionRefIndexed entry in a freshly created ReadingOrder.
        The TextRegion elements that existed before the call are removed from
        the DOM; their "type" attribute is first copied onto their TextLine
        descendants as "type_gt".

        When options.bEvalRegion is set, none of this is done and the call is
        delegated to the parent class.

        lCluster: collection of clusters, each one a collection of indices
                  into self.lNode
        sAlgo:    only forwarded to the parent implementation
        return the list of created TextRegion nodes (or the result of the
        parent implementation in evaluation mode)
        """
        if hasattr(options, "bEvalRegion") and options.bEvalRegion:
            # in order to evaluate, we must keep the original TextRegion and the Cluster elements that are produced
            return super(My_ConjugateSegmenterGraph_MultiSinglePageXml,
                         self).addClusterToDom(lCluster, sAlgo=sAlgo)

        lNdCluster = []
        dNS = {
            "pc":
            "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"
        }

        # the graph has been constructed for a certain page
        pageNode = self.lNode[0].page.node

        # enumerate the TextRegion to remove (listed now, before the new
        # cluster regions are added to the page)
        lNdOldTextRegion = pageNode.xpath(".//pc:TextRegion", namespaces=dNS)

        # we copy the type attribute to the inner TextLine, to preserve the GT info, if any
        for ndTR in lNdOldTextRegion:
            # str() on purpose: a missing type is recorded as "None"
            sType = str(ndTR.get("type"))
            for ndTL in ndTR.xpath(".//pc:TextLine", namespaces=dNS):
                ndTL.set("type_gt", sType)

        # replace the ReadingOrder section by a new one
        # (a page without ReadingOrder is tolerated, and the old section is
        #  detached from its actual parent, wherever it sits under the page)
        lNdOldReadingOrder = pageNode.xpath(".//pc:ReadingOrder",
                                            namespaces=dNS)
        if lNdOldReadingOrder:
            ndOldReadingOrder = lNdOldReadingOrder[0]
            ndOldReadingOrder.getparent().remove(ndOldReadingOrder)
        ndReadingOrder = PageXml.createPageXmlNode('ReadingOrder')
        ndOrderedGroup = PageXml.createPageXmlNode('OrderedGroup')
        ndOrderedGroup.set("id", "ro_1")
        ndOrderedGroup.set("caption", "Regions reading order")
        ndReadingOrder.append(ndOrderedGroup)
        pageNode.append(ndReadingOrder)

        # loop over clusters
        for ic, c in enumerate(lCluster):
            ndCluster = PageXml.createPageXmlNode('TextRegion')
            scid = "cluster_%d" % (ic + 1)
            ndCluster.set("id", scid)
            ndCluster.set("custom", "readingOrder {index:%d;}" % ic)

            # TextRegion bounding box
            coords = PageXml.createPageXmlNode('Coords')
            ndCluster.append(coords)
            spoints = ShapeLoader.minimum_rotated_rectangle(
                [self.lNode[_i].node for _i in c])
            coords.set('points', spoints)

            # if the inner TextLine are tagged, let's do a vote to tag the Cluster
            lsType = [self.lNode[_i].node.get('type') for _i in c]
            dType = Counter([o for o in lsType if o is not None])
            mc = dType.most_common(1)
            if mc:
                sXmlLabel = mc[0][0]
                ndCluster.set("type", sXmlLabel)
                PageXml.setCustomAttr(ndCluster, "structure", "type",
                                      sXmlLabel)

            # TextLine: move the DOM node of the content to the cluster
            for _i in c:
                ndCluster.append(self.lNode[_i].node)

            pageNode.append(ndCluster)
            ndCluster.tail = "\n"
            lNdCluster.append(ndCluster)

            # reading order entry, in cluster order
            ndRegionRefIndexed = PageXml.createPageXmlNode('RegionRefIndexed')
            ndRegionRefIndexed.set("index", str(ic))
            ndRegionRefIndexed.set("regionRef", scid)
            ndRegionRefIndexed.tail = "\n"
            ndOrderedGroup.append(ndRegionRefIndexed)

        # remove the old TextRegion (their content was moved to the clusters)
        for nd in lNdOldTextRegion:
            nd.getparent().remove(nd)

        return lNdCluster
Пример #23
0
    def makeTableNode(self):
        """
        Build and return the TableRegion DOM node of this table.

        One TableCell is created for each (row, col) of self._dCellNd that
        holds a non-empty list of nodes. The cell gets a Coords element (convex
        hull of its content), a CornerPts element, and then the content nodes
        themselves are appended to it.
        The Coords of the table is the minimum rotated rectangle around the
        union of the cell hulls.
        """
        lKey = self._dCellNd.keys()
        lRowIndex = sorted({_row for _row, _col in lKey})
        lColIndex = sorted({_col for _row, _col in lKey})

        ndTable = PageXml.createPageXmlNode("TableRegion")
        ndTable.set("id", "p%s_%s" % (self.pagenum, self.tablenum))
        ndTable.tail = "\n"

        lHull = []  # shape of each cell, kept to compute the table contour
        lCell = []  # the TableCell nodes, in row-major order
        for iRow in lRowIndex:
            for iCol in lColIndex:
                lNdContent = self._dCellNd[(iRow, iCol)]
                if not lNdContent:
                    # nothing in that cell: no TableCell is produced
                    continue

                ndCell = PageXml.createPageXmlNode("TableCell")
                sCellId = "p%s_t%s_r%s_c%s" % (self.pagenum, self.tablenum,
                                               iRow, iCol)
                ndCell.set("id", sCellId)

                # geometry of the cell: convex hull of what it contains
                oCellHull = ShapeLoader.convex_hull(lNdContent,
                                                    bShapelyObject=True)
                lHull.append(oCellHull)

                ndCellCoords = PageXml.createPageXmlNode("Coords")
                sCellPoints = ShapeLoader.getCoordsString(oCellHull,
                                                          bFailSafe=True)
                ndCellCoords.set("points", sCellPoints)
                ndCellCoords.tail = "\n"
                ndCell.append(ndCellCoords)

                # position in the grid; no spanning cell is generated
                for sName, sValue in (("row", str(iRow)), ("rowSpan", "1"),
                                      ("col", str(iCol)), ("colSpan", "1")):
                    ndCell.set(sName, sValue)
                ndCell.tail = "\n"

                ndCorner = PageXml.createPageXmlNode("CornerPts")
                ndCorner.text = "0 1 2 3"
                ndCell.append(ndCorner)

                # the content comes after Coords and CornerPts
                ndCell.extend(lNdContent)

                lCell.append(ndCell)

        # Table geometry, from the union of the (valid) cell shapes
        lUsableShape = [
            p if p.is_valid else p.convex_hull for p in lHull
        ]
        oTableRect = cascaded_union(lUsableShape).minimum_rotated_rectangle
        ndTableCoords = PageXml.createPageXmlNode("Coords")
        ndTableCoords.set("points",
                          ShapeLoader.getCoordsString(oTableRect,
                                                      bFailSafe=True))
        ndTableCoords.tail = "\n"
        ndTable.append(ndTableCoords)

        ndTable.extend(lCell)

        return ndTable
def project_Elt_to_GT(gtdoc, doc
                      , xpElement1, xpElement2
                      , xpArea2
                      , bSep, lsRmId, bEval
                      , fTH=0.5):
    """
    Here we take the elements out of the production file to put them in the
    GT doc, page by page.

    The GT is first emptied of its own elements (those selected by xpElement1,
    unless it equals xpArea2, and those selected by xpElement2). Then each
    element selected by xpElement2 in a page of doc is deep-copied either
    into the GT area (xpArea2) that it overlaps most, provided the overlap
    exceeds fTH of the element area, or else into a new TextRegion appended
    to the GT page.

    gtdoc:      the GT document, modified in place
    doc:        the production document; its nodes are copied, not moved
    xpElement1: XPath of the GT elements to remove from the GT
    xpElement2: XPath of the elements to project
    xpArea2:    XPath of the GT areas receiving the projected elements
    bSep:       if true, the SeparatorRegion of the GT are replaced by those
                of doc
    lsRmId:     ids of GT elements to remove beforehand (can be empty or None)
    bEval:      if true, trace how many elements land in the same place as
                the GT element of same id
    fTH:        minimal (exclusive) ratio overlap-area / element-area to put
                an element in an area

    We return the GT doc

    Raise GTProjectionException if the two documents differ in number of pages
    Raise ProjectException if no area at all was found in the GT
    """
    gtroot = gtdoc.getroot()

    # Evaluation
    # we build a table of list of TextLineId from the GT to check this SW
    # table_id -> row -> col -> list of element id
    # (what is outside any table is under the key None, None, None)
    dTable = defaultdict(lambda : defaultdict(lambda : defaultdict(list)))
    nOk, nTot = 0, 0

    if lsRmId:
        nbEltRemoved = 0
        for sRmId in lsRmId:
            for _nd in gtroot.xpath('//*[@id="%s"]'%sRmId):
                _nd.getparent().remove(_nd)
                nbEltRemoved += 1
        trace(" (Rm by ID: %d elements removed)" % nbEltRemoved)

    # remove all elements of interest from GT
    # inside TableRegion, we have TextLine, outside we have TextRegion
    if xpElement1 != xpArea2:
        for ndElt in gtroot.xpath(xpElement1, namespaces=dNS):
            if bEval:
                for ndElt2 in ndElt.xpath(xpElement2, namespaces=dNS):
                    dTable[None][None][None].append(ndElt2.get("id"))
            ndElt.getparent().remove(ndElt)
    for ndElt in gtroot.xpath(xpElement2, namespaces=dNS):
        ndCell = ndElt.getparent()
        if bEval:
            dTable[ndCell.getparent().get("id")][ndCell.get("row")][ndCell.get("col")].append(ndElt.get("id"))
        ndCell.remove(ndElt)
    if bEval: traceln("\npEvaluation mode")

    if bSep:
        nbSepRemoved, nbSepAdded = 0, 0
        for _nd in gtroot.xpath('//pg:SeparatorRegion', namespaces=dNS):
            _nd.getparent().remove(_nd)
            nbSepRemoved += 1
        trace(" (Separators: %d removed" % nbSepRemoved)

    # project the GT areas, page by page
    lNdPage   = doc.getroot().xpath("//pg:Page", namespaces=dNS)
    lNdPageGT =        gtroot.xpath("//pg:Page", namespaces=dNS)
    if len(lNdPage) != len(lNdPageGT):
        raise GTProjectionException("GT and input have different numbers of pages")
    assert len(lNdPage) > 0, "No page??"

    uniqID = 1
    nNdArea2 = 0
    for ndPage, ndPageGT in zip(lNdPage, lNdPageGT):
        lNdArea2 = ndPageGT.xpath(xpArea2, namespaces=dNS)
        loArea2 = [ShapeLoader.node_to_Polygon(nd) for nd in lNdArea2]
        nNdArea2 += len(lNdArea2)
        for ndElt in ndPage.xpath(xpElement2, namespaces=dNS):
            oElt = ShapeLoader.node_to_Polygon(ndElt)

            # best overlapping area, if any
            lOvrl = [oElt.intersection(o).area for o in loArea2]
            iMax = argmax(lOvrl) if lOvrl else None
            vMax = -1 if iMax is None else lOvrl[iMax]

            # where to add it?
            # (vMax > 0 implies a non-null element area, so the division is safe)
            if vMax > 0 and vMax / oElt.area > fTH:
                # ok, this is a match
                ndCell = lNdArea2[iMax]
                # add it directly to the area2 (TableCell)
                ndCell.append(deepcopy(ndElt))
                if bEval:
                    if ndElt.get("id") in dTable[ndCell.getparent().get("id")][ndCell.get("row")][ndCell.get("col")]:
                        nOk += 1
                    else:
                        try:
                            traceln('FAILED:in table: id="%s" "%s"' % (ndElt.get("id"), ndElt.xpath(".//pg:Unicode", namespaces=dNS)[0].text))
                        except IndexError:
                            traceln('FAILED:in table: id="%s" NOTEXT"' % (ndElt.get("id")))

            else:
                # add it outside of any area
                bestNd = ndPageGT
                # add it in its own TextRegion
                ndTR = etree.Element("TextRegion")
                ndTR.set("id", "prjct_region_%d" % uniqID)
                uniqID += 1
                ndTR.set("custom", "")
                # the region takes the Coords of the element it wraps
                ndTR.append(deepcopy(ndElt.xpath("./pg:Coords", namespaces=dNS)[0]))
                ndTR.append(deepcopy(ndElt))
                bestNd.append(ndTR)
                if bEval:
                    if ndElt.get("id") in dTable[None][None][None]:
                        nOk += 1
                    else:
                        try:
                            traceln('FAILED:in table: id="%s" "%s"' % (ndElt.get("id"), ndElt.xpath(".//pg:Unicode", namespaces=dNS)[0].text))
                        except IndexError:
                            traceln('FAILED:in table: id="%s" NOTEXT"' % (ndElt.get("id")))

            nTot += 1

        if bSep:
            # relative XPath: only the separators of THIS page
            # (an absolute '//' would select those of the whole document and
            #  copy them into every GT page)
            for _nd in ndPage.xpath('.//pg:SeparatorRegion', namespaces=dNS):
                ndPageGT.append(deepcopy(_nd))
                nbSepAdded += 1
    if bSep: trace(", %d added.)  " % nbSepAdded)

    if bEval:
        traceln("-"*40)
        # the small constant avoids a division by zero when nothing was projected
        trace(" - evaluation: %d ok out of %d = %.2f%%\n" % (nOk, nTot, 100*nOk / (nTot+0.0001)))

    # NOTE(review): GTProjectionException is the exception used above;
    # ProjectException is not defined in the visible part of this file - confirm it exists.
    if nNdArea2 == 0: raise ProjectException("Empty GT")
    return gtdoc
    def map_to_rows(cls, ndPage, maxRow, lCluster):
        """
        find linear separators separating rows

        Each cluster is given the attributes minY and maxY (lowest and highest
        Y of the end points of the Baseline of its members), then row1 and
        row2; its minrow attribute may be corrected.
        The SeparatorRegion of the page that have an "algo" attribute are
        removed, and one horizontal SeparatorRegion (algo="tabulate_rows") is
        added to the page per row in 0..maxRow, 20 units above the smallest Y
        observed for that row, spanning the image width.

        ndPage:   the Page node; must have an imageWidth attribute
        maxRow:   index of the last row
        lCluster: the clusters; each has a minrow attribute and a getSetID()
                  method giving the ids of the elements owning a Baseline
        return None
        """
        # reflect each cluster by the highest point (highest ending points of baselines)
        # a row without any element keeps the huge default value
        dMinYByRow = defaultdict(lambda :9999999999)
        for c in lCluster:
            c.maxY = -1
            c.minY = 9999999999
            for _id in c.getSetID():
                # the element looks like:
                # <TextLine id="r1l5" custom="readingOrder {index:4;}" DU_cluster="0" row="0" rowSpan="1" col="0" colSpan="1">
                #   <Coords points="217,688 245,685 273,685 301,688 329,690 358,689 358,646 329,647 301,645 273,642 245,642 217,645"/>
                #   <Baseline points="217,688 245,685 273,685 301,688 329,690 358,689"/>
                #   <TextEquiv><Unicode>ung.</Unicode></TextEquiv>
                # </TextLine>
                nd = ndPage.xpath(".//*[@id='%s']/pg:Baseline"%_id, namespaces=dNS)[0]
                ls = ShapeLoader.node_to_LineString(nd)
                # the two end points of the baseline
                pA, pB = ls.boundary.geoms
                minY = min(pA.y, pB.y)
                c.minY = min(c.minY, minY)
                c.maxY = max(c.maxY, max((pA.y, pB.y)))
                dMinYByRow[c.minrow] = min(dMinYByRow[c.minrow], minY)

        # check consistency
        for c in lCluster:
            # from the last row upward; i is always above c.minrow here
            for i in range(maxRow, c.minrow, -1):
                if c.minY > dMinYByRow[i]:
                    # how possible??? fix!!
                    c.minrow = i
                    break

        # compute row1 and row2
        for c in lCluster:
            c.row1 = c.minrow
            c.row2 = c.minrow
            for i in range(0, maxRow+1):
                if c.maxY > dMinYByRow[i]:
                    c.row2 = i
                else:
                    break

        w = float(ndPage.get("imageWidth"))

        # get rid of previously generated separators
        # (detached from their actual parent, since the XPath also selects
        #  separators that are not direct children of the page)
        for nd in ndPage.xpath(".//pg:SeparatorRegion[@algo]", namespaces=dNS):
            nd.getparent().remove(nd)

        # one horizontal separator per row
        for row in range(maxRow+1):
            Y0 = dMinYByRow[row] - 20
            Yw = Y0
            ndSep = PageXml.createPageXmlNode("SeparatorRegion")
            ndSep.set("algo", "tabulate_rows")
            ndCoords = PageXml.createPageXmlNode("Coords")
            ndCoords.set("points", "%d,%d %d,%d" %(0, Y0, w, Yw))
            ndSep.append(ndCoords)
            ndSep.tail = "\n"
            ndPage.append(ndSep)

        return
Пример #26
0
    def addSeparatorFeature(self):
        """
        We load the graphical separators
        Compute a set of shapely objects
        In turn, for each edge, we compute the intersection with all separators

        Each edge of self.lEdge is reflected by a segment: horizontal, at the
        middle of the common Y range, when the two boxes do not overlap
        horizontally; otherwise vertical, at the middle of the common X range.

        The edge features, set as attributes of each edge, are:
        - bCrossingSep    boolean: at least crossing one separator
        - sep_NbCrossing  number of crossing points
        - sep_SpanLen     span length of the crossing points
                          (width + height of their bounding box)
        - sep_AvgSpanSgmt span length divided by the number of crossing points
        - sep_AvgSepLen   total length of the crossed separators divided by
                          the number of crossing points

        The separators and their spatial index are cached on
        PageXmlSeparatorRegion, and reused when the method is called again
        for the same object.
        """

        if self._cachedSelf == self:
            # same call... (by reified edge code maybe)
            (loSep, idx) = PageXmlSeparatorRegion._cached
        else:
            # graphical separators
            dNS = {"pc":PageXml.NS_PAGE_XML}
            someNode = self.lNode[0]
            lndPage =  someNode.node.xpath("//pc:Page", namespaces=dNS)
            assert len(lndPage) == 1, "INTERNAL ERROR: CODE NOT READY FOR MULTIPAGE..."
            ndPage = someNode.node.xpath("ancestor::pc:Page", namespaces=dNS)[0]
            lNdSep = ndPage.xpath(".//pc:SeparatorRegion", namespaces=dNS)
            loSep = [ShapeLoader.node_to_LineString(_nd) for _nd in lNdSep]

            if self.bVerbose: traceln("\t\t\t%d graphical separators"%len(loSep))

            # make an indexed rtree
            idx = index.Index()
            for i, oSep in enumerate(loSep):
                idx.insert(i, oSep.bounds)

            PageXmlSeparatorRegion._cachedSelf = self
            PageXmlSeparatorRegion._cached     = (loSep, idx)

        # take each edge in turn and list the separators it crosses
        nCrossing = 0
        for edge in self.lEdge:
            xMiddle = (max(edge.A.x1, edge.B.x1) + min(edge.A.x2, edge.B.x2)) / 2.0
            yMiddle = (max(edge.A.y1, edge.B.y1) + min(edge.A.y2, edge.B.y2)) / 2.0
            if max(edge.A.x1, edge.B.x1) > min(edge.A.x2, edge.B.x2):
                # horizontal edge
                oEdge = geom.LineString([  (min(edge.A.x2, edge.B.x2), yMiddle)
                                         , (max(edge.A.x1, edge.B.x1), yMiddle)])
            else:
                # vertical edge or box overlap
                oEdge = geom.LineString([  (xMiddle, min(edge.A.y2, edge.B.y2))
                                         , (xMiddle, max(edge.A.y1, edge.B.y1))])

            prepO = prep(oEdge)
            lCrossingPoints = []
            fSepTotalLen = 0
            for i in idx.intersection(oEdge.bounds):
                # check each candidate in turn
                oSep = loSep[i]
                if prepO.intersects(oSep):
                    # NOTE: the length is counted even if the intersection
                    # is skipped below
                    fSepTotalLen += oSep.length
                    oPt = oEdge.intersection(oSep)
                    if isinstance(oPt, geom.Point):
                        lCrossingPoints.append(oPt)
                    elif isinstance(oPt, geom.MultiPoint):
                        # .geoms is required with Shapely 2, where a
                        # multi-part geometry is no longer iterable
                        for p in oPt.geoms:
                            lCrossingPoints.append(geom.Point((p.x, p.y)))
                    else:
                        traceln('\t\t\tIntersection in not a point: skipping it')

            if lCrossingPoints:
                nCrossing += 1
                edge.bCrossingSep = True
                edge.sep_NbCrossing = len(lCrossingPoints)
                minx, miny, maxx, maxy  = geom.MultiPoint(lCrossingPoints).bounds
                edge.sep_SpanLen = abs(minx-maxx) + abs(miny-maxy)
                edge.sep_AvgSpanSgmt = edge.sep_SpanLen / len(lCrossingPoints)
                edge.sep_AvgSepLen = fSepTotalLen / len(lCrossingPoints)
            else:
                edge.bCrossingSep = False
                edge.sep_NbCrossing = 0
                edge.sep_SpanLen = 0
                edge.sep_AvgSpanSgmt = 0
                edge.sep_AvgSepLen = 0

            #traceln((edge.A.domid, edge.B.domid, edge.bCrossingSep, edge.sep_NbCrossing, edge.sep_SpanLen, edge.sep_AvgSpanSgmt, edge.sep_AvgSepLen))

        if self.bVerbose:
            traceln("\t\t\t%d (/ %d) edges crossing at least one graphical separator"%(nCrossing, len(self.lEdge)))
    def loadPage(self,
                 ndPage,
                 shaper_fun=ShapeLoader.node_to_Point,
                 funIndex=lambda x: x._du_index,
                 bIgnoreHeader=False):
        """
        load the page, looking for Baseline

        Each Baseline node receives a "du_index" attribute and is converted
        into a shape by shaper_fun. The shape is decorated with the
        attributes _du_index, _du_DU_row (DU_row attribute of the parent
        node, can be None), _du_nd (the Baseline node) and _dom_id (id of the
        parent node).

        ndPage:        the Page node
        shaper_fun:    function making a shape out of a Baseline node
        funIndex:      function giving, for a shape, the value stored in the
                       sets by row
        bIgnoreHeader: if true, ignore the Baseline whose parent has
                       DU_header="CH", as well as the cells left empty

        return a list of shapely objects
             , a dict of sets of funIndex values, by row number

        GT BUG: some Baseline are assigned to the wrong Cell
        => we also fix this here....

        """
        loBaseline = []  # list of Baseline shapes
        i = 0

        dsetTableByRow = defaultdict(set)  # sets of object ids, by row

        dNodeSeen = {}  # id of the parent of each Baseline found in a cell
        # first associate a unique id to each baseline and list them
        lshapeCell = []
        lOrphanBaselineShape = []
        for ndCell in MultiPageXml.getChildByName(ndPage, "TableCell"):
            row, col = ndCell.get("row"), ndCell.get("col")
            plg = ShapeLoader.node_to_Polygon(ndCell)
            #ymin, ymax of polygon
            ly = [_y for _x, _y in plg.exterior.coords]
            ymin, ymax = min(ly), max(ly)
            plg._row = int(row)
            plg._col = int(col)
            plg._ymin, plg._ymax = ymin, ymax

            i0 = i
            for nd in MultiPageXml.getChildByName(ndCell, "Baseline"):
                nd.set("du_index", "%d" % i)
                ndParent = nd.getparent()
                dNodeSeen[ndParent.get('id')] = True
                if bIgnoreHeader and ndParent.get("DU_header") == "CH":
                    # NOTE(review): i is not incremented here, so the next
                    # Baseline gets the same du_index - confirm it is intended
                    continue
                row_lbl = ndParent.get("DU_row")

                # Baseline as a shapely object
                try:
                    o = shaper_fun(nd)  #make a LineString
                except Exception:
                    # tell which element is faulty, and let the error go
                    traceln("ERROR: id=", nd.getparent().get("id"))
                    raise
                o._du_index = i
                o._du_DU_row = row_lbl  # can be None
                o._du_nd = nd
                o._dom_id = nd.getparent().get("id")
                loBaseline.append(o)

                # is this object in the correct cell???
                # We must use the centroid of the text box, otherwise a baseline
                # may be assigned to the next row
                #y = o.centroid.y # NOO!!
                y = ShapeLoader.node_to_Polygon(ndParent).centroid.y
                # we allow the content of a cell to overlap the cell lower border
                if ymin <= y:
                    dsetTableByRow[int(row)].add(funIndex(o))
                else:
                    # this is an orphan!
                    o._bad_cell = plg
                    lOrphanBaselineShape.append(o)

                i += 1

            if bIgnoreHeader and i0 == i:
                continue  # empty cells, certainly due to headers, ignore it.

            lshapeCell.append(plg)
            # end for

        if lOrphanBaselineShape:
            traceln(
                "    *** error: %d Baseline in incorrect row - fixing this..."
                % len(lOrphanBaselineShape))
            for o in lOrphanBaselineShape:
                bestrow, bestdeltacol = 0, 9999
                try:
                    y = o.y
                except AttributeError:
                    # not a point-like shape
                    y = o.centroid.y
                # look for a cell at that height, the closest in column
                for plg in lshapeCell:
                    if plg._ymin <= y and y <= plg._ymax:
                        # sounds good
                        deltacol = abs(o._bad_cell._col - plg._col)
                        if deltacol == 0:
                            # same column, ok it is that one
                            bestrow = plg._row
                            break
                        else:
                            if bestdeltacol > deltacol:
                                bestdeltacol = deltacol
                                bestrow = plg._row
                traceln("\t id=%s misplaced in row=%s instead of row=%s" %
                        (o._du_nd.getparent().get("id"), o._bad_cell._row,
                         bestrow))
                dsetTableByRow[bestrow].add(funIndex(o))
                del o._bad_cell

        # and (UGLY) process all Baseline outside any TableCell...

        for nd in MultiPageXml.getChildByName(ndPage, "Baseline"):
            # -> TextLine -> TableCell (possibly)
            ndPrnt = nd.getparent()
            if ndPrnt.get('id') in dNodeSeen:
                continue  # already processed, as content of a cell

            nd.set("du_index", "%d" % i)
            row_lbl = ndPrnt.get("DU_row")

            # Baseline as a shapely object
            o = shaper_fun(nd)  #make a LineString
            o._du_index = i
            o._du_row = None  # Must be None
            o._du_DU_row = row_lbl  # can be None
            o._du_nd = nd
            o._dom_id = nd.getparent().get("id")

            loBaseline.append(o)

            i += 1

        return loBaseline, dsetTableByRow
Пример #28
0
def checkClusteringPerRow(pageNd, dCellCndPerRow, dTextInCells, dClusterWithTL,
                          dEdges):
    """
        recompute clusters for the rows

        For each candidate row, the text lines of each non-empty cell are
        grouped by connected components. When more than 2 cells have text and
        the most common number of groups per cell is not 1, the row is to be
        cut: the groups of the cells showing that number of groups are sorted
        vertically and merged rank by rank into new clusters; the text lines
        of the other cells are set aside. A 'LINE' element is appended to
        pageNd for each new cluster.

        return the list of row ids to delete
             , the list of rectangles of the new clusters
             , the new clusters as lists of text lines
             , the new clusters as lists of groups of text lines
             , the text lines set aside
    """
    lRowtodel, lNewPoly = [], []
    lNewClusterWithTL, lNewClusterWithCC = [], []
    lTLtoBeAdded = []

    for rowid, rowCells in dCellCndPerRow.items():
        # groups of text line ids, for each cell of the row having some text
        lClusterPerCol = [
            connected_components(
                dEdges, [tl.get('id') for tl in dTextInCells[cell.wkt]],
                TH=0.5)
            for cell in rowCells if cell.wkt in dTextInCells
        ]
        lNbClusterPerCol = [len(lcc) for lcc in lClusterPerCol]

        if len(lNbClusterPerCol) <= 2:
            # not enough evidence in that row
            continue

        try:
            nbRows = mode(lNbClusterPerCol)
        except statistics.StatisticsError:
            nbRows = 2
        if nbRows == 1:
            # the row is fine
            continue

        print('WARNING CUT ', rowid, "mode", nbRows)
        lRowtodel.append(rowid)

        dClusterCells = defaultdict(list)
        for colcluster in lClusterPerCol:
            # get tl instead of ids
            lGroupTL = [[
                tl for tl in dClusterWithTL[rowid] if tl.get('id') in cc
            ] for cc in colcluster]
            if len(colcluster) != nbRows:
                # cannot be aligned with the other cells
                for ltl in lGroupTL:
                    lTLtoBeAdded.extend(ltl)
                continue
            # take the first element only for comparing position
            lGroupTL = sorted(
                lGroupTL,
                key=lambda ltl: ShapeLoader.node_to_Polygon(ltl[0]).centroid.y)
            for rank, ltl in enumerate(lGroupTL):
                dClusterCells[rank].append(ltl)

        for lcc in dClusterCells.values():
            lMember = [x for cc in lcc for x in cc]
            # minimal_rectangle?
            rect = scale(ShapeLoader.contourObject(lMember).envelope, xfact=2)
            lNewPoly.append(rect)
            lNewClusterWithCC.append(lcc)
            lNewClusterWithTL.append(list(lMember))
            # need also to create a cluster with list of ids!!
            rNd = etree.Element('LINE')
            rNd.set('points', ShapeLoader.getCoordsString(rect))
            pageNd.append(rNd)
            # final check: if overlap between rectangle: top rectangle is the cut

    #return updated list of lCLusters
    return lRowtodel, lNewPoly, lNewClusterWithTL, lNewClusterWithCC, lTLtoBeAdded
def main(lsFilename, lsOutFilename):
    """
    Annotate the separators of each input file as consistent or not with the
    tables, and write the result in the corresponding output file.

    Each SeparatorRegion gets the attribute DU_Sep="S" when consistent,
    "I" otherwise. A horizontal separator is inconsistent with a table when
    some text above it belongs to a row which is not before the rows of the
    texts below it; similarly for a vertical separator, with left/right and
    columns. Each TextLine of a table cell also gets a "Horizontal" attribute
    ("TRUE" or "nope").

    lsFilename:    the input files
    lsOutFilename: the output files, in the same order
    """
    #for the pretty printer to format better...
    parser = etree.XMLParser(remove_blank_text=True)
    cnt, cntS = 0, 0
    for sFilename, sOutFilename in zip(lsFilename, lsOutFilename):
        cntDoc, cntDocS = 0, 0

        doc = etree.parse(sFilename, parser)
        root = doc.getroot()

        # Separators are not under tableRegion... :-/
        lNdSep = MultiPageXml.getChildByName(root ,'SeparatorRegion')
        loSep = [ShapeLoader.node_to_LineString(ndSep) for ndSep in lNdSep]
        # consistent, until some table proves the contrary
        for _o in loSep: _o._bConsistent = True

        if not lNdSep:
            traceln("Warning: no separator in %s"%sFilename)
        else:
            traceln("%25s  %d separators" % (sFilename, len(lNdSep)))
            lNdTR = MultiPageXml.getChildByName(root ,'TableRegion')
            for ndTR in lNdTR:
                lNdCells= MultiPageXml.getChildByName(ndTR ,'TableCell')
                if not lNdCells:
                    continue

                # build a list of Shapely objects augmented with our own table attributes
                loText = [] #
                for ndCell in lNdCells:
                    minRow = int(ndCell.get('row'))
                    minCol = int(ndCell.get('col'))
                    maxRow = minRow + int(ndCell.get('rowSpan')) - 1
                    maxCol = minCol + int(ndCell.get('colSpan')) - 1
                    for ndText in MultiPageXml.getChildByName(ndCell ,'TextLine'):
                        try:
                            oText = ShapeLoader.node_to_Polygon(ndText)
                        except Exception:
                            # best effort: a text without proper shape is ignored
                            traceln("WARNING: SKIPPING 1 TExtLine: cannot make a polygon from: %s" % etree.tostring(ndText))
                            continue
                        # reflecting the textbox as a single point
                        (minx, miny, maxx, maxy) = oText.bounds

                        # is the baseline horizontal or vertical??
                        # the point is at half the smallest dimension from
                        # the start of the box, centered on the other axis
                        fDelta = min((maxx-minx) / 2.0, (maxy-miny) / 2.0)
                        if isBaselineHorizontal(ndText):
                            # supposed Horizontal text
                            oText = geom.Point(minx + fDelta  , (maxy + miny)/2.0)
                            ndText.set("Horizontal", "TRUE")

                        else:
                            ndText.set("Horizontal", "nope")
                            oText = geom.Point((minx + maxx)/2.0  , miny + fDelta)

                        # considering it as a point, using its centroid
                        # does not work well due to loooong texts oText = oText.centroid
                        oText._minRow, oText._minCol = minRow, minCol
                        oText._maxRow, oText._maxCol = maxRow, maxCol
                        if DEBUG: oText._domnd = ndText
                        loText.append(oText)

                traceln("    TableRegion  %d texts" % (len(loText)))

                if loText:
                    # checking in turn each separator for table-consistency
                    sp = ShapePartition(loText)

                    for oSep in loSep:
                        (minx, miny, maxx, maxy) = oSep.bounds
                        if maxx - minx >= maxy - miny:
                            # supposed Horizontal
                            l = sp.getObjectAboveLine(oSep)
                            if l:
                                maxRowBefore = max(_o._maxRow for _o in l)
                                l = sp.getObjectBelowLine(oSep)
                                if l:
                                    minRowAfter  = min(_o._minRow for _o in l)
                                    if maxRowBefore >= minRowAfter: oSep._bConsistent = False
                        else:
                            l1 = sp.getObjectOnLeftOfLine(oSep)
                            if l1:
                                maxColBefore = max(_o._maxCol for _o in l1)
                                l2 = sp.getObjectOnRightOfLine(oSep)
                                if l2:
                                    minColAfter  = min(_o._minCol for _o in l2)
                                    if maxColBefore >= minColAfter:
                                        oSep._bConsistent = False
                                        if DEBUG:
                                            # DEBUG
                                            for o in l1:
                                                if o._maxCol >= minColAfter: print("too much on right", etree.tostring(o._domnd))
                                            for o in l2:
                                                if o._minCol <= maxColBefore: print("too much on left", etree.tostring(o._domnd))
                # end of TableRegion
            # end of document
            for ndSep, oSep in zip(lNdSep, loSep):
                if oSep._bConsistent:
                    ndSep.set("DU_Sep", "S")
                    cntDocS += 1
                else:
                    ndSep.set("DU_Sep", "I")
                cntDoc += 1

        doc.write(sOutFilename, encoding='utf-8',pretty_print=True,xml_declaration=True)
        # the small constant avoids a division by zero when there is no separator
        traceln('%.2f%% consistent separators - annotation done for %s  --> %s' % (100*float(cntDocS)/(cntDoc+0.000001), sFilename, sOutFilename))

        del doc
        cnt, cntS = cnt+cntDoc, cntS+cntDocS
    traceln('%.2f%% consistent separators - annotation done for %d files' % (100*float(cntS)/(cnt+0.000001), cnt))
Пример #30
0
def _buildCellGridWithText(lClusterPoly, lZonePoly, dClusterWithTL, nbRows):
    """
        Intersect the row polygons with the column zones, then collect the
        text lines falling in the candidate cells of the first nbRows rows.

        Returns the four values of createCellZoneFromIntersection followed by
        dTextInCells (key = cell.wkt).
    """
    dCells, dShapeCells, dCellCndPerRow, dCellCndPerCol = createCellZoneFromIntersection(
        lClusterPoly, lZonePoly)
    dTextInCells = {}  #key = cell.wkt
    for i in range(nbRows):
        dTextInCells.update(
            assignTextLinesToCells(dClusterWithTL[i], dCellCndPerRow[i]))
    return dCells, dShapeCells, dCellCndPerRow, dCellCndPerCol, dTextInCells


def createCellsFromZoneClusters(pageNd, dIds, lZones, lClusters, dEdges):
    """
        TOP DOWN: CREATE A GRID TABLE AND REFINE IT

        Create cells by intersecting the row clusters with the column zones,
        reshape the row polygons, rebuild the grid, optionally replace rows
        reported by checkClusteringPerRow, and finally compute the profile.

        Args:
            pageNd: page node, forwarded to the helper functions
            dIds: forwarded to getClusterTL
            lZones: list of zone nodes for columns (re-sorted here by the x of
                their polygon centroid)
            lClusters: list of cluster NODES (re-sorted here by the y of their
                polygon centroid)
            dEdges: forwarded to checkClusteringPerRow

        Returns:
            (dUpdateCells, dNewCells, dShapeCells): the two values returned by
            computeProfile, plus the dShapeCells of the last grid computed.

        Design notes:
            idea! compute 'invariance' of the angle of the rectangle: if one column less: should be the same angle
               variance for detecting merge: del one column and see if several clusters?

            1 test if rectangle OK
                if all ok and no overlap: create rows and col

            # for overlapping:   -> connected components for each column
                create alternate rectangles by skipping one cell

            3 merge if same  #order (at cell level)

            4 split if n connected cells per column
                # how: create alternate rectangles?
    """
    # rows ordered top to bottom, columns ordered left to right
    lClusters = sorted(lClusters,
                       key=lambda x: ShapeLoader.node_to_Polygon(x).centroid.y)
    lZones = sorted(lZones,
                    key=lambda x: ShapeLoader.node_to_Polygon(x).centroid.x)

    # indexed below by position in the sorted lClusters
    dClusterWithTL = getClusterTL(pageNd, lClusters, dIds)

    lZonePoly = [ShapeLoader.node_to_Polygon(nd) for nd in lZones]

    ## if one clusterpoly is contained in another: merge
    lClusterPoly = transformMinima2envelop(pageNd, dClusterWithTL)

    # first grid: only needed to reshape the row polygons
    _, _, dCellCndPerRow, _, dTextInCells = _buildCellGridWithText(
        lClusterPoly, lZonePoly, dClusterWithTL, len(lClusters))

    lClusterPoly = shakeRectangleShapes(pageNd, lClusterPoly, dCellCndPerRow,
                                        dTextInCells)
    lClusterPoly = ajustRectangleShapes(pageNd, lClusterPoly)

    # again recompute cell zones, now with the reshaped row polygons
    dCells, dShapeCells, dCellCndPerRow, dCellCndPerCol, dTextInCells = _buildCellGridWithText(
        lClusterPoly, lZonePoly, dClusterWithTL, len(lClusters))

    # do it first ??
    #check if row is a merge
    lRowtobeDel, lNewRowCluster, lNewClusterTL, _lNewClusterCC, lTLtoBeAdded = checkClusteringPerRow(
        pageNd, dCellCndPerRow, dTextInCells, dClusterWithTL, dEdges)

    if lNewRowCluster:
        # kept rows are rebuilt from the original cluster nodes
        lClusterPoly = [
            ShapeLoader.node_to_Polygon(nd) for i, nd in enumerate(lClusters)
            if i not in lRowtobeDel
        ]
        dClusterWithTL = getClusterTL(
            pageNd,
            [c for i, c in enumerate(lClusters) if i not in lRowtobeDel], dIds)

        # new rows are appended after the kept ones: same offset in both structures
        nbKept = len(lClusterPoly)
        for i, ltl in enumerate(lNewClusterTL):
            dClusterWithTL[nbKept + i] = ltl
        lClusterPoly.extend(lNewRowCluster)

        dCells, dShapeCells, dCellCndPerRow, dCellCndPerCol, dTextInCells = _buildCellGridWithText(
            lClusterPoly, lZonePoly, dClusterWithTL, len(lClusterPoly))

        ## populate elements which are not in the grid/cells : at cc level or at tl level ?
        # NOTE(review): dict.update replaces the value of a key already present;
        # if assignTextLinesToCells returns entries for cells filled by the
        # previous pass, those earlier text lines are replaced, not extended
        # -- TODO confirm this is intended.
        for i in range(len(lClusterPoly)):
            dTextInCells.update(
                assignTextLinesToCells(lTLtoBeAdded, dCellCndPerRow[i]))
    ## oversegmentation : test links at cell level if compatible!see 0068

    #COMPUTE PROFILE
    dUpdateCells, dNewCells = computeProfile(dCells, dTextInCells,
                                             dCellCndPerRow, dCellCndPerCol)

    # TABLE CREATION

    return dUpdateCells, dNewCells, dShapeCells