예제 #1
0
def shakeRectangleShapes(pageNd, lClusterPoly, dCellCndPerRow, dTextInCells):
    """
    Recompute each row shape while ignoring the first cell of the row.

    For every row index of dCellCndPerRow (in ascending order), the text
    lines recorded in dTextInCells (keyed by cell WKT) for all cells but the
    first one are wrapped in a minimum rotated rectangle.

    - If that rectangle is not empty and its bounding-box height is smaller
      than the one of lClusterPoly[i], the rectangle, scaled with xfact=2,
      is kept and a 'COL' element carrying its points is appended to pageNd.
    - Otherwise lClusterPoly[i] is scaled with xfact=1.5, stored back into
      lClusterPoly (the caller's list is modified) and kept.

    Return the list of kept shapes, one per row index.
    """
    lShaken = []
    for iRow in sorted(dCellCndPerRow):
        # text lines of the row, first cell skipped
        lRowTL = []
        for cell in dCellCndPerRow[iRow][1:]:
            for tl in dTextInCells.get(cell.wkt, ()):
                assert ShapeLoader.node_to_Polygon(tl).is_valid
                lRowTL.append(tl)

        rect = ShapeLoader.minimum_rotated_rectangle(lRowTL,
                                                     bShapelyObject=True)

        # the rectangle wins only when it is flatter than the cluster shape
        bKeepRect = False
        if not rect.is_empty:
            fRectHeight = abs(rect.bounds[1] - rect.bounds[3])
            fClusterHeight = abs(lClusterPoly[iRow].bounds[1] -
                                 lClusterPoly[iRow].bounds[3])
            bKeepRect = fRectHeight < fClusterHeight

        if bKeepRect:
            ndCol = etree.Element('COL')
            pageNd.append(ndCol)
            rect = scale(rect, xfact=2)
            lShaken.append(rect)
            ndCol.set('points', ShapeLoader.getCoordsString(rect))
        else:
            lClusterPoly[iRow] = scale(lClusterPoly[iRow], xfact=1.5)
            lShaken.append(lClusterPoly[iRow])

    return lShaken
    def parseDocNodeLabel(self, graph_node, defaultCls=None):
        """
        Compute the label of a graph node from its DOM node and return it.

        The raw label is read from the attribute named self.sLabelAttr (or
        forced to 'CH' when column headers are handled and the node has
        DU_header="CH"), then mapped through self.dConverter.
        When the converter yields None, the label depends on where the
        centroid of the node lies in its parent shape: "Sm" inside the
        middle third (vertically), "St" above the parent centroid, "Sb"
        otherwise.

        Return self.dXmlLabel2Label[<label>].
        Raise a ValueError if the label is not a key of self.dXmlLabel2Label.
        Note: defaultCls is not used by this method.
        """
        ndDom = graph_node.node

        sRawLabel = ndDom.get(self.sLabelAttr)
        # in case we also deal with column headers
        if self.bColumnHeader and ndDom.get("DU_header") == 'CH':
            sRawLabel = 'CH'

        sXmlLabel = self.dConverter[sRawLabel]
        if sXmlLabel is None:
            # special processing for singletons TODO: make it more efficient?
            ptTxt = ShapeLoader.node_to_Polygon(ndDom).centroid
            plgCell = ShapeLoader.node_to_Polygon(ndDom.getparent())
            # same shape shrunk vertically to a third, around its centroid
            plgMiddle = shapely.affinity.scale(plgCell, 1, 0.333, 1,
                                               'centroid')
            if plgMiddle.contains(ptTxt):
                sXmlLabel = "Sm"
            elif ptTxt.y < plgCell.centroid.y:
                sXmlLabel = "St"
            else:
                sXmlLabel = "Sb"

        try:
            return self.dXmlLabel2Label[sXmlLabel]
        except KeyError:
            raise ValueError("Invalid label '%s'"
                             " (from @%s or @%s) in node %s" %
                             (sXmlLabel, self.sLabelAttr, self.sDefaultLabel,
                              etree.tostring(ndDom)))
예제 #3
0
    def addClusterToDom(self, dCluster, bMoveContent=False):
        """
        Add Cluster elements to the Page DOM node

        dCluster maps a cluster key to a list of indices into self.lNode.
        For each cluster, a 'Cluster' node is appended to the page node of
        the first clustered item. It carries:
        - @content: the space-separated ids of the clustered DOM nodes
        - a 'Coords' child whose @points is the minimum rotated rectangle
          of the union of the node shapes ("" if it cannot be computed)

        If bMoveContent is True, the clustered DOM nodes are moved under
        their Cluster node.

        Return None
        """
        pageNode = None
        for _key, lnidx in dCluster.items():
            if pageNode is None:
                # all clusters go to the page of the very first clustered node
                pageNode = self.lNode[lnidx[0]].page.node
                pageNode.append(
                    etree.Comment("Clusters created by the conjugate graph"))

            # Make it robust to bad data: nodes without a usable shape are
            # simply left out of the contour
            lp = []
            for _i in lnidx:
                try:
                    lp.append(ShapeLoader.node_to_Polygon(self.lNode[_i].node))
                except ValueError:
                    pass
            contour = cascaded_union(
                [p if p.is_valid else p.convex_hull for p in lp])

            # Best effort: the rotated rectangle may be a Polygon (exterior),
            # a simpler geometry (coords), or something without coordinates.
            # 'except Exception' rather than a bare except, so that
            # KeyboardInterrupt and SystemExit are not swallowed.
            try:
                spoints = ' '.join(
                    "%s,%s" % (int(pt[0]), int(pt[1]))
                    for pt in contour.minimum_rotated_rectangle.exterior.coords)
            except Exception:
                try:
                    spoints = ' '.join(
                        "%s,%s" % (int(pt[0]), int(pt[1]))
                        for pt in contour.minimum_rotated_rectangle.coords)
                    # JL got once a: NotImplementedError: Multi-part geometries do not provide a coordinate sequence
                except Exception:
                    spoints = ""

            ndCluster = PageXml.createPageXmlNode('Cluster')
            # add the space separated list of node ids
            ndCluster.set(
                "content",
                " ".join(self.lNode[_i].node.get("id") for _i in lnidx))
            coords = PageXml.createPageXmlNode('Coords')
            ndCluster.append(coords)
            coords.set('points', spoints)
            pageNode.append(ndCluster)

            if bMoveContent:
                # move the DOM node of the content to the cluster
                for _i in lnidx:
                    ndCluster.append(self.lNode[_i].node)

        return
예제 #4
0
def getClusterCoords(lElts):
    """
    Return the 'points' string of the convex hull of a list of DOM nodes.

    Each node is converted with ShapeLoader.node_to_Polygon; nodes raising a
    ValueError are ignored. Invalid shapes are replaced by their convex hull
    before computing the union.

    Return a space-separated list of "x,y" integer pairs, or "" when no
    coordinate sequence can be obtained from the hull.
    """
    lp = []
    for e in lElts:
        try:
            lp.append(ShapeLoader.node_to_Polygon(e))
        except ValueError:
            # bad data: leave this element out of the contour
            pass
    contour = cascaded_union([p if p.is_valid else p.convex_hull for p in lp])

    # Best effort: the hull may be a Polygon (exterior), a simpler geometry
    # (coords), or a geometry without coordinates.
    # 'except Exception' rather than a bare except, so that KeyboardInterrupt
    # and SystemExit are not swallowed.
    try:
        spoints = ' '.join("%s,%s" % (int(pt[0]), int(pt[1]))
                           for pt in contour.convex_hull.exterior.coords)
    except Exception:
        try:
            spoints = ' '.join("%s,%s" % (int(pt[0]), int(pt[1]))
                               for pt in contour.convex_hull.coords)
            # JL got once a: NotImplementedError: Multi-part geometries do not provide a coordinate sequence
        except Exception:
            spoints = ""
    return spoints
예제 #5
0
def assignTextLinesToCells(lTL, lCells):
    """
    Best-assign each text line node of lTL to one of the cell shapes.

    The polygon of each text line is given to bestRegionsAssignment together
    with lCells. Empty polygons, and text lines for which no cell is
    returned, are ignored.

    Return a dictionary: WKT of the cell shape -> list of text line nodes.

    # do it at the cc level?
    """
    # all shapes are computed before any assignment is attempted
    lShapes = [ShapeLoader.node_to_Polygon(nd) for nd in lTL]

    dCellTL = {}
    for idx, shape in enumerate(lShapes):
        if shape.is_empty:
            continue
        cell = bestRegionsAssignment(shape, lCells)
        if not cell:
            continue
        dCellTL.setdefault(cell.wkt, []).append(lTL[idx])
    return dCellTL
예제 #6
0
 def loadClusterNode(cls, ndPage, nd, sAlgo, bComputeShape=True):
     """
     Build a cluster object from its XML node.

     A node without @name is given one, "<sAlgo>_<cls.cnt>", and cls.cnt is
     incremented.
     The cluster content is the set of whitespace-separated ids in @content.
     Return None when that set is empty.
     The shape comes from the node itself; if reading it raises an
     IndexError, it is computed by cls.computeShape when bComputeShape is
     True, and is None otherwise.
     """
     sName = nd.get("name")
     if sName is None:
         sName = "%s_%d" % (sAlgo, cls.cnt)
         cls.cnt += 1
         nd.set("name", sName)

     setID = set(nd.get("content").split())
     if not setID:
         return None

     try:
         shape = ShapeLoader.node_to_Polygon(nd)
     except IndexError:
         shape = cls.computeShape(ndPage, setID) if bComputeShape else None
     return cls(sName, setID, shape)
def main(lsFilename, lsOutFilename):
    """
    Annotate the separators of each input file as table-consistent or not.

    Each file of lsFilename is parsed and written to the file of same rank
    in lsOutFilename, after setting on every SeparatorRegion node:
    - DU_Sep="S" if the separator is consistent with all TableRegion
    - DU_Sep="I" otherwise

    Each TextLine of a TableCell is reduced to a single point tagged with
    the rows and columns spanned by its cell. A separator is inconsistent
    when, in some table:
    - it is wider than high, and the highest row found on one side is not
      strictly lower than the lowest row found on the other side
    - it is higher than wide, and the same holds for columns on its left
      and right sides

    Side effect: each TextLine that could be converted gets a @Horizontal
    attribute ("TRUE" or "nope").
    Statistics are traced per file and globally.
    """
    #for the pretty printer to format better...
    parser = etree.XMLParser(remove_blank_text=True)
    cnt, cntS = 0, 0
    for sFilename, sOutFilename in zip(lsFilename, lsOutFilename):
        cntDoc, cntDocS = 0, 0

        doc = etree.parse(sFilename, parser)
        root = doc.getroot()

        # Separators are not under tableRegion... :-/
        lNdSep = MultiPageXml.getChildByName(root, 'SeparatorRegion')
        loSep = [ShapeLoader.node_to_LineString(ndSep) for ndSep in lNdSep]
        # consistent until proven otherwise
        for _o in loSep:
            _o._bConsistent = True

        if not lNdSep:
            traceln("Warning: no separator in %s"%sFilename)
        else:
            traceln("%25s  %d separators" % (sFilename, len(lNdSep)))
            for ndTR in MultiPageXml.getChildByName(root, 'TableRegion'):
                lNdCells = MultiPageXml.getChildByName(ndTR, 'TableCell')
                if not lNdCells:
                    continue

                # build a list of Shapely objects augmented with our own table attributes
                loText = []
                for ndCell in lNdCells:
                    minRow = int(ndCell.get('row'))
                    minCol = int(ndCell.get('col'))
                    maxRow = minRow + int(ndCell.get('rowSpan')) - 1
                    maxCol = minCol + int(ndCell.get('colSpan')) - 1
                    for ndText in MultiPageXml.getChildByName(ndCell, 'TextLine'):
                        try:
                            oText = ShapeLoader.node_to_Polygon(ndText)
                        except Exception:
                            # best effort: skip lines without a usable shape
                            traceln("WARNING: SKIPPING 1 TExtLine: cannot make a polygon from: %s" % etree.tostring(ndText))
                            continue
                        # reflecting the textbox as a single point
                        (minx, miny, maxx, maxy) = oText.bounds

                        # half of the smallest side of the bounding box
                        fDelta = min((maxx-minx) / 2.0, (maxy-miny) / 2.0)
                        if isBaselineHorizontal(ndText):
                            # supposed Horizontal text: point near the left end
                            oText = geom.Point(minx + fDelta, (maxy + miny)/2.0)
                            ndText.set("Horizontal", "TRUE")
                        else:
                            # otherwise: point near the top end
                            ndText.set("Horizontal", "nope")
                            oText = geom.Point((minx + maxx)/2.0, miny + fDelta)

                        # considering it as a point, using its centroid
                        # does not work well due to loooong texts
                        oText._minRow, oText._minCol = minRow, minCol
                        oText._maxRow, oText._maxCol = maxRow, maxCol
                        if DEBUG:
                            oText._domnd = ndText
                        loText.append(oText)

                traceln("    TableRegion  %d texts" % (len(loText)))

                if not loText:
                    continue

                # checking in turn each separator for table-consistency
                sp = ShapePartition(loText)

                for oSep in loSep:
                    (minx, miny, maxx, maxy) = oSep.bounds
                    if maxx - minx >= maxy - miny:
                        # supposed Horizontal
                        lAbove = sp.getObjectAboveLine(oSep)
                        if lAbove:
                            maxRowBefore = max(_o._maxRow for _o in lAbove)
                            lBelow = sp.getObjectBelowLine(oSep)
                            if lBelow:
                                minRowAfter = min(_o._minRow for _o in lBelow)
                                if maxRowBefore >= minRowAfter:
                                    oSep._bConsistent = False
                    else:
                        # supposed Vertical
                        l1 = sp.getObjectOnLeftOfLine(oSep)
                        if l1:
                            maxColBefore = max(_o._maxCol for _o in l1)
                            l2 = sp.getObjectOnRightOfLine(oSep)
                            if l2:
                                minColAfter = min(_o._minCol for _o in l2)
                                if maxColBefore >= minColAfter:
                                    oSep._bConsistent = False
                                    if DEBUG:
                                        # DEBUG
                                        for o in l1:
                                            if o._maxCol >= minColAfter:
                                                print("too much on right", etree.tostring(o._domnd))
                                        for o in l2:
                                            if o._minCol <= maxColBefore:
                                                print("too much on left", etree.tostring(o._domnd))
                # end of TableRegion
            # end of document
            for ndSep, oSep in zip(lNdSep, loSep):
                if oSep._bConsistent:
                    ndSep.set("DU_Sep", "S")
                    cntDocS += 1
                else:
                    ndSep.set("DU_Sep", "I")
                cntDoc += 1

        doc.write(sOutFilename, encoding='utf-8',pretty_print=True,xml_declaration=True)
        # the tiny constant avoids a division by zero when there is no separator
        traceln('%.2f%% consistent separators - annotation done for %s  --> %s' % (100*float(cntDocS)/(cntDoc+0.000001), sFilename, sOutFilename))

        del doc
        cnt, cntS = cnt+cntDoc, cntS+cntDocS
    traceln('%.2f%% consistent separators - annotation done for %d files' % (100*float(cntS)/(cnt+0.000001), cnt))
    def loadPageCol(self,
                    ndPage,
                    fRatio,
                    shaper_fun=ShapeLoader.node_to_Point,
                    funIndex=lambda x: x._du_index):
        """
        load the page, looking for Baseline

        Each Baseline node gets a @du_index attribute, and is turned into a
        shapely object by shaper_fun. The object is scaled by fRatio and
        carries _du_index, _du_nd (the Baseline node) and _dom_id (the id of
        the parent of the Baseline node).
        Baselines found under a TableCell come first; the other Baselines of
        the page are then appended, without being assigned to any column.

        return a list of shapely objects
             , a dict: col -> set of funIndex(object), for all cells
             , a dict: col -> set of funIndex(object), for the cells such
               that row + rowSpan > maxHeaderRowSpan
             , maxHeaderRowSpan, as returned by computeMaxRowSpan

        Unlike for rows, there is no check that a Baseline lies in the
        correct cell.
        """
        loBaseline = []  # list of Baseline shapes
        i = 0  # next du_index

        dsetTableByCol = defaultdict(set)  # sets of object ids, by col
        dsetTableDataByCol = defaultdict(set)  # sets of object ids, by col

        # ids of the parents of the Baseline seen in table cells
        setSeenParentId = set()

        lCells = MultiPageXml.getChildByName(ndPage, "TableCell")
        maxHeaderRowSpan = computeMaxRowSpan(lCells)
        traceln("   - maxHeaderRowSpan=", maxHeaderRowSpan)

        # first associate a unique id to each baseline and list them
        for ndCell in lCells:
            row, col = int(ndCell.get("row")), int(ndCell.get("col"))
            rowSpan = int(ndCell.get("rowSpan"))
            # The cell shape is not used for columns, but parsing it keeps
            # failing early on a cell with malformed coordinates
            ShapeLoader.node_to_Polygon(ndCell)

            for nd in MultiPageXml.getChildByName(ndCell, "Baseline"):
                nd.set("du_index", "%d" % i)
                ndParent = nd.getparent()
                setSeenParentId.add(ndParent.get('id'))

                # Baseline as a shapely object
                try:
                    o = shaper_fun(nd)  #make a LineString
                except Exception:
                    traceln("ERROR: id=", ndParent.get("id"))
                    raise
                # scale the objects, as done when cutting!!
                # useless currently since we make a Point...
                o = shapely.affinity.scale(o, xfact=fRatio, yfact=fRatio)

                o._du_index = i
                o._du_nd = nd
                o._dom_id = ndParent.get("id")
                loBaseline.append(o)

                # NO CHECK FOR COLUMNS
                dsetTableByCol[col].add(funIndex(o))

                if (row + rowSpan) > maxHeaderRowSpan:
                    dsetTableDataByCol[col].add(funIndex(o))

                i += 1

        # and (UGLY) process all Baseline outside any TableCell...
        for nd in MultiPageXml.getChildByName(ndPage, "Baseline"):
            ndParent = nd.getparent()
            if ndParent.get('id') in setSeenParentId:
                continue  # already loaded from its table cell

            nd.set("du_index", "%d" % i)

            # Baseline as a shapely object
            o = shaper_fun(nd)  #make a LineString

            # scale the objects, as done when cutting!!
            # NOTE(review): only xfact is given here, while table baselines
            # are scaled with both xfact and yfact - confirm this is intended
            o = shapely.affinity.scale(o, xfact=fRatio)

            o._du_index = i
            o._du_nd = nd
            o._dom_id = ndParent.get("id")
            loBaseline.append(o)

            i += 1

        return loBaseline, dsetTableByCol, dsetTableDataByCol, maxHeaderRowSpan
예제 #9
0
def createCellsFromZoneClusters(pageNd, dIds, lZones, lClusters, dEdges):
    """
    TOP DOWN: create a grid table and refine it.

    lZones: list of zone nodes, for the columns (sorted here from left to
            right, by centroid)
    lClusters: list of cluster NODES (sorted here from top to bottom, by
            centroid)

    Steps:
    1. build the cluster shapes from their text lines and intersect them
       with the zone shapes to get candidate cells
    2. assign the text lines to the cells, then reshape the cluster shapes
       (shakeRectangleShapes, ajustRectangleShapes) and redo the
       intersection and the assignment
    3. look for merged rows (checkClusteringPerRow); if new row clusters
       are proposed, rebuild the cluster shapes and text lines without the
       rows to be deleted, add the new ones, and redo the intersection and
       the assignment, including the text lines to be added
    4. compute the profile of the cells

    Return what computeProfile returns (dUpdateCells, dNewCells), and the
    dShapeCells of the last intersection.
    """

    def _textLinesPerCell(nbRows, dRowTL, dRowCells):
        # merge the cell -> text lines assignments of the first nbRows rows
        dText = {}  #key = cell.wkt
        for iRow in range(nbRows):
            dText.update(
                assignTextLinesToCells(dRowTL[iRow], dRowCells[iRow]))
        return dText

    lClusters = sorted(
        lClusters, key=lambda nd: ShapeLoader.node_to_Polygon(nd).centroid.y)
    lZones = sorted(
        lZones, key=lambda nd: ShapeLoader.node_to_Polygon(nd).centroid.x)
    nbClusters = len(lClusters)

    dClusterWithTL = getClusterTL(pageNd, lClusters, dIds)
    lZonePoly = [ShapeLoader.node_to_Polygon(nd) for nd in lZones]

    ## if one clusterpoly is contained in another: merge
    lClusterPoly = transformMinima2envelop(pageNd, dClusterWithTL)

    # first grid
    (dCells, dShapeCells, dCellCndPerRow,
     dCellCndPerCol) = createCellZoneFromIntersection(lClusterPoly, lZonePoly)
    dTextInCells = _textLinesPerCell(nbClusters, dClusterWithTL,
                                     dCellCndPerRow)

    # reshape the rows
    lClusterPoly = shakeRectangleShapes(pageNd, lClusterPoly, dCellCndPerRow,
                                        dTextInCells)
    lClusterPoly = ajustRectangleShapes(pageNd, lClusterPoly)

    # again recompute cell zones
    (dCells, dShapeCells, dCellCndPerRow,
     dCellCndPerCol) = createCellZoneFromIntersection(lClusterPoly, lZonePoly)
    dTextInCells = _textLinesPerCell(nbClusters, dClusterWithTL,
                                     dCellCndPerRow)

    # do it first ??
    #check if row is a merge
    (lRowtobeDel, lNewRowCluster, lNewClusterTL, _lNewClusterCC,
     lTLtoBeAdded) = checkClusteringPerRow(pageNd, dCellCndPerRow,
                                           dTextInCells, dClusterWithTL,
                                           dEdges)
    if lNewRowCluster:
        # keep the clusters which are not to be deleted...
        lKeptClusters = [
            nd for iCluster, nd in enumerate(lClusters)
            if iCluster not in lRowtobeDel
        ]
        lClusterPoly = [ShapeLoader.node_to_Polygon(nd) for nd in lKeptClusters]
        dClusterWithTL = getClusterTL(pageNd, lKeptClusters, dIds)
        # ... and add the new ones after them
        nbKept = len(lClusterPoly)
        for iNew, ltl in enumerate(lNewClusterTL):
            dClusterWithTL[nbKept + iNew] = ltl
        lClusterPoly.extend(lNewRowCluster)

        (dCells, dShapeCells, dCellCndPerRow,
         dCellCndPerCol) = createCellZoneFromIntersection(
             lClusterPoly, lZonePoly)
        nbRows = len(lClusterPoly)
        dTextInCells = _textLinesPerCell(nbRows, dClusterWithTL,
                                         dCellCndPerRow)

        ## populate elements which are not in the grid/cells : at cc level or at tl level ?
        for iRow in range(nbRows):
            dTextInCells.update(
                assignTextLinesToCells(lTLtoBeAdded, dCellCndPerRow[iRow]))
    ## oversegmentation : test links at cell level if compatible!see 0068

    #COMPUTE PROFILE
    dUpdateCells, dNewCells = computeProfile(dCells, dTextInCells,
                                             dCellCndPerRow, dCellCndPerCol)

    return dUpdateCells, dNewCells, dShapeCells
    def loadPage(self,
                 ndPage,
                 shaper_fun=ShapeLoader.node_to_Point,
                 funIndex=lambda x: x._du_index,
                 bIgnoreHeader=False):
        """
        load the page, looking for Baseline

        Each loaded Baseline node gets a @du_index attribute and is turned
        into a shapely object by shaper_fun. The object carries _du_index,
        _du_DU_row (the @DU_row of the parent of the Baseline, can be None),
        _du_nd (the Baseline node) and _dom_id (the @id of that parent).
        Baselines found under a TableCell come first; the other Baselines of
        the page are then appended, with _du_row set to None and without
        being assigned to any row.

        If bIgnoreHeader is True, Baselines whose parent has DU_header="CH"
        are not loaded.

        return a list of shapely objects
             , a dict: row -> set of funIndex(object)

        GT BUG: some Baseline are assigned to the wrong Cell
        => we also fix this here: a Baseline whose text box centroid is
        above the top of its cell is re-assigned to the row of a cell that
        vertically contains it, preferably in the same or closest column.
        """
        loBaseline = []  # list of Baseline shapes
        i = 0  # next du_index

        dsetTableByRow = defaultdict(set)  # sets of object ids, by row

        # ids of the parents of the Baseline seen in table cells
        setSeenParentId = set()
        lshapeCell = []
        lOrphanBaselineShape = []

        # first associate a unique id to each baseline and list them
        for ndCell in MultiPageXml.getChildByName(ndPage, "TableCell"):
            row, col = ndCell.get("row"), ndCell.get("col")
            plg = ShapeLoader.node_to_Polygon(ndCell)
            #ymin, ymax of polygon
            ly = [_y for _x, _y in plg.exterior.coords]
            ymin, ymax = min(ly), max(ly)
            plg._row = int(row)
            plg._col = int(col)
            plg._ymin, plg._ymax = ymin, ymax

            i0 = i
            for nd in MultiPageXml.getChildByName(ndCell, "Baseline"):
                nd.set("du_index", "%d" % i)
                ndParent = nd.getparent()
                setSeenParentId.add(ndParent.get('id'))
                if bIgnoreHeader and ndParent.get("DU_header") == "CH":
                    # NOTE: i is not incremented, so the next loaded
                    # Baseline gets the same @du_index as this ignored one
                    continue
                row_lbl = ndParent.get("DU_row")

                # Baseline as a shapely object
                try:
                    o = shaper_fun(nd)  #make a LineString
                except Exception:
                    traceln("ERROR: id=", ndParent.get("id"))
                    raise
                o._du_index = i
                o._du_DU_row = row_lbl  # can be None
                o._du_nd = nd
                o._dom_id = ndParent.get("id")
                loBaseline.append(o)

                # is this object in the correct cell???
                # We must use the centroid of the text box, otherwise a baseline
                # may be assigned to the next row
                y = ShapeLoader.node_to_Polygon(ndParent).centroid.y
                # we allow the content of a cell to overlap the cell lower border
                if ymin <= y:
                    dsetTableByRow[int(row)].add(funIndex(o))
                else:
                    # this is an orphan!
                    o._bad_cell = plg
                    lOrphanBaselineShape.append(o)

                i += 1

            if bIgnoreHeader and i0 == i:
                continue  # empty cells, certainly due to headers, ignore it.

            lshapeCell.append(plg)
            # end for

        if lOrphanBaselineShape:
            traceln(
                "    *** error: %d Baseline in incorrect row - fixing this..."
                % len(lOrphanBaselineShape))
            for o in lOrphanBaselineShape:
                bestrow, bestdeltacol = 0, 9999
                # a Point has a y, other shapes are reduced to their centroid
                try:
                    y = o.y
                except Exception:
                    y = o.centroid.y
                for plg in lshapeCell:
                    if plg._ymin <= y and y <= plg._ymax:
                        # sounds good
                        deltacol = abs(o._bad_cell._col - plg._col)
                        if deltacol == 0:
                            # same column, ok it is that one
                            bestrow = plg._row
                            break
                        if bestdeltacol > deltacol:
                            bestdeltacol = deltacol
                            bestrow = plg._row
                traceln("\t id=%s misplaced in row=%s instead of row=%s" %
                        (o._du_nd.getparent().get("id"), o._bad_cell._row,
                         bestrow))
                dsetTableByRow[bestrow].add(funIndex(o))
                del o._bad_cell

        # and (UGLY) process all Baseline outside any TableCell...
        for nd in MultiPageXml.getChildByName(ndPage, "Baseline"):
            # -> TextLine -> TableCell (possibly)
            ndPrnt = nd.getparent()
            if ndPrnt.get('id') in setSeenParentId:
                continue  # already seen in its table cell

            nd.set("du_index", "%d" % i)
            row_lbl = ndPrnt.get("DU_row")

            # Baseline as a shapely object
            o = shaper_fun(nd)  #make a LineString
            o._du_index = i
            o._du_row = None  # Must be None
            o._du_DU_row = row_lbl  # can be None
            o._du_nd = nd
            o._dom_id = ndPrnt.get("id")

            loBaseline.append(o)

            i += 1

        return loBaseline, dsetTableByRow
def project_Elt_to_GT(gtdoc, doc
                      , xpElement1, xpElement2
                      , xpArea2
                      , bSep, lsRmId, bEval
                      , fTH=0.5):
    """
    Here we take the element out of the production file to put them in the GT
    doc

    WE IGNORE xpArea1 (no need for it)

    gtdoc, doc : the GT and production documents; they must have the same
                 number of pages
    xpElement1 : XPath of the elements removed from the GT, unless it
                 equals xpArea2
    xpElement2 : XPath of the elements removed from the GT, and of the
                 production elements projected onto the GT
    xpArea2    : XPath, relative to a GT page, of the areas receiving the
                 projected elements
    bSep       : if True, the GT separators are replaced by the production
                 ones, page by page
    lsRmId     : list of ids of GT elements to remove beforehand
    bEval      : if True, check that each element lands in the area where
                 the GT had an element with the same id, and trace a score
    fTH        : an element goes to the area it overlaps most, provided the
                 overlap exceeds this ratio of the element area; otherwise
                 it goes to a new TextRegion appended to the GT page

    We return the GT doc (modified in place)
    Raise GTProjectionException if the number of pages differ, and
    ProjectException if no area was found in the GT.
    """
    def _traceFailed(ndElt):
        # report an element which is not where the GT had it
        try:
            traceln('FAILED:in table: id="%s" "%s"' % (ndElt.get("id"), ndElt.xpath(".//pg:Unicode", namespaces=dNS)[0].text))
        except IndexError:
            traceln('FAILED:in table: id="%s" NOTEXT"' % (ndElt.get("id")))

    gtroot = gtdoc.getroot()

    # Evaluation
    # we build a table of list of TextLineId from the GT to check this SW
    # table_id -> row -> col -> list of element id
    dTable = defaultdict(lambda : defaultdict(lambda : defaultdict(list)))
    nOk, nTot = 0, 0

    if lsRmId:
        nbEltRemoved = 0
        for sRmId in lsRmId:
            for _nd in gtroot.xpath('//*[@id="%s"]'%sRmId):
                _nd.getparent().remove(_nd)
                nbEltRemoved += 1
        trace(" (Rm by ID: %d elements removed)" % nbEltRemoved)

    # remove all elements of interest from GT
    # inside TableRegion, we have TextLine, outside we have TextRegion
    if xpElement1 != xpArea2:
        for ndElt in gtroot.xpath(xpElement1, namespaces=dNS):
            if bEval:
                # elements outside any area are recorded under None keys
                for ndElt2 in ndElt.xpath(xpElement2, namespaces=dNS):
                    dTable[None][None][None].append(ndElt2.get("id"))
            ndElt.getparent().remove(ndElt)
    for ndElt in gtroot.xpath(xpElement2, namespaces=dNS):
        ndCell = ndElt.getparent()
        if bEval:
            dTable[ndCell.getparent().get("id")][ndCell.get("row")][ndCell.get("col")].append(ndElt.get("id"))
        ndCell.remove(ndElt)
    if bEval:
        traceln("\npEvaluation mode")

    if bSep:
        nbSepRemoved, nbSepAdded = 0, 0
        for _nd in gtroot.xpath('//pg:SeparatorRegion', namespaces=dNS):
            _nd.getparent().remove(_nd)
            nbSepRemoved += 1
        trace(" (Separators: %d removed" % nbSepRemoved)

    # project the GT areas, page by page
    lNdPage   = doc.getroot().xpath("//pg:Page", namespaces=dNS)
    lNdPageGT =        gtroot.xpath("//pg:Page", namespaces=dNS)
    if len(lNdPage) != len(lNdPageGT):
        raise GTProjectionException("GT and input have different numbers of pages")
    assert len(lNdPage) > 0, "No page??"

    uniqID = 1
    nNdArea2 = 0
    for ndPage, ndPageGT in zip(lNdPage, lNdPageGT):
        lNdArea2 = ndPageGT.xpath(xpArea2, namespaces=dNS)
        loArea2 = [ShapeLoader.node_to_Polygon(nd) for nd in lNdArea2]
        nNdArea2 += len(lNdArea2)
        for ndElt in ndPage.xpath(xpElement2, namespaces=dNS):
            oElt = ShapeLoader.node_to_Polygon(ndElt)

            # area with the largest overlap, if any
            lOvrl = [oElt.intersection(o).area for o in loArea2]
            iMax = argmax(lOvrl) if lOvrl else None
            vMax = -1 if iMax is None else lOvrl[iMax]

            # where to add it?
            if vMax > 0 and vMax / oElt.area > fTH:
                # ok, this is a match
                ndCell = lNdArea2[iMax]
                # add it directly to the area2 (TableCell)
                ndCell.append(deepcopy(ndElt))
                if bEval:
                    if ndElt.get("id") in dTable[ndCell.getparent().get("id")][ndCell.get("row")][ndCell.get("col")]:
                        nOk += 1
                    else:
                        _traceFailed(ndElt)
            else:
                # add it outside of any area, in its own TextRegion
                ndTR = etree.Element("TextRegion")
                ndTR.set("id", "prjct_region_%d" % uniqID)
                uniqID += 1
                ndTR.set("custom", "")
                ndTR.append(deepcopy(ndElt.xpath("./pg:Coords", namespaces=dNS)[0]))
                ndTR.append(deepcopy(ndElt))
                ndPageGT.append(ndTR)
                if bEval:
                    if ndElt.get("id") in dTable[None][None][None]:
                        nOk += 1
                    else:
                        _traceFailed(ndElt)

            nTot += 1

        if bSep:
            # './/' restricts the search to this page: a leading '//' is
            # evaluated from the document root, which copied the separators
            # of ALL pages into each GT page of a multi-page document
            for _nd in ndPage.xpath('.//pg:SeparatorRegion', namespaces=dNS):
                ndPageGT.append(deepcopy(_nd))
                nbSepAdded += 1
    if bSep:
        trace(", %d added.)  " % nbSepAdded)

    if bEval:
        traceln("-"*40)
        trace(" - evaluation: %d ok out of %d = %.2f%%\n" % (nOk, nTot, 100*nOk / (nTot+0.0001)))

    # NOTE(review): the page count error above is a GTProjectionException,
    # this one a ProjectException - confirm both classes exist
    if nNdArea2 == 0:
        raise ProjectException("Empty GT")
    return gtdoc