def addEdgeToDoc(self, Y=None):
        """
        Add one "Edge" element per graph edge to the page node, for display.

        Each edge of self.lEdge becomes a 2-point "Edge" node whose "DU_type"
        attribute is the name of the edge class:
        - HorizontalEdge / VerticalEdge: from the centroid of A to the
          centroid of B
        - Edge_BL: from the centroid of A to the point of B's shape that is
          nearest to A's shape
        - any other edge: between the points of A's and B's shapes that are
          nearest to an anchor point on the line y=0; the anchors are spread
          over the page width, one per such edge.

        All nodes are appended to the first page yielded by
        self._iter_Page_DocNode(self.doc).

        Y is not used by this method.
        """
        # only the first page yielded by the iterator is considered
        (_pnum, _page, ndPage) = next(self._iter_Page_DocNode(self.doc))
        w = int(ndPage.get("imageWidth"))

        # 1 + number of "other" edges: step used to spread their anchors
        nn = 1 + sum(1 for e in self.lEdge
                     if type(e) not in (HorizontalEdge, VerticalEdge, Edge_BL))
        ii = 0
        for edge in self.lEdge:
            edge_type = type(edge)
            if edge_type in (HorizontalEdge, VerticalEdge):
                A, B = edge.A.shape.centroid, edge.B.shape.centroid
            elif edge_type is Edge_BL:
                A = edge.A.shape.centroid
                # B is the point of B's shape closest to A's shape
                _pt, B = shapely.ops.nearest_points(edge.A.shape, edge.B.shape)
            else:
                ii += 1
                x = 1 + ii * (w / nn)
                pt = geom.Point(x, 0)
                A, _ = shapely.ops.nearest_points(edge.A.shape, pt)
                B, _ = shapely.ops.nearest_points(edge.B.shape, pt)
            ndSep = MultiPageXml.createPageXmlNode("Edge")
            ndSep.set("DU_type", edge_type.__name__)
            ndPage.append(ndSep)
            MultiPageXml.setPoints(ndSep, [(A.x, A.y), (B.x, B.y)])
        return
예제 #2
0
def add_cluster_to_dom(root, llX):
    """
    Cluster the TextLine elements of each page based on the vertical cuts.

    root  DOM root; its 'Page' nodes are paired, in order, with the items
          of llX (extra pages or extra items are ignored by zip)
    llX   list, per page, of the list of abscissas of the vertical cuts

    For each page:
    - the page width is appended to the page's cut list, which is then
      sorted; both are done IN PLACE, so the caller's lists are modified
    - each TextLine is assigned to the first cut index i such that the centre
      abscissa of its fitted rectangle is <= cut[i], and gets a "DU_cluster"
      attribute holding i. A TextLine lying to the right of every cut
      (i.e. beyond the page width) goes to the last cluster.
    - TextLine whose points cannot be turned into a rectangle
      (ZeroDivisionError or ValueError) are left unclustered
    - the clusters are added to the DOM with addClusterToDom, and each
      cluster node gets a "cut_X" attribute holding the abscissa of its cut.
    """

    # hack to use addClusterToDom, which is given objects with a .node
    class MyBlock:
        def __init__(self, nd):
            self.node = nd

    for lX, ndPage in zip(llX, MultiPageXml.getChildByName(root, 'Page')):
        w = int(ndPage.get("imageWidth"))

        lX.append(w)
        lX.sort()
        imax = len(lX)  # >= 1 since the page width was appended
        # cluster index -> list of TextLine indices
        dCluster = {i: [] for i in range(imax)}

        lndTextline = MultiPageXml.getChildByName(ndPage, 'TextLine')

        o = GraphBinaryConjugateSegmenter()
        o.lNode = [MyBlock(nd) for nd in lndTextline]

        for iNd, ndTextline in enumerate(lndTextline):
            sPoints = MultiPageXml.getChildByName(ndTextline,
                                                  'Coords')[0].get('points')
            try:
                x1, _y1, x2, _y2 = Polygon.parsePoints(sPoints).fitRectangle()
            except (ZeroDivisionError, ValueError):
                # unusable coordinates: skip this TextLine
                continue
            xm = (x1 + x2) / 2.0
            # first cut at or to the right of the centre; imax itself is not
            # a valid cluster index, so fall back on the last cluster
            i = next((j for j, xj in enumerate(lX) if xm <= xj), imax - 1)
            dCluster[i].append(iNd)
            ndTextline.set("DU_cluster", str(i))

        # add clusters
        lNdCluster = o.addClusterToDom(dCluster,
                                       bMoveContent=False,
                                       sAlgo="cut",
                                       pageNode=ndPage)

        # add a cut_X attribute to the clusters
        for ndCluster in lNdCluster:
            i = int(ndCluster.get('name'))
            ndCluster.set("cut_X", str(lX[i]))
    def convertDoc(self, sFilename):
        """
        Convert the annotation of each page of a file and save the result.

        sFilename must end with self.sXml_HumanAnnotation_Extension
        (AssertionError otherwise). The document is parsed, each page is
        passed to self._convertPageAnnotation, the metadata are set, and the
        document is written next to the input, with the human-annotation
        extension replaced by self.sXml_MachineAnnotation_Extension.

        Return the name of the written file.
        """
        assert sFilename.endswith(self.sXml_HumanAnnotation_Extension)

        g = Graph_MultiPageXml()

        # etree.parse() takes no 'encoding' keyword: the encoding override
        # is a parser option
        doc = etree.parse(sFilename, etree.XMLParser(encoding='utf-8'))

        # the Heigh/Ho annotation runs over consecutive pages, so we keep
        # those values across pages
        self._initSegmentationLabel()
        self.lSeenResoNum = list()

        for pnum, page, domNdPage in g._iter_Page_DocNode(doc):
            self._convertPageAnnotation(pnum, page, domNdPage)

        MultiPageXml.setMetadata(doc, None, self.sMetadata_Creator, self.sMetadata_Comments)

        nExt = len(self.sXml_HumanAnnotation_Extension)
        sDUFilename = sFilename[:-nExt] + self.sXml_MachineAnnotation_Extension
        doc.write(sDUFilename,
                  xml_declaration=True,
                  encoding="utf-8",
                  pretty_print=True)

        return sDUFilename
def main(lsFilename, lsOutFilename):
    """
    Annotate the TextLine of the table cells and regenerate the separators.

    lsFilename     list of input file names
    lsOutFilename  list of output file names, paired with lsFilename

    For each file: every TextLine of every TableCell first gets the last
    label of each label list (row, column, header) as default, then the
    cells are tagged by tag_DU_row_col_header, the existing separators are
    removed, new ones are added, and the document is written.
    Files without any TableCell are skipped; files for which a
    TableAnnotationException is raised are reported and not written.
    """
    # removing the blank text lets the pretty printer format better
    parser = etree.XMLParser(remove_blank_text=True)
    for sFilename, sOutFilename in zip(lsFilename, lsOutFilename):
        doc = etree.parse(sFilename, parser)
        root = doc.getroot()

        lCells = MultiPageXml.getChildByName(root, 'TableCell')
        if not lCells:
            traceln("ERROR: no TableCell - SKIPPING THIS FILE!!!")
            continue

        # default label for all TextLine: all of them must have all tags!
        for cell in lCells:
            for ndText in MultiPageXml.getChildByName(cell, 'TextLine'):
                ndText.set(sDURow, lLabelsBIESO_R[-1])
                ndText.set(sDUCol, lLabelsSM_C[-1])
                ndText.set(sDUHeader, lLabels_HEADER[-1])

        # NOTE: Oct' 2018 RV and JL decided that we keep the TextLine of the
        # "binding" cells (if any!), so no cell is discarded here.

        # FOR COLUMN HEADER: get max(cell[0,i].span)
        maxRowSpan = computeMaxRowSpan(lCells)

        tag_DU_row_col_header(root, lCells, maxRowSpan)

        try:
            removeSeparator(root)
            addSeparator(root, lCells)
            doc.write(sOutFilename, encoding='utf-8', pretty_print=True, xml_declaration=True)
            traceln('annotation done for %s  --> %s' % (sFilename, sOutFilename))
        except TableAnnotationException:
            traceln("No Table region in file ", sFilename, "  IGNORED!!")
예제 #5
0
 def storeMultiPageXml(self,lListDocs,outputFileName=None):
     """
     Write a multi-page PageXml file.

     lListDocs       sequence of pairs; the first item of each pair is given
                     to makeMultiPageXmlMemory
     outputFileName  name of the file to write. When None, the file goes to
                     the "col" sibling of the directory of
                     self.inputFileName, and is named after the input file:
                     its base name minus its last 7 characters, plus
                     "_du.mpxml".
     """
     from xml_formats.PageXml import MultiPageXml
     mp = MultiPageXml()
     newDoc = mp.makeMultiPageXmlMemory([tDoc[0] for tDoc in lListDocs])
     if outputFileName is None:
         sDir = os.path.dirname(self.inputFileName)
         sBase = os.path.basename(self.inputFileName)[:-7]
         # os.path.join keeps the path relative when sDir is empty
         outputFileName = os.path.join(sDir, "..", "col", sBase + "_du.mpxml")
     newDoc.write(outputFileName, encoding="UTF-8",pretty_print=True,xml_declaration=True)
 def remove_cuts_from_dom(self, root):
     """
     Remove every 'CutSeparator' node found under root.

     Return the number of removed cut lines.
     """
     lndCut = MultiPageXml.getChildByName(root, 'CutSeparator')
     nRemoved = len(lndCut)
     for ndCut in lndCut:
         ndParent = ndCut.getparent()
         ndParent.remove(ndCut)
     # sanity check: no cut must remain
     assert len(MultiPageXml.getChildByName(root, 'CutSeparator')) == 0
     return nRemoved
예제 #7
0
    def add_grid_to_DOM(self, root, ltlHlV=None):
        """
        Add the grid lines to the DOM, as 'GridSeparator' nodes.

        root    DOM root, which is modified
        ltlHlV  optional list, per page, of pairs (lHi, lVi) of groundtruth
                information for the horizontal and vertical grid lines.
                When given (and non-empty for the page), each separator gets
                a "type" attribute computed by self.getLabel.

        Each separator gets an "orient" attribute ("0" when wider than high,
        "90" otherwise), a unique "id" and a 'Coords' child.

        Return the number of grid lines created.
        """
        domid = 0  # to add unique separator id and count them

        def addPageXmlSeparator(nd, i, lGTi, x1, y1, x2, y2, domid):
            ndSep = MultiPageXml.createPageXmlNode("GridSeparator")
            if lGTi:
                # propagate the groundtruth info we have
                sLabel = self.getLabel(i, lGTi)
                ndSep.set("type", sLabel)
            if abs(x2 - x1) > abs(y2 - y1):
                ndSep.set("orient", "0")
            else:
                ndSep.set("orient", "90")
            ndSep.set("id", "s_%d" % domid)
            nd.append(ndSep)
            ndCoord = MultiPageXml.createPageXmlNode("Coords")
            MultiPageXml.setPoints(ndCoord, [(x1, y1), (x2, y2)])
            ndSep.append(ndCoord)
            return ndSep

        for iPage, ndPage in enumerate(
                MultiPageXml.getChildByName(root, 'Page')):
            # no groundtruth at all (None), or none for this page
            lHi, lVi = [], []
            if ltlHlV is not None:
                try:
                    lHi, lVi = ltlHlV[iPage]
                except IndexError:
                    pass

            w, h = int(ndPage.get("imageWidth")), int(
                ndPage.get("imageHeight"))

            # NOTE(review): the TableRegion is looked up from root, not from
            # ndPage, so every page uses the first TableRegion of the
            # document -- confirm this is intended for multi-page documents
            ndTR = MultiPageXml.getChildByName(root, 'TableRegion')[0]

            # vertical grid lines
            for i, (x1, y1, x2,
                    y2) in enumerate(self.iterGridVerticalLines(w, h)):
                domid += 1
                addPageXmlSeparator(ndTR, i, lVi, x1, y1, x2, y2, domid)

            # horizontal grid lines
            for i, (x1, y1, x2,
                    y2) in enumerate(self.iterGridHorizontalLines(w, h)):
                domid += 1
                addPageXmlSeparator(ndTR, i, lHi, x1, y1, x2, y2, domid)

        return domid
예제 #8
0
def test_DS2PageXmlConversion():
    """
    Convert a DS XML test file to PageXml documents, and write them as one
    multi-page PageXml file in the same test directory.
    """
    sInputFile = os.path.join(sTESTS_DIR,
                              'testDS2PageXml/RRB_MM_01_033_Jahr_1810.ds.xml')
    sOutputFile = os.path.join(sTESTS_DIR,
                               "testDS2PageXml/RRB_MM_01_033_Jahr_1810.mpxml")

    conv = DS2PageXMLConvertor()
    conv.inputFileName = sInputFile
    lPageXmlDocs = conv.run(conv.loadDom(sInputFile))

    # conv.run returns pairs: keep the first item of each
    lDoc = [pageDoc for pageDoc, _other in lPageXmlDocs]
    newDoc = MultiPageXml().makeMultiPageXmlMemory(lDoc)
    newDoc.write(sOutputFile,
                 xml_declaration=True,
                 encoding="UTF-8",
                 pretty_print=True)
예제 #9
0
    def run(self):
        """
        Load the DOM, convert its table cells and set the metadata.

        The metadata are set with PageXml.setMetadata; if that raises a
        ValueError, MultiPageXml.setMetadata is used instead.

        Return the DOM.
        """
        docdom = self.loadDom()
        self.convertTableCells(docdom)

        sCreator = 'NLE'
        sComments = 'TableCell 2 TextRegion'
        try:
            PageXml.setMetadata(docdom, None, sCreator, Comments=sComments)
        except ValueError:
            MultiPageXml.setMetadata(docdom, None, sCreator, Comments=sComments)
        return docdom
 def addPageXmlSeparator(cls, ndPage, oCut, domid):
     """
     Append to ndPage a 'CutSeparator' node describing the cut oCut.

     The node carries the label, angle (in degrees), angle frequencies and
     set support of the cut as DU_* attributes, an "orient" of "0", the id
     "cs_<domid>", and a 'Coords' child holding the cut coordinates.

     Return the created node.
     """
     ndCut = MultiPageXml.createPageXmlNode("CutSeparator")
     # propagate the groundtruth info we have
     ndCut.set("DU_type", oCut._du_label)
     ndCut.set("orient", "0")
     fAngleDeg = math.degrees(oCut._du_angle)
     ndCut.set("DU_angle", "%.1f" % fAngleDeg)
     ndCut.set("DU_angle_freq", "%.3f" % oCut._du_angle_freq)
     ndCut.set("DU_angle_cumul_freq", "%.3f" % oCut._du_angle_cumfreq)
     ndCut.set("DU_set_support", "%s" % oCut._du_set_support)
     ndCut.set("id", "cs_%d" % domid)
     ndPage.append(ndCut)

     ndCutCoords = MultiPageXml.createPageXmlNode("Coords")
     MultiPageXml.setPoints(ndCutCoords, oCut.coords)
     ndCut.append(ndCutCoords)
     return ndCut
 def addPageXmlSeparator(cls, nd, sLabel, x1, y1, x2, y2, domid):
     """
     Append to nd a 'CutSeparator' node for the segment (x1,y1)-(x2,y2).

     The node gets a "type" attribute when sLabel is not None, an "orient"
     attribute ("0" when the segment is wider than high, "90" otherwise),
     the id "s_<domid>" and a 'Coords' child holding the two points.

     Return the created node.
     """
     ndCut = MultiPageXml.createPageXmlNode("CutSeparator")
     if sLabel is not None:
         # propagate the groundtruth info we have
         ndCut.set("type", sLabel)
     bWiderThanHigh = abs(x2 - x1) > abs(y2 - y1)
     ndCut.set("orient", "0" if bWiderThanHigh else "90")
     ndCut.set("id", "s_%d" % domid)
     nd.append(ndCut)

     ndCutCoords = MultiPageXml.createPageXmlNode("Coords")
     MultiPageXml.setPoints(ndCutCoords, [(x1, y1), (x2, y2)])
     ndCut.append(ndCutCoords)
     return ndCut
 def addEdgeToDOM(self):
     """
     Add the Edge elements to the DOM (done by the parent class), then
     overwrite the points of the Baseline of each "row" block with the
     coordinates of the shape computed by self.shaper_fun, so that the new
     baseline representation is the one shown.

     Every block of self.lNode must be of type "row" or "sepH"
     (AssertionError otherwise).
     """
     super().addEdgeToDOM()

     for blk in self.lNode:
         sTypeName = blk.type.name
         assert sTypeName in ["row", "sepH"], sTypeName

         if sTypeName != "row":
             continue
         # first Baseline descendant of the block's DOM node
         lndBaseline = blk.node.xpath(".//pc:Baseline", namespaces=self.dNS)
         ndBaseline = lndBaseline[0]
         oShape = self.shaper_fun(ndBaseline)
         MultiPageXml.setPoints(ndBaseline, list(oShape.coords))

     return
def op_gt_recall(lsFilename,
                 bCutAbove,
                 lDegAngle,
                 fMinHorizProjection=0.05,
                 fCutHeight=25):
    """
    Count, over all pages of all given files, the labels of the
    almost-horizontal cuts found by a SkewedCutAnnotator, and trace the
    count per label.

    lsFilename  list of file names to load
    bCutAbove   passed to SkewedCutAnnotator
    lDegAngle   list of angles in degrees, converted to radians for the
                annotator
    fCutHeight  passed to findHCut
    fMinHorizProjection is not used here.
    """
    cLabelCount = Counter()
    for sFilename in lsFilename:
        traceln("- loading GT: %s" % sFilename)

        # removing the blank text lets the pretty printer format better
        parser = etree.XMLParser(remove_blank_text=True)
        root = etree.parse(sFilename, parser).getroot()

        lRadAngle = [math.radians(fDeg) for fDeg in lDegAngle]
        doer = SkewedCutAnnotator(bCutAbove, lAngle=lRadAngle)

        lndPage = MultiPageXml.getChildByName(root, 'Page')
        for pnum, ndPage in enumerate(lndPage, 1):
            traceln(" --- page %s - constructing separator candidates" % pnum)

            # load the page objects and the GT partition (defined by the
            # table) if any
            loBaseline, dsetTableByRow = doer.loadPage(ndPage)
            traceln(" - found %d objects on page" % (len(loBaseline)))

            # find almost-horizontal cuts and tag them if GT is available
            loHCut = doer.findHCut(ndPage, loBaseline, dsetTableByRow,
                                   fCutHeight)
            for oCut in loHCut:
                cLabelCount[oCut._du_label] += 1

    lsCount = ["%s:%d" % (sLabel, cLabelCount[sLabel])
               for sLabel in sorted(cLabelCount.keys())]
    traceln("GT: ALL CUT Label count:  ", "  ".join(lsCount))
예제 #14
0
 def addPageXmlSeparator(nd, i, lGTi, x1, y1, x2, y2, domid):
     """
     Append to nd a 'GridSeparator' node for the segment (x1,y1)-(x2,y2).

     When lGTi is not empty, the node gets a "type" attribute computed by
     self.getLabel(i, lGTi) ('self' comes from the enclosing scope).
     The node also gets an "orient" attribute ("0" when the segment is
     wider than high, "90" otherwise), the id "s_<domid>" and a 'Coords'
     child holding the two points.

     Return the created node.
     """
     ndGrid = MultiPageXml.createPageXmlNode("GridSeparator")
     if lGTi:
         # propagate the groundtruth info we have
         ndGrid.set("type", self.getLabel(i, lGTi))
     bWiderThanHigh = abs(x2 - x1) > abs(y2 - y1)
     ndGrid.set("orient", "0" if bWiderThanHigh else "90")
     ndGrid.set("id", "s_%d" % domid)
     nd.append(ndGrid)

     ndGridCoords = MultiPageXml.createPageXmlNode("Coords")
     MultiPageXml.setPoints(ndGridCoords, [(x1, y1), (x2, y2)])
     ndGrid.append(ndGridCoords)
     return ndGrid
def addSeparator(root, lCells):
    """
    Add the separators that correspond to the cell boundaries.

    root    DOM root, which is modified
    lCells  list of cells, given to getCellsSeparators

    One 'SeparatorRegion' node per row and per column separator is appended
    to the first TableRegion of the DOM, with its angle in degrees in the
    "orient" attribute, its row or col index, and a 'Coords' child.
    A comment giving the statistics of the angles (average, standard
    deviation, min, max) is appended for the horizontal and for the
    vertical separators, and traced.

    Raise TableAnnotationException if there is no TableRegion.
    """
    def _angleDeg(fNum, fDen):
        # atan(fNum / fDen) in degrees. A null denominator gives +/-90
        # (0 for a segment reduced to a point) instead of a
        # ZeroDivisionError.
        if fDen == 0:
            return math.copysign(90.0, fNum) if fNum else 0.0
        return math.degrees(math.atan(float(fNum) / fDen))

    def _addSepNode(sOrient, sIndexName, index, tXY1, tXY2):
        # append one SeparatorRegion, with its Coords, to the table region
        ndSep = MultiPageXml.createPageXmlNode("SeparatorRegion")
        ndSep.set("orient", sOrient)
        ndSep.set(sIndexName, "%d" % index)
        ndTR.append(ndSep)
        ndCoord = MultiPageXml.createPageXmlNode("Coords")
        MultiPageXml.setPoints(ndCoord, [tXY1, tXY2])
        ndSep.append(ndCoord)

    def _addStat(sName, lAngle):
        # append and trace the statistics of a list of angles, which may be
        # empty
        if lAngle:
            sStat = "\t%s: Average=%.1f°  stdev=%.2f°  min=%.1f° max=%.1f°" % (
                sName, np.average(lAngle), np.std(lAngle),
                min(lAngle), max(lAngle))
        else:
            sStat = "\t%s: no separator" % sName
        ndTR.append(etree.Comment(sStat))
        traceln(sStat)

    dRow, dCol = getCellsSeparators(lCells)

    try:
        ndTR = MultiPageXml.getChildByName(root,'TableRegion')[0]
    except IndexError:
        raise TableAnnotationException("No TableRegion!!! ")

    lB = []
    for row in sorted(dRow.keys()):
        (x1, y1), (x2, y2) = dRow[row]
        b = _angleDeg(y2 - y1, x2 - x1)
        lB.append(b)
        _addSepNode("horizontal angle=%.2f" % b, "row", row,
                    (x1, y1), (x2, y2))
    _addStat("HORIZONTAL", lB)

    lB = []
    for col in sorted(dCol.keys()):
        (x1, y1), (x2, y2) = dCol[col]
        b = 90 - _angleDeg(x2 - x1, y2 - y1)
        lB.append(b)
        _addSepNode("vertical %.2f" % b, "col", col,
                    (x1, y1), (x2, y2))
    _addStat("VERTICAL  ", lB)

    return
예제 #16
0
    def remove_grid_from_dom(self, root):
        """
        Clean the DOM from any existing grid (useful to choose at run-time
        the grid increment, called step).

        All the 'GridSeparator' nodes found under root are removed at once:
        the lookup is done from root, so repeating it per page would only
        reset the count to 0 after the first page.

        Return the number of removed grid lines (0 if there was none).
        """
        lnd = MultiPageXml.getChildByName(root, 'GridSeparator')
        n = len(lnd)
        for nd in lnd:
            nd.getparent().remove(nd)
        # check...
        lnd = MultiPageXml.getChildByName(root, 'GridSeparator')
        assert len(lnd) == 0
        return n
def isBaselineHorizontal(ndText):
    """
    Tell if the first Baseline found under ndText is at least as wide as
    it is high, based on the bounds of its line shape.

    Return True as well when there is no Baseline, or when the Baseline
    cannot be loaded as a line.
    """
    lNdBaseline = MultiPageXml.getChildByName(ndText ,'Baseline')
    if not lNdBaseline:
        return True
    try:
        o = ShapeLoader.node_to_LineString(lNdBaseline[0])
    except Exception:
        # unusable baseline: assume horizontal, but let KeyboardInterrupt
        # and SystemExit go through
        return True
    (minx, miny, maxx, maxy) = o.bounds
    return bool(maxx-minx >= maxy-miny)
    def get_separator_YX_from_DOM(self, root, fMinPageCoverage):
        """
        Get the ordinates and abscissas of the GT table separators.

        A 'SeparatorRegion' wider than high is a horizontal line, kept if
        its width exceeds fMinPageCoverage times the page width; its (y1, y2)
        is then recorded. Any other one is kept if its height exceeds
        fMinPageCoverage times the page height; its (x1, x2) is then
        recorded. A warning is traced for any page that does not have
        exactly one TableRegion.

        Return one pair of lists per page: [(y_list, x_list), ...]
        """
        ltlYlX = []
        for ndPage in MultiPageXml.getChildByName(root, 'Page'):
            w = int(ndPage.get("imageWidth"))
            h = int(ndPage.get("imageHeight"))

            lYi, lXi = [], []

            lndTR = MultiPageXml.getChildByName(ndPage, 'TableRegion')
            nTR = len(lndTR)
            if nTR == 0:
                traceln("** warning ** no TableRegion, expected 1")
            elif nTR != 1:
                traceln(
                    "** warning ** %d TableRegion instead of expected 1" %
                    nTR)

            for ndTR in lndTR:
                # enumerate the table separators
                for ndSep in MultiPageXml.getChildByName(
                        ndTR, 'SeparatorRegion'):
                    ndCoords = MultiPageXml.getChildByName(ndSep, 'Coords')[0]
                    oPolygon = Polygon.parsePoints(ndCoords.get('points'))
                    (x1, y1), (x2, y2) = oPolygon.lXY

                    dx, dy = abs(x2 - x1), abs(y2 - y1)
                    if dx > dy:
                        # horizontal table line
                        if dx > (fMinPageCoverage * w):
                            lYi.append((y1, y2))
                    elif dy > (fMinPageCoverage * h):
                        lXi.append((x1, x2))
            ltlYlX.append((lYi, lXi))

        return ltlYlX
예제 #19
0
    def get_grid_GT_index_from_DOM(self, root, fMinPageCoverage):
        """
        Get the index, in our grid, of the GT table lines.

        A 'SeparatorRegion' wider than high is a horizontal line, kept if
        its width exceeds fMinPageCoverage times the page width; its mean
        ordinate is then snapped to the grid with self.iGridVertiStep.
        Any other one is kept if its height exceeds fMinPageCoverage times
        the page height; its mean abscissa is then snapped to the grid with
        self.iGridHorizStep.
        A page must not have more than one TableRegion (AssertionError).

        Return one pair of index lists per page: [(h_list, v_list), ...]
        """
        ltlHlV = []
        for ndPage in MultiPageXml.getChildByName(root, 'Page'):
            w = int(ndPage.get("imageWidth"))
            h = int(ndPage.get("imageHeight"))

            lHi, lVi = [], []

            lndTR = MultiPageXml.getChildByName(ndPage, 'TableRegion')
            if not lndTR:
                ltlHlV.append((lHi, lVi))
                continue
            assert len(lndTR) == 1, "More than 1 TableRegion??"

            # enumerate the table separators
            for ndSep in MultiPageXml.getChildByName(lndTR[0],
                                                     'SeparatorRegion'):
                ndCoords = MultiPageXml.getChildByName(ndSep, 'Coords')[0]
                oPolygon = Polygon.parsePoints(ndCoords.get('points'))
                (x1, y1), (x2, y2) = oPolygon.lXY

                dx, dy = abs(x2 - x1), abs(y2 - y1)
                if dx > dy:
                    # horizontal table line
                    if dx > (fMinPageCoverage * w):
                        ym = (y1 + y2) / 2.0  # 2.0 to support python2
                        lHi.append(
                            self.snapToGridIndex(ym, self.iGridVertiStep))
                elif dy > (fMinPageCoverage * h):
                    xm = (x1 + x2) / 2.0
                    lVi.append(
                        self.snapToGridIndex(xm, self.iGridHorizStep))
            ltlHlV.append((lHi, lVi))

        return ltlHlV
 def getDomBaselineXY(cls, domNode):
     """
     Find the first Baseline under domNode and return the (x, y) computed
     for it by cls.getPolylineAverageXY.

     If there is no Baseline, a warning is traced and the IndexError is
     re-raised.
     """
     lndBaseline = MultiPageXml.getChildByName(domNode, 'Baseline')
     try:
         ndBaseline = lndBaseline[0]
     except IndexError:
         traceln("WARNING:  No Baseline child in ", domNode.get('id'))
         raise
     # modulo should be done only after the GT assigns labels.
     xAvg, yAvg = cls.getPolylineAverageXY(ndBaseline)
     return (xAvg, yAvg)
예제 #21
0
 def _shapeFromNodePoints(cls, nd, ShapeClass):
     """
     Build a shape of the given class from the points of a node.

     The points are read from the 'points' attribute of the node itself
     or, if it has none, from the one of its first Coords child
      e.g. <SeparatorRegion orient="horizontal 373.8 0.006" row="1">
             <Coords points="3324,392 3638,394"/>
           </SeparatorRegion>
     Return the shape built by cls._shapeFromPoints.
     """
     sOwnPoints = nd.get('points')
     if sOwnPoints is not None:
         return cls._shapeFromPoints(sOwnPoints, ShapeClass)
     ndCoords = MultiPageXml.getChildByName(nd, 'Coords')[0]
     return cls._shapeFromPoints(ndCoords.get('points'), ShapeClass)
예제 #22
0
    def evalClusterByRow(self, sFilename):
        """
        Evaluate the quality of the partitioning by table row, by comparing
        the GT table information to the partition done automatically (thanks
        to the separators added to the DOM).

        sFilename  name of the XML file, parsed into self.doc

        As visible here, the method only builds, for each page, the GT
        partitions of the blocks by table row and by table column, each one
        sorted by block position; nothing is returned.
        """
        self.doc = etree.parse(sFilename)

        lClassicType = [nt for nt in self.getNodeTypeList()
                        if nt in self._lClassicNodeType]

        # load the block nodes per page
        for (pnum, page, domNdPage) in self._iter_Page_DomNode(self.doc):
            # now that we have the page, let's create the node for each type!
            lClassicPageNode = [nd for nodeType in lClassicType
                                for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page)]

            # -- GT ---------------------------------------------
            # partition by columns and rows
            dGTByRow = collections.defaultdict(list)
            dGTByCol = collections.defaultdict(list)

            for blk in lClassicPageNode:
                # NOTE(review): blk is passed as is to getAncestorByName;
                # confirm it should not be the DOM node of the block
                cell = MultiPageXml.getAncestorByName(blk, 'TableCell')[0]
                row, col, _rowSpan, _colSpan = [int(cell.get(sProp)) for sProp
                                                in ["row", "col", "rowSpan", "colSpan"]]
                # TODO: deal with span
                dGTByRow[row].append(blk)
                # the block, not the column index: the lists are sorted on
                # the position of the blocks below
                dGTByCol[col].append(blk)

            # blocks of a row from left to right, of a column from top to
            # bottom
            for l in dGTByRow.values():
                l.sort(key=lambda o: (o.x1, o.y1))
            for l in dGTByCol.values():
                l.sort(key=lambda o: (o.y1, o.x1))
def getDocSeparators(sFilename):
    """
    Parse the file and compute the separators of its table cells.

    Return the two dictionaries computed by getCellsSeparators
        row -> list of (x1, y1, x2, y2)
        col -> list of (x1, y1, x2, y2)
    Raise DocSeparatorException if the file contains no TableCell.
    """
    root = etree.parse(sFilename, etree.XMLParser()).getroot()
    lCell = MultiPageXml.getChildByName(root, 'TableCell')
    if lCell:
        dRowSep, dColSep = getCellsSeparators(lCell)
        return dRowSep, dColSep
    raise DocSeparatorException("No TableCell element in %s" %sFilename)
예제 #24
0
    def storeMPXML(self, lFiles):
        """
        Store the files of lFiles as one multi-page XML file.

        A 'CornerPts' child, with content "0 1 2 3", is added to each
        TableCell before saving.
        The file is named <coldir>/col/<docid>.mpxml

        Return the document and the name of the written file.
        """
        sColDir = self.coldir + os.sep + 'col'
        sMPXML = os.path.join(sColDir, self.docid) + ".mpxml"

        doc = MultiPageXml.makeMultiPageXml(lFiles)

        # add one corner node per table cell
        for ndCell in PageXml.getChildByName(doc.getRootElement(), 'TableCell'):
            ndCorner = libxml2.newNode('CornerPts')
            ndCorner.setContent("0 1 2 3")
            ndCorner.setNs(doc.getRootElement().ns())
            ndCell.addChild(ndCorner)

        doc.saveFormatFileEnc(sMPXML, "UTF-8", True)

        return doc, sMPXML
예제 #25
0
def defineTableBordersFromCells(table):
    """
    Table points are too loose: redefine them with the cell regions.

    The cells of the first and last column, and of the first and last row,
    are given to getTBLRBorders, whose result is indexed in the order
    Top, Right, Bottom, Left.

    Return (left border of the first column, right border of the last
    column, top border of the first row, bottom border of the last row).
    Raise ValueError if the table has no TableCell (max of an empty
    sequence).
    """
    lCells = MultiPageXml.getChildByName(table, 'TableCell')
    # parse the indices once, and compare them as numbers
    lRowColCell = [(int(x.get('row')), int(x.get('col')), x) for x in lCells]
    maxRow = max(row for row, _col, _x in lRowColCell)
    maxCol = max(col for _row, col, _x in lRowColCell)

    # real lists, not one-shot filter iterators, so that the helper can
    # traverse them as many times as it needs
    col1 = [x for _row, col, x in lRowColCell if col == 0]
    colN = [x for _row, col, x in lRowColCell if col == maxCol]
    row1 = [x for row, _col, x in lRowColCell if row == 0]
    rowN = [x for row, _col, x in lRowColCell if row == maxRow]

    # T R B L
    lColSep_1 = getTBLRBorders(col1)[3]
    lColSep_N = getTBLRBorders(colN)[1]
    lRowSep_1 = getTBLRBorders(row1)[0]
    lRowSep_N = getTBLRBorders(rowN)[2]

    return lColSep_1, lColSep_N, lRowSep_1, lRowSep_N
def op_cut(sFilename,
           sOutFilename,
           lDegAngle,
           bCutAbove,
           fMinHorizProjection=0.05,
           fCutHeight=25):
    """
    Add to each page of a file the almost-horizontal cuts found by a
    SkewedCutAnnotator, and write the result.

    sFilename     name of the file to load
    sOutFilename  name of the file to write
    lDegAngle     list of angles in degrees, converted to radians for the
                  annotator
    bCutAbove     passed to SkewedCutAnnotator
    fCutHeight    passed to findHCut
    fMinHorizProjection is not used here.

    Any pre-existing cut of a page is removed before the new ones are added.
    """
    # removing the blank text lets the pretty printer format better
    parser = etree.XMLParser(remove_blank_text=True)
    doc = etree.parse(sFilename, parser)
    root = doc.getroot()

    lRadAngle = [math.radians(fDeg) for fDeg in lDegAngle]
    doer = SkewedCutAnnotator(bCutAbove, lAngle=lRadAngle)

    domid = 0
    lndPage = MultiPageXml.getChildByName(root, 'Page')
    for pnum, ndPage in enumerate(lndPage, 1):
        traceln(" --- page %s - constructing separator candidates" % pnum)

        # load the page objects and the GT partition (defined by the table)
        # if any
        loBaseline, dsetTableByRow = doer.loadPage(ndPage)
        traceln(" - found %d objects on page" % (len(loBaseline)))

        # find almost-horizontal cuts and tag them if GT is available
        loHCut = doer.findHCut(ndPage, loBaseline, dsetTableByRow, fCutHeight)

        # first clean the page (just in case!)
        nRemoved = doer.remove_cuts_from_dom(ndPage)
        if nRemoved > 0:
            traceln(" - removed %d pre-existing cut lines" % nRemoved)

        # then create the DOM nodes reflecting the cuts
        # if GT, then we have labelled cut lines in DOM
        domid = doer.add_Hcut_to_Page(ndPage, loHCut, domid)

    doc.write(sOutFilename,
              encoding='utf-8',
              pretty_print=True,
              xml_declaration=True)
    print('Annotated cut separators added to %s' % sOutFilename)
예제 #27
0
    def storeMPXML(self, lFiles):
        """
        Store the files of lFiles as one multi-page XML file, named
        <coldir>/col/<docid>.mpxml

        Return the document and the name of the written file.
        """
        sColDir = self.coldir + os.sep + 'col'
        sMPXML = os.path.join(sColDir, self.docid) + ".mpxml"

        doc = MultiPageXml.makeMultiPageXml(lFiles)
        # NOTE: the document is not validated against the schema here
        doc.write(sMPXML,
                  encoding="UTF-8",
                  pretty_print=True,
                  xml_declaration=True)

        return doc, sMPXML
    def getHisto(self,
                 lNd,
                 w,
                 _fMinHorizProjection,
                 h,
                 _fMinVertiProjection,
                 fRatio=1.0,
                 fMinHLen=None):
        """
        Return two Numpy arrays reflecting the histogram of the projections
        of the objects: the first one along the Y axis (horizontal
        projection, length h), the second one along the X axis (vertical
        projection, length w).

        lNd       nodes having a Coords child, from which a rectangle is
                  fitted
        w, h      width and height, i.e. lengths of the two histograms
        fRatio    ratio given to self.scale to scale each rectangle
        fMinHLen  when given, text not longer than fMinHLen is not scaled
                  horizontally
        _fMinHorizProjection and _fMinVertiProjection are not used.

        Nodes whose rectangle cannot be computed (ZeroDivisionError or
        ValueError) are ignored.
        """

        # builtin float: the np.float alias no longer exists in recent NumPy
        hy = np.zeros((h, ), float)
        hx = np.zeros((w, ), float)

        for nd in lNd:
            sPoints = MultiPageXml.getChildByName(nd,
                                                  'Coords')[0].get('points')
            try:
                x1, y1, x2, y2 = Polygon.parsePoints(sPoints).fitRectangle()

                if fMinHLen is None or abs(x2 - x1) > fMinHLen:
                    _x1, _x2 = self.scale(x1, x2, fRatio)
                else:
                    _x1, _x2 = x1, x2
                _y1, _y2 = self.scale(y1, y2, fRatio)
                # each covered line (column) gets the share of the page
                # width (height) covered by the unscaled rectangle
                hy[_y1:_y2 + 1] += float(x2 - x1) / w
                hx[_x1:_x2 + 1] += float(y2 - y1) / h
            except ZeroDivisionError:
                pass
            except ValueError:
                pass

        return hy, hx
예제 #29
0
    def processParameters(self):
        """
        Act on the parameters provided by the command line.

        Exit the process, with status 1, if no collection id is set.
        Set self.bFullCol. Then, if self.bRegenerateMPXML is set and a
        document id is given, build the multi-page XML from the *.pxml
        files of the document folder, write it in the "col" folder of the
        collection, and return it.

        Return None when nothing is regenerated.
        """
        if self.colid is None:
            print('collection id missing!')
            sys.exit(1)

        bHasDocId = self.docid is not None
        # NOTE(review): bFullCol is True when a document id IS given, which
        # looks inverted with regard to its name -- confirm with the callers
        self.bFullCol = bHasDocId

        if not (self.bRegenerateMPXML and bHasDocId):
            return None

        l = glob.glob(os.path.join(self.coldir, sCOL, self.docid,
                                   "*.pxml"))
        doc = MultiPageXml.makeMultiPageXml(l)
        outputFileName = os.path.join(
            self.coldir, sCOL,
            self.docid + TableProcessing.sMPXMLExtension)
        doc.write(outputFileName,
                  xml_declaration=True,
                  encoding="UTF-8",
                  pretty_print=True)
        return doc
예제 #30
0
    def _doShape_getChildByName(cls, node, name, ShapeClass, fun=None):
        """
        Get the nodes of the given name under node, with
        MultiPageXml.getChildByName (e.g. "SeparatorRegion"), and build a
        shape of the given class from the coordinates of each of them
            e.g.  <SeparatorRegion orient="horizontal 373.8 0.006" row="1">
                <Coords points="3324,392 3638,394"/>
              </SeparatorRegion>
        If fun is given, it is called with arguments: shape, current_node.
        A node that cannot be loaded is reported on the standard output and
        skipped.

        Return the list of shapes, in the order of the nodes.
        """
        lShape = []
        for ndChild in MultiPageXml.getChildByName(node, name):
            try:
                oShape = cls._shapeFromNodePoints(ndChild, ShapeClass)
            except Exception as e:
                print('ERROR: cannot load this element "%s"' % str(ndChild))
                print('  because "%s"' % e)
            else:
                if fun is not None:
                    fun(oShape, ndChild)
                lShape.append(oShape)

        return lShape