Пример #1
0
    def parseDocFile(self, sFilename, iVerbose=0):
        """
        Load the document stored in sFilename as a graph.

        The file is parsed with etree.parse and the resulting tree is stored
        in self.doc. The DOM is then annotated with grid lines (any grid
        already present is removed first), and for each page the nodes and
        edges are created and accumulated in:
        - self.lNode         : all nodes (classic ones, then special ones)
        - self.lNodeBlock    : nodes of the "classic" node types
        - self.lNodeGridLine : nodes of the other ("special") node types
        - self.lEdge         : all edges

        sFilename: name of the file to parse
        iVerbose : if non-zero, trace the total number of nodes and edges;
                   if >= 2, also trace per-page counts

        Return self (the graph object that was just filled in)
        """
        self.doc = etree.parse(sFilename)
        self.lNode, self.lEdge = list(), list()
        self.lNodeBlock = []  # text node
        self.lNodeGridLine = []  # grid line node

        root = self.doc.getroot()

        doer = GridAnnotator(self.iGridStep_H, self.iGridStep_V)

        # map the groundtruth table separators, if any, to our grid
        ltlHlV = doer.get_grid_GT_index_from_DOM(root, self.fMinPageCoverage)
        for (lHi, lVi) in ltlHlV:
            traceln(" - found %d horizontal,  %d vertical  GT separators" %
                    (len(lHi), len(lVi)))

        # create DOM node reflecting the grid
        # first clean (just in case!)
        n = doer.remove_grid_from_dom(root)
        if n > 0:
            traceln(" - removed %d existing grid lines" % n)

        # we add GridSeparator elements. Groundtruth ones have type="1"
        n = doer.add_grid_to_DOM(root, ltlHlV)
        traceln(" - added   %d grid lines  %s" %
                (n, (self.iGridStep_H, self.iGridStep_V)))

        # split the node types into the "classic" ones and the others
        lNodeType = self.getNodeTypeList()
        lClassicType = [
            nt for nt in lNodeType if nt in self._lClassicNodeType
        ]
        lSpecialType = [
            nt for nt in lNodeType if nt not in self._lClassicNodeType
        ]

        for pnum, page, domNdPage in self._iter_Page_DocNode(self.doc):
            # now that we have the page, let's create the node for each type!
            lClassicPageNode = [
                nd for nodeType in lClassicType
                for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page)
            ]
            lSpecialPageNode = [
                nd for nodeType in lSpecialType
                for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page)
            ]

            self.lNode.extend(lClassicPageNode)  # e.g. the TextLine objects
            self.lNodeBlock.extend(lClassicPageNode)

            self.lNode.extend(lSpecialPageNode)  # e.g. the grid lines!
            self.lNodeGridLine.extend(lSpecialPageNode)

            # no previous page to consider (for cross-page links...) => None
            lClassicPageEdge = Edge.computeEdges(None, lClassicPageNode)
            self.lEdge.extend(lClassicPageEdge)

            # Now, compute edges between special and classic objects...
            lSpecialPageEdge = self.computeSpecialEdges(
                lClassicPageNode, lSpecialPageNode)
            self.lEdge.extend(lSpecialPageEdge)

            if iVerbose >= 2:
                traceln("\tPage %5d" % (pnum))
                # exactly two values per line: each format has two %d
                # placeholders (the page number is traced just above)
                traceln("\t   block: %6d nodes    %7d edges (to block)" %
                        (len(lClassicPageNode), len(lClassicPageEdge)))
                traceln("\t   line: %6d nodes    %7d edges (from block)" %
                        (len(lSpecialPageNode), len(lSpecialPageEdge)))

        if iVerbose:
            traceln("\t\t (%d nodes,  %d edges)" %
                    (len(self.lNode), len(self.lEdge)))

        return self
Пример #2
0
    def computeSpecialEdges(cls, lClassicPageNode, lSpecialPageNode):
        """
        Compute:
        - edges between each block and the grid line above/across/below the block
        - edges between grid lines

        lClassicPageNode: the blocks of the page (must not be empty)
        lSpecialPageNode: the grid lines of the page

        Block-to-line edges are Edge_BL objects with:
        - len       : signed vertical distance between the block's vertical
                      middle and the grid line, divided by
                      (iGridStep_V * iBlockVisibility)
        - _gridtype : -1 for the lines at index <= floor(y1 / step),
                       0 for the lines strictly crossing the block,
                      +1 for the lines at index >= ceil(y2 / step)
        They are then filtered by cls._filterBadEdge and passed to
        cls._makeConsistentLabelForEmptyGridRow.

        Line-to-line edges are Edge_LL objects linking each grid line to the
        next cls.iGridVisibility ones, with len being the index difference.

        return a list of edges
        """
        iStep = cls.iGridStep_V
        iVisibility = cls.iBlockVisibility
        fStep = float(iStep)

        # indexing the grid lines
        dGridLineByIndex = {
            GridAnnotator.snapToGridIndex(nd.y1, iStep): nd
            for nd in lSpecialPageNode
        }

        # each grid line must be the one registered at its own index
        for nd in lSpecialPageNode:
            assert dGridLineByIndex[GridAnnotator.snapToGridIndex(
                nd.y1, iStep)] == nd, "internal error inconsistent grid"

        # block to grid line edges
        lEdge = []
        fLenNorm = float(iStep * iVisibility)
        # NOTE(review): imin starts at 100, so it is only a true minimum if
        # some edge is created with a line index < 100 -- confirm this is
        # what _filterBadEdge expects.
        imin, imax = 100, -1
        assert lClassicPageNode, "ERROR: empty page!!??"

        for ndBlock in lClassicPageNode:
            # index of the grid line at or above the block top, and of the
            # grid line at or below the block bottom
            i1 = int(math.floor(ndBlock.y1 / fStep))
            i2 = int(math.ceil(ndBlock.y2 / fStep))
            assert i2 >= i1

            yBlkAvg = (ndBlock.y1 + ndBlock.y2) / 2.0

            # Also make visible the iBlockVisibility-1 previous grid lines, if any
            for i in range(max(0, i1 - iVisibility + 1), i1 + 1):
                edge = Edge_BL(ndBlock, dGridLineByIndex[i])
                edge.len = (yBlkAvg - i * iStep) / fLenNorm
                edge._gridtype = -1
                lEdge.append(edge)
                imin = min(i, imin)

            for i in range(max(0, i1 + 1), max(0, i2)):
                ndLine = dGridLineByIndex[i]
                edge = Edge_BL(ndBlock, ndLine)
                edge.len = (yBlkAvg - i * iStep) / fLenNorm
                edge._gridtype = 0  # grid line is crossing the block
                assert ndBlock.y1 < i * iStep
                assert i * iStep < ndBlock.y2
                lEdge.append(edge)
                imax = max(imax, i)

            for i in range(max(0, i2), i2 + iVisibility):
                try:
                    edge = Edge_BL(ndBlock, dGridLineByIndex[i])
                except KeyError:
                    break  # out of the grid
                edge.len = (yBlkAvg - i * iStep) / fLenNorm
                edge._gridtype = +1
                lEdge.append(edge)
                imax = max(imax, i)

        # now filter those edges
        n0 = len(lEdge)
        lEdge = cls._filterBadEdge(lEdge, imin, imax, dGridLineByIndex)
        # number removed = count before filtering - count after filtering
        traceln(" - filtering: removed %d edges due to obstruction." %
                (n0 - len(lEdge)))

        # what differs from the previous version
        cls._makeConsistentLabelForEmptyGridRow(lEdge, lClassicPageNode,
                                                dGridLineByIndex)

        # grid line to grid line edges
        # (looks up indices 0..n-1, so it relies on the grid line indices
        # being exactly that range)
        n = len(dGridLineByIndex)
        for i in range(n):
            A = dGridLineByIndex[i]
            for j in range(i + 1, min(n, i + cls.iGridVisibility + 1)):
                edge = Edge_LL(A, dGridLineByIndex[j])
                edge.len = (j - i)
                lEdge.append(edge)

        return lEdge