示例#1
0
    def parseDocFile(self, sFilename, iVerbose=0):
        """
        Load that document as a CRF Graph.
        Also set the self.doc variable!
        
        Return a CRF Graph object
        """
        self.doc = etree.parse(sFilename)
        self.lNode, self.lEdge = list(), list()
        self.lNodeBlock = []  # text node
        self.lNodeCutLine = []  # cut line node

        root = self.doc.getroot()

        doer = BaselineCutAnnotator()
        doer.setLabelScheme_SIO()  #use SIO instead of SO labels!

        #doer.setModulo(self.iModulo)  # this is optional

        #load the groundtruth table separators, if any, per page (1 in tABP)
        ltlYlX = doer.get_separator_YX_from_DOM(root, self.fMinPageCoverage)
        for (lHi, lVi) in ltlYlX:
            traceln(" - found %d horizontal,  %d vertical  GT separators" %
                    (len(lHi), len(lVi)))

        #create DOM node reflecting the cuts
        #first clean (just in case!)
        n = doer.remove_cuts_from_dom(root)
        if n > 0:
            traceln(" - removed %d pre-existing cut lines" % n)

        # if GT, then we have labelled cut lines in DOM
        _ltlYCutXCut = doer.add_cut_to_DOM(root, ltlYlX=ltlYlX)

        lClassicType = [
            nt for nt in self.getNodeTypeList() if nt in self._lClassicNodeType
        ]
        lSpecialType = [
            nt for nt in self.getNodeTypeList()
            if nt not in self._lClassicNodeType
        ]

        for (pnum, page, domNdPage) in self._iter_Page_DocNode(self.doc):
            #now that we have the page, let's create the node for each type!
            lClassicPageNode = [
                nd for nodeType in lClassicType
                for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page)
            ]
            lSpecialPageNode = [
                nd for nodeType in lSpecialType
                for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page)
            ]

            self.lNode.extend(lClassicPageNode)  # e.g. the TextLine objects
            self.lNodeBlock.extend(lClassicPageNode)

            self.lNode.extend(lSpecialPageNode)  # e.g. the cut lines!
            self.lNodeCutLine.extend(lSpecialPageNode)

            #no previous page to consider (for cross-page links...) => None
            lClassicPageEdge = Edge.computeEdges(None, lClassicPageNode)
            self.lEdge.extend(lClassicPageEdge)

            # Now, compute edges between special and classic objects...
            lSpecialPageEdge = self.computeSpecialEdges(
                lClassicPageNode, lSpecialPageNode, doer.bCutIsBeforeText)
            self.lEdge.extend(lSpecialPageEdge)

            #if iVerbose>=2: traceln("\tPage %5d    %6d nodes    %7d edges"%(pnum, len(lPageNode), len(lPageEdge)))
            if iVerbose >= 2:
                traceln("\tPage %5d" % (pnum))
                traceln("\t   block: %6d nodes    %7d edges (to block)" %
                        (pnum, len(lClassicPageNode), len(lClassicPageEdge)))
                traceln("\t   line: %6d nodes    %7d edges (from block)" %
                        (pnum, len(lSpecialPageNode), len(lSpecialPageEdge)))

        if iVerbose:
            traceln("\t\t (%d nodes,  %d edges)" %
                    (len(self.lNode), len(self.lEdge)))

        return self
示例#2
0
    def parseDocFile(self, sFilename, iVerbose=0):
        """
        Load that document as a CRF Graph.
        Also set the self.doc variable!
        
        Return a CRF Graph object
        """
        self.doc = etree.parse(sFilename)
        self.lNode, self.lEdge = list(), list()
        self.lNodeBlock = []  # text node
        self.lNodeGridLine = []  # grid line node

        root = self.doc.getroot()

        doer = GridAnnotator(self.iGridStep_H, self.iGridStep_V)

        #map the groundtruth table separators, if any, to our grid
        ltlHlV = doer.get_grid_GT_index_from_DOM(root, self.fMinPageCoverage)
        for (lHi, lVi) in ltlHlV:
            traceln(" - found %d horizontal,  %d vertical  GT separators" %
                    (len(lHi), len(lVi)))

        #create DOM node reflecting the grid
        #first clean (just in case!)
        n = doer.remove_grid_from_dom(root)
        if n > 0: traceln(" - removed %d existing grid lines" % n)

        # we add GridSeparator elements. Groundtruth ones have type="1"
        n = doer.add_grid_to_DOM(root, ltlHlV)
        traceln(" - added   %d grid lines  %s" %
                (n, (self.iGridStep_H, self.iGridStep_V)))

        lClassicType = [
            nt for nt in self.getNodeTypeList() if nt in self._lClassicNodeType
        ]
        lSpecialType = [
            nt for nt in self.getNodeTypeList()
            if nt not in self._lClassicNodeType
        ]
        for pnum, page, domNdPage in self._iter_Page_DocNode(self.doc):
            #now that we have the page, let's create the node for each type!
            lClassicPageNode = [
                nd for nodeType in lClassicType
                for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page)
            ]
            lSpecialPageNode = [
                nd for nodeType in lSpecialType
                for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page)
            ]

            self.lNode.extend(lClassicPageNode)  # e.g. the TextLine objects
            self.lNodeBlock.extend(lClassicPageNode)

            self.lNode.extend(lSpecialPageNode)  # e.g. the grid lines!
            self.lNodeGridLine.extend(lSpecialPageNode)

            #no previous page to consider (for cross-page links...) => None
            lClassicPageEdge = Edge.computeEdges(None, lClassicPageNode)
            self.lEdge.extend(lClassicPageEdge)

            # Now, compute edges between special and classic objects...
            lSpecialPageEdge = self.computeSpecialEdges(
                lClassicPageNode, lSpecialPageNode)
            self.lEdge.extend(lSpecialPageEdge)

            #if iVerbose>=2: traceln("\tPage %5d    %6d nodes    %7d edges"%(pnum, len(lPageNode), len(lPageEdge)))
            if iVerbose >= 2:
                traceln("\tPage %5d" % (pnum))
                traceln("\t   block: %6d nodes    %7d edges (to block)" %
                        (pnum, len(lClassicPageNode), len(lClassicPageEdge)))
                traceln("\t   line: %6d nodes    %7d edges (from block)" %
                        (pnum, len(lSpecialPageNode), len(lSpecialPageEdge)))

        if iVerbose:
            traceln("\t\t (%d nodes,  %d edges)" %
                    (len(self.lNode), len(self.lEdge)))

        return self