예제 #1
0
                   )
    
    # ntA.setXpathExpr( (".//pc:TextLine | .//pc:TextRegion"        #how to find the nodes
    #                   , "./pc:TextEquiv")       #how to get their text
    #                 )
    DU_GRAPH.addNodeType(nt)
    
    return DU_GRAPH


if __name__ == "__main__":
    #     import better_exceptions
    #     better_exceptions.MAX_LENGTH = None
    
    # standard command line options for CRF- ECN- GAT-based methods
    usage, parser = DU_Task_Factory.getStandardOptionsParser(sys.argv[0])

    traceln("VERSION: %s" % DU_Task_Factory.getVersion())

    # --- 
    #parse the command line
    (options, args) = parser.parse_args()

    try:
        sModelDir, sModelName = args
    except Exception as e:
        traceln("Specify a model folder and a model name!")
        DU_Task_Factory.exit(usage, 1, e)

    doer = DU_Task_Factory.getDoer(sModelDir, sModelName
                                   , options                    = options
def main(sys_argv_0, sLabelAttribute, cNodeType=My_ConjugateNodeType):
    """
    Command-line driver: parse the options, build a "doer" through
    DU_Task_Factory.getDoer, configure its learner and run it.

    sys_argv_0      : program name, handed to the standard options parser
    sLabelAttribute : passed to the node type constructor and to its
                      setLabelAttribute method
    cNodeType       : node type class instantiated for the graph
    """

    def getConfiguredGraphClass(_doer):
        """
        Return the graph class, configured with one node type.

        NOTE: 'options' is read from the enclosing main() scope; it is bound
        there by parser.parse_args() before getDoer receives this function.
        """
        # graph class depends on the --separator flag
        cGraph = (ConjugateSegmenterGraph_MultiSinglePageXml_Separator
                  if options.bSeparator
                  else ConjugateSegmenterGraph_MultiSinglePageXml)

        # positional arguments: label attribute, two empty lists, and a flag
        # that the original code marks as unused
        nt = cNodeType(sLabelAttribute, [], [], False,
                       BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v/3)))
        nt.setLabelAttribute(sLabelAttribute)
        # first XPath selects the nodes, second one selects their text
        nt.setXpathExpr((".//pc:TextLine", ".//pc:Unicode"))
        cGraph.addNodeType(nt)

        return cGraph

    # --- command line definition
    # standard options for CRF- ECN- GAT-based methods ...
    usage, parser = DU_Task_Factory.getStandardOptionsParser(sys_argv_0)
    # ... plus the boolean flags of this task (all defaulting to False)
    for tFlags, sDest, sHelp in (
            (("--separator",), 'bSeparator',
             "Use the graphical spearators, if any, as edge features."),
            (("--text",), 'bText',
             "Use textual information if any, as node and edge features."),
            (("--edge_vh", "--edge_hv"), 'bShift',
             "Shift edge feature by range depending on edge type."),
    ):
        parser.add_option(*tFlags, dest=sDest, action="store_true",
                          default=False, help=sHelp)
    # this one deliberately has no explicit default, as in the original call
    parser.add_option("--jsonocr", dest='bJsonOcr', action="store_true",
                      help="I/O is in json")
    traceln("VERSION: %s" % DU_Task_Factory.getVersion())

    # --- command line parsing
    options, args = parser.parse_args()

    # exactly two positional arguments are expected
    try:
        sModelDir, sModelName = args
    except Exception as e:
        traceln("Specify a model folder and a model name!")
        DU_Task_Factory.exit(usage, 1, e)

    # report the active flags
    for bFlag, sMsg in (
            (options.bText, " - using textual data, if any"),
            (options.bSeparator, " - using graphical separators, if any"),
            (options.bShift, " - shift edge features by edge type"),
    ):
        if bFlag:
            traceln(sMsg)

    # --- feature definition, chosen from (text, separator, shift)
    # conditional expressions keep the lookup lazy: only the selected class
    # name is evaluated
    if options.bText:
        if options.bSeparator:
            cFeatureDefinition = (Features_June19_Full_Separator_Shift
                                  if options.bShift
                                  else Features_June19_Full_Separator)
        else:
            cFeatureDefinition = (Features_June19_Full_Shift
                                  if options.bShift
                                  else Features_June19_Full)
    else:
        if options.bSeparator:
            cFeatureDefinition = (Features_June19_Simple_Separator_Shift
                                  if options.bShift
                                  else Features_June19_Simple_Separator)
        else:
            cFeatureDefinition = (Features_June19_Simple_Shift
                                  if options.bShift
                                  else Features_June19_Simple)

    # === setting the graph type (and its node type) and the feature definition
    doer = DU_Task_Factory.getDoer(
        sModelDir, sModelName,
        options=options,
        fun_getConfiguredGraphClass=getConfiguredGraphClass,
        cFeatureDefinition=cFeatureDefinition)

    # === learner configuration
    # obtained in the standard way from the options; a custom configuration
    # could be substituted here before it is handed to the doer
    dLearnerConfig = doer.getStandardLearnerConfig(options)
    doer.setLearnerConfiguration(dLearnerConfig)

    # === go: act as specified on the command line (--trn, --fold-run, ...)
    doer.standardDo(options)

    del doer
예제 #3
0
            cFeatureDefinition=FeatureDefinition_PageXml_LogitExtractorV2)

        self.setNbClass(
            nbClass
        )  #so that we check if all classes are represented in the training set

        self.bsln_mdl = self.addBaseline_LogisticRegression(
        )  #use a LR model trained by GridSearch as baseline

    #=== END OF CONFIGURATION =============================================================


if __name__ == "__main__":

    version = "v.01"
    usage, description, parser = DU_Task_Factory.getStandardOptionsParser(
        sys.argv[0], version)

    # ---
    #parse the command line
    (options, args) = parser.parse_args()

    # ---
    try:
        sModelDir, sModelName = args
    except Exception as e:
        traceln("Specify a model folder and a model name!")
        _exit(usage, 1, e)

    doer = DU_GTBooks(sModelName,
                      sModelDir,
                      C=options.crf_C,
예제 #4
0
        ".//pc:TextLine"  #how to find the nodes            
        #, "./pc:TextEquiv")       #how to get their text
        ,
        ".//pc:Unicode")  #how to get their text
                    )
    DU_GRAPH.addNodeType(nt)

    return DU_GRAPH


if __name__ == "__main__":
    #     import better_exceptions
    #     better_exceptions.MAX_LENGTH = None

    # standard command line options for CRF- ECN- GAT-based methods
    usage, parser = DU_Task_Factory.getStandardOptionsParser(sys.argv[0])

    traceln("VERSION: %s" % DU_Task_Factory.getVersion())

    # ---
    #parse the command line
    (options, args) = parser.parse_args()

    try:
        sModelDir, sModelName = args
    except Exception as e:
        traceln("Specify a model folder and a model name!")
        DU_Task_Factory.exit(usage, 1, e)

    # === SETTING the graph type (and its node type) and the feature extraction pipe
    doer = DU_Task_Factory.getDoer(
예제 #5
0
        #, "./pc:TextEquiv")       #how to get their text
        ,
        ".//pc:Unicode")  #how to get their text
                    )
    DU_GRAPH.addNodeType(nt)

    return DU_GRAPH


if __name__ == "__main__":
    #     import better_exceptions
    #     better_exceptions.MAX_LENGTH = None
    global options

    # standard command line options for CRF- ECN- GAT-based methods
    usage, parser = DU_Task_Factory.getStandardOptionsParser(sys.argv[0])

    parser.add_option(
        "--separator",
        dest='bSeparator',
        action="store_true",
        default=False,
        help="Use the graphical spearators, if any, as edge features.")
    traceln("VERSION: %s" % DU_Task_Factory.getVersion())

    # ---
    #parse the command line
    (options, args) = parser.parse_args()

    try:
        sModelDir, sModelName = args
예제 #6
0
def main(sys_argv_0, sLabelAttribute, cNodeType=NodeType_PageXml_type_woText):
    """
    Command-line driver: parse the options, pick a feature definition,
    build a "doer" through DU_Task_Factory.getDoer, configure its learner
    and run it.

    sys_argv_0      : program name, handed to the standard options parser
    sLabelAttribute : passed to the node type constructor and to its
                      setLabelAttribute method
    cNodeType       : node type class instantiated for the graph
    """

    def getConfiguredGraphClass(_doer):
        """
        Return the graph class, configured with one node type.
        """
        cGraph = Graph_MultiSinglePageXml

        # labels given to the node type
        lLabels = ['heading', 'paragraph', 'footnote', 'None']

        # positional arguments: label attribute, labels, an empty list, and
        # a flag that the original code marks as unused
        nt = cNodeType(sLabelAttribute, lLabels, [], False,
                       BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3)))
        nt.setLabelAttribute(sLabelAttribute)
        # first XPath selects the nodes, second one selects their text
        nt.setXpathExpr((".//pc:TextLine", ".//pc:Unicode"))
        cGraph.addNodeType(nt)

        return cGraph

    # --- command line definition
    # standard options for CRF- ECN- GAT-based methods ...
    usage, parser = DU_Task_Factory.getStandardOptionsParser(sys_argv_0)
    # ... plus the boolean flags of this task (all defaulting to False)
    for tFlags, sDest, sHelp in (
            (("--separator",), 'bSeparator',
             "Use the graphical spearators, if any, as edge features."),
            (("--text",), 'bText',
             "Use textual information if any, as node and edge features."),
            (("--edge_vh", "--edge_hv"), 'bShift',
             "Shift edge feature by range depending on edge type."),
    ):
        parser.add_option(*tFlags, dest=sDest, action="store_true",
                          default=False, help=sHelp)
    # ... and the path of an optional SentencePiece model
    parser.add_option(
        "--spm", dest='sSPModel', action="store", type="string",
        help="Textual features are computed based on the given SentencePiece model. e.g. model/toto.model.")
    traceln("VERSION: %s" % DU_Task_Factory.getVersion())

    # --- command line parsing
    options, args = parser.parse_args()

    # exactly two positional arguments are expected
    try:
        sModelDir, sModelName = args
    except Exception as e:
        traceln("Specify a model folder and a model name!")
        DU_Task_Factory.exit(usage, 1, e)

    # --- feature definition and its configuration
    dFeatureConfig = {}
    if options.sSPModel:
        # the ".model" extension is appended when missing
        if not options.sSPModel.endswith(".model"):
            options.sSPModel += ".model"
        traceln(" - using SentencePiece model '%s' to create textual features"
                % options.sSPModel)
        # fail early: the package must be importable and the file openable
        import sentencepiece as spm
        with open(options.sSPModel):
            pass
        dFeatureConfig = {"sSPModel": options.sSPModel}
        cFeatureDefinition = Features_June19_Full_SPM

        # giving a SentencePiece model switches the --text flag on
        options.bText = True
    elif options.bText:
        cFeatureDefinition = Features_GHENT_UNIGRAM
    else:
        cFeatureDefinition = Features_GHENT

    # report the active flags
    # NOTE: --separator and --edge_vh are traced here but do not affect the
    # feature definition selected above
    for bFlag, sMsg in (
            (options.bText, " - using textual data, if any"),
            (options.bSeparator, " - using graphical separators, if any"),
            (options.bShift, " - shift edge features by edge type"),
    ):
        if bFlag:
            traceln(sMsg)

    # === setting the graph type (and its node type) and the feature definition
    doer = DU_Task_Factory.getDoer(
        sModelDir, sModelName,
        options=options,
        fun_getConfiguredGraphClass=getConfiguredGraphClass,
        cFeatureDefinition=cFeatureDefinition,
        dFeatureConfig=dFeatureConfig)

    # === learner configuration
    # obtained in the standard way from the options, then the 'balanced'
    # entry is forced to True
    dLearnerConfig = doer.getStandardLearnerConfig(options)
    dLearnerConfig['balanced'] = True
    doer.setLearnerConfiguration(dLearnerConfig)
    doer.setAdditionalDataProvider(getDataToPickle_for_table)

    # conjugate mode is intentionally not enabled here
    # (doer.setConjugateMode() is not called)

    # === go: act as specified on the command line (--trn, --fold-run, ...)
    doer.standardDo(options)

    del doer
예제 #7
0
                 )    
    nt.setLabelAttribute("type")
    nt.setXpathExpr((".//pc:TextRegion"
                      , "")
                   )
    DU_GRAPH.addNodeType(nt)
    
    return DU_GRAPH


if __name__ == "__main__":
    #     import better_exceptions
    #     better_exceptions.MAX_LENGTH = None
    
    # standard command line options for CRF- ECN- GAT-based methods
    usage, parser = DU_Task_Factory.getStandardOptionsParser(sys.argv[0])

    parser.add_option("--TextRegion"     , dest='bTextRegion' , action="store_true"
                        , default=False, help="Tag TextRegion, instead of TextLine") 
    parser.add_option("--no_text"     , dest='bNoText'   , action="store_true"
                        , default=False, help="Do not use text, otherwise use utext unigrams") 

    traceln("VERSION: %s" % DU_Task_Factory.getVersion())

    # --- 
    #parse the command line
    (options, args) = parser.parse_args()
    bTextRegion = options.bTextRegion
    bText       = not(options.bNoText)
    traceln("bTextRegion=%s  bText=%s" % (bTextRegion, bText))