示例#1
0
 def __init__(self, sModelName, sModelDir,
              iBlockVisibility=None, iLineVisibility=None,
              sComment=None,
              C=None, tol=None, njobs=None, max_iter=None,
              inference_cache=None):
     """
     Configure the DU_CRF_Task for this model.

     iBlockVisibility, iLineVisibility: stored as class attributes of
         DU_ABPTableRCut (not on the instance) before the parent
         constructor is called.
     C, tol, njobs, max_iter, inference_cache: learner settings; any of
         them left to None takes the default set below.
     sComment: passed through to DU_CRF_Task.
     """
     # kept on the class itself, so the values are shared class-wide
     DU_ABPTableRCut.iBlockVisibility = iBlockVisibility
     DU_ABPTableRCut.iLineVisibility = iLineVisibility

     # every entry maps to an empty configuration
     dFeatureConfig = {
         'row_row': {},
         'row_hdr': {},
         'row_sepH': {},
         'hdr_row': {},
         'hdr_hdr': {},
         'hdr_sepH': {},
         'sepH_row': {},
         'sepH_hdr': {},
         'sepH_sepH': {},
         'row': {},
         'hdr': {},
         'sepH': {},
     }

     # resolve the learner defaults
     if C is None:
         C = .1
     if njobs is None:
         njobs = 4
     if inference_cache is None:
         inference_cache = 50
     if tol is None:
         tol = .05
     if max_iter is None:
         max_iter = 10

     dLearnerConfig = {
         'C': C,
         'njobs': njobs,
         'inference_cache': inference_cache,
         'tol': tol,
         'save_every': 50,  # periodic save, for warm start
         'max_iter': max_iter,
     }

     DU_CRF_Task.__init__(self, sModelName, sModelDir,
                          dFeatureConfig=dFeatureConfig,
                          dLearnerConfig=dLearnerConfig,
                          sComment=sComment,
                          cFeatureDefinition=My_FeatureDefinition_v2)
示例#2
0
 def __init__(self, sModelName, sModelDir, sComment=None):
     """
     Configure the DU_CRF_Task with fixed feature and learner settings,
     then register a logistic-regression baseline.
     """
     # same three settings for the node side and for the edge side
     dFeatureConfig = {
         'n_tfidf_node': 10,
         't_ngrams_node': (2, 2),
         'b_tfidf_node_lc': False,
         'n_tfidf_edge': 10,
         't_ngrams_edge': (2, 2),
         'b_tfidf_edge_lc': False,
     }

     dLearnerConfig = {
         'C': .1,
         'njobs': 2,
         'inference_cache': 10,
         'tol': .1,
         'save_every': 5,  # periodic save, for warm start
         'max_iter': 2,
     }

     DU_CRF_Task.__init__(self, sModelName, sModelDir,
                          dFeatureConfig=dFeatureConfig,
                          dLearnerConfig=dLearnerConfig,
                          sComment=sComment)

     # a LR model is used as baseline
     self.addBaseline_LogisticRegression()
示例#3
0
    def __init__(self, sModelName, sModelDir, sComment=None, C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
        """
        Configure the graph class, then the DU_CRF_Task, then a baseline.

        C, tol, njobs, max_iter, inference_cache: learner settings; any of
            them left to None keeps the default listed below.
        sComment: passed through to DU_CRF_Task.
        """
        # Alternative way of specifying the graph class
        # (defining a getConfiguredGraphClass is the preferred one).
        self.configureGraphClass(self.DU_GRAPH)

        # start from the defaults, then overwrite with whatever was given
        dLearnerConfig = {
            'C': .1,
            'njobs': 8,
            'inference_cache': 50,
            'tol': .05,
            'save_every': 50,  # periodic save, for warm start
            'max_iter': 1000,
        }
        for sKey, given in (('C', C),
                            ('njobs', njobs),
                            ('inference_cache', inference_cache),
                            ('tol', tol),
                            ('max_iter', max_iter)):
            if given is not None:
                dLearnerConfig[sKey] = given

        dFeatureConfig = {
            # extractor configuration for the nodes of each type
            "text": None,
            "sprtr": None,
            # extractor configuration for the edges of each type
            "text_text": None,
            "text_sprtr": None,
            "sprtr_text": None,
            "sprtr_sprtr": None,
        }

        DU_CRF_Task.__init__(self, sModelName, sModelDir,
                             dLearnerConfig=dLearnerConfig,
                             sComment=sComment,
                             cFeatureDefinition=FeatureDefinition_T_PageXml_StandardOnes_noText_v3,
                             dFeatureConfig=dFeatureConfig)

        lsLabelName = self.DU_GRAPH.getLabelNameList()
        traceln("- classes: ", lsLabelName)

        # a LR model trained by GridSearch is used as baseline
        self.bsln_mdl = self.addBaseline_LogisticRegression()
    def __init__(self,
                 sModelName,
                 sModelDir,
                 iBlockVisibility=None,
                 iLineVisibility=None,
                 fCutHeight=None,
                 bCutAbove=None,
                 lRadAngle=None,
                 bTxt=None,
                 sComment=None,
                 cFeatureDefinition=None,
                 dFeatureConfig={},
                 C=None,
                 tol=None,
                 njobs=None,
                 max_iter=None,
                 inference_cache=None):
        """
        Configure the DU_CRF_Task for this model.

        iBlockVisibility, iLineVisibility, fCutHeight, bCutAbove, lRadAngle,
        bTxt: each is stored as a class attribute of
            DU_ABPTableSkewedRowCutLine before the parent constructor runs.
        C, tol, njobs, max_iter, inference_cache: learner settings; any of
            them left to None takes the default set below.
        sComment: passed through to DU_CRF_Task.

        NOTE: the cFeatureDefinition and dFeatureConfig arguments are
        accepted but not used here: the feature definition is chosen from
        self.bTxt and the feature configuration is fixed.
        """
        cls = DU_ABPTableSkewedRowCutLine
        cls.iBlockVisibility = iBlockVisibility
        cls.iLineVisibility = iLineVisibility
        cls.fCutHeight = fCutHeight
        cls.bCutAbove = bCutAbove
        cls.lRadAngle = lRadAngle
        cls.bTxt = bTxt

        # every entry maps to an empty configuration
        dFixedFeatureConfig = {
            'row_row': {},
            'row_sepH': {},
            'sepH_row': {},
            'sepH_sepH': {},
            'sepH': {},
            'row': {},
        }

        # resolve the learner defaults
        if C is None:
            C = .1
        if njobs is None:
            njobs = 4
        if inference_cache is None:
            inference_cache = 50
        if tol is None:
            tol = .05
        if max_iter is None:
            max_iter = 10

        dLearnerConfig = {
            'C': C,
            'njobs': njobs,
            'inference_cache': inference_cache,
            'tol': tol,
            'save_every': 50,  # periodic save, for warm start
            'max_iter': max_iter,
        }

        # the textual variant is used only when bTxt is set
        if self.bTxt:
            cChosenFeatureDefinition = My_FeatureDefinition_v3_txt
        else:
            cChosenFeatureDefinition = My_FeatureDefinition_v3

        DU_CRF_Task.__init__(self,
                             sModelName,
                             sModelDir,
                             dFeatureConfig=dFixedFeatureConfig,
                             dLearnerConfig=dLearnerConfig,
                             sComment=sComment,
                             cFeatureDefinition=cChosenFeatureDefinition)
示例#5
0
    def __init__(self,
                 sModelName,
                 sModelDir,
                 sComment=None,
                 C=None,
                 tol=None,
                 njobs=None,
                 max_iter=None,
                 inference_cache=None):
        """
        Configure the DU_CRF_Task (graph class DU_GRAPH), declare the number
        of classes and register a logistic-regression baseline.

        C, tol, njobs, max_iter, inference_cache: learner settings; any of
            them left to None keeps the default listed below.
            NOTE (original author): these may be lists in case of gridsearch.
        sComment: passed through to DU_CRF_Task.
        """
        dFeatureConfig = {
            'nbClass': nbClass,
            't_ngrams_node': (2, 4),
            'b_node_lc': False,
            't_ngrams_edge': (2, 4),
            'b_edge_lc': False,
            # n_jobs when fitting the internal Logit feat extractor model by grid search
            'n_jobs': 5,
        }

        # start from the defaults, then overwrite with whatever was given
        dLearnerConfig = {
            'C': .1,
            'njobs': 5,
            'inference_cache': 50,
            'tol': .05,
            'save_every': 50,  # periodic save, for warm start
            'max_iter': 1000,
        }
        for sKey, given in (('C', C),
                            ('njobs', njobs),
                            ('inference_cache', inference_cache),
                            ('tol', tol),
                            ('max_iter', max_iter)):
            if given is not None:
                dLearnerConfig[sKey] = given

        DU_CRF_Task.__init__(self,
                             sModelName,
                             sModelDir,
                             DU_GRAPH,
                             dFeatureConfig=dFeatureConfig,
                             dLearnerConfig=dLearnerConfig,
                             sComment=sComment,
                             cFeatureDefinition=FeatureDefinition_PageXml_LogitExtractorV2)

        # so that we check if all classes are represented in the training set
        self.setNbClass(nbClass)

        # a LR model trained by GridSearch is used as baseline
        self.bsln_mdl = self.addBaseline_LogisticRegression()
示例#6
0
    def __init__(self,
                 sModelName,
                 sModelDir,
                 iGridVisibility=None,
                 iBlockVisibility=None,
                 sComment=None,
                 C=None,
                 tol=None,
                 njobs=None,
                 max_iter=None,
                 inference_cache=None):
        """
        Configure the DU_CRF_Task for this model and turn on balanced
        weights on the model class.

        iGridVisibility, iBlockVisibility: stored as class attributes of
            DU_ABPTableRG before the parent constructor runs.
        C, tol, njobs, max_iter, inference_cache: learner settings; any of
            them left to None takes the default set below.
        sComment: passed through to DU_CRF_Task.
        """
        # kept on the class itself, so the values are shared class-wide
        DU_ABPTableRG.iGridVisibility = iGridVisibility
        DU_ABPTableRG.iBlockVisibility = iBlockVisibility

        # every entry maps to an empty configuration
        dFeatureConfig = {
            'row_row': {},
            'row_gh': {},
            'gh_row': {},
            'gh_gh': {},
            'gh': {},
            'row': {},
        }

        # resolve the learner defaults
        if C is None:
            C = .1
        if njobs is None:
            njobs = 4
        if inference_cache is None:
            inference_cache = 50
        if tol is None:
            tol = .05
        if max_iter is None:
            max_iter = 10

        dLearnerConfig = {
            'C': C,
            'njobs': njobs,
            'inference_cache': inference_cache,
            'tol': tol,
            'save_every': 50,  # periodic save, for warm start
            'max_iter': max_iter,
        }

        DU_CRF_Task.__init__(self,
                             sModelName,
                             sModelDir,
                             dFeatureConfig=dFeatureConfig,
                             dLearnerConfig=dLearnerConfig,
                             sComment=sComment,
                             cFeatureDefinition=My_FeatureDefinition)

        self.cModelClass.setBalancedWeights(True)
    def __init__(self, sModelName, sModelDir, sComment=None, C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
        """
        Configure the DU_CRF_Task with an empty feature configuration.

        C, tol, njobs, max_iter, inference_cache: learner settings; any of
            them left to None keeps the default listed below.
        sComment: passed through to DU_CRF_Task.
        """
        # start from the defaults, then overwrite with whatever was given
        dLearnerConfig = {
            'C': .1,
            'njobs': 4,
            'inference_cache': 50,
            'tol': .05,
            'save_every': 50,  # periodic save, for warm start
            'max_iter': 10,
        }
        for sKey, given in (('C', C),
                            ('njobs', njobs),
                            ('inference_cache', inference_cache),
                            ('tol', tol),
                            ('max_iter', max_iter)):
            if given is not None:
                dLearnerConfig[sKey] = given

        DU_CRF_Task.__init__(self, sModelName, sModelDir,
                             dFeatureConfig={},
                             dLearnerConfig=dLearnerConfig,
                             sComment=sComment,
                             cFeatureDefinition=My_FeatureDefinition_v2)
示例#8
0
 def __init__(self, sModelName, sModelDir, feat_select=None, sComment=None):
     """
     Configure the DU_CRF_Task (graph class DU_GRAPH) and register a
     logistic-regression baseline.

     feat_select: 'chi2' selects dFeatureConfig_FeatSelect; any other value
         (including None) selects dFeatureConfig_Baseline.
     sComment: passed through to DU_CRF_Task.
     """
     # only the feature configuration differs between the two modes
     dChosenFeatureConfig = (dFeatureConfig_FeatSelect
                             if feat_select == 'chi2'
                             else dFeatureConfig_Baseline)

     DU_CRF_Task.__init__(self, sModelName, sModelDir, DU_GRAPH,
                          dFeatureConfig=dChosenFeatureConfig,
                          dLearnerConfig=dLearnerConfig,
                          sComment=sComment)

     # a LR model is used as baseline
     self.addBaseline_LogisticRegression()
示例#9
0
    def __init__(self,
                 sModelName,
                 sModelDir,
                 sComment=None,
                 C=None,
                 tol=None,
                 njobs=None,
                 max_iter=None,
                 inference_cache=None):
        """
        Configure the DU_CRF_Task with an empty feature configuration and,
        when options.bBaseline is set, register a logistic-regression
        baseline.

        sComment: defaults to the model name when None.
        C, tol, njobs, max_iter, inference_cache: learner settings; any of
            them left to None takes the default set below.
        """
        if sComment is None:
            sComment = sModelName

        # resolve the learner defaults
        if C is None:
            C = .1
        if njobs is None:
            njobs = 2
        if inference_cache is None:
            inference_cache = 50
        if tol is None:
            tol = .05
        if max_iter is None:
            max_iter = 200

        dLearnerConfig = {
            'C': C,
            'njobs': njobs,
            'inference_cache': inference_cache,
            'tol': tol,
            'save_every': 50,  # periodic save, for warm start
            'max_iter': max_iter,
        }

        DU_CRF_Task.__init__(self,
                             sModelName,
                             sModelDir,
                             dFeatureConfig={},
                             dLearnerConfig=dLearnerConfig,
                             sComment=sComment,
                             cFeatureDefinition=FeatureDefinition_PageXml_StandardOnes_noText_v3)

        # the baseline is optional: a LR model trained by GridSearch
        if options.bBaseline:
            self.bsln_mdl = self.addBaseline_LogisticRegression()
示例#10
0
    def __init__(self, sModelName, sModelDir, sComment=None, C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
        """
        Configure the DU_CRF_Task, choosing the feature definition from
        self.bHTR, then register a logistic-regression baseline.

        C, tol, njobs, max_iter, inference_cache: learner settings; any of
            them left to None keeps the default listed below.
        sComment: passed through to DU_CRF_Task.
        """
        if not self.bHTR:
            # no-text feature definition, without any configuration
            cFeatureDefinition = FeatureDefinition_PageXml_StandardOnes_noText
            dFeatureConfig = {}
        else:
            # standard feature definition, with its tf-idf settings
            cFeatureDefinition = FeatureDefinition_PageXml_StandardOnes
            dFeatureConfig = {
                'n_tfidf_node': 100,
                't_ngrams_node': (1, 2),
                'b_tfidf_node_lc': False,
                'n_tfidf_edge': 100,
                't_ngrams_edge': (1, 2),
                'b_tfidf_edge_lc': False,
            }

        # start from the defaults, then overwrite with whatever was given
        dLearnerConfig = {
            'C': .1,
            'njobs': 16,
            'inference_cache': 50,
            'tol': .05,
            'save_every': 50,  # periodic save, for warm start
            'max_iter': 1000,
        }
        for sKey, given in (('C', C),
                            ('njobs', njobs),
                            ('inference_cache', inference_cache),
                            ('tol', tol),
                            ('max_iter', max_iter)):
            if given is not None:
                dLearnerConfig[sKey] = given

        DU_CRF_Task.__init__(self, sModelName, sModelDir,
                             dFeatureConfig=dFeatureConfig,
                             dLearnerConfig=dLearnerConfig,
                             sComment=sComment,
                             cFeatureDefinition=cFeatureDefinition)

        lsLabelName = self.getGraphClass().getLabelNameList()
        traceln("- classes: ", lsLabelName)

        # a LR model trained by GridSearch is used as baseline
        self.bsln_mdl = self.addBaseline_LogisticRegression()
示例#11
0
 def runForExternalMLMethod(self, lsColDir, storeX, applyY, bRevertEdges=False):
     """
     Set the XML filename pattern to "*.mpxml", then delegate to
     DU_CRF_Task.runForExternalMLMethod.

     Return the list of produced files
     """
     self.sXmlFilenamePattern = "*.mpxml"
     lsProducedFile = DU_CRF_Task.runForExternalMLMethod(
         self, lsColDir, storeX, applyY, bRevertEdges)
     return lsProducedFile
示例#12
0
            except:
                pass  #we only need the transformer
            lsOutputFilename = doer.runForExternalMLMethod(
                lRun, options.storeX, options.applyY, options.bRevertEdges)
        else:
            doer.load()
            lsOutputFilename = doer.predict(lRun)

        traceln("Done, see in:\n  %s" % lsOutputFilename)


# ----------------------------------------------------------------------------
if __name__ == "__main__":

    version = "v.01"
    usage, description, parser = DU_CRF_Task.getBasicTrnTstRunOptionParser(
        sys.argv[0], version)
    #     parser.add_option("--annotate", dest='bAnnotate',  action="store_true",default=False,  help="Annotate the textlines with BIES labels")

    #FOR GCN
    parser.add_option("--revertEdges",
                      dest='bRevertEdges',
                      action="store_true",
                      help="Revert the direction of the edges")
    parser.add_option("--detail",
                      dest='bDetailedReport',
                      action="store_true",
                      default=False,
                      help="Display detailed reporting (score per document)")
    parser.add_option("--baseline",
                      dest='bBaseline',
                      action="store_true",
示例#13
0
 def predict(self, lsColDir):
     """
     Set the XML filename pattern to "*.mpxml", then delegate to
     DU_CRF_Task.predict.

     Return the list of produced files
     """
     self.sXmlFilenamePattern = "*.mpxml"
     lsProducedFile = DU_CRF_Task.predict(self, lsColDir)
     return lsProducedFile
示例#14
0
def main(DU_BAR):
    """
    Command-line driver for a DU_CRF_Task-like class.

    DU_BAR: the class to instantiate; it is called with the model name, the
        model folder and the CRF settings read from the command line.

    Exactly two positional arguments are expected: the model folder and the
    model name. Depending on the options, the function then either removes
    the model (--rm style option), runs one step of an on-disk
    cross-validation, runs a n-fold evaluation, trains and tests, or only
    tests; finally, if a "run" collection is given, it applies the model
    (or the external-ML export) on it.

    The process exits (sys.exit) after a removal and after any of the
    cross-validation steps.
    """
    version = "v.01"
    usage, description, parser = DU_CRF_Task.getBasicTrnTstRunOptionParser(
        sys.argv[0], version)
    # NOTE(review): --docid is parsed but its value is not used anywhere in
    # this function.
    parser.add_option("--docid",
                      dest='docid',
                      action="store",
                      default=None,
                      help="only process docid")
    # ---
    #parse the command line
    (options, args) = parser.parse_args()

    # ---
    try:
        sModelDir, sModelName = args
    except Exception as e:
        traceln("Specify a model folder and a model name!")
        _exit(usage, 1, e)

    doer = DU_BAR(sModelName,
                  sModelDir,
                  C=options.crf_C,
                  tol=options.crf_tol,
                  njobs=options.crf_njobs,
                  max_iter=options.max_iter,
                  inference_cache=options.crf_inference_cache)

    if options.rm:
        doer.rm()
        sys.exit(0)

    lTrn, lTst, lRun, lFold = [
        _checkFindColDir(lsDir)
        for lsDir in [options.lTrn, options.lTst, options.lRun, options.lFold]
    ]

    ## use. a_mpxml files
    doer.sXmlFilenamePattern = doer.sLabeledXmlFilenamePattern

    if options.iFoldInitNum or options.iFoldRunNum or options.bFoldFinish:
        # exactly one of the three steps is run, in this order of priority
        if options.iFoldInitNum:
            # initialization of a cross-validation
            splitter, ts_trn, lFilename_trn = doer._nfold_Init(
                lFold, options.iFoldInitNum, bStoreOnDisk=True)
        elif options.iFoldRunNum:
            # run one fold
            oReport = doer._nfold_RunFoldFromDisk(options.iFoldRunNum,
                                                  options.warm)
            traceln(oReport)
        else:
            # only options.bFoldFinish can be set here
            tstReport = doer._nfold_Finish()
            traceln(tstReport)
        #no more processing!!
        # sys.exit rather than the site-provided exit(), which is meant for
        # the interactive interpreter and is not always available
        sys.exit(0)
        #-------------------

    if lFold:
        loTstRpt = doer.nfold_Eval(lFold, 3, .25, None, options.pkl)
        import graph.GraphModel
        sReportPickleFilename = os.path.join(sModelDir,
                                             sModelName + "__report.txt")
        traceln("Results are in %s" % sReportPickleFilename)
        graph.GraphModel.GraphModel.gzip_cPickle_dump(sReportPickleFilename,
                                                      loTstRpt)
    elif lTrn:
        doer.train_save_test(lTrn, lTst, options.warm, options.pkl)
        # best effort: the baseline may be absent or not a GridSearch.
        # "except Exception" so that Ctrl-C / SystemExit are not swallowed.
        try:
            traceln("Baseline best estimator: %s" %
                    doer.bsln_mdl.best_params_)  #for GridSearch
        except Exception:
            pass
        traceln(" --- CRF Model ---")
        traceln(doer.getModel().getModelInfo())
    elif lTst:
        doer.load()
        tstReport = doer.test(lTst)
        traceln(tstReport)

    if lRun:
        if options.storeX or options.applyY:
            # best effort: a failed load is tolerated here.
            # "except Exception" so that Ctrl-C / SystemExit are not swallowed.
            try:
                doer.load()
            except Exception:
                pass  #we only need the transformer
            lsOutputFilename = doer.runForExternalMLMethod(
                lRun, options.storeX, options.applyY)
        else:
            doer.load()
            lsOutputFilename = doer.predict(lRun)
        traceln("Done, see in:\n  %s" % lsOutputFilename)
def main_command_line(TableSkewedRowCut_CLASS):
    """
    Parse the command line for a skewed-row-cut task class, then either
    store the graph in the XML files (--graph) or delegate to main().

    TableSkewedRowCut_CLASS: the task class. In --graph mode its bCutAbove
        and lRadAngle class attributes are set from the options, and one
        instance is created per input file.

    In --graph mode, every positional argument is an input file; for each,
    a "graph-"-prefixed file is written next to it, and the process exits.
    Otherwise exactly two positional arguments are expected: the model
    folder and the model name.
    """
    version = "v.01"
    usage, description, parser = DU_CRF_Task.getBasicTrnTstRunOptionParser(sys.argv[0], version)

    #FOR GCN
    parser.add_option("--revertEdges", dest='bRevertEdges',  action="store_true", help="Revert the direction of the edges")
    parser.add_option("--detail", dest='bDetailedReport',  action="store_true", default=False,help="Display detailed reporting (score per document)")
    parser.add_option("--baseline", dest='bBaseline',  action="store_true", default=False, help="report baseline method")
    parser.add_option("--line_see_line", dest='iLineVisibility',  action="store",
                      type=int, default=GraphSkewedCut.iLineVisibility,
                      help="seeline2line: how far in pixel can a line see another cut line?")
    parser.add_option("--block_see_line", dest='iBlockVisibility',  action="store",
                      type=int, default=GraphSkewedCut.iBlockVisibility,
                      help="seeblock2line: how far in pixel can a block see a cut line?")
    parser.add_option("--height", dest="fCutHeight", default=GraphSkewedCut.fCutHeight
                      , action="store", type=float, help="Minimal height of a cut")
    parser.add_option("--cut-above", dest='bCutAbove',  action="store_true", default=False
                        ,help="Each object defines one or several cuts above it (instead of below as by default)")
    parser.add_option("--angle", dest='lsAngle'
                      ,  action="store", type="string", default="-1,0,+1"
                        ,help="Allowed cutting angles, in degree, comma-separated")

    parser.add_option("--graph", dest='bGraph',  action="store_true", help="Store the graph in the XML for displaying it")

    # ---
    #parse the command line
    (options, args) = parser.parse_args()

    if options.bGraph:
        import os.path
        # hack: the class attributes are set directly from the options
        TableSkewedRowCut_CLASS.bCutAbove = options.bCutAbove
        traceln("\t%s.bCutAbove=" % TableSkewedRowCut_CLASS.__name__, TableSkewedRowCut_CLASS.bCutAbove)
        # the angles are given in degrees on the command line; converted once
        lRadAngle = [math.radians(float(s)) for s in options.lsAngle.split(",")]
        TableSkewedRowCut_CLASS.lRadAngle = lRadAngle
        traceln("\t%s.lRadAngle=" % TableSkewedRowCut_CLASS.__name__, TableSkewedRowCut_CLASS.lRadAngle)
        for sInputFilename in args:
            sp, sf = os.path.split(sInputFilename)
            sOutFilename = os.path.join(sp, "graph-" + sf)
            doer = TableSkewedRowCut_CLASS("debug", "."
                                           , iBlockVisibility=options.iBlockVisibility
                                           , iLineVisibility=options.iLineVisibility
                                           , fCutHeight=options.fCutHeight
                                           , bCutAbove=options.bCutAbove
                                           # a fresh list per instance, as before
                                           , lRadAngle=list(lRadAngle))
            o = doer.cGraphClass()
            o.parseDocFile(sInputFilename, 9)
            o.parseDocLabels()
            o.addParsedLabelToDom()
            # NOTE(review): this call and the print below were fused into one
            # corrupted, non-parsable line; reconstructed as addEdgeToDoc() -
            # confirm against the graph class.
            o.addEdgeToDoc()
            print('Graph edges added to %s'%sOutFilename)
            o.doc.write(sOutFilename, encoding='utf-8',pretty_print=True,xml_declaration=True)
        SkewedCutAnnotator.gtStatReport()
        # sys.exit rather than the site-provided exit(), which is meant for
        # the interactive interpreter and is not always available
        sys.exit(0)

    # ---
    try:
        sModelDir, sModelName = args
    except Exception as e:
        traceln("Specify a model folder and a model name!")
        _exit(usage, 1, e)

    main(TableSkewedRowCut_CLASS, sModelDir, sModelName, options)