예제 #1
0
 def __init__(self, sModelName, sModelDir,
              iBlockVisibility=None,
              iLineVisibility=None,
              sComment=None,
              C=None, tol=None, njobs=None, max_iter=None,
              inference_cache=None):
     """
     Record the visibility settings on DU_ABPTableRCut, then initialise the
     DU_CRF_Task base with this task's feature and learner configuration.

     iBlockVisibility, iLineVisibility: stored as *class* attributes of
         DU_ABPTableRCut, so the values are shared by every instance.
     sComment: forwarded unchanged to DU_CRF_Task.__init__.
     C, tol, njobs, max_iter, inference_cache: learner settings; any of them
         left to None falls back to the default given in the body.
     """
     DU_ABPTableRCut.iBlockVisibility = iBlockVisibility
     DU_ABPTableRCut.iLineVisibility = iLineVisibility

     # every entry of the feature configuration is an empty dict
     dFeatCfg = {
         'row_row': {},
         'row_hdr': {},
         'row_sepH': {},
         'hdr_row': {},
         'hdr_hdr': {},
         'hdr_sepH': {},
         'sepH_row': {},
         'sepH_hdr': {},
         'sepH_sepH': {},
         'row': {},
         'hdr': {},
         'sepH': {},
     }

     # caller-supplied values win; None means "use the default"
     dLearnCfg = {
         'C': C if C is not None else .1,
         'njobs': njobs if njobs is not None else 4,
         'inference_cache': inference_cache if inference_cache is not None else 50,
         'tol': tol if tol is not None else .05,
         'save_every': 50,  # save every 50 iterations, for warm start
         'max_iter': max_iter if max_iter is not None else 10,
     }

     DU_CRF_Task.__init__(self, sModelName, sModelDir,
                          dFeatureConfig=dFeatCfg,
                          dLearnerConfig=dLearnCfg,
                          sComment=sComment,
                          cFeatureDefinition=My_FeatureDefinition_v2)
예제 #2
0
 def __init__(self, sModelName, sModelDir, sComment=None):
     """
     Initialise the DU_CRF_Task base with fixed feature and learner settings,
     then register a logistic-regression baseline.

     sModelName, sModelDir, sComment: forwarded unchanged to
         DU_CRF_Task.__init__.
     """
     dFeatCfg = {
         'n_tfidf_node': 10,
         't_ngrams_node': (2, 2),
         'b_tfidf_node_lc': False,
         'n_tfidf_edge': 10,
         't_ngrams_edge': (2, 2),
         'b_tfidf_edge_lc': False,
     }

     # all learner values are hard-coded here; max_iter is deliberately tiny
     dLearnCfg = {
         'C': .1,
         'njobs': 2,
         'inference_cache': 10,
         'tol': .1,
         'save_every': 5,  # save every 5 iterations, for warm start
         'max_iter': 2,
     }

     DU_CRF_Task.__init__(self, sModelName, sModelDir,
                          dFeatureConfig=dFeatCfg,
                          dLearnerConfig=dLearnCfg,
                          sComment=sComment)

     # use a LR model as baseline (the returned value is not kept)
     self.addBaseline_LogisticRegression()
예제 #3
0
    def __init__(self, sModelName, sModelDir, sComment=None,
                 C=None, tol=None, njobs=None, max_iter=None,
                 inference_cache=None):
        """
        Configure the graph class, initialise the DU_CRF_Task base, trace the
        label names and register a logistic-regression baseline.

        sModelName, sModelDir, sComment: forwarded unchanged to
            DU_CRF_Task.__init__.
        C, tol, njobs, max_iter, inference_cache: learner settings; any of
            them left to None falls back to the default given in the body.
        """
        # Another way to specify the graph class
        # (defining a getConfiguredGraphClass is preferred).
        # This is done before the base initialiser is called.
        self.configureGraphClass(self.DU_GRAPH)

        # caller-supplied values win; None means "use the default"
        dLearnCfg = {
            'C': C if C is not None else .1,
            'njobs': njobs if njobs is not None else 8,
            'inference_cache': inference_cache if inference_cache is not None else 50,
            'tol': tol if tol is not None else .05,
            'save_every': 50,  # save every 50 iterations, for warm start
            'max_iter': max_iter if max_iter is not None else 1000,
        }

        dFeatCfg = {
            # config for the extractor of nodes of each type
            "text": None,
            "sprtr": None,
            # config for the extractor of edges of each type
            "text_text": None,
            "text_sprtr": None,
            "sprtr_text": None,
            "sprtr_sprtr": None,
        }

        DU_CRF_Task.__init__(
            self, sModelName, sModelDir,
            dLearnerConfig=dLearnCfg,
            sComment=sComment,
            cFeatureDefinition=FeatureDefinition_T_PageXml_StandardOnes_noText_v3,
            dFeatureConfig=dFeatCfg)

        lsLabelName = self.DU_GRAPH.getLabelNameList()
        traceln("- classes: ", lsLabelName)

        # use a LR model trained by GridSearch as baseline
        self.bsln_mdl = self.addBaseline_LogisticRegression()
    def __init__(self,
                 sModelName,
                 sModelDir,
                 iBlockVisibility=None,
                 iLineVisibility=None,
                 fCutHeight=None,
                 bCutAbove=None,
                 lRadAngle=None,
                 bTxt=None,
                 sComment=None,
                 cFeatureDefinition=None,
                 dFeatureConfig={},
                 C=None,
                 tol=None,
                 njobs=None,
                 max_iter=None,
                 inference_cache=None):
        """
        Record the task settings on DU_ABPTableSkewedRowCutLine, then
        initialise the DU_CRF_Task base.

        iBlockVisibility, iLineVisibility, fCutHeight, bCutAbove, lRadAngle,
        bTxt: stored as *class* attributes of DU_ABPTableSkewedRowCutLine, so
            the values are shared by every instance.
        sComment: forwarded unchanged to DU_CRF_Task.__init__.
        cFeatureDefinition, dFeatureConfig: accepted but NOT used by this
            method; the feature definition and feature configuration passed
            to the base class are the ones built in the body.
        C, tol, njobs, max_iter, inference_cache: learner settings; any of
            them left to None falls back to the default given in the body.
        """
        cls = DU_ABPTableSkewedRowCutLine
        cls.iBlockVisibility = iBlockVisibility
        cls.iLineVisibility = iLineVisibility
        cls.fCutHeight = fCutHeight
        cls.bCutAbove = bCutAbove
        cls.lRadAngle = lRadAngle
        cls.bTxt = bTxt

        # every entry of the feature configuration is an empty dict
        dFeatCfg = {
            'row_row': {},
            'row_sepH': {},
            'sepH_row': {},
            'sepH_sepH': {},
            'sepH': {},
            'row': {},
        }

        # caller-supplied values win; None means "use the default"
        dLearnCfg = {
            'C': C if C is not None else .1,
            'njobs': njobs if njobs is not None else 4,
            'inference_cache': inference_cache if inference_cache is not None else 50,
            'tol': tol if tol is not None else .05,
            'save_every': 50,  # save every 50 iterations, for warm start
            'max_iter': max_iter if max_iter is not None else 10,
        }

        # the textual variant is selected only when bTxt is truthy
        if self.bTxt:
            cFeatDef = My_FeatureDefinition_v3_txt
        else:
            cFeatDef = My_FeatureDefinition_v3

        DU_CRF_Task.__init__(self, sModelName, sModelDir,
                             dFeatureConfig=dFeatCfg,
                             dLearnerConfig=dLearnCfg,
                             sComment=sComment,
                             cFeatureDefinition=cFeatDef)
예제 #5
0
    def __init__(self,
                 sModelName,
                 sModelDir,
                 sComment=None,
                 C=None,
                 tol=None,
                 njobs=None,
                 max_iter=None,
                 inference_cache=None):
        """
        Initialise the DU_CRF_Task base with the module-level DU_GRAPH and
        nbClass, declare the number of classes and register a
        logistic-regression baseline.

        sModelName, sModelDir, sComment: forwarded unchanged to
            DU_CRF_Task.__init__.
        C, tol, njobs, max_iter, inference_cache: learner settings; any of
            them left to None falls back to the default given in the body.
            NOTE: we might get a list in C, tol, max_iter, inference_cache
            (in case of gridsearch).
        """
        dFeatCfg = {
            'nbClass': nbClass,
            't_ngrams_node': (2, 4),
            'b_node_lc': False,
            't_ngrams_edge': (2, 4),
            'b_edge_lc': False,
            # n_jobs when fitting the internal Logit feat extractor model
            # by grid search
            'n_jobs': 5,
        }

        # caller-supplied values win; None means "use the default"
        dLearnCfg = {
            'C': C if C is not None else .1,
            'njobs': njobs if njobs is not None else 5,
            'inference_cache': inference_cache if inference_cache is not None else 50,
            'tol': tol if tol is not None else .05,
            'save_every': 50,  # save every 50 iterations, for warm start
            'max_iter': max_iter if max_iter is not None else 1000,
        }

        # DU_GRAPH is passed positionally, right after the model directory
        DU_CRF_Task.__init__(
            self, sModelName, sModelDir, DU_GRAPH,
            dFeatureConfig=dFeatCfg,
            dLearnerConfig=dLearnCfg,
            sComment=sComment,
            cFeatureDefinition=FeatureDefinition_PageXml_LogitExtractorV2)

        # so that we check if all classes are represented in the training set
        self.setNbClass(nbClass)

        # use a LR model trained by GridSearch as baseline
        self.bsln_mdl = self.addBaseline_LogisticRegression()
예제 #6
0
    def __init__(self,
                 sModelName,
                 sModelDir,
                 iGridVisibility=None,
                 iBlockVisibility=None,
                 sComment=None,
                 C=None,
                 tol=None,
                 njobs=None,
                 max_iter=None,
                 inference_cache=None):
        """
        Record the visibility settings on DU_ABPTableRG, initialise the
        DU_CRF_Task base and switch balanced weights on for the model class.

        iGridVisibility, iBlockVisibility: stored as *class* attributes of
            DU_ABPTableRG, so the values are shared by every instance.
        sComment: forwarded unchanged to DU_CRF_Task.__init__.
        C, tol, njobs, max_iter, inference_cache: learner settings; any of
            them left to None falls back to the default given in the body.
        """
        DU_ABPTableRG.iGridVisibility = iGridVisibility
        DU_ABPTableRG.iBlockVisibility = iBlockVisibility

        # every entry of the feature configuration is an empty dict
        dFeatCfg = {
            'row_row': {},
            'row_gh': {},
            'gh_row': {},
            'gh_gh': {},
            'gh': {},
            'row': {},
        }

        # caller-supplied values win; None means "use the default"
        dLearnCfg = {
            'C': C if C is not None else .1,
            'njobs': njobs if njobs is not None else 4,
            'inference_cache': inference_cache if inference_cache is not None else 50,
            'tol': tol if tol is not None else .05,
            'save_every': 50,  # save every 50 iterations, for warm start
            'max_iter': max_iter if max_iter is not None else 10,
        }

        DU_CRF_Task.__init__(self, sModelName, sModelDir,
                             dFeatureConfig=dFeatCfg,
                             dLearnerConfig=dLearnCfg,
                             sComment=sComment,
                             cFeatureDefinition=My_FeatureDefinition)

        # called on self.cModelClass once the base initialiser has run
        self.cModelClass.setBalancedWeights(True)
    def __init__(self, sModelName, sModelDir, sComment=None,
                 C=None, tol=None, njobs=None, max_iter=None,
                 inference_cache=None):
        """
        Initialise the DU_CRF_Task base with an empty feature configuration
        and My_FeatureDefinition_v2 as feature definition.

        sModelName, sModelDir, sComment: forwarded unchanged to
            DU_CRF_Task.__init__.
        C, tol, njobs, max_iter, inference_cache: learner settings; any of
            them left to None falls back to the default given in the body.
        """
        # caller-supplied values win; None means "use the default"
        dLearnCfg = {
            'C': C if C is not None else .1,
            'njobs': njobs if njobs is not None else 4,
            'inference_cache': inference_cache if inference_cache is not None else 50,
            'tol': tol if tol is not None else .05,
            'save_every': 50,  # save every 50 iterations, for warm start
            'max_iter': max_iter if max_iter is not None else 10,
        }

        DU_CRF_Task.__init__(self, sModelName, sModelDir,
                             dFeatureConfig={},
                             dLearnerConfig=dLearnCfg,
                             sComment=sComment,
                             cFeatureDefinition=My_FeatureDefinition_v2)
예제 #8
0
 def __init__(self, sModelName, sModelDir, feat_select=None, sComment=None):
     """
     Initialise the DU_CRF_Task base with one of two module-level feature
     configurations, then register a logistic-regression baseline.

     feat_select: when equal to 'chi2', dFeatureConfig_FeatSelect is used;
         any other value (including None) selects dFeatureConfig_Baseline.
     sModelName, sModelDir, sComment: forwarded unchanged to
         DU_CRF_Task.__init__.
     """
     # only the selected configuration name is evaluated
     dFeatCfg = (dFeatureConfig_FeatSelect if feat_select == 'chi2'
                 else dFeatureConfig_Baseline)

     # DU_GRAPH is passed positionally; the learner configuration is the
     # module-level dLearnerConfig in both cases
     DU_CRF_Task.__init__(self, sModelName, sModelDir, DU_GRAPH,
                          dFeatureConfig=dFeatCfg,
                          dLearnerConfig=dLearnerConfig,
                          sComment=sComment)

     # use a LR model as baseline (the returned value is not kept)
     self.addBaseline_LogisticRegression()
예제 #9
0
    def __init__(self,
                 sModelName,
                 sModelDir,
                 sComment=None,
                 C=None,
                 tol=None,
                 njobs=None,
                 max_iter=None,
                 inference_cache=None):
        """
        Initialise the DU_CRF_Task base and, if requested by the module-level
        options, register a logistic-regression baseline.

        sModelName, sModelDir: forwarded unchanged to DU_CRF_Task.__init__.
        sComment: forwarded to DU_CRF_Task.__init__; when None, the model
            name is used as comment instead.
        C, tol, njobs, max_iter, inference_cache: learner settings; any of
            them left to None falls back to the default given in the body.
        """
        # the comment defaults to the model name
        sComment = sModelName if sComment is None else sComment

        # caller-supplied values win; None means "use the default"
        dLearnCfg = {
            'C': C if C is not None else .1,
            'njobs': njobs if njobs is not None else 2,
            'inference_cache': inference_cache if inference_cache is not None else 50,
            'tol': tol if tol is not None else .05,
            'save_every': 50,  # save every 50 iterations, for warm start
            'max_iter': max_iter if max_iter is not None else 200,
        }

        DU_CRF_Task.__init__(
            self, sModelName, sModelDir,
            dFeatureConfig={},
            dLearnerConfig=dLearnCfg,
            sComment=sComment,
            cFeatureDefinition=FeatureDefinition_PageXml_StandardOnes_noText_v3)

        # bsln_mdl is only assigned when options.bBaseline is truthy
        if options.bBaseline:
            # use a LR model trained by GridSearch as baseline
            self.bsln_mdl = self.addBaseline_LogisticRegression()
예제 #10
0
    def __init__(self, sModelName, sModelDir, sComment=None,
                 C=None, tol=None, njobs=None, max_iter=None,
                 inference_cache=None):
        """
        Pick the feature definition and configuration according to self.bHTR,
        initialise the DU_CRF_Task base, trace the label names and register a
        logistic-regression baseline.

        sModelName, sModelDir, sComment: forwarded unchanged to
            DU_CRF_Task.__init__.
        C, tol, njobs, max_iter, inference_cache: learner settings; any of
            them left to None falls back to the default given in the body.
        """
        # self.bHTR is read before the base initialiser runs
        if not self.bHTR:
            # no-text feature definition, with an empty configuration
            cFeatDef = FeatureDefinition_PageXml_StandardOnes_noText
            dFeatCfg = {}
        else:
            cFeatDef = FeatureDefinition_PageXml_StandardOnes
            dFeatCfg = {
                'n_tfidf_node': 100,
                't_ngrams_node': (1, 2),
                'b_tfidf_node_lc': False,
                'n_tfidf_edge': 100,
                't_ngrams_edge': (1, 2),
                'b_tfidf_edge_lc': False,
            }

        # caller-supplied values win; None means "use the default"
        dLearnCfg = {
            'C': C if C is not None else .1,
            'njobs': njobs if njobs is not None else 16,
            'inference_cache': inference_cache if inference_cache is not None else 50,
            'tol': tol if tol is not None else .05,
            'save_every': 50,  # save every 50 iterations, for warm start
            'max_iter': max_iter if max_iter is not None else 1000,
        }

        DU_CRF_Task.__init__(self, sModelName, sModelDir,
                             dFeatureConfig=dFeatCfg,
                             dLearnerConfig=dLearnCfg,
                             sComment=sComment,
                             cFeatureDefinition=cFeatDef)

        lsLabelName = self.getGraphClass().getLabelNameList()
        traceln("- classes: ", lsLabelName)

        # use a LR model trained by GridSearch as baseline
        self.bsln_mdl = self.addBaseline_LogisticRegression()