def __init__(self, sModelName, sModelDir, iBlockVisibility=None, iLineVisibility=None, sComment=None, C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
    """Configure the cut-row task and initialize the underlying CRF task.

    Visibility settings are stored as class attributes so that the graph
    class (configured elsewhere) can read them.
    """
    DU_ABPTableRCut.iBlockVisibility = iBlockVisibility
    DU_ABPTableRCut.iLineVisibility = iLineVisibility

    # One (empty) feature-config entry per node type and per edge type.
    dFeatCfg = {
        'row_row': {}, 'row_hdr': {}, 'row_sepH': {},
        'hdr_row': {}, 'hdr_hdr': {}, 'hdr_sepH': {},
        'sepH_row': {}, 'sepH_hdr': {}, 'sepH_sepH': {},
        'row': {}, 'hdr': {}, 'sepH': {},
    }

    # Learner hyper-parameters; each caller-supplied value overrides the default.
    dLearnCfg = {
        'C':               .1 if C is None else C,
        'njobs':           4 if njobs is None else njobs,
        'inference_cache': 50 if inference_cache is None else inference_cache,
        'tol':             .05 if tol is None else tol,
        'save_every':      50,  # checkpoint every 50 iterations, for warm start
        'max_iter':        10 if max_iter is None else max_iter,
    }

    DU_CRF_Task.__init__(self, sModelName, sModelDir,
                         dFeatureConfig=dFeatCfg,
                         dLearnerConfig=dLearnCfg,
                         sComment=sComment,
                         cFeatureDefinition=My_FeatureDefinition_v2)
def __init__(self, sModelName, sModelDir, sComment=None):
    """Initialize the CRF task with small tfidf feature extractors and a
    deliberately tiny learner configuration, then attach a logistic
    regression baseline."""
    dFeatCfg = {
        'n_tfidf_node': 10, 't_ngrams_node': (2, 2), 'b_tfidf_node_lc': False,
        'n_tfidf_edge': 10, 't_ngrams_edge': (2, 2), 'b_tfidf_edge_lc': False,
    }
    dLearnCfg = {
        'C': .1,
        'njobs': 2,
        'inference_cache': 10,
        'tol': .1,
        'save_every': 5,  # checkpoint frequency, for warm start
        'max_iter': 2,
    }
    DU_CRF_Task.__init__(self, sModelName, sModelDir,
                         dFeatureConfig=dFeatCfg,
                         dLearnerConfig=dLearnCfg,
                         sComment=sComment)
    # use a LR model as baseline
    self.addBaseline_LogisticRegression()
def __init__(self, sModelName, sModelDir, sComment=None, C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
    """Initialize the typed-graph CRF task.

    NOTE: configuring the graph class explicitly here is one way to do it;
    defining a getConfiguredGraphClass method is the preferred approach.
    """
    self.configureGraphClass(self.DU_GRAPH)

    dLearnCfg = {
        'C':               .1 if C is None else C,
        'njobs':           8 if njobs is None else njobs,
        'inference_cache': 50 if inference_cache is None else inference_cache,
        'tol':             .05 if tol is None else tol,
        'save_every':      50,  # checkpoint every 50 iterations, for warm start
        'max_iter':        1000 if max_iter is None else max_iter,
    }

    # one entry per node type ("text", "sprtr") and per edge type
    dFeatCfg = {
        "text": None, "sprtr": None,
        "text_text": None, "text_sprtr": None,
        "sprtr_text": None, "sprtr_sprtr": None,
    }

    DU_CRF_Task.__init__(self, sModelName, sModelDir,
                         dLearnerConfig=dLearnCfg,
                         sComment=sComment,
                         cFeatureDefinition=FeatureDefinition_T_PageXml_StandardOnes_noText_v3,
                         dFeatureConfig=dFeatCfg)

    traceln("- classes: ", self.DU_GRAPH.getLabelNameList())

    # use a LR model trained by GridSearch as baseline
    self.bsln_mdl = self.addBaseline_LogisticRegression()
def __init__(self, sModelName, sModelDir,
             iBlockVisibility=None, iLineVisibility=None,
             fCutHeight=None, bCutAbove=None, lRadAngle=None, bTxt=None,
             sComment=None,
             cFeatureDefinition=None, dFeatureConfig=None,
             C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
    """Configure the skewed-row-cut task and initialize the CRF task.

    FIX: ``dFeatureConfig`` previously defaulted to ``{}`` (a mutable default
    argument, shared across calls); it now defaults to ``None``. This is
    backward compatible because the parameter was — and still is — ignored:
    the call below always passes its own fixed feature config.
    NOTE(review): ``cFeatureDefinition`` is likewise accepted but overridden
    by the bTxt-dependent choice below — confirm callers do not rely on it.

    The cut-related settings are stored as class attributes so the graph
    class can read them.
    """
    DU_ABPTableSkewedRowCutLine.iBlockVisibility = iBlockVisibility
    DU_ABPTableSkewedRowCutLine.iLineVisibility = iLineVisibility
    DU_ABPTableSkewedRowCutLine.fCutHeight = fCutHeight
    DU_ABPTableSkewedRowCutLine.bCutAbove = bCutAbove
    DU_ABPTableSkewedRowCutLine.lRadAngle = lRadAngle
    DU_ABPTableSkewedRowCutLine.bTxt = bTxt

    DU_CRF_Task.__init__(
        self, sModelName, sModelDir,
        dFeatureConfig={
            'row_row': {}, 'row_sepH': {},
            'sepH_row': {}, 'sepH_sepH': {},
            'sepH': {}, 'row': {},
        },
        dLearnerConfig={
            'C': .1 if C is None else C,
            'njobs': 4 if njobs is None else njobs,
            'inference_cache': 50 if inference_cache is None else inference_cache,
            'tol': .05 if tol is None else tol,
            'save_every': 50,  # checkpoint every 50 iterations, for warm start
            'max_iter': 10 if max_iter is None else max_iter,
        },
        sComment=sComment,
        # text-aware features only when bTxt was requested
        cFeatureDefinition=My_FeatureDefinition_v3_txt if self.bTxt else My_FeatureDefinition_v3)
def __init__(self, sModelName, sModelDir, sComment=None, C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
    """Initialize the CRF task with the Logit-based feature extractor.

    NOTE: C, tol, max_iter and inference_cache may each be a list, in the
    grid-search case.
    """
    dFeatCfg = {
        'nbClass': nbClass,
        't_ngrams_node': (2, 4),
        'b_node_lc': False,
        't_ngrams_edge': (2, 4),
        'b_edge_lc': False,
        # n_jobs used when fitting the internal Logit feature-extractor
        # model by grid search
        'n_jobs': 5,
    }
    dLearnCfg = {
        'C':               .1 if C is None else C,
        'njobs':           5 if njobs is None else njobs,
        'inference_cache': 50 if inference_cache is None else inference_cache,
        'tol':             .05 if tol is None else tol,
        'save_every':      50,  # checkpoint every 50 iterations, for warm start
        'max_iter':        1000 if max_iter is None else max_iter,
    }
    DU_CRF_Task.__init__(self, sModelName, sModelDir, DU_GRAPH,
                         dFeatureConfig=dFeatCfg,
                         dLearnerConfig=dLearnCfg,
                         sComment=sComment,
                         cFeatureDefinition=FeatureDefinition_PageXml_LogitExtractorV2)

    # so that we check if all classes are represented in the training set
    self.setNbClass(nbClass)

    # use a LR model trained by GridSearch as baseline
    self.bsln_mdl = self.addBaseline_LogisticRegression()
def __init__(self, sModelName, sModelDir, iGridVisibility=None, iBlockVisibility=None, sComment=None, C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
    """Configure the row/grid-line task and initialize the CRF task.

    Visibility settings are stored as class attributes so the graph class
    can read them; class weights are balanced on the model class.
    """
    DU_ABPTableRG.iGridVisibility = iGridVisibility
    DU_ABPTableRG.iBlockVisibility = iBlockVisibility

    dFeatCfg = {
        'row_row': {}, 'row_gh': {},
        'gh_row': {}, 'gh_gh': {},
        'gh': {}, 'row': {},
    }
    dLearnCfg = {
        'C':               .1 if C is None else C,
        'njobs':           4 if njobs is None else njobs,
        'inference_cache': 50 if inference_cache is None else inference_cache,
        'tol':             .05 if tol is None else tol,
        'save_every':      50,  # checkpoint every 50 iterations, for warm start
        'max_iter':        10 if max_iter is None else max_iter,
    }

    DU_CRF_Task.__init__(self, sModelName, sModelDir,
                         dFeatureConfig=dFeatCfg,
                         dLearnerConfig=dLearnCfg,
                         sComment=sComment,
                         cFeatureDefinition=My_FeatureDefinition)

    self.cModelClass.setBalancedWeights(True)
def __init__(self, sModelName, sModelDir, sComment=None, C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
    """Initialize the CRF task with an empty feature configuration and the
    v2 feature definition; caller-supplied hyper-parameters override the
    defaults."""
    dLearnCfg = {
        'C':               .1 if C is None else C,
        'njobs':           4 if njobs is None else njobs,
        'inference_cache': 50 if inference_cache is None else inference_cache,
        'tol':             .05 if tol is None else tol,
        'save_every':      50,  # checkpoint every 50 iterations, for warm start
        'max_iter':        10 if max_iter is None else max_iter,
    }
    DU_CRF_Task.__init__(self, sModelName, sModelDir,
                         dFeatureConfig={},
                         dLearnerConfig=dLearnCfg,
                         sComment=sComment,
                         cFeatureDefinition=My_FeatureDefinition_v2)
def __init__(self, sModelName, sModelDir, feat_select=None, sComment=None):
    """Initialize the CRF task, selecting the chi2 feature configuration
    when requested, then attach a logistic regression baseline."""
    # the two branches differed only in the feature config
    dFeatCfg = dFeatureConfig_FeatSelect if feat_select == 'chi2' else dFeatureConfig_Baseline
    DU_CRF_Task.__init__(self, sModelName, sModelDir, DU_GRAPH,
                         dFeatureConfig=dFeatCfg,
                         dLearnerConfig=dLearnerConfig,
                         sComment=sComment)
    # use a LR model as baseline
    self.addBaseline_LogisticRegression()
def __init__(self, sModelName, sModelDir, sComment=None, C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
    """Initialize the CRF task with the no-text v3 feature definition,
    defaulting the comment to the model name."""
    if sComment is None:
        sComment = sModelName

    dLearnCfg = {
        'C':               .1 if C is None else C,
        'njobs':           2 if njobs is None else njobs,
        'inference_cache': 50 if inference_cache is None else inference_cache,
        'tol':             .05 if tol is None else tol,
        'save_every':      50,  # checkpoint every 50 iterations, for warm start
        'max_iter':        200 if max_iter is None else max_iter,
    }

    DU_CRF_Task.__init__(self, sModelName, sModelDir,
                         dFeatureConfig={},
                         dLearnerConfig=dLearnCfg,
                         sComment=sComment,
                         cFeatureDefinition=FeatureDefinition_PageXml_StandardOnes_noText_v3)

    # NOTE(review): 'options' is a module-level global (parsed command line)
    if options.bBaseline:
        # use a LR model trained by GridSearch as baseline
        self.bsln_mdl = self.addBaseline_LogisticRegression()
def __init__(self, sModelName, sModelDir, sComment=None, C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
    """Initialize the CRF task, choosing text-aware features when self.bHTR
    is set, then attach a logistic regression baseline."""
    if self.bHTR:
        # HTR text is available: use tfidf features on nodes and edges
        cFeatDef = FeatureDefinition_PageXml_StandardOnes
        dFeatCfg = {
            'n_tfidf_node': 100, 't_ngrams_node': (1, 2), 'b_tfidf_node_lc': False,
            'n_tfidf_edge': 100, 't_ngrams_edge': (1, 2), 'b_tfidf_edge_lc': False,
        }
    else:
        # no text: geometry-only features, nothing to configure
        cFeatDef = FeatureDefinition_PageXml_StandardOnes_noText
        dFeatCfg = {}

    dLearnCfg = {
        'C':               .1 if C is None else C,
        'njobs':           16 if njobs is None else njobs,
        'inference_cache': 50 if inference_cache is None else inference_cache,
        'tol':             .05 if tol is None else tol,
        'save_every':      50,  # checkpoint every 50 iterations, for warm start
        'max_iter':        1000 if max_iter is None else max_iter,
    }

    DU_CRF_Task.__init__(self, sModelName, sModelDir,
                         dFeatureConfig=dFeatCfg,
                         dLearnerConfig=dLearnCfg,
                         sComment=sComment,
                         cFeatureDefinition=cFeatDef)

    traceln("- classes: ", self.getGraphClass().getLabelNameList())

    # use a LR model trained by GridSearch as baseline
    self.bsln_mdl = self.addBaseline_LogisticRegression()
def runForExternalMLMethod(self, lsColDir, storeX, applyY, bRevertEdges=False):
    """Run the external-ML pipeline on the given collection directories.

    Forces the multi-page XML filename pattern before delegating to the
    generic DU_CRF_Task implementation.

    Return the list of produced files.
    """
    # work on the multi-page PageXml files
    self.sXmlFilenamePattern = "*.mpxml"
    lsFile = DU_CRF_Task.runForExternalMLMethod(self, lsColDir, storeX, applyY, bRevertEdges)
    return lsFile
except: pass #we only need the transformer lsOutputFilename = doer.runForExternalMLMethod( lRun, options.storeX, options.applyY, options.bRevertEdges) else: doer.load() lsOutputFilename = doer.predict(lRun) traceln("Done, see in:\n %s" % lsOutputFilename) # ---------------------------------------------------------------------------- if __name__ == "__main__": version = "v.01" usage, description, parser = DU_CRF_Task.getBasicTrnTstRunOptionParser( sys.argv[0], version) # parser.add_option("--annotate", dest='bAnnotate', action="store_true",default=False, help="Annotate the textlines with BIES labels") #FOR GCN parser.add_option("--revertEdges", dest='bRevertEdges', action="store_true", help="Revert the direction of the edges") parser.add_option("--detail", dest='bDetailedReport', action="store_true", default=False, help="Display detailed reporting (score per document)") parser.add_option("--baseline", dest='bBaseline', action="store_true",
def predict(self, lsColDir):
    """Predict on the given collection directories.

    Forces the multi-page XML filename pattern before delegating to the
    generic DU_CRF_Task implementation.

    Return the list of produced files.
    """
    # work on the multi-page PageXml files
    self.sXmlFilenamePattern = "*.mpxml"
    lsFile = DU_CRF_Task.predict(self, lsColDir)
    return lsFile
def main(DU_BAR):
    """Command-line driver: parse options, build a DU_BAR task instance and
    dispatch to train / test / n-fold / predict according to the options.

    DU_BAR: the task class to instantiate (takes sModelName, sModelDir and
    the CRF hyper-parameters as keyword arguments).
    """
    version = "v.01"
    usage, description, parser = DU_CRF_Task.getBasicTrnTstRunOptionParser(
        sys.argv[0], version)
    parser.add_option("--docid", dest='docid', action="store",
                      default=None, help="only process docid")
    # ---
    #parse the command line
    (options, args) = parser.parse_args()
    # ---
    try:
        # positional arguments: model folder then model name
        sModelDir, sModelName = args
    except Exception as e:
        traceln("Specify a model folder and a model name!")
        _exit(usage, 1, e)

    doer = DU_BAR(sModelName, sModelDir,
                  C=options.crf_C,
                  tol=options.crf_tol,
                  njobs=options.crf_njobs,
                  max_iter=options.max_iter,
                  inference_cache=options.crf_inference_cache)

    if options.docid:
        sDocId = options.docid
    else:
        sDocId = None

    # remove the model from disk and stop here, if requested
    if options.rm:
        doer.rm()
        sys.exit(0)

    # resolve each option into a list of collection directories
    lTrn, lTst, lRun, lFold = [
        _checkFindColDir(lsDir)
        for lsDir in [options.lTrn, options.lTst, options.lRun, options.lFold]
    ]
    # if options.bAnnotate:
    #     doer.annotateDocument(lTrn)
    #     traceln('annotation done')
    #     sys.exit(0)

    ## use. a_mpxml files
    doer.sXmlFilenamePattern = doer.sLabeledXmlFilenamePattern

    # cross-validation sub-commands: init / run one fold / finish
    if options.iFoldInitNum or options.iFoldRunNum or options.bFoldFinish:
        if options.iFoldInitNum:
            """ initialization of a cross-validation """
            splitter, ts_trn, lFilename_trn = doer._nfold_Init(
                lFold, options.iFoldInitNum, bStoreOnDisk=True)
        elif options.iFoldRunNum:
            """ Run one fold """
            oReport = doer._nfold_RunFoldFromDisk(options.iFoldRunNum,
                                                  options.warm)
            traceln(oReport)
        elif options.bFoldFinish:
            tstReport = doer._nfold_Finish()
            traceln(tstReport)
        else:
            assert False, "Internal error"
        #no more processing!!
        exit(0)
    #-------------------

    if lFold:
        # full n-fold evaluation; pickle the report next to the model
        loTstRpt = doer.nfold_Eval(lFold, 3, .25, None, options.pkl)
        import graph.GraphModel
        sReportPickleFilename = os.path.join(sModelDir,
                                             sModelName + "__report.txt")
        traceln("Results are in %s" % sReportPickleFilename)
        graph.GraphModel.GraphModel.gzip_cPickle_dump(sReportPickleFilename,
                                                      loTstRpt)
    elif lTrn:
        doer.train_save_test(lTrn, lTst, options.warm, options.pkl)
        try:
            # the baseline may be a GridSearch object exposing best_params_
            traceln("Baseline best estimator: %s" % doer.bsln_mdl.best_params_)   #for GridSearch
        except:
            pass
        traceln(" --- CRF Model ---")
        traceln(doer.getModel().getModelInfo())
    elif lTst:
        doer.load()
        tstReport = doer.test(lTst)
        traceln(tstReport)

    if lRun:
        if options.storeX or options.applyY:
            try:
                doer.load()
            except:
                pass
            #we only need the transformer
            lsOutputFilename = doer.runForExternalMLMethod(
                lRun, options.storeX, options.applyY)
        else:
            doer.load()
            lsOutputFilename = doer.predict(lRun)
        traceln("Done, see in:\n %s" % lsOutputFilename)
def main_command_line(TableSkewedRowCut_CLASS):
    """Command-line driver for the skewed-row-cut task.

    Parses the options; with --graph, annotates each input file with its
    graph edges and exits; otherwise delegates to main() with the model
    folder and model name.

    FIX: the graph-dumping loop contained the garbled bare expression
    ``o.addEdgaddEdgeToDoc`` — an attribute access with no call, which
    would raise AttributeError (and even with a valid name would do
    nothing). Replaced with ``o.addEdgeToDoc()``, the method the
    surrounding code clearly intends ("Graph edges added to ..." is
    printed right after).
    """
    version = "v.01"
    usage, description, parser = DU_CRF_Task.getBasicTrnTstRunOptionParser(sys.argv[0], version)
    # parser.add_option("--annotate", dest='bAnnotate', action="store_true",default=False, help="Annotate the textlines with BIES labels")
    #FOR GCN
    parser.add_option("--revertEdges", dest='bRevertEdges', action="store_true",
                      help="Revert the direction of the edges")
    parser.add_option("--detail", dest='bDetailedReport', action="store_true",
                      default=False, help="Display detailed reporting (score per document)")
    parser.add_option("--baseline", dest='bBaseline', action="store_true",
                      default=False, help="report baseline method")
    parser.add_option("--line_see_line", dest='iLineVisibility', action="store",
                      type=int, default=GraphSkewedCut.iLineVisibility,
                      help="seeline2line: how far in pixel can a line see another cut line?")
    parser.add_option("--block_see_line", dest='iBlockVisibility', action="store",
                      type=int, default=GraphSkewedCut.iBlockVisibility,
                      help="seeblock2line: how far in pixel can a block see a cut line?")
    parser.add_option("--height", dest="fCutHeight", default=GraphSkewedCut.fCutHeight,
                      action="store", type=float,
                      help="Minimal height of a cut")
    parser.add_option("--cut-above", dest='bCutAbove', action="store_true", default=False,
                      help="Each object defines one or several cuts above it (instead of below as by default)")
    parser.add_option("--angle", dest='lsAngle', action="store", type="string",
                      default="-1,0,+1",
                      help="Allowed cutting angles, in degree, comma-separated")
    parser.add_option("--graph", dest='bGraph', action="store_true",
                      help="Store the graph in the XML for displaying it")

    # ---
    #parse the command line
    (options, args) = parser.parse_args()

    if options.bGraph:
        import os.path
        # hack: configure the graph class via class attributes
        TableSkewedRowCut_CLASS.bCutAbove = options.bCutAbove
        traceln("\t%s.bCutAbove=" % TableSkewedRowCut_CLASS.__name__,
                TableSkewedRowCut_CLASS.bCutAbove)
        TableSkewedRowCut_CLASS.lRadAngle = [math.radians(v) for v in
                                             [float(s) for s in options.lsAngle.split(",")]]
        traceln("\t%s.lRadAngle=" % TableSkewedRowCut_CLASS.__name__,
                TableSkewedRowCut_CLASS.lRadAngle)

        # annotate each input file with its graph and write it as graph-<name>
        for sInputFilename in args:
            sp, sf = os.path.split(sInputFilename)
            sOutFilename = os.path.join(sp, "graph-" + sf)
            doer = TableSkewedRowCut_CLASS("debug", ".",
                                           iBlockVisibility=options.iBlockVisibility,
                                           iLineVisibility=options.iLineVisibility,
                                           fCutHeight=options.fCutHeight,
                                           bCutAbove=options.bCutAbove,
                                           lRadAngle=[math.radians(float(s)) for s in options.lsAngle.split(",")])
            o = doer.cGraphClass()
            o.parseDocFile(sInputFilename, 9)
            o.parseDocLabels()
            o.addParsedLabelToDom()
            o.addEdgeToDoc()  # FIX: was the garbled no-op expression o.addEdgaddEdgeToDoc
            print('Graph edges added to %s' % sOutFilename)
            o.doc.write(sOutFilename, encoding='utf-8', pretty_print=True, xml_declaration=True)
        SkewedCutAnnotator.gtStatReport()
        exit(0)

    # ---
    try:
        # positional arguments: model folder then model name
        sModelDir, sModelName = args
    except Exception as e:
        traceln("Specify a model folder and a model name!")
        _exit(usage, 1, e)

    main(TableSkewedRowCut_CLASS, sModelDir, sModelName, options)