def predict(self, graph, bProba=False):
    """
    Predict the class of each node of the graph.

    graph  -- a single graph object to predict on
    bProba -- if True, return a 2-dim array of 0/1 pseudo-probabilities
              (one-hot encoding of the predicted class per node) instead
              of the 1-dim array of predicted labels.

    Return a numpy array: 1-dim of size the number of nodes of the graph,
    or 2-dim (n_nodes x 2) when bProba is True.
    """
    [X] = self.get_lX([graph])
    bConstraint = graph.getPageConstraint()

    traceln("\t\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber([graph]))
    # we discover here dynamically the number of features of nodes and edges
    self._computeModelCaracteristics([X])
    traceln("\t\t %s" % self._getNbFeatureAsText())

    # since we pass a single graph, force n_jobs to 1; restore it even if
    # the prediction raises, so the model is not left in a degraded state
    n_jobs = self.ssvm.n_jobs
    self.ssvm.n_jobs = 1
    try:
        if bConstraint:
            [Y] = self._ssvm_ad3plus_predict([X], [graph.instanciatePageConstraints()])
        else:
            [Y] = self.ssvm.predict([X])
    finally:
        self.ssvm.n_jobs = n_jobs

    if bProba:
        # do like if we return some proba. 0 or 1 actually...
        # similar to 1-hot encoding
        # NOTE(review): hard-coded for 2 classes — a predicted label >= 2
        # would raise an IndexError. TODO confirm this model is binary-only.
        n = Y.shape[0]
        Y_proba = np.zeros((n, 2), dtype=Y.dtype)
        Y_proba[np.arange(n), Y] = 1.0
        return Y_proba
    else:
        return Y
def test(self, lGraph, lsDocName=None):
    """
    Test the model using those graphs and report results on stderr.

    If some baseline model(s) were set, they are also tested.

    lGraph    -- non-empty list of graphs to test on
    lsDocName -- optional list of document names, used in the report

    Return a TestReport object (with baseline and edge-baseline reports
    attached to it).
    """
    assert lGraph
    # label names and page-constraint flag are assumed homogeneous across
    # graphs: taken from the first graph only
    lLabelName = lGraph[0].getLabelNameList()
    bConstraint = lGraph[0].getPageConstraint()

    traceln("- computing features on test set")
    chronoOn("test")
    lX, lY = self.get_lX_lY(lGraph)
    traceln("\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber(lGraph))
    # we discover here dynamically the number of features of nodes and edges
    self._computeModelCaracteristics(lX)
    traceln("\t %s" % self._getNbFeatureAsText())
    traceln("[%.1fs] done\n" % chronoOff("test"))

    traceln("- predicting on test set")
    chronoOn("test2")
    if bConstraint:
        # constrained decoding: one constraint set per graph
        lConstraints = [g.instanciatePageConstraints() for g in lGraph]
        lY_pred = self._ssvm_ad3plus_predict(lX, lConstraints)
    else:
        lY_pred = self.ssvm.predict(lX)
    traceln(" [%.1fs] done\n" % chronoOff("test2"))

    tstRpt = TestReport(self.sName, lY_pred, lY, lLabelName, lsDocName=lsDocName)

    # also evaluate the baseline model(s), if any, on the same data
    lBaselineTestReport = self._testBaselines(lX, lY, lLabelName, lsDocName=lsDocName)
    tstRpt.attach(lBaselineTestReport)
    tstRpt.attach(self._testEdgeBaselines(lX, lY, lLabelName, lsDocName=lsDocName))

    # do some garbage collection (lX/lY can be very large)
    del lX, lY
    gc.collect()

    return tstRpt
def _prepare_for_test(self, lGraph):
    """
    Compute the features of the test graphs and convert them to the
    GCN dataset format.

    Return a 3-tuple (gcn_graph_test, lX, lY): the GCN-format graphs,
    plus the raw feature matrices and labels they were built from.
    """
    traceln("\t- computing features on test set")
    traceln("\t\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber(lGraph))
    chronoOn()
    lFeatures, lLabels = self.get_lX_lY(lGraph)
    traceln("\t [%.1fs] done\n" % chronoOff())
    lGcnTest = self.convert_lX_lY_to_GCNDataset(lFeatures, lLabels,
                                                training=False, test=True)
    return lGcnTest, lFeatures, lLabels
def testFiles(self, lsFilename, loadFun, bBaseLine=False):
    """
    Test the model using those files. The corresponding graphs are loaded
    using the loadFun function (which must return a singleton list).

    Files are processed one at a time to keep memory usage low; each graph
    is deleted right after its prediction.

    It reports results on stderr.
    If some baseline model(s) were set, they are also tested.

    Return a TestReport object.
    """
    lX, lY, lY_pred = [], [], []
    lLabelName = None
    traceln("- predicting on test set")
    chronoOn("testFiles")

    for sFilename in lsFilename:
        lg = loadFun(sFilename)  # returns a singleton list
        for g in lg:
            if g.bConjugate:
                # conjugate mode: predict on edges, so compute edge labels
                g.computeEdgeLabels()
            [X], [Y] = self.get_lX_lY([g])

            gcn_graph_test = self.convert_lX_lY_to_GCNDataset([X], [Y],
                                                              training=False,
                                                              test=True)
            if lLabelName is None:
                # first graph: record the label space and report dimensions
                lLabelName = g.getEdgeLabelNameList() if g.bConjugate else g.getLabelNameList()
                traceln("\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber([g]))
                tNF_EF = (X[0].shape[1], X[2].shape[1])
                traceln("node-dim,edge-dim", tNF_EF)
            # NOTE(review): subsequent graphs are not checked for label-space
            # consistency (the assert was commented out historically)

            [Y_pred] = self.gcn_model.predict_lG(self.tf_session,
                                                 gcn_graph_test,
                                                 verbose=False)
            lY_pred.append(Y_pred)
            lX.append(X)
            lY.append(Y)
            del g  # this can be very large
        gc.collect()

    traceln("[%.1fs] done\n" % chronoOff("testFiles"))
    tstRpt = TestReport(self.sName, lY_pred, lY, lLabelName, lsDocName=lsFilename)

    # TODO: baseline testing (bBaseLine) is currently not implemented here
    # if bBaseLine:
    #     lBaselineTestReport = self._testBaselinesEco(lX, lY, lLabelName, lsDocName=lsFilename)
    #     tstRpt.attach(lBaselineTestReport)

    del lX, lY
    gc.collect()

    return tstRpt
def _prepare_for_train(self, lGraph, lGraph_vld):
    """
    Prepare for training ECN or EnsembleECN.

    Computes the features of the training graphs, stores the discovered
    node/edge feature dimensions and class count in self.model_config,
    converts the data to the GCN dataset format, and persists the label
    binarizer for later prediction.

    lGraph     -- training graphs
    lGraph_vld -- validation graphs; if empty, a validation split is
                  carved out of the (shuffled) training set using
                  self.model_config['ratio_train_val']

    Return (gcn_graph_train, gcn_graph_val).
    """
    traceln('ECN Training ', self.sName)
    traceln("\t- computing features on training set")
    traceln("\t\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber(lGraph))
    chronoOn()
    lX, lY = self.get_lX_lY(lGraph)
    # we discover here dynamically the number of features of nodes and edges
    # self._tNF_EF contains the number of node features and edge features
    self._computeModelCaracteristics(lX)
    traceln("\t\t %s" % self._getNbFeatureAsText())
    traceln("\t [%.1fs] done\n" % chronoOff())

    nb_class = len(lGraph[0].getLabelNameList())  # Is it better to do Y.shape ?
    traceln("\t- %d classes" % nb_class)

    traceln("\t- retrieving or creating model...")
    self.model_config['node_dim'] = self._tNF_EF[0]
    self.model_config['edge_dim'] = self._tNF_EF[1]
    self.model_config['nb_class'] = nb_class

    # This converts the lX,lY in the format necessary for GCN Models
    gcn_graph = self.convert_lX_lY_to_GCNDataset(lX, lY, training=True)

    # Save the label Binarizer for prediction usage
    # (context manager guarantees the file is closed even if pickling fails)
    with open(self.getlabelBinarizerFilename(), 'wb') as fd_lb:
        pickle.dump(self.labelBinarizer, fd_lb)

    # TODO Save the validation set too to reproduce experiments
    random.shuffle(gcn_graph)

    if lGraph_vld:
        # an explicit validation set was provided
        gcn_graph_train = gcn_graph
        lX_vld, lY_vld = self.get_lX_lY(lGraph_vld)
        gcn_graph_val = self.convert_lX_lY_to_GCNDataset(lX_vld, lY_vld, test=True)
        del lX_vld, lY_vld
    else:
        # Get a validation set from the training set
        split_idx = max(1, int(self.model_config['ratio_train_val'] * len(gcn_graph)))
        traceln(" - using %d train graphs as validation graphs" % split_idx)
        gcn_graph_train = []
        gcn_graph_val = []
        gcn_graph_val.extend(gcn_graph[:split_idx])
        gcn_graph_train.extend(gcn_graph[split_idx:])

    traceln("%d training graphs -- %d validation graphs" % (len(gcn_graph_train), len(gcn_graph_val)))
    self._cleanTmpCheckpointFiles()

    return gcn_graph_train, gcn_graph_val
def testFiles(self, lsFilename, loadFun, bBaseLine=False):
    """
    Test the ensemble model using those files. The corresponding graphs are
    loaded using the loadFun function (which must return a singleton list).

    Each file is processed in turn; the probability predictions of every
    model of the ensemble are averaged to produce the final prediction.

    It reports results on stderr.

    Return a TestReport object.
    """
    traceln("- predicting on test set")
    chronoOn("testFiles")
    lX, lY, lY_pred = [], [], []
    lLabelName = None

    for sFilename in lsFilename:
        lg = loadFun(sFilename)  # returns a singleton list
        for g in lg:
            if g.bConjugate:
                # conjugate mode: predict on edges, so compute edge labels
                g.computeEdgeLabels()
            [X], [Y] = self.get_lX_lY([g])
            [gcn_graph_test] = self.convert_lX_lY_to_GCNDataset([X], [Y],
                                                                training=False,
                                                                test=True)
            if lLabelName is None:
                # in conjugate mode the Y are edge labels, so report edge
                # label names (consistent with the single-model testFiles)
                lLabelName = g.getEdgeLabelNameList() if g.bConjugate else g.getLabelNameList()
                traceln("\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber([g]))
                tNF_EF = (X[0].shape[1], X[2].shape[1])
                traceln("node-dim,edge-dim", tNF_EF)

            # collect per-model probability predictions and average them
            m_pred = []
            for du_model in self.models:
                [Y_pred] = du_model.gcn_model.predict_prob_lG(du_model.tf_session,
                                                              [gcn_graph_test],
                                                              verbose=False)
                m_pred.append([Y_pred])
            [Y_pred], [_Y_pred_proba] = DU_Ensemble_ECN.average_prediction(m_pred)

            lY.append(Y)
            lY_pred.append(Y_pred)
            del _Y_pred_proba
            g.detachFromDoc()
            del g  # this can be very large
        gc.collect()

    traceln("[%.1fs] done\n" % chronoOff("testFiles"))
    tstRpt = TestReport(self.sName, lY_pred, lY, lLabelName, lsDocName=lsFilename)

    del lX, lY, lY_pred
    gc.collect()

    return tstRpt
def testFiles(self, lsFilename, loadFun, bBaseLine=False):
    """
    Test the model using those files. The corresponding graphs are loaded
    using the loadFun function (which must return a singleton list).

    Files are processed one at a time to keep memory usage low; each graph
    is deleted right after its prediction.

    It reports results on stderr.
    If some baseline model(s) were set (and bBaseLine is True), they are
    also tested.

    Return a TestReport object.
    """
    lX, lY, lY_pred = [], [], []
    lLabelName = None
    traceln("- predicting on test set")
    chronoOn("testFiles")

    for sFilename in lsFilename:
        lg = loadFun(sFilename)  # returns a singleton list
        for g in lg:
            if self.bConjugate:
                g.computeEdgeLabels()
            [X], [Y] = self.get_lX_lY([g])

            if lLabelName is None:
                # first graph: record the label space and report dimensions
                lLabelName = g.getLabelNameList()
                traceln("\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber([g]))
                # we discover here dynamically the number of features of nodes and edges
                self._computeModelCaracteristics([X])
                traceln("\t %s" % self._getNbFeatureAsText())
            else:
                assert lLabelName == g.getLabelNameList(), "Inconsistency among label spaces"

            # since we pass a single graph, force n_jobs to 1; restore it
            # even if the prediction raises
            n_jobs = self.ssvm.n_jobs
            self.ssvm.n_jobs = 1
            try:
                if g.getPageConstraint():
                    lConstraints = g.instanciatePageConstraints()
                    [Y_pred] = self._ssvm_ad3plus_predict([X], [lConstraints])
                else:
                    [Y_pred] = self.ssvm.predict([X])
            finally:
                self.ssvm.n_jobs = n_jobs

            lX.append(X)
            lY.append(Y)
            lY_pred.append(Y_pred)
            #g.detachFromDOM()
            del g  # this can be very large
        gc.collect()

    traceln("[%.1fs] done\n" % chronoOff("testFiles"))

    tstRpt = TestReport(self.sName, lY_pred, lY, lLabelName, lsDocName=lsFilename)

    if bBaseLine:
        lBaselineTestReport = self._testBaselinesEco(lX, lY, lLabelName, lsDocName=lsFilename)
        tstRpt.attach(lBaselineTestReport)

    # do some garbage collection
    del lX, lY
    gc.collect()

    return tstRpt
def gridsearch(self, lGraph, verbose=0):
    """
    Do a grid search instead of a normal training.

    The hyper-parameters C, tol, inference_cache and max_iter may each be
    a list of candidate values; GridSearchCV explores their combinations,
    the best estimator is kept as self.ssvm, saved to disk, and the best
    parameters are stored via self.storeBestParams.

    lGraph  -- labelled training graphs
    verbose -- verbosity of the GridSearchCV

    Return nothing.
    """
    traceln("--- GRID SEARCH FOR CRF MODEL ---")
    traceln("\t- computing features on training set")
    traceln("\t\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber(lGraph))
    chronoOn()
    lX, lY = self.get_lX_lY(lGraph)

    # each hyper-parameter must be given to GridSearchCV as a list of values
    dPrm = {}
    dPrm['C'] = self.C if type(self.C) == list else [self.C]
    dPrm['tol'] = self.tol if type(self.tol) == list else [self.tol]
    dPrm['inference_cache'] = self.inference_cache if type(self.inference_cache) == list else [self.inference_cache]
    dPrm['max_iter'] = self.max_iter if type(self.max_iter) == list else [self.max_iter]

    traceln("\t- creating a SSVM-trained CRF model")

    traceln("\t\t- computing class weight:")
    clsWeights = self.computeClassWeight(lY)
    traceln("\t\t\t%s" % clsWeights)

    crf = self._getCRFModel(clsWeights)

    # the grid-searched parameters are NOT set here: GridSearchCV sets them
    self._ssvm = OneSlackSSVM(crf
                              #, inference_cache=self.inference_cache, C=self.C, tol=self.tol
                              , n_jobs=self.njobs
                              #, logger=SaveLogger(sModelFN, save_every=self.save_every)
                              #, max_iter=self.max_iter
                              , show_loss_every=10
                              # , verbose=verbose)
                              , verbose=1)

    self._gs_ssvm = GridSearchCV(self._ssvm, dPrm, n_jobs=1, verbose=verbose)
    self.ssvm = None

    chronoOn()
    traceln("\t - training by grid search a graph-based model")
    traceln("\t\t solver parameters for grid search:",
            " inference_cache=", self.inference_cache,
            " C=", self.C, " tol=", self.tol,
            " n_jobs=", self.njobs, " max_iter=", self.max_iter)
    self._gs_ssvm.fit(lX, lY)
    traceln("\t [%.1fs] done (graph-based model is trained with best parameters, selected by grid search) \n" % chronoOff())

    self.ssvm = self._gs_ssvm.best_estimator_  # Estimator that was chosen by the search

    try:  # win32
        dBestParams = self._gs_ssvm.best_params_
    except AttributeError:
        # do not know how to get this in some environments: rebuild the
        # parameter dict from the best estimator itself
        dBestParams = {
            'C': self.ssvm.C,
            'inference_cache': self.ssvm.inference_cache,
            'max_iter': self.ssvm.max_iter,
            'tol': self.ssvm.tol
        }
    self.storeBestParams(dBestParams)
    traceln("\t", "- " * 20)
    traceln("\tBest parameters: ", dBestParams)
    traceln("\t", "- " * 20)

    # cleaning useless data that takes MB on disk
    try:
        self.ssvm.alphas = None
        self.ssvm.constraints_ = None
        self.ssvm.inference_cache_ = None
        traceln("\t\t(model made slimmer. Not sure you can efficiently warm-start it later on. See option -w.)")
    except Exception as e:
        traceln("\t\t(COULD NOT make the model slimmer. Got exception: %s" % str(e))

    logger = SaveLogger(self.getModelFilename())
    logger(self.ssvm)  # save this model!

    traceln(self.getModelInfo())

    # Also save the details of this grid search
    sFN = self.getModelFilename()[:-4] + "GridSearchCV.pkl"
    try:
        self.gzip_cPickle_dump(sFN, self._gs_ssvm)
        traceln("\n\n--- GridSearchCV details: (also in %s)" % sFN)
        traceln("--- Best parameters set found on development set:")
        traceln(self._gs_ssvm.best_params_)
        traceln("--- Grid scores on development set:")
        means = self._gs_ssvm.cv_results_['mean_test_score']
        stds = self._gs_ssvm.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, self._gs_ssvm.cv_results_['params']):
            traceln("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        traceln("--- ---")
    except Exception as e:
        traceln("WARNING: error while dealing with the GridSearchCV object.")
        traceln(e)

    # the baseline model(s) if any
    self._trainBaselines(lX, lY)

    # do some garbage collection
    del lX, lY
    gc.collect()

    return
def train(self, lGraph_trn, lGraph_vld, bWarmStart=True, expiration_timestamp=None, verbose=0):
    """
    Train a CRF model using the list of labelled graph as training.

    lGraph_trn -- training graphs
    lGraph_vld -- validation graphs; if empty, a classical fit is done
                  on the training set only
    bWarmStart -- if True, try to continue from previous training,
                  IF the stored model is older than expiration_timestamp!!
                  , otherwise, starts from scratch
    verbose    -- verbosity passed to the SSVM solver

    return nothing
    """
    if self.bGridSearch:
        # grid-search mode entirely replaces the normal training
        return self.gridsearch(lGraph_trn, verbose=verbose)

    traceln("\t- computing features on training set")
    traceln("\t\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber(lGraph_trn))
    lX, lY = self.get_lX_lY(lGraph_trn)
    lX_vld, lY_vld = self.get_lX_lY(lGraph_vld)

    bMakeSlim = not bWarmStart  # for warm-start mode, we do not make the model slimer!"

    traceln("\t- retrieving or creating model...")
    self.ssvm = None
    sModelFN = self.getModelFilename()
    if bWarmStart:
        try:
            try:
                # prefer the last saved snapshot over the best-scoring one
                self.ssvm = self._loadIfFresh(sModelFN + "._last_", expiration_timestamp, lambda x: SaveLogger(x).load())
                traceln("\t- warmstarting with last saved model (not necessarily best one)!")
            except:
                self.ssvm = self._loadIfFresh(sModelFN, expiration_timestamp, lambda x: SaveLogger(x).load())
                traceln("\t- warmstarting from last best model!")
            # we allow to change the max_iter of the model
            try:
                self.ssvm.max_iter  # to make sure we do something that makes sense...
                if self.ssvm.max_iter != self.max_iter:
                    traceln("\t- changing max_iter value from (stored) %d to %d" % (self.ssvm.max_iter, self.max_iter))
                    self.ssvm.max_iter = self.max_iter
            except AttributeError:
                traceln("\t- cannot access or change the max_iter value")
            # same for n_jobs: align the stored model with the current config
            try:
                self.ssvm.n_jobs  # to make sure we do something that makes sense...
                if self.ssvm.n_jobs != self.njobs:
                    traceln("\t- changing n_jobs value from (stored) %d to %d" % (self.ssvm.n_jobs, self.njobs))
                    self.ssvm.n_jobs = self.njobs
            except AttributeError:
                traceln("\t- cannot access or change the n_jobs value")
        except Exception as e:
            # any failure during warm-start falls back to a fresh model
            self.ssvm = None
            traceln("\t- Cannot warmstart: %s" % e)
        # self.ssvm is either None or containing a nice ssvm model!!

    chronoOn("train")
    traceln("\t- training graph-based model")
    traceln("\t\t solver parameters:", " inference_cache=", self.inference_cache, " C=", self.C, " tol=", self.tol, " n_jobs=", self.njobs)

    if not self.ssvm:
        traceln("\t- creating a new SSVM-trained CRF model")

        traceln("\t\t- computing class weight:")
        if self.balanced:
            traceln("\t\tusing balanced weights")
            self.setBalancedWeights()
        clsWeights = self.computeClassWeight(lY)
        traceln("\t\t\t --> %s" % clsWeights)
        #clsWeights = np.array([1, 4.5])
        # These weights are tuned for best performance of LR and SVM and hence consistently used here

        crf = self._getCRFModel(clsWeights)

        self.ssvm = OneSlackSSVM(crf,
                                 inference_cache=self.inference_cache,
                                 C=self.C,
                                 tol=self.tol,
                                 n_jobs=self.njobs,
                                 logger=SaveLogger(sModelFN, save_every=self.save_every),
                                 max_iter=self.max_iter,
                                 show_loss_every=10,
                                 verbose=verbose)

        bWarmStart = False  # brand-new model: nothing to warm-start from

    if lGraph_vld:
        self.ssvm.fit_with_valid(lX, lY, lX_vld, lY_vld,
                                 warm_start=bWarmStart,
                                 valid_every=self.save_every)
    else:
        # old classical method
        self.ssvm.fit(lX, lY, warm_start=bWarmStart)
    traceln("\t [%.1fs] done (graph-CRF model is trained) \n" % chronoOff("train"))

    #traceln(self.getModelInfo())

    # cleaning useless data that takes MB on disk
    if bMakeSlim:
        self.ssvm.alphas = None
        self.ssvm.constraints_ = None
        self.ssvm.inference_cache_ = None
        traceln("\t\t(model made slimmer. Not sure you can efficiently warm-start it later on. See option -w.)")

    # the baseline model(s) if any
    self._trainBaselines(lX, lY)

    # do some garbage collection
    del lX, lY
    gc.collect()

    return