Exemplo n.º 1
0
    def predict(self, graph, bProba=False):
        """
        Predict the class of each node of the graph.

        graph  - a single Graph object
        bProba - if True, return a pseudo-probability matrix (one-hot of the
                 predicted class) instead of the label vector

        Return a 1-dim numpy array of size the number of nodes of the graph,
        or an (n, 2) float array of 0/1 pseudo-probabilities when bProba.
        """
        [X] = self.get_lX([graph])
        bConstraint = graph.getPageConstraint()
        traceln("\t\t #nodes=%d  #edges=%d " %
                Graph.getNodeEdgeTotalNumber([graph]))
        #we discover here dynamically the number of features of nodes and edges
        self._computeModelCaracteristics([X])
        traceln("\t\t %s" % self._getNbFeatureAsText())

        # force single-job prediction since we pass a single graph;
        # restore the original n_jobs even if prediction raises
        n_jobs = self.ssvm.n_jobs
        self.ssvm.n_jobs = 1
        try:
            if bConstraint:
                [Y] = self._ssvm_ad3plus_predict(
                    [X], [graph.instanciatePageConstraints()])
            else:
                [Y] = self.ssvm.predict([X])
        finally:
            self.ssvm.n_jobs = n_jobs

        if bProba:
            # do like if we return some proba. 0 or 1 actually...
            # similar to 1-hot encoding
            # NOTE(review): assumes a binary task — the original hard-coded
            # width of 2 is kept for backward compatibility
            n = Y.shape[0]
            # use a float dtype: the original used Y.dtype (integer labels),
            # which silently truncated the 1.0 probabilities to int
            Y_proba = np.zeros((n, 2), dtype=np.float64)
            Y_proba[np.arange(n), Y] = 1.0
            return Y_proba
        else:
            return Y
Exemplo n.º 2
0
    def test(self, lGraph, lsDocName=None):
        """
        Test the model using those graphs and report results on stderr
        if some baseline model(s) were set, they are also tested
        Return a Report object
        """
        assert lGraph
        # label names and page-constraint flag come from the first graph
        lsLabel = lGraph[0].getLabelNameList()
        bHasConstraint = lGraph[0].getPageConstraint()

        traceln("- computing features on test set")
        chronoOn("test")
        lX, lY = self.get_lX_lY(lGraph)
        traceln("\t #nodes=%d  #edges=%d " %
                Graph.getNodeEdgeTotalNumber(lGraph))
        #we discover here dynamically the number of features of nodes and edges
        self._computeModelCaracteristics(lX)
        traceln("\t %s" % self._getNbFeatureAsText())
        traceln("[%.1fs] done\n" % chronoOff("test"))

        traceln("- predicting on test set")
        chronoOn("test2")
        if bHasConstraint:
            # AD3+ prediction, honouring the per-page logical constraints
            lY_pred = self._ssvm_ad3plus_predict(
                lX, [g.instanciatePageConstraints() for g in lGraph])
        else:
            lY_pred = self.ssvm.predict(lX)
        traceln(" [%.1fs] done\n" % chronoOff("test2"))

        # main report, with node- and edge-baseline reports attached to it
        oRpt = TestReport(self.sName, lY_pred, lY, lsLabel,
                          lsDocName=lsDocName)
        oRpt.attach(self._testBaselines(lX, lY, lsLabel,
                                        lsDocName=lsDocName))
        oRpt.attach(self._testEdgeBaselines(lX, lY, lsLabel,
                                            lsDocName=lsDocName))

        # do some garbage collection
        del lX, lY
        gc.collect()

        return oRpt
Exemplo n.º 3
0
    def _prepare_for_test(self, lGraph):
        """
        Compute the features of the test graphs and convert them to the
        GCN dataset format.

        Return (gcn_graph_test, lX, lY).
        """
        traceln("\t- computing features on test set")
        traceln("\t\t #nodes=%d  #edges=%d " %
                Graph.getNodeEdgeTotalNumber(lGraph))
        chronoOn()
        lX, lY = self.get_lX_lY(lGraph)
        traceln("\t [%.1fs] done\n" % chronoOff())

        # convert to the dataset format expected by the GCN models
        lGCNGraph = self.convert_lX_lY_to_GCNDataset(lX, lY,
                                                     training=False,
                                                     test=True)
        return lGCNGraph, lX, lY
Exemplo n.º 4
0
    def testFiles(self, lsFilename, loadFun, bBaseLine=False):
        """
        Test the model using those files. The corresponding graphs are loaded using the loadFun function (which must return a singleton list).
        It reports results on stderr

        if some baseline model(s) were set, they are also tested

        Return a Report object
        """
        lX, lY, lY_pred = [], [], []
        lLabelName = None
        traceln("- predicting on test set")
        chronoOn("testFiles")

        #   ??? why commenting this?
        #with tf.Session(graph=self.tf_graph) as session:
        #session.run(self.gcn_model.init)
        #self.gcn_model.restore_model(session, self.getModelFilename())

        for sFilename in lsFilename:
            lg = loadFun(sFilename)  # returns a singleton list
            for g in lg:
                # in conjugate mode the labels live on the edges
                if g.bConjugate: g.computeEdgeLabels()
                [X], [Y] = self.get_lX_lY([g])

                gcn_graph_test = self.convert_lX_lY_to_GCNDataset(
                    [X], [Y], training=False, test=True)
                # "x is None", not "x == None" (PEP 8; avoids __eq__ overloads)
                if lLabelName is None:
                    # first graph: record the label space and feature dims
                    lLabelName = (g.getEdgeLabelNameList()
                                  if g.bConjugate else g.getLabelNameList())
                    traceln("\t #nodes=%d  #edges=%d " %
                            Graph.getNodeEdgeTotalNumber([g]))
                    tNF_EF = (X[0].shape[1], X[2].shape[1])
                    traceln("node-dim,edge-dim", tNF_EF)
                # NOTE(review): label-space consistency across files is
                # assumed, not checked (the original check was commented out)

                [Y_pred] = self.gcn_model.predict_lG(self.tf_session,
                                                     gcn_graph_test,
                                                     verbose=False)
                lY_pred.append(Y_pred)

                lX.append(X)
                lY.append(Y)
                del g  # this can be very large
                gc.collect()

        traceln("[%.1fs] done\n" % chronoOff("testFiles"))

        tstRpt = TestReport(self.sName,
                            lY_pred,
                            lY,
                            lLabelName,
                            lsDocName=lsFilename)

        # TODO: baseline testing (bBaseLine) is currently not implemented here
        # if bBaseLine:
        #     tstRpt.attach(self._testBaselinesEco(lX, lY, lLabelName,
        #                                          lsDocName=lsFilename))

        del lX, lY
        gc.collect()

        return tstRpt
Exemplo n.º 5
0
    def _prepare_for_train(self, lGraph, lGraph_vld):
        """
        Prepare for training ECN or EnsembleECN.

        lGraph     - labelled training graphs
        lGraph_vld - validation graphs; if empty, a validation subset is
                     carved out of the (shuffled) training set using
                     self.model_config['ratio_train_val']

        Return (gcn_graph_train, gcn_graph_val)
        """
        traceln('ECN Training ', self.sName)
        traceln("\t- computing features on training set")
        traceln("\t\t #nodes=%d  #edges=%d " %
                Graph.getNodeEdgeTotalNumber(lGraph))
        chronoOn()

        lX, lY = self.get_lX_lY(lGraph)
        # we discover here dynamically the number of features of nodes and edges
        self._computeModelCaracteristics(lX)
        # self._tNF_EF contains the number of node features and edge features
        traceln("\t\t %s" % self._getNbFeatureAsText())
        traceln("\t [%.1fs] done\n" % chronoOff())
        nb_class = len(
            lGraph[0].getLabelNameList())  #Is it better to do Y.shape ?
        traceln("\t- %d classes" % nb_class)

        traceln("\t- retrieving or creating model...")

        self.model_config['node_dim'] = self._tNF_EF[0]
        self.model_config['edge_dim'] = self._tNF_EF[1]
        self.model_config['nb_class'] = nb_class

        # (dead debug code removed: an `if False:` block dumping (lX, lY)
        #  to a 'linear_reg' pickle file)

        #This converts the lX,lY in the format necessary for GCN Models
        gcn_graph = self.convert_lX_lY_to_GCNDataset(lX, lY, training=True)

        #Save the label Binarizer for prediction usage
        # (with-statement guarantees the file is closed even on error)
        with open(self.getlabelBinarizerFilename(), 'wb') as fd_lb:
            pickle.dump(self.labelBinarizer, fd_lb)

        #TODO Save the validation set too to reproduce experiments
        random.shuffle(gcn_graph)

        if lGraph_vld:
            gcn_graph_train = gcn_graph
            lX_vld, lY_vld = self.get_lX_lY(lGraph_vld)
            gcn_graph_val = self.convert_lX_lY_to_GCNDataset(lX_vld,
                                                             lY_vld,
                                                             test=True)
            del lX_vld, lY_vld
        else:
            #Get a validation set from the training set (at least 1 graph)
            split_idx = max(
                1, int(self.model_config['ratio_train_val'] * len(gcn_graph)))
            traceln(" - using %d train graphs as validation graphs" %
                    split_idx)
            gcn_graph_val = gcn_graph[:split_idx]
            gcn_graph_train = gcn_graph[split_idx:]
        traceln("%d training graphs --  %d validation graphs" %
                (len(gcn_graph_train), len(gcn_graph_val)))
        self._cleanTmpCheckpointFiles()

        return gcn_graph_train, gcn_graph_val
Exemplo n.º 6
0
    def testFiles(self, lsFilename, loadFun, bBaseLine=False):
        """
        Test the model using those files. The corresponding graphs are loaded using the loadFun function (which must return a singleton list).
        It reports results on stderr

        if some baseline model(s) were set, they are also tested

        Return a Report object
        """
        traceln("- predicting on test set")
        chronoOn("testFiles")
        lX, lY, lY_pred = [], [], []
        lLabelName = None

        for sFilename in lsFilename:
            lg = loadFun(sFilename)  # returns a singleton list
            for g in lg:
                # in conjugate mode the labels live on the edges
                if g.bConjugate: g.computeEdgeLabels()
                [X], [Y] = self.get_lX_lY([g])

                [gcn_graph_test] = self.convert_lX_lY_to_GCNDataset(
                    [X], [Y], training=False, test=True)

                # "x is None", not "x == None" (PEP 8; avoids __eq__ overloads)
                if lLabelName is None:
                    # first graph: record the label space and feature dims
                    lLabelName = g.getLabelNameList()
                    traceln("\t #nodes=%d  #edges=%d " %
                            Graph.getNodeEdgeTotalNumber([g]))
                    tNF_EF = (X[0].shape[1], X[2].shape[1])
                    traceln("node-dim,edge-dim", tNF_EF)

                # collect one probability prediction per ensemble member,
                # then average them into a single prediction
                m_pred = []
                for du_model in self.models:
                    [Y_pred] = du_model.gcn_model.predict_prob_lG(
                        du_model.tf_session, [gcn_graph_test], verbose=False)
                    m_pred.append([Y_pred])

                [Y_pred], [_Y_pred_proba
                           ] = DU_Ensemble_ECN.average_prediction(m_pred)

                #lX.append(X)
                lY.append(Y)
                lY_pred.append(Y_pred)
                del _Y_pred_proba
                g.detachFromDoc()
                del g  # this can be very large
            gc.collect()

        traceln("[%.1fs] done\n" % chronoOff("testFiles"))

        tstRpt = TestReport(self.sName,
                            lY_pred,
                            lY,
                            lLabelName,
                            lsDocName=lsFilename)

        del lX, lY, lY_pred
        gc.collect()

        return tstRpt
Exemplo n.º 7
0
    def testFiles(self, lsFilename, loadFun, bBaseLine=False):
        """
        Test the model using those files. The corresponding graphs are loaded using the loadFun function (which must return a singleton list).
        It reports results on stderr
        
        if some baseline model(s) were set, they are also tested
        
        Return a Report object
        """
        lX, lY, lY_pred = [], [], []
        lLabelName = None
        traceln("- predicting on test set")
        chronoOn("testFiles")
        for sFilename in lsFilename:
            lg = loadFun(sFilename)  #returns a singleton list
            for g in lg:
                # in conjugate mode the labels live on the edges
                if self.bConjugate: g.computeEdgeLabels()
                [X], [Y] = self.get_lX_lY([g])

                # "x is None", not "x == None" (PEP 8; avoids __eq__ overloads)
                if lLabelName is None:
                    # first graph: record the label space and feature dims
                    lLabelName = g.getLabelNameList()
                    traceln("\t #nodes=%d  #edges=%d " %
                            Graph.getNodeEdgeTotalNumber([g]))
                    #we discover here dynamically the number of features of nodes and edges
                    self._computeModelCaracteristics([X])
                    traceln("\t %s" % self._getNbFeatureAsText())
                else:
                    assert lLabelName == g.getLabelNameList(
                    ), "Inconsistency among label spaces"

                #since we pass a single graph, let force n_jobs to 1 !!
                # (restored in a finally so an exception cannot leave it at 1)
                n_jobs = self.ssvm.n_jobs
                self.ssvm.n_jobs = 1
                try:
                    if g.getPageConstraint():
                        lConstraints = g.instanciatePageConstraints()
                        [Y_pred] = self._ssvm_ad3plus_predict([X],
                                                              [lConstraints])
                    else:
                        [Y_pred] = self.ssvm.predict([X])
                finally:
                    self.ssvm.n_jobs = n_jobs

                lX.append(X)
                lY.append(Y)
                lY_pred.append(Y_pred)
                #g.detachFromDOM()
                del g  #this can be very large
                gc.collect()
        traceln("[%.1fs] done\n" % chronoOff("testFiles"))

        tstRpt = TestReport(self.sName,
                            lY_pred,
                            lY,
                            lLabelName,
                            lsDocName=lsFilename)

        if bBaseLine:
            lBaselineTestReport = self._testBaselinesEco(lX,
                                                         lY,
                                                         lLabelName,
                                                         lsDocName=lsFilename)
            tstRpt.attach(lBaselineTestReport)

        #do some garbage collection
        del lX, lY
        gc.collect()

        return tstRpt
Exemplo n.º 8
0
    def gridsearch(self, lGraph, verbose=0):
        """
        Do a grid search instead of a normal training.

        lGraph  - labelled training graphs
        verbose - verbosity level passed to GridSearchCV

        The best estimator found becomes self.ssvm and is saved to disk,
        together with the best parameters and the GridSearchCV object itself.
        """
        traceln("--- GRID SEARCH FOR CRF MODEL ---")
        traceln("\t- computing features on training set")
        traceln("\t\t #nodes=%d  #edges=%d " %
                Graph.getNodeEdgeTotalNumber(lGraph))
        chronoOn()
        lX, lY = self.get_lX_lY(lGraph)

        # each hyper-parameter may be a single value or a list of candidates;
        # normalize to lists for GridSearchCV
        # (isinstance, not "type(x) == list" — PEP 8, handles subclasses)
        dPrm = {}
        dPrm['C'] = self.C if isinstance(self.C, list) else [self.C]
        dPrm['tol'] = self.tol if isinstance(self.tol, list) else [self.tol]
        dPrm['inference_cache'] = (self.inference_cache
                                   if isinstance(self.inference_cache, list)
                                   else [self.inference_cache])
        dPrm['max_iter'] = (self.max_iter
                            if isinstance(self.max_iter, list)
                            else [self.max_iter])

        traceln("\t- creating a SSVM-trained CRF model")

        traceln("\t\t- computing class weight:")
        clsWeights = self.computeClassWeight(lY)
        traceln("\t\t\t%s" % clsWeights)

        crf = self._getCRFModel(clsWeights)

        # C, tol, inference_cache and max_iter are supplied by the grid search
        self._ssvm = OneSlackSSVM(crf,
                                  n_jobs=self.njobs,
                                  show_loss_every=10,
                                  verbose=1)

        self._gs_ssvm = GridSearchCV(self._ssvm,
                                     dPrm,
                                     n_jobs=1,
                                     verbose=verbose)
        self.ssvm = None

        chronoOn()
        traceln("\t - training by grid search a graph-based model")
        traceln("\t\t solver parameters for grid search:", " inference_cache=",
                self.inference_cache, " C=", self.C, " tol=", self.tol,
                " n_jobs=", self.njobs, " max_iter=", self.max_iter)
        self._gs_ssvm.fit(lX, lY)
        traceln(
            "\t [%.1fs] done (graph-based model is trained with best parameters, selected by grid search) \n"
            % chronoOff())

        self.ssvm = self._gs_ssvm.best_estimator_  #Estimator that was chosen by the search

        try:
            dBestParams = self._gs_ssvm.best_params_
        except Exception:  # was a bare except; narrowed to Exception
            #fallback: rebuild the dict from the chosen estimator itself
            dBestParams = {
                'C': self.ssvm.C,
                'inference_cache': self.ssvm.inference_cache,
                'max_iter': self.ssvm.max_iter,
                'tol': self.ssvm.tol
            }

        self.storeBestParams(dBestParams)
        traceln("\t", "- " * 20)
        traceln("\tBest parameters: ", dBestParams)
        traceln("\t", "- " * 20)

        # drop the heavy solver state to shrink the model on disk
        try:
            self.ssvm.alphas = None
            self.ssvm.constraints_ = None
            self.ssvm.inference_cache_ = None
            traceln(
                "\t\t(model made slimmer. Not sure you can efficiently warm-start it later on. See option -w.)"
            )
        except Exception as e:
            traceln(
                "\t\t(COULD NOT make the model slimmer. Got exception: %s" %
                str(e))

        logger = SaveLogger(self.getModelFilename())
        logger(self.ssvm)  #save this model!

        traceln(self.getModelInfo())

        #Also save the details of this grid search
        sFN = self.getModelFilename()[:-4] + "GridSearchCV.pkl"
        try:
            self.gzip_cPickle_dump(sFN, self._gs_ssvm)
            traceln("\n\n--- GridSearchCV details: (also in %s)" % sFN)
            traceln("--- Best parameters set found on development set:")
            traceln(self._gs_ssvm.best_params_)
            traceln("--- Grid scores on development set:")
            means = self._gs_ssvm.cv_results_['mean_test_score']
            stds = self._gs_ssvm.cv_results_['std_test_score']
            for mean, std, params in zip(means, stds,
                                         self._gs_ssvm.cv_results_['params']):
                traceln("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
            traceln("--- ---")
        except Exception as e:
            traceln(
                "WARNING: error while dealing with the GridSearchCV object.")
            traceln(e)

        #the baseline model(s) if any
        self._trainBaselines(lX, lY)

        #do some garbage collection
        del lX, lY
        gc.collect()
        return
Exemplo n.º 9
0
    def train(self,
              lGraph_trn,
              lGraph_vld,
              bWarmStart=True,
              expiration_timestamp=None,
              verbose=0):
        """
        Train a CRF model using the list of labelled graphs as training set.

        lGraph_trn - labelled training graphs
        lGraph_vld - labelled validation graphs (may be empty)
        bWarmStart - if True, try to continue from previous training, IF the
                     stored model is older than expiration_timestamp;
                     otherwise start from scratch
        Return nothing.
        """
        if self.bGridSearch:
            return self.gridsearch(lGraph_trn, verbose=verbose)

        traceln("\t- computing features on training set")
        traceln("\t\t #nodes=%d  #edges=%d " %
                Graph.getNodeEdgeTotalNumber(lGraph_trn))
        lX, lY = self.get_lX_lY(lGraph_trn)
        lX_vld, lY_vld = self.get_lX_lY(lGraph_vld)
        bMakeSlim = not bWarmStart  # for warm-start mode, we do not make the model slimmer!

        traceln("\t- retrieving or creating model...")
        self.ssvm = None
        sModelFN = self.getModelFilename()
        if bWarmStart:
            try:
                try:
                    # prefer the very last saved model over the best one
                    self.ssvm = self._loadIfFresh(
                        sModelFN + "._last_", expiration_timestamp,
                        lambda x: SaveLogger(x).load())
                    traceln(
                        "\t- warmstarting with last saved model (not necessarily best one)!"
                    )
                except Exception:  # was a bare except; narrowed to Exception
                    self.ssvm = self._loadIfFresh(
                        sModelFN, expiration_timestamp,
                        lambda x: SaveLogger(x).load())
                    traceln("\t- warmstarting from last best model!")
                #we allow to change the max_iter of the model
                try:
                    self.ssvm.max_iter  #to make sure we do something that makes sense...
                    if self.ssvm.max_iter != self.max_iter:
                        traceln(
                            "\t- changing max_iter value from (stored) %d to %d"
                            % (self.ssvm.max_iter, self.max_iter))
                        self.ssvm.max_iter = self.max_iter
                except AttributeError:
                    traceln("\t- cannot access or change the max_iter value")

                try:
                    self.ssvm.n_jobs  #to make sure we do something that makes sense...
                    if self.ssvm.n_jobs != self.njobs:
                        traceln(
                            "\t- changing n_jobs value from (stored) %d to %d"
                            % (self.ssvm.n_jobs, self.njobs))
                        self.ssvm.n_jobs = self.njobs
                except AttributeError:
                    traceln("\t- cannot access or change the n_jobs value")

            except Exception as e:
                self.ssvm = None
                traceln("\t- Cannot warmstart: %s" % e)
            #self.ssvm is either None or containing a nice ssvm model!!

        chronoOn("train")
        traceln("\t- training graph-based model")
        traceln("\t\t solver parameters:", " inference_cache=",
                self.inference_cache, " C=", self.C, " tol=", self.tol,
                " n_jobs=", self.njobs)

        if not self.ssvm:
            traceln("\t- creating a new SSVM-trained CRF model")

            traceln("\t\t- computing class weight:")
            if self.balanced:
                traceln("\t\tusing balanced weights")
                self.setBalancedWeights()
            clsWeights = self.computeClassWeight(lY)
            traceln("\t\t\t --> %s" % clsWeights)

            # These weights are tuned for best performance of LR and SVM and hence consistently used here
            crf = self._getCRFModel(clsWeights)

            self.ssvm = OneSlackSSVM(crf,
                                     inference_cache=self.inference_cache,
                                     C=self.C,
                                     tol=self.tol,
                                     n_jobs=self.njobs,
                                     logger=SaveLogger(
                                         sModelFN, save_every=self.save_every),
                                     max_iter=self.max_iter,
                                     show_loss_every=10,
                                     verbose=verbose)
            bWarmStart = False  # nothing to warm-start from: brand-new model

        if lGraph_vld:
            self.ssvm.fit_with_valid(lX,
                                     lY,
                                     lX_vld,
                                     lY_vld,
                                     warm_start=bWarmStart,
                                     valid_every=self.save_every)
        else:
            # old classical method
            self.ssvm.fit(lX, lY, warm_start=bWarmStart)
        traceln("\t [%.1fs] done (graph-CRF model is trained) \n" %
                chronoOff("train"))

        #cleaning useless data that takes MB on disk
        if bMakeSlim:
            self.ssvm.alphas = None
            self.ssvm.constraints_ = None
            self.ssvm.inference_cache_ = None
            traceln(
                "\t\t(model made slimmer. Not sure you can efficiently warm-start it later on. See option -w.)"
            )

        #the baseline model(s) if any
        self._trainBaselines(lX, lY)

        #do some garbage collection
        del lX, lY
        gc.collect()

        return