Exemplo n.º 1
0
 def createStatObj(self, results=None, exp_pred=None, responseType=None, nExtFolds=None):
     #Initialize res (statObj) for statistic results
     res = {}
     # Classification
     res["CA"] = None
     res["CM"] = None
     res["MCC"] = None
     #Regression
     res["R2"] = None
     res["RMSE"] = None
     #Both
     res["StabilityValue"] = None
     res["foldStat"] = {
             #Regression
             "R2"   : None,
             "RMSE" : None,
             #Classification
             "CM"   : None,
             "CA"   : None,
             "MCC"  : None }
     if results is None or exp_pred is None or responseType is None or nExtFolds is None:
         return res 
     #Calculate the (R2, RMSE) or (CM, CA) results depending on Classification or regression
     if responseType == "Classification":
         #Compute CA
         res["CA"] = sum(r[0] for r in results) / self.nExtFolds
         #Compute CM
         res["CM"] = results[0][1]                      # Get the first ConfMat
         for r in results[1:]:
             for Lidx,line in enumerate(r[1]):
                 for idx,val in enumerate(line):
                     res["CM"][Lidx][idx] = res["CM"][Lidx][idx] + val   #Add each same ConfMat position
         #Compute MCC 
         res["MCC"] = evalUtilities.calcMCC(res["CM"])
         #Compute foldStat
         res["foldStat"]["CA"] = [r[0] for r in results]
         res["foldStat"]["CM"] = [r[1] for r in results]
         res["foldStat"]["MCC"] = [evalUtilities.calcMCC(r[1]) for r in results]
         #Compute Stability
         res["StabilityValue"] = evalUtilities.stability(res["foldStat"]["CA"])
     else:
         #compute R2
         res["R2"] = evalUtilities.calcRsqrt(exp_pred)
         #compute RMSE
         res["RMSE"] = evalUtilities.calcRMSE(exp_pred)
         #Compute foldStat
         res["foldStat"]["RMSE"] = [r[0] for r in results]
         res["foldStat"]["R2"] = [r[1] for r in results]
         #Compute Stability
         res["StabilityValue"] = evalUtilities.stability(res["foldStat"]["R2"])
     return res
Exemplo n.º 2
0
    def getProbabilitiesAsAttribute(self, algorithm=None, minsup=None, atts=None):
        """ For regression problems, it returns the RMSE and the Q2 
            For Classification problems, it returns CA and the ConfMat
            The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
            For the EvalResults not supported for a specific learner/datase, the respective result will be None

            if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus
                made out of those that were stable

            It some error occurred, the respective values in the Dict will be None
                
			parameters:
                algo - key for the structural feature generation algorithm (set dependent structural features that have to be calculated inside the crossvalidation)
                minsup - minimum support for the algorithm
                atts - attributes to be removed before learning (e.g. meta etc...)
        """
        self.__log("Starting Calculating MLStatistics")
        statistics = {}
        if not self.__areInputsOK():
            return None

        if algorithm:
            self.__log(" Additional features to be calculated inside of cross-validation")
            self.__log(" Algorithm for structural features: " + str(algorithm))
            self.__log(" Minimum support parameter: " + str(minsup))

        # Set the response type
        self.responseType = (
            self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression"
        )
        self.__log("  " + str(self.responseType))

        # Create the Train and test sets
        DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds)

        # Var for saving each Fols result
        optAcc = {}
        results = {}
        exp_pred = {}
        nTrainEx = {}
        nTestEx = {}

        # Set a dict of learners
        MLmethods = {}
        if type(self.learner) == dict:
            for ml in self.learner:
                MLmethods[ml] = self.learner[ml]
        else:
            MLmethods[self.learner.name] = self.learner

        models = {}
        rocs = {}
        self.__log("Calculating Statistics for MLmethods:")
        self.__log("  " + str([x for x in MLmethods]))

        # Check data in advance so that, by chance, it will not faill at the last fold!
        for foldN in range(self.nExtFolds):
            trainData = self.data.select(DataIdxs[foldN], negate=1)
            self.__checkTrainData(trainData)

        # Optional!!
        # Order Learners so that PLS is the first
        sortedML = [ml for ml in MLmethods]
        if "PLS" in sortedML:
            sortedML.remove("PLS")
            sortedML.insert(0, "PLS")

        for ml in sortedML:
            self.__log("    > " + str(ml) + "...")
            try:
                # Var for saving each Fols result
                results[ml] = []
                exp_pred[ml] = []
                models[ml] = []
                rocs[ml] = []
                nTrainEx[ml] = []
                nTestEx[ml] = []
                optAcc[ml] = []

                ### mods TG
                prediction_attribute = orange.FloatVariable("class_prob")
                domain = [data.domain.attributes, prediction_attribute, data.domain.classvar]
                data_new = orange.ExampleTable(domain)

                logTxt = ""
                for foldN in range(self.nExtFolds):
                    if type(self.learner) == dict:
                        self.paramList = None

                    trainData = self.data.select(DataIdxs[foldN], negate=1)
                    orig_len = len(trainData.domain.attributes)
                    # add structural descriptors to the training data (TG)
                    if algorithm:
                        trainData_structDesc = getStructuralDesc.getStructuralDescResult(trainData, algorithm, minsup)
                        trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, atts)

                    testData = self.data.select(DataIdxs[foldN])
                    # print "IDX: ",
                    # print DataIdxs[foldN]
                    # calculate the feature values for the test data (TG)
                    if algorithm:
                        cut_off = orig_len - len(atts)
                        smarts = trainData.domain.attributes[cut_off:]
                        self.__log("  Number of structural features added: " + str(len(smarts)))
                        testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(testData, smarts)
                        testData = dataUtilities.attributeDeselectionData(testData_structDesc, atts)

                    nTrainEx[ml].append(len(trainData))
                    nTestEx[ml].append(len(testData))
                    # Test if trainsets inside optimizer will respect dataSize criterias.
                    #  if not, don't optimize, but still train the model
                    dontOptimize = False
                    if self.responseType != "Classification" and (len(trainData) * (1 - 1.0 / self.nInnerFolds) < 20):
                        dontOptimize = True
                    else:
                        tmpDataIdxs = dataUtilities.SeedDataSampler(trainData, self.nInnerFolds)
                        tmpTrainData = trainData.select(tmpDataIdxs[0], negate=1)
                        if not self.__checkTrainData(tmpTrainData, False):
                            dontOptimize = True

                    if dontOptimize:
                        logTxt += (
                            "       Fold " + str(foldN) + ": Too few compounds to optimize model hyper-parameters\n"
                        )
                        self.__log(logTxt)
                        if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                            res = orngTest.crossValidation(
                                [MLmethods[ml]],
                                trainData,
                                folds=5,
                                strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                randomGenerator=random.randint(0, 100),
                            )
                            CA = evalUtilities.CA(res)[0]
                            optAcc[ml].append(CA)
                        else:
                            res = orngTest.crossValidation(
                                [MLmethods[ml]],
                                trainData,
                                folds=5,
                                strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                randomGenerator=random.randint(0, 100),
                            )
                            R2 = evalUtilities.R2(res)[0]
                            optAcc[ml].append(R2)
                    else:
                        runPath = miscUtilities.createScratchDir(
                            baseDir=AZOC.NFS_SCRATCHDIR, desc="AccWOptParam", seed=id(trainData)
                        )
                        trainData.save(os.path.join(runPath, "trainData.tab"))

                        tunedPars = paramOptUtilities.getOptParam(
                            learner=MLmethods[ml],
                            trainDataFile=os.path.join(runPath, "trainData.tab"),
                            paramList=self.paramList,
                            useGrid=False,
                            verbose=self.verbose,
                            queueType=self.queueType,
                            runPath=runPath,
                            nExtFolds=None,
                            nFolds=self.nInnerFolds,
                            logFile=self.logFile,
                            getTunedPars=True,
                        )
                        if not MLmethods[ml] or not MLmethods[ml].optimized:
                            self.__log(
                                "       WARNING: GETACCWOPTPARAM: The learner " + str(ml) + " was not optimized."
                            )
                            self.__log("                It will be ignored")
                            # self.__log("                It will be set to default parameters")
                            self.__log("                    DEBUG can be done in: " + runPath)
                            # Set learner back to default
                            # MLmethods[ml] = MLmethods[ml].__class__()
                            raise Exception("The learner " + str(ml) + " was not optimized.")
                        else:
                            if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                                optAcc[ml].append(tunedPars[0])
                            else:
                                res = orngTest.crossValidation(
                                    [MLmethods[ml]],
                                    trainData,
                                    folds=5,
                                    strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                    randomGenerator=random.randint(0, 100),
                                )
                                R2 = evalUtilities.R2(res)[0]
                                optAcc[ml].append(R2)

                            miscUtilities.removeDir(runPath)
                    # Train the model
                    model = MLmethods[ml](trainData)
                    models[ml].append(model)
                    # Test the model
                    if self.responseType == "Classification":
                        results[ml].append(
                            (
                                evalUtilities.getClassificationAccuracy(testData, model),
                                evalUtilities.getConfMat(testData, model),
                            )
                        )
                        roc = self.aroc(testData, [model])
                        rocs[ml].append(roc)

                    # save the prediction probabilities

                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        results[ml].append(
                            (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))
                        )
                        # Save the experimental value and correspondent predicted value
                        exp_pred[ml] += local_exp_pred

                res = self.createStatObj(
                    results[ml],
                    exp_pred[ml],
                    nTrainEx[ml],
                    nTestEx[ml],
                    self.responseType,
                    self.nExtFolds,
                    logTxt,
                    rocs[ml],
                )
                if self.verbose > 0:
                    print "UnbiasedAccuracyGetter!Results  " + ml + ":\n"
                    pprint(res)
                if not res:
                    raise Exception("No results available!")
                statistics[ml] = copy.deepcopy(res)
                self.__writeResults(statistics)
                self.__log("       OK")
            except:
                self.__log("       Learner " + str(ml) + " failed to create/optimize the model!")
                res = self.createStatObj()
                statistics[ml] = copy.deepcopy(res)
                self.__writeResults(statistics)

        if not statistics or len(statistics) < 1:
            self.__log("ERROR: No statistics to return!")
            return None
        elif len(statistics) > 1:
            # We still need to build a consensus model out of the stable models
            #   ONLY if there are more that one model stable!
            #   When only one or no stable models, build a consensus based on all models
            consensusMLs = {}
            for modelName in statistics:
                StabilityValue = statistics[modelName]["StabilityValue"]
                if StabilityValue is not None and statistics[modelName]["stable"]:
                    consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

            self.__log(
                "Found " + str(len(consensusMLs)) + " stable MLmethods out of " + str(len(statistics)) + " MLmethods."
            )

            if len(consensusMLs) <= 1:  # we need more models to build a consensus!
                consensusMLs = {}
                for modelName in statistics:
                    consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

            if len(consensusMLs) >= 2:
                # Var for saving each Fols result
                Cresults = []
                Cexp_pred = []
                CnTrainEx = []
                CnTestEx = []
                self.__log(
                    "Calculating the statistics for a Consensus model based on " + str([ml for ml in consensusMLs])
                )
                for foldN in range(self.nExtFolds):
                    if self.responseType == "Classification":
                        CLASS0 = str(self.data.domain.classVar.values[0])
                        CLASS1 = str(self.data.domain.classVar.values[1])
                        exprTest0 = "(0"
                        for ml in consensusMLs:
                            exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(optAcc[ml][foldN]) + " "
                        exprTest0 += ")/IF0(sum([False"
                        for ml in consensusMLs:
                            exprTest0 += ", " + ml + " == " + CLASS0 + " "
                        exprTest0 += "]),1)"
                        exprTest1 = exprTest0.replace(CLASS0, CLASS1)
                        expression = [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1]
                    else:
                        Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs])
                        expression = "(1 / " + str(Q2sum) + ") * (0"
                        for ml in consensusMLs:
                            expression += " + " + str(optAcc[ml][foldN]) + " * " + ml + " "
                        expression += ")"

                    testData = self.data.select(DataIdxs[foldN])
                    CnTestEx.append(len(testData))
                    consensusClassifiers = {}
                    for learnerName in consensusMLs:
                        consensusClassifiers[learnerName] = models[learnerName][foldN]

                    model = AZorngConsensus.ConsensusClassifier(classifiers=consensusClassifiers, expression=expression)
                    CnTrainEx.append(model.NTrainEx)
                    # Test the model
                    if self.responseType == "Classification":
                        Cresults.append(
                            (
                                evalUtilities.getClassificationAccuracy(testData, model),
                                evalUtilities.getConfMat(testData, model),
                            )
                        )
                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        Cresults.append(
                            (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))
                        )
                        # Save the experimental value and correspondent predicted value
                        Cexp_pred += local_exp_pred

                res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds)
                statistics["Consensus"] = copy.deepcopy(res)
                statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs)
                self.__writeResults(statistics)
            self.__log("Returned multiple ML methods statistics.")
            return statistics

        # By default return the only existing statistics!
        self.__writeResults(statistics)
        self.__log("Returned only one ML method statistics.")
        return statistics[statistics.keys()[0]]
Exemplo n.º 3
0
    def getAcc(self, callBack=None, algorithm=None, params=None, atts=None, holdout=None):
        """ For regression problems, it returns the RMSE and the Q2 
            For Classification problems, it returns CA and the ConfMat
            The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
            For the EvalResults not supported for a specific learner/datase, the respective result will be None

            if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus
                made out of those that were stable

            It some error occurred, the respective values in the Dict will be None
                
			parameters:
                algorithm - list of feature generation algorithms (set dependent features that have to be calculated inside the crossvalidation)
                params - dictionary of parameters
                atts - attributes to be removed before learning (e.g. meta etc...)
        """
        self.__log("Starting Calculating MLStatistics")
        statistics = {}
        if not self.__areInputsOK():
            return None

        if holdout:
            self.nExtFolds = 1

        if algorithm:
            self.__log(" Additional features to be calculated inside of cross-validation")
            for i in algorithm:
                self.__log(" Algorithm: " + str(i))
            for j, v in params.iteritems():
                self.__log(" Parameter: " + str(j) + " = " + str(v))

        # Set the response type
        self.responseType = (
            self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression"
        )
        self.__log("  " + str(self.responseType))

        # Create the Train and test sets
        DataIdxs = None
        if holdout:
            self.__log("Using hold out evaluation with " + str(holdout) + "*100 % of data for training")
            DataIdxs = dataUtilities.SeedDataSampler_holdOut(self.data, holdout)
        else:
            DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds)

        # Var for saving each Fols result
        optAcc = {}
        results = {}
        exp_pred = {}
        nTrainEx = {}
        nTestEx = {}

        # Set a dict of learners
        MLmethods = {}
        if type(self.learner) == dict:
            for ml in self.learner:
                MLmethods[ml] = self.learner[ml]
        else:
            MLmethods[self.learner.name] = self.learner

        models = {}
        rocs = {}
        self.__log("Calculating Statistics for MLmethods:")
        self.__log("  " + str([x for x in MLmethods]))

        # Check data in advance so that, by chance, it will not fail at the last fold!
        for foldN in range(self.nExtFolds):
            trainData = self.data.select(DataIdxs[foldN], negate=1)
            self.__checkTrainData(trainData)

        # Optional!!
        # Order Learners so that PLS is the first
        sortedML = [ml for ml in MLmethods]
        if "PLS" in sortedML:
            sortedML.remove("PLS")
            sortedML.insert(0, "PLS")

        stepsDone = 0
        nTotalSteps = len(sortedML) * self.nExtFolds
        for ml in sortedML:
            self.__log("    > " + str(ml) + "...")
            try:
                # Var for saving each Fols result
                results[ml] = []
                exp_pred[ml] = []
                models[ml] = []
                rocs[ml] = []
                nTrainEx[ml] = []
                nTestEx[ml] = []
                optAcc[ml] = []
                logTxt = ""

                for foldN in range(self.nExtFolds):
                    if type(self.learner) == dict:
                        self.paramList = None

                    trainData = self.data.select(DataIdxs[foldN], negate=1)
                    orig_len = len(trainData.domain.attributes)
                    refs = None
                    methods = [
                        "rdk_MACCS_keys",
                        "rdk_topo_fps",
                        "rdk_morgan_fps",
                        "rdk_morgan_features_fps",
                        "rdk_atompair_fps",
                    ]
                    train_domain = None
                    # add structural descriptors to the training data (TG)
                    if algorithm:
                        for i in range(len(algorithm)):
                            if algorithm[i] == "structClust":
                                self.__log("Algorithm " + str(i) + ": " + str(algorithm[i]))
                                actData = orange.ExampleTable(trainData.domain)
                                for d in trainData:
                                    # only valid for simboosted qsar paper experiments!?
                                    if d.getclass() == "2":
                                        actData.append(d)

                                refs = structuralClustering.getReferenceStructures(
                                    actData,
                                    threshold=params["threshold"],
                                    minClusterSize=params["minClusterSize"],
                                    numThreads=2,
                                )
                                self.__log(
                                    " found "
                                    + str(len(refs))
                                    + " reference structures in "
                                    + str(len(actData))
                                    + " active structures"
                                )
                                orig_len = orig_len + (len(refs) * len(methods))
                                trainData_sim = SimBoostedQSAR.getSimDescriptors(refs, trainData, methods)

                                if i == (len(algorithm) - 1):
                                    trainData = dataUtilities.attributeDeselectionData(trainData_sim, atts)
                                else:
                                    trainData = dataUtilities.attributeDeselectionData(trainData_sim, [])

                            elif algorithm[i] == "ECFP":
                                self.__log("Algorithm " + str(i) + ": " + str(algorithm[i]))
                                trainData_ecfp = getCinfonyDesc.getCinfonyDescResults(trainData, ["rdk.FingerPrints"])
                                train_domain = trainData_ecfp.domain
                                if i == (len(algorithm) - 1):
                                    trainData = dataUtilities.attributeDeselectionData(trainData_ecfp, atts)
                                else:
                                    trainData = dataUtilities.attributeDeselectionData(trainData_ecfp, [])

                            else:
                                self.__log("Algorithm " + str(i) + ": " + str(algorithm[i]))
                                trainData_structDesc = getStructuralDesc.getStructuralDescResult(
                                    trainData, algorithm[i], params["minsup"]
                                )
                                if i == (len(algorithm) - 1):
                                    trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, atts)
                                else:
                                    trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, [])

                                    # trainData.save("/home/girschic/proj/AZ/ProjDev/train.tab")
                    testData = self.data.select(DataIdxs[foldN])
                    # calculate the feature values for the test data (TG)
                    if algorithm:
                        for i in range(len(algorithm)):
                            if algorithm[i] == "structClust":
                                self.__log(str(algorithm[i]))
                                testData_sim = SimBoostedQSAR.getSimDescriptors(refs, testData, methods)
                                if i == (len(algorithm) - 1):
                                    testData = dataUtilities.attributeDeselectionData(testData_sim, atts)
                                else:
                                    testData = dataUtilities.attributeDeselectionData(testData_sim, [])
                            elif algorithm[i] == "ECFP":
                                self.__log(str(algorithm[i]))
                                # testData_ecfp = orange.ExampleTable(train_domain)
                                tmp_dat = []
                                for d in testData:
                                    tmp = getCinfonyDesc.getRdkFPforTestInstance(train_domain, d)
                                    tmp_dat.append(tmp)
                                testData_ecfp = orange.ExampleTable(tmp_dat[0].domain, tmp_dat)
                                if i == (len(algorithm) - 1):
                                    # 						print "removing atts"
                                    testData = dataUtilities.attributeDeselectionData(testData_ecfp, atts)
                                else:
                                    # 						print "removing no atts"
                                    testData = dataUtilities.attributeDeselectionData(testData_ecfp, [])

                            else:
                                cut_off = orig_len - len(atts)
                                smarts = trainData.domain.attributes[cut_off:]
                                self.__log("  Number of structural features added: " + str(len(smarts)))
                                testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(testData, smarts)
                                if i == (len(algorithm) - 1):
                                    testData = dataUtilities.attributeDeselectionData(testData_structDesc, atts)
                                else:
                                    testData = dataUtilities.attributeDeselectionData(testData_structDesc, [])

                    #                testData.save("/home/girschic/proj/AZ/ProjDev/test.tab")
                    nTrainEx[ml].append(len(trainData))
                    nTestEx[ml].append(len(testData))
                    # Test if trainsets inside optimizer will respect dataSize criterias.
                    #  if not, don't optimize, but still train the model
                    dontOptimize = False
                    if self.responseType != "Classification" and (len(trainData) * (1 - 1.0 / self.nInnerFolds) < 20):
                        dontOptimize = True
                    else:
                        tmpDataIdxs = dataUtilities.SeedDataSampler(trainData, self.nInnerFolds)
                        tmpTrainData = trainData.select(tmpDataIdxs[0], negate=1)
                        if not self.__checkTrainData(tmpTrainData, False):
                            dontOptimize = True

                    if dontOptimize:
                        logTxt += (
                            "       Fold " + str(foldN) + ": Too few compounds to optimize model hyper-parameters\n"
                        )
                        self.__log(logTxt)
                        if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                            res = orngTest.crossValidation(
                                [MLmethods[ml]],
                                trainData,
                                folds=5,
                                strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                randomGenerator=random.randint(0, 100),
                            )
                            CA = evalUtilities.CA(res)[0]
                            optAcc[ml].append(CA)
                        else:
                            res = orngTest.crossValidation(
                                [MLmethods[ml]],
                                trainData,
                                folds=5,
                                strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                randomGenerator=random.randint(0, 100),
                            )
                            R2 = evalUtilities.R2(res)[0]
                            optAcc[ml].append(R2)
                    else:
                        runPath = miscUtilities.createScratchDir(
                            baseDir=AZOC.NFS_SCRATCHDIR, desc="AccWOptParam", seed=id(trainData)
                        )
                        # 		    self.__log("	run path:"+str(runPath))
                        trainData.save(os.path.join(runPath, "trainData.tab"))

                        tunedPars = paramOptUtilities.getOptParam(
                            learner=MLmethods[ml],
                            trainDataFile=os.path.join(runPath, "trainData.tab"),
                            paramList=self.paramList,
                            useGrid=False,
                            verbose=self.verbose,
                            queueType=self.queueType,
                            runPath=runPath,
                            nExtFolds=None,
                            nFolds=self.nInnerFolds,
                            logFile=self.logFile,
                            getTunedPars=True,
                        )
                        if not MLmethods[ml] or not MLmethods[ml].optimized:
                            self.__log(
                                "       WARNING: GETACCWOPTPARAM: The learner " + str(ml) + " was not optimized."
                            )
                            self.__log("                It will be ignored")
                            # self.__log("                It will be set to default parameters")
                            self.__log("                    DEBUG can be done in: " + runPath)
                            # Set learner back to default
                            # MLmethods[ml] = MLmethods[ml].__class__()
                            raise Exception("The learner " + str(ml) + " was not optimized.")
                        else:
                            if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                                optAcc[ml].append(tunedPars[0])
                            else:
                                res = orngTest.crossValidation(
                                    [MLmethods[ml]],
                                    trainData,
                                    folds=5,
                                    strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                    randomGenerator=random.randint(0, 100),
                                )
                                R2 = evalUtilities.R2(res)[0]
                                optAcc[ml].append(R2)

                            miscUtilities.removeDir(runPath)
                    # Train the model
                    model = MLmethods[ml](trainData)
                    models[ml].append(model)
                    # Test the model
                    if self.responseType == "Classification":
                        results[ml].append(
                            (
                                evalUtilities.getClassificationAccuracy(testData, model),
                                evalUtilities.getConfMat(testData, model),
                            )
                        )
                        roc = self.aroc(testData, [model])
                        rocs[ml].append(roc)
                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        results[ml].append(
                            (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))
                        )
                        # Save the experimental value and correspondent predicted value
                        exp_pred[ml] += local_exp_pred
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None

                res = self.createStatObj(
                    results[ml],
                    exp_pred[ml],
                    nTrainEx[ml],
                    nTestEx[ml],
                    self.responseType,
                    self.nExtFolds,
                    logTxt,
                    rocs[ml],
                )

                if self.verbose > 0:
                    print "UnbiasedAccuracyGetter!Results  " + ml + ":\n"
                    pprint(res)
                if not res:
                    raise Exception("No results available!")
                statistics[ml] = copy.deepcopy(res)
                self.__writeResults(statistics)
                self.__log("       OK")
            except:
                print "Unexpected error:",
                print sys.exc_info()[0]
                print sys.exc_info()[1]
                self.__log("       Learner " + str(ml) + " failed to create/optimize the model!")
                res = self.createStatObj(
                    results[ml],
                    exp_pred[ml],
                    nTrainEx[ml],
                    nTestEx[ml],
                    self.responseType,
                    self.nExtFolds,
                    logTxt,
                    rocs[ml],
                )
                statistics[ml] = copy.deepcopy(res)
                self.__writeResults(statistics)

        if not statistics or len(statistics) < 1:
            self.__log("ERROR: No statistics to return!")
            return None
        elif len(statistics) > 1:
            # We still need to build a consensus model out of the stable models
            #   ONLY if there are more that one model stable!
            #   When only one or no stable models, build a consensus based on all models
            consensusMLs = {}
            for modelName in statistics:
                StabilityValue = statistics[modelName]["StabilityValue"]
                if StabilityValue is not None and statistics[modelName]["stable"]:
                    consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

            self.__log(
                "Found " + str(len(consensusMLs)) + " stable MLmethods out of " + str(len(statistics)) + " MLmethods."
            )

            if len(consensusMLs) <= 1:  # we need more models to build a consensus!
                consensusMLs = {}
                for modelName in statistics:
                    consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

            if len(consensusMLs) >= 2:
                # Var for saving each Fols result
                Cresults = []
                Cexp_pred = []
                CnTrainEx = []
                CnTestEx = []
                self.__log(
                    "Calculating the statistics for a Consensus model based on " + str([ml for ml in consensusMLs])
                )
                for foldN in range(self.nExtFolds):
                    if self.responseType == "Classification":
                        CLASS0 = str(self.data.domain.classVar.values[0])
                        CLASS1 = str(self.data.domain.classVar.values[1])
                        exprTest0 = "(0"
                        for ml in consensusMLs:
                            exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(optAcc[ml][foldN]) + " "
                        exprTest0 += ")/IF0(sum([False"
                        for ml in consensusMLs:
                            exprTest0 += ", " + ml + " == " + CLASS0 + " "
                        exprTest0 += "]),1)"
                        exprTest1 = exprTest0.replace(CLASS0, CLASS1)
                        expression = [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1]
                    else:
                        Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs])
                        expression = "(1 / " + str(Q2sum) + ") * (0"
                        for ml in consensusMLs:
                            expression += " + " + str(optAcc[ml][foldN]) + " * " + ml + " "
                        expression += ")"

                    testData = self.data.select(DataIdxs[foldN])
                    CnTestEx.append(len(testData))
                    consensusClassifiers = {}
                    for learnerName in consensusMLs:
                        consensusClassifiers[learnerName] = models[learnerName][foldN]

                    model = AZorngConsensus.ConsensusClassifier(classifiers=consensusClassifiers, expression=expression)
                    CnTrainEx.append(model.NTrainEx)
                    # Test the model
                    if self.responseType == "Classification":
                        Cresults.append(
                            (
                                evalUtilities.getClassificationAccuracy(testData, model),
                                evalUtilities.getConfMat(testData, model),
                            )
                        )
                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        Cresults.append(
                            (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))
                        )
                        # Save the experimental value and correspondent predicted value
                        Cexp_pred += local_exp_pred

                res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds)
                statistics["Consensus"] = copy.deepcopy(res)
                statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs)
                self.__writeResults(statistics)
            self.__log("Returned multiple ML methods statistics.")
            return statistics

        # By default return the only existing statistics!
        self.__writeResults(statistics)
        self.__log("Returned only one ML method statistics.")
        return statistics[statistics.keys()[0]]
Exemplo n.º 4
0
    def createStatObj(
        self,
        results=None,
        exp_pred=None,
        nTrainCmpds=None,
        nTestCmpds=None,
        responseType=None,
        nExtFolds=None,
        userAlert="",
        rocs=None,
    ):
        # Initialize res (statObj) for statistic results
        res = {}
        self.__log("Starting to create Stat Obj")
        # Classification
        res["CA"] = None
        res["CM"] = None
        res["MCC"] = None
        res["ROC"] = None
        # Regression
        res["Q2"] = None
        res["RMSE"] = None
        # Both
        res["StabilityValue"] = None
        res["userAlert"] = userAlert
        res["selected"] = False
        res["stable"] = False
        res["responseType"] = False
        res["foldStat"] = {
            "nTrainCmpds": None,
            "nTestCmpds": None,
            # Regression
            "Q2": None,
            "RMSE": None,
            # Classification
            "CM": None,
            "CA": None,
            "MCC": None,
            "ROC": None,
        }
        if (
            results is None
        ):  # or exp_pred is None or responseType is None or nExtFolds is None or nTestCmpds is None or nTrainCmpds is None:
            self.__log("    NONE...")
            return res
        res["responseType"] = responseType
        # Calculate the (Q2, RMSE) or (CM, CA) results depending on Classification or regression
        if responseType == "Classification":
            # Compute CA
            res["CA"] = sum(r[0] for r in results) / nExtFolds
            # Compute CM
            res["CM"] = copy.deepcopy(results[0][1])  # Get the first ConfMat
            for r in results[1:]:
                for Lidx, line in enumerate(r[1]):
                    for idx, val in enumerate(line):
                        res["CM"][Lidx][idx] = res["CM"][Lidx][idx] + val  # Add each same ConfMat position
            # Compute MCC
            res["MCC"] = evalUtilities.calcMCC(res["CM"])
            # Compute ROC
            res["ROC"] = sum(ro[0] for ro in rocs) / self.nExtFolds
            # Compute foldStat
            res["foldStat"]["nTrainCmpds"] = [n for n in nTrainCmpds]
            res["foldStat"]["nTestCmpds"] = [n for n in nTestCmpds]
            res["foldStat"]["CA"] = [r[0] for r in results]
            res["foldStat"]["CM"] = [r[1] for r in results]
            res["foldStat"]["MCC"] = [evalUtilities.calcMCC(r[1]) for r in results]
            res["foldStat"]["ROC"] = [ro for ro in rocs]
            # Compute Stability
            res["StabilityValue"] = evalUtilities.stability(res["foldStat"]["CA"])
        else:
            # compute Q2
            res["Q2"] = evalUtilities.calcRsqrt(exp_pred)
            # compute RMSE
            res["RMSE"] = evalUtilities.calcRMSE(exp_pred)
            # Compute foldStat
            res["foldStat"]["nTrainCmpds"] = [n for n in nTrainCmpds]
            res["foldStat"]["nTestCmpds"] = [n for n in nTestCmpds]
            res["foldStat"]["RMSE"] = [r[0] for r in results]
            res["foldStat"]["Q2"] = [r[1] for r in results]
            # Compute Stability value
            res["StabilityValue"] = evalUtilities.stability(res["foldStat"]["Q2"])
        # Evaluate stability of ML
        StabilityValue = res["StabilityValue"]
        if StabilityValue is not None:
            if responseType == "Classification":
                if statc.mean(res["foldStat"]["nTestCmpds"]) > 50:
                    stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_L
                else:
                    stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_H
            else:
                if statc.mean(res["foldStat"]["nTestCmpds"]) > 50:
                    stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_L
                else:
                    stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_H
            if StabilityValue < stableTH:  # Select only stable models
                res["stable"] = True

        return res
Exemplo n.º 5
0
def getStatistics(
        dataset,
        runningDir,
        resultsFile,
        mlList=[ml for ml in MLMETHODS if AZOC.MLMETHODS[ml]["useByDefault"]],
        queueType="NoSGE",
        verbose=0,
        getAllModels=False,
        callBack=None):
    """
                runningDir           (An existing dir for creating one job dir per fold)
                    |
                    +---- status     (The overall status:   "started", "finished" or the progress "1/10", "2/10", ...)
                    |
                    +---- fold_1
                    |
                    +---- fold_2
                    |
                    .
                    .
                    .
               
            The running will be monitorized by this method.
            Whenever a MLMethod fails the respective fold job is restarted 
        """
    if dataset.domain.classVar.varType == orange.VarTypes.Discrete:
        responseType = "Classification"
    else:
        responseType = "Regression"
    #Create the Train and test sets
    DataIdxs = dataUtilities.SeedDataSampler(dataset, AZOC.QSARNEXTFOLDS)
    #Check data in advance so that, by chance, it will not faill at the last fold!
    #for foldN in range(AZOC.QSARNEXTFOLDS):
    #trainData = dataset.select(DataIdxs,foldN,negate=1)
    #checkTrainData(trainData)

    jobs = {}
    thisDir = os.getcwd()
    os.chdir(runningDir)
    #PID = os.getpid()
    #print "Started getStatistics in Process with PID: "+str(PID)
    #os.system('echo "'+str(PID)+'" > '+os.path.join(runningDir,"PID"))
    os.system('echo "started" > ' + os.path.join(runningDir, "status"))
    # Start  all Fold jobs
    stepsDone = 0
    nTotalSteps = AZOC.QSARNEXTFOLDS
    for fold in range(AZOC.QSARNEXTFOLDS):
        job = str(fold)
        print "Starting job for fold ", job
        trainData = dataset.select(DataIdxs, fold, negate=1)
        jobs[job] = {
            "job": job,
            "path": os.path.join(runningDir, "fold_" + job),
            "running": False,
            "failed": False,
            "finished": False
        }

        # Uncomment next 3 lines for running in finished jobs dirs
        #st, jID = commands.getstatusoutput("cat "+os.path.join(runningDir, "fold_"+job,"jID"))
        #jobs[job]["jID"] = jID
        #continue

        os.system("rm -rf " + jobs[job]["path"])
        os.system("mkdir -p " + jobs[job]["path"])
        trainData.save(os.path.join(jobs[job]["path"], "trainData.tab"))
        file_h = open(os.path.join(jobs[job]["path"], "run.sh"), "w")
        file_h.write("#!/bin/tcsh\n")
        file_h.write(
            "source " +
            os.path.join(os.environ["AZORANGEHOME"], "templateProfile") + "\n")
        file_h.write("python " +
                     os.path.join(jobs[job]["path"], "QsubScript.py") + "\n")
        file_h.close()

        file_h = open(os.path.join(jobs[job]["path"], "QsubScript.py"), "w")
        file_h.write("import os\n")
        file_h.write("from AZutilities import dataUtilities\n")
        file_h.write("from AZutilities import competitiveWorkflow\n")
        file_h.write("data = dataUtilities.DataTable('" +
                     os.path.join(jobs[job]["path"], "trainData.tab") + "')\n")
        file_h.write('os.system(\'echo "running" > ' +
                     os.path.join(jobs[job]["path"], "status") + ' \')\n')
        file_h.write("models = competitiveWorkflow.getModel(data, mlList=" +
                     str(mlList) + ", savePath = '" +
                     os.path.join(jobs[job]["path"], "results.pkl") +
                     "', queueType = '" + queueType + "', getAllModels = " +
                     str(getAllModels) + ")\n")
        file_h.write("nModelsSaved = 0\n")
        file_h.write("for model in models:\n")
        file_h.write("    if not models[model] is None:\n")
        file_h.write("        models[model].write('" +
                     os.path.join(jobs[job]["path"], "model") +
                     "'+'_'+model)\n")
        file_h.write('        nModelsSaved += 1\n')
        file_h.write(
            'if nModelsSaved == len([m for m in models if not models[m] is None ]):\n'
        )
        file_h.write('    os.system(\'echo "finished" > ' +
                     os.path.join(jobs[job]["path"], "status") + ' \')\n')
        file_h.write('else:\n')
        file_h.write('    os.system(\'echo "failed" > ' +
                     os.path.join(jobs[job]["path"], "status") + ' \')\n')
        file_h.close()

        os.chdir(os.path.join(jobs[job]["path"]))
        if queueType == "NoSGE":  # Serial mode
            status, out = commands.getstatusoutput(
                "tcsh " + os.path.join(jobs[job]["path"], "run.sh"))
            if status:
                print "ERROR on Job " + str(
                    job) + " (will be restarted latter)"
                print out
            else:
                statusFile = os.path.join(jobs[job]["path"], "status")
                if os.path.isfile(statusFile):
                    st, status = commands.getstatusoutput("cat " + statusFile)
                else:
                    print "ERROR: Missing status file"
                    status = None
                if not status:
                    print "ERROR! job " + job + " has no status!"
                    jobs[job]["failed"] = True
                elif status == "failed":
                    print "Job " + job + " failed to build all models"
                    jobs[job]["failed"] = True
                elif status == "finished":
                    jobs[job]["finished"] = True

                if not isJobProgressingOK(jobs[job]):
                    print "Job " + job + " failed to build one or more models in getMLStatistics"
                    jobs[job]["failed"] = True
                    jobs[job]["finished"] = False
                if jobs[job]["failed"]:
                    print "Job " + job + " FAILED"
                else:
                    print "Finished Job " + str(job) + " with success"
            if callBack:
                stepsDone += 1
                if not callBack((100 * stepsDone) / nTotalSteps): return None
        else:
            cmd = "qsub -cwd -q batch.q" + AZOC.SGE_QSUB_ARCH_OPTION_CURRENT + os.path.join(
                jobs[job]["path"], "run.sh")
            status, out = commands.getstatusoutput(cmd)
            if status:
                print "ERROR on Job " + str(job) + " (will be skipped)"
                print out
                #raise Exception("ERROR starting job for folder "+str(job))
            # Your job 955801 ("template_run.sh") has been submitted
            jID = out.strip().split(" ")[2]
            print "    jID: ", jID
            os.system('echo "' + jID + '" > ' +
                      os.path.join(jobs[job]["path"], "jID"))
            jobs[job]["running"] = True
            jobs[job]["jID"] = jID
        os.chdir(runningDir)
    os.chdir(thisDir)

    finished = []
    if queueType == "NoSGE":
        failed = []
        #Report failed Jobs
        for job in jobs:
            if jobs[job]["finished"]:
                finished.append(job)
        for job in jobs:
            if jobs[job]["failed"]:
                failed.append(job)
        print "Successful finished Jobs: ", finished
        print "Failed Jobs: ", failed
    else:  # Monitor SGE jobs untill all are finished
        #Monitor Fold jobs
        updateJobsStatus(jobs)
        for job in jobs:
            if jobs[job]["finished"]:
                finished.append(job)
        print "Jobs already finished: ", finished
        os.system(' echo "' + str(len(finished)) + '/' +
                  str(AZOC.QSARNEXTFOLDS) + '" > ' +
                  os.path.join(runningDir, "status"))
        while len(finished) < AZOC.QSARNEXTFOLDS:
            print ".",
            sys.stdout.flush()
            updateJobsStatus(jobs)
            for job in jobs:
                if jobs[job]["finished"] and job not in finished:
                    finished.append(job)
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None
                    print time.asctime() + ": Finished job " + str(job)
            os.system(' echo "' + str(len(finished)) + '/' +
                      str(AZOC.QSARNEXTFOLDS) + '" > ' +
                      os.path.join(runningDir, "status"))
            for job in [j for j in jobs if jobs[j]["failed"]]:
                jobs[job] = restartJob(jobs[job])
            time.sleep(5)

    print "All fold jobs finished!"
    # Gather the results
    print "Gathering results..."
    #Var for saving each Fols result
    results = {}
    exp_pred = {}
    nTrainEx = {}
    nTestEx = {}
    # Var for saving the statistics results
    statistics = {}

    mlMethods = [ml for ml in AZOC.MLMETHODS] + ["Consensus"]
    sortedJobs = [job for job in jobs]
    sortedJobs.sort(cmp=lambda x, y: int(x) > int(y) and 1 or -1)
    # Place for storing the selected models results
    results["selectedML"] = []
    exp_pred["selectedML"] = []
    nTrainEx["selectedML"] = []
    nTestEx["selectedML"] = []
    foldSelectedML = []

    for ml in mlMethods:  # Loop over each MLMethod
        try:
            #Var for saving each Fols result
            results[ml] = []
            exp_pred[ml] = []
            nTrainEx[ml] = []
            nTestEx[ml] = []
            logTxt = ""

            for job in sortedJobs:  #loop over each fold
                modelPath = os.path.join(jobs[job]["path"], "model_" + ml)
                if not os.path.isdir(modelPath):
                    if getAllModels:
                        print "MLMethod " + ml + " not available in fold " + job
                    continue

                resFile = os.path.join(jobs[job]["path"], "results.pkl")
                statFile_h = open(resFile)
                foldStat = pickle.load(statFile_h)
                statFile_h.close()

                #load model
                model = AZBaseClasses.modelRead(modelPath)
                #Test the model
                testData = dataset.select(DataIdxs, int(job))
                nTrainEx[ml].append(model.NTrainEx)
                nTestEx[ml].append(len(testData))
                if foldStat[ml]["selected"]:
                    foldSelectedML.append(ml)
                    nTrainEx["selectedML"].append(model.NTrainEx)
                    nTestEx["selectedML"].append(len(testData))

                if responseType == "Classification":
                    results[ml].append(
                        (evalUtilities.getClassificationAccuracy(
                            testData,
                            model), evalUtilities.getConfMat(testData, model)))
                    if foldStat[ml]["selected"]:
                        results["selectedML"].append(results[ml][-1])
                else:
                    local_exp_pred = []
                    for ex in testData:
                        local_exp_pred.append((ex.getclass(), model(ex)))
                    results[ml].append(
                        (evalUtilities.calcRMSE(local_exp_pred),
                         evalUtilities.calcRsqrt(local_exp_pred)))
                    #Save the experimental value and correspondent predicted value
                    exp_pred[ml] += local_exp_pred
                    if foldStat[ml]["selected"]:
                        results["selectedML"].append(results[ml][-1])
                        exp_pred["selectedML"] += local_exp_pred

            res = createStatObj(results[ml], exp_pred[ml],
                                nTrainEx[ml], nTestEx[ml], responseType,
                                len(sortedJobs), logTxt)
            if not res:
                raise Exception("No results available!")
            if getAllModels:
                statistics[ml] = copy.deepcopy(res)
                writeResults(statistics, resultsFile)
            print "       OK", ml
        except:
            print "Error on MLmethod " + ml + ". It will be skipped"
    ml = "selectedML"
    res = createStatObj(results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml],
                        responseType, len(sortedJobs), logTxt, foldSelectedML)
    if not res:
        raise Exception("No results available!")
    statistics[ml] = copy.deepcopy(res)
    writeResults(statistics, resultsFile)
    os.system(' echo "finished" > ' + os.path.join(runningDir, "status"))
    return statistics
Exemplo n.º 6
0
    def getAcc(self, algorithm = None, minsup = None, atts = None):
        """ For regression problems, it returns the RMSE and the R2 
            For Classification problems, it returns CA and the ConfMat
            The return is made in a Dict: {"RMSE":0.2,"R2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
            For the EvalResults not supported for a specific learner/datase, the respective result will be None

            if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus
                made out of those that were stable

            It some error occurred, the respective values in the Dict will be None

        """
        self.__log("Starting Calculating MLStatistics")
        statistics = {}
        if not self.__areInputsOK():
            return None
        
        if (self.algorithm):
            self.__log(" Additional structural features to be calculated inside of cross-validation")
            self.__log(" Algorithm for structural features: "+str(self.algorithm))
            self.__log(" Minimum support parameter: "+str(self.minsup))
        
        # Set the response type
        responseType =  self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification"  or "Regression"
        self.__log("  "+str(responseType))

        #Create the Train and test sets
        DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds) 
        
        #Var for saving each Fols result
        results = {}
        exp_pred = {}
        
        #Set a dict of learners
        MLmethods = {}
        if type(self.learner) == dict:
            for ml in self.learner:
                MLmethods[ml] = self.learner[ml]
        else:
            MLmethods[self.learner.name] = self.learner

        models={}
        self.__log("Calculating Statistics for MLmethods:")
        self.__log("  "+str([x for x in MLmethods]))
        for ml in MLmethods:
          self.__log("    > "+str(ml)+"...")
          try:
            #Var for saving each Fols result
            results[ml] = []
            exp_pred[ml] = []
            models[ml] = []
            for foldN in range(self.nExtFolds):
                if type(self.learner) == dict:
                    self.paramList = None

                trainData = self.data.select(DataIdxs[foldN],negate=1)
                orig_len = len(trainData.domain.attributes)

		if (self.algorithm):
			# add structural descriptors to the training data (TG)
                	trainData_structDesc = getStructuralDesc.getStructuralDescResult(trainData, self.algorithm, self.minsup)
                	trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, self.atts)

                runPath = miscUtilities.createScratchDir(baseDir = AZOC.NFS_SCRATCHDIR, desc = "AccWOptParam")
                trainData.save(os.path.join(runPath,"trainData.tab"))

                testData = self.data.select(DataIdxs[foldN])
		if (self.algorithm):
			# calculate the feature values for the test data (TG)
			cut_off = orig_len - len(self.atts)
                	smarts = trainData.domain.attributes[cut_off:]
			self.__log("  Number of structural features added: "+str(len(smarts)))
			testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(testData,smarts)
			testData = dataUtilities.attributeDeselectionData(testData_structDesc, self.atts)

                paramOptUtilities.getOptParam(
                    learner = MLmethods[ml], 
                    trainDataFile = os.path.join(runPath,"trainData.tab"), 
                    paramList = self.paramList, 
                    useGrid = False, 
                    verbose = self.verbose, 
                    queueType = self.queueType, 
                    runPath = runPath, 
                    nExtFolds = None, 
                    nFolds = self.nInnerFolds
                    )
                if not MLmethods[ml].optimized:
                    self.__log("       The learner "+str(ml)+" was not optimized.")
                    raise Exception("The learner "+str(ml)+" was not optimized.")
                miscUtilities.removeDir(runPath) 
		
                #Train the model
                model = MLmethods[ml](trainData)
                models[ml].append(model)
                #Test the model
                if responseType == "Classification":
                    results[ml].append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) )
                else:
                    local_exp_pred = []
                    for ex in testData:
                        local_exp_pred.append((ex.getclass(), model(ex)))
                    results[ml].append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) )
                    #Save the experimental value and correspondent predicted value
                    exp_pred[ml] += local_exp_pred
            res = self.createStatObj(results[ml], exp_pred[ml], responseType, self.nExtFolds)
            if self.verbose > 0: 
                print "AccWOptParamGetter!Results  "+ml+":\n"
                pprint(res)
            if not res:
                raise Exception("No results available!")
            statistics[ml] = res.copy()
            self.__writeResults(res)
            self.__log("       OK")
          except:
            self.__log("       Learner "+str(ml)+" failed to optimize!")
            res = self.createStatObj()
            statistics[ml] = res.copy()

        if not statistics or len(statistics) < 1:
            self.__log("ERROR: No statistics to return!")
            return None
        elif len(statistics) > 1:
            #We still need to build a consensus model out of the stable models 
            #   ONLY if there are more that one model stable!
            stableML={}
            for modelName in statistics:
                if statistics[modelName]["StabilityValue"] < AZOC.QSARSTABILITYTHRESHOLD:   # Select only stable models
                    stableML[modelName] = statistics[modelName].copy()
            if len(stableML) >= 2:
                self.__log("Found "+str(len(stableML))+" stable MLmethods out of "+str(len(statistics))+" MLmethods.")
                if responseType == "Classification":
                    CLASS0 = str(self.data.domain.classVar.values[0])
                    CLASS1 = str(self.data.domain.classVar.values[1])
                    exprTest0 = "(0"
                    for ml in stableML:
                        exprTest0 += "+( "+ml+" == "+CLASS0+" )*"+str(stableML[ml]["CA"])+" "
                    exprTest0 += ")/IF0(sum([False"
                    for ml in stableML:
                        exprTest0 += ", "+ml+" == "+CLASS0+" "
                    exprTest0 += "]),1)"
                    exprTest1 = exprTest0.replace(CLASS0,CLASS1)
                    expression = [exprTest0+" >= "+exprTest1+" -> "+CLASS0," -> "+CLASS1]
                else:
                    R2sum = sum([stableML[ml]["R2"] for ml in stableML])
                    expression = "(1 / "+str(R2sum)+") * (0"
                    for ml in stableML:
                        expression += " + "+str(stableML[ml]["R2"])+" * "+ml+" "
                    expression += ")"

                #Var for saving each Fols result
                Cresults = []
                Cexp_pred = []
                self.__log("Calculating the statistics for a Consensus model")
                for foldN in range(self.nExtFolds):
                    testData = self.data.select(DataIdxs[foldN])
                    consensusClassifiers = {}
                    for learnerName in stableML:
                        consensusClassifiers[learnerName] = models[learnerName][foldN]

                    model = AZorngConsensus.ConsensusClassifier(classifiers = consensusClassifiers, expression = expression)     
                    #Test the model
                    if responseType == "Classification":
                        Cresults.append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) )
                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        Cresults.append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) )
                        #Save the experimental value and correspondent predicted value
                        Cexp_pred += local_exp_pred

                res = self.createStatObj(Cresults, Cexp_pred, responseType, self.nExtFolds)
                statistics["Consensus"] = res.copy()
                statistics["Consensus"]["IndividualStatistics"] = stableML.copy()
                self.__writeResults(statistics)
            self.__log("Returned multiple ML methods statistics.")
            return statistics
                 
        #By default return the only existing statistics!
        self.__writeResults(statistics)
        self.__log("Returned only one ML method statistics.")
        return statistics[statistics.keys()[0]]
Exemplo n.º 7
0
def createStatObj(results=None,
                  exp_pred=None,
                  nTrainCmpds=None,
                  nTestCmpds=None,
                  responseType=None,
                  nExtFolds=None,
                  userAlert="",
                  foldSelectedML=None):
    #Initialize res (statObj) for statistic results
    res = {}
    # Classification
    res["CA"] = None
    res["CM"] = None
    res["MCC"] = None
    #Regression
    res["Q2"] = None
    res["RMSE"] = None
    #Both
    res["StabilityValue"] = None
    res["userAlert"] = userAlert
    res["selected"] = False
    res["stable"] = False
    res["responseType"] = False
    res["foldStat"] = {
        "nTrainCmpds": None,
        "nTestCmpds": None,
        #Regression
        "Q2": None,
        "RMSE": None,
        #Classification
        "CM": None,
        "CA": None,
        "MCC": None
    }
    if not results or results is None or exp_pred is None or responseType is None or nExtFolds is None or nTestCmpds is None or nTrainCmpds is None:
        return res
    res["responseType"] = responseType
    #Calculate the (Q2, RMSE) or (CM, CA) results depending on Classification or regression
    if responseType == "Classification":
        #Compute CA
        res["CA"] = sum(r[0] for r in results) / nExtFolds
        #Compute CM
        res["CM"] = copy.deepcopy(results[0][1])  # Get the first ConfMat
        for r in results[1:]:
            for Lidx, line in enumerate(r[1]):
                for idx, val in enumerate(line):
                    res["CM"][Lidx][idx] = res["CM"][Lidx][
                        idx] + val  #Add each same ConfMat position
        #Compute MCC
        res["MCC"] = evalUtilities.calcMCC(res["CM"])
        #Compute foldStat
        res["foldStat"]["nTrainCmpds"] = [n for n in nTrainCmpds]
        res["foldStat"]["nTestCmpds"] = [n for n in nTestCmpds]
        res["foldStat"]["CA"] = [r[0] for r in results]
        res["foldStat"]["CM"] = [r[1] for r in results]
        res["foldStat"]["MCC"] = [evalUtilities.calcMCC(r[1]) for r in results]
        #Compute Stability
        res["StabilityValue"] = evalUtilities.stability(res["foldStat"]["CA"])
    else:
        #compute Q2
        res["Q2"] = evalUtilities.calcRsqrt(exp_pred)
        #compute RMSE
        res["RMSE"] = evalUtilities.calcRMSE(exp_pred)
        #Compute foldStat
        res["foldStat"]["nTrainCmpds"] = [n for n in nTrainCmpds]
        res["foldStat"]["nTestCmpds"] = [n for n in nTestCmpds]
        res["foldStat"]["RMSE"] = [r[0] for r in results]
        res["foldStat"]["Q2"] = [r[1] for r in results]
        #Compute Stability value
        res["StabilityValue"] = evalUtilities.stability(res["foldStat"]["Q2"])

    # Save selectedMLs if passed
    if foldSelectedML:
        res["foldStat"]["foldSelectedML"] = [ml for ml in foldSelectedML]

    #Evaluate stability of ML
    StabilityValue = res["StabilityValue"]
    if StabilityValue is not None:
        if responseType == "Classification":
            if statc.mean(res["foldStat"]["nTestCmpds"]) > 50:
                stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_L
            else:
                stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_H
        else:
            if statc.mean(res["foldStat"]["nTestCmpds"]) > 50:
                stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_L
            else:
                stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_H
        if StabilityValue < stableTH:  # Select only stable models
            res["stable"] = True

    return res
Exemplo n.º 8
0
    def getAcc(self, callBack = None, callBackWithFoldModel = None):
        """ For regression problems, it returns the RMSE and the Q2 
            For Classification problems, it returns CA and the ConfMat
            The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
            For the EvalResults not supported for a specific learner/datase, the respective result will be None

            if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus
                made out of those that were stable

            It some error occurred, the respective values in the Dict will be None
        """
        self.__log("Starting Calculating MLStatistics")
        statistics = {}
        if not self.__areInputsOK():
            return None
        # Set the response type
        self.responseType =  self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification"  or "Regression"
        self.__log("  "+str(self.responseType))

        #Create the Train and test sets
        if self.usePreDefFolds:
            DataIdxs = self.preDefIndices 
        else:
            DataIdxs = self.sampler(self.data, self.nExtFolds) 
        foldsN = [f for f in dict.fromkeys(DataIdxs) if f != 0] #Folds used only from 1 on ... 0 are for fixed train Bias
        nFolds = len(foldsN)
        #Fix the Indexes based on DataIdxs
        # (0s) represents the train set  ( >= 1s) represents the test set folds
        if self.useVarCtrlCV:
            nShifted = [0] * nFolds
            for idx,isTest in enumerate(self.preDefIndices):  # self.preDefIndices == 0 are to be used in TrainBias
                if not isTest:
                    if DataIdxs[idx]:
                        nShifted[DataIdxs[idx]] += 1
                        DataIdxs[idx] = 0
            for idx,shift in enumerate(nShifted):
                self.__log("In fold "+str(idx)+", "+str(shift)+" examples were shifted to the train set.")

        #Var for saving each Fols result
        optAcc = {}
        results = {}
        exp_pred = {}
        nTrainEx = {}
        nTestEx = {}
        
        #Set a dict of learners
        MLmethods = {}
        if type(self.learner) == dict:
            for ml in self.learner:
                MLmethods[ml] = self.learner[ml]
        else:
            MLmethods[self.learner.name] = self.learner

        models={}
        self.__log("Calculating Statistics for MLmethods:")
        self.__log("  "+str([x for x in MLmethods]))

        #Check data in advance so that, by chance, it will not faill at the last fold!
        for foldN in foldsN:
            trainData = self.data.select(DataIdxs,foldN,negate=1)
            self.__checkTrainData(trainData)

        #Optional!!
        # Order Learners so that PLS is the first
        sortedML = [ml for ml in MLmethods]
        if "PLS" in sortedML:
            sortedML.remove("PLS")
            sortedML.insert(0,"PLS")

        stepsDone = 0
        nTotalSteps = len(sortedML) * self.nExtFolds  
        for ml in sortedML:
          startTime = time.time()
          self.__log("    > "+str(ml)+"...")
          try:
            #Var for saving each Fols result
            results[ml] = []
            exp_pred[ml] = []
            models[ml] = []
            nTrainEx[ml] = []
            nTestEx[ml] = []
            optAcc[ml] = []
            logTxt = ""
            for foldN in foldsN:
                if type(self.learner) == dict:
                    self.paramList = None

                trainData = self.data.select(DataIdxs,foldN,negate=1)
                testData = self.data.select(DataIdxs,foldN)
                smilesAttr = dataUtilities.getSMILESAttr(trainData)
                if smilesAttr:
                    self.__log("Found SMILES attribute:"+smilesAttr)
                    if MLmethods[ml].specialType == 1:
                       trainData = dataUtilities.attributeSelectionData(trainData, [smilesAttr, trainData.domain.classVar.name]) 
                       testData = dataUtilities.attributeSelectionData(testData, [smilesAttr, testData.domain.classVar.name]) 
                       self.__log("Selected attrs: "+str([attr.name for attr in trainData.domain]))
                    else:
                       trainData = dataUtilities.attributeDeselectionData(trainData, [smilesAttr]) 
                       testData = dataUtilities.attributeDeselectionData(testData, [smilesAttr]) 
                       self.__log("Selected attrs: "+str([attr.name for attr in trainData.domain[0:3]] + ["..."] + [attr.name for attr in trainData.domain[len(trainData.domain)-3:]]))

                nTrainEx[ml].append(len(trainData))
                nTestEx[ml].append(len(testData))
                #Test if trainsets inside optimizer will respect dataSize criterias.
                #  if not, don't optimize, but still train the model
                dontOptimize = False
                if self.responseType != "Classification" and (len(trainData)*(1-1.0/self.nInnerFolds) < 20):
                    dontOptimize = True
                else:                      
                    tmpDataIdxs = self.sampler(trainData, self.nInnerFolds)
                    tmpTrainData = trainData.select(tmpDataIdxs,1,negate=1)
                    if not self.__checkTrainData(tmpTrainData, False):
                        dontOptimize = True

                SpecialModel = None
                if dontOptimize:
                    logTxt += "       Fold "+str(foldN)+": Too few compounds to optimize model hyper-parameters\n"
                    self.__log(logTxt)
                    if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                        res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices.StratifiedIfPossible, random_generator = random.randint(0, 100))
                        CA = evalUtilities.CA(res)[0]
                        optAcc[ml].append(CA)
                    else:
                        res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices.StratifiedIfPossible, random_generator = random.randint(0, 100))
                        R2 = evalUtilities.R2(res)[0]
                        optAcc[ml].append(R2)
                else:
                    if MLmethods[ml].specialType == 1: 
                            if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                                    optInfo, SpecialModel = MLmethods[ml].optimizePars(trainData, folds = 5)
                                    optAcc[ml].append(optInfo["Acc"])
                            else:
                                    res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices.StratifiedIfPossible, random_generator = random.randint(0, 100))
                                    R2 = evalUtilities.R2(res)[0]
                                    optAcc[ml].append(R2)
                    else:
                            runPath = miscUtilities.createScratchDir(baseDir = AZOC.NFS_SCRATCHDIR, desc = "AccWOptParam", seed = id(trainData))
                            trainData.save(os.path.join(runPath,"trainData.tab"))
                            tunedPars = paramOptUtilities.getOptParam(
                                learner = MLmethods[ml], 
                                trainDataFile = os.path.join(runPath,"trainData.tab"), 
                                paramList = self.paramList, 
                                useGrid = False, 
                                verbose = self.verbose, 
                                queueType = self.queueType, 
                                runPath = runPath, 
                                nExtFolds = None, 
                                nFolds = self.nInnerFolds,
                                logFile = self.logFile,
                                getTunedPars = True,
                                fixedParams = self.fixedParams)
                            if not MLmethods[ml] or not MLmethods[ml].optimized:
                                self.__log("       WARNING: GETACCWOPTPARAM: The learner "+str(ml)+" was not optimized.")
                                self.__log("                It will be ignored")
                                #self.__log("                It will be set to default parameters")
                                self.__log("                    DEBUG can be done in: "+runPath)
                                #Set learner back to default 
                                #MLmethods[ml] = MLmethods[ml].__class__()
                                raise Exception("The learner "+str(ml)+" was not optimized.")
                            else:
                                if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                                    optAcc[ml].append(tunedPars[0])
                                else:
                                    res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices.StratifiedIfPossible, random_generator = random.randint(0, 100))
                                    R2 = evalUtilities.R2(res)[0]
                                    optAcc[ml].append(R2)

                                miscUtilities.removeDir(runPath) 
                #Train the model
                if SpecialModel is not None:
                    model = SpecialModel 
                else:
                    model = MLmethods[ml](trainData)
                models[ml].append(model)
                #Test the model
                if self.responseType == "Classification":
                    results[ml].append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) )
                else:
                    local_exp_pred = []
                    # Predict using bulk-predict
                    predictions = model(testData)
                    # Gather predictions
                    for n,ex in enumerate(testData):
                        local_exp_pred.append((ex.getclass().value, predictions[n].value))
                    results[ml].append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) )
                    #Save the experimental value and correspondent predicted value
                    exp_pred[ml] += local_exp_pred
                if callBack:
                     stepsDone += 1
                     if not callBack((100*stepsDone)/nTotalSteps): return None
                if callBackWithFoldModel:
                    callBackWithFoldModel(model) 

            res = self.createStatObj(results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml],self.responseType, self.nExtFolds, logTxt, labels = hasattr(self.data.domain.classVar,"values") and list(self.data.domain.classVar.values) or None )
            if self.verbose > 0: 
                print "UnbiasedAccuracyGetter!Results  "+ml+":\n"
                pprint(res)
            if not res:
                raise Exception("No results available!")
            res["runningTime"] = time.time() - startTime
            statistics[ml] = copy.deepcopy(res)
            self.__writeResults(statistics)
            self.__log("       OK")
          except:
            self.__log("       Learner "+str(ml)+" failed to create/optimize the model!")
            error = str(sys.exc_info()[0]) +" "+\
                        str(sys.exc_info()[1]) +" "+\
                        str(traceback.extract_tb(sys.exc_info()[2]))
            self.__log(error)
 
            res = self.createStatObj()
            statistics[ml] = copy.deepcopy(res)
            self.__writeResults(statistics)

        if not statistics or len(statistics) < 1:
            self.__log("ERROR: No statistics to return!")
            return None
        elif len(statistics) > 1:
            #We still need to build a consensus model out of the stable models 
            #   ONLY if there are more that one model stable!
            #   When only one or no stable models, build a consensus based on all models
            # ALWAYS exclude specialType models (MLmethods[ml].specialType > 0)
            consensusMLs={}
            for modelName in statistics:
                StabilityValue = statistics[modelName]["StabilityValue"]
                if StabilityValue is not None and statistics[modelName]["stable"]:
                    consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

            self.__log("Found "+str(len(consensusMLs))+" stable MLmethods out of "+str(len(statistics))+" MLmethods.")

            if len(consensusMLs) <= 1:   # we need more models to build a consensus!
                consensusMLs={}
                for modelName in statistics:
                    consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

            # Exclude specialType models 
            excludeThis = []
            for learnerName in consensusMLs:
                if models[learnerName][0].specialType > 0:
                    excludeThis.append(learnerName)
            for learnerName in excludeThis:
                consensusMLs.pop(learnerName)
                self.__log("    > Excluded special model " + learnerName)
            self.__log("    > Stable modules: " + str(consensusMLs.keys()))

            if len(consensusMLs) >= 2:
                #Var for saving each Fols result
                startTime = time.time()
                Cresults = []
                Cexp_pred = []
                CnTrainEx = []
                CnTestEx = []
                self.__log("Calculating the statistics for a Consensus model based on "+str([ml for ml in consensusMLs]))
                for foldN in range(self.nExtFolds):
                    if self.responseType == "Classification":
                        CLASS0 = str(self.data.domain.classVar.values[0])
                        CLASS1 = str(self.data.domain.classVar.values[1])
                        # exprTest0
                        exprTest0 = "(0"
                        for ml in consensusMLs:
                            exprTest0 += "+( "+ml+" == "+CLASS0+" )*"+str(optAcc[ml][foldN])+" "
                        exprTest0 += ")/IF0(sum([False"
                        for ml in consensusMLs:
                            exprTest0 += ", "+ml+" == "+CLASS0+" "
                        exprTest0 += "]),1)"
                        # exprTest1
                        exprTest1 = "(0"
                        for ml in consensusMLs:
                            exprTest1 += "+( "+ml+" == "+CLASS1+" )*"+str(optAcc[ml][foldN])+" "
                        exprTest1 += ")/IF0(sum([False"
                        for ml in consensusMLs:
                            exprTest1 += ", "+ml+" == "+CLASS1+" "
                        exprTest1 += "]),1)"
                        # Expression
                        expression = [exprTest0+" >= "+exprTest1+" -> "+CLASS0," -> "+CLASS1]
                    else:
                        Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs])
                        expression = "(1 / "+str(Q2sum)+") * (0"
                        for ml in consensusMLs:
                            expression += " + "+str(optAcc[ml][foldN])+" * "+ml+" "
                        expression += ")"

                    testData = self.data.select(DataIdxs,foldN+1)  # fold 0 if for the train Bias!!
                    smilesAttr = dataUtilities.getSMILESAttr(testData)
                    if smilesAttr:
                        self.__log("Found SMILES attribute:"+smilesAttr)
                        testData = dataUtilities.attributeDeselectionData(testData, [smilesAttr])
                        self.__log("Selected attrs: "+str([attr.name for attr in trainData.domain[0:3]] + ["..."] + [attr.name for attr in trainData.domain[len(trainData.domain)-3:]]))

                    CnTestEx.append(len(testData))
                    consensusClassifiers = {}
                    for learnerName in consensusMLs:
                        consensusClassifiers[learnerName] = models[learnerName][foldN]

                    model = AZorngConsensus.ConsensusClassifier(classifiers = consensusClassifiers, expression = expression)     
                    CnTrainEx.append(model.NTrainEx)
                    #Test the model
                    if self.responseType == "Classification":
                        Cresults.append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) )
                    else:
                        local_exp_pred = []
                        # Predict using bulk-predict
                        predictions = model(testData)
                        # Gather predictions
                        for n,ex in enumerate(testData):
                            local_exp_pred.append((ex.getclass().value, predictions[n].value))
                        Cresults.append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) )
                        #Save the experimental value and correspondent predicted value
                        Cexp_pred += local_exp_pred

                res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds, labels = hasattr(self.data.domain.classVar,"values") and list(self.data.domain.classVar.values) or None )
                res["runningTime"] = time.time() - startTime
                statistics["Consensus"] = copy.deepcopy(res)
                statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs)
                self.__writeResults(statistics)
            self.__log("Returned multiple ML methods statistics.")
            return statistics
                 
        #By default return the only existing statistics!
        self.__writeResults(statistics)
        self.__log("Returned only one ML method statistics.")
        return statistics[statistics.keys()[0]]
    def getAcc(self):
        """ For regression problems, it returns the RMSE and the Q2 
            For Classification problems, it returns CA and the ConfMat
            The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
            For the EvalResults not supported for a specific learner/datase, the respective result will be None

            if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus
                made out of those that were stable

            It some error occurred, the respective values in the Dict will be None
        """
        self.__log("Starting Calculating MLStatistics")
        statistics = {}
        if not self.__areInputsOK():
            return None
        # Set the response type
        self.responseType =  self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification"  or "Regression"
        self.__log("  "+str(self.responseType))

        #Create the Train and test sets
        DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds) 
        
        #Var for saving each Fols result
        results = {}
        exp_pred = {}
        nTrainEx = {}
        nTestEx = {}
        
        #Set a dict of learners
        MLmethods = {}
        if type(self.learner) == dict:
            for ml in self.learner:
                MLmethods[ml] = self.learner[ml]
        else:
            MLmethods[self.learner.name] = self.learner

        models={}
        self.__log("Calculating Statistics for MLmethods:")
        self.__log("  "+str([x for x in MLmethods]))

        #Check data in advance so that, by chance, it will not faill at the last fold!
        for foldN in range(self.nExtFolds):
            trainData = self.data.select(DataIdxs[foldN],negate=1)
            self.__checkTrainData(trainData)

        for ml in MLmethods:
          self.__log("    > "+str(ml)+"...")
          try:
            #Var for saving each Fols result
            results[ml] = []
            exp_pred[ml] = []
            models[ml] = []
            nTrainEx[ml] = []
            nTestEx[ml] = []
            logTxt = "" 
            for foldN in range(self.nExtFolds):
                if type(self.learner) == dict:
                    self.paramList = None

                trainData = self.data.select(DataIdxs[foldN],negate=1)
                testData = self.data.select(DataIdxs[foldN])
                nTrainEx[ml].append(len(trainData))
                nTestEx[ml].append(len(testData))
                #Test if trainsets inside optimizer will respect dataSize criterias.
                #  if not, don't optimize, but still train the model
                dontOptimize = False
                if self.responseType != "Classification" and (len(trainData)*(1-1.0/self.nInnerFolds) < 20):
                    dontOptimize = True
                else:                      
                    tmpDataIdxs = dataUtilities.SeedDataSampler(trainData, self.nInnerFolds)
                    tmpTrainData = trainData.select(tmpDataIdxs[0],negate=1)
                    if not self.__checkTrainData(tmpTrainData, False):
                        dontOptimize = True

                if dontOptimize:
                    logTxt += "       Fold "+str(foldN)+": Too few compounds to optimize model hyper-parameters\n"
                    self.__log(logTxt)
                else:
                    runPath = miscUtilities.createScratchDir(baseDir = AZOC.NFS_SCRATCHDIR, desc = "AccWOptParam")
                    trainData.save(os.path.join(runPath,"trainData.tab"))

                    paramOptUtilities.getOptParam(
                        learner = MLmethods[ml], 
                        trainDataFile = os.path.join(runPath,"trainData.tab"), 
                        paramList = self.paramList, 
                        useGrid = False, 
                        verbose = self.verbose, 
                        queueType = self.queueType, 
                        runPath = runPath, 
                        nExtFolds = None, 
                        nFolds = self.nInnerFolds)
                    if not MLmethods[ml].optimized:
                        self.__log("       The learner "+str(ml)+" was not optimized.")
                        raise Exception("The learner "+str(ml)+" was not optimized.")
                    miscUtilities.removeDir(runPath) 
                #Train the model
                model = MLmethods[ml](trainData)
                models[ml].append(model)
                #Test the model
                if self.responseType == "Classification":
                    results[ml].append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) )
                else:
                    local_exp_pred = []
                    for ex in testData:
                        local_exp_pred.append((ex.getclass(), model(ex)))
                    results[ml].append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) )
                    #Save the experimental value and correspondent predicted value
                    exp_pred[ml] += local_exp_pred
   
            res = self.createStatObj(results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml],self.responseType, self.nExtFolds, logTxt)
            if self.verbose > 0: 
                print "AccWOptParamGetter!Results  "+ml+":\n"
                pprint(res)
            if not res:
                raise Exception("No results available!")
            statistics[ml] = res.copy()
            self.__writeResults(statistics)
            self.__log("       OK")
          except:
            self.__log("       Learner "+str(ml)+" failed to create/optimize the model!")
            res = self.createStatObj()
            statistics[ml] = res.copy()

        if not statistics or len(statistics) < 1:
            self.__log("ERROR: No statistics to return!")
            return None
        elif len(statistics) > 1:
            #We still need to build a consensus model out of the stable models 
            #   ONLY if there are more that one model stable!
            stableML={}
            for modelName in statistics:
                StabilityValue = statistics[modelName]["StabilityValue"]
                if StabilityValue is not None:
                    if self.responseType == "Classification":
                        if statc.mean(statistics[modelName]["foldStat"]["nTestCmpds"]) > 50:
                            stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_L
                        else:
                            stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_H
                    else:
                        if statc.mean(statistics[modelName]["foldStat"]["nTestCmpds"]) > 50:
                            stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_L
                        else:
                            stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_H
                    if StabilityValue < stableTH:   # Select only stable models
                        stableML[modelName] = statistics[modelName].copy()
            if len(stableML) >= 2:
                self.__log("Found "+str(len(stableML))+" stable MLmethods out of "+str(len(statistics))+" MLmethods.")
                if self.responseType == "Classification":
                    CLASS0 = str(self.data.domain.classVar.values[0])
                    CLASS1 = str(self.data.domain.classVar.values[1])
                    exprTest0 = "(0"
                    for ml in stableML:
                        exprTest0 += "+( "+ml+" == "+CLASS0+" )*"+str(stableML[ml]["CA"])+" "
                    exprTest0 += ")/IF0(sum([False"
                    for ml in stableML:
                        exprTest0 += ", "+ml+" == "+CLASS0+" "
                    exprTest0 += "]),1)"
                    exprTest1 = exprTest0.replace(CLASS0,CLASS1)
                    expression = [exprTest0+" >= "+exprTest1+" -> "+CLASS0," -> "+CLASS1]
                else:
                    Q2sum = sum([stableML[ml]["Q2"] for ml in stableML])
                    expression = "(1 / "+str(Q2sum)+") * (0"
                    for ml in stableML:
                        expression += " + "+str(stableML[ml]["Q2"])+" * "+ml+" "
                    expression += ")"

                #Var for saving each Fols result
                Cresults = []
                Cexp_pred = []
                CnTrainEx = []
                CnTestEx = []
                self.__log("Calculating the statistics for a Consensus model")
                for foldN in range(self.nExtFolds):
                    testData = self.data.select(DataIdxs[foldN])
                    CnTestEx.append(len(testData))
                    consensusClassifiers = {}
                    for learnerName in stableML:
                        consensusClassifiers[learnerName] = models[learnerName][foldN]

                    model = AZorngConsensus.ConsensusClassifier(classifiers = consensusClassifiers, expression = expression)     
                    CnTrainEx.append(model.NTrainEx)
                    #Test the model
                    if self.responseType == "Classification":
                        Cresults.append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) )
                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        Cresults.append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) )
                        #Save the experimental value and correspondent predicted value
                        Cexp_pred += local_exp_pred

                res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds)
                statistics["Consensus"] = res.copy()
                statistics["Consensus"]["IndividualStatistics"] = stableML.copy()
                self.__writeResults(statistics)
            self.__log("Returned multiple ML methods statistics.")
            return statistics
                 
        #By default return the only existing statistics!
        self.__writeResults(statistics)
        self.__log("Returned only one ML method statistics.")
        return statistics[statistics.keys()[0]]
Exemplo n.º 10
0
def getStatistics(dataset, runningDir, resultsFile, queueType = "NoSGE", verbose = 0, getAllModels = False, callBack = None):
        """
                runningDir           (An existing dir for creating one job dir per fold)
                    |
                    +---- status     (The overall status:   "started", "finished" or the progress "1/10", "2/10", ...)
                    |
                    +---- fold_1
                    |
                    +---- fold_2
                    |
                    .
                    .
                    .
               
            The running will be monitorized by this method.
            Whenever a MLMethod fails the respective fold job is restarted 
        """
        if dataset.domain.classVar.varType == orange.VarTypes.Discrete: 
            responseType = "Classification"
        else:
            responseType = "Regression"
        #Create the Train and test sets
        DataIdxs = dataUtilities.SeedDataSampler(dataset, AZOC.QSARNEXTFOLDS )
        #Check data in advance so that, by chance, it will not faill at the last fold!
        #for foldN in range(AZOC.QSARNEXTFOLDS):
            #trainData = dataset.select(DataIdxs[foldN],negate=1)
            #checkTrainData(trainData)

        jobs = {}
        thisDir = os.getcwd()
        os.chdir(runningDir)
        #PID = os.getpid() 
        #print "Started getStatistics in Process with PID: "+str(PID)
        #os.system('echo "'+str(PID)+'" > '+os.path.join(runningDir,"PID"))
        os.system('echo "started" > '+os.path.join(runningDir,"status"))
        # Start  all Fold jobs
        stepsDone = 0
        nTotalSteps = AZOC.QSARNEXTFOLDS 
        for fold in range(AZOC.QSARNEXTFOLDS):
            job = str(fold)
            print "Starting job for fold ",job
            trainData = dataset.select(DataIdxs[fold],negate=1)
            jobs[job] = {"job":job,"path":os.path.join(runningDir, "fold_"+job), "running":False, "failed":False, "finished":False}

            # Uncomment next 3 lines for running in finished jobs dirs
            #st, jID = commands.getstatusoutput("cat "+os.path.join(runningDir, "fold_"+job,"jID"))
            #jobs[job]["jID"] = jID
            #continue
            
            os.system("rm -rf "+jobs[job]["path"])
            os.system("mkdir -p "+jobs[job]["path"])
            trainData.save(os.path.join(jobs[job]["path"],"trainData.tab"))
            file_h = open(os.path.join(jobs[job]["path"],"run.sh"),"w")
            file_h.write("#!/bin/tcsh\n")
            file_h.write("source "+os.path.join(os.environ["AZORANGEHOME"], "templateProfile") + "\n")
            file_h.write("python "+os.path.join(jobs[job]["path"],"QsubScript.py")+"\n")
            file_h.close()

            file_h = open(os.path.join(jobs[job]["path"],"QsubScript.py"),"w")
            file_h.write("import os\n")
            file_h.write("from AZutilities import dataUtilities\n")
            file_h.write("from AZutilities import competitiveWorkflow\n")
            file_h.write("data = dataUtilities.DataTable('"+os.path.join(jobs[job]["path"],"trainData.tab")+"')\n")
            file_h.write('os.system(\'echo "running" > '+os.path.join(jobs[job]["path"],"status")+' \')\n')
            file_h.write("models = competitiveWorkflow.getModel(data, savePath = '"+os.path.join(jobs[job]["path"],"results.pkl")+"', queueType = '"+queueType+"', getAllModels = "+str(getAllModels)+")\n")
            file_h.write("nModelsSaved = 0\n")
            file_h.write("for model in models:\n")
            file_h.write("    if not models[model] is None:\n")
            file_h.write("        models[model].write('"+os.path.join(jobs[job]["path"],"model")+"'+'_'+model)\n")
            file_h.write('        nModelsSaved += 1\n')
            file_h.write('if nModelsSaved == len([m for m in models if not models[m] is None ]):\n')
            file_h.write('    os.system(\'echo "finished" > '+os.path.join(jobs[job]["path"],"status")+' \')\n')
            file_h.write('else:\n')
            file_h.write('    os.system(\'echo "failed" > '+os.path.join(jobs[job]["path"],"status")+' \')\n')
            file_h.close()
            
            os.chdir(os.path.join(jobs[job]["path"]))
            if queueType == "NoSGE":  # Serial mode
                status, out = commands.getstatusoutput("tcsh " + os.path.join(jobs[job]["path"],"run.sh"))
                if status:
                    print "ERROR on Job "+str(job)+" (will be restarted latter)"
                    print out
                else:
                    statusFile = os.path.join(jobs[job]["path"],"status")
                    if os.path.isfile(statusFile):
                        st, status = commands.getstatusoutput("cat "+statusFile)
                    else:
                        print "ERROR: Missing status file"
                        status = None
                    if not status:
                        print "ERROR! job "+job+" has no status!"
                        jobs[job]["failed"] = True
                    elif status == "failed":
                        print "Job "+job+" failed to build all models"
                        jobs[job]["failed"] = True
                    elif status == "finished":
                        jobs[job]["finished"] = True
 
                    if not isJobProgressingOK(jobs[job]):
                        print "Job "+job+" failed to build one or more models in getMLStatistics"
                        jobs[job]["failed"] = True 
                        jobs[job]["finished"] = False 
                    if jobs[job]["failed"]:
                        print "Job "+job+" FAILED"    
                    else:
                        print "Finished Job "+str(job)+" with success"
                if callBack:
                     stepsDone += 1
                     if not callBack((100*stepsDone)/nTotalSteps): return None    
            else:
                cmd = "qsub -cwd -q batch.q" + AZOC.SGE_QSUB_ARCH_OPTION_CURRENT + os.path.join(jobs[job]["path"],"run.sh")
                status, out = commands.getstatusoutput(cmd)
                if status:
                    print "ERROR on Job "+str(job)+" (will be skipped)"
                    print out
                    #raise Exception("ERROR starting job for folder "+str(job))
                # Your job 955801 ("template_run.sh") has been submitted
                jID = out.strip().split(" ")[2]
                print "    jID: ",jID
                os.system('echo "'+jID+'" > '+os.path.join(jobs[job]["path"], "jID"))
                jobs[job]["running"] = True
                jobs[job]["jID"] = jID
            os.chdir(runningDir)
        os.chdir(thisDir)

        finished = []
        if queueType == "NoSGE":  
            failed = []
            #Report failed Jobs
            for job in jobs:
                if jobs[job]["finished"]:
                    finished.append(job)
            for job in jobs:
                if jobs[job]["failed"]:
                    failed.append(job)
            print "Successful finished Jobs: ",finished
            print "Failed Jobs: ",failed                 
        else:                           # Monitor SGE jobs untill all are finished
            #Monitor Fold jobs
            updateJobsStatus(jobs)
            for job in jobs:
                if jobs[job]["finished"]:
                    finished.append(job)
            print "Jobs already finished: ",finished
            os.system(' echo "'+str(len(finished))+'/'+str(AZOC.QSARNEXTFOLDS)+'" > '+os.path.join(runningDir,"status"))
            while len(finished) < AZOC.QSARNEXTFOLDS:
                print ".",
                sys.stdout.flush() 
                updateJobsStatus(jobs)
                for job in jobs:
                    if jobs[job]["finished"] and job not in finished:
                        finished.append(job)
                        if callBack:
                            stepsDone += 1
                            if not callBack((100*stepsDone)/nTotalSteps): return None
                        print time.asctime()+": Finished job "+str(job)
                os.system(' echo "'+str(len(finished))+'/'+str(AZOC.QSARNEXTFOLDS)+'" > '+os.path.join(runningDir,"status"))
                for job in [j for j in jobs if jobs[j]["failed"]]:
                    jobs[job] = restartJob(jobs[job]) 
                time.sleep(5)                

        print "All fold jobs finished!"
        # Gather the results
        print "Gathering results..."
        #Var for saving each Fols result
        results = {}
        exp_pred = {}
        nTrainEx = {}
        nTestEx = {}
        # Var for saving the statistics results
        statistics = {}

        mlMethods = [ml for ml in AZOC.MLMETHODS] + ["Consensus"] 
        sortedJobs = [job for job in jobs]
        sortedJobs.sort(cmp = lambda x,y:int(x)>int(y) and 1 or -1)
        # Place for storing the selected models results
        results["selectedML"] = []
        exp_pred["selectedML"] = []
        nTrainEx["selectedML"] = []
        nTestEx["selectedML"] = []
        foldSelectedML = []

        for ml in mlMethods:   # Loop over each MLMethod
            try:
                #Var for saving each Fols result
                results[ml] = []
                exp_pred[ml] = []
                nTrainEx[ml] = []
                nTestEx[ml] = []
                logTxt = ""

                
                for job in sortedJobs:   #loop over each fold
                    modelPath = os.path.join(jobs[job]["path"], "model_"+ml)
                    if not os.path.isdir(modelPath):
                        if getAllModels: print "MLMethod "+ml+" not available in fold "+job
                        continue

                    resFile = os.path.join(jobs[job]["path"], "results.pkl")
                    statFile_h = open(resFile)
                    foldStat = pickle.load(statFile_h)
                    statFile_h.close()

                    #load model
                    model = AZBaseClasses.modelRead(modelPath)
                    #Test the model
                    testData = dataset.select(DataIdxs[int(job)])
                    nTrainEx[ml].append(model.NTrainEx)
                    nTestEx[ml].append(len(testData))
                    if foldStat[ml]["selected"]:
                        foldSelectedML.append(ml)
                        nTrainEx["selectedML"].append(model.NTrainEx)
                        nTestEx["selectedML"].append(len(testData))

                    if responseType == "Classification":
                        results[ml].append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) )
                        if foldStat[ml]["selected"]:
                            results["selectedML"].append(results[ml][-1])
                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        results[ml].append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) )
                        #Save the experimental value and correspondent predicted value
                        exp_pred[ml] += local_exp_pred
                        if foldStat[ml]["selected"]:
                            results["selectedML"].append(results[ml][-1])
                            exp_pred["selectedML"]+= local_exp_pred

                res = createStatObj(results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml],responseType, len(sortedJobs), logTxt)
                if not res:
                    raise Exception("No results available!")
                if getAllModels:
                    statistics[ml] = copy.deepcopy(res)
                    writeResults(statistics, resultsFile)
                print "       OK",ml
            except:
                print "Error on MLmethod "+ml+". It will be skipped"
        ml = "selectedML"
        res = createStatObj(results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml],responseType, len(sortedJobs), logTxt, foldSelectedML)
        if not res:
            raise Exception("No results available!")
        statistics[ml] = copy.deepcopy(res)
        writeResults(statistics, resultsFile)
        os.system(' echo "finished" > '+os.path.join(runningDir,"status"))
        return statistics
Exemplo n.º 11
0
    def getAcc(self, callBack=None, callBackWithFoldModel=None):
        """ For regression problems, it returns the RMSE and the Q2 
            For Classification problems, it returns CA and the ConfMat
            The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
            For the EvalResults not supported for a specific learner/datase, the respective result will be None

            if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus
                made out of those that were stable

            It some error occurred, the respective values in the Dict will be None
        """
        self.__log("Starting Calculating MLStatistics")
        statistics = {}
        if not self.__areInputsOK():
            return None
        # Set the response type
        self.responseType = self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression"
        self.__log("  " + str(self.responseType))

        #Create the Train and test sets
        if self.usePreDefFolds:
            DataIdxs = self.preDefIndices
        else:
            DataIdxs = self.sampler(self.data, self.nExtFolds)
        foldsN = [f for f in dict.fromkeys(DataIdxs) if f != 0
                  ]  #Folds used only from 1 on ... 0 are for fixed train Bias
        nFolds = len(foldsN)
        #Fix the Indexes based on DataIdxs
        # (0s) represents the train set  ( >= 1s) represents the test set folds
        if self.useVarCtrlCV:
            nShifted = [0] * nFolds
            for idx, isTest in enumerate(
                    self.preDefIndices
            ):  # self.preDefIndices == 0 are to be used in TrainBias
                if not isTest:
                    if DataIdxs[idx]:
                        nShifted[DataIdxs[idx]] += 1
                        DataIdxs[idx] = 0
            for idx, shift in enumerate(nShifted):
                self.__log("In fold " + str(idx) + ", " + str(shift) +
                           " examples were shifted to the train set.")

        #Var for saving each Fols result
        optAcc = {}
        results = {}
        exp_pred = {}
        nTrainEx = {}
        nTestEx = {}

        #Set a dict of learners
        MLmethods = {}
        if type(self.learner) == dict:
            for ml in self.learner:
                MLmethods[ml] = self.learner[ml]
        else:
            MLmethods[self.learner.name] = self.learner

        models = {}
        self.__log("Calculating Statistics for MLmethods:")
        self.__log("  " + str([x for x in MLmethods]))

        #Check data in advance so that, by chance, it will not faill at the last fold!
        for foldN in foldsN:
            trainData = self.data.select(DataIdxs, foldN, negate=1)
            self.__checkTrainData(trainData)

        #Optional!!
        # Order Learners so that PLS is the first
        sortedML = [ml for ml in MLmethods]
        if "PLS" in sortedML:
            sortedML.remove("PLS")
            sortedML.insert(0, "PLS")

        stepsDone = 0
        nTotalSteps = len(sortedML) * self.nExtFolds
        for ml in sortedML:
            startTime = time.time()
            self.__log("    > " + str(ml) + "...")
            try:
                #Var for saving each Fols result
                results[ml] = []
                exp_pred[ml] = []
                models[ml] = []
                nTrainEx[ml] = []
                nTestEx[ml] = []
                optAcc[ml] = []
                logTxt = ""
                for foldN in foldsN:
                    if type(self.learner) == dict:
                        self.paramList = None

                    trainData = self.data.select(DataIdxs, foldN, negate=1)
                    testData = self.data.select(DataIdxs, foldN)
                    smilesAttr = dataUtilities.getSMILESAttr(trainData)
                    if smilesAttr:
                        self.__log("Found SMILES attribute:" + smilesAttr)
                        if MLmethods[ml].specialType == 1:
                            trainData = dataUtilities.attributeSelectionData(
                                trainData,
                                [smilesAttr, trainData.domain.classVar.name])
                            testData = dataUtilities.attributeSelectionData(
                                testData,
                                [smilesAttr, testData.domain.classVar.name])
                            self.__log(
                                "Selected attrs: " +
                                str([attr.name for attr in trainData.domain]))
                        else:
                            trainData = dataUtilities.attributeDeselectionData(
                                trainData, [smilesAttr])
                            testData = dataUtilities.attributeDeselectionData(
                                testData, [smilesAttr])
                            self.__log("Selected attrs: " + str(
                                [attr.name for attr in trainData.domain[0:3]] +
                                ["..."] + [
                                    attr.name for attr in trainData.
                                    domain[len(trainData.domain) - 3:]
                                ]))

                    nTrainEx[ml].append(len(trainData))
                    nTestEx[ml].append(len(testData))
                    #Test if trainsets inside optimizer will respect dataSize criterias.
                    #  if not, don't optimize, but still train the model
                    dontOptimize = False
                    if self.responseType != "Classification" and (
                            len(trainData) *
                        (1 - 1.0 / self.nInnerFolds) < 20):
                        dontOptimize = True
                    else:
                        tmpDataIdxs = self.sampler(trainData, self.nInnerFolds)
                        tmpTrainData = trainData.select(tmpDataIdxs,
                                                        1,
                                                        negate=1)
                        if not self.__checkTrainData(tmpTrainData, False):
                            dontOptimize = True

                    SpecialModel = None
                    if dontOptimize:
                        logTxt += "       Fold " + str(
                            foldN
                        ) + ": Too few compounds to optimize model hyper-parameters\n"
                        self.__log(logTxt)
                        if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                            res = evalUtilities.crossValidation(
                                [MLmethods[ml]],
                                trainData,
                                folds=5,
                                stratified=orange.MakeRandomIndices.
                                StratifiedIfPossible,
                                random_generator=random.randint(0, 100))
                            CA = evalUtilities.CA(res)[0]
                            optAcc[ml].append(CA)
                        else:
                            res = evalUtilities.crossValidation(
                                [MLmethods[ml]],
                                trainData,
                                folds=5,
                                stratified=orange.MakeRandomIndices.
                                StratifiedIfPossible,
                                random_generator=random.randint(0, 100))
                            R2 = evalUtilities.R2(res)[0]
                            optAcc[ml].append(R2)
                    else:
                        if MLmethods[ml].specialType == 1:
                            if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                                optInfo, SpecialModel = MLmethods[
                                    ml].optimizePars(trainData, folds=5)
                                optAcc[ml].append(optInfo["Acc"])
                            else:
                                res = evalUtilities.crossValidation(
                                    [MLmethods[ml]],
                                    trainData,
                                    folds=5,
                                    stratified=orange.MakeRandomIndices.
                                    StratifiedIfPossible,
                                    random_generator=random.randint(0, 100))
                                R2 = evalUtilities.R2(res)[0]
                                optAcc[ml].append(R2)
                        else:
                            runPath = miscUtilities.createScratchDir(
                                baseDir=AZOC.NFS_SCRATCHDIR,
                                desc="AccWOptParam",
                                seed=id(trainData))
                            trainData.save(
                                os.path.join(runPath, "trainData.tab"))
                            tunedPars = paramOptUtilities.getOptParam(
                                learner=MLmethods[ml],
                                trainDataFile=os.path.join(
                                    runPath, "trainData.tab"),
                                paramList=self.paramList,
                                useGrid=False,
                                verbose=self.verbose,
                                queueType=self.queueType,
                                runPath=runPath,
                                nExtFolds=None,
                                nFolds=self.nInnerFolds,
                                logFile=self.logFile,
                                getTunedPars=True,
                                fixedParams=self.fixedParams)
                            if not MLmethods[ml] or not MLmethods[ml].optimized:
                                self.__log(
                                    "       WARNING: GETACCWOPTPARAM: The learner "
                                    + str(ml) + " was not optimized.")
                                self.__log(
                                    "                It will be ignored")
                                #self.__log("                It will be set to default parameters")
                                self.__log(
                                    "                    DEBUG can be done in: "
                                    + runPath)
                                #Set learner back to default
                                #MLmethods[ml] = MLmethods[ml].__class__()
                                raise Exception("The learner " + str(ml) +
                                                " was not optimized.")
                            else:
                                if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                                    optAcc[ml].append(tunedPars[0])
                                else:
                                    res = evalUtilities.crossValidation(
                                        [MLmethods[ml]],
                                        trainData,
                                        folds=5,
                                        stratified=orange.MakeRandomIndices.
                                        StratifiedIfPossible,
                                        random_generator=random.randint(
                                            0, 100))
                                    R2 = evalUtilities.R2(res)[0]
                                    optAcc[ml].append(R2)

                                miscUtilities.removeDir(runPath)
                    #Train the model
                    if SpecialModel is not None:
                        model = SpecialModel
                    else:
                        model = MLmethods[ml](trainData)
                    models[ml].append(model)
                    #Test the model
                    if self.responseType == "Classification":
                        results[ml].append(
                            (evalUtilities.getClassificationAccuracy(
                                testData, model),
                             evalUtilities.getConfMat(testData, model)))
                    else:
                        local_exp_pred = []
                        # Predict using bulk-predict
                        predictions = model(testData)
                        # Gather predictions
                        for n, ex in enumerate(testData):
                            local_exp_pred.append(
                                (ex.getclass().value, predictions[n].value))
                        results[ml].append(
                            (evalUtilities.calcRMSE(local_exp_pred),
                             evalUtilities.calcRsqrt(local_exp_pred)))
                        #Save the experimental value and correspondent predicted value
                        exp_pred[ml] += local_exp_pred
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None
                    if callBackWithFoldModel:
                        callBackWithFoldModel(model)

                res = self.createStatObj(
                    results[ml],
                    exp_pred[ml],
                    nTrainEx[ml],
                    nTestEx[ml],
                    self.responseType,
                    self.nExtFolds,
                    logTxt,
                    labels=hasattr(self.data.domain.classVar, "values")
                    and list(self.data.domain.classVar.values) or None)
                if self.verbose > 0:
                    print "UnbiasedAccuracyGetter!Results  " + ml + ":\n"
                    pprint(res)
                if not res:
                    raise Exception("No results available!")
                res["runningTime"] = time.time() - startTime
                statistics[ml] = copy.deepcopy(res)
                self.__writeResults(statistics)
                self.__log("       OK")
            except:
                self.__log("       Learner " + str(ml) +
                           " failed to create/optimize the model!")
                error = str(sys.exc_info()[0]) +" "+\
                            str(sys.exc_info()[1]) +" "+\
                            str(traceback.extract_tb(sys.exc_info()[2]))
                self.__log(error)

                res = self.createStatObj()
                statistics[ml] = copy.deepcopy(res)
                self.__writeResults(statistics)

        if not statistics or len(statistics) < 1:
            self.__log("ERROR: No statistics to return!")
            return None
        elif len(statistics) > 1:
            #We still need to build a consensus model out of the stable models
            #   ONLY if there are more that one model stable!
            #   When only one or no stable models, build a consensus based on all models
            # ALWAYS exclude specialType models (MLmethods[ml].specialType > 0)
            consensusMLs = {}
            for modelName in statistics:
                StabilityValue = statistics[modelName]["StabilityValue"]
                if StabilityValue is not None and statistics[modelName][
                        "stable"]:
                    consensusMLs[modelName] = copy.deepcopy(
                        statistics[modelName])

            self.__log("Found " + str(len(consensusMLs)) +
                       " stable MLmethods out of " + str(len(statistics)) +
                       " MLmethods.")

            if len(consensusMLs
                   ) <= 1:  # we need more models to build a consensus!
                consensusMLs = {}
                for modelName in statistics:
                    consensusMLs[modelName] = copy.deepcopy(
                        statistics[modelName])

            # Exclude specialType models
            excludeThis = []
            for learnerName in consensusMLs:
                if models[learnerName][0].specialType > 0:
                    excludeThis.append(learnerName)
            for learnerName in excludeThis:
                consensusMLs.pop(learnerName)
                self.__log("    > Excluded special model " + learnerName)
            self.__log("    > Stable modules: " + str(consensusMLs.keys()))

            if len(consensusMLs) >= 2:
                #Var for saving each Fols result
                startTime = time.time()
                Cresults = []
                Cexp_pred = []
                CnTrainEx = []
                CnTestEx = []
                self.__log(
                    "Calculating the statistics for a Consensus model based on "
                    + str([ml for ml in consensusMLs]))
                for foldN in range(self.nExtFolds):
                    if self.responseType == "Classification":
                        CLASS0 = str(self.data.domain.classVar.values[0])
                        CLASS1 = str(self.data.domain.classVar.values[1])
                        # exprTest0
                        exprTest0 = "(0"
                        for ml in consensusMLs:
                            exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(
                                optAcc[ml][foldN]) + " "
                        exprTest0 += ")/IF0(sum([False"
                        for ml in consensusMLs:
                            exprTest0 += ", " + ml + " == " + CLASS0 + " "
                        exprTest0 += "]),1)"
                        # exprTest1
                        exprTest1 = "(0"
                        for ml in consensusMLs:
                            exprTest1 += "+( " + ml + " == " + CLASS1 + " )*" + str(
                                optAcc[ml][foldN]) + " "
                        exprTest1 += ")/IF0(sum([False"
                        for ml in consensusMLs:
                            exprTest1 += ", " + ml + " == " + CLASS1 + " "
                        exprTest1 += "]),1)"
                        # Expression
                        expression = [
                            exprTest0 + " >= " + exprTest1 + " -> " + CLASS0,
                            " -> " + CLASS1
                        ]
                    else:
                        Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs])
                        expression = "(1 / " + str(Q2sum) + ") * (0"
                        for ml in consensusMLs:
                            expression += " + " + str(
                                optAcc[ml][foldN]) + " * " + ml + " "
                        expression += ")"

                    testData = self.data.select(
                        DataIdxs, foldN + 1)  # fold 0 if for the train Bias!!
                    smilesAttr = dataUtilities.getSMILESAttr(testData)
                    if smilesAttr:
                        self.__log("Found SMILES attribute:" + smilesAttr)
                        testData = dataUtilities.attributeDeselectionData(
                            testData, [smilesAttr])
                        self.__log("Selected attrs: " + str(
                            [attr.name
                             for attr in trainData.domain[0:3]] + ["..."] + [
                                 attr.name for attr in
                                 trainData.domain[len(trainData.domain) - 3:]
                             ]))

                    CnTestEx.append(len(testData))
                    consensusClassifiers = {}
                    for learnerName in consensusMLs:
                        consensusClassifiers[learnerName] = models[
                            learnerName][foldN]

                    model = AZorngConsensus.ConsensusClassifier(
                        classifiers=consensusClassifiers,
                        expression=expression)
                    CnTrainEx.append(model.NTrainEx)
                    #Test the model
                    if self.responseType == "Classification":
                        Cresults.append(
                            (evalUtilities.getClassificationAccuracy(
                                testData, model),
                             evalUtilities.getConfMat(testData, model)))
                    else:
                        local_exp_pred = []
                        # Predict using bulk-predict
                        predictions = model(testData)
                        # Gather predictions
                        for n, ex in enumerate(testData):
                            local_exp_pred.append(
                                (ex.getclass().value, predictions[n].value))
                        Cresults.append(
                            (evalUtilities.calcRMSE(local_exp_pred),
                             evalUtilities.calcRsqrt(local_exp_pred)))
                        #Save the experimental value and correspondent predicted value
                        Cexp_pred += local_exp_pred

                res = self.createStatObj(
                    Cresults,
                    Cexp_pred,
                    CnTrainEx,
                    CnTestEx,
                    self.responseType,
                    self.nExtFolds,
                    labels=hasattr(self.data.domain.classVar, "values")
                    and list(self.data.domain.classVar.values) or None)
                res["runningTime"] = time.time() - startTime
                statistics["Consensus"] = copy.deepcopy(res)
                statistics["Consensus"][
                    "IndividualStatistics"] = copy.deepcopy(consensusMLs)
                self.__writeResults(statistics)
            self.__log("Returned multiple ML methods statistics.")
            return statistics

        #By default return the only existing statistics!
        self.__writeResults(statistics)
        self.__log("Returned only one ML method statistics.")
        return statistics[statistics.keys()[0]]