def getAccStat(rankSumTuple, nDesc, train, randTest, extTest, resultsFid, projectName):
    print "Select features based on top ranked features"
    attrList = []
    for elem in rankSumTuple:
        if len(attrList) < nDesc:
            attrList.append(elem[0])
    train = dataUtilities.attributeSelectionData(train, attrList)
    train = dataUtilities.attributeDeselectionData(train, ['HLM_XEN025;Mean;CLint (uL/min/mg);(Num)'])
    print train.domain.attributes, len(train.domain.attributes), train.domain.classVar

    # Get accuracies
    learners = [AZorngRF.RFLearner(nTrees=100)]
    print "CV accuracy"
    MCC_CV = printCV(train, learners, resultsFid, projectName)
    Model = learners[0](train)
    print "Random Test set accuracy"
    MCC_rand = printTestSetAcc(Model, randTest, learners, resultsFid, projectName, True)
    print "External Test set accuracy"
    MCC_ext = printTestSetAcc(Model, extTest, learners, resultsFid, projectName, False)
    return MCC_CV, MCC_rand, MCC_ext
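# Hedged usage sketch (added for illustration, not part of the original script): getAccStat expects
# rankSumTuple to be a sequence of (attributeName, averagedRank) tuples with the best-ranked
# descriptor first, as produced by sortRankSum in Wrapper below. The file name, descriptor names and
# project label here are purely illustrative.
def _exampleGetAccStatUsage(train, randTest, extTest):
    resultsFid = open("exampleResults.txt", "w")                    # hypothetical results file
    rankSumTuple = [("ACDlogD74", 1.5), ("Caco2_intrinsic", 3.0)]   # made-up ranking
    MCC_CV, MCC_rand, MCC_ext = getAccStat(rankSumTuple, 2, train, randTest,
                                           extTest, resultsFid, "exampleProject")
    resultsFid.close()
    return MCC_CV, MCC_rand, MCC_ext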
def buildConsensus(trainData, learners, MLMethods, logFile = None):
    log(logFile, "Building a consensus model based on optimized MLmethods: "+str([ml for ml in MLMethods])+"...")
    if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
        # Expression: If CAavg_{POS} >= CAavg_{NEG} -> POS else -> NEG
        #   where CAavg_{POS} is the average of the classification accuracies of all models predicting POS.
        CLASS0 = str(trainData.domain.classVar.values[0])
        CLASS1 = str(trainData.domain.classVar.values[1])
        # exprTest0
        exprTest0 = "(0"
        for ml in MLMethods:
            exprTest0 += "+( "+ml+" == "+CLASS0+" )*"+str(MLMethods[ml]["optAcc"])+" "
        exprTest0 += ")/IF0(sum([False"
        for ml in MLMethods:
            exprTest0 += ", "+ml+" == "+CLASS0+" "
        exprTest0 += "]),1)"
        # exprTest1
        exprTest1 = "(0"
        for ml in MLMethods:
            exprTest1 += "+( "+ml+" == "+CLASS1+" )*"+str(MLMethods[ml]["optAcc"])+" "
        exprTest1 += ")/IF0(sum([False"
        for ml in MLMethods:
            exprTest1 += ", "+ml+" == "+CLASS1+" "
        exprTest1 += "]),1)"
        # expression
        expression = [exprTest0+" >= "+exprTest1+" -> "+CLASS0, " -> "+CLASS1]
    else:
        Q2sum = sum([MLMethods[ml]["optAcc"] for ml in MLMethods])
        expression = "(1 / "+str(Q2sum)+") * (0"
        for ml in MLMethods:
            expression += " + "+str(MLMethods[ml]["optAcc"])+" * "+ml+" "
        expression += ")"

    consensusLearners = {}
    for learnerName in learners:
        consensusLearners[learnerName] = learners[learnerName]

    learner = AZorngConsensus.ConsensusLearner(learners = consensusLearners, expression = expression)
    log(logFile, "  Training Consensus Learner")
    # Keep only the SMILES attribute for special learners; otherwise drop it before training
    smilesAttr = dataUtilities.getSMILESAttr(trainData)
    if smilesAttr:
        log(logFile, "Found SMILES attribute:"+smilesAttr)
        if learner.specialType == 1:
            trainData = dataUtilities.attributeSelectionData(trainData, [smilesAttr, trainData.domain.classVar.name])
            log(logFile, "Selected attrs: "+str([attr.name for attr in trainData.domain]))
        else:
            trainData = dataUtilities.attributeDeselectionData(trainData, [smilesAttr])
            log(logFile, "Selected attrs: "+str([attr.name for attr in trainData.domain[0:3]] + ["..."] +
                                                [attr.name for attr in trainData.domain[len(trainData.domain)-3:]]))
    return learner(trainData)
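# Worked illustration (added note, not original code): for two hypothetical learners named RF and SVM
# with optAcc 0.8 and 0.7, the regression branch above produces a Q2-weighted average of the
# individual predictions, e.g.
#     "(1 / 1.5) * (0 + 0.8 * RF  + 0.7 * SVM )"
# while the classification branch produces a rule pair that compares, per example, the average optAcc
# of the learners voting for each class (the IF0(..., 1) wrapper guards against division by zero when
# no learner votes for that class):
#     ["<avg optAcc voting CLASS0> >= <avg optAcc voting CLASS1> -> CLASS0", " -> CLASS1"]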
def filterDesc(data, zeroFracT = 0.95, LowVarT = 0.01, HighCorrT = 0.95):
    print "Initial number of descriptors ", len(data.domain.attributes)

    # Keep the descriptors for which the fraction of zeros is smaller than zeroFracT
    attrList = []
    rmAttrList = []
    for attr in data.domain.attributes:
        valueList = []
        nZero = 0
        for ex in data:
            value = ex[attr.name].value
            if value == 0:
                nZero = nZero + 1
            valueList.append(value)
        zeroFrac = float(nZero)/len(valueList)
        if zeroFrac < zeroFracT:
            attrList.append(attr.name)
        else:
            rmAttrList.append(attr.name)
    print "Descriptors deselected because of a large fraction of zeros: "
    print rmAttrList
    data = dataUtilities.attributeSelectionData(data, attrList)
    print "Remaining number of descriptors ", len(data.domain.attributes)

    # Filter descriptors based on the normalized variance (variance/mean);
    # note that this assumes a non-zero mean for every remaining descriptor
    rmAttrList = []
    for attr in data.domain.attributes:
        valueList = []
        for ex in data:
            value = ex[attr.name].value
            valueList.append(value)
        variance = numpy.var(valueList)
        mean = numpy.mean(valueList)
        normVar = variance/mean
        if normVar < LowVarT:
            rmAttrList.append(attr.name)
    print "Descriptors deselected because of low variance "
    print rmAttrList
    data = dataUtilities.attributeDeselectionData(data, rmAttrList)
    print "Remaining number of descriptors ", len(data.domain.attributes)

    # HighCorrT is accepted but not used yet
    print "Correlation filter not implemented yet"
    return data
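# Hedged usage sketch (added for illustration, not part of the original module): filterDesc is meant
# to be applied to a descriptor table before modelling. The file name is hypothetical; the thresholds
# shown are simply the defaults, and HighCorrT is accepted but not yet applied (see the print above).
def _exampleFilterDescUsage():
    data = dataUtilities.DataTable("exampleDescriptors.tab")   # hypothetical input file
    filtered = filterDesc(data, zeroFracT=0.95, LowVarT=0.01, HighCorrT=0.95)
    return filtered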
def buildModel(trainData, MLMethod, queueType="NoSGE", verbose=0, logFile=None):
    """
        Build the method passed in MLMethod and optimize it ("IndividualStatistics" not in MLMethod).
        If MLMethod is a Consensus ("IndividualStatistics" in MLMethod), build and optimize all
        individual models first, and only then build the consensus!
    """
    log(logFile, "Building and optimizing learner: " + MLMethod["MLMethod"] + "...")
    learners = {}
    MLMethods = {}
    if "IndividualStatistics" in MLMethod:
        # It is a consensus and will certainly not contain any
        # special model, as these were filtered out in getUnbiasedAcc
        for ML in MLMethod["IndividualStatistics"]:
            MLMethods[ML] = copy.deepcopy(MLMethod["IndividualStatistics"][ML])
    else:
        ML = MLMethod["MLMethod"]
        if MLMETHODS[ML](name=ML).specialType == 1:
            # It is a special model and has a built-in optimizer
            log(logFile, "       This is a special model")
            smilesAttr = dataUtilities.getSMILESAttr(trainData)
            if smilesAttr:
                log(logFile, "Found SMILES attribute:" + smilesAttr)
                trainData = dataUtilities.attributeSelectionData(trainData, [smilesAttr, trainData.domain.classVar.name])
            optInfo, SpecialModel = MLMETHODS[ML](name=ML).optimizePars(trainData, folds=5)
            return SpecialModel
        else:
            MLMethods[MLMethod["MLMethod"]] = MLMethod

    smilesAttr = dataUtilities.getSMILESAttr(trainData)
    if smilesAttr:
        trainData = dataUtilities.attributeDeselectionData(trainData, [smilesAttr])

    # Optimize all MLMethods
    for ML in MLMethods:
        log(logFile, "  Optimizing MLmethod: " + ML)
        learners[ML] = MLMETHODS[ML](name=ML)

        runPath = miscUtilities.createScratchDir(baseDir=AZOC.NFS_SCRATCHDIR,
                                                 desc="competitiveWorkflow_BuildModel")
        trainData.save(os.path.join(runPath, "trainData.tab"))

        tunedPars = paramOptUtilities.getOptParam(learner=learners[ML],
                                                  trainDataFile=os.path.join(runPath, "trainData.tab"),
                                                  useGrid=False,
                                                  verbose=verbose,
                                                  queueType=queueType,
                                                  runPath=runPath,
                                                  nExtFolds=None,
                                                  logFile=logFile,
                                                  getTunedPars=True)

        if not learners[ML].optimized:
            print "WARNING: competitiveWorkflow: The learner " + str(learners[ML]) + " was not optimized."
            #print "         Using default parameters"
            print "         The " + str(learners[ML]) + " will not be included"
            #print "         Returning None"
            print "         DEBUG can be made in: " + runPath
            #Setting default parameters
            #learners[ML] = learners[ML].__class__()
            #return None
            learners.pop(ML)
            continue
        else:
            print "Optimized learner ", learners[ML]
            if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                MLMethods[ML]["optAcc"] = tunedPars[0]
            else:
                res = orngTest.crossValidation([learners[ML]], trainData, folds=5,
                                               strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                               randomGenerator=random.randint(0, 100))
                R2 = evalUtilities.R2(res)[0]
                MLMethods[ML]["optAcc"] = R2
        miscUtilities.removeDir(runPath)

    # Train the model
    if len(learners) == 1:
        log(logFile, "  Building the model: " + learners.keys()[0])
        model = learners[learners.keys()[0]](trainData)
    elif len(learners) >= 1:
        model = buildConsensus(trainData, learners, MLMethods)
    else:
        print "ERROR: No Learners were selected!"
        return None

    return model
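# Hedged usage sketch (added for illustration, not part of the original module): buildModel takes an
# MLMethod dict. With only an "MLMethod" key a single learner is optimized and trained; with an
# "IndividualStatistics" key each listed learner is optimized and a consensus is built. The learner
# names below are assumptions and must exist as keys of MLMETHODS.
def _exampleBuildModelUsage(trainData):
    singleMethod = {"MLMethod": "RF"}
    rfModel = buildModel(trainData, singleMethod, queueType="NoSGE", verbose=0, logFile=None)

    consensusMethod = {"MLMethod": "Consensus",
                       "IndividualStatistics": {"RF": {"MLMethod": "RF"},
                                                "SVM": {"MLMethod": "SVM"}}}
    consensusModel = buildModel(trainData, consensusMethod)
    return rfModel, consensusModel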
print "Number of attributes ", nAttr print "Maximum number of desc combinations ", pow(2, nAttr) print "Ndesc must be lower than the max number of desc combinations" print NdescComb # Randomly sample Ndesc combinations attrList = getDescComb(data, nAttr, NdescComb) # Rank the accuracy of each descriptor by averaging the accuracy of all models including a descriptor # Select all descriptors above median accuracy and repeat the random sampling of desc combinations return attrList if __name__ == "__main__": dataFile = "trainDataAllEP.txt" data = dataUtilities.DataTable(dataFile) attrList = [ "IT03423_Seq_BF", "hERG_IW_pIC50", "IT03423_BF", "IT03423_perc101_BF", "Caco2_intrinsic", "ACDlogD74", "Conc_QTc", "IT03713_BF", "IT10850_BF", "IT22015_BF", "IT22016_BF" ] data = dataUtilities.attributeSelectionData(data, attrList) NdescComb = 100 # Number of desc combinations to sample in the first iteration attrList = descSelection(data, NdescComb) print attrList
def getAcc(self, callBack = None, callBackWithFoldModel = None):
    """ For regression problems, it returns the RMSE and the Q2.
        For classification problems, it returns the CA and the ConfMat.
        The return is made in a Dict: {"RMSE":0.2, "Q2":0.1, "CA":0.98, "CM":[[TP, FP],[FN, TN]]}
        For the EvalResults not supported by a specific learner/dataset, the respective result will be None.

        If the learner is a dict {"LearnerName":learner, ...}, the results will be a dict with results
        for all learners and for a consensus made out of those that were stable.

        If some error occurred, the respective values in the Dict will be None.
    """
    self.__log("Starting Calculating MLStatistics")
    statistics = {}
    if not self.__areInputsOK():
        return None

    # Set the response type
    self.responseType = self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression"
    self.__log("  "+str(self.responseType))

    # Create the train and test sets
    if self.usePreDefFolds:
        DataIdxs = self.preDefIndices
    else:
        DataIdxs = self.sampler(self.data, self.nExtFolds)
    foldsN = [f for f in dict.fromkeys(DataIdxs) if f != 0]  # Folds used only from 1 on ... 0 is for the fixed train Bias
    nFolds = len(foldsN)

    # Fix the indexes based on DataIdxs
    #   0 represents the train set; values >= 1 represent the test set folds
    if self.useVarCtrlCV:
        nShifted = [0] * nFolds
        for idx, isTest in enumerate(self.preDefIndices):  # self.preDefIndices == 0 are to be used in TrainBias
            if not isTest:
                if DataIdxs[idx]:
                    nShifted[DataIdxs[idx]] += 1
                    DataIdxs[idx] = 0
        for idx, shift in enumerate(nShifted):
            self.__log("In fold "+str(idx)+", "+str(shift)+" examples were shifted to the train set.")

    # Vars for saving each fold's result
    optAcc = {}
    results = {}
    exp_pred = {}
    nTrainEx = {}
    nTestEx = {}

    # Set a dict of learners
    MLmethods = {}
    if type(self.learner) == dict:
        for ml in self.learner:
            MLmethods[ml] = self.learner[ml]
    else:
        MLmethods[self.learner.name] = self.learner

    models = {}
    self.__log("Calculating Statistics for MLmethods:")
    self.__log("  "+str([x for x in MLmethods]))

    # Check the data in advance so that, by chance, it will not fail at the last fold!
    for foldN in foldsN:
        trainData = self.data.select(DataIdxs, foldN, negate=1)
        self.__checkTrainData(trainData)  # Optional!!
    # Order the learners so that PLS is the first
    sortedML = [ml for ml in MLmethods]
    if "PLS" in sortedML:
        sortedML.remove("PLS")
        sortedML.insert(0, "PLS")

    stepsDone = 0
    nTotalSteps = len(sortedML) * self.nExtFolds
    for ml in sortedML:
        startTime = time.time()
        self.__log("    > "+str(ml)+"...")
        try:
            # Vars for saving each fold's result
            results[ml] = []
            exp_pred[ml] = []
            models[ml] = []
            nTrainEx[ml] = []
            nTestEx[ml] = []
            optAcc[ml] = []
            logTxt = ""
            for foldN in foldsN:
                if type(self.learner) == dict:
                    self.paramList = None

                trainData = self.data.select(DataIdxs, foldN, negate=1)
                testData = self.data.select(DataIdxs, foldN)
                smilesAttr = dataUtilities.getSMILESAttr(trainData)
                if smilesAttr:
                    self.__log("Found SMILES attribute:"+smilesAttr)
                    if MLmethods[ml].specialType == 1:
                        trainData = dataUtilities.attributeSelectionData(trainData, [smilesAttr, trainData.domain.classVar.name])
                        testData = dataUtilities.attributeSelectionData(testData, [smilesAttr, testData.domain.classVar.name])
                        self.__log("Selected attrs: "+str([attr.name for attr in trainData.domain]))
                    else:
                        trainData = dataUtilities.attributeDeselectionData(trainData, [smilesAttr])
                        testData = dataUtilities.attributeDeselectionData(testData, [smilesAttr])
                        self.__log("Selected attrs: "+str([attr.name for attr in trainData.domain[0:3]] + ["..."] +
                                                          [attr.name for attr in trainData.domain[len(trainData.domain)-3:]]))

                nTrainEx[ml].append(len(trainData))
                nTestEx[ml].append(len(testData))
                # Test if the train sets inside the optimizer will respect the dataSize criteria.
                #   If not, don't optimize, but still train the model
                dontOptimize = False
                if self.responseType != "Classification" and (len(trainData)*(1-1.0/self.nInnerFolds) < 20):
                    dontOptimize = True
                else:
                    tmpDataIdxs = self.sampler(trainData, self.nInnerFolds)
                    tmpTrainData = trainData.select(tmpDataIdxs, 1, negate=1)
                    if not self.__checkTrainData(tmpTrainData, False):
                        dontOptimize = True

                SpecialModel = None
                if dontOptimize:
                    logTxt += "       Fold "+str(foldN)+": Too few compounds to optimize model hyper-parameters\n"
                    self.__log(logTxt)
                    if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                        res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5,
                                                            stratified=orange.MakeRandomIndices.StratifiedIfPossible,
                                                            random_generator=random.randint(0, 100))
                        CA = evalUtilities.CA(res)[0]
                        optAcc[ml].append(CA)
                    else:
                        res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5,
                                                            stratified=orange.MakeRandomIndices.StratifiedIfPossible,
                                                            random_generator=random.randint(0, 100))
                        R2 = evalUtilities.R2(res)[0]
                        optAcc[ml].append(R2)
                else:
                    if MLmethods[ml].specialType == 1:
                        # Special models have a built-in optimizer
                        if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                            optInfo, SpecialModel = MLmethods[ml].optimizePars(trainData, folds=5)
                            optAcc[ml].append(optInfo["Acc"])
                        else:
                            res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5,
                                                                stratified=orange.MakeRandomIndices.StratifiedIfPossible,
                                                                random_generator=random.randint(0, 100))
                            R2 = evalUtilities.R2(res)[0]
                            optAcc[ml].append(R2)
                    else:
                        runPath = miscUtilities.createScratchDir(baseDir=AZOC.NFS_SCRATCHDIR,
                                                                 desc="AccWOptParam", seed=id(trainData))
                        trainData.save(os.path.join(runPath, "trainData.tab"))
                        tunedPars = paramOptUtilities.getOptParam(
                            learner=MLmethods[ml],
                            trainDataFile=os.path.join(runPath, "trainData.tab"),
                            paramList=self.paramList,
                            useGrid=False,
                            verbose=self.verbose,
                            queueType=self.queueType,
                            runPath=runPath,
                            nExtFolds=None,
                            nFolds=self.nInnerFolds,
                            logFile=self.logFile,
                            getTunedPars=True,
                            fixedParams=self.fixedParams)
                        if not MLmethods[ml] or not MLmethods[ml].optimized:
                            self.__log("       WARNING: GETACCWOPTPARAM: The learner "+str(ml)+" was not optimized.")
                            self.__log("                It will be ignored")
                            #self.__log("                It will be set to default parameters")
                            self.__log("                DEBUG can be done in: "+runPath)
                            #Set learner back to default
                            #MLmethods[ml] = MLmethods[ml].__class__()
                            raise Exception("The learner "+str(ml)+" was not optimized.")
                        else:
                            if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                                optAcc[ml].append(tunedPars[0])
                            else:
                                res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5,
                                                                    stratified=orange.MakeRandomIndices.StratifiedIfPossible,
                                                                    random_generator=random.randint(0, 100))
                                R2 = evalUtilities.R2(res)[0]
                                optAcc[ml].append(R2)
                            miscUtilities.removeDir(runPath)

                # Train the model
                if SpecialModel is not None:
                    model = SpecialModel
                else:
                    model = MLmethods[ml](trainData)
                models[ml].append(model)

                # Test the model
                if self.responseType == "Classification":
                    results[ml].append((evalUtilities.getClassificationAccuracy(testData, model),
                                        evalUtilities.getConfMat(testData, model)))
                else:
                    local_exp_pred = []
                    # Predict using bulk-predict
                    predictions = model(testData)
                    # Gather the predictions
                    for n, ex in enumerate(testData):
                        local_exp_pred.append((ex.getclass().value, predictions[n].value))
                    results[ml].append((evalUtilities.calcRMSE(local_exp_pred),
                                        evalUtilities.calcRsqrt(local_exp_pred)))
                    # Save the experimental values and the corresponding predicted values
                    exp_pred[ml] += local_exp_pred

                if callBack:
                    stepsDone += 1
                    if not callBack((100*stepsDone)/nTotalSteps):
                        return None
                if callBackWithFoldModel:
                    callBackWithFoldModel(model)

            res = self.createStatObj(results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml],
                                     self.responseType, self.nExtFolds, logTxt,
                                     labels=hasattr(self.data.domain.classVar, "values") and
                                            list(self.data.domain.classVar.values) or None)
            if self.verbose > 0:
                print "UnbiasedAccuracyGetter!Results "+ml+":\n"
                pprint(res)
            if not res:
                raise Exception("No results available!")
            res["runningTime"] = time.time() - startTime
            statistics[ml] = copy.deepcopy(res)
            self.__writeResults(statistics)
            self.__log("       OK")
        except:
            self.__log("       Learner "+str(ml)+" failed to create/optimize the model!")
            error = str(sys.exc_info()[0]) + " " + \
                    str(sys.exc_info()[1]) + " " + \
                    str(traceback.extract_tb(sys.exc_info()[2]))
            self.__log(error)
            res = self.createStatObj()
            statistics[ml] = copy.deepcopy(res)
            self.__writeResults(statistics)

    if not statistics or len(statistics) < 1:
        self.__log("ERROR: No statistics to return!")
        return None
    elif len(statistics) > 1:
        # We still need to build a consensus model out of the stable models,
        #   ONLY if there is more than one stable model!
        #   When there is only one or no stable model, build a consensus based on all models.
        #   ALWAYS exclude specialType models (MLmethods[ml].specialType > 0)
        consensusMLs = {}
        for modelName in statistics:
            StabilityValue = statistics[modelName]["StabilityValue"]
            if StabilityValue is not None and statistics[modelName]["stable"]:
                consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

        self.__log("Found "+str(len(consensusMLs))+" stable MLmethods out of "+str(len(statistics))+" MLmethods.")
        if len(consensusMLs) <= 1:  # we need more models to build a consensus!
            consensusMLs = {}
            for modelName in statistics:
                consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

        # Exclude the specialType models
        excludeThis = []
        for learnerName in consensusMLs:
            if models[learnerName][0].specialType > 0:
                excludeThis.append(learnerName)
        for learnerName in excludeThis:
            consensusMLs.pop(learnerName)
            self.__log("    > Excluded special model " + learnerName)
        self.__log("    > Stable models: " + str(consensusMLs.keys()))

        if len(consensusMLs) >= 2:
            # Vars for saving each fold's result
            startTime = time.time()
            Cresults = []
            Cexp_pred = []
            CnTrainEx = []
            CnTestEx = []
            self.__log("Calculating the statistics for a Consensus model based on "+str([ml for ml in consensusMLs]))
            for foldN in range(self.nExtFolds):
                if self.responseType == "Classification":
                    CLASS0 = str(self.data.domain.classVar.values[0])
                    CLASS1 = str(self.data.domain.classVar.values[1])
                    # exprTest0
                    exprTest0 = "(0"
                    for ml in consensusMLs:
                        exprTest0 += "+( "+ml+" == "+CLASS0+" )*"+str(optAcc[ml][foldN])+" "
                    exprTest0 += ")/IF0(sum([False"
                    for ml in consensusMLs:
                        exprTest0 += ", "+ml+" == "+CLASS0+" "
                    exprTest0 += "]),1)"
                    # exprTest1
                    exprTest1 = "(0"
                    for ml in consensusMLs:
                        exprTest1 += "+( "+ml+" == "+CLASS1+" )*"+str(optAcc[ml][foldN])+" "
                    exprTest1 += ")/IF0(sum([False"
                    for ml in consensusMLs:
                        exprTest1 += ", "+ml+" == "+CLASS1+" "
                    exprTest1 += "]),1)"
                    # Expression
                    expression = [exprTest0+" >= "+exprTest1+" -> "+CLASS0, " -> "+CLASS1]
                else:
                    Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs])
                    expression = "(1 / "+str(Q2sum)+") * (0"
                    for ml in consensusMLs:
                        expression += " + "+str(optAcc[ml][foldN])+" * "+ml+" "
                    expression += ")"

                testData = self.data.select(DataIdxs, foldN+1)  # fold 0 is for the train Bias!!
                smilesAttr = dataUtilities.getSMILESAttr(testData)
                if smilesAttr:
                    self.__log("Found SMILES attribute:"+smilesAttr)
                    testData = dataUtilities.attributeDeselectionData(testData, [smilesAttr])
                    self.__log("Selected attrs: "+str([attr.name for attr in trainData.domain[0:3]] + ["..."] +
                                                      [attr.name for attr in trainData.domain[len(trainData.domain)-3:]]))
                CnTestEx.append(len(testData))
                consensusClassifiers = {}
                for learnerName in consensusMLs:
                    consensusClassifiers[learnerName] = models[learnerName][foldN]

                model = AZorngConsensus.ConsensusClassifier(classifiers=consensusClassifiers, expression=expression)
                CnTrainEx.append(model.NTrainEx)

                # Test the model
                if self.responseType == "Classification":
                    Cresults.append((evalUtilities.getClassificationAccuracy(testData, model),
                                     evalUtilities.getConfMat(testData, model)))
                else:
                    local_exp_pred = []
                    # Predict using bulk-predict
                    predictions = model(testData)
                    # Gather the predictions
                    for n, ex in enumerate(testData):
                        local_exp_pred.append((ex.getclass().value, predictions[n].value))
                    Cresults.append((evalUtilities.calcRMSE(local_exp_pred),
                                     evalUtilities.calcRsqrt(local_exp_pred)))
                    # Save the experimental values and the corresponding predicted values
                    Cexp_pred += local_exp_pred

            res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType,
                                     self.nExtFolds,
                                     labels=hasattr(self.data.domain.classVar, "values") and
                                            list(self.data.domain.classVar.values) or None)
            res["runningTime"] = time.time() - startTime
            statistics["Consensus"] = copy.deepcopy(res)
            statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs)
            self.__writeResults(statistics)
        self.__log("Returned multiple ML methods statistics.")
        return statistics

    # By default return the only existing statistics!
    self.__writeResults(statistics)
    self.__log("Returned only one ML method statistics.")
    return statistics[statistics.keys()[0]]
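# Hedged consumption sketch (added for illustration, not part of the original class): getAcc returns
# either a single statistics dict (one learner) or a dict keyed by learner name plus a "Consensus"
# entry (several learners). The accGetter object below stands for an already configured instance of
# the class this method belongs to; the printed keys follow the docstring above.
def _exampleGetAccUsage(accGetter):
    statistics = accGetter.getAcc()
    if statistics is None:
        return None
    if "Consensus" in statistics:
        for mlName in statistics:
            print mlName, statistics[mlName].get("CA"), statistics[mlName].get("Q2")
    return statistics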
def Wrapper(train, randTest, extTest, resultsFid, projectName, MCCdict, descList):
    # Select the RDKit descriptors and keep CLint for reference
    attrList = []
    fid = open("RDKitDesc.txt")
    for line in fid:
        attrList.append(string.strip(line))
    fid.close()
    #attrList = attrList + ["CLint"]
    attrList = attrList + ["HLM_XEN025;Mean;CLint (uL/min/mg);(Num)"]
    train = dataUtilities.attributeSelectionData(train, attrList)

    print "Ranking with continuous response"
    train = changeRespVar(train, "HLM_XEN025;Mean;CLint (uL/min/mg);(Num)")
    print "Cont resp ", train.domain.classVar

    print "Pearson ranking"
    scoreTuple = PearsonRank.getRankedAbs(train, 178)
    tuplePrint(scoreTuple, 20)
    rankTuple = getRank(scoreTuple)
    rankSumTuple = rankTuple
    #tuplePrint(rankSumTuple, 50)
    #tuplePrint(rankTuple, 20)

    print "ReliefF ranking "
    relief = Orange.feature.scoring.Relief(k=20, m=50)
    scoreTuple = Orange.feature.scoring.score_all(train, score=relief)
    #tuplePrint(scoreTuple, 20)
    rankTuple = getRank(scoreTuple)
    #rankSumTuple = rankTuple
    rankSumTuple = getRankSum(rankSumTuple, rankTuple)
    #tuplePrint(rankSumTuple, 50)
    #tuplePrint(rankTuple, 20)

    print "Ranking with categorical response"
    train = changeRespVar(train, "CLint")
    print "Cat resp ", train.domain.classVar

    print "ReliefF ranking "
    relief = Orange.feature.scoring.Relief(k=20, m=50)
    scoreTuple = Orange.feature.scoring.score_all(train, score=relief)
    tuplePrint(scoreTuple, 20)
    rankTuple = getRank(scoreTuple)
    #print "OBS!!!!!!!!!!!! Remove if already done"
    #rankSumTuple = rankTuple
    rankSumTuple = getRankSum(rankSumTuple, rankTuple)
    #tuplePrint(rankTuple, 20)

    print "MDL ranking "
    score = Orange.feature.scoring.MDL()
    scoreTuple = Orange.feature.scoring.score_all(train, score)
    tuplePrint(scoreTuple, 20)
    rankTuple = getRank(scoreTuple)
    #rankSumTuple = rankTuple
    rankSumTuple = getRankSum(rankSumTuple, rankTuple)
    #tuplePrint(rankTuple, 20)
    #tuplePrint(rankSumTuple, 50)

    # Return the average rank in a sorted list
    rankSumTuple = sortRankSum(rankSumTuple, 1)
    tuplePrint(rankSumTuple, 50)
    #print rankSumTuple

    for nDesc in descList:
        MCC_CV, MCC_rand, MCC_ext = getAccStat(rankSumTuple, nDesc, train, randTest, extTest,
                                               resultsFid, projectName)
        MCCdict[projectName][str(nDesc)] = [MCC_CV, MCC_rand, MCC_ext]

    return MCCdict
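# Hedged usage sketch (added for illustration, not part of the original script): Wrapper combines the
# Pearson, ReliefF and MDL rankings and then calls getAccStat for each requested descriptor count.
# The file name, project label and descriptor counts below are illustrative; RDKitDesc.txt must exist
# in the working directory.
def _exampleWrapperUsage(train, randTest, extTest):
    resultsFid = open("exampleWrapperResults.txt", "w")   # hypothetical results file
    MCCdict = {"exampleProject": {}}
    MCCdict = Wrapper(train, randTest, extTest, resultsFid, "exampleProject",
                      MCCdict, descList=[10, 20, 50])
    resultsFid.close()
    return MCCdict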
def descSelection(data, NdescComb):
    nAttr = len(data.domain.attributes)
    print "Number of attributes ", nAttr
    print "Maximum number of desc combinations ", pow(2, nAttr)
    print "Ndesc must be lower than the max number of desc combinations"
    print NdescComb

    # Randomly sample Ndesc combinations
    attrList = getDescComb(data, nAttr, NdescComb)

    # Rank the accuracy of each descriptor by averaging the accuracy of all models including that descriptor.
    # Select all descriptors above median accuracy and repeat the random sampling of desc combinations.

    return attrList


if __name__ == "__main__":
    dataFile = "trainDataAllEP.txt"
    data = dataUtilities.DataTable(dataFile)
    attrList = ["IT03423_Seq_BF", "hERG_IW_pIC50", "IT03423_BF", "IT03423_perc101_BF",
                "Caco2_intrinsic", "ACDlogD74", "Conc_QTc", "IT03713_BF", "IT10850_BF",
                "IT22015_BF", "IT22016_BF"]
    data = dataUtilities.attributeSelectionData(data, attrList)

    NdescComb = 100  # Number of desc combinations to sample in the first iteration
    attrList = descSelection(data, NdescComb)
    print attrList