def autoFlow(self, numIter, outputPath):
    """Randomly search blending weights and write the best blend to CSV.

    Draws ``numIter`` random weight lists (one weight per entry of
    ``self._clfNameList``), blends with each of them, scores every blend
    with ``self.calLogLoss`` and keeps the three lowest scores.  The
    lowest-scoring blend is written to ``outputPath`` and its weights are
    stored in ``self._bestParamList``.  The top results are also logged
    and mailed.

    Args:
        numIter: number of random weight lists to try; must be >= 1.
        outputPath: destination of the CSV holding the best blend.

    Raises:
        ValueError: if ``numIter`` is smaller than 1.  Without this guard
            nothing would be blended and the final ``to_csv`` call would
            fail on an empty placeholder list.
    """
    if numIter < 1:
        raise ValueError("numIter must be at least 1, got %r" % (numIter,))

    log("Start blending autoFlow, num of Iter: ", numIter)
    start = time.time()
    distinctModels = len(self._clfNameList)

    weightLists = []
    blendedDfs = []
    scores = []
    for _ in range(numIter):
        weights = self.getRandomWeightList(distinctModels)
        weightLists.append(weights)
        blended = self.doBlending(weights)
        blendedDfs.append(blended)
        scores.append(self.calLogLoss(blended))

    # argsort is ascending, so these are the (up to) three lowest scores
    # and the first entry is the best blend.
    topIdx = np.array(scores).argsort()[:3]
    bestIdx = topIdx[0]
    finalDf = blendedDfs[bestIdx]
    self._bestParamList = weightLists[bestIdx]

    logResult = []
    for idx in topIdx:
        log("logloss: ", scores[idx], "blender param: ", weightLists[idx])
        logResult.append((scores[idx], weightLists[idx]))

    mail("Blender Top3: ", logResult, self._clfNameList)
    log("clfNameList = ", self._clfNameList)
    log("low prob. id list (in 1st): #", len(self._lowProbIdList), ", ",
        self._lowProbIdList)
    log("End blending autoFlow, num of Iter: ", numIter, " cost: ",
        time.time() - start, " sec")
    finalDf.to_csv(outputPath, sep=',', encoding='utf-8')
def getKnnClf(self, X, Y):
    """Build the K_NN classifier, optionally tuned by random search.

    When ``self._gridSearchFlag`` is not set, the default (unfitted)
    ``KNeighborsClassifier`` is returned.  Otherwise the estimator
    returned by ``self.doRandomSearch`` is returned.
    """
    clfName = "K_NN"
    ## http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
    knn = KNeighborsClassifier(
        n_neighbors=5,
        weights='uniform',
        algorithm='auto',
        leaf_size=30,
        p=2,
        metric='minkowski',
        metric_params=None,
    )
    if not (self._gridSearchFlag == True):
        return knn

    log(clfName + " start searching param...")
    # The two-element lists with identical entries pin the value while
    # still handing the search a list.
    searchSpace = {
        "n_neighbors": sp_randint(4, 8),
        "weights": ['uniform', 'uniform'],
        "leaf_size": sp_randint(30, 60),
        "algorithm": ['auto', 'auto'],
    }
    return self.doRandomSearch(clfName, knn, searchSpace, X, Y)
def doRandomSearch(self, clfName, clf, param_dist, X, Y):
    """Tune ``clf`` with ``RandomizedSearchCV`` and record the winner.

    The best estimator is stored in ``self._bestClf[clfName]``, its
    ``self.getLogloss`` value in ``self._bestLoglossDict[clfName]``, the
    search scores are passed to ``self.report`` and the estimator is
    dumped via ``dumpModel``.

    Returns:
        The search's ``best_estimator_``.
    """
    startTime = time.time()

    # Logistic regression always searches on one core; xgboost does so only
    # when _setXgboostTheradToOne is set.  Everything else uses all cores.
    xgbSingleThread = self._setXgboostTheradToOne == True
    if clfName == "Logistic_Regression" or (xgbSingleThread and
                                            clfName == "Xgboost"):
        nJobs = 1
    else:
        nJobs = -1

    searcher = RandomizedSearchCV(clf,
                                  param_distributions=param_dist,
                                  n_iter=self._n_iter_search,
                                  n_jobs=nJobs,
                                  scoring='log_loss')
    searcher.fit(X, Y)
    log(clfName + " randomized search cost: ", time.time() - startTime,
        " sec")

    self._bestClf[clfName] = searcher.best_estimator_
    self._bestLoglossDict[clfName] = self.getLogloss(self._bestClf[clfName],
                                                     X, Y)
    self.report(searcher.grid_scores_, clfName,
                self._bestLoglossDict[clfName])
    dumpModel(searcher.best_estimator_, clfName, self._expInfo,
              self._subFolderName)
    return searcher.best_estimator_
def autoFlow(self, numIter, outputPath):
    """Randomly search blending weights and write the best blend to CSV.

    Draws ``numIter`` random weight lists (one weight per entry of
    ``self._clfNameList``), blends with each of them, scores every blend
    with ``self.calLogLoss`` and keeps the three lowest scores.  The
    lowest-scoring blend is written to ``outputPath`` and its weights are
    stored in ``self._bestParamList``.  The top results are also logged
    and mailed.

    Args:
        numIter: number of random weight lists to try; must be >= 1.
        outputPath: destination of the CSV holding the best blend.

    Raises:
        ValueError: if ``numIter`` is smaller than 1.  Without this guard
            nothing would be blended and the final ``to_csv`` call would
            fail on an empty placeholder list.
    """
    if numIter < 1:
        raise ValueError("numIter must be at least 1, got %r" % (numIter,))

    log("Start blending autoFlow, num of Iter: ", numIter)
    start = time.time()
    distinctModels = len(self._clfNameList)

    weightLists = []
    blendedDfs = []
    scores = []
    for _ in range(numIter):
        weights = self.getRandomWeightList(distinctModels)
        weightLists.append(weights)
        blended = self.doBlending(weights)
        blendedDfs.append(blended)
        scores.append(self.calLogLoss(blended))

    # argsort is ascending, so these are the (up to) three lowest scores
    # and the first entry is the best blend.
    topIdx = np.array(scores).argsort()[:3]
    bestIdx = topIdx[0]
    finalDf = blendedDfs[bestIdx]
    self._bestParamList = weightLists[bestIdx]

    logResult = []
    for idx in topIdx:
        log("logloss: ", scores[idx], "blender param: ", weightLists[idx])
        logResult.append((scores[idx], weightLists[idx]))

    mail("Blender Top3: ", logResult, self._clfNameList)
    log("clfNameList = ", self._clfNameList)
    log("low prob. id list (in 1st): #", len(self._lowProbIdList), ", ",
        self._lowProbIdList)
    log("End blending autoFlow, num of Iter: ", numIter, " cost: ",
        time.time() - start, " sec")
    finalDf.to_csv(outputPath, sep=',', encoding='utf-8')
def getRandomForestClf(self, X, Y):
    """Build the Random_Forest model, optionally tuned by random search.

    When ``self._gridSearchFlag`` is not set, the default (unfitted)
    ``rf`` estimator is returned.  Otherwise the estimator returned by
    ``self.doRandomSearch`` is returned.
    """
    clfName = "Random_Forest"
    ## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    forest = rf(n_estimators=300,
                max_depth=None,
                min_samples_split=1,
                random_state=0,
                bootstrap=True,
                oob_score=True)
    if not (self._gridSearchFlag == True):
        return forest

    log(clfName + " start searching param...")
    depthLow, depthHigh = 10, 50
    searchSpace = {
        "max_depth": sp_randint(depthLow, depthHigh),
        "max_features": sp_randf(0, 1),
        "min_samples_split": sp_randint(1, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "criterion": ["gini", "entropy"],
        "n_estimators": sp_randint(100, 300),
    }
    return self.doRandomSearch(clfName, forest, searchSpace, X, Y)
def getLogisticRegressionClf(self, X, Y):
    """Build the Logistic_Regression model, optionally tuned.

    When ``self._gridSearchFlag`` is not set, the default (unfitted)
    ``LogisticRegression`` is returned.  Otherwise the estimator returned
    by ``self.doRandomSearch`` is returned.
    """
    clfName = "Logistic_Regression"
    ## http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    logit = LogisticRegression(
        penalty='l2',
        dual=False,
        tol=0.0001,
        C=1.0,
        fit_intercept=True,
        intercept_scaling=1,
        class_weight=None,
        random_state=None,
        solver='liblinear',
        max_iter=100,
        multi_class='ovr',
        verbose=0,
    )
    if not (self._gridSearchFlag == True):
        return logit

    log(clfName + " start searching param...")
    searchSpace = {
        "penalty": ['l2', 'l2'],
        "C": sp_randf(1.0, 3.0),
        "solver": ['lbfgs', 'liblinear'],
    }
    return self.doRandomSearch(clfName, logit, searchSpace, X, Y)
def getLogisticRegressionClf(self, X, Y):
    """Build the Logistic_Regression model, optionally tuned.

    When ``self._gridSearchFlag`` is not set, the default (unfitted)
    ``LogisticRegression`` is returned.  Otherwise the estimator returned
    by ``self.doRandomSearch`` is returned.
    """
    clfName = "Logistic_Regression"
    ## http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    logit = LogisticRegression(
        penalty='l2',
        dual=False,
        tol=0.0001,
        C=1.0,
        fit_intercept=True,
        intercept_scaling=1,
        class_weight=None,
        random_state=None,
        solver='liblinear',
        max_iter=100,
        multi_class='ovr',
        verbose=0,
    )
    if not (self._gridSearchFlag == True):
        return logit

    log(clfName + " start searching param...")
    searchSpace = {
        "penalty": ['l2', 'l2'],
        "C": sp_randf(1.0, 3.0),
        "solver": ['lbfgs', 'liblinear'],
    }
    return self.doRandomSearch(clfName, logit, searchSpace, X, Y)
def readInCSV(self, path, mode):
    """Load a comma-separated file and store it on this reader.

    Args:
        path: location of the CSV file; its first row is the header.
        mode: "train" (case-insensitive) stores the first column in
            ``self._ansDataFrame`` and the remaining columns in
            ``self._trainDataFrame``; any other value stores the whole
            frame in ``self._testDataFrame``.
    """
    frame = pd.read_csv(path, header=0, sep=',')
    log("loading csv: " + path)

    if mode.lower() != "train":
        self._testDataFrame = frame
        return

    self._ansDataFrame = frame[frame.columns[0]]
    self._trainDataFrame = frame[frame.columns[1:]]
def genXgboostRpt(self, bestClf, bestScore, paramList, best_num_round):
    """Dump the best native xgboost model and report its search result.

    The model is dumped through ``dumpModel``, the result is logged, and
    a mail is sent as well when ``self._singleModelMail`` is set.
    """
    dumpModel(bestClf, "Xgboost", self._expInfo, self._subFolderName)
    log("Native Xgboost best score : ", bestScore, ", param list: ",
        paramList, "best_num_round: ", best_num_round)

    if not (self._singleModelMail == True):
        return

    mailBody = "".join([
        "Native Xgboost best score : ",
        str(bestScore),
        ", param list: ",
        str(paramList),
        "best_num_round: ",
    ])
    # The round count is passed to mail() as its own argument, not
    # appended to the body.
    mail("Xgboost Done", mailBody, best_num_round)
def getNaiveBayesClf(self, X, Y):
    """Fit a Gaussian naive Bayes model and record its mean CV score.

    The mean of ``cross_val_score`` (called without an explicit
    ``scoring`` argument) is logged and stored in
    ``self._bestScoreDict["Naive_Bayes"]``.

    Returns:
        The ``GaussianNB`` instance fitted on ``X`` and ``Y``.
    """
    clfName = "Naive_Bayes"
    ## http://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes
    fitted = GaussianNB().fit(X, Y)
    meanScore = cross_val_score(fitted, X, Y).mean()
    log(clfName + " Cross Validation Precision: ", meanScore)
    self._bestScoreDict[clfName] = meanScore
    return fitted
def getNaiveBayesClf(self, X, Y):
    """Fit a Gaussian naive Bayes model and record its mean CV score.

    The mean of ``cross_val_score`` (called without an explicit
    ``scoring`` argument) is logged and stored in
    ``self._bestScoreDict["Naive_Bayes"]``.

    Returns:
        The ``GaussianNB`` instance fitted on ``X`` and ``Y``.
    """
    clfName = "Naive_Bayes"
    ## http://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes
    fitted = GaussianNB().fit(X, Y)
    meanScore = cross_val_score(fitted, X, Y).mean()
    log(clfName + " Cross Validation Precision: ", meanScore)
    self._bestScoreDict[clfName] = meanScore
    return fitted
def report(self, grid_scores, clfName, bestLogLoss, n_top=3):
    """Log (and optionally mail) the top entries of a parameter search.

    The best entry's mean validation score is stored in
    ``self._bestScoreDict[clfName]``.

    Args:
        grid_scores: search results; each entry exposes
            ``mean_validation_score``, ``cv_validation_scores`` and
            ``parameters``, and is ranked by its element at index 1.
        clfName: model name used as dictionary key and in the mail.
        bestLogLoss: accepted for interface compatibility; not used.
        n_top: number of best-ranked entries to report.

    Returns:
        A copy of the best-ranked entry's parameters, or an empty dict
        when ``grid_scores`` is empty.
    """
    top_scores = sorted(grid_scores, key=itemgetter(1),
                        reverse=True)[:n_top]
    bestParameters = {}
    mailLines = []

    for rank, score in enumerate(top_scores, 1):
        # Format each line once and reuse it for both the log and the mail.
        entryLines = [
            "Model with rank: {0}".format(rank),
            "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                score.mean_validation_score,
                np.std(score.cv_validation_scores)),
            "Parameters: {0}".format(score.parameters),
        ]
        for line in entryLines:
            log(line)
        mailLines.extend(entryLines)

        if rank == 1:
            self._bestScoreDict[clfName] = score.mean_validation_score
            # Previously the returned dict was never filled in.
            bestParameters = dict(score.parameters)
            mailLines.append("Best CV score: " +
                             str(score.mean_validation_score))
        log("")

    #log (clfName , " best logloss: ", bestLogLoss)
    mailContent = "".join(line + "\n" for line in mailLines)
    if self._singleModelMail == True:
        mail("Single Model Done: ", clfName, ", ", mailContent)
    return bestParameters
def report(self, grid_scores, clfName, bestLogLoss, n_top=3):
    """Log (and optionally mail) the top entries of a parameter search.

    The best entry's mean validation score is stored in
    ``self._bestScoreDict[clfName]``.

    Args:
        grid_scores: search results; each entry exposes
            ``mean_validation_score``, ``cv_validation_scores`` and
            ``parameters``, and is ranked by its element at index 1.
        clfName: model name used as dictionary key and in the mail.
        bestLogLoss: accepted for interface compatibility; not used.
        n_top: number of best-ranked entries to report.

    Returns:
        A copy of the best-ranked entry's parameters, or an empty dict
        when ``grid_scores`` is empty.
    """
    top_scores = sorted(grid_scores, key=itemgetter(1),
                        reverse=True)[:n_top]
    bestParameters = {}
    mailLines = []

    for rank, score in enumerate(top_scores, 1):
        # Format each line once and reuse it for both the log and the mail.
        entryLines = [
            "Model with rank: {0}".format(rank),
            "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                score.mean_validation_score,
                np.std(score.cv_validation_scores)),
            "Parameters: {0}".format(score.parameters),
        ]
        for line in entryLines:
            log(line)
        mailLines.extend(entryLines)

        if rank == 1:
            self._bestScoreDict[clfName] = score.mean_validation_score
            # Previously the returned dict was never filled in.
            bestParameters = dict(score.parameters)
            mailLines.append("Best CV score: " +
                             str(score.mean_validation_score))
        log("")

    #log (clfName , " best logloss: ", bestLogLoss)
    mailContent = "".join(line + "\n" for line in mailLines)
    if self._singleModelMail == True:
        mail("Single Model Done: ", clfName, ", ", mailContent)
    return bestParameters
def getXgboostClf(self, X, Y):
    """Train a native xgboost booster, or delegate to the random search.

    The objective is "binary:logistic" when ``Y`` holds at most two
    distinct labels and "multi:softprob" otherwise.  With
    ``self._gridSearchFlag`` set, the result of
    ``self.doXgboostRandomSearch`` is returned; otherwise a booster is
    trained for 120 rounds with the fixed parameters below.
    """
    clfName = "Xgboost"
    ## https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
    num_class = len(set(Y))
    objective = "binary:logistic" if num_class <= 2 else "multi:softprob"
    num_round = 120

    if self._gridSearchFlag == True:
        log(clfName + " start searching param...")
        return self.doXgboostRandomSearch(X, Y, num_round)

    param = {
        'bst:max_depth': 74,
        'bst:eta': 0.05,
        'silent': 1,
        'min_child_weight': 2,
        'subsample': 0.6031536958709969,
        #'colsample_bytree': 0.7,
        'max_delta_step': 9,
        'gamma': 3,
        'eta': 0.23833373077656667,
        'eval_metric': 'mlogloss',
        'num_class': num_class,
        'objective': objective,
        'alpha': 1,
        'lambda': 1
    }
    param['nthread'] = 4
    plst = param.items()

    dtrain = xgb.DMatrix(X, label=Y)
    #joblib.dump(clf, xgbModelPath)
    return xgb.train(plst, dtrain, num_round)
def getXgboostClf(self, X, Y):
    """Train a native xgboost booster, or delegate to the random search.

    The objective is "binary:logistic" when ``Y`` holds at most two
    distinct labels and "multi:softprob" otherwise.  With
    ``self._gridSearchFlag`` set, the result of
    ``self.doXgboostRandomSearch`` is returned; otherwise a booster is
    trained for 120 rounds with the fixed parameters below.
    """
    clfName = "Xgboost"
    ## https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
    num_class = len(set(Y))
    objective = "binary:logistic" if num_class <= 2 else "multi:softprob"
    num_round = 120

    if self._gridSearchFlag == True:
        log(clfName + " start searching param...")
        return self.doXgboostRandomSearch(X, Y, num_round)

    param = {
        'bst:max_depth': 74,
        'bst:eta': 0.05,
        'silent': 1,
        'min_child_weight': 2,
        'subsample': 0.6031536958709969,
        #'colsample_bytree': 0.7,
        'max_delta_step': 9,
        'gamma': 3,
        'eta': 0.23833373077656667,
        'eval_metric': 'mlogloss',
        'num_class': num_class,
        'objective': objective,
        'alpha': 1,
        'lambda': 1
    }
    param['nthread'] = 4
    plst = param.items()

    dtrain = xgb.DMatrix(X, label=Y)
    #joblib.dump(clf, xgbModelPath)
    return xgb.train(plst, dtrain, num_round)
def dumpModel(clf, clfName, expInfo, subFolderName):
    """Persist ``clf`` with joblib at the path from ``getDumpFilePath``.

    Logs the target path and experiment info before the dump and a
    confirmation after it.
    """
    dumpPath = getDumpFilePath(clfName, expInfo, subFolderName)
    startMessage = ("Start dump ", clfName, " to " + dumpPath)
    log(*startMessage)
    log("Exp info: ", expInfo)
    joblib.dump(clf, dumpPath)
    log("Dump ", clfName, " successfully")
def dumpModel(clf, clfName, expInfo, subFolderName):
    """Persist ``clf`` with joblib at the path from ``getDumpFilePath``.

    Logs the target path and experiment info before the dump and a
    confirmation after it.
    """
    dumpPath = getDumpFilePath(clfName, expInfo, subFolderName)
    startMessage = ("Start dump ", clfName, " to " + dumpPath)
    log(*startMessage)
    log("Exp info: ", expInfo)
    joblib.dump(clf, dumpPath)
    log("Dump ", clfName, " successfully")
def getExtraTressClf(self, X, Y):
    """Build the Extra_Trees model, optionally tuned by random search.

    When ``self._gridSearchFlag`` is not set, the default (unfitted)
    ``ExtraTreesClassifier`` is returned.  Otherwise the estimator
    returned by ``self.doRandomSearch`` is returned; the searched
    ``max_depth`` range runs from 70% of the column count of ``X`` up to
    the column count.
    """
    clfName = "Extra_Trees"
    ## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    extraTrees = ExtraTreesClassifier(
        n_estimators=10,
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features='auto',
        max_leaf_nodes=None,
        bootstrap=False,
        oob_score=False,
        n_jobs=1,
        random_state=None,
        verbose=0,
        warm_start=False,
        class_weight=None,
    )
    if not (self._gridSearchFlag == True):
        return extraTrees

    log(clfName + " start searching param...")
    depthLow = int(len(X.columns) * 0.7)
    depthHigh = int(len(X.columns))
    searchSpace = {
        "max_depth": sp_randint(depthLow, depthHigh),
        "max_features": sp_randf(0, 1),
        "min_samples_split": sp_randint(1, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, True],
        "criterion": ["gini", "entropy"],
        "oob_score": [True, True],
        "n_estimators": sp_randint(100, 300),
    }
    return self.doRandomSearch(clfName, extraTrees, searchSpace, X, Y)
def getExtraTressClf(self, X, Y):
    """Build the Extra_Trees model, optionally tuned by random search.

    When ``self._gridSearchFlag`` is not set, the default (unfitted)
    ``ExtraTreesClassifier`` is returned.  Otherwise the estimator
    returned by ``self.doRandomSearch`` is returned; the searched
    ``max_depth`` range runs from 70% of the column count of ``X`` up to
    the column count.
    """
    clfName = "Extra_Trees"
    ## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    extraTrees = ExtraTreesClassifier(
        n_estimators=10,
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features='auto',
        max_leaf_nodes=None,
        bootstrap=False,
        oob_score=False,
        n_jobs=1,
        random_state=None,
        verbose=0,
        warm_start=False,
        class_weight=None,
    )
    if not (self._gridSearchFlag == True):
        return extraTrees

    log(clfName + " start searching param...")
    depthLow = int(len(X.columns) * 0.7)
    depthHigh = int(len(X.columns))
    searchSpace = {
        "max_depth": sp_randint(depthLow, depthHigh),
        "max_features": sp_randf(0, 1),
        "min_samples_split": sp_randint(1, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, True],
        "criterion": ["gini", "entropy"],
        "oob_score": [True, True],
        "n_estimators": sp_randint(100, 300),
    }
    return self.doRandomSearch(clfName, extraTrees, searchSpace, X, Y)
def doRandomSearch(self, clfName, clf, param_dist, X, Y):
    """Tune ``clf`` with ``RandomizedSearchCV`` and record the winner.

    The best estimator is stored in ``self._bestClf[clfName]``, its
    ``self.getLogloss`` value in ``self._bestLoglossDict[clfName]``, the
    search scores are passed to ``self.report`` and the estimator is
    dumped via ``dumpModel``.

    Returns:
        The search's ``best_estimator_``.
    """
    startTime = time.time()

    # Logistic regression always searches on one core; xgboost does so only
    # when _setXgboostTheradToOne is set.  Everything else uses all cores.
    xgbSingleThread = self._setXgboostTheradToOne == True
    if clfName == "Logistic_Regression" or (xgbSingleThread and
                                            clfName == "Xgboost"):
        nJobs = 1
    else:
        nJobs = -1

    searcher = RandomizedSearchCV(clf,
                                  param_distributions=param_dist,
                                  n_iter=self._n_iter_search,
                                  n_jobs=nJobs,
                                  scoring='log_loss')
    searcher.fit(X, Y)
    log(clfName + " randomized search cost: ", time.time() - startTime,
        " sec")

    self._bestClf[clfName] = searcher.best_estimator_
    self._bestLoglossDict[clfName] = self.getLogloss(self._bestClf[clfName],
                                                     X, Y)
    self.report(searcher.grid_scores_, clfName,
                self._bestLoglossDict[clfName])
    dumpModel(searcher.best_estimator_, clfName, self._expInfo,
              self._subFolderName)
    return searcher.best_estimator_
def getRandomForestClf(self, X, Y):
    """Build the Random_Forest model, optionally tuned by random search.

    When ``self._gridSearchFlag`` is not set, the default (unfitted)
    ``rf`` estimator is returned.  Otherwise the estimator returned by
    ``self.doRandomSearch`` is returned.
    """
    clfName = "Random_Forest"
    ## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    forest = rf(n_estimators=300,
                max_depth=None,
                min_samples_split=1,
                random_state=0,
                bootstrap=True,
                oob_score=True)
    if not (self._gridSearchFlag == True):
        return forest

    log(clfName + " start searching param...")
    depthLow, depthHigh = 10, 50
    searchSpace = {
        "max_depth": sp_randint(depthLow, depthHigh),
        "max_features": sp_randf(0, 1),
        "min_samples_split": sp_randint(1, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "criterion": ["gini", "entropy"],
        "n_estimators": sp_randint(100, 300),
    }
    return self.doRandomSearch(clfName, forest, searchSpace, X, Y)
def loadModel(modelPath):
    """Log the path, then return the object joblib loads from it."""
    log("Start load model: ", modelPath)
    return joblib.load(modelPath)
#gbm = joblib.load( xgbModelPath ) #finalClf = gbm if doTestFlag == True: print finalClf.predict_proba(dr._testDataFrame) # featureImportance =[] # for i in range(0,len(finalClf.feature_importances_)): # if i != len(dr._trainDataFrame.columns): # if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1: # featureImportance.append( [dr._trainDataFrame.columns[i] , finalClf.feature_importances_[i]] ) # # print featureImportance # featureImportance.sort(lambda x, y: cmp(x[1], y[1]), reverse=True) # print featureImportance if doTestFlag == True: return finalClf.predict_proba(dr._testDataFrame) if __name__ == '__main__': start = time.time() exp() end = time.time() elapsed = end - start log("exp elapsed:", elapsed, "sec") #os.startfile('D:\\123.m4a')
def doXgboostRandomSearch(self, X, Y, num_round):
    """Randomly search native xgboost parameters with 5-fold CV.

    Runs ``self._n_iter_search`` iterations.  Each one samples a
    parameter set, cross-validates it for up to ``num_round`` boosting
    rounds and takes the round with the lowest value in the first column
    of the CV result.  Whenever that value beats the best so far, a
    booster is trained on all of ``X``/``Y`` for that many rounds and
    dumped to ``Config.xgboostBestTmpCflPath``.  The final winner is
    reported through ``self.genXgboostRpt``.

    Args:
        X: training features.
        Y: training labels; the number of distinct values selects the
            objective ("binary:logistic" for <= 2, else "multi:softprob").
        num_round: maximum number of boosting rounds per CV run.

    Returns:
        The best booster found, or ``None`` if no iteration was run.
    """
    num_class = len(set(Y))
    objective = "binary:logistic" if num_class <= 2 else "multi:softprob"

    paramList = []
    bestScore = sys.float_info.max
    bestClf = None
    best_num_round = 0

    # The training matrix does not depend on the sampled parameters, so it
    # is built once and shared by every CV run and every refit.  (The old
    # per-iteration hold-out sample was never used and has been dropped.)
    dtrain = xgb.DMatrix(X, label=Y)

    for i in range(self._n_iter_search):
        log("xgboost start random search : " + str(i + 1) + "/" +
            str(self._n_iter_search))

        # Key order is kept so the random draws happen in the same
        # sequence as before.
        param = {
            'nthread': 4,
            'eta': random.uniform(0.15, 0.45),
            'gamma': randint(0, 3),
            'max_depth': randint(8, 120),
            'min_child_weight': randint(1, 3),
            'eval_metric': 'mlogloss',
            'max_delta_step': randint(1, 10),
            'objective': objective,
            'subsample': random.uniform(0.45, 0.65),
            'num_class': num_class,
            'silent': 1,
            'alpha': 1,
            'lambda': 1,
        }
        #param['early_stopping_rounds']=2
        plst = list(param.items())

        xgbCvResult = xgb.cv(plst, dtrain, num_boost_round=num_round,
                             nfold=5)
        # NOTE(review): assumes the first column of the CV result is the
        # held-out metric to minimise -- confirm for the xgboost version
        # in use.
        scoreList = xgbCvResult[xgbCvResult.columns[0]].tolist()
        minScore = min(scoreList)
        new_num_round = scoreList.index(minScore) + 1

        if minScore < bestScore:
            log("xgb best score:" + str(minScore))
            log("xgb best num_round: " + str(new_num_round))
            log("xgb best param: " + str(plst))
            bst = xgb.train(plst, dtrain, new_num_round)
            bestScore = minScore
            bestClf = bst
            paramList = plst
            best_num_round = new_num_round
            joblib.dump(bst, Config.xgboostBestTmpCflPath)

    self.genXgboostRpt(bestClf, bestScore, paramList, best_num_round)
    return bestClf
outTestFold1 = pd.DataFrame(predictTestResult) outTestFold1.columns = [ tmpClfName + "_" + str(i) + "_0", tmpClfName + "_" + str(i) + "_1", tmpClfName + "_" + str(i) + "_2" ] dfTestLower = pd.concat([dfTestLower, outTestFold1], axis=1) mergeDf = dfUpper.append(dfLower) mergeTestDf = dfTestUpper.append(dfTestLower) mergeAns = train_fold_label_2.append(train_fold_label_1) # Testing tmpOutPath = _basePath + expNo + "_" + "Xgboost_" + "stacking" + "_ans.csv" fab = ModelFactory() fab._gridSearchFlag = True fab._singleModelMail = True fab._subFolderName = "stacking_level2_xgboost" fab._n_iter_search = 2 fab._expInfo = expInfo clf = fab.getXgboostClf(mergeDf, mergeAns) log(clf.predict(xgb.DMatrix(mergeTestDf))) outDf = pd.DataFrame(clf.predict(xgb.DMatrix(mergeTestDf))) outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8') musicAlarm()
fab._gridSearchFlag = True # fab._subFolderName = "stacked" fab._n_iter_search = 250 fab._expInfo = expInfo # fab.getAllModels(newX, newY) finalClf = fab.getRandomForestClf(newX, newY) featureImportance = [] for i in range(0, len(finalClf.feature_importances_)): if i != len(dr._trainDataFrame.columns): # if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1: featureImportance.append([dr._trainDataFrame.columns[i], finalClf.feature_importances_[i]]) # log( featureImportance) featureImportance.sort(lambda x, y: cmp(x[1], y[1]), reverse=True) log(featureImportance) trainNewX = dr._ansDataFrame tmpOutPath = _basePath + "012_train_trim_features.csv" selectCnt = 20 tmpCnt = 0 for tmpColName in featureImportance: for i in range(0, len(dr._trainDataFrame.columns)): if tmpColName[0] == dr._trainDataFrame.columns[i]: trainNewX = pd.concat([trainNewX, dr._trainDataFrame[dr._trainDataFrame.columns[i]]], axis=1) tmpCnt += 1 break if tmpCnt == 20: break trainNewX.to_csv(tmpOutPath, sep=",", encoding="utf-8")
outFold1 = pd.DataFrame(predictResult) outFold1.columns = [tmpClfName+"_" + str(i) + "_0", tmpClfName+"_" + str(i) + "_1", tmpClfName+"_" + str(i) + "_2" ] dfLower = pd.concat([dfLower, outFold1], axis=1) outTestFold1 = pd.DataFrame(predictTestResult) outTestFold1.columns = [tmpClfName+"_" + str(i) + "_0", tmpClfName+"_" + str(i) + "_1", tmpClfName+"_" + str(i) + "_2" ] dfTestLower = pd.concat([dfTestLower, outTestFold1], axis=1) mergeDf = dfUpper.append(dfLower) mergeTestDf = dfTestUpper.append(dfTestLower) mergeAns = train_fold_label_2.append(train_fold_label_1) # Testing tmpOutPath = _basePath + expNo +"_" + "Xgboost_" + "stacking"+ "_ans.csv" fab = ModelFactory() fab._gridSearchFlag = True fab._singleModelMail = True fab._subFolderName = "stacking_level2_xgboost" fab._n_iter_search = 2 fab._expInfo = expInfo clf = fab.getXgboostClf(mergeDf, mergeAns) log(clf.predict(xgb.DMatrix(mergeTestDf))) outDf = pd.DataFrame(clf.predict(xgb.DMatrix(mergeTestDf))) outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8') musicAlarm()
newX, newY = dr._trainDataFrame, dr._ansDataFrame dr2 = DataReader() dr2.readInCSV(testPath, "test") #newX = dr2._testDataFrame dr3 = DataReader() dr3.readInCSV(testSortIdPath, "test") sortIdDf = dr3._testDataFrame dr4 = DataReader() dr4.readInCSV(trainSortIdPath, "test") sortIdDf = dr4._testDataFrame modelFolder = _basePath + "models" + Config.osSep + "binary" + Config.osSep curModel = "Xgboost" modelPath = modelFolder + str(getMatchNameModelPath(modelFolder, curModel)) tmpOutPath = _basePath + expNo + "_" + curModel + "_test_ans.csv" tmpClf = loadModel(modelPath) log(tmpClf.predict_proba(newX)) ans = tmpClf.predict_proba(newX) ansList = [] for i, tmpAns in enumerate(dr._ansDataFrame): if ans[i][tmpAns] < 0.35: #log( "id: " + sortIdDf[sortIdDf.columns[0]][i] + ", prob: " + ans[i][tmpAns], ", cate: " + tmpAns) log((sortIdDf[sortIdDf.columns[0]][i], ans[i][tmpAns], tmpAns)) ansList.append( (sortIdDf[sortIdDf.columns[0]][i], ans[i][tmpAns], tmpAns)) log(len(ansList))
# 1. read in data expNo = "014" expInfo = expNo + "_one_hot_each_features" _basePath = Config.FolderBasePath + expInfo + Config.osSep featureList = ["location", "event_type", "resource_type" , "severity_type", "log_feature"] ansPath = _basePath + "014_ans_array.csv" drAns = DataReader() drAns.readInCSV(ansPath, "train") newY = drAns._ansDataFrame for i in range(1,32): log( "start " + str(i) + "/32 ...") tmpCurFeatureList = [] flagList =[] for i2 in range (0, 7- len(bin(i))): flagList.append(0) for i2 in range(2,len(bin(i))): flagList.append(int(bin(i)[i2])) for j in range(0,5): if flagList[j] ==1: tmpCurFeatureList.append(featureList[j]) log(tmpCurFeatureList)
def genXgboostRpt(self, bestClf, bestScore, paramList, best_num_round):
    """Dump the best native xgboost model and report its search result.

    The model is dumped through ``dumpModel``, the result is logged, and
    a mail is sent as well when ``self._singleModelMail`` is set.
    """
    dumpModel(bestClf, "Xgboost", self._expInfo, self._subFolderName)
    log("Native Xgboost best score : ", bestScore, ", param list: ",
        paramList, "best_num_round: ", best_num_round)

    if not (self._singleModelMail == True):
        return

    mailBody = "".join([
        "Native Xgboost best score : ",
        str(bestScore),
        ", param list: ",
        str(paramList),
        "best_num_round: ",
    ])
    # The round count is passed to mail() as its own argument, not
    # appended to the body.
    mail("Xgboost Done", mailBody, best_num_round)
def getAllModels(self, X, Y):
    """Build every supported model and pick the best-scoring one.

    Each model is stored in ``self._basicClf`` under its name.  The
    non-tree models (K_NN, Logistic_Regression, Naive_Bayes) are skipped
    when ``self._onlyTreeBasedModels`` is set.  The scores collected in
    ``self._bestScoreDict`` are ranked highest first, logged and mailed,
    and the top-ranked model is stored in ``self._mvpClf``.

    If no score was recorded at all, ``self._mvpClf`` is left untouched
    and a message is logged instead of failing on the empty ranking.
    """
    log("GetAllModels start with iteration numbers: ", self._n_iter_search)
    start = time.time()

    self._basicClf["Xgboost"] = self.getXgboostClf(X, Y)
    self._basicClf["Random_Forest"] = self.getRandomForestClf(X, Y)
    self._basicClf["Extra_Trees"] = self.getExtraTressClf(X, Y)
    if not self._onlyTreeBasedModels:
        self._basicClf["K_NN"] = self.getKnnClf(X, Y)
        self._basicClf["Logistic_Regression"] = \
            self.getLogisticRegressionClf(X, Y)
        self._basicClf["Naive_Bayes"] = self.getNaiveBayesClf(X, Y)
    log("GetAllModels cost: ", time.time() - start, " sec")

    # Rank once and reuse the result for the log, the mail and the pick.
    # NOTE(review): the recorded scores may come from different scorers
    # (search scoring vs. plain cross_val_score), so the ranking may not
    # compare like with like -- confirm before relying on it.
    bestScoreList = sorted(self._bestScoreDict.items(),
                           key=lambda x: x[1],
                           reverse=True)
    log(bestScoreList)
    mail(self._expInfo, bestScoreList)
    log(self._expInfo, bestScoreList)

    if bestScoreList:
        mvpName = bestScoreList[0][0]
        log("MVP clf is : ", mvpName)
        try:
            self._mvpClf = self._bestClf[mvpName]
        except KeyError:
            # A model can have a score without a tuned entry in _bestClf
            # (Naive_Bayes is never searched); use the one built above.
            self._mvpClf = self._basicClf[mvpName]
    else:
        log("No model score recorded, MVP clf not selected")

    log("GetAllModels end with iteration numbers: ", self._n_iter_search)
# newX = xgb.DMatrix(newX) # #print clf.predict(newX) # tmpOutPath = _basePath + expNo +"_" + "Xgboost" + "_testXgboost7_ans.csv" # log(clf.predict(newX)) # outDf = pd.DataFrame(clf.predict(newX)) # outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8') # musicAlarm() clf = joblib.load( "F:\\xgboost_tmp_best_020.model" ) tmpPath = _basePath + "test_merge_one_hot" + ".csv" dr = DataReader() dr.readInCSV(tmpPath, "test") newX = dr._testDataFrame newX = xgb.DMatrix(newX) tmpOutPath = _basePath + expNo +"_" + "Xgboost_" + "groupby_sum"+ "_ans_" + "2" + ".csv" log(clf.predict(newX)) outDf = pd.DataFrame(clf.predict(newX)) outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8') musicAlarm() # sampleRows = np.random.choice(X.index, len(X)*evalDataPercentage) # # print X.ix[sampleRows] # exit() # dtest = xgb.DMatrix( X.ix[sampleRows], label=Y.ix[sampleRows]) # dtrain = xgb.DMatrix( X.drop(sampleRows), label=Y.drop(sampleRows)) # # print strftime("%Y-%m-%d %H:%M:%S", gmtime()) #
#X, Y = pd.DataFrame([1,2,3,4,5,6,7,8,9,10,11,12]), pd.DataFrame([1,2,3,4,5,6,7,8,9,10,11,12]) #newX, newY = stratifyData(X,Y, 0.4) # clf = fab.getNaiveBayesClf(X, Y) # clf2 = fab.getKnnClf(X, Y) #clf3 = fab.getRandomForestClf(X, Y) # x= clf.predict_proba(X) # log( x) #log(fab._bestScoreDict) # #log(fab._bestClf) # log( fab._bestClf['Random Forest'].predict_proba(X)) #newX, newY = stratifyData(X, Y, 0.4) newX, newY = X, Y #print newX fab = ModelFactory() fab._gridSearchFlag = True fab._n_iter_search = 1 fab._expInfo = "001_location_only" print newX #print newY fab.getAllModels(newX, newY) #fab.getRandomForestClf(newX, newY) bestClf = fab._mvpClf log(bestClf.predict_proba(newX)) #log(sorted(fab._bestScoreDict.items(), key=lambda x: x[1] , reverse=True) ) #log(fab._bestClf['Random Forest'].predict_proba(X)) #dumpModel(clf3, "Random_Forest", "ExpTest") #log("haha") #log(getDumpFilePath( "Random_Forest", "haha Tets")) #musicAlarm()
dr.readInCSV( path, "train") newX, newY = dr._trainDataFrame, dr._ansDataFrame dr2 = DataReader() dr2.readInCSV( testPath, "test") #newX = dr2._testDataFrame dr3 = DataReader() dr3.readInCSV( testSortIdPath, "test") sortIdDf =dr3._testDataFrame dr4 = DataReader() dr4.readInCSV(trainSortIdPath, "test") sortIdDf =dr4._testDataFrame modelFolder = _basePath + "models" + Config.osSep + "binary" + Config.osSep curModel = "Xgboost" modelPath = modelFolder + str(getMatchNameModelPath(modelFolder, curModel)) tmpOutPath = _basePath + expNo + "_" + curModel + "_test_ans.csv" tmpClf = loadModel( modelPath) log(tmpClf.predict_proba(newX)) ans = tmpClf.predict_proba(newX) ansList = [] for i, tmpAns in enumerate (dr._ansDataFrame): if ans[i][tmpAns] < 0.35: #log( "id: " + sortIdDf[sortIdDf.columns[0]][i] + ", prob: " + ans[i][tmpAns], ", cate: " + tmpAns) log((sortIdDf[sortIdDf.columns[0]][i],ans[i][tmpAns],tmpAns)) ansList.append((sortIdDf[sortIdDf.columns[0]][i],ans[i][tmpAns],tmpAns)) log (len(ansList))
#gbm = joblib.load( xgbModelPath ) #finalClf = gbm if doTestFlag == True: print finalClf.predict_proba(dr._testDataFrame) # featureImportance =[] # for i in range(0,len(finalClf.feature_importances_)): # if i != len(dr._trainDataFrame.columns): # if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1: # featureImportance.append( [dr._trainDataFrame.columns[i] , finalClf.feature_importances_[i]] ) # # print featureImportance # featureImportance.sort(lambda x, y: cmp(x[1], y[1]), reverse=True) # print featureImportance if doTestFlag == True: return finalClf.predict_proba(dr._testDataFrame) if __name__ == '__main__': start = time.time() exp() end = time.time() elapsed = end - start log( "exp elapsed:", elapsed , "sec") #os.startfile('D:\\123.m4a')
if doTestFlag == True: dr.readInCSV(testPath , "test") newX = dr._testDataFrame #newX = pd.DataFrame(newX[newX.columns[0]]) #print newX # 3. get all best model from newX # fab = ModelFactory() # fab._gridSearchFlag = True # fab._n_iter_search = 100 # fab._expInfo = expInfo # fab.getXgboostClf(newX, newY) # 4. test all data, output 3 ans as features #D:\Kaggle\Telstra\004_one_hot_resource_type\(Xgboost)_(2016-02-06_11_14_31).model #D:\Kaggle\Telstra\004_one_hot_resource_type\(Random_Forest)_(2016-02-06_11_24_09).model #D:\Kaggle\Telstra\004_one_hot_resource_type\(Extra_Trees)_(2016-02-06_11_30_52).model #D:\Kaggle\Telstra\004_one_hot_resource_type\(K_NN)_(2016-02-06_11_40_10).model modelPath = _basePath+"(Xgboost)_(2016-02-06_11_14_31).model" tmpOutPath = _basePath + "004_submission_1_train_Xgboost.csv" tmpClf = loadModel( modelPath) log(tmpClf.predict_proba(newX)) outDf = pd.concat([newX, pd.DataFrame(tmpClf.predict_proba(newX))], axis=1) outDf = pd.DataFrame(tmpClf.predict_proba(newX)) outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8') #musicAlarm() log("004 Done")
tmpTestPath = _basePath + "test_tobe.csv" tmpOutPath = _basePath + "test_ans.csv" tmpTestDf = pd.DataFrame() for tmpColName in trainColNameList: print tmpColName for tmpTestColName in testX.columns: if tmpColName == tmpTestColName: tmpTestDf = pd.concat([tmpTestDf, testX[tmpColName]], axis=1) tmpTestDf.to_csv(tmpTestPath, sep=',', encoding='utf-8') modelFolder = _basePath + "models" + Config.osSep + "top20" + Config.osSep curModel = "Xgboost" modelPath = modelFolder + str(getMatchNameModelPath(modelFolder, curModel)) tmpClf = loadModel( modelPath) log(tmpClf.predict_proba(tmpTestDf)) outDf = pd.DataFrame(tmpClf.predict_proba(tmpTestDf)) outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8') # Get all best model from newX # fab = ModelFactory() # fab._gridSearchFlag = True # fab._subFolderName = "top20" # fab._n_iter_search = 250 # fab._expInfo = expInfo # fab.getAllModels(newX, newY) # finalClf = fab.getRandomForestClf(newX, newY) # Test all data modelList = ["Xgboost","Random_Forest","Extra_Trees", "K_NN", "Logistic_Regression"]
def loadModel(modelPath):
    """Log the path, then return the object joblib loads from it."""
    log("Start load model: ", modelPath)
    return joblib.load(modelPath)
testPath = _basePath + "001_test_tobe.csv" # 1. read data dr = DataReader() dr.readInCSV(path, "train") newX, newY = dr._trainDataFrame, dr._ansDataFrame if doTestFlag == True: dr.readInCSV(testPath, "test") newX = dr._testDataFrame #newX = pd.DataFrame(newX[newX.columns[0]]) print newX # 2. stratify 60 % data and train location only # newX, newY = stratifyData(dr._trainDataFrame, dr._ansDataFrame, 0.4) # 3. get all best model from newX # fab = ModelFactory() # fab._gridSearchFlag = True # fab._n_iter_search = 500 # fab._expInfo = "001_location_only" # fab.getAllModels(newX, newY) # 4. test all data, output 3 ans as features modelPath = _basePath + "(Xgboost)_(2016-02-03_18_39_14).model" tmpOutPath = _basePath + "001_submission_2.csv" tmpClf = loadModel(modelPath) log(tmpClf.predict_proba(newX)) #outDf = pd.concat([newX, pd.DataFrame(tmpClf.predict_proba(newX))], axis=1) outDf = pd.DataFrame(tmpClf.predict_proba(newX)) outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8') musicAlarm()
# drAns = DataReader() # drAns.readInCSV(ansPath, "train") # newY = drAns._ansDataFrame tmpPath = _basePath + "train_merge_one_hot.csv" dr = DataReader() dr.readInCSV(tmpPath, "train") newX = dr._trainDataFrame newY = dr._ansDataFrame fab = ModelFactory() #fab._setXgboostTheradToOne = True fab._gridSearchFlag = True fab._singleModelMail = True fab._subFolderName = "groupby_sum" fab._n_iter_search = 1 fab._expInfo = expInfo clf = fab.getXgboostClf(newX, newY) # tmpPath = _basePath + "test_merge_one_hot" + ".csv" dr = DataReader() dr.readInCSV(tmpPath, "test") newX = dr._testDataFrame newX = xgb.DMatrix(newX) tmpOutPath = _basePath + expNo +"_" + "Xgboost_" + "groupby_sum"+ "_ans.csv" log(clf.predict(newX)) outDf = pd.DataFrame(clf.predict(newX)) outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8') musicAlarm()
def getAllModels(self, X, Y):
    """Build every configured classifier and remember the top-ranked one.

    Xgboost, Random_Forest and Extra_Trees are always built. K_NN,
    Logistic_Regression and Naive_Bayes are built only when
    ``self._onlyTreeBasedModels`` is falsy. Each result is stored in
    ``self._basicClf`` under its model name.

    Afterwards the entries of ``self._bestScoreDict`` are ranked by value,
    largest first. The ranking is logged and mailed, and the classifier
    stored in ``self._bestClf`` under the top-ranked name becomes
    ``self._mvpClf``. When ``self._bestScoreDict`` is empty, ``self._mvpClf``
    is left untouched instead of failing with an IndexError.

    Args:
        X: training features, passed unchanged to every ``get*Clf`` builder.
        Y: training labels, passed unchanged to every ``get*Clf`` builder.
    """
    log("GetAllModels start with iteration numbers: ", self._n_iter_search)
    start = time.time()

    self._basicClf["Xgboost"] = self.getXgboostClf(X, Y)
    self._basicClf["Random_Forest"] = self.getRandomForestClf(X, Y)
    self._basicClf["Extra_Trees"] = self.getExtraTressClf(X, Y)

    if not self._onlyTreeBasedModels:
        self._basicClf["K_NN"] = self.getKnnClf(X, Y)
        self._basicClf["Logistic_Regression"] = self.getLogisticRegressionClf(X, Y)
        self._basicClf["Naive_Bayes"] = self.getNaiveBayesClf(X, Y)

    log("GetAllModels cost: ", time.time() - start, " sec")

    # Rank once and reuse the list for every report below.
    bestScoreList = sorted(self._bestScoreDict.items(),
                           key=lambda x: x[1], reverse=True)
    log(bestScoreList)
    mail(self._expInfo, bestScoreList)
    log(self._expInfo, bestScoreList)

    if bestScoreList:
        mvpName = bestScoreList[0][0]
        log("MVP clf is : ", mvpName)
        self._mvpClf = self._bestClf[mvpName]
    else:
        # Nothing was scored, so there is no winner to pick.
        log("MVP clf is not selected: no score was recorded")

    log("GetAllModels end with iteration numbers: ", self._n_iter_search)
# 4. test all data, output 3 ans as features
# D:\Kaggle\Telstra\004_one_hot_resource_type\(Xgboost)_(2016-02-06_11_14_31).model
# D:\Kaggle\Telstra\004_one_hot_resource_type\(Random_Forest)_(2016-02-06_11_24_09).model
# D:\Kaggle\Telstra\004_one_hot_resource_type\(Extra_Trees)_(2016-02-06_11_30_52).model
# D:\Kaggle\Telstra\004_one_hot_resource_type\(K_NN)_(2016-02-06_11_40_10).model
modelFolder = _basePath + "models" + Config.osSep + "stacked" + Config.osSep

# Models whose predictions are written out, one CSV per model.
clfNameList = [
    "Extra_Trees",
    "K_NN",
    "Random_Forest",
    "Xgboost",
    "Logistic_Regression",
]

testCsv = _basePath + "010_train_tobe.csv"
dr = DataReader()
newX, testY = dr.cvtPathListToDfList(testCsv, "train")

for curModel in clfNameList:
    modelPath = modelFolder + str(getMatchNameModelPath(modelFolder, curModel))
    tmpOutPath = _basePath + expNo + "_blender" + curModel + "_train.csv"
    tmpClf = loadModel(modelPath)

    # Predict once per model; the same array feeds both the log and the CSV.
    predictedProba = tmpClf.predict_proba(newX)
    log(predictedProba)
    #outDf = pd.concat([newX, pd.DataFrame(tmpClf.predict_proba(newX))], axis=1)
    outDf = pd.DataFrame(predictedProba)
    outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
#musicAlarm()
tmpTestPath = _basePath + "test_tobe.csv" tmpOutPath = _basePath + "test_ans.csv" tmpTestDf = pd.DataFrame() for tmpColName in trainColNameList: print tmpColName for tmpTestColName in testX.columns: if tmpColName == tmpTestColName: tmpTestDf = pd.concat([tmpTestDf, testX[tmpColName]], axis=1) tmpTestDf.to_csv(tmpTestPath, sep=',', encoding='utf-8') modelFolder = _basePath + "models" + Config.osSep + "top20" + Config.osSep curModel = "Xgboost" modelPath = modelFolder + str(getMatchNameModelPath(modelFolder, curModel)) tmpClf = loadModel(modelPath) log(tmpClf.predict_proba(tmpTestDf)) outDf = pd.DataFrame(tmpClf.predict_proba(tmpTestDf)) outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8') # Get all best model from newX # fab = ModelFactory() # fab._gridSearchFlag = True # fab._subFolderName = "top20" # fab._n_iter_search = 250 # fab._expInfo = expInfo # fab.getAllModels(newX, newY) # finalClf = fab.getRandomForestClf(newX, newY) # Test all data modelList = [ "Xgboost", "Random_Forest", "Extra_Trees", "K_NN",
'''
Created on Jan 30, 2016

author: whmou

Jan 30, 2016 1.0.0 Init.
'''
from Telstra.util.CustomLogger import info as log

# Smoke check: send a single message through the project logger when this
# module is executed directly.
if __name__ == '__main__':
    log("haha")
def doXgboostRandomSearch(self, X, Y, num_round):
    """Random-search xgboost parameters and return the best booster found.

    Runs ``self._n_iter_search`` trials. Each trial draws a random parameter
    set, scores it with 5-fold ``xgb.cv`` for up to ``num_round`` boosting
    rounds and keeps the round with the lowest cross-validation score. When a
    trial beats the best score so far, a booster is trained on all of
    (X, Y) with that parameter set and round count, dumped to
    ``Config.xgboostBestTmpCflPath`` and remembered. After the last trial the
    winner is passed to ``self.genXgboostRpt``.

    Args:
        X: training features, handed to ``xgb.DMatrix``.
        Y: training labels, handed to ``xgb.DMatrix`` as ``label``.
        num_round: upper bound on boosting rounds evaluated by ``xgb.cv``.

    Returns:
        The booster of the best trial, or None when no trial improved on the
        initial score (for example when ``self._n_iter_search`` is 0).
    """
    bestScore = sys.float_info.max
    bestClf = None
    paramList = []
    best_num_round = 0

    # NOTE(review): set(Y) counts distinct labels only if iterating Y yields
    # label values (Series / array); iterating a DataFrame yields its column
    # names - confirm what callers pass.
    num_class = len(set(Y))
    isBinary = num_class <= 2
    objective = "binary:logistic" if isBinary else "multi:softprob"

    # The training matrix does not depend on the trial, so build it once and
    # use it for both cross-validation and the final fit.
    dtrain = xgb.DMatrix(X, label=Y)

    for i in range(0, self._n_iter_search):
        log("xgboost start random search : " + str(i + 1) + "/" + str(self._n_iter_search))

        param = {}
        param['nthread'] = 4
        param['eta'] = random.uniform(0.15, 0.45)
        param['gamma'] = randint(0, 3)
        param['max_depth'] = randint(8, 120)
        param['min_child_weight'] = randint(1, 3)
        # mlogloss and num_class belong to the multi-class objectives only;
        # a binary objective is evaluated with plain logloss.
        param['eval_metric'] = 'logloss' if isBinary else 'mlogloss'
        param['max_delta_step'] = randint(1, 10)
        param['objective'] = objective
        param['subsample'] = random.uniform(0.45, 0.65)
        if not isBinary:
            param['num_class'] = num_class
        param['silent'] = 1
        param['alpha'] = 1
        param['lambda'] = 1
        #param['early_stopping_rounds']=2
        plst = list(param.items())

        xgbCvResult = xgb.cv(plst, dtrain, num_boost_round=num_round, nfold=5)

        # Rank by the held-out fold metric. The column order of the cv frame
        # is not the same in every xgboost release, so look the column up by
        # name and fall back to the first column only if no such name exists.
        testMeanCols = [col for col in xgbCvResult.columns
                        if str(col).startswith('test-') and str(col).endswith('-mean')]
        scoreCol = testMeanCols[0] if testMeanCols else xgbCvResult.columns[0]
        scoreList = xgbCvResult[scoreCol].tolist()

        # Boosting rounds are 1-based, list positions are 0-based.
        minScore = min(scoreList)
        new_num_round = scoreList.index(minScore) + 1

        if minScore < bestScore:
            log("xgb best score:" + str(minScore))
            log("xgb best num_round: " + str(new_num_round))
            log("xgb best param: " + str(plst))

            bst = xgb.train(plst, dtrain, new_num_round)
            bestScore = minScore
            bestClf = bst
            paramList = plst
            best_num_round = new_num_round
            # Persist every new best so an interrupted search keeps its result.
            joblib.dump(bst, Config.xgboostBestTmpCflPath)

    self.genXgboostRpt(bestClf, bestScore, paramList, best_num_round)

    return bestClf