def sumExist(self):
    """Expand the CSV at self._pathMain into per-value one-hot indicator
    columns on the train/test frame and write the result to
    self._outputPathName.

    NOTE(review): assumes both CSVs are sorted by the id column so the
    tmpLastI2 resume-pointer scan below is valid -- confirm with the data.
    """
    dr = DataReader()
    # The same CSV is read twice: via DataReader (the frame to decorate)
    # and raw via pandas (the id/value rows to scan).
    dr.readInCSV(self._pathMain, self._mode)
    tmpColumnPrefix = self._typeName + "_"
    df = pd.read_csv(self._pathMain, header=0, sep=',')
    if self._mode == "train":
        processDf = dr._trainDataFrame
    else:
        processDf = dr._testDataFrame
    # Pre-create 1126 zero-filled one-hot columns (suffixes 1..1126).
    for i in range (1,1127):
        tmpColName = tmpColumnPrefix + "one_hot_" + str(i)
        processDf[tmpColName] = 0
    tmpLastI2 = 0
    # For each processDf row, scan df rows sharing its id and flag the
    # one-hot column selected by each matching row's value.
    for i1 in range(0, len(processDf[processDf.columns[0]] )):
        tmpFlag = False
        for i2 in range(tmpLastI2, len(df[df.columns[0]] )):
            tmpMainId = processDf[processDf.columns[0]][i1]
            tmpId = df[df.columns[0]][i2]
            tmpVal= df[df.columns[1]][i2]
            #tmpVal2= df[df.columns[2]][i2]
            if tmpMainId == tmpId:
                tmpFlag = True
                # NOTE(review): +394 offsets tmpVal into the one-hot
                # column range; chained indexing like this mutates via a
                # view only on older pandas -- verify it still writes
                # through to processDf.
                processDf[processDf.columns[tmpVal+394]][i1] =1
            # Once past the matching run of rows, remember where to resume
            # for the next processDf row and stop scanning.
            if tmpFlag == True and tmpMainId != tmpId:
                tmpLastI2 = i2
                break
            #print i1, i2
    #outDf = pd.concat([dr._ansDataFrame, processDf], axis=1)
    outDf = processDf
    outDf.to_csv(self._outputPathName, sep=',', encoding='utf-8')
def oneHot():
    """One-hot encode the event-type CSV onto the train/test frame and
    write the result to _outputPathName.

    NOTE(review): reads module-level globals (_pathMain, _mode, _typeName,
    _eventTypePath, _outputPathName) -- they must be assigned before the
    call. Python 2 print statements below.
    """
    dr = DataReader()
    dr.readInCSV(_pathMain, _mode)
    tmpColumnPrefix = _typeName + "_"
    df = pd.read_csv(_eventTypePath, header=0, sep=',')
    if _mode == "train":
        processDf = dr._trainDataFrame
    else:
        processDf = dr._testDataFrame
    # Pre-create 54 zero-filled one-hot columns (suffixes 1..54).
    for i in range (1, 55):
        tmpColName = tmpColumnPrefix + "one_hot_" + str(i)
        processDf[tmpColName] = 0
    tmpLastI2 = 0
    # Same sorted-merge scan as sumExist: walk df once, resuming from
    # tmpLastI2 for each successive processDf row.
    for i1 in range(0, len(processDf[processDf.columns[0]])):
        tmpFlag = False
        for i2 in range(tmpLastI2, len(df[df.columns[0]])):
            tmpMainId = processDf[processDf.columns[0]][i1]
            tmpId = df[df.columns[0]][i2]
            tmpVal = df[df.columns[1]][i2]
            # tmpVal2= df[df.columns[2]][i2]
            if tmpMainId == tmpId:
                tmpFlag = True
                print tmpVal
                # NOTE(review): the +394 offset appears copied from the
                # 1126-column variant (sumExist); with only 54 one-hot
                # columns here it may select the wrong column -- confirm.
                processDf[processDf.columns[tmpVal + 394]][i1] = 1
            if tmpFlag == True and tmpMainId != tmpId:
                tmpLastI2 = i2
                break
            print i1, i2
    # outDf = pd.concat([dr._ansDataFrame, processDf], axis=1)
    outDf = processDf
    outDf.to_csv(_outputPathName, sep=',', encoding='utf-8')
if __name__ == '__main__':
    # 1. read in data -- experiment 008 (blender) path setup.
    expNo = "008"
    expInfo = expNo + "_blender"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    doTestFlag = False
    path = _basePath + expNo + "_train_tobe.csv"
    testPath = _basePath + expNo + "_test_tobe.csv"
    outputPath = _basePath + expNo + "blender_train.csv"
    # 1. read data: load each per-model training-prediction CSV and
    # collect the feature frames for blending.
    dr = DataReader()
    tmpDfList = []
    tmpPath = _basePath + "008_submission_1_train_Extra_Trees.csv"
    newX, newY = dr.cvtPathListToDfList(tmpPath, "train")
    tmpDfList.append(newX)
    tmpPath = _basePath + "008_submission_1_train_K_NN.csv"
    newX, newY = dr.cvtPathListToDfList(tmpPath, "train")
    tmpDfList.append(newX)
    tmpPath = _basePath + "008_submission_1_train_Random_Forest.csv"
    newX, newY = dr.cvtPathListToDfList(tmpPath, "train")
    tmpDfList.append(newX)
    tmpPath = _basePath + "008_submission_1_train_Xgboost.csv"
    newX, newY = dr.cvtPathListToDfList(tmpPath, "train")
    # NOTE(review): chunk ends here -- the Xgboost frame is presumably
    # appended in code beyond this view.
from Telstra.Bartender.Blender import Blender

if __name__ == '__main__':
    # 1. read in data -- experiment 013 (data exploration) path setup.
    expNo = "013"
    expInfo = expNo + "_data_exploration"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    path = _basePath + expNo + "_train_tobe.csv"
    testPath = _basePath + expNo + "_test_asis.csv"
    testSortIdPath = Config.FolderBasePath + "test_sort_id.csv"
    trainSortIdPath = _basePath + "train_sort_id.csv"
    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame
    dr2 = DataReader()
    dr2.readInCSV(testPath, "test")
    #newX = dr2._testDataFrame
    # Load the sorted-id reference frames.
    dr3 = DataReader()
    dr3.readInCSV(testSortIdPath, "test")
    sortIdDf = dr3._testDataFrame
    dr4 = DataReader()
    dr4.readInCSV(trainSortIdPath, "test")
    # NOTE(review): this overwrites the test-side sortIdDf assigned just
    # above -- confirm whether both id frames are needed downstream.
    sortIdDf = dr4._testDataFrame
import pandas as pd
from Telstra.Bartender.Blender import Blender
# NOTE(review): 'test._mock_backport' looks like an accidental IDE
# auto-import; 'inplace' is not used in this chunk.
from test._mock_backport import inplace
import random
import xgboost as xgb
import numpy as np

if __name__ == '__main__':
    # 1. read in data -- experiment 021 (stacking).
    expNo = "021"
    expInfo = expNo + "_stacking"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    tmpPath = _basePath + "train.csv"
    dr = DataReader()
    dr.readInCSV(tmpPath, "train")
    X = dr._trainDataFrame
    Y = dr._ansDataFrame
    # Keep untouched copies before fold splitting.
    ori_X = X
    ori_Y = Y
    # Randomly sample half the rows as fold 1; the remainder is fold 2.
    evalDataPercentage = 0.5
    # NOTE(review): len(X) * 0.5 is a float; np.random.choice's size must
    # be an int on newer NumPy. choice also samples WITH replacement by
    # default, so fold 1 can contain duplicate rows -- confirm intent.
    sampleRows = np.random.choice(X.index, len(X) * evalDataPercentage)
    # NOTE(review): .ix is removed in modern pandas (use .loc/.iloc).
    train_fold_1 = X.ix[sampleRows]
    train_fold_label_1 = Y.ix[sampleRows]
    train_fold_2 = X.drop(sampleRows)
    train_fold_label_2 = Y.drop(sampleRows)
    # tmpOutPath = _basePath + expNo +"_" + "fold_1.csv"
# NOTE(review): chunk begins mid-expression -- these rows close a
# per-classifier weight matrix opened above this view.
    [0.24276169, 0.02004454, 0.00445434, 0.71714922, 0.0155902],
    [0.00310559, 0.53881988, 0.03416149, 0.4052795, 0.01863354],
    [0.13333333, 0.01333333, 0.01142857, 0.73142857, 0.11047619],
    [0.08222222, 0.00222222, 0.00222222, 0.00222222, 0.91111111]
]
# for tmpFeature in featureList:
#     dr = DataReader()
#     tmpPath = _basePath + "010_blender_" + tmpFeature + "_train.csv"
#     newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train")
#     tmpDf = pd.concat([tmpDf, newX], axis=1)
#
# tmpDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
tmpI, tmpJ = 0, 0
dr = DataReader()
# Read one blended train CSV mainly to obtain the answer column ansY.
baseDf, ansY = dr.cvtPathListToDfList(
    _basePath + "010_blenderXgboost_train.csv", "train")
tmpOutPath = _basePath + "010_train_last_blender.csv"
tmpFeatureBlendedAns = pd.DataFrame()
# NOTE(review): baseDf read above is discarded by this reassignment.
baseDf = pd.DataFrame()
tmpDfList = []
# Collect each classifier's blended train predictions as blender inputs.
for tmpClfName in clfNameList:
    dr = DataReader()
    tmpPath = _basePath + "010_" + "blender" + tmpClfName + "_train.csv"
    newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train")
    tmpDfList.append(newX)
# Search blending weights for 2000 iterations and write the result.
b1 = Blender(clfNameList, tmpDfList, ansY)
b1.autoFlow(2000, tmpOutPath)
# 4. test all data, output 3 ans as features # D:\Kaggle\Telstra\004_one_hot_resource_type\(Xgboost)_(2016-02-06_11_14_31).model # D:\Kaggle\Telstra\004_one_hot_resource_type\(Random_Forest)_(2016-02-06_11_24_09).model # D:\Kaggle\Telstra\004_one_hot_resource_type\(Extra_Trees)_(2016-02-06_11_30_52).model # D:\Kaggle\Telstra\004_one_hot_resource_type\(K_NN)_(2016-02-06_11_40_10).model modelFolder = _basePath + "models" + Config.osSep + "stacked" + Config.osSep clfNameList = [] clfNameList.append("Extra_Trees") clfNameList.append("K_NN") clfNameList.append("Random_Forest") clfNameList.append("Xgboost") clfNameList.append("Logistic_Regression") testCsv = _basePath + "010_train_tobe.csv" dr = DataReader() newX, testY = dr.cvtPathListToDfList(testCsv, "train") for curModel in clfNameList: modelPath = modelFolder + str(getMatchNameModelPath(modelFolder, curModel)) tmpOutPath = _basePath + expNo + "_blender" + curModel + "_train.csv" tmpClf = loadModel( modelPath) log(tmpClf.predict_proba(newX)) #outDf = pd.concat([newX, pd.DataFrame(tmpClf.predict_proba(newX))], axis=1) outDf = pd.DataFrame(tmpClf.predict_proba(newX)) outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8') #musicAlarm()
#D:\Kaggle\Telstra\004_one_hot_resource_type\(K_NN)_(2016-02-06_11_40_10).model #Logistic_Regression modelList = [ "Xgboost", "Random_Forest", "Extra_Trees", "K_NN", "Logistic_Regression" ] featureList = [ "event_type", "log_feature", "resource_type", "severity_type" ] for tmpFeature in featureList: for tmpModel in modelList: subFolder = tmpFeature curModel = tmpModel tmpCsvPath = _basePath + expNo + "_" + tmpFeature + "_test_tobe.csv" dr = DataReader() dr.readInCSV(tmpCsvPath, "train") newX = dr._trainDataFrame modelFolder = _basePath + "models" + Config.osSep + subFolder + Config.osSep modelPath = modelFolder + str( getMatchNameModelPath(modelFolder, curModel)) tmpOutPath = _basePath + "010_" + curModel + "_stack_" + subFolder + "_test.csv" tmpClf = loadModel(modelPath) log(tmpClf.predict_proba(newX)) outDf = pd.concat( [newX, pd.DataFrame(tmpClf.predict_proba(newX))], axis=1) outDf = pd.DataFrame(tmpClf.predict_proba(newX)) outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8') #musicAlarm() # log("004 Done")
# NOTE(review): chunk begins mid-expression -- these rows close a
# per-classifier weight matrix opened above this view.
    [0.13333333, 0.01333333, 0.01142857, 0.73142857, 0.11047619],
    [0.08222222, 0.00222222, 0.00222222, 0.00222222, 0.91111111]
]
tmpOutPath = _basePath + "010_test_tobe.csv"
# for tmpFeature in featureList:
#     dr = DataReader()
#     tmpPath = _basePath + "010_blender_" + tmpFeature + "_train.csv"
#     newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train")
#     tmpDf = pd.concat([tmpDf, newX], axis=1)
#
# tmpDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
tmpI, tmpJ = 0, 0
dr = DataReader()
baseDf, tmpY = dr.cvtPathListToDfList(_basePath + "010_test_asis.csv", "test")
# Blend each feature's per-classifier test predictions using the weight
# matrix above (tmpI/tmpJ index feature/classifier cells).
for tmpFeature in featureList:
    outputPath = _basePath + expNo + "_blender_" + tmpFeature + "_test.csv"
    #ansPath = _basePath + "010_Extra_Trees_stack_event_type.csv"
    #dr = DataReader()
    #tmpX, ansY = dr.cvtPathListToDfList(ansPath, "train")
    #tmpDfList = []
    tmpFeatureBlendedAns = pd.DataFrame()
    for tmpClfName in clfNameList:
        dr = DataReader()
        tmpPath = _basePath + "010_" + tmpClfName + "_stack_" + tmpFeature + "_test.csv"
        newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train")
        # NOTE(review): 'tmpWight' (sic) -- the weight for this cell.
        # Chunk ends mid-loop; the accumulation continues beyond this view.
        tmpWight = tmpWeightList[tmpI][tmpJ]
from Telstra.util.ModelUtils import deleteModelFiles
import pandas as pd
from Telstra.Bartender.Blender import Blender

if __name__ == "__main__":
    # 1. read in data -- experiment 012: random-forest feature importance.
    expNo = "012"
    expInfo = expNo + "_rf_chk_important"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    path = _basePath + expNo + "_train_asis.csv"
    testPath = _basePath + expNo + "_test_asis.csv"
    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame
    # Get all best model from newX (randomized search, 250 iterations).
    fab = ModelFactory()
    fab._gridSearchFlag = True
    # fab._subFolderName = "stacked"
    fab._n_iter_search = 250
    fab._expInfo = expInfo
    # fab.getAllModels(newX, newY)
    finalClf = fab.getRandomForestClf(newX, newY)
    # Pair each column with its importance score.
    featureImportance = []
    for i in range(0, len(finalClf.feature_importances_)):
        if i != len(dr._trainDataFrame.columns):
            # NOTE(review): chunk ends mid-loop; the append happens
            # beyond this view.
# tmpPath = _basePath + "test_2.csv" # dr = DataReader() # dr.readInCSV(tmpPath, "test") # newX = dr._testDataFrame # newY = dr._ansDataFrame # newX = xgb.DMatrix(newX) # #print clf.predict(newX) # tmpOutPath = _basePath + expNo +"_" + "Xgboost" + "_testXgboost7_ans.csv" # log(clf.predict(newX)) # outDf = pd.DataFrame(clf.predict(newX)) # outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8') # musicAlarm() clf = joblib.load( "F:\\xgboost_tmp_best_020.model" ) tmpPath = _basePath + "test_merge_one_hot" + ".csv" dr = DataReader() dr.readInCSV(tmpPath, "test") newX = dr._testDataFrame newX = xgb.DMatrix(newX) tmpOutPath = _basePath + expNo +"_" + "Xgboost_" + "groupby_sum"+ "_ans_" + "2" + ".csv" log(clf.predict(newX)) outDf = pd.DataFrame(clf.predict(newX)) outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8') musicAlarm() # sampleRows = np.random.choice(X.index, len(X)*evalDataPercentage) # # print X.ix[sampleRows] # exit()
if __name__ == '__main__':
    # 1. read in data -- experiment 013 (data exploration) path setup.
    expNo = "013"
    expInfo = expNo + "_data_exploration"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    path = _basePath + expNo + "_train_tobe.csv"
    testPath = _basePath + expNo + "_test_asis.csv"
    testSortIdPath = Config.FolderBasePath + "test_sort_id.csv"
    trainSortIdPath = _basePath + "train_sort_id.csv"
    dr = DataReader()
    dr.readInCSV( path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame
    dr2 = DataReader()
    dr2.readInCSV( testPath, "test")
    #newX = dr2._testDataFrame
    # Load the sorted-id reference frames.
    dr3 = DataReader()
    dr3.readInCSV( testSortIdPath, "test")
    sortIdDf =dr3._testDataFrame
    dr4 = DataReader()
    dr4.readInCSV(trainSortIdPath, "test")
    # NOTE(review): overwrites the test-side sortIdDf assigned just above
    # -- confirm whether both frames are needed downstream.
    sortIdDf =dr4._testDataFrame
# fab._expInfo = expInfo # fab.getAllModels(newX, newY) # 4. test all data, output 3 ans as features #D:\Kaggle\Telstra\004_one_hot_resource_type\(Xgboost)_(2016-02-06_11_14_31).model #D:\Kaggle\Telstra\004_one_hot_resource_type\(Random_Forest)_(2016-02-06_11_24_09).model #D:\Kaggle\Telstra\004_one_hot_resource_type\(Extra_Trees)_(2016-02-06_11_30_52).model #D:\Kaggle\Telstra\004_one_hot_resource_type\(K_NN)_(2016-02-06_11_40_10).model #Logistic_Regression modelList = ["Xgboost","Random_Forest","Extra_Trees", "K_NN", "Logistic_Regression"] featureList = ["event_type", "log_feature", "resource_type", "severity_type"] for tmpFeature in featureList: for tmpModel in modelList: subFolder = tmpFeature curModel = tmpModel tmpCsvPath = _basePath + expNo + "_" + tmpFeature +"_test_tobe.csv" dr = DataReader() dr.readInCSV(tmpCsvPath , "train") newX = dr._trainDataFrame modelFolder = _basePath + "models" + Config.osSep + subFolder + Config.osSep modelPath = modelFolder + str(getMatchNameModelPath(modelFolder, curModel)) tmpOutPath = _basePath + "010_" + curModel + "_stack_" + subFolder + "_test.csv" tmpClf = loadModel( modelPath) log(tmpClf.predict_proba(newX)) outDf = pd.concat([newX, pd.DataFrame(tmpClf.predict_proba(newX))], axis=1) outDf = pd.DataFrame(tmpClf.predict_proba(newX)) outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8') #musicAlarm() # log("004 Done")
# outputPath = _basePath + "blendTestOut.csv" # dr.readInCSV(path, "train") # newX, newY = dr._trainDataFrame, dr._ansDataFrame # # # dr2 = DataReader() # dr2.readInCSV(path, "train") # newX2, newY2 = dr2._trainDataFrame, dr2._ansDataFrame # # predictDfList = [] # predictDfList.append(newX) # predictDfList.append(newX2) clfNameList = [] clfNameList.append("test") clfNameList.append("test2") # # b1 = Blender(clfNameList, predictDfList, newY) # inputWeightList = b1.getRandomWeightList(2) # #print inputWeightList # tmpDf = b1.doBlending(inputWeightList) # #b1.calLogLoss(tmpDf) # b1.autoFlow(11, outputPath) dr3 = DataReader() path = _basePath + "testLogLoss.csv" dr3.readInCSV(path, "train") predictDfList = [] predictDfList.append(dr3._trainDataFrame) b2 = Blender(clfNameList, predictDfList, dr3._ansDataFrame) print b2.calLogLoss(dr3._trainDataFrame)
# 4. test all data, output 3 ans as features # D:\Kaggle\Telstra\004_one_hot_resource_type\(Xgboost)_(2016-02-06_11_14_31).model # D:\Kaggle\Telstra\004_one_hot_resource_type\(Random_Forest)_(2016-02-06_11_24_09).model # D:\Kaggle\Telstra\004_one_hot_resource_type\(Extra_Trees)_(2016-02-06_11_30_52).model # D:\Kaggle\Telstra\004_one_hot_resource_type\(K_NN)_(2016-02-06_11_40_10).model modelFolder = _basePath + "models" + Config.osSep + "stacked" + Config.osSep clfNameList = [] clfNameList.append("Extra_Trees") clfNameList.append("K_NN") clfNameList.append("Random_Forest") clfNameList.append("Xgboost") clfNameList.append("Logistic_Regression") testCsv = _basePath + "010_train_tobe.csv" dr = DataReader() newX, testY = dr.cvtPathListToDfList(testCsv, "train") for curModel in clfNameList: modelPath = modelFolder + str( getMatchNameModelPath(modelFolder, curModel)) tmpOutPath = _basePath + expNo + "_blender" + curModel + "_train.csv" tmpClf = loadModel(modelPath) log(tmpClf.predict_proba(newX)) #outDf = pd.concat([newX, pd.DataFrame(tmpClf.predict_proba(newX))], axis=1) outDf = pd.DataFrame(tmpClf.predict_proba(newX)) outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8') #musicAlarm()
# 1. read in data expNo = "020" expInfo = expNo + "_groupby_sum" _basePath = Config.FolderBasePath + expInfo + Config.osSep featureList = ["location", "event_type", "resource_type" , "severity_type", "log_feature"] ans1List = [] ans2List = [] # ansPath = _basePath + "014_ans_array.csv" # drAns = DataReader() # drAns.readInCSV(ansPath, "train") # newY = drAns._ansDataFrame tmpPath = _basePath + "train_merge_one_hot.csv" dr = DataReader() dr.readInCSV(tmpPath, "train") newX = dr._trainDataFrame newY = dr._ansDataFrame fab = ModelFactory() #fab._setXgboostTheradToOne = True fab._gridSearchFlag = True fab._singleModelMail = True fab._subFolderName = "groupby_sum" fab._n_iter_search = 1 fab._expInfo = expInfo clf = fab.getXgboostClf(newX, newY) # tmpPath = _basePath + "test_merge_one_hot" + ".csv"
from Telstra.Bartender.Blender import Blender

if __name__ == '__main__':
    # 1. read in data -- experiment 008 (blender) path setup.
    expNo = "008"
    expInfo = expNo + "_blender"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    doTestFlag = False
    path = _basePath + expNo + "_train_tobe.csv"
    testPath = _basePath + expNo + "_test_tobe.csv"
    outputPath = _basePath + expNo + "blender_train.csv"
    # 1. read data: load each per-model training-prediction CSV and
    # collect the feature frames for blending.
    dr = DataReader()
    tmpDfList = []
    tmpPath = _basePath + "008_submission_1_train_Extra_Trees.csv"
    newX, newY = dr.cvtPathListToDfList(tmpPath, "train")
    tmpDfList.append(newX)
    tmpPath = _basePath + "008_submission_1_train_K_NN.csv"
    newX, newY = dr.cvtPathListToDfList(tmpPath, "train")
    tmpDfList.append(newX)
    tmpPath = _basePath + "008_submission_1_train_Random_Forest.csv"
    newX, newY = dr.cvtPathListToDfList(tmpPath, "train")
    tmpDfList.append(newX)
    tmpPath = _basePath + "008_submission_1_train_Xgboost.csv"
    newX, newY = dr.cvtPathListToDfList(tmpPath, "train")
    # NOTE(review): chunk ends here -- the Xgboost frame is presumably
    # appended in code beyond this view.
import pandas as pd
from Telstra.Bartender.Blender import Blender

if __name__ == '__main__':
    # 1. read in data -- experiment 014: one-hot each feature subset.
    expNo = "014"
    expInfo = expNo + "_one_hot_each_features"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    featureList = ["location", "event_type", "resource_type" , "severity_type", "log_feature"]
    ansPath = _basePath + "014_ans_array.csv"
    drAns = DataReader()
    drAns.readInCSV(ansPath, "train")
    newY = drAns._ansDataFrame
    # Enumerate the 31 non-empty subsets of the 5 features: the binary
    # digits of i act as inclusion flags.
    for i in range(1,32):
        log( "start " + str(i) + "/32 ...")
        tmpCurFeatureList = []
        flagList =[]
        # Left-pad with zeros: bin(i) looks like '0b101', so
        # 7 - len(bin(i)) pads the flag list to 5 entries.
        for i2 in range (0, 7- len(bin(i))):
            flagList.append(0)
        for i2 in range(2,len(bin(i))):
            flagList.append(int(bin(i)[i2]))
        # NOTE(review): chunk ends mid-loop; feature selection based on
        # flagList continues beyond this view.
from Telstra.util.ModelUtils import getMatchNameModelPath
from Telstra.util.ModelUtils import deleteModelFiles
import pandas as pd
from Telstra.Bartender.Blender import Blender

if __name__ == '__main__':
    # 1. read in data -- experiment 013 (binary-feature models).
    expNo = "013"
    expInfo = expNo + "_data_exploration"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    path = _basePath + expNo + "_train_tobe.csv"
    testPath = _basePath + expNo + "_test_asis.csv"
    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame
    # Get all best model from newX (randomized search, 50 iterations),
    # then play an audible alert when training finishes.
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._subFolderName = "binary"
    fab._n_iter_search = 50
    fab._expInfo = expInfo
    fab.getAllModels(newX, newY)
    musicAlarm()
    # Test all data
    # NOTE(review): chunk ends inside this list literal; it continues
    # beyond this view.
    modelList = [ "Xgboost", "Random_Forest", "Extra_Trees", "K_NN",
import pandas as pd

if __name__ == '__main__':
    # 1. read in data -- experiment 003: one-hot event_type.
    expNo = "003"
    expInfo = expNo + "_one_hot_event_type"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    doTestFlag = False
    path = _basePath + expNo + "_train_tobe.csv"
    testPath = _basePath + expNo + "_test_tobe.csv"
    # 1. read data
    dr = DataReader()
    dr.readInCSV( path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame
    # Optionally swap in the test frame instead (off by default).
    if doTestFlag == True:
        dr.readInCSV(testPath , "test")
        newX = dr._testDataFrame
        #newX = pd.DataFrame(newX[newX.columns[0]])
        #print newX
    # 3. get all best model from newX (randomized search, 30 iterations).
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._n_iter_search = 30
    fab._expInfo = expInfo
    fab.getAllModels(newX, newY)
def exp():
    """Train models on the location-only dataset and (optionally) return
    test-set class probabilities.

    NOTE(review): every line that would assign 'finalClf' below is
    commented out, so with doTestFlag=True this raises NameError --
    confirm before enabling the test path.
    """
    expInfo = "location_only\\"
    _basePath = Config.FolderBasePath + expInfo
    doTestFlag = False
    path = _basePath + "train.csv"
    testPath = _basePath + "test10.csv"
    # 1. read data
    dr = DataReader()
    dr.readInCSV(path, "train")
    if doTestFlag == True:
        dr.readInCSV(testPath, "test")
    # 2. run models (randomized search, 10 iterations -- currently all
    # actual training calls are commented out).
    #print dr._trainDataFrame.as_matrix
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._n_iter_search = 10
    fab._expInfo = "location_only"
    X = dr._trainDataFrame
    Y = dr._ansDataFrame
    #fab.getRandomForestClf(X, Y)
    #fab.getAllModels(dr._trainDataFrame, dr._ansDataFrame)
    # (historical xgboost experiments retained below for reference)
    # log( "xgb start")
    # param = {'max_depth':10, 'n_estimators':300 , 'num_class':3, 'learning_rate':0.05, 'objective':'multi:softprob'}
    # num_round = 5
    #gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob').fit(dr._trainDataFrame, dr._ansDataFrame)
    #testResult = gbm.predict_proba(dr._testDataFrame)
    #print testResult
    # gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob')
    # scores = cross_val_score(rfClf, dr._trainDataFrame, dr._ansDataFrame, n_jobs = -1)
    # log( "xgboost Validation Precision: ", scores.mean() )
    #xgbCv = xgb.cv(param, xgb.DMatrix(dr._trainDataFrame, dr._ansDataFrame), num_round, nfold=5,metrics={'error'}, seed = 0)
    #gbTrain = gbm.fit(dr._trainDataFrame, dr._ansDataFrame)
    #joblib.dump(gbTrain, xgbModelPath)
    #clf = joblib.load( xgbModelPath )
    #clf.predict_proba(dr._testDataFrame)
    #xgb.save(gbm, xgbModelPath)
    #print xgbCv
    #print "xgb end"
    #gbm = joblib.load( xgbModelPath )
    #finalClf = gbm
    if doTestFlag == True:
        print finalClf.predict_proba(dr._testDataFrame)
    # featureImportance =[]
    # for i in range(0,len(finalClf.feature_importances_)):
    #     if i != len(dr._trainDataFrame.columns):
    #         if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1:
    #             featureImportance.append( [dr._trainDataFrame.columns[i] , finalClf.feature_importances_[i]] )
    #
    # print featureImportance
    # featureImportance.sort(lambda x, y: cmp(x[1], y[1]), reverse=True)
    # print featureImportance
    if doTestFlag == True:
        return finalClf.predict_proba(dr._testDataFrame)
# NOTE(review): chunk begins mid-expression -- these rows close a
# per-classifier weight matrix opened above this view.
    [0.24276169, 0.02004454, 0.00445434, 0.71714922, 0.0155902],
    [0.00310559, 0.53881988, 0.03416149, 0.4052795, 0.01863354],
    [0.13333333, 0.01333333, 0.01142857, 0.73142857, 0.11047619],
    [0.08222222, 0.00222222, 0.00222222, 0.00222222, 0.91111111],
]
# for tmpFeature in featureList:
#     dr = DataReader()
#     tmpPath = _basePath + "010_blender_" + tmpFeature + "_train.csv"
#     newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train")
#     tmpDf = pd.concat([tmpDf, newX], axis=1)
#
# tmpDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
tmpI, tmpJ = 0, 0
dr = DataReader()
# Read one blended train CSV mainly to obtain the answer column ansY.
baseDf, ansY = dr.cvtPathListToDfList(_basePath + "010_blenderXgboost_train.csv", "train")
tmpOutPath = _basePath + "010_train_last_blender.csv"
tmpFeatureBlendedAns = pd.DataFrame()
# NOTE(review): baseDf read above is discarded by this reassignment.
baseDf = pd.DataFrame()
tmpDfList = []
# Collect each classifier's blended train predictions as blender inputs.
for tmpClfName in clfNameList:
    dr = DataReader()
    tmpPath = _basePath + "010_" + "blender" + tmpClfName + "_train.csv"
    newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train")
    tmpDfList.append(newX)
# Search blending weights for 2000 iterations and write the result.
b1 = Blender(clfNameList, tmpDfList, ansY)
b1.autoFlow(2000, tmpOutPath)
# NOTE(review): chunk begins mid-expression -- these rows close a
# per-classifier weight matrix opened above this view.
    [0.13333333, 0.01333333, 0.01142857, 0.73142857, 0.11047619],
    [0.08222222, 0.00222222, 0.00222222, 0.00222222, 0.91111111],
]
tmpOutPath = _basePath + "010_test_tobe.csv"
# for tmpFeature in featureList:
#     dr = DataReader()
#     tmpPath = _basePath + "010_blender_" + tmpFeature + "_train.csv"
#     newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train")
#     tmpDf = pd.concat([tmpDf, newX], axis=1)
#
# tmpDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
tmpI, tmpJ = 0, 0
dr = DataReader()
baseDf, tmpY = dr.cvtPathListToDfList(_basePath + "010_test_asis.csv", "test")
# Blend each feature's per-classifier test predictions using the weight
# matrix above (tmpI/tmpJ index feature/classifier cells).
for tmpFeature in featureList:
    outputPath = _basePath + expNo + "_blender_" + tmpFeature + "_test.csv"
    # ansPath = _basePath + "010_Extra_Trees_stack_event_type.csv"
    # dr = DataReader()
    # tmpX, ansY = dr.cvtPathListToDfList(ansPath, "train")
    # tmpDfList = []
    tmpFeatureBlendedAns = pd.DataFrame()
    for tmpClfName in clfNameList:
        dr = DataReader()
        tmpPath = _basePath + "010_" + tmpClfName + "_stack_" + tmpFeature + "_test.csv"
        newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train")
        # NOTE(review): 'tmpWight' (sic) -- the weight for this cell.
        # Chunk ends mid-loop; the accumulation continues beyond this view.
        tmpWight = tmpWeightList[tmpI][tmpJ]
import pandas as pd
from Telstra.Bartender.Blender import Blender

if __name__ == '__main__':
    # 1. read in data -- experiment 011: remove one-hot columns.
    expNo = "011"
    expInfo = expNo + "_remove_one_hot"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    path = _basePath + expNo + "_train_tobe.csv"
    testPath = _basePath + expNo + "_test_tobe.csv"
    dr = DataReader()
    dr.readInCSV( path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame
    # Get all best model from newX (randomized search, 100 iterations).
    fab = ModelFactory()
    fab._gridSearchFlag = True
    # fab._subFolderName = "stacked"
    fab._n_iter_search = 100
    fab._expInfo = expInfo
    fab.getAllModels(newX, newY)
    # Test all data
    modelList = ["Xgboost","Random_Forest","Extra_Trees", "K_NN", "Logistic_Regression"]
    # featureList = ["event_type", "log_feature", "resource_type", "severity_type"]
import os

# Resolve the competition data folder per-OS (Windows vs. Mac paths).
_basePath =""
if os.name == 'nt':
    _basePath = "D:\\Kaggle\\Telstra\\"
else:
    _basePath = "/Users/whmou/Kaggle/Telstra/"

testPath = _basePath + "test6.csv"  # take id list only
testPath2 = _basePath + "test11.csv"
samplePath = _basePath + "sample_submission.csv"
outputPath = _basePath+"temp_submission4.csv"

if __name__ == '__main__':
    # Python 2 print statement.
    print "start to make submission version:", outputPath
    # Read the id column of the test set for the submission frame.
    dr = DataReader()
    dr.readInCSV(testPath, "test")
    idList = dr._testDataFrame[dr._testDataFrame.columns[0]]
    dr2= DataReader()
    dr2.readInCSV(testPath2, "test")
    dr3= DataReader()
    dr3.readInCSV(samplePath, "test")
    sampleIdList = dr3._testDataFrame[dr3._testDataFrame.columns[0]]
    # Run the experiment and pair its probabilities with the id column.
    tmp = pd.DataFrame(exp())
    ansArr = pd.concat([idList, tmp], axis=1)
    print ansArr
from Telstra.util.CustomLogger import musicAlarm
from Telstra.util.ModelUtils import loadModel
import pandas as pd

if __name__ == '__main__':
    # 1. read in data -- experiment 001: location-only features.
    expInfo = "001_location_only" + Config.osSep
    _basePath = Config.FolderBasePath + expInfo
    doTestFlag = True
    path = _basePath + "001_train_tobe.csv"
    testPath = _basePath + "001_test_tobe.csv"
    # 1. read data
    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame
    # With doTestFlag=True, replace newX with the test frame.
    if doTestFlag == True:
        dr.readInCSV(testPath, "test")
        newX = dr._testDataFrame
        #newX = pd.DataFrame(newX[newX.columns[0]])
        print newX
    # 2. stratify 60 % data and train location only
    # newX, newY = stratifyData(dr._trainDataFrame, dr._ansDataFrame, 0.4)
    # 3. get all best model from newX
    # fab = ModelFactory()
    # fab._gridSearchFlag = True
    # fab._n_iter_search = 500
    # fab._expInfo = "001_location_only"
def exp():
    """Train models on the location-only dataset and (optionally) return
    test-set class probabilities.

    NOTE(review): every line that would assign 'finalClf' below is
    commented out, so with doTestFlag=True this raises NameError --
    confirm before enabling the test path.
    """
    expInfo = "location_only\\"
    _basePath = Config.FolderBasePath + expInfo
    doTestFlag = False
    path = _basePath + "train.csv"
    testPath = _basePath + "test10.csv"
    # 1. read data
    dr = DataReader()
    dr.readInCSV( path, "train")
    if doTestFlag == True:
        dr.readInCSV(testPath , "test")
    # 2. run models (randomized search, 10 iterations -- currently all
    # actual training calls are commented out).
    #print dr._trainDataFrame.as_matrix
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._n_iter_search = 10
    fab._expInfo = "location_only"
    X = dr._trainDataFrame
    Y = dr._ansDataFrame
    #fab.getRandomForestClf(X, Y)
    #fab.getAllModels(dr._trainDataFrame, dr._ansDataFrame)
    # (historical xgboost experiments retained below for reference)
    # log( "xgb start")
    # param = {'max_depth':10, 'n_estimators':300 , 'num_class':3, 'learning_rate':0.05, 'objective':'multi:softprob'}
    # num_round = 5
    #gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob').fit(dr._trainDataFrame, dr._ansDataFrame)
    #testResult = gbm.predict_proba(dr._testDataFrame)
    #print testResult
    # gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob')
    # scores = cross_val_score(rfClf, dr._trainDataFrame, dr._ansDataFrame, n_jobs = -1)
    # log( "xgboost Validation Precision: ", scores.mean() )
    #xgbCv = xgb.cv(param, xgb.DMatrix(dr._trainDataFrame, dr._ansDataFrame), num_round, nfold=5,metrics={'error'}, seed = 0)
    #gbTrain = gbm.fit(dr._trainDataFrame, dr._ansDataFrame)
    #joblib.dump(gbTrain, xgbModelPath)
    #clf = joblib.load( xgbModelPath )
    #clf.predict_proba(dr._testDataFrame)
    #xgb.save(gbm, xgbModelPath)
    #print xgbCv
    #print "xgb end"
    #gbm = joblib.load( xgbModelPath )
    #finalClf = gbm
    if doTestFlag == True:
        print finalClf.predict_proba(dr._testDataFrame)
    # featureImportance =[]
    # for i in range(0,len(finalClf.feature_importances_)):
    #     if i != len(dr._trainDataFrame.columns):
    #         if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1:
    #             featureImportance.append( [dr._trainDataFrame.columns[i] , finalClf.feature_importances_[i]] )
    #
    # print featureImportance
    # featureImportance.sort(lambda x, y: cmp(x[1], y[1]), reverse=True)
    # print featureImportance
    if doTestFlag == True:
        return finalClf.predict_proba(dr._testDataFrame)