Example #1
    def sumExist(self):
        dr = DataReader()
        dr.readInCSV(self._pathMain, self._mode)
        tmpColumnPrefix = self._typeName + "_"
        df = pd.read_csv(self._pathMain, header=0, sep=',')
        if self._mode == "train":
            processDf = dr._trainDataFrame
        else:
            processDf = dr._testDataFrame
            
        # create 1126 empty one-hot columns, one per feature id
        for i in range(1, 1127):
            tmpColName = tmpColumnPrefix + "one_hot_" + str(i)
            processDf[tmpColName] = 0
        
        # both frames are expected to be ordered by id, so the inner scan resumes
        # from the last matched row (tmpLastI2) instead of restarting at 0
        tmpLastI2 = 0
        for i1 in range(0, len(processDf[processDf.columns[0]])):

            tmpFlag = False
            for i2 in range(tmpLastI2, len(df[df.columns[0]] )):
                tmpMainId = processDf[processDf.columns[0]][i1]
                tmpId = df[df.columns[0]][i2]
                tmpVal= df[df.columns[1]][i2]
                #tmpVal2= df[df.columns[2]][i2]
                if tmpMainId == tmpId:
                    tmpFlag = True
                    # write via .loc to avoid pandas chained-assignment issues
                    processDf.loc[i1, processDf.columns[tmpVal + 394]] = 1
                if tmpFlag and tmpMainId != tmpId:
                    tmpLastI2 = i2
                    break
                #print i1, i2
        #outDf = pd.concat([dr._ansDataFrame, processDf], axis=1)
        outDf = processDf
        outDf.to_csv(self._outputPathName, sep=',', encoding='utf-8')  
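
    # A vectorized alternative to the nested index scan above: a hypothetical helper
    # (a sketch; it assumes the lookup frame's first column is the id, the second the
    # categorical value, and that pandas' get_dummies is acceptable here).
    def buildOneHotById(self, df):
        idCol, valCol = df.columns[0], df.columns[1]
        # expand the categorical value column into 0/1 indicator columns
        dummies = pd.get_dummies(df[valCol], prefix=self._typeName + "_one_hot")
        dummies[idCol] = df[idCol]
        # collapse to one row per id: 1 if that id had the value at least once
        return dummies.groupby(idCol).max().reset_index()

    # usage: processDf = processDf.merge(self.buildOneHotById(df), on=df.columns[0], how="left").fillna(0)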
Example #2
def oneHot():
    dr = DataReader()
    dr.readInCSV(_pathMain, _mode)
    tmpColumnPrefix = _typeName + "_"
    df = pd.read_csv(_eventTypePath, header=0, sep=',')
    if _mode == "train":
        processDf = dr._trainDataFrame
    else:
        processDf = dr._testDataFrame
        
    for i in range(1, 55):
        tmpColName = tmpColumnPrefix + "one_hot_" + str(i)
        processDf[tmpColName] = 0
    
    tmpLastI2 = 0
    for i1 in range(0, len(processDf[processDf.columns[0]])):

        tmpFlag = False
        for i2 in range(tmpLastI2, len(df[df.columns[0]])):
            tmpMainId = processDf[processDf.columns[0]][i1]
            tmpId = df[df.columns[0]][i2]
            tmpVal = df[df.columns[1]][i2]
            # tmpVal2= df[df.columns[2]][i2]
            if tmpMainId == tmpId:
                tmpFlag = True
                print tmpVal
                # write via .loc to avoid pandas chained-assignment issues
                processDf.loc[i1, processDf.columns[tmpVal + 394]] = 1
            if tmpFlag and tmpMainId != tmpId:
                tmpLastI2 = i2
                break
            print i1, i2
    # outDf = pd.concat([dr._ansDataFrame, processDf], axis=1)
    outDf = processDf
    outDf.to_csv(_outputPathName, sep=',', encoding='utf-8')  
Example #3
if __name__ == '__main__':
    
    
    # 1. read in data
    expNo = "008"
    expInfo = expNo + "_blender" 
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    
    doTestFlag = False
    path = _basePath + expNo + "_train_tobe.csv"
    testPath = _basePath + expNo + "_test_tobe.csv"
    outputPath = _basePath + expNo + "blender_train.csv"
    
    # 1. read data
    dr = DataReader()
    tmpDfList = []
    tmpPath = _basePath + "008_submission_1_train_Extra_Trees.csv"
    newX, newY =  dr.cvtPathListToDfList(tmpPath, "train")
    tmpDfList.append(newX)
    
    tmpPath = _basePath + "008_submission_1_train_K_NN.csv"
    newX, newY =  dr.cvtPathListToDfList(tmpPath, "train")
    tmpDfList.append(newX)
    
    tmpPath = _basePath + "008_submission_1_train_Random_Forest.csv"
    newX, newY =  dr.cvtPathListToDfList(tmpPath, "train")
    tmpDfList.append(newX)
    
    tmpPath = _basePath + "008_submission_1_train_Xgboost.csv"
    newX, newY =  dr.cvtPathListToDfList(tmpPath, "train")
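
    # The four read blocks above follow one pattern, so they could equally be a
    # loop over the model names (a sketch based on the file names shown above):
    tmpDfList = []
    for modelName in ["Extra_Trees", "K_NN", "Random_Forest", "Xgboost"]:
        tmpPath = _basePath + "008_submission_1_train_" + modelName + ".csv"
        newX, newY = dr.cvtPathListToDfList(tmpPath, "train")
        tmpDfList.append(newX)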
Example #4
from Telstra.Bartender.Blender import Blender

if __name__ == '__main__':

    # 1. read in data
    expNo = "013"
    expInfo = expNo + "_data_exploration"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep

    path = _basePath + expNo + "_train_tobe.csv"
    testPath = _basePath + expNo + "_test_asis.csv"

    testSortIdPath = Config.FolderBasePath + "test_sort_id.csv"
    trainSortIdPath = _basePath + "train_sort_id.csv"

    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame

    dr2 = DataReader()
    dr2.readInCSV(testPath, "test")
    #newX = dr2._testDataFrame

    dr3 = DataReader()
    dr3.readInCSV(testSortIdPath, "test")
    sortIdDf = dr3._testDataFrame

    dr4 = DataReader()
    dr4.readInCSV(trainSortIdPath, "test")
    # NOTE: this overwrites the test sort ids read into sortIdDf just above
    sortIdDf = dr4._testDataFrame
Example #5
import pandas as pd
from Telstra.Bartender.Blender import Blender
import random
import xgboost as xgb
import numpy as np

if __name__ == '__main__':

    # 1. read in data
    expNo = "021"
    expInfo = expNo + "_stacking"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep

    tmpPath = _basePath + "train.csv"
    dr = DataReader()
    dr.readInCSV(tmpPath, "train")
    X = dr._trainDataFrame
    Y = dr._ansDataFrame
    ori_X = X
    ori_Y = Y

    evalDataPercentage = 0.5
    # sample half of the row labels without replacement so the two folds are disjoint
    sampleRows = np.random.choice(X.index, int(len(X) * evalDataPercentage), replace=False)

    train_fold_1 = X.loc[sampleRows]
    train_fold_label_1 = Y.loc[sampleRows]
    train_fold_2 = X.drop(sampleRows)
    train_fold_label_2 = Y.drop(sampleRows)

    #     tmpOutPath = _basePath + expNo +"_" + "fold_1.csv"
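
    # The 50/50 fold split above can also be done with scikit-learn (a minimal
    # sketch, assuming a recent scikit-learn; the random_state is illustrative):
    from sklearn.model_selection import train_test_split
    train_fold_1, train_fold_2, train_fold_label_1, train_fold_label_2 = train_test_split(
        X, Y, test_size=evalDataPercentage, random_state=42)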
Example #6
        [0.24276169, 0.02004454, 0.00445434, 0.71714922, 0.0155902],
        [0.00310559, 0.53881988, 0.03416149, 0.4052795, 0.01863354],
        [0.13333333, 0.01333333, 0.01142857, 0.73142857, 0.11047619],
        [0.08222222, 0.00222222, 0.00222222, 0.00222222, 0.91111111]
    ]

    #     for tmpFeature in featureList:
    #         dr = DataReader()
    #         tmpPath = _basePath + "010_blender_" + tmpFeature + "_train.csv"
    #         newX, tmpY =  dr.cvtPathListToDfList(tmpPath, "train")
    #         tmpDf = pd.concat([tmpDf, newX], axis=1)
    #
    #     tmpDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')

    tmpI, tmpJ = 0, 0
    dr = DataReader()
    # only ansY (the training labels) is used below; baseDf is reset to an empty frame
    baseDf, ansY = dr.cvtPathListToDfList(
        _basePath + "010_blenderXgboost_train.csv", "train")

    tmpOutPath = _basePath + "010_train_last_blender.csv"
    tmpFeatureBlendedAns = pd.DataFrame()
    baseDf = pd.DataFrame()
    tmpDfList = []
    for tmpClfName in clfNameList:
        dr = DataReader()
        tmpPath = _basePath + "010_" + "blender" + tmpClfName + "_train.csv"
        newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train")
        tmpDfList.append(newX)

    b1 = Blender(clfNameList, tmpDfList, ansY)
    b1.autoFlow(2000, tmpOutPath)
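
    # For reference, a minimal sketch of what the blending step presumably does:
    # a weighted average of the per-model probability frames, scored by multi-class
    # log loss (the weights and helper name are illustrative, not the Blender API):
    from sklearn.metrics import log_loss

    def blendProbas(dfList, weights):
        # element-wise weighted average of the probability matrices
        total = sum(w * df.values for w, df in zip(weights, dfList))
        return total / float(sum(weights))

    blended = blendProbas(tmpDfList, [1.0] * len(tmpDfList))
    print("equal-weight blend log loss: " + str(log_loss(ansY, blended)))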
Example #7
        
#      4. test all data, output 3 ans as features
#     D:\Kaggle\Telstra\004_one_hot_resource_type\(Xgboost)_(2016-02-06_11_14_31).model
#     D:\Kaggle\Telstra\004_one_hot_resource_type\(Random_Forest)_(2016-02-06_11_24_09).model
#     D:\Kaggle\Telstra\004_one_hot_resource_type\(Extra_Trees)_(2016-02-06_11_30_52).model
#     D:\Kaggle\Telstra\004_one_hot_resource_type\(K_NN)_(2016-02-06_11_40_10).model

    modelFolder = _basePath + "models" + Config.osSep + "stacked" + Config.osSep
    
    clfNameList = []
    clfNameList.append("Extra_Trees")
    clfNameList.append("K_NN")
    clfNameList.append("Random_Forest")
    clfNameList.append("Xgboost")
    clfNameList.append("Logistic_Regression")
    
    testCsv = _basePath + "010_train_tobe.csv"
    dr = DataReader()
    newX, testY = dr.cvtPathListToDfList(testCsv, "train")
    
    for curModel in clfNameList:
        modelPath = modelFolder + str(getMatchNameModelPath(modelFolder, curModel))
        tmpOutPath = _basePath + expNo + "_blender" + curModel + "_train.csv"
        tmpClf = loadModel(modelPath)
        log(tmpClf.predict_proba(newX))
        #outDf = pd.concat([newX, pd.DataFrame(tmpClf.predict_proba(newX))], axis=1)
        outDf = pd.DataFrame(tmpClf.predict_proba(newX))
        outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
        #musicAlarm()

    
Example #8
    #D:\Kaggle\Telstra\004_one_hot_resource_type\(K_NN)_(2016-02-06_11_40_10).model
    #Logistic_Regression
    modelList = [
        "Xgboost", "Random_Forest", "Extra_Trees", "K_NN",
        "Logistic_Regression"
    ]
    featureList = [
        "event_type", "log_feature", "resource_type", "severity_type"
    ]

    for tmpFeature in featureList:
        for tmpModel in modelList:
            subFolder = tmpFeature
            curModel = tmpModel

            tmpCsvPath = _basePath + expNo + "_" + tmpFeature + "_test_tobe.csv"
            dr = DataReader()
            dr.readInCSV(tmpCsvPath, "train")
            newX = dr._trainDataFrame
            modelFolder = _basePath + "models" + Config.osSep + subFolder + Config.osSep
            modelPath = modelFolder + str(
                getMatchNameModelPath(modelFolder, curModel))
            tmpOutPath = _basePath + "010_" + curModel + "_stack_" + subFolder + "_test.csv"
            tmpClf = loadModel(modelPath)
            log(tmpClf.predict_proba(newX))
            # outDf = pd.concat([newX, pd.DataFrame(tmpClf.predict_proba(newX))], axis=1)
            outDf = pd.DataFrame(tmpClf.predict_proba(newX))
            outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
    #musicAlarm()
#     log("004 Done")
Example #9
        [0.13333333, 0.01333333, 0.01142857, 0.73142857, 0.11047619],
        [0.08222222, 0.00222222, 0.00222222, 0.00222222, 0.91111111]
    ]

    tmpOutPath = _basePath + "010_test_tobe.csv"

    #     for tmpFeature in featureList:
    #         dr = DataReader()
    #         tmpPath = _basePath + "010_blender_" + tmpFeature + "_train.csv"
    #         newX, tmpY =  dr.cvtPathListToDfList(tmpPath, "train")
    #         tmpDf = pd.concat([tmpDf, newX], axis=1)
    #
    #     tmpDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')

    tmpI, tmpJ = 0, 0
    dr = DataReader()
    baseDf, tmpY = dr.cvtPathListToDfList(_basePath + "010_test_asis.csv",
                                          "test")
    for tmpFeature in featureList:
        outputPath = _basePath + expNo + "_blender_" + tmpFeature + "_test.csv"
        #ansPath = _basePath + "010_Extra_Trees_stack_event_type.csv"
        #dr = DataReader()
        #tmpX, ansY = dr.cvtPathListToDfList(ansPath, "train")
        #tmpDfList = []

        tmpFeatureBlendedAns = pd.DataFrame()
        for tmpClfName in clfNameList:
            dr = DataReader()
            tmpPath = _basePath + "010_" + tmpClfName + "_stack_" + tmpFeature + "_test.csv"
            newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train")
            tmpWight = tmpWeightList[tmpI][tmpJ]
Example #10
from Telstra.util.ModelUtils import deleteModelFiles
import pandas as pd
from Telstra.Bartender.Blender import Blender


if __name__ == "__main__":

    # 1. read in data
    expNo = "012"
    expInfo = expNo + "_rf_chk_important"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep

    path = _basePath + expNo + "_train_asis.csv"
    testPath = _basePath + expNo + "_test_asis.csv"

    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame

    # Get all best model from newX
    fab = ModelFactory()
    fab._gridSearchFlag = True
    #     fab._subFolderName = "stacked"
    fab._n_iter_search = 250
    fab._expInfo = expInfo
    #     fab.getAllModels(newX, newY)
    finalClf = fab.getRandomForestClf(newX, newY)

    featureImportance = []
    for i in range(0, len(finalClf.feature_importances_)):
        if i != len(dr._trainDataFrame.columns):
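
    # A sketch of the feature-importance ranking this loop presumably builds,
    # following the commented-out version in the exp() example further below:
    #
    #     featureImportance = sorted(
    #         zip(dr._trainDataFrame.columns, finalClf.feature_importances_),
    #         key=lambda pair: pair[1], reverse=True)
    #     print featureImportance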
Example #11
#     tmpPath = _basePath + "test_2.csv"
#     dr = DataReader()
#     dr.readInCSV(tmpPath, "test")
#     newX = dr._testDataFrame
#     newY = dr._ansDataFrame
#     newX  = xgb.DMatrix(newX)
#     #print clf.predict(newX)
#     tmpOutPath = _basePath + expNo +"_" + "Xgboost" + "_testXgboost7_ans.csv"
#     log(clf.predict(newX))
#     outDf = pd.DataFrame(clf.predict(newX))
#     outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
#     musicAlarm()
    
    # load the xgboost model trained in experiment 020; prediction below uses the native DMatrix API
    clf = joblib.load("F:\\xgboost_tmp_best_020.model")
    tmpPath = _basePath + "test_merge_one_hot" + ".csv"
    dr = DataReader()
    dr.readInCSV(tmpPath, "test")
    newX = dr._testDataFrame
    newX  = xgb.DMatrix(newX)
    tmpOutPath = _basePath + expNo +"_" + "Xgboost_" + "groupby_sum"+ "_ans_" + "2" + ".csv"
    log(clf.predict(newX))
    outDf = pd.DataFrame(clf.predict(newX))
    outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
    musicAlarm()
    
    
        
#     sampleRows = np.random.choice(X.index, len(X)*evalDataPercentage) 
#     
#     print  X.ix[sampleRows]
#     exit()
Example #14
    #     outputPath = _basePath + "blendTestOut.csv"
    #     dr.readInCSV(path, "train")
    #     newX, newY = dr._trainDataFrame, dr._ansDataFrame
    #
    #
    #     dr2 = DataReader()
    #     dr2.readInCSV(path, "train")
    #     newX2, newY2 = dr2._trainDataFrame, dr2._ansDataFrame
    #
    #     predictDfList = []
    #     predictDfList.append(newX)
    #     predictDfList.append(newX2)
    clfNameList = []
    clfNameList.append("test")
    clfNameList.append("test2")
    #
    #     b1 = Blender(clfNameList, predictDfList, newY)
    #     inputWeightList = b1.getRandomWeightList(2)
    #     #print inputWeightList
    #     tmpDf = b1.doBlending(inputWeightList)
    #     #b1.calLogLoss(tmpDf)
    #     b1.autoFlow(11, outputPath)

    dr3 = DataReader()
    path = _basePath + "testLogLoss.csv"
    dr3.readInCSV(path, "train")
    predictDfList = []
    predictDfList.append(dr3._trainDataFrame)
    b2 = Blender(clfNameList, predictDfList, dr3._ansDataFrame)
    print b2.calLogLoss(dr3._trainDataFrame)
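
    # The reported value can be cross-checked against scikit-learn's implementation
    # (a sketch, assuming _ansDataFrame holds the class labels and _trainDataFrame
    # the per-class probabilities):
    from sklearn.metrics import log_loss
    print log_loss(dr3._ansDataFrame, dr3._trainDataFrame)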
Example #16
    
    # 1. read in data
    expNo = "020"
    expInfo = expNo + "_groupby_sum"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep

    featureList = ["location", "event_type", "resource_type", "severity_type", "log_feature"]
    ans1List = []
    ans2List = []
#     ansPath = _basePath + "014_ans_array.csv"
#     drAns = DataReader()
#     drAns.readInCSV(ansPath, "train")
#     newY = drAns._ansDataFrame

    tmpPath = _basePath + "train_merge_one_hot.csv"
    dr = DataReader()
    dr.readInCSV(tmpPath, "train")
    newX = dr._trainDataFrame
    newY = dr._ansDataFrame


    fab = ModelFactory()
    #fab._setXgboostTheradToOne = True
    fab._gridSearchFlag = True
    fab._singleModelMail = True
    fab._subFolderName = "groupby_sum"  
    fab._n_iter_search = 1
    fab._expInfo = expInfo
    clf = fab.getXgboostClf(newX, newY)
#     
    tmpPath = _basePath + "test_merge_one_hot" + ".csv"
Example #18
import pandas as pd
from Telstra.Bartender.Blender import Blender


if __name__ == '__main__':
    
    
    # 1. read in data
    expNo = "014"
    expInfo = expNo + "_one_hot_each_features"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep

    featureList = ["location", "event_type", "resource_type", "severity_type", "log_feature"]
    
    ansPath = _basePath + "014_ans_array.csv"
    drAns = DataReader()
    drAns.readInCSV(ansPath, "train")
    newY = drAns._ansDataFrame
    
    
       
    # enumerate all 31 non-empty feature subsets: flagList is the 5-bit binary
    # representation of i, MSB first (presumably one flag per entry of featureList)
    for i in range(1, 32):
        log("start " + str(i) + "/32 ...")
        tmpCurFeatureList = []

        flagList = []
        for i2 in range(0, 7 - len(bin(i))):
            flagList.append(0)
        for i2 in range(2, len(bin(i))):
            flagList.append(int(bin(i)[i2]))
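
        # The same 5-bit mask can be built more directly; a sketch (the mapping of
        # flags onto featureList entries is an assumption based on the names above):
        #
        #     flagList = [int(b) for b in format(i, "05b")]   # e.g. i = 5 -> [0, 0, 1, 0, 1]
        #     tmpCurFeatureList = [f for f, flag in zip(featureList, flagList) if flag == 1]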
        
Example #19
from Telstra.util.ModelUtils import getMatchNameModelPath
from Telstra.util.ModelUtils import deleteModelFiles
import pandas as pd
from Telstra.Bartender.Blender import Blender

if __name__ == '__main__':

    # 1. read in data
    expNo = "013"
    expInfo = expNo + "_data_exploration"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep

    path = _basePath + expNo + "_train_tobe.csv"
    testPath = _basePath + expNo + "_test_asis.csv"

    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame

    # Get all best model from newX
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._subFolderName = "binary"
    fab._n_iter_search = 50
    fab._expInfo = expInfo
    fab.getAllModels(newX, newY)

    musicAlarm()
    # Test all data
    modelList = [
        "Xgboost", "Random_Forest", "Extra_Trees", "K_NN",
Example #20
import pandas as pd

if __name__ == '__main__':
    
    # 1. read in data
    expNo = "003"
    expInfo = expNo + "_one_hot_event_type" 
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    
    
    doTestFlag = False
    path = _basePath + expNo + "_train_tobe.csv"
    testPath = _basePath + expNo + "_test_tobe.csv"
   
    # 1. read data
    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame
    if doTestFlag == True:
        dr.readInCSV(testPath, "test")
        newX = dr._testDataFrame
        #newX = pd.DataFrame(newX[newX.columns[0]])
        #print newX
 
    
    # 3. get all best model from newX
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._n_iter_search = 30
    fab._expInfo = expInfo
    fab.getAllModels(newX, newY)
Example #21
def exp():
    expInfo = "location_only\\"
    _basePath = Config.FolderBasePath + expInfo

    doTestFlag = False
    path = _basePath + "train.csv"
    testPath = _basePath + "test10.csv"

    # 1. read data
    dr = DataReader()
    dr.readInCSV(path, "train")
    if doTestFlag == True:
        dr.readInCSV(testPath, "test")

    # 2. run models
    #print dr._trainDataFrame.as_matrix
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._n_iter_search = 10
    fab._expInfo = "location_only"

    X = dr._trainDataFrame
    Y = dr._ansDataFrame
    #fab.getRandomForestClf(X, Y)
    #fab.getAllModels(dr._trainDataFrame, dr._ansDataFrame)

    #     log( "xgb start")
    #     param = {'max_depth':10,  'n_estimators':300 , 'num_class':3, 'learning_rate':0.05, 'objective':'multi:softprob'}
    #     num_round = 5
    #gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob').fit(dr._trainDataFrame,  dr._ansDataFrame)
    #testResult = gbm.predict_proba(dr._testDataFrame)
    #print testResult
    #     gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob')

    #     scores = cross_val_score(rfClf, dr._trainDataFrame,  dr._ansDataFrame, n_jobs = -1)
    #     log( "xgboost Validation Precision: ", scores.mean() )
    #xgbCv = xgb.cv(param, xgb.DMatrix(dr._trainDataFrame, dr._ansDataFrame),  num_round, nfold=5,metrics={'error'}, seed = 0)
    #gbTrain = gbm.fit(dr._trainDataFrame,  dr._ansDataFrame)
    #joblib.dump(gbTrain, xgbModelPath)
    #clf = joblib.load( xgbModelPath )
    #clf.predict_proba(dr._testDataFrame)
    #xgb.save(gbm, xgbModelPath)
    #print xgbCv
    #print "xgb end"

    #gbm = joblib.load( xgbModelPath )
    #finalClf = gbm

    if doTestFlag == True:
        # NOTE: finalClf is only defined when one of the commented-out model blocks above is enabled
        print finalClf.predict_proba(dr._testDataFrame)


#     featureImportance =[]
#     for i in range(0,len(finalClf.feature_importances_)):
#         if i !=  len(dr._trainDataFrame.columns):
#             if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1:
#                 featureImportance.append(  [dr._trainDataFrame.columns[i] , finalClf.feature_importances_[i]] )
#
#     print featureImportance
#     featureImportance.sort(lambda x, y: cmp(x[1], y[1]), reverse=True)
#     print featureImportance

    if doTestFlag == True:
        return finalClf.predict_proba(dr._testDataFrame)
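

# The commented-out xgboost experiment inside exp(), gathered into a runnable sketch
# (hyper-parameters are the ones shown in the comments above; the cross-validation
# scoring choice and the helper name are assumptions):
def xgbCrossValSketch(X, Y):
    import xgboost as xgb
    from sklearn.model_selection import cross_val_score
    gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05,
                            objective='multi:softprob')
    # 5-fold cross-validated multi-class log loss (sklearn reports it negated)
    scores = cross_val_score(gbm, X, Y, scoring='neg_log_loss', cv=5, n_jobs=-1)
    print("xgboost CV log loss: " + str(-scores.mean()))

# usage (inside exp()): xgbCrossValSketch(X, Y)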
Example #24
import pandas as pd
from Telstra.Bartender.Blender import Blender


if __name__ == '__main__':
    
    
    # 1. read in data
    expNo = "011"
    expInfo = expNo + "_remove_one_hot"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    
    path = _basePath + expNo + "_train_tobe.csv"
    testPath = _basePath + expNo + "_test_tobe.csv"
    
    dr = DataReader()
    dr.readInCSV( path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame
    
    # Get all best model from newX
    fab = ModelFactory()
    fab._gridSearchFlag = True
#     fab._subFolderName = "stacked"
    fab._n_iter_search = 100
    fab._expInfo = expInfo
    fab.getAllModels(newX, newY)
    
    # Test all data
    modelList = ["Xgboost","Random_Forest","Extra_Trees", "K_NN", "Logistic_Regression"]
#     featureList = ["event_type", "log_feature", "resource_type", "severity_type"]
    
Example #25
import os

_basePath = ""
if os.name == 'nt':
    _basePath = "D:\\Kaggle\\Telstra\\"
else:
    _basePath = "/Users/whmou/Kaggle/Telstra/"
testPath = _basePath + "test6.csv"    # take id list only
testPath2 = _basePath + "test11.csv"   

samplePath = _basePath + "sample_submission.csv" 
outputPath = _basePath+"temp_submission4.csv"

if __name__ == '__main__':
    print "start to make submission version:", outputPath
    dr = DataReader()
    dr.readInCSV(testPath, "test")
    idList = dr._testDataFrame[dr._testDataFrame.columns[0]]

    dr2 = DataReader()
    dr2.readInCSV(testPath2, "test")

    dr3 = DataReader()
    dr3.readInCSV(samplePath, "test")
    sampleIdList = dr3._testDataFrame[dr3._testDataFrame.columns[0]]
    
    tmp = pd.DataFrame(exp())
    ansArr = pd.concat([idList, tmp], axis=1)
    print ansArr
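
    # A sketch of writing the frame above out as the submission file (the column
    # names are assumptions based on the id column plus three predicted class
    # probabilities returned by exp()):
    ansArr.columns = ["id", "predict_0", "predict_1", "predict_2"]
    ansArr.to_csv(outputPath, index=False)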
Example #26
from Telstra.util.CustomLogger import musicAlarm
from Telstra.util.ModelUtils import loadModel
import pandas as pd

if __name__ == '__main__':

    # 1. read in data
    expInfo = "001_location_only" + Config.osSep
    _basePath = Config.FolderBasePath + expInfo

    doTestFlag = True
    path = _basePath + "001_train_tobe.csv"
    testPath = _basePath + "001_test_tobe.csv"

    # 1. read data
    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame
    if doTestFlag == True:
        dr.readInCSV(testPath, "test")
        newX = dr._testDataFrame
        #newX = pd.DataFrame(newX[newX.columns[0]])
        print newX
    # 2. stratify 60 % data and train location only
#     newX, newY = stratifyData(dr._trainDataFrame, dr._ansDataFrame, 0.4)

# 3. get all best model from newX
#     fab = ModelFactory()
#     fab._gridSearchFlag = True
#     fab._n_iter_search = 500
#     fab._expInfo = "001_location_only"