Exemplo n.º 1
0
        
#      4. test all data, output 3 ans as features
#     D:\Kaggle\Telstra\004_one_hot_resource_type\(Xgboost)_(2016-02-06_11_14_31).model
#     D:\Kaggle\Telstra\004_one_hot_resource_type\(Random_Forest)_(2016-02-06_11_24_09).model
#     D:\Kaggle\Telstra\004_one_hot_resource_type\(Extra_Trees)_(2016-02-06_11_30_52).model
#     D:\Kaggle\Telstra\004_one_hot_resource_type\(K_NN)_(2016-02-06_11_40_10).model

    # Saved models are looked up in the "stacked" sub-folder of <_basePath>/models.
    modelFolder = _basePath + "models" + Config.osSep + "stacked" + Config.osSep

    # Classifier names: each one selects a saved model file and is embedded in
    # the name of the CSV written for that model below.
    clfNameList = [
        "Extra_Trees",
        "K_NN",
        "Random_Forest",
        "Xgboost",
        "Logistic_Regression",
    ]

    # Read the input once; every model in the loop is applied to the same newX.
    testCsv = _basePath + "010_train_tobe.csv"
    dr = DataReader()
    newX, testY = dr.cvtPathListToDfList(testCsv, "train")

    for curModel in clfNameList:
        modelPath = modelFolder + str(getMatchNameModelPath(modelFolder, curModel))
        tmpOutPath = _basePath + expNo + "_blender" + curModel + "_train.csv"
        tmpClf = loadModel(modelPath)

        # Predict once and reuse the result for both the log line and the CSV;
        # previously predict_proba was evaluated twice per model.
        tmpProba = tmpClf.predict_proba(newX)
        log(tmpProba)
        outDf = pd.DataFrame(tmpProba)
        outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
        #musicAlarm()

    
Exemplo n.º 2
0
    # 1. experiment identifiers and the folder every path below is built from
    expNo = "008"
    expInfo = expNo + "_blender"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep

    doTestFlag = False
    tmpPrefix = _basePath + expNo  # shared "<folder><expNo>" prefix of the three paths
    path = tmpPrefix + "_train_tobe.csv"
    testPath = tmpPrefix + "_test_tobe.csv"
    outputPath = tmpPrefix + "blender_train.csv"

    # 2. read the per-classifier training CSVs in this fixed order and keep the
    #    first element returned for each file; after the loop tmpPath, newX and
    #    newY hold the values from the last (Xgboost) file
    dr = DataReader()
    tmpDfList = []
    for tmpFileName in (
        "008_submission_1_train_Extra_Trees.csv",
        "008_submission_1_train_K_NN.csv",
        "008_submission_1_train_Random_Forest.csv",
        "008_submission_1_train_Xgboost.csv",
    ):
        tmpPath = _basePath + tmpFileName
        newX, newY = dr.cvtPathListToDfList(tmpPath, "train")
        tmpDfList.append(newX)

    clfNameList = []
Exemplo n.º 3
0
        [0.00310559, 0.53881988, 0.03416149, 0.4052795, 0.01863354],
        [0.13333333, 0.01333333, 0.01142857, 0.73142857, 0.11047619],
        [0.08222222, 0.00222222, 0.00222222, 0.00222222, 0.91111111],
    ]

    #     for tmpFeature in featureList:
    #         dr = DataReader()
    #         tmpPath = _basePath + "010_blender_" + tmpFeature + "_train.csv"
    #         newX, tmpY =  dr.cvtPathListToDfList(tmpPath, "train")
    #         tmpDf = pd.concat([tmpDf, newX], axis=1)
    #
    #     tmpDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')

    # Both counters start at zero; they are not read again in the lines
    # visible here.
    tmpI, tmpJ = 0, 0
    dr = DataReader()
    # Only ansY from this read is used below (it is passed to Blender);
    # baseDf is replaced by an empty DataFrame a few lines further down.
    baseDf, ansY = dr.cvtPathListToDfList(_basePath + "010_blenderXgboost_train.csv", "train")

    tmpOutPath = _basePath + "010_train_last_blender.csv"
    tmpFeatureBlendedAns = pd.DataFrame()
    baseDf = pd.DataFrame()
    # Read one "010_blender<name>_train.csv" per entry of clfNameList and
    # collect the first element of each result, in clfNameList order.
    # newX / tmpY keep the values from the last file after the loop.
    tmpDfList = []
    for tmpClfName in clfNameList:
        dr = DataReader()
        tmpPath = _basePath + "010_" + "blender" + tmpClfName + "_train.csv"
        newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train")
        tmpDfList.append(newX)

    # Hand the names, the collected results and ansY to Blender, then call
    # autoFlow with 2000 and the output path.
    # NOTE(review): what 2000 controls is defined by Blender.autoFlow, which
    # is not shown here — confirm.
    b1 = Blender(clfNameList, tmpDfList, ansY)
    b1.autoFlow(2000, tmpOutPath)

    # test
Exemplo n.º 4
0
        # bin(i) is a string of the form "0b101"; positions 2.. are the binary
        # digits of i, each appended to flagList as an int (most significant first).
        # NOTE(review): bin(i) is not zero-padded, so fewer than five digits are
        # appended when i < 16. Unless flagList is pre-filled before this point
        # (not visible here), flagList[j] below can raise IndexError — confirm.
        for i2 in range(2,len(bin(i))):
            flagList.append(int(bin(i)[i2]))
        
        # Select featureList[j] for every one of the first five flags that is 1.
        for j in range(0,5):
            if flagList[j] ==1:
                tmpCurFeatureList.append(featureList[j])
        
        log(tmpCurFeatureList)        
        
        
        # Concatenate, column-wise (axis=1), what is read for each selected feature.
        newX = pd.DataFrame()
        
        for tmpFeature in tmpCurFeatureList:
            path = _basePath + tmpFeature + "_train.csv"
            dr = DataReader()
            # NOTE(review): a "*_train.csv" file is read in "test" mode and the
            # result is used as a single value; other call sites unpack two
            # values — confirm what the reader returns in this mode.
            tmpX = dr.cvtPathListToDfList(path, "test")
            newX = pd.concat([newX, tmpX], axis=1)
        #log("feature len: " , len(newX))
            
        # Get all best model from newX
        fab = ModelFactory()
        # NOTE(review): the attribute name is spelled "Therad"; confirm it
        # matches the name ModelFactory actually reads.
        fab._setXgboostTheradToOne = True
        fab._gridSearchFlag = True
        fab._onlyTreeBasedModels = True
        # One sub-folder name per value of i.
        fab._subFolderName = "one_hot_each_" + str(i)
        fab._n_iter_search = 30
        fab._expInfo = expInfo
#         fab.getAllModels(newX, newY)
        # newY is not assigned in the lines visible here; it must come from
        # earlier code.
        fab.getRandomForestClf(newX, newY)
#         fab.getXgboostClf(newX, newY)
        log ( i , "/32 done..." )
Exemplo n.º 5
0
        [0.08222222, 0.00222222, 0.00222222, 0.00222222, 0.91111111]
    ]

    tmpOutPath = _basePath + "010_test_tobe.csv"

    #     for tmpFeature in featureList:
    #         dr = DataReader()
    #         tmpPath = _basePath + "010_blender_" + tmpFeature + "_train.csv"
    #         newX, tmpY =  dr.cvtPathListToDfList(tmpPath, "train")
    #         tmpDf = pd.concat([tmpDf, newX], axis=1)
    #
    #     tmpDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')

    # Indices into tmpWeightList (row tmpI, column tmpJ); both start at zero.
    # Any code that advances them lies beyond the lines visible here.
    tmpI, tmpJ = 0, 0
    dr = DataReader()
    baseDf, tmpY = dr.cvtPathListToDfList(_basePath + "010_test_asis.csv",
                                          "test")
    # For every feature, one output path is prepared and the per-classifier
    # files for that feature are read and scaled.
    for tmpFeature in featureList:
        outputPath = _basePath + expNo + "_blender_" + tmpFeature + "_test.csv"
        #ansPath = _basePath + "010_Extra_Trees_stack_event_type.csv"
        #dr = DataReader()
        #tmpX, ansY = dr.cvtPathListToDfList(ansPath, "train")
        #tmpDfList = []

        # Starts empty for each feature; its use lies beyond the visible lines.
        tmpFeatureBlendedAns = pd.DataFrame()
        for tmpClfName in clfNameList:
            dr = DataReader()
            tmpPath = _basePath + "010_" + tmpClfName + "_stack_" + tmpFeature + "_test.csv"
            # NOTE(review): a "*_test.csv" file is read in "train" mode here —
            # confirm this is intended.
            newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train")
            # Weight taken from tmpWeightList at the current (tmpI, tmpJ).
            tmpWight = tmpWeightList[tmpI][tmpJ]

            # Scale what was read for this classifier by that weight.
            newX = newX.multiply(tmpWight)
Exemplo n.º 6
0
 
 # 1. experiment identifiers and the folder every path below is built from
 expNo = "008"
 expInfo = expNo + "_blender"
 _basePath = Config.FolderBasePath + expInfo + Config.osSep

 doTestFlag = False
 tmpPrefix = _basePath + expNo  # shared "<folder><expNo>" prefix of the three paths
 path = tmpPrefix + "_train_tobe.csv"
 testPath = tmpPrefix + "_test_tobe.csv"
 outputPath = tmpPrefix + "blender_train.csv"

 # 2. read the per-classifier training CSVs in this fixed order and keep the
 #    first element returned for each file; after the loop tmpPath, newX and
 #    newY hold the values from the last (Xgboost) file
 dr = DataReader()
 tmpDfList = []
 for tmpFileName in (
     "008_submission_1_train_Extra_Trees.csv",
     "008_submission_1_train_K_NN.csv",
     "008_submission_1_train_Random_Forest.csv",
     "008_submission_1_train_Xgboost.csv",
 ):
     tmpPath = _basePath + tmpFileName
     newX, newY = dr.cvtPathListToDfList(tmpPath, "train")
     tmpDfList.append(newX)

 clfNameList = []
Exemplo n.º 7
0
        [0.00310559, 0.53881988, 0.03416149, 0.4052795, 0.01863354],
        [0.13333333, 0.01333333, 0.01142857, 0.73142857, 0.11047619],
        [0.08222222, 0.00222222, 0.00222222, 0.00222222, 0.91111111]
    ]

    #     for tmpFeature in featureList:
    #         dr = DataReader()
    #         tmpPath = _basePath + "010_blender_" + tmpFeature + "_train.csv"
    #         newX, tmpY =  dr.cvtPathListToDfList(tmpPath, "train")
    #         tmpDf = pd.concat([tmpDf, newX], axis=1)
    #
    #     tmpDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')

    # Both counters start at zero; they are not read again in the lines
    # visible here.
    tmpI, tmpJ = 0, 0
    dr = DataReader()
    # Only ansY from this read is used below (it is passed to Blender);
    # baseDf is replaced by an empty DataFrame a few lines further down.
    baseDf, ansY = dr.cvtPathListToDfList(
        _basePath + "010_blenderXgboost_train.csv", "train")

    tmpOutPath = _basePath + "010_train_last_blender.csv"
    tmpFeatureBlendedAns = pd.DataFrame()
    baseDf = pd.DataFrame()
    # Read one "010_blender<name>_train.csv" per entry of clfNameList and
    # collect the first element of each result, in clfNameList order.
    # newX keeps the value from the last file and is used again after the loop.
    tmpDfList = []
    for tmpClfName in clfNameList:
        dr = DataReader()
        tmpPath = _basePath + "010_" + "blender" + tmpClfName + "_train.csv"
        newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train")
        tmpDfList.append(newX)

    # Hand the names, the collected results and ansY to Blender, then call
    # autoFlow with 2000 and the output path.
    # NOTE(review): what 2000 controls is defined by Blender.autoFlow, which
    # is not shown here — confirm.
    b1 = Blender(clfNameList, tmpDfList, ansY)
    b1.autoFlow(2000, tmpOutPath)

    # test
    # Configure a ModelFactory and run getAllModels on newX / newY.
    # newX is whatever the preceding loop read last; newY is not assigned in
    # the lines visible here.
    fab = ModelFactory()
    fab._gridSearchFlag = True
    # fab._subFolderName = "stacked"
    fab._n_iter_search = 100
    fab._expInfo = expInfo
    fab.getAllModels(newX, newY)

    # Test all data: for each model name, load the matching saved model and
    # write its predict_proba output for the test file to its own CSV.
    modelList = ["Xgboost", "Random_Forest", "Extra_Trees", "K_NN", "Logistic_Regression"]
    # featureList = ["event_type", "log_feature", "resource_type", "severity_type"]

    # for tmpFeature in featureList:
    modelFolder = _basePath + "models" + Config.osSep
    for tmpModel in modelList:
        curModel = tmpModel

        # NOTE(review): the same testPath is re-read for every model, and the
        # "test"-mode result is used as a single value here while other call
        # sites unpack two values — confirm the reader's return shape.
        dr = DataReader()
        newX = dr.cvtPathListToDfList(testPath, "test")

        modelPath = modelFolder + str(getMatchNameModelPath(modelFolder, curModel))
        tmpOutPath = _basePath + "011_" + curModel + "_test_ans.csv"
        tmpClf = loadModel(modelPath)

        # Predict once and reuse the result. Previously predict_proba ran three
        # times per model, and a concat of newX with the probabilities was
        # built only to be overwritten on the next line.
        tmpProba = tmpClf.predict_proba(newX)
        log(tmpProba)
        outDf = pd.DataFrame(tmpProba)
        outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
        
    
    #musicAlarm()
#     log("004 Done")
Exemplo n.º 9
0
        [0.08222222, 0.00222222, 0.00222222, 0.00222222, 0.91111111],
    ]

    tmpOutPath = _basePath + "010_test_tobe.csv"

    #     for tmpFeature in featureList:
    #         dr = DataReader()
    #         tmpPath = _basePath + "010_blender_" + tmpFeature + "_train.csv"
    #         newX, tmpY =  dr.cvtPathListToDfList(tmpPath, "train")
    #         tmpDf = pd.concat([tmpDf, newX], axis=1)
    #
    #     tmpDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')

    # Indices into tmpWeightList (row tmpI, column tmpJ); both start at zero.
    # Any code that advances them lies beyond the lines visible here.
    tmpI, tmpJ = 0, 0
    dr = DataReader()
    baseDf, tmpY = dr.cvtPathListToDfList(_basePath + "010_test_asis.csv", "test")
    # For every feature, one output path is prepared and the per-classifier
    # files for that feature are read and scaled.
    for tmpFeature in featureList:
        outputPath = _basePath + expNo + "_blender_" + tmpFeature + "_test.csv"
        # ansPath = _basePath + "010_Extra_Trees_stack_event_type.csv"
        # dr = DataReader()
        # tmpX, ansY = dr.cvtPathListToDfList(ansPath, "train")
        # tmpDfList = []

        # Starts empty for each feature; its use lies beyond the visible lines.
        tmpFeatureBlendedAns = pd.DataFrame()
        for tmpClfName in clfNameList:
            dr = DataReader()
            tmpPath = _basePath + "010_" + tmpClfName + "_stack_" + tmpFeature + "_test.csv"
            # NOTE(review): a "*_test.csv" file is read in "train" mode here —
            # confirm this is intended.
            newX, tmpY = dr.cvtPathListToDfList(tmpPath, "train")
            # Weight taken from tmpWeightList at the current (tmpI, tmpJ).
            tmpWight = tmpWeightList[tmpI][tmpJ]

            # Scale what was read for this classifier by that weight.
            newX = newX.multiply(tmpWight)
Exemplo n.º 10
0
    #      4. test all data, output 3 ans as features
    #     D:\Kaggle\Telstra\004_one_hot_resource_type\(Xgboost)_(2016-02-06_11_14_31).model
    #     D:\Kaggle\Telstra\004_one_hot_resource_type\(Random_Forest)_(2016-02-06_11_24_09).model
    #     D:\Kaggle\Telstra\004_one_hot_resource_type\(Extra_Trees)_(2016-02-06_11_30_52).model
    #     D:\Kaggle\Telstra\004_one_hot_resource_type\(K_NN)_(2016-02-06_11_40_10).model

    # Saved models are looked up in the "stacked" sub-folder of <_basePath>/models.
    modelFolder = _basePath + "models" + Config.osSep + "stacked" + Config.osSep

    # Classifier names: each one selects a saved model file and is embedded in
    # the name of the CSV written for that model below.
    clfNameList = [
        "Extra_Trees",
        "K_NN",
        "Random_Forest",
        "Xgboost",
        "Logistic_Regression",
    ]

    # Read the input once; every model in the loop is applied to the same newX.
    testCsv = _basePath + "010_train_tobe.csv"
    dr = DataReader()
    newX, testY = dr.cvtPathListToDfList(testCsv, "train")

    for curModel in clfNameList:
        modelPath = modelFolder + str(
            getMatchNameModelPath(modelFolder, curModel))
        tmpOutPath = _basePath + expNo + "_blender" + curModel + "_train.csv"
        tmpClf = loadModel(modelPath)

        # Predict once and reuse the result for both the log line and the CSV;
        # previously predict_proba was evaluated twice per model.
        tmpProba = tmpClf.predict_proba(newX)
        log(tmpProba)
        outDf = pd.DataFrame(tmpProba)
        outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
        #musicAlarm()