Example #1
    def autoFlow(self, numIter, outputPath):
        log("Start blending autoFlow, num of Iter: ", numIter)
        start = time.time()
        distinctModels = len(self._clfNameList)
        tmpResultList = []
        tmpRandomWeightList = []
        tmpBlendedDfList = []
        for i in range(0, numIter):
            tmpWeightList = self.getRandomWeightList(distinctModels)
            tmpRandomWeightList.append(tmpWeightList)
            tmpDf = self.doBlending(tmpWeightList)
            tmpBlendedDfList.append(tmpDf)
            tmpResultList.append(self.calLogLoss(tmpDf))

        idList = np.array(tmpResultList).argsort()[:3]
        firstFlag = True
        finalDf = []
        logResult = []
        for id in idList:
            if firstFlag == True:
                finalDf = tmpBlendedDfList[id]
                self._bestParamList = tmpRandomWeightList[id]
                firstFlag = False
            log("logloss: ", tmpResultList[id], "blender param: ", tmpRandomWeightList[id])
            logResult.append((tmpResultList[id], tmpRandomWeightList[id]))
        mail("Blender Top3: ", logResult, self._clfNameList)
        log("clfNameList = ", self._clfNameList)
        log("low prob. id list (in 1st): #", len(self._lowProbIdList), ", ", self._lowProbIdList)
        log("End blending autoFlow, num of Iter: ", numIter, " cost: ", time.time() - start, " sec")

        finalDf.to_csv(outputPath, sep=',', encoding='utf-8')
Example #2
    def getKnnClf(self, X, Y):
        clfName = "K_NN"
        
        ## http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
        clf = KNeighborsClassifier(
            n_neighbors=5,
            weights='uniform',
            algorithm='auto',
            leaf_size=30,
            p=2,
            metric='minkowski',
            metric_params=None,
        )
        
        if self._gridSearchFlag == True:
            log(clfName + " start searching param...")
            
            param_dist = {
                "n_neighbors": sp_randint(4, 8),
                "weights": ['uniform', 'uniform'],
                "leaf_size": sp_randint(30, 60),
                "algorithm": ['auto', 'auto'],
            }
            
            clf = self.doRandomSearch(clfName, clf, param_dist, X, Y)
            
        return clf
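
For orientation, a minimal standalone use of KNeighborsClassifier outside the factory class — a sketch with toy data, not code from this repository:

    from sklearn.neighbors import KNeighborsClassifier

    X = [[0], [1], [2], [3]]           # toy 1-D feature
    Y = [0, 0, 1, 1]
    knn = KNeighborsClassifier(n_neighbors=3).fit(X, Y)
    print(knn.predict_proba([[1.5]]))  # class-probability estimates from neighbor votes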
Example #3
    def doRandomSearch(self, clfName, clf, param_dist, X, Y):
        start = time.time()
        multiCores = -1
        if clfName == "Logistic_Regression":
            multiCores = 1
        if self._setXgboostTheradToOne == True and clfName == "Xgboost":
            multiCores = 1

        random_search = RandomizedSearchCV(clf,
                                           param_distributions=param_dist,
                                           n_iter=self._n_iter_search,
                                           n_jobs=multiCores,
                                           scoring='log_loss')

        random_search.fit(X, Y)
        log(clfName + " randomized search cost: ", time.time() - start, " sec")
        self._bestClf[clfName] = random_search.best_estimator_
        self._bestLoglossDict[clfName] = self.getLogloss(
            self._bestClf[clfName], X, Y)
        self.report(random_search.grid_scores_, clfName,
                    self._bestLoglossDict[clfName])

        dumpModel(random_search.best_estimator_, clfName, self._expInfo,
                  self._subFolderName)

        return random_search.best_estimator_
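
A minimal, self-contained sketch of the same RandomizedSearchCV pattern outside the class, with hypothetical toy data; note that the import path assumes scikit-learn >= 0.18, where the scorer used above was also renamed from 'log_loss' to 'neg_log_loss':

    from scipy.stats import randint as sp_randint
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import RandomizedSearchCV

    X, Y = make_classification(n_samples=200, n_features=10, random_state=0)
    param_dist = {"max_depth": sp_randint(3, 10),
                  "n_estimators": sp_randint(50, 150)}
    search = RandomizedSearchCV(RandomForestClassifier(random_state=0),
                                param_distributions=param_dist,
                                n_iter=5,
                                n_jobs=-1,
                                scoring='neg_log_loss')
    search.fit(X, Y)
    print(search.best_estimator_)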
Example #4
    def autoFlow(self, numIter, outputPath):
        log("Start blending autoFlow, num of Iter: ", numIter)
        start = time.time()
        distinctModels = len(self._clfNameList)
        tmpResultList = []
        tmpRandomWeightList = []
        tmpBlendedDfList = []
        for i in range(0, numIter):
            tmpWeightList = self.getRandomWeightList(distinctModels)
            tmpRandomWeightList.append(tmpWeightList)
            tmpDf = self.doBlending(tmpWeightList)
            tmpBlendedDfList.append(tmpDf)
            tmpResultList.append(self.calLogLoss(tmpDf))

        idList = np.array(tmpResultList).argsort()[:3]
        firstFlag = True
        finalDf = []
        logResult = []
        for id in idList:
            if firstFlag == True:
                finalDf = tmpBlendedDfList[id]
                self._bestParamList = tmpRandomWeightList[id]
                firstFlag = False
            log("logloss: ", tmpResultList[id], "blender param: ",
                tmpRandomWeightList[id])
            logResult.append((tmpResultList[id], tmpRandomWeightList[id]))
        mail("Blender Top3: ", logResult, self._clfNameList)
        log("clfNameList = ", self._clfNameList)
        log("low prob. id list (in 1st): #", len(self._lowProbIdList), ", ",
            self._lowProbIdList)
        log("End blending autoFlow, num of Iter: ", numIter, " cost: ",
            time.time() - start, " sec")

        finalDf.to_csv(outputPath, sep=',', encoding='utf-8')
Example #5
    def getRandomForestClf(self, X, Y):
        clfName = "Random_Forest"
        ## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
        clf = rf(n_estimators=300,
                 max_depth=None,
                 min_samples_split=1,
                 random_state=0,
                 bootstrap=True,
                 oob_score=True)

        if self._gridSearchFlag == True:
            log(clfName + " start searching param...")
            tmpLowDepth = 10
            tmpHighDepth = 50

            param_dist = {
                "max_depth": sp_randint(tmpLowDepth, tmpHighDepth),
                "max_features": sp_randf(0, 1),
                "min_samples_split": sp_randint(1, 11),
                "min_samples_leaf": sp_randint(1, 11),
                "criterion": ["gini", "entropy"],
                "n_estimators": sp_randint(100, 300),
            }

            clf = self.doRandomSearch(clfName, clf, param_dist, X, Y)

        return clf
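
The sp_randint and sp_randf helpers are used throughout these snippets but never defined in them; they are presumably import aliases for the scipy.stats distributions that RandomizedSearchCV can sample from. A sketch of the assumed imports:

    from scipy.stats import randint as sp_randint  # discrete uniform over [low, high)
    from scipy.stats import uniform as sp_randf    # continuous uniform over [loc, loc + scale)

If that reading is right, sp_randf(0, 1) covers [0, 1), but note that the second argument is a scale rather than an upper bound, so sp_randf(1.0, 3.0) in the logistic-regression example would cover [1.0, 4.0).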
Example #6
    def getKnnClf(self, X, Y):
        clfName = "K_NN"

        ## http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
        clf = KNeighborsClassifier(
            n_neighbors=5,
            weights='uniform',
            algorithm='auto',
            leaf_size=30,
            p=2,
            metric='minkowski',
            metric_params=None,
        )

        if self._gridSearchFlag == True:
            log(clfName + " start searching param...")

            param_dist = {
                "n_neighbors": sp_randint(4, 8),
                "weights": ['uniform', 'uniform'],
                "leaf_size": sp_randint(30, 60),
                "algorithm": ['auto', 'auto'],
            }

            clf = self.doRandomSearch(clfName, clf, param_dist, X, Y)

        return clf
Example #7
    def getLogisticRegressionClf(self, X, Y):
        clfName = "Logistic_Regression"

        ## http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
        clf = LogisticRegression(
            penalty='l2',
            dual=False,
            tol=0.0001,
            C=1.0,
            fit_intercept=True,
            intercept_scaling=1,
            class_weight=None,
            random_state=None,
            solver='liblinear',
            max_iter=100,
            multi_class='ovr',
            verbose=0,
        )

        if self._gridSearchFlag == True:
            log(clfName + " start searching param...")

            param_dist = {
                "penalty": ['l2', 'l2'],
                "C": sp_randf(1.0, 3.0),
                "solver": ['lbfgs', 'liblinear'],
            }

            clf = self.doRandomSearch(clfName, clf, param_dist, X, Y)

        return clf
Example #8
    def getLogisticRegressionClf(self, X, Y):
        clfName = "Logistic_Regression"
        
        ## http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
        clf = LogisticRegression(
            penalty='l2',
            dual=False,
            tol=0.0001,
            C=1.0,
            fit_intercept=True,
            intercept_scaling=1,
            class_weight=None,
            random_state=None,
            solver='liblinear',
            max_iter=100,
            multi_class='ovr',
            verbose=0,
        )
        
        if self._gridSearchFlag == True:
            log(clfName + " start searching param...")
            
            param_dist = {
                "penalty": ['l2', 'l2'],
                "C": sp_randf(1.0, 3.0),
                "solver": ['lbfgs', 'liblinear'],
            }
            
            clf = self.doRandomSearch(clfName, clf, param_dist, X, Y)
            
        return clf
Example #9
    def readInCSV(self, path, mode):
        # 1. read csv data in
        df = pd.read_csv(path, header=0, sep=',')
        log("loading csv: " + path)
        if mode.lower() == "train":
            self._ansDataFrame = df[df.columns[0]]
            self._trainDataFrame = df[df.columns[1:]]
        else:
            self._testDataFrame = df
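
The train-mode convention here is that column 0 holds the label and the remaining columns are features; a standalone pandas sketch of the same split (the file name is hypothetical):

    import pandas as pd

    df = pd.read_csv("train.csv", header=0, sep=',')  # hypothetical path
    Y = df[df.columns[0]]    # first column: answer/label
    X = df[df.columns[1:]]   # remaining columns: features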
Example #11
 def genXgboostRpt(self, bestClf, bestScore, paramList, best_num_round):
     dumpModel(bestClf, "Xgboost", self._expInfo, self._subFolderName)
     log("Native Xgboost best score : ", bestScore, ", param list: ",
         paramList, "best_num_round: ", best_num_round)
     if self._singleModelMail == True:
         mail(
             "Xgboost Done",
             "Native Xgboost best score : " + str(bestScore) +
                ", param list: " + str(paramList) + ", best_num_round: ",
             best_num_round)
Example #12
    def getNaiveBayesClf(self, X, Y):
        clfName = "Naive_Bayes"

        ## http://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes
        clf = GaussianNB()
        clf = clf.fit(X, Y)
        scores = cross_val_score(clf, X, Y)
        log(clfName + " Cross Validation Precision: ", scores.mean())
        self._bestScoreDict[clfName] = scores.mean()

        return clf
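
One caveat: with no scoring argument, cross_val_score uses the estimator's default score method, which for GaussianNB is accuracy, so the "Precision" wording in the log is really mean cross-validated accuracy. A standalone sketch with toy data (import path assumes scikit-learn >= 0.18):

    from sklearn.datasets import make_classification
    from sklearn.model_selection import cross_val_score
    from sklearn.naive_bayes import GaussianNB

    X, Y = make_classification(n_samples=200, n_features=10, random_state=0)
    scores = cross_val_score(GaussianNB(), X, Y)  # default scoring: accuracy
    print(scores.mean())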
Example #13
    def getNaiveBayesClf(self, X, Y):
        clfName = "Naive_Bayes"

        ## http://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes
        clf = GaussianNB()
        clf = clf.fit(X, Y)
        scores = cross_val_score(clf, X, Y)
        log(clfName + " Cross Validation Precision: ", scores.mean())
        self._bestScoreDict[clfName] = scores.mean()

        return clf
Example #14
    def report(self, grid_scores, clfName, bestLogLoss, n_top=3):
        top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
        bestParameters = {}
        mailContent = ""
        for i, score in enumerate(top_scores):

            log("Model with rank: {0}".format(i + 1))
            log("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                score.mean_validation_score,
                np.std(score.cv_validation_scores)))
            log("Parameters: {0}".format(score.parameters))

            mailContent += str("Model with rank: {0}".format(i + 1))
            mailContent += "\n"
            mailContent += str("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                score.mean_validation_score,
                np.std(score.cv_validation_scores)))
            mailContent += "\n"
            mailContent += str("Parameters: {0}".format(score.parameters))
            mailContent += "\n"

            if i == 0:
                self._bestScoreDict[clfName] = score.mean_validation_score
                mailContent += str("Best CV score: ") + str(score.mean_validation_score)
                mailContent += "\n"

            log("")
        #log (clfName , " best logloss: ", bestLogLoss)
        if (self._singleModelMail == True):
            mail("Single Model Done: ", clfName, ", ", mailContent)
        return bestParameters
Example #15
    def report(self, grid_scores, clfName, bestLogLoss, n_top=3):
        top_scores = sorted(grid_scores, key=itemgetter(1),
                            reverse=True)[:n_top]
        bestParameters = {}
        mailContent = ""
        for i, score in enumerate(top_scores):

            log("Model with rank: {0}".format(i + 1))
            log("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                score.mean_validation_score,
                np.std(score.cv_validation_scores)))
            log("Parameters: {0}".format(score.parameters))

            mailContent += str("Model with rank: {0}".format(i + 1))
            mailContent += "\n"
            mailContent += str(
                "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    score.mean_validation_score,
                    np.std(score.cv_validation_scores)))
            mailContent += "\n"
            mailContent += str("Parameters: {0}".format(score.parameters))
            mailContent += "\n"

            if i == 0:
                self._bestScoreDict[clfName] = score.mean_validation_score
                mailContent += str("Best CV score: ") + str(
                    score.mean_validation_score)
                mailContent += "\n"

            log("")
        #log (clfName , " best logloss: ", bestLogLoss)
        if (self._singleModelMail == True):
            mail("Single Model Done: ", clfName, ", ", mailContent)
        return bestParameters
Example #16
    def getXgboostClf(self, X, Y):
        clfName = "Xgboost"

        ## https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
        tmpLowDepth = 10
        tmpHighDepth = 50

        num_class = len(set(Y))
        objective = ""
        if len(set(Y)) <= 2:
            objective = "binary:logistic"
        else:
            objective = "multi:softprob"

        num_round = 120
        param = {
            'bst:max_depth': 74,
            'bst:eta': 0.05,
            'silent': 1,
            'min_child_weight': 2,
            'subsample': 0.6031536958709969,
            #'colsample_bytree': 0.7,
            'max_delta_step': 9,
            'gamma': 3,
            'eta': 0.23833373077656667,
            'eval_metric': 'mlogloss',
            'num_class': num_class,
            'objective': objective,
            'alpha': 1,
            'lambda': 1
        }
        param['nthread'] = 4
        plst = param.items()

        clf = None
        if self._gridSearchFlag == True:
            log(clfName + " start searching param...")
            clf = self.doXgboostRandomSearch(X, Y, num_round)

        else:
            dtrain = xgb.DMatrix(X, label=Y)
            clf = xgb.train(plst, dtrain, num_round)
        #joblib.dump(clf, xgbModelPath)
        return clf
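
A minimal, self-contained version of the non-search branch above, with hypothetical toy data; wrapping param.items() in list() keeps it Python 3 safe, since dict.items() is a view there rather than the list this Python 2 code assumes:

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(100, 5)
    Y = np.random.randint(0, 3, 100)  # three classes
    param = {'max_depth': 6, 'eta': 0.1, 'objective': 'multi:softprob',
             'num_class': 3, 'eval_metric': 'mlogloss'}
    dtrain = xgb.DMatrix(X, label=Y)
    clf = xgb.train(list(param.items()), dtrain, num_boost_round=20)
    probs = clf.predict(xgb.DMatrix(X))  # shape (100, 3): one probability per class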
Example #17
    def getXgboostClf(self, X, Y):
        clfName = "Xgboost"

        ## https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
        tmpLowDepth = 10
        tmpHighDepth = 50

        num_class = len(set(Y))
        objective = ""
        if len(set(Y)) <= 2:
            objective = "binary:logistic"
        else:
            objective = "multi:softprob"

        num_round = 120
        param = {
            'bst:max_depth': 74,
            'bst:eta': 0.05,
            'silent': 1,
            'min_child_weight': 2,
            'subsample': 0.6031536958709969,
            #'colsample_bytree': 0.7,
            'max_delta_step': 9,
            'gamma': 3,
            'eta': 0.23833373077656667,
            'eval_metric': 'mlogloss',
            'num_class': num_class,
            'objective': objective,
            'alpha': 1,
            'lambda': 1
        }
        param['nthread'] = 4
        plst = param.items()

        clf = None
        if self._gridSearchFlag == True:
            log(clfName + " start searching param...")
            clf = self.doXgboostRandomSearch(X, Y, num_round)

        else:
            dtrain = xgb.DMatrix(X, label=Y)
            clf = xgb.train(plst, dtrain, num_round)
        #joblib.dump(clf, xgbModelPath)
        return clf
Example #18
def dumpModel(clf, clfName, expInfo, subFolderName):

    tmpDumpPath = getDumpFilePath(clfName, expInfo, subFolderName)
    log("Start dump ", clfName, " to " + tmpDumpPath)
    log("Exp info: ", expInfo)
    joblib.dump(clf, tmpDumpPath)
    log("Dump ", clfName, " successfully")
Example #19
def dumpModel(clf, clfName, expInfo, subFolderName):

    tmpDumpPath = getDumpFilePath(clfName, expInfo, subFolderName)
    log("Start dump ", clfName, " to " + tmpDumpPath)
    log("Exp info: ", expInfo)
    joblib.dump(clf, tmpDumpPath)
    log("Dump ", clfName, " successfully")
Example #20
    def getExtraTressClf(self, X, Y):
        clfName = "Extra_Trees"

        ## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
        clf = ExtraTreesClassifier(n_estimators=10,
                                   criterion='gini',
                                   max_depth=None,
                                   min_samples_split=2,
                                   min_samples_leaf=1,
                                   min_weight_fraction_leaf=0.0,
                                   max_features='auto',
                                   max_leaf_nodes=None,
                                   bootstrap=False,
                                   oob_score=False,
                                   n_jobs=1,
                                   random_state=None,
                                   verbose=0,
                                   warm_start=False,
                                   class_weight=None)

        if self._gridSearchFlag == True:
            log(clfName + " start searching param...")
            tmpLowDepth = int(len(X.columns) * 0.7)
            tmpHighDepth = int(len(X.columns))

            param_dist = {
                "max_depth": sp_randint(tmpLowDepth, tmpHighDepth),
                "max_features": sp_randf(0, 1),
                "min_samples_split": sp_randint(1, 11),
                "min_samples_leaf": sp_randint(1, 11),
                "bootstrap": [True, True],
                "criterion": ["gini", "entropy"],
                "oob_score": [True, True],
                "n_estimators": sp_randint(100, 300),
            }

            clf = self.doRandomSearch(clfName, clf, param_dist, X, Y)

        return clf
Example #21
    def getExtraTressClf(self, X, Y):
        clfName = "Extra_Trees"

        ## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
        clf = ExtraTreesClassifier(n_estimators=10,
                                   criterion='gini',
                                   max_depth=None,
                                   min_samples_split=2,
                                   min_samples_leaf=1,
                                   min_weight_fraction_leaf=0.0,
                                   max_features='auto',
                                   max_leaf_nodes=None,
                                   bootstrap=False,
                                   oob_score=False,
                                   n_jobs=1,
                                   random_state=None,
                                   verbose=0,
                                   warm_start=False,
                                   class_weight=None)

        if self._gridSearchFlag == True:
            log(clfName + " start searching param...")
            tmpLowDepth = int(len(X.columns) * 0.7)
            tmpHighDepth = int(len(X.columns))

            param_dist = {
                "max_depth": sp_randint(tmpLowDepth, tmpHighDepth),
                "max_features": sp_randf(0, 1),
                "min_samples_split": sp_randint(1, 11),
                "min_samples_leaf": sp_randint(1, 11),
                "bootstrap": [True, True],
                "criterion": ["gini", "entropy"],
                "oob_score": [True, True],
                "n_estimators": sp_randint(100, 300),
            }

            clf = self.doRandomSearch(clfName, clf, param_dist, X, Y)

        return clf
Example #22
    def doRandomSearch(self, clfName, clf, param_dist, X, Y):
        start = time.time()
        multiCores = -1
        if clfName == "Logistic_Regression":
            multiCores = 1
        if self._setXgboostTheradToOne == True and clfName == "Xgboost":
            multiCores = 1

        random_search = RandomizedSearchCV(clf,
                                           param_distributions=param_dist,
                                           n_iter=self._n_iter_search,
                                           n_jobs=multiCores,
                                           scoring='log_loss')

        random_search.fit(X, Y)
        log(clfName + " randomized search cost: ", time.time() - start, " sec")
        self._bestClf[clfName] = random_search.best_estimator_
        self._bestLoglossDict[clfName] = self.getLogloss(self._bestClf[clfName], X, Y)
        self.report(random_search.grid_scores_, clfName, self._bestLoglossDict[clfName])

        dumpModel(random_search.best_estimator_, clfName, self._expInfo, self._subFolderName)

        return random_search.best_estimator_
Example #23
    def getRandomForestClf(self, X, Y):
        clfName = "Random_Forest"
        ## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
        clf = rf(n_estimators=300,
                 max_depth=None,
                 min_samples_split=1,
                 random_state=0,
                 bootstrap=True,
                 oob_score=True)

        if self._gridSearchFlag == True:
            log(clfName + " start searching param...")
            tmpLowDepth = 10
            tmpHighDepth = 50

            param_dist = {
                "max_depth": sp_randint(tmpLowDepth, tmpHighDepth),
                "max_features": sp_randf(0, 1),
                "min_samples_split": sp_randint(1, 11),
                "min_samples_leaf": sp_randint(1, 11),
                "criterion": ["gini", "entropy"],
                "n_estimators": sp_randint(100, 300),
            }

            clf = self.doRandomSearch(clfName, clf, param_dist, X, Y)

        return clf
Example #24
def loadModel(modelPath):

    log("Start load model: ", modelPath)
    clf = joblib.load(modelPath)
    return clf
Example #25
    #gbm = joblib.load( xgbModelPath )
    #finalClf = gbm

    if doTestFlag == True:
        print finalClf.predict_proba(dr._testDataFrame)


#     featureImportance =[]
#     for i in range(0,len(finalClf.feature_importances_)):
#         if i !=  len(dr._trainDataFrame.columns):
#             if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1:
#                 featureImportance.append(  [dr._trainDataFrame.columns[i] , finalClf.feature_importances_[i]] )
#
#     print featureImportance
#     featureImportance.sort(lambda x, y: cmp(x[1], y[1]), reverse=True)
#     print featureImportance

    if doTestFlag == True:
        return finalClf.predict_proba(dr._testDataFrame)

if __name__ == '__main__':
    start = time.time()

    exp()

    end = time.time()
    elapsed = end - start
    log("exp elapsed:", elapsed, "sec")
    #os.startfile('D:\\123.m4a')
Example #26
    def doXgboostRandomSearch(self, X, Y, num_round):

        paramList = []
        bestScore = sys.float_info.max
        bestClf = None
        best_num_round = 0

        num_class = len(set(Y))
        objective = ""
        if len(set(Y)) <= 2:
            objective = "binary:logistic"
        else:
            objective = "multi:softprob"

        for i in range(0, self._n_iter_search):
            log("xgboost start random search : " + str(i + 1) + "/" +
                str(self._n_iter_search))
            param = {}
            param['nthread'] = 4

            param['eta'] = random.uniform(0.15, 0.45)
            param['gamma'] = randint(0, 3)
            param['max_depth'] = randint(8, 120)
            param['min_child_weight'] = randint(1, 3)
            param['eval_metric'] = 'mlogloss'
            param['max_delta_step'] = randint(1, 10)
            param['objective'] = objective
            param['subsample'] = random.uniform(0.45, 0.65)
            param['num_class'] = num_class
            param['silent'] = 1
            param['alpha'] = 1
            param['lambda'] = 1
            #param['early_stopping_rounds']=2
            plst = param.items()

            evalDataPercentage = 0.2

            sampleRows = np.random.choice(X.index, int(len(X) * evalDataPercentage))

            sampleAnsDf = Y.ix[sampleRows]
            ori_X = X
            ori_Y = Y
            #dtest  = xgb.DMatrix( X.ix[sampleRows], label=sampleAnsDf)
            #dtrain  =  xgb.DMatrix( X.drop(sampleRows), label=Y.drop(sampleRows))
            #evallist  = [(dtest,'eval'), (dtrain,'train')]

            dtrain = xgb.DMatrix(X, label=Y)

            xgbCvResult = xgb.cv(plst,
                                 dtrain,
                                 num_boost_round=num_round,
                                 nfold=5)
            scoreList = xgbCvResult[xgbCvResult.columns[0]].tolist()
            new_num_round = scoreList.index(min(scoreList)) + 1
            minScore = scoreList[new_num_round - 1]

            tmpScore = minScore
            if tmpScore < bestScore:
                #tmpSelfScore = calLogLoss(pd.DataFrame(bst.predict(dtest)), sampleAnsDf)
                #print "self best score:" + str(tmpSelfScore)
                log("xgb best score:" + str(minScore))
                log("xgb best num_round: " + str(new_num_round))
                log("xgb best param: " + str(plst))
                newDtrain = xgb.DMatrix(ori_X, label=ori_Y)
                bst = xgb.train(plst, newDtrain, new_num_round)

                bestScore = tmpScore
                bestClf = bst
                paramList = plst
                best_num_round = new_num_round
                joblib.dump(bst, Config.xgboostBestTmpCflPath)

        self.genXgboostRpt(bestClf, bestScore, paramList, best_num_round)
        return bestClf
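
The search loop keys on the first column of the xgb.cv result to find the boosting round with the lowest mean test loss. A standalone sketch with toy data; since column order varies across xgboost versions, selecting the test-metric column by name is safer than by position:

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(200, 5)
    Y = np.random.randint(0, 2, 200)
    dtrain = xgb.DMatrix(X, label=Y)
    cv = xgb.cv({'objective': 'binary:logistic', 'eta': 0.3, 'eval_metric': 'logloss'},
                dtrain, num_boost_round=50, nfold=5)
    scores = cv['test-logloss-mean'].tolist()   # mean test logloss per round
    best_round = scores.index(min(scores)) + 1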
Example #27
            outTestFold1 = pd.DataFrame(predictTestResult)
            outTestFold1.columns = [
                tmpClfName + "_" + str(i) + "_0",
                tmpClfName + "_" + str(i) + "_1",
                tmpClfName + "_" + str(i) + "_2"
            ]
            dfTestLower = pd.concat([dfTestLower, outTestFold1], axis=1)

    mergeDf = dfUpper.append(dfLower)
    mergeTestDf = dfTestUpper.append(dfTestLower)

    mergeAns = train_fold_label_2.append(train_fold_label_1)

    # Testing

    tmpOutPath = _basePath + expNo + "_" + "Xgboost_" + "stacking" + "_ans.csv"

    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._singleModelMail = True
    fab._subFolderName = "stacking_level2_xgboost"
    fab._n_iter_search = 2
    fab._expInfo = expInfo
    clf = fab.getXgboostClf(mergeDf, mergeAns)

    log(clf.predict(xgb.DMatrix(mergeTestDf)))
    outDf = pd.DataFrame(clf.predict(xgb.DMatrix(mergeTestDf)))
    outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
    musicAlarm()
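
Side note: DataFrame.append, used here to merge the two stacking folds, was deprecated and then removed in pandas 2.0; under current pandas the equivalent is pd.concat. A toy sketch:

    import pandas as pd

    dfUpper = pd.DataFrame({'a': [1, 2]})    # stand-ins for the fold frames above
    dfLower = pd.DataFrame({'a': [3, 4]})
    mergeDf = pd.concat([dfUpper, dfLower])  # same result as dfUpper.append(dfLower)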
Example #28
    fab._gridSearchFlag = True
    #     fab._subFolderName = "stacked"
    fab._n_iter_search = 250
    fab._expInfo = expInfo
    #     fab.getAllModels(newX, newY)
    finalClf = fab.getRandomForestClf(newX, newY)

    featureImportance = []
    for i in range(0, len(finalClf.feature_importances_)):
        if i != len(dr._trainDataFrame.columns):
            # if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1:
            featureImportance.append([dr._trainDataFrame.columns[i], finalClf.feature_importances_[i]])

    # log( featureImportance)
    featureImportance.sort(lambda x, y: cmp(x[1], y[1]), reverse=True)
    log(featureImportance)

    trainNewX = dr._ansDataFrame
    tmpOutPath = _basePath + "012_train_trim_features.csv"
    selectCnt = 20
    tmpCnt = 0
    for tmpColName in featureImportance:
        for i in range(0, len(dr._trainDataFrame.columns)):

            if tmpColName[0] == dr._trainDataFrame.columns[i]:
                trainNewX = pd.concat([trainNewX, dr._trainDataFrame[dr._trainDataFrame.columns[i]]], axis=1)
                tmpCnt += 1
                break
        if tmpCnt == selectCnt:
            break
    trainNewX.to_csv(tmpOutPath, sep=",", encoding="utf-8")
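
The cmp-based sort above only runs on Python 2 (cmp and the cmp= sort argument are gone in Python 3); the key-based equivalent, sketched with toy values:

    featureImportance = [["colA", 0.2], ["colB", 0.7], ["colC", 0.1]]  # toy (name, importance) pairs
    featureImportance.sort(key=lambda item: item[1], reverse=True)
    # -> [['colB', 0.7], ['colA', 0.2], ['colC', 0.1]]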
Example #29
         
        outFold1 = pd.DataFrame(predictResult)
        outFold1.columns = [
            tmpClfName + "_" + str(i) + "_0",
            tmpClfName + "_" + str(i) + "_1",
            tmpClfName + "_" + str(i) + "_2"
        ]
        dfLower = pd.concat([dfLower, outFold1], axis=1)

        outTestFold1 = pd.DataFrame(predictTestResult)
        outTestFold1.columns = [
            tmpClfName + "_" + str(i) + "_0",
            tmpClfName + "_" + str(i) + "_1",
            tmpClfName + "_" + str(i) + "_2"
        ]
        dfTestLower = pd.concat([dfTestLower, outTestFold1], axis=1)

    mergeDf = dfUpper.append(dfLower)
    mergeTestDf = dfTestUpper.append(dfTestLower)

    mergeAns = train_fold_label_2.append(train_fold_label_1)

    # Testing

    tmpOutPath = _basePath + expNo + "_" + "Xgboost_" + "stacking" + "_ans.csv"

    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._singleModelMail = True
    fab._subFolderName = "stacking_level2_xgboost"
    fab._n_iter_search = 2
    fab._expInfo = expInfo
    clf = fab.getXgboostClf(mergeDf, mergeAns)

    log(clf.predict(xgb.DMatrix(mergeTestDf)))
    outDf = pd.DataFrame(clf.predict(xgb.DMatrix(mergeTestDf)))
    outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
    musicAlarm()
Example #30
    newX, newY = dr._trainDataFrame, dr._ansDataFrame

    dr2 = DataReader()
    dr2.readInCSV(testPath, "test")
    #newX = dr2._testDataFrame

    dr3 = DataReader()
    dr3.readInCSV(testSortIdPath, "test")
    sortIdDf = dr3._testDataFrame

    dr4 = DataReader()
    dr4.readInCSV(trainSortIdPath, "test")
    sortIdDf = dr4._testDataFrame

    modelFolder = _basePath + "models" + Config.osSep + "binary" + Config.osSep
    curModel = "Xgboost"
    modelPath = modelFolder + str(getMatchNameModelPath(modelFolder, curModel))
    tmpOutPath = _basePath + expNo + "_" + curModel + "_test_ans.csv"
    tmpClf = loadModel(modelPath)
    log(tmpClf.predict_proba(newX))
    ans = tmpClf.predict_proba(newX)

    ansList = []
    for i, tmpAns in enumerate(dr._ansDataFrame):
        if ans[i][tmpAns] < 0.35:
            #log( "id: " + sortIdDf[sortIdDf.columns[0]][i] + ", prob: " + ans[i][tmpAns], ", cate: " + tmpAns)
            log((sortIdDf[sortIdDf.columns[0]][i], ans[i][tmpAns], tmpAns))
            ansList.append(
                (sortIdDf[sortIdDf.columns[0]][i], ans[i][tmpAns], tmpAns))

    log(len(ansList))
Example #31
    # 1. read in data
    expNo = "014"
    expInfo = expNo + "_one_hot_each_features"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep

    featureList = ["location", "event_type", "resource_type", "severity_type", "log_feature"]

    ansPath = _basePath + "014_ans_array.csv"
    drAns = DataReader()
    drAns.readInCSV(ansPath, "train")
    newY = drAns._ansDataFrame

    for i in range(1, 32):
        log("start " + str(i) + "/32 ...")
        tmpCurFeatureList = []

        flagList = []
        for i2 in range(0, 7 - len(bin(i))):
            flagList.append(0)
        for i2 in range(2, len(bin(i))):
            flagList.append(int(bin(i)[i2]))

        for j in range(0, 5):
            if flagList[j] == 1:
                tmpCurFeatureList.append(featureList[j])

        log(tmpCurFeatureList)
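
The flagList construction above pads bin(i) to five binary digits by hand so that each i in 1..31 selects a non-empty subset of the five features. A compact equivalent (a sketch, not the repo's code) uses a format spec:

    featureList = ["location", "event_type", "resource_type", "severity_type", "log_feature"]
    for i in range(1, 32):
        flagList = [int(b) for b in format(i, '05b')]  # five-digit binary mask
        tmpCurFeatureList = [f for f, flag in zip(featureList, flagList) if flag == 1]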
Example #32
    def genXgboostRpt(self, bestClf, bestScore, paramList, best_num_round):
        dumpModel(bestClf, "Xgboost", self._expInfo, self._subFolderName)
        log("Native Xgboost best score : ", bestScore, ", param list: ",
            paramList, "best_num_round: ", best_num_round)
        if self._singleModelMail == True:
            mail("Xgboost Done",
                 "Native Xgboost best score : " + str(bestScore) +
                 ", param list: " + str(paramList) + ", best_num_round: ",
                 best_num_round)
Example #33
    def getAllModels(self, X, Y):

        log("GetAllModels start with iteration numbers: ", self._n_iter_search)
        start = time.time()

        self._basicClf["Xgboost"] = self.getXgboostClf(X, Y)
        self._basicClf["Random_Forest"] = self.getRandomForestClf(X, Y)
        self._basicClf["Extra_Trees"] = self.getExtraTressClf(X, Y)

        if not self._onlyTreeBasedModels:
            self._basicClf["K_NN"] = self.getKnnClf(X, Y)
            self._basicClf[
                "Logistic_Regression"] = self.getLogisticRegressionClf(X, Y)
            self._basicClf["Naive_Bayes"] = self.getNaiveBayesClf(X, Y)

        log("GetAllModels cost: ", time.time() - start, " sec")
        log(
            sorted(self._bestScoreDict.items(),
                   key=lambda x: x[1],
                   reverse=True))
        mail(
            self._expInfo,
            sorted(self._bestScoreDict.items(),
                   key=lambda x: x[1],
                   reverse=True))
        log(
            self._expInfo,
            sorted(self._bestScoreDict.items(),
                   key=lambda x: x[1],
                   reverse=True))
        bestScoreList = sorted(self._bestScoreDict.items(),
                               key=lambda x: x[1],
                               reverse=True)
        log("MVP clf is : ", bestScoreList[0][0])
        self._mvpClf = self._bestClf[bestScoreList[0][0]]
        log("GetAllModels end with iteration numbers: ", self._n_iter_search)
Example #34
#     newX  = xgb.DMatrix(newX)
#     #print clf.predict(newX)
#     tmpOutPath = _basePath + expNo +"_" + "Xgboost" + "_testXgboost7_ans.csv"
#     log(clf.predict(newX))
#     outDf = pd.DataFrame(clf.predict(newX))
#     outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
#     musicAlarm()
    
    clf = joblib.load("F:\\xgboost_tmp_best_020.model")
    tmpPath = _basePath + "test_merge_one_hot" + ".csv"
    dr = DataReader()
    dr.readInCSV(tmpPath, "test")
    newX = dr._testDataFrame
    newX = xgb.DMatrix(newX)
    tmpOutPath = _basePath + expNo + "_" + "Xgboost_" + "groupby_sum" + "_ans_" + "2" + ".csv"
    log(clf.predict(newX))
    outDf = pd.DataFrame(clf.predict(newX))
    outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
    musicAlarm()

#     sampleRows = np.random.choice(X.index, len(X)*evalDataPercentage) 
#     
#     print  X.ix[sampleRows]
#     exit()
#     dtest  = xgb.DMatrix( X.ix[sampleRows], label=Y.ix[sampleRows])
#     dtrain  =  xgb.DMatrix( X.drop(sampleRows), label=Y.drop(sampleRows))
#     
#     print strftime("%Y-%m-%d %H:%M:%S", gmtime())
#     
Example #35
    #X, Y =  pd.DataFrame([1,2,3,4,5,6,7,8,9,10,11,12]), pd.DataFrame([1,2,3,4,5,6,7,8,9,10,11,12])
    #newX, newY =  stratifyData(X,Y, 0.4)
    #     clf = fab.getNaiveBayesClf(X, Y)
    #     clf2 = fab.getKnnClf(X, Y)
    #clf3 = fab.getRandomForestClf(X, Y)
    #     x= clf.predict_proba(X)
    #     log( x)
    #log(fab._bestScoreDict)
    #     #log(fab._bestClf)
    #     log( fab._bestClf['Random Forest'].predict_proba(X))
    #newX, newY = stratifyData(X, Y, 0.4)
    newX, newY = X, Y
    #print newX
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._n_iter_search = 1
    fab._expInfo = "001_location_only"
    print newX
    #print newY
    fab.getAllModels(newX, newY)
    #fab.getRandomForestClf(newX, newY)

    bestClf = fab._mvpClf
    log(bestClf.predict_proba(newX))
    #log(sorted(fab._bestScoreDict.items(), key=lambda x: x[1] , reverse=True) )
    #log(fab._bestClf['Random Forest'].predict_proba(X))
    #dumpModel(clf3, "Random_Forest", "ExpTest")
    #log("haha")
    #log(getDumpFilePath( "Random_Forest", "haha Tets"))
    #musicAlarm()
Example #36
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame

    dr2 = DataReader()
    dr2.readInCSV(testPath, "test")
    #newX = dr2._testDataFrame

    dr3 = DataReader()
    dr3.readInCSV(testSortIdPath, "test")
    sortIdDf = dr3._testDataFrame

    dr4 = DataReader()
    dr4.readInCSV(trainSortIdPath, "test")
    sortIdDf = dr4._testDataFrame

    modelFolder = _basePath + "models" + Config.osSep + "binary" + Config.osSep
    curModel = "Xgboost"
    modelPath = modelFolder + str(getMatchNameModelPath(modelFolder, curModel))
    tmpOutPath = _basePath + expNo + "_" + curModel + "_test_ans.csv"
    tmpClf = loadModel(modelPath)
    log(tmpClf.predict_proba(newX))
    ans = tmpClf.predict_proba(newX)

    ansList = []
    for i, tmpAns in enumerate(dr._ansDataFrame):
        if ans[i][tmpAns] < 0.35:
            #log( "id: " + sortIdDf[sortIdDf.columns[0]][i] + ", prob: " + ans[i][tmpAns], ", cate: " + tmpAns)
            log((sortIdDf[sortIdDf.columns[0]][i], ans[i][tmpAns], tmpAns))
            ansList.append((sortIdDf[sortIdDf.columns[0]][i], ans[i][tmpAns], tmpAns))

    log(len(ansList))
Example #37
    #gbm = joblib.load( xgbModelPath )
    #finalClf = gbm
    
    if doTestFlag == True:
        print finalClf.predict_proba(dr._testDataFrame)

#     featureImportance =[]
#     for i in range(0,len(finalClf.feature_importances_)):
#         if i !=  len(dr._trainDataFrame.columns):
#             if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1:
#                 featureImportance.append(  [dr._trainDataFrame.columns[i] , finalClf.feature_importances_[i]] )
#
#     print featureImportance
#     featureImportance.sort(lambda x, y: cmp(x[1], y[1]), reverse=True)
#     print featureImportance

    if doTestFlag == True:
        return finalClf.predict_proba(dr._testDataFrame)

if __name__ == '__main__':
    start = time.time()

    exp()

    end = time.time()
    elapsed = end - start
    log("exp elapsed:", elapsed, "sec")
    #os.startfile('D:\\123.m4a')
    
    if doTestFlag == True:
        dr.readInCSV(testPath, "test")
        newX = dr._testDataFrame
        #newX = pd.DataFrame(newX[newX.columns[0]])
        #print newX
    
    # 3. get all best model from newX
#     fab = ModelFactory()
#     fab._gridSearchFlag = True
#     fab._n_iter_search = 100
#     fab._expInfo = expInfo
#     fab.getXgboostClf(newX, newY)
    
    # 4. test all data, output 3 ans as features
    #D:\Kaggle\Telstra\004_one_hot_resource_type\(Xgboost)_(2016-02-06_11_14_31).model
    #D:\Kaggle\Telstra\004_one_hot_resource_type\(Random_Forest)_(2016-02-06_11_24_09).model
    #D:\Kaggle\Telstra\004_one_hot_resource_type\(Extra_Trees)_(2016-02-06_11_30_52).model
    #D:\Kaggle\Telstra\004_one_hot_resource_type\(K_NN)_(2016-02-06_11_40_10).model

    modelPath = _basePath + "(Xgboost)_(2016-02-06_11_14_31).model"
    tmpOutPath = _basePath + "004_submission_1_train_Xgboost.csv"
    tmpClf = loadModel(modelPath)
    log(tmpClf.predict_proba(newX))
    #outDf = pd.concat([newX, pd.DataFrame(tmpClf.predict_proba(newX))], axis=1)
    outDf = pd.DataFrame(tmpClf.predict_proba(newX))
    outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
    #musicAlarm()
    log("004 Done")
Example #39
    tmpTestPath = _basePath + "test_tobe.csv"
    tmpOutPath = _basePath + "test_ans.csv"
    tmpTestDf = pd.DataFrame()
    for tmpColName in trainColNameList:
        print tmpColName
        for tmpTestColName in testX.columns:

            if tmpColName == tmpTestColName:
                tmpTestDf = pd.concat([tmpTestDf, testX[tmpColName]], axis=1)

    tmpTestDf.to_csv(tmpTestPath, sep=',', encoding='utf-8')
    modelFolder = _basePath + "models" + Config.osSep + "top20" + Config.osSep
    curModel = "Xgboost"
    modelPath = modelFolder + str(getMatchNameModelPath(modelFolder, curModel))
    tmpClf = loadModel(modelPath)
    log(tmpClf.predict_proba(tmpTestDf))
    outDf = pd.DataFrame(tmpClf.predict_proba(tmpTestDf))
    outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')

    # Get all best model from newX
    #     fab = ModelFactory()
    #     fab._gridSearchFlag = True
    #     fab._subFolderName = "top20"
    #     fab._n_iter_search = 250
    #     fab._expInfo = expInfo
    #     fab.getAllModels(newX, newY)
    #     finalClf = fab.getRandomForestClf(newX, newY)

    # Test all data
    modelList = ["Xgboost", "Random_Forest", "Extra_Trees", "K_NN", "Logistic_Regression"]
Example #40
    #X, Y =  pd.DataFrame([1,2,3,4,5,6,7,8,9,10,11,12]), pd.DataFrame([1,2,3,4,5,6,7,8,9,10,11,12])
    #newX, newY =  stratifyData(X,Y, 0.4)
    #     clf = fab.getNaiveBayesClf(X, Y)
    #     clf2 = fab.getKnnClf(X, Y)
    #clf3 = fab.getRandomForestClf(X, Y)
    #     x= clf.predict_proba(X)
    #     log( x)
    #log(fab._bestScoreDict)
    #     #log(fab._bestClf)
    #     log( fab._bestClf['Random Forest'].predict_proba(X))
    #newX, newY = stratifyData(X, Y, 0.4)
    newX, newY = X, Y
    #print newX
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._n_iter_search = 1
    fab._expInfo = "001_location_only"
    print newX
    #print newY
    fab.getAllModels(newX, newY)
    #fab.getRandomForestClf(newX, newY)

    bestClf = fab._mvpClf
    log(bestClf.predict_proba(newX))
    #log(sorted(fab._bestScoreDict.items(), key=lambda x: x[1] , reverse=True) )
    #log(fab._bestClf['Random Forest'].predict_proba(X))
    #dumpModel(clf3, "Random_Forest", "ExpTest")
    #log("haha")
    #log(getDumpFilePath( "Random_Forest", "haha Tets"))
    #musicAlarm()
Example #41
def loadModel(modelPath):

    log("Start load model: ", modelPath)
    clf = joblib.load(modelPath)
    return clf
Example #42
    testPath = _basePath + "001_test_tobe.csv"

    # 1. read data
    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame
    if doTestFlag == True:
        dr.readInCSV(testPath, "test")
        newX = dr._testDataFrame
        #newX = pd.DataFrame(newX[newX.columns[0]])
        print newX
    # 2. stratify 60 % data and train location only
    #     newX, newY = stratifyData(dr._trainDataFrame, dr._ansDataFrame, 0.4)

    # 3. get all best model from newX
    #     fab = ModelFactory()
    #     fab._gridSearchFlag = True
    #     fab._n_iter_search = 500
    #     fab._expInfo = "001_location_only"
    #     fab.getAllModels(newX, newY)

    # 4. test all data, output 3 ans as features
    modelPath = _basePath + "(Xgboost)_(2016-02-03_18_39_14).model"
    tmpOutPath = _basePath + "001_submission_2.csv"
    tmpClf = loadModel(modelPath)
    log(tmpClf.predict_proba(newX))
    #outDf = pd.concat([newX, pd.DataFrame(tmpClf.predict_proba(newX))], axis=1)
    outDf = pd.DataFrame(tmpClf.predict_proba(newX))
    outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
    musicAlarm()
Example #43
#     drAns = DataReader()
#     drAns.readInCSV(ansPath, "train")
#     newY = drAns._ansDataFrame

    tmpPath = _basePath + "train_merge_one_hot.csv"
    dr = DataReader()
    dr.readInCSV(tmpPath, "train")
    newX = dr._trainDataFrame
    newY = dr._ansDataFrame

    fab = ModelFactory()
    #fab._setXgboostTheradToOne = True
    fab._gridSearchFlag = True
    fab._singleModelMail = True
    fab._subFolderName = "groupby_sum"
    fab._n_iter_search = 1
    fab._expInfo = expInfo
    clf = fab.getXgboostClf(newX, newY)

    tmpPath = _basePath + "test_merge_one_hot" + ".csv"
    dr = DataReader()
    dr.readInCSV(tmpPath, "test")
    newX = dr._testDataFrame
    newX = xgb.DMatrix(newX)
    tmpOutPath = _basePath + expNo + "_" + "Xgboost_" + "groupby_sum" + "_ans.csv"
    log(clf.predict(newX))
    outDf = pd.DataFrame(clf.predict(newX))
    outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
    musicAlarm()
Example #44
    def getAllModels(self, X, Y):

        log("GetAllModels start with iteration numbers: ", self._n_iter_search)
        start = time.time()

        self._basicClf["Xgboost"] = self.getXgboostClf(X, Y)
        self._basicClf["Random_Forest"] = self.getRandomForestClf(X, Y)
        self._basicClf["Extra_Trees"] = self.getExtraTressClf(X, Y)

        if not self._onlyTreeBasedModels:
            self._basicClf["K_NN"] = self.getKnnClf(X, Y)
            self._basicClf["Logistic_Regression"] = self.getLogisticRegressionClf(X, Y)
            self._basicClf["Naive_Bayes"] = self.getNaiveBayesClf(X, Y)

        log("GetAllModels cost: ", time.time() - start, " sec")
        log(sorted(self._bestScoreDict.items(), key=lambda x: x[1], reverse=True))
        mail(self._expInfo, sorted(self._bestScoreDict.items(), key=lambda x: x[1], reverse=True))
        log(self._expInfo, sorted(self._bestScoreDict.items(), key=lambda x: x[1], reverse=True))
        bestScoreList = sorted(self._bestScoreDict.items(), key=lambda x: x[1], reverse=True)
        log("MVP clf is : ", bestScoreList[0][0])
        self._mvpClf = self._bestClf[bestScoreList[0][0]]
        log("GetAllModels end with iteration numbers: ", self._n_iter_search)
Example #45
        
#      4. test all data, output 3 ans as features
#     D:\Kaggle\Telstra\004_one_hot_resource_type\(Xgboost)_(2016-02-06_11_14_31).model
#     D:\Kaggle\Telstra\004_one_hot_resource_type\(Random_Forest)_(2016-02-06_11_24_09).model
#     D:\Kaggle\Telstra\004_one_hot_resource_type\(Extra_Trees)_(2016-02-06_11_30_52).model
#     D:\Kaggle\Telstra\004_one_hot_resource_type\(K_NN)_(2016-02-06_11_40_10).model

    modelFolder = _basePath + "models" + Config.osSep + "stacked" + Config.osSep

    clfNameList = []
    clfNameList.append("Extra_Trees")
    clfNameList.append("K_NN")
    clfNameList.append("Random_Forest")
    clfNameList.append("Xgboost")
    clfNameList.append("Logistic_Regression")

    testCsv = _basePath + "010_train_tobe.csv"
    dr = DataReader()
    newX, testY = dr.cvtPathListToDfList(testCsv, "train")

    for curModel in clfNameList:
        modelPath = modelFolder + str(getMatchNameModelPath(modelFolder, curModel))
        tmpOutPath = _basePath + expNo + "_blender" + curModel + "_train.csv"
        tmpClf = loadModel(modelPath)
        log(tmpClf.predict_proba(newX))
        #outDf = pd.concat([newX, pd.DataFrame(tmpClf.predict_proba(newX))], axis=1)
        outDf = pd.DataFrame(tmpClf.predict_proba(newX))
        outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
        #musicAlarm()
Example #46
    tmpTestPath = _basePath + "test_tobe.csv"
    tmpOutPath = _basePath + "test_ans.csv"
    tmpTestDf = pd.DataFrame()
    for tmpColName in trainColNameList:
        print tmpColName
        for tmpTestColName in testX.columns:

            if tmpColName == tmpTestColName:
                tmpTestDf = pd.concat([tmpTestDf, testX[tmpColName]], axis=1)

    tmpTestDf.to_csv(tmpTestPath, sep=',', encoding='utf-8')
    modelFolder = _basePath + "models" + Config.osSep + "top20" + Config.osSep
    curModel = "Xgboost"
    modelPath = modelFolder + str(getMatchNameModelPath(modelFolder, curModel))
    tmpClf = loadModel(modelPath)
    log(tmpClf.predict_proba(tmpTestDf))
    outDf = pd.DataFrame(tmpClf.predict_proba(tmpTestDf))
    outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')

    # Get all best model from newX
    #     fab = ModelFactory()
    #     fab._gridSearchFlag = True
    #     fab._subFolderName = "top20"
    #     fab._n_iter_search = 250
    #     fab._expInfo = expInfo
    #     fab.getAllModels(newX, newY)
    #     finalClf = fab.getRandomForestClf(newX, newY)

    # Test all data
    modelList = [
        "Xgboost", "Random_Forest", "Extra_Trees", "K_NN",
Example #47
'''
Created on Jan 30, 2016
author: whmou

Jan 30, 2016     1.0.0     Init.

'''

from Telstra.util.CustomLogger import info as log

if __name__ == '__main__':
    log("haha")
Example #48
    def doXgboostRandomSearch(self, X, Y, num_round):

        paramList = []
        bestScore = sys.float_info.max
        bestClf = None
        best_num_round = 0

        num_class = len(set(Y))
        objective = ""
        if len(set(Y)) <= 2:
            objective = "binary:logistic"
        else:
            objective = "multi:softprob"

        for i in range(0, self._n_iter_search):
            log("xgboost start random search : " + str(i + 1) + "/" +
                str(self._n_iter_search))
            param = {}
            param['nthread'] = 4

            param['eta'] = random.uniform(0.15, 0.45)
            param['gamma'] = randint(0, 3)
            param['max_depth'] = randint(8, 120)
            param['min_child_weight'] = randint(1, 3)
            param['eval_metric'] = 'mlogloss'
            param['max_delta_step'] = randint(1, 10)
            param['objective'] = objective
            param['subsample'] = random.uniform(0.45, 0.65)
            param['num_class'] = num_class
            param['silent'] = 1
            param['alpha'] = 1
            param['lambda'] = 1
            #param['early_stopping_rounds']=2
            plst = param.items()

            evalDataPercentage = 0.2

            sampleRows = np.random.choice(X.index, int(len(X) * evalDataPercentage))

            sampleAnsDf = Y.ix[sampleRows]
            ori_X = X
            ori_Y = Y
            #dtest  = xgb.DMatrix( X.ix[sampleRows], label=sampleAnsDf)
            #dtrain  =  xgb.DMatrix( X.drop(sampleRows), label=Y.drop(sampleRows))
            #evallist  = [(dtest,'eval'), (dtrain,'train')]

            dtrain = xgb.DMatrix(X, label=Y)

            xgbCvResult = xgb.cv(plst,
                                 dtrain,
                                 num_boost_round=num_round,
                                 nfold=5)
            scoreList = xgbCvResult[xgbCvResult.columns[0]].tolist()
            new_num_round = scoreList.index(min(scoreList)) + 1
            minScore = scoreList[new_num_round - 1]

            tmpScore = minScore
            if tmpScore < bestScore:
                #tmpSelfScore = calLogLoss(pd.DataFrame(bst.predict(dtest)), sampleAnsDf)
                #print "self best score:" + str(tmpSelfScore)
                log("xgb best score:" + str(minScore))
                log("xgb best num_round: " + str(new_num_round))
                log("xgb best param: " + str(plst))
                newDtrain = xgb.DMatrix(ori_X, label=ori_Y)
                bst = xgb.train(plst, newDtrain, new_num_round)

                bestScore = tmpScore
                bestClf = bst
                paramList = plst
                best_num_round = new_num_round
                joblib.dump(bst, Config.xgboostBestTmpCflPath)

        self.genXgboostRpt(bestClf, bestScore, paramList, best_num_round)
        return bestClf