Пример #1
0
def print_results(unique_test_name, grid, y_pred, y_test):
	"""Summarize a fitted grid search and plot its ROC curve.

	Prints the best cross-validation score and parameters from ``grid``,
	then a classification report and confusion matrix for the held-out
	predictions, and finally shows the ROC curve.

	Args:
		unique_test_name: Label printed in the section header.
		grid: Fitted search object exposing ``best_score_`` and
			``best_params_`` (e.g. a GridSearchCV).
		y_pred: Predicted labels/scores for the test set.
		y_test: Ground-truth labels for the test set.

	NOTE(review): ``ROC``, ``auc``, ``classification_report``,
	``confusion_matrix`` and ``plt`` must be in scope at module level —
	presumably sklearn/matplotlib aliases; confirm against the file's
	imports.
	"""
	fpr, tpr, _ = ROC(y_test, y_pred)
	roc_auc = auc(fpr, tpr)

	print("-----------------%s--------------------" % unique_test_name)
	print("-----------------Best Param Overview--------------------")
	print("Best score: %0.4f" % grid.best_score_)
	print("Using the following parameters:")
	print(grid.best_params_)
	print("-----------------Scoring Model--------------------")
	# BUG FIX: sklearn metrics take (y_true, y_pred); the original passed
	# them swapped, which transposes the confusion matrix and mislabels
	# per-class precision/recall in the report.
	print(classification_report(y_test, y_pred))
	print(confusion_matrix(y_test, y_pred), "\n")

	plt.figure()
	lw = 2
	plt.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
	plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
	plt.xlim([0.0, 1.0])
	plt.ylim([0.0, 1.05])
	plt.xlabel('False Positive Rate')
	plt.ylabel('True Positive Rate')
	plt.title('Receiver operating characteristic example')
	plt.legend(loc="lower right")
	plt.show()

	return
Пример #2
0
 def evaluateModel(self,test_x,test_y,model):
     """Score a fitted binary classifier on a held-out set.

     Args:
         test_x: Held-out feature matrix.
         test_y: Ground-truth labels for ``test_x``.
         model: Fitted estimator exposing ``predict``, ``predict_proba``
             and ``score``.

     Returns:
         Tuple ``(classification_report, accuracy, roc_auc)``.
     """
     from sklearn.metrics import classification_report as CR, roc_auc_score as ROC

     predicted_labels = model.predict(test_x)
     # Column 1 holds the positive-class probability.
     positive_scores = model.predict_proba(test_x)[:, 1]

     report = CR(test_y, predicted_labels)
     accuracy = model.score(test_x, test_y)
     auc_score = ROC(test_y, positive_scores)
     return report, accuracy, auc_score
Пример #3
0
 def ensembleStacking(self, train_x,train_y,test_x,test_y):
     """Train a pool of base classifiers and stack them with logistic regression.

     Each configured (classifier, algorithm) pair is fitted through its
     ``self.<method>`` wrapper; the base models' positive-class
     probabilities on ``test_x`` become the feature columns of a
     ``LogisticRegression`` stacking model.

     Returns:
         (models, df_val_pred): a list of ``{name: (model, roc_auc)}``
         dicts for every base model plus the stacker, and the stacked
         feature matrix (as a numpy array).

     NOTE(review): the stacker is trained and scored on the same
     ``test_x`` predictions, so the printed ROC is optimistic — confirm
     whether a separate holdout is intended.
     """
     from sklearn.linear_model import LogisticRegression
     from sklearn.metrics import classification_report as CR, roc_auc_score as ROC

     # Per-classifier list of algorithm variants; "" means use the
     # wrapper's default algorithm.
     algorithms = {"rfc":{"algo":["gini","entropy"] , "method":self.RandomForestClassifier},
                   "lgbm":{"algo":["gbdt","dart","goss"] , "method": self.lightgbm},
                   "logit": {"algo":[""] , "method":self.LogisticRegression},
                   "ada":{"algo":[""] , "method":self.AdaBoostClassifier},
                   "dt":{"algo":["gini","entropy"] , "method":self.DecisionTreeClassifier},
                   "knn":{"algo":["auto"] , "method":self.KNN},
                   "xgboost":{"algo":["gbtree","gblinear","dart"] , "method":self.XGBoostClassifier},
                   }

     models = []
     df_val_pred = pd.DataFrame()
     for key in algorithms:
         algosPerClassifier = algorithms[key]["algo"]
         for algo in algosPerClassifier:
             method = algorithms[key]["method"]
             model = method(train_x,train_y,test_x,test_y,algorithm = algo)
             predict_prob = model.predict_proba(test_x)[:,1]
             modelName = key + "_"+algo
             # Each base model contributes one probability column to the
             # stacker's feature matrix.
             df_val_pred = pd.concat([df_val_pred,pd.DataFrame(predict_prob,columns = [modelName])],
                                      axis = 1)
             models.append({modelName:(model,ROC(test_y,predict_prob))})

     df_val_pred = np.array(df_val_pred)

     # BUG FIX: removed leftover ``pdb.set_trace()`` debugger breakpoint.
     stackingModel = LogisticRegression(class_weight = 'balanced')
     stackingModel.fit(df_val_pred,test_y)

     predict_prob = stackingModel.predict_proba(df_val_pred)[:,1]
     roc = ROC(test_y,predict_prob)
     print("ROC %f" % roc)
     models.append({"StackingModel":(stackingModel,roc)})
     return models,df_val_pred
Пример #4
0
def metaLearner(data_x,data_y,test,models):
    """Select the best fold-trained model by ROC AUC and score ``test``.

    Runs 5-fold stratified cross-validation; on each fold ``models`` (a
    callable ``(train_x, train_y, test_x, test_y) -> (model, scores)``)
    is trained and its fold predictions are scored with ROC AUC. The
    model from the best-scoring fold predicts on ``test``.

    Args:
        data_x: Feature matrix (indexable numpy array).
        data_y: Label vector aligned with ``data_x``.
        test: Feature matrix to score with the winning model.
        models: Training callable returning ``(fitted_model, fold_scores)``.

    Returns:
        ``predict_proba`` output of the best fold's model on ``test``.
    """
    from sklearn import svm
    from sklearn.metrics import classification_report as CR, roc_auc_score as ROC
    from sklearn.model_selection import StratifiedKFold

    # BUG FIX: start below any valid AUC (>= 0) so the first fold always
    # wins; the original 0 sentinel could leave model_final unset (an int)
    # in the degenerate roc == 0 case.
    best_roc = -1.0
    model_final = None
    skf = StratifiedKFold(n_splits=5)
    for train_index, test_index in skf.split(data_x, data_y):
        train_x, test_x = data_x[train_index], data_x[test_index]
        train_y, test_y = data_y[train_index], data_y[test_index]
        model,result = models(train_x,train_y,test_x,test_y)

        roc = ROC(test_y,result)
        print("ROC:",roc)
        if best_roc < roc:
            best_roc = roc
            model_final = model

    # BUG FIX: removed leftover ``pdb.set_trace()`` debugger breakpoint.
    result = model_final.predict_proba(test)
    return result
Пример #5
0
 def lightgbm_tmp(self,train,test):
     """Fit ``self.lightgbm`` across stratified folds and keep the best model.

     Args:
         train: DataFrame containing a ``TARGET`` label column plus features.
         test: DataFrame of feature rows to score with the winning model.

     Returns:
         ``(best_model, positive_class_probabilities)`` for ``test``.
     """
     from sklearn.model_selection import StratifiedKFold
     from sklearn.metrics import classification_report as CR, roc_auc_score as ROC

     features = np.array(train.loc[:, train.columns != "TARGET"])
     labels = np.array(train[["TARGET"]])
     holdout = np.array(test)

     best_auc = 0
     splitter = StratifiedKFold(n_splits=2)
     for fit_idx, eval_idx in splitter.split(features, labels):
         fold_train_x, fold_eval_x = features[fit_idx], features[eval_idx]
         fold_train_y, fold_eval_y = labels[fit_idx], labels[eval_idx]

         candidate, fold_scores = self.lightgbm(fold_train_x, fold_train_y,
                                                fold_eval_x, fold_eval_y)
         fold_auc = ROC(fold_eval_y, fold_scores)
         print("ROC", fold_auc)
         if fold_auc > best_auc:
             best_auc = fold_auc
             model = candidate

     # Positive-class probability column of the winning model.
     results = model.predict_proba(holdout)[:, 1]
     return model, results
# Report SVM test accuracy, then compare classifiers (logistic regression
# vs. SVM) by area under the ROC curve.
# NOTE(review): relies on module-level names defined earlier in the file:
# `fraction_correct`, `svc_pred`, `te` (presumably (X_test, y_test) —
# confirm), `lr`, `svc`, and the aliases `ROC`/`AUC` (presumably sklearn's
# roc_curve/auc) — verify against the file's imports.
st = '(svm) fraction of testing instances correctly predicted: '
print("{0}{1}".format(st, fraction_correct(svc_pred, te[1])))

#### Evaluate Classifier Performance

##### ROC

# In[148]:

# because we have a binary classification problem,
# we can use ROC to evaluate the quality of these models

# logistic regression: rank the test samples by positive-class probability
pred_prob_lr = lr.predict_proba(te[0])
# column 1 of predict_proba is the positive-class probability
false_pos_rate_lr, true_pos_rate_lr, thresholds_lr = ROC(
    te[1], pred_prob_lr[:, 1])
roc_auc_lr = AUC(false_pos_rate_lr, true_pos_rate_lr)
print(
    "Logisitc Regression, area under the curve: {0:>9.3f}".format(roc_auc_lr))

# svm: same evaluation for the support-vector classifier
pred_prob_svm = svc.predict_proba(te[0])
false_pos_rate_svm, true_pos_rate_svm, thresholds_svm = ROC(
    te[1], pred_prob_svm[:, 1])
roc_auc_svm = AUC(false_pos_rate_svm, true_pos_rate_svm)
print("SVM, area under the curve: {0:>25.3f}".format(roc_auc_svm))

# In[170]:

# plot the ROC curves for each classifier
Пример #7
0
    def _error(self, text, y_test, predictions):
        """Print and return the ROC AUC of *predictions* against *y_test*.

        Args:
            text: Label prefixed to the printed line.
            y_test: Ground-truth labels.
            predictions: Predicted scores/labels to evaluate.

        NOTE(review): relies on a module-level ``ROC`` (presumably
        sklearn's ``roc_auc_score`` — confirm against imports).
        """
        # Compute once; the original evaluated ROC twice on the same inputs.
        score = ROC(y_test, predictions)
        print(text + ' roc_auc_score: ', score)
        return score
Пример #8
0
# Train a linear SVM, report accuracy/recall/AUC, and plot its ROC curve.
# NOTE(review): relies on Xtrain/Ytrain/Xtest/Ytest, SVC, recall_score,
# roc_auc_score, time and datetime being in scope from earlier in the file.
times = time()
# class_weight="balanced" reweights classes inversely to their frequency;
# C was presumably tuned by an earlier search — confirm.
clf = SVC(kernel="linear",
          C=3.1663157894736838,
          cache_size=5000,
          class_weight="balanced").fit(Xtrain, Ytrain)
result = clf.predict(Xtest)
score = clf.score(Xtest, Ytest)
recall = recall_score(Ytest, result)
# Hoisted: the original recomputed clf.decision_function(Xtest) three times.
decision_scores = clf.decision_function(Xtest)
auc = roc_auc_score(Ytest, decision_scores)
# BUG FIX: the original format string had a stray quote ("recall is %f',").
print("testing accuracy %f, recall is %f, auc is %f" % (score, recall, auc))
print(datetime.datetime.fromtimestamp(time() - times).strftime("%M:%S:%f"))

from sklearn.metrics import roc_curve as ROC
import matplotlib.pyplot as plt

FPR, Recall, thresholds = ROC(Ytest, decision_scores, pos_label=1)

# Same value as `auc` above; kept under its original name for later use.
area = auc

plt.figure()
plt.plot(FPR, Recall, color='red', label='ROC curve (area = %0.2f)' % area)
plt.plot([0, 1], [0, 1], color='black', linestyle='--')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('Recall')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

# Index of the threshold maximizing Youden's J statistic (Recall - FPR);
# computed once instead of twice as in the original.
youden = Recall - FPR
maxindex = youden.tolist().index(max(youden))
Пример #9
0
 def ensembleStacking(self, train,test, iteration = 5):
     """Out-of-fold stacking over several base classifiers.

     Builds out-of-fold positive-class predictions for every configured
     (classifier, algorithm) pair across ``iteration`` stratified folds,
     fits a ``LogisticRegression`` stacker on those predictions, and
     applies the stacker to the base models' predictions on ``test``.

     Args:
         train: DataFrame containing a ``TARGET`` label column plus features.
         test: DataFrame of feature rows to predict.
         iteration: Number of stratified folds (default 5).

     Returns:
         ``(predict_prob, df_val_pred, target, df_test_pred)``: stacker
         probabilities for ``test``, the out-of-fold feature matrix, the
         fold-reordered targets, and the base models' test predictions.
     """
     from sklearn.linear_model import LogisticRegression
     from sklearn.metrics import classification_report as CR, roc_auc_score as ROC
     from sklearn.model_selection import StratifiedKFold
     model = 0
     roc = 0
     models = []

     # Per-classifier list of algorithm variants; "" means the wrapper's
     # default algorithm.
     algorithms = {
                   "xgboost":{"algo":["gbtree","gblinear","dart"] , "method":self.XGBoostClassifier},
                   "tf":{"algo":[""], "method": self.TensorFlowModel},
                   "rfc":{"algo":["gini","entropy"] , "method":self.RandomForestClassifier},
                   "lgbm":{"algo":["gbdt","dart","goss"] , "method": self.lightgbm},
                   "logit": {"algo":[""] , "method":self.LogisticRegression},
                   "ada":{"algo":[""] , "method":self.AdaBoostClassifier},
                   "dt":{"algo":["gini","entropy"] , "method":self.DecisionTreeClassifier}

                   }

     skf = StratifiedKFold(n_splits=iteration)

     data_x = np.array(train.loc[:,train.columns != "TARGET"])
     data_y = np.array(train[["TARGET"]])
     data_test_x = np.array(test)

     df_val_pred = pd.DataFrame()
     df_test_pred = pd.DataFrame()
     index = 0
     target = np.array([], dtype = np.int32)

     for train_index, test_index in skf.split(data_x, data_y):
         train_x, test_x = data_x[train_index], data_x[test_index]
         train_y, test_y = data_y[train_index], data_y[test_index]

         df_val_pred_algo = pd.DataFrame()
         for key in algorithms:
             algosPerClassifier = algorithms[key]["algo"]
             for algo in algosPerClassifier:

                 method = algorithms[key]["method"]
                 model,predict_prob = method(train_x,train_y,test_x,test_y,algorithm = algo)

                 modelName = key + "_"+algo

                 #---------Merge result of each algo (one column per variant)
                 df_val_pred_algo = pd.concat([df_val_pred_algo,
                                              pd.DataFrame({modelName: predict_prob})],
                                          axis = 1)

                 if index == 0:
                     #-------Fit on the full training data once to produce
                     #-------the stacker's test-set feature columns.
                     # BUG FIX: pass algorithm=algo — the original omitted
                     # it here, so every variant of a classifier fell back
                     # to the default algorithm and produced duplicate
                     # test columns while the validation columns differed.
                     model,result = method(data_x, data_y,data_test_x,"",algorithm = algo)
                     df_test_pred = pd.concat([df_test_pred,pd.DataFrame({modelName: result})],axis = 1)

         #--------Add target variable (rows follow the fold ordering)
         target = np.concatenate((target,test_y[:,0]), axis = 0)

         #-------Merge all the splits row wise
         df_val_pred = pd.concat([df_val_pred,df_val_pred_algo],axis = 0)
         index = index + 1

     df_val_pred = np.array(df_val_pred)
     df_test_pred = np.array(df_test_pred)
     # BUG FIX: removed leftover ``pdb.set_trace()`` debugger breakpoint.
     stackingModel = LogisticRegression(class_weight = 'balanced')
     stackingModel.fit(df_val_pred,target)

     #--------Metrics info on training set (in-sample, so optimistic)
     predict_prob = stackingModel.predict_proba(df_val_pred)[:,1]
     roc = ROC(target,predict_prob)
     print("ROC %f" % roc)

     predict_prob = stackingModel.predict_proba(df_test_pred)[:,1]

     return predict_prob,df_val_pred,target,df_test_pred