from matplotlib import pyplot
from sklearn.metrics import roc_curve, auc


def draw_roc(self, label_sets, title='', save_path='', show_plot=False):
    """Plot one ROC curve per (labels, probas) pair and report its AUC."""
    pyplot.clf()
    for i, (labels, probas) in enumerate(label_sets):
        # Compute the ROC curve and the area under the curve;
        # column 1 of probas holds the positive-class probability
        fpr, tpr, _ = roc_curve(labels, probas[:, 1])
        roc_auc = auc(fpr, tpr)
        # Plot the ROC curve for this fold
        pyplot.plot(fpr, tpr, label='Training fold {0} (area = {1})'.format(
            i + 1, round(roc_auc, 2)))
    pyplot.plot([0, 1], [0, 1], 'k--')  # diagonal = random-guess baseline
    pyplot.xlim([0.0, 1.0])
    pyplot.ylim([0.0, 1.0])
    pyplot.xlabel('False Positive Rate')
    pyplot.ylabel('True Positive Rate')
    pyplot.title(title)
    pyplot.legend(loc="lower right")
    if save_path:
        pyplot.savefig(save_path)
    if show_plot:
        pyplot.show()
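For context, here is a sketch of how this method might be driven from a cross-validation loop. The `plotter` instance, the dataset, and the classifier are all hypothetical; `draw_roc` only requires that each entry of `label_sets` be a `(labels, probas)` pair where `probas` has one column per class.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=500, random_state=0)
clf = RandomForestClassifier(random_state=0)

label_sets = []
for train_idx, test_idx in StratifiedKFold(n_splits=5).split(X, y):
    clf.fit(X[train_idx], y[train_idx])
    # draw_roc reads column 1, the positive-class probability
    label_sets.append((y[test_idx], clf.predict_proba(X[test_idx])))

plotter.draw_roc(label_sets, title='ROC per fold', show_plot=True)  # plotter: hypothetical instance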
from sklearn import metrics

# Read the predicted scores, one float per line
with open("../output/splitpred1.csv") as f:
    numarray = [float(line) for line in f]

# Read the true labels, one float per line
with open("../output/answers.csv") as f:
    answerarray = [float(line) for line in f]

fpr, tpr, thresholds = metrics.roc_curve(answerarray, numarray, pos_label=1)
auc = metrics.auc(fpr, tpr)
print(auc)
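When only the AUC is needed, `sklearn.metrics.roc_auc_score` collapses the curve-then-area computation into one call, and `numpy.loadtxt` handles one-number-per-line files directly. A sketch with the same file paths, assuming the labels are already coded 0/1:

import numpy as np
from sklearn import metrics

scores = np.loadtxt("../output/splitpred1.csv")
labels = np.loadtxt("../output/answers.csv")
print(metrics.roc_auc_score(labels, scores))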
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

# First four columns are features, column 4 is the label
train_input = train[:, 0:4]
train_output = train[:, 4]
print("train in and out", train_input.shape, train_output.shape)

test_input = test[:, 0:4]
test_output = test[:, 4]
print("test in and out", test_input.shape, test_output.shape)

# Create the classifier (a RandomForestClassifier here;
# an SVC, Support Vector Classifier, would work too)
classifier = RandomForestClassifier()

# Learn the training data
classifier.fit(train_input, train_output)
print(test_output)

# Predict the output of the test input
predicted = classifier.predict(test_input)
print(predicted)

# Calculate the ROC curve, then the area under it
fpr, tpr, thresholds = metrics.roc_curve(test_output, predicted, pos_label=2)
auc = metrics.auc(fpr, tpr)
print(auc)
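One caveat worth flagging: `predict` returns hard class labels, so the ROC curve above has only one interior point. Feeding `roc_curve` the positive-class probability from `predict_proba` traces the full curve. A minimal sketch, reusing the `classifier`, `test_input`, and `test_output` names from above:

# Column order in predict_proba follows classifier.classes_;
# pick out the column for the positive label (2 in this data)
probas = classifier.predict_proba(test_input)
pos_col = list(classifier.classes_).index(2)
fpr, tpr, thresholds = metrics.roc_curve(test_output, probas[:, pos_col], pos_label=2)
print(metrics.auc(fpr, tpr))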
from sklearn.metrics import roc_curve, auc


def roc(y, p_hat):
    # Return the AUC along with the curve itself
    fpr, tpr, thresholds = roc_curve(y, p_hat)
    roc_auc = auc(fpr, tpr)
    return roc_auc, fpr, tpr
import numpy as np
from sklearn import metrics


def evaluate(y_true, y_pred):
    # Coerce inputs to arrays, then score with ROC AUC
    y_pred = np.array(y_pred)
    y_true = np.array(y_true)
    fpr, tpr, _thresholds = metrics.roc_curve(y_true, y_pred, pos_label=1)
    return metrics.auc(fpr, tpr)
import numpy as np
from sklearn import ensemble, linear_model, metrics, preprocessing
from sklearn.model_selection import ShuffleSplit


def doCV():
    SEED = 42
    rnd = np.random.RandomState(SEED)
    model_lr = linear_model.LogisticRegression(C=3)
    model_rf = ensemble.RandomForestClassifier(
        n_estimators=10, min_samples_split=10,
        n_jobs=2, random_state=rnd, verbose=2
    )

    print("loading data for random forest...")
    y, X = data_io.load_data_pd("train_orig.csv", use_labels=True)
    _, X_test = data_io.load_data_pd("test_orig.csv", use_labels=False)
    xtrain = getRFX(X)
    xtest = getRFX_test(X_test)
    xtrain = xtrain[:, 1:]
    xtest = xtest[:, 1:]
    xtrain.dump("num_train.dat")
    xtest.dump("num_test.dat")
    print("dumped..!")

    print("loading data for logistic regression...")
    ysp, Xsp = data_io.load_data("train_orig.csv")
    y_testsp, X_testsp = data_io.load_data("test_orig.csv", use_labels=False)

    # === one-hot encoding === #
    # We want to encode the category IDs encountered both in the training
    # and the test set, so we fit the encoder on both.
    encoder = preprocessing.OneHotEncoder()
    # print(Xsp.shape, X_testsp.shape)
    encoder.fit(np.vstack((Xsp, X_testsp)))
    Xsp = encoder.transform(Xsp)  # returns a scipy.sparse matrix
    X_testsp = encoder.transform(X_testsp)

    print("starting cross validation...")
    niter = 10
    cv = ShuffleSplit(n_splits=niter, test_size=0.2, random_state=rnd)
    mean_auc = 0.0
    for i, (train, test) in enumerate(cv.split(X)):
        xtrain = X.iloc[train]
        ytrain = y[train]
        xtest = X.iloc[test]
        ytest = y[test]
        # Index the encoded *labelled* matrix with the fold indices;
        # the held-out fold must come from the training data, not X_testsp
        xtrain_sp = Xsp[train]
        xtest_sp = Xsp[test]
        ytrainsp = ysp[train]
        xtrain = getRFX(xtrain)
        xtest = getRFX_test(xtest)
        xtrain = xtrain[:, 1:]
        xtest = xtest[:, 1:]

        print("fitting random forest....")
        model_rf.fit(xtrain, ytrain)
        preds_rf = model_rf.predict_proba(xtest)[:, 1]

        print("fitting logistic regression...")
        model_lr.fit(xtrain_sp, ytrainsp)
        preds_lr = model_lr.predict_proba(xtest_sp)[:, 1]

        # Blend the two models by averaging their probabilities
        preds = [np.mean(x) for x in zip(preds_rf, preds_lr)]
        fpr, tpr, _ = metrics.roc_curve(ytest, preds)
        roc_auc = metrics.auc(fpr, tpr)
        print("AUC (fold %d/%d): %f" % (i + 1, niter, roc_auc))
        mean_auc += roc_auc

    print("Mean AUC: ", mean_auc / niter)
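The blending step is just an unweighted mean of the two probability vectors; with NumPy arrays it vectorizes to a single expression (toy values below):

import numpy as np

preds_rf = np.array([0.9, 0.2, 0.6])  # toy random-forest probabilities
preds_lr = np.array([0.7, 0.4, 0.5])  # toy logistic-regression probabilities

preds = (preds_rf + preds_lr) / 2.0   # array([0.8 , 0.3 , 0.55])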
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier  # alternative classifier
from sklearn.svm import SVC

classifier = SVC()
print(trainingNpArray.shape)

# First 10 columns are features, column 10 is the label
trainInput = trainingNpArray[:, 0:10]
trainOutput = trainingNpArray[:, 10]

print("training")
classifier.fit(trainInput, trainOutput)

testInput = testingNpArray[:, 0:10]
testOutput = testingNpArray[:, 10]

print("Predicting")
predicted = classifier.predict(testInput)

# pos_label selects which class counts as "positive" for the ROC curve
fpr, tpr, thresholds = metrics.roc_curve(testOutput, predicted, pos_label=3)
auc = metrics.auc(fpr, tpr)
print(auc)
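As in the earlier random-forest example, `predict` yields hard labels, which makes for a coarse ROC curve. For a binary problem, `SVC.decision_function` provides continuous margin scores without refitting with `probability=True`. A sketch reusing the names above, assuming label 3 is the positive class of a two-class problem:

# Continuous margin scores trace the full ROC curve
scores = classifier.decision_function(testInput)
fpr, tpr, thresholds = metrics.roc_curve(testOutput, scores, pos_label=3)
print(metrics.auc(fpr, tpr))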
def get_auc(self, labels_true, labels_prob):
    # Area under the ROC curve for predicted probabilities
    fpr, tpr, _ = roc_curve(labels_true, labels_prob)
    return auc(fpr, tpr)
from sklearn import metrics

print(metrics.accuracy_score(y_test, y_pred))  # 92% accuracy

# Task 6
# Map five to 1 and 1 to 0
# (remap the 1s first so the two assignments don't collide)
y_test[y_test == 1] = 0
y_test[y_test == 5] = 1
y_pred_prob = nb.predict_proba(test_dtm)[:, 1]
print(metrics.roc_auc_score(y_test, y_pred_prob))

# Task 7
import matplotlib.pyplot as plt
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')

# Task 8
print(metrics.confusion_matrix(y_test, y_pred))
sensitivity = 126 / float(25 + 126)
specificity = 813 / float(813 + 58)

# Task 9
false_positives = X_test[y_test < y_pred]  # true 0, predicted 1
false_negatives = X_test[y_test > y_pred]  # true 1, predicted 0
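The hard-coded counts in Task 8 can instead be read straight out of the confusion matrix. For binary 0/1 labels, scikit-learn returns the matrix as [[TN, FP], [FN, TP]], so (assuming y_pred has been remapped to the same 0/1 coding as y_test):

tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred).ravel()
sensitivity = tp / float(tp + fn)  # true positive rate
specificity = tn / float(tn + fp)  # true negative rate
print(sensitivity, specificity)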