예제 #1
0
    def draw_roc(self, label_sets, title='', save_path='', show_plot=False):
        """Draw one ROC curve per entry of ``label_sets`` on a single figure.

        Args:
            label_sets: iterable of ``(labels, probas)`` pairs; column 1 of
                ``probas`` is passed to ``roc_curve`` as the score.
            title: text placed above the plot.
            save_path: if non-empty, the figure is written to this path.
            show_plot: if true, the figure is displayed with ``pyplot.show()``.
        """
        # Wipe whatever was drawn on the current figure before plotting.
        pyplot.clf()

        for fold_no, (labels, probas) in enumerate(label_sets, 1):
            false_pos, true_pos, _ = roc_curve(labels, probas[:, 1])
            area = round(auc(false_pos, true_pos), 2)
            legend_text = 'Training fold {0} (area = {1})'.format(fold_no, area)
            pyplot.plot(false_pos, true_pos, label=legend_text)

        # Diagonal reference line from (0, 0) to (1, 1).
        pyplot.plot([0, 1], [0, 1], 'k--')

        # Both axes are clamped to the unit interval.
        pyplot.xlim([0.0, 1.0])
        pyplot.ylim([0.0, 1.0])
        pyplot.xlabel('False Positive Rate')
        pyplot.ylabel('True Positive Rate')
        pyplot.title(title)
        pyplot.legend(loc="lower right")

        if save_path:
            pyplot.savefig(save_path)
        if show_plot:
            pyplot.show()
예제 #2
0
from sklearn.metrics import metrics

# Read one float per line from each CSV. The ``with`` blocks close the files
# (the previous version leaked both handles and shadowed the ``file`` builtin).
# A line that is not a valid float still raises ValueError, as before.
with open("../output/splitpred1.csv") as pred_file:
    numarray = [float(line) for line in pred_file]
with open("../output/answers.csv") as answer_file:
    answerarray = [float(line) for line in answer_file]

# answerarray is passed as the true labels, numarray as the scores;
# label value 1 is treated as the positive class.
fpr, tpr, thresholds = metrics.roc_curve(answerarray, numarray, pos_label=1)
auc = metrics.auc(fpr, tpr)
# Single-argument print(...) produces the same output on Python 2 and 3.
print(auc)
# Column 4 of each array is used as the target; columns 0-3 are the features.
# NOTE(review): `train`, `test` and `train_input` are defined above this
# fragment and are not visible here.
train_output = train[:, 4]

print("train in and out", train_input.shape, train_output.shape)

test_input = test[:, 0:4]
test_output = test[:, 4]

print("test in and out", test_input.shape, test_output.shape)

# Train a classifier (a random forest here; an SVC is the commented alternative)

# create the classifier
classifier = RandomForestClassifier()  # or SVC()
# fit it on the training features and targets
classifier.fit(train_input, train_output)
print(test_output)
# predict a class label for every test row
predicted = classifier.predict(test_input)

print(predicted)

# Calculate the ROC curve, treating label value 2 as the positive class.
# NOTE(review): `predicted` comes from predict(), not predict_proba(), so the
# curve is built from predicted labels rather than scores - confirm intended.
fpr, tpr, thresholds = metrics.roc_curve(test_output, predicted, pos_label=2)
# Calculate the area under the ROC curve
auc = metrics.auc(fpr, tpr)

print(auc)

# predict the test set values
# get our AUC and accuracy
예제 #4
0
from sklearn.metrics import metrics

# Read one float per line from each CSV. The ``with`` blocks close the files
# (the previous version leaked both handles and shadowed the ``file`` builtin).
# A line that is not a valid float still raises ValueError, as before.
with open("../output/splitpred1.csv") as pred_file:
    numarray = [float(line) for line in pred_file]
with open("../output/answers.csv") as answer_file:
    answerarray = [float(line) for line in answer_file]

# answerarray is passed as the true labels, numarray as the scores;
# label value 1 is treated as the positive class.
fpr, tpr, thresholds = metrics.roc_curve(answerarray, numarray, pos_label=1)
auc = metrics.auc(fpr, tpr)
# Single-argument print(...) produces the same output on Python 2 and 3.
print(auc)
예제 #5
0
파일: util.py 프로젝트: jcrudy/higgs
def roc(y, p_hat):
    """Compute the ROC curve for ``y`` against ``p_hat`` and its area.

    Returns:
        A ``(auc, fpr, tpr)`` tuple; the thresholds from ``roc_curve`` are
        discarded.
    """
    false_pos, true_pos, _ = roc_curve(y, p_hat)
    return auc(false_pos, true_pos), false_pos, true_pos
예제 #6
0
파일: main.py 프로젝트: oddskool/amz_eac
def evaluate(y_true, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``y_true``.

    Both inputs are converted to NumPy arrays first; label value 1 is treated
    as the positive class.
    """
    scores = np.array(y_pred)
    truth = np.array(y_true)
    false_pos, true_pos, _ = metrics.roc_curve(truth, scores, pos_label=1)
    area = metrics.auc(false_pos, true_pos)
    return area
예제 #7
0
def doCV():
    """Cross-validate a random-forest / logistic-regression averaging ensemble.

    Loads ``train_orig.csv`` and ``test_orig.csv`` through ``data_io``, dumps
    the random-forest feature matrices to ``num_train.dat`` and
    ``num_test.dat``, one-hot encodes the logistic-regression features, and
    then runs ``niter`` shuffle-split folds over the training rows. In every
    fold both models are fitted on the training part, column 1 of their
    ``predict_proba`` output for the held-out part is averaged, and the fold
    AUC is printed. The mean AUC over all folds is printed at the end.

    Returns nothing; results are reported on stdout.

    NOTE(review): ``.ix``, ``compute_importances`` and
    ``cross_validation.ShuffleSplit(n, n_iter=...)`` are legacy
    pandas / scikit-learn APIs - confirm the pinned versions before upgrading.
    """
    SEED = 42
    rnd = np.random.RandomState(SEED)

    model_lr = linear_model.LogisticRegression(C=3)
    model_rf = ensemble.RandomForestClassifier(
        n_estimators=10, min_samples_split=10, compute_importances=False, n_jobs=2, random_state=rnd, verbose=2
    )

    print("loading data for random forest...")
    y, X = data_io.load_data_pd("train_orig.csv", use_labels=True)
    _, X_test = data_io.load_data_pd("test_orig.csv", use_labels=False)

    # Build the numeric feature matrices, drop their first column, and
    # persist them to disk.
    xtrain = getRFX(X)
    xtest = getRFX_test(X_test)
    xtrain = xtrain[:, 1:]
    xtest = xtest[:, 1:]

    xtrain.dump("num_train.dat")
    xtest.dump("num_test.dat")
    print("dumped..!")
    print("loading data for logistic regression...")
    ysp, Xsp = data_io.load_data("train_orig.csv")
    _, X_testsp = data_io.load_data("test_orig.csv", use_labels=False)
    # === one-hot encoding === #
    # we want to encode the category IDs encountered both in
    # the training and the test set, so we fit the encoder on both
    encoder = preprocessing.OneHotEncoder()
    encoder.fit(np.vstack((Xsp, X_testsp)))
    Xsp = encoder.transform(Xsp)  # Returns a sparse matrix (see scipy.sparse)
    X_testsp = encoder.transform(X_testsp)

    print("starting cross validation...")
    # ShuffleSplit is sized by the number of training rows (not features).
    n_samples = X.shape[0]
    niter = 10
    cv = cross_validation.ShuffleSplit(n_samples, n_iter=niter, test_size=0.2, random_state=rnd)
    mean_auc = 0.0
    for fold, (train, test) in enumerate(cv, 1):
        xtrain = X.ix[train]
        ytrain = y[train]
        xtest = X.ix[test]
        ytest = y[test]

        xtrain_sp = Xsp[train]
        # The held-out rows must come from the encoded *training* matrix:
        # `test` holds indices into the training data, so indexing X_testsp
        # (the separate test file) would score predictions for unrelated rows
        # against `ytest`.
        xtest_sp = Xsp[test]
        ytrainsp = ysp[train]

        xtrain = getRFX(xtrain)
        xtest = getRFX_test(xtest)
        xtrain = xtrain[:, 1:]
        xtest = xtest[:, 1:]

        print("fitting random forest....")
        model_rf.fit(xtrain, ytrain)
        preds_rf = model_rf.predict_proba(xtest)[:, 1]

        print("fitting logistic regression...")
        model_lr.fit(xtrain_sp, ytrainsp)
        preds_lr = model_lr.predict_proba(xtest_sp)[:, 1]

        # Element-wise average of the two models' probabilities.
        preds = (preds_rf + preds_lr) / 2.0

        fpr, tpr, _ = metrics.roc_curve(ytest, preds)
        roc_auc = metrics.auc(fpr, tpr)
        print("AUC (fold %d/%d): %f" % (fold, niter, roc_auc))
        mean_auc += roc_auc
    # Formatted so the line matches the legacy `print "Mean AUC: ", value`
    # output (two spaces before the value) on both Python 2 and Python 3.
    print("Mean AUC:  %s" % (mean_auc / niter,))
예제 #8
0
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import metrics
from sklearn.svm import SVC

# Fit an SVC on columns 0-9 of the training array (column 10 is the target),
# then score its predictions on the testing array with a ROC curve / AUC.
classifier = SVC()

print(trainingNpArray.shape)
trainInput = trainingNpArray[:, :10]
trainOutput = trainingNpArray[:, 10]

print("training")
classifier.fit(trainInput, trainOutput)

testInput = testingNpArray[:, :10]
testOutput = testingNpArray[:, 10]

print("Predicting")
predicted = classifier.predict(testInput)

# Label value 3 is treated as the positive class.
# NOTE(review): the original remark said that raising pos_label requires
# reading more input lines - presumably so that label appears in the data;
# confirm against the loading code.
fpr, tpr, thresholds = metrics.roc_curve(testOutput, predicted, pos_label=3)
auc = metrics.auc(fpr, tpr)

print(auc)






예제 #9
0
 def get_auc(self, labels_true, labels_prob):
     """Return the area under the ROC curve of ``labels_prob`` vs ``labels_true``."""
     false_pos, true_pos, _thresholds = roc_curve(labels_true, labels_prob)
     area = auc(false_pos, true_pos)
     return area
from sklearn.metrics import metrics

# Evaluation script for a fitted classifier `nb`.
# NOTE(review): `y_test`, `y_pred`, `nb`, `test_dtm` and `X_test` are defined
# above this fragment and are not visible here.
print metrics.accuracy_score(y_test, y_pred)
# Recorded result: 92% accuracy
# Task 6
# Remap the labels in place: 1 -> 0 first, then 5 -> 1. The order matters;
# mapping 5 -> 1 first would send those rows to 0 as well.
y_test[y_test ==1]  = 0
y_test[y_test == 5 ] = 1


# Column 1 of predict_proba is used as the score for the AUC and ROC curve.
y_pred_prob = nb.predict_proba(test_dtm)[:,1]
print metrics.roc_auc_score(y_test, y_pred_prob)
# Task 7: plot the ROC curve on the unit square
import matplotlib.pyplot as plt
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')

# Task 8
# NOTE(review): y_test was remapped to 0/1 above, but no remapping of y_pred
# is visible here; if y_pred still holds the original 1/5 labels, this
# confusion matrix and the Task 9 comparisons mix two label spaces - confirm.
print metrics.confusion_matrix(y_test, y_pred)
# NOTE(review): the counts below are hard-coded, presumably copied from a
# printed confusion matrix - verify they match the current data.
sensitivity = 126 / float(25 + 126)
specificity = 813/ float(813 + 58)
# Task 9
# Rows where the true label is lower than the predicted label.
false_positives = X_test[y_test < y_pred] # false positives

# Rows where the true label is higher than the predicted label.
false_negatives = X_test[y_test > y_pred] # false negatives