import numpy as np
from sklearn import metrics


def interpolated_precision_recall_curve(queries_ranking, queries_similarities, relevants):
    queries_count = np.shape(queries_ranking)[0]
    # Extended-precision accumulator; np.float128 is unavailable on some
    # platforms, in which case np.float64 works equally well here.
    interpolated_precision = np.zeros(11, dtype=np.float128)
    for qindex in range(queries_count):
        tp = 0
        precision, recall = [0], [0]
        relevants_count = np.shape(np.nonzero(relevants[qindex]))[1]
        retrieved_count = 1
        for ranki in queries_ranking[qindex]:
            if (queries_similarities[qindex][ranki] > 0) and (relevants[qindex][ranki] == 1):
                tp += 1
            precisioni = tp / retrieved_count  # true division (Python 3)
            if relevants_count == 0:
                recalli = 1
            else:
                recalli = tp / relevants_count
            retrieved_count += 1
            precision.append(precisioni)
            recall.append(recalli)
        # The query's 11-point interpolated precision: precision_levels[i] is
        # the maximum precision observed at any recall >= i/10.
        precision_levels = []
        for i in range(11):
            prec_ati = 0
            for j in range(len(recall)):
                if i <= recall[j] * 10:
                    prec_ati = max(prec_ati, precision[j])
            precision_levels.append(prec_ati)
            interpolated_precision[i] += prec_ati / queries_count
        del precision
        del recall
    auc = float("{0:1.4f}".format(metrics.auc(
        [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
        interpolated_precision)))
    return interpolated_precision, auc
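# A minimal usage sketch for the function above. The toy arrays below are
# hypothetical, shaped the way the function indexes them: one row per query,
# rankings holding document indices, similarities and relevance judgements
# indexed by document.
rankings = np.array([[2, 0, 1]])            # ranked document indices per query
similarities = np.array([[0.2, 0.0, 0.9]])  # query-document similarity scores
relevance = np.array([[1, 0, 1]])           # binary relevance judgements

levels, pr_auc = interpolated_precision_recall_curve(rankings, similarities, relevance)
print(levels)  # 11-point interpolated precision, averaged over queries
print(pr_auc)  # area under the interpolated curve, rounded to 4 decimals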
from matplotlib import pyplot
from sklearn.metrics import roc_curve, auc


def draw_roc(self, label_sets, title='', save_path='', show_plot=False):
    # Compute the ROC curve and the area under the curve for each label set.
    pyplot.clf()
    for i, (labels, probas) in enumerate(label_sets):
        fpr, tpr, _ = roc_curve(labels, probas[:, 1])
        roc_auc = auc(fpr, tpr)
        # Plot the ROC curve for this fold.
        pyplot.plot(fpr, tpr, label='Training fold {0} (area = {1})'.format(
            i + 1, round(roc_auc, 2)))
    pyplot.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    pyplot.xlim([0.0, 1.0])
    pyplot.ylim([0.0, 1.0])
    pyplot.xlabel('False Positive Rate')
    pyplot.ylabel('True Positive Rate')
    pyplot.title(title)
    pyplot.legend(loc="lower right")
    if save_path:
        pyplot.savefig(save_path)
    if show_plot:
        pyplot.show()
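# A hedged usage sketch for draw_roc: it expects an iterable of
# (labels, probas) pairs where probas has one column per class (column 1 is
# the positive-class score). The random fold data below is illustrative only.
import numpy as np

rng = np.random.RandomState(0)
label_sets = []
for _ in range(3):  # e.g. three training folds
    labels = rng.randint(0, 2, size=50)
    scores = rng.rand(50)
    label_sets.append((labels, np.column_stack([1 - scores, scores])))

# `self` is unused by the method body, so passing None works for a quick test.
draw_roc(None, label_sets, title='ROC per fold', show_plot=True)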
from sklearn import metrics

numarray = []
with open("../output/splitpred1.csv") as predfile:
    for line in predfile:
        numarray.append(float(line))

answerarray = []
with open("../output/answers.csv") as answerfile:
    for line in answerfile:
        answerarray.append(float(line))

fpr, tpr, thresholds = metrics.roc_curve(answerarray, numarray, pos_label=1)
auc = metrics.auc(fpr, tpr)
print(auc)
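# The same two files can be read in one call each; a sketch assuming, as
# above, one float per line:
import numpy as np

numarray = np.loadtxt("../output/splitpred1.csv")
answerarray = np.loadtxt("../output/answers.csv")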
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

train_input = train[:, 0:4]
train_output = train[:, 4]
print("train in and out", train_input.shape, train_output.shape)
test_input = test[:, 0:4]
test_output = test[:, 4]
print("test in and out", test_input.shape, test_output.shape)

# Create the classifier: a random forest here, though an SVC (Support Vector
# Classifier) could be swapped in the same way.
classifier = RandomForestClassifier()  # or SVC()
# Learn the training data.
classifier.fit(train_input, train_output)
print(test_output)

# Predict the output of the test input.
predicted = classifier.predict(test_input)
print(predicted)

# Calculate the ROC curve from the hard class predictions, then the area
# under it (the AUC).
fpr, tpr, thresholds = metrics.roc_curve(test_output, predicted, pos_label=2)
auc = metrics.auc(fpr, tpr)
print(auc)
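# Hard 0/1 predictions give a ROC curve with a single operating point. A
# minimal sketch of the usual alternative: rank the test points by the
# positive-class probability instead (this assumes the labels include class
# 2, matching pos_label above).
probas = classifier.predict_proba(test_input)
pos_col = list(classifier.classes_).index(2)  # column holding class 2
fpr, tpr, thresholds = metrics.roc_curve(test_output, probas[:, pos_col], pos_label=2)
print(metrics.auc(fpr, tpr))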
from sklearn.metrics import roc_curve, auc


def roc(y, p_hat):
    # ROC curve and its area for true labels y and predicted scores p_hat.
    fpr, tpr, thresholds = roc_curve(y, p_hat)
    roc_auc = auc(fpr, tpr)
    return roc_auc, fpr, tpr
import numpy as np
from sklearn import metrics


def evaluate(y_true, y_pred):
    # ROC AUC for binary labels y_true and predicted scores y_pred.
    y_pred = np.array(y_pred)
    y_true = np.array(y_true)
    fpr, tpr, _thresholds = metrics.roc_curve(y_true, y_pred, pos_label=1)
    return metrics.auc(fpr, tpr)
def doCV():
    SEED = 42
    rnd = np.random.RandomState(SEED)
    model_lr = linear_model.LogisticRegression(C=3)
    model_rf = ensemble.RandomForestClassifier(
        n_estimators=10,
        min_samples_split=10,
        n_jobs=2,
        random_state=rnd,
        verbose=2
    )

    print "loading data for random forest..."
    y, X = data_io.load_data_pd("train_orig.csv", use_labels=True)
    _, X_test = data_io.load_data_pd("test_orig.csv", use_labels=False)
    xtrain = getRFX(X)
    xtest = getRFX_test(X_test)
    xtrain = xtrain[:, 1:]
    xtest = xtest[:, 1:]
    xtrain.dump("num_train.dat")
    xtest.dump("num_test.dat")
    print "dumped..!"

    print "loading data for logistic regression..."
    ysp, Xsp = data_io.load_data("train_orig.csv")
    y_testsp, X_testsp = data_io.load_data("test_orig.csv", use_labels=False)

    # === one-hot encoding === #
    # We want to encode the category IDs encountered both in the training and
    # the test set, so we fit the encoder on both.
    encoder = preprocessing.OneHotEncoder()
    encoder.fit(np.vstack((Xsp, X_testsp)))
    Xsp = encoder.transform(Xsp)  # returns a scipy.sparse matrix
    X_testsp = encoder.transform(X_testsp)

    print "starting cross validation..."
    nSamples = X.shape[0]
    niter = 10
    cv = cross_validation.ShuffleSplit(nSamples, n_iter=niter, test_size=0.2,
                                       random_state=rnd)
    mean_auc = 0.0
    for i, (train, test) in enumerate(cv):
        xtrain = X.ix[train]
        ytrain = y[train]
        xtest = X.ix[test]
        ytest = y[test]
        # Both CV folds index the encoded training matrix.
        xtrain_sp = Xsp[train]
        xtest_sp = Xsp[test]
        ytrainsp = ysp[train]
        xtrain = getRFX(xtrain)
        xtest = getRFX_test(xtest)
        xtrain = xtrain[:, 1:]
        xtest = xtest[:, 1:]

        print "fitting random forest...."
        model_rf.fit(xtrain, ytrain)
        preds_rf = model_rf.predict_proba(xtest)[:, 1]

        print "fitting logistic regression..."
        model_lr.fit(xtrain_sp, ytrainsp)
        preds_lr = model_lr.predict_proba(xtest_sp)[:, 1]

        # Simple ensemble: average the two models' positive-class probabilities.
        preds = [np.mean(x) for x in zip(preds_rf, preds_lr)]
        fpr, tpr, _ = metrics.roc_curve(ytest, preds)
        roc_auc = metrics.auc(fpr, tpr)
        print "AUC (fold %d/%d): %f" % (i + 1, niter, roc_auc)
        mean_auc += roc_auc

    print "Mean AUC: ", mean_auc / niter
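# The cross_validation module above is legacy API (removed in scikit-learn
# 0.20). A minimal sketch of the same fold loop against the modern
# model_selection API, on synthetic stand-in data:
import numpy as np
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

rng = np.random.RandomState(42)
X = rng.rand(200, 5)
y = rng.randint(0, 2, size=200)

cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
mean_auc = 0.0
for train, test in cv.split(X):
    model = LogisticRegression(C=3).fit(X[train], y[train])
    preds = model.predict_proba(X[test])[:, 1]
    fpr, tpr, _ = roc_curve(y[test], preds)
    mean_auc += auc(fpr, tpr)
print(mean_auc / 10)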
def get_auc(self, labels_true, labels_prob):
    fpr, tpr, _ = roc_curve(labels_true, labels_prob)
    return auc(fpr, tpr)
def interpolated_precision_recall_curve(self, queries_ranking, queries_similarities, relevants):
    # Method form of the 11-point interpolated precision-recall computation.
    queries_count = np.shape(queries_ranking)[0]
    interpolated_precision = np.zeros(11, dtype=np.float128)
    for qindex in range(queries_count):
        tp = 0
        precision, recall = [0], [0]
        relevants_count = np.shape(np.nonzero(relevants[qindex]))[1]
        retrieved_count = 1
        for ranki in queries_ranking[qindex]:
            if (queries_similarities[qindex][ranki] > 0) and (relevants[qindex][ranki] == 1):
                tp += 1
            precisioni = tp / retrieved_count
            if relevants_count == 0:
                recalli = 1
            else:
                recalli = tp / relevants_count
            retrieved_count += 1
            precision.append(precisioni)
            recall.append(recalli)
        # The query's 11-point interpolated precision: precision_levels[i] is
        # the maximum precision observed at any recall >= i/10.
        precision_levels = []
        for i in range(11):
            prec_ati = 0
            for j in range(len(recall)):
                if i <= recall[j] * 10:
                    prec_ati = max(prec_ati, precision[j])
            precision_levels.append(prec_ati)
            interpolated_precision[i] += prec_ati / queries_count
        del precision
        del recall
    auc = float("{0:1.4f}".format(metrics.auc(
        [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
        interpolated_precision)))
    return interpolated_precision, auc
def blTopic():
    (options, args) = parser.parse_args(sys.argv[1:])  #@UnusedVariable
    dataset = options.dataset
    kernelType = options.kernelType
    nFold = options.nFold
    nCodeword = options.nCodeword
    dataPath = rootDir + dataset + bofDir
    catmap = getCatMap(dataset)
    catList = catmap.keys()
    dataext = str(nCodeword) + bofext
    nCategory = len(catList)
    perfMean = np.zeros(nCategory)
    perfStd = np.zeros(nCategory)
    for iCat, catname in enumerate(catList):
        print catname
        # Read the category data, which will be the positive class.
        fname = dataPath + catname + dataext
        catpos = np.genfromtxt(fname, dtype=np.int)
        catpos = catpos[:, :nCodeword + 1]
        catpos[:, nCodeword] = 1
        # Read the data of the remaining categories as the negative class.
        firstvisit = True
        for cats in catList:
            if cats != catname:
                fname = dataPath + cats + dataext
                if firstvisit:
                    catneg = np.genfromtxt(fname, dtype=np.int)
                    firstvisit = False
                else:
                    catneg = np.concatenate(
                        (catneg, np.genfromtxt(fname, dtype=np.int)), axis=0)
        # Sample the negative data down to the same size as the positive.
        nPos = catpos.shape[0]
        nNeg = catneg.shape[0]
        catneg = catneg[np.random.randint(0, nNeg, nPos), :]
        catneg = catneg[:, :nCodeword + 1]
        catneg[:, nCodeword] = 0
        # Combine positive and negative data.
        data = np.concatenate((catpos, catneg), axis=0)
        # Shuffle the rows to aid in random selection of train and test.
        np.random.shuffle(data)
        X = data[:, :nCodeword]
        y = data[:, nCodeword]
        clfParamList = {
            'kernel': kernelType,
            'gamma': 1e-3,
            'C': 1,
            'degree': 4,
            'probability': True,
            'shrinking': True,
            'cache_size': 1000
        }
        classifier = SVC(**clfParamList)
        cv = StratifiedKFold(y, k=nFold)
        avgprec = np.zeros(nFold)
        for icv, (train, test) in enumerate(cv):
            clf = classifier.fit(X[train], y[train])
            probas_ = clf.predict_proba(X[test])
            precision, recall, thresholds = precision_recall_curve(
                y[test], probas_[:, 1])  #@UnusedVariable
            avgprec[icv] = auc(recall, precision)
        perfMean[iCat] = np.mean(avgprec)
        perfStd[iCat] = np.std(avgprec)
    return [perfMean, perfStd]
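# StratifiedKFold's `k=` signature above is legacy scikit-learn. A minimal
# modern sketch of the same per-fold precision-recall evaluation, on
# synthetic stand-in data, using average_precision_score for the area under
# the precision-recall curve:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import average_precision_score

rng = np.random.RandomState(0)
X = rng.rand(100, 20)
y = rng.randint(0, 2, size=100)

skf = StratifiedKFold(n_splits=5)
avgprec = []
for train, test in skf.split(X, y):
    clf = SVC(kernel='rbf', gamma=1e-3, C=1, probability=True).fit(X[train], y[train])
    probas_ = clf.predict_proba(X[test])[:, 1]
    avgprec.append(average_precision_score(y[test], probas_))
print(np.mean(avgprec), np.std(avgprec))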