def do_cross_validation(data, k, target, algorithm, estimators, name, outdir):
    kf = cross_validation.StratifiedKFold(target, k)
    auc_list = []
    rocs = []
    mean_roc = []
    cut_value = 0.0
    predictions = [None] * len(target)
    for train_index, test_index in kf:
        data_train = [data[index] for index in train_index]
        data_test = [data[index] for index in test_index]
        target_train = [target[index] for index in train_index]
        target_test = [target[index] for index in test_index]
        if algorithm == 'svm':
            predicted = train_svm(data_train, target_train, data_test)
        else:
            predicted = train_random_forest(data_train, target_train,
                                            data_test, estimators)
        cut_value += cut_at(target_test, predicted)
        # Remember each test sample's index, positive-class probability and
        # true label so all folds' predictions can be saved after the loop.
        for x in xrange(len(test_index)):
            predictions[test_index[x]] = (test_index[x], predicted[x, 1],
                                          target_test[x])
        roc = pyroc.ROCData([(target_test[i], predicted[i, 1])
                             for i in xrange(len(predicted))])
        rocs.append(roc)
        mean_roc += [(target_test[i], predicted[i, 1])
                     for i in xrange(len(predicted))]
        auc_list.append(roc.auc())
    # Average the per-fold AUCs and add one ROC built over all folds' points.
    mean_auc = sum(auc_list) / len(auc_list)
    rocs.append(pyroc.ROCData(mean_roc, 'r-'))
    print "Averaged AUC: %f" % mean_auc
    print "Averaged cut_value: %f" % (cut_value / k)
    save_predictions(predictions, outdir)
    fig_title = 'ROC Curve for %s on %s \n (mean area = %0.2f)' % (
        algorithm, name, mean_auc)
    plot_roc(rocs, fig_title, outdir)
    return mean_auc, cut_value / k
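# Hedged usage sketch (not from the original source) of how the driver above
# might be invoked. `load_features` is a hypothetical loader; the helpers
# train_svm/train_random_forest, cut_at, save_predictions and plot_roc are
# assumed to be defined elsewhere in this module, as the function requires.
data = load_features('features.csv')  # hypothetical helper
target = [1] * 50 + [0] * 50          # balanced binary labels
mean_auc, cut = do_cross_validation(data, 5, target, 'rf', 100,
                                    'toy dataset', 'results/')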
def do_cross_validation(pos1, neg1, pos2, neg2, k, algorithm, estimators,
                        name, outdir):
    # TODO: shrink k when the dataset is small (len(data) < 20*k)?
    # Balance the two classes down to the size of the smallest input set.
    len_n = min(pos1.shape[0], pos2.shape[0], neg1.shape[0], neg2.shape[0])
    target = [1] * len_n + [0] * len_n
    data1, data2 = join(pos1, neg1, pos2, neg2)
    kf = cross_validation.StratifiedKFold(target, k)
    auc_list = []
    predictions = []
    mean_roc = []
    cut_value = 0.0
    for train_index, test_index in kf:
        data_train = [data1[index] for index in train_index]
        data_test = [data1[index] for index in test_index]
        data_train2 = [data2[index] for index in train_index]
        data_test2 = [data2[index] for index in test_index]
        target_train = [target[index] for index in train_index]
        target_test = [target[index] for index in test_index]
        if algorithm == 'svm':  # TODO: SVM path only uses the first view.
            predicted = train_svm(data_train, target_train, data_test)
        else:
            # Train one forest per view and combine their predictions.
            predicted = train_random_forest(data_train, target_train,
                                            data_test, estimators)
            predicted2 = train_random_forest(data_train2, target_train,
                                             data_test2, estimators)
            predicted = combine_predictions(predicted, predicted2)
        cut_value += cut_at(target_test, predicted)
        roc = pyroc.ROCData([(target_test[i], predicted[i][1])
                             for i in xrange(len(predicted))])
        predictions.append(roc)
        mean_roc += [(target_test[i], predicted[i][1])
                     for i in xrange(len(predicted))]
        auc_list.append(roc.auc())
    mean_auc = sum(auc_list) / len(auc_list)
    predictions.append(pyroc.ROCData(mean_roc, 'r-'))
    print "Averaged AUC: %f" % mean_auc
    print "Averaged cut_value: %f" % (cut_value / k)
    fig_title = 'ROC Curve for %s on %s \n (mean area = %0.2f)' % (
        algorithm, name, mean_auc)
    #plot_roc(predictions, fig_title, outdir)
    return mean_auc, cut_value / k
def testBupaData(self):
    X, Y = load_bupa_dataset()
    classifier = AdaBoost(DecisionStump)
    for t in [100, 200, 300, 400, 500]:
        score = classifier.test_on_training_set(X, Y, t)
        roc = pyroc.ROCData(zip(Y, score))
        auc = roc.auc()
        print auc
        self.failUnless(auc > .9)
def auc2(pos, neg):
    roc = pyroc.ROCData([(1, p) for p in pos] + [(0, n) for n in neg])
    return roc.auc()
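# Hedged usage sketch for auc2() above (not part of the original source):
# any two score sequences work, with higher scores meaning "more positive".
# Perfectly separated toy scores should give an AUC of 1.0.
pos_scores = [0.9, 0.8, 0.75]  # made-up positive-class scores
neg_scores = [0.3, 0.2, 0.1]   # made-up negative-class scores
print auc2(pos_scores, neg_scores)  # 1.0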
def predict(datapos, dataneg, class_dir, outdir):
    try:
        class_filename = glob(RESULTSPATH + class_dir + '/*')[0]
    except IndexError:
        # Fail loudly if no classifier file was found.
        raise AssertionError('No classifier in %s' % class_dir)
    print "Predicting using classifier from ", class_filename
    classifier = pickle.load(open(class_filename, 'rb'))
    data, target = train.join_and_balance(datapos, dataneg, False)
    data, names = my_transpose(data)
    predicted = classifier.predict_proba(data)
    roc = pyroc.ROCData([(target[i], predicted[i, 1])
                         for i in xrange(len(predicted))])
    train.save_predictions([(i, predicted[i, 1], target[i])
                            for i in xrange(len(predicted))], outdir)
    print "AUC=", roc.auc()
    #plot roc ?
    return [roc.auc(), 0]
# Training-loop fragment: trX/trY, vX/vY, lr, params, min_error,
# batches_seen, end_momentum etc. are defined earlier in the script.
max_AUC = 0.01
best_epoch = 0
for i in range(1000):
    for start, end in zip(range(0, len(trX), 100), range(100, len(trX), 100)):
        cost = train(trX[start:end], trY[start:end])
        batches_seen += 1
        # Learning rate decay
        lr.set_value(floatX(np.amax((init_lr / (decay_factor ** batches_seen),
                                     min_lr))))
    pred = predict(vX)
    current_epoch.set_value(floatX(i))
    this_error = 1 - np.mean(vY == pred[1])
    if this_error < min_error:
        min_error = this_error
    this_AUC = pyroc.ROCData(zip(vY, pred[0])).auc()
    if this_AUC > max_AUC:
        max_AUC = this_AUC
        best_epoch = i
        # Checkpoint the best parameters seen so far.
        f = open(os.path.basename(__file__) + '_best.pkl', 'wb')
        for p in params:
            cPickle.dump(p.get_value(), f, protocol=cPickle.HIGHEST_PROTOCOL)
        f.close()
    # Early stopping: quit once the momentum ramp-up is over and the AUC
    # has not improved for 10 epochs.
    if i > end_momentum and (i - best_epoch) > 10:
        break
    print "Epoch {0} \t Error={1}".format(i, this_error)
    print " \t AUC={0}".format(this_AUC)
    print " \t Learning rate={0}".format(lr.get_value())
    if not (i % 5):
        print " \t \t min Error={0} \t max AUC={1}".format(min_error, max_AUC)
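# Hedged companion sketch (not in the original source): the checkpoint loop
# above pickles each parameter in order, so a restore must read them back in
# the same order. Assumes `params` holds Theano shared variables, as the
# get_value/set_value calls suggest.
f = open(os.path.basename(__file__) + '_best.pkl', 'rb')
for p in params:
    p.set_value(cPickle.load(f))
f.close()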
#print mod.predict(Xte).ravel() == yte.ravel()
#if fold == 4: break
y_pred = np.concatenate(y_pred)
y_true = np.concatenate(y_true)
y_prob_pred = np.concatenate(y_prob_pred)
p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
fpr, tpr, thresholds = roc_curve(y_true, y_prob_pred)
roc_auc = auc(fpr, tpr)
print "Precisions: ", p, p.mean(), "\n", \
      "Recalls: ", r, r.mean(), "\n", \
      "F: ", f, f.mean(), "\n", \
      "AUC: ", roc_auc, "\n", \
      "Support: ", s

import pyroc
sample = np.c_[y_true, y_prob_pred]
roc = pyroc.ROCData(sample)  # create the ROC object
roc.auc()  # get the area under the curve
# 0.9829545454545454
roc.plot('ROC Curve (AUC= %.2f)' % roc.auc(), True, True)  # plot the ROC curve

############################################################################
## Permutations + 10 CV
############################################################################
if MODE == "permutations":
    k, l, g = ALPHA * np.array([1 - L1_RATIO, L1_RATIO, 0])
    mod = LogisticRegressionL1L2TV(k=k, l=l, g=g, A=A, penalty_start=1,
filename = sys.argv[1]
dataname = 'higgs'

# Load pylearn2 model object.
print 'Loading model...'
model = pkl.load(open(filename, 'rb'))

# Determine which features were used to train the model from the filename.
if 'all' in filename or 'True' in filename:
    derived_feat = True
elif 'raw' in filename or 'False' in filename:
    derived_feat = False
else:
    assert 'only' in filename
    derived_feat = 'only'

print 'Loading dataset %s...' % dataname
benchmark = 1 if dataname == 'higgs' else 2
dataset = physics.PHYSICS(benchmark=benchmark, which_set='test',
                          derived_feat=derived_feat)

# Predict.
print 'Making predictions...'
Yhat = fprop(model, dataset.X)

# Compute area under the ROC curve.
print 'Computing AUC...'
auc = pyroc.ROCData(zip(dataset.y, Yhat)).auc()
error_test = model.monitor.channels['test_y_kl'].val_record[-1]
print 'AUC=%f, Error=%f, Dataset=%s, Model File=%s' % (auc, error_test,
                                                       dataname, filename)
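# Hedged sketch (an assumption, not the original definition) of the fprop()
# helper used above: with pylearn2, a forward pass is typically compiled
# from the model's symbolic fprop graph along these lines.
import theano
def fprop(model, X):
    Xb = model.get_input_space().make_theano_batch()  # symbolic input batch
    return theano.function([Xb], model.fprop(Xb))(X)  # compile and apply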
def auc(target, predicted):
    # `predicted` is expected to be a two-column probability array
    # (e.g. predict_proba output); column 1 is the positive class.
    roc = pyroc.ROCData([(target[i], predicted[i, 1])
                         for i in xrange(len(predicted))])
    return roc.auc()
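# Hedged usage sketch for auc() above (not from the original source): the
# made-up array mimics sklearn-style predict_proba output, where column 1
# holds the positive-class probability.
import numpy as np
proba = np.array([[0.2, 0.8], [0.7, 0.3], [0.4, 0.6], [0.9, 0.1]])
labels = [1, 0, 1, 0]
print auc(labels, proba)  # 1.0 on this perfectly ranked toy data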
# Read the ground-truth labels: one "<label>\t<id>" pair per line.
truth = open("Paper/pred_results/truth.csv")
true_vals = {}
for line in truth:
    if line.strip() == '':
        continue
    vals = line.strip().split("\t")
    true_vals[vals[1]] = int(vals[0])

# Build one ROC curve per comma-separated prediction file given on the
# command line, matching predictions to the truth table by id.
roclist = []
labels = []
for fname in sys.argv[1:]:
    infile = open(fname)
    data = []
    for line in infile:
        if line.strip() == '':
            continue
        vals = line.strip().split(",")
        if not vals[0] in true_vals:
            continue
        data.append((true_vals[vals[0]], float(vals[1])))
    roc = pyroc.ROCData(data)
    print fname + ": " + str(roc.auc())
    roclist.append(roc)
    # Derive a short legend label from the file name.
    labels.append(";".join(
        [x for x in fname.split("/")[-1].split("_")[0:-1] if x != ""]))

pyroc.plot_multiple_roc(roclist, labels=labels)