def best_c_logistic_regression(self, train_X, train_y, test_X, test_y,
                               c_list=np.arange(0.1, 1, 0.1), penalty='l2'):
    # Sweep C, score each fit by test-set AUC, then refit on the best value.
    auc = []
    for c in c_list:
        print(c)
        # all_solvers = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
        LR = LogisticRegression(C=c, penalty=penalty, solver='liblinear').fit(
            np.mat(train_X), np.ravel(train_y))
        pred = LR.predict_proba(np.mat(test_X))[:, 1]
        test_auc = roc_auc_score(test_y, pred)
        auc.append(test_auc)
    position = np.argmax(auc)
    c_best = c_list[position]
    print('max auc: ', max(auc))
    LR = LogisticRegression(C=c_best, penalty=penalty, solver='liblinear').fit(
        np.mat(train_X), np.ravel(train_y))
    # A GridSearchCV-based variant is sketched after this function.
    # parameters = {'C': c_list}
    # lr = GridSearchCV(n_jobs=-1, estimator=LogisticRegression(penalty=penalty),
    #                   param_grid=parameters, scoring='f1', cv=5)
    # LR.fit(train_X, train_y)
    # best_c,
    return LR
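# The commented GridSearchCV lines above sketch an alternative; a minimal
# self-contained version with the same AUC criterion (the function name and
# cv=5 are illustrative assumptions, not part of the original):
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import numpy as np

def best_c_via_gridsearch(train_X, train_y, c_list=np.arange(0.1, 1, 0.1),
                          penalty='l2'):
    search = GridSearchCV(estimator=LogisticRegression(penalty=penalty,
                                                       solver='liblinear'),
                          param_grid={'C': list(c_list)},
                          scoring='roc_auc', cv=5, n_jobs=-1)
    search.fit(train_X, np.ravel(train_y))
    print('max cv auc: ', search.best_score_)
    return search.best_estimator_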
def get_acc_auc_kfold(X, Y, k=5):
    # Get the train/test indices for each fold, train the classifier on each
    # fold, and report the mean accuracy and mean AUC over all folds.
    kf = KFold(n_splits=k)  # modern API; KFold(len(Y), k) is the removed old form
    acc = []
    auc = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        Y_pred = models.logistic_regression_pred(X_train, Y_train, X_test)
        '''
        false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_test, Y_pred)
        roc_auc = sklearn.metrics.roc_auc_score(Y_test, Y_pred)
        plt.title('Receiver Operating Characteristic')
        plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f' % roc_auc)
        plt.legend(loc='lower right')
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([-0.1, 1.2])
        plt.ylim([-0.1, 1.2])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.show()
        '''
        acc.append(sklearn.metrics.accuracy_score(Y_test, Y_pred))
        auc.append(sklearn.metrics.roc_auc_score(Y_test, Y_pred))
    acc_mean = mean(acc)
    auc_mean = mean(auc)
    return acc_mean, auc_mean
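# An equivalent of get_acc_auc_kfold via sklearn's cross_val_score -- a minimal
# sketch, assuming a plain sklearn estimator stands in for
# models.logistic_regression_pred (note: scoring='roc_auc' here scores by
# predicted probabilities rather than by hard 0/1 predictions as above).
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

def get_acc_auc_kfold_sklearn(X, Y, k=5):
    cv = KFold(n_splits=k)
    logreg = LogisticRegression()
    acc = cross_val_score(logreg, X, Y, cv=cv, scoring='accuracy')
    auc = cross_val_score(logreg, X, Y, cv=cv, scoring='roc_auc')
    return acc.mean(), auc.mean()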
def get_pairwise_score(model, user_name: str, print_score: bool = False):
    global preprocessingFunction
    auc, accuracy, frr, far = list(), list(), list(), list()
    legal_features = get_legal_test_features_for_user(user_name)
    all_illegal_features = get_all_illegal_test_features_for_user(user_name)
    all_illegal_features = shuffle(all_illegal_features)
    n_fold = all_illegal_features.shape[0] // legal_features.shape[0]
    for illegal_features in np.array_split(all_illegal_features, n_fold):
        pairwise_features = np.vstack((legal_features, illegal_features))
        if preprocessingFunction is not None:
            pairwise_features = preprocessingFunction.transform(
                pairwise_features)
        y_true = np.ones(pairwise_features.shape[0])
        y_true[legal_features.shape[0]:] = -1
        y_score = model.decision_function(pairwise_features)
        # ^-- Signed distance is positive for an inlier and negative for an outlier.
        auc.append(roc_auc_score(y_true, y_score))
        y_score = model.predict(pairwise_features)
        accuracy.append(np.mean(y_score == y_true))
        # Note: both rates are fractions of the whole pairwise set, not of the
        # legal/illegal subsets.
        frr.append(
            np.sum(y_score[:legal_features.shape[0]] == -1)
            / pairwise_features.shape[0])
        far.append(
            np.sum(y_score[-illegal_features.shape[0]:] == 1)
            / pairwise_features.shape[0])
        if print_score:
            print(f"    AUC = {auc[-1]:.2f}\n"
                  f"    ACC = {accuracy[-1]:.2f}\n"
                  f" FRR(I) = {frr[-1]:.2%}\n"   # :.2% renders the fraction as a percentage
                  f"FAR(II) = {far[-1]:.2%}\n")
    return np.mean(auc), np.mean(accuracy), np.mean(frr), np.mean(far)
def train(model, cv_data, intMat, drugMat, targetMat):
    aupr, auc = [], []
    for seed in cv_data.keys():
        for W, test_data, test_label in cv_data[seed]:
            model.fix_model(W, intMat, drugMat, targetMat, seed)
            aupr_val, auc_val = model.evaluation(test_data, test_label)
            aupr.append(aupr_val)
            auc.append(auc_val)
    return np.array(aupr, dtype=np.float64), np.array(auc, dtype=np.float64)
def make_auc(files):
    cols = ['GE-GE', 'GE-MIX', 'GE-TOSHIBA', 'MIX-GE', 'MIX-MIX', 'MIX-TOSHIBA',
            'TOSHIBA-GE', 'TOSHIBA-MIX', 'TOSHIBA-TOSHIBA']
    auc = []
    for file in files:
        # eval() resolves the DataFrame named by the string `file` at runtime.
        auc.append(eval(file).auc.values)
    df = pd.DataFrame(auc).T
    df.columns = cols
    return df
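# eval(file) above resolves each DataFrame from a variable name at runtime,
# which is fragile; passing the frames themselves does the same job explicitly.
# A minimal sketch -- make_auc_from_frames and its arguments are illustrative,
# not part of the original.
import pandas as pd

def make_auc_from_frames(frames, cols):
    # frames: list of DataFrames, each with an 'auc' column, in column order.
    df = pd.DataFrame([f.auc.values for f in frames]).T
    df.columns = cols
    return df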
def deep_learning(posfile, negfile, predfile, fileout):
    acc, sep, sen, mcc, f1, auc, prauc = [], [], [], [], [], [], []
    best = {
        'batch_size': 8.0,
        'drop_out': 0.10680339747442835,
        'hdim': 48.0,
        'l2_reg': 0.00024102301670176588,
        'learning_rate': 0.0012709235952012008,
        'sdim': 32.0,
        'tdim': 11.0
    }
    # Note: the model is built once, so weights carry over between the 10 runs.
    model = get_DNN_model(best)
    earlystopper = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
    for i in range(10):
        X_train, Y_train, X_val, Y_val, X_test, Y_test = get_DNN_data(
            posfile, negfile)
        model.fit(X_train, Y_train,
                  batch_size=2**int(best['batch_size']),
                  epochs=100,
                  shuffle=True,
                  validation_data=(X_val, Y_val),
                  callbacks=[earlystopper])
        predictions = model.predict(X_test)
        rounded = [round(x[0]) for x in predictions]
        pred_train_prob = [x[0] for x in model.predict_proba(X_test)]
        accuracy, specificity, sensitivity, mccvalue, f1value, aucvalue, praucvalue = metrics(
            Y_test, rounded, pred_train_prob, fileout)
        acc.append(accuracy)
        sep.append(specificity)
        sen.append(sensitivity)
        mcc.append(mccvalue)
        f1.append(f1value)
        auc.append(aucvalue)
        prauc.append(praucvalue)
    fileout.write("DNN\n"
                  + "Accuracy_mean: " + str(np.mean(acc)) + "\n"
                  + "Specificity_mean: " + str(np.mean(sep)) + "\n"
                  + "Sensitivity_mean: " + str(np.mean(sen)) + "\n"
                  + "MCC_mean: " + str(np.mean(mcc)) + "\n"
                  + "Fscore_mean: " + str(np.mean(f1)) + "\n"
                  + "AUC_mean: " + str(np.mean(auc)) + "\n"
                  + "PRAUC_mean: " + str(np.mean(prauc)) + "\n")
    X_train, Y_train, X_val, Y_val, X_pred, Info_pred = get_DNN_pred_data(
        posfile, negfile, predfile)
    model.fit(X_train, Y_train,
              batch_size=2**int(best['batch_size']),
              epochs=100,
              shuffle=True,
              validation_data=(X_val, Y_val),
              callbacks=[earlystopper])
    predictions = model.predict(X_pred)
    rounded = [round(x[0]) for x in predictions]
    pred_train_prob = [x[0] for x in model.predict_proba(X_pred)]
    return rounded, pred_train_prob
def new_metric(tdf, seqnum_columns, y_label):
    true_label = list(tdf[y_label])
    auc = []
    for column_name in seqnum_columns:
        pred_label = list(tdf[column_name])
        fpr, tpr, thresholds = metrics.roc_curve(true_label, pred_label,
                                                 pos_label=1)
        value = metrics.auc(fpr, tpr)
        auc.append(value)
    return pd.DataFrame({"method": seqnum_columns, "metric_auc": auc})
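# A small usage example for new_metric -- the data below is illustrative and
# assumes pandas and sklearn.metrics are imported as in the function above.
tdf = pd.DataFrame({
    'label': [0, 1, 1, 0, 1],
    'score_a': [0.1, 0.8, 0.7, 0.3, 0.9],
    'score_b': [0.4, 0.6, 0.2, 0.5, 0.7],
})
print(new_metric(tdf, ['score_a', 'score_b'], 'label'))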
def train(model, cv_data, intMat, drugMat, targetMat, N=5):
    aupr, auc = [], []
    for seed in cv_data.keys():
        for W, test_data, test_label in cv_data[seed]:
            model.fix_model(W, intMat, drugMat, targetMat, seed)
            # model.fix_model(W*intMat, drugMat, targetMat, seed)
            scores = model.predict_scores(test_data)
            aupr_val, auc_val = evaluation(scores, test_label.astype(int))
            aupr.append(aupr_val)
            auc.append(auc_val)
    return np.array(aupr, dtype=np.float64), np.array(auc, dtype=np.float64)
def train_cv_model(X, Y):
    '''Train a cross-validated xgb model and return AUC vectors for test and train.'''
    skf = StratifiedKFold(n_splits=3, shuffle=True)
    auc = []
    auc_train = []
    for train_index, test_index in skf.split(X, Y):
        print(train_index)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
        model = xgb.XGBClassifier(
            objective='binary:logistic',
            colsample_bytree=1,
            learning_rate=0.03,
            max_depth=6,
            subsample=0.8,
            n_estimators=500,
            base_score=0.22,
            seed=2,
        )
        eval_result = {}
        eval_set = [(X_train, y_train, 'train'), (X_test, y_test, 'test')]
        model.fit(X_train, y_train,
                  verbose=True,
                  eval_set=eval_set,
                  eval_metric="auc",
                  callbacks=[
                      xgb.callback.record_evaluation(eval_result),
                      xgb.callback.early_stop(15)
                  ])
        preds_test = model.predict_proba(X_test)[:, 1]
        preds_train = model.predict_proba(X_train)[:, 1]
        auc.append(roc_auc_score(y_test, preds_test))
        auc_train.append(roc_auc_score(y_train, preds_train))
        # print("test mean: {0} train mean: {1}".format(np.mean(auc), np.mean(auc_train)))
    return {"test_auc": auc, "train_auc": auc_train, "model": model}
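# The record_evaluation/early_stop callbacks above come from older xgboost
# releases; a minimal equivalent via the fit keyword that pre-2.0 releases
# accept (an assumption -- in xgboost >= 2.0 early stopping moves to the
# constructor), reusing the names from train_cv_model above:
model.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_test, y_test)],
          eval_metric='auc',
          early_stopping_rounds=15,
          verbose=True)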
def train(model, cv_data, intMat, drugMat, targetMat):
    aupr, auc, ndcg, ndcg_inv, results = [], [], [], [], []
    for seed in cv_data.keys():
        for W, test_data, test_label in cv_data[seed]:
            t = time.clock()
            model.fix_model(W, intMat, drugMat, targetMat, seed)
            aupr_val, auc_val, ndcg_val, ndcg_inv_val = model.evaluation(
                test_data, test_label)
            results = results + [("", "", "", "")] + list(zip(
                test_data[:, 0], test_data[:, 1], test_label, model.scores))
            print(aupr_val, auc_val, ndcg_val, ndcg_inv_val, time.clock() - t)
            aupr.append(aupr_val)
            auc.append(auc_val)
            ndcg.append(ndcg_val)
            ndcg_inv.append(ndcg_inv_val)
    return (np.array(aupr, dtype=np.float64),
            np.array(auc, dtype=np.float64),
            np.array(ndcg, dtype=np.float64),
            np.array(ndcg_inv, dtype=np.float64),
            results)
def get_acc_auc_randomisedCV(X, Y, iterNo=5, test_percent=0.2):
    # Get the train/test indices for each iteration, train the classifier on
    # each split, and report the mean accuracy and mean AUC over all iterations.
    ss = ShuffleSplit(n_splits=iterNo, test_size=test_percent)  # modern API
    acc = []
    auc = []
    for train_index, test_index in ss.split(X):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        Y_pred = models.logistic_regression_pred(X_train, Y_train, X_test)
        acc.append(sklearn.metrics.accuracy_score(Y_test, Y_pred))
        auc.append(sklearn.metrics.roc_auc_score(Y_test, Y_pred))
    acc_mean = mean(acc)
    auc_mean = mean(auc)
    return acc_mean, auc_mean
def check_parameters(parameters, values, fixed={}, features=None):
    scores = []
    f1 = []
    auc = []
    for p in values:
        print(f'Fitting with {parameters}={p}')
        fts = X_train.columns if features is None else features
        kw = {parameters: p, **fixed}
        model = RandomForestClassifier(**kw)
        model.fit(X_train[fts], y_train)
        s = roc_auc_score(y_valid, model.predict_proba(X_valid[fts])[:, 1])
        rf_f1, rf_auc = auc_score(model, X_valid[fts], y_valid)
        print('ROC AUC Score', s)
        print('F1', rf_f1)
        print('Auc', rf_auc)
        print('')
        scores.append(rf_auc)  # scores mirrors auc; the plot below uses it
        f1.append(rf_f1)
        auc.append(rf_auc)
    plt.title(parameters)
    plt.plot(values, scores)
def reporting(dat, task, m, f, dct, multi=False):
    data = {}
    # pheno25
    # task = '25 ddx'
    data[task] = {}
    data[task][m + '_' + f] = {}
    if multi:
        f1 = []
        auc = []
        sen = {}
        spec = {}
        for loop in list(dat[f].keys()):
            sen[loop] = []
            spec[loop] = []
            f1.append(list(dat[f][loop]['f1_score'][0].values()))
            auc.append(list(dat[f][loop]['te_auc'][0].values()))
            for mat in list(dat[f][loop]['te_matrix'][0].values()):
                tn, fp, fn, tp = mat.ravel()
                sen[loop].append(1.0 * (tp / (tp + fn)))
                spec[loop].append(1.0 * (tn / (tn + fp)))
        f1 = (np.mean(f1, axis=0), np.std(f1, axis=0))
        auc = (np.mean(auc, axis=0), np.std(auc, axis=0))
        sen = (np.mean(list(sen.values()), axis=0),
               np.std(list(sen.values()), axis=0))
        spec = (np.mean(list(spec.values()), axis=0),
                np.std(list(spec.values()), axis=0))
        for n in range(len(f1[0])):
            data[task][m + '_' + f]['f1_' + dct[n]] = '{0:.3}'.format(
                f1[0][n]) + '({0:.3})'.format(f1[1][n])
            data[task][m + '_' + f]['auc_' + dct[n]] = '{0:.3}'.format(
                auc[0][n]) + '({0:.3})'.format(auc[1][n])
            try:
                data[task][m + '_' + f]['sen_' + dct[n]] = '{0:.3}'.format(
                    sen[0][n]) + '({0:.3})'.format(sen[1][n])
                data[task][m + '_' + f]['spec_' + dct[n]] = '{0:.3}'.format(
                    spec[0][n]) + '({0:.3})'.format(spec[1][n])
            except IndexError:
                pass
    else:
        f1 = []
        auc = []
        sen = []
        spec = []
        for loop in list(dat[f].keys()):
            f1.append(dat[f][loop]['f1_score'][0])
            auc.append(dat[f][loop]['te_auc'][0])
            tn, fp, fn, tp = dat[f][loop]['te_matrix'][0].ravel()
            sen.append(1.0 * (tp / (tp + fn)))
            spec.append(1.0 * (tn / (tn + fp)))
        f1 = (np.mean(f1, axis=0), np.std(f1, axis=0))
        auc = (np.mean(auc, axis=0), np.std(auc, axis=0))
        sen = (np.mean(sen, axis=0), np.std(sen, axis=0))
        spec = (np.mean(spec, axis=0), np.std(spec, axis=0))
        data[task][m + '_' + f]['f1'] = '{0:.3}'.format(
            f1[0]) + ' ({0:.3})'.format(f1[1])
        data[task][m + '_' + f]['auc'] = '{0:.3}'.format(
            auc[0]) + ' ({0:.3})'.format(auc[1])
        data[task][m + '_' + f]['sen'] = '{0:.3}'.format(
            sen[0]) + ' ({0:.3})'.format(sen[1])
        data[task][m + '_' + f]['spec'] = '{0:.3}'.format(
            spec[0]) + ' ({0:.3})'.format(spec[1])
    return data
end = 0
print len(list_exc)
steps = range(0, len(list_exc), 20)
print steps
feature_ids = []
auc = []
for i in range(0, len(steps) - 1):
    begin = steps[i]
    end = steps[i + 1]
    feature_ids.extend(list_exc[range(begin, end)])
    x_train = train_data[:, feature_ids]
    x_valid = valid_data[:, feature_ids]
    x_test = test_data[:, feature_ids]
    clf.fit(x_train, y_train)
    dec_val_test = clf.decision_function(x_test)
    auc.append(roc_auc_score(y_test, dec_val_test))
    # print feature_ids
    # Note: the same feature block is appended a second time below, so each
    # iteration records two AUC points, the second with the block doubled.
    feature_ids.extend(list_exc[range(begin, end)])
    # print feature_ids
    x_train = train_data[:, feature_ids]
    x_valid = valid_data[:, feature_ids]
    x_test = test_data[:, feature_ids]
    clf.fit(x_train, y_train)
    dec_val_test = clf.decision_function(x_test)
    auc.append(roc_auc_score(y_test, dec_val_test))

clf2 = linear_model.LogisticRegression(C=0.01, penalty='l1')
clf2.fit(x_train, y_train)
dec_val_test = clf2.decision_function(x_test)
auc_lasso = roc_auc_score(y_test, dec_val_test)
lasso = [auc_lasso] * len(list_exc)
# plt.title(",fontsize=18)
rec[0].append(numpy.average(yr, weights=ys))
fone[0].append(numpy.average(yf1, weights=ys))
sup[0].append(numpy.sum(ys))
cnf_matrix = confusion_matrix(Y_test_temp, predictionMlp)
cm_name_path = "./Oberservations/Iter%s/pass%s/CM-MLP-fold%s.png" % (topiter, iter1, pltctr)
plot_confusion_matrix(cnf_matrix,
                      classes=['Class-1', 'Class-2', 'Class-3', 'Class-4', 'Class-5'],
                      title='confusion matrix', pltname=cm_name_path)
prob = mlp.predict_proba(X_test)
prob = prob.transpose()
auc = []
tp = []
fp = []
# One-vs-rest ROC per class (ohc holds the one-hot true labels).
for cx in range(0, 5):
    fpr, tpr, threshold = sklearn.metrics.roc_curve(ohc[cx], prob[cx], pos_label=1)
    auc.append(sklearn.metrics.auc(fpr, tpr))
    tp.append(tpr)
    fp.append(fpr)
plotROC(fpr=fp, tpr=tp,
        path="./Oberservations/Iter%s/pass%s/ROC-pass-%s-MLP.png" % (topiter, iter1, pltctr),
        auc=auc)
# ------------
# -------------------------------------------------------------
# logistic regression
reports.append("----------------------Logistic Regression-------------------------------")
print "-------------------------Logistic Regression----------------------------------"
modelName.append("Logistic Regression")
logreg = LogisticRegression(multi_class='multinomial', solver='newton-cg',
                            max_iter=5, class_weight='balanced', C=1)
mLogreg = logreg.fit(X_train, Y_train_temp)
pred = mLogreg.predict(X_test)
estimator = CatBoostClassifier(iterations=1000, depth=10, learning_rate=0.1,
                               logging_level=None, scale_pos_weight=45)
# estimator = svm.SVC(kernel='rbf', C=10, gamma=0.012)
# estimator = lgb.LGBMClassifier(is_unbalance=True, learning_rate=0.012)
model = estimator.fit(X_train[train, :], y_train[train])
y_pred = estimator.predict(X_train[test, :])
y_proba_pred = estimator.predict_proba(X_train[test, :])[:, 1]
TP = numpy.sum(numpy.logical_and(numpy.equal(y_train[test], 1), numpy.equal(y_pred, 1)))
FP = numpy.sum(numpy.logical_and(numpy.equal(y_train[test], 0), numpy.equal(y_pred, 1)))
TN = numpy.sum(numpy.logical_and(numpy.equal(y_train[test], 0), numpy.equal(y_pred, 0)))
FN = numpy.sum(numpy.logical_and(numpy.equal(y_train[test], 1), numpy.equal(y_pred, 0)))
accuracy = (TP + TN) / (TP + FP + TN + FN)
acc.append(accuracy)
fpr, tpr, th = metrics.roc_curve(y_train[test], y_proba_pred, pos_label=1)
auc.append(metrics.auc(fpr, tpr))
plot_AUROC(fpr, tpr)
aupr.append(metrics.average_precision_score(y_train[test], y_proba_pred))
if metrics.average_precision_score(y_train[test], y_proba_pred) > max_num:
    max_num = metrics.average_precision_score(y_train[test], y_proba_pred)
    model.save_model(cmd + cellline_dir + 'best_model{}'.format(kvalue))
prec, rec, thres = metrics.precision_recall_curve(y_train[test], y_proba_pred, pos_label=1)
auprc.append(metrics.auc(rec, prec))
plot_AUPRC(rec, prec)
recall.append(metrics.recall_score(y_train[test], y_pred))
precision.append(metrics.precision_score(y_train[test], y_pred))
f1.append(metrics.f1_score(y_train[test], y_pred))
m_c_c = (TP * TN - FP * FN) / (math.sqrt((TP + FN) * (TP + FP) * (TN + FN) * (TN + FP)))
mcc.append(m_c_c)
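# Cross-check for the hand-rolled MCC above: sklearn ships the same statistic
# (and handles the zero-denominator case), so the manual formula can be
# verified against it -- a minimal sketch using the names from the block above.
from sklearn.metrics import matthews_corrcoef
assert abs(m_c_c - matthews_corrcoef(y_train[test], y_pred)) < 1e-8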
def main():
    infofile = open(modelDir.replace('.h5', '_infofile.txt'))
    infos = infofile.readlines()
    analysis = infos[0].replace('Used analysis method: ', '').replace('\n', '')
    dataset = DatasetDir + infos[3].replace('Used dataset: ', '').replace('\n', '')
    nvar = infos[5].replace('Used variables for training: ', '').replace('\n', '')
    nvar = nvar.split()

    model = load_model(modelDir)
    scaler = joblib.load(SCALING)

    recurrent = False
    if analysis.lower() == 'rnn':
        recurrent = True

    h5f = h5py.File(dataset + '.h5', 'r')
    X_train = h5f['X_train'][:]
    y = h5f['y_train'][:]
    y_train = deepcopy(y)
    y_train[y != 0] = 0.
    y_train[y == 0] = 1.
    collection = []
    if recurrent:
        for col in COLLECTION:
            collection.append(h5f['X_train_' + col][:])
    h5f.close()

    where_nan = np.isnan(X_train)
    X_train[where_nan] = -999.
    X_train = scaler.transform(X_train)  # collection already standardized in training

    print '#----MODEL----#'
    print modelDir
    print model.summary()

    ######################################
    # Read in trained and tested dataset #
    ######################################
    if recurrent:
        y_hat = model.predict(collection + [X_train])
    else:
        y_hat = model.predict(X_train)

    importanceBySquaredWeight = getImportanceBySquaredWeight(model, nvar, recurrent)
    importanceByWeight = getImportanceByWeight(model, nvar, recurrent)
    importanceByGrad = getImportanceByGradient(model, nvar, X_train, collection, recurrent)

    # Re-shuffle for re-evaluation: permute one variable at a time
    X_train_reshuffled = []
    for idx, var in enumerate(nvar):
        X = np.copy(X_train)
        print X[:1]
        np.random.shuffle(X[:, idx])
        print X[:1], '\n'
        X_train_reshuffled.append(X)

    roc = []
    auc = []
    for i in xrange(len(X_train_reshuffled)):
        print type(X_train_reshuffled[i])
        if recurrent:
            y_predict = model.predict(collection + [X_train_reshuffled[i]])
        else:
            y_predict = model.predict(X_train_reshuffled[i])
        roc.append(roc_curve(y_train, y_predict[:, 0]))
        auc.append(roc_auc_score(y_train, y_predict[:, 0]))
        del y_predict
    roc.append(roc_curve(y_train, y_hat[:, 0]))
    auc.append(roc_auc_score(y_train, y_hat[:, 0]))

    print auc, '\n', importanceBySquaredWeight, '\n', importanceByWeight, '\n', importanceByGrad, '\n'
    print 100 * '#'
    print '\n\t\t\tVariable ranking'
    print '\n sum of squared weights \t sum of absolute weights \t gradients \t AUC (after shuffle)'
    print 100 * '-'
    for i in xrange(len(nvar)):
        print '{}: {}\t{}: {}\t{}: {}\t{}: {}'.format(
            importanceBySquaredWeight[i][0], importanceBySquaredWeight[i][1],
            importanceByWeight[i][0], importanceByWeight[i][1],
            importanceByGrad[i][0], importanceByGrad[i][1],
            nvar[i], auc[i])
    print 100 * '-'
    print 100 * '#'

    print('Plotting the ROC curves ...')
    fig = plt.figure(figsize=(8, 6))
    ax1 = plt.subplot2grid((4, 4), (0, 0), colspan=4, rowspan=4)
    ax1.set_xlim((0, 1))
    ax1.set_ylim((0, 1))
    ax1.set_xlabel('$\epsilon_{Sig.}$', horizontalalignment='right', x=1.0)
    ax1.set_ylabel("$r_{Bkg.}$", horizontalalignment='right', y=1.0)
    for i in xrange(len(roc)):
        try:
            plt.plot(roc[i][1], 1 - roc[i][0], '-',
                     label='w/o %s (AUC = %0.4f)' % (nvar[i], auc[i]))
        except IndexError:
            plt.plot(roc[i][1], 1 - roc[i][0], '-',
                     label='Default (AUC = %0.4f)' % (auc[i]))
    plt.plot([0, 1], [1, 0], '--', color=(0.6, 0.6, 0.6), label='Luck')
    leg = plt.legend(loc="lower left", frameon=False)
    AtlasStyle_mpl.ATLASLabel(ax1, 0.13, 0.9, 'Work in progress')
    # AtlasStyle_mpl.LumiLabel(ax1, 0.02, 0.3, lumi=LUMI*0.001)
    plt.savefig("plots/" + modelfile + "_ROC_n-1.pdf")
    plt.savefig("plots/" + modelfile + "_ROC_n-1.png")
    plt.close()
def renew(f, t, a):
    # Appends one ROC result to the fpr/tpr/auc lists from the enclosing scope.
    fpr.append(f)
    tpr.append(t)
    auc.append(a)
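# A minimal sketch of the enclosing scope renew() relies on: the three lists
# must already exist where it is defined (the values below are illustrative).
fpr, tpr, auc = [], [], []
renew([0.0, 0.5, 1.0], [0.0, 0.8, 1.0], 0.9)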
import h5py
import numpy as np
import scipy.io
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
import statistics as st

y_pred_cnn = h5py.File('prediction', 'r')
pred = np.array(y_pred_cnn['pred'])
testmat = scipy.io.loadmat('test_add_1.mat')
y_test = testmat['testdata']

# Per-task AUC over the 919 output tracks.
auc = []
for i in range(0, 919):
    auc.append(roc_auc_score(y_test[:, i], pred[:, i]))
print(auc)
print(sum(auc) / 919)

y = range(0, 125)
plt.figure()
plt.plot(y, sorted(auc[0:125]))
plt.show()

y = range(0, 690)
plt.figure()
plt.plot(y, sorted(auc[125:815]))
plt.show()

y3 = range(0, 104)
plt.figure()
plt.plot(y3, sorted(auc[815:919]))
plt.show()
def test_model(self, params=None):
    """
    Function to perform the core machine learning analysis.

    Metrics are calculated in cross-validation and stored as a dictionary
    with average values and std.

    Arguments:
        params: A dictionary of parameters; "model_instance" is required.
            Should be of the form:
            params={'model_instance': <desired model>,
                    'scaler_instance': <optional scaler>,
                    'imputer_instance': <optional imputer>}

    Returns:
        A dictionary of tested models with corresponding metrics
    """
    ######################### Scale, CV, Imputation ################################
    self.params = params
    # Scaler and imputer are optional; guard against a key that is present
    # but holds no instance.
    if 'scaler_instance' in self.params and self.params['scaler_instance'] is None:
        raise Exception('No scaler defined in params. '
                        'Use form {"scaler_instance": <scaler>}')
    if 'scaler_instance' in self.params:
        scaler = self.params['scaler_instance']
        X = scaler.fit_transform(X=self.df)
        y = self.target.values
    else:
        X = self.df.values
        y = self.target.values

    accuracies = []
    balanced_accuracies = []
    recalls = []
    precisions = []
    specificities = []
    f1_scores = []
    auc = []
    y_hat_probs = []
    y_tests = []

    model_instance = self.params['model_instance']
    k_fold = KFold(n_splits=self.cv_folds, random_state=self.random_seed,
                   shuffle=True)

    if 'imputer_instance' in self.params and self.params['imputer_instance'] is None:
        raise Exception('No imputer defined in params. '
                        'Use form {"imputer_instance": <imputer>}')
    if 'imputer_instance' in self.params:
        med_imp = self.params['imputer_instance']

    kf = k_fold.split(X, y)
    for train_index, test_index in kf:
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        if 'imputer_instance' in self.params:
            X_train = med_imp.fit_transform(X_train)
            X_test = med_imp.fit_transform(X_test)
        trained_model = model_instance.fit(X=X_train, y=y_train)
        y_hat = trained_model.predict(X_test)
        y_hat_prob = [p[1] for p in trained_model.predict_proba(X_test)]
        accuracies.append(np.mean(y_hat == y_test))
        if self.include_auc:
            auc.append(roc_auc_score(y_test, y_hat_prob))
        recall, precision, specificity, balanced_accuracy, f1_score = \
            self.calculate_accuracies(y_hat, y_test)
        recalls.append(recall)
        precisions.append(precision)
        specificities.append(specificity)
        balanced_accuracies.append(balanced_accuracy)
        f1_scores.append(f1_score)
        y_hat_probs += y_hat_prob
        y_tests += y_test.tolist()

    model_id = self._make_model_id()
    self.results[model_id] = {
        'model_id': model_id,
        'model': model_instance,
        'f1_score': self._make_result(f1_scores),
        'recall': self._make_result(recalls),
        'precision': self._make_result(precisions),
        'specificity': self._make_result(specificities),
        'balanced_accuracy': self._make_result(balanced_accuracies),
        'accuracy': self._make_result(accuracies)
    }
    if self.include_auc:
        self.results[model_id]['auc'] = self._make_result(auc)
    self.predictions[model_id] = {
        'prediction_probabilities': y_hat_probs,
        'y_test': y_tests,
    }
    return self.results
def plot_multi_SVM(prediction, mutation_data, label_type, show_plots=False,
                   key=None, n_classifiers=[1], outputfolder=None):
    if type(prediction) is not pd.core.frame.DataFrame:
        if os.path.isfile(prediction):
            prediction = pd.read_hdf(prediction)

    keys = prediction.keys()
    SVMs = list()
    if key is None:
        label = keys[0]
    else:
        label = key

    SVMs = prediction[label]['classifiers']
    Y_test = prediction[label]['Y_test']
    X_test = prediction[label]['X_test']
    X_train = prediction[label]['X_train']
    Y_train = prediction[label]['Y_train']
    test_patient_IDs = prediction[label]['patient_ID_test']
    train_patient_IDs = prediction[label]['patient_ID_train']
    feature_labels = prediction[label]['feature_labels']

    # print(len(X_test[0][0]))
    # print(config)
    # X_train = data2['19q']['X_train']
    # Y_train = data2['19q']['Y_train']
    # mutation_data = gp.load_mutation_status(patientinfo, [[label]])
    if type(mutation_data) is not dict:
        if os.path.isfile(mutation_data):
            label_data = gp.load_mutation_status(mutation_data, [[label_type]])

    patient_IDs = label_data['patient_IDs']
    mutation_label = label_data['mutation_label']

    # print(len(SVMs))
    N_iterations = float(len(SVMs))

    # mutation_label = np.asarray(mutation_label)
    for n_class in n_classifiers:
        # output_json = os.path.join(outputfolder, ('performance_{}.json').format(str(n_class)))

        sensitivity = list()
        specificity = list()
        precision = list()
        accuracy = list()
        auc = list()
        # auc_train = list()
        f1_score_list = list()
        patient_classification_list = dict()
        trained_classifiers = list()

        y_score = list()
        y_test = list()
        pid_test = list()
        y_predict = list()

        # csvfile = os.path.join(outputfolder, ('scores_{}.csv').format(str(n_class)))
        # towrite = list()
        #
        # csvfile_plain = os.path.join(outputfolder, ('scores_plain_{}.csv').format(str(n_class)))
        # towrite_plain = list()

        empty_scores = {k: '' for k in natsort.natsorted(patient_IDs)}
        empty_scores = collections.OrderedDict(sorted(empty_scores.items()))
        # towrite.append(["Patient"] + empty_scores.keys())
        params = dict()
        for num, s in enumerate(SVMs):
            scores = empty_scores.copy()
            print("Processing {} / {}.").format(str(num + 1), str(len(SVMs)))
            trained_classifiers.append(s)

            # Extract test info
            test_patient_IDs_temp = test_patient_IDs[num]
            train_patient_IDs_temp = train_patient_IDs[num]
            X_train_temp = X_train[num]
            Y_train_temp = Y_train[num]
            X_test_temp = X_test[num]
            Y_test_temp = Y_test[num]

            # Extract sample size
            N_1 = float(len(train_patient_IDs_temp))
            N_2 = float(len(test_patient_IDs_temp))

            test_indices = list()
            for i_ID in test_patient_IDs_temp:
                test_indices.append(np.where(patient_IDs == i_ID)[0][0])

                if i_ID not in patient_classification_list:
                    patient_classification_list[i_ID] = dict()
                    patient_classification_list[i_ID]['N_test'] = 0
                    patient_classification_list[i_ID]['N_correct'] = 0
                    patient_classification_list[i_ID]['N_wrong'] = 0

                patient_classification_list[i_ID]['N_test'] += 1

            # y_truth = [mutation_label[0][k] for k in test_indices]
            # FIXME: order can be switched, need to find a smart fix
            # 1 for normal, 0 for KM
            # y_truth = [mutation_label[0][k][0] for k in test_indices]
            y_truth = Y_test_temp

            # Predict using the top N classifiers
            results = s.cv_results_['rank_test_score']
            indices = range(0, len(results))
            sortedindices = [x for _, x in sorted(zip(results, indices))]
            sortedindices = sortedindices[0:n_class]
            y_prediction = np.zeros([n_class, len(y_truth)])
            y_score = np.zeros([n_class, len(y_truth)])

            # Get some base objects required
            base_estimator = s.estimator
            y_train = Y_train_temp
            y_train_prediction = np.zeros([n_class, len(y_train)])
            scorer = s.scorer_
            train = np.asarray(range(0, len(y_train)))
            test = train  # This is in order to use the full training dataset to train the model

            # Remove the NaN features
            X_notnan = X_train_temp[:]
            for pnum, (pid, x) in enumerate(zip(train_patient_IDs_temp, X_train_temp)):
                for fnum, (f, fid) in enumerate(zip(x, feature_labels)):
                    if np.isnan(f):
                        print("[PREDICT WARNING] NaN found, patient {}, label {}. Replacing with zero.").format(pid, fid)
                        # Note: X is a list of lists, hence we cannot index the element directly
                        features_notnan = x[:]
                        features_notnan[fnum] = 0
                        X_notnan[pnum] = features_notnan

            X_train_temp = X_notnan[:]
            X_train_temp = [(x, feature_labels) for x in X_train_temp]

            X_notnan = X_test_temp[:]
            for pnum, (pid, x) in enumerate(zip(test_patient_IDs_temp, X_test_temp)):
                for fnum, (f, fid) in enumerate(zip(x, feature_labels)):
                    if np.isnan(f):
                        print("[PREDICT WARNING] NaN found, patient {}, label {}. Replacing with zero.").format(pid, fid)
                        # Note: X is a list of lists, hence we cannot index the element directly
                        features_notnan = x[:]
                        features_notnan[fnum] = 0
                        X_notnan[pnum] = features_notnan

            X_test_temp = X_notnan[:]
            # X_test_temp = [(x, feature_labels) for x in X_test_temp]
            # NOTE: need to build this in the SearchCVFastr Object
            for i, index in enumerate(sortedindices):
                print("Processing number {} of {} classifiers.").format(str(i + 1), str(n_class))
                X_testtemp = X_test_temp[:]

                # Get the parameters from the index
                parameters_est = s.cv_results_['params'][index]
                parameters_all = s.cv_results_['params_all'][index]
                print parameters_all
                print s.cv_results_['mean_test_score'][index]

                # NOTE: kernel parameter can be unicode
                kernel = str(parameters_est[u'kernel'])
                del parameters_est[u'kernel']
                del parameters_all[u'kernel']
                parameters_est['kernel'] = kernel
                parameters_all['kernel'] = kernel

                # Refit a classifier using the settings given
                print("Refitting classifier with best settings.")
                # Only when using fastr this is an entry
                if 'Number' in parameters_est.keys():
                    del parameters_est['Number']

                best_estimator = clone(base_estimator).set_params(**parameters_est)

                # ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler =\
                #     fit_and_score(best_estimator, X_train, y_train, scorer,
                #                   train, test, True, parameters_all,
                #                   t.fit_params,
                #                   t.return_train_score,
                #                   True, True, True,
                #                   t.error_score)
                ret, GroupSel, VarSel, SelectModel, _, scaler = \
                    fit_and_score(estimator=best_estimator,
                                  X=X_train_temp,
                                  y=y_train,
                                  scorer=scorer,
                                  train=train, test=test,
                                  verbose=True,
                                  para=parameters_all,
                                  fit_params=s.fit_params,
                                  return_train_score=s.return_train_score,
                                  return_n_test_samples=True,
                                  return_times=True,
                                  return_parameters=True,
                                  error_score=s.error_score)

                X = [x[0] for x in X_train_temp]
                if GroupSel is not None:
                    X = GroupSel.transform(X)
                    X_testtemp = GroupSel.transform(X_testtemp)

                if SelectModel is not None:
                    X = SelectModel.transform(X)
                    X_testtemp = SelectModel.transform(X_testtemp)

                if VarSel is not None:
                    X = VarSel.transform(X)
                    X_testtemp = VarSel.transform(X_testtemp)

                if scaler is not None:
                    X = scaler.transform(X)
                    X_testtemp = scaler.transform(X_testtemp)

                try:
                    if y_train is not None:
                        best_estimator.fit(X, y_train, **s.fit_params)
                    else:
                        best_estimator.fit(X, **s.fit_params)

                    # Predict the posteriors using the fitted classifier for the training set
                    print("Evaluating performance on training set.")
                    if hasattr(best_estimator, 'predict_proba'):
                        probabilities = best_estimator.predict_proba(X)
                        y_train_prediction[i, :] = probabilities[:, 1]
                    else:
                        # Regression has no probabilities
                        probabilities = best_estimator.predict(X)
                        y_train_prediction[i, :] = probabilities[:]

                    # Predict the posteriors using the fitted classifier for the test set
                    print("Evaluating performance on test set.")
                    if hasattr(best_estimator, 'predict_proba'):
                        probabilities = best_estimator.predict_proba(X_testtemp)
                        y_prediction[i, :] = probabilities[:, 1]
                    else:
                        # Regression has no probabilities
                        probabilities = best_estimator.predict(X_testtemp)
                        y_prediction[i, :] = probabilities[:]

                    if type(s.estimator) == sklearn.svm.classes.SVC:
                        y_score[i, :] = best_estimator.decision_function(X_testtemp)
                    else:
                        y_score[i, :] = best_estimator.decision_function(X_testtemp)[:, 0]
                except ValueError:
                    # R2 score was set to zero previously
                    y_train_prediction[i, :] = np.asarray([0.5] * len(X))
                    y_prediction[i, :] = np.asarray([0.5] * len(X_testtemp))
                    y_score[i, :] = np.asarray([0.5] * len(X_testtemp))
                    probabilities = []

                # Add number parameter settings
                for k in parameters_all.keys():
                    if k not in params.keys():
                        params[k] = list()
                    params[k].append(parameters_all[k])

                # Save some memory
                del best_estimator, X, X_testtemp, ret, GroupSel, VarSel, SelectModel, scaler, parameters_est, parameters_all, probabilities

            # Take mean over posteriors of top n
            y_train_prediction_m = np.mean(y_train_prediction, axis=0)
            y_prediction_m = np.mean(y_prediction, axis=0)

            # NOTE: Not sure if this is best way to compute AUC
            y_score = y_prediction_m

            if type(s.estimator) == sklearn.svm.classes.SVC:
                # Look for optimal F1 performance on training set
                thresholds = np.arange(0, 1, 0.01)
                f1_scores = list()
                y_train_prediction = np.zeros(y_train_prediction_m.shape)
                for t in thresholds:
                    for ip, y in enumerate(y_train_prediction_m):
                        if y > t:
                            y_train_prediction[ip] = 1
                        else:
                            y_train_prediction[ip] = 0
                    f1_scores.append(f1_score(y_train_prediction, y_train, average='weighted'))

                # Use best threshold to determine test score
                best_index = np.argmax(f1_scores)
                best_thresh = thresholds[best_index]
                best_thresh = 0.5
                y_prediction = np.zeros(y_prediction_m.shape)
                for ip, y in enumerate(y_prediction_m):
                    if y > best_thresh:
                        y_prediction[ip] = 1
                    else:
                        y_prediction[ip] = 0

                # y_prediction = t.predict(X_temp)
                y_prediction = [min(max(y, 0), 1) for y in y_prediction]
            else:
                y_prediction = y_prediction_m
                y_prediction = [min(max(y, 0), 1) for y in y_prediction]

            # NOTE: start of old function part
            print "Truth: ", y_truth
            print "Prediction: ", y_prediction

            for i_truth, i_predict, i_test_ID in zip(y_truth, y_prediction, test_patient_IDs_temp):
                if i_truth == i_predict:
                    patient_classification_list[i_test_ID]['N_correct'] += 1
                else:
                    patient_classification_list[i_test_ID]['N_wrong'] += 1

            # print(y_truth)
            # print(y_prediction)
            c_mat = confusion_matrix(y_truth, y_prediction)
            TN = c_mat[0, 0]
            FN = c_mat[1, 0]
            TP = c_mat[1, 1]
            FP = c_mat[0, 1]

            if FN == 0 and TP == 0:
                sensitivity.append(0)
            else:
                sensitivity.append(float(TP) / (TP + FN))
            if FP == 0 and TN == 0:
                specificity.append(0)
            else:
                specificity.append(float(TN) / (FP + TN))
            if TP == 0 and FP == 0:
                precision.append(0)
            else:
                precision.append(float(TP) / (TP + FP))
            accuracy.append(accuracy_score(y_truth, y_prediction))
            auc.append(roc_auc_score(y_truth, y_score))
            f1_score_list.append(f1_score(y_truth, y_prediction, average='weighted'))

        # Adjusted according to "Inference for the Generalization error"
        accuracy_mean = np.mean(accuracy)
        S_uj = 1.0 / max((N_iterations - 1), 1) * np.sum((accuracy_mean - accuracy)**2.0)

        print Y_test
        N_1 = float(len(Y_train[0]))
        N_2 = float(len(Y_test[0]))
        print(N_1)
        print(N_2)
        accuracy_var = np.sqrt((1.0 / N_iterations + N_2 / N_1) * S_uj)
        print(accuracy_var)
        print(np.sqrt(1 / N_iterations * S_uj))
        print(st.sem(accuracy))

        stats = dict()
        stats["Accuracy 95%:"] = str(compute_CI.compute_confidence(accuracy, N_1, N_2, 0.95))
        stats["AUC 95%:"] = str(compute_CI.compute_confidence(auc, N_1, N_2, 0.95))
        stats["F1-score 95%:"] = str(compute_CI.compute_confidence(f1_score_list, N_1, N_2, 0.95))
        stats["Precision 95%:"] = str(compute_CI.compute_confidence(precision, N_1, N_2, 0.95))
        stats["Sensitivity 95%: "] = str(compute_CI.compute_confidence(sensitivity, N_1, N_2, 0.95))
        stats["Specificity 95%:"] = str(compute_CI.compute_confidence(specificity, N_1, N_2, 0.95))

        print("Accuracy 95%:" + str(compute_CI.compute_confidence(accuracy, N_1, N_2, 0.95)))
        print("AUC 95%:" + str(compute_CI.compute_confidence(auc, N_1, N_2, 0.95)))
        print("F1-score 95%:" + str(compute_CI.compute_confidence(f1_score_list, N_1, N_2, 0.95)))
        print("Precision 95%:" + str(compute_CI.compute_confidence(precision, N_1, N_2, 0.95)))
        print("Sensitivity 95%: " + str(compute_CI.compute_confidence(sensitivity, N_1, N_2, 0.95)))
        print("Specificity 95%:" + str(compute_CI.compute_confidence(specificity, N_1, N_2, 0.95)))

        alwaysright = dict()
        alwayswrong = dict()
        for i_ID in patient_classification_list:
            percentage_right = patient_classification_list[i_ID]['N_correct'] / \
                float(patient_classification_list[i_ID]['N_test'])

            # print(i_ID + ' , ' + str(patient_classification_list[i_ID]['N_test']) + ' : ' + str(percentage_right) + '\n')
            if percentage_right == 1.0:
                label = mutation_label[0][np.where(i_ID == patient_IDs)]
                label = label[0][0]
                alwaysright[i_ID] = label
                # alwaysright.append(('{} ({})').format(i_ID, label))
                print(("Always Right: {}, label {}").format(i_ID, label))

            if percentage_right == 0:
                label = mutation_label[0][np.where(i_ID == patient_IDs)].tolist()
                label = label[0][0]
                alwayswrong[i_ID] = label
                # alwayswrong.append(('{} ({})').format(i_ID, label))
                print(("Always Wrong: {}, label {}").format(i_ID, label))

        stats["Always right"] = alwaysright
        stats["Always wrong"] = alwayswrong

        if show_plots:
            import matplotlib.pyplot as plt

            # One boxplot per metric; x ticks and labels are switched off.
            for values, name in [(accuracy, 'Accuracy'), (auc, 'AUC'),
                                 (precision, 'Precision'),
                                 (sensitivity, 'Sensitivity'),
                                 (specificity, 'Specificity')]:
                plt.figure()
                plt.boxplot(values)
                plt.ylim([-0.05, 1.05])
                plt.ylabel(name)
                plt.tick_params(
                    axis='x',           # changes apply to the x-axis
                    which='both',       # both major and minor ticks are affected
                    bottom='off',       # ticks along the bottom edge are off
                    top='off',          # ticks along the top edge are off
                    labelbottom='off')  # labels along the bottom edge are off
                plt.tight_layout()
                plt.show()

    return stats
def plot_single_SVM(prediction, mutation_data, label_type, show_plots=False,
                    show_ROC=False):
    if type(prediction) is not pd.core.frame.DataFrame:
        if os.path.isfile(prediction):
            prediction = pd.read_hdf(prediction)

    keys = prediction.keys()
    SVMs = list()
    label = keys[0]
    SVMs = prediction[label]['classifiers']

    Y_test = prediction[label]['Y_test']
    X_test = prediction[label]['X_test']
    Y_train = prediction[label]['Y_train']
    Y_score = list()

    # print(len(X_test[0][0]))
    # print(config)
    # X_train = data2['19q']['X_train']
    # Y_train = data2['19q']['Y_train']
    # mutation_data = gp.load_mutation_status(patientinfo, [[label]])
    if type(mutation_data) is not dict:
        if os.path.isfile(mutation_data):
            mutation_data = gp.load_mutation_status(mutation_data, [[label_type]])

    patient_IDs = mutation_data['patient_IDs']
    mutation_label = mutation_data['mutation_label']
    # mutation_name = mutation_data['mutation_name']

    # print(len(SVMs))
    N_iterations = float(len(SVMs))

    # mutation_label = np.asarray(mutation_label)

    sensitivity = list()
    specificity = list()
    precision = list()
    accuracy = list()
    auc = list()
    # auc_train = list()
    f1_score_list = list()
    patient_classification_list = dict()

    for i in range(0, len(Y_test)):
        # print(Y_test[i])
        # if Y_test[i].shape[1] > 1:
        #     y_truth = np.prod(Y_test[i][:, 0:2], axis=1)
        # else:
        #     y_truth_test = Y_test[i]
        test_patient_IDs = prediction[label]['patient_ID_test'][i]

        if 'LGG-Radiogenomics-046' in test_patient_IDs:
            wrong_index = np.where(test_patient_IDs == 'LGG-Radiogenomics-046')
            test_patient_IDs = np.delete(test_patient_IDs, wrong_index)
            X_temp = X_test[i]
            print(X_temp.shape)
            X_temp = np.delete(X_test[i], wrong_index, axis=0)
            print(X_temp.shape)
            # X_test.pop(wrong_index[0])
            # print(len(X_test))
        else:
            X_temp = X_test[i]

        test_indices = list()
        for i_ID in test_patient_IDs:
            test_indices.append(np.where(patient_IDs == i_ID)[0][0])

            if i_ID not in patient_classification_list:
                patient_classification_list[i_ID] = dict()
                patient_classification_list[i_ID]['N_test'] = 0
                patient_classification_list[i_ID]['N_correct'] = 0
                patient_classification_list[i_ID]['N_wrong'] = 0

            patient_classification_list[i_ID]['N_test'] += 1

        y_truth = [mutation_label[0][k] for k in test_indices]
        # print(y_truth)
        # print(y_truth_test)
        # print(test_patient_IDs)

        y_predict_1 = SVMs[i].predict(X_temp)
        # print(y_predict_1.shape)
        y_prediction = y_predict_1
        # y_prediction = np.prod(y_prediction, axis=0)

        print "Truth: ", y_truth
        print "Prediction: ", y_prediction

        for i_truth, i_predict, i_test_ID in zip(y_truth, y_prediction, test_patient_IDs):
            if i_truth == i_predict:
                patient_classification_list[i_test_ID]['N_correct'] += 1
            else:
                patient_classification_list[i_test_ID]['N_wrong'] += 1

        # print(y_truth)
        # print(y_prediction)
        c_mat = confusion_matrix(y_truth, y_prediction)
        TN = c_mat[0, 0]
        FN = c_mat[1, 0]
        TP = c_mat[1, 1]
        FP = c_mat[0, 1]

        if FN == 0 and TP == 0:
            sensitivity.append(0)
        else:
            sensitivity.append(float(TP) / (TP + FN))
        if FP == 0 and TN == 0:
            specificity.append(0)
        else:
            specificity.append(float(TN) / (FP + TN))
        if TP == 0 and FP == 0:
            precision.append(0)
        else:
            precision.append(float(TP) / (TP + FP))
        accuracy.append(accuracy_score(y_truth, y_prediction))
        y_score = SVMs[i].decision_function(X_temp)
        Y_score.append(y_score)
        auc.append(roc_auc_score(y_truth, y_score))
        f1_score_list.append(f1_score(y_truth, y_prediction, average='weighted'))

        # if show_ROC:
        #     ROC_target_folder = '/archive/wkessels/output/ROC_temp/'
        #     if not os.path.exists(ROC_target_folder):
        #         os.makedirs(ROC_target_folder)
        #
        #     luck = [0, 1]
        #
        #     fpr, tpr, _ = roc_curve(y_truth, y_score)
        #     plt.figure()
        #     plt.plot(fpr, tpr, color='blue', label='ROC (AUC = {})'.format(auc[-1]))
        #     plt.plot(luck, luck, '--', color='red', label='luck')
        #     plt.xlabel('1-specificity')
        #     plt.ylabel('sensitivity')
        #     plt.axis([0, 1, 0, 1])
        #     plt.legend()
        #     plt.savefig(ROC_target_folder + 'ROC_cv{}.png'.format(i))
        #     print('Saved ROC figure in {}!'.format(ROC_target_folder))

    # Adjusted according to "Inference for the Generalization error"
    accuracy_mean = np.mean(accuracy)
    S_uj = 1.0 / max((N_iterations - 1), 1) * np.sum((accuracy_mean - accuracy)**2.0)

    print Y_test
    N_1 = float(len(Y_train[0]))
    N_2 = float(len(Y_test[0]))
    print(N_1)
    print(N_2)
    accuracy_var = np.sqrt((1.0 / N_iterations + N_2 / N_1) * S_uj)
    print(accuracy_var)
    print(np.sqrt(1 / N_iterations * S_uj))
    print(st.sem(accuracy))

    stats = dict()
    stats["Accuracy 95%:"] = str(compute_CI.compute_confidence(accuracy, N_1, N_2, 0.95))
    stats["AUC 95%:"] = str(compute_CI.compute_confidence(auc, N_1, N_2, 0.95))
    stats["F1-score 95%:"] = str(compute_CI.compute_confidence(f1_score_list, N_1, N_2, 0.95))
    stats["Precision 95%:"] = str(compute_CI.compute_confidence(precision, N_1, N_2, 0.95))
    stats["Sensitivity 95%: "] = str(compute_CI.compute_confidence(sensitivity, N_1, N_2, 0.95))
    stats["Specificity 95%:"] = str(compute_CI.compute_confidence(specificity, N_1, N_2, 0.95))

    print("Accuracy 95%:" + str(compute_CI.compute_confidence(accuracy, N_1, N_2, 0.95)))
    print("AUC 95%:" + str(compute_CI.compute_confidence(auc, N_1, N_2, 0.95)))
    print("F1-score 95%:" + str(compute_CI.compute_confidence(f1_score_list, N_1, N_2, 0.95)))
    print("Precision 95%:" + str(compute_CI.compute_confidence(precision, N_1, N_2, 0.95)))
    print("Sensitivity 95%: " + str(compute_CI.compute_confidence(sensitivity, N_1, N_2, 0.95)))
    print("Specificity 95%:" + str(compute_CI.compute_confidence(specificity, N_1, N_2, 0.95)))

    what_to_print = ['always', 'mostly']
    for what in what_to_print:
        if what == 'always':
            alwaysright = dict()
            alwayswrong = dict()
            for i_ID in patient_classification_list:
                percentage_right = patient_classification_list[i_ID]['N_correct'] / \
                    float(patient_classification_list[i_ID]['N_test'])

                # print(i_ID + ' , ' + str(patient_classification_list[i_ID]['N_test']) + ' : ' + str(percentage_right) + '\n')
                if percentage_right == 1.0:
                    label = mutation_label[0][np.where(i_ID == patient_IDs)]
                    label = label[0][0]
                    alwaysright[i_ID] = label
                    # alwaysright.append(('{} ({})').format(i_ID, label))
                    print(("Always Right: {}, label {}").format(i_ID, label))

                if percentage_right == 0:
                    label = mutation_label[0][np.where(i_ID == patient_IDs)].tolist()
                    label = label[0][0]
                    alwayswrong[i_ID] = label
                    # alwayswrong.append(('{} ({})').format(i_ID, label))
                    print(("Always Wrong: {}, label {}").format(i_ID, label))

            stats["Always right"] = alwaysright
            stats["Always wrong"] = alwayswrong

        elif what == 'mostly':
            margin = float(0.2)
            min_right = float(1 - margin)  # for mostly right
            max_right = float(margin)      # for mostly wrong

            mostlyright = dict()
            mostlywrong = dict()
            for i_ID in patient_classification_list:
                percentage_right = patient_classification_list[i_ID]['N_correct'] / \
                    float(patient_classification_list[i_ID]['N_test'])

                if percentage_right > min_right:
                    label = mutation_label[0][np.where(i_ID == patient_IDs)]
                    label = label[0][0]
                    mostlyright[i_ID] = [label, "{}%".format(100 * percentage_right)]
                    print(("Mostly Right: {}, label {}, percentage: {}%").format(
                        i_ID, label, 100 * percentage_right))

                if percentage_right < max_right:
                    label = mutation_label[0][np.where(i_ID == patient_IDs)].tolist()
                    label = label[0][0]
                    mostlywrong[i_ID] = [label, "{}%".format(100 * percentage_right)]
                    print(("Mostly Wrong: {}, label {}, percentage: {}%").format(
                        i_ID, label, 100 * percentage_right))

            stats["Mostly right"] = mostlyright
            stats["Mostly wrong"] = mostlywrong
        else:
            raise IOError('Unknown argument given...')

    if show_plots:
        import matplotlib.pyplot as plt

        # One boxplot per metric; x ticks and labels are switched off.
        for values, name in [(accuracy, 'Accuracy'), (auc, 'AUC'),
                             (precision, 'Precision'),
                             (sensitivity, 'Sensitivity'),
                             (specificity, 'Specificity')]:
            plt.figure()
            plt.boxplot(values)
            plt.ylim([-0.05, 1.05])
            plt.ylabel(name)
            plt.tick_params(
                axis='x',           # changes apply to the x-axis
                which='both',       # both major and minor ticks are affected
                bottom='off',       # ticks along the bottom edge are off
                top='off',          # ticks along the top edge are off
                labelbottom='off')  # labels along the bottom edge are off
            plt.tight_layout()
            plt.show()

    # Save Y_score values
    Y_score_dict = dict()
    for j in range(len(Y_score)):
        Y_score_dict['CV_{}'.format(j)] = Y_score[j]
    Y_score = pd.DataFrame(Y_score_dict)
    Y_score.to_hdf('/archive/wkessels/output/Lipo_SVM/Y_score.hdf5', 'Y_score')

    # write_to_txt('Y_test', Y_test, ROC_data_folder)
    # write_to_txt('X_test', X_test, ROC_data_folder)
    # write_to_txt('Y_train', Y_train, ROC_data_folder)
    # write_to_txt('mutation_data', mutation_data, ROC_data_folder)
    # write_to_txt('patient_IDs', patient_IDs, ROC_data_folder)
    # write_to_txt('mutation_label', mutation_label, ROC_data_folder)
    # write_to_txt('y_truth', y_truth, ROC_data_folder)
    # write_to_txt('y_prediction', y_prediction, ROC_data_folder)
    # write_to_txt('y_score', y_score, ROC_data_folder)
    # write_to_txt('N_1', N_1, ROC_data_folder)
    # write_to_txt('N_2', N_2, ROC_data_folder)
    # write_to_txt('stats', stats, ROC_data_folder)

    return stats
y_pred = pool_classifiers.predict(Xi_test)
# i / 10 as an index relies on Python 2 integer division.
acc_subsample[i / 10] = np.mean(y_pred == yi_test)
f1_subsample[i / 10] = f1_score(yi_test, y_pred, average='weighted')
# gmean_subsample[i/10] = geometric_mean_score(yi_test, y_pred, average='weighted')
auc_subsample[i / 10] = roc_auc_score(yi_test, y_pred, average='weighted')
# fpr, tpr, thresholds = roc_curve(yi_test, y_pred)
# auc_subsample[i/10] = auc(fpr, tpr)

accuracy.append(acc_subsample)
f1.append(f1_subsample)
# gmean.append(gmean_subsample)
auc.append(auc_subsample)

## np.save(dataset + "Perceptron_bagging_acc.py", accuracy)
## np.save(dataset + "Perceptron_bagging_f1.py", f1)
## np.save(dataset + "Perceptron_bagging_gmean.py", gmean)
## np.save(dataset + "Perceptron_bagging_auc.py", auc)

## accuracy = []
## fold = 0
## kfold = KFold(n_splits=10, shuffle=False, random_state=1)
## for train, test in kfold.split(X):
##     Xi_train = X[train]
##     yi_train = y[train]
##
##     Xi_test = X[test]
##     yi_test = y[test]
def main():
    from pandas import read_csv

    # Read in the data #
    # NORMAL DATA
    # DATA = read_csv("norm_data__non_log.txt", sep='\t').T
    # DATA = DATA.apply(np.log).values  # Retain the log due to the maximising values

    # MIN MAX DATA
    DATA = read_csv("norm_data__non_log.txt", sep='\t').T
    label = read_csv("sample_list.csv", sep=';')
    DATA = DATA.apply(np.log).values  # Retain the log due to the maximising values

    # Conversion of string to bool
    mapping = {'Non-LCa': 0, 'LCa': 1}
    TARGET = label.Disease.map(mapping).values

    print(DATA.shape)
    DATA = boost_select(DATA, TARGET)

    kf = KFold(n_splits=5, random_state=seed, shuffle=True)
    acc = []
    prec = []
    recall = []
    auc = []
    with open('results_deep.txt', 'w') as f:
        for train_index, test_index in kf.split(DATA):
            X_train, X_test, y_train, y_test = (DATA[train_index], DATA[test_index],
                                                TARGET[train_index], TARGET[test_index])

            from sklearn.preprocessing import MinMaxScaler
            scaler = MinMaxScaler().fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

            ada = ADASYN()
            X_train, y_train = ada.fit_resample(X_train, y_train)

            nb_epoch = 80
            batch_size = 64
            input_dim = DATA.shape[1]
            learning_rate = 1e-7

            input_layer = Input(shape=(input_dim, ))
            net = Dense(200, activation="relu",
                        activity_regularizer=regularizers.l2(learning_rate))(input_layer)
            net = Dense(400, activation="relu")(net)
            net = Dense(600, activation="relu")(net)
            net = Dense(800, activation="relu")(net)
            net = Dense(1000, activation="relu")(net)
            net = Dense(800, activation="relu")(net)
            net = Dense(600, activation="relu")(net)
            net = Dense(400, activation="relu")(net)
            net = Dense(200, activation="relu")(net)
            output_layer = Dense(1, activation='sigmoid')(net)

            model = Model(inputs=input_layer, outputs=output_layer)
            model.compile(metrics=['accuracy', precision_m, recall_m, f1_m],
                          loss='binary_crossentropy',
                          optimizer='adam')

            cp = ModelCheckpoint(filepath="NeuralNetworkModel.h5",
                                 save_best_only=True,
                                 verbose=0)
            tb = TensorBoard(log_dir='./logs',
                             histogram_freq=0,
                             write_graph=True,
                             write_images=True)

            history = model.fit(X_train, y_train,
                                epochs=nb_epoch,
                                batch_size=batch_size,
                                shuffle=True,
                                validation_data=(X_test, y_test),
                                verbose=1,
                                callbacks=[cp, tb]).history

            # This is to figure out the correct number of epochs before training
            # becomes redundant. Here, it is discovered 80 epochs satisfies this
            # problem. Uncomment the code to visualise the test v train loss plot.
            # plt.plot(history['loss'], linewidth=2, label='Train')
            # plt.plot(history['val_loss'], linewidth=2, label='Test')
            # plt.legend(loc='upper right')
            # plt.title('Model loss')
            # plt.ylabel('Loss')
            # plt.xlabel('Epoch')
            # # plt.ylim(ymin=0.70, ymax=1)
            # plt.show()

            # load weights
            model.load_weights("NeuralNetworkModel.h5")
            # Compile model (required to make predictions)
            model.compile(metrics=['accuracy', precision_m, recall_m, f1_m],
                          loss='binary_crossentropy',
                          optimizer='adam')

            y_pred = model.predict(X_test)
            auc.append(roc_auc_score(y_test, y_pred))
            recall.append(recall_score(y_test, np.round(y_pred, 0)))
            prec.append(precision_score(y_test, np.round(y_pred, 0)))
            acc.append(accuracy_score(y_test, np.round(y_pred, 0)))

        print(np.mean(auc), np.mean(recall), np.mean(prec), np.mean(acc))
        print("MODEL 9 Hidden, Hidden Nodes [200,400,600,800,1000,800,600,400,200], \n"
              " L2 Regulariser Layer 1, Epoch: 80, LearnRate: 1e-7, Loss: Binary Cross, "
              "Opt: ADAM \n CV: 5 \n\n\n", file=f)
        print('N_FEATURES: {}, AUC: {}, RECALL: {}, PRECISION: {}, ACCURACY: {}, '
              .format(42, auc, recall, prec, acc), file=f)
#### 5. Build a Ridge (L2-regularised) logistic regression model #####
########################################
all_features = list(train_data.columns)
all_features.remove('ID')
all_features.remove('flag')

# Choose the regularisation parameter C by cross-validation.
C_list = np.arange(0.01, 1, 0.01)
auc = []
for c in C_list:
    train2, validation = model_selection.train_test_split(train_data, test_size=0.2)
    LR = LogisticRegression(C=c).fit(train2[all_features], train2['flag'])
    pred = LR.predict_proba(validation[all_features])[:, 1]
    test_auc = metrics.roc_auc_score(validation['flag'], pred)
    auc.append(test_auc)

position = auc.index(max(auc))
C_best = C_list[position]
print(max(auc))

LR = LogisticRegression(C=C_best).fit(train_data[all_features], train_data['flag'])
pred = LR.predict_proba(train_data[all_features])

# Plot the ROC curve.
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []
fpr, tpr, thresholds = roc_curve(train_data['flag'], pred[:, 1])
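# scikit-learn's LogisticRegressionCV folds the C search and the final refit
# into one estimator -- a minimal sketch with the same AUC criterion, reusing
# train_data/all_features from above (cv=5 is an illustrative choice):
from sklearn.linear_model import LogisticRegressionCV

lr_cv = LogisticRegressionCV(Cs=np.arange(0.01, 1, 0.01),
                             scoring='roc_auc', cv=5)
lr_cv.fit(train_data[all_features], train_data['flag'])
print('best C:', lr_cv.C_[0])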
tmp_auc = []
tmp_acc = []
tmp_f1 = []
initial_time = time.time()
for i in range(5):
    # sklearn's old cross_validation module is gone; use model_selection instead.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        data, mark, test_size=0.05, random_state=i)
    clf.fit(X_train, y_train)
    y_predict = clf.predict_proba(X_test)[:, 1]
    test_auc = metrics.roc_auc_score(y_test, y_predict)  # AUC on the held-out split
    # print('AUC:', test_auc)
    tmp_auc.append(test_auc)

    y_pred = clf.predict(X_test)
    # print('ACC: %.4f' % metrics.accuracy_score(y_test, y_pred))
    tmp_acc.append(metrics.accuracy_score(y_test, y_pred))
    # print('F1-score: %.4f' % metrics.f1_score(y_test, y_pred))
    tmp_f1.append(metrics.f1_score(y_test, y_pred))
over_time = time.time()

auc.append(round(sum(tmp_auc) / len(tmp_auc), 3))
acc.append(round(sum(tmp_acc) / len(tmp_acc), 3))
f1.append(round(sum(tmp_f1) / len(tmp_f1), 3))
used_time.append(round(over_time - initial_time, 3))

index = ['AUC', 'ACC', 'F1', 'time']
out = []
out.append(auc)
out.append(acc)
out.append(f1)
out.append(used_time)
data = pd.DataFrame(out, index=index)
data.to_csv('/Users/hhy/Desktop/node_model_information.csv',
            encoding='utf-8-sig', header=header)
def main():
    from pandas import read_csv

    # Read in the data #
    # NORMAL DATA
    # DATA = read_csv("norm_data__non_log.txt", sep='\t').T
    # DATA = DATA.apply(np.log).values  # Retain the log due to the maximising values

    # MIN MAX DATA
    DATA = read_csv("norm_data__non_log.txt", sep='\t').T
    print(DATA.shape)
    label = read_csv("sample_list.csv", sep=';')
    DATA = DATA.apply(np.log).values  # Retain the log due to the maximising values

    # Conversion of string to bool
    mapping = {'Non-LCa': 0, 'LCa': 1}
    TARGET = label.Disease.map(mapping).values
    class_weight = {1: 2, 0: 1}

    DATA = feature_select(DATA, TARGET, 55)
    print(DATA.shape)

    kf = KFold(n_splits=2, random_state=seed, shuffle=True)
    acc = []
    prec = []
    recall = []
    auc = []
    with open('results_total_best_cv2_lr.txt', 'w') as f:
        for train_index, test_index in kf.split(DATA):
            X_train, X_test, y_train, y_test = (DATA[train_index], DATA[test_index],
                                                TARGET[train_index], TARGET[test_index])

            from sklearn.preprocessing import MinMaxScaler
            scaler = MinMaxScaler().fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

            nb_epoch = 80
            batch_size = 64
            input_dim = DATA.shape[1]
            # learning_rate = 1
            # decay = learning_rate / nb_epoch

            input_layer = Input(shape=(input_dim, ))
            net = Dense(200, activation="relu",
                        activity_regularizer=regularizers.l2(1e-7))(input_layer)
            net = Dense(400, activation="relu")(net)
            net = Dense(600, activation="relu")(net)
            net = Dense(800, activation="relu")(net)
            net = Dense(1000, activation="relu")(net)
            net = Dense(800, activation="relu")(net)
            net = Dense(600, activation="relu")(net)
            net = Dense(400, activation="relu")(net)
            net = Dense(200, activation="relu")(net)
            output_layer = Dense(1, activation='sigmoid')(net)

            adam = optimizers.Adam(lr=1e-5)
            model = Model(inputs=input_layer, outputs=output_layer)
            model.compile(metrics=['accuracy', precision_m, recall_m, f1_m],
                          loss='binary_crossentropy',
                          optimizer=adam)

            cp = ModelCheckpoint(filepath="NeuralNetworkModel.h5",
                                 save_best_only=True,
                                 verbose=0)
            tb = TensorBoard(log_dir='./logs',
                             histogram_freq=0,
                             write_graph=True,
                             write_images=True)

            history = model.fit(X_train, y_train,
                                epochs=nb_epoch,
                                batch_size=batch_size,
                                shuffle=True,
                                validation_data=(X_test, y_test),
                                verbose=1,
                                class_weight=class_weight,
                                callbacks=[cp, tb]).history

            # This is to figure out the correct number of epochs before training
            # becomes redundant. Here, it is discovered 80 epochs satisfies this problem.
            plt.plot(history['loss'], linewidth=2, label='Train')
            plt.plot(history['val_loss'], linewidth=2, label='Test')
            plt.legend(loc='upper right')
            plt.title('Model loss')
            plt.ylabel('Loss')
            plt.xlabel('Epoch')
            plt.savefig("lossvepoch.png", dpi=300)
            # plt.ylim(ymin=0.70, ymax=1)
            plt.show()

            # load weights
            model.load_weights("NeuralNetworkModel.h5")
            # Compile model (required to make predictions)
            model.compile(metrics=['accuracy', precision_m, recall_m, f1_m],
                          loss='binary_crossentropy',
                          optimizer=adam)

            y_pred = model.predict(X_test)

            # plt.style.use(['seaborn-colorblind'])
            # fpr, tpr, _ = roc_curve(y_test, y_pred)
            # plt.figure(1)
            # plt.plot([0, 1], [0, 1], 'k--')
            # plt.plot(fpr, tpr, label='Deep Net', alpha=0.6, color='r')
            # plt.xlabel('False positive rate')
            # plt.ylabel('True positive rate')
            # plt.title('ROC curve AUC = {}'.format(roc_auc_score(y_test, y_pred)))
            # plt.legend(loc='best')
            # plt.show()

            auc.append(roc_auc_score(y_test, y_pred))
            recall.append(recall_score(y_test, np.round(y_pred, 0)))
            prec.append(precision_score(y_test, np.round(y_pred, 0)))
            acc.append(accuracy_score(y_test, np.round(y_pred, 0)))

            # AVERAGE PRECISION
            # from sklearn.metrics import average_precision_score
            # average_precision = average_precision_score(y_test, y_pred)
            # print('Average precision-recall score: {0:0.2f}'.format(average_precision))

            # PRECISION RECALL CURVE
            # from sklearn.metrics import precision_recall_curve
            # from inspect import signature
            # precision, recall1, _ = precision_recall_curve(y_test, y_pred)
            # # In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
            # step_kwargs = ({'step': 'post'}
            #                if 'step' in signature(plt.fill_between).parameters
            #                else {})
            # plt.step(recall1, precision, color='b', alpha=0.2, where='post')
            # plt.fill_between(recall1, precision, alpha=0.2, color='b', **step_kwargs)
            # plt.xlabel('Recall')
            # plt.ylabel('Precision')
            # plt.ylim([0.0, 1.05])
            # plt.xlim([0.0, 1.0])
            # plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision))
            # plt.show()

        print(np.mean(auc), np.mean(recall), np.mean(prec), np.mean(acc))
        print("MODEL 9 Hidden, Hidden Nodes [200,400,600,800,1000,800,600,400,200], \n"
              " L2 Regulariser Layer 1, Epoch: 300, LearnRate: 1e-20, Loss: Binary Cross, "
              "Opt: ADAM \n CV: 5 \n\n\n", file=f)
        print('N_FEATURES: {}, AUC: {}, RECALL: {}, PRECISION: {}, ACCURACY: {} \n\n\n'
              .format(DATA.shape[1], auc, recall, prec, acc), file=f)
        print('AUC MEAN: {}\n RECALL MEAN: {}\n PRECISON MEAN: {}\n ACCURACY MEAN: {}'
              .format(np.mean(auc), np.mean(recall), np.mean(prec), np.mean(acc)))
def plotScores(scores, title, xLabel, yLabel='Score'):
    x = []
    accuracy = []
    f1 = []
    precision = []
    recall = []
    auc = []
    trainaccuracy = []
    trainf1 = []
    trainprecision = []
    trainrecall = []
    trainauc = []
    for score in scores:
        x.append(score.HyperParam)
        accuracy.append(score.Accuracy)
        precision.append(score.Precision)
        recall.append(score.Recall)
        f1.append(score.F1)
        auc.append(score.AUC)
        trainaccuracy.append(score.TrainAccuracy)
        trainprecision.append(score.TrainPrecision)
        trainrecall.append(score.TrainRecall)
        trainf1.append(score.TrainF1)
        trainauc.append(score.TrainAUC)

    plotAccuracy(x, accuracy, trainaccuracy, title, xLabel, yLabel)
    plotF1(x, f1, trainf1, title, xLabel, yLabel)
    plotPrecision(x, precision, trainprecision, title, xLabel, yLabel)
    plotRecall(x, recall, trainrecall, title, xLabel, yLabel)

    plt.clf()
    plt.title(title)
    plt.xlabel(xLabel)
    plt.ylabel(yLabel)
    # plt.plot(x, accuracy, label='Accuracy', color='r', lw=1.0)
    # plt.plot(x, recall, label='Recall', color='g', lw=2.0)
    # plt.plot(x, f1, label='F1', color='b', lw=1.0)
    # plt.plot(x, precision, label='Precision', color='k', lw=2.0)
    plt.plot(x, auc, label='AUC')
    # plt.plot(x, trainaccuracy, label='Train Accuracy', color='r', lw=5.0, alpha=0.25)
    # plt.plot(x, trainprecision, label='Train Precision', color='g', marker='o', lw=1.0, ls='--')
    # plt.plot(x, trainf1, label='Train F1', color='b', lw=5.0, alpha=0.25)
    # plt.plot(x, trainrecall, label='Train Recall', color='k', marker='o', lw=1.0, ls='--')
    plt.plot(x, trainauc, label='Train AUC')
    plt.legend()
    plt.grid()
    # plt.ylim(0, 1)
    # plt.show()
    if showPlot:
        plt.show()
    else:
        name = baseGraphPath + title + ' AUC' + '.png'
        plt.savefig(name)