def func():
    for imbalance in imbalances:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4444)
        X_train, y_train = imbalance.fit_sample(X_train, y_train)
        for clf in classifiers:
            print('-----------------')
            print("%s " % imbalance)
            print('-----------------')
            clf.fit(X_train, y_train)
            print('-----------------')
            print("%s " % clf)
            print('-----------------')
            print("")
            print("Accuracy score", accuracy_score(y_test, clf.predict(X_test)))
            print('auc', roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
            print("")
            print(classification_report(y_test, clf.predict(X_test)))
            print("")
            print('-----------------')
            # Store the AUC from predicted probabilities, matching the printed 'auc'
            # above; hard predictions give a coarser score. Note the entry is
            # overwritten on each classifier, so only the last one per imbalance is kept.
            best_dict[imbalance] = [clf, roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])]
def test_thresholded_scorers():
    """Test scorers that take thresholds."""
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = SCORERS['roc_auc'](clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = SCORERS['log_loss'](clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = SCORERS['roc_auc'](clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    assert_raises(ValueError, SCORERS['roc_auc'], clf, X_test, y_test)
def randomforest(df1, df2):
    # 'L' is the label column; everything else is used as features.
    newsT = df1.L
    L = ['L']
    for x in L:
        del df1[x]
    news = df1
    TRAINING = df1.as_matrix(columns=None)  # training features
    TEST = newsT.as_matrix(columns=None)    # training labels (despite the name)
    newsT = df2['L']
    L = ['L']
    for x in L:
        del df2[x]
    X_test = df2.as_matrix(columns=None)
    y_test = newsT.as_matrix(columns=None)
    clf = RandomForestClassifier(n_estimators=200)
    clf.fit(TRAINING, TEST)
    y_pred1 = clf.predict_proba(X_test)[:, 1]
    y_pred = clf.predict(X_test)
    recall_score(y_test, y_pred)
    precision_score(y_test, y_pred)
    precision_score(y_test, y_pred, pos_label=0)
    recall_score(y_test, y_pred, pos_label=0)
    roc_auc_score(y_test, y_pred1)
    print 'roc: ', roc_auc_score(y_test, y_pred1)
    print 'precision: ', precision_score(y_test, y_pred)
    print 'recall:', recall_score(y_test, y_pred)
    print 'precision Negatives: ', precision_score(y_test, y_pred, pos_label=0)
    print 'recall Negatives: ', recall_score(y_test, y_pred, pos_label=0)
    return (roc_auc_score(y_test, y_pred1), precision_score(y_test, y_pred),
            recall_score(y_test, y_pred), precision_score(y_test, y_pred, pos_label=0),
            recall_score(y_test, y_pred, pos_label=0))
def roc_score(predictions):
    logreg = roc_auc_score([int(y) for y in predictions[:, 0]], [float(w) for w in predictions[:, 1]])
    svm = roc_auc_score([int(y) for y in predictions[:, 0]], [float(w) for w in predictions[:, 2]])
    knn = roc_auc_score([int(y) for y in predictions[:, 0]], [float(w) for w in predictions[:, 3]])
    tree = roc_auc_score([int(y) for y in predictions[:, 0]], [float(w) for w in predictions[:, 4]])
    return {'logreg': logreg, 'svm': svm, 'knn': knn, 'tree': tree}
def ensemble_measure(lst, classifiers, weigths):
    def norm_lst(lst):
        import numpy as np
        s = np.sum(lst)
        arr = np.array(lst) / s
        return np.nan_to_num(arr)

    tst = pd.DataFrame([t[0] for t in lst], columns=train.columns)
    X = tst[tst.columns[:-1]]
    y = tst[tst.columns[-1]]
    y_hat = []
    y_pred = []
    for clf in classifiers:
        y_hat.append(clf.decision_function(X))
    if len(y_hat) == 1:
        y = [1 if p == True else -1 for p in y]
        auc = roc_auc_score(y, y_hat[0])
    else:
        for pred, wgt in zip(y_hat, norm_lst(weigths)):
            y_pred.append([wgt * p for p in pred])
        y_pred = np.sum(np.array(y_pred).T, axis=1)
        y = [1 if p == True else -1 for p in y]
        try:
            auc = roc_auc_score(y, y_pred)
        except:
            set_trace()
    return auc
def analysis(fold_val):
    total = 0
    df_val = dict()
    for f in fold_val:
        df_val[total] = pd.read_csv(f)
        auc = roc_auc_score(df_val[total]['isDuplicate'].values, df_val[total]['probability'].values)
        print('Auc for experiment {}: {}'.format(total, auc))
        total += 1

    df_mean = df_val[0].copy()
    for i in range(1, len(fold_val)):
        df_mean['probability'] += df_val[i]['probability']
    df_mean['probability'] /= len(fold_val)
    auc = roc_auc_score(df_mean['isDuplicate'].values, df_mean['probability'].values)
    print('Auc for mean: {}'.format(auc))

    alls = []
    x0 = []
    for i in range(0, len(fold_val)):
        val = 'probability' + str(i)
        alls.append(val)
        df_mean[val] = df_val[i]['probability']
        x0.append(1.0)
    df_mean['probability_median'] = df_mean[alls].median(axis=1)
    auc = roc_auc_score(df_mean['isDuplicate'].values, df_mean['probability_median'].values)
    print('Auc for median: {}'.format(auc))

    res = minimize(get_ensemble_score, x0, args=(df_mean), method='Nelder-Mead',
                   options={'xtol': 1e-8, 'disp': True})
    print(res)
    return res.x
def Classification(df_detail,features,featurey,featue_selection): df_X=df_detail[features] # print df_X.isnull().values.any() X=numpy.array(df_X) # from sklearn.preprocessing import StandardScaler # scaler = StandardScaler() # X = scaler.fit_transform(X) Y=list(df_detail[featurey]) # print 'lenY',len(Y) # ############################################ # # classification # ############################################ X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y, test_size=0.3, random_state=0) # print 'X_train[1]:',X_train[1] # print 'y_train[1]:',y_train[1] #####LR###### lr = LogisticRegression() lr.fit(X_train, y_train) expected = y_test predicted = lr.predict(X_test) answer=lr.predict_proba(X_test) if featue_selection==True: prob_auc=pd.DataFrame({'feature':features, 'auc':roc_auc_score(numpy.array( map(int, y_test)), answer[:,1])}) return prob_auc else: print '=====LogisticRegression======' print '1/0 in train:%d/%d\t1/0 in test:%d/%d'%(y_train.count('1'),y_train.count('0'),y_test.count('1'),y_test.count('0')) print 'N_train:N_test= %d:%d'%(len(y_train),len(y_test)) print(metrics.classification_report(expected, predicted)) print(metrics.confusion_matrix(expected, predicted)) print 'lr.score=',lr.score(X_test,y_test) print 'lr.auc_score=',roc_auc_score(numpy.array( map(int, y_test)), answer[:,1]) # print '****lr.coef_****' # print lr.coef_ lr_coef=pd.DataFrame(lr.coef_) lr_coef.to_csv('lr_coef_new.txt',sep='\t' ,index=False, header=False) lr_intercept=pd.DataFrame(lr.intercept_) lr_intercept.to_csv('lr_intercept_new.txt',sep='\t' ,index=False, header=False) test_pair=pd.concat([pd.DataFrame(y_test),pd.DataFrame(answer),pd.DataFrame(X_test)],axis=1) test_pair.to_csv('test_pair_new.txt',sep='\t' ,index=False, header=False) feature_imp=pd.DataFrame(lr.coef_[0] ) # print feature_imp feature_imp.to_csv('feature_imp.txt',sep='\t', mode='a',index=False, header=False)
def report(self):
    from sklearn.metrics import roc_auc_score
    from sklearn.metrics import classification_report
    from sklearn.metrics import confusion_matrix

    y_pred_probas, y_true = self.make_predictions()[:2]
    y_pred = y_pred_probas.argmax(1)
    y_pred_probas = y_pred_probas[:, 1]
    y_true = y_true.reshape(-1)

    try:
        score = roc_auc_score(y_true, y_pred_probas)
    except ValueError:
        pass
    else:
        print
        print "AUC score:", score
        print "AUC score (binary):", roc_auc_score(y_true, y_pred)
    print
    print "Classification report:"
    print classification_report(y_true, y_pred)
    print
    print "Confusion matrix:"
    print confusion_matrix(y_true, y_pred)
    print
def getScores(y, yPredTrain, yTest, yPredTest):
    scores = dict()
    scores['f1Train'] = f1_score(y, yPredTrain)
    scores['f1Test'] = f1_score(yTest, yPredTest)
    scores['accTrain'] = accuracy_score(y, yPredTrain)
    scores['accTest'] = accuracy_score(yTest, yPredTest)
    scores['rocTrain'] = roc_auc_score(y, yPredTrain)
    scores['rocTest'] = roc_auc_score(yTest, yPredTest)
    scores['cMatrixTrain'] = confusion_matrix(y, yPredTrain)
    scores['cMatrixTest'] = confusion_matrix(yTest, yPredTest)
    proba = float(len(np.where(y == 1)[0])) / len(y)
    if proba < 0.50:
        proba = 1 - proba
    scores['random'] = proba
    return scores
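# Editor's note (minimal sketch, not part of the original code): getScores above
# passes hard 0/1 predictions to roc_auc_score. That is legal, but AUC is a ranking
# metric, so thresholded predictions usually score lower than the underlying
# probabilities. Toy illustration with made-up numbers:
from sklearn.metrics import roc_auc_score

y_true = [0, 0, 1, 1]
y_prob = [0.2, 0.3, 0.6, 0.4]                       # hypothetical predicted probabilities
y_hard = [1 if p >= 0.5 else 0 for p in y_prob]     # thresholded at 0.5 -> [0, 0, 1, 0]

print(roc_auc_score(y_true, y_prob))   # 1.0  (perfect ranking)
print(roc_auc_score(y_true, y_hard))   # 0.75 (ties collapse the ranking)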
def giniGrowth(df, woeVarsInfo, badFlag):
    woeTable = woeVarsInfo.copy()
    woeTable.variable = woeTable.variable.apply(lambda x: x + '_WOE')
    IV = getIVfromWOE(woeTable)
    columns = IV.variable
    columnsForModeking = []
    giniTest = []
    giniTrain = []
    y = df[badFlag].values
    for col in columns:
        columnsForModeking.append(col)
        X = df[columnsForModeking].values
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=3)
        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        pr_test = lr.predict_proba(X_test)[:, 1]
        pr_train = lr.predict_proba(X_train)[:, 1]
        rocGiniTest = met.roc_auc_score(y_test, pr_test) * 2 - 1
        rocGiniTrain = met.roc_auc_score(y_train, pr_train) * 2 - 1
        giniTest.append(rocGiniTest)
        giniTrain.append(rocGiniTrain)
    trainDiff = [x - y for x, y in zip(giniTrain, [0] + giniTrain[:-1])]
    testDiff = [x - y for x, y in zip(giniTest, [0] + giniTest[:-1])]
    dfOut = pd.DataFrame({'variable': columns, 'giniTrain': giniTrain, 'giniTest': giniTest,
                          'trainDiff': trainDiff, 'testDiff': testDiff,
                          'informationValue': list(IV.InformationValue)})
    dfOut[['trainDiff', 'testDiff']] = dfOut[['trainDiff', 'testDiff']]  # .apply('${:,.2f}'.format)
    dfOut = dfOut.reindex_axis(['variable', 'informationValue', 'testDiff', 'trainDiff', 'giniTest', 'giniTrain'], axis=1)
    return dfOut
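# Editor's note (minimal sketch, editorial assumption): the "* 2 - 1" above is the
# usual Gini/AUC relation, Gini = 2 * AUC - 1, so a random ranking (AUC = 0.5)
# gives Gini = 0 and a perfect ranking (AUC = 1.0) gives Gini = 1.
from sklearn.metrics import roc_auc_score

y_true = [0, 0, 1, 1]
y_score = [0.1, 0.4, 0.35, 0.8]        # made-up scores
auc = roc_auc_score(y_true, y_score)   # 0.75
gini = 2 * auc - 1                     # 0.5
print(auc, gini)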
def compare_models(xtraindata, ytraindata): from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC, LinearSVC from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier from sklearn.naive_bayes import GaussianNB from sklearn.lda import LDA from sklearn.qda import QDA classifier_dict = { #'linSVC': LinearSVC(), #'kNC5': KNeighborsClassifier(), #'kNC6': KNeighborsClassifier(6), #'SVC': SVC(kernel="linear", C=0.025), #'DT': DecisionTreeClassifier(max_depth=5), #'RF200': RandomForestClassifier(n_estimators=200, n_jobs=-1), 'RF400gini': RandomForestClassifier(n_estimators=400, n_jobs=-1), 'RF400entropy': RandomForestClassifier(n_estimators=400, n_jobs=-1, criterion='entropy'), #'RF800': RandomForestClassifier(n_estimators=800, n_jobs=-1), #'RF1000': RandomForestClassifier(n_estimators=1000, n_jobs=-1), 'Ada': AdaBoostClassifier(), #'SVClin': SVC(kernel='linear'), #'SVCpoly': SVC(kernel='poly'), #'SVCsigmoid': SVC(kernel='sigmoid'), 'Gauss': GaussianNB(), 'LDA': LDA(), #'QDA': QDA(), 'SVC': SVC(), } results = {} ytrain_vals = [] ytest_vals = [] randint = reduce(lambda x,y: x|y, [ord(x)<<(n*8) for (n,x) in enumerate(os.urandom(4))]) xTrain, xTest, yTrain, yTest = cross_validation.train_test_split(xtraindata, ytraindata, test_size=0.4, random_state=randint) scale = StandardScaler() xTrain = scale.fit_transform(xTrain) xTest = scale.transform(xTest) for name, model in sorted(classifier_dict.items()): model.fit(xTrain, yTrain) ytrpred = model.predict(xTrain) ytpred = model.predict(xTest) results[name] = roc_auc_score(yTest, ytpred) ytrain_vals.append(ytrpred) ytest_vals.append(ytpred) print name, results[name], ytest_vals[-1] print '\n\n\n' print 'shape3', xTrain.shape, xTest.shape, ytrain_vals[0].shape, ytest_vals[0].shape xTrain = np.hstack([xTrain]+[y.reshape(xTrain.shape[0],1) for y in ytrain_vals]) xTest = np.hstack([xTest]+[y.reshape(xTest.shape[0],1) for y in ytest_vals]) print '\n\n\n' model = RandomForestClassifier(n_estimators=400, n_jobs=-1) model.fit(xTrain, yTrain) ytpred = model.predict(xTest) print 'RF400', roc_auc_score(yTest, ytpred)
def print_metrics(y_test, y_pred, y_baseline):
    # clf_score = metrics.log_loss(y_test, y_pred)
    # baseline_score = metrics.log_loss(y_test, y_baseline)
    # never_score = metrics.log_loss(y_test, np.zeros(y_test.shape))
    # always_score = metrics.log_loss(y_test, np.ones(y_test.shape))
    #
    # print("-----")
    # print("log-loss score of classifier: " + str(clf_score))
    # print("log-loss score of baseline: " + str(baseline_score))
    # print("log-loss score of never: " + str(never_score))
    # print("log-loss score of always: " + str(always_score))
    #
    # clf_score = metrics.brier_score_loss(y_test, y_pred)
    # baseline_score = metrics.brier_score_loss(y_test, y_baseline)
    # never_score = metrics.brier_score_loss(y_test, np.zeros(y_test.shape))
    # always_score = metrics.brier_score_loss(y_test, np.ones(y_test.shape))
    #
    # print("-----")
    # print("Brier loss of classifier: " + str(clf_score))
    # print("Brier loss of baseline: " + str(baseline_score))
    # print("Brier loss of never: " + str(never_score))
    # print("Brier loss of always: " + str(always_score))

    clf_score = metrics.roc_auc_score(y_test, y_pred)
    baseline_score = metrics.roc_auc_score(y_test, y_baseline)
    never_score = metrics.roc_auc_score(y_test, np.zeros(y_test.shape))
    always_score = metrics.roc_auc_score(y_test, np.ones(y_test.shape))

    print("-----")
    print("ROC AUC of classifier: " + str(clf_score))
    print("ROC AUC score of baseline: " + str(baseline_score))
    print("ROC AUC score of never: " + str(never_score))
    print("ROC AUC score of always: " + str(always_score))
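# Editor's note (minimal sketch, not from the original code): the "never"/"always"
# baselines above are constant scores, which carry no ranking information, so
# roc_auc_score returns 0.5 for both; they mark the chance level on the AUC scale.
import numpy as np
from sklearn.metrics import roc_auc_score

y = np.array([0, 1, 1, 0, 1])
print(roc_auc_score(y, np.zeros(y.shape)))  # 0.5
print(roc_auc_score(y, np.ones(y.shape)))   # 0.5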
def predict(fea, df, t, t9):
    Un = df.columns == 'Blank'
    for f in fea:  # was `Fea`, which is undefined inside this function; the parameter is `fea`
        '''
        try:
            df[(f+'_y')] = df[(f+'_x')] - df[(f+'_y')]
            print(1)
        except:
            pass
        '''
        Un = Un | (df.columns == f)
        Un = Un | (df.columns == (f + '_x'))
        Un = Un | (df.columns == (f + '_y'))
    Un = Un & (df.columns != 'New_y')
    clf = GradientBoostingClassifier()
    y = df[t].label
    X = df[t].ix[:, Un]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=1)
    clf.fit(X_train, y_train)
    re = 'Testing AUC: \t' + str(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
    print re
    re = 'September AUC: \t' + str(roc_auc_score(df[t9].label, clf.predict_proba(df[t9].ix[:, Un])[:, 1]))
    print re
    print(X.columns)
    print(clf.feature_importances_)
    return Un, clf
def _basic_metrics(self, data, brier_bins=20, prediction_column="prediction",
                   observation_column="correct", brier_min=0, brier_max=1):
    report = {}
    n = 0                                    # log count
    sse = 0                                  # sum of square error
    llsum = 0                                # log-likelihood sum
    brier_counts = np.zeros(brier_bins)      # count of answers in bins
    brier_correct = np.zeros(brier_bins)     # sum of correct answers in bins
    brier_prediction = np.zeros(brier_bins)  # sum of predictions in bins

    for log in data:
        n += 1
        sse += (log[prediction_column] - log[observation_column]) ** 2
        llsum += math.log(max(0.0001, log[prediction_column]
                              if log[observation_column]
                              else (1 - log[prediction_column])))
        # brier
        bin = min(int((log[prediction_column] - brier_min) / (brier_max - brier_min) * brier_bins),
                  brier_bins - 1)
        brier_counts[bin] += 1
        brier_correct[bin] += log[observation_column]
        brier_prediction[bin] += log[prediction_column]

    answer_mean = sum(brier_correct) / n
    report["extra"] = {"answer_mean": answer_mean}
    report["rmse"] = math.sqrt(sse / n)
    report["log-likely-hood"] = llsum
    if observation_column == "correct":
        try:
            report["AUC"] = metrics.roc_auc_score(
                self._data.get_dataframe_test()[observation_column],
                self._data.get_dataframe_test()[prediction_column])
        except ValueError:
            print("AUC - converting responses to 0, 1")
            report["AUC"] = metrics.roc_auc_score(
                self._data.get_dataframe_test()[observation_column] > 0,
                self._data.get_dataframe_test()[prediction_column])

    # brier
    brier_prediction_means = brier_prediction / brier_counts
    brier_prediction_means[np.isnan(brier_prediction_means)] = \
        ((np.arange(brier_bins) + 0.5) / brier_bins)[np.isnan(brier_prediction_means)]
    brier_correct_means = brier_correct / brier_counts
    brier_correct_means[np.isnan(brier_correct_means)] = 0
    brier = {
        "reliability": sum(brier_counts * (brier_correct_means - brier_prediction_means) ** 2) / n,
        "resolution": sum(brier_counts * (brier_correct_means - answer_mean) ** 2) / n,
        "uncertainty": answer_mean * (1 - answer_mean),
    }
    report["brier"] = brier
    report["extra"]["brier"] = {
        "max": brier_max,
        "min": brier_min,
        "bin_count": brier_bins,
        "bin_counts": list(brier_counts),
        "bin_prediction_means": list(brier_prediction_means),
        "bin_correct_means": list(brier_correct_means),
    }
    report["evaluated"] = True
    return report
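# Editor's note (minimal sketch, editorial assumption): the reliability / resolution /
# uncertainty terms computed above follow Murphy's decomposition of the Brier score,
#   mean Brier = reliability - resolution + uncertainty,
# which holds exactly when each bin contains a single distinct predicted value.
import numpy as np

y_true = np.array([1, 0, 1, 1, 0, 1])
y_pred = np.array([0.8, 0.8, 0.8, 0.3, 0.3, 0.3])

brier = np.mean((y_pred - y_true) ** 2)
o_bar = y_true.mean()
reliability = resolution = 0.0
for p in np.unique(y_pred):
    mask = y_pred == p
    o_k = y_true[mask].mean()
    reliability += mask.sum() * (o_k - p) ** 2 / len(y_true)
    resolution += mask.sum() * (o_k - o_bar) ** 2 / len(y_true)
uncertainty = o_bar * (1 - o_bar)

print(brier, reliability - resolution + uncertainty)  # both ~0.298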
def main():
    n_folds = 10
    n_genes, n_terms = dicty[gene][go_term][0].data.shape
    for t, term_idx in enumerate(range(n_terms)):
        term = dicty[gene][go_term][0].col_names[term_idx]
        print("Term: %s" % term)
        y_true = dicty[gene][go_term][0].data[:, term_idx]
        cls_size = int(y_true.sum())
        if cls_size > n_genes - 20 or cls_size < 20:
            continue
        cv = cross_validation.StratifiedKFold(y_true, n_folds=n_folds)
        y_pred_mf = np.zeros_like(y_true)
        y_pred_rf = np.zeros_like(y_true)
        for i, (train_idx, test_idx) in enumerate(cv):
            print("\tFold: %d" % (i + 1))
            # Let's make predictions from fused data representation
            y_pred_mf[test_idx] = mf(train_idx, test_idx, term_idx)
            # Let's make predictions from raw data
            y_pred_rf[test_idx] = rf(train_idx, test_idx, term_idx)
        mfa = metrics.roc_auc_score(y_true, y_pred_mf)
        rfa = metrics.roc_auc_score(y_true, y_pred_rf)
        print("(%2d/%2d): %10s MF: %0.3f RF: %0.3f" % (t + 1, n_terms, term, mfa, rfa))
def func_cv_2(X, y, folds, model, verbose, seed):
    scores = []
    for train, test in folds:
        print('**func_cv_2 Fold', 1 + len(scores), 'of', n_folds_outer)
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]
        # todo: pass folds to predict?
        y_predicted = np.zeros(len(y_test))
        for i in range(len(models)):
            y_predicted_per_model = my_predict(X_train, y_train, X_test,
                                               n_iterations_inner, n_folds_inner,
                                               func_predict_1, models[i], verbose=False)
            score_per_model = roc_auc_score(y_test, y_predicted_per_model)
            print('model', i, score_per_model, models[i])
            if i == 0:
                y_predicted = y_predicted_per_model
            elif i <= 3:
                y_predicted += 1 / 3 * y_predicted_per_model
            else:
                foo += 1  # note: 'foo' is undefined, so more than four models raises NameError
        if len(models) == 4:
            y_predicted = y_predicted / 2
        else:
            foo += 1  # same undefined-name guard: the weighting above assumes exactly four models
        score = roc_auc_score(y_test, y_predicted)
        print('**func_cv_2 auc score for combined model', score)
        scores.append(score)
    scores = np.array(scores)
    print('****func_cv_2: mean, std', scores.mean(), scores.std())
    return scores.mean()
def on_epoch_end(self, batch, logs={}):
    # losses
    self.losses_train.append(self.model.evaluate(X_train, Y_train, batch_size=128, verbose=0))
    self.losses_val.append(self.model.evaluate(X_val, Y_val, batch_size=128, verbose=0))

    # Roc train
    train_preds = self.model.predict_proba(X_train, verbose=0)
    train_preds = train_preds[:, 1]
    roc_train = metrics.roc_auc_score(y_train, train_preds)
    self.roc_train.append(roc_train)

    # Roc val
    val_preds = self.model.predict_proba(X_val, verbose=0)
    val_preds = val_preds[:, 1]
    roc_val = metrics.roc_auc_score(y_val, val_preds)
    self.roc_val.append(roc_val)

    # Metrics train
    y_preds = self.model.predict_classes(X_train, verbose=0)
    self.f1_train.append(metrics.f1_score(y_train, y_preds))
    self.recal_train.append(metrics.recall_score(y_train, y_preds))
    self.preci_train.append(metrics.precision_score(y_train, y_preds))

    # Metrics val
    y_preds = self.model.predict_classes(X_val, verbose=0)
    self.f1_val.append(metrics.f1_score(y_val, y_preds))
    self.recal_val.append(metrics.recall_score(y_val, y_preds))
    self.preci_val.append(metrics.precision_score(y_val, y_preds))
def predict(fea1, fea2, df, t, t9):
    n = 0
    weight = [0.73, 0.27]
    tave = np.zeros(len(df[t9]))
    y = df[t].label
    X_1 = df[t]
    df9 = df[t9]
    for fea in [fea1, fea2]:
        Un = df.columns == 'Blank'
        for f in fea:
            Un = Un | (df.columns == f)
            Un = Un | (df.columns == (f + '_x'))
            Un = Un | (df.columns == (f + '_y'))
        Un = Un & (df.columns != 'quarterly_attrition_rate_y')
        clf = GradientBoostingClassifier()
        X = X_1.ix[:, Un]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=1)
        min_max_scaler = preprocessing.MinMaxScaler()
        clf.fit(min_max_scaler.fit_transform(X_train), y_train)
        re = 'Testing AUC: \t' + str(roc_auc_score(y_test, clf.predict_proba(min_max_scaler.transform(X_test))[:, 1]))
        print re
        # Note: fit_transform here refits the scaler on the scoring data;
        # transform() with the scaler already fitted on X_train would be the consistent choice.
        t = clf.predict_proba(min_max_scaler.fit_transform(df9.ix[:, Un]))[:, 1]
        re = 'September AUC: \t' + str(roc_auc_score(df9.label, t))
        print re
        tave = t * weight[n] + tave
        n += 1
    print '-' * 30
    print(weight)
    print 'Total AUC'
    re = 'September AUC: \t' + str(roc_auc_score(df9.label, tave))
    print re
    return Un, clf
def modelfit(alg, dtrain, dtest, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50): """ Fit models w/ parameters """ if useTrainCV: xgb_param = alg.get_xgb_params() xgtrain = xgb.DMatrix( dtrain[predictors].values, label=dtrain[target].values) xgtest = xgb.DMatrix(dtest[predictors].values) cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds, metrics='auc', early_stopping_rounds=early_stopping_rounds) alg.set_params(n_estimators=cvresult.shape[0]) print(cvresult) alg.fit(dtrain[predictors], dtrain[target], eval_metric='auc') dtrain_predictions = alg.predict(dtrain[predictors]) dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1] dtest['predprob'] = alg.predict_proba(dtest[predictors])[:, 1] print("\nModel Report") print("Accuracy : %.4g" % metrics.accuracy_score( dtrain[target].values, dtrain_predictions)) print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob)) print('AUC Score (Test): %f' % metrics.roc_auc_score(dtest[target], dtest['predprob'])) feat_imp = pd.Series(alg.booster().get_fscore() ).sort_values(ascending=False) feat_imp.plot(kind='bar', title='Feature Importances') plt.ylabel('Feature Importance Score')
def run_svm_model(df, continuous_vars, categorical_vars, outcome_var):
    X_train, y_train, X_test, y_test = test_train_data(df, continuous_vars, categorical_vars,
                                                       outcome_var, categorical='continuous',
                                                       seed=124, test_pct=0)
    scale = StandardScaler()
    X_train = scale.fit_transform(X_train)
    svc = SVC(probability=True, cache_size=500)
    params = [{'C': [10, 5, 1, 0.1, 0.001], 'class_weight': ['auto'],
               'kernel': ['rbf'], 'gamma': [0, 0.1, 0.01]}]
    cv_strat = StratifiedKFold(y_train, n_folds=5, shuffle=True, random_state=98)
    clf = GridSearchCV(svc, params, scoring='roc_auc', cv=cv_strat, n_jobs=3, verbose=1, iid=False)
    clf.fit(X_train, y_train)
    clf.grid_scores_
    clf.best_estimator_
    clf.best_params_
    clf.best_score_
    roc_auc_score(y_test, clf.best_estimator_.predict_proba(scale.transform(X_test))[:, 1])
    svc_predict = clf.best_estimator_.predict_proba(scale.transform(X_test))[:, 1]
def evaluate_fold(clf, X_train, y_train, X_test, y_test):
    """ This is the business section """
    tmp = dict()
    tmp['X_train.shape'] = X_train.shape
    tmp['X_test.shape'] = X_test.shape
    try:
        pred_test = clf.predict_proba(X_test)
        pred_train = clf.predict_proba(X_train)
        tmp['roc'] = roc_info(y_test, pred_test[:, 1])
        tmp['roc_area'] = roc_auc_score(y_test, pred_test[:, 1])
        pred_test = clf.predict(X_test)
        pred_train = clf.predict(X_train)
        tmp['f1_test'] = f1_score(y_test, pred_test, pos_label=1)
        tmp['f1_train'] = f1_score(y_train, pred_train, pos_label=1)
    except (AttributeError, NotImplementedError):
        pred_test = clf.predict(X_test)
        pred_train = clf.predict(X_train)
        tmp['roc'] = roc_info(y_test, pred_test)
        tmp['roc_area'] = roc_auc_score(y_test, pred_test)
        tmp['f1_test'] = f1_score(y_test, pred_test, pos_label=1)
        tmp['f1_train'] = f1_score(y_train, pred_train, pos_label=1)
    return tmp
def do_all_study(X, y):
    names = ["Decision Tree", "Gradient Boosting", "Random Forest", "AdaBoost", "Naive Bayes"]
    classifiers = [
        # SVC(),
        DecisionTreeClassifier(max_depth=10),
        GradientBoostingClassifier(max_depth=10, n_estimators=20, max_features=1),
        RandomForestClassifier(max_depth=10, n_estimators=20, max_features=1),
        AdaBoostClassifier()]
    # Note: only four classifiers are active, so zip() drops the last name, and
    # X_train/X_test/y_train/y_test are expected to come from the enclosing scope
    # rather than from the X, y arguments.
    for name, clf in zip(names, classifiers):
        estimator, score = plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')

    clf_GBC = GradientBoostingClassifier(max_depth=10, n_estimators=20, max_features=1)
    param_name = 'n_estimators'
    param_range = [1, 5, 10, 20, 40]
    plot_validation_curve(clf_GBC, X_train, y_train, param_name, param_range, scoring='roc_auc')
    clf_GBC.fit(X_train, y_train)
    y_pred_GBC = clf_GBC.predict_proba(X_test)[:, 1]
    print("ROC AUC GradientBoostingClassifier: %0.4f" % roc_auc_score(y_test, y_pred_GBC))

    clf_AB = AdaBoostClassifier()
    param_name = 'n_estimators'
    param_range = [1, 5, 10, 20, 40]
    plot_validation_curve(clf_AB, X_train, y_train, param_name, param_range, scoring='roc_auc')
    clf_AB.fit(X_train, y_train)
    y_pred_AB = clf_AB.predict_proba(X_test)[:, 1]
    print("ROC AUC AdaBoost: %0.4f" % roc_auc_score(y_test, y_pred_AB))
def exercise_3():
    # connect to openml api
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(target=dataset.default_target_attribute,
                                                return_attribute_names=True)
    kf = cross_validation.ShuffleSplit(len(X), n_iter=10, test_size=0.1, train_size=0.9, random_state=0)
    error = []
    error_cart = []
    error_mean = []
    error_mean_cart = []
    clf = RandomForestClassifier(n_estimators=100, oob_score=True, max_features="auto", random_state=0)
    clf_cart = DecisionTreeClassifier()
    error_mean = []
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train)
        clf_cart.fit(X_train, y_train)
        error_mean.append(roc_auc_score(y_test, clf.predict(X_test)))
        error_mean_cart.append(roc_auc_score(y_test, clf_cart.predict(X_test)))
    error.append(np.array(error_mean).mean())
    error_cart.append(np.array(error_mean_cart).mean())
    print 'Error RandomForest: ', error
    print 'Error CART: ', error_cart
def on_epoch_end(self, epoch, logs={}):
    train_x, train_y = self.train_data
    train_y_score = self.model.predict_proba(train_x, verbose=0)
    test_x, test_y = self.test_data
    test_y_score = self.model.predict_proba(test_x, verbose=0)
    logs['auc'] = roc_auc_score(test_y, test_y_score)
    print('train roc_auc %.3f, test roc_auc %.3f\n' %
          (roc_auc_score(train_y, train_y_score), roc_auc_score(test_y, test_y_score)))
def process(): data = load_training_data(settings, target, pipeline, strategy=strategy, cv_fold_number=fold, check_only=False, quiet=quiet) if feature_mask is not None: s = [slice(None),] * data.X_train.ndim s[-1] = np.where(np.array(feature_mask) == True)[0] data['X_train'] = data.X_train[s] data['X_cv'] = data.X_cv[s] if not quiet: print ' feature mask', 'X_train', data.X_train.shape, 'y_train', data.y_train.shape, 'X_cv', data.X_cv.shape, 'y_cv', data.y_cv.shape train(classifier, data, quiet=quiet) if not quiet: print "Making predictions...", timer = time.Timer() mean_predictions, median_predictions, raw_predictions = make_predictions(classifier, data.X_cv, data.num_cv_segments) if not quiet: print timer.pretty_str() mean_score = roc_auc_score(data.y_cv, mean_predictions) median_score = roc_auc_score(data.y_cv, median_predictions) return jsdict({ 'mean_score': mean_score, 'median_score': median_score, 'mean_predictions': mean_predictions, 'median_predictions': median_predictions, 'y_cv': data.y_cv })
def show_roc(fold, targets, pred):
    # print 'fold : ', fold
    # print 'Size of targets : ', len(targets)
    # print 'Size of predictions : ', len(pred)
    roc_labels = []
    for t in targets:
        if t > 0.0:
            roc_labels.append(1)
        else:
            roc_labels.append(0)
    print roc_auc_score(roc_labels, pred)

    # plots
    fpr, tpr, thresholds = roc_curve(roc_labels, pred)
    roc_auc = auc(fpr, tpr)
    # print fpr, ' , ', tpr, ' , ', roc_auc
    print fold, ' , ', roc_auc
    plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (fold, roc_auc))
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
def evalmetric(pred, truth):
    return 'auc_mine', metrics.roc_auc_score(truth.get_label(), pred)


# The threshold-sweep evaluator below ends in a `return`, so it must live inside a
# function; the original name was lost in extraction, so `mcc_eval` is a placeholder.
def mcc_eval(pred, truth):
    thresholds = np.arange(99.6, 99.9, 0.025)
    bestScore = 0
    bestT = 0
    bestAcc = 0
    bestCf = np.zeros((2, 2))
    thresholds = [0.10]
    for t in thresholds:
        temp = np.copy(pred)
        temp[np.where(pred > np.percentile(pred, t))] = 1
        temp[np.where(pred <= np.percentile(pred, t))] = 0
        score = metrics.matthews_corrcoef(truth.get_label(), temp)
        if score > bestScore:
            bestScore = score
            bestT = np.percentile(pred, t)
            # roc_auc_score takes no `reorder` argument; store into bestAcc, which is what gets printed
            bestAcc = metrics.roc_auc_score(truth.get_label(), temp)
            bestCf = metrics.confusion_matrix(truth.get_label(), temp)
    print('threshold {} mcc {} auc {} TN {} FP {} FN {} TP {}\n'.format(
        bestT, bestScore, bestAcc, bestCf[0][0], bestCf[0][1], bestCf[1][0], bestCf[1][1]))
    return 'mcc', -1 * bestScore
def train_and_evaluate(): nn_training_error = 0 nn_test_error = 0 training_error = 0 test_error = 0 for train, test in ss: # Train NN nn.initialize(x[train]) #print 'NN pre-training train error: %f' % metrics.mean_absolute_error(y[train], nn.predict(x[train]).reshape(x[train].shape[0],)) #print 'NN pre-training f1 score: %f' %metrics.f1_score(y[train], preprocessing.Binarizer(threshold=0.5).transform(nn.predict(x[train])).T) #print 'NN pre-training auc score: %f' %metrics.roc_auc_score(y[train], nn.predict(x[train]).T) nn.train(x[train], y[train], passes=500, alpha=0.7, lam=0.0) cat=1 nn_training_auc = metrics.roc_auc_score(y[train][:,cat], nn.predict(x[train]).T[:,cat]) nn_test_auc = metrics.roc_auc_score(y[test][:,cat], nn.predict(x[test]).T[:,cat]) nn_training_error = metrics.f1_score(y[train][:,cat], preprocessing.Binarizer(threshold=0.5).transform(nn.predict(x[train])).T[:,cat]) nn_test_error = metrics.f1_score(y[test][:,cat], preprocessing.Binarizer(threshold=0.5).transform(nn.predict(x[test])).T[:,cat]) #nn_training_error += metrics.mean_absolute_error(y[train], nn.predict(x[train]).reshape(x[train].shape[0],)) #nn_test_error += metrics.mean_absolute_error(y[test], nn.predict(x[test]).reshape(x[test].shape[0],)) print 'NN F1: (Training) %f, (Test) %f' %(nn_training_error, nn_test_error) print 'NN AUC: (Training) %f, (Test) %f' %(nn_training_auc, nn_test_auc)
def modelSelection(x_train, y_train, x_test, y_test, model, n_folds): """ Select various models and return the AUCs of training and test sets and predicted offer acceptance probabilities. """ if model == "Random Forest": clf = RandomForestClassifier(n_estimators=150, oob_score=True, random_state=0, min_samples_split=1) elif model == "Logistic Regression L1": clf = LogisticRegression(penalty='l1', random_state=0, class_weight='auto') elif model == "Logistic Regression L2": clf = LogisticRegression(penalty='l2', random_state=0, class_weight='auto') elif model == "Decision Tree": clf = DecisionTreeClassifier(random_state=0) elif model == "Naive Bayes": clf = GaussianNB() elif model == "KNN": clf = KNeighborsClassifier(n_neighbors=10) # Perform cross-validation on training dataset and calculate AUC cv = StratifiedKFold(y_train, n_folds=n_folds) auc_train = [] auc_validation = [] auc_test = [] pred_prob = [] for i, (train, validation) in enumerate(cv): clf = clf.fit(x_train[train], y_train[train]) auc_train.append(metrics.roc_auc_score(y_train[train], clf.predict_proba(x_train[train])[:, 1])) auc_validation.append(metrics.roc_auc_score(y_train[validation], clf.predict_proba(x_train[validation])[:, 1])) auc_test.append(metrics.roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1])) pred_prob.append(clf.predict_proba(x_test)[:, 1]) return np.mean(auc_train), np.mean(auc_validation), np.mean(auc_test), np.mean(pred_prob, axis=0)
model = MLPClassifier(n_classes=19, n_input=X.shape[1])
model = model.cuda()

train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True, pin_memory=True)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(50):
    _ = model.train()
    for x, y in train_loader:
        x, y = x.cuda(), y.cuda()
        out = model(x)
        loss = F.binary_cross_entropy_with_logits(out, y)
        opt.zero_grad()
        loss.backward()
        opt.step()

    _ = model.eval()
    z = model(X_valid.cuda())
    p_valid = to_numpy(z)
    auc_valid = [metrics.roc_auc_score(y, p) for y, p in zip(y_valid.T, p_valid.T)]
    print(np.mean(auc_valid))
bounds=[(0.0, 1.0) for _ in range(len(weights))], args=(valid_label_c, pred_validation), maxiter=1000, tol=1e-7) if VERBOSE: print(weights_optim.x) scores = np.average(pred_test, weights=weights_optim.x, axis=0) else: scores = [] for i in range(0, len(test_id)): if ENSEMBLE_LEARNING == ensemble_learning_type.soft_voting: max_prob = asy_pred_score[i] if bor_pred_score[i] > max_prob: max_prob = bor_pred_score[i] if col_pred_score[i] > max_prob: max_prob = col_pred_score[i] scores.append(max_prob) else: if ENSEMBLE_LEARNING == ensemble_learning_type.averaging: scores.append((asy_pred_score[i] + bor_pred_score[i] + col_pred_score[i]) / 3.0) aucs = [] auc = roc_auc_score(test_label_c, scores) aucs.append(auc) if CONV_LAYER_FROZEN: filename = os.path.join(pipeline, 'out', 'aucs_frozen' + str(seed) + '.csv') else: filename = os.path.join(pipeline, 'out', 'aucs_not_frozen' + str(seed) + '.csv') report_auc(aucs, filename)
# aggregate table to view statistics
print(tabulate(X.describe(), X))

# fill NULLS
X["Age"].fillna(X.Age.mean(), inplace=True)

# BUILD FAST SIMPLE MODEL TO GET FIRST BENCHMARK*********************************
# get numeric variables
numeric_variables = list(X.dtypes[X.dtypes != "object"].index)
print(tabulate((X[numeric_variables].head()), X))

# build model
model = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=42)
model.fit(X[numeric_variables], y)

# model score is c-stat.
# model.oob_score
y_oob = model.oob_prediction_
print("c-stat: ", roc_auc_score(y, y_oob))
# print(y_oob)  # probability of survival (this is what is then converted into classes)

# *******************************************************************************
# # function that describes categorical variables
# def describe_categorical(X):
#     print("EOF")
def test_ann(word2vec_path, model_number): # Parameters # ============================================================================= logger = feed.logger_fn("tflog", "logs/test-{0}.log".format(time.asctime())) # MODEL = input("☛ Please input the model file you want to test, " # "it should be like(1490175368): ") MODEL = str(model_number) while not (MODEL.isdigit() and len(MODEL) == 10): MODEL = input("✘ The format of your input is illegal, " "it should be like(1490175368), please re-input: ") logger.info("✔︎ The format of your input is legal, " "now loading to next step...") TRAININGSET_DIR = 'models/citability/data/Train.json' VALIDATIONSET_DIR = 'models/citability/data/Validation.json' # TEST_DIR = 'data/Test.json' cwd = os.getcwd() TEST_DIR = os.path.join(cwd, 'web/test_data.json') cwd = os.getcwd() MODEL_DIR = os.path.join(cwd, 'web/runs/' + MODEL + '/checkpoints/') print(MODEL_DIR) BEST_MODEL_DIR = 'runs/' + MODEL + '/bestcheckpoints/' SAVE_DIR = 'results/' + MODEL # Data Parameters tf.flags.DEFINE_string("training_data_file", TRAININGSET_DIR, "Data source for the training data.") tf.flags.DEFINE_string("validation_data_file", VALIDATIONSET_DIR, "Data source for the validation data") tf.flags.DEFINE_string("test_data_file", TEST_DIR, "Data source for the test data") tf.flags.DEFINE_string("checkpoint_dir", MODEL_DIR, "Checkpoint directory from training run") tf.flags.DEFINE_string("best_checkpoint_dir", BEST_MODEL_DIR, "Best checkpoint directory from training run") # Model Hyperparameters tf.flags.DEFINE_integer( "pad_seq_len", 35842, "Recommended padding Sequence length of data " "(depends on the data)") tf.flags.DEFINE_integer( "embedding_dim", 300, "Dimensionality of character embedding " "(default: 128)") tf.flags.DEFINE_integer("embedding_type", 1, "The embedding type (default: 1)") tf.flags.DEFINE_integer( "fc_hidden_size", 1024, "Hidden size for fully connected layer " "(default: 1024)") tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)") tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)") tf.flags.DEFINE_integer("num_classes", 80, "Number of labels (depends on the task)") tf.flags.DEFINE_integer("top_num", 80, "Number of top K prediction classes (default: 5)") tf.flags.DEFINE_float("threshold", 0.5, "Threshold for prediction classes (default: 0.5)") # Test Parameters tf.flags.DEFINE_integer("batch_size", 1, "Batch Size (default: 1)") # Misc Parameters tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") tf.flags.DEFINE_boolean("gpu_options_allow_growth", True, "Allow gpu options growth") FLAGS = tf.flags.FLAGS FLAGS(sys.argv) dilim = '-' * 100 logger.info('\n'.join([ dilim, *[ '{0:>50}|{1:<50}'.format(attr.upper(), FLAGS.__getattr__(attr)) for attr in sorted(FLAGS.__dict__['__wrapped']) ], dilim ])) """Test ANN model.""" # Load data logger.info("✔︎ Loading data...") logger.info("Recommended padding Sequence length is: {0}".format( FLAGS.pad_seq_len)) logger.info("✔︎ Test data processing...") test_data = feed.load_data_and_labels(FLAGS.test_data_file, FLAGS.num_classes, FLAGS.embedding_dim, data_aug_flag=False, word2vec_path=word2vec_path) logger.info("✔︎ Test data padding...") x_test, y_test = feed.pad_data(test_data, FLAGS.pad_seq_len) y_test_labels = test_data.labels # Load ann model # BEST_OR_LATEST = input("☛ Load Best or Latest Model?(B/L): ") BEST_OR_LATEST = 'L' while 
not (BEST_OR_LATEST.isalpha() and BEST_OR_LATEST.upper() in ['B', 'L']): BEST_OR_LATEST = \ input("✘ The format of your input is illegal, please re-input: ") if BEST_OR_LATEST.upper() == 'B': logger.info("✔︎ Loading best model...") checkpoint_file = checkpoints.get_best_checkpoint( FLAGS.best_checkpoint_dir, select_maximum_value=True) else: logger.info("✔︎ Loading latest model...") checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) logger.info(checkpoint_file) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth sess = tf.Session(config=session_conf) with sess.as_default(): # Load the saved meta graph and restore variables saver = tf.train.import_meta_graph( "{0}.meta".format(checkpoint_file)) saver.restore(sess, checkpoint_file) # Get the placeholders from the graph by name input_x = graph.get_operation_by_name("input_x").outputs[0] input_y = graph.get_operation_by_name("input_y").outputs[0] dropout_keep_prob = graph.get_operation_by_name( "dropout_keep_prob").outputs[0] is_training = graph.get_operation_by_name("is_training").outputs[0] # Tensors we want to evaluate scores = graph.get_operation_by_name("output/scores").outputs[0] loss = graph.get_operation_by_name("loss/loss").outputs[0] # Split the output nodes name by '|' if you have several output # nodes output_node_names = "output/scores" # Save the .pb model file output_graph_def = tf.graph_util.convert_variables_to_constants( sess, sess.graph_def, output_node_names.split("|")) tf.train.write_graph(output_graph_def, "graph", "graph-ann-{0}.pb".format(MODEL), as_text=False) # Generate batches for one epoch batches = feed.batch_iter(list(zip(x_test, y_test, y_test_labels)), FLAGS.batch_size, 1, shuffle=False) test_counter, test_loss = 0, 0.0 test_pre_tk = [0.0] * FLAGS.top_num test_rec_tk = [0.0] * FLAGS.top_num test_F_tk = [0.0] * FLAGS.top_num # Collect the predictions here true_labels = [] predicted_labels = [] predicted_scores = [] # Collect for calculating metrics true_onehot_labels = [] predicted_onehot_scores = [] predicted_onehot_labels_ts = [] predicted_onehot_labels_tk = [[] for _ in range(FLAGS.top_num)] for batch_test in batches: x_batch_test, y_batch_test, y_batch_test_labels = zip( *batch_test) print("x_batch_test", x_batch_test) print("y_batch_test", y_batch_test) feed_dict = { input_x: x_batch_test, input_y: y_batch_test, dropout_keep_prob: 1.0, is_training: False } batch_scores, cur_loss = sess.run([scores, loss], feed_dict) # Prepare for calculating metrics for i in y_batch_test: true_onehot_labels.append(i) for j in batch_scores: predicted_onehot_scores.append(j) # Get the predicted labels by threshold batch_predicted_labels_ts, batch_predicted_scores_ts = \ feed.get_label_threshold(scores=batch_scores, threshold=FLAGS.threshold) # Add results to collection for i in y_batch_test_labels: true_labels.append(i) for j in batch_predicted_labels_ts: predicted_labels.append(j) for k in batch_predicted_scores_ts: predicted_scores.append(k) # Get onehot predictions by threshold batch_predicted_onehot_labels_ts = \ feed.get_onehot_label_threshold(scores=batch_scores, threshold=FLAGS.threshold) for i in batch_predicted_onehot_labels_ts: predicted_onehot_labels_ts.append(i) # Get onehot predictions by topK for top_num in range(FLAGS.top_num): batch_predicted_onehot_labels_tk = feed.\ 
get_onehot_label_topk(scores=batch_scores, top_num=top_num + 1) for i in batch_predicted_onehot_labels_tk: predicted_onehot_labels_tk[top_num].append(i) test_loss = test_loss + cur_loss test_counter = test_counter + 1 # Calculate Precision & Recall & F1 (threshold & topK) test_pre_ts = precision_score( y_true=np.array(true_onehot_labels), y_pred=np.array(predicted_onehot_labels_ts), average='micro') test_rec_ts = recall_score( y_true=np.array(true_onehot_labels), y_pred=np.array(predicted_onehot_labels_ts), average='micro') test_F_ts = f1_score(y_true=np.array(true_onehot_labels), y_pred=np.array(predicted_onehot_labels_ts), average='micro') for top_num in range(FLAGS.top_num): test_pre_tk[top_num] = precision_score( y_true=np.array(true_onehot_labels), y_pred=np.array(predicted_onehot_labels_tk[top_num]), average='micro') test_rec_tk[top_num] = recall_score( y_true=np.array(true_onehot_labels), y_pred=np.array(predicted_onehot_labels_tk[top_num]), average='micro') test_F_tk[top_num] = f1_score( y_true=np.array(true_onehot_labels), y_pred=np.array(predicted_onehot_labels_tk[top_num]), average='micro') # Calculate the average AUC test_auc = roc_auc_score(y_true=np.array(true_onehot_labels), y_score=np.array(predicted_onehot_scores), average='micro') # Calculate the average PR test_prc = average_precision_score( y_true=np.array(true_onehot_labels), y_score=np.array(predicted_onehot_scores), average="micro") test_loss = float(test_loss / test_counter) logger.info( "☛ All Test Dataset: Loss {0:g} | AUC {1:g} | AUPRC {2:g}". format(test_loss, test_auc, test_prc)) # Predict by threshold logger.info( "☛ Predict by threshold: Precision {0:g}, Recall {1:g}, F1 {2:g}" .format(test_pre_ts, test_rec_ts, test_F_ts)) # Predict by topK logger.info("☛ Predict by topK:") for top_num in range(FLAGS.top_num): logger.info( "Top{0}: Precision {1:g}, Recall {2:g}, F {3:g}".format( top_num + 1, test_pre_tk[top_num], test_rec_tk[top_num], test_F_tk[top_num])) # Save the prediction result if not os.path.exists(SAVE_DIR): os.makedirs(SAVE_DIR) feed.create_prediction_file(output_file=SAVE_DIR + "/predictions.json", data_id=test_data.testid, all_labels=true_labels, all_predict_labels=predicted_labels, all_predict_scores=predicted_scores) logger.info("✔︎ Done.")
res = model.fit(X, Y, batch_size=512, epochs=2, validation_data=(X_val, Y_val), callbacks=[stop])

# In[ ]:

Y_test = model.predict(X_test)

# In[ ]:

total = 0  # renamed from `sum`, which shadowed the builtin
for i in range(6):
    score = roc_auc_score(test_labels[:, i], Y_test[:, i])
    total += score

# In[ ]:

print(total / 6)

# In[ ]:
C = 10
l = len(Y)
x1 = X[:, 0]
x2 = X[:, 1]
n = 0
d = 10
y = []
while n < 10000 and d > 0.00001:
    v1 = w1
    v2 = w2
    a = 1 + np.exp(-Y * (w1 * x1 + w2 * x2))
    w1 = w1 + (k / l) * (np.sum(Y * x1 * (1 - 1 / a))) - k * C * w1
    w2 = w2 + (k / l) * (np.sum(Y * x2 * (1 - 1 / a))) - k * C * w2
    d = np.sqrt((w1 - v1) ** 2 + (w2 - v2) ** 2)
    n = n + 1

y.append(a)
y = [item for sublist in y for item in sublist]
y[:] = [x - 1 for x in y]
print(type(y))

zero = []
np.array(zero)
zero.append(np.zeros((39,), dtype=np.int))
zero1 = [item for sublist in zero for item in sublist]
# np.append returns a new array; this call leaves Y unchanged
np.append(Y, zero1)
y = np.asarray(y)
# roc_auc_score expects (y_true, y_score): the labels Y must come first
print(roc_auc_score(Y, y))
def evaluate_model(label_df, y_predicted, **kwargs): """Evaluate the performance of the model Args: label_df (:py:class:`pandas.DataFrame`): Dataframe containing true y label y_predicted (:py:class:`pandas.DataFrame`): Dataframe containing predicted probability and score Returns: confusion_df (:py:class:`pandas.DataFrame`): Dataframe reporting confusion matrix """ try: # get predicted scores y_pred_prob = y_predicted.iloc[:, 0] y_pred = y_predicted.iloc[:, 1] # get true labels y_true = label_df.iloc[:, 0] # raise IndexError when the input dataframe does not have two columns as desired except: raise IndexError('Index out of bounds!') # check if label_df and y_predicted have only numeric columns for col in label_df.columns: if label_df[col].dtype not in [ np.dtype('float64'), np.dtype('float32'), np.dtype('int64') ]: raise ValueError( 'Input dataframe can only have numeric or boolean types!') for col in y_predicted.columns: if y_predicted[col].dtype not in [ np.dtype('float64'), np.dtype('float32'), np.dtype('int64') ]: raise ValueError( 'Input dataframe can only have numeric or boolean types!') # classification metrics can only take binary classes - 0 or 1 in this case # check if y_pred and label_df are all either 0 or 1 if (not y_pred.isin([0, 1]).all()) or (not y_true.isin([0, 1]).all()): raise ValueError('Class can only be 0 or 1!') # check if predicted probabilities are within 0-1 if not y_pred_prob.between(0, 1, inclusive=True).all(): raise ValueError('Probabilities needs to be in 0-1 range!') # calculate auc and accuracy and f1_score if specified if "auc" in kwargs["metrics"]: auc = roc_auc_score(label_df, y_pred_prob) print('AUC on test: %0.3f' % auc) if "accuracy" in kwargs["metrics"]: accuracy = accuracy_score(label_df, y_pred) print('Accuracy on test: %0.3f' % accuracy) if "f1_score" in kwargs["metrics"]: f1 = f1_score(label_df, y_pred) print('F1-score on test: %0.3f' % f1) # generate confusion matrix and classification report print(classification_report(label_df, y_pred)) confusion = confusion_matrix(label_df, y_pred) print(confusion) confusion_df = pd.DataFrame( confusion, index=['Actual Negative', 'Actual Positive'], columns=['Predicted Negative', 'Predicted Positive']) return confusion_df
def evaluate(model, val_loader): model.eval() outputs = [step(model, batch) for batch in val_loader] Preds = [x['preds'] for x in outputs] Labels = [x['labels'] for x in outputs] Outs = [x['out'] for x in outputs] Preds = torch.cat(Preds, dim=0).cpu() Labels = torch.cat(Labels, dim=0).cpu() Outs = torch.cat(Outs, dim=0).cpu() Scores = F.softmax(Outs, dim=1) print(Preds, Labels, Scores) print(Preds.size(), len(Labels), Scores.size()) print('General Evaluation') # Precision | Recall | F1 - score | AUC acc_all = accuracy_score(Labels, Preds) ap_all = precision_score(Labels, Preds, average='macro') ar_all = recall_score(Labels, Preds, average='macro') f1_all = f1_score(Labels, Preds, average='macro') print(acc_all, ap_all, ar_all, f1_all) y_pred = [] y_true = [] for i in range(1900): if Preds[i] == 0: y_pred.append('AMD') elif Preds[i] == 1: y_pred.append('DME') elif Preds[i] == 2: y_pred.append('NM') elif Preds[i] == 3: y_pred.append('PCV') elif Preds[i] == 4: y_pred.append('PM') if Labels[i] == 0: y_true.append('AMD') elif Labels[i] == 1: y_true.append('DME') elif Labels[i] == 2: y_true.append('NM') elif Labels[i] == 3: y_true.append('PCV') elif Labels[i] == 4: y_true.append('PM') t1 = classification_report(y_true, y_pred, target_names=['AMD', 'DME', 'NM', 'PCV', 'PM']) t2 = classification_report(y_true, y_pred, output_dict=True, target_names=['AMD', 'DME', 'NM', 'PCV', 'PM']) print(t1) print(t2) # draw confuse matrix classes = ['AMD', 'DME', 'NM', 'PCV', 'PM'] cm = confusion_matrix(y_true, y_pred) fig, ax = plt.subplots() plt.imshow(cm, cmap=plt.cm.Greens) indices = range(len(cm)) plt.xticks(indices, classes) plt.yticks(indices, classes) plt.colorbar() plt.xlabel('Pred') plt.ylabel('True') for first_index in range(len(cm)): for second_index in range(len(cm[first_index])): plt.text(first_index, second_index, cm[first_index][second_index]) fig.savefig("./img/{}/Best-cm-img{}.png".format(args.model, args.bsize), dpi=320, format='png') # cal auc roc_ovr = roc_auc_score(Labels, Scores, multi_class='ovr') print('--roc-ovr:', roc_ovr) roc_ovo = roc_auc_score(Labels, Scores, multi_class='ovo') print('--roc-ovo:', roc_ovo)
for TRAIN_INDEX, TEST_INDEX in SKF.split(X_DATA, Y_DATA):
    X_TRAIN = X_DATA[TRAIN_INDEX]
    X_TEST = X_DATA[TEST_INDEX]
    Y_TRAIN = Y_DATA[TRAIN_INDEX]
    Y_TEST = Y_DATA[TEST_INDEX]
    X_RES = X_TRAIN
    Y_RES = Y_TRAIN
    classifier = KNeighborsClassifier(n_neighbors=2)
    classifier.fit(X_RES, Y_RES)
    Y_PRED = classifier.predict(X_TEST)
    CM = np.add(CM, confusion_matrix(Y_TEST, Y_PRED))
    Y_TEST_TOTAL = np.concatenate((Y_TEST_TOTAL, Y_TEST))
    Y_PRED_TOTAL = np.concatenate((Y_PRED_TOTAL, Y_PRED))

prec = CM[1][1] / (CM[1][1] + CM[0][1])
rec = CM[1][1] / (CM[1][1] + CM[1][0])
fmes = 2 * prec * rec / (prec + rec)
auc = roc_auc_score(Y_TEST_TOTAL, Y_PRED_TOTAL)
balan = bal(CM)
print(str(prec) + ' ' + str(rec) + ' ' + str(fmes) + ' ' + str(auc) + ' ' + str(balan))
# print('Confusion Matrix')
# print(CM)
# print('Precision: ' + str(prec))
# print('Recall: ' + str(rec))
# print('fmeasure: ' + str(fmes))
# print('Balance: ' + str(balan))
def class_prob(simulation_name, device, csv_files, test_idx, epoch_max=40, batch_size=1, logits=False, calibrator=None, mscourse=None): csv_file_path = csv_files["path"] csv_file_tags = csv_files["tags"] csv_file_cov = csv_files["cov"] N = pd.read_csv(csv_file_tags)["ID"].nunique() validation = True if "cont" in csv_file_path: val_options = { "T_val": 1095, "max_val_samples": 1, "T_closest": 1825, "T_val_from": 1460 } else: val_options = { "T_val": 36, "max_val_samples": 1, "T_closest": 60, "T_val_from": 48 } if mscourse is not None: df_cov = pd.read_csv(csv_file_cov) test_idx = np.array(df_cov.loc[(df_cov.ID.isin(test_idx)) & ((df_cov[mscourse] > 0).any(1)), "ID"].unique().tolist()) data_test = data_utils.ODE_Dataset(csv_file=csv_file_path, label_file=csv_file_tags, cov_file=csv_file_cov, idx=test_idx, validation=validation, val_options=val_options) dl_test = DataLoader(dataset=data_test, collate_fn=data_utils.custom_collate_fn, shuffle=False, batch_size=batch_size) params_dict = np.load(f"./trained_models/{simulation_name}_params.npy", allow_pickle=True).item() nnfwobj = gru_ode.NNFOwithBayesianJumps( input_size=params_dict["input_size"], hidden_size=params_dict["hidden_size"], p_hidden=params_dict["p_hidden"], prep_hidden=params_dict["prep_hidden"], logvar=params_dict["logvar"], mixing=params_dict["mixing"], classification_hidden=params_dict["classification_hidden"], cov_size=params_dict["cov_size"], cov_hidden=params_dict["cov_hidden"], dropout_rate=params_dict["dropout_rate"], full_gru_ode=params_dict["full_gru_ode"]) nnfwobj.to(device) nnfwobj.load_state_dict( torch.load(f"./trained_models/{simulation_name}_MAX.pt")) class_criterion = torch.nn.BCEWithLogitsLoss(reduction='sum') val_metric_prev = -1000 nnfwobj.eval() class_preds = [] labels_list = [] for i, b in enumerate(tqdm.tqdm(dl_test)): prob_path = [] times = b["times"] time_ptr = b["time_ptr"] X = b["X"].to(device) M = b["M"].to(device) obs_idx = b["obs_idx"] cov = b["cov"].to(device) labels = b["y"].to(device) batch_size = labels.size(0) if labels.shape[0] > 1: _, _, class_pred, _ = nnfwobj(times, time_ptr, X, M, obs_idx, delta_t=params_dict["delta_t"], T=params_dict["T"], cov=cov) if logits: return labels.detach().cpu().numpy(), class_pred.detach().cpu( ).numpy() else: return labels.detach().cpu().numpy(), torch.sigmoid( class_pred).detach().cpu().numpy() for samp in range(0, len(times) + 1): times_samp = times[:samp] time_ptr_samp = time_ptr[:samp] X_samp = X[:samp] M_samp = M[:samp] obs_idx_samp = obs_idx[:samp] hT, loss, class_pred, _ = nnfwobj(times, time_ptr, X_samp, M_samp, obs_idx_samp, delta_t=params_dict["delta_t"], T=params_dict["T"], cov=cov) prob_path += [clf.predict_proba((class_pred).detach().cpu())[:, 1]] class_preds += [class_pred.detach().cpu().numpy().item()] labels_list += [labels.detach().cpu().numpy().item()] plt.figure() times /= 12 times -= 3 fig, ax1 = plt.subplots() color = 'tab:red' ax1.set_xlabel('Time before visit [Years]') ax1.set_ylabel('EDSS', color=color) edss_x = np.round(2 * (X.detach().cpu().numpy() * 1.6764 + 2.4818)) / 2 ax1.scatter(times, edss_x, color=color) ax1.tick_params(axis='y', labelcolor=color) ax1.set_ylim(edss_x.min() - 1, edss_x.max() + 1) min_tick = np.max((0, edss_x.min() - 0.5)) max_tick = np.min((10, edss_x.max() + 1)) ax1.set_yticks(np.arange(min_tick, max_tick, step=0.5)) ax2 = ax1.twinx( ) # instantiate a second axes that shares the same x-axis color = 'tab:blue' ax2.set_ylabel('Probability', color=color) # we already handled the x-label with ax1 
ax2.step(np.concatenate((np.array([-3]), times)), prob_path, where="post", color=color) ax2.tick_params(axis='y', labelcolor=color) ax2.set_ylim((0, 1)) fig.tight_layout() #plt.scatter(times,X.detach().cpu().numpy()) #plt.step(np.concatenate((np.array([0]),times)), prob_path, where = "post") plt.title( f"Progression of the worsening prediction over time. Label : {labels.detach().cpu().numpy()[0][0]}" ) fig.savefig(f"./figs/prob_prop_{i}.pdf") plt.close(fig) plt.close("all") #if i >100: # break print(roc_auc_score(np.array(labels_list), np.array(class_preds))) return class_preds, labels_list
# 6. Fit model
model.fit(x_train, y_train, batch_size=batchSize, epochs=E, verbose=1,
          validation_data=(x_val, y_val), callbacks=[model_checkpoint])

# 7. Evaluate model prediction
# Based on all 4 runs, CNN model weights from epoch 3 appears to have the
# lowest validation loss and highest validation accuracy.
# So we load this model to determine ROC_AUC score
# so, we load the weights from the third epoch
model.load_weights(output_dir + "\\weights.03.hdf5")

# Compute predictions
y_hat = model.predict(x_val)

# Visualise distribution of predicted y_hat
plt.hist(y_hat)
plt.axvline(0.5, color="orange")

# Measure performance with ROC_AUC score
pct_auc = roc_auc_score(y_val, y_hat) * 100.0
print("ROC AUC = %.2f percent" % pct_auc)
# Convolutional neural net : 95.24% (model weights from epoch #3)
cm = confusion_matrix(y_test, y_predEnsem)
print("time taken: ", round(time() - t, 3), "s")
print(classification_report(y_test, y_predEnsem, target_names=['brand', 'female', 'male']))
print("accuracy: ", ensemble_classifier.score(x_test, y_test))

# plotting roc curve
y_test = [i if (i == 1) else 0 for i in y_test]
plt.figure()
plt.subplots(figsize=(8, 6))

y_predGNB = [i if (i == 1) else 0 for i in y_predGNB]
fpr, tpr, _ = roc_curve(y_test, y_predGNB, pos_label=1)
plt.plot(fpr, tpr, 'b', label="Naive Bayes, AUC=" + str(round(roc_auc_score(y_test, y_predGNB), 3)))

y_predRFC = [i if (i == 1) else 0 for i in y_predRFC]
fpr, tpr, _ = roc_curve(y_test, y_predRFC)
plt.plot(fpr, tpr, 'r', label="Random Forest, AUC=" + str(round(roc_auc_score(y_test, y_predRFC), 3)))

y_predLR = [i if (i == 1) else 0 for i in y_predLR]
fpr, tpr, _ = roc_curve(y_test, y_predLR)
plt.plot(fpr, tpr, 'g', label="Logistic Regression, AUC=" + str(round(roc_auc_score(y_test, y_predLR), 3)))

y_predEnsem = [i if (i == 1) else 0 for i in y_predEnsem]
fpr, tpr, _ = roc_curve(y_test, y_predEnsem)
plt.plot(fpr, tpr, 'y', label="Ensemble, AUC=" + str(round(roc_auc_score(y_test, y_predEnsem), 3)))

plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.legend()
X_train, X_test, y_train, y_test = train_test_split(hdd, hdd_labels, test_size=0.2)
smote = SMOTE(kind="regular")
X_train, y_train = smote.fit_sample(X_train, y_train)
# clf = ensemble.RandomForestClassifier()
clf = tree.DecisionTreeClassifier(max_depth=None, criterion='gini', min_samples_split=3,
                                  min_samples_leaf=2, max_leaf_nodes=5)
clf = clf.fit(X_train, y_train)
preds = clf.predict_proba(X_test)
preds_ = clf.predict(X_test)
roc_auc = metrics.roc_auc_score(y_true=y_test, y_score=preds[:, 1])
print('roc_auc', roc_auc)
print('NACC', metrics.recall_score(y_true=y_test, y_pred=preds_))
print('accuracy', metrics.accuracy_score(y_true=y_test, y_pred=preds_))
# note: this snippet is excerpted from inside a retry loop, hence the bare `break` below
if ((metrics.recall_score(y_true=y_test, y_pred=preds_) > 0.8) &
        (metrics.accuracy_score(y_true=y_test, y_pred=preds_) > 0.8)):
    break
# recall.append(metrics.recall_score(y_true=test_label, y_pred=preds_))
# accuracy.append(metrics.accuracy_score(y_true=test_label, y_pred=preds_))
fpr, tpr, thresholds = metrics.roc_curve(y_test, preds[:, 1])
fig = plt.figure()
plt.title('ROC curve')
plt.plot(fpr, tpr, 'b')
plt.legend(loc='lower right')
for c in df_train.columns:
    if c != 'ncodpers':
        print(c)
        y_train = df_train[c]
        x_train = df_train.drop([c, 'ncodpers'], 1)
        clf = LogisticRegression(solver='saga', max_iter=400)
        clf.fit(x_train, y_train)
        p_train = clf.predict_proba(x_train)[:, 1]
        models[c] = clf
        model_preds[c] = p_train
        for id, p in zip(ids, p_train):
            id_preds[id].append(p)
        print(roc_auc_score(y_train, p_train))

already_active = {}
for row in df_train.values:
    row = list(row)
    id = row.pop(0)
    active = [c[0] for c in zip(df_train.columns[1:], row) if c[1] > 0]
    already_active[id] = active

train_preds = {}
for id, p in id_preds.items():
    # Here be dragons
    preds = [
        i[0] for i in sorted([
            i for i in zip(df_train.columns[1:], p)
            if i[0] not in already_active[id]
# print(outputs[-1], targets)
losses.append(criterion(outputs[-1], targets))

loss = sum(losses) / len(batch_icd)
loss.backward()
optimizer.step()

## Validation phase
vpredictions = np.zeros(len(valid_input_seqs))
for i in range(len(valid_input_seqs)):
    test_seq = valid_input_seqs[i]
    vpredictions[i] = model.predict(
        Variable(torch.from_numpy(convert_to_one_hot(test_seq)).float()))
print("Validation Test AUC_ROC: ", roc_auc_score(valid_labels, vpredictions))

## Testing phase
predictions = np.zeros(len(test_input_seqs))
for i in range(len(test_input_seqs)):
    test_seq = test_input_seqs[i]
    predictions[i] = model.predict(
        Variable(torch.from_numpy(convert_to_one_hot(test_seq)).float()))
print("Test AUC_ROC: ", roc_auc_score(test_labels, predictions))
# actual_predictions = (predictions > 0.5) * 1
# print(classification_report(test_labels, actual_predictions))

aucrocs.append(roc_auc_score(test_labels, predictions))
best_aucrocs.append(max(aucrocs))
def train_model():
    x_train, x_test, y_train, y_test = preprocess_data()

    clf = RandomForestClassifier(n_estimators=500,
                                 max_depth=10,
                                 random_state=18,
                                 max_leaf_nodes=64,
                                 verbose=1,
                                 n_jobs=4)
    scores_rfc = []
    # models1 = []
    # initialise the folds; StratifiedKFold keeps the class-imbalance ratio the same in every fold
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
    for i, (train_idx, valid_idx) in enumerate(kf.split(x_train, y_train)):
        print('...... training {}th fold \n'.format(i + 1))
        tr_x = x_train[train_idx]
        tr_y = y_train[train_idx]
        val_x = x_train[valid_idx]
        val_y = y_train[valid_idx]

        model = clf
        model.fit(tr_x, tr_y)
        # class-1 probabilities, so the AUC and the threshold sweep below are meaningful
        pred_val_y = model.predict_proba(val_x)[:, 1]
        # measure the model on the validation fold
        score_rfc = roc_auc_score(val_y, pred_val_y)
        scores_rfc.append(score_rfc)
        print('current performance by auc:{}'.format(score_rfc))
        # auc_scores1.append(auc)
        # models1.append(model)

    # pick the probability cut-off that maximises F1 on the last validation fold
    best_f1 = -np.inf
    best_thred = 0
    v = [i * 0.01 for i in range(50)]
    for thred in v:
        preds = (pred_val_y > thred).astype(int)
        f1 = f1_score(val_y, preds)
        if f1 > best_f1:
            best_f1 = f1
            best_thred = thred
    y_pred_rfc = (pred_val_y > best_thred).astype(int)
    print(confusion_matrix(val_y, y_pred_rfc))
    print(f1_score(val_y, y_pred_rfc))
    print('the average auc is:{}'.format(np.mean(scores_rfc)))

    model_lgb = lgb.LGBMClassifier(
        n_jobs=4,
        n_estimators=10000,
        boost_from_average='false',
        learning_rate=0.01,
        num_leaves=64,
        num_threads=4,
        max_depth=-1,
        tree_learner="serial",
        feature_fraction=0.7,
        bagging_freq=5,
        bagging_fraction=0.7,
        min_data_in_leaf=100,
        silent=-1,
        verbose=-1,
        max_bin=255,
        bagging_seed=11,
    )
    auc_scores = []
    models = []
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
    for i, (train_idx, valid_idx) in enumerate(kf.split(x_train, y_train)):
        print('...... training {}th fold \n'.format(i + 1))
        tr_x = x_train[train_idx]
        tr_y = y_train[train_idx]
        va_x = x_train[valid_idx]
        va_y = y_train[valid_idx]

        # re-initialise the estimator each fold so every entry in `models` is a separately fitted model
        model = lgb.LGBMClassifier(**model_lgb.get_params())
        model.fit(tr_x, tr_y,
                  eval_set=[(tr_x, tr_y), (va_x, va_y)],
                  eval_metric='auc',
                  verbose=500,
                  early_stopping_rounds=300)

        # calculate the fold AUC after training the model
        pred_va_y = model.predict_proba(va_x, num_iteration=model.best_iteration_)[:, 1]
        auc = roc_auc_score(va_y, pred_va_y)
        print('current best auc score is:{}'.format(auc))
        auc_scores.append(auc)
        models.append(model)

    # F1-optimal threshold on the last validation fold
    best_f1 = -np.inf
    best_thred = 0
    v = [i * 0.01 for i in range(50)]
    for thred in v:
        preds = (pred_va_y > thred).astype(int)
        f1 = f1_score(va_y, preds)
        if f1 > best_f1:
            best_f1 = f1
            best_thred = thred
    y_pred_lgb = (pred_va_y > best_thred).astype(int)
    print(confusion_matrix(va_y, y_pred_lgb))
    print(f1_score(va_y, y_pred_lgb))
    print('the average auc is:{}'.format(np.mean(auc_scores)))

    # plot the model ROC curve (last fold)
    fpr, tpr, _ = roc_curve(va_y, pred_va_y)
    plt.plot(fpr, tpr, marker='.', label='LGB model')
    # axis labels
    plt.title('ROC AUC CURVE')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # show the legend
    plt.legend()
    # show the plot
    plt.savefig('LGB ROC_auc_curve.png')
    plt.show()

    # Test data: average the five fold models' predictions
    pred_test_1 = models[0].predict_proba(x_test, num_iteration=models[0].best_iteration_)[:, 1]
    pred_test_2 = models[1].predict_proba(x_test, num_iteration=models[1].best_iteration_)[:, 1]
    pred_test_3 = models[2].predict_proba(x_test, num_iteration=models[2].best_iteration_)[:, 1]
    pred_test_4 = models[3].predict_proba(x_test, num_iteration=models[3].best_iteration_)[:, 1]
    pred_test_5 = models[4].predict_proba(x_test, num_iteration=models[4].best_iteration_)[:, 1]
    pred_test = (pred_test_1 + pred_test_2 + pred_test_3 + pred_test_4 + pred_test_5) / 5.0
    print(pred_test)
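# The F1 threshold sweep above appears twice; a small stand-alone helper (a sketch,
# not part of the original code) would keep it in one place:
import numpy as np
from sklearn.metrics import f1_score

def best_f1_threshold(y_true, y_prob, grid=None):
    """Return (threshold, f1) maximising F1 over a grid of probability cut-offs."""
    grid = grid if grid is not None else [i * 0.01 for i in range(50)]
    scored = [(t, f1_score(y_true, (y_prob > t).astype(int))) for t in grid]
    return max(scored, key=lambda pair: pair[1])

# usage: best_thred, best_f1 = best_f1_threshold(va_y, pred_va_y)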
y = data["class"]
# Impute missing values with the per-class mean
X = data.groupby("class").transform(lambda x: x.fillna(x.mean()))
# data["value"] = data.groupby("name").transform(lambda x: x.fillna(x.mean()))
# X = data.loc[:, "Attr1":"Attr64"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=45, stratify=y)

rf = GradientBoostingClassifier(n_estimators=1000)
rf.fit(X_train, y_train)
rf.score(X_train, y_train)
rf.score(X_test, y_test)
roc_auc_score(y_test, rf.predict(X_test))  # AUC from hard labels; predict_proba would score the ranking instead

from sklearn.metrics import confusion_matrix
import itertools

plt.figure(dpi=150)
cm = confusion_matrix(y_test, rf.predict(X_test))
plt.imshow(cm, cmap=plt.cm.Blues)
plt.colorbar()
plt.xticks([0, 1])
plt.yticks([0, 1])
plt.title("Predicting Polish Bankruptcy within 5 Years")
plt.ylabel("True")
plt.xlabel("Predicted")
fmt = '.2f'
thresh = cm.max() / 2.
n = 0
cv = []
for index_train, index_eval in kf.split(train, train_y):
    x_train, x_eval = train_x[index_train], train_x[index_eval]
    y_train, y_eval = train_y[index_train], train_y[index_eval]

    d_train = xgb.DMatrix(x_train, label=y_train)
    d_valid = xgb.DMatrix(x_eval, label=y_eval)
    watchlist = [(d_valid, 'valid')]

    bst = xgb.train(params, d_train, 5000, watchlist,
                    early_stopping_rounds=100, verbose_eval=100)

    print('Start predicting...')
    y_pred = bst.predict(xgb.DMatrix(x_eval))
    cv.append(roc_auc_score(y_eval, y_pred))

    print('start predicting on test...')
    testpreds = bst.predict(xgb.DMatrix(test.values))
    if n > 0:
        totalpreds = totalpreds + testpreds
    else:
        totalpreds = testpreds
    # bst.save_model('xgb_model_fold_{}.model'.format(n))
    n += 1

totalpreds = totalpreds / n
print('xgb best score', np.mean(cv))
# submit result
def non_error_roc_auc_score(y_true, y_pred):
    # roc_auc_score raises ValueError when y_true contains only one class;
    # fall back to 0.0 instead of propagating the error.
    try:
        return roc_auc_score(y_true, y_pred)
    except ValueError:
        return 0.0
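# Quick illustration of why the wrapper exists (the values are just an example):
# a fold whose y_true holds a single class would make roc_auc_score raise.
print(non_error_roc_auc_score([1, 1, 1], [0.2, 0.7, 0.9]))  # -> 0.0 instead of ValueError
print(non_error_roc_auc_score([0, 1, 1], [0.2, 0.7, 0.9]))  # -> 1.0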
def test_auc(self, X, Y):
    y_pred = self.model.predict(X)
    return metrics.roc_auc_score(Y, y_pred)
cae.fit(X_test, X_test)
features = cae.get_output(X_test)

# Per-sample cosine similarity between the autoencoder output and the input
flat_output = np.reshape(features, (np.shape(X_test)[0], -1))
flat_input = np.reshape(X_test, (np.shape(X_test)[0], -1))
cosine_similarity = np.sum(flat_output * flat_input, -1) / (
    np.linalg.norm(flat_output, axis=-1) + 0.000001) / (
        np.linalg.norm(flat_input, axis=-1) + 0.000001)

tEnd = time.time()
tDiff = tEnd - tStart
with open(filename, 'a') as f_log:
    f_log.write("Time elapsed: " + str(tDiff) + "\n")

# Low similarity means anomalous, hence the negated score
auc = roc_auc_score(y_test, -cosine_similarity)
ap = average_precision_score(y_test, -cosine_similarity)
print("auc = ", auc)
print("ap = ", ap)
print("time elapse = ", tDiff)

aucs.append(auc)
aps.append(ap)
time_elapses.append(tDiff)
std_auc = np.std(aucs)
std_ap = np.std(aps)
std_time = np.std(time_elapses)
to_save_auc[cvalue][anomaly] = aucs
to_save_ap[cvalue][anomaly] = aps
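# Sanity-check sketch (not in the original): the manual per-sample cosine above
# should match the diagonal of sklearn's pairwise cosine_similarity, up to the
# 1e-6 stabiliser added to the norms.
from sklearn.metrics.pairwise import cosine_similarity as sk_cos

ref = np.diag(sk_cos(flat_output, flat_input))
print(np.allclose(cosine_similarity, ref, atol=1e-4))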
def compute_auc(self, X: np.ndarray, y: np.ndarray) -> float:
    """Distance to the hyperplane is used as the score for AUC-style metrics."""
    return metrics.roc_auc_score(y, self.decision_function(X))
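# A minimal stand-alone sketch of the same idea (the LinearSVC/make_classification
# setup below is illustrative, not from the original class): margin scores from
# decision_function can be passed straight to roc_auc_score.
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_auc_score

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
svm = LinearSVC(dual=False).fit(X_demo, y_demo)
print(roc_auc_score(y_demo, svm.decision_function(X_demo)))  # AUC from signed distances to the hyperplane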
def auc(X, y, model):
    probs = model.predict_proba(X)[:, 1]
    return roc_auc_score(y, probs)
filenames_list = np.concatenate(
    (filenames_list, np.array(filenames_current).reshape(-1, 1)), axis=0)
target_list = np.concatenate(
    (target_list, target_now.detach().numpy()), axis=0)

import pandas as pd

activation_dataframe = pd.DataFrame(filenames_list, columns=['Filename'])
activation_dataframe['Activations'] = activations_test
activation_dataframe['Target'] = target_list

# Saving dataframe to file
activation_dataframe.to_csv('activations_test.csv')

import sklearn
from sklearn.metrics import roc_auc_score

auroc_score = roc_auc_score(y_true=activation_dataframe['Target'],
                            y_score=activation_dataframe['Activations'])

threshold = 0.5

# Confusion matrix
activation_dataframe['Predicted'] = 0
activation_dataframe.loc[activation_dataframe['Activations'] > threshold, 'Predicted'] = 1

from sklearn.metrics import confusion_matrix

confusion_matrix_table = pd.DataFrame(confusion_matrix(
    y_true=activation_dataframe['Target'],
    y_pred=activation_dataframe['Predicted'],
    labels=[0, 1]), columns=[0, 1])

# Recall
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
df['Male'] = df['Sex'] == 'Male'
X = df[['Pclass', 'Male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values
y = df['Survived'].values
X_train, X_test, y_train, y_test = train_test_split(X, y)

model = LogisticRegression()  # select the model
model.fit(X_train, y_train)   # train the model
y_pred_proba = model.predict_proba(X_test)[:, 1]  # predicted probability of the positive class

fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)  # score the same probabilities the curve is drawn from

plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % (LogisticRegression(), auc))
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1-Specificity (False Positive Rate)')
plt.ylabel('Sensitivity (True Positive Rate)')
plt.legend(loc="lower right")
plt.show()  # display
y_pred = model1.predict(x_test)
plt.show()

cm1 = confusion_matrix(y_test, y_pred)
sac = accuracy_score(y_test, y_pred)
accper = sac * 100
accper

plt.figure(figsize=(10, 10))
sns.heatmap(cm1, annot=True)
model1.summary()

y_pred_proba = model1.predict_proba(x_test)[::, 1]
# pyplot.plot(fpr, tpr, linestyle='--', label='No Skill')
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr, tpr, label="data 1, auc=" + str(auc))
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.legend(loc=4)
print(classification_report(y_test, y_pred))
ds = DataLoader(sl=sl)
if whole_map:
    test_seq = ds.load_whole_test(bmode)
else:
    test_seq, test_label = ds.load_test(bmode)

model_checkpoint_dir = os.path.join(s.intermediate_folder, 'model_checkpoints/opt')
model_checkpoint_file = os.path.join(model_checkpoint_dir, uid + '.hdf5')
model = load_model(model_checkpoint_file)

test_predictions = model.predict(test_seq, verbose=1)
results = {'test_predictions': test_predictions}

logs_dir = os.path.join(s.intermediate_folder, 'logs', logs_dir)
test_log_dir = os.path.join(logs_dir, 'test_logs/')

if not whole_map:
    test_auc = roc_auc_score(test_label, test_predictions)
    spio.savemat(test_log_dir + uid + '.mat', results)
    print(["Test AUC: ", test_auc])
else:
    spio.savemat(test_log_dir + uid + '_whole.mat', results)

# print('-' * 50)
# print('UID: {}'.format(uid))
# print('-' * 50)
k.clear_session()
    embedding_dim, embedding_matrix, max_length, out_size=6)

keras_model_trainer = trainer.KerasModelTrainer(model_stamp='kmax_text_cnn',
                                                epoch_num=50,
                                                learning_rate=1e-3)
models, val_loss, total_auc, fold_predictions = keras_model_trainer.train_folds(
    data, y_train, fold_count=10, batch_size=256, get_model_func=get_model)
print("Overall val-loss:", val_loss, "AUC", total_auc)

train_fold_predictions = np.concatenate(fold_predictions, axis=0)
training_auc = roc_auc_score(y_train[:-1], train_fold_predictions)
print("Training AUC", training_auc)

CLASSES = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]

submit_path_prefix = "results/rnn/nds/fasttext-SC2-nds-randomNoisy-capNet-" + str(
    max_nb_words) + "-RST-lp-ct-" + str(max_length)

print("Predicting testing results...")
test_predicts_list = []
for fold_id, model in enumerate(models):
    test_predicts = model.predict(test_data, batch_size=256, verbose=1)
    test_predicts_list.append(test_predicts)
    np.save("predict_path/", test_predicts)
def create_model(dataset):
    print("dataset : ", dataset)
    df = pd.read_csv('/home/farshid/Desktop/' + dataset, header=None)
    print('reading', dataset)

    df['label'] = df[df.shape[1] - 1]
    # df.drop([df.shape[1] - 2], axis=1, inplace=True)
    labelencoder = LabelEncoder()
    df['label'] = labelencoder.fit_transform(df['label'])

    X = np.array(df.drop(['label'], axis=1))
    y = np.array(df['label'])

    number_of_clusters = 23
    sampler = RandomUnderSampler()
    normalization_object = Normalizer()
    X = normalization_object.fit_transform(X)

    skf = StratifiedKFold(n_splits=5, shuffle=True)
    n_classes = 2

    # take the first stratified split as the train/test partition
    for train_index, test_index in skf.split(X, y):
        X_train = X[train_index]
        X_test = X[test_index]
        y_train = y[train_index]
        y_test = y[test_index]
        break

    print('training', dataset)

    top_roc = 0
    depth_for_rus = 0
    split_for_rus = 0

    for depth in range(3, 20, 20):
        for split in range(3, 9, 20):
            classifier = AdaBoostClassifier(DecisionTreeClassifier(
                max_depth=depth, min_samples_split=split),
                n_estimators=100,
                learning_rate=1,
                algorithm='SAMME')

            # random under-sampling of the majority class
            X_train, y_train = sampler.fit_sample(X_train, y_train)
            classifier.fit(X_train, y_train)
            predictions = classifier.predict_proba(X_test)

            score = roc_auc_score(y_test, predictions[:, 1])
            if top_roc < score:
                top_roc = score
                tpr = dict()
                fpr = dict()
                roc = dict()
                for i in range(n_classes):
                    fpr[i], tpr[i], _ = roc_curve(y_test, predictions[:, i])
                    roc[i] = roc_auc_score(y_test, predictions[:, i])

    # cluster-based under-sampling of the majority class
    major_class = max(sampler.fit(X_train, y_train).stats_c_,
                      key=sampler.fit(X_train, y_train).stats_c_.get)

    major_class_X_train = []
    major_class_y_train = []
    minor_class_X_train = []
    minor_class_y_train = []
    for index in range(len(X_train)):
        if y_train[index] == major_class:
            major_class_X_train.append(X_train[index])
            major_class_y_train.append(y_train[index])
        else:
            minor_class_X_train.append(X_train[index])
            minor_class_y_train.append(y_train[index])

    # optimize for number of clusters here
    kmeans = KMeans(max_iter=200, n_jobs=4, n_clusters=number_of_clusters)
    kmeans.fit(major_class_X_train)

    # get the centroids of each of the clusters
    cluster_centroids = kmeans.cluster_centers_

    # get the points under each cluster
    points_under_each_cluster = {
        i: np.where(kmeans.labels_ == i)[0]
        for i in range(kmeans.n_clusters)
    }

    # accumulators for the sampled majority-class points
    X_train_major = np.empty((0, X_train.shape[1]))
    y_train_major = np.empty((0,))

    for i in range(number_of_clusters):
        # keep a random half of the points in each majority-class cluster
        size = len(points_under_each_cluster[i])
        random_indexes = np.random.randint(low=0, high=size, size=int(size / 2))
        temp = points_under_each_cluster[i]
        feature_indexes = temp[random_indexes]
        X_train_major = np.concatenate((X_train_major, X_train[feature_indexes]), axis=0)
        y_train_major = np.concatenate((y_train_major, y_train[feature_indexes]), axis=0)

    final_train_x = np.concatenate((X_train_major, minor_class_X_train), axis=0)
    final_train_y = np.concatenate((y_train_major, minor_class_y_train), axis=0)

    classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=150))
    # classifier = sklearn.svm.SVC(C=50, gamma=.0008, kernel='rbf', probability=True)
    # classifier = sklearn.svm.SVC(C=100, gamma=.006, kernel='rbf', probability=True)
    classifier.fit(final_train_x, final_train_y)
    predicted = classifier.predict_proba(X_test)

    tpr_c = dict()
    fpr_c = dict()
    roc_c = dict()
    for i in range(n_classes):
        # score the cluster-sampled classifier's probabilities
        fpr_c[i], tpr_c[i], _ = roc_curve(y_test, predicted[:, i])
        roc_c[i] = roc_auc_score(y_test, predicted[:, i])

    print('plotting', dataset)
    # plt.clf()
    plt.plot(fpr[1], tpr[1], lw=2, color='red',
             label='ROC curve: random under-sampling')
    plt.plot(fpr_c[1], tpr_c[1], lw=2, color='navy',
             label='ROC curve: clustered under-sampling')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Area under ROC curve')
    plt.legend(loc="lower right")
    plt.show()
# Output the most important random forest features to a telemetry file
feature_file = open(config["importance"], "w")
for feature in sorted(zip(flabels, rf_classifier.feature_importances_),
                      key=lambda x: x[1],
                      reverse=True):
    feature_file.write("%s,%f\n" % feature)
feature_file.close()

# Create results table here for ROC curves
result_table = pd.DataFrame(columns=['classifiers', 'fpr', 'tpr', 'auc'])

# SVM
Y_pred = svm_classifier.predict(X_test)
yproba = svm_classifier.predict_proba(X_test)[::, 1]

fpr, tpr, _ = roc_curve(Y_test, yproba)
auc = roc_auc_score(Y_test, yproba)

# DataFrame.append was removed in pandas 2.0; use pd.concat there instead
result_table = result_table.append(
    {
        'classifiers': "SVM",
        'fpr': fpr,
        'tpr': tpr,
        'auc': auc
    },
    ignore_index=True)

print('SVM Mean Absolute Error:', metrics.mean_absolute_error(Y_test, Y_pred))
print('SVM Mean Squared Error:', metrics.mean_squared_error(Y_test, Y_pred))
print('SVM Root Mean Squared Error:',
      numpy.sqrt(metrics.mean_squared_error(Y_test, Y_pred)))
print('SVM FPR:', fpr)
print('SVM TPR:', tpr)
print('SVM AUC:', auc)
def test_user_supplied_features_accuracy():
    model = LightFM(random_state=SEED)
    model.fit_partial(
        train,
        user_features=train_user_features,
        item_features=train_item_features,
        epochs=10,
    )
    train_predictions = model.predict(
        train.row,
        train.col,
        user_features=train_user_features,
        item_features=train_item_features,
    )
    test_predictions = model.predict(
        test.row,
        test.col,
        user_features=test_user_features,
        item_features=test_item_features,
    )

    assert roc_auc_score(train.data, train_predictions) > 0.84
    assert roc_auc_score(test.data, test_predictions) > 0.76