def evaluateModel(self, mdl):
    # Test-set performance
    y_pred = mdl.predict(self.x_test)
    r_test = pearson(self.y_test, y_pred)
    rho_test = spearman(self.y_test, y_pred)
    rmse_test = rmse(self.y_test, y_pred)
    ci_test = ci(self.y_test, y_pred)
    auc_test = average_AUC(self.y_test, y_pred)

    # External (Metz) set performance
    y_pred_ext = mdl.predict(self.x_ext)
    r_ext = pearson(self.y_ext, y_pred_ext)
    rho_ext = spearman(self.y_ext, y_pred_ext)
    rmse_ext = rmse(self.y_ext, y_pred_ext)  # was spearman(), which reported the wrong metric
    ci_ext = ci(self.y_ext, y_pred_ext)
    auc_ext = average_AUC(self.y_ext, y_pred_ext)  # average_AUC used for consistency with the test-set metric above

    print('Test Set Results')
    print(f'r_test: {r_test:.3f}, rho_test: {rho_test:.3f}, rmse_test: {rmse_test:.3f}, '
          f'ci_test: {ci_test:.3f}, auc_test: {auc_test:.3f}')
    print('Metz Dataset Results')
    print(f'r_ext: {r_ext:.3f}, rho_ext: {rho_ext:.3f}, rmse_ext: {rmse_ext:.3f}, '
          f'ci_ext: {ci_ext:.3f}, auc_ext: {auc_ext:.3f}')
def external_set(external_data):
    """
    Evaluate the saved model on an external data set.

    Parameters
    ----------
    external_data : str
        Path to a .npy file in which concatenated protein and ligand features
        form the columns and the affinity label is the last column.

    Output
    ------
    Prints model performance on the external set for several evaluation metrics.
    """
    ext_data = np.load(external_data)
    x_ext = ext_data[:, :-1]
    y_ext = ext_data[:, -1:].ravel()

    model_name = MODEL_DIR + "/xgb.mdl"
    model = joblib.load(model_name)
    y_pred_ext = model.predict(x_ext)

    print('Evaluating the external set')
    PEARSON_R = pearson(y_ext, y_pred_ext)
    SPEARMAN_R = spearman(y_ext, y_pred_ext)
    RMSE = rmse(y_ext, y_pred_ext)
    Conc_Index = ci(y_ext, y_pred_ext)
    Avg_AUC = average_AUC(y_ext, y_pred_ext)  # printed as Avg_AUC below, so average_AUC is used here

    print("PEARSON_R: {:0.3f}".format(PEARSON_R))
    print("SPEARMAN_R: {:0.3f}".format(SPEARMAN_R))
    print("RMSE: {:0.3f}".format(RMSE))
    print("Conc_Index: {:0.3f}".format(Conc_Index))
    print("Avg_AUC: {:0.3f}".format(Avg_AUC))
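# --- Usage sketch (illustrative, not part of the original module) -------------
# Assumes a trained model has already been saved to MODEL_DIR + "/xgb.mdl" and
# that "metz_features.npy" (a hypothetical file name) holds the external-set
# features with the affinity label in the last column.
if __name__ == '__main__':
    external_set("metz_features.npy")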
def train_test(dataset):
    """
    Train an XGBoost model with a grid search and evaluate it on the test split.

    Parameters
    ----------
    dataset : str
        Path to a .npy file in which concatenated protein and ligand features
        form the columns and the affinity label is the last column.

    Output
    ------
    Saves the best model and prints its performance for several evaluation metrics.
    """
    data_set = np.load(dataset)
    X = data_set[:, :-1]
    y = data_set[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    # x_train = np.load('x_train.npy')
    # y_train = np.load('y_train.npy')
    # x_test = np.load('x_test.npy')
    # y_test = np.load('y_test.npy')

    # Note: 'reg:linear' is the legacy name of 'reg:squarederror' in recent XGBoost releases.
    param_grid = {'n_estimators': [1000, 500, 100],
                  'objective': ['reg:linear'],
                  'colsample_bytree': [0.3, 0.6, 1.0],
                  'learning_rate': [0.1, 0.001, 0.005, 1.0],
                  'subsample': [0.8, 1.0],
                  'max_depth': [3, 5, 10],
                  'alpha': [0, 10],
                  'gamma': [0, 1, 5]}
    xgbr = xgb.XGBRegressor()
    # xgbr = RandomizedSearchCV(estimator=xgbr, param_distributions=param_grid, n_iter=10, cv=5)
    xgbr = GridSearchCV(estimator=xgbr, param_grid=param_grid, cv=5)
    print('XGBoost model fitting started')
    xgbr = xgbr.fit(x_train, y_train)
    print(xgbr.best_params_)

    best_model = xgbr.best_estimator_
    model_name = MODEL_DIR + "/xgb.mdl"
    joblib.dump(best_model, model_name)
    y_pred = best_model.predict(x_test)
    print('XGBoost model is saved')

    PEARSON_R = pearson(y_test, y_pred)
    SPEARMAN_R = spearman(y_test, y_pred)
    RMSE = rmse(y_test, y_pred)
    Conc_Index = ci(y_test, y_pred)
    Avg_AUC = average_AUC(y_test, y_pred)
    print("PEARSON_R: {:0.3f}".format(PEARSON_R))
    print("SPEARMAN_R: {:0.3f}".format(SPEARMAN_R))
    print("RMSE: {:0.3f}".format(RMSE))
    print("Conc_Index: {:0.3f}".format(Conc_Index))
    print("Avg_AUC: {:0.3f}".format(Avg_AUC))
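# --- Usage sketch (illustrative, not part of the original module) -------------
# The feature file name below is an assumption: a .npy array with concatenated
# protein/ligand features and the affinity label in the last column. The grid
# above spans 1296 parameter combinations (times 5 CV folds), so the commented
# RandomizedSearchCV alternative is considerably cheaper when compute is limited.
if __name__ == '__main__':
    train_test("training_features.npy")  # hypothetical file name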
def train_test(dataset):
    """
    Train a random forest model with a grid search and evaluate it on the test split.

    Parameters
    ----------
    dataset : str
        Path to a .npy file in which concatenated protein and ligand features
        form the columns and the affinity label is the last column.

    Output
    ------
    Saves the best model and prints its performance for several evaluation metrics.
    """
    data_set = np.load(dataset)
    x = data_set[:, :-1]
    y = data_set[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
    # x_train = np.load('x_train.npy')
    # y_train = np.load('y_train.npy')
    # x_test = np.load('x_test.npy')
    # y_test = np.load('y_test.npy')

    rfr = RandomForestRegressor(n_jobs=-1)
    # Note: max_features='auto' is deprecated/removed for RandomForestRegressor in recent scikit-learn releases.
    param_grid = {'n_estimators': [50, 100, 200, 400, 600, 800],
                  'max_features': ['auto', 'sqrt', 'log2', None],
                  'min_samples_split': [2, 5, 10]}
    print("Starting model fitting")
    # rfr = RandomizedSearchCV(estimator=rfr, param_distributions=param_grid, n_iter=10, cv=5)
    rfr = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=5)
    rfr = rfr.fit(x_train, y_train)
    print(rfr.best_params_)

    best_model = rfr.best_estimator_
    model_name = MODEL_DIR + "/rfr.mdl"
    joblib.dump(best_model, model_name)
    y_pred = best_model.predict(x_test)
    # r2 = r2_score(y_test, y_pred)
    print("RFR model is saved")

    PEARSON_R = pearson(y_test, y_pred)
    SPEARMAN_R = spearman(y_test, y_pred)
    RMSE = rmse(y_test, y_pred)
    Conc_Index = ci(y_test, y_pred)
    Avg_AUC = average_AUC(y_test, y_pred)
    print("PEARSON_R: {:0.3f}".format(PEARSON_R))
    print("SPEARMAN_R: {:0.3f}".format(SPEARMAN_R))
    print("RMSE: {:0.3f}".format(RMSE))
    print("Conc_Index: {:0.3f}".format(Conc_Index))
    print("Avg_AUC: {:0.3f}".format(Avg_AUC))
                  random_state=1, verbose=False)  # closes the model constructor call (truncated above)
model.fit(train_x, train_y)
joblib.dump(model, os.path.join(MODEL_DIR, 'model_' + str(i + 1)))

# train_predict = model.predict(train_x)
# train_r2 = r2_score(y_pred=train_predict, y_true=train_y)
test_predict = model.predict(test_x)
# test_r2 = r2_score(y_pred=test_predict, y_true=test_y)
# print(" TRAIN R2: {:.2}, TEST R2: {:.2}".format(train_r2, test_r2))

# Per-fold test-set metrics, collected for averaging over all folds
RMSE = rmse(test_y, test_predict)
rmse_list.append(RMSE)
PEARSON = pearson(test_y, test_predict)
pearson_list.append(PEARSON)
SPEARMAN = spearman(test_y, test_predict)
spearman_list.append(SPEARMAN)
F1 = f1(test_y, test_predict)
f1_list.append(F1)
CI = ci(test_y, test_predict)
ci_list.append(CI)
AVG_AUC = average_AUC(test_y, test_predict)
auc_list.append(AVG_AUC)
print("RMSE: {:.2f} PEARSON: {:.2f} SPEARMAN: {:.2f}, F1: {:.2f}, CI: {:.2f}, AVG AUC: {:.2f}"
      .format(RMSE, PEARSON, SPEARMAN, F1, CI, AVG_AUC))

# Fold averages (np.mean over the collected lists is assumed; the excerpt truncates this call)
print("MEAN RMSE: {:.2}, PEARSON: {:.2}, SPEARMAN: {:.2}, F1: {:.2}, CI: {:.2}, AVG AUC: {:.2}".format(
    np.mean(rmse_list), np.mean(pearson_list), np.mean(spearman_list),
    np.mean(f1_list), np.mean(ci_list), np.mean(auc_list)))
def test_spearman(self):
    self.assertEqual(ev.spearman(self.actual, self.predicted), 0.37569026743498013)
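# Optional, looser variant (not in the original suite): exact float equality can
# be brittle across SciPy versions, so unittest's assertAlmostEqual with a
# decimal-place tolerance is a common alternative.
def test_spearman_approx(self):
    # Hypothetical companion test; compares to 7 decimal places instead of exactly.
    self.assertAlmostEqual(ev.spearman(self.actual, self.predicted),
                           0.37569026743498013, places=7)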
args = parser.parse_args()

if __name__ == '__main__':
    if args.status == "VALIDATED":
        sub_df = pd.read_csv(args.submission_file)
        gs_df = pd.read_csv(args.goldstandard_file)
        combined_df = pd.merge(sub_df, gs_df, how='inner')
        actual = combined_df["pKd_[M]"]
        predicted = combined_df["pKd_[M]_pred"]

        rmse = ev.rmse(actual, predicted)
        spearman = ev.spearman(actual, predicted)
        average_auc = ev.average_AUC(actual, predicted)
        rounded_rmse = round(rmse, 3)
        rounded_spearman = round(spearman, 3)
        rounded_average_auc = round(average_auc, 3)

        result = {
            "prediction_file_status": "SCORED",
            "rmse": rmse,
            "spearman": spearman,
            "average_auc": average_auc,
            "rounded_rmse": rounded_rmse,
            "rounded_spearman": rounded_spearman,
            "rounded_average_auc": rounded_average_auc}
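        # Hypothetical continuation (not shown in the excerpt): persist the score
        # dictionary for the scoring harness. The output file name is an assumption.
        import json
        with open("results.json", "w") as out_file:
            json.dump(result, out_file, indent=2)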
def get_scores(labels, predictions, validation_test, total_training_loss,
               total_validation_test_loss, epoch, fold_epoch_results, fold=None):
    # Score names map to placeholders until each metric is computed below; the
    # classification metrics are keyed by the concentration thresholds filled in
    # by prec_rec_f1_acc_mcc (an earlier key scheme used pKd cut-offs 5.0/6.0/7.0).
    score_dict = {
        "rm2": None, "CI (DEEPDTA)": None, "MSE": None, "RMSE": None,
        "Pearson": None, "Spearman": None, "CI (Challenge)": None, "Average AUC": None,
        "Precision 10uM": None, "Recall 10uM": None, "F1-Score 10uM": None,
        "Accuracy 10uM": None, "MCC 10uM": None,
        "Precision 1uM": None, "Recall 1uM": None, "F1-Score 1uM": None,
        "Accuracy 1uM": None, "MCC 1uM": None,
        "Precision 100nM": None, "Recall 100nM": None, "F1-Score 100nM": None,
        "Accuracy 100nM": None, "MCC 100nM": None,
        "Precision 30nM": None, "Recall 30nM": None, "F1-Score 30nM": None,
        "Accuracy 30nM": None, "MCC 30nM": None,
    }
    score_list = get_list_of_scores()

    labels = np.asarray(labels)
    predictions = np.asarray(predictions)
    score_dict["rm2"] = get_rm2(labels, predictions)
    score_dict["CI (DEEPDTA)"] = get_cindex(labels, predictions)
    score_dict["MSE"] = mse(labels, predictions)
    score_dict["RMSE"] = rmse(labels, predictions)
    score_dict["Pearson"] = pearson(labels, predictions)
    score_dict["Spearman"] = spearman(labels, predictions)
    score_dict["CI (Challenge)"] = ci(labels, predictions)
    score_dict["Average AUC"] = average_AUC(labels, predictions)

    prec_rec_f1_acc_mcc_threshold_dict = prec_rec_f1_acc_mcc(labels, predictions)
    for key in prec_rec_f1_acc_mcc_threshold_dict.keys():
        score_dict[key] = prec_rec_f1_acc_mcc_threshold_dict[key]

    if fold is not None:
        fold_epoch_results[-1].append(score_dict)
        print("Fold:{}\tEpoch:{}\tTraining Loss:{}\t{} Loss:{}".format(
            fold + 1, epoch, total_training_loss, validation_test,
            total_validation_test_loss))
    else:
        fold_epoch_results.append(score_dict)
        print("Epoch:{}\tTraining Loss:{}\t{} Loss:{}".format(
            epoch, total_training_loss, validation_test, total_validation_test_loss))

    for scr in score_list:
        print("{} {}:\t{}".format(validation_test, scr, score_dict[scr]))

    return score_dict
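# --- Usage sketch (illustrative only; names and values are placeholders) ------
# Shows how get_scores might be invoked after a validation pass. The random
# labels/predictions and the fold_epoch_results container are assumptions made
# for the example, not values produced by the training code.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    toy_labels = rng.uniform(4.0, 9.0, size=200)              # pKd-like values
    toy_preds = toy_labels + rng.normal(0.0, 0.5, size=200)   # noisy predictions
    fold_epoch_results = [[]]                                  # one inner list per fold
    scores = get_scores(toy_labels, toy_preds, "Validation",
                        total_training_loss=12.4, total_validation_test_loss=3.1,
                        epoch=1, fold_epoch_results=fold_epoch_results, fold=0)
    print(scores["RMSE"], scores["Spearman"])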