import os
from logging import Logger

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import (accuracy_score, auc, confusion_matrix, mean_squared_error,
                             median_absolute_error, precision_recall_curve, roc_curve)

# Project-level helpers and classes (TrainArgs, makedirs, run_training, get_task_names,
# get_xgboost_feature, get_morgan_feature, the svm_knn_rf_* scorers and the xgboost_cv /
# xgb_cv_more / xgb_regre_cv / xgb_regre_more grid-search helpers) are assumed to be
# imported from elsewhere in this repository.


def cross_validate_mechine(args: TrainArgs, logger: Logger = None):
    """k-fold cross validation of the classical baselines (SVM / KNN / RF)."""
    info = logger.info if logger is not None else print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir

    # Run training on different random seeds for each fold
    dmpnn_scores = []
    for fold_num in range(args.num_folds):
        if args.dataset_type == 'classification':
            args.data_path = 'molnet_benchmark/molnet_random_' + args.protein + '_c/seed' + str(fold_num + 1) + '/train.csv'
            args.separate_test_path = 'molnet_benchmark/molnet_random_' + args.protein + '_c/seed' + str(fold_num + 1) + '/val.csv'
            args.separate_val_path = 'molnet_benchmark/molnet_random_' + args.protein + '_c/seed' + str(fold_num + 1) + '/test.csv'
        elif args.dataset_type == 'regression':
            args.data_path = 'molnet_benchmark/molnet_random_' + args.protein + '_r/seed' + str(fold_num + 1) + '/train.csv'
            args.separate_test_path = 'molnet_benchmark/molnet_random_' + args.protein + '_r/seed' + str(fold_num + 1) + '/val.csv'
            args.separate_val_path = 'molnet_benchmark/molnet_random_' + args.protein + '_r/seed' + str(fold_num + 1) + '/test.csv'
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores, model, scaler, df = run_training(args, logger)
        if args.loss_save:
            df.to_csv('/home/cxw/python——work/paper_gcn/dmpnn_epoch_loss/' + args.protein + 'loss.csv', index=None)
            # df.to_csv(args.protein + 'loss.csv', index=None)
            break  # stop after the first fold when only the epoch-loss curve is wanted
        dmpnn_scores.append(model_scores)

        (train_target, train_feature, val_target, val_feature, test_target, test_feature,
         train_smiles, val_smiles, test_smiles, test_preds) = get_xgboost_feature(args, logger, model)
        train_target = pd.DataFrame(train_target)
        train_feature = pd.DataFrame(train_feature)
        val_target = pd.DataFrame(val_target)
        val_feature = pd.DataFrame(val_feature)
        test_target = pd.DataFrame(test_target)
        test_feature = pd.DataFrame(test_feature)
        train_morgan_feature = get_morgan_feature(train_smiles)
        val_morgan_feature = get_morgan_feature(val_smiles)
        test_morgan_feature = get_morgan_feature(test_smiles)

        if args.dataset_type == 'classification':
            if test_target.shape[1] == 1:
                scores = svm_knn_rf_class(train_feature, train_target, val_feature, val_target,
                                          test_feature, test_target, train_morgan_feature,
                                          val_morgan_feature, test_morgan_feature, test_preds)
            else:
                scores = svm_knn_rf_class_more(train_feature, train_target, val_feature, val_target,
                                               test_feature, test_target, train_morgan_feature,
                                               val_morgan_feature, test_morgan_feature, test_preds)
            scores.columns = ['type', 'auc']
        elif args.dataset_type == 'regression':
            if test_target.shape[1] == 1:
                scores = svm_knn_rf_regre(train_feature, train_target, val_feature, val_target,
                                          test_feature, test_target, train_morgan_feature,
                                          val_morgan_feature, test_morgan_feature, test_preds)
            else:
                scores = svm_knn_rf_regre_more(train_feature, train_target, val_feature, val_target,
                                               test_feature, test_target, train_morgan_feature,
                                               val_morgan_feature, test_morgan_feature, test_preds)
            scores.columns = ['type', 'RMSE']
        scores.to_csv(args.protein + 'mechine_scores.csv')
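
# All three cross-validation drivers in this module rely on `get_morgan_feature`,
# which is defined elsewhere in this repository. The function below is a
# hypothetical minimal sketch of such a helper, assuming RDKit is available;
# the name `get_morgan_feature_sketch`, the radius of 2 and the 2048-bit width
# are illustrative assumptions, not values confirmed by this file.
def get_morgan_feature_sketch(smiles):
    """Return one Morgan fingerprint bit-vector per SMILES string as a DataFrame."""
    from rdkit import Chem
    from rdkit.Chem import AllChem

    fingerprints = []
    for s in smiles:
        mol = Chem.MolFromSmiles(s)
        if mol is None:
            # Keep row alignment with the target tables even for unparseable SMILES.
            fingerprints.append([0] * 2048)
        else:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
            fingerprints.append(list(fp))
    return pd.DataFrame(fingerprints)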
# NOTE: this definition is shadowed by the second `cross_validate` defined further
# down in this module; Python binds the name to the later definition.
def cross_validate(args: TrainArgs, logger: Logger = None):
    """k-fold cross validation comparing D-MPNN scores with XGBoost trained on
    D-MPNN features, Morgan fingerprints, and their concatenation."""
    info = logger.info if logger is not None else print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir
    task_names = args.target_columns or get_task_names(args.data_path)

    # Run training on different random seeds for each fold
    dmpnn_scores = []
    dmpnn_xgb_scores = []
    morgan_scores = []
    dmpnn_morgan_scores = []
    for fold_num in range(args.num_folds):
        if args.dataset_type == 'classification':
            args.data_path = 'molnet_benchmark/molnet_random_' + args.protein + '_c/seed' + str(fold_num + 1) + '/train.csv'
            args.separate_test_path = 'molnet_benchmark/molnet_random_' + args.protein + '_c/seed' + str(fold_num + 1) + '/val.csv'
            args.separate_val_path = 'molnet_benchmark/molnet_random_' + args.protein + '_c/seed' + str(fold_num + 1) + '/test.csv'
        elif args.dataset_type == 'regression':
            args.data_path = 'molnet_benchmark/molnet_random_' + args.protein + '_r/seed' + str(fold_num + 1) + '/train.csv'
            args.separate_test_path = 'molnet_benchmark/molnet_random_' + args.protein + '_r/seed' + str(fold_num + 1) + '/val.csv'
            args.separate_val_path = 'molnet_benchmark/molnet_random_' + args.protein + '_r/seed' + str(fold_num + 1) + '/test.csv'
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores, model, scaler = run_training(args, logger)
        dmpnn_scores.append(model_scores)

        (train_target, train_feature, val_target, val_feature, test_target, test_feature,
         train_smiles, val_smiles, test_smiles) = get_xgboost_feature(args, logger, model)
        train_target = pd.DataFrame(train_target)
        train_feature = pd.DataFrame(train_feature)
        val_target = pd.DataFrame(val_target)
        val_feature = pd.DataFrame(val_feature)
        test_target = pd.DataFrame(test_target)
        test_feature = pd.DataFrame(test_feature)
        train_morgan_feature = get_morgan_feature(train_smiles)
        val_morgan_feature = get_morgan_feature(val_smiles)
        test_morgan_feature = get_morgan_feature(test_smiles)

        if args.dataset_type == 'classification':
            if test_target.shape[1] == 1:
                # XGBoost on the D-MPNN features
                xgb_gbc = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                                            colsample_bytree=1, gamma=1, learning_rate=0.1,
                                            max_delta_step=0, max_depth=4, min_child_weight=8,
                                            missing=None, n_estimators=2000, n_jobs=1, nthread=None,
                                            objective='binary:logistic', random_state=0, reg_alpha=0,
                                            reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
                                            subsample=0.8, tree_method='gpu_hist', n_gpus=-1)
                xgb_gbc.fit(train_feature, train_target,
                            eval_set=[(val_feature, val_target)],
                            eval_metric='auc', early_stopping_rounds=200)
                pre_pro = xgb_gbc.predict_proba(test_feature)[:, 1]
                fpr, tpr, threshold = roc_curve(test_target, pre_pro)
                AUC = auc(fpr, tpr)
                pre_pro = [1 if p > 0.5 else 0 for p in pre_pro]
                tn, fp, fn, tp = confusion_matrix(test_target, pre_pro).ravel()
                # Sn = TP / (TP + FN), Sp = TN / (TN + FP)
                Sn = tp / (tp + fn)
                Sp = tn / (tn + fp)
                acc = accuracy_score(test_target, pre_pro)
                dmpnn_xgb_scores.append([AUC, Sn, Sp, acc])
                joblib.dump(xgb_gbc, 'external_test/dmpnn_xgb.model')

                # XGBoost on the Morgan fingerprints
                xgb_gbc = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                                            colsample_bytree=1, gamma=1, learning_rate=0.1,
                                            max_delta_step=0, max_depth=4, min_child_weight=8,
                                            missing=None, n_estimators=2000, n_jobs=1, nthread=None,
                                            objective='binary:logistic', random_state=0, reg_alpha=0,
                                            reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
                                            subsample=0.8, tree_method='gpu_hist', n_gpus=-1)
                xgb_gbc.fit(train_morgan_feature, train_target,
                            eval_set=[(val_morgan_feature, val_target)],
                            eval_metric='auc', early_stopping_rounds=200)
                pre_pro = xgb_gbc.predict_proba(test_morgan_feature)[:, 1]
                fpr, tpr, threshold = roc_curve(test_target, pre_pro)
                AUC = auc(fpr, tpr)
                pre_pro = [1 if p > 0.5 else 0 for p in pre_pro]
                tn, fp, fn, tp = confusion_matrix(test_target, pre_pro).ravel()
                # Sn = TP / (TP + FN), Sp = TN / (TN + FP)
                Sn = tp / (tp + fn)
                Sp = tn / (tn + fp)
                acc = accuracy_score(test_target, pre_pro)
                morgan_scores.append([AUC, Sn, Sp, acc])
                joblib.dump(xgb_gbc, 'external_test/morgan_xgb.model')

                # XGBoost on the concatenated D-MPNN + Morgan features
                train_gcn_mor_feature = pd.concat([train_feature, train_morgan_feature], axis=1)
                val_gcn_mor_feature = pd.concat([val_feature, val_morgan_feature], axis=1)
                test_gcn_mor_feature = pd.concat([test_feature, test_morgan_feature], axis=1)
                train_gcn_mor_feature.columns = val_gcn_mor_feature.columns = test_gcn_mor_feature.columns = range(train_gcn_mor_feature.shape[1])
                xgb_gbc = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                                            colsample_bytree=1, gamma=1, learning_rate=0.1,
                                            max_delta_step=0, max_depth=4, min_child_weight=8,
                                            missing=None, n_estimators=2000, n_jobs=1, nthread=None,
                                            objective='binary:logistic', random_state=0, reg_alpha=0,
                                            reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
                                            subsample=0.8, tree_method='gpu_hist', n_gpus=-1)
                xgb_gbc.fit(train_gcn_mor_feature, train_target,
                            eval_set=[(val_gcn_mor_feature, val_target)],
                            eval_metric='auc', early_stopping_rounds=200)
                pre_pro = xgb_gbc.predict_proba(test_gcn_mor_feature)[:, 1]
                fpr, tpr, threshold = roc_curve(test_target, pre_pro)
                AUC = auc(fpr, tpr)
                pre_pro = [1 if p > 0.5 else 0 for p in pre_pro]
                tn, fp, fn, tp = confusion_matrix(test_target, pre_pro).ravel()
                Sn = tp / (tp + fn)
                Sp = tn / (tn + fp)
                acc = accuracy_score(test_target, pre_pro)
                dmpnn_morgan_scores.append([AUC, Sn, Sp, acc])
                joblib.dump(xgb_gbc, 'external_test/dmpnn_morgan_xgb.model')
            else:
                aucs = []
                for i in range(test_target.shape[1]):
                    xgb_gbc = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                                                colsample_bytree=1, gamma=1, learning_rate=0.1,
                                                max_delta_step=0, max_depth=4, min_child_weight=8,
                                                missing=None, n_estimators=2000, n_jobs=1, nthread=None,
                                                objective='binary:logistic', random_state=0, reg_alpha=0,
                                                reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
                                                subsample=0.8, tree_method='gpu_hist', n_gpus=-1)
                    # Skip tasks that have no positive label in any split
                    if max(val_target[i]) == 0 or max(train_target[i]) == 0 or max(test_target[i]) == 0:
                        continue
                    xgb_gbc.fit(train_feature, train_target[i],
                                eval_set=[(val_feature, val_target[i])],
                                eval_metric='auc', early_stopping_rounds=100)
                    pre_pro = xgb_gbc.predict_proba(test_feature)[:, 1]
                    fpr, tpr, threshold = roc_curve(test_target[i], pre_pro)
                    AUC = auc(fpr, tpr)
                    if args.metric == 'prc-auc':
                        precision, recall, _ = precision_recall_curve(test_target[i], pre_pro)
                        AUC = auc(recall, precision)
                    pre_pro = [1 if p > 0.5 else 0 for p in pre_pro]
                    tn, fp, fn, tp = confusion_matrix(test_target[i], pre_pro).ravel()
                    Sn = tp / (tp + fn)
                    Sp = tn / (tn + fp)
                    acc = accuracy_score(test_target[i], pre_pro)
                    aucs.append([AUC, Sn, Sp, acc])
                dmpnn_xgb_scores.append([np.mean(aucs)])
        elif args.dataset_type == 'regression':
            if test_target.shape[1] == 1:
                # XGBoost on the D-MPNN features
                xgb_gbc = xgb.XGBRegressor(learning_rate=0.1, max_depth=4, min_child_weight=10,
                                           gamma=1, subsample=0.8, colsample_bytree=0.8,
                                           reg_alpha=0.8, objective='reg:linear', n_estimators=2000,
                                           tree_method='gpu_hist', n_gpus=-1)
                xgb_gbc.fit(train_feature, train_target,
                            eval_set=[(val_feature, val_target)],
                            eval_metric='rmse', early_stopping_rounds=200)
                y_pred = xgb_gbc.predict(test_feature)
                y_pred = scaler.inverse_transform(y_pred)  # undo the target scaling applied in training
                y_test = test_target.astype('float')
                MSE = mean_squared_error(y_test, y_pred)
                RMSE = MSE ** 0.5
                MAE = median_absolute_error(y_test, y_pred)
                dmpnn_xgb_scores.append([RMSE, MAE])
                joblib.dump(xgb_gbc, 'external_test/dmpnn_xgb.model')

                # XGBoost on the Morgan fingerprints
                xgb_gbc = xgb.XGBRegressor(learning_rate=0.1, max_depth=4, min_child_weight=10,
                                           gamma=1, subsample=0.8, colsample_bytree=0.8,
                                           reg_alpha=0.8, objective='reg:linear', n_estimators=2000,
                                           tree_method='gpu_hist', n_gpus=-1)
                xgb_gbc.fit(train_morgan_feature, train_target,
                            eval_set=[(val_morgan_feature, val_target)],
                            eval_metric='rmse', early_stopping_rounds=200)
                y_pred = xgb_gbc.predict(test_morgan_feature)
                y_pred = scaler.inverse_transform(y_pred)
                MSE = mean_squared_error(y_test, y_pred)
                RMSE = MSE ** 0.5
                MAE = median_absolute_error(y_test, y_pred)
                morgan_scores.append([RMSE, MAE])
                joblib.dump(xgb_gbc, 'external_test/morgan_xgb.model')

                # XGBoost on the concatenated D-MPNN + Morgan features
                train_gcn_mor_feature = pd.concat([train_feature, train_morgan_feature], axis=1)
                val_gcn_mor_feature = pd.concat([val_feature, val_morgan_feature], axis=1)
                test_gcn_mor_feature = pd.concat([test_feature, test_morgan_feature], axis=1)
                train_gcn_mor_feature.columns = val_gcn_mor_feature.columns = test_gcn_mor_feature.columns = range(train_gcn_mor_feature.shape[1])
                xgb_gbc = xgb.XGBRegressor(learning_rate=0.1, max_depth=4, min_child_weight=10,
                                           gamma=1, subsample=0.8, colsample_bytree=0.8,
                                           reg_alpha=0.8, objective='reg:linear', n_estimators=2000,
                                           tree_method='gpu_hist', n_gpus=-1)
                xgb_gbc.fit(train_gcn_mor_feature, train_target,
                            eval_set=[(val_gcn_mor_feature, val_target)],
                            eval_metric='rmse', early_stopping_rounds=200)
                y_pred = xgb_gbc.predict(test_gcn_mor_feature)
                y_pred = scaler.inverse_transform(y_pred)
                MSE = mean_squared_error(y_test, y_pred)
                RMSE = MSE ** 0.5
                MAE = median_absolute_error(y_test, y_pred)
                dmpnn_morgan_scores.append([RMSE, MAE])
                joblib.dump(xgb_gbc, 'external_test/dmpnn_morgan_xgb.model')
            else:
                MAEs = []
                for i in range(test_target.shape[1]):
                    xgb_gbc = xgb.XGBRegressor(learning_rate=0.1, max_depth=4, min_child_weight=10,
                                               gamma=1, subsample=0.8, colsample_bytree=0.8,
                                               reg_alpha=0.8, objective='reg:linear', n_estimators=2000,
                                               tree_method='gpu_hist', n_gpus=-1)
                    xgb_gbc.fit(train_feature, train_target[i],
                                eval_set=[(val_feature, val_target[i])],
                                eval_metric='rmse', early_stopping_rounds=200)
                    y_pred = xgb_gbc.predict(test_feature)
                    y_test = test_target[i].astype('float')
                    MSE = mean_squared_error(y_test, y_pred)
                    RMSE = MSE ** 0.5
                    MAE = median_absolute_error(y_test, y_pred)
                    MAEs.append([MAE, RMSE])
                dmpnn_xgb_scores.append([np.mean(MAEs)])

    dmpnn_scores = np.array(dmpnn_scores)

    # Report scores across models
    dmpnn_scores = np.nanmean(dmpnn_scores, axis=1)  # average score for each model across tasks
    dmpnn_mean_score, dmpnn_std_score = np.nanmean(dmpnn_scores), np.nanstd(dmpnn_scores)
    print('three dmpnn test = ', dmpnn_scores)
    info(f'Overall dmpnn test {args.metric} = {dmpnn_mean_score:.6f} +/- {dmpnn_std_score:.6f}')

    dmpnn_xgb_scores = np.nanmean(dmpnn_xgb_scores, axis=1)  # average score for each model across tasks
    dmpnn_xgb_mean_score, dmpnn_xgb_std_score = np.nanmean(dmpnn_xgb_scores), np.nanstd(dmpnn_xgb_scores)
    print('three dmpnn_xgb_test = ', dmpnn_xgb_scores)
    info(f'Overall dmpnn_xgb_test {args.metric} = {dmpnn_xgb_mean_score:.6f} +/- {dmpnn_xgb_std_score:.6f}')

    morgan_scores = np.nanmean(morgan_scores, axis=1)  # average score for each model across tasks
    morgan_mean_score, morgan_std_score = np.nanmean(morgan_scores), np.nanstd(morgan_scores)
    print('three morgan_test = ', morgan_scores)
    info(f'Overall morgan_test {args.metric} = {morgan_mean_score:.6f} +/- {morgan_std_score:.6f}')

    dmpnn_morgan_scores = np.nanmean(dmpnn_morgan_scores, axis=1)  # average score for each model across tasks
    dmpnn_morgan_mean_score, dmpnn_morgan_std_score = np.nanmean(dmpnn_morgan_scores), np.nanstd(dmpnn_morgan_scores)
    print('three dmpnn_morgan_scores = ', dmpnn_morgan_scores)
    info(f'Overall dmpnn_morgan_test {args.metric} = {dmpnn_morgan_mean_score:.6f} +/- {dmpnn_morgan_std_score:.6f}')

    return model
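
# `cross_validate` above persists each fitted XGBoost model to external_test/ with
# joblib. A minimal reload-and-score sketch for an external set; the helper name and
# its arguments are illustrative, and `external_feature` is assumed to be a DataFrame
# built the same way as `test_feature` above.
def score_external_set(external_feature, model_path='external_test/dmpnn_xgb.model'):
    """Reload a persisted classifier and return positive-class probabilities."""
    reloaded = joblib.load(model_path)
    return reloaded.predict_proba(external_feature)[:, 1]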
def cross_validate(args: TrainArgs, logger: Logger = None):
    """k-fold cross validation with a small XGBoost hyper-parameter grid search
    on top of the D-MPNN and Morgan features."""
    info = logger.info if logger is not None else print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir

    # Run training on different random seeds for each fold
    dmpnn_scores = []
    scores_df = pd.DataFrame()
    for fold_num in range(args.num_folds):
        if args.dataset_type == 'classification':
            args.data_path = 'molnet_benchmark/molnet_random_' + args.protein + '_c/seed' + str(fold_num + 1) + '/train.csv'
            args.separate_test_path = 'molnet_benchmark/molnet_random_' + args.protein + '_c/seed' + str(fold_num + 1) + '/val.csv'
            args.separate_val_path = 'molnet_benchmark/molnet_random_' + args.protein + '_c/seed' + str(fold_num + 1) + '/test.csv'
        elif args.dataset_type == 'regression':
            args.data_path = 'molnet_benchmark/molnet_random_' + args.protein + '_r/seed' + str(fold_num + 1) + '/train.csv'
            args.separate_test_path = 'molnet_benchmark/molnet_random_' + args.protein + '_r/seed' + str(fold_num + 1) + '/val.csv'
            args.separate_val_path = 'molnet_benchmark/molnet_random_' + args.protein + '_r/seed' + str(fold_num + 1) + '/test.csv'
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores, model, scaler, df = run_training(args, logger)
        if args.loss_save:
            # Loss saving is currently disabled; `pass` keeps the block syntactically valid.
            # df.to_csv('/home/cxw/python_work/paper_gcn/dmpnn_epoch_loss/' + args.protein + '_loss.csv', index=None)
            # df.to_csv(args.protein + 'loss.csv', index=None)
            # break
            pass
        dmpnn_scores.append(model_scores)

        (train_target, train_feature, val_target, val_feature, test_target, test_feature,
         train_smiles, val_smiles, test_smiles, test_preds) = get_xgboost_feature(args, logger, model)
        train_target = pd.DataFrame(train_target)
        train_feature = pd.DataFrame(train_feature)
        val_target = pd.DataFrame(val_target)
        val_feature = pd.DataFrame(val_feature)
        test_target = pd.DataFrame(test_target)
        test_feature = pd.DataFrame(test_feature)
        train_morgan_feature = get_morgan_feature(train_smiles)
        val_morgan_feature = get_morgan_feature(val_smiles)
        test_morgan_feature = get_morgan_feature(test_smiles)

        # Hyper-parameter grid explored by the grid-search helpers
        max_depth_numbers = [2, 4, 6, 8, 10]
        learning_rate_numbers = [0.01, 0.05, 0.1, 0.15, 0.2]
        min_child_weight_numbers = [2, 4, 6, 8, 10]
        if args.dataset_type == 'classification':
            if test_target.shape[1] == 1:
                scores = xgboost_cv(max_depth_numbers, learning_rate_numbers, min_child_weight_numbers,
                                    train_feature, train_target, val_feature, val_target,
                                    test_feature, test_target, train_morgan_feature,
                                    val_morgan_feature, test_morgan_feature, test_preds)
            else:
                scores = xgb_cv_more(max_depth_numbers, learning_rate_numbers, min_child_weight_numbers,
                                     train_feature, train_target, val_feature, val_target,
                                     test_feature, test_target, train_morgan_feature,
                                     val_morgan_feature, test_morgan_feature, test_preds)
            scores.columns = ['type', 'max_depth', 'learning_rate', 'min_child_weight', 'auc', 'sn', 'sp', 'acc']
            scores_df = pd.concat([scores_df, scores])
        elif args.dataset_type == 'regression':
            if test_target.shape[1] == 1:
                scores = xgb_regre_cv(max_depth_numbers, learning_rate_numbers, min_child_weight_numbers,
                                      train_feature, train_target, val_feature, val_target,
                                      test_feature, test_target, train_morgan_feature,
                                      val_morgan_feature, test_morgan_feature, test_preds, scaler)
            else:
                scores = xgb_regre_more(max_depth_numbers, learning_rate_numbers, min_child_weight_numbers,
                                        train_feature, train_target, val_feature, val_target,
                                        test_feature, test_target, train_morgan_feature,
                                        val_morgan_feature, test_morgan_feature, test_preds, scaler)
            scores.columns = ['type', 'max_depth', 'learning_rate', 'min_child_weight', 'RMSE']
            scores_df = pd.concat([scores_df, scores])

    # Average the grid-search results over folds and write them out
    df_groupby = scores_df.groupby(['type', 'max_depth', 'learning_rate', 'min_child_weight']).mean()
    df_groupby.to_csv(args.protein + '_scores.csv')
    return model
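
# A hypothetical entry point. TrainArgs is assumed to be a Tap-style argument class
# (as in chemprop-like code), so `TrainArgs().parse_args()` builds `args` from the
# command line; fields such as `protein`, `num_folds`, `dataset_type` and `loss_save`
# are inferred from their use above, not confirmed by this file.
if __name__ == '__main__':
    cross_validate(TrainArgs().parse_args())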