def adam_pretrain(model, model_name, train_xs, train_ys, num_epoch, test_xs, test_ys):
    # Compile with the Adam optimizer, train, then persist and reload the model
    model.compile(optimizer=keras.optimizers.Adam(),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(train_xs, train_ys,
              batch_size=128,
              epochs=num_epoch,
              validation_data=(test_xs, test_ys),
              shuffle=True)
    model_name = '%s_adam_pretrain' % model_name
    save_model(model, model_name)
    model = load_model(model_name)
    return model
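# +
# Hedged usage sketch for adam_pretrain (illustrative only): a tiny Keras classifier
# trained on synthetic data. The architecture and the random arrays below are assumptions
# made for the demo; save_model/load_model inside adam_pretrain are the project's helpers.
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense

demo_x = np.random.rand(256, 32)                                   # 256 samples, 32 features (synthetic)
demo_y = pd.get_dummies(np.random.randint(0, 3, 256)).to_numpy()   # 3 one-hot classes (synthetic)

demo_model = Sequential([
    Dense(16, activation='relu', input_shape=(32,)),
    Dense(3, activation='softmax'),
])
demo_model = adam_pretrain(demo_model, "demo_mlp", demo_x, demo_y, 2, demo_x, demo_y)
# -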
n_iter = 300
if classification_task:
    rf_gs = supervised_learning_steps("rf", "roc_auc", data_type, classification_task,
                                      model, param_rf, X_train, y_train, n_iter)
else:
    rf_gs = supervised_learning_steps("rf", "r2", data_type, classification_task,
                                      model, param_rf, X_train, y_train, n_iter)

rf_gs.cv_results_

# +
rf_gs = load_model("rf_models/rf_" + data_type_options[input_option] + "_regressor_gs.pk")
np.max(rf_gs.cv_results_["mean_test_score"])

file_list = [
    "../data/Test_Compound_Viral_interactions_for_Supervised_Learning_with_LS_LS.csv",
    "../data/Test_Compound_Viral_interactions_for_Supervised_Learning_with_MFP_LS.csv"
]

filename = file_list[input_option]
with open(filename, "rb") as file:
    print("Loading ", filename)
    big_df = pd.read_csv(filename, header='infer', delimiter=",")
    total_length = len(big_df.columns)
    X = big_df.iloc[:, range(5, total_length)]
    Y = big_df[['pchembl_value']].to_numpy().flatten()
    meta_X = big_df.iloc[:, [0, 1, 2, 3]]
# -
def get_cv_results(model, index, cv_scores, nfolds):
    # Collect the per-fold test scores for the hyperparameter setting at `index`
    for i in range(0, nfolds):
        cv_scores.append(
            np.round(model.cv_results_["split" + str(i) + "_test_score"][index], nfolds))
    return cv_scores


# +
# Get the CV models and optimal scores
# GLM with protein and compound LS
glm_LS_LS = load_model(
    "../models/glm_models/glm_LS_Compound_LS_Protein_regressor_gs.pk")
glm_LS_LS_index = np.argmax(glm_LS_LS.cv_results_["mean_test_score"])
glm_LS_LS_r2_scores = get_cv_results(glm_LS_LS, glm_LS_LS_index, [], 5)

glm_MFP_LS = load_model(
    "../models/glm_models/glm_MFP_Compound_LS_Protein_regressor_gs.pk")
glm_MFP_LS_index = np.argmax(glm_MFP_LS.cv_results_["mean_test_score"])
glm_MFP_LS_r2_scores = get_cv_results(glm_MFP_LS, glm_MFP_LS_index, [], 5)

# RF with protein and compound LS
rf_LS_LS = load_model(
    "../models/rf_models/rf_LS_Compound_LS_Protein_regressor_gs.pk")
rf_LS_LS_index = np.argmax(rf_LS_LS.cv_results_["mean_test_score"])
rf_LS_LS_r2_scores = get_cv_results(rf_LS_LS, rf_LS_LS_index, [], 5)

rf_MFP_LS = load_model(
    "../models/rf_models/rf_MFP_Compound_LS_Protein_regressor_gs.pk")
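# +
# Hedged sketch (assumption: each *_r2_scores list above holds the 5 per-fold R2 values of
# the best hyperparameter setting): assemble the fold scores into a tidy DataFrame so the
# models can be compared side by side, e.g. with a boxplot.
import pandas as pd
import matplotlib.pyplot as plt

cv_comparison = pd.DataFrame({
    "GLM_LS_LS": glm_LS_LS_r2_scores,
    "GLM_MFP_LS": glm_MFP_LS_r2_scores,
    "RF_LS_LS": rf_LS_LS_r2_scores,
})
print(cv_comparison.describe())
cv_comparison.boxplot()
plt.ylabel("5-fold CV R2")
plt.show()
# -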
X_train_copy = scaler.fit_transform(X_train)
if classification_task:
    svm_gs = supervised_learning_steps("svm", "roc_auc", data_type, classification_task,
                                       model, param_svm, X_train_copy, y_train, n_iter)
else:
    svm_gs = supervised_learning_steps("svm", "r2", data_type, classification_task,
                                       model, param_svm, X_train_copy, y_train, n_iter)

svm_gs.cv_results_
save_model(scaler, "%s_models/%s_%s_scaling_gs.pk" % ("svm", "svm", data_type))
# -

svm_gs = load_model("svm_models/svm__LS_Drug_LS_Protein_regressor_gs.pk")
scaler = load_model("svm_models/svm__LS_Drug_LS_Protein_scaling_gs.pk")
svm_best = svm_gs.best_estimator_

# +
np.max(svm_gs.cv_results_['mean_test_score'])

file_list = [
    "../data/Test_Compound_Viral_interactions_for_Supervised_Learning_with_LS_LS.csv",
    "../data/Test_Compound_Viral_interactions_for_Supervised_Learning_with_MFP_LS.csv"
]

filename = file_list[input_option]
with open(filename, "rb") as file:
    print("Loading ", filename)
    big_df = pd.read_csv(filename, header='infer', delimiter=",")
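# +
# Hedged sketch (assuming the test file loaded above has the same layout as the training
# data, with features from column 5 onward): the saved feature scaler must be applied to
# the test features before calling the SVM, mirroring how X_train was scaled above.
X_test_demo = big_df.iloc[:, 5:len(big_df.columns)]
y_pred_demo = svm_best.predict(scaler.transform(X_test_demo))
plt.hist(y_pred_demo)
# -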
print(X_train[:10])
print(X_train.shape, y_train.shape)
print(X_train.columns)
print(big_df.isnull().sum().sum())
print("Loaded training file")

# Get results for test file
print("Loading test file")
test_filename = args.input2
big_X_test = pd.read_csv("../data/" + args.input2, header='infer', sep=",")
total_length = len(big_X_test.columns)
X_test = big_X_test.iloc[:, range(5, total_length)]

if ("MFP" not in args.input1):
    rf_best = load_model(
        "../models/rf_models/rf_LS_Compound_LS_Protein_regressor_best_estimator.pk")
else:
    rf_best = load_model(
        "../models/rf_models/rf_MFP_Compound_LS_Protein_regressor_best_estimator.pk")

print("Making Predictions")
y_pred = rf_best.predict(X_test)

meta_X_test = big_X_test.iloc[:, [0, 2]].copy()
meta_X_test.loc[:, 'predictions'] = y_pred
if ("sars_cov_2" in args.input2):
    meta_X_test.loc[:, 'labels'] = 0
else:
    meta_X_test.loc[:, 'labels'] = big_X_test.iloc[:, 4].copy()
print(X_train[:10])
print(X_train.shape, y_train.shape)
print(X_train.columns)
print(big_df.isnull().sum().sum())
print("Loaded training file")

# Get results for test file
print("Loading test file")
test_filename = args.input2
big_X_test = pd.read_csv("../data/" + args.input2, header='infer', sep=",")
total_length = len(big_X_test.columns)
X_test = big_X_test.iloc[:, range(5, total_length)]

if ("MFP" not in args.input1):
    xgb_best = load_model(
        "../models/xgb_models/xgb_LS_Compound_LS_Protein_regressor_best_estimator.pk")
else:
    xgb_best = load_model(
        "../models/xgb_models/xgb_MFP_Compound_LS_Protein_regressor_best_estimator.pk")

print("Making Predictions")
y_pred = xgb_best.predict(X_test)

meta_X_test = big_X_test.iloc[:, [0, 2]].copy()
meta_X_test.loc[:, 'predictions'] = y_pred
if ("sars_cov_2" in args.input2):
    meta_X_test.loc[:, 'labels'] = 0
else:
    meta_X_test.loc[:, 'labels'] = big_X_test.iloc[:, 4].copy()

out_file = args.output
meta_X_test.to_csv("../results/" + out_file, index=False)
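# +
# Hedged sketch of a quick sanity check on the stored predictions. Assumption: the project's
# calculate_regression_metrics helper reports similar quantities; plain scikit-learn and
# scipy calls are used here instead.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr, spearmanr

if "sars_cov_2" not in args.input2:  # true labels exist only for the held-out test set
    y_true = meta_X_test['labels'].to_numpy()
    print("R2        :", r2_score(y_true, y_pred))
    print("MAE       :", mean_absolute_error(y_true, y_pred))
    print("MSE       :", mean_squared_error(y_true, y_pred))
    print("Pearson r :", pearsonr(y_true, y_pred)[0])
    print("Spearman r:", spearmanr(y_true, y_pred)[0])
# -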
## explain the model's predictions using SHAP values
#explainer = shap.TreeExplainer(xgb_gs.best_estimator_)
#shap_values = explainer.shap_values(X_train)
#shap.summary_plot(shap_values, X_train)

# +
# Get results for SARS-COV-2 for SMILES embedding + protein embedding (input_option = 0)
# or Morgan fingerprints + protein embedding (input_option = 1)
input_option = 0
if (input_option == 0):
    big_X_test = pd.read_csv(
        "../data/sars_cov_2_Compound_Viral_interactions_for_Supervised_Learning_with_LS_LS.csv",
        header='infer', sep=",")
    total_length = len(big_X_test.columns)
    X_test = big_X_test.iloc[:, range(5, total_length)]

    svm_best = load_model(
        "../models/svm_models/svm_LS_Compound_LS_Protein_regressor_best_estimator.pk")
    scaler = load_model(
        "../models/svm_models/svm_LS_Compound_LS_Protein_scaling_gs.pk")

    y_pred = svm_best.predict(scaler.transform(X_test))
    meta_X_test = big_X_test.iloc[:, [0, 2]].copy()
    meta_X_test.loc[:, 'predictions'] = y_pred
    meta_X_test.loc[:, 'labels'] = 0
    meta_X_test.to_csv("../results/SVM_" + data_type_options[input_option] +
                       "supervised_sars_cov_2_predictions.csv", index=False)
elif (input_option == 1):
    big_X_test = pd.read_csv(
        "../data/sars_cov_2_Compound_Viral_interactions_for_Supervised_Learning_with_MFP_LS.csv",
from keras.datasets import mnist

# Load MNIST, standardize each image to zero mean / unit variance, and one-hot encode the labels
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x = x_train.reshape(-1, 28 * 28)
x = (x - x.mean(axis=1).reshape(-1, 1)) / x.std(axis=1).reshape(-1, 1)
x = x.reshape(-1, 28, 28, 1)
y = pd.get_dummies(y_train).to_numpy()

xt = x_test.reshape(-1, 28 * 28)
xt = (xt - xt.mean(axis=1).reshape(-1, 1)) / xt.std(axis=1).reshape(-1, 1)
xt = xt.reshape(-1, 28, 28, 1)
yt = pd.get_dummies(y_test).to_numpy()

# Build a small CNN with the from-scratch layers (Conv2d, Pool2d, FFL, Dropout, Flatten)
m = Sequential()
m.add(Conv2d(input_shape=(28, 28, 1), filters=4, padding=None,
             kernel_size=(3, 3), activation="relu"))
m.add(Conv2d(filters=8, kernel_size=(3, 3), padding=None, activation="relu"))
m.add(Pool2d(kernel_size=(2, 2)))
m.add(Flatten())
m.add(FFL(neurons=64, activation="relu"))
m.add(Dropout(0.1))
m.add(FFL(neurons=10, activation='softmax'))

m.compile_model(lr=0.01, opt="adam", loss="cse")
m.summary()

# Train on a small subset, then visualize, save, reload, and predict
m.train(x[:30], y[:30], epochs=2, batch_size=30, val_x=xt[:10], val_y=yt[:10])
m.visualize()
m.save_model()
load_model()
m.summary()
print(m.predict(x[10]))
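# +
# For reference, a rough Keras equivalent of the network above. Assumptions: the
# from-scratch Conv2d/Pool2d/FFL layers correspond to Conv2D/MaxPooling2D/Dense, and
# padding=None means no padding ("valid"). This is a sketch, not the project's own model;
# the imports are aliased so they do not shadow the custom classes used above.
from keras.models import Sequential as KSequential
from keras.layers import Conv2D, MaxPooling2D, Flatten as KFlatten, Dense, Dropout as KDropout

km = KSequential([
    Conv2D(4, (3, 3), padding="valid", activation="relu", input_shape=(28, 28, 1)),
    Conv2D(8, (3, 3), padding="valid", activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),
    KFlatten(),
    Dense(64, activation="relu"),
    KDropout(0.1),
    Dense(10, activation="softmax"),
])
km.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
km.summary()
# -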
X_train_copy = scaler.fit_transform(X_train)
if classification_task:
    svm_gs = supervised_learning_steps("svm", "roc_auc", data_type, classification_task,
                                       model, param_svm, X_train_copy, y_train, n_iter)
else:
    svm_gs = supervised_learning_steps("svm", "r2", data_type, classification_task,
                                       model, param_svm, X_train_copy, y_train, n_iter)

svm_gs.cv_results_
save_model(scaler, "%s_models/%s_%s_scaling_gs.pk" % ("svm", "svm", data_type))
# -

svm_gs = load_model("svm_models/svm__LS_Drug_LS_Protein_regressor_gs.pk")
scaler = load_model("svm_models/svm__LS_Drug_LS_Protein_scaling_gs.pk")
svm_best = svm_gs.best_estimator_
y_pred_svm = svm_best.predict(X_train_copy)
plt.hist(y_pred_svm)

# +
np.max(svm_gs.cv_results_['mean_test_score'])

filename = "../data/Test_Drug_Viral_interactions_with_LS_v2_for_Supervised_Learning.csv"
with open(filename, "rb") as file:
    print("Loading ", filename)
    big_df = pd.read_csv(filename, header='infer', delimiter=",")
    total_length = len(big_df.columns)
    X = big_df.iloc[:, range(5, total_length)]
    Y = big_df[['pchembl_value']].to_numpy().flatten()
    meta_X = big_df.iloc[:, [0, 1, 2, 3]]
"max_depth": scipy.stats.randint(1, 9), "min_samples_leaf": scipy.stats.randint(1, 10), "max_features": scipy.stats.uniform.ppf([0.1,0.7]) } n_iter=200 if classification_task: rf_gs=supervised_learning_steps("rf","roc_auc",data_type,classification_task,model,param_rf,X_train,y_train,n_iter) else: rf_gs=supervised_learning_steps("rf","r2",data_type,classification_task,model,param_rf,X_train,y_train,n_iter) rf_gs.cv_results_ # + rf_gs = load_model("rf_models/rf__LS_Drug_LS_Protein_regressor_gs.pk") np.max(rf_gs.cv_results_["mean_test_score"]) filename = "../data/Test_Drug_Viral_interactions_with_LS_v2_for_Supervised_Learning.csv" with open(filename, "rb") as file: print("Loading ", filename) big_df = pd.read_csv(filename, header='infer', delimiter=",") total_length = len(big_df.columns) X = big_df.iloc[:,range(5,total_length)] Y = big_df[['pchembl_value']].to_numpy().flatten() meta_X = big_df.iloc[:,[0,1,2,3]] print("Lengths --> X = %d, Y = %d" % (len(X), len(Y))) print(X.columns) n_samples = len(X) indices = np.arange(n_samples)
n_iter = 200
if classification_task:
    xgb_gs = supervised_learning_steps("xgb", "roc_auc", data_type, classification_task,
                                       model, param_xgb, X_train, y_train, n_iter)
else:
    xgb_gs = supervised_learning_steps("xgb", "r2", data_type, classification_task,
                                       model, param_xgb, X_train, y_train, n_iter)

xgb_gs.cv_results_
# -

xgb_gs = load_model("xgb_models/xgb__LS_Drug_LS_Protein_regressor_gs.pk")
xgb_best = xgb_gs.best_estimator_
y_pred_xgb = xgb_best.predict(X_train)
plt.hist(y_pred_xgb)
calculate_regression_metrics(y_train, y_pred_xgb)

# +
np.max(xgb_gs.cv_results_["mean_test_score"])

filename = "../data/Test_Drug_Viral_interactions_with_LS_v2_for_Supervised_Learning.csv"
with open(filename, "rb") as file:
    print("Loading ", filename)
    big_df = pd.read_csv(filename, header='infer', delimiter=",")
    total_length = len(big_df.columns)
    X = big_df.iloc[:, range(5, total_length)]
    Y = big_df[['pchembl_value']].to_numpy().flatten()
    meta_X = big_df.iloc[:, [0, 1, 2, 3]]