# Example #1
def adam_pretrain(model, model_name, train_xs, train_ys, num_epoch, test_xs, test_ys):
    """Pre-train a classifier with Adam, checkpoint it, and return the reloaded model.

    Args:
        model: a compiled-or-compilable Keras model (one-hot targets expected,
            since the loss is categorical cross-entropy).
        model_name: base name used for the saved checkpoint.
        train_xs, train_ys: training inputs and one-hot labels.
        num_epoch: number of training epochs.
        test_xs, test_ys: validation inputs and one-hot labels.

    Returns:
        The model reloaded from the saved checkpoint (round-trips the weights
        through disk, verifying the save/load path).
    """
    model.compile(optimizer=keras.optimizers.Adam(),
                  loss='categorical_crossentropy', metrics=['accuracy'])
    # `epochs` replaces the Keras 1.x `nb_epoch` keyword, which was removed in
    # Keras 2 and raises a TypeError there.
    model.fit(train_xs, train_ys, batch_size=128, epochs=num_epoch,
              validation_data=(test_xs, test_ys), shuffle=True)
    model_name = '%s_adam_pretrain' % model_name
    save_model(model, model_name)
    # Reload so the caller gets exactly what was persisted on disk.
    model = load_model(model_name)
    return model
# Example #2
# Randomized/grid search over the random-forest hyper-parameters.
# Scoring metric follows the task type: ROC-AUC for classification, R^2 for
# regression (same estimator, same search budget).
n_iter = 300

if classification_task:
    rf_gs = supervised_learning_steps("rf", "roc_auc", data_type,
                                      classification_task, model, param_rf,
                                      X_train, y_train, n_iter)
else:
    rf_gs = supervised_learning_steps("rf", "r2", data_type,
                                      classification_task, model, param_rf,
                                      X_train, y_train, n_iter)

# Bare expression — a jupytext/notebook cell that displays the CV table.
rf_gs.cv_results_

# +
# Reload the fitted search object from its pickle and show the best mean CV score.
rf_gs = load_model("rf_models/rf_" + data_type_options[input_option] +
                   "_regressor_gs.pk")
np.max(rf_gs.cv_results_["mean_test_score"])

# Test-set files: index 0 = latent-space (LS) compound features,
# index 1 = Morgan fingerprints (MFP); both paired with LS protein features.
file_list = [
    "../data/Test_Compound_Viral_interactions_for_Supervised_Learning_with_LS_LS.csv",
    "../data/Test_Compound_Viral_interactions_for_Supervised_Learning_with_MFP_LS.csv"
]

filename = file_list[input_option]
with open(filename, "rb") as file:
    print("Loading ", filename)
    # NOTE: read by path, not via the opened handle — the `with` only scopes the block.
    big_df = pd.read_csv(filename, header='infer', delimiter=",")
    total_length = len(big_df.columns)
    # Features start at column 5; earlier columns are metadata.
    X = big_df.iloc[:, range(5, total_length)]
    Y = big_df[['pchembl_value']].to_numpy().flatten()
    # Presumably identifier/metadata columns — confirm against the CSV schema.
    meta_X = big_df.iloc[:, [0, 1, 2, 3]]
# Example #3
# -
def get_cv_results(model, index, cv_scores, nfolds):
    """Append the per-fold CV test scores of one parameter setting to `cv_scores`.

    Args:
        model: a fitted search object exposing `cv_results_` with
            "split<i>_test_score" arrays.
        index: row of `cv_results_` (i.e. which parameter setting) to read.
        cv_scores: list to extend in place with the fold scores.
        nfolds: number of CV folds; also reused as the rounding precision.

    Returns:
        The same `cv_scores` list, extended with `nfolds` rounded scores.
    """
    fold_scores = [
        np.round(model.cv_results_[f"split{fold}_test_score"][index], nfolds)
        for fold in range(nfolds)
    ]
    cv_scores.extend(fold_scores)
    return cv_scores


# +
# Load the fitted search objects and collect per-fold CV scores (5 folds)
# for the best parameter setting of each model/feature combination.

# GLM with latent-space (LS) compound + LS protein features
glm_LS_LS = load_model(
    "../models/glm_models/glm_LS_Compound_LS_Protein_regressor_gs.pk")
# Row of cv_results_ with the highest mean test score = best setting.
glm_LS_LS_index = np.argmax(glm_LS_LS.cv_results_["mean_test_score"])
glm_LS_LS_r2_scores = get_cv_results(glm_LS_LS, glm_LS_LS_index, [], 5)

# GLM with Morgan-fingerprint (MFP) compound + LS protein features
glm_MFP_LS = load_model(
    "../models/glm_models/glm_MFP_Compound_LS_Protein_regressor_gs.pk")
glm_MFP_LS_index = np.argmax(glm_MFP_LS.cv_results_["mean_test_score"])
glm_MFP_LS_r2_scores = get_cv_results(glm_MFP_LS, glm_MFP_LS_index, [], 5)

# Random forest with LS compound + LS protein features
rf_LS_LS = load_model(
    "../models/rf_models/rf_LS_Compound_LS_Protein_regressor_gs.pk")
rf_LS_LS_index = np.argmax(rf_LS_LS.cv_results_["mean_test_score"])
rf_LS_LS_r2_scores = get_cv_results(rf_LS_LS, rf_LS_LS_index, [], 5)

rf_MFP_LS = load_model(
# SVMs are scale-sensitive, so fit the scaler on the training data first.
X_train_copy = scaler.fit_transform(X_train)

# Hyper-parameter search; metric follows the task type (ROC-AUC vs R^2).
if classification_task:
    svm_gs = supervised_learning_steps("svm", "roc_auc", data_type,
                                       classification_task, model, param_svm,
                                       X_train_copy, y_train, n_iter)
else:
    svm_gs = supervised_learning_steps("svm", "r2", data_type,
                                       classification_task, model, param_svm,
                                       X_train_copy, y_train, n_iter)

# Bare expression — notebook cell output of the CV table.
svm_gs.cv_results_
# Persist the scaler alongside the model so inference can reuse it.
save_model(scaler, "%s_models/%s_%s_scaling_gs.pk" % ("svm", "svm", data_type))
# -

# Reload the fitted search object and its matching scaler from disk.
svm_gs = load_model("svm_models/svm__LS_Drug_LS_Protein_regressor_gs.pk")
scaler = load_model("svm_models/svm__LS_Drug_LS_Protein_scaling_gs.pk")
svm_best = svm_gs.best_estimator_

# +
# Best mean CV score across all searched parameter settings.
np.max(svm_gs.cv_results_['mean_test_score'])

# Test sets: index 0 = latent-space compound features, 1 = Morgan fingerprints.
file_list = [
    "../data/Test_Compound_Viral_interactions_for_Supervised_Learning_with_LS_LS.csv",
    "../data/Test_Compound_Viral_interactions_for_Supervised_Learning_with_MFP_LS.csv"
]

filename = file_list[input_option]
with open(filename, "rb") as file:
    print("Loading ", filename)
    # Read by path; the opened handle is unused.
    big_df = pd.read_csv(filename, header='infer', delimiter=",")
# Example #5
    # Sanity-check the loaded training data (enclosing scope is outside this fragment).
    print(X_train[:10])
    print(X_train.shape, y_train.shape)
    print(X_train.columns)
    # Total NaN count over the whole frame — expected to print 0.
    print(big_df.isnull().sum().sum())
    print("Loaded training file")

    #Get results for test file
    print("Loading test file")
    test_filename = args.input2
    big_X_test = pd.read_csv("../data/" + args.input2, header='infer', sep=",")
    total_length = len(big_X_test.columns)
    # Features start at column 5; earlier columns are metadata.
    X_test = big_X_test.iloc[:, range(5, total_length)]

    # Pick the checkpoint matching the feature type named in input1:
    # latent-space (LS) compound features vs Morgan fingerprints (MFP).
    if ("MFP" not in args.input1):
        rf_best = load_model(
            "../models/rf_models/rf_LS_Compound_LS_Protein_regressor_best_estimator.pk"
        )
    else:
        rf_best = load_model(
            "../models/rf_models/rf_MFP_Compound_LS_Protein_regressor_best_estimator.pk"
        )

    print("Making Predictions")
    y_pred = rf_best.predict(X_test)

    # Keep columns 0 and 2 (presumably compound/protein identifiers — confirm
    # against the CSV schema) next to the predictions.
    meta_X_test = big_X_test.iloc[:, [0, 2]].copy()
    meta_X_test.loc[:, 'predictions'] = y_pred
    # SARS-CoV-2 sets carry no measured labels; 0 is a placeholder.
    if ("sars_cov_2" in args.input2):
        meta_X_test.loc[:, 'labels'] = 0
    else:
        meta_X_test.loc[:, 'labels'] = big_X_test.iloc[:, 4].copy()
# Example #6
    # Sanity-check the loaded training data (enclosing scope is outside this fragment).
    print(X_train[:10])
    print(X_train.shape,y_train.shape)
    print(X_train.columns)
    # Total NaN count over the whole frame — expected to print 0.
    print(big_df.isnull().sum().sum())
    print("Loaded training file")

    #Get results for test file
    print("Loading test file")
    test_filename = args.input2
    big_X_test = pd.read_csv("../data/"+args.input2,header='infer',sep=",")
    total_length = len(big_X_test.columns)
    # Features start at column 5; earlier columns are metadata.
    X_test = big_X_test.iloc[:,range(5,total_length)]


    # Pick the XGBoost checkpoint matching the feature type named in input1:
    # latent-space (LS) compound features vs Morgan fingerprints (MFP).
    if ("MFP" not in args.input1):
        xgb_best = load_model("../models/xgb_models/xgb_LS_Compound_LS_Protein_regressor_best_estimator.pk")
    else:
        xgb_best = load_model("../models/xgb_models/xgb_MFP_Compound_LS_Protein_regressor_best_estimator.pk")

    print("Making Predictions")
    y_pred = xgb_best.predict(X_test)

    # Keep columns 0 and 2 (presumably identifiers — confirm against the CSV
    # schema) next to the predictions.
    meta_X_test = big_X_test.iloc[:,[0,2]].copy()
    meta_X_test.loc[:,'predictions']=y_pred
    # SARS-CoV-2 sets carry no measured labels; 0 is a placeholder.
    if ("sars_cov_2" in args.input2):
        meta_X_test.loc[:,'labels']=0
    else:
        meta_X_test.loc[:,'labels']=big_X_test.iloc[:,4].copy()

    # Write identifiers + predictions + labels for downstream evaluation.
    out_file = args.output
    meta_X_test.to_csv("../results/"+out_file,index=False)
## explain the model's predictions using SHAP values
#explainer = shap.TreeExplainer(xgb_gs.best_estimator_)
#shap_values = explainer.shap_values(X_train)
#shap.summary_plot(shap_values, X_train)
# +
# Get results for SARS-CoV-2 for SMILES embedding + protein embedding
# (input_option = 0) or Morgan fingerprints + protein embedding (input_option = 1)
input_option = 0
if (input_option == 0):
    big_X_test = pd.read_csv(
        "../data/sars_cov_2_Compound_Viral_interactions_for_Supervised_Learning_with_LS_LS.csv",
        header='infer',
        sep=",")
    total_length = len(big_X_test.columns)
    # Features start at column 5; earlier columns are metadata.
    X_test = big_X_test.iloc[:, range(5, total_length)]
    svm_best = load_model(
        "../models/svm_models/svm_LS_Compound_LS_Protein_regressor_best_estimator.pk"
    )
    # Apply the scaler that was fitted at training time before predicting.
    scaler = load_model(
        "../models/svm_models/svm_LS_Compound_LS_Protein_scaling_gs.pk")
    y_pred = svm_best.predict(scaler.transform(X_test))

    # Keep columns 0 and 2 (presumably identifiers — confirm) next to the
    # predictions; SARS-CoV-2 data has no measured labels, so 0 is a placeholder.
    meta_X_test = big_X_test.iloc[:, [0, 2]].copy()
    meta_X_test.loc[:, 'predictions'] = y_pred
    meta_X_test.loc[:, 'labels'] = 0
    meta_X_test.to_csv("../results/SVM_" + data_type_options[input_option] +
                       "supervised_sars_cov_2_predictions.csv",
                       index=False)

elif (input_option == 1):
    big_X_test = pd.read_csv(
        "../data/sars_cov_2_Compound_Viral_interactions_for_Supervised_Learning_with_MFP_LS.csv",
# Example #8
from keras.datasets import mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Per-image standardisation: flatten each 28x28 image, subtract its own mean
# and divide by its own std, then restore the channel-last (28, 28, 1) shape.
x = x_train.reshape(-1, 28 * 28)
x = (x-x.mean(axis=1).reshape(-1, 1))/x.std(axis=1).reshape(-1, 1)
x = x.reshape(-1, 28, 28, 1)
# One-hot encode the 10 digit classes.
y = pd.get_dummies(y_train).to_numpy()
xt = x_test.reshape(-1, 28 * 28)
xt = (xt-xt.mean(axis=1).reshape(-1, 1))/xt.std(axis=1).reshape(-1, 1)
xt = xt.reshape(-1, 28, 28, 1)
yt = pd.get_dummies(y_test).to_numpy()

# NOTE(review): Sequential/Conv2d/Pool2d/FFL/compile_model look like a custom
# mini-framework, not Keras (Keras spells it Conv2D and has no FFL/Pool2d) —
# confirm which library provides these before editing.
m = Sequential()
m.add(Conv2d(
    input_shape=(28, 28, 1), filters=4, padding=None,
    kernel_size=(3, 3), activation="relu"))
m.add(Conv2d(filters=8, kernel_size=(3, 3), padding=None, activation="relu"))
m.add(Pool2d(kernel_size=(2, 2)))
m.add(Flatten())
m.add(FFL(neurons=64, activation="relu"))
m.add(Dropout(0.1))

m.add(FFL(neurons=10, activation='softmax'))
m.compile_model(lr=0.01, opt="adam", loss="cse")
m.summary()
# Tiny smoke run: 30 training samples, 2 epochs, 10 validation samples.
m.train(x[:30], y[:30], epochs=2, batch_size=30, val_x=xt[:10], val_y=yt[:10])
m.visualize()
m.save_model()
# NOTE(review): the loaded model is discarded — presumably load_model() restores
# state elsewhere; verify, else assign its return value.
load_model()
m.summary()
# NOTE(review): x[10] is a single (28, 28, 1) image with no batch axis — verify
# predict() accepts unbatched input, otherwise use x[10:11].
print(m.predict(x[10]))
# Example #9
# SVMs are scale-sensitive, so fit the scaler on the training data first.
X_train_copy = scaler.fit_transform(X_train)

# Hyper-parameter search; metric follows the task type (ROC-AUC vs R^2).
if classification_task:
    svm_gs = supervised_learning_steps("svm", "roc_auc", data_type,
                                       classification_task, model, param_svm,
                                       X_train_copy, y_train, n_iter)
else:
    svm_gs = supervised_learning_steps("svm", "r2", data_type,
                                       classification_task, model, param_svm,
                                       X_train_copy, y_train, n_iter)

# Bare expression — notebook cell output of the CV table.
svm_gs.cv_results_
# Persist the scaler alongside the model so inference can reuse it.
save_model(scaler, "%s_models/%s_%s_scaling_gs.pk" % ("svm", "svm", data_type))
# -

# Reload the fitted search object and matching scaler, then inspect the
# distribution of training-set predictions.
svm_gs = load_model("svm_models/svm__LS_Drug_LS_Protein_regressor_gs.pk")
scaler = load_model("svm_models/svm__LS_Drug_LS_Protein_scaling_gs.pk")
svm_best = svm_gs.best_estimator_
y_pred_svm = svm_best.predict(X_train_copy)
plt.hist(y_pred_svm)

# +
# Best mean CV score across all searched parameter settings.
np.max(svm_gs.cv_results_['mean_test_score'])
filename = "../data/Test_Drug_Viral_interactions_with_LS_v2_for_Supervised_Learning.csv"
with open(filename, "rb") as file:
    print("Loading ", filename)
    # Read by path; the opened handle is unused.
    big_df = pd.read_csv(filename, header='infer', delimiter=",")
    total_length = len(big_df.columns)
    # Features start at column 5; earlier columns are metadata.
    X = big_df.iloc[:, range(5, total_length)]
    Y = big_df[['pchembl_value']].to_numpy().flatten()
    meta_X = big_df.iloc[:, [0, 1, 2, 3]]
# Random-forest hyper-parameter search with the same metric convention.
n_iter = 300

if classification_task:
    rf_gs = supervised_learning_steps("rf", "roc_auc", data_type,
                                      classification_task, model, param_rf,
                                      X_train, y_train, n_iter)
else:
    rf_gs = supervised_learning_steps("rf", "r2", data_type,
                                      classification_task, model, param_rf,
                                      X_train, y_train, n_iter)

rf_gs.cv_results_

# +
# Reload the fitted RF search object and show the best mean CV score.
rf_gs = load_model("rf_models/rf_" + data_type_options[input_option] +
                   "_regressor_gs.pk")
np.max(rf_gs.cv_results_["mean_test_score"])

# Test sets: index 0 = latent-space compound features, 1 = Morgan fingerprints.
file_list = [
    "../data/Test_Compound_Viral_interactions_for_Supervised_Learning_with_LS_LS.csv",
    "../data/Test_Compound_Viral_interactions_for_Supervised_Learning_with_MFP_LS.csv"
]

filename = file_list[input_option]
with open(filename, "rb") as file:
    print("Loading ", filename)
    big_df = pd.read_csv(filename, header='infer', delimiter=",")
    total_length = len(big_df.columns)
    X = big_df.iloc[:, range(5, total_length)]
    Y = big_df[['pchembl_value']].to_numpy().flatten()
    # Presumably identifier/metadata columns — confirm against the CSV schema.
    meta_X = big_df.iloc[:, [0, 1, 2, 3]]
               "max_depth": scipy.stats.randint(1, 9),
               "min_samples_leaf": scipy.stats.randint(1, 10),
               "max_features": scipy.stats.uniform.ppf([0.1,0.7])
}

# Random-forest hyper-parameter search (200 candidate settings); metric
# follows the task type: ROC-AUC for classification, R^2 for regression.
n_iter=200

if classification_task:
    rf_gs=supervised_learning_steps("rf","roc_auc",data_type,classification_task,model,param_rf,X_train,y_train,n_iter)
else:
    rf_gs=supervised_learning_steps("rf","r2",data_type,classification_task,model,param_rf,X_train,y_train,n_iter)

# Bare expression — notebook cell output of the CV table.
rf_gs.cv_results_

# +
# Reload the fitted RF search object, show its best mean CV score, and load
# the drug-viral test set.
rf_gs = load_model("rf_models/rf__LS_Drug_LS_Protein_regressor_gs.pk")
np.max(rf_gs.cv_results_["mean_test_score"])
filename = "../data/Test_Drug_Viral_interactions_with_LS_v2_for_Supervised_Learning.csv"
with open(filename, "rb") as file:
    print("Loading ", filename)
    # Read by path; the opened handle is unused.
    big_df = pd.read_csv(filename, header='infer', delimiter=",")
    total_length = len(big_df.columns)
    # Features start at column 5; earlier columns are metadata.
    X = big_df.iloc[:,range(5,total_length)]
    Y = big_df[['pchembl_value']].to_numpy().flatten()
    meta_X = big_df.iloc[:,[0,1,2,3]]
    print("Lengths --> X = %d, Y = %d" % (len(X), len(Y)))

print(X.columns)
n_samples = len(X)
indices = np.arange(n_samples)
# XGBoost hyper-parameter search with the same metric convention.
n_iter = 200

if classification_task:
    xgb_gs = supervised_learning_steps("xgb", "roc_auc", data_type,
                                       classification_task, model, param_xgb,
                                       X_train, y_train, n_iter)
else:
    xgb_gs = supervised_learning_steps("xgb", "r2", data_type,
                                       classification_task, model, param_xgb,
                                       X_train, y_train, n_iter)

xgb_gs.cv_results_
# -

# Reload the best XGBoost estimator and inspect its training-set predictions.
xgb_gs = load_model("xgb_models/xgb__LS_Drug_LS_Protein_regressor_gs.pk")
xgb_best = xgb_gs.best_estimator_
y_pred_xgb = xgb_best.predict(X_train)
plt.hist(y_pred_xgb)
calculate_regression_metrics(y_train, y_pred_xgb)

# +
# Best mean CV score, then reload the drug-viral test set.
np.max(xgb_gs.cv_results_["mean_test_score"])
filename = "../data/Test_Drug_Viral_interactions_with_LS_v2_for_Supervised_Learning.csv"
with open(filename, "rb") as file:
    print("Loading ", filename)
    big_df = pd.read_csv(filename, header='infer', delimiter=",")
    total_length = len(big_df.columns)
    X = big_df.iloc[:, range(5, total_length)]
    Y = big_df[['pchembl_value']].to_numpy().flatten()
    # Presumably identifier/metadata columns — confirm against the CSV schema.
    meta_X = big_df.iloc[:, [0, 1, 2, 3]]