Пример #1
0
def embedFig(target="IBD"):
    """Plot prediction curves for the embedded features and save them as a PDF.

    Requests inputs from ``hf.getMlInput`` with ``embed=True`` and the
    module-level ``qual_vecs``, merges the validation split into the
    training split, calls ``hf.predictIBD`` with ``plot`` and ``plot_pr``
    enabled, and writes the figure to ``curves_AGP_test_embed.pdf`` inside
    ``fig_dir``.

    Args:
        target: Forwarded to ``hf.getMlInput`` as ``target``.

    Returns:
        None. The values returned by ``hf.predictIBD`` are discarded.
    """
    fig = plt.figure(figsize=(15, 5))

    # Embedded train / validation / test splits from the helper module.
    splits = hf.getMlInput(
        otu_train, otu_test, map_train, map_test,
        target=target, embed=True, qual_vecs=qual_vecs)
    X_train, X_val, X_test, y_train, y_val, y_test = splits

    # Fold the validation split into the training data.
    # NOTE(review): `+` concatenates only if the labels are lists - confirm.
    X_fit = pd.concat([X_train, X_val], axis=0)
    y_fit = y_train + y_val

    plt.subplot(1, 2, 1)
    title = ("Embedding weighted by averaging taxa " + str(X_fit.shape[1]) +
             " features")
    # Nine values come back; none of them is used after this call.
    (model, auc_test, auc_train, fpr, tpr,
     precision, f1_score, f2_score, _) = hf.predictIBD(
        X_fit, y_fit, X_test, y_test,
        graph_title=title,
        max_depth=5, n_estimators=95, weight=20,
        plot=True, plot_pr=True)

    fig.savefig(os.path.join(fig_dir, "curves_AGP_test_embed.pdf"))
Пример #2
0
def pcaFig(target="IBD"):
    """Plot prediction curves for PCA-reduced features and save them as a PDF.

    Requests inputs from ``hf.getMlInput`` with ``pca_reduced=True`` and
    ``numComponents=100``, merges the validation split into the training
    split, calls ``hf.predictIBD`` with ``plot`` and ``plot_pr`` enabled, and
    writes the figure to ``curves_AGP_test_pca.pdf`` inside ``fig_dir``.

    Args:
        target: Forwarded to ``hf.getMlInput`` as ``target``.

    Returns:
        None. The values returned by ``hf.predictIBD`` are discarded.
    """
    f = plt.figure(figsize=(15, 5))
    X_train, X_val, X_test, y_train, y_val, y_test = hf.getMlInput(
        otu_train,
        otu_test,
        map_train,
        map_test,
        target=target,
        pca_reduced=True,
        numComponents=100)
    # Fold the validation split into the training data.
    # NOTE(review): `+` concatenates only if the labels are lists - confirm.
    X_train = pd.concat([X_train, X_val], axis=0)
    y_train = y_train + y_val
    plt.subplot(1, 2, 1)
    # Nine values come back; none of them is used after this call.
    m, auc_pca, auc_train_pca, fpr_pca, tpr_pca, prec_pca, f1_pca, f2_pca, _ = hf.predictIBD(
        X_train,
        y_train,
        X_test,
        y_test,
        graph_title="PCA dimensionality reduced " + str(X_train.shape[1]) +
        " features",
        max_depth=5,
        n_estimators=50,
        weight=20,
        plot=True,
        plot_pr=True)
    # Pass directory and file name as separate arguments so os.path.join
    # inserts the separator; concatenating them first dropped it whenever
    # fig_dir had no trailing slash.
    f.savefig(os.path.join(fig_dir, "curves_AGP_test_pca.pdf"))
Пример #3
0
def asinFig(target="IBD"):
    """Plot prediction curves for asinh-normalized inputs and save them as a PDF.

    Requests inputs from ``hf.getMlInput`` with ``asinNormalized=True``,
    merges the validation split into the training split, calls
    ``hf.predictIBD`` with ``plot`` and ``plot_pr`` enabled, and writes the
    figure to ``asin_otu.pdf`` inside ``fig_dir``.

    Args:
        target: Forwarded to ``hf.getMlInput`` as ``target``.

    Returns:
        None. The values returned by ``hf.predictIBD`` are discarded.
    """
    fig = plt.figure(figsize=(15, 5))

    # asinh-normalized train / validation / test splits from the helper module.
    splits = hf.getMlInput(
        otu_train, otu_test, map_train, map_test,
        target=target, asinNormalized=True)
    X_train, X_val, X_test, y_train, y_val, y_test = splits

    # Fold the validation split into the training data.
    # NOTE(review): `+` concatenates only if the labels are lists - confirm.
    X_fit = pd.concat([X_train, X_val], axis=0)
    y_fit = y_train + y_val

    plt.subplot(1, 2, 1)
    title = ("Normalized asinh Taxa Abundances " + str(X_fit.shape[1]) +
             " features")
    # Nine values come back; none of them is used after this call.
    (model, auc_test, auc_train, fpr, tpr,
     precision, f1_score, f2_score, _) = hf.predictIBD(
        X_fit, y_fit, X_test, y_test,
        graph_title=title,
        max_depth=5, n_estimators=170, weight=20,
        plot=True, plot_pr=True)

    fig.savefig(os.path.join(fig_dir, "asin_otu.pdf"))
# --- Naive Bayes on the embedded data (113 features per the original note) ---
# Hold out 20% of the embedded samples for testing, with a fixed seed.
X_embed_train, X_embed_test, y_embed_train, y_embed_test = train_test_split(
    X_embed,
    y_embed,
    test_size=0.2,
    random_state=10)

# The input contains negative values, so MultinomialNB and ComplementNB
# cannot be used; GaussianNB is used instead.
clf = GaussianNB()
model = clf.fit(X_embed_train, y_embed_train)

predicted_y = model.predict(X_embed_test)

# Compute the statistics with both plot flags enabled, then save the figure.
f = plt.figure(figsize=(15, 5))
roc_auc, fpr, tpr, average_precision, f1, f2 = hf.computeMLstats(
    model,
    X_embed_test,
    y_embed_test,
    plot=True,
    plot_pr=True,
    graph_title="Naive Bayes Classifier on embedded data",
    flipped=False)
f.savefig(os.path.join(fig_dir, "naive_bayes_classifier_embed.pdf"))

# --- Naive Bayes on the OTU table (26k+ features per the original note) ---
X_train, X_val, X_test, y_train, y_val, y_test = hf.getMlInput(
    otu_train,
    otu_test,
    map_train,
    map_test,
    target="IBD",
    asinNormalized=True)
# Fold the validation split into the training data.
X_train = pd.concat([X_train, X_val], axis=0)
y_train = y_train + y_val

# The input contains negative values, so MultinomialNB and ComplementNB
# cannot be used; GaussianNB is used instead.
clf = GaussianNB()
model = clf.fit(X_train, y_train)

predicted_y = model.predict(X_test)

# Same statistics and plots as above, now for the OTU-table model.
f = plt.figure(figsize=(15, 5))
roc_auc, fpr, tpr, average_precision, f1, f2 = hf.computeMLstats(
    model,
    X_test,
    y_test,
    plot=True,
    plot_pr=True,
    graph_title="Naive Bayes Classifier on OTU table",
    flipped=False)
f.savefig(os.path.join(fig_dir, "naive_bayes_classifier_otu.pdf"))


Пример #5
0
        weight=20,
        plot=True,
        plot_pr=True)
    f.savefig(os.path.join(fig_dir + "curves_AGP_test_pca.pdf"))


# Render and save the PCA figure.
pcaFig()

# Re-execute the helper module before the hf.* calls below
# (presumably to pick up edits made to it during the session).
importlib.reload(hf)
target = "IBD"
# hf.getMlInput returns training, validation, and testing data.
# Original note: with embed=True the data is normalized using asinh
# (inverse hyperbolic sine) - not verifiable from this file; confirm in hf.
X_train, X_val, X_test, y_train, y_val, y_test = hf.getMlInput(
    otu_train,
    otu_test,
    map_train,
    map_test,
    target=target,
    embed=True,
    qual_vecs=qual_vecs)
# Recombine all three splits into one dataset for cross-validation.
# NOTE(review): `+` concatenates only if the labels are lists - confirm.
X = pd.concat([X_train, X_val, X_test], axis=0)
y = y_train + y_val + y_test

# Only feat_imp_embed is consumed in the code visible below; the three score
# values are not referenced again in this view.
auc_crossVal, auc_prec_crossVal, f1_crossVal, feat_imp_embed = hf.crossValPrediction(
    X, y, max_depth=2, n_estimators=50, weight=20)

# Weights/importance of 113 features (100 properties + 13 demographic
# features, per the original note).
feat_imp_df = hf.getFeatImpDf(feat_imp_embed)

# mapping of property to function from kegg database
pathway_table = pd.read_csv(data_dir + "/property_pathway_dict.txt",