def print_results_for_field(dataset, field, prefix):
    """Benchmark XGBoost and logistic regression on target column `field`.

    Runs 100 balanced train/test splits on two variants of `dataset` — the
    full feature set and a copy with treatment columns dropped ("notrea") —
    and prints one aggregated result line per model/variant combination.
    `field` and `prefix` are echoed in the output so runs can be grepped.

    Improvements over the previous version: removed the dead commented-out
    per-iteration print block and renamed the stuttering local
    `dataset_notrea_dataset`.
    """
    # Rows with a missing target cannot be used for supervised evaluation.
    dataset = drop_na(dataset, field)
    # Same data with treatment columns removed (feature-ablation baseline).
    notrea_dataset = drop_trea(dataset)

    print_order = [
        "full_xgboost", "full_logi", "notrea_xgboost", "notrea_logi"
    ]
    # Width of the longest label, used to align the printed columns.
    max_len_order = max(map(len, print_order))

    rez = defaultdict(list)
    for _ in range(100):  # 100 independent balanced resampling rounds
        ds = get_balanced_split(dataset, field)
        rez["full_xgboost"].append(calc_results_simple(*ds, XGBClassifier()))
        rez["full_logi"].append(
            calc_results_simple(*ds, LogisticRegression(max_iter=1000)))

        ds = get_balanced_split(notrea_dataset, field)
        rez["notrea_xgboost"].append(calc_results_simple(*ds, XGBClassifier()))
        rez["notrea_logi"].append(
            calc_results_simple(*ds, LogisticRegression(max_iter=1000)))

    # One summary line (mean +/- spread, per list2d_to_4g_str_pm) per model.
    for order in print_order:
        print("==> ", field, prefix, order, " " * (max_len_order - len(order)),
              ": ", list2d_to_4g_str_pm(rez[order]))
# ===== Example #2 (示例 #2) — separator from the scraped code listing =====
                                  XGBClassifier()))

        for order in print_order:
            print(order, " " * (max_len_order - len(order)), ": ",
                  list_to_4g_str(rez[order][-1]))
        print("")

    for order in print_order:
        print("==> ", order, " " * (max_len_order - len(order)), ": ",
              list2d_to_4g_str_pm(rez[order]))


# Load treatment metadata and the PAM50-categorical expression dataset;
# the "notrea" copy has treatment columns removed so the classifier sees
# expression features only.
atreat_dataset = read_alltreat_dataset()

pam_types_cat_dataset = read_pam_types_cat_dataset()
notrea_dataset = drop_trea(pam_types_cat_dataset)

# Variant 1: study_20194_GPL96_all-bmc15, protocol 1 vs protocol 5.
# set1/set2 are the patient IDs belonging to each treatment-protocol arm;
# NOTE(review): protocol numbers are compared as strings ('1', '5') —
# presumably the column is stored as text; verify against the loader.

set1 = atreat_dataset.loc[
    (atreat_dataset['study'] == "study_20194_GPL96_all-bmc15")
    & (atreat_dataset['treatment_protocol_number'] == '1')]['patient_ID']
set2 = atreat_dataset.loc[
    (atreat_dataset['study'] == "study_20194_GPL96_all-bmc15")
    & (atreat_dataset['treatment_protocol_number'] == '5')]['patient_ID']

print("==> study_20194_GPL96_all-bmc15 protocol 1  vs protocol 5")
print_results(notrea_dataset, set1, set2)
print("==>")

#Variant 2  study_9893_GPL5049_all-bmc15 protocol 1 vs protocol 2
# ===== Example #3 (示例 #3) — separator from the scraped code listing =====
    print("stack")
    print_mean_fold_importance(np.concatenate([X_set1, X_set2]),
                               np.concatenate([y_set1, y_set2]), genes_list)

    X_set1_wf = add_one_features_tail(X_set1, 0)
    X_set2_wf = add_one_features_tail(X_set2, 1)

    print("stack_with_set")
    print_mean_fold_importance(np.concatenate([X_set1_wf, X_set2_wf]),
                               np.concatenate([y_set1, y_set2]), genes_list)


# Load treatment metadata and the ComBat-corrected expression dataset;
# drop treatment columns so only expression features remain.
atreat_dataset = read_alltreat_dataset()
combat_dataset = read_combat_dataset()
notrea_dataset = drop_trea(combat_dataset)
# Alternative input kept for reference:
#dataset = read_pam_types_num_dataset()
#notrea_dataset = drop_trea(dataset)

# Variant 1: study_20194_GPL96_all-bmc15, protocol 1 vs protocol 5.
# set1/set2 hold the patient IDs of each treatment-protocol arm
# (protocol numbers are compared as strings — matches the loader's dtype).

set1 = atreat_dataset.loc[
    (atreat_dataset['study'] == "study_20194_GPL96_all-bmc15")
    & (atreat_dataset['treatment_protocol_number'] == '1')]['patient_ID']
set2 = atreat_dataset.loc[
    (atreat_dataset['study'] == "study_20194_GPL96_all-bmc15")
    & (atreat_dataset['treatment_protocol_number'] == '5')]['patient_ID']

print("==> study_20194_GPL96_all-bmc15 protocol 1  vs protocol 5")
print_results(notrea_dataset, set1, set2)
print("==>")
    acc = np.mean(y_test == y_pred)
    recall_0 = recall_score(y_test, y_pred, pos_label=0)
    recall_1 = recall_score(y_test, y_pred, pos_label=1)

    return acc, recall_0, recall_1


def count_to_str(y):
    """Summarize binary labels *y* as a "count_01=<#zeros>/<#ones>" string."""
    tally = Counter(y)
    zeros, ones = tally[0], tally[1]
    return "count_01=%i/%i" % (zeros, ones)


# Load the three dataset variants and derive treatment-free ("notrea")
# copies of the full and "mike" datasets for feature-ablation comparisons.
full_dataset = read_full_dataset()
mike_dataset = read_mike_dataset()
treat_dataset = read_treat_dataset()
full_notrea_dataset = drop_trea(full_dataset)
mike_notrea_dataset = drop_trea(mike_dataset)

# Unique study identifiers present in the full dataset.
all_studies = list(set(full_dataset['study']))

# Labels for every dataset/model combination reported below; the longest
# label's width is used to align the printed columns.
print_order = [
    "full", "full_notrea", "full_pam50", "mike", "mike_svm", "mike_logi",
    "mike_notrea", "mike_notrea_svm", "mike_notrea_logi", "mike_pam50",
    "mike_pam50_svm", "mike_pam50_logi", "trea", "trea_svm", "trea_logi"
]
max_len_order = max(map(len, print_order))
for study in ['study_20194_GPL96_all-bmc15']:
    #for study in ['study_17705_GPL96_MDACC_Tissue_BC_Tamoxifen-bmc15']:

    X_full, y_full = prepare_dataset(full_dataset, study)
import pandas as pd
import os
import numpy as np
import random
import math
from funs_common import read_mike_dataset, drop_trea, prepare_full_dataset
from sklearn.manifold import TSNE
import pickle

dataset = read_mike_dataset()

# Drop treatment columns so the embedding reflects expression features only.
notrea_dataset = drop_trea(dataset)
X, y = prepare_full_dataset(notrea_dataset, y_field='study')

# Project the feature matrix to 2-D; used to visualize study/batch structure.
X_embedded = TSNE(n_components=2).fit_transform(X)

# Fix: the previous `pickle.dump(..., open(..., "wb"))` never closed the
# file handle; a context manager guarantees the file is flushed and closed.
with open("experement24_studytest_tsne.p", "wb") as out_file:
    pickle.dump(X_embedded, out_file)
    
    
    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=1)
    rez = []
    for i, (train_index, test_index) in enumerate(kf.split(X_full, y_full)):
        X_train, X_test = X_full[train_index], X_full[test_index]
        y_train, y_test = y_full[train_index], y_full[test_index]
        rez.append(calc_results_simple(X_train, X_test, y_train, y_test, XGBClassifier()))
        
    print(list2d_to_4g_str_pm(rez))
    
    return np.mean(np.array(rez)[:,0])
                

# ComBat-corrected dataset with treatment columns removed.
dataset = read_combat_dataset()
dataset = drop_trea(dataset)


# Unique study identifiers in the dataset.
all_studies = list(set(dataset['study']))

# For every unordered pair of studies, evaluate how separable the two
# studies are and collect (mean_accuracy, study1, study2) tuples.
all_rez = []
for s1,s2 in itertools.combinations(sorted(all_studies), 2):
    ds1 = dataset.loc[dataset['study'] == s1]
    ds2 = dataset.loc[dataset['study'] == s2]
    print("studies: ", s1, s2, len(ds1), len(ds2))
    rez = print_results_get_mean_acc(ds1, ds2)
    all_rez.append((rez, s1, s2))
    
print("\n--------------------------------------\n")
for rez,s1,s2 in sorted(all_rez):
    if (rez < 0.99):
# ===== Example #7 (示例 #7) — separator from the scraped code listing =====
    print("==> DFS", prefix)
    print_results_for_field(dataset, Xt_full_dict, "DFS", prefix)
    print("")
    print("")

    print("==> posOutcome", prefix)
    print_results_for_field(dataset, Xt_full_dict, "posOutcome", prefix)
    print("")
    print("")


# Load the dataset variants used below.
treat_dataset = read_treat_dataset()
combat_dataset = read_combat_dataset()
pam_types_cat_dataset = read_pam_types_cat_dataset()

# Expression feature matrix (treatment columns dropped); labels unused here.
X_full, _ = prepare_full_dataset(drop_trea(combat_dataset))

# Both datasets must list patients in the same order, since the dict below
# keys the LDA features by combat_dataset's patient_ID.
assert all(pam_types_cat_dataset['patient_ID'] == combat_dataset['patient_ID'])

#print(list(pam_types_cat_dataset))
#print_results(pam_types_cat_dataset, "old")

# Sweep over topic counts: reduce expression features with LDA, then
# evaluate the reduced representation via print_results.
for n_cluster in [1, 5, 10, 20, 100, 200]:
    lda = LatentDirichletAllocation(n_components=n_cluster)
    # Shift by the global minimum so all values are non-negative
    # (LatentDirichletAllocation rejects negative input).
    Xt_full = lda.fit_transform(X_full - np.min(X_full))
    # Map patient_ID -> LDA topic vector for lookup by the evaluator.
    Xt_full_dict = {
        i: x
        for i, x in zip(combat_dataset['patient_ID'], Xt_full)
    }

    print_results(pam_types_cat_dataset, Xt_full_dict, "nc" + str(n_cluster))