def print_results_for_field(dataset, field, prefix):
    """Print summary scores for ``field`` on the full and ``drop_trea`` data.

    The data is first filtered with ``drop_na(dataset, field)``; a second
    variant is derived from it with ``drop_trea``. For 100 rounds, each
    variant gets a fresh ``get_balanced_split`` which is scored through
    ``calc_results_simple`` with an ``XGBClassifier`` and then a
    ``LogisticRegression(max_iter=1000)``. One summary line per
    configuration is printed at the end.

    Args:
        dataset: Data handed to ``drop_na`` / ``drop_trea``.
        field: Passed to ``drop_na`` and ``get_balanced_split``; also echoed
            in every output line.
        prefix: Free-form label echoed in every output line.
    """
    cleaned = drop_na(dataset, field)
    # (tag, data) pairs; the tag is the prefix of the result keys below.
    variants = (("full", cleaned), ("notrea", drop_trea(cleaned)))

    print_order = [
        "full_xgboost", "full_logi", "notrea_xgboost", "notrea_logi"
    ]
    width = max(len(name) for name in print_order)

    scores = defaultdict(list)
    for _ in range(100):
        for tag, data in variants:
            # Both classifiers of a variant share the same split.
            split = get_balanced_split(data, field)
            scores[tag + "_xgboost"].append(
                calc_results_simple(*split, XGBClassifier()))
            scores[tag + "_logi"].append(
                calc_results_simple(*split,
                                    LogisticRegression(max_iter=1000)))

    for name in print_order:
        padding = " " * (width - len(name))
        print("==> ", field, prefix, name, padding, ": ",
              list2d_to_4g_str_pm(scores[name]))
def print_results_get_mean_acc(ds1, ds2):
    """Score how well XGBoost separates ``ds1`` rows from ``ds2`` rows.

    Rows of ``ds1`` are labelled 0 and rows of ``ds2`` are labelled 1. The
    majority-class share is printed as ``bias_accuracy``, then an
    ``XGBClassifier`` is evaluated with one repeat of stratified 5-fold
    cross-validation and the aggregated fold results are printed.

    Args:
        ds1: First dataset; must support ``len`` and ``prepare_full_dataset``.
        ds2: Second dataset, same requirements.

    Returns:
        Mean over folds of the first entry of each ``calc_results_simple``
        result. NOTE(review): presumably accuracy - confirm against
        ``calc_results_simple``.
    """
    n_first, n_second = len(ds1), len(ds2)
    bias_acc = max(n_first, n_second) / (n_first + n_second)
    print(f"bias_accuracy: {bias_acc : .3f}")

    # Only the feature part is used; the second return value is discarded.
    features_first, _ = prepare_full_dataset(ds1)
    features_second, _ = prepare_full_dataset(ds2)

    X_all = np.concatenate([features_first, features_second])
    y_all = np.concatenate([np.zeros(n_first), np.ones(n_second)])

    splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=1)
    fold_results = [
        calc_results_simple(X_all[train_idx], X_all[test_idx],
                            y_all[train_idx], y_all[test_idx],
                            XGBClassifier())
        for train_idx, test_idx in splitter.split(X_all, y_all)
    ]

    print(list2d_to_4g_str_pm(fold_results))

    first_metric = np.array(fold_results)[:, 0]
    return np.mean(first_metric)
# Example #3
# 0
def print_results(dataset, set1, set2):
    """Compare five model/feature configurations with repeated stratified CV.

    Rows whose ``patient_ID`` is in ``set1`` / ``set2`` are prepared with
    ``prepare_full_dataset`` and stacked. Each fold of a 5-split, 10-repeat
    stratified cross-validation is scored through ``calc_results_for_fold``
    for:

    * ``genes``        - plain features, ``XGBClassifier``
    * ``genes_set``    - ``add_one_features`` output, ``XGBClassifier``
    * ``genes_biased`` - ``add_one_features`` output, ``BiasedXgboost``
    * ``genes_double`` - ``add_one_features`` output, ``DoubleXgboost``
    * ``study``        - first column of the augmented matrix only

    Per-fold results are printed after every fold, followed by one
    aggregated line per configuration.

    Args:
        dataset: DataFrame with a ``patient_ID`` column.
        set1: Patient IDs of the first group (augmented with value 0).
        set2: Patient IDs of the second group (augmented with value 1).
    """
    in_first = dataset['patient_ID'].isin(set1)
    X_set1, y_set1 = prepare_full_dataset(dataset.loc[in_first])
    in_second = dataset['patient_ID'].isin(set2)
    X_set2, y_set2 = prepare_full_dataset(dataset.loc[in_second])

    augmented_first = add_one_features(X_set1, 0)
    augmented_second = add_one_features(X_set2, 1)

    X_plain = np.concatenate([X_set1, X_set2])
    X_augmented = np.concatenate([augmented_first, augmented_second])
    y_all = np.concatenate([y_set1, y_set2])

    # NOTE(review): column 0 is presumably the value added by
    # add_one_features - confirm against that helper.
    X_first_column = X_augmented[:, :1]

    splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)

    # (result key, feature matrix, model factory), in evaluation order.
    experiments = (
        ("genes", X_plain, XGBClassifier),
        ("genes_set", X_augmented, XGBClassifier),
        ("genes_biased", X_augmented, BiasedXgboost),
        ("genes_double", X_augmented, DoubleXgboost),
        ("study", X_first_column, XGBClassifier),
    )
    print_order = [name for name, _, _ in experiments]
    width = max(len(name) for name in print_order)

    scores = defaultdict(list)

    for train_index, test_index in splitter.split(X_plain, y_all):
        for name, features, make_model in experiments:
            scores[name].append(
                calc_results_for_fold(features, y_all, train_index,
                                      test_index, make_model()))

        # Progress report for the fold that has just finished.
        for name in print_order:
            padding = " " * (width - len(name))
            print(name, padding, ": ", list_to_4g_str(scores[name][-1]))
        print("")

    for name in print_order:
        padding = " " * (width - len(name))
        print("==> ", name, padding, ": ", list2d_to_4g_str_pm(scores[name]))
# Example #4
# 0
def print_results(dataset, set1, set2):
    """Cross-validate XGBoost with and without the extra set feature.

    Rows whose ``patient_ID`` is in ``set1`` / ``set2`` are prepared with
    ``prepare_full_dataset`` and stacked. One repeat of stratified 5-fold
    cross-validation is scored through ``calc_results_for_fold`` for:

    * ``base``       - plain features
    * ``studyf``     - features from ``add_one_features``
    * ``studyf_l``   - same features, objective set explicitly to
      ``binary:logistic``
    * ``only_study`` - first column of the augmented matrix only

    Per-fold results are printed after every fold, followed by one
    aggregated line per configuration.

    Args:
        dataset: DataFrame with a ``patient_ID`` column.
        set1: Patient IDs of the first group (augmented with value 0).
        set2: Patient IDs of the second group (augmented with value 1).
    """
    X_set1, y_set1 = prepare_full_dataset(
        dataset.loc[dataset['patient_ID'].isin(set1)])
    X_set2, y_set2 = prepare_full_dataset(
        dataset.loc[dataset['patient_ID'].isin(set2)])

    X_set1_wf = add_one_features(X_set1, 0)
    X_set2_wf = add_one_features(X_set2, 1)

    X_full = np.concatenate([X_set1, X_set2])
    X_full_wf = np.concatenate([X_set1_wf, X_set2_wf])
    y_full = np.concatenate([y_set1, y_set2])

    # NOTE(review): column 0 is presumably the value added by
    # add_one_features - confirm against that helper.
    Xs = X_full_wf[:, :1]

    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=1)
    print_order = ["base", "studyf", "studyf_l", "only_study"]

    max_len_order = max(map(len, print_order))

    rez = defaultdict(list)

    for train_index, test_index in kf.split(X_full, y_full):
        rez["base"].append(
            calc_results_for_fold(X_full, y_full, train_index, test_index,
                                  XGBClassifier()))
        rez["studyf"].append(
            calc_results_for_fold(X_full_wf, y_full, train_index, test_index,
                                  XGBClassifier()))
        # The objective must be passed by keyword: the first positional
        # parameter of XGBClassifier is not `objective` in every xgboost
        # release (some take `max_depth` first, others are keyword-only).
        rez["studyf_l"].append(
            calc_results_for_fold(X_full_wf, y_full, train_index, test_index,
                                  XGBClassifier(objective="binary:logistic")))
        rez["only_study"].append(
            calc_results_for_fold(Xs, y_full, train_index, test_index,
                                  XGBClassifier()))

        # Progress report for the fold that has just finished.
        for order in print_order:
            print(order, " " * (max_len_order - len(order)), ": ",
                  list_to_4g_str(rez[order][-1]))
        print("")

    for order in print_order:
        print("==> ", order, " " * (max_len_order - len(order)), ": ",
              list2d_to_4g_str_pm(rez[order]))
# Example #5
# 0
def print_results(dataset, set1, set2):
    """Score how well XGBoost separates ``set1`` patients from ``set2``.

    Rows whose ``patient_ID`` is in ``set1`` are labelled 0 and rows whose
    ``patient_ID`` is in ``set2`` are labelled 1. The ID-list sizes and the
    majority share computed from them (``bias_accuracy``) are printed, then
    an ``XGBClassifier`` is evaluated with 10 repeats of stratified 5-fold
    cross-validation and the aggregated fold results are printed.

    Args:
        dataset: DataFrame with a ``patient_ID`` column.
        set1: Patient IDs of the first group.
        set2: Patient IDs of the second group.
    """
    bias_acc = max(len(set1), len(set2)) / (len(set1) + len(set2))
    print(len(set1), len(set2))
    print(f"bias_accuracy: {bias_acc : .3f}")
    X_set1, _ = prepare_full_dataset(
        dataset.loc[dataset['patient_ID'].isin(set1)])
    X_set2, _ = prepare_full_dataset(
        dataset.loc[dataset['patient_ID'].isin(set2)])

    # Size the labels from the rows actually selected, not from the ID
    # lists: an ID that is missing from (or repeated in) `dataset` would
    # otherwise make X and y differ in length and break the split below.
    y_set1 = np.zeros(len(X_set1))
    y_set2 = np.ones(len(X_set2))

    X_full = np.concatenate([X_set1, X_set2])
    y_full = np.concatenate([y_set1, y_set2])

    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)
    rez = []
    for train_index, test_index in kf.split(X_full, y_full):
        X_train, X_test = X_full[train_index], X_full[test_index]
        y_train, y_test = y_full[train_index], y_full[test_index]
        rez.append(
            calc_results_simple(X_train, X_test, y_train, y_test,
                                XGBClassifier()))
    print(list2d_to_4g_str_pm(rez))
def print_results(dataset, set1, set2):
    """Score set1-vs-set2 separability and print the top features per fold.

    Rows whose ``patient_ID`` is in ``set1`` are labelled 0 and rows whose
    ``patient_ID`` is in ``set2`` are labelled 1. The ID-list sizes and the
    majority share computed from them (``bias_accuracy``) are printed. For
    each fold of one repeat of stratified 5-fold cross-validation, an
    ``XGBClassifier`` is scored with ``calc_results_simple``, fitted on the
    training part, and the 20 highest ``feature_importances_`` are printed
    as indices, as negated importance values, and as column names. The
    aggregated fold results are printed at the end.

    Args:
        dataset: DataFrame with ``patient_ID`` and the columns ``study``,
            ``pCR``, ``RFS``, ``DFS`` and ``posOutcome``.
        set1: Patient IDs of the first group.
        set2: Patient IDs of the second group.
    """
    bias_acc = max(len(set1), len(set2)) / (len(set1) + len(set2))
    print(len(set1), len(set2))
    print(f"bias_accuracy: {bias_acc : .3f}")
    X_set1, _ = prepare_full_dataset(
        dataset.loc[dataset['patient_ID'].isin(set1)])
    X_set2, _ = prepare_full_dataset(
        dataset.loc[dataset['patient_ID'].isin(set2)])

    # Size the labels from the rows actually selected, not from the ID
    # lists: an ID that is missing from (or repeated in) `dataset` would
    # otherwise make X and y differ in length and break the split below.
    y_set1 = np.zeros(len(X_set1))
    y_set2 = np.ones(len(X_set2))

    X_full = np.concatenate([X_set1, X_set2])
    y_full = np.concatenate([y_set1, y_set2])

    # Column names used to label importances; built once instead of once
    # per printed feature.
    # NOTE(review): assumes prepare_full_dataset keeps exactly these columns
    # in this order - confirm against that helper.
    feature_names = list(
        dataset.drop(columns=[
            'study', 'patient_ID', 'pCR', 'RFS', 'DFS', 'posOutcome'
        ]))

    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=1)
    rez = []
    for train_index, test_index in kf.split(X_full, y_full):
        X_train, X_test = X_full[train_index], X_full[test_index]
        y_train, y_test = y_full[train_index], y_full[test_index]
        clf = XGBClassifier()
        rez.append(calc_results_simple(X_train, X_test, y_train, y_test, clf))
        # NOTE(review): calc_results_simple may already fit `clf`; the
        # explicit fit is kept so feature_importances_ is always populated.
        clf.fit(X_train, y_train)

        # Negating turns the ascending sort into "most important first".
        neg_importances = -1 * clf.feature_importances_
        top_indices = np.argsort(neg_importances)[:20]
        print(top_indices)
        print(np.sort(neg_importances)[:20])

        for feature_index in top_indices:
            print(feature_names[feature_index])

    print(list2d_to_4g_str_pm(rez))
def _print_results_set1set2(X_set1, y_set1, X_set2, y_set2):
    """Compare XGBoost on set 1 alone against both sets stacked together.

    ``stack`` scores an ``XGBClassifier`` on the concatenation of both sets
    after ``add_one_features`` (value 0 for set 1, value 1 for set 2);
    ``set1`` scores it on ``X_set1`` / ``y_set1`` only. Both use the same
    5-split, 10-repeat stratified splitter object, and the stacked run is
    evaluated first. One aggregated line per configuration is printed.

    Args:
        X_set1: Feature matrix of the first set.
        y_set1: Labels of the first set.
        X_set2: Feature matrix of the second set.
        y_set2: Labels of the second set.
    """
    stacked_X = np.concatenate(
        [add_one_features(X_set1, 0), add_one_features(X_set2, 1)])
    stacked_y = np.concatenate([y_set1, y_set2])

    splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)

    # Dict values are evaluated top to bottom: "stack" first, then "set1".
    scores = {
        "stack": [
            calc_results_for_fold(stacked_X, stacked_y, train_idx, test_idx,
                                  XGBClassifier())
            for train_idx, test_idx in splitter.split(stacked_X, stacked_y)
        ],
        "set1": [
            calc_results_for_fold(X_set1, y_set1, train_idx, test_idx,
                                  XGBClassifier())
            for train_idx, test_idx in splitter.split(X_set1, y_set1)
        ],
    }

    print_order = ["set1", "stack"]
    width = max(len(name) for name in print_order)
    for name in print_order:
        padding = " " * (width - len(name))
        print("==> ", name, padding, ": ", list2d_to_4g_str_pm(scores[name]))
# Example #8
# 0
def print_results(dataset, set1, set2):
    """Compare five model/feature configurations on upsample-balanced folds.

    Rows whose ``patient_ID`` is in ``set1`` / ``set2`` are prepared with
    ``prepare_full_dataset``, augmented with ``add_one_features`` and
    stacked. For each fold of a 5-split, 10-repeat stratified
    cross-validation, the train and test parts are each balanced with
    ``random_upsample_balance`` (counts are printed before and after), and
    ``calc_results_simple`` is run for:

    * ``genes``        - every column except the first, ``XGBClassifier``
    * ``genes_set``    - all columns, ``XGBClassifier``
    * ``genes_biased`` - all columns, ``BiasedXgboost``
    * ``genes_double`` - all columns, ``DoubleXgboost``
    * ``study``        - first column only, ``XGBClassifier``

    Per-fold results are printed after every fold, followed by one
    aggregated line per configuration.

    Args:
        dataset: DataFrame with a ``patient_ID`` column.
        set1: Patient IDs of the first group (augmented with value 0).
        set2: Patient IDs of the second group (augmented with value 1).
    """
    in_first = dataset['patient_ID'].isin(set1)
    X_set1, y_set1 = prepare_full_dataset(dataset.loc[in_first])
    in_second = dataset['patient_ID'].isin(set2)
    X_set2, y_set2 = prepare_full_dataset(dataset.loc[in_second])

    # NOTE(review): add_one_features presumably puts the given value in
    # column 0 - confirm against that helper.
    X_all = np.concatenate(
        [add_one_features(X_set1, 0), add_one_features(X_set2, 1)])
    y_all = np.concatenate([y_set1, y_set2])

    splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)
    print_order = [
        "genes", "genes_set", "genes_biased", "genes_double", "study"
    ]
    width = max(len(name) for name in print_order)

    scores = defaultdict(list)

    for train_index, test_index in splitter.split(X_all, y_all):
        X_train, y_train = X_all[train_index], y_all[train_index]
        X_test, y_test = X_all[test_index], y_all[test_index]

        print("before balanced")
        print_count_two_sets(X_train[:, 0], y_train)
        print_count_two_sets(X_test[:, 0], y_test)

        # Train and test parts are balanced independently of each other.
        X_train, y_train = random_upsample_balance(X_train, y_train)
        X_test, y_test = random_upsample_balance(X_test, y_test)

        print("after balanced")
        print_count_two_sets(X_train[:, 0], y_train)
        print_count_two_sets(X_test[:, 0], y_test)

        rest_train, rest_test = X_train[:, 1:], X_test[:, 1:]
        first_train, first_test = X_train[:, :1], X_test[:, :1]

        # (result key, train features, test features, model factory),
        # in evaluation order.
        experiments = (
            ("genes", rest_train, rest_test, XGBClassifier),
            ("genes_set", X_train, X_test, XGBClassifier),
            ("genes_biased", X_train, X_test, BiasedXgboost),
            ("genes_double", X_train, X_test, DoubleXgboost),
            ("study", first_train, first_test, XGBClassifier),
        )
        for name, fit_X, eval_X, make_model in experiments:
            scores[name].append(
                calc_results_simple(fit_X, eval_X, y_train, y_test,
                                    make_model()))

        # Progress report for the fold that has just finished.
        for name in print_order:
            padding = " " * (width - len(name))
            print(name, padding, ": ", list_to_4g_str(scores[name][-1]))
        print("")

    for name in print_order:
        padding = " " * (width - len(name))
        print("==> ", name, padding, ": ", list2d_to_4g_str_pm(scores[name]))