def calc_results_for_ensamble(X, y, train_index, test_index, nrun, clf):
    """Score a majority-vote ensemble of ``nrun`` re-fits of ``clf`` on one fold.

    For every run the training split is re-balanced with
    ``random_upsample_balance`` and ``clf`` is fitted on it; each run then
    predicts the same balanced test split. The final prediction for a test
    sample is the label predicted most often across the runs.

    Args:
        X: Feature container indexable by ``train_index`` / ``test_index``.
        y: Label container indexable the same way as ``X``.
        train_index: Indices selecting the training samples of the fold.
        test_index: Indices selecting the test samples of the fold.
        nrun: Number of fit/predict rounds that vote; must be at least 1.
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        Tuple ``(acc, recall_0, recall_1)``: accuracy of the voted prediction
        and the recall for labels 0 and 1.

    Raises:
        ValueError: If ``nrun`` is smaller than 1 (there would be no votes).
    """
    if nrun < 1:
        raise ValueError("nrun must be at least 1")

    # Balance the test split exactly once. Re-drawing it inside the loop (as
    # was done before) makes every run predict a different randomly upsampled
    # test matrix, so the position-wise vote below would mix predictions for
    # different samples and compare them with the labels of the last draw only.
    X_test, y_test = random_upsample_balance(X[test_index], y[test_index])

    all_rez = []
    for _ in range(nrun):
        # A fresh random balancing of the training split is what makes the
        # ensemble members differ from each other.
        X_train, y_train = random_upsample_balance(X[train_index], y[train_index])
        clf.fit(X_train, y_train)
        all_rez.append(clf.predict(X_test))

    # Rows of the transposed matrix hold all votes for one test sample.
    y_pred = np.array([
        Counter(votes).most_common(1)[0][0]
        for votes in np.array(all_rez).transpose()
    ])

    acc = np.mean(y_test == y_pred)
    recall_0 = recall_score(y_test, y_pred, pos_label=0)
    recall_1 = recall_score(y_test, y_pred, pos_label=1)
    return acc, recall_0, recall_1
def calc_results_onlystudy(X, y, train_index, test_index, clf):
    """Evaluate ``clf`` on one fold using only the samples of a single study.

    Both the training and the test split are balanced with
    ``random_upsample_balance`` before being handed to
    ``calc_results_simple``.

    Args:
        X: Feature container indexable by the fold indices.
        y: Label container indexable by the fold indices.
        train_index: Indices of the training samples.
        test_index: Indices of the test samples.
        clf: Classifier forwarded to ``calc_results_simple``.

    Returns:
        Whatever ``calc_results_simple`` returns for the balanced splits.
    """
    fit_features, fit_labels = random_upsample_balance(
        X[train_index], y[train_index])
    eval_features, eval_labels = random_upsample_balance(
        X[test_index], y[test_index])
    return calc_results_simple(
        fit_features, eval_features, fit_labels, eval_labels, clf)
def get_balanced_split_for_study(full_dataset, study, y_field="posOutcome"):
    """Return one balanced train/test split for a single study.

    The study data is obtained from ``prepare_dataset``; the first split
    produced by a 5-fold ``RepeatedStratifiedKFold`` is taken, and both
    halves are balanced with ``random_upsample_balance``.

    Args:
        full_dataset: Dataset object passed through to ``prepare_dataset``.
        study: Study identifier passed through to ``prepare_dataset``.
        y_field: Name of the outcome field passed to ``prepare_dataset``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of balanced splits.
    """
    X, y = prepare_dataset(full_dataset, study, y_field=y_field)

    # Only the very first fold of the splitter is used.
    splitter = RepeatedStratifiedKFold(n_splits=5)
    first_fold = next(splitter.split(X, y))
    train_index, test_index = first_fold

    balanced_train = random_upsample_balance(X[train_index], y[train_index])
    balanced_test = random_upsample_balance(X[test_index], y[test_index])

    X_train, y_train = balanced_train
    X_test, y_test = balanced_test
    return X_train, X_test, y_train, y_test
def calc_results_withfull_simple(X, y, train_index, test_index, full_dataset, test_study, clf):
    """Evaluate ``clf`` on one fold, training on the study plus all other studies.

    The fold's own training and test splits are balanced with
    ``random_upsample_balance``. The balanced data returned by
    ``get_balanced_studies_except_test_study`` is then appended to the
    training split before calling ``calc_results_simple``.

    Args:
        X: Feature container of the test study, indexable by the fold indices.
        y: Label container of the test study, indexable by the fold indices.
        train_index: Indices of the training samples.
        test_index: Indices of the test samples.
        full_dataset: Dataset passed to
            ``get_balanced_studies_except_test_study``.
        test_study: Study identifier passed to
            ``get_balanced_studies_except_test_study``.
        clf: Classifier forwarded to ``calc_results_simple``.

    Returns:
        Whatever ``calc_results_simple`` returns for the combined training
        data and the balanced test split.
    """
    study_X, study_y = random_upsample_balance(X[train_index], y[train_index])
    X_test, y_test = random_upsample_balance(X[test_index], y[test_index])

    other_X, other_y = get_balanced_studies_except_test_study(
        full_dataset, test_study)

    # In-study samples come first, followed by the samples of other studies.
    combined_X = np.concatenate([study_X, other_X])
    combined_y = np.concatenate([study_y, other_y])
    return calc_results_simple(combined_X, X_test, combined_y, y_test, clf)
def calc_results_for_fold(X, y, train_index, test_index, clf):
    """Fit ``clf`` on one balanced fold and score it on the balanced test split.

    Args:
        X: Feature container indexable by the fold indices.
        y: Label container indexable by the fold indices.
        train_index: Indices of the training samples.
        test_index: Indices of the test samples.
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        Tuple ``(acc, recall_0, recall_1)``: accuracy and the recall for
        labels 0 and 1 on the balanced test split.
    """
    fit_X, fit_y = random_upsample_balance(X[train_index], y[train_index])
    eval_X, eval_y = random_upsample_balance(X[test_index], y[test_index])

    clf.fit(fit_X, fit_y)
    predicted = clf.predict(eval_X)

    accuracy = np.mean(eval_y == predicted)
    recall_0, recall_1 = (
        recall_score(eval_y, predicted, pos_label=label) for label in (0, 1)
    )
    return accuracy, recall_0, recall_1
def calc_results_withfull_balanced2(X, y, train_index, test_index, full_dataset, test_study, clf):
    """Evaluate ``clf`` on one fold with in-study data repeated to match other studies.

    The fold's training and test splits are balanced with
    ``random_upsample_balance``. The balanced in-study training samples are
    then repeated a whole number of times so that their count approaches the
    number of samples returned by ``get_balanced_studies_except_test_study``,
    and both parts are concatenated into the final training set.

    Args:
        X: Feature container of the test study, indexable by the fold indices.
        y: Label container of the test study, indexable by the fold indices.
        train_index: Indices of the training samples.
        test_index: Indices of the test samples.
        full_dataset: Dataset passed to
            ``get_balanced_studies_except_test_study``.
        test_study: Study identifier passed to
            ``get_balanced_studies_except_test_study``.
        clf: Classifier forwarded to ``calc_results_simple``.

    Returns:
        Whatever ``calc_results_simple`` returns for the combined training
        data and the balanced test split.
    """
    X_train, y_train = random_upsample_balance(X[train_index], y[train_index])
    X_test, y_test = random_upsample_balance(X[test_index], y[test_index])

    X_train_other, y_train_other = get_balanced_studies_except_test_study(
        full_dataset, test_study)

    # Keep at least one copy of the in-study samples: when the other studies
    # provide fewer samples than the study itself, the floor of the ratio is 0
    # and np.repeat(..., 0) would silently drop all in-study training data.
    n_rep = max(1, len(y_train_other) // len(y_train))

    X_train_rep = np.repeat(X_train, n_rep, axis=0)
    y_train_rep = np.repeat(y_train, n_rep, axis=0)

    X_train = np.concatenate([X_train_rep, X_train_other])
    y_train = np.concatenate([y_train_rep, y_train_other])
    return calc_results_simple(X_train, X_test, y_train, y_test, clf)
def get_balanced_study(full_dataset, study):
    """Return the data of one study after balancing.

    Args:
        full_dataset: Dataset object passed through to ``prepare_dataset``.
        study: Study identifier passed through to ``prepare_dataset``.

    Returns:
        The result of ``random_upsample_balance`` applied to the features and
        labels that ``prepare_dataset`` produced for ``study``.
    """
    features, labels = prepare_dataset(full_dataset, study)
    balanced = random_upsample_balance(features, labels)
    return balanced
# Small manual check of random_upsample_balance on a toy dataset.
from funs_balance import random_upsample_balance

# Six two-column samples; four are labelled 1 and two are labelled 0,
# so the classes are deliberately imbalanced.
X = [[1, 1], [2, 1], [3, 1], [4, 1], [1, 0], [2, 0]]
y = [1, 1, 1, 1, 0, 0]

# Balance the toy data and print the result for visual inspection.
Xb, yb = random_upsample_balance(X, y)
print(Xb)
print(yb)
def print_results(dataset, set1, set2):
    """Cross-validate several classifiers on two patient sets and print scores.

    Rows of ``dataset`` whose ``patient_ID`` is in ``set1`` / ``set2`` are
    turned into feature/label arrays with ``prepare_full_dataset``.
    ``add_one_features`` is called with 0 for the first set and 1 for the
    second one; the code below treats column 0 of its result as the set
    indicator and the remaining columns as the gene features (presumably the
    helper prepends that column; confirm against ``add_one_features``).

    For every fold of a 5-split, 10-repeat ``RepeatedStratifiedKFold`` the
    train and test parts are balanced with ``random_upsample_balance`` and
    five configurations are scored with ``calc_results_simple``:

    * ``genes``: gene columns only, ``XGBClassifier``
    * ``genes_set``: all columns, ``XGBClassifier``
    * ``genes_biased``: all columns, ``BiasedXgboost``
    * ``genes_double``: all columns, ``DoubleXgboost``
    * ``study``: indicator column only, ``XGBClassifier``

    Per-fold results and a final summary over all folds are printed.

    Args:
        dataset: Table with a ``patient_ID`` column, supporting ``.loc`` and
            ``.isin`` selection.
        set1: Patient IDs forming the first set (indicator value 0).
        set2: Patient IDs forming the second set (indicator value 1).

    Returns:
        None. All output goes to stdout.
    """

    def _prepare_for(patient_ids):
        # Restrict the dataset to the given patients and vectorise it.
        selected = dataset.loc[dataset['patient_ID'].isin(patient_ids)]
        return prepare_full_dataset(selected)

    X_set1, y_set1 = _prepare_for(set1)
    X_set2, y_set2 = _prepare_for(set2)

    X_genes_wf = np.concatenate([
        add_one_features(X_set1, 0),
        add_one_features(X_set2, 1),
    ])
    y_all = np.concatenate([y_set1, y_set2])

    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)

    print_order = [
        "genes", "genes_set", "genes_biased", "genes_double", "study"
    ]
    max_len_order = max(len(name) for name in print_order)

    rez = defaultdict(list)
    for train_index, test_index in kf.split(X_genes_wf, y_all):
        wf_train, y_train = X_genes_wf[train_index], y_all[train_index]
        wf_test, y_test = X_genes_wf[test_index], y_all[test_index]

        print("before balanced")
        print_count_two_sets(wf_train[:, 0], y_train)
        print_count_two_sets(wf_test[:, 0], y_test)

        wf_train, y_train = random_upsample_balance(wf_train, y_train)
        wf_test, y_test = random_upsample_balance(wf_test, y_test)

        print("after balanced")
        print_count_two_sets(wf_train[:, 0], y_train)
        print_count_two_sets(wf_test[:, 0], y_test)

        # (result key, train features, test features, classifier factory).
        # Classifiers are instantiated lazily, right before they are used,
        # and in the same order as the keys are listed here.
        experiments = (
            ("genes", wf_train[:, 1:], wf_test[:, 1:], XGBClassifier),
            ("genes_set", wf_train, wf_test, XGBClassifier),
            ("genes_biased", wf_train, wf_test, BiasedXgboost),
            ("genes_double", wf_train, wf_test, DoubleXgboost),
            ("study", wf_train[:, :1], wf_test[:, :1], XGBClassifier),
        )
        for key, features_train, features_test, make_clf in experiments:
            rez[key].append(
                calc_results_simple(features_train, features_test,
                                    y_train, y_test, make_clf()))

        # Scores of the fold that has just been evaluated.
        for order in print_order:
            padding = " " * (max_len_order - len(order))
            print(order, padding, ": ", list_to_4g_str(rez[order][-1]))
        print("")

    # Summary over all folds and repeats.
    for order in print_order:
        padding = " " * (max_len_order - len(order))
        print("==> ", order, padding, ": ", list2d_to_4g_str_pm(rez[order]))