def print_results_for_field(dataset, field, prefix):
    """Score XGBoost and logistic regression on ``field`` and print a summary.

    The data is first passed through ``drop_na(dataset, field)``; a second
    variant is derived from that result with ``drop_trea``. For each of 100
    rounds a fresh ``get_balanced_split`` is drawn for both variants and each
    classifier is scored with ``calc_results_simple``. After all rounds, one
    aggregated line per configuration is printed via ``list2d_to_4g_str_pm``.

    Args:
        dataset: Data handed to ``drop_na`` / ``drop_trea`` /
            ``get_balanced_split``.
        field: Target field name; also echoed in every printed line.
        prefix: Free-form label echoed in every printed line.
    """
    cleaned = drop_na(dataset, field)
    variants = (("full", cleaned), ("notrea", drop_trea(cleaned)))
    model_factories = (
        ("xgboost", XGBClassifier),
        ("logi", lambda: LogisticRegression(max_iter=1000)),
    )

    print_order = ["full_xgboost", "full_logi", "notrea_xgboost", "notrea_logi"]
    width = max(len(name) for name in print_order)

    rez = defaultdict(list)
    for _ in range(100):
        for variant, data in variants:
            # One split per variant and round, shared by both classifiers.
            split = get_balanced_split(data, field)
            for model_name, make_model in model_factories:
                rez[f"{variant}_{model_name}"].append(
                    calc_results_simple(*split, make_model()))
        # Per-round printing of rez[...][-1] is intentionally disabled.

    for name in print_order:
        padding = " " * (width - len(name))
        print("==> ", field, prefix, name, padding, ": ",
              list2d_to_4g_str_pm(rez[name]))
def print_results_get_mean_acc(ds1, ds2):
    """Cross-validate a classifier that tells ``ds1`` rows from ``ds2`` rows.

    Prints the majority-class baseline ("bias accuracy"), then runs one pass
    of stratified 5-fold cross-validation with a fresh ``XGBClassifier`` per
    fold. Rows from ``ds1`` are labelled 0 and rows from ``ds2`` are labelled
    1. The aggregated fold results are printed via ``list2d_to_4g_str_pm``.

    Args:
        ds1: First dataset; must support ``len`` and ``prepare_full_dataset``.
        ds2: Second dataset, same requirements.

    Returns:
        The mean over folds of the first entry of each fold's result
        (presumably accuracy, going by the function name -- confirm against
        ``calc_results_simple``).
    """
    n_first, n_second = len(ds1), len(ds2)
    bias_acc = max(n_first, n_second) / (n_first + n_second)
    print(f"bias_accuracy: {bias_acc : .3f}")

    # Only the feature matrices are used; the original targets are discarded
    # because the label here is dataset membership.
    first_X, _ = prepare_full_dataset(ds1)
    second_X, _ = prepare_full_dataset(ds2)
    membership_first = np.zeros(n_first)
    membership_second = np.ones(n_second)

    features = np.concatenate([first_X, second_X])
    labels = np.concatenate([membership_first, membership_second])

    splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=1)
    fold_scores = []
    for train_idx, test_idx in splitter.split(features, labels):
        fold_scores.append(
            calc_results_simple(features[train_idx], features[test_idx],
                                labels[train_idx], labels[test_idx],
                                XGBClassifier()))

    print(list2d_to_4g_str_pm(fold_scores))
    first_metric = np.array(fold_scores)[:, 0]
    return np.mean(first_metric)
def print_results(dataset, set1, set2):
    """Compare five model/feature configurations on two patient groups.

    Rows whose ``patient_ID`` is in ``set1`` / ``set2`` are prepared with
    ``prepare_full_dataset`` and stacked. Each group is also passed through
    ``add_one_features`` with the tag 0 / 1; column 0 of that result is used
    on its own as the "study" feature (presumably the tag column -- confirm
    against ``add_one_features``).

    Stratified 5-fold cross-validation repeated 10 times is run, and on every
    fold these configurations are scored with ``calc_results_for_fold``:

    * ``genes``        -- plain features, ``XGBClassifier``
    * ``genes_set``    -- tagged features, ``XGBClassifier``
    * ``genes_biased`` -- tagged features, ``BiasedXgboost``
    * ``genes_double`` -- tagged features, ``DoubleXgboost``
    * ``study``        -- column 0 of the tagged features, ``XGBClassifier``

    A line per configuration is printed after each fold, followed by an
    aggregated summary at the end.

    Args:
        dataset: DataFrame with a ``patient_ID`` column.
        set1: Patient IDs forming the first group.
        set2: Patient IDs forming the second group.
    """
    first_rows = dataset.loc[dataset['patient_ID'].isin(set1)]
    X_first, y_first = prepare_full_dataset(first_rows)
    second_rows = dataset.loc[dataset['patient_ID'].isin(set2)]
    X_second, y_second = prepare_full_dataset(second_rows)

    # Debug aid kept from the original (replace features with noise):
    # X_first = np.random.rand(*X_first.shape)
    # X_second = np.random.rand(*X_second.shape)

    tagged_first = add_one_features(X_first, 0)
    tagged_second = add_one_features(X_second, 1)

    plain = np.concatenate([X_first, X_second])
    tagged = np.concatenate([tagged_first, tagged_second])
    targets = np.concatenate([y_first, y_second])
    study_only = tagged[:, :1]

    # (label, feature matrix, model factory), in evaluation and print order.
    experiments = [
        ("genes", plain, XGBClassifier),
        ("genes_set", tagged, XGBClassifier),
        ("genes_biased", tagged, BiasedXgboost),
        ("genes_double", tagged, DoubleXgboost),
        ("study", study_only, XGBClassifier),
    ]
    print_order = [label for label, _, _ in experiments]
    width = max(len(label) for label in print_order)

    splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)
    rez = defaultdict(list)
    for train_idx, test_idx in splitter.split(plain, targets):
        for label, matrix, make_model in experiments:
            rez[label].append(
                calc_results_for_fold(matrix, targets, train_idx, test_idx,
                                      make_model()))

        # Progress output: the newest result of every configuration.
        for label in print_order:
            print(label, " " * (width - len(label)), ": ",
                  list_to_4g_str(rez[label][-1]))
        print("")

    for label in print_order:
        print("==> ", label, " " * (width - len(label)), ": ",
              list2d_to_4g_str_pm(rez[label]))
def print_results(dataset, set1, set2):
    """Cross-validate XGBoost with and without a study-indicator feature.

    Rows whose ``patient_ID`` is in ``set1`` / ``set2`` are prepared with
    ``prepare_full_dataset`` and stacked. Each group is also passed through
    ``add_one_features`` with the tag 0 / 1; column 0 of that result is used
    on its own for the ``only_study`` run (presumably the tag column --
    confirm against ``add_one_features``).

    One pass of stratified 5-fold cross-validation is run. Per fold, four
    configurations are scored with ``calc_results_for_fold``:

    * ``base``       -- plain features, default ``XGBClassifier``
    * ``studyf``     -- tagged features, default ``XGBClassifier``
    * ``studyf_l``   -- tagged features, explicit ``binary:logistic`` objective
    * ``only_study`` -- column 0 of the tagged features only

    A line per configuration is printed after each fold, followed by an
    aggregated summary at the end.

    Args:
        dataset: DataFrame with a ``patient_ID`` column.
        set1: Patient IDs forming the first group.
        set2: Patient IDs forming the second group.
    """
    X_set1, y_set1 = prepare_full_dataset(
        dataset.loc[dataset['patient_ID'].isin(set1)])
    X_set2, y_set2 = prepare_full_dataset(
        dataset.loc[dataset['patient_ID'].isin(set2)])

    X_set1_wf = add_one_features(X_set1, 0)
    X_set2_wf = add_one_features(X_set2, 1)

    X_full = np.concatenate([X_set1, X_set2])
    X_full_wf = np.concatenate([X_set1_wf, X_set2_wf])
    y_full = np.concatenate([y_set1, y_set2])
    Xs = X_full_wf[:, :1]

    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=1)

    print_order = ["base", "studyf", "studyf_l", "only_study"]
    max_len_order = max(map(len, print_order))

    rez = defaultdict(list)
    for train_index, test_index in kf.split(X_full, y_full):
        rez["base"].append(
            calc_results_for_fold(X_full, y_full, train_index, test_index,
                                  XGBClassifier()))
        rez["studyf"].append(
            calc_results_for_fold(X_full_wf, y_full, train_index, test_index,
                                  XGBClassifier()))
        # The objective must be passed by keyword: xgboost documents the
        # XGBClassifier constructor parameters as keyword-only, so a bare
        # positional string is rejected or bound to a different parameter
        # depending on the installed release.
        rez["studyf_l"].append(
            calc_results_for_fold(X_full_wf, y_full, train_index, test_index,
                                  XGBClassifier(objective="binary:logistic")))
        rez["only_study"].append(
            calc_results_for_fold(Xs, y_full, train_index, test_index,
                                  XGBClassifier()))

        # Progress output: the newest result of every configuration.
        for order in print_order:
            print(order, " " * (max_len_order - len(order)), ": ",
                  list_to_4g_str(rez[order][-1]))
        print("")

    for order in print_order:
        print("==> ", order, " " * (max_len_order - len(order)), ": ",
              list2d_to_4g_str_pm(rez[order]))
def print_results(dataset, set1, set2):
    """Measure how well XGBoost separates the ``set1`` and ``set2`` patients.

    Prints both group sizes and the majority-class baseline, then runs
    stratified 5-fold cross-validation repeated 10 times. Rows selected by
    ``set1`` are labelled 0 and rows selected by ``set2`` are labelled 1; a
    fresh ``XGBClassifier`` is scored on every fold with
    ``calc_results_simple``. The aggregated results are printed via
    ``list2d_to_4g_str_pm``.

    Args:
        dataset: DataFrame with a ``patient_ID`` column.
        set1: Patient IDs forming the first group (label 0).
        set2: Patient IDs forming the second group (label 1).
    """
    n_first, n_second = len(set1), len(set2)
    bias_acc = max(n_first, n_second) / (n_first + n_second)
    print(n_first, n_second)
    print(f"bias_accuracy: {bias_acc : .3f}")

    # Only the feature matrices are kept; the label is group membership.
    first_rows = dataset.loc[dataset['patient_ID'].isin(set1)]
    first_X, _ = prepare_full_dataset(first_rows)
    second_rows = dataset.loc[dataset['patient_ID'].isin(set2)]
    second_X, _ = prepare_full_dataset(second_rows)

    # NOTE(review): label lengths come from the ID lists, not from the rows
    # actually selected -- this assumes every ID matches exactly one row.
    labels = np.concatenate([np.zeros(n_first), np.ones(n_second)])
    features = np.concatenate([first_X, second_X])

    splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)
    fold_scores = []
    for train_idx, test_idx in splitter.split(features, labels):
        fold_scores.append(
            calc_results_simple(features[train_idx], features[test_idx],
                                labels[train_idx], labels[test_idx],
                                XGBClassifier()))

    print(list2d_to_4g_str_pm(fold_scores))
def print_results(dataset, set1, set2):
    """Separate ``set1`` from ``set2`` patients and dump the top features.

    Prints both group sizes and the majority-class baseline, then runs one
    pass of stratified 5-fold cross-validation. Rows selected by ``set1`` are
    labelled 0 and rows selected by ``set2`` are labelled 1. On every fold an
    ``XGBClassifier`` is scored with ``calc_results_simple`` and then, for
    the 20 most important features, the function prints their indices, their
    negated importances, and their column names. The aggregated fold results
    are printed last via ``list2d_to_4g_str_pm``.

    Args:
        dataset: DataFrame with a ``patient_ID`` column plus the
            ``study``, ``pCR``, ``RFS``, ``DFS`` and ``posOutcome`` columns.
        set1: Patient IDs forming the first group (label 0).
        set2: Patient IDs forming the second group (label 1).

    Raises:
        KeyError: If ``dataset`` lacks one of the non-feature columns listed
            above (raised by ``DataFrame.drop`` before any fold is trained).
    """
    bias_acc = max(len(set1), len(set2)) / (len(set1) + len(set2))
    print(len(set1), len(set2))
    print(f"bias_accuracy: {bias_acc : .3f}")

    X_set1, _ = prepare_full_dataset(
        dataset.loc[dataset['patient_ID'].isin(set1)])
    X_set2, _ = prepare_full_dataset(
        dataset.loc[dataset['patient_ID'].isin(set2)])
    y_set1 = np.zeros(len(set1))
    y_set2 = np.ones(len(set2))

    X_full = np.concatenate([X_set1, X_set2])
    y_full = np.concatenate([y_set1, y_set2])

    # Column names of the feature matrix. This is loop-invariant, so it is
    # built once instead of once per printed feature.
    # NOTE(review): assumes prepare_full_dataset keeps the remaining columns
    # in this order -- confirm.
    feature_names = list(
        dataset.drop(columns=[
            'study', 'patient_ID', 'pCR', 'RFS', 'DFS', 'posOutcome'
        ]))

    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=1)
    rez = []
    for train_index, test_index in kf.split(X_full, y_full):
        X_train, X_test = X_full[train_index], X_full[test_index]
        y_train, y_test = y_full[train_index], y_full[test_index]

        clf = XGBClassifier()
        rez.append(calc_results_simple(X_train, X_test, y_train, y_test, clf))
        # NOTE(review): calc_results_simple may already fit clf; the explicit
        # fit is kept so feature_importances_ is guaranteed to exist.
        clf.fit(X_train, y_train)

        # Negate so that an ascending sort ranks the most important first.
        neg_importances = -1 * clf.feature_importances_
        top_indices = np.argsort(neg_importances)[:20]
        print(top_indices)
        print(np.sort(neg_importances)[:20])
        for feature_index in top_indices:
            print(feature_names[feature_index])

    print(list2d_to_4g_str_pm(rez))
def _print_results_set1set2(X_set1, y_set1, X_set2, y_set2):
    """Compare training on set 1 alone against training on both sets stacked.

    Both feature matrices are passed through ``add_one_features`` (tag 0 for
    set 1, tag 1 for set 2) and concatenated into the "stack" data. Stratified
    5-fold cross-validation repeated 10 times is run first on the stacked
    data and then on set 1 alone, scoring a fresh ``XGBClassifier`` per fold
    with ``calc_results_for_fold``. Aggregated results are printed with set 1
    first, then the stack.

    Args:
        X_set1: Feature matrix of the first set.
        y_set1: Targets of the first set.
        X_set2: Feature matrix of the second set.
        y_set2: Targets of the second set.
    """
    tagged_first = add_one_features(X_set1, 0)
    tagged_second = add_one_features(X_set2, 1)
    stacked_X = np.concatenate([tagged_first, tagged_second])
    stacked_y = np.concatenate([y_set1, y_set2])

    splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)

    print_order = ["set1", "stack"]
    width = max(len(label) for label in print_order)

    rez = defaultdict(list)
    # Evaluation order (stack, then set1) differs from print order on purpose:
    # it is kept as in the original so the random stream is consumed the same.
    for label, features, targets in (("stack", stacked_X, stacked_y),
                                     ("set1", X_set1, y_set1)):
        for train_idx, test_idx in splitter.split(features, targets):
            rez[label].append(
                calc_results_for_fold(features, targets, train_idx, test_idx,
                                      XGBClassifier()))

    for label in print_order:
        padding = " " * (width - len(label))
        print("==> ", label, padding, ": ", list2d_to_4g_str_pm(rez[label]))
def print_results(dataset, set1, set2):
    """Compare five configurations on upsample-balanced cross-validation folds.

    Rows whose ``patient_ID`` is in ``set1`` / ``set2`` are prepared with
    ``prepare_full_dataset``, passed through ``add_one_features`` with the
    tag 0 / 1, and stacked. The code treats column 0 of the stacked matrix as
    the study feature and the remaining columns as the gene features.

    Stratified 5-fold cross-validation repeated 10 times is run. On every
    fold both the train and the test part are balanced with
    ``random_upsample_balance`` (counts are printed before and after), and
    these configurations are scored with ``calc_results_simple``:

    * ``genes``        -- columns 1.., ``XGBClassifier``
    * ``genes_set``    -- all columns, ``XGBClassifier``
    * ``genes_biased`` -- all columns, ``BiasedXgboost``
    * ``genes_double`` -- all columns, ``DoubleXgboost``
    * ``study``        -- column 0 only, ``XGBClassifier``

    A line per configuration is printed after each fold, followed by an
    aggregated summary at the end.

    Args:
        dataset: DataFrame with a ``patient_ID`` column.
        set1: Patient IDs forming the first group (tag 0).
        set2: Patient IDs forming the second group (tag 1).
    """
    first_rows = dataset.loc[dataset['patient_ID'].isin(set1)]
    X_first, y_first = prepare_full_dataset(first_rows)
    second_rows = dataset.loc[dataset['patient_ID'].isin(set2)]
    X_second, y_second = prepare_full_dataset(second_rows)

    # Debug aid kept from the original (replace features with noise):
    # X_first = np.random.rand(*X_first.shape)
    # X_second = np.random.rand(*X_second.shape)

    tagged_first = add_one_features(X_first, 0)
    tagged_second = add_one_features(X_second, 1)
    tagged = np.concatenate([tagged_first, tagged_second])
    targets = np.concatenate([y_first, y_second])

    splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)

    print_order = [
        "genes", "genes_set", "genes_biased", "genes_double", "study"
    ]
    width = max(len(label) for label in print_order)

    rez = defaultdict(list)
    for train_idx, test_idx in splitter.split(tagged, targets):
        train_X, test_X = tagged[train_idx], tagged[test_idx]
        train_y, test_y = targets[train_idx], targets[test_idx]

        print("before balanced")
        print_count_two_sets(train_X[:, 0], train_y)
        print_count_two_sets(test_X[:, 0], test_y)

        # Train and test parts are balanced independently of each other.
        train_X, train_y = random_upsample_balance(train_X, train_y)
        test_X, test_y = random_upsample_balance(test_X, test_y)

        print("after balanced")
        print_count_two_sets(train_X[:, 0], train_y)
        print_count_two_sets(test_X[:, 0], test_y)

        genes_train, genes_test = train_X[:, 1:], test_X[:, 1:]
        study_train, study_test = train_X[:, :1], test_X[:, :1]

        # (label, train matrix, test matrix, model factory), evaluation order.
        experiments = (
            ("genes", genes_train, genes_test, XGBClassifier),
            ("genes_set", train_X, test_X, XGBClassifier),
            ("genes_biased", train_X, test_X, BiasedXgboost),
            ("genes_double", train_X, test_X, DoubleXgboost),
            ("study", study_train, study_test, XGBClassifier),
        )
        for label, fit_X, eval_X, make_model in experiments:
            rez[label].append(
                calc_results_simple(fit_X, eval_X, train_y, test_y,
                                    make_model()))

        # Progress output: the newest result of every configuration.
        for label in print_order:
            print(label, " " * (width - len(label)), ": ",
                  list_to_4g_str(rez[label][-1]))
        print("")

    for label in print_order:
        print("==> ", label, " " * (width - len(label)), ": ",
              list2d_to_4g_str_pm(rez[label]))