import sys
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

# drop_na, drop_trea, get_balanced_split, calc_results_simple, list_to_4g_str,
# list2d_to_4g_str_pm and prepare_full_dataset are project helpers defined elsewhere.


def print_results_for_field(dataset, field):
    dataset = drop_na(dataset, field)
    notrea_dataset = drop_trea(dataset)
    print_order = ["full_xgboost", "full_logi", "notrea_xgboost", "notrea_logi"]
    max_len_order = max(map(len, print_order))
    rez = defaultdict(list)
    for i in range(10):
        print("run ", i)
        # full dataset: XGBoost and logistic regression on a balanced split
        ds = get_balanced_split(dataset)
        rez["full_xgboost"].append(calc_results_simple(*ds, XGBClassifier()))
        rez["full_logi"].append(
            calc_results_simple(*ds, LogisticRegression(max_iter=1000)))
        # same models on the dataset with treatment columns dropped
        ds = get_balanced_split(notrea_dataset)
        rez["notrea_xgboost"].append(calc_results_simple(*ds, XGBClassifier()))
        rez["notrea_logi"].append(
            calc_results_simple(*ds, LogisticRegression(max_iter=1000)))
        # per-run scores
        for order in print_order:
            print(order, " " * (max_len_order - len(order)), ": ",
                  list_to_4g_str(rez[order][-1]))
        print("")
        sys.stdout.flush()
    # summary: mean +/- deviation over all runs
    for order in print_order:
        print("==> ", order, " " * (max_len_order - len(order)), ": ",
              list2d_to_4g_str_pm(rez[order]))
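# Usage sketch (an assumption, not part of the original code): print_results_for_field
# expects a pandas DataFrame holding the outcome column plus feature and treatment
# columns; "combined_dataset.csv" is a hypothetical file name, and "posOutcome" is the
# outcome column used in the snippets below.
if __name__ == "__main__":
    combined = pd.read_csv("combined_dataset.csv")  # hypothetical input file
    print_results_for_field(combined, "posOutcome")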
# "output" below is a results file handle assumed to be opened elsewhere in the script.


def print_results_for_field(dataset, field):
    dataset = drop_na(dataset, field)
    notrea_dataset = drop_trea(dataset)
    print_order = ["full_xgboost", "notrea_xgboost"]
    max_len_order = max(map(len, print_order))
    rez = defaultdict(list)
    for i in range(20):
        # a balanced split can come back empty for this field; skip such runs
        ds = get_balanced_split(dataset, field)
        if len(ds) == 0:
            continue
        rez["full_xgboost"].append(calc_results_simple(*ds, XGBClassifier()))
        ds = get_balanced_split(notrea_dataset, field)
        if len(ds) == 0:
            continue
        rez["notrea_xgboost"].append(calc_results_simple(*ds, XGBClassifier()))
        # per-run scores
        for order in print_order:
            print(order, " " * (max_len_order - len(order)), ": ",
                  list_to_4g_str(rez[order][-1]))
        print("")
    # summary: mean +/- deviation on stdout, mean only to the output file
    for order in print_order:
        print("==> ", order, " " * (max_len_order - len(order)), ": ",
              list2d_to_4g_str_pm(rez[order]))
        output.write("==> " + str(order) + " " * (max_len_order - len(order)) +
                     ": " + str(list_to_4g_str(np.mean(rez[order], axis=0))))
        output.write("\n")
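# Sketch of the interface the calls above assume (the real helpers live elsewhere in
# the project; the name, argument order, and metric choice here are assumptions for
# illustration only): get_balanced_split is expected to return a tuple
# (X_train, X_test, y_train, y_test) that is unpacked in front of the classifier, and
# calc_results_simple to fit the classifier and return a list of scores that
# list_to_4g_str can format.
from sklearn.metrics import accuracy_score, precision_score, recall_score


def calc_results_simple_sketch(X_train, X_test, y_train, y_test, clf):
    # fit on the training half of the balanced split, score on the held-out half
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return [accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred),
            recall_score(y_test, y_pred)]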
# Merge metagx_dataset with pam_types_cat_dataset on sample_name and evaluate XGBoost
# on balanced posOutcome splits, with and without treatment columns.
pam_types_cat_dataset = pd.merge(metagx_dataset, pam_types_cat_dataset,
                                 left_on="sample_name", right_on="sample_name")
pam_types_cat_dataset = drop_na(pam_types_cat_dataset, "posOutcome")
pam_types_cat_dataset_notrea_dataset = drop_trea(pam_types_cat_dataset)

X_full, y_full = prepare_full_dataset(pam_types_cat_dataset)
X_notrea, _ = prepare_full_dataset(pam_types_cat_dataset_notrea_dataset)
kf = StratifiedKFold(n_splits=5, shuffle=True)

print_order = ["full", "full_notrea"]
max_len_order = max(map(len, print_order))
# "dataset" here is assumed to be a dataset label defined earlier (not shown in this excerpt)
print("==> posOutcome " + str(dataset))
output.write("==> posOutcome " + str(dataset) + "\n")

rez = defaultdict(list)
for i in range(20):
    ds = get_balanced_split(pam_types_cat_dataset, 'posOutcome')
    rez["full"].append(calc_results_simple(*ds, XGBClassifier()))
    ds = get_balanced_split(pam_types_cat_dataset_notrea_dataset, 'posOutcome')
    rez["full_notrea"].append(calc_results_simple(*ds, XGBClassifier()))
    # per-run scores
    for order in print_order:
        print(order, " " * (max_len_order - len(order)), ": ",
              list_to_4g_str(rez[order][-1]))
    print("")

# summary: mean +/- deviation on stdout, mean only to the output file
for order in print_order:
    print("==> ", order, " " * (max_len_order - len(order)), ": ",
          list2d_to_4g_str_pm(rez[order]))
    output.write("==> " + str(order) + " " * (max_len_order - len(order)) +
                 ": " + str(list_to_4g_str(np.mean(rez[order], axis=0))))
    output.write("\n")
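# The StratifiedKFold above (kf) is not used in this excerpt; a cross-validated
# comparison along the lines below is one plausible continuation. This is an
# assumption for illustration, not the original code; it reuses y_full for the
# no-treatment features, assuming prepare_full_dataset preserves row order.
from sklearn.model_selection import cross_val_score

cv_full = cross_val_score(XGBClassifier(), X_full, y_full, cv=kf, scoring="accuracy")
cv_notrea = cross_val_score(XGBClassifier(), X_notrea, y_full, cv=kf, scoring="accuracy")
print("full   CV accuracy: %.4g +/- %.4g" % (cv_full.mean(), cv_full.std()))
print("notrea CV accuracy: %.4g +/- %.4g" % (cv_notrea.mean(), cv_notrea.std()))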