def print_results_for_field(dataset, field):
    """Repeatedly evaluate two classifiers on two variants of ``dataset``.

    The dataset is first passed through ``drop_na(dataset, field)``; a second
    variant is derived from it with ``drop_trea``. For each of 10 runs a
    balanced split is drawn from each variant, and both an ``XGBClassifier``
    and a ``LogisticRegression(max_iter=1000)`` are scored on that split via
    ``calc_results_simple``. Per-run scores are printed after every run and an
    aggregate line per configuration is printed at the end.

    Args:
        dataset: Input data; only handed to ``drop_na`` / ``drop_trea`` /
            ``get_balanced_split`` here.
        field: Forwarded to ``drop_na`` only.
            NOTE(review): ``get_balanced_split`` is called without ``field``
            in this function - confirm that is intended.

    Returns:
        None. Results are written to stdout.
    """
    cleaned = drop_na(dataset, field)
    cleaned_notrea = drop_trea(cleaned)

    print_order = [
        "full_xgboost", "full_logi", "notrea_xgboost", "notrea_logi"
    ]
    # Pad every label to the width of the longest one so the columns line up.
    label_width = max(len(label) for label in print_order)
    pad = {label: " " * (label_width - len(label)) for label in print_order}

    # (xgboost key, logistic key, data) - evaluated in this order every run.
    variants = (
        ("full_xgboost", "full_logi", cleaned),
        ("notrea_xgboost", "notrea_logi", cleaned_notrea),
    )

    scores = {label: [] for label in print_order}
    for run_idx in range(10):
        print("run ", run_idx)
        for xgb_key, logi_key, data in variants:
            # Both models of a variant are scored on the same split.
            split = get_balanced_split(data)
            scores[xgb_key].append(
                calc_results_simple(*split, XGBClassifier()))
            scores[logi_key].append(
                calc_results_simple(*split,
                                    LogisticRegression(max_iter=1000)))

        # Show the scores of the run that just finished.
        for label in print_order:
            latest = scores[label][-1]
            print(label, pad[label], ": ", list_to_4g_str(latest))
        print("")
        sys.stdout.flush()

    # Aggregate over all runs, one line per configuration.
    for label in print_order:
        summary = list2d_to_4g_str_pm(scores[label])
        print("==> ", label, pad[label], ": ", summary)
# Example #2
# 0
def print_results_for_field(dataset, field):
    dataset = drop_na(dataset, field)
    dataset_notrea_dataset = drop_trea(dataset)

    print_order = ["full_xgboost", "notrea_xgboost"]
    max_len_order = max(map(len, print_order))
    rez = defaultdict(list)
    for i in range(20):
        ds = get_balanced_split(dataset, field)
        if (len(ds) == 0):
            continue
        rez["full_xgboost"].append(calc_results_simple(*ds, XGBClassifier()))

        ds = get_balanced_split(dataset_notrea_dataset, field)
        if (len(ds) == 0):
            continue
        rez["notrea_xgboost"].append(calc_results_simple(*ds, XGBClassifier()))

        for order in print_order:
            print(order, " " * (max_len_order - len(order)), ": ",
                  list_to_4g_str(rez[order][-1]))
        print("")

    for order in print_order:
        print("==> ", order, " " * (max_len_order - len(order)), ": ",
              list2d_to_4g_str_pm(rez[order]))
        output.write("==> " + str(order) + " " * (max_len_order - len(order)) +
                     ": " + str(list_to_4g_str(np.mean(rez[order], axis=0))))
        output.write("\n")
    pam_types_cat_dataset = pd.merge(metagx_dataset,
                                     pam_types_cat_dataset,
                                     left_on="sample_name",
                                     right_on="sample_name")
    pam_types_cat_dataset = drop_na(pam_types_cat_dataset, "posOutcome")
    pam_types_cat_dataset_notrea_dataset = drop_trea(pam_types_cat_dataset)
    X_full, y_full = prepare_full_dataset(pam_types_cat_dataset)
    X_notrea, _ = prepare_full_dataset(pam_types_cat_dataset_notrea_dataset)

    kf = StratifiedKFold(n_splits=5, shuffle=True)
    max_len_order = max(map(len, print_order))
    print("==> posOutcome " + str(dataset))
    output.write("==> posOutcome " + str(dataset) + "\n")
    rez = defaultdict(list)
    for i in range(20):
        ds = get_balanced_split(pam_types_cat_dataset, 'posOutcome')
        rez["full"].append(calc_results_simple(*ds, XGBClassifier()))

        ds = get_balanced_split(pam_types_cat_dataset_notrea_dataset,
                                'posOutcome')
        rez["full_notrea"].append(calc_results_simple(*ds, XGBClassifier()))

        for order in print_order:
            print(order, " " * (max_len_order - len(order)), ": ",
                  list_to_4g_str(rez[order][-1]))
        print("")

    for order in print_order:
        print("==> ", order, " " * (max_len_order - len(order)), ": ",
              list2d_to_4g_str_pm(rez[order]))
        output.write("==> " + str(order) + " " * (max_len_order - len(order)) +