Example #1
def main():

    target = 'v2'
    # training parameters
    k = 10  # fold
    layer_thresh = 2
    T = 50
    threshes_path = 'data/spambase.threshes'

    # load and preprocess training data
    training_data = loader.load_dataset('data/spambase.data')

    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    k_folds = Preprocess.prepare_k_folds(training_data, k)
    tr_data, te_data = Preprocess.get_i_fold(k_folds, 0)
    f_cur = [x[0] for x in tr_data[0]]  # values of the first feature across all training rows

    t = dt.DecisionTree()
    if target == 'v1':
        for i in range(100):  # repeat the v1 computation (e.g. to compare its runtime against v2)
            h_y = t.compute_entropy(tr_data[1])
            thresh = threshes[0][30]
            ig = t.compute_ig(f_cur, tr_data[1], thresh, h_y)
    else:
        h_y = t.compute_entropy_v2(tr_data[1])
        thresh = threshes[0][0]
        ig = t.compute_ig_v2(f_cur, tr_data[1], thresh, h_y)
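
The dt.DecisionTree helpers called above are project code that this page does not show. As a rough sketch, assuming binary labels and a single-threshold split, compute_entropy and compute_ig plausibly reduce to the following (the bodies are an assumption, not the repo's implementation):

import numpy as np

def compute_entropy(labels):
    # Shannon entropy H(Y) over the empirical label distribution
    labels = np.asarray(labels)
    if labels.size == 0:
        return 0.0
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return float(-np.sum(p * np.log2(p)))

def compute_ig(feature, labels, thresh, h_y):
    # information gain IG = H(Y) - H(Y | split at thresh)
    feature, labels = np.asarray(feature), np.asarray(labels)
    left, right = labels[feature <= thresh], labels[feature > thresh]
    h_cond = (left.size * compute_entropy(left)
              + right.size * compute_entropy(right)) / labels.size
    return h_y - h_cond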
Example #2
def main():
    # training parameters
    is_sklearn = True
    k = 10  # fold
    result_path = 'results/PB2_spam.acc'
    model_name = 'spam_' + str(k) + 'fold'
    data_path = 'data/spam/data.pickle'

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # TODO convert labels from {0, 1} to {-1, 1}
    # util.replace_zero_label_with_neg_one(training_data)

    # Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])
    # training_data[0] = preprocessing.scale(training_data[0])


    # start training
    training_errs = []
    testing_errs = []
    print('Preparing k fold data.')
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for i in (0,):  # only the first fold
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)

        # start training
        print('{:.2f} Start training.'.format(time.time() - st))
        kernel = c.EUCLIDEAN
        # kernel = c.GAUSSIAN
        f_select = True
        best_features_num = 5
        clf = kNN.kNN(kernel=kernel)
        clf.fit(tr_data[0], tr_data[1], f_select=f_select, best_f=best_features_num)
        print("Best features: {}".format(clf.best_f_indices))
        for kk in (1, 2, 3, 7):
            tr_pred = clf.predict(tr_data[0], k=kk)
            te_pred = clf.predict(te_data[0], k=kk)

            tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
            te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]

            print('{:.2f} Final results with kernel {}, k={}. Train acc: {}, Test acc: {}'.format(time.time() - st, kernel, kk, tr_acc, te_acc))
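
The kNN.kNN class and the c.EUCLIDEAN constant are repo code not shown on this page. A minimal sketch of the predict path this loop exercises, assuming a plain Euclidean-distance, majority-vote k-NN (the feature-selection step is omitted):

import numpy as np
from collections import Counter

class SimpleKNN:
    def fit(self, x, y):
        self.x = np.asarray(x, dtype=float)
        self.y = np.asarray(y)

    def predict(self, x_new, k=1):
        preds = []
        for row in np.asarray(x_new, dtype=float):
            dists = np.linalg.norm(self.x - row, axis=1)  # Euclidean distances to all training rows
            nearest = np.argsort(dists)[:k]               # indices of the k closest rows
            preds.append(Counter(self.y[nearest].tolist()).most_common(1)[0][0])
        return np.array(preds)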
Example #3
def main():
    # training parameters
    k = 8  # fold
    result_path = 'results/PB2_spam.acc'
    model_name = 'spam_' + str(k) + 'fold'
    data_path = 'data/spam/data.pickle'

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # TODO convert labels from {0, 1} to {-1, 1}
    # util.replace_zero_label_with_neg_one(training_data)

    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])
    # Preprocess.normalize_features_all(Preprocess.shifiat_and_scale, training_data[0])


    # start training
    training_accs = []
    testing_accs = []
    print('Preparing k fold data.')
    k_folds = Preprocess.prepare_k_folds(training_data, k)
    kernel = c.EUCLIDEAN
    sst = time.time()
    for i in (1,):  # only the second fold
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)

        # start training
        print('{:.2f} Start training.'.format(time.time() - st))
        for r in (2.5, 2.7):
            clf = kNN.kNN(kernel=kernel)
            # clf.fit(training_data[0], training_data[1])
            clf.fit(tr_data[0], tr_data[1])
            # tr_pred = clf.predict(training_data[0], r=r)
            tr_pred = clf.predict(tr_data[0], r=r)
            te_pred = clf.predict(te_data[0], r=r)

            # tr_acc = (training_data[1] == tr_pred).sum() / training_data[0].shape[0]
            tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
            te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]

            testing_accs.append(te_acc)
            print('{:.2f} fold {} results with kernel {}, r={}. Train acc: {}, Test acc: {}'.format(time.time() - st, i, kernel, r, tr_acc, te_acc))
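
Unlike Example #2, predict is called here with a radius r rather than a neighbor count, which suggests a fixed-window variant: vote over every training point within distance r. A sketch under that assumption (the fallback label for an empty window is a guess):

import numpy as np
from collections import Counter

def predict_within_radius(train_x, train_y, x_new, r, default=0):
    train_x = np.asarray(train_x, dtype=float)
    train_y = np.asarray(train_y)
    preds = []
    for row in np.asarray(x_new, dtype=float):
        votes = train_y[np.linalg.norm(train_x - row, axis=1) <= r]
        # majority vote inside the window; fall back when no point lies within r
        preds.append(Counter(votes.tolist()).most_common(1)[0][0] if votes.size else default)
    return np.array(preds)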
Example #4
def main():
    # training parameters
    k = 10  # fold
    result_path = "results/PB1_A_spam.acc"
    model_name = "spam_" + str(k) + "fold"
    threshes_path = "data/spambase.threshes"
    data_path = "data/spam/data.pickle"
    # kernel = 'poly'
    kernel = "linear"
    # kernel = 'rbf'
    verbose = False
    tol = 0.01
    c = 0.1

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # convert labels from {0, 1} to {-1, 1}
    util.replace_zero_label_with_neg_one(training_data)

    # normalize
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])

    print("Preparing k fold data.")
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for i in range(1):  # only the first fold
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)

        # start training
        print("{:3f} Start training. Kernel: {}".format(time.time() - st, kernel))

        clf = svm.SVC(C=c, kernel=kernel, tol=tol, verbose=verbose)
        # clf = svm.NuSVC(kernel=kernel, tol=tol, verbose=verbose)
        clf.fit(tr_data[0], tr_data[1])
        tr_pred = clf.predict(tr_data[0])
        te_pred = clf.predict(te_data[0])

        tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
        te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]

        print("{:3f} Final results. Train acc: {}, Test acc: {}".format(time.time() - st, tr_acc, te_acc))
Example #5
def main():
    # training parameters
    round_limit = 50
    result_path = 'results/spamActive_random_final_1.acc'
    model_name = 'spam_active'
    threshes_path = 'data/spambase.threshes'

    # load and preprocess training data
    training_data = loader.load_dataset('data/spambase.data')
    # convert labels from {0, 1} to {-1, 1}
    util.replace_zero_label_with_neg_one(training_data)

    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    training_errs = []
    testing_errs = []
    # round_err_1st_boost = None
    # tr_errs_1st_boost = None
    # te_errs_1st_boost = None
    # te_auc_1st_boost = None
    roc = []
    auc = 0.0
    k_folds = Preprocess.prepare_k_folds(training_data, 5)
    tr_data_pool, te_data = Preprocess.get_i_fold(k_folds, 1)
    data_set = DataSet.DataSet(tr_data_pool)
    data_rates = (5, 10, 15, 20, 30, 50)
    for c in data_rates:  # c is the percentage of the training pool actually used
        tr_data = data_set.random_pick(c, False)
        tr_n, f_d = np.shape(tr_data[0])
        te_n, = np.shape(te_data[1])
        # prepare the initial sample distribution
        d = util.init_distribution(len(tr_data[0]))
        # precompute the thresholds cheat sheet
        thresh_cs = util.pre_compute_threshes(tr_data[0], tr_data[1], threshes)
        boost = b.Boosting(d)
        testing_predict = [0.0] * te_n    # per-sample accumulated boosting scores
        training_predict = [0.0] * tr_n
        round_tr_err = []
        round_te_err = []
        round_model_err = []
        round_te_auc = []
        converged = False
        tol = 1e-5
        te_auc = 2.
        round = 0
        while round < round_limit: # and not converged:
            round += 1
            boost.add_model(ds.DecisionStump, tr_data[0], tr_data[1], threshes, thresh_cs)
            boost.update_predict(tr_data[0], training_predict)
            boost.update_predict(te_data[0], testing_predict)
            c_model_err = boost.model[-1].w_err
            round_model_err.append(c_model_err)
            c_f_ind = boost.model[-1].f_ind
            c_thresh = boost.model[-1].thresh
            c_tr_err = util.get_err_from_predict(training_predict, tr_data[1])
            c_te_err = util.get_err_from_predict(testing_predict, te_data[1])
            # TODO calculate the AUC for testing results
            # c_te_auc = util.get_auc_from_predict(testing_predict, te_data[1])
            round_tr_err.append(c_tr_err)
            round_te_err.append(c_te_err)
            # round_te_auc.append(c_te_auc)
            print('Data {}% Round: {} Feature: {} Threshold: {:.3f} Round_err: {:.12f} Train_err: {:.12f} Test_err {:.12f} AUC {}'.format(c, round, c_f_ind, c_thresh, c_model_err, c_tr_err, c_te_err, 0))
            # converged =  abs(c_te_auc - te_auc) / te_auc <= tol
            # te_auc = c_te_auc

        training_errs.append(round_tr_err[-1])
        testing_errs.append(round_te_err[-1])
        # break      # for testing


    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)

    print('Training errs are:')
    print(training_errs)
    print('Mean training err is:')
    print(mean_training_err)
    print('Testing errs are:')
    print(testing_errs)
    print('Mean testing err is:')
    print(mean_testing_err)

    result = {}
    result['Trainingerrs'] = training_errs
    result['MeanTrainingAcc'] = mean_training_err
    result['Testingerrs'] = testing_errs
    result['MeanTestingAcc'] = mean_testing_err

    # result['ROC'] = str(roc)
    result['AUC'] = auc

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
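
b.Boosting and ds.DecisionStump are project classes, so the following is only a condensed sketch of what one add_model round presumably does: fit a stump against the current distribution, weight it by its error, and re-weight the examples. fit_stump is a hypothetical callable standing in for the stump-training step, and labels are assumed to be in {-1, +1} as after the conversion above.

import numpy as np

def boosting_round(d, features, labels, fit_stump):
    stump = fit_stump(features, labels, d)      # weak learner minimizing weighted error under d
    eps = stump.w_err                           # its weighted error
    alpha = 0.5 * np.log((1 - eps) / eps)       # AdaBoost model weight
    pred = stump.predict(features)              # stump predictions in {-1, +1}
    d = d * np.exp(-alpha * labels * pred)      # up-weight the misclassified examples
    return stump, alpha, d / d.sum()            # renormalized distribution for the next round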
Example #6
def main():
    # training parameters
    k = 10  # fold
    layer_thresh = 2
    T = 50
    result_path = 'results/spamDT_final.acc'
    model_name = 'spam_' + str(k) + 'fold'
    threshes_path = 'data/spambase.threshes'

    # load and preprocess training data
    training_data = loader.load_dataset('data/spambase.data')

    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    training_errs = []
    testing_errs = []
    roc = []
    auc = 0.0
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for i in range(1):  # only the first fold
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)
        tr_n, f_d = np.shape(tr_data[0])
        te_n, = np.shape(te_data[1])
        t = dt.DecisionTree()
        t.build(tr_data[0], tr_data[1], threshes, layer_thresh)
        # test the tree and record training/testing accuracy
        training_errs.append(t.test(tr_data[0], tr_data[1], util.acc))
        testing_errs.append(t.test(te_data[0], te_data[1], util.acc))
        print('Round {} finishes, time used: {}'.format(i, time.time() - st))


    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)

    print(str(k) + '-fold validation done. Training errs are:')
    print(training_errs)
    print('Mean training err is:')
    print(mean_training_err)
    print('Testing errs are:')
    print(testing_errs)
    print('Mean testing err is:')
    print(mean_testing_err)

    result = {}
    result['Fold'] = k
    result['Trainingerrs'] = training_errs
    result['MeanTrainingAcc'] = mean_training_err
    result['Testingerrs'] = testing_errs
    result['MeanTestingAcc'] = mean_testing_err

    result['ROC'] = roc
    result['AUC'] = auc

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
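
Every example on this page leans on Preprocess.prepare_k_folds and Preprocess.get_i_fold. A minimal sketch of that contract, assuming the folds are a shuffled k-way split with fold i held out as the test set (the actual Preprocess module may differ):

import numpy as np

def prepare_k_folds(training_data, k):
    features = np.asarray(training_data[0])
    labels = np.asarray(training_data[1])
    idx = np.random.permutation(labels.size)  # shuffle once, then split into k chunks
    return [(features[chunk], labels[chunk]) for chunk in np.array_split(idx, k)]

def get_i_fold(k_folds, i):
    te_data = k_folds[i]  # fold i is held out for testing
    tr_x = np.vstack([f for j, (f, _) in enumerate(k_folds) if j != i])
    tr_y = np.concatenate([l for j, (_, l) in enumerate(k_folds) if j != i])
    return (tr_x, tr_y), te_data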
Example #7
k = 10  # fold
result_path = 'results/spamNBBern_1.acc'
model_name = 'spam_' + str(k) + 'fold_' + 'SS'


# load and preprocess training data
training_data = loader.load_dataset('data/spambase.data')

# start training
training_accs = []
training_cms = []
testing_accs = []
testing_cms = []
roc = []
auc = 0.0
k_folds = Preprocess.prepare_k_folds(training_data, k)

means = loader.load_spam_mean('data/spam_mean')

for i in range(k):
    tr_data, te_data = Preprocess.get_i_fold(k_folds, i)


    model = m.NBBernoulli(means)
    model.build(tr_data[0], tr_data[1])

    training_test_res = model.test(tr_data[0], tr_data[1], util.compute_acc_confusion_matrix)
    training_accs.append(training_test_res[0])
    training_cms.append(training_test_res[1])
    testing_test_res = model.test(te_data[0], te_data[1], util.compute_acc_confusion_matrix)
    testing_accs.append(testing_test_res[0])
    testing_cms.append(testing_test_res[1])
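
m.NBBernoulli builds on the per-feature means loaded above; a sketch under the assumption that it binarizes each feature at its overall mean and fits Laplace-smoothed per-class Bernoulli likelihoods (the real model class may differ):

import numpy as np

class SketchNBBernoulli:
    def build(self, x, y, means):
        xb = (np.asarray(x) > means).astype(int)  # binarize each feature at its mean
        y = np.asarray(y)
        self.means = means
        self.classes = np.unique(y)
        self.prior = np.array([(y == cls).mean() for cls in self.classes])
        # Laplace-smoothed P(feature = 1 | class)
        self.p1 = np.array([(xb[y == cls].sum(axis=0) + 1) / ((y == cls).sum() + 2)
                            for cls in self.classes])

    def predict(self, x):
        xb = (np.asarray(x) > self.means).astype(int)
        log_lik = xb @ np.log(self.p1).T + (1 - xb) @ np.log(1 - self.p1).T
        return self.classes[np.argmax(log_lik + np.log(self.prior), axis=1)]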
Example #8
def main():
    # training parameters
    target = 'crx'
    # target = 'vote'
    k = 10  # fold
    round_limit = 150

    if target == 'crx':
        result_path = 'results/crxBoosting_final_1.acc'
        model_name = 'crx_' + str(k) + 'fold'
        threshes_path = 'data/crx.threshes'
        data_path = 'data/crx_parsed.data'
    else:
        result_path = 'results/voteBoosting_final.acc'
        model_name = 'vote_' + str(k) + 'fold'
        threshes_path = 'data/vote.threshes'
        data_path = 'data/vote_parsed.data'

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)

    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    training_errs = []
    testing_errs = []
    round_err_1st_boost = None
    tr_errs_1st_boost = None
    te_errs_1st_boost = None
    te_auc_1st_boost = None
    roc = []
    auc = 0.0
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for i in range(k):
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)
        tr_n, f_d = np.shape(tr_data[0])
        te_n, = np.shape(te_data[1])
        # prepare the initial sample distribution
        d = util.init_distribution(len(tr_data[0]))
        # precompute the thresholds cheat sheet
        thresh_cs = util.pre_compute_threshes_uci(tr_data[0], tr_data[1], threshes)
        boost = b.Boosting(d)
        testing_predict = [0.0] * te_n    # per-sample accumulated boosting scores
        training_predict = [0.0] * tr_n
        round_tr_err = []
        round_te_err = []
        round_model_err = []
        round_te_auc = []
        converged = False
        tol = 1e-5
        te_auc = 2.
        round = 0
        while round < round_limit: # and not converged:
            round += 1
            boost.add_model(ds.DecisionStump, tr_data[0], tr_data[1], threshes, thresh_cs)
            boost.update_predict(tr_data[0], training_predict)
            boost.update_predict(te_data[0], testing_predict)
            c_model_err = boost.model[-1].w_err
            round_model_err.append(c_model_err)
            c_f_ind = boost.model[-1].f_ind
            c_thresh = boost.model[-1].thresh
            c_tr_err = util.get_err_from_predict(training_predict, tr_data[1])
            c_te_err = util.get_err_from_predict(testing_predict, te_data[1])
            # TODO calculate the AUC for testing results
            # c_te_auc = util.get_auc_from_predict(testing_predict, te_data[1])
            # round_tr_err.append(c_tr_err)
            # round_te_err.append(c_te_err)
            # round_te_auc.append(c_te_auc)
            print('Round: {} Feature: {} Threshold: {} Round_err: {:.12f} Train_err: {:.12f} Test_err {:.12f}'.format(round, c_f_ind, c_thresh, c_model_err, c_tr_err, c_te_err))
            # converged =  abs(c_te_auc - te_auc) / te_auc <= tol
            # te_auc = c_te_auc

        training_errs.append(c_tr_err)
        testing_errs.append(c_te_err)
        # if i == 0:
        #     round_err_1st_boost = round_model_err
        #     tr_errs_1st_boost = round_tr_err
        #     te_errs_1st_boost = round_te_err
            # te_auc_1st_boost = round_te_auc

        # break      # for testing


    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)

    print(str(k) + '-fold validation done. Training errs are:')
    print(training_errs)
    print('Mean training err is:')
    print(mean_training_err)
    print('Testing errs are:')
    print(testing_errs)
    print('Mean testing err is:')
    print(mean_testing_err)

    result = {}
    result['Fold'] = str(k)
    result['Trainingerrs'] = str(training_errs)
    result['MeanTrainingAcc'] = str(mean_training_err)
    result['Testingerrs'] = str(testing_errs)
    result['MeanTestingAcc'] = str(mean_testing_err)
    result['1stBoostTrainingError'] = str(tr_errs_1st_boost)
    result['1stBoostTestingError'] = str(te_errs_1st_boost)
    result['1stBoostModelError'] = str(round_err_1st_boost)
    result['1stBoostTestingAUC'] = str(te_auc_1st_boost)

    # result['ROC'] = str(roc)
    result['AUC'] = str(auc)

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result)
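
The per-round errors logged above come from util.get_err_from_predict. Given that training_predict and testing_predict accumulate real-valued boosting scores, it is presumably just the misclassification rate of the score's sign (an assumption about repo code):

import numpy as np

def get_err_from_predict(predict_scores, labels):
    signs = np.sign(predict_scores)  # boosting scores -> {-1, 0, +1}; a zero score matches neither label
    return float(np.mean(signs != np.asarray(labels)))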
Example #9
def main():

    # centers = [i for i in range(3, 15)]
    #
    # for c in centers:
    #     k_means(c)
    st = time.time()

    # load data
    print('{:.2f} Loading match data...'.format(time.time() - st))
    match_dict = util.load_pickle_file(perfect_match_path)


    print('Loading player dict...')
    player_dict = util.load_pickle(player_dict_path)

    print('Loading champion tags...')
    champ_tags = util.load_pickle(champ_tags_path)
    champ_tags_list = list(champ_tags[0])
    champ_tags_dict = champ_tags[1]

    # get data from the dict
    print('Getting features from the dict...')
    # each player feature will have six elements (one for each champ tag)
    # ORDER: Tank, Marksman, Support, Fighter, Mage, Assassin
    n_p_feature = 44
    player_feature_dict_pre = {}
    col_to_del = [1, 34, 35]  # feature columns dropped from each player's raw stats
    for pid in player_dict:
        player_feature_dict_pre[pid] = {c.MATCH_COUNT : np.zeros((6,)), c.FEATURES : []}
        for i in range(6):
            player_feature_dict_pre[pid][c.FEATURES].append(np.zeros((n_p_feature,)))
        for cid in player_dict[pid]:
            for t in champ_tags_dict[cid]:
                cur_f = np.delete(player_dict[pid][cid][c.FEATURES], col_to_del)
                player_feature_dict_pre[pid][c.FEATURES][champ_tags_list.index(t)] += cur_f
                player_feature_dict_pre[pid][c.MATCH_COUNT][champ_tags_list.index(t)] += player_dict[pid][cid][c.MATCH_COUNT]
        for i, f in enumerate(player_feature_dict_pre[pid][c.FEATURES]):
            cur_m_count = player_feature_dict_pre[pid][c.MATCH_COUNT][i]
            f /= (cur_m_count if cur_m_count > 0 else 1)

    # player_features_id = np.array([np.append(player_dict[pid][c.FEATURES], pid) for pid in player_dict])  # last column is pid
    # player_features = player_features_id[:, 0 : -1]

    # n = len(player_features)
    #
    #
    # # 2D embedding of the digits dataset
    # print("Computing embedding")
    # player_features = manifold.SpectralEmbedding(n_components=2).fit_transform(player_features)
    # print("Done.")

    # construct new features as a team play style (currently a simple aggregation of all the players' play style)
    print('{:.2f} Constructing new dataset...'.format(time.time() - st))
    n_feature = n_p_feature * len(champ_tags_list)
    features = []
    label = []
    flip = False    # flag to alternate win/lose ordering each match (toggled below but currently unused; the ordering is randomized instead)
    for mid, m in match_dict.items():
        win_f = np.zeros((n_feature,))
        loss_f = np.zeros((n_feature,))
        team_f = [win_f, loss_f]
        for t_ind, team in enumerate(m):
            ct_count = np.zeros((6,))  # counts for each champion tag
            for ind, pid in enumerate(team[c.TEAM_INFO_PLAYERS]):
                champ_id = team[c.TEAM_INFO_CHAMPIONS][ind]
                champ_tags = champ_tags_dict[champ_id]
                for ct in champ_tags:
                    ct_ind = champ_tags_list.index(ct)
                    ct_count[ct_ind] += 1
                    start_col = 0 + ct_ind * n_p_feature
                    end_col = (ct_ind + 1) * n_p_feature
                    cur_pf = player_feature_dict_pre[pid][c.FEATURES][ct_ind]
                    # print("ct: {}, ct_ind: {}, start_col: {}, end_col: {}".format(ct, ct_ind, start_col, end_col))
                    # print(team_f[t_ind][start_col:end_col])
                    # print(cur_pf)
                    team_f[t_ind][start_col:end_col] += cur_pf
            for ctc_ind, ctc in enumerate(ct_count):
                start_col = 0 + ctc_ind * n_p_feature
                end_col = (ctc_ind + 1) * n_p_feature
                if ctc > 1:
                    team_f[t_ind][start_col:end_col] /= ctc
                elif ctc == 0:
                    for pid in team[c.TEAM_INFO_PLAYERS]:
                        team_f[t_ind][start_col:end_col] += player_feature_dict_pre[pid][c.FEATURES][ctc_ind]
                    team_f[t_ind][start_col:end_col] /= 5

        if np.random.random_sample() >= 0.5:
            features.append(np.append(loss_f, win_f))
            # features.append(loss_f - win_f)
            label.append(-1)
        else:
            features.append(np.append(win_f, loss_f))
            # features.append(win_f - loss_f)
            label.append(1)
        flip = not flip  # toggle the flag (currently unused)

    features = np.array(features)
    label = np.array(label)

    # features = normalize(features)

    # prepare training and testing set
    print('{:.2f} Start training...'.format(time.time() - st))
    k = 9
    k_folds = Preprocess.prepare_k_folds([features, label], k)

    for i in range(k):
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)
        tr_n, f_d = np.shape(tr_data[0])
        te_n, = np.shape(te_data[1])

        # train with some algorithm

        # clf1 = LogisticRegression(random_state=123)  # 0.57
        cc = 0.01
        kernel = 'rbf'
        tol = 0.01
        # clf1 = svm.SVC(C=cc, kernel=kernel, tol=tol)  # rbf: 0.5,
        # clf1 = KNeighborsClassifier(n_neighbors=4)  # 3: 0.55, 4: 0.53

        clf1 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                  algorithm="SAMME",
                                  n_estimators=200)

        clf1.fit(tr_data[0], tr_data[1])
        tr_pred1 = clf1.predict(tr_data[0])
        te_pred1 = clf1.predict(te_data[0])

        # NN
        # net = buildNetwork(, 3, 1)

        tr_acc = (tr_pred1 == tr_data[1]).sum() / tr_n
        te_acc = (te_pred1 == te_data[1]).sum() / te_n
        print('Training acc: {}, Testing acc: {}'.format(tr_acc, te_acc))
Example #10
def main():
    # training parameters
    k = 10  # fold
    round_limit = 100
    result_path = 'results/spamODSBoosting_final.acc'
    model_name = 'spam_' + str(k) + 'fold'
    threshes_path = 'data/spambase.threshes'

    # load and preprocess training data
    training_data = loader.load_dataset('data/spambase.data')
    # convert labels from {0, 1} to {-1, 1}
    util.replace_zero_label_with_neg_one(training_data)

    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    training_errs = []
    testing_errs = []
    round_err_1st_boost = None
    tr_errs_1st_boost = None
    te_errs_1st_boost = None
    te_auc_1st_boost = None
    te_roc_1st_boost = None
    roc = []
    auc = 0.0
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for i in range(1):  # only the first fold
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)
        tr_n, f_d = np.shape(tr_data[0])
        te_n, = np.shape(te_data[1])
        # prepare the initial sample distribution
        d = util.init_distribution(len(tr_data[0]))
        # precompute the thresholds cheat sheet
        thresh_cs = util.pre_compute_threshes(tr_data[0], tr_data[1], threshes)
        boost = b.Boosting(d)
        testing_predict = [0.0] * te_n    # per-sample accumulated boosting scores
        training_predict = [0.0] * tr_n
        round_tr_err = []
        round_te_err = []
        round_model_err = []
        round_te_auc = []
        converged = False
        tol = 1e-5
        te_auc = 2.  # sentinel above any real AUC so the first round cannot trigger convergence
        round = 0
        while round < round_limit:  # and not converged:
            round += 1
            boost.add_model(ds.DecisionStump, tr_data[0], tr_data[1], threshes, thresh_cs)
            boost.update_predict(tr_data[0], training_predict)
            boost.update_predict(te_data[0], testing_predict)
            c_model_err = boost.model[-1].w_err
            round_model_err.append(c_model_err)
            c_f_ind = boost.model[-1].f_ind
            c_thresh = boost.model[-1].thresh
            c_tr_err = util.get_err_from_predict(training_predict, tr_data[1])
            c_te_err = util.get_err_from_predict(testing_predict, te_data[1])
            # calculate the AUC for the testing results
            c_te_auc = util.get_auc_from_predict(testing_predict, te_data[1])
            round_tr_err.append(c_tr_err)
            round_te_err.append(c_te_err)
            round_te_auc.append(c_te_auc)
            print('Round: {} Feature: {} Threshold: {} Round_err: {:.12f} Train_err: {:.12f} Test_err {:.12f} AUC {:.12f}'.format(round, c_f_ind, c_thresh, c_model_err, c_tr_err, c_te_err, c_te_auc))
            converged = abs(c_te_auc - te_auc) / te_auc <= tol  # relative AUC change below tol
            te_auc = c_te_auc

        training_errs.append(round_tr_err[-1])
        testing_errs.append(round_te_err[-1])
        if i == 0:
            round_err_1st_boost = round_model_err
            tr_errs_1st_boost = round_tr_err
            te_errs_1st_boost = round_te_err
            te_auc_1st_boost = round_te_auc
            _, te_roc_1st_boost = util.get_auc_from_predict(testing_predict, te_data[1], True)

        # break      # for testing

    mean_training_err = np.mean(training_errs)
    mean_testing_err = np.mean(testing_errs)

    print(str(k) + '-fold validation done. Training errs are:')
    print(training_errs)
    print('Mean training err is:')
    print(mean_training_err)
    print('Testing errs are:')
    print(testing_errs)
    print('Mean testing err is:')
    print(mean_testing_err)

    result = {}
    result['Fold'] = k
    result['Trainingerrs'] = training_errs
    result['MeanTrainingAcc'] = mean_training_err
    result['Testingerrs'] = testing_errs
    result['MeanTestingAcc'] = mean_testing_err
    result['1stBoostTrainingError'] = tr_errs_1st_boost
    result['1stBoostTestingError'] = te_errs_1st_boost
    result['1stBoostModelError'] = round_err_1st_boost
    result['1stBoostTestingAUC'] = te_auc_1st_boost
    result['1stBoostTestingROC'] = te_roc_1st_boost

    # result['ROC'] = str(roc)
    result['AUC'] = auc

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
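
util.get_auc_from_predict is also project code; a rough sketch of computing AUC from real-valued boosting scores by sweeping thresholds and integrating the ROC curve, assuming labels in {-1, +1} and matching the two call signatures used above:

import numpy as np

def get_auc_from_predict(scores, labels, return_roc=False):
    scores = np.asarray(scores)
    labels = np.asarray(labels)
    order = np.argsort(-scores)            # sort by descending score
    tp = np.cumsum(labels[order] == 1)     # true positives as the threshold sweeps down
    fp = np.cumsum(labels[order] == -1)    # false positives likewise
    tpr = tp / max(tp[-1], 1)
    fpr = fp / max(fp[-1], 1)
    auc = float(np.trapz(tpr, fpr))        # trapezoid rule over the ROC curve
    return (auc, list(zip(fpr, tpr))) if return_roc else auc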