Пример #1
0
def main():
    """Cross-validate the ``nb`` Naive Bayes routines on the credit dataset.

    Loads ``constant.CREDIT`` through ``read_data(dataset, 0)``, splits the
    training portion with ``k_fold_split(..., 5)`` and, for every
    (train, validation) pair, prints the value returned by ``evaluate_acc``
    for the validation predictions.
    """
    dataset = constant.CREDIT
    # read_data also returns a test split; it is not needed here.
    X_con, X_cat, Y, _, _, _ = read_data(dataset, 0)

    train_validation_sets = k_fold_split(X_cat, X_con, Y, 5)

    # Unpack each fold in the loop header instead of binding it to a name
    # that shadows the builtin ``set``.
    for (t_cat, t_con, t_y), (v_cat, v_con, v_y) in train_validation_sets:
        # Single-column labels are passed through nb.convertY first.
        if t_y.shape[1] == 1:
            t_y, v_y = nb.convertY(t_y, v_y)

        # Calculate prior, likelihoods and posterior probability depending
        # on what kind of data is in the dataset.
        priors = nb.computePrior(t_y, dataset)

        only_categorical = t_cat is not None and t_con is None
        only_continuous = t_con is not None and t_cat is None

        if only_categorical:
            w_cat = nb.computeLikelihoodBernoulli(t_cat, t_y, priors)
            post = nb.posterior(priors, w_cat, None, v_cat, None)
        elif only_continuous:
            w_gauss = nb.computeGaussian(t_con, t_y)
            post = nb.posterior(priors, None, w_gauss, None, v_con)
        else:
            # Mixed data: combine both likelihood models.
            w_cat = nb.computeLikelihoodBernoulli(t_cat, t_y, priors)
            w_gauss = nb.computeGaussian(t_con, t_y)
            post = nb.posterior(priors, w_cat, w_gauss, v_cat, v_con)

        y_hat = nb.predict(post)
        rate = evaluate_acc(v_y, y_hat)
        print("success rate: " + str(rate))
def test_lr_vs_perf(dataset, lr_list):
    """Return the ``run_k_folds`` result for every entry of *lr_list*.

    The training data of *dataset* (read with mode ``1``) is split once with
    ``k_fold(x, y, 5)``; the same folds are reused for each value. Each value
    is passed as the first element of the ``(lr, 0.005, 25000)`` parameter
    tuple (presumably the learning rate -- confirm against ``run_k_folds``).

    Returns:
        A list with one ``run_k_folds`` result per element of *lr_list*,
        in the same order.
    """
    # Only the training part is used; the test split is discarded.
    train_x, train_y, _, _ = testImport.read_data(dataset, 1)
    shared_folds = k_fold(train_x, train_y, 5)
    return [
        run_k_folds((rate, 0.005, 25000), shared_folds) for rate in lr_list
    ]
def test_lr_vs_its(dataset, lr_list):
    """Return ``compute_avg_its()`` of a freshly fitted model per *lr_list* entry.

    For each value a new ``LogRegression.Log_Regression(value, 0.005, 25000)``
    is fitted on the training data of *dataset* (read with mode ``1``).

    Returns:
        A list of the ``compute_avg_its()`` results, ordered like *lr_list*.
    """
    # The test split returned by read_data is not needed here.
    train_x, train_y, _, _ = testImport.read_data(dataset, 1)

    def _avg_its_for(rate):
        # Fit a fresh model so runs do not influence each other.
        classifier = LogRegression.Log_Regression(rate, 0.005, 25000)
        classifier.fit(train_x, train_y)
        return classifier.compute_avg_its()

    return [_avg_its_for(rate) for rate in lr_list]
def test_both():
    """Compare logistic regression and Naive Bayes on dataset ids 1 through 4.

    For each id the data is read twice: mode ``1`` for logistic regression
    and mode ``0`` (separate continuous / categorical features) for Naive
    Bayes. Each model is fitted on its own training split and scored on its
    own test split. The two lists of scores are printed at the end.
    """
    log_res = []
    nb_res = []
    for i in range(1, 5):
        x, y, x_test, y_test = testImport.read_data(i, 1)
        x_con, x_cat, y_nb, xt_con, xt_cat, yt = testImport.read_data(i, 0)

        log_model = LogRegression.Log_Regression(1, 0.005, 25000)
        nb_model = NaiveBayes.NaiveBayes()

        log_model.fit(x, y)
        # Fit Naive Bayes on the labels that belong to its own (mode 0)
        # features, matching how it is evaluated against ``yt`` below.
        # Previously the mode-1 labels were passed here and the mode-0
        # labels were read but never used.
        nb_model.fit(x_con, x_cat, y_nb)

        log_res.append(evaluate_acc(y_test, log_model.predict(x_test)))
        nb_res.append(evaluate_acc_NB(yt, nb_model.predict(xt_con, xt_cat)))
    print(log_res)
    print(nb_res)
def test_n_vs_perf(dataset, n_list):
    """Score logistic regression on the test split for each size in *n_list*.

    For every ``n`` the training data is reduced with
    ``less_cases_together(x, y, n)``, a new
    ``LogRegression.Log_Regression(1, 0.005, 10000)`` is fitted on the
    reduced data and evaluated with ``evaluate_acc`` on the unchanged test
    split.

    Returns:
        A list of ``evaluate_acc`` results, ordered like *n_list*.
    """
    train_x, train_y, test_x, test_y = testImport.read_data(dataset, 1)

    def _score_with(n_cases):
        # Always reduce from the full training set, not the previous subset.
        sub_x, sub_y = less_cases_together(train_x, train_y, n_cases)
        classifier = LogRegression.Log_Regression(1, 0.005, 10000)
        classifier.fit(sub_x, sub_y)
        return evaluate_acc(test_y, classifier.predict(test_x))

    return [_score_with(n_cases) for n_cases in n_list]
def test_model_nb(dataset):
    """Fit ``NaiveBayes.NaiveBayes`` on *dataset* and print its test score.

    The data is read with mode ``0``; the model is fitted on the training
    split and the result of ``evaluate_acc_NB`` on the test split is printed.
    """
    (train_con, train_cat, train_y,
     test_con, test_cat, test_y) = testImport.read_data(dataset, 0)

    classifier = NaiveBayes.NaiveBayes()
    classifier.fit(train_con, train_cat, train_y)

    predictions = classifier.predict(test_con, test_cat)
    print(evaluate_acc_NB(test_y, predictions))
def test_d_vs_perf(dataset, d_list):
    """Score logistic regression for each feature count in *d_list*.

    Prints the number of columns of the full training matrix, then for every
    ``d`` reduces both training and test features with
    ``less_features(x, x_t, d)``, fits a new
    ``LogRegression.Log_Regression(1, 0.005, 10000)`` and evaluates it with
    ``evaluate_acc`` on the reduced test features.

    Returns:
        A list of ``evaluate_acc`` results, ordered like *d_list*.
    """
    train_x, train_y, test_x, test_y = testImport.read_data(dataset, 1)
    print(train_x.shape[1])

    def _score_with(n_features):
        # Always reduce from the full feature matrices.
        sub_train_x, sub_test_x = less_features(train_x, test_x, n_features)
        classifier = LogRegression.Log_Regression(1, 0.005, 10000)
        classifier.fit(sub_train_x, train_y)
        return evaluate_acc(test_y, classifier.predict(sub_test_x))

    return [_score_with(n_features) for n_features in d_list]
def val_vs_perf():
    """Compare full-fit, k-fold and best-fold scores on dataset ids 1 to 4.

    For each dataset (read with mode ``1``) three numbers are collected:

    * the test score of a model fitted on the whole training split,
    * the value returned by ``run_k_folds`` on a 5-way ``k_fold`` split,
    * the test score of the model returned by ``run_k_folds_best`` on a
      second, separately created 5-way ``k_fold`` split.

    The three resulting lists are printed, one per line.
    """
    params = (1, 0.005, 25000)
    full_scores, k_scores, k_test_scores = [], [], []

    for dataset_id in range(1, 5):
        train_x, train_y, test_x, test_y = testImport.read_data(dataset_id, 1)

        # Baseline: train on everything, score on the test split.
        baseline = LogRegression.Log_Regression(*params)
        baseline.fit(train_x, train_y)
        full_scores.append(evaluate_acc(test_y, baseline.predict(test_x)))

        # Cross-validated score, then the best model from a fresh split.
        k_score = run_k_folds(params, k_fold(train_x, train_y, 5))
        best_model = run_k_folds_best(params, k_fold(train_x, train_y, 5))
        k_scores.append(k_score)
        k_test_scores.append(evaluate_acc(test_y, best_model.predict(test_x)))

    print(full_scores)
    print(k_scores)
    print(k_test_scores)
def test_nb_smaller_d():
    """Plot Naive Bayes test score against the number of features kept.

    For dataset ids 1 through 4 (read with mode ``0``) the continuous and
    categorical feature matrices are reduced with ``less_features`` for each
    value in a fixed, descending list of feature counts. A new
    ``NaiveBayes.NaiveBayes`` is fitted after every reduction and scored with
    ``evaluate_acc_NB`` on the equally reduced test features. One curve per
    dataset is drawn and the figure is saved to ``nb_testing/smaller_d``.
    """
    d_list = [200, 100, 50, 40, 30, 25, 20, 10, 7, 5, 4, 3]
    for i in range(1, 5):
        smaller_d = []
        x_con, x_cat, y, xt_con, xt_cat, yt = testImport.read_data(i, 0)
        for d in d_list:
            # NOTE(review): the matrices are overwritten, so each reduction
            # starts from the previous (already reduced) features rather than
            # from the full data -- confirm this is intended.
            if x_con is not None:
                x_con, xt_con = less_features(x_con, xt_con, d)
            if x_cat is not None:
                x_cat, xt_cat = less_features(x_cat, xt_cat, d)
            model = NaiveBayes.NaiveBayes()
            model.fit(x_con, x_cat, y)
            smaller_d.append(
                evaluate_acc_NB(yt, model.predict(xt_con, xt_cat)))
        plt.plot(d_list, smaller_d)
        # The x-axis carries the feature counts from d_list; it was
        # previously mislabelled 'N'.
        plt.xlabel('D')
        plt.ylabel('performance')
    plt.legend(['ionosphere', 'census', 'poker', 'credit'])
    plt.savefig('nb_testing/smaller_d')
def test_nb_smaller_n():
    """Plot Naive Bayes test score against the number of training cases.

    For dataset ids 1 through 4 (read with mode ``0``) the training data is
    reduced for each value in a fixed, descending list of case counts. A new
    ``NaiveBayes.NaiveBayes`` is fitted after every reduction and scored with
    ``evaluate_acc_NB`` on the unchanged test split. One curve per dataset is
    drawn and the figure is saved to ``nb_testing/smaller_n``.
    """
    n_list = [
        1000, 500, 400, 300, 250, 200, 150, 100, 75, 50, 40, 30, 20, 15, 10
    ]
    for i in range(1, 5):
        smaller_n = []
        x_con, x_cat, y, xt_con, xt_cat, yt = testImport.read_data(i, 0)
        for n in n_list:
            # Exactly one reduction per step. Previously the second test was
            # a plain ``if``, so datasets with both feature kinds had x_con
            # and y reduced a second time without x_cat, which could leave
            # the three arrays out of step with each other.
            # NOTE(review): the arrays are overwritten, so each reduction
            # starts from the previous subset -- confirm this is intended.
            if x_con is not None and x_cat is not None:
                x_con, x_cat, y = less_cases_separate(x_con, x_cat, y, n)
            elif x_con is not None:
                x_con, y = less_cases_together(x_con, y, n)
            else:
                x_cat, y = less_cases_together(x_cat, y, n)
            model = NaiveBayes.NaiveBayes()
            model.fit(x_con, x_cat, y)
            smaller_n.append(
                evaluate_acc_NB(yt, model.predict(xt_con, xt_cat)))
        plt.plot(n_list, smaller_n)
        plt.xlabel('N')
        plt.ylabel('performance')
    plt.legend(['ionosphere', 'census', 'poker', 'credit'])
    plt.savefig('nb_testing/smaller_n')
def _log_grid_performances(size, fold_list, x_eval, y_eval, lr_list, eps_list,
                           max_list, reg_list):
    """Evaluate every parameter combination for one reduced data set.

    Builds a ``LogRegression.Log_Regression`` for each combination of
    *lr_list* x *eps_list* x *max_list* (x *reg_list* when it is not
    ``None``), passes it to ``run_k_folds`` together with *fold_list* and
    then scores it with ``evaluate_acc`` on ``(x_eval, y_eval)``.

    Returns:
        A list of tuples ``(size, lr, eps, m, k_perf, real_perf)``, or
        ``(size, lr, eps, m, r, k_perf, real_perf)`` when *reg_list* is
        given.
    """
    performances = []
    for lr in lr_list:
        for eps in eps_list:
            for m in max_list:
                # ``is not None`` rather than ``!= None``: the latter is
                # evaluated element-wise for array-like reg_list values.
                if reg_list is not None:
                    extras = ((r,) for r in reg_list)
                else:
                    extras = [()]
                for extra in extras:
                    model = LogRegression.Log_Regression(lr, eps, m, *extra)
                    # NOTE(review): run_k_folds receives a model instance
                    # here but a parameter tuple in test_lr_vs_perf and
                    # val_vs_perf -- confirm it accepts both.
                    k_perf = run_k_folds(model, fold_list)
                    real_perf = evaluate_acc(y_eval, model.predict(x_eval))
                    performances.append(
                        (size, lr, eps, m) + extra + (k_perf, real_perf))
    return performances


def test_model_log(dataset,
                   lr_list,
                   eps_list,
                   max_list,
                   n_sizes,
                   d_sizes,
                   folds,
                   reg_list=None):
    """Grid-search logistic regression over reduced case and feature counts.

    The data of *dataset* is read with mode ``1``. Two independent sweeps are
    run (removing cases and removing features are not combined):

    * for each entry of *n_sizes* the training cases are reduced with
      ``less_cases_together`` and the models are scored on the full test
      split;
    * for each entry of *d_sizes* the features of both splits are reduced
      with ``less_features`` and the models are scored on the reduced test
      features.

    In both sweeps the reduced training data is split with
    ``k_fold(..., folds)`` and every combination of *lr_list*, *eps_list*,
    *max_list* and, if given, *reg_list* is evaluated.

    Args:
        dataset: identifier passed to ``testImport.read_data``.
        lr_list, eps_list, max_list: values for the first three
            ``Log_Regression`` constructor arguments.
        n_sizes: case counts to try.
        d_sizes: feature counts to try.
        folds: third argument of ``k_fold``.
        reg_list: optional values for a fourth ``Log_Regression`` argument.

    Returns:
        ``(n_performances, d_performances)``: two lists with one entry per
        size, each entry being a list of tuples
        ``(size, lr, eps, m, [r,] k_perf, real_perf)``.
    """
    x, y, x_t, y_t = testImport.read_data(dataset, 1)

    # Sweep over the number of training cases.
    n_performances = []
    for size in n_sizes:
        x_s, y_s = less_cases_together(x, y, size)
        fold_list = k_fold(x_s, y_s, folds)
        n_performances.append(
            _log_grid_performances(size, fold_list, x_t, y_t, lr_list,
                                   eps_list, max_list, reg_list))

    # Sweep over the number of features; the test features are reduced too.
    d_performances = []
    for size in d_sizes:
        x_s, x_t_s = less_features(x, x_t, size)
        fold_list = k_fold(x_s, y, folds)
        d_performances.append(
            _log_grid_performances(size, fold_list, x_t_s, y_t, lr_list,
                                   eps_list, max_list, reg_list))

    return n_performances, d_performances