def main():
    """Cross-validate the naive Bayes helpers on the credit dataset.

    The training data is split into 5 train/validation folds. For every
    fold the priors are computed, then the likelihood model(s) matching the
    feature types that are present (categorical only, continuous only, or
    both), and finally the posterior on the validation split. The success
    rate of the resulting predictions is printed once per fold.
    """
    dataset = constant.CREDIT
    # The held-out test split returned by read_data is not used here.
    X_con, X_cat, Y, _, _, _ = read_data(dataset, 0)
    train_validation_sets = k_fold_split(X_cat, X_con, Y, 5)

    # Named `fold` (not `set`) so the builtin is not shadowed.
    for fold in train_validation_sets:
        # extract training and validation sets
        train, validate = fold
        t_cat, t_con, t_y = train
        v_cat, v_con, v_y = validate

        # A single-column label array is converted by nb.convertY first.
        if t_y.shape[1] == 1:
            t_y, v_y = nb.convertY(t_y, v_y)

        # calculate prior, likelihoods and posterior probability depending
        # on what kind of data is in the dataset
        priors = nb.computePrior(t_y, dataset)
        only_categorical = t_cat is not None and t_con is None
        only_continuous = t_con is not None and t_cat is None

        if only_categorical:
            w_cat = nb.computeLikelihoodBernoulli(t_cat, t_y, priors)
            post = nb.posterior(priors, w_cat, None, v_cat, None)
        elif only_continuous:
            w_gauss = nb.computeGaussian(t_con, t_y)
            post = nb.posterior(priors, None, w_gauss, None, v_con)
        else:
            w_cat = nb.computeLikelihoodBernoulli(t_cat, t_y, priors)
            w_gauss = nb.computeGaussian(t_con, t_y)
            post = nb.posterior(priors, w_cat, w_gauss, v_cat, v_con)

        y_hat = nb.predict(post)
        rate = evaluate_acc(v_y, y_hat)
        print("success rate: " + str(rate))
def test_lr_vs_perf(dataset, lr_list):
    """Return the k-fold result obtained for every entry of ``lr_list``.

    The training data of ``dataset`` is split once into 5 folds; each value
    in ``lr_list`` is then used as the first entry of the parameter tuple
    handed to ``run_k_folds`` (presumably the learning rate), with the
    other two entries fixed at 0.005 and 25000.

    Returns:
        A list with one ``run_k_folds`` result per value, in input order.
    """
    # Only the training part of the data is needed here.
    train_x, train_y, _, _ = testImport.read_data(dataset, 1)
    folds = k_fold(train_x, train_y, 5)
    return [run_k_folds((lr, 0.005, 25000), folds) for lr in lr_list]
def test_lr_vs_its(dataset, lr_list):
    """Return ``compute_avg_its()`` of a fitted model for every entry of ``lr_list``.

    For each value a new ``Log_Regression`` is built with that value as its
    first argument (presumably the learning rate) and 0.005 / 25000 as the
    remaining ones, fitted on the training data of ``dataset`` and asked
    for its average iteration count.

    Returns:
        A list with one ``compute_avg_its()`` result per value, in input order.
    """
    train_x, train_y, _, _ = testImport.read_data(dataset, 1)

    def average_iterations(lr):
        model = LogRegression.Log_Regression(lr, 0.005, 25000)
        model.fit(train_x, train_y)
        return model.compute_avg_its()

    return [average_iterations(lr) for lr in lr_list]
def test_both():
    """Compare logistic regression and naive Bayes on datasets 1 through 4.

    Each dataset is read twice: with mode ``1`` for logistic regression and
    with mode ``0`` for naive Bayes. Both models are fitted on their own
    training data and scored on their own test split. The two lists of
    scores (one entry per dataset) are printed at the end.
    """
    log_res = []
    nb_res = []
    for i in range(1, 5):
        x, y, x_test, y_test = testImport.read_data(i, 1)
        x_con, x_cat, y_nb, xt_con, xt_cat, yt = testImport.read_data(i, 0)

        log_model = LogRegression.Log_Regression(1, 0.005, 25000)
        nb_model = NaiveBayes.NaiveBayes()

        log_model.fit(x, y)
        # Fit naive Bayes with the labels returned together with its own
        # features, so features and labels come from the same read_data
        # call (same as test_model_nb does).
        nb_model.fit(x_con, x_cat, y_nb)

        log_res.append(evaluate_acc(y_test, log_model.predict(x_test)))
        nb_res.append(evaluate_acc_NB(yt, nb_model.predict(xt_con, xt_cat)))

    print(log_res)
    print(nb_res)
def test_n_vs_perf(dataset, n_list):
    """Return the test score of logistic regression for every size in ``n_list``.

    For each ``n`` the training data is passed through
    ``less_cases_together(x, y, n)`` (presumably keeping ``n`` training
    cases), a ``Log_Regression(1, 0.005, 10000)`` is fitted on the result
    and scored on the untouched test split.

    Returns:
        A list with one ``evaluate_acc`` result per ``n``, in input order.
    """
    x, y, x_t, y_t = testImport.read_data(dataset, 1)

    def score_with(n):
        subset_x, subset_y = less_cases_together(x, y, n)
        model = LogRegression.Log_Regression(1, 0.005, 10000)
        model.fit(subset_x, subset_y)
        return evaluate_acc(y_t, model.predict(x_t))

    return [score_with(n) for n in n_list]
def test_model_nb(dataset):
    """Fit naive Bayes on ``dataset`` and print its score on the test split."""
    data = testImport.read_data(dataset, 0)
    train_con, train_cat, train_y, test_con, test_cat, test_y = data

    classifier = NaiveBayes.NaiveBayes()
    classifier.fit(train_con, train_cat, train_y)

    predictions = classifier.predict(test_con, test_cat)
    print(evaluate_acc_NB(test_y, predictions))
def test_d_vs_perf(dataset, d_list):
    """Return the test score of logistic regression for every entry of ``d_list``.

    The number of columns of the training matrix is printed first. For each
    ``d`` the train and test features are passed through
    ``less_features(x, x_t, d)`` (presumably keeping ``d`` features), a
    ``Log_Regression(1, 0.005, 10000)`` is fitted on the reduced training
    data and scored on the reduced test data.

    Returns:
        A list with one ``evaluate_acc`` result per ``d``, in input order.
    """
    x, y, x_t, y_t = testImport.read_data(dataset, 1)
    print(x.shape[1])

    def score_with(d):
        reduced_x, reduced_x_t = less_features(x, x_t, d)
        model = LogRegression.Log_Regression(1, 0.005, 10000)
        model.fit(reduced_x, y)
        return evaluate_acc(y_t, model.predict(reduced_x_t))

    return [score_with(d) for d in d_list]
def val_vs_perf():
    """Print three per-dataset score lists for logistic regression.

    For datasets 1 through 4 the following are collected:

    * the test score of a model fitted on the full training data,
    * the result of ``run_k_folds`` on a 5-fold split of the training data,
    * the test score of the model returned by ``run_k_folds_best`` on
      another 5-fold split.

    The three lists are printed in that order once all datasets are done.
    """
    params = (1, 0.005, 25000)
    full_p, k_p, k_on_t_p = [], [], []

    for dataset_id in range(1, 5):
        x, y, x_t, y_t = testImport.read_data(dataset_id, 1)

        # Model trained on all training cases, scored on the test split.
        full_model = LogRegression.Log_Regression(*params)
        full_model.fit(x, y)
        full_p.append(evaluate_acc(y_t, full_model.predict(x_t)))

        # k_fold is called separately for each helper, as before, and
        # both helper calls happen before either result is recorded.
        fold_score = run_k_folds(params, k_fold(x, y, 5))
        k_model = run_k_folds_best(params, k_fold(x, y, 5))
        k_p.append(fold_score)
        k_on_t_p.append(evaluate_acc(y_t, k_model.predict(x_t)))

    for scores in (full_p, k_p, k_on_t_p):
        print(scores)
def test_nb_smaller_d():
    """Plot naive Bayes test score against the feature-count setting ``d``.

    For datasets 1 through 4 the feature matrices are passed through
    ``less_features`` for each ``d`` in a fixed, descending list; a
    ``NaiveBayes`` model is fitted and scored on the test split each time.
    One line per dataset is drawn and the figure is saved to
    ``nb_testing/smaller_d``.
    """
    d_list = [200, 100, 50, 40, 30, 25, 20, 10, 7, 5, 4, 3]

    # Start from a fresh figure so lines drawn by earlier pyplot calls in
    # the same process do not end up in this plot.
    plt.figure()

    for i in range(1, 5):
        smaller_d = []
        x_con, x_cat, y, xt_con, xt_cat, yt = testImport.read_data(i, 0)
        for d in d_list:
            # The matrices are rebound on every pass, so each reduction is
            # applied to the output of the previous (larger-d) reduction.
            if x_con is not None:
                x_con, xt_con = less_features(x_con, xt_con, d)
            if x_cat is not None:
                x_cat, xt_cat = less_features(x_cat, xt_cat, d)
            model = NaiveBayes.NaiveBayes()
            model.fit(x_con, x_cat, y)
            smaller_d.append(
                evaluate_acc_NB(yt, model.predict(xt_con, xt_cat)))
        plt.plot(d_list, smaller_d)

    # The x-axis carries the values of d_list, so label it 'D' (it used to
    # read 'N', the label of the sample-size plot).
    plt.xlabel('D')
    plt.ylabel('performance')
    plt.legend(['ionosphere', 'census', 'poker', 'credit'])
    plt.savefig('nb_testing/smaller_d')
def test_nb_smaller_n():
    """Plot naive Bayes test score against the training-size setting ``n``.

    For datasets 1 through 4 the training data is reduced for each ``n`` in
    a fixed, descending list; a ``NaiveBayes`` model is fitted and scored
    on the test split each time. One line per dataset is drawn and the
    figure is saved to ``nb_testing/smaller_n``.
    """
    n_list = [
        1000, 500, 400, 300, 250, 200, 150, 100, 75, 50, 40, 30, 20, 15, 10
    ]

    # Start from a fresh figure so lines drawn by earlier pyplot calls in
    # the same process do not end up in this plot.
    plt.figure()

    for i in range(1, 5):
        smaller_n = []
        x_con, x_cat, y, xt_con, xt_cat, yt = testImport.read_data(i, 0)
        for n in n_list:
            # Exactly one reduction per pass. The branches are chained with
            # elif so that data holding both feature types is not reduced a
            # second time through x_con/y alone, which would bypass x_cat.
            if x_con is not None and x_cat is not None:
                x_con, x_cat, y = less_cases_separate(x_con, x_cat, y, n)
            elif x_con is not None:
                x_con, y = less_cases_together(x_con, y, n)
            else:
                x_cat, y = less_cases_together(x_cat, y, n)
            model = NaiveBayes.NaiveBayes()
            model.fit(x_con, x_cat, y)
            smaller_n.append(
                evaluate_acc_NB(yt, model.predict(xt_con, xt_cat)))
        plt.plot(n_list, smaller_n)

    plt.xlabel('N')
    plt.ylabel('performance')
    plt.legend(['ionosphere', 'census', 'poker', 'credit'])
    plt.savefig('nb_testing/smaller_n')
def _log_reg_grid_performances(size, fold_list, x_test, y_test, lr_list,
                               eps_list, max_list, reg_list):
    """Score every Log_Regression parameter combination on one fold list.

    Combinations are visited in nested order: ``lr_list`` outermost, then
    ``eps_list``, ``max_list`` and, when ``reg_list`` is given, ``reg_list``
    innermost.

    Returns:
        A list of tuples ``(size, lr, eps, m, k_perf, real_perf)``, or
        ``(size, lr, eps, m, r, k_perf, real_perf)`` when ``reg_list`` is
        not ``None``.
    """
    use_reg = reg_list is not None
    # A single placeholder pass keeps the loop nest uniform when no
    # regularisation values were supplied.
    reg_values = reg_list if use_reg else (None,)

    performances = []
    for lr in lr_list:
        for eps in eps_list:
            for m in max_list:
                for r in reg_values:
                    params = (lr, eps, m, r) if use_reg else (lr, eps, m)
                    model = LogRegression.Log_Regression(*params)
                    # NOTE(review): other call sites in this file pass a
                    # parameter tuple to run_k_folds, not a model instance;
                    # confirm which form run_k_folds expects and whether
                    # `model` is left fitted before predict() is called.
                    k_perf = run_k_folds(model, fold_list)
                    real_perf = evaluate_acc(y_test, model.predict(x_test))
                    performances.append(
                        (size,) + params + (k_perf, real_perf))
    return performances


def test_model_log(dataset, lr_list, eps_list, max_list, n_sizes, d_sizes,
                   folds, reg_list=None):
    """Grid-search logistic regression over training sizes and feature counts.

    Reducing training cases and reducing features are tested separately,
    never combined.

    Args:
        dataset: identifier passed to ``testImport.read_data``.
        lr_list, eps_list, max_list: values for the first three
            ``Log_Regression`` arguments; every combination is tried.
        n_sizes: sizes passed to ``less_cases_together`` for the
            training-size experiment.
        d_sizes: sizes passed to ``less_features`` for the feature-count
            experiment.
        folds: number handed to ``k_fold``.
        reg_list: optional values for a fourth ``Log_Regression`` argument.
            ``None`` (the default) means the argument is not passed.

    Returns:
        ``(n_performances, d_performances)``. Each is a list holding, per
        size, the list of result tuples described in
        ``_log_reg_grid_performances``.
    """
    x, y, x_t, y_t = testImport.read_data(dataset, 1)

    # tests each n_size with all the other stuff
    n_performances = []
    for size in n_sizes:
        x_s, y_s = less_cases_together(x, y, size)
        fold_list = k_fold(x_s, y_s, folds)
        n_performances.append(
            _log_reg_grid_performances(size, fold_list, x_t, y_t, lr_list,
                                       eps_list, max_list, reg_list))

    # tests each d_size with all the other stuff
    d_performances = []
    for size in d_sizes:
        # The test features are reduced together with the training ones so
        # the model is scored on matching columns.
        x_s, x_t_s = less_features(x, x_t, size)
        fold_list = k_fold(x_s, y, folds)
        d_performances.append(
            _log_reg_grid_performances(size, fold_list, x_t_s, y_t, lr_list,
                                       eps_list, max_list, reg_list))

    return n_performances, d_performances