def makeOneFold(nb_folds):
    """Return (x_train, y_train, x_test, y_test) for one randomly chosen fold.

    Note: builds the entire cross-validation split just to use a single
    fold (could be improved).
    """
    train_folds, test_folds = pp.preprocessing_cross_valid(2012, 2014, nb_folds)
    fold_idx = random.randint(0, nb_folds - 1)  # pick a random fold to test
    train = train_folds[fold_idx]
    test = test_folds[fold_idx]
    np.random.shuffle(train)  # shuffle training examples in place
    # Column 0 holds the label; the remaining columns are features.
    return train[:, 1:], train[:, 0], test[:, 1:], test[:, 0]
import PreprocessData as pp
import TestRun
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
# Fixed: 'plt' conventionally aliases matplotlib.pyplot; aliasing the
# top-level matplotlib package would make calls like plt.plot() fail.
import matplotlib.pyplot as plt

# This file is used to test other machine learning algorithms

if __name__ == '__main__':
    # 9-fold cross-validation over the 2012-2014 data.
    data_trains, data_tests = pp.preprocessing_cross_valid(2012, 2014, 9)
    print("Tests")
    errs = []  # per-fold test-set accuracies
    for i in range(9):
        # Column 0 holds the label; the remaining columns are features.
        x_train = data_trains[i][:, 1:]
        y_train = data_trains[i][:, 0]
        x_test = data_tests[i][:, 1:]
        y_test = data_tests[i][:, 0]

        # logistic regression
        reg = LogisticRegression()
        reg.fit(x_train, y_train)
        # Fixed: score() returns mean accuracy, not error; also collect it
        # into errs, which was previously built but never filled.
        score = reg.score(x_test, y_test)
        errs.append(score)
        print("Accuracy:", score)

        # support vector machine
def crossValidate(net, nb_folds, iterations=1000, learning_rate=0.01, grad_decay=0.9, epsilon=0.000001, adadelta=False):
    """Run nb_folds-fold cross-validation of `net` and return the average minimum test error.

    Splits the data into nb_folds batches, using each batch as the testing
    set in turn and the rest as the training set.

    Args:
        net: model object exposing reset(), test(...) and testProbBuckets(...).
        nb_folds: number of cross-validation folds.
        iterations, learning_rate, grad_decay, epsilon, adadelta:
            training hyper-parameters forwarded to net.test().

    Returns:
        Average of the per-fold minimum test errors.
    """
    ######## Need to fix: how to train on multiple years at once?
    data_trains, data_tests = pp.preprocessing_cross_valid(2012, 2014, nb_folds)
    for i in range(nb_folds):
        np.random.shuffle(data_trains[i])  # shuffles training examples

    min_errs = []
    test_errs = []
    train_errs = []
    nb_buckets = 5  # Could make this a parameter
    freq_probs_test = [0] * nb_buckets
    freq_wins_test = [0] * nb_buckets
    freq_probs_train = [0] * nb_buckets
    freq_wins_train = [0] * nb_buckets

    for i in range(nb_folds):
        print("--- Fold " + str(i+1) + " ---")
        # Fixed: time.clock() was removed in Python 3.8; perf_counter() is
        # the supported monotonic wall-clock timer for measuring intervals.
        start = time.perf_counter()
        net.reset()
        # Make test and training sets; column 0 is the label.
        x_train = data_trains[i][:, 1:]
        y_train = data_trains[i][:, 0]
        x_test = data_tests[i][:, 1:]
        y_test = data_tests[i][:, 0]
        temp = net.test(x_train, y_train, iterations, learning_rate, grad_decay, epsilon, adadelta, X_test=x_test, y_test=y_test)
        min_errs.append(temp[0])
        test_errs.append(temp[1])
        train_errs.append(temp[2])
        freqs = net.testProbBuckets(x_train, y_train, nb_buckets=nb_buckets, X_test=x_test, y_test=y_test)
        # Aggregates the prob buckets from each fold together
        freq_probs_test = list(map(add, freq_probs_test, freqs[0]))
        freq_wins_test = list(map(add, freq_wins_test, freqs[1]))
        freq_probs_train = list(map(add, freq_probs_train, freqs[2]))
        freq_wins_train = list(map(add, freq_wins_train, freqs[3]))
        print("Time:", time.perf_counter() - start)

    print("\n----------")
    print(net, "\tNb folds:", nb_folds)
    print("Avg min:", sum(min_errs)/nb_folds, "\t\t\t", min_errs)
    print("Avg final test:", sum(test_errs)/nb_folds, "\t\t\t", test_errs)
    print("Avg final train:", sum(train_errs)/nb_folds, "\t\t\t", train_errs)
    # Empirical win rate per probability bucket; -1 marks an empty bucket.
    probs_test = [freq_wins_test[i] / freq_probs_test[i] if freq_probs_test[i] != 0 else -1 for i in range(nb_buckets)]
    probs_train = [freq_wins_train[i] / freq_probs_train[i] if freq_probs_train[i] != 0 else -1 for i in range(nb_buckets)]
    print("Total freq test:")
    print(freq_probs_test)
    print(freq_wins_test)
    print(["{0:.2f}".format(x) for x in probs_test])
    print("Total freq train:")
    print(freq_probs_train)
    print(freq_wins_train)
    print(["{0:.2f}".format(x) for x in probs_train])

    # Returns average min test error
    return sum(min_errs)/nb_folds