def _read_data(filename):
    """Load a learner data file into a float ndarray.

    For 'Istanbul.csv' the header row and the leading date column are
    stripped before returning; all other files are returned as read.
    """
    with util.get_learner_data_file(filename) as f:
        alldata = np.genfromtxt(f, delimiter=',')
    if filename == 'Istanbul.csv':
        # Drop the header row (row 0) and the date column (column 0).
        return alldata[1:, 1:]
    return alldata
def get_rmse(n_leaves):
    """Measure RTLearner RMSE versus training-set size on Istanbul data.

    For training fractions 0.05 .. 0.75 (step 0.05), trains an RTLearner
    with leaf_size=10 on a random row split (with randomly permuted
    feature columns) and records in-sample and out-of-sample RMSE.

    Returns (rmse_train, rmse_test, percents).

    NOTE(review): ``n_leaves`` is currently unused (the per-leaf-size
    sweep it drove is gone); the parameter is kept so existing callers
    keep working.
    """
    inf = 'Istanbul.csv'
    rmse_train, rmse_test = [], []
    percents = np.arange(0.05, 0.8, 0.05)
    # Load the data once -- it is invariant across the loop below
    # (previously the CSV was re-read on every iteration).
    with util.get_learner_data_file(inf) as f:
        alldata = np.genfromtxt(f, delimiter=',')
    # Skip the header row and the date column of the Istanbul data.
    alldata = alldata[1:, 1:]
    datasize = alldata.shape[0]
    for i in percents:
        cutoff = int(datasize * i)
        # Random row split; feature columns are also randomly reordered.
        permutation = np.random.permutation(alldata.shape[0])
        col_permutation = np.random.permutation(alldata.shape[1] - 1)
        train_data = alldata[permutation[:cutoff], :]
        trainX = train_data[:, col_permutation]
        trainY = train_data[:, -1]
        test_data = alldata[permutation[cutoff:], :]
        testX = test_data[:, col_permutation]
        testY = test_data[:, -1]

        learner = rtl.RTLearner(leaf_size=10)
        learner.addEvidence(trainX, trainY)

        # Evaluate in sample.
        predY = learner.query(trainX)
        rmse_train.append(
            math.sqrt(((trainY - predY) ** 2).sum() / trainY.shape[0]))
        # Evaluate out of sample.
        predY = learner.query(testX)
        rmse_test.append(
            math.sqrt(((testY - predY) ** 2).sum() / testY.shape[0]))
    return rmse_train, rmse_test, percents
def get_data_dict(dataName, portion=False, shuffle=False):
    """Load a learner data file and split it 60/40 into train/test sets.

    dataName: file name passed to util.get_learner_data_file; for
              'Istanbul.csv' the header row and date column are dropped.
    portion:  if truthy, keep only that leading fraction of the rows
              before splitting.
    shuffle:  if True, shuffle the rows in place before splitting.

    Returns a dict with keys 'trainX', 'trainY', 'testX', 'testY'.
    """
    with util.get_learner_data_file(dataName) as f:
        data = np.genfromtxt(f, delimiter=',')
    if dataName == 'Istanbul.csv':
        # Drop the header row and the date column.
        data = data[1:, 1:]
    if shuffle:
        np.random.shuffle(data)
    if portion:
        data = data[:int(portion * data.shape[0])]
    # 60% of (possibly trimmed) rows are used for training.
    train_rows = int(0.6 * data.shape[0])
    return {
        'trainX': data[:train_rows, 0:-1],
        'trainY': data[:train_rows, -1],
        'testX': data[train_rows:, 0:-1],
        'testY': data[train_rows:, -1],
    }
def test_learners(description, group, datafile, seed, outputs, grader): """Test ML models returns correct predictions. Requires test description, test case group, inputs, expected outputs, and a grader fixture. """ points_earned = 0.0 # initialize points for this test case try: learner_class = None kwargs = {'verbose': False} # (BPH) Copied from grade_strategy_qlearning.py #Set fixed seed for repetability np.random.seed(seed) random.seed(seed) #remove ability to seed either np.random or python random tmp_numpy_seed = np.random.seed tmp_random_seed = random.seed np.random.seed = fake_seed random.seed = fake_rseed # Try to import KNNLearner (only once) # if not 'KNNLearner' in globals(): # from KNNLearner import KNNLearner if not 'RTLearner' in globals(): from RTLearner import RTLearner if not 'DTLearner' in globals(): from DTLearner import DTLearner if (group is 'BagLearner') or (group is 'InsaneLearner') or ( group is 'RandomName') and (not 'BagLearner' in globals()): from BagLearner import BagLearner #put seeds back for the moment np.random.seed = tmp_numpy_seed random.seed = tmp_random_seed # Tweak kwargs # kwargs.update(inputs.get('kwargs', {})) # Read separate training and testing data files # with open(inputs['train_file']) as f: # data_partitions=list() testX, testY, trainX, trainY = None, None, None, None permutation = None author = None with util.get_learner_data_file(datafile) as f: alldata = np.genfromtxt(f, delimiter=',') # Skip the date column and header row if we're working on Istanbul data if datafile == 'Istanbul.csv': alldata = alldata[1:, 1:] datasize = alldata.shape[0] cutoff = int(datasize * 0.6) permutation = np.random.permutation(alldata.shape[0]) col_permutation = np.random.permutation(alldata.shape[1] - 1) train_data = alldata[permutation[:cutoff], :] # trainX = train_data[:,:-1] trainX = train_data[:, col_permutation] trainY = train_data[:, -1] test_data = alldata[permutation[cutoff:], :] # testX = test_data[:,:-1] testX = test_data[:, 
col_permutation] testY = test_data[:, -1] msgs = [] if (group is "RTLearner") or (group is "DTLearner"): clss_name = RTLearner if group is "RTLearner" else DTLearner tree_sptc = 3 if group is "RTLearner" else 10 corr_in, corr_out, corr_in_50 = None, None, None def oneleaf(): np.random.seed(seed) random.seed(seed) np.random.seed = fake_seed random.seed = fake_rseed learner = clss_name(leaf_size=1, verbose=False) learner.addEvidence(trainX, trainY) insample = learner.query(trainX) outsample = learner.query(testX) np.random.seed = tmp_numpy_seed random.seed = tmp_random_seed author_rv = None try: author_rv = learner.author() except: pass return insample, outsample, author_rv def fiftyleaves(): np.random.seed(seed) random.seed(seed) np.random.seed = fake_seed random.seed = fake_rseed learner = clss_name(leaf_size=50, verbose=False) learner.addEvidence(trainX, trainY) np.random.seed = tmp_numpy_seed random.seed = tmp_random_seed return learner.query(trainX) predY_in, predY_out, author = run_with_timeout( oneleaf, tree_sptc, (), {}) predY_in_50 = run_with_timeout(fiftyleaves, tree_sptc, (), {}) corr_in = np.corrcoef(predY_in, y=trainY)[0, 1] corr_out = np.corrcoef(predY_out, y=testY)[0, 1] corr_in_50 = np.corrcoef(predY_in_50, y=trainY)[0, 1] incorrect = False if corr_in < outputs['insample_corr_min'] or np.isnan(corr_in): incorrect = True msgs.append( " In-sample with leaf_size=1 correlation less than allowed: got {} expected {}" .format(corr_in, outputs['insample_corr_min'])) else: points_earned += 1.0 if corr_out < outputs['outsample_corr_min'] or np.isnan(corr_out): incorrect = True msgs.append( " Out-of-sample correlation less than allowed: got {} expected {}" .format(corr_out, outputs['outsample_corr_min'])) else: points_earned += 1.0 if corr_in_50 > outputs['insample_corr_max'] or np.isnan( corr_in_50): incorrect = True msgs.append( " In-sample correlation with leaf_size=50 greater than allowed: got {} expected {}" .format(corr_in_50, 
outputs['insample_corr_max'])) else: points_earned += 1.0 # Check author string if (author is None) or (author == 'tb34'): incorrect = True msgs.append(" Invalid author: {}".format(author)) points_earned += -2.0 elif group is "BagLearner": corr1, corr20 = None, None bag_sptc = 10 def onebag(): np.random.seed(seed) random.seed(seed) np.random.seed = fake_seed random.seed = fake_rseed learner1 = BagLearner(learner=RTLearner, kwargs={"leaf_size": 1}, bags=1, boost=False, verbose=False) learner1.addEvidence(trainX, trainY) q_rv = learner1.query(testX) a_rv = learner1.author() np.random.seed = tmp_numpy_seed random.seed = tmp_random_seed return q_rv, a_rv def twentybags(): np.random.seed(seed) random.seed(seed) np.random.seed = fake_seed random.seed = fake_rseed learner20 = BagLearner(learner=RTLearner, kwargs={"leaf_size": 1}, bags=20, boost=False, verbose=False) learner20.addEvidence(trainX, trainY) q_rv = learner20.query(testX) np.random.seed = tmp_numpy_seed random.seed = tmp_random_seed return q_rv predY1, author = run_with_timeout(onebag, bag_sptc, pos_args=(), keyword_args={}) predY20 = run_with_timeout(twentybags, bag_sptc, (), {}) corr1 = np.corrcoef(predY1, testY)[0, 1] corr20 = np.corrcoef(predY20, testY)[0, 1] incorrect = False # msgs = [] if corr20 <= corr1: incorrect = True msgs.append( " Out-of-sample correlation for 20 bags is not greater than for 1 bag. 
20 bags:{}, 1 bag:{}" .format(corr20, corr1)) else: points_earned += 2.0 # Check author string if (author is None) or (author == 'tb34'): incorrect = True msgs.append(" Invalid author: {}".format(author)) points_earned += -1.0 elif group is "InsaneLearner": try: def insane(): import InsaneLearner as it learner = it.InsaneLearner(verbose=False) learner.addEvidence(trainX, trainY) Y = learner.query(testX) run_with_timeout(insane, 10, pos_args=(), keyword_args={}) incorrect = False except Exception as e: incorrect = True msgs.append( " Exception calling InsaneLearner: {}".format(e)) points_earned = -10 elif group is "RandomName": try: il_name, il_code = gen_class() exec(il_code) in globals(), locals() il_cobj = eval(il_name) def rnd_name(): np.random.seed(seed) random.seed(seed) np.random.seed = fake_seed random.seed = fake_rseed learner = BagLearner(learner=il_cobj, kwargs={'verbose': False}, bags=20, boost=False, verbose=False) learner.addEvidence(trainX, trainY) Y = learner.query(testX) np.random.seed = tmp_numpy_seed random.seed = tmp_random_seed return il_cobj.init_callcount_dict, il_cobj.add_callcount_dict, il_cobj.query_callcount_dict iccd, accd, qccd = run_with_timeout(rnd_name, 10, pos_args=(), keyword_args={}) incorrect = False if (len(iccd) != 20) or (any([v != 1 for v in iccd.values()])): incorrect = True msgs.append( " Unexpected number of calls to __init__, sum={} (should be 20), max={} (should be 1), min={} (should be 1)" .format(len(iccd), max(iccd.values()), min(iccd.values()))) points_earned = -10 if (len(accd) != 20) or (any([v != 1 for v in accd.values()])): incorrect = True msgs.append( " Unexpected number of calls to addEvidence sum={} (should be 20), max={} (should be 1), min={} (should be 1)" .format(len(accd), max(accd.values()), min(accd.values()))) points_earned = -10 if (len(qccd) != 20) or (any([v != 1 for v in qccd.values()])): incorrect = True msgs.append( " Unexpected number of calls to query, sum={} (should be 20), max={} (should be 
1), min={} (should be 1)" .format(len(qccd), max(qccd.values()), min(qccd.values()))) points_earned = -10 except Exception as e: incorrect = True msgs.append(" Exception calling BagLearner: {}".format(e)) points_earned = -10 if incorrect: inputs_str = " data file: {}\n" \ " permutation: {}".format(datafile, permutation) raise IncorrectOutput, "Test failed on one or more output criteria.\n Inputs:\n{}\n Failures:\n{}".format( inputs_str, "\n".join(msgs)) except Exception as e: # Test result: failed msg = "Description: {} (group: {})\n".format(description, group) # Generate a filtered stacktrace, only showing erroneous lines in student file(s) tb_list = tb.extract_tb(sys.exc_info()[2]) for i in xrange(len(tb_list)): row = tb_list[i] tb_list[i] = (os.path.basename(row[0]), row[1], row[2], row[3] ) # show only filename instead of long absolute path tb_list = [ row for row in tb_list if (row[0] == 'RTLearner.py') or (row[0] == 'BagLearner.py') ] if tb_list: msg += "Traceback:\n" msg += ''.join(tb.format_list(tb_list)) # contains newlines msg += "{}: {}".format(e.__class__.__name__, e.message) # Report failure result to grader, with stacktrace grader.add_result( GradeResult(outcome='failed', points=points_earned, msg=msg)) raise else: # Test result: passed (no exceptions) grader.add_result( GradeResult(outcome='passed', points=points_earned, msg=None))
def generate_plots(datafile='Istanbul.csv'): np.random.seed(10) print " Plotting graphs for " + str(datafile) with util.get_learner_data_file(datafile) as f: alldata = np.genfromtxt(f, delimiter=',') # Skip the date column and header row if we're working on Istanbul data if datafile == 'Istanbul.csv': alldata = alldata[1:, 1:] data = alldata # print data.shape[0] data = np.random.permutation(data) # compute how much of the data is training and testing np.random.shuffle(data) train_rows = int(0.6 * data.shape[0]) test_rows = data.shape[0] - train_rows # separate out training and testing data trainX = data[:train_rows, 0:-1] trainY = data[:train_rows, -1] testX = data[train_rows:, 0:-1] testY = data[train_rows:, -1] rmses_in = np.empty(100) rmses_in.fill(-1) rmses_out = np.empty(100) rmses_out.fill(-1) dt_learner_tree_size = np.empty(100) dt_learner_tree_size.fill(-1) dt_learner_tree_time = np.empty(100) dt_learner_tree_time.fill(-1) leaf_size = range(100) leaf_plot = range(1, 101) for i in leaf_size: learner = dt.DTLearner(leaf_size=i + 1, verbose=False) # constructor rmses_in[i], rmses_out[i], dt_learner_tree_size[ i], dt_learner_tree_time[i] = run_learner(learner, trainX, trainY, testX, testY) # training step plt.plot(leaf_plot, rmses_in, label="In Sample") plt.plot(leaf_plot, rmses_out, label="Out Sample") plt.legend(loc="best") plt.grid(True) plt.title('DTLearner overfitting with increasing leaf size.', fontsize=10) plt.ylabel('RMSE') plt.xlabel('Leaf Size') plt.savefig('DTLearner overfitting with increasing leaf size.') plt.clf() plt.cla() plt.close() rmses_in_2 = np.empty(100) rmses_in_2.fill(-1) rmses_out_2 = np.empty(100) rmses_out_2.fill(-1) for i in leaf_size: kwargs = {'leaf_size': i + 1} learner = bl.BagLearner(learner=dt.DTLearner, kwargs=kwargs, verbose=False, bags=20) # constructor rmses_in_2[i], rmses_out_2[i], ignore, ignore_2 = run_learner( learner, trainX, trainY, testX, testY) # training step plt.plot(leaf_plot, rmses_in_2, label="In Sample") 
plt.plot(leaf_plot, rmses_out_2, label="Out Sample") plt.legend(loc="best") plt.grid(True) plt.title('Baglearner at 20 bags overfitting with increasing leaf size.', fontsize=10) plt.ylabel('RMSE') plt.xlabel('Leaf Size') plt.savefig('Baglearner at 20 bags overfitting with increasing leaf size.') plt.clf() plt.cla() plt.close() rmses_in_3 = np.empty(100) rmses_in_3.fill(-1) rmses_out_3 = np.empty(100) rmses_out_3.fill(-1) rt_learner_tree_size = np.empty(100) rt_learner_tree_size.fill(-1) rt_learner_tree_time = np.empty(100) rt_learner_tree_time.fill(-1) for i in leaf_size: learner = rt.RTLearner(leaf_size=i + 1, verbose=False) # constructor rmses_in_3[i], rmses_out_3[i], rt_learner_tree_size[ i], rt_learner_tree_time[i] = run_learner(learner, trainX, trainY, testX, testY) # training step plt.plot(leaf_plot, rmses_in_3, label="In Sample") plt.plot(leaf_plot, rmses_out_3, label="Out Sample") plt.legend(loc="best") plt.grid(True) plt.title('RTLearner overfitting with increasing leaf size.', fontsize=10) plt.ylabel('RMSE') plt.xlabel('Leaf Size') plt.savefig('RTLearner overfitting with increasing leaf size.') plt.clf() plt.cla() plt.close() plt.plot(leaf_plot, rt_learner_tree_size, label="RTLearner") plt.plot(leaf_plot, dt_learner_tree_size, label="DTLearner") plt.legend(loc="best") plt.grid(True) plt.title('RTLearner vs DTLearner Tree Size.', fontsize=10) plt.ylabel('Tree Size') plt.xlabel('Leaf Size') plt.savefig('RTLearner vs DTLearner Tree Size.') plt.clf() plt.cla() plt.close() plt.plot(leaf_plot, rt_learner_tree_time, label="RTLearner") plt.plot(leaf_plot, dt_learner_tree_time, label="DTLearner") plt.legend(loc="best") plt.grid(True) plt.title('RTLearner vs DTLearner Tree Building Time.', fontsize=10) plt.ylabel('Time (Seconds)') plt.xlabel('Leaf Size') plt.savefig('RTLearner vs DTLearner Tree Building Time.') plt.clf() plt.cla() plt.close() rmses_in_5 = np.empty(100) rmses_in_5.fill(-1) rmses_out_5 = np.empty(100) rmses_out_5.fill(-1) for i in leaf_size: kwargs = 
{'leaf_size': i + 1} learner = bl.BagLearner(learner=dt.DTLearner, kwargs=kwargs, verbose=False, bags=10) # constructor rmses_in_5[i], rmses_out_5[i], ignore, ignore_2 = run_learner( learner, trainX, trainY, testX, testY) # training step plt.plot(leaf_plot, rmses_in_5, label="In Sample") plt.plot(leaf_plot, rmses_out_5, label="Out Sample") plt.legend(loc="best") plt.grid(True) plt.title('Baglearner at 10 bags overfitting with increasing leaf size.', fontsize=10) plt.ylabel('RMSE') plt.xlabel('Leaf Size') plt.savefig('Baglearner at 10 bags overfitting with increasing leaf size.') plt.clf() plt.cla() plt.close()
import DTLearner as dt import RTLearner as rt import BagLearner as bl import sys import matplotlib.pyplot as plt import time import scipy.stats as stats if __name__ == "__main__": Path = './' if len(sys.argv) != 2: print "Usage: python testlearner.py <filename>" sys.exit(1) datafile = sys.argv[1] data = np.genfromtxt(util.get_learner_data_file(datafile), delimiter=',') if datafile == 'Istanbul.csv': data = data[1:, 1:] datasize = data.shape[0] cutoff = int(datasize * 0.6) leaf_sizes = range(1, 51) #Problem 1 rmse_in = np.zeros((len(leaf_sizes), 5)) rmse_out = np.zeros((len(leaf_sizes), 5)) ind = 0 for leaf_size in leaf_sizes:
import DTLearner as dt import RTLearner as rt import BagLearner as bl import util import matplotlib.pyplot as plt import sys def author(): return 'akarthik3' if __name__ == "__main__": # Get istanbul.csv data and remove unwanted parts data = np.genfromtxt(util.get_learner_data_file('Istanbul.csv'), delimiter=',') data = data[1:, 1:] # Separate data into training and testing (want every data point, split into 60-40 ratio between train/test) dataSize = int(0.6 * data.shape[0]) xTraining = data[:dataSize, 0:-1] yTraining = data[:dataSize, -1] xTesting = data[dataSize:, 0:-1] yTesting = data[dataSize:, -1] # Test 1 trainingRMSEs = np.zeros((100, 1)) testingRMSEs = np.zeros((100, 1)) for size in range(1, 101): learner = dt.DTLearner(size)
from sklearn.datasets import load_digits from sklearn.model_selection import learning_curve from sklearn.model_selection import ShuffleSplit from mpl_toolkits.mplot3d import Axes3D import matplotlib.pyplot as plt if __name__=="__main__": datafile='Istanbul.csv' testX, testY, trainX, trainY = None, None, None, None permutation = None author = None with util.get_learner_data_file(datafile) as f: alldata = np.genfromtxt(f, delimiter=',') # Skip the date column and header row if we're working on Istanbul data if datafile == 'Istanbul.csv': alldata = alldata[1:, 1:] datasize = alldata.shape[0] cutoff = int(datasize * 0.6) permutation = np.random.permutation(alldata.shape[0]) col_permutation = np.random.permutation(alldata.shape[1] - 1) train_data = alldata[permutation[:cutoff], :] # trainX = train_data[:,:-1] trainX = train_data[:, col_permutation] trainY = train_data[:, -1] test_data = alldata[permutation[cutoff:], :] # testX = test_data[:,:-1] testX = test_data[:, col_permutation]
data = removedHeader # remove non-numerical (e.g. date) on the first column if np.isnan(col).all(): removedFirstCol = data[:, 1:] data = removedFirstCol return data if __name__ == "__main__": if len(sys.argv) != 2: print "Usage: python testlearner.py <filename>" sys.exit(1) fileParam = sys.argv[1] data = np.genfromtxt(util.get_learner_data_file(fileParam), delimiter=',') data = remove_header_col(data) # compute how much of the data is training and testing train_rows = int(0.6 * data.shape[0]) test_rows = data.shape[0] - train_rows # separate out training and testing data trainX = data[:train_rows, 0:-1] trainY = data[:train_rows, -1] testX = data[train_rows:, 0:-1] testY = data[train_rows:, -1] # Question 1 in_rmse_result_Q1, out_rmse_result_Q1, in_corr_result_Q1, out_corr_result_Q1 = question_one(
import numpy as np
import math
import LinRegLearner as lrl
import sys
import util
import DTLearner as dt
import RTLearner as rt
import BagLearner as bl

if __name__ == "__main__":
    # Expect exactly one CLI argument: the learner data file name.
    if len(sys.argv) != 2:
        print "Usage: python testlearner.py <filename>"
        sys.exit(1)
    inf = sys.argv[1]
    with util.get_learner_data_file(inf) as f:
        data = np.genfromtxt(f, delimiter=',')
        # Skip the date column and header row if we're working on Istanbul data
        if inf == 'Istanbul.csv':
            data = data[1:, 1:]
    # compute how much of the data is training and testing (60/40 split)
    train_rows = int(0.6 * data.shape[0])
    test_rows = data.shape[0] - train_rows
    # separate out training and testing data; last column is the target Y
    trainX = data[:train_rows, 0:-1]
    trainY = data[:train_rows, -1]
    testX = data[train_rows:, 0:-1]
    testY = data[train_rows:, -1]
    print testX.shape
import util
import numpy as np
import math
import LinRegLearner as lrl
import DTLearner as dt
import BagLearner as bl
import InsaneLearner as it
import RTLearner as rt
import sys
import time

if __name__ == "__main__":
    # Expect exactly one CLI argument: the learner data file name.
    if len(sys.argv) != 2:
        print "Usage: python testlearner.py <filename>"
        sys.exit(1)
    with util.get_learner_data_file(sys.argv[1]) as f:
        data = np.genfromtxt(f, delimiter=',')
        # Skip the date column and header row if we're working on Istanbul data
        if sys.argv[1] == 'Istanbul.csv':
            data = data[1:, 1:]
    # compute how much of the data is training and testing (60/40 split)
    train_rows = int(0.6 * data.shape[0])
    test_rows = data.shape[0] - train_rows
    # separate out training and testing data; last column is the target Y
    trainX = data[:train_rows, 0:-1]
    trainY = data[:train_rows, -1]
    testX = data[train_rows:, 0:-1]
    testY = data[train_rows:, -1]
import DTLearner as dt
import RTLearner as rt
import BagLearner as bl
import InsaneLearner as it
import sys
from util import get_learner_data_file
import matplotlib.pyplot as plt
import pandas as pd
import time

if __name__ == "__main__":
    # Expect exactly one CLI argument: the learner data file name.
    if len(sys.argv) != 2:
        print "Usage: python testlearner.py <filename>"
        sys.exit(1)
    datafile = sys.argv[1]
    with get_learner_data_file(datafile) as f:
        data = np.genfromtxt(f, delimiter=',')
        # Skip the date column and header row if we're working on Istanbul data
        if datafile == 'Istanbul.csv':
            data = data[1:, 1:]
    # compute how much of the data is training and testing (60/40 split)
    train_rows = int(0.6 * data.shape[0])
    test_rows = data.shape[0] - train_rows
    # separate out training and testing data; last column is the target Y
    trainX = data[:train_rows, 0:-1]
    trainY = data[:train_rows, -1]
    testX = data[train_rows:, 0:-1]
    testY = data[train_rows:, -1]
def test_learners(description, group, datafile, seed, outputs, grader): """Test ML models returns correct predictions. Requires test description, test case group, inputs, expected outputs, and a grader fixture. """ points_earned = 0.0 # initialize points for this test case try: learner_class = None kwargs = {'verbose': False} # (BPH) Copied from grade_strategy_qlearning.py #Set fixed seed for repetability np.random.seed(seed) random.seed(seed) # These lines will be uncommented in the batch grader to # prevent accidentally fixing the seed within student # code # tmp_numpy_seed = np.random.seed # tmp_random_seed = random.seed # np.random.seed = fake_seed # random.seed = fake_rseed # Try to import KNNLearner (only once) # if not 'KNNLearner' in globals(): # from KNNLearner import KNNLearner if not 'RTLearner' in globals(): from RTLearner import RTLearner if group is 'BagLearner' and (not 'BagLearner' in globals()): from BagLearner import BagLearner # Tweak kwargs # kwargs.update(inputs.get('kwargs', {})) # Read separate training and testing data files # with open(inputs['train_file']) as f: # data_partitions=list() testX, testY, trainX, trainY = None, None, None, None permutation = None author = None with util.get_learner_data_file(datafile) as f: alldata = np.genfromtxt(f, delimiter=',') # Skip the date column and header row if we're working on Istanbul data if datafile == 'Istanbul.csv': alldata = alldata[1:, 1:] datasize = alldata.shape[0] cutoff = int(datasize * 0.6) permutation = np.random.permutation(alldata.shape[0]) col_permutation = np.random.permutation(alldata.shape[1] - 1) train_data = alldata[permutation[:cutoff], :] # trainX = train_data[:,:-1] trainX = train_data[:, col_permutation] trainY = train_data[:, -1] test_data = alldata[permutation[cutoff:], :] # testX = test_data[:,:-1] testX = test_data[:, col_permutation] testY = test_data[:, -1] if group is "RTLearner": corr_in, corr_out, corr_in_50 = None, None, None def oneleaf(): learner = 
RTLearner(leaf_size=1, verbose=False) learner.addEvidence(trainX, trainY) insample = learner.query(trainX) outsample = learner.query(testX) return insample, outsample, learner.author() def fiftyleaves(): learner = RTLearner(leaf_size=50, verbose=False) learner.addEvidence(trainX, trainY) return learner.query(trainX) predY_in, predY_out, author = run_with_timeout( oneleaf, seconds_per_test_case, (), {}) predY_in_50 = run_with_timeout(fiftyleaves, seconds_per_test_case, (), {}) corr_in = np.corrcoef(predY_in, y=trainY)[0, 1] corr_out = np.corrcoef(predY_out, y=testY)[0, 1] corr_in_50 = np.corrcoef(predY_in_50, y=trainY)[0, 1] incorrect = False msgs = [] if corr_in < outputs['insample_corr_min']: incorrect = True msgs.append( " In-sample with leaf_size=1 correlation less than allowed: got {} expected {}" .format(corr_in, outputs['insample_corr_min'])) else: points_earned += 1.5 if corr_out < outputs['outsample_corr_min']: incorrect = True msgs.append( " Out-of-sample correlation less than allowed: got {} expected {}" .format(corr_out, outputs['outsample_corr_min'])) else: points_earned += 1.5 if corr_in_50 > outputs['insample_corr_max']: incorrect = True msgs.append( " In-sample correlation with leaf_size=50 greater than allowed: got {} expected {}" .format(corr_in_50, outputs['insample_corr_max'])) else: points_earned += 1.0 # Check author string if (author is None) or (author == 'tb34'): incorrect = True msgs.append(" Invalid author: {}".format(author)) points_earned += -1.0 elif group is "BagLearner": corr1, corr20 = None, None def onebag(): learner1 = BagLearner(learner=RTLearner, kwargs={"leaf_size": 1}, bags=1, boost=False, verbose=False) learner1.addEvidence(trainX, trainY) return learner1.query(testX), learner1.author() def twentybags(): learner20 = BagLearner(learner=RTLearner, kwargs={"leaf_size": 1}, bags=20, boost=False, verbose=False) learner20.addEvidence(trainX, trainY) return learner20.query(testX) predY1, author = run_with_timeout(onebag, 
seconds_per_test_case, pos_args=(), keyword_args={}) predY20 = run_with_timeout(twentybags, seconds_per_test_case, (), {}) corr1 = np.corrcoef(predY1, testY)[0, 1] corr20 = np.corrcoef(predY20, testY)[0, 1] incorrect = False msgs = [] if corr20 <= corr1: incorrect = True msgs.append( " Out-of-sample correlation for 20 bags is not greater than for 1 bag. 20 bags:{}, 1 bag:{}" .format(corr20, corr1)) else: points_earned += 2.0 # Check author string if (author is None) or (author == 'tb34'): incorrect = True msgs.append(" Invalid author: {}".format(author)) points_earned += -1.0 if incorrect: inputs_str = " data file: {}\n" \ " permutation: {}".format(datafile, permutation) raise IncorrectOutput, "Test failed on one or more output criteria.\n Inputs:\n{}\n Failures:\n{}".format( inputs_str, "\n".join(msgs)) except Exception as e: # Test result: failed msg = "Description: {} (group: {})\n".format(description, group) # Generate a filtered stacktrace, only showing erroneous lines in student file(s) tb_list = tb.extract_tb(sys.exc_info()[2]) for i in xrange(len(tb_list)): row = tb_list[i] tb_list[i] = (os.path.basename(row[0]), row[1], row[2], row[3] ) # show only filename instead of long absolute path tb_list = [ row for row in tb_list if (row[0] == 'RTLearner.py') or (row[0] == 'BagLearner.py') ] if tb_list: msg += "Traceback:\n" msg += ''.join(tb.format_list(tb_list)) # contains newlines msg += "{}: {}".format(e.__class__.__name__, e.message) # Report failure result to grader, with stacktrace grader.add_result( GradeResult(outcome='failed', points=points_earned, msg=msg)) raise else: # Test result: passed (no exceptions) grader.add_result( GradeResult(outcome='passed', points=points_earned, msg=None))