def eval_tree_once(filename, train_size, test_size, replacement, ind_vars,
                   target_var, max_depth):
    """
    Evaluate a tree by sampling the training and testing data sets once.

    The input data is partitioned via partition_data, a tree is built from
    'data/my_train.csv', dumped to 'trees/my_tree.tree' for inspection, and
    then used to write predictions for both the 'my_test' and 'my_train'
    data sets.

    Arguments:
        filename, train_size, test_size, replacement -- forwarded unchanged
            to partition_data.
        ind_vars -- dict of independent variables keyed by (name, kind)
            tuples. It is temporarily extended with
            ('PassengerId', 'continuous') while predictions are written and
            that key is removed again before returning, even on error.
        target_var -- target variable description, forwarded to construct
            and calc_performance.
        max_depth -- depth limit forwarded to construct.

    Returns a tuple (cross_val_error, res_error): the error for cross
    validation (SSE with respect to the test set) and the error for
    evaluating residuals (SSE with respect to the train set), both as
    computed by calc_performance.
    """
    # NOTE(review): presumably this writes the my_train / my_test samples
    # read by the steps below -- confirm against partition_data.
    partition_data(filename, train_size, test_size, replacement)

    # Build tree from train data. The context manager closes the file even
    # if construct raises.
    with open('data/my_train.csv', 'r') as train_file:
        tree = construct(train_file, ind_vars, target_var, max_depth)

    # Write tree, just to see what the tree looks like.
    with open('trees/my_tree.tree', 'w') as tree_file:
        write_tree(tree, tree_file)

    # Append PassengerId as a variable for prediction purposes.
    passenger_id = ('PassengerId', 'continuous')
    ind_vars[passenger_id] = None
    try:
        # Make predictions on both test and train data sets.
        write_predictions('my_test', ind_vars, tree)
        write_predictions('my_train', ind_vars, tree)
    finally:
        # Remove PassengerId from ind_vars, also when a prediction step
        # fails, so the caller's dict is never left modified.
        ind_vars.pop(passenger_id, None)

    # Calculate errors for both test and train data sets.
    cross_val_error = calc_performance('my_test', target_var)
    res_error = calc_performance('my_train', target_var)
    return cross_val_error, res_error
('Pclass', 'categorical'): ['1', '2', '3'], ('Embarked', 'categorical'): ['C', 'S', 'Q'], ('Title', 'categorical'): titles, ('Ticket_Code', 'categorical'): ticket_codes, } target_var = ['Survived', 'categorical', '0', '1'] max_depth = 100 # Build tree! f = file('data/%s.csv' % filename_train, 'r') tree = construct(f, ind_vars, target_var, max_depth) f.close() # Write tree to file. Not necessary, but nice to have. f = file('trees/%s.tree' % filename_tree, 'w') write_tree(tree, f) f.close() # Append PassengerId for prediction purposes. f = file('data/%s.csv' % filename_test, 'r') ind_vars[('PassengerId', 'continuous')] = None data = get_data(f, ind_vars) var_dict = simplify_var_dict(ind_vars, None) f.close() # Output target variable predictions to csv. f = file('predictions/%s.csv' % filename_predictions, 'w') f.write('PassengerId,%s\n' % target_var[0]) for datum in data: distribution = tree.predict(datum, var_dict) write_prediction(