def cross_vad(examples, num_folds = 10): data = ut.dataCrossSplit(examples, num_folds, False) errorRates = [] for i in range(num_folds): egs = data[i] dt = DTree(SelectAtt) dt.training(egs[0], 1) # calculate error rate error = [0.] * 2 for j in range(2): for x in egs[j]: if dt.predict(x) != x[0]: error[j] = error[j] + 1 error[j] = error[j] / len(egs[j]) print "Fold ", i, " trainingData errorRate: ", error[0], " testData errorRate:", error[1] errorRates.append(error) arr = np.array(errorRates) print "Train Mean ErrorRate:", np.mean(arr[:,0]), " Test Mean ErrorRate:", np.mean(arr[:,1]) print "Train StdVar ErrorRate:", np.sqrt(np.var(arr[:,0])), " Test Mean ErrorRate:", np.sqrt(np.var(arr[:, 1]))
def getWeakLearner(w, egs):
    r"""Return the depth-1 decision stump minimizing the weighted error.

    given:  w    a distribution (weight per example) over egs
            egs  a set of examples (label at index 0, features after it)
    return: the stump y minimizing J = \sum_n w_n I(y(x_n) != t_n),
            or None when egs has no feature columns (previously returned
            the meaningless int 0 in that case).
    """
    best = None
    bestError = float('inf')  # replaces the magic sentinel 3171713.0
    Nfeatures = len(egs[0]) - 1  # column 0 holds the label
    for i in range(Nfeatures):
        # FIX: bind i as a default argument. The original late-binding
        # closure (lambda examples: i + 1) captures i by reference, so a
        # stump whose selector is invoked after this loop finishes would
        # always select the LAST feature, not the one it was trained for.
        dt = DTree(lambda examples, i=i: i + 1)
        dt.training(egs, 1)
        err = weightedError(w, egs, dt)
        if err < bestError:
            bestError = err
            best = dt
    return best
def cross_vad(examples, num_folds = 10): data = ut.dataCrossSplit(examples, num_folds, False) errorRates = [] for i in range(num_folds): egs = data[i] dt = DTree(SelectAtt) dt.training(egs[0], 1) # calculate error rate error = [0.] * 2 for j in range(2): for x in egs[j]: if dt.predict(x) != x[0]: error[j] = error[j] + 1 error[j] = error[j] / len(egs[j]) print "Fold ", i, " trainingData errorRate: ", error[0], " testData errorRate:", error[1] errorRates.append(error) arr = np.array(errorRates) print "Train Mean ErrorRate:", np.mean(arr[:,0]), " Test Mean ErrorRate:", np.mean(arr[:,1]) print "Train StdVar ErrorRate:", np.sqrt(np.var(arr[:,0])), " Test Mean ErrorRate:", np.sqrt(np.var(arr[:, 1])) if __name__ == '__main__': filename = sys.argv[1] egs = ut.importRawData(filename) egs = ut.preprocess(egs) cross_vad(egs) dt = DTree(SelectAtt) dt.training(egs) print "========= the decision tree ============" dt.printTree()