def classify(strong_hypothesis, example):
    """Classify *example* by a weighted vote of boosted learners.

    strong_hypothesis is an iterable of ``(weight, learner)`` pairs.  Each
    learner is queried through ``id3.classify``; an answer of 1 counts as a
    +1 vote and any other answer as a -1 vote, scaled by the pair's weight.

    Returns 1 when the weighted total is strictly positive, otherwise 0
    (so a tie, or an empty hypothesis, yields 0).
    """
    weighted_total = sum(
        alpha * (1 if id3.classify(tree, example) == 1 else -1)
        for alpha, tree in strong_hypothesis
    )
    if weighted_total > 0:
        return 1
    return 0
def trial_id3(tree, testing_examples):
    """Classify every row of *testing_examples* with *tree*.

    The last element of each row is dropped before the remainder is passed
    to ``id3.classify`` as the feature vector.  Returns the list of
    predicted classes, in row order.
    """
    return [id3.classify(tree, row[:-1]) for row in testing_examples]
def adaboost(training_data, rounds, n_attributes=57, max_depth=2):
    """Train an AdaBoost ensemble of depth-limited ID3 trees.

    Args:
        training_data: sequence of ``(features, label)`` pairs.  When the
            example weights are updated, a label or prediction equal to 0 is
            treated as -1 and anything else as +1.
        rounds: maximum number of boosting rounds to run.
        n_attributes: attribute indices ``0 .. n_attributes - 1`` are offered
            to the tree learner (default 57, the value that used to be
            hard-coded here).
        max_depth: depth limit handed to ``id3.id3_depth_limited``
            (default 2, the value that used to be hard-coded here).

    Returns:
        A list of ``(alpha, learner)`` pairs, one per completed round, in
        training order.  Boosting stops early, without recording the current
        learner, as soon as a learner's weighted error exceeds 0.5.
    """
    m = len(training_data)
    weights = np.ones(m) * 1.0 / m
    learners = []
    alphas = []
    attributes = set(range(n_attributes))

    for _ in range(rounds):
        # Fit the weak learner on a sample drawn according to the weights.
        examples_index = resample(weights, m)
        resampled_examples = [training_data[examples_index[i]] for i in range(m)]
        weak_learner = id3.id3_depth_limited(resampled_examples, attributes, max_depth)

        # The error is measured on the original, un-resampled training set.
        classifications = [(id3.classify(weak_learner, X), y) for X, y in training_data]
        error = sum(
            (weights[i] for i, (predicted, actual) in enumerate(classifications)
             if predicted != actual),
            0.0,
        )
        # Single-argument print emits the same text as the old
        # ``print "Error", error`` statement and also parses on Python 3.
        print("Error %s" % (error,))

        if error == 0.0:
            # log((1 - e) / e) is undefined at e == 0; use a fixed large vote.
            alpha = 4.0
        elif error > 0.5:
            break
        else:
            alpha = 0.5 * np.log((1 - error) / error)

        # Record the learner exactly once, together with its alpha, so that
        # the two lists stay aligned.  (It used to be appended twice per
        # round, which paired every alpha with the wrong tree.)
        alphas.append(alpha)
        learners.append(weak_learner)

        # Shrink the weight of correctly classified examples and grow the
        # weight of mistakes, then renormalise to a distribution.
        for i in range(m):
            h, y = classifications[i]
            h = -1 if h == 0 else 1
            y = -1 if y == 0 else 1
            weights[i] = weights[i] * np.exp(-alpha * h * y)
        sum_weights = sum(weights)
        print("Sum of weights %s" % (sum_weights,))
        weights = [float(w) / sum_weights for w in weights]

    # A concrete list keeps the result re-iterable for repeated classification.
    return list(zip(alphas, learners))
def ensemble(Xtest, ytest, trees):
    """Return the majority-vote error rate of *trees* on a test set.

    Each row ``Xtest[i]`` (for ``i`` below ``Xtest.shape[0]``) is classified
    by every tree through ``id3.classify``; the class that collects the most
    votes is the ensemble prediction for that row.  The result is the number
    of rows whose prediction differs from ``ytest[i]``, divided by
    ``len(Xtest)``, as a float.
    """
    majority = []
    for row in xrange(Xtest.shape[0]):
        sample = Xtest[row]
        tally = {}
        for tree in trees:
            vote = id3.classify(tree, sample)
            tally[vote] = tally.get(vote, 0) + 1
        # max() over the dict picks the class with the highest vote count.
        majority.append(max(tally, key=tally.get))

    mistakes = 0
    for row, guess in enumerate(majority):
        if guess != ytest[row]:
            mistakes += 1
    return float(mistakes) / float(len(Xtest))
def ensemble(Xtest, ytest, trees):
    """Compute the misclassification rate of a majority-vote ensemble.

    For every row index below ``Xtest.shape[0]`` each member of *trees*
    votes via ``id3.classify``; the most frequent vote is taken as the
    prediction.  Returns, as a float, the count of predictions that differ
    from the matching entry of *ytest* divided by ``len(Xtest)``.
    """
    n_rows = Xtest.shape[0]
    winners = []
    for idx in xrange(n_rows):
        features = Xtest[idx]
        counts = {}
        for member in trees:
            label = id3.classify(member, features)
            if label in counts:
                counts[label] += 1
            else:
                counts[label] = 1
        winners.append(max(counts, key=counts.get))

    # Count the rows where the ensemble disagrees with the true label.
    wrong = sum(1 for idx in xrange(len(winners)) if winners[idx] != ytest[idx])
    return float(wrong) / float(len(Xtest))
# Script section: train a model, collect per-example votes and write a CSV.
# The number of rounds/trees comes from sys.argv[3] when present, else 100.
if len(sys.argv) > 3:
    n = int(sys.argv[3])
else:
    n = 100
print 'Learning...'
forest = adaboost(Xtrain, ytrain, n, len(Xtrain), 57)
print 'Classifying...'
# predictions[i] maps each predicted class to the number of votes it got
# for test example i.
predictions = []
ensemble_pred = []
# NOTE(review): `trees` is not assigned anywhere in this fragment (the model
# trained above is bound to `forest`) -- confirm it is defined elsewhere.
for i in xrange(Xtest.shape[0]):
    example = Xtest[i]
    predictions.append({})
    for j in xrange(len(trees)):
        curr_pred = id3.classify(trees[j],example)
        if curr_pred not in predictions[i]:
            predictions[i][curr_pred] = 1
        else:
            predictions[i][curr_pred] += 1
# NOTE(review): nothing visible here appends to `ensemble_pred`, so the loop
# below writes no data rows unless the list is filled elsewhere -- confirm.
print 'Writing to', output_file
with open(output_file, 'wb') as csvfile:
    filewriter = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    # The adjacent literals '|' 'Category' concatenate to '|Category', so the
    # header row has two fields while each data row has three.
    # NOTE(review): confirm whether a comma was intended between them.
    filewriter.writerow(['Id', '|' 'Category'])
    # Ids written to the file are 1-based.
    for index, classification in enumerate(ensemble_pred):
        filewriter.writerow([index+1, '|', classification])
print 'Done writing to', output_file
def classify(self, data):
    """Classify *data* with the stored tree, if the model is ready.

    Returns None when either ``self.dataSet`` or ``self.labels`` is falsy;
    otherwise returns ``id3.classify(self.tree, self.labels, data)``.
    """
    if not self.dataSet or not self.labels:
        return None
    return id3.classify(self.tree, self.labels, data)
'../data/data-splits/data.test', n_features=n_features, preprocessor=preprocessor)
# (The line above is the tail of a call that starts before this fragment.)

# Select the tree depth by 5-fold cross-validation.  Labels are stacked as
# the last column so that features and labels are split together.
cv_data = np.array_split(np.hstack((train_data, train_labels)), 5)
max_acc = 0
opt_depth = 0
# Candidate depths run from 2 up to and including n_features + 1.
for i in range(2, n_features + 2):
    acc = []
    for j in range(len(cv_data)):
        # Fold j is held out; the remaining folds form the training set.
        cv_test = cv_data[j]
        cv_train = np.vstack(cv_data[:j] + cv_data[j + 1:])
        tree, depth = id3(cv_train[:, :-1], cv_train[:, -1], max_depth=i)
        cv_acc = evaluate_tree(cv_test[:, :-1], cv_test[:, -1], tree)
        acc.append(cv_acc)
    avg_acc = np.mean(acc)
    # Strict '>' keeps the smallest depth among equally accurate candidates.
    if avg_acc > max_acc:
        opt_depth = i
        max_acc = avg_acc
# Retrain on the full training set with the selected depth, then report.
tree, depth = id3(train_data, train_labels, max_depth=opt_depth)
train_acc = evaluate_tree(train_data, train_labels, tree)
test_acc = evaluate_tree(test_data, test_labels, tree)
write_output('ID3', opt_depth, max_acc, train_acc, test_acc)
write_predictions('id3', lambda row: classify(row, tree), n_features=n_features, preprocessor=preprocessor)
# Script section: train a model, collect per-example votes and write a CSV.
# The number of rounds/trees comes from sys.argv[3] when present, else 100.
if len(sys.argv) > 3:
    n = int(sys.argv[3])
else:
    n = 100
print 'Learning...'
forest = adaboost(Xtrain, ytrain, n, len(Xtrain), 57)
print 'Classifying...'
# predictions[i] maps each predicted class to the number of votes it got
# for test example i.
predictions = []
ensemble_pred = []
# NOTE(review): `trees` is not assigned anywhere in this fragment (the model
# trained above is bound to `forest`) -- confirm it is defined elsewhere.
for i in xrange(Xtest.shape[0]):
    example = Xtest[i]
    predictions.append({})
    for j in xrange(len(trees)):
        curr_pred = id3.classify(trees[j], example)
        if curr_pred not in predictions[i]:
            predictions[i][curr_pred] = 1
        else:
            predictions[i][curr_pred] += 1
# NOTE(review): nothing visible here appends to `ensemble_pred`, so the loop
# below writes no data rows unless the list is filled elsewhere -- confirm.
print 'Writing to', output_file
with open(output_file, 'wb') as csvfile:
    filewriter = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    # The adjacent literals '|' 'Category' concatenate to '|Category', so the
    # header row has two fields while each data row has three.
    # NOTE(review): confirm whether a comma was intended between them.
    filewriter.writerow(['Id', '|' 'Category'])
    # Ids written to the file are 1-based.
    for index, classification in enumerate(ensemble_pred):
        filewriter.writerow([index + 1, '|', classification])
print 'Done writing to', output_file
test_examples = zip(test_block, test_label_block.T.tolist()[0]) #cross validation for random forest print "Cross Validating Random Forest..." train_size = int(training_block.shape[0]) att_size = int(len(attributes)) forest_size = 100 [ensemble_error, ensemble_pred] = rf.ensemble(test_block, test_label_block, rf.raise_forest(training_block,training_label_block, forest_size, train_size, att_size)) error[0]+= (1.0/k) * ensemble_error #cross validation for decision tree print "Cross Validating Decision Tree..." dec_tree = id3.id3(train_examples, attributes) dec_tree_errors = 0 for i in xrange(len(test_block)): if id3.classify(dec_tree, test_block[i]) != test_label_block[i]: dec_tree_errors += 1 error[1] += (1.0/k) * (float(dec_tree_errors) / set_size) print "Cross Validating AdaBoost..." adaboost_classifier = adaboost.adaboost(train_examples, adaboost_rounds) adaboost_errors = 0 for i in xrange(len(test_block)): if adaboost.classify(adaboost_classifier, test_block[i]) != test_label_block[i]: adaboost_errors += 1 error[2] += (1.0/k) * (float(adaboost_errors) / set_size) print (1-error[0]), (1-error[1]) print 'Estimated accuracy of Random Forest:', (1-error[0])
for _ in range(int(len(data) * 0.2)): tests.append(data.pop()) # create tree sample tree = create_tree(data, attrs, sys.argv[2]) # print tree print_tree(tree) # test classification print '\nTesting sampled records:' good = 0.0 bads = [] for s in tests: try: r = classify(tree, [s])[0] rx = s[sys.argv[2]] valid = '[!]' if r != rx else '' print '{:4}'.format(s[label]), 'classified as:', r, 'actually is', \ rx, valid if r == rx: good += 1.0 except KeyError: bads.append(s[label]) print '--' if bads: print 'Could not classify the following:', ', '.join(bads) print 'Total accuracy: {:.2f}%, {}/{}'.format(100 * good / len(tests), int(good), len(tests))
def predictor(row):
    """Predict the label of *row* as the most common vote among ``trees``.

    Every tree in the enclosing-scope ``trees`` collection classifies the
    row; ``mode`` is applied to the list of votes and the ``[0][0]`` element
    of its result is returned.
    """
    votes = [classify(row, tree) for tree in trees]
    return mode(votes)[0][0]