def main():
    """Run the AdaBoost homework experiments (Q12, Q13, Q17, Q18).

    Loads the hw2 train/test CSVs, fits a single decision stump and a
    300-round AdaBoost ensemble, and prints the requested error figures.
    Returns 0 on completion.
    """
    data = np.loadtxt(open("/Users/rio512hsu/dataset/MachineLearningTechniques" +
                           "/hw2_adaboost_train.csv", "rb"), delimiter=" ")
    X = data[:, :-1]
    y = data[:, -1]

    # Uniform initial example weights for the lone stump.
    u = np.ones(X.shape[0]) / X.shape[0]
    clf = DecisionStump().fit(X, y, u)

    # Q12: in-sample error of the single decision stump.
    print(clf.getEin())

    # Q13: number of in-sample mistakes of the 300-round ensemble.
    adaboost = AdaBoost(DecisionStump).fit(X, y, 300)
    print(np.sum(adaboost.predict(X) != y))

    # Q17: out-of-sample error of the single stump.
    test = np.loadtxt(open("/Users/rio512hsu/dataset/" +
                           "MachineLearningTechniques/" +
                           "hw2_adaboost_test.csv"), delimiter=' ')
    X_test = test[:, :-1]
    y_test = test[:, -1]
    # BUG FIX: the original counted mistakes on the *training* set
    # (clf.predict(X) != y) but divided by the *test* set size; Q17 asks
    # for the stump's test-set error, so evaluate on X_test / y_test.
    print(np.sum(clf.predict(X_test) != y_test) / float(test.shape[0]))

    # Q18: out-of-sample error of the AdaBoost ensemble.
    print(np.sum(adaboost.predict(X_test) != y_test) / float(test.shape[0]))
    return 0
def adaboost_avg_run(max_classes, avg_num_of_run, training_set, testing_set):
    """Average train/test/sklearn error rates for growing AdaBoost ensembles.

    Sweeps the weak-classifier count from 1 to max_classes in steps of two;
    each setting is repeated avg_num_of_run times and the averaged error
    rates are packed into ErrorWrapper records.

    Returns the list of ErrorWrapper records, one per ensemble size.
    """
    all_error_list = []
    testing_error_list = []

    # Split attribute vectors from class labels (scikit-learn convention:
    # *_x is the attribute matrix, *_y the label vector). Datasets may put
    # the class column anywhere, so the helper does the separation.
    train_x, train_y = split_attribute_and_label(training_set)
    test_x, test_y = split_attribute_and_label(testing_set)

    # Each boosting round trains its weak learner on a 20% subset.
    train_subset_num = int(len(train_y) * 0.2)

    for cl in range(1, max_classes + 1, 2):
        train_error, testing_error, scikit_error = [], [], []

        for _ in range(avg_num_of_run):
            # Custom AdaBoost over perceptron-style weak learners.
            ada_obj = AdaBoost(cl, train_subset_num, THRESHOLD, ETA,
                               UPPER_BOUND, ETA_WEIGHTS, False)
            ada_obj.fit(train_x, train_y)

            train_mistakes = ada_obj.xor_tuples(train_y, ada_obj.predict(train_x))
            train_error.append(classifier_error_rate(train_mistakes))

            test_mistakes = ada_obj.xor_tuples(test_y, ada_obj.predict(test_x))
            testing_error.append(classifier_error_rate(test_mistakes))

            # Reference run: sklearn AdaBoost over sklearn perceptrons.
            weak = perceptron.Perceptron(max_iter=UPPER_BOUND, verbose=0,
                                         random_state=None,
                                         fit_intercept=True, eta0=ETA)
            bdt = AdaBoostClassifier(weak, algorithm="SAMME", n_estimators=cl)
            bdt.fit(train_x, train_y)
            scikit_error.append(calculate_error(test_y, bdt.predict(test_x)))

        errors = ErrorWrapper(cl,
                              sum(train_error) / len(train_error),
                              sum(testing_error) / len(testing_error),
                              sum(scikit_error) / len(scikit_error))
        all_error_list.append(errors)

        print("Train avg for %s %s" % (cl, errors.train_error))
        print("Testing avg for %s %s" % (cl, errors.test_error))
        testing_error_list.append(
            (sum(testing_error) / len(testing_error)) * 100)
        print("Scikit adaboost avg for %s %s" % (cl, errors.scikit_error))

    return all_error_list
def testTitanicCARTAdaBoost():
    """Fit AdaBoost over weighted CART classifiers on Titanic data and print accuracy."""
    print('-' * 30, '\ntestTitanicCARTAdaBoost\n', '-' * 30)

    frame = pd.read_csv('Titanic_dataset/train.csv')

    # Drop identifier-like / unusable columns, then rows with missing values.
    for col in ("PassengerId", "Name", "Ticket", "Cabin"):
        frame.pop(col)
    frame = frame.dropna()

    # Convert the non-numeric columns: one-hot encode Sex, integer-code Embarked.
    frame = pd.get_dummies(frame, columns=['Sex'])
    port_codes = {port: code for code, port in enumerate(frame['Embarked'].unique())}
    frame['Embarked'] = frame['Embarked'].map(port_codes)
    if DEBUG:
        print(frame[:5])

    # 40% random subsample for fitting; the full frame is kept for evaluation.
    fit_frame = frame.sample(frac=0.4)

    # "Survived" is the label; remap {0, 1} -> {-1, +1} for the booster.
    eval_labels = frame.pop("Survived")
    eval_labels[eval_labels == 0] = -1
    fit_labels = fit_frame.pop("Survived")
    fit_labels[fit_labels == 0] = -1

    # Train a 10-round ensemble of weighted CART classifiers.
    booster = AdaBoost(CART_weight_classifier, 10)
    booster.fit(fit_frame, fit_labels)

    # Score on the full (cleaned) dataset.
    pred = booster.predict(frame)
    print('Acc.: ', np.sum(pred == eval_labels.reset_index(drop=True)) / eval_labels.shape[0])
def main():
    """Fit AdaBoost on a tiny 2-D toy dataset and print predictions for two points."""
    features = np.array([[1.0, 2.1],
                         [2.0, 1.1],
                         [1.3, 1.0],
                         [1.0, 1.0],
                         [2.0, 1.0]])
    # Column-vector labels in {-1, +1}.
    labels = np.array([1.0, 1.0, -1.0, -1.0, 1.0]).reshape((-1, 1))

    booster = AdaBoost(verbose=1)
    booster.fit(features, labels)

    queries = np.array([[5, 5],
                        [0, 0]])
    print('predict result: ', booster.predict(queries))
# KNN scikit-learn knn_begin = time.time() clf2 = KNeighborsClassifier(n_neighbors=5) clf2.fit(features_train, labels_train) pred = clf2.predict(features_test) accuracy = accuracy_score(labels_test, pred) knn_end = time.time() print('====== sklearn KNN ======') print('The sklearn KNN classification accuracy = {}'.format(accuracy)) print('Training and prediction time = {}'.format(knn_end - knn_begin)) # AdaBoost custom implementation custom_adaboost_begin = time.time() custom_adaboost = AdaBoost(num_of_hypotheses=100) custom_adaboost.fit(features_train, labels_train) pred = custom_adaboost.predict(features_test) accuracy = accuracy_score(labels_test, pred) custom_adaboost_end = time.time() print('====== Custom AdaBoost ======') print('The custom AdaBoost classification accuracy = {}'.format(accuracy)) print('Training and prediction time = {}'.format(custom_adaboost_end - custom_adaboost_begin)) # AdaBoost scikit-learn adaboost_begin = time.time() clf3 = AdaBoostClassifier(n_estimators=100) clf3.fit(features_train, labels_train) pred = clf3.predict(features_test) accuracy = accuracy_score(labels_test, pred) adaboost_end = time.time() print('====== sklearn AdaBoost ======') print('The sklearn AdaBoost classification accuracy = {}'.format(accuracy))
algorithm='SAMME', n_estimators=no_base_classifiers, learning_rate=1.0)
# NOTE(review): the line above completes a call whose opening is outside
# this view — presumably `boost = AdaBoostClassifier(...)`, since `boost`
# is fit/predicted below.

## CV: k-fold comparison of the project boosting implementation `f`
## against sklearn's AdaBoost (`boost`).
kf = KFold(n_splits=no_folds)
cv_acc_arr = []     # per-fold accuracy, custom implementation
cv_sk_acc_arr = []  # per-fold accuracy, sklearn implementation
i = 0
for train_ind, test_ind in kf.split(X_train):
    print("cross split no", i)
    # .copy() before fancy-indexing so the folds never alias X_train/y_train
    x_tr, x_te = X_train.copy()[train_ind], X_train.copy()[test_ind]
    y_tr, y_te = y_train.copy()[train_ind], y_train.copy()[test_ind]

    # custom implementation: re-initialize on this fold, then boost
    f.init(x_tr, y_tr)
    f.train(no_base_classifiers)
    y_predict = f.predict(x_te)
    accuracy = np.mean(y_predict == y_te)
    cv_acc_arr.append(accuracy)

    ## comparing sklearn implementation of boost
    boost.fit(x_tr, y_tr)
    y_pred = boost.predict(x_te)
    accuracy_sk = np.mean(y_pred == y_te)
    cv_sk_acc_arr.append(accuracy_sk)
    i += 1

# mean CV accuracy: custom first, then sklearn
print(np.mean(cv_acc_arr))
print(np.mean(cv_sk_acc_arr))
# Train/test dispatch for the boosting model, then for KNN.
# NOTE(review): trainOrTest, train_test_file, model_file, model and knn are
# defined outside this chunk — presumably parsed from the command line.
if trainOrTest == 'train':
    myBoost = AdaBoost(300, verbose=False)
    TrainX, TrainY, TrainXID = myBoost.getDataFromFile(train_test_file)
    myBoost.train(TrainX, TrainY)
    # persist the trained model for a later 'test' run
    pk.dump(myBoost, open(model_file, 'wb'))
if trainOrTest == 'test':
    try:
        myBoost = pk.load(open(model_file, 'rb'))
    except:
        # NOTE(review): bare except hides the real failure; and if the load
        # fails, `myBoost` stays undefined so the isTrained check below
        # raises NameError — consider re-raising or returning here.
        print("output file has not been generated")
    if myBoost.isTrained:
        Xtest, yTest, XtestID = myBoost.getDataFromFile(train_test_file)
        finalPredictions = myBoost.predict(Xtest)
        myBoost.writeToFile(XtestID, finalPredictions, 'output.txt')
        print("Accuracy is: ", sum(finalPredictions == yTest) / len(yTest))
    else:
        print("Untrained model being tested")

# usage examples:
#   train train-data.txt knn_model.txt knn
#   test test-data.txt knn_model.txt knn
if model == 'knn':
    if trainOrTest == 'train':
        knn.train(train_test_file, model_file)
    if trainOrTest == 'test':
        try:
            # NOTE(review): this try-block continues past the visible chunk.
            myKnn = open(model_file, 'rb')
# Train AdaBoost (ab) and naive Bayes (nb) on the dataset, score both on the
# held-out validation set, then start building predictions for the test file.
# NOTE(review): dataset, validation_set_size, ab, nb, time_before/time_after
# and testset are defined outside this chunk.
train_set, validation_set = split_train_validation(dataset, validation_set_size)

# effectively "use everything": cap far larger than any realistic dataset
num_to_train_on = 10000000

time_before("training adaboost")
ab.train_set(dataset[:num_to_train_on])
time_after("training adaboost")

time_before("training naive bayes")
nb.train_set(dataset[:num_to_train_on])
time_after("training naive bayes")

# Per-example booleans: did each model predict the true label i[0]
# from the remaining fields i[1:]?
kg_validations_nb = []
kg_validations_ab = []
for i in validation_set:
    kg_validations_nb.append(nb.predict(*i[1:]) == i[0])
    kg_validations_ab.append(ab.predict(*i[1:]) == i[0])

# print("Errors nb: %s " % sum([0 if i else 1 for i in kg_validations_nb]))
print("Errors ab: %s " % sum([0 if i else 1 for i in kg_validations_ab]))
# import pdb; pdb.set_trace()

predictions = []
print("creating predictions...")
with open(testset, "r") as testfile:
    data = testfile.read()
# skip the header row, then cap the number of lines processed
lines = data.split('\n')[1:][:num_to_train_on]
for line in lines:
    if not line:
        continue
# encoding=utf-8
# @Author: wendesi
# @Date: 15-11-16
# @Email: [email protected]
# @Last modified by: wendesi
# @Last modified time: 15-11-16

import logging

from generate_dataset import *
from adaboost import AdaBoost
from sklearn.metrics import accuracy_score

if __name__ == '__main__':
    # Driver: train the custom AdaBoost on a 200-sample synthetic dataset
    # and report its test-set accuracy.
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    train_features, train_labels, test_features, test_labels = generate_dataset(
        200)

    ada = AdaBoost()
    ada.train(train_features, train_labels)
    # BUG FIX: Python-2 print statements converted to print() calls, matching
    # the Python-3 style used by the rest of this file. Runtime strings are
    # kept byte-for-byte (including the original typos).
    print('end train')

    test_predict = ada.predict(test_features)
    score = accuracy_score(test_labels, test_predict)
    print("ada boost the accruacy socre is ", score)