def test_adaboost(self):
    X, Y = ada.parse_spambase_data("tiny.spam.train")
    Y2 = ada.new_label(Y)
    trees, weights = ada.adaboost(X, Y2, 2)
    self.assertEqual(len(trees), 2)
    self.assertEqual(len(weights), 2)
    self.assertTrue(
        isinstance(trees[0], sklearn.tree.DecisionTreeClassifier))
    # A single depth-1 stump can separate this toy set perfectly.
    x = np.array([[0, -1], [1, 0], [-1, 0]])
    y = np.array([-1, 1, 1])
    trees, weights = ada.adaboost(x, y, 1)
    h = trees[0]
    pred = h.predict(x)
    for i in range(len(y)):
        self.assertEqual(pred[i], y[i])
def test_adaboost_predict(self):
    x = np.array([[0, -1], [1, 0], [-1, 0]])
    y = np.array([-1, 1, 1])
    trees, weights = ada.adaboost(x, y, 1)
    pred = ada.adaboost_predict(x, trees, weights)
    for i in range(len(y)):
        self.assertEqual(pred[i], y[i])
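# The two tests above pin down an interface without showing it: ada.adaboost()
# returns per-round trees and weights, and ada.adaboost_predict() turns them
# into labels. Below is a minimal sketch of what such a module could contain,
# assuming discrete AdaBoost over depth-1 sklearn trees and labels in {-1, +1};
# the names and internals are inferred from the tests, not from the real module.
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def adaboost(X, y, num_iter):
    """Boost num_iter decision stumps; returns (trees, tree_weights)."""
    n = len(y)
    w = np.full(n, 1.0 / n)                 # start with uniform sample weights
    trees, tree_weights = [], []
    for _ in range(num_iter):
        h = DecisionTreeClassifier(max_depth=1)
        h.fit(X, y, sample_weight=w)
        miss = h.predict(X) != y
        err = np.dot(w, miss) / w.sum()
        alpha = np.log((1.0 - err) / max(err, 1e-16))  # guard err == 0
        w = w * np.exp(alpha * miss)        # up-weight misclassified samples
        trees.append(h)
        tree_weights.append(alpha)
    return trees, tree_weights

def adaboost_predict(X, trees, tree_weights):
    """Sign of the weighted vote of all boosted trees."""
    score = sum(a * h.predict(X) for h, a in zip(trees, tree_weights))
    return np.sign(score)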
def main():
    filename = sys.argv[1]
    hypothesisFile = sys.argv[2]
    learning_type = sys.argv[3]

    input_data = []
    print("opening training file", filename)
    with open(filename, "r", encoding="utf8") as file:
        for line in file.readlines():
            input_data.append(line.rstrip())
    training_dataset = get_attributes(input_data)
    # Every column except the last (the label) is an attribute index.
    attribute_list = [i for i in range(len(training_dataset[0]) - 1)]

    if learning_type == "dt":
        print("Calling decisionTree...")
        root = decisiontree.decision_tree(training_dataset, attribute_list, depth=5)
        print("Decision Tree Model ready..")
    elif learning_type == "ada":
        print("Calling Adaboost...")
        root = adaboost.adaboost(training_dataset, attribute_list, K=8)
        print("Adaboost Model ready..")
    else:
        print("Invalid option!")
        sys.exit()

    # Serialize the trained model so it can be reloaded at prediction time.
    with open(hypothesisFile, "wb") as output_file:
        pickle.dump(root, output_file)
def train(self):
    """
    Train with AdaBoost: decision trees (max depth = 10) are used as the
    base classifiers and boosted for 200 rounds, matching the call below.
    """
    self.boostedTrees, self.adaptiveParams = adaboost(self.trainDat, 200)
    # Persist the ensemble and its adaptive weights for later use.
    with open("proc_data/boostedTreesFinalChal.pkl", "wb") as fp1:
        pkl.dump(self.boostedTrees, fp1)
    with open("proc_data/adaptiveParamsFinalChal.pkl", "wb") as fp2:
        pkl.dump(self.adaptiveParams, fp2)
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
# `adaboost` is assumed to be imported from the accompanying boosting module.

N = 200
mat = sio.loadmat('../mnist.mat')
print(mat['test_X'].shape[0])

# Shuffle indices so the train/test subsamples are drawn at random.
chooseTrain = np.random.permutation(mat['train_X'].shape[0])
chooseTest = np.random.permutation(mat['test_X'].shape[0])
maxIter = 20
train_X = mat['train_X']
train_Y = mat['train_Y']
test_X = mat['test_X']
test_Y = mat['test_Y']
print(train_X.shape)
print(train_Y.shape)
print(chooseTrain.shape)

# Boost on 10000 random training points, evaluate on 1000 random test points.
e_train, e_test, maxIter = adaboost(train_X[chooseTrain[0:10000], :],
                                    train_Y[0, chooseTrain[0:10000]],
                                    test_X[chooseTest[0:1000], :],
                                    test_Y[0, chooseTest[0:1000]],
                                    maxIter)
print(e_train)
print(e_test)

# Plot train/test error per boosting iteration: dotted blue for train,
# solid black for test, dashed blue for the round-0 baselines.
t = range(maxIter)
plt.plot(t, e_train, 'b:',
         t, e_test, 'k-',
         t, e_train[0] * np.ones(maxIter), 'b--',
         t, e_test[0] * np.ones(maxIter), 'b--')
plt.show()
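# A hedged sketch of where the e_train/e_test curves above could come from,
# reusing the two-argument adaboost()/adaboost_predict() sketch shown after
# the tests earlier (assumed interface with labels in {-1, +1}; the original
# implementation, which returns the curves directly, may differ):
def error_curves(train_X, train_Y, test_X, test_Y, max_iter):
    trees, weights = adaboost(train_X, train_Y, max_iter)
    e_train, e_test = [], []
    for k in range(1, max_iter + 1):
        # Error rate of the partial ensemble built from the first k rounds.
        e_train.append(np.mean(adaboost_predict(train_X, trees[:k], weights[:k]) != train_Y))
        e_test.append(np.mean(adaboost_predict(test_X, trees[:k], weights[:k]) != test_Y))
    return e_train, e_test, max_iter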
parser.add_argument(
    '--test_users', action=readFile,
    help='Indexes in the trained model. Type = list, in json file')
parser.add_argument(
    '-ai', '--available_items', default=slice(None), action=readFile,
    help='Indexes in the trained model. Type = list, in json file')
# parser.add_argument('-bs', '--batch_size', type=int, default=5000, help='Size of user batch')
# parser.add_argument('-th', '--thread', type=int, default=6, help='# of threads for multi processing')
args = parser.parse_args()

if args.command == "train":
    ensemble = adaboost(args.opt_data, args.data, args.n_iter,
                        saveTime=args.save_time, modelList=args.model)
    # retrain: ensemble = ensemble
    with open("data/ensemble2", "wb") as f:
        pickle.dump(ensemble, f)
elif args.command == "test":
    recall = get_recall(args.model, args.data, args.opt_data, args.recall_at,
                        args.test_users, args.available_items, args.in_out)
    print(recall)
elif args.command == "rec":
    topidx = getRecList(args.model, args.n_rec, args.opt_data,
                        args.test_users, args.available_items)
    print(topidx.shape)
# =============================================================================
# ################ Main Function ################
# =============================================================================
maxDepth = 20                             # maximum decision-tree depth
fileName1 = "pa3_train_reduced_bo.csv"    # training file name
fileName2 = "pa3_valid_reduced.csv"       # validation file name
warnings.filterwarnings("error")

print("\n ------------ ImportDaTa ------------")
trainData = hp.importCsv(fileName1)
validateData = hp.importCsv(fileName2)

for l in [1, 5, 10, 20]:
    # for l in [1]:
    print("\n ------------ Adaboost-{0} ------------{1}".format(l, datetime.datetime.now()))
    adaClass = ada.adaboost(ftrNum=trainData.shape[1] - 1, depth=1,
                            lNum=l, dataNum=trainData.shape[0])
    adaClass.runAdaboost(df=trainData)
    print(adaClass.computeFinalAccNumRate(df=trainData))

# for d, m, n in [(9, 20, 1), (9, 20, 2), (9, 20, 5), (9, 20, 10), (9, 20, 25)]:
# for d, m, n in [(9, 50, 1), (9, 50, 2), (9, 50, 5), (9, 50, 10), (9, 50, 25)]:
# for d, m, n in [(9, 10, 1), (9, 10, 2), (9, 10, 5), (9, 10, 10), (9, 10, 25)]:
#     print("\n ------------ Build Forest{0} ------------{1}".format(n, datetime.datetime.now()))
#     ftClass = ft.randomForest(treeNum=n, ftrNum=m, depth=d, dataNum=trainData.shape[0])
#     ftClass.buildRandomForest(df=trainData)
#     ftClass.predicDataResult(df=trainData)
# print("\n ------------ Build DT ------------{0}".format(datetime.datetime.now()))
# dtClass = dt.decesionTree(maxDepth, trainData.shape[0])
# root1 = root2 = cur = Node((None, None, dtClass.getLabelFromLargeData(trainData)))
# cl, cr = dtClass.getResultInfo(trainData)
train_size = int(training_block.shape[0])
att_size = int(len(attributes))
forest_size = 100
[ensemble_error, ensemble_pred] = rf.ensemble(
    test_block, test_label_block,
    rf.raise_forest(training_block, training_label_block,
                    forest_size, train_size, att_size))
error[0] += (1.0 / k) * ensemble_error

# cross validation for decision tree
print("Cross Validating Decision Tree...")
dec_tree = id3.id3(train_examples, attributes)
dec_tree_errors = 0
for i in range(len(test_block)):
    if id3.classify(dec_tree, test_block[i]) != test_label_block[i]:
        dec_tree_errors += 1
error[1] += (1.0 / k) * (float(dec_tree_errors) / set_size)

print("Cross Validating AdaBoost...")
adaboost_classifier = adaboost.adaboost(train_examples, adaboost_rounds)
adaboost_errors = 0
for i in range(len(test_block)):
    if adaboost.classify(adaboost_classifier, test_block[i]) != test_label_block[i]:
        adaboost_errors += 1
error[2] += (1.0 / k) * (float(adaboost_errors) / set_size)

print(1 - error[0], 1 - error[1])
print('Estimated accuracy of Random Forest:', (1 - error[0]))
print('Estimated accuracy of Decision Tree:', (1 - error[1]))
print('Estimated accuracy of AdaBoost:', (1 - error[2]))
    This method is specific to the format of the classifiers
    """
    output = {key: [] for key in classifiers}
    N = len(X)
    for cf in classifiers:
        for i in range(N):
            cf_classification = cf[2](X[i])
            if cf_classification != Y[i]:
                # output[cf].append(X[i])
                output[cf].append(adaboost.key_from_value(ids_to_points, X[i]))
    return output

digits_classifiers = digits_make_classifiers(X, y)
digits_ids_to_points = adaboost.make_point_identifiers(X)
digits_classifiers_to_misclassified = digits_make_classifiers_to_misclassified(
    X, y, digits_classifiers, digits_ids_to_points)
digits_points = list(digits_ids_to_points.keys())
resulting_classifier = adaboost.adaboost(digits_points,
                                         digits_classifiers_to_misclassified,
                                         max_num_rounds=20)
print('resulting_classifier', resulting_classifier)
features_chosen = [i[0][0] for i in resulting_classifier]
print('features_chosen', features_chosen)

# Testing
# feature_test = lambda x, cutoff: x[0] > cutoff
# print(test_feature(feature_test, X, y, 0))
# print(sum(y))
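# The adaboost() call above takes a list of point ids and a mapping from
# classifier to the ids it misclassifies, and its result is indexed as
# i[0][0], suggesting a list of (classifier, voting_power) pairs. A minimal
# sketch of AdaBoost in that representation, inferred from usage alone (the
# real adaboost module may differ; the name is hypothetical to avoid
# shadowing the module above):
import math

def adaboost_sketch(points, classifiers_to_misclassified, max_num_rounds):
    weights = {p: 1.0 / len(points) for p in points}   # uniform start
    ensemble = []                                      # (classifier, voting_power)
    for _ in range(max_num_rounds):
        # Weighted error of a classifier = total weight of points it misses.
        errors = {c: sum(weights[p] for p in missed)
                  for c, missed in classifiers_to_misclassified.items()}
        # Pick the classifier whose error is farthest from chance (1/2).
        best = max(errors, key=lambda c: abs(errors[c] - 0.5))
        err = errors[best]
        if err == 0.5:
            break                                      # no better than chance
        eps = 1e-16                                    # guard log(0)
        ensemble.append((best, 0.5 * math.log((1 - err + eps) / (err + eps))))
        if err in (0.0, 1.0):
            break                                      # perfect(ly wrong): stop
        missed = set(classifiers_to_misclassified[best])
        # Reweight so misclassified and correct points each carry half the mass.
        for p in points:
            weights[p] *= 1 / (2 * err) if p in missed else 1 / (2 * (1 - err))
    return ensemble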