Example #1
def Q3():  # AdaBoost
    val_error = []
    train_error = []
    for T in range(1, 205, 5):
        adaboost = aba.AdaBoost(DecisionStump, T)
        adaboost.train(X_train, y_train)
        train_error.append(adaboost.error(X_train, y_train))
        val_error.append(adaboost.error(X_val, y_val))

    plot(list(range(1, 205, 5)), train_error)
    plot(list(range(1, 205, 5)), val_error)
    xlabel("Iteration_num")
    ylabel("error")
    legend(["Training Error", "Validation Error"], loc=5)
    show()

    figure(1)
    ion()
    for index, T in enumerate([1, 5, 10, 50, 100, 200]):
        adaboost = aba.AdaBoost(DecisionStump, T)
        adaboost.train(X_train, y_train)
        subplot(2, 3, index + 1)
        decision_boundaries(adaboost, X_train, y_train, "Iteration: " + str(T))

    pause(8)

    T_values = list(range(1, 205, 5))
    # map the index of the lowest validation error back to its actual T value
    best_iteration = T_values[int(np.argmin(val_error))]
    print(best_iteration)
    ab = aba.AdaBoost(DecisionStump, best_iteration)
    ab.train(X_train, y_train)
    print(ab.error(X_test, y_test))
    return
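Most of the snippets on this page call a course-style AdaBoost class. A minimal sketch of the interface they appear to assume, reconstructed from the calls only (it is an assumption, not any of the original aba/adaboost modules, and the weak-learner constructor signature WL(D, X, y) is likewise assumed):

import numpy as np

class AdaBoostSketch:
    def __init__(self, WL, T):
        self.WL, self.T = WL, T          # weak-learner class and number of boosting rounds
        self.h, self.w = [], []          # trained weak learners and their vote weights

    def train(self, X, y):
        D = np.ones(len(y)) / len(y)     # start from uniform sample weights
        for _ in range(self.T):
            h = self.WL(D, X, y)         # assumed weak-learner constructor signature
            pred = h.predict(X)
            eps = float(np.sum(D[pred != y]))
            w = 0.5 * np.log((1 - eps) / max(eps, 1e-12))
            D = D * np.exp(-w * y * pred)
            D /= D.sum()
            self.h.append(h)
            self.w.append(w)
        return D                         # final sample weights, used by some snippets for plotting

    def predict(self, X, max_t=None):
        max_t = self.T if max_t is None else max_t
        votes = sum(self.w[i] * self.h[i].predict(X) for i in range(max_t))
        return np.sign(votes)

    def error(self, X, y, max_t=None):
        # misclassification rate, optionally of the ensemble truncated to max_t learners
        return float(np.mean(self.predict(X, max_t) != y))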
Example #2
def main():

    mode, filename, modelFile, model = sys.argv[1:]

    # Driver
    if mode == "train":
        if model == "nearest" or model == "best":
            knn = KNN()
            knn.train(filename)
        elif model == "adaboost":
            ada = adaboost.AdaBoost()
            ada.train(filename)
            save_obj(ada, modelFile)
        elif model == "nnet":
            pass
        else:
            print("Incorrect model.........exiting")
            sys.exit(1)

    elif mode == "test":
        if model == "nearest" or model == "best":
            knn = KNN()
            knn.classify(filename, modelFile)
        elif model == "adaboost":
            ada = load_obj(modelFile)
            ada.classify(filename, modelFile)
        elif model == "nnet":
            pass
        else:
            print("Incorrect model....exiting")
            sys.exit(1)

    else:
        print("Incorrect mode......exiting")
        sys.exit(1)
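The driver above unpacks exactly four positional command-line arguments. A hedged illustration of how it could be invoked (the script name and file names are placeholders, not taken from the original project):

# Hypothetical invocations (script and file names are placeholders):
#   python classify.py train train-data.txt adaboost_model.p adaboost
#   python classify.py test  test-data.txt  adaboost_model.p adaboost
# sys.argv[1:] then unpacks as mode, filename, modelFile, model.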
Example #3
    def testWineData(self):
        trn, tst = load_wine_data()
        trn_X, trn_y = trn
        tst_X, tst_y = tst

        td = training_utils.TrainingData(trn_X, trn_y)
        XX, yy = td.get_training_sample()
        classifier = adaboost.AdaBoost(rank1_metric.Rank1_Metric)
        classifier.set_training_sample(XX, yy)
        T = 20
        classifier.train(T, 1)
        f = metric_learning.boosted_dist(classifier)

        knn = knn_utils.KNN(3)
        knn.dist_func = f
        knn.set_training_data(trn_X, trn_y)
        # test on training
        knn.compute_distance(trn_X)
        predicted_y = numpy.array(knn.classify(True))
        accuracy_trn = compute_accuracy(predicted_y, trn_y)
        # test on testing
        knn.compute_distance(tst_X)
        predicted_y = numpy.array(knn.classify(False))
        accuracy_tst = compute_accuracy(predicted_y, tst_y)

        print accuracy_trn, accuracy_tst
        self.failUnless(min(accuracy_trn, accuracy_tst) > .95)
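compute_accuracy is not defined in this snippet; a minimal sketch of what it presumably computes (an assumption, not the original helper):

import numpy

def compute_accuracy(predicted_y, true_y):
    # fraction of predictions that match the ground-truth labels
    return float(numpy.mean(numpy.asarray(predicted_y) == numpy.asarray(true_y)))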
Example #4
def Cascade(trn, max_level):
    if len(trn) > 2:
        X = array([x for x, y in trn])
        Y = array([y for x, y in trn])
    else:
        X, Y = trn
    T = 512
    classifiers = []

    td = training_utils.TrainingData(X, Y)
    SD = distance_utils.calcDistanceMatrix2([X])
    trunk = td.trunk
    sensitivity = 0.99
    o = metric_learning.search_threshold(SD, trunk, sensitivity)
    thres = o[0]
    TP, FP = o[1]
    num_FN = (1 - sensitivity) * (len(TP) / sensitivity)
    pairs = TP, FP
    MAX_NUM_POS = 2000
    MAX_NUM_NEG = 4000
    num_pos = min(len(pairs[0]), MAX_NUM_POS)
    num_neg = min(len(pairs[1]), MAX_NUM_NEG)
    print "TP vs FP: %d vs %d" % (len(TP), len(FP))
    trunk = o[2]
    classifiers.append((thres, (TP, FP)))
    FPvsFN = len(FP) / num_FN
    print "#FP/#FN = ", FPvsFN
    #return classifiers

    for level in range(1, max_level + 1):
        print "level = %d" % level
        random.shuffle(TP)
        random.shuffle(FP)
        pos = TP[-1:-num_pos - 1:-1]
        neg = FP[0:num_neg]
        pairs = pos, neg
        print len(pairs[0]), len(pairs[1])
        XX, yy = training_utils.create_training_sample(X, pairs)
        classifier = adaboost.AdaBoost(rank1_metric.Rank1_Metric)
        classifier.set_training_sample(XX, yy)
        classifier.train(T, 1)
        #classifier

        VD, vy = training_utils.create_training_sample(
            X, (TP, FP))  # for validation
        SD = -classifier.predict(VD)  # be careful about the sign!!!

        sensitivity = 0.99
        o = metric_learning.search_threshold(SD, trunk, sensitivity)
        thres = o[0]
        TP, FP = o[1]
        num_FN = (1 - sensitivity) * (len(TP) / sensitivity)
        print "TP vs FP: %d vs %d" % (len(TP), len(FP))
        trunk = o[2]
        classifiers.append((classifier, thres, (TP, FP)))
        FPvsFN = len(FP) / num_FN
        print "#FP/#FN = ", FPvsFN
        if len(TP) == 0 or len(FP) == 0 or FPvsFN < 1.2: break
    return classifiers
Example #5
 def _init_classifiers(self):
     # Initialize classifier objects
     self.fenc = FreemanEncoder()
     self.knn = KNN.KNN()
     self.HMM = HMM.HMM()
     self.NaiveBayes = NaiveBayes.NaiveBayes()
     self.RandomForest = RandomForest.RandomForests()
     self.SVM = svm.SVM_SVC()
     self.LogisticReg = LogisticReg.LogisticReg()
     self.AdaBoost = adaboost.AdaBoost()
     self.GBRT = gbrt.GBRT()
     
     #Train initially on the default data set, if no model saved already
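     # NOTE: each pickle.load() call below only checks that a saved model file exists;
     #       the loaded object is discarded, and the classifier is retrained when IOError is raised.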
     
     # Initialize KNN, no saved model for KNN
     self.knn.knn_train(CharRecognitionGUI_support.training_dataset, 1.0)
     
     # Initialize HMM
     self.HMM.training(CharRecognitionGUI_support.training_dataset)
     
     # Initialize Naive Bayes
     try:
         pickle.load( open( "./Models/naivebayes_model.p", "rb" ) )
     except IOError:
         self.NaiveBayes.training(CharRecognitionGUI_support.training_dataset)
     
     # Initialize Random Forest
     try:
         pickle.load( open( "./Models/random_forest.p", "rb" ) )
     except IOError:
         self.RandomForest.training(CharRecognitionGUI_support.training_dataset)
     
     # Initialize SVM
     try:
         pickle.load( open( "./Models/svm.p", "rb" ) )
     except IOError:
         self.SVM.training(CharRecognitionGUI_support.training_dataset)
     
     # Initialize Logistic Regression
     try:
         pickle.load( open( "./Models/logistic_model.p", "rb" ) )
     except IOError:
         self.LogisticReg.training(CharRecognitionGUI_support.training_dataset)
         
     # Initialize AdaBoost
     try:
         pickle.load( open( "./Models/AdaBoostClassifier.p", "rb" ) )
     except IOError:
         self.AdaBoost.training(CharRecognitionGUI_support.training_dataset)
         
     # Initialize GBRT
     try:
         pickle.load( open( "./Models/GradientBoostingClassifier.p", "rb" ) )
     except IOError:
         self.GBRT.training(CharRecognitionGUI_support.training_dataset)
Example #6
def q_11(noise):
    """
    Performs the task required in Q11: plots the decision boundaries of the learned
    classifier for T = [5, 10, 50, 100, 200, 500].
    :param noise: the noise to generate the (test) data with
    """
    x_train, y_train = generate_data(NUM_SAMPLES_TRAIN, noise)
    x_test, y_test = generate_data(NUM_SAMPLES_TEST, noise)
    for i in range(len(T_LIST)):
        boost = adb.AdaBoost(DecisionStump, T_LIST[i])
        boost.train(x_train, y_train)
        plt.subplot(2, 3, i + 1)
        decision_boundaries(boost, x_test, y_test, T_LIST[i])
    plt.show()
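q_11 above, and q_12, calc_train_and_test_err and q_13 below, reference module-level constants that are not shown in these snippets. A hypothetical definition for completeness (T_LIST matches the docstring above; the other values are placeholders, not from the original module):

# Hypothetical module-level constants (values other than T_LIST are placeholders):
T_LIST = [5, 10, 50, 100, 200, 500]
T = 500
NUM_SAMPLES_TRAIN = 5000
NUM_SAMPLES_TEST = 200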
Example #7
def q_12(noise):
    """
    Performs the task required in Q12: finds the T that minimizes the test error
    and plots the decision boundaries of that classifier on the training data.
    :param noise: the noise to generate the (test) data with
    """
    x_train, y_train = generate_data(NUM_SAMPLES_TRAIN, noise)
    train_err, test_err = calc_train_and_test_err(x_train, y_train, noise)
    # index t of the error vectors corresponds to an ensemble of t + 1 weak learners
    T_hat = int(np.argmin(test_err)) + 1
    min_boost = adb.AdaBoost(DecisionStump, T_hat)
    min_boost.train(x_train, y_train)
    decision_boundaries(min_boost, x_train, y_train, T_hat)
    plt.show()
    print("The T_hat is: " + str(T_hat) + " and its test error is: " +
          str(test_err[T_hat - 1]))
Example #8
def Q3():  # AdaBoost
    """

    :return:
    """

    syn_data = get_syn_data()
    X_test, X_train, X_val, y_test, y_train, y_val = syn_data[0], syn_data[1],\
                                                   syn_data[2], syn_data[3],\
                                                   syn_data[4], syn_data[5]

    # init T
    T = 22 * [0]
    T[0] = 1
    for i in range(20):
        T[i + 1] = (i + 1) * 5

    T[21] = 200
    keep_T = [1, 5, 10, 50, 100, 200]
    learned_classifiers = [None] * len(keep_T)
    min_val_err = 1
    min_val_err_classifier = None
    training_error, validation_error, test_error = [], [], []

    for t in T:

        ada_boost = adaboost.AdaBoost(WL=tools.DecisionStump, T=t)
        ada_boost.train(X_train, y_train)

        if t in keep_T:
            learned_classifiers[keep_T.index(t)] = ada_boost

        if t != 1:
            training_error.append(ada_boost.error(X_train, y_train))
            validation_error.append(ada_boost.error(X_val, y_val))

    plot_decisions(keep_T, learned_classifiers, X_train, y_train,
                   "adaBoost on SynData")
    plt.plot(T[1:], training_error, label='training error', color='magenta')
    plt.plot(T[1:],
             validation_error,
             label='validation error',
             color='deepskyblue')
    plt.title('adaBoost error on SynData as function of T')
    plt.legend(loc='best')
    plt.xlabel('T - number of base learners to learn')
    plt.ylabel('Error')
    plt.show()
Example #9
def calc_train_and_test_err(x_train, y_train, noise):
    """
    Calculates the train and test error vectors, one entry per ensemble size.
    :param x_train: the samples of the training data
    :param y_train: the labels of the training data
    :param noise: the noise to generate the (test) data with
    :return: the train and test error vectors
    """
    x_test, y_test = generate_data(NUM_SAMPLES_TEST, noise)
    boost = adb.AdaBoost(DecisionStump, T)
    boost.train(x_train, y_train)

    train_err = []
    test_err = []
    for t in range(T):
        train_err.append(boost.error(x_train, y_train, t + 1))
        test_err.append(boost.error(x_test, y_test, t + 1))
    return train_err, test_err
Example #10
def Q3(): # AdaBoost
    syn_data = getSynData()
    X_test,X_train,X_val = syn_data[0],syn_data[1],syn_data[2]
    Y_test,Y_train,Y_val = syn_data[3] ,syn_data[4] ,syn_data[5]

    T = [5*(i+1) for i in range(20)] + [200]
    test_err, validation_err = list(), list()

    for t in T:
        AdaB = adaboost.AdaBoost(ex4_tools.DecisionStump,t)
        AdaB.train(X_train,Y_train)

        test_err.append(AdaB.error(X_test,Y_test))
        validation_err.append(AdaB.error(X_val,Y_val))

    graph_plot("T","error of adaBoost",T,test_err,
               "test err", validation_err,"validation err","Q3")
    #
    return
Example #11
def q_13(noise):
    """
    Performs the task required in Q13: plots the training set with point sizes
    proportional to the weights in D^T. (With the raw D^T the points were too small
    to see, so the weights are normalized before plotting.)
    :param noise: the noise to generate the (test) data with
    """
    x_train, y_train = generate_data(NUM_SAMPLES_TRAIN, noise)

    # using D_T without normalize - cant see any points:
    # boost = adb.AdaBoost(DecisionStump, T)
    # D_T = boost.train(x_train, y_train)
    # decision_boundaries(boost, x_train, y_train, T, D_T)
    # plt.show()

    # using D_T with normalize:
    boost = adb.AdaBoost(DecisionStump, T)
    D_T = boost.train(x_train, y_train)
    D_T = (D_T / np.max(D_T)) * 10
    decision_boundaries(boost, x_train, y_train, T, D_T)
    plt.show()
Example #12
def main():
    k = 5  # Number of folds

    # Dataset retrieving and formatting
    dataset = dh.load_dataset("tic-tac-toe.data")
    dataset = dh.format_outputs(dataset)
    dataset = dh.fold_dataset(dataset, k)

    cv_accuracies = []
    cv_errors = []
    cv_model_errors = []
    # Execute k-fold cross-validation
    for i in range(k):
        print("Round", i + 1)
        testing_set = dh.separate_attributes(dataset[i])
        remaining_folds = np.concatenate(np.delete(dataset, i))
        training_set = dh.separate_attributes(remaining_folds)

        ada = ab.AdaBoost(training_set, testing_set)
        results = ada.boost(301)

        cv_accuracies.append(results[0])
        cv_errors.append(results[1])
        cv_model_errors.append(results[2])

    # Convert lists to numpy arrays for faster calculations
    cv_accuracies = np.asarray(cv_accuracies)
    cv_errors = np.asarray(cv_errors)
    cv_model_errors = np.asarray(cv_model_errors)

    # Calculate the mean of the accuracies and the errors
    cv_accuracies = np.divide(np.sum(cv_accuracies, axis=0), k)
    cv_errors = np.divide(np.sum(cv_errors, axis=0), k)
    cv_model_errors = np.divide(np.sum(cv_model_errors, axis=0), k)

    # Save the results to a CSV
    dh.save_results(cv_accuracies, "boosting_accuracy")
    dh.save_results(cv_errors, "boosting_error")
    dh.save_results(cv_model_errors, "model_error")
Example #13
def Q5():  # spam data
    n_folds = 5
    # creating the data
    data = np.loadtxt("SpamData/spam.data")
    X, y = data[:, :-1], data[:, -1]
    y[y == 0] = -1  # relabel the 0 class as -1, as the classifiers expect
    sample_data_idx = np.random.permutation(np.array(range(len(X))))
    X_test, y_test, X_train, y_train = X[sample_data_idx[:1536]], y[sample_data_idx[:1536]],\
                                       X[sample_data_idx[1536:]], y[sample_data_idx[1536:]]
    T_values = [5, 50, 100, 200, 500, 1000]
    d_values = [5, 8, 10, 12, 15, 18]

    # splitting to folds
    cross_val_idx = np.random.permutation(np.array(range(len(X_train))))
    folds = np.array([
        X_train[cross_val_idx[int(i * len(X_train) /
                                  n_folds):int((i + 1) * len(X_train) /
                                               n_folds)]]
        for i in range(n_folds)
    ])
    folds_y = np.array([
        y_train[cross_val_idx[int(i * len(X_train) /
                                  n_folds):int((i + 1) * len(X_train) /
                                               n_folds)]]
        for i in range(n_folds)
    ])
    AB_mean_errors = []
    DT_mean_errors = []
    AB_sd = []
    DT_sd = []
    # cross validation:
    for val in range(len(T_values)):
        print("val" + str(val))
        # the len of T and d is the same
        ab_errors = []
        dt_errors = []
        for fold_idx in range(n_folds):
            print("fold idx " + str(fold_idx))
            fold_test = folds[fold_idx]
            fold_test_y = folds_y[fold_idx]
            other_fold_idx = [i for i in range(n_folds) if i != fold_idx]
            other_folds = np.concatenate(folds[other_fold_idx])
            other_folds_y = np.concatenate(folds_y[other_fold_idx])

            # adaboost training on current fold:
            ab = aba.AdaBoost(DecisionStump, T_values[val])
            ab.train(other_folds, other_folds_y)
            ab_errors.append(ab.error(fold_test, fold_test_y))

            # DT training on current fold:
            dt = dta.DecisionTree(d_values[val])
            dt.train(other_folds, other_folds_y)
            dt_errors.append(dt.error(fold_test, fold_test_y))

        # add the mean errors:
        AB_mean_errors.append(np.array(ab_errors).mean())
        AB_sd.append(np.std(ab_errors))
        DT_mean_errors.append(np.array(dt_errors).mean())
        DT_sd.append(np.std(dt_errors))

    # plotting the errors in order to view the best T and d parameters:
    errorbar(T_values,
             AB_mean_errors,
             yerr=np.array(AB_sd),
             ecolor="green")
    xlabel("parameter - T")
    ylabel("error rate")
    show()
    errorbar(d_values,
             DT_mean_errors,
             yerr=np.array(DT_sd),
             ecolor="green")
    xlabel("parameter max depth")
    ylabel("error rate")
    show()
    # best_b = AB_mean_errors.index(np.min(AB_mean_errors))
    best_b = 200
    print(" best parameter for adaboost:" + str(best_b))
    ab = aba.AdaBoost(DecisionStump, best_b)
    ab.train(X_train, y_train)
    print(ab.error(X_test, y_test))
    # best_b = DT_mean_errors.index(np.min(DT_mean_errors))
    best_b = 10
    print("DT best parameter:" + str(best_b))
    dt = dta.DecisionTree(best_b)
    dt.train(X_train, y_train)
    print(dt.error(X_test, y_test))
Example #14
import adaboost
from sklearn.ensemble import AdaBoostClassifier
t = adaboost.AdaBoost()

# print detailed debugging information regarding the classifier selection
t.debug = 2

x = [[1, 2], [1, 4], [2.5, 5.5], [3.5, 6.5], [4, 5.4], [2, 1], [2, 4],
     [3.5, 3.5], [5, 2], [5, 5.5]]
y = [1, 1, 1, 1, 1, -1, -1, -1, -1, -1]
# train classifier
t.train(x, y)  # x is the feature matrix, y holds the actual classifications (-1 or 1)

# classify a novel set of values; the sign of the return value is the predicted binary class
novel_y_prime = t.apply_to_matrix(x)
print novel_y_prime

clf = AdaBoostClassifier(n_estimators=10)
clf.fit(x, y)
print clf.feature_importances_
print clf.predict(x)
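As the comment above notes, only the sign of the custom classifier's score matters. A small sketch (assuming numpy) for turning the scores into hard labels:

import numpy as np

# map real-valued boosting scores to hard labels; ties (score == 0) fall on the +1 side here
predicted_labels = np.where(np.asarray(novel_y_prime) >= 0, 1, -1)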
Example #15
    print('Preprocessing Finished...')
    end_preprocessing = time.time()

    print("Preprocessing took " + str(float(end_preprocessing - start_preprocessing) / 60) + " min")
    print()
    print()
else:
    df_credit = pd.read_csv('Preprocessed_Credit.csv', delimiter=',', header=None)

df_credit_train, df_credit_test = model_selection.train_test_split(df_credit, test_size=0.20)

start_training = time.time()
# dt = decision_tree.DecisionTree(df_credit.shape[1] - 1)
# dt.train(df_credit_train.iloc[:, df_credit.shape[1] - 1], df_credit_train.iloc[:, :df_credit.shape[1] - 1])

dt = adaboost.AdaBoost(df_credit_train, decision_tree.DecisionTree, 20)
dt.train()
end_training = time.time()

print("Training took " + str(float(end_training - start_training) / 60) + " min")
print()
print()
results = []
for smpl in range(0, df_credit_train.shape[0]):
    results.append(dt.decide(df_credit_train.iloc[smpl, :df_credit.shape[1] - 1].tolist()))

tn, fp, fn, tp = metrics.confusion_matrix(df_credit_train.iloc[:, df_credit.shape[1] - 1].tolist(), results).ravel()

print('True positive = ' + str(tp))
print('True negative = ' + str(tn))
print('False positive = ' + str(fp))
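The snippet stops after printing three of the four confusion-matrix cells. For reference, a short sketch of the remaining cell and the usual summary metrics derived from them (not part of the original script):

print('False negative = ' + str(fn))
accuracy = float(tp + tn) / float(tp + tn + fp + fn)
precision = float(tp) / float(tp + fp) if (tp + fp) else 0.0
recall = float(tp) / float(tp + fn) if (tp + fn) else 0.0
print('Accuracy = ' + str(accuracy) + ', Precision = ' + str(precision) + ', Recall = ' + str(recall))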
Example #16
def helper(question):
    """
    Executes questions 3, 4, the bonus, and 5: trains classifiers using AdaBoost,
    decision trees and bagging, and computes the training, validation and test
    errors for each of the T, d or b values.

    :return: training_errors, validation_errors, test_errors, classifiers, x_matrix_training,
             y_vector_training
    """

    with open("./SynData/X_train.txt", 'r') as training_x, open("./SynData/y_train.txt", 'r') as \
            training_y, open("./SynData/X_val.txt", 'r') as validation_x, \
            open("./SynData/y_val.txt", 'r') as validation_y, open("./SynData/y_test.txt",
                                                                   'r') as test_y, \
            open("./SynData/X_test.txt", 'r') as test_x:

        # convert the files into vectors and matrices
        y_vector_training = text_lines_to_vector(training_y)
        spam_matrix = text_lines_to_matrix(training_x)

        y_vector_validation = text_lines_to_vector(validation_y)
        x_matrix_validation = text_lines_to_matrix(validation_x)

        y_vector_test = text_lines_to_vector(test_y)
        x_matrix_test = text_lines_to_matrix(test_x)

        # preset the ingredients for the training
        training_errors = []
        validation_errors = []
        test_errors = []

        # written for question 3
        if question == 3:

            wl = ex4_tools.DecisionStump
            classifiers = list()

            # adding the classifier for T = 1
            adb_h_1 = adb.AdaBoost(wl, 1)
            adb_h_1.train(spam_matrix, y_vector_training)
            classifiers.append(adb_h_1)

            t_values = [5, 10, 50, 100, 200]

            # train a classification hypothesis using adaboost for different values of T
            for t in range(run.T):
                adb_h_t = adb.AdaBoost(wl, t)
                adb_h_t.train(spam_matrix, y_vector_training)
                training_errors.append(
                    adb_h_t.error(spam_matrix, y_vector_training))
                validation_errors.append(
                    adb_h_t.error(x_matrix_validation, y_vector_validation))
                test_errors.append(adb_h_t.error(x_matrix_test, y_vector_test))
                # collecting the trained classifiers for part 2 of question 3
                if t in t_values:
                    classifiers.append(adb_h_t)

            return training_errors, validation_errors, test_errors, classifiers,\
                   spam_matrix, y_vector_training

        # written for question 4
        if question == 4:

            classifiers = list()
            D_VALUES = [3, 6, 8, 10, 12]
            for d in D_VALUES:
                tree_classifier = dt.DecisionTree(d)
                tree_classifier.train(spam_matrix, y_vector_training)
                training_errors.append(
                    tree_classifier.error(spam_matrix, y_vector_training))
                validation_errors.append(
                    tree_classifier.error(x_matrix_validation,
                                          y_vector_validation))
                test_errors.append(
                    tree_classifier.error(x_matrix_test, y_vector_test))

            return training_errors, validation_errors, test_errors, classifiers, \
                   spam_matrix, y_vector_training

        # written for question 4 - bonus
        if question == 'bonus':

            b_values = run.B
            max_depth = 10  # since we discovered at question 4 that 10 was the optimal depth for
            #  this data
            tree_classifier = dt.DecisionTree(max_depth)
            for b in b_values:
                print(b)
                bagging_classifier = bag.Bagging(tree_classifier, b)
                bagging_classifier.train(spam_matrix, y_vector_training)
                validation_errors.append(
                    bagging_classifier.error(x_matrix_validation,
                                             y_vector_validation))
                test_errors.append(
                    bagging_classifier.error(x_matrix_test, y_vector_test))

            return validation_errors, test_errors

        # written for question 5

    if question == '5':

        with open("./SpamData/spam.data") as spam:

            # convert the file into a matrix and a vector, and create a list of partitioned parts
            # for the cross validation

            spam_matrix = text_lines_to_matrix(spam)

            x_spam_matrix, y_spam_vector, x_vault_matrix, y_vault_vector = spam_modif(
                spam_matrix)

            subgroup_length = np.shape(x_spam_matrix)[0] // 5

            partitioned_x, partitioned_y = cross_val_partition(
                x_spam_matrix, y_spam_vector, subgroup_length)

            # preparations for the adaboost training
            wl = ex4_tools.DecisionStump
            validation_errors_adb = []

            # train a classification hypothesis using adaboost for different values of T,
            # and through each of the subsets as a validation group (cross validation)
            for j, t in enumerate(T_VALUES):
                for i in range(5):

                    segment_len = len(partitioned_x[0])

                    x, y = sub_matrix(x_spam_matrix, y_spam_vector,
                                      segment_len, i)

                    adb_h_t = adb.AdaBoost(wl, t)
                    adb_h_t.train(x, y)
                    if i == 0:
                        validation_errors_adb.append([
                            adb_h_t.error(partitioned_x[i], partitioned_y[i])
                        ])
                    else:
                        validation_errors_adb[j].append(
                            adb_h_t.error(partitioned_x[i], partitioned_y[i]))
                print(validation_errors_adb)

            # preparations for the decision tree training
            validation_errors_dt = []

            # train a classification hypothesis using decision trees for different values of depth,
            # and through each of the subsets as a validation group (cross validation)
            for j, d in enumerate(D_VALUES):
                for i in range(5):
                    # build the training subset the same way as in the adaboost loop above
                    segment_len = len(partitioned_x[0])
                    x, y = sub_matrix(x_spam_matrix, y_spam_vector,
                                      segment_len, i)

                    tree_classifier = dt.DecisionTree(d)
                    tree_classifier.train(x, y)
                    if i == 0:
                        validation_errors_dt.append([
                            tree_classifier.error(partitioned_x[i],
                                                  partitioned_y[i])
                        ])
                    else:
                        validation_errors_dt[j].append(
                            tree_classifier.error(partitioned_x[i],
                                                  partitioned_y[i]))

            return validation_errors_adb, validation_errors_dt
Example #17
def Q5():  # spam data

    T = [5, 50, 100, 200, 500, 1000]
    D = [5, 8, 10, 12, 15, 18]
    # get spam data
    spam_data = np.loadtxt('SpamData/spam.data')

    # change values of 0 to -1
    spam_data[:, -1][spam_data[:, -1] == 0] = -1

    # get vault data and train data
    np.random.shuffle(spam_data)
    vault_index = np.random.choice(len(spam_data), 1536, replace=False)
    train_index = np.array(
        [i for i in range(len(spam_data)) if i not in vault_index])
    train_data = spam_data[train_index]

    vault_data = spam_data[vault_index]

    # Use 5-fold cross validation to pick T and d
    data_size = len(train_data)
    split = int(data_size / 5)
    folds = np.split(train_data, [split, 2 * split, 3 * split, 4 * split])
    data_sets = split_data_to_folds(folds)

    DT_error = [0] * 6
    adaboost_error = [0] * 6

    for i in range(5):

        fold_size1 = data_sets[i][0].shape[1]

        arr1 = data_sets[i][0]
        arr2 = data_sets[i][1]



        X_train, y_train = arr1[:, 0:fold_size1 - 1],\
                           arr1[:,fold_size1 - 1:fold_size1]

        X_validation = arr2[:, 0:(fold_size1 - 1)]
        y_validation = arr2[:, (fold_size1 - 1):fold_size1]

        y_train = y_train.reshape((-1, ))
        y_validation = y_validation.reshape((-1, ))

        for j, t in enumerate(T):
            ada_boost = adaboost.AdaBoost(tools.DecisionStump, t)
            ada_boost.train(X_train, y_train)
            current_adaboost_error = ada_boost.error(X_validation,
                                                     y_validation)
            # accumulate the validation error per value of T (averaged over the folds below)
            adaboost_error[j] += current_adaboost_error

        for j, d in enumerate(D):
            dt = decision_tree.DecisionTree(d)
            dt.train(X_train, y_train)
            current_dt_error = dt.error(X_validation, y_validation)
            # accumulate the validation error per value of d (averaged over the folds below)
            DT_error[j] += current_dt_error

    # get the mean validation error per parameter value (averaged over the 5 folds)
    adaboost_error = np.array([x / 5 for x in adaboost_error])
    DT_error = np.array([x / 5 for x in DT_error])

    # pick the parameter values with the lowest mean cross-validation error
    best_adaboost_T = T[int(np.argmin(adaboost_error))]
    best_DT_depth = D[int(np.argmin(DT_error))]

    plt.errorbar(T, adaboost_error, capsize=3, color='magenta', label='mean validation error')
    plt.title('validation error on SpamData for adaBoost as function of T')
    plt.legend(loc='best')
    plt.xlabel('T')
    plt.ylabel('Error')
    plt.errorbar(T, adaboost_error)
    plt.show()

    #
    plt.errorbar(D, DT_error, capsize=3, color='magenta', label='mean validation error')
    plt.title('validation error on SpamData for DT as function of max depth')
    plt.legend(loc='best')
    plt.xlabel('max depth')
    plt.ylabel('Error')
    plt.show()

    # Train classifiers using the chosen parameter values, using the complete training set.
    #

    X_train, y_train = train_data[:, 0:57], train_data[:, 57]
    X_vault, y_vault = vault_data[:, 0:57], vault_data[:, 57]

    ada_boost = adaboost.AdaBoost(tools.DecisionStump, best_adaboost_T)
    ada_boost.train(X_train, y_train)
    vault_adaboost_error = ada_boost.error(X_vault, y_vault)

    dt = decision_tree.DecisionTree(best_DT_depth)
    dt.train(X_train, y_train)
    vault_dt_error = dt.error(X_vault, y_vault)

    print("vault_adaboost_error= " + vault_adaboost_error)
    print("vault_dt_error= " + vault_dt_error)
Example #18
    df_adult_test = ap.binarize_test(binarizers_adult, binarizers_adult_columns, df_adult_test)
    df_adult_test = df_adult_test.reset_index(drop=True)
    df_adult_test.to_csv('Preprocessed_Adult_Test.csv', sep=',')
    end_preprocessing = time.time()

    print("Preprocessing on test data took " + str(float(end_preprocessing - start_preprocessing) / 60) + " min")
else:
    df_adult_train = pd.read_csv('Preprocessed_Adult_Train.csv', delimiter=',', header=None)
    df_adult_test = pd.read_csv('Preprocessed_Adult_Test.csv', delimiter=',', header=None)


start_training = time.time()
# dt = decision_tree.DecisionTree(df_adult_train.shape[1] - 1)
# dt.train(df_adult_train.iloc[:, df_adult_train.shape[1] - 1], df_adult_train.iloc[:, :df_adult_train.shape[1] - 1])

dt = adaboost.AdaBoost(df_adult_train, decision_tree.DecisionTree, 20)
dt.train()
end_training = time.time()

print("Training took " + str(float(end_training - start_training) / 60) + " min")
print()
print()
results = []
for smpl in range(0, df_adult_train.shape[0]):
    results.append(dt.decide(df_adult_train.iloc[smpl, :df_adult_train.shape[1] - 1].tolist()))

tn, fp, fn, tp = metrics.confusion_matrix(df_adult_train.iloc[:, df_adult_train.shape[1] - 1].tolist(), results).ravel()

print('True positive = ' + str(tp))
print('True negative = ' + str(tn))
print('False positive = ' + str(fp))
Example #19
def Q5(): # spam data

    SpamData = getSpamData()
    # changing the labels from 0 to -1, because this is how the classifiers work.
    SpamData[:, -1][SpamData[:, -1] == 0] = -1
    # Shuffling the data
    np.random.shuffle(SpamData)
    np.random.shuffle(SpamData)
    mSpam = len(SpamData)
    inds_vault = np.random.choice(mSpam, 1536, replace=False)
    inds_train = np.array([i for i in range(mSpam) if i not in inds_vault])

    data_train = SpamData[inds_train]
    data_vault = SpamData[inds_vault]
    m_train = len(data_train)
    # get the folds as tuples of (train, validation, i)
    folds = np.split(data_train, [int(m_train /5), 2*int(m_train /5), 3*int(m_train /5), 4*int(m_train /5)])
    data_5_cv = split(folds)

    T = [5,50,100,200,500,1000]
    D = [5,8,10,12,15,18]

    err_DT, err_AdaB = [0] * 6, [0] * 6

    for i in range(5):
        print("*******i: ", i,  "  *********")
        s = data_5_cv[i][0].shape[1]
        X_train, y_train  = data_5_cv[i][0][:, 0:s - 1],data_5_cv[i][0][:, s - 1:s]
        X_validation, y_validation = data_5_cv[i][1][:, 0:s - 1],data_5_cv[i][1][:, s - 1:s]
        y_train = y_train.reshape((-1,))
        y_validation = y_validation.reshape((-1,))

        # train decision tree
        for j, d in enumerate(D):
            dt = DecisionTree(d)
            dt.train(X_train, y_train)
            e = dt.error(X_validation, y_validation)
            print("d:", str(d), " e: ", e)
            # accumulate the validation error per depth value (averaged over the folds later)
            err_DT[j] += e

        # train AdaBoost
        for j, t in enumerate(T):
            adaB = adaboost.AdaBoost(ex4_tools.DecisionStump, t)
            adaB.train(X_train, y_train)
            e = adaB.error(X_validation, y_validation)
            # accumulate the validation error per value of T (averaged over the folds later)
            err_AdaB[j] += e
            print("t:", str(t), " e: ", e)


    err_DT, err_AdaB = np.array([i/5 for i in err_DT]), np.array([i/5 for i in err_AdaB])
    print("err_DT: ",err_DT)
    print("err_ADAB: ",err_AdaB)

    # evaluating the chosen parameter values on the held-out data vault
    # (train on the training split only, so the vault stays unseen)
    X, y = data_train[:, 0:57], data_train[:, 57:58].reshape((len(data_train),))
    X_test, y_test = data_vault[:, 0:57], data_vault[:, 57:58].reshape((1536,))
    dt = DecisionTree(15)
    dt.train(X, y)
    e = dt.error(X_test,y_test)
    print( "DT: e -  ", e)

    adaB = adaboost.AdaBoost(ex4_tools.DecisionStump, 100)
    adaB.train(X, y)
    e = adaB.error(X_test,y_test)
    print( "adaBoost: e =  ", e)

    return
Example #20
          str(float(end_preprocessing - start_preprocessing) / 60) + " min")
    print()
    print()
else:
    df_telco = pd.read_csv('Preprocessed_Telco.csv',
                           delimiter=',',
                           header=None)

df_telco_train, df_telco_test = model_selection.train_test_split(
    df_telco, test_size=0.20)

start_training = time.time()
# dt = decision_tree.DecisionTree(df_telco.shape[1] - 1)
# dt.train(df_telco_train.iloc[:, df_telco.shape[1] - 1], df_telco_train.iloc[:, :df_telco.shape[1] - 1])

dt = adaboost.AdaBoost(df_telco_train, decision_tree.DecisionTree, 20)
dt.train()
end_training = time.time()

print("Training took " + str(float(end_training - start_training) / 60) +
      " min")
print()
print()
results = []
for smpl in range(0, df_telco_train.shape[0]):
    results.append(
        dt.decide(df_telco_train.iloc[smpl, :df_telco.shape[1] - 1].tolist()))

tn, fp, fn, tp = metrics.confusion_matrix(
    df_telco_train.iloc[:, df_telco.shape[1] - 1].tolist(), results).ravel()
Example #21
import adaboost
import ann
import bool_ann
# assumed imports, not shown in the original snippet:
from numpy import array
from pickle import load

if __name__ == '__main__':
    print '...loading datas'
    x, y = load(open('datas.dat', 'rb'))
    rate = 0.8
    spInd = int(x.shape[0] * rate)  # index at which the data is split into train / test
    test_s = x.shape[0] - spInd + 1

    print 'training data: %d, testing data: %d' % (spInd - 1, test_s)
    print 'the true rate: %lf' % ((y == True).sum() * 1.0 / y.shape[0])

    #    print '======Test with Bool_ANN======='
    #    classifier = bool_ann.bool_ann(x[:spInd], y[:spInd],{'disp':False, 'maxiter':10})
    #    correct = (classifier.predict(x[spInd:])==y[spInd:]).sum()
    #    print 'correct = %d/%d = %lf' % (correct, test_s, correct*1.0/test_s)

    print '======Test with ANN========'
    classifier = ann.NeuralNetworkClassifier([50])
    classifier.fit(x[:spInd], y[:spInd])
    correct = (classifier.predict(x[spInd:]) == y[spInd:]).sum()
    print 'correct = %d/%d = %lf' % (correct, test_s, correct * 1.0 / test_s)

    print '======Test with Adaboost======='
    new_y = array(map(lambda y: (y - 0.5) * 2, y))
    classifier = adaboost.AdaBoost(x[:spInd], new_y[:spInd], adaNum=200)
    pre = classifier.predict(x[spInd:])
    correct = (pre == y[spInd:]).sum()
    print 'correct = %d/%d = %lf' % (correct, test_s, correct * 1.0 / test_s)