def q1():
    """ feature analysis with Adaboost """
    #spamData = hw3u.pandas_to_data(hw3u.load_and_normalize_spambase())
    spamData = utils.load_and_normalize_polluted_spam_data()
    k = 10
    all_folds = hw3u.partition_folds(spamData, k)
    col_errs = []
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)

    # We're not actually cross-validating anything -- we just want feature weights
    #X = np.concatenate([X, X_test], axis=0)
    #y = np.concatenate([y, y_test], axis=0)

    #adaboost = adac.AdaboostOptimal(max_rounds=100, do_fast=False, learner=lambda: DecisionTreeClassifier(max_depth=1, splitter='random'))
    adaboost = adac.AdaboostOptimal(max_rounds=100, do_fast=False, learner=lambda: DecisionTreeClassifier(max_depth=1, splitter='best'))
    #adaboost = adac.AdaboostOptimal(max_rounds=10, do_fast=False, learner=hw4u.TreeOptimal)
    adaboost.fit(X, y)


    margin_fractions = get_margin_fractions(adaboost, X[0])
    #margin_fractions_v = hw5u.get_margin_fractions_validate(adaboost, X, y)
    #print col_errs
    ranked = rank(margin_fractions)
    print_ranks(ranked)

    pred = adaboost.predict(X_test)
    print 'Accuracy: {}'.format(accuracy_score(adaboost._check_y_not_zero(y_test), adaboost._check_y_not_zero(pred)))
def q3():  # Got points off b/c I have 89 accuracy instead of 92
    """ Logistic Regression """
    data = utils.load_and_normalize_polluted_spam_data()
    k = 10
    k_folds = hw3u.partition_folds(data, k)
    train_acc = []
    test_acc = []
    hw2_train_acc = []
    hw2_test_acc = []
    for ki in range(k):
        grouped_fold = hw5u.group_fold(k_folds, ki)
        y, X = utils.split_truth_from_data(grouped_fold)
        y_truth, X_test = utils.split_truth_from_data(k_folds[ki])
        clf = lm.LogisticRegression() #penalty="l1")
        ridge_clf = hw5u.Ridge()
        #clf = lm.Lasso(alpha=.5)
        #clf = lm.RidgeClassifier(alpha=.1)
        clf.fit(X, y)
        ridge_clf.fit(X, y)

        y_train = [1 if p >= .5 else 0 for p in clf.predict(X)]
        y_test = [1 if p >= .5 else 0 for p in clf.predict(X_test)]
        yhat_ridge_train = [1 if p >= .5 else 0 for p in ridge_clf.predict(X)]
        yhat_ridge_test = [1 if p >= .5 else 0 for p in ridge_clf.predict(X_test)]
        train_acc.append(accuracy_score(y, y_train))
        test_acc.append(accuracy_score(y_truth, y_test))
        hw2_train_acc.append(accuracy_score(y, yhat_ridge_train))
        hw2_test_acc.append(accuracy_score(y_truth, yhat_ridge_test))
        print 'Fold {} train acc: {} test acc: {} HW2 ridge train: {}  HW2 ridge test: {}'.format(ki+1, train_acc[-1], test_acc[-1], hw2_train_acc[-1], hw2_test_acc[-1])
    print 'Average acc - Train: {}  Test: {}  HW2 ridge: {}'.format(np.mean(train_acc), np.mean(test_acc), np.mean(hw2_train_acc), np.mean(hw2_test_acc))
def q2():  # Done
    """Run Gaussian Naive Bayes on the polluted spam data.

    Note: the standard deviation for some columns is 0.
    """
    spam = utils.load_and_normalize_polluted_spam_data()
    GaussianNB(spam, num_features=100)