def SpamClassifier(features, skclassifier, myclassifier):
    data = utils.pandas_to_data(utils.load_and_normalize_spam_data())
    k = 10
    if features != 'all':
        # Only use the features passed in the features array
        new = []
        t = utils.transpose_array(data)
        for i in xrange(len(t)):
            if i in features:
                new.append(t[i])
        # Transpose the selected columns back to row-major order (the original
        # re-transposed the full feature set inside the loop).
        data = utils.transpose_array(new)
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)
    print 'start MyKNN'
    knn = hw7u.KNN(classifier=myclassifier)
    print 'start scikit'
    knnsci = hw7u.KNN(classifier=skclassifier)
    print 'start my pred'
    y_pred = knn.predict(X_test, X, y)
    print 'My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_pred)))
    print 'start sk pred'
    y_sci = knnsci.predict(X_test, X, y)
    print 'SciKit Accuracy: {}  My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_sci)), accuracy_score(fix_y(y_test), fix_y(y_pred)))
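# A minimal sketch (an assumption, not the homework's actual helpers) of what the
# partition_folds / get_train_and_test calls above presumably do: shuffle the rows,
# deal them into k folds, and hold one fold out for testing.
import numpy as np

def make_folds(rows, k=10):
    # Shuffle a copy of the rows and deal them into k roughly equal folds.
    rows = list(rows)
    np.random.shuffle(rows)
    return [rows[i::k] for i in range(k)]

def train_test_from_folds(folds, test_idx=0):
    # Hold out one fold for testing; concatenate the rest for training.
    test = folds[test_idx]
    train = [row for i, fold in enumerate(folds) if i != test_idx for row in fold]
    return train, test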
def runDigitsDensity(n, _i, j):
    metric = ['minkowski', 'cosine', 'gaussian', 'poly2']
    ma = hw7u.Kernel(ktype=metric[j]+'_sci').compute
    #skclf = KernelDensity(metric=ma)
    myclf = hw7u.MyKNN(metric=metric[j], density=True)
    mnsize = n
    df = hw6u.load_mnist_features(mnsize)
    data = utils.pandas_to_data(df)
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y, X = np.asarray(y, dtype=np.float), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test, dtype=np.float)
    print 'my fit'
    clf = OneVsRestClassifier(myclf).fit(X, y)
    print 'scikit fit'
    #skclf = skclf.fit(X, y)
    print 'my predict'
    y_pred = clf.predict(X_test)
    myacc = accuracy_score(y_test, y_pred)
    print '({})'.format(myacc)
    #print 'scikit predict'
    #sk_pred = skclf.predict(X_test)
    #print sk_pred
    print y_test
    print y_pred
    #print 'SciKit Accuracy: {}  My Accuracy: {}'.format(accuracy_score(y_test, sk_pred), myacc)
    print 'My Accuracy: {}'.format(myacc)
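# A sketch of what MyKNN(density=True) plausibly does (an assumption about the
# homework class, not its actual code): fit one KernelDensity model per digit
# class and predict the class whose log-density is highest at the query point.
import numpy as np
from sklearn.neighbors import KernelDensity

class KDEClassifierSketch(object):
    def __init__(self, bandwidth=1.0):
        self.bandwidth = bandwidth

    def fit(self, X, y):
        X, y = np.asarray(X, dtype=float), np.asarray(y)
        self.classes_ = np.unique(y)
        # One Gaussian KDE per class, fit on that class's rows only.
        self.models_ = [KernelDensity(bandwidth=self.bandwidth).fit(X[y == c])
                        for c in self.classes_]
        return self

    def predict(self, X):
        X = np.asarray(X, dtype=float)
        scores = np.column_stack([m.score_samples(X) for m in self.models_])
        return self.classes_[np.argmax(scores, axis=1)]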
def q1():
    """ feature analysis with Adaboost """
    #spamData = hw3u.pandas_to_data(hw3u.load_and_normalize_spambase())
    spamData = utils.load_and_normalize_polluted_spam_data()
    k = 10
    all_folds = hw3u.partition_folds(spamData, k)
    col_errs = []
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)

    # We're not actually cross-validating anything -- we just want feature weights
    #X = np.concatenate([X, X_test], axis=0)
    #y = np.concatenate([y, y_test], axis=0)

    #adaboost = adac.AdaboostOptimal(max_rounds=100, do_fast=False, learner=lambda: DecisionTreeClassifier(max_depth=1, splitter='random'))
    adaboost = adac.AdaboostOptimal(max_rounds=100, do_fast=False, learner=lambda: DecisionTreeClassifier(max_depth=1, splitter='best'))
    #adaboost = adac.AdaboostOptimal(max_rounds=10, do_fast=False, learner=hw4u.TreeOptimal)
    adaboost.fit(X, y)


    margin_fractions = get_margin_fractions(adaboost, X[0])
    #margin_fractions_v = hw5u.get_margin_fractions_validate(adaboost, X, y)
    #print col_errs
    ranked = rank(margin_fractions)
    print_ranks(ranked)

    pred = adaboost.predict(X_test)
    print 'Accuracy: {}'.format(accuracy_score(adaboost._check_y_not_zero(y_test), adaboost._check_y_not_zero(pred)))
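# For comparison, a sketch of the same feature-ranking idea using scikit-learn's
# AdaBoostClassifier over decision stumps; feature_importances_ stands in for the
# margin fractions computed by the homework's get_margin_fractions.
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

def rank_features_sklearn(X, y, rounds=100):
    ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             n_estimators=rounds).fit(X, y)
    order = np.argsort(ada.feature_importances_)[::-1]
    # Return (feature index, importance) pairs, best first.
    return [(int(i), float(ada.feature_importances_[i])) for i in order]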
def runDigits(n, skclf, myclf):
    mnsize = n
    df = hw6u.load_mnist_features(mnsize)
    data = utils.pandas_to_data(df)
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y, X = np.asarray(y, dtype=np.float), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test, dtype=np.float)
    print 'my fit'
    clf = OneVsRestClassifier(myclf).fit(X, y)
    print 'scikit fit'
    skclf = skclf.fit(X, y)
    print 'my predict'
    y_pred = clf.predict(X_test)
    myacc = accuracy_score(y_test, y_pred)
    print '({})'.format(myacc)
    print 'scikit predict'
    sk_pred = skclf.predict(X_test)
    print sk_pred
    print y_test
    print y_pred
    print 'SciKit Accuracy: {}  My Accuracy: {}'.format(accuracy_score(y_test, sk_pred), myacc)
def q1():
    """GDA """
    """Run the Gaussian Discriminant Analysis on the spambase data. Use the k-folds from the previous problem (1 for testing, k-1 for training, for each fold)
Since you have 57 real value features, each of the  2gaussians (for + class and for - class) will have a mean  vector with 57 components, and a they will have
either a common (shared) covariance matrix size 57x57. This covariance is estimated from all training data (both classes)
or two separate covariance 57x57 matrices (estimated separately for each class)
(you can use a Matlab or Python of Java built in function to estimated covariance matrices, but the estimator is easy to code up).
Looking at the training and testing performance, does it appear that the gaussian assumption (normal distributed data) holds for this particular dataset?
"""

    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())  # returns an array of arrays - this is by row
    k = 10
    train_acc_sum = 0
    k_folds = hw3.partition_folds(spamData, k)
    gdas = []
    for ki in range(k - 1):
        subset = []
        gda = hw3.GDA()
        X, truth = hw3.separate_X_and_y(k_folds[ki])
        covariance_matrix = hw3.get_covar(X)
        gda.p_y = float(sum(truth)) / len(truth)
        gda.train(X, covariance_matrix, truth)
        predictions = gda.predict(X)
        #print predictions
        accuracy = mystats.get_error(predictions, truth, True)
        #gdas.append(gda)
        print_output(ki, accuracy)
        #print gda.prob
        gdas.append(gda)
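# A compact numpy sketch of the shared-covariance GDA the docstring above
# describes (an illustration, not the hw3.GDA implementation); binary labels
# in {0, 1} are assumed.
import numpy as np

def gda_fit(X, y):
    X, y = np.asarray(X, dtype=float), np.asarray(y)
    mu0, mu1 = X[y == 0].mean(axis=0), X[y == 1].mean(axis=0)
    sigma = np.cov(X, rowvar=False)   # one covariance shared by both classes
    p1 = float(y.mean())              # P(y = 1)
    return mu0, mu1, sigma, p1

def gda_predict(X, mu0, mu1, sigma, p1):
    inv = np.linalg.pinv(sigma)
    def log_like(x, mu):              # log N(x | mu, sigma), dropping constants
        d = x - mu
        return -0.5 * d.dot(inv).dot(d)
    return [1 if log_like(x, mu1) + np.log(p1) > log_like(x, mu0) + np.log(1.0 - p1)
            else 0 for x in np.asarray(X, dtype=float)]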
def TreeTest():
    spamDat = spamData()
    k = 10
    all_folds = hw3.partition_folds(spamDat, k)
    num_in_fold = []
    err_in_fold = []
    for i in range(len(all_folds) - 1):
        spam = all_folds[i]
        num_in_fold.append(len(spam))
        truth, f_data = decTree.split_truth_from_data(spam)
        tree = decTree.TreeOptimal(max_depth=2)
        #tree = decTree.TreeRandom()
        tree.fit(f_data, truth)
        print 'Prediction...\n'
        predict = tree.predict(f_data)
        print predict
        print truth
        error = 1. - hw3.get_accuracy(predict, truth)
        err_in_fold.append(error)
        print 'Tree error is: {}'.format(error)
    spam = all_folds[k - 1]
    truth, f_data = decTree.split_truth_from_data(spam)
    tree = decTree.TreeOptimal(max_depth=2)
    #tree = decTree.TreeRandom()
    tree.fit(f_data, truth)
    predict = tree.predict(f_data)
    error = 1. - hw3.get_accuracy(predict, truth)
    sum_training_err = 0
    for i in range(len(num_in_fold)):
        sum_training_err += err_in_fold[i]
        #sum_training_err += float(err_in_fold)/num_in_fold
    average_training_error = float(sum_training_err)/len(num_in_fold)
    print 'Average training error: {}\nAverage testing error: {}'.format(average_training_error, error)
def q2():
    models = ['Bernoulli', 'Gaussian', '4-bins', '9-bins']
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    k = 10
    k_folds = hw3.partition_folds(spamData, k)
    for model_type in range(4):
        print '\nModel: {}'.format(models[model_type])
        train_acc_sum = 0
        nb_models = []
        for ki in range(k - 1):
            alpha = .001 if model_type==0 else 0
            nb_model = nb.NaiveBayes(model_type, alpha=alpha)
            truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(k_folds[ki])
            nb_model.train(data_rows, truth_rows)
            predict = nb_model.predict(data_rows)
            print predict
            accuracy = hw3.get_accuracy(predict, truth_rows)
            train_acc_sum += accuracy
            print_output(ki, accuracy)
            nb_models.append(nb_model)
        nb_combined = nb.NaiveBayes(model_type, alpha=.001)
        if model_type < 2:
            nb_combined.aggregate_model(nb_models)
        else:
            nb_combined.aggregate_model3(nb_models)
        truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(k_folds[k - 1])
        test_predict = nb_combined.predict(data_rows)
        test_accuracy = hw3.get_accuracy(test_predict, truth_rows)
        print_test_output(test_accuracy, float(train_acc_sum)/(k-1))

    # Leftover from debugging (result unused): stats over the full dataset.
    truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(spamData)
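# A minimal Bernoulli Naive Bayes sketch of the kind nb.NaiveBayes(model_type=0)
# presumably implements: binarize the (normalized) features, then apply Bayes'
# rule with Laplace smoothing alpha; the 0.5 cutoff is an assumption.
import numpy as np

def bernoulli_nb_fit(X, y, alpha=0.001):
    Xb = np.asarray(X, dtype=float) >= 0.5
    y = np.asarray(y)
    priors = np.array([np.mean(y == c) for c in (0, 1)])
    cond = np.array([(Xb[y == c].sum(axis=0) + alpha) / (np.sum(y == c) + 2.0 * alpha)
                     for c in (0, 1)])          # P(x_j = 1 | y = c), smoothed
    return priors, cond

def bernoulli_nb_predict(X, priors, cond, theta=0.5):
    Xb = np.asarray(X, dtype=float) >= 0.5
    log_post = [np.log(priors[c]) + Xb.dot(np.log(cond[c])) + (~Xb).dot(np.log(1.0 - cond[c]))
                for c in (0, 1)]
    p1 = 1.0 / (1.0 + np.exp(log_post[0] - log_post[1]))   # P(y = 1 | x)
    return (p1 >= theta).astype(int)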
def q2_plots():
    models = ['Bernoulli', 'Gaussian', '4-bins', '9-bins']
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    k = 10
    num_points = 50
    k_folds = hw3.partition_folds(spamData, k)
    for model_type in range(4):
        roc = ROC.ROC()
        print '\nModel: {}'.format(models[model_type])
        train_acc_sum = 0
        nb_models = []
        for ki in [0]:
            alpha = .001 if model_type==0 else 0
            nb_model = nb.NaiveBayes(model_type, alpha=alpha)
            truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(k_folds[ki])
            nb_model.train(data_rows, truth_rows)
            for ti in range(num_points + 2):
                theta = ti * 1./(num_points + 1)
                predict = nb_model.predict(data_rows, theta)
                print predict
                accuracy = hw3.get_accuracy(predict, truth_rows)
                train_acc_sum += accuracy
                roc.add_tp_tn(predict, truth_rows, theta)

                #print_plot_output(ki, accuracy, theta)

        roc.plot_ROC('/Users/Admin/Dropbox/ML/MachineLearning_CS6140/CS6140_A_MacLeay/Homeworks/roc_{}.pdf'.format(model_type))
        roc.print_info()
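# A sketch of the threshold sweep behind the ROC curves above: for each theta,
# call scores >= theta positive and record the (false positive rate, true
# positive rate) pair; scores are assumed to be P(y=1|x) in [0, 1], with both
# classes present in truth.
import numpy as np

def roc_points(scores, truth, num_points=50):
    scores, truth = np.asarray(scores, dtype=float), np.asarray(truth).astype(bool)
    points = []
    for ti in range(num_points + 2):
        theta = ti * 1.0 / (num_points + 1)
        pred = scores >= theta
        tpr = float(np.mean(pred[truth]))    # true positive rate at this theta
        fpr = float(np.mean(pred[~truth]))   # false positive rate at this theta
        points.append((fpr, tpr))
    return points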
def tests_radius():
    i = 0
    j = 0
    k = 10
    X, y = testData()
    #print X
    X = np.concatenate([X, y.reshape((len(y), 1))], axis=1)
    X = [list(x.ravel()) for x in X]
    #radius = [3, 5, 7]
    radius = [1e-1, 1e-2, 1e-3]  # this assignment overrides the one above
    metric = ['minkowski', 'cosine', 'gaussian', 'poly2']
    ma = speedy.Kernel(ktype=metric[j]).compute
    #ma = hw7u.Kernel(ktype=metric[j]).compute
    print 'spam radius is {}'.format(radius[i])
    clf = hw7u.MyKNN(radius=radius[i], metric=metric[j], outlier_label=-1)
    skclf = RadiusNeighborsClassifier(radius=radius[i], algorithm='brute', metric="euclidean", p=2, outlier_label=.5)
    all_folds = hw3u.partition_folds(X, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)
    print 'start scikit'
    knnsci = hw7u.KNN(classifier=skclf)
    print 'start MyKNN'
    knn = hw7u.KNN(classifier=clf)
    print 'start sk pred'
    y_sci = knnsci.predict(X_test, X, y)
    print 'start my pred'
    y_pred = knn.predict(X_test, X, y)
    print y_pred
    print 'SciKit Accuracy: {}  My Accuracy: {}'.format(accuracy_score(hw7.fix_y(y_test), hw7.fix_y(y_sci)), accuracy_score(hw7.fix_y(y_test), hw7.fix_y(y_pred)))
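# A self-contained sklearn baseline for the radius-neighbors test above
# (synthetic data via make_classification; outlier_label labels query points
# that have no training neighbors inside the radius).
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import RadiusNeighborsClassifier

Xd, yd = make_classification(n_samples=200, n_features=5, random_state=0)
Xtr, Xte, ytr, yte = train_test_split(Xd, yd, test_size=0.2, random_state=0)
rnc = RadiusNeighborsClassifier(radius=5.0, metric='euclidean', outlier_label=-1)
rnc.fit(Xtr, ytr)
print('radius-neighbors test acc: {}'.format(accuracy_score(yte, rnc.predict(Xte))))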
def q3():  # Got points off b/c I have 89 accuracy instead of 92
    """ Logistic Regression """
    data = utils.load_and_normalize_polluted_spam_data()
    k = 10
    k_folds = hw3u.partition_folds(data, k)
    train_acc = []
    test_acc = []
    hw2_train_acc = []
    hw2_test_acc = []
    for ki in range(k):
        grouped_fold = hw5u.group_fold(k_folds, ki)
        y, X = utils.split_truth_from_data(grouped_fold)
        y_truth, X_test = utils.split_truth_from_data(k_folds[ki])
        clf = lm.LogisticRegression() #penalty="l1")
        ridge_clf = hw5u.Ridge()
        #clf = lm.Lasso(alpha=.5)
        #clf = lm.RidgeClassifier(alpha=.1)
        clf.fit(X, y)
        ridge_clf.fit(X, y)

        y_train = [1 if p >= .5 else 0 for p in clf.predict(X)]
        y_test = [1 if p >= .5 else 0 for p in clf.predict(X_test)]
        yhat_ridge_train = [1 if p >= .5 else 0 for p in ridge_clf.predict(X)]
        yhat_ridge_test = [1 if p >= .5 else 0 for p in ridge_clf.predict(X_test)]
        train_acc.append(accuracy_score(y, y_train))
        test_acc.append(accuracy_score(y_truth, y_test))
        hw2_train_acc.append(accuracy_score(y, yhat_ridge_train))
        hw2_test_acc.append(accuracy_score(y_truth, yhat_ridge_test))
        print 'Fold {} train acc: {} test acc: {} HW2 ridge train: {}  HW2 ridge test: {}'.format(ki+1, train_acc[-1], test_acc[-1], hw2_train_acc[-1], hw2_test_acc[-1])
    print 'Average acc - Train: {}  Test: {}  HW2 ridge train: {}  HW2 ridge test: {}'.format(np.mean(train_acc), np.mean(test_acc), np.mean(hw2_train_acc), np.mean(hw2_test_acc))
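# A sketch of the ridge-as-classifier baseline hw5u.Ridge stands in for above:
# regress y in {0, 1} on X with an L2 penalty and threshold the real-valued
# prediction at 0.5 (scikit-learn's Ridge; alpha=1.0 is an assumed default).
from sklearn.linear_model import Ridge

def ridge_classify(X_train, y_train, X_eval, alpha=1.0, threshold=0.5):
    reg = Ridge(alpha=alpha).fit(X_train, y_train)
    return [1 if p >= threshold else 0 for p in reg.predict(X_eval)]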
def q1():
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    k = 10
    all_folds = hw3.partition_folds(spamData, k)
    tprs = []
    fprs = []
    for i in [0]: #range(len(all_folds)):
        kf_data, kf_test = dl.get_train_and_test(all_folds, i)
        y, X = hw4.split_truth_from_data(kf_data)
        y_test, X_test = hw4.split_truth_from_data(kf_test)
        adaboost = run_adaboost(X, y, X_test, y_test, i)
        predicted = adaboost.predict(X)
        print(roc_auc_score(y, predicted))
        for si in range(len(adaboost.snapshots)):  # si, not i: don't shadow the fold index
            round_number = si + 1
            ab = adaboost.snapshots[si]
            yt_pred = ab.predict(X_test)
            # Fraction of mismatches, i.e. the test error for this round
            # (the original counted matches, which is the accuracy).
            round_err = float(np.sum([1 if yt != yp else 0 for yt, yp in zip(yt_pred, y_test)]))/len(y_test)
            adaboost.adaboost_error_test[round_number] = round_err
        print predicted[:20]
        print y[:20]
        name = 'q1'
        directory = '/Users/Admin/Dropbox/ML/MachineLearning_CS6140/CS6140_A_MacLeay/Homeworks'
        path = os.path.join(directory, name + 'hw4errors.pdf')
        tterrpath = os.path.join(directory, name + 'hw4_errors_test_train.pdf')
        print path
        plt.Errors([adaboost.local_errors]).plot_all_errors(path)
        plt.Errors([adaboost.adaboost_error, adaboost.adaboost_error_test]).plot_all_errors(tterrpath)
        roc = plt.ROC()
        #roc.add_tpr_fpr_arrays(adaboost.tpr.values(), adaboost.fpr.values())
        get_tpr_fpr(adaboost, roc, X_test, y_test, 30)
        roc.plot_ROC(os.path.join(directory, name + 'hw4_roc.pdf'))
def svm_q1(data, classifier=svm.SVC()):
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)
    print 'length train: {} length test {}'.format(len(X), len(X_test))
    clf = classifier
    clf.fit(X, y)
    y_pred = clf.predict(X_test)
    print 'train acc: {} test acc: {}'.format(accuracy_score(fix_y(y), fix_y(clf.predict(X))), accuracy_score(fix_y(y_test), fix_y(y_pred)))
def multiclassSVC(classifier, sz=2000):

    mnsize = sz
    df = hw6u.load_mnist_features(mnsize)
    data = utils.pandas_to_data(df)
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y, X = np.asarray(y), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test)
    print 'Beginning analysis: {}'.format(X.shape)
    #clf = OneVsRestClassifier(classifier, n_jobs=4).fit(X, y)
    clf = OneVsOneClassifier(classifier).fit(X, y)
    #clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=10, random_state=0).fit(np.asarray(X), y)
    y_pred = clf.predict(X)
    print 'train acc: {} test acc: {}'.format(accuracy_score(fix_y(y_pred), fix_y(y)), accuracy_score(fix_y(y_test), fix_y(clf.predict(X_test))))
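# A self-contained comparison of the two multiclass reductions used in this file:
# OneVsOneClassifier trains k*(k-1)/2 pairwise SVMs, OneVsRestClassifier trains
# k one-against-all SVMs (sklearn's digits stand in for the MNIST features).
from sklearn import svm
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

Xd, yd = load_digits(return_X_y=True)
Xtr, Xte, ytr, yte = train_test_split(Xd, yd, test_size=0.2, random_state=0)
for wrapper in (OneVsOneClassifier, OneVsRestClassifier):
    model = wrapper(svm.SVC(kernel='rbf', gamma='scale')).fit(Xtr, ytr)
    print('{}: test acc {}'.format(wrapper.__name__, accuracy_score(yte, model.predict(Xte))))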
def run(self, data, weights):
    k_folds = hw3.partition_folds(data, self.number_k_folds)
    for k in xrange(self.number_k_folds - 1):
        err_matrix = []
        fold = k_folds[k]
        truth, f_data = split_truth_from_data(fold)
        model = self.fit(f_data)
        predicted = self.predict(model, f_data)  # {-1, 1}
        err_matrix = self.compute_error_matrix(truth, predicted)
        self.training_errors.append(self.get_error(err_matrix))
        self.training_errors_weighted.append(sum(self.weight_errors(err_matrix, weights)))
    fold = k_folds[self.number_k_folds - 1]
    truth, f_data = utils.split_truth_from_data(fold)
    predicted = self.predict(model, f_data)
    # Error matrix for round computed from test data
    self.err_matrix = self.compute_error_matrix(truth, predicted)
    self.testing_error = self.get_error(self.err_matrix)
    self.testing_errors_weighted = self.weight_errors(self.err_matrix, weights)
    self.set_weight_distribution_and_total()  # Dt(x) and epsilon
    self.set_alpha()
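# A sketch of the per-round AdaBoost bookkeeping that set_weight_distribution_and_total
# and set_alpha above presumably perform: the weighted error epsilon_t, the round
# weight alpha_t = 0.5 * ln((1 - epsilon_t) / epsilon_t), and the updated
# distribution D_{t+1}; labels are assumed to be in {-1, +1}.
import numpy as np

def adaboost_round_stats(truth, predicted, weights):
    truth, predicted = np.asarray(truth), np.asarray(predicted)
    weights = np.asarray(weights, dtype=float)
    eps = np.sum(weights[truth != predicted]) / np.sum(weights)   # weighted error
    alpha = 0.5 * np.log((1.0 - eps) / eps)                       # round weight
    new_w = weights * np.exp(-alpha * truth * predicted)          # reweight examples
    return eps, alpha, new_w / new_w.sum()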
def relief(n):
    max_iters = 1
    j = 0
    i = 1
    n_neighbors = [1, 3, 7]
    metric = ['minkowski', 'cosine', 'gaussian', 'poly2']
    ma = hw7u.Kernel(ktype=metric[j]).compute
    data = utils.pandas_to_data(utils.load_and_normalize_spam_data())
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)
    weights = np.zeros(len(X[0]))
    n_features = len(X[0])
    n_samples = len(X)
    for j in range(n_features): #feature

        for i in range(n_samples):  # data
            closest_same = None
            closest_opp = None
            for z_i in range(n_samples):
                if z_i == i:
                    continue
                diff = (X[z_i][j] - X[i][j]) ** 2
                if y[z_i] == y[i]:  # same
                    if closest_same is None or diff < closest_same:
                        closest_same = diff
                else:  # opp
                    if closest_opp is None or diff < closest_opp:
                        closest_opp = diff
            weights[j] += (-closest_same + closest_opp)
            if i % 1000 == 0:
                print 'feature {} of {}, sample {} of {}'.format(j, n_features, i, n_samples)
    print weights

    # Return the indices of the n top-weighted features.
    return [idx for _, idx in sorted(zip(weights, range(len(weights))), reverse=True)[:n]]
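# A vectorized numpy version of the Relief update above: for every sample and
# feature, find the squared difference to the nearest hit (same class) and the
# nearest miss (opposite class) and accumulate miss - hit into the weight.
import numpy as np

def relief_weights(X, y):
    X, y = np.asarray(X, dtype=float), np.asarray(y)
    n_samples, n_features = X.shape
    weights = np.zeros(n_features)
    same = y[:, None] == y[None, :]
    for j in range(n_features):
        diff = (X[:, j][:, None] - X[:, j][None, :]) ** 2   # pairwise squared diffs
        np.fill_diagonal(diff, np.inf)                      # a point is not its own neighbor
        hits = np.where(same, diff, np.inf).min(axis=1)     # nearest same-class diff
        misses = np.where(~same, diff, np.inf).min(axis=1)  # nearest opposite-class diff
        weights[j] += np.sum(misses - hits)
    return weights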
def tests_density():
    i = 0
    j = 2
    k = 10
    X, y = testData()
    print X
    X = np.concatenate([X, y.reshape((len(y), 1))], axis=1)
    X = [list(x.ravel()) for x in X]
    radius = [3, 5, 7]
    metric = ['minkowski', 'cosine', 'gaussian', 'poly2']
    ma = hw7u.Kernel(ktype=metric[j]).compute
    params = {'bandwidth': np.logspace(-1, 1, 20)}
    grid = GridSearchCV(KernelDensity(), params)
    grid.fit(X)
    clf = hw7u.MyKNN(metric=metric[j], density=True)

    bw = grid.best_estimator_.bandwidth
    print("best bandwidth: {0}".format(bw))

    # use the best estimator to compute the kernel density estimate
    kde = grid.best_estimator_
    skclf = KernelDensity(bandwidth=bw, kernel='gaussian')
    skclf.fit(X[:-10], y[:-10])
    print skclf.score_samples(X[-10:])
    return  # early exit while debugging; the fold-evaluation code below is unreachable
    all_folds = hw3u.partition_folds(X, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)
    print 'start scikit'
    knnsci = hw7u.KNN(classifier=skclf)
    print 'start MyKNN'
    knn = hw7u.KNN(classifier=clf)
    print 'start sk pred'
    y_sci = knnsci.predict(X_test, X, y)
    print 'start my pred'
    y_pred = knn.predict(X_test, X, y)
    print y_pred
    print 'SciKit Accuracy: {}  My Accuracy: {}'.format(accuracy_score(hw7.fix_y(y_test), hw7.fix_y(y_sci)), accuracy_score(hw7.fix_y(y_test), hw7.fix_y(y_pred)))
def GaussianNB(X, num_features=None):
    model_type = 1
    train_acc_sum = 0
    test_acc_sum = 0
    k = 10
    nb_models = []
    if num_features is not None:
        y, X = utils.split_truth_from_data(X)
        q4_slct = SelectKBest(k=num_features).fit(X, y)
        X = q4_slct.transform(X)
        X = utils.add_row(X, y)
    k_folds = hw3u.partition_folds(X, k)
    for ki in range(k):
        grouped_fold = hw5u.group_fold(k_folds, ki)
        alpha = .001 if model_type==0 else 0
        mask_cols = check_cols(grouped_fold)
        #nb_model = nb.NaiveBayes(model_type, alpha=alpha, ignore_cols=mask_cols)
        nb_model = BernoulliNB()  # note: despite this function's name, a Bernoulli NB is used
        print 'len of kfolds {}'.format(len(grouped_fold))
        #truth_rows, data_rows, data_mus, y_mu = hw3u.get_data_and_mus(grouped_fold)
        truth_rows, data_rows = utils.split_truth_from_data(grouped_fold)
        print 'len of data {}'.format(len(data_rows))
        #nb_model.train(data_rows, truth_rows)
        nb_model.fit(data_rows, truth_rows)
        predict = nb_model.predict(data_rows)
        #print predict
        accuracy = hw3u.get_accuracy(predict, truth_rows)
        train_acc_sum += accuracy
        print_output(ki, accuracy)
        nb_models.append(nb_model)

        truth_rows, data_rows = utils.split_truth_from_data(k_folds[ki])
        test_predict = nb_model.predict(data_rows)
        test_accuracy = hw3u.get_accuracy(test_predict, truth_rows)
        test_acc_sum += test_accuracy
        print_output(ki, test_accuracy, 'test')

    print_test_output(float(train_acc_sum)/k, float(test_acc_sum)/k)
def runSpamDensity(_i, j, features='all'):
    metric = ['gaussian', 'poly2', 'cosine_similarity', 'gaussian_density']
    data = utils.pandas_to_data(utils.load_and_normalize_spam_data())
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)

    print(len(X))
    print(len(X_test))

    myclassifier = hw7u.MyKNN(metric=metric[j], density=True)
    print 'start MyKNN'
    myclassifier.fit(X, y)
    #print 'start scikit'
    #knnsci = skclassifier.fit(X, y)
    print 'start my pred'
    y_pred = myclassifier.predict(X_test)
    print(y_pred)
    #print 'start sk pred'
    #y_sci = knnsci.score(X_test)
    #print 'SciKit Accuracy: {}  My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_sci)), accuracy_score(fix_y(y_test), fix_y(y_pred)))
    print '2b: My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_pred)))
def test_partition_data():
    arr = get_test_data(303)
    print hw3.partition_folds(arr, 10)  # partition_folds takes (data, k) at every other call site