Example #1
from sklearn.metrics import precision_score, recall_score, f1_score


def test_metric():
    # compare the project's own metrics with sklearn's reference implementations
    print("###precision###")
    print(precision(test_label, knn_predict))
    print(precision_score(test_label, knn_predict, average='macro'))

    print("###recall###")
    print(recall(test_label, knn_predict))
    print(recall_score(test_label, knn_predict, average='macro'))

    print("###f1###")
    print(f1(test_label, knn_predict))
    print(f1_score(test_label, knn_predict, average='macro'))
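Here `precision`, `recall`, and `f1` are the project's own helpers, checked against sklearn's `precision_score`, `recall_score`, and `f1_score`. A minimal sketch of macro-averaged implementations they could correspond to (hypothetical; the project's actual code is not shown on this page):

import numpy as np

def precision(y_true, y_pred):
    # macro precision: mean per-class precision over the classes in y_true
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    scores = []
    for c in np.unique(y_true):
        predicted = (y_pred == c).sum()
        true_pos = ((y_true == c) & (y_pred == c)).sum()
        scores.append(true_pos / predicted if predicted else 0.0)
    return float(np.mean(scores))

def recall(y_true, y_pred):
    # macro recall: mean per-class recall
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    scores = []
    for c in np.unique(y_true):
        true_pos = ((y_true == c) & (y_pred == c)).sum()
        scores.append(true_pos / (y_true == c).sum())
    return float(np.mean(scores))

def f1(y_true, y_pred):
    # macro F1: mean per-class F1, where F1 = 2*TP / (2*TP + FP + FN)
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    scores = []
    for c in np.unique(y_true):
        tp = ((y_true == c) & (y_pred == c)).sum()
        fp = ((y_true != c) & (y_pred == c)).sum()
        fn = ((y_true == c) & (y_pred != c)).sum()
        denom = 2 * tp + fp + fn
        scores.append(2 * tp / denom if denom else 0.0)
    return float(np.mean(scores))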
Example #2
import random


def main(argv):
    # argv: [training-set size, number of epochs, learning rate, MNIST data directory]
    size_training = int(argv[0])
    epoch = int(argv[1])
    learning_rate = float(argv[2])
    path = argv[3]

    training_img_path = path + '/train-images-idx3-ubyte.gz'
    training_label_path = path + '/train-labels-idx1-ubyte.gz'
    testing_img_path = path + '/t10k-images-idx3-ubyte.gz'
    testing_label_path = path + '/t10k-labels-idx1-ubyte.gz'

    # read the files and flatten each 28x28 image into a 784-vector
    training_img = read_mnist(training_img_path)[:size_training].reshape(
        size_training, 784)
    training_label = read_mnist(training_label_path)[:size_training]
    testing_img = read_mnist(testing_img_path)
    testing_img = testing_img.reshape(len(testing_img), 28 * 28)
    testing_label = read_mnist(testing_label_path)

    # shuffle the (image, label) pairs together; the zip must be
    # materialized into a list before random.shuffle can operate on it
    training_set = list(zip(training_img, training_label))
    random.shuffle(training_set)
    training_img, training_label = zip(*training_set)

    # binarize pixels: map [0, 255] to {0, 1}
    training_img = [[round(pixel / 255) for pixel in sample]
                    for sample in training_img]
    testing_img = [[round(pixel / 255) for pixel in sample]
                   for sample in testing_img]

    # build 10 perceptrons: one binary one-vs-rest classifier per digit
    perceptron_clfs = []
    for target_label in range(10):
        local_training_label = [
            1 if label == target_label else -1 for label in training_label
        ]
        # pair each image with its binary label (the data was already
        # shuffled above); materialize the pairs so Winnow can iterate
        # over them once per epoch
        local_training_set = list(zip(training_img, local_training_label))
        perceptron_clfs.append(Winnow(local_training_set, learning_rate,
                                      epoch))

    # training
    for clf in perceptron_clfs:
        clf.train()

    # test on training set
    training_set = zip(training_img, training_label)
    predict_label = []
    for item in training_set:
        # predict
        scores = []
        for clf in perceptron_clfs:
            scores.append(clf.predict(item[0]))
        predict_label.append(scores.index(max(scores)))
    print("Train F1 score: ", f1(training_label, predict_label))

    # test on testing set
    testing_set = zip(testing_img, testing_label)
    predict_label = []
    for item in testing_set:
        # predict
        scores = []
        for clf in perceptron_clfs:
            scores.append(clf.predict(item[0]))
        predict_label.append(scores.index(max(scores)))
    print("Test F1 score: ", f1(testing_label, predict_label))
Example #3
def KFoldCross(model, feature=None, label=None, cv=4):
    """
    model : estimator exposing reset(), fit(X, y) and predict(X)

    feature : array-like, data to fit

    label : array-like, target variable

    cv : int, number of folds
    """
    assert len(feature) == len(label)
    num_test_sample = len(feature) // cv
    # num_train_sample = num_train_train_sample + num_validation_sample
    num_train_sample = len(feature) - num_test_sample
    num_validation_sample = num_train_sample // cv
    num_train_train_sample = num_train_sample - num_validation_sample
    assert num_test_sample + num_train_train_sample + num_validation_sample == len(
        feature)

    total_train_f1 = 0
    total_train_accuracy = 0
    total_validation_f1 = 0
    total_validation_accuracy = 0
    total_test_f1 = 0
    total_test_accuracy = 0

    # test fold
    for test_fold in range(0, cv):
        # split test and train
        test_sample_feature = feature[test_fold *
                                      num_test_sample:(test_fold + 1) *
                                      num_test_sample]
        test_sample_label = label[test_fold * num_test_sample:(test_fold + 1) *
                                  num_test_sample]
        train_sample_feature = np.concatenate(
            (feature[:test_fold * num_test_sample],
             feature[(test_fold + 1) * num_test_sample:]),
            axis=0)
        train_sample_label = np.concatenate(
            (label[:test_fold * num_test_sample],
             label[(test_fold + 1) * num_test_sample:]),
            axis=0)
        # check
        assert len(test_sample_feature) == num_test_sample == len(
            test_sample_label)
        assert len(train_sample_feature) == num_train_sample == len(
            train_sample_label)

        total_fold_train_f1 = 0
        total_fold_train_accuracy = 0
        total_fold_validation_f1 = 0
        total_fold_validation_accuracy = 0

        # train fold
        for validation_fold in range(0, cv):
            model.reset()

            # split train and validation
            validation_sample_feature = train_sample_feature[
                validation_fold * num_validation_sample:(validation_fold + 1) *
                num_validation_sample]
            validation_sample_label = train_sample_label[validation_fold *
                                                         num_validation_sample:
                                                         (validation_fold +
                                                          1) *
                                                         num_validation_sample]
            train_train_sample_feature = np.concatenate(
                (train_sample_feature[:validation_fold *
                                      num_validation_sample],
                 train_sample_feature[(validation_fold + 1) *
                                      num_validation_sample:]),
                axis=0)
            train_train_sample_label = np.concatenate(
                (train_sample_label[:validation_fold * num_validation_sample],
                 train_sample_label[(validation_fold + 1) *
                                    num_validation_sample:]),
                axis=0)
            assert len(validation_sample_label) == len(
                validation_sample_feature) == num_validation_sample
            assert len(train_train_sample_feature) == len(
                train_train_sample_label) == num_train_train_sample

            model.fit(train_train_sample_feature, train_train_sample_label)

            # train stat
            train_output = model.predict(train_train_sample_feature)
            total_train_f1 += f1(train_train_sample_label, train_output)
            total_train_accuracy += accuracy(train_train_sample_label,
                                             train_output)
            total_fold_train_f1 += f1(train_train_sample_label, train_output)
            total_fold_train_accuracy += accuracy(train_train_sample_label,
                                                  train_output)

            # validation stat
            validation_output = model.predict(validation_sample_feature)
            total_validation_f1 += f1(validation_sample_label,
                                      validation_output)
            total_validation_accuracy += accuracy(validation_sample_label,
                                                  validation_output)
            total_fold_validation_f1 += f1(validation_sample_label,
                                           validation_output)
            total_fold_validation_accuracy += accuracy(validation_sample_label,
                                                       validation_output)

        # predict in test set
        model.fit(train_sample_feature, train_sample_label)
        output = model.predict(test_sample_feature)

        # accumulate overall test statistics
        total_test_f1 += f1(test_sample_label, output)
        total_test_accuracy += accuracy(test_sample_label, output)

        # per-fold statistics
        fold_train_f1 = total_fold_train_f1 / cv
        fold_train_accuracy = total_fold_train_accuracy / cv
        fold_validation_f1 = total_fold_validation_f1 / cv
        fold_validation_accuracy = total_fold_validation_accuracy / cv
        fold_test_f1 = f1(test_sample_label, output)
        fold_test_accuracy = accuracy(test_sample_label, output)

        print("Fold-", test_fold + 1)
        print("Training: F1 score: ", fold_train_f1, ", Accuracy: ",
              fold_train_accuracy)
        print("Validation: F1 score: ", fold_validation_f1, ", Accuracy: ",
              fold_validation_accuracy)
        print("Testing: F1 score: ", fold_test_f1, ", Accuracy: ",
              fold_test_accuracy)
        print()

    # statistics
    train_f1 = total_train_f1 / (cv * cv)
    train_accuracy = total_train_accuracy / (cv * cv)
    validation_f1 = total_validation_f1 / (cv * cv)
    validation_accuracy = total_validation_accuracy / (cv * cv)
    test_f1 = total_test_f1 / cv
    test_accuracy = total_test_accuracy / cv

    print("Average")
    # summary row; unused in this snippet
    overall_report_line = [
        train_f1, train_accuracy, validation_f1, validation_accuracy, test_f1,
        test_accuracy, '\n'
    ]
    print("Training: F1 score: ", train_f1, ", Accuracy: ", train_accuracy)
    print("Validation: F1 score: ", validation_f1, ", Accuracy: ",
          validation_accuracy)
    print("Testing: F1 score: ", test_f1, ", Accuracy: ", test_accuracy)
    print()
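KFoldCross requires the model to expose reset(), fit(X, y), and predict(X), and expects the module's own f1 and accuracy helpers to be in scope. A hypothetical usage sketch with a stand-in majority-class model (names here are illustrative only):

import numpy as np

class MajorityModel:
    # minimal stand-in showing the interface KFoldCross expects
    def reset(self):
        self.majority = None  # clear state learned in a previous fold
    def fit(self, X, y):
        values, counts = np.unique(y, return_counts=True)
        self.majority = values[np.argmax(counts)]
    def predict(self, X):
        return np.full(len(X), self.majority)

feature = np.random.rand(400, 5)
label = np.random.randint(0, 3, size=400)
KFoldCross(MajorityModel(), feature=feature, label=label, cv=4)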
Example #4
def main():
    my_data = np.genfromtxt('../winequality-white.csv',
                            delimiter=';',
                            dtype=float,
                            skip_header=1)

    # preprocess (normalization and weighted vote)

    # train set
    train_feature = my_data[:3000, :-1]
    train_label = my_data[:3000, -1]

    # test set
    test_feature = my_data[3000:, :-1]
    test_label = my_data[3000:, -1]

    # feature and label
    feature = my_data[:, :-1]
    label = my_data[:, -1]

    embed()  # drop into an IPython shell for interactive inspection

    # test_metric()
    """
    # test knn
    knn = KNN_Classifier(k=1)
    knn.fit(train_feature, train_label)
    f**k = knn.predict(train_feature)
    print(accuracy(train_label, f**k))
    embed()
    return
    """

    # knn kfold
    """
    print("----KNN kfold with p = 1----")
    for i in range(1, 5):
        print("--k = ", i, "--")
        knn = KNN_Classifier(k=3, p=1)
        KFoldCross(knn, feature, label, 4) 
    
    print("##########################")
    print("----KNN kfold with p = 2----")
    for i in range(1, 5):
        print("--k = ", i, "--")
        knn = KNN_Classifier(k=3, p=2)
        KFoldCross(knn, feature, label, 4) 
    
    print("##########################")
    print("----KNN kfold with cosine----")
    for i in range(1, 5):
        print("--k = ", i, "--")
        knn = KNN_Classifier(k=3, metric='cosine')
        KFoldCross(knn, feature, label, 4) 
    """
    """
    # dt kfold
    print("----DT kfold----")
    for i in range(0, 7):
        print("--max depth = ", i, "--")
        dt = DT_Classifier(max_depth=i, sigmoid=True, min_impurity_decrease=1.0)
        KFoldCross(dt, feature, label, 4) 
    """

    # test knn
    # for i in range(5):
    # knn = KNN_Classifier(k=1)
    # knn.fit(train_feature, train_label)
    # knn_predict = knn.predict(test_feature)
    # print(accuracy(test_label, knn_predict))
    # print(precision(test_label, knn_predict))
    # print(recall(test_label, knn_predict))
    # print(f1(test_label, knn_predict))

    # test metric
    # test_metric()

    # test the custom decision tree
    dt = DT_Classifier()
    dt.fit(train_feature, train_label)
    dt_predict = dt.predict(test_feature)
    print("custom dt accuracy: ", accuracy_score(test_label, dt_predict))

    # use sklearn dt
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(train_feature, train_label)
    sklearn_predict = clf.predict(test_feature)
    print("sklearn dt accuracy: ", accuracy_score(test_label, sklearn_predict))
    print("sklearn dt f1: ", f1(test_label, sklearn_predict))

    # use sklearn knn
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(train_feature, train_label)
    sklearn_knn_predict = neigh.predict(test_feature)
    print("sklearn knn accuracy: ",
          accuracy_score(test_label, sklearn_knn_predict))
    print("sklearn knn f1: ", f1(test_label, sklearn_knn_predict))