Example #1
    def test_blob_classification_numpy(self):
        """
        Tests kNN for classification using
        randomly-generated points drawn from
        Gaussian-shaped clusters.
        
        Splits data into training and testing
        sets.
        """

        k = 3
        X, y = generate_cluster_samples()

        train_X, test_X, train_y, test_y = train_test_split(X, y, stratify=y)

        knn = KNN(k)
        knn.fit(train_X, train_y)
        pred_y = knn.predict_numpy(test_X)

        # verify shape of output
        self.assertEqual(len(pred_y.shape), 1)
        self.assertEqual(pred_y.shape[0], test_X.shape[0])

        # the clusters are well separated, so every test point
        # should be classified correctly
        accuracy = accuracy_score(test_y, pred_y)
        self.assertAlmostEqual(accuracy, 1.0)
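
The helper generate_cluster_samples above is not shown. A minimal sketch of what it might look like, assuming scikit-learn's make_blobs with tight, well-separated clusters (the name and parameters are illustrative, not the original implementation):

from sklearn.datasets import make_blobs

def generate_cluster_samples(n_samples=300, centers=3, random_state=0):
    # Hypothetical helper: draw points from well-separated Gaussian blobs.
    # A small cluster_std keeps the classes separable, so the test above
    # can expect perfect accuracy on the held-out split.
    return make_blobs(n_samples=n_samples, centers=centers,
                      cluster_std=0.5, random_state=random_state)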
Example #2
def main():
    K = [1, 3]

    # load KC1 (LVQ3-reduced output)
    data = pd.read_csv('./lvq_output/seed_137/kc1_lvq3.csv')

    X, Y = data.drop(columns=['defects']), data['defects']

    # normalize data
    #X = normalize_data(X)

    # create k-fold splits
    kf = KFold(n_splits=10)

    # instantiate a classifier for each k
    for k in K:
        clf = KNN(k=k)
        print("k equals {}".format(k))

        start_time = time.time()
        acc = []
        for train, test in kf.split(X):

            clf.fit(X.iloc[train], Y.iloc[train])
            predictions = clf.predict(X.iloc[test])
            acc.append((np.sum(predictions == Y.iloc[test]) / len(test)) * 100)

        end_time = time.time()

        acc = np.array(acc)

        print("mean accuracy: {}".format(np.mean(acc)))
        print("standard deviation: {}".format(np.std(acc)))
        print("time elapsed: {}".format(end_time - start_time))
Example #3
    def test_synthetic_data(self):
        """
        Test KNN.predict using some synthetic data
        """
        x_train = np.array([[1, 2], [1, 3], [2, 2], [2, 3], [1, 1], [2, 1]])
        y_train = np.array([1, 1, 1, -1, -1, -1])

        model = KNN(k=3)
        model.fit(x_train, y_train)

        x_test = np.array([
            [1.8, 2.6],
            [2.0, 1.8],
            [1.5, 2.0],
            [1.0, 2.5],
            [1.5, 1.0],
            [2.0, 1.0],
        ])

        pred = model.predict(x_test)

        self.assertTrue(np.array_equal(pred, np.array([1, 1, 1, 1, -1, -1])))

        # one label should change when using 1-NN
        model.k = 1
        pred2 = model.predict(x_test)

        self.assertTrue(np.array_equal(pred2, np.array([-1, 1, 1, 1, -1, -1])))
Example #4
def test_knn_regression():
    # run repeated random trials against scikit-learn until interrupted
    while True:
        N = np.random.randint(2, 100)
        M = np.random.randint(2, 100)
        k = np.random.randint(1, N)
        ls = np.min([np.random.randint(1, 10), N - 1])
        weights = np.random.choice(["uniform", "distance"])

        X = np.random.rand(N, M)
        X_test = np.random.rand(N, M)
        y = np.random.rand(N)

        knn = KNN(k=k,
                  leaf_size=ls,
                  metric=euclidean,
                  classifier=False,
                  weights=weights)
        knn.fit(X, y)
        preds = knn.predict(X_test)

        gold = KNeighborsRegressor(
            p=2,
            leaf_size=ls,
            n_neighbors=k,
            weights=weights,
            metric="minkowski",
            algorithm="ball_tree",
        )
        gold.fit(X, y)
        gold_preds = gold.predict(X_test)

        for mine, theirs in zip(preds, gold_preds):
            np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")
Example #5
def main():
    trainSet = pd.read_csv('datasets/train_set.csv',
                           converters={'Trajectory': literal_eval})

    testSet = pd.read_csv('datasets/test_set_a2.csv',
                          converters={'Trajectory': literal_eval})

    # labels for categories
    le = preprocessing.LabelEncoder()
    categoryIds = le.fit_transform(trainSet['journeyPatternId'])

    # collect the training trajectories
    allSequences = trainSet['Trajectory'].tolist()

    # initialize KNN classifier
    clf = KNN(5, DTW)

    crossValidation(clf, allSequences, categoryIds, le)
    clf.fit(allSequences, categoryIds)

    # predict the categories for the testSet
    predIds = clf.predict(testSet['Trajectory'])
    predCategs = le.inverse_transform(predIds)

    writeInCsv(predCategs)
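
The DTW distance handed to KNN above is defined elsewhere in the project. A minimal sketch of a compatible implementation, assuming each trajectory is a sequence of coordinate points (an illustration, not the project's actual function):

import numpy as np

def DTW(s, t):
    # Classic dynamic-time-warping distance between two point sequences,
    # computed by dynamic programming over all monotone alignments.
    n, m = len(s), len(t)
    D = np.full((n + 1, m + 1), np.inf)
    D[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = np.linalg.norm(np.asarray(s[i - 1], dtype=float) -
                                  np.asarray(t[j - 1], dtype=float))
            D[i, j] = cost + min(D[i - 1, j], D[i, j - 1], D[i - 1, j - 1])
    return D[n, m]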
Example #6
def foo(k_num=5, distance=distance_metric(p=1)):
    _data = [[i[0], i[3]] for i in data]
    # Split the data into train and test parts (manual per-class split)
    # train_d, train_l, test_d, test_l = tt_split(_data, label)
    train_d, train_l, test_d, test_l = (_data[0:30] + _data[50:80] +
                                        _data[100:130], label[0:30] +
                                        label[50:80] + label[100:130],
                                        _data[30:50] + _data[80:100] +
                                        _data[130:], label[30:50] +
                                        label[80:100] + label[130:])
    # Initialize the KNN object
    knn = KNN(neighbors_num=k_num, distance=distance)
    # Store the training data in KNN
    knn.fit(train_d, train_l)
    # Get predictions from KNN
    result = knn.predict(test_d)

    # Print the results on screen as data, real label, predicted label.
    #print("%20s - %20s | %20s | %s" %("[Data]", "<Real Label>", "<Predicted Label>", "Truth"))

    n = 0
    for i, j, r in zip(test_d, test_l, result):
        truthness = (j == r)
        if truthness:
            n += 1
        #print("%20s - %20s | %20s | %s" %(i, j, r, truthness))
    #print("Acc:", n / len(test_d))
    return n / len(test_d), n, len(test_d)
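
distance_metric and cosine_distance (also used in later examples) appear to be factories that return a pairwise distance callable. A hedged sketch of both under that assumed contract:

import numpy as np

def distance_metric(p=2):
    # Minkowski distance of order p: p=1 is Manhattan, p=2 is Euclidean.
    def d(a, b):
        return float(np.sum(np.abs(np.asarray(a, dtype=float) -
                                   np.asarray(b, dtype=float)) ** p) ** (1.0 / p))
    return d

def cosine_distance():
    # 1 - cosine similarity: 0 for parallel vectors, up to 2 for opposite ones.
    def d(a, b):
        a = np.asarray(a, dtype=float)
        b = np.asarray(b, dtype=float)
        return 1.0 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    return d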
Example #7
class Stacking():
    def __init__(self):
        pass

    def fit(self, X, y):
        self.rf = RandomForest(num_trees=15, max_depth=np.inf)
        self.rf.fit(X, y)
        y_rf = self.rf.predict(X)

        self.nb = NaiveBayes()
        self.nb.fit(X, y)
        y_nb = self.nb.predict(X)

        self.knn = KNN(k=3)
        self.knn.fit(X, y)
        y_knn = self.knn.predict(X)

        newX = np.array([y_rf, y_nb, y_knn]).transpose()

        model = DecisionTree(max_depth=np.inf,
                             stump_class=DecisionStumpErrorRate)
        self.model = model

        model.fit(newX, y)

    def predict(self, X):
        y_rf = self.rf.predict(X)
        y_nb = self.nb.predict(X)
        y_knn = self.knn.predict(X)
        x_test = np.array([y_rf, y_nb, y_knn]).transpose()

        return self.model.predict(x_test)
Example #8
def test_knn_clf():
    # run repeated random trials against scikit-learn until interrupted
    while True:
        N = np.random.randint(2, 100)
        M = np.random.randint(2, 100)
        k = np.random.randint(1, N)
        n_classes = np.random.randint(2, 10)  # at least 2 classes, else randint below fails
        ls = np.min([np.random.randint(1, 10), N - 1])
        weights = "uniform"

        X = np.random.rand(N, M)
        X_test = np.random.rand(N, M)
        y = np.random.randint(0, n_classes, size=N)

        knn = KNN(k=k,
                  leaf_size=ls,
                  metric=euclidean,
                  classifier=True,
                  weights=weights)
        knn.fit(X, y)
        preds = knn.predict(X_test)

        gold = KNeighborsClassifier(
            p=2,
            leaf_size=ls,
            n_neighbors=k,
            weights=weights,
            metric="minkowski",
            algorithm="ball_tree",
        )
        gold.fit(X, y)
        gold_preds = gold.predict(X_test)

        for mine, theirs in zip(preds, gold_preds):
            np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")
Example #9
    def fit(self, X, y):
        # instantiate the input models
        rf = RandomForest(num_trees=15)
        knn = KNN(k=3)
        nb = NaiveBayes(num_classes=2)

        # Random Forest fit and predict
        rf.create_splits(X)
        rf.fit(X, y)
        rf_pred = rf.predict(X)

        # K-Nearest Neighbors fit and predict
        knn.fit(X, y)
        knn_pred = knn.predict(X)

        # Naive Bayes fit and predict
        nb.fit(X, y)
        nb_pred = nb.predict(X)

        # use predictions from the base models as features for the meta-classifier
        meta_input = np.column_stack((rf_pred, knn_pred, nb_pred))

        # use Decision Tree as meta-classifier
        dt = DecisionTree(max_depth=np.inf)
        dt.fit(meta_input, y)

        self.rf = rf
        self.knn = knn
        self.nb = nb
        self.meta_classifier = dt
Example #10
    def eval_model(case):
        l, k = case
        results = {'precision': [], 'recall': [], 'f1': []}

        model = KNN(l, k)

        for i in range(folds):
            print(l, k, 'cross validation', i)

            training, testing = split_data(corpus, i, folds)

            print(l, k, 'fit model', i)
            model.fit([d.vector for d in training],
                      [d.label for d in training])

            print(l, k, 'predict', i)
            preds = [model.predict(d.vector) for d in testing]

            labels = [d.label for d in testing]

            metrics = model_metrics(labels, preds)
            for m, key in zip(metrics, ['precision', 'recall', 'f1']):
                results[key].append(m)

        print(l, k, mean(results['precision']), mean(results['recall']),
              mean(results['f1']))

        return results
Example #11
def knn(corpus, idf):
    query = read_folder('./query')
    tf_idf(query, idf)

    print('fit KNN model')

    classifier = KNN(5, 5)
    classifier.fit([d.vector for d in corpus], corpus)

    start_time = time.time()

    for i, d in enumerate(query):
        print('Query Doc', i)
        print(d.features)

        # neighbors = classifier.brute_force(d.vector)
        neighbors = classifier.neighbors(d.vector)
        print('Query Neighbors', i)

        for n in neighbors:
            print(n.features)
            print('\n')

        print('\n')

    print("--- %s seconds ---" % (time.time() - start_time))
Example #12
    def knn_validate(data, kernel, metric, k_neighbors, show_plot):
        plot = Plot()
        matrix_full = [[0, 0], [0, 0]]
        y_predict_arr = []
        for i in range(len(data)):
            data.updateTrainTest(i)
            trainDots, trainClass = data.getDotsByMode('train', False)
            testDots, testClass = data.getDotsByMode('test', False)

            knn = KNN(kernel=kernel, metric=metric, neighbors=k_neighbors)
            knn.fit(trainDots, trainClass)
            y_predict, distance = knn.predict(testDots)
            y_predict_arr.append(y_predict[0])

            if show_plot:
                tDots = np.array(trainDots)
                tCls = np.array(trainClass)
                plot.knn(tDots[tCls == 1.0], tDots[tCls == -1.0],
                         distance, testDots[0], y_predict[0])

            matrix = get_metrics(y_predict, testClass)
            matrix_full[0][0] += matrix[0][0]
            matrix_full[0][1] += matrix[0][1]
            matrix_full[1][0] += matrix[1][0]
            matrix_full[1][1] += matrix[1][1]

        return y_predict_arr, get_f_measure(matrix_full), matrix_full
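
The kernel argument above suggests distance-weighted voting. A minimal sketch of a kernel-weighted kNN vote, assuming kernel maps a normalized distance in [0, 1] to a weight (illustrative, not the class used here):

import numpy as np

def kernel_knn_vote(X_train, y_train, x, k, kernel):
    # Weight each of the k nearest neighbors by kernel(d / d_max),
    # sum the weights per class, and return the heaviest class.
    dists = np.linalg.norm(np.asarray(X_train, dtype=float) -
                           np.asarray(x, dtype=float), axis=1)
    nearest = np.argsort(dists)[:k]
    d_max = dists[nearest][-1] if dists[nearest][-1] > 0 else 1.0
    votes = {}
    for idx in nearest:
        w = kernel(dists[idx] / d_max)
        votes[y_train[idx]] = votes.get(y_train[idx], 0.0) + w
    return max(votes, key=votes.get)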
Example #13
def main():
    K = [1, 2, 3, 5, 7, 9, 11, 13, 15]

    # load CM1
    data = arff.loadarff('./datasets/CM1.arff')

    X, Y = build_dataframe(data)

    # normalize data
    X = normalize_data(X)

    # create k-fold splits
    kf = KFold(n_splits=10)

    # instantiate a classifier for each k
    for k in K:
        clf = KNN(k=k)
        print("k equals {}".format(k))

        start_time = time.time()
        acc = []
        for train, test in kf.split(X):

            clf.fit(X.iloc[train], Y.iloc[train])
            predictions = clf.predict(X.iloc[test])
            acc.append((np.sum(predictions == Y.iloc[test]) / len(test)) * 100)

        end_time = time.time()

        acc = np.array(acc)

        print("mean accuracy: {}".format(np.mean(acc)))
        print("standard deviation: {}".format(np.std(acc)))
        print("time elapsed: {}".format(end_time - start_time))
Example #14
    def test_iris_regression(self):
        """
        Tests kNN for regression
        """

        k = 1
        iris_dataset = load_iris()

        knn = KNN(k, "average")

        # get petal length as input
        # ensure this is 2D
        X = iris_dataset.data[:, 2].reshape(-1, 1)

        # get petal width as output
        y = iris_dataset.data[:, 3]

        knn.fit(X, y)
        predicted = knn.predict(X)

        # verify shape of output
        self.assertEqual(len(predicted.shape), 1)
        self.assertEqual(predicted.shape[0], iris_dataset.data.shape[0])

        # with k=1 each point should match itself, but with only
        # one feature some points share identical values, so allow
        # a small amount of error
        mse = mean_squared_error(y, predicted)
        self.assertLess(mse, 0.1)
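
The "average" mode passed to KNN here implies the prediction is the mean target of the k nearest neighbors. A self-contained sketch of that reduction, assuming Euclidean distance (illustrative only):

import numpy as np

def knn_regress(X_train, y_train, x, k=1):
    # Predict the mean target value of the k nearest training points.
    dists = np.linalg.norm(X_train - x, axis=1)
    nearest = np.argsort(dists)[:k]
    return y_train[nearest].mean()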
Example #15
def calc_accuracy_multiclass(train_X, train_y, test_X, test_y, num_folds, K):
    knn_classifier = KNN(k=K)
    knn_classifier.fit(train_X, train_y)
    predict = knn_classifier.predict(test_X)
    # print('predicted', predict)
    # print('real value', test_y)
    accuracy = multiclass_accuracy(predict, test_y)
    print("Accuracy: %4.2f" % accuracy)
    return accuracy
Example #16
def rerank_results(feedback, similar_images, similar_image_vectors,
                   query_image_vector):
    global feedback_imgs_g, feedback_vals_g, similar_images_g, similar_image_vectors_g
    similar_images_g = similar_images
    similar_image_vectors_g = similar_image_vectors

    # Add DT based relevance feedback function
    clf = DecisionTree()
    feedback_imgs = list(feedback.keys())

    feedback_vals = list(feedback.values())
    x_train_old, y_train = get_training_set(feedback_imgs, feedback_vals)
    x_train = [i.tolist() for i in x_train_old]

    clf.fit(x_train, y_train)
    # x_test = similar_image_vectors_g.values()
    x_test = [i.tolist() for i in similar_image_vectors_g.values()]

    predictions = clf.predict(x_test)
    # relevant images
    indices_rel = [i for i, x in enumerate(predictions) if x == 1]
    print("Relevant", indices_rel)
    rel_len = len(indices_rel)
    x_train_knn_rel = [x_test[i] for i in indices_rel]
    knn = KNN(rel_len)
    #knn = KNeighborsClassifier(n_neighbours=rel_len)
    knn.fit(x_train_knn_rel)
    neighbours_rel = knn.get_neighbours([query_image_vector])
    print("Neighbours Rel", neighbours_rel)
    # irrelevant images
    indices_ir = [i for i, x in enumerate(predictions) if x == -1]
    print("Irrelevant", indices_ir)
    ir_len = len(indices_ir)
    x_train_knn_ir = [x_test[i] for i in indices_ir]
    knn = KNN(ir_len)
    knn.fit(x_train_knn_ir)
    neighbours_ir = knn.get_neighbours([query_image_vector])
    print("Neighbours IR", neighbours_ir)
    ranked_indices = []
    ranked_indices.extend(indices_rel)
    ranked_indices.extend(indices_ir)
    rel_similar_images = [
        list(similar_image_vectors_g.keys())[index] for index in ranked_indices
    ]
    return rel_similar_images
Example #17
    def test_fit(self):
        """
        Test KNN.fit is actually storing the training data
        """
        x_train = np.array([[1, 2], [1, 3], [2, 2], [2, 3], [1, 1], [2, 1]])
        y_train = np.array([1, 1, 1, -1, -1, -1])

        model = KNN()
        model.fit(x_train, y_train)

        self.assertTrue(np.array_equal(x_train, model.x_train))
        self.assertTrue(np.array_equal(y_train, model.y_train))
Example #18
def run_knn(data, target_column):
    st.sidebar.title('Choose parameters for KNN')
    ts = st.sidebar.slider('Training size', min_value=0.0, max_value=1.0, step=0.01, value=0.7)
    k = st.sidebar.number_input('k', min_value=1, max_value=int(len(data)*ts), step=1, value=3)
    run_status = st.sidebar.button('Run Algorithm')
    if run_status:
        with st.spinner('Running...'):
            x_train, x_test, y_train, y_test = train_test_split(data.drop([target_column], axis=1),
                                                                data[target_column],
                                                                test_size=1 - ts)
            clf = KNN(k=k)
            clf.fit(x_train, y_train)
            """
            ## :dart: Accuracy
            """
            st.subheader(accuracy_score(y_test, clf.predict(x_test)))
Example #19
def cross_validation(corpus, idf):
    nb_results = {'precision': [], 'recall': [], 'f1': []}

    knn_results = {'precision': [], 'recall': [], 'f1': []}

    vocab = sorted(idf.keys())

    random.shuffle(corpus)

    for i in range(10):
        print('cross validation', i)

        training, testing = split_data(corpus, i, 10)

        nb = NaiveBayes(training, vocab, 0.1)
        knn = KNN(5, 5)
        knn.fit([d.vector for d in training], [d.label for d in training])

        labels = [d.label for d in testing]
        nb_preds = [nb.predict(d) for d in testing]
        knn_preds = [knn.predict(d.vector) for d in testing]

        metrics = model_metrics(labels, nb_preds)
        for m, k in zip(metrics, ['precision', 'recall', 'f1']):
            nb_results[k].append(m)

        metrics = model_metrics(labels, knn_preds)
        for m, k in zip(metrics, ['precision', 'recall', 'f1']):
            knn_results[k].append(m)

    for m in ['precision', 'recall', 'f1']:
        print('nb', m)
        print(nb_results[m])
        print(m, 'nb mean', mean(nb_results[m]))

        print('knn', m)
        print(knn_results[m])
        print(m, 'knn mean', mean(knn_results[m]))

        diff = [a - b for a, b in zip(nb_results[m], knn_results[m])]
        print(m, 'diff')
        print(diff)

        # paired t-statistic: mean difference divided by its standard error
        t = mean(diff) / (stdev(diff) / len(diff)**0.5)
        print(m, 't value:', t)
Example #20
    def test_iris_classification_loop(self):
        """
        Tests kNN for classification with loops
        """

        k = 1
        iris_dataset = load_iris()

        knn = KNN(k)
        knn.fit(iris_dataset.data, iris_dataset.target)
        predicted = knn.predict_loop(iris_dataset.data)

        # verify shape of output
        self.assertEqual(len(predicted.shape), 1)
        self.assertEqual(predicted.shape[0], iris_dataset.data.shape[0])

        # with k=1, each point should match itself
        accuracy = accuracy_score(iris_dataset.target, predicted)
        self.assertAlmostEqual(accuracy, 1.0)
Example #21
    def fit(self, X, y):
        N, D = X.shape
        rfModel = RandomForestClassifier(n_estimators=50)
        nbModel = NaiveBayes(num_classes=2)
        knnModel = KNN(3)

        knnModel.fit(X, y)
        knn_y_pred = knnModel.predict(X).astype(int)

        nbModel.fit(X, y)
        nb_y_pred = nbModel.predict(X).astype(int)

        rfModel.fit(X, y)
        rf_y_pred = rfModel.predict(X).astype(int)

        Xy_label_combined = np.array(
            (knn_y_pred, nb_y_pred, rf_y_pred)).transpose()

        self.Xy_label_combined = Xy_label_combined
        self.y = y
Example #22
def cosine():
    train_d, train_l, test_d, test_l = tt_split(data, label)
    # Initialize the KNN object
    knn = KNN(neighbors_num=5, distance=cosine_distance())
    # Store the training data in KNN
    knn.fit(train_d, train_l)
    # Get predictions from KNN
    result = knn.predict(test_d)

    # Print the results on screen as data, real label, predicted label.
    #print("%20s - %20s | %20s | %s" %("[Data]", "<Real Label>", "<Predicted Label>", "Truth"))

    n = 0
    for i, j, r in zip(test_d, test_l, result):
        truthness = (j == r)
        if truthness:
            n += 1
        print("%20s - %20s | %20s | %s" % (i, j, r, truthness))
    print("Acc:", n / len(test_d))
    return n / len(test_d)
Example #23
    def test_predict(self):
        knn = KNN(3)
        model = knn.fit(self.X, self.y)
        md = model.predict(self.X.iloc[4:])
        exp_md = pd.DataFrame({
            4: ['b', 6],
            5: ['b', 6],
            6: ['b', 6]
        }, index=[0, 1]).T
        pdt.assert_frame_equal(exp_md, md)
Example #24
def test_knn():
    df = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
        header=None)
    y = df.iloc[0:100, 4].values
    y = np.where(y == 'Iris-setosa', -1, 1)
    # note: this replaces the setosa/versicolor labels with random 0/1 labels
    y = np.random.randint(2, size=100)
    x = df.iloc[0:100, [0, 2]].values
    print("Testing 2-D Iris data set with only one neighbor...")
    neighbor = KNN(k=1)
    neighbor.fit(x, y)
    neighbor.plot(x, y)
    print("Testing Iris data set with 15 neighbor...")
    iris = datasets.load_iris()
    x = iris.data[:, :2]
    y = iris.target
    neighbor = KNN(15)
    neighbor.fit(x, y)
    y_pred = neighbor.predict(x)
    neighbor.accuracy(y_pred, y)
    neighbor.plot(x, y)
    print(
        "Adding a new point to the dataset and testing the full Iris data set with k=1..."
    )
    neighbor = KNN(1)
    neighbor.fit(x, y)
    y2 = np.array([1])
    y2 = np.append(y, y2)
    x2 = np.vstack([x, [5.0, 3.2]])
    neighbor.plot(x2, y2)
    print("Testing SKLearn's model...")
    clf = neighbors.KNeighborsClassifier(1)
    clf.fit(x, y)
    plot_decision_regions(x2, y2, clf)
Example #25
def plot_knn():
    np.random.seed(12345)
    fig, axes = plt.subplots(4, 4)
    for i, ax in enumerate(axes.flatten()):
        n_in = 1
        n_out = 1
        d = np.random.randint(1, 5)
        n_ex = np.random.randint(5, 500)
        std = np.random.randint(0, 1000)
        intercept = np.random.rand() * np.random.randint(-300, 300)
        X_train, y_train, X_test, y_test, coefs = random_regression_problem(
            n_ex, n_in, n_out, d=d, intercept=intercept, std=std, seed=i)

        LR = LinearRegression(fit_intercept=True)
        LR.fit(X_train, y_train)
        y_pred = LR.predict(X_test)
        loss = np.mean((y_test.flatten() - y_pred.flatten())**2)

        knn_1 = KNN(k=1, classifier=False, leaf_size=10, weights="uniform")
        knn_1.fit(X_train, y_train)
        y_pred_1 = knn_1.predict(X_test)
        loss_1 = np.mean((y_test.flatten() - y_pred_1.flatten())**2)

        knn_5 = KNN(k=5, classifier=False, leaf_size=10, weights="uniform")
        knn_5.fit(X_train, y_train)
        y_pred_5 = knn_5.predict(X_test)
        loss_5 = np.mean((y_test.flatten() - y_pred_5.flatten())**2)

        knn_10 = KNN(k=10, classifier=False, leaf_size=10, weights="uniform")
        knn_10.fit(X_train, y_train)
        y_pred_10 = knn_10.predict(X_test)
        loss_10 = np.mean((y_test.flatten() - y_pred_10.flatten())**2)

        xmin = min(X_test) - 0.1 * (max(X_test) - min(X_test))
        xmax = max(X_test) + 0.1 * (max(X_test) - min(X_test))
        X_plot = np.linspace(xmin, xmax, 100)
        y_plot = LR.predict(X_plot)
        y_plot_1 = knn_1.predict(X_plot)
        y_plot_5 = knn_5.predict(X_plot)
        y_plot_10 = knn_10.predict(X_plot)

        ax.scatter(X_test, y_test, alpha=0.5)
        ax.plot(X_plot, y_plot, label="OLS", alpha=0.5)
        ax.plot(X_plot, y_plot_1, label="KNN (k=1)", alpha=0.5)
        ax.plot(X_plot, y_plot_5, label="KNN (k=5)", alpha=0.5)
        ax.plot(X_plot, y_plot_10, label="KNN (k=10)", alpha=0.5)
        ax.legend()
        #  ax.set_title(
        #      "MSE\nLR: {:.2f} KNN-1: {:.2f}\nKNN-5: {:.2f} KNN-10: {:.2f}".format(
        #          loss, loss_1, loss_5, loss_10
        #      )
        #  )

        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])

    plt.tight_layout()
    plt.savefig("img/knn_plots.png", dpi=300)
    plt.close("all")
Example #26
    def test_query(self):
        knn = KNN(3)
        model = knn.fit(self.X, self.y)
        gen = model.query(self.X.iloc[4:])
        dist, md = next(gen)
        exp_dist = np.array([
            0,
            euclidean(self.X.iloc[4], self.X.iloc[6]),
            euclidean(self.X.iloc[4], self.X.iloc[5])
        ])
        exp_md = pd.DataFrame([['b', 5], ['b', 7], ['b', 6]], index=[4, 6, 5])
        npt.assert_allclose(exp_dist, dist)
        pdt.assert_frame_equal(exp_md, md)
Example #27
def plot(h=.02):
    _data = [[i[0], i[3]] for i in data]
    # Split the data into train and test parts (manual per-class split)
    # train_d, train_l, test_d, test_l = tt_split(_data, label)
    train_d, train_l, test_d, test_l = (_data[0:30] + _data[50:80] +
                                        _data[100:130], label[0:30] +
                                        label[50:80] + label[100:130],
                                        _data[30:50] + _data[80:100] +
                                        _data[130:], label[30:50] +
                                        label[80:100] + label[130:])
    # Initialize the KNN object
    knn = KNN(neighbors_num=3, distance=cosine_distance())
    # Fill the data in KNN
    knn.fit(train_d, train_l)

    _t = np.array(train_d)
    x_min, x_max = _t[:, 0].min() - .2, _t[:, 0].max() + .2
    y_min, y_max = _t[:, 1].min() - .2, _t[:, 1].max() + .2
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = np.c_[xx.ravel(), yy.ravel()]
    Z = np.array(knn.predict(Z))
    Z = Z.reshape(xx.shape)
    print(Z)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    # Plot also the training points
    plt.scatter(_t[:, 0],
                _t[:, 1],
                c=train_l,
                cmap=cmap_bold,
                edgecolor='k',
                s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.show()
Example #28
# https://deeplearningcourses.com/c/data-science-supervised-machine-learning-in-python
# https://www.udemy.com/data-science-supervised-machine-learning-in-python
from knn import KNN
from util import get_xor
import matplotlib.pyplot as plt

if __name__ == '__main__':
    X, Y = get_xor()

    # display the data
    plt.scatter(X[:, 0], X[:, 1], s=100, c=Y, alpha=0.5)
    plt.show()

    # get the accuracy
    model = KNN(3)
    model.fit(X, Y)
    print "Accuracy:", model.score(X, Y)
Example #29
# https://deeplearningcourses.com/c/data-science-supervised-machine-learning-in-python
# https://www.udemy.com/data-science-supervised-machine-learning-in-python
from __future__ import print_function, division
from builtins import range, input
# Note: you may need to update your version of future
# sudo pip install -U future


from knn import KNN
from util import get_xor
import matplotlib.pyplot as plt

if __name__ == '__main__':
    X, Y = get_xor()

    # display the data
    plt.scatter(X[:,0], X[:,1], s=100, c=Y, alpha=0.5)
    plt.show()

    # get the accuracy
    model = KNN(3)
    model.fit(X, Y)
    print("Accuracy:", model.score(X, Y))
Example #30
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from knn import KNN

iris = datasets.load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = KNN(k=3)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

accuracy = np.sum(predictions == y_test) / len(y_test)
print(accuracy * 100)

# my_cmap = ListedColormap(["#FF781F", "#149414", "#52307C"])
# plt.figure()
# # Displaying 2 out of 4 features so that we can see in 2D
# plt.scatter(X[:, 0], X[:, 1], c=y, cmap=my_cmap, edgecolor='k', s=20)
# plt.show()
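
The knn module imported throughout these examples is a custom implementation rather than scikit-learn. A minimal sketch of a classifier compatible with the fit/predict calls above, assuming Euclidean distance and majority voting (an illustration, not the original class):

import numpy as np
from collections import Counter

class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        # kNN is lazy: fitting just stores the training data
        self.x_train = np.asarray(X, dtype=float)
        self.y_train = np.asarray(y)
        return self

    def predict(self, X):
        preds = []
        for x in np.asarray(X, dtype=float):
            dists = np.linalg.norm(self.x_train - x, axis=1)
            nearest = np.argsort(dists)[:self.k]
            # majority vote among the k nearest training labels
            preds.append(Counter(self.y_train[nearest]).most_common(1)[0][0])
        return np.array(preds)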
Example #31
        clf.fit(X, y)
        print("Naive Bayes (sklearn) validation error: %.3f" %
              (1 - clf.score(X, y)))

    elif question == '3':
        with open(os.path.join('..', 'data', 'citiesSmall.pkl'), 'rb') as f:
            dataset = pickle.load(f)

        X = dataset['X']
        y = dataset['y']
        Xtest = dataset['Xtest']
        ytest = dataset['ytest']

        for i in [1]:
            knn = KNN(i)
            knn.fit(X, y)
            print("Training")
            tr_prediction = knn.predict(X)
            knn.getError(tr_prediction, y)
            print("Testing")
            prediction = knn.predict(Xtest)
            knn.getError(prediction, ytest)

        utils.plotClassifier(knn, X, y)
        neigh = KNeighborsClassifier(n_neighbors=1)
        neigh.fit(X, y)
        # print(neigh.predict(ytest))
        print("Sklearn KNN: {}".format(1 - neigh.score(X, y)))
        utils.plotClassifier(neigh, X, y)

    elif question == '4':
Example #32
    if elem[2]*av_score[elem[1]-1] > 0 or (elem[2] == 0 and av_score[elem[1]-1] <= 0):
        accuracy += 1

print("Simple Accuracy:", np.around(100.0*accuracy/len(validationData)), "%")

############# PERSONAL PREF #############
print(20 * "#", "Personal Pref", 20 * "#")
jokeDataNew = list(jokeData)  # shallow copy so replacing rows leaves jokeData intact
# replace NaN by 0
for i in range(len(jokeData)):
    jokeDataNew[i] = [0 if np.isnan(x) else x for x in jokeData[i]]

for k in [10, 100, 1000]:
    print("K Value:", k)
    knn = KNN(k)
    knn.fit(jokeDataNew)
    neighbours = knn.neighbours
    av_score = []
    accuracy = 0
    for i in range(100):
        average_score = np.mean([jokeDataNew[ind] for ind in neighbours[i]], 0)
        av_score.append(average_score)

    for elem in validationData:
        if (elem[2]*av_score[elem[0]-1][elem[1]-1] > 0) or (elem[2] == 0 and av_score[elem[0]-1][elem[1]-1] < 0):
            accuracy += 1

    print("Pref Accuracy:", np.around(100.0*accuracy/len(validationData)), "%")

############# LATENT FACTOR ANALYSIS #############
print(20 * "#", "PCA", 20 * "#")