import numpy as np
import matplotlib.pyplot as plt
from classify import classify
from file_import import file_import
from nearest_points import nearest_points_naive_l1
from nearest_points import nearest_points_naive_sup
from nearest_points import nearest_points_opt_l1
from nearest_points import nearest_points_opt_sup
from scipy.spatial import KDTree

# Load both data files and pick one random test row to inspect.
train = file_import("bananas-2-2d.train.csv")
test = file_import("bananas-2-2d.test.csv")
# np.int was removed in NumPy 1.24; the builtin int truncates the same way.
test_index = int(np.random.rand() * len(test))
k = 100

# Figure 1: the k training rows returned by nearest_points_naive_sup in red,
# all other training rows in black, the chosen test row in blue.
plt.figure(1)
nearest = nearest_points_naive_sup(test[test_index, :], train, k)
# Complement of `nearest` via a boolean mask instead of scanning `nearest`
# once per training index.
is_rest = np.ones(len(train), dtype=bool)
is_rest[nearest] = False
rest = np.flatnonzero(is_rest)
plt.scatter(train[nearest, 1], train[nearest, 2], s=0.6, c='r')
plt.scatter(train[rest, 1], train[rest, 2], s=0.6, c='k')
plt.scatter(test[test_index, 1], test[test_index, 2], s=1.5, c='b')

# Figure 2: same picture, neighbours taken from nearest_points_naive_l1.
plt.figure(2)
nearest = nearest_points_naive_l1(test[test_index, :], train, k)
is_rest = np.ones(len(train), dtype=bool)
is_rest[nearest] = False
rest = np.flatnonzero(is_rest)
plt.scatter(train[nearest, 1], train[nearest, 2], s=0.6, c='r')
plt.scatter(train[rest, 1], train[rest, 2], s=0.6, c='k')
plt.scatter(test[test_index, 1], test[test_index, 2], s=1.5, c='b')

# Figure 3: neighbours from nearest_points_opt_sup.
# NOTE(review): the plotting calls for this figure are missing from the file.
plt.figure(3)
nearest = nearest_points_opt_sup(test[test_index, :], train, k)
def classify(nametest, nametrain, Kset, l):
    """Print the l-fold cross-validation error of a k-NN vote for each k in Kset.

    The training rows are cut into ``l`` consecutive blocks of ``n // l`` rows.
    For every row of a block, the ``max(Kset)`` nearest rows among all rows
    outside that block are looked up once. For each ``k`` the row is then
    predicted as the sign of the summed labels (column 0) of its first ``k``
    neighbours, and the misclassification rate is averaged over the blocks.

    Rows beyond ``l * (n // l)`` are never scored, but they do stay in the
    comparison set of every block. A tied vote (sum 0) counts as an error
    because 0 never equals a label.

    NOTE(review): ``nearest_points_naive_sup_2`` is not imported anywhere in
    the visible part of the file; it is presumed to return the indices of the
    nearest rows ordered by increasing distance - confirm.
    NOTE(review): a later ``def classify`` in this file rebinds this name.

    Args:
        nametest: Accepted for interface compatibility; the test file is not
            needed for the cross-validation and is no longer loaded.
        nametrain: Path handed to ``file_import`` to load the training rows.
        Kset: Re-iterable collection of neighbour counts to evaluate.
        l: Number of cross-validation blocks.

    Returns:
        None. Prints, in order: the neighbour-search time, the evaluation
        time, the error rate per k, the absolute differences between
        consecutive error rates, and the position in ``Kset`` of the
        smallest error rate.

    Raises:
        ValueError: If there are fewer than ``l`` training rows, so the
            blocks would be empty.
    """
    import time  # local import: `time` is not imported above this definition

    train = file_import(nametrain)
    n = len(train)
    k_max = int(max(Kset))
    block_size = n // l
    if block_size == 0:
        raise ValueError("need at least l training rows to build l blocks")

    labels = train[:, 0]
    points = train[:, 1:]  # coordinates only, label column stripped
    index_array = np.zeros((n, k_max), dtype=int)

    tic = time.time()
    for i in range(l):
        start = i * block_size
        stop = start + block_size
        # Everything outside block i, in original order.
        others = np.vstack((points[:start, :], points[stop:, :]))
        for row in range(start, stop):
            index_array[row, :] = nearest_points_naive_sup_2(
                points[row, :], others, k_max)
    toc = time.time()
    print("%.10f seconds" % (toc - tic))

    list_ks = []
    tic = time.time()
    for k in Kset:
        fold_errors = []
        for i in range(l):
            start = i * block_size
            stop = start + block_size
            # Labels aligned with the `others` array used for the lookup above.
            other_labels = np.concatenate((labels[:start], labels[stop:]))
            # Rows start..stop-1 hold block i's neighbours. The previous code
            # read rows block_size..2*block_size-1 for every block.
            neighbours = index_array[start:stop, :int(k)]
            votes = np.sign(other_labels[neighbours].sum(axis=1))
            wrong = int(np.count_nonzero(labels[start:stop] != votes))
            # Divide by the block length. The previous code divided by twice
            # that and so reported half the true rate.
            fold_errors.append(wrong / block_size)
        list_ks.append(sum(fold_errors) / l)
    toc = time.time()
    print("%.10f seconds" % (toc - tic))
    print(list_ks)
    print([np.abs(a - b) for a, b in zip(list_ks, list_ks[1:])])
    print(list_ks.index(min(list_ks)))
def classify(file_name, KSET, l):
    """Choose k by l-fold cross-validation, label the test set, write the result.

    Reads ``file_name + ".train.csv"`` and ``file_name + ".test.csv"`` via
    ``file_import``. Column 0 of every row is treated as the class label.

    Steps:
      1. Cut the first ``l * (n // l)`` training rows into ``l`` consecutive
         blocks; the comparison set of block ``i`` is all other blocks.
      2. For each block row, look up its ``max(KSET)`` nearest comparison
         rows with ``nearest_points_opt_l1``.
      3. For each candidate ``k`` in ``KSET``, predict every block row as
         the sign of the summed labels of its first ``k`` neighbours (ties
         give +1) and count the errors over all blocks.
      4. ``k_stern`` is the candidate with the fewest errors (the smallest
         such ``k`` on ties).
      5. Each test row gets one vote per comparison set from its ``k_stern``
         nearest rows; the final label is the sign of the summed votes
         (ties give +1).

    Behaviour fixed relative to the previous version: only the values listed
    in ``KSET`` are candidates (before, the first ``len(KSET)`` columns were
    evaluated and the rest kept error 0), and ``k_stern`` is a neighbour
    count rather than a 0-based column index.

    NOTE(review): step 3 relies on ``nearest_points_opt_l1`` returning
    indices ordered by increasing distance - confirm. Query rows are passed
    with their label column, as before.

    Args:
        file_name: Common prefix of the ``.train.csv`` / ``.test.csv`` input
            files and of the ``.result.csv`` output file.
        KSET: Collection of positive neighbour counts to try.
        l: Number of cross-validation blocks.

    Returns:
        The test array with column 0 overwritten by the predicted labels;
        the same rows are written to ``file_name + ".result.csv"``.

    Raises:
        ValueError: If ``KSET`` contains a value below 1 or there are fewer
            than ``l`` training rows.
    """
    import csv  # local imports: neither module is imported above this definition
    import time

    tic = time.time()
    k_max = int(max(KSET))
    candidate_ks = sorted({int(k) for k in KSET})
    if candidate_ks[0] < 1:
        raise ValueError("KSET must only contain positive neighbour counts")
    test = file_import(file_name + ".test.csv")  # full array, label column included
    train = file_import(file_name + ".train.csv")  # likewise
    n = train.shape[0]  # number of training rows
    m = train.shape[1]  # number of columns, label column included
    block_size = n // l
    if block_size == 0:
        raise ValueError("need at least l training rows to build l blocks")
    used_rows = l * block_size  # trailing remainder rows are ignored
    # Row r holds the k_max nearest comparison-row indices of training row r.
    index_array = np.zeros((n, k_max), dtype=int)
    D_i_array = np.zeros((l, block_size, m))  # rows to classify, per block
    D_strich_i_array = np.zeros(
        (l, block_size * (l - 1), m))  # rows to compare against, per block
    for i in range(l):
        start = i * block_size
        stop = start + block_size
        D_i_array[i] = train[start:stop, :]
        D_strich_i_array[i] = np.vstack(
            (train[:start, :], train[stop:used_rows, :]))
    toc = time.time()
    print("Initialisierung : %.10f seconds" % (toc - tic))

    tic = time.time()
    for i in range(l):
        for j in range(block_size):
            index_array[block_size * i + j, :] = nearest_points_opt_l1(
                D_i_array[i, j, :], D_strich_i_array[i, :, :], k_max)
    toc = time.time()
    print("Nächste Nachbarn in train : %.10f seconds" % (toc - tic))

    tic = time.time()
    # error_counts[c] = misclassified block rows when voting with c + 1 neighbours.
    error_counts = np.zeros(k_max, dtype=int)
    for i in range(l):
        rows = index_array[i * block_size:(i + 1) * block_size, :]
        neighbour_labels = D_strich_i_array[i, :, 0][rows]
        running_vote = np.cumsum(neighbour_labels, axis=1)
        predicted = np.where(running_vote >= 0, 1, -1)  # tie -> +1
        truth = D_i_array[i, :, 0][:, None]
        error_counts += (predicted != truth).sum(axis=0)
    error_rates = error_counts / (l * block_size)
    # Only columns that correspond to a requested k may win.
    candidate_columns = np.array(candidate_ks) - 1
    best_column = int(
        candidate_columns[np.argmin(error_counts[candidate_columns])])
    k_stern = best_column + 1
    toc = time.time()
    print("Bestimmung von k_stern : %.10f seconds" % (toc - tic))
    print("k_stern = " + str(k_stern))
    print("Klassifikationsfehlerrate: " + str(error_rates[best_column]))

    o = len(test)
    test_index_array = np.zeros((l, o, k_stern), dtype=int)
    tic = time.time()
    for i in range(l):
        for j in range(o):
            test_index_array[i, j, :] = nearest_points_opt_l1(
                test[j, :], D_strich_i_array[i, :, :], k_stern)
    toc = time.time()
    print("Nächste Nachbarn von test : %.10f seconds" % (toc - tic))

    tic = time.time()
    vote_total = np.zeros(o)
    for i in range(l):
        label_sums = D_strich_i_array[i, :, 0][test_index_array[i]].sum(axis=1)
        vote_total += np.where(label_sums >= 0, 1, -1)  # tie -> +1
    # With an even l the votes can cancel; break that tie towards +1 as well.
    test_classification = np.where(vote_total >= 0, 1.0, -1.0)
    toc = time.time()
    print("Bestimmung der Klassifikation : %.10f seconds" % (toc - tic))

    test[:, 0] = test_classification
    with open(file_name + ".result.csv", 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(test)
    return test
Пример #4
0
import numpy as np
import time
from file_import import file_import
from nearest_points import nearest_points_naive_l1
from nearest_points import nearest_points_naive_sup
from nearest_points import nearest_points_opt_l1
from nearest_points import nearest_points_opt_sup
from scipy.spatial import KDTree

# Timing comparison: for every row of the data file, look up its k nearest
# rows within the same file and report the wall-clock time per method.
file_name = "ijcnn1.10000.train.csv"
k = 1

# Run 1: nearest_points_naive_l1.
data = file_import(file_name)
n = len(data)
result_array_1 = np.zeros((n, k))
tic = time.time()
for i, row in enumerate(data):
    result_array_1[i, :] = nearest_points_naive_l1(row, data, k)
toc = time.time()
print("l1-naiv : %.10f seconds" % (toc - tic))

# Run 2: nearest_points_naive_sup, on a freshly loaded copy of the data.
data = file_import(file_name)
n = len(data)
result_array_2 = np.zeros((n, k))
tic = time.time()
for i, row in enumerate(data):
    result_array_2[i, :] = nearest_points_naive_sup(row, data, k)
toc = time.time()
print("lsup-naiv : %.10f seconds" % (toc - tic))

# NOTE(review): the script ends here in this file; only the reload for a
# further run is present.
data = file_import(file_name)
Пример #5
0
import matplotlib.pyplot as plt
from classify import classify
from file_import import file_import 
import numpy as np

# Scatter plots of the data files, coloured by the label in column 0:
# label 1 in black, label -1 in red; columns 1 and 2 are the plot axes.
# NOTE(review): `set` shadows the builtin; the name is kept because the
# script may still use it after this section.
file_name = "bananas-2-2d"

# Figure 1: training data.
plt.figure(1)
set = file_import(file_name + ".train.csv")
list_1 = [i for i, row in enumerate(set) if row[0] == 1]
list_2 = [i for i, row in enumerate(set) if row[0] == -1]
plt.scatter(set[list_1, 1], set[list_1, 2], s=0.6, c="k", label="1")
plt.scatter(set[list_2, 1], set[list_2, 2], s=0.6, c="r", label="-1")
plt.title(file_name + ".train")
plt.legend(markerscale=7.5, title="Klassifikation")

# Figure 2: test data.
plt.figure(2)
set = file_import(file_name + ".test.csv")
list_1 = [i for i, row in enumerate(set) if row[0] == 1]
list_2 = [i for i, row in enumerate(set) if row[0] == -1]
plt.scatter(set[list_1, 1], set[list_1, 2], s=0.6, c="k", label="1")
plt.scatter(set[list_2, 1], set[list_2, 2], s=0.6, c="r", label="-1")
plt.title(file_name + ".test")
plt.legend(markerscale=7.5, title="Klassifikation")

# Figure 3: previously written result file (the classify call that would
# regenerate it is left disabled).
plt.figure(3)
#set = classify(file_name,  np.arange(1,200),  5)
set = file_import(file_name + ".result.csv")
list_1 = [i for i, row in enumerate(set) if row[0] == 1]
list_2 = [i for i, row in enumerate(set) if row[0] == -1]
plt.scatter(set[list_1, 1], set[list_1, 2], s=0.6, c="k", label="1")