示例#1
0
class KNearest:
    """k-nearest-neighbours classifier that looks candidates up in a KDTree.

    Training samples are stored as ``(vector, label)`` pairs inside a
    ``KDTree`` (defined elsewhere in the project).  For every test vector the
    tree is asked for a bucket of candidates, the candidates are ranked by
    Euclidean distance, and the most frequent label among the
    ``k_neighbours`` closest ones is returned.

    Args:
        k_neighbours: Number of closest candidates that take part in the
            vote.  The same value is forwarded to ``KDTree``.
        dense: When false, every training row is converted with
            ``row.toarray()[0]`` and the test input is passed through the
            external ``toArray`` helper before use.  When true, the inputs
            are iterated as they are.
        balanced: Forwarded unchanged to ``KDTree`` as ``balanced=``.
    """

    def __init__(self, k_neighbours=5, dense=False, balanced=False):
        self.k_neighbours = k_neighbours
        self.dense = dense
        self.balanced = balanced
        # Populated by fit(); kept here so the attributes always exist.
        self.data = None
        self.label_set = set()

    def fit(self, X, y):
        """Store the training samples ``X`` with labels ``y`` in a KDTree.

        Args:
            X: Iterable of training rows.  If ``dense`` is false each row
                must provide ``toarray()``
                (presumably a scipy sparse row -- TODO confirm with callers).
            y: Iterable of labels, paired positionally with the rows of X.
        """
        self.label_set = set(y)
        data = []
        # The built-in zip works on Python 2 and 3; itertools.izip is
        # Python 2 only.
        for con, lab in zip(X, y):
            if not self.dense:
                # toarray() yields a 2-D array holding one row; keep the row.
                con = con.toarray()[0]
            data.append((con, lab))
        # Create a KDTree using the data given and store it
        self.data = KDTree(data, self.k_neighbours, balanced=self.balanced)

    def predict(self, X_test):
        """Return a list with one predicted label per vector in ``X_test``.

        ``fit`` must have been called first so that ``self.data`` holds the
        tree.
        """
        predictions = []
        if not self.dense:
            # NOTE(review): toArray is defined elsewhere; presumably it turns
            # the sparse input into dense rows -- confirm.
            X_test = toArray(X_test)
        for u in X_test:
            # Bucket of (vector, label) candidates to compare u with.
            neighbours = self.data.search(u)

            # Pair each candidate's distance to u with its label.
            dists = [(self.__distance(u, n[0]), n[1]) for n in neighbours]

            # Stable sort by distance so the k closest candidates come first.
            dists.sort(key=lambda dist: dist[0])
            nearest = dists[:self.k_neighbours]

            predictions.append(self.__findMajority(nearest))
        return predictions

    def __findMajority(self, dists):
        """Return the most frequent label among ``(distance, label)`` pairs.

        Ties go to the label that comes first when iterating
        ``self.label_set``.  If ``dists`` is empty, ``-1`` is returned.
        """
        # Start every known label at zero so the counting below never misses.
        labels = dict.fromkeys(self.label_set, 0)
        for _dist, lab in dists:
            labels[lab] += 1

        # Keep the first label whose count is strictly greater than the best
        # seen so far.  dict.items() replaces the Python 2 only iteritems().
        maxval = 0
        maxkey = -1
        for key, value in labels.items():
            if value > maxval:
                maxval = value
                maxkey = key
        return maxkey

    # Euclidean distance of 2 vectors
    def __distance(self, U, V):
        """Return the Euclidean distance between vectors ``U`` and ``V``.

        Components are paired with ``zip``, so surplus components of the
        longer vector are ignored.
        """
        return sqrt(sum((xu - xv) ** 2 for xu, xv in zip(U, V)))