Exemplo n.º 1
0
def knn(p, train, k):
    """Does a K-nearest-neighbors classification on
    a point p given a training set.

    Parameters:
    - p: Numpy array with the point to classify
    - train: An iterable object with (p, label) entries,
      where p is a numpy array representing a point
      in the training set, and label is the label for that
      point.
    - k: The number of neighbors to use.

    Returns:
    A tuple (max_label, neighbors).
    - max_label: the result of find_majority() applied to the labels
      of the neighbors that were found, or None when no neighbor was
      found (e.g. the training set is empty).
    - neighbors: a list of [distance, point, label] lists ordered from
      the farthest neighbor to the nearest one. It holds at most k
      entries, and fewer when the training set has fewer than k points.
    """

    # The query point does not change, so convert it only once.
    query = p.astype("float")

    # Max-heap (distances are negated, heapq being a min-heap) of the k
    # best candidates seen so far. Entries are
    # (-distance, sequence, point, label); the unique sequence number
    # settles ties between equal distances, so a comparison never falls
    # through to the numpy points (which would raise ValueError).
    # Placeholders carry negative sequence numbers and an infinite
    # distance so that any real point replaces them.
    heap = [(-float("inf"), slot - k, None, None) for slot in range(k)]

    for seq, (pt, label) in enumerate(train):
        dist = distance(query, pt)

        # heap[0] is the farthest of the current candidates.
        if dist < -heap[0][0]:
            heapq.heapreplace(heap, (-dist, seq, pt, label))

    # Drop the placeholders that were never replaced so they neither
    # take part in the vote nor leak into the result.
    found = [entry for entry in heap if entry[1] >= 0]

    labels = [label for _, _, _, label in found]
    max_label = find_majority(labels) if labels else None

    # Ascending order on the negated distance: farthest neighbor first.
    found.sort()
    neighbors = [[-neg_dist, pt, label] for neg_dist, _, pt, label in found]

    return (max_label, neighbors)
Exemplo n.º 2
0
    def get_k_means(self, centroids):
        """Run one k-means step: assign points to centroids, then re-average.

        Parameters:
        - centroids: Sequence of the current centroids; expected to hold
          self.k entries, each comparable to a point through distance().

        Returns:
        A tuple (new_centroids, point_assignment).
        - new_centroids: float numpy array with one row per cluster,
          holding the mean of the points assigned to that cluster. A
          cluster that received no point keeps its previous centroid
          instead of becoming NaN through a division by zero.
        - point_assignment: list with, for each point of self.points (in
          order), the index of its closest centroid.
        """
        # Assignment step: each point goes to its closest centroid.
        point_assignment = [
            numpy.argmin([distance(point, c) for c in centroids])
            for point in self.points
        ]

        # Update step: accumulate per-cluster sums and member counts.
        new_centroids = numpy.zeros((self.k, self.points.shape[1]))
        points_in_cluster = [0] * self.k
        for point, cluster in zip(self.points, point_assignment):
            new_centroids[cluster] += point
            points_in_cluster[cluster] += 1

        for cluster, count in enumerate(points_in_cluster):
            if count:
                new_centroids[cluster] /= count
            else:
                # Empty cluster: there is nothing to average, so leave
                # the centroid where it was.
                new_centroids[cluster] = centroids[cluster]

        return new_centroids, point_assignment
Exemplo n.º 3
0
points = data.to_array()

km = KMeans(points, args.k)

# Start from randomly selected centroids and refine them until the largest
# centroid shift of an iteration is no greater than args.cutoff.
centroids = km.select_random_centroids()

iteration = 1
while True:
    # Only the first two coordinates of each centroid are printed.
    # The single-argument print(...) form behaves the same on Python 2 and 3.
    print("Iteration #%i with centroids %s"
          % (iteration, [(x[0], x[1]) for x in centroids]))

    new_centroids, point_assignment = km.get_k_means(centroids)

    # Plots are shown while iterating only when no output file was requested.
    if args.outfile is None:
        gen_scatter_plot(points, point_assignment, centroids)

    # How far each centroid moved during this iteration.
    distances = [distance(c1, c2) for c1, c2 in zip(centroids, new_centroids)]
    max_d = max(distances)

    if max_d <= args.cutoff:
        print("Max shift: %.2f <= %.2f" % (max_d, args.cutoff))
        print("Algorithm has converged.")
        if args.outfile is None:
            gen_scatter_plot(points, point_assignment, new_centroids)
        break
    else:
        print("Max shift: %.2f > %.2f" % (max_d, args.cutoff))
        print("Algorithm hasn't converged yet.")
        centroids = new_centroids
        iteration += 1

if args.outfile != None:
Exemplo n.º 4
0
km = KMeans(points, args.k)

# Start from randomly selected centroids and refine them until the largest
# centroid shift of an iteration is no greater than args.cutoff.
centroids = km.select_random_centroids()

iteration = 1
while True:
    # Only the first two coordinates of each centroid are printed.
    # The single-argument print(...) form behaves the same on Python 2 and 3.
    print("Iteration #%i with centroids %s" % (iteration, [(x[0], x[1])
                                                           for x in centroids]))

    new_centroids, point_assignment = km.get_k_means(centroids)

    # Plots are shown while iterating only when no output file was requested.
    if args.outfile is None:
        gen_scatter_plot(points, point_assignment, centroids)

    # How far each centroid moved during this iteration.
    distances = [distance(c1, c2) for c1, c2 in zip(centroids, new_centroids)]
    max_d = max(distances)

    if max_d <= args.cutoff:
        print("Max shift: %.2f <= %.2f" % (max_d, args.cutoff))
        print("Algorithm has converged.")
        if args.outfile is None:
            gen_scatter_plot(points, point_assignment, new_centroids)
        break
    else:
        print("Max shift: %.2f > %.2f" % (max_d, args.cutoff))
        print("Algorithm hasn't converged yet.")
        centroids = new_centroids
        iteration += 1

if args.outfile != None: