Example #1
    def compute(self, kmpp=True):
        if kmpp:
            self.centers = KMeanspp(self.p, self.k, self.w.ravel(), n_init=self.ni).compute()
        else:
            self.centers = self._rand_seeds()
        self.centers = np.reshape(self.centers, (self.k, self.p.shape[1]))  # make sure the centers are (k, d)
            
        dist, d = utils.get_centers_d(self.p, self.centers)
        xcost = np.sum(utils.get_dist_to_centers(self.p, self.centers, d)*self.w)
        points = self.p
        weights = self.w.T
        for j in range(self.e):
            for i in range(self.k):
                mask = dist == i                      # points currently assigned to center i
                a = points[mask]
                w = weights[mask]
                if np.sum(w) == 0:                    # empty cluster: keep its previous center
                    print("cluster %d is empty, keeping its previous center" % i)
                    continue
                new_center = np.sum(a * w, axis=0, keepdims=True) / np.sum(w)
                self.centers[i] = new_center
            dist, d = utils.get_centers_d(self.p, self.centers)
            cost = np.sum(utils.get_dist_to_centers(self.p, self.centers, d)*self.w)
            if abs(xcost - cost) < self.epsilon:
                break
            xcost = cost

        return self.centers
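Every example here relies on the helper utils.get_dist_to_centers, whose implementation is not part of this listing. A minimal sketch of what the call sites appear to assume (squared Euclidean distance from each point to its nearest center, optionally reusing a precomputed distance matrix d) might look like the following; these function bodies are an assumption, not the project's actual code:

import numpy as np

def get_sq_distances(points, centers):
    # (n, k) matrix of squared Euclidean distances between points and centers
    diff = points[:, np.newaxis, :] - centers[np.newaxis, :, :]
    return np.sum(diff ** 2, axis=2)

def get_dist_to_centers(points=None, centers=None, d=None):
    # squared distance from each point to its nearest center;
    # a precomputed distance matrix d can be passed instead of points/centers
    if d is None:
        d = get_sq_distances(points, centers)
    return np.min(d, axis=1)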
Example #2
    def compute(self):
        best_cent = self.seed()
        if self.n_init == 1:
            return best_cent
        best_cost = np.sum(utils.get_dist_to_centers(self.p, best_cent) * self.w)
        for i in range(self.n_init - 1):
            cent = self.seed()
            cost = np.sum(utils.get_dist_to_centers(self.p, cent) * self.w)
            if cost < best_cost:
                best_cost = cost
                best_cent = cent
        return best_cent
Example #3
    def drop_half_points(self, points, weights, M):
        # keep only the points farther than the median distance to the centers M
        d = utils.get_dist_to_centers(points, M)
        median = np.median(d)
        points = points[d > median]
        if weights is not None:
            weights = weights[d > median]
        return points, weights
Example #4
    def compute(self, size):
        """
        self.points is a vector with n rows and d cols
        bi its a vector of with klogn rows and d dols
        dist(i) represents the sens(p_i) as in the formula discussed.
        """
        e = w_kmeans.Kmeans(self.points, np.expand_dims(self.weights, axis=0), self.k, 10)
        bi = e.compute()

        dist = utils.get_dist_to_centers(self.points, bi)  # distance of each point to its nearest center
        if self.weights is not None:  # weights are always set at this point
            dist /= np.sum(dist)  # normalize
        dist *= 2
        c = utils.get_centers(self.points, bi)  # index of the nearest center for each point
        c = self.find_cluester_size_weighted(c, W=self.weights)  # weighted size of each point's cluster
        dist += 4.0 / c  # add the cluster-size term of the formula to each point
        t = np.sum(dist * self.weights)
        weights = t / (dist * size)  # new weights for the sampled points
        dist *= self.weights
        dist /= np.sum(dist)
        prob = dist  # this is the sampling probability
        points, weights = utils.sample(self.points, prob, size, weights=weights)
        return points, weights
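For reference, the quantity assembled in dist above is the sensitivity bound these coreset examples use: roughly sens(p) = 2*d(p, B)/sum_q d(q, B) + 4/|cluster(p)|, where B is the center set bi and |cluster(p)| is the (here weighted) size of p's cluster. Points are then sampled with probability proportional to sens(p)*w(p), and a sampled point receives the new weight t/(sens(p)*size), with t = sum_p sens(p)*w(p). Example #5 below computes the same quantities with slightly different bookkeeping.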
Example #5
    def compute(self, size, grnds=10, ginit=1):
        q = w_KMeans.KMeans(self.p, np.expand_dims(self.w, axis=0), self.k, grnds, ginit).compute()  # weighted k-means run used for the coreset construction
        sq_d = utils.get_sq_distances(self.p, q)  # squared distances from each point to each center
        dist = utils.get_dist_to_centers(d=sq_d)  # squared distance from each point to its nearest center
        dist /= np.sum(dist)  # normalize
        dist *= 2  # according to the paper
        c = utils.get_centers(d=sq_d)  # index of the nearest center for each point
        c = self._find_cluster_size(c)  # size of that center's cluster, per point
        s = dist + 4.0/c  # the 4/|cluster| term is from the paper
        t = np.sum(s*self.w)  # this is the t from the paper
        u = t/(s*size)  # the new weights for the coreset
        prob = s*self.w/t  # the sampling probability
        p, w = utils.sample(self.p, size, prob=prob, weights=u)  # sample the coreset: points + weights
        return p, w
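Both coreset examples finish with utils.sample, which is not shown in this listing; note that the two call sites even pass the arguments in a different order, so the exact signature is unclear. Under the assumption that it draws size indices i.i.d. according to prob and returns the chosen points together with the supplied per-point weights, a minimal sketch (following Example #5's keyword style) is:

import numpy as np

def sample(points, size, prob=None, weights=None):
    # hypothetical sketch: importance sampling with replacement
    idx = np.random.choice(points.shape[0], size=size, p=prob, replace=True)
    w = weights[idx] if weights is not None else None
    return points[idx], w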
Example #6
    def drop_half_weighted_points(self, points, weights, M, W):
        # drop the points closest to the centers M until a total weight of W has been
        # removed; a point whose weight does not fit entirely keeps the remainder
        left = W
        points_to_drop = []
        d = utils.get_dist_to_centers(points, M)
        idx = np.argsort(d)
        i = 0
        while left > 0:
            index = idx[i]
            if weights[index] > left:
                weights[index] -= left
                left = 0
            else:
                left -= weights[index]
                points_to_drop.append(index)
            i += 1

        points = np.delete(points, points_to_drop, axis=0)
        weights = np.delete(weights, points_to_drop)
        return points, weights
Example #7
    def _compute_cost(self, p, means):
        return np.sum(utils.get_dist_to_centers(p, means))
Example #8
    def mapForCost(arr):
        # means1 is read from the enclosing scope
        return np.sum(utils.get_dist_to_centers(arr, means1))
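Since mapForCost captures means1 from the enclosing scope, it is presumably shipped to Spark workers and applied to blocks of points; a hypothetical usage (everything except mapForCost itself is an assumption) could be:

from pyspark import SparkContext
import numpy as np

sc = SparkContext(appName="cost_example")
blocks = sc.parallelize([np.random.rand(100, 2) for _ in range(4)])  # assumed: RDD of point blocks
total_cost = blocks.map(mapForCost).reduce(lambda a, b: a + b)       # sum of per-block costs
print(total_cost)
sc.stop()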
Example #9
t = 50
delta = 100
print "regressing sample size in [50, 2000] w/ jumps of", delta, "each w/", t, "trials..."
x = []
y = []
y_uni = []
for size in range(50, 2000, delta):
    c_mistake = 0
    u_mistake = 0
    x.append(size)
    print "size:", size, "trials",
    for i in range(0,t):
        s = np.random.choice(range(0,10030),size)
        s = p[s]
        centers = model.fit(s).cluster_centers_
        uni_cost = (np.sum(utils.get_dist_to_centers(p, centers)))
        u_mistake += (1 - cost/uni_cost)
        p_cset, w_cset = Coreset(p, 2, w).compute(size)
        e = w_KMeans.KMeans(p_cset, np.expand_dims(w_cset, axis=0), 2, 10)
        e = e.compute()
        res = (np.sum(utils.get_dist_to_centers(p, e)))
        c_mistake += (1-cost/res)
        sys.stdout.write(".")
        sys.stdout.flush()
    c_mistake /= t
    y.append(c_mistake)
    u_mistake /= t
    y_uni.append(u_mistake)
    print "mistakes for uniform:", round(u_mistake, 3), "coreset:", round(c_mistake, 3)
    u_mistake = c_mistake = 0
plt.plot(x, y, 'r')
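This driver references several names defined earlier in the script and not shown here: p (the full dataset, apparently with about 10030 rows), w (its weights), model (the estimator whose cluster_centers_ are read), and cost (the reference cost on the full data). Purely as an assumption for context, a setup consistent with the loop, using scikit-learn for the baseline clustering, could be:

import sys
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans as SKLearnKMeans

p = np.loadtxt("small_dataset.txt", dtype=np.float64)  # assumed data file, as in Example #10
w = np.ones(p.shape[0])                                # unit weights
model = SKLearnKMeans(n_clusters=2)                    # baseline estimator reused inside the loop
cost = np.sum(utils.get_dist_to_centers(p, model.fit(p).cluster_centers_))  # reference cost on the full data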
Example #10
def distanceToClosest(p, centers):
    closest = float("+inf")
    for i in range(len(centers)):
        tempDist = np.sum((p - centers[i]) ** 2)
        if tempDist < closest:
            closest = tempDist
    return closest

if __name__ == "__main__":
    points = np.loadtxt("coreset_points.txt",dtype=np.float64)
    weights = np.loadtxt("coreset_weights.txt",dtype=np.float64)
    org = np.loadtxt("small_dataset.txt",dtype=np.float64)
    k = 2

    means = KMeans(points, np.expand_dims(weights, axis=0), k, rounds=20)
    means = means.compute()
    org_centers = KMeans(org, np.expand_dims(np.ones(org.shape[0]), axis=0), k, rounds=20).compute()
    real_cost = np.sum(utils.get_dist_to_centers(org, org_centers))
    print(real_cost)



    sc = SparkContext(appName="test_results ")    # start from here.
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", "123")
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", "456")
    points = sc.textFile("small_dataset.txt").map(parseVector)
    closest = points.map(lambda p: (distanceToClosest(p, means)))
    cs_result = closest.reduce(lambda a, b: a+b)
    print(cs_result)
    print("mistake: %f" % (1 - real_cost / cs_result))
    sc.stop()
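parseVector is used above but not defined in this excerpt; a minimal sketch, assuming each line of small_dataset.txt holds one whitespace-separated point, would be:

import numpy as np

def parseVector(line):
    # turn one text line into a 1-D float array (one point)
    return np.array([float(x) for x in line.split()])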