Example #1
def seed(self):
    """Weighted k-means++ (D^2) seeding: pick self.k initial centers."""
    k = self.k - 1
    centers = []
    # First center: sampled proportionally to the point weights.
    prob = self.w / np.sum(self.w)
    center = utils.sample(self.p, 1, prob)
    centers.append(center[0])
    min_dist = None
    while k > 0:
        np_centers = np.array(centers)
        if min_dist is None:
            # First pass: squared distance of every point to the first center.
            d = utils.get_sq_distances(x=self.p, y=np_centers).ravel()
            min_dist = d
        else:
            # Only the newest center can shrink the per-point minimum.
            d = utils.get_sq_distances(x=self.p, y=np.array([np_centers[-1]])).ravel()
            min_dist = np.minimum(min_dist, d)
        # D^2 sampling: probability proportional to weight * squared distance.
        dist = np.array(min_dist)
        dist *= self.w
        prob = dist / np.sum(dist)
        center = utils.sample(self.p, 1, prob)
        centers.append(center[0])
        k -= 1
    return np.array(centers, dtype=np.float64)
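For reference, here is a self-contained sketch of the same weighted D^2 (k-means++) seeding with plain NumPy standing in for the `utils` helpers; `sample_weighted_seeds` and all parameter names below are illustrative, not part of the original code:

import numpy as np

def sample_weighted_seeds(p, w, k, rng=None):
    """Sketch of weighted k-means++ (D^2) seeding.

    p: (n, d) array of points, w: (n,) array of nonnegative weights.
    """
    rng = np.random.default_rng(rng)
    n = len(p)
    # First center: sampled proportionally to the point weights.
    prob = w / np.sum(w)
    centers = [p[rng.choice(n, p=prob)]]
    # Squared distance of every point to its nearest chosen center so far.
    min_dist = np.sum((p - centers[0]) ** 2, axis=1)
    for _ in range(k - 1):
        # D^2 sampling: probability proportional to weight * squared distance.
        dist = min_dist * w
        prob = dist / np.sum(dist)
        c = p[rng.choice(n, p=prob)]
        centers.append(c)
        min_dist = np.minimum(min_dist, np.sum((p - c) ** 2, axis=1))
    return np.array(centers, dtype=np.float64)

# Usage: five seeds from 100 random 2-D points with uniform weights.
pts = np.random.default_rng(0).normal(size=(100, 2))
seeds = sample_weighted_seeds(pts, np.ones(100), k=5, rng=0)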
Example #2
def points_cost(self, points, centers):
    """
    Return the minimal squared distance of each point to its closest center.

    :param points: a list of points of dimension d
    :param centers: the list of centers over which the minimum is taken
    :return: an array holding, for each point, the squared distance to its closest center
    """
    min_dist = np.full(len(points), np.inf)
    for center in centers:
        # Per-point squared distances to this single center.
        tmp_distances = utils.get_sq_distances(x=points, y=[np.array(center)]).ravel()
        min_dist = np.minimum(min_dist, tmp_distances)
    return min_dist
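For comparison, a hypothetical fully vectorized version that computes the same per-point minima in one pass, assuming points and centers convert to NumPy arrays (the names below are illustrative):

import numpy as np

def points_cost(points, centers):
    """Minimal squared distance of each point to its closest center.

    points: (n, d) array-like, centers: (k, d) array-like; returns an (n,) array.
    """
    points = np.asarray(points, dtype=float)
    centers = np.asarray(centers, dtype=float)
    # (n, k) matrix of pairwise squared Euclidean distances.
    sq_d = ((points[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
    return sq_d.min(axis=1)

# Usage: squared distances of four 2-D points to the nearer of two centers.
pts = [[0., 0.], [1., 0.], [5., 5.], [6., 5.]]
print(points_cost(pts, [[0., 0.], [5., 5.]]))  # [0. 1. 0. 1.]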
Example #3
def compute(self, size, grnds=10, ginit=1):
    # Run weighted k-means to get the initial solution for the coreset.
    q = w_KMeans.KMeans(self.p, np.expand_dims(self.w, axis=0), self.k, grnds, ginit).compute()
    sq_d = utils.get_sq_distances(self.p, q)  # squared distances from each point to each center
    dist = utils.get_dist_to_centers(d=sq_d)  # squared distance from each point to its closest center
    dist /= np.sum(dist)  # normalize
    dist *= 2  # factor of 2, per the paper
    c = utils.get_centers(d=sq_d)  # index of the closest center for each point
    c = self._find_cluster_size(c)  # size of the cluster each point belongs to
    s = dist + 4.0 / c  # per-point sensitivity; the 4 is per the paper
    t = np.sum(s * self.w)  # the total sensitivity t from the paper
    u = t / (s * size)  # the new weights for the coreset points
    prob = s * self.w / t  # the sampling probability for each point
    p, w = utils.sample(self.p, size, prob=prob, weights=u)  # sample the coreset: points + weights
    return p, w
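To make the sampling step concrete, here is a hypothetical end-to-end sketch in which scikit-learn's KMeans stands in for `w_KMeans` and plain cluster counts stand in for `_find_cluster_size`; it mirrors the 2 * dist / sum + 4 / cluster_size sensitivity and the t / (s * size) weights from the snippet above, but it is a sketch under those assumptions, not the original implementation:

import numpy as np
from sklearn.cluster import KMeans

def coreset(p, w, k, size, rng=None):
    """Sketch of sensitivity-based coreset sampling (all names illustrative)."""
    rng = np.random.default_rng(rng)
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(p, sample_weight=w)
    # Squared distance of each point to its own (closest) center.
    sq_d = ((p - km.cluster_centers_[km.labels_]) ** 2).sum(axis=1)
    dist = 2.0 * sq_d / np.sum(sq_d)
    # Size of the cluster each point belongs to.
    cluster_size = np.bincount(km.labels_, minlength=k)[km.labels_]
    s = dist + 4.0 / cluster_size   # per-point sensitivity
    t = np.sum(s * w)               # total sensitivity
    u = t / (s * size)              # weights assigned to sampled coreset points
    prob = s * w / t                # sampling distribution (sums to 1)
    idx = rng.choice(len(p), size=size, p=prob)
    return p[idx], u[idx]

# Usage: a 50-point weighted coreset of 500 random 2-D points.
pts = np.random.default_rng(0).normal(size=(500, 2))
cs_points, cs_weights = coreset(pts, np.ones(500), k=5, size=50, rng=0)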
Example #4
def sample_independently_bahman(self, points, centers, overSamplingFactor):
    """
    Return a set of center candidates using the formula described in the calling function.

    :param points: the input points
    :param centers: the current set of centers
    :param overSamplingFactor: the expected number of new centers to sample
    :return: center candidates
    """
    C_prime = []
    # Squared distance of each point to its nearest current center.
    sq_min_dist_array = utils.get_dist_to_centers(d=utils.get_sq_distances(x=points, y=centers))
    phi_x_c = sum(sq_min_dist_array)
    if phi_x_c == 0:
        phi_x_c = 0.0000001  # handles the singular case where all points coincide
    for i in range(len(points)):
        # Keep point i independently with probability overSamplingFactor * d^2 / phi.
        p_x = overSamplingFactor * sq_min_dist_array[i] / phi_x_c
        if random() <= p_x:
            C_prime.append(points[i])
    return list(set(C_prime))  # set() removes duplicates (points must be hashable)
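For context, a hypothetical vectorized version of one such oversampling round, following the keep-with-probability l * d^2(x, C) / phi_X(C) rule from Bahmani et al.'s k-means|| paper; the function and parameter names are illustrative:

import numpy as np

def oversample_round(points, centers, ell, rng=None):
    """Sketch of one k-means|| oversampling round (names illustrative).

    Each point is kept independently with probability
    min(1, ell * d^2(x, C) / phi_X(C)).
    """
    rng = np.random.default_rng(rng)
    points = np.asarray(points, dtype=float)
    centers = np.asarray(centers, dtype=float)
    # Squared distance of each point to its nearest current center.
    sq_min = ((points[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2).min(axis=1)
    phi = np.sum(sq_min)
    if phi == 0.0:  # singular case: every point coincides with a center
        return np.empty((0, points.shape[1]))
    keep = rng.random(len(points)) <= ell * sq_min / phi
    return np.unique(points[keep], axis=0)  # drop exact duplicates

# Usage: oversample candidates from 200 points around one initial center.
pts = np.random.default_rng(1).normal(size=(200, 2))
candidates = oversample_round(pts, pts[:1], ell=8, rng=1)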