Example #1
def test_x_squared_norms_init_centroids():
    """Test that x_squared_norms can be None in _init_centroids"""
    from sklearn.cluster.k_means_ import _init_centroids

    X_norms = np.sum(X ** 2, axis=1)
    precompute = _init_centroids(X, 3, "k-means++", random_state=0, x_squared_norms=X_norms)
    assert_array_equal(precompute, _init_centroids(X, 3, "k-means++", random_state=0))
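The test above relies on a module-level X and on assert helpers defined elsewhere in the test file. A minimal self-contained sketch of the same check, assuming an older scikit-learn (before 0.23, where the private module sklearn.cluster.k_means_ still exists) and a stand-in X:

import numpy as np
from numpy.testing import assert_array_equal
from sklearn.cluster.k_means_ import _init_centroids  # private API, pre-0.23 only

X = np.random.RandomState(42).rand(50, 2)  # stand-in for the module-level X
X_norms = np.sum(X ** 2, axis=1)
# The same seed must yield identical centroids whether or not the squared
# norms are passed in precomputed.
precompute = _init_centroids(X, 3, "k-means++", random_state=0,
                             x_squared_norms=X_norms)
assert_array_equal(precompute,
                   _init_centroids(X, 3, "k-means++", random_state=0))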
Example #2
def pick_landmarks(args, train,
                   dim):  # Choose landmarks, we use opt_seed here.
    train.make_stacked()
    rs = check_random_state(args.opt_seed)
    data_fit = train.stacked_features[:, :dim]
    if args.landmark_choice == 'all':  # Sample landmarks from all pts
        if args.landmarks_select == 'kmeans++':
            landmarks = _init_centroids(data_fit,
                                        args.structure,
                                        'k-means++',
                                        random_state=rs)
        elif args.landmarks_select == 'kmeans':
            kmeans = KMeans(n_clusters=args.structure, random_state=rs)
            landmarks = kmeans.fit(data_fit).cluster_centers_
    elif args.landmark_choice == 'bag':  # Sample landmarks per bag
        size = len(train)
        l_size = args.n_landmark_bag
        landmarks = np.zeros((size * l_size, dim))
        for i in range(size):
            bag = train[i]
            bag_x = bag[:, :dim]
            if l_size >= 1:
                landmarks[i * l_size:(i + 1) * l_size] = _init_centroids(
                    bag_x, args.n_landmark_bag, 'k-means++', random_state=rs)
            else:
                raise ValueError('n_landmark_bag must be >= 1')
    return landmarks
Example #3
def test_x_squared_norms_init_centroids():
    # Test that x_squared_norms can be None in _init_centroids
    from sklearn.cluster.k_means_ import _init_centroids

    X_norms = np.sum(X**2, axis=1)
    precompute = _init_centroids(
        X, 3, "k-means++", random_state=0, x_squared_norms=X_norms)
    assert_array_almost_equal(
        precompute,
        _init_centroids(X, 3, "k-means++", random_state=0))
Example #4
def test_x_squared_norms_init_centroids():
    """Test that x_squared_norms can be None in _init_centroids"""
    try:
        from sklearn.cluster.k_means_ import _init_centroids
    
        X_norms = np.sum(X**2, axis=1)
        precompute = _init_centroids(
            X, 3, "k-means++", random_state=0, x_squared_norms=X_norms)
        return [PY_array_equals(
            precompute,
            _init_centroids(X, 3, "k-means++", random_state=0))]
    except Exception:
        return 1
Example #5
def init_uv(X, C, p):

    N, ndim = len(X), len(X[0])

    # np.random.seed()

    print(p, 'test seed', np.random.random((1, )))
    assert isinstance(p.method, str)

    if p.method == 'random':
        V = np.random.random((C, ndim))
    # elif p.method == 'orig':
    #     return origin_init(X, C, p.gamma, p.epsilon)
    elif p.method == 'kmpp':
        V = _init_centroids(X, C, 'k-means++')

    U = np.ones((N, C)) * .1 / (C - 1)

    for i in range(N):
        xi = np.repeat(X[i, :].reshape((1, ndim)), C, axis=0)
        U[i, np.argmin(l21_norm(xi - V, axis=1))] = .9

    # w_epsilon = p.w_epsilon
    # from basics.ours import update_V as ours_update_V
    # V = ours_update_V(V, U, X, w_epsilon)

    return U, V
Example #6
def _fit_single(X, y=None, n_clusters=2, init='random', random_state=None,
                metric='riemann', max_iter=100, tol=1e-4):
    """helper to fit a single run of centroid."""
    # init random state if provided
    mdm = MDM(metric=metric)
    squared_norms = [numpy.linalg.norm(x, ord='fro')**2 for x in X]
    mdm.covmeans_ = _init_centroids(X, n_clusters, init,
                                    random_state=random_state,
                                    x_squared_norms=squared_norms)
    if y is not None:
        mdm.classes_ = numpy.unique(y)
    else:
        mdm.classes_ = numpy.arange(n_clusters)

    labels = mdm.predict(X)
    k = 0
    while True:
        old_labels = labels.copy()
        mdm.fit(X, old_labels)
        dist = mdm._predict_distances(X)
        labels = mdm.classes_[dist.argmin(axis=1)]
        k += 1
        if (k > max_iter) | (numpy.mean(labels == old_labels) > (1 - tol)):
            break
    inertia = sum([sum(dist[labels == mdm.classes_[i], i])
                   for i in range(len(mdm.classes_))])
    return labels, inertia, mdm
Example #7
def _fit_single(
        X,
        y=None,
        n_clusters=2,
        init='random',
        random_state=None,
        metric='riemann',
        max_iter=100,
        tol=1e-4):
    # init random state if provided
    mdm = MDM(metric=metric)
    mdm.covmeans = _init_centroids(
        X, n_clusters, init, random_state=random_state)
    if y is not None:
        mdm.classes = numpy.unique(y)
    else:
        mdm.classes = numpy.arange(n_clusters)

    labels = mdm.predict(X)
    k = 0
    while True:
        old_labels = labels.copy()
        mdm.fit(X, old_labels)
        dist = mdm._predict_distances(X)
        labels = mdm.classes[dist.argmin(axis=1)]
        k += 1
        if (k > max_iter) | (numpy.mean(labels == old_labels) > (1 - tol)):
            break
    inertia = sum([sum(dist[labels == mdm.classes[i], i])
                   for i in range(len(mdm.classes))])
    return labels, inertia, mdm
Example #8
def km_init(X, K, C_init):
    """
    Initial seeds
    
    """

    N, D = X.shape
    if isinstance(C_init, str):

        if C_init == 'kmeans_plus':
            M = _init_centroids(X, K, init='k-means++')
            l = km_le(X, M, None, None)
        elif C_init == 'rndmeans':
            m = X.min(0)
            mm = X.max(0)
            a = (mm - m) * np.random.random((K, D))
            M = a + m[None, :]
            l = km_le(X, M, None, None)
        elif C_init == 'rndsubset':
            M = X[np.random.choice(list(range(N)), K), :]
            #            tmp = np.random.permutation(N)
            #            M = X[tmp[0:K],:]
            l = km_le(X, M, None, None)
        elif C_init == 'kmeans':
            kmeans = KMeans(n_clusters=K).fit(X)
            l = kmeans.labels_
            M = kmeans.cluster_centers_
    else:
        M = C_init
        l = km_le(X, M, None, None)
    del C_init
    return M, l
Example #9
    def create_codebook(self, features, _class='label'):

        if self.debug:
            print('\t- creating visual codebook for {0} ...'.format(_class))
            print('\t- features.shape', features.shape)
            sys.stdout.flush()

        n_feats, n_cuboids, cuboid_depth = features.shape
        features = features.reshape(-1, cuboid_depth)

        if self.codebook_selection == self.cs_dict["kmeans"]:

            codebook = KMeans(init='k-means++', n_clusters=self.codebook_size, n_init=50,
                              tol=1e-10, max_iter=1000, random_state=self.seed, n_jobs=self.n_jobs)

            codebook.fit(features)

            return codebook

        else:

            codebook = KMeans(init='random', n_clusters=self.codebook_size, n_init=1,
                              tol=1e-10, max_iter=1, random_state=self.seed, n_jobs=self.n_jobs)

            codebook.cluster_centers_ = _init_centroids(features, k=self.codebook_size, init='random', random_state=self.seed)

            return codebook
Example #10
def _fit_single(X,
                y=None,
                n_clusters=2,
                init='random',
                random_state=None,
                metric='riemann',
                max_iter=100,
                tol=1e-4):
    # init random state if provided
    mdm = MDM(metric=metric)
    mdm.covmeans = _init_centroids(X,
                                   n_clusters,
                                   init,
                                   random_state=random_state)
    if y is not None:
        mdm.classes = numpy.unique(y)
    else:
        mdm.classes = numpy.arange(n_clusters)

    labels = mdm.predict(X)
    k = 0
    while True:
        old_labels = labels.copy()
        mdm.fit(X, old_labels)
        dist = mdm._predict_distances(X)
        labels = mdm.classes[dist.argmin(axis=1)]
        k += 1
        if (k > max_iter) | (numpy.mean(labels == old_labels) > (1 - tol)):
            break
    inertia = sum([
        sum(dist[labels == mdm.classes[i], i]) for i in range(len(mdm.classes))
    ])
    return labels, inertia, mdm
Example #11
def _fit_single(X, y=None, n_clusters=2, init='random', random_state=None,
                metric='riemann', max_iter=100, tol=1e-4, n_jobs=1):
    """helper to fit a single run of centroid."""
    # init random state if provided
    mdm = MDM(metric=metric, n_jobs=n_jobs)
    squared_norms = [numpy.linalg.norm(x, ord='fro')**2 for x in X]
    mdm.covmeans_ = _init_centroids(X, n_clusters, init,
                                    random_state=random_state,
                                    x_squared_norms=squared_norms)
    if y is not None:
        mdm.classes_ = numpy.unique(y)
    else:
        mdm.classes_ = numpy.arange(n_clusters)

    labels = mdm.predict(X)
    k = 0
    while True:
        old_labels = labels.copy()
        mdm.fit(X, old_labels)
        dist = mdm._predict_distances(X)
        labels = mdm.classes_[dist.argmin(axis=1)]
        k += 1
        if (k > max_iter) | (numpy.mean(labels == old_labels) > (1 - tol)):
            break
    inertia = sum([sum(dist[labels == mdm.classes_[i], i])
                   for i in range(len(mdm.classes_))])
    return labels, inertia, mdm
Example #12
def test_x_squared_norms_init_centroids():
    """Test that x_squared_norms can be None in _init_centroids"""
    try:
        from sklearn.cluster.k_means_ import _init_centroids

        X_norms = np.sum(X**2, axis=1)
        precompute = _init_centroids(X,
                                     3,
                                     "k-means++",
                                     random_state=0,
                                     x_squared_norms=X_norms)
        return [
            PY_array_equals(precompute,
                            _init_centroids(X, 3, "k-means++", random_state=0))
        ]
    except Exception:
        return 1
Example #13
    def partial_fit(self, X):
        """Update k means estimate on a single iteration."""

        X = check_array(X, accept_sparse="csr")
        n_samples, n_features = X.shape
        x_squared_norms = row_norms(X, squared=True) #currently has redundancy
        if hasattr(self.init, '__array__'):
            self.init = np.ascontiguousarray(self.init, dtype=np.float64)
        
        #       if n_samples == 0:
        #            return self

        self.random_state_ = getattr(self, "random_state_",
                                     check_random_state(self.random_state))
        #      if (not hasattr(self, 'counts_')
        #              or not hasattr(self, 'cluster_centers_')):
        if (not hasattr(self, 'cluster_centers_')):
            # this is the first call partial_fit on this object:
            # initialize the cluster centers
            self.cluster_centers_ = k_means_._init_centroids(
                X, self.n_clusters, self.init,
                random_state=self.random_state_,
                x_squared_norms=x_squared_norms)
            print("Initialization complete")
        #           self.counts_ = np.zeros(self.n_clusters, dtype=np.int32)
        #            random_reassign = False
            distances = None
            """        if self.compute_labels:
            self.labels_, self.inertia_ = _labels_inertia(
                X, x_squared_norms, self.cluster_centers_)
            """
            return self
        else:
            """ # The lower the minimum count is, the more we do random
            # reassignment, however, we don't want to do random
            # reassignment too often, to allow for building up counts
            random_reassign = self.random_state_.randint(
                10 * (1 + self.counts_.min())) == 0
            """
            distances = np.zeros(X.shape[0], dtype=np.float64)

            """  _mini_batch_step(X, x_squared_norms, self.cluster_centers_,
                self.counts_, np.zeros(0, np.double), 0
                random_reassign=random_reassign, distances=distances,
                random_state=self.random_state_,
                reassignment_ratio=self.reassignment_ratio,
                verbose=self.verbose)
            """
    
            self.cluster_centers_, self.inertia_, squared_diff = _kmeans_step(
                X=X, x_squared_norms=x_squared_norms, centers=self.cluster_centers_,
                distances=distances, precompute_distances=self.precompute_distances,
                n_clusters=self.n_clusters)

            """        if self.compute_labels:
                self.labels_, self.inertia_ = _labels_inertia(
                X, x_squared_norms, self.cluster_centers_)
            """
            return self, squared_diff
Example #14
    def _train(self, X, y, rs):
        new_X, new_y = [], []
        for x_sub, yi in self.class_iter(X, y):
            clusters = self.get_cluster_size(x_sub)
            # Choose random clusters
            centers = _init_centroids(x_sub, clusters, 'k-means++', rs)
            new_X.append(centers)
            new_y.extend([yi] * new_X[-1].shape[0])

        return np.vstack(new_X), new_y
Example #15
    def calc_sampling_distribution(self):
        x_squared_norms = row_norms(self.X, squared=True)
        centers = _init_centroids(self.X,
                                  self.n_clusters,
                                  self.init,
                                  random_state=self.random_state,
                                  x_squared_norms=x_squared_norms)
        sens = sensitivity.kmeans_sensitivity(self.X, self.w, centers,
                                              max(np.log(self.n_clusters), 1))
        self.p = sens / np.sum(sens)
Example #16
def init_step(dataset, model, device, pretrained, mode='kmeans', n_clusters=None):
    """Initialization of landmarks with k-means or k-means++ given dataset."""
    if n_clusters is None:
        n_clusters = len(np.unique(dataset.y))
    nexamples = len(dataset.x)
        
    X =  torch.stack([dataset.x[i] for i in range(nexamples)])
    
    if mode=='kmeans++':
        if not pretrained: # find centroids in original space
            landmarks = k_means_._init_centroids(X.cpu().numpy(), n_clusters, 'k-means++')
            landmarks = torch.tensor(landmarks, device=device)
            landmarks = landmarks.to(device)
            lndmk_encoded,_ = model(landmarks)
            
        else:
            X = X.to(device)
            encoded,_ = model(X)
            landmarks = k_means_._init_centroids(encoded.data.cpu().numpy(), n_clusters, 'k-means++')
            lndmk_encoded = torch.tensor(landmarks, device=device)
    
    elif mode=='kmeans': # run kmeans clustering
        if not pretrained: 
            kmeans = KMeans(n_clusters, random_state=0).fit(X.cpu().numpy())
            landmarks = torch.tensor(kmeans.cluster_centers_, device=device)
            landmarks = landmarks.to(device)
            lndmk_encoded,_ = model(landmarks)
        else:
            X = X.to(device)
            encoded,_ = model(X)
            kmeans = KMeans(n_clusters, random_state=0).fit(encoded.data.cpu().numpy())
            lndmk_encoded = torch.tensor(kmeans.cluster_centers_, device=device)
    
    return lndmk_encoded
Example #17
def Subspace_iter(X, n_clusters, init='k-means++', max_iter=300, tol=1e-4, tol_eig=-1e-10, x_squared_norms=None, random_state=None):
    random_state = check_random_state(random_state)
    centers = _init_centroids(X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms)

    new_labels, new_inertia, new_centers = None, None, None

    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)
    d_shape = X.shape[1]
    randomval = random_state.random_sample(d_shape ** 2).reshape(d_shape, d_shape)
    V_val, _ = np.linalg.qr(randomval, mode='complete')
    m_val = d_shape // 2
    S_D = np.dot(X.T, X)
    P_Cluster = np.eye(m_val, M=d_shape).T
    for i in range(max_iter):
        centers_old = centers.copy()
        X_values = np.dot(np.dot(X, V_val), P_Cluster)
        centers_c = np.dot(np.dot(centers, V_val), P_Cluster)
        labels, _ = pairwise_distances_argmin_min(X=X_values, Y=centers_c, metric='euclidean', metric_kwargs={'squared': True})
        labels = labels.astype(np.int32)
        centers = _k_means._centers_dense(X, labels, n_clusters, distances)
        S = np.zeros((d_shape, d_shape))
        for it in range(n_clusters):
            X_it = X[:][labels == it] - centers[:][it]
            S += np.dot(X_it.T, X_it)
        Sigma = S - S_D
        EV, _ = np.linalg.eigh(Sigma)
        m = len(np.where(EV < tol_eig)[0])
        P_Cluster = np.eye(m, M=d_shape).T
        inertia = 0.0
        for j in range(n_clusters):
            inertia += row_norms(X[:][labels == j] - centers[:][j], squared=True).sum()

        if new_inertia is None or inertia < new_inertia:
            new_labels = labels.copy()
            new_centers = centers.copy()
            new_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            break

    if center_shift_total > 0:
        new_labels, new_inertia = _labels_inertia(X, x_squared_norms, new_centers,
                            precompute_distances=False,
                            distances=distances)
    return new_labels, new_inertia, new_centers, i + 1
Example #18
    def fit(self, X, w=None):
        if w is None:
            w = np.ones(X.shape[0])
        elif X.shape[0] != w.shape[0]:
            raise ValueError(
                "The number of weights must match the number of data points.")
        x_squared_norms = row_norms(X, squared=True)
        self.centers = None

        for it in range(self.n_iter):
            best_centers, best_inertia = None, -1
            for init_it in range(self.n_init):
                # initialization could be extended to consider weights
                centers = _init_centroids(X,
                                          self.n_clusters,
                                          self.init,
                                          random_state=self.random_state,
                                          x_squared_norms=x_squared_norms)
                assignment, inertia = weighted_kmeans_.assignment_inertia(
                    X, centers)
                if best_inertia == -1 or w.dot(inertia) < best_inertia:
                    best_centers = centers
                    best_inertia = w.dot(inertia)

            centers = best_centers

            inertia = np.full((X.shape[0]), np.inf)

            for it in range(self.max_iter):
                # E-step
                assignment, new_inertia = weighted_kmeans_.assignment_inertia(
                    X, centers)

                # M-step
                centers = weighted_kmeans_.update_centers(
                    X, w, centers, assignment)

                if w.dot(inertia - new_inertia) <= self.tol:
                    break
                inertia = new_inertia

            if self.centers is None or w.dot(self.inertia - new_inertia) > 0:
                self.inertia = new_inertia
                self.centers = centers
Example #19
 def fit(self, X, y=None):
     # FIXME(gilad): sub-optimal. consider using _kmeans_single_elkan.
     random_state = check_random_state(self.random_state)
     X = self._check_fit_data(X)
     tol = k_means_._tolerance(X, self.tol)
     itr = 0
     init = k_means_._init_centroids(X, self.n_clusters, 'random',
                                     random_state)
     self.cluster_centers_ = center_updater(init, self.fixed_centers,
                                            self.n_fixed)
     self.inertia_ = np.infty
     self.inertia_prev_ = np.infty
     inertia_del = np.infty
     while itr < self.max_iter and inertia_del > tol:
         self.inertia_prev_ = self.inertia_
         self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
             k_means(
                 X, n_clusters=self.n_clusters, init=self.cluster_centers_,
                 n_init=self.n_init, max_iter=1, verbose=self.verbose,
                 precompute_distances=self.precompute_distances,
                 tol=self.tol, random_state=random_state, copy_x=self.copy_x,
                 n_jobs=self.n_jobs, algorithm=self.algorithm,
                 return_n_iter=True)
         self.cluster_centers_ = center_updater(self.cluster_centers_,
                                                self.fixed_centers,
                                                self.n_fixed)
         if itr > 0:
             inertia_del = math.fabs(
                 (self.inertia_ - self.inertia_prev_) / self.inertia_prev_)
         if self.verbose:
             self.log.info(
                 'calculating for itr={}: inertia_del={}, tol={}'.format(
                     itr, inertia_del, tol))
         itr += 1
     if itr < self.max_iter:
         self.log.info(
             'convergence achieved for iteration {}. inertia={}. inertia_del={}'
             .format(itr, self.inertia_, inertia_del))
     else:
         self.log.info(
             'convergence not achieved. itr={}. inertia={}. inertia_del={}'.
             format(itr, self.inertia_, inertia_del))
     return self
Example #20
def kmeanspp(X, k, seed):
    # That we need to do this is a bug in _init_centroids
    x_squared_norms = row_norms(X, squared=True)
    # Use k-means++ to initialise the centroids
    centroids = _init_centroids(X, k, 'k-means++', random_state=seed, x_squared_norms=x_squared_norms)
    # OK, we should just short-circuit and get these from k-means++...
    # quick and dirty solution
    nns = NearestNeighbors()
    nns.fit(X)
    centroid_candidatess = nns.radius_neighbors(X=centroids, radius=0, return_distance=False)
    # Account for "degenerated" solutions: serveral voxels at distance 0, each becoming a centroid
    centroids = set()
    for centroid_candidates in centroid_candidatess:
        centroid_candidates = set(centroid_candidates) - centroids
        if len(set(centroid_candidates) - centroids) == 0:
            raise Exception('Cannot get an unambiguous set of centers; '
                            'theoretically this cannot happen, so check for bugs')
        centroids.add(centroid_candidates.pop())
    return np.array(sorted(centroids))
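Note that kmeanspp above returns indices into X (one row index per chosen centroid), not coordinates. A usage sketch on hypothetical data, assuming the imports the snippet relies on (row_norms from sklearn.utils.extmath, _init_centroids from sklearn.cluster.k_means_, NearestNeighbors from sklearn.neighbors):

import numpy as np

X = np.random.RandomState(0).rand(200, 3)
idx = kmeanspp(X, k=5, seed=0)    # indices of the rows picked as initial centres
initial_centres = X[idx]
print(initial_centres.shape)      # (5, 3)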
Example #21
def init_uv(X, C, *, method):

    N, ndim = len(X), len(X[0])

    assert isinstance(method, str)

    if method == 'random':
        V = np.random.random((C, ndim))
    elif method == 'orig':
        return origin_init(X, C)
    else:
        V = _init_centroids(X, C, method)

    U = np.ones((N, C)) * .1 / (C - 1)

    for i in range(N):
        xi = np.repeat(X[i, :].reshape((1, ndim)), C, axis=0)
        U[i, np.argmin(l21_norm(xi - V, axis=1))] = .9

    return U, V
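A usage sketch for init_uv above (hypothetical: l21_norm and origin_init belong to the surrounding codebase, so a plain row-wise Euclidean norm is used as a stand-in for l21_norm; this only works when init_uv and the stand-in are defined in the same script, since init_uv looks up l21_norm at call time, and the 'orig' branch is not exercised):

import numpy as np

def l21_norm(A, axis=1):
    # stand-in for the project's l21_norm: row-wise Euclidean norms
    return np.linalg.norm(A, axis=axis)

X = np.random.RandomState(0).rand(60, 4)
U, V = init_uv(X, 3, method='k-means++')   # V: 3 centroids, U: soft memberships
print(U.shape, V.shape)                    # (60, 3) (3, 4)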
Example #22
def km_init(X, K, C_init, l_init=None):
    """
    Initial seeds
    """
    if isinstance(C_init, str):

        if C_init == 'kmeans_plus':
            M = _init_centroids(X, K, init='k-means++')
            l = km_le(X, M)

        elif C_init == 'kmeans':
            kmeans = KMeans(n_clusters=K).fit(X)
            l = kmeans.labels_
            M = kmeans.cluster_centers_
    else:
        M = C_init.copy()
        # l = km_le(X,M)
        l = l_init.copy()

    del C_init, l_init

    return M, l
Example #23
    def fit(self, X, Y=None):
        """Compute fuzzy c-means clustering.
        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
        """

        if Y is None:
            self.centers = _init_centroids(X,
                                           self.n_clusters,
                                           init=self.init,
                                           random_state=None,
                                           x_squared_norms=row_norms(
                                               X, squared=True))
        else:
            n_labels = int(np.max(Y))
            self.centers = np.zeros([n_labels + 1, np.shape(X)[1]])
            for l in np.arange(n_labels + 1):
                self.centers[l, :] = np.mean(X[Y == l], axis=0)

        u, d = _init_memberships(X, self.centers, self.distance)

        cluster_centers, predicted_labels = \
            f_k_means(X,
                      n_clusters=self.n_clusters,
                      m=self.m,
                      tol_memberships=self.tol_memberships,
                      tol_centroids=self.tol_centroids,
                      max_iter=self.max_iter,
                      init=self.centers,
                      constraint=self.constraint,
                      distance=self.distance,
                      n_init=self.n_init)

        self.labels_ = predicted_labels
        self.cluster_centers_ = cluster_centers

        return self
Example #24
def f_k_means(X, n_clusters, m, tol_memberships, tol_centroids, max_iter, init,
              constraint, distance, n_init):
    # if the initialization method is not 'k-means++',
    # an array of centroids is passed
    # and it is converted in float type
    if hasattr(init, '__array__'):
        n_clusters = init.shape[0]
        init = np.asarray(init, dtype=np.float64)

    # Initialize centers and memberships
    n_samples, n_features = X.shape

    centers = _init_centroids(X,
                              n_clusters,
                              init,
                              random_state=True,
                              x_squared_norms=row_norms(X, squared=True))

    u, d = _init_memberships(X, centers, distance)
    labels = _labels_computation(u)
    # Choose the optimization method

    centers, labels, inertia, n_iter, u, fpc = \
        f_k_means_main_loop(X,
                            n_clusters,
                            m,
                            u,
                            centers,
                            d,
                            tol_memberships,
                            tol_centroids,
                            max_iter,
                            constraint,
                            distance)

    return centers, labels
Example #25
def kmeans_lloyd(X, sample_weight, n_clusters, max_iter=300,
                 init='k-means++', verbose=False, x_squared_norms=None,
                 random_state=None, tol=1e-4, same_cluster_size=False):
    """A single run of k-means, assumes preparation completed prior.
    Parameters
    ----------
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.
    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.
    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.
    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.
    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':
        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.
        'random': choose k observations (rows) at random from data for
        the initial centroids.
        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.
        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.
    tol : float, optional
        The relative increment in the results before declaring convergence.
    verbose : boolean, optional
        Verbosity mode
    x_squared_norms : array
        Precomputed x_squared_norms.
    same_cluster_size : boolean, default: False
        If True, constrain all clusters to contain the same number of samples
        (n_samples must then be divisible by n_clusters).
    random_state : int, RandomState instance or None (default)
        Determines random number generation for centroid initialization. Use
        an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.
    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.
    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.
    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).
    n_iter : int
        Number of iterations run.
    """
    random_state = check_random_state(random_state)
    if same_cluster_size:
        assert len(X) % n_clusters == 0, "#samples is not divisible by #clusters"

    if verbose:
        print("\n==> Starting k-means clustering...\n")

    sample_weight = _check_sample_weight(X, sample_weight)
    x_squared_norms = row_norms(X, squared=True)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms,
                            centers, distances=distances, same_cluster_size=same_cluster_size)

        # computation of the means is also called the M-step of EM
        centers = _centers_dense(
            X, sample_weight, labels, n_clusters, distances)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms,
                            best_centers, distances=distances, same_cluster_size=same_cluster_size)

    return best_labels, best_inertia, best_centers, i + 1
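A short usage sketch for kmeans_lloyd above on synthetic data (hypothetical; it assumes the helpers the snippet imports elsewhere, such as check_random_state, row_norms, squared_norm, _check_sample_weight, _init_centroids and the modified _labels_inertia/_centers_dense, are available in the same module):

import numpy as np

X = np.random.RandomState(0).rand(120, 2)
w = np.ones(len(X))
labels, inertia, centers, n_iter = kmeans_lloyd(
    X, sample_weight=w, n_clusters=4, random_state=0)
print(centers.shape, round(inertia, 3), n_iter)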
Example #26
if len(sys.argv) >= 4:
	cluster_list = get_clusters_from_file(sys.argv[3])

# Generate testset
X, _ = make_blobs(n_samples=n_samples, centers=n_centers, random_state=random_state)

v1 = X[:, 0]
v2 = X[:, 1]

# Scale to integers
v1 = scale(v1)
v2 = scale(v2)
X = np.array(list(zip(v1, v2)))

# Compute initial centers - using KMean++
centers = _init_centroids(X, n_centers, 'k-means++')

# Write file
with open("kmeans_testset.c", "w") as f:
	f.write("int testset_x[" + str(len(v1)) + "];\n");
	f.write("int testset_y[" + str(len(v1)) + "];\n");
	f.write("int testset_initial_centers_x[" + str(len(centers)) + "];\n");
	f.write("int testset_initial_centers_y[" + str(len(centers)) + "];\n");
	f.write("void init_dataset() {\n");

	# Points
	i = 0
	for x, y in zip(v1, v2):
		f.write("testset_x[" + str(i) + "] = " + str(x) + ";\n");
		f.write("testset_y[" + str(i) + "] = " + str(y) + ";\n");
		i += 1
Example #27
def _kmeans_single(X, n_clusters, x_squared_norms, max_iter=300,
                   init='k-means++', verbose=False, random_state=None,
                   tol=1e-4, precompute_distances=True, sample_weight=None):
    """A single run of k-means, assumes preparation completed prior.
    Parameters
    ----------
    X: array-like of floats, shape (n_samples, n_features)
        The observations to cluster.
    n_clusters: int
        The number of clusters to form as well as the number of
        centroids to generate.
    max_iter: int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.
    init: {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':
        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.
        'random': choose k observations (rows) at random from data for
        the initial centroids.
        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.
        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.
    tol: float, optional
        The relative increment in the results before declaring convergence.
    verbose: boolean, optional
        Verbosity mode
    x_squared_norms: array
        Precomputed x_squared_norms.
    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).
    random_state: integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.
    Returns
    -------
    centroid: float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.
    label: integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.
    inertia: float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).
    n_iter : int
        Number of iterations run.
    """

    if sample_weight is None:
        sample_weight = np.ones(X.shape[0])

    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = k_means_._init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=np.float64)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = \
            _labels_inertia(X, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

        sample_weight = np.asarray([1.0] * len(labels))
        # computation of the means is also called the M-step of EM
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X, sample_weight, labels, n_clusters,
                                               distances)
        else:
            centers = _k_means._centers_dense(X, sample_weight, labels, n_clusters, distances)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        shift = squared_norm(centers_old - centers)
        if shift <= tol:
            if verbose:
                print("Converged at iteration %d" % i)

            break

    if shift > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
Example #28
def kmeans(X,
           n_clusters,
           delta=.001,
           maxiter=10,
           metric="cityblock",
           p=2,
           verbose=1,
           x_squared_norms=None):
    """ centres, Xtocentre, distances = kmeans( X, initial centres ... )
    in:
        X N x dim  may be sparse
        centres k x dim: initial centres, e.g. random.sample( X, k )
        delta: relative error, iterate until the average distance to centres
            is within delta of the previous average distance
        maxiter
        metric: any of the 20-odd in scipy.spatial.distance
            "chebyshev" = max, "cityblock" = L1, "minkowski" with p=
            or a function( Xvec, centrevec ), e.g. Lqmetric below
        p: for minkowski metric -- local mod cdist for 0 < p < 1 too
        verbose: 0 silent, 2 prints running distances
    out:
        centres, k x dim
        Xtocentre: each X -> its nearest centre, ints N -> k
        distances, N
    see also: kmeanssample below, Klasy Kmeans below.
    """
    if x_squared_norms is None:
        x_squared_norms = row_norms(X, squared=True)

    if not issparse(X):
        X = np.asanyarray(X)  # ?
    centres = _init_centroids(X,
                              n_clusters,
                              'k-means++',
                              random_state=None,
                              x_squared_norms=x_squared_norms)
    N, dim = X.shape
    k, cdim = centres.shape
    if dim != cdim:
        raise ValueError(
            "kmeans: X %s and centres %s must have the same number of columns"
            % (X.shape, centres.shape))
    if verbose:
        print("kmeans: X %s  centres %s  delta=%.2g  maxiter=%d  metric=%s" %
              (X.shape, centres.shape, delta, maxiter, metric))
    allx = np.arange(N)
    prevdist = 0
    for jiter in range(1, maxiter + 1):
        D = cdist_sparse(X, centres, metric=metric, p=p)  # |X| x |centres|
        xtoc = D.argmin(axis=1)  # X -> nearest centre
        distances = D[allx, xtoc]
        avdist = distances.mean()  # median ?
        if verbose >= 2:
            print("kmeans: av |X - nearest centre| = %.4g" % avdist)
        if (1 - delta) * prevdist <= avdist <= prevdist \
        or jiter == maxiter:
            break
        prevdist = avdist
        for jc in range(k):  # (1 pass in C)
            c = np.where(xtoc == jc)[0]
            if len(c) > 0:
                centres[jc] = X[c].mean(axis=0)
    if verbose:
        print("kmeans: %d iterations  cluster sizes:" % jiter,
              np.bincount(xtoc))
    if verbose >= 2:
        r50 = np.zeros(k)
        r90 = np.zeros(k)
        for j in range(k):
            dist = distances[xtoc == j]
            if len(dist) > 0:
                r50[j], r90[j] = np.percentile(dist, (50, 90))
        print("kmeans: cluster 50 % radius", r50.astype(int))
        print("kmeans: cluster 90 % radius", r90.astype(int))
        # scale L1 / dim, L2 / sqrt(dim) ?
    return centres, xtoc, distances
Example #29
def subspace_kmeans_single(X,
                           sample_weight,
                           n_clusters,
                           init='k-means++',
                           max_iter=300,
                           tol=1e-4,
                           tol_eig=-1e-10,
                           verbose=False,
                           x_squared_norms=None,
                           random_state=None):
    random_state = check_random_state(random_state)
    sample_weight = _check_sample_weight(X, sample_weight)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(X,
                              n_clusters,
                              init,
                              random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0], ), dtype=X.dtype)

    # === Beginning of original implementation of initialization ===

    # Dimensionality of original space
    d = X.shape[1]

    # Set initial V as QR-decomposed Q of random matrix
    rand_vals = random_state.random_sample(d**2).reshape(d, d)
    V, _ = np.linalg.qr(rand_vals, mode='complete')

    # Set initial m as d/2
    m = d // 2

    # Scatter matrix of the dataset in the original space
    S_D = np.dot(X.T, X)

    # Projection onto the first m attributes
    P_C = np.eye(m, M=d).T

    # === End of original implementation of initialization ===

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # === Beginning of original implementation of E-step of EM ===

        X_C = np.dot(np.dot(X, V), P_C)
        mu_C = np.dot(np.dot(centers, V), P_C)
        labels, _ = pairwise_distances_argmin_min(
            X=X_C, Y=mu_C, metric='euclidean', metric_kwargs={'squared': True})
        labels = labels.astype(np.int32)

        # === End of original implementation of E-step of EM ===

        # computation of the means is also called the M-step of EM
        centers = _k_means._centers_dense(X, sample_weight, labels, n_clusters,
                                          distances)

        # === Beginning of original implementation of M-step of EM ===

        S = np.zeros((d, d))
        # use a separate loop variable so the outer iteration counter i is preserved
        for c in range(n_clusters):
            X_c = X[:][labels == c] - centers[:][c]
            S += np.dot(X_c.T, X_c)
        Sigma = S - S_D
        evals, evecs = np.linalg.eigh(Sigma)
        idx = np.argsort(evals)[::1]
        V = evecs[:, idx]
        m = len(np.where(evals < tol_eig)[0])
        if m == 0:
            raise ValueError(
                'Dimensionality of clustered space is 0. '
                'The dataset is better explained by a single cluster.')
        P_C = np.eye(m, M=d).T
        inertia = 0.0
        for c in range(n_clusters):
            inertia += row_norms(X[:][labels == c] - centers[:][c],
                                 squared=True).sum()

        # === End of original implementation of M-step of EM ===

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e" %
                      (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms, best_centers,
                            precompute_distances=False,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
Example #30
def kmeans_constrained_single(X,
                              n_clusters,
                              size_min=None,
                              size_max=None,
                              max_iter=300,
                              init='k-means++',
                              verbose=False,
                              x_squared_norms=None,
                              random_state=None,
                              tol=1e-4):
    """A single run of k-means constrained, assumes preparation completed prior.

    Parameters
    ----------
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.

    size_min : int, optional, default: None
        Constrain the label assignment so that each cluster has a minimum
        size of size_min. If None, no constrains will be applied

    size_max : int, optional, default: None
        Constrain the label assignment so that each cluster has a maximum
        size of size_max. If None, no constrains will be applied

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.

    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.

    tol : float, optional
        The relative increment in the results before declaring convergence.

    verbose : boolean, optional
        Verbosity mode

    x_squared_norms : array
        Precomputed x_squared_norms.

    random_state : int, RandomState instance or None, optional, default: None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).

    n_iter : int
        Number of iterations run.
    """

    sample_weight = np.ones(X.shape[0])
    random_state = check_random_state(random_state)
    n_samples = X.shape[0]

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(X,
                              n_clusters,
                              init,
                              random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(n_samples, ), dtype=X.dtype)

    # Determine min and max sizes if non given
    if size_min is None:
        size_min = 0
    if size_max is None:
        size_max = n_samples  # Number of data points

    # Check size min and max
    if not ((size_min >= 0) and (size_min <= n_samples) and (size_max >= 0) and
            (size_max <= n_samples)):
        raise ValueError(
            "size_min and size_max must be a positive number smaller "
            "than the number of data points or `None`")
    if size_max < size_min:
        raise ValueError("size_max must be larger than size_min")
    if size_min * n_clusters > n_samples:
        raise ValueError(
            "The product of size_min and n_clusters cannot exceed the number of samples (X)"
        )

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = \
            _labels_constrained(X, centers, size_min, size_max, distances=distances)

        # computation of the means is also called the M-step of EM
        if sp.issparse(X):
            centers = _centers_sparse(X, sample_weight, labels, n_clusters,
                                      distances)
        else:
            centers = _centers_dense(X, sample_weight, labels, n_clusters,
                                     distances)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e" %
                      (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_constrained(X, best_centers, size_min, size_max, distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
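A usage sketch for the constrained single run above (hypothetical; assumes the _labels_constrained and _centers_dense/_centers_sparse helpers it calls are importable from the same module):

import numpy as np

X = np.random.RandomState(1).rand(90, 2)
labels, inertia, centers, n_iter = kmeans_constrained_single(
    X, n_clusters=3, size_min=20, size_max=40, random_state=0)
print(np.bincount(labels))   # each of the 3 clusters ends up with 20-40 points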
Example #31
def _spherical_kmeans_single_lloyd(X, n_clusters, max_iter=300,
                                   init='k-means++', verbose=False,
                                   x_squared_norms=None,
                                   random_state=None, tol=1e-4,
                                   precompute_distances=True):
    '''
    Modified from sklearn.cluster.k_means_.k_means_single_lloyd.
    '''
    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None

    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # labels assignment
        # TODO: _labels_inertia should be done with cosine distance
        #       since ||a - b|| = 2(1 - cos(a,b)) when a,b are unit normalized
        #       this doesn't really matter.
        labels, inertia = \
            _labels_inertia(X, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

        # computation of the means
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X, labels, n_clusters,
                                               distances)
        else:
            centers = _k_means._centers_dense(X, labels, n_clusters, distances)

        # l2-normalize centers (this is the main contribution here)
        centers = normalize(centers)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
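A usage sketch for the spherical variant above (hypothetical; assumes the private sklearn helpers _labels_inertia and _k_means._centers_dense/_centers_sparse plus normalize are available, and that the rows of X are l2-normalized, as spherical k-means expects):

import numpy as np
from sklearn.preprocessing import normalize
from sklearn.utils.extmath import row_norms

X = normalize(np.random.RandomState(0).rand(100, 5))    # unit-norm rows
labels, inertia, centers, n_iter = _spherical_kmeans_single_lloyd(
    X, n_clusters=4, random_state=0,
    x_squared_norms=row_norms(X, squared=True))
print(labels.shape, centers.shape, n_iter)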
Example #32
def _spherical_kmeans_single_lloyd(X,
                                   n_clusters,
                                   max_iter=300,
                                   init='k-means++',
                                   verbose=False,
                                   x_squared_norms=None,
                                   random_state=None,
                                   tol=1e-4,
                                   precompute_distances=True):
    '''
    Modified from sklearn.cluster.k_means_.k_means_single_lloyd.
    '''
    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None

    # init
    centers = _init_centroids(X,
                              n_clusters,
                              init,
                              random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0], ), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # labels assignment
        # TODO: _labels_inertia should be done with cosine distance
        #       since ||a - b|| = 2(1 - cos(a,b)) when a,b are unit normalized
        #       this doesn't really matter.
        labels, inertia = \
            _labels_inertia(X, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

        # computation of the means
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X, labels, n_clusters,
                                               distances)
        else:
            centers = _k_means._centers_dense(X, labels, n_clusters, distances)

        # l2-normalize centers (this is the main contribution here)
        centers = normalize(centers)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e" %
                      (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
Example #33
    def partial_fit(self, D):
        """
    	Apply one iteration of VR_MBKM
    	
    	Input: self, dataset 
    	Output: self
    	
    	Updated:
    	   -self.curr_iter
    	   -self.curr_inner_iter
    	   -self.tot_inner_iter
    	   -self.cluster_centers_
		"""
        ## perform checks on dataset
        D = check_array(D, accept_sparse='csr')

        if hasattr(self.init, '__array__'):
            self.init = np.ascontiguousarray(self.init, dtype=np.float64)

        if self.curr_inner_iter == 0:
            self.inner_loop = 0

        if self.curr_iter == 0 or self.inner_loop == 0 or self.update_freq == 0:
            ## OUTER LOOP
            # use the entire dataset
            X = D
            x_squared_norms = row_norms(X, squared=True)
            self.random_state_ = getattr(self, "random_state_",
                                         check_random_state(self.random_state))

            if self.curr_iter == 0:
                ## initialize centers
                if hasattr(self.init, '__array__'):
                    self.cluster_centers_ = self.init
                else:
                    self.cluster_centers_ = k_means_._init_centroids(
                        X,
                        self.n_clusters,
                        self.init,
                        random_state=self.random_state_,
                        x_squared_norms=x_squared_norms,
                        init_size=self.init_size)

                _, cost = k_means_._labels_inertia(X, x_squared_norms,
                                                   self.cluster_centers_)
                #print "Cost of current initial centers on the mini-batch is %r " % cost

                ## initialize counts
                self.counts_ = np.zeros(self.n_clusters, dtype=np.int32)

            ## this ensures the benchmark centers are either the seeds
            ## or obtained from the last iterate of inner loop
            self.benchmark_centers = self.cluster_centers_.copy()

            ## run Lloyd's update with entire data
            distances = np.zeros(X.shape[0], dtype=np.float64)
            self.benchmark_updates, _, self.squared_diff = _kmeans_step(
                X=X,
                x_squared_norms=x_squared_norms,
                centers=self.benchmark_centers.copy(),
                distances=distances,
                precompute_distances=self.precompute_distances,
                n_clusters=self.n_clusters)

            self.cluster_centers_ = self.benchmark_updates.copy()
            self.curr_outer_iter += 1
            self.inner_loop = 1

        else:
            ## INNER LOOP:
            # use a mini-batch of data
            # draw a mini-batch of indices without replacement, using the
            # estimator's seeded RNG for reproducibility
            sample_idx = self.random_state_.choice(D.shape[0], self.mbsize,
                                                   replace=False)
            X = D[sample_idx, :]
            # x_squared_norms = row_norms(X, squared=True)
            eta = self.set_eta()  # learning rate for this inner step
            ## run the variance-reduced mini-batch (VR-MB) step
            distances = np.zeros(X.shape[0], dtype=np.float64)

            self.cluster_centers_, self.squared_diff, _ = VR_MB_step(
                X,
                None,
                self.cluster_centers_.copy(),
                self.benchmark_centers.copy(),
                self.benchmark_updates.copy(),
                self.counts_,
                self.curr_iter,
                np.zeros(0, np.double),
                0,
                distances,
                random_reassign=False,
                random_state=self.random_state_,
                reassignment_ratio=self.reassignment_ratio,
                verbose=self.verbose,
                learn_rate=eta)

            # increment inner loop counts
            self.curr_inner_iter = (self.curr_inner_iter +
                                    1) % self.update_freq

        # increment global loop count
        self.curr_iter += 1
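A hypothetical driver loop for the VR-MBKM estimator above; the constructor name and its arguments are assumptions for illustration, only the partial_fit contract comes from the example:

import numpy as np

D = np.random.RandomState(0).randn(5000, 30)   # stand-in dataset
vr_kmeans = VRMiniBatchKMeans(n_clusters=10,    # assumed class and arguments,
                              mbsize=256,       # not defined in this excerpt
                              update_freq=5)
for _ in range(100):
    # each call performs either one full-data (outer) update or one
    # variance-reduced mini-batch (inner) update, depending on the counters
    vr_kmeans.partial_fit(D)
centers = vr_kmeans.cluster_centers_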
Example #34
0
    def partial_fit(self, X, y=None, sample_weight=None):
        """Update k-means estimate on a single mini-batch X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Coordinates of the data points to cluster. It must be noted that
            X will be copied if it is not C-contiguous.
        y : Ignored
            Not used, present here for API consistency by convention.
        sample_weight : array-like, shape (n_samples,), optional
            The weights for each observation in X. If None, all observations
            are assigned equal weight (default: None).
        Returns
        -------
        self
        """

        X = check_array(X,
                        accept_sparse="csr",
                        order="C",
                        dtype=[np.float64, np.float32])
        n_samples, n_features = X.shape
        if hasattr(self.init, '__array__'):
            self.init = np.ascontiguousarray(self.init, dtype=X.dtype)

        if n_samples == 0:
            return self

        # unit-normalize for spherical k-means
        X = normalize(X)

        sample_weight = _check_normalize_sample_weight(sample_weight, X)

        x_squared_norms = row_norms(X, squared=True)
        self.random_state_ = getattr(self, "random_state_",
                                     check_random_state(self.random_state))
        if (not hasattr(self, 'counts_')
                or not hasattr(self, 'cluster_centers_')):
            # this is the first call to partial_fit on this object:
            # initialize the cluster centers
            self.cluster_centers_ = _init_centroids(
                X,
                self.n_clusters,
                self.init,
                random_state=self.random_state_,
                x_squared_norms=x_squared_norms,
                init_size=self.init_size)

            self.counts_ = np.zeros(self.n_clusters, dtype=sample_weight.dtype)
            random_reassign = False
            distances = None
        else:
            # The lower the minimum count is, the more often we do random
            # reassignment. However, we don't want to reassign too often,
            # so that the counts have a chance to build up.
            random_reassign = self.random_state_.randint(
                10 * (1 + self.counts_.min())) == 0
            distances = np.zeros(X.shape[0], dtype=X.dtype)

        self.cluster_centers_ = normalize(self.cluster_centers_)

        _mini_batch_spherical_step(X,
                                   sample_weight,
                                   x_squared_norms,
                                   self.cluster_centers_,
                                   self.counts_,
                                   np.zeros(0, dtype=X.dtype),
                                   0,
                                   random_reassign=random_reassign,
                                   distances=distances,
                                   random_state=self.random_state_,
                                   reassignment_ratio=self.reassignment_ratio,
                                   verbose=self.verbose)
        self.cluster_centers_ = normalize(self.cluster_centers_)

        if self.compute_labels:
            self.labels_, self.inertia_ = _labels_inertia(
                X, sample_weight, x_squared_norms, self.cluster_centers_)

        return self
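A minimal out-of-core usage sketch for the spherical mini-batch estimator above; the class name SphericalMiniBatchKMeans and the data generator are assumptions, only the partial_fit behaviour (internal l2-normalization, unit-norm centers) comes from the example:

import numpy as np

def data_chunks(n_chunks=20, chunk_size=500, n_features=50, seed=0):
    # stand-in for a stream of data that does not fit in memory
    rng = np.random.RandomState(seed)
    for _ in range(n_chunks):
        yield rng.randn(chunk_size, n_features)

skm = SphericalMiniBatchKMeans(n_clusters=8, random_state=0)  # assumed constructor
for chunk in data_chunks():
    skm.partial_fit(chunk)   # each chunk is l2-normalized inside partial_fit
assert np.allclose(np.linalg.norm(skm.cluster_centers_, axis=1), 1.0)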
Example #35
0
    def fit(self, X, y=None):
        """Compute the centroids on X by chunking it into mini-batches.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            Training instances to cluster.

        y : Ignored

        """
        random_state = check_random_state(self.random_state)
        X = check_array(X,
                        accept_sparse="csr",
                        order='C',
                        dtype=[np.float64, np.float32])
        n_samples, n_features = X.shape
        if n_samples < self.n_clusters:
            raise ValueError("Number of samples smaller than number "
                             "of clusters.")

        n_init = self.n_init
        if hasattr(self.init, '__array__'):
            self.init = np.ascontiguousarray(self.init, dtype=X.dtype)
            if n_init != 1:
                warnings.warn(
                    'Explicit initial center position passed: '
                    'performing only one init in MiniBatchKMeans instead of '
                    'n_init=%d' % self.n_init,
                    RuntimeWarning,
                    stacklevel=2)
                n_init = 1

        x_squared_norms = k_means_.row_norms(X, squared=True)

        if self.tol > 0.0:
            tol = k_means_._tolerance(X, self.tol)

            # using tol-based early stopping needs the allocation of a
            # dedicated buffer for the old centers, which can be expensive for
            # high-dim data: hence we allocate it outside of the main loop
            old_center_buffer = np.zeros(n_features, dtype=X.dtype)
        else:
            tol = 0.0
            # no need for the center buffer if tol-based early stopping is
            # disabled
            old_center_buffer = np.zeros(0, dtype=X.dtype)

        distances = np.zeros(self.batch_size, dtype=X.dtype)
        n_batches = int(np.ceil(float(n_samples) / self.batch_size))
        n_iter = int(self.max_iter * n_batches)

        init_size = self.init_size
        if init_size is None:
            init_size = 3 * self.batch_size
        if init_size > n_samples:
            init_size = n_samples
        self.init_size_ = init_size

        validation_indices = random_state.randint(0, n_samples, init_size)
        X_valid = X[validation_indices]
        x_squared_norms_valid = x_squared_norms[validation_indices]

        # perform several inits with random sub-sets
        best_inertia = None
        for init_idx in range(n_init):
            if self.verbose:
                print("Init %d/%d with method: %s" %
                      (init_idx + 1, n_init, self.init))
            counts = np.zeros(self.n_clusters, dtype=np.int32)

            # TODO: once the `k_means` function works with sparse input we
            # should refactor the following init to use it instead.

            # Initialize the centers using only a fraction of the data as we
            # expect n_samples to be very large when using MiniBatchKMeans
            cluster_centers = k_means_._init_centroids(
                X,
                self.n_clusters,
                self.init,
                random_state=random_state,
                x_squared_norms=x_squared_norms,
                init_size=init_size)

            # Compute the label assignment on the init dataset
            batch_inertia, centers_squared_diff = k_means_._mini_batch_step(
                X_valid,
                x_squared_norms_valid,
                cluster_centers,
                counts,
                old_center_buffer,
                False,
                distances=None,
                verbose=self.verbose)

            # Keep only the best cluster centers across independent inits on
            # the common validation set
            _, inertia = k_means_._labels_inertia(X_valid,
                                                  x_squared_norms_valid,
                                                  cluster_centers)
            if self.verbose:
                print("Inertia for init %d/%d: %f" %
                      (init_idx + 1, n_init, inertia))
            if best_inertia is None or inertia < best_inertia:
                self.cluster_centers_ = cluster_centers
                self.counts_ = counts
                best_inertia = inertia

        # Empty context to be used inplace by the convergence check routine
        convergence_context = {}

        # Perform the iterative optimization until the final convergence
        # criterion
        for iteration_idx in range(n_iter):
            # Sample a minibatch from the full dataset
            minibatch_indices = random_state.randint(0, n_samples,
                                                     self.batch_size)

            # Perform the actual update step on the minibatch data
            batch_inertia, centers_squared_diff = k_means_._mini_batch_step(
                X[minibatch_indices],
                x_squared_norms[minibatch_indices],
                self.cluster_centers_,
                self.counts_,
                old_center_buffer,
                tol > 0.0,
                distances=distances,
                # Here we randomly choose whether to perform
                # random reassignment: the choice is done as a function
                # of the iteration index, and the minimum number of
                # counts, in order to force this reassignment to happen
                # every once in a while
                random_reassign=((iteration_idx + 1) %
                                 (10 + self.counts_.min()) == 0),
                random_state=random_state,
                reassignment_ratio=self.reassignment_ratio,
                verbose=self.verbose)

            # Monitor convergence and do early stopping if necessary
            if k_means_._mini_batch_convergence(self,
                                                iteration_idx,
                                                n_iter,
                                                tol,
                                                n_samples,
                                                centers_squared_diff,
                                                batch_inertia,
                                                convergence_context,
                                                verbose=self.verbose):
                break

        self.n_iter_ = iteration_idx + 1

        if self.compute_labels:
            self.labels_, self.inertia_ = self._labels_inertia_minibatch(X)

        return self
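For reference, the iteration budget and init size used by the fit above reduce to a few lines of arithmetic; a standalone sketch with made-up sizes:

import numpy as np

n_samples, batch_size, max_iter = 10000, 256, 100
init_size = None

n_batches = int(np.ceil(float(n_samples) / batch_size))  # 40 mini-batches per pass
n_iter = int(max_iter * n_batches)                        # 4000 mini-batch updates in total

if init_size is None:
    init_size = 3 * batch_size         # default: a few batches' worth of samples
init_size = min(init_size, n_samples)  # but never more than the dataset itself

print(n_batches, n_iter, init_size)    # 40 4000 768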
Example #36
0
def _init_unit_centers(X, n_clusters, random_state, init):
    """Initializes unit norm centers.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)

    n_clusters : int, optional, default: 8
        The number of clusters to form as well as the number of
        centroids to generate.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    init:  (string) one of
        k-means++ : uses sklearn k-means++ initialization algorithm
        spherical-k-means : use centroids from one pass of spherical k-means
        random : random unit norm vectors
        random-orthonormal : random orthonormal vectors
        random-class : unit-normalized means of randomly assigned classes
        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centers.
    """
    n_examples, n_features = np.shape(X)
    if isinstance(init, np.ndarray):
        n_init_clusters, n_init_features = init.shape
        assert n_init_clusters == n_clusters
        assert n_init_features == n_features

        # ensure unit normed centers
        centers = init
        for cc in range(n_clusters):
            centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :])

        return centers

    elif init == 'spherical-k-means':
        labels, inertia, centers, iters =\
                spherical_kmeans._spherical_kmeans_single_lloyd(
                    X,
                    n_clusters,
                    x_squared_norms=np.ones((n_examples, )),
                    init='k-means++')

        return centers

    elif init == 'random':
        centers = np.random.randn(n_clusters, n_features)
        for cc in range(n_clusters):
            centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :])

        return centers

    elif init == 'k-means++':
        centers = _init_centroids(X,
                                  n_clusters,
                                  'k-means++',
                                  random_state=random_state,
                                  x_squared_norms=np.ones((n_examples, )))

        for cc in range(n_clusters):
            centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :])

        return centers

    elif init == 'random-orthonormal':
        centers = np.random.randn(n_clusters, n_features)
        q, r = np.linalg.qr(centers.T, mode='reduced')

        return q.T

    elif init == 'random-class':
        centers = np.zeros((n_clusters, n_features))
        for cc in range(n_clusters):
            while np.linalg.norm(centers[cc, :]) == 0:
                labels = np.random.randint(0, n_clusters, n_examples)
                centers[cc, :] = X[labels == cc, :].sum(axis=0)

        for cc in range(n_clusters):
            centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :])

        return centers
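A small numpy check (not part of the original) of the 'random-orthonormal' branch above: the QR trick yields centers with unit norm and mutual orthogonality, assuming n_clusters <= n_features:

import numpy as np

n_clusters, n_features = 4, 10
rng = np.random.RandomState(0)

centers = rng.randn(n_clusters, n_features)
q, _ = np.linalg.qr(centers.T, mode='reduced')  # q: (n_features, n_clusters), orthonormal columns
centers = q.T                                   # rows are now orthonormal unit vectors

assert np.allclose(np.linalg.norm(centers, axis=1), 1.0)
assert np.allclose(centers @ centers.T, np.eye(n_clusters))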
Example #37
0
    def sub_kmeans_single_(self, X, sample_weight, x_squared_norms, tol,
                           random_state):
        random_state = check_random_state(random_state)
        sample_weight = _check_sample_weight(X, sample_weight)
        best_labels, best_inertia, best_centers = None, None, None

        distances = np.zeros(shape=(X.shape[0], ), dtype=X.dtype)
        centers = _init_centroids(X,
                                  self.n_clusters,
                                  init='k-means++',
                                  random_state=random_state,
                                  x_squared_norms=x_squared_norms)

        d = X.shape[1]  # dimensionality of the original space
        m = d // 2  # dimensionality of the clustered subspace
        SD = np.dot(X.T,
                    X)  # scatter matrix of the dataset in the original space

        # orthonormal matrix of a rigid transformation
        V, _ = np.linalg.qr(random_state.random_sample(d**2).reshape(d, d),
                            mode='complete')
        for i in range(self.max_iter):
            centers_old = centers.copy()

            # get the clusters' labels
            labels = self.assignment_step_(X=X, V=V, centers=centers, m=m)

            # compute new centers and sum the clusters' scatter matrices
            centers = _k_means._centers_dense(X, sample_weight, labels,
                                              self.n_clusters, distances)
            S = self.update_step_(X, centers, labels)

            # sorted eigenvalues and eigenvectors of SIGMA=S-SD
            V, m = self.eigen_decomposition_(S - SD)
            if m == 0:
                raise ValueError('Might be a single cluster (m = 0).')

            # inertia - sum of squared distances of samples to their closest cluster center
            inertia = sum([
                row_norms(X[labels == j] - centers[j], squared=True).sum()
                for j in range(self.n_clusters)
            ])

            # print("Iteration %2d, inertia %.3f" % (i, inertia))
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia

            center_shift_total = squared_norm(centers_old - centers)
            if center_shift_total <= tol:
                # print("Converged at iteration %d: center shift %e within tolerance %e" % (i, center_shift_total, tol))
                break

        if center_shift_total > 0:
            # rerun E-step in case of non-convergence so that predicted labels match cluster centers
            best_labels, best_inertia = _labels_inertia(
                X,
                sample_weight,
                x_squared_norms,
                best_centers,
                precompute_distances=False,
                distances=distances)

        return best_centers, best_labels, best_inertia
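The assignment_step_ called above is not shown in this excerpt. A minimal sketch of what such a step typically looks like in SubKMeans, under the assumption that labels come from distances in the m-dimensional subspace spanned by the first m columns of V (names and details are illustrative, not taken from the example):

import numpy as np

def assignment_step(X, V, centers, m):
    """Assign each sample to the nearest center in the m-dimensional
    clustered subspace spanned by the first m columns of V."""
    Pm = V[:, :m]                 # projection onto the clustered subspace
    X_proj = X @ Pm               # (n_samples, m)
    centers_proj = centers @ Pm   # (n_clusters, m)
    # squared Euclidean distances in the subspace, then nearest center
    dists = ((X_proj[:, None, :] - centers_proj[None, :, :]) ** 2).sum(axis=-1)
    return dists.argmin(axis=1)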