Example #1
    def test_reshape_dot(self):
        npa1 = np.random.random((357, 93))
        npa2 = np.random.random((31, 357))
        result = np.dot(np.reshape(npa1, (1071, 31)), npa2)

        t1 = expr.from_numpy(npa1)
        t2 = expr.from_numpy(npa2)
        t3 = expr.dot(expr.reshape(t1, (1071, 31)), t2)
        Assert.all_eq(result, t3.glom(), 10e-9)

        npa1 = np.random.random((357, 718))
        npa2 = np.random.random((718, ))
        result = np.dot(npa1, np.reshape(npa2, (718, 1)))

        t1 = expr.from_numpy(npa1)
        t2 = expr.from_numpy(npa2)
        t3 = expr.dot(t1, expr.reshape(t2, (718, 1)))
        Assert.all_eq(result, t3.glom(), 10e-9)

        npa1 = np.random.random((718, ))
        npa2 = np.random.random((1, 357))
        result = np.dot(np.reshape(npa1, (718, 1)), npa2)

        t1 = expr.from_numpy(npa1)
        t2 = expr.from_numpy(npa2)
        t3 = expr.dot(expr.reshape(t1, (718, 1)), t2)
        Assert.all_eq(result, t3.glom(), 10e-9)
Example #2
 def test_reshape5(self):
     a = expr.arange((35511, ))
     b = expr.reshape(a, (133, 267))
     c = expr.reshape(b, (267, 133))
     d = expr.reshape(c, (1, 35511))
     e = expr.arange((1, 35511))
     Assert.all_eq(d.glom(), e.glom())
Example #3
 def test_reshape3(self):
   a = expr.arange((100, 100))
   b = expr.reshape(a, (10000,))
   c = expr.reshape(b, (10000, 1))
   d = expr.reshape(c, (1, 10000))
   e = expr.arange((1, 10000))
   Assert.all_eq(d.glom(), e.glom())
Example #4
 def test_reshape6(self):
     a = expr.arange((12319, ))
     b = expr.reshape(a, (127, 97))
     c = expr.reshape(b, (97, 127))
     d = expr.reshape(c, (1, 12319))
     e = expr.arange((1, 12319))
     Assert.all_eq(d.glom(), e.glom())
Example #5
 def test_reshape8(self):
   t1 = expr.sparse_diagonal((137, 113))
   t2 = expr.sparse_diagonal((113, 137))
   a = expr.reshape(t1, (113, 137))
   b = expr.reshape(t2, (137, 113))
   Assert.all_eq(a.glom().todense(), sp.eye(137, 113).tolil().reshape((113, 137)).todense())
   Assert.all_eq(b.glom().todense(), sp.eye(113, 137).tolil().reshape((137, 113)).todense())
Example #6
  def kneighbors(self, X, n_neighbors=None):
    """Finds the K-neighbors of a point.

        Returns the distances to and the indices of the nearest neighbors.

        Parameters
        ----------
        X : array-like, last dimension same as that of fit data
            The new point.

        n_neighbors : int
            Number of neighbors to get (default is the value
            passed to the constructor).

        Returns
        -------
        dist : array
            Array representing the distances to the nearest points.

        ind : array
            Indices of the nearest points in the population matrix.
    """
    if n_neighbors is not None:
      self.n_neighbors = n_neighbors

    if isinstance(X, np.ndarray):
      X = expr.from_numpy(X)

    if self.algorithm in ('auto', 'brute'):
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      fit_X_broadcast = expr.reshape(self.X, (1, self.X.shape[0], self.X.shape[1]))
      distances = expr.sum((X_broadcast - fit_X_broadcast) ** 2, axis=2)
      neigh_ind = expr.argsort(distances, axis=1)
      neigh_ind = neigh_ind[:, :self.n_neighbors].optimized().glom()
      neigh_dist = expr.sort(distances, axis=1)
      neigh_dist = expr.sqrt(neigh_dist[:, :self.n_neighbors]).optimized().glom()
      return neigh_dist, neigh_ind
    else:
      results = self.X.foreach_tile(mapper_fn=_knn_mapper,
                                    kw={'X': self.X, 'Q': X,
                                        'n_neighbors': self.n_neighbors,
                                        'algorithm': self.algorithm})
      dist = None
      ind = None
      """ Get the KNN candidates for each tile of X, then find out the real KNN """
      for k, v in results.iteritems():
        if dist is None:
          dist = v[0]
          ind = v[1]
        else:
          dist = np.concatenate((dist, v[0]), axis=1)
          ind = np.concatenate((ind, v[1]), axis=1)

      mask = np.argsort(dist, axis=1)[:, :self.n_neighbors]
      new_dist = np.array([dist[i][mask[i]] for i, r in enumerate(dist)])
      new_ind = np.array([ind[i][mask[i]] for i, r in enumerate(ind)])
      return new_dist, new_ind
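The brute-force branch above computes all pairwise squared distances at once by reshaping the query and fit arrays so that the subtraction broadcasts. A minimal NumPy-only sketch of that trick (the array names and sizes here are illustrative, not taken from the example):

import numpy as np

Q = np.random.rand(5, 3)   # query points, shape (n_queries, n_features)
P = np.random.rand(8, 3)   # fitted points, shape (n_samples, n_features)

# Reshape to (n_queries, 1, n_features) and (1, n_samples, n_features); the
# subtraction then broadcasts to (n_queries, n_samples, n_features).
diff = Q.reshape(5, 1, 3) - P.reshape(1, 8, 3)
distances = (diff ** 2).sum(axis=2)              # (n_queries, n_samples)

k = 3
neigh_ind = np.argsort(distances, axis=1)[:, :k]
neigh_dist = np.sqrt(np.sort(distances, axis=1)[:, :k])
print(neigh_ind.shape, neigh_dist.shape)         # (5, 3) (5, 3)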
Example #7
 def test_reshape4(self):
   a = expr.arange((10000, ))
   b = expr.reshape(a, (10, 1000))
   c = expr.reshape(b, (1000, 10))
   d = expr.reshape(c, (20, 500))
   e = expr.reshape(d, (500, 20))
   f = expr.reshape(e, (1, 10000))
   g = expr.arange((1, 10000))
   Assert.all_eq(f.glom(), g.glom())
Example #8
  def fit(self, X, centers=None):
    """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    """
    num_dim = X.shape[1]
    num_points = X.shape[0]

    labels = expr.zeros((num_points, 1), dtype=np.int)

    if centers is None:
      centers = expr.from_numpy(np.random.rand(self.n_clusters, num_dim))

    for i in range(self.n_iter):
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
      distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
      labels = expr.argmin(distances, axis=1)
      center_idx = expr.arange((1, centers.shape[0]))
      matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
      matches = matches.astype(np.int64)
      counts = expr.sum(matches, axis=0)
      centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                              matches.shape[1], 1)),
                         axis=0)

      counts = counts.optimized().glom()
      centers = centers.optimized().glom()

      # Check whether any centroids have no points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)

      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector.  We reseed these
        # centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # In order to get rid of dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels
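For context, a minimal driver for the fit method above, assuming it lives on a KMeans-style class exposing n_clusters and n_iter (the class name and constructor below are assumptions, not part of the example):

import numpy as np
from spartan import expr

X = expr.from_numpy(np.random.rand(1000, 4))  # row-tiled input matrix
model = KMeans(n_clusters=8, n_iter=10)       # hypothetical constructor
centers, labels = model.fit(X)
print(centers.shape)                          # expected: (8, 4)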
Example #9
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  clustering data points using fuzzy kmeans clustering method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the parameter of fuzzy kmeans.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = expr.force(points)
  num_dim = points.shape[1]
  if centers is None:
      centers = expr.rand(k, num_dim)

  labels = expr.zeros((points.shape[0],), dtype=np.int)

  for iter in range(num_iter):
    centers = expr.as_array(centers)
    points_broadcast = expr.reshape(points, (points.shape[0], 1, points.shape[1]))
    centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
    distances = expr.sum(expr.square(points_broadcast - centers_broadcast), axis=2)
    # Add a small constant to avoid division by zero.
    distances = distances + 0.00000000001
    util.log_info('distances shape %s' % str(distances.shape))
    distances_broadcast = expr.reshape(distances, (distances.shape[0], 1,
                                                   distances.shape[1]))
    distances_broadcast2 = expr.reshape(distances, (distances.shape[0],
                                                    distances.shape[1], 1))
    prob = 1.0 / expr.sum(expr.power(distances_broadcast / distances_broadcast2,
                                     2.0 / (m - 1)), axis=2)
    prob.force()
    counts = expr.sum(prob, axis=0)
    counts = expr.reshape(counts, (counts.shape[0], 1))
    labels = expr.argmax(prob, axis=1)
    centers = expr.sum(expr.reshape(points, (points.shape[0], 1, points.shape[1])) *
                       expr.reshape(prob, (prob.shape[0], prob.shape[1], 1)),
                       axis=0)

    # We assume the centers matrix is small enough to be handled on the master.
    counts = counts.glom()
    centers = centers.glom()
    # Check whether any centroids have no points assigned to them.
    zcount_indices = (counts == 0).reshape(k)

    if np.any(zcount_indices):
      # One or more centroids may not have any points assigned to them, which results in their
      # position being the zero-vector.  We reseed these centroids with new random values
      # and set their counts to 1 in order to get rid of dividing by zero.
      counts[zcount_indices, :] = 1
      centers[zcount_indices, :] = np.random.rand(np.count_nonzero(zcount_indices),
                                                  num_dim)

    centers = centers / counts
  return labels
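A minimal usage sketch for fuzzy_kmeans as defined above (the random input data is purely illustrative):

import numpy as np
from spartan import expr

points = expr.from_numpy(np.random.rand(500, 3))
labels = fuzzy_kmeans(points, k=4, num_iter=5, m=2.0)
print(labels.glom()[:10])   # hard cluster assignment of the first ten points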
Example #10
 def update(self):
     """
     gradient_update = 2 * x.T * x * w - 2 * x.T * y + 2 * lambda * w
     """
     xT = expr.transpose(self.x)
     g1 = 2 * expr.dot(expr.dot(xT, self.x), self.w)
     g2 = 2 * expr.dot(xT, self.y)
     g3 = 2 * self.ridge_lambda * self.w
     g4 = g1 - g2 + g3
     return expr.reshape(g4, (1, self.N_DIM))
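As a sanity check on that gradient formula, a self-contained NumPy finite-difference test (the names x, y, w and ridge_lambda are placeholders here, not attributes of the class above):

import numpy as np

x = np.random.rand(50, 4)
y = np.random.rand(50, 1)
w = np.random.rand(4, 1)
ridge_lambda = 0.1

# Gradient of ||x.dot(w) - y||^2 + ridge_lambda * ||w||^2 with respect to w.
grad = 2 * x.T.dot(x).dot(w) - 2 * x.T.dot(y) + 2 * ridge_lambda * w

def loss(w):
  r = x.dot(w) - y
  return float((r * r).sum() + ridge_lambda * (w * w).sum())

# Perturb the first coordinate and compare against the analytic gradient.
eps = 1e-6
w2 = w.copy()
w2[0, 0] += eps
print(abs((loss(w2) - loss(w)) / eps - grad[0, 0]) < 1e-3)   # expect True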
Example #11
  def test_optimization_shape(self):
    shape = (200, 800)
    na = np.arange(np.prod(shape), dtype=np.int).reshape(shape)
    nb = np.random.randint(1, 1000, (1000, 1000))
    nc = np.random.randint(1, 1000, (1000, 1000))
    a = expr.arange(shape, dtype=np.int)
    b = expr.from_numpy(nb)
    c = expr.from_numpy(nc)

    d = b + c
    e = b + d
    f = d[200:900, 200:900]
    g = e[200:900, 200:900]
    h = f + g
    i = f + h
    j = h[100:500, 100:500]
    k = i[100:300, 100:300]
    l = expr.reshape(expr.ravel(j), (800, 200))
    m = expr.dot(a, l)
    n = m + k
    o = n + m 
    q = o[100:200, 100:200]

    nd = nb + nc
    ne = nb + nd
    nf = nd[200:900, 200:900]
    ng = ne[200:900, 200:900]
    nh = nf + ng
    ni = nf + nh
    nj = nh[100:500, 100:500]
    nk = ni[100:300, 100:300]
    nl = np.reshape(np.ravel(nj), (800, 200))
    nm = np.dot(na, nl)
    nn = nm + nk
    no = nn + nm 
    nq = no[100:200, 100:200]


    Assert.all_eq(nq, q.optimized().glom(), tolerance=1e-10)
Example #12
def center_data(X, y, fit_intercept, normalize=False):
  """
  Centers data to have mean zero along axis 0. This is here because
  nearly all linear models will want their data to be centered.
  """
  if fit_intercept:
    X_mean = X.mean(axis=0)
    X_mean = expr.reshape(X_mean, (1, X_mean.shape[0]))
    X -= X_mean
    
    if normalize:
      X_std = expr.sqrt(expr.sum(X ** 2, axis=0)).force()
      X_std[X_std == 0] = 1
      X /= X_std
    else:
      X_std = expr.ones(X.shape[1])
    
    y_mean = y.mean(axis=0)
    y -= y_mean
  else:
    X_mean = expr.zeros(X.shape[1])
    X_std = expr.ones(X.shape[1])
    y_mean = 0. if y.ndim == 1 else expr.zeros(y.shape[1], dtype=X.dtype)
  return X, y, X_mean, y_mean, X_std
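A NumPy-only sketch of the same centering and (optional) normalization logic, for comparison with the distributed version above (shapes are illustrative):

import numpy as np

X = np.random.rand(100, 5)
y = np.random.rand(100)

X_mean = X.mean(axis=0)
X = X - X_mean                          # zero-mean each feature column
X_std = np.sqrt((X ** 2).sum(axis=0))   # column norms of the centered data
X_std[X_std == 0] = 1                   # guard against constant columns
X_norm = X / X_std                      # the normalize=True branch
y_mean = y.mean(axis=0)
y = y - y_mean
print(np.allclose(X.mean(axis=0), 0))   # centered columns have mean zero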
Example #13
 def test_reshape1(self):
   a = expr.arange((10, 10))
   b = expr.reshape(a, (100,))
   c = expr.arange((100,)) 
   Assert.all_eq(b.glom(), c.glom())
Example #14
  def test_reshape7(self):
    t1 = expr.arange((23, 120, 100)).glom()
    t2 = expr.arange((12, 230, 100)).glom()
    t3 = expr.arange((276000, 1)).glom()
    t4 = expr.arange((1, 276000)).glom()

    a = expr.arange((100, 23, 120))
    b = expr.arange((12, 23, 1000))
    c = expr.arange((1, 276000))
    d = expr.arange((276000, 1))
    e = expr.arange((276000, ))

    Assert.all_eq(expr.reshape(a, (23, 120, 100)).glom(), t1)
    Assert.all_eq(expr.reshape(a, (12, 230, 100)).glom(), t2)
    Assert.all_eq(expr.reshape(a, (276000, 1)).glom(), t3)
    Assert.all_eq(expr.reshape(a, (1, 276000)).glom(), t4)
    Assert.all_eq(expr.reshape(b, (23, 120, 100)).glom(), t1)
    Assert.all_eq(expr.reshape(b, (12, 230, 100)).glom(), t2)
    Assert.all_eq(expr.reshape(b, (276000, 1)).glom(), t3)
    Assert.all_eq(expr.reshape(b, (1, 276000)).glom(), t4)
    Assert.all_eq(expr.reshape(c, (23, 120, 100)).glom(), t1)
    Assert.all_eq(expr.reshape(c, (12, 230, 100)).glom(), t2)
    Assert.all_eq(expr.reshape(c, (276000, 1)).glom(), t3)
    Assert.all_eq(expr.reshape(c, (1, 276000)).glom(), t4)
    Assert.all_eq(expr.reshape(d, (23, 120, 100)).glom(), t1)
    Assert.all_eq(expr.reshape(d, (12, 230, 100)).glom(), t2)
    Assert.all_eq(expr.reshape(d, (276000, 1)).glom(), t3)
    Assert.all_eq(expr.reshape(d, (1, 276000)).glom(), t4)
    Assert.all_eq(expr.reshape(e, (23, 120, 100)).glom(), t1)
    Assert.all_eq(expr.reshape(e, (12, 230, 100)).glom(), t2)
    Assert.all_eq(expr.reshape(e, (276000, 1)).glom(), t3)
    Assert.all_eq(expr.reshape(e, (1, 276000)).glom(), t4)
Example #15
 def test_reshape2(self):
     a = expr.arange((1000, ), tile_hint=[100])
     b = expr.reshape(a, (10, 100)).force()
     c = expr.reshape(b, (1000, )).force()
Example #16
  def fit(self, X, centers=None, implementation='outer'):
    """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    """
    num_dim = X.shape[1]
    num_points = X.shape[0]

    labels = expr.zeros((num_points, 1), dtype=np.int)

    if implementation == 'map2':
      if centers is None:
        centers = np.random.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper, fn_kw={"centers": centers},
                           shape=(X.shape[0], ))

        counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(centers.shape[0], ))
        new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                                fn_kw={'centers_count': self.n_clusters},
                                shape=(centers.shape[0], centers.shape[1]))
        counts = counts.optimized().glom()
        centers = new_centers.optimized().glom()

        # Check whether any centroids have no points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # In order to get rid of dividing by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
      return centers, labels

    elif implementation == 'outer':
      if centers is None:
        centers = expr.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        labels = expr.outer((X, centers), (0, None), fn=kmeans_outer_dist_mapper,
                            shape=(X.shape[0],))
        #labels = expr.argmin(distances, axis=1)
        counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(centers.shape[0], ))
        new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                                fn_kw={'centers_count': self.n_clusters},
                                shape=(centers.shape[0], centers.shape[1]))
        counts = counts.optimized().glom()
        centers = new_centers.optimized().glom()

        # Check whether any centroids have no points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # In order to get rid of dividing by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
        centers = expr.from_numpy(centers)
      return centers, labels
    elif implementation == 'broadcast':
      if centers is None:
        centers = expr.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        util.log_warn("k_means_ %d %d", i, time.time())
        X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
        centers_broadcast = expr.reshape(centers, (1, centers.shape[0],
                                                   centers.shape[1]))
        distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
        labels = expr.argmin(distances, axis=1)
        center_idx = expr.arange((1, centers.shape[0]))
        matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
        matches = matches.astype(np.int64)
        counts = expr.sum(matches, axis=0)
        centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                                matches.shape[1], 1)),
                           axis=0)

        counts = counts.optimized().glom()
        centers = centers.optimized().glom()

        # Check whether any centroids have no points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # In order to get rid of dividing by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
        centers = expr.from_numpy(centers)
      return centers, labels
    elif implementation == 'shuffle':
      if centers is None:
        centers = np.random.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        # Reset them to zero.
        new_centers = expr.ndarray((self.n_clusters, num_dim),
                                   reduce_fn=lambda a, b: a + b)
        new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int,
                                  reduce_fn=lambda a, b: a + b)

        _ = expr.shuffle(X,
                         _find_cluster_mapper,
                         kw={'d_pts': X,
                             'old_centers': centers,
                             'new_centers': new_centers,
                             'new_counts': new_counts,
                             'labels': labels},
                         shape_hint=(1,),
                         cost_hint={hash(labels): {'00': 0,
                                                   '01': np.prod(labels.shape)}})
        _.force()

        new_counts = new_counts.glom()
        new_centers = new_centers.glom()

        # Check whether any centroids have no points assigned to them.
        zcount_indices = (new_counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # In order to get rid of dividing by zero.
          new_counts[zcount_indices] = 1
          new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        new_centers = new_centers / new_counts
        centers = new_centers

      return centers, labels
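The 'broadcast' branch above recomputes centers by expanding the label vector into a one-hot membership matrix and reducing over points. A NumPy-only sketch of that step (names and sizes are illustrative):

import numpy as np

n_points, n_clusters, n_dim = 10, 3, 4
X = np.random.rand(n_points, n_dim)
labels = np.random.randint(0, n_clusters, n_points)

# One-hot membership matrix, shape (n_points, n_clusters).
matches = (labels.reshape(n_points, 1) == np.arange(n_clusters)).astype(np.int64)
counts = np.maximum(matches.sum(axis=0), 1)   # points per cluster; avoid dividing by zero

# Sum the points assigned to each cluster, then average.
sums = (X.reshape(n_points, 1, n_dim) * matches.reshape(n_points, n_clusters, 1)).sum(axis=0)
centers = sums / counts.reshape(n_clusters, 1)
print(centers.shape)                          # (3, 4)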