예제 #1
0
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  clustering data points using fuzzy kmeans clustering method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the parameter of fuzzy kmeans.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = points.evaluate()
  num_dim = points.shape[1]
  if centers is None:
      centers = expr.rand(k, num_dim)

  #labels = expr.zeros((points.shape[0],), dtype=np.int)

  for iter in range(num_iter):
    centers = centers.glom()
    fuzzy = expr.map2(points, 0, fn=kmeans_map2_dist_mapper,
                      fn_kw={"centers": centers, "m": m},
                      shape=(points.shape[0], centers.shape[0]))
    labels = expr.argmax(fuzzy, axis=1)
    new_centers = expr.map2((points, fuzzy), (0, 0), fn=kmeans_map2_center_mapper,
                            fn_kw={"centers": centers, "m": m},
                            shape=(centers.shape[0], centers.shape[1]), reducer=np.add)
    new_centers /= expr.sum(fuzzy ** m, axis=0)[:, expr.newaxis]
    centers = new_centers
  return labels
예제 #2
0
def cholesky(A):
    '''
  Cholesky matrix decomposition.

  Args:
    A(Expr): matrix to be decomposed
  '''

    A = expr.force(A)
    n = int(math.sqrt(len(A.tiles)))
    tile_size = A.shape[0] / n
    for k in range(n):
        # A[k,k] = DPOTRF(A[k,k])
        diag_ex = get_ex(k, k, tile_size, A.shape)
        A = expr.map2(A, ((0, 1), ),
                      fn=_cholesky_dpotrf_mapper,
                      shape=A.shape,
                      update_region=diag_ex)

        if k == n - 1: break

        # A[l,k] = DTRSM(A[k,k], A[l,k]) l -> [k+1,n)
        col_ex = extent.create(((k + 1) * tile_size, k * tile_size),
                               (n * tile_size, (k + 1) * tile_size), A.shape)
        diag_tile = A.force().fetch(diag_ex)
        A = expr.map2(A, ((0, 1), ),
                      fn=_cholesky_dtrsm_mapper,
                      fn_kw=dict(array=force(A), diag_tile=diag_tile),
                      shape=A.shape,
                      update_region=col_ex)

        # A[m,m] = DSYRK(A[m,k], A[m,m]) m -> [k+1,n)
        # A[l,m] = DGEMM(A[l,k], A[m,k], A[l,m]) m -> [k+1,n) l -> [m+1,n)
        col_exs = list([
            extent.create((m * tile_size, m * tile_size),
                          (n * tile_size, (m + 1) * tile_size), A.shape)
            for m in range(k + 1, n)
        ])
        dgemm_1 = expr.transpose(A)[(k * tile_size):((k + 1) * tile_size), :]
        dgemm_2 = A[:, (k * tile_size):((k + 1) * tile_size)]
        A = expr.map2((A, dgemm_1, dgemm_2), ((0, 1), 1, 0),
                      fn=_cholesky_dsyrk_dgemm_mapper,
                      fn_kw=dict(array=force(A), k=k),
                      shape=A.shape,
                      update_region=col_exs)

    # update the right corner to 0
    col_exs = list([
        extent.create((0, m * tile_size), (m * tile_size, (m + 1) * tile_size),
                      A.shape) for m in range(1, n)
    ])
    A = expr.map2(A, ((0, 1), ),
                  fn=_zero_mapper,
                  shape=A.shape,
                  update_region=col_exs)
    return A
예제 #3
0
def cholesky(A):
    '''
  Cholesky matrix decomposition.

  Args:
    A(Expr): matrix to be decomposed
  '''
    n = int(math.sqrt(FLAGS.num_workers))
    tile_size = A.shape[0] / n
    print n, tile_size
    for k in range(n):
        # A[k,k] = DPOTRF(A[k,k])
        diag_ex = get_ex(k, k, tile_size, A.shape)
        A = expr.map2(A, ((0, 1), ),
                      fn=_cholesky_dpotrf_mapper,
                      shape=A.shape,
                      update_region=diag_ex)

        if k == n - 1: break

        # A[l,k] = DTRSM(A[k,k], A[l,k]) l -> [k+1,n)
        col_ex = extent.create(((k + 1) * tile_size, k * tile_size),
                               (n * tile_size, (k + 1) * tile_size), A.shape)
        A = expr.map2((A, A[diag_ex.to_slice()]), ((0, 1), None),
                      fn=_cholesky_dtrsm_mapper,
                      shape=A.shape,
                      update_region=col_ex)

        # A[m,m] = DSYRK(A[m,k], A[m,m]) m -> [k+1,n)
        # A[l,m] = DGEMM(A[l,k], A[m,k], A[l,m]) m -> [k+1,n) l -> [m+1,n)
        col_exs = list([
            extent.create((m * tile_size, m * tile_size),
                          (n * tile_size, (m + 1) * tile_size), A.shape)
            for m in range(k + 1, n)
        ])
        dgemm = A[:, (k * tile_size):((k + 1) * tile_size)]
        A = expr.map2((A, expr.transpose(dgemm), dgemm), ((0, 1), 1, 0),
                      fn=_cholesky_dsyrk_dgemm_mapper,
                      shape=A.shape,
                      update_region=col_exs).optimized()

    # update the right corner to 0
    col_exs = list([
        extent.create((0, m * tile_size), (m * tile_size, (m + 1) * tile_size),
                      A.shape) for m in range(1, n)
    ])
    A = expr.map2(A, ((0, 1), ),
                  fn=_zero_mapper,
                  shape=A.shape,
                  update_region=col_exs)
    return A
예제 #4
0
def cholesky(A):
  '''
  Cholesky matrix decomposition.

  Args:
    A(Expr): matrix to be decomposed
  '''
  n = int(math.sqrt(FLAGS.num_workers))
  tile_size = A.shape[0] / n
  print n, tile_size
  for k in range(n):
    # A[k,k] = DPOTRF(A[k,k])
    diag_ex = get_ex(k, k, tile_size, A.shape)
    A = expr.map2(A, ((0, 1), ), fn=_cholesky_dpotrf_mapper,
                  shape=A.shape, update_region=diag_ex)

    if k == n - 1: break

    # A[l,k] = DTRSM(A[k,k], A[l,k]) l -> [k+1,n)
    col_ex = extent.create(((k+1)*tile_size, k*tile_size), (n*tile_size, (k+1)*tile_size), A.shape)
    A = expr.map2((A, A[diag_ex.to_slice()]), ((0, 1), None), fn=_cholesky_dtrsm_mapper,
                  shape=A.shape, update_region=col_ex)

    # A[m,m] = DSYRK(A[m,k], A[m,m]) m -> [k+1,n)
    # A[l,m] = DGEMM(A[l,k], A[m,k], A[l,m]) m -> [k+1,n) l -> [m+1,n)
    col_exs = list([extent.create((m*tile_size, m*tile_size), (n*tile_size, (m+1)*tile_size), A.shape) for m in range(k+1, n)])
    dgemm = A[:, (k * tile_size):((k + 1) * tile_size)]
    A = expr.map2((A, expr.transpose(dgemm), dgemm), ((0, 1), 1, 0),
                  fn=_cholesky_dsyrk_dgemm_mapper,
                  shape=A.shape, update_region=col_exs).optimized()

  # update the right corner to 0
  col_exs = list([extent.create((0, m*tile_size), (m*tile_size, (m+1)*tile_size), A.shape) for m in range(1, n)])
  A = expr.map2(A, ((0, 1), ), fn=_zero_mapper,
                shape=A.shape, update_region=col_exs)
  return A
예제 #5
0
  def fit(self, X, centers=None, implementation='outer'):
    """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    """
    num_dim = X.shape[1]
    num_points = X.shape[0]

    labels = expr.zeros((num_points, 1), dtype=np.int)

    if implementation == 'map2':
      if centers is None:
        centers = np.random.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper, fn_kw={"centers": centers},
                           shape=(X.shape[0], ))

        counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(centers.shape[0], ))
        new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                                fn_kw={'centers_count': self.n_clusters},
                                shape=(centers.shape[0], centers.shape[1]))
        counts = counts.optimized().glom()
        centers = new_centers.optimized().glom()

        # If any centroids don't have any points assigined to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # In order to get rid of dividing by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
      return centers, labels

    elif implementation == 'outer':
      if centers is None:
        centers = expr.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        labels = expr.outer((X, centers), (0, None), fn=kmeans_outer_dist_mapper,
                            shape=(X.shape[0],))
        #labels = expr.argmin(distances, axis=1)
        counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(centers.shape[0], ))
        new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                                fn_kw={'centers_count': self.n_clusters},
                                shape=(centers.shape[0], centers.shape[1]))
        counts = counts.optimized().glom()
        centers = new_centers.optimized().glom()

        # If any centroids don't have any points assigined to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # In order to get rid of dividing by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
        centers = expr.from_numpy(centers)
      return centers, labels
    elif implementation == 'broadcast':
      if centers is None:
        centers = expr.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        util.log_warn("k_means_ %d %d", i, time.time())
        X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
        centers_broadcast = expr.reshape(centers, (1, centers.shape[0],
                                                   centers.shape[1]))
        distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
        labels = expr.argmin(distances, axis=1)
        center_idx = expr.arange((1, centers.shape[0]))
        matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
        matches = matches.astype(np.int64)
        counts = expr.sum(matches, axis=0)
        centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                                matches.shape[1], 1)),
                           axis=0)

        counts = counts.optimized().glom()
        centers = centers.optimized().glom()

        # If any centroids don't have any points assigined to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # In order to get rid of dividing by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
        centers = expr.from_numpy(centers)
      return centers, labels
    elif implementation == 'shuffle':
      if centers is None:
        centers = np.random.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        # Reset them to zero.
        new_centers = expr.ndarray((self.n_clusters, num_dim),
                                   reduce_fn=lambda a, b: a + b)
        new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int,
                                  reduce_fn=lambda a, b: a + b)

        _ = expr.shuffle(X,
                         _find_cluster_mapper,
                         kw={'d_pts': X,
                             'old_centers': centers,
                             'new_centers': new_centers,
                             'new_counts': new_counts,
                             'labels': labels},
                         shape_hint=(1,),
                         cost_hint={hash(labels): {'00': 0,
                                                   '01': np.prod(labels.shape)}})
        _.force()

        new_counts = new_counts.glom()
        new_centers = new_centers.glom()

        # If any centroids don't have any points assigined to them.
        zcount_indices = (new_counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # In order to get rid of dividing by zero.
          new_counts[zcount_indices] = 1
          new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        new_centers = new_centers / new_counts
        centers = new_centers

      return centers, labels
예제 #6
0
    def fit(self, X, centers=None, implementation='map2'):
        """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    """
        num_dim = X.shape[1]
        num_points = X.shape[0]

        labels = expr.zeros((num_points, 1), dtype=np.int)

        if implementation == 'map2':
            if centers is None:
                centers = np.random.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                labels = expr.map2(X,
                                   0,
                                   fn=kmeans_map2_dist_mapper,
                                   fn_kw={"centers": centers},
                                   shape=(X.shape[0], ))

                counts = expr.map2(labels,
                                   0,
                                   fn=kmeans_count_mapper,
                                   fn_kw={'centers_count': self.n_clusters},
                                   shape=(centers.shape[0], ))
                new_centers = expr.map2(
                    (X, labels), (0, 0),
                    fn=kmeans_center_mapper,
                    fn_kw={'centers_count': self.n_clusters},
                    shape=(centers.shape[0], centers.shape[1]))
                counts = counts.optimized().glom()
                centers = new_centers.optimized().glom()

                # If any centroids don't have any points assigined to them.
                zcount_indices = (counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # In order to get rid of dividing by zero.
                    counts[zcount_indices] = 1
                    centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                centers = centers / counts.reshape(centers.shape[0], 1)
            return centers, labels

        elif implementation == 'outer':
            if centers is None:
                centers = expr.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                labels = expr.outer((X, centers), (0, None),
                                    fn=kmeans_outer_dist_mapper,
                                    shape=(X.shape[0], ))
                #labels = expr.argmin(distances, axis=1)
                counts = expr.map2(labels,
                                   0,
                                   fn=kmeans_count_mapper,
                                   fn_kw={'centers_count': self.n_clusters},
                                   shape=(centers.shape[0], ))
                new_centers = expr.map2(
                    (X, labels), (0, 0),
                    fn=kmeans_center_mapper,
                    fn_kw={'centers_count': self.n_clusters},
                    shape=(centers.shape[0], centers.shape[1]))
                counts = counts.optimized().glom()
                centers = new_centers.optimized().glom()

                # If any centroids don't have any points assigined to them.
                zcount_indices = (counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # In order to get rid of dividing by zero.
                    counts[zcount_indices] = 1
                    centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                centers = centers / counts.reshape(centers.shape[0], 1)
                centers = expr.from_numpy(centers)
            return centers, labels
        elif implementation == 'broadcast':
            if centers is None:
                centers = expr.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                util.log_warn("k_means_ %d %d", i, time.time())
                X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
                centers_broadcast = expr.reshape(
                    centers, (1, centers.shape[0], centers.shape[1]))
                distances = expr.sum(expr.square(X_broadcast -
                                                 centers_broadcast),
                                     axis=2)
                labels = expr.argmin(distances, axis=1)
                center_idx = expr.arange((1, centers.shape[0]))
                matches = expr.reshape(labels,
                                       (labels.shape[0], 1)) == center_idx
                matches = matches.astype(np.int64)
                counts = expr.sum(matches, axis=0)
                centers = expr.sum(
                    X_broadcast *
                    expr.reshape(matches,
                                 (matches.shape[0], matches.shape[1], 1)),
                    axis=0)

                counts = counts.optimized().glom()
                centers = centers.optimized().glom()

                # If any centroids don't have any points assigined to them.
                zcount_indices = (counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # In order to get rid of dividing by zero.
                    counts[zcount_indices] = 1
                    centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                centers = centers / counts.reshape(centers.shape[0], 1)
                centers = expr.from_numpy(centers)
            return centers, labels
        elif implementation == 'shuffle':
            if centers is None:
                centers = np.random.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                # Reset them to zero.
                new_centers = expr.ndarray((self.n_clusters, num_dim),
                                           reduce_fn=lambda a, b: a + b)
                new_counts = expr.ndarray((self.n_clusters, 1),
                                          dtype=np.int,
                                          reduce_fn=lambda a, b: a + b)

                _ = expr.shuffle(X,
                                 _find_cluster_mapper,
                                 kw={
                                     'd_pts': X,
                                     'old_centers': centers,
                                     'new_centers': new_centers,
                                     'new_counts': new_counts,
                                     'labels': labels
                                 },
                                 shape_hint=(1, ),
                                 cost_hint={
                                     hash(labels): {
                                         '00': 0,
                                         '01': np.prod(labels.shape)
                                     }
                                 })
                _.force()

                new_counts = new_counts.glom()
                new_centers = new_centers.glom()

                # If any centroids don't have any points assigined to them.
                zcount_indices = (new_counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # In order to get rid of dividing by zero.
                    new_counts[zcount_indices] = 1
                    new_centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                new_centers = new_centers / new_counts
                centers = new_centers

            return centers, labels