Example #1
def benchmark_svm(ctx, timer):
  
  print "#worker:", ctx.num_workers
  max_iter = 2
  #N = 200000 * ctx.num_workers
  N = 1000 * 64
  D = 64
  
  # create data
  data = expr.randn(N, D, dtype=np.float64, tile_hint=(N, util.divup(D, ctx.num_workers)))
  labels = expr.shuffle(data, _init_label_mapper, shape_hint=(data.shape[0], 1))
  
  t1 = datetime.now()
  w = fit(data, labels, T=max_iter).force()
  t2 = datetime.now()
  util.log_warn('train time per iteration:%s ms, final w:%s', millis(t1,t2)/max_iter, w.glom().T)
  
  correct = 0
  for i in range(10):
    new_data = expr.randn(1, D, dtype=np.float64, tile_hint=[1, D])
    new_label = predict(w, new_data)
    #print 'point %s, predict %s' % (new_data.glom(), new_label)
     
    new_data = new_data.glom()
    if (new_data[0,0] >= new_data[0,1] and new_label == 1.0) or (new_data[0,0] < new_data[0,1] and new_label == -1.0):
      correct += 1
  print 'predict precision:', correct * 1.0 / 10
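
Note: _init_label_mapper is defined elsewhere in the benchmark module and is not shown in this listing. The precision check above implies the synthetic labeling rule it applies: +1 when the first feature is at least the second, -1 otherwise. A pure-NumPy sketch of that rule (the Spartan mapper wiring is omitted, so this is an illustration, not the original helper):

import numpy as np

def _label_rule(points):
  # Labels implied by the precision check above:
  # +1 if x[0] >= x[1], else -1.
  labels = np.ones((points.shape[0], 1))
  labels[points[:, 0] < points[:, 1]] = -1.0
  return labels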
Example #2
def benchmark_cg(ctx, timer):
  print "#worker:", ctx.num_workers
  l = int(math.sqrt(ctx.num_workers))
  n = 2000 * 16
  #n = 4000 * l
  la = 20
  niter = 5
  tile_hint = (n, n/ctx.num_workers)
  
  #nonzer = 7
  #nz = n * (nonzer + 1) * (nonzer + 1) + n * (nonzer + 2)
  #density = 0.5 * nz/(n*n)
  A = expr.rand(n, n, tile_hint=tile_hint)
  A = (A + expr.transpose(A))*0.5
  
  I = expr.sparse_diagonal((n,n), tile_hint=tile_hint) * la
  I.force()
  A = expr.eager(A - I)

  #x1 = numpy_cg(A.glom(), niter)
  util.log_warn('begin cg!')
  t1 = datetime.now()
  x2 = conj_gradient(A, niter).force()
  t2 = datetime.now()
  cost_time = millis(t1,t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time/niter)
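
Note: millis(t1, t2) comes from the benchmark harness and is used throughout these examples but is not shown in this listing. Given how it is paired with datetime.now() readings, it presumably converts the elapsed timedelta to milliseconds, along the lines of this hypothetical sketch:

def millis(t1, t2):
  # Hypothetical helper: elapsed milliseconds between two datetime readings.
  return (t2 - t1).total_seconds() * 1000.0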
Example #3
def benchmark_cg(ctx, timer):
    print "#worker:", ctx.num_workers
    l = int(math.sqrt(ctx.num_workers))
    #n = 2000 * 16
    n = 500 * ctx.num_workers
    la = 20
    niter = 5

    #nonzer = 7
    #nz = n * (nonzer + 1) * (nonzer + 1) + n * (nonzer + 2)
    #density = 0.5 * nz/(n*n)
    A = expr.rand(n, n)
    A = (A + expr.transpose(A)) * 0.5

    I = expr.sparse_diagonal((n, n)) * la
    A = A - I

    #x1 = numpy_cg(A.glom(), niter)
    util.log_warn('begin cg!')
    t1 = datetime.now()
    x2 = conj_gradient(A, niter).force()
    t2 = datetime.now()
    cost_time = millis(t1, t2)
    print "total cost time:%s ms, per iter cost time:%s ms" % (
        cost_time, cost_time / niter)
Example #4
def benchmark_naive_bayes(ctx, timer):
  
  print "#worker:", ctx.num_workers
  #N = 100000 * ctx.num_workers
  N = 10000 * 64
  D = 128
  
  # create data
  data = expr.randint(N, D, low=0, high=D, tile_hint=(N, D/ctx.num_workers))
  labels = expr.shuffle(expr.ndarray((data.shape[0], 1), dtype=np.int), _init_label_mapper,
                        kw={'data': data}, shape_hint=(data.shape[0], 1), 
                        cost_hint={hash(data):{'00': 0, '10': np.prod(data.shape)}}
                       )
    
  #util.log_warn('data:%s, label:%s', data.glom(), labels.glom())   
  
  util.log_warn('begin train')
  t1 = datetime.now()
  model = fit(data, labels, D)
  t2 = datetime.now()
  util.log_warn('train time:%s ms', millis(t1,t2))

  correct = 0
  for i in range(10):
    new_data = expr.randint(1, D, low=0, high=D, tile_hint=(1, D))
    new_label = predict(model, new_data)
    #print 'point %s, predict %s' % (new_data.glom(), new_label)
   
    new_data = new_data.glom()
    if np.isclose(new_data[0, new_label], np.max(new_data)):
      correct += 1
  print 'predict precision:', correct * 1.0 / 10
Example #5
def benchmark_cholesky(ctx, timer):
    print "#worker:", ctx.num_workers

    #n = int(math.pow(ctx.num_workers, 1.0 / 3.0))
    n = int(math.sqrt(ctx.num_workers))
    #ARRAY_SIZE = 1600 * 4
    ARRAY_SIZE = 1600 * n

    util.log_warn('prepare data!')
    #A = np.random.randn(ARRAY_SIZE, ARRAY_SIZE)
    #A = np.dot(A, A.T)
    #A = expr.force(from_numpy(A, tile_hint=(ARRAY_SIZE/n, ARRAY_SIZE/n)))

    #A = expr.randn(ARRAY_SIZE, ARRAY_SIZE, tile_hint=(ARRAY_SIZE/n, ARRAY_SIZE/n))
    A = expr.randn(ARRAY_SIZE, ARRAY_SIZE)
    # FIXME: Ideally we should be able to get rid of tile_hint.
    #        However, current extent.change_partition_axis relies on the
    #        information of one-dimensional size to change tiling to grid tiling.
    #        It assumes that every extent is partitioned with the same size.
    #        Trace extent.pyx to think about how to fix it!
    A = expr.dot(A,
                 expr.transpose(A),
                 tile_hint=(ARRAY_SIZE, ARRAY_SIZE / ctx.num_workers)).force()

    util.log_warn('begin cholesky!')
    t1 = datetime.now()
    L = cholesky(A).glom()
    t2 = datetime.now()
    assert np.all(np.isclose(A.glom(), np.dot(L, L.T.conj())))
    cost_time = millis(t1, t2)
    print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time,
                                                               cost_time / n)
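
Note: the assert above checks the defining property of the factorization, A = L * L^H. A small-scale NumPy analogue of the same check (a Gram matrix B * B^T is symmetric positive definite for full-rank B, so a Cholesky factor exists):

import numpy as np

np.random.seed(0)
B = np.random.randn(8, 8)
G = np.dot(B, B.T)            # symmetric positive definite Gram matrix
L = np.linalg.cholesky(G)     # lower-triangular factor
print(np.allclose(G, np.dot(L, L.T)))  # True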
Example #6
def benchmark_lda(ctx, timer):

    print "#worker:", ctx.num_workers
    NUM_TERMS = 160
    NUM_DOCS = 200 * ctx.num_workers
    #NUM_DOCS = 10 * 64

    # create data
    # NUM_TERMS = 41807
    # NUM_DOCS = 21578
    # terms_docs_matrix = from_file("/scratch/cq/numpy_dense_matrix", sparse = False, tile_hint = (NUM_TERMS, int((NUM_DOCS + ctx.num_workers - 1) / ctx.num_workers))).evaluate()

    terms_docs_matrix = expr.randint(NUM_TERMS, NUM_DOCS, low=0, high=100)

    max_iter = 3
    k_topics = 16

    t1 = datetime.now()
    doc_topics, topic_term_count = learn_topics(terms_docs_matrix,
                                                k_topics,
                                                max_iter=max_iter)
    doc_topics.optimized().evaluate()
    topic_term_count.optimized().evaluate()
    t2 = datetime.now()
    time_cost = millis(t1, t2)
    util.log_warn('total_time:%s ms, train time per iteration:%s ms',
                  time_cost, time_cost / max_iter)
Example #7
def benchmark_naive_bayes(ctx, timer):
  
  print "#worker:", ctx.num_workers
  N = 100000 * ctx.num_workers
  D = 128
  
  # create data
  data = expr.randint(N, D, low=0, high=D, tile_hint=(N/ctx.num_workers, D))
  labels = expr.eager(expr.shuffle(data, _init_label_mapper))
    
  #util.log_warn('data:%s, label:%s', data.glom(), labels.glom())   
  
  util.log_warn('begin train')
  t1 = datetime.now()
  model = fit(data, labels, D)
  t2 = datetime.now()
  util.log_warn('train time:%s ms', millis(t1,t2))

  correct = 0
  for i in range(10):
    new_data = expr.randint(1, D, low=0, high=D, tile_hint=(1, D))
    new_label = predict(model, new_data)
    #print 'point %s, predict %s' % (new_data.glom(), new_label)
   
    new_data = new_data.glom()
    if np.isclose(new_data[0, new_label], np.max(new_data)):
      correct += 1
  print 'predict precision:', correct * 1.0 / 10
Example #8
def numpy_cg(A, num_iter):
    x = np.ones((A.shape[1], 1))

    for iter in range(num_iter):
        util.log_warn('iteration:%d', iter)
        z = numpy_cgit(A, x)
        x = z / np.linalg.norm(z, 2)
    return x
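
Note: the outer loop is inverse power iteration: each numpy_cgit call approximately applies A^-1 (see Example #18 below), and the normalization drives x toward the eigenvector of A with the smallest-magnitude eigenvalue. A usage sketch on a well-conditioned symmetric positive definite matrix, assuming numpy_cgit is in scope:

import numpy as np

np.random.seed(0)
B = np.random.randn(50, 50)
A = np.dot(B, B.T) + 50 * np.eye(50)   # symmetric positive definite
x = numpy_cg(A, 5)                     # x has unit 2-norm after the loop
rayleigh = float(np.dot(x.T, np.dot(A, x)))
print(rayleigh)  # approximates the smallest eigenvalue of A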
Example #9
File: test_cg.py  Project: MaggieQi/spartan
def numpy_cg(A, num_iter):
  x = np.ones((A.shape[1],1))
  
  for iter in range(num_iter):
    util.log_warn('iteration:%d', iter)
    z = numpy_cgit(A, x)
    x = z / np.linalg.norm(z,2)
  return x
Example #10
def sparse_multiply(wts, p, p_tile_hint):
  avg_time = 0.0
  for i in range(num_iter):
    util.log_warn('iteration %d begin!', i)
    t1 = datetime.now()
    p = expr.dot(wts, p, tile_hint=p_tile_hint).force()
    t2 = datetime.now()
    time_cost = millis(t1, t2)
    print "iteration %d sparse * dense: %s ms" % (i, time_cost)
    avg_time += time_cost
  return avg_time / num_iter
Example #11
def benchmark_lreg(ctx, timer):
  print "#worker:", ctx.num_workers
  FLAGS.opt_parakeet_gen = 0
  N_EXAMPLES = 4000000 * ctx.num_workers
  #N_EXAMPLES = 5000000 * 64
  x = expr.rand(N_EXAMPLES, N_DIM)
  y = expr.rand(N_EXAMPLES, 1)
  start = time.time()
  linear_regression.linear_regression(x, y, ITERATION)
  total = time.time() - start
  util.log_warn("time cost per iteration: %s s" % (total*1.0/ITERATION,))
Example #12
def benchmark_ridgereg(ctx, timer):
    print "#worker:", ctx.num_workers
    #N_EXAMPLES = 100000000 * ctx.num_workers
    N_EXAMPLES = 90000000 * ctx.num_workers
    x = expr.rand(N_EXAMPLES, N_DIM)
    y = expr.rand(N_EXAMPLES, 1)
    start = time.time()
    ridge_regression.ridge_regression(x, y, 1, ITERATION)

    total = time.time() - start
    util.log_warn("time cost per iteration: %s s" % (total * 1.0 / ITERATION, ))
Example #13
def benchmark_logreg(ctx, timer):
  print "#worker:", ctx.num_workers
  #N_EXAMPLES = 40000000 * ctx.num_workers
  N_EXAMPLES = 5000000 * 64
  x = expr.eager(expr.rand(N_EXAMPLES, N_DIM, tile_hint=(N_EXAMPLES / ctx.num_workers, N_DIM)))
  y = expr.eager(expr.rand(N_EXAMPLES, 1, tile_hint=(N_EXAMPLES / ctx.num_workers, 1)))
  start = time.time()
  logistic_regression.logistic_regression(x, y, ITERATION)

  total = time.time() - start
  util.log_warn("time cost per iteration: %s s" % (total*1.0/ITERATION,))
Example #14
def benchmark_ridgereg(ctx, timer):
  print "#worker:", ctx.num_workers
  #N_EXAMPLES = 100000000 * ctx.num_workers
  N_EXAMPLES = 90000000 * ctx.num_workers
  x = expr.rand(N_EXAMPLES, N_DIM)
  y = expr.rand(N_EXAMPLES, 1)
  start = time.time() 
  ridge_regression.ridge_regression(x, y, 1, ITERATION)
  
  total = time.time() - start
  util.log_warn("time cost per iteration: %s s" % (total*1.0/ITERATION,))
Example #15
def _cholesky_dsyrk_dgemm_mapper(extents, tiles):
  util.log_warn("dgemm %s" % str(extents))
  input = tiles[0]
  ex = extents[0]
  A_mk = tiles[1].T

  if ex.ul[0] == ex.ul[1] and ex.lr[0] == ex.lr[1]:
    # diag block
    return ex, linalg.blas.dsyrk(-1.0, A_mk, 1.0, input, lower=1)
  else:
    # other block
    A_lk = tiles[2]
    return ex, linalg.blas.dgemm(-1.0, A_lk, A_mk.T, 1.0, input)
Example #16
def _cholesky_dsyrk_dgemm_mapper(extents, tiles):
    util.log_warn("dgemm %s" % str(extents))
    input = tiles[0]
    ex = extents[0]
    A_mk = tiles[1].T

    if ex.ul[0] == ex.ul[1] and ex.lr[0] == ex.lr[1]:
        # diag block
        return ex, linalg.blas.dsyrk(-1.0, A_mk, 1.0, input, lower=1)
    else:
        # other block
        A_lk = tiles[2]
        return ex, linalg.blas.dgemm(-1.0, A_lk, A_mk.T, 1.0, input)
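
Note: both mappers implement the trailing-matrix update of a blocked Cholesky factorization: a diagonal block gets the symmetric rank-k update C <- C - A_mk * A_mk^T (dsyrk), and an off-diagonal block gets C <- C - A_lk * A_mk^T (dgemm). A minimal check of the two BLAS calls against plain NumPy; scipy's dsyrk touches only the requested triangle, so only the lower triangle is compared:

import numpy as np
from scipy.linalg import blas

np.random.seed(0)
A_mk = np.random.randn(4, 2)
A_lk = np.random.randn(4, 2)
C = np.random.randn(4, 4)

out = blas.dsyrk(-1.0, A_mk, 1.0, C, lower=1)
print(np.allclose(np.tril(out), np.tril(C - np.dot(A_mk, A_mk.T))))  # True

out = blas.dgemm(-1.0, A_lk, A_mk.T, 1.0, C)
print(np.allclose(out, C - np.dot(A_lk, A_mk.T)))  # True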
Example #17
def benchmark_als(ctx, timer):
  print "#worker:", ctx.num_workers
  #USER_SIZE = 400 * ctx.num_workers
  USER_SIZE = 200 * 64
  MOVIE_SIZE = 12800
  num_features = 20
  num_iter = 5
  
  A = expr.eager(expr.randint(USER_SIZE, MOVIE_SIZE, low=0, high=5, tile_hint=(USER_SIZE/ctx.num_workers, MOVIE_SIZE)))
  
  util.log_warn('begin als!')
  t1 = datetime.now()
  U, M = als(A, implicit_feedback=True, num_features=num_features, num_iter=num_iter)
  U.force()
  M.force()
  t2 = datetime.now()
  cost_time = millis(t1,t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time/num_iter)
Example #18
def numpy_cgit(A, x):
    z = np.zeros(x.shape)
    r = x
    rho = np.dot(r.T, r)
    util.log_warn('rho:%s', rho)
    p = r

    for i in xrange(15):
        q = np.dot(A, p)
        alpha = rho / np.dot(p.T, q)
        #util.log_warn('alpha:%s', alpha)
        z = z + p * alpha
        rho0 = rho
        r = r - q * alpha
        rho = np.dot(r.T, r)
        beta = rho / rho0
        #util.log_warn('beta:%s', beta)
        p = r + p * beta

    return z
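
Note: numpy_cgit runs a fixed 15 iterations of the classic conjugate-gradient recurrence starting from z = 0, so for a reasonably conditioned symmetric positive definite A it returns z ~ A^-1 x. A quick residual check:

import numpy as np

np.random.seed(0)
B = np.random.randn(100, 100)
A = np.dot(B, B.T) + 100 * np.eye(100)  # well-conditioned SPD matrix
x = np.ones((100, 1))
z = numpy_cgit(A, x)
print(np.linalg.norm(np.dot(A, z) - x))  # small residual expected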
Example #19
def benchmark_als(ctx, timer):
  print "#worker:", ctx.num_workers
  #USER_SIZE = 100 * ctx.num_workers
  USER_SIZE = 320
  #USER_SIZE = 200 * 64
  MOVIE_SIZE = 12800
  num_features = 20
  num_iter = 2
  
  A = expr.randint(USER_SIZE, MOVIE_SIZE, low=0, high=5, tile_hint=(USER_SIZE, util.divup(MOVIE_SIZE, ctx.num_workers)))
  #A = expr.randint(USER_SIZE, MOVIE_SIZE, low=0, high=5)
  
  util.log_warn('begin als!')
  t1 = datetime.now()
  U, M = als(A, implicit_feedback=True, num_features=num_features, num_iter=num_iter)
  U.force()
  M.force()
  t2 = datetime.now()
  cost_time = millis(t1,t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time/num_iter)
Example #20
File: test_cg.py  Project: MaggieQi/spartan
def numpy_cgit(A, x):
  z = np.zeros(x.shape)
  r = x
  rho = np.dot(r.T, r)
  util.log_warn('rho:%s', rho)
  p = r
  
  for i in xrange(15):
    q = np.dot(A, p)
    alpha = rho / np.dot(p.T, q)
    #util.log_warn('alpha:%s', alpha)
    z = z + p * alpha
    rho0 = rho
    r = r - q * alpha
    rho = np.dot(r.T, r)
    beta = rho / rho0
    #util.log_warn('beta:%s', beta)
    p = r + p * beta
  
  return z
Example #21
def benchmark_cholesky(ctx, timer):
  print "#worker:", ctx.num_workers

  #n = int(math.pow(ctx.num_workers, 1.0 / 3.0))
  n = int(math.sqrt(ctx.num_workers))
  #ARRAY_SIZE = 1600 * 4
  ARRAY_SIZE = 900 * n

  util.log_warn('prepare data!')
  #A = np.random.randn(ARRAY_SIZE, ARRAY_SIZE)
  #A = np.dot(A, A.T)

  A = expr.randn(ARRAY_SIZE, ARRAY_SIZE)
  A = expr.dot(A, expr.transpose(A))

  util.log_warn('begin cholesky!')
  t1 = datetime.now()
  L = cholesky(A).optimized().glom()
  t2 = datetime.now()
  #assert np.all(np.isclose(A.glom(), np.dot(L, L.T.conj())))
  cost_time = millis(t1, t2)
  print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time/n)
Example #22
def benchmark_cholesky(ctx, timer):
    print "#worker:", ctx.num_workers

    # n = int(math.pow(ctx.num_workers, 1.0 / 3.0))
    n = int(math.sqrt(ctx.num_workers))
    ARRAY_SIZE = 1600 * 4
    # ARRAY_SIZE = 1600 * n

    util.log_warn("prepare data!")
    # A = np.random.randn(ARRAY_SIZE, ARRAY_SIZE)
    # A = np.dot(A, A.T)
    # A = expr.force(from_numpy(A, tile_hint=(ARRAY_SIZE/n, ARRAY_SIZE/n)))

    A = expr.randn(ARRAY_SIZE, ARRAY_SIZE, tile_hint=(ARRAY_SIZE / n, ARRAY_SIZE / n))
    A = expr.dot(A, expr.transpose(A)).force()

    util.log_warn("begin cholesky!")
    t1 = datetime.now()
    L = cholesky(A).glom()
    t2 = datetime.now()
    assert np.all(np.isclose(A.glom(), np.dot(L, L.T.conj())))
    cost_time = millis(t1, t2)
    print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time, cost_time / n)
Example #23
def benchmark_cholesky(ctx, timer):
    print "#worker:", ctx.num_workers

    #n = int(math.pow(ctx.num_workers, 1.0 / 3.0))
    n = int(math.sqrt(ctx.num_workers))
    #ARRAY_SIZE = 1600 * 4
    ARRAY_SIZE = 900 * n

    util.log_warn('prepare data!')
    #A = np.random.randn(ARRAY_SIZE, ARRAY_SIZE)
    #A = np.dot(A, A.T)

    A = expr.randn(ARRAY_SIZE, ARRAY_SIZE)
    A = expr.dot(A, expr.transpose(A))

    util.log_warn('begin cholesky!')
    t1 = datetime.now()
    L = cholesky(A).optimized().glom()
    t2 = datetime.now()
    #assert np.all(np.isclose(A.glom(), np.dot(L, L.T.conj())))
    cost_time = millis(t1, t2)
    print "total cost time:%s ms, per iter cost time:%s ms" % (cost_time,
                                                               cost_time / n)
Example #24
def benchmark_lda(ctx, timer):
  
  print "#worker:", ctx.num_workers
  NUM_TERMS = 160
  NUM_DOCS = 200 * ctx.num_workers
  #NUM_DOCS = 10 * 64

  # create data
  # NUM_TERMS = 41807
  # NUM_DOCS = 21578
  # terms_docs_matrix = from_file("/scratch/cq/numpy_dense_matrix", sparse = False, tile_hint = (NUM_TERMS, int((NUM_DOCS + ctx.num_workers - 1) / ctx.num_workers))).force()
  
  terms_docs_matrix = expr.randint(NUM_TERMS, NUM_DOCS, low=0, high=100)
  
  max_iter = 3
  k_topics = 16
  
  t1 = datetime.now()
  doc_topics, topic_term_count = learn_topics(terms_docs_matrix, k_topics, max_iter=max_iter)
  doc_topics.optimized().force()
  topic_term_count.optimized().force()
  t2 = datetime.now()
  time_cost = millis(t1,t2)
  util.log_warn('total_time:%s ms, train time per iteration:%s ms', time_cost, time_cost/max_iter)
Example #25
def benchmark_jacobi(ctx, timer):
  global base, ITERATION
  util.log_warn('#worker: %s', ctx.num_workers)

  A, b = jacobi.jacobi_init(base * ctx.num_workers)
  A, b = A.evaluate(), b.evaluate()

  start = time.time()

  result = jacobi.jacobi_method(A, b, ITERATION).glom()

  cost = time.time() - start

  util.log_info('\nresult =\n%s', result)
  util.log_warn('time cost: %s s', cost)
  util.log_warn('cost per iteration: %s s\n', cost / ITERATION)
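
Note: jacobi.jacobi_init and jacobi.jacobi_method are defined elsewhere; the method itself is presumably the textbook splitting A = D + R with the fixed-point update x <- D^-1 (b - R x). A minimal NumPy sketch of one such update (the names here are illustrative, not Spartan's API):

import numpy as np

def jacobi_step(A, b, x):
  # One Jacobi update: split A = D + R and solve the diagonal part.
  d = np.diag(A)
  R = A - np.diagflat(d)
  return (b - np.dot(R, x)) / d.reshape(-1, 1)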
Example #26
  def fit(self, X, centers=None, implementation='outer'):
    """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    """
    num_dim = X.shape[1]
    num_points = X.shape[0]

    labels = expr.zeros((num_points, 1), dtype=np.int)

    if implementation == 'map2':
      if centers is None:
        centers = np.random.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper, fn_kw={"centers": centers},
                           shape=(X.shape[0], ))

        counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(centers.shape[0], ))
        new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                                fn_kw={'centers_count': self.n_clusters},
                                shape=(centers.shape[0], centers.shape[1]))
        counts = counts.optimized().glom()
        centers = new_centers.optimized().glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # Avoid division by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
      return centers, labels

    elif implementation == 'outer':
      if centers is None:
        centers = expr.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        labels = expr.outer((X, centers), (0, None), fn=kmeans_outer_dist_mapper,
                            shape=(X.shape[0],))
        #labels = expr.argmin(distances, axis=1)
        counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(centers.shape[0], ))
        new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                                fn_kw={'centers_count': self.n_clusters},
                                shape=(centers.shape[0], centers.shape[1]))
        counts = counts.optimized().glom()
        centers = new_centers.optimized().glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # Avoid division by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
        centers = expr.from_numpy(centers)
      return centers, labels
    elif implementation == 'broadcast':
      if centers is None:
        centers = expr.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        util.log_warn("k_means_ %d %d", i, time.time())
        X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
        centers_broadcast = expr.reshape(centers, (1, centers.shape[0],
                                                   centers.shape[1]))
        distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
        labels = expr.argmin(distances, axis=1)
        center_idx = expr.arange((1, centers.shape[0]))
        matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
        matches = matches.astype(np.int64)
        counts = expr.sum(matches, axis=0)
        centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                                matches.shape[1], 1)),
                           axis=0)

        counts = counts.optimized().glom()
        centers = centers.optimized().glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # Avoid division by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
        centers = expr.from_numpy(centers)
      return centers, labels
    elif implementation == 'shuffle':
      if centers is None:
        centers = np.random.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        # Reset them to zero.
        new_centers = expr.ndarray((self.n_clusters, num_dim),
                                   reduce_fn=lambda a, b: a + b)
        new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int,
                                  reduce_fn=lambda a, b: a + b)

        _ = expr.shuffle(X,
                         _find_cluster_mapper,
                         kw={'d_pts': X,
                             'old_centers': centers,
                             'new_centers': new_centers,
                             'new_counts': new_counts,
                             'labels': labels},
                         shape_hint=(1,),
                         cost_hint={hash(labels): {'00': 0,
                                                   '01': np.prod(labels.shape)}})
        _.force()

        new_counts = new_counts.glom()
        new_centers = new_centers.glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (new_counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # Avoid division by zero.
          new_counts[zcount_indices] = 1
          new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        new_centers = new_centers / new_counts
        centers = new_centers

      return centers, labels
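
Note: the 'broadcast' branch computes all point-to-center squared distances at once by expanding X to (n_samples, 1, n_features) and centers to (1, n_clusters, n_features). The same trick in plain NumPy, for reference:

import numpy as np

np.random.seed(0)
X = np.random.rand(6, 3)         # (n_samples, n_features)
centers = np.random.rand(2, 3)   # (n_clusters, n_features)

diff = X[:, None, :] - centers[None, :, :]  # (6, 2, 3) via broadcasting
distances = np.sum(diff ** 2, axis=2)       # squared Euclidean distances
labels = np.argmin(distances, axis=1)       # nearest center per sample
print(labels)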
Example #27
def sparse_multiply(wts, p, p_tile_hint):
    for i in range(num_iter):
        util.log_warn('iteration %d begin!', i)
        p = expr.dot(wts, p).optimized()
    p.evaluate()
    return
Example #28
def sparse_multiply(wts, p, p_tile_hint):
  for i in range(num_iter):
    util.log_warn('iteration %d begin!', i)
    p = expr.dot(wts, p).optimized()
  p.evaluate()
  return
Example #29
    def fit(self, X, centers=None, implementation='map2'):
        """Compute k-means clustering.

        Parameters
        ----------
        X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
        centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
        """
        num_dim = X.shape[1]
        num_points = X.shape[0]

        labels = expr.zeros((num_points, 1), dtype=np.int)

        if implementation == 'map2':
            if centers is None:
                centers = np.random.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                labels = expr.map2(X,
                                   0,
                                   fn=kmeans_map2_dist_mapper,
                                   fn_kw={"centers": centers},
                                   shape=(X.shape[0], ))

                counts = expr.map2(labels,
                                   0,
                                   fn=kmeans_count_mapper,
                                   fn_kw={'centers_count': self.n_clusters},
                                   shape=(centers.shape[0], ))
                new_centers = expr.map2(
                    (X, labels), (0, 0),
                    fn=kmeans_center_mapper,
                    fn_kw={'centers_count': self.n_clusters},
                    shape=(centers.shape[0], centers.shape[1]))
                counts = counts.optimized().glom()
                centers = new_centers.optimized().glom()

                # If any centroids don't have any points assigned to them.
                zcount_indices = (counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # Avoid division by zero.
                    counts[zcount_indices] = 1
                    centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                centers = centers / counts.reshape(centers.shape[0], 1)
            return centers, labels

        elif implementation == 'outer':
            if centers is None:
                centers = expr.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                labels = expr.outer((X, centers), (0, None),
                                    fn=kmeans_outer_dist_mapper,
                                    shape=(X.shape[0], ))
                #labels = expr.argmin(distances, axis=1)
                counts = expr.map2(labels,
                                   0,
                                   fn=kmeans_count_mapper,
                                   fn_kw={'centers_count': self.n_clusters},
                                   shape=(centers.shape[0], ))
                new_centers = expr.map2(
                    (X, labels), (0, 0),
                    fn=kmeans_center_mapper,
                    fn_kw={'centers_count': self.n_clusters},
                    shape=(centers.shape[0], centers.shape[1]))
                counts = counts.optimized().glom()
                centers = new_centers.optimized().glom()

                # If any centroids don't have any points assigned to them.
                zcount_indices = (counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # Avoid division by zero.
                    counts[zcount_indices] = 1
                    centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                centers = centers / counts.reshape(centers.shape[0], 1)
                centers = expr.from_numpy(centers)
            return centers, labels
        elif implementation == 'broadcast':
            if centers is None:
                centers = expr.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                util.log_warn("k_means_ %d %d", i, time.time())
                X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
                centers_broadcast = expr.reshape(
                    centers, (1, centers.shape[0], centers.shape[1]))
                distances = expr.sum(expr.square(X_broadcast -
                                                 centers_broadcast),
                                     axis=2)
                labels = expr.argmin(distances, axis=1)
                center_idx = expr.arange((1, centers.shape[0]))
                matches = expr.reshape(labels,
                                       (labels.shape[0], 1)) == center_idx
                matches = matches.astype(np.int64)
                counts = expr.sum(matches, axis=0)
                centers = expr.sum(
                    X_broadcast *
                    expr.reshape(matches,
                                 (matches.shape[0], matches.shape[1], 1)),
                    axis=0)

                counts = counts.optimized().glom()
                centers = centers.optimized().glom()

                # If any centroids don't have any points assigned to them.
                zcount_indices = (counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # Avoid division by zero.
                    counts[zcount_indices] = 1
                    centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                centers = centers / counts.reshape(centers.shape[0], 1)
                centers = expr.from_numpy(centers)
            return centers, labels
        elif implementation == 'shuffle':
            if centers is None:
                centers = np.random.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                # Reset them to zero.
                new_centers = expr.ndarray((self.n_clusters, num_dim),
                                           reduce_fn=lambda a, b: a + b)
                new_counts = expr.ndarray((self.n_clusters, 1),
                                          dtype=np.int,
                                          reduce_fn=lambda a, b: a + b)

                _ = expr.shuffle(X,
                                 _find_cluster_mapper,
                                 kw={
                                     'd_pts': X,
                                     'old_centers': centers,
                                     'new_centers': new_centers,
                                     'new_counts': new_counts,
                                     'labels': labels
                                 },
                                 shape_hint=(1, ),
                                 cost_hint={
                                     hash(labels): {
                                         '00': 0,
                                         '01': np.prod(labels.shape)
                                     }
                                 })
                _.force()

                new_counts = new_counts.glom()
                new_centers = new_centers.glom()

                # If any centroids don't have any points assigned to them.
                zcount_indices = (new_counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # Avoid division by zero.
                    new_counts[zcount_indices] = 1
                    new_centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                new_centers = new_centers / new_counts
                centers = new_centers

            return centers, labels