def connectedComponents(ctx, dim, numIters):
	linkMatrix = eager(
					expr.shuffle(
						expr.ndarray(
							(dim, dim),
							dtype = np.int64,
							tile_hint = (dim / ctx.num_workers, dim)),
						make_matrix,
					))

	power = eager(
					expr.shuffle(
						expr.ndarray(
							(dim, dim),
							dtype = np.int64,
							tile_hint = (dim / ctx.num_workers, dim)),
						make_matrix,
					))

	eye = expr.eye(dim, tile_hint = (dim / ctx.num_workers,dim))
	startCompute = time.time()
	result = expr.logical_or(eye, linkMatrix).optimized().glom()
	for i in range(numIters):
		power = expr.dot(power, linkMatrix).optimized().glom()
		result = expr.logical_or(result, power)
	result.optimized().glom()
	final = expr.logical_and(result, expr.transpose(result.optimized())).optimized().evaluate()
	endCompute = time.time()
	return endCompute - startCompute
def shortestPath(ctx, dim, numIters):
	dist = eager(
			expr.shuffle(
				expr.ndarray(
					(dim, 1),
					dtype = np.int64,
					tile_hint = (dim / ctx.num_workers, 1)
				),
			make_dist,
			)
		)

	linkMatrix = eager(
				expr.shuffle(
					expr.ndarray(
						(dim, dim),
						dtype = np.int64,
						tile_hint = (dim, dim / ctx.num_workers)),
				make_matrix,
				))

	startCompute = time.time()
	for it in range(numIters):
		first = expr.add(dist, linkMatrix)
		second = first.min(axis = 0)
		dist = second.reshape(dim, 1)
	dist.evaluate()
	endCompute = time.time()
	return endCompute - startCompute
def bfs(ctx, dim):
	util.log_info("start to computing......")

	sGenerate = time.time()
	current = eager(
			expr.shuffle(
				expr.ndarray(
					(dim, 1),
					dtype = np.int64,
					tile_hint = (dim / ctx.num_workers, 1)),
				make_current,
			))
	
	linkMatrix = eager(
				expr.shuffle(
					expr.ndarray(
					 (dim, dim),
					 dtype = np.int64,
					 tile_hint = (dim, dim / ctx.num_workers)),
				make_matrix,
				))
	eGenerate = time.time()

	startCompute = time.time()
	while(True):
		nextFrontier = expr.dot(linkMatrix, current)
		formerNum = expr.count_nonzero(current)
		laterNum = expr.count_nonzero(nextFrontier)
		# equal counts mean no new vertices were reached, so the traversal has converged
		converged = expr.equal(formerNum, laterNum).glom()
		current = nextFrontier
		if (converged):
			break
	current.evaluate()
	endCompute = time.time()
	return (eGenerate - sGenerate, endCompute - startCompute) 
Example #4
  def precompute(self):
    '''Precompute the k most similar items for each item.

    After this function returns, two attributes will be created.

    Attributes
    ----------
    top_k_similar_table : Numpy array of shape (N, k).
                          Records the k most similar scores for each item.
    top_k_similar_indices : Numpy array of shape (N, k).
                            Records the indices of the k most similar items for each item.
    '''
    M = self.rating_table.shape[0]
    N = self.rating_table.shape[1]

    self.similarity_table = expr.shuffle(self.rating_table, _similarity_mapper, 
                                         kw={'item_norm': self._get_norm_of_each_item(self.rating_table), 
                                             'step': util.divup(self.rating_table.shape[1], blob_ctx.get().num_workers)}, 
                                         shape_hint=(N, N))

    # Release the memory for item_norm
    top_k_similar_indices = expr.zeros((N, self.k), dtype=np.int)
    
    # Find top-k similar items for each item.
    # Store the similarity scores into table top_k_similar table.
    # Store the indices of top k items into table top_k_similar_indices.
    cost = np.prod(top_k_similar_indices.shape)
    top_k_similar_table = expr.shuffle(self.similarity_table, _select_most_k_similar_mapper, 
                                       kw = {'top_k_similar_indices': top_k_similar_indices, 'k': self.k}, 
                                       shape_hint=(N, self.k), 
                                       cost_hint={hash(top_k_similar_indices):{'00': 0, '01': cost, '10': cost, '11': cost}})
    self.top_k_similar_table = top_k_similar_table.optimized().glom()
    self.top_k_similar_indices = top_k_similar_indices.optimized().glom()
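A minimal NumPy sketch of the top-k selection step above, assuming the similarity matrix has already been materialized as a dense array; np.argpartition is used purely for illustration and is not necessarily how the Spartan mapper is implemented:

import numpy as np

def top_k_similar(similarity, k):
  # For each row, take the indices of the k largest scores (argpartition
  # avoids a full sort), then order those k columns by score.
  idx = np.argpartition(similarity, -k, axis=1)[:, -k:]
  rows = np.arange(similarity.shape[0])[:, None]
  order = np.argsort(-similarity[rows, idx], axis=1)
  top_idx = idx[rows, order]
  return similarity[rows, top_idx], top_idx

# Example: 5 items, keep the 2 most similar scores/indices per row.
scores, indices = top_k_similar(np.random.rand(5, 5), 2)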
Example #5
def fit(data, labels, label_size, alpha=1.0):
  '''
  Train standard naive bayes model.
 
  Args:
    data(Expr): documents to be trained.
    labels(Expr): the correct labels of the training data.
    label_size(int): the number of different labels.
    alpha(float): alpha parameter of naive bayes model.
  '''
  labels = expr.force(labels)
  
  # calc document freq
  df = expr.reduce(data,
                   axis=0,
                   dtype_fn=lambda input: input.dtype,
                   local_reduce_fn=lambda ex, data, axis: (data > 0).sum(axis),
                   accumulate_fn=np.add,
                   tile_hint=(data.shape[1],))
  
  idf = expr.log(data.shape[0] * 1.0 / (df + 1)) + 1
   
  # Normalized Frequency for a feature in a document is calculated by dividing the feature frequency 
  # by the root mean square of feature frequencies in that document
  square_sum = expr.reduce(data,
                           axis=1,
                           dtype_fn=lambda input: input.dtype,
                           local_reduce_fn=lambda ex, data, axis: np.square(data).sum(axis),
                           accumulate_fn=np.add,
                           tile_hint=(data.shape[0],))
  
  rms = expr.sqrt(square_sum * 1.0 / data.shape[1])
  
  # calculate weight normalized Tf-Idf
  data = data / rms.reshape((data.shape[0], 1)) * idf.reshape((1, data.shape[1]))
  
  # add up all the feature vectors with the same labels
  sum_instance_by_label = expr.ndarray((label_size, data.shape[1]),
                                       dtype=np.float64, 
                                       reduce_fn=np.add,
                                       tile_hint=(label_size / len(labels.tiles), data.shape[1]))
  sum_instance_by_label = expr.shuffle(data,
                                       _sum_instance_by_label_mapper,
                                       target=sum_instance_by_label,
                                       kw={'labels': labels, 'label_size': label_size})

  # sum up all the weights for each label from the previous step
  weights_per_label = expr.sum(sum_instance_by_label, axis=1, tile_hint=(label_size,))
  
  # generate naive bayes per_label_and_feature weights
  weights_per_label_and_feature = expr.shuffle(sum_instance_by_label,
                                               _naive_bayes_mapper,
                                               kw={'weights_per_label': weights_per_label, 
                                                   'alpha':alpha})
  
  return {'scores_per_label_and_feature': weights_per_label_and_feature.force(),
          'scores_per_label': weights_per_label.force(),
          }
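The idf and root-mean-square normalization above can be checked against a small dense sketch; here `data` is an ordinary NumPy array rather than an Expr, purely for illustration:

import numpy as np

data = np.array([[2., 0., 1.],
                 [0., 3., 1.]])              # 2 documents, 3 features

df = (data > 0).sum(axis=0)                  # document frequency per feature
idf = np.log(data.shape[0] * 1.0 / (df + 1)) + 1

# root mean square of the feature frequencies within each document
rms = np.sqrt(np.square(data).sum(axis=1) * 1.0 / data.shape[1])

tfidf = data / rms.reshape(-1, 1) * idf.reshape(1, -1)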
Example #6
    def precompute(self):
        '''Precompute the k most similar items for each item.

    After this function returns, two attributes will be created.

    Attributes
    ----------
    top_k_similar_table : Numpy array of shape (N, k).
                          Records the k most similar scores for each item.
    top_k_similar_indices : Numpy array of shape (N, k).
                            Records the indices of the k most similar items for each item.
    '''
        M = self.rating_table.shape[0]
        N = self.rating_table.shape[1]

        self.similarity_table = expr.shuffle(
            self.rating_table,
            _similarity_mapper,
            kw={
                'item_norm':
                self._get_norm_of_each_item(self.rating_table),
                'step':
                util.divup(self.rating_table.shape[1],
                           blob_ctx.get().num_workers)
            },
            shape_hint=(N, N))

        # Release the memory for item_norm
        top_k_similar_indices = expr.zeros((N, self.k), dtype=np.int)

        # Find top-k similar items for each item.
        # Store the similarity scores into table top_k_similar table.
        # Store the indices of top k items into table top_k_similar_indices.
        cost = np.prod(top_k_similar_indices.shape)
        top_k_similar_table = expr.shuffle(self.similarity_table,
                                           _select_most_k_similar_mapper,
                                           kw={
                                               'top_k_similar_indices':
                                               top_k_similar_indices,
                                               'k': self.k
                                           },
                                           shape_hint=(N, self.k),
                                           cost_hint={
                                               hash(top_k_similar_indices): {
                                                   '00': 0,
                                                   '01': cost,
                                                   '10': cost,
                                                   '11': cost
                                               }
                                           })
        self.top_k_similar_table = top_k_similar_table.optimized().glom()
        self.top_k_similar_indices = top_k_similar_indices.optimized().glom()
Example #7
def benchmark_svm(ctx, timer):
  
  print "#worker:", ctx.num_workers
  max_iter = 2
  #N = 200000 * ctx.num_workers
  N = 1000 * 64
  D = 64
  
  # create data
  data = expr.randn(N, D, dtype=np.float64, tile_hint=(N, util.divup(D, ctx.num_workers)))
  labels = expr.shuffle(data, _init_label_mapper, shape_hint=(data.shape[0], 1))
  
  t1 = datetime.now()
  w = fit(data, labels, T=max_iter).force()
  t2 = datetime.now()
  util.log_warn('train time per iteration:%s ms, final w:%s', millis(t1,t2)/max_iter, w.glom().T)
  
  correct = 0
  for i in range(10):
    new_data = expr.randn(1, D, dtype=np.float64, tile_hint=[1, D])
    new_label = predict(w, new_data)
    #print 'point %s, predict %s' % (new_data.glom(), new_label)
     
    new_data = new_data.glom()
    if (new_data[0,0] >= new_data[0,1] and new_label == 1.0) or (new_data[0,0] < new_data[0,1] and new_label == -1.0):
      correct += 1
  print 'predict precision:', correct * 1.0 / 10
Example #8
def benchmark_naive_bayes(ctx, timer):
  
  print "#worker:", ctx.num_workers
  N = 100000 * ctx.num_workers
  D = 128
  
  # create data
  data = expr.randint(N, D, low=0, high=D, tile_hint=(N/ctx.num_workers, D))
  labels = expr.eager(expr.shuffle(data, _init_label_mapper))
    
  #util.log_warn('data:%s, label:%s', data.glom(), labels.glom())   
  
  util.log_warn('begin train')
  t1 = datetime.now()
  model = fit(data, labels, D)
  t2 = datetime.now()
  util.log_warn('train time:%s ms', millis(t1,t2))

  correct = 0
  for i in range(10):
    new_data = expr.randint(1, D, low=0, high=D, tile_hint=(1, D))
    new_label = predict(model, new_data)
    #print 'point %s, predict %s' % (new_data.glom(), new_label)
   
    new_data = new_data.glom()
    if np.isclose(new_data[0, new_label], np.max(new_data)):
      correct += 1
  print 'predict precision:', correct * 1.0 / 10
Example #9
def streaming_kmeans(points, k=10, num_iters=10, num_ballkmeans_runs=2, trim_factor=0.9,
                     test_probability=0.1, correct_weight=False):
  '''
  clustering data points using streaming kmeans method.

  Args:
    points(DistArray): data points to be clustered.
    k(int): the final number of clusters.
    num_iters(int): the number of iterations to run in each ball kmeans run.
    num_ballkmeans_runs(int): the number of ball kmeans to run.
    trim_factor(float): the ball kmeans parameter to separate the nearest points and distant points.
    test_probability(float): the percentage of points to be chosen as test set.
    correct_weight(bool): whether to correct the weights of the centroids.
  '''
  centroids = expr.tile_operation(points,
                                  _streaming_mapper,
                                  kw={'k': k}).evaluate()

  new_centroids = []
  for tile_result in centroids.values():
    for centroids_list in tile_result:
      new_centroids.extend(centroids_list)

  centroids = ball_kmeans(new_centroids, k, num_iters, num_ballkmeans_runs, trim_factor,
                          test_probability, correct_weight)

  centers = np.zeros((k, points.shape[1]))
  for i in range(k):
    centers[i] = centroids[i].get_center()

  return expr.shuffle(points, _cluster_mapper,
                      kw={'centers': centers}, shape_hint=(points.shape[0],))
Example #10
def benchmark_naive_bayes(ctx, timer):
  
  print "#worker:", ctx.num_workers
  #N = 100000 * ctx.num_workers
  N = 10000 * 64
  D = 128
  
  # create data
  data = expr.randint(N, D, low=0, high=D, tile_hint=(N, D/ctx.num_workers))
  labels = expr.shuffle(expr.ndarray((data.shape[0], 1), dtype=np.int), _init_label_mapper,
                        kw={'data': data}, shape_hint=(data.shape[0], 1), 
                        cost_hint={hash(data):{'00': 0, '10': np.prod(data.shape)}}
                       )
    
  #util.log_warn('data:%s, label:%s', data.glom(), labels.glom())   
  
  util.log_warn('begin train')
  t1 = datetime.now()
  model = fit(data, labels, D)
  t2 = datetime.now()
  util.log_warn('train time:%s ms', millis(t1,t2))

  correct = 0
  for i in range(10):
    new_data = expr.randint(1, D, low=0, high=D, tile_hint=(1, D))
    new_label = predict(model, new_data)
    #print 'point %s, predict %s' % (new_data.glom(), new_label)
   
    new_data = new_data.glom()
    if np.isclose(new_data[0, new_label], np.max(new_data)):
      correct += 1
  print 'predict precision:', correct * 1.0 / 10
Example #11
  def _evaluate(self, ctx, deps):
    V, M, U = deps['V'], deps['M'], deps['U']

    strata = _compute_strata(V)
    util.log_info('Start eval')
    
    for i, stratum in enumerate(strata):
      util.log_info('Processing stratum: %d of %d (size = %d)', i, len(strata), len(stratum))
      #for ex in stratum: print ex

      worklist = set(stratum)
      expr.shuffle(V, sgd_netflix_mapper,
                   kw={'V' : lazify(V), 'M' : lazify(M), 'U' : lazify(U),
                       'worklist' : worklist }).force()
                       
    util.log_info('Eval done.')
Example #12
 def test_slice_shuffle(self):
   x = expr.arange((TEST_SIZE, TEST_SIZE))
   z = x[5:8, 5:8]
   z = expr.shuffle(z, add_one_extent) 
   val = z.force()
   nx = np.arange(TEST_SIZE*TEST_SIZE).reshape(TEST_SIZE, TEST_SIZE)
   
   Assert.all_eq(val.glom(), nx[5:8, 5:8] + 1)
def saveAsTextFile(ctx, dim):
	matrix = eager(
			expr.shuffle(
				expr.ndarray(
					(dim, dim),
					dtype = np.int32,
					tile_hint = (dim, dim / ctx.num_workers)),
					#tile_hint = (2, 2)),
			make_matrix,
			))
Example #14
def pagerank_sparse(num_pages,
                    num_outlinks,
                    same_site_prob):
  result = expr.ndarray((num_pages, num_pages), dtype=np.float32, sparse=True)
  cost = num_pages * num_pages
  return expr.shuffle(result,
                      target=result,
                      fn=_make_site_sparse,
                      kw = { 'num_outlinks' : num_outlinks, 
                             'same_site_prob' : same_site_prob }, 
                      cost_hint={hash(result):{'11':0, '01':cost, '10':cost, '00':cost}})
Example #15
def fit(data, labels, label_size, alpha=1.0):
  '''
  Train standard naive bayes model.
 
  Args:
    data(Expr): documents to be trained.
    labels(Expr): the correct labels of the training data.
    label_size(int): the number of different labels.
    alpha(float): alpha parameter of naive bayes model.
  '''
  # calc document freq
  df = expr.reduce(data,
                   axis=0,
                   dtype_fn=lambda input: input.dtype,
                   local_reduce_fn=lambda ex, data, axis: (data > 0).sum(axis),
                   accumulate_fn=np.add)
  
  idf = expr.log(data.shape[0] * 1.0 / (df + 1)) + 1
   
  # Normalized Frequency for a feature in a document is calculated by dividing the feature frequency 
  # by the root mean square of feature frequencies in that document
  square_sum = expr.reduce(data,
                           axis=1,
                           dtype_fn=lambda input: input.dtype,
                           local_reduce_fn=lambda ex, data, axis: np.square(data).sum(axis),
                           accumulate_fn=np.add)
  
  rms = expr.sqrt(square_sum * 1.0 / data.shape[1])
  
  # calculate weight normalized Tf-Idf
  data = data / rms.reshape((data.shape[0], 1)) * idf.reshape((1, data.shape[1]))
  
  # add up all the feature vectors with the same labels
  #weights_per_label_and_feature = expr.ndarray((label_size, data.shape[1]), dtype=np.float64)
  #for i in range(label_size):
  #  i_mask = (labels == i)
  #  weights_per_label_and_feature = expr.assign(weights_per_label_and_feature, np.s_[i, :], expr.sum(data[i_mask, :], axis=0))
  weights_per_label_and_feature = expr.shuffle(expr.retile(data, tile_hint=util.calc_tile_hint(data, axis=0)),
                                               _sum_instance_by_label_mapper,
                                               target=expr.ndarray((label_size, data.shape[1]), dtype=np.float64, reduce_fn=np.add),
                                               kw={'labels': labels, 'label_size': label_size},
                                               cost_hint={hash(labels):{'00':0, '01':np.prod(labels.shape)}})

  # sum up all the weights for each label from the previous step
  weights_per_label = expr.sum(weights_per_label_and_feature, axis=1)
  
  # generate naive bayes per_label_and_feature weights
  weights_per_label_and_feature = expr.log((weights_per_label_and_feature + alpha) / 
                                           (weights_per_label.reshape((weights_per_label.shape[0], 1)) + 
                                            alpha * weights_per_label_and_feature.shape[1]))

  return {'scores_per_label_and_feature': weights_per_label_and_feature.optimized().force(),
          'scores_per_label': weights_per_label.optimized().force(),
          }
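The last step above is a Laplace-smoothed log estimate; a dense NumPy sketch of just that step, with small illustrative arrays standing in for the distributed ones:

import numpy as np

alpha = 1.0
counts = np.array([[3., 1., 0.],
                   [0., 2., 4.]])            # (label_size, num_features)
per_label = counts.sum(axis=1)               # total weight for each label

scores = np.log((counts + alpha) /
                (per_label.reshape(-1, 1) + alpha * counts.shape[1]))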
Example #16
def spectral_cluster(points, k=10, num_iter=10, similarity_measurement='rbf'):
    '''
  clustering data points using kmeans spectral clustering method.

  Args:
    points(Expr or DistArray): the data points to be clustered.
    k(int): the number of clusters we need to generate.
    num_iter(int): the max number of iterations that kmeans clustering method runs. 
    similarity_measurement(str): distance method used to measure similarity between two points.
  '''
    # calculate similarity for each pair of points to generate the adjacency matrix A
    A = expr.shuffle(points,
                     _row_similarity_mapper,
                     kw={'similarity_measurement': similarity_measurement},
                     shape_hint=(points.shape[0], points.shape[0]))

    num_dims = A.shape[1]

    # Construct the diagonal matrix D
    D = expr.sum(A, axis=1, tile_hint=(A.shape[0], ))

    # Calculate the normalized Laplacian of the form: L = D^(-0.5)AD^(-0.5)
    L = expr.shuffle(A, _laplacian_mapper, kw={'D': D}, shape_hint=A.shape)

    # Perform eigen-decomposition using Lanczos solver
    overshoot = min(k * 2, num_dims)
    d, U = lanczos.solve(L, L, overshoot, True)
    U = U[:, 0:k]

    # Generate initial clusters which picks rows as centers if that row contains max eigen
    # value in that column
    init_clusters = U[np.argmax(U, axis=0)]

    # Run kmeans clustering with init_clusters
    kmeans = KMeans(k, num_iter)
    U = expr.from_numpy(U)
    centers, labels = kmeans.fit(U, init_clusters)

    return labels
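A dense sketch of the normalized Laplacian L = D^(-0.5) A D^(-0.5) mentioned in the comment above, assuming a small symmetric affinity matrix with non-zero degrees (the per-tile _laplacian_mapper itself is not shown in this listing):

import numpy as np

A = np.array([[0., 1., 1.],
              [1., 0., 0.],
              [1., 0., 0.]])
d = A.sum(axis=1)                            # degree of each point
d_inv_sqrt = 1.0 / np.sqrt(d)
L = A * d_inv_sqrt.reshape(-1, 1) * d_inv_sqrt.reshape(1, -1)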
Example #17
def pagerank_sparse(num_pages,
                    num_outlinks,
                    same_site_prob,
                    hint):
   
  return expr.shuffle(
           expr.ndarray((num_pages, num_pages), 
                        dtype=np.float32, 
                        tile_hint=hint, 
                        sparse=True),
             fn=_make_site_sparse,
             kw = { 'num_outlinks' : num_outlinks, 
                    'same_site_prob' : same_site_prob })
Example #18
    def _evaluate(self, ctx, deps):
        V, M, U = deps['V'], deps['M'], deps['U']

        strata = _compute_strata(V)
        util.log_info('Start eval')

        for i, stratum in enumerate(strata):
            util.log_info('Processing stratum: %d of %d (size = %d)', i,
                          len(strata), len(stratum))
            #for ex in stratum: print ex

            worklist = set(stratum)
            expr.shuffle(V,
                         sgd_netflix_mapper,
                         kw={
                             'V': lazify(V),
                             'M': lazify(M),
                             'U': lazify(U),
                             'worklist': worklist
                         }).evaluate()

        util.log_info('Eval done.')
Example #19
def learn_topics(terms_docs_matrix, k_topics, alpha=0.1, eta=0.1, max_iter=10, max_iter_per_doc=1):
  '''
  Using Collapsed Variational Bayes method (Mahout implementation) to train LDA topic model.

  Args:
    terms_docs_matrix(Expr or DistArray): the count of each term in each document.
    k_topics: the number of topics we need to find.
    alpha(float): parameter of LDA model.
    eta(float): parameter of LDA model.
    max_iter(int):the max iterations to train LDA topic model.
    max_iter_per_doc: the max iterations to train each document.
  '''
  topic_term_counts = expr.rand(k_topics, terms_docs_matrix.shape[0], 
                                tile_hint=(k_topics, terms_docs_matrix.shape[0]))

  for i in range(max_iter):
    new_topic_term_counts = expr.ndarray((k_topics, terms_docs_matrix.shape[0]), 
                                         dtype=np.float64, 
                                         reduce_fn=np.add, 
                                         tile_hint=(k_topics, terms_docs_matrix.shape[0]))
    topic_term_counts = expr.shuffle(terms_docs_matrix, _lda_mapper, target=new_topic_term_counts, 
                                     kw={'k_topics': k_topics, 'alpha': alpha, 'eta':eta, 
                                         'max_iter_per_doc': max_iter_per_doc, 
                                         'topic_term_counts': topic_term_counts})
    
  # calculate the doc-topic inference
  doc_topics = expr.shuffle(terms_docs_matrix, _lda_doc_topic_mapper, 
                            kw={'k_topics': k_topics, 'alpha': alpha, 'eta':eta, 
                                'max_iter_per_doc': max_iter_per_doc, 
                                'topic_term_counts': topic_term_counts})
  
  # normalize the topic-term distribution  
  norm_val = expr.reduce(topic_term_counts, axis=1, 
                         dtype_fn=lambda input: input.dtype, 
                         local_reduce_fn=lambda ex, data, axis:np.abs(data).sum(axis), 
                         accumulate_fn=np.add)
  topic_term_counts = topic_term_counts / norm_val.reshape((topic_term_counts.shape[0], 1))

  return doc_topics, topic_term_counts
Example #20
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  clustering data points using fuzzy kmeans clustering method.
  
  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the parameter of fuzzy kmeans. 
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = expr.force(points)
  num_dim = points.shape[1]
  if centers is None:
      centers = expr.rand(k, num_dim, tile_hint=(k, num_dim))
  
  labels = expr.zeros((points.shape[0],), dtype=np.int, tile_hint=(points.shape[0]/len(points.tiles),))
  for iter in range(num_iter):
    new_centers = expr.ndarray((k, num_dim), reduce_fn=lambda a, b: a + b, tile_hint=(k, num_dim))
    new_counts = expr.ndarray((k, 1), dtype=np.float, reduce_fn=lambda a, b: a + b, tile_hint=(k, 1))
    expr.shuffle(points, _fuzzy_kmeans_mapper, kw={'old_centers': centers, 
                                                   'centers': new_centers, 
                                                   'counts': new_counts, 
                                                   'labels': labels, 
                                                   'm': m}).force()
    
    # If any centroids don't have any points assigned to them.
    zcount_indices = (new_counts.glom() == 0).reshape(k)
      
    if np.any(zcount_indices):
      # One or more centroids may not have any points assigned to them, which results in their
      # position being the zero-vector.  We reseed these centroids with new random values
      # and set their counts to 1 in order to get rid of dividing by zero.
      new_counts[zcount_indices, :] = 1
      new_centers[zcount_indices, :] = np.random.rand(np.count_nonzero(zcount_indices), num_dim)
        
    centers = new_centers / new_counts
    
  return labels
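The membership weights a fuzzy k-means mapper typically computes follow the standard fuzzy c-means formula; this is a hedged NumPy sketch of that formula with m as the fuzziness parameter (the actual _fuzzy_kmeans_mapper is not shown here, so treat this as the textbook update rather than the library's code):

import numpy as np

def fuzzy_memberships(points, centers, m=2.0, eps=1e-9):
  # squared distances from every point to every center: (n_points, k)
  dist = np.square(points[:, None, :] - centers[None, :, :]).sum(axis=2) + eps
  # u[i, j] is proportional to dist[i, j] ** (-1 / (m - 1)), normalized over centers
  inv = dist ** (-1.0 / (m - 1))
  return inv / inv.sum(axis=1, keepdims=True)

points = np.random.rand(6, 2)
centers = np.random.rand(3, 2)
m = 2.0
u = fuzzy_memberships(points, centers, m)    # each row sums to 1
w = u.T ** m                                 # (k, n_points) membership weights
new_centers = np.dot(w, points) / w.sum(axis=1).reshape(-1, 1)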
Example #21
  def fit(self, X, centers = None):
    """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    """
    X = expr.force(X)
    num_dim = X.shape[1]
    labels = expr.zeros((X.shape[0],1), dtype=np.int, tile_hint=X.tile_shape())
  
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)
    
    for i in range(self.n_iter):
      # Reset them to zero.
      new_centers = expr.ndarray((self.n_clusters, num_dim), reduce_fn=lambda a, b: a + b)
      new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int, reduce_fn=lambda a, b: a + b)
      
      _ = expr.shuffle(X,
                        _find_cluster_mapper,
                        kw={'d_pts' : X,
                            'old_centers' : centers,
                            'new_centers' : new_centers,
                            'new_counts' : new_counts,
                            'labels': labels
                            })
      _.force()

      new_counts = new_counts.glom()
      new_centers = new_centers.glom()
      
      # If any centroids don't have any points assigned to them.
      zcount_indices = (new_counts == 0).reshape(self.n_clusters)
      
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector.  We reseed these
        # centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # In order to get rid of dividing by zero.
        new_counts[zcount_indices] = 1
        new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      new_centers = new_centers / new_counts
      centers = new_centers

    return centers, labels
Example #22
def canopy_cluster(points, t1=0.1, t2=0.1, cf=1):
  '''
  A simple implementation of canopy clustering method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    t1(float): distance threshold between center point and the points within a canopy.
    t2(float): distance threshold between center point and the points within a canopy.
    cf(int): the minimum canopy size.
  '''
  new_points = expr.tile_operation(points, _canopy_mapper, kw={'t1': t1, 't2': t2, 'cf': cf}).evaluate()
  centers = find_centers(new_points.values(), t1, t2, cf)
  labels = expr.shuffle(points, _cluster_mapper, kw={'centers': centers}, shape_hint=(points.shape[0],))

  return labels
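A single-machine sketch of the canopy idea described in the docstring above: t1 is the loose threshold that defines canopy membership, t2 the tight threshold that removes points from further consideration. The distributed _canopy_mapper / find_centers pair is not shown in this listing, so this is only the textbook procedure with illustrative thresholds:

import numpy as np

def canopy_centers(points, t1=0.5, t2=0.2):
  remaining = list(range(len(points)))
  centers, canopies = [], []
  while remaining:
    c = points[remaining[0]]                 # pick an arbitrary remaining point
    dists = np.linalg.norm(points[remaining] - c, axis=1)
    # every point within t1 joins this canopy...
    canopies.append([i for i, d in zip(remaining, dists) if d <= t1])
    centers.append(c)
    # ...and points within the tighter t2 are removed as future candidates.
    remaining = [i for i, d in zip(remaining, dists) if d > t2]
  return np.array(centers), canopies

centers, canopies = canopy_centers(np.random.rand(20, 2))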
Example #23
def als(A, la=0.065, alpha=40, implicit_feedback=False, num_features=20, num_iter=10):
  '''
  compute the factorization A = U M' using the alternating least-squares (ALS) method.
  
  where `A` is the "ratings" matrix which maps from a user and item to a rating score, 
        `U` and `M` are the factor matrices, which represent user and item preferences.
  Args:
    A(Expr or DistArray): the rating matrix which maps from a user and item to a rating score.
    la(float): the parameter of the als.
    alpha(int): confidence parameter used on implicit feedback.
    implicit_feedback(bool): whether using implicit_feedback method for als.
    num_features(int): dimension of the feature space.
    num_iter(int): max iteration to run.
  '''
  num_users = A.shape[0]
  num_items = A.shape[1]
 
  AT = expr.transpose(A)

  avg_rating = expr.sum(A, axis=0) * 1.0 / expr.count_nonzero(A, axis=0)

  M = expr.rand(num_items, num_features)
  M = expr.assign(M, np.s_[:, 0], avg_rating.reshape((avg_rating.shape[0], 1)))
  
  for i in range(num_iter):
    # Recomputing U
    U = expr.shuffle(expr.retile(A, tile_hint=util.calc_tile_hint(A, axis=0)), 
                     _solve_U_or_M_mapper, 
                     kw={'U_or_M': M, 'la': la, 'alpha': alpha, 'implicit_feedback': implicit_feedback}, 
                     shape_hint=(num_users, num_features)).optimized()
    # Recomputing M
    M = expr.shuffle(expr.retile(AT, tile_hint=util.calc_tile_hint(AT, axis=0)), 
                     _solve_U_or_M_mapper, 
                     kw={'U_or_M': U, 'la': la, 'alpha': alpha, 'implicit_feedback': implicit_feedback}, 
                     shape_hint=(num_items, num_features)).optimized()
  return U, M
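For explicit feedback, the per-row solve inside an ALS mapper is a ridge-regularized least-squares problem; a NumPy sketch of updating one user's factors, assuming _solve_U_or_M_mapper follows the usual normal-equation formulation (this is the standard ALS update, not a transcription of the mapper):

import numpy as np

def solve_user(a_row, M, la):
  # Solve (M^T M + la * I) u = M^T a_row for this user's factor vector.
  num_features = M.shape[1]
  lhs = np.dot(M.T, M) + la * np.eye(num_features)
  rhs = np.dot(M.T, a_row)
  return np.linalg.solve(lhs, rhs)

A = np.random.rand(4, 6)                     # 4 users x 6 items
M = np.random.rand(6, 20)                    # item factors, num_features = 20
u0 = solve_user(A[0], M, la=0.065)           # updated factors for user 0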
Example #24
def als(A, la=0.065, alpha=40, implicit_feedback=False, num_features=20, num_iter=10):
  '''
  compute the factorization A = U M' using the alternating least-squares (ALS) method.
  
  where `A` is the "ratings" matrix which maps from a user and item to a rating score, 
        `U` and `M` are the factor matrices, which represent user and item preferences.
  Args:
    A(Expr or DistArray): the rating matrix which maps from a user and item to a rating score.
    la(float): the parameter of the als.
    alpha(int): confidence parameter used on implicit feedback.
    implicit_feedback(bool): whether using implicit_feedback method for als.
    num_features(int): dimension of the feature space.
    num_iter(int): max iteration to run.
  '''
  A = expr.force(A)
  AT = expr.shuffle(expr.ndarray((A.shape[1], A.shape[0]), dtype=A.dtype,
                                 tile_hint=(A.shape[1] / len(A.tiles), A.shape[0])),
                    _transpose_mapper, kw={'orig_array': A})
  
  num_items = A.shape[1]
  
  avg_rating = expr.sum(A, axis=0, tile_hint=(num_items / len(A.tiles),)) * 1.0 / \
               expr.count_nonzero(A, axis=0, tile_hint=(num_items / len(A.tiles),))
  
  M = expr.shuffle(expr.ndarray((num_items, num_features), 
                                tile_hint=(num_items / len(A.tiles), num_features)), 
                   _init_M_mapper, kw={'avg_rating': avg_rating})
  #util.log_warn('avg_rating:%s M:%s', avg_rating.glom(), M.glom())
  
  for i in range(num_iter):
    # Recomputing U
    U = expr.shuffle(A, _solve_U_or_M_mapper, kw={'U_or_M': M, 'la': la, 'alpha': alpha, 'implicit_feedback': implicit_feedback})
    # Recomputing M
    M = expr.shuffle(AT, _solve_U_or_M_mapper, kw={'U_or_M': U, 'la': la, 'alpha': alpha, 'implicit_feedback': implicit_feedback})
    
  return U, M
Example #25
def spectral_cluster(points, k=10, num_iter=10, similarity_measurement='rbf'):
  '''
  clustering data points using kmeans spectral clustering method.

  Args:
    points(Expr or DistArray): the data points to be clustered.
    k(int): the number of clusters we need to generate.
    num_iter(int): the max number of iterations that kmeans clustering method runs. 
    similarity_measurement(str): distance method used to measure similarity between two points.
  '''  
  # calculate similarity for each pair of points to generate the adjacency matrix A
  A = expr.shuffle(points, _row_similarity_mapper, kw={'similarity_measurement': similarity_measurement})
  
  num_dims = A.shape[1]
  
  # Construct the diagonal matrix D
  D = expr.sum(A, axis=1, tile_hint=(A.shape[0],))
  
  # Calculate the normalized Laplacian of the form: L = D^(-0.5)AD^(-0.5)
  L = expr.shuffle(A, _laplacian_mapper, kw={'D': D})
  
  # Perform eigen-decomposition using Lanczos solver
  overshoot = min(k * 2, num_dims) 
  d, U = lanczos.solve(L, L, overshoot, True)
  U = U[:, 0:k]
  
  # Generate initial clusters which picks rows as centers if that row contains max eigen 
  # value in that column
  init_clusters = U[np.argmax(U, axis=0)]
  
  # Run kmeans clustering with init_clusters
  kmeans = KMeans(k, num_iter)
  U = expr.from_numpy(U)
  centers, labels = kmeans.fit(U, init_clusters)
  
  return labels
Example #26
def benchmark_pagerank(ctx, timer):
  num_pages = PAGES_PER_WORKER * ctx.num_workers
  util.log_info('Total pages: %s', num_pages)

  wts = eager(
    expr.shuffle(
      expr.ndarray(
        (num_pages, num_pages), 
        dtype=np.float32,
        tile_hint=(num_pages, PAGES_PER_WORKER / 8)),
      make_weights,
    ))

  p = eager(expr.ones((num_pages, 1), 
                      tile_hint=(PAGES_PER_WORKER / 8, 1), 
                      dtype=np.float32))

  for i in range(3):
    timer.time_op('pagerank', lambda: expr.dot(wts, p).force())
Example #27
def benchmark_pagerank(ctx, timer):
    num_pages = PAGES_PER_WORKER * ctx.num_workers
    util.log_info('Total pages: %s', num_pages)

    wts = eager(
        expr.shuffle(
            expr.ndarray((num_pages, num_pages),
                         dtype=np.float32,
                         tile_hint=(num_pages, PAGES_PER_WORKER / 8)),
            make_weights,
        ))

    p = eager(
        expr.ones((num_pages, 1),
                  tile_hint=(PAGES_PER_WORKER / 8, 1),
                  dtype=np.float32))

    for i in range(3):
        timer.time_op('pagerank', lambda: expr.dot(wts, p).force())
Example #28
def fit(data, labels, num_tiles, T=50, la=1.0):
  '''
  Train an SVM model using the disdca (2013) algorithm.
 
  Args:
    data(Expr): points to be trained.
    labels(Expr): the correct labels of the training data.
    num_tiles(int): the total tiles of the training data.
    T(int): max training iterations.
    la(float): lambda parameter of this SVM model.
  '''
  w = None
  m = data.shape[0] / num_tiles
  alpha = expr.zeros((m * num_tiles, 1), dtype=np.float64, tile_hint=(m,1)).force()
  for i in range(T):
    new_weight = expr.ndarray((data.shape[1], 1), dtype=np.float64, reduce_fn=np.add, tile_hint=(data.shape[1], 1))
    new_weight = expr.shuffle(data, _svm_mapper, target=new_weight, kw={'labels': labels, 'alpha': alpha, 'w': w, 'm': m, 'scale': num_tiles, 'lambda_n': la * data.shape[0]})
    w = new_weight / num_tiles
  return w
Example #29
def pagerank_sparse(num_pages, num_outlinks, same_site_prob):
    result = expr.ndarray((num_pages, num_pages),
                          dtype=np.float32,
                          sparse=True)
    cost = num_pages * num_pages
    return expr.shuffle(result,
                        target=result,
                        fn=_make_site_sparse,
                        kw={
                            'num_outlinks': num_outlinks,
                            'same_site_prob': same_site_prob
                        },
                        cost_hint={
                            hash(result): {
                                '11': 0,
                                '01': cost,
                                '10': cost,
                                '00': cost
                            }
                        })
Example #30
def fit(data, labels, T=50, la=1.0):
  '''
  Train an SVM model using the disdca (2013) algorithm.
 
  Args:
    data(Expr): points to be trained.
    labels(Expr): the correct labels of the training data.
    T(int): max training iterations.
    la(float): lambda parameter of this SVM model.
  '''
  w = expr.zeros((data.shape[1], 1), dtype=np.float64)
  alpha = expr.zeros((data.shape[0], 1), dtype=np.float64)
  for i in range(T):
    alpha = expr.shuffle(expr.retile(data, tile_hint=util.calc_tile_hint(data, axis=0)),
                         _svm_mapper, 
                         kw={'labels': labels, 'alpha': alpha, 'w': w, 'lambda_n': la * data.shape[0]},
                         shape_hint=alpha.shape, 
                         cost_hint={ hash(labels) : {'00': 0, '01': np.prod(labels.shape)}, hash(alpha) : {'00': 0, '01': np.prod(alpha.shape)} })
    w = expr.sum(data * alpha * 1.0 / la / data.shape[0], axis=0).reshape((data.shape[1], 1))
    w = w.optimized()
  return w
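The last two lines of the loop above recover the primal weight vector from the dual variables; the same step with plain NumPy arrays standing in for the Exprs (illustrative shapes only):

import numpy as np

la = 1.0
X = np.random.randn(100, 5)                  # data, one point per row
alpha = np.random.rand(100, 1)               # dual variables, one per point

# w = (1 / (lambda * n)) * sum_i alpha_i * x_i, as a column vector
w = (X * alpha * 1.0 / la / X.shape[0]).sum(axis=0).reshape(-1, 1)
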
def pagerankDistributed(ctx, numPage, numIters, alpha):
  sGenerate = time.time()
  rank = eager(expr.ones((numPage, 1), tile_hint = (numPage / ctx.num_workers, 1), dtype = np.float32))
  linkMatrix = eager(
              expr.shuffle(
                expr.ndarray(
                  (numPage, numPage),
                  dtype = np.float32,
                  tile_hint = (numPage, numPage / ctx.num_workers)),
              make_weights,
              ))
  eGenerate = time.time()
  util.log_info("**pagerank** rank init finished")
  startCompute = time.time()
  for i in range(numIters):
    #rank = ((1 - alpha) * expr.dot(linkMatrix, rank,tile_hint = (numPage, numPage/10))) + belta
    rank = expr.dot(linkMatrix, rank, tile_hint = (numPage, numPage/10))
  rank.evaluate()
  endCompute = time.time()
  util.log_info("**pagerank** compute finished")
  return (eGenerate - sGenerate, endCompute - startCompute)
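The commented-out line in the loop above suggests the damped form of the power iteration, r <- (1 - alpha) * M * r + teleport; a plain NumPy sketch of that damped iteration, assuming a column-stochastic link matrix and interpreting the teleport term as alpha / N (an assumption, since the original 'belta' is not defined in this listing):

import numpy as np

def pagerank(M, alpha=0.15, num_iters=50):
  n = M.shape[0]
  r = np.ones((n, 1)) / n
  for _ in range(num_iters):
    # damped power iteration step (teleport term assumed uniform)
    r = (1 - alpha) * np.dot(M, r) + alpha / n
  return r

# Tiny 3-page example with a column-stochastic link matrix.
M = np.array([[0.0, 0.5, 1.0],
              [0.5, 0.0, 0.0],
              [0.5, 0.5, 0.0]])
r = pagerank(M)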
Example #32
def streaming_kmeans(points,
                     k=10,
                     num_iters=10,
                     num_ballkmeans_runs=2,
                     trim_factor=0.9,
                     test_probability=0.1,
                     correct_weight=False):
    '''
  clustering data points using streaming kmeans method.

  Args:
    points(DistArray): data points to be clustered.
    k(int): the final number of clusters.
    num_iters(int): the number of iterations to run in each ball kmeans run.
    num_ballkmeans_runs(int): the number of ball kmeans to run.
    trim_factor(float): the ball kmeans parameter to separate the nearest points and distant points.
    test_probability(float): the percentage of points to be chosen as test set.
    correct_weight(bool): whether to correct the weights of the centroids.
  '''
    centroids = expr.tile_operation(points, _streaming_mapper, kw={
        'k': k
    }).evaluate()

    new_centroids = []
    for tile_result in centroids.values():
        for centroids_list in tile_result:
            new_centroids.extend(centroids_list)

    centroids = ball_kmeans(new_centroids, k, num_iters, num_ballkmeans_runs,
                            trim_factor, test_probability, correct_weight)

    centers = np.zeros((k, points.shape[1]))
    for i in range(k):
        centers[i] = centroids[i].get_center()

    return expr.shuffle(points,
                        _cluster_mapper,
                        kw={'centers': centers},
                        shape_hint=(points.shape[0], ))
Example #33
  def test_pagerank(self):
    _skip_if_travis()
    OUTLINKS_PER_PAGE = 10
    PAGES_PER_WORKER = 1000000
    num_pages = PAGES_PER_WORKER * self.ctx.num_workers

    wts = expr.shuffle(
        expr.ndarray(
          (num_pages, num_pages),
          dtype=np.float32,
          tile_hint=(num_pages, PAGES_PER_WORKER / 8)),
        make_weights,
      )

    start = time.time()

    p = expr.eager(expr.ones((num_pages, 1), tile_hint=(PAGES_PER_WORKER / 8, 1),
                             dtype=np.float32))

    expr.dot(wts, p, tile_hint=(PAGES_PER_WORKER / 8, 1)).evaluate()

    cost = time.time() - start
    self._verify_cost("pagerank", cost)
Example #34
def canopy_cluster(points, t1=0.1, t2=0.1, cf=1):
    '''
  A simple implementation of canopy clustering method.
  
  Args:
    points(Expr or DistArray): the input data points matrix.
    t1(float): distance threshold between center point and the points within a canopy. 
    t2(float): distance threshold between center point and the points within a canopy.
    cf(int): the minimum canopy size.
  '''
    new_points = expr.tile_operation(points,
                                     _canopy_mapper,
                                     kw={
                                         't1': t1,
                                         't2': t2,
                                         'cf': cf
                                     }).force()
    centers = find_centers(new_points.values(), t1, t2, cf)
    labels = expr.shuffle(points,
                          _cluster_mapper,
                          kw={'centers': centers},
                          shape_hint=(points.shape[0], ))

    return labels
Example #35
def fit(data, labels, label_size, alpha=1.0):
    '''
  Train standard naive bayes model.
 
  Args:
    data(Expr): documents to be trained.
    labels(Expr): the correct labels of the training data.
    label_size(int): the number of different labels.
    alpha(float): alpha parameter of naive bayes model.
  '''
    # calc document freq
    df = expr.reduce(data,
                     axis=0,
                     dtype_fn=lambda input: input.dtype,
                     local_reduce_fn=lambda ex, data, axis:
                     (data > 0).sum(axis),
                     accumulate_fn=np.add)

    idf = expr.log(data.shape[0] * 1.0 / (df + 1)) + 1

    # Normalized Frequency for a feature in a document is calculated by dividing the feature frequency
    # by the root mean square of feature frequencies in that document
    square_sum = expr.reduce(
        data,
        axis=1,
        dtype_fn=lambda input: input.dtype,
        local_reduce_fn=lambda ex, data, axis: np.square(data).sum(axis),
        accumulate_fn=np.add)

    rms = expr.sqrt(square_sum * 1.0 / data.shape[1])

    # calculate weight normalized Tf-Idf
    data = data / rms.reshape((data.shape[0], 1)) * idf.reshape(
        (1, data.shape[1]))

    # add up all the feature vectors with the same labels
    #weights_per_label_and_feature = expr.ndarray((label_size, data.shape[1]), dtype=np.float64)
    #for i in range(label_size):
    #  i_mask = (labels == i)
    #  weights_per_label_and_feature = expr.assign(weights_per_label_and_feature, np.s_[i, :], expr.sum(data[i_mask, :], axis=0))
    weights_per_label_and_feature = expr.shuffle(
        expr.retile(data, tile_hint=util.calc_tile_hint(data, axis=0)),
        _sum_instance_by_label_mapper,
        target=expr.ndarray((label_size, data.shape[1]),
                            dtype=np.float64,
                            reduce_fn=np.add),
        kw={
            'labels': labels,
            'label_size': label_size
        },
        cost_hint={hash(labels): {
                       '00': 0,
                       '01': np.prod(labels.shape)
                   }})

    # sum up all the weights for each label from the previous step
    weights_per_label = expr.sum(weights_per_label_and_feature, axis=1)

    # generate naive bayes per_label_and_feature weights
    weights_per_label_and_feature = expr.log(
        (weights_per_label_and_feature + alpha) /
        (weights_per_label.reshape((weights_per_label.shape[0], 1)) +
         alpha * weights_per_label_and_feature.shape[1]))

    return {
        'scores_per_label_and_feature':
        weights_per_label_and_feature.optimized().force(),
        'scores_per_label':
        weights_per_label.optimized().force(),
    }
Example #36
  def fit(self, X, centers=None, implementation='outer'):
    """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    """
    num_dim = X.shape[1]
    num_points = X.shape[0]

    labels = expr.zeros((num_points, 1), dtype=np.int)

    if implementation == 'map2':
      if centers is None:
        centers = np.random.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper, fn_kw={"centers": centers},
                           shape=(X.shape[0], ))

        counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(centers.shape[0], ))
        new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                                fn_kw={'centers_count': self.n_clusters},
                                shape=(centers.shape[0], centers.shape[1]))
        counts = counts.optimized().glom()
        centers = new_centers.optimized().glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # In order to get rid of dividing by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
      return centers, labels

    elif implementation == 'outer':
      if centers is None:
        centers = expr.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        labels = expr.outer((X, centers), (0, None), fn=kmeans_outer_dist_mapper,
                            shape=(X.shape[0],))
        #labels = expr.argmin(distances, axis=1)
        counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(centers.shape[0], ))
        new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                                fn_kw={'centers_count': self.n_clusters},
                                shape=(centers.shape[0], centers.shape[1]))
        counts = counts.optimized().glom()
        centers = new_centers.optimized().glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # In order to get rid of dividing by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
        centers = expr.from_numpy(centers)
      return centers, labels
    elif implementation == 'broadcast':
      if centers is None:
        centers = expr.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        util.log_warn("k_means_ %d %d", i, time.time())
        X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
        centers_broadcast = expr.reshape(centers, (1, centers.shape[0],
                                                   centers.shape[1]))
        distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
        labels = expr.argmin(distances, axis=1)
        center_idx = expr.arange((1, centers.shape[0]))
        matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
        matches = matches.astype(np.int64)
        counts = expr.sum(matches, axis=0)
        centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                                matches.shape[1], 1)),
                           axis=0)

        counts = counts.optimized().glom()
        centers = centers.optimized().glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # In order to get rid of dividing by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
        centers = expr.from_numpy(centers)
      return centers, labels
    elif implementation == 'shuffle':
      if centers is None:
        centers = np.random.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        # Reset them to zero.
        new_centers = expr.ndarray((self.n_clusters, num_dim),
                                   reduce_fn=lambda a, b: a + b)
        new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int,
                                  reduce_fn=lambda a, b: a + b)

        _ = expr.shuffle(X,
                         _find_cluster_mapper,
                         kw={'d_pts': X,
                             'old_centers': centers,
                             'new_centers': new_centers,
                             'new_counts': new_counts,
                             'labels': labels},
                         shape_hint=(1,),
                         cost_hint={hash(labels): {'00': 0,
                                                   '01': np.prod(labels.shape)}})
        _.force()

        new_counts = new_counts.glom()
        new_centers = new_centers.glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (new_counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # In order to get rid of dividing by zero.
          new_counts[zcount_indices] = 1
          new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        new_centers = new_centers / new_counts
        centers = new_centers

      return centers, labels
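The 'broadcast' implementation above computes all pairwise squared distances by reshaping the operands so that broadcasting does the pairing; the same trick with small plain NumPy arrays:

import numpy as np

X = np.random.rand(8, 3)                     # 8 points, 3 features
centers = np.random.rand(4, 3)               # 4 centers

# (8, 1, 3) - (1, 4, 3) broadcasts to (8, 4, 3); summing the squares over the
# last axis gives an (8, 4) matrix of squared point-to-center distances.
distances = np.square(X[:, None, :] - centers[None, :, :]).sum(axis=2)
labels = np.argmin(distances, axis=1)        # index of the nearest center per point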
Example #37
    def fit(self, X, centers=None, implementation='map2'):
        """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    """
        num_dim = X.shape[1]
        num_points = X.shape[0]

        labels = expr.zeros((num_points, 1), dtype=np.int)

        if implementation == 'map2':
            if centers is None:
                centers = np.random.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                labels = expr.map2(X,
                                   0,
                                   fn=kmeans_map2_dist_mapper,
                                   fn_kw={"centers": centers},
                                   shape=(X.shape[0], ))

                counts = expr.map2(labels,
                                   0,
                                   fn=kmeans_count_mapper,
                                   fn_kw={'centers_count': self.n_clusters},
                                   shape=(centers.shape[0], ))
                new_centers = expr.map2(
                    (X, labels), (0, 0),
                    fn=kmeans_center_mapper,
                    fn_kw={'centers_count': self.n_clusters},
                    shape=(centers.shape[0], centers.shape[1]))
                counts = counts.optimized().glom()
                centers = new_centers.optimized().glom()

                # If any centroids don't have any points assigned to them.
                zcount_indices = (counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # In order to get rid of dividing by zero.
                    counts[zcount_indices] = 1
                    centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                centers = centers / counts.reshape(centers.shape[0], 1)
            return centers, labels

        elif implementation == 'outer':
            if centers is None:
                centers = expr.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                labels = expr.outer((X, centers), (0, None),
                                    fn=kmeans_outer_dist_mapper,
                                    shape=(X.shape[0], ))
                #labels = expr.argmin(distances, axis=1)
                counts = expr.map2(labels,
                                   0,
                                   fn=kmeans_count_mapper,
                                   fn_kw={'centers_count': self.n_clusters},
                                   shape=(centers.shape[0], ))
                new_centers = expr.map2(
                    (X, labels), (0, 0),
                    fn=kmeans_center_mapper,
                    fn_kw={'centers_count': self.n_clusters},
                    shape=(centers.shape[0], centers.shape[1]))
                counts = counts.optimized().glom()
                centers = new_centers.optimized().glom()

                # If any centroids don't have any points assigned to them.
                zcount_indices = (counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # In order to get rid of dividing by zero.
                    counts[zcount_indices] = 1
                    centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                centers = centers / counts.reshape(centers.shape[0], 1)
                centers = expr.from_numpy(centers)
            return centers, labels
        elif implementation == 'broadcast':
            if centers is None:
                centers = expr.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                util.log_warn("k_means_ %d %d", i, time.time())
                X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
                centers_broadcast = expr.reshape(
                    centers, (1, centers.shape[0], centers.shape[1]))
                distances = expr.sum(expr.square(X_broadcast -
                                                 centers_broadcast),
                                     axis=2)
                labels = expr.argmin(distances, axis=1)
                center_idx = expr.arange((1, centers.shape[0]))
                matches = expr.reshape(labels,
                                       (labels.shape[0], 1)) == center_idx
                matches = matches.astype(np.int64)
                counts = expr.sum(matches, axis=0)
                centers = expr.sum(
                    X_broadcast *
                    expr.reshape(matches,
                                 (matches.shape[0], matches.shape[1], 1)),
                    axis=0)

                counts = counts.optimized().glom()
                centers = centers.optimized().glom()

                # If any centroids don't have any points assigned to them.
                zcount_indices = (counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # In order to get rid of dividing by zero.
                    counts[zcount_indices] = 1
                    centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                centers = centers / counts.reshape(centers.shape[0], 1)
                centers = expr.from_numpy(centers)
            return centers, labels
        elif implementation == 'shuffle':
            if centers is None:
                centers = np.random.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                # Reset them to zero.
                new_centers = expr.ndarray((self.n_clusters, num_dim),
                                           reduce_fn=lambda a, b: a + b)
                new_counts = expr.ndarray((self.n_clusters, 1),
                                          dtype=np.int,
                                          reduce_fn=lambda a, b: a + b)

                _ = expr.shuffle(X,
                                 _find_cluster_mapper,
                                 kw={
                                     'd_pts': X,
                                     'old_centers': centers,
                                     'new_centers': new_centers,
                                     'new_counts': new_counts,
                                     'labels': labels
                                 },
                                 shape_hint=(1, ),
                                 cost_hint={
                                     hash(labels): {
                                         '00': 0,
                                         '01': np.prod(labels.shape)
                                     }
                                 })
                _.force()

                new_counts = new_counts.glom()
                new_centers = new_centers.glom()

                # If any centroids don't have any points assigned to them.
                zcount_indices = (new_counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # In order to get rid of dividing by zero.
                    new_counts[zcount_indices] = 1
                    new_centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                new_centers = new_centers / new_counts
                centers = new_centers

            return centers, labels