Example No. 1
def cgit(A, x):
  '''
  CGIT Conjugate Gradient iteration
  z = cgit(A, x) generates approximate solution to A*z = x.
  
  Args:
  A(Expr): matrix to be processed.
  x(Expr): the input vector.
  '''
  z = expr.zeros(x.shape, tile_hint=(A.tile_shape()[1], 1))
  r = x
  rho = expr.sum(r * r).glom()
  #util.log_warn('rho:%s', rho)
  p = r
  
  for i in xrange(15):
    q = expr.dot(A, p, tile_hint=(A.tile_shape()[1], 1))
    alpha = rho / expr.sum(p * q).glom()
    #util.log_warn('alpha:%s', alpha)
    z = z + p * alpha
    rho0 = rho
    r = r - q * alpha
    rho = expr.sum(r * r).glom()
    beta = rho / rho0
    #util.log_warn('beta:%s', beta)
    p = r + p * beta
  
  return z
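
A plain-NumPy sketch of the same conjugate-gradient recurrence may help when checking the distributed version locally; the fixed iteration count and the small dense system here are illustrative assumptions, not part of the Spartan example:

import numpy as np

def cg_numpy(A, x, num_iter=15):
  # Approximately solve A * z = x using the same recurrence as cgit():
  # rho = r'r, alpha = rho / p'q, beta = rho_new / rho_old.
  z = np.zeros_like(x)
  r = x.copy()
  rho = np.sum(r * r)
  p = r.copy()
  for _ in range(num_iter):
    q = A.dot(p)
    alpha = rho / np.sum(p * q)
    z = z + p * alpha
    rho0 = rho
    r = r - q * alpha
    rho = np.sum(r * r)
    p = r + p * (rho / rho0)
  return z

# Usage on a small symmetric positive-definite system; A.dot(z) approaches x.
A = np.array([[4.0, 1.0], [1.0, 3.0]])
x = np.array([1.0, 2.0])
z = cg_numpy(A, x)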
Example No. 2
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy k-means clustering method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the parameter of fuzzy kmeans.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = expr.force(points)
  num_dim = points.shape[1]
  if centers is None:
      centers = expr.rand(k, num_dim)

  labels = expr.zeros((points.shape[0],), dtype=np.int)

  for iter in range(num_iter):
    centers = expr.as_array(centers)
    points_broadcast = expr.reshape(points, (points.shape[0], 1, points.shape[1]))
    centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
    distances = expr.sum(expr.square(points_broadcast - centers_broadcast), axis=2)
    # Add a small constant to avoid dividing by zero
    distances = distances + 0.00000000001
    util.log_info('distances shape %s' % str(distances.shape))
    distances_broadcast = expr.reshape(distances, (distances.shape[0], 1,
                                                   distances.shape[1]))
    distances_broadcast2 = expr.reshape(distances, (distances.shape[0],
                                                    distances.shape[1], 1))
    prob = 1.0 / expr.sum(expr.power(distances_broadcast / distances_broadcast2,
                                     2.0 / (m - 1)), axis=2)
    prob.force()
    counts = expr.sum(prob, axis=0)
    counts = expr.reshape(counts, (counts.shape[0], 1))
    labels = expr.argmax(prob, axis=1)
    centers = expr.sum(expr.reshape(points, (points.shape[0], 1, points.shape[1])) *
                       expr.reshape(prob, (prob.shape[0], prob.shape[1], 1)),
                       axis=0)

    # We assume that the centers matrix is small enough to be handled on the master.
    counts = counts.glom()
    centers = centers.glom()
    # If any centroids don't have any points assigned to them.
    zcount_indices = (counts == 0).reshape(k)

    if np.any(zcount_indices):
      # One or more centroids may not have any points assigned to them, which results in their
      # position being the zero-vector.  We reseed these centroids with new random values
      # and set their counts to 1 to avoid dividing by zero.
      counts[zcount_indices, :] = 1
      centers[zcount_indices, :] = np.random.rand(np.count_nonzero(zcount_indices),
                                                  num_dim)

    centers = centers / counts
  return labels
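
For reference, a minimal NumPy sketch of the textbook fuzzy c-means membership update, written with the same broadcasting idea as the Spartan expression above (the toy data and the epsilon are assumptions):

import numpy as np

def fuzzy_memberships(points, centers, m=2.0):
  # Squared distance from every point to every center, shape (n_points, k).
  d = np.sum((points[:, None, :] - centers[None, :, :]) ** 2, axis=2)
  d = d + 1e-11  # avoid dividing by zero
  # u[i, j] = 1 / sum_l (d[i, j] / d[i, l]) ** (1 / (m - 1)); rows sum to 1.
  return 1.0 / np.sum((d[:, :, None] / d[:, None, :]) ** (1.0 / (m - 1)), axis=2)

points = np.random.rand(6, 2)
centers = np.random.rand(3, 2)
u = fuzzy_memberships(points, centers)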
Example No. 3
  def fit(self, X, centers=None):
    """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    """
    num_dim = X.shape[1]
    num_points = X.shape[0]

    labels = expr.zeros((num_points, 1), dtype=np.int)

    if centers is None:
      centers = expr.from_numpy(np.random.rand(self.n_clusters, num_dim))

    for i in range(self.n_iter):
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
      distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
      labels = expr.argmin(distances, axis=1)
      center_idx = expr.arange((1, centers.shape[0]))
      matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
      matches = matches.astype(np.int64)
      counts = expr.sum(matches, axis=0)
      centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                              matches.shape[1], 1)),
                         axis=0)

      counts = counts.optimized().glom()
      centers = centers.optimized().glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)

      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector.  We reseed these
        # centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # Set the counts to 1 to avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels
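
The broadcast formulation above reduces to an argmin over pairwise squared distances followed by a one-hot centroid update; a self-contained NumPy sketch of one such iteration, including the reseeding of empty clusters (the toy data is an assumption):

import numpy as np

def kmeans_step(X, centers):
  # Pairwise squared distances, shape (n_samples, n_clusters).
  d = np.sum((X[:, None, :] - centers[None, :, :]) ** 2, axis=2)
  labels = np.argmin(d, axis=1)
  # One-hot membership matrix, shape (n_samples, n_clusters).
  matches = (labels[:, None] == np.arange(centers.shape[0])[None, :]).astype(np.int64)
  counts = matches.sum(axis=0)
  sums = (X[:, None, :] * matches[:, :, None]).sum(axis=0)
  # Reseed empty clusters and set their counts to 1 to avoid dividing by zero.
  empty = counts == 0
  if np.any(empty):
    counts[empty] = 1
    sums[empty, :] = np.random.randn(np.count_nonzero(empty), X.shape[1])
  return sums / counts[:, None], labels

X = np.random.rand(20, 2)
centers = np.random.rand(3, 2)
for _ in range(10):
  centers, labels = kmeans_step(X, centers)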

Example No. 4
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy k-means clustering method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the parameter of fuzzy kmeans.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = points.evaluate()
  num_dim = points.shape[1]
  if centers is None:
      centers = expr.rand(k, num_dim)

  #labels = expr.zeros((points.shape[0],), dtype=np.int)

  for iter in range(num_iter):
    centers = centers.glom()
    fuzzy = expr.map2(points, 0, fn=kmeans_map2_dist_mapper,
                      fn_kw={"centers": centers, "m": m},
                      shape=(points.shape[0], centers.shape[0]))
    labels = expr.argmax(fuzzy, axis=1)
    new_centers = expr.map2((points, fuzzy), (0, 0), fn=kmeans_map2_center_mapper,
                            fn_kw={"centers": centers, "m": m},
                            shape=(centers.shape[0], centers.shape[1]), reducer=np.add)
    new_centers /= expr.sum(fuzzy ** m, axis=0)[:, expr.newaxis]
    centers = new_centers
  return labels
Example No. 5
 def _step():
   yp = expr.dot(x, w)
   Assert.all_eq(yp.shape, y.shape)
   
   diff = x * (yp - y)
   grad = expr.sum(diff, axis=0).glom().reshape((N_DIM, 1))
   wprime = w - grad * 1e-6
   expr.force(wprime)
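
This `_step` is one batch gradient-descent update for least squares: the per-example products x * (yp - y), summed over axis 0, equal X^T (Xw - y). A stand-alone NumPy version of the same step (the shapes and learning rate follow the snippet; the random data is an assumption):

import numpy as np

N_EXAMPLES, N_DIM = 100, 10
x = np.random.rand(N_EXAMPLES, N_DIM)
y = np.random.rand(N_EXAMPLES, 1)
w = np.random.rand(N_DIM, 1)

yp = x.dot(w)                                # predictions, shape (N_EXAMPLES, 1)
diff = x * (yp - y)                          # per-example gradient contributions
grad = diff.sum(axis=0).reshape((N_DIM, 1))  # equivalent to x.T.dot(yp - y)
wprime = w - grad * 1e-6                     # one gradient-descent step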
Example No. 6
  def kneighbors(self, X, n_neighbors=None):
    """Finds the K-neighbors of a point.

        Returns distance

        Parameters
        ----------
        X : array-like, last dimension same as that of fit data
            The new point.

        n_neighbors : int
            Number of neighbors to get (default is the value
            passed to the constructor).

        Returns
        -------
        dist : array
            Array representing the lengths to points, only present if
            return_distance=True

        ind : array
            Indices of the nearest points in the population matrix.
    """
    if n_neighbors is not None:
      self.n_neighbors = n_neighbors

    if isinstance(X, np.ndarray):
      X = expr.from_numpy(X)

    if self.algorithm in ('auto', 'brute'):
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      fit_X_broadcast = expr.reshape(self.X, (1, self.X.shape[0], self.X.shape[1]))
      distances = expr.sum((X_broadcast - fit_X_broadcast) ** 2, axis=2)
      neigh_ind = expr.argsort(distances, axis=1)
      neigh_ind = neigh_ind[:, :n_neighbors].optimized().glom()
      neigh_dist = expr.sort(distances, axis=1)
      neigh_dist = expr.sqrt(neigh_dist[:, :n_neighbors]).optimized().glom()
      return neigh_dist, neigh_ind
    else:
      results = self.X.foreach_tile(mapper_fn=_knn_mapper,
                                    kw={'X': self.X, 'Q': X,
                                        'n_neighbors': self.n_neighbors,
                                        'algorithm': self.algorithm})
      dist = None
      ind = None
      """ Get the KNN candidates for each tile of X, then find out the real KNN """
      for k, v in results.iteritems():
        if dist is None:
          dist = v[0]
          ind = v[1]
        else:
          dist = np.concatenate((dist, v[0]), axis=1)
          ind = np.concatenate((ind, v[1]), axis=1)

      mask = np.argsort(dist, axis=1)[:, :self.n_neighbors]
      new_dist = np.array([dist[i][mask[i]] for i, r in enumerate(dist)])
      new_ind = np.array([ind[i][mask[i]] for i, r in enumerate(ind)])
      return new_dist, new_ind
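
The 'brute' branch above is a full pairwise squared-distance computation followed by a sort; a stand-alone NumPy sketch of that computation (the query and data shapes are assumptions):

import numpy as np

def knn_brute(fit_X, Q, n_neighbors=3):
  # Squared distances between every query row and every fitted row.
  d = np.sum((Q[:, None, :] - fit_X[None, :, :]) ** 2, axis=2)
  neigh_ind = np.argsort(d, axis=1)[:, :n_neighbors]
  neigh_dist = np.sqrt(np.sort(d, axis=1)[:, :n_neighbors])
  return neigh_dist, neigh_ind

fit_X = np.random.rand(50, 4)
Q = np.random.rand(5, 4)
dist, ind = knn_brute(fit_X, Q)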
Example No. 7
def fit(data, labels, label_size, alpha=1.0):
  '''
  Train standard naive bayes model.
 
  Args:
    data(Expr): documents to be trained.
    labels(Expr): the correct labels of the training data.
    label_size(int): the number of different labels.
    alpha(float): alpha parameter of naive bayes model.
  '''
  labels = expr.force(labels)
  
  # calc document freq
  df = expr.reduce(data,
                   axis=0,
                   dtype_fn=lambda input: input.dtype,
                   local_reduce_fn=lambda ex, data, axis: (data > 0).sum(axis),
                   accumulate_fn=np.add,
                   tile_hint=(data.shape[1],))
  
  idf = expr.log(data.shape[0] * 1.0 / (df + 1)) + 1
   
  # The normalized frequency of a feature in a document is calculated by dividing the feature
  # frequency by the root mean square of the feature frequencies in that document.
  square_sum = expr.reduce(data,
                           axis=1,
                           dtype_fn=lambda input: input.dtype,
                           local_reduce_fn=lambda ex, data, axis: np.square(data).sum(axis),
                           accumulate_fn=np.add,
                           tile_hint=(data.shape[0],))
  
  rms = expr.sqrt(square_sum * 1.0 / data.shape[1])
  
  # calculate weight normalized Tf-Idf
  data = data / rms.reshape((data.shape[0], 1)) * idf.reshape((1, data.shape[1]))
  
  # add up all the feature vectors with the same labels
  sum_instance_by_label = expr.ndarray((label_size, data.shape[1]),
                                       dtype=np.float64, 
                                       reduce_fn=np.add,
                                       tile_hint=(label_size / len(labels.tiles), data.shape[1]))
  sum_instance_by_label = expr.shuffle(data,
                                       _sum_instance_by_label_mapper,
                                       target=sum_instance_by_label,
                                       kw={'labels': labels, 'label_size': label_size})

  # sum up all the weights for each label from the previous step
  weights_per_label = expr.sum(sum_instance_by_label, axis=1, tile_hint=(label_size,))
  
  # generate naive bayes per_label_and_feature weights
  weights_per_label_and_feature = expr.shuffle(sum_instance_by_label,
                                               _naive_bayes_mapper,
                                               kw={'weights_per_label': weights_per_label, 
                                                   'alpha':alpha})
  
  return {'scores_per_label_and_feature': weights_per_label_and_feature.force(),
          'scores_per_label': weights_per_label.force(),
          }
Example No. 8
def move(galaxy, dt):
  '''Move the bodies.
  First compute the forces and update the velocities, then update the positions.
  '''
  # `.reshape(add_tuple(a, 1))` is the spartan way of doing
  #   `ndarray[:, np.newaxis]` in numpy. While syntactically different, both
  #   add a dimension of length 1 after the other dimensions.
  #   e.g. (5, 5) becomes (5, 5, 1)

  # Calculate all distances component wise (with sign).
  dx_new = galaxy['x'].reshape(add_tuple(galaxy['x'].shape, [1]))
  dy_new = galaxy['y'].reshape(add_tuple(galaxy['y'].shape, [1]))
  dz_new = galaxy['z'].reshape(add_tuple(galaxy['z'].shape, [1]))
  dx = (galaxy['x'] - dx_new) * -1
  dy = (galaxy['y'] - dy_new) * -1
  dz = (galaxy['z'] - dz_new) * -1

  # Euclidean distances (all bodies).
  r = sqrt(dx**2 + dy**2 + dz**2)
  r = set_diagonal(r, 1.0)

  # Prevent collision.
  mask = r < 1.0
  #r = r * ~mask + 1.0 * mask
  r = spartan.map((r, mask), lambda x, m: x * ~m + 1.0 * m)

  m = galaxy['m'].reshape(add_tuple(galaxy['m'].shape, [1]))

  # Calculate the acceleration component wise.
  fx = G*m*dx / r**3
  fy = G*m*dy / r**3
  fz = G*m*dz / r**3

  # Set the force (acceleration) a body exerts on itself to zero.
  fx = set_diagonal(fx, 0.0)
  fy = set_diagonal(fy, 0.0)
  fz = set_diagonal(fz, 0.0)

  galaxy['vx'] += dt*expr.sum(fx, axis=0)
  galaxy['vy'] += dt*expr.sum(fy, axis=0)
  galaxy['vz'] += dt*expr.sum(fz, axis=0)

  galaxy['x'] += dt*galaxy['vx']
  galaxy['y'] += dt*galaxy['vy']
  galaxy['z'] += dt*galaxy['vz']
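
Underneath, this is the standard O(n^2) pairwise gravity update. A compact NumPy version of one step of the same idea (the constants, the toy arrays, and the diagonal/collision handling mirroring set_diagonal and the mask above are assumptions):

import numpy as np

G, dt = 6.67e-11, 0.1
n = 4
pos = np.random.rand(3, n)            # rows hold x, y, z for each body
vel = np.zeros((3, n))
m = np.random.rand(n) * 1e10

# Component-wise separations: d[c, i, j] = pos[c, j] - pos[c, i].
d = pos[:, None, :] - pos[:, :, None]
r = np.sqrt((d ** 2).sum(axis=0))
np.fill_diagonal(r, 1.0)              # a body is not attracted to itself
r = np.where(r < 1.0, 1.0, r)         # prevent collisions, like the mask above

# Acceleration of body i due to body j, component-wise.
f = G * m[None, None, :] * d / r[None, :, :] ** 3
for c in range(3):
  np.fill_diagonal(f[c], 0.0)         # zero out self-interaction

vel += dt * f.sum(axis=2)             # accumulate accelerations over all bodies j
pos += dt * vel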
Example No. 9
  def train(self):
    self.x = expr.eager(self.x)
    self.y = expr.eager(self.y)

    for i in range(self.iterations):
      diff = self.update()
      grad = expr.sum(diff, axis=0, tile_hint=[self.N_DIM]).glom().reshape((self.N_DIM, 1))
      self.w = self.w - grad * self.alpha
    return self.w
Example No. 10
def fn2():
  a = expr.ones((N, N))
  b = expr.ones((N, N/2))
  g = expr.dot(a, b) + expr.dot(expr.sum(a, axis=1).reshape((1, N)), b)
  t1 = time.time()
  g_opt = g.optimized()
  #g_opt.force()
  t2 = time.time()
  print t2 - t1
  print g_opt
Example No. 11
def fit(data, labels, label_size, alpha=1.0):
  '''
  Train standard naive bayes model.
 
  Args:
    data(Expr): documents to be trained.
    labels(Expr): the correct labels of the training data.
    label_size(int): the number of different labels.
    alpha(float): alpha parameter of naive bayes model.
  '''
  # calc document freq
  df = expr.reduce(data,
                   axis=0,
                   dtype_fn=lambda input: input.dtype,
                   local_reduce_fn=lambda ex, data, axis: (data > 0).sum(axis),
                   accumulate_fn=np.add)
  
  idf = expr.log(data.shape[0] * 1.0 / (df + 1)) + 1
   
  # The normalized frequency of a feature in a document is calculated by dividing the feature
  # frequency by the root mean square of the feature frequencies in that document.
  square_sum = expr.reduce(data,
                           axis=1,
                           dtype_fn=lambda input: input.dtype,
                           local_reduce_fn=lambda ex, data, axis: np.square(data).sum(axis),
                           accumulate_fn=np.add)
  
  rms = expr.sqrt(square_sum * 1.0 / data.shape[1])
  
  # calculate weight normalized Tf-Idf
  data = data / rms.reshape((data.shape[0], 1)) * idf.reshape((1, data.shape[1]))
  
  # add up all the feature vectors with the same labels
  #weights_per_label_and_feature = expr.ndarray((label_size, data.shape[1]), dtype=np.float64)
  #for i in range(label_size):
  #  i_mask = (labels == i)
  #  weights_per_label_and_feature = expr.assign(weights_per_label_and_feature, np.s_[i, :], expr.sum(data[i_mask, :], axis=0))
  weights_per_label_and_feature = expr.shuffle(expr.retile(data, tile_hint=util.calc_tile_hint(data, axis=0)),
                                               _sum_instance_by_label_mapper,
                                               target=expr.ndarray((label_size, data.shape[1]), dtype=np.float64, reduce_fn=np.add),
                                               kw={'labels': labels, 'label_size': label_size},
                                               cost_hint={hash(labels):{'00':0, '01':np.prod(labels.shape)}})

  # sum up all the weights for each label from the previous step
  weights_per_label = expr.sum(weights_per_label_and_feature, axis=1)
  
  # generate naive bayes per_label_and_feature weights
  weights_per_label_and_feature = expr.log((weights_per_label_and_feature + alpha) / 
                                           (weights_per_label.reshape((weights_per_label.shape[0], 1)) + 
                                            alpha * weights_per_label_and_feature.shape[1]))

  return {'scores_per_label_and_feature': weights_per_label_and_feature.optimized().force(),
          'scores_per_label': weights_per_label.optimized().force(),
          }
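
The pipeline here is TF-IDF weighting followed by Laplace-smoothed per-label sums; a dense NumPy sketch of the same arithmetic, with the per-label summation written as an explicit loop (the toy corpus is an assumption):

import numpy as np

def naive_bayes_fit_np(data, labels, label_size, alpha=1.0):
  # Inverse document frequency per feature.
  df = (data > 0).sum(axis=0)
  idf = np.log(data.shape[0] * 1.0 / (df + 1)) + 1
  # Normalize each document by the root mean square of its feature frequencies.
  rms = np.sqrt(np.square(data).sum(axis=1) * 1.0 / data.shape[1])
  data = data / rms.reshape((data.shape[0], 1)) * idf.reshape((1, data.shape[1]))
  # Add up the weighted feature vectors that share a label.
  sums = np.zeros((label_size, data.shape[1]))
  for i in range(label_size):
    sums[i, :] = data[labels == i, :].sum(axis=0)
  per_label = sums.sum(axis=1)
  # Laplace-smoothed log weights per label and feature.
  scores = np.log((sums + alpha) /
                  (per_label.reshape((label_size, 1)) + alpha * sums.shape[1]))
  return {'scores_per_label_and_feature': scores, 'scores_per_label': per_label}

data = np.random.randint(1, 5, size=(8, 6)).astype(np.float64)
labels = np.random.randint(0, 2, size=8)
model = naive_bayes_fit_np(data, labels, label_size=2)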
Example No. 12
  def fit(self, X, y):
    """ Transform to distarray if it's numpy array"""
    if isinstance(X, np.ndarray):
      X = expr.make_from_numpy(X)
    if isinstance(y, np.ndarray):
      y = expr.make_from_numpy(y)
    
    X, y, X_mean, y_mean, X_std = self._center_data(
        X, y, self.fit_intercept, self.normalize)
    
    N_DIM = X.shape[1]
    self._coef = np.random.randn(N_DIM, 1) 

    for i in range(self.iterations):
      yp = expr.dot(X, self._coef)
      print expr.sum((yp - y) ** 2).glom()
      diff = X * (yp - y)
      grad = expr.sum(diff, axis=0).glom().reshape((N_DIM, 1))
      self._coef = self._coef - grad * 1e-6  
    return self
Example No. 13
  def _get_norm_of_each_item(self, rating_table):
    """Get norm of each item vector.
    For each Item, caculate the norm the item vector.
    Parameters
    ----------
    rating_table : Spartan matrix of shape(M, N). 
                   Each column represents the rating of the item.

    Returns
    ---------
    item_norm:  Spartan matrix of shape(N,).
                item_norm[i] equals || rating_table[:,i] || 

    """
    return expr.sqrt(expr.sum(expr.multiply(rating_table, rating_table), axis=0))
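
For a dense matrix this is just a column-wise L2 norm; a one-line NumPy check (the toy ratings are an assumption):

import numpy as np

rating_table = np.random.rand(5, 3)   # rows are users, columns are items
item_norm = np.sqrt(np.sum(rating_table * rating_table, axis=0))
# Equivalent to np.linalg.norm(rating_table, axis=0).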
Example No. 14
  def _get_norm_of_each_item(self, rating_table):
    """Get norm of each item vector.
    For each Item, caculate the norm the item vector.
    Parameters
    ----------
    rating_table : Spartan matrix of shape(M, N). 
                   Each column represents the rating of the item.

    Returns
    ---------
    item_norm:  Spartan matrix of shape(N,).
                item_norm[i] equals || rating_table[:,i] || 

    """
    ctx = blob_ctx.get()
    if isinstance(rating_table, array.distarray.DistArray):
      rating_table = expr.lazify(rating_table)
    res = expr.sqrt(expr.sum(expr.multiply(rating_table, rating_table), axis=0, 
                             tile_hint=(rating_table.shape[1] / ctx.num_workers, )))
    return res.force()
Example No. 15
def fit(data, labels, T=50, la=1.0):
  '''
  Train an SVM model using the disdca (2013) algorithm.
 
  Args:
    data(Expr): points to be trained.
    labels(Expr): the correct labels of the training data.
    T(int): max training iterations.
    la(float): lambda parameter of this SVM model.
  '''
  w = expr.zeros((data.shape[1], 1), dtype=np.float64)
  alpha = expr.zeros((data.shape[0], 1), dtype=np.float64)
  for i in range(T):
    alpha = expr.shuffle(expr.retile(data, tile_hint=util.calc_tile_hint(data, axis=0)),
                         _svm_mapper, 
                         kw={'labels': labels, 'alpha': alpha, 'w': w, 'lambda_n': la * data.shape[0]},
                         shape_hint=alpha.shape, 
                         cost_hint={ hash(labels) : {'00': 0, '01': np.prod(labels.shape)}, hash(alpha) : {'00': 0, '01': np.prod(alpha.shape)} })
    w = expr.sum(data * alpha * 1.0 / la / data.shape[0], axis=0).reshape((data.shape[1], 1))
    w = w.optimized()
  return w
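
The weight recovery on the last line of the loop is the usual dual-to-primal map w = (1 / (la * n)) * sum_i alpha_i * x_i. A NumPy view of just that line (_svm_mapper, which updates alpha, is not shown above, so alpha here is only a placeholder):

import numpy as np

n, d, la = 100, 5, 1.0
data = np.random.rand(n, d)
alpha = np.random.rand(n, 1)          # placeholder for the dual variables
w = np.sum(data * alpha * 1.0 / la / n, axis=0).reshape((d, 1))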
Example No. 16
def als(A, la=0.065, alpha=40, implicit_feedback=False, num_features=20, num_iter=10, M=None):
  '''
  compute the factorization A = U M' using the alternating least-squares (ALS) method.

  where `A` is the "ratings" matrix which maps from a user and item to a rating score,
        `U` and `M` are the factor matrices, which represent user and item preferences.
  Args:
    A(Expr or DistArray): the rating matrix which maps from a user and item to a rating score.
    la(float): the parameter of the als.
    alpha(int): confidence parameter used on implicit feedback.
    implicit_feedback(bool): whether using implicit_feedback method for als.
    num_features(int): dimension of the feature space.
    num_iter(int): max iteration to run.
  '''
  num_users = A.shape[0]
  num_items = A.shape[1]

  AT = expr.transpose(A)

  avg_rating = expr.sum(A, axis=0) * 1.0 / expr.count_nonzero(A, axis=0)

  M = expr.rand(num_items, num_features)
  M = expr.assign(M, np.s_[:, 0], avg_rating.reshape((avg_rating.shape[0], 1)))

  #A = expr.retile(A, tile_hint=util.calc_tile_hint(A, axis=0))
  #AT = expr.retile(AT, tile_hint=util.calc_tile_hint(AT, axis=0))
  for i in range(num_iter):
    # Recomputing U
    shape = (num_users, num_features)
    U = expr.outer((A, M), (0, None), fn=_solve_U_or_M_mapper,
                   fn_kw={'la': la, 'alpha': alpha,
                          'implicit_feedback': implicit_feedback, 'shape': shape},
                   shape=shape, dtype=np.float)
    # Recomputing M
    shape = (num_items, num_features)
    M = expr.outer((AT, U), (0, None), fn=_solve_U_or_M_mapper,
                   fn_kw={'la': la, 'alpha': alpha,
                          'implicit_feedback': implicit_feedback, 'shape': shape},
                   shape=shape, dtype=np.float)
  return U, M
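
Each expr.outer call solves an independent regularized least-squares problem per user (or item). A NumPy sketch of that per-row solve for the explicit-feedback case; the actual _solve_U_or_M_mapper is not shown above, so the regularization form used here (lambda scaled by the number of ratings) is an assumption for illustration:

import numpy as np

def solve_factor(A, M, la=0.065):
  # For each row of the ratings matrix A, solve a ridge regression against
  # the current factors M over the entries that row actually rated.
  num_rows, num_features = A.shape[0], M.shape[1]
  U = np.zeros((num_rows, num_features))
  for i in range(num_rows):
    rated = A[i] != 0                 # 0 is treated as "not rated"
    M_i = M[rated]
    n_i = max(rated.sum(), 1)
    lhs = M_i.T.dot(M_i) + la * n_i * np.eye(num_features)
    rhs = M_i.T.dot(A[i, rated])
    U[i] = np.linalg.solve(lhs, rhs)
  return U

A = np.random.randint(0, 6, size=(10, 8)).astype(np.float64)
M = np.random.rand(8, 4)
U = solve_factor(A, M)                # recompute user factors
M_new = solve_factor(A.T, U)          # then recompute item factors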
Example No. 17
  def test_optimization_reduced(self):
    na = np.random.rand(1000, 1000)
    nb = np.random.rand(1000, 1000)
    a = expr.from_numpy(na)
    b = expr.from_numpy(nb)

    c = a - b
    d = a + c
    f = c[200:900, 200:900]
    g = d[200:900, 200:900]
    h = f - g
    i = f + h
    j = h[100:500, 100:500]
    k = i[100:500, 100:500]
    l = expr.dot(j, k)
    m = j + k
    n = k - l
    o = n - m
    q = n + o
    r = q - m
    s = expr.sum(r)

    nc = na - nb
    nd = na + nc
    nf = nc[200:900, 200:900]
    ng = nd[200:900, 200:900]
    nh = nf - ng
    ni = nf + nh
    nj = nh[100:500, 100:500]
    nk = ni[100:500, 100:500]
    nl = np.dot(nj, nk)
    nm = nj + nk
    nn = nk - nl
    no = nn - nm
    nq = nn + no
    nr = nq - nm
    ns = np.sum(nr)

    # Our sum seems to reduce precision
    Assert.all_eq(ns, s.optimized().glom(), tolerance = 1e-6)
Example No. 18
def als(A, la=0.065, alpha=40, implicit_feedback=False, num_features=20, num_iter=10):
  '''
  compute the factorization A = U M' using the alternating least-squares (ALS) method.
  
  where `A` is the "ratings" matrix which maps from a user and item to a rating score, 
        `U` and `M` are the factor matrices, which represent user and item preferences.
  Args:
    A(Expr or DistArray): the rating matrix which maps from a user and item to a rating score.
    la(float): the parameter of the als.
    alpha(int): confidence parameter used on implicit feedback.
    implicit_feedback(bool): whether using implicit_feedback method for als.
    num_features(int): dimension of the feature space.
    num_iter(int): max iteration to run.
  '''
  A = expr.force(A)
  AT = expr.shuffle(expr.ndarray((A.shape[1], A.shape[0]), dtype=A.dtype,
                                 tile_hint=(A.shape[1] / len(A.tiles), A.shape[0])),
                    _transpose_mapper, kw={'orig_array': A})
  
  num_items = A.shape[1]
  
  avg_rating = expr.sum(A, axis=0, tile_hint=(num_items / len(A.tiles),)) * 1.0 / \
               expr.count_nonzero(A, axis=0, tile_hint=(num_items / len(A.tiles),))
  
  M = expr.shuffle(expr.ndarray((num_items, num_features), 
                                tile_hint=(num_items / len(A.tiles), num_features)), 
                   _init_M_mapper, kw={'avg_rating': avg_rating})
  #util.log_warn('avg_rating:%s M:%s', avg_rating.glom(), M.glom())
  
  for i in range(num_iter):
    # Recomputing U
    U = expr.shuffle(A, _solve_U_or_M_mapper, kw={'U_or_M': M, 'la': la, 'alpha': alpha, 'implicit_feedback': implicit_feedback})
    # Recomputing M
    M = expr.shuffle(AT, _solve_U_or_M_mapper, kw={'U_or_M': U, 'la': la, 'alpha': alpha, 'implicit_feedback': implicit_feedback})
    
  return U, M
Example No. 19
  def test_linear_reg(self):
    _skip_if_travis()
    N_EXAMPLES =  10 * 1000 * 1000 * self.ctx.num_workers
    N_DIM = 10
    x = expr.rand(N_EXAMPLES, N_DIM, 
                  tile_hint=(N_EXAMPLES / self.ctx.num_workers, N_DIM)).astype(np.float32)
    
    y = expr.rand(N_EXAMPLES, 1, 
                  tile_hint=(N_EXAMPLES / self.ctx.num_workers, 1)).astype(np.float32)
    
    w = np.random.rand(N_DIM, 1).astype(np.float32)
    x = expr.eager(x)
    y = expr.eager(y)
    
    start = time.time()

    for i in range(5):
      yp = expr.dot(x, w)
      diff = x * (yp - y)
      grad = expr.sum(diff, axis=0, tile_hint=[N_DIM]).glom().reshape((N_DIM, 1))
      w = w - grad * 1e-6

    cost = time.time() - start
    self._verify_cost("linear_reg", cost)
Example No. 20
def spectral_cluster(points, k=10, num_iter=10, similarity_measurement='rbf'):
  '''
  Cluster data points using the spectral clustering method (k-means on the spectral embedding).

  Args:
    points(Expr or DistArray): the data points to be clustered.
    k(int): the number of clusters we need to generate.
    num_iter(int): the max number of iterations that kmeans clustering method runs. 
    similarity_measurement(str): distance method used to measure similarity between two points.
  '''  
  # calculate similarity for each pair of points to generate the adjacency matrix A
  A = expr.shuffle(points, _row_similarity_mapper, kw={'similarity_measurement': similarity_measurement})
  
  num_dims = A.shape[1]
  
  # Construct the diagonal matrix D
  D = expr.sum(A, axis=1, tile_hint=(A.shape[0],))
  
  # Calculate the normalized Laplacian of the form: L = D^(-0.5)AD^(-0.5)
  L = expr.shuffle(A, _laplacian_mapper, kw={'D': D})
  
  # Perform eigen-decomposition using Lanczos solver
  overshoot = min(k * 2, num_dims) 
  d, U = lanczos.solve(L, L, overshoot, True)
  U = U[:, 0:k]
  
  # Generate initial centers: for each column of U, pick the row that holds that
  # column's maximum value.
  init_clusters = U[np.argmax(U, axis=0)]
  
  # Run kmeans clustering with init_clusters
  kmeans = KMeans(k, num_iter)
  U = expr.from_numpy(U)
  centers, labels = kmeans.fit(U, init_clusters)
  
  return labels
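
A minimal NumPy sketch of the same pipeline up to the embedding: build an affinity matrix, form the normalized Laplacian L = D^(-1/2) A D^(-1/2), and keep the top-k eigenvectors (the RBF affinity stands in for _row_similarity_mapper and a dense eigensolver stands in for the Lanczos solver, both assumptions):

import numpy as np

def spectral_embedding(points, k=2, gamma=1.0):
  # RBF affinity matrix between all pairs of points.
  sq = np.sum((points[:, None, :] - points[None, :, :]) ** 2, axis=2)
  A = np.exp(-gamma * sq)
  # Normalized Laplacian of the form L = D^(-1/2) A D^(-1/2).
  d_inv_sqrt = 1.0 / np.sqrt(A.sum(axis=1))
  L = A * d_inv_sqrt[:, None] * d_inv_sqrt[None, :]
  # Top-k eigenvectors (np.linalg.eigh returns eigenvalues in ascending order).
  w, V = np.linalg.eigh(L)
  return V[:, -k:]

points = np.random.rand(12, 2)
U = spectral_embedding(points, k=3)   # rows of U are then clustered with k-means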
Example No. 21
def center_data(X, y, fit_intercept, normalize=False):
  """
  Centers data to have mean zero along axis 0. This is here because
  nearly all linear models will want their data to be centered.
  """
  if fit_intercept:
    X_mean = X.mean(axis = 0)
    X_mean = expr.reshape(X_mean, (1, X_mean.shape[0]))
    X -= X_mean
    
    if normalize:
      X_std = expr.sqrt(expr.sum(X ** 2, axis=0)).force()
      X_std[X_std == 0] = 1
      X /= X_std
    else:
      X_std = expr.ones(X.shape[1])
    
    y_mean = y.mean(axis=0)
    y -= y_mean
  else:
    X_mean = expr.zeros(X.shape[1])
    X_std = expr.ones(X.shape[1])
    y_mean = 0. if y.ndim == 1 else expr.zeros(y.shape[1], dtype=X.dtype)
  return X, y, X_mean, y_mean, X_std
Example No. 22
    def kneighbors(self, X, n_neighbors=None):
        """Finds the K-neighbors of a point.

        Returns distance

        Parameters
        ----------
        X : array-like, last dimension same as that of fit data
            The new point.

        n_neighbors : int
            Number of neighbors to get (default is the value
            passed to the constructor).

        Returns
        -------
        dist : array
            Array representing the lengths to points, only present if
            return_distance=True

        ind : array
            Indices of the nearest points in the population matrix.
    """
        if n_neighbors is not None:
            self.n_neighbors = n_neighbors

        if isinstance(X, np.ndarray):
            X = expr.from_numpy(X)

        if self.algorithm in ('auto', 'brute'):
            X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
            fit_X_broadcast = expr.reshape(
                self.X, (1, self.X.shape[0], self.X.shape[1]))
            distances = expr.sum((X_broadcast - fit_X_broadcast)**2, axis=2)
            neigh_ind = expr.argsort(distances, axis=1)
            neigh_ind = neigh_ind[:, :n_neighbors].optimized().glom()
            neigh_dist = expr.sort(distances, axis=1)
            neigh_dist = expr.sqrt(
                neigh_dist[:, :n_neighbors]).optimized().glom()
            return neigh_dist, neigh_ind
        else:
            results = self.X.foreach_tile(mapper_fn=_knn_mapper,
                                          kw={
                                              'X': self.X,
                                              'Q': X,
                                              'n_neighbors': self.n_neighbors,
                                              'algorithm': self.algorithm
                                          })
            dist = None
            ind = None
            """ Get the KNN candidates for each tile of X, then find out the real KNN """
            for k, v in results.iteritems():
                if dist is None:
                    dist = v[0]
                    ind = v[1]
                else:
                    dist = np.concatenate((dist, v[0]), axis=1)
                    ind = np.concatenate((ind, v[1]), axis=1)

            mask = np.argsort(dist, axis=1)[:, :self.n_neighbors]
            new_dist = np.array([dist[i][mask[i]] for i, r in enumerate(dist)])
            new_ind = np.array([ind[i][mask[i]] for i, r in enumerate(ind)])
            return new_dist, new_ind
Example No. 23
def simulate(ts_all, te_all, lamb_all, num_paths):
    """Range over a number of independent products.

  :param ts_all: DistArray
    Start dates for a series of swaptions.
  :param te_all: DistArray
    End dates for a series of swaptions.
  :param lamb_all: DistArray
    Parameter values for a series of swaptions.
  :param num_paths: Int
    Number of paths used in random walk.

  :rtype: DistArray

  """
    swaptions = []
    i = 0
    for ts_a, te, lamb in zip(ts_all, te_all, lamb_all):
        for ts in ts_a:
            # start = time()
            print i
            time_structure = arange(None, 0, ts + DELTA, DELTA)
            maturity_structure = arange(None, 0, te, DELTA)

            ############# MODEL ###############
            # Variance reduction technique - Antithetic Variates.
            eps_tmp = randn(time_structure.shape[0] - 1, num_paths)
            eps = concatenate(eps_tmp, -eps_tmp, 1)

            # Forward LIBOR rates for the construction of the spot measure.
            f_kk = zeros((time_structure.shape[0], 2 * num_paths))
            f_kk = assign(f_kk, np.s_[0, :], F_0)

            # Plane kxN of simulated LIBOR rates.
            f_kn = ones((maturity_structure.shape[0], 2 * num_paths)) * F_0

            # Simulations of the plane f_kn for each time step.
            for t in xrange(1, time_structure.shape[0]):
                f_kn_new = f_kn[1:, :] * exp(
                    lamb * mu(f_kn, lamb) * DELTA - 0.5 * lamb * lamb * DELTA + lamb * eps[t - 1, :] * sqrt(DELTA)
                )
                f_kk = assign(f_kk, np.s_[t, :], f_kn_new[0])
                f_kn = f_kn_new

            ############## PRODUCT ###############
            # Value of zero coupon bonds.
            zcb = ones((int((te - ts) / DELTA) + 1, 2 * num_paths))
            f_kn_modified = 1 + DELTA * f_kn
            for j in xrange(zcb.shape[0] - 1):
                zcb = assign(zcb, np.s_[j + 1], zcb[j] / f_kn_modified[j])

            # Swaption price at maturity.
            last_row = zcb[zcb.shape[0] - 1, :].reshape((20,))
            swap_ts = maximum(1 - last_row - THETA * DELTA * expr.sum(zcb[1:], 0), 0)

            # Spot measure used for discounting.
            b_ts = ones((2 * num_paths,))
            tmp = 1 + DELTA * f_kk
            for j in xrange(int(ts / DELTA)):
                b_ts *= tmp[j].reshape((20,))

            # Swaption price at time 0.
            swaption = swap_ts / b_ts

            # Save expected value in bps and std.
            me = mean((swaption[0:num_paths] + swaption[num_paths:]) / 2) * 10000
            st = std((swaption[0:num_paths] + swaption[num_paths:]) / 2) / sqrt(num_paths) * 10000

            swaptions.append([me.optimized().force(), st.optimized().force()])
            # print time() - start
            i += 1
    return swaptions
Example No. 24
def fn3():
    a = expr.ones((N, N))
    b = expr.ones((N, N / 2))
    g = expr.dot(a, b) + expr.dot(expr.sum(a, axis=1).reshape((1, N)), b)
    return g
Example No. 25
def gen_reduce(a):
    if hasattr(a, 'shape') and len(a.shape) > 0:
        return [expr.sum(a, axis=random.randrange(len(a.shape)))]

    return [a]
Example No. 26
def als(A,
        la=0.065,
        alpha=40,
        implicit_feedback=False,
        num_features=20,
        num_iter=10,
        M=None):
    '''
  compute the factorization A = U M' using the alternating least-squares (ALS) method.

  where `A` is the "ratings" matrix which maps from a user and item to a rating score,
        `U` and `M` are the factor matrices, which represent user and item preferences.
  Args:
    A(Expr or DistArray): the rating matrix which maps from a user and item to a rating score.
    la(float): the parameter of the als.
    alpha(int): confidence parameter used on implicit feedback.
    implicit_feedback(bool): whether using implicit_feedback method for als.
    num_features(int): dimension of the feature space.
    num_iter(int): max iteration to run.
  '''
    num_users = A.shape[0]
    num_items = A.shape[1]

    AT = expr.transpose(A)

    avg_rating = expr.sum(A, axis=0) * 1.0 / expr.count_nonzero(A, axis=0)

    M = expr.rand(num_items, num_features)
    M = expr.assign(M, np.s_[:, 0], avg_rating.reshape(
        (avg_rating.shape[0], 1)))

    #A = expr.retile(A, tile_hint=util.calc_tile_hint(A, axis=0))
    #AT = expr.retile(AT, tile_hint=util.calc_tile_hint(AT, axis=0))
    for i in range(num_iter):
        # Recomputing U
        shape = (num_users, num_features)
        U = expr.outer(
            (A, M), (0, None),
            fn=_solve_U_or_M_mapper,
            fn_kw={
                'la': la,
                'alpha': alpha,
                'implicit_feedback': implicit_feedback,
                'shape': shape
            },
            shape=shape,
            dtype=np.float)
        # Recomputing M
        shape = (num_items, num_features)
        M = expr.outer(
            (AT, U), (0, None),
            fn=_solve_U_or_M_mapper,
            fn_kw={
                'la': la,
                'alpha': alpha,
                'implicit_feedback': implicit_feedback,
                'shape': shape
            },
            shape=shape,
            dtype=np.float)
    return U, M
Example No. 27
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
    '''
  Cluster data points using the fuzzy k-means clustering method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the parameter of fuzzy kmeans.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
    points = expr.force(points)
    num_dim = points.shape[1]
    if centers is None:
        centers = expr.rand(k, num_dim)

    labels = expr.zeros((points.shape[0], ), dtype=np.int)

    for iter in range(num_iter):
        centers = expr.as_array(centers)
        points_broadcast = expr.reshape(points,
                                        (points.shape[0], 1, points.shape[1]))
        centers_broadcast = expr.reshape(
            centers, (1, centers.shape[0], centers.shape[1]))
        distances = expr.sum(expr.square(points_broadcast - centers_broadcast),
                             axis=2)
        # Add a small constant to avoid dividing by zero
        distances = distances + 0.00000000001
        util.log_info('distances shape %s' % str(distances.shape))
        distances_broadcast = expr.reshape(
            distances, (distances.shape[0], 1, distances.shape[1]))
        distances_broadcast2 = expr.reshape(
            distances, (distances.shape[0], distances.shape[1], 1))
        prob = 1.0 / expr.sum(expr.power(
            distances_broadcast / distances_broadcast2, 2.0 / (m - 1)),
                              axis=2)
        prob.force()
        counts = expr.sum(prob, axis=0)
        counts = expr.reshape(counts, (counts.shape[0], 1))
        labels = expr.argmax(prob, axis=1)
        centers = expr.sum(
            expr.reshape(points, (points.shape[0], 1, points.shape[1])) *
            expr.reshape(prob, (prob.shape[0], prob.shape[1], 1)),
            axis=0)

        # We assume that the centers matrix is small enough to be handled on the master.
        counts = counts.glom()
        centers = centers.glom()
        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(k)

        if np.any(zcount_indices):
            # One or more centroids may not have any points assigned to them, which results in their
            # position being the zero-vector.  We reseed these centroids with new random values
            # and set their counts to 1 to avoid dividing by zero.
            counts[zcount_indices, :] = 1
            centers[zcount_indices, :] = np.random.rand(
                np.count_nonzero(zcount_indices), num_dim)

        centers = centers / counts
    return labels
Example No. 28
 def train(self):
   for i in range(self.iterations):
     diff = self.update()
     grad = expr.sum(diff, axis=0).optimized().glom().reshape((self.N_DIM, 1))
     self.w = self.w - grad * self.alpha
   return self.w
Example No. 29
def simulate(ts_all, te_all, lamb_all, num_paths):
  '''Range over a number of independent products.

  :param ts_all: DistArray
    Start dates for a series of swaptions.
  :param te_all: DistArray
    End dates for a series of swaptions.
  :param lamb_all: DistArray
    Parameter values for a series of swaptions.
  :param num_paths: Int
    Number of paths used in random walk.

  :rtype: DistArray

  '''
  swaptions = []
  i = 0
  for ts_a, te, lamb in zip(ts_all, te_all, lamb_all):
    for ts in ts_a:
      #start = time()
      print i
      time_structure = arange(None, 0, ts + DELTA, DELTA)
      maturity_structure = arange(None, 0, te, DELTA)

      ############# MODEL ###############
      # Variance reduction technique - Antithetic Variates.
      eps_tmp = randn(time_structure.shape[0] - 1, num_paths)
      eps = concatenate(eps_tmp, -eps_tmp, 1)

      # Forward LIBOR rates for the construction of the spot measure.
      f_kk = zeros((time_structure.shape[0], 2*num_paths))
      f_kk = assign(f_kk, np.s_[0, :], F_0)

      # Plane kxN of simulated LIBOR rates.
      f_kn = ones((maturity_structure.shape[0], 2*num_paths))*F_0

      # Simulations of the plane f_kn for each time step.
      for t in xrange(1, time_structure.shape[0]):
        f_kn_new = f_kn[1:, :]*exp(lamb*mu(f_kn, lamb)*DELTA-0.5*lamb*lamb *
            DELTA + lamb*eps[t - 1, :]*sqrt(DELTA))
        f_kk = assign(f_kk, np.s_[t, :], f_kn_new[0])
        f_kn = f_kn_new

      ############## PRODUCT ###############
      # Value of zero coupon bonds.
      zcb = ones((int((te-ts)/DELTA)+1, 2*num_paths))
      f_kn_modified = 1 + DELTA*f_kn
      for j in xrange(zcb.shape[0] - 1):
        zcb = assign(zcb, np.s_[j + 1], zcb[j] / f_kn_modified[j])

      # Swaption price at maturity.
      last_row = zcb[zcb.shape[0] - 1, :].reshape((20, ))
      swap_ts = maximum(1 - last_row - THETA*DELTA*expr.sum(zcb[1:], 0), 0)

      # Spot measure used for discounting.
      b_ts = ones((2*num_paths, ))
      tmp = 1 + DELTA * f_kk
      for j in xrange(int(ts/DELTA)):
        b_ts *= tmp[j].reshape((20, ))

      # Swaption price at time 0.
      swaption = swap_ts/b_ts

      # Save expected value in bps and std.
      me = mean((swaption[0:num_paths] + swaption[num_paths:])/2) * 10000
      st = std((swaption[0:num_paths] + swaption[num_paths:])/2)/sqrt(num_paths)*10000

      swaptions.append([me.optimized().force(), st.optimized().force()])
      #print time() - start
      i += 1
  return swaptions
Example No. 30
    def fit(self, X, centers=None, implementation='map2'):
        """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    """
        num_dim = X.shape[1]
        num_points = X.shape[0]

        labels = expr.zeros((num_points, 1), dtype=np.int)

        if implementation == 'map2':
            if centers is None:
                centers = np.random.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                labels = expr.map2(X,
                                   0,
                                   fn=kmeans_map2_dist_mapper,
                                   fn_kw={"centers": centers},
                                   shape=(X.shape[0], ))

                counts = expr.map2(labels,
                                   0,
                                   fn=kmeans_count_mapper,
                                   fn_kw={'centers_count': self.n_clusters},
                                   shape=(centers.shape[0], ))
                new_centers = expr.map2(
                    (X, labels), (0, 0),
                    fn=kmeans_center_mapper,
                    fn_kw={'centers_count': self.n_clusters},
                    shape=(centers.shape[0], centers.shape[1]))
                counts = counts.optimized().glom()
                centers = new_centers.optimized().glom()

                # If any centroids don't have any points assigned to them.
                zcount_indices = (counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # Set the counts to 1 to avoid dividing by zero.
                    counts[zcount_indices] = 1
                    centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                centers = centers / counts.reshape(centers.shape[0], 1)
            return centers, labels

        elif implementation == 'outer':
            if centers is None:
                centers = expr.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                labels = expr.outer((X, centers), (0, None),
                                    fn=kmeans_outer_dist_mapper,
                                    shape=(X.shape[0], ))
                #labels = expr.argmin(distances, axis=1)
                counts = expr.map2(labels,
                                   0,
                                   fn=kmeans_count_mapper,
                                   fn_kw={'centers_count': self.n_clusters},
                                   shape=(centers.shape[0], ))
                new_centers = expr.map2(
                    (X, labels), (0, 0),
                    fn=kmeans_center_mapper,
                    fn_kw={'centers_count': self.n_clusters},
                    shape=(centers.shape[0], centers.shape[1]))
                counts = counts.optimized().glom()
                centers = new_centers.optimized().glom()

                # If any centroids don't have any points assigned to them.
                zcount_indices = (counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # Set the counts to 1 to avoid dividing by zero.
                    counts[zcount_indices] = 1
                    centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                centers = centers / counts.reshape(centers.shape[0], 1)
                centers = expr.from_numpy(centers)
            return centers, labels
        elif implementation == 'broadcast':
            if centers is None:
                centers = expr.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                util.log_warn("k_means_ %d %d", i, time.time())
                X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
                centers_broadcast = expr.reshape(
                    centers, (1, centers.shape[0], centers.shape[1]))
                distances = expr.sum(expr.square(X_broadcast -
                                                 centers_broadcast),
                                     axis=2)
                labels = expr.argmin(distances, axis=1)
                center_idx = expr.arange((1, centers.shape[0]))
                matches = expr.reshape(labels,
                                       (labels.shape[0], 1)) == center_idx
                matches = matches.astype(np.int64)
                counts = expr.sum(matches, axis=0)
                centers = expr.sum(
                    X_broadcast *
                    expr.reshape(matches,
                                 (matches.shape[0], matches.shape[1], 1)),
                    axis=0)

                counts = counts.optimized().glom()
                centers = centers.optimized().glom()

                # If any centroids don't have any points assigned to them.
                zcount_indices = (counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # Set the counts to 1 to avoid dividing by zero.
                    counts[zcount_indices] = 1
                    centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                centers = centers / counts.reshape(centers.shape[0], 1)
                centers = expr.from_numpy(centers)
            return centers, labels
        elif implementation == 'shuffle':
            if centers is None:
                centers = np.random.rand(self.n_clusters, num_dim)

            for i in range(self.n_iter):
                # Reset them to zero.
                new_centers = expr.ndarray((self.n_clusters, num_dim),
                                           reduce_fn=lambda a, b: a + b)
                new_counts = expr.ndarray((self.n_clusters, 1),
                                          dtype=np.int,
                                          reduce_fn=lambda a, b: a + b)

                _ = expr.shuffle(X,
                                 _find_cluster_mapper,
                                 kw={
                                     'd_pts': X,
                                     'old_centers': centers,
                                     'new_centers': new_centers,
                                     'new_counts': new_counts,
                                     'labels': labels
                                 },
                                 shape_hint=(1, ),
                                 cost_hint={
                                     hash(labels): {
                                         '00': 0,
                                         '01': np.prod(labels.shape)
                                     }
                                 })
                _.force()

                new_counts = new_counts.glom()
                new_centers = new_centers.glom()

                # If any centroids don't have any points assigned to them.
                zcount_indices = (new_counts == 0).reshape(self.n_clusters)

                if np.any(zcount_indices):
                    # One or more centroids may not have any points assigned to them,
                    # which results in their position being the zero-vector.  We reseed these
                    # centroids with new random values.
                    n_points = np.count_nonzero(zcount_indices)
                    # Set the counts to 1 to avoid dividing by zero.
                    new_counts[zcount_indices] = 1
                    new_centers[zcount_indices, :] = np.random.randn(
                        n_points, num_dim)

                new_centers = new_centers / new_counts
                centers = new_centers

            return centers, labels
Example No. 31
def gen_reduce(a):
  if hasattr(a, 'shape') and len(a.shape) > 0:
    return [expr.sum(a, axis=random.randrange(len(a.shape)))]

  return [a]
Example No. 32
def fit(data, labels, label_size, alpha=1.0):
    '''
  Train standard naive bayes model.
 
  Args:
    data(Expr): documents to be trained.
    labels(Expr): the correct labels of the training data.
    label_size(int): the number of different labels.
    alpha(float): alpha parameter of naive bayes model.
  '''
    # calc document freq
    df = expr.reduce(data,
                     axis=0,
                     dtype_fn=lambda input: input.dtype,
                     local_reduce_fn=lambda ex, data, axis:
                     (data > 0).sum(axis),
                     accumulate_fn=np.add)

    idf = expr.log(data.shape[0] * 1.0 / (df + 1)) + 1

    # The normalized frequency of a feature in a document is calculated by dividing the feature
    # frequency by the root mean square of the feature frequencies in that document.
    square_sum = expr.reduce(
        data,
        axis=1,
        dtype_fn=lambda input: input.dtype,
        local_reduce_fn=lambda ex, data, axis: np.square(data).sum(axis),
        accumulate_fn=np.add)

    rms = expr.sqrt(square_sum * 1.0 / data.shape[1])

    # calculate weight normalized Tf-Idf
    data = data / rms.reshape((data.shape[0], 1)) * idf.reshape(
        (1, data.shape[1]))

    # add up all the feature vectors with the same labels
    #weights_per_label_and_feature = expr.ndarray((label_size, data.shape[1]), dtype=np.float64)
    #for i in range(label_size):
    #  i_mask = (labels == i)
    #  weights_per_label_and_feature = expr.assign(weights_per_label_and_feature, np.s_[i, :], expr.sum(data[i_mask, :], axis=0))
    weights_per_label_and_feature = expr.shuffle(
        expr.retile(data, tile_hint=util.calc_tile_hint(data, axis=0)),
        _sum_instance_by_label_mapper,
        target=expr.ndarray((label_size, data.shape[1]),
                            dtype=np.float64,
                            reduce_fn=np.add),
        kw={
            'labels': labels,
            'label_size': label_size
        },
        cost_hint={hash(labels): {
                       '00': 0,
                       '01': np.prod(labels.shape)
                   }})

    # sum up all the weights for each label from the previous step
    weights_per_label = expr.sum(weights_per_label_and_feature, axis=1)

    # generate naive bayes per_label_and_feature weights
    weights_per_label_and_feature = expr.log(
        (weights_per_label_and_feature + alpha) /
        (weights_per_label.reshape((weights_per_label.shape[0], 1)) +
         alpha * weights_per_label_and_feature.shape[1]))

    return {
        'scores_per_label_and_feature':
        weights_per_label_and_feature.optimized().force(),
        'scores_per_label':
        weights_per_label.optimized().force(),
    }
Example No. 33
def fn3():
  a = expr.ones((N, N))
  b = expr.ones((N, N/2))
  g = expr.dot(a, b) + expr.dot(expr.sum(a, axis=1).reshape((1, N)), b)
  return g
Example No. 34
  def fit(self, X, centers=None, implementation='outer'):
    """Compute k-means clustering.

    Parameters
    ----------
    X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
    centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
    """
    num_dim = X.shape[1]
    num_points = X.shape[0]

    labels = expr.zeros((num_points, 1), dtype=np.int)

    if implementation == 'map2':
      if centers is None:
        centers = np.random.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper, fn_kw={"centers": centers},
                           shape=(X.shape[0], ))

        counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(centers.shape[0], ))
        new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                                fn_kw={'centers_count': self.n_clusters},
                                shape=(centers.shape[0], centers.shape[1]))
        counts = counts.optimized().glom()
        centers = new_centers.optimized().glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # Set the counts to 1 to avoid dividing by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
      return centers, labels

    elif implementation == 'outer':
      if centers is None:
        centers = expr.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        labels = expr.outer((X, centers), (0, None), fn=kmeans_outer_dist_mapper,
                            shape=(X.shape[0],))
        #labels = expr.argmin(distances, axis=1)
        counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                           fn_kw={'centers_count': self.n_clusters},
                           shape=(centers.shape[0], ))
        new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                                fn_kw={'centers_count': self.n_clusters},
                                shape=(centers.shape[0], centers.shape[1]))
        counts = counts.optimized().glom()
        centers = new_centers.optimized().glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # Set the counts to 1 to avoid dividing by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
        centers = expr.from_numpy(centers)
      return centers, labels
    elif implementation == 'broadcast':
      if centers is None:
        centers = expr.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        util.log_warn("k_means_ %d %d", i, time.time())
        X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
        centers_broadcast = expr.reshape(centers, (1, centers.shape[0],
                                                   centers.shape[1]))
        distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
        labels = expr.argmin(distances, axis=1)
        center_idx = expr.arange((1, centers.shape[0]))
        matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
        matches = matches.astype(np.int64)
        counts = expr.sum(matches, axis=0)
        centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0],
                                                                matches.shape[1], 1)),
                           axis=0)

        counts = counts.optimized().glom()
        centers = centers.optimized().glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # Set the counts to 1 to avoid dividing by zero.
          counts[zcount_indices] = 1
          centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        centers = centers / counts.reshape(centers.shape[0], 1)
        centers = expr.from_numpy(centers)
      return centers, labels
    elif implementation == 'shuffle':
      if centers is None:
        centers = np.random.rand(self.n_clusters, num_dim)

      for i in range(self.n_iter):
        # Reset them to zero.
        new_centers = expr.ndarray((self.n_clusters, num_dim),
                                   reduce_fn=lambda a, b: a + b)
        new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int,
                                  reduce_fn=lambda a, b: a + b)

        _ = expr.shuffle(X,
                         _find_cluster_mapper,
                         kw={'d_pts': X,
                             'old_centers': centers,
                             'new_centers': new_centers,
                             'new_counts': new_counts,
                             'labels': labels},
                         shape_hint=(1,),
                         cost_hint={hash(labels): {'00': 0,
                                                   '01': np.prod(labels.shape)}})
        _.force()

        new_counts = new_counts.glom()
        new_centers = new_centers.glom()

        # If any centroids don't have any points assigned to them.
        zcount_indices = (new_counts == 0).reshape(self.n_clusters)

        if np.any(zcount_indices):
          # One or more centroids may not have any points assigned to them,
          # which results in their position being the zero-vector.  We reseed these
          # centroids with new random values.
          n_points = np.count_nonzero(zcount_indices)
          # Set the counts to 1 to avoid dividing by zero.
          new_counts[zcount_indices] = 1
          new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

        new_centers = new_centers / new_counts
        centers = new_centers

      return centers, labels