def train_smo_1998(self, data, labels):
  '''
  Train an SVM model using the SMO (1998) algorithm.

  Args:
    data(Expr): points to be trained
    labels(Expr): the correct labels of the training data
  '''
  N = data.shape[0]  # Number of instances
  D = data.shape[1]  # Number of features

  self.b = 0.0
  self.alpha = expr.zeros((N, 1), dtype=np.float64,
                          tile_hint=[N / self.ctx.num_workers, 1]).force()

  # linear kernel
  kernel_results = expr.dot(data, expr.transpose(data),
                            tile_hint=[N / self.ctx.num_workers, N])

  labels = expr.force(labels)
  self.E = expr.zeros((N, 1), dtype=np.float64,
                      tile_hint=[N / self.ctx.num_workers, 1]).force()
  for i in xrange(N):
    self.E[i, 0] = self.b + expr.reduce(self.alpha, axis=None,
                                        dtype_fn=lambda input: input.dtype,
                                        local_reduce_fn=margin_mapper,
                                        accumulate_fn=np.add,
                                        fn_kw=dict(label=labels,
                                                   data=kernel_results[:, i].force())).glom() - labels[i, 0]

  util.log_info("Starting SMO")
  it = 0
  num_changed = 0
  examine_all = True
  while (num_changed > 0 or examine_all) and (it < self.maxiter):
    util.log_info("Iteration:%d", it)

    num_changed = 0
    if examine_all:
      for i in xrange(N):
        num_changed += self.examine_example(i, N, labels, kernel_results)
    else:
      for i in xrange(N):
        if self.alpha[i, 0] > 0 and self.alpha[i, 0] < self.C:
          num_changed += self.examine_example(i, N, labels, kernel_results)

    it += 1

    if examine_all:
      examine_all = False
    elif num_changed == 0:
      examine_all = True

  self.w = expr.zeros((D, 1), dtype=np.float64).force()
  for i in xrange(D):
    self.w[i, 0] = expr.reduce(self.alpha, axis=None,
                               dtype_fn=lambda input: input.dtype,
                               local_reduce_fn=margin_mapper,
                               accumulate_fn=np.add,
                               fn_kw=dict(label=labels,
                                          data=expr.force(data[:, i]))).glom()
  self.usew_ = True
  print 'iteration finish:', it
  print 'b:', self.b
  print 'w:', self.w.glom()
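# A minimal NumPy-only sketch (not part of the Spartan code above) of the error
# cache that train_smo_1998 builds: E_i = b + sum_j alpha_j * y_j * K(x_j, x_i) - y_i.
# The distributed expr.reduce with margin_mapper is assumed to compute the same
# weighted sum over each kernel column.
import numpy as np

def smo_error_cache(K, alpha, y, b):
  # K: (N, N) kernel matrix, alpha: (N,) multipliers, y: (N,) labels in {-1, +1}.
  return b + K.T.dot(alpha * y) - y

K = np.array([[1.0, 0.2], [0.2, 1.0]])
alpha = np.array([0.5, 0.0])
y = np.array([1.0, -1.0])
print smo_error_cache(K, alpha, y, 0.0)  # E for each training point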
def random_galaxy(n):
  '''Generate a galaxy of random bodies.'''
  dtype = np.float  # consistent with sp.rand, same as np.float64

  galaxy = {  # All bodies stand still initially.
      'm': (rand(n) + dtype(10)) * dtype(m_sol / 10),
      'x': (rand(n) - dtype(0.5)) * dtype(r_ly / 100),
      'y': (rand(n) - dtype(0.5)) * dtype(r_ly / 100),
      'z': (rand(n) - dtype(0.5)) * dtype(r_ly / 100),
      'vx': zeros((n, )),
      'vy': zeros((n, )),
      'vz': zeros((n, ))
  }
  return galaxy
def precompute(self):
  '''Precompute the k most similar items for each item.

  After this function returns, two attributes will be created.

  Attributes
  ------
  top_k_similar_table : Numpy array of shape (N, k).
      Records the k highest similarity scores for each item.
  top_k_similar_indices : Numpy array of shape (N, k).
      Records the indices of the k most similar items for each item.
  '''
  M = self.rating_table.shape[0]
  N = self.rating_table.shape[1]
  self.rating_table = expr.force(self.rating_table)
  assert self.rating_table.tile_shape()[0] == M, \
      "rating table is only allowed to tile by columns!"

  self.similarity_table = expr.zeros(shape=(N, N),
                                     tile_hint=(self.rating_table.tile_shape()[1], N)).force()
  self.item_norm = self._get_norm_of_each_item(self.rating_table)
  self.rating_table.foreach_tile(mapper_fn=_similarity_mapper,
                                 kw={'rating_table': self.rating_table,
                                     'similarity_table': self.similarity_table,
                                     'item_norm': self.item_norm,
                                     'step': self.rating_table.tile_shape()[1]})

  # Release the memory for item_norm
  self.item_norm = None

  k = self.k
  top_k_similar_table = expr.zeros((N, k),
                                   tile_hint=(self.rating_table.tile_shape()[1], k)).force()
  top_k_similar_indices = expr.zeros((N, k),
                                     tile_hint=(self.rating_table.tile_shape()[1], k),
                                     dtype=np.int).force()

  # Find the top-k similar items for each item.
  # Store the similarity scores into top_k_similar_table.
  # Store the indices of the top k items into top_k_similar_indices.
  self.similarity_table.foreach_tile(mapper_fn=_select_most_k_similar_mapper,
                                     kw={'similarity_table': self.similarity_table,
                                         'top_k_similar_table': top_k_similar_table,
                                         'top_k_similar_indices': top_k_similar_indices,
                                         'k': k})
  self.top_k_similar_table = top_k_similar_table.glom()
  self.top_k_similar_indices = top_k_similar_indices.glom()
def benchmark_optimization(ctx, timer):
  FLAGS.optimization = 0
  DATA_SIZE = 5 * 1000 * 1000
  current = eager(zeros((DATA_SIZE * ctx.num_workers,),
                        dtype=np.float32,
                        tile_hint=(DATA_SIZE,)))
  strike = eager(ones((DATA_SIZE * ctx.num_workers,),
                      dtype=np.float32,
                      tile_hint=(DATA_SIZE,)))
  maturity = eager(strike * 12)
  rate = eager(strike * 0.05)
  volatility = eager(strike * 0.01)

  timer.time_op('opt-none', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-none', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-none', lambda: bs_step(current, strike, maturity, rate, volatility))

  FLAGS.optimization = 1
  FLAGS.opt_parakeet_gen = 0
  FLAGS.opt_map_fusion = 1
  timer.time_op('opt-fusion', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-fusion', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-fusion', lambda: bs_step(current, strike, maturity, rate, volatility))

  FLAGS.opt_parakeet_gen = 1
  timer.time_op('opt-parakeet', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-parakeet', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-parakeet', lambda: bs_step(current, strike, maturity, rate, volatility))
def cgit(A, x):
  '''
  CGIT Conjugate Gradient iteration
  z = cgit(A, x) generates approximate solution to A*z = x.

  Args:
    A(Expr): matrix to be processed.
    x(Expr): the input vector.
  '''
  z = expr.zeros(x.shape, tile_hint=(A.tile_shape()[1], 1))
  r = x
  rho = expr.sum(r * r).glom()
  #util.log_warn('rho:%s', rho)
  p = r

  for i in xrange(15):
    q = expr.dot(A, p, tile_hint=(A.tile_shape()[1], 1))
    alpha = rho / expr.sum(p * q).glom()
    #util.log_warn('alpha:%s', alpha)
    z = z + p * alpha
    rho0 = rho
    r = r - q * alpha
    rho = expr.sum(r * r).glom()
    beta = rho / rho0
    #util.log_warn('beta:%s', beta)
    p = r + p * beta

  return z
def jacobi_method(A, b, _iter=100):
  """
  Iterative algorithm for approximating the solution of a diagonally
  dominant system of linear equations.

  Parameters
  ----------
  A : ndarray or Expr - 2d
      Input matrix.
  b : ndarray or Expr - vector
      RHS vector.
  _iter : int
      Number of iterations to run; defaults to 100.

  Returns
  -------
  result : Expr - vector
      Approximated solution.
  """
  util.Assert.eq(A.shape[0], b.shape[0])

  x = expr.zeros((A.shape[0],))
  D = expr.diag(A)
  R = A - expr.diagflat(D)

  for i in xrange(_iter):
    x = (b - expr.dot(R, x)) / D

  return x
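# A minimal NumPy-only sketch of the same Jacobi update x <- (b - R.x) / D used
# above, on a small diagonally dominant system; it is illustrative and does not
# use the Spartan API.
import numpy as np

def jacobi_numpy(A, b, _iter=100):
  x = np.zeros(A.shape[0])
  D = np.diag(A)            # diagonal entries
  R = A - np.diagflat(D)    # off-diagonal part
  for _ in xrange(_iter):
    x = (b - R.dot(x)) / D
  return x

A = np.array([[4.0, 1.0], [2.0, 5.0]])
b = np.array([1.0, 2.0])
print jacobi_numpy(A, b)    # converges close to np.linalg.solve(A, b)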
def cgit(A, x):
  '''
  CGIT Conjugate Gradient iteration
  z = cgit(A, x) generates approximate solution to A*z = x.

  Args:
    A(Expr): matrix to be processed.
    x(Expr): the input vector.
  '''
  z = expr.zeros(x.shape)
  r = x
  rho = expr.sum(r * r).optimized().glom()
  #util.log_warn('rho:%s', rho)
  p = r

  for i in xrange(15):
    q = expr.dot(A, p)
    alpha = rho / expr.sum(p * q).optimized().glom()
    #util.log_warn('alpha:%s', alpha)
    z = z + p * alpha
    rho0 = rho
    r = r - q * alpha
    rho = expr.sum(r * r).optimized().glom()
    beta = rho / rho0
    #util.log_warn('beta:%s', beta)
    p = r + p * beta

  return z
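# A minimal NumPy-only sketch of the conjugate gradient recurrence that both
# cgit variants implement (fixed 15 iterations, no convergence test); it only
# illustrates the update rules, not the distributed execution.
import numpy as np

def cg_numpy(A, x, iters=15):
  z = np.zeros_like(x)
  r = x.copy()
  rho = r.dot(r)
  p = r.copy()
  for _ in xrange(iters):
    q = A.dot(p)
    alpha = rho / p.dot(q)
    z = z + alpha * p
    rho0 = rho
    r = r - alpha * q
    rho = r.dot(r)
    p = r + (rho / rho0) * p
  return z

A = np.array([[4.0, 1.0], [1.0, 3.0]])  # symmetric positive definite
x = np.array([1.0, 2.0])
print cg_numpy(A, x)                    # approximates np.linalg.solve(A, x)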
def precompute(self):
  '''Precompute the k most similar items for each item.

  After this function returns, two attributes will be created.

  Attributes
  ------
  top_k_similar_table : Numpy array of shape (N, k).
      Records the k highest similarity scores for each item.
  top_k_similar_indices : Numpy array of shape (N, k).
      Records the indices of the k most similar items for each item.
  '''
  M = self.rating_table.shape[0]
  N = self.rating_table.shape[1]

  self.similarity_table = expr.shuffle(self.rating_table, _similarity_mapper,
                                       kw={'item_norm': self._get_norm_of_each_item(self.rating_table),
                                           'step': util.divup(self.rating_table.shape[1],
                                                              blob_ctx.get().num_workers)},
                                       shape_hint=(N, N))

  # Release the memory for item_norm
  top_k_similar_indices = expr.zeros((N, self.k), dtype=np.int)

  # Find the top-k similar items for each item.
  # Store the similarity scores into top_k_similar_table.
  # Store the indices of the top k items into top_k_similar_indices.
  cost = np.prod(top_k_similar_indices.shape)
  top_k_similar_table = expr.shuffle(self.similarity_table, _select_most_k_similar_mapper,
                                     kw={'top_k_similar_indices': top_k_similar_indices,
                                         'k': self.k},
                                     shape_hint=(N, self.k),
                                     cost_hint={hash(top_k_similar_indices): {'00': 0, '01': cost,
                                                                              '10': cost, '11': cost}})
  self.top_k_similar_table = top_k_similar_table.optimized().glom()
  self.top_k_similar_indices = top_k_similar_indices.optimized().glom()
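# A minimal NumPy-only sketch of what the two shuffle passes above compute:
# a normalized (cosine-style) similarity between item columns, then the k most
# similar items per item. _similarity_mapper and _select_most_k_similar_mapper
# are assumed to be the distributed equivalents of these two steps.
import numpy as np

def top_k_similar(rating_table, k):
  # rating_table: (M users, N items); sim: (N, N) normalized item similarity.
  norms = np.sqrt((rating_table ** 2).sum(axis=0))
  norms[norms == 0] = 1.0
  sim = rating_table.T.dot(rating_table) / np.outer(norms, norms)
  indices = np.argsort(-sim, axis=1)[:, :k]                  # k most similar per item
  scores = sim[np.arange(sim.shape[0])[:, None], indices]
  return scores, indices

ratings = np.random.rand(6, 4)                               # 6 users, 4 items
scores, indices = top_k_similar(ratings, k=2)
print indices                                                # shape (4, 2)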
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy k-means method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the fuzziness parameter of fuzzy k-means.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = expr.force(points)
  num_dim = points.shape[1]
  if centers is None:
    centers = expr.rand(k, num_dim)

  labels = expr.zeros((points.shape[0],), dtype=np.int)

  for iter in range(num_iter):
    centers = expr.as_array(centers)
    points_broadcast = expr.reshape(points, (points.shape[0], 1, points.shape[1]))
    centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
    distances = expr.sum(expr.square(points_broadcast - centers_broadcast), axis=2)
    # This is used to avoid dividing by zero
    distances = distances + 0.00000000001
    util.log_info('distances shape %s' % str(distances.shape))
    distances_broadcast = expr.reshape(distances, (distances.shape[0], 1,
                                                   distances.shape[1]))
    distances_broadcast2 = expr.reshape(distances, (distances.shape[0],
                                                    distances.shape[1], 1))
    prob = 1.0 / expr.sum(expr.power(distances_broadcast / distances_broadcast2,
                                     2.0 / (m - 1)), axis=2)
    prob.force()

    counts = expr.sum(prob, axis=0)
    counts = expr.reshape(counts, (counts.shape[0], 1))
    labels = expr.argmax(prob, axis=1)
    centers = expr.sum(expr.reshape(points, (points.shape[0], 1, points.shape[1])) *
                       expr.reshape(prob, (prob.shape[0], prob.shape[1], 1)),
                       axis=0)

    # We assume that the size of centers is relatively small, so it can be
    # handled on the master.
    counts = counts.glom()
    centers = centers.glom()

    # If any centroids don't have any points assigned to them.
    zcount_indices = (counts == 0).reshape(k)

    if np.any(zcount_indices):
      # One or more centroids may not have any points assigned to them, which
      # results in their position being the zero-vector. We reseed these
      # centroids with new random values and set their counts to 1 in order to
      # get rid of dividing by zero.
      counts[zcount_indices, :] = 1
      centers[zcount_indices, :] = np.random.rand(np.count_nonzero(zcount_indices), num_dim)

    centers = centers / counts
  return labels
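# A minimal NumPy-only sketch of one fuzzy k-means membership/center update as
# computed above (d is the squared-distance matrix); it mirrors the broadcasting
# in the Spartan expression, not the distributed execution.
import numpy as np

def fuzzy_kmeans_step(points, centers, m=2.0):
  d = ((points[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2) + 1e-11
  # prob[i, j]: membership of point i in cluster j.
  prob = 1.0 / np.power(d[:, None, :] / d[:, :, None], 2.0 / (m - 1)).sum(axis=2)
  counts = prob.sum(axis=0)[:, None]
  new_centers = (points[:, None, :] * prob[:, :, None]).sum(axis=0) / counts
  labels = prob.argmax(axis=1)
  return new_centers, labels

points = np.random.rand(20, 2)
centers = np.random.rand(3, 2)
centers, labels = fuzzy_kmeans_step(points, centers)
print labels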
def benchmark_optimization(ctx, timer):
  FLAGS.optimization = 0
  DATA_SIZE = 5 * 1000 * 1000
  current = eager(zeros((DATA_SIZE * ctx.num_workers, ),
                        dtype=np.float32,
                        tile_hint=(DATA_SIZE, )))
  strike = eager(ones((DATA_SIZE * ctx.num_workers, ),
                      dtype=np.float32,
                      tile_hint=(DATA_SIZE, )))
  maturity = eager(strike * 12)
  rate = eager(strike * 0.05)
  volatility = eager(strike * 0.01)

  timer.time_op('opt-none', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-none', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-none', lambda: bs_step(current, strike, maturity, rate, volatility))

  FLAGS.optimization = 1
  FLAGS.opt_parakeet_gen = 0
  FLAGS.opt_map_fusion = 1
  timer.time_op('opt-fusion', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-fusion', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-fusion', lambda: bs_step(current, strike, maturity, rate, volatility))

  FLAGS.opt_parakeet_gen = 1
  timer.time_op('opt-parakeet', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-parakeet', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-parakeet', lambda: bs_step(current, strike, maturity, rate, volatility))

  FLAGS.opt_parakeet_gen = 0
  FLAGS.opt_auto_tiling = 0
  timer.time_op('opt-tiling = 0', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-tiling = 0', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-tiling = 0', lambda: bs_step(current, strike, maturity, rate, volatility))

  FLAGS.opt_auto_tiling = 1
  timer.time_op('opt-tiling', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-tiling', lambda: bs_step(current, strike, maturity, rate, volatility))
  timer.time_op('opt-tiling', lambda: bs_step(current, strike, maturity, rate, volatility))
def benchmark_slice(ctx, timer):
  TEST_SIZE = 1000 * ctx.num_workers

  # Force the source array to evaluate first.
  x = expr.eager(expr.zeros((TEST_SIZE, 10000)))

  for i in range(5):
    timer.time_op('slice-rows', lambda: expr.evaluate(x[200:300, :].sum()))
    timer.time_op('slice-cols', lambda: expr.evaluate(x[:, 200:300].sum()))
    timer.time_op('slice-box', lambda: expr.evaluate(x[200:300, 200:300].sum()))
def fit(data, labels, T=50, la=1.0):
  '''
  Train an SVM model using the disdca (2013) algorithm.

  Args:
    data(Expr): points to be trained.
    labels(Expr): the correct labels of the training data.
    T(int): max training iterations.
    la(float): lambda parameter of this SVM model.
  '''
  w = expr.zeros((data.shape[1], 1), dtype=np.float64)
  alpha = expr.zeros((data.shape[0], 1), dtype=np.float64)
  for i in range(T):
    alpha = expr.shuffle(expr.retile(data, tile_hint=util.calc_tile_hint(data, axis=0)),
                         _svm_mapper,
                         kw={'labels': labels, 'alpha': alpha, 'w': w,
                             'lambda_n': la * data.shape[0]},
                         shape_hint=alpha.shape,
                         cost_hint={hash(labels): {'00': 0, '01': np.prod(labels.shape)},
                                    hash(alpha): {'00': 0, '01': np.prod(alpha.shape)}})
    w = expr.sum(data * alpha * 1.0 / la / data.shape[0], axis=0).reshape((data.shape[1], 1))
    w = w.optimized()
  return w
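# A minimal NumPy-only sketch of the primal weight recovery performed after each
# DisDCA round above: w = (1 / (lambda * n)) * sum_i alpha_i * x_i. The dual
# updates themselves happen inside _svm_mapper, which is not shown here.
import numpy as np

def recover_weights(data, alpha, la):
  # data: (n, d) training points, alpha: (n, 1) dual variables.
  n = data.shape[0]
  return (data * alpha).sum(axis=0).reshape((data.shape[1], 1)) / (la * n)

data = np.random.rand(8, 3)
alpha = np.random.rand(8, 1)
print recover_weights(data, alpha, la=1.0)  # (3, 1) weight vector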
def fit(self, X, centers=None):
  """Compute k-means clustering.

  Parameters
  ----------
  X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
  centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
  """
  X = expr.force(X)
  num_dim = X.shape[1]
  labels = expr.zeros((X.shape[0], 1), dtype=np.int, tile_hint=X.tile_shape())

  if centers is None:
    centers = np.random.rand(self.n_clusters, num_dim)

  for i in range(self.n_iter):
    # Reset them to zero.
    new_centers = expr.ndarray((self.n_clusters, num_dim), reduce_fn=lambda a, b: a + b)
    new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int, reduce_fn=lambda a, b: a + b)

    _ = expr.shuffle(X, _find_cluster_mapper,
                     kw={'d_pts': X,
                         'old_centers': centers,
                         'new_centers': new_centers,
                         'new_counts': new_counts,
                         'labels': labels})
    _.force()

    new_counts = new_counts.glom()
    new_centers = new_centers.glom()

    # If any centroids don't have any points assigned to them.
    zcount_indices = (new_counts == 0).reshape(self.n_clusters)

    if np.any(zcount_indices):
      # One or more centroids may not have any points assigned to them,
      # which results in their position being the zero-vector. We reseed these
      # centroids with new random values.
      n_points = np.count_nonzero(zcount_indices)
      # In order to get rid of dividing by zero.
      new_counts[zcount_indices] = 1
      new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

    new_centers = new_centers / new_counts
    centers = new_centers

  return centers, labels
def fit(self, X, centers=None):
  """Compute k-means clustering.

  Parameters
  ----------
  X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
  centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
  """
  num_dim = X.shape[1]
  num_points = X.shape[0]

  labels = expr.zeros((num_points, 1), dtype=np.int)

  if centers is None:
    centers = expr.from_numpy(np.random.rand(self.n_clusters, num_dim))

  for i in range(self.n_iter):
    X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
    centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
    distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
    labels = expr.argmin(distances, axis=1)

    center_idx = expr.arange((1, centers.shape[0]))
    matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
    matches = matches.astype(np.int64)
    counts = expr.sum(matches, axis=0)
    centers = expr.sum(X_broadcast * expr.reshape(matches,
                                                  (matches.shape[0], matches.shape[1], 1)),
                       axis=0)

    counts = counts.optimized().glom()
    centers = centers.optimized().glom()

    # If any centroids don't have any points assigned to them.
    zcount_indices = (counts == 0).reshape(self.n_clusters)

    if np.any(zcount_indices):
      # One or more centroids may not have any points assigned to them,
      # which results in their position being the zero-vector. We reseed these
      # centroids with new random values.
      n_points = np.count_nonzero(zcount_indices)
      # In order to get rid of dividing by zero.
      counts[zcount_indices] = 1
      centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

    centers = centers / counts.reshape(centers.shape[0], 1)
    centers = expr.from_numpy(centers)

  return centers, labels
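# A minimal NumPy-only sketch of the broadcast-based k-means step above:
# pairwise squared distances via broadcasting, hard assignment by argmin, and
# centers recomputed from a one-hot "matches" matrix.
import numpy as np

def kmeans_step(X, centers):
  distances = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
  labels = distances.argmin(axis=1)
  matches = (labels[:, None] == np.arange(centers.shape[0])).astype(np.int64)
  counts = matches.sum(axis=0)
  counts[counts == 0] = 1                                  # avoid dividing by zero
  new_centers = (X[:, None, :] * matches[:, :, None]).sum(axis=0) / counts[:, None]
  return new_centers, labels

X = np.random.rand(50, 2)
centers = np.random.rand(3, 2)
centers, labels = kmeans_step(X, centers)
print labels[:10]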
def center_data(X, y, fit_intercept, normalize=False):
  """
  Centers data to have mean zero along axis 0. This is here because
  nearly all linear models will want their data to be centered.
  """
  if fit_intercept:
    X_mean = X.mean(axis=0)
    X_mean = expr.reshape(X_mean, (1, X_mean.shape[0]))
    X -= X_mean

    if normalize:
      X_std = expr.sqrt(expr.sum(X ** 2, axis=0)).force()
      X_std[X_std == 0] = 1
      X /= X_std
    else:
      X_std = expr.ones(X.shape[1])

    y_mean = y.mean(axis=0)
    y -= y_mean
  else:
    X_mean = expr.zeros(X.shape[1])
    X_std = expr.ones(X.shape[1])
    y_mean = 0. if y.ndim == 1 else expr.zeros(y.shape[1], dtype=X.dtype)
  return X, y, X_mean, y_mean, X_std
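# A minimal NumPy-only sketch of the same centering/normalization: subtract the
# column means and, optionally, divide by the column L2 norms (zeros replaced by
# 1 so constant columns are left untouched).
import numpy as np

def center_data_numpy(X, y, fit_intercept, normalize=False):
  if fit_intercept:
    X_mean = X.mean(axis=0)
    X = X - X_mean
    if normalize:
      X_std = np.sqrt((X ** 2).sum(axis=0))
      X_std[X_std == 0] = 1
      X = X / X_std
    else:
      X_std = np.ones(X.shape[1])
    y_mean = y.mean(axis=0)
    y = y - y_mean
  else:
    X_mean = np.zeros(X.shape[1])
    X_std = np.ones(X.shape[1])
    y_mean = 0.
  return X, y, X_mean, y_mean, X_std

X = np.random.rand(5, 3)
y = np.random.rand(5)
Xc, yc, X_mean, y_mean, X_std = center_data_numpy(X, y, fit_intercept=True)
print Xc.mean(axis=0)   # approximately zero in every column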
def fit(data, labels, num_tiles, T=50, la=1.0):
  '''
  Train an SVM model using the disdca (2013) algorithm.

  Args:
    data(Expr): points to be trained.
    labels(Expr): the correct labels of the training data.
    num_tiles(int): the total tiles of the training data.
    T(int): max training iterations.
    la(float): lambda parameter of this SVM model.
  '''
  w = None
  m = data.shape[0] / num_tiles
  alpha = expr.zeros((m * num_tiles, 1),
                     dtype=np.float64,
                     tile_hint=(m, 1)).force()
  for i in range(T):
    new_weight = expr.ndarray((data.shape[1], 1),
                              dtype=np.float64,
                              reduce_fn=np.add,
                              tile_hint=(data.shape[1], 1))
    new_weight = expr.shuffle(data, _svm_mapper,
                              target=new_weight,
                              kw={'labels': labels, 'alpha': alpha, 'w': w, 'm': m,
                                  'scale': num_tiles, 'lambda_n': la * data.shape[0]})
    w = new_weight / num_tiles
  return w
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy k-means method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the fuzziness parameter of fuzzy k-means.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = expr.force(points)
  num_dim = points.shape[1]
  if centers is None:
    centers = expr.rand(k, num_dim, tile_hint=(k, num_dim))

  labels = expr.zeros((points.shape[0],), dtype=np.int,
                      tile_hint=(points.shape[0] / len(points.tiles),))

  for iter in range(num_iter):
    new_centers = expr.ndarray((k, num_dim), reduce_fn=lambda a, b: a + b,
                               tile_hint=(k, num_dim))
    new_counts = expr.ndarray((k, 1), dtype=np.float, reduce_fn=lambda a, b: a + b,
                              tile_hint=(k, 1))
    expr.shuffle(points, _fuzzy_kmeans_mapper,
                 kw={'old_centers': centers,
                     'centers': new_centers,
                     'counts': new_counts,
                     'labels': labels,
                     'm': m}).force()

    # If any centroids don't have any points assigned to them.
    zcount_indices = (new_counts.glom() == 0).reshape(k)

    if np.any(zcount_indices):
      # One or more centroids may not have any points assigned to them, which
      # results in their position being the zero-vector. We reseed these
      # centroids with new random values and set their counts to 1 in order to
      # get rid of dividing by zero.
      new_counts[zcount_indices, :] = 1
      new_centers[zcount_indices, :] = np.random.rand(np.count_nonzero(zcount_indices), num_dim)

    centers = new_centers / new_counts
  return labels
def simulate(ts_all, te_all, lamb_all, num_paths):
  '''Range over a number of independent products.

  :param ts_all: DistArray
    Start dates for a series of swaptions.
  :param te_all: DistArray
    End dates for a series of swaptions.
  :param lamb_all: DistArray
    Parameter values for a series of swaptions.
  :param num_paths: Int
    Number of paths used in random walk.
  :rtype: DistArray
  '''
  swaptions = []
  i = 0
  for ts_a, te, lamb in zip(ts_all, te_all, lamb_all):
    for ts in ts_a:
      #start = time()
      print i
      time_structure = arange(None, 0, ts + DELTA, DELTA)
      maturity_structure = arange(None, 0, te, DELTA)

      ############# MODEL ###############
      # Variance reduction technique - Antithetic Variates.
      eps_tmp = randn(time_structure.shape[0] - 1, num_paths)
      eps = concatenate(eps_tmp, -eps_tmp, 1)

      # Forward LIBOR rates for the construction of the spot measure.
      f_kk = zeros((time_structure.shape[0], 2 * num_paths))
      f_kk = assign(f_kk, np.s_[0, :], F_0)

      # Plane kxN of simulated LIBOR rates.
      f_kn = ones((maturity_structure.shape[0], 2 * num_paths)) * F_0

      # Simulations of the plane f_kn for each time step.
      for t in xrange(1, time_structure.shape[0]):
        f_kn_new = f_kn[1:, :] * exp(lamb * mu(f_kn, lamb) * DELTA -
                                     0.5 * lamb * lamb * DELTA +
                                     lamb * eps[t - 1, :] * sqrt(DELTA))
        f_kk = assign(f_kk, np.s_[t, :], f_kn_new[0])
        f_kn = f_kn_new

      ############## PRODUCT ###############
      # Value of zero coupon bonds.
      zcb = ones((int((te - ts) / DELTA) + 1, 2 * num_paths))
      f_kn_modified = 1 + DELTA * f_kn
      for j in xrange(zcb.shape[0] - 1):
        zcb = assign(zcb, np.s_[j + 1], zcb[j] / f_kn_modified[j])

      # Swaption price at maturity.
      last_row = zcb[zcb.shape[0] - 1, :].reshape((20, ))
      swap_ts = maximum(1 - last_row - THETA * DELTA * expr.sum(zcb[1:], 0), 0)

      # Spot measure used for discounting.
      b_ts = ones((2 * num_paths, ))
      tmp = 1 + DELTA * f_kk
      for j in xrange(int(ts / DELTA)):
        b_ts *= tmp[j].reshape((20, ))

      # Swaption price at time 0.
      swaption = swap_ts / b_ts

      # Save expected value in bps and std.
      me = mean((swaption[0:num_paths] + swaption[num_paths:]) / 2) * 10000
      st = std((swaption[0:num_paths] + swaption[num_paths:]) / 2) / sqrt(num_paths) * 10000

      swaptions.append([me.optimized().force(), st.optimized().force()])
      #print time() - start
      i += 1
  return swaptions
def test_count_zero(self):
  x = expr.ones((TEST_SIZE, ))
  Assert.eq(expr.count_zero(x).glom(), 0)

  x = expr.zeros((TEST_SIZE, ))
  Assert.eq(expr.count_zero(x).glom(), TEST_SIZE)
def train_smo_2005(self, data, labels):
  '''
  Train an SVM model using the SMO (2005) algorithm.

  Args:
    data(Expr): points to be trained
    labels(Expr): the correct labels of the training data
  '''
  N = data.shape[0]  # Number of instances
  D = data.shape[1]  # Number of features

  self.b = 0.0
  alpha = expr.zeros((N, 1), dtype=np.float64,
                     tile_hint=[N / self.ctx.num_workers, 1]).force()

  # linear kernel
  kernel_results = expr.dot(data, expr.transpose(data),
                            tile_hint=[N / self.ctx.num_workers, N])

  gradient = expr.ones((N, 1), dtype=np.float64,
                       tile_hint=[N / self.ctx.num_workers, 1]) * -1.0

  expr_labels = expr.lazify(labels)

  util.log_info("Starting SMO")
  pv1 = pv2 = -1
  it = 0
  while it < self.maxiter:
    util.log_info("Iteration:%d", it)

    minObj = 1e100

    expr_alpha = expr.lazify(alpha)
    G = expr.multiply(labels, gradient) * -1.0

    v1_mask = ((expr_labels > self.tol) * (expr_alpha < self.C) +
               (expr_labels < -self.tol) * (expr_alpha > self.tol))
    v1 = expr.argmax(G[v1_mask - True]).glom().item()
    maxG = G[v1, 0].glom()
    print 'maxv1:', v1, 'maxG:', maxG

    v2_mask = ((expr_labels > self.tol) * (expr_alpha > self.tol) +
               (expr_labels < -self.tol) * (expr_alpha < self.C))
    min_v2 = expr.argmin(G[v2_mask - True]).glom().item()
    minG = G[min_v2, 0].glom()
    #print 'minv2:', min_v2, 'minG:', minG

    set_v2 = v2_mask.glom().nonzero()[0]
    #print 'actives:', set_v2.shape[0]
    v2 = -1
    for v in set_v2:
      b = maxG - G[v, 0].glom()
      if b > self.tol:
        na = (kernel_results[v1, v1] + kernel_results[v, v] -
              2 * kernel_results[v1, v]).glom()[0][0]
        if na < self.tol:
          na = 1e12

        obj = -(b * b) / na
        if obj <= minObj and v1 != pv1 or v != pv2:
          v2 = v
          a = na
          minObj = obj

    if v2 == -1:
      break
    if maxG - minG < self.tol:
      break

    print 'opt v1:', v1, 'v2:', v2

    pv1 = v1
    pv2 = v2

    y1 = labels[v1, 0]
    y2 = labels[v2, 0]

    oldA1 = alpha[v1, 0]
    oldA2 = alpha[v2, 0]

    # Calculate new alpha values, to reduce the objective function...
    b = y2 * expr.glom(gradient[v2, 0]) - y1 * expr.glom(gradient[v1, 0])
    if y1 != y2:
      a += 4 * kernel_results[v1, v2].glom()

    newA1 = oldA1 + y1 * b / a
    newA2 = oldA2 - y2 * b / a

    # Correct for alpha being out of range...
    sum = y1 * oldA1 + y2 * oldA2

    if newA1 < self.tol:
      newA1 = 0.0
    elif newA1 > self.C:
      newA1 = self.C

    newA2 = y2 * (sum - y1 * newA1)

    if newA2 < self.tol:
      newA2 = 0.0
    elif newA2 > self.C:
      newA2 = self.C

    newA1 = y1 * (sum - y2 * newA2)

    # Update the gradient...
    dA1 = newA1 - oldA1
    dA2 = newA2 - oldA2

    gradient += (expr.multiply(labels, kernel_results[:, v1]) * y1 * dA1 +
                 expr.multiply(labels, kernel_results[:, v2]) * y2 * dA2)

    alpha[v1, 0] = newA1
    alpha[v2, 0] = newA2

    #print 'alpha:', alpha.glom().T

    it += 1
    #print 'gradient:', gradient.glom().T

  self.w = expr.zeros((D, 1), dtype=np.float64).force()
  for i in xrange(D):
    self.w[i, 0] = expr.reduce(alpha, axis=None,
                               dtype_fn=lambda input: input.dtype,
                               local_reduce_fn=margin_mapper,
                               accumulate_fn=np.add,
                               fn_kw=dict(label=labels,
                                          data=expr.force(data[:, i]))).glom()

  self.b = 0.0
  E = (labels - self.margins(data)).force()

  minB = -1e100
  maxB = 1e100
  actualB = 0.0
  numActualB = 0

  for i in xrange(N):
    ai = alpha[i, 0]
    yi = labels[i, 0]
    Ei = E[i, 0]

    if ai < 1e-3:
      if yi < self.tol:
        maxB = min((maxB, Ei))
      else:
        minB = max((minB, Ei))
    elif ai > self.C - 1e-3:
      if yi < self.tol:
        minB = max((minB, Ei))
      else:
        maxB = min((maxB, Ei))
    else:
      numActualB += 1
      actualB += (Ei - actualB) / float(numActualB)

  if numActualB > 0:
    self.b = actualB
  else:
    self.b = 0.5 * (minB + maxB)

  self.usew_ = True
  print 'iteration finish:', it
  print 'b:', self.b
  print 'w:', self.w.glom()
def fit(self, X, centers=None, implementation='outer'):
  """Compute k-means clustering.

  Parameters
  ----------
  X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
  centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
  """
  num_dim = X.shape[1]
  num_points = X.shape[0]

  labels = expr.zeros((num_points, 1), dtype=np.int)

  if implementation == 'map2':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper,
                         fn_kw={"centers": centers},
                         shape=(X.shape[0], ))

      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0], ))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)

      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector. We reseed these
        # centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # In order to get rid of dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
    return centers, labels

  elif implementation == 'outer':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.outer((X, centers), (0, None), fn=kmeans_outer_dist_mapper,
                          shape=(X.shape[0],))
      #labels = expr.argmin(distances, axis=1)
      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0], ))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)

      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector. We reseed these
        # centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # In order to get rid of dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'broadcast':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      util.log_warn("k_means_ %d %d", i, time.time())
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
      distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
      labels = expr.argmin(distances, axis=1)

      center_idx = expr.arange((1, centers.shape[0]))
      matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
      matches = matches.astype(np.int64)
      counts = expr.sum(matches, axis=0)
      centers = expr.sum(X_broadcast * expr.reshape(matches,
                                                    (matches.shape[0], matches.shape[1], 1)),
                         axis=0)

      counts = counts.optimized().glom()
      centers = centers.optimized().glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)

      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector. We reseed these
        # centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # In order to get rid of dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'shuffle':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      # Reset them to zero.
      new_centers = expr.ndarray((self.n_clusters, num_dim), reduce_fn=lambda a, b: a + b)
      new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int, reduce_fn=lambda a, b: a + b)

      _ = expr.shuffle(X, _find_cluster_mapper,
                       kw={'d_pts': X,
                           'old_centers': centers,
                           'new_centers': new_centers,
                           'new_counts': new_counts,
                           'labels': labels},
                       shape_hint=(1,),
                       cost_hint={hash(labels): {'00': 0, '01': np.prod(labels.shape)}})
      _.force()

      new_counts = new_counts.glom()
      new_centers = new_centers.glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (new_counts == 0).reshape(self.n_clusters)

      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector. We reseed these
        # centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # In order to get rid of dividing by zero.
        new_counts[zcount_indices] = 1
        new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      new_centers = new_centers / new_counts
      centers = new_centers

    return centers, labels
def test_count_nonzero(self):
  x = expr.ones((TEST_SIZE,))
  Assert.eq(expr.count_nonzero(x).glom(), TEST_SIZE)

  x = expr.zeros((TEST_SIZE,))
  Assert.eq(expr.count_nonzero(x).glom(), 0)