def connectedConponents(ctx, dim, numIters):
  linkMatrix = eager(
      expr.shuffle(
          expr.ndarray((dim, dim), dtype=np.int64,
                       tile_hint=(dim / ctx.num_workers, dim)),
          make_matrix,
      ))
  power = eager(
      expr.shuffle(
          expr.ndarray((dim, dim), dtype=np.int64,
                       tile_hint=(dim / ctx.num_workers, dim)),
          make_matrix,
      ))
  eye = expr.eye(dim, tile_hint=(dim / ctx.num_workers, dim))

  startCompute = time.time()
  result = expr.logical_or(eye, linkMatrix).optimized().glom()
  for i in range(numIters):
    power = expr.dot(power, linkMatrix).optimized().glom()
    result = expr.logical_or(result, power)
  result.optimized().glom()
  final = expr.logical_and(result, expr.transpose(result.optimized())).optimized().evaluate()
  endCompute = time.time()
  return endCompute - startCompute
def shortestPath(ctx, dim, numIters):
  dist = eager(
      expr.shuffle(
          expr.ndarray((dim, 1), dtype=np.int64,
                       tile_hint=(dim / ctx.num_workers, 1)),
          make_dist,
      ))
  linkMatrix = eager(
      expr.shuffle(
          expr.ndarray((dim, dim), dtype=np.int64,
                       tile_hint=(dim, dim / ctx.num_workers)),
          make_matrix,
      ))

  startCompute = time.time()
  for it in range(numIters):
    first = expr.add(dist, linkMatrix)
    second = first.min(axis=0)
    dist = second.reshape(dim, 1)
  dist.evaluate()
  endCompute = time.time()
  return endCompute - startCompute
def bfs(ctx, dim):
  util.log_info("start computing......")
  sGenerate = time.time()
  current = eager(
      expr.shuffle(
          expr.ndarray((dim, 1), dtype=np.int64,
                       tile_hint=(dim / ctx.num_workers, 1)),
          make_current,
      ))
  linkMatrix = eager(
      expr.shuffle(
          expr.ndarray((dim, dim), dtype=np.int64,
                       tile_hint=(dim, dim / ctx.num_workers)),
          make_matrix,
      ))
  eGenerate = time.time()

  startCompute = time.time()
  while True:
    next = expr.dot(linkMatrix, current)
    formerNum = expr.count_nonzero(current)
    laterNum = expr.count_nonzero(next)
    hasNew = expr.equal(formerNum, laterNum).glom()
    current = next
    if hasNew:
      break
  current.evaluate()
  endCompute = time.time()
  return (eGenerate - sGenerate, endCompute - startCompute)
def precompute(self):
  '''Precompute the k most similar items for each item.

  After this function returns, two attributes will be created.

  Attributes
  ----------
  top_k_similar_table : Numpy array of shape (N, k).
    Records the k highest similarity scores for each item.
  top_k_similar_indices : Numpy array of shape (N, k).
    Records the indices of the k most similar items for each item.
  '''
  M = self.rating_table.shape[0]
  N = self.rating_table.shape[1]
  self.similarity_table = expr.shuffle(self.rating_table, _similarity_mapper,
                                       kw={'item_norm': self._get_norm_of_each_item(self.rating_table),
                                           'step': util.divup(self.rating_table.shape[1],
                                                              blob_ctx.get().num_workers)},
                                       shape_hint=(N, N))

  # Release the memory for item_norm
  top_k_similar_indices = expr.zeros((N, self.k), dtype=np.int)

  # Find the top-k similar items for each item.
  # Store the similarity scores into top_k_similar_table.
  # Store the indices of the top-k items into top_k_similar_indices.
  cost = np.prod(top_k_similar_indices.shape)
  top_k_similar_table = expr.shuffle(self.similarity_table, _select_most_k_similar_mapper,
                                     kw={'top_k_similar_indices': top_k_similar_indices, 'k': self.k},
                                     shape_hint=(N, self.k),
                                     cost_hint={hash(top_k_similar_indices): {'00': 0, '01': cost,
                                                                              '10': cost, '11': cost}})
  self.top_k_similar_table = top_k_similar_table.optimized().glom()
  self.top_k_similar_indices = top_k_similar_indices.optimized().glom()
def fit(data, labels, label_size, alpha=1.0):
  '''
  Train a standard naive bayes model.

  Args:
    data(Expr): documents to be trained.
    labels(Expr): the correct labels of the training data.
    label_size(int): the number of different labels.
    alpha(float): alpha parameter of the naive bayes model.
  '''
  labels = expr.force(labels)

  # calc document freq
  df = expr.reduce(data,
                   axis=0,
                   dtype_fn=lambda input: input.dtype,
                   local_reduce_fn=lambda ex, data, axis: (data > 0).sum(axis),
                   accumulate_fn=np.add,
                   tile_hint=(data.shape[1],))

  idf = expr.log(data.shape[0] * 1.0 / (df + 1)) + 1

  # The normalized frequency for a feature in a document is the feature frequency
  # divided by the root mean square of the feature frequencies in that document.
  square_sum = expr.reduce(data,
                           axis=1,
                           dtype_fn=lambda input: input.dtype,
                           local_reduce_fn=lambda ex, data, axis: np.square(data).sum(axis),
                           accumulate_fn=np.add,
                           tile_hint=(data.shape[0],))

  rms = expr.sqrt(square_sum * 1.0 / data.shape[1])

  # calculate weight normalized Tf-Idf
  data = data / rms.reshape((data.shape[0], 1)) * idf.reshape((1, data.shape[1]))

  # add up all the feature vectors with the same labels
  sum_instance_by_label = expr.ndarray((label_size, data.shape[1]),
                                       dtype=np.float64,
                                       reduce_fn=np.add,
                                       tile_hint=(label_size / len(labels.tiles), data.shape[1]))
  sum_instance_by_label = expr.shuffle(data, _sum_instance_by_label_mapper,
                                       target=sum_instance_by_label,
                                       kw={'labels': labels, 'label_size': label_size})

  # sum up all the weights for each label from the previous step
  weights_per_label = expr.sum(sum_instance_by_label, axis=1, tile_hint=(label_size,))

  # generate naive bayes per_label_and_feature weights
  weights_per_label_and_feature = expr.shuffle(sum_instance_by_label, _naive_bayes_mapper,
                                               kw={'weights_per_label': weights_per_label,
                                                   'alpha': alpha})

  return {'scores_per_label_and_feature': weights_per_label_and_feature.force(),
          'scores_per_label': weights_per_label.force(),
          }
def benchmark_svm(ctx, timer):
  print "#worker:", ctx.num_workers
  max_iter = 2
  #N = 200000 * ctx.num_workers
  N = 1000 * 64
  D = 64

  # create data
  data = expr.randn(N, D, dtype=np.float64, tile_hint=(N, util.divup(D, ctx.num_workers)))
  labels = expr.shuffle(data, _init_label_mapper, shape_hint=(data.shape[0], 1))

  t1 = datetime.now()
  w = fit(data, labels, T=max_iter).force()
  t2 = datetime.now()
  util.log_warn('train time per iteration:%s ms, final w:%s', millis(t1, t2) / max_iter, w.glom().T)

  correct = 0
  for i in range(10):
    new_data = expr.randn(1, D, dtype=np.float64, tile_hint=[1, D])
    new_label = predict(w, new_data)
    #print 'point %s, predict %s' % (new_data.glom(), new_label)
    new_data = new_data.glom()
    if (new_data[0, 0] >= new_data[0, 1] and new_label == 1.0 or
        new_data[0, 0] < new_data[0, 1] and new_label == -1.0):
      correct += 1
  print 'predict precision:', correct * 1.0 / 10
def benchmark_naive_bayes(ctx, timer):
  print "#worker:", ctx.num_workers
  N = 100000 * ctx.num_workers
  D = 128

  # create data
  data = expr.randint(N, D, low=0, high=D, tile_hint=(N / ctx.num_workers, D))
  labels = expr.eager(expr.shuffle(data, _init_label_mapper))
  #util.log_warn('data:%s, label:%s', data.glom(), labels.glom())

  util.log_warn('begin train')
  t1 = datetime.now()
  model = fit(data, labels, D)
  t2 = datetime.now()
  util.log_warn('train time:%s ms', millis(t1, t2))

  correct = 0
  for i in range(10):
    new_data = expr.randint(1, D, low=0, high=D, tile_hint=(1, D))
    new_label = predict(model, new_data)
    #print 'point %s, predict %s' % (new_data.glom(), new_label)
    new_data = new_data.glom()
    if np.isclose(new_data[0, new_label], np.max(new_data)):
      correct += 1
  print 'predict precision:', correct * 1.0 / 10
def streaming_kmeans(points, k=10, num_iters=10, num_ballkmeans_runs=2, trim_factor=0.9,
                     test_probability=0.1, correct_weight=False):
  '''
  Cluster data points using the streaming kmeans method.

  Args:
    points(DistArray): data points to be clustered.
    k(int): the final number of clusters.
    num_iters(int): the number of iterations to run in each ball kmeans run.
    num_ballkmeans_runs(int): the number of ball kmeans runs.
    trim_factor(float): the ball kmeans parameter that separates the nearest points from distant points.
    test_probability(float): the percentage of points to be chosen as the test set.
    correct_weight(bool): whether to correct the weights of the centroids.
  '''
  centroids = expr.tile_operation(points, _streaming_mapper, kw={'k': k}).evaluate()

  new_centroids = []
  for tile_result in centroids.values():
    for centroids_list in tile_result:
      new_centroids.extend(centroids_list)

  centroids = ball_kmeans(new_centroids, k, num_iters, num_ballkmeans_runs,
                          trim_factor, test_probability, correct_weight)

  centers = np.zeros((k, points.shape[1]))
  for i in range(k):
    centers[i] = centroids[i].get_center()

  return expr.shuffle(points, _cluster_mapper, kw={'centers': centers},
                      shape_hint=(points.shape[0],))
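# A minimal usage sketch for streaming_kmeans, not part of the original module. It assumes a
# Spartan context is already running and the same module-level imports as the functions above
# (expr, util, np); the shapes and parameter values are purely illustrative.
def example_streaming_kmeans():
  points = expr.rand(10000, 8)
  labels = streaming_kmeans(points, k=10, num_iters=10)
  util.log_info('first labels: %s', labels.glom()[:10])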
def benchmark_naive_bayes(ctx, timer):
  print "#worker:", ctx.num_workers
  #N = 100000 * ctx.num_workers
  N = 10000 * 64
  D = 128

  # create data
  data = expr.randint(N, D, low=0, high=D, tile_hint=(N, D / ctx.num_workers))
  labels = expr.shuffle(expr.ndarray((data.shape[0], 1), dtype=np.int),
                        _init_label_mapper,
                        kw={'data': data},
                        shape_hint=(data.shape[0], 1),
                        cost_hint={hash(data): {'00': 0, '10': np.prod(data.shape)}})
  #util.log_warn('data:%s, label:%s', data.glom(), labels.glom())

  util.log_warn('begin train')
  t1 = datetime.now()
  model = fit(data, labels, D)
  t2 = datetime.now()
  util.log_warn('train time:%s ms', millis(t1, t2))

  correct = 0
  for i in range(10):
    new_data = expr.randint(1, D, low=0, high=D, tile_hint=(1, D))
    new_label = predict(model, new_data)
    #print 'point %s, predict %s' % (new_data.glom(), new_label)
    new_data = new_data.glom()
    if np.isclose(new_data[0, new_label], np.max(new_data)):
      correct += 1
  print 'predict precision:', correct * 1.0 / 10
def _evaluate(self, ctx, deps):
  V, M, U = deps['V'], deps['M'], deps['U']

  strata = _compute_strata(V)
  util.log_info('Start eval')

  for i, stratum in enumerate(strata):
    util.log_info('Processing stratum: %d of %d (size = %d)', i, len(strata), len(stratum))
    #for ex in stratum: print ex

    worklist = set(stratum)
    expr.shuffle(V, sgd_netflix_mapper,
                 kw={'V': lazify(V), 'M': lazify(M), 'U': lazify(U),
                     'worklist': worklist}).force()

  util.log_info('Eval done.')
def test_slice_shuffle(self):
  x = expr.arange((TEST_SIZE, TEST_SIZE))
  z = x[5:8, 5:8]
  z = expr.shuffle(z, add_one_extent)
  val = z.force()
  nx = np.arange(TEST_SIZE * TEST_SIZE).reshape(TEST_SIZE, TEST_SIZE)
  Assert.all_eq(val.glom(), nx[5:8, 5:8] + 1)
def saveAsTextFile(ctx, dim):
  matrix = eager(
      expr.shuffle(
          expr.ndarray((dim, dim), dtype=np.int32,
                       tile_hint=(dim, dim / ctx.num_workers)),
                       #tile_hint=(2, 2)),
          make_matrix,
      ))
def pagerank_sparse(num_pages, num_outlinks, same_site_prob):
  result = expr.ndarray((num_pages, num_pages), dtype=np.float32, sparse=True)
  cost = num_pages * num_pages
  return expr.shuffle(result, target=result, fn=_make_site_sparse,
                      kw={'num_outlinks': num_outlinks,
                          'same_site_prob': same_site_prob},
                      cost_hint={hash(result): {'11': 0, '01': cost, '10': cost, '00': cost}})
def fit(data, labels, label_size, alpha=1.0):
  '''
  Train a standard naive bayes model.

  Args:
    data(Expr): documents to be trained.
    labels(Expr): the correct labels of the training data.
    label_size(int): the number of different labels.
    alpha(float): alpha parameter of the naive bayes model.
  '''
  # calc document freq
  df = expr.reduce(data,
                   axis=0,
                   dtype_fn=lambda input: input.dtype,
                   local_reduce_fn=lambda ex, data, axis: (data > 0).sum(axis),
                   accumulate_fn=np.add)

  idf = expr.log(data.shape[0] * 1.0 / (df + 1)) + 1

  # The normalized frequency for a feature in a document is the feature frequency
  # divided by the root mean square of the feature frequencies in that document.
  square_sum = expr.reduce(data,
                           axis=1,
                           dtype_fn=lambda input: input.dtype,
                           local_reduce_fn=lambda ex, data, axis: np.square(data).sum(axis),
                           accumulate_fn=np.add)

  rms = expr.sqrt(square_sum * 1.0 / data.shape[1])

  # calculate weight normalized Tf-Idf
  data = data / rms.reshape((data.shape[0], 1)) * idf.reshape((1, data.shape[1]))

  # add up all the feature vectors with the same labels
  #weights_per_label_and_feature = expr.ndarray((label_size, data.shape[1]), dtype=np.float64)
  #for i in range(label_size):
  #  i_mask = (labels == i)
  #  weights_per_label_and_feature = expr.assign(weights_per_label_and_feature, np.s_[i, :], expr.sum(data[i_mask, :], axis=0))
  weights_per_label_and_feature = expr.shuffle(expr.retile(data, tile_hint=util.calc_tile_hint(data, axis=0)),
                                               _sum_instance_by_label_mapper,
                                               target=expr.ndarray((label_size, data.shape[1]),
                                                                   dtype=np.float64, reduce_fn=np.add),
                                               kw={'labels': labels, 'label_size': label_size},
                                               cost_hint={hash(labels): {'00': 0, '01': np.prod(labels.shape)}})

  # sum up all the weights for each label from the previous step
  weights_per_label = expr.sum(weights_per_label_and_feature, axis=1)

  # generate naive bayes per_label_and_feature weights
  weights_per_label_and_feature = expr.log((weights_per_label_and_feature + alpha) /
                                           (weights_per_label.reshape((weights_per_label.shape[0], 1)) +
                                            alpha * weights_per_label_and_feature.shape[1]))

  return {'scores_per_label_and_feature': weights_per_label_and_feature.optimized().force(),
          'scores_per_label': weights_per_label.optimized().force(),
          }
def spectral_cluster(points, k=10, num_iter=10, similarity_measurement='rbf'):
  '''
  Cluster data points using the kmeans spectral clustering method.

  Args:
    points(Expr or DistArray): the data points to be clustered.
    k(int): the number of clusters we need to generate.
    num_iter(int): the max number of iterations that the kmeans clustering method runs.
    similarity_measurement(str): distance method used to measure similarity between two points.
  '''
  # calculate similarity for each pair of points to generate the adjacency matrix A
  A = expr.shuffle(points, _row_similarity_mapper,
                   kw={'similarity_measurement': similarity_measurement},
                   shape_hint=(points.shape[0], points.shape[0]))

  num_dims = A.shape[1]

  # Construct the diagonal matrix D
  D = expr.sum(A, axis=1, tile_hint=(A.shape[0],))

  # Calculate the normalized Laplacian of the form: L = D^(-0.5)AD^(-0.5)
  L = expr.shuffle(A, _laplacian_mapper, kw={'D': D}, shape_hint=A.shape)

  # Perform eigen-decomposition using the Lanczos solver
  overshoot = min(k * 2, num_dims)
  d, U = lanczos.solve(L, L, overshoot, True)
  U = U[:, 0:k]

  # Generate initial clusters by picking a row as a center if that row contains the max
  # eigenvalue in its column
  init_clusters = U[np.argmax(U, axis=0)]

  # Run kmeans clustering with init_clusters
  kmeans = KMeans(k, num_iter)
  U = expr.from_numpy(U)
  centers, labels = kmeans.fit(U, init_clusters)

  return labels
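# A minimal usage sketch for spectral_cluster, not part of the original module. It assumes a
# running Spartan context and the same module-level imports as above; the point matrix and
# cluster count are illustrative only.
def example_spectral_cluster():
  points = expr.rand(1000, 16)
  labels = spectral_cluster(points, k=5, num_iter=10, similarity_measurement='rbf')
  util.log_info('cluster assignments: %s', labels.glom()[:10])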
def pagerank_sparse(num_pages, num_outlinks, same_site_prob, hint):
  return expr.shuffle(
      expr.ndarray((num_pages, num_pages), dtype=np.float32, tile_hint=hint, sparse=True),
      fn=_make_site_sparse,
      kw={'num_outlinks': num_outlinks,
          'same_site_prob': same_site_prob})
def _evaluate(self, ctx, deps):
  V, M, U = deps['V'], deps['M'], deps['U']

  strata = _compute_strata(V)
  util.log_info('Start eval')

  for i, stratum in enumerate(strata):
    util.log_info('Processing stratum: %d of %d (size = %d)', i, len(strata), len(stratum))
    #for ex in stratum: print ex

    worklist = set(stratum)
    expr.shuffle(V, sgd_netflix_mapper,
                 kw={'V': lazify(V), 'M': lazify(M), 'U': lazify(U),
                     'worklist': worklist}).evaluate()

  util.log_info('Eval done.')
def learn_topics(terms_docs_matrix, k_topics, alpha=0.1, eta=0.1, max_iter=10, max_iter_per_doc=1):
  '''
  Use the Collapsed Variational Bayes method (Mahout implementation) to train an LDA topic model.

  Args:
    terms_docs_matrix(Expr or DistArray): the count of each term in each document.
    k_topics: the number of topics we need to find.
    alpha(float): parameter of the LDA model.
    eta(float): parameter of the LDA model.
    max_iter(int): the max iterations to train the LDA topic model.
    max_iter_per_doc: the max iterations to train each document.
  '''
  topic_term_counts = expr.rand(k_topics, terms_docs_matrix.shape[0],
                                tile_hint=(k_topics, terms_docs_matrix.shape[0]))

  for i in range(max_iter):
    new_topic_term_counts = expr.ndarray((k_topics, terms_docs_matrix.shape[0]),
                                         dtype=np.float64,
                                         reduce_fn=np.add,
                                         tile_hint=(k_topics, terms_docs_matrix.shape[0]))
    topic_term_counts = expr.shuffle(terms_docs_matrix, _lda_mapper,
                                     target=new_topic_term_counts,
                                     kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta,
                                         'max_iter_per_doc': max_iter_per_doc,
                                         'topic_term_counts': topic_term_counts})

  # calculate the doc-topic inference
  doc_topics = expr.shuffle(terms_docs_matrix, _lda_doc_topic_mapper,
                            kw={'k_topics': k_topics, 'alpha': alpha, 'eta': eta,
                                'max_iter_per_doc': max_iter_per_doc,
                                'topic_term_counts': topic_term_counts})

  # normalize the topic-term distribution
  norm_val = expr.reduce(topic_term_counts,
                         axis=1,
                         dtype_fn=lambda input: input.dtype,
                         local_reduce_fn=lambda ex, data, axis: np.abs(data).sum(axis),
                         accumulate_fn=np.add)
  topic_term_counts = topic_term_counts / norm_val.reshape((topic_term_counts.shape[0], 1))

  return doc_topics, topic_term_counts
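# A minimal usage sketch for learn_topics, not part of the original module. The term-document
# matrix here is random and only illustrates the expected orientation (terms as rows, documents
# as columns, matching how topic_term_counts is shaped above); a real corpus would supply
# actual term counts.
def example_learn_topics():
  terms_docs_matrix = expr.randint(2000, 100, low=0, high=5)
  doc_topics, topic_term_counts = learn_topics(terms_docs_matrix, k_topics=10, max_iter=5)
  util.log_info('topic-term shape: %s', topic_term_counts.shape)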
def fuzzy_kmeans(points, k=10, num_iter=10, m=2.0, centers=None):
  '''
  Cluster data points using the fuzzy kmeans clustering method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    k(int): the number of clusters.
    num_iter(int): the max iterations to run.
    m(float): the parameter of fuzzy kmeans.
    centers(Expr or DistArray): the initialized centers of each cluster.
  '''
  points = expr.force(points)
  num_dim = points.shape[1]
  if centers is None:
    centers = expr.rand(k, num_dim, tile_hint=(k, num_dim))

  labels = expr.zeros((points.shape[0],), dtype=np.int,
                      tile_hint=(points.shape[0] / len(points.tiles),))

  for iter in range(num_iter):
    new_centers = expr.ndarray((k, num_dim), reduce_fn=lambda a, b: a + b,
                               tile_hint=(k, num_dim))
    new_counts = expr.ndarray((k, 1), dtype=np.float, reduce_fn=lambda a, b: a + b,
                              tile_hint=(k, 1))
    expr.shuffle(points, _fuzzy_kmeans_mapper,
                 kw={'old_centers': centers,
                     'centers': new_centers,
                     'counts': new_counts,
                     'labels': labels,
                     'm': m}).force()

    # If any centroids don't have any points assigned to them.
    zcount_indices = (new_counts.glom() == 0).reshape(k)
    if np.any(zcount_indices):
      # One or more centroids may not have any points assigned to them, which results in their
      # position being the zero-vector. We reseed these centroids with new random values
      # and set their counts to 1 in order to avoid dividing by zero.
      new_counts[zcount_indices, :] = 1
      new_centers[zcount_indices, :] = np.random.rand(np.count_nonzero(zcount_indices), num_dim)

    centers = new_centers / new_counts

  return labels
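# A minimal usage sketch for fuzzy_kmeans, not part of the original module. It assumes a running
# Spartan context and the same module-level imports as above; the data shape and fuzziness
# parameter m are illustrative.
def example_fuzzy_kmeans():
  points = expr.rand(5000, 4)
  labels = fuzzy_kmeans(points, k=8, num_iter=10, m=2.0)
  util.log_info('labels: %s', labels.glom()[:10])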
def fit(self, X, centers=None):
  """Compute k-means clustering.

  Parameters
  ----------
  X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
  centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
  """
  X = expr.force(X)
  num_dim = X.shape[1]
  labels = expr.zeros((X.shape[0], 1), dtype=np.int, tile_hint=X.tile_shape())

  if centers is None:
    centers = np.random.rand(self.n_clusters, num_dim)

  for i in range(self.n_iter):
    # Reset them to zero.
    new_centers = expr.ndarray((self.n_clusters, num_dim), reduce_fn=lambda a, b: a + b)
    new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int, reduce_fn=lambda a, b: a + b)

    _ = expr.shuffle(X, _find_cluster_mapper,
                     kw={'d_pts': X,
                         'old_centers': centers,
                         'new_centers': new_centers,
                         'new_counts': new_counts,
                         'labels': labels})
    _.force()

    new_counts = new_counts.glom()
    new_centers = new_centers.glom()

    # If any centroids don't have any points assigned to them.
    zcount_indices = (new_counts == 0).reshape(self.n_clusters)
    if np.any(zcount_indices):
      # One or more centroids may not have any points assigned to them,
      # which results in their position being the zero-vector. We reseed these
      # centroids with new random values.
      n_points = np.count_nonzero(zcount_indices)
      # In order to avoid dividing by zero.
      new_counts[zcount_indices] = 1
      new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

    new_centers = new_centers / new_counts
    centers = new_centers

  return centers, labels
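# A minimal usage sketch for the shuffle-based KMeans.fit above, not part of the original module.
# It assumes the KMeans class is constructed as KMeans(n_clusters, n_iter), matching how it is
# used by spectral_cluster elsewhere in this file; data shapes are illustrative.
def example_kmeans():
  X = expr.rand(10000, 16)
  kmeans = KMeans(10, 10)
  centers, labels = kmeans.fit(X)
  util.log_info('centers shape: %s', centers.shape)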
def canopy_cluster(points, t1=0.1, t2=0.1, cf=1):
  '''
  A simple implementation of the canopy clustering method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    t1(float): the outer distance threshold between a center point and the points within its canopy.
    t2(float): the inner distance threshold between a center point and the points within its canopy.
    cf(int): the minimum canopy size.
  '''
  new_points = expr.tile_operation(points, _canopy_mapper,
                                   kw={'t1': t1, 't2': t2, 'cf': cf}).evaluate()
  centers = find_centers(new_points.values(), t1, t2, cf)
  labels = expr.shuffle(points, _cluster_mapper, kw={'centers': centers},
                        shape_hint=(points.shape[0],))
  return labels
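# A minimal usage sketch for canopy_cluster, not part of the original module. The thresholds and
# data are illustrative; in practice t1 and t2 depend on the distance scale of the input points.
def example_canopy_cluster():
  points = expr.rand(5000, 4)
  labels = canopy_cluster(points, t1=0.5, t2=0.3, cf=5)
  util.log_info('canopy labels: %s', labels.glom()[:10])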
def als(A, la=0.065, alpha=40, implicit_feedback=False, num_features=20, num_iter=10):
  '''
  Compute the factorization A = U M' using the alternating least-squares (ALS) method,
  where `A` is the "ratings" matrix which maps from a user and item to a rating score,
  and `U` and `M` are the factor matrices representing user and item preferences.

  Args:
    A(Expr or DistArray): the rating matrix which maps from a user and item to a rating score.
    la(float): the regularization parameter of ALS.
    alpha(int): confidence parameter used on implicit feedback.
    implicit_feedback(bool): whether to use the implicit feedback method for ALS.
    num_features(int): dimension of the feature space.
    num_iter(int): max iterations to run.
  '''
  num_users = A.shape[0]
  num_items = A.shape[1]

  AT = expr.transpose(A)

  avg_rating = expr.sum(A, axis=0) * 1.0 / expr.count_nonzero(A, axis=0)

  M = expr.rand(num_items, num_features)
  M = expr.assign(M, np.s_[:, 0], avg_rating.reshape((avg_rating.shape[0], 1)))

  for i in range(num_iter):
    # Recompute U
    U = expr.shuffle(expr.retile(A, tile_hint=util.calc_tile_hint(A, axis=0)),
                     _solve_U_or_M_mapper,
                     kw={'U_or_M': M, 'la': la, 'alpha': alpha,
                         'implicit_feedback': implicit_feedback},
                     shape_hint=(num_users, num_features)).optimized()

    # Recompute M
    M = expr.shuffle(expr.retile(AT, tile_hint=util.calc_tile_hint(AT, axis=0)),
                     _solve_U_or_M_mapper,
                     kw={'U_or_M': U, 'la': la, 'alpha': alpha,
                         'implicit_feedback': implicit_feedback},
                     shape_hint=(num_items, num_features)).optimized()

  return U, M
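# A minimal usage sketch for als, not part of the original module. The rating matrix here is
# random and dense purely for illustration; a real ratings matrix would typically be sparse,
# with zeros marking unrated user/item pairs.
def example_als():
  A = expr.rand(1000, 500)   # 1000 users, 500 items
  U, M = als(A, num_features=20, num_iter=5)
  util.log_info('U: %s  M: %s', U.shape, M.shape)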
def als(A, la=0.065, alpha=40, implicit_feedback=False, num_features=20, num_iter=10):
  '''
  Compute the factorization A = U M' using the alternating least-squares (ALS) method,
  where `A` is the "ratings" matrix which maps from a user and item to a rating score,
  and `U` and `M` are the factor matrices representing user and item preferences.

  Args:
    A(Expr or DistArray): the rating matrix which maps from a user and item to a rating score.
    la(float): the regularization parameter of ALS.
    alpha(int): confidence parameter used on implicit feedback.
    implicit_feedback(bool): whether to use the implicit feedback method for ALS.
    num_features(int): dimension of the feature space.
    num_iter(int): max iterations to run.
  '''
  A = expr.force(A)
  AT = expr.shuffle(expr.ndarray((A.shape[1], A.shape[0]), dtype=A.dtype,
                                 tile_hint=(A.shape[1] / len(A.tiles), A.shape[0])),
                    _transpose_mapper, kw={'orig_array': A})

  num_items = A.shape[1]

  avg_rating = expr.sum(A, axis=0, tile_hint=(num_items / len(A.tiles),)) * 1.0 / \
               expr.count_nonzero(A, axis=0, tile_hint=(num_items / len(A.tiles),))

  M = expr.shuffle(expr.ndarray((num_items, num_features),
                                tile_hint=(num_items / len(A.tiles), num_features)),
                   _init_M_mapper, kw={'avg_rating': avg_rating})
  #util.log_warn('avg_rating:%s M:%s', avg_rating.glom(), M.glom())

  for i in range(num_iter):
    # Recompute U
    U = expr.shuffle(A, _solve_U_or_M_mapper,
                     kw={'U_or_M': M, 'la': la, 'alpha': alpha,
                         'implicit_feedback': implicit_feedback})

    # Recompute M
    M = expr.shuffle(AT, _solve_U_or_M_mapper,
                     kw={'U_or_M': U, 'la': la, 'alpha': alpha,
                         'implicit_feedback': implicit_feedback})

  return U, M
def spectral_cluster(points, k=10, num_iter=10, similarity_measurement='rbf'):
  '''
  Cluster data points using the kmeans spectral clustering method.

  Args:
    points(Expr or DistArray): the data points to be clustered.
    k(int): the number of clusters we need to generate.
    num_iter(int): the max number of iterations that the kmeans clustering method runs.
    similarity_measurement(str): distance method used to measure similarity between two points.
  '''
  # calculate similarity for each pair of points to generate the adjacency matrix A
  A = expr.shuffle(points, _row_similarity_mapper,
                   kw={'similarity_measurement': similarity_measurement})

  num_dims = A.shape[1]

  # Construct the diagonal matrix D
  D = expr.sum(A, axis=1, tile_hint=(A.shape[0],))

  # Calculate the normalized Laplacian of the form: L = D^(-0.5)AD^(-0.5)
  L = expr.shuffle(A, _laplacian_mapper, kw={'D': D})

  # Perform eigen-decomposition using the Lanczos solver
  overshoot = min(k * 2, num_dims)
  d, U = lanczos.solve(L, L, overshoot, True)
  U = U[:, 0:k]

  # Generate initial clusters by picking a row as a center if that row contains the max
  # eigenvalue in its column
  init_clusters = U[np.argmax(U, axis=0)]

  # Run kmeans clustering with init_clusters
  kmeans = KMeans(k, num_iter)
  U = expr.from_numpy(U)
  centers, labels = kmeans.fit(U, init_clusters)

  return labels
def benchmark_pagerank(ctx, timer):
  num_pages = PAGES_PER_WORKER * ctx.num_workers
  util.log_info('Total pages: %s', num_pages)

  wts = eager(
      expr.shuffle(
          expr.ndarray((num_pages, num_pages),
                       dtype=np.float32,
                       tile_hint=(num_pages, PAGES_PER_WORKER / 8)),
          make_weights,
      ))

  p = eager(expr.ones((num_pages, 1),
                      tile_hint=(PAGES_PER_WORKER / 8, 1),
                      dtype=np.float32))

  for i in range(3):
    timer.time_op('pagerank', lambda: expr.dot(wts, p).force())
def fit(data, labels, num_tiles, T=50, la=1.0):
  '''
  Train an SVM model using the disdca (2013) algorithm.

  Args:
    data(Expr): points to be trained.
    labels(Expr): the correct labels of the training data.
    num_tiles(int): the total number of tiles of the training data.
    T(int): max training iterations.
    la(float): lambda parameter of this SVM model.
  '''
  w = None
  m = data.shape[0] / num_tiles
  alpha = expr.zeros((m * num_tiles, 1), dtype=np.float64, tile_hint=(m, 1)).force()

  for i in range(T):
    new_weight = expr.ndarray((data.shape[1], 1), dtype=np.float64,
                              reduce_fn=np.add, tile_hint=(data.shape[1], 1))
    new_weight = expr.shuffle(data, _svm_mapper, target=new_weight,
                              kw={'labels': labels, 'alpha': alpha, 'w': w, 'm': m,
                                  'scale': num_tiles, 'lambda_n': la * data.shape[0]})
    w = new_weight / num_tiles

  return w
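# A minimal usage sketch for the num_tiles variant of the SVM fit above, not part of the original
# module. It assumes `ctx` is the Spartan context and that the data is tiled by rows into one tile
# per worker, so num_tiles equals ctx.num_workers; shapes and the label mapper are illustrative.
def example_svm_fit(ctx):
  N, D = 1000 * ctx.num_workers, 64
  data = expr.randn(N, D, dtype=np.float64, tile_hint=(N / ctx.num_workers, D))
  labels = expr.shuffle(data, _init_label_mapper, shape_hint=(data.shape[0], 1))
  w = fit(data, labels, num_tiles=ctx.num_workers, T=10)
  util.log_info('w: %s', w.glom().T)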
def fit(data, labels, T=50, la=1.0):
  '''
  Train an SVM model using the disdca (2013) algorithm.

  Args:
    data(Expr): points to be trained.
    labels(Expr): the correct labels of the training data.
    T(int): max training iterations.
    la(float): lambda parameter of this SVM model.
  '''
  w = expr.zeros((data.shape[1], 1), dtype=np.float64)
  alpha = expr.zeros((data.shape[0], 1), dtype=np.float64)

  for i in range(T):
    alpha = expr.shuffle(expr.retile(data, tile_hint=util.calc_tile_hint(data, axis=0)),
                         _svm_mapper,
                         kw={'labels': labels, 'alpha': alpha, 'w': w,
                             'lambda_n': la * data.shape[0]},
                         shape_hint=alpha.shape,
                         cost_hint={hash(labels): {'00': 0, '01': np.prod(labels.shape)},
                                    hash(alpha): {'00': 0, '01': np.prod(alpha.shape)}})
    w = expr.sum(data * alpha * 1.0 / la / data.shape[0], axis=0).reshape((data.shape[1], 1))
    w = w.optimized()

  return w
def pagerankDistributed(ctx, numPage, numIters, alpha):
  sGenerate = time.time()
  rank = eager(expr.ones((numPage, 1),
                         tile_hint=(numPage / ctx.num_workers, 1),
                         dtype=np.float32))
  linkMatrix = eager(
      expr.shuffle(
          expr.ndarray((numPage, numPage), dtype=np.float32,
                       tile_hint=(numPage, numPage / ctx.num_workers)),
          make_weights,
      ))
  eGenerate = time.time()
  util.log_info("**pagerank** rank init finished")

  startCompute = time.time()
  for i in range(numIters):
    #rank = ((1 - alpha) * expr.dot(linkMatrix, rank, tile_hint=(numPage, numPage / 10))) + belta
    rank = expr.dot(linkMatrix, rank, tile_hint=(numPage, numPage / 10))
  rank.evaluate()
  endCompute = time.time()
  util.log_info("**pagerank** compute finished")
  return (eGenerate - sGenerate, endCompute - startCompute)
def test_pagerank(self):
  _skip_if_travis()
  OUTLINKS_PER_PAGE = 10
  PAGES_PER_WORKER = 1000000
  num_pages = PAGES_PER_WORKER * self.ctx.num_workers

  wts = expr.shuffle(
      expr.ndarray((num_pages, num_pages),
                   dtype=np.float32,
                   tile_hint=(num_pages, PAGES_PER_WORKER / 8)),
      make_weights,
  )

  start = time.time()
  p = expr.eager(expr.ones((num_pages, 1),
                           tile_hint=(PAGES_PER_WORKER / 8, 1),
                           dtype=np.float32))
  expr.dot(wts, p, tile_hint=(PAGES_PER_WORKER / 8, 1)).evaluate()
  cost = time.time() - start
  self._verify_cost("pagerank", cost)
def canopy_cluster(points, t1=0.1, t2=0.1, cf=1):
  '''
  A simple implementation of the canopy clustering method.

  Args:
    points(Expr or DistArray): the input data points matrix.
    t1(float): the outer distance threshold between a center point and the points within its canopy.
    t2(float): the inner distance threshold between a center point and the points within its canopy.
    cf(int): the minimum canopy size.
  '''
  new_points = expr.tile_operation(points, _canopy_mapper,
                                   kw={'t1': t1, 't2': t2, 'cf': cf}).force()
  centers = find_centers(new_points.values(), t1, t2, cf)
  labels = expr.shuffle(points, _cluster_mapper, kw={'centers': centers},
                        shape_hint=(points.shape[0],))
  return labels
def fit(self, X, centers=None, implementation='outer'):
  """Compute k-means clustering.

  Parameters
  ----------
  X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
  centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
  """
  num_dim = X.shape[1]
  num_points = X.shape[0]

  labels = expr.zeros((num_points, 1), dtype=np.int)

  if implementation == 'map2':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper, fn_kw={"centers": centers},
                         shape=(X.shape[0],))

      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0],))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector. We reseed these
        # centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # In order to avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
    return centers, labels

  elif implementation == 'outer':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.outer((X, centers), (0, None), fn=kmeans_outer_dist_mapper,
                          shape=(X.shape[0],))
      #labels = expr.argmin(distances, axis=1)
      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0],))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector. We reseed these
        # centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # In order to avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'broadcast':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      util.log_warn("k_means_ %d %d", i, time.time())
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
      distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
      labels = expr.argmin(distances, axis=1)

      center_idx = expr.arange((1, centers.shape[0]))
      matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
      matches = matches.astype(np.int64)
      counts = expr.sum(matches, axis=0)
      centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0], matches.shape[1], 1)),
                         axis=0)

      counts = counts.optimized().glom()
      centers = centers.optimized().glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector. We reseed these
        # centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # In order to avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'shuffle':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      # Reset them to zero.
      new_centers = expr.ndarray((self.n_clusters, num_dim), reduce_fn=lambda a, b: a + b)
      new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int, reduce_fn=lambda a, b: a + b)

      _ = expr.shuffle(X, _find_cluster_mapper,
                       kw={'d_pts': X,
                           'old_centers': centers,
                           'new_centers': new_centers,
                           'new_counts': new_counts,
                           'labels': labels},
                       shape_hint=(1,),
                       cost_hint={hash(labels): {'00': 0, '01': np.prod(labels.shape)}})
      _.force()

      new_counts = new_counts.glom()
      new_centers = new_centers.glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (new_counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector. We reseed these
        # centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # In order to avoid dividing by zero.
        new_counts[zcount_indices] = 1
        new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      new_centers = new_centers / new_counts
      centers = new_centers

    return centers, labels
def fit(self, X, centers=None, implementation='map2'):
  """Compute k-means clustering.

  Parameters
  ----------
  X : spartan matrix, shape=(n_samples, n_features). It should be tiled by rows.
  centers : numpy.ndarray. The initial centers. If None, it will be randomly generated.
  """
  num_dim = X.shape[1]
  num_points = X.shape[0]

  labels = expr.zeros((num_points, 1), dtype=np.int)

  if implementation == 'map2':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.map2(X, 0, fn=kmeans_map2_dist_mapper, fn_kw={"centers": centers},
                         shape=(X.shape[0],))

      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0],))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector. We reseed these
        # centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # In order to avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
    return centers, labels

  elif implementation == 'outer':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      labels = expr.outer((X, centers), (0, None), fn=kmeans_outer_dist_mapper,
                          shape=(X.shape[0],))
      #labels = expr.argmin(distances, axis=1)
      counts = expr.map2(labels, 0, fn=kmeans_count_mapper,
                         fn_kw={'centers_count': self.n_clusters},
                         shape=(centers.shape[0],))
      new_centers = expr.map2((X, labels), (0, 0), fn=kmeans_center_mapper,
                              fn_kw={'centers_count': self.n_clusters},
                              shape=(centers.shape[0], centers.shape[1]))

      counts = counts.optimized().glom()
      centers = new_centers.optimized().glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector. We reseed these
        # centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # In order to avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'broadcast':
    if centers is None:
      centers = expr.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      util.log_warn("k_means_ %d %d", i, time.time())
      X_broadcast = expr.reshape(X, (X.shape[0], 1, X.shape[1]))
      centers_broadcast = expr.reshape(centers, (1, centers.shape[0], centers.shape[1]))
      distances = expr.sum(expr.square(X_broadcast - centers_broadcast), axis=2)
      labels = expr.argmin(distances, axis=1)

      center_idx = expr.arange((1, centers.shape[0]))
      matches = expr.reshape(labels, (labels.shape[0], 1)) == center_idx
      matches = matches.astype(np.int64)
      counts = expr.sum(matches, axis=0)
      centers = expr.sum(X_broadcast * expr.reshape(matches, (matches.shape[0], matches.shape[1], 1)),
                         axis=0)

      counts = counts.optimized().glom()
      centers = centers.optimized().glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector. We reseed these
        # centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # In order to avoid dividing by zero.
        counts[zcount_indices] = 1
        centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      centers = centers / counts.reshape(centers.shape[0], 1)
      centers = expr.from_numpy(centers)
    return centers, labels

  elif implementation == 'shuffle':
    if centers is None:
      centers = np.random.rand(self.n_clusters, num_dim)

    for i in range(self.n_iter):
      # Reset them to zero.
      new_centers = expr.ndarray((self.n_clusters, num_dim), reduce_fn=lambda a, b: a + b)
      new_counts = expr.ndarray((self.n_clusters, 1), dtype=np.int, reduce_fn=lambda a, b: a + b)

      _ = expr.shuffle(X, _find_cluster_mapper,
                       kw={'d_pts': X,
                           'old_centers': centers,
                           'new_centers': new_centers,
                           'new_counts': new_counts,
                           'labels': labels},
                       shape_hint=(1,),
                       cost_hint={hash(labels): {'00': 0, '01': np.prod(labels.shape)}})
      _.force()

      new_counts = new_counts.glom()
      new_centers = new_centers.glom()

      # If any centroids don't have any points assigned to them.
      zcount_indices = (new_counts == 0).reshape(self.n_clusters)
      if np.any(zcount_indices):
        # One or more centroids may not have any points assigned to them,
        # which results in their position being the zero-vector. We reseed these
        # centroids with new random values.
        n_points = np.count_nonzero(zcount_indices)
        # In order to avoid dividing by zero.
        new_counts[zcount_indices] = 1
        new_centers[zcount_indices, :] = np.random.randn(n_points, num_dim)

      new_centers = new_centers / new_counts
      centers = new_centers

    return centers, labels