def _svm_mapper(array, ex, labels, alpha, w, m, scale, lambda_n):
    '''
    Local linear SVM solver.

    Args:
      array(DistArray): features of the training data.
      ex(Extent): Region being processed.
      labels(DistArray): labels of the training data.
      alpha(DistArray): alpha vector which is the parameter optimized by SVM.
      w(DistArray): weight vector of the previous iteration; ``None`` means
        start from an all-zero weight vector.
      m(int): number of samples to train (now we set it to the whole local data set).
      scale(int): number of tiles
      lambda_n: lambda/size(total train data) which is the parameter of this svm model.

    Yields:
      A pair ``(extent, new_w)`` whose extent covers the whole
      ``(array.shape[1], 1)`` weight vector.
    '''
    X = array.fetch(ex)
    Y = labels.fetch(extent.create((ex.ul[0], 0), (ex.lr[0], 1), labels.shape))

    # Floor division keeps the tile index an integer even when `/` means true
    # division; a float index would turn the alpha extent bounds into floats.
    rows_in_tile = ex.lr[0] - ex.ul[0]
    tile_id = ex.ul[0] // rows_in_tile

    ex_alpha = extent.create((tile_id * m, 0), ((tile_id + 1) * m, 1), alpha.shape)
    old_alpha = alpha.fetch(ex_alpha)

    old_w = np.zeros((X.shape[1], 1)) if w is None else w[:]

    new_w, new_alpha = _svm_disdca_train(X, Y, old_alpha, old_w, m, scale, lambda_n)

    # update the alpha vector
    alpha.update(ex_alpha, new_alpha)

    # reduce the weight vector: every tile yields the same full-size extent
    # (presumably combined by the caller's reducer -- confirm at the call site).
    yield extent.create((0, 0), (array.shape[1], 1), (array.shape[1], 1)), new_w
def cholesky(A):
    '''
    Cholesky matrix decomposition.

    The matrix is processed tile by tile; the number of tiles per side is
    taken as the integer square root of ``len(A.tiles)``.

    Args:
      A(Expr): matrix to be decomposed

    Returns:
      The result of the chained ``expr.region_map`` calls, with the blocks
      above the block diagonal overwritten by zeros.
    '''
    A = expr.force(A)
    n = int(math.sqrt(len(A.tiles)))
    # Floor division: tile_size is used to build extent coordinates and must
    # stay integral regardless of the meaning of `/`.
    tile_size = A.shape[0] // n

    for k in range(n):
        lo = k * tile_size
        hi = (k + 1) * tile_size

        # A[k,k] = DPOTRF(A[k,k])
        diag_ex = get_ex(k, k, tile_size, A.shape)
        A = expr.region_map(A, diag_ex, _cholesky_dpotrf_mapper)

        if k == n - 1:
            break

        # A[l,k] = DTRSM(A[k,k], A[l,k]) l -> [k+1,n)
        col_ex = extent.create((hi, lo), (n * tile_size, hi), A.shape)
        A = expr.region_map(A, col_ex, _cholesky_dtrsm_mapper,
                            fn_kw=dict(diag_ex=diag_ex))

        # A[m,m] = DSYRK(A[m,k], A[m,m]) m -> [k+1,n)
        # A[l,m] = DGEMM(A[l,k], A[m,k], A[l,m]) m -> [k+1,n) l -> [m+1,n)
        col_exs = [extent.create((m * tile_size, m * tile_size),
                                 (n * tile_size, (m + 1) * tile_size),
                                 A.shape)
                   for m in range(k + 1, n)]
        A = expr.region_map(A, col_exs, _cholesky_dsyrk_dgemm_mapper,
                            fn_kw=dict(k=k))

    # update the right corner to 0: rows above the diagonal block of every
    # block column m >= 1.
    col_exs = [extent.create((0, m * tile_size),
                             (m * tile_size, (m + 1) * tile_size),
                             A.shape)
               for m in range(1, n)]
    A = expr.region_map(A, col_exs,
                        lambda input, array, ex: np.zeros(input.shape, input.dtype))
    return A
def _solve_U_or_M_mapper(array, ex, U_or_M, la, alpha, implicit_feedback):
    '''
    given A and U (or M), solve M (or U) such that A = U M'
    using alternating least-squares factorization method

    Args:
      array(DistArray): the user-item (or item-user) rating matrix.
      ex(Extent): region being processed.
      U_or_M(DistArray): the matrix U (or M).
      la(float): the parameter of the als.
      alpha(int): confidence parameter used on implicit feedback.
      implicit_feedback(bool): whether using implicit_feedback method for als.

    Yields:
      ``(extent, result)`` where ``result`` holds one solved row per rating
      row of this region.
    '''
    # Fetch complete rows of the rating matrix for this region.
    full_rows_ex = extent.create((ex.ul[0], 0), (ex.lr[0], array.shape[1]), array.shape)
    rating_matrix = array.fetch(full_rows_ex)
    U_or_M = U_or_M[:]

    n_rows = rating_matrix.shape[0]
    n_features = U_or_M.shape[1]
    result = np.zeros((n_rows, n_features))

    if implicit_feedback:
        # These products do not depend on the row, so compute them once.
        Y = U_or_M
        YT = Y.T
        YTY = np.dot(YT, Y)
        for row in range(n_rows):
            result[row] = _implicit_feedback_als_solver(rating_matrix[row], la, alpha,
                                                        Y, YT, YTY)
    else:
        for row in range(n_rows):
            # Only the entries that carry a rating take part in the solve.
            rated = rating_matrix[row].nonzero()[0]
            ratings = rating_matrix[row, rated]
            features = U_or_M[rated]
            result[row] = _als_solver(features, ratings, la)

    target_ex = extent.create((ex.ul[0], 0), (ex.lr[0], n_features),
                              (array.shape[0], n_features))
    yield target_ex, result
def cholesky(A):
    '''
    Cholesky matrix decomposition.

    The matrix is processed tile by tile; the number of tiles per side is
    taken as the integer square root of ``len(A.tiles)``.

    Args:
      A(Expr): matrix to be decomposed

    Returns:
      The result of the chained ``expr.map2`` calls, with the blocks above
      the block diagonal replaced via ``_zero_mapper``.
    '''
    A = expr.force(A)
    n = int(math.sqrt(len(A.tiles)))
    # Floor division: tile_size is used for extent coordinates and slice
    # bounds, so it must stay integral regardless of the meaning of `/`.
    tile_size = A.shape[0] // n

    for k in range(n):
        lo = k * tile_size
        hi = (k + 1) * tile_size

        # A[k,k] = DPOTRF(A[k,k])
        diag_ex = get_ex(k, k, tile_size, A.shape)
        A = expr.map2(A, ((0, 1), ), fn=_cholesky_dpotrf_mapper,
                      shape=A.shape, update_region=diag_ex)

        if k == n - 1:
            break

        # A[l,k] = DTRSM(A[k,k], A[l,k]) l -> [k+1,n)
        col_ex = extent.create((hi, lo), (n * tile_size, hi), A.shape)
        diag_tile = A.force().fetch(diag_ex)
        A = expr.map2(A, ((0, 1), ), fn=_cholesky_dtrsm_mapper,
                      fn_kw=dict(array=force(A), diag_tile=diag_tile),
                      shape=A.shape, update_region=col_ex)

        # A[m,m] = DSYRK(A[m,k], A[m,m]) m -> [k+1,n)
        # A[l,m] = DGEMM(A[l,k], A[m,k], A[l,m]) m -> [k+1,n) l -> [m+1,n)
        col_exs = [extent.create((m * tile_size, m * tile_size),
                                 (n * tile_size, (m + 1) * tile_size),
                                 A.shape)
                   for m in range(k + 1, n)]
        dgemm_1 = expr.transpose(A)[lo:hi, :]
        dgemm_2 = A[:, lo:hi]
        A = expr.map2((A, dgemm_1, dgemm_2), ((0, 1), 1, 0),
                      fn=_cholesky_dsyrk_dgemm_mapper,
                      fn_kw=dict(array=force(A), k=k),
                      shape=A.shape, update_region=col_exs)

    # update the right corner to 0: rows above the diagonal block of every
    # block column m >= 1.
    col_exs = [extent.create((0, m * tile_size),
                             (m * tile_size, (m + 1) * tile_size),
                             A.shape)
               for m in range(1, n)]
    A = expr.map2(A, ((0, 1), ), fn=_zero_mapper,
                  shape=A.shape, update_region=col_exs)
    return A
def kmeans_map2_center_mapper(ex, tile, centers=None, m=None):
    '''
    Compute this tile's contribution to the new centers.

    Args:
      ex: extents of the input tiles (not used by the computation).
      tile: sequence of input tiles; ``tile[0]`` holds the points and
        ``tile[1]`` the per-point weights before exponentiation.
      centers: current centers; only its shape is used, to build the target
        extent.
      m: exponent applied element-wise to ``tile[1]``.

    Yields:
      ``(extent, new_centers)`` where the extent covers the complete
      ``centers``-shaped array.
    '''
    X = tile[0]
    weights = tile[1] ** m
    new_centers = np.dot(X.T, weights).T

    # The original built a 1-D extent here first and overwrote it right away;
    # only the full-array extent below was ever used.
    target_ex = extent.create((0, 0),
                              (centers.shape[0], centers.shape[1]),
                              (centers.shape[0], centers.shape[1]))
    yield target_ex, new_centers
def _init_label_mapper(array, ex):
    '''
    Build a +1/-1 label column for the rows of this region.

    A row is labelled 1 when its first column is strictly greater than its
    second column, and -1 otherwise.

    Args:
      array(DistArray): input data; at least two columns are read.
      ex(Extent): region being processed (only its row range is used).

    Yields:
      ``(extent, labels)`` with ``labels`` an int64 column of shape
      ``(rows, 1)``.
    '''
    full_rows_ex = extent.create((ex.ul[0], 0), (ex.lr[0], array.shape[1]), array.shape)
    data = array.fetch(full_rows_ex)

    n_rows = data.shape[0]
    labels = np.zeros((n_rows, 1), dtype=np.int64)
    for row in range(n_rows):
        labels[row, 0] = 1.0 if data[row, 0] > data[row, 1] else -1.0

    label_ex = extent.create((ex.ul[0], 0), (ex.lr[0], 1), (array.shape[0], 1))
    yield label_ex, labels
def _cholesky_dsyrk_dgemm_mapper(input, array, ex, k):
    '''
    Apply the rank-k update of step ``k`` to one block.

    Diagonal blocks are updated with BLAS ``dsyrk``; all other blocks with
    BLAS ``dgemm``.

    Args:
      input: the block being updated; its column count is used as tile width.
      array(DistArray): the matrix the column-``k`` blocks are fetched from.
      ex(Extent): region of ``input`` inside ``array``.
      k(int): index of the block column processed in this step.

    Returns:
      The updated block as returned by the BLAS routine.
    '''
    tile_width = input.shape[1]
    col_begin = k * tile_width
    col_end = (k + 1) * tile_width

    # Block A[m,k]: rows taken from this block's column range.
    A_mk = array.fetch(extent.create((ex.ul[1], col_begin),
                                     (ex.lr[1], col_end),
                                     array.shape))

    on_diagonal = ex.ul[0] == ex.ul[1] and ex.lr[0] == ex.lr[1]
    if not on_diagonal:
        # Block A[l,k]: rows taken from this block's row range.
        A_lk = array.fetch(extent.create((ex.ul[0], col_begin),
                                         (ex.lr[0], col_end),
                                         array.shape))
        return linalg.blas.dgemm(-1.0, A_lk, A_mk.T, 1.0, input)

    return linalg.blas.dsyrk(-1.0, A_mk, 1.0, input, lower=1)
def cholesky(A):
    '''
    Cholesky matrix decomposition.

    The number of tiles per side is taken as the integer square root of
    ``FLAGS.num_workers``.

    Args:
      A(Expr): matrix to be decomposed

    Returns:
      The expression produced by the chained ``expr.map2`` calls.
    '''
    n = int(math.sqrt(FLAGS.num_workers))
    # Floor division: tile_size is used for extent coordinates and slice
    # bounds, so it must stay integral regardless of the meaning of `/`.
    tile_size = A.shape[0] // n
    # Same text as the former `print n, tile_size`, written so that it is
    # valid both as a print statement and as a print() call.
    print('%s %s' % (n, tile_size))

    for k in range(n):
        lo = k * tile_size
        hi = (k + 1) * tile_size

        # A[k,k] = DPOTRF(A[k,k])
        diag_ex = get_ex(k, k, tile_size, A.shape)
        A = expr.map2(A, ((0, 1), ), fn=_cholesky_dpotrf_mapper,
                      shape=A.shape, update_region=diag_ex)

        if k == n - 1:
            break

        # A[l,k] = DTRSM(A[k,k], A[l,k]) l -> [k+1,n)
        col_ex = extent.create((hi, lo), (n * tile_size, hi), A.shape)
        A = expr.map2((A, A[diag_ex.to_slice()]), ((0, 1), None),
                      fn=_cholesky_dtrsm_mapper,
                      shape=A.shape, update_region=col_ex)

        # A[m,m] = DSYRK(A[m,k], A[m,m]) m -> [k+1,n)
        # A[l,m] = DGEMM(A[l,k], A[m,k], A[l,m]) m -> [k+1,n) l -> [m+1,n)
        col_exs = [extent.create((m * tile_size, m * tile_size),
                                 (n * tile_size, (m + 1) * tile_size),
                                 A.shape)
                   for m in range(k + 1, n)]
        dgemm = A[:, lo:hi]
        A = expr.map2((A, expr.transpose(dgemm), dgemm), ((0, 1), 1, 0),
                      fn=_cholesky_dsyrk_dgemm_mapper,
                      shape=A.shape, update_region=col_exs).optimized()

    # update the right corner to 0: rows above the diagonal block of every
    # block column m >= 1.
    col_exs = [extent.create((0, m * tile_size),
                             (m * tile_size, (m + 1) * tile_size),
                             A.shape)
               for m in range(1, n)]
    A = expr.map2(A, ((0, 1), ), fn=_zero_mapper,
                  shape=A.shape, update_region=col_exs)
    return A
def _cluster_mapper(array, ex, centers):
    '''
    label the cluster id for each data point.

    Each point gets the index of the center with the largest value of
    ``1 / (1 + squared distance)``; on ties the lowest index wins.

    Args:
      array(DistArray): the input data points matrix.
      ex(Extent): region being processed.
      centers(numpy.array): the center points for each cluster.

    Yields:
      ``(extent, labels)`` with one int32 label per point of this region.
    '''
    points = array.fetch(ex)
    n_points = points.shape[0]
    n_centers = centers.shape[0]
    labels = np.zeros(n_points, dtype=np.int32)

    for row in range(n_points):
        point = points[row]
        best_pdf, best_id = -1, -1
        for cid in range(n_centers):
            sq_dist = np.square(centers[cid] - point).sum()
            pdf = 1.0 / (1 + sq_dist)
            # Strict comparison: an equal score never replaces an earlier center.
            if pdf > best_pdf:
                best_pdf, best_id = pdf, cid
        labels[row] = best_id

    label_ex = extent.create((ex.ul[0], ), (ex.lr[0], ), (array.shape[0], ))
    yield label_ex, labels
def _lda_doc_topic_mapper(ex_a, term_docs_matrix, ex_b, local_topic_term_counts,
                          k_topics, alpha, eta, max_iter_per_doc):
    """
    Last iteration that uses Collapsed Variational Bayes method (Mahout implementation)
    to calculate the final document/topic inference.

    Args:
      ex_a(Extent): extent of ``term_docs_matrix``; its column range and array
        shape define the target extent.
      term_docs_matrix: the count of each term in each document for this tile
        (one column per document).
      ex_b(Extent): extent of ``local_topic_term_counts`` (unused).
      local_topic_term_counts: the matrix to save p(topic x | term).
      k_topics: the number of topics we need to find.
      alpha(float): parameter of LDA model.
      eta(float): parameter of LDA model.
      max_iter_per_doc(int): the max iterations to train each document.

    Yields:
      ``(extent, doc_topics)`` with one row per document of this tile.
    """
    topic_sums = np.linalg.norm(local_topic_term_counts, 1, axis=1)

    # Start every document from a uniform topic distribution.
    n_docs = term_docs_matrix.shape[1]
    doc_topics = np.ones((n_docs, k_topics), dtype=np.float64) / k_topics

    # The returned counts are not needed here; doc_topics is what gets yielded
    # (presumably filled in place by _lda_train -- confirm against its code).
    _lda_train(term_docs_matrix, local_topic_term_counts, topic_sums, doc_topics,
               k_topics, alpha, eta, max_iter_per_doc)

    target_ex = extent.create((ex_a.ul[1], 0),
                              (ex_a.lr[1], k_topics),
                              (ex_a.array_shape[1], k_topics))
    yield (target_ex, doc_topics)
def _row_similarity_mapper(array, ex, similarity_measurement):
    '''
    calculate distances for each pair of points.

    Args:
      array(DistArray): the input data points matrix.
      ex(Extent): region being processed.
      similarity_measurement(str): distance method used to measure similarity between two points.

    Yields:
      ``(extent, result)`` where ``result[i, j]`` is the measurement between
      local point ``i`` and global point ``j``.
    '''
    measure = distance_methods[similarity_measurement]
    local_points = array.fetch(ex)
    n_local = local_points.shape[0]
    n_total = array.shape[0]
    result = np.zeros((n_local, n_total))

    for tile_ex in array.tiles:
        # The local tile is already in hand; only fetch the other ones.
        remote_points = local_points if ex == tile_ex else array.fetch(tile_ex)
        col_offset = tile_ex.ul[0]
        for i in range(n_local):
            for j in range(remote_points.shape[0]):
                result[i, col_offset + j] = measure(local_points[i], remote_points[j])

    target_ex = extent.create((ex.ul[0], 0), (ex.lr[0], n_total), (n_total, n_total))
    yield target_ex, result
def _lda_mapper(ex_a, term_docs_matrix, ex_b, local_topic_term_counts,
                k_topics, alpha, eta, max_iter_per_doc):
    '''
    Using Collapsed Variational Bayes method (Mahout implementation)
    to train local LDA model.

    Args:
      ex_a(Extent): extent of ``term_docs_matrix``; its array shape gives the
        number of terms.
      term_docs_matrix: the count of each term in each document for this tile.
      ex_b(Extent): extent of ``local_topic_term_counts`` (unused).
      local_topic_term_counts: the matrix to save p(topic x | term).
      k_topics: the number of topics we need to find.
      alpha(float): parameter of LDA model.
      eta(float): parameter of LDA model.
      max_iter_per_doc(int): the max iterations to train each document.

    Yields:
      ``(extent, counts)`` where the extent covers the complete
      ``(k_topics, n_terms)`` array and ``counts`` is the value returned by
      ``_lda_train``.
    '''
    topic_sums = np.linalg.norm(local_topic_term_counts, 1, axis=1)

    updated_counts = _lda_train(term_docs_matrix, local_topic_term_counts,
                                topic_sums, None, k_topics, alpha, eta,
                                max_iter_per_doc)

    n_terms = ex_a.array_shape[0]
    whole_ex = extent.create((0, 0), (k_topics, n_terms), (k_topics, n_terms))
    yield (whole_ex, updated_counts)
def _lda_doc_topic_mapper(ex_a, term_docs_matrix, ex_b, local_topic_term_counts,
                          k_topics, alpha, eta, max_iter_per_doc):
    '''
    Last iteration that uses Collapsed Variational Bayes method (Mahout implementation)
    to calculate the final document/topic inference.

    Args:
      ex_a(Extent): extent of ``term_docs_matrix``; its column range and array
        shape define the target extent.
      term_docs_matrix: the count of each term in each document for this tile
        (one column per document).
      ex_b(Extent): extent of ``local_topic_term_counts`` (unused).
      local_topic_term_counts: the matrix to save p(topic x | term).
      k_topics: the number of topics we need to find.
      alpha(float): parameter of LDA model.
      eta(float): parameter of LDA model.
      max_iter_per_doc(int): the max iterations to train each document.

    Yields:
      ``(extent, doc_topics)`` with one row per document of this tile.
    '''
    doc_begin, doc_end = ex_a.ul[1], ex_a.lr[1]
    total_docs = ex_a.array_shape[1]

    local_topic_sums = np.linalg.norm(local_topic_term_counts, 1, axis=1)

    # Uniform prior over topics for every local document.
    uniform_shape = (term_docs_matrix.shape[1], k_topics)
    doc_topics = np.ones(uniform_shape, dtype=np.float64) / k_topics

    # Only doc_topics is yielded; the value returned by _lda_train is dropped
    # (doc_topics is presumably updated in place -- confirm in _lda_train).
    _lda_train(term_docs_matrix, local_topic_term_counts, local_topic_sums,
               doc_topics, k_topics, alpha, eta, max_iter_per_doc)

    yield (extent.create((doc_begin, 0), (doc_end, k_topics), (total_docs, k_topics)),
           doc_topics)
def _lda_mapper(ex_a, term_docs_matrix, ex_b, local_topic_term_counts,
                k_topics, alpha, eta, max_iter_per_doc):
    """
    Using Collapsed Variational Bayes method (Mahout implementation)
    to train local LDA model.

    Args:
      ex_a(Extent): extent of ``term_docs_matrix``; its array shape gives the
        number of terms.
      term_docs_matrix: the count of each term in each document for this tile.
      ex_b(Extent): extent of ``local_topic_term_counts`` (unused).
      local_topic_term_counts: the matrix to save p(topic x | term).
      k_topics: the number of topics we need to find.
      alpha(float): parameter of LDA model.
      eta(float): parameter of LDA model.
      max_iter_per_doc(int): the max iterations to train each document.

    Yields:
      ``(extent, counts)`` where the extent covers the complete
      ``(k_topics, n_terms)`` array and ``counts`` is the value returned by
      ``_lda_train``.
    """
    result_shape = (k_topics, ex_a.array_shape[0])

    per_topic_totals = np.linalg.norm(local_topic_term_counts, 1, axis=1)
    trained_counts = _lda_train(
        term_docs_matrix,
        local_topic_term_counts,
        per_topic_totals,
        None,
        k_topics,
        alpha,
        eta,
        max_iter_per_doc,
    )

    # Every tile reports against the whole topic/term array.
    yield (extent.create((0, 0), result_shape, result_shape), trained_counts)
def kmeans_outer_dist_mapper(ex_a, tile_a, ex_b, tile_b):
    '''
    Label every point of a tile with the index of its nearest center.

    Args:
      ex_a: extents of the point tile; ``ex_a[0]`` supplies the row range.
      tile_a: the points, one per row.
      ex_b: extents of the center tile (unused).
      tile_b: the centers, one per row.

    Yields:
      ``(extent, labels)`` where ``labels[i]`` is the ``argmin`` over the
      ``cdist`` distances from point ``i`` to every center.
    '''
    point_ex = ex_a[0]
    label_ex = extent.create((point_ex.ul[0], ),
                             (point_ex.lr[0], ),
                             (point_ex.array_shape[0], ))
    pairwise = cdist(tile_a, tile_b)
    nearest = np.argmin(pairwise, axis=1)
    yield label_ex, nearest
def _cluster_mapper(array, ex, centers):
    '''
    label the cluster id for each data point.

    Each point gets the index of the center with the largest value of
    ``1 / (1 + squared distance)``; on ties the lowest index wins.

    Args:
      array(DistArray): the input data points matrix.
      ex(Extent): region being processed.
      centers(numpy.array): the center points for each cluster.

    Yields:
      ``(extent, labels)`` with one int32 label per point of this region.
    '''
    def _best_center(point):
        # Linear scan; -1 is returned when there are no centers at all.
        top_score = -1
        top_index = -1
        for index in range(centers.shape[0]):
            score = 1.0 / (1 + np.square(centers[index] - point).sum())
            if top_score < score:
                top_score = score
                top_index = index
        return top_index

    points = array.fetch(ex)
    labels = np.zeros(points.shape[0], dtype=np.int32)
    for i in range(points.shape[0]):
        labels[i] = _best_center(points[i])

    yield extent.create((ex.ul[0],), (ex.lr[0],), (array.shape[0],)), labels
def _fuzzy_kmeans_mapper(array, ex, old_centers, centers, counts, labels, m):
    '''
    Update the new centers, new counts and labels using fuzzy kmeans method.

    Args:
      array(DistArray): the input data points matrix.
      ex(Extent): region being processed.
      old_centers(DistArray): the current centers of each cluster.
      centers(DistArray): the new centers to be updated.
      counts(DistArray): the new counts to be updated.
      labels(DistArray): the new labels for each point to be updated.
      m(float): the parameter of fuzzy kmeans.

    Returns:
      An empty list; all results are written through ``update`` calls on
      ``centers``, ``counts`` and ``labels``.
    '''
    points = array.fetch(ex)
    old_centers = old_centers[:]

    new_centers = np.zeros_like(old_centers)
    new_counts = np.zeros((old_centers.shape[0], 1))
    # `np.int` was only an alias of the builtin int and no longer exists in
    # current NumPy releases; `int` selects the same dtype.
    new_labels = np.zeros(points.shape[0], dtype=int)

    for i in range(points.shape[0]):
        point = points[i]
        prob = _calc_probability(point, old_centers, m)
        new_labels[i] = np.argmax(prob)

        # Separate index name so the point index `i` is not clobbered.
        for c in prob.nonzero()[0]:
            new_counts[c] += prob[c]
            new_centers[c] += prob[c] * point

    centers.update(extent.from_shape(centers.shape), new_centers)
    counts.update(extent.from_shape(counts.shape), new_counts)
    labels.update(extent.create((ex.ul[0],), (ex.lr[0],), labels.shape), new_labels)
    return []
def _solve_U_or_M_mapper(ex_a, rating_matrix, ex_b, U_or_M, la, alpha,
                         implicit_feedback, shape=None):
    '''
    given A and U (or M), solve M (or U) such that A = U M'
    using alternating least-squares factorization method

    Args:
      ex_a(Extent): extent of ``rating_matrix``; its row range is reused for
        the target extent.
      rating_matrix: the user-item (or item-user) rating matrix.
      ex_b(Extent): extent of ``U_or_M`` (unused).
      U_or_M: the matrix U (or M).
      la(float): the parameter of the als.
      alpha(int): confidence parameter used on implicit feedback.
      implicit_feedback(bool): whether using implicit_feedback method for als.
      shape: array shape handed to ``extent.create`` for the target extent.

    Yields:
      ``(extent, result)`` with one solved row per row of ``rating_matrix``.
    '''
    n_rows = rating_matrix.shape[0]
    n_features = U_or_M.shape[1]

    if implicit_feedback:
        Y = U_or_M
        YT = Y.T
        YTY = np.dot(YT, Y)

    result = np.zeros((n_rows, n_features))

    if not implicit_feedback:
        for row in range(n_rows):
            # Only the entries that carry a rating take part in the solve.
            rated = rating_matrix[row].nonzero()[0]
            result[row] = _als_solver(U_or_M[rated], rating_matrix[row, rated], la)
    else:
        for row in range(n_rows):
            result[row] = _implicit_feedback_als_solver(rating_matrix[row], la, alpha,
                                                        Y, YT, YTY)

    target_ex = extent.create((ex_a.ul[0], 0), (ex_a.lr[0], n_features), shape)
    yield target_ex, result
def _init_label_mapper(array, ex):
    '''
    Label each row of this region with the index of its largest entry.

    Args:
      array(DistArray): input data.
      ex(Extent): region being processed.

    Yields:
      ``(extent, labels)`` with ``labels`` an int64 column of shape
      ``(rows, 1)``.
    '''
    data = array.fetch(ex)
    n_rows = data.shape[0]

    labels = np.zeros((n_rows, 1), dtype=np.int64)
    for row in range(n_rows):
        labels[row] = np.argmax(data[row])

    label_ex = extent.create((ex.ul[0], 0), (ex.lr[0], 1), (array.shape[0], 1))
    yield label_ex, labels
def _init_label_mapper(array, ex, data):
    '''
    Label each row of this region with the index of its largest entry.

    Args:
      array(DistArray): the array being produced (not read here).
      ex(Extent): region being processed; it is also the yielded extent.
      data(DistArray): source data; complete rows matching the row range of
        ``ex`` are fetched from it.

    Yields:
      ``(ex, labels)`` with ``labels`` an int64 column of shape ``(rows, 1)``.
    '''
    full_rows_ex = extent.create((ex.ul[0], 0), (ex.lr[0], data.shape[1]), data.shape)
    local_rows = data.fetch(full_rows_ex)

    n_rows = local_rows.shape[0]
    labels = np.zeros((n_rows, 1), dtype=np.int64)
    for row in range(n_rows):
        labels[row] = np.argmax(local_rows[row])

    yield ex, labels
def test_intersection():
    '''Check extent.intersection for nested, overlapping and disjoint extents.'''
    # One extent fully inside the other: the result is the inner extent,
    # whatever the argument order.
    a = extent.create((0, 0), (10, 10), None)
    b = extent.create((5, 5), (6, 6), None)
    Assert.eq(extent.intersection(a, b), extent.create((5, 5), (6, 6), None))
    Assert.eq(extent.intersection(b, a), extent.create((5, 5), (6, 6), None))

    # Partial overlap.
    a = extent.create((5, 5), (10, 10), None)
    b = extent.create((4, 6), (6, 8), None)
    Assert.eq(extent.intersection(a, b), extent.create((5, 6), (6, 8), None))

    # No overlap: None is expected. Identity comparison is used so the check
    # does not depend on how the returned object implements `==`.
    a = extent.create((5, 5), (5, 5), None)
    b = extent.create((1, 1), (2, 2), None)
    assert extent.intersection(a, b) is None
def _sum_instance_by_label_mapper(array, ex, labels, label_size):
    '''
    For each label, compute the sum of the feature vectors which belong to that label.

    Args:
      array(DistArray): tf-idf normalized training data.
      ex(Extent): Region being processed.
      labels(DistArray): labels of the training data.
      label_size: the number of different labels.

    Yields:
      ``(extent, sums)`` where the extent covers the complete
      ``(label_size, n_features)`` array and row ``l`` of ``sums`` is the sum
      of the local feature vectors labelled ``l``.
    '''
    X = array.fetch(extent.create((ex.ul[0], 0), (ex.lr[0], array.shape[1]), array.shape))
    Y = labels.fetch(extent.create((ex.ul[0], 0), (ex.lr[0], 1), labels.shape))

    n_features = X.shape[1]
    sum_instance_by_label = np.zeros((label_size, n_features))
    # `range` instead of the Python-2-only `xrange`, matching the other
    # mappers in this file.
    for i in range(Y.shape[0]):
        sum_instance_by_label[Y[i, 0]] += X[i]

    target_ex = extent.create((0, 0), (label_size, n_features), (label_size, n_features))
    yield target_ex, sum_instance_by_label
def test_unravel():
    '''Round-trip the upper-left corner of random extents through ravel/unravel.'''
    shape = (20, 77)
    for _ in range(100):
        # Draw the corners in a fixed order: ul row, ul col, lr row, lr col.
        ul_row = random.randint(0, 19)
        ul_col = random.randint(0, 76)
        lr_row = random.randint(ul_row + 1, 20)
        lr_col = random.randint(ul_col + 1, 77)

        a = extent.create((ul_row, ul_col), (lr_row, lr_col), shape)
        flat_pos = a.ravelled_pos()
        Assert.eq(a.ul, extent.unravelled_pos(flat_pos, a.array_shape))
def test_ravelled_pos():
    '''Check row-major flattening and local-to-global position mapping.'''
    a = extent.create((2, 2), (7, 7), (10, 10))

    # In a 10x10 array, (i, j) flattens to 10 * i + j.
    for i in range(10):
        for j in range(10):
            assert extent.ravelled_pos((i, j), a.array_shape) == 10 * i + j

    # (flat position inside the extent, expected flat position in the array)
    expected = ((0, 22), (10, 42), (11, 43), (20, 62))
    for local_pos, global_pos in expected:
        Assert.eq(a.to_global(local_pos, axis=None), global_pos)
def kmeans_map2_dist_mapper(ex, tile, centers=None, m=None):
    '''
    Compute row-normalized, exponentiated point-to-center distances.

    Args:
      ex: extents of the input tiles; ``ex[0]`` supplies the row range.
      tile: sequence of input tiles; ``tile[0]`` holds the points.
      centers: the centers, one per row.
      m: fuzziness parameter; distances are raised to ``1.0 / (m - 1)``.

    Yields:
      ``(extent, distances)`` with one row per point and one column per
      center; each row sums to 1 after normalization.
    '''
    points = tile[0]
    point_ex = ex[0]
    n_centers = centers.shape[0]

    target_ex = extent.create((point_ex.ul[0], 0),
                              (point_ex.lr[0], n_centers),
                              (point_ex.array_shape[0], n_centers))

    distances = cdist(points, centers)
    # Replace exact zeros by a tiny positive value before the later steps.
    zero_mask = distances == 0
    distances[zero_mask] = 0.0000000001
    distances **= 1.0 / (m - 1)

    row_totals = np.sum(distances, axis=1)
    distances /= row_totals[:, np.newaxis]
    yield target_ex, distances
def _local_read_sparse_mm(array, ex, fn, data_begin):
    '''
    Read the part of a Matrix Market coordinate file assigned to ``ex``.

    1. Noted that Matrix Market format doesn't require (row, col) to be sorted.
       If the file is sorted (by either row or col), each worker will return
       only a part of the array. If the file is unsorted, each worker may
       return a very big and sparser sub-array of the original array. In the
       worst case, the sub-array can be as large as the original array but
       sparser.
    2. We can't know how many lines without reading the whole file. So we simply
       decide the region this worker should read based on the file size.

    Args:
      array(DistArray): the array being loaded; only its shape is used.
      ex(Extent): region assigned to this worker.
      fn(str): path of the Matrix Market file.
      data_begin(int): file offset at which the coordinate lines start.

    Returns:
      ``(new_ex, tile)`` where ``new_ex`` is the bounding box of the entries
      that were read and ``tile`` the converted sparse sub-array.
    '''
    data_size = os.path.getsize(fn) - data_begin
    # np.prod replaces the deprecated alias np.product.
    array_size = np.prod(array.shape)

    # Map the extent's flat position range onto a byte range of the file.
    # int() keeps the offsets integral; math.ceil/floor may return floats and
    # file.seek expects an integer.
    begin = extent.ravelled_pos(ex.ul, array.shape)
    begin = int(math.ceil(((begin * 1.0) / array_size) * data_size)) + data_begin
    end = extent.ravelled_pos([(i - 1) for i in ex.lr], array.shape)
    end = int(math.floor(((end * 1.0) / array_size) * data_size)) + data_begin

    # Bounding box of the entries seen so far (ul starts at max, lr at min).
    ul = [array.shape[0], array.shape[1]]
    lr = [0, 0]
    rows = []
    cols = []
    data = []
    with open(fn) as fp:
        fp.seek(begin)
        if begin != data_begin:
            # If `begin` falls inside a line, skip the rest of that line.
            fp.seek(begin - 1)
            a = fp.read(1)
            if a != '\n':
                fp.readline()

        pos = fp.tell()
        for line in fp:
            if pos > end + 1:  # +1 in case end locates on \n
                break
            pos += len(line)
            (_row, _col), val = _extract_mm_coordinate(line)
            # Convert to 0-based indices.
            _row -= 1
            _col -= 1
            rows.append(_row)
            cols.append(_col)
            data.append(float(val))
            ul[0] = min(ul[0], _row)
            ul[1] = min(ul[1], _col)
            lr[0] = max(lr[0], _row)
            lr[1] = max(lr[1], _col)

    # Adjust rows and cols based on the ul of this submatrix.
    rows = [r - ul[0] for r in rows]
    cols = [c - ul[1] for c in cols]

    new_ex = extent.create(ul, [lr[0] + 1, lr[1] + 1], array.shape)
    new_array = sp.coo_matrix((data, (rows, cols)), new_ex.shape)
    return new_ex, sparse.convert_sparse_array(new_array)
def _transpose_mapper(array, ex, orig_array):
    '''
    Transpose ``orig_array`` into ``array``.

    Args:
      array(DistArray): destination array.
      ex(Extent): region being processed.
      orig_array(DistArray): array to be transposed.

    Yields:
      ``(ex, tile)`` where ``tile`` is the transposed source region.
    '''
    # The source region is the destination region with its axes reversed.
    source_ul = ex.ul[::-1]
    source_lr = ex.lr[::-1]
    source_ex = extent.create(source_ul, source_lr, orig_array.shape)

    source_tile = orig_array.fetch(source_ex)
    yield ex, source_tile.transpose()
def _sum_instance_by_label_mapper(array, ex, labels, label_size):
    '''
    For each label, compute the sum of the feature vectors which belong to that label.

    Args:
      array(DistArray): tf-idf normalized training data.
      ex(Extent): Region being processed.
      labels(DistArray): labels of the training data.
      label_size: the number of different labels.

    Yields:
      ``(extent, sums)`` where the extent covers the complete
      ``(label_size, n_features)`` array and row ``l`` of ``sums`` is the sum
      of the local feature vectors labelled ``l``.
    '''
    row_begin, row_end = ex.ul[0], ex.lr[0]
    X = array.fetch(
        extent.create((row_begin, 0), (row_end, array.shape[1]), array.shape))
    Y = labels.fetch(extent.create((row_begin, 0), (row_end, 1), labels.shape))

    result_shape = (label_size, X.shape[1])
    sum_instance_by_label = np.zeros(result_shape)
    # `range` instead of the Python-2-only `xrange`, matching the other
    # mappers in this file.
    for i in range(Y.shape[0]):
        sum_instance_by_label[Y[i, 0]] += X[i]

    yield extent.create((0, 0), result_shape, result_shape), sum_instance_by_label
def test_intersection():
    '''Check extent.intersection for nested, overlapping and disjoint extents.'''
    inner = extent.create((5, 5), (6, 6), None)

    # One extent fully inside the other: the result is the inner extent,
    # whatever the argument order.
    a = extent.create((0, 0), (10, 10), None)
    b = extent.create((5, 5), (6, 6), None)
    Assert.eq(extent.intersection(a, b), inner)
    Assert.eq(extent.intersection(b, a), inner)

    # Partial overlap.
    a = extent.create((5, 5), (10, 10), None)
    b = extent.create((4, 6), (6, 8), None)
    Assert.eq(extent.intersection(a, b), extent.create((5, 6), (6, 8), None))

    # No overlap: None is expected. Identity comparison is used so the check
    # does not depend on how the returned object implements `==`.
    a = extent.create((5, 5), (5, 5), None)
    b = extent.create((1, 1), (2, 2), None)
    assert extent.intersection(a, b) is None
def test_tilesharing(ctx):
    '''
    Check that region_map leaves its source untouched.

    ``x`` must still be all ones after ``y`` was derived from it by adding 1
    to the first three rows.

    Args:
      ctx: test context; ``ctx.num_workers`` sizes the arrays.
    '''
    # Same text as the former print statement, written so that it is valid
    # both as a print statement and as a print() call.
    print('#worker: %s' % ctx.num_workers)

    N_EXAMPLES = 5 * ctx.num_workers
    # Floor division keeps the tile hint an integer (N_EXAMPLES is an exact
    # multiple of num_workers).
    x = expr.ones((N_EXAMPLES, 1), tile_hint=(N_EXAMPLES // ctx.num_workers, 1))
    y = expr.region_map(x,
                        extent.create((0, 0), (3, 1), (N_EXAMPLES, 1)),
                        fn=lambda data, ex, a: data + a,
                        fn_kw={'a': 1})

    npx = np.ones((N_EXAMPLES, 1))
    npy = np.ones((N_EXAMPLES, 1))
    npy[0:3, 0] += 1

    assert np.all(np.equal(x.glom(), npx))
    assert np.all(np.equal(y.glom(), npy))
def _svm_mapper(array, ex, labels, alpha, w, lambda_n):
    '''
    Local linear SVM solver.

    Args:
      array(DistArray): features of the training data.
      ex(Extent): Region being processed.
      labels(DistArray): labels of the training data.
      alpha(DistArray): alpha vector which is the parameter optimized by SVM.
      w(DistArray): weight vector of the previous iteration.
      lambda_n: lambda/size(total train data) which is the parameter of this svm model.

    Yields:
      ``(extent, new_alpha)`` for the rows of the alpha vector that belong to
      this region.
    '''
    row_begin, row_end = ex.ul[0], ex.lr[0]

    def _column_ex(shape):
        # Single-column extent over this region's rows.
        return extent.create((row_begin, 0), (row_end, 1), shape)

    X = array.fetch(extent.create((row_begin, 0), (row_end, array.shape[1]), array.shape))
    Y = labels.fetch(_column_ex(labels.shape))
    old_alpha = alpha.fetch(_column_ex(alpha.shape))
    old_w = w[:]

    n_tiles = len(array.tiles)
    new_alpha = _svm_disdca_train(X, Y, old_alpha, old_w, n_tiles, lambda_n)

    # update the alpha vector
    yield _column_ex(alpha.shape), new_alpha
def _write_mapper(ex, source=None, sregion=None, dst_slice=None):
    '''
    Copy the part of ``dst_slice`` that overlaps tile ``ex`` into ``source``.

    Args:
      ex(Extent): the tile being processed.
      source: array that receives the data through ``update``.
      sregion(Extent): region of ``source`` being written.
      dst_slice: array the data is fetched from, addressed relative to the
        upper-left corner of ``sregion``.

    Returns:
      A ``LocalKernelResult`` with no result value and the futures of the
      (non-blocking) update, if any was issued.
    '''
    overlap = extent.intersection(ex, sregion)
    pending = rpc.FutureGroup()

    if overlap is None:
        # This tile does not touch the written region.
        return LocalKernelResult(result=None, futures=pending)

    # Translate the overlap into coordinates relative to sregion.
    origin = np.asarray(sregion.ul)
    rel_lr = np.asarray(overlap.lr) - origin
    rel_ul = np.asarray(overlap.ul) - origin
    fetch_ex = extent.create(tuple(rel_ul), tuple(rel_lr), dst_slice.shape)

    values = dst_slice.fetch(fetch_ex)
    pending.append(source.update(overlap, values, wait=False))
    return LocalKernelResult(result=None, futures=pending)
def _write_mapper(ex, source=None, sregion=None, dst_slice=None):
    '''
    Copy the part of ``dst_slice`` that overlaps tile ``ex`` into ``source``.

    Args:
      ex(Extent): the tile being processed.
      source: array that receives the data through ``update``.
      sregion(Extent): region of ``source`` being written.
      dst_slice: array the data is fetched from, addressed relative to the
        upper-left corner of ``sregion``.

    Returns:
      A ``LocalKernelResult`` with no result value and the futures of the
      (non-blocking) update, if any was issued.
    '''
    intersection = extent.intersection(ex, sregion)

    futures = rpc.FutureGroup()
    # Identity test: `!= None` would go through the extent's own comparison
    # operators instead of simply checking for a missing intersection.
    if intersection is not None:
        # Translate the overlap into coordinates relative to sregion.
        dst_lr = np.asarray(intersection.lr) - np.asarray(sregion.ul)
        dst_ul = np.asarray(intersection.ul) - np.asarray(sregion.ul)
        dst_ex = extent.create(tuple(dst_ul), tuple(dst_lr), dst_slice.shape)
        v = dst_slice.fetch(dst_ex)
        futures.append(source.update(intersection, v, wait=False))

    return LocalKernelResult(result=None, futures=futures)
def _init_M_mapper(array, ex, avg_rating):
    '''
    Initialize the M matrix with its first column equals to avg_rating.

    Rows whose average rating is zero stay all-zero; every other row gets the
    average in column 0 and random values in the remaining columns.

    Args:
      array(DistArray): the array to be created.
      ex(Extent): region being processed.
      avg_rating(DistArray): the average rating for each item.

    Yields:
      ``(ex, M)`` with ``M`` shaped like the region.
    '''
    rating_ex = extent.create((ex.ul[0],), (ex.lr[0],), avg_rating.shape)
    local_avg = avg_rating.fetch(rating_ex)

    M = np.zeros(ex.shape)
    n_random = M.shape[1] - 1
    rated_rows = local_avg.nonzero()[0]
    for row in rated_rows:
        M[row, 0] = local_avg[row]
        M[row, 1:] = np.random.rand(n_random)

    yield ex, M
def cholesky(A):
    '''
    Cholesky matrix decomposition.

    The number of tiles per side is taken as the integer square root of
    ``FLAGS.num_workers``.

    Args:
      A(Expr): matrix to be decomposed

    Returns:
      The expression produced by the chained ``expr.map2`` calls.
    '''
    n = int(math.sqrt(FLAGS.num_workers))
    # Floor division: tile_size is used for extent coordinates and slice
    # bounds, so it must stay integral regardless of the meaning of `/`.
    tile_size = A.shape[0] // n
    # Same text as the former `print n, tile_size`, written so that it is
    # valid both as a print statement and as a print() call.
    print('%s %s' % (n, tile_size))

    for k in range(n):
        # A[k,k] = DPOTRF(A[k,k])
        diag_ex = get_ex(k, k, tile_size, A.shape)
        A = expr.map2(A, ((0, 1), ), fn=_cholesky_dpotrf_mapper,
                      shape=A.shape, update_region=diag_ex)

        if k == n - 1:
            break

        # A[l,k] = DTRSM(A[k,k], A[l,k]) l -> [k+1,n)
        col_ex = extent.create(((k + 1) * tile_size, k * tile_size),
                               (n * tile_size, (k + 1) * tile_size),
                               A.shape)
        A = expr.map2((A, A[diag_ex.to_slice()]), ((0, 1), None),
                      fn=_cholesky_dtrsm_mapper,
                      shape=A.shape, update_region=col_ex)

        # A[m,m] = DSYRK(A[m,k], A[m,m]) m -> [k+1,n)
        # A[l,m] = DGEMM(A[l,k], A[m,k], A[l,m]) m -> [k+1,n) l -> [m+1,n)
        col_exs = [extent.create((m * tile_size, m * tile_size),
                                 (n * tile_size, (m + 1) * tile_size),
                                 A.shape)
                   for m in range(k + 1, n)]
        dgemm = A[:, (k * tile_size):((k + 1) * tile_size)]
        A = expr.map2((A, expr.transpose(dgemm), dgemm), ((0, 1), 1, 0),
                      fn=_cholesky_dsyrk_dgemm_mapper,
                      shape=A.shape, update_region=col_exs).optimized()

    # update the right corner to 0: rows above the diagonal block of every
    # block column m >= 1.
    col_exs = [extent.create((0, m * tile_size),
                             (m * tile_size, (m + 1) * tile_size),
                             A.shape)
               for m in range(1, n)]
    A = expr.map2(A, ((0, 1), ), fn=_zero_mapper,
                  shape=A.shape, update_region=col_exs)
    return A
def _naive_bayes_mapper(array, ex, weights_per_label, alpha):
    '''
    train local naive bayes weights.

    Computes ``log((w_lf + alpha) / (w_l + alpha * n_features))`` for every
    label/feature pair of this region.

    Args:
      array(DistArray): weights for each label and feature.
      ex(Extent): Region being processed.
      weights_per_label(DistArray): weights for each label.
      alpha: naive bayes parameter.

    Yields:
      ``(ex, weights)`` with the smoothed log weights of this region.
    '''
    feature_weights = array.fetch(ex)

    label_ex = extent.create((ex.ul[0],), (ex.lr[0],), weights_per_label.shape)
    label_totals = weights_per_label.fetch(label_ex)
    # Column vector so that it broadcasts across the feature axis.
    label_totals = label_totals.reshape((label_totals.shape[0], 1))

    n_features = feature_weights.shape[1]
    numerator = feature_weights + alpha
    denominator = label_totals + alpha * n_features
    yield ex, np.log(numerator / denominator)
def _find_cluster_mapper(inputs, ex, d_pts, old_centers, new_centers, new_counts, labels):
    '''
    Assign the points of one region to their closest center and accumulate
    the per-center point sums and counts.

    Args:
      inputs: mapper input (unused).
      ex(Extent): region being processed.
      d_pts(DistArray): the data points.
      old_centers: the current centers, one per row.
      new_centers(DistArray): receives the per-center sums of assigned points.
      new_counts(DistArray): receives the per-center number of assigned points.
      labels(DistArray): receives the index of the closest center per point.

    Returns:
      An empty list; all results are written through ``update`` calls.
    '''
    centers = old_centers
    pts = d_pts.fetch(ex)
    closest = _find_closest(pts, centers)

    # `np.int` was only an alias of the builtin int and no longer exists in
    # current NumPy releases; `int` selects the same dtype.
    l_counts = np.zeros((centers.shape[0], 1), dtype=int)
    l_centers = np.zeros_like(centers)

    for i in range(centers.shape[0]):
        matching = (closest == i)
        l_counts[i] = matching.sum()
        l_centers[i] = pts[matching].sum(axis=0)

    # update centroid positions
    new_centers.update(extent.from_shape(new_centers.shape), l_centers)
    new_counts.update(extent.from_shape(new_counts.shape), l_counts)
    labels.update(extent.create(ex.ul, (ex.lr[0], 1), labels.shape),
                  closest.reshape(pts.shape[0], 1))
    return []
def _make_site_sparse(tile, ex, num_outlinks=None, same_site_prob=None):
    '''
    Generate a random sparse link block for the pages covered by ``ex``.

    Each page gets ``num_outlinks`` link targets. A target is drawn from the
    page range of this tile with probability ``same_site_prob`` and from the
    full range ``[0, tile.shape[0])`` otherwise.

    Args:
      tile: the array being generated; only its shape is used.
      ex(Extent): region being processed.
      num_outlinks(int): number of link targets per page.
      same_site_prob(float): probability that a target stays inside this tile.

    Yields:
      ``(extent, matrix)`` with ``matrix`` a float32 COO matrix of shape
      ``(tile.shape[0], tile_pages)``.
    '''
    # The page axis is the one along which the extent does not span the
    # whole first dimension of the tile.
    page_axis = 1 if ex.shape[0] == tile.shape[0] else 0
    tile_pages = ex.shape[page_axis]
    ul, lr = ex.ul[page_axis], ex.lr[page_axis]

    n_links = num_outlinks * tile_pages
    same_site = np.random.rand(n_links) <= same_site_prob
    other_site = ~same_site

    outlink = np.zeros(n_links, dtype=np.int32)
    outlink[same_site] = np.random.randint(ul, lr, np.count_nonzero(same_site))
    outlink[other_site] = np.random.randint(0, tile.shape[0],
                                            np.count_nonzero(other_site))

    rows, cols, data = _build_site_coo(tile_pages, num_outlinks, outlink, ul, lr)
    result = scipy.sparse.coo_matrix((data, (rows, cols)),
                                     shape=(tile.shape[0], tile_pages),
                                     dtype=np.float32)

    result_ex = extent.create((0, ul), (tile.shape[0], lr), tile.shape)
    yield result_ex, result
def _row_similarity_mapper(array, ex, similarity_measurement):
    '''
    calculate distances for each pair of points.

    Args:
      array(DistArray): the input data points matrix.
      ex(Extent): region being processed.
      similarity_measurement(str): distance method used to measure similarity between two points.

    Yields:
      ``(extent, result)`` where ``result[i, j]`` is the measurement between
      local point ``i`` and global point ``j``.
    '''
    total_points = array.shape[0]
    measurement = distance_methods[similarity_measurement]

    points = array.fetch(ex)
    local_count = points.shape[0]
    result = np.zeros((local_count, total_points))

    for other_ex in array.tiles:
        if ex != other_ex:
            other_points = array.fetch(other_ex)
        else:
            # No need to fetch our own tile a second time.
            other_points = points

        first_col = other_ex.ul[0]
        other_count = other_points.shape[0]
        for i in range(local_count):
            for j in range(other_count):
                result[i, first_col + j] = measurement(points[i], other_points[j])

    yield (extent.create((ex.ul[0], 0), (ex.lr[0], total_points),
                         (total_points, total_points)),
           result)
def kmeans_center_mapper(extents, tiles, centers_count):
    '''
    Sum the points of one tile per assigned center.

    Args:
      extents: extents of the input tiles (unused).
      tiles: sequence of input tiles; ``tiles[0]`` holds the points and
        ``tiles[1]`` the center index assigned to each point.
      centers_count(int): number of centers.

    Yields:
      ``(extent, sums)`` where the extent covers the complete
      ``(centers_count, n_features)`` array and row ``c`` of ``sums`` is the
      sum of the local points labelled ``c`` (zeros when there are none).
    '''
    points = tiles[0]
    labels = tiles[1]
    n_features = points.shape[1]

    target_ex = extent.create((0, 0),
                              (centers_count, n_features),
                              (centers_count, n_features))

    new_centers = np.zeros((centers_count, n_features))
    # `range` instead of the Python-2-only `xrange`, matching the other
    # mappers in this file.
    for i in range(centers_count):
        matching = (labels == i)
        new_centers[i] = points[matching].sum(axis=0)

    yield target_ex, new_centers
def _select_most_k_similar_mapper(array, ex, top_k_similar_indices, k):
    '''
    Find the top k similar items for each item.

    Parameters
    ----------
    array : the similarity table; the region ``ex`` is fetched from it.
    ex : region being processed.
    top_k_similar_indices: Spartan array of shape (N, k)
      The indices of top k similar items. The rows of this region are
      written in place.
    k : Integer

    Yields
    ------
    ``(extent, values)`` where ``values[i]`` holds the similarity values at
    the k selected indices of local row ``i``.
    '''
    row_begin, row_end = ex.ul[0], ex.lr[0]
    local_similarity_table = array.fetch(ex)

    # Find the k largest value of each row. This function is adapted from
    # bottleneck.argpartsort.
    sorted_indices = argpartsort(local_similarity_table, k, axis=1)[:, :k]

    local_top_k_values = np.zeros((ex.shape[0], k))
    for row, picked in enumerate(sorted_indices):
        local_top_k_values[row] = local_similarity_table[row, picked]

    top_k_similar_indices[row_begin:row_end, :] = sorted_indices

    yield extent.create((row_begin, 0), (row_end, k), (array.shape[0], k)), local_top_k_values
def _local_read_sparse_npy(array, ex, fn):
    '''
    Read this worker's share of a COO sparse matrix stored as .npy files.

    The matrix lives in three files, ``<fn>_row.npy``, ``<fn>_col.npy`` and
    ``<fn>_data.npy``, which must all report the same shape. The position of
    ``ex`` inside ``array`` is mapped proportionally onto a contiguous slice
    of the stored entries, and only that slice is read.

    Notes:
      1. The coo_matrix format does not require row[] or col[] to be sorted.
         If the entries are sorted (by row or by column), each worker returns
         only a part of the array. If they are unsorted, a worker may return
         a very large and sparser sub-array; in the worst case it is as large
         as the original array.
      2. Because the files are plain numpy arrays, the entries to read can be
         distributed evenly across workers.

    Args:
      array: the distributed array being loaded; only ``array.shape`` is used.
      ex(Extent): region assigned to this worker. It only determines which
        slice of stored entries is read, not which coordinates are returned.
      fn(str): path prefix shared by the three files.

    Returns:
      (Extent, sparse array): the bounding box of the entries that were read,
      and the result of ``sparse.convert_sparse_array`` applied to a
      coo_matrix whose indices are rebased to that bounding box.

    Raises:
      AssertionError: if the three files do not report the same shape.
      KeyError: if a file's dtype is not float64, float32, int64 or int32.

    NOTE(review): when the computed slice is empty, ``ul`` stays at
    ``array.shape`` and ``lr`` at ``[0, 0]``; how ``extent.create`` treats
    that is not visible here.
    '''
    kinds = ('row', 'col', 'data')
    # numpy dtype name -> struct format character.
    struct_code = {'float64': 'd', 'float32': 'f', 'int64': 'q', 'int32': 'i'}

    paths = {}
    dtypes = {}
    data_begin = {}
    stored_shape = None
    for kind in kinds:
        path = '%s_%s.npy' % (fn, kind)
        paths[kind] = path
        shape, dtypes[kind], data_begin[kind] = _parse_npy_header(path)
        if stored_shape is None:
            stored_shape = shape
        else:
            assert stored_shape == shape

    # Map the extent's first and last flat positions onto entry indices.
    entry_count = stored_shape[0]
    item_count = np.prod(array.shape)
    first_pos = extent.ravelled_pos(ex.ul, array.shape)
    begin_item = int(
        math.ceil(((first_pos * 1.0) / item_count) * entry_count))
    last_pos = extent.ravelled_pos([(dim - 1) for dim in ex.lr], array.shape)
    end_item = int(math.floor(
        (last_pos * 1.0) / item_count * entry_count)) + 1
    end_item = min(end_item, entry_count)

    # Bounding box of the coordinates actually read.
    ul = [array.shape[0], array.shape[1]]
    lr = [0, 0]
    rows = []
    cols = []
    data = []
    with FileHelper(row=open(paths['row'], 'rb'),
                    col=open(paths['col'], 'rb'),
                    data=open(paths['data'], 'rb')) as fp:
        readers = {}
        formats = {}
        for kind in kinds:
            dtype = dtypes[kind]
            item_size = dtype.itemsize
            handle = getattr(fp, kind)
            # Skip the header and the entries belonging to earlier workers.
            handle.seek(data_begin[kind] + begin_item * item_size)
            # _bulk_read is expected to return an iterator that produces the
            # raw bytes of one item per step.
            readers[kind] = _bulk_read(handle, item_size)
            formats[kind] = struct_code[dtype.name]

        for _ in range(begin_item, end_item):
            row = struct.unpack(formats['row'], next(readers['row']))[0]
            rows.append(row)
            col = struct.unpack(formats['col'], next(readers['col']))[0]
            cols.append(col)
            value = struct.unpack(formats['data'], next(readers['data']))[0]
            data.append(value)

            ul[0] = min(ul[0], row)
            ul[1] = min(ul[1], col)
            lr[0] = max(lr[0], row)
            lr[1] = max(lr[1], col)

    # Rebase the coordinates so they are relative to the bounding box.
    row_origin, col_origin = ul
    rows = [row - row_origin for row in rows]
    cols = [col - col_origin for col in cols]

    new_ex = extent.create(ul, [lr[0] + 1, lr[1] + 1], array.shape)
    new_array = sp.coo_matrix((data, (rows, cols)), new_ex.shape)
    return new_ex, sparse.convert_sparse_array(new_array)
def get_ex(i, j, step, array_shape):
    '''
    Build the extent of the square block at block-row ``i``, block-column ``j``.

    Args:
      i(int): block index along the first axis.
      j(int): block index along the second axis.
      step(int): side length of one block.
      array_shape(tuple): shape of the array the extent belongs to.

    Returns:
      Extent: the region from ``(i * step, j * step)`` up to
      ``((i + 1) * step, (j + 1) * step)``.
    '''
    upper_left = (i * step, j * step)
    lower_right = ((i + 1) * step, (j + 1) * step)
    return extent.create(upper_left, lower_right, array_shape)
def _similarity_mapper(array, ex, item_norm, step):
    '''
    Compute the similarities between the local items and every item.

    The items (columns) are walked in windows of at most ``step`` columns.
    For each window one block of the item-by-item similarity matrix is
    produced.

    Parameters
    ----------
    array : the distributed rating matrix; ``ex`` must span all of its rows.
    ex : Extent
      Region being processed; its column range identifies the local items.
    item_norm : Spartan array of shape(N,)
      The norm values of each item.
    step : Integer.
      How many items are fetched per iteration; currently this equals the
      number of columns of a tile.

    Yields
    ------
    (Extent, ndarray) : the extent of one block inside the (N, N) similarity
      matrix and the dense similarities of that block.
    '''
    row_count = array.shape[0]
    item_count = array.shape[1]

    local_ratings = array.fetch(ex)
    local_norms = item_norm[ex.ul[1]:ex.lr[1]]
    local_norms = local_norms.reshape(local_norms.shape[0], 1)

    assert local_ratings.shape[0] == row_count

    # Index of the first item this worker is responsible for.
    first_local_item = ex.ul[1]
    result_shape = (array.shape[1], array.shape[1])

    # [fetch_begin, fetch_end) is the window of items compared in one round.
    fetch_begin = 0
    round_no = 0
    while fetch_begin < item_count:
        util.log_info("Round : %s on %s", round_no, socket.gethostname())
        round_no += 1

        # The last window may hold fewer than ``step`` items.
        fetch_end = min(fetch_begin + step, item_count)

        with util.TIMER.item_fetching:
            # Fetch the ratings of the remote items. The matrix is sparse,
            # so this step is not expected to be very expensive.
            remote_ratings = array[:, fetch_begin:fetch_end]
            remote_norms = item_norm[fetch_begin:fetch_end]
            remote_norms = remote_norms.reshape(1, remote_norms.shape[0])

        with util.TIMER.calculate_similarities:
            # All-pairs cosine similarity between local and remote items:
            #
            #   simi(V1, V2) = dot(V1, V2) / (||V1|| * ||V2||)
            #
            # local_ratings has shape (M, N1) and remote_ratings (M, N2).
            # For efficiency everything is done with matrix products:
            #   * local_ratings.T.dot(remote_ratings) is an N1 x N2 matrix S
            #     with S[i, j] = dot(Vi, Vj);
            #   * local_norms.dot(remote_norms) is an N1 x N2 matrix P with
            #     P[i, j] = ||Vi|| * ||Vj||.
            # Dividing S by P element-wise yields the similarities.
            dot_products = local_ratings.T.dot(remote_ratings)
            dot_products = np.array(dot_products.todense())
            norm_products = local_norms.dot(remote_norms)
            similarities = dot_products / norm_products

            # Zero norms produce NaNs in the division; replace them.
            similarities = np.nan_to_num(similarities)

        # Hand this block over for writing into the global result.
        block_ex = extent.create(
            (first_local_item, fetch_begin),
            (first_local_item + similarities.shape[0],
             fetch_begin + similarities.shape[1]),
            result_shape)
        yield block_ex, similarities

        # Move on to the next window of items.
        fetch_begin = fetch_end
def test_local_offset():
    '''
    Log the result of ``extent.offset_from`` for a 5x5 extent at the origin
    and a 1x1 extent at (2, 2).

    The result is only logged; nothing is asserted.
    '''
    outer = extent.create((0, 0), (5, 5), None)
    inner = extent.create((2, 2), (3, 3), None)
    offset = extent.offset_from(outer, inner)
    util.log_info('%s', offset)
import unittest
import numpy as np
from spartan import expr, util
from spartan.array import tile
from spartan.util import Assert
import spartan.array.extent as extent
import test_common
import scipy.sparse as sp

ARRAY_SIZE = (10, 10)
UPDATE_SHAPE = (8, 8)
UPDATE_SUBSLICE = extent.create((0, 0), (8, 8), UPDATE_SHAPE).to_slice()


class TestTile(test_common.ClusterTest):
    '''Checks that dense and sparse tiles can be created from a shape.'''

    def test_create_dense(self):
        # After initialization a dense tile carries a mask of the tile shape.
        dense_tile = tile.from_shape(ARRAY_SIZE,
                                     dtype=np.float32,
                                     tile_type=tile.TYPE_DENSE)
        dense_tile._initialize()
        mask_shape = dense_tile.mask.shape
        Assert.eq(mask_shape, ARRAY_SIZE)

    def test_create_sparse(self):
        # After initialization a sparse tile has data of the tile shape and
        # no mask.
        sparse_tile = tile.from_shape(ARRAY_SIZE,
                                      dtype=np.float32,
                                      tile_type=tile.TYPE_SPARSE)
        sparse_tile._initialize()
        data_shape = sparse_tile.data.shape
        Assert.eq(data_shape, ARRAY_SIZE)
        Assert.eq(sparse_tile.mask, None)
def kmeans_count_mapper(extents, tiles, centers_count):
    '''
    Count how many local points are assigned to each cluster.

    Args:
      extents: extents of the input tiles (not used by this mapper).
      tiles: sequence whose first element holds the cluster label of each
        local point.
      centers_count(int): number of clusters.

    Yields:
      (Extent, ndarray): an extent spanning the whole (centers_count,) vector
      and the number of local points carrying each label.
      NOTE: ``minlength`` only pads the result; a label >= centers_count
      would make the vector longer than the extent.
    '''
    target_ex = extent.create((0, ), (centers_count, ), (centers_count, ))
    # np.bincount needs integer input. The ``np.int`` alias was removed from
    # NumPy; the builtin ``int`` is what it stood for.
    labels = tiles[0].astype(int)
    counts = np.bincount(labels, minlength=centers_count)
    yield target_ex, counts
def kmeans_map2_dist_mapper(ex, tile, centers=None):
    '''
    Label every local point with the index of its closest center.

    Args:
      ex: sequence of extents; the first one describes the rows of the points
        in this tile.
      tile: sequence whose first element holds the local points.
      centers: the current cluster centers, passed straight to ``cdist``.
        NOTE(review): the default of None is not handled here; callers are
        presumably expected to always supply it.

    Yields:
      (Extent, ndarray): the extent of rows ``ex[0].ul[0]:ex[0].lr[0]`` in a
      vector with one entry per point, and for each local point the index of
      the center with the smallest ``cdist`` value.
    '''
    points_ex = ex[0]
    local_points = tile[0]
    target_ex = extent.create((points_ex.ul[0], ),
                              (points_ex.lr[0], ),
                              (points_ex.array_shape[0], ))
    distances = cdist(local_points, centers)
    nearest = np.argmin(distances, axis=1)
    yield target_ex, nearest