def mult(A, B):
    """Multiply two block matrices, using a custom path for sparse blocks.

    If the first block of either operand is a ``DenseMatrix``, the work is
    delegated to the built-in ``A.multiply(B)``.  Otherwise the product is
    assembled by hand from the operands' block RDDs.

    Args:
        A: left operand; must expose ``blocks``, ``numRows()`` and
            ``multiply()``.
        B: right operand; must expose ``blocks``.

    Returns:
        The result of ``A.multiply(B)`` on the dense path, or a new
        ``BlockMatrix`` declared as ``N x N`` with
        ``SQUARE_BLOCK_SIZE``-sized square blocks, where ``N = A.numRows()``.
    """
    # -------LOG
    logging.warning("Multiplication started")
    blockcount = A.blocks.getNumPartitions()
    logging.warning("A part count")
    logging.warning(blockcount)
    blockcount = B.blocks.getNumPartitions()
    logging.warning("B part count")
    logging.warning(blockcount)
    # -----LOG

    # If dense, just call the inbuilt function.
    if (isinstance(A.blocks.first()[1], DenseMatrix)
            or isinstance(B.blocks.first()[1], DenseMatrix)):
        return A.multiply(B)

    # Sparse path.
    N = A.numRows()
    p = SQUARE_BLOCK_SIZE
    # Floor division keeps the block count an integer on Python 3 as well.
    num_blocks = N // p

    # NOTE(review): affectLeft / affectRight / prod are defined elsewhere.
    # Presumably they re-key each block so that the blocks to be multiplied
    # together share a three-part key and meet in reduceByKey - confirm.
    aleft = A.blocks.flatMap(lambda x: affectLeft(x, num_blocks))
    bright = B.blocks.flatMap(lambda x: affectRight(x, num_blocks))
    both = aleft.union(bright)
    indi = both.reduceByKey(prod)

    # Keep the first and third key components, then sum the partial results
    # that land on the same two-part key.
    partials = indi.map(lambda x: ((x[0][0], x[0][2]), x[1]))
    pr = partials.reduceByKey(add)

    # Re-wrap each summed block (read through its indptr/indices/data
    # attributes) as an MLlib sparse matrix.
    brd = pr.map(lambda x: ((x[0][0], x[0][1]),
                            Matrices.sparse(p, p, x[1].indptr, x[1].indices,
                                            x[1].data)))
    C = BlockMatrix(brd, p, p, N, N)
    return C
def difun(x, vect):
    """Build the ``p x p`` block at block coordinates ``x`` of a diagonal matrix.

    Args:
        x: pair of block indices; ``x[0]`` is compared with ``x[1]`` to decide
            whether the block lies on the diagonal.
        vect: sliceable sequence holding the diagonal values.

    Returns:
        ``(x, block)``.  For a diagonal block, ``block`` is a ``SparseMatrix``
        with one stored entry per column, taken from
        ``vect[x[0]*p:(x[0]+1)*p]``.  Otherwise it is an empty sparse matrix.
    """
    if x[0] != x[1]:
        # Off-diagonal block: nothing stored.
        empty = sparse.csc_matrix((p, p))
        return (x, Matrices.sparse(p, p, empty.indptr, empty.indices,
                                   empty.data))

    # Column pointers 0..p and row indices 0..p-1 place exactly one entry on
    # the diagonal of every column.
    col_ptrs = np.linspace(0, p, num=(p + 1))
    row_idx = np.linspace(0, p - 1, num=p)
    diag_values = vect[(x[0] * p):((x[0] + 1) * p)]
    return (x, SparseMatrix(p, p, col_ptrs, row_idx, diag_values))
def loadBlockFromMatFile(filename):
    """Load one matrix block from a ``.mat`` file.

    The file is read with ``loadmat(..., squeeze_me=True)`` and must contain
    the variables ``'block_id'`` and ``'G'``.

    Args:
        filename: path handed to ``loadmat``.

    Returns:
        ``((block_id[0], block_id[1]), sub_matrix)``, where ``sub_matrix`` is
        a ``p x p`` MLlib matrix: sparse when ``G`` is a
        ``scipy.sparse.csc_matrix``, dense otherwise.
    """
    contents = loadmat(filename, squeeze_me=True)
    block_id = contents['block_id']
    G = contents['G']

    if not isinstance(G, sparse.csc_matrix):
        # Flattening the transpose row by row lists G column by column.
        sub_matrix = Matrices.dense(p, p, G.transpose().flatten())
    else:
        sub_matrix = Matrices.sparse(p, p, G.indptr, G.indices, G.data)

    return ((block_id[0], block_id[1]), sub_matrix)
def test_ml_mllib_matrix_conversion(self):
    """Check local-matrix conversion between mllib and ml in both directions.

    Covers dense and sparse matrices, each in plain and transposed form, via
    ``asML()`` (mllib -> ml) and ``Matrices.fromML()`` (ml -> mllib).
    """
    # Shared constructor arguments for every sparse case below.
    col_ptrs = [0, 2, 3]
    row_idx = [0, 1, 1]
    sparse_vals = [2, 3, 4]

    # ---- mllib -> ml ----
    # dense
    dense_src = Matrices.dense(2, 2, [0, 1, 2, 3])
    dense_expected = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3])
    dense_converted = dense_src.asML()
    self.assertEqual(dense_converted, dense_expected)

    # dense, transposed
    dense_t_src = DenseMatrix(2, 2, [0, 1, 2, 3], True)
    dense_t_expected = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True)
    dense_t_converted = dense_t_src.asML()
    self.assertEqual(dense_t_converted, dense_t_expected)

    # sparse
    sparse_src = Matrices.sparse(2, 2, col_ptrs, row_idx, sparse_vals)
    sparse_expected = newlinalg.Matrices.sparse(2, 2, col_ptrs, row_idx,
                                                sparse_vals)
    sparse_converted = sparse_src.asML()
    self.assertEqual(sparse_converted, sparse_expected)

    # sparse, transposed
    sparse_t_src = SparseMatrix(2, 2, col_ptrs, row_idx, sparse_vals, True)
    sparse_t_expected = newlinalg.SparseMatrix(2, 2, col_ptrs, row_idx,
                                               sparse_vals, True)
    sparse_t_converted = sparse_t_src.asML()
    self.assertEqual(sparse_t_converted, sparse_t_expected)

    # ---- ml -> mllib ----
    # dense
    dense_expected_back = Matrices.dense(2, 2, [1, 2, 3, 4])
    dense_ml = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4])
    dense_back = Matrices.fromML(dense_ml)
    self.assertEqual(dense_expected_back, dense_back)

    # dense, transposed
    dense_t_expected_back = DenseMatrix(2, 2, [1, 2, 3, 4], True)
    dense_t_ml = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True)
    dense_t_back = Matrices.fromML(dense_t_ml)
    self.assertEqual(dense_t_expected_back, dense_t_back)

    # sparse
    sparse_expected_back = Matrices.sparse(2, 2, col_ptrs, row_idx,
                                           sparse_vals)
    sparse_ml = newlinalg.Matrices.sparse(2, 2, col_ptrs, row_idx,
                                          sparse_vals)
    sparse_back = Matrices.fromML(sparse_ml)
    self.assertEqual(sparse_expected_back, sparse_back)

    # sparse, transposed
    sparse_t_expected_back = SparseMatrix(2, 2, col_ptrs, row_idx,
                                          sparse_vals, True)
    sparse_t_ml = newlinalg.SparseMatrix(2, 2, col_ptrs, row_idx,
                                         sparse_vals, True)
    sparse_t_back = Matrices.fromML(sparse_t_ml)
    self.assertEqual(sparse_t_expected_back, sparse_t_back)
def difun(x, vect):
    """Return block ``x`` of a block-diagonal matrix whose diagonal is ``vect``.

    Blocks are square with side ``SQUARE_BLOCK_SIZE``.

    Args:
        x: pair of block indices (``x[0]``, ``x[1]``).
        vect: sliceable sequence of diagonal values.

    Returns:
        ``(x, block)``.  When ``x[0] == x[1]`` the block is a ``SparseMatrix``
        storing one value per column, taken from the matching
        ``SQUARE_BLOCK_SIZE``-long slice of ``vect``.  Otherwise it is an
        empty sparse matrix of the same size.
    """
    size = SQUARE_BLOCK_SIZE

    if not (x[0] == x[1]):
        # Off the diagonal: an all-zero block.
        zero_block = sparse.csc_matrix((size, size))
        return (x, Matrices.sparse(size, size, zero_block.indptr,
                                   zero_block.indices, zero_block.data))

    start = x[0] * size
    stop = (x[0] + 1) * size
    # One stored entry per column, at row index == column index.
    column_pointers = np.linspace(0, size, num=(size + 1))
    row_indices = np.linspace(0, size - 1, num=size)
    block = SparseMatrix(size, size, column_pointers, row_indices,
                         vect[start:stop])
    return (x, block)
def difun(self, x, vect):
    """Return block ``x`` of a block-diagonal matrix whose diagonal is ``vect``.

    The block side is read from ``self.squareBlockSize`` (deep-copied into a
    local before use).

    Args:
        x: pair of block indices (``x[0]``, ``x[1]``).
        vect: sliceable sequence of diagonal values.

    Returns:
        ``(x, block)``.  A diagonal block is a ``SparseMatrix`` with one
        stored value per column, taken from the matching slice of ``vect``.
        Any other block is an empty sparse matrix.
    """
    side = copy.deepcopy(self.squareBlockSize)

    if x[0] != x[1]:
        # Off-diagonal block: no stored entries.
        blank = sparse.csc_matrix((side, side))
        return (x, Matrices.sparse(side, side, blank.indptr, blank.indices,
                                   blank.data))

    # Column pointers 0..side and row indices 0..side-1 give exactly one
    # diagonal entry per column.
    col_ptrs = np.linspace(0, side, num=(side + 1))
    row_idx = np.linspace(0, side - 1, num=side)
    values = vect[(x[0] * side):((x[0] + 1) * side)]
    return (x, SparseMatrix(side, side, col_ptrs, row_idx, values))
def MapperLoadBlocksFromMatFile(filename):
    """Load one square matrix block from a ``<row>_<col>.mat`` file.

    The block coordinates are parsed from the file name, and the matrix is
    read from the ``.mat`` variable of the same name (e.g. ``"3_7"``).

    Args:
        filename: path ending in ``<digits>_<digits>.mat``
            (extension matched case-insensitively).

    Returns:
        ``((row, col), sub_matrix)``.  ``row`` and ``col`` are the digit
        strings taken from the file name.  ``sub_matrix`` is an ``n x n``
        MLlib matrix with ``n = G.shape[0]``: sparse when the loaded value is
        a ``scipy.sparse.csc_matrix``, dense otherwise.

    Raises:
        ValueError: if ``filename`` does not end in ``<digits>_<digits>.mat``.
    """
    logging.warning('MapperLoadBlocksFromMatFile started %s ', filename)

    # Validate the name before touching the file so a bad path fails fast
    # with a clear message instead of an AttributeError on ``None.group``.
    # Raw string, with the dot escaped so only a literal ".mat" matches.
    match = re.search(r'(\d+_\d+)\.mat$', filename, re.IGNORECASE)
    if match is None:
        raise ValueError(
            'Cannot extract a "<row>_<col>.mat" block name from %r'
            % (filename,))
    name = match.group(1)

    data = loadmat(filename)
    logging.warning('Loaded data')

    G = data[name]
    block_id = name.split('_')
    n = G.shape[0]

    logging.warning('Before sparse conversion')
    if isinstance(G, sparse.csc_matrix):
        sub_matrix = Matrices.sparse(n, n, G.indptr, G.indices, G.data)
    else:
        # Flattening the transpose row by row lists G column by column,
        # which is the order Matrices.dense reads its values in.
        sub_matrix = Matrices.dense(n, n, G.transpose().flatten())
    logging.warning('MapperLoadBlocksFromMatFile Ended')

    return ((block_id[0], block_id[1]), sub_matrix)
def constructElectionBlock(pairDonations):
    """Build the adjacency block between two groups of donations.

    Args:
        pairDonations: two ``(block_index, donations)`` entries.  Each
            ``block_index`` is converted with ``int``; ``donations`` is
            iterated row by row, and the first one must expose ``shape``.

    Returns:
        ``((I, J), block)``, where ``block`` is an ``n x n`` MLlib matrix
        with ``n = donationsI.shape[0]``.  Entry ``(i, j)`` is
        ``edgeDefinitionElection(donationsI[i], donationsJ[j])``.  The block
        is all zeros when the number of pairs is not ``n * n``.  The diagonal
        is zeroed when ``I == J``.  The matrix is sparse if
        ``GENERATE_SPARSE`` is truthy, dense otherwise.
    """
    first, second = pairDonations[0], pairDonations[1]
    I = int(first[0])
    J = int(second[0])
    donationsI = first[1]
    donationsJ = second[1]
    n = donationsI.shape[0]

    # product() varies the second operand fastest, so the flat edge list is
    # already laid out row by row.
    edges = [edgeDefinitionElection(left, right)
             for left, right in itertools.product(donationsI, donationsJ)]
    adj = (np.reshape(edges, (n, n)) if len(edges) == (n * n)
           else np.zeros((n, n)))

    if I == J:
        # No self-edges inside a diagonal block.
        diagonal = np.arange(n)
        adj[diagonal, diagonal] = 0

    if not GENERATE_SPARSE:
        # Flattening the transpose row by row lists adj column by column.
        return ((I, J), Matrices.dense(n, n, adj.transpose().flatten()))

    G = sparse.csc_matrix(adj)
    return ((I, J), Matrices.sparse(n, n, G.indptr, G.indices, G.data))
from __future__ import print_function

# Section 7.2.1 - local vectors and matrices
from pyspark.mllib.linalg import Vectors, Vector

# Three ways to build the same four-element vector.
dv1 = Vectors.dense(5.0, 6.0, 7.0, 8.0)
dv2 = Vectors.dense([5.0, 6.0, 7.0, 8.0])
sv = Vectors.sparse(4, [0, 1, 2, 3], [5.0, 6.0, 7.0, 8.0])

# The bare expressions below are meant to be inspected interactively.
dv2[2]
dv1.size
dv2.toArray()

from pyspark.mllib.linalg import Matrices

# The same 2x3 matrix in dense and sparse form.
dm = Matrices.dense(2, 3, [5.0, 0.0, 0.0, 3.0, 1.0, 4.0])
sm = Matrices.sparse(2, 3, [0, 1, 2, 4], [0, 1, 0, 1], [5.0, 3.0, 1.0, 4.0])
sm.toDense()
dm.toSparse()
dm[1, 1]

# Section 7.2.2 - distributed matrices
from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow

# NOTE(review): `rm` is not defined in this snippet; it must already exist
# when this line runs.
rmind = IndexedRowMatrix(
    rm.rows().zipWithIndex().map(lambda x: IndexedRow(x[1], x[0])))

# Section 7.4 - loading the housing data set
# NOTE(review): `sc` is not defined in this snippet either.
housingLines = sc.textFile("first-edition/ch07/housing.data", 6)
housingVals = housingLines.map(
    lambda x: Vectors.dense([float(v.strip()) for v in x.split(",")]))

# Section 7.4.1
from pyspark.mllib.linalg.distributed import RowMatrix

housingMat = RowMatrix(housingVals)

# Import Statistics from the public package rather than the private
# `_statistics` module.
from pyspark.mllib.stat import Statistics
# Labelled point with a positive label and a dense feature vector.
lp_pos = LabeledPoint(1.0, [5.0, 0.0, 1.0, 7.0])

# Labelled point with a negative label and a sparse feature vector.
lp_neg = LabeledPoint(0.0, SparseVector(4, [0, 2, 3], [5.0, 1.0, 7.0]))

#
# Local Matrix
#
from pyspark.mllib.linalg import Matrix, Matrices

# Dense 2x3 matrix. Matrices.dense reads its values in column-major order,
# so this is ((1.0, 3.0, 5.0), (2.0, 4.0, 6.0)).
# To get ((1.0, 2.0, 3.0), (4.0, 5.0, 6.0)), pass [1, 4, 2, 5, 3, 6] instead.
dMatrix = Matrices.dense(2, 3, [1, 2, 3, 4, 5, 6])

# Sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
sMatrix = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])

#
# Code Plan
#
#
# 1- Combine all tweet files into a single data frame
# 2- Parse the tweets - remove stopwords - extract emoticons - extract URLs
#    - normalize the words (e.g., map them to lowercase and remove
#    punctuation and numbers)
# 3- Feature extraction
# 3a- Tokenisation
# 3b- TF-IDF
# 3c- Hash TF-IDF
# 4- Run K-Means clustering
# 5- Evaluate
# 5a- Identify Tweets to Cluster
# 5b- Dimensionality reduction to 2 dimensions with PCA
from __future__ import print_function

# Imports used by sections 7.2.1, 7.2.2 and 7.4.1 of this listing.
from pyspark.mllib.linalg import Vectors, Vector
from pyspark.mllib.linalg import Matrices
from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow
from pyspark.mllib.linalg.distributed import RowMatrix

# --- Section 7.2.1: local vectors ---
dv1 = Vectors.dense(5.0, 6.0, 7.0, 8.0)
dv2 = Vectors.dense([5.0, 6.0, 7.0, 8.0])
sv = Vectors.sparse(4, [0, 1, 2, 3], [5.0, 6.0, 7.0, 8.0])

# Bare expressions, intended for interactive inspection.
dv2[2]
dv1.size
dv2.toArray()

# --- Section 7.2.1: local matrices ---
dm = Matrices.dense(2, 3, [5.0, 0.0, 0.0, 3.0, 1.0, 4.0])
sm = Matrices.sparse(2, 3, [0, 1, 2, 4], [0, 1, 0, 1], [5.0, 3.0, 1.0, 4.0])
sm.toDense()
dm.toSparse()
dm[1, 1]

# --- Section 7.2.2: index the rows of an existing row matrix ---
# `rm` is expected to be defined before this point.
rmind = IndexedRowMatrix(
    rm.rows().zipWithIndex().map(lambda pair: IndexedRow(pair[1], pair[0])))

# --- Section 7.4: parse the comma-separated housing data ---
# `sc` is expected to be defined before this point.
housingLines = sc.textFile("first-edition/ch07/housing.data", 6)
housingVals = housingLines.map(
    lambda line: Vectors.dense(
        [float(field.strip()) for field in line.split(",")]))
# Random square CSC matrix (20% density) used as the right-hand operand.
rand_mat = scipy.sparse.rand(sp_cols, sp_cols, density=0.2, format='csc')

# Raw compressed-storage arrays of both operands.
# NOTE(review): the names below read as CSR, but `rand_mat` is CSC, so its
# `indices` are row indices and its `indptr` are column pointers.  That is
# the order Matrices.sparse is given them below.  `newmat` is defined
# elsewhere and is presumably CSC as well - confirm.
value1 = newmat.data
col_index1 = newmat.indices
row_pointers1 = newmat.indptr
value2 = rand_mat.data
col_index2 = rand_mat.indices
row_pointers2 = rand_mat.indptr

# Time the local SciPy product as a baseline.
start2 = timeit.default_timer()
just = newmat * rand_mat
stop2 = timeit.default_timer()
t2 = (stop2 - start2)
# Parenthesised so the statement is valid on both Python 2 and Python 3.
print(t2)

sparse1 = Matrices.sparse(sp_rows, sp_cols, row_pointers1, col_index1, value1)
sparse2 = Matrices.sparse(sp_cols, sp_cols, row_pointers2, col_index2, value2)

# Disabled alternative (dense arrays instead of MLlib sparse matrices):
# sparse1 = newmat.toarray()
# sparse2 = rand_mat.toarray()
# print sparse2

# Half sizes for a 2x2 block split.  Floor division keeps them integers on
# Python 3, so they stay usable as slice bounds.
r1 = sp_rows // 2
c1 = sp_cols // 2
r2 = sp_cols // 2
c2 = sp_cols // 2

# Disabled block split.
# NOTE(review): blocks2 uses the key (0,1) twice and never (1,0) - fix before
# re-enabling.
# a, b, c, d = sparse1[:r1, :c1], sparse1[r1:, :c1], sparse1[:r1, c1:], sparse1[r1:, c1:]
# e, f, g, h = sparse2[:r2, :c2], sparse2[r2:, :c2], sparse2[:r2, c2:], sparse2[r2:, c2:]
# blocks1 = sc.parallelize([((0, 0), a),((1,0), b), ((0,1),c), ((1,1),d)])
# blocks2 = sc.parallelize([((0, 0), e),((0,1), f), ((0,1),g), ((1,1),h)])