Example #1
def mult(A, B):
    # ------- LOG
    logging.warn("Multiplication started")
    blockcount = A.blocks.getNumPartitions()
    logging.warn("A part count: %s", blockcount)
    blockcount = B.blocks.getNumPartitions()
    logging.warn("B part count: %s", blockcount)
    # ------- LOG

    # If dense, just call the inbuilt function.
    if (isinstance(A.blocks.first()[1], DenseMatrix)
            or isinstance(B.blocks.first()[1], DenseMatrix)):
        return A.multiply(B)
    # Sparse? Then continue the madness.

    N = A.numRows()
    p = SQUARE_BLOCK_SIZE
    num_blocks = N // p  # number of blocks along each dimension (integer division)

    # Replicate each block of A and B to every output block it contributes to
    # (affectLeft / affectRight are helpers defined elsewhere in the project).
    aleft = A.blocks.flatMap(lambda x: affectLeft(x, num_blocks))
    bright = B.blocks.flatMap(lambda x: affectRight(x, num_blocks))
    both = aleft.union(bright)
    # Combine the two blocks that share a key with prod(), then re-key by the
    # output block index and sum the partial products.
    indi = both.reduceByKey(lambda a, b: prod(a, b))
    keyed = indi.map(lambda x: ((x[0][0], x[0][2]), x[1]))
    pr = keyed.reduceByKey(add)
    # Convert each SciPy sparse result back into a pyspark sparse matrix block.
    brd = pr.map(lambda x: ((x[0][0], x[0][1]),
                            Matrices.sparse(p, p, x[1].indptr, x[1].indices, x[1].data)))
    C = BlockMatrix(brd, p, p, N, N)
    return C
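# The helpers affectLeft, affectRight, prod and the constant SQUARE_BLOCK_SIZE
# are not shown in this snippet. Below is a minimal sketch of what they might
# look like, assuming the blocks are pyspark.mllib SparseMatrix objects stored
# in plain (non-transposed) CSC form and SciPy is used for the per-block
# arithmetic. These are hypothetical reconstructions, not the original code.
from scipy import sparse

SQUARE_BLOCK_SIZE = 100  # assumed block edge length


def _to_csc(mat):
    # Convert a pyspark.mllib.linalg.SparseMatrix block to a SciPy CSC matrix.
    return sparse.csc_matrix((mat.values, mat.rowIndices, mat.colPtrs),
                             shape=(mat.numRows, mat.numCols))


def affectLeft(block, num_blocks):
    # Block A[i, k] contributes to every output block C[i, j].
    (i, k), mat = block
    return [((i, k, j), ('L', _to_csc(mat))) for j in range(num_blocks)]


def affectRight(block, num_blocks):
    # Block B[k, j] contributes to every output block C[i, j].
    (k, j), mat = block
    return [((i, k, j), ('R', _to_csc(mat))) for i in range(num_blocks)]


def prod(a, b):
    # reduceByKey does not guarantee argument order, so the 'L'/'R' tags decide
    # which operand is the left factor of the block product.
    left = a[1] if a[0] == 'L' else b[1]
    right = b[1] if a[0] == 'L' else a[1]
    return left * right  # SciPy sparse matrix product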
Example #2
def difun(x, vect):
    if x[0] == x[1]:
        sm = SparseMatrix(p, p, np.linspace(0, p, num=(p + 1)),
                          np.linspace(0, p - 1, num=p),
                          vect[(x[0] * p):((x[0] + 1) * p)])
        return (x, sm)
    else:
        h = sparse.csc_matrix((p, p))
        return (x, Matrices.sparse(p, p, h.indptr, h.indices, h.data))
Example #3
def loadBlockFromMatFile(filename):
    data = loadmat(filename, squeeze_me=True)
    id, G = data['block_id'], data['G']
    if isinstance(G, sparse.csc_matrix):
        sub_matrix = Matrices.sparse(p, p, G.indptr, G.indices, G.data)
    else:
        sub_matrix = Matrices.dense(p, p, G.transpose().flatten())
    return ((id[0], id[1]), sub_matrix)
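# Usage sketch (not from the original source): assemble the per-file blocks into
# a distributed BlockMatrix. An existing SparkContext `sc`, the block size `p`,
# and the list of .mat paths are assumed, and `block_id` is assumed to hold
# integer block indices.
from pyspark.mllib.linalg.distributed import BlockMatrix

mat_files = ["0_0.mat", "0_1.mat"]  # hypothetical per-block files
blocks = sc.parallelize(mat_files).map(loadBlockFromMatFile)
M = BlockMatrix(blocks, p, p)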
Example #4
 def test_ml_mllib_matrix_conversion(self):
     # to ml
     # dense
     mllibDM = Matrices.dense(2, 2, [0, 1, 2, 3])
     mlDM1 = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3])
     mlDM2 = mllibDM.asML()
     self.assertEqual(mlDM2, mlDM1)
     # transposed
     mllibDMt = DenseMatrix(2, 2, [0, 1, 2, 3], True)
     mlDMt1 = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True)
     mlDMt2 = mllibDMt.asML()
     self.assertEqual(mlDMt2, mlDMt1)
     # sparse
     mllibSM = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
     mlSM1 = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1],
                                       [2, 3, 4])
     mlSM2 = mllibSM.asML()
     self.assertEqual(mlSM2, mlSM1)
     # transposed
     mllibSMt = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
     mlSMt1 = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4],
                                     True)
     mlSMt2 = mllibSMt.asML()
     self.assertEqual(mlSMt2, mlSMt1)
     # from ml
     # dense
     mllibDM1 = Matrices.dense(2, 2, [1, 2, 3, 4])
     mlDM = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4])
     mllibDM2 = Matrices.fromML(mlDM)
     self.assertEqual(mllibDM1, mllibDM2)
     # transposed
     mllibDMt1 = DenseMatrix(2, 2, [1, 2, 3, 4], True)
     mlDMt = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True)
     mllibDMt2 = Matrices.fromML(mlDMt)
     self.assertEqual(mllibDMt1, mllibDMt2)
     # sparse
     mllibSM1 = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
     mlSM = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
     mllibSM2 = Matrices.fromML(mlSM)
     self.assertEqual(mllibSM1, mllibSM2)
     # transposed
     mllibSMt1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
     mlSMt = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4],
                                    True)
     mllibSMt2 = Matrices.fromML(mlSMt)
     self.assertEqual(mllibSMt1, mllibSMt2)
Example #5
 def test_ml_mllib_matrix_conversion(self):
     # to ml
     # dense
     mllibDM = Matrices.dense(2, 2, [0, 1, 2, 3])
     mlDM1 = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3])
     mlDM2 = mllibDM.asML()
     self.assertEqual(mlDM2, mlDM1)
     # transposed
     mllibDMt = DenseMatrix(2, 2, [0, 1, 2, 3], True)
     mlDMt1 = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True)
     mlDMt2 = mllibDMt.asML()
     self.assertEqual(mlDMt2, mlDMt1)
     # sparse
     mllibSM = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
     mlSM1 = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
     mlSM2 = mllibSM.asML()
     self.assertEqual(mlSM2, mlSM1)
     # transposed
     mllibSMt = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
     mlSMt1 = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
     mlSMt2 = mllibSMt.asML()
     self.assertEqual(mlSMt2, mlSMt1)
     # from ml
     # dense
     mllibDM1 = Matrices.dense(2, 2, [1, 2, 3, 4])
     mlDM = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4])
     mllibDM2 = Matrices.fromML(mlDM)
     self.assertEqual(mllibDM1, mllibDM2)
     # transposed
     mllibDMt1 = DenseMatrix(2, 2, [1, 2, 3, 4], True)
     mlDMt = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True)
     mllibDMt2 = Matrices.fromML(mlDMt)
     self.assertEqual(mllibDMt1, mllibDMt2)
     # sparse
     mllibSM1 = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
     mlSM = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
     mllibSM2 = Matrices.fromML(mlSM)
     self.assertEqual(mllibSM1, mllibSM2)
     # transposed
     mllibSMt1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
     mlSMt = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
     mllibSMt2 = Matrices.fromML(mlSMt)
     self.assertEqual(mllibSMt1, mllibSMt2)
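# Standalone sketch of the same conversions exercised by the tests above
# (assumes a working PySpark installation; variable names are illustrative only).
from pyspark.mllib.linalg import Matrices as mllib_Matrices

old_sm = mllib_Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])  # mllib CSC matrix
new_sm = old_sm.asML()                       # pyspark.ml.linalg.SparseMatrix
round_trip = mllib_Matrices.fromML(new_sm)   # back to pyspark.mllib.linalg
assert round_trip == old_sm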
Example #6
def difun(x, vect):
    if (x[0] == x[1]):
        sm = SparseMatrix(
            SQUARE_BLOCK_SIZE, SQUARE_BLOCK_SIZE,
            np.linspace(0, SQUARE_BLOCK_SIZE, num=(SQUARE_BLOCK_SIZE + 1)),
            np.linspace(0, SQUARE_BLOCK_SIZE - 1, num=SQUARE_BLOCK_SIZE),
            vect[(x[0] * SQUARE_BLOCK_SIZE):((x[0] + 1) * SQUARE_BLOCK_SIZE)])
        return (x, sm)
    else:
        h = sparse.csc_matrix((SQUARE_BLOCK_SIZE, SQUARE_BLOCK_SIZE))
        return (x,
                Matrices.sparse(SQUARE_BLOCK_SIZE, SQUARE_BLOCK_SIZE, h.indptr,
                                h.indices, h.data))
Example #7
 def difun(self, x, vect):
     squareBlockSize = copy.deepcopy(self.squareBlockSize)
     if (x[0] == x[1]):
         sm = SparseMatrix(
             squareBlockSize, squareBlockSize,
             np.linspace(0, squareBlockSize, num=(squareBlockSize + 1)),
             np.linspace(0, squareBlockSize - 1, num=squareBlockSize),
             vect[(x[0] * squareBlockSize):((x[0] + 1) * squareBlockSize)])
         return (x, sm)
     else:
         h = sparse.csc_matrix((squareBlockSize, squareBlockSize))
         return (x,
                 Matrices.sparse(squareBlockSize, squareBlockSize, h.indptr,
                                 h.indices, h.data))
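# Usage sketch for the difun variants above (illustrative, not from the original
# project): build a block-diagonal BlockMatrix from a vector. `sc`, `vect`, the
# total size N and SQUARE_BLOCK_SIZE are assumed to exist; the standalone
# difun(x, vect) from Example #6 is used here.
from pyspark.mllib.linalg.distributed import BlockMatrix

num_blocks = N // SQUARE_BLOCK_SIZE  # assumes N is a multiple of the block size
block_ids = [(i, j) for i in range(num_blocks) for j in range(num_blocks)]
blocks = sc.parallelize(block_ids).map(lambda x: difun(x, vect))
D = BlockMatrix(blocks, SQUARE_BLOCK_SIZE, SQUARE_BLOCK_SIZE, N, N)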
Example #8
def MapperLoadBlocksFromMatFile(filename):
    logging.warn('MapperLoadBlocksFromMatFile started %s ', filename)
    data = loadmat(filename)
    logging.warn('Loaded data')
    name = re.search(r'(\d+_\d+)\.mat$', filename, re.IGNORECASE).group(1)
    G = data[name]
    block_id = name.split('_')
    n = G.shape[0]
    logging.warn('Before sparse conversion')
    if not isinstance(G, sparse.csc_matrix):
        sub_matrix = Matrices.dense(n, n, G.transpose().flatten())
    else:
        # sub_matrix = Matrices.dense(n, n, np.array(G.todense()).transpose().flatten())
        # SPARSE
        sub_matrix = Matrices.sparse(n, n, G.indptr, G.indices, G.data)
    logging.warn('MapperLoadBlocksFromMatFile Ended')
    return ((block_id[0], block_id[1]), sub_matrix)
Example #9
def constructElectionBlock(pairDonations):
    I = int(pairDonations[0][0])
    J = int(pairDonations[1][0])
    donationsI = pairDonations[0][1]
    donationsJ = pairDonations[1][1]

    n = donationsI.shape[0]
    allCombinations = itertools.product(donationsI, donationsJ)
    allCombsEdges = [edgeDefinitionElection(p[0], p[1]) for p in allCombinations]
    if len(allCombsEdges) == (n * n):
        adj = np.reshape(allCombsEdges, (n, n))
    else:
        adj = np.zeros((n, n))
    if I == J:
        adj[range(n), range(n)] = 0  # zero the diagonal of a diagonal block

    if GENERATE_SPARSE:
        G = sparse.csc_matrix(adj)
        subMatrixSparse = Matrices.sparse(n, n, G.indptr, G.indices, G.data)
        return ((I, J), subMatrixSparse)
    else:
        G = Matrices.dense(n, n, adj.transpose().flatten())
        return ((I, J), G)
Example #10
from __future__ import print_function

#Section 7.2.1
from pyspark.mllib.linalg import Vectors, Vector
dv1 = Vectors.dense(5.0,6.0,7.0,8.0)
dv2 = Vectors.dense([5.0,6.0,7.0,8.0])
sv = Vectors.sparse(4, [0,1,2,3], [5.0,6.0,7.0,8.0])
dv2[2]
dv1.size
dv2.toArray()

from pyspark.mllib.linalg import Matrices

dm = Matrices.dense(2,3,[5.0,0.0,0.0,3.0,1.0,4.0])
sm = Matrices.sparse(2,3,[0,1,2,4], [0,1,0,1], [5.0,3.0,1.0,4.0])
sm.toDense()
dm.toSparse()
dm[1,1]

#Section 7.2.2
from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow
# Note: in PySpark, RowMatrix.rows is a property, not a method; rm is a RowMatrix
# created earlier in the full listing.
rmind = IndexedRowMatrix(rm.rows.zipWithIndex().map(lambda x: IndexedRow(x[1], x[0])))

#Section 7.4
housingLines = sc.textFile("first-edition/ch07/housing.data", 6)
housingVals = housingLines.map(lambda x: Vectors.dense([float(v.strip()) for v in x.split(",")]))

#Section 7.4.1
from pyspark.mllib.linalg.distributed import RowMatrix
housingMat = RowMatrix(housingVals)
from pyspark.mllib.stat import Statistics
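# Possible continuation (not part of this excerpt): the Statistics import above is
# typically used to compute column statistics over the housing data. The variable
# name below is illustrative.
housingStats = Statistics.colStats(housingVals)
print(housingStats.mean())
print(housingStats.variance())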
Example #11
# Labelled point with a positive label and a dense feature vector.
lp_pos = LabeledPoint(1.0, [5.0, 0.0, 1.0, 7.0])

# Labelled point with a negative label and a sparse feature vector.
lp_neg = LabeledPoint(0.0, SparseVector(4, [0, 2, 3], [5.0, 1.0, 7.0]))

#
# Local Matrix
#
from pyspark.mllib.linalg import Matrix, Matrices

# Dense 2x3 matrix ((1.0, 3.0, 5.0), (2.0, 4.0, 6.0)); Matrices.dense takes the
# values in column-major order.
dMatrix = Matrices.dense(2, 3, [1, 2, 3, 4, 5, 6])

# Sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
sMatrix = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])

#
# Code Plan
#
#
# 1- Combine all tweets files into a single data frame
# 2- Parse the Tweets - remove stopwords - extract emoticons - extract url - normalize your words (e.g., mapping them to lowercase and removing punctuation and numbers)
# 3- Feature extraction
# 		3a- Tokenisation
# 		3b- TF-IDF
# 		3c- Hash TF-IDF
# 4- Run K-Means clustering
# 5- Evaluate
# 		5a- Identify Tweets to Cluster
#       5b- Dimensionality reduction to 2 dimensions with PCA
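# Minimal sketch of steps 3-4 of the plan above, using pyspark.mllib. This is an
# illustration only: `tweets` is assumed to be an RDD of token lists produced by
# step 2, and the parameter values are arbitrary.
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.clustering import KMeans

tf = HashingTF(numFeatures=10000).transform(tweets)  # 3a/3c: hashed term frequencies
tf.cache()
tfidf = IDF().fit(tf).transform(tf)                  # 3b: TF-IDF feature vectors
model = KMeans.train(tfidf, k=8, maxIterations=20)   # 4: K-Means clustering
cluster_ids = model.predict(tfidf)                   # 5a: cluster id per tweet
# 5b: the 2-D projection (e.g. PCA) for visualisation is left out of this sketch.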
Example #12
from __future__ import print_function

#Section 7.2.1
from pyspark.mllib.linalg import Vectors, Vector
dv1 = Vectors.dense(5.0, 6.0, 7.0, 8.0)
dv2 = Vectors.dense([5.0, 6.0, 7.0, 8.0])
sv = Vectors.sparse(4, [0, 1, 2, 3], [5.0, 6.0, 7.0, 8.0])
dv2[2]
dv1.size
dv2.toArray()

from pyspark.mllib.linalg import Matrices

dm = Matrices.dense(2, 3, [5.0, 0.0, 0.0, 3.0, 1.0, 4.0])
sm = Matrices.sparse(2, 3, [0, 1, 2, 4], [0, 1, 0, 1], [5.0, 3.0, 1.0, 4.0])
sm.toDense()
dm.toSparse()
dm[1, 1]

#Section 7.2.2
from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow
# As above, RowMatrix.rows is a property, not a method.
rmind = IndexedRowMatrix(
    rm.rows.zipWithIndex().map(lambda x: IndexedRow(x[1], x[0])))

#Section 7.4
housingLines = sc.textFile("first-edition/ch07/housing.data", 6)
housingVals = housingLines.map(
    lambda x: Vectors.dense([float(v.strip()) for v in x.split(",")]))

#Section 7.4.1
from pyspark.mllib.linalg.distributed import RowMatrix
Example #13
rand_mat = scipy.sparse.rand(sp_cols, sp_cols, density=0.2, format='csc')
value1 = newmat.data
col_index1 = newmat.indices
row_pointers1 = newmat.indptr

value2 = rand_mat.data
col_index2 = rand_mat.indices
row_pointers2 = rand_mat.indptr

start2 = timeit.default_timer()
just = newmat * rand_mat  # local SciPy sparse product, timed for comparison
stop2 = timeit.default_timer()
t2 = (stop2 - start2)
print(t2)

# Matrices.sparse(numRows, numCols, colPtrs, rowIndices, values) expects CSC
# layout; for a SciPy csc_matrix, .indptr and .indices supply exactly that
# (despite the variable names above).
sparse1 = Matrices.sparse(sp_rows, sp_cols, row_pointers1, col_index1, value1)
sparse2 = Matrices.sparse(sp_cols, sp_cols, row_pointers2, col_index2, value2)
"""sparse1 = newmat.toarray()
sparse2 = rand_mat.toarray()
print sparse2"""

r1 = sp_rows // 2
c1 = sp_cols // 2
r2 = sp_cols // 2
c2 = sp_cols // 2
"""a, b, c, d = sparse1[:r1, :c1], sparse1[r1:, :c1], sparse1[:r1, c1:], sparse1[r1:, c1:]
e, f, g, h = sparse2[:r2, :c2], sparse2[r2:, :c2], sparse2[:r2, c2:], sparse2[r2:, c2:]

blocks1 = sc.parallelize([((0, 0), a), ((1, 0), b), ((0, 1), c), ((1, 1), d)])
blocks2 = sc.parallelize([((0, 0), e), ((1, 0), f), ((0, 1), g), ((1, 1), h)])"""
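# Sketch (not in the original script): the same product computed with Spark's
# distributed BlockMatrix API, using the two local sparse matrices built above as
# single blocks. Assumes an existing SparkContext `sc`.
from pyspark.mllib.linalg.distributed import BlockMatrix

bm1 = BlockMatrix(sc.parallelize([((0, 0), sparse1)]), sp_rows, sp_cols)
bm2 = BlockMatrix(sc.parallelize([((0, 0), sparse2)]), sp_cols, sp_cols)

start3 = timeit.default_timer()
bm_product = bm1.multiply(bm2)  # distributed block-matrix multiply
bm_product.blocks.count()       # force evaluation before reading the timer
stop3 = timeit.default_timer()
print(stop3 - start3)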