Example #1
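 # Excerpt from a test case: it assumes Row, Vectors (pyspark.mllib.linalg) and
 # IndexedRowMatrix (pyspark.mllib.linalg.distributed) are imported at module level,
 # and that self.spark is a SparkSession created by the test harness.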
 def test_indexed_row_matrix_from_dataframe(self):
     from pyspark.sql.utils import IllegalArgumentException
     df = self.spark.createDataFrame([Row(int(0), Vectors.dense(1))])
     matrix = IndexedRowMatrix(df)
     self.assertEqual(matrix.numRows(), 1)
     self.assertEqual(matrix.numCols(), 1)
     with self.assertRaises(IllegalArgumentException):
         IndexedRowMatrix(df.drop("_1"))
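For comparison, the same distributed matrix can be built directly from an RDD of IndexedRow instead of a two-column (index, vector) DataFrame. The snippet below is a minimal, self-contained sketch; the local SparkSession setup and the app name are assumptions, not part of the original test.

from pyspark.sql import SparkSession
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

spark = SparkSession.builder.master("local[2]").appName("indexed-row-matrix-demo").getOrCreate()

# one row with index 0 and a 1-dimensional dense vector, mirroring the test above
rows = spark.sparkContext.parallelize([IndexedRow(0, Vectors.dense([1.0]))])
matrix = IndexedRowMatrix(rows)
print(matrix.numRows(), matrix.numCols())  # 1 1

spark.stop()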
Example #2
import math
import time

import numpy as np

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

# readMatrixRowPerLine, readVector, printVector and L2.DGEMV are helper routines from
# the same project and are not shown in this example.


class ConjugateGradient:
    def __init__(self, args, sc):
        self.EPSILON = 1.0e-5                 # convergence tolerance on the residual norm
        self.ctx = sc
        self.numPartitions = args.partitions
        self.numIterations = args.iterations
        self.inputVectorPath = args.inputVector
        self.inputMatrixPath = args.inputMatrix
        self.outputVectorPath = args.outputVector

        # Read the matrix input data as an RDD of IndexedRow, repartitioning if requested
        inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
            .map(lambda line: IndexedRow(line[0], line[1]))
        if self.numPartitions != 0:
            inputMatrixData = inputMatrixData.repartition(self.numPartitions)

        self.inputMatrix = IndexedRowMatrix(inputMatrixData)
        self.inputVector = readVector(self.inputVectorPath, self.ctx)

        # default to 2 * n iterations when no iteration count was given
        if self.numIterations == 0:
            self.numIterations = self.inputVector.size * 2

        self.result = Vectors.zeros(self.inputVector.size)

    def solve(self):
        stop = False
        start = time.time()

        # r = b (initial residual, taking x0 = 0)
        r = np.copy(self.inputVector)
        Ap = Vectors.zeros(self.inputMatrix.numRows())

        # p = r (initial search direction)
        p = np.copy(r)

        # rsold = r' * r
        rsold = r.dot(r)

        alpha = 0.0
        rsnew = 0.0
        k = 0

        while not stop:
            # Ap = A * p (distributed matrix-vector product)
            Ap = L2.DGEMV(1.0, self.inputMatrix, p, 0.0, Ap, self.ctx)

            # alpha = rsold / (p' * Ap)
            alpha = rsold / p.dot(Ap)

            # x = x + alpha * p
            self.result = self.result + alpha * p

            # r = r - alpha * Ap
            r = r - alpha * Ap

            # rsnew = r' * r
            rsnew = r.dot(r)

            # stop once the residual norm is small enough or the iteration budget is spent
            if math.sqrt(rsnew) <= self.EPSILON or k >= self.numIterations:
                stop = True

            # p = r + (rsnew / rsold) * p
            p = r + (rsnew / rsold) * p
            rsold = rsnew
            k += 1

        # end of the conjugate gradient iteration
        end = time.time()

        print("Total time in solve system is: " + str(end - start) + " and " + str(k) + " iterations.")

        printVector(self.result)
        return self.result
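To make it easier to see what solve() computes, here is a small pure-NumPy reference of the same conjugate-gradient iteration for a dense system A x = b with A symmetric positive definite. The function name conjugate_gradient and the 2 x 2 test system are illustrative assumptions, not part of the original project.

import math
import numpy as np

def conjugate_gradient(A, b, eps=1.0e-5, max_iter=None):
    # x0 = 0, so the initial residual r equals b
    x = np.zeros(b.size)
    r = b.copy()
    p = r.copy()                      # initial search direction
    rsold = r.dot(r)
    max_iter = max_iter or 2 * b.size
    for k in range(max_iter):
        Ap = A.dot(p)                 # local stand-in for the distributed L2.DGEMV call
        alpha = rsold / p.dot(Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        rsnew = r.dot(r)
        if math.sqrt(rsnew) <= eps:
            break
        p = r + (rsnew / rsold) * p
        rsold = rsnew
    return x

A = np.array([[4.0, 1.0], [1.0, 3.0]])
b = np.array([1.0, 2.0])
print(conjugate_gradient(A, b))       # approximately [0.0909, 0.6364]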
Example #3
# Assumed context for this example: spark is a SparkSession, and iris_rdd / X_rdd are
# RDDs of feature Vectors built from the iris data set.
import numpy as np

from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg.distributed import IndexedRowMatrix

# center the data (mean removal only, no scaling to unit variance)
scaler = StandardScaler(withMean=True, withStd=False).fit(iris_rdd)
X_sc = scaler.transform(X_rdd)

# create the IndexedRowMatrix from the centered RDD as (index, vector) pairs
X_rm = IndexedRowMatrix(X_sc.zipWithIndex().map(lambda x: (x[1], x[0])))

# compute the SVD of the matrix: the first argument is k, the number of leading singular
# values/vectors to keep (here all columns), and the second is whether to compute U
svd_o = X_rm.computeSVD(X_rm.numCols(), True)

# svd_o.V has shape d x k (features x components), not k x d as in sklearn's components_

P_comps = svd_o.V.toArray().copy()
num_rows = X_rm.numRows()
# U is whitened and projected onto the principal-component subspace below.

S = svd_o.s.toArray()
eig_vals = S**2
# n_comp is fixed to 3 for this tutorial; the commented line would instead keep enough
# components to explain 95% of the variance
# n_comp = np.argmax(np.cumsum(eig_vals)/eig_vals.sum() > 0.95) + 1
n_comp = 3
U = svd_o.U.rows.map(lambda x: (x.index, (np.sqrt(num_rows - 1) * x.vector).tolist()[0:n_comp]))
# K is the transformation matrix that projects the data onto the PC subspace; it is built
# from the local right-singular vectors P_comps (U above is now an RDD, so it cannot be used here)
K = (P_comps / S).T[:n_comp]

import pyspark.sql.functions as f
import pyspark.sql.types as t

df = spark.createDataFrame(U).toDF("id", "features")
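As a related sketch (not part of the original example), the principal-component scores can also be obtained by multiplying the distributed matrix by the first n_comp right-singular vectors with IndexedRowMatrix.multiply, available in Spark 2.2+. The names V_k, V_k_local and X_pc are illustrative; everything else is defined above.

from pyspark.mllib.linalg import Matrices

V_k = P_comps[:, :n_comp]                     # d x n_comp right-singular vectors
# Matrices.dense expects its values in column-major order
V_k_local = Matrices.dense(V_k.shape[0], V_k.shape[1], V_k.flatten('F').tolist())
X_pc = X_rm.multiply(V_k_local)               # IndexedRowMatrix of PC scores (num_rows x n_comp)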
Example #4
    def sparkComputeCost(self, input_file, x, y, theta):
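        # Assumed context (not shown in this excerpt): SparkContext and SQLContext come from
        # pyspark, IndexedRow and IndexedRowMatrix from pyspark.mllib.linalg.distributed, and
        # pyspark.sql.functions is imported as psf for the commented-out DataFrame variant below.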
        
        sc = SparkContext()

        # add the ones vector while building the RDD
        x_mat = sc.textFile(input_file) \
            .map(lambda line: ('1, ' + line).split(",")[:-1]) \
            .zipWithIndex()
        
        # need a SQLContext() to generate an IndexedRowMatrix from RDD
        sqlContext = SQLContext(sc)
        
        x_mat = IndexedRowMatrix( \
            x_mat \
            .map(lambda row: IndexedRow(row[1], row[0])) \
            ).toBlockMatrix()

        x_mat.cache()

        print "Matrix rows x cols"
        print x_mat.numRows()
        print x_mat.numCols()

        vec = sc.parallelize(theta) \
            .map(lambda line: [line]) \
            .zipWithIndex()

        vec = IndexedRowMatrix( \
            vec \
            .map(lambda row: IndexedRow(row[1], row[0])) \
            ).toBlockMatrix()

        vec.cache()

        print "Vector rows x cols"
        print vec.numRows()
        print vec.numCols()

        h = x_mat.multiply(vec)
        h.cache()

        print "Hypothesis rows x cols"
        print h.numRows()
        print h.numCols()

        y_vec = sc.textFile(input_file) \
            .map(lambda line: [('1, ' + line).split(",")[-1]]) \
            .zipWithIndex()

        y_vec = IndexedRowMatrix( \
            y_vec \
            .map(lambda row: IndexedRow(row[1], row[0])) \
            ).toBlockMatrix()

        y_vec.cache()

        errors = h.subtract(y_vec).toLocalMatrix()

        print(sum(errors.toArray()))
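
        # Illustrative addition, not part of the original method: the usual least-squares
        # cost J(theta) = 1/(2m) * sum(errors^2), computed from the local errors matrix.
        m = errors.numRows
        cost = float((errors.toArray() ** 2).sum()) / (2 * m)
        print("Cost: " + str(cost))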

        '''sparkSession = SparkSession \
            .builder \
            .appName('pyspark') \
            .getOrCreate()
        
        df = sparkSession.read.csv(input_file)
        df = df \
            .toDF(x, y) \
            .withColumn("Ones", psf.lit(1)) \
            .cache()

        df.select(x,'Ones').show()'''

        '''sc = SparkContext('local', 'pyspark')