Example #1
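 # Excerpt from a test case: it assumes Row, Vectors (pyspark.mllib.linalg) and
 # IndexedRowMatrix (pyspark.mllib.linalg.distributed) are imported at module level,
 # and that self.spark is a SparkSession created by the test harness.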
 def test_indexed_row_matrix_from_dataframe(self):
     from pyspark.sql.utils import IllegalArgumentException
     df = self.spark.createDataFrame([Row(int(0), Vectors.dense(1))])
     matrix = IndexedRowMatrix(df)
     self.assertEqual(matrix.numRows(), 1)
     self.assertEqual(matrix.numCols(), 1)
     with self.assertRaises(IllegalArgumentException):
         IndexedRowMatrix(df.drop("_1"))
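For comparison, the same distributed matrix can be built directly from an RDD of IndexedRow instead of a two-column (index, vector) DataFrame. The snippet below is a minimal, self-contained sketch; the local SparkSession setup and the app name are assumptions, not part of the original test.

from pyspark.sql import SparkSession
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

spark = SparkSession.builder.master("local[2]").appName("indexed-row-matrix-demo").getOrCreate()

# one row with index 0 and a 1-dimensional dense vector, mirroring the test above
rows = spark.sparkContext.parallelize([IndexedRow(0, Vectors.dense([1.0]))])
matrix = IndexedRowMatrix(rows)
print(matrix.numRows(), matrix.numCols())  # 1 1

spark.stop()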
Example #2
import math
import time

import numpy as np

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

# readMatrixRowPerLine, readVector, printVector and L2.DGEMV are helper routines from
# the same project and are not shown in this example.


class ConjugateGradient:
    def __init__(self, args, sc):
        self.EPSILON = 1.0e-5                 # convergence tolerance on the residual norm
        self.ctx = sc
        self.numPartitions = args.partitions
        self.numIterations = args.iterations
        self.inputVectorPath = args.inputVector
        self.inputMatrixPath = args.inputMatrix
        self.outputVectorPath = args.outputVector

        # Read the matrix input data as an RDD of IndexedRow, repartitioning if requested
        inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
            .map(lambda line: IndexedRow(line[0], line[1]))
        if self.numPartitions != 0:
            inputMatrixData = inputMatrixData.repartition(self.numPartitions)

        self.inputMatrix = IndexedRowMatrix(inputMatrixData)
        self.inputVector = readVector(self.inputVectorPath, self.ctx)

        # default to 2 * n iterations when no iteration count was given
        if self.numIterations == 0:
            self.numIterations = self.inputVector.size * 2

        self.result = Vectors.zeros(self.inputVector.size)

    def solve(self):
        stop = False
        start = time.time()

        # r = b (initial residual, taking x0 = 0)
        r = np.copy(self.inputVector)
        Ap = Vectors.zeros(self.inputMatrix.numRows())

        # p = r (initial search direction)
        p = np.copy(r)

        # rsold = r' * r
        rsold = r.dot(r)

        alpha = 0.0
        rsnew = 0.0
        k = 0

        while not stop:
            # Ap = A * p (distributed matrix-vector product)
            Ap = L2.DGEMV(1.0, self.inputMatrix, p, 0.0, Ap, self.ctx)

            # alpha = rsold / (p' * Ap)
            alpha = rsold / p.dot(Ap)

            # x = x + alpha * p
            self.result = self.result + alpha * p

            # r = r - alpha * Ap
            r = r - alpha * Ap

            # rsnew = r' * r
            rsnew = r.dot(r)

            # stop once the residual norm is small enough or the iteration budget is spent
            if math.sqrt(rsnew) <= self.EPSILON or k >= self.numIterations:
                stop = True

            # p = r + (rsnew / rsold) * p
            p = r + (rsnew / rsold) * p
            rsold = rsnew
            k += 1

        # end of the conjugate gradient iteration
        end = time.time()

        print("Total time in solve system is: " + str(end - start) + " and " + str(k) + " iterations.")

        printVector(self.result)
        return self.result
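To make it easier to see what solve() computes, here is a small pure-NumPy reference of the same conjugate-gradient iteration for a dense system A x = b with A symmetric positive definite. The function name conjugate_gradient and the 2 x 2 test system are illustrative assumptions, not part of the original project.

import math
import numpy as np

def conjugate_gradient(A, b, eps=1.0e-5, max_iter=None):
    # x0 = 0, so the initial residual r equals b
    x = np.zeros(b.size)
    r = b.copy()
    p = r.copy()                      # initial search direction
    rsold = r.dot(r)
    max_iter = max_iter or 2 * b.size
    for k in range(max_iter):
        Ap = A.dot(p)                 # local stand-in for the distributed L2.DGEMV call
        alpha = rsold / p.dot(Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        rsnew = r.dot(r)
        if math.sqrt(rsnew) <= eps:
            break
        p = r + (rsnew / rsold) * p
        rsold = rsnew
    return x

A = np.array([[4.0, 1.0], [1.0, 3.0]])
b = np.array([1.0, 2.0])
print(conjugate_gradient(A, b))       # approximately [0.0909, 0.6364]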
Example #3
# Assumed context for this example: spark is a SparkSession, and iris_rdd / X_rdd are
# RDDs of feature Vectors built from the iris data set.
import numpy as np

from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg.distributed import IndexedRowMatrix

# center the data (mean removal only, no scaling to unit variance)
scaler = StandardScaler(withMean=True, withStd=False).fit(iris_rdd)
X_sc = scaler.transform(X_rdd)

# create the IndexedRowMatrix from the centered RDD as (index, vector) pairs
X_rm = IndexedRowMatrix(X_sc.zipWithIndex().map(lambda x: (x[1], x[0])))

# compute the SVD of the matrix: the first argument is k, the number of leading singular
# values/vectors to keep (here all columns), and the second is whether to compute U
svd_o = X_rm.computeSVD(X_rm.numCols(), True)

# svd_o.V has shape d x k (features x components), not k x d as in sklearn's components_

P_comps = svd_o.V.toArray().copy()
num_rows = X_rm.numRows()
# U is whitened and projected onto the principal-component subspace below.

S = svd_o.s.toArray()
eig_vals = S**2
# n_comp is fixed to 3 for this tutorial; the commented line would instead keep enough
# components to explain 95% of the variance
# n_comp = np.argmax(np.cumsum(eig_vals)/eig_vals.sum() > 0.95) + 1
n_comp = 3
U = svd_o.U.rows.map(lambda x: (x.index, (np.sqrt(num_rows - 1) * x.vector).tolist()[0:n_comp]))
# K is the transformation matrix that projects the data onto the PC subspace; it is built
# from the local right-singular vectors P_comps (U above is now an RDD, so it cannot be used here)
K = (P_comps / S).T[:n_comp]

import pyspark.sql.functions as f
import pyspark.sql.types as t

df = spark.createDataFrame(U).toDF("id", "features")
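As a related sketch (not part of the original example), the principal-component scores can also be obtained by multiplying the distributed matrix by the first n_comp right-singular vectors with IndexedRowMatrix.multiply, available in Spark 2.2+. The names V_k, V_k_local and X_pc are illustrative; everything else is defined above.

from pyspark.mllib.linalg import Matrices

V_k = P_comps[:, :n_comp]                     # d x n_comp right-singular vectors
# Matrices.dense expects its values in column-major order
V_k_local = Matrices.dense(V_k.shape[0], V_k.shape[1], V_k.flatten('F').tolist())
X_pc = X_rm.multiply(V_k_local)               # IndexedRowMatrix of PC scores (num_rows x n_comp)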
Example #4
    def sparkComputeCost(self, input_file, x, y, theta):
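        # Assumed context (not shown in this excerpt): SparkContext and SQLContext come from
        # pyspark, IndexedRow and IndexedRowMatrix from pyspark.mllib.linalg.distributed, and
        # pyspark.sql.functions is imported as psf for the commented-out DataFrame variant below.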
        
        sc = SparkContext()

        # add the ones vector while building the RDD
        x_mat = sc.textFile(input_file) \
            .map(lambda line: ('1, ' + line).split(",")[:-1]) \
            .zipWithIndex()
        
        # need a SQLContext() to generate an IndexedRowMatrix from RDD
        sqlContext = SQLContext(sc)
        
        x_mat = IndexedRowMatrix( \
            x_mat \
            .map(lambda row: IndexedRow(row[1], row[0])) \
            ).toBlockMatrix()

        x_mat.cache()

        print "Matrix rows x cols"
        print x_mat.numRows()
        print x_mat.numCols()

        vec = sc.parallelize(theta) \
            .map(lambda line: [line]) \
            .zipWithIndex()

        vec = IndexedRowMatrix( \
            vec \
            .map(lambda row: IndexedRow(row[1], row[0])) \
            ).toBlockMatrix()

        vec.cache()

        print "Vector rows x cols"
        print vec.numRows()
        print vec.numCols()

        h = x_mat.multiply(vec)
        h.cache()

        print "Hypothesis rows x cols"
        print h.numRows()
        print h.numCols()

        y_vec = sc.textFile(input_file) \
            .map(lambda line: [('1, ' + line).split(",")[-1]]) \
            .zipWithIndex()

        y_vec = IndexedRowMatrix( \
            y_vec \
            .map(lambda row: IndexedRow(row[1], row[0])) \
            ).toBlockMatrix()

        y_vec.cache()

        errors = h.subtract(y_vec).toLocalMatrix()

        print(sum(errors.toArray()))
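
        # Illustrative addition, not part of the original method: the usual least-squares
        # cost J(theta) = 1/(2m) * sum(errors^2), computed from the local errors matrix.
        m = errors.numRows
        cost = float((errors.toArray() ** 2).sum()) / (2 * m)
        print("Cost: " + str(cost))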

        '''sparkSession = SparkSession \
            .builder \
            .appName('pyspark') \
            .getOrCreate()
        
        df = sparkSession.read.csv(input_file)
        df = df \
            .toDF(x, y) \
            .withColumn("Ones", psf.lit(1)) \
            .cache()

        df.select(x,'Ones').show()'''

        '''sc = SparkContext('local', 'pyspark')