Example No. 1
    def __init__(self, args, sc):

        self.EPSILON = 1.0e-5

        self.ctx = sc

        self.numPartitions = args.partitions

        self.numIterations = args.iterations
        self.inputVectorPath = args.inputVector
        self.inputMatrixPath = args.inputMatrix
        self.outputVectorPath = args.outputVector

        # Read Matrix input data
        # inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)

        if (self.numPartitions != 0):
            inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
                .map(lambda line: IndexedRow(line[0], line[1]))\
                .repartition(self.numPartitions)
        else:
            inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
                .map(lambda line: IndexedRow(line[0], line[1]))

        self.inputMatrix = IndexedRowMatrix(inputMatrixData)

        self.inputVector = readVector(self.inputVectorPath, self.ctx)

        if (self.numIterations == 0):
            self.numIterations = self.inputVector.size * 2

        self.result = Vectors.zeros(self.inputVector.size)
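The constructor above depends on readMatrixRowPerLine and readVector helpers that are not shown here. A minimal, self-contained sketch of the same pattern (mapping indexed rows into an IndexedRowMatrix, with optional repartitioning), assuming an active SparkContext named sc:

from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

# Hypothetical stand-in for readMatrixRowPerLine: an RDD of (rowIndex, [values]) pairs.
raw_rows = sc.parallelize([(0, [1.0, 2.0]), (1, [3.0, 4.0])])
num_partitions = 2  # illustrative value; 0 would mean "keep the default partitioning"

indexed = raw_rows.map(lambda line: IndexedRow(line[0], line[1]))
if num_partitions != 0:
    indexed = indexed.repartition(num_partitions)

matrix = IndexedRowMatrix(indexed)
print(matrix.numRows(), matrix.numCols())  # 2 2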
def compute_similarity(df):
    """
    Compute cosine
    :param df:dataframe of rating by user for movies
    :return:
    """

    # df = df.filter(df.movieId.isin([91542.0, 1.0, 5.0, 90.0, 2541.0, 1246.0, 1552.0, 4084.0, 5679.0]))

    df = df.groupBy("userId").pivot("movieId").agg(
        first(col('rating')).cast("double"))

    mat = IndexedRowMatrix(
        df.rdd.map(lambda row: IndexedRow(row[0], Vectors.dense(row[1:]))))

    cs = mat.columnSimilarities()

    path = "test"

    cs.entries.toDF().write.parquet(path)

    cs.entries.toDF().coalesce(1)\
       .write.format("com.databricks.spark.csv")\
       .option("header", "true")\
       .save("testtest.csv")
Example No. 3
 def test_indexed_row_matrix_from_dataframe(self):
     from pyspark.sql.utils import IllegalArgumentException
     df = self.spark.createDataFrame([Row(int(0), Vectors.dense(1))])
     matrix = IndexedRowMatrix(df)
     self.assertEqual(matrix.numRows(), 1)
     self.assertEqual(matrix.numCols(), 1)
     with self.assertRaises(IllegalArgumentException):
         IndexedRowMatrix(df.drop("_1"))
Example No. 4
def multiply_matrices2(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    listA = A.tolist()
    rddA = sc.parallelize([IndexedRow(i, listA[i]) for i in range(len(listA))])
    matA = IndexedRowMatrix(rddA).toBlockMatrix()

    listB = B.tolist()
    rddB = sc.parallelize([IndexedRow(i, listB[i]) for i in range(len(listB))])
    matB = IndexedRowMatrix(rddB).toBlockMatrix()

    matC = matA.multiply(matB).toLocalMatrix()
    return matC.toArray()
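A quick sanity check of this helper against NumPy's own product, assuming multiply_matrices2 and an active SparkContext sc are in scope:

import numpy as np

A = np.arange(6, dtype=float).reshape(2, 3)
B = np.arange(12, dtype=float).reshape(3, 4)

# The distributed product should match the local one.
assert np.allclose(multiply_matrices2(A, B), A @ B)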
def readMovieChar(spark, f_name):
    my_data = list()
    with open(f_name, 'r') as handle:
        reader = csv.reader(handle, delimiter=",", quotechar='"')
        for row in reader:
            my_data.append(row)
    my_data.pop(0)

    matrix = np.zeros(shape=(int(my_data[-1][0]) + 1, len(movie_genre)),
                      dtype=int)
    movie_list = dict()

    for movie in my_data:
        movie_id = int(movie[0])
        movie_list[movie_id] = movie[1]

        genres = movie[2].split('|')
        for each in genres:
            col_idx = movie_genre.get(each, movie_genre['Other'])
            matrix[movie_id][col_idx] = 1

    indexedRows = spark.sparkContext.parallelize(
        [IndexedRow(i, matrix[i]) for i in range(len(matrix))])
    mat = IndexedRowMatrix(indexedRows)
    return mat, movie_list
Example No. 6
def _getColumns(blockMat, j, norm=1):
    """
    Returns column(s) j of the input BlockMatrix as a BlockMatrix with
    the same number of rowsPerBlock.
    """
    sc = SparkContext.getOrCreate()
    if np.isscalar(j):
        colsPerBlock = blockMat.colsPerBlock
        jBlockCol = j // colsPerBlock
        jInBlock = j % colsPerBlock
        jBlocks = blockMat.blocks.filter(lambda x: x[0][1] == jBlockCol)

        def g(block):
            colJ = block[1].toArray()[:, jInBlock] / norm
            return ((block[0][0], 0), OldMatrices.dense(len(colJ), 1, colJ))

        colJBlocks = jBlocks.map(g)
        return BlockMatrix(colJBlocks,
                           rowsPerBlock=blockMat.rowsPerBlock,
                           colsPerBlock=1,
                           numCols=1)
    else:
        j_b = sc.broadcast(j)
        blockMat_red = blockMat.toIndexedRowMatrix()
        rows_red = blockMat_red.rows.map(lambda row: (
            row.index, OldVectors.dense(row.vector.toArray()[j_b.value] / norm
                                        )))
        j_b.unpersist()
        return IndexedRowMatrix(rows_red).toBlockMatrix(
            rowsPerBlock=blockMat.rowsPerBlock,
            colsPerBlock=min(len(j), blockMat.colsPerBlock))
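A small usage sketch for _getColumns, assuming an active SparkContext sc and that the OldMatrices/OldVectors aliases used above (pyspark.mllib.linalg Matrices and Vectors) are in scope:

from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

rows = sc.parallelize([IndexedRow(0, [1.0, 2.0, 3.0]),
                       IndexedRow(1, [4.0, 5.0, 6.0])])
block_mat = IndexedRowMatrix(rows).toBlockMatrix()

# Scalar j: the second column as a 2 x 1 BlockMatrix.
print(_getColumns(block_mat, 1).toLocalMatrix().toArray())       # [[2.], [5.]]

# List j: columns 0 and 2 as a 2 x 2 BlockMatrix.
print(_getColumns(block_mat, [0, 2]).toLocalMatrix().toArray())  # [[1., 3.], [4., 6.]]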
Example No. 7
def MatrixTranspose(mat):
    '''
    Transpose a row matrix. To save space/memory, sparse vectors are used when the input rows are sparse.
    :param mat: the input row matrix
    :return: a transposed row matrix
    ref: https://stackoverflow.com/questions/47102378/transpose-a-rowmatrix-in-pyspark

    Known issues:
    1. It raises errors for some data; the cause is unclear, but reducing the number of rows can help.
    2. The transpose sometimes returns wrong results, apparently due to a partitioning issue; repartition(1) sometimes fixes it.
       PySpark also changes the row order when a transposed coordinate matrix is converted to a row matrix
       (see https://stackoverflow.com/questions/34451253/converting-coordinatematrix-to-rowmatrix-doesnt-preserve-row-order).
       Using an indexed matrix and reordering partially works around this, but it is awkward.
    '''
    if isinstance(mat, IndexedRowMatrix):
        mat = mat.toRowMatrix()
    # This turns everything into dense matrix entries; avoid this function where efficiency matters.
    transposed_mat = CoordinateMatrix(mat.rows.zipWithIndex().flatMap(
        lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]))
    transposed_mat = transposed_mat.transpose().toIndexedRowMatrix().rows.toDF(
    ).orderBy("index")
    # back to sparse first then convert to indexedrowmatrix
    transposed_mat = transposed_mat.rdd.map(lambda row: IndexedRow(
        row["index"],
        MLLibVectors.sparse(
            row["vector"].size,
            np.nonzero(row["vector"].values)[0], row["vector"].values[
                np.nonzero(row["vector"].values)])))
    return IndexedRowMatrix(transposed_mat)
Example No. 8
    def _fit(self, dataset):

        inputCol = self.getInputCol()
        outputCol = self.getOutputCol()

        ds_rdd = dataset.select(inputCol).rdd

        mat = IndexedRowMatrix(ds_rdd)

        mu = self.getMu()
        l = self.getL()

        if not mu:
            mu = 1.25 * mat.computeSVD(1).s

        print('mu:', mu)

        if not l:
            n_cols = mat.numCols()
            n_rows = mat.numRows()
            l = 1.0 / np.sqrt(np.max((n_cols, n_rows)))

        print('l:', l)

        pass
Example No. 9
    def matrix(self):
        """
        Gets the matrix backing this LD matrix.

        :return: Matrix of Pearson correlation values.
        :rtype: `IndexedRowMatrix <https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.IndexedRowMatrix>`__
        """
        return IndexedRowMatrix(self._jldm.matrix())
Example No. 10
def indexed_matrix_from_numpy(M):
    t = tuple(map(tuple, M))
    t2 = range(len(t))
    l = list(t)
    for i in t2:
        l[i] = tuple((i, l[i]))
    idxM = IndexedRowMatrix(sc.parallelize(l))
    return idxM
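A usage sketch, assuming an active SparkContext sc (IndexedRowMatrix accepts an RDD of (index, vector) tuples, which is what this helper builds):

import numpy as np

M = np.array([[1.0, 2.0], [3.0, 4.0]])
idxM = indexed_matrix_from_numpy(M)
print(idxM.numRows(), idxM.numCols())  # 2 2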
Example No. 11
def multiply_transpose2(A: np.ndarray) -> np.ndarray:  # computes A * A.T
    global counter
    print()
    print("No." + str(counter) + " matrix multiplication starts")
    start_time = time.time()
    print("matrix shape:", A.shape)
    listA = A.tolist()
    rddA = sc.parallelize([IndexedRow(i, listA[i]) for i in range(len(listA))])
    matA = IndexedRowMatrix(rddA).toBlockMatrix()
    matT = matA.transpose()
    matR = matA.multiply(matT)
    res = matR.toLocalMatrix().toArray()
    elapsed_time = time.time() - start_time
    print("No." + str(counter) + " matrix multiplication ends, takes time:",
          elapsed_time)
    counter = counter + 1
    return res
Example No. 12
    def test_row_matrix_invalid_type(self):
        rows = self.sc.parallelize([[1, 2, 3], [4, 5, 6]])
        invalid_type = ""
        matrix = RowMatrix(rows)
        self.assertRaises(TypeError, matrix.multiply, invalid_type)

        irows = self.sc.parallelize([IndexedRow(0, [1, 2, 3]), IndexedRow(1, [4, 5, 6])])
        imatrix = IndexedRowMatrix(irows)
        self.assertRaises(TypeError, imatrix.multiply, invalid_type)
Example No. 13
def vectorDFtoIndexedMatrix(df, vecvar, idcol):
    '''
    Applicable to dataframes that already contain assembled vectors.
    '''
    df = df.rdd.map(lambda row: IndexedRow(
        row[idcol],
        MLLibVectors.sparse(row[vecvar].size, row[vecvar].indices, row[vecvar].
                            values)))
    return IndexedRowMatrix(df)
Example No. 14
    def matrix(self):
        """
        Gets the matrix backing this kinship matrix.

        :return: Matrix of kinship values.
        :rtype: `IndexedRowMatrix <https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.IndexedRowMatrix>`__
        """
        from pyspark.mllib.linalg.distributed import IndexedRowMatrix

        return IndexedRowMatrix(self._jkm.matrix())
Example No. 15
def df_to_indexed_row_matrix(row_number_col: str, vector_col: str,
                             df: DataFrame):
    """Convert a dataframe containing a row number and vector to a block matrix"""
    indexed_rows = (df.where(F.col(vector_col).isNotNull()).select(
        F.col(row_number_col), F.col(vector_col)).rdd.map(
            lambda row: IndexedRow(row.__getitem__(row_number_col),
                                   row.__getitem__(vector_col).toArray())))

    if indexed_rows.isEmpty():
        raise ValueError(
            "Primary RDD is empty. Cannot perform matrix multiplication")

    return IndexedRowMatrix(indexed_rows)
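A usage sketch with ML vectors, assuming an active SparkSession spark and that the helper's own imports (IndexedRow, IndexedRowMatrix, DataFrame, pyspark.sql.functions as F) are in scope:

from pyspark.ml.linalg import Vectors

df = spark.createDataFrame(
    [(0, Vectors.dense([1.0, 2.0])), (1, Vectors.dense([3.0, 4.0]))],
    ["row_number", "features"])

mat = df_to_indexed_row_matrix("row_number", "features", df)
print(mat.numRows(), mat.numCols())  # 2 2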
Example No. 16
def DFtoIndexedMatrix(df, quantvars, idcol):
    '''
    Convert a numeric dataframe to an IndexedRowMatrix with sparse vectors as rows.
    Not applicable to dataframes that already contain assembled vectors.
    '''
    df = VectorAssembler(
        inputCols=quantvars, outputCol="features"
    ).transform(df).select(
        [idcol, "features"]
    )  # VectorAssembler typically produces sparse vectors here, so the next line should be fine
    df = df.rdd.map(lambda row: IndexedRow(
        row[idcol],
        MLLibVectors.sparse(row.features.size, row.features.indices, row.
                            features.values)))
    return IndexedRowMatrix(df)
Example No. 17
    def test_multiply_coordinate_matrices(self, spark: SQLContext):

        a_data = [(0, MllibVectors.dense(0, 3, 4)),
                  (1, MllibVectors.dense(1, 2, 3))]

        b_data = [
            (0, MllibVectors.dense(1, 0)),
            (1, MllibVectors.dense(4, 2)),
            (2, MllibVectors.dense(1, 3)),
        ]

        matrix_a = IndexedRowMatrix(
            spark._sc.parallelize(a_data)).toCoordinateMatrix()

        matrix_b = IndexedRowMatrix(
            spark._sc.parallelize(b_data)).toCoordinateMatrix()

        product = matrix.multiply_coordinate_matrices(matrix_a, matrix_b)
        actual = product.toBlockMatrix().toLocalMatrix().toArray()

        expected = [[16.0, 18.0], [12.0, 13.0]]

        assert actual.tolist() == expected
 def _dist_matrix(self, rddv1, rddv2, sc):
     dlist1 = rddv1.collect()
     dlist2 = rddv2.collect()
     irows1 = [
         IndexedRow(i, dlist1[i][0].toArray())
         for i in range(0, len(dlist1))
     ]
     irows2 = [
         IndexedRow(i, dlist2[i][0].toArray())
         for i in range(0, len(dlist2))
     ]
     IMatrix1 = IndexedRowMatrix(sc.parallelize(irows1))
     IMatrix2 = IndexedRowMatrix(sc.parallelize(irows2))
     cart = IMatrix1.rows.cartesian(IMatrix2.rows)
     A = cart.map(lambda x: (x[0].index, x[1].index,
                             np.sqrt(
                                 np.sum(
                                     np.power(
                                         np.array(x[0].vector) - np.array(x[
                                             1].vector), 2))))).collect()
     A.sort()
     Arr = self.__dist_array(A)
     return Arr
Example No. 19
	def getConnectivity(self,rddv,spark):
		sc = spark.sparkContext
		radius = self.getRadius()
		dist = self.getDistance()
		dlist = rddv.collect()
		featurecol = self.getFeaturesCol()
		irows = [IndexedRow(i,dlist[i][featurecol].toArray()) for i in range(0,len(dlist))]
		imatrix = IndexedRowMatrix(sc.parallelize(irows))
		cart = imatrix.rows.cartesian(imatrix.rows)

		rows = Row("id","vector")
		usr_row = [rows(i,np.float_(x).tolist()) for i,x in enumerate(dlist)]
		verts = spark.createDataFrame(usr_row)
		A = cart.filter(lambda x : dist(x[0].vector,x[1].vector) <= radius).map(lambda x : (x[0].index, x[1].index, 1))
		edges = spark.createDataFrame(A,['src','dst','connected'])
		return GraphFrame(verts,edges)
Example No. 20
    def __index_row_matrix_rdd(self, scale_df):
        """

        :param scale_df:
        :return:
        """
        try:
            vector_mllib = MLUtils.convertVectorColumnsFromML(
                scale_df, 'scaled_features').drop('features')
            vector_rdd = vector_mllib.select(
                'scaled_features',
                'id').rdd.map(lambda x: IndexedRow(x[1], x[0]))
            self.__logger.info("Build Index Row Matrix RDD")
            return IndexedRowMatrix(vector_rdd)
        except TypeError as te:
            raise OpheliaMLException(
                f"An error occurred while calling __index_row_matrix_rdd() method: {te}"
            )
Example No. 21
    def __init__(self, args, sc):
        self.ctx = sc

        self.numPartitions = args.partitions

        self.inputVectorPath = args.inputVector
        self.inputMatrixPath = args.inputMatrix
        self.outputVectorPath = args.outputVector

        self.alpha = args.alpha
        self.beta = args.beta

        # Read Matrix input data
        # inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)

        if (self.numPartitions != 0):
            inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
                .map(lambda line: IndexedRow(line[0], line[1]))\
                .repartition(self.numPartitions)
        else:
            inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
                .map(lambda line: IndexedRow(line[0], line[1]))

        print "Number of rows in Matrix with type" + str(type(inputMatrixData)) + " is: " + str(inputMatrixData.count())

        # PipelinedRDD to RDD
        # newData = sc.parallelize(inputMatrixData.collect())

        inputMatrix = IndexedRowMatrix(inputMatrixData)

        inputVector = readVector(self.inputVectorPath, self.ctx)

        print "Vector size is: " + str(inputVector.size)

        result = Vectors.zeros(inputVector.size)

        # print result

        # DGEMV(alpha, A, x, beta, y, jsc):
        result = L2.DGEMV(self.alpha, inputMatrix, inputVector, self.beta, result, self.ctx)

        # writeVector(self.outputVectorPath, result)

        printVector(result)
Example No. 22
def dense_matrix_cross_join(
    spark: SQLContext,
    output_col: str,
    primary_row_number_col: str,
    primary_matrix: IndexedRowMatrix,
    secondary_row_number_col: str,
    secondary_matrix: DenseMatrix,
):
    """Multiply 2 dense matrices to produce a dataframe with pairwise results
    showing primary row number, secondary column number and the dot product as a score
    Note that if you are using this method to produce the cosine similarity of 2 dense
    matrices then it is expected that you have already taken the transpose of the
    secondary matrix"""
    product = primary_matrix.multiply(secondary_matrix)

    log.info(
        "finished dense matrix multiplication",
        num_cols=product.numCols(),
        num_rows=product.numRows(),
    )

    coords_matrix = product.toCoordinateMatrix()

    log.info(
        "finished converting row matrix to coordinate matrix",
        num_cols=coords_matrix.numCols(),
        num_rows=coords_matrix.numRows(),
    )

    return coord_matrix_to_dataframe(
        spark,
        primary_row_number_col,
        secondary_row_number_col,
        output_col,
        coords_matrix,
    )
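The heavy lifting here is IndexedRowMatrix.multiply with a local DenseMatrix, followed by a conversion to coordinates. A self-contained sketch of just that step, assuming an active SparkContext sc (the structured logging and the coord_matrix_to_dataframe helper used above are not reproduced):

from pyspark.mllib.linalg import DenseMatrix
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

primary = IndexedRowMatrix(sc.parallelize(
    [IndexedRow(0, [1.0, 2.0]), IndexedRow(1, [3.0, 4.0])]))
secondary = DenseMatrix(2, 2, [1.0, 0.0, 0.0, 1.0])  # 2 x 2 identity (column-major values)

product = primary.multiply(secondary)
for e in product.toCoordinateMatrix().entries.collect():
    print(e.i, e.j, e.value)  # row index, column index, dot product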
Example No. 23
class ConjugateGradient:
    def __init__(self, args, sc):

        self.EPSILON = 1.0e-5

        self.ctx = sc

        self.numPartitions = args.partitions

        self.numIterations = args.iterations
        self.inputVectorPath = args.inputVector
        self.inputMatrixPath = args.inputMatrix
        self.outputVectorPath = args.outputVector

        # Read Matrix input data
        # inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)

        if (self.numPartitions != 0):
            inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
                .map(lambda line: IndexedRow(line[0], line[1]))\
                .repartition(self.numPartitions)
        else:
            inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
                .map(lambda line: IndexedRow(line[0], line[1]))

        self.inputMatrix = IndexedRowMatrix(inputMatrixData)

        self.inputVector = readVector(self.inputVectorPath, self.ctx)

        if (self.numIterations == 0):
            self.numIterations = self.inputVector.size * 2

        self.result = Vectors.zeros(self.inputVector.size)

    def solve(self):
        # print result

        stop = False

        start = time.perf_counter()

        r = np.copy(self.inputVector)

        Ap = Vectors.zeros(self.inputMatrix.numRows())

        # p = r
        p = np.copy(r)

        # rsold = r' * r
        rsold = r.dot(r)

        alpha = 0.0

        rsnew = 0.0

        k = 0

        while (not stop):

            # Inicio -- Ap=A * p
            Ap = L2.DGEMV(1.0, self.inputMatrix, p, 0.0, Ap, self.ctx)

            # Fin -- Ap=A * p

            # alpha=rsold / (p'*Ap)
            alpha = rsold / p.dot(Ap)

            # x=x+alpha * p
            self.result = self.result + alpha*p

            # r=r-alpha * Ap
            r = r - alpha*Ap

            # rsnew = r'*r
            rsnew = r.dot(r)

            if ((math.sqrt(rsnew) <= self.EPSILON) or (k >= (self.numIterations))):
                stop = True

            # p=r+rsnew / rsold * p
            p = r + (rsnew/rsold) * p

            rsold = rsnew

            k += 1

        # FIN GRADIENTE CONJUGADO

        end = time.perf_counter()

        print "Total time in solve system is: " + str(end - start) + " and " + str(k) + " iterations."

        printVector(self.result)

        return self.result
Example No. 24
#     --.reduceByKey(lambda a,b: a+b)\
#     --.map(lambda x: (x[0], sorted(x[1], key=lambda x: x[0], reverse=True)))\
#     --.map(lambda x: (x[0], [p[1] for p in x[1]]))\
#     --.map(lambda x: x[1])\
#     --.zipWithIndex()
# ------------------------------------------

# Sanity check: do I have a fully predicted 2D matrix now?
print("# do I have a fully predicted 2D matrix now?")
for item in final_stars_FINAL_READY.collect():
    print(item)
print("# do I have a fully predicted 2D matrix now? ==> now we know")
iris_irm = IndexedRowMatrix(
    final_stars_FINAL_READY.map(lambda x: IndexedRow(x[1], x[0])))

# ------------------------------------------
# https://blog.paperspace.com/dimension-reduction-with-principal-component-analysis/
# do SVD:
num_of_top_sing_values = 2
SVD = iris_irm.computeSVD(num_of_top_sing_values, True)

U = SVD.U
S = SVD.s.toArray()

# compute the eigenvalues and number of components to retain
n = final_stars_FINAL_READY.count()
eigvals = S**2 / (n - 1)
eigvals = np.flipud(np.sort(eigvals))
cumsum = eigvals.cumsum()
    data = sc.wholeTextFiles(root + folders[f])
    data.cache()
    documents = data.map(lambda s: tokenize(s[1])).map(
        lambda s: remove_stopwords(s, stopwords))
    files = data.map(lambda s: s[0]).collect()
    documents.cache()
    hashingTF = HashingTF()
    featurizedData = hashingTF.transform(documents)
    idf = IDF()
    idfModel = idf.fit(featurizedData)
    featurizedData.cache()
    tfidfs = idfModel.transform(featurizedData)
    tfidfs.cache()
    final_rdd = tfidfs.zipWithIndex().map(lambda s: IndexedRow(s[1], s[0]))
    final_rdd.cache()
    sims = IndexedRowMatrix(final_rdd).toCoordinateMatrix().transpose(
    ).toIndexedRowMatrix().columnSimilarities()
    pairs = sims.entries.map(lambda m: [m.i, m.j, m.value]).collect()
    for p in range(0, len(pairs)):
        pairs.append([pairs[p][1], pairs[p][0], pairs[p][2]])
    results = []
    for p in range(0, len(files)):
        results.append([p, 0, 0.0])

    for p in range(0, len(pairs)):
        index = pairs[p][0]
        if pairs[p][2] > results[index][2]:
            results[index] = [index, pairs[p][1], pairs[p][2]]
    file_object = open("/home/user/out/" + folders[f] + ".csv", "w")
    for i in range(0, len(files)):
        file_object.write(
            str(results[i][0]) + ";" + str(results[i][1]) + ";" +
Example No. 26
def as_block_matrix(rdd, rowsPerBlock=65000, colsPerBlock=65000):
    return IndexedRowMatrix(
        rdd.zipWithIndex().map(lambda xi: IndexedRow(xi[1], xi[0]))
    ).toBlockMatrix(rowsPerBlock, colsPerBlock)
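A usage sketch for as_block_matrix, assuming an active SparkContext sc:

from pyspark.mllib.linalg import Vectors

rdd = sc.parallelize([Vectors.dense([1.0, 2.0]), Vectors.dense([3.0, 4.0])])
bm = as_block_matrix(rdd)
print(bm.numRows(), bm.numCols())                             # 2 2
print(bm.multiply(bm.transpose()).toLocalMatrix().toArray())  # Gram matrix of the rows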
Example No. 27
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print(
            "Usage: spark-submit generate_similarity_matrix.py <input path to hdfs file> <hdfs output path>",
            file=sys.stderr)
        exit(-1)
    #convert and process raw input to (bookid, [features])
    def processFeatures(raw):
        features_str = raw.split()
        book_id = int(features_str[0])
        features = []
        for i in range(1, len(features_str)):
            features.append(float(features_str[i]))
        return (book_id, features)

    sc = SparkContext(appName="BookRecSystem")
    spark = SQLContext(sc)
    featureRdd = sc.textFile(sys.argv[1])
    featureRdd = featureRdd.map(processFeatures)
    labels = featureRdd.map(lambda x: x[0])  #label_rdd
    fvecs = featureRdd.map(lambda x: Vectors.dense(x[1]))  #feature_rdd
    data = labels.zip(fvecs)
    mat = IndexedRowMatrix(data).toBlockMatrix(
    )  #convert to block-matrix for pairwise cosine similarity
    dot = mat.multiply(mat.transpose()).toIndexedRowMatrix().rows.map(
        lambda x: (x.index, x.vector.toArray())).sortByKey().map(
            lambda x: str(x[0]) + ' ' + ' '.join(map(str, x[1]))
        )  #pairwise_cosine_similarity to rdd
    dot.saveAsTextFile(sys.argv[2])  #save output
    sc.stop()
Example No. 28
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
import numpy as np
import os
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix, BlockMatrix

os.environ["SPARK_HOME"] = "C:\\Users\\plfoley\\spark-2.3.1-bin-hadoop2.7"
os.environ["HADOOP_HOME"] = "C:\\Users\\plfoley\\winutils"

sc = SparkContext()
rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) \
    .zipWithIndex()
rows2 = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) \
    .zipWithIndex()

# need a SQLContext() to generate an IndexedRowMatrix from RDD
sqlContext = SQLContext(sc)
rows = IndexedRowMatrix( \
    rows \
    .map(lambda row: IndexedRow(row[1], row[0])) \
    ).toBlockMatrix()

rows2 = IndexedRowMatrix( \
    rows2 \
    .map(lambda row2: IndexedRow(row2[1], row2[0])) \
    ).toBlockMatrix()

mat_product = rows.multiply(rows2).toLocalMatrix()
print(mat_product)
Example No. 29
        .getOrCreate()

    lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])

    articles = lines.map(lambda urls: getArticletText(urls))

    hashingTF = HashingTF()
    tf = hashingTF.transform(articles)

    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    
    rows = tfidf.zipWithIndex()
    
    bm = IndexedRowMatrix(rows.map(lambda row : IndexedRow(row[1], row[0]))).toBlockMatrix()

    #bm_t = bm.transpose()
    #result_mat = bm.multiply(bm_t)
    #exact = result_mat.toIndexedRowMatrix().toRowMatrix()

    exact = bm.transpose().toIndexedRowMatrix().columnSimilarities()

    print(exact.entries.collect())

    #print(exact.entries.collect()[0])

    #parsedArticles = articles.collect()

    #tfidf = TfidfVectorizer().fit_transform(parsedArticles)
    #pairwise_similarity = tfidf * tfidf.T
Example No. 30
    def sparkComputeCost(self, input_file, x, y, theta):
        
        sc = SparkContext()

        # add the ones vector while building the RDD
        idx = 0
        x_mat = sc.textFile(input_file) \
            .map(lambda line: ('1, ' + line).split(",")[:-1]) \
            .zipWithIndex()
        
        # need a SQLContext() to generate an IndexedRowMatrix from RDD
        sqlContext = SQLContext(sc)
        
        x_mat = IndexedRowMatrix( \
            x_mat \
            .map(lambda row: IndexedRow(row[1], row[0])) \
            ).toBlockMatrix()

        x_mat.cache()

        print "Matrix rows x cols"
        print x_mat.numRows()
        print x_mat.numCols()

        vec = sc.parallelize(theta) \
            .map(lambda line: [line]) \
            .zipWithIndex()

        vec = IndexedRowMatrix( \
            vec \
            .map(lambda row: IndexedRow(row[1], row[0])) \
            ).toBlockMatrix()

        vec.cache()

        print "Vector rows x cols"
        print vec.numRows()
        print vec.numCols()

        h = x_mat.multiply(vec)
        h.cache()

        print "Hypothesis rows x cols"
        print h.numRows()
        print h.numCols()

        y_vec = sc.textFile(input_file) \
            .map(lambda line: [('1, ' + line).split(",")[-1]]) \
            .zipWithIndex()

        y_vec = IndexedRowMatrix( \
            y_vec \
            .map(lambda row: IndexedRow(row[1], row[0])) \
            ).toBlockMatrix()

        y_vec.cache()

        errors = h.subtract(y_vec).toLocalMatrix()

        print(sum(errors.toArray()))

        '''sparkSession = SparkSession \
            .builder \
            .appName('pyspark') \
            .getOrCreate()
        
        df = sparkSession.read.csv(input_file)
        df = df \
            .toDF(x, y) \
            .withColumn("Ones", psf.lit(1)) \
            .cache()

        df.select(x,'Ones').show()'''

        '''sc = SparkContext('local', 'pyspark')
Example No. 31
# Code for PCA and whitening the dataset.

from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow, BlockMatrix
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors, DenseMatrix, Matrix
from sklearn import datasets
# create the standardizer model for standardizing the dataset

X_rdd = sc.parallelize(X).map(lambda x: Vectors.dense(x))
scaler = StandardScaler(withMean=True, withStd=False).fit(X_rdd)

X_sc = scaler.transform(X_rdd)


#create the IndexedRowMatrix from rdd
X_rm = IndexedRowMatrix(X_sc.zipWithIndex().map(lambda x: (x[1], x[0])))

# compute the SVD of the matrix: the first argument is the number of singular values/components
# to keep (here all columns) and the second is a boolean stating whether to compute U or not.
svd_o = X_rm.computeSVD(X_rm.numCols(), True)

# svd_o.V is of shape n * k not k * n(as in sklearn)

P_comps = svd_o.V.toArray().copy()
num_rows = X_rm.numRows()
# U is whitened and projected onto principal components subspace.

S = svd_o.s.toArray()
eig_vals = S**2
# change the ncomp to 3 for this tutorial
#n_comp  = np.argmax(np.cumsum(eig_vals)/eig_vals.sum() > 0.95)+1
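A hedged sketch of how the snippet might continue, following the commented-out line above: pick the number of components that explains roughly 95% of the variance (an illustrative threshold) and keep the corresponding columns of U, whose rows give the whitened coordinates up to scale:

import numpy as np

n_comp = int(np.argmax(np.cumsum(eig_vals) / eig_vals.sum() > 0.95)) + 1

# svd_o.U is an IndexedRowMatrix; collecting and sorting by index is fine for a small demo dataset.
whitened = np.array([row.vector.toArray()[:n_comp]
                     for row in sorted(svd_o.U.rows.collect(), key=lambda r: r.index)])
print(whitened.shape)  # (num_rows, n_comp)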