Example #1
def multiply_matrices2(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    listA = A.tolist()
    rddA = sc.parallelize([IndexedRow(i, listA[i]) for i in range(len(listA))])
    matA = IndexedRowMatrix(rddA).toBlockMatrix()

    listB = B.tolist()
    rddB = sc.parallelize([IndexedRow(i, listB[i]) for i in range(len(listB))])
    matB = IndexedRowMatrix(rddB).toBlockMatrix()

    matC = matA.multiply(matB).toLocalMatrix()
    return matC.toArray()
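
The function assumes an existing SparkContext named sc plus the numpy and mllib imports; a minimal usage sketch under those assumptions (the app name is made up):

import numpy as np
from pyspark import SparkContext
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

sc = SparkContext(appName="block-matrix-multiply")  # assumed app name

A = np.random.rand(4, 3)
B = np.random.rand(3, 2)
C = multiply_matrices2(A, B)   # distributed product, returned as a local ndarray
assert np.allclose(C, A @ B)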
Example #2
def multiply_transpose2(A: np.ndarray) -> np.ndarray:  # computes A * A.T
    global counter
    print()
    print("No." + str(counter) + " matrix multiplication starts")
    start_time = time.time()
    print("matrix shape:", A.shape)
    listA = A.tolist()
    rddA = sc.parallelize([IndexedRow(i, listA[i]) for i in range(len(listA))])
    matA = IndexedRowMatrix(rddA).toBlockMatrix()
    matT = matA.transpose()
    matR = matA.multiply(matT)
    res = matR.toLocalMatrix().toArray()
    elapsed_time = time.time() - start_time
    print("No." + str(counter) + " matrix multiplication ends, takes time:",
          elapsed_time)
    counter = counter + 1
    return res
Example #3
def dense_matrix_cross_join(
    spark: SQLContext,
    output_col: str,
    primary_row_number_col: str,
    primary_matrix: IndexedRowMatrix,
    secondary_row_number_col: str,
    secondary_matrix: DenseMatrix,
):
    """Multiply 2 dense matrices to produce a dataframe with pairwise results
    showing primary row number, secondary column number and the dot product as a score
    Note that if you are using this method to produce the cosine similarity of 2 dense
    matrices then it is expected that you have already taken the transpose of the
    secondary matrix"""
    product = primary_matrix.multiply(secondary_matrix)

    log.info(
        "finished dense matrix multiplication",
        num_cols=product.numCols(),
        num_rows=product.numRows(),
    )

    coords_matrix = product.toCoordinateMatrix()

    log.info(
        "finished converting row matrix to coordinate matrix",
        num_cols=coords_matrix.numCols(),
        num_rows=coords_matrix.numRows(),
    )

    return coord_matrix_to_dataframe(
        spark,
        primary_row_number_col,
        secondary_row_number_col,
        output_col,
        coords_matrix,
    )
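
coord_matrix_to_dataframe and log are helpers that are not shown here; purely as an assumption about the helper's intent, a minimal sketch of what it might look like (the real implementation may differ):

from pyspark.sql import Row

def coord_matrix_to_dataframe(spark, primary_row_number_col, secondary_row_number_col,
                              output_col, coords_matrix):
    # CoordinateMatrix.entries is an RDD of MatrixEntry(i, j, value); map each
    # entry to a Row and let the SQL context build a DataFrame from it.
    rows = coords_matrix.entries.map(
        lambda entry: Row(**{
            primary_row_number_col: int(entry.i),
            secondary_row_number_col: int(entry.j),
            output_col: float(entry.value),
        }))
    return spark.createDataFrame(rows)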
Example #4
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print(
            "Usage: spark-submit generate_similarity_matrix.py <input path to hdfs file> <hdfs output path>",
            file=sys.stderr)
        sys.exit(-1)
    #convert and process raw input to (bookid, [features])
    def processFeatures(raw):
        features_str = raw.split()
        book_id = int(features_str[0])
        features = []
        for i in range(1, len(features_str)):
            features.append(float(features_str[i]))
        return (book_id, features)

    sc = SparkContext(appName="BookRecSystem")
    spark = SQLContext(sc)
    featureRdd = sc.textFile(sys.argv[1])
    featureRdd = featureRdd.map(processFeatures)
    labels = featureRdd.map(lambda x: x[0])  #label_rdd
    fvecs = featureRdd.map(lambda x: Vectors.dense(x[1]))  #feature_rdd
    data = labels.zip(fvecs)
    # convert to a BlockMatrix for pairwise cosine similarity
    mat = IndexedRowMatrix(data).toBlockMatrix()
    dot = mat.multiply(mat.transpose()).toIndexedRowMatrix().rows.map(
        lambda x: (x.index, x.vector.toArray())).sortByKey().map(
            lambda x: str(x[0]) + ' ' + ' '.join(map(str, x[1]))
        )  # pairwise cosine similarity rows as strings
    dot.saveAsTextFile(sys.argv[2])  #save output
    sc.stop()
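
The comments above call the product a pairwise cosine similarity; that only holds if every feature vector is L2-normalized first, which the snippet does not do. A small sketch of that normalization step, assuming numpy is available, that could replace the fvecs line above:

import numpy as np

# Normalize each feature vector to unit length so the dot products produced by
# mat.multiply(mat.transpose()) are true cosine similarities.
fvecs = featureRdd.map(lambda x: np.array(x[1])) \
    .map(lambda v: Vectors.dense(v / np.linalg.norm(v)))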
Example #5
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
import numpy as np
import os
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix, BlockMatrix

os.environ["SPARK_HOME"] = "C:\\Users\\plfoley\\spark-2.3.1-bin-hadoop2.7"
os.environ["HADOOP_HOME"] = "C:\\Users\\plfoley\\winutils"

sc = SparkContext()
rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) \
    .zipWithIndex()
rows2 = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) \
    .zipWithIndex()

# need a SQLContext() to generate an IndexedRowMatrix from RDD
sqlContext = SQLContext(sc)
rows = IndexedRowMatrix( \
    rows \
    .map(lambda row: IndexedRow(row[1], row[0])) \
    ).toBlockMatrix()

rows2 = IndexedRowMatrix( \
    rows2 \
    .map(lambda row2: IndexedRow(row2[1], row2[0])) \
    ).toBlockMatrix()

mat_product = rows.multiply(rows2).toLocalMatrix()
print(mat_product)
Example #6
wordsData = tokenizer.transform(dataFrame)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("title", "features").show()
# Normalization and transformation of the matrix
normalizer = Normalizer(inputCol="features", outputCol="norm")
data = normalizer.transform(rescaledData)

# Similarity process: compute the norm and the dot product
mat = IndexedRowMatrix(
    data.select("num", "norm")\
        .rdd.map(lambda row: IndexedRow(row.num, row.norm.toArray()))).toBlockMatrix()
dot = mat.multiply(mat.transpose())
dot.toLocalMatrix().toArray()

dot_udf = psf.udf(lambda x, y: float(x.dot(y)), DoubleType())
data.alias("i").join(data.alias("j"), psf.col("i.num") < psf.col("j.num"))\
    .select(
        psf.col("i.num").alias("i"),
        psf.col("j.num").alias("j"),
        dot_udf("i.norm", "j.norm").alias("dot"))\
    .sort("i", "j")\
    .show()

tempcosine = data.alias("i").join(data.alias("j"), psf.col("i.num") < psf.col("j.num"))\
       .select(
           psf.col("i.num").alias("i"),
           psf.col("j.num").alias("j"),
           dot_udf("i.norm", "j.norm").alias("dot"))
Example #7
    def sparkComputeCost(self, input_file, x, y, theta):
        
        sc = SparkContext()

        # add the ones vector while building the RDD
        idx = 0
        x_mat = sc.textFile(input_file) \
            .map(lambda line: [float(v) for v in ('1, ' + line).split(",")[:-1]]) \
            .zipWithIndex()
        
        # need a SQLContext() to generate an IndexedRowMatrix from RDD
        sqlContext = SQLContext(sc)
        
        x_mat = IndexedRowMatrix( \
            x_mat \
            .map(lambda row: IndexedRow(row[1], row[0])) \
            ).toBlockMatrix()

        x_mat.cache()

        print "Matrix rows x cols"
        print x_mat.numRows()
        print x_mat.numCols()

        vec = sc.parallelize(theta) \
            .map(lambda line: [line]) \
            .zipWithIndex()

        vec = IndexedRowMatrix( \
            vec \
            .map(lambda row: IndexedRow(row[1], row[0])) \
            ).toBlockMatrix()

        vec.cache()

        print "Vector rows x cols"
        print vec.numRows()
        print vec.numCols()

        h = x_mat.multiply(vec)
        h.cache()

        print "Hypothesis rows x cols"
        print h.numRows()
        print h.numCols()

        y_vec = sc.textFile(input_file) \
            .map(lambda line: [float(('1, ' + line).split(",")[-1])]) \
            .zipWithIndex()

        y_vec = IndexedRowMatrix( \
            y_vec \
            .map(lambda row: IndexedRow(row[1], row[0])) \
            ).toBlockMatrix()

        y_vec.cache()

        errors = h.subtract(y_vec).toLocalMatrix()

        print(errors.toArray().sum())

        '''sparkSession = SparkSession \
            .builder \
            .appName('pyspark') \
            .getOrCreate()
        
        df = sparkSession.read.csv(input_file)
        df = df \
            .toDF(x, y) \
            .withColumn("Ones", psf.lit(1)) \
            .cache()

        df.select(x,'Ones').show()'''

        '''sc = SparkContext('local', 'pyspark')'''
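
The method stops at printing the raw error sum; to turn the residuals into the usual least-squares cost J(theta) = sum((h - y)^2) / (2m), a short follow-up sketch on the local error matrix (numpy assumed, variable names match the code above):

import numpy as np

errors_local = errors.toArray()                    # m x 1 residuals, h - y
m = errors_local.shape[0]
cost = float(np.sum(errors_local ** 2) / (2 * m))  # linear-regression cost J(theta)
print("Cost:", cost)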
Example #8
import os
os.environ["SPARK_HOME"] = "C:\spark"
os.environ["HADOOP_HOME"] = "C:\winutils"

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

if __name__ == "__main__":
    sc = SparkContext.getOrCreate()
    sqlContext = SQLContext(sc)  # needed to build an IndexedRowMatrix from an RDD

    # Method 1:
    m = sc.parallelize([[2, 2, 2], [3, 3, 3]]).zipWithIndex()
    n = sc.parallelize([[1, 1], [4, 4], [3, 3]]).zipWithIndex()

    # Create an Index Row Matrix
    # Convert to Block Matrix
    mat1 = IndexedRowMatrix(m.map(lambda row: IndexedRow(row[1], row[0]))).toBlockMatrix()
    mat2 = IndexedRowMatrix(n.map(lambda row2: IndexedRow(row2[1], row2[0]))).toBlockMatrix()

    # Method 2:
    #mat1 = BlockMatrix(m, 2, 3)
    #mat2 = BlockMatrix(n, 3, 2)

    # Use of multiply function from pyspark.mllib.linalg
    mat_mul_output = mat1.multiply(mat2).toLocalMatrix()
    print(mat_mul_output)
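
"Method 2" as commented out above would not run as written: the BlockMatrix constructor expects an RDD of ((blockRowIndex, blockColIndex), sub-matrix) pairs plus the per-block dimensions, not a plain row RDD. A sketch of building the same two matrices that way (a single block each, values in column-major order):

from pyspark.mllib.linalg import Matrices
from pyspark.mllib.linalg.distributed import BlockMatrix

blocks1 = sc.parallelize([((0, 0), Matrices.dense(2, 3, [2, 3, 2, 3, 2, 3]))])
blocks2 = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 4, 3, 1, 4, 3]))])
mat1_alt = BlockMatrix(blocks1, 2, 3)
mat2_alt = BlockMatrix(blocks2, 3, 2)
print(mat1_alt.multiply(mat2_alt).toLocalMatrix())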
Example #9
data = sc.textFile('hdfs://node1:9000/input/vectors_3000x500.txt')
data = data.map(lambda _ : np.array(_.strip().split()).astype(float))
data = data.map(lambda _ : _/np.linalg.norm(_))
U = data.zipWithIndex().map(lambda _ : IndexedRow(_[1], _[0]))
U = IndexedRowMatrix(U)



UT = U.toCoordinateMatrix()
UT = UT.transpose()

U = U.toBlockMatrix()
UT = UT.toBlockMatrix()

S = U.multiply(UT)

S_coord = S.toCoordinateMatrix()
sim = S_coord.entries
print(sim.take(100))

debug.TIMESTAMP(2)
Example #10
# Create an SQLContext so that the toDF conversions used by the distributed matrix classes work
sql.SQLContext(sc)
data = sc.textFile(dataset)

#data = (data.map(lambda s: (list(map(lambda x: float(x), s.split()))))).zipWithIndex().map(lambda x: ((x[1], 0), DenseMatrix(1, 1000, x[0])))

# Read matrix normally to format of (rownumber, vector)
data = data.map(lambda s: (list(map(lambda x: float(x), s.split())))
                ).zipWithIndex().map(lambda x: (x[1], x[0]))

# Create a transpose for the matrix
tdata = sc.textFile(dataset).map(lambda s: list(
    map(lambda x:
        (x[0], float(x[1])), enumerate(s.split())))).zipWithIndex().flatMap(
            lambda x: map(lambda y: (y[0], (x[1], y[1])), x[0])).groupByKey()

# Map the transpose data to the same format as the normal matrix
tdata = tdata.map(lambda x: (x[0], [s[1] for s in sorted(x[1], key=itemgetter(0))]))

# Create BlockMatrix for the normal matrix and its transpose
mat = IndexedRowMatrix(data)
mat = mat.toBlockMatrix()
matTranspose = IndexedRowMatrix(tdata).toBlockMatrix()

# Get final result by multiplying mat * mat^T * mat
matTranspose = mat.multiply(matTranspose)
matRes = matTranspose.multiply(mat)

print('Done')
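
As an aside, building the transpose by re-parsing and re-grouping the input file is fairly involved; BlockMatrix already exposes a distributed transpose(), so the same mat * mat^T * mat result can likely be obtained more directly. A sketch under that assumption, reusing the data RDD from above:

mat = IndexedRowMatrix(data).toBlockMatrix()
matTranspose = mat.transpose()                     # distributed transpose, no re-parsing
matRes = mat.multiply(matTranspose).multiply(mat)  # mat * mat^T * mat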
Example #11
# rule (A*AT)*A = A*(AT*A).

# Calculate A transpose
AT = A.transpose()

# Make the first multiplication AT*A
ATA = AT.multiply(A)

# Print that step 3 is ready. With the full data set this takes about 9 minutes.
print(" ")
print("Step 3 ready")
print("ATA rows and cols:")
print("Rows:", ATA.numRows() , "Cols:", ATA.numCols())
print(" ")

# Make second multiplication A*ATA
AATA = A.multiply(ATA)

# Print step 4 ready
print(" ")
print("Step 4 ready")
print(" ")

# Convert AATA to an IndexedRowMatrix
AATA = AATA.toIndexedRowMatrix()

# Get first row from AATA
first_row = AATA.rows.filter(lambda x: x.index == 0).first().vector.toArray()

# Print first row
print(first_row)
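
The grouping A*(AT*A) used above matters for cost: for an n x d matrix with n much larger than d, AT*A is a small d x d intermediate, whereas (A*AT) would materialize an n x n matrix. A quick local shape check with numpy (toy sizes assumed):

import numpy as np

n, d = 10_000, 50                 # assumed shapes with n >> d
A = np.random.rand(n, d)
print((A.T @ A).shape)            # (50, 50)    small intermediate
print((A @ (A.T @ A)).shape)      # (10000, 50) final result, same as (A @ A.T) @ A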
Example #12
    def calculate_distance(self, sdf1, sdf2):
        """
        This will calculate the distance between the vector-type columns of two spark dataframes

        :param sdf1: expected to have columns id1 (dtype int) and v1 (dtype Vector)
        :param sdf2: expected to have columns id2 (dtype int) and v2 (dtype Vector)
        :return:
        """

        cov = RowMatrix(
            sdf1.select(["v1"]).withColumnRenamed("v1", "v").union(
                sdf2.select(["v2"]).withColumnRenamed(
                    "v2", "v")).rdd.map(lambda row: Vectors.fromML(row.asDict(
                    )["v"]))).computeCovariance().toArray()

        x, v = np.linalg.eigh(cov)

        indices = 1e-10 <= x

        # we are trying to enforce the data types to be plain Python types
        n = int(v.shape[0])
        m = int(indices.sum())

        v_vals = [float(val) for val in v[:, indices].reshape(-1, ).tolist()]

        v_spark = DenseMatrix(n, m, v_vals)

        x_vals = [
            float(val)
            for val in np.diag(x[indices]**-0.5).reshape(-1, ).tolist()
        ]

        x_spark = DenseMatrix(m, m, x_vals)

        # we get the index to maintain the order
        _sdf1 = sdf1.rdd.zipWithIndex()\
            .map(lambda val_key: Row(id1=val_key[0].id1, v1=val_key[0].v1, index=val_key[1])).toDF()

        _sdf1.persist()

        _sdf2 = sdf2.rdd.zipWithIndex()\
            .map(lambda val_key: Row(id2=val_key[0].id2, v2=val_key[0].v2, index=val_key[1])).toDF()

        _sdf2.persist()

        # we get our indexed row matrix
        _sdf1_mat = IndexedRowMatrix(
            _sdf1.rdd.map(lambda row: IndexedRow(index=row.asDict()["index"],
                                                 vector=Vectors.fromML(
                                                     row.asDict()["v1"]))))

        _sdf2_mat = IndexedRowMatrix(
            _sdf2.rdd.map(lambda row: IndexedRow(index=row.asDict()["index"],
                                                 vector=Vectors.fromML(
                                                     row.asDict()["v2"]))))

        # we apply our transformation and then set it as our new variable
        _sdf1 = _sdf1.drop("v1").join(_sdf1_mat.multiply(v_spark).multiply(x_spark).rows\
                                      .map(lambda indexed_row: Row(index=indexed_row.index,
                                                                   v1=indexed_row.vector)).toDF(), "index")

        _sdf2 = _sdf2.drop("v2").join(_sdf2_mat.multiply(v_spark).multiply(x_spark).rows\
                                      .map(lambda indexed_row: Row(index=indexed_row.index,
                                                                   v2=indexed_row.vector)).toDF(), "index")

        @F.udf(DoubleType())  # the UDF returns a float distance
        def tmp(vec):
            return float(vec[0].squared_distance(vec[1]))**0.5

        all_sdf = _sdf1.crossJoin(_sdf2)

        dist_sdf = all_sdf.select("*", tmp(F.array('v1', 'v2')).alias('diff'))

        dist_sdf.persist()

        return dist_sdf
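
The class that owns calculate_distance is not shown; a hypothetical usage sketch with toy data, assuming an instance named calculator and pyspark.ml vectors in the v1/v2 columns:

from pyspark.ml.linalg import Vectors as MLVectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

sdf1 = spark.createDataFrame(
    [(1, MLVectors.dense([1.0, 2.0, 3.0])), (2, MLVectors.dense([4.0, 5.0, 6.0]))],
    ["id1", "v1"])
sdf2 = spark.createDataFrame(
    [(10, MLVectors.dense([1.5, 2.5, 3.5])), (20, MLVectors.dense([0.0, 0.0, 1.0]))],
    ["id2", "v2"])

dist_sdf = calculator.calculate_distance(sdf1, sdf2)  # `calculator` is hypothetical
dist_sdf.select("id1", "id2", "diff").show()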
Example #13
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix, BlockMatrix
from pyspark.sql import SQLContext
from pyspark import SparkContext

sc = SparkContext()
rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).zipWithIndex()

# need a SQLContext() to generate an IndexedRowMatrix from RDD
sqlContext = SQLContext(sc)
block_matrix = IndexedRowMatrix( \
    rows \
    .map(lambda row: IndexedRow(row[1], row[0])) \
    ).toBlockMatrix()

mat_product = block_matrix.multiply(block_matrix)
result = mat_product.toLocalMatrix()
print("Matrix Product \n", result)
mat_sum = block_matrix.add(block_matrix)
result = mat_sum.toLocalMatrix()
print("Matrix Sum \n", result)

mat_transpose = block_matrix.transpose()
result = mat_transpose.toLocalMatrix()
print("Matrix Transpose \n", result)