示例#1
0
文件: tests.py 项目: vidur89/spark
 def test_dot(self):
     """Assert dot products of sparse and dense vectors.

     Each vector is dotted with a dense vector (scalar result) and with a
     4x4 numpy matrix whose rows are all ``[1, 2, 3, 4]`` (array result).
     """
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1.0, 2.0, 3.0, 4.0]))
     lst = DenseVector([1, 2, 3, 4])
     mat = array([[1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0]])
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(10.0, sv.dot(dv))
     self.assertTrue(array_equal(array([3.0, 6.0, 9.0, 12.0]), sv.dot(mat)))
     self.assertEqual(30.0, dv.dot(dv))
     self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), dv.dot(mat)))
     self.assertEqual(30.0, lst.dot(dv))
     self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), lst.dot(mat)))
示例#2
0
 def test_dot(self):
     """Assert dot products of sparse and dense vectors.

     Each vector is dotted with a dense vector (scalar result) and with a
     4x4 numpy matrix whose rows are all ``[1, 2, 3, 4]`` (array result).
     """
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1., 2., 3., 4.]))
     lst = DenseVector([1, 2, 3, 4])
     mat = array([[1., 2., 3., 4.],
                  [1., 2., 3., 4.],
                  [1., 2., 3., 4.],
                  [1., 2., 3., 4.]])
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(10.0, sv.dot(dv))
     self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))
     self.assertEqual(30.0, dv.dot(dv))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
     self.assertEqual(30.0, lst.dot(dv))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
示例#3
0
 def test_dot(self):
     """Assert dot-product results for sparse and dense vectors.

     Operands used: a dense vector, a 4x4 numpy matrix whose rows are all
     ``[1, 2, 3, 4]``, and a typed ``array`` of doubles.
     """
     row = [1.0, 2.0, 3.0, 4.0]
     sparse = SparseVector(4, {1: 1, 3: 2})
     dense = DenseVector(array(row))
     from_list = DenseVector([1, 2, 3, 4])
     matrix = array([row, row, row, row])
     doubles = pyarray.array("d", [0, 1, 2, 3])

     # Expected vector-times-matrix results, shared by the checks below.
     tripled = array([3.0, 6.0, 9.0, 12.0])
     tenfold = array([10.0, 20.0, 30.0, 40.0])

     self.assertEqual(10.0, sparse.dot(dense))
     self.assertTrue(array_equal(tripled, sparse.dot(matrix)))
     self.assertEqual(30.0, dense.dot(dense))
     self.assertTrue(array_equal(tenfold, dense.dot(matrix)))
     self.assertEqual(30.0, from_list.dot(dense))
     self.assertTrue(array_equal(tenfold, from_list.dot(matrix)))
     self.assertEqual(7.0, sparse.dot(doubles))
示例#4
0
 def test_dot(self):
     """Check ``dot`` on a sparse vector and two dense vectors.

     The right-hand operands are a dense vector, a 4x4 numpy matrix with
     identical rows, and a typed ``array`` of doubles.
     """
     base = [1., 2., 3., 4.]
     sparse_vec = SparseVector(4, {1: 1, 3: 2})
     dense_vec = DenseVector(array(base))
     list_vec = DenseVector([1, 2, 3, 4])
     square = array([base] * 4)
     typed_arr = pyarray.array('d', [0, 1, 2, 3])

     # Scalar expectations come first in each pair, matrix products second.
     self.assertEqual(10.0, sparse_vec.dot(dense_vec))
     self.assertTrue(
         array_equal(array([3., 6., 9., 12.]), sparse_vec.dot(square)))
     self.assertEqual(30.0, dense_vec.dot(dense_vec))
     self.assertTrue(
         array_equal(array([10., 20., 30., 40.]), dense_vec.dot(square)))
     self.assertEqual(30.0, list_vec.dot(dense_vec))
     self.assertTrue(
         array_equal(array([10., 20., 30., 40.]), list_vec.dot(square)))
     self.assertEqual(7.0, sparse_vec.dot(typed_arr))
示例#5
0
    def _calculateSimilarity(self, threshold=0):
        """Compute the cosine similarity of every ordered pair of row ids.

        Entries of ``self.utility`` are grouped by their row index ``i``
        (users, judging by the variable names). Each group becomes a binary
        ``SparseVector`` of length ``numCols()`` holding a 1 at every column
        index ``j`` present in the group. For every ordered pair ``(i, j)``
        of distinct ids in ``1..numRows()`` the cosine of the two vectors is
        recorded.

        An id in that range that has no entries gets similarity ``0.0``.
        Previously such an id silently reused the vector of whichever id
        had been matched last, or raised ``UnboundLocalError`` when no id
        had been matched yet.

        Args:
            threshold: Not used by the computation; retained so existing
                callers that pass it keep working.

        Returns:
            A tuple ``(sims, sims_set)``. ``sims`` maps ``(i, j)`` to the
            similarity value and ``sims_set`` maps each id ``i`` to the list
            of every other id ``j`` in ascending order.
        """
        user_cnt = self.utility.numRows()
        movie_cnt = self.utility.numCols()
        user_ids = range(1, user_cnt + 1)

        # Build each id's vector and L2 norm once, keyed by id, instead of
        # scanning the collected groups and rebuilding both vectors for
        # every pair.
        vectors = dict()
        norms = dict()
        grouped = self.utility.entries.groupBy(lambda x: x.i).collect()
        for user_id, entries in grouped:
            vec = SparseVector(movie_cnt, [(entry.j, 1) for entry in entries])
            vectors[user_id] = vec
            norms[user_id] = vec.norm(2)

        sims = dict()
        sims_set = dict((i, list()) for i in user_ids)

        for i in user_ids:
            for j in user_ids:
                if i == j:
                    continue

                # A missing id contributes a zero norm, so the product is
                # zero and the division is skipped.
                denom = norms.get(i, 0) * norms.get(j, 0)
                if denom:
                    sim = vectors[i].dot(vectors[j]) / denom
                else:
                    sim = 0.0
                sims[(i, j)] = sim

                sims_set[i].append(j)
        return (sims, sims_set)
示例#6
0
                  'incorrect number of keys in sampleOHEDictManual')


# ** Sparse vectors **
import numpy as np
from pyspark.mllib.linalg import SparseVector

# Dense vectors paired with sparse equivalents. The sparse versions store
# only the non-zero entries (indices, then values) rather than explicit zeros.
aDense = np.array([0., 3., 0., 4.])
aSparse = SparseVector(4, [1, 3], [3., 4.])

bDense = np.array([0., 0., 0., 1.])
bSparse = SparseVector(4, [3], [1.])

w = np.array([0.4, 3.1, -1.4, -.5])
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare print statement is a syntax error on Python 3.
print(aDense.dot(w))
print(aSparse.dot(w))
print(bDense.dot(w))
print(bSparse.dot(w))


# TEST Sparse Vectors
# Both sparse vectors must be SparseVector instances and must give the same
# dot product with w as their dense counterparts.
Test.assertTrue(isinstance(aSparse, SparseVector), 'aSparse needs to be an instance of SparseVector')
# The message now names bSparse; it previously repeated 'aSparse'.
Test.assertTrue(isinstance(bSparse, SparseVector), 'bSparse needs to be an instance of SparseVector')
# Compare with a tolerance: the dense and sparse code paths are not
# guaranteed to produce bit-identical floating-point results.
Test.assertTrue(np.allclose(aDense.dot(w), aSparse.dot(w)),
                'dot product of aDense and w should equal dot product of aSparse and w')
Test.assertTrue(np.allclose(bDense.dot(w), bSparse.dot(w)),
                'dot product of bDense and w should equal dot product of bSparse and w')


# ** OHE features as sparse vectors **
# Length-7 vector with the value 1.0 at indices 2 and 3, given as an
# {index: value} mapping.
sampleOneOHEFeatManual = SparseVector(7, {2: 1.0, 3: 1.0})
示例#7
0
# COMMAND ----------

import numpy as np
from pyspark.mllib.linalg import SparseVector

# COMMAND ----------

# Dense vectors and their sparse equivalents (size, non-zero indices, values).
aDense = np.array([0., 3., 0., 4.])
aSparse = SparseVector(4, [1, 3], [3., 4.])

bDense = np.array([0., 0., 0., 1.])
bSparse = SparseVector(4, [3], [1.])

w = np.array([0.4, 3.1, -1.4, -.5])
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare print statement is a syntax error on Python 3.
print(aDense.dot(w))
print(aSparse.dot(w))
print(bDense.dot(w))
print(bSparse.dot(w))

# COMMAND ----------

# MAGIC %md
# MAGIC #### **(1c) OHE features as sparse vectors**
# MAGIC Any feature that occurs in a point should have the value 1.0.  For example, the `DenseVector` for a point with features 2 and 4 would be `[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0]`.

# COMMAND ----------

# Reminder of the sample features
# sampleOne = [(0, 'mouse'), (1, 'black')]
# sampleTwo = [(0, 'cat'), (1, 'tabby'), (2, 'mouse')]
# sampleThree =  [(0, 'bear'), (1, 'black'), (2, 'salmon')]

# COMMAND ----------
示例#8
0
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.mllib.linalg import SparseVector
from pyspark.ml.linalg import DenseVector
from pyspark.sql import Row
from functools import partial
from pyspark.ml.regression import LinearRegression
import string
import nltk
import math
from nltk.stem.porter import *
from nltk.corpus import stopwords

#sc = SparkContext(appName="Example1")

# Sparse vector of size 10000 with two non-zero entries, dotted with itself.
a = SparseVector(10000, {4847: 3.0224, 9959: 6.4765})
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare print statement is a syntax error on Python 3.
print(a.dot(a))

#print a.dot(array([1., 2., 3., 4.]))

b = SparseVector(
    10000, {
        15: 0.2668,
        67: 3.0431,
        69: 3.6234,
        93: 2.6627,
        315: 7.5103,
        415: 3.9424,
        419: 4.5356,
        493: 2.399,
        788: 0.2727,
        858: 0.3938,
示例#9
0
    sampleOHEDictManual[(2, 'mouse')],
    'ac3478d69a3c81fa62e60f5c3696165a4e5e6ac4',
    "incorrect value for sampleOHEDictManual[(2,'mouse')]")
# The (2, 'salmon') entry is compared against its expected hash.
Test.assertEqualsHashed(sampleOHEDictManual[(2, 'salmon')],
                        'c1dfd96eea8cc2b62785275bca38ac261256e278',
                        "incorrect value for sampleOHEDictManual[(2,'salmon')]")
# The dictionary must hold exactly seven keys.
Test.assertEquals(
    len(sampleOHEDictManual),
    7,
    'incorrect number of keys in sampleOHEDictManual')
# TEST Sparse Vectors (1b)
# Both sparse vectors must be SparseVector instances and must give the same
# dot product with w as their dense counterparts.
Test.assertTrue(isinstance(aSparse, SparseVector),
                'aSparse needs to be an instance of SparseVector')
# The message now names bSparse; it previously repeated 'aSparse'.
Test.assertTrue(isinstance(bSparse, SparseVector),
                'bSparse needs to be an instance of SparseVector')
Test.assertTrue(
    aDense.dot(w) == aSparse.dot(w),
    'dot product of aDense and w should equal dot product of aSparse and w')
Test.assertTrue(
    bDense.dot(w) == bSparse.dot(w),
    'dot product of bDense and w should equal dot product of bSparse and w')
# TEST OHE Features as sparse vectors (1c)
# Each hand-built sample must be a SparseVector instance.
Test.assertTrue(
    isinstance(sampleOneOHEFeatManual, SparseVector),
    'sampleOneOHEFeatManual needs to be a SparseVector')
Test.assertTrue(
    isinstance(sampleTwoOHEFeatManual, SparseVector),
    'sampleTwoOHEFeatManual needs to be a SparseVector')
Test.assertTrue(
    isinstance(sampleThreeOHEFeatManual, SparseVector),
    'sampleThreeOHEFeatManual needs to be a SparseVector')
# The first sample is additionally compared against its expected hash.
Test.assertEqualsHashed(
    sampleOneOHEFeatManual,
    'ecc00223d141b7bd0913d52377cee2cf5783abd6',
    'incorrect value for sampleOneOHEFeatManual')
Test.assertEqualsHashed(sampleTwoOHEFeatManual,