def test_dot(self):
    """Verify ``dot`` for sparse, dense and list-backed vectors.

    Each vector is multiplied with a dense vector (scalar result) and with
    a 4x4 matrix (array result).
    """
    sv = SparseVector(4, {1: 1, 3: 2})
    dv = DenseVector(array([1.0, 2.0, 3.0, 4.0]))
    lst = DenseVector([1, 2, 3, 4])
    # All four rows are identical, so vector.dot(mat) equals
    # sum(vector) * [1, 2, 3, 4].
    mat = array([[1.0, 2.0, 3.0, 4.0],
                 [1.0, 2.0, 3.0, 4.0],
                 [1.0, 2.0, 3.0, 4.0],
                 [1.0, 2.0, 3.0, 4.0]])

    # assertEqual replaces the deprecated assertEquals alias, which no
    # longer exists on Python 3.12+.
    self.assertEqual(10.0, sv.dot(dv))
    self.assertTrue(array_equal(array([3.0, 6.0, 9.0, 12.0]), sv.dot(mat)))
    self.assertEqual(30.0, dv.dot(dv))
    self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), dv.dot(mat)))
    self.assertEqual(30.0, lst.dot(dv))
    self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), lst.dot(mat)))
def test_dot(self):
    """Check ``dot`` of sparse and dense vectors against a vector and a matrix."""
    sv = SparseVector(4, {1: 1, 3: 2})
    dv = DenseVector(array([1., 2., 3., 4.]))
    lst = DenseVector([1, 2, 3, 4])
    # 4x4 matrix whose rows are all [1, 2, 3, 4].
    mat = array([[1., 2., 3., 4.],
                 [1., 2., 3., 4.],
                 [1., 2., 3., 4.],
                 [1., 2., 3., 4.]])

    # Sparse vector: only positions 1 and 3 contribute.
    # (assertEqual is used because the assertEquals alias is deprecated and
    # was removed in Python 3.12.)
    self.assertEqual(10.0, sv.dot(dv))
    self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))

    # Dense vector built from a numpy array.
    self.assertEqual(30.0, dv.dot(dv))
    self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))

    # Dense vector built from a plain Python list.
    self.assertEqual(30.0, lst.dot(dv))
    self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
def test_dot(self):
    """Exercise ``dot`` with dense-vector, matrix and ``array.array`` operands."""
    sparse = SparseVector(4, {1: 1, 3: 2})
    dense = DenseVector(array([1.0, 2.0, 3.0, 4.0]))
    from_list = DenseVector([1, 2, 3, 4])
    # Matrix of four identical rows.
    row = [1.0, 2.0, 3.0, 4.0]
    matrix = array([list(row), list(row), list(row), list(row)])
    typed_buffer = pyarray.array("d", [0, 1, 2, 3])

    # Sparse vector against a dense vector and against the matrix.
    self.assertEqual(10.0, sparse.dot(dense))
    expected_sparse = array([3.0, 6.0, 9.0, 12.0])
    self.assertTrue(array_equal(expected_sparse, sparse.dot(matrix)))

    # Dense vector (numpy-backed).
    self.assertEqual(30.0, dense.dot(dense))
    expected_dense = array([10.0, 20.0, 30.0, 40.0])
    self.assertTrue(array_equal(expected_dense, dense.dot(matrix)))

    # Dense vector (list-backed).
    self.assertEqual(30.0, from_list.dot(dense))
    expected_from_list = array([10.0, 20.0, 30.0, 40.0])
    self.assertTrue(array_equal(expected_from_list, from_list.dot(matrix)))

    # Sparse vector against a standard-library typed array.
    self.assertEqual(7.0, sparse.dot(typed_buffer))
def test_dot(self):
    """Dot products of sparse/dense vectors with vectors, a matrix and a typed array."""
    sparse_vec = SparseVector(4, {1: 1, 3: 2})
    dense_vec = DenseVector(array([1., 2., 3., 4.]))
    list_vec = DenseVector([1, 2, 3, 4])
    grid = array([
        [1., 2., 3., 4.],
        [1., 2., 3., 4.],
        [1., 2., 3., 4.],
        [1., 2., 3., 4.],
    ])
    doubles = pyarray.array('d', [0, 1, 2, 3])

    # Scalar result first, then the matrix product, for each vector in turn.
    scalar = sparse_vec.dot(dense_vec)
    self.assertEqual(10.0, scalar)
    product = sparse_vec.dot(grid)
    self.assertTrue(array_equal(array([3., 6., 9., 12.]), product))

    scalar = dense_vec.dot(dense_vec)
    self.assertEqual(30.0, scalar)
    product = dense_vec.dot(grid)
    self.assertTrue(array_equal(array([10., 20., 30., 40.]), product))

    scalar = list_vec.dot(dense_vec)
    self.assertEqual(30.0, scalar)
    product = list_vec.dot(grid)
    self.assertTrue(array_equal(array([10., 20., 30., 40.]), product))

    # A standard-library array.array is accepted as the right-hand operand.
    scalar = sparse_vec.dot(doubles)
    self.assertEqual(7.0, scalar)
def _calculateSimilarity(self, threshold=0):
    """Compute pairwise cosine similarity between users.

    The entries of ``self.utility`` are grouped by their row index ``i``.
    Each group becomes a binary ``SparseVector`` of length
    ``self.utility.numCols()`` holding a 1 at every column index ``j`` that
    appears in the group. The similarity of two users is the cosine of
    their vectors.

    User ids are iterated over ``1 .. numRows()`` inclusive. An id with no
    entries has no vector, so every pair involving it is skipped. The
    previous implementation silently reused the vector of whichever user
    was matched last in that case.

    :param threshold: currently unused; kept so existing callers continue
        to work.
    :return: tuple ``(sims, sims_set)`` where ``sims`` maps ``(i, j)`` to
        the similarity of users ``i`` and ``j`` (``i != j``), and
        ``sims_set`` maps each user id to the list of ids it was compared
        with.
    """
    user_cnt = self.utility.numRows()
    movie_cnt = self.utility.numCols()

    # Build every user's vector and norm once, instead of rescanning the
    # collected groups and rebuilding both vectors for each (i, j) pair.
    vectors = {}
    norms = {}
    grouped = self.utility.entries.groupBy(lambda x: x.i).collect()
    for user_id, entries in grouped:
        vec = SparseVector(movie_cnt, [(entry.j, 1) for entry in entries])
        vectors[user_id] = vec
        norms[user_id] = vec.norm(2)

    user_ids = range(1, user_cnt + 1)
    sims = dict()
    sims_set = dict((i, list()) for i in user_ids)
    for i in user_ids:
        if i not in vectors:
            continue
        for j in user_ids:
            if i == j or j not in vectors:
                continue
            sims[(i, j)] = vectors[i].dot(vectors[j]) / (norms[i] * norms[j])
            sims_set[i].append(j)
    return (sims, sims_set)
'incorrect number of keys in sampleOHEDictManual') # ** Sparse vectors ** import numpy as np from pyspark.mllib.linalg import SparseVector aDense = np.array([0., 3., 0., 4.]) aSparse = SparseVector(4, [[0,0.], [1,3.], [2,0.], [3,4.]]) bDense = np.array([0., 0., 0., 1.]) bSparse = SparseVector(4, [[0,0.], [1,0.], [2,0.], [3,1.]]) w = np.array([0.4, 3.1, -1.4, -.5]) print aDense.dot(w) print aSparse.dot(w) print bDense.dot(w) print bSparse.dot(w) # TEST Sparse Vectors Test.assertTrue(isinstance(aSparse, SparseVector), 'aSparse needs to be an instance of SparseVector') Test.assertTrue(isinstance(bSparse, SparseVector), 'aSparse needs to be an instance of SparseVector') Test.assertTrue(aDense.dot(w) == aSparse.dot(w), 'dot product of aDense and w should equal dot product of aSparse and w') Test.assertTrue(bDense.dot(w) == bSparse.dot(w), 'dot product of bDense and w should equal dot product of bSparse and w') # ** OHE features as sparse vectors ** sampleOneOHEFeatManual = SparseVector(7,[2,3],[1.0,1.0])
# COMMAND ----------

import numpy as np
from pyspark.mllib.linalg import SparseVector

# COMMAND ----------

# Dense vectors and their sparse counterparts. Each SparseVector is given as
# (size, indices, values) and lists only the non-zero positions.
aDense = np.array([0., 3., 0., 4.])
aSparse = SparseVector(4, [1, 3], [3., 4.])

bDense = np.array([0., 0., 0., 1.])
bSparse = SparseVector(4, [3], [1.])

w = np.array([0.4, 3.1, -1.4, -.5])

# The dense and sparse form of each vector are expected to give the same dot
# product with w. Single-argument print(...) works on both Python 2 and 3.
print(aDense.dot(w))
print(aSparse.dot(w))
print(bDense.dot(w))
print(bSparse.dot(w))

# COMMAND ----------

# MAGIC %md
# MAGIC #### **(1c) OHE features as sparse vectors**
# MAGIC Any feature that occurs in a point should have the value 1.0. For example, the `DenseVector` for a point with features 2 and 4 would be `[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0]`.

# COMMAND ----------

# Reminder of the sample features
# sampleOne = [(0, 'mouse'), (1, 'black')]
# sampleTwo = [(0, 'cat'), (1, 'tabby'), (2, 'mouse')]
# sampleThree = [(0, 'bear'), (1, 'black'), (2, 'salmon')]

# COMMAND ----------
from pyspark.ml.feature import HashingTF, IDF, Tokenizer from pyspark.mllib.linalg import SparseVector from pyspark.ml.linalg import DenseVector from pyspark.sql import Row from functools import partial from pyspark.ml.regression import LinearRegression import string import nltk import math from nltk.stem.porter import * from nltk.corpus import stopwords #sc = SparkContext(appName="Example1") a = SparseVector(10000, {4847: 3.0224, 9959: 6.4765}) print a.dot(a) #print a.dot(array([1., 2., 3., 4.])) b = SparseVector( 10000, { 15: 0.2668, 67: 3.0431, 69: 3.6234, 93: 2.6627, 315: 7.5103, 415: 3.9424, 419: 4.5356, 493: 2.399, 788: 0.2727, 858: 0.3938,
sampleOHEDictManual[(2, 'mouse')], 'ac3478d69a3c81fa62e60f5c3696165a4e5e6ac4', "incorrect value for sampleOHEDictManual[(2,'mouse')]") Test.assertEqualsHashed( sampleOHEDictManual[(2, 'salmon')], 'c1dfd96eea8cc2b62785275bca38ac261256e278', "incorrect value for sampleOHEDictManual[(2,'salmon')]") Test.assertEquals(len(sampleOHEDictManual.keys()), 7, 'incorrect number of keys in sampleOHEDictManual') # TEST Sparse Vectors (1b) Test.assertTrue(isinstance(aSparse, SparseVector), 'aSparse needs to be an instance of SparseVector') Test.assertTrue(isinstance(bSparse, SparseVector), 'aSparse needs to be an instance of SparseVector') Test.assertTrue( aDense.dot(w) == aSparse.dot(w), 'dot product of aDense and w should equal dot product of aSparse and w') Test.assertTrue( bDense.dot(w) == bSparse.dot(w), 'dot product of bDense and w should equal dot product of bSparse and w') # TEST OHE Features as sparse vectors (1c) Test.assertTrue(isinstance(sampleOneOHEFeatManual, SparseVector), 'sampleOneOHEFeatManual needs to be a SparseVector') Test.assertTrue(isinstance(sampleTwoOHEFeatManual, SparseVector), 'sampleTwoOHEFeatManual needs to be a SparseVector') Test.assertTrue(isinstance(sampleThreeOHEFeatManual, SparseVector), 'sampleThreeOHEFeatManual needs to be a SparseVector') Test.assertEqualsHashed(sampleOneOHEFeatManual, 'ecc00223d141b7bd0913d52377cee2cf5783abd6', 'incorrect value for sampleOneOHEFeatManual') Test.assertEqualsHashed(sampleTwoOHEFeatManual,