def test_dot(self):
    """Check `dot` for sparse, dense and list-backed vectors.

    Each vector is dotted with a dense vector (scalar result) and with a
    4x4 matrix whose rows are all ``[1, 2, 3, 4]`` (array result).
    """
    sv = SparseVector(4, {1: 1, 3: 2})
    dv = DenseVector(array([1.0, 2.0, 3.0, 4.0]))
    lst = DenseVector([1, 2, 3, 4])
    mat = array([[1.0, 2.0, 3.0, 4.0],
                 [1.0, 2.0, 3.0, 4.0],
                 [1.0, 2.0, 3.0, 4.0],
                 [1.0, 2.0, 3.0, 4.0]])
    # assertEqual, not the deprecated assertEquals alias (removed in Python 3.12).
    # sv has 1 at index 1 and 2 at index 3: 1*2 + 2*4 = 10.
    self.assertEqual(10.0, sv.dot(dv))
    # Rows 1 and 3 of mat weighted by 1 and 2: 3 * [1, 2, 3, 4].
    self.assertTrue(array_equal(array([3.0, 6.0, 9.0, 12.0]), sv.dot(mat)))
    self.assertEqual(30.0, dv.dot(dv))
    # (1 + 2 + 3 + 4) * [1, 2, 3, 4].
    self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), dv.dot(mat)))
    self.assertEqual(30.0, lst.dot(dv))
    self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), lst.dot(mat)))
def test_dot(self):
    """Check `dot` for sparse, dense and list-backed vectors.

    Each vector is dotted with a dense vector (scalar result) and with a
    4x4 matrix whose rows are all ``[1, 2, 3, 4]`` (array result).
    """
    sv = SparseVector(4, {1: 1, 3: 2})
    dv = DenseVector(array([1., 2., 3., 4.]))
    lst = DenseVector([1, 2, 3, 4])
    mat = array([[1., 2., 3., 4.],
                 [1., 2., 3., 4.],
                 [1., 2., 3., 4.],
                 [1., 2., 3., 4.]])
    # assertEqual, not the deprecated assertEquals alias (removed in Python 3.12).
    # sv has 1 at index 1 and 2 at index 3: 1*2 + 2*4 = 10.
    self.assertEqual(10.0, sv.dot(dv))
    # Rows 1 and 3 of mat weighted by 1 and 2: 3 * [1, 2, 3, 4].
    self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))
    self.assertEqual(30.0, dv.dot(dv))
    # (1 + 2 + 3 + 4) * [1, 2, 3, 4].
    self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
    self.assertEqual(30.0, lst.dot(dv))
    self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
def test_dot(self):
    """Exercise `dot` against a dense vector, a matrix and a typed array."""
    sparse = SparseVector(4, {1: 1, 3: 2})
    dense = DenseVector(array([1.0, 2.0, 3.0, 4.0]))
    from_list = DenseVector([1, 2, 3, 4])
    row = [1.0, 2.0, 3.0, 4.0]
    # Four identical rows, so vector-by-matrix results are multiples of `row`.
    matrix = array([row, row, row, row])
    typed = pyarray.array("d", [0, 1, 2, 3])

    self.assertEqual(10.0, sparse.dot(dense))
    sparse_by_matrix = sparse.dot(matrix)
    self.assertTrue(array_equal(array([3.0, 6.0, 9.0, 12.0]), sparse_by_matrix))

    self.assertEqual(30.0, dense.dot(dense))
    dense_by_matrix = dense.dot(matrix)
    self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), dense_by_matrix))

    self.assertEqual(30.0, from_list.dot(dense))
    list_by_matrix = from_list.dot(matrix)
    self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), list_by_matrix))

    # Sparse entries at 1 and 3 pick typed[1] and typed[3]: 1*1 + 2*3 = 7.
    self.assertEqual(7.0, sparse.dot(typed))
def test_dot(self):
    """Dot a DenseVector with a 4x1 scipy LIL sparse column."""
    from scipy.sparse import lil_matrix

    column = lil_matrix((4, 1))
    # Non-zero entries: row 1 -> 1, row 3 -> 2.
    for row_index, value in ((1, 1), (3, 2)):
        column[row_index, 0] = value

    dense = DenseVector(array([1., 2., 3., 4.]))
    # 2*1 + 4*2 = 10.
    self.assertEqual(10.0, dense.dot(column))
def test_dot(self):
    """Exercise `dot` against a dense vector, a matrix and a typed array."""
    sparse_vec = SparseVector(4, {1: 1, 3: 2})
    dense_vec = DenseVector(array([1., 2., 3., 4.]))
    list_vec = DenseVector([1, 2, 3, 4])
    base_row = [1., 2., 3., 4.]
    # Four identical rows, so vector-by-matrix results are multiples of the row.
    grid = array([base_row, base_row, base_row, base_row])
    double_arr = pyarray.array('d', [0, 1, 2, 3])

    self.assertEqual(10.0, sparse_vec.dot(dense_vec))
    expected_sparse = array([3., 6., 9., 12.])
    self.assertTrue(array_equal(expected_sparse, sparse_vec.dot(grid)))

    self.assertEqual(30.0, dense_vec.dot(dense_vec))
    expected_dense = array([10., 20., 30., 40.])
    self.assertTrue(array_equal(expected_dense, dense_vec.dot(grid)))

    self.assertEqual(30.0, list_vec.dot(dense_vec))
    expected_list = array([10., 20., 30., 40.])
    self.assertTrue(array_equal(expected_list, list_vec.dot(grid)))

    # Sparse entries at 1 and 3 pick double_arr[1] and double_arr[3]: 1 + 6 = 7.
    self.assertEqual(7.0, sparse_vec.dot(double_arr))
def test_dot(self):
    """Dot a DenseVector with a 4x1 scipy LIL sparse column."""
    from scipy.sparse import lil_matrix

    lil = lil_matrix((4, 1))
    lil[1, 0] = 1
    lil[3, 0] = 2
    dv = DenseVector(array([1., 2., 3., 4.]))
    # assertEqual, not the deprecated assertEquals alias (removed in Python 3.12).
    # 2*1 + 4*2 = 10.
    self.assertEqual(10.0, dv.dot(lil))
def get_ratings(self, res_id, ratings, top):
    """Return up to ``top`` product ids ranked for the model under ``res_id``.

    The feature vectors of the products listed in ``ratings`` are summed,
    every other product is scored by the dot product of its features with
    that sum, and the ids with the highest scores are returned.

    Args:
        res_id: Key into ``self.models``.
        ratings: Container of product ids, tested with ``in``; these ids
            seed the profile vector and are excluded from the result.
        top: Maximum number of product ids to return.

    Returns:
        list: Product ids in descending score order; ``[]`` when ``res_id``
        is unknown or none of the ids in ``ratings`` has product features.
    """
    if res_id not in self.models:
        known_keys = list(self.models.keys())
        logger.info("Keys: " + str(known_keys))
        # Guard the type probe: with no models at all, indexing [0] would
        # raise IndexError instead of reporting the unknown key.
        if known_keys:
            logger.info("Key Type: " + str(type(known_keys[0])))
        logger.info("res_id: " + str(res_id))
        logger.info("res_id type:" + str(type(res_id)))
        logger.info("res_id not known")
        return []

    # NOTE(review): productFeatures() is presumably an RDD of
    # (product_id, features) pairs - confirm against the model class.
    pf = self.models[res_id].productFeatures()

    # Collect once; the result serves both the emptiness check and the sum
    # (collecting twice launches two Spark jobs).
    user_f = pf.filter(lambda x: x[0] in ratings).collect()
    if not user_f:
        logger.info("No product matches")
        return []

    tmp = DenseVector(user_f[0][1])
    for row in user_f[1:]:
        tmp = tmp + row[1]

    # Negate the score so takeOrdered (ascending) yields the highest first.
    # A single-argument key replaces the Python-2-only `lambda (k, v)` form.
    estimate_score = (
        pf.map(lambda x: (x[0], tmp.dot(DenseVector(x[1]))))
        .filter(lambda x: x[0] not in ratings)
        .takeOrdered(top, lambda kv: -kv[1])
    )
    return [row[0] for row in estimate_score]
def gradientSummand(weights, lp):
    """Compute one observation's contribution to the gradient.

    The result is ``(weights . features - label) * features``.

    Note:
        `DenseVector` behaves similarly to a `numpy.ndarray` and they can be
        used interchangeably within this function. For example, they both
        implement the `dot` method.

    Args:
        weights (DenseVector): An array of model weights (betas).
        lp (LabeledPoint): The `LabeledPoint` for a single observation.

    Returns:
        DenseVector: An array of values the same length as `weights`. The
            gradient summand.
    """
    prediction = DenseVector.dot(weights, lp.features)
    residual = prediction - lp.label
    return residual * lp.features
def gradientSummand(weights, lp):
    """Compute one observation's contribution to the gradient.

    The result is ``(features . weights - label) * features``.

    Note:
        `DenseVector` behaves similarly to a `numpy.ndarray` and they can be
        used interchangeably within this function. For example, they both
        implement the `dot` method.

    Args:
        weights (DenseVector): An array of model weights (betas).
        lp (LabeledPoint): The `LabeledPoint` for a single observation.

    Returns:
        DenseVector: An array of values the same length as `weights`. The
            gradient summand.
    """
    # The features are the receiver of `dot` here; the weights are the argument.
    predicted = DenseVector.dot(lp.features, weights)
    error = predicted - lp.label
    return error * lp.features
def getLabeledPrediction(weights, observation):
    """Pair an observation's true label with the model's prediction for it.

    Note:
        The labels should remain unchanged as we'll use this information to
        calculate prediction error later.

    Args:
        weights (np.ndarray): An array with one weight for each feature in
            `trainData`.
        observation (LabeledPoint): A `LabeledPoint` that contains the correct
            label and the features for the data point.

    Returns:
        tuple: A (label, prediction) tuple.
    """
    label = observation.label
    # The prediction is the dot product of the features with the weights.
    prediction = DenseVector.dot(observation.features, weights)
    return (label, prediction)
def getLabeledPrediction(weights, observation):
    """Pair an observation's true label with the model's prediction for it.

    Note:
        The labels should remain unchanged as we'll use this information to
        calculate prediction error later.

    Args:
        weights (np.ndarray): An array with one weight for each feature in
            `trainData`.
        observation (LabeledPoint): A `LabeledPoint` that contains the correct
            label and the features for the data point.

    Returns:
        tuple: A (label, prediction) tuple.
    """
    true_label = observation.label
    # Both operands are wrapped in DenseVector before the dot product.
    feature_vec = DenseVector(observation.features)
    weight_vec = DenseVector(weights)
    return (true_label, weight_vec.dot(feature_vec))
# Demo: build numpy arrays, stack them, then take a dot product with a
# PySpark DenseVector. print(...) with one argument behaves the same on
# Python 2 and 3; the bare print statement is a syntax error on Python 3.
zeros = np.zeros(8)  # returns an array of 8 0s [ 0. 0. 0. 0. 0. 0. 0. 0.]
ones = np.ones(8)  # returns an array of 8 1s [ 1. 1. 1. 1. 1. 1. 1. 1.]
print('zeros:\n{0}'.format(zeros))
print('\nones:\n{0}'.format(ones))

zerosThenOnes = np.hstack((zeros, ones))  # notice the "(("
# hstack will return [ 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1.]
zerosAboveOnes = np.vstack((zeros, ones))  # A 2 by 8 array
# vstack in the above example will return [[ 0. 0. 0. 0. 0. 0. 0. 0.]
#                                          [ 1. 1. 1. 1. 1. 1. 1. 1.]]

print('\nzerosThenOnes:\n{0}'.format(zerosThenOnes))
print('\nzerosAboveOnes:\n{0}'.format(zerosAboveOnes))

# When using PySpark, we use DenseVector instead of numpy vector. Example below:
from pyspark.mllib.linalg import DenseVector

numpyVector = np.array([-3, -4, 5])
print('\nnumpyVector:\n{0}'.format(numpyVector))

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector([3.0, 4.0, 5.0])
# Calculate the dot product between the two vectors.
# 3*(-3) + 4*(-4) + 5*5 = 0.
denseDotProduct = DenseVector.dot(myDenseVector, numpyVector)  # DenseVector.dot() does the dot product

print('myDenseVector:\n{0}'.format(myDenseVector))
print('\ndenseDotProduct:\n{0}'.format(denseDotProduct))
expectedError = [79.72013547, 30.27835699, 9.27842641, 9.20967856, 9.19446483]
Test.assertTrue(np.allclose(exampleErrorTrain, expectedError), 'value of exampleErrorTrain is incorrect')


# #### ** (3d) Train the model **
# #### Now let's train a linear regression model on all of our training data and evaluate its accuracy on the validation set. Note that the test set will not be used here. If we evaluated the model on the test set, we would bias our final results.
# #### We've already done much of the required work: we computed the number of features in Part (1b); we created the training and validation datasets and computed their sizes in Part (1e); and, we wrote a function to compute RMSE in Part (2b).

# In[44]:

# TODO: Replace <FILL IN> with appropriate code
numIters = 50
weightsLR0, errorTrainLR0 = linregGradientDescent(parsedTrainData, numIters)
# Pair each validation label with the prediction weights . features.
# NOTE(review): the unbound DenseVector.dot call assumes weightsLR0 is a
# DenseVector; linregGradientDescent is not visible here - confirm.
labelsAndPreds = parsedValData.map(lambda lp: (lp.label, DenseVector.dot(weightsLR0, lp.features)))
rmseValLR0 = calcRMSE(labelsAndPreds)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('Validation RMSE:\n\tBaseline = {0:.3f}\n\tLR0 = {1:.3f}'.format(rmseValBase, rmseValLR0))

# In[45]:

# TEST Train the model (3d)
expectedOutput = [22.64535883, 20.064699, -0.05341901, 8.2931319,
                  5.79155768, -4.51008084, 15.23075467, 3.8465554,
                  9.91992022, 5.97465933, 11.36849033, 3.86452361]
Test.assertTrue(np.allclose(weightsLR0, expectedOutput), 'incorrect value for weightsLR0')


# #### ** Visualization 4: Training error **
# In[22]:

from pyspark.mllib.linalg import DenseVector

# In[25]:

# TODO: Replace <FILL IN> with appropriate code
# Single-argument print(...) behaves the same on Python 2 and Python 3.
numpyVector = np.array([-3, -4, 5])
print('\nnumpyVector:\n{0}'.format(numpyVector))

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector([3.0, 4.0, 5.0])
# Calculate the dot product between the two vectors.
# 3*(-3) + 4*(-4) + 5*5 = 0, which the test below checks.
denseDotProduct = myDenseVector.dot(numpyVector)

print('myDenseVector:\n{0}'.format(myDenseVector))
print('\ndenseDotProduct:\n{0}'.format(denseDotProduct))

# In[26]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), 'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])), 'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0), 'incorrect value for denseDotProduct')


# ### ** Part 4: Python lambda expressions **
# In[28]:

from pyspark.mllib.linalg import DenseVector

# In[31]:

# TODO: Replace <FILL IN> with appropriate code
# Single-argument print(...) behaves the same on Python 2 and Python 3.
numpyVector = np.array([-3, -4, 5])
print("\nnumpyVector:\n{0}".format(numpyVector))

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector([3.0, 4.0, 5.0])  # <FILL IN>
# Calculate the dot product between the two vectors.
# The numpy array is wrapped in a DenseVector first; 3*(-3) + 4*(-4) + 5*5 = 0.
denseDotProduct = myDenseVector.dot(DenseVector(numpyVector))  # <FILL IN>

print("myDenseVector:\n{0}".format(myDenseVector))
print("\ndenseDotProduct:\n{0}".format(denseDotProduct))

# In[32]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), "myDenseVector is not a DenseVector")
Test.assertTrue(np.allclose(myDenseVector, np.array([3.0, 4.0, 5.0])), "incorrect value for myDenseVector")
Test.assertTrue(np.allclose(denseDotProduct, 0.0), "incorrect value for denseDotProduct")


# ### ** Part 4: Python lambda expressions **
expectedError = [79.72013547, 30.27835699, 9.27842641, 9.20967856, 9.19446483]
Test.assertTrue(
    np.allclose(exampleErrorTrain, expectedError),
    'value of exampleErrorTrain is incorrect')


# #### ** (3d) Train the model **
# #### Now let's train a linear regression model on all of our training data and evaluate its accuracy on the validation set. Note that the test set will not be used here. If we evaluated the model on the test set, we would bias our final results.
# #### We've already done much of the required work: we computed the number of features in Part (1b); we created the training and validation datasets and computed their sizes in Part (1e); and, we wrote a function to compute RMSE in Part (2b).

# In[44]:

# TODO: Replace <FILL IN> with appropriate code
numIters = 50
weightsLR0, errorTrainLR0 = linregGradientDescent(parsedTrainData, numIters)
# Pair each validation label with the prediction weights . features.
# NOTE(review): the unbound DenseVector.dot call assumes weightsLR0 is a
# DenseVector; linregGradientDescent is not visible here - confirm.
labelsAndPreds = parsedValData.map(
    lambda lp: (lp.label, DenseVector.dot(weightsLR0, lp.features)))
rmseValLR0 = calcRMSE(labelsAndPreds)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('Validation RMSE:\n\tBaseline = {0:.3f}\n\tLR0 = {1:.3f}'.format(
    rmseValBase, rmseValLR0))

# In[45]:

# TEST Train the model (3d)
expectedOutput = [
    22.64535883, 20.064699, -0.05341901, 8.2931319, 5.79155768, -4.51008084,
    15.23075467, 3.8465554, 9.91992022, 5.97465933, 11.36849033, 3.86452361
]
Test.assertTrue(
    np.allclose(weightsLR0, expectedOutput), 'incorrect value for weightsLR0')
# In[29]:

from pyspark.mllib.linalg import DenseVector

# In[31]:

# TODO: Replace <FILL IN> with appropriate code
# Single-argument print(...) behaves the same on Python 2 and Python 3.
numpyVector = np.array([-3, -4, 5])
print('\nnumpyVector:\n{0}'.format(numpyVector))

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector(np.array([3.0, 4.0, 5.0]))
# Calculate the dot product between the two vectors.
# 3*(-3) + 4*(-4) + 5*5 = 0, which the test below checks.
denseDotProduct = DenseVector.dot(myDenseVector, numpyVector)

print('myDenseVector:\n{0}'.format(myDenseVector))
print('\ndenseDotProduct:\n{0}'.format(denseDotProduct))

# In[32]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), 'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])), 'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0), 'incorrect value for denseDotProduct')


# ### ** Part 4: Python lambda expressions **
def pyspark_auc_mu(data, y_true, y_score, A=None, W=None):
    """
    Compute the multi-class measure AUC Mu from prediction scores and labels
    from Spark dataframe.

    For every pair of classes (i, j) with j < i, the score vectors of the
    rows belonging to the two classes are projected onto
    ``A[i, :] - A[j, :]``, a binary area-under-ROC is computed on the
    projected scores, and the results are summed with weights ``W[i, j]``.

    Parameters
    ----------
    data : Spark dataframe
        The prediction output table from pyspark.ml

    y_true : double type column in data, shape = [n_samples]
        Name of the true class labels column, with values in the range
        [0, n_classes-1]

    y_score : vector type column in data, shape = [n_samples, n_classes]
        Name of the target scores column, where each element in a vector is
        a categorical distribution over the n_classes.

    A : array, shape = [n_classes, n_classes], optional
        The partition (or misclassification cost) matrix. If ``None`` A is the
        argmax partition matrix. Entry A_{i,j} is the cost of classifying an
        instance as class i when the true class is j. It is expected that
        diagonal entries in A are zero and off-diagonal entries are positive.

    W : array, shape = [n_classes, n_classes], optional
        The weight matrix for incorporating class skew into AUC Mu. If ``None``,
        the standard AUC Mu is calculated. If W is specified, it is expected to
        be a lower triangular matrix where entry W_{i,j} is a positive float
        from 0 to 1 for the partial score between classes i and j. Entries not
        in the lower triangular portion of W must be 0 and the sum of all
        entries in W must be 1.

    Returns
    -------
    auc_mu : float

    Raises
    ------
    TypeError
        If ``data`` is not a DataFrame, a column has the wrong dtype, or
        ``A`` / ``W`` is not an ``np.ndarray``.
    ValueError
        If the score vectors' average length differs from the number of
        distinct labels, or ``A`` / ``W`` has the wrong shape or content.

    References
    ----------
    .. [1] Kleiman, R., Page, D. ``AUC Mu: A Performance Metric for Multi-Class
           Machine Learning Models``, Proceedings of the 2019 International
           Conference on Machine Learning (ICML).
       [2] https://github.com/kleimanr/auc_mu
    """
    # Validate `data` before calling any DataFrame method on it; otherwise a
    # wrong type surfaces as an AttributeError and this check is unreachable.
    if not isinstance(data, DataFrame):
        raise TypeError("Expected data to be DataFrame, got: %s" % type(data))

    # Schema checks are cheap (no Spark job), so do them before any count.
    y_true_dtype = data.select(y_true).dtypes[0][1]
    if not y_true_dtype == 'double':
        raise TypeError("Expected column y_true to be double, got: %s"
                        % y_true_dtype)
    y_score_dtype = data.select(y_score).dtypes[0][1]
    if not y_score_dtype == 'vector':
        # Report the dtype of y_score itself, not that of y_true.
        raise TypeError("Expected column y_score to be vector, got: %s"
                        % y_score_dtype)

    n_classes = data.select(y_true).distinct().count()

    # The former y_true-count == y_score-count check was dropped: both
    # columns come from the same DataFrame, so the counts are always equal
    # and the check only added Spark jobs.

    # The average score-vector length must match the number of classes.
    slen = udf(lambda s: len(s), IntegerType())
    avg_len = data.select(slen(col(y_score))).groupBy().avg().collect()[0][0]
    if not avg_len == n_classes:
        raise ValueError("Expected y_true values in range 0..%i, got: %s"
                         % (n_classes - 1, str(avg_len)))

    if A is None:
        # Argmax partition matrix: zero diagonal, ones elsewhere.
        A = np.ones((n_classes, n_classes)) - np.eye(n_classes)
    if not isinstance(A, np.ndarray):
        raise TypeError("Expected A to be np.ndarray, got: %s" % type(A))
    if not A.ndim == 2:
        raise ValueError("Expected A to be 2 dimensional, got: %s" % A.ndim)
    if not A.shape == (n_classes, n_classes):
        raise ValueError("Expected A to be shape (%i, %i), got: %s"
                         % (n_classes, n_classes, str(A.shape)))
    if not np.all(A.diagonal() == np.zeros(n_classes)):
        raise ValueError("Expected A to be zero on the diagonals")
    if not np.all(A >= 0):
        raise ValueError("Expected A to be non-negative")

    if W is None:
        # Uniform weights over the strictly lower triangle, summing to 1.
        W = np.tri(n_classes, k=-1)
        W /= W.sum()
    if not isinstance(W, np.ndarray):
        raise TypeError("Expected W to be np.ndarray, got: %s" % type(W))
    if not W.ndim == 2:
        raise ValueError("Expected W to be 2 dimensional, got: %s" % W.ndim)
    if not W.shape == (n_classes, n_classes):
        raise ValueError("Expected W to be shape (%i, %i), got: %s"
                         % (n_classes, n_classes, str(W.shape)))

    auc_total = 0.0

    for class_i in range(n_classes):
        preds_i = data.select(y_score).where(col(y_true) == class_i)
        n_i = preds_i.count()
        for class_j in range(class_i):
            preds_j = data.select(y_score).where(col(y_true) == class_j)
            n_j = preds_j.count()
            n = n_i + n_j

            # temp_preds: concat prob vectors whose class is i or j. Select
            # by the caller-supplied column name, not a fixed "probability".
            pair_preds = preds_i.union(preds_j)
            temp_preds = DenseVector(
                pair_preds.select(y_score).rdd.map(lambda x: x[0]).collect())

            # temp_labels: the two selected classes as a binary label vector
            # (class_i rows come first and get 0, class_j rows get 1).
            temp_labels = np.zeros((n), dtype=int)
            temp_labels[n_i:n] = 1

            # v: difference of rows i and j of the partition matrix.
            v = A[class_i, :] - A[class_j, :]
            score = temp_preds.dot(v)

            # Plain Python floats per row; avoids float() on a 1-element
            # array slice, which NumPy deprecates.
            concat_df = [(float(s), float(label))
                         for s, label in zip(score, temp_labels)]
            # NOTE(review): `spark` is not defined in this function;
            # presumably a module-level SparkSession - confirm.
            auc_mu_df = spark.createDataFrame(
                concat_df, schema=["score", "temp_label"])

            evaluator = BinaryClassificationEvaluator(
                labelCol="temp_label",
                rawPredictionCol="score",
                metricName='areaUnderROC')
            score_i_j = evaluator.evaluate(auc_mu_df)

            auc_total += W[class_i, class_j] * score_i_j

    return auc_total
from pyspark.mllib.linalg import DenseVector

# In[29]:

# TODO: Replace <FILL IN> with appropriate code
# Single-argument print(...) behaves the same on Python 2 and Python 3.
numpyVector = np.array([-3, -4, 5])
print('\nnumpyVector:\n{0}'.format(numpyVector))

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
#myDenseVector = <FILL IN>
myDenseVector = DenseVector([3.0, 4.0, 5.0])
# Calculate the dot product between the two vectors.
#denseDotProduct = <FILL IN>
# 3*(-3) + 4*(-4) + 5*5 = 0, which the test below checks.
denseDotProduct = DenseVector.dot(myDenseVector, numpyVector)

print('myDenseVector:\n{0}'.format(myDenseVector))
print('\ndenseDotProduct:\n{0}'.format(denseDotProduct))

# In[30]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), 'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])), 'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0), 'incorrect value for denseDotProduct')


# ### ** Part 4: Python lambda expressions **
# Demo: build numpy arrays, stack them, then take a dot product with a
# PySpark DenseVector. print(...) with one argument behaves the same on
# Python 2 and 3; the bare print statement is a syntax error on Python 3.
zeros = np.zeros(8)  # returns an array of 8 0s [ 0. 0. 0. 0. 0. 0. 0. 0.]
ones = np.ones(8)  # returns an array of 8 1s [ 1. 1. 1. 1. 1. 1. 1. 1.]
print('zeros:\n{0}'.format(zeros))
print('\nones:\n{0}'.format(ones))

zerosThenOnes = np.hstack((zeros,ones))  # notice the "(("
# hstack will return [ 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1.]
zerosAboveOnes = np.vstack((zeros,ones))  # A 2 by 8 array
# vstack in the above example will return [[ 0. 0. 0. 0. 0. 0. 0. 0.]
#                                          [ 1. 1. 1. 1. 1. 1. 1. 1.]]

print('\nzerosThenOnes:\n{0}'.format(zerosThenOnes))
print('\nzerosAboveOnes:\n{0}'.format(zerosAboveOnes))

# When using PySpark, we use DenseVector instead of numpy vector. Example below:
from pyspark.mllib.linalg import DenseVector

numpyVector = np.array([-3, -4, 5])
print('\nnumpyVector:\n{0}'.format(numpyVector))

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector([3.0, 4.0, 5.0])
# Calculate the dot product between the two vectors.
# 3*(-3) + 4*(-4) + 5*5 = 0.
denseDotProduct = DenseVector.dot(myDenseVector, numpyVector)  # DenseVector.dot() does the dot product

print('myDenseVector:\n{0}'.format(myDenseVector))
print('\ndenseDotProduct:\n{0}'.format(denseDotProduct))
# In[97]:

from pyspark.mllib.linalg import DenseVector

# In[98]:

# TODO: Replace <FILL IN> with appropriate code
# Single-argument print(...) behaves the same on Python 2 and Python 3.
numpyVector = np.array([-3, -4, 5])
print('\nnumpyVector:\n{0}'.format(numpyVector))

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector(np.array([3.0, 4.0, 5.0]))
# Calculate the dot product between the two vectors.
# The numpy array is wrapped in a DenseVector first; (-3)*3 + (-4)*4 + 5*5 = 0.
denseDotProduct = DenseVector.dot(DenseVector(numpyVector), myDenseVector)

print('myDenseVector:\n{0}'.format(myDenseVector))
print('\ndenseDotProduct:\n{0}'.format(denseDotProduct))

# In[99]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), 'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])), 'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0), 'incorrect value for denseDotProduct')


# ### ** Part 4: Python lambda expressions **
PySpark provides a DenseVector class which allows you to more efficiently operate and store these sparse vectors DenseVector is used to store arrays of values for use in PySpark. DenseVector actually stores values in a NumPy array and delegates calculations to that object. You can create a new DenseVector using DenseVector() and passing in a NumPy array or a Python list ''' from pyspark.mllib.linalg import DenseVector # Create a numpy array numpyVector = np.array([-3, -4, 5]) print '\nnumpyVector:\n{0}'.format(numpyVector) # Create a DenseVector myDenseVector = DenseVector(np.array([3.0, 4.0, 5.0])) # Calculate the dot product between the two vectors. # One of the vectors here is a numpy array, and the other a densevector denseDotProduct = DenseVector.dot(DenseVector(numpyVector),myDenseVector) print 'myDenseVector:\n{0}'.format(myDenseVector) print '\ndenseDotProduct:\n{0}'.format(denseDotProduct) ################# Intro to Panda ############################ ### ### ### ### ### Introduction to Panda ### ### ### ### ### ############################################################### import numpy as np import pandas as pd