Example #1
File: tests.py Project: vidur89/spark
 def test_dot(self):
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1.0, 2.0, 3.0, 4.0]))
     lst = DenseVector([1, 2, 3, 4])
     mat = array([[1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0]])
     self.assertEqual(10.0, sv.dot(dv))
     self.assertTrue(array_equal(array([3.0, 6.0, 9.0, 12.0]), sv.dot(mat)))
     self.assertEqual(30.0, dv.dot(dv))
     self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), dv.dot(mat)))
     self.assertEqual(30.0, lst.dot(dv))
     self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), lst.dot(mat)))
Example #2
File: tests.py Project: bopopescu/SparkNew
 def test_dot(self):
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1., 2., 3., 4.]))
     lst = DenseVector([1, 2, 3, 4])
     mat = array([[1., 2., 3., 4.], [1., 2., 3., 4.], [1., 2., 3., 4.],
                  [1., 2., 3., 4.]])
     self.assertEqual(10.0, sv.dot(dv))
     self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))
     self.assertEqual(30.0, dv.dot(dv))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
     self.assertEqual(30.0, lst.dot(dv))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
Example #3
 def test_dot(self):
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1.0, 2.0, 3.0, 4.0]))
     lst = DenseVector([1, 2, 3, 4])
     mat = array(
         [[1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0]]
     )
     arr = pyarray.array("d", [0, 1, 2, 3])
     self.assertEqual(10.0, sv.dot(dv))
     self.assertTrue(array_equal(array([3.0, 6.0, 9.0, 12.0]), sv.dot(mat)))
     self.assertEqual(30.0, dv.dot(dv))
     self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), dv.dot(mat)))
     self.assertEqual(30.0, lst.dot(dv))
     self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), lst.dot(mat)))
     self.assertEqual(7.0, sv.dot(arr))
Example #4
 def test_dot(self):
     from scipy.sparse import lil_matrix
     lil = lil_matrix((4, 1))
     lil[1, 0] = 1
     lil[3, 0] = 2
     dv = DenseVector(array([1., 2., 3., 4.]))
     self.assertEqual(10.0, dv.dot(lil))
Example #5
 def test_dot(self):
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = DenseVector(array([1., 2., 3., 4.]))
     lst = DenseVector([1, 2, 3, 4])
     mat = array([[1., 2., 3., 4.],
                  [1., 2., 3., 4.],
                  [1., 2., 3., 4.],
                  [1., 2., 3., 4.]])
     arr = pyarray.array('d', [0, 1, 2, 3])
     self.assertEqual(10.0, sv.dot(dv))
     self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))
     self.assertEqual(30.0, dv.dot(dv))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
     self.assertEqual(30.0, lst.dot(dv))
     self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
     self.assertEqual(7.0, sv.dot(arr))
Example #6
File: tests.py Project: bopopescu/SparkNew
 def test_dot(self):
     from scipy.sparse import lil_matrix
     lil = lil_matrix((4, 1))
     lil[1, 0] = 1
     lil[3, 0] = 2
     dv = DenseVector(array([1., 2., 3., 4.]))
     self.assertEqual(10.0, dv.dot(lil))
Example #7
    def get_ratings(self, res_id, ratings, top):
        if res_id not in self.models:
            logger.info("Known keys: " + str(self.models.keys()))
            logger.info("res_id " + str(res_id) + " (type " + str(type(res_id)) + ") not known")
            return []

        # Product-feature vectors from the trained model for this resource.
        pf = self.models[res_id].productFeatures()

        # Keep only the products the user has already rated (collect once).
        user_f = pf.filter(lambda x: x[0] in ratings).collect()
        if not user_f:
            logger.info("No product matches")
            return []

        # Sum the feature vectors of the rated products into one profile vector.
        profile = DenseVector(user_f[0][1])
        for i in xrange(1, len(user_f)):
            profile = profile + user_f[i][1]

        # Score every unrated product by its dot product with the profile and
        # keep the ids of the `top` highest-scoring products.
        estimate_score = pf.map(lambda x: (x[0], profile.dot(DenseVector(x[1])))) \
                           .filter(lambda x: x[0] not in ratings) \
                           .takeOrdered(top, key=lambda kv: -kv[1])
        return [pid for pid, _ in estimate_score]
Example #8
def gradientSummand(weights, lp):
    """Calculates the gradient summand for a given weight and `LabeledPoint`.

    Note:
        `DenseVector` behaves similarly to a `numpy.ndarray` and they can be used interchangeably
        within this function.  For example, they both implement the `dot` method.

    Args:
        weights (DenseVector): An array of model weights (betas).
        lp (LabeledPoint): The `LabeledPoint` for a single observation.

    Returns:
        DenseVector: An array of values the same length as `weights`.  The gradient summand.
    """
    return (DenseVector.dot(weights, lp.features) - lp.label) * lp.features
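A quick sanity check of the summand formula (w . x - y) * x; the weights and observation below are made-up values, chosen only for illustration:

# Hypothetical check of gradientSummand with made-up values.
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.regression import LabeledPoint

exampleW = DenseVector([1.0, 1.0, 1.0])
exampleLP = LabeledPoint(2.0, [3.0, 1.0, 4.0])
# w . x = 8.0, so the summand is (8.0 - 2.0) * [3, 1, 4] = [18.0, 6.0, 24.0]
print gradientSummand(exampleW, exampleLP)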
Example #9
def gradientSummand(weights, lp):
    """Calculates the gradient summand for a given weight and `LabeledPoint`.

    Note:
        `DenseVector` behaves similarly to a `numpy.ndarray` and they can be used interchangeably
        within this function.  For example, they both implement the `dot` method.

    Args:
        weights (DenseVector): An array of model weights (betas).
        lp (LabeledPoint): The `LabeledPoint` for a single observation.

    Returns:
        DenseVector: An array of values the same length as `weights`.  The gradient summand.
    """
    return (DenseVector.dot(lp.features, weights) - lp.label) * lp.features
Example #10
def getLabeledPrediction(weights, observation):
    """Calculates predictions and returns a (label, prediction) tuple.

    Note:
        The labels should remain unchanged as we'll use this information to calculate prediction
        error later.

    Args:
        weights (np.ndarray): An array with one weight for each feature in `trainData`.
        observation (LabeledPoint): A `LabeledPoint` that contains the correct label and the
            features for the data point.

    Returns:
        tuple: A (label, prediction) tuple.
    """
    return (observation.label, DenseVector.dot(observation.features, weights))
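A short hypothetical usage with made-up weights and a single observation:

# Hypothetical usage of getLabeledPrediction; values are made up.
import numpy as np
from pyspark.mllib.regression import LabeledPoint

weights = np.array([1.0, 1.0, 1.0])
observation = LabeledPoint(2.0, [1.0, 0.5, 1.5])
# features . weights = 1.0 + 0.5 + 1.5 = 3.0, so this prints (2.0, 3.0)
print getLabeledPrediction(weights, observation)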
Example #11
def getLabeledPrediction(weights, observation):
    """Calculates predictions and returns a (label, prediction) tuple.

    Note:
        The labels should remain unchanged as we'll use this information to calculate prediction
        error later.

    Args:
        weights (np.ndarray): An array with one weight for each feature in `trainData`.
        observation (LabeledPoint): A `LabeledPoint` that contains the correct label and the
            features for the data point.

    Returns:
        tuple: A (label, prediction) tuple.
    """
    label = observation.label
    features = DenseVector(observation.features)
    weights = DenseVector(weights)
    prediction = DenseVector.dot(weights, features)
    result = (label, prediction)
    return result
Example #12
zeros = np.zeros(8)  # returns an array of 8 0s: [ 0.  0.  0.  0.  0.  0.  0.  0.]
ones = np.ones(8)  # returns an array of 8 1s: [ 1.  1.  1.  1.  1.  1.  1.  1.]
print 'zeros:\n{0}'.format(zeros)
print '\nones:\n{0}'.format(ones)

zerosThenOnes = np.hstack((zeros, ones))  # note the "((" -- hstack takes a tuple of arrays
# hstack will return [ 0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.  1.  1.  1.  1.]
zerosAboveOnes = np.vstack((zeros, ones))  # a 2 by 8 array
# vstack in the above example will return [[ 0.  0.  0.  0.  0.  0.  0.  0.]
#                                          [ 1.  1.  1.  1.  1.  1.  1.  1.]]

print '\nzerosThenOnes:\n{0}'.format(zerosThenOnes)
print '\nzerosAboveOnes:\n{0}'.format(zerosAboveOnes)

# When using PySpark, we use DenseVector instead of a NumPy array. Example below:

from pyspark.mllib.linalg import DenseVector

numpyVector = np.array([-3, -4, 5])
print '\nnumpyVector:\n{0}'.format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector([3.0, 4.0, 5.0])
# Calculate the dot product between the two vectors.
denseDotProduct = DenseVector.dot(myDenseVector, numpyVector)  # DenseVector.dot() computes the dot product

print 'myDenseVector:\n{0}'.format(myDenseVector)
print '\ndenseDotProduct:\n{0}'.format(denseDotProduct)
Example #13
expectedError = [79.72013547, 30.27835699,  9.27842641,  9.20967856,  9.19446483]
Test.assertTrue(np.allclose(exampleErrorTrain, expectedError),
                'value of exampleErrorTrain is incorrect')


# #### ** (3d) Train the model **
# #### Now let's train a linear regression model on all of our training data and evaluate its accuracy on the validation set.  Note that the test set will not be used here.  If we evaluated the model on the test set, we would bias our final results.
# #### We've already done much of the required work: we computed the number of features in Part (1b); we created the training and validation datasets and computed their sizes in Part (1e); and, we wrote a function to compute RMSE in Part (2b).

# In[44]:

# TODO: Replace <FILL IN> with appropriate code
numIters = 50
weightsLR0, errorTrainLR0 = linregGradientDescent(parsedTrainData, numIters)

labelsAndPreds = parsedValData.map(lambda lp: (lp.label, DenseVector.dot(weightsLR0, lp.features)))
rmseValLR0 = calcRMSE(labelsAndPreds)

print 'Validation RMSE:\n\tBaseline = {0:.3f}\n\tLR0 = {1:.3f}'.format(rmseValBase,
                                                                       rmseValLR0)


# In[45]:

# TEST Train the model (3d)
expectedOutput = [22.64535883, 20.064699, -0.05341901, 8.2931319, 5.79155768, -4.51008084,
                  15.23075467, 3.8465554, 9.91992022, 5.97465933, 11.36849033, 3.86452361]
Test.assertTrue(np.allclose(weightsLR0, expectedOutput), 'incorrect value for weightsLR0')


# #### ** Visualization 4: Training error **
Example #14
# In[22]:

from pyspark.mllib.linalg import DenseVector


# In[25]:

# TODO: Replace <FILL IN> with appropriate code
numpyVector = np.array([-3, -4, 5])
print '\nnumpyVector:\n{0}'.format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector([3.0, 4.0, 5.0])
# Calculate the dot product between the two vectors.
denseDotProduct = myDenseVector.dot(numpyVector)

print 'myDenseVector:\n{0}'.format(myDenseVector)
print '\ndenseDotProduct:\n{0}'.format(denseDotProduct)


# In[26]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), 'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])),
                'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0), 'incorrect value for denseDotProduct')


# ### ** Part 4: Python lambda expressions **
# In[28]:

from pyspark.mllib.linalg import DenseVector


# In[31]:

# TODO: Replace <FILL IN> with appropriate code
numpyVector = np.array([-3, -4, 5])
print "\nnumpyVector:\n{0}".format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector([3.0, 4.0, 5.0])  # <FILL IN>
# Calculate the dot product between the two vectors.
denseDotProduct = myDenseVector.dot(DenseVector(numpyVector))  # <FILL IN>

print "myDenseVector:\n{0}".format(myDenseVector)
print "\ndenseDotProduct:\n{0}".format(denseDotProduct)


# In[32]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), "myDenseVector is not a DenseVector")
Test.assertTrue(np.allclose(myDenseVector, np.array([3.0, 4.0, 5.0])), "incorrect value for myDenseVector")
Test.assertTrue(np.allclose(denseDotProduct, 0.0), "incorrect value for denseDotProduct")


# ### ** Part 4: Python lambda expressions **
Example #16
expectedError = [79.72013547, 30.27835699, 9.27842641, 9.20967856, 9.19446483]
Test.assertTrue(np.allclose(exampleErrorTrain, expectedError),
                'value of exampleErrorTrain is incorrect')

# #### ** (3d) Train the model **
# #### Now let's train a linear regression model on all of our training data and evaluate its accuracy on the validation set.  Note that the test set will not be used here.  If we evaluated the model on the test set, we would bias our final results.
# #### We've already done much of the required work: we computed the number of features in Part (1b); we created the training and validation datasets and computed their sizes in Part (1e); and, we wrote a function to compute RMSE in Part (2b).

# In[44]:

# TODO: Replace <FILL IN> with appropriate code
numIters = 50
weightsLR0, errorTrainLR0 = linregGradientDescent(parsedTrainData, numIters)

labelsAndPreds = parsedValData.map(
    lambda lp: (lp.label, DenseVector.dot(weightsLR0, lp.features)))
rmseValLR0 = calcRMSE(labelsAndPreds)

print 'Validation RMSE:\n\tBaseline = {0:.3f}\n\tLR0 = {1:.3f}'.format(
    rmseValBase, rmseValLR0)

# In[45]:

# TEST Train the model (3d)
expectedOutput = [
    22.64535883, 20.064699, -0.05341901, 8.2931319, 5.79155768, -4.51008084,
    15.23075467, 3.8465554, 9.91992022, 5.97465933, 11.36849033, 3.86452361
]
Test.assertTrue(np.allclose(weightsLR0, expectedOutput),
                'incorrect value for weightsLR0')
# In[29]:

from pyspark.mllib.linalg import DenseVector


# In[31]:

# TODO: Replace <FILL IN> with appropriate code
numpyVector = np.array([-3, -4, 5])
print '\nnumpyVector:\n{0}'.format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector(np.array([3.0, 4.0, 5.0]))
# Calculate the dot product between the two vectors.
denseDotProduct = DenseVector.dot(myDenseVector, numpyVector)

print 'myDenseVector:\n{0}'.format(myDenseVector)
print '\ndenseDotProduct:\n{0}'.format(denseDotProduct)


# In[32]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), 'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])),
                'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0), 'incorrect value for denseDotProduct')


# ### ** Part 4: Python lambda expressions **
Example #18
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.linalg import DenseVector
import numpy as np

# Note: the function body also assumes an active SparkSession bound to `spark`.


def pyspark_auc_mu(data, y_true, y_score, A=None, W=None):
    """
    Compute the multi-class measure AUC Mu from prediction scores and labels in a Spark dataframe.
    
    Parameters
    ----------
    data : Spark dataframe
        The prediction output table from pyspark.ml
        
    y_true : double type column in data, shape = [n_samples]
        The true class labels column from the Spark dataframe, with values in the range [0, n_classes-1]
    
    y_score : vector type column in data, shape = [n_samples, n_classes]
        Target scores, where each element in a vector is a categorical distribution over the 
        n_classes.
    
    A : array, shape = [n_classes, n_classes], optional
        The partition (or misclassification cost) matrix. If ``None`` A is the
        argmax partition matrix. Entry A_{i,j} is the cost of classifying an
        instance as class i when the true class is j. It is expected that
        diagonal entries in A are zero and off-diagonal entries are positive.
    
    W : array, shape = [n_classes, n_classes], optional
        The weight matrix for incorporating class skew into AUC Mu. If ``None``,
        the standard AUC Mu is calculated. If W is specified, it is expected to 
        be a lower triangular matrix where entry W_{i,j} is a positive float
        from 0 to 1 for the partial score between classes i and j. Entries not
        in the lower triangular portion of W must be 0 and the sum of all 
        entries in W must be 1.
    
    Returns
    -------
    auc_mu : float
    
    References
    ----------
    .. [1] Kleiman, R., Page, D. ``AUC Mu: A Performance Metric for Multi-Class
           Machine Learning Models``, Proceedings of the 2019 International
           Conference on Machine Learning (ICML).
    .. [2] https://github.com/kleimanr/auc_mu
    """
    
    n_classes = data.select(y_true).distinct().count()
    n_samples = data.select(y_score).count()
    # Validate input arguments
    if not isinstance(data, DataFrame):
        raise TypeError("Expected data to be DataFrame, got: %s"
                        % type(data))     
    if not data.select(y_true).dtypes[0][1] == 'double':
        raise TypeError("Expected column y_true to be double, got: %s"
                        % data.select(y_true).dtypes[0][1])
    if not data.select(y_score).dtypes[0][1] == 'vector':
        raise TypeError("Expected column y_score to be vector, got: %s"
                        % data.select(y_score).dtypes[0][1])
    if not data.select(y_true).count() == n_samples:
        raise ValueError("Expected y_true to be shape %s, got: %s"
                        %(str(data.select(y_score).count()), str(data.select(y_true).count())))
    
    slen = udf(lambda s: len(s), IntegerType())
    if not data.select(slen(col(y_score))).groupBy().avg().collect()[0][0] == n_classes:
        raise ValueError("Expected y_score vectors of length %i, got: %s"
                         % (n_classes, str(data.select(slen(col(y_score))).groupBy().avg().collect()[0][0])))
    if A is None:
        A = np.ones((n_classes, n_classes)) - np.eye(n_classes)
    if not isinstance(A, np.ndarray):
        raise TypeError("Expected A to be np.ndarray, got: %s" 
                        % type(A))
    if not A.ndim == 2:
        raise ValueError("Expected A to be 2 dimensional, got: %s"
                         % A.ndim)
    if not A.shape == (n_classes, n_classes):
        raise ValueError("Expected A to be shape (%i, %i), got: %s"
                         %(n_classes, n_classes, str(A.shape)))
    if not np.all(A.diagonal() == np.zeros(n_classes)):
        raise ValueError("Expected A to be zero on the diagonals")
    if not np.all(A >= 0):
        raise ValueError("Expected A to be non-negative")
    
    if W is None:
        W = np.tri(n_classes, k=-1)
        W /= W.sum()
    if not isinstance(W, np.ndarray):
        raise TypeError("Expected W to be np.ndarray, got: %s" 
                        % type(W))
    if not W.ndim == 2:
        raise ValueError("Expected W to be 2 dimensional, got: %s"
                         % W.ndim)
    if not W.shape == (n_classes, n_classes):
        raise ValueError("Expected W to be shape (%i, %i), got: %s"
                         %(n_classes, n_classes, str(W.shape)))
    
    auc_total = 0.0
    for class_i in xrange(n_classes):
        preds_i = data.select(y_score).where(col(y_true) == class_i)
        n_i = preds_i.count()
        
        for class_j in xrange(class_i):
            preds_j = data.select(y_score).where(col(y_true) == class_j)
            temp_preds = preds_i.union(preds_j)
            
            n_j = preds_j.count()
            n = n_i+n_j
            # temp_preds: concatenate the score vectors for classes i and j
            temp_preds = DenseVector(temp_preds.select(y_score).rdd.map(lambda x: x[0]).collect())
            
            # temp_labels: binary labels for the two selected classes (0 for class i, 1 for class j)
            temp_labels = np.zeros((n), dtype=int)
            temp_labels[n_i:n] = 1
            
            # v: difference of rows i and j of the partition matrix A
            v = A[class_i, :] - A[class_j, :]
            score = temp_preds.dot(v)
            df = np.column_stack([score, temp_labels])
            concat_df = map(lambda x: (float(x[0]), float(x[1])), df)
            auc_mu_df = spark.createDataFrame(concat_df, schema=["score", "temp_label"])
            
            evaluator = BinaryClassificationEvaluator(labelCol="temp_label",
                                                      rawPredictionCol="score",
                                                      metricName='areaUnderROC')
            score_i_j = evaluator.evaluate(auc_mu_df)
            auc_total += W[class_i, class_j] * score_i_j

    return auc_total
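A hypothetical invocation sketch follows; `model`, `test_df`, and the default pyspark.ml column names "label" and "probability" are assumptions for illustration, not part of the function above:

# Hypothetical usage, assuming a fitted pyspark.ml classifier `model`
# and a held-out dataframe `test_df`:
# preds = model.transform(test_df)  # yields "label" and "probability" columns
# auc = pyspark_auc_mu(preds, "label", "probability")
# print auc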
Example #19
from pyspark.mllib.linalg import DenseVector

# In[29]:

# TODO: Replace <FILL IN> with appropriate code
numpyVector = np.array([-3, -4, 5])
print '\nnumpyVector:\n{0}'.format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
#myDenseVector = <FILL IN>
myDenseVector = DenseVector([3.0, 4.0, 5.0])

# Calculate the dot product between the two vectors.
#denseDotProduct = <FILL IN>
denseDotProduct = DenseVector.dot(myDenseVector, numpyVector)

print 'myDenseVector:\n{0}'.format(myDenseVector)
print '\ndenseDotProduct:\n{0}'.format(denseDotProduct)

# In[30]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector),
                'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])),
                'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0),
                'incorrect value for denseDotProduct')

# ### ** Part 4: Python lambda expressions **
zeros = np.zeros(8)  # returns an array of 8 0s: [ 0.  0.  0.  0.  0.  0.  0.  0.]
ones = np.ones(8)  # returns an array of 8 1s: [ 1.  1.  1.  1.  1.  1.  1.  1.]
print 'zeros:\n{0}'.format(zeros)
print '\nones:\n{0}'.format(ones)

zerosThenOnes = np.hstack((zeros, ones))  # note the "((" -- hstack takes a tuple of arrays
# hstack will return [ 0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.  1.  1.  1.  1.]
zerosAboveOnes = np.vstack((zeros, ones))  # a 2 by 8 array
# vstack in the above example will return [[ 0.  0.  0.  0.  0.  0.  0.  0.]
#                                          [ 1.  1.  1.  1.  1.  1.  1.  1.]]

print '\nzerosThenOnes:\n{0}'.format(zerosThenOnes)
print '\nzerosAboveOnes:\n{0}'.format(zerosAboveOnes)

# When using PySpark, we use DenseVector instead of a NumPy array. Example below:

from pyspark.mllib.linalg import DenseVector

numpyVector = np.array([-3, -4, 5])
print '\nnumpyVector:\n{0}'.format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector([3.0, 4.0, 5.0])
# Calculate the dot product between the two vectors.
denseDotProduct = DenseVector.dot(myDenseVector, numpyVector)  # DenseVector.dot() computes the dot product

print 'myDenseVector:\n{0}'.format(myDenseVector)
print '\ndenseDotProduct:\n{0}'.format(denseDotProduct)

Example #21
# In[97]:

from pyspark.mllib.linalg import DenseVector


# In[98]:

# TODO: Replace <FILL IN> with appropriate code
numpyVector = np.array([-3, -4, 5])
print '\nnumpyVector:\n{0}'.format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector(np.array([3.0, 4.0, 5.0]))
# Calculate the dot product between the two vectors.
denseDotProduct = DenseVector.dot(DenseVector(numpyVector), myDenseVector)

print 'myDenseVector:\n{0}'.format(myDenseVector)
print '\ndenseDotProduct:\n{0}'.format(denseDotProduct)


# In[99]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), 'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])),
                'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0), 'incorrect value for denseDotProduct')


# ### ** Part 4: Python lambda expressions **
Example #22
PySpark provides a DenseVector class for efficiently storing and operating on dense vectors of values.
DenseVector is used to store arrays of values for use in PySpark. DenseVector actually stores its values in a NumPy array and delegates calculations to that object.
You can create a new DenseVector by calling DenseVector() with a NumPy array or a Python list.

from pyspark.mllib.linalg import DenseVector

# Create a numpy array 
numpyVector = np.array([-3, -4, 5])
print '\nnumpyVector:\n{0}'.format(numpyVector)

# Create a DenseVector 
myDenseVector = DenseVector(np.array([3.0, 4.0, 5.0]))
# Calculate the dot product between the two vectors.
# numpyVector is a NumPy array; it is wrapped in a DenseVector for the call.
denseDotProduct = DenseVector.dot(DenseVector(numpyVector), myDenseVector)

print 'myDenseVector:\n{0}'.format(myDenseVector)
print '\ndenseDotProduct:\n{0}'.format(denseDotProduct)

#################  Intro to Pandas  ###########################
###                                                         ###
###               Introduction to Pandas                    ###
###                                                         ###
###############################################################


import numpy as np
import pandas as pd