Example #1
 def setUp(self):
     Test.setUp(self)
     self.recorder = FakeRecorder()
     self.uploader = FakeUploader()
     self.confirmator = FakeConfirmator()
     self.real_subprocess_call = subprocess.call
     subprocess.call = lambda *args: None
Example #2
# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code

# Create a new DataFrame with the features from irisDF and with labels that are zero-indexed (just subtract one).
# Also make sure your label column is still called label.
from pyspark.sql.functions import col

irisDFZeroIndex = irisDF.<FILL IN>
display(irisDFZeroIndex)

# COMMAND ----------

# TEST
from test_helper import Test
Test.assertEquals(irisDFZeroIndex.select('label').map(lambda r: r[0]).take(3), [0, 0, 0],
                  'incorrect value for irisDFZeroIndex')

# COMMAND ----------

# MAGIC %md
# MAGIC You'll also notice that we have four values for features and that those values are stored as a `SparseVector`.  We'll reduce those down to two values (for visualization purposes) and convert them to a `DenseVector`.  To do that we'll need to create a `udf` and apply it to our dataset.  Here's a `udf` reference for [Python](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.udf) and for [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.UserDefinedFunction).
# MAGIC  
# MAGIC Note that you can call the `toArray` method on a `SparseVector` to obtain an array, and you can convert an array into a `DenseVector` using the `Vectors.dense` method.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code

from pyspark.sql.functions import udf
# Note that VectorUDT and MatrixUDT are found in linalg while other types are in sql.types
# VectorUDT should be the return type of the udf
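# A hedged sketch of one way this cell could be completed (not necessarily the
# official solution): keep only the first two feature values and return them as a
# DenseVector. The udf name firstTwoFeatures is illustrative; irisDFZeroIndex comes
# from the cell above and irisTwoFeatures is the name used later in this document.
from pyspark.mllib.linalg import Vectors, VectorUDT

firstTwoFeatures = udf(lambda sv: Vectors.dense(sv.toArray()[:2]), VectorUDT())
irisTwoFeatures = irisDFZeroIndex.select(firstTwoFeatures('features').alias('features'), 'label')
display(irisTwoFeatures)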
# In[1]:

# TODO: Replace <FILL IN> with appropriate code
# Manually calculate your answer and represent the vector as a list of integer values.
# For example, [2, 4, 8].
x = [3, -6, 0]
y = [4, 8, 16]


# In[2]:

# TEST Scalar multiplication: vectors (1a)
# Import test library
from test_helper import Test
Test.assertEqualsHashed(x, 'e460f5b87531a2b60e0f55c31b2e49914f779981',
                        'incorrect value for vector x')
Test.assertEqualsHashed(y, 'e2d37ff11427dbac7f833a5a7039c0de5a740b1e',
                        'incorrect value for vector y')


# #### ** (1b) Element-wise multiplication: vectors **
# #### In this exercise, you will calculate the element-wise multiplication of two vectors by hand and enter the result in the code cell below.  You'll later see that element-wise multiplication is the default method when two NumPy arrays are multiplied together.  Note that we won't be performing element-wise multiplication in future labs, but we are introducing it here to distinguish it from other vector operators and because it is a common operation in NumPy, as we will discuss in Part (2b).
# #### The element-wise calculation is as follows: $$ \mathbf{x} \odot \mathbf{y} =  \begin{bmatrix} x_1 y_1 \\\  x_2 y_2 \\\ \vdots \\\ x_n y_n \end{bmatrix} $$
# #### Calculate the value of $ \mathbf{z} $: $$ \mathbf{z} = \begin{bmatrix} 1 \\\  2 \\\ 3 \end{bmatrix} \odot \begin{bmatrix} 4 \\\  5 \\\ 6 \end{bmatrix} $$

# In[3]:

# TODO: Replace <FILL IN> with appropriate code
# Manually calculate your answer and represent the vector as a list of integer values.
z = [4, 10, 18]
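
# Quick sanity check (assuming NumPy is available as np): element-wise
# multiplication is the default when two NumPy arrays are multiplied.
import numpy as np
print np.array([1, 2, 3]) * np.array([4, 5, 6])  # prints [ 4 10 18], matching z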
sampleDataRDD = sc.parallelize([sampleOne, sampleTwo, sampleThree])

sampleOHEDictManual = {}
sampleOHEDictManual[(0,'bear')] = 0
sampleOHEDictManual[(0,'cat')] = 1
sampleOHEDictManual[(0,'mouse')] = 2
sampleOHEDictManual[(1,'black')] = 3
sampleOHEDictManual[(1,'tabby')] = 4
sampleOHEDictManual[(2,'mouse')] = 5
sampleOHEDictManual[(2,'salmon')]= 6
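
# A hedged sketch of building an equivalent dictionary automatically (assuming the
# sample points above are lists of (featureID, category) tuples). zipWithIndex
# assigns indices in RDD order, so the numbering may differ from the manual dict
# while still being a valid one-hot-encoding dictionary.
sampleOHEDictAuto = (sampleDataRDD
                     .flatMap(lambda point: point)
                     .distinct()
                     .zipWithIndex()
                     .collectAsMap())
print sampleOHEDictAuto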

# TEST One-hot-encoding
from test_helper import Test

Test.assertEqualsHashed(sampleOHEDictManual[(0,'bear')],
                        'b6589fc6ab0dc82cf12099d1c2d40ab994e8410c',
                        "incorrect value for sampleOHEDictManual[(0,'bear')]")
Test.assertEqualsHashed(sampleOHEDictManual[(0,'cat')],
                        '356a192b7913b04c54574d18c28d46e6395428ab',
                        "incorrect value for sampleOHEDictManual[(0,'cat')]")
Test.assertEqualsHashed(sampleOHEDictManual[(0,'mouse')],
                        'da4b9237bacccdf19c0760cab7aec4a8359010b0',
                        "incorrect value for sampleOHEDictManual[(0,'mouse')]")
Test.assertEqualsHashed(sampleOHEDictManual[(1,'black')],
                        '77de68daecd823babbb58edb1c8e14d7106e83bb',
                        "incorrect value for sampleOHEDictManual[(1,'black')]")
Test.assertEqualsHashed(sampleOHEDictManual[(1,'tabby')],
                        '1b6453892473a467d07372d45eb05abc2031647a',
                        "incorrect value for sampleOHEDictManual[(1,'tabby')]")
Test.assertEqualsHashed(sampleOHEDictManual[(2,'mouse')],
                        'ac3478d69a3c81fa62e60f5c3696165a4e5e6ac4',
Example #5
plt.scatter(dataCorrelated[:,0], dataCorrelated[:,1], s=14**2, c='#d6ebf2',
            edgecolors='#8cbfd0', alpha=0.75)
pass

correlatedData = sc.parallelize(dataCorrelated)

meanCorrelated = correlatedData.mean()
correlatedDataZeroMean = correlatedData.map(lambda x:np.subtract(x,meanCorrelated))

print meanCorrelated
print correlatedData.take(1)
print correlatedDataZeroMean.take(1)


from test_helper import Test
Test.assertTrue(np.allclose(meanCorrelated, [49.95739037, 49.97180477]),
                'incorrect value for meanCorrelated')
Test.assertTrue(np.allclose(correlatedDataZeroMean.take(1)[0], [-0.28561917, 0.10351492]),
                'incorrect value for correlatedDataZeroMean')

correlatedCov = correlatedDataZeroMean.map(lambda x: np.outer(x,x)).reduce(lambda x,y:x+y)/correlatedDataZeroMean.count()
print correlatedCov

covResult = [[ 0.99558386,  0.90148989], [0.90148989, 1.08607497]]
Test.assertTrue(np.allclose(covResult, correlatedCov), 'incorrect value for correlatedCov')


def estimateCovariance(data):
    meanData = data.mean()
    zeroMeanData = data.map(lambda x:np.subtract(x,meanData))
    correlatedMatrix = zeroMeanData.map(lambda x: np.outer(x,x)).reduce(lambda x,y:x+y)/zeroMeanData.count()
    return correlatedMatrix
Example #6
 def setUp(self):
     Test.setUp(self)
     self.real_time = time.time
     time.time = FakeClock([1, 3, 10]).time
# Remember to cast the value you extract from the Vector using float()
getElement = udf(lambda v, i: float(v[i]), DoubleType())

irisSeparateFeatures = (irisTwoFeatures
                        .withColumn('sepalLength', getElement('features', lit(0)))
                        .withColumn('sepalWidth', getElement('features', lit(1))))
display(irisSeparateFeatures)


# COMMAND ----------

# TEST
from test_helper import Test
firstRow = irisSeparateFeatures.select('sepalWidth', 'features').map(lambda r: (r[0], r[1])).first()
Test.assertEquals(firstRow[0], firstRow[1][1], 'incorrect definition for getElement')

# COMMAND ----------

# MAGIC %md
# MAGIC What about using `Column`'s `getItem` method?

# COMMAND ----------

from pyspark.sql.functions import col
from pyspark.sql.utils import AnalysisException

try:
    display(irisTwoFeatures.withColumn('sepalLength', col('features').getItem(0)))
except AnalysisException as e:
    print e
Example #8
# TODO: Replace <FILL IN> with appropriate code
from pyspark.ml.feature import StringIndexer

stringIndexer = (<FILL IN>
                 .<FILL IN>
                 .<FILL IN>)

indexerModel = stringIndexer.<FILL IN>
irisTrainIndexed = indexerModel.<FILL IN>
display(irisTrainIndexed)
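
# A hedged sketch of the usual StringIndexer workflow; the training DataFrame name
# (irisTrain) and the column names here are illustrative assumptions, not
# necessarily what this exercise expects.
from pyspark.ml.feature import StringIndexer

stringIndexerSketch = (StringIndexer()
                       .setInputCol('label')
                       .setOutputCol('indexed'))
indexerModelSketch = stringIndexerSketch.fit(irisTrain)      # fit learns the label-to-index mapping
display(indexerModelSketch.transform(irisTrain))             # transform adds the 'indexed' column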

# COMMAND ----------

# TEST
from test_helper import Test
Test.assertEquals(irisTrainIndexed.select('indexed').take(50)[-1][0], 2.0, 'incorrect values in indexed column')
Test.assertTrue(irisTrainIndexed.schema.fields[2].metadata != {}, 'indexed should have metadata')

# COMMAND ----------

# MAGIC %md
# MAGIC We've updated the metadata for the field.  Now we know that the field takes on three values and is nominal.

# COMMAND ----------

print irisTrainIndexed.schema.fields[1].metadata
print irisTrainIndexed.schema.fields[2].metadata

# COMMAND ----------

# MAGIC %md
# TODO: Replace <FILL IN> with appropriate code
from pyspark.ml.feature import StringIndexer

stringIndexer = (<FILL IN>
                 .<FILL IN>
                 .<FILL IN>)

indexerModel = stringIndexer.<FILL IN>
irisTrainIndexed = indexerModel.<FILL IN>
display(irisTrainIndexed)

# COMMAND ----------

# TEST
from test_helper import Test
Test.assertEquals(irisTrainIndexed.select('indexed').take(50)[-1][0], 2.0, 'incorrect values in indexed column')
Test.assertTrue(irisTrainIndexed.schema.fields[2].metadata != {}, 'indexed should have metadata')

# COMMAND ----------

# MAGIC %md
# MAGIC We've updated the metadata for the field.  Now we know that the field takes on three values and is nominal.

# COMMAND ----------

print irisTrainIndexed.schema.fields[1].metadata
print irisTrainIndexed.schema.fields[2].metadata

# COMMAND ----------

# MAGIC %md
Example #10
from pyspark.sql.types import DoubleType

# Remember to cast the value you extract from the Vector using float()
getElement = udf(<FILL IN>)

irisSeparateFeatures = (irisTwoFeatures
                        .withColumn('sepalLength', getElement('features', lit(0)))
                        .withColumn('sepalWidth', getElement('features', <FILL IN>)))
display(irisSeparateFeatures)

# COMMAND ----------

# TEST
from test_helper import Test
firstRow = irisSeparateFeatures.select('sepalWidth', 'features').map(lambda r: (r[0], r[1])).first()
Test.assertEquals(firstRow[0], firstRow[1][1], 'incorrect definition for getElement')

# COMMAND ----------

# MAGIC %md
# MAGIC What about using `Column`'s `getItem` method?

# COMMAND ----------

from pyspark.sql.functions import col
from pyspark.sql.utils import AnalysisException

try:
    display(irisTwoFeatures.withColumn('sepalLength', col('features').getItem(0)))
except AnalysisException as e:
    print e
Example #11
def capitalize(word):
    """Capitalize lowercase `words`.

    Args:
        word (str): A lowercase string.

    Returns:
        str: A string whose first letter is uppercase.
    """
    return word.capitalize()


print(capitalize('we'))

Test.assertEquals(capitalize('we'), 'We', "Capitalize")

# COMMAND ----------

# MAGIC %md Apply `capitalize` to the base RDD, using a [map()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.map) transformation that applies the `capitalize()` function to each element. Then call the [collect()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.collect) action to retrieve the values of the transformed RDD, and print them.

# COMMAND ----------

capital_RDD = words_RDD.map(capitalize)
local_result = capital_RDD.collect()
print(local_result)

Test.assertEqualsHashed(local_result,
                        'bd73c54004cc9655159aceb703bc14fe93369fb1',
                        'incorrect value for local_data')
Example #12
dfcrashes.createOrReplaceTempView(temp_table)

# COMMAND ----------


def checkanzrecord():
    return dfcrashes.count()


print(checkanzrecord())

# COMMAND ----------

# TEST whether the number of records read matches the expected record count
from test_helper import Test
Test.assertEquals(checkanzrecord(), 5784, 'incorrect Total Records')

# COMMAND ----------

# MAGIC %sql
# MAGIC
# MAGIC select * from crashestable
# MAGIC where date is not null
# MAGIC order by date asc, time asc
# MAGIC limit 1

# COMMAND ----------

# MAGIC %sql
# MAGIC
# MAGIC select * from crashestable
Example #13
assert shakespeareCount == 122395


# ### ** Part 2: Check class testing library **

# #### ** (2a) Compare with hash **

# In[3]:

# TEST Compare with hash (2a)
# Check our testing library/package
# This should print '1 test passed.' on two lines
from test_helper import Test

twelve = 12
Test.assertEquals(twelve, 12, 'twelve should equal 12')
Test.assertEqualsHashed(twelve, '7b52009b64fd0a2a49e6d8a939753077792b0554',
                        'twelve, once hashed, should equal the hashed value of 12')


# #### ** (2b) Compare lists **

# In[4]:

# TEST Compare lists (2b)
# This should print '1 test passed.'
unsortedList = [(5, 'b'), (5, 'a'), (4, 'c'), (3, 'a')]
Test.assertEquals(sorted(unsortedList), [(3, 'a'), (4, 'c'), (5, 'a'), (5, 'b')],
                  'unsortedList does not sort properly')

Example #14
    sc = SparkContext(master, "WordCount")
    shakespeareRDD = (sc.textFile('./shakespeare.txt',
                                  8).map(removePunctuation))

    print '\n'.join(
        shakespeareRDD.zipWithIndex()  # to (line, lineNum)
        .map(lambda (l, num): '{0}: {1}'.format(num, l))  # to 'lineNum: line'
        .take(15))

    shakespeareWordsRDD = shakespeareRDD.flatMap(lambda s: s.split(' '))
    shakespeareWordCount = shakespeareWordsRDD.count()
    print shakespeareWordsRDD.top(5)
    print shakespeareWordCount

    Test.assertTrue(
        shakespeareWordCount == 927631 or shakespeareWordCount == 928908,
        'incorrect value for shakespeareWordCount')
    Test.assertEquals(
        shakespeareWordsRDD.top(5),
        [u'zwaggerd', u'zounds', u'zounds', u'zounds', u'zounds'],
        'incorrect value for shakespeareWordsRDD')

    shakeWordsRDD = shakespeareWordsRDD.filter(lambda s: s != '')
    shakeWordCount = shakeWordsRDD.count()
    print shakeWordCount

    Test.assertEquals(shakeWordCount, 882996,
                      'incorrect value for shakeWordCount')

    top15WordsAndCounts = wordCount(shakeWordsRDD).takeOrdered(
        15, lambda (w, c): -c)
Example #15
print hashTrainData.take(1)

averageSparsityHash = computeSparsity(hashTrainData, numBucketsCTR, nTrain)
averageSparsityOHE = computeSparsity(OHETrainData, numCtrOHEFeats, nTrain)

print 'Average OHE Sparsity: {0:.10e}'.format(averageSparsityOHE)
print 'Average Hash Sparsity: {0:.10e}'.format(averageSparsityHash)

#------------------
# Test Code
#------------------

# TEST Loading and splitting the data (3a)
Test.assertTrue(
    all([
        rawTrainData.is_cached, rawValidationData.is_cached,
        rawTestData.is_cached
    ]), 'you must cache the split data')
Test.assertEquals(nTrain, 79911, 'incorrect value for nTrain')
Test.assertEquals(nVal, 10075, 'incorrect value for nVal')
Test.assertEquals(nTest, 10014, 'incorrect value for nTest')

# TEST Extract features (3b)
Test.assertEquals(numCategories[2][1], 855,
                  'incorrect implementation of parsePoint')
Test.assertEquals(numCategories[32][1], 4,
                  'incorrect implementation of parsePoint')

# TEST Create an OHE dictionary from the dataset (3c)
Test.assertEquals(numCtrOHEFeats, 233286,
                  'incorrect number of features in ctrOHEDict')
Example #16
 def setUp(self):
     Test.setUp(self)
     self.recorder = FakeRecorder()
     self.uploader = FakeUploader()
     self.confirmator = FakeConfirmator()
Example #17
File: 2.py Project: Mvrm/Spark
# One way of completing the function
def makePlural(word):
    return word + 's'

print makePlural('cat')


# In[8]:

# Load in the testing code and check to see if your answer is correct
# If incorrect it will report back '1 test failed' for each failed test
# Make sure to rerun any cell you change before trying the test again
from test_helper import Test
# TEST Pluralize and test (1b)
Test.assertEquals(makePlural('rat'), 'rats', 'incorrect result: makePlural does not add an s')


# #### ** (1c) Apply `makePlural` to the base RDD **
# #### Now pass each item in the base RDD into a [map()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.map) transformation that applies the `makePlural()` function to each element. And then call the [collect()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.collect) action to see the transformed RDD.

# In[9]:

# TODO: Replace <FILL IN> with appropriate code
wordsList = ['cat', 'elephant', 'rat', 'rat', 'cat']
wordsRDD = sc.parallelize(wordsList, 4)
def makePlural(word):
    return word + 's'
pluralRDD = wordsRDD.map(makePlural)
print pluralRDD.collect()
Example #18
def lab1_test_ex_1(dummy_plus_5):
    f_10 = dummy_plus_5.take(10)
    Test.assertEqualsHashed(
        f_10, '931f5cd168f083db2a230d92a73a83710fb4437f',
        'Incorrect RDD: [{}, ...]'.format(list_continious(f_10)),
        'Correct RDD: [{}, ...]'.format(list_continious(f_10)))
# In[7]:

def calcUserMeanRating(userRatingGroup):
    """ Calculate the average rating of a user
    """
    userID = userRatingGroup[0]
    ratingSum = 0.0
    ratingCnt = len(userRatingGroup[1])
    if ratingCnt == 0:
        return (userID, 0.0)
    for item in userRatingGroup[1]:
        ratingSum += item[1]
    return (userID, 1.0 * ratingSum / ratingCnt)

Test.assertEquals(calcUserMeanRating((123, [(1, 1), (2, 2), (3, 3)])), 
                  (123, 2.0), 'incorrect calcUserMeanRating()')


# In[8]:

def broadcastUserRatingAvg(sContext, uRRDDTrain):
    """ Broadcast the user average rating RDD
    """
    userRatingAvgList = uRRDDTrain.map(lambda x: calcUserMeanRating(x)).collect()
    userRatingAvgDict = {}
    for (user, avgscore) in userRatingAvgList:
        userRatingAvgDict[user] = avgscore
    uRatingAvgBC = sContext.broadcast(userRatingAvgDict)# broadcast
    return uRatingAvgBC

def predictUsingAvg(tup, avgDict):
Example #20
def lab2_test_ex_3(response_code_counts):
    Test.assertEqualsHashed(response_code_counts,
                            'e167b0ae562c9083c5ab35d9e5430583d9a2bc60',
                            'Incorrect RDD: {}'.format(response_code_counts),
                            'Correct RDD: {}'.format(response_code_counts))
Example #21
# TODO: Replace <FILL IN> with appropriate code
correlatedData = sc.parallelize(dataCorrelated)

meanCorrelated = <FILL IN>
correlatedDataZeroMean = correlatedData.<FILL IN>

print meanCorrelated
print correlatedData.take(1)
print correlatedDataZeroMean.take(1)

# COMMAND ----------

# TEST Interpreting PCA (1a)
from test_helper import Test
Test.assertTrue(np.allclose(meanCorrelated, [49.95739037, 49.97180477]),
                'incorrect value for meanCorrelated')
Test.assertTrue(np.allclose(correlatedDataZeroMean.take(1)[0], [-0.28561917, 0.10351492]),
                'incorrect value for correlatedDataZeroMean')

# COMMAND ----------

# MAGIC %md
# MAGIC 
# MAGIC **(1b) Sample covariance matrix**
# MAGIC 
# MAGIC We are now ready to compute the sample covariance matrix. If we define \\(\scriptsize \mathbf{X} \in \mathbb{R}^{n \times d}\\) as the zero mean data matrix, then the sample covariance matrix is defined as: \\[ \mathbf{C}_{\mathbf X} = \frac{1}{n} \mathbf{X}^\top \mathbf{X} \,.\\]  To compute this matrix, compute the outer product of each data point, add together these outer products, and divide by the number of data points. The data are two dimensional, so the resulting covariance matrix should be a 2x2 matrix.
# MAGIC  
# MAGIC 
# MAGIC Note that [np.outer()](http://docs.scipy.org/doc/numpy/reference/generated/numpy.outer.html) can be used to calculate the outer product of two NumPy arrays.

# COMMAND ----------
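
# A sketch of the computation described above, mirroring the estimateCovariance
# helper shown earlier in this document (assumes NumPy as np and the
# correlatedDataZeroMean RDD from the previous cell):
correlatedCov = (correlatedDataZeroMean
                 .map(lambda x: np.outer(x, x))
                 .reduce(lambda a, b: a + b) / correlatedDataZeroMean.count())
print correlatedCov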
Example #22
def lab2_test_ex_4(response_code_counts):
    Test.assertEqualsHashed(response_code_counts,
                            'a23154aaf42c5addb8365a2dcd8d682210a3957b',
                            'Incorrect RDD: {}'.format(response_code_counts),
                            'Correct RDD: {}'.format(response_code_counts))
Example #23
 def setUp(self):
     Test.setUp(self)
     self.real_stdin = sys.stdin
     sys.stdin = self.stdin = FakeStdin()
Example #24
def lab2_test_ex_6(top_10_hosts):
    Test.assertEqualsHashed(top_10_hosts,
                            'fe387f732f0c9dc663adca8937d39d7a23278e6d',
                            'Incorrect RDD: {}'.format(top_10_hosts),
                            'Correct RDD: {}'.format(top_10_hosts))
# One way of completing the function
def makePlural(word):
    return word + 's'

print makePlural('cat')


# In[4]:

# Load in the testing code and check to see if your answer is correct
# If incorrect it will report back '1 test failed' for each failed test
# Make sure to rerun any cell you change before trying the test again
from test_helper import Test
# TEST Pluralize and test (1b)
Test.assertEquals(makePlural('rat'), 'rats', 'incorrect result: makePlural does not add an s')


# #### ** (1c) Apply `makePlural` to the base RDD **
# #### Now pass each item in the base RDD into a [map()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.map) transformation that applies the `makePlural()` function to each element. And then call the [collect()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.collect) action to see the transformed RDD.

# In[7]:

# TODO: Replace <FILL IN> with appropriate code
pluralRDD = wordsRDD.map(makePlural)
print pluralRDD.collect()


# In[ ]:

# TEST Apply makePlural to the base RDD(1c)
Example #26
def lab2_test_ex_7(top_20_endpoints_404):
    Test.assertEqualsHashed(top_20_endpoints_404,
                            '768e38a3aa83ea0d5ced989c6af4e7df0968412e',
                            'Incorrect RDD: {}'.format(top_20_endpoints_404),
                            'Correct RDD: {}'.format(top_20_endpoints_404))
Example #27
File: Lab.py Project: smoltis/spark
def run_tests():
  Test.assertEquals(test_year(1945, df), [u'Mary', u'Linda', u'Barbara', u'Patricia', u'Carol'], 'incorrect top 5 names for 1945')
  Test.assertEquals(test_year(1970, df), [u'Jennifer', u'Lisa', u'Kimberly', u'Michelle', u'Amy'], 'incorrect top 5 names for 1970')
  Test.assertEquals(test_year(1987, df), [u'Jessica', u'Ashley', u'Amanda', u'Jennifer', u'Sarah'], 'incorrect top 5 names for 1987')
  Test.assertTrue(len(test_year(1945, df)) <= 5, 'list not limited to 5 names')
  Test.assertTrue(u'James' not in test_year(1945, df), 'male names not filtered')
  Test.assertTrue(test_year(1945, df) != [u'Linda', u'Linda', u'Linda', u'Linda', u'Mary'], 'year not filtered')
  Test.assertEqualsHashed(test_year(1880, df), "2038e2c0bb0b741797a47837c0f94dbf24123447", "incorrect top 5 names for 1880")
Example #28
def lab2_test_ex_8(top_20_endpoints_404):
    Test.assertEqualsHashed(top_20_endpoints_404,
                            '8699ae1164d67e0a8260e74baf973070d559c9ec',
                            'Incorrect RDD: {}'.format(top_20_endpoints_404),
                            'Correct RDD: {}'.format(top_20_endpoints_404))
Example #29
# COMMAND ----------

# MAGIC %md
# MAGIC Create a `DenseVector` with the values 1.5, 2.5, 3.0 (in that order).

# COMMAND ----------

# ANSWER
denseVec = Vectors.dense([1.5, 2.5, 3.0])

# COMMAND ----------

# TEST
from test_helper import Test
Test.assertEquals(denseVec, DenseVector([1.5, 2.5, 3.0]), 'incorrect value for denseVec')

# COMMAND ----------

# MAGIC %md
# MAGIC Create a `LabeledPoint` with a label equal to 10.0 and features equal to `denseVec`

# COMMAND ----------

# ANSWER
labeledP = LabeledPoint(10.0, denseVec)

# COMMAND ----------

# TEST
Test.assertEquals(str(labeledP), '(10.0,[1.5,2.5,3.0])', 'incorrect value for labeledP')
Example #30
def lab2_test_ex_9(top_11_hosts_404):
    Test.assertEqualsHashed(top_11_hosts_404,
                            'b68f0611777ca20cd48360d6296e544ad39599cb',
                            'Incorrect RDD: {}'.format(top_11_hosts_404),
                            'Correct RDD: {}'.format(top_11_hosts_404))
Example #31
assert shakespeareCount == 122395


# ### ** Part 2: Check class testing library **

# #### ** (2a) Compare with hash **

# In[ ]:

# TEST Compare with hash (2a)
# Check our testing library/package
# This should print '1 test passed.' on two lines
from test_helper import Test

twelve = 12
Test.assertEquals(twelve, 12, "twelve should equal 12")
Test.assertEqualsHashed(
    twelve, "7b52009b64fd0a2a49e6d8a939753077792b0554", "twelve, once hashed, should equal the hashed value of 12"
)


# #### ** (2b) Compare lists **

# In[ ]:

# TEST Compare lists (2b)
# This should print '1 test passed.'
unsortedList = [(5, "b"), (5, "a"), (4, "c"), (3, "a")]
Test.assertEquals(sorted(unsortedList), [(3, "a"), (4, "c"), (5, "a"), (5, "b")], "unsortedList does not sort properly")

Example #32
def lab2_test_ex_10(top_11_hosts_404):
    Test.assertEqualsHashed(top_11_hosts_404,
                            'a2a32f8d91d217a2a82a59276448f4162d72b2cc',
                            'Incorrect RDD: {}'.format(top_11_hosts_404),
                            'Correct RDD: {}'.format(top_11_hosts_404))
Example #33
# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code

# Create a new DataFrame with the features from irisDF and with labels that are zero-indexed (just subtract one).
# Also make sure your label column is still called label.
from pyspark.sql.functions import col

irisDFZeroIndex = irisDF.<FILL IN>
display(irisDFZeroIndex)

# COMMAND ----------

# TEST
from test_helper import Test
Test.assertEquals(irisDFZeroIndex.select('label').map(lambda r: r[0]).take(3), [0, 0, 0],
                  'incorrect value for irisDFZeroIndex')

# COMMAND ----------

# MAGIC %md
# MAGIC You'll also notice that we have four values for features and that those values are stored as a `SparseVector`.  We'll reduce those down to two values (for visualization purposes) and convert them to a `DenseVector`.  To do that we'll need to create a `udf` and apply it to our dataset.  Here's a `udf` reference for [Python](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.udf) and for [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.UserDefinedFunction).
# MAGIC  
# MAGIC Note that you can call the `toArray` method on a `SparseVector` to obtain an array, and you can convert an array into a `DenseVector` using the `Vectors.dense` method.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code

from pyspark.sql.functions import udf
# Note that VectorUDT and MatrixUDT are found in linalg while other types are in sql.types
# VectorUDT should be the return type of the udf
Example #34
def lab3_test_ex_4_2(sample_prediction):
    Test.assertEqualsHashed(
        abs(sample_prediction - 1.89839708175) < 1,
        '88b33e4e12f75ac8bf792aebde41f1a090f3a612',
        'Incorrect prediction: {}'.format(sample_prediction),
        'Prediction: {}'.format(sample_prediction))
Example #35
File: II.py Project: Inscrutive/spark
# Remember to cast the value you extract from the Vector using float()
getElement = udf(lambda v, i: float(v[i]), DoubleType())

irisSeparateFeatures = (irisTwoFeatures
                        .withColumn('sepalLength', getElement('features', lit(0)))
                        .withColumn('sepalWidth', getElement('features', lit(1))))
display(irisSeparateFeatures)


# COMMAND ----------

# TEST
from test_helper import Test
firstRow = irisSeparateFeatures.select('sepalWidth', 'features').map(lambda r: (r[0], r[1])).first()
Test.assertEquals(firstRow[0], firstRow[1][1], 'incorrect definition for getElement')

# COMMAND ----------

# MAGIC %md
# MAGIC What about using `Column`'s `getItem` method?

# COMMAND ----------

from pyspark.sql.functions import col

display(irisTwoFeatures.withColumn('sepalLength', col('features').getItem(0)))

# COMMAND ----------

# MAGIC %md
Example #36
def lab3_test_ex_6_1(parsed_intervals):
    Test.assertEqualsHashed(
        parsed_intervals.map(lambda lp: lp.label).sum(),
        '0b1de9cebbe9b7a3e42cf4e76d56df9e2002dc9b',
        'Incorrect parsed_intervals: {}, ...'.format(parsed_intervals.first()),
        'Correct parsed_intervals: {}, ...'.format(parsed_intervals.first()))
Example #37
File: B.py Project: Inscrutive/spark
# MAGIC  
# MAGIC The resulting `DataFrame` should have two columns: one named `features` and another named `label`.

# COMMAND ----------

# ANSWER
from pyspark.sql.functions import col

irisDFZeroIndex = irisDF.select('features', (col('label') - 1).alias('label'))
display(irisDFZeroIndex)

# COMMAND ----------

# TEST
from test_helper import Test
Test.assertEquals(irisDFZeroIndex.select('label').map(lambda r: r[0]).take(3), [0, 0, 0],
                  'incorrect value for irisDFZeroIndex')

# COMMAND ----------

# MAGIC %md
# MAGIC You'll also notice that we have four values for features and that those values are stored as a `SparseVector`.  We'll reduce those down to two values (for visualization purposes) and convert them to a `DenseVector`.  To do that we'll need to create a `udf` and apply it to our dataset.  Here's a `udf` reference for [Python](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.udf) and for [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.UserDefinedFunction).
# MAGIC  
# MAGIC Note that you can call the `toArray` method on a `SparseVector` to obtain an array, and you can convert an array into a `DenseVector` using the `Vectors.dense` method.

# COMMAND ----------

# ANSWER
from pyspark.sql.functions import udf
# Note that VectorUDT and MatrixUDT are found in linalg while other types are in sql.types
# VectorUDT should be the return type of the udf
from pyspark.mllib.linalg import Vectors, VectorUDT
Example #38
def lab3_test_ex_6_3(test_accuracy):
    Test.assertEqualsHashed(test_accuracy >= 0.5,
                            '88b33e4e12f75ac8bf792aebde41f1a090f3a612',
                            'Incorrect DecisionTree model prediction',
                            'Correct DecisionTree model prediction')
Example #39
# MAGIC %md
# MAGIC First, create a `DataFrame` named sized that has a `size` column with the size of each array of words.  Here you can use `func.size`.

# COMMAND ----------

# ANSWER
sized = noStopWords.withColumn('size', func.size('words'))

sizedFirst = sized.select('size', 'words').first()
print sizedFirst[0]

# COMMAND ----------

# TEST
from test_helper import Test
Test.assertEquals(sizedFirst[0], len(sizedFirst[1]), 'incorrect implementation for sized')

# COMMAND ----------

# MAGIC %md
# MAGIC Next, you'll need to aggregate the counts.  You can do this using `func.sum` in either a `.select` or `.agg` method call on the `DataFrame`.  Make sure to give your `Column` the alias `numberOfWords`.  There are some examples in [Python](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.GroupedData.agg) and [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.DataFrame) in the APIs.

# COMMAND ----------

# ANSWER
numberOfWords = sized.agg(func.sum('size').alias('numberOfWords'))

wordCount = numberOfWords.first()[0]
print wordCount

# COMMAND ----------
Example #40
def lab3_test_ex_6_4(test_accuracy):
    Test.assertEqualsHashed(test_accuracy >= 0.6,
                            '88b33e4e12f75ac8bf792aebde41f1a090f3a612',
                            'Incorrect RandomForest model prediction',
                            'Correct RandomForest model prediction')
# ** Interpreting PCA **

correlatedData = sc.parallelize(dataCorrelated)

meanCorrelated = correlatedData.mean()
correlatedDataZeroMean = correlatedData.map(lambda x: x - meanCorrelated)

print meanCorrelated
print correlatedData.take(1)
print correlatedDataZeroMean.take(1)

# TEST Interpreting PCA
from test_helper import Test

Test.assertTrue(np.allclose(meanCorrelated, [49.95739037, 49.97180477]), "incorrect value for meanCorrelated")
Test.assertTrue(
    np.allclose(correlatedDataZeroMean.take(1)[0], [-0.28561917, 0.10351492]),
    "incorrect value for correlatedDataZeroMean",
)


# **Sample covariance matrix**

correlatedCov = correlatedDataZeroMean.map(lambda x: np.outer(x, x)).mean()
print correlatedCov

# TEST Sample covariance matrix
covResult = [[0.99558386, 0.90148989], [0.90148989, 1.08607497]]
Test.assertTrue(np.allclose(covResult, correlatedCov), "incorrect value for correlatedCov")
Example #42
def lab3_test_ex_7_4(test_accuracy_lr):
    Test.assertEqualsHashed(test_accuracy_lr >= 0.75,
                            '88b33e4e12f75ac8bf792aebde41f1a090f3a612',
                            'Incorrect LogisticRegression model prediction',
                            'Correct LogisticRegression model prediction')
def run_tests():
  Test.assertEquals(test_year(1945, df), [u'Mary', u'Linda', u'Barbara', u'Patricia', u'Carol'], 'incorrect top 5 names for 1945')
Example #44
def lab3_test_ex_7_6(test_accuracy_gb):
    Test.assertEqualsHashed(test_accuracy_gb >= 0.85,
                            '88b33e4e12f75ac8bf792aebde41f1a090f3a612',
                            'Incorrect GradientBoosting model prediction',
                            'Correct GradientBoosting model prediction')
Example #45
File: V.py Project: Inscrutive/spark
# Remember to cast the value you extract from the Vector using float()
getElement = udf(lambda v, i: float(v[i]), DoubleType())

irisSeparateFeatures = (irisTwoFeatures
                        .withColumn('sepalLength', getElement('features', lit(0)))
                        .withColumn('sepalWidth', getElement('features', lit(1))))
display(irisSeparateFeatures)


# COMMAND ----------

# TEST
from test_helper import Test
firstRow = irisSeparateFeatures.select('sepalWidth', 'features').map(lambda r: (r[0], r[1])).first()
Test.assertEquals(firstRow[0], firstRow[1][1], 'incorrect definition for getElement')

# COMMAND ----------

# MAGIC %md
# MAGIC What about using `Column`'s `getItem` method?

# COMMAND ----------

from pyspark.sql.functions import col

display(irisTwoFeatures.withColumn('sepalLength', col('features').getItem(0)))

# COMMAND ----------

# MAGIC %md
Example #46
def run_tests():
    Test.assertEquals(test_year(1945, df),
                      [u'Mary', u'Linda', u'Barbara', u'Patricia', u'Carol'],
                      'incorrect top 5 names for 1945')
    Test.assertEquals(test_year(1970, df),
                      [u'Jennifer', u'Lisa', u'Kimberly', u'Michelle', u'Amy'],
                      'incorrect top 5 names for 1970')
    Test.assertEquals(test_year(
        1987, df), [u'Jessica', u'Ashley', u'Amanda', u'Jennifer', u'Sarah'],
                      'incorrect top 5 names for 1987')
    Test.assertTrue(
        len(test_year(1945, df)) <= 5, 'list not limited to 5 names')
    Test.assertTrue(u'James' not in test_year(1945, df),
                    'male names not filtered')
    Test.assertTrue(
        test_year(1945, df) !=
        [u'Linda', u'Linda', u'Linda', u'Linda', u'Mary'], 'year not filtered')
    Test.assertEqualsHashed(test_year(1880, df),
                            "2038e2c0bb0b741797a47837c0f94dbf24123447",
                            "incorrect top 5 names for 1880")
Example #47
 def tearDown(self):
     Test.tearDown(self)
     sys.stdin = self.real_stdin
Example #48
# COMMAND ----------

# MAGIC %md
# MAGIC Create a `DenseVector` with the values 1.5, 2.5, 3.0 (in that order).

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
denseVec = <FILL IN>

# COMMAND ----------

# TEST
from test_helper import Test
Test.assertEquals(denseVec, DenseVector([1.5, 2.5, 3.0]), 'incorrect value for denseVec')

# COMMAND ----------

# MAGIC %md
# MAGIC Create a `LabeledPoint` with a label equal to 10.0 and features equal to `denseVec`

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
labeledP = <FILL IN>

# COMMAND ----------

# TEST
Test.assertEquals(str(labeledP), '(10.0,[1.5,2.5,3.0])', 'incorrect value for labeledP')
def makePlural(word):
    return word + "s"


print makePlural("cat")


# In[5]:

# Load in the testing code and check to see if your answer is correct
# If incorrect it will report back '1 test failed' for each failed test
# Make sure to rerun any cell you change before trying the test again
from test_helper import Test

# TEST Pluralize and test (1b)
Test.assertEquals(makePlural("rat"), "rats", "incorrect result: makePlural does not add an s")


# #### ** (1c) Apply `makePlural` to the base RDD **
# #### Now pass each item in the base RDD into a [map()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.map) transformation that applies the `makePlural()` function to each element. And then call the [collect()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.collect) action to see the transformed RDD.

# In[6]:

# TODO: Replace <FILL IN> with appropriate code
pluralRDD = wordsRDD.map(makePlural)
print pluralRDD.collect()


# In[7]:

# TEST Apply makePlural to the base RDD(1c)
Example #50
numPartitions = 2
rawData = sc.textFile(fileName, numPartitions)

# In[110]:

# TODO: Replace <FILL IN> with appropriate code
numPoints = rawData.count()
print numPoints
samplePoints = rawData.take(5)
print samplePoints

# In[111]:

# TEST Load and check the data (1a)
Test.assertEquals(numPoints, 6724, 'incorrect value for numPoints')
Test.assertEquals(len(samplePoints), 5, 'incorrect length for samplePoints')

# #### ** (1b) Using `LabeledPoint` **
# #### In MLlib, labeled training instances are stored using the [LabeledPoint](https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.regression.LabeledPoint) object.  Write the parsePoint function that takes as input a raw data point, parses it using Python's [unicode.split](https://docs.python.org/2/library/string.html#string.split) method, and returns a `LabeledPoint`.  Use this function to parse samplePoints (from the previous question).  Then print out the features and label for the first training point, using the `LabeledPoint.features` and `LabeledPoint.label` attributes. Finally, calculate the number of features for this dataset.
# #### Note that `split()` can be called directly on a `unicode` or `str` object.  For example, `u'split,me'.split(',')` returns `[u'split', u'me']`.

# In[112]:

from pyspark.mllib.regression import LabeledPoint
import numpy as np

# Here is a sample raw data point:
# '2001.0,0.884,0.610,0.600,0.474,0.247,0.357,0.344,0.33,0.600,0.425,0.60,0.419'
# In this raw data point, 2001.0 is the label, and the remaining values are features
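
# A hedged sketch of the parsePoint function described above (the official lab
# solution may differ in detail): the first comma-separated value is the label and
# the remaining values are the features.
def parsePoint(line):
    """Converts a comma-separated unicode string into a LabeledPoint."""
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[0], values[1:])

parsedSamplePoints = [parsePoint(p) for p in samplePoints]
firstPoint = parsedSamplePoints[0]
print firstPoint.features, firstPoint.label
print 'number of features:', len(firstPoint.features)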
assert shakespeareCount == 122395


# ### ** Part 2: Check class testing library **

# #### ** (2a) Compare with hash **

# In[ ]:

# TEST Compare with hash (2a)
# Check our testing library/package
# This should print '1 test passed.' on two lines
from test_helper import Test

twelve = 12
Test.assertEquals(twelve, 12, 'twelve should equal 12')
Test.assertEqualsHashed(twelve, '7b52009b64fd0a2a49e6d8a939753077792b0554',
                        'twelve, once hashed, should equal the hashed value of 12')


# #### ** (2b) Compare lists **

# In[ ]:

# TEST Compare lists (2b)
# This should print '1 test passed.'
unsortedList = [(5, 'b'), (5, 'a'), (4, 'c'), (3, 'a')]
Test.assertEquals(sorted(unsortedList), [(3, 'a'), (4, 'c'), (5, 'a'), (5, 'b')],
                  'unsortedList does not sort properly')

Example #52
# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code

# Create a new DataFrame with the features from irisDF and with labels that are zero-indexed (just subtract one).
# Also make sure your label column is still called label.
from pyspark.sql.functions import col

irisDFZeroIndex = irisDF.<FILL IN>
display(irisDFZeroIndex)

# COMMAND ----------

# TEST
from test_helper import Test
Test.assertEquals(irisDFZeroIndex.select('label').map(lambda r: r[0]).take(3), [0, 0, 0],
                  'incorrect value for irisDFZeroIndex')

# COMMAND ----------

# MAGIC %md
# MAGIC You'll also notice that we have four values for features and that those values are stored as a `SparseVector`.  We'll reduce those down to two values (for visualization purposes) and convert them to a `DenseVector`.  To do that we'll need to create a `udf` and apply it to our dataset.  Here's a `udf` reference for [Python](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.udf) and for [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.UserDefinedFunction).
# MAGIC  
# MAGIC Note that you can call the `toArray` method on a `SparseVector` to obtain an array, and you can convert an array into a `DenseVector` using the `Vectors.dense` method.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code

from pyspark.sql.functions import udf
# Note that VectorUDT and MatrixUDT are found in linalg while other types are in sql.types
# VectorUDT should be the return type of the udf
def simpleTokenize(string):
    """ A simple implementation of input string tokenization
    Args:
        string (str): input string
    Returns:
        list: a list of tokens
    """
    
    return re.sub('[^a-zA-Z0-9\s_]+', ' ', string).lower().split()

print simpleTokenize(quickbrownfox) # Should give ['a', 'quick', 'brown', ... ]

# TEST Tokenize a String (1a)
Test.assertEquals(simpleTokenize(quickbrownfox),
                  ['a','quick','brown','fox','jumps','over','the','lazy','dog'],
                  'simpleTokenize should handle sample text')
Test.assertEquals(simpleTokenize(' '), [], 'simpleTokenize should handle empty string')
Test.assertEquals(simpleTokenize('!!!!123A/456_B/789C.123A'), ['123a','456_b','789c','123a'],
                  'simpleTokenize should handle punctuation and lowercase the result')
Test.assertEquals(simpleTokenize('fox fox'), ['fox', 'fox'],
                  'simpleTokenize should not remove duplicates')

# TODO: Replace <FILL IN> with appropriate code
stopfile = os.path.join(baseDir, inputPath, STOPWORDS_PATH)
stopwords = set(sc.textFile(stopfile).collect())
print 'These are the stopwords: %s' % stopwords

def tokenize(string):
    """ An implementation of input string tokenization that excludes stopwords
    Args:
Example #54
# #### Calculate the value of $ \mathbf{y} $: $$ \mathbf{y} = 2 \begin{bmatrix} 2 \\\ 4 \\\ 8 \end{bmatrix} $$

# In[2]:

# TODO: Replace <FILL IN> with appropriate code
# Manually calculate your answer and represent the vector as a list of integer values.
# For example, [2, 4, 8].
x = [3, -6, 0]
y = [4, 8, 16]

# In[3]:

# TEST Scalar multiplication: vectors (1a)
# Import test library
from test_helper import Test
Test.assertEqualsHashed(x, 'e460f5b87531a2b60e0f55c31b2e49914f779981',
                        'incorrect value for vector x')
Test.assertEqualsHashed(y, 'e2d37ff11427dbac7f833a5a7039c0de5a740b1e',
                        'incorrect value for vector y')

# #### ** (1b) Element-wise multiplication: vectors **
# #### In this exercise, you will calculate the element-wise multiplication of two vectors by hand and enter the result in the code cell below.  You'll later see that element-wise multiplication is the default method when two NumPy arrays are multiplied together.  Note that we won't be performing element-wise multiplication in future labs, but we are introducing it here to distinguish it from other vector operators and because it is a common operation in NumPy, as we will discuss in Part (2b).
# #### The element-wise calculation is as follows: $$ \mathbf{x} \odot \mathbf{y} =  \begin{bmatrix} x_1 y_1 \\\  x_2 y_2 \\\ \vdots \\\ x_n y_n \end{bmatrix} $$
# #### Calculate the value of $ \mathbf{z} $: $$ \mathbf{z} = \begin{bmatrix} 1 \\\  2 \\\ 3 \end{bmatrix} \odot \begin{bmatrix} 4 \\\  5 \\\ 6 \end{bmatrix} $$

# In[4]:

# TODO: Replace <FILL IN> with appropriate code
# Manually calculate your answer and represent the vector as a list of integer values.
z = [4, 10, 18]

# In[5]:
rawData = sc.textFile(fileName, numPartitions)


# In[3]:

# TODO: Replace <FILL IN> with appropriate code
numPoints = rawData.count()
print numPoints
samplePoints = rawData.take(5)
print samplePoints


# In[4]:

# TEST Load and check the data (1a)
Test.assertEquals(numPoints, 6724, 'incorrect value for numPoints')
Test.assertEquals(len(samplePoints), 5, 'incorrect length for samplePoints')


# #### ** (1b) Using `LabeledPoint` **
# #### In MLlib, labeled training instances are stored using the [LabeledPoint](https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.regression.LabeledPoint) object.  Write the parsePoint function that takes as input a raw data point, parses it using Python's [unicode.split](https://docs.python.org/2/library/string.html#string.split) method, and returns a `LabeledPoint`.  Use this function to parse samplePoints (from the previous question).  Then print out the features and label for the first training point, using the `LabeledPoint.features` and `LabeledPoint.label` attributes. Finally, calculate the number of features for this dataset.
# #### Note that `split()` can be called directly on a `unicode` or `str` object.  For example, `u'split,me'.split(',')` returns `[u'split', u'me']`.

# In[5]:

from pyspark.mllib.regression import LabeledPoint
import numpy as np

# Here is a sample raw data point:
# '2001.0,0.884,0.610,0.600,0.474,0.247,0.357,0.344,0.33,0.600,0.425,0.60,0.419'
# In this raw data point, 2001.0 is the label, and the remaining values are features
Example #56
        IDandRatingsTuple: a single tuple of (MovieID, (Rating1, Rating2, Rating3, ...))
    Returns:
        tuple: a tuple of (MovieID, (number of ratings, averageRating))
    """
    id = IDandRatingsTuple[0]
    ratings = IDandRatingsTuple[1]
    num = len(ratings)
    avg = float(sum(ratings)) / len(ratings)
    return (id, (num, avg))


# In[14]:

# TEST Number of Ratings and Average Ratings for a Movie (1a)

Test.assertEquals(getCountsAndAverages((1, (1, 2, 3, 4))), (1, (4, 2.5)),
                  'incorrect getCountsAndAverages() with integer list')
Test.assertEquals(getCountsAndAverages(
    (100, (10.0, 20.0, 30.0))), (100, (3, 20.0)),
                  'incorrect getCountsAndAverages() with float list')
Test.assertEquals(getCountsAndAverages((110, xrange(20))), (110, (20, 9.5)),
                  'incorrect getCountsAndAverages() with xrange')

# #### **(1b) Movies with Highest Average Ratings**
# #### Now that we have a way to calculate the average ratings, we will use the `getCountsAndAverages()` helper function with Spark to determine movies with highest average ratings.
# #### The steps you should perform are:
# * #### Recall that the `ratingsRDD` contains tuples of the form (UserID, MovieID, Rating). From `ratingsRDD` create an RDD with tuples of the form (MovieID, Python iterable of Ratings for that MovieID). This transformation will yield an RDD of the form: `[(1, <pyspark.resultiterable.ResultIterable object at 0x7f16d50e7c90>), (2, <pyspark.resultiterable.ResultIterable object at 0x7f16d50e79d0>), (3, <pyspark.resultiterable.ResultIterable object at 0x7f16d50e7610>)]`. Note that you will only need to perform two Spark transformations to do this step.
# * #### Using `movieIDsWithRatingsRDD` and your `getCountsAndAverages()` helper function, compute the number of ratings and average rating for each movie to yield tuples of the form (MovieID, (number of ratings, average rating)). This transformation will yield an RDD of the form: `[(1, (993, 4.145015105740181)), (2, (332, 3.174698795180723)), (3, (299, 3.0468227424749164))]`. You can do this step with one Spark transformation
# * #### We want to see movie names, instead of movie IDs. To `moviesRDD`, apply RDD transformations that use `movieIDsWithAvgRatingsRDD` to get the movie names for `movieIDsWithAvgRatingsRDD`, yielding tuples of the form (average rating, movie name, number of ratings). This set of transformations will yield an RDD of the form: `[(1.0, u'Autopsy (Macchie Solari) (1975)', 1), (1.0, u'Better Living (1998)', 1), (1.0, u'Big Squeeze, The (1996)', 3)]`. You will need to do two Spark transformations to complete this step: first use the `moviesRDD` with `movieIDsWithAvgRatingsRDD` to create a new RDD with Movie names matched to Movie IDs, then convert that RDD into the form of (average rating, movie name, number of ratings). These transformations will yield an RDD that looks like: `[(3.6818181818181817, u'Happiest Millionaire, The (1967)', 22), (3.0468227424749164, u'Grumpier Old Men (1995)', 299), (2.882978723404255, u'Hocus Pocus (1993)', 94)]`
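
# A hedged sketch of the three steps described above. The RDD names follow the
# text; it assumes ratingsRDD holds (UserID, MovieID, Rating) tuples and moviesRDD
# holds (MovieID, movie name) pairs, and the exact lab solution may differ.
movieIDsWithRatingsRDD = (ratingsRDD
                          .map(lambda (userID, movieID, rating): (movieID, rating))
                          .groupByKey())
movieIDsWithAvgRatingsRDD = movieIDsWithRatingsRDD.map(getCountsAndAverages)
movieNameWithAvgRatingsRDD = (moviesRDD
                              .join(movieIDsWithAvgRatingsRDD)
                              .map(lambda (movieID, (name, (num, avg))): (avg, name, num)))
print movieNameWithAvgRatingsRDD.take(3)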

# In[16]:
    Args:
        IDandRatingsTuple: a single tuple of (MovieID, (Rating1, Rating2, Rating3, ...))
    Returns:
        tuple: a tuple of (MovieID, (number of ratings, averageRating))
    """
    movieID = IDandRatingsTuple[0]
    numRating = len(IDandRatingsTuple[1])
    avgRating = sum(IDandRatingsTuple[1])/float(numRating)
    return (movieID, (numRating, avgRating))


# In[14]:

# TEST Number of Ratings and Average Ratings for a Movie (1a)

Test.assertEquals(getCountsAndAverages((1, (1, 2, 3, 4))), (1, (4, 2.5)),
                            'incorrect getCountsAndAverages() with integer list')
Test.assertEquals(getCountsAndAverages((100, (10.0, 20.0, 30.0))), (100, (3, 20.0)),
                            'incorrect getCountsAndAverages() with float list')
Test.assertEquals(getCountsAndAverages((110, xrange(20))), (110, (20, 9.5)),
                            'incorrect getCountsAndAverages() with xrange')


# #### **(1b) Movies with Highest Average Ratings**
# #### Now that we have a way to calculate the average ratings, we will use the `getCountsAndAverages()` helper function with Spark to determine movies with highest average ratings.
# #### The steps you should perform are:
# * #### Recall that the `ratingsRDD` contains tuples of the form (UserID, MovieID, Rating). From `ratingsRDD` create an RDD with tuples of the form (MovieID, Python iterable of Ratings for that MovieID). This transformation will yield an RDD of the form: `[(1, <pyspark.resultiterable.ResultIterable object at 0x7f16d50e7c90>), (2, <pyspark.resultiterable.ResultIterable object at 0x7f16d50e79d0>), (3, <pyspark.resultiterable.ResultIterable object at 0x7f16d50e7610>)]`. Note that you will only need to perform two Spark transformations to do this step.
# * #### Using `movieIDsWithRatingsRDD` and your `getCountsAndAverages()` helper function, compute the number of ratings and average rating for each movie to yield tuples of the form (MovieID, (number of ratings, average rating)). This transformation will yield an RDD of the form: `[(1, (993, 4.145015105740181)), (2, (332, 3.174698795180723)), (3, (299, 3.0468227424749164))]`. You can do this step with one Spark transformation
# * #### We want to see movie names, instead of movie IDs. To `moviesRDD`, apply RDD transformations that use `movieIDsWithAvgRatingsRDD` to get the movie names for `movieIDsWithAvgRatingsRDD`, yielding tuples of the form (average rating, movie name, number of ratings). This set of transformations will yield an RDD of the form: `[(1.0, u'Autopsy (Macchie Solari) (1975)', 1), (1.0, u'Better Living (1998)', 1), (1.0, u'Big Squeeze, The (1996)', 3)]`. You will need to do two Spark transformations to complete this step: first use the `moviesRDD` with `movieIDsWithAvgRatingsRDD` to create a new RDD with Movie names matched to Movie IDs, then convert that RDD into the form of (average rating, movie name, number of ratings). These transformations will yield an RDD that looks like: `[(3.6818181818181817, u'Happiest Millionaire, The (1967)', 22), (3.0468227424749164, u'Grumpier Old Men (1995)', 299), (2.882978723404255, u'Hocus Pocus (1993)', 94)]`

# In[25]:
Example #58
#
# #### If you are not familiar with Python's regular expression [`search` function](https://docs.python.org/2/library/re.html#regular-expression-objects), now would be a good time to check up on the [documentation](https://developers.google.com/edu/python/regular-expressions). One tip that might be useful is to use an online tester like http://pythex.org or http://www.pythonregex.com. To use it, copy and paste the regular expression string below (located between the single quotes ') and test it against one of the 'Invalid logline' entries above.

# In[104]:

# TODO: Replace <FILL IN> with appropriate code

# This was originally '^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)\s*(\S*)" (\d{3}) (\S+)'
APACHE_ACCESS_LOG_PATTERN = '^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)\s*(\S*)\s?" (\d{3}) (\S+)'

parsed_logs, access_logs, failed_logs = parseLogs()

# In[105]:

# TEST Data cleaning (1c)
Test.assertEquals(failed_logs.count(), 0, 'incorrect failed_logs.count()')
Test.assertEquals(parsed_logs.count(), 1043177,
                  'incorrect parsed_logs.count()')
Test.assertEquals(access_logs.count(), parsed_logs.count(),
                  'incorrect access_logs.count()')

# ### **Part 2: Sample Analyses on the Web Server Log File**
#
# ####Now that we have an RDD containing the log file as a set of Row objects, we can perform various analyses.
#
# #### **(2a) Example: Content Size Statistics**
#
# ####Let's compute some statistics about the sizes of content being returned by the web server. In particular, we'd like to know what are the average, minimum, and maximum content sizes.
#
# ####We can compute the statistics by applying a `map` to the `access_logs` RDD. The `lambda` function we want for the map is to extract the `content_size` field from the RDD. The map produces a new RDD containing only the `content_sizes` (one element for each Row object in the `access_logs` RDD). To compute the minimum and maximum statistics, we can use [`min()`](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.min) and [`max()`](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.max) functions on the new RDD. We can compute the average statistic by using the [`reduce`](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.reduce) function with a `lambda` function that sums the two inputs, which represent two elements from the new RDD that are being reduced together. The result of the `reduce()` is the total content size from the log and it is to be divided by the number of requests as determined using the [`count()`](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.count) function on the new RDD.
# #### If you are not familiar with Python's regular expression [`search` function](https://docs.python.org/2/library/re.html#regular-expression-objects), now would be a good time to check up on the [documentation](https://developers.google.com/edu/python/regular-expressions). One tip that might be useful is to use an online tester like http://pythex.org or http://www.pythonregex.com. To use it, copy and paste the regular expression string below (located between the single quotes ') and test it against one of the 'Invalid logline' entries above.

# In[4]:

# TODO: Replace <FILL IN> with appropriate code

# This was originally '^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)\s*(\S*)" (\d{3}) (\S+)'
APACHE_ACCESS_LOG_PATTERN = '^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)\s*(\S*)\s*" (\d{3}) (\S+)'

parsed_logs, access_logs, failed_logs = parseLogs()


# In[5]:

# TEST Data cleaning (1c)
Test.assertEquals(failed_logs.count(), 0, 'incorrect failed_logs.count()')
Test.assertEquals(parsed_logs.count(), 1043177 , 'incorrect parsed_logs.count()')
Test.assertEquals(access_logs.count(), parsed_logs.count(), 'incorrect access_logs.count()')


# ### **Part 2: Sample Analyses on the Web Server Log File**
#  
# ####Now that we have an RDD containing the log file as a set of Row objects, we can perform various analyses.
#  
# #### **(2a) Example: Content Size Statistics**
#  
# ####Let's compute some statistics about the sizes of content being returned by the web server. In particular, we'd like to know what are the average, minimum, and maximum content sizes.
#  
# ####We can compute the statistics by applying a `map` to the `access_logs` RDD. The `lambda` function we want for the map is to extract the `content_size` field from the RDD. The map produces a new RDD containing only the `content_sizes` (one element for each Row object in the `access_logs` RDD). To compute the minimum and maximum statistics, we can use [`min()`](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.min) and [`max()`](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.max) functions on the new RDD. We can compute the average statistic by using the [`reduce`](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.reduce) function with a `lambda` function that sums the two inputs, which represent two elements from the new RDD that are being reduced together. The result of the `reduce()` is the total content size from the log and it is to be divided by the number of requests as determined using the [`count()`](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.count) function on the new RDD.

# In[6]:
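
# A sketch of the statistics described above, assuming each Row in access_logs has
# a content_size field as in this lab:
content_sizes = access_logs.map(lambda log: log.content_size).cache()
print 'Content Size Avg: %i, Min: %i, Max: %s' % (
    content_sizes.reduce(lambda a, b: a + b) / content_sizes.count(),
    content_sizes.min(),
    content_sizes.max())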
Example #60
# One way of completing the function
def makePlural(word):
    return word + 's'

print makePlural('cat')


# In[103]:

# Load in the testing code and check to see if your answer is correct
# If incorrect it will report back '1 test failed' for each failed test
# Make sure to rerun any cell you change before trying the test again
from test_helper import Test
# TEST Pluralize and test (1b)
Test.assertEquals(makePlural('rat'), 'rats', 'incorrect result: makePlural does not add an s')


# #### ** (1c) Apply `makePlural` to the base RDD **
# #### Now pass each item in the base RDD into a [map()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.map) transformation that applies the `makePlural()` function to each element. And then call the [collect()](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.collect) action to see the transformed RDD.

# In[104]:

# TODO: Replace <FILL IN> with appropriate code
pluralRDD = wordsRDD.map(makePlural)
print pluralRDD.collect()


# In[105]:

# TEST Apply makePlural to the base RDD(1c)