Example #1
from rdd import RDD
from pyspark import SparkContext, SparkConf

# RDD from text file
rdd = RDD('./datasets/albums.csv')

# Create key/value pairs of (album id, average critic)
critics = rdd.map(lambda line: line.split(',')).map(
    lambda x: (x[0], (float(x[7]) + float(x[8]) + float(x[9])) / 3))

# sortByKey() sorts album ids alphabetically. sortBy() then sorts by average critic score in descending order
sortedreview = critics.sortByKey().sortBy(lambda x: x[1], ascending=False)

# Get the 10 best albums based on avg critic; take(10) returns a plain Python
# list, so parallelize it again to write it out as an RDD
sc = SparkContext.getOrCreate(SparkConf())
top = sc.parallelize(c=sortedreview.take(10))

# Save as a TSV file. coalesce(1) writes a single part file so that we can use it in Task 7
top.map(lambda x: '{album}\t{avg}'.format(album=x[0], avg=x[1])).coalesce(
    1).saveAsTextFile("./datasets/result_6")
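
Every example here imports RDD (and, later, toTSVLine) from a local rdd module that is not shown. What that helper contains is an assumption; a minimal sketch of what it could look like if it only wraps textFile() and TSV formatting:

# rdd.py -- hypothetical helper assumed by these examples, not the original module
from pyspark import SparkContext, SparkConf


def RDD(path):
    # Reuse a single SparkContext per process and read the CSV as a plain text RDD
    sc = SparkContext.getOrCreate(SparkConf())
    return sc.textFile(path)


def toTSVLine(record):
    # Join all fields of a tuple with tabs, e.g. (3, 42) -> '3\t42'
    return '\t'.join(str(field) for field in record)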
Example #2
from rdd import RDD

rdd1 = RDD('./datasets/artists.csv')
rdd2 = RDD('./datasets/albums.csv')


def find_name(line):
    # Prefer the name in column 2 if it is non-empty, otherwise fall back to column 1
    if line[2]:
        return line[2]
    return line[1]


# (artist_id, artist_name)
norwegian_artists = rdd1.map(lambda line: line.split(',')).filter(
    lambda x: x[5] == 'Norway').map(lambda y: (y[0], find_name(y)))

# (artist_id, mtv_review)
albums = rdd2.map(lambda line: line.split(',')).map(
    lambda x: (x[1], float(x[8])))

# (artist_id, (artist_name, mtv_review))
norwegian_albums = norwegian_artists.join(albums)

# (artist_id, (artist_name, avg_mtv_review))
# reduceByKey() combines values two at a time, so keep a running (name, sum, count)
# and divide at the end; a pairwise "average of averages" is wrong for artists
# with more than two albums
reduced = norwegian_albums.mapValues(lambda v: (v[0], v[1], 1)).reduceByKey(
    lambda x, y: (x[0], x[1] + y[1], x[2] + y[2])).mapValues(
        lambda v: (v[0], v[1] / v[2]))

# Save as TSV file
reduced.map(lambda x: '{name}\tNorway\t{mtv}'.format(name=x[1][0], mtv=x[1][1])
            ).saveAsTextFile("./datasets/result_9")
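
For reference, join() keys both pair RDDs on the first tuple element and yields (key, (left_value, right_value)) pairs, which is why the save step above reads x[1][0] and x[1][1]. A small self-contained illustration with made-up ids and values:

from pyspark import SparkContext, SparkConf

sc = SparkContext.getOrCreate(SparkConf())
artists = sc.parallelize([('1', 'Some Artist')])    # (artist_id, artist_name)
reviews = sc.parallelize([('1', 7.0), ('1', 9.0)])  # (artist_id, mtv_review)
print(artists.join(reviews).collect())
# [('1', ('Some Artist', 7.0)), ('1', ('Some Artist', 9.0))] (order may vary)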
Example #3
from rdd import RDD, toTSVLine

rdd = RDD("./datasets/albums.csv")

# Map every line to an (artist_id, 1) pair so reduceByKey() can count albums per artist
albums = rdd.map(lambda line: (int(line.split(',')[1]), 1))
added = albums.reduceByKey(lambda x, y: x + y)
# sortByKey() orders by artist id; sortBy() then orders by album count in descending order
completelysorted = added.sortByKey().sortBy(lambda x: x[1], ascending=False)

# Save to TSV file
lines = completelysorted.map(toTSVLine)
lines.saveAsTextFile('./datasets/result_4')
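
As a quick sanity check, the same per-artist counts can be pulled to the driver with countByKey(), which returns a plain dict; this only makes sense for small key sets. A sketch continuing from the rdd defined above:

# Driver-side check: number of albums per artist as a Python dict
counts = rdd.map(lambda line: (int(line.split(',')[1]), 1)).countByKey()
print(sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:5])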
Example #4
from rdd import RDD, toTSVLine
from operator import add


# RDD from text file
rdd = RDD('./datasets/albums.csv')

# Create key/value pairs of (genre, tracks sold). The file is already sorted by id, so the id is not needed
genres = rdd.map(lambda line: (line.split(',')[3], int(line.split(',')[6])))

# Aggregate per genre and sum the sales numbers
genre_sales = genres.reduceByKey(add)

# sortByKey() sorts genres alphabetically. sortBy() then sorts by number of sales in descending order
completed = genre_sales.sortByKey().sortBy(lambda x: x[1], ascending=False)

# Save to TSV file
lines = completed.map(toTSVLine)
lines.saveAsTextFile('./datasets/result_5')
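
The chained sortByKey().sortBy() pattern above can also be written as a single sortBy() with a composite key: sales descending first, genre name as the alphabetical tie-breaker. A minimal alternative sketch, continuing from genre_sales above:

# One sort with a composite key: highest sales first, ties broken alphabetically by genre
completed = genre_sales.sortBy(lambda x: (-x[1], x[0]))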