from rdd import RDD
from pyspark import SparkContext, SparkConf

# RDD over the raw album CSV lines.
albums_rdd = RDD('./datasets/albums.csv')

# (album_id, mean of the three critic score columns 7-9).
avg_critics = (albums_rdd
               .map(lambda line: line.split(','))
               .map(lambda cols: (cols[0], (float(cols[7]) + float(cols[8]) + float(cols[9])) / 3)))

# sortByKey() first gives a deterministic alphabetical order for ties;
# sortBy() then orders by average critic score, highest first.
ranked = avg_critics.sortByKey().sortBy(lambda pair: pair[1], ascending=False)

# Keep only the ten best-rated albums and turn them back into an RDD.
sc = SparkContext.getOrCreate(SparkConf())
best_ten = sc.parallelize(c=ranked.take(10))

# Write one "album_id<TAB>avg" line per album; coalesce(1) produces a single
# part file so that Task 7 can consume this output directly.
(best_ten
 .map(lambda pair: '{album}\t{avg}'.format(album=pair[0], avg=pair[1]))
 .coalesce(1)
 .saveAsTextFile("./datasets/result_6"))
from rdd import RDD

rdd1 = RDD('./datasets/artists.csv')
rdd2 = RDD('./datasets/albums.csv')


def find_name(line):
    """Return the artist's real name (column 2) or, when that is blank, column 1."""
    if line[2]:
        return line[2]
    return line[1]


# (artist_id, artist_name) for artists from Norway only.
norwegian_artists = rdd1.map(lambda line: line.split(',')).filter(
    lambda x: x[5] == 'Norway').map(lambda y: (y[0], find_name(y)))

# (artist_id, mtv_review) for every album.
albums = rdd2.map(lambda line: line.split(',')).map(lambda x: (x[1], float(x[8])))

# (artist_id, (artist_name, mtv_review)) — inner join keeps only Norwegian
# artists that have at least one album.
norwegian_albums = norwegian_artists.join(albums)

# BUG FIX: the previous reduceByKey averaged pairwise — ((a+b)/2 + c)/2 is not
# the mean of three reviews. Accumulate (name, running_sum, count) per artist
# instead, and divide exactly once at the end.
reduced = norwegian_albums.mapValues(lambda v: (v[0], v[1], 1)).reduceByKey(
    lambda a, b: (a[0], a[1] + b[1], a[2] + b[2])).mapValues(
    lambda v: (v[0], v[1] / v[2]))

# Save as TSV: artist_name <TAB> Norway <TAB> average mtv review.
reduced.map(lambda x: '{name}\tNorway\t{mtv}'.format(name=x[1][0], mtv=x[1][1])
            ).saveAsTextFile("./datasets/result_9")
from rdd import RDD, toTSVLine

albums = RDD("./datasets/albums.csv")

# Emit (artist_id, 1) per album so that counting reduces to a per-key sum.
per_artist = albums.map(lambda line: (int(line.split(',')[1]), 1))

# Total number of albums for each artist.
counts = per_artist.reduceByKey(lambda a, b: a + b)

# sortByKey() fixes a deterministic key order for ties; sortBy() then orders
# by album count, largest first.
ordered = counts.sortByKey().sortBy(lambda pair: pair[1], ascending=False)

# Persist the result as tab-separated values.
ordered.map(toTSVLine).saveAsTextFile('./datasets/result_4')
from rdd import RDD, toTSVLine
from operator import add

# RDD over the raw album CSV lines.
rdd = RDD('./datasets/albums.csv')

# (genre, tracks_sold) pairs. Split each line exactly once instead of twice
# per record; the previous ''.join(...) around column 3 was a no-op on a
# string and has been dropped. The list is already sorted by id, so the id
# column is not needed here.
genres = rdd.map(lambda line: line.split(',')).map(
    lambda cols: (cols[3], int(cols[6])))

# Aggregate all genres and sum the sales numbers.
sortbysales = genres.reduceByKey(add)

# sortByKey() sorts alphabetically for ties; sortBy() then sorts by number
# of tracks sold in descending order.
completed = sortbysales.sortByKey().sortBy(lambda x: x[1], ascending=False)

# Save to TSV file.
lines = completed.map(toTSVLine)
lines.saveAsTextFile('./datasets/result_5')