/
goldeneye.py
45 lines (36 loc) · 1.17 KB
/
goldeneye.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from pyspark import SparkConf, SparkContext
import math
def format0(rec):
    """Split a pipe-delimited record into a list of its fields.

    Args:
        rec: one line of text whose fields are separated by "|".

    Returns:
        The list of field strings, in their original order. A line that
        contains no "|" yields a single-element list.
    """
    return rec.split("|")
def format1(rec):
    """Split a tab-delimited record into a list of its fields.

    Args:
        rec: one line of text whose fields are separated by a tab character.

    Returns:
        The list of field strings, in their original order. A line that
        contains no tab yields a single-element list.
    """
    return rec.split("\t")
# Driver script: count how many 5* ratings "GoldenEye (1995)" has.
#
# Steps: load the movie and rating files, look up the movie's ID by its
# title, then count the rating records that reference that ID with a
# rating field equal to "5".
con = SparkConf()
sc = SparkContext(conf=con)
try:
    movie = sc.textFile("file:///home/cloudera/imdb/Movies.item", use_unicode=True)
    rating = sc.textFile("file:///home/cloudera/imdb/Movie-Ratings-Done.data")

    # Movie records are "|"-separated, rating records are tab-separated.
    movieFormatted = movie.map(format0)
    ratingFormatted = rating.map(format1)

    movietitle = "GoldenEye (1995)"

    # Field 1 of a movie record is compared against the title and field 0 is
    # taken as its ID. The length guard skips blank/short lines instead of
    # failing the whole job with an IndexError inside the lambda.
    movieID = (
        movieFormatted
        .filter(lambda n: len(n) > 1 and n[1] == movietitle)
        .map(lambda x: x[0])
        .collect()
    )
    if not movieID:
        # Without this check, movieID[0] would raise a bare IndexError.
        raise SystemExit("movie not found: %s" % movietitle)
    ID = movieID[0]

    # Field 1 of a rating record is matched against the movie ID and field 2
    # against the string "5" (fields are never converted to numbers).
    # NOTE(review): presumably field 0 is the user ID - confirm against the data.
    review = ratingFormatted.filter(
        lambda n: len(n) > 2 and n[1] == ID and n[2] == "5"
    ).count()

    # Single %-formatted argument so the output text is the same whether
    # print is the Python 2 statement or the Python 3 function.
    print("amount of 5* reviews %s" % review)
finally:
    # Always release the Spark context, including on the not-found exit path.
    sc.stop()