示例#1
0
	def getMaxValues(self,rdd):
		"""Count words across movie titles and hand the counts to SetMovies.

		Each line is searched for a ``::title::`` field made up only of ASCII
		letters, digits, parentheses and spaces. The matched title is split on
		single spaces; words shorter than three characters and words starting
		with a parenthesised four-digit year are dropped. The remaining words
		are reduced to ``(word, (occurrences, [distinct matched titles]))``.

		Lines in which no such title field is found are skipped. Previously
		they raised ``AttributeError`` (``re.search`` returned ``None``) and
		aborted the whole job.

		Args:
			rdd: RDD of text lines.
				NOTE(review): presumably ``id::title::genres`` records --
				confirm against the caller.

		Returns:
			The value returned by ``SetMovies.setWithMaxValues`` for the
			reduced RDD and a function that re-keys each record as
			``(occurrences, (word, titles))``.
			NOTE(review): presumably the entries with the highest count --
			confirm against ``SetMovies``.
		"""
		setMovies = SetMovies()
		regexMoviesTitle = '::([a-z]|[A-Z]|[0-9]|[(]|[)]|[ ])*::'
		regexYear = '[(][0-9][0-9][0-9][0-9][)]'

		def wordsOfTitle(line):
			# One (word, (1, [title])) record per word of the matched title,
			# or nothing at all when the line has no matching title field.
			found = re.search(regexMoviesTitle, line)
			if found is None:
				return []
			title = found.group()
			return [(word, (1, [title])) for word in title.replace('::', '').split(" ")]

		def isCountableWord(wordAndValue):
			# re.match anchors at the start, so any word that begins with
			# "(dddd)" is treated as a year token and excluded.
			word = wordAndValue[0]
			return len(word) >= 3 and not re.match(regexYear, word)

		def mergeCounts(firstValue, secondValue):
			# Add the counters and keep each title only once.
			return (firstValue[0] + secondValue[0], list(set(firstValue[1] + secondValue[1])))

		wordCounts = rdd.flatMap(wordsOfTitle).filter(isCountableWord).reduceByKey(mergeCounts)
		return setMovies.setWithMaxValues(wordCounts, lambda value: (value[1][0], (value[0], value[1][1])))
示例#2
0
class TestCalculator (unittest.TestCase):
    """Tests for ``SetMovies.setWithMaxValues`` run against a local Spark context."""

    def setUp(self):
        """Create a SparkContext (app "appTest", master "local[*]") and a SetMovies."""
        self.sc = SparkContext(conf=SparkConf().setAppName("appTest").setMaster("local[*]"))
        self.setMovies = SetMovies()

    def tearDown(self):
        """Stop the SparkContext created in setUp."""
        self.sc.stop()

    def test_when_calculate_set_word_most_repeater(self):
        """Two words share the top count (3); both are expected, as a tuple of pairs."""
        toyTitle = '::Toy Story Toy (1995)::'
        toyATitle = '::ToyA StoryA ToyA (1995)::'
        wordCounts = self.sc.parallelize([
            ('Toy', (1, [toyTitle])),
            ('ToyA', (3, [toyATitle])),
            ('Story', (1, [toyTitle])),
            ('StoryA', (3, [toyATitle])),
        ])
        expected = (('ToyA', [toyATitle]), ('StoryA', [toyATitle]))
        # Re-key (word, (count, titles)) as (count, (word, titles)).
        countFirst = lambda value: (value[1][0], (value[0], value[1][1]))
        self.assertEqual(self.setMovies.setWithMaxValues(wordCounts, countFirst), expected)

    def test_when_calculate_set_word_most_repeater_one(self):
        """Only one word has the top count (3); a single bare pair is expected."""
        toyTitle = '::Toy Story Toy (1995)::'
        toyATitle = '::ToyA StoryA ToyA (1995)::'
        wordCounts = self.sc.parallelize([
            ('Toy', (1, [toyTitle])),
            ('ToyA', (3, [toyATitle])),
            ('Story', (1, [toyTitle])),
            ('StoryA', (1, [toyATitle])),
        ])
        # The expected value is the (word, titles) pair itself, not a 1-tuple
        # wrapping it.
        expected = ('ToyA', [toyATitle])
        countFirst = lambda value: (value[1][0], (value[0], value[1][1]))
        self.assertEqual(self.setMovies.setWithMaxValues(wordCounts, countFirst), expected)

    def test_when_calculate_maximum_year(self):
        """Two years share the top count (2); both are expected as a tuple."""
        yearCounts = self.sc.parallelize([
            ('(1996)', 2),
            ('(1998)', 2),
            ('(1997)', 1),
        ])
        expected = ('(1996)', '(1998)')
        # Swap (year, count) into (count, year).
        countFirst = lambda value: (value[1], value[0])
        self.assertEqual(self.setMovies.setWithMaxValues(yearCounts, countFirst), expected)

    def test_when_calculate_maximum_year_with_only_one(self):
        """Only one year has the top count (2); a plain string is expected."""
        yearCounts = self.sc.parallelize([
            ('(1996)', 2),
            ('(1998)', 1),
            ('(1997)', 1),
            ('(1999)', 1),
        ])
        # The expected value is the year string itself, not a 1-tuple.
        expected = '(1996)'
        countFirst = lambda value: (value[1], value[0])
        self.assertEqual(self.setMovies.setWithMaxValues(yearCounts, countFirst), expected)
示例#3
0
	def setUp(self):
	   """Create the per-test fixtures: a SparkContext and a SetMovies instance.

	   The context is configured with app name "appTest" and master "local[*]".
	   """
	   self.sc = SparkContext(conf=SparkConf().setAppName("appTest").setMaster("local[*]"))
	   self.setMovies = SetMovies()
示例#4
0
	 def getMaxValues(self,rdd):
	    """Count lines per parenthesised year and hand the counts to SetMovies.

	    The first ``(dddd)`` token found in each line is used as the key and
	    the occurrences are summed into ``(year, count)`` pairs.

	    Lines that contain no such token are skipped. Previously they raised
	    ``AttributeError`` (``re.search`` returned ``None``) and aborted the
	    whole job.

	    Args:
	        rdd: RDD of text lines.
	            NOTE(review): presumably one movie record per line --
	            confirm against the caller.

	    Returns:
	        The value returned by ``SetMovies.setWithMaxValues`` for the
	        ``(year, count)`` RDD and a function that swaps each pair into
	        ``(count, year)``.
	        NOTE(review): presumably the year(s) with the highest count --
	        confirm against ``SetMovies``.
	    """
	    setMovies = SetMovies()
	    regexYearWithParenthesis = '[(][0-9][0-9][0-9][0-9][)]'

	    def yearOccurrence(line):
	        # Zero or one (year, 1) record per line; only the first match is
	        # used, as with the previous re.search(...).group() call.
	        found = re.search(regexYearWithParenthesis, line)
	        if found is None:
	            return []
	        return [(found.group(), 1)]

	    yearCounts = rdd.flatMap(yearOccurrence).reduceByKey(
	        lambda firstValue, secondValue: firstValue + secondValue)
	    return setMovies.setWithMaxValues(yearCounts, lambda value: (value[1], value[0]))