def netflix_decade_avg(trainingSetDir, w=sys.stdout):
    """
    Compute each customer's average rating per decade of movie release.

    Every ``mv_*.txt`` file directly inside trainingSetDir is read. The first
    line of a file holds the movie ID (terminated by ':'); each following
    line is a comma-separated record whose first field is the customer ID and
    whose second field is the rating. Empty files and blank rating lines are
    skipped.

    For each customer, in sorted (string) customer-ID order, the writer
    receives a ``<custID>:`` line followed by one ``<decade>=<average>`` line
    for every decade in which that customer rated at least one movie.

    Print to standard out or redirect to file using
    "> extra/movieDecadeAvgRatings.in"

    trainingSetDir is the path to training_set/ from the command line
    w is a writer (any object with a write(str) method)

    Raises KeyError if a movie ID is missing from the precomputed year table
    or if the computed decade is not one of the known decade labels.
    """
    assert trainingSetDir
    # NOTE(review): presumably maps movie ID -> release year; confirm against
    # netflix_parse_precomputed.
    movieIDYear = netflix_parse_precomputed('extra/movie_titles_no_nulls.txt', ',')

    knownDecades = (
        '1890s', '1900s', '1910s', '1920s', '1930s', '1940s',
        '1950s', '1960s', '1970s', '1980s', '1990s', '2000s',
    )

    # Dict of dict of list: {custID: {decade: [totalRating, numRatings]}}
    custIDDecade = {}
    for path in glob.glob(os.path.join(trainingSetDir, 'mv_*.txt')):
        with open(path, 'r') as f_myfile:
            lines = f_myfile.readlines()
        if not lines:
            # Nothing to parse: no movie ID header and no ratings.
            continue

        movieID = lines[0].strip(':\r\n')
        ratingLines = [line for line in (raw.strip() for raw in lines[1:]) if line]
        if not ratingLines:
            continue

        # Every rating in this file is for the same movie, so the decade is
        # resolved once per file instead of once per rating line.
        decade = netflix_decade_calc(movieIDYear[movieID])

        for ratingLine in ratingLines:
            fields = ratingLine.split(',')
            custID = fields[0]
            rating = float(fields[1])
            assert 1.0 <= rating <= 5.0

            decadeDict = custIDDecade.get(custID)
            if decadeDict is None:
                # First rating seen for this customer: create the per-decade
                # accumulators only now, not for every rating line.
                decadeDict = dict((label, [0, 0]) for label in knownDecades)
                custIDDecade[custID] = decadeDict

            # The list is mutated in place, so no write-back is required.
            totalRatingNumRatingList = decadeDict[decade]
            totalRatingNumRatingList[0] += rating  # totalRating
            totalRatingNumRatingList[1] += 1       # numRatings

    # Compute and emit the averages for each customer and decade.
    for custID, decadeDict in sorted(custIDDecade.items()):
        w.write(custID + ":\n")
        for decade, (totalRating, numRatings) in sorted(decadeDict.items()):
            if numRatings == 0:
                # Customer didn't rate any movies of that decade. Testing the
                # count (the divisor) is what protects the division below.
                continue
            avgRating = totalRating / numRatings
            w.write(decade + "=" + str(avgRating) + "\n")
def test_decade2(self):
    """A mid-decade year ('2005') is labelled with its decade ('2000s')."""
    year = '2005'
    # assertEqual reports both values on failure; the assert_ alias is
    # deprecated and only reports that the expression was false.
    self.assertEqual(netflix_decade_calc(year), '2000s')
def test_decade3(self):
    """The last year of a decade ('1989') is labelled '1980s'."""
    year = '1989'
    # assertEqual reports both values on failure; the assert_ alias is
    # deprecated and only reports that the expression was false.
    self.assertEqual(netflix_decade_calc(year), '1980s')
def test_decade(self):
    """The first year of a decade ('1890') is labelled '1890s'."""
    year = '1890'
    # assertEqual reports both values on failure; the assert_ alias is
    # deprecated and only reports that the expression was false.
    self.assertEqual(netflix_decade_calc(year), '1890s')