Пример #1
0
def netflix_decade_avg (trainingSetDir, w = sys.stdout):
    """
    Compute, for every customer, the average rating given to movies of each
    decade (decade of the movie's year), and write the averages to w.

    trainingSetDir is the path to training_set/ from the command line; every
    file matching mv_*.txt inside it is read. The first line of a file is the
    movie ID followed by ':'; every following line is "custID,rating[,...]".
    w is a writer (any object with a write() method). It defaults to standard
    out, so the result can be redirected with
    "> extra/movieDecadeAvgRatings.in"

    Output, with customers and decades in sorted (string) order:
        custID:
        decade=average
    Decades in which a customer rated no movie are omitted.

    Raises KeyError if a movie ID has no entry in the precomputed year table
    or if netflix_decade_calc returns a decade outside 1890s-2000s.
    """
    assert trainingSetDir
    movieIDYear = netflix_parse_precomputed('extra/movie_titles_no_nulls.txt', ',')

    # The only decades a rating may fall into; anything else is a KeyError.
    decades = ('1890s', '1900s', '1910s', '1920s', '1930s', '1940s',
               '1950s', '1960s', '1970s', '1980s', '1990s', '2000s')

    # Build dict of dict of list {custID: {decade:[totalRating, numRatings]}}
    custIDDecade = {}
    for path in glob.glob(os.path.join(trainingSetDir, 'mv_*.txt')) :
        with open(path, 'r') as f_myfile:
            lines = f_myfile.readlines()

        movieID = lines[0].strip(':\r\n')
        ratingLines = lines[1:]
        if not ratingLines : # no ratings, so the movie's year is never needed
            continue

        # Year and decade depend only on the movie, so resolve them once per
        # file instead of once per rating line.
        decade = netflix_decade_calc(movieIDYear[movieID])

        for custIDRatingDateLine in ratingLines :
            #get custID and actual rating
            fields = custIDRatingDateLine.strip().split(',')
            custID = fields[0]
            rating = float(fields[1])
            assert 1.0 <= rating <= 5.0

            # Create the per-decade accumulators only the first time a
            # customer is seen (not for every rating line).
            if custID not in custIDDecade :
                custIDDecade[custID] = dict((d, [0, 0]) for d in decades)

            # The list is mutated in place, so no write-back is required.
            totalRatingNumRatingList = custIDDecade[custID][decade]
            totalRatingNumRatingList[0] += rating #totalRating
            totalRatingNumRatingList[1] += 1      #numRatings

    # compute averages for each decade
    for custID, decadeDict in sorted(custIDDecade.items()) :
        w.write( custID + ":\n" )
        for decade, (totalRating, numRatings) in sorted(decadeDict.items()) :
            # Test the count (the divisor) rather than the float total:
            # a zero count means the customer rated no movie of that decade.
            if numRatings == 0 :
                continue
            avgRating = totalRating / numRatings
            w.write( decade + "=" + str(avgRating) + "\n")
Пример #2
0
 def test_parse_precomputed2(self):
     """Parse precomputedTest2.txt with the default delimiter into a str -> str dict."""
     path = 'test/precomputedTest2.txt'
     parsed = netflix_parse_precomputed(path)
     # assertEqual instead of the deprecated assert_(a == b): it reports the
     # differing entries on failure and still exists in current unittest.
     expected = {
         '1': '1.4242',
         '2': '2.4515',
         '3': '3.6908',
         '4': '4.8711',
         '5': '5.2818',
     }
     self.assertEqual(parsed, expected)
Пример #3
0
 def test_parse_precomputed3(self):
     """Parse precomputedTest3.txt with an explicit "," delimiter into a str -> str dict."""
     path = 'test/precomputedTest3.txt'
     parsed = netflix_parse_precomputed(path, ",")
     # Values look like years keyed by an ID (presumably movie ID -> year;
     # confirm against the fixture file).
     # assertEqual instead of the deprecated assert_(a == b): it reports the
     # differing entries on failure and still exists in current unittest.
     expected = {
         '1': '2002',
         '2': '1898',
         '3': '1903',
         '4': '1940',
         '5': '1984',
         '6': '1938',
         '7': '1934',
         '8': '1967',
         '9': '1999',
         '10': '1993',
     }
     self.assertEqual(parsed, expected)
Пример #4
0
 def test_parse_precomputed(self):
     """Parse precomputedTest.txt with the default delimiter; expect the single entry '1' -> '1'."""
     path = 'test/precomputedTest.txt'
     parsed = netflix_parse_precomputed(path)
     # assertEqual instead of the deprecated assert_(a == b): it reports the
     # differing entries on failure and still exists in current unittest.
     self.assertEqual(parsed, {'1': '1'})