def test_pa3(self):
    """Check user-based k-NN scores against a table of known answers.

    Loads ``testdata/ratings.csv`` with ``testdata/movie-titles.csv`` as the
    items file, builds a ``UserModel`` with ``normalize=True`` and, for each
    (user, item) pair, compares the formatted ``user,item,score,title`` line
    with the expected one. Scores come from ``user_based_knn`` called with
    ``cosine``, ``promote_users=True`` and ``normalize='centered'``.
    """
    # (user id, item id) pairs zipped with the expected output line for each
    testdata = zip(
        [(1024, 77), (1024, 268), (1024, 462), (1024, 393), (1024, 36955),
         (2048, 77), (2048, 36955), (2048, 788)],
        [
            "1024,77,4.3848,Memento (2000)",
            "1024,268,2.8646,Batman (1989)",
            "1024,462,3.1082,Erin Brockovich (2000)",
            "1024,393,3.8722,Kill Bill: Vol. 2 (2004)",
            "1024,36955,2.3524,True Lies (1994)",
            "2048,77,4.8493,Memento (2000)",
            "2048,36955,3.9698,True Lies (1994)",
            "2048,788,3.8509,Mrs. Doubtfire (1993)",
        ])
    data = DataIO(verbose=False)
    data.load('testdata/ratings.csv', items_file='testdata/movie-titles.csv')
    model = UserModel(verbose=False, normalize=True)
    model.build(data)
    for ((u, i), s) in testdata:
        # 30 is presumably the neighbourhood size -- confirm against
        # user_based_knn's signature.
        score = user_based_knn(model, 30,
                               [data.new_user_idx(u)],
                               [data.new_item_idx(i)],
                               cosine, promote_users=True,
                               normalize='centered')
        # assertEqual (instead of assertTrue(a == b)) shows both strings
        # when the comparison fails.
        self.assertEqual('%s' % s,
                         '%d,%d,%.4f,%s' % (u, i, score, data.title(i)))
def test_pa3(self):
    """Check user-based k-NN scores against a table of known answers.

    Loads ``testdata/ratings.csv`` with ``testdata/movie-titles.csv`` as the
    items file, builds a ``UserModel`` with ``normalize=True`` and, for each
    (user, item) pair, compares the formatted ``user,item,score,title`` line
    with the expected one. Scores come from ``user_based_knn`` called with
    ``cosine``, ``promote_users=True`` and ``normalize='centered'``.
    """
    # (user id, item id) pairs zipped with the expected output line for each
    testdata = zip(
        [(1024, 77), (1024, 268), (1024, 462), (1024, 393), (1024, 36955),
         (2048, 77), (2048, 36955), (2048, 788)],
        [
            "1024,77,4.3848,Memento (2000)",
            "1024,268,2.8646,Batman (1989)",
            "1024,462,3.1082,Erin Brockovich (2000)",
            "1024,393,3.8722,Kill Bill: Vol. 2 (2004)",
            "1024,36955,2.3524,True Lies (1994)",
            "2048,77,4.8493,Memento (2000)",
            "2048,36955,3.9698,True Lies (1994)",
            "2048,788,3.8509,Mrs. Doubtfire (1993)",
        ])
    data = DataIO(verbose=False)
    data.load('testdata/ratings.csv', items_file='testdata/movie-titles.csv')
    model = UserModel(verbose=False, normalize=True)
    model.build(data)
    for ((u, i), s) in testdata:
        # 30 is presumably the neighbourhood size -- confirm against
        # user_based_knn's signature.
        score = user_based_knn(model, 30,
                               [data.new_user_idx(u)],
                               [data.new_item_idx(i)],
                               cosine, promote_users=True,
                               normalize='centered')
        # assertEqual (instead of assertTrue(a == b)) shows both strings
        # when the comparison fails.
        self.assertEqual('%s' % s,
                         '%d,%d,%.4f,%s' % (u, i, score, data.title(i)))
from score import user_based_knn, pearson
from dataset import DataIO
from model import UserModel
from suggest import top_ns

ratings_file = "ratings.csv"
given_users = [3867, 860]
NN = 5
n = 3
part_1_file = "part_1.csv"
part_2_file = "part_2.csv"

# part 1
data = DataIO()
data.load(ratings_file)
model = UserModel(normalize=False)
model.build(data)
# Translate the user ids through the dataset before scoring; scores are
# requested for every item index in the dataset.
given_users = data.translate_users(given_users)
given_items = range(data.num_items())
R = user_based_knn(model, NN, given_users, given_items, pearson,
                   promote_users=False)
recs = top_ns(R, n, keep_order=True)
# One "<item id> <score>" line per recommendation, users concatenated in
# order. The context manager closes the file even if formatting or writing
# raises, and avoids shadowing the Python 2 builtin ``file``.
with open(part_1_file, "w") as out:
    out.write("\n".join(["%d %.3f" % (data.old_item_idx(i), s)
                         for u in recs for (i, s) in u]))

# part 2
# make python find our new modules import sys sys.path.append("../../../recsys") from score import user_based_knn, cosine from dataset import DataIO from model import UserModel ratings_file = '../data/ratings.csv' items_file = '../data/movie-titles.csv' NN = 30 answer_file = 'part_1.csv' # part 1 data = DataIO() data.load(ratings_file, items_file = items_file) model = UserModel(normalize = True) model.build(data) inputs = [(4169,161), (4169,36955), (4169,453), (4169,857), (4169,238), (5399,1891), (5399,14), (5399,187), (5399,602), (5399,629), (3613,329),
class DatasetTest(unittest.TestCase):
    """Tests for ``DataIO``: loading ratings and item tags, index
    translation round-trips and recommendation printing."""

    def setUp(self):
        self.ratings_file = 'testdata/ratings.csv'
        self.item_tags_file = 'testdata/movie-tags.csv'
        self.ds = DataIO(False)

    def test_ratings(self):
        """Load ratings only, then run the ratings and printer checks."""
        self.ds.load(self.ratings_file)
        self.__ratings_norm_test()
        self.__ratings_test()
        self.__printer_test()

    def test_item_tags(self):
        """Load ratings plus item tags, then run ratings and tag checks."""
        self.ds.load(self.ratings_file, self.item_tags_file)
        self.__ratings_norm_test()
        self.__ratings_test()
        self.__tags_test()
        self.__tags_norm_test()

    def __printer_test(self):
        """Check ``print_recs`` output for given items and given users."""
        # Every expected output line carries the same ten (id,score) pairs;
        # only the leading id differs.
        row = ('(11,9.00), (12,8.00), (13,7.00), (14,6.00), (22,5.00), '
               '(24,4.00), (38,3.00), (63,2.00), (77,1.00), (85,0.00)')
        expected_users = '\n'.join(
            ['%d: %s' % (uid, row) for uid in (1, 51, 100)])
        expected_items = '\n'.join(
            ['%d: %s' % (iid, row) for iid in (11, 603, 36955)])
        # list(...) keeps this a real list on Python 3 as well
        recs = [list(zip(range(10), range(10)[::-1]))] * 3
        ids = [0, 50, 99]
        self.assertEqual(self.ds.print_recs(recs, given_items=ids),
                         expected_items)
        self.assertEqual(self.ds.print_recs(recs, given_users=ids),
                         expected_users)

    def __ratings_test(self):
        """Check the number of loaded ratings and the first/last five."""
        new_user = self.ds.new_user_idx
        new_item = self.ds.new_item_idx
        # lines count
        self.assertEqual(len(self.ds.ratings),
                         self.__wccount(self.ratings_file))
        # values
        head_ratings = [(1, 809, 4.0), (1, 601, 5.0), (1, 238, 5.0),
                        (1, 664, 4.5), (1, 3049, 3.0)]
        self.assertEqual(
            self.ds.ratings[0:5],
            [(new_user(u), new_item(i), r) for (u, i, r) in head_ratings])
        tail_ratings = [(5573, 114, 2.5), (5573, 22, 4.5), (5573, 11, 3.0),
                        (5573, 557, 4.0), (5573, 98, 3.5)]
        self.assertEqual(
            self.ds.ratings[-5:],
            [(new_user(u), new_item(i), r) for (u, i, r) in tail_ratings])

    def __ratings_norm_test(self):
        """Check user/item counts and that new(old(idx)) == idx."""
        # list(...) so the slice also works where zip returns an iterator
        (user_col, item_col) = list(zip(*self.ds.ratings))[:2]
        n_users = self.ds.num_users()
        n_items = self.ds.num_items()
        self.assertEqual(len(set(user_col)), n_users)
        self.assertEqual(len(set(item_col)), n_items)
        self.assertEqual(
            list(range(n_users)),
            [self.ds.new_user_idx(self.ds.old_user_idx(i))
             for i in range(n_users)])
        self.assertEqual(
            list(range(n_items)),
            [self.ds.new_item_idx(self.ds.old_item_idx(i))
             for i in range(n_items)])

    def __tags_test(self):
        """Check the loaded item tags against the original tags file."""
        # read tags file and check that all (item,tag) combinations appear
        # in the dataset
        # get item-tag combinations from the original file
        # ('rbU' is kept from the original: binary mode for the Python 2
        # csv module)
        with open(self.item_tags_file, 'rbU') as tags_fh:
            csv_reader = csv.reader(tags_fh, delimiter=',')
            item_tag_set_orig = set(
                [(self.ds.new_item_idx(int(i)), self.ds.tag_idx(t))
                 for (i, t) in csv_reader])
        # item-tag combinations in the dataset
        item_tag_set = set([tuple(row[:2]) for row in self.ds.item_tags])
        self.assertEqual(item_tag_set_orig, item_tag_set)
        # tag values
        tag_values = [(114, 'afternoon section', 1),
                      (114, 'capitalism', 4),
                      (114, "YOUNG WOMEN'S FAVORATE", 1),
                      (10020, '18th century', 2),
                      (581, 'wolves', 1)]
        # Membership test: the original used list.index() inside all(),
        # which treats a match at position 0 as a failure.
        for (i, t, c) in tag_values:
            self.assertIn(
                (self.ds.new_item_idx(i), self.ds.tag_idx(t), c),
                self.ds.item_tags)
        # tag count
        tag_count_expected = dict([(114, 1), (680, 1), (581, 1)])
        # take list of unique (item,tag) pairs, replace tag with 1s and
        # group-sum by the first argument
        item_tagcount = dict(self.__sum_group_by_first(
            [(row[0], 1) for row in self.ds.item_tags]))
        # NOTE(review): the original wrapped the count comparison in
        # assertTrue([...]), which always passes because a non-empty list is
        # truthy. The expected counts also look inconsistent with tag_values
        # above (item 114 is listed there with three different tags), so the
        # comparison is not enforced; only the presence of each item in the
        # grouped counts is checked.
        # TODO: confirm tag_count_expected and compare with assertEqual.
        for i in sorted(tag_count_expected):
            self.assertIn(self.ds.new_item_idx(i), item_tagcount)

    def __tags_norm_test(self):
        """Check item/tag counts and that tag_idx(tags(idx)) == idx."""
        # collect all items and tags
        (item_col, tag_col) = list(zip(*self.ds.item_tags))[:2]
        # check that there are as many new indexes as different items and
        # tags
        self.assertEqual(len(set(item_col)), self.ds.num_items())
        # actually, this may not hold, but let's keep for now
        self.assertEqual(len(set(tag_col)), self.ds.num_tags())
        # for all tags, check that new(old(new)) = new
        n_tags = self.ds.num_tags()
        self.assertEqual(
            list(range(n_tags)),
            [self.ds.tag_idx(self.ds.tags(i)) for i in range(n_tags)])

    # takes a list of pairs
    # group by the first element and do sum aggregate of the second
    # credits http://stackoverflow.com/questions/11058001/python-group-by-and-sum-a-list-of-tuples
    def __sum_group_by_first(self, list_of_pairs):
        """Return ``[(key, total)]`` sorted by key, where ``total`` is the
        sum of the second elements of all pairs sharing that first element."""
        first = operator.itemgetter(0)
        # groupby only merges adjacent items, hence the sort on the same key
        return [(key, sum([pair[1] for pair in group]))
                for (key, group) in groupby(sorted(list_of_pairs, key=first),
                                            key=first)]

    def __wccount(self, filename):
        """Return the number of newline characters in ``filename``.

        This is the same number ``wc -l`` reports, computed without
        spawning an external process so it also works where ``wc`` is not
        available.
        """
        with open(filename, 'rb') as fh:
            return sum(chunk.count(b'\n')
                       for chunk in iter(lambda: fh.read(1 << 16), b''))
def setUp(self):
    """Create the ``DataIO`` under test and record the fixture paths."""
    # DataIO receives False as its only positional argument (presumably
    # the verbose flag -- confirm against DataIO's constructor).
    self.ds = DataIO(False)
    self.item_tags_file = 'testdata/movie-tags.csv'
    self.ratings_file = 'testdata/ratings.csv'
class DatasetTest(unittest.TestCase):
    """Tests for ``DataIO``: loading ratings and item tags, index
    translation round-trips and recommendation printing."""

    def setUp(self):
        self.ratings_file = 'testdata/ratings.csv'
        self.item_tags_file = 'testdata/movie-tags.csv'
        self.ds = DataIO(False)

    def test_ratings(self):
        """Load ratings only, then run the ratings and printer checks."""
        self.ds.load(self.ratings_file)
        self.__ratings_norm_test()
        self.__ratings_test()
        self.__printer_test()

    def test_item_tags(self):
        """Load ratings plus item tags, then run ratings and tag checks."""
        self.ds.load(self.ratings_file, self.item_tags_file)
        self.__ratings_norm_test()
        self.__ratings_test()
        self.__tags_test()
        self.__tags_norm_test()

    def __printer_test(self):
        """Check ``print_recs`` output for given items and given users."""
        # Every expected output line carries the same ten (id,score) pairs;
        # only the leading id differs.
        row = ('(11,9.00), (12,8.00), (13,7.00), (14,6.00), (22,5.00), '
               '(24,4.00), (38,3.00), (63,2.00), (77,1.00), (85,0.00)')
        expected_users = '\n'.join(
            ['%d: %s' % (uid, row) for uid in (1, 51, 100)])
        expected_items = '\n'.join(
            ['%d: %s' % (iid, row) for iid in (11, 603, 36955)])
        # list(...) keeps this a real list on Python 3 as well
        recs = [list(zip(range(10), range(10)[::-1]))] * 3
        ids = [0, 50, 99]
        self.assertEqual(self.ds.print_recs(recs, given_items=ids),
                         expected_items)
        self.assertEqual(self.ds.print_recs(recs, given_users=ids),
                         expected_users)

    def __ratings_test(self):
        """Check the number of loaded ratings and the first/last five."""
        new_user = self.ds.new_user_idx
        new_item = self.ds.new_item_idx
        # lines count
        self.assertEqual(len(self.ds.ratings),
                         self.__wccount(self.ratings_file))
        # values
        head_ratings = [(1, 809, 4.0), (1, 601, 5.0), (1, 238, 5.0),
                        (1, 664, 4.5), (1, 3049, 3.0)]
        self.assertEqual(
            self.ds.ratings[0:5],
            [(new_user(u), new_item(i), r) for (u, i, r) in head_ratings])
        tail_ratings = [(5573, 114, 2.5), (5573, 22, 4.5), (5573, 11, 3.0),
                        (5573, 557, 4.0), (5573, 98, 3.5)]
        self.assertEqual(
            self.ds.ratings[-5:],
            [(new_user(u), new_item(i), r) for (u, i, r) in tail_ratings])

    def __ratings_norm_test(self):
        """Check user/item counts and that new(old(idx)) == idx."""
        # list(...) so the slice also works where zip returns an iterator
        (user_col, item_col) = list(zip(*self.ds.ratings))[:2]
        n_users = self.ds.num_users()
        n_items = self.ds.num_items()
        self.assertEqual(len(set(user_col)), n_users)
        self.assertEqual(len(set(item_col)), n_items)
        self.assertEqual(
            list(range(n_users)),
            [self.ds.new_user_idx(self.ds.old_user_idx(i))
             for i in range(n_users)])
        self.assertEqual(
            list(range(n_items)),
            [self.ds.new_item_idx(self.ds.old_item_idx(i))
             for i in range(n_items)])

    def __tags_test(self):
        """Check the loaded item tags against the original tags file."""
        # read tags file and check that all (item,tag) combinations appear
        # in the dataset
        # get item-tag combinations from the original file
        # ('rbU' is kept from the original: binary mode for the Python 2
        # csv module)
        with open(self.item_tags_file, 'rbU') as tags_fh:
            csv_reader = csv.reader(tags_fh, delimiter=',')
            item_tag_set_orig = set(
                [(self.ds.new_item_idx(int(i)), self.ds.tag_idx(t))
                 for (i, t) in csv_reader])
        # item-tag combinations in the dataset
        item_tag_set = set([tuple(row[:2]) for row in self.ds.item_tags])
        self.assertEqual(item_tag_set_orig, item_tag_set)
        # tag values
        tag_values = [(114, 'afternoon section', 1),
                      (114, 'capitalism', 4),
                      (114, "YOUNG WOMEN'S FAVORATE", 1),
                      (10020, '18th century', 2),
                      (581, 'wolves', 1)]
        # Membership test: the original used list.index() inside all(),
        # which treats a match at position 0 as a failure.
        for (i, t, c) in tag_values:
            self.assertIn(
                (self.ds.new_item_idx(i), self.ds.tag_idx(t), c),
                self.ds.item_tags)
        # tag count
        tag_count_expected = dict([(114, 1), (680, 1), (581, 1)])
        # take list of unique (item,tag) pairs, replace tag with 1s and
        # group-sum by the first argument
        item_tagcount = dict(self.__sum_group_by_first(
            [(row[0], 1) for row in self.ds.item_tags]))
        # NOTE(review): the original wrapped the count comparison in
        # assertTrue([...]), which always passes because a non-empty list is
        # truthy. The expected counts also look inconsistent with tag_values
        # above (item 114 is listed there with three different tags), so the
        # comparison is not enforced; only the presence of each item in the
        # grouped counts is checked.
        # TODO: confirm tag_count_expected and compare with assertEqual.
        for i in sorted(tag_count_expected):
            self.assertIn(self.ds.new_item_idx(i), item_tagcount)

    def __tags_norm_test(self):
        """Check item/tag counts and that tag_idx(tags(idx)) == idx."""
        # collect all items and tags
        (item_col, tag_col) = list(zip(*self.ds.item_tags))[:2]
        # check that there are as many new indexes as different items and
        # tags
        self.assertEqual(len(set(item_col)), self.ds.num_items())
        # actually, this may not hold, but let's keep for now
        self.assertEqual(len(set(tag_col)), self.ds.num_tags())
        # for all tags, check that new(old(new)) = new
        n_tags = self.ds.num_tags()
        self.assertEqual(
            list(range(n_tags)),
            [self.ds.tag_idx(self.ds.tags(i)) for i in range(n_tags)])

    # takes a list of pairs
    # group by the first element and do sum aggregate of the second
    # credits http://stackoverflow.com/questions/11058001/python-group-by-and-sum-a-list-of-tuples
    def __sum_group_by_first(self, list_of_pairs):
        """Return ``[(key, total)]`` sorted by key, where ``total`` is the
        sum of the second elements of all pairs sharing that first element."""
        first = operator.itemgetter(0)
        # groupby only merges adjacent items, hence the sort on the same key
        return [(key, sum([pair[1] for pair in group]))
                for (key, group) in groupby(sorted(list_of_pairs, key=first),
                                            key=first)]

    def __wccount(self, filename):
        """Return the number of newline characters in ``filename``.

        This is the same number ``wc -l`` reports, computed without
        spawning an external process so it also works where ``wc`` is not
        available.
        """
        with open(filename, 'rb') as fh:
            return sum(chunk.count(b'\n')
                       for chunk in iter(lambda: fh.read(1 << 16), b''))
# make python find our new modules import sys sys.path.append("../../../recsys") from score import user_based_knn, cosine from dataset import DataIO from model import UserModel ratings_file = '../data/ratings.csv' items_file = '../data/movie-titles.csv' NN = 30 answer_file = 'part_1.csv' # part 1 data = DataIO() data.load(ratings_file, items_file=items_file) model = UserModel(normalize=True) model.build(data) inputs = [(4169, 161), (4169, 36955), (4169, 453), (4169, 857), (4169, 238), (5399, 1891), (5399, 14), (5399, 187), (5399, 602), (5399, 629), (3613, 329), (3613, 604), (3613, 134), (3613, 1637), (3613, 278), (1873, 786), (1873, 2502), (1873, 550), (1873, 1894), (1873, 1422), (4914, 268), (4914, 36658), (4914, 786), (4914, 161), (4914, 854)] file = open(answer_file, 'w') file.write('\n'.join([ '%d,%d,%.4f,%s' % (u, i, user_based_knn(model,
from score import user_based_knn, pearson
from dataset import DataIO
from model import UserModel
from suggest import top_ns

# --- configuration -------------------------------------------------------
ratings_file = 'ratings.csv'
given_users = [3867, 860]
NN = 5   # second argument to user_based_knn (presumably neighbourhood size)
n = 3    # second argument to top_ns (presumably recommendations per user)
part_1_file = 'part_1.csv'
part_2_file = 'part_2.csv'

# part 1
# Load the ratings and build a UserModel with normalize=False.
data = DataIO()
data.load(ratings_file)

model = UserModel(normalize=False)
model.build(data)

# Translate the user ids through the dataset; request scores for every
# item index in the dataset.
given_users = data.translate_users(given_users)
given_items = range(data.num_items())

R = user_based_knn(
    model, NN, given_users, given_items, pearson,
    promote_users=False,
)
recs = top_ns(R, n, keep_order=True)
def setUp(self):
    """Load the ``ratings-ma4`` fixture and build a ``UserModel`` on it."""
    dataset = DataIO(verbose=False)
    dataset.load('testdata/ratings-ma4.csv')

    user_model = UserModel(verbose=False, normalize=False)
    user_model.build(dataset)

    # expose both objects to the test methods
    self.data = dataset
    self.model = user_model
class WA4Test(unittest.TestCase):
    """Tests Pearson similarity and user-based k-NN recommendations on the
    ``testdata/ratings-ma4.csv`` fixture."""

    def setUp(self):
        self.data = DataIO(verbose=False)
        self.data.load('testdata/ratings-ma4.csv')
        self.model = UserModel(normalize=False, verbose=False)
        self.model.build(self.data)

    def test_pearson(self):
        """Check the range, diagonal and two known values of ``pearson``."""
        # test correlation
        S = pearson(self.model.R(), self.model.R()).todense()
        # 1. check we don't have numbers more than 1
        # use string comparison to avoid float nuances
        self.assertEqual('%.2f' % S.max(), '1.00')
        # 2. check there are only '1' on the diagonal
        # (per element and with a tolerance; an exact == on a float sum is
        # sensitive to rounding)
        for i in range(S.shape[0]):
            self.assertAlmostEqual(S[i, i], 1.0)
        # 3. check a couple of correlation coefficients
        corr_test = [(1648, 5136, 0.40298), (918, 2824, -0.31706)]
        for (u1, u2, c) in corr_test:
            # check what's in the full matrix
            u1 = self.data.new_user_idx(u1)
            u2 = self.data.new_user_idx(u2)
            # check precomputed
            self.assertEqual('%.5f' % S[u1, u2], '%.5f' % c)
            # compute here
            pair = pearson(self.model.R()[u1, :],
                           self.model.R()[u2, :]).todense()
            # Index the single cell explicitly instead of relying on
            # implicit 1x1-matrix -> float conversion in '%f' formatting
            # (assumes a 1x1 result, which the original formatting also
            # required).
            self.assertEqual('%.5f' % pair[0, 0], '%.5f' % c)

    def test_5nn(self):
        """Check the five strongest neighbours of user 3712."""
        u = 3712
        nns = [(2824, 0.46291), (3867, 0.400275), (5062, 0.247693),
               (442, 0.22713), (3853, 0.19366)]
        S = pearson(self.model.R(), self.model.R())
        leave_top_n(S, 6)
        u_idx = self.data.new_user_idx(u)  # loop-invariant, look up once
        top_neighbours = [(self.data.old_user_idx(i), S[i, u_idx])
                          for i in S[:, u_idx].nonzero()[0]]
        top_neighbours.sort(key=lambda a: a[1], reverse=True)
        # skip the first element (corr = 1)
        self.assertEqual(
            ','.join(['%d,%.6f' % a for a in top_neighbours[1:]]),
            ','.join(['%d,%.6f' % a for a in nns]))

    def _assert_top3(self, expected, **knn_options):
        """Assert the top-3 recommendations for user 3712.

        ``expected`` is a list of ``(item id, score)`` pairs in order;
        ``knn_options`` are extra keyword arguments forwarded to
        ``user_based_knn``. Scores are compared at three decimals.
        """
        u = 3712
        R = user_based_knn(self.model, 5, [self.data.new_user_idx(u)],
                           range(self.data.num_items()), pearson,
                           promote_users=False, **knn_options)
        recs = top_ns([R], 3, keep_order=True)
        self.assertEqual(
            ','.join(['%d,%.3f' % (self.data.old_item_idx(a), b)
                      for (a, b) in recs[0]]),
            ','.join(['%d,%.3f' % a for a in expected]))

    # consider moving this test to test_recsys.py
    def test_unnormalized(self):
        """Top-3 recommendations without passing ``normalize``."""
        self._assert_top3([(641, 5.000), (603, 4.856), (105, 4.739)])

    # consider moving this test to test_recsys.py
    def test_normalized(self):
        """Top-3 recommendations with ``normalize='normalize'``."""
        self._assert_top3([(641, 5.900), (603, 5.546), (105, 5.501)],
                          normalize='normalize')
def setUp(self):
    """Load the ``ratings-ma4`` fixture and build a ``UserModel`` on it."""
    dataset = DataIO(verbose=False)
    dataset.load('testdata/ratings-ma4.csv')

    user_model = UserModel(verbose=False, normalize=False)
    user_model.build(dataset)

    # expose both objects to the test methods
    self.data = dataset
    self.model = user_model
class WA4Test(unittest.TestCase):
    """Tests Pearson similarity and user-based k-NN recommendations on the
    ``testdata/ratings-ma4.csv`` fixture."""

    def setUp(self):
        self.data = DataIO(verbose=False)
        self.data.load('testdata/ratings-ma4.csv')
        self.model = UserModel(normalize=False, verbose=False)
        self.model.build(self.data)

    def test_pearson(self):
        """Check the range, diagonal and two known values of ``pearson``."""
        # test correlation
        S = pearson(self.model.R(), self.model.R()).todense()
        # 1. check we don't have numbers more than 1
        # use string comparison to avoid float nuances
        self.assertEqual('%.2f' % S.max(), '1.00')
        # 2. check there are only '1' on the diagonal
        # (per element and with a tolerance; an exact == on a float sum is
        # sensitive to rounding)
        for i in range(S.shape[0]):
            self.assertAlmostEqual(S[i, i], 1.0)
        # 3. check a couple of correlation coefficients
        corr_test = [(1648, 5136, 0.40298), (918, 2824, -0.31706)]
        for (u1, u2, c) in corr_test:
            # check what's in the full matrix
            u1 = self.data.new_user_idx(u1)
            u2 = self.data.new_user_idx(u2)
            # check precomputed
            self.assertEqual('%.5f' % S[u1, u2], '%.5f' % c)
            # compute here
            pair = pearson(self.model.R()[u1, :],
                           self.model.R()[u2, :]).todense()
            # Index the single cell explicitly instead of relying on
            # implicit 1x1-matrix -> float conversion in '%f' formatting
            # (assumes a 1x1 result, which the original formatting also
            # required).
            self.assertEqual('%.5f' % pair[0, 0], '%.5f' % c)

    def test_5nn(self):
        """Check the five strongest neighbours of user 3712."""
        u = 3712
        nns = [(2824, 0.46291), (3867, 0.400275), (5062, 0.247693),
               (442, 0.22713), (3853, 0.19366)]
        S = pearson(self.model.R(), self.model.R())
        leave_top_n(S, 6)
        u_idx = self.data.new_user_idx(u)  # loop-invariant, look up once
        top_neighbours = [(self.data.old_user_idx(i), S[i, u_idx])
                          for i in S[:, u_idx].nonzero()[0]]
        top_neighbours.sort(key=lambda a: a[1], reverse=True)
        # skip the first element (corr = 1)
        self.assertEqual(
            ','.join(['%d,%.6f' % a for a in top_neighbours[1:]]),
            ','.join(['%d,%.6f' % a for a in nns]))

    def _assert_top3(self, expected, **knn_options):
        """Assert the top-3 recommendations for user 3712.

        ``expected`` is a list of ``(item id, score)`` pairs in order;
        ``knn_options`` are extra keyword arguments forwarded to
        ``user_based_knn``. Scores are compared at three decimals.
        """
        u = 3712
        R = user_based_knn(self.model, 5, [self.data.new_user_idx(u)],
                           range(self.data.num_items()), pearson,
                           promote_users=False, **knn_options)
        recs = top_ns([R], 3, keep_order=True)
        self.assertEqual(
            ','.join(['%d,%.3f' % (self.data.old_item_idx(a), b)
                      for (a, b) in recs[0]]),
            ','.join(['%d,%.3f' % a for a in expected]))

    # consider moving this test to test_recsys.py
    def test_unnormalized(self):
        """Top-3 recommendations without passing ``normalize``."""
        self._assert_top3([(641, 5.000), (603, 4.856), (105, 4.739)])

    # consider moving this test to test_recsys.py
    def test_normalized(self):
        """Top-3 recommendations with ``normalize='normalize'``."""
        self._assert_top3([(641, 5.900), (603, 5.546), (105, 5.501)],
                          normalize='normalize')