def testPredictionMethods(train_filename, eval_item_filename, user_means_filename):
    '''
    Run every prediction approach on the same evaluation users and compare them.

    For each line of the evaluation file a top-N list is built per method; the
    lists are then scored for recall against the ground-truth items and for
    pairwise overlap between every ordered pair of methods.

    Each evaluation line is tab-separated:
    user id, comma-separated ground-truth items, comma-separated unrated items.

    Returns a tuple (recalls, overlaps) of dicts. The values are running sums
    over all evaluated users (they are only divided by the user count for the
    progress log), keyed by method name and by 'method1_method2' respectively.
    '''
    logging.info('testing predictions with data files {0}; {1}; {2}...'.format(
        train_filename, eval_item_filename, user_means_filename))

    # models fitted on the mrec representation of the training data
    mrec_train_data = load_fast_sparse_matrix('tsv', train_filename)
    mrec_recommender = CosineKNNRecommender(config.NEIGHBOURHOOD_SIZE)
    mrec_recommender.fit(mrec_train_data)
    warp_recommender = WARPMFRecommender(d=50, gamma=0.01, C=100.0)
    warp_recommender.fit(mrec_train_data.X)

    # in-house training data plus the SVD output used by the 'mf' method
    train_data = trainData.TrainData(train_filename, user_means_filename)
    _, _, Q = sparsesvd(train_data.rating_matrix.tocsc(), config.FACTOR_MODEL_SIZE)

    methods = ['mrec', 'warp', 'mf',
               'ub_classic', 'ib_classic',
               'ub_damping', 'ib_damping',
               'ub_non', 'ib_non']
    # (top_recs key, fitted library model)
    library_models = (('mrec', mrec_recommender), ('warp', warp_recommender))
    # (top_recs key suffix, mode string handed to the neighbourhood methods)
    neighbourhood_variants = (('classic', 'classic'),
                              ('damping', 'self_damping'),
                              ('non', 'non_normalized'))

    recalls = {}
    overlaps = {}
    top_recs = {}
    user_counter = 0.0

    with open(eval_item_filename, 'rb') as eval_file:
        for line in eval_file:
            fields = line.split('\t')
            user_id = fields[0]
            ground_truth_items = fields[1].split(',')
            evaluation_item_ids = ground_truth_items + fields[2].rstrip('\n').split(',')

            # for each prediction method, compute the topN list once per user
            mrec_user_index = int(user_id) - config.MREC_INDEX_OFFSET
            for label, model in library_models:
                scored_items = model.recommend_items(
                    mrec_train_data.X, mrec_user_index,
                    max_items=10000, return_scores=True)
                top_recs[label] = topNLists.getTopNList(
                    scored_items, evaluation_item_ids=evaluation_item_ids)

            top_recs['mf'] = topNLists.getTopNList(
                train_data.getFactorBasedRecommendations(user_id, Q, evaluation_item_ids))

            for suffix, mode in neighbourhood_variants:
                top_recs['ub_' + suffix] = topNLists.getTopNList(
                    train_data.getUserBasedRecommendations(user_id, evaluation_item_ids, mode))
                top_recs['ib_' + suffix] = topNLists.getTopNList(
                    train_data.getItemBasedRecommendations(user_id, evaluation_item_ids, mode))

            # then fold this user's lists into the recall and overlap totals
            for method1 in methods:
                recall = topNLists.getRecall(ground_truth_items, top_recs[method1])
                if method1 not in recalls:
                    recalls[method1] = recall
                else:
                    recalls[method1] += recall

                for method2 in methods:
                    dict_key = method1 + '_' + method2
                    overlap = topNLists.computeRecommendationListOverlap(
                        top_recs[method1], top_recs[method2])
                    if dict_key not in overlaps:
                        overlaps[dict_key] = overlap
                    else:
                        overlaps[dict_key] += overlap

            user_counter += 1.0
            current_recalls = [(k, v/user_counter) for k, v in recalls.items()]
            current_overlaps = [(k, v/user_counter) for k, v in overlaps.items()]
            logging.info('Tested user {0}. Current recalls: {1}. Current overlaps: {2}'.format(
                user_id, current_recalls, current_overlaps))

    return recalls, overlaps
class KNNRecommender(object):
    """
    A wrapper for the mrec class CosineKNNRecommender.

    Attributes:
        k: neighbourhood size passed to CosineKNNRecommender.
        ks: numpy array of candidate values of k tried by modelSelect().
        folds: number of cross-validation folds used by modelSelect().
        numAucSamples: carried over by copy() and reported by __str__().
        numProcesses, chunkSize: settings for the (currently disabled)
            multiprocessing pool in modelSelect().
    """
    def __init__(self, k):
        self.k = k
        # Default the search grid to the current k so that copy() and
        # modelSelect() work even when the caller never assigns `ks`.
        # Callers remain free to overwrite it with their own array.
        self.ks = numpy.array([k])

        self.folds = 3
        self.numAucSamples = 100
        self.numProcesses = multiprocessing.cpu_count()
        self.chunkSize = 1

    def learnModel(self, X):
        """
        Fit a CosineKNNRecommender with the current k on X and remember X
        for later calls to predict().
        """
        self.X = X
        self.learner = CosineKNNRecommender(self.k)
        self.learner.fit(X)

    def predict(self, maxItems):
        """
        Return the learner's batch recommendations (without scores) for the
        matrix given to learnModel(), converted to a numpy array.
        """
        orderedItems = self.learner.batch_recommend_items(self.X, maxItems, return_scores=False)
        orderedItems = numpy.array(orderedItems)
        return orderedItems

    def modelSelect(self, X):
        """
        Perform model selection on X and return the best parameters.

        Every value in self.ks is evaluated with computePrecision on each
        cross-validation fold; self.k is set to the value with the highest
        mean precision. Returns (meanPrecisions, stdPrecisions), each indexed
        like self.ks.
        """
        cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
        numKs = self.ks.shape[0]
        precisions = numpy.zeros((numKs, len(cvInds)))

        logging.debug("Performing model selection")
        paramList = []

        # Build one task per (fold, k) pair, fold-major.
        for icv, (trainInds, testInds) in enumerate(cvInds):
            Util.printIteration(icv, 1, self.folds, "Fold: ")

            trainX = SparseUtils.submatrix(X, trainInds)
            testX = SparseUtils.submatrix(X, testInds)
            testOmegaList = SparseUtils.getOmegaList(testX)

            for k in self.ks:
                learner = self.copy()
                learner.k = k
                paramList.append((trainX, testX, testOmegaList, learner))

        # Tasks are evaluated serially and lazily. The parallel variant is:
        #pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
        #resultsIterator = pool.imap(computePrecision, paramList, self.chunkSize)
        # A generator expression and the next() builtin are used here because
        # itertools.imap and iterator.next() only exist on Python 2.
        resultsIterator = (computePrecision(params) for params in paramList)

        # Consume results in the same fold-major order the tasks were queued.
        for icv in range(len(cvInds)):
            for i in range(numKs):
                precisions[i, icv] = next(resultsIterator)

        #pool.terminate()

        meanPrecisions = numpy.mean(precisions, 1)
        stdPrecisions = numpy.std(precisions, 1)

        logging.debug(meanPrecisions)

        k = self.ks[numpy.argmax(meanPrecisions)]

        logging.debug("Model parameters: k=" + str(k))

        self.k = k

        return meanPrecisions, stdPrecisions

    def copy(self):
        """
        Return a new KNNRecommender with the same k, ks, folds and
        numAucSamples. The fitted learner and training matrix are not copied,
        and `ks` is shared by reference.
        """
        learner = KNNRecommender(self.k)
        learner.ks = self.ks
        learner.folds = self.folds
        learner.numAucSamples = self.numAucSamples

        return learner

    def __str__(self):
        outputStr = "KnnRecommender: k=" + str(self.k)
        outputStr += " numAucSamples=" + str(self.numAucSamples)

        return outputStr
raise ValueError('Wrong reranking method entered. Choose between: bs | div_c | div_r | sur_c | sur_r | sur_r_n | nov') result_object.file = result_file # create the training data and required recommendation models train_data = trainData.TrainData(train_filename, user_means_filename) Q = None library_recommender = None if options.algorithm == 'mf': _, _, Q = sparsesvd(train_data.rating_matrix.tocsc(), config.FACTOR_MODEL_SIZE) elif options.algorithm == 'mrec': mrec_train_data = load_fast_sparse_matrix('tsv', train_filename) library_recommender = CosineKNNRecommender(config.NEIGHBOURHOOD_SIZE) library_recommender.fit(mrec_train_data) elif options.algorithm == 'warp': mrec_train_data = load_fast_sparse_matrix('tsv', train_filename) library_recommender = WARPMFRecommender(d=config.FACTOR_MODEL_SIZE, gamma=0.01, C=options.cvalue) library_recommender.fit(mrec_train_data.X) elif options.algorithm in ['ub', 'ib']: pass else: raise ValueError('Wrong rec. algorithm entered. Choose between ub, ib, mf, mrec, and warp') # run the beyondAccuracy for all users in the .eval file logging.info('running beyondAccuracy with {0}...'.format(eval_item_filename)) evaluation_cases = 0 with open(eval_item_filename,'rb') as eval_file: