def test_top_n(self):
    """Top-N recommendations built from explicit seed ratings must exist."""
    seed_ratings = [
        {'movie_id': AVENGERS, 'rating': 7},
        {'movie_id': ALIEN, 'rating': 10},
        {'movie_id': CAPTAIN_AMERICA, 'rating': 6},
    ]
    recommender = NeighborhoodBasedRecs()
    result = recommender.recommend_items_by_ratings(10, seed_ratings)
    self.assertIsNotNone(result)
class FeatureWeightedLinearStacking(base_recommender):
    """Hybrid recommender blending content-based (cb) and collaborative (cf)
    predictions with feature-weighted linear stacking.

    The blend weights (wcb1/wcb2/wcf1/wcf2) are pre-trained regression
    coefficients; fun1/fun2 are the two feature functions they pair with.
    """

    def __init__(self):
        self.cb = ContentBasedRecs()
        self.cf = NeighborhoodBasedRecs()
        # Pre-trained stacking coefficients (suffix 1 pairs with fun1,
        # suffix 2 with fun2).
        self.wcb1 = Decimal(0.65221204)
        self.wcb2 = Decimal(-0.14638855)
        self.wcf1 = Decimal(-0.0062952)
        self.wcf2 = Decimal(0.09139193)

    def fun1(self):
        """Constant feature function: always 1 (plain bias on each base score)."""
        return Decimal(1.0)

    def fun2(self, user_id):
        """Indicator feature: 1 when the user has more than 3 ratings, else 0."""
        count = Rating.objects.filter(user_id=user_id).count()
        if count > 3.0:
            return Decimal(1.0)
        return Decimal(0.0)

    def recommend_items(self, user_id, num=6):
        """Return the top ``num`` blended recommendations for ``user_id``.

        Over-fetches num*5 candidates from each base recommender, fills in a
        missing base score with a direct prediction, then ranks by the blend.
        """
        cb_recs = self.cb.recommend_items(user_id, num * 5)
        cf_recs = self.cf.recommend_items(user_id, num * 5)

        combined_recs = dict()
        for rec in cb_recs:
            movie_id = rec[0]
            pred = rec[1]['prediction']
            combined_recs[movie_id] = {'cb': pred}
        for rec in cf_recs:
            movie_id = rec[0]
            pred = rec[1]['prediction']
            if movie_id in combined_recs.keys():
                combined_recs[movie_id]['cf'] = pred
            else:
                combined_recs[movie_id] = {'cf': pred}

        fwls_preds = dict()
        for key, recs in combined_recs.items():
            # A candidate surfaced by only one base recommender still needs
            # both scores for the blend; predict the missing one directly.
            if 'cb' not in recs.keys():
                recs['cb'] = self.cb.predict_score(user_id, key)
            if 'cf' not in recs.keys():
                recs['cf'] = self.cf.predict_score(user_id, key)
            pred = self.prediction(recs['cb'], recs['cf'], user_id)
            fwls_preds[key] = {'prediction': pred}
        sorted_items = sorted(
            fwls_preds.items(),
            key=lambda item: -float(item[1]['prediction']))[:num]
        return sorted_items

    def predict_score(self, user_id, item_id):
        """Predict a single blended score for (user, item)."""
        p_cb = self.cb.predict_score(user_id, item_id)
        p_cf = self.cf.predict_score(user_id, item_id)
        # FIX: the original computed the blend but never returned it, so
        # callers always received None.
        return self.prediction(p_cb, p_cf, user_id)

    def prediction(self, p_cb, p_cf, user_id):
        """Feature-weighted linear combination of the two base predictions."""
        p = (self.wcb1 * self.fun1() * p_cb +
             self.wcb2 * self.fun2(user_id) * p_cb +
             self.wcf1 * self.fun1() * p_cf +
             self.wcf2 * self.fun2(user_id) * p_cf)
        return p
def __init__(self):
    # Base recommenders whose predictions are blended by the stacking model.
    self.cb = ContentBasedRecs()
    self.cf = NeighborhoodBasedRecs()
    # Pre-trained feature-weighted linear stacking coefficients
    # (cb = content-based, cf = collaborative; suffix is the feature-function
    # index the weight pairs with).
    self.wcb1 = Decimal(0.65221204)
    self.wcb2 = Decimal(-0.14638855)
    self.wcf1 = Decimal(-0.0062952)
    self.wcf2 = Decimal(0.09139193)
def __init__(self, data_size=1000):
    """Set up the FWLS training pipeline.

    data_size: cap on the number of ratings pulled from the DB for training.
    """
    self.logger = logging.getLogger('FWLS')
    # Populated later by the data-loading step.
    self.train_data = None
    self.test_data = None
    self.rating_count = None
    # Base recommenders that produce the stacked features.
    self.cb = ContentBasedRecs()
    self.cf = NeighborhoodBasedRecs()
    self.fwls = FeatureWeightedLinearStacking()
    self.data_size = data_size
def test_predicting_score(self):
    """Predicted rating for DR_STRANGELOVE should land within 1 of 8."""
    user_ratings = {
        AVENGERS: 10,
        ALIEN: 10,
        CAPTAIN_AMERICA: 7,
    }
    recommender = NeighborhoodBasedRecs()
    predicted = recommender.predict_score_by_ratings(DR_STRANGELOVE,
                                                     user_ratings)
    self.assertTrue(abs(8 - predicted) < 1)
class CFCoverage(object):
    """Measures what fraction of the movie catalogue the collaborative
    filtering recommender can actually recommend (catalogue coverage)."""

    def __init__(self):
        self.all_users = Rating.objects.all().values('user_id').distinct()
        self.cf = NeighborhoodBasedRecs()
        # movie_id -> number of users whose rec lists contain it.
        self.items_in_rec = defaultdict(int)
        self.users_with_recs = []

    def calculate_coverage(self):
        """Run recommendations for every user and return the share of the
        catalogue that appeared in at least one rec list.

        Side effects: dumps the per-item counts to ``cf_coverage.json``.
        """
        print('calculating coverage for all users ({} in total)'.format(
            len(self.all_users)))
        for user in self.all_users:
            user_id = str(user['user_id'])
            recset = self.cf.recommend_items(user_id)
            if recset:
                self.users_with_recs.append(user)
                for rec in recset:
                    self.items_in_rec[rec[0]] += 1
                print('found recs for {}'.format(user_id))

        print('writing cf coverage to file.')
        # FIX: the original passed an unclosed open() to json.dump, leaking
        # the file handle; use a context manager.
        with open('cf_coverage.json', 'w') as coverage_file:
            json.dump(self.items_in_rec, coverage_file)

        no_movies = Movie.objects.all().count()
        no_movies_in_rec = len(self.items_in_rec)
        # FIX: guard the division — with no recommendable items the original
        # raised ZeroDivisionError.
        if no_movies == 0 or no_movies_in_rec == 0:
            return 0.0
        print("{} {} {}".format(no_movies, no_movies_in_rec,
                                float(no_movies / no_movies_in_rec)))
        return no_movies_in_rec / no_movies
def test_split_data(self):
    """split_data should keep users 1 and 2 for training and user 3 for test.

    User 3 has 5 ratings; with min_rank such that one rating stays in train,
    4 rows end up in the test frame and the remaining 16 in train.
    """
    ratings = pd.DataFrame(
        [
            [1, STAR_WARS, 9, '2013-10-12 23:21:27+00:00'],
            [1, WONDER_WOMAN, 10, '2014-10-12 23:22:27+00:00'],
            [1, AVENGERS, 10, '2015-11-12 23:20:27+00:00'],
            [1, WOLVERINE, 8, '2015-08-12 23:20:27+00:00'],
            [1, PIRATES_OF, 10, '2015-10-12 22:20:27+00:00'],
            [1, HARRY, 10, '2015-10-12 23:21:27+00:00'],
            [1, CAPTAIN_AMERICA, 10, '2014-10-12 23:20:27+00:00'],
            [1, ALIEN, 6, '2015-10-12 23:22:27+00:00'],
            [1, JACQUES, 6, '2015-10-12 11:20:27+00:00'],
            [2, STAR_WARS, 10, '2013-10-12 23:20:27+00:00'],
            [2, WONDER_WOMAN, 10, '2014-10-12 23:20:27+00:00'],
            [2, AVENGERS, 9, '2016-10-12 23:20:27+00:00'],
            [2, PIRATES_OF, 6, '2010-10-12 23:20:27+00:00'],
            [2, CAPTAIN_AMERICA, 10, '2005-10-12 23:20:27+00:00'],
            [2, DR_STRANGELOVE, 10, '2015-01-12 23:20:27+00:00'],
            [3, STAR_WARS, 9, '2013-10-12 20:20:27+00:00'],
            [3, AVENGERS, 10, '2015-10-12 10:20:27+00:00'],
            [3, PIRATES_OF, 9, '2013-03-12 23:20:27+00:00'],
            [3, HARRY, 8, '2016-10-13 23:20:27+00:00'],
            [3, DR_STRANGELOVE, 10, '2016-09-12 23:20:27+00:00'],
        ],
        columns=['user_id', 'movie_id', 'rating', 'rating_timestamp'])
    er = EvaluationRunner(5,
                          ItemSimilarityMatrixBuilder(1, min_sim=0.0),
                          NeighborhoodBasedRecs())
    test, train = er.split_data(2, ratings, [1, 2], [3])
    self.assertTrue(test is not None)
    # FIX: the original used assertTrue(test.shape[0], 4), which treats 4 as
    # the failure *message* and passes for any non-zero row count.
    self.assertEqual(test.shape[0], 4)
    self.assertEqual(train.shape[0], 16)
def evaluate_cf_recommender():
    """Sweep the neighbourhood size k (0..18 step 2) for the CF recommender
    and append ar/map/mae metrics per k to a timestamped CSV file."""
    min_number_of_ratings = 5
    min_overlap = 5
    min_sim = 0.1
    k = 10  # overwritten by the sweep below; kept as the documented default
    min_rank = 5
    timestr = time.strftime("%Y%m%d-%H%M%S")
    file_name = '{}-cf.csv'.format(timestr)
    # Line-buffered append so partial results survive a crash.
    with open(file_name, 'a', 1) as logfile:
        logfile.write(
            "ar, map, mae, min_overlap, min_sim, K, min_num_of_ratings, min_rank\n"
        )
        for k in np.arange(0, 20, 2):
            min_rank = min_number_of_ratings / 2
            recommender = NeighborhoodBasedRecs()
            er = EvaluationRunner(
                0,
                ItemSimilarityMatrixBuilder(min_overlap, min_sim=min_sim),
                recommender, k)
            result = er.calculate(min_number_of_ratings,
                                  min_rank,
                                  number_test_users=-1)
            # FIX: renamed from `map`, which shadowed the builtin.
            map_score = result['map']
            mae = result['mae']
            ar = result['ar']
            logfile.write("{}, {}, {}, {}, {}, {}, {}, {}\n".format(
                ar, map_score, mae, min_overlap, min_sim, k,
                min_number_of_ratings, min_rank))
            logfile.flush()
class FWLSCalculator(object):
    """Fits the feature-weighted linear stacking (FWLS) regression weights
    from real ratings, using cb/cf base predictions as stacked features."""

    def __init__(self, data_size=1000):
        self.logger = logging.getLogger('FWLS')
        # Populated by get_real_training_data().
        self.train_data = None
        self.test_data = None
        self.rating_count = None
        # Base recommenders that produce the stacked features.
        self.cb = ContentBasedRecs()
        self.cf = NeighborhoodBasedRecs()
        self.fwls = FeatureWeightedLinearStacking()
        # Cap on the number of ratings pulled from the DB.
        self.data_size = data_size

    def get_real_training_data(self):
        """Load up to data_size ratings from the DB and split 80/20
        into train_data / test_data."""
        columns = ['user_id', 'movie_id', 'rating', 'type']
        ratings_data = Rating.objects.all().values(*columns)[:self.data_size]
        df = pd.DataFrame.from_records(ratings_data, columns=columns)
        self.train_data, self.test_data = train_test_split(df, test_size=0.2)
        self.logger.debug("training data loaded {}".format(len(ratings_data)))

    def calculate_predictions_for_training_data(self):
        """Add 'cb' and 'cf' base-prediction columns to train_data
        (one predict_score call per row — slow on large samples)."""
        self.logger.debug("[BEGIN] getting predictions")
        self.train_data['cb'] = self.train_data.apply(
            lambda data: self.cb.predict_score(data['user_id'], data['movie_id'
                                                                    ]),
            axis=1)
        self.train_data['cf'] = self.train_data.apply(
            lambda data: self.cf.predict_score(data['user_id'], data['movie_id'
                                                                    ]),
            axis=1)
        self.logger.debug("[END] getting predictions")
        return None

    def calculate_feature_functions_for_training_data(self):
        """Expand the base predictions into the four FWLS regression features
        (prediction x feature-function value)."""
        self.logger.debug("[BEGIN] calculating functions")
        self.train_data['cb1'] = self.train_data.apply(
            lambda data: data['cb'] * self.fwls.fun1(), axis=1)
        self.train_data['cb2'] = self.train_data.apply(
            lambda data: data['cb'] * self.fwls.fun2(data['user_id']), axis=1)
        self.train_data['cf1'] = self.train_data.apply(
            lambda data: data['cf'] * self.fwls.fun1(), axis=1)
        self.train_data['cf2'] = self.train_data.apply(
            lambda data: data['cf'] * self.fwls.fun2(data['user_id']), axis=1)
        self.logger.debug("[END] calculating functions")
        return None

    def train(self):
        """Fit a linear regression of rating on the four stacked features and
        return its coefficients (cb1, cb2, cf1, cf2 order).

        NOTE(review): the intercept is not returned — confirm downstream
        consumers do not expect one.
        """
        # Earlier statsmodels variant kept for reference:
        #model = sm.ols(formula="rating ~ cb1+cb2+cf1+cf2", data=self.train_data[['rating', 'cb1','cb2','cf1','cf2']])
        #results = model.fit()
        #self.logger.info(results.summary())
        #self.logger.info(results.params)
        regr = linear_model.LinearRegression()
        regr.fit(self.train_data[['cb1', 'cb2', 'cf1', 'cf2']],
                 self.train_data['rating'])
        self.logger.info(regr.coef_)
        return regr.coef_
def recs_cf(request, user_id):
    """Django view: return collaborative-filtering recommendations for
    ``user_id`` as JSON. Optional GET param ``min_sim`` (default 0.1)."""
    # FIX: GET parameters arrive as strings; cast so NeighborhoodBasedRecs
    # receives a numeric similarity threshold in both cases.
    min_sim = float(request.GET.get('min_sim', 0.1))
    sorted_items = NeighborhoodBasedRecs(
        min_sim=min_sim).recommend_items(user_id)
    data = {'user_id': user_id, 'data': sorted_items}
    return JsonResponse(data, safe=False)
def __init__(self, save_path, data_size=1000):
    """Set up the FWLS training pipeline.

    save_path: directory where the fitted parameters are persisted.
    data_size: cap on the number of ratings pulled from the DB.
    """
    self.save_path = save_path
    self.logger = logging.getLogger('FWLS')
    # Populated later by the data-loading step.
    self.train_data = None
    self.test_data = None
    self.rating_count = None
    # Base recommenders that produce the stacked features.
    self.cb = ContentBasedRecs()
    self.cf = NeighborhoodBasedRecs()
    self.fwls = FeatureWeightedLinearStacking()
    self.data_size = data_size
def main():
    """CLI entry point: pick a recommender by name and run coverage on it."""
    parser = argparse.ArgumentParser()
    # FIX: the recommender is selected by name (a string); the original
    # declared type=int, so the string comparison below could never match
    # (and non-numeric input crashed argparse).
    parser.add_argument("rec", type=str, help="the recommender to use")
    args = parser.parse_args()
    rec = None
    if args.rec == "neighborhood_based_recommender":
        rec = NeighborhoodBasedRecs()
    RecommenderCoverage(rec)
def recs_cf(request, user_id, num=6):
    """Django view: return the top ``num`` CF recommendations for
    ``user_id`` as JSON. Optional GET param ``min_sim`` (default 0.1)."""
    # FIX: GET parameters arrive as strings; cast to float.
    min_sim = float(request.GET.get('min_sim', 0.1))
    sorted_items = NeighborhoodBasedRecs(min_sim=min_sim).recommend_items(
        user_id, num)
    # FIX: the original formatted {sorted_items} — a set literal wrapping a
    # list — which raises TypeError because lists are unhashable.
    print("cf sorted_items is: {}".format(sorted_items))
    data = {
        'user_id': user_id,
        'data': sorted_items
    }
    return JsonResponse(data, safe=False)
def item_news_feed(request, user_id, page_number):
    """Django view: paginated news feed of recommended items (5 per page).

    Returns {'NumberPages': 0, 'Data': None} when fewer than 10 recs exist.
    """
    # FIX: GET parameters arrive as strings; cast to float.
    min_sim = float(request.GET.get('min_sim', 0.1))
    sorted_items = NeighborhoodBasedRecs(
        min_sim=min_sim).recommend_items(user_id)
    # FIX: list.count() requires an argument (it counts occurrences of a
    # value) — the original call raised TypeError; len() is the item total.
    if len(sorted_items) < 10:
        response = {'NumberPages': 0, 'Data': None}
        return JsonResponse(response, safe=False)
    number_item_per_page = 5
    paginator = Paginator(sorted_items, number_item_per_page)
    number_pages = paginator.num_pages
    page_items = paginator.page(page_number).object_list
    data = []
    for item in page_items:
        # item is a (movie_id, details) pair; renamed from `id`, which
        # shadowed the builtin.
        item_id = item[0]
        rating = calculate_rating(item_id)
        data.append({'Id': item_id, 'Rating': rating})
    response = {'NumberPages': number_pages, 'Data': data}
    return JsonResponse(response, safe=False)
class FWLSCalculator(object):
    """Prototype FWLS weight-fitting pipeline.

    NOTE(review): this version has several unresolved references (see inline
    notes); it appears to be an early draft superseded elsewhere. Left
    byte-identical — fixing it would require inventing behavior.
    """

    def __init__(self):
        # NOTE(review): self.train (a DataFrame attribute) shadows the
        # train() method below — after __init__, instance.train() raises
        # TypeError. Confirm intended naming before use.
        self.train = None
        self.test = None
        self.rating_count = None
        # Base recommenders producing the stacked features.
        self.cb = ContentBasedRecs()
        self.cf = NeighborhoodBasedRecs()

    def get_real_training_data(self):
        """Load all ratings from the DB and split 80/20 into train/test."""
        columns = ['user_id', 'movie_id', 'rating', 'type']
        ratings_data = Rating.objects.all().values(*columns)
        df = pd.DataFrame.from_records(ratings_data, columns=columns)
        self.train, self.test = train_test_split(df, test_size=0.2)

    def get_training_data(self):
        """Load a tiny hard-coded sample (for local experimentation)."""
        print('load data')
        data = np.array([['1', '2', 3.6],
                         ['1', '3', 5.0],
                         ['1', '4', 5.0],
                         ['2', '2', 3.0]])
        self.train = pd.DataFrame(data,
                                  columns=['user_id', 'movie_id', 'rating'])
        self.rating_count = self.train.groupby('user_id').count().reset_index()
        return self.train

    def calculate_predictions_for_training_data(self):
        """Add 'cb' and 'cf' base-prediction columns to the training frame."""
        self.train['cb'] = self.train.apply(lambda data: self.cb.predict_score(
            data['user_id'], data['movie_id']), axis=1)
        self.train['cf'] = self.train.apply(lambda data: self.cf.predict_score(
            data['user_id'], data['movie_id']), axis=1)
        return None

    def calculate_feature_functions_for_training_data(self):
        """Expand base predictions into the four FWLS regression features.

        NOTE(review): self.func1/self.func2 are not defined on this class
        (other versions call self.fwls.fun1/fun2) — these calls will raise
        AttributeError. The cb1/cf1 applies are also missing axis=1, unlike
        cb2/cf2. Confirm against the maintained FWLSCalculator.
        """
        self.train['cb1'] = self.train.apply(
            lambda data: data.cb * self.func1())
        self.train['cb2'] = self.train.apply(
            lambda data: data.cb * self.func2(data['user_id']), axis=1)
        self.train['cf1'] = self.train.apply(
            lambda data: data.cf * self.func1())
        self.train['cf2'] = self.train.apply(
            lambda data: data.cf * self.func2(data['user_id']), axis=1)
        return None

    def train(self):
        """Fit rating ~ cb1+cb2+cf1+cf2 with statsmodels and print the result.

        NOTE(review): references a global `fwls` that is not defined in this
        block, and the method itself is shadowed by the self.train attribute
        set in __init__.
        """
        result = sm.ols(formula="rating ~ cb1+cb2+cf1+cf2",
                        data=fwls.train).fit()
        print(result)
def news_feed(request, user_id, longitude, latitude, page_number):
    """Django view: paginated food news feed for a user, enriched with
    rating, distance and open/closed status from the 'sql_db' database."""
    # FIX: GET parameters arrive as strings; cast to float.
    min_sim = float(request.GET.get('min_sim', 0.1))
    sorted_items = NeighborhoodBasedRecs(
        min_sim=min_sim).recommend_items(user_id)
    print('DEBUG sorted items len {}'.format(len(sorted_items)))
    number_item_per_page = 5
    paginator = Paginator(sorted_items, number_item_per_page)
    number_pages = paginator.num_pages
    page_items = paginator.page(page_number).object_list
    data = []
    tz = pytz.timezone('Asia/Saigon')
    current_time = datetime.datetime.now(tz).time()
    # weekday() is 0-based (Mon=0); the DB columns appear to be 1-based.
    week_day = datetime.datetime.today().weekday() + 1
    for item in page_items:
        food_id = item[0]
        food = Food.objects.using('sql_db').filter(id=food_id).first()
        # FIX: identity comparison with None (was `== None`).
        if food is None:
            continue
        menu = food.menuid
        restaurant = menu.restaurantid
        user = Userdetail.objects.using('sql_db').filter(
            userid=restaurant.userid).first()
        if user is None:
            continue
        rating = calculate_rating(food_id)
        distance = calculate_distance(restaurant.long, restaurant.lat,
                                      float(longitude), float(latitude))
        # FIX: dropped the redundant `and True or False` — the comparison
        # chain is already a boolean.
        status = (restaurant.opentime <= current_time <= restaurant.closetime
                  and restaurant.openfromday <= week_day <= restaurant.opentoday)
        # NOTE(review): 'Address', 'Status' and 'Distance' are hard-coded
        # placeholder strings even though `status` and `distance` are
        # computed above — confirm whether the real values should be emitted.
        value = {
            'Id': food.id,
            'Name': food.name,
            'Address': "status",
            'FoodImage': food.defaultimage,
            'Rating': rating,
            'Status': "status",
            'Distance': "distance"
        }
        data.append(value)
    response = {'NumberPages': number_pages, 'Data': data}
    return JsonResponse(response, safe=False)
def evaluate_cf_recommender():
    """Sweep min_overlap (0..18 step 2) for the item-similarity builder and
    log rak/pak/mae plus coverage per setting to a timestamped CSV."""
    min_number_of_ratings = 20
    min_overlap = 5
    min_sim = 0.1
    K = 20
    min_rank = 5
    timestr = time.strftime("%Y%m%d-%H%M%S")
    file_name = '{}-min_overlap_item_similarity.csv'.format(timestr)
    # Line-buffered append so partial results survive a crash.
    with open(file_name, 'a', 1) as logfile:
        logfile.write(
            "rak, pak, mae, min_overlap, min_sim, K, min_num_of_ratings, min_rank, user_coverage, "
            "movie_coverage\n")
        for min_overlap in np.arange(0, 20, 2):
            min_rank = min_number_of_ratings / 2
            # FIX: the builder was constructed once BEFORE the loop with the
            # initial min_overlap=5, so the swept value never reached it and
            # every iteration measured the same configuration. Build it per
            # iteration with the current min_overlap.
            builder = ItemSimilarityMatrixBuilder(min_overlap, min_sim=min_sim)
            recommender = NeighborhoodBasedRecs()
            er = EvaluationRunner(0, builder, recommender, K)
            # Baseline alternative kept for reference:
            # er = EvaluationRunner(3, None, PopularityBasedRecs(), K)
            result = er.calculate(min_number_of_ratings,
                                  min_rank,
                                  number_test_users=-1)
            user_coverage, movie_coverage = RecommenderCoverage(
                recommender).calculate_coverage()
            pak = result['pak']
            mae = result['mae']
            rak = result['rak']
            logfile.write("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}\n".format(
                rak, pak, mae, min_overlap, min_sim, K, min_number_of_ratings,
                min_rank, user_coverage, movie_coverage))
            logfile.flush()
if __name__ == '__main__':
    # Sweep min_number_of_ratings and log pak/mae per setting to a
    # timestamped CSV file.
    min_number_of_ratings = 30
    min_overlap = 25
    min_sim = 0
    K = 25  # redo
    min_rank = 5
    timestr = time.strftime("%Y%m%d-%H%M%S")
    file_name = '{}-min_number_of_ratings_training.csv'.format(timestr)
    # Line-buffered append so partial results survive a crash.
    with open(file_name, 'a', 1) as logfile:
        # FIX: added a timestamp column — the original format string had 7
        # placeholders but 8 arguments, so datetime.now() was silently
        # dropped from every row.
        logfile.write(
            "pak, mae, min_overlap, min_sim, K, min_num_of_ratings, min_rank, timestamp\n"
        )
        for min_number_of_ratings in np.arange(5, 30, 10):
            min_rank = min_number_of_ratings / 2
            min_overlap = min_number_of_ratings - min_rank
            er = EvaluationRunner(
                3,
                ItemSimilarityMatrixBuilder(min_overlap, min_sim=min_sim),
                NeighborhoodBasedRecs(), K)
            result = er.calculate(min_number_of_ratings,
                                  min_rank,
                                  number_test_users=1000)
            pak = result['pak']
            mae = result['mae']
            logfile.write("{}, {}, {}, {}, {}, {}, {}, {} \n".format(
                pak, mae, min_overlap, min_sim, K, min_number_of_ratings,
                min_rank, datetime.now()))
from analytics_rating as rating1 where rank < 3""" columns = ['user_id', 'movie_id', 'rating', 'type'] rating_data = data_helper.get_data_frame(sql, columns) print('found {} ratings'.format(rating_data.count())) return rating_data if __name__ == '__main__': TEST = False if TEST: er = EvaluationRunner(5, ItemSimilarityMatrixBuilder(2), NeighborhoodBasedRecs()) ratings = pd.DataFrame( [ [1, '11', 5, '2013-10-12 23:20:27+00:00'], [1, '12', 3, '2014-10-12 23:20:27+00:00'], [1, '14', 2, '2015-10-12 23:20:27+00:00'], [2, '11', 4, '2013-10-12 23:20:27+00:00'], [2, '12', 3, '2014-10-12 23:20:27+00:00'], [2, '13', 4, '2015-10-12 23:20:27+00:00'], [3, '11', 5, '2013-10-12 23:20:27+00:00'], [3, '12', 2, '2014-10-12 23:20:27+00:00'], [3, '13', 5, '2015-10-12 23:20:27+00:00'], [3, '14', 2, '2016-10-12 23:20:27+00:00'], [4, '11', 3, '2013-10-12 23:20:27+00:00'], [4, '12', 5, '2014-10-12 23:20:27+00:00'], [4, '13', 3, '2015-10-12 23:20:27+00:00'],
help="run evaluation on rank rec", action="store_true") args = parser.parse_args() print(args.fwls) k = 10 cov = None if args.fwls: logger.debug("evaluating coverage of fwls") cov = RecommenderCoverage(FeatureWeightedLinearStacking) cov.calculate_coverage(K=k, recName='fwls{}'.format(k)) if args.cf: logger.debug("evaluating coverage of cf") cov = RecommenderCoverage(NeighborhoodBasedRecs()) cov.calculate_coverage(K=k, recName='cf{}'.format(k)) if args.cb: logger.debug("evaluating coverage of cb") cov = RecommenderCoverage(ContentBasedRecs()) cov.calculate_coverage(K=k, recName='cb{}'.format(k)) if args.ltr: logger.debug("evaluating coverage of ltr") cov = RecommenderCoverage(BPRRecs()) cov.calculate_coverage(K=k, recName='bpr{}'.format(k)) if args.funk: logger.debug("evaluating coverage of funk") cov = RecommenderCoverage(FunkSVDRecs())
def __init__(self):
    # Training/test DataFrames, populated by the data-loading step.
    self.train = None
    self.test = None
    # Per-user rating counts, populated alongside the training data.
    self.rating_count = None
    # Base recommenders whose predictions are combined downstream.
    self.cb = ContentBasedRecs()
    self.cf = NeighborhoodBasedRecs()
def __init__(self):
    # Distinct user ids with at least one rating — the population to cover.
    self.all_users = Rating.objects.all().values('user_id').distinct()
    self.cf = NeighborhoodBasedRecs()
    # movie_id -> number of users whose recommendation lists contain it.
    self.items_in_rec = defaultdict(int)
    # Users for whom the recommender produced a non-empty rec list.
    self.users_with_recs = []
def test_predicting_score(self):
    """Predicted score for DR_STRANGELOVE should be within 1 of 8."""
    rated = {AVENGERS: 10,
             ALIEN: 10,
             CAPTAIN_AMERICA: 7}
    predicted = NeighborhoodBasedRecs().predict_score_by_ratings(
        DR_STRANGELOVE, rated)
    self.assertTrue(abs(8 - predicted) < 1)
len(self.all_users))) for user in self.all_users: user_id = str(user['user_id']) recset = self.cf.recommend_items(user_id) if recset: self.users_with_recs.append(user) for rec in recset: self.items_in_rec[rec[0]] += 1 print('found recs for {}'.format(user_id)) print('writing cf coverage to file.') json.dump(self.items_in_rec, open('cf_coverage.json', 'w')) no_movies = Movie.objects.all().count() no_movies_in_rec = len(self.items_in_rec.items()) print("{} {} {}".format(no_movies, no_movies_in_rec, float(no_movies / no_movies_in_rec))) return no_movies_in_rec / no_movies if __name__ == '__main__': # print("Calculating coverage...") # CFCoverage().calculate_coverage() print("Calculating Precision at K") pak = PrecissionAtK(5, NeighborhoodBasedRecs(), ItemSimilarityMatrixBuilder()) pak.calculate_old()
class FeatureWeightedLinearStacking(base_recommender):
    """Hybrid recommender blending content-based (cb) and collaborative (cf)
    predictions via feature-weighted linear stacking, with an intercept.

    Weights default to pre-trained values and can be reloaded from disk with
    set_save_path().
    """

    def __init__(self):
        self.cb = ContentBasedRecs()
        self.cf = NeighborhoodBasedRecs()
        # Pre-trained stacking coefficients (suffix 1 pairs with fun1,
        # suffix 2 with fun2) plus regression intercept.
        self.wcb1 = Decimal(0.65221204)
        self.wcb2 = Decimal(-0.14638855)
        self.wcf1 = Decimal(-0.0062952)
        self.wcf2 = Decimal(0.09139193)
        self.intercept = Decimal(0)

    def fun1(self):
        """Constant feature function: always 1 (plain bias on each base score)."""
        return Decimal(1.0)

    def fun2(self, user_id):
        """Indicator feature: 1 when the user has more than 3 ratings, else 0."""
        count = Rating.objects.filter(user_id=user_id).count()
        if count > 3.0:
            return Decimal(1.0)
        return Decimal(0.0)

    def set_save_path(self, save_path):
        """Load trained stacking parameters from ``save_path``."""
        with open(save_path + 'fwls_parameters.data', 'rb') as ub_file:
            parameters = pickle.load(ub_file)
            self.wcb1 = Decimal(parameters['cb1'])
            self.wcb2 = Decimal(parameters['cb2'])
            # FIX: the original loaded wcf1 from parameters['cb1'] (copy-paste
            # error), silently reusing the content-based weight.
            self.wcf1 = Decimal(parameters['cf1'])
            self.wcf2 = Decimal(parameters['cf2'])
            self.intercept = Decimal(parameters['intercept'])

    def recommend_items_by_ratings(self, user_id, active_user_items, num=6):
        """Top ``num`` blended recs seeded from explicit ratings."""
        cb_recs = self.cb.recommend_items_by_ratings(user_id,
                                                     active_user_items,
                                                     num * 5)
        cf_recs = self.cf.recommend_items_by_ratings(user_id,
                                                     active_user_items,
                                                     num * 5)
        return self.merge_predictions(user_id, cb_recs, cf_recs, num)

    def recommend_items(self, user_id, num=6):
        """Top ``num`` blended recs for ``user_id``."""
        cb_recs = self.cb.recommend_items(user_id, num * 5)
        cf_recs = self.cf.recommend_items(user_id, num * 5)
        return self.merge_predictions(user_id, cb_recs, cf_recs, num)

    def merge_predictions(self, user_id, cb_recs, cf_recs, num):
        """Union the two candidate lists, fill missing base scores with a
        direct prediction, and rank by the blended prediction."""
        combined_recs = dict()
        for rec in cb_recs:
            movie_id = rec[0]
            pred = rec[1]['prediction']
            combined_recs[movie_id] = {'cb': pred}
        for rec in cf_recs:
            movie_id = rec[0]
            pred = rec[1]['prediction']
            if movie_id in combined_recs.keys():
                combined_recs[movie_id]['cf'] = pred
            else:
                combined_recs[movie_id] = {'cf': pred}
        fwls_preds = dict()
        for key, recs in combined_recs.items():
            if 'cb' not in recs.keys():
                recs['cb'] = self.cb.predict_score(user_id, key)
            if 'cf' not in recs.keys():
                recs['cf'] = self.cf.predict_score(user_id, key)
            pred = self.prediction(recs['cb'], recs['cf'], user_id)
            fwls_preds[key] = {'prediction': pred}
        sorted_items = sorted(
            fwls_preds.items(),
            key=lambda item: -float(item[1]['prediction']))[:num]
        return sorted_items

    def predict_score(self, user_id, item_id):
        """Predict a single blended score for (user, item)."""
        p_cb = self.cb.predict_score(user_id, item_id)
        p_cf = self.cf.predict_score(user_id, item_id)
        # FIX: the original computed the blend but never returned it, so
        # callers always received None.
        return self.prediction(p_cb, p_cf, user_id)

    def prediction(self, p_cb, p_cf, user_id):
        """Feature-weighted linear combination of the base predictions,
        shifted by the regression intercept."""
        p = (self.wcb1 * self.fun1() * p_cb +
             self.wcb2 * self.fun2(user_id) * p_cb +
             self.wcf1 * self.fun1() * p_cf +
             self.wcf2 * self.fun2(user_id) * p_cf)
        return p + self.intercept
def test(self):
    """End-to-end evaluation of the CF recommender on a fixed 10-user
    rating set: MAE and precision/recall@k must stay under loose bounds."""
    er = EvaluationRunner(5,
                          ItemSimilarityMatrixBuilder(1, min_sim=0.0),
                          NeighborhoodBasedRecs())
    # 10 users x up to 9 movies; timestamps matter because the evaluation
    # splits each user's history chronologically.
    ratings = pd.DataFrame(
        [
            [1, STAR_WARS, 9, '2013-10-12 23:21:27+00:00'],
            [1, WONDER_WOMAN, 10, '2014-10-12 23:22:27+00:00'],
            [1, AVENGERS, 10, '2015-11-12 23:20:27+00:00'],
            [1, WOLVERINE, 8, '2015-08-12 23:20:27+00:00'],
            [1, PIRATES_OF, 10, '2015-10-12 22:20:27+00:00'],
            [1, HARRY, 10, '2015-10-12 23:21:27+00:00'],
            [1, CAPTAIN_AMERICA, 10, '2014-10-12 23:20:27+00:00'],
            [1, ALIEN, 6, '2015-10-12 23:22:27+00:00'],
            [1, JACQUES, 6, '2015-10-12 11:20:27+00:00'],
            [2, STAR_WARS, 10, '2013-10-12 23:20:27+00:00'],
            [2, WONDER_WOMAN, 10, '2014-10-12 23:20:27+00:00'],
            [2, AVENGERS, 9, '2016-10-12 23:20:27+00:00'],
            [2, PIRATES_OF, 6, '2010-10-12 23:20:27+00:00'],
            [2, CAPTAIN_AMERICA, 10, '2005-10-12 23:20:27+00:00'],
            [2, DR_STRANGELOVE, 10, '2015-01-12 23:20:27+00:00'],
            [3, STAR_WARS, 9, '2013-10-12 20:20:27+00:00'],
            [3, AVENGERS, 10, '2015-10-12 10:20:27+00:00'],
            [3, PIRATES_OF, 9, '2013-03-12 23:20:27+00:00'],
            [3, HARRY, 8, '2016-10-13 23:20:27+00:00'],
            [3, DR_STRANGELOVE, 10, '2016-09-12 23:20:27+00:00'],
            [4, STAR_WARS, 8, '2013-10-12 23:20:27+00:00'],
            [4, WONDER_WOMAN, 8, '2014-10-12 23:20:27+00:00'],
            [4, AVENGERS, 9, '2015-10-12 23:20:27+00:00'],
            [4, PIRATES_OF, 5, '2013-10-12 23:20:27+00:00'],
            [4, HARRY, 6, '2014-10-12 23:20:27+00:00'],
            [4, ALIEN, 8, '2015-10-12 23:20:27+00:00'],
            [4, DR_STRANGELOVE, 9, '2015-10-12 23:20:27+00:00'],
            [5, STAR_WARS, 6, '2013-10-12 23:20:27+00:00'],
            [5, AVENGERS, 6, '2014-10-12 23:20:27+00:00'],
            [5, WOLVERINE, 8, '2015-10-12 23:20:27+00:00'],
            [5, PIRATES_OF, 2, '2016-10-12 23:20:27+00:00'],
            [5, HARRY, 10, '2016-10-12 23:20:27+00:00'],
            [5, CAPTAIN_AMERICA, 6, '2016-10-12 23:20:27+00:00'],
            [5, ALIEN, 4, '2016-10-12 23:20:27+00:00'],
            [5, DR_STRANGELOVE, 8, '2016-10-12 23:20:27+00:00'],
            [5, JACQUES, 10, '2016-10-12 23:20:27+00:00'],
            [6, STAR_WARS, 8, '2013-10-12 23:20:27+00:00'],
            [6, WONDER_WOMAN, 8, '2014-10-12 23:20:27+00:00'],
            [6, AVENGERS, 8, '2014-10-12 23:20:27+00:00'],
            [6, WOLVERINE, 8, '2015-10-12 23:20:27+00:00'],
            [6, PIRATES_OF, 6, '2016-10-12 23:20:27+00:00'],
            [6, HARRY, 10, '2016-10-12 23:20:27+00:00'],
            [6, JACQUES, 8, '2016-10-12 23:20:27+00:00'],
            [7, AVENGERS, 10, '2014-10-12 23:20:27+00:00'],
            [7, PIRATES_OF, 3, '2016-10-12 23:20:27+00:00'],
            [7, HARRY, 1, '2016-10-12 23:20:27+00:00'],
            [7, ALIEN, 8, '2016-10-12 23:20:27+00:00'],
            [7, DR_STRANGELOVE, 10, '2016-10-12 23:20:27+00:00'],
            [8, STAR_WARS, 9, '2013-10-12 23:20:27+00:00'],
            [8, WONDER_WOMAN, 7, '2014-10-12 23:20:27+00:00'],
            [8, AVENGERS, 7, '2014-10-12 23:20:27+00:00'],
            [8, WOLVERINE, 7, '2015-10-12 23:20:27+00:00'],
            [8, PIRATES_OF, 8, '2016-10-12 23:20:27+00:00'],
            [8, HARRY, 8, '2016-10-12 23:20:27+00:00'],
            [8, ALIEN, 8, '2016-10-12 23:20:27+00:00'],
            [8, DR_STRANGELOVE, 8, '2016-10-12 23:20:27+00:00'],
            [8, JACQUES, 10, '2016-10-12 23:20:27+00:00'],
            [9, WONDER_WOMAN, 7, '2014-10-12 23:20:27+00:00'],
            [9, AVENGERS, 8, '2014-10-12 23:20:27+00:00'],
            [9, WOLVERINE, 8, '2015-10-12 23:20:27+00:00'],
            [9, PIRATES_OF, 7, '2016-10-12 23:20:27+00:00'],
            [9, HARRY, 8, '2016-10-12 23:20:27+00:00'],
            [9, CAPTAIN_AMERICA, 10, '2016-10-12 23:20:27+00:00'],
            [9, DR_STRANGELOVE, 10, '2016-10-12 23:20:27+00:00'],
            [9, JACQUES, 7, '2016-10-12 23:20:27+00:00'],
            [10, AVENGERS, 7, '2014-10-12 23:20:27+00:00'],
            [10, HARRY, 10, '2016-10-12 23:20:27+00:00'],
            [10, CAPTAIN_AMERICA, 6, '2016-10-12 23:20:27+00:00'],
            [10, DR_STRANGELOVE, 8, '2016-10-12 23:20:27+00:00'],
        ],
        columns=['user_id', 'movie_id', 'rating', 'rating_timestamp'])
    result = er.calculate_using_ratings(ratings,
                                        min_number_of_ratings=4,
                                        min_rank=5)
    #figure out what to do with result ;)
    # Loose sanity bounds rather than exact values, since the metrics depend
    # on the chronological split.
    self.assertLess(result['mae'], decimal.Decimal(1.7))
    self.assertLess(result['pak'], decimal.Decimal(0.7))
    self.assertLess(result['rak'], decimal.Decimal(0.7))
    print(result)
class FWLSCalculator(object):
    """Fits and persists the feature-weighted linear stacking weights
    (cb1/cb2/cf1/cf2 plus intercept) from real or provided rating data."""

    def __init__(self, save_path, data_size=1000):
        # Directory where the fitted parameters are pickled.
        self.save_path = save_path
        self.logger = logging.getLogger('FWLS')
        # Populated by the data-loading step.
        self.train_data = None
        self.test_data = None
        self.rating_count = None
        # Base recommenders that produce the stacked features.
        self.cb = ContentBasedRecs()
        self.cf = NeighborhoodBasedRecs()
        self.fwls = FeatureWeightedLinearStacking()
        # Cap on the number of ratings pulled from the DB / sampled.
        self.data_size = data_size

    def get_real_training_data(self):
        """Load up to data_size ratings from the DB and split 80/20 into
        train_data / test_data."""
        columns = ['user_id', 'movie_id', 'rating', 'type']
        ratings_data = Rating.objects.all().values(*columns)[:self.data_size]
        df = pd.DataFrame.from_records(ratings_data, columns=columns)
        self.train_data, self.test_data = train_test_split(df, test_size=0.2)
        self.logger.debug("training data loaded {}".format(len(ratings_data)))

    def calculate_predictions_for_training_data(self):
        """Add 'cb' and 'cf' base-prediction columns to train_data
        (one predict_score call per row — slow on large samples)."""
        self.logger.debug("[BEGIN] getting predictions")
        self.train_data['cb'] = self.train_data.apply(lambda data: self.cb.predict_score(data['user_id'],
                                                                                         data['movie_id']),
                                                      axis=1)
        self.train_data['cf'] = self.train_data.apply(lambda data: self.cf.predict_score(data['user_id'],
                                                                                         data['movie_id']),
                                                      axis=1)
        self.logger.debug("[END] getting predictions")
        return None

    def calculate_feature_functions_for_training_data(self):
        """Expand the base predictions into the four FWLS regression features
        (prediction x feature-function value)."""
        self.logger.debug("[BEGIN] calculating functions")
        self.train_data['cb1'] = self.train_data.apply(lambda data: data['cb'] * self.fwls.fun1(), axis=1)
        self.train_data['cb2'] = self.train_data.apply(lambda data: data['cb'] * self.fwls.fun2(data['user_id']),
                                                       axis=1)
        self.train_data['cf1'] = self.train_data.apply(lambda data: data['cf'] * self.fwls.fun1(), axis=1)
        self.train_data['cf2'] = self.train_data.apply(lambda data: data['cf'] * self.fwls.fun2(data['user_id']),
                                                       axis=1)
        self.logger.debug("[END] calculating functions")
        return None

    def build(self, train_data=None, params=None):
        """Full pipeline: load/sample data, compute features, fit the model.

        params may override 'save_path' and 'data_sample'; when train_data is
        given it is used (optionally down-sampled to data_size), otherwise
        ratings are loaded from the DB.
        """
        if params:
            self.save_path = params['save_path']
            self.data_size = params['data_sample']
        if train_data is not None:
            self.train_data = train_data
            if self.data_size > 0:
                self.train_data = self.train_data.sample(self.data_size)
                self.logger.debug("training sample of size {}".format(self.train_data.shape[0]))
        else:
            self.get_real_training_data()
        self.calculate_predictions_for_training_data()
        self.calculate_feature_functions_for_training_data()
        return self.train()

    def train(self, ratings=None, train_feature_recs=False):
        """Fit rating ~ cb1+cb2+cf1+cf2, pickle the coefficients and
        intercept to save_path, and return them as a dict.

        train_feature_recs: when True, (re)build the item-similarity matrix
        and LDA model for the base recommenders first.
        """
        if train_feature_recs:
            ItemSimilarityMatrixBuilder().build(ratings)
            LdaModel.build()
        # NOTE(review): `normalize=True` was removed from LinearRegression in
        # scikit-learn 1.2 — confirm the pinned sklearn version.
        regr = linear_model.LinearRegression(fit_intercept=True,
                                             n_jobs=-1,
                                             normalize=True)
        regr.fit(self.train_data[['cb1', 'cb2', 'cf1', 'cf2']],
                 self.train_data['rating'])
        self.logger.info(regr.coef_)
        result = {'cb1': regr.coef_[0],
                  'cb2': regr.coef_[1],
                  'cf1': regr.coef_[2],
                  'cf2': regr.coef_[3],
                  'intercept': regr.intercept_}
        self.logger.debug(result)
        self.logger.debug(self.train_data.iloc[100])
        ensure_dir(self.save_path)
        with open(self.save_path + 'fwls_parameters.data', 'wb') as ub_file:
            pickle.dump(result, ub_file)
        return result
class FWLSCalculator(object):
    """Fits and persists the feature-weighted linear stacking weights
    (cb1/cb2/cf1/cf2) from real or provided rating data."""

    def __init__(self, save_path, data_size=1000):
        # Directory where the fitted parameters are pickled.
        self.save_path = save_path
        self.logger = logging.getLogger('FWLS')
        # Populated by the data-loading step.
        self.train_data = None
        self.test_data = None
        self.rating_count = None
        # Base recommenders that produce the stacked features.
        self.cb = ContentBasedRecs()
        self.cf = NeighborhoodBasedRecs()
        self.fwls = FeatureWeightedLinearStacking()
        self.data_size = data_size

    def get_real_training_data(self):
        """Load up to data_size ratings from the DB and split 80/20 into
        train_data / test_data."""
        columns = ['user_id', 'movie_id', 'rating', 'type']
        ratings_data = Rating.objects.all().values(*columns)[:self.data_size]
        df = pd.DataFrame.from_records(ratings_data, columns=columns)
        self.train_data, self.test_data = train_test_split(df, test_size=0.2)
        self.logger.debug("training data loaded {}".format(len(ratings_data)))

    def calculate_predictions_for_training_data(self):
        """Add 'cb' and 'cf' base-prediction columns to train_data."""
        self.logger.debug("[BEGIN] getting predictions")
        self.train_data['cb'] = self.train_data.apply(
            lambda data: self.cb.predict_score(data['user_id'],
                                               data['movie_id']), axis=1)
        self.train_data['cf'] = self.train_data.apply(
            lambda data: self.cf.predict_score(data['user_id'],
                                               data['movie_id']), axis=1)
        self.logger.debug("[END] getting predictions")
        return None

    def calculate_feature_functions_for_training_data(self):
        """Expand the base predictions into the four FWLS regression
        features (prediction x feature-function value)."""
        self.logger.debug("[BEGIN] calculating functions")
        self.train_data['cb1'] = self.train_data.apply(
            lambda data: data['cb'] * self.fwls.fun1(), axis=1)
        self.train_data['cb2'] = self.train_data.apply(
            lambda data: data['cb'] * self.fwls.fun2(data['user_id']), axis=1)
        self.train_data['cf1'] = self.train_data.apply(
            lambda data: data['cf'] * self.fwls.fun1(), axis=1)
        self.train_data['cf2'] = self.train_data.apply(
            lambda data: data['cf'] * self.fwls.fun2(data['user_id']), axis=1)
        self.logger.debug("[END] calculating functions")
        return None

    def build(self, train_data=None, params=None):
        """Full pipeline: load data, compute features, fit the model."""
        if params:
            self.save_path = params['save_path']
        # FIX: the original assigned self.train_data = train_data
        # unconditionally AFTER loading the real data, clobbering the loaded
        # frame with None whenever no train_data argument was supplied.
        if train_data is None:
            self.get_real_training_data()
        else:
            self.train_data = train_data
        self.calculate_predictions_for_training_data()
        self.calculate_feature_functions_for_training_data()
        return self.train()

    def train(self, ratings=None, train_feature_recs=False):
        """Fit rating ~ cb1+cb2+cf1+cf2, pickle the coefficients to
        save_path, and return them as a dict.

        train_feature_recs: when True, (re)build the item-similarity matrix
        and LDA model for the base recommenders first.

        NOTE(review): the persisted dict has no 'intercept' key — loaders
        that expect one will raise KeyError; confirm against the consumer.
        """
        if train_feature_recs:
            ItemSimilarityMatrixBuilder().build(ratings)
            LdaModel.build()
        regr = linear_model.LinearRegression()
        regr.fit(self.train_data[['cb1', 'cb2', 'cf1', 'cf2']],
                 self.train_data['rating'])
        self.logger.info(regr.coef_)
        result = {'cb1': regr.coef_[0],
                  'cb2': regr.coef_[1],
                  'cf1': regr.coef_[2],
                  'cf2': regr.coef_[3]}
        ensure_dir(self.save_path)
        with open(self.save_path + 'fwls_parameters.data', 'wb') as ub_file:
            pickle.dump(result, ub_file)
        return result
def test(self):
    """Smoke-test: run the full CF evaluation on a fixed 10-user rating set
    and print the resulting metrics (no assertions in this variant)."""
    er = EvaluationRunner(5,
                          ItemSimilarityMatrixBuilder(1, min_sim=0.0),
                          NeighborhoodBasedRecs())
    # Local movie-id constants so the fixture is self-contained.
    STAR_WARS = 'star wars'
    WONDER_WOMAN = 'wonder woman'
    AVENGERS = 'avengers'
    WOLVERINE = 'logan'
    PIRATES_OF = 'pirates of the caribbien'
    HARRY = 'harry potter I'
    CAPTAIN_AMERICA = 'captain america'
    ALIEN = 'alien'
    DR_STRANGELOVE = 'doctor strangelove'
    JACQUES = 'jacques'
    # Timestamps matter: the evaluation splits each user's history
    # chronologically.
    ratings = pd.DataFrame(
        [[1, STAR_WARS, 9, '2013-10-12 23:21:27+00:00'],
         [1, WONDER_WOMAN, 10, '2014-10-12 23:22:27+00:00'],
         [1, AVENGERS, 10, '2015-11-12 23:20:27+00:00'],
         [1, WOLVERINE, 8, '2015-08-12 23:20:27+00:00'],
         [1, PIRATES_OF, 10, '2015-10-12 22:20:27+00:00'],
         [1, HARRY, 10, '2015-10-12 23:21:27+00:00'],
         [1, CAPTAIN_AMERICA, 10, '2014-10-12 23:20:27+00:00'],
         [1, ALIEN, 6, '2015-10-12 23:22:27+00:00'],
         [1, JACQUES, 6, '2015-10-12 11:20:27+00:00'],
         [2, STAR_WARS, 10, '2013-10-12 23:20:27+00:00'],
         [2, WONDER_WOMAN, 10, '2014-10-12 23:20:27+00:00'],
         [2, AVENGERS, 9, '2016-10-12 23:20:27+00:00'],
         [2, PIRATES_OF, 6, '2010-10-12 23:20:27+00:00'],
         [2, CAPTAIN_AMERICA, 10, '2005-10-12 23:20:27+00:00'],
         [2, DR_STRANGELOVE, 10, '2015-01-12 23:20:27+00:00'],
         [3, STAR_WARS, 9, '2013-10-12 20:20:27+00:00'],
         [3, AVENGERS, 10, '2015-10-12 10:20:27+00:00'],
         [3, PIRATES_OF, 9, '2013-03-12 23:20:27+00:00'],
         [3, HARRY, 8, '2016-10-13 23:20:27+00:00'],
         [3, DR_STRANGELOVE, 10, '2016-09-12 23:20:27+00:00'],
         [4, STAR_WARS, 8, '2013-10-12 23:20:27+00:00'],
         [4, WONDER_WOMAN, 8, '2014-10-12 23:20:27+00:00'],
         [4, AVENGERS, 9, '2015-10-12 23:20:27+00:00'],
         [4, PIRATES_OF, 5, '2013-10-12 23:20:27+00:00'],
         [4, HARRY, 6, '2014-10-12 23:20:27+00:00'],
         [4, ALIEN, 8, '2015-10-12 23:20:27+00:00'],
         [4, DR_STRANGELOVE, 9, '2015-10-12 23:20:27+00:00'],
         [5, STAR_WARS, 6, '2013-10-12 23:20:27+00:00'],
         [5, AVENGERS, 6, '2014-10-12 23:20:27+00:00'],
         [5, WOLVERINE, 8, '2015-10-12 23:20:27+00:00'],
         [5, PIRATES_OF, 2, '2016-10-12 23:20:27+00:00'],
         [5, HARRY, 10, '2016-10-12 23:20:27+00:00'],
         [5, CAPTAIN_AMERICA, 6, '2016-10-12 23:20:27+00:00'],
         [5, ALIEN, 4, '2016-10-12 23:20:27+00:00'],
         [5, DR_STRANGELOVE, 8, '2016-10-12 23:20:27+00:00'],
         [5, JACQUES, 10, '2016-10-12 23:20:27+00:00'],
         [6, STAR_WARS, 8, '2013-10-12 23:20:27+00:00'],
         [6, WONDER_WOMAN, 8, '2014-10-12 23:20:27+00:00'],
         [6, AVENGERS, 8, '2014-10-12 23:20:27+00:00'],
         [6, WOLVERINE, 8, '2015-10-12 23:20:27+00:00'],
         [6, PIRATES_OF, 6, '2016-10-12 23:20:27+00:00'],
         [6, HARRY, 10, '2016-10-12 23:20:27+00:00'],
         [6, JACQUES, 8, '2016-10-12 23:20:27+00:00'],
         [7, AVENGERS, 10, '2014-10-12 23:20:27+00:00'],
         [7, PIRATES_OF, 3, '2016-10-12 23:20:27+00:00'],
         [7, HARRY, 1, '2016-10-12 23:20:27+00:00'],
         [7, ALIEN, 8, '2016-10-12 23:20:27+00:00'],
         [7, DR_STRANGELOVE, 10, '2016-10-12 23:20:27+00:00'],
         [8, STAR_WARS, 9, '2013-10-12 23:20:27+00:00'],
         [8, WONDER_WOMAN, 7, '2014-10-12 23:20:27+00:00'],
         [8, AVENGERS, 7, '2014-10-12 23:20:27+00:00'],
         [8, WOLVERINE, 7, '2015-10-12 23:20:27+00:00'],
         [8, PIRATES_OF, 8, '2016-10-12 23:20:27+00:00'],
         [8, HARRY, 8, '2016-10-12 23:20:27+00:00'],
         [8, ALIEN, 8, '2016-10-12 23:20:27+00:00'],
         [8, DR_STRANGELOVE, 8, '2016-10-12 23:20:27+00:00'],
         [8, JACQUES, 10, '2016-10-12 23:20:27+00:00'],
         [9, WONDER_WOMAN, 7, '2014-10-12 23:20:27+00:00'],
         [9, AVENGERS, 8, '2014-10-12 23:20:27+00:00'],
         [9, WOLVERINE, 8, '2015-10-12 23:20:27+00:00'],
         [9, PIRATES_OF, 7, '2016-10-12 23:20:27+00:00'],
         [9, HARRY, 8, '2016-10-12 23:20:27+00:00'],
         [9, CAPTAIN_AMERICA, 10, '2016-10-12 23:20:27+00:00'],
         [9, DR_STRANGELOVE, 10, '2016-10-12 23:20:27+00:00'],
         [9, JACQUES, 7, '2016-10-12 23:20:27+00:00'],
         [10, AVENGERS, 7, '2014-10-12 23:20:27+00:00'],
         [10, HARRY, 10, '2016-10-12 23:20:27+00:00'],
         [10, CAPTAIN_AMERICA, 6, '2016-10-12 23:20:27+00:00'],
         [10, DR_STRANGELOVE, 8, '2016-10-12 23:20:27+00:00'],
         ],
        columns=['user_id', 'movie_id', 'rating', 'rating_timestamp'])
    result = er.calculate_using_ratings(ratings,
                                        min_number_of_ratings=4,
                                        min_rank=5)
    #figure out what to do with result ;)
    print(result)