class BPRRecommender(BaseRecommender):

    def __init__(self):
        super().__init__()
        self.URM = None
        self.item_factors = None
        self.user_factors = None
        self.model = BayesianPersonalizedRanking(factors=1000, num_threads=8,
                                                 verify_negative_samples=True)

    def fit(self, URM):
        self.URM = URM
        URM_transpose = self.URM.T
        self.model.fit(URM_transpose)
        self.user_factors = self.model.user_factors
        self.item_factors = self.model.item_factors

    def get_expected_ratings(self, user_id):
        scores = np.dot(self.user_factors[user_id], self.item_factors.T)
        return np.squeeze(scores)

    def recommend(self, user_id, at=10):
        expected_ratings = self.get_expected_ratings(user_id)
        recommended_items = np.flip(np.argsort(expected_ratings), 0)
        unseen_items_mask = np.in1d(recommended_items, self.URM[user_id].indices,
                                    assume_unique=True, invert=True)
        recommended_items = recommended_items[unseen_items_mask]
        return recommended_items[:at]
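A minimal usage sketch for the class above (not from the original source). It assumes BaseRecommender needs no constructor arguments and that the installed implicit version accepts the transposed (item-user) matrix in fit, as the class's own transpose suggests; the toy URM below is hypothetical.

# Usage sketch (hypothetical toy data): 3 users x 4 items, binary interactions.
import numpy as np
from scipy.sparse import csr_matrix

URM = csr_matrix(np.array([[1, 0, 1, 0],
                           [0, 1, 0, 1],
                           [1, 1, 0, 0]], dtype=np.float32))
recommender = BPRRecommender()
recommender.fit(URM)
print(recommender.recommend(user_id=0, at=2))  # top-2 unseen items for user 0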
def _train_bpr(hyperparameters, train):
    h = hyperparameters
    model = BayesianPersonalizedRanking(factors=h['factors'],
                                        iterations=h['n_iter'],
                                        num_threads=nproc)
    model.fit(train)
    # test_eval = {'p@k': precision_at_k(model, train.T.tocsr(), factorization.T.tocsr(), K=10)}
    # val_eval = {'p@k': precision_at_k(model, train.T.tocsr(), validation.T.tocsr(), K=10)}
    return model
def fit(user_item_matrix, n_factors=20, regularization=0.001, iterations=15,
        num_threads=4):
    """Trains a BPR model"""
    model = BayesianPersonalizedRanking(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads)
    model.fit(csr_matrix(user_item_matrix).T.tocsr())
    return model
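A quick usage sketch for the helper above (hypothetical data). user_item_matrix can be anything csr_matrix accepts, e.g. a dense NumPy array; note that implicit's BPR factor matrices carry one extra bias column on top of n_factors.

# Usage sketch (hypothetical data): three users, four items, binary interactions.
import numpy as np

interactions = np.array([[1, 0, 1, 0],
                         [0, 1, 0, 1],
                         [1, 1, 0, 0]])
model = fit(interactions, n_factors=8, iterations=10)
print(model.user_factors.shape, model.item_factors.shape)  # n_factors + 1 columns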
def get_aucs_vs_factors():
    factors = [8, 16, 32, 64, 128]
    params_list = [{"factors": factor} for factor in factors]
    aucs = []
    for params in params_list:
        model = BayesianPersonalizedRanking(**params)
        model.fit(comments)
        aucs.append(auc(test_set[:20000], model.user_factors,
                        model.item_factors, subreddits, users))
    return aucs
def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0,
                             variant="20m"):
    # read in the input data file
    start = time.time()
    titles, ratings = get_movielens(variant)

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))
    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # lets weight these models by bm25weight.
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "lmf":
        model = LogisticMatrixFactorization()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)

    log.debug("calculating top movies")
    user_count = np.ediff1d(ratings.indptr)
    to_generate = sorted(np.arange(len(titles)), key=lambda x: -user_count[x])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for movieid in to_generate:
                # if this movie has no ratings, skip over (for instance 'Graffiti Bridge'
                # has no ratings > 4, meaning we've filtered out all data for it)
                if ratings.indptr[movieid] != ratings.indptr[movieid + 1]:
                    title = titles[movieid]
                    for other, score in model.similar_items(movieid, 11):
                        o.write("%s\t%s\t%s\n" % (title, titles[other], score))
                progress.update(1)
def __init__(self, *args, **kwargs):
    """
    Construct a BPR recommender.  The arguments are passed as-is to
    :py:class:`implicit.bpr.BayesianPersonalizedRanking`.
    """
    from implicit.bpr import BayesianPersonalizedRanking
    super().__init__(BayesianPersonalizedRanking(*args, **kwargs))
def train_implicit_bpr(
        train_df: pd.DataFrame,
        params: Dict[str, Union[str, float, int]],
        shape: Tuple[int, int]) -> Tuple[BayesianPersonalizedRanking, sp.csr.csr_matrix]:
    train_matrix = create_implicit_train_matrix(train_df, shape)

    # keep only the params that BayesianPersonalizedRanking.__init__ accepts
    model_params = dict()
    args = inspect.getfullargspec(BayesianPersonalizedRanking.__init__)[0]
    for param, param_val in params.items():
        if param in args:
            model_params[param] = param_val

    model = BayesianPersonalizedRanking(**model_params)
    model.fit(train_matrix, show_progress=False)
    return model, train_matrix
def evaluate_bpr_model(hyperparameters, train, test, validation):
    h = hyperparameters
    model = BayesianPersonalizedRanking(factors=h['factors'],
                                        iterations=h['n_iter'],
                                        num_threads=nproc)
    model.fit(train)
    test_eval = {
        'p@k': precision_at_k(model, train.T.tocsr(), test.T.tocsr(), K=10)
    }
    val_eval = {
        'p@k': precision_at_k(model, train.T.tocsr(), validation.T.tocsr(), K=10)
    }
    return test_eval, val_eval
def calculate_similar_movies(input_path, output_filename, model_name="als",
                             min_rating=4.0):
    # read in the input data file
    logging.debug("reading data from %s", input_path)
    start = time.time()
    ratings, movies, m = read_data(input_path, min_rating=min_rating)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # lets weight these models by bm25weight.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    m = m.tocsr()
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)

    logging.debug("calculating top movies")
    user_count = ratings.groupby('movieId').size()
    movie_lookup = dict((i, t) for i, t in zip(movies['movieId'], movies['title']))
    to_generate = sorted(list(movies['movieId']), key=lambda x: -user_count.get(x, 0))

    with codecs.open(output_filename, "w", "utf8") as o:
        for movieid in to_generate:
            # if this movie has no ratings, skip over (for instance 'Graffiti Bridge'
            # has no ratings > 4, meaning we've filtered out all data for it)
            if m.indptr[movieid] == m.indptr[movieid + 1]:
                continue

            movie = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
def fit(user_item_matrix, n_factors=20, regularization=0.001, iterations=15,
        num_threads=0):
    """Trains BPR (an ALS variant is left commented out)"""
    # model = AlternatingLeastSquares(factors=n_factors,
    model = BayesianPersonalizedRanking(
        factors=n_factors,
        regularization=regularization,
        iterations=iterations,
        # calculate_training_loss=True,
        num_threads=num_threads,
    )
    model.fit(csr_matrix(user_item_matrix).T.tocsr(), show_progress=False)
    print('LOG = 005')
    return model
def BPR_train(self, inputs, rating_df):
    model = BayesianPersonalizedRanking(factors=60)
    model.fit(inputs)

    # user_embeddings = model.user_factors
    movie_embeddings = model.item_factors

    # map movie ids to titles
    id_title_dict = {k: v for k, v in self.movie_df['title'].items()}
    title = [
        id_title_dict[movie_id]
        for movie_id in rating_df['movie_id'].cat.categories
    ]

    # movie embedding
    movie_embedding_df = pd.DataFrame(movie_embeddings, index=title)

    # user_names = [user_id for user_id in rating_df['user_id'].cat.categories]
    # user_embedding_df = pd.DataFrame(user_embeddings, index=user_names)

    return movie_embedding_df  # , user_embedding_df
def bpr(self, database, **kwargs):
    from implicit.bpr import BayesianPersonalizedRanking
    opts = self.get_option('implicit', 'bpr', **kwargs)
    model = BayesianPersonalizedRanking(**opts)
    ratings = self.get_database(database, **kwargs)
    if kwargs.get('return_instance_before_train'):
        return (model, ratings)
    elapsed, mem_info = self.run(model.fit, ratings)
    model = None
    return elapsed, mem_info
def BPR(A: sp.coo_matrix, factors: int, lr: float, regularization: float,
        iterations: int):
    '''
    Run BayesianPersonalizedRanking - "BPR: Bayesian Personalized Ranking
    from Implicit Feedback"

    :param A: user x item matrix
    :param factors: embedding size
    :param lr: learning rate
    :param regularization: regularization parameter
    :param iterations: how many training updates
    '''
    bpr = BayesianPersonalizedRanking(factors=factors,
                                      learning_rate=lr,
                                      regularization=regularization,
                                      use_gpu=True,
                                      iterations=iterations,
                                      verify_negative_samples=True,
                                      num_threads=10)
    bpr.fit(A.T)

    # The last factor column is the bias term. However, user_bias is 1 (not used),
    # so a simple dot product works.
    item_factors = bpr.item_factors
    user_factors = bpr.user_factors
    return user_factors.dot(item_factors.T)
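A possible invocation of the function above on synthetic data (not from the source). The hard-coded use_gpu=True requires a CUDA build of implicit; a CPU-only install would need that flag flipped to False first.

# Usage sketch (synthetic data; needs a CUDA-enabled implicit build because
# the function above passes use_gpu=True).
import numpy as np
import scipy.sparse as sp

rows = np.array([0, 0, 1, 2])   # user ids
cols = np.array([1, 3, 0, 2])   # item ids
A = sp.coo_matrix((np.ones(4), (rows, cols)), shape=(3, 4))

scores = BPR(A, factors=16, lr=0.01, regularization=0.01, iterations=50)
print(scores.shape)  # (3, 4): one predicted score per user-item pair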
class MF_kNN(Model):

    def __init__(self, config):
        self.requirement = ['test_file', 'lastN', 'topN', 'train_file',
                            'index_file_file']
        self.config = config
        miss = set()
        for item in self.requirement:
            if item not in self.config:
                miss.add(item)
        if len(miss) > 0:
            raise Exception(f"Miss the key : {miss}")

        Model.__init__(self,
                       self.config['test_file'],
                       self.config['lastN'],
                       self.config['topN'])
        self.reload = self.config.get('reload', False)

    def __get_id(self, item):
        if item in self.item_idx:
            _id = self.item_idx[item]
        else:
            _id = len(self.item_idx)
            self.item_idx[item] = _id
            self.item_idx_reverse[_id] = item
        return _id

    def train(self):
        b_time = time.time()
        self.item_idx, self.item_idx_reverse = {}, {}
        if self.reload:
            with open(self.config['item_vec_file'], 'r') as in_f:
                num_items, dim = in_f.readline().strip().split()
                print(f'Num of items : {num_items}, dim : {dim}')
                for idx, line in tqdm(enumerate(in_f)):
                    tmp = line.split()
                    item = tmp[0]
                    self.item_idx[item] = idx
                    self.item_idx_reverse[idx] = item
            self.t = AnnoyIndex(int(dim), 'angular')
            file_name = self.config['index_file_file']
            self.t.load(f'{file_name}.ann')
        else:
            Y = []
            with open(self.config['train_file'], 'r') as in_f:
                for idx, line in tqdm(enumerate(in_f)):
                    items_list = line.strip().split()
                    Y.append([self.__get_id(item) for item in items_list])

            # construct the sparse matrix
            indptr = np.fromiter(chain((0,), map(len, Y)), int, len(Y) + 1).cumsum()
            indices = np.fromiter(chain.from_iterable(Y), int, indptr[-1])
            data = np.ones_like(indices)
            user_item_table_csr = csr_matrix((data, indices, indptr))
            item_user_table_csr = user_item_table_csr.T.tocsr()
            print('Matrix size : ', item_user_table_csr.shape)
            print("Read file finished ... : ", time.time() - b_time)

            # Train MF
            model_name = "bpr"
            self.model = BayesianPersonalizedRanking(num_threads=20)
            print(f"training model {model_name}")
            start = time.time()
            self.model.fit(item_user_table_csr)
            print(f"trained model '{model_name}' in {time.time() - start}")

            print("building Annoy index")
            items_count, dim = self.model.item_factors.shape

            # Build Ann and dump the item vectors
            self.t = AnnoyIndex(int(dim), 'angular')
            with open(self.config['item_vec_file'], 'w') as out_f:
                print(f"{items_count} {dim}", file=out_f)
                for idx, vec in tqdm(enumerate(self.model.item_factors)):
                    self.t.add_item(idx, vec)
                    print(f"{self.item_idx_reverse[idx]} {' '.join(vec.astype(str))}",
                          file=out_f)
            print("Write item vectors finished ...")
            file_name = self.config['index_file_file']
            self.t.build(30)  # 30 trees
            self.t.save(f'{file_name}.ann')
        print(f"Train finished ...{time.time() - b_time}")

    def predict(self, last_n_events, topN):
        b_time = time.time()
        item_similar = list()
        candidate_items = set()
        last_n_items = [self.item_idx[e.split(':', 1)[1]]
                        for e in last_n_events[::-1]
                        if e.split(':', 1)[1] in self.item_idx]
        if len(last_n_items) == 0:
            return []

        for item_idx in last_n_items:
            similar_res = self.__item_topK_similar(item_idx, topN)
            item_similar.append(similar_res)
            candidate_items.update(set(similar_res.keys()))

        candidate_list = list(candidate_items)
        score_matrix = np.zeros((len(last_n_items), len(candidate_list)))
        for i, item_id in enumerate(last_n_items):
            score_matrix[i] = self.__item_item_arr_norm_score(item_id,
                                                              candidate_list,
                                                              item_similar[i])

        # discount each history position by 1 / log2(rank + 2)
        rank_weight = np.array([1 / np.log2(rank + 2)
                                for rank in range(len(last_n_items))])
        final_score = rank_weight.dot(score_matrix).tolist()
        # print(last_n_items, list(zip(candidate_list, final_score)))
        final_items = sorted(zip(candidate_list, final_score),
                             key=lambda x: x[1], reverse=True)
        return [item for item, score in final_items[:topN]]

    def __item_topK_similar(self, given_idx, topK):
        item_idx_arr, score_arr = self.t.get_nns_by_item(given_idx, topK,
                                                         include_distances=True)
        res = {}
        for idx, score in zip(item_idx_arr, score_arr):
            try:
                item_raw = self.item_idx_reverse[idx]
                if item_raw not in res:
                    # convert Annoy's angular distance back to a cosine score
                    res[item_raw] = 1 - score**2 / 2
            except KeyError:
                pass
        return res

    def __item_item_arr_norm_score(self, item, candidate_item_arr, similar_items):
        res = np.zeros(len(candidate_item_arr))
        for _item in similar_items:
            _score = similar_items[_item]
            if _item in candidate_item_arr:
                res[candidate_item_arr.index(_item)] = float(_score)
        return res / np.linalg.norm(res)
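The 1 - score**2 / 2 conversion in __item_topK_similar relies on Annoy's 'angular' metric being the Euclidean distance between L2-normalized vectors, d = sqrt(2 - 2*cos). A standalone sanity check with made-up vectors:

# Verify the angular-distance-to-cosine conversion used above (hypothetical data).
import numpy as np

a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 1.0, 0.5])
cos = a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))
d_angular = np.sqrt(2 - 2 * cos)                # what Annoy reports for metric='angular'
print(np.isclose(1 - d_angular ** 2 / 2, cos))  # True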
def calculate_similar_movies(input_filename, output_filename, model_name="als",
                             min_rating=4.0, variant='20m'):
    # read in the input data file
    start = time.time()
    # titles, ratings = get_movielens(variant)
    user_item_df = read_user_item_data(input_filename)
    print(user_item_df)
    unique_user, unique_item, user_item_df = get_user_item_sparse_data_presto(
        user_item_df)
    # user_item_df = user_item_df.sort_values(by=['user_index', 'item_index'])
    user_item_ratings = scipy.sparse.csr_matrix(
        (user_item_df['score'],
         (user_item_df['item_index'], user_item_df['user_index'])))
    print(user_item_ratings)
    '''
    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))
    '''
    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares(factors=128, regularization=0.01,
                                        use_native=True, iterations=20,
                                        calculate_training_loss=True)
        # lets weight these models by bm25weight.
        log.debug("weighting matrix by bm25_weight")
        # ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "lmf":
        model = LogisticMatrixFactorization()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(user_item_ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)

    log.debug("calculating top movies")
    k = 10
    iterations = 10000
    similar_df_gen = similar_to_csv(model, k, unique_item, iterations)
    with tqdm.tqdm(total=len(unique_item) // iterations + 1) as progress:
        for similar_df_slice in similar_df_gen:
            similar_df_slice.to_csv(output_filename, mode='a', header=False,
                                    index=False)
            print("finish a batch")
            progress.update(1)
def _get_model(self):
    return BayesianPersonalizedRanking(factors=3, regularization=0,
                                       use_gpu=False)
class HHimmlerEnsemble:

    def __init__(self, urm_train, urm_test, icm, parameters=None):
        if parameters is None:
            parameters = {
                "USER_CF": 0.8,
                "USER_BPR": 0.7,
                "ITEM_CF": 1,
                "ITEM_BPR": 0.8,
                "CBF": 0.3,
                "IALS": 1.0,
                "CBF_BPR": 1
            }
        self.ensemble_weights = parameters
        self.train = urm_train.tocsr()
        self.test = urm_test.tocsr()
        self.icm = icm.tocsr()
        self.initialize_components()

    def initialize_components(self):
        self.bpr_mf = BPR_matrix_factorization(factors=200, regularization=0.00000,
                                               learning_rate=0.01, iterations=65)
        self.ials_cg_mf = IALS_CG(iterations=15, calculate_training_loss=True,
                                  factors=500, use_cg=True, regularization=1e-3)

    def fit(self):
        self.bpr_mf.fit(self.train.T.tocoo())
        self.ials_cg_mf.fit(40 * self.train.T)

        self.bpr_mf_latent_x = self.bpr_mf.user_factors.copy()
        self.bpr_mf_latent_y = self.bpr_mf.item_factors.copy()
        self.ials_cg_mf_latent_x = self.ials_cg_mf.user_factors.copy()
        self.ials_cg_mf_latent_y = self.ials_cg_mf.item_factors.copy()

    def recommend(self, user_id, combiner, at=10):
        bpr_mf_r = np.dot(self.bpr_mf_latent_x[user_id],
                          self.bpr_mf_latent_y.T).ravel()
        ials_cg_mf_r = np.dot(self.ials_cg_mf_latent_x[user_id],
                              self.ials_cg_mf_latent_y.T).ravel()

        scores = [
            # [bpr_mf_r, self.ensemble_weights["BPR_MF"], "BPR_MF"],
            [ials_cg_mf_r, 1, "IALS_CG"]
        ]

        for r in scores:
            self.filter_seen(user_id, r[0])

        return combiner.combine(scores, at)

    def filter_seen(self, user_id, scores):
        start_pos = int(self.train.indptr[user_id])
        end_pos = int(self.train.indptr[user_id + 1])
        user_profile = self.train.indices[start_pos:end_pos]
        scores[user_profile] = -1000000  # -np.inf
        return scores

    def recommend_batch(self, user_list, combiner, at=10):
        res = np.array([])
        for i in user_list:
            # BPR recommendations are computed but only the IALS list is used below
            bpr = self.bpr_mf.recommend(user_items=self.train, userid=i, N=at,
                                        recalculate_user=False)
            ials = self.ials_cg_mf.recommend(userid=i, user_items=self.train, N=10)
            rec_list = np.array([x[0] for x in ials])
            row = np.concatenate(([i], rec_list))
            if res.size == 0:
                res = row
            else:
                res = np.vstack([res, row])
        return res

    def get_component_data(self):
        print('cyka')
class BMussoliniEnsemble:

    def __init__(self, urm_train, urm_test, icm, parameters=None):
        if parameters is None:
            parameters = {
                "USER_CF": 7,
                "SVD": 26,
                "ITEM_CF": 0,
                "ITEM_BPR": 16,
                "CBF": 7,
                "IALS": 26,
                "CBF_BPR": 64,
                "BPR_MF": 6,
                "ITEM_RP3B": 16,
                "USER_RP3B": 0,
                "FM": 10
            }
        self.ensemble_weights = parameters
        self.train = urm_train.tocsr()
        self.test = urm_test.tocsr()
        self.icm = icm.tocsr()
        self.sequential_playlists = load_sequential.load_train_sequential()
        self.initialize_components()

    def initialize_components(self):
        self.train = self.rescale_wrt_insertion_order(self.train)
        self.item_cosineCF_recommender = Cosine_Similarity(self.train, topK=200,
                                                           shrink=15, normalize=True,
                                                           mode='cosine')
        self.user_cosineCF_recommender = Cosine_Similarity(self.train.T, topK=200,
                                                           shrink=15, normalize=True,
                                                           mode='cosine')
        self.svd_recommender = PureSVDRecommender(self.train)
        self.cbf_bpr_recommender = SLIM_BPR_Cython(self.icm.T, positive_threshold=0)
        self.cbf_recommender = Cosine_Similarity(self.icm.T, topK=50, shrink=10,
                                                 normalize=True, mode='cosine')
        self.item_rp3b_recommender = RP3betaRecommender(self.train)
        self.user_rp3b_recommender = RP3betaRecommender(self.train.T)
        self.bpr_mf = BPR_matrix_factorization(factors=800, regularization=0.01,
                                               learning_rate=0.01, iterations=300)
        self.ials_cg_mf = IALS_CG(iterations=15, calculate_training_loss=True,
                                  factors=500, use_cg=True, regularization=1e-3)
        self.lightfm = LightFM_Recommender(self.train, self.icm, no_components=200)

    def fit(self):
        self.svd_latent_x, self.svd_latent_y = self.svd_recommender.fit(num_factors=500)
        self.min_svd = np.dot(self.svd_latent_x, self.svd_latent_y).min()
        self.cbf_bpr_w = self.cbf_bpr_recommender.fit(epochs=10, topK=200,
                                                      batch_size=20,
                                                      sgd_mode='adagrad',
                                                      learning_rate=1e-2)
        self.item_cosineCF_w = self.item_cosineCF_recommender.compute_similarity()
        self.user_cosineCF_w = self.user_cosineCF_recommender.compute_similarity()
        self.cbf_w = self.cbf_recommender.compute_similarity()
        self.item_rp3b_w = self.item_rp3b_recommender.fit()
        self.user_rp3b_w = self.user_rp3b_recommender.fit()
        self.ials_cg_mf.fit(40 * self.train.T)
        self.ials_latent_x = self.ials_cg_mf.user_factors.copy()
        self.ials_latent_y = self.ials_cg_mf.item_factors.copy()
        self.min_ials = np.dot(self.ials_latent_x, self.ials_latent_y.T).min()
        self.bpr_mf.fit(self.train.T.tocoo())
        self.bpr_mf_latent_x = self.bpr_mf.user_factors.copy()
        self.bpr_mf_latent_y = self.bpr_mf.item_factors.copy()
        self.lightfm.fit(100)

    def recommend(self, user_id, combiner, at=10):
        user_profile = self.train[user_id, :]
        svd_r = self.svd_latent_x[user_id, :].dot(self.svd_latent_y)
        item_cosineCF_r = user_profile.dot(self.item_cosineCF_w).toarray().ravel()
        user_cosineCF_r = self.user_cosineCF_w[user_id].dot(self.train).toarray().ravel()
        cbf_r = user_profile.dot(self.cbf_w).toarray().ravel()
        cbf_bpr_r = user_profile.dot(self.cbf_bpr_w).toarray().ravel()
        ials_r = np.dot(self.ials_latent_x[user_id],
                        self.ials_latent_y.T + self.min_ials).ravel()
        bpr_mf_r = np.dot(self.bpr_mf_latent_x[user_id],
                          self.bpr_mf_latent_y.T).ravel()
        item_rp3b_r = user_profile.dot(self.item_rp3b_w).toarray().ravel()
        user_rp3b_r = self.user_rp3b_w[user_id].dot(self.train).toarray().ravel()
        lightfm_r = self.lightfm.scores(user_id)

        scores = [
            # [item_bpr_r, self.ensemble_weights["ITEM_BPR"], "ITEM_BPR"],
            # [user_bpr_r, self.ensemble_weights["USER_BPR"], "USER_BPR"],
            [svd_r, self.ensemble_weights["SVD"], "SVD"],
            [item_cosineCF_r, self.ensemble_weights["ITEM_CF"], "ITEM_CF"],
            [user_cosineCF_r, self.ensemble_weights["USER_CF"], "USER_CF"],
            [ials_r, self.ensemble_weights["IALS"], "IALS"],
            [cbf_r, self.ensemble_weights["CBF"], "CBF"],
            [cbf_bpr_r, self.ensemble_weights["CBF_BPR"], "CBF_BPR"],
            [bpr_mf_r, self.ensemble_weights["BPR_MF"], "BPR_MF"],
            [item_rp3b_r, self.ensemble_weights["ITEM_RP3B"], "ITEM_RP3B"],
            [user_rp3b_r, self.ensemble_weights["USER_RP3B"], "USER_RP3B"],
            [lightfm_r, self.ensemble_weights["FM"], "FM"]
        ]

        for r in scores:
            self.filter_seen(user_id, r[0])

        return combiner.combine(scores, at)

    def rescale_wrt_insertion_order(self, R):
        R = R.copy().tolil()
        R = R * 0.8
        for i in self.sequential_playlists:
            pl = i["id"]
            k = 1
            for j in i["songs"]:
                factor = 1 / (k ** POPULARITY_SCALING_EXP)
                R[pl, j] = factor * (R[pl, j] + 0.2)
                k += 1
        return R.tocsr()

    def filter_seen(self, user_id, scores):
        start_pos = int(self.train.indptr[user_id])
        end_pos = int(self.train.indptr[user_id + 1])
        user_profile = self.train.indices[start_pos:end_pos]
        scores[user_profile] = -1000000  # -np.inf
        return scores

    def recommend_batch(self, user_list, combiner, at=10):
        res = np.array([])
        for i in user_list:
            rec_list = self.recommend(i, combiner, at).T
            row = np.concatenate(([i], rec_list))
            if res.size == 0:
                res = row
            else:
                res = np.vstack([res, row])
        return res

    def get_component_data(self):
        item_cf_rating = self.ensemble_weights["ITEM_CF"] * self.train.dot(self.item_cosineCF_w)
        item_cf = {
            "min": item_cf_rating.min(),
            "max": item_cf_rating.max(),
            "mean": item_cf_rating.mean(),
        }
        del item_cf_rating
        user_cf_rating = self.ensemble_weights["USER_CF"] * self.user_cosineCF_w.dot(self.train)
        user_cf = {
            "min": user_cf_rating.min(),
            "max": user_cf_rating.max(),
            "mean": user_cf_rating.mean(),
        }
        del user_cf_rating
        ials_rating = self.ensemble_weights["IALS"] * (np.dot(self.ials_latent_x, self.ials_latent_y.T) + self.min_ials)
        ials = {
            "min": ials_rating.min(),
            "max": ials_rating.max(),
            "mean": np.mean(ials_rating),
        }
        del ials_rating
        cbf_rating = self.ensemble_weights["CBF"] * self.train.dot(self.cbf_w)
        cbf = {
            "min": cbf_rating.min(),
            "max": cbf_rating.max(),
            "mean": cbf_rating.mean(),
        }
        del cbf_rating
        cbf_bpr_rating = self.ensemble_weights["CBF_BPR"] * self.train.dot(self.cbf_bpr_w)
        cbf_bpr = {
            "min": cbf_bpr_rating.min(),
            "max": cbf_bpr_rating.max(),
            "mean": cbf_bpr_rating.mean(),
        }
        del cbf_bpr_rating
        svd_ratings = self.ensemble_weights["SVD"] * (np.dot(self.svd_latent_x, self.svd_latent_y) + self.min_svd)
        svd = {
            "min": svd_ratings.min(),
            "max": svd_ratings.max(),
            "mean": svd_ratings.mean(),
        }
        del svd_ratings
        return {
            "ITEM_CF": item_cf,
            "USER_CF": user_cf,
            "SVD": svd,
            "IALS": ials,
            "CBF": cbf,
            "CBF_BPR": cbf_bpr
        }
while (time.time() - start) / 60 / 60 < RUN_LIMIT_HOURS:
    print(str(timedelta(seconds=time.time() - start)), ' -- config #',
          len(performance_list) + 1, ' >> training starting...')
    aux_time = time.time()

    # hyperparameters
    factors = 25 * np.random.randint(1, 31)  # 25, 50, 75, ..., 750
    learning_rate = (10 ** (-np.random.randint(2, 5))) * np.random.randint(1, 10)
    regularization = (10 ** (-np.random.randint(2, 5))) * np.random.randint(1, 10)
    iterations = 25 * np.random.randint(1, 31)  # 25, 50, 75, ..., 750

    alg = BayesianPersonalizedRanking(num_threads=NUM_THREADS,
                                      factors=factors,
                                      learning_rate=learning_rate,
                                      regularization=regularization,
                                      iterations=iterations)
    alg.fit(data_to_fit)

    perf_ndcg_at_100 = []
    rec_list = []
    print(' >> took ', str(timedelta(seconds=time.time() - aux_time)))
    print(str(timedelta(seconds=time.time() - start)), ' -- config #',
          len(performance_list) + 1, ' >> evaluation starting...')
    aux_time = time.time()
    with Pool(NUM_THREADS) as p:
        perf_ndcg_at_100 = p.map(
            paralelize_ndcg,
# Read matrix item/playtime
df_matrix = read_user_item_playtime(DATA_FILEPATH)

# Create index for items
index2item = pd.Series(list(df_matrix.columns.values),
                       dtype="category").cat.categories

# Create normalized hours matrix
df_scaled_matrix = normalize_hours_matrix(df_matrix)

# compress matrix
csr_df_matrix = csr_matrix(df_scaled_matrix)

np.random.seed()

# Train
user_item_train, user_item_test = train_test_split(csr_df_matrix,
                                                   train_percentage=train_percent)
bpr = BayesianPersonalizedRanking(iterations=train_interactions)
bpr.fit(user_item_train.T.tocsr())

print(user_item_train[user_id])
interacted_ids = user_item_train[user_id].nonzero()[1]
index2item = index2item.astype('int32')
interacted_items = [item_mapping[index2item[index]]
                    for index in interacted_ids
                    if index2item[index] in item_mapping.keys()]

# recommend returns the recommended indices and their corresponding scores
reco = bpr.recommend(user_id, user_item_train, N=topn)
print(reco)

# map each index back to its item
reco_items = [item_mapping[index2item[index]]
              for index, _ in reco
              if index2item[index] in item_mapping.keys()]
def test_fit_almost_empty_matrix(self):
    raw = [[0, 0, 0], [0, 1, 0], [0, 0, 0]]
    return BayesianPersonalizedRanking(use_gpu=False).fit(
        csr_matrix(raw), show_progress=False)
#%% [markdown]
# ### Bayesian Personalized Ranking

#%%
from implicit.bpr import BayesianPersonalizedRanking

params = {"factors": 63}

#%%
import logging
import tqdm
import time
import codecs

#%%
model = BayesianPersonalizedRanking(**params)

#%%
model_name = 'bpr'
output_filename = 'subreddits_recs_bpr'

#%%
model.fit(comments)

#%%
def bpr_related_subreddits(subreddit):
    found = np.where(subreddits == subreddit)
    if len(found[0]) == 0:
        raise ValueError("Subreddit doesn't exist in the dataset.")
    _id = found[0][0]
def _get_model(self):
    return BayesianPersonalizedRanking(factors=3, regularization=0,
                                       use_gpu=True, random_state=42)
def test_implicit_bpr(Rtr, Rts, k=20):
    from implicit.bpr import BayesianPersonalizedRanking
    bpr = BayesianPersonalizedRanking(factors=k)
    bpr.fit(Rtr.T)  # older implicit versions fit on the item-user (transposed) matrix
    return bpr
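A possible call to the test helper above, on synthetic data (everything below is hypothetical): build a sparse binary train matrix and fit. Rts is unused by the helper, so None is fine here.

# Usage sketch with synthetic data: 50 users x 80 items, ~5% density.
import numpy as np
import scipy.sparse as sp

Rtr = sp.random(50, 80, density=0.05, format='csr',
                data_rvs=lambda n: np.ones(n), random_state=0)
bpr = test_implicit_bpr(Rtr, Rts=None, k=8)
print(bpr.item_factors.shape)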