def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0, variant="20m"):
    # read in the input data file
    start = time.time()
    titles, ratings = get_movielens(variant)

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))
    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # lets weight these models by bm25weight.
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "lmf":
        model = LogisticMatrixFactorization()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)
    log.debug("calculating top movies")

    user_count = np.ediff1d(ratings.indptr)
    to_generate = sorted(np.arange(len(titles)), key=lambda x: -user_count[x])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for movieid in to_generate:
                # if this movie has no ratings, skip over (for instance 'Graffiti Bridge' has
                # no ratings > 4, meaning we've filtered out all data for it).
                if ratings.indptr[movieid] != ratings.indptr[movieid + 1]:
                    title = titles[movieid]
                    for other, score in model.similar_items(movieid, 11):
                        o.write("%s\t%s\t%s\n" % (title, titles[other], score))
                progress.update(1)
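# A minimal, hedged sketch of how the snippet above might be wired up. The imports below are
# what the function body appears to rely on (implicit's MovieLens loader, the ALS/BPR/LMF
# models, and the nearest-neighbour recommenders); the module-level `log` logger and the
# `__main__` invocation are assumptions, not part of the original file.
import codecs
import logging
import time

import numpy as np
import tqdm
from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization
from implicit.datasets.movielens import get_movielens
from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender,
                                         TFIDFRecommender, bm25_weight)

log = logging.getLogger("implicit")

if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    # write an item-item similarity TSV for the MovieLens 20M variant
    calculate_similar_movies("similar-movies.tsv", model_name="als", variant="20m")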
def calculate_similar_event(path, output_filename):
    model = AlternatingLeastSquares()
    a, b = read_event_data(path)
    event, users = hfd5_from_dataframe(a, b, output_filename)
    users.eliminate_zeros()
    users.data = np.ones(len(users.data))
    log.info("Start fitting")
    model.fit(users)

    user_count = np.ediff1d(users.indptr)
    to_generate = sorted(np.arange(len(event)), key=lambda x: -user_count[x])

    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf-8") as o:
            for eventid in to_generate:
                if users.indptr[eventid] != users.indptr[eventid + 1]:
                    name = event[eventid]
                    for other, score in model.similar_items(eventid, int(len(event) * 2 / 3)):
                        o.write(f"{name},{event[other]},{score}\n")
                progress.update(1)
def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0, variant='20m'):
    # read in the input data file
    start = time.time()
    titles, ratings = get_movielens(variant)

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))
    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # lets weight these models by bm25weight.
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)
    log.debug("calculating top movies")

    user_count = np.ediff1d(ratings.indptr)
    to_generate = sorted(np.arange(len(titles)), key=lambda x: -user_count[x])

    log.debug("calculating similar movies")
    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for movieid in to_generate:
                # if this movie has no ratings, skip over (for instance 'Graffiti Bridge' has
                # no ratings > 4, meaning we've filtered out all data for it).
                if ratings.indptr[movieid] != ratings.indptr[movieid + 1]:
                    title = titles[movieid]
                    for other, score in model.similar_items(movieid, 11):
                        o.write("%s\t%s\t%s\n" % (title, titles[other], score))
                progress.update(1)
class AlsRecommender(OwnRecommender):
    """Model trained with ALS

    Input
    -----
    ds: RecommenderDataset
        a prepared RecommenderDataset object
    """

    def fit(self, n_factors=20, regularization=0.001, iterations=15, num_threads=4):
        """Trains the ALS model"""
        self.model = AlternatingLeastSquares(factors=n_factors,
                                             regularization=regularization,
                                             iterations=iterations,
                                             num_threads=num_threads)
        self.model.fit(self.ds.csr_matrix)
        return self

    def _similarItems(self, userId, N=5):
        """Recommend items similar to the user's top-N purchased items"""
        if not self.ds.userExist(userId):
            return self.ds.extend([], N)

        def _get_similar_item(item_id):
            """Finds an item similar to item_id"""
            recs = self.model.similar_items(self.ds.itemid_to_id[item_id], N=2)
            if len(recs) > 1:
                top_rec = recs[1][0]
                return self.ds.id_to_itemid[top_rec]
            return item_id

        res = [_get_similar_item(item) for item in self.ds.userTop(userId, N)]
        return self.extend(res, N)

    def _similarUsers(self, userId, N=5):
        """Recommend the top-N items bought by similar users"""
        if not self.ds.userExist(userId):
            return self.ds.extend([], N)

        res = []
        similar_users = [rec[0] for rec in
                         self.model.similar_users(self.ds.userid_to_id[userId], N=N + 1)]
        similar_users = similar_users[1:]  # drop the query user itself
        for user in similar_users:
            # take the top purchase of each similar user (was userTop(userId, 1), which ignored the loop variable)
            res.extend(self.ds.userTop(user, 1))
        return self.extend(res, N)

    def items_embedings(self):
        emb = pd.DataFrame(data=self.model.item_factors).add_prefix('itm')
        emb['item_id'] = self.ds.itemids
        return emb

    def users_embedings(self):
        emb = pd.DataFrame(data=self.model.user_factors).add_prefix('usr')
        emb['user_id'] = self.ds.userids
        return emb
def calculate_similar_movies(input_path, output_filename, model_name="als", min_rating=4.0):
    # read in the input data file
    logging.debug("reading data from %s", input_path)
    start = time.time()
    ratings, movies, m = read_data(input_path, min_rating=min_rating)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # lets weight these models by bm25weight.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    m = m.tocsr()
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)
    logging.debug("calculating top movies")

    user_count = ratings.groupby('movieId').size()
    movie_lookup = dict((i, m) for i, m in zip(movies['movieId'], movies['title']))
    to_generate = sorted(list(movies['movieId']), key=lambda x: -user_count.get(x, 0))

    with codecs.open(output_filename, "w", "utf8") as o:
        for movieid in to_generate:
            # if this movie has no ratings, skip over (for instance 'Graffiti Bridge' has
            # no ratings > 4, meaning we've filtered out all data for it).
            if m.indptr[movieid] == m.indptr[movieid + 1]:
                continue
            movie = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
def main(params):
    """Main function."""
    # check for mandatory params
    if 'reference_repo' not in params:
        return {'error': 'Mandatory param reference_repo not present'}
    reference_repo = params['reference_repo']
    LOGGER.info('reference_repo %s' % reference_repo)

    # get data
    LOGGER.info('read GBQ data')
    _GC_SVC_ACCOUNT['private_key_id'] = params['GC_SVC_PRIVATE_KEY_ID']
    _GC_SVC_ACCOUNT['private_key'] = params['GC_SVC_PRIVATE_KEY']
    data = pd.io.gbq.read_gbq(_QUERY,
                              dialect="standard",
                              project_id=_GC_SVC_ACCOUNT['project_id'],
                              private_key=json.dumps(_GC_SVC_ACCOUNT))

    # map each repo and user to a unique numeric value
    data['user'] = data['user'].astype("category")
    data['repo'] = data['repo'].astype("category")

    # dictionaries to translate names to ids and vice-versa
    repos = dict(enumerate(data['repo'].cat.categories))
    repo_ids = {r: i for i, r in repos.items()}
    if reference_repo not in repo_ids:
        return {"message": "No result. Reference repo not in training set."}

    # create a sparse matrix of all the users/repos
    stars = coo_matrix(
        (np.ones(data.shape[0]),
         (data['repo'].cat.codes.copy(), data['user'].cat.codes.copy())))

    # train model
    LOGGER.info('training model')
    model = AlternatingLeastSquares(
        factors=50,
        regularization=0.01,
        dtype=np.float64,  # pylint: disable=no-member
        iterations=50)
    confidence = 40
    model.fit(confidence * stars)

    similar_ids = model.similar_items(repo_ids[reference_repo])
    LOGGER.info('found %d similar repos' % len(similar_ids))
    similar_repos = []
    for idx in range(1, len(similar_ids)):
        similar_repos.append(repos[similar_ids[idx][0]])

    return {
        'reference_repo': reference_repo,
        'similar_repos': similar_repos,
        'error': ''
    }
def collab_filter(song_id, user_song_df, num_songs=5):
    '''
    song_id: spotify id for an individual song
    user_song_df: dataframe with users, songs, playcounts etc.
    For the time being, filtering by key/tempo is not enabled (not enough songs),
    but it may be added in the future.
    '''
    song_num = user_song_df[user_song_df.spotify_id == song_id].song_nums.values[0]
    print(song_num)
    print(type(song_num))
    # orig_key = song_list[song_list.spotify_id == song_id].key.values[0]
    # orig_tempo = song_list[song_list.spotify_id == song_id].tempo.values[0]

    # check if you want songs of same key
    # if same_key == 'yes':  # if yes then filter out other keys
    #     print(f'key:{orig_key}')
    #     song_list = song_list[song_list.key == orig_key]
    # can also enter a number to specify what key you want
    # elif type(same_key) != str:
    #     song_list = song_list[song_list.key == same_key]

    # check if you want similar tempo
    # if similar_tempo == 'yes':
    #     print(f'tempo:{orig_tempo}')
    #     # if yes, can also specify how similar you want it
    #     lower = int(orig_tempo) - margin
    #     higher = int(orig_tempo) + margin
    #     song_list = song_list[song_list.tempo.between(lower, higher)]
    # elif type(similar_tempo) != str:
    #     # can also specify a specific tempo that you want
    #     song_list = song_list[song_list.tempo.between(int(similar_tempo) - margin,
    #                                                   int(similar_tempo) + margin)]

    # refined_ids = song_list.spotify_id  # this will be updated
    user_song_refined = user_song_df  # [user_song_df.spotify_id.isin(refined_ids)].copy()

    plays = user_song_refined['size']
    user_nums = user_song_refined.user_nums
    song_nums = user_song_refined.song_nums

    B = coo_matrix((plays, (song_nums, user_nums))).tocsr()

    model = AlternatingLeastSquares(factors=30)
    model.fit(B)

    songs_inds = model.similar_items(song_num, N=num_songs)
    songs_inds = [tup[0] for tup in songs_inds]
    return user_song_df[user_song_df.song_nums.isin(songs_inds)]
def calculate_similar_movies(input_path, output_filename, model_name="als", min_rating=4.0):
    """
    :param input_path: path to the training dataset
    :param output_filename: name of the output file
    :param model_name: model to use
    :param min_rating: threshold used for filtering
    :return:
    """
    logging.debug("reading data from %s", input_path)
    start = time.time()
    rating_data, movies_data, m = read_data(input_path, min_rating=min_rating)
    logging.debug("reading data in %s", time.time() - start)

    if model_name == "als":
        model = AlternatingLeastSquares()
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender()
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    m = m.tocsr()
    logging.debug("Training model: %s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)
    logging.debug("calculating top movies")

    user_count = rating_data.groupby("movieId").size()
    movie_lookup = dict((i, m) for i, m in zip(movies_data['movieId'], movies_data['title']))
    to_generate = sorted(list(movies_data['movieId']), key=lambda x: -user_count.get(x, 0))

    with open(output_filename, "w") as o:
        for movieid in to_generate:
            if m.indptr[movieid] == m.indptr[movieid + 1]:
                continue
            movie = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
def calculate_similar_businesses(input_filename, output_filename, model_name="als",
                                 factors=50, regularization=0.01, iterations=15,
                                 exact=False, trees=20, use_native=True,
                                 dtype=numpy.float64, cg=False):
    logging.debug("Calculating similar businesses. This might take a while")

    # read in the input data file
    logging.debug("reading data from %s", input_filename)
    start = time.time()
    df, ratings = read_data(input_filename)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if exact:
        model = AlternatingLeastSquares(factors=factors, regularization=regularization,
                                        use_native=use_native, use_cg=cg, dtype=dtype)
    else:
        model = AnnoyAlternatingLeastSquares(factors=factors, regularization=regularization,
                                             use_native=use_native, use_cg=cg, dtype=dtype)

    # lets weight these models by bm25weight.
    logging.debug("weighting matrix by bm25_weight")
    ratings = bm25_weight(ratings, K1=100, B=0.8)

    # train the model
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(ratings)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)

    # write out similar businesses by popularity
    logging.debug("calculating top businesses")
    user_count = df.groupby('business').size()
    businesses = dict(enumerate(df['business'].cat.categories))
    to_generate = sorted(list(businesses), key=lambda x: -user_count[x])

    # write out as a TSV of businessid, otherbusinessid, score
    with open(output_filename, "w") as o:
        for businessid in to_generate:
            business = businesses[businessid]
            for other, score in model.similar_items(businessid, 11):
                o.write("%s\t%s\t%s\n" % (business, businesses[other], score))
def calculate_similar_beers(input_path, output_filename, model_name="cosine"):
    # read in the input data file
    logging.debug("reading data from %s", input_path)
    start = time.time()
    ratings, beers, m = read_data(input_path)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # lets weight these models by bm25weight.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "bpr":
        model = BayesianPersonalizedRanking()
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    m = m.tocsr()
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)
    logging.debug("calculating top beers")

    user_count = ratings.groupby('beerId').size()
    beer_lookup = dict((i, m) for i, m in zip(beers['beerId'], beers['name']))
    to_generate = sorted(list(beers['beerId']), key=lambda x: -user_count.get(x, 0))

    with open(output_filename, "w") as o:
        for beerId in to_generate:
            if m.indptr[beerId] == m.indptr[beerId + 1]:
                continue
            beer = beer_lookup[beerId]
            for other, score in model.similar_items(beerId, 11):
                o.write("%s,%s,%s\n" % (beer, beer_lookup[other], score))
def calculate_similar_movies(input_path, output_filename, model_name="als", min_rating=4.0):
    # read in the input data file
    logging.debug("reading data from %s", input_path)
    start = time.time()
    ratings, movies, m = read_data(input_path, min_rating=min_rating)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        model = AlternatingLeastSquares()

        # lets weight these models by bm25weight.
        logging.debug("weighting matrix by bm25_weight")
        m = bm25_weight(m, B=0.9) * 5
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(B=0.2)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(m)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)
    logging.debug("calculating top movies")

    user_count = ratings.groupby('movieId').size()
    movie_lookup = dict((i, m) for i, m in zip(movies['movieId'], movies['title']))
    to_generate = sorted(list(movies['movieId']), key=lambda x: -user_count.get(x, 0))

    with open(output_filename, "w") as o:
        for movieid in to_generate:
            movie = movie_lookup[movieid]
            for other, score in model.similar_items(movieid, 11):
                o.write("%s\t%s\t%s\n" % (movie, movie_lookup[other], score))
def implicit(args):
    row_dict, col_dict = {}, {}
    rows, cols, data = [], [], []
    for feedback in iter_implicit_feedbacks(os.path.join(args.in_dir, 'ua.base')):
        i = row_dict.setdefault(feedback.item_id, len(row_dict))
        j = col_dict.setdefault(feedback.user_id, len(col_dict))
        rows.append(i)
        cols.append(j)
        data.append(1)
    item_user_data = csr_matrix((data, (rows, cols)),
                                shape=(len(row_dict), len(col_dict)))

    model = AlternatingLeastSquares(factors=8)
    model.fit(item_user_data)

    # Evaluation
    user_items = item_user_data.T.tocsr()
    user_items_test = collections.defaultdict(set)
    for feedback in iter_implicit_feedbacks(os.path.join(args.in_dir, 'ua.test')):
        try:
            i = row_dict[feedback.item_id]
            j = col_dict[feedback.user_id]
        except KeyError:
            continue
        user_items_test[j].add(i)

    topk = 10
    precision = 0
    for user_index, item_indices in user_items_test.items():
        recommendations = model.recommend(user_index, user_items, topk, True)
        precision += sum(1 if item_index in item_indices else 0
                         for item_index, _ in recommendations) / topk
    precision = precision / len(user_items_test)
    print('precision:', precision)

    item_id = 1
    item_index = row_dict[item_id]
    index2id = {value: key for key, value in row_dict.items()}
    for _item_index, score in model.similar_items(item_index, 10):
        _item_id = index2id[_item_index]
        print(_item_id)
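# A hedged sketch of the iter_implicit_feedbacks helper used above, which the snippet does not
# define. It assumes ua.base / ua.test are the MovieLens 100k splits with tab-separated
# "user_id<TAB>item_id<TAB>rating<TAB>timestamp" lines; the Feedback namedtuple is hypothetical.
import collections

Feedback = collections.namedtuple('Feedback', ['user_id', 'item_id'])

def iter_implicit_feedbacks(path):
    """Yield one (user_id, item_id) feedback record per line of a MovieLens 100k split."""
    with open(path) as f:
        for line in f:
            user_id, item_id, _rating, _timestamp = line.rstrip('\n').split('\t')
            yield Feedback(user_id=int(user_id), item_id=int(item_id))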
data_sparse = sparse.csr_matrix((plays, (cols, rows)),
                                shape=(len(artists), len(users)))

model = AlternatingLeastSquares(factors=50)
model.fit(data_sparse)

userid = 0
user_items = data_sparse.T.tocsr()
recommendations = model.recommend(userid, user_items)
print(recommendations)
for r in recommendations:
    print(artist_id_name[str(r[0])])

itemid = 107209
related = model.similar_items(itemid)
print(related)
for a in related:
    print(artist_id_name[str(a[0])])

artist_id_name['234786']
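# For context, a hedged sketch of how the plays/rows/cols arrays and the artist_id_name lookup
# used above might be built from a last.fm-style plays file. The file name and column names are
# assumptions; only the artist-by-user matrix orientation mirrors the snippet above.
import pandas as pd
from scipy import sparse

df = pd.read_csv("lastfm-plays.tsv", sep="\t", names=["user", "artist", "plays"])
user_cat = df["user"].astype("category")
artist_cat = df["artist"].astype("category")

users = user_cat.cat.categories       # unique users  -> matrix columns
artists = artist_cat.cat.categories   # unique artists -> matrix rows
rows = user_cat.cat.codes             # per-observation user index
cols = artist_cat.cat.codes           # per-observation artist index
plays = df["plays"].astype(float)

# lookup from stringified artist index to display name, as indexed above
artist_id_name = {str(i): name for i, name in enumerate(artists)}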
class MainRecommender:
    """Recommendations produced from ALS

    Input
    -----
    user_item_matrix: pd.DataFrame
        user-item interaction matrix
    """

    def __init__(self, data, weighting=True):
        # Top purchases of each user
        self.top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != 999999]

        # Top purchases over the whole dataset
        self.overall_top_purchases = data.groupby('item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[
            self.overall_top_purchases['item_id'] != 999999]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

        self.user_item_matrix = self._prepare_matrix(data)  # pd.DataFrame
        self.id_to_itemid, self.id_to_userid, \
            self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)

        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

    @staticmethod
    def _prepare_matrix(data):
        """Builds the user-item matrix"""
        user_item_matrix = pd.pivot_table(data,
                                          index='user_id', columns='item_id',
                                          values='quantity',  # other aggregations are possible
                                          aggfunc='count',
                                          fill_value=0)
        user_item_matrix = user_item_matrix.astype(float)  # the matrix type implicit expects
        return user_item_matrix

    @staticmethod
    def _prepare_dicts(user_item_matrix):
        """Prepares the helper lookup dictionaries"""
        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    def fit_own_recommender(self, user_item_matrix):
        """Fits a model that recommends among the items the user has already bought"""
        self.own_recommender = ItemItemRecommender(K=1, num_threads=4)
        self.own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())
        return self.own_recommender

    def fit_als(self, user_item_matrix, n_factors=20, regularization=0.001, iterations=15,
                num_threads=4, show_progress=True, use_gpu=True):
        """Fits the ALS model"""
        self.model_als = AlternatingLeastSquares(factors=n_factors,
                                                 regularization=regularization,
                                                 iterations=iterations,
                                                 use_gpu=use_gpu,
                                                 num_threads=num_threads,
                                                 random_state=0)
        self.model_als.fit(csr_matrix(user_item_matrix).T.tocsr(), show_progress=show_progress)
        return self.model_als

    def _update_dict(self, user_id):
        """If a new user / item appears, the dictionaries need to be updated"""
        if user_id not in self.userid_to_id.keys():
            max_id = max(list(self.userid_to_id.values()))
            max_id += 1
            self.userid_to_id.update({user_id: max_id})
            self.id_to_userid.update({max_id: user_id})

    def _get_similar_item(self, item_id):
        """Finds an item similar to item_id"""
        recs = self.model_als.similar_items(self.itemid_to_id[item_id], N=2)  # the item is similar to itself -> ask for 2
        top_rec = recs[1][0]  # take the second one (not the item passed in)
        return self.id_to_itemid[top_rec]

    def _extend_with_top_popular(self, recommendations, N=5):
        """If there are fewer than N recommendations, pad them with the overall top-popular items"""
        if len(recommendations) < N:
            recommendations.extend(self.overall_top_purchases[:N])
            recommendations = recommendations[:N]
        return recommendations

    def _get_recommendations(self, user, model_als, N=5):
        """Recommendations via the standard implicit API"""
        self._update_dict(user_id=user)
        recs = model_als.recommend(userid=self.userid_to_id[user],
                                   user_items=csr_matrix(self.user_item_matrix).tocsr(),
                                   N=N,
                                   filter_already_liked_items=False,
                                   filter_items=[self.itemid_to_id[999999]],
                                   recalculate_user=True)
        res = [self.id_to_itemid[rec[0]] for rec in recs]
        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Number of recommendations != {}'.format(N)
        return res

    def get_als_recommendations(self, user, N=5):
        """Recommendations via the standard implicit API"""
        self._update_dict(user_id=user)
        return self._get_recommendations(user, model_als=self.model_als, N=N)

    def get_own_recommendations(self, user, N=5):
        """Recommend among the items the user has already bought"""
        self._update_dict(user_id=user)
        return self._get_recommendations(user, model_als=self.own_recommender, N=N)

    def get_similar_items_recommendation(self, user, N=5):
        """Recommend items similar to the user's top-N purchased items"""
        top_users_purchases = self.top_purchases[self.top_purchases['user_id'] == user].head(N)

        res = top_users_purchases['item_id'].apply(lambda x: self._get_similar_item(x)).tolist()
        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Number of recommendations != {}'.format(N)
        return res

    def get_similar_users_recommendation(self, user, N=5):
        """Recommend the top-N items bought by similar users"""
        res = []

        # Find the top-N similar users
        similar_users = self.model_als.similar_users(self.userid_to_id[user], N=N + 1)
        similar_users = [rec[0] for rec in similar_users]
        similar_users = similar_users[1:]  # drop the query user itself

        for user in similar_users:
            userid = self.id_to_userid[user]  # own recommender works with user_ids
            res.extend(self.get_own_recommendations(userid, N=1))

        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Number of recommendations != {}'.format(N)
        return res
class MainRecommender:
    """Recommendations produced from ALS

    Input
    -----
    user_item_matrix: pd.DataFrame
        user-item interaction matrix
    """

    def __init__(self, data, data_product, weighting=True):
        # Top purchases of each user
        self.top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != 999999]

        # Top purchases over the whole dataset
        self.overall_top_purchases = data.groupby('item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[
            self.overall_top_purchases['item_id'] != 999999]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

        self.user_item_matrix = self.prepare_matrix(data)  # pd.DataFrame
        self.id_to_itemid, self.id_to_userid, \
            self.itemid_to_id, self.userid_to_id = self.prepare_dicts(self.user_item_matrix)

        # Dictionary {item_id: 0/1}, where 0/1 marks whether the item is a private-label (CTM) product
        self.item_id_to_ctm = self.prepare_ctm(data_product)

        # The own recommender is trained before the matrix is weighted
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)

        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        self.model = self.fit(self)

    @staticmethod
    def prepare_matrix(data):
        user_item_matrix = pd.pivot_table(data,
                                          index='user_id', columns='item_id',
                                          values='quantity',  # other aggregations are possible
                                          aggfunc='count',
                                          fill_value=0)
        user_item_matrix = user_item_matrix.astype(float)  # the matrix type implicit expects
        return user_item_matrix

    @staticmethod
    def prepare_ctm(data_product):
        """Builds the dictionary {item_id: 0/1}, where 0/1 marks whether the item is a private-label (CTM) product"""
        ctm_items = data_product[['item_id', 'brand']]
        ctm_items['feature'] = data_product['brand'].isin(['Private'])
        ctm_items = ctm_items.replace(to_replace=[False, True], value=[0, 1])
        return dict(zip(ctm_items['item_id'], ctm_items['feature']))

    @staticmethod
    def prepare_dicts(user_item_matrix):
        """Prepares the helper lookup dictionaries"""
        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Fits a model that recommends among the items the user has already bought"""
        own_recommender = ItemItemRecommender(K=1, num_threads=4)
        own_recommender.fit(csr_matrix(user_item_matrix).T.tocsr())
        return own_recommender

    @staticmethod
    def fit(self, n_factors=20, regularization=0.001, iterations=15, num_threads=4):
        """Fits the ALS model (called as self.fit(self), so the instance is passed explicitly)"""
        self.model = AlternatingLeastSquares(factors=n_factors,
                                             regularization=regularization,
                                             iterations=iterations,
                                             num_threads=num_threads)
        self.model.fit(csr_matrix(self.user_item_matrix).T.tocsr())
        return self.model

    def get_similar_items_recommendation(self, user, filter_ctm=True, N=5):
        """Recommend items similar to the user's top-N purchased items"""
        # your_code: mostly implemented in the previous webinar;
        # don't forget to take the filter_ctm parameter into account
        res = []
        recommends = self.model.similar_items(self.itemid_to_id[user], N)
        for item in recommends:
            res.append(item[0])
        return res

    def get_similar_users_recommendation(self, user, N=5):
        """Recommend the top-N items bought by similar users"""
        res = []
        recommends = self.model.similar_users(self.userid_to_id[user], N)
        for item in recommends:
            res.append(item[0])
        return res
def calculate_similar_artists(input_filename, output_filename, model_name="als",
                              factors=50, regularization=0.01, iterations=15,
                              exact=False, use_native=True, dtype=numpy.float64, cg=False):
    logging.debug("Calculating similar artists. This might take a while")

    # read in the input data file
    logging.debug("reading data from %s", input_filename)
    start = time.time()
    df, plays = read_data(input_filename)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    if model_name == "als":
        if exact:
            model = AlternatingLeastSquares(factors=factors, regularization=regularization,
                                            use_native=use_native, use_cg=cg, dtype=dtype,
                                            iterations=iterations)
        else:
            model = AnnoyAlternatingLeastSquares(factors=factors, regularization=regularization,
                                                 use_native=use_native, use_cg=cg, dtype=dtype,
                                                 iterations=iterations)

        # lets weight these models by bm25weight.
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(K1=100, B=0.5)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)

    # write out similar artists by popularity
    logging.debug("calculating top artists")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    # write out as a TSV of artistid, otherartistid, score
    with open(output_filename, "w") as o:
        for artistid in to_generate:
            artist = artists[artistid]
            for other, score in model.similar_items(artistid, 11):
                o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
class ImplicitALS:
    def __init__(self, df, config, orig_df):
        df = self._calc_confidence_preference(df, config.alpha)
        self.config = config
        self.orig_df = orig_df

        def check_index_uniformity(index):
            return index.min() == 0 and index.max() == len(index) - 1

        def index_info(index):
            return 'index with min %d max %d count %d items' % (
                index.min(), index.max(), len(index))

        assert check_index_uniformity(df.user_id.drop_duplicates()), \
            index_info(df.user_id.drop_duplicates())
        assert check_index_uniformity(df.item_id.drop_duplicates()), \
            index_info(df.item_id.drop_duplicates())

        users = df.user_id.to_list()
        items = df.item_id.to_list()
        rate = df.rate.to_list()
        shape = (len(set(items)), len(set(users)))
        self.iu_mat = csr_matrix((rate, (items, users)), shape=shape)
        self.ui_mat = self.iu_mat.transpose()

        self.model = ALS(factors=config.factors,
                         calculate_training_loss=True,
                         iterations=config.iterations,
                         regularization=config.regularization)
        self.max_uix = max(users)

    def _calc_confidence_preference(self, df, alpha):
        # convert to confidence and preference
        # use split_rate as a threshold for bad and good classes.
        # to enlarge negative effect, use quadratic transform for rate
        split_rate = 6
        eps = 1e-4
        get_p = lambda v: 1 if v > split_rate else 0
        get_logp = lambda v: log(1 + get_p(v / eps))
        df['rate'] = 1 + alpha * df.rate.apply(get_logp)
        return df

    def _delete_bookmarks(self, recs, seen_items):
        # Since filter_already_liked doesnt work, filter by hand
        for i, rec in enumerate(recs):
            if rec[0] in seen_items:
                recs[i] = None
        recs = list(filter(lambda r: r is not None, recs))
        return recs

    def fit(self):
        self.model.fit(self.iu_mat)

    def recommend_user(self, user, k, return_scores=False):
        user_items = self.orig_df[self.orig_df.user_id == user].item_id.tolist()

        # filter liked until len(recs) != given k
        base_k = k
        k = int(min(1.5 * k, k + 0.1 * len(user_items)))
        recs = self.model.recommend(user, self.ui_mat, N=k)
        recs = self._delete_bookmarks(recs, user_items)
        while len(recs) < base_k:
            k *= 2
            recs = self.model.recommend(user, self.ui_mat, N=k)
            recs = self._delete_bookmarks(recs, user_items)
        recs = recs[:base_k]

        # return with or without scores
        if not return_scores:
            return [rec[0] for rec in recs]
        else:
            return recs

    def similar_items(self, item, k, return_scores=False):
        # Returns items that are similar to item with given id
        recs = self.model.similar_items(item, k + 1)
        # avoid recommending same item
        recs = self._delete_bookmarks(recs, [item])
        recs = recs[:k]

        # return with or without scores
        if not return_scores:
            return [rec[0] for rec in recs]
        else:
            return recs

    def similar_items_for_user(self, item, user, k, return_scores=False):
        # Returns items that are similar to item with given id and havent
        # been seen by user with given id
        user_items = self.orig_df[self.orig_df.user_id == user].item_id.tolist()
        user_items += [item]  # avoid recommending same item

        # filter liked until len(recs) != given k
        base_k = k
        k = int(min(1.5 * k, k + 0.1 * len(user_items)))
        recs = self.model.similar_items(item, k)
        recs = self._delete_bookmarks(recs, user_items)
        while len(recs) < base_k:
            k *= 2
            # was self.model.recommend(item, k), which mixed up the recommend/similar_items APIs
            recs = self.model.similar_items(item, k)
            recs = self._delete_bookmarks(recs, user_items)
        recs = recs[:base_k]

        # return with or without scores
        if not return_scores:
            return [rec[0] for rec in recs]
        else:
            return recs

    def _add_empty_user(self):
        # Enlarges ui_mat and als model user_factors for 1 extra user
        # Upd wrapper data
        self.max_uix += 1
        old_shape = self.ui_mat.shape
        self.ui_mat.resize((old_shape[0] + 1, old_shape[1]))

        # Upd inner model data
        k = self.model.factors
        # set random weights for new user
        self.model.user_factors = np.vstack(
            (self.model.user_factors, np.random.randn(k)))

    def update_user_data(self, user, user_views):
        # Updates model's data about user and recalculates it
        assert isinstance(user, int)
        assert isinstance(user_views, pd.DataFrame)
        assert len(user_views) > 0
        assert len(user_views.user_id.drop_duplicates()) == 1

        user_views = user_views[user_views.item_id != -1]
        user_views = user_views.drop_duplicates(subset='item_id user_id'.split(), keep='last')
        user_views = self._calc_confidence_preference(user_views, self.config.alpha)

        iixs = user_views.item_id.tolist()
        rates = user_views.rate.tolist()

        # Create new user rates csr matrix
        rowscols = ([0 for _ in iixs], iixs)
        size = (1, self.ui_mat.shape[1])

        # Upd wrapper data
        assert user <= self.max_uix
        self.ui_mat[user] = csr_matrix((rates, rowscols), shape=size)

        # Upd inner model data
        k = self.model.factors
        # set random weights for new user
        self.model.user_factors = np.vstack(
            (self.model.user_factors, np.random.randn(k)))

        # recalculate
        new_user_factors = self.model.recalculate_user(user, self.ui_mat)
        self.model.user_factors[user] = new_user_factors

    def add_user(self, user, user_views=None):
        # Adds user to recommender model. Updates model's matrixes, allows making
        # predictions for new user
        assert isinstance(user, int)
        self._add_empty_user()
        if user_views is None:
            return

        assert isinstance(user_views, pd.DataFrame)
        assert len(user_views) > 0
        assert len(user_views.user_id.drop_duplicates()) == 1
        self.update_user_data(user, user_views)
class MainRecommender:
    own_recomender_defult_param = {'filter_already_liked_items': False, 'filter_items': False,
                                   'recalculate_user': True}
    model_als_defult_param = {'factors': 50, 'regularization': 15, 'iterations': 15,
                              'num_threads': -1, 'calculate_training_loss': False}

    def __init__(self, data, data_test=None, split_info=None):
        """
        data - dataframe with the data
        data_test - validation data; if it is absent and split_info is given, data_test is created
        split_info - tuple describing how to build data_test (size, split column);
                     only used when data_test is absent
        """
        self.top = 5000
        self.data_validation = {}
        self.data_validation['status'] = False
        self.user_item_matrix = {'status': False, 'matrix': None, 'params': None}
        self.own_recommender_is_fit = {'status': False, 'params': None}
        self.als_recommender_is_fit = {'status': False, 'params': None}
        self.data = data.copy()
        self.full_data_train = data.copy()  # keep the full data in case we need to predict on the full dataset
        self.data_train = data.copy()
        if data_test is not None:
            self.data_test = data_test.copy()
        else:
            self.data_test = None
            if split_info:
                self.data_train, self.data_test = self.train_test_split(test_size_num=split_info[0],
                                                                        split_column=split_info[1])
        if self.data_test is not None:
            self.data_validation['data'] = self.get_validation_data()
            self.data_validation['status'] = True

    def prefiltr_1(self, my_data):
        """Keep only the self.top most popular items; rename the rest to 999999"""
        df = my_data.copy()
        popularity = my_data.groupby('item_id')['quantity'].count().reset_index()
        popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
        top_5000 = popularity.sort_values('n_sold', ascending=False).head(self.top).item_id.tolist()
        df.loc[~df['item_id'].isin(top_5000), 'item_id'] = 999999
        return df

    def prefiltr_2(self, data_train, n=5000):
        """Keep only the n most popular items; drop transactions with all other items"""
        df = data_train.copy()
        popularity = df.groupby('item_id')['quantity'].count().reset_index()
        popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
        top_n = popularity.sort_values('n_sold', ascending=False).head(n).item_id.tolist()
        df = df.loc[df['item_id'].isin(top_n)]
        return df

    def prefiltr_3(self, data_train, n=5000):
        """Drop transactions with the n least popular items"""
        df = data_train.copy()
        not_popularity = df.groupby('item_id')['quantity'].count().reset_index()
        not_popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
        not_top_n = not_popularity.sort_values('n_sold').head(n).item_id.tolist()
        df = df.loc[~df['item_id'].isin(not_top_n)]
        return df

    def prefiltr_4(self, data_train, weeks=50):
        """Drop transactions with items that have not been bought for more than `weeks` weeks"""
        df = data_train.copy()
        old_item = df.groupby('item_id')['week_no'].max().reset_index()
        old_item = old_item.loc[old_item['week_no'] > weeks, 'item_id'].tolist()
        df = df.loc[df['item_id'].isin(old_item)]
        return df

    def train_test_split(self, test_size_num, split_column):
        data_train = self.data[self.data[split_column] < self.data[split_column].max() - test_size_num]
        data_test = self.data[self.data[split_column] >= self.data[split_column].max() - test_size_num]
        return data_train, data_test

    def get_validation_data(self):
        result = self.data_test.groupby('user_id')['item_id'].unique().reset_index()
        users_train = self.data_train.user_id.unique()
        result = result[result.user_id.isin(users_train)]
        result['train'] = result['user_id'].map(self.data_train.groupby('user_id')['item_id'].unique())
        result['full_train'] = result['user_id'].map(self.full_data_train.groupby('user_id')['item_id'].unique())
        result.rename(columns={'item_id': 'test'}, inplace=True)
        result.reset_index(inplace=True, drop=True)
        return result

    def prepare_matrix(self, agg_column, full=None, filtr=None):
        my_data = self.data_train.copy()
        if full:
            my_data = self.full_data_train.copy()
        if filtr:
            for i in filtr:
                prefiltr = 'self.prefiltr_' + str(i) + '(my_data)'
                my_data = eval(prefiltr)
        user_item_matrix = pd.pivot_table(my_data,
                                          index='user_id', columns='item_id',
                                          values=agg_column[0],
                                          aggfunc=agg_column[1],
                                          fill_value=0)
        user_item_matrix = user_item_matrix.astype(float)
        self.prepare_dicts(user_item_matrix)
        self.current_working_data = my_data.copy()
        return user_item_matrix

    def prepare_dicts(self, user_item_matrix):
        """Prepares the helper lookup dictionaries"""
        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values
        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))
        self.id_to_itemid = dict(zip(matrix_itemids, itemids))
        self.id_to_userid = dict(zip(matrix_userids, userids))
        self.itemid_to_id = dict(zip(itemids, matrix_itemids))
        self.userid_to_id = dict(zip(userids, matrix_userids))
        return self.id_to_itemid, self.id_to_userid, self.itemid_to_id, self.userid_to_id

    def make_data(self, agg_column, filtr=None, full=False, top=5000):
        self.top = top
        self.full = full
        uim = self.prepare_matrix(agg_column=agg_column, full=full, filtr=filtr)
        uim_w = uim.copy()
        self.user_item_matrix['uim_matrix_w'] = csr_matrix(uim_w).tocsr()
        uim[uim > 0] = 1
        self.user_item_matrix['uim_matrix'] = csr_matrix(uim).tocsr()
        self.user_item_matrix['ium_matrix_w_tfidf'] = tfidf_weight(csr_matrix(uim_w.T).tocsr())
        self.user_item_matrix['ium_matrix_tfidf'] = tfidf_weight(csr_matrix(uim.T).tocsr())
        self.user_item_matrix['ium_matrix_w_bm25'] = bm25_weight(csr_matrix(uim_w.T).tocsr())
        self.user_item_matrix['ium_matrix_bm25'] = bm25_weight(csr_matrix(uim.T).tocsr())
        self.user_item_matrix['status'] = True
        self.user_item_matrix['params'] = {'agg_column': agg_column, 'filtr': filtr, 'full': full}
        return self.user_item_matrix

    def precision_at_k(x, k=5):
        if len(x['predict']) == 0:
            return 0
        bought_list = np.array(x['test'])
        recommended_list = np.array(x['predict'])[:k]
        flags = np.isin(bought_list, recommended_list)
        precision = flags.sum() / len(recommended_list)
        return precision

    def fit_own_recommender(self, weighting=False):
        """Fits a model that recommends among the items the user has already bought"""
        assert self.user_item_matrix['status'], \
            'call make_data(self, agg_column, filtr=None, weighting=None, full=False) first'
        ium = self.user_item_matrix['uim_matrix'].T
        if weighting:
            assert (weighting == 'tf_idf' or weighting == 'bm25'), \
                'weighting must be tf_idf, bm25 or None'
            if weighting == 'tf_idf':
                ium = self.user_item_matrix['ium_matrix_tfidf']
            else:
                ium = self.user_item_matrix['ium_matrix_bm25']
        self.own_recommender = ItemItemRecommender(K=1, num_threads=-1)
        self.own_recommender.fit(ium)
        self.own_recommender_is_fit['status'] = True
        self.own_recommender_is_fit['params'] = {'model': 'ItemItemRecommender(K=1, num_threads=-1)',
                                                 'weighting': weighting}
        self.own_recommender_is_fit['ium'] = ium
        return self.own_recommender

    def predict_own_recommender(self, users, N=5, params=own_recomender_defult_param):
        param = params.copy()
        assert self.own_recommender_is_fit['status'], 'call fit_own_recommender() first'
        assert type(users) == list, 'users must be a list'
        uim = self.user_item_matrix['uim_matrix']
        param['user_items'] = uim
        param['N'] = N
        answer = pd.DataFrame()
        answer['user_id'] = users
        if param['filter_items']:
            param['filter_items'] = [self.itemid_to_id[i] for i in params['filter_items']]
        rec = []
        for user in users:
            param['userid'] = self.userid_to_id[user]
            rec.append([self.id_to_itemid[i[0]] for i in self.own_recommender.recommend(**param)])
        answer['result'] = rec
        return answer

    def validation_own_recommender(self, metric=precision_at_k, N=5, params=own_recomender_defult_param):
        assert self.data_validation['status'], 'validation data has not been created'
        assert self.own_recommender_is_fit['status'], 'call fit_own_recommender() first'
        df = self.data_validation['data']
        users = df['user_id'].to_list()
        predict = self.predict_own_recommender(users=users, N=N, params=params)
        df['predict'] = predict['result']
        return df.apply(metric, axis=1).mean()

    def fit_als(self, params=model_als_defult_param, weighting=False):
        """Fits the ALS model"""
        assert self.user_item_matrix['status'], \
            'call make_data(self, agg_column, filtr=None, weighting=None, full=False) first'
        ium = self.user_item_matrix['uim_matrix_w'].T
        if weighting:
            assert (weighting == 'tf_idf' or weighting == 'bm25'), \
                'weighting must be tf_idf, bm25 or None'
            if weighting == 'tf_idf':
                ium = self.user_item_matrix['ium_matrix_w_tfidf']
            else:
                ium = self.user_item_matrix['ium_matrix_w_bm25']
        self.model_als = AlternatingLeastSquares(**params)
        self.model_als.fit(ium)
        self.als_recommender_is_fit['status'] = True
        self.als_recommender_is_fit['params'] = {'model': params, 'weighting': weighting}
        self.als_recommender_is_fit['ium'] = ium
        return self.model_als

    def predict_als(self, users, N=5, params=own_recomender_defult_param):
        param = params.copy()
        assert self.als_recommender_is_fit['status'], 'call fit_als() first'
        assert type(users) == list, 'users must be a list'
        uim = self.user_item_matrix['uim_matrix_w']
        param['user_items'] = uim
        param['N'] = N
        answer = pd.DataFrame()
        answer['user_id'] = users
        if param['filter_items']:
            param['filter_items'] = [self.itemid_to_id[i] for i in params['filter_items']]
        rec = []
        for user in users:
            param['userid'] = self.userid_to_id[user]
            rec.append([self.id_to_itemid[i[0]] for i in self.model_als.recommend(**param)])
        answer['result'] = rec
        return answer

    def validation_als_recommender(self, metric=precision_at_k, N=5, params=own_recomender_defult_param):
        assert self.data_validation['status'], 'validation data has not been created'
        assert self.als_recommender_is_fit['status'], 'call fit_als() first'
        df = self.data_validation['data'].copy()
        users = df['user_id'].to_list()
        predict = self.predict_als(users=users, N=N, params=params)
        df['predict'] = predict['result']
        return df.apply(metric, axis=1).mean()

    def get_recs(self, user, popularity, not_my=0):
        result = []
        for item in popularity[popularity['user_id'] == user]['item_id'].to_list():
            recs_ = self.model_als.similar_items(self.itemid_to_id[item], N=3)
            recs = [self.id_to_itemid[i[0]] for i in recs_]
            if 999999 in recs:
                recs.remove(999999)
            result.append(recs[not_my])
        return result

    def get_similar_items_recommendation(self, users, not_my=0, N=5):
        """Recommend items similar to the user's top-N purchased items.
        not_my = 1 to predict the user's own purchases again (like own_recommender), 0 for the opposite"""
        assert self.als_recommender_is_fit['status'], 'the ALS model is not fitted, use fit_als()'
        assert type(users) == list, 'the users parameter must be a list'
        assert not_my in [0, 1], 'the not_my parameter must be 0 or 1'
        my_data = self.current_working_data.copy()
        my_data = my_data[my_data['user_id'].isin(users)]
        popularity = my_data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        popularity.sort_values('quantity', ascending=False, inplace=True)
        popularity = popularity[popularity['item_id'] != 999999]
        popularity = popularity.groupby('user_id').head(N)
        popularity.sort_values(['user_id', 'quantity'], ascending=False, inplace=True)
        result = pd.DataFrame()
        result['user_id'] = users
        result['similar_recommendation'] = result['user_id'].apply(
            lambda x: self.get_recs(user=x, popularity=popularity, not_my=not_my))
        return result

    def validation_similar_items_recommendation(self, metric=precision_at_k, N=5, not_my=0):
        assert self.data_validation['status'], 'validation data has not been created'
        assert self.als_recommender_is_fit['status'], 'call fit_als() first'
        assert not_my in [0, 1], 'the not_my parameter must be 0 or 1'
        df = self.data_validation['data'].copy()
        users = df['user_id'].to_list()
        predict = self.get_similar_items_recommendation(users=users, N=N, not_my=not_my)
        df['predict'] = predict['similar_recommendation']
        return df.apply(metric, axis=1).mean()

    def get_user(self, user):
        users = self.model_als.similar_users(self.userid_to_id[user], N=2)
        return self.id_to_userid[users[1][0]]

    def get_similar_users_recommendation(self, users, N=5, params=own_recomender_defult_param):
        """Recommend the top-N items bought by similar users"""
        assert self.als_recommender_is_fit['status'], 'the ALS model is not fitted, use fit_als()'
        assert type(users) == list, 'the users parameter must be a list'
        result = pd.DataFrame()
        result['user_id'] = users
        result['simular_user_id'] = result['user_id'].apply(self.get_user)
        result['similar_recommendation'] = self.predict_als(result['simular_user_id'].to_list(),
                                                            N=5, params=params)['result']
        return result

    def validation_similar_users_recommendation(self, metric=precision_at_k, N=5):
        assert self.data_validation['status'], 'validation data has not been created'
        assert self.als_recommender_is_fit['status'], 'call fit_als() first'
        df = self.data_validation['data'].copy()
        users = df['user_id'].to_list()
        predict = self.get_similar_users_recommendation(users=users, N=N)
        df['predict'] = predict['similar_recommendation']
        return df.apply(metric, axis=1).mean()
class MainRecommender:
    """Recommendations produced from ALS

    Input
    -----
    user_item_matrix: pd.DataFrame
        user-item interaction matrix
    """

    def __init__(self, data, user_features, item_features, items_to_filter=[999999], weighting=True):
        self.items_to_filter = items_to_filter

        # Top purchases of each user
        self.top_purchases = data.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.top_purchases = self.top_purchases[self.top_purchases['item_id'] != 999999]

        # Top purchases over the whole dataset
        self.overall_top_purchases = data.groupby('item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[
            ~self.overall_top_purchases['item_id'].isin(self.items_to_filter)]  # ~ negates the mask
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

        self.user_item_matrix, self.sparse_user_item = self._prepare_matrix(data)  # pd.DataFrame
        self.id_to_itemid, self.id_to_userid, \
            self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)

        # With this construction LightFM will not receive the weighted matrix
        self.user_feat_lightfm_fixed, self.item_feat_lightfm_fixed = self._prepare_user_item_feat_lightfm(
            self.user_item_matrix, user_features, item_features)
        self.user_item_matrix_lightfm = self.user_item_matrix.copy()

        if weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

    @staticmethod
    def _prepare_matrix(data):
        """Builds the user-item matrix"""
        user_item_matrix = pd.pivot_table(data,
                                          index='user_id', columns='item_id',
                                          values='quantity',  # other aggregations are possible
                                          aggfunc='count',
                                          fill_value=0)
        user_item_matrix = user_item_matrix.astype(float)  # the matrix type implicit expects

        # convert to sparse matrix format (for LightFM)
        sparse_user_item = csr_matrix(user_item_matrix).tocsr()

        return user_item_matrix, sparse_user_item

    @staticmethod
    def _prepare_dicts(user_item_matrix):
        """Prepares the helper lookup dictionaries"""
        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    @staticmethod
    def _prepare_user_item_feat_lightfm(user_item_matrix, user_features, item_features):
        """Prepares the user/item features in the format LightFM expects"""
        user_feat = pd.DataFrame(user_item_matrix.index)
        user_feat = user_feat.merge(user_features, on='user_id', how='left').drop(columns=['homeowner_desc'])
        user_feat.set_index('user_id', inplace=True)

        item_feat = pd.DataFrame(user_item_matrix.columns)
        item_feat = item_feat.merge(item_features, on='item_id', how='left').drop(
            columns=['sub_commodity_desc', 'curr_size_of_product'])
        item_feat.set_index('item_id', inplace=True)

        user_feat_lightfm_fixed = pd.get_dummies(user_feat, columns=user_feat.columns.tolist())
        item_feat_lightfm_fixed = pd.get_dummies(item_feat, columns=item_feat.columns.tolist())

        return user_feat_lightfm_fixed, item_feat_lightfm_fixed

    def fit_own_recommender(self):
        """Fits a model that recommends among the items the user has already bought"""
        self.own_recommender = ItemItemRecommender(K=1, num_threads=4)
        self.own_recommender.fit(csr_matrix(self.user_item_matrix).T.tocsr())
        return self.own_recommender

    def fit_als(self, n_factors=20, regularization=0.001, iterations=15, num_threads=4,
                show_progress=True, use_gpu=True, random_state=42):
        """Fits the ALS model"""
        self.model_als = AlternatingLeastSquares(factors=n_factors,
                                                 regularization=regularization,
                                                 iterations=iterations,
                                                 use_gpu=use_gpu,
                                                 num_threads=num_threads,
                                                 random_state=random_state)
        self.model_als.fit(csr_matrix(self.user_item_matrix).T.tocsr(), show_progress=show_progress)
        return self.model_als

    def fit_lightfm(self, no_components=16, loss='warp', learning_rate=0.05, item_alpha=0.2,
                    user_alpha=0.05, random_state=42, epochs=15):
        """Fits the LightFM model"""
        self.model_lightfm = LightFM(no_components=no_components,
                                     loss=loss,  # the difference between the losses is discussed later in the lesson
                                     learning_rate=learning_rate,
                                     item_alpha=item_alpha,
                                     user_alpha=user_alpha,
                                     random_state=random_state)
        self.model_lightfm.fit(
            (self.sparse_user_item > 0) * 1,  # user-item matrix of 0s and 1s
            sample_weight=coo_matrix(self.user_item_matrix_lightfm),  # weight matrix C
            user_features=csr_matrix(self.user_feat_lightfm_fixed.values).tocsr(),
            item_features=csr_matrix(self.item_feat_lightfm_fixed.values).tocsr(),
            epochs=epochs,
            num_threads=8)
        return self.model_lightfm

    def precision_at_k_lightfm(self, model_lightfm, sparse_user_item, k=5):
        """Precision as implemented in LightFM"""
        self.precision_res = precision_at_k(
            model_lightfm, sparse_user_item,
            user_features=csr_matrix(self.user_feat_lightfm_fixed.values).tocsr(),
            item_features=csr_matrix(self.item_feat_lightfm_fixed.values).tocsr(),
            k=k)
        return self.precision_res

    def recall_at_k_lightfm(self, model_lightfm, sparse_user_item, k=5):
        """Recall as implemented in LightFM"""
        self.recall_res = recall_at_k(
            model_lightfm, sparse_user_item,
            user_features=csr_matrix(self.user_feat_lightfm_fixed.values).tocsr(),
            item_features=csr_matrix(self.item_feat_lightfm_fixed.values).tocsr(),
            k=k)
        return self.recall_res

    def _update_dict(self, user_id):
        """If a new user / item appears, the dictionaries need to be updated"""
        if user_id not in self.userid_to_id.keys():
            max_id = max(list(self.userid_to_id.values()))
            max_id += 1
            self.userid_to_id.update({user_id: max_id})
            self.id_to_userid.update({max_id: user_id})

    def _get_similar_item(self, item_id):
        """Finds an item similar to item_id"""
        recs = self.model_als.similar_items(self.itemid_to_id[item_id], N=2)  # the item is similar to itself -> ask for 2
        top_rec = recs[1][0]  # take the second one (not the item passed in)
        return self.id_to_itemid[top_rec]

    def _extend_with_top_popular(self, recommendations, N=5):
        """If there are fewer than N recommendations, pad them with the overall top-popular items"""
        if len(recommendations) < N:
            recommendations.extend(self.overall_top_purchases[:N])
            recommendations = recommendations[:N]
        return recommendations

    def _get_recommendations(self, user, model, N=5):
        """Recommendations via the standard implicit API"""
        self._update_dict(user_id=user)
        recs = model.recommend(
            userid=self.userid_to_id[user],
            user_items=csr_matrix(self.user_item_matrix).tocsr(),
            N=N,
            filter_already_liked_items=False,
            filter_items=[self.itemid_to_id[item] for item in self.items_to_filter
                          if item in self.itemid_to_id.keys()],
            recalculate_user=True)
        res = [self.id_to_itemid[rec[0]] for rec in recs]
        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Number of recommendations != {}'.format(N)
        return res

    def get_als_recommendations(self, user, N=5):
        """Recommendations via the standard implicit API"""
        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.model_als, N=N)

    def get_lightfm_recommendations(self, user, N=5):
        """Recommendations from the LightFM model"""
        self._update_dict(user_id=user)
        test_item_ids = np.arange(len(self.itemid_to_id))
        scores = self.model_lightfm.predict(
            user_ids=int(self.userid_to_id[user]),  # predict complains about numpy.int64, so cast to int
            item_ids=test_item_ids,
            user_features=csr_matrix(self.user_feat_lightfm_fixed.values).tocsr(),
            item_features=csr_matrix(self.item_feat_lightfm_fixed.values).tocsr(),
            num_threads=8)
        top_items = np.argsort(-scores)
        # Convert ids back and slice to the requested size, since predictions cover all items
        res = [self.id_to_itemid[item] for item in top_items][:N]
        # In theory padding is not needed here, since this model predicts for all items at once
        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Number of recommendations != {}'.format(N)
        return res

    def get_own_recommendations(self, user, N=5):
        """Recommend among the items the user has already bought"""
        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.own_recommender, N=N)

    def get_similar_items_recommendation(self, user, N=5):
        """Recommend items similar to the user's top-N purchased items. Does not filter item_id!"""
        top_users_purchases = self.top_purchases[self.top_purchases['user_id'] == user].head(N)

        res = top_users_purchases['item_id'].apply(lambda x: self._get_similar_item(x)).tolist()
        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Number of recommendations != {}'.format(N)
        return res

    def get_similar_users_recommendation(self, user, N=5):
        """Recommend the top-N items bought by similar users. Does not filter item_id!"""
        res = []

        # Find the top-N similar users
        similar_users = self.model_als.similar_users(self.userid_to_id[user], N=N + 1)
        similar_users = [rec[0] for rec in similar_users]
        similar_users = similar_users[1:]  # drop the query user itself

        for user in similar_users:
            userid = self.id_to_userid[user]  # own recommender works with user_ids
            res.extend(self.get_own_recommendations(userid, N=1))

        res = self._extend_with_top_popular(res, N=N)

        assert len(res) == N, 'Number of recommendations != {}'.format(N)
        return res
class Recommender:

    def __init__(self, factors=50):
        self.model = AlternatingLeastSquares(factors=factors,
                                             regularization=0.01,
                                             dtype=np.float64,
                                             iterations=50)

    def train(self, data):
        userids = data.userid.astype("category")
        itemids = data.itemid.astype("category")

        matrix = coo_matrix((data.confidence.astype('float64'),
                             (itemids.cat.codes.copy(),
                              userids.cat.codes.copy())))

        self.model.fit(matrix)
        self.t_matrix = matrix.T.tocsr()

        self.userid_to_code = dict([(category, code) for code, category in enumerate(userids.cat.categories)])
        self.itemid_to_code = dict([(category, code) for code, category in enumerate(itemids.cat.categories)])
        self.usercode_to_id = dict([(code, category) for code, category in enumerate(userids.cat.categories)])
        self.itemcode_to_id = dict([(code, category) for code, category in enumerate(itemids.cat.categories)])

    def similar_items(self, itemid, N=10):
        item_code = self.itemid_to_code[itemid]
        similar_codes = self.model.similar_items(item_code, N)
        similar_ids = [(self.itemcode_to_id[code], s) for code, s in similar_codes]
        return pd.DataFrame(similar_ids, columns=["itemid", "similarity"])

    def recommendations(self, userid, N=10):
        user_code = self.userid_to_code[userid]
        user_item_codes = self.model.recommend(user_code, self.t_matrix, N)
        user_item_ids = [(self.itemcode_to_id[code], c) for code, c in user_item_codes]
        return pd.DataFrame(user_item_ids, columns=["itemid", "confidence"])

    def explain(self, userid, itemid):
        user_code = self.userid_to_code[userid]
        item_code = self.itemid_to_code[itemid]
        return self.model.explain(user_code, self.t_matrix, item_code)

    def confidence(self, userid, itemid):
        item_code = self.itemid_to_code[itemid]
        user_code = self.userid_to_code[userid]
        item_factor = self.model.item_factors[item_code]
        user_factor = self.model.user_factors[user_code]
        return item_factor.dot(user_factor)

    def user_factors(self):
        factors = pd.DataFrame(self.model.user_factors).add_prefix("f")
        ids = factors.index.map(lambda code: self.usercode_to_id[code])
        factors.insert(0, "userid", ids)
        return factors

    def item_factors(self):
        factors = pd.DataFrame(self.model.item_factors).add_prefix("f")
        ids = factors.index.map(lambda code: self.itemcode_to_id[code])
        factors.insert(0, "itemid", ids)
        return factors

    def items_recommendations(self, itemids, N=10):
        user_code = 0
        item_codes = [self.itemid_to_code[id] for id in itemids]
        data = [1 for _ in item_codes]
        rows = [0 for _ in item_codes]
        shape = (1, self.model.item_factors.shape[0])
        user_items = coo_matrix((data, (rows, item_codes)), shape=shape).tocsr()
        user_item_codes = self.model.recommend(user_code, user_items, N, recalculate_user=True)
        user_item_ids = [(self.itemcode_to_id[code], c) for code, c in user_item_codes]
        return pd.DataFrame(user_item_ids, columns=["itemid", "confidence"])
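# Hedged usage sketch for the Recommender class above: build a toy interactions
# DataFrame with the userid / itemid / confidence columns train() expects, fit
# the ALS model, and query it. The toy data is made up, and the class itself
# assumes an older implicit API (pre-0.5), in which recommend() and
# similar_items() return lists of (id, score) pairs.
if __name__ == "__main__":
    import pandas as pd

    toy = pd.DataFrame({
        "userid":     [1, 1, 2, 2, 3, 3],
        "itemid":     ["a", "b", "a", "c", "b", "c"],
        "confidence": [1.0, 2.0, 1.0, 3.0, 2.0, 1.0],
    })

    rec = Recommender(factors=8)  # small factor count for a toy dataset
    rec.train(toy)

    print(rec.similar_items("a", N=2))            # DataFrame: itemid, similarity
    print(rec.recommendations(1, N=2))            # DataFrame: itemid, confidence
    print(rec.items_recommendations(["a", "b"]))  # recommend from a basket of items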
class AlsEstimator(TransformerMixin, BaseEstimator):

    def __init__(self, recommendations='als', n_rec=5, n_rec_pre=100, n_new=2, n_exp=1,
                 price_lte=7, filter_item_id=-99, filter=True, filter_post=True,
                 postfilter_func=None, factors=50, regularization=0.01, iterations=10,
                 matrix_values='quantity', matrix_aggfunc='count', weighting=True,
                 use_native=True, use_gpu=False):
        self.n_rec = n_rec
        self.n_rec_pre = n_rec_pre
        self.n_new = n_new
        self.n_exp = n_exp
        self.price_lte = price_lte
        self.filter_item_id = filter_item_id
        self.filter = filter
        self.filter_post = filter_post
        self.postfilter_func = postfilter_func
        self.factors = factors
        self.regularization = regularization
        self.iterations = iterations
        self.matrix_values = matrix_values
        self.matrix_aggfunc = matrix_aggfunc
        self.recommendations = recommendations
        self.weighting = weighting  # was hard-coded to True, which ignored the argument
        self.use_native = use_native
        self.use_gpu = use_gpu

    def _reset(self):
        if hasattr(self, 'item_info'):
            del self.item_info
        if hasattr(self, 'user_history'):
            del self.user_history
        if hasattr(self, 'top_purchases'):
            del self.top_purchases
        if hasattr(self, 'overall_top_purchases'):
            del self.overall_top_purchases
        if hasattr(self, 'user_item_matrix'):
            del self.user_item_matrix
        if hasattr(self, 'id_to_itemid'):
            del self.id_to_itemid
        if hasattr(self, 'id_to_userid'):
            del self.id_to_userid
        if hasattr(self, 'itemid_to_id'):
            del self.itemid_to_id
        if hasattr(self, 'userid_to_id'):
            del self.userid_to_id
        if hasattr(self, '_fit'):
            del self._fit

    @staticmethod
    def _prepare_matrix(data: pd.DataFrame, values: str, aggfunc: str):
        """Builds the user-item matrix"""
        user_item_matrix = pd.pivot_table(data,
                                          index='user_id', columns='item_id',
                                          values=values,
                                          aggfunc=aggfunc,
                                          fill_value=0)
        user_item_matrix = user_item_matrix.astype(float)
        return user_item_matrix

    @staticmethod
    def _prepare_dicts(user_item_matrix):
        """Builds the auxiliary id dictionaries"""
        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id

    def fit(self, X, y=None):
        self._reset()

        self.item_info = X.groupby('item_id').agg({
            'price': 'max',
            'SUB_COMMODITY_DESC': 'first'
        })
        self.user_history = pd.DataFrame(
            X.groupby('user_id').item_id.unique().rename('history'))

        # Top purchases per user
        self.top_purchases = X.groupby(['user_id', 'item_id'])['quantity'].count().reset_index()
        self.top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.top_purchases = self.top_purchases[
            self.top_purchases['item_id'] != self.filter_item_id]

        # Top purchases over the whole dataset
        self.overall_top_purchases = X.groupby('item_id')['quantity'].count().reset_index()
        self.overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
        self.overall_top_purchases = self.overall_top_purchases[
            self.overall_top_purchases['item_id'] != self.filter_item_id]
        self.overall_top_purchases = self.overall_top_purchases.item_id.tolist()

        self.user_item_matrix = self._prepare_matrix(X, self.matrix_values, self.matrix_aggfunc)
        self.id_to_itemid, self.id_to_userid, \
            self.itemid_to_id, self.userid_to_id = self._prepare_dicts(self.user_item_matrix)

        if self.weighting:
            self.user_item_matrix = bm25_weight(self.user_item_matrix.T).T

        self.model = AlternatingLeastSquares(
            factors=self.factors,
            regularization=self.regularization,
            iterations=self.iterations,
            dtype=np.float32,
            use_native=self.use_native,
            use_gpu=self.use_gpu,
        )
        self.model.fit(csr_matrix(self.user_item_matrix).T.tocsr())

        self.model_own_recommender = ItemItemRecommender(K=1)
        self.model_own_recommender.fit(csr_matrix(self.user_item_matrix).T.tocsr())

        self._fit = True
        return self  # sklearn estimators are expected to return self from fit()

    def transform(self, X):
        if self._fit:
            X = X['user_id'].drop_duplicates()
            X.index = X.values
        return X

    def _update_dict(self, user_id):
        """If a new user / item appears, the id dictionaries need to be updated"""
        if user_id not in self.userid_to_id.keys():
            max_id = max(list(self.userid_to_id.values()))
            max_id += 1

            self.userid_to_id.update({user_id: max_id})
            self.id_to_userid.update({max_id: user_id})

    def _get_similar_item(self, item_id):
        """Finds an item similar to item_id"""
        # An item is always most similar to itself, so ask for 2 items
        recs = self.model.similar_items(self.itemid_to_id[item_id], N=2)
        top_rec = recs[1][0]  # take the second one (not the query item)
        return self.id_to_itemid[top_rec]

    def _extend_with_top_popular(self, recommendations):
        """If there are fewer recommendations than required, pad them with the overall top-popular items"""
        if self.filter_post:
            n_rec = self.n_rec_pre
        else:
            n_rec = self.n_rec

        if len(recommendations) < n_rec:
            recommendations.extend(self.overall_top_purchases[:n_rec])
            recommendations = recommendations[:n_rec]

        return recommendations

    def _get_recommendations(self, user, model, n_rec):
        """Recommendations via the standard implicit models"""
        self._update_dict(user_id=user)
        try:
            res = [
                self.id_to_itemid[rec[0]] for rec in model.recommend(
                    userid=self.userid_to_id[user],
                    user_items=csr_matrix(self.user_item_matrix).tocsr(),
                    N=n_rec,
                    filter_already_liked_items=False,
                    filter_items=[self.itemid_to_id[self.filter_item_id]],
                    recalculate_user=True)
            ]
        except Exception:
            res = list()
        finally:
            res = self._extend_with_top_popular(res)

        assert len(res) == n_rec, 'The number of recommendations != {}'.format(n_rec)
        return res

    def get_als_recommendations(self, user):
        """ALS recommendations via the standard implicit models"""
        if self.filter_post:
            n_rec = self.n_rec_pre
        else:
            n_rec = self.n_rec

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.model, n_rec=n_rec)

    def get_own_recommendations(self, user):
        """Recommend items from among those the user has already bought"""
        if self.filter_post:
            n_rec = self.n_rec_pre
        else:
            n_rec = self.n_rec

        self._update_dict(user_id=user)
        return self._get_recommendations(user, model=self.model_own_recommender, n_rec=n_rec)

    def get_similar_items_recommendations(self, user):
        """Recommend items similar to the user's top-N purchased items"""
        if self.filter_post:
            n_rec = self.n_rec_pre
        else:
            n_rec = self.n_rec

        top_users_purchases = self.top_purchases[self.top_purchases['user_id'] == user].head(n_rec)

        res = top_users_purchases['item_id'].apply(lambda x: self._get_similar_item(x)).tolist()
        res = self._extend_with_top_popular(res)

        assert len(res) == n_rec, 'The number of recommendations != {}'.format(n_rec)
        return res

    def get_similar_users_recommendations(self, user):
        """Recommend the top-N items bought by similar users"""
        if self.filter_post:
            n_rec = self.n_rec_pre
        else:
            n_rec = self.n_rec

        res = []

        # Find the top-N most similar users (the first result is the query user itself)
        similar_users = self.model.similar_users(self.userid_to_id[user], N=n_rec + 1)
        similar_users = [rec[0] for rec in similar_users]
        similar_users = similar_users[1:]  # drop the query user

        for similar_user in similar_users:
            # similar_users holds internal matrix ids; map them back to real user_ids
            # before asking the own recommender, and keep one item per similar user
            userid = self.id_to_userid[similar_user]
            user_rec = self._get_recommendations(userid, model=self.model_own_recommender,
                                                 n_rec=n_rec)
            res.extend(user_rec[:1])

        res = self._extend_with_top_popular(res)

        assert len(res) == n_rec, 'The number of recommendations != {}'.format(n_rec)
        return res

    def predict(self, X):
        X = self.transform(X)
        recommender = getattr(self, f'get_{self.recommendations}_recommendations')

        rec = X.swifter.progress_bar(False).apply(lambda item: recommender(user=item))

        if self.postfilter_func is not None and self.filter_post:
            rec = self.postfilter_func(
                rec,
                item_info=self.item_info,
                user_history=self.user_history,
                n_rec=self.n_rec,
                n_new=self.n_new,
                n_exp=self.n_exp,
                price_lte=self.price_lte,
            )

        assert (rec.swifter.progress_bar(False).apply(len) == self.n_rec).all(), \
            f'The number of recommendations is not equal {self.n_rec}.'

        return rec
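# Hedged usage sketch for AlsEstimator. The column names (user_id, item_id,
# quantity, price, SUB_COMMODITY_DESC) come from fit() above; the CSV path is
# an assumption, and predict() relies on the swifter package being installed
# (importing swifter registers the .swifter accessor on pandas objects).
if __name__ == "__main__":
    import pandas as pd
    import swifter  # noqa: F401  (needed for the .swifter accessor used in predict)

    transactions = pd.read_csv("transactions.csv")  # assumed input file

    estimator = AlsEstimator(recommendations="als",  # which get_*_recommendations to use
                             n_rec=5, factors=50,
                             filter_post=False,      # skip the postfilter step
                             use_gpu=False)
    estimator.fit(transactions)

    recs = estimator.predict(transactions)           # pd.Series: user_id -> list of item_ids
    print(recs.head())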
class Recommender:

    def __init__(self, **args):
        self.TRAINING_THREADS = int(args.get("training_threads", os.cpu_count()))
        self.ALS_FACTORS = args.get("als_factors", 128)
        self.ALS_REGULARIZATION = args.get("als_regularization", 1e-2)
        self.ALS_ITERATIONS = args.get("als_iterations", 15)
        self.MIN_POST_FAVS = args.get("min_post_favs", 5)
        self.MIN_USER_FAVS = args.get("min_user_favs", 50)
        self.MAX_FAVS = args.get("max_favs", 1e12)
        self.FAVS_PATH = args.get("favs_path", "data/favs.csv")
        self.MODEL_PATH = args.get("model_path", "data/recommender.pickle")
        self.DATABASE_URL = args.get("database_url", "postgresql://localhost/danbooru2")

    @staticmethod
    def create(**args):
        env = {name.lower(): value for name, value in os.environ.items()}
        args = {**env, **args}

        recommender = Recommender(**args)
        recommender.dump_favorites()
        recommender.load_favorites()
        recommender.train()
        recommender.save(recommender.MODEL_PATH)
        return recommender

    @staticmethod
    def load(model_path):
        with open(model_path, "rb") as file:
            return pickle.load(file)

    def dump_favorites(self):
        query = f"""
            SELECT post_id, user_id
            FROM favorites
            WHERE post_id IN (SELECT id FROM posts WHERE fav_count > {self.MIN_POST_FAVS})
              AND user_id IN (SELECT id FROM users WHERE favorite_count > {self.MIN_USER_FAVS})
            ORDER BY post_id DESC
            LIMIT {self.MAX_FAVS}
        """
        self.shell(
            f"psql --no-psqlrc -c '\copy ({query}) TO STDOUT WITH (FORMAT CSV)' {self.DATABASE_URL} > {self.FAVS_PATH}"
        )

    def load_favorites(self):
        favs_df = pd.read_csv(self.FAVS_PATH, dtype=np.int32, names=["post_id", "user_id"])
        favs_df = favs_df.astype("category")

        self.favorites = csr_matrix(
            (np.ones(favs_df.shape[0]),
             (favs_df["post_id"].cat.codes.copy(),
              favs_df["user_id"].cat.codes.copy())),
            dtype=np.int32)

        self.users_to_id = {k: v for v, k in enumerate(favs_df["user_id"].cat.categories)}
        self.posts_to_id = {k: v for v, k in enumerate(favs_df["post_id"].cat.categories)}
        self.ids_to_post = {k: v for v, k in self.posts_to_id.items()}
        self.empty = csr_matrix(self.favorites.shape)

    def train(self):
        self.model = AlternatingLeastSquares(
            calculate_training_loss=True,
            dtype=np.float32,
            num_threads=self.TRAINING_THREADS,
            factors=self.ALS_FACTORS,
            regularization=self.ALS_REGULARIZATION,
            iterations=self.ALS_ITERATIONS)

        start = time.monotonic()
        self.model.fit(self.favorites)
        end = time.monotonic()
        dur = int(end - start)

        self.favorites = None
        self.trained_at = datetime.utcnow().isoformat()
        self.training_time = "{:02d}:{:02d}:{:02d}".format(dur // 3600, (dur % 3600 // 60), dur % 60)

    def recommend_for_user(self, user_id, limit=50):
        if user_id not in self.users_to_id:
            return []

        uid = self.users_to_id[user_id]
        recommendations = self.model.recommend(uid, self.empty, N=limit)
        recommendations = [(self.ids_to_post[id], float(score)) for id, score in recommendations]
        return recommendations

    def recommend_for_post(self, post_id, limit=50):
        if post_id not in self.posts_to_id:
            return []

        pid = self.posts_to_id[post_id]
        recommendations = self.model.similar_items(pid, N=limit)
        recommendations = [(self.ids_to_post[id], float(score)) for id, score in recommendations]
        return recommendations

    def metrics(self):
        return {
            "user_count": len(self.users_to_id),
            "post_count": len(self.posts_to_id),
            "factors": self.model.factors,
            "model_size": 4 * self.model.factors * (len(self.users_to_id) + len(self.posts_to_id)),
            "trained_at": self.trained_at,
            "training_time": self.training_time,
        }

    def save(self, model_path):
        with open(model_path, "wb") as file:
            pickle.dump(self, file)

    def shell(self, cmd):
        subprocess.run(cmd, stdout=sys.stdout, stderr=sys.stderr, shell=True, check=True)
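# Hedged usage sketch for the Recommender above. create() shells out to psql,
# so it needs the psql client on PATH, a reachable Postgres database with the
# favorites / posts / users tables, and write access to data/. The connection
# URL and the user / post ids below are placeholders.
if __name__ == "__main__":
    recommender = Recommender.create(
        database_url="postgresql://localhost/danbooru2",
        als_factors=128,
        min_user_favs=50)

    # Later, reload the pickled model and serve recommendations from it
    recommender = Recommender.load("data/recommender.pickle")
    print(recommender.recommend_for_user(12345, limit=10))  # [(post_id, score), ...]
    print(recommender.recommend_for_post(67890, limit=10))
    print(recommender.metrics())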