def test_explain(self):
    counts = csr_matrix(
        [
            [1, 1, 0, 1, 0, 0],
            [0, 1, 1, 1, 0, 0],
            [1, 4, 1, 0, 7, 0],
            [1, 1, 0, 0, 0, 0],
            [9, 0, 4, 1, 0, 1],
            [0, 1, 0, 0, 0, 1],
            [0, 0, 2, 0, 1, 1],
        ],
        dtype=np.float64,
    )
    user_items = counts * 2
    item_users = user_items.T

    model = AlternatingLeastSquares(
        factors=4,
        regularization=20,
        use_native=False,
        use_cg=False,
        use_gpu=False,
        iterations=100,
        random_state=23,
    )
    model.fit(user_items, show_progress=False)

    userid = 0

    # Assert recommendations are the same if we recompute user vectors
    recs = model.recommend(userid, item_users, N=10)
    recalculated_recs = model.recommend(userid, item_users, N=10, recalculate_user=True)
    for (item1, score1), (item2, score2) in zip(recs, recalculated_recs):
        self.assertEqual(item1, item2)
        self.assertAlmostEqual(score1, score2, 4)

    # Assert the explanation makes sense
    top_rec, score = recalculated_recs[0]
    score_explained, contributions, W = model.explain(userid, item_users, itemid=top_rec)
    scores = [s for _, s in contributions]
    items = [i for i, _ in contributions]
    self.assertAlmostEqual(score, score_explained, 4)
    self.assertAlmostEqual(score, sum(scores), 4)
    self.assertEqual(scores, sorted(scores, reverse=True), "Scores not in order")
    self.assertEqual([0, 2, 3, 4], sorted(items), "Items not seen by user")

    # Assert the explanation with precomputed user weights is correct
    top_score_explained, top_contributions, W = model.explain(
        userid, item_users, itemid=top_rec, user_weights=W, N=2
    )
    top_scores = [s for _, s in top_contributions]
    top_items = [i for i, _ in top_contributions]
    self.assertEqual(2, len(top_contributions))
    self.assertAlmostEqual(score, top_score_explained, 4)
    self.assertEqual(scores[:2], top_scores)
    self.assertEqual(items[:2], top_items)
class ALS(Model):

    def __init__(self):
        """Model initialization"""
        self.model = AlternatingLeastSquares()
        self.trainset = None

    def fit(self, X, y):
        # Build a COO matrix from the (user, item) pairs in X and the ratings in y
        data = coo_matrix((y, (X[:, 0], X[:, 1])))
        self.trainset = data
        # transpose() returns a new matrix; rows: [n_items], columns: [n_users]
        data = data.transpose()
        self.model.fit(data)

    def recommend(self, user_id, N=1):
        # array of (item_id, rating) tuples
        n_recommendations = self.model.recommend(user_id, self.trainset.tocsr(), N=N)
        # convert the array of tuples into an array of item_ids
        result = np.zeros(N, dtype=int)
        for pos, recommendation_tuple in enumerate(n_recommendations):
            result[pos] = recommendation_tuple[0]
        return result

    def get_params(self, deep=True):
        return dict()
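# --- Hedged usage sketch (not from the original corpus) ---
# Exercises the scikit-learn-style ALS wrapper above on hypothetical toy
# triples; it targets the same pre-0.4.x implicit API the wrapper calls,
# where recommend() takes the full user-items matrix and returns
# (item_id, score) tuples.
import numpy as np

X = np.array([[0, 0], [0, 1], [1, 1], [2, 0], [2, 2]])  # (user_id, item_id)
y = np.array([3.0, 1.0, 2.0, 5.0, 4.0])                 # implicit feedback counts

als = ALS()
als.fit(X, y)                         # builds the COO matrix and trains implicit ALS
print(als.recommend(user_id=0, N=1))  # -> array with the top unseen item id for user 0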
class ALSRecommender(BaseRecommender):
    """
    Alternating least squares recommender,
    based on the implicit library
    """

    def fit(self, train_df, col_user=cfg.USER_COL, col_item=cfg.ITEM_COL,
            col_rating=cfg.DEFAULT_RATING_COL, factors=100, confidence=5,
            regularization=0.1):
        """
        Trains the implicit ALS recommender on train data
        :param train_df: pandas DataFrame with train data
        :param col_user: str column name for user
        :param col_item: str column name for item
        :param col_rating: str column name for ratings
        :param factors: int number of factors to use in the ALS model
        :param confidence: int as described in the implicit documentation
        :param regularization: float, higher values mean stronger regularization
        :return: None
        """
        BaseRecommender.fit(self, train_df, col_user, col_item, col_rating)
        self.train_df[self.col_rating] = train_df[self.col_rating] * confidence
        self.uii_matrix = self.get_uii_matrix()
        self.als = AlternatingLeastSquares(factors=factors, use_gpu=False,
                                           regularization=regularization)
        self.als.fit(self.uii_matrix.T)

    def predict(self, test_df, k=cfg.DEFAULT_K):
        """
        Recommends k items for each user in test_df
        :param test_df: pandas DataFrame with test users and ground-truth recommendations
        :param k: int number of items to recommend
        :return: pandas DataFrame with k recommendations for each user in test_df
        """
        test_users_indices = [
            self.users.index(user) for user in test_df[self.col_user].values
            if user in self.users
        ]
        prediction_records = []
        for item in test_users_indices:
            doc = {
                self.col_user: self.users[item],
                self.col_item: [
                    self.items[it[0]]
                    for it in self.als.recommend(
                        item, self.uii_matrix, k,
                        filter_already_liked_items=False)
                ]
            }
            prediction_records.append(doc)
        prediction = pd.DataFrame.from_records(prediction_records)
        return prediction
def test_explain(self):
    counts = csr_matrix([[1, 1, 0, 1, 0, 0],
                         [0, 1, 1, 1, 0, 0],
                         [1, 4, 1, 0, 7, 0],
                         [1, 1, 0, 0, 0, 0],
                         [9, 0, 4, 1, 0, 1],
                         [0, 1, 0, 0, 0, 1],
                         [0, 0, 2, 0, 1, 1]], dtype=np.float64)
    user_items = counts * 2
    item_users = user_items.T

    model = AlternatingLeastSquares(factors=4, regularization=20,
                                    use_native=False, use_cg=False,
                                    iterations=100)
    np.random.seed(23)
    model.fit(user_items, show_progress=False)

    userid = 0

    # Assert recommendations are the same if we recompute user vectors
    recs = model.recommend(userid, item_users, N=10)
    recalculated_recs = model.recommend(userid, item_users, N=10,
                                        recalculate_user=True)
    for (item1, score1), (item2, score2) in zip(recs, recalculated_recs):
        self.assertEqual(item1, item2)
        self.assertAlmostEqual(score1, score2, 4)

    # Assert the explanation makes sense
    top_rec, score = recalculated_recs[0]
    score_explained, contributions, W = model.explain(userid, item_users,
                                                      itemid=top_rec)
    scores = [s for _, s in contributions]
    items = [i for i, _ in contributions]
    self.assertAlmostEqual(score, score_explained, 4)
    self.assertAlmostEqual(score, sum(scores), 4)
    self.assertEqual(scores, sorted(scores, reverse=True), "Scores not in order")
    self.assertEqual([0, 2, 3, 4], sorted(items), "Items not seen by user")

    # Assert the explanation with precomputed user weights is correct
    top_score_explained, top_contributions, W = model.explain(
        userid, item_users, itemid=top_rec, user_weights=W, N=2)
    top_scores = [s for _, s in top_contributions]
    top_items = [i for i, _ in top_contributions]
    self.assertEqual(2, len(top_contributions))
    self.assertAlmostEqual(score, top_score_explained, 4)
    self.assertEqual(scores[:2], top_scores)
    self.assertEqual(items[:2], top_items)
def _add_als_recs(self, n_factors=20, regularization=0.001, iterations=20,
                  num_threads=0):
    als_model = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads)
    als_model.fit(csr_matrix(self.user_item_matrix).T.tocsr())
    self.als_model = als_model

    als_recs = lambda i: [
        self.id_to_itemid[rec[0]] for rec in als_model.recommend(
            userid=int(i),
            user_items=csr_matrix(self.user_item_matrix).tocsr(),
            N=self.first_model_rec_limit,
            filter_items=[self.itemid_to_id[999999]],
            recalculate_user=True,
            filter_already_liked_items=False)
    ]
    self.df_users['als_recommender'] = None
    self.df_users.loc[~self.df_users['id'].isnull(), 'als_recommender'] = \
        self.df_users.loc[~self.df_users['id'].isnull(), 'id'].map(als_recs)
    self.df_users['als_recommender'] = self.df_users['als_recommender'].map(
        lambda val: val if isinstance(val, list) else [])

    # adding embeddings to df_users and df_items as features
    als_user_factors = pd.DataFrame(
        self.als_model.user_factors,
        columns=[
            f'als_user_factor_{i}'
            for i in range(self.als_model.user_factors.shape[1])
        ])
    als_user_factors['id'] = als_user_factors.index
    self.df_users = pd.merge(left=self.df_users, right=als_user_factors,
                             on='id', how='left')

    als_item_factors = pd.DataFrame(
        self.als_model.item_factors,
        columns=[
            f'als_item_factor_{i}'
            for i in range(self.als_model.item_factors.shape[1])
        ])
    als_item_factors['id'] = als_item_factors.index
    self.df_items = pd.merge(left=self.df_items, right=als_item_factors,
                             on='id', how='left')
class WRMF(Recsys):

    def __init__(self, k, reg=1e-4, n_iters=15):
        super().__init__()
        self.k = k
        self.reg = reg
        self.n_iters = n_iters
        self.als = AlternatingLeastSquares(
            k, regularization=reg, iterations=n_iters
        )

    def _recommend(self, user, user_item, n, gt=None):
        return np.array([
            itemid for itemid, score in self.als.recommend(user, user_item, n)
        ])
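# --- Hedged usage sketch (assumption, not part of the corpus) ---
# Drives the minimal WRMF wrapper above with a toy CSR matrix. The Recsys
# base class is outside this excerpt, so the sketch fits the underlying
# implicit model directly (the pre-0.4.x API trains on items x users).
import numpy as np
from scipy.sparse import csr_matrix

user_item = csr_matrix(np.array([[1, 0, 2],
                                 [0, 3, 1]], dtype=np.float32))  # 2 users x 3 items

wrmf = WRMF(k=4, n_iters=5)
wrmf.als.fit(user_item.T.tocsr())
print(wrmf._recommend(0, user_item, n=2))  # -> np.array of up to 2 unseen item ids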
def implicit(args):
    row_dict, col_dict = {}, {}
    rows, cols, data = [], [], []
    for feedback in iter_implicit_feedbacks(
            os.path.join(args.in_dir, 'ua.base')):
        i = row_dict.setdefault(feedback.item_id, len(row_dict))
        j = col_dict.setdefault(feedback.user_id, len(col_dict))
        rows.append(i)
        cols.append(j)
        data.append(1)
    item_user_data = csr_matrix((data, (rows, cols)),
                                shape=(len(row_dict), len(col_dict)))

    model = AlternatingLeastSquares(factors=8)
    model.fit(item_user_data)

    # Evaluation
    user_items = item_user_data.T.tocsr()
    user_items_test = collections.defaultdict(set)
    for feedback in iter_implicit_feedbacks(
            os.path.join(args.in_dir, 'ua.test')):
        try:
            i = row_dict[feedback.item_id]
            j = col_dict[feedback.user_id]
        except KeyError:
            continue
        user_items_test[j].add(i)

    topk = 10
    precision = 0
    for user_index, item_indices in user_items_test.items():
        recommendations = model.recommend(user_index, user_items, topk, True)
        precision += sum(1 if item_index in item_indices else 0
                         for item_index, _ in recommendations) / topk
    precision = precision / len(user_items_test)
    print('precision:', precision)

    item_id = 1
    item_index = row_dict[item_id]
    index2id = {value: key for key, value in row_dict.items()}
    for _item_index, score in model.similar_items(item_index, 10):
        _item_id = index2id[_item_index]
        print(_item_id)
def mixed_(self, train_songs_A, train_tags_A, test_songs_A, test_tags_A,
           song_ntop=500, tag_ntop=50, iteration=20):
    print("MF for song / CF for tag...")
    res = []

    # song
    songs_A = spr.vstack([test_songs_A, train_songs_A])
    als_model = ALS(factors=256, regularization=0.08, use_gpu=True,
                    iterations=iteration)
    als_model.fit(songs_A.T * 100)

    # tag
    train_tags_A_T = train_tags_A.T.tocsr()  # shape: n_tags x n_train playlists
    tag_val = test_tags_A.dot(train_tags_A_T)
    cand_tag_matrix = tag_val.dot(train_tags_A)
    del tag_val

    for r, pid in tqdm(enumerate(range(test_songs_A.shape[0]), 0)):
        # song
        if self.plylst_test.loc[(self.n_train + pid), "song_dirty"] == 1:
            cand_song = als_model.recommend(
                pid, test_songs_A, N=song_ntop,
                filter_already_liked_items=False)
        else:
            cand_song = als_model.recommend(
                pid, test_songs_A, N=song_ntop,
                filter_already_liked_items=True)
        rec_song_idx = [self.song_sid_id.get(x[0]) for x in cand_song]
        rec_song_score = [x[1] for x in cand_song]

        # tag
        tag_row = cand_tag_matrix.getrow(r).toarray().reshape(-1, )
        cand_tag_idx = tag_row.argsort()[-tag_ntop - 5:][::-1]

        tags_already = self.plylst_test.loc[self.n_train + pid, "tags_id"]
        rec_tag_idx = remove_seen(tags_already, cand_tag_idx)[:tag_ntop]
        rec_tag_score = [tag_row.data[i] for i in cand_tag_idx]

        res.append({
            "id": self.plylst_nid_id[self.n_train + pid],
            "songs": rec_song_idx,
            "tags": [self.tag_tid_id[i] for i in rec_tag_idx],
            "songs_score": rec_song_score,
            "tags_score": rec_tag_score
        })

    return res
class ALSpkNN():
    '''
    k = # of neighbours for KNN
    knn_frac = % of KNN recommendations
    max_overlap = maximum % overlap between a user and their MUSIC neighbours
    min_songs = only use users with > min_songs in our KNN code
    mode = one of ['popular', 'weighted_random', 'random']
    '''

    def __init__(self,
                 user_df,
                 song_df,
                 k=100,
                 knn_frac=0.5,
                 max_overlap=0.2,
                 cf_weighting_alpha=1,
                 min_songs=5,
                 mode='popular'):
        self.user_df = user_df
        self.song_df = song_df
        self.cf_weighting_alpha = cf_weighting_alpha
        self.knn_frac = knn_frac
        self.k = k
        self.max_overlap = max_overlap
        self.min_songs = min_songs
        self.mode = mode

        user_df_subset = user_df.loc[user_df['num_songs'] > (min_songs - 1)]
        self.kdtree = KDTree(user_df_subset['MUSIC'].tolist())

        # build the collaborative filtering model with hardcoded params
        als_params = {
            'factors': 16,
            'dtype': np.float32,
            'iterations': 2,
            'calculate_training_loss': True
        }
        self.cf_model = AlternatingLeastSquares(**als_params)

    def fit(self, train_csr):
        # don't modify the original in case it gets put into other models
        weighted_train_csr = weight_cf_matrix(train_csr,
                                              self.cf_weighting_alpha)
        self.cf_model.fit(weighted_train_csr)

    def calculate_overlap(self, list_1, list_2):
        overlap = len(set(list_1) & set(list_2))
        total = len(set(list_1)) + len(set(list_2))
        return float(overlap) / total

    def get_overlap_list(self, user_sparse_index,
                         closest_user_song_sparse_indices):
        overlap_list = []
        songs = self.user_df.loc[user_sparse_index]['song_sparse_indices']
        for i in range(len(closest_user_song_sparse_indices)):
            overlap_list.append(
                self.calculate_overlap(songs,
                                       closest_user_song_sparse_indices[i]))
        return overlap_list

    # Returns a list of song_sparse_indices
    def get_knn_top_m_song_sparse_indices(self, user_sparse_index, m,
                                          songs_from_cf):
        user_MUSIC = self.user_df.loc[user_sparse_index]['MUSIC']
        distances, indices = self.kdtree.query(user_MUSIC, self.k, p=1)
        closest_user_song_sparse_indices = self.user_df.loc[indices][
            'song_sparse_indices'].values

        # calculate overlap for all song lists and delete those with too much overlap
        insufficient_overlap_indices = []
        overlap_list = self.get_overlap_list(
            user_sparse_index, closest_user_song_sparse_indices)
        for i in range(len(closest_user_song_sparse_indices)):
            if overlap_list[i] > self.max_overlap:
                insufficient_overlap_indices.append(i)

        # Users with only one or two songs in their listening history will
        # almost always exceed the overlap condition. This if statement checks
        # whether we are clearing too many users; 5 is an arbitrary threshold.
        if len(insufficient_overlap_indices) + 5 < len(
                closest_user_song_sparse_indices):
            closest_user_song_sparse_indices = np.delete(
                closest_user_song_sparse_indices, insufficient_overlap_indices)
        else:
            # Backup in case the closest neighbours are all too similar to the
            # user: choose random MUSIC users, since similarity of MUSIC
            # scores has become meaningless.
            random_sparse_user_indices = random.sample(
                list(self.user_df.index), m)
            closest_user_song_sparse_indices = self.user_df.loc[
                random_sparse_user_indices]['song_sparse_indices'].values
            print("Choosing random users since not enough users have small enough overlap")

        user_songs = self.user_df.loc[user_sparse_index]['song_sparse_indices']

        # closest_user_song_sparse_indices_flat -> list of song_ids
        closest_user_song_sparse_indices_flat = itertools.chain.from_iterable(
            closest_user_song_sparse_indices)

        filtered_songs = []
        for song in closest_user_song_sparse_indices_flat:
            if song not in (user_songs + songs_from_cf):
                filtered_songs.append(song)

        # song_count_tuples -> format [(song_sparse_index, count)]
        song_count_tuples = Counter(filtered_songs).most_common()
        if len(song_count_tuples) < m:
            print('len(song_count_tuples) < m')

        top_songs = [song_tuple[0] for song_tuple in song_count_tuples]
        if self.mode == 'popular':
            m_songs = top_songs[:m]
        elif self.mode in ['weighted_random', 'random']:
            top_song_probs = None
            if self.mode == 'weighted_random':
                top_song_counts = [
                    song_tuple[1] for song_tuple in song_count_tuples
                ]
                top_song_probs = top_song_counts / np.sum(top_song_counts)

            m_song_count_tuples_indices = np.random.choice(
                len(song_count_tuples), p=top_song_probs, size=m,
                replace=False)
            m_song_count_tuples = [
                song_count_tuples[idx] for idx in m_song_count_tuples_indices
            ]
            # Although randomly sampled, the songs should still be sorted by
            # popularity to maximize MAP@K
            m_song_count_tuples.sort(key=lambda song_tuple: song_tuple[1],
                                     reverse=True)
            m_songs = [song_tuple[0] for song_tuple in m_song_count_tuples]

        return m_songs

    # Returns [song_sparse_index]
    def recommend(self, user_sparse_index, train_plays_transpose, N):
        # m -> number of songs from KNN recs
        m = int(np.round(self.knn_frac * N))
        # n -> number of songs from CF recs
        n = N - m

        n_songs = []
        if n > 0:
            n_song_tuples = self.cf_model.recommend(
                userid=user_sparse_index,
                user_items=train_plays_transpose,
                N=n)
            n_songs = [song_tuple[0] for song_tuple in n_song_tuples]

        m_songs = []
        if m > 0:
            m_songs = self.get_knn_top_m_song_sparse_indices(
                user_sparse_index=user_sparse_index,
                m=m,
                songs_from_cf=n_songs)

        return n_songs + m_songs
def mf_(self, train_songs_A, train_tags_A, test_songs_A, test_tags_A,
        song_ntop=500, tag_ntop=50, iteration=20):
    print(f'MF... iters:{iteration}')
    # Best hyperparameters as of 07/11: confidence * 100, song factors 256,
    # tag factors 32, reg = 0.1, 20 epochs -> song 56.4%, tag 61.3%
    val_song_res = []
    val_tag_res = []
    test_song_res = []
    test_tag_res = []

    songs_A = spr.vstack([test_songs_A, train_songs_A])
    tags_A = spr.vstack([test_tags_A, train_tags_A])

    als_model = ALS(factors=256, regularization=0.08, use_gpu=True,
                    iterations=iteration)  # epochs
    als_model.fit(songs_A.T * 100)

    als_model_tag = ALS(factors=32, regularization=0.08, use_gpu=True,
                        iterations=iteration)
    als_model_tag.fit(tags_A.T * 100)

    for id in tqdm(range(self.n_test_song)):  # 18636 song rows / 11605 tag rows
        # song
        cand_song = als_model.recommend(id, test_songs_A, N=song_ntop,
                                        filter_already_liked_items=True)
        rec_song_idx = [self.song_sid_id.get(x[0]) for x in cand_song]
        rec_song_score = [x[1] for x in cand_song]

        if (id < self.n_val_song):  # order: train, val, test
            val_song_res.append({
                "id": self.plylst_nid_id[self.plylst_test_song.index[id]],
                "songs": rec_song_idx,
                "songs_score": rec_song_score
            })
        else:
            test_song_res.append({
                "id": self.plylst_nid_id[self.plylst_test_song.index[id]],
                "songs": rec_song_idx,
                "songs_score": rec_song_score
            })

        # tag
        try:
            cand_tag = als_model_tag.recommend(
                id, test_tags_A, N=tag_ntop,
                filter_already_liked_items=True)
            rec_tag_idx = [self.tag_tid_id.get(x[0]) for x in cand_tag]
            rec_tag_score = [x[1] for x in cand_tag]

            if (id < self.n_val_song):
                val_tag_res.append({
                    "id": self.plylst_nid_id[self.plylst_test_tag.index[id]],
                    "tags": rec_tag_idx,
                    "tags_score": rec_tag_score
                })
            else:
                test_tag_res.append({
                    "id": self.plylst_nid_id[self.plylst_test_tag.index[id]],
                    "tags": rec_tag_idx,
                    "tags_score": rec_tag_score
                })
        except IndexError:
            pass

    print("DONE")
    return val_song_res, val_tag_res, test_song_res, test_tag_res
def calculate_recommendations(train_filename, test_filename, output_filename,
                              dir, model_name="als", factors=80,
                              regularization=0.8, iterations=10, exact=False,
                              use_native=True, dtype=numpy.float64, cg=False):
    logging.debug("Calculating similar items. This might take a while")

    # read in the input data file
    logging.debug("reading data from %s", dir + train_filename)
    start = time.time()
    df, cnts = read_data(dir + train_filename)
    logging.debug("read data file in %s", time.time() - start)

    # generate a recommender model based on the input params
    if model_name == "als":
        if exact:
            model = AlternatingLeastSquares(factors=factors,
                                            regularization=regularization,
                                            use_native=use_native,
                                            use_cg=cg,
                                            iterations=iterations,
                                            dtype=dtype)
        else:
            model = AnnoyAlternatingLeastSquares(factors=factors,
                                                 regularization=regularization,
                                                 use_native=use_native,
                                                 use_cg=cg,
                                                 iterations=iterations,
                                                 dtype=dtype)
        # let's weight these models by bm25_weight
        logging.debug("weighting matrix by bm25_weight")
        cnts = bm25_weight(cnts, K1=100, B=0.8)
    elif model_name == "tfidf":
        model = TFIDFRecommender()
    elif model_name == "cosine":
        model = CosineRecommender()
    elif model_name == "bm25":
        model = BM25Recommender(K1=100, B=0.5)
    else:
        raise NotImplementedError("TODO: model %s" % model_name)

    # train the model
    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(cnts)
    logging.debug("trained model '%s' in %s", model_name, time.time() - start)

    test_data = pandas.read_csv(test_filename, sep="\t", usecols=[0, 1, 2],
                                names=['user', 'item', 'cnt'])
    test_data = test_data.groupby(["user", "item"], as_index=False).sum()
    users_test = set(test_data['user'])
    users_train = set(df['user'])

    # position is important for the recommendation list and the actual list
    dict_actual = {}
    for user in users_test:
        if user not in users_train:
            continue
        matched_df = test_data.loc[test_data["user"] == user]
        matched_df = matched_df.sort_values("cnt", ascending=False)
        dict_actual[user] = list(matched_df["item"])

    user_items = cnts.T.tocsr()

    # recommend items for each user
    dict_recommended = {}  # for computing MAP and MP
    for user in users_test:
        if user not in users_train:
            continue
        recommendations = model.recommend(user, user_items)
        df = pandas.DataFrame(recommendations, columns=["item", "score"])
        dict_recommended[user] = list(df["item"])

    ndcg = NDCG(dict_actual, dict_recommended)
    err = ERR(dict_actual, dict_recommended)
    map = MAP(dict_actual, dict_recommended)
    mp = MP(dict_actual, dict_recommended)

    with open("%siALS_result_%s.txt" % (dir, train_filename), "w") as o:
        o.write("NDCG\tERR\tMAP\tMP\n")
        o.write("%s\t%s\t%s\t%s\n" % (ndcg, err, map, mp))

    return (ndcg, err, map, mp)
class Recommender:

    def __init__(self, factors=50):
        self.model = AlternatingLeastSquares(factors=factors,
                                             regularization=0.01,
                                             dtype=np.float64,
                                             iterations=50)

    def train(self, data):
        userids = data.userid.astype("category")
        itemids = data.itemid.astype("category")
        matrix = coo_matrix((data.confidence.astype('float64'),
                             (itemids.cat.codes.copy(),
                              userids.cat.codes.copy())))
        self.model.fit(matrix)
        self.t_matrix = matrix.T.tocsr()
        self.userid_to_code = {category: code for code, category
                               in enumerate(userids.cat.categories)}
        self.itemid_to_code = {category: code for code, category
                               in enumerate(itemids.cat.categories)}
        self.usercode_to_id = {code: category for code, category
                               in enumerate(userids.cat.categories)}
        self.itemcode_to_id = {code: category for code, category
                               in enumerate(itemids.cat.categories)}

    def similar_items(self, itemid, N=10):
        item_code = self.itemid_to_code[itemid]
        similar_codes = self.model.similar_items(item_code, N)
        similar_ids = [(self.itemcode_to_id[code], s)
                       for code, s in similar_codes]
        return pd.DataFrame(similar_ids, columns=["itemid", "similarity"])

    def recommendations(self, userid, N=10):
        user_code = self.userid_to_code[userid]
        user_item_codes = self.model.recommend(user_code, self.t_matrix, N)
        user_item_ids = [(self.itemcode_to_id[code], c)
                         for code, c in user_item_codes]
        return pd.DataFrame(user_item_ids, columns=["itemid", "confidence"])

    def explain(self, userid, itemid):
        user_code = self.userid_to_code[userid]
        item_code = self.itemid_to_code[itemid]
        return self.model.explain(user_code, self.t_matrix, item_code)

    def confidence(self, userid, itemid):
        item_code = self.itemid_to_code[itemid]
        user_code = self.userid_to_code[userid]
        item_factor = self.model.item_factors[item_code]
        user_factor = self.model.user_factors[user_code]
        return item_factor.dot(user_factor)

    def user_factors(self):
        factors = pd.DataFrame(self.model.user_factors).add_prefix("f")
        ids = factors.index.map(lambda code: self.usercode_to_id[code])
        factors.insert(0, "userid", ids)
        return factors

    def item_factors(self):
        factors = pd.DataFrame(self.model.item_factors).add_prefix("f")
        ids = factors.index.map(lambda code: self.itemcode_to_id[code])
        factors.insert(0, "itemid", ids)
        return factors

    def items_recommendations(self, itemids, N=10):
        user_code = 0
        item_codes = [self.itemid_to_code[id] for id in itemids]
        data = [1 for _ in item_codes]
        rows = [0 for _ in item_codes]
        shape = (1, self.model.item_factors.shape[0])
        user_items = coo_matrix((data, (rows, item_codes)),
                                shape=shape).tocsr()
        user_item_codes = self.model.recommend(
            user_code, user_items, N, recalculate_user=True)
        user_item_ids = [(self.itemcode_to_id[code], c)
                         for code, c in user_item_codes]
        return pd.DataFrame(user_item_ids, columns=["itemid", "confidence"])
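# --- Hedged usage sketch (assumption, not part of the corpus) ---
# Feeds the pandas-backed Recommender above a hypothetical interactions
# frame; the column names userid/itemid/confidence follow what train() reads.
import pandas as pd

interactions = pd.DataFrame({
    "userid": ["u1", "u1", "u2", "u2", "u3"],
    "itemid": ["a", "b", "b", "c", "a"],
    "confidence": [5.0, 1.0, 3.0, 2.0, 4.0],
})

rec = Recommender(factors=8)
rec.train(interactions)
print(rec.recommendations("u1", N=2))  # DataFrame with itemid / confidence
print(rec.similar_items("b", N=2))     # DataFrame with itemid / similarity
print(rec.confidence("u1", "a"))       # raw dot product of the two factor rows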
class Wrmf:

    def __init__(self, params={"c": None}, nunique_feature=None):
        self.params = params.copy()
        self.c = params["c"]
        del params["c"]
        self.model = ALS(**params)
        self.song_model = ALS(**params)
        self.tag_model = ALS(**params)
        self.song_rec_csr = None
        self.tag_rec_csr = None
        self.nunique_feature = nunique_feature

    def fit(self, X):
        self.model.fit(self.c * X.T)
        self.song_model.user_factors = self.model.user_factors
        self.song_model.item_factors = \
            self.model.item_factors[:self.nunique_feature["songs"]]
        self.tag_model.user_factors = self.model.user_factors
        self.tag_model.item_factors = \
            self.model.item_factors[-self.nunique_feature["tags"]:]
        self.song_rec_csr = X[:, :self.nunique_feature["songs"]]
        self.tag_rec_csr = X[:, -self.nunique_feature["tags"]:]
        # better kept in a separate save_model
        return self

    def predict(self, idx, num_songs, num_tags):
        song_rec_df = pd.DataFrame()
        tag_rec_df = pd.DataFrame()
        for u in idx:
            song_rec = self.song_model.recommend(u, self.song_rec_csr,
                                                 N=num_songs)
            song_ids = [id_ for id_, _ in song_rec]
            song_scores = [score for _, score in song_rec]
            song_plylst_ids = np.repeat(u, num_songs)
            song_recommended = pd.DataFrame({
                "plylst_id": song_plylst_ids,
                "song_id": song_ids,
                "score": song_scores
            })
            song_rec_df = pd.concat([song_rec_df, song_recommended])

            tag_rec = self.tag_model.recommend(u, self.tag_rec_csr,
                                               N=num_tags)
            tag_ids = [id_ for id_, _ in tag_rec]
            tag_scores = [score for _, score in tag_rec]
            tag_plylst_ids = np.repeat(u, num_tags)
            tag_recommended = pd.DataFrame({
                "plylst_id": tag_plylst_ids,
                "tag": tag_ids,
                "score": tag_scores
            })
            tag_rec_df = pd.concat([tag_rec_df, tag_recommended])
        return song_rec_df, tag_rec_df

    def save_model(self, save_file):
        with open(stage1_config.SAVE_FOLDER + save_file + ".pkl", "wb") as f:
            pkl.dump(self, f)
        with open(stage1_config.SAVE_FOLDER + save_file + "_config.txt",
                  "a") as f:
            f.write(str(self.params))

    def load_model(self, save_file):
        with open(stage1_config.SAVE_FOLDER + save_file + ".pkl", "rb") as f:
            self = pkl.load(f)
        return self
class HHimmlerEnsemble:

    def __init__(self, urm_train, urm_test, icm, parameters=None):
        if parameters is None:
            parameters = {
                "USER_CF": 0.8,
                "USER_BPR": 0.7,
                "ITEM_CF": 1,
                "ITEM_BPR": 0.8,
                "CBF": 0.3,
                "IALS": 1.0,
                "CBF_BPR": 1
            }
        self.ensemble_weights = parameters
        self.train = urm_train.tocsr()
        self.test = urm_test.tocsr()
        self.icm = icm.tocsr()
        self.initialize_components()

    def initialize_components(self):
        self.bpr_mf = BPR_matrix_factorization(factors=200,
                                               regularization=0.00000,
                                               learning_rate=0.01,
                                               iterations=65)
        self.ials_cg_mf = IALS_CG(iterations=15,
                                  calculate_training_loss=True,
                                  factors=500,
                                  use_cg=True,
                                  regularization=1e-3)

    def fit(self):
        self.bpr_mf.fit(self.train.T.tocoo())
        self.ials_cg_mf.fit(40 * self.train.T)
        self.bpr_mf_latent_x = self.bpr_mf.user_factors.copy()
        self.bpr_mf_latent_y = self.bpr_mf.item_factors.copy()
        self.ials_cg_mf_latent_x = self.ials_cg_mf.user_factors.copy()
        self.ials_cg_mf_latent_y = self.ials_cg_mf.item_factors.copy()

    def recommend(self, user_id, combiner, at=10):
        bpr_mf_r = np.dot(self.bpr_mf_latent_x[user_id],
                          self.bpr_mf_latent_y.T).ravel()
        ials_cg_mf_r = np.dot(self.ials_cg_mf_latent_x[user_id],
                              self.ials_cg_mf_latent_y.T).ravel()
        scores = [
            # [bpr_mf_r, self.ensemble_weights["BPR_MF"], "BPR_MF"],
            [ials_cg_mf_r, 1, "IALS_CG"]
        ]
        for r in scores:
            self.filter_seen(user_id, r[0])
        return combiner.combine(scores, at)

    def filter_seen(self, user_id, scores):
        start_pos = int(self.train.indptr[user_id])
        end_pos = int(self.train.indptr[user_id + 1])
        user_profile = self.train.indices[start_pos:end_pos]
        scores[user_profile] = -1000000  # -np.inf
        return scores

    def recommend_batch(self, user_list, combiner, at=10):
        res = np.array([])
        for i in user_list:
            bpr = self.bpr_mf.recommend(user_items=self.train, userid=i,
                                        N=at, recalculate_user=False)
            ials = self.ials_cg_mf.recommend(userid=i, user_items=self.train,
                                             N=10)
            rec_list = [x[0] for x in ials]
            rec_row = np.concatenate(([i], np.array(rec_list)))
            if res.size == 0:
                res = rec_row
            else:
                res = np.vstack([res, rec_row])
        return res

    def get_component_data(self):
        print('cyka')
print(plays[:5])
print(len(users))
print(len(artists))

rows = data.user_id.astype(int)
cols = data.artist_id.astype(int)
data_sparse = sparse.csr_matrix((plays, (cols, rows)),
                                shape=(len(artists), len(users)))

model = AlternatingLeastSquares(factors=50)
model.fit(data_sparse)

userid = 0
user_items = data_sparse.T.tocsr()
recommendations = model.recommend(userid, user_items)
print(recommendations)
for r in recommendations:
    print(artist_id_name[str(r[0])])

itemid = 107209
related = model.similar_items(itemid)
print(related)
for a in related:
    print(artist_id_name[str(a[0])])

artist_id_name['234786']
class MainRecommender:
    own_recomender_defult_param = {'filter_already_liked_items': False,
                                   'filter_items': False,
                                   'recalculate_user': True}
    model_als_defult_param = {'factors': 50, 'regularization': 15,
                              'iterations': 15, 'num_threads': -1,
                              'calculate_training_loss': False}

    def __init__(self, data, data_test=None, split_info=None):
        """
        data - DataFrame with the source data
        data_test - validation data; if absent and split_info is given, it is created
        split_info - tuple describing how to build data_test (size, split column);
                     only used when data_test is absent
        """
        self.top = 5000
        self.data_validation = {}
        self.data_validation['status'] = False
        self.user_item_matrix = {'status': False, 'matrix': None, 'params': None}
        self.own_recommender_is_fit = {'status': False, 'params': None}
        self.als_recommender_is_fit = {'status': False, 'params': None}
        self.data = data.copy()
        # Keep the full dataset in case we need to predict on all the data
        self.full_data_train = data.copy()
        self.data_train = data.copy()
        if data_test is not None:
            self.data_test = data_test.copy()
        else:
            self.data_test = None
            if split_info:
                self.data_train, self.data_test = self.train_test_split(
                    test_size_num=split_info[0], split_column=split_info[1])
        if self.data_test is not None:
            self.data_validation['data'] = self.get_validation_data()
            self.data_validation['status'] = True

    def prefiltr_1(self, my_data):
        """Keep only the `top` most popular items; relabel the rest as 999999"""
        df = my_data.copy()
        popularity = my_data.groupby('item_id')['quantity'].count().reset_index()
        popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
        top_5000 = popularity.sort_values('n_sold', ascending=False) \
                             .head(self.top).item_id.tolist()
        df.loc[~df['item_id'].isin(top_5000), 'item_id'] = 999999
        return df

    def prefiltr_2(self, data_train, n=5000):
        """Keep only the n most popular items; drop transactions with all other items"""
        df = data_train.copy()
        popularity = df.groupby('item_id')['quantity'].count().reset_index()
        popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
        top_n = popularity.sort_values('n_sold', ascending=False).head(n).item_id.tolist()
        df = df.loc[df['item_id'].isin(top_n)]
        return df

    def prefiltr_3(self, data_train, n=5000):
        """Drop transactions with the n least popular items"""
        df = data_train.copy()
        not_popularity = df.groupby('item_id')['quantity'].count().reset_index()
        not_popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
        not_top_n = not_popularity.sort_values('n_sold').head(n).item_id.tolist()
        df = df.loc[~df['item_id'].isin(not_top_n)]
        return df

    def prefiltr_4(self, data_train, weeks=50):
        """Drop transactions with items that have not been bought for more than `weeks` weeks"""
        df = data_train.copy()
        old_item = df.groupby('item_id')['week_no'].max().reset_index()
        old_item = old_item.loc[old_item['week_no'] > weeks, 'item_id'].tolist()
        df = df.loc[df['item_id'].isin(old_item)]
        return df

    def train_test_split(self, test_size_num, split_column):
        data_train = self.data[self.data[split_column] <
                               self.data[split_column].max() - test_size_num]
        data_test = self.data[self.data[split_column] >=
                              self.data[split_column].max() - test_size_num]
        return data_train, data_test

    def get_validation_data(self):
        result = self.data_test.groupby('user_id')['item_id'].unique().reset_index()
        users_train = self.data_train.user_id.unique()
        result = result[result.user_id.isin(users_train)]
        result['train'] = result['user_id'].map(
            self.data_train.groupby('user_id')['item_id'].unique())
        result['full_train'] = result['user_id'].map(
            self.full_data_train.groupby('user_id')['item_id'].unique())
        result.rename(columns={'item_id': 'test'}, inplace=True)
        result.reset_index(inplace=True, drop=True)
        return result

    def prepare_matrix(self, agg_column, full=None, filtr=None):
        my_data = self.data_train.copy()
        if full:
            my_data = self.full_data_train.copy()
        if filtr:
            for i in filtr:
                prefiltr = 'self.prefiltr_' + str(i) + '(my_data)'
                my_data = eval(prefiltr)
        user_item_matrix = pd.pivot_table(my_data,
                                          index='user_id',
                                          columns='item_id',
                                          values=agg_column[0],
                                          aggfunc=agg_column[1],
                                          fill_value=0)
        user_item_matrix = user_item_matrix.astype(float)
        self.prepare_dicts(user_item_matrix)
        self.current_working_data = my_data.copy()
        return user_item_matrix

    def prepare_dicts(self, user_item_matrix):
        """Builds the helper lookup dictionaries"""
        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values
        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))
        self.id_to_itemid = dict(zip(matrix_itemids, itemids))
        self.id_to_userid = dict(zip(matrix_userids, userids))
        self.itemid_to_id = dict(zip(itemids, matrix_itemids))
        self.userid_to_id = dict(zip(userids, matrix_userids))
        return self.id_to_itemid, self.id_to_userid, self.itemid_to_id, self.userid_to_id

    def make_data(self, agg_column, filtr=None, full=False, top=5000):
        self.top = top
        self.full = full
        uim = self.prepare_matrix(agg_column=agg_column, full=full, filtr=filtr)
        uim_w = uim.copy()
        self.user_item_matrix['uim_matrix_w'] = csr_matrix(uim_w).tocsr()
        uim[uim > 0] = 1
        self.user_item_matrix['uim_matrix'] = csr_matrix(uim).tocsr()
        self.user_item_matrix['ium_matrix_w_tfidf'] = tfidf_weight(csr_matrix(uim_w.T).tocsr())
        self.user_item_matrix['ium_matrix_tfidf'] = tfidf_weight(csr_matrix(uim.T).tocsr())
        self.user_item_matrix['ium_matrix_w_bm25'] = bm25_weight(csr_matrix(uim_w.T).tocsr())
        self.user_item_matrix['ium_matrix_bm25'] = bm25_weight(csr_matrix(uim.T).tocsr())
        self.user_item_matrix['status'] = True
        self.user_item_matrix['params'] = {'agg_column': agg_column,
                                           'filtr': filtr, 'full': full}
        return self.user_item_matrix

    def precision_at_k(x, k=5):
        if len(x['predict']) == 0:
            return 0
        bought_list = np.array(x['test'])
        recommended_list = np.array(x['predict'])[:k]
        flags = np.isin(bought_list, recommended_list)
        precision = flags.sum() / len(recommended_list)
        return precision

    def fit_own_recommender(self, weighting=False):
        """Fits a model that recommends among the items the user has already bought"""
        assert self.user_item_matrix['status'], \
            'call make_data(agg_column, filtr=None, weighting=None, full=False) first'
        ium = self.user_item_matrix['uim_matrix'].T
        if weighting:
            assert weighting in ('tf_idf', 'bm25'), \
                "weighting must be 'tf_idf', 'bm25' or None"
            if weighting == 'tf_idf':
                ium = self.user_item_matrix['ium_matrix_tfidf']
            else:
                ium = self.user_item_matrix['ium_matrix_bm25']
        self.own_recommender = ItemItemRecommender(K=1, num_threads=-1)
        self.own_recommender.fit(ium)
        self.own_recommender_is_fit['status'] = True
        self.own_recommender_is_fit['params'] = {
            'model': 'ItemItemRecommender(K=1, num_threads=-1)',
            'weighting': weighting}
        self.own_recommender_is_fit['ium'] = ium
        return self.own_recommender

    def predict_own_recommender(self, users, N=5, params=own_recomender_defult_param):
        param = params.copy()
        assert self.own_recommender_is_fit['status'], \
            'call fit_own_recommender() first'
        assert type(users) == list, 'users must be a list'
        uim = self.user_item_matrix['uim_matrix']
        param['user_items'] = uim
        param['N'] = N
        answer = pd.DataFrame()
        answer['user_id'] = users
        if param['filter_items']:
            param['filter_items'] = [self.itemid_to_id[i]
                                     for i in params['filter_items']]
        rec = []
        for user in users:
            param['userid'] = self.userid_to_id[user]
            rec.append([self.id_to_itemid[i[0]]
                        for i in self.own_recommender.recommend(**param)])
        answer['result'] = rec
        return answer

    def validation_own_recommender(self, metric=precision_at_k, N=5,
                                   params=own_recomender_defult_param):
        assert self.data_validation['status'], 'validation data has not been created'
        assert self.own_recommender_is_fit['status'], \
            'call fit_own_recommender() first'
        df = self.data_validation['data']
        users = df['user_id'].to_list()
        predict = self.predict_own_recommender(users=users, N=N, params=params)
        df['predict'] = predict['result']
        return df.apply(metric, axis=1).mean()

    def fit_als(self, params=model_als_defult_param, weighting=False):
        """Fits the ALS model"""
        assert self.user_item_matrix['status'], \
            'call make_data(agg_column, filtr=None, weighting=None, full=False) first'
        ium = self.user_item_matrix['uim_matrix_w'].T
        if weighting:
            assert weighting in ('tf_idf', 'bm25'), \
                "weighting must be 'tf_idf', 'bm25' or None"
            if weighting == 'tf_idf':
                ium = self.user_item_matrix['ium_matrix_w_tfidf']
            else:
                ium = self.user_item_matrix['ium_matrix_w_bm25']
        self.model_als = AlternatingLeastSquares(**params)
        self.model_als.fit(ium)
        self.als_recommender_is_fit['status'] = True
        self.als_recommender_is_fit['params'] = {'model': params,
                                                 'weighting': weighting}
        self.als_recommender_is_fit['ium'] = ium
        return self.model_als

    def predict_als(self, users, N=5, params=own_recomender_defult_param):
        param = params.copy()
        assert self.als_recommender_is_fit['status'], 'call fit_als() first'
        assert type(users) == list, 'users must be a list'
        uim = self.user_item_matrix['uim_matrix_w']
        param['user_items'] = uim
        param['N'] = N
        answer = pd.DataFrame()
        answer['user_id'] = users
        if param['filter_items']:
            param['filter_items'] = [self.itemid_to_id[i]
                                     for i in params['filter_items']]
        rec = []
        for user in users:
            param['userid'] = self.userid_to_id[user]
            rec.append([self.id_to_itemid[i[0]]
                        for i in self.model_als.recommend(**param)])
        answer['result'] = rec
        return answer

    def validation_als_recommender(self, metric=precision_at_k, N=5,
                                   params=own_recomender_defult_param):
        assert self.data_validation['status'], 'validation data has not been created'
        assert self.als_recommender_is_fit['status'], 'call fit_als() first'
        df = self.data_validation['data'].copy()
        users = df['user_id'].to_list()
        predict = self.predict_als(users=users, N=N, params=params)
        df['predict'] = predict['result']
        return df.apply(metric, axis=1).mean()

    def get_recs(self, user, popularity, not_my=0):
        result = []
        for item in popularity[popularity['user_id'] == user]['item_id'].to_list():
            recs_ = self.model_als.similar_items(self.itemid_to_id[item], N=3)
            recs = [self.id_to_itemid[i[0]] for i in recs_]
            if 999999 in recs:
                recs.remove(999999)
            result.append(recs[not_my])
        return result

    def get_similar_items_recommendation(self, users, not_my=0, N=5):
        """Recommend items similar to the user's top-N purchased items.
        not_my = 1 to predict purchases of the user's own items
        (like own_recommender), 0 for the opposite."""
        assert self.als_recommender_is_fit['status'], \
            'ALS model is not fitted; call fit_als() first'
        assert type(users) == list, 'users must be a list'
        assert not_my in [0, 1], 'not_my must be 0 or 1'
        my_data = self.current_working_data.copy()
        my_data = my_data[my_data['user_id'].isin(users)]
        popularity = my_data.groupby(['user_id', 'item_id'])['quantity'] \
                            .count().reset_index()
        popularity.sort_values('quantity', ascending=False, inplace=True)
        popularity = popularity[popularity['item_id'] != 999999]
        popularity = popularity.groupby('user_id').head(N)
        popularity.sort_values(['user_id', 'quantity'], ascending=False,
                               inplace=True)
        result = pd.DataFrame()
        result['user_id'] = users
        result['similar_recommendation'] = result['user_id'].apply(
            lambda x: self.get_recs(user=x, popularity=popularity, not_my=not_my))
        return result

    def validation_similar_items_recommendation(self, metric=precision_at_k,
                                                N=5, not_my=0):
        assert self.data_validation['status'], 'validation data has not been created'
        assert self.als_recommender_is_fit['status'], 'call fit_als() first'
        assert not_my in [0, 1], 'not_my must be 0 or 1'
        df = self.data_validation['data'].copy()
        users = df['user_id'].to_list()
        predict = self.get_similar_items_recommendation(users=users, N=N,
                                                        not_my=not_my)
        df['predict'] = predict['similar_recommendation']
        return df.apply(metric, axis=1).mean()

    def get_user(self, user):
        users = self.model_als.similar_users(self.userid_to_id[user], N=2)
        return self.id_to_userid[users[1][0]]

    def get_similar_users_recommendation(self, users, N=5,
                                         params=own_recomender_defult_param):
        """Recommend top-N items among those bought by similar users"""
        assert self.als_recommender_is_fit['status'], \
            'ALS model is not fitted; call fit_als() first'
        assert type(users) == list, 'users must be a list'
        result = pd.DataFrame()
        result['user_id'] = users
        result['simular_user_id'] = result['user_id'].apply(self.get_user)
        result['similar_recommendation'] = self.predict_als(
            result['simular_user_id'].to_list(), N=5, params=params)['result']
        return result

    def validation_similar_users_recommendation(self, metric=precision_at_k, N=5):
        assert self.data_validation['status'], 'validation data has not been created'
        assert self.als_recommender_is_fit['status'], 'call fit_als() first'
        df = self.data_validation['data'].copy()
        users = df['user_id'].to_list()
        predict = self.get_similar_users_recommendation(users=users, N=N)
        df['predict'] = predict['similar_recommendation']
        return df.apply(metric, axis=1).mean()
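# --- Hedged end-to-end session (assumption, not part of the corpus) ---
# `transactions` is a hypothetical retail DataFrame with the user_id,
# item_id, quantity and week_no columns MainRecommender expects; the user
# id 2375 is a placeholder.
mr = MainRecommender(transactions, split_info=(3, 'week_no'))  # hold out the last 3 weeks
mr.make_data(agg_column=('quantity', 'count'), filtr=[1])      # binary + weighted matrices
mr.fit_als()                                                   # default ALS params
print(mr.predict_als(users=[2375], N=5))                       # top-5 item_ids per user
print(mr.validation_als_recommender(N=5))                      # mean precision@5 on the hold-out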
class Implicit(object):

    def __init__(self):
        self.model = None
        self.mapped_trainset = None
        self.mapping_dict = None
        self.inv_mapping_dict = None
        self.max_index_of_item = None
        self.max_index_of_user = None
        self.item_users = None
        self.user_items = None
        self.k = 10
        self.param = None
        self.default_param = {
            'factors': 100,
            'regularization': 0.01,
            'iterations': 15,
            'use_native': True,
            'use_cg': True,
            'use_gpu': False,
            'calculate_training_loss': False,
            'num_threads': 0
        }
        self.mean_user_factors = None
        self.mean_item_factors = None
        self.baseline_recommend_items = None
        self.baseline_recommend_scores = None

    def fit_trainset(self, raw_train_dataset):
        trainset = copy.deepcopy(raw_train_dataset)
        # trainset = trainset.drop_duplicates(subset=['user', 'item'])
        self.mapping_dict, self.inv_mapping_dict = fit_coder(
            trainset, 'user', 'item', 'rating')
        self.mapped_trainset = code(copy.deepcopy(trainset), 'user', 'item',
                                    'rating', self.mapping_dict)
        self.max_index_of_item = len(self.mapped_trainset.item.unique())
        self.max_index_of_user = len(self.mapped_trainset.user.unique())

        row = self.mapped_trainset.item.values
        col = self.mapped_trainset.user.values
        data = self.mapped_trainset.rating.values

        self.item_users = csr_matrix(
            (data, (row, col)),
            shape=(self.max_index_of_item, self.max_index_of_user))
        self.user_items = self.item_users.T.tocsr()
        self.user_items = bm25_weight(self.user_items, B=0.7).tocsr() * 5
        self.item_users = self.user_items.T.tocsr()

        # # Experiment --------------
        # add_one = self.item_users.toarray() + 1
        # self.item_users = csr_matrix(add_one)
        # # -------------------------

        self.user_items = self.item_users.T.tocsr()

    def add_fit_trainset(self, new_raw_train_dataset):
        if self.mapped_trainset is None:
            self.fit_trainset(new_raw_train_dataset)
        else:
            new_trainset = copy.deepcopy(new_raw_train_dataset)
            new_train = code(copy.deepcopy(new_trainset), 'user', 'item',
                             'rating', self.mapping_dict)
            ind_item = new_train[new_train.item.isnull()].index
            ind_user = new_train[new_train.user.isnull()].index
            unknown_items = new_trainset.loc[ind_item, 'item'].unique()
            unknown_users = new_trainset.loc[ind_user, 'user'].unique()
            len_new_items = len(unknown_items)
            len_new_users = len(unknown_users)

            new_item_dic = {
                key: value
                for key, value in zip(
                    unknown_items,
                    range(self.max_index_of_item,
                          self.max_index_of_item + len_new_items))
            }
            new_user_dic = {
                key: value
                for key, value in zip(
                    unknown_users,
                    range(self.max_index_of_user,
                          self.max_index_of_user + len_new_users))
            }
            inv_new_item_dic = {
                value: key
                for key, value in zip(
                    unknown_items,
                    range(self.max_index_of_item,
                          self.max_index_of_item + len_new_items))
            }
            inv_new_user_dic = {
                value: key
                for key, value in zip(
                    unknown_users,
                    range(self.max_index_of_user,
                          self.max_index_of_user + len_new_users))
            }

            self.max_index_of_item += len_new_items
            self.max_index_of_user += len_new_users
            self.mapping_dict['item'].update(new_item_dic)
            self.mapping_dict['user'].update(new_user_dic)
            self.inv_mapping_dict['item'].update(inv_new_item_dic)
            self.inv_mapping_dict['user'].update(inv_new_user_dic)

            new_mapped_trainset = code(copy.deepcopy(new_raw_train_dataset),
                                       'user', 'item', 'rating',
                                       self.mapping_dict)
            # self.mapped_trainset = self.mapped_trainset.append(new_trainset, ignore_index=True)
            self.mapped_trainset = pd.concat(
                [self.mapped_trainset, new_mapped_trainset],
                ignore_index=True)
            self.mapped_trainset = self.mapped_trainset.drop_duplicates(
                subset=['user', 'item'])

            row = self.mapped_trainset.item.values
            col = self.mapped_trainset.user.values
            data = self.mapped_trainset.rating.values

            self.item_users = csr_matrix(
                (data, (row, col)),
                shape=(self.max_index_of_item, self.max_index_of_user))
            self.user_items = self.item_users.T.tocsr()

            if self.model:
                factors_for_new_unknown_users = \
                    [list(self.mean_user_factors)] * len_new_users
                if len(factors_for_new_unknown_users) > 0:
                    self.model.user_factors = np.concatenate(
                        [self.model.user_factors,
                         factors_for_new_unknown_users])
                factors_for_new_unknown_items = \
                    [list(self.mean_item_factors)] * len_new_items
                if len(factors_for_new_unknown_items) > 0:
                    self.model.item_factors = np.concatenate(
                        [self.model.item_factors,
                         factors_for_new_unknown_items])
                print('factors extended')

    def set_k(self, k):
        self.k = int(k)

    def fit_model(self, dic_param={}, fit_new_model=True):
        if self.item_users is None:
            print('First fit the trainset')
            return

        if fit_new_model == False:
            # check whether a previous model is available
            if not self.model:
                fit_new_model = True

        if fit_new_model:
            d = copy.deepcopy(self.default_param)
            d.update(dic_param)
            self.param = d
        else:
            d = copy.deepcopy(self.param)
            d.update(dic_param)
            if d['factors'] != self.param['factors']:
                print('different number of factors! Previous: ' +
                      str(self.param['factors']) + '; Now: ' +
                      str(d['factors']) + '; Fitting a new model')
                fit_new_model = True

        if fit_new_model:
            self.model = AlternatingLeastSquares(
                factors=d['factors'],
                regularization=d['regularization'],
                iterations=d['iterations'],
                use_native=d['use_native'],
                use_cg=d['use_cg'],
                use_gpu=d['use_gpu'],
                calculate_training_loss=d['calculate_training_loss'],
                num_threads=d['num_threads'])
        else:
            previous_user_factors = self.model.user_factors
            previous_item_factors = self.model.item_factors
            self.model = AlternatingLeastSquares(
                factors=d['factors'],
                regularization=d['regularization'],
                iterations=d['iterations'],
                use_native=d['use_native'],
                use_cg=d['use_cg'],
                use_gpu=d['use_gpu'],
                calculate_training_loss=d['calculate_training_loss'],
                num_threads=d['num_threads'])
            self.model.user_factors = previous_user_factors
            self.model.item_factors = previous_item_factors

        self.model.fit(self.item_users)

        self.mean_user_factors = self.model.user_factors.mean(axis=0)
        self.mean_item_factors = self.model.item_factors.mean(axis=0)

        # popularity-style baseline: score every item against the mean user
        scores = np.dot(self.model.item_factors, self.mean_user_factors)
        items = list(range(self.max_index_of_item))
        result = list(zip(scores, items))
        result.sort(reverse=True)
        recommend = np.array(result)
        self.baseline_recommend_items = [
            self.inv_mapping_dict['item'][int(item)] for _, item in recommend
        ]
        self.baseline_recommend_scores = [score for score, _ in recommend]

    def get_user_factors(self):
        real_user_factors = {}
        for i in range(self.max_index_of_user):
            real_user_factors[self.inv_mapping_dict['user'][i]] = list(
                self.model.user_factors[i])
        return real_user_factors

    def get_item_factors(self):
        real_item_factors = {}
        for i in range(self.max_index_of_item):
            real_item_factors[self.inv_mapping_dict['item'][i]] = list(
                self.model.item_factors[i])
        return real_item_factors

    def recommend_for_user(self, user_true_name,
                           filter_already_liked_items=True,
                           return_scores=False, recalculate_user=False):
        if self.mapping_dict is None:
            print('First call fit_trainset')
            return None
        if self.model is None:
            print('First call fit_model')
            return None
        if user_true_name in self.mapping_dict['user'].keys():
            user = self.mapping_dict['user'][user_true_name]
            rec = self.model.recommend(
                user, self.user_items, self.k,
                filter_already_liked_items=filter_already_liked_items,
                recalculate_user=recalculate_user)
            items = [self.inv_mapping_dict['item'][item] for item, _ in rec]
            scores = [score for _, score in rec]
            if return_scores:
                return items, scores
            else:
                return items
        else:
            # unknown user: fall back to the popularity baseline
            items = self.baseline_recommend_items[:self.k]
            if return_scores:
                scores = self.baseline_recommend_scores[:self.k]
                return items, scores
            else:
                return items

    def recommend(self, users_list, filter_already_liked_items=True,
                  return_scores=False, recalculate_user=False):
        if self.mapping_dict is None:
            print('First call fit_trainset')
            return None
        if self.model is None:
            print('First call fit_model')
            return None
        result_user_items = {}
        result_user_scores = {}
        for user_true_name in tqdm(users_list):
            if return_scores:
                items, scores = self.recommend_for_user(
                    user_true_name, filter_already_liked_items,
                    return_scores, recalculate_user)
                result_user_items[user_true_name] = items
                result_user_scores[user_true_name] = scores
            else:
                items = self.recommend_for_user(user_true_name,
                                                filter_already_liked_items,
                                                return_scores,
                                                recalculate_user)
                result_user_items[user_true_name] = items
        if return_scores:
            return result_user_items, result_user_scores
        else:
            return result_user_items

    def recommend_df(self, users_list, filter_already_liked_items=True,
                     return_scores=False,
                     column_names=['user', 'item', 'rating'],
                     recalculate_user=False):
        if self.mapping_dict is None:
            print('First call fit_trainset')
            return None
        if self.model is None:
            print('First call fit_model')
            return None
        result = []
        for user_true_name in tqdm(users_list):
            user_column = [user_true_name] * int(self.k)
            if return_scores:
                items, scores = self.recommend_for_user(
                    user_true_name, filter_already_liked_items,
                    return_scores, recalculate_user)
                res = list(zip(user_column, items, scores))
            else:
                items = self.recommend_for_user(user_true_name,
                                                filter_already_liked_items,
                                                return_scores,
                                                recalculate_user)
                res = list(zip(user_column, items))
            result.extend(res)
        if return_scores:
            return pd.DataFrame(result, columns=column_names[:3])
        else:
            return pd.DataFrame(result, columns=column_names[:2])

    # def rank_for_user(self, user):
    #     if self.max_index_of_user is None:
    #         print('Firstly fit_testset')
    #         return None
    #
    #     list_items = self.testset[self.testset.user == user].item
    #     items_to_rank = list_items[list_items < self.max_index_of_item].values
    #     items_to_end = list_items[list_items >= self.max_index_of_item].values
    #
    #     res = []
    #
    #     if user >= self.max_index_of_user:
    #         list_to_sort = []
    #         for item in items_to_rank:
    #             list_to_sort.append((round(self.item_value_counts[item] * 0.001, 3), item))
    #         for item in items_to_end:
    #             list_to_sort.append((0, item))
    #         list_to_sort.sort(reverse=True)
    #         res = [(t[1], t[0]) for t in list_to_sort]
    #     else:
    #         res = self.model.rank_items(user, self.user_items, selected_items=items_to_rank)
    #         for item in items_to_end:
    #             res.append((item, 0))
    #     return res
    #
    # def rank(self):
    #     if self.max_index_of_user is None:
    #         print('Firstly fit_testset')
    #         return None
    #
    #     result = pd.DataFrame(columns=['item', 'rating', 'user'])
    #     users = list(self.testset.user.unique())
    #     for i in tqdm(range(len(users))):
    #         user = users[i]
    #         res = self.rank_for_user(user)
    #         df = pd.DataFrame(res, columns=['item', 'rating'])
    #         df['user'] = [user] * len(df)
    #         result = pd.concat([result, df])
    #
    #     result = result[['user', 'item', 'rating']]
    #     output = code(copy.deepcopy(result), 'user', 'item', 'rating', self.inv_mapping_dict)
    #     output.index = range(len(output))
    #     return output

    def dump_model(self, filename='dumped_file'):
        """
        Saves the model for further use.
        :param filename: str - path and name of the file to save.
        :return:
        """
        if (self.model is None) | (self.mapped_trainset is None):
            print('Unable to dump model')
            print('Please fit the train dataset and train the model first')
        else:
            dump_obj = {
                'model': self.model,
                'mapped_trainset': self.mapped_trainset,
                'mapping_dict': self.mapping_dict,
                'inv_mapping_dict': self.inv_mapping_dict,
                'max_index_of_item': self.max_index_of_item,
                'max_index_of_user': self.max_index_of_user,
                'item_users': self.item_users,
                'user_items': self.user_items,
                'k': self.k,
                'mean_user_factors': self.mean_user_factors,
                'mean_item_factors': self.mean_item_factors,
                'baseline_recommend_items': self.baseline_recommend_items,
                'baseline_recommend_scores': self.baseline_recommend_scores,
                'param': self.param
            }
            pickle.dump(dump_obj, open(filename, 'wb'),
                        protocol=pickle.HIGHEST_PROTOCOL)
            print('Model has successfully been dumped!')

    def load_model(self, filename='dumped_file'):
        """
        Loads a ready-to-use, pre-trained model from a file.
        :param filename: str - path to the file with the model
        :return: nothing
        """
        dump_obj = pickle.load(open(filename, 'rb'))
        self.model = dump_obj['model']
        self.mapped_trainset = dump_obj['mapped_trainset']
        self.mapping_dict = dump_obj['mapping_dict']
        self.inv_mapping_dict = dump_obj['inv_mapping_dict']
        self.max_index_of_item = dump_obj['max_index_of_item']
        self.max_index_of_user = dump_obj['max_index_of_user']
        self.item_users = dump_obj['item_users']
        self.user_items = dump_obj['user_items']
        self.k = dump_obj['k']
        self.mean_user_factors = dump_obj['mean_user_factors']
        self.mean_item_factors = dump_obj['mean_item_factors']
        self.baseline_recommend_items = dump_obj['baseline_recommend_items']
        self.baseline_recommend_scores = dump_obj['baseline_recommend_scores']
        self.param = dump_obj['param']
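# --- Hedged walkthrough (assumption, not part of the corpus) ---
# `ratings_df` is a hypothetical DataFrame with 'user', 'item' and 'rating'
# columns; fit_coder() and code() come from elsewhere in the project, so
# this only sketches the intended call order of the Implicit wrapper above.
wrapper = Implicit()
wrapper.fit_trainset(ratings_df)            # builds the bm25-weighted item-users matrix
wrapper.fit_model({'factors': 64})          # override just the factor count
wrapper.set_k(5)
print(wrapper.recommend_for_user('alice'))  # top-5 items, or the popularity
                                            # baseline if 'alice' is unseen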
class ImplicitALS:

    def __init__(self, df, config, orig_df):
        df = self._calc_confidence_preference(df, config.alpha)
        self.config = config
        self.orig_df = orig_df

        def check_index_uniformity(index):
            return index.min() == 0 and index.max() == len(index) - 1

        def index_info(index):
            return 'index with min %d max %d count %d items' % (
                index.min(), index.max(), len(index))

        assert check_index_uniformity(df.user_id.drop_duplicates()), \
            index_info(df.user_id.drop_duplicates())
        assert check_index_uniformity(df.item_id.drop_duplicates()), \
            index_info(df.item_id.drop_duplicates())

        users = df.user_id.to_list()
        items = df.item_id.to_list()
        rate = df.rate.to_list()
        shape = (len(set(items)), len(set(users)))
        self.iu_mat = csr_matrix((rate, (items, users)), shape=shape)
        self.ui_mat = self.iu_mat.transpose()

        self.model = ALS(factors=config.factors,
                         calculate_training_loss=True,
                         iterations=config.iterations,
                         regularization=config.regularization)
        self.max_uix = max(users)

    def _calc_confidence_preference(self, df, alpha):
        # convert to confidence and preference:
        # use split_rate as a threshold between the bad and good classes,
        # then amplify the gap with a log transform of preference / eps
        split_rate = 6
        eps = 1e-4
        get_p = lambda v: 1 if v > split_rate else 0
        get_logp = lambda v: log(1 + get_p(v) / eps)
        df['rate'] = 1 + alpha * df.rate.apply(get_logp)
        return df

    def _delete_bookmarks(self, recs, seen_items):
        # Since filter_already_liked doesn't work, filter by hand
        for i, rec in enumerate(recs):
            if rec[0] in seen_items:
                recs[i] = None
        recs = list(filter(lambda r: r is not None, recs))
        return recs

    def fit(self):
        self.model.fit(self.iu_mat)

    def recommend_user(self, user, k, return_scores=False):
        user_items = self.orig_df[self.orig_df.user_id == user].item_id.tolist()

        # over-fetch and filter liked items until we have at least k recs
        base_k = k
        k = int(min(1.5 * k, k + 0.1 * len(user_items)))
        recs = self.model.recommend(user, self.ui_mat, N=k)
        recs = self._delete_bookmarks(recs, user_items)
        while len(recs) < base_k:
            k *= 2
            recs = self.model.recommend(user, self.ui_mat, N=k)
            recs = self._delete_bookmarks(recs, user_items)
        recs = recs[:base_k]

        # return with or without scores
        if not return_scores:
            return [rec[0] for rec in recs]
        return recs

    def similar_items(self, item, k, return_scores=False):
        # Returns items that are similar to the item with the given id
        recs = self.model.similar_items(item, k + 1)
        # avoid recommending the same item
        recs = self._delete_bookmarks(recs, [item])
        recs = recs[:k]

        if not return_scores:
            return [rec[0] for rec in recs]
        return recs

    def similar_items_for_user(self, item, user, k, return_scores=False):
        # Returns items that are similar to the item with the given id and
        # haven't been seen by the user with the given id
        user_items = self.orig_df[self.orig_df.user_id == user].item_id.tolist()
        user_items += [item]  # avoid recommending the same item

        # over-fetch and filter liked items until we have at least k recs
        base_k = k
        k = int(min(1.5 * k, k + 0.1 * len(user_items)))
        recs = self.model.similar_items(item, k)
        recs = self._delete_bookmarks(recs, user_items)
        while len(recs) < base_k:
            k *= 2
            recs = self.model.similar_items(item, k)
            recs = self._delete_bookmarks(recs, user_items)
        recs = recs[:base_k]

        if not return_scores:
            return [rec[0] for rec in recs]
        return recs

    def _add_empty_user(self):
        # Enlarges ui_mat and the model's user_factors by one extra user
        # Update wrapper data
        self.max_uix += 1
        old_shape = self.ui_mat.shape
        self.ui_mat.resize((old_shape[0] + 1, old_shape[1]))
        # Update inner model data
        k = self.model.factors
        # set random weights for the new user
        self.model.user_factors = np.vstack(
            (self.model.user_factors, np.random.randn(k)))

    def update_user_data(self, user, user_views):
        # Updates the model's data about a user and recalculates their factors
        assert isinstance(user, int)
        assert isinstance(user_views, pd.DataFrame)
        assert len(user_views) > 0
        assert len(user_views.user_id.drop_duplicates()) == 1

        user_views = user_views[user_views.item_id != -1]
        user_views = user_views.drop_duplicates(
            subset='item_id user_id'.split(), keep='last')
        user_views = self._calc_confidence_preference(user_views,
                                                      self.config.alpha)
        iixs = user_views.item_id.tolist()
        rates = user_views.rate.tolist()

        # Create the new user's rates as a csr matrix
        rowscols = ([0 for _ in iixs], iixs)
        size = (1, self.ui_mat.shape[1])

        # Update wrapper data
        assert user <= self.max_uix
        self.ui_mat[user] = csr_matrix((rates, rowscols), shape=size)

        # Update inner model data
        k = self.model.factors
        # set random weights for the new user
        self.model.user_factors = np.vstack(
            (self.model.user_factors, np.random.randn(k)))
        # recalculate
        new_user_factors = self.model.recalculate_user(user, self.ui_mat)
        self.model.user_factors[user] = new_user_factors

    def add_user(self, user, user_views=None):
        # Adds a user to the recommender model. Updates the model's matrices
        # and allows making predictions for the new user
        assert isinstance(user, int)
        self._add_empty_user()
        if user_views is None:
            return
        assert isinstance(user_views, pd.DataFrame)
        assert len(user_views) > 0
        assert len(user_views.user_id.drop_duplicates()) == 1
        self.update_user_data(user, user_views)
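# --- Hedged usage sketch (assumption, not part of the corpus) ---
# `config` mimics the attributes ImplicitALS reads; df and orig_df are
# hypothetical frames whose user_id/item_id already form contiguous
# zero-based integer indices, as the constructor asserts.
from types import SimpleNamespace

config = SimpleNamespace(alpha=40, factors=64, iterations=15, regularization=0.01)
rec = ImplicitALS(df, config, orig_df)
rec.fit()
print(rec.recommend_user(user=0, k=5))                     # 5 unseen item ids
print(rec.similar_items(item=3, k=5, return_scores=True))  # [(item, score), ...]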
class Recommender:
    def __init__(self, **args):
        # numeric params are cast explicitly because create() may pull them
        # from os.environ as strings
        self.TRAINING_THREADS = int(
            args.get("training_threads", os.cpu_count()))
        self.ALS_FACTORS = int(args.get("als_factors", 128))
        self.ALS_REGULARIZATION = float(args.get("als_regularization", 1e-2))
        self.ALS_ITERATIONS = int(args.get("als_iterations", 15))
        self.MIN_POST_FAVS = int(args.get("min_post_favs", 5))
        self.MIN_USER_FAVS = int(args.get("min_user_favs", 50))
        self.MAX_FAVS = int(args.get("max_favs", 1e12))
        self.FAVS_PATH = args.get("favs_path", "data/favs.csv")
        self.MODEL_PATH = args.get("model_path", "data/recommender.pickle")
        self.DATABASE_URL = args.get("database_url",
                                     "postgresql://localhost/danbooru2")

    @staticmethod
    def create(**args):
        env = {name.lower(): value for name, value in os.environ.items()}
        args = {**env, **args}

        recommender = Recommender(**args)
        recommender.dump_favorites()
        recommender.load_favorites()
        recommender.train()
        recommender.save(recommender.MODEL_PATH)

        return recommender

    @staticmethod
    def load(model_path):
        with open(model_path, "rb") as file:
            return pickle.load(file)

    def dump_favorites(self):
        query = f"""
            SELECT post_id, user_id
            FROM favorites
            WHERE post_id IN (SELECT id FROM posts WHERE fav_count > {self.MIN_POST_FAVS})
              AND user_id IN (SELECT id FROM users WHERE favorite_count > {self.MIN_USER_FAVS})
            ORDER BY post_id DESC
            LIMIT {self.MAX_FAVS}
        """
        self.shell(
            f"psql --no-psqlrc -c '\\copy ({query}) TO STDOUT WITH (FORMAT CSV)' {self.DATABASE_URL} > {self.FAVS_PATH}"
        )

    def load_favorites(self):
        favs_df = pd.read_csv(self.FAVS_PATH,
                              dtype=np.int32,
                              names=["post_id", "user_id"])
        favs_df = favs_df.astype("category")

        self.favorites = csr_matrix(
            (np.ones(favs_df.shape[0]), (favs_df["post_id"].cat.codes.copy(),
                                         favs_df["user_id"].cat.codes.copy())),
            dtype=np.int32)

        self.users_to_id = {
            k: v
            for v, k in enumerate(favs_df["user_id"].cat.categories)
        }
        self.posts_to_id = {
            k: v
            for v, k in enumerate(favs_df["post_id"].cat.categories)
        }
        self.ids_to_post = {k: v for v, k in self.posts_to_id.items()}
        self.empty = csr_matrix(self.favorites.shape)

    def train(self):
        self.model = AlternatingLeastSquares(
            calculate_training_loss=True,
            dtype=np.float32,
            num_threads=self.TRAINING_THREADS,
            factors=self.ALS_FACTORS,
            regularization=self.ALS_REGULARIZATION,
            iterations=self.ALS_ITERATIONS)

        start = time.monotonic()
        self.model.fit(self.favorites)
        end = time.monotonic()
        dur = int(end - start)

        self.favorites = None
        self.trained_at = datetime.utcnow().isoformat()
        self.training_time = "{:02d}:{:02d}:{:02d}".format(
            dur // 3600, (dur % 3600 // 60), dur % 60)

    def recommend_for_user(self, user_id, limit=50):
        if user_id not in self.users_to_id:
            return []

        uid = self.users_to_id[user_id]
        recommendations = self.model.recommend(uid, self.empty, N=limit)
        recommendations = [(self.ids_to_post[id], float(score))
                           for id, score in recommendations]
        return recommendations

    def recommend_for_post(self, post_id, limit=50):
        if post_id not in self.posts_to_id:
            return []

        pid = self.posts_to_id[post_id]
        recommendations = self.model.similar_items(pid, N=limit)
        recommendations = [(self.ids_to_post[id], float(score))
                           for id, score in recommendations]
        return recommendations

    def metrics(self):
        return {
            "user_count": len(self.users_to_id),
            "post_count": len(self.posts_to_id),
            "factors": self.model.factors,
            "model_size": 4 * self.model.factors *
                          (len(self.users_to_id) + len(self.posts_to_id)),
            "trained_at": self.trained_at,
            "training_time": self.training_time,
        }

    def save(self, model_path):
        with open(model_path, "wb") as file:
            pickle.dump(self, file)

    def shell(self, cmd):
        subprocess.run(cmd,
                       stdout=sys.stdout,
                       stderr=sys.stderr,
                       shell=True,
                       check=True)
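# Usage sketch (illustrative, not part of the original class): the intended
# lifecycle of Recommender. create() shells out to psql and so assumes a
# reachable DATABASE_URL; once a model has been pickled, load() is the cheap
# path. The user/post ids below are hypothetical.
if __name__ == "__main__":
    # heavy path, retrains from the database:
    # recommender = Recommender.create()
    recommender = Recommender.load("data/recommender.pickle")
    print(recommender.metrics())
    print(recommender.recommend_for_user(12345, limit=10))  # [(post_id, score), ...]
    print(recommender.recommend_for_post(67890, limit=10))  # similar posts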
def mf_(self, train_songs_A, train_tags_A, test_songs_A, test_tags_A,
        song_ntop=500, tag_ntop=50, iteration=20):
    print(f'MF... iters:{iteration}')
    # Best hyperparameters as of 07/11: confidence * 100,
    # song factors 256, tag factors 32, reg = 0.1, 20 epochs
    # -> song 56.4%, tag 61.3%
    res = []

    # stack test rows first so that test playlist pid maps to row pid
    songs_A = spr.vstack([test_songs_A, train_songs_A])
    tags_A = spr.vstack([test_tags_A, train_tags_A])

    als_model = ALS(factors=256,
                    regularization=0.08,
                    use_gpu=True,
                    iterations=iteration)  # epoch
    als_model.fit(songs_A.T * 100)

    als_model_tag = ALS(factors=32,
                        regularization=0.08,
                        use_gpu=True,
                        iterations=iteration)
    als_model_tag.fit(tags_A.T * 100)

    #rec_song = als_model.recommend_all(train_songs_A, N=500)
    #rec_tag = als_model_tag.recommend_all(train_tags_A, N=50)  # list (no score)

    for pid in tqdm(range(test_songs_A.shape[0])):
        if self.plylst_test.loc[(self.n_train + pid), "song_dirty"] == 1:
            cand_song = als_model.recommend(
                pid,
                test_songs_A,
                N=song_ntop + 50,
                filter_already_liked_items=False)
        else:
            cand_song = als_model.recommend(
                pid,
                test_songs_A,
                N=song_ntop,
                filter_already_liked_items=True)

        if self.plylst_test.loc[(self.n_train + pid), "tag_dirty"] == 1:
            cand_tag = als_model_tag.recommend(
                pid,
                test_tags_A,
                N=tag_ntop + 5,
                filter_already_liked_items=True)
            #tags_already = self.orig_test[self.orig_test['id'] == self.plylst_nid_id[self.n_train + pid]]['tags']
            #cand_tag = remove_seen(tags_already, cand_tag)[:tag_ntop]
        else:
            cand_tag = als_model_tag.recommend(
                pid,
                test_tags_A,
                N=tag_ntop,
                filter_already_liked_items=True)

        rec_song_idx = [self.song_sid_id.get(x[0]) for x in cand_song]
        rec_song_score = [x[1] for x in cand_song]
        rec_tag_idx = [self.tag_tid_id.get(x[0]) for x in cand_tag]
        rec_tag_score = [x[1] for x in cand_tag]

        res.append({
            "id": self.plylst_nid_id[self.n_train + pid],
            "songs": rec_song_idx,
            "tags": rec_tag_idx,
            "songs_score": rec_song_score,
            "tags_score": rec_tag_score
        })

    print("DONE")
    return res
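# Note on row indexing in mf_ (a minimal self-contained check, not part of
# the original code): because the matrices are stacked as
# vstack([test, train]), test playlist pid occupies row pid of the combined
# matrix, so the models fit on songs_A.T / tags_A.T can be queried with the
# raw test index.
import numpy as np
import scipy.sparse as spr

_test = spr.csr_matrix(np.eye(2))
_train = spr.csr_matrix(np.ones((3, 2)))
_combined = spr.vstack([_test, _train]).tocsr()
assert (_combined[0].toarray() == _test[0].toarray()).all()  # test rows come first
assert (_combined[2].toarray() == _train[0].toarray()).all()  # train rows follow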
class CollaborativeFiltering(object):
    """
    Implicit-feedback collaborative filtering based on alternating least squares
    """

    def __init__(self,
                 factors=100,
                 regularization=0.01,
                 iterations=15,
                 calculate_training_loss=True,
                 num_threads=0,
                 random_state=42,
                 **kwargs):
        """
        Args:
            factors (int): Dimensionality of the latent factors
            regularization (float): L2 regularization strength
            iterations (int): Number of ALS iterations
            calculate_training_loss (bool): Whether to track training loss
            num_threads (int): Threads used for fitting (0 = all cores)
            random_state (int): Seed fixed before fitting
        """
        ## Initialize
        self._random_state = random_state
        self.model = AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,
            iterations=iterations,
            calculate_training_loss=calculate_training_loss,
            num_threads=num_threads,
            **kwargs)

    def __repr__(self):
        """
        Human-readable summary of the wrapped model
        """
        desc = f"CollaborativeFiltering(factors={self.model.factors}, regularization={self.model.regularization})"
        return desc

    def fit(self, X, rows=None, columns=None):
        """
        Args:
            X (csr sparse matrix): Rows are Items, Columns are Users
            rows (list): Identifiers for the rows
            columns (list): Identifiers for the columns
        """
        ## Fix Random Seed
        np.random.seed(self._random_state)
        ## Indices
        if rows is None:
            rows = list(range(X.shape[0]))
        if columns is None:
            columns = list(range(X.shape[1]))
        self._item_map = dict((row, r) for r, row in enumerate(rows))
        self._items = rows
        self._user_map = dict((col, c) for c, col in enumerate(columns))
        self._users = columns
        ## Fit
        self.model.fit(X, show_progress=self.model.calculate_training_loss)
        return self

    def get_similar_item(self, item, k_top=10):
        """
        Find similar items to a given item using cosine similarity

        Args:
            item (any): One of the rows in the training data
            k_top (int): Number of top similar items to return
        """
        if item not in self._item_map:
            raise KeyError("Item does not exist")
        ## Compute Cosine Similarity
        item_f = self.model.item_factors[self._item_map[item]]
        item_factors = self.model.item_factors
        sim = item_factors.dot(item_f) / (
            self.model.item_norms * self.model.item_norms[self._item_map[item]])
        ## Select Top-k
        best = np.argpartition(sim, -k_top)[-k_top:]
        sim = sorted(zip(best, sim[best]), key=lambda x: -x[1])
        ## Replace Indices with Names
        sim_items = list(map(lambda i: [self._items[i[0]], i[1]], sim))
        sim_items = pd.DataFrame(sim_items, columns=["item", "similarity"])
        return sim_items

    def recommend(self,
                  user_history,
                  filter_liked=False,
                  filter_items=None,
                  k_top=10):
        """
        Args:
            user_history (dict or list): Raw items the user interacted with
                (a list is converted to interaction counts)
            filter_liked (bool): Whether to drop items already in the history
            filter_items (list): Additional items to exclude from the results
            k_top (int): Number of items to recommend
        """
        ## User History
        user_history = Counter(user_history)
        ## Items to Filter (avoid a mutable default argument)
        if filter_items is None:
            filter_items = []
        ## Compute User Factor
        user_vector = np.zeros(self.model.item_factors.shape[0])
        for item, count in user_history.items():
            if item not in self._item_map:
                continue
            user_vector[self._item_map[item]] = count
        ## Compute Score
        scores = self.model.recommend(
            userid=0,
            user_items=csr_matrix(user_vector),
            N=k_top,
            filter_already_liked_items=filter_liked,
            filter_items=list(map(lambda f: self._item_map[f], filter_items)),
            recalculate_user=True)
        ## Replace Indices with Names
        rec_items = list(map(lambda i: [self._items[i[0]], i[1]], scores))
        rec_items = pd.DataFrame(rec_items, columns=["item", "score"])
        return rec_items

    def dump(self, model_file, compress=3):
        """
        Persist the fitted wrapper to disk with joblib
        """
        _ = joblib.dump(self, model_file, compress=compress)
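# Usage sketch (illustrative, not part of the original class): fits
# CollaborativeFiltering on a tiny item-user matrix. The row and column
# labels are hypothetical; note that fit() expects items as rows and users
# as columns, as its docstring states.
import numpy as np
from scipy.sparse import csr_matrix

def _collaborative_filtering_demo():
    X = csr_matrix(np.array([
        [3, 0, 1, 0],  # item "a"
        [0, 2, 0, 1],  # item "b"
        [1, 1, 4, 0],  # item "c"
        [0, 0, 1, 2],  # item "d"
    ], dtype=np.float32))  # 4 items x 4 users
    cf = CollaborativeFiltering(factors=2, iterations=5)
    cf.fit(X, rows=["a", "b", "c", "d"], columns=["u1", "u2", "u3", "u4"])
    print(cf.get_similar_item("a", k_top=2))       # DataFrame: item, similarity
    print(cf.recommend(["a", "a", "c"], k_top=2))  # DataFrame: item, score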