def train_and_predict(train_filepath, test_filepath):
    """Fit one ALS model on songs and tags jointly and recommend for the test set.

    Both files are read with ``pd.read_json``. A ``Vocabulary`` is built over
    the concatenation of both frames, each frame is encoded with
    ``encode_features``, and a single ALS model is fitted on the stacked
    (test rows first, then train rows) matrix. The fitted item factors are
    then split at ``vocab.num_songs`` into a song model and a tag model that
    share the user factors.

    Args:
        train_filepath: Path of the training playlists JSON.
        test_filepath: Path of the test playlists JSON.

    Returns:
        A list with one dict per row of the test frame with the keys ``"id"``,
        ``"songs"`` (``N=100`` recommendations mapped through
        ``vocab.id_to_song``) and ``"tags"`` (``N=10`` recommendations mapped
        through ``vocab.id_to_tag``).
    """
    train_df = pd.read_json(train_filepath)
    test_df = pd.read_json(test_filepath)

    vocab = Vocabulary(pd.concat([train_df, test_df], ignore_index=True))

    train_data = encode_features(train_df, vocab)
    test_data = encode_features(test_df, vocab)

    # Shuffle train data.
    # NOTE(review): the result is used, so `shuffle` must return the shuffled
    # data (e.g. sklearn.utils.shuffle) -- confirm against the file's imports.
    train_data = shuffle(train_data)

    def lil_to_csr(indices, shape):
        # list of lists -> CSR: one explicit 1 per (row, column id) pair.
        row_ind = []
        col_ind = []
        for row_idx, row in enumerate(indices):
            for col_idx in row:
                row_ind.append(row_idx)
                col_ind.append(col_idx)
        data = [1] * len(row_ind)
        return csr_matrix((data, (row_ind, col_ind)), shape=shape)

    train_csr = lil_to_csr(train_data, (len(train_data), vocab.size))
    test_csr = lil_to_csr(test_data, (len(test_data), vocab.size))

    # Test rows come first so that row i of `test_csr` is user i of the model.
    r = csr_matrix(scipy.sparse.vstack([test_csr, train_csr]))

    factors = 512
    alpha = 500.0

    als_model = ALS(factors=factors, regularization=0.1)
    als_model.fit(r.T * alpha)

    # Two views over the same factorisation: shared user factors, item
    # factors split into the song block and the tag block.
    song_model = ALS(factors=factors)
    tag_model = ALS(factors=factors)
    song_model.user_factors = als_model.user_factors
    tag_model.user_factors = als_model.user_factors
    song_model.item_factors = als_model.item_factors[:vocab.num_songs]
    tag_model.item_factors = als_model.item_factors[vocab.num_songs:]

    song_rec_csr = test_csr[:, :vocab.num_songs]
    tag_rec_csr = test_csr[:, vocab.num_songs:]

    song_rec = song_model.recommend_all(song_rec_csr, N=100)
    tag_rec = tag_model.recommend_all(tag_rec_csr, N=10)
    # The tag model numbers its items from 0; shift back into vocabulary ids.
    tag_rec += vocab.num_songs

    return [
        {
            "id": test_playlist_id,
            "songs": [vocab.id_to_song(i) for i in song_rec[test_row_idx]],
            "tags": [vocab.id_to_tag(i) for i in tag_rec[test_row_idx]],
        }
        for test_row_idx, test_playlist_id in enumerate(tqdm(test_df.id))
    ]
def __init__(self, params=None, nunique_feature=None):
    """Create the joint ALS model plus the song and tag models.

    Args:
        params: Mapping of ALS keyword arguments plus a mandatory ``"c"``
            entry. ``"c"`` is stored on ``self.c`` and is not forwarded to
            ``ALS``. Defaults to ``{"c": None}``. The mapping passed in is
            never modified.
        nunique_feature: Stored unchanged on ``self.nunique_feature``.

    Raises:
        KeyError: If ``params`` has no ``"c"`` entry.
    """
    # Work on a private copy. The previous code deleted "c" from the argument
    # itself, which emptied the shared default dict (so a second default
    # construction raised KeyError) and silently altered the caller's dict.
    params = {"c": None} if params is None else params.copy()

    # `self.params` keeps the full mapping, including "c".
    self.params = params.copy()
    self.c = params.pop("c")

    self.model = ALS(**params)
    self.song_model = ALS(**params)
    self.tag_model = ALS(**params)
    self.song_rec_csr = None
    self.tag_rec_csr = None
    self.nunique_feature = nunique_feature
def __init__(self, df, config, orig_df):
    """Build the item-user / user-item matrices and an untrained ALS model.

    Args:
        df: Interaction frame; it is first passed through
            ``self._calc_confidence_preference`` together with
            ``config.alpha``. The result must expose ``user_id``, ``item_id``
            and ``rate`` columns.
        config: Object providing ``alpha``, ``factors``, ``iterations`` and
            ``regularization``.
        orig_df: Stored unchanged on ``self.orig_df``.

    Raises:
        AssertionError: If the distinct ``user_id`` (or ``item_id``) values do
            not have minimum 0 and maximum ``count - 1``.
    """
    df = self._calc_confidence_preference(df, config.alpha)
    self.config = config
    self.orig_df = orig_df

    def spans_zero_to_count(ids):
        # Minimum is 0 and maximum is one less than the number of values.
        return ids.min() == 0 and ids.max() == len(ids) - 1

    def describe(ids):
        return 'index with min %d max %d count %d items' % (
            ids.min(), ids.max(), len(ids))

    distinct_users = df.user_id.drop_duplicates()
    assert spans_zero_to_count(distinct_users), describe(distinct_users)
    distinct_items = df.item_id.drop_duplicates()
    assert spans_zero_to_count(distinct_items), describe(distinct_items)

    users = df.user_id.to_list()
    items = df.item_id.to_list()
    rate = df.rate.to_list()

    # Rows are items, columns are users.
    n_items = len(set(items))
    n_users = len(set(users))
    self.iu_mat = csr_matrix((rate, (items, users)), shape=(n_items, n_users))
    self.ui_mat = self.iu_mat.transpose()

    self.model = ALS(
        factors=config.factors,
        calculate_training_loss=True,
        iterations=config.iterations,
        regularization=config.regularization,
    )
    self.max_uix = max(users)
def __init__(self, params):
    """Create the ALS model and reset the bookkeeping attributes.

    Args:
        params: Mapping of ALS keyword arguments plus a mandatory ``"c"``
            entry. ``"c"`` is stored on ``self.c``; everything else is
            forwarded to ``ALS``. The mapping passed in is not modified.

    Raises:
        KeyError: If ``params`` has no ``"c"`` entry.
    """
    # Copy first so removing "c" never touches the caller's mapping.
    als_kwargs = params.copy()
    self.c = als_kwargs.pop("c")
    self.model = ALS(**als_kwargs)

    self.plylst_feature_mapping = {}
    self.song_feature_mapping = {}

    # Sizes are unknown until data is loaded.
    self.n_plylst = self.n_song = self.n_tag = None
    self.n_train = self.n_test = None
def _split_collective_model(self, csr_matrix):
    """Split the collective model and a matrix into per-entity pieces.

    ``self.num_entities`` is walked in order; each entry is the width of one
    consecutive block. For every block a fresh ``ALS`` model is created that
    shares ``self.model.user_factors`` and receives the matching slice of
    ``self.model.item_factors``. The same column range is cut out of
    ``csr_matrix``.

    Args:
        csr_matrix: Matrix whose columns are sliced block by block.

    Returns:
        A ``(models, matrices)`` pair of equally long lists.
    """
    models = []
    matrices = []
    start = 0
    for width in self.num_entities:
        stop = start + width

        part = ALS(factors=self.num_factors)
        part.user_factors = self.model.user_factors
        part.item_factors = self.model.item_factors[start:stop]
        models.append(part)

        matrices.append(csr_matrix[:, start:stop])
        start = stop
    return models, matrices
def mf_(self, train_songs_A, train_tags_A, test_songs_A, test_tags_A,
        song_ntop=500, tag_ntop=50, iteration=20):
    """Recommend songs and tags for every test row with two separate ALS models.

    One model is fitted on the stacked (test first, then train) song matrix
    and one on the stacked tag matrix. Each test row is then queried
    individually; rows flagged ``song_dirty`` / ``tag_dirty`` in
    ``self.plylst_test`` get a larger candidate list.

    Args:
        train_songs_A: Train playlist x song matrix.
        train_tags_A: Train playlist x tag matrix.
        test_songs_A: Test playlist x song matrix.
        test_tags_A: Test playlist x tag matrix.
        song_ntop: Number of song candidates per playlist.
        tag_ntop: Number of tag candidates per playlist.
        iteration: Number of ALS iterations.

    Returns:
        A list of dicts with the keys ``"id"``, ``"songs"``, ``"tags"``,
        ``"songs_score"`` and ``"tags_score"``, one per test row.
    """
    print(f'MF... iters:{iteration}')
    # Best hyper-parameters as of 07/11: confidence * 100, song factors 256,
    # tag factors 32, reg = 0.1, 20 epochs -> song 56.4%, tag 61.3%.
    results = []

    stacked_songs = spr.vstack([test_songs_A, train_songs_A])
    stacked_tags = spr.vstack([test_tags_A, train_tags_A])

    song_als = ALS(factors=256, regularization=0.08, use_gpu=True,
                   iterations=iteration)
    song_als.fit(stacked_songs.T * 100)

    tag_als = ALS(factors=32, regularization=0.08, use_gpu=True,
                  iterations=iteration)
    tag_als.fit(stacked_tags.T * 100)

    for pid in tqdm(range(test_songs_A.shape[0])):
        # Test rows are labelled after the train rows in `plylst_test`.
        label = self.n_train + pid

        # Dirty song lists: 50 extra candidates, known songs are kept.
        if self.plylst_test.loc[label, "song_dirty"] == 1:
            song_n, drop_known_songs = song_ntop + 50, False
        else:
            song_n, drop_known_songs = song_ntop, True
        song_hits = song_als.recommend(
            pid, test_songs_A, N=song_n,
            filter_already_liked_items=drop_known_songs)

        # Dirty tag lists: 5 extra candidates, known tags are still dropped.
        if self.plylst_test.loc[label, "tag_dirty"] == 1:
            tag_n = tag_ntop + 5
        else:
            tag_n = tag_ntop
        tag_hits = tag_als.recommend(
            pid, test_tags_A, N=tag_n, filter_already_liked_items=True)

        # Each hit is indexed as (internal id, score).
        song_ids = [self.song_sid_id.get(hit[0]) for hit in song_hits]
        song_scores = [hit[1] for hit in song_hits]
        tag_ids = [self.tag_tid_id.get(hit[0]) for hit in tag_hits]
        tag_scores = [hit[1] for hit in tag_hits]

        results.append({
            "id": self.plylst_nid_id[label],
            "songs": song_ids,
            "tags": tag_ids,
            "songs_score": song_scores,
            "tags_score": tag_scores
        })

    print("DONE")
    return results
def mixed_(self, train_songs_A, train_tags_A, test_songs_A, test_tags_A,
           song_ntop=500, tag_ntop=50, iteration=20):
    """Recommend songs with ALS and tags with a co-occurrence product.

    Songs come from an ALS model fitted on the stacked (test first, then
    train) song matrix. Tag scores come from
    ``test_tags_A . train_tags_A^T . train_tags_A``.

    Args:
        train_songs_A: Train playlist x song matrix.
        train_tags_A: Train playlist x tag matrix.
        test_songs_A: Test playlist x song matrix.
        test_tags_A: Test playlist x tag matrix.
        song_ntop: Number of song candidates per playlist.
        tag_ntop: Maximum number of tags per playlist.
        iteration: Number of ALS iterations.

    Returns:
        A list of dicts with the keys ``"id"``, ``"songs"``, ``"tags"``,
        ``"songs_score"`` and ``"tags_score"``, one per test row.
        ``"tags_score"`` holds one score per entry of ``"tags"``, in the same
        order.
    """
    print("MF for song / CF for tag...")
    res = []

    # song
    songs_A = spr.vstack([test_songs_A, train_songs_A])
    als_model = ALS(factors=256, regularization=0.08, use_gpu=True,
                    iterations=iteration)
    als_model.fit(songs_A.T * 100)

    # tag
    train_tags_A_T = train_tags_A.T.tocsr()  # shape: n_tags x n_train playlists
    tag_val = test_tags_A.dot(train_tags_A_T)
    cand_tag_matrix = tag_val.dot(train_tags_A)
    del tag_val  # the intermediate product is not needed any more

    # Iterate the range directly so tqdm knows the total; the row of
    # `cand_tag_matrix` and the test row id are the same number.
    for pid in tqdm(range(test_songs_A.shape[0])):
        # Test rows are labelled after the train rows in `plylst_test`.
        label = self.n_train + pid

        # song: dirty song lists keep the songs that are already present.
        song_dirty = bool(self.plylst_test.loc[label, "song_dirty"] == 1)
        cand_song = als_model.recommend(
            pid, test_songs_A, N=song_ntop,
            filter_already_liked_items=not song_dirty)
        rec_song_idx = [self.song_sid_id.get(x[0]) for x in cand_song]
        rec_song_score = [x[1] for x in cand_song]

        # tag: take 5 spare candidates so removing seen tags can still leave
        # `tag_ntop` of them.
        tag_row = cand_tag_matrix.getrow(pid).toarray().reshape(-1)
        cand_tag_idx = tag_row.argsort()[-tag_ntop - 5:][::-1]
        tags_already = self.plylst_test.loc[label, "tags_id"]
        # The old dirty / clean branches were identical, so no `tag_dirty`
        # lookup is needed here.
        rec_tag_idx = remove_seen(tags_already, cand_tag_idx)[:tag_ntop]
        # Score exactly the tags that are returned. Scores used to be taken
        # for every raw candidate, so they did not line up with "tags".
        # `.item()` keeps them plain Python numbers as before.
        rec_tag_score = [tag_row[i].item() for i in rec_tag_idx]

        res.append({
            "id": self.plylst_nid_id[label],
            "songs": rec_song_idx,
            "tags": [self.tag_tid_id[i] for i in rec_tag_idx],
            "songs_score": rec_song_score,
            "tags_score": rec_tag_score
        })

    return res
def multi_mf_(self, train_songs_A, train_tags_A, test_songs_A, test_tags_A,
              song_ntop, tag_ntop, iteration):
    """Recommend songs and tags from one ALS model fitted on songs and tags jointly.

    The song and tag matrices are stacked (test first, then train) and joined
    column-wise, songs before tags. After fitting, the item factors are split
    at ``self.n_songs`` into a song model and a tag model that share the user
    factors.

    Args:
        train_songs_A: Train playlist x song matrix.
        train_tags_A: Train playlist x tag matrix.
        test_songs_A: Test playlist x song matrix.
        test_tags_A: Test playlist x tag matrix.
        song_ntop: Number of songs to recommend per playlist.
        tag_ntop: Number of tags to recommend per playlist.
        iteration: Number of ALS iterations.

    Returns:
        A list of dicts with the keys ``"id"``, ``"songs"`` and ``"tags"``,
        one per test row.
    """
    print(f'Multi_MF... iters:{iteration}')

    songs_A = spr.vstack([test_songs_A, train_songs_A])
    tags_A = spr.vstack([test_tags_A, train_tags_A])
    A = spr.hstack([songs_A, tags_A])

    als_model = ALS(factors=256, regularization=0.08, use_gpu=True,
                    iterations=iteration)
    als_model.fit(A.T * 15)

    # Two CPU-side views over the same factorisation.
    song_model = ALS(use_gpu=False)
    tag_model = ALS(use_gpu=False)
    song_model.user_factors = als_model.user_factors
    tag_model.user_factors = als_model.user_factors
    song_model.item_factors = als_model.item_factors[:self.n_songs]
    tag_model.item_factors = als_model.item_factors[self.n_songs:]

    # The test rows are the first `n_test` rows of the stacked matrices.
    song_rec_csr = songs_A[:self.n_test, :]
    tag_rec_csr = tags_A[:self.n_test, :]

    # Honour the requested list sizes; they used to be hard-coded to 500 / 50
    # regardless of the arguments.
    cand_song = song_model.recommend_all(song_rec_csr, N=song_ntop)
    cand_tag = tag_model.recommend_all(tag_rec_csr, N=tag_ntop)

    res = [
        {
            "id": self.plylst_nid_id[self.n_train + row],
            "songs": [self.song_sid_id.get(x) for x in song_ids.tolist()],
            "tags": [self.tag_tid_id.get(x) for x in tag_ids.tolist()]
        }
        for row, (song_ids, tag_ids) in enumerate(zip(cand_song, cand_tag))
    ]

    print("DONE")
    return res
def mf_(self, train_songs_A, train_tags_A, test_songs_A, test_tags_A,
        song_ntop=500, tag_ntop=50, iteration=20):
    """Recommend songs and tags per row and split the results into val / test.

    One ALS model is fitted on the stacked (test first, then train) song
    matrix and one on the stacked tag matrix. Rows ``0 .. self.n_test_song-1``
    are queried one by one; rows below ``self.n_val_song`` go to the
    validation lists, the rest to the test lists.

    Args:
        train_songs_A: Train playlist x song matrix.
        train_tags_A: Train playlist x tag matrix.
        test_songs_A: Playlist x song matrix of the rows to recommend for.
        test_tags_A: Playlist x tag matrix of the rows to recommend for.
        song_ntop: Number of song candidates per playlist.
        tag_ntop: Number of tag candidates per playlist.
        iteration: Number of ALS iterations.

    Returns:
        ``(val_song_res, val_tag_res, test_song_res, test_tag_res)``; song
        entries carry ``"id"``, ``"songs"``, ``"songs_score"`` and tag entries
        carry ``"id"``, ``"tags"``, ``"tags_score"``.
    """
    print(f'MF... iters:{iteration}')
    # Best hyper-parameters as of 07/11: confidence * 100, song factors 256,
    # tag factors 32, reg = 0.1, 20 epochs -> song 56.4%, tag 61.3%.
    val_song_res, val_tag_res = [], []
    test_song_res, test_tag_res = [], []

    stacked_songs = spr.vstack([test_songs_A, train_songs_A])
    stacked_tags = spr.vstack([test_tags_A, train_tags_A])

    song_als = ALS(factors=256, regularization=0.08, use_gpu=True,
                   iterations=iteration)
    song_als.fit(stacked_songs.T * 100)

    tag_als = ALS(factors=32, regularization=0.08, use_gpu=True,
                  iterations=iteration)
    tag_als.fit(stacked_tags.T * 100)

    # Original note: 18636 song rows / tags -> 11605 rows.
    for row in tqdm(range(self.n_test_song)):
        # song
        song_hits = song_als.recommend(
            row, test_songs_A, N=song_ntop, filter_already_liked_items=True)
        song_ids = [self.song_sid_id.get(hit[0]) for hit in song_hits]
        song_scores = [hit[1] for hit in song_hits]

        # order - train, val, test
        song_bucket = val_song_res if row < self.n_val_song else test_song_res
        song_bucket.append({
            "id": self.plylst_nid_id[self.plylst_test_song.index[row]],
            "songs": song_ids,
            "songs_score": song_scores
        })

        # tag: an IndexError for this row is skipped silently.
        try:
            tag_hits = tag_als.recommend(
                row, test_tags_A, N=tag_ntop, filter_already_liked_items=True)
            tag_ids = [self.tag_tid_id.get(hit[0]) for hit in tag_hits]
            tag_scores = [hit[1] for hit in tag_hits]

            # NOTE(review): the tag split reuses `n_val_song`; confirm that
            # the val/test boundary is the same for the tag rows.
            tag_bucket = val_tag_res if row < self.n_val_song else test_tag_res
            tag_bucket.append({
                "id": self.plylst_nid_id[self.plylst_test_tag.index[row]],
                "tags": tag_ids,
                "tags_score": tag_scores
            })
        except IndexError:
            pass

    print("DONE")
    return val_song_res, val_tag_res, test_song_res, test_tag_res
def __init__(self, num_entities, num_factors, **kwargs):
    """Store the entity layout and create the collective ALS model.

    Args:
        num_entities: Stored unchanged on ``self.num_entities``.
        num_factors: Passed to ``ALS`` as ``factors`` and stored on
            ``self.num_factors``.
        **kwargs: Forwarded to ``ALS`` unchanged.
    """
    self.num_entities, self.num_factors = num_entities, num_factors
    self.model = ALS(factors=num_factors, **kwargs)
def multi_mf_(self, train_songs_A, train_tags_A, val_songs_A, val_tags_A,
              test_songs_A, test_tags_A, meta=True, song_ntop=500,
              tag_ntop=50, iteration=20, score=False):
    """Recommend songs and tags for val and test rows from one joint ALS model.

    Rows are stacked as val, test, train. Columns are songs, then tags, then
    (when ``meta`` is truthy) the matrix returned by ``self.mkspr_for_meta()``.
    After fitting, the item factors are split into a song block and a tag
    block that share the user factors.

    Args:
        train_songs_A: Train playlist x song matrix.
        train_tags_A: Train playlist x tag matrix.
        val_songs_A: Validation playlist x song matrix.
        val_tags_A: Validation playlist x tag matrix.
        test_songs_A: Test playlist x song matrix.
        test_tags_A: Test playlist x tag matrix.
        meta: Whether to append the meta columns before fitting.
        song_ntop: Number of songs to recommend per playlist.
        tag_ntop: Number of tags to recommend per playlist.
        iteration: Number of ALS iterations.
        score: When ``True`` no recommendations are produced (the scored
            output is not implemented) and two empty lists are returned.

    Returns:
        ``(val_res, test_res)``: lists of dicts with the keys ``"id"``,
        ``"songs"`` and ``"tags"``.
    """
    print(f'Multi_MF... iters:{iteration}')
    val_res = []
    test_res = []

    songs_A = spr.vstack([val_songs_A, test_songs_A, train_songs_A])
    tags_A = spr.vstack([val_tags_A, test_tags_A, train_tags_A])
    print(val_songs_A.shape, test_songs_A.shape, train_songs_A.shape)

    if meta:
        s_meta = self.mkspr_for_meta()
        print(songs_A.shape, tags_A.shape, s_meta.shape)
        A = spr.hstack([songs_A, tags_A, s_meta])
    else:
        A = spr.hstack([songs_A, tags_A])

    als_model = ALS(factors=256, regularization=0.08, use_gpu=True,
                    iterations=iteration)
    als_model.fit(A.T * 100)

    song_model = ALS(use_gpu=True)
    tag_model = ALS(use_gpu=True)
    song_model.user_factors = als_model.user_factors
    tag_model.user_factors = als_model.user_factors

    # Bound the tag block explicitly. With meta columns appended, an open
    # ended slice also picked up the meta items, so the tag model had more
    # items than the tag matrix has columns.
    tag_stop = self.n_songs + tags_A.shape[1]
    song_model.item_factors = als_model.item_factors[:self.n_songs]
    tag_model.item_factors = als_model.item_factors[self.n_songs:tag_stop]

    if score is True:
        # Scored output is not implemented; keep returning the empty lists.
        return val_res, test_res

    # Recommend for val and test in one pass over the leading rows, so row i
    # of the matrix is also user i of the fitted model. Passing the test
    # slice on its own scored test rows with the user factors of rows 0..,
    # which belong to the validation playlists.
    n_eval = self.n_val + self.n_test
    cand_song = song_model.recommend_all(songs_A[:n_eval, :], N=song_ntop)
    cand_tag = tag_model.recommend_all(tags_A[:n_eval, :], N=tag_ntop)

    # Playlist labels run train, val, test, hence the `n_train` offset.
    records = [
        {
            "id": self.plylst_nid_id[self.n_train + row],
            "songs": [self.song_sid_id.get(x) for x in song_ids.tolist()],
            "tags": [self.tag_tid_id.get(x) for x in tag_ids.tolist()]
        }
        for row, (song_ids, tag_ids) in enumerate(zip(cand_song, cand_tag))
    ]
    val_res = records[:self.n_val]
    test_res = records[self.n_val:]

    return val_res, test_res
def mf_(self, train_songs_A, train_tags_A, val_songs_A, val_tags_A,
        test_songs_A, test_tags_A, song_ntop=500, tag_ntop=50, iteration=20,
        score=False):
    """Recommend songs and tags for val and test rows with two ALS models.

    One model is fitted on the song matrices and one on the tag matrices,
    each stacked as val, test, train.

    Args:
        train_songs_A: Train playlist x song matrix.
        train_tags_A: Train playlist x tag matrix.
        val_songs_A: Validation playlist x song matrix.
        val_tags_A: Validation playlist x tag matrix.
        test_songs_A: Test playlist x song matrix.
        test_tags_A: Test playlist x tag matrix.
        song_ntop: Number of songs to recommend per playlist.
        tag_ntop: Number of tags to recommend per playlist.
        iteration: Number of ALS iterations.
        score: When ``True`` every row is queried individually and the
            entries also carry ``"songs_score"`` / ``"tags_score"``.

    Returns:
        ``(val_song_res, val_tag_res, test_song_res, test_tag_res)``. Song
        entries carry ``"id"`` and ``"songs"``; tag entries carry ``"id"`` and
        ``"tags"``.
    """
    print(f'MF... iters:{iteration}')

    songs_A = spr.vstack([val_songs_A, test_songs_A, train_songs_A])
    tags_A = spr.vstack([val_tags_A, test_tags_A, train_tags_A])

    als_model = ALS(factors=256, regularization=0.08, use_gpu=True,
                    iterations=iteration)
    als_model.fit(songs_A.T * 100)

    als_model_tag = ALS(factors=32, regularization=0.08, use_gpu=True,
                        iterations=iteration)
    als_model_tag.fit(tags_A.T * 100)

    # Leading (val + test) rows of what each model was fitted on. Querying
    # with these keeps "row of the matrix" equal to "user of the model";
    # querying the test matrix on its own scored test rows with the user
    # factors of rows 0.., which belong to the validation playlists.
    song_eval = spr.vstack([val_songs_A, test_songs_A], format="csr")
    tag_eval = spr.vstack([val_tags_A, test_tags_A], format="csr")
    n_val_song_rows = val_songs_A.shape[0]
    n_val_tag_rows = val_tags_A.shape[0]

    def with_scores(model, eval_matrix, offset, n_rows, N, nid_id, id_index, key):
        # Query `n_rows` users starting at `offset`, one by one, keeping scores.
        # The loop is bounded by the playlist index length instead of relying
        # on a swallowed IndexError.
        out = []
        for row in tqdm(range(min(n_rows, len(id_index)))):
            cand = model.recommend(
                offset + row, eval_matrix, N=N, filter_already_liked_items=True)
            out.append({
                "id": self.plylst_nid_id[id_index[row]],
                key: [nid_id.get(x[0]) for x in cand],
                key + "_score": [x[1] for x in cand]
            })
        return out

    def without_scores(rec_rows, nid_id, id_index, key):
        # Turn rows of recommended internal ids into result dicts.
        return [
            {
                "id": self.plylst_nid_id[id_index[row]],
                key: [nid_id.get(x) for x in rec_idx.tolist()]
            }
            for row, rec_idx in enumerate(rec_rows)
        ]

    if score is True:
        # Each result list is filled from its own matrix under its own key.
        # The old helper ignored its `matrix` / `res` arguments and put every
        # result into the validation song list under "songs".
        val_song_res = with_scores(
            als_model, song_eval, 0, n_val_song_rows, song_ntop,
            self.song_sid_id, self.plylst_val_song.index, "songs")
        val_tag_res = with_scores(
            als_model_tag, tag_eval, 0, n_val_tag_rows, tag_ntop,
            self.tag_tid_id, self.plylst_val_tag.index, "tags")
        test_song_res = with_scores(
            als_model, song_eval, n_val_song_rows, test_songs_A.shape[0],
            song_ntop, self.song_sid_id, self.plylst_test_song.index, "songs")
        test_tag_res = with_scores(
            als_model_tag, tag_eval, n_val_tag_rows, test_tags_A.shape[0],
            tag_ntop, self.tag_tid_id, self.plylst_test_tag.index, "tags")
    else:
        cand_song = als_model.recommend_all(
            song_eval, N=song_ntop, filter_already_liked_items=True)
        cand_tag = als_model_tag.recommend_all(
            tag_eval, N=tag_ntop, filter_already_liked_items=True)

        val_song_res = without_scores(
            cand_song[:n_val_song_rows], self.song_sid_id,
            self.plylst_val_song.index, "songs")
        val_tag_res = without_scores(
            cand_tag[:n_val_tag_rows], self.tag_tid_id,
            self.plylst_val_tag.index, "tags")
        test_song_res = without_scores(
            cand_song[n_val_song_rows:], self.song_sid_id,
            self.plylst_test_song.index, "songs")
        test_tag_res = without_scores(
            cand_tag[n_val_tag_rows:], self.tag_tid_id,
            self.plylst_test_tag.index, "tags")

    print("DONE")
    return val_song_res, val_tag_res, test_song_res, test_tag_res