def train_and_predict(train_filepath, test_filepath):
    """Fit one ALS model on train + test playlists and recommend for the test set.

    Songs and tags share one item vocabulary: item ids below
    ``vocab.num_songs`` are treated as songs and the remaining ids as tags.
    A single ALS model is fitted on the stacked (test rows first, then train
    rows) interaction matrix, and its item factors are split into a song-only
    and a tag-only model that share the same user factors.

    Args:
        train_filepath: Input accepted by ``pd.read_json`` holding the
            training playlists.
        test_filepath: Input accepted by ``pd.read_json`` holding the
            playlists to predict for.

    Returns:
        A list with one dict per entry of ``test_df.id`` with the keys
        ``"id"``, ``"songs"`` (100 recommendations) and ``"tags"``
        (10 recommendations).
    """
    train_df = pd.read_json(train_filepath)
    test_df = pd.read_json(test_filepath)

    vocab = Vocabulary(pd.concat([train_df, test_df], ignore_index=True))
    train_data = encode_features(train_df, vocab)
    test_data = encode_features(test_df, vocab)

    # Shuffle train data
    train_data = shuffle(train_data)

    def lil_to_csr(indices, shape):
        """Build a CSR matrix storing a 1 for every listed (row, column) pair."""
        row_ind = []
        col_ind = []
        for row_idx, row in enumerate(indices):
            col_ind.extend(row)
            # Pad row_ind up to col_ind's length: one copy of the row index
            # for each column index that was just added.
            row_ind.extend([row_idx] * (len(col_ind) - len(row_ind)))
        data = [1] * len(col_ind)
        return csr_matrix((data, (row_ind, col_ind)), shape=shape)

    train_csr = lil_to_csr(train_data, (len(train_data), vocab.size))
    test_csr = lil_to_csr(test_data, (len(test_data), vocab.size))

    # Test rows go first so that row i of test_csr lines up with
    # user_factors[i] when recommend_all is called on test_csr slices below.
    r = scipy.sparse.vstack([test_csr, train_csr], format="csr")

    factors = 512
    alpha = 500.0

    als_model = ALS(factors=factors, regularization=0.1)
    als_model.fit(r.T * alpha)

    # The song and tag models are unfitted shells that reuse the fitted
    # factors, each restricted to its own range of item ids.
    song_model = ALS(factors=factors)
    tag_model = ALS(factors=factors)
    song_model.user_factors = als_model.user_factors
    tag_model.user_factors = als_model.user_factors
    song_model.item_factors = als_model.item_factors[:vocab.num_songs]
    tag_model.item_factors = als_model.item_factors[vocab.num_songs:]

    song_rec_csr = test_csr[:, :vocab.num_songs]
    tag_rec_csr = test_csr[:, vocab.num_songs:]

    song_rec = song_model.recommend_all(song_rec_csr, N=100)
    tag_rec = tag_model.recommend_all(tag_rec_csr, N=10)
    # Tag recommendations are indices into the tag slice; shift them back
    # into the shared vocabulary id space before decoding.
    tag_rec += vocab.num_songs

    return [
        {
            "id": test_playlist_id,
            "songs": list(map(vocab.id_to_song, song_rec[test_row_idx])),
            "tags": list(map(vocab.id_to_tag, tag_rec[test_row_idx])),
        }
        for test_row_idx, test_playlist_id in enumerate(tqdm(test_df.id))
    ]
def _fit_for_tag(self, train, val):
    """Fit an ALS model on the tag preference data and cache top-N results.

    The preference frame returned by ``self._t_data.get_preference`` is turned
    into a user x item CSR matrix, an ALS model with 1800 factors is fitted on
    its transpose scaled by 65, and the recommendations restricted to the
    first ``get_tag_length()`` items are stored in ``self._t_best``.

    Args:
        train: Passed through to ``self._t_data.get_preference``.
        val: Passed through to ``self._t_data.get_preference``.
    """
    prefs = self._t_data.get_preference(train, val)
    n_tags = self._t_data.get_tag_length()

    # user x item csr_matrix
    values = prefs['preference'].astype(float)
    coords = (prefs['user_id'], prefs['item_id'])
    interactions = sparse.csr_matrix((values, coords))

    model = AlternatingLeastSquares(factors=1800)
    model.fit(interactions.T * 65)

    # Configure tag only model: keep only the leading n_tags item factors and
    # the matching leading columns of the interaction matrix.
    # (presumably the tag items occupy the first n_tags item ids — confirm)
    model.item_factors = model.item_factors[:n_tags]
    self._t_best = model.recommend_all(
        interactions[:, :n_tags], N=self._t_topk)
def _fit_for_song(self, train, val):
    """Fit an ALS model on the song preference data and cache top-N results.

    The preference frame returned by ``self._s_data.get_preference`` is turned
    into a user x item CSR matrix, an ALS model with 2500 factors is fitted on
    its transpose scaled by 160, and the recommendations restricted to the
    first ``get_song_length()`` items are stored in ``self._s_best``.

    Args:
        train: Passed through to ``self._s_data.get_preference``.
        val: Passed through to ``self._s_data.get_preference``.
    """
    prefs = self._s_data.get_preference(train, val)
    n_songs = self._s_data.get_song_length()

    # user x item csr_matrix
    values = prefs['preference'].astype(float)
    coords = (prefs['user_id'], prefs['item_id'])
    interactions = sparse.csr_matrix((values, coords))

    model = AlternatingLeastSquares(factors=2500)
    model.fit(interactions.T * 160)

    # Configure song only model: keep only the leading n_songs item factors
    # and the matching leading columns of the interaction matrix.
    # (presumably the song items occupy the first n_songs item ids — confirm)
    model.item_factors = model.item_factors[:n_songs]
    self._s_best = model.recommend_all(
        interactions[:, :n_songs], N=self._s_topk)
def multi_mf_(self, train_songs_A, train_tags_A, test_songs_A, test_tags_A,
              song_ntop, tag_ntop, iteration):
    """Jointly factorize songs and tags with ALS and recommend for test rows.

    Test rows are stacked above the train rows, songs and tags are placed
    side by side as items, and one ALS model is fitted on the result. The
    fitted item factors are then split into a song model and a tag model that
    share the user factors.

    Args:
        train_songs_A: Sparse playlist x song matrix for the train playlists.
        train_tags_A: Sparse playlist x tag matrix for the train playlists.
        test_songs_A: Sparse playlist x song matrix for the test playlists.
        test_tags_A: Sparse playlist x tag matrix for the test playlists.
        song_ntop: Number of songs to recommend per test playlist.
        tag_ntop: Number of tags to recommend per test playlist.
        iteration: Number of ALS iterations.

    Returns:
        A list with one dict per test row holding ``"id"``, ``"songs"`` and
        ``"tags"``.
    """
    print(f'Multi_MF... iters:{iteration}')

    # format='csr' guarantees the stacked matrices can be row-sliced below,
    # whatever sparse format the inputs arrive in.
    songs_A = spr.vstack([test_songs_A, train_songs_A], format='csr')
    tags_A = spr.vstack([test_tags_A, train_tags_A], format='csr')
    A = spr.hstack([songs_A, tags_A])

    als_model = ALS(factors=256, regularization=0.08, use_gpu=True,
                    iterations=iteration)  # epoch
    als_model.fit(A.T * 15)

    # Unfitted shells that reuse the fitted factors; the item factors are
    # split at self.n_songs because the song columns come first in A.
    song_model = ALS(use_gpu=False)
    tag_model = ALS(use_gpu=False)
    song_model.user_factors = als_model.user_factors
    tag_model.user_factors = als_model.user_factors
    song_model.item_factors = als_model.item_factors[:self.n_songs]
    tag_model.item_factors = als_model.item_factors[self.n_songs:]

    # for test: the test playlists are the first self.n_test stacked rows,
    # so row i lines up with user_factors[i].
    song_rec_csr = songs_A[:self.n_test, :]
    tag_rec_csr = tags_A[:self.n_test, :]

    # Honour the requested list lengths (previously hard-coded to 500 / 50
    # regardless of song_ntop / tag_ntop).
    cand_song = song_model.recommend_all(song_rec_csr, N=song_ntop)
    cand_tag = tag_model.recommend_all(tag_rec_csr, N=tag_ntop)

    res = [
        {
            "id": self.plylst_nid_id[self.n_train + row],
            "songs": [self.song_sid_id.get(x) for x in song_idx.tolist()],
            "tags": [self.tag_tid_id.get(x) for x in tag_idx.tolist()],
        }
        for row, (song_idx, tag_idx) in enumerate(zip(cand_song, cand_tag))
    ]

    print("DONE")
    return res
split_matrix(rating_matrix, user2idx, movie2idx)
# NOTE(review): `rating_matrix_train` and `rating_matrix_val` used below are
# not assigned anywhere in this fragment — presumably they are the result of
# the split_matrix call above, with the assignment target outside this view.
# Confirm against the full script.

# Report the number of stored non-zero entries in each split.
print(
    f'Train: {rating_matrix_train.count_nonzero()}\t',
    f'Validation Size: {rating_matrix_val.count_nonzero()}'
)

# Train ALS Model
model = AlternatingLeastSquares(
    factors=20, iterations=50, calculate_training_loss=True, num_threads=4
)
# fit() is given the transpose of the train matrix, while recommend_all()
# below is given the untransposed one (assumes rating_matrix_train is
# user x item — TODO confirm).
model.fit(rating_matrix_train.T)

# Make Prediction: top-100 items for every row of the train matrix.
recommendations = model.recommend_all(
    user_items=rating_matrix_train, N=100
)

# Evaluate the recommendations against the validation split at cutoff 100.
precison_100 = n_precision(recommendations, rating_matrix_val, 100)
recall_100 = n_recall(recommendations, rating_matrix_val, 100)
print(f'P@100 : {precison_100:.2%}')
print(f'R@100 : {recall_100:.2%}')

# Save Recommendation
np.savez('./output/rec_mf.npz', recommendations)
def multi_mf_(self, train_songs_A, train_tags_A, val_songs_A, val_tags_A,
              test_songs_A, test_tags_A, meta=True, song_ntop=500,
              tag_ntop=50, iteration=20, score=False):
    """Jointly factorize songs, tags (and meta) and recommend for val/test.

    Rows are stacked in the order val, test, train. Songs and tags (plus the
    matrix from ``self.mkspr_for_meta()`` when ``meta`` is set) are placed
    side by side as items and one ALS model is fitted on the result. Song and
    tag recommendations are then produced from the matching slices of the
    fitted factors.

    Args:
        train_songs_A, train_tags_A: Sparse playlist x song / playlist x tag
            matrices for the train playlists.
        val_songs_A, val_tags_A: Same for the validation playlists.
        test_songs_A, test_tags_A: Same for the test playlists.
        meta: When truthy, append ``self.mkspr_for_meta()`` as extra item
            columns before fitting.
        song_ntop: Number of songs to recommend per playlist.
        tag_ntop: Number of tags to recommend per playlist.
        iteration: Number of ALS iterations.
        score: When ``True`` no recommendations are produced and both
            returned lists are empty.

    Returns:
        ``(val_res, test_res)``: two lists of dicts with the keys ``"id"``,
        ``"songs"`` and ``"tags"``.
    """
    print(f'Multi_MF... iters:{iteration}')
    val_res = []
    test_res = []

    # format='csr' guarantees the stacked matrices can be row-sliced below,
    # whatever sparse format the inputs arrive in.
    songs_A = spr.vstack([val_songs_A, test_songs_A, train_songs_A],
                         format='csr')
    tags_A = spr.vstack([val_tags_A, test_tags_A, train_tags_A],
                        format='csr')
    print(val_songs_A.shape, test_songs_A.shape, train_songs_A.shape)

    if meta:
        s_meta = self.mkspr_for_meta()
        print(songs_A.shape, tags_A.shape, s_meta.shape)
        A = spr.hstack([songs_A, tags_A, s_meta])
    else:
        A = spr.hstack([songs_A, tags_A])

    als_model = ALS(factors=256, regularization=0.08, use_gpu=True,
                    iterations=iteration)
    als_model.fit(A.T * 100)

    if score is True:
        # Scored output is not implemented for this model; keep returning
        # the empty result lists.
        return val_res, test_res

    # Unfitted shells that reuse slices of the fitted factors.
    song_model = ALS(use_gpu=True)
    tag_model = ALS(use_gpu=True)

    # Item columns of A are laid out as [songs | tags | (meta)]. The tag
    # slice must stop after the tag columns: with meta enabled, an open-ended
    # slice would also pull in the meta factors, which have no counterpart in
    # tags_A's columns.
    tag_stop = self.n_songs + tags_A.shape[1]
    song_model.item_factors = als_model.item_factors[:self.n_songs]
    tag_model.item_factors = als_model.item_factors[self.n_songs:tag_stop]

    def to_records(first_nid, song_rows, tag_rows):
        """Pair up per-row song/tag index arrays and map them to output ids."""
        return [
            {
                "id": self.plylst_nid_id[first_nid + row],
                "songs": [self.song_sid_id.get(x) for x in songs.tolist()],
                "tags": [self.tag_tid_id.get(x) for x in tags.tolist()],
            }
            for row, (songs, tags) in enumerate(zip(song_rows, tag_rows))
        ]

    # recommend_all pairs row i of the matrix it receives with
    # user_factors[i], so every row slice below is matched with the same
    # slice of the fitted user factors.

    # for val: the first self.n_val stacked rows
    val_rows = slice(0, self.n_val)
    song_model.user_factors = als_model.user_factors[val_rows]
    tag_model.user_factors = als_model.user_factors[val_rows]
    cand_song = song_model.recommend_all(songs_A[val_rows, :], N=song_ntop)
    cand_tag = tag_model.recommend_all(tags_A[val_rows, :], N=tag_ntop)
    val_res = to_records(self.n_train, cand_song, cand_tag)

    # for test: the self.n_test rows that follow the validation rows
    test_rows = slice(self.n_val, self.n_val + self.n_test)
    song_model.user_factors = als_model.user_factors[test_rows]
    tag_model.user_factors = als_model.user_factors[test_rows]
    cand_song = song_model.recommend_all(songs_A[test_rows, :], N=song_ntop)
    cand_tag = tag_model.recommend_all(tags_A[test_rows, :], N=tag_ntop)
    test_res = to_records(self.n_train + self.n_val, cand_song, cand_tag)

    return val_res, test_res
def mf_(self, train_songs_A, train_tags_A, val_songs_A, val_tags_A,
        test_songs_A, test_tags_A, song_ntop=500, tag_ntop=50, iteration=20,
        score=False):
    """Fit separate song and tag ALS models and recommend for val/test.

    Rows are stacked in the order val, test, train. One ALS model is fitted
    on the song matrix (256 factors) and one on the tag matrix (32 factors).

    Args:
        train_songs_A, train_tags_A: Sparse playlist x song / playlist x tag
            matrices for the train playlists.
        val_songs_A, val_tags_A: Same for the validation playlists.
        test_songs_A, test_tags_A: Same for the test playlists.
        song_ntop: Number of songs to recommend per playlist.
        tag_ntop: Number of tags to recommend per playlist.
        iteration: Number of ALS iterations.
        score: When ``True``, recommend row by row and also return scores
            (``"songs_score"`` / ``"tags_score"``); otherwise use
            ``recommend_all`` and return ids only.

    Returns:
        ``(val_song_res, val_tag_res, test_song_res, test_tag_res)``, each a
        list of dicts keyed by ``"id"`` plus ``"songs"`` or ``"tags"``.
    """
    print(f'MF... iters:{iteration}')
    val_song_res = []
    val_tag_res = []
    test_song_res = []
    test_tag_res = []

    # format='csr' guarantees the stacked matrices can be row-sliced below.
    songs_A = spr.vstack([val_songs_A, test_songs_A, train_songs_A],
                         format='csr')
    tags_A = spr.vstack([val_tags_A, test_tags_A, train_tags_A],
                        format='csr')

    als_model = ALS(factors=256, regularization=0.08, use_gpu=True,
                    iterations=iteration)  # epoch
    als_model.fit(songs_A.T * 100)

    als_model_tag = ALS(factors=32, regularization=0.08, use_gpu=True,
                        iterations=iteration)
    als_model_tag.fit(tags_A.T * 100)

    # Row layout of the stacked matrices: [val | test | train]. The models
    # index users by stacked row, so test rows start after the val rows.
    n_val_song_rows = val_songs_A.shape[0]
    n_test_song_rows = test_songs_A.shape[0]
    n_val_tag_rows = val_tags_A.shape[0]
    n_test_tag_rows = test_tags_A.shape[0]

    def res_recommend(row, model, stacked, offset, n_rows, N, nid_id,
                      id_index, res, key):
        """Append scored recommendations for ``row`` of one split to ``res``.

        ``offset`` is where the split starts inside ``stacked`` and
        ``n_rows`` is the split's row count; ``key`` names the output field
        (``"songs"`` or ``"tags"``).
        """
        # The caller walks every split with the same row range, so rows a
        # split does not have are skipped instead of spilling into the next
        # split of the stacked matrix.
        if row >= n_rows:
            return
        try:
            cand = model.recommend(offset + row, stacked, N=N,
                                   filter_already_liked_items=True)
            res.append({
                "id": self.plylst_nid_id[id_index[row]],
                key: [nid_id.get(x[0]) for x in cand],
                key + "_score": [x[1] for x in cand],
            })
        except IndexError:
            # Best effort, as before: a row without a matching playlist
            # index entry is left out of the results.
            pass

    if score is True:
        splits = (
            (als_model, songs_A, 0, n_val_song_rows, song_ntop,
             self.song_sid_id, self.plylst_val_song.index,
             val_song_res, "songs"),
            (als_model_tag, tags_A, 0, n_val_tag_rows, tag_ntop,
             self.tag_tid_id, self.plylst_val_tag.index,
             val_tag_res, "tags"),
            (als_model, songs_A, n_val_song_rows, n_test_song_rows,
             song_ntop, self.song_sid_id, self.plylst_test_song.index,
             test_song_res, "songs"),
            (als_model_tag, tags_A, n_val_tag_rows, n_test_tag_rows,
             tag_ntop, self.tag_tid_id, self.plylst_test_tag.index,
             test_tag_res, "tags"),
        )
        # NOTE(review): all four splits are walked over range(n_val_song),
        # as in the original loop; splits with more rows than that are
        # truncated. Confirm whether each split should use its own length.
        for row in tqdm(range(self.n_val_song)):
            for spec in splits:
                res_recommend(row, *spec)
    else:  # Score > False
        # recommend_all pairs row i of its input with user_factors[i], so the
        # val and test rows are recommended together from the head of the
        # stacked matrix and split afterwards; calling it on the test matrix
        # alone would score test playlists with the val playlists' factors.
        song_cand = als_model.recommend_all(
            songs_A[:n_val_song_rows + n_test_song_rows], N=song_ntop,
            filter_already_liked_items=True)
        tag_cand = als_model_tag.recommend_all(
            tags_A[:n_val_tag_rows + n_test_tag_rows], N=tag_ntop,
            filter_already_liked_items=True)

        val_cand_song = song_cand[:n_val_song_rows]
        test_cand_song = song_cand[n_val_song_rows:]
        val_cand_tag = tag_cand[:n_val_tag_rows]
        test_cand_tag = tag_cand[n_val_tag_rows:]

        val_song_res = [{
            "id": self.plylst_nid_id[self.plylst_val_song.index[row]],
            "songs": [self.song_sid_id.get(x) for x in rec_idx.tolist()],
        } for row, rec_idx in enumerate(val_cand_song)]
        val_tag_res = [{
            "id": self.plylst_nid_id[self.plylst_val_tag.index[row]],
            "tags": [self.tag_tid_id.get(x) for x in rec_idx.tolist()],
        } for row, rec_idx in enumerate(val_cand_tag)]
        test_song_res = [{
            "id": self.plylst_nid_id[self.plylst_test_song.index[row]],
            "songs": [self.song_sid_id.get(x) for x in rec_idx.tolist()],
        } for row, rec_idx in enumerate(test_cand_song)]
        test_tag_res = [{
            "id": self.plylst_nid_id[self.plylst_test_tag.index[row]],
            "tags": [self.tag_tid_id.get(x) for x in rec_idx.tolist()],
        } for row, rec_idx in enumerate(test_cand_tag)]

    print("DONE")
    return val_song_res, val_tag_res, test_song_res, test_tag_res