def train_and_predict(train_filepath, test_filepath):
    train_df = pd.read_json(train_filepath)
    test_df = pd.read_json(test_filepath)

    tr_songs = train_df.songs.tolist()
    te_songs = test_df.songs.tolist()
    tr_tags = train_df.tags.tolist()
    te_tags = test_df.tags.tolist()

    vocab = Vocabulary(pd.concat([train_df, test_df], ignore_index=True))
    train_data = encode_features(train_df, vocab)
    test_data = encode_features(test_df, vocab)

    # Shuffle train data
    train_data = shuffle(train_data)

    # list of lists -> CSR
    def lil_to_csr(indices, shape):
        data = []
        row_ind = []
        col_ind = []
        for row_idx, row in enumerate(indices):
            for col_idx in row:
                data.append(1)
                row_ind.append(row_idx)
                col_ind.append(col_idx)
        return csr_matrix((data, (row_ind, col_ind)), shape=shape)

    train_csr = lil_to_csr(train_data, (len(train_data), vocab.size))
    test_csr = lil_to_csr(test_data, (len(test_data), vocab.size))

    # Stack test playlists on top of train playlists, so test rows come first.
    r = scipy.sparse.vstack([test_csr, train_csr])
    r = csr_matrix(r)

    factors = 512
    alpha = 500.0
    als_model = ALS(factors=factors, regularization=0.1)
    als_model.fit(r.T * alpha)

    # Split the jointly trained item factors into song-only and tag-only models
    # that share the same user factors.
    song_model = ALS(factors=factors)
    tag_model = ALS(factors=factors)
    song_model.user_factors = als_model.user_factors
    tag_model.user_factors = als_model.user_factors
    song_model.item_factors = als_model.item_factors[:vocab.num_songs]
    tag_model.item_factors = als_model.item_factors[vocab.num_songs:]

    song_rec_csr = test_csr[:, :vocab.num_songs]
    tag_rec_csr = test_csr[:, vocab.num_songs:]

    song_rec = song_model.recommend_all(song_rec_csr, N=100)
    tag_rec = tag_model.recommend_all(tag_rec_csr, N=10)
    tag_rec += vocab.num_songs  # shift tag indices back into the combined vocabulary id space

    return [{
        "id": test_playlist_id,
        "songs": list(map(vocab.id_to_song, song_rec[test_row_idx])),
        "tags": list(map(vocab.id_to_tag, tag_rec[test_row_idx])),
    } for test_row_idx, test_playlist_id in enumerate(tqdm(test_df.id))]
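# Hedged usage sketch (not from the original source): the file paths and the
# "results.json" output name are hypothetical. It relies only on the fact that
# train_and_predict returns a list of {"id", "songs", "tags"} dicts, which can
# be dumped straight to JSON.
import json

predictions = train_and_predict("train.json", "test.json")
with open("results.json", "w", encoding="utf-8") as fout:
    json.dump(predictions, fout, ensure_ascii=False)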
def load_recommender(als_model_file: str,
                     index_file: str,
                     item_feature_file: str = None,
                     **kwargs) -> ImplicitRecommender:
    log.info("Loading als model")
    data = np.load(als_model_file, allow_pickle=True)
    model = AlternatingLeastSquares(
        factors=data['model.item_factors'].shape[1])
    model.item_factors = data['model.item_factors']
    model.YtY  # This will initialize the _YtY instance variable which is used directly in internal methods
    if 'model.user_factors' in data:
        model.user_factors = data['model.user_factors']

    user_labels = data['user_labels']
    item_labels = data['item_labels']

    if index_file is None:
        return ImplicitRecommender(model, user_labels, item_labels)
    elif index_file.endswith('.ann'):
        import annoy
        log.info("Loading annoy recommendation index")
        max_norm, extra = augment_inner_product_matrix(model.item_factors)
        recommend_index = annoy.AnnoyIndex(extra.shape[1], 'angular')
        recommend_index.load(index_file)  # prefault=load_to_memory does not seem to work
        if item_feature_file is None:
            from .annoy import ImplicitAnnoyRecommender
            return ImplicitAnnoyRecommender(model, recommend_index, max_norm,
                                            user_labels, item_labels)
        else:
            log.info("Loading item features for recommendation")
            item_feature_data = pickle.load(open(item_feature_file, "rb"))
            tag_tfidf_transformer = item_feature_data['tag_tfidf_transformer']
            tag_lookup = item_feature_data['tag_lookup']
            item_embedding_weight = item_feature_data['item_embedding_weight']
            from .annoy_item_features import ImplicitAnnoyItemFeatureRecommender
            return ImplicitAnnoyItemFeatureRecommender(
                model, recommend_index, max_norm, user_labels, item_labels,
                tag_tfidf_transformer, tag_lookup, item_embedding_weight)
    elif index_file.endswith('.hnsw'):
        import hnswlib
        from .hnsw import ImplicitHNSWRecommender
        log.info("Loading hnsw recommendation index")
        # We build the index in l2 space and load it in inner product space on purpose.
        # This space change gives us 0.96 recall.
        l2_recommend_index = hnswlib.Index(space='ip',
                                           dim=model.item_factors.shape[1])
        l2_recommend_index.load_index(index_file)
        l2_recommend_index.set_ef(kwargs.get('ef', 2000))
        return ImplicitHNSWRecommender(model, l2_recommend_index,
                                       user_labels, item_labels)
    else:
        raise RecommenderException("Unsupported file type: " + index_file)
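# Hedged usage sketch (file names are hypothetical): load_recommender picks the
# index backend from the file extension, so the same call works with no index
# (index_file=None), an Annoy index (.ann), or an HNSW index (.hnsw); the
# optional ef kwarg is only read on the .hnsw path.
annoy_recommender = load_recommender("als_model.npz", "items.ann")
hnsw_recommender = load_recommender("als_model.npz", "items.hnsw", ef=1000)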
def read_from_file(file_name: str):
    """ Reads a model from a '.npz' file """
    data = np.load(file_name)
    model = AlternatingLeastSquares(
        factors=data['model.item_factors'].shape[1])
    model.item_factors = data['model.item_factors']
    model.YtY  # This will initialize the _YtY instance variable which is used directly in internal methods
    if 'model.user_factors' in data:
        model.user_factors = data['model.user_factors']
    return Recommender(model=model,
                       user_labels=data['user_labels'],
                       item_labels=data['item_labels'])
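# Hedged counterpart sketch: read_from_file expects an .npz archive containing
# the keys accessed above ('model.item_factors', optionally 'model.user_factors',
# 'user_labels', 'item_labels'). A trained model could be saved in that layout
# roughly as follows; the write_to_file name is hypothetical.
import numpy as np

def write_to_file(file_name: str, model, user_labels, item_labels):
    np.savez(file_name, **{
        'model.item_factors': model.item_factors,
        'model.user_factors': model.user_factors,
        'user_labels': user_labels,
        'item_labels': item_labels,
    })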
def _split_collective_model(self, csr_matrix):
    split_models = []
    split_csr_matrices = []
    prev_idx = 0
    for num_entity in self.num_entities:
        # Each entity type reuses the shared user factors but only its own
        # slice of the jointly trained item factors.
        model = ALS(factors=self.num_factors)
        model.user_factors = self.model.user_factors
        model.item_factors = self.model.item_factors[prev_idx:prev_idx + num_entity]
        split_models.append(model)

        entity_csr_matrix = csr_matrix[:, prev_idx:prev_idx + num_entity]
        split_csr_matrices.append(entity_csr_matrix)

        prev_idx = prev_idx + num_entity
    return split_models, split_csr_matrices
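# Hedged sketch of how the split models might be consumed (the method name and
# the N=100 cut-off are assumptions, not part of the original class); each
# entity type is scored only against its own slice of the item space.
def _recommend_per_entity(self, csr_matrix, topk=100):
    split_models, split_csr_matrices = self._split_collective_model(csr_matrix)
    return [model.recommend_all(entity_csr, N=topk)
            for model, entity_csr in zip(split_models, split_csr_matrices)]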
def _fit_for_tag(self, train, val):
    df = self._t_data.get_preference(train, val)
    t_len = self._t_data.get_tag_length()

    # user x item csr_matrix
    user_item_csr = sparse.csr_matrix(
        (df['preference'].astype(float), (df['user_id'], df['item_id'])))

    t_model = AlternatingLeastSquares(factors=1800)
    t_model.fit(user_item_csr.T * 65)

    # Configure tag only model
    t_model.item_factors = t_model.item_factors[:t_len]
    user_tags_csr = user_item_csr[:, :t_len]
    self._t_best = t_model.recommend_all(user_tags_csr, N=self._t_topk)
def _fit_for_song(self, train, val):
    df = self._s_data.get_preference(train, val)
    s_len = self._s_data.get_song_length()

    # user x item csr_matrix
    user_item_csr = sparse.csr_matrix(
        (df['preference'].astype(float), (df['user_id'], df['item_id'])))

    s_model = AlternatingLeastSquares(factors=2500)
    s_model.fit(user_item_csr.T * 160)

    # Configure song only model
    s_model.item_factors = s_model.item_factors[:s_len]
    user_song_csr = user_item_csr[:, :s_len]
    self._s_best = s_model.recommend_all(user_song_csr, N=self._s_topk)
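# _fit_for_tag and _fit_for_song above differ only in data source, factor
# count, confidence weight, and slice length. A hedged consolidation sketch;
# the _fit_entity name and its parameter list are hypothetical:
def _fit_entity(self, data, train, val, factors, alpha, entity_len, topk):
    df = data.get_preference(train, val)
    # user x item csr_matrix
    user_item_csr = sparse.csr_matrix(
        (df['preference'].astype(float), (df['user_id'], df['item_id'])))
    model = AlternatingLeastSquares(factors=factors)
    model.fit(user_item_csr.T * alpha)
    # Keep only the factors of the target entity (tags or songs).
    model.item_factors = model.item_factors[:entity_len]
    return model.recommend_all(user_item_csr[:, :entity_len], N=topk)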
def multi_mf_(self, train_songs_A, train_tags_A, test_songs_A, test_tags_A,
              song_ntop, tag_ntop, iteration):
    print(f'Multi_MF... iters:{iteration}')
    # res = []

    songs_A = spr.vstack([test_songs_A, train_songs_A])
    tags_A = spr.vstack([test_tags_A, train_tags_A])
    A = spr.hstack([songs_A, tags_A])

    als_model = ALS(factors=256, regularization=0.08,
                    use_gpu=True, iterations=iteration)  # epoch
    als_model.fit(A.T * 15)

    song_model = ALS(use_gpu=False)
    tag_model = ALS(use_gpu=False)
    song_model.user_factors = als_model.user_factors
    tag_model.user_factors = als_model.user_factors
    song_model.item_factors = als_model.item_factors[:self.n_songs]
    tag_model.item_factors = als_model.item_factors[self.n_songs:]

    # for test
    song_rec_csr = songs_A[:self.n_test, :]
    tag_rec_csr = tags_A[:self.n_test, :]

    cand_song = song_model.recommend_all(song_rec_csr, N=500)
    cand_tag = tag_model.recommend_all(tag_rec_csr, N=50)

    res = [{
        "id": self.plylst_nid_id[self.n_train + id],
        "songs": [self.song_sid_id.get(x) for x in rec_idx[0].tolist()],
        "tags": [self.tag_tid_id.get(x) for x in rec_idx[1].tolist()]
    } for id, rec_idx in enumerate(zip(cand_song, cand_tag))]

    # rec_song = als_model.recommend_all(train_songs_A, N=500)
    # rec_tag = als_model_tag.recommend_all(train_tags_A, N=50)  # list (no score)
    '''
    for id in tqdm(range(self.n_test)):
        # song
        cand_song = song_model.recommend(id, song_rec_csr, N=song_ntop,
                                         filter_already_liked_items=True)
        rec_song_idx = [self.song_sid_id.get(x[0]) for x in cand_song]
        rec_song_score = [x[1] for x in cand_song]
        # tag
        cand_tag = tag_model.recommend(id, tag_rec_csr, N=tag_ntop,
                                       filter_already_liked_items=True)
        rec_tag_idx = [self.tag_tid_id.get(x[0]) for x in cand_tag]
        rec_tag_score = [x[1] for x in cand_tag]

        res.append({
            "id": self.plylst_nid_id[self.n_train + id],
            "songs": rec_song_idx,
            "tags": rec_tag_idx,
            "songs_score": rec_song_score,
            "tags_score": rec_tag_score
        })
    '''

    print("DONE")
    return res
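# Hedged toy check of the stacking used above (shapes are made up): test rows
# are stacked first, so rows [0, n_test) of the combined user x (songs + tags)
# matrix are exactly the rows that recommend_all is queried with.
import numpy as np
import scipy.sparse as spr

test_songs = spr.csr_matrix(np.eye(2, 5))       # 2 test users, 5 songs
train_songs = spr.csr_matrix(np.ones((3, 5)))   # 3 train users
test_tags = spr.csr_matrix(np.eye(2, 4))        # 4 tags
train_tags = spr.csr_matrix(np.ones((3, 4)))

songs_A = spr.vstack([test_songs, train_songs]).tocsr()
tags_A = spr.vstack([test_tags, train_tags]).tocsr()
A = spr.hstack([songs_A, tags_A])
assert A.shape == (5, 9)  # (n_test + n_train, n_songs + n_tags)
assert (songs_A[:2].toarray() == test_songs.toarray()).all()  # test rows come first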
# Kick off retraining on the 'trainer' queue with the buffered ratings,
# append them to the ratings file, then reset the buffer.
train_task = train.apply_async((queue, ), queue='trainer')
with open('recommender/ratings.csv', 'a') as f:
    writer = csv.writer(f)
    for row in queue:
        writer.writerow(row)
queue = []


if __name__ == '__main__':
    import numpy as np
    import pandas as pd
    import csv
    from scipy.sparse import coo_matrix
    from implicit.als import AlternatingLeastSquares

    user_offset = 69878
    ratings_df = pd.read_csv('recommender/ratings.csv')
    # item x user matrix; transposed below to get user x item
    ratings = coo_matrix((ratings_df['rating'],
                          (ratings_df['movie_id'], ratings_df['user_id'])))

    model = AlternatingLeastSquares(factors=32, regularization=0.01,
                                    dtype=np.float32, iterations=15,
                                    calculate_training_loss=True)
    model.user_factors = np.load('recommender/user_factors.npy')
    model.item_factors = np.load('recommender/item_factors.npy')

    user_items = ratings.T.tocsr()
    queue = []
    train_task = None
    app.worker_main()
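# Hedged serving sketch built on the factors loaded above: with the classic
# implicit 0.4.x call style used throughout this collection, recommend(userid,
# user_items) returns (movie_id, score) pairs. The function name, user id, and
# N below are made up; model and user_items refer to the objects created in
# the __main__ block.
def recommend_movies(user_id, n=10):
    return model.recommend(user_id, user_items, N=n)

# e.g. recommend_movies(42) -> top-10 (movie_id, score) pairs for user 42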
def multi_mf_(self, train_songs_A, train_tags_A, val_songs_A, val_tags_A,
              test_songs_A, test_tags_A, meta=True, song_ntop=500, tag_ntop=50,
              iteration=20, score=False):
    print(f'Multi_MF... iters:{iteration}')
    val_res = []
    test_res = []

    songs_A = spr.vstack([val_songs_A, test_songs_A, train_songs_A])
    tags_A = spr.vstack([val_tags_A, test_tags_A, train_tags_A])
    print(val_songs_A.shape, test_songs_A.shape, train_songs_A.shape)

    if meta == True:
        s_meta = self.mkspr_for_meta()
        print(songs_A.shape, tags_A.shape, s_meta.shape)
        A = spr.hstack([songs_A, tags_A, s_meta])
    else:
        A = spr.hstack([songs_A, tags_A])

    als_model = ALS(factors=256, regularization=0.08,
                    use_gpu=True, iterations=iteration)
    als_model.fit(A.T * 100)

    song_model = ALS(use_gpu=True)
    tag_model = ALS(use_gpu=True)
    song_model.user_factors = als_model.user_factors
    tag_model.user_factors = als_model.user_factors
    song_model.item_factors = als_model.item_factors[:self.n_songs]
    tag_model.item_factors = als_model.item_factors[self.n_songs:]

    # for val
    val_song_rec_csr = songs_A[:self.n_val, :]
    val_tag_rec_csr = tags_A[:self.n_val, :]
    # for test
    test_song_rec_csr = songs_A[self.n_val:self.n_val + self.n_test, :]
    test_tag_rec_csr = tags_A[self.n_val:self.n_val + self.n_test, :]

    if score is True:
        pass
    else:
        # val
        cand_song = song_model.recommend_all(val_song_rec_csr, N=song_ntop)
        cand_tag = tag_model.recommend_all(val_tag_rec_csr, N=tag_ntop)
        val_res = [{
            "id": self.plylst_nid_id[self.n_train + id],
            "songs": [self.song_sid_id.get(x) for x in rec_idx[0].tolist()],
            "tags": [self.tag_tid_id.get(x) for x in rec_idx[1].tolist()]
        } for id, rec_idx in enumerate(zip(cand_song, cand_tag))]

        # test
        cand_song = song_model.recommend_all(test_song_rec_csr, N=song_ntop)
        cand_tag = tag_model.recommend_all(test_tag_rec_csr, N=tag_ntop)
        test_res = [{
            "id": self.plylst_nid_id[self.n_train + self.n_val + id],
            "songs": [self.song_sid_id.get(x) for x in rec_idx[0].tolist()],
            "tags": [self.tag_tid_id.get(x) for x in rec_idx[1].tolist()]
        } for id, rec_idx in enumerate(zip(cand_song, cand_tag))]

    return val_res, test_res