def validate(self, user_ids=None, log_path=None, normalize_similarity=(False,),
             damp_coeff=(1,), add_zeros_quota=(1,), loss_tolerance=(1e-6,),
             iteration_limit=(30,), use_incremental=(False,)):
    """
    Grid-search validation over the cartesian product of the hyper-parameter
    candidate lists: fit the model on the train URM for each combination and
    evaluate the recommendations on the test URM.

    Parameters
    ----------
    user_ids : list or None
        Target users to recommend for. Defaults to d.get_target_playlists(),
        fetched at call time (the old default ran the query at import time).
    log_path : str or None
        Folder where a timestamped log file is written; stdout is redirected
        there for the whole run. None keeps logging on stdout.
    normalize_similarity, damp_coeff, add_zeros_quota, loss_tolerance,
    iteration_limit, use_incremental : iterable
        Candidate values for each hyper-parameter (tuples instead of the old
        mutable list defaults).
    """
    if user_ids is None:
        user_ids = d.get_target_playlists()

    orig_stdout = None
    f = None
    if log_path is not None:
        orig_stdout = sys.stdout
        f = open(log_path + '/' + self.name + ' ' + time.strftime('_%H-%M-%S') +
                 ' ' + time.strftime('%d-%m-%Y') + '.txt', 'w')
        sys.stdout = f
    try:
        for ns in normalize_similarity:
            for dc in damp_coeff:
                for adq in add_zeros_quota:
                    for lt in loss_tolerance:
                        for il in iteration_limit:
                            for ui in use_incremental:
                                # BUGFIX: add_zeros_quota was printed as dc
                                # (damp_coeff) instead of adq
                                print(self._print(normalize_similarity=ns,
                                                  add_zeros_quota=adq,
                                                  loss_tolerance=lt,
                                                  iteration_limit=il,
                                                  damp_coeff=dc,
                                                  use_incremental=ui))
                                self.fit(ICM=d.get_icm(),
                                         URM_train=d.get_urm_train_1(),
                                         normalize_similarity=ns,
                                         add_zeros_quota=adq,
                                         loss_tolerance=lt,
                                         iteration_limit=il,
                                         damp_coeff=dc,
                                         use_incremental=ui)
                                recs = self.recommend_batch(
                                    user_ids, urm=d.get_urm_train_1())
                                r.evaluate(recs, d.get_urm_test_1())
    finally:
        # restore stdout and close the log file even if fit/evaluate raises
        if log_path is not None:
            sys.stdout = orig_stdout
            f.close()
def __init__(self, h):
    """
    Build the sequential recommender.

    h: (int) length of the sequences
    """
    super().__init__()
    self.name = 'sequential'
    self.h = h

    # build the sequences dataset once and cache it on the instance
    self.sequences, self.target_indices = ps.get_sequences(h=h)

    # only the first N_SEQUENTIAL target playlists are handled sequentially
    sequential_targets = data.get_target_playlists()[0:data.N_SEQUENTIAL]
    self.target_ids = np.array(sequential_targets)

    # indices of the (user, item) interactions already in the train URM,
    # used later to filter already-liked items out of the recommendations
    self.already_liked_indices = (data.get_urm_train_1()[sequential_targets]).nonzero()

    self.H = seqsim.getH(self.sequences)
def run(self, urm_train=None, urm=None, urm_test=None, targetids=None,
        factors=100, regularization=0.01, iterations=100, alpha=25,
        with_scores=False, export=True, verbose=True):
    """
    Train the model, evaluate it and optionally export the results to a file.

    Parameters
    ----------
    urm_train : csr matrix, URM used to fit the model.
        If None, data.get_urm_train_1() is used.
    urm : csr matrix, unused; kept for backward compatibility with callers.
    urm_test : csr matrix, urm where to test the model.
        If None, data.get_urm_test_1() is used.
    targetids : list, target user ids.
        If None, data.get_target_playlists() is used.
    factors : int, number of latent factors
    regularization, iterations, alpha : hyper-parameters forwarded to fit()
    with_scores : bool, include scores in the recommendations
    export : bool, write a submission CSV when True
    verbose : bool, print progress and timing

    Returns
    -------
    recs: (list) recommendations
    map10: (float) MAP10 for the provided recommendations (None if no recs)
    """
    start = time.time()

    # Resolve defaults lazily. The previous version eagerly loaded the train
    # URM, the test URM, the target ids AND an ICM that was never used, even
    # when the caller supplied every argument.
    urm_train = data.get_urm_train_1() if urm_train is None else urm_train
    urm_test = data.get_urm_test_1() if urm_test is None else urm_test
    targetids = data.get_target_playlists() if targetids is None else targetids

    self.fit(urm=urm_train, factors=factors, regularization=regularization,
             iterations=iterations, alpha=alpha)
    recs = self.recommend_batch(userids=targetids, with_scores=with_scores,
                                verbose=verbose)

    map10 = None
    if len(recs) > 0:
        map10 = self.evaluate(recs, test_urm=urm_test, verbose=verbose)
    else:
        log.warning('No recommendations available, skip evaluation')

    if export:
        exportcsv(recs, path='submission', name=self.name, verbose=verbose)

    if verbose:
        log.info('Run in: {:.2f}s'.format(time.time()-start))

    return recs, map10
def run(self, distance, ucm_train=None, urm=None, urm_test=None, targetids=None,
        k=100, shrink=10, threshold=0, implicit=True, alpha=None, beta=None,
        l=None, c=None, with_scores=False, export=True, verbose=True):
    """
    Fit the model, compute and evaluate the recommendations and optionally
    export them to a submission file.

    Parameters
    ----------
    distance : str, distance metric used to build the similarity
    ucm_train : UCM to fit on; defaults to data.get_ucm_train()
    urm : URM used at recommendation time; defaults to data.get_urm_train_1()
    urm_test : URM used for evaluation; defaults to data.get_urm_test_1()
    targetids : list, target user ids; defaults to data.get_target_playlists()
    k : int, K nearest neighbour to consider
    shrink : float, shrink term used in the normalization
    threshold : float, all the values under this value are cutted from the
        final result
    implicit : bool, if true, treat the URM as implicit, otherwise consider
        explicit ratings (real values) in the URM
    alpha, beta, l, c : distance-specific coefficients
    with_scores, export, verbose : bool flags

    Returns
    -------
    recs: (list) recommendations
    map10: (float) MAP10 for the provided recommendations (None when empty)
    """
    start = time.time()

    # default data sources, loaded up-front
    default_ucm = data.get_ucm_train()
    default_urm = data.get_urm_train_1()
    default_urm_test = data.get_urm_test_1()
    default_targets = data.get_target_playlists()

    if ucm_train is None:
        ucm_train = default_ucm
    if urm is None:
        urm = default_urm
    if urm_test is None:
        urm_test = default_urm_test
    if targetids is None:
        targetids = default_targets

    self.fit(ucm_train, k=k, distance=distance, alpha=alpha, beta=beta,
             c=c, l=l, shrink=shrink, threshold=threshold, implicit=implicit)
    recs = self.recommend_batch(targetids, urm=urm, with_scores=with_scores,
                                verbose=verbose)

    map10 = None
    if len(recs) > 0:
        map10 = self.evaluate(recs, test_urm=urm_test, verbose=verbose)
    else:
        log.warning('No recommendations available, skip evaluation')

    if export:
        exportcsv(recs, path='submission',
                  name='{}_{}'.format(self.name, distance), verbose=verbose)

    if verbose:
        log.info('Run in: {:.2f}s'.format(time.time()-start))

    return recs, map10
def run(self, normalize_similarity=False, add_zeros_quota=1, loss_tolerance=1e-6,
        iteration_limit=30, damp_coeff=1, use_incremental=False,
        export_results=True, export_r_hat=False, export_for_validation=False):
    """
    Fit the model and either export the recommendations as a submission CSV
    (export_results) or save the estimated URM to disk (export_r_hat).

    export_for_validation selects the train split instead of the full URM
    when saving R^ for offline evaluation.
    """
    # train on the split URM only when the estimated URM is meant for validation
    urm = d.get_urm_train_1() if (export_r_hat and export_for_validation) else d.get_urm()

    # hyper-parameters are reused both by fit() and by the export file name
    hyperparams = dict(normalize_similarity=normalize_similarity,
                       add_zeros_quota=add_zeros_quota,
                       loss_tolerance=loss_tolerance,
                       iteration_limit=iteration_limit,
                       damp_coeff=damp_coeff,
                       use_incremental=use_incremental)

    self.fit(ICM=d.get_icm(), URM_train=urm, **hyperparams)

    if export_results:
        print('exporting results')
        recs = self.recommend_batch(d.get_target_playlists(), N=10, urm=urm,
                                    filter_already_liked=True, with_scores=False,
                                    items_to_exclude=[], verbose=False)
        importexport.exportcsv(recs, 'submission', self._print(**hyperparams))
    elif export_r_hat:
        print('saving estimated urm')
        self.save_r_hat(export_for_validation)
def fit(self, ICM=None, URM_train=None, normalize_similarity=True,
        add_zeros_quota=0.1, loss_tolerance=0.0001, iteration_limit=100,
        damp_coeff=0.1, use_incremental=True):
    """
    Fit the feature weights by solving a damped least-squares problem (LSQR)
    over the common features of the sampled item pairs.

    Parameters
    ----------
    ICM : sparse matrix, item content matrix.
        Defaults to data.get_icm(), fetched at call time.
    URM_train : sparse matrix, user rating matrix used for training.
        Defaults to data.get_urm_train_1(), fetched at call time.
    normalize_similarity : bool, normalize the resulting similarity matrix
    add_zeros_quota : float, quota of zero samples added to the train data
    loss_tolerance : float, atol/btol stopping tolerance for LSQR
    iteration_limit : int, maximum number of LSQR iterations
    damp_coeff : float, LSQR damping (regularization) coefficient
    use_incremental : bool, use the incremental D vector instead of the best one
    """
    # BUGFIX: the ICM/URM defaults were previously evaluated once at function
    # definition time (module import), loading stale data and doing heavy work
    # even when the caller passed explicit matrices.
    if ICM is None:
        ICM = data.get_icm()
    if URM_train is None:
        URM_train = data.get_urm_train_1()

    self.URM_train = URM_train
    self.ICM = ICM
    self.n_items = self.URM_train.shape[1]
    self.n_users = self.URM_train.shape[0]
    self.n_features = self.ICM.shape[1]
    self.normalize_similarity = normalize_similarity
    self.add_zeros_quota = add_zeros_quota
    self.use_incremental = use_incremental

    # builds self.row_list / self.col_list / self.data_list
    self._generateTrainData_low_ram()

    # element-wise product of the feature vectors of each sampled item pair
    common_features = self.ICM[self.row_list].multiply(
        self.ICM[self.col_list])

    linalg_result = linalg.lsqr(common_features, self.data_list, show=False,
                                atol=loss_tolerance, btol=loss_tolerance,
                                iter_lim=iteration_limit, damp=damp_coeff)

    self.D_incremental = linalg_result[0].copy()
    self.D_best = linalg_result[0].copy()
    self.epochs_best = 0
    # lsqr returns the residual norm at index 3
    self.loss = linalg_result[3]

    self._compute_W_sparse()
# NOTE(review): this chunk starts inside the inner grid-search loop of a
# validate() function; the enclosing definition is above, outside this view.
                            userids=userids, N=N,
                            filter_already_liked=filter_already_liked,
                            items_to_exclude=items_to_exclude)

                        # evaluate the model with map10
                        map10 = recommender.evaluate(recommendations,
                                                     test_urm=urm_test)
                        if verbose:
                            print('map@10: {}'.format(map10))
                        # write on external files on folder models_validation
                        if write_on_file:
                            out.write(
                                '\n\nl1_ratio: {}\n alpha: {}\n Iterations: {}\n '
                                'topK: {}\n evaluation map@10: {}'.format(
                                    l, a, m, k, map10))


"""
If this file is executed, test the SPLUS distance metric
"""
if __name__ == '__main__':
    # smoke-test: fit a 1-iteration SLIM ElasticNet on the train split and
    # evaluate its recommendations on the matching test split
    rec = SLIMElasticNetRecommender()
    rec.fit(urm=data.get_urm_train_1(), max_iter=1, topK=400, alpha=1e-4,
            l1_ratio=0.5)
    recs = rec.recommend_batch(userids=data.get_target_playlists())
    rec.evaluate(recommendations=recs, test_urm=data.get_urm_test_1())
If this file is executed, test the P3alpha recommender
"""
if __name__ == '__main__':
    # interactive menu for the P3alpha recommender
    print()
    log.success('++ What do you want to do? ++')
    log.warning('(t) Test the model with some default params')
    log.warning('(r) Save the R^')
    log.warning('(s) Save the similarity matrix')
    log.warning('(v) Validate the model')
    log.warning('(e) Export the submission')
    log.warning('(x) Exit')
    arg = input()[0]
    print()

    if arg == 't':
        # quick evaluation on the first train/test split with default params
        model = P3alphaRecommender(data.get_urm_train_1())
        model.fit(topK=900, alpha=1.2, min_rating=0, implicit=True,
                  normalize_similarity=False)
        recs = model.recommend_batch(data.get_target_playlists())
        evaluate(recs, test_urm=data.get_urm_test_1())
    elif arg == 'r':
        log.info('Wanna save for evaluation (y/n)?')
        if input()[0] == 'y':
            # NOTE(review): the 'evaluation' branch trains on the FULL urm and
            # the other branch on the train split — looks inverted; confirm
            # against the other recommenders' save-R^ menus
            model = P3alphaRecommender(data.get_urm())
            path = 'raw_data/saved_r_hat_evaluation/'
        else:
            model = P3alphaRecommender(data.get_urm_train_1())
            path = 'raw_data/saved_r_hat/'
        # NOTE(review): the menu continues past this chunk — the 's', 'v',
        # 'e' and 'x' branches are outside this view
import numpy as np
from scipy.sparse import load_npz
import implicit
import pandas as pd
import data.data as data
import scipy.sparse as sps

# load data
targetUsersIds = data.get_target_playlists()

# get item_user matrix by transposing the URM matrix
URM = data.get_urm_train_1()
item_user_data = URM.transpose()
print('> data loaded')

# initialize a model (BM25 metric)
model = implicit.nearest_neighbours.BM25Recommender(K=400, K1=1.5, B=0.3)

# train the model on a sparse matrix of item/user/confidence weights
model.fit(item_user_data)

# estimated ratings for the target users: URM rows times the learned
# item-item similarity, saved for later hybrid/evaluation use
r_hat = np.dot(URM[targetUsersIds], model.similarity)
sps.save_npz('raw_data/saved_r_hat_evaluation/BM25', r_hat)

# NOTE(review): the triple-quoted block below comments out legacy export
# code; its closing quotes are beyond this view
"""
# build recommendations array
recommendations = bridge.array_of_recommendations(model, target_user_ids=targetUsersIds, urm=URM)
Export.export(np.array(recommendations), path='../submissions/', name='BM25 K {} K1 {} B{}'.format(K, K1, B))
print('file exported')
# NOTE(review): chunk starts inside a MAP@k evaluation helper; the function
# signature (recommendations, test_urm, at_k, verbose) is above this view.
        log.error('Invalid value of k {}'.format(at_k))
        return

    aps = 0.0
    for r in recommendations:
        # ground-truth item ids for this user (r[0] is the user id)
        row = test_urm.getrow(r[0]).indices
        m = min(at_k, len(row))
        ap = 0.0
        n_elems_found = 0.0
        for j in range(1, m + 1):
            # r[1:] holds the ranked recommended item ids
            if r[j] in row:
                n_elems_found += 1
                ap = ap + n_elems_found / j
        if m > 0:
            ap = ap / m
            aps = aps + ap
    # mean of the per-user average precisions
    result = aps / len(recommendations)
    if verbose:
        log.warning('MAP: {}'.format(result))
    return result


# script: fit IALS on the train split and save the estimated ratings of the
# target playlists (X_user . Y_item^T) as a sparse matrix for later reuse
rec = IALS_numpy()
rec.fit(R=data.get_urm_train_1())
r_hat = sps.csr_matrix(np.dot(rec.X[data.get_target_playlists()], rec.Y.T))
sps.save_npz('raw_data/saved_r_hat_evaluation/IALS', r_hat)
#recs = rec.recommend_batch(userids=data.get_target_playlists())
#rec.evaluate(recs, data.get_urm_test())
# NOTE(review): chunk starts inside a recommend() method; userid, scores, n,
# filter_already_liked, with_scores and items_to_exclude are bound above.
        if filter_already_liked:
            scores = self._filter_seen_on_scores(userid, scores)
        if len(items_to_exclude) > 0:
            raise NotImplementedError(
                'Items to exclude functionality is not implemented yet')

        # take the top-n scores without a full sort, then order just those n
        relevant_items_partition = (-scores).argpartition(n)[0:n]
        relevant_items_partition_sorting = np.argsort(
            -scores[relevant_items_partition])
        ranking = relevant_items_partition[relevant_items_partition_sorting]

        if with_scores:
            # [userid, [(item, score), ...]]
            best_scores = scores[ranking]
            return [userid] + [list(zip(list(ranking), list(best_scores)))]
        else:
            # [userid, item1, item2, ...]
            return [userid] + list(ranking)


import data.data as d

# script: train MF-BPR on the first train split
r = MFBPR()
r.fit(d.get_urm_train_1(), epochs=300, n_factors=300, learning_rate=1e-1,
      user_regularization=1e-3, positive_item_regularization=1e-3,
      negative_item_regularization=1e-3, evaluate_every=1)
def create_ucm_from_urm(urm_train):
    """
    Build a User Content Matrix by summing, for each playlist, the ICM rows
    of the tracks it contains, then save it in a randomly named folder under
    raw_data/.

    Parameters
    ----------
    urm_train : sparse matrix
        URM used to look up the tracks of each playlist (one row per playlist).
    """
    path = "raw_data/ucm" + str(randint(1, 100))
    print('starting dataset creation of UCM in ' + path)

    # maybe can be better a dense array?
    ICM = csr_matrix(create_icm(d.get_tracks_df(), []))
    # LIL is efficient for the incremental row updates below.
    # BUGFIX: np.int was removed in NumPy >= 1.24; use the builtin int dtype.
    UCM = lil_matrix((d.N_PLAYLISTS, ICM.shape[1]), dtype=int)
    for p in range(d.N_PLAYLISTS):
        track_indices = urm_train[p].nonzero()[1]
        for track_id in track_indices:
            UCM[p] += ICM.getrow(track_id)
        log.progressbar(p, d.N_PLAYLISTS)

    # save matrices
    os.mkdir(path)
    # BUGFIX: scipy.sparse.save_npz does not support the LIL format
    # (it raises NotImplementedError); convert to CSR before saving.
    save_npz(path + '/ucm', UCM.tocsr())


if __name__ == "__main__":
    urm = d.get_urm_train_1()
    create_ucm_from_urm(urm_train=urm)
# NOTE(review): chunk starts inside the MAP@k loop of an evaluate helper;
# aps, at_k, test_urm and verbose are defined above this view.
    for r in recommendations:
        # ground-truth item ids for this user (r[0] is the user id)
        row = test_urm.getrow(r[0]).indices
        m = min(at_k, len(row))
        ap = 0.0
        n_elems_found = 0.0
        for j in range(1, m + 1):
            # r[1:] holds the ranked recommended item ids
            if r[j] in row:
                n_elems_found += 1
                ap = ap + n_elems_found / j
        if m > 0:
            ap = ap / m
            aps = aps + ap
    # mean of the per-user average precisions
    result = aps / len(recommendations)
    if verbose:
        log.warning('MAP: {}'.format(result))
    return result


if __name__ == '__main__':
    # smoke-test: very short MF training run, then evaluate on the test split
    rec = ProductRecommender()
    rec.fit(user_x_product=data.get_urm_train_1(), latent_features_guess=10,
            learning_rate=0.01, steps=2, regularization_penalty=0.2,
            convergeance_threshold=0.01)
    recs = rec.recommend_batch(data.get_target_playlists())
    rec.evaluate(recommendations=recs, test_urm=data.get_urm_test_1())
# NOTE(review): chunk starts inside an interactive __main__ menu; the opening
# if/elif branches and the model.fit( call completed here are above this view.
              distance=model.SIM_SPLUS, k=600, alpha=0.25, beta=0.5,
              shrink=10, l=0.25, c=0.5)
    print('Saving the similarity matrix...')
    sps.save_npz(
        'raw_data/saved_sim_matrix_evaluation_2/{}'.format(model.name),
        model.get_sim_matrix())
elif arg == 'v':
    # model.validate(iterations=10, urm_train=data.get_urm_train_1(), urm_test=data.get_urm_test_1(), targetids=data.get_target_playlists(),
    #                distance=model.SIM_SPLUS, k=(100, 600), alpha=(0,2), beta=(0,2),shrink=(0,100),l=(0,1),c=(0,1))
    # run a 10-iteration hyper-parameter search for the RP3beta similarity
    model.validate(iterations=10, urm_train=data.get_urm_train_1(),
                   urm_test=data.get_urm_test_1(),
                   targetids=data.get_target_playlists(),
                   distance=model.SIM_RP3BETA, k=(100, 600), alpha=(0, 2),
                   beta=(0, 2), shrink=(0, 100), l=1, c=1)
    #model.test(distance=CFItemBased.SIM_P3ALPHA, k=300,alpha=(0,2),shrink=(0,100))
elif arg == 'x':
    pass
else:
    log.error('Wrong option!')
def run(self, urm_train=None, urm=None, urm_test=None, targetids=None,
        factors=100, regularization=0.01, iterations=100, alpha=25,
        with_scores=False, export=True, verbose=True):
    """
    Run the model and export the results to a file.

    NOTE(review): urm_train, factors, regularization, iterations and alpha are
    accepted for interface compatibility but are NOT forwarded to fit(), which
    is called with hard-coded ElasticNet hyper-parameters below — confirm
    whether this is intentional.

    Returns
    -------
    :return: recs: (list) recommendations
    :return: map10: (float) MAP10 for the provided recommendations
    """
    start = time.time()

    # Resolve defaults lazily. The previous version eagerly loaded the train
    # URM, the full URM, the test URM AND an ICM that was never used.
    urm_train = data.get_urm_train_1() if urm_train is None else urm_train
    urm = data.get_urm() if urm is None else urm
    urm_test = data.get_urm_test_1() if urm_test is None else urm_test
    targetids = data.get_target_playlists() if targetids is None else targetids

    self.fit(l1_ratio=0.1, positive_only=True, alpha=1e-4, fit_intercept=False,
             copy_X=False, precompute=False, selection='random', max_iter=100,
             topK=100, tol=1e-4, workers=multiprocessing.cpu_count())

    recs = self.recommend_batch(userids=targetids, with_scores=with_scores,
                                verbose=verbose)

    map10 = None
    if len(recs) > 0:
        map10 = self.evaluate(recs, test_urm=urm_test, verbose=verbose)
    else:
        log.warning('No recommendations available, skip evaluation')

    if export:
        exportcsv(recs, path='submission', name=self.name, verbose=verbose)

    if verbose:
        log.info('Run in: {:.2f}s'.format(time.time() - start))

    return recs, map10
def wizard_hybrid():
    """
    Interactive wizard: build a hybrid recommender from previously saved
    similarity matrices or R^ matrices, then save it, validate it or export
    a submission CSV, depending on the folder the matrices came from.
    """
    # folder names classifying what was loaded and for which purpose
    SIM_MATRIX = ['saved_sim_matrix', 'saved_sim_matrix_evaluation']
    R_HAT = ['saved_r_hat', 'saved_r_hat_evaluation']
    SAVE = ['saved_sim_matrix', 'saved_r_hat']
    EVALUATE = ['saved_sim_matrix_evaluation', 'saved_r_hat_evaluation']

    start = time.time()
    matrices_array, folder, models = hb.create_matrices_array()
    print('matrices loaded in {:.2f} s'.format(time.time() - start))
    log.success('You have loaded: {}'.format(models))
    NORMALIZATION_MODE = normalization_mode_selection()

    if folder in SAVE:
        # build the hybrid with user-chosen weights and save it to disk
        WEIGHTS = weights_selection(models)
        if folder in SIM_MATRIX:
            name, urm_filter_tracks, rel_path = option_selection_save('SIM')
            hybrid_rec = HybridSimilarity(
                matrices_array,
                normalization_mode=NORMALIZATION_MODE,
                urm_filter_tracks=urm_filter_tracks)
            sps.save_npz('raw_data/' + rel_path + name,
                         hybrid_rec.get_r_hat(weights_array=WEIGHTS))
        if folder in R_HAT:
            name, urm_filter_tracks, rel_path, EXPORT = option_selection_save(
                'R_HAT')
            hybrid_rec = HybridRHat(matrices_array,
                                    normalization_mode=NORMALIZATION_MODE,
                                    urm_filter_tracks=urm_filter_tracks)
            if EXPORT:
                N = ask_number_recommendations()
                recommendations = hybrid_rec.recommend_batch(
                    weights_array=WEIGHTS,
                    target_userids=data.get_target_playlists(), N=N)
                exportcsv(recommendations, path='submission', name=name)
            else:
                sps.save_npz('raw_data/' + rel_path + name,
                             hybrid_rec.get_r_hat(weights_array=WEIGHTS))
    elif folder in EVALUATE:
        log.success('|WHAT YOU WANT TO DO ???|')
        log.warning('\'1\' BAYESIAN SEARCH VALIDATION')
        log.warning('\'2\' HAND CRAFTED WEIGHTS')
        mode = input()[0]
        # BAYESIAN SEARCH
        if mode == '1':
            log.success(
                '|SELECT A NUMBER OF |||ITERATIONS||| FOR THE ALGORITHM|')
            iterations = float(input())
            urm_filter_tracks = data.get_urm_train_1()
            if folder in SIM_MATRIX:
                hybrid_rec = HybridSimilarity(
                    matrices_array,
                    normalization_mode=NORMALIZATION_MODE,
                    urm_filter_tracks=urm_filter_tracks)
            if folder in R_HAT:
                hybrid_rec = HybridRHat(matrices_array,
                                        normalization_mode=NORMALIZATION_MODE,
                                        urm_filter_tracks=urm_filter_tracks)
            hybrid_rec.validate(iterations=iterations,
                                urm_test=data.get_urm_test_1(),
                                userids=data.get_target_playlists())
        # MANUAL WEIGHTS
        elif mode == '2':
            WEIGHTS = weights_selection(models)
            urm_filter_tracks = data.get_urm_train_1()
            chose = option_selection_evaluation_2()  # save, evaluate or csv
            if chose == 's':
                # save the weighted R^ for both train splits
                log.success('|CHOSE A NAME FOR THE MATRIX...|')
                name = input()
                if folder in SIM_MATRIX:
                    type = 'SIM'
                    hybrid_rec = HybridSimilarity(
                        matrices_array,
                        normalization_mode=NORMALIZATION_MODE,
                        urm_filter_tracks=urm_filter_tracks)
                elif folder in R_HAT:
                    type = 'R_HAT'
                    hybrid_rec = HybridRHat(
                        matrices_array,
                        normalization_mode=NORMALIZATION_MODE,
                        urm_filter_tracks=urm_filter_tracks)
                sps.save_npz('raw_data/saved_r_hat_evaluation/' + name,
                             hybrid_rec.get_r_hat(weights_array=WEIGHTS))
                # same hybrid built on the second split (symmetric recommender)
                sym_rec = symmetric_recommender_creator(
                    models, type, NORMALIZATION_MODE,
                    urm_filter_tracks=data.get_urm_train_2())
                sps.save_npz('raw_data/saved_r_hat_evaluation_2/' + name,
                             sym_rec.get_r_hat(weights_array=WEIGHTS))
            elif chose == 'e':
                # evaluate the hybrid on both test splits
                if folder in SIM_MATRIX:
                    type = 'SIM'
                    hybrid_rec = HybridSimilarity(
                        matrices_array,
                        normalization_mode=NORMALIZATION_MODE,
                        urm_filter_tracks=urm_filter_tracks)
                elif folder in R_HAT:
                    type = 'R_HAT'
                    hybrid_rec = HybridRHat(
                        matrices_array,
                        normalization_mode=NORMALIZATION_MODE,
                        urm_filter_tracks=urm_filter_tracks)
                N = ask_number_recommendations()
                print('Recommending...')
                recs = hybrid_rec.recommend_batch(
                    weights_array=WEIGHTS,
                    target_userids=data.get_target_playlists(), N=N)
                hybrid_rec.evaluate(recommendations=recs,
                                    test_urm=data.get_urm_test_1())
                # export the recommendations
                log.success(
                    'Do you want to save the CSV with these recomendations? (y/n)')
                if input()[0] == 'y':
                    export_csv_wizard(recs)
                sym_rec = symmetric_recommender_creator(
                    models, type, NORMALIZATION_MODE,
                    urm_filter_tracks=data.get_urm_train_2())
                recs2 = sym_rec.recommend_batch(
                    weights_array=WEIGHTS,
                    target_userids=data.get_target_playlists())
                sym_rec.evaluate(recommendations=recs2,
                                 test_urm=data.get_urm_test_2())
            elif chose == 'c':
                # export a CSV only (no evaluation); implemented for R^ only
                if folder in R_HAT:
                    hybrid_rec = HybridRHat(
                        matrices_array,
                        normalization_mode=NORMALIZATION_MODE,
                        urm_filter_tracks=urm_filter_tracks)
                    N = ask_number_recommendations()
                    print('Recommending...')
                    recs = hybrid_rec.recommend_batch(
                        weights_array=WEIGHTS,
                        target_userids=data.get_target_playlists(), N=N)
                    export_csv_wizard(recs)
                else:
                    log.error('not implemented yet')
    else:
        log.error('WRONG FOLDER')
def validate(l1_ratio_array, alpha_array, max_iter_array, topK_array,
             userids=None, urm_train=None, urm_test=None,
             filter_already_liked=True, items_to_exclude=None, N=10,
             verbose=True, write_on_file=True):
    """
    Grid-search over the SLIM ElasticNet hyper-parameters and log the MAP@10
    of every combination to a timestamped file under validation_results/.

    Parameters
    ----------
    l1_ratio_array, alpha_array, max_iter_array, topK_array : iterables of
        candidate values; every combination is trained and evaluated.
    userids : list, target users. Defaults to data.get_target_playlists().
    urm_train : csr matrix. Defaults to data.get_urm_train_1().
    urm_test : csr matrix. Defaults to data.get_urm_test_1().
    filter_already_liked : bool, filter seen items from the recommendations
    items_to_exclude : list of item ids to exclude (default: none)
    N : int, number of recommendations per user
    verbose, write_on_file : bool flags
    """
    # BUGFIX: the previous defaults called data.get_*() at import time and
    # shared a mutable [] default across calls.
    userids = data.get_target_playlists() if userids is None else userids
    urm_train = data.get_urm_train_1() if urm_train is None else urm_train
    urm_test = data.get_urm_test_1() if urm_test is None else urm_test
    items_to_exclude = [] if items_to_exclude is None else items_to_exclude

    # create the initial model
    recommender = SLIMElasticNetRecommender()

    path = 'validation_results/'
    name = 'slim_rmse'
    folder = time.strftime('%d-%m-%Y')
    filename = '{}/{}/{}{}.csv'.format(path, folder, name,
                                       time.strftime('_%H-%M-%S'))
    # create dir if not exists
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    with open(filename, 'w') as out:
        for l in l1_ratio_array:
            for a in alpha_array:
                for m in max_iter_array:
                    for k in topK_array:
                        # train the model with the parameters
                        if verbose:
                            print(
                                '\n\nTraining slim_rmse with\n l1_ratio: {}\n alpha: {}\n'
                                'Iterations: {}\n topK: {}'.format(l, a, m, k))
                            print('\n training phase...')
                        recommender.fit(urm=urm_train, l1_ratio=l, alpha=a,
                                        max_iter=m, topK=k)

                        # get the recommendations from the trained model
                        recommendations = recommender.recommend_batch(
                            userids=userids, N=N,
                            filter_already_liked=filter_already_liked,
                            items_to_exclude=items_to_exclude)

                        # evaluate the model with map10
                        map10 = recommender.evaluate(recommendations,
                                                     test_urm=urm_test)
                        if verbose:
                            print('map@10: {}'.format(map10))

                        # write on external files on folder models_validation
                        if write_on_file:
                            out.write(
                                '\n\nl1_ratio: {}\n alpha: {}\n Iterations: {}\n '
                                'topK: {}\n evaluation map@10: {}'.format(
                                    l, a, m, k, map10))
# #### Explode each row into multiple rows (one per interaction) #%% recs_tracks = [] for rec in raw_recs: playlist_id = rec[0] for t in rec[1:]: recs_tracks.append([playlist_id, t]) recs_df = pd.DataFrame(recs_tracks, columns=['playlist_id','track_id']) #%% [markdown] # #### Append the 'profile_length' column to the recommendation dataframe #%% target_ids = data.get_target_playlists() targetURM = data.get_urm_train_1()[target_ids] user_profile_lengths = np.array(targetURM.sum(axis=1)).flatten() profile_lengths_df = pd.DataFrame({'playlist_id': target_ids, 'profile_length': user_profile_lengths}) #%% rec_lengths_df = recs_df.merge(profile_lengths_df, on='playlist_id') #%% [markdown] # #### Popularity feature #%% df = data.get_playlists_df() popularity = df.groupby(['track_id']).size().reset_index(name='popularity') #%% rec_pop_df = rec_lengths_df.join(popularity.set_index('track_id'), on='track_id')