def get_r_hat(self, load_from_file=False, path=''):
    """
    :param load_from_file: set to True to load the matrix from file, if it has been saved
    :param path: path where the matrix has been saved
    -------
    :return: the estimated URM of the recommender
    """
    r_hat = data.get_empty_urm()
    r_hat[data.get_target_playlists()] = self.U[data.get_target_playlists()].dot(self.s_Vt)
    return r_hat
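# --- Illustrative sketch (not part of the original repo) ---
# A minimal, self-contained example of the idea behind the get_r_hat above: multiply the
# user factors U by the item factors s·Vt only for the target rows, leaving the other rows
# of the estimated URM empty. Toy numpy/scipy data stands in for the repo's `data` module;
# all names below are hypothetical.
import numpy as np
import scipy.sparse as sps

def _example_pure_svd_r_hat():
    n_users, n_items, n_factors = 5, 6, 2
    U = np.random.rand(n_users, n_factors)      # user latent factors
    s_Vt = np.random.rand(n_factors, n_items)   # sigma * Vt (item latent factors)
    target_rows = [1, 3]                        # stand-in for the target playlists
    r_hat = sps.lil_matrix((n_users, n_items))  # empty estimated URM
    scores = U[target_rows].dot(s_Vt)
    for row, idx in enumerate(target_rows):
        r_hat[idx] = scores[row]                # fill only the target rows
    return r_hat.tocsr()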
def get_r_hat(self):
    """
    Compute the r_hat for the model
    :return: r_hat only for the target playlists
    """
    if self.W_sparse is None:
        log.error('the recommender has not been trained, call the fit() method to compute W')
    r_hat = data.get_empty_urm()
    r_hat[data.get_target_playlists()] = self.URM_train[data.get_target_playlists()].dot(self.W_sparse)
    return r_hat
def cluster_users_by_interactions_count(clip):
    """
    Split the playlists based on an interactions count above or below the specified clip value.

    Parameters
    ----------
    clip : (int), clip value for splitting. The playlists are split into 2 groups:
        those with an interactions count <= clip and those with an interactions count > clip.

    Returns
    -------
    2 lists of playlist ids
    """
    playlists = data.get_playlists_df()
    target_playlist = pd.DataFrame({'playlist_id': data.get_target_playlists()})
    target_playlist['index'] = target_playlist.index

    # count the interactions of each target playlist: playlist_id | index | counts
    counts = target_playlist.merge(playlists).groupby(['playlist_id', 'index']).size().reset_index(name='counts')
    #counts = counts.reset_index()
    #counts.columns[2] = 'index'
    #counts['index'] = counts.index

    # build dataframe of number of interactions: playlist_id | tracks_count
    #counts = playlists.groupby('playlist_id').size().reset_index(name='counts')

    # split based on the interactions counts
    return counts[counts['counts'] <= clip]['index'].values, counts[counts['counts'] > clip]['index'].values
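# --- Illustrative sketch (not part of the original repo) ---
# The same clip-based split on a toy interactions dataframe: playlists whose interaction
# count is <= clip end up in the "sparse" group, the others in the "dense" group. The
# dataframe layout (playlist_id | track_id) mirrors train.csv; the values are made up.
import pandas as pd

def _example_interactions_count_split(clip=2):
    df = pd.DataFrame({'playlist_id': [0, 0, 1, 1, 1, 2],
                       'track_id':    [5, 6, 5, 7, 8, 6]})
    counts = df.groupby('playlist_id').size().reset_index(name='counts')
    sparse = counts[counts['counts'] <= clip]['playlist_id'].values   # -> [0, 2]
    dense = counts[counts['counts'] > clip]['playlist_id'].values     # -> [1]
    return sparse, dense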
def process(self, df):
    """
    From each non-sequential playlist, removes the percentage of songs specified in the
    constructor by randomly picking songs inside the playlist. From each sequential playlist,
    removes just the last songs of the playlist. This reproduces the train-test split used
    on Kaggle. See https://www.kaggle.com/c/recommender-system-2018-challenge-polimi/discussion/69325

    @Param df: (pandas dataframe) dataframe associated to train.csv
    @Output df: the dataframe from which the picked songs have been removed
    """
    seq_l = d.get_target_playlists()[0:d.N_SEQUENTIAL]
    non_seq_l = list(set(d.get_all_playlists()) - set(seq_l))

    seq_df = df[df.playlist_id.isin(seq_l)]
    non_seq_df = df[df.playlist_id.isin(non_seq_l)]

    # sequential playlists: drop the last perc of songs
    seq_df_dropped = seq_df.groupby('playlist_id').apply(
        lambda x: x.iloc[:-math.floor(len(x) * self.perc)]).reset_index(drop=True)
    # non-sequential playlists: drop a random perc of songs
    non_seq_df_dropped = non_seq_df.groupby('playlist_id').apply(
        lambda x: x.drop(x.sample(n=math.floor(len(x) * self.perc)).index))

    return pd.concat([seq_df_dropped, non_seq_df_dropped]).sort_values(by='playlist_id', kind='mergesort')
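# --- Illustrative sketch (not part of the original repo) ---
# The two drop strategies used by process() on a toy dataframe: for "sequential" playlists
# the last fraction of rows is removed (order is preserved), for all the others a random
# fraction of rows is removed. The ids and the perc value are made up.
import math
import pandas as pd

def _example_drop_strategies(perc=0.5):
    df = pd.DataFrame({'playlist_id': [0] * 4 + [1] * 4,
                       'track_id': [10, 11, 12, 13, 20, 21, 22, 23]})
    seq_ids = [0]                                    # hypothetical sequential playlists
    seq_df = df[df.playlist_id.isin(seq_ids)]
    non_seq_df = df[~df.playlist_id.isin(seq_ids)]
    # sequential: keep everything but the tail of each playlist
    seq_kept = seq_df.groupby('playlist_id').apply(
        lambda x: x.iloc[:-math.floor(len(x) * perc)]).reset_index(drop=True)
    # non-sequential: drop a random sample of rows from each playlist
    non_seq_kept = non_seq_df.groupby('playlist_id').apply(
        lambda x: x.drop(x.sample(n=math.floor(len(x) * perc)).index)).reset_index(drop=True)
    return pd.concat([seq_kept, non_seq_kept]).sort_values(by='playlist_id', kind='mergesort')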
def validate(self, factors_array, iteration_array, urm_train=data.get_urm_train_1(),
             urm_test=data.get_urm_test_1(), verbose=True, write_on_file=True,
             userids=data.get_target_playlists(), N=10, filter_already_liked=True,
             items_to_exclude=[]):
    # create the initial model
    recommender = Pure_SVD()

    path = 'validation_results/'
    name = 'pure_SVD'
    folder = time.strftime('%d-%m-%Y')
    filename = '{}/{}/{}{}.csv'.format(path, folder, name, time.strftime('_%H-%M-%S'))
    # create dir if it does not exist
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    with open(filename, 'w') as out:
        for f in factors_array:
            for i in iteration_array:
                # train the model with the parameters
                if verbose:
                    print('\n\nTraining PURE_SVD with\n Factors: {}\n Iteration: {}\n'.format(f, i))
                    print('\n training phase...')
                recommender.fit(urm_train=urm_train, num_factors=f, iteration=i)

                # get the recommendations from the trained model
                recommendations = recommender.recommend_batch(userids=userids, N=N,
                                                              filter_already_liked=filter_already_liked,
                                                              items_to_exclude=items_to_exclude)
                # evaluate the model with map@10
                map10 = recommender.evaluate(recommendations, test_urm=urm_test)
                if verbose:
                    print('map@10: {}'.format(map10))
                # write the results on an external file in the validation folder
                if write_on_file:
                    out.write('\n\nFactors: {}\n Iteration: {}\n evaluation map@10: {}'.format(f, i, map10))
def cluster_ensemble(clip, path_sparse, path_dense):
    sparse_pl, dense_pl = cluster.cluster_users_by_interactions_count(clip=clip)
    log.success('Cluster 1 (interactions count <= {}): {} playlists'.format(clip, len(sparse_pl)))
    log.success('Cluster 2 (interactions count > {}): {} playlists'.format(clip, len(dense_pl)))

    # keep only the target playlists in the 2 clusters
    s1 = set(sparse_pl)
    s2 = set(dense_pl)
    s_target = set(data.get_target_playlists())
    s1_target = s1 & s_target
    s2_target = s2 & s_target
    sparse_pl = pd.DataFrame({'playlist_id': list(s1_target)})
    dense_pl = pd.DataFrame({'playlist_id': list(s2_target)})

    df_sparse = pd.read_csv(path_sparse)
    df_dense = pd.read_csv(path_dense)

    # restrict each submission to its own cluster and concatenate
    cluster1 = df_sparse.merge(sparse_pl)
    cluster2 = df_dense.merge(dense_pl)
    final = pd.concat([cluster1, cluster2])

    final.to_csv(path_or_buf='submissions/cluster_ensemble' + t.strftime('_%H-%M-%S'), index=False)
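# --- Illustrative sketch (not part of the original repo) ---
# Core of the ensemble above: each submission dataframe is restricted (via an inner merge on
# playlist_id) to the playlists of its own cluster, then the two parts are concatenated.
# All names and dataframes below are toy stand-ins.
import pandas as pd

def _example_cluster_merge():
    df_sparse = pd.DataFrame({'playlist_id': [0, 1, 2], 'track_ids': ['a b', 'c d', 'e f']})
    df_dense = pd.DataFrame({'playlist_id': [0, 1, 2], 'track_ids': ['g h', 'i j', 'k l']})
    sparse_ids = pd.DataFrame({'playlist_id': [0, 2]})   # hypothetical cluster 1
    dense_ids = pd.DataFrame({'playlist_id': [1]})       # hypothetical cluster 2
    cluster1 = df_sparse.merge(sparse_ids)               # predictions kept from the "sparse" model
    cluster2 = df_dense.merge(dense_ids)                  # predictions kept from the "dense" model
    return pd.concat([cluster1, cluster2])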
def get_r_hat(self):
    """
    Compute the r_hat for the model, filled with zeros for the non-target playlists
    :return: r_hat
    """
    if self.user_vecs is None:
        log.error('the recommender has not been trained, call the fit() method')
    r_hat = data.get_empty_urm()
    r_hat = r_hat.todense()
    r_estimated = np.dot(self.user_vecs[data.get_target_playlists()], self.item_vecs.T)
    r_hat[data.get_target_playlists()] = r_estimated
    r_hat = sps.csr_matrix(r_hat)
    print('saving matrix')
    return r_hat
def histogram_of_interactions():
    """
    Plot the histogram of the interactions counts:
    x axis: interactions
    y axis: count of playlists with that number of interactions
    """
    playlists = data.get_playlists_df()
    target_playlist = pd.DataFrame({'playlist_id': data.get_target_playlists()})
    counts = playlists.merge(target_playlist).groupby('playlist_id').size().reset_index(name='interactions')

    # plot counts for each playlist
    #counts.plot(x='playlist_id', y='interactions', kind='scatter', figsize=(200,100))

    # plot histogram
    hist = counts.groupby('interactions').size().reset_index(name='counts')
    hist.plot(x='interactions', y='counts', kind='bar', fontsize=7, figsize=(150, 100))
    plt.show(block=True)
def fit(self, URM, n_factors=10, learning_rate=1e-4, epochs=10, user_regularization=0.001,
        positive_item_regularization=0.001, negative_item_regularization=0.001, evaluate_every=1):
    self.URM = URM
    self.epochs = epochs
    self.n_users = self.URM.shape[0]
    self.n_items = self.URM.shape[1]

    e = MFBPR_Epoch(URM,
                    n_factors=n_factors,
                    learning_rate=learning_rate,
                    user_regularization=user_regularization,
                    positive_item_regularization=positive_item_regularization,
                    negative_item_regularization=negative_item_regularization)

    print('Fitting MFBPR...')
    for numEpoch in range(self.epochs):
        print('Epoch:', numEpoch)
        e.epochIteration()
        if (numEpoch + 1) % evaluate_every == 0:
            self.user_factors, self.item_factors = e.get_user_item_factors()
            recs = self.recommend_batch(userids=d.get_target_playlists())
            self.evaluate(recs, d.get_urm_test_1())
    self.user_factors, self.item_factors = e.get_user_item_factors()

    # check how well the model performs on the test set:
    # the positive sample is an item in the test set but not in the training set
    trials = 10000
    count_wrong = 0
    for _ in range(trials):
        test = d.get_urm_test_1()
        user_id = np.random.choice(self.n_users)
        user_seen_items = d.get_urm()[user_id, :].indices
        test_items = test[user_id, :].indices
        pos_item_id = np.random.choice(test_items)

        neg_item_selected = False
        while not neg_item_selected:
            neg_item_id = np.random.randint(0, self.n_items)
            if neg_item_id not in user_seen_items:
                neg_item_selected = True

        xui = np.dot(self.user_factors[user_id, :], self.item_factors[pos_item_id, :])
        xuj = np.dot(self.user_factors[user_id, :], self.item_factors[neg_item_id, :])
        xuij = xui - xuj
        if xuij < 0:
            count_wrong += 1
        # print('u: {}, i: {}, j: {}. xui - xuj: {}'.format(user_id, pos_item_id, neg_item_id, xuij))
    print('percentage of wrong preferences in test set: {}'.format(count_wrong / trials))
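# --- Illustrative sketch (not part of the original repo) ---
# The BPR sanity check at the end of fit() in isolation: with learned user/item factors, a
# held-out positive item should get a higher score than a random unseen item, i.e.
# x_ui - x_uj > 0 for most sampled (user, positive, negative) triples. Toy factors below.
import numpy as np

def _example_bpr_pairwise_check():
    rng = np.random.default_rng(0)
    user_factors = rng.random((4, 3))        # hypothetical learned user factors
    item_factors = rng.random((6, 3))        # hypothetical learned item factors
    user, pos_item, neg_item = 2, 1, 5       # a sampled (user, positive, negative) triple
    x_ui = user_factors[user].dot(item_factors[pos_item])
    x_uj = user_factors[user].dot(item_factors[neg_item])
    return (x_ui - x_uj) > 0                 # True means the preference is ranked correctly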
def run(self, epochs=70, batch_size=1000, lambda_i=0.0, lambda_j=0.0, learning_rate=0.01, topK=1500,
        sgd_mode='adagrad', export_results=True, export_r_hat=False):
    """
    Meant as a shortcut to run the model after the validation procedure, allowing the export
    of the scores on the playlists or of the estimated csr matrix

    :param epochs(int)
    :param batch_size(int) after how many items the params should be updated
    :param lambda_i(float) first regularization term
    :param lambda_j(float) second regularization term
    :param learning_rate(float) algorithm learning rate
    :param topK(int) how many elements should be taken into account while computing URM*W
    :param sgd_mode(string) optimization algorithm
    :param export_results(bool) export a ready-to-kaggle csv with the predicted songs for each playlist
    :param export_r_hat(bool) whether or not to export the estimated csr matrix
    """
    self.fit(URM_train=d.get_urm(),
             epochs=epochs,
             URM_test=None,
             user_ids=None,
             batch_size=batch_size,
             validate_every_N_epochs=1,
             start_validation_after_N_epochs=epochs + 1,
             lambda_i=lambda_i,
             lambda_j=lambda_j,
             learning_rate=learning_rate,
             topK=topK,
             sgd_mode=sgd_mode)

    if export_results:
        print('exporting results')
        recs = self.recommend_batch(d.get_target_playlists(), N=10, urm=d.get_urm(),
                                    filter_already_liked=True, with_scores=False,
                                    items_to_exclude=[], verbose=False)
        importexport.exportcsv(recs, 'submission',
                               self._print(epochs=epochs, batch_size=batch_size, lambda_i=lambda_i,
                                           lambda_j=lambda_j, learning_rate=learning_rate, topK=topK,
                                           sgd_mode=sgd_mode))
    elif export_r_hat:
        print('saving estimated urm')
        self.save_r_hat()
def fit(self, URM_train=d.get_urm_train_1(), epochs=190, URM_test=d.get_urm_test_1(),
        user_ids=d.get_target_playlists(), batch_size=1000, validate_every_N_epochs=2,
        start_validation_after_N_epochs=191, lambda_i=0.0, lambda_j=0.0, learning_rate=0.01,
        topK=1500, sgd_mode='adagrad'):
    """
    Train the model, finding the matrix W

    :param epochs(int)
    :param batch_size(int) after how many items the params should be updated
    :param lambda_i(float) first regularization term
    :param lambda_j(float) second regularization term
    :param learning_rate(float) algorithm learning rate
    :param topK(int) how many elements should be taken into account while computing URM*W
    :param sgd_mode(string) optimization algorithm
    :param URM_train(csr_matrix) the URM used to train the model, either the full or the validation one
    :param URM_test(csr_matrix) needed if we'd like to perform validation
    :param user_ids(list) needed if we'd like to perform validation
    :param validate_every_N_epochs(int) how often the MAP evaluation should be displayed
    :param start_validation_after_N_epochs(int)
    """
    self.URM_train = URM_train.T
    self.n_users = URM_train.shape[0]
    self.n_items = URM_train.shape[1]
    self.sgd_mode = sgd_mode

    print('before cython')
    from cythoncompiled.SLIM_BPR.SLIM_BPR_Cython_Epoch import SLIM_BPR_Cython_Epoch
    self.cythonEpoch = SLIM_BPR_Cython_Epoch(self.URM_train,
                                             sparse_weights=False,
                                             topK=topK,
                                             learning_rate=learning_rate,
                                             li_reg=lambda_i,
                                             lj_reg=lambda_j,
                                             batch_size=100,
                                             symmetric=True,
                                             sgd_mode=sgd_mode)
    print('after cython')

    # call super.fit to start the training
    self._fit_alreadyInitialized(epochs=epochs,
                                 logFile=None,
                                 URM_test=URM_test,
                                 user_ids=user_ids,
                                 filterTopPop=False,
                                 minRatingsPerUser=1,
                                 batch_size=batch_size,
                                 validate_every_N_epochs=validate_every_N_epochs,
                                 start_validation_after_N_epochs=start_validation_after_N_epochs,
                                 lambda_i=lambda_i,
                                 lambda_j=lambda_j,
                                 learning_rate=learning_rate,
                                 topK=topK)
    print('after already_initialized')
def get_r_hat(self, weights_array):
    # weighted sum of the normalized matrices
    hybrid_matrix = sps.csr_matrix(self.normalized_matrices_array[0].shape)
    count = 0
    for m in self.normalized_matrices_array:
        hybrid_matrix += m * weights_array[count]
        count += 1

    if self.name == 'HybridSimilarity':
        # compute the r_hat if we have the similarity
        if not self.INVERSE:
            hybrid_matrix = self.urm_filter_tracks[data.get_target_playlists()].dot(hybrid_matrix)
        else:
            hybrid_matrix = hybrid_matrix[data.get_target_playlists()].dot(self.urm_filter_tracks)
        r_hat = data.get_empty_urm()
        r_hat[data.get_target_playlists()] = hybrid_matrix
        hybrid_matrix = r_hat
    return hybrid_matrix
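# --- Illustrative sketch (not part of the original repo) ---
# The first step of the hybrid above, in isolation: a weighted sum of already-normalized
# sparse matrices (similarities or estimated URMs). Toy csr matrices and weights.
import numpy as np
import scipy.sparse as sps

def _example_weighted_hybrid():
    m1 = sps.csr_matrix(np.array([[1.0, 0.0], [0.0, 2.0]]))
    m2 = sps.csr_matrix(np.array([[0.0, 3.0], [1.0, 0.0]]))
    weights = [0.7, 0.3]
    hybrid = sps.csr_matrix(m1.shape)
    for m, w in zip([m1, m2], weights):
        hybrid = hybrid + m * w        # accumulate each matrix scaled by its weight
    return hybrid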
def get_r_hat(self, load_from_file=False, path=''):
    """
    :param load_from_file: set to True to load the matrix from file, if it has been saved
    :param path: path where the matrix has been saved
    -------
    :return: the estimated URM of the recommender
    """
    U_filtered = self.U[data.get_target_playlists()]
    r_hat = U_filtered.dot(self.s_Vt)
    return sps.csr_matrix(r_hat)
def run(self, num_factors, urm_train=None, urm=None, urm_test=None, targetids=None,
        with_scores=False, export=True, verbose=True):
    """
    Run the model and export the results to a file

    Parameters
    ----------
    num_factors : int, number of latent factors
    urm : csr matrix, URM. If None, data.get_urm_train() is used. This should be the entire
        URM for which the targetids correspond to the row indexes.
    urm_test : csr matrix, URM on which to test the model. If None, data.get_urm_test() is used
    targetids : list, target user ids. If None, data.get_target_playlists() is used

    Returns
    -------
    recs: (list) recommendations
    map10: (float) MAP10 for the provided recommendations
    """
    _urm = data.get_urm_train()
    _icm = data.get_icm()
    _urm_test = data.get_urm_test()
    _targetids = data.get_target_playlists()
    #_targetids = data.get_all_playlists()

    start = time.time()

    urm_train = _urm if urm_train is None else urm_train
    #urm = _urm if urm is None else urm
    urm_test = _urm_test if urm_test is None else urm_test
    targetids = _targetids if targetids is None else targetids

    self.fit(urm_train=urm_train, num_factors=num_factors)
    recs = self.recommend_batch(userids=targetids)

    map10 = None
    if len(recs) > 0:
        map10 = self.evaluate(recs, test_urm=urm_test, verbose=verbose)
    else:
        log.warning('No recommendations available, skip evaluation')

    if export:
        exportcsv(recs, path='submission', name=self.name, verbose=verbose)

    if verbose:
        log.info('Run in: {:.2f}s'.format(time.time() - start))

    return recs, map10
def get_r_hat(self, verbose=False):
    """
    Return the r_hat matrix as: R^ = R•S or R^ = S•R
    """
    R = self.urm
    targetids = data.get_target_playlists()
    if self._matrix_mul_order == 'inverse':
        return sim.dot_product(self._sim_matrix, R, target_rows=targetids, k=R.shape[0],
                               format_output='csr', verbose=verbose)
    else:
        return sim.dot_product(R, self._sim_matrix, target_rows=targetids, k=R.shape[0],
                               format_output='csr', verbose=verbose)
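# --- Illustrative sketch (not part of the original repo) ---
# The two multiplication orders used above, written with plain scipy in place of the repo's
# sim.dot_product helper: item-based R^ = R·S (S is item-item) and user-based R^ = S·R
# (S is user-user). Only the target rows are needed, so they are sliced out first.
# All matrices below are random toy data.
import scipy.sparse as sps

def _example_matrix_mul_order(inverse=False):
    R = sps.random(4, 5, density=0.5, format='csr')        # toy URM (users x items)
    target_rows = [0, 2]                                   # stand-in for the target playlists
    if inverse:
        S = sps.random(4, 4, density=0.5, format='csr')    # user-user similarity
        return S[target_rows].dot(R)                        # R^ = S·R
    S = sps.random(5, 5, density=0.5, format='csr')         # item-item similarity
    return R[target_rows].dot(S)                             # R^ = R·S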
def validate(self, epochs=200, user_ids=d.get_target_playlists(), batch_size=[1000],
             validate_every_N_epochs=5, start_validation_after_N_epochs=0, lambda_i=[0.0],
             lambda_j=[0.0], learning_rate=[0.01], topK=[200], sgd_mode='adagrad', log_path=None):
    """
    Validate the model over the given hyperparameter grid, training the matrix W for each combination

    :param epochs(int)
    :param batch_size(list) after how many items the params should be updated
    :param lambda_i(list) first regularization term
    :param lambda_j(list) second regularization term
    :param learning_rate(list) algorithm learning rate
    :param topK(list) how many elements should be taken into account while computing URM*W
    :param sgd_mode(string) optimization algorithm
    :param user_ids(list) needed if we'd like to perform validation
    :param validate_every_N_epochs(int) how often the MAP evaluation should be displayed
    :param start_validation_after_N_epochs(int)
    :param log_path(string) folder to which the validation results should be saved
    """
    if log_path is not None:
        orig_stdout = sys.stdout
        f = open(log_path + '/' + self.name + ' ' + time.strftime('_%H-%M-%S') + ' ' +
                 time.strftime('%d-%m-%Y') + '.txt', 'w')
        sys.stdout = f

    for li in lambda_i:
        for lj in lambda_j:
            for tk in topK:
                for lr in learning_rate:
                    for b in batch_size:
                        print(self._print(epochs=epochs, batch_size=b, lambda_i=li, lambda_j=lj,
                                          learning_rate=lr, topK=tk, sgd_mode=sgd_mode))
                        self.fit(URM_train=d.get_urm_train(),
                                 epochs=epochs,
                                 URM_test=d.get_urm_test(),
                                 user_ids=user_ids,
                                 batch_size=b,
                                 validate_every_N_epochs=validate_every_N_epochs,
                                 start_validation_after_N_epochs=start_validation_after_N_epochs,
                                 lambda_i=li,
                                 lambda_j=lj,
                                 learning_rate=lr,
                                 topK=tk,
                                 sgd_mode=sgd_mode)

    if log_path is not None:
        sys.stdout = orig_stdout
        f.close()
def histogram_of_top_pop_items(top_n, only_target=True):
    playlists_df = data.get_playlists_df()
    if only_target:
        # keep only the target playlists
        target_playlist_df = pd.DataFrame({'playlist_id': data.get_target_playlists()})
        playlists_df = playlists_df.merge(target_playlist_df)

    # track_id | count
    toptracks_df = playlists_df.groupby('track_id').size().reset_index(name='count')
    toptracks_df = toptracks_df.sort_values('count', ascending=False).head(top_n)

    # plot histogram
    toptracks_df.plot(x='track_id', y='count', kind='bar', fontsize=6, figsize=(150, 100))
    plt.show(block=True)
def __init__(self, h):
    """
    h: (int), length of the sequences
    """
    super(SequentialRecommender, self).__init__()
    self.name = 'sequential'
    self.h = h

    # build the sequences dataset and cache it
    self.sequences, self.target_indices = ps.get_sequences(h=h)

    target_ids = data.get_target_playlists()[0:data.N_SEQUENTIAL]
    self.target_ids = np.array(target_ids)
    self.already_liked_indices = (data.get_urm_train_1()[target_ids]).nonzero()
    self.H = seqsim.getH(self.sequences)
def get_r_hat(self, load_from_file=False, path=''):
    """
    Compute the r_hat for the model
    :return: r_hat only for the target playlists
    """
    if load_from_file:
        r_hat = sps.load_npz(path)
    else:
        if self.W_sparse is None:
            log.error('the recommender has not been trained, call the fit() method to compute W')
        r_hat = self.URM_train[data.get_target_playlists()].dot(self.W_sparse)
    return r_hat
def get_r_hat(self, load_from_file=False, path=''):
    """
    Compute the r_hat for the model
    :return: r_hat
    """
    if load_from_file:
        r_hat = sps.load_npz(path)
    else:
        if self.user_vecs is None:
            log.error('the recommender has not been trained, call the fit() method')
        s_user_vecs = sps.csr_matrix(self.user_vecs)
        s_item_vecs_t = sps.csr_matrix(self.item_vecs.T)
        r_hat = s_user_vecs[data.get_target_playlists()].dot(s_item_vecs_t)
    return r_hat
def run(self, distance, ucm_train=None, urm=None, urm_test=None, targetids=None, k=100, shrink=10,
        threshold=0, implicit=True, alpha=None, beta=None, l=None, c=None, with_scores=False,
        export=True, verbose=True):
    """
    Run the model and export the results to a file

    Parameters
    ----------
    distance : str, distance metric
    targetids : list, target user ids. If None, data.get_target_playlists() is used
    k : int, K nearest neighbours to consider
    shrink : float, shrink term used in the normalization
    threshold : float, all the values under this value are cut from the final result
    implicit : bool, if True treat the URM as implicit, otherwise consider explicit ratings (real values) in the URM

    Returns
    -------
    recs: (list) recommendations
    map10: (float) MAP10 for the provided recommendations
    """
    start = time.time()

    _ucm_train = data.get_ucm_train()
    _urm = data.get_urm_train_1()
    _urm_test = data.get_urm_test_1()
    _targetids = data.get_target_playlists()

    ucm_train = _ucm_train if ucm_train is None else ucm_train
    urm = _urm if urm is None else urm
    urm_test = _urm_test if urm_test is None else urm_test
    targetids = _targetids if targetids is None else targetids

    self.fit(ucm_train, k=k, distance=distance, alpha=alpha, beta=beta, c=c, l=l, shrink=shrink,
             threshold=threshold, implicit=implicit)
    recs = self.recommend_batch(targetids, urm=urm, with_scores=with_scores, verbose=verbose)

    map10 = None
    if len(recs) > 0:
        map10 = self.evaluate(recs, test_urm=urm_test, verbose=verbose)
    else:
        log.warning('No recommendations available, skip evaluation')

    if export:
        exportcsv(recs, path='submission', name='{}_{}'.format(self.name, distance), verbose=verbose)

    if verbose:
        log.info('Run in: {:.2f}s'.format(time.time() - start))

    return recs, map10
def validate(self, user_ids=d.get_target_playlists(), log_path=None, normalize_similarity=[False],
             damp_coeff=[1], add_zeros_quota=[1], loss_tolerance=[1e-6], iteration_limit=[30],
             use_incremental=[False]):
    if log_path is not None:
        orig_stdout = sys.stdout
        f = open(log_path + '/' + self.name + ' ' + time.strftime('_%H-%M-%S') + ' ' +
                 time.strftime('%d-%m-%Y') + '.txt', 'w')
        sys.stdout = f

    for ns in normalize_similarity:
        for dc in damp_coeff:
            for adq in add_zeros_quota:
                for lt in loss_tolerance:
                    for il in iteration_limit:
                        for ui in use_incremental:
                            print(self._print(normalize_similarity=ns,
                                              add_zeros_quota=adq,
                                              loss_tolerance=lt,
                                              iteration_limit=il,
                                              damp_coeff=dc,
                                              use_incremental=ui))
                            self.fit(ICM=d.get_icm(),
                                     URM_train=d.get_urm_train_1(),
                                     normalize_similarity=ns,
                                     add_zeros_quota=adq,
                                     loss_tolerance=lt,
                                     iteration_limit=il,
                                     damp_coeff=dc,
                                     use_incremental=ui)
                            recs = self.recommend_batch(user_ids, urm=d.get_urm_train_1())
                            self.evaluate(recs, d.get_urm_test_1())

    if log_path is not None:
        sys.stdout = orig_stdout
        f.close()
def cluster_users_by_top_pop_count(clip_perc, top_n=100, only_target=True):
    """
    Return the ids of the playlists containing at least the specified percentage of top popular
    tracks (in descending order based on the contained top pop tracks count)

    Parameters
    ----------
    clip_perc: (float) return only the playlists whose percentage of top pop tracks over the
        total tracks count is >= clip_perc
    top_n: consider only the top_n most popular tracks (it should be set equal to the max
        track count among all playlists)
    only_target: (bool) consider only the target playlists

    Returns
    -------
    List of playlist_id
    """
    playlists_df = data.get_playlists_df()
    #tot_interactions = playlists_df.shape[0]
    if only_target:
        # keep only the target playlists
        target_playlist_df = pd.DataFrame({'playlist_id': data.get_target_playlists()})
        playlists_df = playlists_df.merge(target_playlist_df)

    # track_id | count
    toptracks_df = playlists_df.groupby('track_id').size().reset_index(name='count')
    #toptracks_df['relative_count'] = toptracks_df['count'] / tot_interactions
    toptracks_df = toptracks_df.sort_values('count', ascending=False).head(top_n)

    # playlist_id | top_pop_count
    filtered_df = playlists_df.merge(toptracks_df)
    filtered_df = filtered_df.groupby('playlist_id').size().reset_index(name='top_pop_count')
    #filtered_df = filtered_df.sort_values('top_pop_count', ascending=False)

    # playlist_id | count | top_pop_count | perc
    playlists_count_df = playlists_df.groupby('playlist_id').size().reset_index(name='count')
    final_df = playlists_count_df.merge(filtered_df)
    final_df['perc'] = np.divide(final_df['top_pop_count'], final_df['count'])

    # keep only the playlists with a top pop percentage >= clip_perc
    final_df = final_df[final_df['perc'] >= clip_perc]
    final_df.sort_values(['perc', 'top_pop_count'], ascending=False, inplace=True)

    return final_df['playlist_id'].values
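# --- Illustrative sketch (not part of the original repo) ---
# The percentage computed above on a toy dataframe: for each playlist, the fraction of its
# tracks that belong to the top_n most popular tracks. All ids and values are made up.
import pandas as pd

def _example_top_pop_percentage(top_n=2):
    playlists_df = pd.DataFrame({'playlist_id': [0, 0, 0, 1, 1],
                                 'track_id':    [7, 8, 6, 7, 9]})
    toptracks_df = playlists_df.groupby('track_id').size().reset_index(name='count')
    toptracks_df = toptracks_df.sort_values('count', ascending=False).head(top_n)
    top_pop = playlists_df.merge(toptracks_df).groupby('playlist_id').size().reset_index(name='top_pop_count')
    totals = playlists_df.groupby('playlist_id').size().reset_index(name='count')
    final_df = totals.merge(top_pop)
    final_df['perc'] = final_df['top_pop_count'] / final_df['count']
    return final_df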
def run(self, normalize_similarity=False, add_zeros_quota=1, loss_tolerance=1e-6, iteration_limit=30,
        damp_coeff=1, use_incremental=False, export_results=True, export_r_hat=False,
        export_for_validation=False):
    if export_r_hat and export_for_validation:
        urm = d.get_urm_train_1()
    else:
        urm = d.get_urm()

    self.fit(ICM=d.get_icm(),
             URM_train=urm,
             normalize_similarity=normalize_similarity,
             add_zeros_quota=add_zeros_quota,
             loss_tolerance=loss_tolerance,
             iteration_limit=iteration_limit,
             damp_coeff=damp_coeff,
             use_incremental=use_incremental)

    if export_results:
        print('exporting results')
        recs = self.recommend_batch(d.get_target_playlists(), N=10, urm=urm, filter_already_liked=True,
                                    with_scores=False, items_to_exclude=[], verbose=False)
        importexport.exportcsv(recs, 'submission',
                               self._print(normalize_similarity=normalize_similarity,
                                           add_zeros_quota=add_zeros_quota,
                                           loss_tolerance=loss_tolerance,
                                           iteration_limit=iteration_limit,
                                           damp_coeff=damp_coeff,
                                           use_incremental=use_incremental))
    elif export_r_hat:
        print('saving estimated urm')
        self.save_r_hat(export_for_validation)
def fit(self, clip=7):
    sparse_pl, dense_pl = cluster.cluster_users_by_interactions_count(clip=clip)
    log.success('Cluster 1 (interactions count <= {}): {} playlists'.format(clip, len(sparse_pl)))
    log.success('Cluster 2 (interactions count > {}): {} playlists'.format(clip, len(dense_pl)))

    # keep only the target playlists in the 2 clusters
    s1 = set(sparse_pl)
    s2 = set(dense_pl)
    s_target = set(data.get_target_playlists())
    s1_target = s1 & s_target
    s2_target = s2 & s_target
    self.sparse_pl = list(s1_target)
    self.dense_pl = list(s2_target)
def test(self, distance=DistanceBasedRecommender.SIM_SPLUS, k=600, shrink=10, threshold=0,
         alpha=0.25, beta=0.5, l=0.5, c=0.25, export_results=True, export_r_hat=False):
    """
    Meant as a shortcut to run the model after the validation procedure, allowing the export
    of the scores on the playlists or of the estimated csr matrix
    """
    recs, map = self.run(urm=d.get_urm(),
                         icm=d.get_icm(),
                         targetids=d.get_target_playlists(),
                         distance=distance,
                         k=k,
                         shrink=shrink,
                         threshold=threshold,
                         alpha=alpha,
                         beta=beta,
                         l=l,
                         c=c,
                         export=export_results)
    if export_r_hat:
        print('saving estimated urm')
        self.save_r_hat()
    return recs, map
def _check_presence_test_samples(df_test, target='target'):
    """
    Check that in the test dataframe there is at least one track for each target playlist
    :param df_test: (pandas dataframe)
    :param target: (str) either 'all' or 'target'
    """
    if target == 'all':
        p = d.get_all_playlists()
    elif target == 'target':
        p = d.get_target_playlists()

    if len(df_test[df_test['playlist_id'].isin(p)].groupby('playlist_id')) != len(p):
        if target == 'all':
            print("WARNING: not all the playlists (ALL OF THEM) have a song in the test set")
        elif target == 'target':
            print("WARNING: not all the target playlists (JUST THE TARGETS) have a song in the test set")
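# --- Illustrative sketch (not part of the original repo) ---
# What the check above verifies, on toy data: every playlist we want to evaluate must appear
# at least once in the test dataframe. Names and ids below are made up.
import pandas as pd

def _example_presence_check():
    df_test = pd.DataFrame({'playlist_id': [0, 0, 2], 'track_id': [5, 6, 7]})
    playlist_ids = [0, 1, 2]
    n_present = df_test[df_test['playlist_id'].isin(playlist_ids)]['playlist_id'].nunique()
    return n_present == len(set(playlist_ids))   # False here: playlist 1 has no test track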
def validate(l1_ratio_array, alpha_array, max_iter_array, topK_array, userids=data.get_target_playlists(),
             urm_train=data.get_urm_train_1(), urm_test=data.get_urm_test_1(), filter_already_liked=True,
             items_to_exclude=[], N=10, verbose=True, write_on_file=True):
    """
    Validate the SLIM ElasticNet model over the given hyperparameter grid, writing the MAP@10
    of each combination on file.
    """
    # create the initial model
    recommender = SLIMElasticNetRecommender()

    path = 'validation_results/'
    name = 'slim_rmse'
    folder = time.strftime('%d-%m-%Y')
    filename = '{}/{}/{}{}.csv'.format(path, folder, name, time.strftime('_%H-%M-%S'))
    # create dir if it does not exist
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    with open(filename, 'w') as out:
        for l in l1_ratio_array:
            for a in alpha_array:
                for m in max_iter_array:
                    for k in topK_array:
                        # train the model with the parameters
                        if verbose:
                            print('\n\nTraining slim_rmse with\n l1_ratio: {}\n alpha: {}\n'
                                  'Iterations: {}\n topK: {}'.format(l, a, m, k))
                            print('\n training phase...')
                        recommender.fit(urm=urm_train, l1_ratio=l, alpha=a, max_iter=m, topK=k)

                        # get the recommendations from the trained model
                        recommendations = recommender.recommend_batch(userids=userids, N=N,
                                                                      filter_already_liked=filter_already_liked,
                                                                      items_to_exclude=items_to_exclude)
                        # evaluate the model with map@10
                        map10 = recommender.evaluate(recommendations, test_urm=urm_test)
                        if verbose:
                            print('map@10: {}'.format(map10))
                        # write the results on an external file in the validation folder
                        if write_on_file:
                            out.write('\n\nl1_ratio: {}\n alpha: {}\n Iterations: {}\n '
                                      'topK: {}\n evaluation map@10: {}'.format(l, a, m, k, map10))
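# --- Illustrative sketch (not part of the original repo) ---
# The nested loops above enumerate the full Cartesian product of the hyperparameter arrays;
# itertools.product expresses the same grid more compactly and could replace the four loops.
# The default values below are arbitrary examples.
from itertools import product

def _example_hyperparameter_grid(l1_ratio_array=(0.1, 0.5), alpha_array=(1e-4,),
                                 max_iter_array=(50, 100), topK_array=(100,)):
    return list(product(l1_ratio_array, alpha_array, max_iter_array, topK_array))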
def run(self, urm_train=None, urm=None, urm_test=None, targetids=None, factors=100, regularization=0.01,
        iterations=100, alpha=25, with_scores=False, export=True, verbose=True):
    """
    Run the model and export the results to a file

    Returns
    -------
    :return: recs: (list) recommendations
    :return: map10: (float) MAP10 for the provided recommendations
    """
    _urm_train = data.get_urm_train_1()
    _urm = data.get_urm()
    _icm = data.get_icm()
    _urm_test = data.get_urm_test_1()
    _targetids = data.get_target_playlists()
    # _targetids = data.get_all_playlists()

    start = time.time()

    urm_train = _urm_train if urm_train is None else urm_train
    urm = _urm if urm is None else urm
    urm_test = _urm_test if urm_test is None else urm_test
    targetids = _targetids if targetids is None else targetids

    self.fit(l1_ratio=0.1, positive_only=True, alpha=1e-4, fit_intercept=False, copy_X=False,
             precompute=False, selection='random', max_iter=100, topK=100, tol=1e-4,
             workers=multiprocessing.cpu_count())

    recs = self.recommend_batch(userids=targetids, with_scores=with_scores, verbose=verbose)

    map10 = None
    if len(recs) > 0:
        map10 = self.evaluate(recs, test_urm=urm_test, verbose=verbose)
    else:
        log.warning('No recommendations available, skip evaluation')

    if export:
        exportcsv(recs, path='submission', name=self.name, verbose=verbose)

    if verbose:
        log.info('Run in: {:.2f}s'.format(time.time() - start))

    return recs, map10
"""
If this file is executed, test the SLIM ElasticNet recommender
"""
if __name__ == '__main__':
    rec = SLIMElasticNetRecommender()
    rec.fit(urm=data.get_urm_train_1(), max_iter=1, topK=400, alpha=1e-4, l1_ratio=0.5)
    recs = rec.recommend_batch(userids=data.get_target_playlists())
    rec.evaluate(recommendations=recs, test_urm=data.get_urm_test_1())