def icm():
    """Evaluate a recommender built from the NLP item-content matrix.

    The function loads the offline dataset, builds the item-content matrix
    with ``NLP(mode='tracks')``, re-weights it with ``bm25_row``, computes a
    Tversky item-item similarity and multiplies the test-playlist rows of
    the URM by it.  The resulting estimated URM is converted to a
    recommendation list and handed to ``Evaluator.evaluate`` under the name
    ``'nlp_enriched'``.

    Takes no arguments and returns ``None``; progress messages and the time
    spent on the similarity step are printed to stdout.
    """
    datareader = Datareader(mode='offline', only_load=True)
    evaluator = Evaluator(datareader)

    print('NLP...')
    test_playlists = datareader.get_test_pids()
    nlp = NLP(datareader=datareader, stopwords=[], mode='tracks')

    print('Getting ucm and icm...')
    # Named item_content so the local no longer shadows the function name.
    item_content = bm25_row(nlp.get_icm())

    print('Computing similarity...')
    start = time.time()
    similarity = tversky_similarity(item_content, shrink=200, alpha=0.1, beta=1)
    similarity = similarity.tocsr()
    print(time.time() - start)

    urm = datareader.get_urm()

    print('Computing eurm...')
    # Only the rows of the test playlists are scored.
    eurm_nlp = dot_product(urm[test_playlists, :], similarity, k=500)
    eurm_nlp = eurm_nlp.tocsr()

    # Uncomment to persist the estimated URM:
    # sparse.save_npz(ROOT_DIR + '/data/eurm_nlp_weighted_offline.npz', eurm_nlp)
    evaluator.evaluate(eurm_to_recommendation_list(eurm_nlp), name='nlp_enriched')
class Top_pop_p(object):
    """Personalised top-popularity scores for the ``cat=2`` test playlists.

    For each such playlist the first track index stored in its URM row is
    used to select a group of playlists (those sharing the track's album, or
    those containing the track itself).  The per-track sums over that group
    become the playlist's scores.
    """

    def __init__(self):
        # NOTE(review): both readers are created with mode='online' and
        # urm_of is read from dr_on, so the "_on"/"_of" pairs hold the same
        # data here -- confirm whether an offline reader was intended.
        reader_kwargs = dict(verbose=False, mode='online', only_load=True)
        self.dr_on = Datareader(**reader_kwargs)
        self.dr_of = Datareader(**reader_kwargs)
        self.urm_on = self.dr_on.get_urm()
        self.urm_of = self.dr_on.get_urm()
        self.urm_col = sps.csc_matrix(self.urm_of)
        n_cols = self.urm_of.shape[1]
        self.top_p = np.zeros(n_cols)

    def _first_track(self, pid):
        """Return the first column index stored in row ``pid`` of ``urm_on``."""
        lo = self.urm_on.indptr[pid]
        hi = self.urm_on.indptr[pid + 1]
        return self.urm_on.indices[lo:hi][0]

    def _write_top_tracks(self, scores, pid, group):
        """Store in ``scores[pid]`` the largest column sums of ``urm_of[group]``."""
        totals = self.urm_of[group].sum(axis=0).A1.astype(np.int32)
        # Up to 501 column indices, ordered from the largest total down.
        ranked = totals.argsort()
        best = ranked[-501:][::-1]
        scores[pid, best] = totals[best]

    def album(self):
        """Score ``cat=2`` playlists from playlists sharing the first track's album.

        :return: matrix restricted to the rows of all test pids, after
                 ``eurm_remove_seed`` has been applied
        """
        scores = sps.lil_matrix(self.urm_of.shape)
        cat2_pids = self.dr_on.get_test_pids(cat=2)
        all_test_pids = self.dr_of.get_test_pids()
        album_cols = self.dr_of.get_ucm_albums().tocsc()
        track_to_album = self.dr_of.get_track_to_album_dict()

        for pid in tqdm(cat2_pids):
            album_id = track_to_album[self._first_track(pid)]
            lo = album_cols.indptr[album_id]
            hi = album_cols.indptr[album_id + 1]
            self._write_top_tracks(scores, pid, album_cols.indices[lo:hi])

        scores = scores.tocsr()[all_test_pids]
        return eurm_remove_seed(scores, self.dr_on)

    def track(self):
        """Score ``cat=2`` playlists from playlists containing the first track.

        Prints the resulting matrix before returning it.

        :return: copy of the matrix restricted to the rows of all test
                 pids, after ``eurm_remove_seed`` has been applied
        """
        scores = sps.lil_matrix(self.urm_of.shape)
        cat2_pids = self.dr_on.get_test_pids(cat=2)
        all_test_pids = self.dr_of.get_test_pids()

        for pid in tqdm(cat2_pids):
            first = self._first_track(pid)
            lo = self.urm_col.indptr[first]
            hi = self.urm_col.indptr[first + 1]
            self._write_top_tracks(scores, pid, self.urm_col.indices[lo:hi])

        scores = scores.tocsr()[all_test_pids]
        scores = eurm_remove_seed(scores, self.dr_on)
        print(scores)
        return scores.copy()
def prova():
    """Scratch experiment around the playlist-level NLP user-content matrix.

    As written, the function loads the offline dataset, prints the
    artist-to-tracks dictionary and then terminates the interpreter through
    ``exit()``.  Everything after that first ``exit()`` is unreachable; a
    second ``exit()`` further down would likewise stop execution before the
    similarity / evaluation section if the first one were removed.
    """
    dr = Datareader(mode='offline', only_load=True)
    print(dr.get_artist_to_tracks_dict())
    # Ends the process here: none of the code below is executed.
    exit()

    # --- unreachable: build the playlist UCM and enrich it with a
    # --- precomputed similarity loaded from disk.
    dr = Datareader(mode='offline', only_load=True, verbose=False)
    test_playlists = dr.get_test_pids()
    # NOTE(review): `stopwords` is never read (STOP_WORDS is passed
    # directly); `token_weights` is only referenced in the commented-out
    # column scaling further down.
    stopwords = STOP_WORDS
    token_weights = np.array(TOKEN_WEIGHTS)
    nlp = NLP(mode='playlists', datareader=dr, stopwords=STOP_WORDS)
    s = nlp.get_ucm()
    print(s.shape)
    evaluator = Evaluator(dr)
    # get_ucm() is called a second time; `s` above is only used for the
    # shape print.
    ucm = nlp.get_ucm()
    sim = sparse.load_npz(ROOT_DIR + '/data/cf_user_similarity.npz')
    print('Computing dot...')
    ucm = dot_product(sim, ucm, k=200)
    print('NNZ', ucm.nnz)
    # Second hard stop: the evaluation below is unreachable even without
    # the first exit().
    exit()

    # --- unreachable: similarity on the enriched UCM, then evaluation.
    urm = dr.get_urm()
    # ucm = ucm.astype(np.float64)
    # inplace_csr_column_scale(ucm, token_weights)
    print('Computing similarity...')
    start = time.time()
    # Compute similarity
    similarity = tversky_similarity(ucm, shrink=200, alpha=0.1, beta=1)
    similarity = similarity.tocsr()
    print(time.time() - start)
    print('Computing eurm...')
    # NOTE(review): this second `start` is never read afterwards.
    start = time.time()
    # Compute eurm
    eurm_nlp = dot_product(similarity, urm, k=500)
    eurm_nlp = eurm_nlp.tocsr()
    # Keep only the rows of the test playlists.
    eurm_nlp = eurm_nlp[test_playlists, :]
    #sparse.save_npz(ROOT_DIR + '/data/eurm_nlp_weighted_offline.npz', eurm_nlp)
    evaluator.evaluate(eurm_to_recommendation_list(eurm_nlp), name='nlp_enriched')
import sys
from scipy import sparse
import numpy as np
import utils.pre_processing as pre
from utils.definitions import *
from utils.datareader import Datareader
from utils.evaluator import Evaluator
from utils.pre_processing import *
from utils.post_processing import *

# Grid evaluation of a linear blend between a precomputed NLP estimated URM
# and a top-pop estimated URM obtained from
# Datareader.get_eurm_top_pop_filter_cat_1, for several values of its `k`
# argument and several blending weights `a`.

dr = Datareader(mode='offline', only_load=True, verbose=False)
ev = Evaluator(dr)
# NOTE(review): urm / urm_csc are not referenced again in this script.
urm = dr.get_urm(binary=True)
urm_csc = urm.tocsc(copy=True)

sim_nlp = sparse.load_npz(ROOT_DIR + '/data/sim_nlp_lele.npz')

# The NLP matrix does not depend on k or a: load and L1-normalise it once
# instead of re-reading the file on every iteration of the outer loop.
eurm_nlp = sparse.load_npz(ROOT_DIR + '/data/nlp_fusion_tuned_offline.npz')
eurm_nlp = norm_l1_row(eurm_nlp)

for k in [1, 2, 3, 4, 5]:
    eurm_top = dr.get_eurm_top_pop_filter_cat_1(sim_nlp, k, topk=500)
    eurm_top = norm_l1_row(eurm_top)

    for a in [0.05, 0.10, 0.15, 0.20]:
        # Convex combination: weight a on top-pop, (1 - a) on NLP.
        eurm = eurm_nlp * (1.0 - a) + eurm_top * a
        rec_list = eurm_to_recommendation_list(eurm, datareader=dr)
        ev.evaluate(rec_list, name='pop_first_k=' + str(k) + '_a=' + str(a))
from utils.datareader import Datareader from utils.evaluator import Evaluator from utils.post_processing import eurm_remove_seed from utils.post_processing import eurm_to_recommendation_list import numpy as np import scipy.sparse as sps from tqdm import tqdm from utils.definitions import * from utils.post_processing import eurm_remove_seed, append_rec_list dr = Datareader(verbose=False, mode='offline', only_load=True) urm = dr.get_urm() urm_col = sps.csc_matrix(urm) top_p = np.zeros(urm.shape[1]) rec = [] eurm1 = sps.lil_matrix(urm.shape) eurm2 = sps.lil_matrix(urm.shape) print(eurm1.shape) pids = dr.get_test_pids(cat=2) pids_all = dr.get_test_pids() # TopPop Album # ucm_album = dr.get_ucm_albums().tocsc() # album_dic = dr.get_track_to_album_dict() # TopPop Artist ucm_album = dr.get_ucm_albums().tocsc() artists_dic = dr.get_track_to_artist_dict() album_to_tracks = load_obj(name="album_tracks_dict_offline",
nlp = NLP2(dr, stopwords=[], norm=True, work=True, split=True, date=False, skip_words=True, porter=False, porter2=True, lanca=False, lanca2=True) ucm_csr = nlp.get_UCM(data1=True).tocsr() ucm_csc = ucm_csr.tocsc(copy=True) urm_csr = dr.get_urm().tocsr() urm_csc = urm_csr.tocsc(copy=True) test_playlists = dr.get_test_pids(cat=1) # test_playlists.extend(dr.get_test_pids(cat=2)) rec_list = [[] for x in range(10000)] i = 0 for playlist_id in tqdm(test_playlists): tokens = ucm_csr.indices[ucm_csr.indptr[playlist_id]:ucm_csr. indptr[playlist_id + 1]] playlists_with_tokens = [] for token in tokens: playlists_with_tokens.extend( ucm_csc.indices[ucm_csc.indptr[token]:ucm_csc.indptr[token +
# Within the visible part of the file, knn and topk are only used to build
# the run name below.
knn = 100
topk = 750
# `mode` is not defined in this part of the file; it is expected to be set
# earlier at module level.
complete_name = "maurizio_" + mode + "__knn=" + str(knn) + "_topk=" + str(topk)

if __name__ == '__main__':
    # NOTE(review): the similarity path contains "offline" regardless of
    # the value of `mode` -- confirm this is intended for other modes.
    sim = sps.load_npz(ROOT_DIR + "/similarities/offline-similarity_rp3beta_knn100.npz")
    dr = Datareader(mode=mode, only_load=True)

    ######### MAURIZ
    ICM = dr.get_icm(alid=True)
    # Fit CFW_D_Similarity_Linalg with `sim` as the target similarity; a
    # copy of the ICM is passed, and the original ICM is reused below.
    cfw = CFW_D_Similarity_Linalg(URM_train=dr.get_urm(), ICM=ICM.copy(), S_matrix_target=sim, URM_validation=None)
    cfw.fit()
    # Diagonal matrix built from the fitted D_best vector.
    weights = sps.diags(cfw.D_best)
    # NOTE(review): this file is overwritten two lines below, because both
    # save_npz calls use the same path "ICM_fw_maurizio" -- confirm whether
    # the weights were meant to go to a separate file.
    sps.save_npz("ICM_fw_maurizio", weights)
    # Apply the diagonal weights to the ICM columns.
    ICM_weighted = ICM.dot(weights)
    sps.save_npz("ICM_fw_maurizio", ICM_weighted)

    ######## NOI
from recommenders.similarity.s_plus import tversky_similarity
from utils.evaluator import Evaluator
from utils.datareader import Datareader
from utils.post_processing import *
from tqdm import tqdm
from scipy import sparse
import utils.sparse as ut
import pandas as pd
import numpy as np
import sys

# Script: build a Tversky similarity from the album user-content matrix
# (with its 100 highest-sum columns masked) and multiply it by the URM.

datareader = Datareader(mode='offline', only_load=True, verbose=False)
evaluator = Evaluator(datareader)
urm = datareader.get_urm()
# presumably playlists x albums -- TODO confirm against Datareader
ucm_album = datareader.get_ucm_albums()

# Column sums of the album matrix, flattened to a 1-D array.
albums_pop = ucm_album.sum(axis=0).A1
# Column indices of the 100 largest sums (descending order).
mask = np.argsort(albums_pop)[::-1][:100]
# presumably zeroes the masked columns in place, given that the return
# value is not used -- TODO confirm in utils.sparse
ut.inplace_set_cols_zero(ucm_album, mask)
# NOTE(review): bm25_row is not imported by name here; it has to come from
# one of the star imports -- confirm.
ucm_album = bm25_row(ucm_album)

print('Similarity..')
# Similarity between the matrix and its own transpose.
sim = tversky_similarity(ucm_album, ucm_album.T, shrink=200, alpha=0.1, beta=1, k=800, verbose=1, binary=False)
sim = sim.tocsr()

test_pids = list(datareader.get_test_pids())
# Product of the similarity with the URM; k=750 is passed to dot_product.
eurm = dot_product(sim, urm, k=750)
import warnings warnings.warn( 'This function still use the old version of the remove seed, it should be replaced soon by the one in post_processing class' ) self.urm = sps.csr_matrix(self.urm[self.pid]) tmp = self.urm.tocoo() row = tmp.row col = tmp.col min = self.eurm.tocoo().min() self.eurm = sps.lil_matrix(self.eurm) self.eurm[row, col] = -1 self.eurm = sps.csr_matrix(self.eurm) return self.eurm if __name__ == '__main__': from utils.datareader import Datareader dr = Datareader(verbose=False, mode="offline", only_load="False") rec = Top_pop() rec.fit(dr.get_urm(), dr.get_test_playlists().transpose()[0]) eurm = rec.compute_rating().tocsr() sps.save_npz("top_pop online.npz", eurm.tocsr()) exit() import utils.evaluator as ev from utils.post_processing import eurm_to_recommendation_list eva = ev.Evaluator(dr) eva.evaluate(eurm_to_recommendation_list(eurm), "cacca TOPTOP")
# idft = np.power(idft, 0.5) # norm_urm = self.urm.multiply(idft.reshape(1,-1)).tocsr() # Computer the eURM self.eurm = dot_product(norm_urm, self.model, k=top_k) self.eurm = sps.csr_matrix(self.eurm) if verbose: print("time: " + str(int(time.time() - start_time) / 60)) return self.eurm if __name__ == '__main__': dr = Datareader(verbose=False, mode='offline', only_load=True) urm = dr.get_urm(binary=False) pid = dr.get_test_pids() position_urm = dr.get_position_matrix(position_type='last') pos_urm = position_urm.T.tocoo().tocsr() ev = Evaluator(dr) knn = 100 topk = 750 rec = TF_collaborative_item() # for knn in range(50, 300, 50): rec.fit(urm, pid) rec.compute_model(verbose=True, knn=knn, power=0.6, save_model=False) # rec.model = rec.model.tocsr() # rec.model.eliminate_zeros() #
class Top_pop_p(object):
    '''
    Class that allows the user to get the personalized top pop built
    following track or album.

    Both public methods load the dataset for the requested mode, score the
    ``cat=2`` test playlists and return the scores for all test playlists.
    '''

    # The only values of ``mode`` the two public methods accept.
    _MODES = ("online", "offline")

    def __init__(self):
        # Nothing to initialise: every attribute is (re)created by the
        # get_top_pop_* methods for the mode they are called with.
        pass

    def _load(self, mode):
        '''
        Validate ``mode``, load its Datareader and URM and publish them on
        the instance under the same attribute names as before
        (``dr_on``/``urm_on`` or ``dr_of``/``urm_of``, plus ``urm_col`` and
        ``top_p``).

        :param mode: "online" or "offline"
        :return: tuple (datareader, urm)
        :raises ValueError: if ``mode`` is not one of the two accepted values
        '''
        if mode not in self._MODES:
            # Previously an unknown mode fell through both branches and
            # failed later with an UnboundLocalError on ``eurm``.
            raise ValueError(
                "mode must be 'online' or 'offline', got %r" % (mode,))

        dr = Datareader(verbose=False, mode=mode, only_load=True)
        urm = dr.get_urm()
        if mode == "online":
            self.dr_on, self.urm_on = dr, urm
        else:
            self.dr_of, self.urm_of = dr, urm
        self.urm_col = sps.csc_matrix(urm)
        self.top_p = np.zeros(urm.shape[1])
        return dr, urm

    @staticmethod
    def _first_track(urm, row):
        '''Return the first column index stored in ``row`` of the CSR ``urm``.'''
        return urm.indices[urm.indptr[row]:urm.indptr[row + 1]][0]

    @staticmethod
    def _score_row(eurm, urm, row, playlists):
        '''Write into ``eurm[row]`` the largest column sums of ``urm[playlists]``.'''
        top = urm[playlists].sum(axis=0).A1.astype(np.int32)
        # Up to 501 column indices, from the largest sum down.
        # presumably 500 recommendations plus the seed track that
        # eurm_remove_seed drops afterwards -- TODO confirm
        track_ind_rec = top.argsort()[-501:][::-1]
        eurm[row, track_ind_rec] = top[track_ind_rec]

    @staticmethod
    def _finalize(eurm, pids_all, dr):
        '''Keep the rows of all test pids, remove the seeds and return a CSR copy.'''
        eurm = eurm.tocsr()[pids_all]
        eurm = eurm_remove_seed(eurm, dr)
        return eurm.copy().tocsr()

    def get_top_pop_album(self, mode):
        '''
        :param mode: "online" or "offline", selects the dataset to load
        :return: csr_matrix filled with the recommendation for the cat 2
                 following album
        :raises ValueError: if ``mode`` is neither "online" nor "offline"
        '''
        dr, urm = self._load(mode)
        eurm = sps.lil_matrix(urm.shape)
        pids = dr.get_test_pids(cat=2)
        pids_all = dr.get_test_pids()
        ucm_album = dr.get_ucm_albums().tocsc()
        album_dic = dr.get_track_to_album_dict()

        for row in tqdm(pids):
            album = album_dic[self._first_track(urm, row)]
            # Row indices stored in the album's column of the CSC matrix.
            playlists = ucm_album.indices[
                ucm_album.indptr[album]:ucm_album.indptr[album + 1]]
            self._score_row(eurm, urm, row, playlists)

        return self._finalize(eurm, pids_all, dr)

    def get_top_pop_track(self, mode):
        '''
        :param mode: "online" or "offline", selects the dataset to load
        :return: csr_matrix filled with the recommendation for the cat 2
                 following track
        :raises ValueError: if ``mode`` is neither "online" nor "offline"
        '''
        dr, urm = self._load(mode)
        eurm = sps.lil_matrix(urm.shape)
        pids = dr.get_test_pids(cat=2)
        pids_all = dr.get_test_pids()

        for row in tqdm(pids):
            track_ind = self._first_track(urm, row)
            # Row indices stored in the track's column of the CSC URM.
            playlists = self.urm_col.indices[
                self.urm_col.indptr[track_ind]:
                self.urm_col.indptr[track_ind + 1]]
            self._score_row(eurm, urm, row, playlists)

        return self._finalize(eurm, pids_all, dr)
import sys
from scipy import sparse
import numpy as np
import utils.pre_processing as pre
from utils.definitions import *
from utils.datareader import Datareader
from utils.evaluator import Evaluator
from utils.pre_processing import *
from utils.post_processing import *
from fast_import import *

# Script: build a CF_UB_BM25 model on the URM returned by
# get_urm_with_position(1), then recommend and evaluate with the plain URM.

dr = Datareader(mode='offline', only_load=True, verbose=False)
# NOTE(review): `ev` is not referenced again in this script.
ev = Evaluator(dr)
# Two URMs are loaded: `urm` is used to construct the recommender,
# `urm_std` replaces it before recommending.
urm = dr.get_urm_with_position(1)
urm_std = dr.get_urm()

rec = CF_UB_BM25(urm=urm, datareader=dr, verbose_evaluation=False)
rec.model(alpha=1, beta=0, k=250)
# Swap the recommender's URM after the model has been built; the order of
# this assignment relative to model() and fast_recommend() matters.
rec.urm = urm_std
rec.fast_recommend()
res = rec.fast_evaluate_eurm()
# Only the element at index 1 of the evaluation result is printed.
print(res[1])