class Top_pop_p(object):
    """Personalized top-pop recommender for category-2 test playlists.

    Cat-2 playlists expose exactly one seed track. For each such playlist the
    recommendation is the 501 most popular tracks among the playlists that
    share the seed track's album (`album`) or the seed track itself (`track`);
    the seed is stripped afterwards by `eurm_remove_seed`.
    """

    def __init__(self):
        # NOTE(review): both readers are built in 'online' mode even though one
        # is named dr_of ("offline") — confirm whether dr_of should use
        # mode='offline' (cf. the other Top_pop_p variant in this codebase).
        self.dr_on = Datareader(verbose=False, mode='online', only_load=True)
        self.dr_of = Datareader(verbose=False, mode='online', only_load=True)
        self.urm_on = self.dr_on.get_urm()
        # FIX: urm_of was taken from dr_on; read it from dr_of as the naming
        # intends (identical result while both readers share the same mode).
        self.urm_of = self.dr_of.get_urm()
        # Column-sliceable copy: track index -> playlists containing it.
        self.urm_col = sps.csc_matrix(self.urm_of)
        self.top_p = np.zeros(self.urm_of.shape[1])

    def album(self):
        """Return the estimated URM (eurm) built from album co-occurrence.

        For every cat-2 playlist, tracks are ranked by their popularity within
        the playlists that contain at least one track of the seed's album.

        :return: csr_matrix restricted to the full test-pid ordering.
        """
        eurm = sps.lil_matrix(self.urm_of.shape)
        pids = self.dr_on.get_test_pids(cat=2)
        pids_all = self.dr_of.get_test_pids()
        ucm_album = self.dr_of.get_ucm_albums().tocsc()
        album_dic = self.dr_of.get_track_to_album_dict()
        for row in tqdm(pids):
            # First (and only) seed track of the cat-2 playlist.
            track_ind = self.urm_on.indices[self.urm_on.indptr[row]:self.urm_on.indptr[row + 1]][0]
            album = album_dic[track_ind]
            # Playlists holding tracks of that album (CSC column slice).
            playlists = ucm_album.indices[ucm_album.indptr[album]:ucm_album.indptr[album + 1]]
            top = self.urm_of[playlists].sum(axis=0).A1.astype(np.int32)
            # Keep the 501 most popular tracks (501 so the seed can be removed
            # and 500 recommendations remain).
            track_ind_rec = top.argsort()[-501:][::-1]
            eurm[row, track_ind_rec] = top[track_ind_rec]
        eurm = eurm.tocsr()[pids_all]
        eurm = eurm_remove_seed(eurm, self.dr_on)
        return eurm

    def track(self):
        """Return the estimated URM built from seed-track co-occurrence.

        For every cat-2 playlist, tracks are ranked by their popularity within
        the playlists that also contain the seed track.

        :return: csr_matrix restricted to the full test-pid ordering.
        """
        eurm = sps.lil_matrix(self.urm_of.shape)
        pids = self.dr_on.get_test_pids(cat=2)
        pids_all = self.dr_of.get_test_pids()
        for row in tqdm(pids):
            track_ind = self.urm_on.indices[self.urm_on.indptr[row]:self.urm_on.indptr[row + 1]][0]
            # Playlists containing the seed track (CSC column slice).
            playlists = self.urm_col.indices[self.urm_col.indptr[track_ind]:self.urm_col.indptr[track_ind + 1]]
            top = self.urm_of[playlists].sum(axis=0).A1.astype(np.int32)
            track_ind_rec = top.argsort()[-501:][::-1]
            eurm[row, track_ind_rec] = top[track_ind_rec]
        eurm = eurm.tocsr()[pids_all]
        eurm = eurm_remove_seed(eurm, self.dr_on)
        # FIX: removed leftover debug print(eurm).
        return eurm.copy()
def icm():
    """Evaluate an NLP content-based model built on the track ICM.

    Builds a BM25-weighted item content matrix from track-level NLP tokens,
    computes a Tversky item-item similarity, derives the eurm for the test
    playlists (top-500 per playlist) and evaluates it as 'nlp_enriched'.

    Side effects: prints progress/timings and runs the Evaluator.
    """
    datareader = Datareader(mode='offline', only_load=True)
    evaluator = Evaluator(datareader)
    print('NLP...')
    # FIX: dropped unused locals (stopwords, token_weights) — the NLP below is
    # deliberately built with an empty stopword list.
    test_playlists = datareader.get_test_pids()
    nlp = NLP(datareader=datareader, stopwords=[], mode='tracks')
    print('Getting ucm and icm...')
    icm = nlp.get_icm()
    icm = bm25_row(icm)
    print('Computing similarity...')
    start = time.time()
    # Compute similarity
    similarity = tversky_similarity(icm, shrink=200, alpha=0.1, beta=1)
    similarity = similarity.tocsr()
    print(time.time() - start)
    urm = datareader.get_urm()
    print('Computing eurm...')
    start = time.time()
    # Compute eurm: propagate each test playlist's tracks through the
    # item-item similarity, keeping the 500 best-scored tracks.
    eurm_nlp = dot_product(urm[test_playlists, :], similarity, k=500)
    eurm_nlp = eurm_nlp.tocsr()
    # sparse.save_npz(ROOT_DIR + '/data/eurm_nlp_weighted_offline.npz', eurm_nlp)
    evaluator.evaluate(eurm_to_recommendation_list(eurm_nlp), name='nlp_enriched')
def prova():
    # Scratch/debug routine ("prova" = Italian for "test"): the exit() calls
    # below terminate the process, so only the first print actually runs in
    # the current state; everything after is kept for manual experimentation.
    dr = Datareader(mode='offline', only_load=True)
    print(dr.get_artist_to_tracks_dict())
    exit()
    # --- unreachable below this first exit() ---
    dr = Datareader(mode='offline', only_load=True, verbose=False)
    test_playlists = dr.get_test_pids()
    stopwords = STOP_WORDS
    token_weights = np.array(TOKEN_WEIGHTS)
    nlp = NLP(mode='playlists', datareader=dr, stopwords=STOP_WORDS)
    s = nlp.get_ucm()
    print(s.shape)
    evaluator = Evaluator(dr)
    ucm = nlp.get_ucm()
    # Precomputed collaborative user-user similarity used to enrich the UCM.
    sim = sparse.load_npz(ROOT_DIR + '/data/cf_user_similarity.npz')
    print('Computing dot...')
    ucm = dot_product(sim, ucm, k=200)
    print('NNZ', ucm.nnz)
    exit()
    # --- unreachable below this second exit() ---
    urm = dr.get_urm()
    # ucm = ucm.astype(np.float64)
    # inplace_csr_column_scale(ucm, token_weights)
    print('Computing similarity...')
    start = time.time()
    # Compute similarity
    similarity = tversky_similarity(ucm, shrink=200, alpha=0.1, beta=1)
    similarity = similarity.tocsr()
    print(time.time() - start)
    print('Computing eurm...')
    start = time.time()
    # Compute eurm
    eurm_nlp = dot_product(similarity, urm, k=500)
    eurm_nlp = eurm_nlp.tocsr()
    eurm_nlp = eurm_nlp[test_playlists, :]
    #sparse.save_npz(ROOT_DIR + '/data/eurm_nlp_weighted_offline.npz', eurm_nlp)
    evaluator.evaluate(eurm_to_recommendation_list(eurm_nlp), name='nlp_enriched')
def reorder_test_playlists():
    """Rewrite the test-playlists csv so its rows follow category order 1..10.

    Reads the csv referenced by the (name-mangled) private Datareader
    attributes, reindexes it by the concatenated per-category test pids and
    saves it back in place.
    """
    reader = Datareader(test_num=1, mode='offline', only_load=True)
    # The csv location lives in Datareader's private attributes.
    csv_path = reader._Datareader__path + reader._Datareader__test_playlist_file
    # Target row order: all test pids, category by category.
    ordered_pids = []
    for cat in range(1, 11):
        ordered_pids.extend(reader.get_test_pids(cat=cat))
    frame = pd.read_csv(csv_path, sep='\t', encoding='utf-8')
    # Reindex by pid in category order, then restore pid as a plain column
    # so it survives index=False on save.
    frame = frame.set_index(['pid']).reindex(ordered_pids)
    frame['pid'] = frame.index
    frame.to_csv(csv_path, sep='\t', index=False, encoding='utf-8')
def new():
    """Evaluate an NLP model matching playlists to tracks directly.

    eurm = UCM (playlist tokens) x ICM^T (track tokens), both BM25-weighted,
    so a track scores by the token overlap between its metadata and the
    playlist title. Saves the eurm and evaluates it as 'nlp_new_method'.
    """
    datareader = Datareader(mode='offline', only_load=True)
    evaluator = Evaluator(datareader)
    print('NLP...')
    # FIX: dropped unused locals (stopwords, token_weights) and commented-out
    # dead code — the NLP below is deliberately built with no stopwords.
    test_playlists = datareader.get_test_pids()
    nlp = NLP(datareader=datareader, stopwords=[], mode='both')
    print('Getting ucm and icm...')
    ucm = nlp.get_ucm()
    ucm = bm25_row(ucm)
    icm = nlp.get_icm()
    icm = bm25_row(icm)
    icm_T = icm.T
    print('Computing eurm...')
    start = time.time()
    # Top-500 tracks per test playlist by playlist-token / track-token match.
    eurm_nlp = dot_product(ucm[test_playlists, :], icm_T, k=500)
    print(time.time() - start)
    print('Converting to csr...')
    eurm_nlp = eurm_nlp.tocsr()
    print(eurm_nlp.shape)
    sparse.save_npz(ROOT_DIR + '/data/eurm_nlp_new_method_offline.npz', eurm_nlp)
    evaluator.evaluate(eurm_to_recommendation_list(eurm_nlp), name='nlp_new_method',
                       show_plot=False)
s = s.sort(key=lambda x: x[0]) print("> done") return test_known_tracks verbose = False if __name__ == "__main__": dr = Datareader(mode='offline', train_format='50k', verbose=False, only_load=True) ev = Evaluator(dr) test_known_tracks = build_test_dict(dr) test_pids_cat2 = dr.get_test_pids(cat=2) rec_list = np.zeros(shape=(10000,500)) pred = np.zeros(shape=(10000, 2262292)) for i in tqdm(range(1000,2000)): # print("prima target") # print(test_pids_cat2[0]) # print(test_known_tracks[test_pids_cat2[0]]) # print([x[1] for x in test_known_tracks[test_pids_cat2[0]]]) # # print("start") sequences = urm_to_sequences(urm_pos=dr.get_position_matrix(position_type='last'), target_list=[x[1] for x in test_known_tracks[test_pids_cat2[0]]], min_common=1)
work=True, split=True, date=False, skip_words=True, porter=False, porter2=True, lanca=False, lanca2=True) ucm_csr = nlp.get_UCM(data1=True).tocsr() ucm_csc = ucm_csr.tocsc(copy=True) urm_csr = dr.get_urm().tocsr() urm_csc = urm_csr.tocsc(copy=True) test_playlists = dr.get_test_pids(cat=1) # test_playlists.extend(dr.get_test_pids(cat=2)) rec_list = [[] for x in range(10000)] i = 0 for playlist_id in tqdm(test_playlists): tokens = ucm_csr.indices[ucm_csr.indptr[playlist_id]:ucm_csr. indptr[playlist_id + 1]] playlists_with_tokens = [] for token in tokens: playlists_with_tokens.extend( ucm_csc.indices[ucm_csc.indptr[token]:ucm_csc.indptr[token + 1]]) urm_tmp = urm_csr[playlists_with_tokens]
if skip_words: nome+="skipw_" if porter: nome+="porter_" if porter2: nome+="porter2_" if lanca: nome+="lanca_" if lanca2: nome+="lanca2_" if data1: nome+="data1_" nlp = NLP2(dr, stopwords=[], norm=norm,work=work,split=split,date=date, skip_words=skip_words, porter=porter,porter2=porter2,lanca=lanca,lanca2=lanca2) # new_titles, occ_full, occ_single = nlp.fit( verbose=False, workout=True, normalize=True, date=True, lancaster=False, # porter=False, underscore=True, double_fit=False) ucm = nlp.get_UCM(data1=data1) urm = dr.get_urm() test_playlists = dr.get_test_pids() print('ucm', ucm.shape) print('Computing similarity...') start = time.time() # Compute similarity ucm= bm25_row(ucm) similarity = tversky_similarity(ucm, binary=False, shrink=1, alpha=0.1, beta=1) similarity = similarity.tocsr() print(time.time() - start) print('Computing eurm...') start = time.time() # Compute eurm eurm = dot_product(similarity, urm, k=500)
def convert(path):
    """Convert the challenge_set.json into tab-separated csv files.

    Produces test_playlists.csv (playlist metadata, pid first, ordered by
    category 1..10) and test_interactions.csv (pid/tid/pos rows).

    :param path: directory containing challenge_set.json.

    NOTE(review): outputs are written under the module-level `path_original`,
    NOT under the `path` argument — confirm this asymmetry is intended.
    """
    # LOAD DATA — FIX: close the json file instead of leaking the handle.
    with open(path + "/challenge_set.json") as json_file:
        data = json.load(json_file)
    dr = Datareader(mode='online', only_load=True, verbose=False)

    # CHALLENGE PLAYLISTS
    target_playlists = data['playlists']

    # Drop tracks and reorder
    target_playlists_df = pd.DataFrame(target_playlists)
    target_playlists_df = target_playlists_df.drop(['tracks'], axis=1)
    target_playlists_df.sort_values(by=['pid'], inplace=True)

    # Set pid as first column
    cols = target_playlists_df.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    target_playlists_df = target_playlists_df[cols]

    # Save csv file
    target_playlists_df.to_csv(path_original + 'test_playlists.csv', sep='\t', index=False)

    # Dict uri -> tid
    tracks_df = dr.get_df_tracks()
    values = list(tracks_df['tid'].as_matrix())
    keys = list(tracks_df['track_uri'].as_matrix())
    uri_to_tid = dict(zip(keys, values))

    # CHALLENGE INTERACTIONS: three parallel columns pid / tid / pos.
    interactions = [[], [], []]
    for p in tqdm(range(len(target_playlists))):
        tracks = data["playlists"][p]["tracks"]
        playlist_id = data["playlists"][p]["pid"]
        interactions[0].extend([playlist_id] * len(tracks))
        for t in range(len(tracks)):
            # FIX: append single items instead of extend([x]).
            interactions[1].append(uri_to_tid[tracks[t]["track_uri"]])
            interactions[2].append(tracks[t]["pos"])

    d = {'pid': interactions[0], 'tid': interactions[1], 'pos': interactions[2]}
    all_interactions = pd.DataFrame(d)
    all_interactions.sort_values(by=['pid'], inplace=True)
    all_interactions.to_csv(path_original + 'test_interactions.csv', sep='\t', index=False)

    # Playlists reordering: resave test_playlists.csv ordered by category.
    pids = []
    for i in range(1, 11):
        pids.extend(dr.get_test_pids(cat=i))
    test_playlists_df = pd.read_csv(path_original + 'test_playlists.csv', sep='\t', encoding='utf-8')
    test_playlists_df = test_playlists_df.set_index(['pid'])
    test_playlists_df = test_playlists_df.reindex(pids)
    test_playlists_df['pid'] = test_playlists_df.index
    test_playlists_df.to_csv(path_original + 'test_playlists.csv', sep='\t', index=False,
                             encoding='utf-8')
topk) if mode == "offline": """Test Set""" #Data initialization dr = Datareader(verbose=False, mode=mode, only_load=True) #Evaluetor initialization #Recommender algorithm initialization rec = Knn_collabrative_user() #Getting for the recommender algorithm urm = dr.get_urm() urm.data = np.ones(len(urm.data)) pid = dr.get_test_pids() # Depopularize top = urm.sum(axis=0).A1 mask = np.argsort(top)[::-1][:2000] ut.inplace_set_cols_zero(urm, mask) #Fitting data rec.fit(urm, pid) #Computing similarity/model rec.compute_model(top_k=knn, sm_type=tversky_similarity, shrink=200, alpha=0.1, beta=1,
from utils.definitions import * # Datareader dr = Datareader(mode='online', only_load=True) #ev = Evaluator(dr) # Dataframe with interactions df_train = dr.get_df_train_interactions() df_test = dr.get_df_test_interactions() df = pd.concat([df_train, df_test], axis=0, join='outer') playlists = df['pid'].as_matrix() tracks = df['tid'].as_matrix() dictionary = dr.get_track_to_artist_dict() pids = list(dr.get_train_pids()) + list(dr.get_test_pids()) # URM urm = dr.get_urm() urm = urm[pids] print(urm.shape) print('artists...') artists = [dictionary[t] for t in tracks] print('ucm...') ucm = sparse.csr_matrix((np.ones(len(playlists)), (playlists, artists)), shape=(1049361, len(dr.get_artists()))) ucm = ucm.tocsr() ucm = ucm[pids] print(ucm.shape)
test_interactions_df.sort_values(['pos'], ascending=True) test_playlists_df = dr.get_df_test_playlists() test_playlists = test_playlists_df['pid'].as_matrix() # A list of list [pos, tid] for each playlist sorted by pos test_known_tracks = test_interactions_df.groupby( ['pid'])[['pos', 'tid']].apply(lambda x: x.values.tolist()) for s in test_known_tracks: s = s.sort(key=lambda x: x[0]) print("> done") urm_pos = dr.get_position_matrix(position_type='last') print("urm pos loaded") test_pids_nine = dr.get_test_pids(cat=9) print(test_pids_nine) print("playlist", test_pids_nine[0], test_known_tracks[test_pids_nine[0]]) sequences = urm_to_sequences( urm_pos=urm_pos, target_list=[x[1] for x in test_known_tracks[test_pids_nine[0]]], min_common=15) # for seq in sequences: # print(seq) print(len(sequences)) model = PrefixSpan.train(sequences, minSupport=0.1, maxPatternLength=250) result = model.freqSequences().collect() for fs in result:
import numpy as np
import sys

# Content-based user model on albums with depopularization: the 100 most
# popular albums are zeroed out of the UCM before computing the similarity.
datareader = Datareader(mode='offline', only_load=True, verbose=False)
evaluator = Evaluator(datareader)

urm = datareader.get_urm()
ucm_album = datareader.get_ucm_albums()

# Album popularity = number of playlist occurrences per album column;
# zero the top-100 columns in place so they do not dominate the similarity.
albums_pop = ucm_album.sum(axis=0).A1
mask = np.argsort(albums_pop)[::-1][:100]
ut.inplace_set_cols_zero(ucm_album, mask)

ucm_album = bm25_row(ucm_album)

print('Similarity..')
# User-user Tversky similarity over the album profiles, top-800 neighbors.
sim = tversky_similarity(ucm_album, ucm_album.T, shrink=200, alpha=0.1, beta=1,
                         k=800, verbose=1, binary=False)
sim = sim.tocsr()

test_pids = list(datareader.get_test_pids())

# eurm: propagate neighbor playlists' tracks, keep top-750 per playlist.
eurm = dot_product(sim, urm, k=750)
eurm = eurm.tocsr()
eurm = eurm[test_pids, :]
sparse.save_npz('eurm_albums_depop_100_offline.npz', eurm)
eurm = eurm_remove_seed(eurm, datareader)

evaluator.evaluate(eurm_to_recommendation_list(eurm), name='cbuser_album_depop_100',
                   show_plot=False)
# NLP content-based step: build the playlist (user) content matrix from title
# tokens — all tokenizer flags (norm, work, split, date, skip_words, porter,
# porter2, lanca, lanca2, data1) and `topk` come from the enclosing scope.
nlp = NLP(dr, stopwords=[], norm=norm, work=work, split=split, date=date,
          skip_words=skip_words, porter=porter, porter2=porter2,
          lanca=lanca, lanca2=lanca2)
ucm = nlp.get_UCM(data1=data1)
urm = dr.get_urm()
test_playlists = dr.get_test_pids()
# BM25-weight the token counts, then user-user Tversky similarity.
ucm = bm25_row(ucm)
similarity = tversky_similarity(ucm, binary=False, shrink=1, alpha=0.1, beta=1)
similarity = similarity.tocsr()

#eurm — top-`topk` tracks per playlist, restricted to the test playlists.
eurm = dot_product(similarity, urm, k=topk)
eurm = eurm.tocsr()
eurm = eurm[test_playlists, :]
rec_list = eurm_to_recommendation_list(eurm)
class Top_pop_p(object):
    '''
    Class that allows the user to get the personalized top pop built
    following track or album.

    Cat-2 test playlists expose exactly one seed track; recommendations are
    the 501 most popular tracks among the playlists sharing the seed's album
    (get_top_pop_album) or the seed track itself (get_top_pop_track).
    '''

    def __init__(self):
        # FIX: the original body was a meaningless bare `1` expression.
        # All state is created lazily per-call by _load().
        pass

    def _load(self, mode):
        """Load the Datareader and URM for `mode` ('online' or 'offline').

        Stores them on self (dr_on/urm_on or dr_of/urm_of, plus the shared
        urm_col CSC copy and top_p buffer, as the original code did) and
        returns the (datareader, urm) pair to work with.

        :raises ValueError: if mode is neither 'online' nor 'offline'
            (the original code would fail later with a NameError).
        """
        if mode == "online":
            self.dr_on = Datareader(verbose=False, mode='online', only_load=True)
            self.urm_on = self.dr_on.get_urm()
            dr, urm = self.dr_on, self.urm_on
        elif mode == "offline":
            self.dr_of = Datareader(verbose=False, mode='offline', only_load=True)
            self.urm_of = self.dr_of.get_urm()
            dr, urm = self.dr_of, self.urm_of
        else:
            raise ValueError("mode must be 'online' or 'offline', got %r" % (mode,))
        # Column-sliceable copy: track index -> playlists containing it.
        self.urm_col = sps.csc_matrix(urm)
        self.top_p = np.zeros(urm.shape[1])
        return dr, urm

    def get_top_pop_album(self, mode):
        '''
        :param mode: 'online' or 'offline' dataset to work on.
        :return: csr_matrix filled with the recommendation for the cat 2
            playlists, ranking tracks by popularity within the playlists
            that contain the seed track's album.
        '''
        # FIX: the duplicated online/offline bodies are unified via _load().
        dr, urm = self._load(mode)
        eurm = sps.lil_matrix(urm.shape)
        pids = dr.get_test_pids(cat=2)
        pids_all = dr.get_test_pids()
        ucm_album = dr.get_ucm_albums().tocsc()
        album_dic = dr.get_track_to_album_dict()
        for row in tqdm(pids):
            # Single seed track of the cat-2 playlist.
            track_ind = urm.indices[urm.indptr[row]:urm.indptr[row + 1]][0]
            album = album_dic[track_ind]
            # Playlists holding tracks of the seed's album (CSC column slice).
            playlists = ucm_album.indices[ucm_album.indptr[album]:ucm_album.indptr[album + 1]]
            top = urm[playlists].sum(axis=0).A1.astype(np.int32)
            # 501 so that 500 recommendations remain after seed removal.
            track_ind_rec = top.argsort()[-501:][::-1]
            eurm[row, track_ind_rec] = top[track_ind_rec]
        eurm = eurm.tocsr()[pids_all]
        eurm = eurm_remove_seed(eurm, dr)
        return eurm.copy().tocsr()

    def get_top_pop_track(self, mode):
        '''
        :param mode: 'online' or 'offline' dataset to work on.
        :return: csr_matrix filled with the recommendation for the cat 2
            playlists, ranking tracks by popularity within the playlists
            that contain the seed track.
        '''
        dr, urm = self._load(mode)
        eurm = sps.lil_matrix(urm.shape)
        pids = dr.get_test_pids(cat=2)
        pids_all = dr.get_test_pids()
        for row in tqdm(pids):
            track_ind = urm.indices[urm.indptr[row]:urm.indptr[row + 1]][0]
            # Playlists containing the seed track (CSC column slice).
            playlists = self.urm_col.indices[self.urm_col.indptr[track_ind]:self.urm_col.indptr[track_ind + 1]]
            top = urm[playlists].sum(axis=0).A1.astype(np.int32)
            track_ind_rec = top.argsort()[-501:][::-1]
            eurm[row, track_ind_rec] = top[track_ind_rec]
        eurm = eurm.tocsr()[pids_all]
        eurm = eurm_remove_seed(eurm, dr)
        return eurm.copy().tocsr()
name ="DSLIM" complete_name = mode+"_"+name+"_knn="+str(knn)+"_topk="+str(topk)\ + '_' + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M") if mode=="offline": complete_name+="_test="+str(test_num) bot = Bot_v1(complete_name) try: ######################SHRINKED dr = Datareader(mode=mode, test_num=test_num, train_format="50k", only_load=True) ev = Evaluator(dr) pids = dr.get_test_pids() urm, dictns, dict2 = dr.get_urm_shrinked() urm_evaluation = dr.get_evaluation_urm()[pids] pids_converted = np.array([dictns[x] for x in pids], dtype=np.int32) slim = MultiThreadDSLIM_RMSE(urm.T) slim.fit(l1_penalty=l1, l2_penalty=l2, positive_only=True, beta=beta, topK=topk) evaluate_shrinked(W_sparse= slim.W_sparse, urm_shrinked= urm, pids_shrinked= pids_converted) sps.save_npz(complete_name+".npz",slim.W_sparse,) except Exception as e:
if __name__ == '__main__': # SELECT EXECUTION MODE mode = "online" name = "cbf_user_artists" knn = 800 topk = 750 save_eurm = True complete_name = mode + "_" + name + "_knn=" + str(knn) + "_topk=" + str( topk) if mode == "offline": # Initialization dr = Datareader(verbose=False, mode=mode, only_load=True) test_pids = list(dr.get_test_pids()) ev = Evaluator(dr) urm = dr.get_urm() # UCM ucm_artists = dr.get_ucm_artists() ucm_artists = bm25_row(ucm_artists) # Similarity print('Similarity..') sim = tversky_similarity(ucm_artists, ucm_artists.T, shrink=200, target_items=test_pids, alpha=0.1, beta=1,
import numpy as np
import scipy.sparse as sps
from tqdm import tqdm
from utils.definitions import *
from utils.post_processing import eurm_remove_seed, append_rec_list

# Offline data: URM plus a column-oriented (CSC) copy so that columns give
# track -> playlists lookups.
dr = Datareader(verbose=False, mode='offline', only_load=True)
urm = dr.get_urm()
urm_col = sps.csc_matrix(urm)
top_p = np.zeros(urm.shape[1])
rec = []

# Two empty estimated URMs to be filled with recommendations downstream.
eurm1 = sps.lil_matrix(urm.shape)
eurm2 = sps.lil_matrix(urm.shape)
print(eurm1.shape)

# Cat-2 test playlists (one seed track each) and the full test-pid ordering.
pids = dr.get_test_pids(cat=2)
pids_all = dr.get_test_pids()

# TopPop Album
# ucm_album = dr.get_ucm_albums().tocsc()
# album_dic = dr.get_track_to_album_dict()

# TopPop Artist
ucm_album = dr.get_ucm_albums().tocsc()
artists_dic = dr.get_track_to_artist_dict()
# NOTE(review): the variable names look swapped w.r.t. the pickle files —
# "album_tracks_dict" loads into album_to_tracks (OK) but
# "artist_tracks_dict" loads into tracks_to_album. Confirm which mapping
# each file actually contains before relying on these names.
album_to_tracks = load_obj(name="album_tracks_dict_offline", path=ROOT_DIR + "/boosts/")
tracks_to_album = load_obj(name="artist_tracks_dict_offline", path=ROOT_DIR + "/boosts/")
self.eurm = dot_product(self.model, user_pen, k=top_k, verbose=verbose, target_items=self.pid) if verbose: print("time: " + str(int(time.time() - start_time) / 60)) return self.eurm if __name__ == '__main__': dr = Datareader(verbose=True, mode='offline', only_load=True) urm = dr.get_urm(binary=True) pid = dr.get_test_pids() ev = Evaluator(dr) topk = 750 configs = [{ 'cat': 10, 'knn': 100, 'power': 2.4 }, { 'cat': 9, 'knn': 200, 'power': 0.4 }, { 'cat': 8, 'knn': 100,