def get_icm(self):
    """Build the Item-Content Matrix (ICM) from ../data/tracks.csv.

    One-hot encodes three track features (albums, artists, durations),
    stacks the blocks horizontally and returns the result as a CSR
    sparse matrix, one row per track (rows ordered by track_id).
    """

    def _binarize(df, feature_col):
        # One-hot encode a single-valued feature column, rows sorted by track_id.
        cols = df.reindex(columns=['track_id', feature_col])
        # NOTE(review): sort looks redundant — values appear pre-ordered; kept for safety.
        cols.sort_values(by='track_id', inplace=True)
        feature_list = [[v] for v in cols[feature_col]]
        encoded = MultiLabelBinarizer(
            sparse_output=True).fit_transform(feature_list)
        return encoded.tocsr()

    tracks_data = pd.read_csv("../data/tracks.csv")
    icm_artists_csr = _binarize(tracks_data, 'artist_id')
    icm_albums_csr = _binarize(tracks_data, 'album_id')
    # Durations are encoded as categorical values, as in the original code.
    icm_durations_csr = _binarize(tracks_data, 'duration_sec')

    # Keep the original stacking order: albums, artists, durations.
    ICM = sparse.hstack((icm_albums_csr, icm_artists_csr, icm_durations_csr))
    return ICM.tocsr()
def get_ICM(self, a):
    """Return the Item-Content Matrix: weighted artist block hstacked with albums.

    Parameters
    ----------
    a : float
        Weight applied to the artist feature block before stacking.

    In local/test mode a precomputed ICM is loaded from disk instead.
    """
    if self.local:
        # Local/test mode: load the precomputed ICM for this test split.
        self.ICM = sparse.load_npz("input/ICM" + str(self.testID) + ".npz")
        return self.ICM

    # Artists: one-hot encode artist_id per track (rows ordered by track_id).
    # Loop variables renamed: the original shadowed the weight parameter `a`.
    artists = self.tracks.reindex(columns=['track_id', 'artist_id'])
    artists.sort_values(by='track_id', inplace=True)
    artists_list = [[artist] for artist in artists['artist_id']]
    icm_artists_csr = MultiLabelBinarizer(
        classes=self.get_artists(),
        sparse_output=True).fit_transform(artists_list).tocsr()

    # Albums: one-hot encode album_id per track.
    albums = self.tracks.reindex(columns=['track_id', 'album_id'])
    albums.sort_values(by='track_id', inplace=True)
    albums_list = [[album] for album in albums['album_id']]
    icm_albums_csr = MultiLabelBinarizer(
        classes=self.get_albums(),
        sparse_output=True).fit_transform(albums_list).tocsr()

    # Artist block is scaled by `a`; album block keeps unit weight.
    return sps.hstack((a * icm_artists_csr, icm_albums_csr))
def split_only_sequential(self, URM, URM_df):
    """Train/test split over the first 5000 (sequential) target playlists only.

    For each sequential playlist the LAST 20% of its tracks (in playlist
    order) are held out as test items; the remainder forms the training
    matrix. Stores target_playlists, URM_train (float64 CSR) and
    dict_test on self.
    """
    helper = Helper()
    sequential_playlists = helper.get_target_playlists_list()[:5000]
    self.target_playlists = sequential_playlists

    # playlist_id -> list of its track_ids
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    relevant_items = defaultdict(list)
    for playlist_id in sequential_playlists:
        # tracks = this playlist's track list taken from the URM dataframe
        tracks = np.array(grouped[playlist_id])
        to_be_removed = int(len(tracks) * 0.2)
        # Take the last `to_be_removed` tracks in sequential order and
        # remove them from the playlist's training tracks.
        to_be_removed_tracks = helper.get_sorted_tracks_in_playlist(
            playlist_id)[-to_be_removed:]
        for track in to_be_removed_tracks:
            relevant_items[playlist_id].append(track)
            tracks = np.delete(tracks, np.where(tracks == track))
        grouped[playlist_id] = tracks

    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr().astype(np.float64)
    self.dict_test = relevant_items
def split_randomic_exactly_last(self, URM, URM_df):
    """Train/test split over exactly the LAST 5000 target playlists.

    For each of these playlists, 20% of its tracks are removed uniformly
    at random and held out as test items. Stores target_playlists,
    URM_train (float64 CSR) and dict_test on self.
    """
    helper = Helper()
    self.target_playlists = helper.get_target_playlists_list()[5000:]
    selected_playlists = self.target_playlists

    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    relevant_items = defaultdict(list)
    for playlist_id in selected_playlists:
        tracks = np.array(grouped[playlist_id])
        to_be_removed = int(len(tracks) * 0.2)
        for _ in range(to_be_removed):
            # Pick a random remaining track and move it to the test set.
            index = randint(0, len(tracks) - 1)
            relevant_items[playlist_id].append(tracks[index])
            tracks = np.delete(tracks, index)
        grouped[playlist_id] = tracks

    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr().astype(np.float64)
    self.dict_test = relevant_items
def build_ICM(self):
    """Build a TF-IDF weighted Item-Content Matrix from tracks_final.

    Stacks three one-hot feature blocks (artists, albums, tags), applies
    a TF-IDF transform, L2-normalizes each column, and returns CSR.
    """
    print('Building ICM from tracks_final...')

    # 1 - Artists: single-valued feature, rows ordered by track_id.
    artists_df = self.tracks_final.reindex(
        columns=['track_id', 'artist_id'])
    artists_df.sort_values(by='track_id', inplace=True)
    artists_list = [[a] for a in artists_df['artist_id']]
    icm_artists_csr = MultiLabelBinarizer(
        classes=self.get_artists(),
        sparse_output=True).fit_transform(artists_list).tocsr()

    # 2 - Albums: cells are stringified Python lists; parse each cell ONCE
    # (the original evaluated ast.literal_eval three times per row) and
    # treat [] / [None] placeholders as empty.
    albums_df = self.tracks_final.reindex(columns=['track_id', 'album'])
    albums_df.sort_values(by='track_id', inplace=True)
    albums_list = []
    for raw_album in albums_df['album']:
        parsed = ast.literal_eval(raw_album)
        albums_list.append(
            parsed if len(parsed) > 0 and parsed[0] is not None else [])
    icm_albums_csr = MultiLabelBinarizer(
        classes=self.get_albums(),
        sparse_output=True).fit_transform(albums_list).tocsr()

    # 3 - Tags: list-valued feature, parsed from stringified lists.
    tags_df = self.tracks_final.reindex(columns=['track_id', 'tags'])
    tags_df.sort_values(by='track_id', inplace=True)
    tags_list = [ast.literal_eval(t) for t in tags_df['tags']]
    icm_tags_csr = MultiLabelBinarizer(
        classes=self.get_tags(),
        sparse_output=True).fit_transform(tags_list).tocsr()

    # 4 - Stack the feature blocks horizontally.
    ICM = sparse.hstack((icm_artists_csr, icm_albums_csr, icm_tags_csr))

    # 5 - TF-IDF weighting, then column-wise L2 normalization.
    ICM_tfidf = feature_extraction.text.TfidfTransformer().fit_transform(
        ICM)
    ICM_tfidf = normalize(ICM_tfidf, axis=0, norm='l2')
    return ICM_tfidf.tocsr()
def get_URM(self):
    """Build the User-Rating Matrix (playlist x track, CSR) and cache it on self."""
    print('Building URM...')
    playlist_tracks = self.train_final.groupby(
        'playlist_id', as_index=True).apply(lambda g: list(g['track_id']))
    binarizer = MultiLabelBinarizer(classes=self.get_tracks(),
                                    sparse_output=True)
    self.URM = binarizer.fit_transform(playlist_tracks).tocsr()
    return self.URM
def get_test_df(self):
    """Binarize self.test_df into a CSR test matrix and overwrite self.test_df."""
    # NOTE: careful here — this needs changing; remove the two lines below this one.
    playlist_groups = self.test_df.groupby(
        'playlist_id', as_index=True).apply(lambda rows: list(rows['track_id']))
    URM_test_matrix = MultiLabelBinarizer(
        classes=self.b.get_tracks(),
        sparse_output=True).fit_transform(playlist_groups)
    self.test_df = URM_test_matrix.tocsr()
    return self.test_df
def split_cluster_randomic_only_last(self, URM, URM_df):
    """Random train/test split matched to the length distribution of the
    LAST 5000 target playlists (cluster segment = 1).

    Playlists are drawn at random until each length bucket's quota is
    filled; the first 5000 (sequential) target playlists are excluded.
    Then 20% of each selected playlist's tracks are held out at random.
    Stores target_playlists, URM_train (float64 CSR) and dict_test on self.
    """
    segment = 1
    # split the URM into a test set and a train set
    selected_playlists = np.array([])
    available_playlists = np.arange(URM.shape[0])
    target_analyzer = TargetAnalyzer()
    # Gets distribution of only last 5000 playlists
    dist = target_analyzer.get_distribution_array_only_last(segment)
    helper = Helper()
    target_playlists = helper.get_target_playlists_list(
    )[:5000]  # WILL REMOVE THEM
    print("Clustering with segment = " + str(segment))
    # Rejection sampling: for each length bucket `key`, keep drawing random
    # playlists until the bucket quota is exhausted.
    # NOTE(review): this spins forever if no eligible playlist matches a
    # non-empty bucket — confirm the distribution always can be satisfied.
    for key in tqdm(range(len(dist))):
        while dist[key] != 0:
            random_index = randint(0, len(available_playlists) - 1)
            playlist_id = available_playlists[random_index]
            # Bucket this playlist would fall into after a 20% holdout.
            target_segment = int(0.8 * len(URM[playlist_id].data))
            if target_segment == key and playlist_id not in target_playlists:
                available_playlists = np.delete(
                    available_playlists,
                    np.where(available_playlists == playlist_id))
                selected_playlists = np.append(selected_playlists,
                                               playlist_id)
                dist[key] -= 1
    self.target_playlists = selected_playlists.astype(int)
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    relevant_items = defaultdict(list)
    for playlist_id in selected_playlists:
        # tracks = the playlist's track list taken from the URM
        tracks = np.array(grouped[playlist_id])
        to_be_removed = int(len(tracks) * 0.2)
        for i in range(to_be_removed):
            # Pick a random remaining track and move it to the test set.
            index = randint(0, len(tracks) - 1)
            removed_track = tracks[index]
            relevant_items[playlist_id].append(removed_track)
            tracks = np.delete(tracks, index)
        grouped[playlist_id] = tracks
    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr()
    self.URM_train = self.URM_train.astype(np.float64)
    self.dict_test = relevant_items
def build_ICM(self):
    """Build (or load) the Item-Content Matrix: artist + album one-hot blocks.

    When loadPredefined is set, a cached ICM.npz is loaded first; on a
    cache miss the matrix is rebuilt from trackData and saved back to
    dataSubfolder. The result is stored on self.ICM.
    """
    if self.loadPredefined:
        try:
            self.ICM = sps.load_npz(self.dataSubfolder + "ICM.npz")
            print(
                "PlaylistDataReader: ICM is imported from the saved data.")
            return
        except FileNotFoundError:
            # Cache miss: fall through and rebuild from trackData.
            print("PlaylistDataReader: ICM not found. Building a new one")
    # Artists: one-hot per track, rows ordered by track_id.
    df_artists = self.trackData.reindex(
        columns=["track_id", "artist_id"])
    df_artists.sort_values(by="track_id", inplace=True)
    artist_list = [[a] for a in df_artists["artist_id"]]
    icm_artists_csr = MultiLabelBinarizer(
        classes=self.get_artists(),
        sparse_output=True).fit_transform(artist_list).tocsr()
    # Albums: one-hot per track.
    df_albums = self.trackData.reindex(
        columns=["track_id", "album_id"])
    df_albums.sort_values(by="track_id", inplace=True)
    album_list = [[b] for b in df_albums["album_id"]]
    icm_albums_csr = MultiLabelBinarizer(
        classes=self.get_albums(),
        sparse_output=True).fit_transform(album_list).tocsr()
    # Stack artist and album blocks; cache the result to disk.
    self.ICM = sps.hstack((icm_artists_csr, icm_albums_csr))
    sps.save_npz(self.dataSubfolder + "ICM.npz", self.ICM)
    print("PlaylistDataReader: ICM matrix built completed")
    print("PlaylistDataReader: shape is {}".format(self.ICM.shape))
    return
def split_randomic(self, URM, URM_df):
    """Fully random train/test split (no clustering).

    Randomly selects (1 - train_test_split) of the playlists, only among
    those with more than 7 tracks, then holds out 20% of each selected
    playlist's tracks uniformly at random. Stores target_playlists,
    URM_train (float64 CSR) and dict_test on self.
    """
    selected_playlists = np.array([])
    available_playlists = np.arange(URM.shape[0])
    # Number of playlists to select, matching the submission split ratio.
    n_target = len(available_playlists) * (1 - self.train_test_split)
    while n_target > 0:
        random_index = randint(0, len(available_playlists) - 1)
        playlist_id = available_playlists[random_index]
        tracks_left = len(URM[playlist_id].data)
        # Keep only playlists long enough to survive a 20% holdout.
        if tracks_left > 7:
            available_playlists = np.delete(
                available_playlists,
                np.where(available_playlists == playlist_id))
            selected_playlists = np.append(selected_playlists, playlist_id)
            n_target -= 1
    self.target_playlists = selected_playlists.astype(int)

    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    relevant_items = defaultdict(list)
    for playlist_id in selected_playlists:
        tracks = np.array(grouped[playlist_id])
        to_be_removed = int(len(tracks) * 0.2)
        for _ in range(to_be_removed):
            # Pick a random remaining track and move it to the test set.
            index = randint(0, len(tracks) - 1)
            relevant_items[playlist_id].append(tracks[index])
            tracks = np.delete(tracks, index)
        grouped[playlist_id] = tracks

    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr().astype(np.float64)
    self.dict_test = relevant_items
def split_randomic_all_playlists_longer_10000(self, URM, URM_df, threshold_length=10):
    """Split over up to 10000 non-Kaggle-target playlists longer than threshold_length.

    Scans playlists in id order, selecting those not in the Kaggle target
    list and with more than `threshold_length` interactions, stopping at
    10000. Then 20% of each selected playlist's tracks are held out at
    random. Stores target_playlists, URM_train (float64 CSR) and
    dict_test on self.
    """
    selected_playlists = np.array([])
    available_playlists = np.arange(URM.shape[0])
    helper = Helper()
    # set() for O(1) membership tests in the loop below (was a list: O(n) per test).
    target_playlists_kaggle = set(helper.get_target_playlists_list())
    for playlist_id in available_playlists:
        if len(selected_playlists) == 10000:
            break
        if playlist_id not in target_playlists_kaggle and len(
                URM[playlist_id].indices) > threshold_length:
            selected_playlists = np.append(selected_playlists, playlist_id)
    self.target_playlists = selected_playlists.astype(int)

    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    relevant_items = defaultdict(list)
    for playlist_id in selected_playlists:
        tracks = np.array(grouped[playlist_id])
        to_be_removed = int(len(tracks) * 0.2)
        for _ in range(to_be_removed):
            # Pick a random remaining track and move it to the test set.
            index = randint(0, len(tracks) - 1)
            relevant_items[playlist_id].append(tracks[index])
            tracks = np.delete(tracks, index)
        grouped[playlist_id] = tracks

    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr().astype(np.float64)
    self.dict_test = relevant_items
def build_URM(self):
    """Build (or load) the full URM and store it on self.URM_all as CSR."""
    #print("PlaylistDataReader: URM Matrix is being built...")
    if self.loadPredefined:
        try:
            self.URM_all = sps.load_npz(self.dataSubfolder + "URM_all.npz")
            print(
                "PlaylistDataReader: URM is imported from the saved data.")
            return
        except FileNotFoundError:
            print("PlaylistDataReader: URM is not.Found. Building new one")
    playlist_groups = self.trainData.groupby(
        'playlist_id', as_index=True).apply(lambda rows: list(rows['track_id']))
    binarizer = MultiLabelBinarizer(classes=self.get_tracks(),
                                    sparse_output=True)
    self.URM_all = binarizer.fit_transform(playlist_groups).tocsr()
    sps.save_npz(self.dataSubfolder + "URM_all.npz", self.URM_all)
    print("PlaylistDataReader: URM matrix built completed")
    print("PlaylistDataReader: shape is {}".format(
        self.URM_all.shape))
    return
def split_cluster_randomic(self, URM, URM_df):
    """Random train/test split matched to a target length distribution
    (cluster segment = 1).

    Draws playlists at random (rejection sampling) until each length
    bucket quota in `dist` is filled, then holds out 20% of each
    selected playlist's tracks at random. Stores target_playlists,
    URM_train, URM_test (both float64 CSR) and dict_test on self.
    """
    # split the URM into a test set and a train set
    selected_playlists = np.array([])
    available_playlists = np.arange(URM.shape[0])
    target_analyzer = TargetAnalyzer()
    segment_size = 1
    min_playlist_len_after_split = 5
    dist = target_analyzer.get_distribution_array(
        segment_size=segment_size)  # in this way n_target = 10000
    helper = Helper()
    target_playlists = helper.get_target_playlists_list()[:5000]
    n_target = np.sum(dist)  # - len(target_playlists)
    while n_target > 0:
        random_index = randint(0, len(available_playlists) - 1)
        playlist_id = available_playlists[random_index]
        # Playlist length after a 20% holdout.
        target_len = len(URM[playlist_id].data) * 0.8
        if target_len > min_playlist_len_after_split:
            target_segment = int(target_len / segment_size)
            # Bucket already full: keep re-drawing until we land in a
            # bucket with remaining quota.
            while dist[target_segment] <= 0:
                random_index = randint(0, len(available_playlists) - 1)
                playlist_id = available_playlists[random_index]
                target_len = len(URM[playlist_id].data) * 0.8
                if target_len > min_playlist_len_after_split:
                    target_segment = int(target_len / segment_size)
            n_target -= 1
            dist[target_segment] -= 1
            selected_playlists = np.append(selected_playlists, playlist_id)
            available_playlists = np.delete(
                available_playlists,
                np.where(available_playlists == playlist_id))
    self.target_playlists = selected_playlists.astype(int)
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    grouped_test = grouped.copy()
    relevant_items = defaultdict(list)
    count = 0
    for playlist_id in selected_playlists:
        tracks = np.array(grouped[playlist_id])
        # Commented-out sequential-holdout variant kept for reference:
        # if playlist_id in target_playlists:
        #     to_be_removed = int(len(tracks) * 0.2)
        #     to_be_removed_tracks = helper.get_sorted_tracks_in_playlist(playlist_id)[-to_be_removed:]
        #     for track in to_be_removed_tracks:
        #         relevant_items[playlist_id].append(track)
        #         tracks = np.delete(tracks, np.where(tracks == track))
        #     for i in range(to_be_removed):
        #         removed_track = tracks[-1]
        #         relevant_items[playlist_id].append(removed_track)
        #         tracks = np.delete(tracks, len(tracks) - 1)
        # else:
        to_be_removed = int(len(tracks) * 0.2)
        for i in range(to_be_removed):
            # Pick a random remaining track and move it to the test set.
            index = randint(0, len(tracks) - 1)
            removed_track = tracks[index]
            relevant_items[playlist_id].append(removed_track)
            tracks = np.delete(tracks, index)
        grouped[playlist_id] = tracks
        grouped_test[playlist_id] = relevant_items[playlist_id]
        count += 1
    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr()
    self.dict_test = relevant_items
    # bib URM
    # self.URM_train = helper.get_urm_csr_bib(URM = self.URM_train)
    # plotter = TargetAnalyzer()
    # plotter.plot_standard_distribution()
    # plotter.plot_distribution(self.URM_train, self.target_playlists)
    self.URM_test = MultiLabelBinarizer(
        classes=all_tracks, sparse_output=True).fit_transform(grouped_test)
    self.URM_test = self.URM_test.tocsr()
    self.URM_test = self.URM_test.astype(np.float64)
    self.URM_train = self.URM_train.astype(np.float64)
def split_sequential(self, URM, URM_df):
    """Hybrid split: the first 5000 (sequential) target playlists get a
    sequential holdout (last 20% of tracks in playlist order), plus extra
    random playlists drawn to match the length distribution of the last
    5000 target playlists. Stores target_playlists, URM_train (float64
    CSR) and dict_test on self.
    """
    segment = 1
    # split the URM into a test set and a train set
    selected_playlists = np.array([])
    available_playlists = np.arange(URM.shape[0])
    target_analyzer = TargetAnalyzer()
    # Gets distribution of only last 5000 playlists
    dist = target_analyzer.get_distribution_array_only_last(segment)
    helper = Helper()
    target_playlists = helper.get_target_playlists_list()[:5000]
    #n_target = np.sum(dist) - len(target_playlists)
    # Removing from the cluster distribution the len of the sequential target
    for playlist_id in target_playlists:
        playlist_id = int(playlist_id)
        available_playlists = np.delete(
            available_playlists,
            np.where(available_playlists == playlist_id))
        selected_playlists = np.append(selected_playlists, playlist_id)
        #target_len = len(URM[playlist_id].data)
        #dist[target_len] -= 1
    print("Clustering with segment = " + str(segment))
    # Rejection sampling: fill each length bucket with random playlists
    # whose post-holdout length lands in that bucket.
    for key in tqdm(range(len(dist))):
        while dist[key] != 0:
            random_index = randint(0, len(available_playlists) - 1)
            playlist_id = available_playlists[random_index]
            target_segment = int(0.8 * len(URM[playlist_id].data))
            if target_segment == key:
                available_playlists = np.delete(
                    available_playlists,
                    np.where(available_playlists == playlist_id))
                selected_playlists = np.append(selected_playlists,
                                               playlist_id)
                dist[key] -= 1
    self.target_playlists = selected_playlists.astype(int)
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    relevant_items = defaultdict(list)
    for playlist_id in selected_playlists:
        # tracks = the playlist's track list taken from the URM
        tracks = np.array(grouped[playlist_id])
        if playlist_id in target_playlists:
            to_be_removed = int(len(tracks) * 0.2)
            # Takes the last `to_be_removed` tracks in sequential order
            # and removes them from the track list.
            to_be_removed_tracks = helper.get_sorted_tracks_in_playlist(
                playlist_id)[-to_be_removed:]
            for track in to_be_removed_tracks:
                relevant_items[playlist_id].append(track)
                tracks = np.delete(tracks, np.where(tracks == track))
        else:
            to_be_removed = int(len(tracks) * 0.2)
            for i in range(to_be_removed):
                # Random holdout for the non-sequential playlists.
                index = randint(0, len(tracks) - 1)
                removed_track = tracks[index]
                relevant_items[playlist_id].append(removed_track)
                tracks = np.delete(tracks, index)
        grouped[playlist_id] = tracks
    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr()
    self.URM_train = self.URM_train.astype(np.float64)
    self.dict_test = relevant_items
class Evaluator(object): def __init__(self): self.URM_train = None self.URM_test = None self.dict_test = None self.target_playlists = None self.train_test_split = 0.80 self.at = 10 self.tracks_df = pd.read_csv("../data/tracks.csv") self.starting_weight_low = np.array([0.22, 0.14, 0.215, 0.45]) def get_URM_train(self): return self.URM_train def get_dict_test(self): return self.dict_test def get_target_playlists(self): return self.target_playlists def get_URM_test(self): return self.URM_test #Split totally randomic, no cluster def split_randomic(self, URM, URM_df): # splitting URM in test set e train set selected_playlists = np.array([]) available_playlists = np.arange(URM.shape[0]) n_target = len(available_playlists) * ( 1 - self.train_test_split ) #respecting the submission file we have to post while n_target > 0: random_index = randint(0, len(available_playlists) - 1) #chosen playlist_id playlist_id = available_playlists[random_index] tracks_left = len(URM[playlist_id].data) if tracks_left > 7: available_playlists = np.delete( available_playlists, np.where(available_playlists == playlist_id)) selected_playlists = np.append(selected_playlists, playlist_id) n_target -= 1 self.target_playlists = selected_playlists.astype(int) grouped = URM_df.groupby( 'playlist_id', as_index=True).apply(lambda x: list(x['track_id'])) relevant_items = defaultdict(list) count = 0 for playlist_id in selected_playlists: tracks = np.array(grouped[playlist_id]) to_be_removed = int(len(tracks) * 0.2) for i in range(to_be_removed): index = randint(0, len(tracks) - 1) removed_track = tracks[index] relevant_items[playlist_id].append(removed_track) tracks = np.delete(tracks, index) grouped[playlist_id] = tracks count += 1 all_tracks = self.tracks_df["track_id"].unique() matrix = MultiLabelBinarizer(classes=all_tracks, sparse_output=True).fit_transform(grouped) self.URM_train = matrix.tocsr() self.URM_train = self.URM_train.astype(np.float64) self.dict_test = relevant_items # Split considering 
all playlists EXCEPT TARGET (because they will be surely afflicted by Kaggle test split) def split_randomic_all_playlists(self, URM, URM_df): # splitting URM in test set e train set selected_playlists = np.array([]) available_playlists = np.arange(URM.shape[0]) helper = Helper() target_playlists_kaggle = helper.get_target_playlists_list() for playlist_id in available_playlists: if playlist_id not in target_playlists_kaggle: selected_playlists = np.append(selected_playlists, playlist_id) self.target_playlists = selected_playlists.astype(int) grouped = URM_df.groupby( 'playlist_id', as_index=True).apply(lambda x: list(x['track_id'])) relevant_items = defaultdict(list) for playlist_id in selected_playlists: tracks = np.array(grouped[playlist_id]) to_be_removed = int(len(tracks) * 0.2) for i in range(to_be_removed): index = randint(0, len(tracks) - 1) removed_track = tracks[index] relevant_items[playlist_id].append(removed_track) tracks = np.delete(tracks, index) grouped[playlist_id] = tracks all_tracks = self.tracks_df["track_id"].unique() matrix = MultiLabelBinarizer(classes=all_tracks, sparse_output=True).fit_transform(grouped) self.URM_train = matrix.tocsr() self.URM_train = self.URM_train.astype(np.float64) self.dict_test = relevant_items def split_randomic_all_playlists_longer(self, URM, URM_df, threshold_length=10): # splitting URM in test set e train set selected_playlists = np.array([]) available_playlists = np.arange(URM.shape[0]) helper = Helper() target_playlists_kaggle = helper.get_target_playlists_list() for playlist_id in available_playlists: if playlist_id not in target_playlists_kaggle and len( URM[playlist_id].indices) > threshold_length: selected_playlists = np.append(selected_playlists, playlist_id) self.target_playlists = selected_playlists.astype(int) grouped = URM_df.groupby( 'playlist_id', as_index=True).apply(lambda x: list(x['track_id'])) relevant_items = defaultdict(list) for playlist_id in selected_playlists: tracks = 
np.array(grouped[playlist_id]) to_be_removed = int(len(tracks) * 0.2) for i in range(to_be_removed): index = randint(0, len(tracks) - 1) removed_track = tracks[index] relevant_items[playlist_id].append(removed_track) tracks = np.delete(tracks, index) grouped[playlist_id] = tracks all_tracks = self.tracks_df["track_id"].unique() matrix = MultiLabelBinarizer(classes=all_tracks, sparse_output=True).fit_transform(grouped) self.URM_train = matrix.tocsr() self.URM_train = self.URM_train.astype(np.float64) self.dict_test = relevant_items def split_randomic_all_playlists_longer_10000(self, URM, URM_df, threshold_length=10): # splitting URM in test set e train set selected_playlists = np.array([]) available_playlists = np.arange(URM.shape[0]) helper = Helper() target_playlists_kaggle = helper.get_target_playlists_list() for playlist_id in available_playlists: if len(selected_playlists) == 10000: break if playlist_id not in target_playlists_kaggle and len( URM[playlist_id].indices) > threshold_length: selected_playlists = np.append(selected_playlists, playlist_id) self.target_playlists = selected_playlists.astype(int) grouped = URM_df.groupby( 'playlist_id', as_index=True).apply(lambda x: list(x['track_id'])) relevant_items = defaultdict(list) for playlist_id in selected_playlists: tracks = np.array(grouped[playlist_id]) to_be_removed = int(len(tracks) * 0.2) for i in range(to_be_removed): index = randint(0, len(tracks) - 1) removed_track = tracks[index] relevant_items[playlist_id].append(removed_track) tracks = np.delete(tracks, index) grouped[playlist_id] = tracks all_tracks = self.tracks_df["track_id"].unique() matrix = MultiLabelBinarizer(classes=all_tracks, sparse_output=True).fit_transform(grouped) self.URM_train = matrix.tocsr() self.URM_train = self.URM_train.astype(np.float64) self.dict_test = relevant_items def split_randomic_all_playlists_shorter(self, URM, URM_df, threshold_length=10): # splitting URM in test set e train set selected_playlists = np.array([]) 
available_playlists = np.arange(URM.shape[0]) helper = Helper() target_playlists_kaggle = helper.get_target_playlists_list() for playlist_id in available_playlists: if playlist_id not in target_playlists_kaggle and len( URM[playlist_id].indices) <= threshold_length: selected_playlists = np.append(selected_playlists, playlist_id) self.target_playlists = selected_playlists.astype(int) grouped = URM_df.groupby( 'playlist_id', as_index=True).apply(lambda x: list(x['track_id'])) relevant_items = defaultdict(list) for playlist_id in selected_playlists: tracks = np.array(grouped[playlist_id]) to_be_removed = int(len(tracks) * 0.2) for i in range(to_be_removed): index = randint(0, len(tracks) - 1) removed_track = tracks[index] relevant_items[playlist_id].append(removed_track) tracks = np.delete(tracks, index) grouped[playlist_id] = tracks all_tracks = self.tracks_df["track_id"].unique() matrix = MultiLabelBinarizer(classes=all_tracks, sparse_output=True).fit_transform(grouped) self.URM_train = matrix.tocsr() self.URM_train = self.URM_train.astype(np.float64) self.dict_test = relevant_items def split_randomic_exactly_last(self, URM, URM_df): # splitting URM in test set e train set selected_playlists = np.array([]) helper = Helper() self.target_playlists = helper.get_target_playlists_list()[5000:] selected_playlists = self.target_playlists grouped = URM_df.groupby( 'playlist_id', as_index=True).apply(lambda x: list(x['track_id'])) relevant_items = defaultdict(list) count = 0 for playlist_id in selected_playlists: tracks = np.array(grouped[playlist_id]) to_be_removed = int(len(tracks) * 0.2) for i in range(to_be_removed): index = randint(0, len(tracks) - 1) removed_track = tracks[index] relevant_items[playlist_id].append(removed_track) tracks = np.delete(tracks, index) grouped[playlist_id] = tracks count += 1 all_tracks = self.tracks_df["track_id"].unique() matrix = MultiLabelBinarizer(classes=all_tracks, sparse_output=True).fit_transform(grouped) self.URM_train = 
matrix.tocsr() self.URM_train = self.URM_train.astype(np.float64) self.dict_test = relevant_items #Split getting 5000 sequentially, 5000 randomic with cluster segment = 1 def split_sequential(self, URM, URM_df): segment = 1 # splitting URM in test set e train set selected_playlists = np.array([]) available_playlists = np.arange(URM.shape[0]) target_analyzer = TargetAnalyzer() #Gets distribution of only last 5000 playlists dist = target_analyzer.get_distribution_array_only_last(segment) helper = Helper() target_playlists = helper.get_target_playlists_list()[:5000] #n_target = np.sum(dist) - len(target_playlists) # Removing from the cluster distribution the len of the sequential target for playlist_id in target_playlists: playlist_id = int(playlist_id) available_playlists = np.delete( available_playlists, np.where(available_playlists == playlist_id)) selected_playlists = np.append(selected_playlists, playlist_id) #target_len = len(URM[playlist_id].data) #dist[target_len] -= 1 print("Clustering with segment = " + str(segment)) for key in tqdm(range(len(dist))): while dist[key] != 0: random_index = randint(0, len(available_playlists) - 1) playlist_id = available_playlists[random_index] target_segment = int(0.8 * len(URM[playlist_id].data)) if target_segment == key: available_playlists = np.delete( available_playlists, np.where(available_playlists == playlist_id)) selected_playlists = np.append(selected_playlists, playlist_id) dist[key] -= 1 self.target_playlists = selected_playlists.astype(int) grouped = URM_df.groupby( 'playlist_id', as_index=True).apply(lambda x: list(x['track_id'])) relevant_items = defaultdict(list) for playlist_id in selected_playlists: #Tracks = lista delle tracks prese dalla URM tracks = np.array(grouped[playlist_id]) if playlist_id in target_playlists: to_be_removed = int(len(tracks) * 0.2) #Torna le #to_be_removed tracks ordinate sequenzialmente. 
e le toglie dalla lista delle tracks to_be_removed_tracks = helper.get_sorted_tracks_in_playlist( playlist_id)[-to_be_removed:] for track in to_be_removed_tracks: relevant_items[playlist_id].append(track) tracks = np.delete(tracks, np.where(tracks == track)) else: to_be_removed = int(len(tracks) * 0.2) for i in range(to_be_removed): index = randint(0, len(tracks) - 1) removed_track = tracks[index] relevant_items[playlist_id].append(removed_track) tracks = np.delete(tracks, index) grouped[playlist_id] = tracks all_tracks = self.tracks_df["track_id"].unique() matrix = MultiLabelBinarizer(classes=all_tracks, sparse_output=True).fit_transform(grouped) self.URM_train = matrix.tocsr() self.URM_train = self.URM_train.astype(np.float64) self.dict_test = relevant_items def split_only_sequential(self, URM, URM_df): helper = Helper() sequential_playlists = helper.get_target_playlists_list()[:5000] selected_playlists = np.array([]) self.target_playlists = sequential_playlists grouped = URM_df.groupby( 'playlist_id', as_index=True).apply(lambda x: list(x['track_id'])) relevant_items = defaultdict(list) for playlist_id in sequential_playlists: # Tracks = lista delle tracks prese dalla URM tracks = np.array(grouped[playlist_id]) to_be_removed = int(len(tracks) * 0.2) # Torna le #to_be_removed tracks ordinate sequenzialmente. 
# NOTE(review): the original file had its newlines collapsed; the indentation
# below is reconstructed from the flattened source — confirm against VCS history.
# ...and removes them from the tracks list (continuation of a comment whose
# beginning, together with this method's `def` line, lies above this chunk).
        to_be_removed_tracks = helper.get_sorted_tracks_in_playlist(
            playlist_id)[-to_be_removed:]
        for track in to_be_removed_tracks:
            relevant_items[playlist_id].append(track)
            tracks = np.delete(tracks, np.where(tracks == track))
        grouped[playlist_id] = tracks
    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr()
    self.URM_train = self.URM_train.astype(np.float64)
    self.dict_test = relevant_items

# OLD_SPLIT (Piccio)
def split(self, URM, URM_df):
    """Split the URM into train and test parts (segment size 2).

    Randomly selects playlists until the per-segment quota array `dist`
    is exhausted, then removes ~20% random tracks from each selected
    playlist. Side effects: sets self.target_playlists, self.URM_train,
    self.URM_test and self.dict_test (playlist_id -> held-out tracks).
    """
    # splitting URM into test set and train set
    selected_playlists = np.array([])
    available_playlists = np.arange(URM.shape[0])
    target_analyzer = TargetAnalyzer()
    segment_size = 2
    min_playlist_len_after_split = 5
    dist = target_analyzer.get_distribution_array(
        segment_size=segment_size)  # in this way n_target = 10000
    helper = Helper()
    target_playlists = helper.get_target_playlists_list()[:5000]
    n_target = np.sum(dist)  # - len(target_playlists)
    while n_target > 0:
        random_index = randint(0, len(available_playlists) - 1)
        playlist_id = available_playlists[random_index]
        # 80% of the playlist length is what would remain after the split
        target_len = len(URM[playlist_id].data) * 0.8
        if target_len > min_playlist_len_after_split:
            target_segment = int(target_len / segment_size)
            # resample until a playlist falling in a segment with
            # remaining quota is found
            while dist[target_segment] <= 0:
                random_index = randint(0, len(available_playlists) - 1)
                playlist_id = available_playlists[random_index]
                target_len = len(URM[playlist_id].data) * 0.8
                if target_len > min_playlist_len_after_split:
                    target_segment = int(target_len / segment_size)
            n_target -= 1
            dist[target_segment] -= 1
            selected_playlists = np.append(selected_playlists, playlist_id)
            available_playlists = np.delete(
                available_playlists,
                np.where(available_playlists == playlist_id))
    self.target_playlists = selected_playlists.astype(int)
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    grouped_test = grouped.copy()
    relevant_items = defaultdict(list)
    count = 0
    for playlist_id in selected_playlists:
        tracks = np.array(grouped[playlist_id])
        # hold out ~20% of each selected playlist, picked at random
        to_be_removed = int(len(tracks) * 0.2)
        for i in range(to_be_removed):
            index = randint(0, len(tracks) - 1)
            removed_track = tracks[index]
            relevant_items[playlist_id].append(removed_track)
            tracks = np.delete(tracks, index)
        grouped[playlist_id] = tracks
        grouped_test[playlist_id] = relevant_items[playlist_id]
        count += 1
    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr()
    self.dict_test = relevant_items
    self.URM_test = MultiLabelBinarizer(
        classes=all_tracks, sparse_output=True).fit_transform(grouped_test)
    self.URM_test = self.URM_test.tocsr()
    self.URM_test = self.URM_test.astype(np.float64)
    self.URM_train = self.URM_train.astype(np.float64)

# Get all randomic playlists applying cluster, segment = 2
def split_cluster_randomic(self, URM, URM_df):
    """Same clustering split as `split` but with segment_size = 1.

    Side effects: sets self.target_playlists, self.URM_train,
    self.URM_test and self.dict_test.
    """
    # splitting URM into test set and train set
    selected_playlists = np.array([])
    available_playlists = np.arange(URM.shape[0])
    target_analyzer = TargetAnalyzer()
    segment_size = 1
    min_playlist_len_after_split = 5
    dist = target_analyzer.get_distribution_array(
        segment_size=segment_size)  # in this way n_target = 10000
    helper = Helper()
    target_playlists = helper.get_target_playlists_list()[:5000]
    n_target = np.sum(dist)  # - len(target_playlists)
    while n_target > 0:
        random_index = randint(0, len(available_playlists) - 1)
        playlist_id = available_playlists[random_index]
        target_len = len(URM[playlist_id].data) * 0.8
        if target_len > min_playlist_len_after_split:
            target_segment = int(target_len / segment_size)
            # resample until the playlist's segment still has quota left
            while dist[target_segment] <= 0:
                random_index = randint(0, len(available_playlists) - 1)
                playlist_id = available_playlists[random_index]
                target_len = len(URM[playlist_id].data) * 0.8
                if target_len > min_playlist_len_after_split:
                    target_segment = int(target_len / segment_size)
            n_target -= 1
            dist[target_segment] -= 1
            selected_playlists = np.append(selected_playlists, playlist_id)
            available_playlists = np.delete(
                available_playlists,
                np.where(available_playlists == playlist_id))
    self.target_playlists = selected_playlists.astype(int)
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    grouped_test = grouped.copy()
    relevant_items = defaultdict(list)
    count = 0
    for playlist_id in selected_playlists:
        tracks = np.array(grouped[playlist_id])
        # if playlist_id in target_playlists:
        #     to_be_removed = int(len(tracks) * 0.2)
        #     to_be_removed_tracks = helper.get_sorted_tracks_in_playlist(playlist_id)[-to_be_removed:]
        #     for track in to_be_removed_tracks:
        #         relevant_items[playlist_id].append(track)
        #         tracks = np.delete(tracks, np.where(tracks == track))
        #     for i in range(to_be_removed):
        #         removed_track = tracks[-1]
        #         relevant_items[playlist_id].append(removed_track)
        #         tracks = np.delete(tracks, len(tracks) - 1)
        # else:
        to_be_removed = int(len(tracks) * 0.2)
        for i in range(to_be_removed):
            index = randint(0, len(tracks) - 1)
            removed_track = tracks[index]
            relevant_items[playlist_id].append(removed_track)
            tracks = np.delete(tracks, index)
        grouped[playlist_id] = tracks
        grouped_test[playlist_id] = relevant_items[playlist_id]
        count += 1
    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr()
    self.dict_test = relevant_items
    # bib URM
    # self.URM_train = helper.get_urm_csr_bib(URM = self.URM_train)
    # plotter = TargetAnalyzer()
    # plotter.plot_standard_distribution()
    # plotter.plot_distribution(self.URM_train, self.target_playlists)
    self.URM_test = MultiLabelBinarizer(
        classes=all_tracks, sparse_output=True).fit_transform(grouped_test)
    self.URM_test = self.URM_test.tocsr()
    self.URM_test = self.URM_test.astype(np.float64)
    self.URM_train = self.URM_train.astype(np.float64)

# Get all randomic playlists of LAST 5000 target playlists applying cluster, segment = 1
def split_cluster_randomic_only_last(self, URM, URM_df):
    """Cluster-based split restricted to playlists NOT in the first 5000
    target playlists, matching the length distribution of the last 5000.

    Side effects: sets self.target_playlists, self.URM_train and
    self.dict_test (no URM_test is built here).
    """
    # splitting URM into test set and train set
    segment = 1
    selected_playlists = np.array([])
    available_playlists = np.arange(URM.shape[0])
    target_analyzer = TargetAnalyzer()
    # Gets distribution of only last 5000 playlists
    dist = target_analyzer.get_distribution_array_only_last(segment)
    helper = Helper()
    target_playlists = helper.get_target_playlists_list(
    )[:5000]  # WILL REMOVE THEM
    print("Clustering with segment = " + str(segment))
    for key in tqdm(range(len(dist))):
        # rejection-sample playlists until this segment's quota is filled
        while dist[key] != 0:
            random_index = randint(0, len(available_playlists) - 1)
            playlist_id = available_playlists[random_index]
            target_segment = int(0.8 * len(URM[playlist_id].data))
            if target_segment == key and playlist_id not in target_playlists:
                available_playlists = np.delete(
                    available_playlists,
                    np.where(available_playlists == playlist_id))
                selected_playlists = np.append(selected_playlists, playlist_id)
                dist[key] -= 1
    self.target_playlists = selected_playlists.astype(int)
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    relevant_items = defaultdict(list)
    for playlist_id in selected_playlists:
        # tracks = the playlist's track list taken from the URM dataframe
        tracks = np.array(grouped[playlist_id])
        to_be_removed = int(len(tracks) * 0.2)
        for i in range(to_be_removed):
            index = randint(0, len(tracks) - 1)
            removed_track = tracks[index]
            relevant_items[playlist_id].append(removed_track)
            tracks = np.delete(tracks, index)
        grouped[playlist_id] = tracks
    all_tracks = self.tracks_df["track_id"].unique()
    matrix = MultiLabelBinarizer(classes=all_tracks,
                                 sparse_output=True).fit_transform(grouped)
    self.URM_train = matrix.tocsr()
    self.URM_train = self.URM_train.astype(np.float64)
    self.dict_test = relevant_items

def MAP(self, recommended_items, relevant_items):
    """Average precision of `recommended_items` against `relevant_items`.

    Both arguments are assumed to contain unique items
    (np.in1d is called with assume_unique=True).
    """
    is_relevant = np.in1d(recommended_items, relevant_items,
                          assume_unique=True)
    # Cumulative sum: precision at 1, at 2, at 3 ...
    p_at_k = is_relevant * np.cumsum(
        is_relevant, dtype=np.float32) / (1 + np.arange(len(is_relevant)))
    map_score = np.sum(p_at_k) / np.min(
        [len(relevant_items), len(is_relevant)])
    return map_score

def evaluate(self, playlist_id, recommended_items):
    """MAP score of a recommendation list for one held-out playlist."""
    relevant_items = self.dict_test[playlist_id]
    map = self.MAP(recommended_items, relevant_items)
    return map

# evaluates just weights assigned in the run method. No array weights needed
def global_evaluate_single(self, recommender):
    """Fit `recommender` on URM_train and return its mean MAP over
    all target playlists."""
    MAP_final = 0
    recommender.fit(self.URM_train)
    count = 0
    for target in tqdm(self.target_playlists):
        recommended_items = recommender.recommend(target)
        MAP_final += self.evaluate(target, recommended_items)
        count += 1
    MAP_final /= len(self.target_playlists)
    return MAP_final
# NOTE(review): the original file had its newlines collapsed; the indentation
# below is reconstructed from the flattened source — confirm against VCS history.
def split(self):
    """
    Splits the dataset into training and test set.
    Builds the URM train csr matrix and the test dataframe in a
    submission-like structure.

    Side effects: sets self.num_playlists_to_evaluate,
    self.num_playlists_to_test, self.target_playlists, self.test_df,
    self.URM_train and self.target_tracks.
    """
    print('Splitting the dataset...')
    # Load the original data set and group by playlist
    URM_df = self.b.get_train_final()
    grouped = URM_df.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    grouped.sort_index(inplace=True)
    # Set num_playlist_to_evaluate
    self.num_playlists_to_evaluate = int(grouped.size)
    #self.num_playlists_to_evaluate = 10000
    if self.is_test:
        # Set num_playlist_to_test
        #self.num_playlists_to_test = self.num_playlists_to_evaluate
        self.num_playlists_to_test = self.b.get_target_playlists().shape[0]
        # Find indices of playlists to test and set target_playlists
        testable_idx = grouped[[len(x) >= 1 for x in grouped]].index
        test_idx = np.random.choice(testable_idx,
                                    self.num_playlists_to_evaluate,
                                    replace=False)
        test_idx.sort()
        #self.target_playlists = test_idx
        self.target_playlists = self.b.get_target_playlists()
        # Extract the test set portion of the data set
        test_mask = grouped[test_idx]
        test_mask.sort_index(inplace=True)
        # Iterate over the test set to remove tracks from each playlist.
        # NOTE(review): an earlier comment said "remove 5 tracks", but the
        # code removes 20% of each playlist (int(len(t) * 0.2)).
        ord_playlists = self.b.get_ordered_playlists()
        test_df_list = []
        i = 0
        for t in test_mask:
            # Ordered (sequential) playlists give up their LAST 20% of
            # tracks; the others a random 20%.
            if test_idx[i] in self.b.get_ordered_playlists()[:4999]:
                t_tracks_to_test = np.asarray(
                    ord_playlists[test_idx[i]][int(-(len(t) * 0.2)):])
            else:
                t_tracks_to_test = np.random.choice(t,
                                                    int(len(t) * 0.2),
                                                    replace=False)
            # Only target playlists contribute rows to the test dataframe
            # and actually lose their held-out tracks.
            if test_idx[i] in self.b.get_target_playlists():
                test_df_list.append([test_idx[i], t_tracks_to_test])
                for tt in t_tracks_to_test:
                    t.remove(tt)
            i += 1
    else:
        self.num_playlists_to_test = self.b.get_target_playlists().shape[0]
        self.target_playlists = list(self.b.get_target_playlists())
        test_df_list = []
        i = 0
        test_mask = grouped[self.target_playlists]
        test_mask.sort_index(inplace=True)
        for t in test_mask:
            test_df_list.append([self.target_playlists[i], t])
            i += 1
    # Build test_df and URM_train
    self.test_df = pd.DataFrame(test_df_list,
                                columns=['playlist_id', 'track_ids'])
    URM_train_matrix = MultiLabelBinarizer(
        classes=self.b.get_tracks(), sparse_output=True).fit_transform(grouped)
    self.URM_train = URM_train_matrix.tocsr()
    # Set target tracks: the sorted, de-duplicated union of all held-out tracks
    t_list = [t for sub in self.test_df['track_ids'] for t in sub]
    t_list_unique = list(set(t_list))
    t_list_unique.sort()
    self.target_tracks = t_list_unique
def build_URM(self):
    """Build the user rating matrix (URM) as a CSR sparse matrix.

    One row per playlist (in grouped order), one column per unique
    track id found in ``self.tracks``; a cell is 1 when the playlist
    contains the track.
    """
    # Collect, for every playlist, the list of its track ids.
    playlist_tracks = self.train.groupby('playlist_id', as_index=True).apply(
        lambda playlist: list(playlist['track_id']))
    all_track_ids = self.tracks['track_id'].unique()
    binarizer = MultiLabelBinarizer(classes=all_track_ids, sparse_output=True)
    return binarizer.fit_transform(playlist_tracks).tocsr()
def load_matrices(self):
    """Build the URM and ICM from the CSV data set and cache the URM to disk.

    The URM gives each track of a sequential (ordered) playlist a linearly
    increasing weight (later tracks weigh more), while tracks of ordinary
    playlists get weight 1.0. The ICM horizontally stacks one-hot encodings
    of album_id, artist_id and duration_sec.

    Side effects: sets self.URM, self.ICM, self.target_playlists and
    writes ./Dataset/URM.npz.
    """
    # addresses of the files
    train_file = './Dataset/Data/train.csv'
    target_playlists_file = './Dataset/Data/target_playlists.csv'
    tracks_file = './Dataset/Data/tracks.csv'
    sequential_train_file = './Dataset/Data/train_sequential.csv'
    train_data = pd.read_csv(train_file)
    tracks_data = pd.read_csv(tracks_file)
    sequential_data = pd.read_csv(sequential_train_file)
    target_data = pd.read_csv(target_playlists_file)

    # building the URM taking into account the order of the 5k target playlists
    sequential_playlists = sequential_data['playlist_id'].unique()
    target_playlists = target_data['playlist_id'].unique()
    all_playlists = train_data['playlist_id'].unique()
    t = train_data.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    t = t.drop(sequential_playlists)
    s = sequential_data.groupby(
        'playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
    n_playlists = all_playlists.size
    n_tracks = tracks_data['track_id'].unique().size

    rows = []
    for i in tqdm(range(n_playlists)):
        row = np.zeros(n_tracks)
        if i in s:
            # Sequential playlist: distribute a weight mass of `top`
            # linearly over the ordered tracks on top of a flat `base`,
            # so position k (0-based) gets base + (k + 1) * weight.
            indices = s[i]
            n = len(indices)
            top = 0.93  # top is the tunable value
            base = 1 - top
            weight = top / n
            # A duplicated track id keeps its LAST weight, as in the
            # original dict-based implementation.
            for k, track in enumerate(indices):
                row[track] = base + (k + 1) * weight
        else:
            # Ordinary playlist: plain binary interactions.
            for track in t[i]:
                row[track] = 1.0
        rows.append(sps.csr_matrix(row))
    # Stack all rows once: the former sps.vstack((mat, row_csr)) inside the
    # loop re-copied the whole matrix per playlist (quadratic) and left the
    # result in COO format; a single vstack + tocsr is linear and matches
    # the CSR convention used everywhere else in this file.
    mat = sps.vstack(rows).tocsr()
    self.URM = mat
    sps.save_npz("./Dataset/URM.npz", mat)

    # building the ICM matrix: one binarized block per feature, stacked
    # horizontally (album, artist, duration — same order as before).
    # NOTE: duration_sec is one-hot encoded as a raw categorical value;
    # consider bucketing durations instead.
    icm_blocks = [
        self._binarize_feature(tracks_data, feature)
        for feature in ('album_id', 'artist_id', 'duration_sec')
    ]
    ICM = sps.hstack(icm_blocks)
    self.ICM = ICM.tocsr()
    self.target_playlists = target_playlists

def _binarize_feature(self, tracks_data, feature):
    """One-hot encode a single track feature as a CSR matrix.

    Rows are ordered by track_id; columns are the distinct feature values.
    """
    cols = tracks_data.reindex(columns=['track_id', feature])
    cols.sort_values(
        by='track_id',
        inplace=True)  # this seems not useful, values are already ordered
    values = [[v] for v in cols[feature]]
    return MultiLabelBinarizer(sparse_output=True).fit_transform(values).tocsr()