def crossValidate(logFile, test_size=0.01):
    """
    Cross-validates an RP3beta recommender over 8 fixed seeded splits.
    :param logFile: open, writable file object the averaged metrics are appended to
    :param test_size: fraction of interactions held out in each test split
    :return: prints to screen and writes to logFile the cv average of the metrics
    """
    Kaggle = False
    if Kaggle:
        interactionsCsv = pd.read_csv("../input/train.csv")
        targetList = pd.read_csv("../input/target_playlists.csv").iloc[:, 0]
        tracksCsv = pd.read_csv("../input/tracks.csv")
    else:
        interactionsCsv = pd.read_csv("input/train.csv")
        targetList = pd.read_csv("input/target_playlists.csv").iloc[:, 0]
        tracksCsv = pd.read_csv("input/tracks.csv")

    cumulative_precision = [None] * 8
    cumulative_recall = [None] * 8
    cumulative_MAP = [None] * 8
    cf_parameters = {'topK': 80,
                     'alpha': 1,
                     'beta': 0.27,
                     'normalize_similarity': True,
                     'implicit': True,
                     'norm': 'l1'}

    for i, seed in enumerate([13, 17, 23, 33, 45, 57, 69, 77]):
        X_train, X_test = train_test_split(interactionsCsv, test_size=test_size, random_state=seed)
        urm_train = URM(X_train)
        urm_test = URM(X_test)

        rp3b = RP3betaRecommender(urm_train.getCSR())
        rp3b.fit(**cf_parameters)
        cumulative_precision[i], cumulative_recall[i], cumulative_MAP[i] = \
            evaluate_algorithm(urm_test, rp3b)

    cumulative_precision = np.array(cumulative_precision)
    cumulative_recall = np.array(cumulative_recall)
    cumulative_MAP = np.array(cumulative_MAP)

    print("Recommender performance: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.6f}"
          .format(cumulative_precision.mean(), cumulative_recall.mean(), cumulative_MAP.mean()))
    logFile.write("Test case: {}, Precision = {:.4f}, Recall = {:.4f}, MAP = {:.6f}\n"
                  .format(cf_parameters, cumulative_precision.mean(),
                          cumulative_recall.mean(), cumulative_MAP.mean()))
    logFile.flush()
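# Minimal usage sketch (the log filename is a hypothetical example):
# crossValidate only needs an open, writable file object; the averaged
# metrics are appended to it and flushed after each run.
with open("cv_rp3beta.log", "a") as logFile:
    crossValidate(logFile, test_size=0.01)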
def population_train_split(csv, method=('threshold', [14, 35])):
    """
    :param csv: a DataFrame with playlist_id, track_id columns
    :param method: can be ('threshold', [t1, t2]) for fixed playlist-length
                   thresholds, or ('percentile', [p1, p2]) for percentiles
    :return: a tuple (train_group1, train_group2, train_group3) of DataFrames
    """
    urm = URM(csv)
    if method[0] == 'threshold':
        group_1_2_TH = method[1][0]
        group_2_3_TH = method[1][1]
    elif method[0] == 'percentile':
        group_1_2_TH = getURMThreshold(urm, method[1][0])
        group_2_3_TH = getURMThreshold(urm, method[1][1])
    else:
        raise ValueError("not a valid split method")

    unique_playlists = np.unique(np.array(csv.iloc[:, 0].tolist()))
    print("Thresholds: {}, {}".format(group_1_2_TH, group_2_3_TH))

    playlists_group1 = [i for i in unique_playlists
                        if len(urm.extractTracksFromPlaylist(i)) <= group_1_2_TH]
    playlists_group2 = [i for i in unique_playlists
                        if group_1_2_TH < len(urm.extractTracksFromPlaylist(i)) <= group_2_3_TH]
    playlists_group3 = [i for i in unique_playlists
                        if len(urm.extractTracksFromPlaylist(i)) > group_2_3_TH]

    train_group1 = csv.loc[csv['playlist_id'].isin(playlists_group1)]
    train_group2 = csv.loc[csv['playlist_id'].isin(playlists_group2)]
    train_group3 = csv.loc[csv['playlist_id'].isin(playlists_group3)]
    return (train_group1, train_group2, train_group3)
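# Usage sketch (assumed workflow, mirroring the per-group handling used
# later in this repo; thresholds, test_size and seed are illustrative):
# split the interactions by playlist length, then hold out test data
# independently within each group.
interactions = pd.read_csv("input/train.csv")
train_group1, train_group2, train_group3 = population_train_split(
    interactions, method=('threshold', [14, 35]))
X_train_1, X_test_1 = train_test_split(train_group1, test_size=0.2, random_state=17)
X_train_2, X_test_2 = train_test_split(train_group2, test_size=0.2, random_state=17)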
def population_split(csv, method=('threshold', [7, 14])):
    """
    Like population_train_split, but returns the playlist ids of each group
    instead of the interaction rows.
    :param csv: a DataFrame with playlist_id, track_id columns
    :param method: ('threshold', [t1, t2]) for fixed playlist-length
                   thresholds, or ('percentile', [p1, p2]); a percentile
                   separates the lower n% of the population from the
                   remaining (100-n)%, and is used instead of a fixed
                   threshold to cope with dynamic playlist length (what we
                   mean by "few" tracks depends on the dataset, it is not a
                   hardcoded number)
    :return: a tuple (playlists_group1, playlists_group2, playlists_group3)
    """
    # NOTE: the original default was ('threshold', [7]), which supplies only
    # one threshold and would raise an IndexError below; the second value
    # (14) is an assumed placeholder so the default is usable.
    urm = URM(csv)
    if method[0] == 'threshold':
        group_1_2_TH = method[1][0]
        group_2_3_TH = method[1][1]
    elif method[0] == 'percentile':
        group_1_2_TH = getURMThreshold(urm, method[1][0])
        group_2_3_TH = getURMThreshold(urm, method[1][1])
    else:
        raise ValueError("not a valid split method")

    unique_playlists = np.unique(np.array(csv.iloc[:, 0].tolist()))
    print("Thresholds: {}, {}".format(group_1_2_TH, group_2_3_TH))

    playlists_group1 = [i for i in unique_playlists
                        if len(urm.extractTracksFromPlaylist(i)) <= group_1_2_TH]
    playlists_group2 = [i for i in unique_playlists
                        if group_1_2_TH < len(urm.extractTracksFromPlaylist(i)) <= group_2_3_TH]
    playlists_group3 = [i for i in unique_playlists
                        if len(urm.extractTracksFromPlaylist(i)) > group_2_3_TH]

    return (playlists_group1, playlists_group2, playlists_group3)
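# Sketch of the percentile mode (the percentile values 25 and 60 are assumed
# for illustration): getURMThreshold converts each percentile of the
# playlist-length distribution into a concrete length threshold, so the
# grouping adapts to the dataset instead of relying on hardcoded lengths.
interactions = pd.read_csv("input/train.csv")
g1_ids, g2_ids, g3_ids = population_split(
    interactions, method=('percentile', [25, 60]))
print(len(g1_ids), len(g2_ids), len(g3_ids))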
        self.similarity_matrix = csr_matrix((data, idx), (num_items, num_items))


###
import random


def output(i, j, val):
    # convert back to 1-indexed
    print('{0}\t{1}\t{2:.3f}'.format(i + 1, j + 1, val))


interactionsCsv = pd.read_csv("../input/train.csv")
targetList = pd.read_csv("../input/target_playlists.csv").iloc[:, 0]

X_train, X_test = train_test_split(interactionsCsv, test_size=0.05, random_state=17)
urm_train = URM(X_train)
urm_test = URM(X_test)

dataset = fast_sparse_matrix(urm_train.getCSR())
num_users, num_items = dataset.shape

num_samples = 2  # unused in this excerpt

print('learning entire similarity matrix...')
# usually we'll call train() on the entire dataset
model = SLIM()
model.fit(dataset)
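# Sketch (assumption: the fitted model exposes the items x items
# similarity_matrix set above as a scipy sparse matrix): dump the learned
# weights in the 1-indexed tab-separated format produced by output().
W = model.similarity_matrix.tocoo()
for i, j, val in zip(W.row, W.col, W.data):
    output(i, j, val)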
        # start the pool, passing the function (with the fixed part of its
        # input already bound) and the remaining, variable parameter
        res = pool.map(_pfit, np.arange(n_items))

        # res contains a vector of (values, rows, cols) tuples
        values, rows, cols = [], [], []
        for values_, rows_, cols_ in res:
            values.extend(values_)
            rows.extend(rows_)
            cols.extend(cols_)

        # generate the sparse weight matrix
        self.W_sparse = sps.csc_matrix((values, (rows, cols)),
                                       shape=(n_items, n_items),
                                       dtype=np.float32)


###
interactionsCsv = pd.read_csv("../input/train.csv")
targetList = pd.read_csv("../input/target_playlists.csv").iloc[:, 0]

X_train, X_test = train_test_split(interactionsCsv, test_size=0.05, random_state=17)
urm_train = URM(X_train)
urm_test = URM(X_test)

model = SLIM_RMSE(urm_train.getCSR())
model.fit(l1_penalty=0.1, l2_penalty=0.1, positive_only=True, topK=100)
model.recommend(0, 10)
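# The per-item worker _pfit used in the pool.map above is not shown in this
# excerpt. A minimal sketch of what it is assumed to do for SLIM: an
# ElasticNet regression of each item's column against all the others.
# Assumptions: URM_train is a module-level scipy CSC matrix the forked
# workers can read, and the hyperparameters are illustrative.
from sklearn.linear_model import ElasticNet

def _pfit(item_idx):
    y = URM_train[:, item_idx].toarray().ravel()
    X = URM_train.copy()
    # zero out the target column so the item cannot predict itself
    X.data[X.indptr[item_idx]:X.indptr[item_idx + 1]] = 0.0
    model = ElasticNet(alpha=1e-4, l1_ratio=0.1, positive=True,
                       fit_intercept=False, copy_X=False, max_iter=100)
    model.fit(X, y)
    nonzero = model.coef_.nonzero()[0]
    # (values, rows, cols) triplets, as consumed by the aggregation loop above
    return (model.coef_[nonzero].tolist(), nonzero.tolist(),
            [item_idx] * len(nonzero))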
X_train_3, X_test_3 = train_group3, None

X_train = pd.concat([X_train_1, X_train_2, X_train_3])
# pd.concat silently drops the None entry for group 3 (no test split there)
X_test = pd.concat([X_test_1, X_test_2, X_test_3])
# G1  (1185556, 2)  (26235, 2)
# G2  (1119337, 2)  (92454, 2)   Precision = 0.0991, Recall = 0.2313, MAP = 0.123951
# G3  (1107769, 2)  (104022, 2)  Precision = 0.1809, Recall = 0.1936, MAP = 0.119263
# Tot (989080, 2)   (222711, 2)

""" Build Matrices """
icm_1 = ICM(tracksCsv, col="artist")
icm_2 = ICM(tracksCsv, col="album")

if submission:
    urm_train = URM(interactionsCsv)
    urm_train_t = URM(interactionsCsv[['track_id', 'playlist_id']], transposed=True)
else:
    urm_train = URM(X_train)
    urm_train_t = URM(X_train[['track_id', 'playlist_id']], transposed=True)
    # urm_test = URM(X_test)
    urm_test = URM(X_test_1)

matrices = {'URM': urm_train, 'URM_T': urm_train_t, 'ICM_1': icm_1, 'ICM_2': icm_2}

""" Algorithm """
X_test = pd.concat([X_test1, X_test2])

""" Params """
""" Select only one group and alg. for tuning; select both groups to compare
MAP against the alg. with no population split """
enable_dict = {'enabled_groups': enabled_groups,
               'enableCBI': False,
               'enableRP3B': False,
               'enableCBF': True,
               'enableCBU': False,
               'enableSLIM': False}
submission = False

""" Build URM """
if submission:
    X_train = interactionsCsv

urm_train = URM(X_train)
urm_test = URM(X_test)
icm_1 = ICM(tracksCsv, col="artist")
icm_2 = ICM(tracksCsv, col="album")

X_train_t = X_train[['track_id', 'playlist_id']]
urm_train_t = URM(X_train_t, transposed=True)

matrices = {'URM': urm_train, 'URM_T': urm_train_t, 'ICM_1': icm_1, 'ICM_2': icm_2}

""" Algorithm """
"""G1 0.121 G2 0.116 -> Server 0.09042"""
"""G1 0.125 G2 0.117 -> Server 0.09134"""
group1_param = {'cbi_param_dict': {'k': 150, 'h': 20, 'mode': 'item'},
                'cbu_param_dict': {'k': 150, 'h': 20, 'mode': 'user'},
Kaggle = False
if Kaggle:
    interactionsCsv = pd.read_csv("../input/train.csv")
    targetList = pd.read_csv("../input/target_playlists.csv").iloc[:, 0]
    tracksCsv = pd.read_csv("../input/tracks.csv")
else:
    interactionsCsv = pd.read_csv("input/train.csv")
    targetList = pd.read_csv("input/target_playlists.csv").iloc[:, 0]
    tracksCsv = pd.read_csv("input/tracks.csv")

print(interactionsCsv.describe())

icm = ICM(tracksCsv, col="artist")
icm2 = ICM(tracksCsv, col="album")

urm_full = URM(interactionsCsv)
X_train, X_test = train_test_split(interactionsCsv, test_size=0.05, random_state=17)
urm_train = URM(X_train)
urm_test = URM(X_test)

# Transposed matrices
X_train_t = X_train[['track_id', 'playlist_id']]
X_test_t = X_test[['track_id', 'playlist_id']]  # bug fix: was derived from X_train_t
urm_full_t = URM(interactionsCsv[['track_id', 'playlist_id']], transposed=True)
urm_test_t = URM(X_test_t, transposed=True)
urm_train_t = URM(X_train_t, transposed=True)

""" RUNNING SCRIPT PARAMETERS