Example #1
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# URM, RP3betaRecommender and evaluate_algorithm are project-local helpers

def crossValidate(logFile, test_size=0.01):
    """
    :param logFile: open, writable file handle the averaged metrics are appended to
    :param test_size: fraction of the interactions held out as the test split
    :return: prints to screen, and writes to logFile, the CV average of the metrics
    """
    # Kaggle kernels mount the data one directory up; local runs read from ./input
    Kaggle = False

    if Kaggle:
        interactionsCsv = pd.read_csv("../input/train.csv")
        targetList = pd.read_csv("../input/target_playlists.csv").iloc[:, 0]
        tracksCsv = pd.read_csv("../input/tracks.csv")
    else:
        interactionsCsv = pd.read_csv("input/train.csv")
        targetList = pd.read_csv("input/target_playlists.csv").iloc[:, 0]
        tracksCsv = pd.read_csv("input/tracks.csv")

    # one slot per CV fold
    cumulative_precision = [None] * 8
    cumulative_recall = [None] * 8
    cumulative_MAP = [None] * 8


    cf_parameters = {'topK': 80,
                     'alpha': 1,
                     'beta': 0.27,
                     'normalize_similarity': True,
                     'implicit': True,
                     'norm': 'l1'}

    for i, seed in enumerate([13, 17, 23, 33, 45, 57, 69, 77]):

        X_train, X_test = train_test_split(interactionsCsv, test_size=test_size, random_state=seed)
        urm_train = URM(X_train)
        urm_test = URM(X_test)

        rp3b = RP3betaRecommender(urm_train.getCSR())
        rp3b.fit(**cf_parameters)

        cumulative_precision[i], cumulative_recall[i], cumulative_MAP[i] = evaluate_algorithm(urm_test, rp3b)


    cumulative_precision = np.array(cumulative_precision)
    cumulative_recall = np.array(cumulative_recall)
    cumulative_MAP = np.array(cumulative_MAP)

    print("Recommender, performance is: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.6f}"
              .format(cumulative_precision.mean(), cumulative_recall.mean(), cumulative_MAP.mean()))

    logFile.write("Test case: {}, Precision = {:.4f}, Recall = {:.4f}, MAP = {:.6f}\n".format(cf_parameters,
                                                                                                  cumulative_precision.mean(),
                                                                                                  cumulative_recall.mean(),
                                                                                                  cumulative_MAP.mean()))
    logFile.flush()
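
# Usage sketch (not from the original source; the log path is illustrative):
# crossValidate expects an already-open, writable log file handle.
if __name__ == '__main__':
    with open("cv_log.txt", "a") as logFile:
        crossValidate(logFile, test_size=0.01)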
Example #2
def population_train_split(csv, method=('threshold', [14, 35])):
    """
    :param csv: a DataFrame with playlist_id and track_id columns
    :param method: ('threshold', [t1, t2]) for fixed playlist-length cut points,
        or ('percentile', [p1, p2]) to derive them from the length distribution

    :return: a tuple (train_group1, train_group2, train_group3)
    """
    urm = URM(csv)
    if method[0] == 'threshold':
        group_1_2_TH = method[1][0]
        group_2_3_TH = method[1][1]

    elif method[0] == 'percentile':
        group_1_2_TH = getURMThreshold(urm, method[1][0])
        group_2_3_TH = getURMThreshold(urm, method[1][1])
    else:
        raise ValueError("not a valid split method")

    unique_playlists = np.unique(csv.iloc[:, 0].values)
    print("Thresholds: {}, {}".format(group_1_2_TH, group_2_3_TH))

    playlists_group1 = [
        i for i in unique_playlists
        if len(urm.extractTracksFromPlaylist(i)) <= group_1_2_TH
    ]
    playlists_group2 = [
        i for i in unique_playlists
        if group_1_2_TH < len(urm.extractTracksFromPlaylist(i)) <= group_2_3_TH
    ]
    playlists_group3 = [
        i for i in unique_playlists
        if len(urm.extractTracksFromPlaylist(i)) > group_2_3_TH
    ]

    train_group1 = csv.loc[csv['playlist_id'].isin(playlists_group1)]
    train_group2 = csv.loc[csv['playlist_id'].isin(playlists_group2)]
    train_group3 = csv.loc[csv['playlist_id'].isin(playlists_group3)]

    return (train_group1, train_group2, train_group3)
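
# Sketch of the helper assumed above (getURMThreshold is not shown in these
# excerpts; this is a guess at its behavior, not the original implementation):
# map a percentile of the playlist-length distribution to a length cut point.
# getCSR() is the project helper used elsewhere in these snippets.
def getURMThreshold(urm, percentile):
    lengths = np.diff(urm.getCSR().indptr)  # row-wise nonzero counts = playlist lengths
    return np.percentile(lengths, percentile)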
Example #3
def population_split(csv, method=('threshold', [7, 35])):  # two cut points are required; the second default value here is illustrative
    """
    :param csv: a DataFrame with playlist_id and track_id columns
    :param method: ('threshold', [t1, t2]) for fixed playlist-length cut points, or
        ('percentile', [p1, p2]); percentiles are used instead of fixed thresholds
        to cope with varying playlist lengths (what we mean by "few" tracks depends
        on the dataset, it is not a hardcoded number)

    :return: a tuple (playlists_group1, playlists_group2, playlists_group3)
    """
    urm = URM(csv)
    if method[0] == 'threshold':
        group_1_2_TH = method[1][0]
        group_2_3_TH = method[1][1]

    elif method[0] == 'percentile':
        group_1_2_TH = getURMThreshold(urm, method[1][0])
        group_2_3_TH = getURMThreshold(urm, method[1][1])
    else:
        raise ValueError("not a valid split method")

    unique_playlists = np.unique(csv.iloc[:, 0].values)
    print("Thresholds: {}, {}".format(group_1_2_TH, group_2_3_TH))

    playlists_group1 = [
        i for i in unique_playlists
        if len(urm.extractTracksFromPlaylist(i)) <= group_1_2_TH
    ]
    playlists_group2 = [
        i for i in unique_playlists
        if group_1_2_TH < len(urm.extractTracksFromPlaylist(i)) <= group_2_3_TH
    ]
    playlists_group3 = [
        i for i in unique_playlists
        if len(urm.extractTracksFromPlaylist(i)) > group_2_3_TH
    ]

    return (playlists_group1, playlists_group2, playlists_group3)
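
# Usage sketch (illustrative values, not from the original source; assumes the
# X_train split from Example #1): derive the cut points from the 25th/75th length
# percentiles, then build one URM per group.
g1, g2, g3 = population_train_split(X_train, method=('percentile', [25, 75]))
urm_g1, urm_g2, urm_g3 = URM(g1), URM(g2), URM(g3)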
Example #4
            # idx is a (row_indices, col_indices) pair
            self.similarity_matrix = csr_matrix((data, idx), shape=(num_items, num_items))


###
import random


def output(i, j, val):
    # convert back to 1-indexed
    print('{0}\t{1}\t{2:.3f}'.format(i + 1, j + 1, val))

interactionsCsv = pd.read_csv("../input/train.csv")
targetList = pd.read_csv("../input/target_playlists.csv").iloc[:, 0]
X_train, X_test = train_test_split(interactionsCsv, test_size=0.05, random_state=17)

urm_train = URM(X_train)
urm_test = URM(X_test)

dataset = fast_sparse_matrix(urm_train.getCSR())
num_users, num_items = dataset.shape

model = SLIM()

num_samples = 2


print('learning entire similarity matrix...')
# usually we'd call fit() on the entire dataset
model.fit(dataset)
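
# Sketch (an assumption, not in the original file): if the fitted model exposes the
# sparse similarity_matrix built in the fragment above, its nonzeros can be dumped
# through output(), which converts the ids back to 1-indexed.
coo = model.similarity_matrix.tocoo()
for i, j, val in zip(coo.row, coo.col, coo.data):
    output(i, j, val)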
Example #5
        # start the worker pool, passing the function (with the fixed part of its
        # input already bound) and the single remaining, variable parameter
        res = pool.map(_pfit, np.arange(n_items))

        # res contains a vector of (values, rows, cols) tuples
        values, rows, cols = [], [], []
        for values_, rows_, cols_ in res:
            values.extend(values_)
            rows.extend(rows_)
            cols.extend(cols_)

        # generate the sparse weight matrix
        self.W_sparse = sps.csc_matrix((values, (rows, cols)),
                                       shape=(n_items, n_items),
                                       dtype=np.float32)


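# Self-contained sketch of the same pattern (illustrative only; _pfit itself is not
# shown in this excerpt): map a per-column worker over the item indices and stitch
# the returned (values, rows, cols) triples into one sparse weight matrix.
import numpy as np
import scipy.sparse as sps
from multiprocessing import Pool

def _toy_pfit(j):
    # stand-in worker: a single nonzero weight on the diagonal of column j
    return [1.0], [j], [j]

if __name__ == '__main__':
    n_items = 4
    with Pool(2) as pool:
        res = pool.map(_toy_pfit, np.arange(n_items))
    values, rows, cols = [], [], []
    for values_, rows_, cols_ in res:
        values.extend(values_)
        rows.extend(rows_)
        cols.extend(cols_)
    W = sps.csc_matrix((values, (rows, cols)), shape=(n_items, n_items))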
###
interactionsCsv = pd.read_csv("../input/train.csv")
targetList = pd.read_csv("../input/target_playlists.csv").iloc[:, 0]
X_train, X_test = train_test_split(interactionsCsv,
                                   test_size=0.05,
                                   random_state=17)

urm_train = URM(X_train)
urm_test = URM(X_test)

model = SLIM_RMSE(urm_train.getCSR())
model.fit(l1_penalty=0.1, l2_penalty=0.1, positive_only=True, topK=100)

model.recommend(0, 10)  # presumably the top-10 recommendations for playlist 0
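
# Sketch (an assumption): the evaluate_algorithm helper from Example #1 can score
# this model on the held-out split in the same way.
precision, recall, MAP = evaluate_algorithm(urm_test, model)
print("Precision = {:.4f}, Recall = {:.4f}, MAP = {:.6f}".format(precision, recall, MAP))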
Example #6
    # group 3 is used for training only; pd.concat below silently drops the None
    X_train_3, X_test_3 = train_group3, None

X_train = pd.concat([X_train_1, X_train_2, X_train_3])
X_test = pd.concat([X_test_1, X_test_2, X_test_3])

# G1 (1185556, 2) (26235, 2)
# G2 (1119337, 2) (92454, 2)        Precision = 0.0991, Recall = 0.2313, MAP = 0.123951
# G3 (1107769, 2) (104022, 2)       Precision = 0.1809, Recall = 0.1936, MAP = 0.119263
# Tot (989080, 2) (222711, 2)
"""        Build Matrices      """

icm_1 = ICM(tracksCsv, col="artist")
icm_2 = ICM(tracksCsv, col="album")

if submission:
    urm_train = URM(interactionsCsv)
    urm_train_t = URM(interactionsCsv[['track_id', 'playlist_id']],
                      transposed=True)
else:
    urm_train = URM(X_train)
    urm_train_t = URM(X_train[['track_id', 'playlist_id']], transposed=True)
    # urm_test = URM(X_test)
    urm_test = URM(X_test_1)  # evaluate on group 1 only

matrices = {
    'URM': urm_train,
    'URM_T': urm_train_t,
    'ICM_1': icm_1,
    'ICM_2': icm_2
}
"""             Algorithm            """
Example #7
X_test = pd.concat([X_test1, X_test2])

"""              Params              """
""" Select only one group and alg. for tuning, 
select both groups to compare MAP to the alg.
with no population split"""

enable_dict = {'enabled_groups': enabled_groups,
               'enableCBI': False,
               'enableRP3B': False,
               'enableCBF': True,
               'enableCBU': False,
               'enableSLIM': False}
submission = False

"""             Build URM            """
if submission:
    X_train = interactionsCsv

urm_train = URM(X_train)
urm_test = URM(X_test)
icm_1 = ICM(tracksCsv, col="artist")
icm_2 = ICM(tracksCsv, col="album")

X_train_t = X_train[['track_id', 'playlist_id']]
urm_train_t = URM(X_train_t, transposed=True)

matrices = {'URM': urm_train, 'URM_T': urm_train_t, 'ICM_1': icm_1, 'ICM_2': icm_2}

"""             Algorithm            """

"""G1 0.121 G2 0.116 -> Server 0.09042"""
"""G1 0.125 G2 0.117 -> Server 0.09134"""
group1_param = {'cbi_param_dict' : {'k': 150, 'h': 20, 'mode': 'item'},
                'cbu_param_dict' : {'k': 150, 'h': 20, 'mode': 'user'},
Example #8
File: main.py  Project: vittorio96/RecSys
Kaggle = False

if Kaggle:
    interactionsCsv = pd.read_csv("../input/train.csv")
    targetList = pd.read_csv("../input/target_playlists.csv").iloc[:, 0]
    tracksCsv = pd.read_csv("../input/tracks.csv")
else:
    interactionsCsv = pd.read_csv("input/train.csv")
    targetList = pd.read_csv("input/target_playlists.csv").iloc[:, 0]
    tracksCsv = pd.read_csv("input/tracks.csv")

print(interactionsCsv.describe())
icm = ICM(tracksCsv, col="artist")
icm2 = ICM(tracksCsv, col="album")
urm_full = URM(interactionsCsv)
X_train, X_test = train_test_split(interactionsCsv,
                                   test_size=0.05,
                                   random_state=17)

urm_train = URM(X_train)
urm_test = URM(X_test)

# Transposed matrices (track_id as rows, playlist_id as columns)
X_train_t = X_train[['track_id', 'playlist_id']]
X_test_t = X_test[['track_id', 'playlist_id']]
urm_full_t = URM(interactionsCsv[['track_id', 'playlist_id']], transposed=True)
urm_test_t = URM(X_test_t, transposed=True)
urm_train_t = URM(X_train_t, transposed=True)
"""
    RUNNING SCRIPT PARAMETERS