# ===== Example 1 =====
# Target users: read them and attach the contiguous user indices.
logger.info('Reading {}'.format(args.target_users))
targets = read_dataset(args.target_users, sep=',')
# merge with the new indices computed for the full user set
targets['user_idx'] = user_idx[targets['user_id'].values].values

# Interactions between users and items.
logger.info('Reading {}'.format(args.interactions))
interactions, n, n1 = read_interactions(args.interactions,
                                        sep=args.sep,
                                        user_to_idx=user_idx,
                                        item_to_idx=item_idx)
# keep only interactions whose item mapped to a known index
interactions = interactions[interactions['item_idx'] >= 0.0]

# Build the (users x items) rating matrix in sparse CSR format.
urm = df_to_csr(interactions,
                user_idx.shape[0],
                item_idx.shape[0],
                is_implicit=True)

# Load the already computed model (similarity matrix) and predict.
recommender = RecommenderClass()
model_path = 'output/models/' + args.model_file
recommender.load_weights(model_path)
recs = recommender.make_prediction(targets['user_idx'].values,
                                   urm,
                                   recomendable_items,
                                   num=5)

# Open the prediction file and write the header.
if args.prediction_file:
    pfile = open(args.prediction_file, 'w')
    header = 'user_id,recommended_items' + '\n'
    pfile.write(header)

# One slot per cross-validation fold for every ranking metric.
roc_auc_, precision_, recall_, map_, mrr_, ndcg_ = (
    np.zeros(args.cv_folds) for _ in range(6))
at = args.rec_length
nfold = 0
for train_df, test_df in k_fold_cv(dataset,
                                   user_key=args.user_key,
                                   item_key=args.item_key,
                                   k=args.cv_folds,
                                   clean_test=True,
                                   seed=args.rnd_seed):
    logger.info(train_df.shape)
    logger.info(test_df.shape)
    logger.info('Fold {}'.format(nfold + 1))
    train = df_to_csr(train_df,
                      is_implicit=args.is_implicit,
                      nrows=nusers,
                      ncols=nitems)
    test = df_to_csr(test_df,
                     is_implicit=args.is_implicit,
                     nrows=nusers,
                     ncols=nitems)

    # train the recommender
    recommender = RecommenderClass(**init_args)
    logger.info('Recommender: {}'.format(recommender))
    tic = dt.now()
    logger.info('Training started')
    recommender.fit(train)
    logger.info('Training completed in {}'.format(dt.now() - tic))

    # evaluate the ranking quality
# ===== Example 3 =====
                             columns=args.columns,
                             user_key=args.user_key,
                             item_key=args.item_key,
                             rating_key=args.rating_key,
                             item_to_idx=item_to_idx,
                             user_to_idx=user_to_idx)

# Reverse maps: from contiguous indices back to the original ids.
idx_to_item = pd.Series(index=item_to_idx.data, data=item_to_idx.index)
idx_to_user = pd.Series(index=user_to_idx.data, data=user_to_idx.index)

# Matrix dimensions come from the largest indices seen in the train split.
nusers = train_df.user_idx.max() + 1
nitems = train_df.item_idx.max() + 1
train = df_to_csr(train_df,
                  user_key='user_idx',
                  item_key='item_idx',
                  rating_key=args.rating_key,
                  is_binary=args.is_binary,
                  nrows=nusers,
                  ncols=nitems)
test = df_to_csr(test_df,
                 user_key='user_idx',
                 item_key='item_idx',
                 rating_key=args.rating_key,
                 is_binary=args.is_binary,
                 nrows=nusers,
                 ncols=nitems)

# Instantiate the recommender and start timing the training phase.
recommender = RecommenderClass(**init_args)
logger.info('Recommender: {}'.format(recommender))
tic = dt.now()
# ===== Example 4 =====
            init_args[key] = eval(value)
        except:
            init_args[key] = value

# The columns argument arrives as a comma-separated string; make it a list.
if args.columns is not None:
    args.columns = args.columns.split(',')

# Read the pre-split train and test sets.
logger.info('Reading {}'.format(args.train))
train_df = read_dataset(args.train, sep=',', header=0)
logger.info('Reading {}'.format(args.test))
test_df = read_dataset(args.test, sep=',', header=0)

# Matrix dimensions come from the largest indices seen in the train split.
nusers = train_df.user_idx.max() + 1
nitems = train_df.item_idx.max() + 1
train = df_to_csr(train_df, nrows=nusers, ncols=nitems)
test = df_to_csr(test_df, nrows=nusers, ncols=nitems)

# Instantiate and fit the recommender, timing the training phase.
recommender = RecommenderClass(**init_args)
logger.info('Recommender: {}'.format(recommender))
tic = dt.now()
logger.info('Training started')
recommender.fit(train)
logger.info('Training completed in {}'.format(dt.now() - tic))

# open the prediction file
if args.prediction_file:
    pfile = open(args.prediction_file, 'w')
    n = args.rec_length if args.rec_length is not None else nitems
    header = 'user_id,'
# ===== Example 5 =====
def grid_search_cv(RecommenderClass,
                   dataset,
                   param_space,
                   metric=roc_auc,
                   at=None,
                   cv_folds=5,
                   is_binary=True,
                   user_key='user_id',
                   item_key='item_id',
                   rating_key='rating',
                   rnd_seed=1234):
    """
    Finds the best hyper-parameters of a recommender algorithm with Grid Search

    :param RecommenderClass: Class of the recommender to tune (must be subclass of Recommender)
    :param dataset: data to use for tuning
    :param param_space: space of the parameters to explore (dict: name -> list of values)
    :param metric: ranking metric to maximize
    :param at: optional length of the recommendation list used in recommendation
    :param cv_folds: number of cross-validation iters
    :param is_binary: True to discard ratings, False otherwise
    :param user_key: name of the column with user ids in dataset
    :param item_key: name of the column with item ids in dataset
    :param rating_key: name of the column with ratings in dataset
    :param rnd_seed: random seed used for cross-validation
    :return: a tuple with (best configuration, best metric value)
    """

    tried_conf = []
    # one result slot per point of the parameter grid
    results = np.zeros(np.prod([len(v) for v in param_space.values()]),
                       dtype=np.float32)
    space_size = len(results)
    logger.info('Size of the parameter space: {} ({} cv trials)'.format(
        space_size, space_size * cv_folds))
    param_grid = ParameterGrid(param_space)
    # compute the cv splits once up front and reuse them for every configuration
    nusers, nitems = dataset[user_key].max() + 1, dataset[item_key].max() + 1
    cv_split = []
    for train_df, test_df in k_fold_cv(dataset,
                                       user_key=user_key,
                                       item_key=item_key,
                                       k=cv_folds,
                                       clean_test=True,
                                       seed=rnd_seed):
        train = df_to_csr(train_df,
                          is_binary=is_binary,
                          nrows=nusers,
                          ncols=nitems,
                          user_key=user_key,
                          item_key=item_key,
                          rating_key=rating_key)
        test = df_to_csr(test_df,
                         is_binary=is_binary,
                         nrows=nusers,
                         ncols=nitems,
                         user_key=user_key,
                         item_key=item_key,
                         rating_key=rating_key)
        cv_split.append((train, test))

    for i, params in enumerate(param_grid):
        logger.info('Iteration {}/{}: {}'.format(i + 1, space_size, params))
        tried_conf.append(params)
        cv_result = 0.0
        for train, test in cv_split:
            # train the recommender on this fold
            recommender = RecommenderClass(**params)
            recommender.fit(train)
            # evaluate the ranking quality over users with test interactions
            n_eval = 0
            metric_ = 0.0
            for test_user in range(nusers):
                relevant_items = test[test_user].indices
                if len(relevant_items) > 0:
                    n_eval += 1
                    # this will rank **all** items
                    recommended_items = recommender.recommend(
                        user_id=test_user, exclude_seen=True)
                    # evaluate the recommendation list with ranking metrics ONLY
                    if metric == roc_auc:
                        metric_ += roc_auc(recommended_items, relevant_items)
                    elif metric == ndcg:
                        metric_ += ndcg(recommended_items,
                                        relevant_items,
                                        relevance=test[test_user].data,
                                        at=at)
                    else:
                        metric_ += metric(recommended_items,
                                          relevant_items,
                                          at=at)
            # BUG FIX: guard against folds with no evaluable users, which
            # previously raised ZeroDivisionError; such folds contribute 0.0
            if n_eval > 0:
                metric_ /= n_eval
            cv_result += metric_
        # average value of the metric in cross-validation
        results[i] = cv_result / cv_folds
        logger.info('Result: {:.4f}'.format(results[i]))
    # return the best configuration (argmax replaces argsort()[-1]: same
    # index, O(n) instead of O(n log n))
    best = int(results.argmax())
    return tried_conf[best], results[best]
# ===== Example 6 =====
# Read the dataset, remapping raw ids to contiguous indices.
logger.info('Reading {}'.format(args.dataset))
dataset, idx_to_user, idx_to_item = read_dataset(
    args.dataset,
    header=args.header,
    sep=args.sep,
    columns=args.columns,
    make_implicit=args.make_implicit,
    implicit_th=args.implicit_th,
    item_key=args.item_key,
    user_key=args.user_key,
    rating_key=args.rating_key,
    item_to_idx=item_idx,
    user_to_idx=user_idx)

nusers = len(idx_to_user)
nitems = len(idx_to_item)
logger.info('The dataset has {} users and {} items'.format(nusers, nitems))

# Build the training matrix in sparse CSR format.
train = df_to_csr(dataset,
                  is_implicit=args.is_implicit,
                  nrows=nusers,
                  ncols=nitems)
logger.info('The train set is a sparse matrix of shape: {}'.format(train.shape))

# Instantiate and fit the recommender, timing the training phase.
recommender = RecommenderClass(**init_args)
logger.info('Recommender: {}'.format(recommender))
logger.info('Parameters: {}'.format(init_args if args.params else 'default'))
tic = dt.now()
logger.info('Training started')
recommender.fit(train)
logger.info('Training completed in {}'.format(dt.now() - tic))
# ===== Example 7 =====
num_users, num_items = len(users), len(items)
print("There are %d users and %d items" % (num_users, num_items))

# indexing of users and items: raw ids -> contiguous 0-based indices
user_idx = pd.Series(index=users, data=np.arange(num_users))
item_idx = pd.Series(index=items, data=np.arange(num_items))

# building the final dataframe adding "user's index" and "item's index"
useful_interactions["user_idx"] = user_idx[
    useful_interactions["user_id"].values].values
useful_interactions["item_idx"] = item_idx[
    useful_interactions["item_id"].values].values
# NOTE: the matrix is built transposed (items on the rows: nrows=num_items,
# user_key/item_key swapped) so that data_csr[i] selects the row of item i
data_csr = df_to_csr(useful_interactions,
                     num_items,
                     num_users,
                     user_key='item_idx',
                     item_key='user_idx')

# keep only the items that have more than one interaction
# (comprehension replaces the manual append loop; range instead of np.arange
# for plain Python iteration)
useful_items = [i for i in range(num_items) if data_csr[i].nnz > 1]

# right-merge keeps exactly the interactions of the retained items
useful_items = pd.DataFrame(useful_items, columns=['item_idx'])
useful_interactions = useful_interactions.merge(useful_items,
                                                on='item_idx',
                                                how='right')
useful_interactions = useful_interactions.drop(['user_idx', 'item_idx'],
                                               axis=1)
# ===== Example 8 =====
    return roc_auc_, precision_, recall_, map_, mrr_, ndcg_


# Hold-out evaluation settings.
metric = roc_auc
cv_folds = 5
at = 10
is_binary = True

# Read the pre-computed binary hold-out split of MovieLens 100k.
train_df = read_dataset('../../data/ml100k/binary_holdout/train.csv',
                        sep=',',
                        header=0)
test_df = read_dataset('../../data/ml100k/binary_holdout/test.csv',
                       sep=',',
                       header=0)
# Matrix dimensions come from the largest indices seen in the train split.
nusers = train_df.user_idx.max() + 1
nitems = train_df.item_idx.max() + 1
train = df_to_csr(train_df, is_binary=is_binary, nrows=nusers, ncols=nitems)
test = df_to_csr(test_df, is_binary=is_binary, nrows=nusers, ncols=nitems)

#
# TopPop
#
# RecommenderClass = TopPop
# param_space = {}
# # Evaluate all the metrics over the hold out split
# recommender = RecommenderClass()
# metrics = holdout_eval(recommender, train, test, at=at)
# logger.info('Metrics: {}'.format(metrics))
#
#
# GlobalEffects
#
# ===== Example 9 =====
# Split each test user's ratings into an observed part and a hidden part.
test_observed_df, test_hidden_df = per_user_holdout(test_users_df,
                                                    user_key='user_idx',
                                                    item_key='item_idx',
                                                    n_observed=args.n_observed,
                                                    seed=args.rnd_seed)
tot_observed, tot_hidden = test_observed_df.shape[0], test_hidden_df.shape[0]
logger.info('Observed ratings: {}({:.2f}%)'.format(
    tot_observed, tot_observed / (tot_observed + tot_hidden) * 100))
# BUG FIX: this message reported the hidden counts under the label
# 'Observed ratings' (copy-paste error)
logger.info('Hidden ratings: {}({:.2f}%)'.format(
    tot_hidden, tot_hidden / (tot_observed + tot_hidden) * 100))

# build the sparse matrices
train = df_to_csr(train_users_df,
                  is_binary=args.is_binary,
                  nrows=nusers_train,
                  ncols=nitems,
                  item_key='item_idx',
                  user_key='user_idx',
                  rating_key=args.rating_key)
test_observed = df_to_csr(test_observed_df,
                          is_binary=args.is_binary,
                          nrows=nusers_test,
                          ncols=nitems,
                          item_key='item_idx',
                          user_key='user_idx',
                          rating_key=args.rating_key)

test_hidden = df_to_csr(test_hidden_df,
                        is_binary=args.is_binary,
                        nrows=nusers_test,
                        ncols=nitems,
# ===== Example 10 =====
profiles = pd.read_csv("data/user_profile.csv", sep='\t')

# now the dataset of target users
logger.info('Reading {}'.format(args.target_users))
targets = read_dataset(args.target_users, sep=',')

# print(set(targets_all['user_id'].values) <= set(idx_user.data))
#targets = targets_all.merge(profiles, how='inner', on='user_id')
targets['user_idx'] = user_idx[targets['user_id'].values].values

# finally interactions
logger.info('Reading {}'.format(args.interactions))
interactions, n1, n2 = read_interactions(args.interactions, sep=args.sep, user_to_idx=user_idx, item_to_idx=item_idx)
# keep only interactions whose item mapped to a known index
interactions = interactions[interactions['item_idx'] >= 0.0]

# BUG FIX: size the item dimension by the full item index (item_idx.shape[0]),
# not by the number of distinct items appearing in the interactions; the old
# len(interactions['item_idx'].unique()) under-sizes the URM whenever some
# indexed item has no interactions, so its shape no longer matches the loaded
# model weights (the parallel script builds the URM with item_idx.shape[0]).
urm = df_to_csr(interactions, user_idx.shape[0], item_idx.shape[0], is_implicit=True)

# load the pre-computed user-based CBF model and predict
recommender = CBFUsersRecommender()
recommender.load_user_weights('output/models/sparse_cbf.npz')
recs = recommender.make_prediction(targets['user_idx'].values, urm)
print(recs.shape)

# open the prediction file and write the header
if args.prediction_file:
    pfile = open(args.prediction_file, 'w')
    header = 'user_id,recommended_items' + '\n'
    pfile.write(header)

new_user_idx = targets['user_idx'].values
for target in range(recs.shape[0]):
    user_id = idx_user[new_user_idx[target]]