Example #1
def surprise_algo(algo, train_path="datas/train.csv", test_path="datas/test.csv", verbose=True):
    # reader with rating scale
    reader = Reader(line_format='user item rating', sep=',', rating_scale=(1, 5))
    
    # Specify the training and test dataset
    folds_files = [(train_path, test_path)]

    data = Dataset.load_from_folds(folds_files, reader=reader)
    pkf = PredefinedKFold()
    
    print("Start prediction...")
    for trainset, testset in pkf.split(data):
        # train and predict algorithm.
        model = algo.fit(trainset)
        predictions = algo.test(testset)
    
    pred = pd.read_csv(test_path, names = ["User", "Movie", "Rating"])
    
    print("Postprocessing predictions...")
    for index, row in pred.iterrows():
        rating = round(predictions[index].est)
        if rating > 5:
            rating = 5
        elif rating < 1:
            rating = 1
        row.Rating = rating
    
    return pred
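
A minimal driver for the helper above, as a sketch: the imports, the choice of SVD, and the datas/ file layout are assumptions, not part of the original snippet.

import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import PredefinedKFold

# train on datas/train.csv, predict datas/test.csv, save the clipped ratings
predictions_df = surprise_algo(SVD(n_epochs=30), verbose=True)
predictions_df.to_csv("datas/predictions.csv", index=False)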
Example #2
def test_randomizedsearchcv_best_estimator():
    """Ensure that the best estimator is the one that gives the best score (by
    re-running it)"""
    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'))

    param_distributions = {
        'n_epochs': [5],
        'lr_all': uniform(0.002, 0.003),
        'reg_all': uniform(0.04, 0.02),
        'n_factors': [1],
        'init_std_dev': [0]
    }
    rs = RandomizedSearchCV(SVD,
                            param_distributions,
                            measures=['mae'],
                            cv=PredefinedKFold(),
                            joblib_verbose=100)
    rs.fit(data)
    best_estimator = rs.best_estimator['mae']

    # recompute MAE of best_estimator
    mae = cross_validate(best_estimator,
                         data,
                         measures=['MAE'],
                         cv=PredefinedKFold())['test_mae']

    assert mae == rs.best_score['mae']
Example #3
def test_randomizedsearchcv_best_estimator(u1_ml100k):
    """Ensure that the best estimator is the one that gives the best score (by
    re-running it)"""

    param_distributions = {
        'n_epochs': [5],
        'lr_all': uniform(0.002, 0.003),
        'reg_all': uniform(0.04, 0.02),
        'n_factors': [1],
        'init_std_dev': [0]
    }
    rs = RandomizedSearchCV(SVD,
                            param_distributions,
                            measures=['mae'],
                            cv=PredefinedKFold(),
                            joblib_verbose=100)
    rs.fit(u1_ml100k)
    best_estimator = rs.best_estimator['mae']

    # recompute MAE of best_estimator
    mae = cross_validate(best_estimator,
                         u1_ml100k,
                         measures=['MAE'],
                         cv=PredefinedKFold())['test_mae']

    assert mae == rs.best_score['mae']
Example #4
def test_gridsearchcv_best_estimator(u1_ml100k):
    """Ensure that the best estimator is the one giving the best score (by
    re-running it)"""

    param_grid = {
        'n_epochs': [5],
        'lr_all': [0.002, 0.005],
        'reg_all': [0.4, 0.6],
        'n_factors': [1],
        'init_std_dev': [0]
    }
    gs = GridSearchCV(SVD,
                      param_grid,
                      measures=['mae'],
                      cv=PredefinedKFold(),
                      joblib_verbose=100)
    gs.fit(u1_ml100k)
    best_estimator = gs.best_estimator['mae']

    # recompute MAE of best_estimator
    mae = cross_validate(best_estimator,
                         u1_ml100k,
                         measures=['MAE'],
                         cv=PredefinedKFold())['test_mae']

    assert mae == gs.best_score['mae']
Example #5
def surprise_slopeOne(train_file, test_file):
    """
    SlopeOne with Surprise library.
    Compute the predictions on a test_set after training on a train_set using the method SlopeOne from Surprise.
    Args:
        train_file (string): path to created test file
        test_file (string): path to created train file
    Hyperparameters:
        -
    Returns:
        numpy array: predictions
    """
    print("slopeone")
    algo = SlopeOne()
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)

        # Predict
        predictions = algo.test(testset)
    # collect the estimated ratings into a numpy array
    pred = np.array([p.est for p in predictions])
    return pred
Example #6
def surprise_SVD(train_file, test_file):
    """
    Svd with Surprise library.
    Compute the predictions on a test_set after training on a train_set using the method Svd  from Surprise.
    Args:
        train_file (string): path to created test file
        test_file (string): path to created train file
    Hyperparameters:
        n_factors : The number of factors.
        n_epochs : The number of iteration of the SGD procedure
        lr_all: The learning rate for all
        reg_all : The regularization term for all


    Returns:
        numpy array: predictions
    """
    print("SVD")
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    # Algorithm
    algo = SVD(n_epochs=30, lr_all=0.01, reg_all=0.1)
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)

        # Predict
        predictions = algo.test(testset)
    # collect the estimated ratings into a numpy array
    pred = np.array([p.est for p in predictions])
    return pred
Example #7
def test_dump():
    """Train an algorithm, compute its predictions then dump them.
    Ensure that the predictions that are loaded back are the correct ones, and
    that the predictions of the dumped algorithm are also equal to the other
    ones."""

    random.seed(0)

    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'))
    pkf = PredefinedKFold()

    trainset, testset = next(pkf.split(data))

    algo = BaselineOnly()
    algo.fit(trainset)
    predictions = algo.test(testset)

    with tempfile.NamedTemporaryFile() as tmp_file:
        dump.dump(tmp_file.name, predictions, algo)
        predictions_dumped, algo_dumped = dump.load(tmp_file.name)

        predictions_algo_dumped = algo_dumped.test(testset)
        assert predictions == predictions_dumped
        assert predictions == predictions_algo_dumped
Example #8
def surprise_knn_ub(train_file, test_file):
    """
    Knn userbased with Surprise library.
    Compute the predictions on a test_set after training on a train_set using the method KNNBaseLineOnly from Surprise.
    Args:
        train_file (string): path to created test file
        test_file (string): path to created train file
    Hyperparameters :
        k : The (max) number of neighbors to take into account for aggregation
        sim_options (dict) – A dictionary of options for the similarity measure.

    Returns:
        numpy array: predictions
    """
    print("knnUB")
    algo = KNNBaseline(k=300,
                       sim_options={
                           'name': 'pearson_baseline',
                           'user_based': True
                       })
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)

        # Predict
        predictions = algo.test(testset)
    # collect the estimated ratings into a numpy array
    pred = np.array([p.est for p in predictions])
    return pred
Example #9
def test_best_estimator():
    """Ensure that the best estimator is the one giving the best score (by
    re-running it)"""

    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'))

    param_grid = {
        'n_epochs': [5],
        'lr_all': [0.002, 0.005],
        'reg_all': [0.4, 0.6],
        'n_factors': [1],
        'init_std_dev': [0]
    }
    gs = GridSearchCV(SVD,
                      param_grid,
                      measures=['mae'],
                      cv=PredefinedKFold(),
                      joblib_verbose=100)
    gs.fit(data)
    best_estimator = gs.best_estimator['mae']

    # recompute MAE of best_estimator
    mae = cross_validate(best_estimator,
                         data,
                         measures=['MAE'],
                         cv=PredefinedKFold())['test_mae']

    assert mae == gs.best_score['mae']
Example #10
    def prepare_data(self):
        super(RecommenderOnSurprice, self).prepare_data()

        reader = Reader(line_format='user item rating', sep='\t', rating_scale=(1, 5))
        data = Dataset.load_from_folds([(self.train_path, self.test_path)], reader=reader)
        trainset, testset = None, None
        pkf = PredefinedKFold()
        for trainset_, testset_ in pkf.split(data):
            trainset, testset = trainset_, testset_
        self.trainset, self.testset = trainset, testset
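
Since a single (train, test) pair was registered, the loop above runs exactly once; the same fold can be taken directly, as Examples #7 and #24 do. A sketch:

# equivalent: grab the single predefined fold directly
trainset, testset = next(PredefinedKFold().split(data))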
Example #11
    def load_test_files(self):
        reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
        train_file = 'test-data-train.csv'
        test_file = 'test-data-test.csv'
        folds_files = [(train_file, test_file)]

        data = Dataset.load_from_folds(folds_files, reader=reader)
        pkf = PredefinedKFold()

        trainset, testset = next(pkf.split(data))

        return trainset, testset
Example #12
def precompute_data(experiment_dir):
    rating_train_path = os.path.join(experiment_dir, 'ratings_train.txt')
    rating_test_path = os.path.join(experiment_dir, 'ratings_test.txt')
    ratings_reader = Reader(line_format="user item rating", sep=' ')
    dataset = Dataset.load_from_folds([(rating_train_path, rating_test_path)],
                                      ratings_reader)
    pkf = PredefinedKFold()
    trainset, testset = list(pkf.split(dataset))[0]

    n_x, yr = trainset.n_users, trainset.ir
    min_support = 1
    args = [n_x, yr, min_support]
    sims = pearson(*args).astype(np.float32)
    return trainset, testset, sims
Example #13
def load_data_hyper(train_file, submission_file):
    train_file, submission_file, df, df_toBeSubmitted = modify_data(
        train_file, submission_file)
    reader = Reader(line_format='user item rating', sep=',')

    fold = [(train_file, submission_file)]
    trainAndTest = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()

    # Go through 1 fold
    for trainset, testset in pkf.split(trainAndTest):
        data = trainset
        test = testset

    return data, test, df, df_toBeSubmitted
Example #14
def test_PredifinedKFold():

    reader = Reader(line_format='user item rating',
                    sep=' ',
                    skip_lines=3,
                    rating_scale=(1, 5))

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    # Make sure rating files are read correctly
    pkf = PredefinedKFold()
    trainset, testset = next(pkf.split(data))
    assert trainset.n_ratings == 6
    assert len(testset) == 3
Example #15
def test_PredifinedKFold(toy_data_reader):

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader, rating_scale=(1, 5))

    # Make sure rating files are read correctly
    pkf = PredefinedKFold()
    trainset, testset = next(pkf.split(data))
    assert trainset.n_ratings == 6
    assert len(testset) == 3

    # Make sure pkf returns the same folds as the deprecated data.folds()
    with pytest.warns(UserWarning):
        trainset_, testset_ = next(data.folds())
    assert testset_ == testset
Example #16
def test_PredifinedKFold(toy_data_reader):

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader,
                                   rating_scale=(1, 5))

    # Make sure rating files are read correctly
    pkf = PredefinedKFold()
    trainset, testset = next(pkf.split(data))
    assert trainset.n_ratings == 6
    assert len(testset) == 3

    # Make sure pkf returns the same folds as the deprecated data.folds()
    with pytest.warns(UserWarning):
        trainset_, testset_ = next(data.folds())
    assert testset_ == testset
Example #17
    def get_top_n_recommendations(self, test_set, top_n):
        self.test_set = test_set

        # Surprise requires a slightly different input data format, so we use two different CSVs
        test_path_tmp = "resources//test_file.csv"
        train_path_tmp = "resources//train_file.csv"

        self.train_set.to_csv(train_path_tmp, index=False, header=False)
        self.test_set.to_csv(test_path_tmp, index=False, header=False)

        fold_files = [(train_path_tmp, test_path_tmp)]
        reader = Reader(rating_scale=(1, 10),
                        line_format='user item rating',
                        sep=',')
        data = Dataset.load_from_folds(fold_files, reader=reader)

        for trainset, testset in PredefinedKFold().split(data):
            self.method.fit(trainset)

        already_ranked_items_by_users = self.train_set.groupby(
            'userID')['itemID'].apply(list)

        recommendations = {}
        pbar = tqdm(total=len(self.test_set.userID.unique()))
        for userID in self.test_set.userID.unique():
            pbar.update(1)

            if userID not in self.train_set.userID.unique():
                recommendations[str(userID)] = []
                continue

            items_expected_ranking = {}
            for itemID in self.train_set.itemID.unique():
                if itemID in already_ranked_items_by_users[userID]:
                    continue
                # We call here the specific Surprise method that we use for this model
                # The method predicts a score for a given item
                predicted = self.method.predict(str(userID),
                                                str(itemID),
                                                clip=False)
                items_expected_ranking[itemID] = predicted[3]

            # Now we just sort by decreasing scores and take the top N
            sorted_predictions = sorted(items_expected_ranking.items(),
                                        key=operator.itemgetter(1))
            sorted_predictions.reverse()
            sorted_predictions = [x[0] for x in sorted_predictions]
            user_recommendations = sorted_predictions[:top_n]
            recommendations[str(userID)] = user_recommendations
        pbar.close()
        return recommendations
Example #18
def func7():
    import os
    from surprise import SVD
    from surprise import Dataset
    from surprise import Reader
    from surprise import accuracy
    from surprise.model_selection import PredefinedKFold

    files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')
    reader = Reader('ml-100k')

    train_file = files_dir + 'u%d.base'
    test_file = files_dir + 'u%d.test'
    folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

    data = Dataset.load_from_folds(folds_files, reader=reader)
    pkf = PredefinedKFold()

    algo = SVD()
    for trainset, testset in pkf.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        accuracy.rmse(predictions, verbose=True)
Example #19
def basic_rec(model_name, train_path, test_path, target_id):
    # build data
    # TODO check float and min_r
    reader = Reader(line_format='user item rating',
                    sep='\t',
                    rating_scale=(1, 5))
    data = Dataset.load_from_folds([(train_path, test_path)], reader=reader)
    trainset, testset = None, None
    pkf = PredefinedKFold()
    for trainset_, testset_ in pkf.split(data):
        trainset, testset = trainset_, testset_

    # train model
    rec_algo = get_model(model_name)
    rec_algo.fit(trainset)
    # eval
    preds = rec_algo.test(testset)
    rmse = accuracy.rmse(preds, verbose=True)

    # predict the target item's rating for every user in the trainset
    fn_pred = lambda uid: rec_algo.predict(str(uid), str(target_id), r_ui=0).est
    target_predictions = list(map(fn_pred, range(trainset.n_users)))

    # topn
    testset = trainset.build_anti_testset()
    predictions = rec_algo.test(testset)
    top_n = get_top_n(predictions, n=50)

    hit_ratios = {}
    for uid, user_ratings in top_n.items():
        topN = [int(iid) for (iid, _) in user_ratings]
        hits = [
            1 if target_id in topN[:i] else 0 for i in [1, 3, 5, 10, 20, 50]
        ]
        hit_ratios[int(uid)] = hits
    return target_predictions, hit_ratios
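
Example #19 depends on a get_top_n helper that is not shown. A minimal sketch in the spirit of the Surprise FAQ's top-N recipe (the original helper may differ):

from collections import defaultdict

def get_top_n(predictions, n=10):
    # map each user id to its n highest-estimated (item id, rating) pairs
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n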
Example #20
def test_knns():
    """Ensure the k and min_k parameters are effective for knn algorithms."""

    # the test and train files are from the ml-100k dataset (10% of u1.base and
    # 10 % of u1.test)
    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'))
    pkf = PredefinedKFold()

    # Actually, as KNNWithMeans and KNNBaseline have back up solutions for when
    # there are not enough neighbors, we can't really test them...
    klasses = (KNNBasic, )  # KNNWithMeans, KNNBaseline)

    k, min_k = 20, 5
    for klass in klasses:
        algo = klass(k=k, min_k=min_k)
        for trainset, testset in pkf.split(data):
            algo.fit(trainset)
            predictions = algo.test(testset)
            for pred in predictions:
                if not pred.details['was_impossible']:
                    assert min_k <= pred.details['actual_k'] <= k
Example #21
    def get_rating_predictions(self, test_set, cluster_user_mapping=None):
        self.test_set = test_set
        test_path_tmp = "..\\resources\\tmp\\test_file.csv"
        train_path_tmp = "..\\resources\\tmp\\train_file.csv"

        self.train_set.to_csv(train_path_tmp, index=False, header=False)
        self.test_set.to_csv(test_path_tmp, index=False, header=False)

        fold_files = [(train_path_tmp, test_path_tmp)]
        reader = Reader(rating_scale=(1, 10),
                        line_format='user item rating',
                        sep=',')
        data = Dataset.load_from_folds(fold_files, reader=reader)

        for trainset, testset in PredefinedKFold().split(data):

            if cluster_user_mapping is None:
                self.method.fit(trainset)
            else:
                df_users_in_clusters = pd.DataFrame.from_dict(
                    cluster_user_mapping)
                df_cluser_users = df_users_in_clusters.groupby('')
                #Distinct clusters:
                clusters = list(set(cluster_user_mapping.values()))

                #for cluster in clusters:
                #cluster_train_data = trainset[trainset.userID.isin() userID]
                pass

        rows = []

        pbar = tqdm(total=len(self.test_set.index))

        for key, val in self.test_set.iterrows():
            prediction = self.method.predict(str(val.userID),
                                             str(val.itemID),
                                             clip=False)
            rows.append({
                "userID": int(val.userID),
                "itemID": int(val.itemID),
                "real": int(val.rating),
                "est": int(prediction.est)
            })
            pbar.update(1)
        pbar.close()
        # DataFrame.append was removed in pandas 2.0; build the frame in one go
        results = pd.DataFrame(rows, columns=['userID', 'itemID', 'real', 'est'])
        return results
Example #22
    def get_top_n_recommendations(self, test_set, top_n):
        self.test_set = test_set

        test_path_tmp = "..\\resources\\tmp\\test_file.csv"
        train_path_tmp = "..\\resources\\tmp\\train_file.csv"

        self.train_set.to_csv(train_path_tmp, index=False, header=False)
        self.test_set.to_csv(test_path_tmp, index=False, header=False)

        fold_files = [(train_path_tmp, test_path_tmp)]
        reader = Reader(rating_scale=(1, 10),
                        line_format='user item rating',
                        sep=',')
        data = Dataset.load_from_folds(fold_files, reader=reader)

        for trainset, testset in PredefinedKFold().split(data):
            self.method.fit(trainset)

        already_ranked_items_by_users = self.train_set.groupby(
            'userID')['itemID'].apply(list)

        recommendations = {}
        pbar = tqdm(total=len(self.test_set.userID.unique()))
        for userID in self.test_set.userID.unique():
            pbar.update(1)

            if userID not in self.train_set.userID.unique():
                recommendations[str(userID)] = []
                continue

            items_expected_ranking = {}
            for itemID in self.train_set.itemID.unique():
                if itemID in already_ranked_items_by_users[userID]:
                    continue
                # Calc prediction for item for user
                predicted = self.method.predict(str(userID),
                                                str(itemID),
                                                clip=False)
                items_expected_ranking[itemID] = predicted[3]
            sorted_predictions = sorted(items_expected_ranking.items(),
                                        key=operator.itemgetter(1))
            sorted_predictions.reverse()
            sorted_predictions = [str(x[0]) for x in sorted_predictions]
            user_recommendations = sorted_predictions[:top_n]
            recommendations[str(userID)] = user_recommendations
        pbar.close()
        return recommendations
Example #23
    def predict_rating_split_by_time(self, files_pair, algo_test):

        algo = algo_test[0]

        use_auto_parse = algo_test[1]
        if use_auto_parse:
            fold_files = [files_pair]  # files_pair is already a (train, test) tuple
            reader = Reader(rating_scale=(1, 10),
                            line_format='user item rating',
                            sep=',')
            data = Dataset.load_from_folds(fold_files, reader=reader)

            for trainset, testset in PredefinedKFold().split(data):
                algo.fit(trainset)
                predictions = algo.test(testset)
                rmse = accuracy.rmse(predictions, verbose=False)
                return rmse
        else:

            # Prepare dataset

            train_set = pd.read_csv(files_pair[0], parse_dates=[3])
            test_set = pd.read_csv(files_pair[1], parse_dates=[3])

            item_to_id_mapping = {}
            user_to_id_mapping = {}

            item_index = 0
            user_index = 0
            all_sets = pd.concat([train_set, test_set])
            for item in all_sets['itemID']:
                if item not in item_to_id_mapping.keys():
                    item_to_id_mapping[item] = item_index
                    item_index += 1
            for user in all_sets['userID']:
                if user not in user_to_id_mapping.keys():
                    user_to_id_mapping[user] = user_index
                    user_index += 1

            train_set['itemID'] = train_set['itemID'].map(item_to_id_mapping)
            test_set['itemID'] = test_set['itemID'].map(item_to_id_mapping)
            train_set['userID'] = train_set['userID'].map(user_to_id_mapping)
            test_set['userID'] = test_set['userID'].map(user_to_id_mapping)

            algo.fit(train_set)
            rec_list = algo.get_top_n_recommendations(test_set)
            pass
Example #24
def test_dump(u1_ml100k):
    """Train an algorithm, compute its predictions then dump them.
    Ensure that the predictions that are loaded back are the correct ones, and
    that the predictions of the dumped algorithm are also equal to the other
    ones."""

    random.seed(0)

    trainset, testset = next(PredefinedKFold().split(u1_ml100k))

    algo = BaselineOnly()
    algo.fit(trainset)
    predictions = algo.test(testset)

    with tempfile.NamedTemporaryFile() as tmp_file:
        dump.dump(tmp_file.name, predictions, algo)
        predictions_dumped, algo_dumped = dump.load(tmp_file.name)

        predictions_algo_dumped = algo_dumped.test(testset)
        assert predictions == predictions_dumped
        assert predictions == predictions_algo_dumped
Example #25
def pkf():
    return PredefinedKFold()
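
This fixture hands each test a fresh PredefinedKFold instance; pytest injects it by parameter name. A minimal sketch of a consuming test, assuming a u1_ml100k dataset fixture like the one used in Examples #3 and #24:

def test_predefined_fold_count(pkf, u1_ml100k):
    # a single (train_file, test_file) pair was registered, so exactly one fold
    folds = list(pkf.split(u1_ml100k))
    assert len(folds) == 1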
Example #26
def main():
    class MyParser(argparse.ArgumentParser):
        '''A parser which prints the help message when an error occurs. Taken from
        http://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu.'''  # noqa

        def error(self, message):
            sys.stderr.write('error: %s\n' % message)
            self.print_help()
            sys.exit(2)

    parser = MyParser(
        description='Evaluate the performance of a rating prediction ' +
        'algorithm ' +
        'on a given dataset using cross validation. You can use a built-in ' +
        'or a custom dataset, and you can choose to automatically split the ' +
        'dataset into folds, or manually specify train and test files. ' +
        'Please refer to the documentation page ' +
        '(http://surprise.readthedocs.io/) for more details.',
        epilog="""Example:\n
        surprise -algo SVD -params "{'n_epochs': 5, 'verbose': True}"
        -load-builtin ml-100k -n-folds 3""")

    algo_choices = {
        'NormalPredictor': NormalPredictor,
        'BaselineOnly': BaselineOnly,
        'KNNBasic': KNNBasic,
        'KNNBaseline': KNNBaseline,
        'KNNWithMeans': KNNWithMeans,
        'SVD': SVD,
        'SVDpp': SVDpp,
        'NMF': NMF,
        'SlopeOne': SlopeOne,
        'CoClustering': CoClustering,
    }

    parser.add_argument('-algo',
                        type=str,
                        choices=algo_choices,
                        help='The prediction algorithm to use. ' +
                        'Allowed values are ' +
                        ', '.join(algo_choices.keys()) + '.',
                        metavar='<prediction algorithm>')

    parser.add_argument('-params',
                        type=str,
                        metavar='<algorithm parameters>',
                        default='{}',
                        help='A kwargs dictionary that contains all the ' +
                        'algorithm parameters. ' +
                        'Example: "{\'n_epochs\': 10}".')

    parser.add_argument('-load-builtin',
                        type=str,
                        dest='load_builtin',
                        metavar='<dataset name>',
                        default='ml-100k',
                        help='The name of the built-in dataset to use. ' +
                        'Allowed values are ' +
                        ', '.join(dataset.BUILTIN_DATASETS.keys()) +
                        '. Default is ml-100k.')

    parser.add_argument(
        '-load-custom',
        type=str,
        dest='load_custom',
        metavar='<file path>',
        default=None,
        help='A file path to a custom dataset to use. ' + 'Ignored if ' +
        '-load-builtin is set. The -reader parameter needs ' + 'to be set.')

    parser.add_argument('-folds-files',
                        type=str,
                        dest='folds_files',
                        metavar='<train1 test1 train2 test2... >',
                        default=None,
                        help='A list of custom train and test files. ' +
                        'Ignored if -load-builtin or -load-custom is set. '
                        'The -reader parameter needs to be set.')

    parser.add_argument('-reader',
                        type=str,
                        metavar='<reader>',
                        default=None,
                        help='A Reader to read the custom dataset. Example: ' +
                        '"Reader(line_format=\'user item rating timestamp\',' +
                        ' sep=\'\\t\')"')

    parser.add_argument('-n-folds',
                        type=int,
                        dest='n_folds',
                        metavar="<number of folds>",
                        default=5,
                        help='The number of folds for cross-validation. ' +
                        'Default is 5.')

    parser.add_argument('-seed',
                        type=int,
                        metavar='<random seed>',
                        default=None,
                        help='The seed to use for RNG. ' +
                        'Default is the current system time.')

    parser.add_argument('--with-dump',
                        dest='with_dump',
                        action='store_true',
                        help='Dump the algorithm ' +
                        'results in a file (one file per fold). ' +
                        'Default is False.')

    parser.add_argument('-dump-dir',
                        dest='dump_dir',
                        type=str,
                        metavar='<dir>',
                        default=None,
                        help='Where to dump the files. Ignored if ' +
                        'with-dump is not set. Default is ' +
                        os.path.join(get_dataset_dir(), 'dumps/'))

    parser.add_argument('--clean',
                        dest='clean',
                        action='store_true',
                        help='Remove the ' + get_dataset_dir() +
                        ' directory and exit.')

    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version=__version__)

    args = parser.parse_args()

    if args.clean:
        folder = get_dataset_dir()
        shutil.rmtree(folder)
        print('Removed', folder)
        exit()

    # setup RNG
    rd.seed(args.seed)
    np.random.seed(args.seed)

    # setup algorithm
    params = eval(args.params)
    if args.algo is None:
        parser.error('No algorithm was specified.')
    algo = algo_choices[args.algo](**params)

    # setup dataset
    if args.load_custom is not None:  # load custom and split
        if args.reader is None:
            parser.error('-reader parameter is needed.')
        reader = eval(args.reader)
        data = Dataset.load_from_file(args.load_custom, reader=reader)
        cv = KFold(n_splits=args.n_folds, random_state=args.seed)

    elif args.folds_files is not None:  # load from files
        if args.reader is None:
            parser.error('-reader parameter is needed.')
        reader = eval(args.reader)
        folds_files = args.folds_files.split()
        folds_files = [(folds_files[i], folds_files[i + 1])
                       for i in range(0,
                                      len(folds_files) - 1, 2)]
        data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)
        cv = PredefinedKFold()

    else:  # load builtin dataset and split
        data = Dataset.load_builtin(args.load_builtin)
        cv = KFold(n_splits=args.n_folds, random_state=args.seed)

    cross_validate(algo, data, cv=cv, verbose=True)
Example #27
import os

from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

# path to dataset folder
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

data = Dataset.load_from_folds(folds_files, reader=reader, rating_scale=(1, 5))
pkf = PredefinedKFold()

algo = SVD()

for trainset, testset in pkf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
Example #28
def run_knn_baseline(sparse_data):
    #filename = "test.json"
    prefix = "knn_baseline_"
    trainFile = prefix + "train.txt"
    testFile = prefix + "test.txt"

    raw_data, userPurchasedSet, userTrueTestSet = preprocess(
        sparse_data, trainFile, testFile)
    folds_files = [(trainFile, testFile)]
    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_folds(folds_files, reader=reader)
    pkf = PredefinedKFold()
    bsl_options = {
        'method': 'sgd',
        'n_epochs': 20,
        'learning_rate': 0.005,
    }
    ### sim name: cosine    msd       pearson     pearson_baseline
    ### user_based : True ---- similarity will be computed based on users
    ###            : False ---- similarity will be computed based on items.
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    predictions = {}
    top_n = {}
    testsSet = None
    total_precisions = 0.0
    total_recalls = 0.0
    total_hit = 0.0
    total_nDCG = 0.0
    total_ffeature = 0.0
    result_file = prefix + "result.txt"
    result_f = open(result_file, "w")
    for trainset, testset in pkf.split(data):
        testsSet = testset

        #algo = SVD(n_factors = 5)
        algo = KNNBaseline(bsl_options=bsl_options, sim_options=sim_options)
        algo.fit(trainset)
        pre = algo.test(testset)
        accuracy.rmse(pre)
        accuracy.mae(pre)
        #calculate_rmse(predictions)

        ### test
        rowNum = raw_data.get_row_size()
        colNum = raw_data.get_col_size()
        cur_time = time.time()
        time_cost = 0

        for i in range(rowNum):
            user = raw_data.get_userID(i)
            predictions[user] = set()
            pq = []
            heapq.heapify(pq)
            for j in range(colNum):
                item = raw_data.get_itemID(j)
                if user not in userPurchasedSet or item in userPurchasedSet[
                        user]:
                    continue
                value = raw_data.get_val(user, item, 'rating')
                predict = algo.predict(user, item, r_ui=0, verbose=False)[3]
                if len(pq) >= 10:
                    # push the new item and evict the current minimum,
                    # keeping the 10 highest-scoring items
                    heapq.heappushpop(pq, (predict, item))
                else:
                    heapq.heappush(pq, (predict, item))
            top_n[user] = set()
            for items in pq:
                top_n[user].add(items[1])
            if user in userTrueTestSet:
                curPrecisions = calculate_precision(top_n[user],
                                                    userTrueTestSet[user])
                curRecalls = calculate_recall(top_n[user],
                                              userTrueTestSet[user])
                ffeature = calculate_f_feature(curPrecisions, curRecalls)
                curHit = isHit(top_n[user], userTrueTestSet[user])
                cur_nDCG = calculate_NDCG(top_n[user], userTrueTestSet[user])
                total_precisions += curPrecisions
                total_recalls += curRecalls
                total_hit += curHit
                total_nDCG += cur_nDCG
                total_ffeature += ffeature
                result_f.write(user + "\t" + str(curPrecisions) + "\t" +
                               str(curRecalls) + "\t" + str(ffeature) + "\t" +
                               str(curHit) + '\t' + str(cur_nDCG) + "\n")
            if i != 0 and i % 1000 == 0:
                duration = (time.time() - cur_time) / 60
                time_cost += duration
                remaining_time = ((rowNum - i) / 1000) * duration
                cur_time = time.time()
                #print 'precisions', total_precisions, ' recalls', total_recalls, ' nDCG', total_nDCG
                print('i:', i, "/", rowNum, 'remaining time:', remaining_time, 'min')
    print('precisions', total_precisions, ' recalls', total_recalls, ' hit', total_hit, 'nDCG:', total_nDCG)
    rowNum = raw_data.get_row_size()
    print('avg_precisions:', total_precisions / rowNum,
          'avg_recalls:', total_recalls / rowNum,
          'avg_ffeature:', total_ffeature / rowNum,
          'avg_hit:', total_hit / rowNum,
          'avg_nDCG:', total_nDCG / rowNum)
    result_f.write("avg:\t" + str(total_precisions / rowNum) + "\t" +
                   str(total_recalls / rowNum) + "\t" +
                   str(total_ffeature / rowNum) + "\t" +
                   str(total_hit / rowNum) + '\t' + str(total_nDCG / rowNum) +
                   "\n")
    result_f.close()
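
Example #28 calls evaluation helpers (calculate_precision, calculate_recall, calculate_f_feature, isHit, calculate_NDCG) whose definitions are not shown. A plausible set-overlap sketch of the first three, assuming both arguments are sets of item ids; the originals may differ:

def calculate_precision(top_items, true_items):
    # fraction of recommended items that are relevant
    return len(top_items & true_items) / len(top_items) if top_items else 0.0

def calculate_recall(top_items, true_items):
    # fraction of relevant items that were recommended
    return len(top_items & true_items) / len(true_items) if true_items else 0.0

def calculate_f_feature(precision, recall):
    # F1: harmonic mean of precision and recall
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)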
Example #29
def test_trainset_testset(toy_data_reader):
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader)

    pkf = PredefinedKFold()
    trainset, testset = next(pkf.split(data))

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4)]
    assert ur[1] == [(0, 4), (1, 2)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', trainset.global_mean) not in testset
    assert ('user3', 'item1', trainset.global_mean) not in testset
    assert ('user0', 'item1', trainset.global_mean) in testset
    assert ('user3', 'item0', trainset.global_mean) in testset
Example #30
    ratingMap[(x[0], x[1])] = x[2]
# print("total data: " + str(len(ratingMap)))

# add avg business stars
# avgBusList = list()
# userRatingRDD = userRatingRDD.filter(lambda t: int(preUserMap[t[0]]) % 30 == 0)
# busRatingRDD = busRatingRDD.filter(lambda t: int(preBusinessMap[t[0]]) % 40 == 0)
# print("add user avg count: " + str(userRatingRDD.count()))
# print("add bus avg count: " + str(busRatingRDD.count()))

reader = Reader(line_format='user item rating', sep=",", skip_lines=1)

folds_files = [(trainingFilePath, validationFilePath)]

data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

algo = SVD()

predictionList = list()
for trainset, testset in pkf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)
    for uid, iid, true_r, est, _ in predictions:
        predictionList.append((uid, iid, est))

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
Example #31
import os

from surprise import SVD
from surprise import SVDpp
from surprise import Dataset
from surprise import Reader
from surprise.accuracy import neg_rmse
from surprise.model_selection import cross_validate
from surprise.model_selection import PredefinedKFold

# the test and train files are from the ml-100k dataset (10% of u1.base and
# 10 % of u1.test)
train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))
pkf = PredefinedKFold()


def test_SVD_parameters():
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = SVD(n_factors=1, n_epochs=1, random_state=1)
    rmse_default = cross_validate(algo, data, [['neg_rmse', neg_rmse]], pkf)[
        'test_neg_rmse']

    # n_factors
    algo = SVD(n_factors=2, n_epochs=1, random_state=1)
    rmse_factors = cross_validate(algo, data, [['neg_rmse', neg_rmse]], pkf)[
        'test_neg_rmse']
    assert rmse_default != rmse_factors