    def evaluate_model(self, data, algo):

        raw_ratings = data.raw_ratings

        # A = 90% of the data, B = 10% of the data
        threshold = int(.9 * len(raw_ratings))
        A_raw_ratings = raw_ratings[:threshold]
        B_raw_ratings = raw_ratings[threshold:]

        data.raw_ratings = A_raw_ratings  # set A becomes the training data
        # train on the whole of set A
        trainset = data.build_full_trainset()
        algo.fit(trainset)

        # Compute biased accuracy on A
        testset = trainset.build_testset()
        predictions = algo.test(testset)
        print('Biased accuracy on A,', end='   ')
        accuracy.rmse(predictions, verbose=True)
        accuracy.mae(predictions, verbose=True)
        print('len(predictions) =', len(predictions))

        # Compute unbiased accuracy on B
        testset = data.construct_testset(
            B_raw_ratings)  # testset is now the set B
        predictions = algo.test(testset)
        print('Unbiased accuracy on B,', end=' ')
        accuracy.rmse(predictions, verbose=True)
        accuracy.mae(predictions, verbose=True)
        print('len(predictions) =', len(predictions))
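A minimal usage sketch (not part of the original snippet) of driving a method like this, assuming it lives on a hypothetical wrapper class and using the built-in ml-100k dataset:

from surprise import SVD, Dataset

data = Dataset.load_builtin('ml-100k')   # downloaded on first use
evaluator = ModelEvaluator()             # hypothetical class exposing evaluate_model
evaluator.evaluate_model(data, SVD())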
Example #2
    def KNN_pred(self, is_total=0, combin_func='avg'):
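        # locals() is used as a mutable namespace so the per-criterion
        # objects ('c1_test', 'c1_p', ...) can be created and looked up
        # through dynamically generated string keys below.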
        names = locals()
        r = Reader(rating_scale=(1, 5))
        df = self.testdatas
        total_test = np.array(df[['uid', 'iid', 'total']])
        total_p = self.algos[0].test(total_test)
        for i in range(1, self.no_of_criteria + 1):
            # names['c' + str(i) + '_test'] = np.array(df[['uid','iid', 'c' + str(i)]])
            names['c' + str(i) + '_test'] = Dataset.load_from_df(
                df[['uid', 'iid', 'c' + str(i)]], reader=r)
            # wrap the per-criterion ratings as a trainset, then expose them
            # as a testset so the matching algorithm can score every rating
            names['c' + str(i) + '_test'] = names.get(
                'c' + str(i) + '_test').build_full_trainset()
            names['c' + str(i) + '_test'] = names.get(
                'c' + str(i) + '_test').build_testset()
            names['c' + str(i) + '_p'] = self.algos[i].test(
                names.get('c' + str(i) + '_test'))

        multi_p = []
        if is_total == 0:
            if combin_func == 'avg':
                for i in range(len(total_p)):
                    s = 0
                    for j in range(1, self.no_of_criteria + 1):
                        s = s + names.get('c' + str(j) + '_p')[i].est
                    avg = s / self.no_of_criteria
                    p = predictions.Prediction(total_p[i].uid, total_p[i].iid,
                                               total_p[i].r_ui, avg,
                                               total_p[i].details)
                    multi_p.append(p)
            elif combin_func == 'total_reg':
                k = self.k
                b = self.b
                for i in range(len(total_p)):
                    s = 0
                    for j in range(self.no_of_criteria):
                        s = s + k[j] * names.get('c' + str(j + 1) +
                                                 '_p')[i].est
                    s = s + b
                    p = predictions.Prediction(total_p[i].uid, total_p[i].iid,
                                               total_p[i].r_ui, s,
                                               total_p[i].details)
                    multi_p.append(p)
        else:
            if combin_func == 'avg':
                for i in range(len(total_p)):
                    s = 0
                    for j in range(1, self.no_of_criteria + 1):
                        s = s + names.get('c' + str(j) + '_p')[i].est
                    avg = (s + total_p[i].est) / (self.no_of_criteria + 1)
                    p = predictions.Prediction(total_p[i].uid, total_p[i].iid,
                                               total_p[i].r_ui, avg,
                                               total_p[i].details)
                    multi_p.append(p)
            else:
                print('Using the overall rating as a criterion is not suitable for the regression aggregation function')
        s_mae = round(accuracy.mae(total_p), 4)
        m_mae = round(accuracy.mae(multi_p), 4)
        return s_mae, m_mae, total_p, multi_p
Example #3
def baseline(trainset, testset):
    algo = BaselineOnly()
    algo.fit(trainset)
    print("Predictions")
    predictions = algo.test(testset)
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    return predictions
Example #4
def svdalgorithm(trainset, testset):
    algo = SVD()
    algo.fit(trainset)
    print("Predictions")
    predictions = algo.test(testset)
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    return predictions
Example #5
def chose_yahoo(file_path):
    # mae= []
    # rmse = []
    reader = Reader(line_format='timestamp user item rating', sep='\t')
    # Load the data, including the multi-criteria ratings (story, role, show, image, music) and the overall rating
    story = Dataset.load_from_file(file_path + 'story.txt', reader=reader)
    role = Dataset.load_from_file(file_path + 'role.txt', reader=reader)
    show = Dataset.load_from_file(file_path + 'show.txt', reader=reader)
    image = Dataset.load_from_file(file_path + 'image.txt', reader=reader)
    music = Dataset.load_from_file(file_path + 'music.txt', reader=reader)
    total = Dataset.load_from_file(file_path + 'total.txt', reader=reader)
    # print('Data loaded successfully!\n')
    # Split each criterion's ratings into training and test sets
    random_states = 180
    story_train, story_test = train_test_split(story, random_state = random_states)
    role_train, role_test = train_test_split(role, random_state = random_states)
    show_train, show_test = train_test_split(show, random_state = random_states)
    image_train, image_test = train_test_split(image, random_state = random_states)
    music_train, music_test = train_test_split(music, random_state = random_states)
    total_train, total_test = train_test_split(total, random_state = random_states)
    # print('Data split successfully!\n')
    # Item-based collaborative filtering (KNNWithMeans); item similarity computed with Pearson correlation
    sim_options = {'name': 'pearson',  # Pearson similarity, to help avoid overfitting
                   'user_based': False}  # item-based, not user-based
    algo1 = KNNWithMeans(sim_options=sim_options)
    algo2 = KNNWithMeans(sim_options=sim_options)
    algo3 = KNNWithMeans(sim_options=sim_options)
    algo4 = KNNWithMeans(sim_options=sim_options)
    algo5 = KNNWithMeans(sim_options=sim_options)
    algo6 = KNNWithMeans(sim_options=sim_options)
    algo1.fit(story_train)
    algo2.fit(role_train)
    algo3.fit(show_train)
    algo4.fit(image_train)
    algo5.fit(music_train)
    algo6.fit(total_train)
    story_p = algo1.test(story_test)
    role_p = algo2.test(role_test)
    show_p = algo3.test(show_test)
    image_p = algo4.test(image_test)
    music_p = algo5.test(music_test)
    single_p = algo6.test(total_test)
    # rmse.append(accuracy.rmse(single_p))
    # Averaging method
    # multi_p = avg(story_p, role_p, show_p, image_p, music_p, single_p)
    # Overall regression
    P = combine(story_p, role_p, show_p, image_p, music_p, single_p)
    df = pd.read_csv(file_path + 'all.txt', sep = '\t', names = ['id', 'uid', 'mid', 'total', 'story', 'role', 'show', 'image', 'music'])
    k, b = totalRegModel(df)
    multi_p = totalReg(P, k, b, single_p)
    # Per-user regression
    
    mae = (accuracy.mae(single_p), accuracy.mae(multi_p))
    # rmse.append(accuracy.rmse(multi_p))
    return mae  # , rmse
Example #6
def generate_svd_recommendation_df() -> pd.DataFrame:
    # Prepare input DataFrame and algorithm
    score_df = genearte_score_df()
    svd_data = MyDataSet(score_df)
    #Try SVD
    algo = SVD()
    full_train_set = svd_data.build_full_trainset()
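    # build_anti_testset() returns every (user, item) pair that is absent from
    # the trainset, filled with the global mean rating, so predictions can be
    # made for items each user has not rated yet.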
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the SVD
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svd = get_top_n(predictions, n=5)
    #print (recommendation_df)
    
    
    #Try the NMF
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False) 
    algo = NMF()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the NMF
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    # Then compute RMSE and MAE
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    # Generate recommendation DataFrame
    recommendation_df_nmf = get_top_n(predictions, n=5)  # keep the NMF results separate from the SVD ones
    #print (recommendation_df)
    
    
    
    #---------------------------------------------------
    # as per - https://bmanohar16.github.io/blog/recsys-evaluation-in-surprise
    knnbasic_cv = cross_validate(KNNBasic(), svd_data, cv=5, n_jobs=5, verbose=False)
    knnmeans_cv = cross_validate(KNNWithMeans(), svd_data, cv=5, n_jobs=5, verbose=False)
    knnz_cv = cross_validate(KNNWithZScore(), svd_data, cv=5, n_jobs=5, verbose=False)

    # Matrix Factorization Based Algorithms
    svd_cv = cross_validate(SVD(), svd_data, cv=5, n_jobs=5, verbose=False)
    svdpp_cv = cross_validate(SVDpp(),svd_data, cv=5, n_jobs=5, verbose=False)
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False) 
    
    #Other Collaborative Filtering Algorithms
    slope_cv = cross_validate(SlopeOne(), svd_data, cv=5, n_jobs=5, verbose=False)
    coclus_cv = cross_validate(CoClustering(), svd_data, cv=5, n_jobs=5, verbose=False)
Example #7
def test_mae():
    """Tests for the MAE function."""

    predictions = [pred(0, 0), pred(1, 1), pred(2, 2), pred(100, 100)]
    assert mae(predictions) == 0

    predictions = [pred(0, 0), pred(0, 2)]
    assert mae(predictions) == abs(0 - 2) / 2

    predictions = [pred(2, 0), pred(3, 4)]
    assert mae(predictions) == (abs(2 - 0) + abs(3 - 4)) / 2

    with pytest.raises(ValueError):
        mae([])
Example #8
def test_mae():
    """Tests for the MAE function."""

    predictions = [pred(0, 0), pred(1, 1), pred(2, 2), pred(100, 100)]
    assert mae(predictions) == 0

    predictions = [pred(0, 0), pred(0, 2)]
    assert mae(predictions) == abs(0 - 2) / 2

    predictions = [pred(2, 0), pred(3, 4)]
    assert mae(predictions) == (abs(2 - 0) + abs(3 - 4)) / 2

    with pytest.raises(ValueError):
        mae([])
def test_model(model):

    reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
    fold_files = [('~/Desktop/Tufts/Fall2018/COMP135/Project3/trainset.csv',
                   '~/Desktop/Tufts/Fall2018/COMP135/Project3/testset.csv')]

    pdkfold = sp.model_selection.split.PredefinedKFold()
    clf = model.best_estimator['mae']
    data = Dataset.load_from_folds(fold_files, reader=reader)

    for train, test in pdkfold.split(data):

        clf.fit(train)
        preds = clf.test(test)
        accuracy.mae(preds)
def generate_test_score(test_preds, error_metric):
    if error_metric == 'rmse':
        return accuracy.rmse(test_preds)
    elif error_metric == 'mae':
        return accuracy.mae(test_preds)
    elif error_metric == 'fcp':
        return accuracy.fcp(test_preds)
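A quick usage sketch (assumed, not from the source), where preds is the list returned by any fitted Surprise algorithm's test() call:

preds = algo.test(testset)                     # algo and testset assumed to already exist
mae_score = generate_test_score(preds, 'mae')  # prints and returns the MAE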
def use_pearson_baseline():
    start = time.time()
    performance = []

    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    print('Using Pearson baseline')
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    algo_pearson = KNNBasic(sim_options=sim_options)
    algo_pearson.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions_KNN = algo_pearson.test(testset)

    accuracy_rmse = accuracy.rmse(predictions_KNN)
    accuracy_mae = accuracy.mae(predictions_KNN)
    performance.append(accuracy_rmse)
    performance.append(accuracy_mae)

    end = time.time()
    performance.append(end - start)

    return performance
Example #12
    def run_with_diff_k(self, algo, args, range_, folds=2, test_filter=None, threshold=2, msg=None, modal_name=None):
        arg_name = {
            'KNN': 'k',
            'NMF': 'n_factors',
            'SVD': 'n_factors'
        }[modal_name]

        rmse_by_k = []
        mae_by_k = []
        k_values = []
        for k in range(*range_):
            k_values.append(k)
            args.update({arg_name: k})
            modal = algo(**args)
            kf = KFold(n_splits=folds)
            rmse_by_fold = []
            mae_by_fold = []
            for trainset, testset in kf.split(self.data):
                modal.fit(trainset)
                if test_filter:
                    testset = test_filter(testset, threshold)
                predictions = modal.test(testset)
                rmse_by_fold.append(accuracy.rmse(predictions, verbose=True))
                mae_by_fold.append(accuracy.mae(predictions, verbose=True))
            rmse_by_k.append(np.mean(rmse_by_fold))
            mae_by_k.append(np.mean(mae_by_fold))
   
        plt.plot(k_values, rmse_by_k)
        plt.plot(k_values, mae_by_k)
        plt.legend(['RMSE', 'MAE'])
        plt.title(msg)
        plt.show()
Example #13
def get_svd_recommender(df, test_size=0.25, path="", exists=False):
    """
    builds and trains an SVD recommender
    :param df: a dataframe containing user ID's, beer ID's and ratings
    :param test_size: the fraction of samples that should be reserved for testing
    :param path: the path to an existing svd recommender that was saved to a file
    :param exists: whether or not to load the algo from a saved file
    :return: trained recommender, list of predictions, and the root mean square error of the recommender
    """
    if exists:
        return dump.load(path)[1]

    # allows surprise to read df
    reader = Reader(rating_scale=(1, 5))
    # must load in particular column order
    data = Dataset.load_from_df(df[['user_id', 'beer_id', 'user_score']],
                                reader)

    trainset, testset = train_test_split(data, test_size=test_size)
    algo = SVD()
    # Train the algorithm on the trainset
    algo.fit(trainset)
    # and predict ratings for the testset. test() returns a list of prediction objects
    # which have several attributes such as est (the prediction) and r_ui (the true rating)
    predictions = algo.test(testset)

    # an RMSE below 1 on this 1-5 rating scale is generally considered low
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)

    return algo, predictions, rmse
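A minimal usage sketch (assumed, not from the source), relying on the example's own surprise imports; the tiny DataFrame below is illustrative only:

import pandas as pd

ratings_df = pd.DataFrame({
    'user_id': [1, 1, 2, 2, 3],
    'beer_id': [10, 11, 10, 12, 11],
    'user_score': [4, 5, 3, 4, 2],
})
algo, predictions, rmse = get_svd_recommender(ratings_df, test_size=0.2)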
def use_cosine_similarity():
    start = time.time()
    performance = []

    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    print('Using cosine similarity')
    sim_options = {
        'name': 'cosine',
        'user_based': False  # compute  similarities between items
    }
    algo_cosine = KNNBasic(sim_options=sim_options)
    algo_cosine.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions_KNN = algo_cosine.test(testset)

    accuracy_rmse = accuracy.rmse(predictions_KNN)
    accuracy_mae = accuracy.mae(predictions_KNN)
    performance.append(accuracy_rmse)
    performance.append(accuracy_mae)

    end = time.time()
    performance.append(end - start)

    return performance
Example #15
    def recommender_nmf_baseline(self, train_file, test_file, output):

        train, test, train_dataset, test_dataset = prepare_datasets(
            train_file, test_file)
        # Use user_based true/false to switch between user-based or item-based collaborative filtering
        algo_nmf_baseline = NMF()

        algo_nmf_baseline.fit(train)

        #not_seen_elems = self.merge_train_set(train_dataset, test_dataset)

        #predictions_precision_svd = algo_svd.test(not_seen_elems, test, verbose=False, not_seen_flag=True)
        predictions_nmf_baseline = algo_nmf_baseline.test(test, verbose=False)

        #precisions, recalls = self.precision_recall_at_k(predictions_precision_svd, 10, threshold=0.0)
        # Precision and recall can then be averaged over all users
        #precision_avg = sum(prec for prec in precisions.values()) / len(precisions)
        #recall_avg = sum(rec for rec in recalls.values()) / len(recalls)
        #print('Precision: ' + str(precision_avg) + ' Recall: ' + str(recall_avg) + ' RMSE: ' + str(
        #    rmse(predictions_svd, verbose=False)) + ' MAE: ' + str(mae(predictions_svd, verbose=False)))
        print('NMF BASELINE: ' + ' RMSE ' +
              str(rmse(predictions_nmf_baseline, verbose=False)) + ' MAE ' +
              str(mae(predictions_nmf_baseline, verbose=False)))

        return algo_nmf_baseline
def acc(df, alg, algname, n_train, n_users, cutoff=.5):
    np.random.seed(0)

    train, test = subswipe_data(df, n_train=n_train, n_test=test_count, n_users=n_users)

    alg.fit(train)
    predictions = alg.test(test)

    # Change predictions to binary choice of left or right. Prediction class derives from NamedTuple.
    predictions = [
        Prediction(prediction.uid, prediction.iid, prediction.r_ui, int(prediction.est < cutoff), prediction.details)
        for prediction in predictions
    ]

    # Mean absolute error.
    mae = accuracy.mae(predictions)

    df_predicted = pd.DataFrame(columns=["uid", "iid", "predicted", "actual"])
    for prediction in predictions:
        df_predicted = df_predicted.append(
            {
                "uid": prediction.uid,
                "iid": prediction.iid,
                "predicted": prediction.est,
                "actual": df_swipes[prediction.uid].loc[prediction.iid]
            },
            ignore_index=True
        )

    acc_dict = {"algname": algname, "n_train": n_train, "n_users": n_users, "acc": mae}
    print(acc_dict)
    return acc_dict
Example #17
def surprise_algos(train, test, svdpp=False):
    train_set, test_set = get_train_test(train, test)
    algos = [NormalPredictor, BaselineOnly, SlopeOne, NMF, SVD]
    if svdpp:
        algos.append(SVDpp)
    values = {}
    values['Method'] = []
    values['RMSE'] = []
    values['MAE'] = []
    for algo_constructor in algos:
        name = get_name(algo_constructor)
        print(name)
        try:
            algo = algo_constructor(random_state=0)
        except TypeError:
            # this algorithm's constructor does not accept random_state
            algo = algo_constructor()
        algo.fit(train_set)
        predictions = algo.test(test_set)

        rmse = accuracy.rmse(predictions)
        mae = accuracy.mae(predictions)
        values['Method'].append(name)
        values['RMSE'].append(rmse)
        values['MAE'].append(mae)
    return pd.DataFrame(values).sort_values('RMSE', ascending=False).set_index('Method')
def use_sgd():
    start = time.time()
    performance = []

    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    print('Using SGD')
    bsl_options = {
        'method': 'sgd',
        'learning_rate': .005,
    }

    algo_SGD = BaselineOnly(bsl_options=bsl_options)
    algo_SGD.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions_SGD = algo_SGD.test(testset)

    accuracy_rmse = accuracy.rmse(predictions_SGD)
    accuracy_mae = accuracy.mae(predictions_SGD)
    performance.append(accuracy_rmse)
    performance.append(accuracy_mae)

    end = time.time()
    performance.append(end - start)

    return performance
def use_als():
    start = time.time()
    performance = []

    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    print('Using ALS')
    bsl_options = {'method': 'als', 'n_epochs': 20, 'reg_u': 12, 'reg_i': 5}
    algo_ALS = BaselineOnly(bsl_options=bsl_options)
    algo_ALS.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions_ALS = algo_ALS.test(testset)

    accuracy_rmse = accuracy.rmse(predictions_ALS)
    accuracy_mae = accuracy.mae(predictions_ALS)
    performance.append(accuracy_rmse)
    performance.append(accuracy_mae)

    end = time.time()
    performance.append(end - start)

    return performance
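Each of the use_* helpers above returns the same [rmse, mae, elapsed_seconds] list, so they can be compared with a small loop (a sketch, assuming all four functions are in scope in one module):

for run in (use_pearson_baseline, use_cosine_similarity, use_sgd, use_als):
    rmse_val, mae_val, seconds = run()
    print(f'{run.__name__}: RMSE={rmse_val:.4f}  MAE={mae_val:.4f}  ({seconds:.1f}s)')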
Example #20
    def test_model(self):
        # Checks the predicted values against the test set
        # Returns Mean Absolute Error (MAE) and Root Mean Square Error (RMSE)
        predictions = self.model.test(self.test)
        return accuracy.mae(predictions,
                            verbose=False), accuracy.rmse(predictions,
                                                          verbose=False)
Example #21
    def use_algo(algo, name):
        start = time.time()
        algo.fit(trainset)
        predictions = algo.test(testset)
        end = time.time()
        total_time = end - start
        rmse = accuracy.rmse(predictions, verbose=False)
        mae = accuracy.mae(predictions, verbose=False)

        ex_ee = extraction_efficiency(algo, train_affinities, validation_affinities, surprise_get_topk, items)

        predictions = algo.test(trainset_for_testing)
        train_rmse = accuracy.rmse(predictions, verbose=False)
        train_mae = accuracy.mae(predictions, verbose=False)
        return {"algo": name, "rmse": rmse, "mae": mae, "map": ex_ee["map"], "retrieval_time": ex_ee["retrieval_time"],
                "train_rmse": train_rmse, "train_mae": train_mae, "time": total_time}
Example #22
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_file_path",
                        default="data/train.csv",
                        help="training file path")
    parser.add_argument("--test_file_path",
                        default="data/test.csv",
                        help="testing file path")
    parser.add_argument("--approach",
                        default="SVD",
                        help="Baseline | SVD | SlopeOne | NMF | CoClustering")
    parser.add_argument("--output_ranking_file",
                        default="ranking",
                        help="output ranking for test")
    bsl_options = {'method': 'sgd', 'n_epochs': 20, 'reg_u': 100, 'reg_i': 50}
    options = {
        "Baseline": BaselineOnly(bsl_options, verbose=True),
        "SVD": SVD(verbose=True, n_factors=20, n_epochs=3),
        "SlopeOne": SlopeOne(),
        "NMF": NMF(),
        "CoClustering": CoClustering()
    }
    args = parser.parse_args()
    reader = Reader(line_format='user item rating timestamp', sep='\t')
    algo = options[args.approach]
    train_data = Dataset.load_from_file(args.train_file_path, reader=reader)
    test_data = Dataset.load_from_file(args.test_file_path, reader=reader)
    train_set = train_data.build_full_trainset()
    test_set = test_data.build_full_trainset().build_testset()
    print("training....")
    algo.fit(train_set)
    print("testing...")
    predictions = algo.test(test_set)
    accuracy.mae(predictions, verbose=True)
    accuracy.rmse(predictions, verbose=True)
    ### Extra Credit
    output_ranking(predictions,
                   args.output_ranking_file + "_" + args.approach + ".out")
    precisions, recalls = precision_recall_at_k(predictions,
                                                k=10,
                                                threshold=2.5)
    print("Precision:",
          sum(prec for prec in precisions.values()) / len(precisions))
    print("Recall:", sum(rec for rec in recalls.values()) / len(recalls))
    print("F-measure:", f_measure(precisions, recalls))
    print("conversion_rate:", get_conversion_rate(predictions, k=10))
    print("ndcg:", get_ndcg(predictions, k_highest_scores=10))
Example #23
def svdalgorithm(trainset, testset):

    print("\n" + "-" * 5 + " SVD algorithm using surprise package " + "-" * 5)
    algo = SVD()
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions
    def evaluate_cold_users(self):
        print('evaluating cold users', self.name, '... ', end='')
        start = time.time()
        self.cold_mae = accuracy.mae(self.cold_predictions, verbose=False)
        self.cold_rmse = accuracy.rmse(self.cold_predictions, verbose=False)
        precisions_and_recalls = [precision_recall_at_k(self.cold_predictions, k) for k in self.ks]
        self.cold_MAPs, self.cold_MARs = zip(*precisions_and_recalls)
        end = time.time()
        print('done in', round(end - start), 'seconds')
Example #25
def svdpp(trainset, testset):
    # Matrix factorization - SVD++
    print("\n" + "-" * 5 + " SVD++ algorithm using surprise package " +
          "-" * 5)
    algo = SVDpp()
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions
Example #26
def slopeOne(trainset, testset):
    # Slope One
    print("\n" + "-" * 5 + " SlopeOne algorithm using surprise package " +
          "-" * 5)
    algo = SlopeOne()
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions
Example #27
def coClustering(trainset, testset):
    # CoClustering
    print("\n" + "-" * 5 + " CoClustering algorithm using surprise package " +
          "-" * 5)
    algo = CoClustering()
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions
Example #28
def baseline(trainset, testset):

    print("\n" + "-" * 5 + " Baseline algorithm using surprise package " +
          "-" * 5)
    algo = BaselineOnly()
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions
Example #29
def main():
    # save path to training data csv
    # convert to panda Dataframe to bypass an error
    #file_path = os.path.expanduser('../data/train.csv')
    #df = pd.read_csv(path=file_path, sep = ';')

    # pickle_dict = pickle.load('../data/train_update.csv')
    # df = pd.DataFrame(ratings_dict)

    # load dataset into dataframe
    train = pd.read_csv('../data/train_update.csv', sep=';')
    test = pd.read_csv('../data/test_update.csv', sep=';')

    reader = Reader(rating_scale=(0, 10))


    train_set = Dataset.load_from_df(train[['User-ID', 'ISBN', 'Book-Rating']],
                                     reader=reader)
    test_set = Dataset.load_from_df(test[['User-ID', 'ISBN', 'Book-Rating']],
                                    reader=reader)
    # load data from file
    # data = Dataset.load_from_df(df[['User-ID', 'ISBN', 'Book-Rating']], reader=reader)

    # to use when train on full train set
    # trainset = train_set.build_full_trainset()
    # validationset = trainset.build_testset()

    # create classifier (using a basic k nearest neighbors approach)
    algo = KNNBasic()

    trainset, testset = train_test_split(train_set,
                                         test_size=.9,
                                         random_state=1234)
    algo.fit(trainset)

    #cross_validate(algo, trainset, verbose=True)
    predictions = algo.test(testset)

    # compute MAE and RMSE
    accuracy.mae(predictions)
    accuracy.rmse(predictions)
def fit_model(data):
    train, test = train_test_split(data, test_size=0.25)
    svd = SVD(n_epochs=25, lr_all=0.01, reg_all=0.4)
    svd.fit(train)
    pred = svd.test(test)
    print('RMSE for test set: {}'.format(accuracy.rmse(pred)))
    print('MAE for test set: {}'.format(accuracy.mae(pred)))
    # save model
    path = '../Models/Collaborative_filtering2.model'
    pickle.dump(svd, open(path, 'wb'))
    print("Model is saved to: {}".format(path))
Example #31
    def metric(predictions, verbose=True, metric_type="rmse"):
        assert metric_type in {"mse", "fcp", "mae", "rmse"}
        if metric_type == "mse":
            metric = accuracy.mse(predictions=predictions, verbose=verbose)
        elif metric_type == "fcp":
            metric = accuracy.fcp(predictions=predictions, verbose=verbose)
        elif metric_type == "mae":
            metric = accuracy.mae(predictions=predictions, verbose=verbose)
        else:
            metric = accuracy.rmse(predictions=predictions, verbose=verbose)
        return metric
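A short usage sketch (assumed), where predictions is the list returned by a fitted algorithm's test() call and metric is accessible as a plain helper or staticmethod:

rmse_val = metric(predictions)                    # defaults to RMSE; `predictions` assumed to exist
mae_val = metric(predictions, metric_type='mae')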
Example #32
algos_name.append('SVDpp')
algos.append(SVDpp(n_factors=1, random_state=1))

#algos_name.append('KNN')
#algos.append(KNNBasic())

for name, algo in zip(algos_name, algos):
    print('===', name)
    trainset, testset = train_test_split(data, test_size=0.2, random_state=1)

    # train and test algorithm.
    predictions = algo.fit(trainset).test(testset)

    # Compute and print Mean Absolute Error (MAE)
    accuracy.mae(predictions, verbose=True)

    # predict
    pred_test = []
    for u,b in zip(user_test, book_test):
        pred_test.append(algo.predict(u,b).est)
    pred_test = np.array(pred_test)
    pred_test[pred_test > 10] = 10
    pred_test[pred_test < 1] = 1

    # write output
    output_path_raw = os.path.join('outputs', 'subm_surprise_'+name+'_raw.csv')
    np.savetxt(output_path_raw, pred_test, fmt='%1.4f')
    print('Output written to %s' % output_path_raw)
    output_path_round = os.path.join('outputs', 'subm_surprise_'+name+'_round.csv')
    np.savetxt(output_path_round, np.around(pred_test), fmt='%d')