Example No. 1
def slope_one(test, train, all):
    start = time.time()
    reader = Reader(rating_scale=(0.5, 5))
    data = Dataset.load_from_df(train[['userId', 'movieId', 'rating']], reader)
    test_data = Dataset.load_from_df(test[['userId', 'movieId', 'rating']],
                                     reader)
    trainset = data.build_full_trainset()
    testset = test_data.build_full_trainset().build_testset()
    algo = SlopeOne()
    algo.fit(trainset)
    fit = time.time()
    fit_time = fit - start
    predictions = algo.test(testset)
    uid = []
    mid = []
    rate = []
    for i in range(len(predictions)):
        uid.append(predictions[i].uid)
        mid.append(predictions[i].iid)
        rate.append(predictions[i].est)
    out = {'userId': uid, 'movieId': mid, 'rating': rate}
    out = pd.DataFrame.from_dict(out)
    predict_time = time.time() - fit
    overall = predict_time + fit - start
    return out, [fit_time, predict_time, overall]
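A minimal usage sketch for the function above, assuming a MovieLens-style ratings DataFrame with userId, movieId and rating columns (the CSV path and the 80/20 split are illustrative, not from the original source):

import time
import pandas as pd
from surprise import Dataset, Reader, SlopeOne

ratings = pd.read_csv('ratings.csv')  # hypothetical ratings file
train = ratings.sample(frac=0.8, random_state=42)
test = ratings.drop(train.index)

out, (fit_time, predict_time, overall) = slope_one(test, train, ratings)
print(out.head())
print('fit/predict/total seconds:', fit_time, predict_time, overall)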
Example No. 2
def surprise_slopeOne(train_file, test_file):
    """
    SlopeOne with Surprise library.
    Compute the predictions on a test_set after training on a train_set using the method SlopeOne from Surprise.
    Args:
        train_file (string): path to the created train file
        test_file (string): path to the created test file
    Hyperparameters:
        -
    Returns:
        numpy array: predictions
    """
    print("slopeone")
    algo = SlopeOne()
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)

        # Predict
        predictions = algo.test(testset)
    pred = np.zeros(len(predictions))
    for i in range(len(predictions)):
        val = predictions[i].est
        pred[i] = val
    return pred
def SlopeOne_alg():
    print('Using SlopeOne')
    alg = SlopeOne()
    print(alg)
    alg.fit(trainset)
    predictions = alg.test(testset)
    print(accuracy.rmse(predictions))
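SlopeOne_alg above reads module-level trainset and testset objects that the snippet does not define. A minimal setup sketch, assuming Surprise's built-in ml-100k dataset and an 80/20 split (both are assumptions, not part of the original code):

from surprise import Dataset, SlopeOne, accuracy
from surprise.model_selection import train_test_split

data = Dataset.load_builtin('ml-100k')  # assumed dataset
trainset, testset = train_test_split(data, test_size=0.2)
SlopeOne_alg()  # now finds its module-level trainset and testset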
Example No. 4
	def get(self, user_id):
		# SQL query
		conn = mysql.connect()
		cursor = conn.cursor()
		df = pd.read_sql_query("SELECT * FROM story_reviews", conn)

		# Data and Model
		reader = Reader(rating_scale=(1, 5))
		data = Dataset.load_from_df(df[['user_id', 'story_id', 'star']], reader)
		model = SlopeOne()
		
		# Training
		training_set = data.build_full_trainset()
		model.fit(training_set)

		# Prediction
		anti_training_set = training_set.build_anti_testset()
		prediction_set = [x for x in anti_training_set if x[0]==user_id]
		predictions = model.test(prediction_set)
		
		# Return Top N Recommendations
		n = 10
		predictions.sort(key=lambda x:x.est, reverse=True)
		top_n_predictions = predictions[:n]

		story_recommendations = []
		
		for predictionItem in top_n_predictions:
			story_recommendations.append(predictionItem.iid)

		return jsonify(recommendations = story_recommendations)
Example No. 5
 def SlopeOne_train(self):
     '''
     seed: int, default 3 - random seed for splitting the train and test sets
     k: int, default 40 - maximum number of neighbors
     options: dict, default {'name': 'pearson', 'user_based': False} - algorithm
         options: Pearson similarity, item-based approach
     '''
     self.algos = []
     df = self.trainDatas
     names = {}  # plain dict in place of the original locals() hack
     r = Reader(rating_scale=(1, 5))
     # Load the data; fit one SlopeOne model for the overall rating, then one per criterion
     total = Dataset.load_from_df(df[['uid', 'iid', 'total']], reader=r)
     total_train = total.build_full_trainset()
     total_algo = SlopeOne()
     total_algo.fit(total_train)
     self.algos.append(total_algo)
     for i in range(1, self.no_of_criteria + 1):
         names['c' + str(i)] = Dataset.load_from_df(
             df[['uid', 'iid', 'c' + str(i)]], reader=r)
         names['c' + str(i) +
               '_train'] = names.get('c' + str(i)).build_full_trainset()
         names['algo_c' + str(i)] = SlopeOne()
         names.get('algo_c' + str(i)).fit(names.get('c' + str(i) +
                                                    '_train'))
         self.algos.append(names.get('algo_c' + str(i)))
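A hypothetical companion method, sketching how the per-criterion models stored in self.algos might be queried afterwards (the method name is invented here; self.algos holds the overall model first, then one model per criterion, in training order):

 def SlopeOne_predict(self, uid, iid):
     # One estimate per fitted model: overall rating first, then c1..cN.
     return [algo.predict(uid, iid).est for algo in self.algos]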
Example No. 6
def slopeOne(trainset, testset):
    # Slope One
    print("\n" + "-" * 5 + " SlopeOne algorithm using surprise package " +
          "-" * 5)
    algo = SlopeOne()
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions
Example No. 7
 def slope_one(self):
     """
     SlopeOne reflects how much more one item is liked than another.
     Returns:
         predictions_df: The predictions of the model on the test data in
             Pandas Data Frame format
     """
     algorithm = SlopeOne()
     predictions = algorithm.fit(self.train_data).test(self.test_data)
     predictions_df = self.data.test_df.copy()
     predictions_df['Rating'] = [x.est for x in predictions]
     if self.test_purpose: 
         self.evalueate_model(predictions_df['Rating'], 'Surprise slope_one')
     return predictions_df
Example No. 8
def slope_one():
    print('SlopeOne algorithm...')
    print('Which dataset do you want to use?')
    print('(1) Android')
    print('(2) WordPress')
    data_utilizar = input()  # under Python 2, input() evaluates the entry to an int

    # Encoding workaround so the file reads without errors (Python 2 only).
    reload(sys)
    sys.setdefaultencoding('utf8')

    if data_utilizar == 1:
        file_path = configuration.FILE_PATH_ANDROID
        reader = Reader(line_format='user item rating', sep='\t')
    else:
        file_path = configuration.FILE_PATH_WORDPRESS
        reader = Reader(line_format='user item rating', sep=',')

    # Dataset (data.split below and evaluate/print_perf use the pre-1.05
    # Surprise API; a modern equivalent is sketched after this function)
    data = Dataset.load_from_file(file_path, reader=reader)
    data.split(n_folds=10)

    algo = SlopeOne()

    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
    print_perf(perf)
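slope_one above targets the pre-1.05 Surprise API: data.split, evaluate and print_perf were removed in later releases. A sketch of the equivalent 10-fold evaluation with the current cross-validation API, reusing the file_path and reader chosen above:

from surprise import Dataset, Reader, SlopeOne
from surprise.model_selection import cross_validate

reader = Reader(line_format='user item rating', sep=',')
data = Dataset.load_from_file(file_path, reader=reader)
# cross_validate folds internally, replacing data.split + evaluate + print_perf
cross_validate(SlopeOne(), data, measures=['RMSE', 'MAE'], cv=10, verbose=True)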
def EvaluateDifferentAlgorithms():
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            SVD(),
            SVDpp(),
            SlopeOne(),
            NMF(),
            NormalPredictor(),
            KNNBaseline(),
            KNNBasic(),
            KNNWithMeans(),
            KNNWithZScore(),
            BaselineOnly(),
            CoClustering()
    ]:
        # Perform cross validation
        results = cross_validate(algorithm,
                                 data_6months,
                                 measures=['RMSE'],
                                 cv=3,
                                 verbose=False)

        # Get results & append algorithm name
        # (pd.Series.append was removed in pandas 2.0; use pd.concat on newer versions)
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                      index=['Algorithm']))
        benchmark.append(tmp)

        print(
            pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
                'test_rmse'))
Example No. 10
 def __init__(self):
     super().__init__("slope", SlopeOne, param_grid={})
     best_params = super().tune()
     print(best_params)
     if not best_params:  # nothing to tune, so fall back to a default SlopeOne
         self.algo = SlopeOne()
def select_model(user_review):
    user_review = data_prep()  # note: the argument is overwritten with freshly prepared data
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(
        user_review[['user_id', 'business_id', 'stars']], reader)
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            KNNBasic(),
            KNNBaseline(),
            KNNWithMeans(),
            SVD(),
            SVDpp(),
            SlopeOne(),
            NMF()
    ]:
        # Perform cross validation
        print(algorithm)
        print('start ......')
        results = cross_validate(algorithm,
                                 data,
                                 measures=['RMSE', 'MAE'],
                                 cv=3,
                                 verbose=False)

        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                      index=['Algorithm']))
        benchmark.append(tmp)
        print(benchmark)
def check_for_args():
    args = sys.argv
    for arg in args:
        if (arg == 'SVD'):
            alg_list.append(SVD())
        elif (arg == 'SVDpp'):
            alg_list.append(SVDpp())
        elif (arg == 'SlopeOne'):
            alg_list.append(SlopeOne())
        elif (arg == 'NMF'):
            alg_list.append(NMF())
        elif (arg == 'NormalPredictor'):
            alg_list.append(NormalPredictor())
        elif (arg == 'KNNBaseline'):
            alg_list.append(KNNBaseline())
        elif (arg == 'KNNBasic'):
            alg_list.append(KNNBasic())
        elif (arg == 'KNNWithMeans'):
            alg_list.append(KNNWithMeans())
        elif (arg == 'KNNWithZScore'):
            alg_list.append(KNNWithZScore())
        elif (arg == 'BaselineOnly'):
            alg_list.append(BaselineOnly())
        elif (arg == 'CoClustering'):
            alg_list.append(CoClustering())

    return alg_list
Example No. 13
def run_baselines(ratings_dict, compressed_test_ratings_dict, data_origin):
    for alg in algos:
        if alg == "KNNBasic":
            algo = KNNBasic()
        elif alg == "KNNWithZScore":
            algo = KNNWithZScore()
        elif alg == "SVD":
            algo = SVD()
        elif alg == "NMF":
            algo = NMF()
        elif alg == "SlopeOne":
            algo = SlopeOne()
        elif alg == "CoClustering":
            algo = CoClustering()

        if data_origin in ('netflix', 'small', '100k'):
            nr_predictions, accuracy, rmse, mae, precision, recall, f1 = testing(
                algo, ratings_dict, compressed_test_ratings_dict, data_origin)

        # print results
        print("\n\nAlg %s" % alg)
        print("Number of user-items pairs: %d" % nr_predictions)
        print("Accuracy: %.2f " % accuracy)
        print("RMSE: %.2f" % rmse)
        print("MAE: %.2f" % mae)
        print("Precision: %.2f" % precision)
        print("Recall: %.2f" % recall)
        print("F1: %.2f" % f1)
Example No. 14
def benchmark(data):
    performance = []
    algorithms = [
        SVD(),
        SVDpp(),
        SlopeOne(),
        NMF(),
        NormalPredictor(),
        KNNBaseline(),
        KNNBasic(),
        KNNWithMeans(),
        KNNWithZScore(),
        BaselineOnly(),
        CoClustering(),
        SVD_SGD_momentum(),
        SVDpp_SGD_momentum()
    ]
    for algorithm in algorithms:
        results = cross_validate(algorithm,
                                 data,
                                 measures=['RMSE', 'MAE', 'FCP'],
                                 cv=3,
                                 verbose=False)
        output = pd.DataFrame.from_dict(results).mean(axis=0)
        output = output.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                      index=['Algorithm']))
        performance.append(output)
    output_df = pd.DataFrame(performance).set_index('Algorithm').sort_values(
        'test_rmse')
    store_dataframe(output_df, 'Algorithm_Benchmark.csv')
Example No. 15
def crossvalidate(data):
    results = []
    for algorithm in [
            NormalPredictor(),
            KNNBaseline(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNBasic(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNWithMeans(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNWithZScore(k=15, sim_options=similarity_measure('pearson', 1)),
            BaselineOnly(),
            SVD(),
            SVDpp(),
            NMF(),
            SlopeOne(),
            CoClustering()
    ]:
        result = cross_validate(algorithm,
                                data,
                                measures=['RMSE'],
                                cv=5,
                                verbose=False)
        temp = pd.DataFrame.from_dict(result).mean(axis=0)
        temp = temp.append(
            pd.Series([str(algorithm).split(' ')[0].split(".")[-1]],
                      index=['Algorithm']))
        results.append(temp)
    rmse_values = pd.DataFrame(results).set_index('Algorithm').sort_values(
        'test_rmse')
    return rmse_values
Example No. 16
def computeSlopeOne(data, test_np):
    """Compute the Slope One method and return the predictions on the test set.
    The method has no hyperparameters.

        data : data frame representing the train set
        test_np : data frame for which the predictions will be returned

        return : test_np with a prediction column named 'slopeone_rating'"""

    trainset, test = dataTrainSurprise(data, test_np)

    slopeone = SlopeOne().fit(trainset)

    # Prediction is a namedtuple; .est is the estimated rating (field 3).
    test['slopeone_rating'] = test[['user_id', 'movie_id']] \
        .apply(lambda row: slopeone.predict(row['user_id'], row['movie_id']).est, axis=1)
    
    return test
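dataTrainSurprise is a project helper that is not shown in this snippet. A plausible reconstruction, assuming it wraps the train DataFrame in a Surprise trainset and passes the test frame through unchanged (the column names and the 1-5 rating scale are assumptions):

import pandas as pd
from surprise import Dataset, Reader

def dataTrainSurprise(data, test_np):
    # Hypothetical: build a full trainset from the train frame.
    reader = Reader(rating_scale=(1, 5))
    dataset = Dataset.load_from_df(data[['user_id', 'movie_id', 'rating']], reader)
    return dataset.build_full_trainset(), test_np.copy()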
Example No. 17
 def _hyperopt(self):
     algo = SlopeOne()
     return cross_validate(algo,
                           self._data,
                           measures=ACCURACY_METRICS,
                           cv=self._cv,
                           n_jobs=self._cv_n_jobs,
                           verbose=self._debug)[self._metric].mean()
Example No. 18
def slope_one(trainset, testset, predset):
    
    modelname = 'slopeone'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return
    
    algo = SlopeOne()
    print('SlopeOne Model')
    algo.fit(trainset)  # fit() replaces train(), which was removed in Surprise 1.1
    
    predictions = algo.test(trainset.build_testset())
    print('   RMSE on Train: ', accuracy.rmse(predictions, verbose=False))
    
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   RMSE on Test: ', rmse)
    preds = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds[j] = pred.est
    save_predictions(modelname, rmse, preds, 'test')

    print('  Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds[j] = pred.est
    save_predictions(modelname, rmse, preds)
Example No. 19
 def SlopeOne(self, namefile, uid, iid, rati, value_uid, value_iid):
     test_data = pd.read_csv('./container/' + namefile)
     dt = pd.DataFrame(test_data)
     # Retrieve the trainset.
     reader = Reader(rating_scale=(0, 100))
     data = Dataset.load_from_df(dt[[uid, iid, rati]], reader)
     trainset = data.build_full_trainset()
     algo = SlopeOne()
     algo.fit(trainset)
     pred = algo.predict(float(value_uid),
                         float(value_iid),
                         r_ui=1,
                         verbose=True)
     #var_rmse = accuracy.rmse(pred)
     #return result to json
     jsondata = {}
     jsondata["uid"] = pred.uid
     jsondata["idd"] = pred.iid
     jsondata["rati"] = round(pred.est, 2)
     return jsondata
Example No. 20
def generate_svd_recommendation_df() -> pd.DataFrame:
    # Prepare input DataFrame and algorithm
    score_df = genearte_score_df()
    svd_data = MyDataSet(score_df)
    #Try SVD
    algo = SVD()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the SVD
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svd = get_top_n(predictions, n=5)
    #print (recommendation_df)
    
    
    # Try NMF
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False)
    algo = NMF()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the NMF
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    # Then compute RMSE and MAE
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    # Generate recommendation DataFrame
    recommendation_df_nmf = get_top_n(predictions, n=5)
    #print (recommendation_df)
    
    
    
    #---------------------------------------------------
    # as per - https://bmanohar16.github.io/blog/recsys-evaluation-in-surprise
    knnbasic_cv = cross_validate(KNNBasic(), svd_data, cv=5, n_jobs=5, verbose=False)
    knnmeans_cv = cross_validate(KNNWithMeans(), svd_data, cv=5, n_jobs=5, verbose=False)
    knnz_cv = cross_validate(KNNWithZScore(), svd_data, cv=5, n_jobs=5, verbose=False)

    # Matrix Factorization Based Algorithms
    svd_cv = cross_validate(SVD(), svd_data, cv=5, n_jobs=5, verbose=False)
    svdpp_cv = cross_validate(SVDpp(),svd_data, cv=5, n_jobs=5, verbose=False)
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False) 
    
    #Other Collaborative Filtering Algorithms
    slope_cv = cross_validate(SlopeOne(), svd_data, cv=5, n_jobs=5, verbose=False)
    coclus_cv = cross_validate(CoClustering(), svd_data, cv=5, n_jobs=5, verbose=False)
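get_top_n is not defined in this snippet; the usual definition (from the Surprise FAQ) maps each user to their n highest-estimated items:

from collections import defaultdict

def get_top_n(predictions, n=10):
    """Return a dict mapping each uid to its n best (iid, est) pairs."""
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n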
Example No. 21
def randomize():
    sim_options_cosine = {'name': 'cosine', 'user_based': False}
    sim_options_msd = {'name': 'msd', 'user_based': False}
    sim_options_pearson = {'name': 'pearson', 'user_based': False}
    sim_options_baseline = {
        'name': 'pearson_baseline',
        'user_based': False,
        'shrinkage': 0
    }

    algorithms = [
        ('kNN Basic - Cosine',
         KNNBasic(sim_options=sim_options_cosine, verbose=False)),
        ('kNN Basic - MSD', KNNBasic(sim_options=sim_options_msd,
                                     verbose=False)),
        ('kNN Basic - Pearson',
         KNNBasic(sim_options=sim_options_pearson, verbose=False)),
        ('kNN Basic - Pearson B',
         KNNBasic(sim_options=sim_options_baseline, verbose=False)),
        ('kNN Means - Cosine',
         KNNWithMeans(sim_options=sim_options_cosine, verbose=False)),
        ('kNN Means - MSD',
         KNNWithMeans(sim_options=sim_options_msd, verbose=False)),
        ('kNN Means - Pearson',
         KNNWithMeans(sim_options=sim_options_pearson, verbose=False)),
        ('kNN Means - Pearson B',
         KNNWithMeans(sim_options=sim_options_baseline, verbose=False)),
        ('kNN Z - Cosine',
         KNNWithZScore(sim_options=sim_options_cosine, verbose=False)),
        ('kNN Z - MSD',
         KNNWithZScore(sim_options=sim_options_msd, verbose=False)),
        ('kNN Z - Pearson',
         KNNWithZScore(sim_options=sim_options_pearson, verbose=False)),
        ('kNN Z - Pearson B',
         KNNWithZScore(sim_options=sim_options_baseline, verbose=False)),
        ('kNN Baseline - Cosine',
         KNNBaseline(sim_options=sim_options_cosine, verbose=False)),
        ('kNN Baseline - MSD',
         KNNBaseline(sim_options=sim_options_msd, verbose=False)),
        ('kNN Baseline - Pearson',
         KNNBaseline(sim_options=sim_options_pearson, verbose=False)),
        ('kNN Baseline - Pearson B',
         KNNBaseline(sim_options=sim_options_baseline, verbose=False)),
        ('SVD', SVD(verbose=False)), ('SVDpp', SVDpp(verbose=False)),
        ('Baseline Only', BaselineOnly(verbose=False)),
        ('CoClustering', CoClustering(verbose=False)),
        ('SlopeOne', SlopeOne()), ('NMF', NMF(verbose=False))
    ]

    # randint is inclusive on both ends, so len(algorithms) - 1 avoids an IndexError
    random_ = random.randint(0, len(algorithms) - 1)

    return algorithms[random_]
Example No. 22
	def get(self, algorithm, user_id):
		# SQL query
		conn = mysql.connect()
		cursor = conn.cursor()
		df = pd.read_sql_query("SELECT * FROM story_reviews", conn)

		# Data and Model
		reader = Reader(rating_scale=(1, 5))
		data = Dataset.load_from_df(df[['user_id', 'story_id', 'star']], reader)

		if algorithm=='svd':
			print('Using SVD')
			model = SVD()
		elif algorithm=='svdpp':
			print('Using SVD++')
			model = SVDpp()
		elif (algorithm=='nmf'):
			print('Using NMF')
			model = NMF()
		elif (algorithm=='slopeone'):
			print('Using Slope One')
			model = SlopeOne()
		elif (algorithm=='coclustering'):
			print('Using Co-Clustering')
			model = CoClustering()
		else:
			print('Using SVD')
			model = SVD()
		
		# Training
		training_set = data.build_full_trainset()
		model.fit(training_set)

		# Prediction
		anti_training_set = training_set.build_anti_testset()
		prediction_set = [x for x in anti_training_set if x[0]==user_id]
		predictions = model.test(prediction_set)

		# TESTING : Run 5-fold Cross Validation using Root Mean Square Error and Mean Average Error
		# cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
		
		# Return Top N Recommendations
		n = 10
		predictions.sort(key=lambda x:x.est, reverse=True)
		top_n_predictions = predictions[:n]

		story_recommendations = []
		
		for predictionItem in top_n_predictions:
			story_recommendations.append(predictionItem.iid)

		return jsonify(recommendations = story_recommendations)
Example No. 23
    def SlopeOne_from_to(self, namefile, uid, iid, rati, from_uid, to_uid,
                         from_iid, to_iid):
        test_data = pd.read_csv('./container/' + namefile)
        dt = pd.DataFrame(test_data)
        # Retrieve the trainset.
        reader = Reader(rating_scale=(0, 100))
        data = Dataset.load_from_df(dt[[uid, iid, rati]], reader)
        trainset = data.build_full_trainset()
        algo = SlopeOne()
        algo.fit(trainset)

        arr = []
        for value_uid in range(from_uid, to_uid):  # note: range() excludes to_uid
            for value_iid in range(from_iid, to_iid):
                pred = algo.predict(value_uid, value_iid, r_ui=1, verbose=True)
                tempdata = []
                tempdata.append(pred.uid)
                tempdata.append(pred.iid)
                tempdata.append(round(pred.est, 2))
                arr.append(tempdata)
        #return result to json
        return arr
Example No. 24
    def EvaluateAllModels(self):
        """
                         test_rmse   fit_time  test_time
        Algorithm
        SVDpp             0.965824   9.401286   0.151476
        SVD               0.967286   1.474139   0.062471
        BaselineOnly      0.972408   0.108964   0.057277
        NMF               0.992677   4.073005   0.171846
        KNNWithZScore     1.001898   0.620192   0.083341
        KNNWithMeans      1.002924   0.489803   0.078121
        SlopeOne          1.006664  19.091191   1.275676
        KNNBaseline       1.007437   0.890452   0.088495
        KNNBasic          1.016717   0.432159   0.072929
        NormalPredictor   1.253265   0.041646   0.078105
        CoClustering      1.828291   3.020921   0.052071
        :return: the algorithm with the lowest test_rmse is chosen.
        """
        benchmark = []
        # Iterate over all algorithms
        for algorithm in [
                SVD(),
                SVDpp(),
                SlopeOne(),
                NMF(),
                NormalPredictor(),
                KNNBaseline(),
                KNNBasic(),
                KNNWithMeans(),
                KNNWithZScore(),
                BaselineOnly(),
                CoClustering()
        ]:
            # Perform cross validation
            results = cross_validate(algorithm,
                                     self.data,
                                     measures=['RMSE'],
                                     cv=3,
                                     verbose=False)

            # Get results & append algorithm name
            tmp = pd.DataFrame.from_dict(results).mean(axis=0)
            tmp = tmp.append(
                pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                          index=['Algorithm']))
            benchmark.append(tmp)

        result = pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
            'test_rmse')
        print(result)

        return result
Example No. 25
    def calculateRMSE(self, method=9, similarityMeasure=1, isUserBased="Yes"):
        conn = sqlite3.connect(DATABASE_NAME)
        df = pd.read_sql_query(
            "SELECT userID, glassID, relativeRating FROM ratings", conn)

        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(
            df[['userID', 'glassID', 'relativeRating']], reader)

        trainset, testset = train_test_split(data, test_size=.20)

        isUserBased = True if (isUserBased == "Yes") else False
        if similarityMeasure == 1:
            similarityMeasure = "cosine"
        elif similarityMeasure == 2:
            similarityMeasure = "pearson"
        else:
            similarityMeasure = "pearson_baseline"

        sim_options = {'name': similarityMeasure, 'user_based': isUserBased}

        if method == 1:
            algo = SVD()
        elif method == 2:
            algo = SlopeOne()
        elif method == 3:
            algo = NMF()
        elif method == 4:
            algo = NormalPredictor()
        elif method == 5:
            algo = KNNBaseline(sim_options=sim_options)
        elif method == 6:
            algo = KNNBasic(sim_options=sim_options)
        elif method == 7:
            algo = KNNWithMeans(sim_options=sim_options)
        elif method == 8:
            algo = KNNWithZScore(sim_options=sim_options)
        elif method == 9:
            algo = BaselineOnly()
        else:
            algo = CoClustering()

        algo.fit(trainset)
        predictions = algo.test(testset)

        conn.close()

        #cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)
        return round(accuracy.rmse(predictions, verbose=False), 4)
Example No. 26
 def __init__(self, modelName, dataPath):
     self.modelDict = {
         "KNNBasic": KNNBasic(),
         "KNNWithMeans": KNNWithMeans(),
         "KNNWithZScore": KNNWithZScore(),
         "SVD": SVD(),
         "SVDpp": SVDpp(),
         "NMF": NMF(),
         "SlopeOne": SlopeOne(),
         "CoClustering": CoClustering()
     }
     self.trainset = None
     self.testset = None
     self.data = None
     self.model = self.modelDict[modelName]
     self.loadData(os.path.expanduser(dataPath))
def select_cf_model(algorithms=[SVD(), SVDpp(), SlopeOne(), NMF(),
                                NormalPredictor(), KNNBaseline(), KNNBasic(),
                                KNNWithMeans(), KNNWithZScore(), BaselineOnly(),
                                CoClustering()]):
    #=========================Create automated context to pick best CF model========================
    benchmark = []
    algos = []
    # Iterate over all algorithms
    for algorithm in algorithms:
        # Perform cross validation
        results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
        algos = algos +[algorithm]
        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
        benchmark.append(tmp)
        
    out = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
    return out, algos
Example No. 28
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_file_path",
                        default="data/train.csv",
                        help="training file path")
    parser.add_argument("--test_file_path",
                        default="data/test.csv",
                        help="testing file path")
    parser.add_argument("--approach",
                        default="SVD",
                        help="Baseline | SVD | SlopeOne | NMF | CoClustering")
    parser.add_argument("--output_ranking_file",
                        default="ranking",
                        help="output ranking for test")
    bsl_options = {'method': 'sgd', 'n_epochs': 20, 'reg_u': 100, 'reg_i': 50}
    options = {
        "Baseline": BaselineOnly(bsl_options, verbose=True),
        "SVD": SVD(verbose=True, n_factors=20, n_epochs=3),
        "SlopeOne": SlopeOne(),
        "NMF": NMF(),
        "CoClustering": CoClustering()
    }
    args = parser.parse_args()
    reader = Reader(line_format='user item rating timestamp', sep='\t')
    algo = options[args.approach]
    train_data = Dataset.load_from_file(args.train_file_path, reader=reader)
    test_data = Dataset.load_from_file(args.test_file_path, reader=reader)
    train_set = train_data.build_full_trainset()
    test_set = test_data.build_full_trainset().build_testset()
    print("training....")
    algo.fit(train_set)
    print("testing...")
    predictions = algo.test(test_set)
    accuracy.mae(predictions, verbose=True)
    accuracy.rmse(predictions, verbose=True)
    ### Extra Credit
    output_ranking(predictions,
                   args.output_ranking_file + "_" + args.approach + ".out")
    precisions, recalls = precision_recall_at_k(predictions,
                                                k=10,
                                                threshold=2.5)
    print("Precision:",
          sum(prec for prec in precisions.values()) / len(precisions))
    print("Recall:", sum(rec for rec in recalls.values()) / len(recalls))
    print("F-measure:", f_measure(precisions, recalls))
    print("conversion_rate:", get_conversion_rate(predictions, k=10))
    print("ndcg:", get_ndcg(predictions, k_highest_scores=10))
Example No. 29
def matrix_factorization_param(data_cv):
    # Iterate over all algorithms
    benchmark = []

    for algorithm in [
            SVD(),
            SVDpp(),
            NMF(),
            SlopeOne(),
            NormalPredictor(),
            CoClustering()
    ]:
        # Perform cross validation
        results = model_selection.cross_validate(algorithm,
                                                 data_cv,
                                                 measures=['RMSE', 'MAE'],
                                                 cv=5,
                                                 verbose=False)
        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                      index=['Algorithm']))
        benchmark.append(tmp)

    benchmark_df = pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
        'test_mae')
    #print(benchmark_df)

    # Parameter grid
    param_grid = {
        'n_factors': [100, 150, 200],
        'n_epochs': [20, 40],
        'lr_all': [0.001, 0.005, 0.008],
        'reg_all': [0.075, 0.1, 0.15]
    }
    algorithm_gs = model_selection.GridSearchCV(SVD,
                                                param_grid,
                                                measures=['rmse'],
                                                cv=5,
                                                n_jobs=-1)
    algorithm_gs.fit(data_cv)

    # best parameters for a model with the lowest rmse
    best_algo = algorithm_gs.best_estimator['rmse']
    return best_algo
Example No. 30
    def checkBestAlgorithm(self):
        self.df = pd.read_csv(csv_name)
        reader = Reader(rating_scale=(1, 10))
        data = Dataset.load_from_df(self.df[['user_id', 'item_id', 'rating']],
                                    reader)
        benchmark = []
        rmseTuple = []
        # Iterate over all algorithms.
        for algorithm in [
                SVD(),
                SVDpp(),
                SlopeOne(),
                NormalPredictor(),
                KNNBaseline(),
                KNNBasic(),
                KNNWithMeans(),
                KNNWithZScore(),
                BaselineOnly(),
                CoClustering()
        ]:
            # Perform cross validation.
            results = cross_validate(algorithm,
                                     data,
                                     measures=['RMSE'],
                                     cv=3,
                                     verbose=False)

            # Store the result and append the algorithm name.
            tmp = pd.DataFrame.from_dict(results).mean(axis=0)
            rmseTuple.append((algorithm, tmp['test_rmse']))
            tmp = tmp.append(
                pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                          index=['Algorithm']))
            benchmark.append(tmp)
        print(
            pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
                'test_rmse'))
        print("\n")
        rmseTuple.sort(key=lambda x: x[1])

        print("Best algorithm : ")
        print(type(rmseTuple[0][0]).__name__)
        return rmseTuple[0]