import numpy as np
from surprise import CoClustering


def cluster(data, kwargs):
    # Read the hyperparameters from the kwargs dict
    cluster_u = kwargs.get('user_cluster')
    cluster_i = kwargs.get('item_cluster')
    n_epochs = kwargs.get('maxiter')

    # Set up the algorithm
    algo = CoClustering(n_cltr_u=cluster_u[0], n_cltr_i=cluster_i[0],
                        n_epochs=n_epochs, random_state=kwargs['random_seed'])
    
    # Train the algorithm on the data, and predict ratings for the testset
    algo.fit(data)
    
    # Predict the full 10000-user x 1000-item rating matrix;
    # Surprise raw ids are strings, 1-indexed in this dataset
    prediction = np.zeros([10000, 1000])
    for row in range(10000):
        for col in range(1000):
            prediction[row, col] = algo.predict(str(row + 1), str(col + 1)).est
            
    return prediction
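
A minimal usage sketch for cluster() above; the trainset construction and the kwargs values are illustrative assumptions, not part of the original snippet:

# Hypothetical usage of cluster(); all values below are assumed.
from surprise import Dataset, Reader
import pandas as pd

ratings = pd.DataFrame({'user': ['1', '2', '3'],
                        'item': ['1', '2', '3'],
                        'rating': [4, 5, 3]})
reader = Reader(rating_scale=(1, 5))
trainset = Dataset.load_from_df(ratings, reader).build_full_trainset()

kwargs = {'user_cluster': [1], 'item_cluster': [1],
          'maxiter': 10, 'random_seed': 42}
prediction = cluster(trainset, kwargs)  # fills the full 10000x1000 grid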
Example #2
def computeCoClustering(data, test_np):
    """Compute the co-clustering method and return the predictions on the test set.

    The method uses the following parameters:
        - Number of user clusters: 2
        - Number of item clusters: 19
        - Number of epochs: 30

    data : data frame representing the train set
    test_np : data frame on which the predictions will be returned

    return : test_np with a column of predictions named 'cocluster_rating'"""
    
    trainset, test = dataTrainSurprise(data, test_np)
    
    cocltr_algo = CoClustering(n_cltr_u=2, n_cltr_i=19, n_epochs = 30).fit(trainset)
    
    # Prediction is a namedtuple; .est is the estimated rating (index 3)
    test['cocluster_rating'] = test[['user_id', 'movie_id']] \
        .apply(lambda row: cocltr_algo.predict(row['user_id'], row['movie_id']).est, axis=1)
    
    return test
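
dataTrainSurprise() is not shown in this example; a minimal sketch of what such a helper might do, assuming it wraps the train data frame in a Surprise full trainset and passes the test frame through unchanged:

from surprise import Dataset, Reader

def dataTrainSurprise(data, test_np):
    # Hypothetical helper: build a Surprise trainset from the train
    # data frame; the (1, 5) rating scale is an assumption.
    reader = Reader(rating_scale=(1, 5))
    trainset = Dataset.load_from_df(
        data[['user_id', 'movie_id', 'rating']], reader
    ).build_full_trainset()
    return trainset, test_np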
Example #3

def co_clustering(train, test, ids, Xtest, Xids):
    """
    Co-clustering algorithm: users and items are assigned to clusters and co-clusters.
    Arguments: train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for the testset, to be used for final blending
               Xids, predicted ratings for the unknown ratings, to be used for final blending
    """
    print('Co-clustering')
    algo = CoClustering(n_cltr_u=1, n_cltr_i=1, n_epochs=50, random_state=15)

    # Train the algorithm on the training set
    algo.fit(train)

    # Predict on the trainset and compute RMSE
    predictions = algo.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    # Predict on the testset and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    preds_test = np.array([pred.est for pred in predictions])

    # Predict the unknown ratings
    preds_ids = []
    for i in range(len(ids[0])):
        pred = algo.predict(str(ids[0][i]), str(ids[1][i]))
        preds_ids.append(pred.est)

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
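
A short driver sketch for co_clustering() under assumed inputs: data is a loaded Surprise Dataset, and ids pairs up the unknown user and item raw ids; both are illustrative, not from the original:

# Hypothetical usage: accumulate blending features from the model.
from surprise.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.2)  # 'data': assumed Dataset
ids = (['1', '2', '3'], ['10', '20', '30'])          # assumed (users, items)
Xtest, Xids = [], []
rmse, Xtest, Xids, preds_test, preds_ids = co_clustering(
    train, test, ids, Xtest, Xids)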
Example #4
# Use the CoClustering algorithm.
algo = CoClustering()

# 5-fold cross-validation is disabled; fit on the full trainset instead.
#cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
algo.fit(trainset)
'''
predictions = algo.test(testset)

print(predictions)
'''
'''
uid = str(0)  # raw user id (as in the ratings file). They are **strings**!
iid = str(35546)  # raw item id (as in the ratings file). They are **strings**!

# get a prediction for specific users and items.
pred = algo.predict(uid, iid, verbose=True)
print(pred[3])
'''
testset = open("../test1.csv", "r")
result = open("./result/result_CoClustering.txt", "w")
full_result = open("./full_result/result_CoClustering.txt", "w")

for line in testset:
    temp = line.strip().split(",")
    pred = algo.predict(temp[0], temp[1], verbose=True)
    score = round(pred.est)  # .est is the estimated rating
    result.write(str(score) + "\n")
    full_result.write(str(pred.est) + "\n")

testset.close()
result.close()
full_result.close()
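
For reference, predict() returns a Prediction namedtuple, which is why pred[3] and pred.est refer to the same value:

from surprise.prediction_algorithms.predictions import Prediction

# Prediction fields, in order: uid, iid, r_ui, est, details
p = Prediction(uid='1', iid='42', r_ui=None, est=3.5,
               details={'was_impossible': False})
assert p[3] == p.est  # index 3 is the rating estimate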
Example #5
# Keep only the duplicated rating rows
duplicates = ratings.duplicated()
duplicates = duplicates[duplicates]

# Reorder the columns to (user, item, rating)
columnsTitles = ["user_id", "book_id", "rating"]
ratings = ratings.reindex(columns=columnsTitles)


reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'book_id', 'rating']], reader)

algo = CoClustering()
start = time.time()
cross_validate(algo, data, measures=['RMSE'], cv=10, verbose=True)
cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))

# Note: after cross_validate, algo keeps the fit from the last CV fold;
# fit on the full trainset first if a definitive model is wanted.
pred = algo.predict(7563, 1, r_ui=4, verbose=True)

from collections import defaultdict

def get_top_k(predictions, k):
    '''Return a top_k dict where keys are user ids and values are lists of
    tuples [(item id, rating estimation) ...].

    Takes in a list of predictions as returned by the test method.
    '''

    # First map the predictions to each user.
    top_k = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_k[uid].append((iid, est))

    # Then sort each user's items by estimated rating and keep the k best.
    for uid, user_ratings in top_k.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_k[uid] = user_ratings[:k]

    return top_k
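
A brief usage sketch for get_top_k(), assuming the data and algo objects from above; build_anti_testset() enumerates the (user, item) pairs missing from the trainset:

# Hypothetical usage: top-5 book recommendations per user.
trainset = data.build_full_trainset()
algo.fit(trainset)
predictions = algo.test(trainset.build_anti_testset())
top_5 = get_top_k(predictions, k=5)
print(top_5[7563])  # user id reused from the example above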
Example #6
import os
import numpy as np
import pandas as pd
from IPython.display import display
from surprise import (Reader, Dataset, NormalPredictor, BaselineOnly,
                      KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline,
                      SVD, SVDpp, NMF, SlopeOne, CoClustering, accuracy)
from surprise.model_selection import (cross_validate, GridSearchCV,
                                      train_test_split)


class Surprise():
    def train(self, algo='SVD', like=True, test='cv', local=False):

        if local:
            csv_path = os.path.join(os.path.dirname(__file__),
                                    "data/preprocessed")
            self.recipes = pd.read_csv(f"{csv_path}/recipe_pp.csv")
            self.reviews = pd.read_csv(f"{csv_path}/review_pp.csv")
        else:
            # 'storage' is a project-specific helper module (import not shown)
            self.recipes = storage.import_file('data/preprocessed',
                                               'recipe_pp.csv')
            self.reviews = storage.import_file('data/preprocessed',
                                               'review_pp.csv')

        if like:
            self.target = 'liked'
            self.s_min = 0
            self.s_max = 1
        else:
            self.target = 'rating'
            self.s_min = 1
            self.s_max = 5

        reader = Reader(rating_scale=(self.s_min, self.s_max))

        self.relevant_data = self.reviews[[
            'user_id', 'recipe_id', self.target
        ]]
        model_data = Dataset.load_from_df(self.relevant_data, reader)

        # Algos

        if algo == 'NormalPredictor':
            self.algorithm = NormalPredictor()

        elif algo == 'BaselineOnly':
            self.algorithm = BaselineOnly()

        elif algo == 'KNNBasic':
            self.algorithm = KNNBasic()

        elif algo == 'KNNWithMeans':
            self.algorithm = KNNWithMeans()

        elif algo == 'KNNWithZScore':
            self.algorithm = KNNWithZScore()

        elif algo == 'KNNBaseline':
            self.algorithm = KNNBaseline()

        elif algo == 'SVD':
            params = {
                'n_epochs': 20,
                'n_factors': 100,
                'lr_all': 0.002,
                'reg_all': 0.02
            }
            self.algorithm = SVD(**params)  # Tuned with svd_grid

        elif algo == 'SVDpp':
            self.algorithm = SVDpp()

        elif algo == 'NMF':
            self.algorithm = NMF()

        elif algo == 'SlopeOne':
            self.algorithm = SlopeOne()

        elif algo == 'CoClustering':
            self.algorithm = CoClustering()

        if test == 'cv':
            cv_results = cross_validate(self.algorithm,
                                        model_data,
                                        measures=['RMSE', 'MAE'],
                                        cv=5,
                                        verbose=True)
            rmse = np.round(cv_results['test_rmse'].mean(), 3)
            mae = np.round(cv_results['test_mae'].mean(), 3)
            train_data = model_data.build_full_trainset()
            self.algorithm.fit(train_data)

        elif test == 'svd_grid':
            param_grid = {
                'n_epochs': [10, 20],
                'n_factors': [100, 200],
                'lr_all': [0.001, 0.002],
                'reg_all': [0.01, 0.02]
            }
            gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
            gs.fit(model_data)
            rmse = gs.best_score['rmse']
            mae = gs.best_score['mae']
            print(gs.best_params['rmse'], gs.best_params['mae'])
            self.algorithm = gs.best_estimator['rmse']
            train_data = model_data.build_full_trainset()
            self.algorithm.fit(train_data)

        else:
            train, test = train_test_split(model_data,
                                           test_size=0.3,
                                           random_state=42)
            self.algorithm.fit(train)
            predictions = self.algorithm.test(test)
            rmse = np.round(accuracy.rmse(predictions), 3)
            mae = np.round(accuracy.mae(predictions), 3)

        return rmse, mae

    def predict(self, user_id):

        inputs = self.relevant_data[self.relevant_data['user_id'] == user_id] \
                 .merge(self.recipes, on="recipe_id", how="left")[['recipe_id', 'name', self.target]]

        display(inputs)

        user_recipes = self.relevant_data[self.relevant_data['user_id'] ==
                                          user_id].recipe_id.unique()
        recipe_list = self.relevant_data[
            self.relevant_data['user_id'] != user_id].recipe_id.unique()
        predictions = [
            self.algorithm.predict(user_id, rec) for rec in recipe_list
            if rec not in list(user_recipes)
        ]

        pdf = pd.DataFrame(predictions,
                           columns=[
                               'user_id', 'recipe_id', self.target,
                               f'rec_{self.target}', 'details'
                           ])
        pdf = pdf.drop(columns=[self.target, 'details'])
        pdf = pdf.sort_values(f'rec_{self.target}', ascending=False)

        rec_target = pdf[f'rec_{self.target}']
        pdf['rec_score'] = (rec_target - self.s_min) / (self.s_max -
                                                        self.s_min)

        outputs = pdf.merge(self.recipes, on="recipe_id", how="left")[[
            'recipe_id', 'name', f'rec_{self.target}', 'rec_score'
        ]]

        display(outputs.head(10))

        return outputs
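
A hedged usage sketch for the class above; the algorithm name, flags, and user id are illustrative values only:

# Hypothetical usage: cross-validate CoClustering, then rank recipes.
model = Surprise()
rmse, mae = model.train(algo='CoClustering', like=True, test='cv', local=True)
recommendations = model.predict(user_id=5060)  # assumed user id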
Example #7
# Leftover grid-search experiment using the old Surprise GridSearch API:
#              'n_epochs': [10,20,30,40,50,60,70,80,90,100]}

# Evaluate the model with 5-fold cross validation
#data.split(5)

#grid_search = GridSearch(CoClustering, param_grid, measures=['RMSE'])
#grid_search.evaluate(data)
#print("after grid_search.evaluate(data)")
#print_perf(perf)

#results_df = pd.DataFrame.from_dict(grid_search.cv_results)
#print(results_df)

# create a co-clustering algorithm
algo = CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=100)
algo.fit(trainset)  # fit() replaces the deprecated train() method

# use the trained algorithm to predict a rating for every row in the test set
with open('testOutput.csv', 'w') as f:
    f.write("test_id,rating\n")
    for i in range(len(dftest)):
        prediction = algo.predict(dftest.at[i, 'user_id'],
                                  dftest.at[i, 'business_id'],
                                  r_ui=4,
                                  verbose=True)
        predRating = prediction.est
        f.write(str(i) + "," + str(predRating) + '\n')