Example #1
# assumes the pre-1.1 Surprise API (GridSearch, evaluate, train)
from surprise import KNNWithMeans, GridSearch, accuracy


def knn_m(data, training, testing):
    '''
        Tunes KNNWithMeans hyperparameters with a grid search, then calculates
        the RMSE and top-n predictions of the tuned model

        Args:
            data(Dataset): the whole dataset, divided into 5 folds
            training(Trainset): training dataset
            testing(list): test dataset as (user, item, rating) tuples

        Returns:
            rmse: RMSE of KNNWithMeans with the optimized parameters
            top_n: top-n predictions per user, as returned by get_top_n
    '''

    # candidate parameters
    knn_param_grid = {'k': [5, 10, 20], 'sim_options': {'name': ['msd', 'cosine', 'pearson'],
                                                        'min_support': [1, 5], 'user_based': [False]}}

    # optimize parameters
    knnm_grid_search = GridSearch(KNNWithMeans, knn_param_grid, measures=['RMSE'], verbose=False)
    knnm_grid_search.evaluate(data)
    param = knnm_grid_search.best_params['RMSE']
    print('KNNWithMeans:', param)

    # fit model using the optimized parameters
    knnm = KNNWithMeans(k=param['k'], sim_options=param['sim_options'])
    knnm.train(training)

    # evaluate the model using test data
    predictions = knnm.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)

    return rmse, top_n
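
# A minimal usage sketch (assumptions, not part of the original snippet: the
# built-in ml-100k dataset, and a get_top_n helper as in the Surprise FAQ;
# the test set here is the in-sample testset, for brevity):
if __name__ == '__main__':
    from surprise import Dataset

    data = Dataset.load_builtin('ml-100k')
    data.split(n_folds=5)  # pre-1.1 API: attach folds for GridSearch.evaluate
    training = data.build_full_trainset()
    testing = training.build_testset()
    rmse, top_n = knn_m(data, training, testing)
    print('RMSE: {}'.format(rmse))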
Example #2
# assumes the pre-1.1 Surprise API (GridSearch, evaluate, train)
import time

from surprise import KNNWithMeans, GridSearch


def knnm_running_time(data):
    '''
        Calculates the running times of training and prediction for KNNWithMeans

        Args:
            data(list): a list of Datasets with different numbers of users

        Returns:
            elapsedtime_KnnMeanstrain: running times for training
            elapsedtime_KnnMeanstest: running times for predictions on the test set
    '''
    elapsedtime_KnnMeanstrain = []
    elapsedtime_KnnMeanstest = []

    # tune the parameters once, on the fourth dataset
    param_grid = {
        'k': [5, 10, 20],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson'],
            'min_support': [1, 5],
            'user_based': [False]
        }
    }
    grid_search = GridSearch(KNNWithMeans,
                             param_grid,
                             measures=['RMSE'],
                             verbose=False)
    grid_search.evaluate(data[3])
    param = grid_search.best_params['RMSE']
    k = param['k']
    sim = param['sim_options']['name']
    min_support = param['sim_options']['min_support']
    user_based = param['sim_options']['user_based']

    # using the tuned parameters, calculate the running times
    for i in range(len(data)):
        training = data[i].build_full_trainset()
        testing = training.build_anti_testset()

        # training running time (model fitting only, excluding dataset construction)
        training_start = time.time()
        knnm = KNNWithMeans(k=k,
                            sim_options={'name': sim,
                                         'min_support': min_support,
                                         'user_based': user_based})
        knnm.train(training)
        elapsedtime_KnnMeanstrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        knnm.test(testing)
        elapsedtime_KnnMeanstest.append(time.time() - test_start)
    return elapsedtime_KnnMeanstrain, elapsedtime_KnnMeanstest
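
# A minimal usage sketch (hypothetical: `datasets` must be a list of Surprise
# Dataset objects with increasing numbers of users, built by the caller):
def report_running_times(datasets):
    train_times, test_times = knnm_running_time(datasets)
    for n, (tr, te) in enumerate(zip(train_times, test_times)):
        print('dataset {}: train {:.2f}s, predict {:.2f}s'.format(n, tr, te))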
Example #3
# assumes the pre-1.1 Surprise API (train, to_inner_iid, get_neighbors);
# read_item_names is an id<->name helper, sketched below
import os

from surprise import Dataset, Reader, KNNWithMeans


def model_training_and_evalution():
    print("Welcome to the training stage")
    file_path = os.path.expanduser(r'E:\JiangIntellijWorkingSpace\tools\music_recommendation\transform_playlist_song_rating.txt')
    reader = Reader(line_format='user item rating', sep='\t')
    music_data = Dataset.load_from_file(file_path, reader=reader)
    print("Building the dataset")
    trainset = music_data.build_full_trainset()
    print("Starting model training.....")
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    algo = KNNWithMeans(sim_options=sim_options)
    algo.train(trainset)
    rid_to_name, name_to_rid = read_item_names()
    # print(name_to_rid)
    toy_story_raw_id = name_to_rid[u'Over The Horizon-SAMSUNG GALAXY THEME']
    # toy_story_raw_id = 423245641
    print(toy_story_raw_id)
    toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)
    toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)
    toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id) for inner_id in toy_story_neighbors)
    toy_story_neighbors = (rid_to_name[rid] for rid in toy_story_neighbors)
    print('The 10 nearest neighbors of it are (the 10 most similar playlists recommended for you):')
    for music in toy_story_neighbors:
        print(music)
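
# A minimal sketch of the read_item_names helper assumed above (hypothetical:
# it presumes a tab-separated file whose first two fields are the raw song id
# and the song name; adjust the path and field layout to the actual data):
import io


def read_item_names():
    item_file = os.path.expanduser(r'E:\JiangIntellijWorkingSpace\tools\music_recommendation\song_id_name.txt')
    rid_to_name = {}
    name_to_rid = {}
    with io.open(item_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.rstrip('\n').split('\t')
            rid_to_name[fields[0]] = fields[1]
            name_to_rid[fields[1]] = fields[0]
    return rid_to_name, name_to_rid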
Example #4
# assumes the pre-1.1 Surprise API (train, to_inner_uid, get_neighbors);
# name_id_dic (playlist name -> raw playlist id) is built earlier in the
# original script
import os

from surprise import Dataset, Reader, KNNWithMeans

file_path = os.path.expanduser('./data/163_music_suprise_format.txt')
# specify the file format
reader = Reader(line_format='user item rating timestamp', sep=',')
# read the data from the file
music_data = Dataset.load_from_file(file_path, reader=reader)
# compute the similarities between songs
print("Building the dataset...")
trainset = music_data.build_full_trainset()
#sim_options = {'name': 'pearson_baseline', 'user_based': False}

# look up the nearest users
print("Starting model training...")
#sim_options = {'user_based': False}
#algo = KNNBaseline(sim_options=sim_options)
algo = KNNWithMeans()
algo.train(trainset)

current_playlist = list(name_id_dic.keys())[39]
print("Playlist name:", current_playlist)

# fetch the nearest neighbors
# map the name to an id
playlist_id = name_id_dic[current_playlist]
print("Playlist id:", playlist_id)
# fetch the corresponding internal user id => to_inner_uid
playlist_inner_id = algo.trainset.to_inner_uid(playlist_id)
print("Internal id:", playlist_inner_id)

playlist_neighbors = algo.get_neighbors(playlist_inner_id, k=10)

# convert the song ids back to song names
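
# A minimal sketch of the missing mapping step (hypothetical: assumes an
# id_name_dic reverse dictionary, raw playlist id -> playlist name, built
# alongside name_id_dic; the neighbors are user-side ids, hence to_raw_uid):
playlist_neighbors = (algo.trainset.to_raw_uid(inner_id) for inner_id in playlist_neighbors)
playlist_neighbors = (id_name_dic[pid] for pid in playlist_neighbors)
print("The 10 playlists most similar to <", current_playlist, "> are:")
for playlist in playlist_neighbors:
    print(playlist)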
Example #5
# assumes numpy and pandas at module level; preprocess, parse, train and
# handler come from the accompanying sequence-based recommendation code
# (Devooght & Bersini)
import os

import numpy as np
import pandas as pd


class recommender:
    def __init__(self, algorithm):

        self.name = algorithm.lower()  # e.g. 'svd', 'nmf', 'knnbasic', 'knnmeans', 'fism'
        self.surprise_algorithms = ['svd', 'nmf', 'knnbasic', 'knnmeans']
        self.devooght_algorithms = ['fism']
        '''
         To implement with surprise:
             - Matrix-Factorization Based:
                 SVDpp: The SVD++ algorithm, an extension of SVD taking into account implicit ratings.
             - Neighbourhood-based:
                 Coclustering
                 KNNWithZScore: A basic collaborative filtering algorithm, taking into account the z-score normalization of each user.
                 KNNBaseline: A basic collaborative filtering algorithm taking into account a baseline rating.
             - Random Predictor    
                 NormalPredictor: Algorithm predicting a random rating based on the distribution of the training set, which is assumed to be normal.
             - Baseline    
                 BaselineOnly: Algorithm predicting the baseline estimate for given user and item.
             - Slope One
                 SlopeOne: A simple yet accurate collaborative filtering algorithm.

        To implement using RNN:
            - LSTM 
            - GRU (Devooght, Bersini)
            - GRU with clustering (Devooght, Bersini)
            
        To extract latent factors:
            - Stacked Autoencoders
            - CNN
            - CNN with Stacked Autoencoders
        '''

        self.df_known_predictions = None
        self.df_unknown_predictions = None
        self.known_sequence_dict = None
        self.unknown_sequence_dict = None
        self.k = None
        self.k_min = None
        self.metrics = None

    def get_name(self, verbose=False):
        return self.name

    def fit(self,
            df_ratings=None,
            columns=['userId', 'itemId', 'rating'],
            verbose=False,
            **kwargs):

        self.columns = np.array(columns)
        # If Surprise lib is the base package to fit, then df_ratings must be used.
        # Algorithms that use Surprise Lib: NMF, SVD, KNN, SVDpp

        if (df_ratings is not None):
            self.df_ratings = df_ratings.copy()

        ###########################################
        # Convert Utility Matrix to df_ratings if utility matrix is passed
        #
        #
        ###########################################

        if self.name in self.surprise_algorithms:  # Surprise-based recommenders
            from surprise import Dataset
            from surprise import Reader

            # A reader is still needed but only the rating_scale param is required.
            # The Reader class is used to parse a file containing ratings.
            reader = Reader(rating_scale=(0.5, 5.0))

            # Separating timestamp column
            if ('timestamp' in columns):
                self.df_timestamp = self.df_ratings['timestamp'].copy()
                self.df_ratings.drop(labels='timestamp', inplace=True, axis=1)

            # The columns must correspond to user id, item id and ratings (in that order).
            data = Dataset.load_from_df(
                self.df_ratings[self.columns[np.where(
                    self.columns != 'timestamp')]], reader)

            # Creating trainset variable to be used in prediction functions of Surprise
            self.trainset = data.build_full_trainset()

            # Creating Model
            if self.name == 'svd':
                from surprise import SVD

                # Setting Number of Factors in Matrix Factorization
                if ('n_factors' in kwargs):
                    self.n_factors = kwargs['n_factors']
                else:
                    self.n_factors = 100
                    if (verbose):
                        print("Using default number of factors: {}".format(
                            self.n_factors))

                # Setting number of epochs in stochastic gradient descent
                if ('n_epochs' in kwargs):
                    self.n_epochs = kwargs['n_epochs']
                else:
                    self.n_epochs = 20
                    if (verbose):
                        print("Using default number of epochs: {}".format(
                            self.n_epochs))

                self.model = SVD(n_factors=self.n_factors,
                                 n_epochs=self.n_epochs,
                                 verbose=verbose)

            elif self.name == 'nmf':
                from surprise import NMF

                # Setting Number of Factors in Matrix Factorization
                if ('n_factors' in kwargs):
                    self.n_factors = kwargs['n_factors']
                else:
                    self.n_factors = 15
                    if (verbose):
                        print("Using default number of factors: {}".format(
                            self.n_factors))

                # Setting number of epochs in stochastic gradient descent
                if ('n_epochs' in kwargs):
                    self.n_epochs = kwargs['n_epochs']
                else:
                    self.n_epochs = 50
                    if (verbose):
                        print("Using default number of epochs: {}".format(
                            self.n_epochs))

                self.model = NMF(n_factors=self.n_factors,
                                 n_epochs=self.n_epochs,
                                 verbose=verbose)

            elif self.name == 'knnbasic':
                from surprise import KNNBasic

                # Setting number of neighbours
                if ('k' in kwargs):
                    self.k = kwargs['k']
                else:
                    self.k = 40
                    if (verbose):
                        print("Using default k: {}".format(self.k))

                # Setting minimum number of neighbours
                if ('k_min' in kwargs):
                    self.k_min = kwargs['k_min']
                else:
                    self.k_min = 1
                    if (verbose):
                        print("Using default k_min: {}".format(1))

                self.model = KNNBasic(k=self.k,
                                      min_k=self.k_min,
                                      verbose=verbose)

            elif self.name == 'knnmeans':
                from surprise import KNNWithMeans

                # Setting number of neighbours
                if ('k' in kwargs):
                    self.k = kwargs['k']
                else:
                    self.k = 40
                    if (verbose):
                        print("Using default k: {}".format(40))

                # Setting minimum number of neighbours
                if ('k_min' in kwargs):
                    self.k_min = kwargs['k_min']
                else:
                    self.k_min = 1
                    if (verbose):
                        print("Using default k_min: {}".format(1))

                self.model = KNNWithMeans(k=self.k,
                                          min_k=self.k_min,
                                          verbose=verbose)

            else:
                if (verbose):
                    print("Algorithm not configured: {}".format(self.name))
                return -1

            # Train the algorithm on the trainset, and predict ratings for the testset
            self.model.train(self.trainset)

            return 0

        elif (self.name in self.devooght_algorithms):

            # Arguments
            directory_path = os.path.join(
                '.', 'Sequence_based_recommendation_files', self.name)
            preprocess.create_dirs(dirname=directory_path, verbose=verbose)

            data = preprocess.remove_rare_elements(data=df_ratings,
                                                   min_user_activity=1,
                                                   min_item_popularity=1,
                                                   verbose=verbose)

            data = preprocess.save_index_mapping(data=data,
                                                 dirname=directory_path,
                                                 separator=',')

            train_set, val_set, test_set = preprocess.split_data(
                data=data,
                nb_val_users=0.1,  # val_size
                nb_test_users=0.1,  # test_size
                dirname=directory_path,
                verbose=verbose)

            preprocess.make_sequence_format(train_set=train_set,
                                            val_set=val_set,
                                            test_set=test_set,
                                            dirname=directory_path,
                                            verbose=verbose)

            preprocess.save_data_stats(data=data,
                                       train_set=train_set,
                                       val_set=val_set,
                                       test_set=test_set,
                                       dirname=directory_path,
                                       verbose=verbose)

            # Training Algorithm
            parser = parse.command_parser(parse.predictor_command_parser,
                                          train.training_command_parser,
                                          parse.early_stopping_command_parser)

            if self.name == 'fism':
                args = parser.parse_args([
                    '--dir',
                    os.path.join(directory_path, 'models'),
                    '-d',
                    directory_path,  #directory_path + '/', 
                    '-b',
                    '20',  # Batch size: the number of training examples present in a single batch
                    '--max_iter',
                    '50',  # Maximum number of iterations: the number of batches needed to complete one epoch
                    '--progress',
                    '10',  # when progress information should be printed during training
                    '-m',
                    self.name.upper(),  # Method
                    #'-i', '-1', # Number of batches - only on test parser
                    '--loss',
                    'RMSE',
                    '--save',
                    'Best'
                ])

                self.model = parse.get_predictor(args)

                dataset = handler.DataHandler(
                    dirname=args.dataset,
                    extended_training_set=args.extended_set,
                    shuffle_training=args.tshuffle)

                self.model.prepare_model(dataset)
                self.metrics = self.model.train(
                    dataset,
                    save_dir=args.dir,
                    time_based_progress=args.time_based_progress,
                    progress=float(args.progress),
                    autosave=args.save,
                    max_progress_interval=args.mpi,
                    max_iter=args.max_iter,
                    min_iterations=args.min_iter,
                    max_time=args.max_time,
                    early_stopping=parse.get_early_stopper(args),
                    load_last_model=args.load_last_model,
                    validation_metrics=args.metrics.split(','))

            else:
                if (verbose):
                    print("Algorithm not configured: {}".format(self.name))
                return -1

            return 0

        else:  # if self.name not in self.surprise_algorithms
            if (verbose):
                print("Invalid algorithm: {}".format(self.name))
            return -1

    def get_model(self):
        return self.model

    def get_metrics(self):
        return self.metrics

    def calculate_known_predictions(self):
        # Calculating all predictions for known items

        if self.name in self.surprise_algorithms:
            # Calculating predictions dataframe as userId, itemId, rating, prediction
            # predictions return raw uid and iid

            # Bring back predictions for all existing ratings
            known_predictions = self.model.test(self.trainset.build_testset())

            predictions = np.array(
                [[int(p.uid), int(p.iid), p.r_ui, p.est]
                 for p in known_predictions])

            self.df_known_predictions = pd.DataFrame({
                'userId': predictions[:, 0],
                'itemId': predictions[:, 1],
                'rating': predictions[:, 2],
                'prediction': predictions[:, 3]
            })

            if ('timestamp' in self.columns):
                self.df_known_predictions = self.df_known_predictions.set_index(
                    keys=['userId', 'itemId']).join(
                        self.df_ratings.drop('rating', axis=1).set_index(
                            keys=['userId', 'itemId'])).reset_index()

            self.df_known_predictions['userId'] = self.df_known_predictions[
                'userId'].astype(int)
            self.df_known_predictions['itemId'] = self.df_known_predictions[
                'itemId'].astype(int)

    def get_known_predictions(self, calculate_predictions=False):
        if self.df_known_predictions is None or calculate_predictions == True:
            self.calculate_known_predictions()

        return self.df_known_predictions

    def calculate_unknown_predictions(self):
        # Calculating all predictions for unknown (unrated) items
        # predictions return raw uid and iid

        if self.name in self.surprise_algorithms:
            # Bring back predictions for all non-existing ratings
            unknown_predictions = self.model.test(
                self.trainset.build_anti_testset())

            predictions = np.array(
                [[int(p.uid), int(p.iid), 0, p.est]
                 for p in unknown_predictions])

            self.df_unknown_predictions = pd.DataFrame({
                'userId': predictions[:, 0],
                'itemId': predictions[:, 1],
                'rating': predictions[:, 2],
                'prediction': predictions[:, 3]
            })

    def get_unknown_predictions(self, calculate_predictions=False):
        if self.df_unknown_predictions is None or calculate_predictions == True:
            self.calculate_unknown_predictions()

        return self.df_unknown_predictions

    def predict(self, userId, itemId, verbose=False):

        if self.name in self.surprise_algorithms:
            prediction = self.model.predict(
                uid=int(userId),
                iid=int(itemId))  # Take as input the raw user id and item id
            #ref: http://surprise.readthedocs.io/en/stable/algobase.html#surprise.prediction_algorithms.algo_base.AlgoBase.predict

            if prediction.details['was_impossible'] == True:
                if (verbose):
                    print(
                        "Impossible to predict item {} rating for user {} (one of them may not have been in training step)"
                        .format(itemId, userId))
                return 0
            else:
                return prediction.est

    def get_top_n(self, n=10, source='unknown', calculate_sequence=False):
        '''Return the top-N recommendations for each user from a set of predictions.

        Args:
            n(int): The number of recommendations to output for each user.
                Default is 10.
            source(str): whether to rank 'known' or 'unknown' predictions.
                Default is 'unknown'.
            calculate_sequence(bool): force recomputation of the cached result.

        Returns:
            A dict where keys are user (raw) ids and values are arrays of the
            top-n item ids, sorted by estimated rating.'''

        if (source.lower() == 'known'):

            # Checking if known predictions are calculated
            if (self.df_known_predictions is None):
                self.get_known_predictions(calculate_predictions=True)

            if (calculate_sequence == True
                    or self.known_sequence_dict is None):
                self.known_sequence_dict = dict()

                for userId in self.df_known_predictions['userId'].unique():
                    # Selecting single user
                    df_user = self.df_known_predictions[
                        self.df_known_predictions['userId'] == userId].copy()

                    # Sorting values by prediction
                    df_user.sort_values(by=['prediction'],
                                        ascending=False,
                                        inplace=True)

                    # Saving the first K in sequence dict
                    self.known_sequence_dict[userId] = np.array(
                        df_user['itemId'].head(n))

            return self.known_sequence_dict
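
# A minimal usage sketch (hypothetical: assumes a MovieLens-style ratings.csv
# with userId, itemId and rating columns):
if __name__ == '__main__':
    df = pd.read_csv('ratings.csv', usecols=['userId', 'itemId', 'rating'])
    rec = recommender('knnmeans')
    rec.fit(df_ratings=df, columns=['userId', 'itemId', 'rating'],
            verbose=True, k=20)
    print(rec.predict(userId=1, itemId=31, verbose=True))
    top10 = rec.get_top_n(n=10, source='known')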
Example #6
# assumes the pre-1.1 Surprise API (GridSearch, evaluate, train);
# p is assumed to be pickle (results are dumped with p.dump)
import pickle as p

import numpy as np
from surprise import Dataset, SVD, NMF, KNNWithMeans, GridSearch, accuracy


class Surprise_recommender:
    def __init__(self, reader):
        '''
        Constructor

        ------
        Args:
        reader: A Reader object for the Surprise Dataset
        '''
        self.reader = reader
        return

    def create_test_set(self, test_data):
        '''
        Function to create test_set
        This function drops timestamp from the data

        ------
        Args:
        test_data: input test data
        
        ------
        Returns:
        ts: test data after removing time stamp feature
        Basically a list with the following format: user, item, rating
        '''
        ts = [[td[0], td[1], td[2]] for td in test_data]
        return ts

    def create_train_set(self, train_data):
        '''
        Function to create the training set

        ------
        Args:
        train_data: Training data as a list of (user, item, rating, timestamp)
        tuples

        ------
        Returns:
        Trainset object from surprise
        '''
        ds = Dataset(self.reader)
        return ds.construct_trainset(train_data)

    def train_test_model(self, validation_set, train_set, test_set, algorithm,
                         task):
        '''
        Function to train models using different algorithms. Dumps GridSearch results
        for further analysis.

        ------
        Args:
        validation_set: Dataset for hyperparameter optimization
        train_set: The training data formatted according to the needs of surprise
        test_set: Testing data to check RMSE and MAE after GridSearch
        algorithm: The algorithm for training the model
        task: Make predictions for rating, sentiment scores or for combined rating

        ------
        Returns:None
        '''

        if algorithm == 'SVD':

            param_grid = {
                'n_epochs': np.arange(1, 101, 10).tolist(),
                'n_factors': [10, 50, 100]
            }
            grid_search = GridSearch(SVD, param_grid, measures=['RMSE', 'MAE'])

            grid_search.evaluate(validation_set)

            p.dump(grid_search.cv_results,
                   open('../stats/svd_results_' + task + '.p', 'wb'))
            best_model_RMSE = grid_search.best_params['RMSE']
            validation_rmse = grid_search.best_score['RMSE']
            best_model_mae = grid_search.best_params['MAE']
            validation_mae = grid_search.best_score['MAE']
            #print(validation_rmse)
            #print(validation_mae)
            print(type(grid_search.cv_results))
            print(grid_search.cv_results)

            #Test based on best training RMSE
            n_epochs = best_model_RMSE['n_epochs']
            n_factors = best_model_RMSE['n_factors']
            self.algo = SVD(n_epochs=n_epochs, n_factors=n_factors)
            self.algo.train(train_set)
            predictions = self.algo.test(test_set)
            test_rmse = accuracy.rmse(predictions, verbose=True)
            test_mae = accuracy.mae(predictions, verbose=True)
            print("RMSE of predictions", test_rmse)
            print("MAE of predictions", test_mae)

        if algorithm == 'NMF':

            param_grid = {
                'n_epochs': np.arange(0, 100, 10).tolist(),
                'n_factors': [10, 100]
            }
            grid_search = GridSearch(NMF, param_grid, measures=['RMSE', 'MAE'])

            grid_search.evaluate(validation_set)

            p.dump(grid_search,
                   open('../stats/nmf_results_' + task + '.p', 'wb'))
            best_model_RMSE = grid_search.best_params['RMSE']
            validation_rmse = grid_search.best_score['RMSE']
            best_model_mae = grid_search.best_params['MAE']
            validation_mae = grid_search.best_score['MAE']
            print(validation_rmse)
            print(validation_mae)

            #Test based on best training RMSE
            n_epochs = best_model_RMSE['n_epochs']
            n_factors = best_model_RMSE['n_factors']
            self.algo = NMF(n_epochs=n_epochs, n_factors=n_factors)
            self.algo.train(train_set)
            predictions = self.algo.test(test_set)
            test_rmse = accuracy.rmse(predictions, verbose=True)
            test_mae = accuracy.mae(predictions, verbose=True)
            print("RMSE of predictions", test_rmse)
            print("MAE of predictions", test_mae)

        if algorithm == 'KNNWithMeans':
            param_grid = {
                'k':
                np.arange(1, 20).tolist(),
                'sim_options': [{
                    'name': 'cosine',
                    'user_based': True
                }, {
                    'name': 'msd',
                    'user_based': True
                }, {
                    'name': 'pearson',
                    'user_based': True
                }]
            }
            grid_search = GridSearch(KNNWithMeans,
                                     param_grid,
                                     measures=['RMSE', 'MAE'])
            grid_search.evaluate(validation_set)

            p.dump(grid_search,
                   open('../stats/knn_means_results' + task + '.p', 'wb'))

            best_model_RMSE = grid_search.best_params['RMSE']
            validation_rmse = grid_search.best_score['RMSE']
            best_model_mae = grid_search.best_params['MAE']
            validation_mae = grid_search.best_score['MAE']

            #Test based on best training RMSE
            k = best_model_RMSE['k']
            sim_options = best_model_RMSE['sim_options']
            self.algo = KNNWithMeans(k=k, sim_options=sim_options)
            self.algo.train(train_set)
            predictions = self.algo.test(test_set)
            test_rmse = accuracy.rmse(predictions, verbose=True)
            test_mae = accuracy.mae(predictions, verbose=True)
            print("RMSE of predictions", test_rmse)
            print("MAE of predictions", test_mae)

    def generate_top_n_recommendation(self, test_set, train_set):
        '''
        Function to generate top-N recommendations and report mean
        precision, recall and F-score over all users

        ----
        Args:
        test_set: The testing set as a list
        train_set: The training set as a list
        '''
        user_list = set([x[0] for x in train_set])
        print("Number of users = ", len(user_list))

        precision_list = []
        recall_list = []
        f_score_list = []
        j = 0
        for user in user_list:
            # print("===============================================================")
            # print("=====================+++++++++++++++++++++++===================")
            # print("===============================================================")
            j += 1
            if j % 1000 == 0:
                print("Touchdown, j = ", j)
            item_train = set([x[1] for x in train_set if x[0] == user])
            item_test = set([x[1] for x in test_set if x[0] == user])
            item_train_all = set([x[1] for x in train_set])
            item_test_all = set([x[1] for x in test_set])
            item_all = item_train_all.union(item_test_all)
            # print("User = "******"===============================================================")
            # print("TRain items = ",item_train)
            # print("===============================================================")
            # print("Test items = ",item_test)
            # print("ITem all = ",item_all)
            # print("Number of  test items= ",len(item_test))
            negative_items = [
                x for x in item_all
                if x not in item_train and x not in item_test
            ]
            # print("Number of negative items = ",len(negative_items))

            # Get 1000 random negative items
            negative_indices = np.random.randint(0,
                                                 len(negative_items),
                                                 size=1000)
            negative_subset = [negative_items[x] for x in negative_indices]
            # Take the positive items from the testing set, in random order:
            positive_subset = list(item_test)
            np.random.shuffle(positive_subset)
            # print("Positive subset items = ",positive_subset)
            # print(negative_subset)
            subset = positive_subset + negative_subset
            pred_list = []
            for item in subset:
                pred = self.algo.predict(user, item, r_ui=1, verbose=False)
                pred_list.append(pred)
            predictions = sorted(pred_list, key=lambda x: x.est, reverse=True)
            # print(" =============================================================")
            precision = self.calculate_precision(predictions, positive_subset,
                                                 10)
            # print("Precision = ",precision)
            recall = self.calculate_recall(predictions, positive_subset, 10)
            # print("Recall = ",recall)
            # f_score=self.calculate_f_measure(precision,recall)
            # print("F score = ",f_score)
            precision_list.append(precision)
            recall_list.append(recall)
            # f_score_list.append(f_score)

        precision = np.mean(precision_list)
        recall = np.mean(recall_list)
        print("Mean precision = ", precision)
        print("Mean recall = ", recall)
        print("fscore=", self.calculate_f_measure(precision, recall))
        return

    def calculate_precision(self, predictions, positive_items, N):
        '''
        Function to calculate precision
        '''
        count = 0
        for i in np.arange(N):
            p = predictions[i]
            if p.iid in positive_items:
                count += 1
        precision = float(count) / N
        return precision

    def calculate_recall(self, predictions, positive_items, N):
        '''
        Function to calculate recall
        '''
        count = 0
        pred = predictions[:N]  #Get TOP N Predictions
        for p in positive_items:
            for i in pred:
                if i.iid == p:
                    count += 1
                    break

        recall = float(count) / len(positive_items)
        return recall

    def calculate_f_measure(self, precision, recall):
        '''
        Function to calculate the F-measure
        '''
        try:
            f = 2.0 * precision * recall / (precision + recall)
        except ZeroDivisionError:
            f = 0
        return f
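
# A minimal usage sketch (hypothetical toy data; train_test_model additionally
# needs a Surprise Dataset split into folds for the grid search):
if __name__ == '__main__':
    from surprise import Reader

    # tiny toy data: (user, item, rating, timestamp) tuples
    raw_train = [('u1', 'i1', 4.0, 0), ('u1', 'i2', 3.0, 0),
                 ('u2', 'i1', 5.0, 0), ('u2', 'i3', 2.0, 0)]
    raw_test = [('u1', 'i3', 3.0, 0), ('u2', 'i2', 4.0, 0)]

    reader = Reader(rating_scale=(1, 5))
    rec = Surprise_recommender(reader)
    train_set = rec.create_train_set(raw_train)
    test_set = rec.create_test_set(raw_test)
    # rec.train_test_model(validation_data, train_set, test_set,
    #                      algorithm='KNNWithMeans', task='rating')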
Example #7
# assumes the pre-1.1 Surprise API (train); dftrain and test_file_path are
# defined earlier in the original script (user_id, business_id, rating data)
import pandas as pd
from surprise import Dataset, Reader, KNNWithMeans

dftest = pd.read_csv(test_file_path)
dftest = dftest.drop(['test_id', 'date'], axis=1)


# create a trainset object 
reader = Reader()
data = Dataset.load_from_df(dftrain, reader)
trainingSet = data.build_full_trainset()


# create a user-based K-nearest neighbours algorithm
# - uses the Pearson correlation to measure user similarities
# - takes user bias into account
sim_options = {'name': 'pearson'}
algo = KNNWithMeans(sim_options=sim_options)

# train the algorithm using the training set
########### fails here with MemoryError when I try to use the full set
algo.train(trainingSet)

# use the trained algorithm to predict ratings for the test set 
# output to a csv file
f = open('ub_testOutput.csv', 'w')
for i in range(len(dftest)):
    pred = algo.predict(dftest.at[i,'user_id'], dftest.at[i, 'business_id'], r_ui=4, verbose=True)
    predRating = pred.est
    f.write(str(i) + ", " + str(predRating) + '\n')
f.close()