Example #1
def svdpp(trainset, testset, predset):
    
    modelname = 'svdpp'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return
    
    # Note: bsl_options configures baseline estimates for baseline-based
    # algorithms; SVDpp does not accept it, so it is omitted here.
    algo = SVDpp(n_epochs=40, n_factors=100,
                 lr_bu=0.01, lr_bi=0.01, lr_pu=0.1, lr_qi=0.1, lr_yj=0.01,
                 reg_bu=0.05, reg_bi=0.05, reg_pu=0.09, reg_qi=0.1, reg_yj=0.01)
    print('SVDpp Model')
    algo.fit(trainset)  # fit() replaces the deprecated train()
    
    predictions = algo.test(trainset.build_testset())
    print('   RMSE on Train: ', accuracy.rmse(predictions, verbose=False))
    
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   RMSE on Test: ', rmse)
    preds = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds[j] = pred.est
    save_predictions(modelname, rmse, preds, 'test')

    print('   Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds[j] = pred.est
    save_predictions(modelname, rmse, preds)
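
The snippet relies on two project-specific helpers, is_already_predicted and save_predictions, that are not shown. A minimal sketch of what they might look like, assuming predictions are cached as .npy files under a predictions/ directory (the layout and naming are assumptions, not the original project's code):

import os
import numpy as np

PRED_DIR = 'predictions'  # assumed cache directory

def is_already_predicted(modelname):
    # Skip retraining when a prediction file for this model already exists.
    return os.path.exists(os.path.join(PRED_DIR, modelname + '.npy'))

def save_predictions(modelname, rmse, preds, suffix=''):
    # Persist the predicted ratings; the RMSE is printed for reference.
    os.makedirs(PRED_DIR, exist_ok=True)
    name = modelname + ('_' + suffix if suffix else '')
    np.save(os.path.join(PRED_DIR, name + '.npy'), preds)
    print('   Saved {} (RMSE {:.4f})'.format(name, rmse))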
    
Example #2
def svdPP(data):  # SVD++ algorithm
    print("\nTraining SVDPP model..\n")
    global x_test, y_test, testlen, trainlen, model_params, x_train, y_train, X, Y, avg_rat, cold_itm
    p1, p2, p3 = [
        model_params[1]['n_epochs'], model_params[1]['lr_all'],
        model_params[1]['reg_all']
    ]
    svdModel = SVDpp(n_epochs=p1, lr_all=p2, reg_all=p3)
    svdModel.fit(data.build_full_trainset())
    print("\nTraining done..\nPrediction started..")
    test = [(x_test[i][0], x_test[i][1], y_test[i]) for i in range(testlen)]
    #train_=[(x_train[i][0],x_train[i][1],y_train[i]) for i in range(trainlen)]
    #total_=[(X[i][0],X[i][1],Y[i]) for i in range(trainlen+testlen)]
    predict = svdModel.test(test)

    #trainset, testset = t_t_s(data, test_size=.25)
    svdModel_1 = SVDpp()
    svdModel_1.fit(data.build_full_trainset())
    predict1 = svdModel_1.test(test)
    #predict_train = svdModel_1.test(train_)
    #predict_tot = svdModel_1.test(total_)
    usrA = [int(i[0]) - 1 for i in predict]
    itmA = [int(i[1]) - 1 for i in predict]
    res = [i[3] for i in predict]
    res1 = [i[3] for i in predict1]
    for i in range(testlen):
        if itmA[i] in cold_itm:
            res[i] = avg_rat[usrA[i]]
            res1[i] = avg_rat[usrA[i]]
    #restrain=[i[3] for i in predict_train]
    print("\nPrediction done..\n")
    return [res, res1, svdModel, svdModel_1]  #,restrain, predict_tot
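
svdPP reads several globals (model_params, x_test, y_test, cold_itm, avg_rat, ...) that are defined elsewhere. A purely illustrative sketch of the shapes this code expects (names kept, values invented):

# Hypothetical shapes assumed by svdPP (illustrative only):
model_params = [
    {},                                                  # slot 0: unused here
    {'n_epochs': 20, 'lr_all': 0.007, 'reg_all': 0.02},  # slot 1: SVD++ params
]
x_test = [('1', '10'), ('2', '5')]  # (raw user id, raw item id) pairs
y_test = [4.0, 3.5]                 # ground-truth ratings
testlen = len(x_test)
cold_itm = {9}                      # item indices (raw item id - 1) unseen in training
avg_rat = [3.6, 3.9]                # per-user average rating, indexed by raw user id - 1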
Example #3
def final_model(data):
    """Pickles the collaborative filtering recommendation system model for repeat customers.

    Args:
    data -- a dataframe containing user id, item id, and ratings columns in that order.
    """
    # Creates a user ratings surprise matrix for fitting model
    user_ratings_matrix = surprise_df(data)

    # Splits dataset into train and test datasets to generate predictions
    train_set, test_set = train_test_split(user_ratings_matrix,
                                           test_size=0.2,
                                           random_state=19)

    # Best params determined using GridSearchCV
    params = {'n_factors': 10, 'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1}

    svdpp = SVDpp(n_factors=params['n_factors'],
                  n_epochs=params['n_epochs'],
                  lr_all=params['lr_all'],
                  reg_all=params['reg_all'])

    svdpp.fit(train_set)
    predictions = svdpp.test(test_set)

    # Use surprise wrapper to pickle model
    dump.dump('repeat_customer_model',
              predictions=predictions,
              algo=svdpp,
              verbose=0)
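
Once pickled, the model and its predictions can be restored with Surprise's dump.load, which returns a (predictions, algo) pair; the ids below are placeholders:

from surprise import dump

# Reload the pickled predictions and the fitted SVD++ model.
predictions, svdpp = dump.load('repeat_customer_model')

# The restored model can score new user/item pairs directly.
print(svdpp.predict('some_user_id', 'some_item_id').est)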
Example #4
def time_location_model(df):
    """
        Shows the performance of model based on just bias
    """
    lower = df['date_dist_rating'].min()
    upper = df['date_dist_rating'].max()
    df = df.drop(columns=["rating", "dist_rating", "date_rating"], axis=1)

    reader = Reader(rating_scale=(lower, upper))  #TODO figure out

    data = surprise.dataset.Dataset.load_from_df(df=df, reader=reader)

    ts = data.build_full_trainset()
    # Note: _raw2inner_id_users / _raw2inner_id_items are private Surprise
    # attributes mapping raw ids to inner ids; they may change between versions.
    dusers = ts._raw2inner_id_users
    ditems = ts._raw2inner_id_items

    trainset, testset = train_test_split(data)

    algo = SVDpp()
    algo.fit(trainset)

    # testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    print('\n')
    return (trainset, testset, predictions, dusers, ditems)
Example #5
class RecommenderSVDpp(Recommender):
    def __init__(self, recommendation_dataset: RecommendationDataSet):
        super(RecommenderSVDpp, self).__init__(recommendation_dataset.movies)
        self.algorithm = SVDpp()
        self.recommendation_dataset = recommendation_dataset

    def fit(self, dataset):
        return self.algorithm.fit(dataset)

    def test(self, test_set):
        return self.algorithm.test(test_set)

    def get_recommendation(self, watched, k=20):
        # get dataset 
        new_user_id, full_dataset = self.recommendation_dataset.get_dataset_with_extended_user(watched)
        inner_user_id = full_dataset.to_inner_uid(new_user_id)

        # After extending the dataset, retrain the model so the new user is
        # included in the factorization.
        self.algorithm.fit(full_dataset)

        # watched movies
        watched = {full_dataset.to_inner_iid(key): value for key,value in watched.items()}

        # Calculate for all similar user, predictions
        test_items = [
            self.algorithm.predict(new_user_id, full_dataset.to_raw_iid(i))
            for i in range(0, full_dataset.n_items)
            if i not in watched
        ]

        topn_items = [i[0] for i in get_top_n(test_items, n=k, minimum_rating=1.0)[new_user_id]]
        return self.movies.get_movie_by_movie_ids(topn_items)
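
A hypothetical usage sketch for RecommenderSVDpp; how RecommendationDataSet is constructed is project-specific, and the movie ids and ratings below are invented:

dataset = RecommendationDataSet(movies)  # project-specific construction; args assumed
recommender = RecommenderSVDpp(dataset)

watched = {'318': 5.0, '858': 4.5}  # raw movie id -> rating (invented)
top_movies = recommender.get_recommendation(watched, k=10)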
Example #6
def top_ten_df(df):
    '''
    Inputs:
    df (pandas DataFrame): the dataframe to train on.
        NOTE: use f.df_samp_unique_vals() to get a smaller DF if you don't have
        enough memory to run the full DF.

    Outputs:
    top_ten_df (pandas DataFrame): a dataframe with the top ten predictions for
        every user in the original dataframe.
    '''

    data = f.read_data_surprise(df)

    # First train an SVD algorithm on entire dataset (choose 6x name filter)
    trainset = data.build_full_trainset()
    algo = SVDpp()  # optionally: n_epochs=18, lr_all=0.01, reg_all=0.175
    algo.fit(trainset)

    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    testset = trainset.build_anti_testset()  # NOTE: heavy - this takes the most RAM
    predictions = algo.test(testset)

    #create a dictionary of predictions
    top_n = f.get_top_n(predictions, n=10)

    #Turn the dictionary into a df
    top_ten_df = pd.DataFrame(top_n)

    return top_ten_df
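
Because build_anti_testset() materializes every unseen (user, item) pair at once, memory grows with users x items. A memory-friendlier sketch, assuming per-user top-N results are enough, predicts one user at a time instead:

def top_ten_per_user(trainset, algo, n=10):
    # Predict unseen items user by user to avoid one huge anti-testset.
    results = {}
    fill = trainset.global_mean
    for inner_uid in trainset.all_users():
        rated = {iid for iid, _ in trainset.ur[inner_uid]}
        uid = trainset.to_raw_uid(inner_uid)
        preds = [
            algo.predict(uid, trainset.to_raw_iid(inner_iid), r_ui=fill)
            for inner_iid in trainset.all_items()
            if inner_iid not in rated
        ]
        preds.sort(key=lambda p: p.est, reverse=True)
        results[uid] = [(p.iid, p.est) for p in preds[:n]]
    return results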
Example #7
def svdpp(trainset, testset):
    # Matrix factorization - SVD++
    print("\n" + "-" * 5 + " SVD++ algorithm using surprise package " +
          "-" * 5)
    algo = SVDpp()
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions
Example #8
def SVDpp_calculation(data, trainset, testset, cv):
    # The original signature took the time module as a parameter and then
    # shadowed it with the elapsed value; using the imported module directly
    # avoids that shadowing.
    start = time.time()
    algo = SVDpp()
    algo.fit(trainset)
    predictions = algo.test(testset)
    cross_validate_svdpp_dict = cross_validate(algo, data, measures=['RMSE'], cv=cv, verbose=True)
    elapsed = time.time() - start

    return elapsed, cross_validate_svdpp_dict
Example #9
class TrainModel:

    # def __init__(self, method='als', n_epochs=20, sim_option='pearson_baseline'):
    #
    #     self.algo = KNNBasic(bsl_options={'method': method,'n_epochs': n_epochs},
    #                          sim_options={'name': sim_option, 'user_based': False})
    def __init__(self, lr_all=0.006, n_epochs=40):
        self.algo = SVDpp(lr_all=lr_all, n_epochs=n_epochs)
        self.reader = Reader(rating_scale=(0, 1))
        self.filename = 'trained_model.pkl'

    def read_from_df(self, dataframe, user_col, item_col, rating_col):
        data = Dataset.load_from_df(
            dataframe[[user_col, item_col, rating_col]], self.reader)
        trainset = data.build_full_trainset()
        return trainset

    def train_mod(self, dataframe, user_col, item_col, rating_col):
        self.algo.fit(
            self.read_from_df(dataframe, user_col, item_col, rating_col))

    def dump_model(self, predictions):
        saved_ent = dump.dump(self.filename,
                              algo=self.algo,
                              predictions=predictions)
        return saved_ent

    def load_model(self):
        predictions, loaded_ent = dump.load(self.filename)
        return predictions, loaded_ent

    def get_user_pred(self,
                      user_id,
                      dataframe,
                      user_col,
                      item_col,
                      rating_col,
                      n=2):
        data = Dataset.load_from_df(
            dataframe[[user_col, item_col, rating_col]], self.reader)
        testset = data.build_full_trainset().build_anti_testset()
        predictions = self.algo.test(testset)
        top_n = dict()
        for uid, iid, _, est, _ in predictions:
            if uid == user_id: top_n[iid] = est
        top_n = sorted(top_n.items(), key=lambda kv: kv[1], reverse=True)
        return predictions, top_n[:n]

    def get_user_pred_stable(self, user_id, predictions, n=2):
        top_n = dict()
        for uid, iid, _, est, _ in predictions:
            if uid == user_id: top_n[iid] = est
        top_n = sorted(top_n.items(), key=lambda kv: kv[1], reverse=True)
        return top_n[:n]
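
A usage sketch for TrainModel, assuming ratings already scaled to the (0, 1) range the Reader expects; the toy dataframe is invented:

import pandas as pd

df = pd.DataFrame({
    'user': ['u1', 'u1', 'u2', 'u2', 'u3'],
    'item': ['i1', 'i2', 'i1', 'i3', 'i2'],
    'rating': [1.0, 0.0, 1.0, 1.0, 0.0],
})

tm = TrainModel()
tm.train_mod(df, 'user', 'item', 'rating')
predictions, top2 = tm.get_user_pred('u3', df, 'user', 'item', 'rating', n=2)
print(top2)                 # [(item_id, estimated_rating), ...]
tm.dump_model(predictions)  # writes trained_model.pkl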
Example #10
def svdpp(dataset):
    start = time.time()
    algo = SVDpp()
    kf = KFold(n_splits=5)
    for trainset, testset in kf.split(dataset):
        algo.fit(trainset)
        predictions = algo.test(testset)
        acc = accuracy.rmse(predictions, verbose=True)
    end = time.time()
    print('SVD++ elapsed minutes:', (end - start) / 60)
    return acc  # RMSE of the last fold
Example #11
def svdpp(train, test, ids, Xtest, Xids):
    """
    Extension of SVD taking the implicit ratings into account.
    Args:
        train: the trainset
        test: the testset
        ids: unknown ratings
        Xtest: predicted ratings for the testset, to be used for final blending
        Xids: predicted ratings for unknown ratings, to be used for final blending
    """
    print('SVD++')
    algo = SVDpp(n_factors=100,
                 n_epochs=10,
                 lr_all=0.0015,
                 reg_all=0.05,
                 random_state=15)

    #Train algorithm on training set
    algo.fit(train)

    #Predict on train and compute RMSE
    predictions = algo.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    #Predict on test and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    preds_test = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds_test[j] = pred.est

    #Predict unknown ratings
    preds_ids = []
    for i in range(len(ids[0])):
        pred = algo.predict(str(ids[0][i]), str(ids[1][i]))
        preds_ids.append(pred.est)

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
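
Xtest and Xids collect each model's predictions as feature columns for a final blend. One way such columns might be combined, assuming a ridge-regression blender and that y_test holds the true test ratings (both are assumptions, not the original pipeline):

import numpy as np
from sklearn.linear_model import Ridge

# Each entry of Xtest is one model's predictions on the test set.
features = np.column_stack(Xtest)
blender = Ridge(alpha=1.0).fit(features, y_test)

# Blend the per-model predictions for the unknown ratings the same way.
final_preds = blender.predict(np.column_stack(Xids))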
Example #12
def svdpp_running_time(data):
    '''
        Calculates the running times for training and predictions for SVD++

        Args:
            data(Dataset): a list of datasets with different numbers of users

        Returns:
            elapsedtime_SVDpptrain: running time for training
            elapsedtime_SVDpptest: running time for predictions on testset
    '''
    elapsedtime_SVDpptrain = []
    elapsedtime_SVDpptest = []

    # tune the parameters on the entire data
    # (GridSearch/evaluate is the legacy pre-1.0.5 Surprise API; note the grid
    # search runs plain SVD here, presumably as a faster proxy for SVD++)
    param_grid = {
        'n_factors': [25, 50, 100, 250],
        'n_epochs': [10, 20, 30, 40, 50]
    }
    grid_search = GridSearch(SVD, param_grid, measures=['RMSE'], verbose=False)
    grid_search.evaluate(data[3])
    param = grid_search.best_params['RMSE']
    n_factors = param['n_factors']
    n_epochs = param['n_epochs']

    # using the tuned parameters calculate running times
    for i in range(len(data)):
        # training running time
        training_start = time.time()
        training = data[i].build_full_trainset()
        testing = training.build_anti_testset()
        svdpp = SVDpp(n_factors=n_factors, n_epochs=n_epochs)
        svdpp.fit(training)  # fit() replaces the deprecated train()
        elapsedtime_SVDpptrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        svdpp.test(testing)
        elapsedtime_SVDpptest.append(time.time() - test_start)
    return elapsedtime_SVDpptrain, elapsedtime_SVDpptest
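
GridSearch with evaluate() is the pre-1.0.5 Surprise API. On current versions the same tuning step would look roughly like this sketch, with the same grid:

from surprise import SVD
from surprise.model_selection import GridSearchCV

param_grid = {
    'n_factors': [25, 50, 100, 250],
    'n_epochs': [10, 20, 30, 40, 50]
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5)
gs.fit(data[3])
param = gs.best_params['rmse']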
Example #13
def model(train_set, test_set):
    params = {'n_factors': 3, 'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1}

    svdpp = SVDpp(n_factors=params['n_factors'],
                  n_epochs=params['n_epochs'],
                  lr_all=params['lr_all'],
                  reg_all=params['reg_all'])
    svdpp.fit(train_set)

    predictions = svdpp.test(test_set)
    rmse = accuracy.rmse(predictions, verbose=False)

    return predictions, rmse
Example #14
def RecommendPredictions():
    ## Load train and test data into Dataframes
    trainDF = pan.read_csv("data_source/train_count_norm_1_10.csv",
                           header=None,
                           dtype={2: np.float16})
    trainDF = trainDF.fillna(10.0)

    reader = Reader(rating_scale=(1, 10))

    print "Load train set...."
    dataTrain = Dataset.load_from_df(trainDF[[0, 1, 2]], reader=reader)
    trainset = dataTrain.build_full_trainset()

    print "Initiate Training ....."
    algo = SVDpp(n_epochs=1, lr_all=0.01, reg_all=0.02, verbose=True)
    algo.train(trainset)

    ## Predictions for test set with ground truth present
    print " Load test set..."
    testDF = pan.read_csv("data_source/test_count_norm_1_10.csv",
                          header=None,
                          dtype={2: np.float16})
    testDF = testDF.fillna(10.0)
    dataTest = Dataset.load_from_df(testDF[[0, 1, 2]], reader=reader)
    testset = dataTest.build_full_trainset().build_testset()

    print "Start predictions"
    predictions = algo.test(testset)

    try:
        os.remove("data_source/predictions_results_svdpp.csv")
    except OSError:
        pass

    print "Saving Prediction results in File"
    resultFile = open("data_source/predictions_results_svdpp.csv", "a")
    csv_writer = csv.writer(resultFile)

    for item in predictions:
        predictionTuple = (item.uid, item.iid, item.r_ui, item.est)
        csv_writer.writerow(predictionTuple)

    resultFile.close()

    ## Predictions for test set with random products present
    ##	LEFT

    #rmse = accuracy.rmse(predictions, verbose=True)
Example #15
    def run_colab_filter(self):
        # A reader is still needed but only the rating_scale param is required.
        reader = Reader(rating_scale=(1, 4))

        # The columns must correspond to user id, item id and ratings (in that order).
        data = Dataset.load_from_df(
            self.df20[['user_id', 'route_id', 'rating']], reader)

        # Retrieve the trainset.
        trainset = data.build_full_trainset()

        # Then predict ratings for all pairs (u, i) that are NOT in the training set.
        # (This anti-testset is unused: it is overwritten by the per-user testset below.)
        testset = trainset.build_anti_testset()

        algo_tuned = SVDpp(n_factors=20)
        algo_tuned.fit(trainset)

        iid = self.df20['route_id'].unique()
        #user_id = 200128311 #mine, trad, alpine, intermediate
        #user_id = 110596403 #boulder-er
        #user_id = 200272475 #boulder-er, advanced
        #user_id = 200077815 #michaels, trad, alpine, intermediate
        user_id = 106540415  #mixed climber, alpine climber, advanced
        # route ids this user has already rated ('route_id', not 'user_id')
        iid_me = self.df20.loc[self.df20['user_id'] == user_id, 'route_id']
        iids_to_pred = np.setdiff1d(iid, iid_me)

        testset = [[user_id, iid, 2] for iid in iids_to_pred]
        predictions_tuned = algo_tuned.test(testset)

        dump.dump(file_name='SVD_tuned.p',
                  predictions=predictions_tuned,
                  algo=algo_tuned)

        pred_ratings_tuned = np.array([pred.est for pred in predictions_tuned])

        i_max = np.argpartition(pred_ratings_tuned, -20)[-20:]
        i_max = i_max[np.argsort(-pred_ratings_tuned[i_max])]

        iid = iids_to_pred[i_max]

        #top 20 recommended climbs
        self.df_top_climbs_mf = pd.DataFrame(iid, pred_ratings_tuned[i_max])
        self.df_top_climbs_mf = self.df_top_climbs_mf.reset_index()

        self.df_top_climbs_mf.columns = ['predicted rating', 'route id']
Example #16
def SVD_pp():
    algo = SVDpp()

    # Define the k-fold cross-validation iterator, k=3
    kf = KFold(n_splits=3)
    for trainset, testset in kf.split(data):
        # Train and predict
        algo.fit(trainset)
        predictions = algo.test(testset)
        # Compute RMSE
        accuracy.rmse(predictions, verbose=True)  # verbose prints the current fold; defaults to False

    uid = str(196)
    iid = str(302)
    # Print the prediction of uid's rating for iid
    pred = algo.predict(uid, iid, r_ui=4, verbose=True)

    time2 = time.time()
    print(time2 - time1)  # time1 is assumed to be set earlier in the script
Example #17
def surprise_SVDpp(train_file, test_file):
    """
    SVD++ with the Surprise library.
    Compute the predictions on a test set after training on a train set using SVD++ from Surprise.
    Args:
        train_file (string): path to the created train file
        test_file (string): path to the created test file
    Hyperparameters:
        n_factors: the number of factors
        n_epochs: the number of iterations of the SGD procedure
        lr_'x': the learning rate for 'x'
        reg_'x': the regularization term for 'x'
    'x':
        bi: the item biases
        bu: the user biases
        qi: the item factors
        yj: the (implicit) item factors
        pu: the user factors

    Returns:
        numpy array: predictions
    """
    print("SVDpp")
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    # Algorithm

    algo = SVDpp(n_epochs=40, n_factors=100, lr_all=0.01, reg_all=0.01)
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)

        # Predict
        predictions = algo.test(testset)
    pred = np.zeros(len(predictions))
    for i in range(len(predictions)):
        val = predictions[i].est
        pred[i] = val
    return pred
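
The per-parameter names in the docstring map directly onto SVDpp keyword arguments, so each lr_'x'/reg_'x' can be set individually instead of through lr_all/reg_all. The values below are illustrative, not tuned:

# Any parameter not set individually falls back to lr_all / reg_all.
algo = SVDpp(n_factors=100, n_epochs=40,
             lr_bu=0.005, lr_bi=0.005,  # bias learning rates
             lr_pu=0.01, lr_qi=0.01,    # factor learning rates
             lr_yj=0.007,               # implicit-factor learning rate
             reg_bu=0.02, reg_bi=0.02,
             reg_pu=0.04, reg_qi=0.04, reg_yj=0.02)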
Example #18
    def evaluate_on_test(self, train_set, test_set):
        """
        Evaluate the algorithm on the test set after training it on the train set
        :param train_set:
        :param test_set:
        :return: RMSE value on test set
        """
        if train_set is not None and test_set is not None:
            print("Evaluate RMSE on test data")
            self.LOG_HANDLE.info("Evaluate RMSE on test data")

            # http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVDpp
            algo = SVDpp()

            # Train the algorithm on the trainset, and predict ratings for the testset
            algo.fit(train_set)
            predictions = algo.test(test_set)

            # Then compute RMSE
            return accuracy.rmse(predictions)
Example #19
    def train_best_model_generate_ratings_test(self, ratings_set, test_set):
        """
        Train the best model (with minimum RMSE) on the complete ratings set and then compute the ratings for the test set
        :param ratings_set: The complete ratings data set
        :param test_set: The streams for the users for which ratings are not yet available
        :return: A data frame of the form user, stream, predicted rating
        """
        if ratings_set and test_set:
            print(
                "Training the best model and generating the ratings for the test data set"
            )
            self.LOG_HANDLE.info(
                "Training the best model and generating the ratings for the test data set"
            )

            algo = SVDpp(**model_params.svdpp_best_params)
            algo.fit(ratings_set)

            predictions = algo.test(test_set)
            return predictions
Example #20
class RecommenderSVDppSimilarUsers(Recommender):
    """
        Instead of rebuilding the dataset when a new user arrives, we find
        similar users and use them to pick similar movies.
    """
    def __init__(self, movies):
        super(RecommenderSVDppSimilarUsers, self).__init__(movies)
        self.algorithm = SVDpp()

    def fit(self, dataset):
        return self.algorithm.fit(dataset)

    def test(self, test_set):
        return self.algorithm.test(test_set)

    def get_recommendation(self, watched, k=20, k_inner_item=10):
        # get dataset
        full_dataset = self.algorithm.trainset

        # watched movies
        watched = {
            full_dataset.to_inner_iid(key): value
            for key, value in watched.items()
        }

        # get similar users
        similar_users = self.get_similar_user_ids(watched, k=k_inner_item)

        # Calculate for all similar user, predictions
        candidates = defaultdict(float)
        for inner_move_id in range(0, full_dataset.n_items):
            if inner_move_id not in watched:
                movie_id = full_dataset.to_raw_iid(inner_move_id)
                for inner_user_id, similarity in similar_users.items():
                    prediction = self.algorithm.predict(
                        full_dataset.to_raw_uid(inner_user_id), movie_id)
                    candidates[movie_id] += similarity * prediction.est

        # heapq.nlargest(k, candidates.items(), key=itemgetter(1))
        return self.movies.get_movie_by_movie_ids(
            heapq.nlargest(k, candidates, key=candidates.get))
Example #21
class SurpriseRecommender(Recommender):
    name = 'surprise-svdpp'

    def train(self, data):
        ratings_dict = {'itemID': data[:,1],
                        'userID': data[:,0],
                        'rating': data[:,2]}
        df = pd.DataFrame(ratings_dict)

        reader = Reader(rating_scale=(0, 1))

        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader).build_full_trainset()
        # self.algo = KNNBasic(verbose=False)
        self.algo = SVDpp(verbose=True)
        self.algo.fit(data)

    def rate(self, user, movie):
        return self.algo.test([[user, movie, 0]])[0].est

    def rate_bool(self, user, movie):
        return self.rate(user, movie) > 0.5
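
rate() pushes a single pair through test() with a dummy true rating of 0; Surprise's predict() covers the single-pair case directly. An equivalent sketch:

    def rate(self, user, movie):
        # Single-pair equivalent of the test()-based version above.
        return self.algo.predict(user, movie).est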
Example #22
def svd_model(df):
    """ Apply SVD++.
    """
    df = pd.melt(df,
                 id_vars='smiles',
                 value_vars=list(df.columns[1:]),
                 var_name='Target',
                 value_name='TargetValue')

    mark = df.TargetValue.isna()
    unknown = df.loc[mark]
    known = df.loc[~mark]

    reader = Reader(rating_scale=(0, 1))
    data = Dataset.load_from_df(known[['smiles', 'Target', 'TargetValue']],
                                reader)

    kf = KFold(n_splits=3, random_state=57)

    algo = SVDpp(n_factors=12, reg_all=0.003, lr_all=0.006, random_state=132)

    for trainset, testset in kf.split(data):

        algo.fit(trainset)
        predictions = algo.test(testset)

        rmse = round(accuracy.rmse(predictions, verbose=True), 3)

        print('RMSE of SVD++ model for cross-validation: ' + str(rmse))

    result = unknown.copy()
    result['ToxicProb'] = result.apply(
        lambda x: algo.predict(x.smiles, x.Target).est, axis=1)
    result = result.drop(columns='TargetValue')

    return result
Example #23
def svdpp(data, training, testing):
    '''
    Tune SVD++ parameters then calculates RMSE, coverage and running time of SVD++

    Args:
        data(Dataset): the whole dataset divided into 5 folds
        training(Dataset): training dataset
        testing(Dataset): test dataset

    Returns:
        rmse: RMSE of SVD++ with optimized parameters
        top_n: number of unique predictions for top n items
    '''
    # candidate parameters
    param_grid = {'n_factors': [25, 50, 100, 250], 'n_epochs': [10, 20, 30, 40, 50]}

    # optimize parameters (GridSearch/evaluate is the legacy pre-1.0.5 Surprise API)
    grid_search = GridSearch(SVDpp, param_grid, measures=['RMSE'], verbose=False)
    grid_search.evaluate(data)
    param = grid_search.best_params['RMSE']
    print('SVDpp:', param)
    # RMSE against parameters
    result_df = pd.DataFrame.from_dict(grid_search.cv_results)
    result_df.to_csv('data/svdpp_rmse_against_param.csv')


    # fit model using the optimized parameters
    svdpp = SVDpp(n_factors=param['n_factors'], n_epochs=param['n_epochs'])
    svdpp.fit(training)  # fit() replaces the deprecated train()

    # evaluate the model using test data
    predictions = svdpp.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)

    return rmse, top_n
Example #24
def surpriseSVDpp(mode,
                  DataPath='../data/data_clean.txt',
                  TrainPath='../data/train_clean.txt',
                  TestPath='../data/test_clean.txt',
                  n_factors=20,
                  n_epochs=20,
                  lr_all=0.007,
                  reg_all=0.02,
                  verbose=True):

    # We need the rating scale.
    reader = Reader(rating_scale=(1, 5))

    if mode == 'evaluation':

        # train data processing
        train = pd.read_csv(TrainPath, sep="\t", header=None)
        train.columns = ["User Id", "Movie Id", "Rating"]
        data = Dataset.load_from_df(train[["User Id", "Movie Id", "Rating"]],
                                    reader)
        trainset = data.build_full_trainset()

        # fit model
        algo = SVDpp(n_factors=n_factors,
                     n_epochs=n_epochs,
                     init_mean=0,
                     init_std_dev=0.1,
                     lr_all=lr_all,
                     reg_all=reg_all,
                     verbose=verbose)
        algo.fit(trainset)

        # evaluate train error
        test = trainset.build_testset()
        predictions = algo.test(test)
        train_err = accuracy.rmse(predictions, verbose=False)

        # test data processing
        test = pd.read_csv(TestPath, sep="\t", header=None)
        test.columns = ["User Id", "Movie Id", "Rating"]
        data = Dataset.load_from_df(test[["User Id", "Movie Id", "Rating"]],
                                    reader)
        testset = data.build_full_trainset()

        # evaluate test error
        test = testset.build_testset()
        predictions = algo.test(test)
        test_err = accuracy.rmse(predictions, verbose=False)

        # Return V (qi),  U (pu), train_err (RMSE), test_err (RMSE)
        return algo.qi, algo.pu, train_err, test_err

    elif mode == 'visualization':

        # train data processing
        alldata = pd.read_csv(DataPath, sep="\t", header=None)
        alldata.columns = ["User Id", "Movie Id", "Rating"]
        data = Dataset.load_from_df(alldata[["User Id", "Movie Id", "Rating"]],
                                    reader)
        trainset = data.build_full_trainset()

        # fit model
        algo = SVDpp(n_factors=n_factors,
                     n_epochs=n_epochs,
                     init_mean=0,
                     init_std_dev=0.1,
                     lr_all=lr_all,
                     reg_all=reg_all,
                     verbose=verbose)
        algo.fit(trainset)

        # evaluate train error
        test = trainset.build_testset()
        predictions = algo.test(test)
        train_err = accuracy.rmse(predictions, verbose=False)

        U = algo.pu
        V = algo.qi

        A, _, B = np.linalg.svd(V.T)
        A = A.T
        # Use the first 2 cols for work
        Asub = A[:, :2]

        Uproj = np.dot(Asub.T, U.T)
        Vproj = np.dot(Asub.T, V.T)

        # Return Vproj,  Uproj, train_err (RMSE of Y = U^T V)
        return Vproj, Uproj, train_err
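
In 'visualization' mode the 2-D projections are typically scattered to inspect the latent space. A minimal matplotlib sketch (the plotting choices are assumptions):

import matplotlib.pyplot as plt

Vproj, Uproj, train_err = surpriseSVDpp('visualization')

plt.scatter(Vproj[0], Vproj[1], s=5, label='movies (V)')
plt.scatter(Uproj[0], Uproj[1], s=5, label='users (U)')
plt.legend()
plt.title('SVD++ latent space (train RMSE %.3f)' % train_err)
plt.show()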
Example #25
trainset = data.build_full_trainset()

# Build an algorithm, and train it.
algo = SVDpp()
algo.fit(trainset)
'''
uid = str(196)  # raw user id (as in the ratings file). They are **strings**!
iid = str(302)  # raw item id (as in the ratings file). They are **strings**!

# get a prediction for specific users and items.
pred = algo.predict(uid, iid, verbose=True)
# print(pred.est)
'''

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

# rank_n = get_rank_n_for_one_user(predictions, str(603))
# print(rank_n)

db = pymysql.connect(host_ip, user, password, database_name)
cursor = db.cursor()
store_predictions_to_sql(predictions)
db.close()

# top_n = get_top_n(predictions, n=10)
#
# # Print the recommended items for each user
# for uid, user_ratings in top_n.items():
#     print(uid, [iid for (iid, _) in user_ratings])
Example #26
    svd.fit(train)
    svd_fu = pd.concat([
        ratings['user_id'].drop_duplicates().reset_index(drop=True),
        pd.DataFrame(svd.pu.tolist())
    ],
                       axis=1)
    svd_fi = pd.concat([
        ratings['movie_id'].drop_duplicates().reset_index(drop=True),
        pd.DataFrame(svd.qi.tolist())
    ],
                       axis=1)
    train, test = surprise_train_test_split(data,
                                            train_size=0.9,
                                            test_size=0.1,
                                            shuffle=False)
    svd_pred = svd.test(test)
    svd_pred = pd.DataFrame([i.est for i in svd_pred])  # est is the prediction; r_ui is the ground truth
    with open('feature/svd_pp_fu.pkl', 'wb') as f:
        pickle.dump(svd_fu, f)
    with open('feature/svd_pp_fi.pkl', 'wb') as f:
        pickle.dump(svd_fi, f)
    with open('feature/svd_pp_pred.pkl', 'wb') as f:
        pickle.dump(svd_pred, f)
else:
    with open('feature/svd_pp_fu.pkl', 'rb') as f:
        svd_fu = pickle.load(f)
    with open('feature/svd_pp_fi.pkl', 'rb') as f:
        svd_fi = pickle.load(f)
    with open('feature/svd_pp_pred.pkl', 'rb') as f:
        svd_pred = pickle.load(f)
ratings = pd.merge(ratings, users, how='left', on='user_id')
Example #27
def generate_svd_recommendation_df() -> pd.DataFrame:
    # Prepare input DataFrame and algorithm
    score_df = genearte_score_df()
    svd_data = MyDataSet(score_df)
    print(score_df)
    print(svd_data.raw_ratings)
    #Try SVD
    algo_svd = SVD()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo_svd, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the SVD
    algo_svd.fit(full_train_set)
    predictions = algo_svd.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svd = get_top_n(predictions, n=5)
    latent_usr_factor = algo_svd.pu 
    latent_item_factor = algo_svd.qi 
    user_bias = algo_svd.bu
    item_bias = algo_svd.bi
    recomendation_reportname_df_svd = pd.merge(recommendation_df_svd, df_reports_id, how = 'left', on= 'report_id')

    
    #Try SVD++
    algo_svdpp = SVDpp()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo_svdpp, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the SVD++
    algo_svdpp.fit(full_train_set)
    predictions = algo_svdpp.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svdpp = get_top_n(predictions, n=5)
    latent_usr_factor_pp = algo_svdpp.pu
    latent_item_factor_pp = algo_svdpp.qi
    user_bias_pp = algo_svdpp.bu
    item_bias_pp = algo_svdpp.bi
    recomendation_reportname_df_svdpp = pd.merge(recommendation_df_svdpp, df_reports_id, how = 'left', on= 'report_id')

    # Try SVD++ with more factors (the default is 20)
    algo_svdpp_mod = SVDpp(n_factors=50, n_epochs=50)
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation (note: use the _mod instance, not the default one)
    score = cross_validate(algo_svdpp_mod, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the SVD++
    algo_svdpp_mod.fit(full_train_set)
    predictions = algo_svdpp_mod.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    print(score)
    
    #print (recommendation_df)
    
    
    #Try the NMF
    #nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False) 
    algo_nmf = NMF()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo_nmf, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the SVD
    algo_nmf.fit(full_train_set)
    predictions = algo_nmf.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    # Generate recommendation DataFrame
    recommendation_df_nmf = get_top_n(predictions, n=5)
    #print (recommendation_df)
    latent_usr_factor_nmf = algo_nmf.pu
    latent_item_factor_nmf = algo_nmf.qi
    user_bias_nmf = algo_nmf.bu
    item_bias_nmf = algo_nmf.bi
    recomendation_reportname_df_mmf = pd.merge(recommendation_df_nmf, df_reports_id, how='left', on='report_id')
    sidd_recmidation = recomendation_reportname_df_mmf.loc[recomendation_reportname_df_mmf['user_sso'] == 212568816]
    
    # Try the NMF with non-default parameters
    #nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False)
    algo_nmf_mod = NMF(n_factors=50, n_epochs=50)
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation (note: use the _mod instance, not the default one)
    score = cross_validate(algo_nmf_mod, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the NMF
    algo_nmf_mod.fit(full_train_set)
    predictions = algo_nmf_mod.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    # Generate recommendation DataFrame
    recommendation_df_nmf = get_top_n(predictions, n=5)
    #print (recommendation_df)
    latent_usr_factor_nmf = algo_nmf_mod.pu
    latent_item_factor_nmf = algo_nmf_mod.qi
    user_bias_nmf = algo_nmf_mod.bu
    item_bias_nmf = algo_nmf_mod.bi
    recomendation_reportname_df_mmf = pd.merge(recommendation_df_nmf, df_reports_id, how='left', on='report_id')
    sidd_recmidation = recomendation_reportname_df_mmf.loc[recomendation_reportname_df_mmf['user_sso'] == 212568816]
    
    
    #---------------------------------------------------
    # as per - https://bmanohar16.github.io/blog/recsys-evaluation-in-surprise
     # Matrix Factorization Based Algorithms
    svd_cv = cross_validate(algo_svd, svd_data, cv=5, n_jobs=5, verbose=False)
    svdpp_cv = cross_validate(algo_svdpp,svd_data, cv=5, n_jobs=5, verbose=False)
    nmf_cv = cross_validate(algo_nmf, svd_data, cv=5, n_jobs=5, verbose=False) 
    svdpp_cv_mod = cross_validate(algo_svdpp_mod,svd_data, cv=5, n_jobs=5, verbose=False)
    nmf_cv_mod = cross_validate(algo_nmf_mod, svd_data, cv=5, n_jobs=5, verbose=False) 
Example #28
# Load the MovieLens 1M dataset  UserID::MovieID::Rating::Timestamp
data = Dataset.load_builtin('ml-1m')
trainset, testset = train_test_split(data, test_size=.15)

# Configure the algorithm. (K = number of neighbors, name = similarity measure and
# user_based = user- vs item-based filtering apply to the KNN variants, not to SVD++.)

algoritmo = SVDpp(n_epochs=5)

algoritmo.fit(trainset)

# Select the user and the movie to analyze
# User 49: between 18 and 24 years old, a programmer living in Houston, Texas
uid = str(49)
# Movie watched and rated: Negotiator, The (1998)::Action|Thriller. Rating: 4
iid = str(2058)  # raw item id

# get a prediction for specific users and items.
pred = algoritmo.predict(uid, iid, r_ui=4, verbose=True)

# run the trained model against the testset
test_pred = algoritmo.test(testset)

# Evaluate RMSE
print("RMSE evaluation: ")
accuracy.rmse(test_pred, verbose=True)

# Evaluate MAE
print("MAE evaluation: ")
accuracy.mae(test_pred, verbose=True)
Example #29
File: svdpp.py  Project: losenineai/rs
def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n


algo = SVDpp()
algo.fit(train_set)

predictions = algo.test(testsets.values)
top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

kf = KFold(n_splits=3)

algo = SVDpp()
for trainset, testset in kf.split(data):
    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
Example #30
        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls


## Check precision and recall for K predictions for each user
kf = KFold(n_splits=2)
algo = SVDpp()

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions,
                                                k=15,
                                                threshold=3.5)

    # Precision and recall can then be averaged over all users
    print(sum(prec for prec in precisions.values()) / len(precisions))
    print(sum(rec for rec in recalls.values()) / len(recalls))

## An item is considered relevant if its true rating rui is greater than a
## given threshold. An item is considered recommended if its estimated rating
## r^ui is greater than the threshold, and if it is among the k highest
## estimated ratings.

## Remember that:
## Recall = Sensitivity = TP / (TP + FN)
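
With the per-user precision and recall dictionaries above, a fold-level F1 follows directly (a small sketch):

# Average precision/recall over users, then combine into F1.
avg_prec = sum(precisions.values()) / len(precisions)
avg_rec = sum(recalls.values()) / len(recalls)
f1 = 2 * avg_prec * avg_rec / (avg_prec + avg_rec)
print('Precision@15: %.3f  Recall@15: %.3f  F1: %.3f' % (avg_prec, avg_rec, f1))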