Example #1
def user_factorization(data_raw, user_clusters, params):
    n_factors = params["LOCAL_U_NMF_K"]
    user_df = pd.DataFrame()
    for i in range(user_clusters):
        u_i = data_raw[data_raw["user cluster"] == i]
        reader = surprise.Reader(rating_scale=(1, 5))
        dataset = surprise.Dataset.load_from_df(
            u_i[["User", "Movie", "Prediction"]], reader)
        trainset = dataset.build_full_trainset()
        algo = NMF(n_factors=n_factors,
                   n_epochs=params["LOCAL_U_NMF_EPOCHS"],
                   verbose=True)
        algo.fit(trainset)
        testset = trainset.build_testset()
        preds = algo.test(testset)
        predictions_train = pd.DataFrame(preds)
        testset = trainset.build_anti_testset()
        preds = algo.test(testset)
        predictions_rest = pd.DataFrame(preds)
        user_df = pd.concat([user_df, predictions_train, predictions_rest],
                            ignore_index=False,
                            copy=False)
    all_u_m = get_all_u_m()
    user_df = all_u_m.merge(user_df, how="left", on=["uid", "iid"])
    user_df = user_df[["uid", "iid", "est"]]
    logging.info("return from user_factorization")
    return user_df
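The helper get_all_u_m() used above is not defined in this snippet; presumably it returns every (user, movie) pair a prediction is needed for, keyed by Surprise's uid/iid column names. A hypothetical sketch under that assumption:

import itertools

import pandas as pd


def get_all_u_m(n_users=10000, n_movies=1000):
    # hypothetical stand-in: enumerate every (user, movie) pair on uid/iid keys
    pairs = itertools.product(range(1, n_users + 1), range(1, n_movies + 1))
    return pd.DataFrame(list(pairs), columns=["uid", "iid"])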
Example #2
def item_factorization(data_raw, item_clusters, user_df, params):
    n_factors = params["LOCAL_I_NMF_K"]
    item_df = pd.DataFrame()
    for i in range(item_clusters):
        i_i = data_raw[data_raw["item cluster"] == i]
        reader = surprise.Reader(rating_scale=(1, 5))
        dataset = surprise.Dataset.load_from_df(
            i_i[["User", "Movie", "Prediction"]], reader)
        trainset = dataset.build_full_trainset()
        algo = NMF(n_factors=n_factors,
                   n_epochs=params["LOCAL_I_NMF_EPOCHS"],
                   verbose=True)
        algo.fit(trainset)
        #i_i.rename(columns={"User":"******","Movie":"iid","Prediction":"est"},inplace=True)
        testset = trainset.build_testset()
        preds = algo.test(testset)
        predictions_train = pd.DataFrame(preds)
        testset = trainset.build_anti_testset()
        preds = algo.test(testset)
        predictions_rest = pd.DataFrame(preds)
        item_df = pd.concat([item_df, predictions_train, predictions_rest],
                            ignore_index=False,
                            copy=False)
    item_df = user_df[["uid", "iid"]].merge(item_df,
                                            how="left",
                                            on=["uid", "iid"])
    item_df["est"].loc[item_df["est"].isnull()] = 0
    logging.info("return from item_factorization")
    return item_df
Example #3
def nmf(data, training, testing):
    '''
    Tunes NMF parameters, then calculates RMSE, coverage and running time of NMF

    Args:
        data(Dataset): the whole dataset divided into 5 folds
        training(Dataset): training dataset
        testing(Dataset): test dataset

    Returns:
        rmse: RMSE of NMF with optimized parameters
        top_n: top-n recommendations per user, as returned by get_top_n
    '''

    # candidate parameters
    nmf_param_grid = {'n_factors': [45, 50, 55, 60], 'n_epochs': [45, 50, 55]}

    # optimize parameters
    grid_search = GridSearch(NMF, nmf_param_grid, measures=['RMSE'], verbose=False)
    grid_search.evaluate(data)
    param = grid_search.best_params['RMSE']
    print('NMF:', param)

    # fit model using the optimized parameters
    nmf = NMF(n_factors=param['n_factors'], n_epochs=param['n_epochs'])
    nmf.train(training)

    # evaluate the model using test data
    predictions = nmf.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)
    return rmse, top_n
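get_top_n is not shown in this example; a minimal sketch of the usual Surprise-FAQ-style helper (matching the one defined further down in this collection), assuming predictions is the list returned by algo.test:

from collections import defaultdict


def get_top_n(predictions, n=5):
    # group estimates by user, then keep the n highest-rated items per user
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n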
Example #4
def predict_NMF(userid):
    df = pd.read_csv('ratings_small.csv').drop(['timestamp'], axis=1)
    reader = Reader(rating_scale=(1, 30))

    # Load the data using the reader format
    data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']],
                                reader=reader)

    # Split into training and test sets: 75% of the samples for training, 25% for testing
    trainset, testset = train_test_split(data, test_size=.25)

    # Use NMF
    algo = NMF()
    algo.fit(trainset)
    pred_nmf = algo.test(testset)
    top_nmf_n = get_top_n(pred_nmf, n=5)

    movie_titles = pd.read_csv('movies_metadata.csv', usecols=['id', 'title'])
    movie_titles = movie_titles.rename(columns={'id': 'movieId'})
    movie_titles['movieId'] = pd.to_numeric(movie_titles['movieId'],
                                            errors='coerce').fillna(0)
    movie_titles['movieId'] = movie_titles['movieId'].astype('int')
    movie_titles = movie_titles.drop_duplicates()

    title_list = []
    for uid, user_ratings in top_nmf_n.items():
        if uid == userid:
            #print(uid, [iid for (iid, _) in user_ratings])
            title_list = [iid for (iid, _) in user_ratings]

    titles = movie_titles[movie_titles.movieId.isin(title_list)]
    print(titles[2:])
    return titles[2:]
Example #5
    def recommender_nmf_baseline(self, train_file, test_file, output):

        train, test, train_dataset, test_dataset = prepare_datasets(
            train_file, test_file)
        # Use user_based true/false to switch between user-based or item-based collaborative filtering
        algo_nmf_baseline = NMF()

        algo_nmf_baseline.fit(train)

        #not_seen_elems = self.merge_train_set(train_dataset, test_dataset)

        #predictions_precision_svd = algo_svd.test(not_seen_elems, test, verbose=False, not_seen_flag=True)
        predictions_nmf_baseline = algo_nmf_baseline.test(test, verbose=False)

        #precisions, recalls = self.precision_recall_at_k(predictions_precision_svd, 10, threshold=0.0)
        # Precision and recall can then be averaged over all users
        #precision_avg = sum(prec for prec in precisions.values()) / len(precisions)
        #recall_avg = sum(rec for rec in recalls.values()) / len(recalls)
        #print('Precision: ' + str(precision_avg) + ' Recall: ' + str(recall_avg) + ' RMSE: ' + str(
        #    rmse(predictions_svd, verbose=False)) + ' MAE: ' + str(mae(predictions_svd, verbose=False)))
        print('NMF BASELINE: ' + ' RMSE ' +
              str(rmse(predictions_nmf_baseline, verbose=False)) + ' MAE ' +
              str(mae(predictions_nmf_baseline, verbose=False)))

        return algo_nmf_baseline
Example #6
def do_nmf(data_raw, impute_params):
    data = data_raw.pivot(index="User", columns="Movie",
                          values="Prediction").to_numpy()
    reader = surprise.Reader(rating_scale=(1, 5))
    dataset = surprise.Dataset.load_from_df(
        data_raw[["User", "Movie", "Prediction"]], reader)
    trainset = dataset.build_full_trainset()

    algo = NMF(n_factors=impute_params["FACTORS"],
               n_epochs=impute_params["EPOCHS"],
               verbose=True)
    algo.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    predictions = pd.DataFrame(predictions)

    predictions.rename(columns={
        "uid": "User",
        "iid": "Movie",
        "est": "Prediction"
    },
                       inplace=True)
    predictions = predictions[["User", "Movie", "Prediction"]]

    data = pd.concat([data_raw, predictions], ignore_index=True)
    data = data.pivot(index="User", columns="Movie",
                      values="Prediction").to_numpy()
    return data
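A toy usage sketch for do_nmf, assuming data_raw carries User/Movie/Prediction columns on the 1-5 scale used above; the parameter keys follow the FACTORS/EPOCHS lookups inside the function:

import pandas as pd

ratings = pd.DataFrame({"User": [1, 1, 2], "Movie": [10, 20, 10],
                        "Prediction": [4, 3, 5]})
dense = do_nmf(ratings, {"FACTORS": 2, "EPOCHS": 10})
print(dense.shape)  # (n_users, n_movies), with missing cells imputed by NMF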
Example #7
def colaborative_filtering_based_model(path, config, engine, df_valid_games):
    with open(path, 'r') as f:
        raw_strings = f.readlines()

    total_count = len(raw_strings)
    current_count = 0

    user_ratings = []
    scaler = MinMaxScaler((1, 5))

    for raw_string in raw_strings:
        user_id, user_inventory = list(json.loads(raw_string).items())[0]
        if user_inventory is not None:
            app_ids = [item['appid'] for item in user_inventory]
            app_scores = [item['playtime_forever'] for item in user_inventory]
            app_scores = scaler.fit_transform(np.log1p(app_scores).reshape(-1, 1))
            
            user_ratings_temp = [[user_id, app_ids[i], app_scores[i].item()] for i in range(len(app_ids))]
            user_ratings += user_ratings_temp

        show_work_status(1, total_count, current_count)
        current_count += 1

    user_item_ratings = pd.DataFrame(user_ratings)
    user_item_ratings.columns = ['user_id', 'item_id', 'rating']

    # Prediction part
    game_ids_set = set(df_valid_games.steam_appid)
    grouped_user_item_ratings = user_item_ratings.groupby('user_id')
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(user_item_ratings[['user_id', 'item_id', 'rating']], reader)

    alg = NMF(n_factors=20)
    alg.fit(data.build_full_trainset())

    total_count = len(user_item_ratings.user_id.unique())
    current_count = 0
    dict_user_recommendations = {}
    for user in user_item_ratings.user_id.unique().tolist():
        temp = grouped_user_item_ratings.get_group(user)
        not_purchased_ids = game_ids_set - set([str(x) for x in temp.item_id])
        
        user_test_temp = [[user, not_purchased_id, 0] for not_purchased_id in not_purchased_ids]
        user_test_temp = pd.DataFrame(user_test_temp)
        user_test_temp.columns = ['user_id', 'item_id', 'rating']
        
        data = Dataset.load_from_df(user_test_temp[['user_id', 'item_id', 'rating']], reader)
        user_test = data.build_full_trainset().build_testset()
        results = alg.test(user_test)
        dict_user_recommendations.update({user: pd.DataFrame(results).sort_values('est', ascending=False).iloc[:10, 1].values.tolist()})
        
        show_work_status(1, total_count, current_count)
        current_count += 1

    df_cf_based_results = pd.DataFrame(dict_user_recommendations).T
    df_cf_based_results.index.name = 'user_id'
    df_cf_based_results.reset_index(inplace=True)
    df_cf_based_results.to_sql(config.mysql_user_like_table, engine, if_exists='replace')
Example #8
def nmf(train, test, ids, Xtest, Xids):
    """
    Non Negative Matrix Factorization
    Argument : train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """
    print('NMF')
    algo = NMF(n_factors=20,
               n_epochs=50,
               random_state=15,
               reg_pu=0.5,
               reg_qi=0.05)

    #Train algorithm on training set
    algo.fit(train)

    #Predict on train and compute RMSE
    predictions = algo.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    #Predict on test and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    preds_test = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds_test[j] = pred.est

    #Predict unknown ratings
    preds_ids = []
    for i in range(len(ids[0])):
        pred = algo.predict(str(ids[0][i]), str(ids[1][i]))
        preds_ids.append(pred.est)

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
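The blending step itself happens outside this function; a hypothetical sketch of what it might look like, assuming each entry appended to Xtest is one model's predictions over the same testset and test is a list of (user, item, rating) tuples:

import numpy as np
from sklearn.linear_model import Ridge

X = np.column_stack(Xtest)                        # one column per model
y = np.array([rating for (_, _, rating) in test])
blender = Ridge(alpha=1.0).fit(X, y)              # learn per-model blending weights
blended_unknown = blender.predict(np.column_stack(Xids))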
Example #9
def nmf_running_time(data):
    '''
        Calculates the running times for training and predictions for NMF

        Args:
            data(Dataset): a list of datasets with different numbers of users

        Returns:
            elapsedtime_NMFtrain: running time for training
            elapsedtime_NMFtest: running time for predictions on testset
    '''
    elapsedtime_NMFtrain = []
    elapsedtime_NMFtest = []

    # tune the parameters on the entire data
    param_grid = {'n_factors': [45, 50, 55, 60], 'n_epochs': [45, 50, 55]}
    grid_search = GridSearch(NMF, param_grid, measures=['RMSE'], verbose=False)
    grid_search.evaluate(data[3])
    param = grid_search.best_params['RMSE']
    n_factors = param['n_factors']
    n_epochs = param['n_epochs']

    # using the tuned parameters calculate running times
    for i in range(len(data)):
        # training running time
        training_start = time.time()
        training = data[i].build_full_trainset()
        testing = training.build_anti_testset()
        nmf = NMF(n_factors=n_factors, n_epochs=n_epochs)
        nmf.train(training)
        elapsedtime_NMFtrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        nmf.test(testing)
        elapsedtime_NMFtest.append(time.time() - test_start)
    return elapsedtime_NMFtrain, elapsedtime_NMFtest
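GridSearch, evaluate() and train() belong to the pre-1.1 Surprise API. On current Surprise the tuning step would look roughly like this sketch, assuming data is a single surprise Dataset:

from surprise import NMF
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors': [45, 50, 55, 60], 'n_epochs': [45, 50, 55]}
gs = GridSearchCV(NMF, param_grid, measures=['rmse'], cv=5)
gs.fit(data)
best = gs.best_params['rmse']
algo = NMF(n_factors=best['n_factors'], n_epochs=best['n_epochs'])
algo.fit(data.build_full_trainset())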
Example #10
def trainFinalModels(ratingsTrainDataset, ratingsTest, bestParamsNMF,
                     bestParamsKNN):
    ratingsTrainTrainset = ratingsTrainDataset.build_full_trainset()

    modelNMF = NMF(**bestParamsNMF)
    modelNMF.fit(ratingsTrainTrainset)
    saveModel(modelNMF, 'NMF')

    predictions = modelNMF.test(ratingsTest)
    rmseValue = rmse(predictions)
    maeValue = mae(predictions)
    saveFinalResult('NMF', rmseValue, maeValue)

    modelKNN = KNNWithMeans(**bestParamsKNN)
    modelKNN.fit(ratingsTrainTrainset)
    saveModel(modelKNN, 'KNN')

    predictions = modelKNN.test(ratingsTest)
    rmseValue = rmse(predictions)
    maeValue = mae(predictions)
    saveFinalResult('KNN', rmseValue, maeValue)
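saveModel and saveFinalResult are project helpers that are not shown; a hypothetical sketch of what they might do (file names and formats are guesses):

import pickle

def saveModel(model, name):
    # hypothetical: persist the fitted Surprise model to disk
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(model, f)

def saveFinalResult(name, rmseValue, maeValue):
    # hypothetical: append one line of metrics per model
    with open('final_results.csv', 'a') as f:
        f.write('{},{},{}\n'.format(name, rmseValue, maeValue))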
Example #11
File: MF.py  Project: LLNL/MTLRecSys
class NonNegative_MF(BaseSurpriseSTLEstimator):
    """
    Nonnegative Matrix Factorization
    
    Args:
        :attr:`n_factors` (int): 
            number of latent vectors/factors for matrix factorization
        :attr:`n_epochs` (int): 
            Integer, The number of iteration of the SGD procedure. Default is 20
    
    see https://surprise.readthedocs.io/en/stable/matrix_factorization.html for more info
    """
    def __init__(self, n_factors, n_epochs=50, name='NonNegative_MF'):
        super().__init__(name, 'non_feature_based')
        self.n_factors = n_factors
        self.n_epochs = n_epochs
        self.model = NMF(n_factors=self.n_factors, n_epochs=self.n_epochs)

    def _fit(self, x):
        self.model.fit(x)

    def _predict(self, x):
        return self.model.test(x)

    def get_hyper_params(self):
        hparams = {
            'n_factors': {
                'type': 'integer',
                'values': [2, 150]
            },
            'n_epochs': {
                'type': 'integer',
                'values': [2, 150]
            }
        }
        return hparams

    def set_hyper_params(self, **kwargs):
        self.n_factors = kwargs['n_factors']
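A small usage sketch for NonNegative_MF on the built-in ml-100k data; it calls the private _fit/_predict defined above purely for illustration, assuming BaseSurpriseSTLEstimator normally wraps them in public fit/predict methods:

from surprise import Dataset

data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()

model = NonNegative_MF(n_factors=15)
model._fit(trainset)                           # delegates to surprise NMF.fit
predictions = model._predict(trainset.build_testset())
print(len(predictions))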
Example #12
def algoFunc(train_data, test_data):
    SVD_var = SVD()
    print("Singular Value Decomposition :\n")
    SVD_var.fit(train_data)
    predict_var = SVD_var.test(test_data)
    SVD_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    SVD_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nProbabilistic Matrix Factorization :\n")
    PMF_var = SVD(biased=False)
    PMF_var.fit(train_data)
    predict_var = PMF_var.test(test_data)
    PMF_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    PMF_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nNon-negative Matrix Factorization :\n")
    NMF_var = NMF()
    NMF_var.fit(train_data)
    predict_var = NMF_var.test(test_data)
    NMF_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    NMF_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nUser based Collaborative Filtering algorithm :\n")
    UB_var = KNNBasic(sim_options={'user_based': True})
    UB_var.fit(train_data)
    predict_var = UB_var.test(test_data)
    user_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    user_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nItem based Collaborative Filtering algorithm :\n")
    IB_var = KNNBasic(sim_options={'user_based': False})
    IB_var.fit(train_data)
    predict_var = IB_var.test(test_data)
    item_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    item_MAE_var = accuracy.mae(predict_var, verbose=True)
    print("\n")

    return SVD_RMSE_var, SVD_MAE_var, PMF_RMSE_var, PMF_MAE_var, NMF_RMSE_var, NMF_MAE_var, user_RMSE_var, user_MAE_var, item_RMSE_var, item_MAE_var
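A usage sketch for algoFunc, assuming the SVD/NMF/KNNBasic/accuracy imports used inside it are already in scope:

from surprise import Dataset
from surprise.model_selection import train_test_split

data = Dataset.load_builtin('ml-100k')
train_data, test_data = train_test_split(data, test_size=0.2)
metrics = algoFunc(train_data, test_data)
print(metrics)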
Example #13
def train_algo(this_data):
    """
    Fit a Non-negative Matrix Factorization algo to the data.

    Args:
        this_data - surprise.dataset; the loaded json data.

    Returns:
        predictions - surprise library object; all predictions generated by algo.  
    """
    print("Running algo...")
    trainset = this_data.build_full_trainset()

    NMF_algo = NMF(biased=False, n_epochs=50, n_factors=35)

    NMF_algo.fit(trainset)

    testset = trainset.build_anti_testset()

    predictions = NMF_algo.test(testset)

    print("Getting predictions...")

    return predictions
Example #14
class NMF_Cosine_Recommender:
    """[summary]
       @author Will Jobs
    """
    def __init__(self,
                 df_users,
                 df_movies,
                 df_ratings,
                 df_movie_lens_tags,
                 biased=False):
        """[summary]

        Args:
            df_users ([type]): [description]
            df_movies ([type]): [description]
            df_ratings ([type]): [description]
            df_movie_lens_tags ([type]): [description]
            biased
        """
        self.users = df_users
        self.movies = df_movies
        self.ratings = df_ratings
        self.ml_tags = df_movie_lens_tags
        self.biased = biased
        self.trained_nmf = False
        self.preprocessed = False
        self.trained_cosine = False
        self.cv_score = None
        self.cv_fit_time = None
        self.movies_merged = pd.DataFrame()
        self.nmf_predictions = pd.DataFrame()
        self.tfidf_matrix = None
        self.algo = None
        self.W = None
        self.H = None

    def preprocess_tags(self, verbose=True):
        """[summary]

        Args:
            verbose (bool, optional): [description]. Defaults to True.
            seed ([type], optional): [description]. Defaults to None.
        """
        if self.preprocessed:  # only do this once
            return

        if verbose:
            print('Preprocessing tags and movie information...', end='')

        self.ml_tags.rename(columns={
            'userId': 'userID',
            'movieId': 'movieID'
        },
                            inplace=True)
        self.ml_tags = self.ml_tags.astype({'tag': str})

        tmp_tags = self.ml_tags.copy()
        tmp_movies = self.movies.copy()

        # replace punctuation in tags (a space), movie name (a space), and genres (no space). These will eventually be folded into the tags list
        # doing it this way to avoid altering the original tags during presentation later
        tmp_tags['new_tag'] = tmp_tags.tag.str.replace(r'[^\w\s]', ' ')
        tmp_movies['new_name'] = tmp_movies.name.str.replace(r'[^\w\s]', ' ')
        tmp_movies['new_genre1'] = tmp_movies.genre1.str.replace(
            r'[^\w\s]', '')
        tmp_movies['new_genre2'] = tmp_movies.genre2.str.replace(
            r'[^\w\s]', '')
        tmp_movies['new_genre3'] = tmp_movies.genre3.str.replace(
            r'[^\w\s]', '')

        # aggregate all users' tags up per movie
        tags_nostrip = tmp_tags.groupby('movieID').tag.apply(
            ' '.join).reset_index()
        tags_nostrip.rename(columns={'tag': 'tags'}, inplace=True)
        tags_strip = tmp_tags.groupby('movieID').new_tag.apply(
            ' '.join).reset_index()
        tags_strip = tags_nostrip.merge(tags_strip, on='movieID')

        # merge name, genres, and tags together
        self.movies_merged = tmp_movies.merge(tags_strip,
                                              on='movieID',
                                              how='left')
        self.movies_merged['tags_strip'] = self.movies_merged.apply(
            lambda x: '{} {} {} {} {}'.format(
                x['new_name'], x['new_genre1'], x['new_genre2']
                if type(x['new_genre2']) != float else "", x['new_genre3']
                if type(x['new_genre3']) != float else "", x['new_tag']),
            axis=1)
        self.movies_merged.drop(columns=[
            'new_tag', 'new_name', 'new_genre1', 'new_genre2', 'new_genre3'
        ],
                                inplace=True)

        # merge in the combined tags (with punctuation)
        self.movies = self.movies.merge(tags_nostrip, on='movieID', how='left')

        self.preprocessed = True

        if verbose:
            print('Done')

    def train_cosine_similarity(self, seed=None, verbose=True):
        """[summary]

        Args:
            seed ([type], optional): [description]. Defaults to None.
            verbose (bool, optional): [description]. Defaults to True.

        Raises:
            RuntimeError: [description]
        """
        if not self.preprocessed:
            raise RuntimeError(
                'Cannot train cosine similarity until preprocessing is done (via preprocess_tags)'
            )

        if self.trained_cosine:  # only do this once
            return

        if seed is not None:
            random.seed(seed)

        vectorizer = TfidfVectorizer(stop_words='english', min_df=3)

        if verbose:
            print('Cosine similarity training...', end='')

        self.tfidf_matrix = vectorizer.fit_transform(
            self.movies_merged['tags_strip'])
        self.trained_cosine = True

        if verbose:
            print('Done')

    def run_nmf(self,
                n_factors=15,
                run_cross_validation=True,
                cv_metric='RMSE',
                seed=None,
                verbose=True):
        """[summary]

        Args:
            n_factors (int, optional): [description]. Defaults to 15.
            run_cross_validation (bool, optional): [description]. Defaults to True.
            cv_metric (str, optional): [description]. Defaults to 'RMSE'.
            seed ([type], optional): [description]. Defaults to None.
            verbose (bool, optional): [description]. Defaults to True.
        """

        # ratings get clipped from 1 to 5
        reader = Reader(rating_scale=(1.0, 5.0))
        data = Dataset.load_from_df(self.ratings, reader)

        # first, calculate CV on a fraction of the dataset
        if run_cross_validation:
            if verbose:
                print('Running cross-validation...', end='')

            if seed is not None:
                random.seed(seed)

            algo = NMF(n_factors=n_factors,
                       biased=self.biased,
                       random_state=seed)
            cv_results = cross_validate(algo,
                                        data,
                                        measures=['RMSE'],
                                        cv=5,
                                        verbose=False)
            avg_cv_result = pd.DataFrame.from_dict(cv_results).mean(axis=0)
            self.cv_score = avg_cv_result['test_' + cv_metric.lower()]
            self.cv_fit_time = avg_cv_result['fit_time']

            if verbose:
                print('Done')
                print('Average CV score: {}\nAverage fit time: {} seconds'.
                      format(round(self.cv_score, 4),
                             round(self.cv_fit_time, 4)))

        if seed is not None:
            random.seed(seed)

        # ratings must have 3 cols: users, items, ratings (in that order)
        train_set = data.build_full_trainset()

        self.algo = NMF(n_factors=n_factors,
                        biased=self.biased,
                        random_state=seed)

        if verbose:
            print('NMF Fitting...', end='')

        self.algo.fit(train_set)

        self.W = self.algo.pu
        self.H = np.transpose(self.algo.qi)

        # get predictions for *every* user/movie combo. These will be also compared to the actual ratings
        if verbose:
            print('Done')
            print('Generating all user-movie pairs for predictions...', end='')

        all_pairs = [(x, y, 0) for x in self.users.userID
                     for y in self.movies.movieID]

        # getting predictions for ALL user/movie combos
        # took 40 seconds on 3.4 million rows
        if verbose:
            print('Done')
            print('Calculating predictions on all user-movie pairs...', end='')

        all_preds = self.algo.test(all_pairs)
        all_preds = pd.DataFrame([{
            'userID': y.uid,
            'movieID': y.iid,
            'nmf_prediction': y.est
        } for y in all_preds])

        self.nmf_predictions = all_preds.merge(self.ratings,
                                               on=['userID', 'movieID'],
                                               how='left')
        self.nmf_predictions = self.nmf_predictions[[
            'userID', 'movieID', 'rating', 'nmf_prediction'
        ]]
        self.trained_nmf = True

        if verbose:
            print('Done')

    def train(self,
              n_factors=15,
              run_cross_validation=True,
              seed=None,
              verbose=True):
        """[summary]

        Args:
            n_factors (int, optional): [description]. Defaults to 15.
            run_cross_validation (bool, optional): [description]. Defaults to True.
            seed ([type], optional): [description]. Defaults to None.
            verbose (bool, optional): [description]. Defaults to True.
        """
        self.preprocess_tags(verbose=verbose)
        self.train_cosine_similarity(seed=seed, verbose=verbose)
        self.run_nmf(n_factors=n_factors,
                     run_cross_validation=run_cross_validation,
                     seed=seed,
                     verbose=verbose)

    def get_similar_movies(self, movieID, number_of_movies=None, verbose=True):
        """[summary]

        Args:
            movieID ([type]): [description]
            verbose (bool, optional): [description]. Defaults to True.

        Raises:
            RuntimeError: [description]

        Returns:
            [type]: [description]
        """
        if not (self.preprocessed and self.trained_cosine):
            raise RuntimeError(
                'Cannot make recommendations without training NMF, preprocessing, and training cosine first.'
            )

        # get the index of the movie
        idx = np.where(self.movies_merged.movieID == movieID)[0][0]

        if verbose:
            print('Getting similar movies to ' +
                  self.movies_merged.iloc[idx]['name'] + '...',
                  end='')

        y = cosine_similarity(self.tfidf_matrix[idx], self.tfidf_matrix)
        idx_scores = pd.DataFrame(
            [(idx, score)
             for (idx, score) in enumerate(list(y[0])) if score > 0],
            columns=['idx', 'similarity'])

        result = pd.concat([
            self.movies_merged.iloc[idx_scores.idx].reset_index(), idx_scores
        ],
                           axis=1).sort_values(by='similarity',
                                               ascending=False)

        # get rid of transformed columns from movies_merged (except tag), and get the *original* name and genres with punctuation
        result.drop(columns=[
            x for x in [*self.movies_merged.columns, 'index', 'idx']
            if x != 'movieID'
        ],
                    inplace=True)
        result = result.merge(self.movies, on='movieID', how='left')
        result = result[[
            'movieID', 'name', 'year', 'genre1', 'genre2', 'genre3', 'tags',
            'similarity'
        ]]

        if verbose:
            print('Done')

        # don't include the movie we're finding similarities for
        if number_of_movies is not None:
            return result[1:].head(number_of_movies)
        else:
            return result[1:]

    def get_recommendations(self,
                            userID,
                            number_of_recs=5,
                            seed=None,
                            show_user_likes=True,
                            verbose=True):
        """[summary]
        Algorithm:
        1. Get 20 of the users' top ratings. Start with 5s, if > 20 exist, sample 20 randomly.
        2. If fewer than 20 5s exist, sample 4s until get to 20 (or use up all 4s).
            - If there are no 5s or 4s, ignore the user's ratings, and just
              return the <number_of_recs> top predicted ratings for this user. Done.
        3. For each movie in the top list, calculate cosine similarity, and get the 10 most-similar
           movies which the user has NOT seen.
        4. Combine the 20 most-similar lists of 10 movies into a single list.
        5. Remove duplicates from this list, choosing the highest-similarity achieved
        6. For each movie, look up the predicted rating for this user.
        7. Multiply each movie's similarity times the predicted rating.
        8. Return the top <number_of_recs> predicted movies (or all if not enough). Done.

        Args:
            userID ([type]): [description]
            number_of_recs (int, optional): [description]. Defaults to 5.
            seed ([type], optional): [description]. Defaults to None.
            verbose (bool, optional): [description]. Defaults to True.

        Returns:
            pandas DataFrame: expected ratings. Columns: movieID, name, genres, weighted_rating
        """
        MAX_CONSIDERED_RATINGS = 20
        CONSIDER_N_SIMILAR = 10

        def combine_genres(df):
            # combine genres into a single column. Note that NaNs parse as float during apply
            df['genres'] = df.apply(lambda row: (row['genre1'] if not type(row['genre1'])==float else "") + \
                                                ("/" + row['genre2'] if not type(row['genre2'])==float else "") + \
                                                ("/" + row['genre3'] if not type(row['genre3'])==float else ""), axis=1)
            df.drop(columns=['genre1', 'genre2', 'genre3'], inplace=True)

        def get_subset_ratings():
            if verbose:
                print("Getting user's highest rated movies to start from...",
                      end='')

            all_5s = self.ratings[(self.ratings.userID == userID)
                                  & (self.ratings.rating == 5)]

            if len(all_5s) >= MAX_CONSIDERED_RATINGS:
                subset_ratings = all_5s.sample(MAX_CONSIDERED_RATINGS,
                                               random_state=seed)
            else:
                # use all 5s, and add in 4s until we have <MAX_CONSIDERED_RATINGS>
                subset_ratings = all_5s.copy()
                all_4s = self.ratings[(self.ratings.userID == userID)
                                      & (self.ratings.rating == 4)]
                count_needed = MAX_CONSIDERED_RATINGS - len(all_5s)
                subset_ratings = pd.concat([
                    subset_ratings,
                    all_4s.sample(min(count_needed, len(all_4s)),
                                  random_state=seed)
                ],
                                           ignore_index=True)

            subset_ratings = subset_ratings.merge(
                self.movies[['movieID', 'name']], on='movieID')

            if verbose:
                print('Done')

            return subset_ratings[['userID', 'movieID', 'name', 'rating']]

        def get_most_similar_movies(subset_ratings):
            if verbose:
                print("Finding similar movies to {} movies the user liked...".
                      format(len(subset_ratings)))

            seen_movies = list(
                self.ratings[(self.ratings.userID == userID)].movieID)
            similar_movies = pd.DataFrame()

            for movie in subset_ratings.movieID:
                tmp_similar = self.get_similar_movies(movie, verbose=verbose)

                # limit to movies the user hasn't seen, and limit to top <CONSIDER_N_SIMILAR>
                tmp_similar = tmp_similar[~tmp_similar['movieID'].isin(
                    seen_movies)].head(CONSIDER_N_SIMILAR)
                tmp_similar['similar_to'] = subset_ratings[
                    subset_ratings['movieID'] == movie].name.values[0]
                similar_movies = pd.concat([similar_movies, tmp_similar],
                                           ignore_index=True)

            # now remove duplicates, and get the top similarity for each movie
            similar_movies.sort_values(by='similarity',
                                       ascending=False,
                                       inplace=True)
            similar_movies.drop_duplicates(subset='movieID',
                                           keep='first',
                                           inplace=True)

            return similar_movies

        if not (self.trained_nmf and self.preprocessed
                and self.trained_cosine):
            raise RuntimeError(
                'Cannot make recommendations without training NMF, preprocessing, and training cosine first.'
            )

        if userID not in self.users.userID.values:
            raise ValueError(
                'User {} does not exist in ratings dataset. If this is a new user, create a new user using the average ratings.'
                .format(userID))

        if seed is not None:
            random.seed(seed)

        review_counts = self.ratings[self.ratings.userID ==
                                     userID].rating.value_counts()

        if review_counts.get(5, 0) + review_counts.get(4, 0) == 0:
            # ignore user's ratings, and just get the user's top <number_of_recs> ratings
            if verbose:
                print(
                    "User has no ratings >= 4. Ignoring user's ratings, returning top predicted ratings."
                )

            # get only predicted ratings for ones the user hasn't seen
            subset_ratings = self.nmf_predictions.loc[
                (self.nmf_predictions.userID == userID)
                & (self.nmf_predictions.rating.isna())].copy()
            subset_ratings = subset_ratings.merge(self.movies, on='movieID')
            combine_genres(subset_ratings)

            # add in columns that would have been calculated
            subset_ratings['similar_to'] = ""
            subset_ratings['similarity'] = np.nan
            subset_ratings['weighted_rating'] = subset_ratings[
                'nmf_prediction']

            # reorder columns
            subset_ratings = subset_ratings[[
                'movieID', 'name', 'year', 'genres', 'tags', 'similar_to',
                'similarity', 'nmf_prediction', 'weighted_rating'
            ]]
            subset_ratings.sort_values(by='nmf_prediction',
                                       ascending=False,
                                       inplace=True)

            return subset_ratings.head(number_of_recs)

        # get up to <MAX_CONSIDERED_RATINGS> 5s
        subset_ratings = get_subset_ratings()

        if show_user_likes:
            print('\n---------------\nHighest-reviewed movies for userID {}:'.
                  format(userID))
            print(subset_ratings)
            print('\n---------------\n')

        # get the similarity for each movie in subset_ratings
        similar_movies = get_most_similar_movies(subset_ratings)

        # now we have the similarity scores for the movies most like the movies the user rated highest
        # get the predicted ratings, and multiply those by the similarity scores
        if verbose:
            print(
                "Getting user's predicted ratings and calculated expected rating...",
                end='')

        user_predictions = self.nmf_predictions[self.nmf_predictions['userID']
                                                == userID]
        similar_movies = similar_movies.merge(user_predictions,
                                              on='movieID',
                                              how='inner')
        similar_movies['weighted_rating'] = similar_movies[
            'similarity'] * similar_movies['nmf_prediction']

        if verbose:
            print('Done')
            print("Finalizing output...", end='')

        # combine genres and reorder columns
        combine_genres(similar_movies)
        similar_movies = similar_movies[[
            'movieID', 'name', 'year', 'genres', 'tags', 'similar_to',
            'similarity', 'nmf_prediction', 'weighted_rating'
        ]]
        similar_movies.sort_values(by='weighted_rating',
                                   ascending=False,
                                   inplace=True)

        if verbose:
            print('Done')

        return similar_movies.head(number_of_recs)
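A usage sketch for NMF_Cosine_Recommender, assuming the four input DataFrames follow the column names used above (userID, movieID, rating, name, genre1-3, tag):

recommender = NMF_Cosine_Recommender(df_users, df_movies, df_ratings, df_movie_lens_tags)
recommender.train(n_factors=15, run_cross_validation=False, seed=42, verbose=True)
print(recommender.get_recommendations(userID=1, number_of_recs=5))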
Example #15
all_predictions = {}
all_predictions['uid'] = {}
all_predictions['uid']['iid'] = 'est'

for uid, iid, true_r, est, _ in predictions:
    # setdefault keeps earlier items for the same user instead of resetting the dict
    all_predictions.setdefault(uid, {})[iid] = est
    # all_predictions[uid] =

svd_movie_matrix = movie_matrix.fillna(all_predictions[uid][iid])

# NMF - non negative matrix factorization
nmfAlgo = NMF()
nmfAlgo.fit(trainset)
nmf_predictions = nmfAlgo.test(testset)
# print(nmf_predictions)

all_nmf_predictions = {}
all_nmf_predictions['uid'] = {}
all_nmf_predictions['uid']['iid'] = 'est'

for uid, iid, true_r, est, _ in nmf_predictions:
    all_nmf_predictions.setdefault(uid, {})[iid] = est

nmf_movie_matrix = movie_matrix.fillna(all_nmf_predictions[uid][iid])
# evaluate(algo, data, measures=['RMSE'])

if __name__ == '__main__':
    app.run(debug=True)
Example #16
    for i in range(len(testset)):
        if (testset[i][1] not in to_delete):
            trimmed_testset.append(testset[i])
    total_trimmed.append(trimmed_testset)

rmse_mean = []

for i in range(2, 52, 2):
    print("%d---------------------------------" % i)
    algo = NMF(n_factors=i, random_state=1)

    fold_rmse = []
    for j in range(0, len(total_train)):

        algo.fit(total_train[j])

        predictions = algo.test(total_trimmed[j])

        # Compute and print Root Mean Squared Error for this fold
        fold_rmse.append(accuracy.rmse(predictions, verbose=True))
    rmse_mean.append(np.mean(fold_rmse))

rmse_mean_array = np.asarray(rmse_mean)

print('============================================')
print('Question 19:')
fig = plt.figure()
plt.plot(range(2, 52, 2), rmse_mean, 'b-')
plt.xlabel('K Value')
plt.ylabel('RMSE')
plt.show()
Example #17
data.raw_ratings = train_raw_ratings

# Finds best parameters for NMF model with bias
# Scores using MSE
params = {"biased": [True], "n_factors": np.arange(2, 12, 2)}
nmf = GridSearchCV(NMF, params, measures=["mse"], cv=3)
nmf.fit(data)

print("\nBest number of factors found:", nmf.best_params['mse']['n_factors'])

# Trains NMF using the best parameters found
best_nmf = NMF(biased=True, n_factors=nmf.best_params['mse']['n_factors'])
best_nmf.fit(data.build_full_trainset())

# Tests on training set
predictions = best_nmf.test(data.build_full_trainset().build_testset())
mse = accuracy.mse(predictions, verbose=False)
print("Training Set MSE:", mse)

# Scores on test set
predictions = best_nmf.test(data.construct_testset(test_raw_ratings))
mse = accuracy.mse(predictions, verbose=False)
print("Test Set MSE:", mse)

# Checks Recommendations
# -----------------------
recs = defaultdict(list)  # List of recommendations for each user
num_recs = 5  # Number of recommendations to get for each user
for uid, iid, true_r, est, _ in predictions:
    recs[uid].append((iid, est))
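The candidate lists in recs are unsorted at this point; a small follow-up sketch that keeps only the num_recs highest-estimate items per user:

for uid in recs:
    recs[uid].sort(key=lambda pair: pair[1], reverse=True)
    recs[uid] = recs[uid][:num_recs]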
Example #18
def compute_recommendations():
    #connecting to the database
    # engine = create_engine("mysql://*****:*****@localhost/ratingsx?charset=utf8", echo=True)
    engine = create_engine(config.DB_URI, echo=True)
    session = scoped_session(
        sessionmaker(bind=engine, autocommit=False, autoflush=False))

    blockPrint()

    #reading in the database
    df_ratings = pd.read_sql('SELECT * FROM ratings;', con=engine)
    df_ratings = df_ratings[['user_id', 'item_id', 'rating']]
    df_ratings = df_ratings.dropna()
    df_ratings = df_ratings.drop_duplicates()

    #formatting the dataset using the surprise library
    reader = Reader(line_format='user item rating',
                    sep=',',
                    rating_scale=(1, 5))
    data = Dataset.load_from_df(df_ratings, reader=reader)
    training_set = data.build_full_trainset()

    algorithm = NMF()  # use non-negative matrix factorization

    algorithm.fit(training_set)  # fit the model to the training data
    testing_set = training_set.build_anti_testset()
    predictions = algorithm.test(testing_set)  # make prediction

    #writing the function for top predictions
    def get_top_n(predictions, n=10):
        #     Return the top-N recommendation for each user from a set of predictions.

        #     Args:
        #         predictions(list of Prediction objects): The list of predictions, as
        #             returned by the test method of an algorithm.
        #         n(int): The number of recommendation to output for each user. Default
        #             is 10.

        #     Returns:
        #     A dict where keys are user (raw) ids and values are lists of tuples:
        #         [(raw item id, rating estimation), ...] of size n.

        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]

        return top_n
# getting the top 10 predictions
    top_n = get_top_n(predictions, n=10)

    # Print the recommended items for each user
    a = []
    for uid, user_ratings in top_n.items():
        a.append([uid, [iid for (iid, _) in user_ratings]])
    df_list_pred = pd.DataFrame.from_records(a, columns=['A', 'B'])

    df_user = pd.DataFrame(df_list_pred.A.values.tolist())
    df_pred = pd.DataFrame(df_list_pred.B.values.tolist())

    df_pred.columns = [
        'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6', 'pred_7',
        'pred_8', 'pred_9', 'pred_10'
    ]

    df_items = pd.read_sql('SELECT * FROM items;', con=engine)

    # df_pred = df_pred.applymap(lambda x: df_items.loc[x, 'title'])
    df_pred[['id']] = df_user
    df_pred = df_pred[[
        'id', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6',
        'pred_7', 'pred_8', 'pred_9', 'pred_10'
    ]]

    df_pred['id'] = df_pred['id'].astype(int)

    # Append recommendations
    df_pred.to_sql('recommendations', engine, if_exists='append',
                   index=False)  #if_exists='append'
    session.commit()

    #logging the predictions
    df_log = df_pred
    df_log['algorithm'] = 'NMF'
    df_log = df_log.rename(columns={'id': 'user_id'})
    df_log = df_log[[
        'user_id', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6',
        'pred_7', 'pred_8', 'pred_9', 'pred_10', 'algorithm'
    ]]

    df_log.to_sql('predictionlogs', engine, if_exists='append',
                  index=False)  #if_exists='append'
    session.commit()

    global mae2
    global rmse2
    mae2 = accuracy.mae(predictions)
    rmse2 = accuracy.rmse(predictions)
    mae2 = float(mae2)
    rmse2 = float(rmse2)
Example #19
thres = 3
file_path = os.path.expanduser('ratings.csv')
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data_raw = Dataset.load_from_file(file_path, reader=reader)
train, test = train_test_split(data_raw, test_size=0.1)

sim_options = {'name': 'pearson', 'user_based': True}
algo1 = KNNWithMeans(k = 24, sim_options=sim_options) # !!!!!! need to revise after part 4 done
algo2 = NMF(n_factors = 18, random_state = 1)
algo3 = SVD(n_factors = 16, random_state = 1, biased=True)
algo1.fit(train)
algo2.fit(train)
algo3.fit(train)

predictions1 = algo1.test(test)
predictions2 = algo2.test(test)
predictions3 = algo3.test(test)

target1 = []
for i in range(len(test)):
    target1.append(test[i][2])

for t in range(len(target1)):
    if target1[t] > thres:
        target1[t]=1
    else:
        target1[t]=0

target2 = []
for i in range(len(test)):
    target2.append(test[i][2])
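The binarized targets are presumably meant for threshold-based evaluation; a sketch of how they could feed an ROC curve, assuming sklearn is available and using the NMF estimates as scores:

from sklearn.metrics import roc_curve, auc

scores_nmf = [pred.est for pred in predictions2]     # estimated ratings from NMF
fpr, tpr, _ = roc_curve(target1, scores_nmf)
print('NMF AUC:', auc(fpr, tpr))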
Example #20
# define a cross-validation iterator
kf = KFold(n_splits=10)

for k in k_values:
    algo = NMF(n_factors=k)
    this_k_RMSE = list()
    this_k_MAE = list()
    # use cross-validation iterator, perform CV manually
    for trainset, testset in kf.split(data):
        # fit the whole trainset
        algo.fit(trainset)

        #trim testset here
        testset_df = pd.DataFrame(testset,
                                  columns=['userId', 'movieId', 'rating'])
        testset_popular = testset_df.groupby("movieId").filter(
            lambda x: len(x) > 2).values.tolist()
        # testset_unpopular = testset_df.groupby("movieId").filter(lambda x: len(x) <= 2).values.tolist()
        # testset_highvariance =testset_df.groupby("movieId").filter(lambda x: np.var(x['rating'])>=2 and len(x)>=5 ).values.tolist()

        predictions = algo.test(testset_popular)
        this_k_RMSE.append(accuracy.rmse(predictions, verbose=True))

    RMSE.append(np.mean(this_k_RMSE))

plt.figure(figsize=[6, 5]).set_tight_layout(True)
plt.plot(k_values, RMSE, label='RMSE')
plt.xlabel('k')
plt.legend(loc="upper right")
Example #21
# Load the movielens-1m dataset  UserID::MovieID::Rating::Timestamp
data = Dataset.load_builtin('ml-1m')
trainset, testset = train_test_split(data, test_size=.15)

# Set up the algorithm. K = number of neighbors. Name = type of similarity measure. User based = user-based or item-based filtering.

algoritmo = NMF(n_epochs=5)

algoritmo.fit(trainset)

# Select the user and the movie to analyze
# User 49: between 18 and 24 years old, a programmer living in Houston, Texas
uid = str(49)
# Movie already seen and rated: Negotiator, The (1998)::Action|Thriller. Rating: 4
iid = str(2058)  # raw item id

# get a prediction for specific users and items.
pred = algoritmo.predict(uid, iid, r_ui=4, verbose=True)

# run the trained model against the testset
test_pred = algoritmo.test(testset)

# Evaluate RMSE
print("RMSE evaluation: ")
accuracy.rmse(test_pred, verbose=True)

# Evaluate MAE
print("MAE evaluation: ")
accuracy.mae(test_pred, verbose=True)
Example #22
        print("KNNMean")
        knnmean = KNNWithMeans(sim_options={"name": "cosine"})
        knnmean.fit(trainset)
        knnmean_predictions = knnmean.test(testset)
        results = get_group_measures(preds_all=knnmean_predictions,
                                     U1=U1_users,
                                     U2=U2_users,
                                     U3=U3_users,
                                     U4=U3_users)
        knnmean_results.append(results)

        print("NMF")
        nmf = NMF()
        nmf.fit(trainset)
        nmf_predictions = nmf.test(testset)
        results = get_group_measures(preds_all=nmf_predictions,
                                     U1=U1_users,
                                     U2=U2_users,
                                     U3=U3_users,
                                     U4=U3_users)
        nmf_results.append(results)
        """print("TOP")
        top = TOP()
        top.fit(trainset)
        top_predictions = top.test(testset)
        results = get_group_measures(preds_all=top_predictions, U1=U1_users, U2=U2_users, U3=U3_users, U4=U4_users)
        top_results.append(results)

        print("NormalPredictor")
        rand = NormalPredictor()
Example #23
def matrix_factorize_for_latent_feats():
    """
    Function employs matrix factorization (NMF) of user scores for the episodes they've seen. Typically, this type of recommender algorithm aims to predict reviewer scores for episodes that users *haven't* seen, with user and episode latent features as a byproduct. This function will return a dataframe of episodes strongest in each latent feature instead.

    Inputs:
    ---
    None as yet. This code should just run once the function is called.

    Output:
    ---
    *matfact_lf_recs: Pandas DataFrame; DF containing the top episodes for each latent feature (as determined by the algorithm)
    """

    # First, we must build our reviewer DFs from which we will factorize user ratings:
    rev_rvr_df = pd.read_pickle("data/combined_rev_rvr_df.pkl.bz2")

    # Drop duplicate values in reviews and reviewer DF:
    rev_rvr_df.drop_duplicates(inplace=True)

    summary_df = pd.read_pickle("data/combined_info_rev_df.pkl.bz2")

    # Here, we create dictionaries for episode IDs and reviewer IDs to be applied to the recommendation DFs we'll be creating below:
    id_ep = dict(enumerate(rev_rvr_df["ep_title"].unique()))
    ep_id = dict((values, keys) for keys, values in id_ep.items())
    id_rvr = dict(enumerate(rev_rvr_df["reviewer"].unique()))
    rvr_id = dict((values, keys) for keys, values in id_rvr.items())

    # Append reviewer and episode IDs to reviews DF using the dictionaries created above:
    rev_rvr_df["rvr_id"] = None
    rev_rvr_df["ep_id"] = None
    rev_rvr_df.loc[:, 'ep_id'] = rev_rvr_df["ep_title"].map(
        lambda x: ep_id.get(x, np.nan))
    rev_rvr_df.loc[:, "rvr_id"] = rev_rvr_df["reviewer"].map(
        lambda x: rvr_id.get(x, np.nan))

    # Fill empty reviewer scores with mean values so we can create our utility matrix and so the NMF algo can work without throwing errors:
    rev_rvr_df["rvr_rating"].fillna(value=rev_rvr_df["rvr_rating"].mean(),
                                    inplace=True)

    # Creating reader object to read in our DF. This is a necessary step to create the Dataset object from our DF that the Surprise library likes to work with:
    reader = Reader(rating_scale=(1.0, 10.0))
    data = Dataset.load_from_df(rev_rvr_df[["rvr_id", "ep_id", "rvr_rating"]],
                                reader=reader)

    # Splitting our data into training and testing sets for the algorithm:
    trainset, testset = train_test_split(data, test_size=0.2)

    # Creating and fitting our NMF algorithm with optimal hyperparameters (per gridsearching at a previous date):
    optimal_nmf = NMF(n_factors=20,
                      n_epochs=100,
                      reg_bi=0.15,
                      reg_qi=0.3,
                      lr_bu=0.005,
                      lr_bi=0.001,
                      biased=True,
                      verbose=False)
    optimal_nmf.fit(trainset)

    # Brief aside to check our error metrics, for those curious:
    predicted_nmf = optimal_nmf.test(testset)
    print(accuracy.mae(predicted_nmf))
    print(accuracy.rmse(predicted_nmf))

    # Now we create a DF for ALL episodes and their latent feature scores in the columns:
    latent_feat = pd.DataFrame(optimal_nmf.qi)

    # Append episode titles to the big latent_feat DF using the id_ep dictionary we created above (as a reminder: that's the dictionary where keys are episode indices in the big latent_feat DF and vals are episode titles):
    latent_feat["ep_title"] = pd.Series(id_ep)

    # Creating a list of episodes that have the highest value for each latent feature so that we can create a separate DF of just these episodes:
    max_lf_vals = list(latent_feat.iloc[:, :-1].max())

    # Here, we create a new DF to contain ONLY the episodes that are highest in each latent feature:
    max_lf_df = pd.DataFrame(columns=latent_feat.columns)

    for col_idx, vals in enumerate(max_lf_vals):
        temp_df = pd.DataFrame(latent_feat[latent_feat.loc[:,
                                                           col_idx] == vals])
        max_lf_df = pd.concat([max_lf_df, temp_df])

    # Appending series-season-episode identifiers, IMDb user rating, and episode titles from the summary_df by merging on episode titles (now that both have ep titles):
    max_lf_df = max_lf_df.merge(summary_df[[
        "series", "season", "episode", "ep_title", "identifier",
        "IMDB_user_rating"
    ]],
                                on="ep_title")
    matfact_lf_recs = max_lf_df.loc[:, [
        "series", "season", "episode", "ep_title", "identifier",
        "IMDB_user_rating"
    ]]

    return matfact_lf_recs
Example #24
#knnoptm=30
#nnmfoptm=20
#mfoptm=8
trainset, testset = train_test_split(data, test_size=0.1)

#def make_prediction(filter_type):
#if filter_type=='KNN':
#sim_options={'name': 'pearson','user_based':True}
algo = NMF(n_factors=20)
#elif filter_type='NNMF':
#filter_=NMF(n_factors=20)
#elif filter_type='MFbias':
#filter_=SVD(n_factors=8,biased=True)
#filter_.fit(trainset)
algo.fit(trainset)
predictions = algo.test(testset)

truth_table = np.array([prediction.r_ui for prediction in predictions])
truth_table_copy = truth_table.copy()
truth_table_copy[truth_table < 3] = 0
truth_table_copy[truth_table >= 3] = 1

user_dic = defaultdict(list)
for uid, _, tru_r, est, _ in predictions:
    user_dic[uid].append((est, tru_r))
print('dic created')


def precision_recall(predictions, t):
    precision = dict()
    recall = dict()
class MovieRecommender:
    def __init__(self):
        self._knn = None
        self._nmf = None
        self._trainset = None
        self._predictions = None

        self.initialized = False

    def initialize(self, data_filepath):
        self._data = Dataset.load_from_file(data_filepath,
                                            reader=Reader('ml-100k'))
        self._trainset = self._data.build_full_trainset()

        sim_options = {'name': 'pearson_baseline', 'user_based': False}
        self._knn = KNNBaseline(sim_options=sim_options)
        self._nmf = NMF()

        start_new_thread(self._train)

    def get_similar_movies(self, movie_id, k=10):
        if not self.initialized:
            return []

        model = self._knn

        movie_inner_id = model.trainset.to_inner_iid(movie_id)
        similar_movie_inner_ids = model.get_neighbors(movie_inner_id, k=k)

        to_raw_iid = model.trainset.to_raw_iid
        similar_movie_ids = (to_raw_iid(inner_id)
                             for inner_id in similar_movie_inner_ids)

        movie_ids = [
            similar_movie_id.encode('ascii')
            for similar_movie_id in similar_movie_ids
        ]
        return movie_dataset.get_movies(movie_ids)

    def get_similar_movies_for_user(self, user_id, num_movies=10):
        if not self.initialized:
            return []

        user_id = str(user_id)
        user_predictions = [
            prediction for prediction in self._predictions
            if prediction[0] == user_id
        ]

        sorted_predictions = sorted(user_predictions,
                                    key=lambda x: x.est,
                                    reverse=True)
        top_n_predictions = sorted_predictions[:num_movies]

        similar_movie_ids = (prediction.iid
                             for prediction in top_n_predictions)

        movie_ids = [
            similar_movie_id.encode('ascii')
            for similar_movie_id in similar_movie_ids
        ]
        return movie_dataset.get_movies(movie_ids)

    def update_user_ratings(self, user_id, movie_id, rating):
        if not self.initialized:
            return

        rating = float(rating)

        has_previous_rating = False
        if self._trainset.knows_user(user_id):
            trainset_dict = dict(self._trainset.ur[user_id])
            has_previous_rating = movie_id in trainset_dict

        user_id = str(user_id)
        movie_id = str(movie_id)
        new_rating = (user_id, movie_id, rating, time())
        if has_previous_rating:
            for i, rating in enumerate(self._data.raw_ratings):
                if rating[0] == user_id and rating[1] == movie_id:
                    self._data.raw_ratings[i] = new_rating
                    break
        else:
            self._data.raw_ratings.append(new_rating)

        self._trainset = self._data.build_full_trainset()
        self._train()

    def _train(self):
        self._nmf.train(self._trainset)
        self._knn.train(self._trainset)

        self._predictions = self._nmf.test(self._trainset.build_anti_testset())

        self.initialized = True
def generate_svd_recommendation_df() -> pd.DataFrame:
    # Prepare input DataFrame and algorithm
    score_df = genearte_score_df()
    svd_data = MyDataSet(score_df)
    print (score_df)
    print (svd_data.raw_ratings)
    #Try SVD
    algo_svd = SVD()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo_svd, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the SVD
    algo_svd.fit(full_train_set)
    predictions = algo_svd.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svd = get_top_n(predictions, n=5)
    latent_usr_factor = algo_svd.pu
    latent_item_factor = algo_svd.qi
    user_bias = algo_svd.bu
    item_bias = algo_svd.bi
    recommendation_reportname_df_svd = pd.merge(recommendation_df_svd, df_reports_id, how='left', on='report_id')

    
    #Try SVD++
    algo_svdpp = SVDpp()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo_svdpp, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the SVD++
    algo_svdpp.fit(full_train_set)
    predictions = algo_svdpp.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svdpp = get_top_n(predictions, n=5)
    latent_usr_factor_pp = algo_svdpp.pu
    latent_item_factor_pp = algo_svdpp.qi
    user_bias_pp = algo_svdpp.bu
    item_bias_pp = algo_svdpp.bi
    recommendation_reportname_df_svdpp = pd.merge(recommendation_df_svdpp, df_reports_id, how='left', on='report_id')

    #Try SVD++ with more factors, as the default is 20
    algo_svdpp_mod = SVDpp(n_factors=50, n_epochs=50)
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo_svdpp_mod, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the modified SVD++
    algo_svdpp_mod.fit(full_train_set)
    predictions = algo_svdpp_mod.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    print(score)
    
    #print (recommendation_df)
    
    
    #Try the NMF
    #nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False)
    algo_nmf = NMF()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo_nmf, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the NMF
    algo_nmf.fit(full_train_set)
    predictions = algo_nmf.test(test_set)
    # Then compute RMSE and MAE
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    # Generate recommendation DataFrame
    recommendation_df_nmf = get_top_n(predictions, n=5)
    #print (recommendation_df_nmf)
    latent_usr_factor_nmf = algo_nmf.pu
    latent_item_factor_nmf = algo_nmf.qi
    user_bias_nmf = algo_nmf.bu
    item_bias_nmf = algo_nmf.bi
    recommendation_reportname_df_nmf = pd.merge(recommendation_df_nmf, df_reports_id, how='left', on='report_id')
    sidd_recommendation = recommendation_reportname_df_nmf.loc[recommendation_reportname_df_nmf['user_sso'] == 212568816]
    
    #Try the NMF with non-default parameters
    #nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False)
    algo_nmf_mod = NMF(n_factors=50, n_epochs=50)
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo_nmf_mod, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the modified NMF
    algo_nmf_mod.fit(full_train_set)
    predictions = algo_nmf_mod.test(test_set)
    # Then compute RMSE and MAE
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    # Generate recommendation DataFrame
    recommendation_df_nmf_mod = get_top_n(predictions, n=5)
    #print (recommendation_df_nmf_mod)
    latent_usr_factor_nmf_mod = algo_nmf_mod.pu
    latent_item_factor_nmf_mod = algo_nmf_mod.qi
    user_bias_nmf_mod = algo_nmf_mod.bu
    item_bias_nmf_mod = algo_nmf_mod.bi
    recommendation_reportname_df_nmf_mod = pd.merge(recommendation_df_nmf_mod, df_reports_id, how='left', on='report_id')
    sidd_recommendation = recommendation_reportname_df_nmf_mod.loc[recommendation_reportname_df_nmf_mod['user_sso'] == 212568816]
    
    
    #---------------------------------------------------
    # as per https://bmanohar16.github.io/blog/recsys-evaluation-in-surprise
    # Matrix Factorization Based Algorithms
    svd_cv = cross_validate(algo_svd, svd_data, cv=5, n_jobs=5, verbose=False)
    svdpp_cv = cross_validate(algo_svdpp, svd_data, cv=5, n_jobs=5, verbose=False)
    nmf_cv = cross_validate(algo_nmf, svd_data, cv=5, n_jobs=5, verbose=False)
    svdpp_cv_mod = cross_validate(algo_svdpp_mod, svd_data, cv=5, n_jobs=5, verbose=False)
    nmf_cv_mod = cross_validate(algo_nmf_mod, svd_data, cv=5, n_jobs=5, verbose=False)
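
    # Summary sketch (an addition, not in the original source): collect the mean
    # cross-validated RMSE/MAE of each model into one DataFrame so the five
    # variants can be compared side by side. Note that, as written above, the
    # function never actually returns the DataFrame promised by its annotation.
    cv_results = {
        'SVD': svd_cv,
        'SVD++': svdpp_cv,
        'SVD++ (50 factors)': svdpp_cv_mod,
        'NMF': nmf_cv,
        'NMF (50 factors)': nmf_cv_mod,
    }
    comparison_df = pd.DataFrame({
        name: {'RMSE': res['test_rmse'].mean(), 'MAE': res['test_mae'].mean()}
        for name, res in cv_results.items()
    }).T
    print(comparison_df)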
示例#27
0
from surprise import NMF
from surprise.model_selection import train_test_split
from surprise import Dataset
from surprise import Reader

reader = Reader(line_format='user item rating', sep=',', rating_scale=(0, 10))
data = Dataset.load_from_file('user_ratings1.data', reader=reader)
data.raw_ratings

trainset, testset = train_test_split(data, test_size = .7, train_size = .3, random_state = 1)

algo = NMF(n_factors=25, n_epochs=200, random_state=1)
#trainset = data.build_full_trainset()
algo.fit(trainset)


from surprise import accuracy
pred = algo.test(testset)

accuracy.rmse(pred), accuracy.mae(pred)


for i in range(20):
    print('user: %-10s      item: %-10s    r_ui: %-10s    est: %.2f     %-10s' % (pred[i].uid, pred[i].iid, pred[i].r_ui , pred[i].est , pred[i][-1]))
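
# A follow-up sketch (an addition, not in the original snippet): group the test-set
# predictions into a per-user top-N list, the same idea as the get_top_n helper
# from the Surprise documentation.
from collections import defaultdict

def top_n_from_predictions(predictions, n=10):
    """Return {raw user id: [(raw item id, estimated rating), ...]}, best first."""
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

top10 = top_n_from_predictions(pred, n=10)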
    
import pandas as pd   # needed for the CSV loading in surpriseNMF below
import numpy as np    # needed for the SVD-based projection in visualization mode


def surpriseNMF(mode,
                DataPath='../data/data_clean.txt',
                TrainPath='../data/train_clean.txt',
                TestPath='../data/test_clean.txt',
                n_factors=15,
                n_epochs=50,
                reg_pu=0.06,
                reg_qi=0.06,
                reg_bu=0.02,
                reg_bi=0.02,
                lr_bu=0.005,
                lr_bi=0.005,
                init_low=0,
                init_high=1,
                biased=False,
                verbose=True):

    # We need the rating scale.
    reader = Reader(rating_scale=(1, 5))

    if mode == 'evaluation':

        # train data processing
        train = pd.read_csv(TrainPath, sep="\t", header=None)
        train.columns = ["User Id", "Movie Id", "Rating"]
        data = Dataset.load_from_df(train[["User Id", "Movie Id", "Rating"]],
                                    reader)
        trainset = data.build_full_trainset()

        # fit model
        algo = NMF(n_factors=n_factors,
                   n_epochs=n_epochs,
                   reg_pu=reg_pu,
                   reg_qi=reg_qi,
                   reg_bu=reg_bu,
                   reg_bi=reg_bi,
                   lr_bu=lr_bu,
                   lr_bi=lr_bi,
                   init_low=init_low,
                   init_high=init_high,
                   biased=biased,
                   verbose=verbose)
        algo.fit(trainset)

        # evaluate train error
        test = trainset.build_testset()
        predictions = algo.test(test)
        train_err = accuracy.rmse(predictions, verbose=False)

        # test data processing (load the test ratings and build a "trainset"
        # object from them so it can be turned into a testset of known ratings)
        test = pd.read_csv(TestPath, sep="\t", header=None)
        test.columns = ["User Id", "Movie Id", "Rating"]
        data = Dataset.load_from_df(test[["User Id", "Movie Id", "Rating"]],
                                    reader)
        testset = data.build_full_trainset()

        # evaluate test error
        test = testset.build_testset()
        predictions = algo.test(test)
        test_err = accuracy.rmse(predictions, verbose=False)

        # Return V (qi),  U (pu), train_err (RMSE), test_err (RMSE)
        return algo.qi, algo.pu, train_err, test_err

    elif mode == 'visualization':

        # train data processing
        alldata = pd.read_csv(DataPath, sep="\t", header=None)
        alldata.columns = ["User Id", "Movie Id", "Rating"]
        data = Dataset.load_from_df(alldata[["User Id", "Movie Id", "Rating"]],
                                    reader)
        trainset = data.build_full_trainset()

        # fit model
        algo = NMF(n_factors=n_factors,
                   n_epochs=n_epochs,
                   reg_pu=reg_pu,
                   reg_qi=reg_qi,
                   reg_bu=reg_bu,
                   reg_bi=reg_bi,
                   lr_bu=lr_bu,
                   lr_bi=lr_bi,
                   init_low=init_low,
                   init_high=init_high,
                   biased=biased,
                   verbose=verbose)
        algo.fit(trainset)

        # evaluate train error
        test = trainset.build_testset()
        predictions = algo.test(test)
        train_err = accuracy.rmse(predictions, verbose=False)

        U = algo.pu
        V = algo.qi

        # Project onto the span of the first two left singular vectors of V^T
        A, _, B = np.linalg.svd(V.T)
        # Use the first 2 columns of A (the leading singular directions)
        Asub = A[:, :2]

        Uproj = np.dot(Asub.T, U.T)
        Vproj = np.dot(Asub.T, V.T)

        # Return Vproj,  Uproj, train_err (RMSE of Y = U^T V)
        return Vproj, Uproj, train_err
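
# Hedged usage sketch (an addition; the parameter values are assumptions and the
# default file paths must exist for this to run):
#
#   # evaluation mode: returns item factors V, user factors U, and train/test RMSE
#   V, U, train_rmse, test_rmse = surpriseNMF('evaluation', n_factors=20, verbose=False)
#   print('train RMSE %.4f, test RMSE %.4f' % (train_rmse, test_rmse))
#
#   # visualization mode: returns 2-D projections of the item and user factors
#   Vproj, Uproj, train_rmse = surpriseNMF('visualization')
#   print(Vproj.shape, Uproj.shape)   # (2, n_items) and (2, n_users)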
示例#29
0
def main(rec='SVD', threshold=4, topK=10):
    # First train an SVD algorithm on the movielens dataset.
    print("load data...")
    '''
    data = Dataset.load_builtin('ml-1m')
    # test set is made of 40% of the ratings.
    test_size = 0.4
    trainset, testset = train_test_split(data, test_size=test_size)
    '''

    # path to dataset file
    test_data_path = r'C:\Users\abc\.surprise_data\ml-100k\ml-100k\u.data'  # purpose of this path is still unclear
    file_path = os.path.expanduser(
        r'C:\Users\abc\.surprise_data\ml-100k\ml-100k\u.data')
    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)
    trainset = data.build_full_trainset()

    test_user, test_item, test_rate = read_data(test_data_path)  # split into three lists
    #print("test size %.1f..." % test_size)
    print("training...")

    sim_options = {
        'name': 'cosine',
        'user_based': False  # compute item-item similarities
    }
    # choose the algorithm
    if rec == 'NMF':
        algo = NMF()
        name = ['NMF']
    elif rec == 'SVD':
        algo = SVD()
        name = ['SVD']
    else:
        algo = KNNBaseline(sim_options=sim_options)
        name = ['ItemKNN']

    train_start = time.time()
    algo.fit(trainset)
    train_end = time.time()
    print('train time:%.1f s' % (train_end - train_start))

    # Then predict ratings for all pairs (u, i) that are NOT in the training set,
    # i.e. fill in the missing entries of the rating matrix.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    test_end = time.time()
    print('test time:%.1f s' % (test_end - train_end))
    # top_n_est is a list of tuples: (item id, predicted rating)
    top_n_est, true_ratings = get_top_n(predictions, n=10, threshold=threshold)
    # model evaluation
    f1, map, mrr, mndcg = evaluate_model_new(algo, test_user, test_item,
                                             test_rate, topK)
    eval_end = time.time()
    print('evaluate time:%.1f s' % (eval_end - test_end))
    print("algorithm : %s" % rec)
    print(
        'recommendation metrics: F1 : %0.4f, NDCG : %0.4f, MAP : %0.4f, MRR : %0.4f'
        % (f1, mndcg, map, mrr))
    if hasattr(algo, 'pu'):  # only the matrix-factorization models expose latent factors
        print('%d users' % algo.pu.shape[0])
        print('%d items' % algo.qi.shape[0])
    return top_n_est
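
# Entry-point sketch (an addition, not in the original): run each recommender in turn.
# Commented out because the data paths above are machine-specific.
#
#   if __name__ == '__main__':
#       for rec_name in ('SVD', 'NMF', 'ItemKNN'):
#           main(rec=rec_name, threshold=4, topK=10)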