Example #1
def try_recom_algorithm_grid(data, algo, filename, grid_options, n_splits=5):
    """
    Function that tries out the recommendation algorithms supported by Surprise library,
    but first it tunes the hyperparameters using grid search
    :param data: input data containing user, item, rating and timestamp(opt)
    :param algo: the recom. algorithm to be used
    :param filename: name of the file the results should be saved into
    :param grid_options: dictionary containing possible values range for each parameter
    :param n_splits: number of folds for the cross validation
    :return:
    """
    print("\nWorking on " + filename + "\n")
    file = open("../results_surprise_163K/" + filename + ".txt", "w+")

    # use grid search cross validation using the given grid options
    gs = GridSearchCV(algo,
                      grid_options,
                      measures=['rmse', 'mae'],
                      cv=n_splits)
    gs.fit(data)

    # best RMSE score
    print(gs.best_score['rmse'])
    file.write("RMSE: %f" % (gs.best_score['rmse']))

    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])
    file.write("Best params:")
    file.write(str(gs.best_params['rmse']))
    file.close()
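
A minimal usage sketch for the function above (an assumption, not part of the original snippet): the built-in MovieLens data stands in for the original dataset, and the ../results_surprise_163K/ output directory is expected to exist.

from surprise import Dataset, SVD

# Hypothetical call: tune SVD over a small grid with 3-fold cross-validation
data = Dataset.load_builtin('ml-100k')
grid_options = {'n_epochs': [10, 20], 'lr_all': [0.002, 0.005]}
try_recom_algorithm_grid(data, SVD, 'svd', grid_options, n_splits=3)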
Example #2
    def tune_and_find_parameter(self, algo_name, algo, rating_data, param_grid):
        """
            Use GridSearchCV, which (per the Surprise documentation) computes
            accuracy metrics for an algorithm on various combinations of
            parameters, over a cross-validation procedure.

            Args:
                algo_name: the name of the algorithm
                algo: the algorithm class itself
                rating_data: the whole dataset
                param_grid: dictionary of parameter values to try

            Returns: the best parameter combination found (by RMSE)
        """


        print("tuning for", algo_name, "hyperparameters")

        # algo: algo class name
        grid_search = GridSearchCV(algo, param_grid, measures=['rmse', 'mae'])
        grid_search.fit(rating_data)

        print('best RMSE for ', algo_name, ' ', grid_search.best_score['rmse'])

        best_params = grid_search.best_params['rmse']
        # print the best set of parameters
        print("best params:", best_params)
        return best_params
    def perform_grid_search_with_cv(self, train_set):
        """
        Perform grid search to get optimal parameters and get metrics after cross validation
        :param train_set: The train set
        :return: Different RMSE and MAE for the different hyper parameters
        """
        if train_set:
            print("Running grid search to find optimal hyper parameters")
            self.LOG_HANDLE.info(
                "Running grid search to find optimal hyper parameters")

            param_grid = {
                'k': [30, 40, 50],
                'min_k': [1, 3, 5],
                'sim_options': {
                    'name': ['cosine', 'pearson', 'msd'],
                    'user_based': [False]
                }
            }
            gs = GridSearchCV(
                KNNWithMeans,
                param_grid,
                measures=model_params.all_models_training_error_measures,
                cv=model_params.cross_validation_folds)
            gs.fit(train_set)

            # best RMSE score
            print("Best RMSE after CV: ")
            print(gs.best_score['rmse'])
            self.LOG_HANDLE.info(gs.best_score['rmse'])

            # combination of parameters that gave the best RMSE score
            print("Best parameters after CV: ")
            print(gs.best_params['rmse'])
            self.LOG_HANDLE.info(gs.best_params['rmse'])
def test():
    seed = 0
    random.seed(seed)
    np.random.seed(seed)

    param_grid: Dict[str, List[object]] = {
        'n_factors': [50, 100, 200],
        'n_epochs': [10, 20, 50],
        'biased': [True, False],
        'init_mean': [0, 0.1, 0.5],
        'init_std_dev': [0, 0.1, 0.5],
        'lr_all': [0.001, 0.005, 0.01],
        'reg_all': [0.01, 0.02, 0.05],
        'random_state': [None],
        'verbose': [True]
    }

    grid_search = GridSearchCV(
        algo_class=SVD,
        param_grid=param_grid,
        measures=['rmse'],
        cv=KFold(5),
        n_jobs=-1
    )

    interactions = load_sorted_test_interactions()
    parsed_data = Parser.parse(interactions)
    grid_search.fit(parsed_data.whole_data_set)

    print(grid_search.best_score['rmse'])
    print(grid_search.best_params['rmse'])
    print(grid_search.cv_results)

    add_results_to_database(grid_search.cv_results, "svd", cls=NumpyEncoder)
Example #5
def test_gridsearchcv_same_splits():
    """Ensure that all parameter combinations are tested on the same splits (we
    check their RMSE scores are the same once averaged over the splits, which
    should be enough). We use as much parallelism as possible."""

    data_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_file(data_file, reader=Reader('ml-100k'),
                                  rating_scale=(1, 5))
    kf = KFold(3, shuffle=True, random_state=4)

    # all RMSE should be the same (as param combinations are the same)
    param_grid = {'n_epochs': [5], 'lr_all': [.2, .2],
                  'reg_all': [.4, .4], 'n_factors': [5], 'random_state': [0]}
    gs = GridSearchCV(SVD, param_grid, measures=['RMSE'], cv=kf,
                      n_jobs=1)
    gs.fit(data)

    rmse_scores = [m for m in gs.cv_results['mean_test_rmse']]
    assert len(set(rmse_scores)) == 1  # assert rmse_scores are all equal

    # Note: actually, even when setting random_state=None in kf, the same folds
    # are used because we use product(param_comb, kf.split(...)). However, it's
    # needed to have the same folds when calling fit again:
    gs.fit(data)
    rmse_scores += [m for m in gs.cv_results['mean_test_rmse']]
    assert len(set(rmse_scores)) == 1  # assert rmse_scores are all equal
Example #6
def do_grid_search(data):
    print("Doing gridsearch for best model.")
    param_grid = {
        'n_epochs': [10, 20, 30],
        'n_factors': [100, 150, 200],
        'lr_all': [0.001, 0.0025, 0.005, 0.001],
        'reg_all': [0.2, 0.4, 0.6]
    }
    gs = GridSearchCV(SVD,
                      param_grid,
                      measures=['rmse', 'mae'],
                      cv=5,
                      joblib_verbose=5,
                      n_jobs=-1)

    gs.fit(data_handler.get_data_from_df(data))
    # best RMSE score
    print(gs.best_score['rmse'])

    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])

    # We can now use the algorithm that yields the best rmse:
    algo = gs.best_estimator['rmse']
    return algo
Example #7
 def param_selection(self):
     """
     select the best parameter for SVD, using cross-validation
     :param data:
     :return: SVD paramters
     """
     tuned_parameters = {
         'n_factors': [20, 50, 100],
         'reg_all': [0.04, 0.05]
     }
     grid_search = GridSearchCV(SVD,
                                tuned_parameters,
                                measures=['rmse', 'mae'],
                                cv=3)
     grid_search.fit(self.trainset)
     print("Best parameters using RMSE:")
     print(grid_search.best_params['rmse'])
     print()
     # Note: the parameters stored on self are the MAE-optimal ones
     self.n_factors = grid_search.best_params['mae'].get('n_factors')
     self.reg_all = grid_search.best_params['mae'].get('reg_all')
     print("Best score using RMSE:")
     print(grid_search.best_score['rmse'])
     print()
     print("Best parameters using MAE:")
     print(grid_search.best_params['mae'])
     print()
     print("Best score using MAE:")
     print(grid_search.best_score['mae'])
     print()
Example #8
def KNN_Tester(trainset, testset, algo):
    param_grid = {
        'k': [50, 100],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson']
        }
    }

    gs = GridSearchCV(algo, param_grid, measures=['rmse'], cv=5)
    # `data` is a module-level Surprise Dataset (not passed into this function)
    gs.fit(data)
    params = gs.best_params['rmse']
    algo = KNNBasic(k=params['k'], sim_options=params['sim_options'])
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=4)
    avg_precision = sum(prec for prec in precisions.values()) / len(precisions)
    avg_recall = sum(rec for rec in recalls.values()) / len(recalls)
    metrics = {
        'rmse': rmse,
        'avg_precision': avg_precision,
        'avg_recall': avg_recall,
        'best_parameters': params
    }
    return metrics
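
Example #8 calls precision_recall_at_k without defining it. A minimal sketch, following the well-known recipe from the Surprise FAQ (it may differ from the author's actual helper):

from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    # Group the estimated and true ratings by user
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, user_ratings in user_est_true.items():
        # Sort by estimated rating, best first
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        # Relevant items, recommended items in top k, and their intersection
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
    return precisions, recalls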
Example #9
    def grid_search(self):
        print('grid search...')
        sim_options = {
            "name": ["msd", "cosine"],
            "min_support": [3, 4],
            "user_based": [False]
        }
        param_grid = {
            "sim_options": sim_options,
            "k": [50, 100, 200],
            "min_k": [1]
        }
        gs = GridSearchCV(KNNWithMeans,
                          param_grid,
                          measures=["rmse", "mae"],
                          cv=3)
        gs.fit(self.model_data)
        best_params, best_score = gs.best_params["rmse"], gs.best_score["rmse"]
        print(f'Best score (RMSE): {best_score}')
        print(f'Best params (RMSE): {best_params}')

        print(f'Best score (MAE): {gs.best_score["mae"]}')
        print(f'Best params (MAE): {gs.best_params["mae"]}')

        self.set_model_params(best_params)

        return best_params
Example #10
def BaselineOnly_als():
    print('Testing BaselineOnly als parameters')
    param_grid = {
        'bsl_options': {
            'method': ['als'],
            'reg_i': [7, 6.9, 7.1],
            'reg_u': [7, 6.9, 7.1]
        }
    }
    gs = GridSearchCV(BaselineOnly,
                      param_grid,
                      measures=['rmse'],
                      cv=10,
                      n_jobs=-2,
                      refit=True)
    gs.fit(data)

    # best RMSE score
    print(gs.best_score['rmse'])

    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])

    #export model
    joblib.dump(gs.best_params['rmse']['bsl_options'],
                'BaselineOnly.pkl',
                compress=1)
    dump.dump('BaselineOnly', algo=gs)
Example #11
def BaselineOnly_sgd():
    print('Testing BaselineOnly sgd parameters')
    param_grid = {
        'bsl_options': {
            'method': ['sgd'],
            'learning_rate': [0.00643, 0.00646, 0.00649],
            'n_epochs': [43, 44, 45, 46, 47]
        }
    }
    gs = GridSearchCV(BaselineOnly,
                      param_grid,
                      measures=['rmse'],
                      cv=10,
                      n_jobs=-2,
                      refit=True)
    gs.fit(data)

    # best RMSE score
    print(gs.best_score['rmse'])

    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])

    #export model
    joblib.dump(gs.best_params['rmse']['bsl_options'],
                'BaselineOnly.pkl',
                compress=1)
    dump.dump('BaselineOnly', algo=gs)
Example #12
def SVD_alg():
    print('Testing SVD parameters')
    param_grid = {
        'n_epochs': [12, 13],
        'lr_all': [0.0013, 0.0015],
        'reg_all': [0.05, 0.06]
    }
    gs = GridSearchCV(SVD,
                      param_grid,
                      measures=['rmse'],
                      cv=10,
                      n_jobs=-2,
                      refit=True)

    #runs fit method for all parameter combinations over splits given by cv
    gs.fit(data)

    # best RMSE score
    print(gs.best_score['rmse'])

    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])

    #export model
    joblib.dump(gs.best_params['rmse'], 'SVD.pkl', compress=1)
    dump.dump('SVD', algo=gs)
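
A hedged follow-up sketch for Example #12: reloading the parameters saved by SVD_alg() and refitting on a full trainset (the built-in MovieLens data is only a stand-in for the original dataset):

import joblib
from surprise import Dataset, SVD

params = joblib.load('SVD.pkl')          # dict written by SVD_alg() above
data = Dataset.load_builtin('ml-100k')   # stand-in for the original data
algo = SVD(n_epochs=params['n_epochs'],
           lr_all=params['lr_all'],
           reg_all=params['reg_all'])
algo.fit(data.build_full_trainset())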
Example #13
def get_surprise_knn_item_model(data, trainset, testset,
                                model_train_evaluation, model_test_evaluation,
                                error_table):
    param_grid = {
        'sim_options': {
            'name': ["pearson_baseline"],
            "user_based": [False],
            "min_support": [2],
            "shrinkage": [60, 80, 80, 140]
        },
        'k': [5, 20, 40, 80]
    }
    gs = GridSearchCV(KNNBaseline, param_grid, measures=['rmse', 'mae'], cv=3)
    gs.fit(data)
    st.write("GRIDSEARCH best scores", gs.best_score['rmse'])
    st.write("GRIDSEARCH best parameters", gs.best_params['rmse'])
    sim_options = {
        'name': 'pearson_baseline',
        'user_based': False,
        'min_support': 2,
        'shrinkage': gs.best_params['rmse']['sim_options']['shrinkage']
    }
    bsl_options = {'method': 'sgd'}
    algo = KNNBaseline(k=gs.best_params['rmse']['k'],
                       sim_options=sim_options,
                       bsl_options=bsl_options)
    train_result, test_result, error_table = run_surprise(
        algo, trainset, testset, "KNNBaseline_Item", error_table)
    model_train_evaluation["KNNBaseline_Item"] = train_result
    model_test_evaluation["KNNBaseline_Item"] = test_result
    return model_train_evaluation, model_test_evaluation, error_table
def grid_search(data):
    """
        This function was originally used to perform grid search on the different algorithms
    """
    # ---------------------KNN--------------------
    #sim_options = {
    #    "name": "mcd",
    #    "min_support": 3,
    #    "user_based": True
    #}
    #param_grid = {"sim_options": sim_options}
    #gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse"], cv=3)
    #gs.fit(data)
    #print(gs.best_score["rmse"])
    #print(gs.best_params["rmse"])

    # ---------------NMF--------------
    param_grid = {
        "n_epochs": [5, 10, 20],
        "n_factors": [10, 15, 30]
    }
    gs = GridSearchCV(NMF, param_grid, measures=["rmse"], cv=3)

    gs.fit(data)

    print(gs.best_score["rmse"])
    print(gs.best_params["rmse"])
Example #15
def load_data():
    data = Dataset.load_builtin('ml-100k')
    # similarity options
    sim_options = {"name": "msd", "user_based": False}

    param_grid = {
        "n_epochs": [5, 10],
        "lr_all": [0.002, 0.005],
        "reg_all": [0.4, 0.6]
    }

    # algorithm
    algo = KNNWithMeans(sim_options=sim_options)

    # computation
    training_set = data.build_full_trainset()

    algo.fit(training_set)

    # GRID SEARCH, MATRIX FACTORIZATION
    print("Divide matrix in grids")
    gs = GridSearchCV(SVD, param_grid=param_grid, measures=["rmse"], cv=3)
    gs.fit(data)

    print(gs.best_score['rmse'])
Example #16
def modelo_svd_best_n(data):
    reader = Reader(rating_scale=(1, 5))
    # 'lr_all':[0.01,0.002,0.005],
    #'reg_all':[0.01,0.02,0.04],

    data = Dataset.load_from_df(
        data[['userid', 'businessid', 'mean_by_business']], reader)
    param_grid = {
        'n_factors': [5, 20, 50, 100],
        'n_epochs': [100, 200, 300],
    }

    # Gridsearch_svd is presumably Surprise's GridSearchCV imported under an alias
    gs = Gridsearch_svd(SVD, param_grid, measures=['rmse'], cv=5, n_jobs=5)
    gs.fit(data)
    # combination of parameters that gave the best RMSE score
    k = gs.best_params['rmse']['n_factors']
    n_epochs = gs.best_params['rmse']['n_epochs']

    #Predictions with best parameters

    data_ = data.build_full_trainset()
    algo = SVD(n_factors=k, n_epochs=n_epochs)
    algo.fit(data_)
    prediciones = algo.test(data_.build_anti_testset())

    return prediciones
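
Example #16 returns raw predictions over the anti-testset. A common follow-up, adapted from the Surprise FAQ rather than taken from the original snippet, keeps only the top-N recommendations per user:

from collections import defaultdict

def get_top_n(predictions, n=10):
    # Map each user to their predictions, then keep the n highest estimates
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

# e.g. top_n = get_top_n(modelo_svd_best_n(data), n=10)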
Example #17
def find_best_params(data_set, cv=3, param_grid=None):

    if param_grid is None:
        param_grid = {
            'n_factors': [10, 30, 50],
            'n_epochs': [10, 30, 50],
            'lr_all': [0.002, 0.005, 0.008, 0.01],
            'reg_all': [0.2, 0.4, 0.6, 0.8]
        }

    log.info(f'Performing Grid Search: {param_grid}')

    gs = GridSearchCV(SVD,
                      param_grid=param_grid,
                      measures=['rmse', 'mae'],
                      cv=cv,
                      n_jobs=4,
                      joblib_verbose=2)
    start_time = time.time()
    gs.fit(data_set)
    end_time = time.time()
    log.info(f'Time spent on Grid Search: {end_time - start_time}')

    log.info(
        f"Best RMSE score: {gs.best_score['rmse']} with params: {gs.best_params['rmse']}"
    )
    log.info(
        f"Best MAE score: {gs.best_score['mae']} with params: {gs.best_params['mae']}"
    )

    return gs.best_params['rmse'], gs.best_params['mae']
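
A possible way to consume find_best_params (hypothetical usage; the ml-100k dataset is only a stand-in, and the snippet's own log/time imports are assumed to be in place). The grid keys match SVD's constructor arguments, so they can be unpacked directly:

from surprise import Dataset, SVD

data = Dataset.load_builtin('ml-100k')
best_rmse_params, best_mae_params = find_best_params(data, cv=3)
algo = SVD(**best_rmse_params)           # n_factors, n_epochs, lr_all, reg_all
algo.fit(data.build_full_trainset())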
Example #18
    def __init__(self, data, score_index, user_index, items_index):

        self.items_index = items_index
        self.user_index = user_index
        self.data = data

        scale = (data[score_index].min(), data[score_index].max())
        reader = Reader(rating_scale=scale)
        dataset = Dataset.load_from_df(
            data[[user_index, items_index, score_index]], reader)

        param_grid = {
            'n_factors': [50, 100, 150],
            'n_epochs': [25, 50, 75],
            'lr_all': [0.005, 0.01],
            'reg_all': [0.02, 0.1, 0.5]
        }

        gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
        gs.fit(dataset)

        params = gs.best_params['rmse']

        self.model = SVD(reg_all=params['reg_all'],
                         n_factors=params['n_factors'],
                         n_epochs=params['n_epochs'],
                         lr_all=params['lr_all'])
        self.model.fit(dataset.build_full_trainset())
Example #19
def tuneHyperParams(algtype, trainset, testset, df, param_grid):
    """
    Tune Hyper Parameters for Surprise library models
    Args:
        algtype (surprise.prediction_algorithms): type of the surprise algorithm
        trainset(pandas.Dataframe) :
        testset(pandas.Dataframe) :
        df(pandas.Dataframe) :
        param_grid : parameters to try
    Returns:
        surprise.GridSearchCV: gs
    """
    #TUNE HYPERPARAM VIA GRIDSEARCH
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['User', 'Movie', 'Rating']], reader)
    #trainset, testset = train_test_split(data, test_size=.25, random_state=20)
    gs = GridSearchCV(algtype, param_grid, measures=['rmse'], cv=3)

    # GridSearchCV.fit() returns None, so there is no fitted model to assign here
    gs.fit(data)

    # best RMSE score
    #print(gs.best_score['rmse'])

    # combination of parameters that gave the best RMSE score
    #print(gs.best_params['rmse'])

    return gs
Example #20
def test_best_estimator():
    """Ensure that the best estimator is the one giving the best score (by
    re-running it)"""

    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'))

    param_grid = {
        'n_epochs': [5],
        'lr_all': [0.002, 0.005],
        'reg_all': [0.4, 0.6],
        'n_factors': [1],
        'init_std_dev': [0]
    }
    gs = GridSearchCV(SVD,
                      param_grid,
                      measures=['mae'],
                      cv=PredefinedKFold(),
                      joblib_verbose=100)
    gs.fit(data)
    best_estimator = gs.best_estimator['mae']

    # recompute MAE of best_estimator
    mae = cross_validate(best_estimator,
                         data,
                         measures=['MAE'],
                         cv=PredefinedKFold())['test_mae']

    assert mae == gs.best_score['mae']
    def tune_and_find_param(self,
                            algo_name,
                            algo,
                            rating_data,
                            param_grid={
                                'n_factors': [50, 100],
                                'n_epochs': [20, 30],
                                'lr_all': [0.005, 0.010]
                            }):
        # Use GridSearchCV, which (per the Surprise documentation) computes accuracy
        # metrics for an algorithm on various combinations of parameters, over a
        # cross-validation procedure.
        print("tuning for", algo_name, "hyperparameters")

        # algo: algo class name
        grid_search = GridSearchCV(algo, param_grid, measures=['rmse', 'mae'])

        # fitting data
        grid_search.fit(rating_data)

        # print the best RMSE
        print('best RMSE for ', algo_name, ' ', grid_search.best_score['rmse'])

        best_params = grid_search.best_params['rmse']
        # print the best set of parameters
        print("best params:", best_params)
        return best_params
Example #22
def grid_search():
    """ grid search template """
    
    # Set Grid Parameters
    G = gsp.graphs.Graph(dd.build_friend_friend())
    G.compute_laplacian('normalized')
    param_grid = {
            #'L' : [G.L.todense()],
            'n_factors' : [5],
            'n_epochs' : [30],
            'lr_all' : [1.e-3],
            'reg_all' : np.logspace(-6,-1, 20),
            #'reg' : np.logspace(-6,-1,15)
    }
    
    # Init grid_search
    grid = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=6, n_jobs=1, joblib_verbose=10000)
    grid.fit(data)
    
    # Print best score and best parameters
    print('Best Score: ', grid.best_score['rmse'])
    print('Best parameters: ', grid.best_params['rmse'])
    
    # Plot RMSE
    plt.plot(grid.cv_results['param_reg_all'], grid.cv_results['mean_test_rmse'])
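
The snippet above leans on several module-level names: G is built from a pygsp friendship graph, `dd` is a project-specific data module, and `data` is a Surprise Dataset. A plausible, assumed set of imports it would need:

import numpy as np
import matplotlib.pyplot as plt
import pygsp as gsp
from surprise import SVD
from surprise.model_selection import GridSearchCV
# `dd` (the data-loading module) and `data` (the Surprise Dataset) are
# project-specific and not shown in this example.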
Example #23
def grid():
    raw_ratings = data.raw_ratings
    threshold = int(.9 * len(raw_ratings))
    A_raw_ratings = raw_ratings[:threshold]
    B_raw_ratings = raw_ratings[threshold:]

    data.raw_ratings = A_raw_ratings
    param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005]}
    grid_search = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=3)
    grid_search.fit(data)
    algo = grid_search.best_estimator['rmse']

    # retrain on the whole set A
    trainset = data.build_full_trainset()
    algo.fit(trainset)

    # Compute biased accuracy on A
    predictions = algo.test(trainset.build_testset())
    print('Biased accuracy on A,', end='   ')
    accuracy.rmse(predictions)

    # Compute unbiased accuracy on B
    testset = data.construct_testset(B_raw_ratings)  # testset is now the set B
    predictions = algo.test(testset)
    print('Unbiased accuracy on B,', end=' ')
    accuracy.rmse(predictions)
Example #24
def test_gridsearchcv_best_estimator(u1_ml100k):
    """Ensure that the best estimator is the one giving the best score (by
    re-running it)"""

    param_grid = {
        'n_epochs': [5],
        'lr_all': [0.002, 0.005],
        'reg_all': [0.4, 0.6],
        'n_factors': [1],
        'init_std_dev': [0]
    }
    gs = GridSearchCV(SVD,
                      param_grid,
                      measures=['mae'],
                      cv=PredefinedKFold(),
                      joblib_verbose=100)
    gs.fit(u1_ml100k)
    best_estimator = gs.best_estimator['mae']

    # recompute MAE of best_estimator
    mae = cross_validate(best_estimator,
                         u1_ml100k,
                         measures=['MAE'],
                         cv=PredefinedKFold())['test_mae']

    assert mae == gs.best_score['mae']
    def tune(self,
             opt_field='rmse',
             param_grid={
                 'n_epochs': [5, 10],
                 'lr_all': [0.002, 0.005],
                 'reg_all': [0.4, 0.6]
             },
             SHOW_RESULT=False):

        if self.algorithm == 'svd':
            gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

        ## Start tuning
        gs.fit(self.data)

        ## Save to self.algo
        self.algo = gs.best_estimator[opt_field]
        self.algo.fit(self.trainset)

        if SHOW_RESULT:
            # best RMSE score
            print(gs.best_score['rmse'])

            # combination of parameters that gave the best RMSE score
            print(gs.best_params['rmse'])

        return self
    def perform_grid_search_with_cv(self, train_set):
        """
        Perform grid search to get optimal parameters and get metrics after cross validation
        :param train_set: The train set
        :return: Different RMSE and MAE for the different hyper parameters
        """
        if train_set:
            print("Running grid search to find optimal hyper parameters")
            self.LOG_HANDLE.info(
                "Running grid search to find optimal hyper parameters")

            param_grid = {
                'n_epochs': [10, 20, 30],
                'lr_all': [0.005, 0.006, 0.007, 0.008],
                'reg_all': [0.01, 0.02, 0.03, 0.2]
            }
            gs = GridSearchCV(
                SVDpp,
                param_grid,
                measures=model_params.all_models_training_error_measures,
                cv=model_params.cross_validation_folds)
            gs.fit(train_set)

            # best RMSE score
            print(gs.best_score['rmse'])
            self.LOG_HANDLE.info(gs.best_score['rmse'])

            # combination of parameters that gave the best RMSE score
            print(gs.best_params['rmse'])
            self.LOG_HANDLE.info(gs.best_params['rmse'])
Example #27
def gridsearch(data, algo, param_grid):
    # param_grid = {'n_factors': [50, 100, 150], 'n_epochs': [20, 30],
    #               'lr_all': [0.005, 0.01], 'reg_all': [0.02, 0.1]}

    gs = GridSearchCV(algo, param_grid, measures=['rmse'], cv=3)
    gs.fit(data)
    params = gs.best_params['rmse']
    print(params)
Example #28
 def best_params(self):
     param_grid = {
         'n_factors': [x for x in range(50, 500, 50)],
         'n_epochs': [10, 20, 50, 75, 100],
         'lr_all': [.001, .003, .005, .008]
     }
     gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
     gs.fit(self.data)
     return gs.best_score['rmse'], gs.best_params['rmse']
Example #29
def best_params():
    # Review dataset to use
    file_path = os.path.expanduser('./data/reviews_stars.csv')
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_file(file_path, reader=reader)

    # Build a list of candidate factor counts
    n_factors_values = []
    n_factors_initial_value = 2
    # Try 15 different factor counts in steps of 2
    for i in range(0, 15):
        n_factors_values.append(n_factors_initial_value +
                                (n_factors_initial_value * i))

    # Build a list of candidate epoch counts
    n_epochs_values = []
    n_epochs_initial_value = 5
    # Try 10 different values in steps of 5
    for i in range(0, 10):
        n_epochs_values.append(n_epochs_initial_value +
                               (n_epochs_initial_value * i))

    # Build a list of candidate regularization values
    reg_all_values = []
    reg_all_initial_value = 0.2
    # Try 5 different values in steps of 0.2
    for i in range(0, 5):
        reg_all_values.append(reg_all_initial_value +
                              (reg_all_initial_value * i))

    # Build a list of candidate learning rates
    lr_all_values = []
    lr_all_initial_value = 0.002
    # Try 5 different values in steps of 0.002
    for i in range(0, 5):
        lr_all_values.append(lr_all_initial_value + (lr_all_initial_value * i))

    # Build the parameter grid dictionary
    param_grid = {
        'n_factors': n_factors_values,
        'n_epochs': n_epochs_values,
        'lr_all': lr_all_values,
        'reg_all': reg_all_values,
        'biased': [True]
    }
    # Evaluate the parameter combinations using MAE and RMSE
    gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

    gs.fit(data)

    # Write the best RMSE and MAE scores and parameters to a file
    with open('./data/results.txt', 'a') as file:
        file.write('Score rmse: ' + str(gs.best_score['rmse']) + '\n')
        file.write('Best parameters rmse: ' + str(gs.best_params['rmse']) +
                   '\n')
        file.write('Score mae: ' + str(gs.best_score['mae']) + '\n')
        file.write('Best parameters mae: ' + str(gs.best_params['mae']) + '\n')
Example #30
def recomendacion(usuario):
    array = []
    for rate in Calificacion.objects.all():
        array.append([rate.usuario_id, rate.asignatura_id, rate.calificacion])

    df = pd.DataFrame(data=array)
    reader = Reader(rating_scale=(0, 10))
    data = Dataset.load_from_df(df, reader)
    trainingSet = data.build_full_trainset()
    param_grid = {
        'n_factors': [50, 100, 150],
        "n_epochs": [40, 50, 60],
        "lr_all": [0.002, 0.005],
        "reg_all": [0.4, 0.6]
    }

    gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)
    gs.fit(data)
    # Optimal parameters
    params = gs.best_params["rmse"]
    SVDoptimized = SVD(n_factors=params['n_factors'],
                       n_epochs=params['n_epochs'],
                       lr_all=params['lr_all'],
                       reg_all=params['reg_all'])
    SVDoptimized.fit(trainingSet)

    asig = Asignatura.objects.all()

    asig_user = Calificacion.objects.all().filter(usuario_id=usuario.id)

    # Subjects not yet rated
    asignaturas_SinC = []
    for asignatura in asig:
        encontrado = False
        for asignatura_usuario in asig_user:
            if (asignatura_usuario.asignatura_id == asignatura.codigo):
                encontrado = True
        if (not encontrado):
            asignaturas_SinC.append(asignatura)

    # Recommended subjects
    asignaturas_rec = []

    for asignatura in asignaturas_SinC:
        asignaturas_rec.append({
            'asignatura':
            asignatura,
            'svd':
            SVDoptimized.predict(usuario.id, asignatura.codigo).est
        })
    # Sort key: return the predicted rating (the 'svd' value)
    def ordenador(e):
        return e['svd']

    asignaturas_rec.sort(reverse=True, key=ordenador)

    return asignaturas_rec
Example #31
def test_gridsearchcv_cv_results():
    """Test the cv_results attribute"""

    f = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_file(f, Reader('ml-100k'), rating_scale=(1, 5))
    kf = KFold(3, shuffle=True, random_state=4)
    param_grid = {'n_epochs': [5], 'lr_all': [.2, .2],
                  'reg_all': [.4, .4], 'n_factors': [5], 'random_state': [0]}
    gs = GridSearchCV(SVD, param_grid, measures=['RMSE', 'mae'], cv=kf,
                      return_train_measures=True)
    gs.fit(data)

    # test keys split*_test_rmse, mean and std dev.
    assert gs.cv_results['split0_test_rmse'].shape == (4,)  # 4 param comb.
    assert gs.cv_results['split1_test_rmse'].shape == (4,)  # 4 param comb.
    assert gs.cv_results['split2_test_rmse'].shape == (4,)  # 4 param comb.
    assert gs.cv_results['mean_test_rmse'].shape == (4,)  # 4 param comb.
    assert np.allclose(gs.cv_results['mean_test_rmse'],
                       np.mean([gs.cv_results['split0_test_rmse'],
                                gs.cv_results['split1_test_rmse'],
                                gs.cv_results['split2_test_rmse']], axis=0))
    assert np.allclose(gs.cv_results['std_test_rmse'],
                       np.std([gs.cv_results['split0_test_rmse'],
                               gs.cv_results['split1_test_rmse'],
                               gs.cv_results['split2_test_rmse']], axis=0))

    # test keys split*_train_rmse, mean and std dev.
    assert gs.cv_results['split0_train_rmse'].shape == (4,)  # 4 param comb.
    assert gs.cv_results['split1_train_rmse'].shape == (4,)  # 4 param comb.
    assert gs.cv_results['split2_train_rmse'].shape == (4,)  # 4 param comb.
    assert gs.cv_results['mean_train_rmse'].shape == (4,)  # 4 param comb.
    assert np.allclose(gs.cv_results['mean_train_rmse'],
                       np.mean([gs.cv_results['split0_train_rmse'],
                                gs.cv_results['split1_train_rmse'],
                                gs.cv_results['split2_train_rmse']], axis=0))
    assert np.allclose(gs.cv_results['std_train_rmse'],
                       np.std([gs.cv_results['split0_train_rmse'],
                               gs.cv_results['split1_train_rmse'],
                               gs.cv_results['split2_train_rmse']], axis=0))

    # test fit and train times dimensions.
    assert gs.cv_results['mean_fit_time'].shape == (4,)  # 4 param comb.
    assert gs.cv_results['std_fit_time'].shape == (4,)  # 4 param comb.
    assert gs.cv_results['mean_test_time'].shape == (4,)  # 4 param comb.
    assert gs.cv_results['std_test_time'].shape == (4,)  # 4 param comb.

    assert gs.cv_results['params'] is gs.param_combinations

    # assert that best parameter in gs.cv_results['rank_test_measure'] is
    # indeed the best_param attribute.
    best_index = np.argmin(gs.cv_results['rank_test_rmse'])
    assert gs.cv_results['params'][best_index] == gs.best_params['rmse']
    best_index = np.argmin(gs.cv_results['rank_test_mae'])
    assert gs.cv_results['params'][best_index] == gs.best_params['mae']
Example #32
def _perform_grid_search(algo_class: AlgoBase, param_grid: Dict[str, Any],
                         dataset: Dataset, random_state: int) -> pd.DataFrame:
    gs = GridSearchCV(algo_class,
                      param_grid,
                      measures=['rmse', 'mae', 'fcp'],
                      cv=KFold(5, random_state=random_state),
                      n_jobs=2,
                      joblib_verbose=100,
                      pre_dispatch=2)
    gs.fit(dataset)
    return pd.DataFrame.from_dict(gs.cv_results).sort_values('rank_test_rmse')
Example #33
def test_gridsearchcv_best_estimator(u1_ml100k):
    """Ensure that the best estimator is the one giving the best score (by
    re-running it)"""

    param_grid = {'n_epochs': [5], 'lr_all': [0.002, 0.005],
                  'reg_all': [0.4, 0.6], 'n_factors': [1], 'init_std_dev': [0]}
    gs = GridSearchCV(SVD, param_grid, measures=['mae'],
                      cv=PredefinedKFold(), joblib_verbose=100)
    gs.fit(u1_ml100k)
    best_estimator = gs.best_estimator['mae']

    # recompute MAE of best_estimator
    mae = cross_validate(best_estimator, u1_ml100k, measures=['MAE'],
                         cv=PredefinedKFold())['test_mae']

    assert mae == gs.best_score['mae']
Example #34
"""
This module describes how to use the GridSearchCV() class for finding the best
parameter combination of a given algorithm.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV

# Use movielens-100K
data = Dataset.load_builtin('ml-100k')

param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

# We can now use the algorithm that yields the best rmse:
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

import pandas as pd  # noqa
results_df = pd.DataFrame.from_dict(gs.cv_results)
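
A small optional follow-up, assuming the script above has run: sort the per-combination results by RMSE rank (column names follow the documented cv_results keys):

print(results_df.sort_values('rank_test_rmse')[
    ['params', 'mean_test_rmse', 'mean_test_mae']].head())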
Example #35
def test_gridsearchcv_refit(u1_ml100k):
    """Test refit function of GridSearchCV."""

    data_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_file(data_file, Reader('ml-100k'),
                                  rating_scale=(1, 5))

    param_grid = {'n_epochs': [5], 'lr_all': [0.002, 0.005],
                  'reg_all': [0.4, 0.6], 'n_factors': [2]}

    # assert gs.fit() and gs.test will use best estimator for mae (first
    # appearing in measures)
    gs = GridSearchCV(SVD, param_grid, measures=['mae', 'rmse'], cv=2,
                      refit=True)
    gs.fit(data)
    gs_preds = gs.test(data.construct_testset(data.raw_ratings))
    mae_preds = gs.best_estimator['mae'].test(
        data.construct_testset(data.raw_ratings))
    assert gs_preds == mae_preds

    # assert gs.fit() and gs.test will use best estimator for rmse
    gs = GridSearchCV(SVD, param_grid, measures=['mae', 'rmse'], cv=2,
                      refit='rmse')
    gs.fit(data)
    gs_preds = gs.test(data.construct_testset(data.raw_ratings))
    rmse_preds = gs.best_estimator['rmse'].test(
        data.construct_testset(data.raw_ratings))
    assert gs_preds == rmse_preds
    # test that predict() can be called
    gs.predict(2, 4)

    # assert test() and predict() cannot be used when refit is false
    gs = GridSearchCV(SVD, param_grid, measures=['mae', 'rmse'], cv=2,
                      refit=False)
    gs.fit(data)
    with pytest.raises(ValueError):
        gs_preds = gs.test(data.construct_testset(data.raw_ratings))
    with pytest.raises(ValueError):
        gs.predict('1', '2')

    # test that error is raised if used with load_from_folds
    gs = GridSearchCV(SVD, param_grid, measures=['mae', 'rmse'], cv=2,
                      refit=True)
    with pytest.raises(ValueError):
        gs.fit(u1_ml100k)
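
Distilled from the refit test above, a compact sketch of how refit is typically used outside the test suite (the built-in MovieLens data is assumed; this is not part of the original example):

from surprise import Dataset, SVD
from surprise.model_selection import GridSearchCV

data = Dataset.load_builtin('ml-100k')
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, refit=True)
gs.fit(data)
# With refit=True the best estimator is retrained on the whole dataset,
# so test() and predict() can be called on the GridSearchCV object directly.
print(gs.predict('196', '302').est)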