Example #1
    def __init__(self, hyper_params, user_count, item_count):
        latent_size = hyper_params['latent_size']

        if hyper_params['model_type'] == 'kNN':
            self.model = surprise.prediction_algorithms.knns.KNNBasic(
                k=10, verbose=True)
        elif hyper_params['model_type'] == 'NMF':
            self.model = surprise.NMF(n_factors=latent_size,
                                      biased=False,
                                      n_epochs=50,
                                      verbose=True)
        elif hyper_params['model_type'] == 'SVD':
            self.model = surprise.SVD(n_factors=latent_size, verbose=True)
        elif hyper_params['model_type'] == 'SVD++':
            self.model = surprise.SVDpp(n_factors=latent_size, verbose=True)
        elif hyper_params['model_type'] == 'baseline':
            bsl_options = {
                'method': 'sgd',
                'n_epochs': 20,
            }
            self.model = surprise.prediction_algorithms.baseline_only.BaselineOnly(
                bsl_options=bsl_options, verbose=True)

        self.hyper_params = hyper_params
        self.user_count = user_count
        self.item_count = item_count
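
A minimal usage sketch, assuming the enclosing class is named SurpriseModel (the class name is not visible in this fragment) and that Surprise's remaining defaults are acceptable:

hyper_params = {'model_type': 'NMF', 'latent_size': 20}
model = SurpriseModel(hyper_params, user_count=943, item_count=1682)
# model.model is now a surprise.NMF instance with n_factors=20, biased=False
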
Example #2
def NMF(train, test):
    """
    Run the NMF model from Surprise library.
    @param train: the training set in the Surprise format.
    @param test: the test set in the Surprise format.
    @return: the predictions in a numpy array.
    """
    algo = spr.NMF()
    algo.fit(train)
    predictions = algo.test(test)
    return get_predictions(predictions)
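
The get_predictions helper is not shown here; a plausible sketch, assuming it simply collects the estimated ratings from Surprise's Prediction objects into an array:

import numpy as np

def get_predictions(predictions):
    # Each Prediction is a namedtuple (uid, iid, r_ui, est, details);
    # keep only the estimates.
    return np.array([p.est for p in predictions])
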
Example #3
def main(args):

    user_item_based = 'item_based' if args.item_based else 'user_based'
    filename = '_'.join([
        args.exp_name, args.algorithm, args.sim_name, user_item_based,
        str(args.num_rows)
    ]) + '.pkl'

    output_file = Path(filename)
    if output_file.exists():
        print(f'ERROR! Output file {output_file} already exists. Exiting!')
        sys.exit(1)

    print(f'Saving scores in {output_file}\n')

    reader = surprise.Reader(rating_scale=(1, 5))
    df = pq.read_table('all_ratings_with_indices.parquet',
                       columns=['user_idx', 'movie_idx',
                                'rating']).to_pandas()
    df.user_idx = df.user_idx.astype(np.uint32)
    df.movie_idx = df.movie_idx.astype(np.uint16)
    df.rating = df.rating.astype(np.uint8)
    print(df.dtypes)
    data = surprise.Dataset.load_from_df(df[:args.num_rows], reader=reader)
    del df
    sim_options = {
        'name': args.sim_name,
        'user_based': not args.item_based
    }

    if args.algorithm == 'knn':
        algo = surprise.KNNBasic(sim_options=sim_options)
    elif args.algorithm == 'baseline':
        algo = surprise.BaselineOnly()
    elif args.algorithm == 'normal':
        algo = surprise.NormalPredictor()
    elif args.algorithm == 'knn_zscore':
        algo = surprise.KNNWithZScore(sim_options=sim_options)
    elif args.algorithm == 'svd':
        algo = surprise.SVD()
    elif args.algorithm == 'nmf':
        algo = surprise.NMF()
    else:
        print(f'Algorithm {args.algorithm} is not a valid choice.')
        sys.exit(1)

    scores = surprise.model_selection.cross_validate(algo,
                                                     data,
                                                     cv=args.cv_folds,
                                                     verbose=True,
                                                     n_jobs=-1)

    with open(output_file, 'wb') as f:
        pickle.dump(scores, f)
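
The dict returned by cross_validate (and pickled above) holds per-fold arrays keyed by 'test_rmse', 'test_mae', 'fit_time', and 'test_time'; a minimal read-back sketch:

with open(output_file, 'rb') as f:
    scores = pickle.load(f)
print(scores['test_rmse'].mean())
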
Example #4
def algo_tester(data_object):
    '''
    Produce a DataFrame comparing the RMSEs and the fit/test times of the
    different Surprise algorithms.

    ---Parameters---
    data_object: dataset created from the read_data_surprise function

    ---Returns---
    a DataFrame in which the performance of the different algorithms can be compared
    '''
    benchmark = []
    algos = [
        sp.SVDpp(),
        sp.SVD(),
        sp.SlopeOne(),
        sp.NMF(),
        sp.NormalPredictor(),
        sp.KNNBaseline(),
        sp.KNNBasic(),
        sp.KNNWithMeans(),
        sp.KNNWithZScore(),
        sp.BaselineOnly(),
        sp.CoClustering()
    ]

    # Iterate over all algorithms
    for algorithm in algos:
        # Perform cross validation
        results = cross_validate(algorithm,
                                 data_object,
                                 measures=['RMSE'],
                                 cv=3,
                                 verbose=False)

        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        # pd.Series.append was removed in pandas 2.0; use pd.concat instead
        tmp = pd.concat([
            tmp,
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                      index=['Algorithm'])
        ])
        benchmark.append(tmp)

    benchmark = pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
        'test_rmse')
    return benchmark
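
A hedged usage sketch; read_data_surprise (mentioned in the docstring) is not shown, so Surprise's built-in MovieLens loader stands in for it here:

import surprise as sp

data_object = sp.Dataset.load_builtin('ml-100k')  # stand-in for read_data_surprise
print(algo_tester(data_object))
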
Example #5
                testset = data.construct_testset(A_test_dense)

                # SVDpp:
                algo = surprise.SVDpp()
                algo.fit(trainset)
                predictions = algo.test(testset)
                print("model SVDpp: ")
                # Then compute RMSE
                accuracy.rmse(predictions)
                print("NDCG: " +
                      str(sur_ndcg(atstd, predictions, product_index)))
                print("Precision: " +
                      str(sur_precision(atstd, predictions, product_index)))

                # NMF:
                algo = surprise.NMF()
                algo.fit(trainset)
                predictions = algo.test(testset)
                print("model NMF: ")
                accuracy.rmse(predictions)
                print("NDCG: " +
                      str(sur_ndcg(atstd, predictions, product_index)))
                print("Precision: " +
                      str(sur_precision(atstd, predictions, product_index)))

                # SlopeOne:
                algo = surprise.SlopeOne()
                algo.fit(trainset)
                predictions = algo.test(testset)
                print("model SlopeOne: ")
                # Then compute RMSE
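
The fragment ends here; sur_ndcg, sur_precision, atstd, and product_index are project-specific helpers that are not shown. For context, surprise.Dataset.construct_testset converts a list of raw (user, item, rating) tuples into a testset that algo.test can consume.
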
Example #6
knnBasic = surprise.KNNBasic()
knnBasic_temp = surprise.model_selection.cross_validate(
    knnBasic, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnBasic-----------------')
print(knnBasic_temp)
knnWithMeans = surprise.KNNWithMeans()
knnWithMeans_temp = surprise.model_selection.cross_validate(
    knnWithMeans, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnWithMeans-----------------')
print(knnWithMeans_temp)
knnBaseline = surprise.KNNBaseline()
knnBaseline_temp = surprise.model_selection.cross_validate(
    knnBaseline, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnBaseline-----------------')
print(knnBaseline_temp)
svdpp = surprise.SVDpp()
svdpp_temp = surprise.model_selection.cross_validate(svdpp,
                                                     rating_data,
                                                     measures=['RMSE', 'MAE'],
                                                     cv=5,
                                                     verbose=True)
print('svdpp-----------------')
print(svdpp_temp)
nmf = surprise.NMF()
nmf_temp = surprise.model_selection.cross_validate(nmf,
                                                   rating_data,
                                                   measures=['RMSE', 'MAE'],
                                                   cv=5,
                                                   verbose=True)
print('nmf-----------------')
print(nmf_temp)
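
rating_data is not defined in this snippet; one plausible construction, assuming a ratings DataFrame with user, item, and rating columns (the file and column names are illustrative):

import pandas as pd
import surprise

ratings = pd.read_csv('ratings.csv')  # hypothetical input file
reader = surprise.Reader(rating_scale=(1, 5))
rating_data = surprise.Dataset.load_from_df(
    ratings[['userId', 'movieId', 'rating']], reader)
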
Example #7
def main(args):

    parser = argparse.ArgumentParser(description= \
        'Deploys recommendation algorithms and outputs the recommendations list',\
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("--pickleLoadPath", type=str, action='store', \
        help= 'If set=> load topN recoms from pickle file')
    parser.add_argument("--pickleSavePath",
                        type=str,
                        action='store',
                        help='If set => Output .pickle file.')

    parser.add_argument("--proc", type=int, default=multiprocessing.cpu_count(), \
        action='store', \
        help= 'Number of processes to spawn for topN computation\n' +
        'default is number of processors.')
    parser.add_argument("--update_freq", type=int, default=1, action='store', \
        help= 'Number of clicks after which the model is updated')
    parser.add_argument("--topN_list", type=int, nargs="+", required=True, \
        help= 'e.g., --topN_list 5 10 50\n' \
        + 'topN=max(topN_list); the rest of the values are used for evaluation.')
    parser.add_argument("--drop_ratio", type=int, default=0, action='store', \
        help= 'Number of random events to remove from the training set;\n' + \
        'default is 0; Currently not implemented for librec.')
    parser.add_argument("--evalTrain", dest='evalTrain', action='store_true', \
        help='If set => evaluate on training set using k-fold validation.\n' \
            + 'Else => evaluate only on test set')

    parser.add_argument("--dataset", type=str, action='store', \
        help= 'Full path to the dataset.\n' + \
        'Must give --testSize and --validSize for the split')
    parser.add_argument("--testSize",
                        type=int,
                        default=0,
                        action='store',
                        help='TestSet size; default is 0 => no test set')
    parser.add_argument("--validSize", type=int, default=2000, action='store', \
        help= 'Validation Set size; default is 2000.')
    parser.add_argument("--trainSet", type=str, action='store', \
        help= 'Full path to the trainingSet.csv\n' + \
        'If given the (potential) training set split from --dataset will be overwritten')
    parser.add_argument("--validSet", type=str, action='store', \
        help= 'Full path to the validationSet.csv\n' + \
        'If given the (potential) validation set split from --dataset will be overwritten')
    parser.add_argument("--testSet", type=str, action='store', \
        help= 'Full path to the testSet.csv\n' + \
        'If given the (potential) test set split from --dataset will be overwritten')

    parser.add_argument("--librec_home", type=str, action='store', \
        help= 'Full path to the librec folder cloned from git.')
    parser.add_argument("--config", type=str, action='store', \
        help= 'Full path to the librec .properties file.\n' + \
        'Copy from: https://www.librec.net/dokuwiki/doku.php?id=AlgorithmList')
    parser.add_argument("--surprise_algo", type=str, action='store', \
        help= 'Choose algorithm from surprise lib. Available options:\n' + \
        '--surprise_algo SVD\n' + \
        '--surprise_algo SVDpp\n' + \
        '--surprise_algo PMF\n' + \
        '--surprise_algo NMF\n' + \
        '--surprise_algo KNNWithMeans\n')

    args = parser.parse_args(args)

    random.seed(42)  # reproducibility
    np.random.seed(42)

    if args.pickleLoadPath is None:
        """DATA"""
        train, valid, test = splitter.splitData(
              fullDataPath=args.dataset, validSize=args.validSize, testSize=args.testSize, \
              trainSetPath=args.trainSet, validSetPath=args.validSet, testSetPath=args.testSet)
        """RECOMMENDATIONS"""
        if args.surprise_algo == 'SVD':
            algo = surprise.SVD()
        elif args.surprise_algo == 'KNNWithMeans':
            #     sim_options = {'name': 'pearson_baseline', 'shrinkage': 2500, \
            #        'user_based': False, }
            sim_options = {'name': 'cosine', 'user_based': False}
            algo = surprise.KNNWithMeans(k=40, sim_options=sim_options)
        elif args.surprise_algo == 'PMF':
            # PMF corresponds to unbiased matrix factorization,
            # i.e. SVD with biased=False in Surprise
            algo = surprise.SVD(n_factors=5,
                                biased=False,
                                reg_all=0.12,
                                lr_all=0.005,
                                n_epochs=400)
        elif args.surprise_algo == 'NMF':
            algo = surprise.NMF(n_factors=5, n_epochs=400)
        elif args.surprise_algo == 'SVDpp':
            algo = surprise.SVDpp()

        testList = []  # output recommendations for the last element
        if len(test) > 0:
            testList.append(test)
        if len(valid) > 0:
            testList.append(valid)

        for test in testList:
            if args.librec_home is None:
                recs = surprise_recom(train, test, algo, drop_ratio=args.drop_ratio, \
                    update_freq=args.update_freq, N_list=args.topN_list, num=args.proc, \
                    evalTrain=args.evalTrain)
            else:
                recs = librec_recom(train, test, args.librec_home, args.config, \
                    update_freq=args.update_freq, N_list=args.topN_list, num=args.proc, \
                    evalTrain=args.evalTrain)

        if args.pickleSavePath is not None:
            with open(args.pickleSavePath, 'wb') as handle:
                pickle.dump(recs, handle)

    else:
        with open(args.pickleLoadPath, 'rb') as handle:
            recs = pickle.load(handle)
Example #8
def SurpriseBased(table, relation_name, parameters, verbose=False):
    """

    """

    report = {}

    # Initial checks
    param_keys = list(parameters.keys())
    if ('max_scale' not in param_keys) or ('min_scale' not in param_keys):
        raise ValueError(
            'max_scale and min_scale must be specified in parameters for explicit RS.'
        )
    if 'model_size' not in param_keys:
        raise ValueError(
            'model_size must be specified in parameters for SURPRISE-based RS.'
        )
    if 'topK_predictions' not in param_keys:
        raise ValueError(
            'A size (K) must be given for the recommended list size (topK).')

    # Retrieving names
    start_group = table.start_group.iloc[0]
    end_group = table.end_group.iloc[0]
    timestamp = pd.NaT  # placeholder; no timestamp exists for predicted links

    # Retrieving the table of the bipartite graph in SURPRISE format
    table = table[['start_object', 'end_object', 'value']]
    reader = surprise.Reader(rating_scale=(parameters['min_scale'],
                                           parameters['max_scale']))
    data = surprise.Dataset.load_from_df(table, reader)

    # Selecting the method from the SURPRISE module
    if parameters['method'] == 'UBCF':
        method = surprise.KNNBasic(k=parameters['model_size'], verbose=verbose)
    elif parameters['method'] == 'Z-UBCF':
        method = surprise.KNNWithZScore(k=parameters['model_size'])
    elif parameters['method'] == 'IBCF':
        method = surprise.KNNBasic(k=parameters['model_size'],
                                   sim_options={'user_based': False})
    elif parameters['method'] == 'SVD':
        method = surprise.SVD(n_factors=parameters['model_size'])
    elif parameters['method'] == 'NMF':
        method = surprise.NMF(n_factors=parameters['model_size'])
    elif parameters['method'] == 'CClustering':
        method = surprise.CoClustering(n_cltr_u=parameters['model_size'],
                                       n_cltr_i=parameters['model_size'])
    else:
        raise ValueError('Unrecognized SURPRISE-based RS method named %s' %
                         parameters['method'])

    # Computing utility metrics if so specified
    if 'RMSE' in param_keys:
        if parameters['RMSE']:
            results = surprise.model_selection.validation.cross_validate(
                method, data, measures=['rmse'], cv=5, verbose=verbose)
            rmse = results['test_rmse'].mean()
            report['RMSE'] = rmse

    # Training the prediction method
    trainset = data.build_full_trainset()
    del data
    method.fit(trainset)

    # Retrieving unobserved pairs
    t = TCounter()
    VerboseMessage(verbose, 'Producing unobserved links...')
    unobserved_links = trainset.build_anti_testset()
    VerboseMessage(
        verbose,
        'Unobserved links produced in %s.' % (ETSec2ETTime(TCounter() - t)))

    # Making the predictions
    t = TCounter()
    VerboseMessage(verbose, 'Making predictions for unobserved links...')
    predictions = method.test(unobserved_links)
    VerboseMessage(
        verbose, 'Predictions for Unobserved links produced in %s.' %
        (ETSec2ETTime(TCounter() - t)))

    # Prefiltering predictions with lower scores
    if 'prefilter_threshold' in param_keys:
        t = TCounter()
        VerboseMessage(
            verbose, 'Prefiltering %d predictions scores lower than %0.1f...' %
            (len(predictions), parameters['prefilter_threshold']))
        predictions = [
            p for p in predictions if p.est > parameters['prefilter_threshold']
        ]
        VerboseMessage(
            verbose, 'Predictions prefiltered in %s, %d remaining.' %
            (ETSec2ETTime(TCounter() - t), len(predictions)))

    # Selecting only top K predictions
    t = TCounter()
    VerboseMessage(
        verbose,
        'Selecting top %d predictions...' % (parameters['topK_predictions']))
    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_recs[uid].append((iid, est))
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_recs[uid] = user_ratings[:parameters['topK_predictions']]
    VerboseMessage(
        verbose,
        'Predictions selected in %s.' % (ETSec2ETTime(TCounter() - t)))

    # Putting the predictions in a DataFrame
    predictions_table = pd.DataFrame(columns=[
        'relation', 'start_group', 'start_object', 'end_group', 'end_object',
        'value', 'timestamp'
    ])
    counter = 0
    t = TCounter()
    VerboseMessage(verbose, 'Arranging predictions into a DataFrame table...')
    for k, v in top_recs.items():
        for r in v:
            predictions_table.loc[counter] = [
                relation_name, start_group, k, end_group, r[0], r[1], timestamp
            ]
            counter += 1

    VerboseMessage(
        verbose, 'Predictions arranged into a table in %s.' %
        (ETSec2ETTime(TCounter() - t)))

    return predictions_table, report
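
Note that build_anti_testset materializes every unobserved (user, item) pair, so its size grows with users x items; for large bipartite graphs this list, and the subsequent method.test call over it, dominate both memory use and runtime.
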
Example #9
    return iid_to_title, iid_ratings

def recomDf(uid, recoms=recoms):
    # call recom once and reuse its two return values
    titles, scores = recom(uid, 5)
    recoms["title"] = titles
    recoms["predicted_score"] = scores
    recoms = recoms.sort_values(by="predicted_score", ascending=False)
    print(recoms)
    return recoms

recom(52, 5)
recomDf(52)

alg1 = surprise.SVD()
alg2 = surprise.KNNBasic()
alg3 = surprise.NMF()

#cross_validate(alg1, data, measures=['RMSE', 'MAE', "MSE"], cv=5, verbose=True)
#cross_validate(alg2, data, measures=['RMSE', 'MAE', "MSE"], cv=5, verbose=True)
#cross_validate(alg3, data, measures=['RMSE', 'MAE', "MSE"], cv=5, verbose=True)


##############
# EVALUATION #
##############

benchmark = []
# Iterate over all algorithms --> each fold serves once as the test set, the remaining k-1 folds as training
for algorithm in [SVD(), NMF(), KNNBasic()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE', 'MAE', "MSE"], cv=5, verbose=False)
Example #10
def load_data(path, r_range):
    train_set = convert_to_df(np.load(path + ".train"), r_range)
    test_set = convert_to_df(np.load(path + ".test"), r_range)
    return (train_set.build_full_trainset(),
            test_set.build_full_trainset().build_testset())
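
convert_to_df is not shown in this fragment; a plausible sketch, consistent with the tail of Example #14 below, assuming each .train/.test file is a NumPy-saved array of (user, item, rating) rows:

import numpy as np
import pandas as pd
import surprise as sp

def convert_to_df(arr, r_range):
    # arr is assumed to be an (n, 3) array of (user, item, rating) rows
    df = pd.DataFrame(arr, columns=['uid', 'vid', 'r'])
    reader = sp.Reader(rating_scale=r_range)
    return sp.Dataset.load_from_df(df[['uid', 'vid', 'r']], reader)
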


if __name__ == '__main__':
    PREFIX = "/home/mlsnrs/data/pxd/paper4graduation/paper_exp/dataset/"
    names = ['ml-latest-small/ml', 'BX-CSV-Dump/bx']
    teller = ["MovieLens", "BookCrossing"]
    r_ranges = [(1, 5), (1, 10), (1, 5)]
    algos = [
        sp.SVD(n_factors=10, biased=False, verbose=True),
        sp.NMF(n_factors=15, verbose=True)
    ]
    algos_names = ['SVD', 'NMF']
    max_epoch = 20

    for i, name in enumerate(names):
        print("BEGIN {}".format(teller[i]))
        train_set, test_set = load_data(PREFIX + name, r_ranges[i])
        for j in range(2):
            for epoch in range(1, max_epoch + 1):
                avg_rmse = 0.0
                avg_mae = 0.0
                for k in range(5):
                    if (j == 0):
                        algo = sp.SVD(n_factors=10,
                                      n_epochs=epoch,
Example #11
tfidf_matrix = tfidf.fit_transform(articles['title'])
sim_mat_tfidf = linear_kernel(tfidf_matrix, tfidf_matrix)
del articles, tfidf_matrix
'''
    5.  NMF

'''
dat = df_clicks.copy()
dat['click'] = 1

R = dat.pivot(index='userId', columns='articleId', values='click').fillna(0)

reader = surprise.Reader(rating_scale=(0, 1))
data = surprise.Dataset.load_from_df(dat[['userId', 'articleId', 'click']], reader)

nmf_alg = surprise.NMF(random_state=123)

# n_factors: Surprise NMF default is 15
nmf_output = nmf_alg.fit(data.build_full_trainset())
#print(nmf_output.qi.shape)

sim_mat_NMF = 1 - pairwise_distances(nmf_output.qi, metric='cosine')
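# Note: nmf_output.qi is the item-factor matrix learned by NMF (one row per
# inner item id), so the cosine similarity between its rows yields an
# item-item similarity matrix.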

#print(sim_mat_NMF.shape)
#print(sim_mat_NMF[:5, :5])

del dat, data, reader, nmf_output
'''
    6.  SVD

'''
Example #12
    elif args.model == 'BaselineOnly':
        model = surprise.BaselineOnly()
    elif args.model == 'KNNBasic':
        model = surprise.KNNBasic()
    elif args.model == 'KNNWithMeans':
        model = surprise.KNNWithMeans()
    elif args.model == 'KNNWithZScore':
        model = surprise.KNNWithZScore()
    elif args.model == 'KNNBaseline':
        model = surprise.KNNBaseline()
    elif args.model == 'SVD':
        model = surprise.SVD()
    elif args.model == 'SVDpp':
        model = surprise.SVDpp(verbose=True)
    elif args.model == 'NMF':
        model = surprise.NMF()
    elif args.model == 'SlopeOne':
        model = surprise.SlopeOne()
    elif args.model == 'CoClustering':
        model = surprise.CoClustering()

    # cross_validate(model, trainset, cv=5, verbose=True)
    model.fit(trainset)

    lines = []
    test_path = path + '/Data/test_format.txt'
    with open(test_path, 'r') as f:
        for line in tqdm(f.readlines()):
            user_id, item_id, timestamp, *tags = line.strip().split(',')
            rating = model.predict(user_id, item_id).est
            lines.append("{:.5}\n".format(float(rating)))
Example #13
    def build_network(self):
        self.model = surprise.NMF(n_factors=self.n_factors)
Example #14
    df = pd.DataFrame(ratings_dict)
    return sp.Dataset.load_from_df(df[['uid', 'vid', 'r']], reader)


def load_data(path, r_range):
    train_set = convert_to_df(np.load(path + ".train"), r_range)
    test_set = convert_to_df(np.load(path + ".test"), r_range)
    return (train_set.build_full_trainset(),
            test_set.build_full_trainset().build_testset())
    
    


if __name__ == '__main__':
    PREFIX = "/Users/morino/Downloads/dataset/"
    names = ['ml-latest-small/ml', 'BX-CSV-Dump/bx', 'jester/jester']
    teller = ["MovieLens", "BookCrossing", "Jester"]
    r_ranges = [(1, 5), (1, 10), (0, 20)]
    algos = [sp.SVD(biased=False), sp.SVDpp(), sp.NMF()]
    algos_names = ['SVD', 'SVD++', 'NMF']

    for i, name in enumerate(names):
        print("BEGIN {}".format(teller[i]))
        train_set, test_set = load_data(PREFIX + name, r_ranges[i])
        for j, algo in enumerate(algos):
            algo.fit(train_set)
            preds = algo.test(test_set)
            print("{} RMSE {}".format(algos_names[j], sp.accuracy.rmse(preds)))
        print("END {}".format(teller[i]))
Example #15
    return sp.Dataset.load_from_df(df[['uid', 'vid', 'r']], reader)


def load_data(path, r_range):
    train_set = convert_to_df(np.load(path + ".train"), r_range)
    test_set = convert_to_df(np.load(path + ".test"), r_range)
    return (train_set.build_full_trainset(),
            test_set.build_full_trainset().build_testset())


if __name__ == '__main__':
    PREFIX = "/home/mlsnrs/data/pxd/paper4graduation/paper_exp/dataset/"
    names = ['ml-latest-small/ml', 'BX-CSV-Dump/bx', 'douban/douban']
    teller = ["MovieLens", "BookCrossing", "douban"]
    r_ranges = [(1, 5), (1, 10), (1, 5)]
    algos = [
        sp.SVD(n_factors=10, biased=False, verbose=True),
        sp.NMF(n_factors=15, verbose=True)
    ]
    algos_names = ['SVD', 'NMF']

    for i, name in enumerate(names):
        print("BEGIN {}".format(teller[i]))
        train_set, test_set = load_data(PREFIX + name, r_ranges[i])
        for j, algo in enumerate(algos):
            algo.fit(train_set)
            preds = algo.test(test_set)
            print("{} RMSE {}".format(algos_names[j], sp.accuracy.rmse(preds)))
            sp.accuracy.mae(preds)
        print("END {}".format(teller[i]))
Example #16
# identifier_df_train = X_train[['user_id', 'business_id']]
# identifier_df_test = X_test[['user_id', 'business_id']]

# A reader is needed, but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(train_df, reader)

sim_options = {'user_based': False}

results = []

# Iterate over all algorithms
for algorithm in [
        SVD(),
        surprise.NMF(),
        surprise.SlopeOne(),
        surprise.CoClustering(),
        surprise.KNNBasic(sim_options=sim_options),
        surprise.KNNWithMeans(sim_options=sim_options),
        surprise.KNNWithZScore(sim_options=sim_options),
        surprise.KNNBaseline(sim_options=sim_options),
        surprise.NormalPredictor(),
        surprise.BaselineOnly()
]:

    # Get the algorithm's class name for naming the pickle file
    alg_name = type(algorithm).__name__
Example #17
          epochs=2,
          validation_split=0.1,
          shuffle=True)

y_pred = model.predict([df_hybrid_test['User'], df_hybrid_test['Movie'], test_tfidf])
y_true = df_hybrid_test['Rating'].values

rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Hybrid Deep Learning: {:.4f} RMSE'.format(rmse))

# Load dataset into surprise specific data-structure
data = sp.Dataset.load_from_df(df_filterd[['User', 'Movie', 'Rating']].sample(20000), sp.Reader())

benchmark = []
# Iterate over all algorithms
for algorithm in [
        sp.SVD(), sp.SVDpp(), sp.SlopeOne(), sp.NMF(), sp.NormalPredictor(),
        sp.KNNBaseline(), sp.KNNBasic(), sp.KNNWithMeans(),
        sp.KNNWithZScore(), sp.BaselineOnly(), sp.CoClustering()
]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    # pd.Series.append was removed in pandas 2.0; use pd.concat instead
    tmp = pd.concat([
        tmp,
        pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                  index=['Algorithm'])
    ])
    
    # Store data
    benchmark.append(tmp)
    
# Store results
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
    'test_rmse', ascending=False)

# Get data
data = surprise_results[['test_rmse', 'test_mae']]
Example #18
cross_validate(algo, data)['test_mae'].mean()

# In[50]:

sim_options = {'name': 'cosine'}
algo = surprise.KNNBasic(sim_options=sim_options)
cross_validate(algo, data)['test_mae'].mean()

# In[51]:

algo = surprise.SVD(n_factors=100)
cross_validate(algo, data)['test_mae'].mean()

# In[52]:

algo = surprise.NMF(n_factors=100)
cross_validate(algo, data)['test_mae'].mean()

# In[55]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

gc = pd.read_csv('C:/Users/USER/Desktop/test/GermanCredit.csv')
gc.head()

X = gc.iloc[:, 1:31]
y = gc['RESPONSE']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
Example #19
def main(train_df, target_df, cache_name="test", force_recompute=[]):
    """Train multiple models on train_df and predicts target_df

    Predictions are cached. If the indices don't match the indices of
    target_df, the cache is discarded.

    By default, if a method was already computed it is not recomputed again
    (except if the method name is listed in force_recompute). cache_name
    is the name to use to read and write the cache.

    Arguments:
        train_df {dataframe} -- Training dataframe
        target_df {dataframe} -- Testing dataframe

    Keyword Arguments:
        cache_name {str} -- Name to use for caching (default: {"test"})
        force_recompute {list} -- Name(s) of methods to recompute, whether or
        not it was already computed. Useful to only recompute single methods
        without discarding the rest. (default: {[]})

    Returns:
        Dataframe -- Dataframe with predictions for each methods as columns,
        IDs as indices
    """
    global algo_in_use
    CACHED_DF_FILENAME = os.path.dirname(
        os.path.abspath(__file__)) +\
        "/cache/cached_predictions_{}.pkl".format(cache_name)
    train_df = preprocess_df(train_df)
    trainset = pandas_to_data(train_df)
    ids_to_predict = target_df["Id"].to_list()

    # try to retrieve backup dataframe
    try:
        print("Retrieving cached predictions")
        all_algos_preds_df = pd.read_pickle(CACHED_DF_FILENAME)
        print("Ensuring cached IDs match given IDs")
        assert sorted(ids_to_predict) == sorted(
            all_algos_preds_df.index.values)
        print("Indices match, continuing")
    except (FileNotFoundError, AssertionError):
        print("No valid cached predictions found")
        all_algos_preds_df = pd.DataFrame(ids_to_predict, columns=["Id"])
        all_algos_preds_df.set_index("Id", inplace=True)

    all_algos = {
        "SVD": spr.SVD(n_factors=200, n_epochs=100),
        "Baseline": spr.BaselineOnly(),
        "NMF": spr.NMF(n_factors=30, n_epochs=100),
        "Slope One": spr.SlopeOne(),
        "KNN Basic": spr.KNNBasic(k=60),
        "KNN Means": spr.KNNWithMeans(k=60),
        "KNN Baseline": spr.KNNBaseline(),
        "KNN Zscore": spr.KNNWithZScore(k=60),
        "SVD ++": spr.SVDpp(n_factors=40, n_epochs=100),
        "Co Clustering": spr.CoClustering()
    }

    for name in all_algos:
        print("##### {} ####".format(name))
        if name in force_recompute and name in all_algos_preds_df.columns:
            all_algos_preds_df.drop(name, axis=1, inplace=True)
        if name in all_algos_preds_df.columns:
            print("Already computed {}, skipping".format(name))
            continue
        algo = all_algos[name]
        time.sleep(1)
        algo.fit(trainset)
        time.sleep(1)
        algo_in_use = algo
        print("Generating predictions...")
        predictions = parallelize_predictions(ids_to_predict, 80)
        print("Done. Merging with previous results")
        this_algo_preds_df = pd.DataFrame(predictions, columns=["Id", name])
        this_algo_preds_df.set_index("Id", inplace=True)
        all_algos_preds_df = pd.merge(all_algos_preds_df,
                                      this_algo_preds_df,
                                      left_index=True,
                                      right_index=True)
        all_algos_preds_df.to_pickle(CACHED_DF_FILENAME)
    print("DONE computing surprize")
    return all_algos_preds_df
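
A hedged usage sketch, assuming train_df holds Surprise-style rating columns and target_df has the 'Id' column the docstring requires (file names are illustrative):

import pandas as pd

train_df = pd.read_csv('train_ratings.csv')       # hypothetical input
target_df = pd.read_csv('sample_submission.csv')  # must contain an 'Id' column
preds = main(train_df, target_df, cache_name="test",
             force_recompute=["KNN Basic"])       # recompute only this method
print(preds.head())
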
Example #20
# defining the number of folds = 5
print("Performing splits...")
kf = sp.model_selection.KFold(n_splits=5, random_state=0)
print("Done.")

###
### PART 1.1
###
'''
Apply all of the recommendation algorithms made available by the
"Surprise" library, each in its default configuration.
'''
algorithms = [sp.NormalPredictor(), sp.BaselineOnly(), sp.KNNBasic(),\
              sp.KNNWithMeans(), sp.KNNWithZScore(), sp.KNNBaseline(),\
              sp.SVD(), sp.SVDpp(), sp.NMF(), sp.SlopeOne(), sp.CoClustering()]
for algo in algorithms:
    start_time = time.time()
    sp.model_selection.cross_validate(algo, data, measures=['RMSE'], \
                                      cv=kf, n_jobs = 2, verbose=True)
    print("--- %s seconds ---" % (time.time() - start_time))
    print()

###
### PART 1.2
###
'''
Improve the quality of both the KNNBaseline and SVD methods by performing
hyper-parameter tuning with 5-fold random-search cross-validation (KNN).
'''
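
A minimal sketch of the random-search tuning described above for KNNBaseline, using Surprise's RandomizedSearchCV (the parameter ranges are illustrative; kf and data are the objects defined earlier in this example):

from surprise.model_selection import RandomizedSearchCV

param_distributions = {
    'k': [20, 40, 60, 80],
    'sim_options': {'name': ['pearson_baseline', 'cosine'],
                    'user_based': [True, False]},
}
rs = RandomizedSearchCV(sp.KNNBaseline, param_distributions,
                        n_iter=10, measures=['rmse'], cv=kf, n_jobs=2)
rs.fit(data)
print(rs.best_params['rmse'], rs.best_score['rmse'])
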