def __init__(self, hyper_params, user_count, item_count):
    """Build the Surprise estimator selected by ``hyper_params['model_type']``.

    Args:
        hyper_params: Mapping that must contain ``'latent_size'`` (passed as
            ``n_factors`` to the NMF, SVD and SVD++ models) and
            ``'model_type'``, one of ``'kNN'``, ``'NMF'``, ``'SVD'``,
            ``'SVD++'`` or ``'baseline'``.
        user_count: Stored unchanged on ``self.user_count``.
        item_count: Stored unchanged on ``self.item_count``.

    Raises:
        KeyError: If ``'latent_size'`` or ``'model_type'`` is missing.
        ValueError: If ``model_type`` is not one of the supported values.
    """
    latent_size = hyper_params['latent_size']
    model_type = hyper_params['model_type']

    if model_type == 'kNN':
        self.model = surprise.prediction_algorithms.knns.KNNBasic(
            k=10, verbose=True)
    elif model_type == 'NMF':
        self.model = surprise.NMF(n_factors=latent_size, biased=False,
                                  n_epochs=50, verbose=True)
    elif model_type == 'SVD':
        self.model = surprise.SVD(n_factors=latent_size, verbose=True)
    elif model_type == 'SVD++':
        self.model = surprise.SVDpp(n_factors=latent_size, verbose=True)
    elif model_type == 'baseline':
        bsl_options = {
            'method': 'sgd',
            'n_epochs': 20,
        }
        self.model = surprise.prediction_algorithms.baseline_only.BaselineOnly(
            bsl_options=bsl_options, verbose=True)
    else:
        # Previously an unknown type fell through silently and left
        # ``self.model`` undefined, which only surfaced later as an
        # AttributeError far away from the real cause.
        raise ValueError(
            "Unsupported model_type %r; expected one of 'kNN', 'NMF', "
            "'SVD', 'SVD++', 'baseline'." % (model_type,))

    self.hyper_params = hyper_params
    self.user_count = user_count
    self.item_count = item_count
def NMF(train, test):
    """
    Fit Surprise's NMF with its default settings and predict the test set.

    @param train: the training set in the Surprise format.
    @param test: the test set in the Surprise format.
    @return: the predictions in a numpy array.
    """
    model = spr.NMF()
    model.fit(train)
    # get_predictions converts Surprise's prediction list to the return format.
    return get_predictions(model.test(test))
def main(args):
    """Cross-validate one Surprise algorithm and pickle the resulting scores.

    The output file name is built from ``args.exp_name``, ``args.algorithm``,
    ``args.sim_name``, the user/item-based flag and ``args.num_rows``.

    Args:
        args: Namespace providing ``exp_name``, ``algorithm``, ``sim_name``,
            ``item_based``, ``num_rows`` and ``cv_folds``.

    Exits with status 1 when the output file already exists or when
    ``args.algorithm`` is not a known choice. Both checks run before the
    ratings are loaded, so an invalid invocation fails fast.
    """
    user_item_based = 'item_based' if args.item_based else 'user_based'
    filename = '_'.join([
        args.exp_name, args.algorithm, args.sim_name, user_item_based,
        str(args.num_rows)
    ]) + '.pkl'
    output_file = Path(filename)
    if output_file.exists():
        print(f'ERROR! Output file {output_file} already exists. Exiting!')
        sys.exit(1)

    sim_options = {
        'name': args.sim_name,
        'user_based': not args.item_based,
    }
    # Factories are lazy so the estimator is only built once the name has
    # been validated (and, as before, after the data has been loaded).
    algo_factories = {
        'knn': lambda: surprise.KNNBasic(sim_options=sim_options),
        'baseline': lambda: surprise.BaselineOnly(),
        'normal': lambda: surprise.NormalPredictor(),
        'knn_zscore': lambda: surprise.KNNWithZScore(sim_options=sim_options),
        'svd': lambda: surprise.SVD(),
        'nmf': lambda: surprise.NMF(),
    }
    if args.algorithm not in algo_factories:
        # Previously this only printed and then crashed with a NameError on
        # ``algo`` after the expensive parquet load.
        print(f'Algorithm {args.algorithm} is not a valid choice.')
        sys.exit(1)

    print(f'Saving scores in {output_file}\n')

    reader = surprise.Reader(rating_scale=(1, 5))
    df = pq.read_table('all_ratings_with_indices.parquet',
                       columns=['user_idx', 'movie_idx',
                                'rating']).to_pandas()
    # Downcast to the narrow dtypes used by the original script.
    df.user_idx = df.user_idx.astype(np.uint32)
    df.movie_idx = df.movie_idx.astype(np.uint16)
    df.rating = df.rating.astype(np.uint8)
    print(df.dtypes)

    data = surprise.Dataset.load_from_df(df[:args.num_rows], reader=reader)
    del df  # release the full frame before cross-validation starts

    algo = algo_factories[args.algorithm]()
    scores = surprise.model_selection.cross_validate(algo,
                                                     data,
                                                     cv=args.cv_folds,
                                                     verbose=True,
                                                     n_jobs=-1)

    # Context manager guarantees the handle is flushed and closed.
    with open(output_file, 'wb') as handle:
        pickle.dump(scores, handle)
def algo_tester(data_object):
    '''
    Produces a dataframe displaying the RMSE, fit time and test time of the
    different surprise algorithms, averaged over 3 cross-validation folds.

    ---Parameters---
    data_object(variable) created from the read_data_surprise function

    ---Returns---
    returns a dataframe indexed by algorithm name and sorted by ascending
    'test_rmse', where you can compare the performance of different
    algorithms
    '''
    algos = [
        sp.SVDpp(),
        sp.SVD(),
        sp.SlopeOne(),
        sp.NMF(),
        sp.NormalPredictor(),
        sp.KNNBaseline(),
        sp.KNNBasic(),
        sp.KNNWithMeans(),
        sp.KNNWithZScore(),
        sp.BaselineOnly(),
        sp.CoClustering()
    ]

    benchmark = []
    # Iterate over all algorithms
    for algorithm in algos:
        # Perform cross validation
        results = cross_validate(algorithm,
                                 data_object,
                                 measures=['RMSE'],
                                 cv=3,
                                 verbose=False)

        # Average every metric over the folds.
        row = pd.DataFrame.from_dict(results).mean(axis=0)
        # Series.append was removed in pandas 2.0; label assignment adds the
        # same entry. The class name equals what parsing str(algorithm) gave.
        row['Algorithm'] = type(algorithm).__name__
        benchmark.append(row)

    return pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
        'test_rmse')
testset = data.construct_testset(A_test_dense) # SVDpp: algo = surprise.SVDpp() algo.fit(trainset) predictions = algo.test(testset) print("model SVDpp: ") # Then compute RMSE accuracy.rmse(predictions) print("NDCG: " + str(sur_ndcg(atstd, predictions, product_index))) print("Precision: " + str(sur_precision(atstd, predictions, product_index))) # NMF: algo = surprise.NMF() algo.fit(trainset) predictions = algo.test(testset) print("model NMF: ") accuracy.rmse(predictions) print("NDCG: " + str(sur_ndcg(atstd, predictions, product_index))) print("Precision: " + str(sur_precision(atstd, predictions, product_index))) # SlopeOne: algo = surprise.SlopeOne() algo.fit(trainset) predictions = algo.test(testset) print("model SlopeOne: ") # Then compute RMSE
# Each model is scored with 5-fold cross-validation on `rating_data`; the
# returned score dictionaries are kept in the `*_temp` names and printed.
knnBasic = surprise.KNNBasic()
knnBasic_temp = surprise.model_selection.cross_validate(
    knnBasic, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnBasic-----------------')
print(knnBasic_temp)

knnWithMeans = surprise.KNNWithMeans()
knnWithMeans_temp = surprise.model_selection.cross_validate(
    knnWithMeans, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnWithMeans-----------------')
print(knnWithMeans_temp)

knnBaseline = surprise.KNNBaseline()
knnBaseline_temp = surprise.model_selection.cross_validate(
    knnBaseline, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnBaseline-----------------')
print(knnBaseline_temp)

svdpp = surprise.SVDpp()
svdpp_temp = surprise.model_selection.cross_validate(
    svdpp, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('svdpp-----------------')
# Print the scores like every other section; the original printed the
# estimator object (`svdpp`) here by mistake.
print(svdpp_temp)

nmf = surprise.NMF()
nmf_temp = surprise.model_selection.cross_validate(
    nmf, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('nmf-----------------')
print(nmf_temp)
def main(args):
    """Compute, or load from a pickle, top-N recommendation lists.

    Parses ``args`` (a list of command-line tokens), seeds the ``random`` and
    ``numpy`` generators, and then either:

    * loads previously computed recommendations from ``--pickleLoadPath``, or
    * splits the data with ``splitter.splitData`` and runs ``surprise_recom``
      (when ``--librec_home`` is not given) or ``librec_recom`` on the test
      and validation sets, optionally pickling the recommendations of the
      last evaluated set to ``--pickleSavePath``.

    An unrecognised ``--surprise_algo``, or a missing one when it is actually
    needed, is reported through ``parser.error`` (exit status 2) instead of
    failing later with a ``NameError``.
    """
    parser = argparse.ArgumentParser(
        description='Deploys recommendation algorithms and outputs the '
        'recommendations list',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--pickleLoadPath", type=str, action='store',
                        help='If set=> load topN recoms from pickle file')
    parser.add_argument("--pickleSavePath", type=str, action='store',
                        help='If set => Output .pickle file.')
    parser.add_argument("--proc", type=int,
                        default=multiprocessing.cpu_count(), action='store',
                        help='Number of processes to spawn for topN '
                        'computation\n'
                        'default is number of processors.')
    parser.add_argument("--update_freq", type=int, default=1, action='store',
                        help='Number of clicks after which the model is '
                        'updated')
    parser.add_argument("--topN_list", type=int, nargs="+", required=True,
                        help='e.g., --topN_list 5 10 50\n'
                        'topN=max(topN_list); the rest of the values are '
                        'used for evaluation.')
    parser.add_argument("--drop_ratio", type=int, default=0, action='store',
                        help='Number of random events to remove from the '
                        'training set;\n'
                        'default is 0; Currently not implemented for librec.')
    parser.add_argument("--evalTrain", dest='evalTrain', action='store_true',
                        help='If set => evaluate on training set using '
                        'k-fold validation.\n'
                        'Else => evaluate only on test set')
    parser.add_argument("--dataset", type=str, action='store',
                        help='Full path to the dataset.\n'
                        'Must give --testSize and --validSize for the split')
    parser.add_argument("--testSize", type=int, default=0, action='store',
                        help='TestSet size; default is 0 => no test set')
    parser.add_argument("--validSize", type=int, default=2000, action='store',
                        help='Validation Set size; default is 2000.')
    parser.add_argument("--trainSet", type=str, action='store',
                        help='Full path to the trainingSet.csv\n'
                        'If given the (potential) training set split from '
                        '--dataset will be overwritten')
    parser.add_argument("--validSet", type=str, action='store',
                        help='Full path to the validationSet.csv\n'
                        'If given the (potential) validation set split from '
                        '--dataset will be overwritten')
    parser.add_argument("--testSet", type=str, action='store',
                        help='Full path to the testSet.csv\n'
                        'If given the (potential) test set split from '
                        '--dataset will be overwritten')
    parser.add_argument("--librec_home", type=str, action='store',
                        help='Full path to the librec folder cloned from git.')
    parser.add_argument("--config", type=str, action='store',
                        help='Full path to the librec .properties file.\n'
                        'Copy from: https://www.librec.net/dokuwiki/'
                        'doku.php?id=AlgorithmList')
    parser.add_argument("--surprise_algo", type=str, action='store',
                        help='Choose algorithm from surprise lib. Available '
                        'options:\n'
                        '--surprise_algo SVD\n'
                        '--surprise_algo SVDpp\n'
                        '--surprise_algo PMF\n'
                        '--surprise_algo NMF\n'
                        '--surprise_algo KNNWithMeans\n')

    args = parser.parse_args(args)

    # Lazy factories: nothing from surprise is touched until an algorithm is
    # actually built, and the option can be validated before any data work.
    surprise_factories = {
        'SVD': lambda: surprise.SVD(),
        # Item-based cosine similarity; a pearson_baseline variant with
        # shrinkage 2500 was tried previously.
        'KNNWithMeans': lambda: surprise.KNNWithMeans(
            k=40, sim_options={'name': 'cosine', 'user_based': False}),
        # "PMF" is an SVD with a small factor count and explicit
        # regularisation / learning rate.
        'PMF': lambda: surprise.SVD(n_factors=5, reg_all=0.12, lr_all=0.005,
                                    n_epochs=400),
        'NMF': lambda: surprise.NMF(n_factors=5, n_epochs=400),
        'SVDpp': lambda: surprise.SVDpp(),
    }
    needs_surprise = (args.pickleLoadPath is None
                      and args.librec_home is None)
    if (needs_surprise and args.surprise_algo is not None
            and args.surprise_algo not in surprise_factories):
        parser.error('unknown --surprise_algo %r; choose from %s'
                     % (args.surprise_algo,
                        ', '.join(sorted(surprise_factories))))

    random.seed(42)  # reproducability
    np.random.seed(42)

    if args.pickleLoadPath is None:
        # DATA
        train, valid, test = splitter.splitData(
            fullDataPath=args.dataset, validSize=args.validSize,
            testSize=args.testSize, trainSetPath=args.trainSet,
            validSetPath=args.validSet, testSetPath=args.testSet)

        # RECOMMENDATIONS
        factory = surprise_factories.get(args.surprise_algo)
        algo = factory() if factory is not None else None

        # Output recommendations for the last element of this list.
        eval_sets = []
        if len(test) > 0:
            eval_sets.append(test)
        if len(valid) > 0:
            eval_sets.append(valid)

        if eval_sets and args.librec_home is None and algo is None:
            parser.error('--surprise_algo is required when --librec_home '
                         'is not given')

        recs = None
        for eval_set in eval_sets:
            if args.librec_home is None:
                recs = surprise_recom(train, eval_set, algo,
                                      drop_ratio=args.drop_ratio,
                                      update_freq=args.update_freq,
                                      N_list=args.topN_list, num=args.proc,
                                      evalTrain=args.evalTrain)
            else:
                recs = librec_recom(train, eval_set, args.librec_home,
                                    args.config,
                                    update_freq=args.update_freq,
                                    N_list=args.topN_list, num=args.proc,
                                    evalTrain=args.evalTrain)

        # Only write a file when recommendations were actually produced;
        # with no evaluation set `recs` used to be unbound at this point.
        if args.pickleSavePath is not None and recs is not None:
            with open(args.pickleSavePath, 'wb') as handle:
                pickle.dump(recs, handle)
    else:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point --pickleLoadPath at files this tool wrote itself.
        with open(args.pickleLoadPath, 'rb') as handle:
            recs = pickle.load(handle)
def SurpriseBased(table, relation_name, parameters, verbose=False):
    """Predict unobserved links of a bipartite graph with a SURPRISE model.

    Args:
        table: DataFrame with at least the columns ``start_group``,
            ``end_group``, ``start_object``, ``end_object`` and ``value``.
        relation_name: Value written to the ``relation`` column of the
            returned table.
        parameters: Mapping of settings. Required keys are ``min_scale``,
            ``max_scale``, ``model_size``, ``topK_predictions`` and
            ``method`` (``'UBCF'``, ``'Z-UBCF'``, ``'IBCF'``, ``'SVD'``,
            ``'NMF'`` or ``'CClustering'``). Optional keys are ``RMSE``
            (truthy => report a 5-fold cross-validated RMSE) and
            ``prefilter_score``, whose presence enables dropping predictions
            whose estimate is not above ``prefilter_threshold``.
        verbose: Forwarded to the progress messages and to some estimators.

    Returns:
        ``(predictions_table, report)``: the top-K predictions per start
        object as a DataFrame, and a dict that holds ``'RMSE'`` when it was
        requested.

    Raises:
        ValueError: If a required parameter is missing, if
            ``prefilter_score`` is given without ``prefilter_threshold``, or
            if ``method`` is not recognised.
    """
    report = {}

    # Initial checks
    if 'max_scale' not in parameters or 'min_scale' not in parameters:
        raise ValueError(
            'max_scale and min_scale must be specified in parameters for explicit RS.'
        )
    if 'model_size' not in parameters:
        raise ValueError(
            'model_size must be specified in parameters for SURPRISE-based RS.'
        )
    if 'topK_predictions' not in parameters:
        raise ValueError(
            'A size (K) must be given for the recommended list size (topK).')
    # The prefilter step is switched on by 'prefilter_score' but reads its
    # cut-off from 'prefilter_threshold'. Check the pair now so a mismatch
    # does not surface as a KeyError after the model has been trained.
    prefilter = 'prefilter_score' in parameters
    if prefilter and 'prefilter_threshold' not in parameters:
        raise ValueError(
            'prefilter_threshold must be specified in parameters when '
            'prefilter_score is given.')

    # Retrieving names
    start_group = table.start_group.iloc[0]
    end_group = table.end_group.iloc[0]
    # NOTE(review): presumably evaluates to NaT, i.e. "no timestamp" for
    # predicted links; confirm.
    timestamp = pd.Timestamp('')

    # Retrieving the table of the bipartite graph in SURPRISE format
    table = table[['start_object', 'end_object', 'value']]
    reader = surprise.Reader(rating_scale=(parameters['min_scale'],
                                           parameters['max_scale']))
    data = surprise.Dataset.load_from_df(table, reader)

    # Selecting the method from the SURPRISE module
    method_name = parameters['method']
    model_size = parameters['model_size']
    if method_name == 'UBCF':
        method = surprise.KNNBasic(k=model_size, verbose=verbose)
    elif method_name == 'Z-UBCF':
        method = surprise.KNNWithZScore(k=model_size)
    elif method_name == 'IBCF':
        method = surprise.KNNBasic(k=model_size,
                                   sim_options={'user_based': False})
    elif method_name == 'SVD':
        method = surprise.SVD(n_factors=model_size)
    elif method_name == 'NMF':
        method = surprise.NMF(n_factors=model_size)
    elif method_name == 'CClustering':
        method = surprise.CoClustering(n_cltr_u=model_size,
                                       n_cltr_i=model_size)
    else:
        raise ValueError('Unrecognized SURPRISE-based RS method named %s' %
                         method_name)

    # Computing utility metrics if so specified
    if parameters.get('RMSE'):
        results = surprise.model_selection.validation.cross_validate(
            method, data, measures=['rmse'], cv=5, verbose=verbose)
        report['RMSE'] = results['test_rmse'].mean()

    # Training the prediction method on all available data
    trainset = data.build_full_trainset()
    del data
    method.fit(trainset)

    # Retrieving unobserved pairs
    t = TCounter()
    VerboseMessage(verbose, 'Producing unobserved links...')
    unobserved_links = trainset.build_anti_testset()
    VerboseMessage(
        verbose, 'Unobserved links produced in %s.' %
        (ETSec2ETTime(TCounter() - t)))

    # Making the predictions
    t = TCounter()
    VerboseMessage(verbose, 'Making predictions for unobserved links...')
    predictions = method.test(unobserved_links)
    VerboseMessage(
        verbose, 'Predictions for Unobserved links produced in %s.' %
        (ETSec2ETTime(TCounter() - t)))

    # Prefiltering predictions with lower scores
    if prefilter:
        threshold = parameters['prefilter_threshold']
        t = TCounter()
        VerboseMessage(
            verbose, 'Prefiltering %d predictions scores lower than %0.1f...' %
            (len(predictions), threshold))
        # Index 3 of a prediction tuple is the estimated value.
        predictions = [p for p in predictions if p[3] > threshold]
        VerboseMessage(
            verbose, 'Predictions prefiltered in %s, %d remaining.' %
            (ETSec2ETTime(TCounter() - t), len(predictions)))

    # Selecting only top K predictions
    top_k = parameters['topK_predictions']
    t = TCounter()
    VerboseMessage(verbose, 'Selecting top %d predictions...' % (top_k))
    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_recs[uid].append((iid, est))
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_recs[uid] = user_ratings[:top_k]
    VerboseMessage(
        verbose,
        'Predictions selected in %s.' % (ETSec2ETTime(TCounter() - t)))

    # Putting the predictions in a DataFrame
    t = TCounter()
    VerboseMessage(verbose, 'Arranging predictions into a DataFrame table...')
    # Build all rows first and create the frame once: enlarging a DataFrame
    # row by row with .loc re-allocates on every insert. Column dtypes are
    # now inferred from the data instead of all being ``object``.
    rows = [[relation_name, start_group, uid, end_group, iid, est, timestamp]
            for uid, user_recs in top_recs.items()
            for iid, est in user_recs]
    predictions_table = pd.DataFrame(rows, columns=[
        'relation', 'start_group', 'start_object', 'end_group', 'end_object',
        'value', 'timestamp'
    ])
    VerboseMessage(
        verbose, 'Predictions arranged into a table in %s.' %
        (ETSec2ETTime(TCounter() - t)))

    return predictions_table, report
return iid_to_title, iid_ratings def recomDf(uid, recoms=recoms): recoms["title"] = recom(uid, 5)[0] recoms["predicted_score"] = recom(uid, 5)[1] recoms = recoms.sort_values(by="predicted_score", ascending=False) print(recoms) return recoms recom(52, 5) recomDf(52) alg1 = surprise.SVD() alg2 = surprise.KNNBasic() alg3 = surprise.NMF() #cross_validate(alg1, data, measures=['RMSE', 'MAE', "MSE"], cv=5, verbose=True) #cross_validate(alg2, data, measures=['RMSE', 'MAE', "MSE"], cv=5, verbose=True) #cross_validate(alg3, data, measures=['RMSE', 'MAE', "MSE"], cv=5, verbose=True) ############## # EVALUATION # ############## benchmark = [] # Iterate over all algorithms --> First Fold ist train, k-1 Folds for testing for algorithm in [SVD(), NMF(), KNNBasic()]: # Perform cross validation results = cross_validate(algorithm, data, measures=['RMSE', 'MAE', "MSE"], cv=5, verbose=False)
def load_data(path, r_range): train_set = convert_to_df(np.load(path + ".train"), r_range) test_set = convert_to_df(np.load(path + ".test"), r_range) return train_set.build_full_trainset(), test_set.build_full_trainset( ).build_testset() if __name__ == '__main__': PREFIX = "/home/mlsnrs/data/pxd/paper4graduation/paper_exp/dataset/" names = ['ml-latest-small/ml', 'BX-CSV-Dump/bx'] teller = ["MovieLens", "BookCrossing"] r_ranges = [(1, 5), (1, 10), (1, 5)] algos = [ sp.SVD(n_factors=10, biased=False, verbose=True), sp.NMF(n_factors=15, verbose=True) ] algos_names = ['SVD', 'NMF'] max_epoch = 20 for i, name in enumerate(names): print("BEGIN {}".format(teller[i])) train_set, test_set = load_data(PREFIX + name, r_ranges[i]) for j in range(2): for epoch in range(1, max_epoch + 1): avg_rmse = 0.0 avg_mae = 0.0 for k in range(5): if (j == 0): algo = sp.SVD(n_factors=10, n_epochs=epoch,
# Title-based similarity: tf-idf vectors of the article titles compared with
# a linear kernel.
tfidf_matrix = tfidf.fit_transform(articles['title'])
sim_mat_tfidf = linear_kernel(tfidf_matrix, tfidf_matrix)
# The inputs are dropped once the similarity matrix has been built.
del articles, tfidf_matrix
''' 5. NMF '''
# Every recorded click becomes an implicit rating of 1.
dat = df_clicks.copy()
dat['click'] = 1
# User x article matrix with 0 for "no click".
R = dat.pivot(index='userId', columns='articleId', values='click').fillna(0)
reader = surprise.Reader(rating_scale=(0, 1))
# NOTE(review): load_from_df expects exactly (user, item, rating) columns in
# that order; assumes df_clicks holds only userId and articleId. TODO confirm.
data = surprise.Dataset.load_from_df(dat, reader)
# n_factors is left at Surprise's default, which is 15 for NMF (50 is the
# default of n_epochs, not of n_factors).
nmf_alg = surprise.NMF(random_state=123)
nmf_output = nmf_alg.fit(data.build_full_trainset())
#print(nmf_output.qi.shape)
# Cosine similarity between the learned item factor vectors (rows of qi).
sim_mat_NMF = 1 - pairwise_distances(nmf_output.qi, metric='cosine')
#print(sim_mat_NMF.shape)
#print(sim_mat_NMF[:5, :5])
del dat, data, reader, nmf_output
''' 6. SVD '''
elif args.model == 'BaselineOnly': model = surprise.BaselineOnly() elif args.model == 'KNNBasic': model = surprise.KNNBasic() elif args.model == 'KNNWithMeans': model = surprise.KNNWithMeans() elif args.model == 'KNNWithZScore': model = surprise.KNNWithZScore() elif args.model == 'KNNBaseline': model = surprise.KNNBaseline() elif args.model == 'SVD': model = surprise.SVD() elif args.model == 'SVDpp': model = surprise.SVDpp(verbose=True) elif args.model == 'NMF': model = surprise.NMF() elif args.model == 'SlopeOne': model = surprise.SlopeOne() elif args.model == 'CoClustering': model = surprise.CoClustering() # cross_validate(model, trainset, cv=5, verbose=True) model.fit(trainset) lines = [] test_path = path + '/Data/test_format.txt' for line in tqdm(open(test_path, 'r').readlines()): user_id, item_id, timestamp, *tags = line.strip().split(',') rating = model.predict(user_id, item_id).est lines.append("{:.5}\n".format(float(rating)))
def build_network(self):
    """Create the Surprise NMF estimator and store it on ``self.model``.

    The number of latent factors is read from ``self.n_factors``.
    """
    n_factors = self.n_factors
    self.model = surprise.NMF(n_factors=n_factors)
df = pd.DataFrame(ratings_dict); return sp.Dataset.load_from_df(df[['uid','vid', 'r']], reader); def load_data(path, r_range): train_set = convert_to_df(np.load(path + ".train"), r_range); test_set = convert_to_df(np.load(path + ".test"), r_range); return train_set.build_full_trainset(), test_set.build_full_trainset().build_testset(); if __name__ == '__main__': PREFIX = "/Users/morino/Downloads/dataset/"; names = [ 'ml-latest-small/ml', 'BX-CSV-Dump/bx', 'jester/jester']; teller = ["MovieLens", "BookCrossing", "Jester"]; r_ranges = [(1, 5), (1, 10), (0, 20)]; algos = [sp.SVD(biased = False), sp.SVDpp(), sp.NMF()]; algos_names = ['SVD', 'SVD++', 'NMF'] for i, name in enumerate(names): print("BEGIN {}".format(teller[i])); train_set, test_set = load_data(PREFIX + name, r_ranges[i]); for j, algo in enumerate(algos): algo.fit(train_set); preds = algo.test(test_set); print("{} RMSE {}".format(algos_names[j], sp.accuracy.rmse(preds))); print("END {}".format(teller[i]));
return sp.Dataset.load_from_df(df[['uid', 'vid', 'r']], reader) def load_data(path, r_range): train_set = convert_to_df(np.load(path + ".train"), r_range) test_set = convert_to_df(np.load(path + ".test"), r_range) return train_set.build_full_trainset(), test_set.build_full_trainset( ).build_testset() if __name__ == '__main__': PREFIX = "/home/mlsnrs/data/pxd/paper4graduation/paper_exp/dataset/" names = ['ml-latest-small/ml', 'BX-CSV-Dump/bx', 'douban/douban'] teller = ["MovieLens", "BookCrossing", "douban"] r_ranges = [(1, 5), (1, 10), (1, 5)] algos = [ sp.SVD(n_factors=10, biased=False, verbose=True), sp.NMF(n_factors=15, verbose=True) ] algos_names = ['SVD', 'NMF'] for i, name in enumerate(names): print("BEGIN {}".format(teller[i])) train_set, test_set = load_data(PREFIX + name, r_ranges[i]) for j, algo in enumerate(algos): algo.fit(train_set) preds = algo.test(test_set) print("{} RMSE {}".format(algos_names[j], sp.accuracy.rmse(preds))) sp.accuracy.mae(preds) print("END {}".format(teller[i]))
# identifier_df_train = X_train[['user_id', 'business_id']]
# identifier_df_test = X_test[['user_id', 'business_id']]

# A reader is needed but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(train_df, reader)

# Item-based similarities for all KNN variants. The value must be a plain
# bool: the former `[False]` (grid-search style list) is a non-empty list,
# hence truthy, and made every KNN model run user-based instead.
sim_options = {'user_based': False}
results = []

# Iterate over all algorithms
for algorithm in [
        SVD(),
        surprise.NMF(),
        surprise.SlopeOne(),
        surprise.CoClustering(),
        surprise.KNNBasic(sim_options=sim_options),
        surprise.KNNWithMeans(sim_options=sim_options),
        surprise.KNNWithZScore(sim_options=sim_options),
        surprise.KNNBaseline(sim_options=sim_options),
        surprise.NormalPredictor(),
        surprise.BaselineOnly()
]:
    # Get string of algname for naming a pickle file a useful name:
    # each step drops everything up to and including the next '.' of the
    # object's default string representation.
    alg_name = str(algorithm)
    alg_name = alg_name[alg_name.find('.') + 1:]
    alg_name = alg_name[alg_name.find('.') + 1:]
    alg_name = alg_name[alg_name.find('.') + 1:]
epochs=2, validation_split=0.1, shuffle=True) y_pred = model.predict([df_hybrid_test['User'], df_hybrid_test['Movie'], test_tfidf]) y_true = df_hybrid_test['Rating'].values rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true)) print('\n\nTesting Result With Keras Hybrid Deep Learning: {:.4f} RMSE'.format(rmse)) # Load dataset into surprise specific data-structure data = sp.Dataset.load_from_df(df_filterd[['User', 'Movie', 'Rating']].sample(20000), sp.Reader()) benchmark = [] # Iterate over all algorithms for algorithm in [sp.SVD(), sp.SVDpp(), sp.SlopeOne(), sp.NMF(), sp.NormalPredictor(), sp.KNNBaseline(), sp.KNNBasic(), sp.KNNWithMeans(), sp.KNNWithZScore(), sp.BaselineOnly(), sp.CoClustering()]: # Perform cross validation results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=3, verbose=False) # Get results & append algorithm name tmp = pd.DataFrame.from_dict(results).mean(axis=0) tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm'])) # Store data benchmark.append(tmp) # Store results surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse', ascending=False) # Get data data = surprise_results[['test_rmse', 'test_mae']]
cross_validate(algo, data)['test_mae'].mean() # In[50]: sim_options = {'name': 'cosine'} algo = surprise.KNNBasic(sim_options=sim_options) cross_validate(algo, data)['test_mae'].mean() # In[51]: algo = surprise.SVD(n_factors=100) cross_validate(algo, data)['test_mae'].mean() # In[52]: algo = surprise.NMF(n_factors=100) cross_validate(algo, data)['test_mae'].mean() # In[55]: from sklearn.neighbors import KNeighborsClassifier from sklearn.model_selection import train_test_split gc = pd.read_csv('C:/Users/USER/Desktop/test/GermanCredit.csv') gc.head() X = gc.iloc[:, 1:31] y = gc['RESPONSE'] X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
def main(train_df, target_df, cache_name="test", force_recompute=()):
    """Train multiple models on train_df and predicts target_df

    Predictions are cached. If the indices don't match the indices of
    target_df, the cache is discarded. By default, if a method was already
    computed it is not recomputed again (except if the method name is listed
    in force_recompute). cache_name is the name to use to read and write the
    cache.

    Arguments:
        train_df {dataframe} -- Training dataframe
        target_df {dataframe} -- Testing dataframe; its "Id" column lists
            the IDs to predict

    Keyword Arguments:
        cache_name {str} -- Name to use for caching (default: {"test"})
        force_recompute {iterable} -- Name(s) of methods to recompute,
            whether or not it was already computed. Useful to only recompute
            single methods without discarding the rest. (default: {()})

    Returns:
        Dataframe -- Dataframe with predictions for each methods as columns,
            IDs as indices
    """
    # parallelize_predictions reads the fitted model through this global.
    global algo_in_use

    cache_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "cache")
    CACHED_DF_FILENAME = cache_dir + \
        "/cached_predictions_{}.pkl".format(cache_name)

    train_df = preprocess_df(train_df)
    trainset = pandas_to_data(train_df)
    ids_to_predict = target_df["Id"].to_list()

    # try to retrieve backup dataframe
    all_algos_preds_df = None
    try:
        print("Retrieving cached predictions")
        cached_df = pd.read_pickle(CACHED_DF_FILENAME)
    except FileNotFoundError:
        cached_df = None
    if cached_df is not None:
        print("Ensuring cached IDs match given IDs")
        # Explicit comparison instead of `assert`, which is stripped when
        # Python runs with -O and would then accept a stale cache.
        if sorted(ids_to_predict) == sorted(cached_df.index.values):
            print("Indices match, continuing")
            all_algos_preds_df = cached_df
    if all_algos_preds_df is None:
        print("No valid cached predictions found")
        all_algos_preds_df = pd.DataFrame(ids_to_predict, columns=["Id"])
        all_algos_preds_df.set_index("Id", inplace=True)

    all_algos = {
        "SVD": spr.SVD(n_factors=200, n_epochs=100),
        "Baseline": spr.BaselineOnly(),
        "NMF": spr.NMF(n_factors=30, n_epochs=100),
        "Slope One": spr.SlopeOne(),
        "KNN Basic": spr.KNNBasic(k=60),
        "KNN Means": spr.KNNWithMeans(k=60),
        "KNN Baseline": spr.KNNBaseline(),
        "KNN Zscore": spr.KNNWithZScore(k=60),
        "SVD ++": spr.SVDpp(n_factors=40, n_epochs=100),
        "Co Clustering": spr.CoClustering()
    }

    # The cache folder may not exist on a fresh checkout.
    os.makedirs(cache_dir, exist_ok=True)

    for name, algo in all_algos.items():
        print("##### {} ####".format(name))
        if name in force_recompute and name in all_algos_preds_df.columns:
            all_algos_preds_df.drop(name, axis=1, inplace=True)
        if name in all_algos_preds_df.columns:
            print("Already computed {}, skipping".format(name))
            continue

        # NOTE(review): the one-second pauses around fit() are kept from the
        # original; their purpose is not visible here.
        time.sleep(1)
        algo.fit(trainset)
        time.sleep(1)
        algo_in_use = algo

        print("Generating predictions...")
        predictions = parallelize_predictions(ids_to_predict, 80)
        print("Done. Merging with previous results")
        this_algo_preds_df = pd.DataFrame(predictions, columns=["Id", name])
        this_algo_preds_df.set_index("Id", inplace=True)
        all_algos_preds_df = pd.merge(all_algos_preds_df,
                                      this_algo_preds_df,
                                      left_index=True,
                                      right_index=True)
        # Persist after every model so an interrupted run keeps its progress.
        all_algos_preds_df.to_pickle(CACHED_DF_FILENAME)

    print("DONE computing surprize")
    return all_algos_preds_df
# defining the number of folds = 5 print("Performing splits...") kf = sp.model_selection.KFold(n_splits=5, random_state=0) print("Done.") ### ### PART 1.1 ### ''' application of all algorithms for recommendation made available by “Surprise” libraries, according to their default configuration. ''' algorithms = [sp.NormalPredictor(), sp.BaselineOnly(), sp.KNNBasic(),\ sp.KNNWithMeans(), sp.KNNWithZScore(), sp.KNNBaseline(),\ sp.SVD(), sp.SVDpp(), sp.NMF(), sp.SlopeOne(), sp.CoClustering()] for elem in algorithms: start_time = time.time() algo = elem sp.model_selection.cross_validate(algo, data, measures=['RMSE'], \ cv=kf, n_jobs = 2, verbose=True) print("--- %s seconds ---" % (time.time() - start_time)) print() ### ### PART 1.2 ### ''' Improvement of the quality of both KNNBaseline and SVD methods, by performing hyper-parameters tuning over 5-folds Random-Search-Cross-Validation - KNN