def test_grid_search():
    """GridSearchCV over alpha should select the best-scoring ExplicitMF."""
    splitter = ShuffleSplit(n_iter=5, random_state=0)
    estimator = ExplicitMF(n_components=3, max_iter=10, random_state=0)
    grid = {"alpha": [0.1, 1.0, 10]}

    search = GridSearchCV(estimator, grid, splitter)
    search.fit(X)

    # The smallest alpha is expected to win on this data.
    assert_equal(search.best_estimator_.alpha, 0.1)
    assert_equal(search.best_params_, {"alpha": 0.1})

    # Refitting with the winning alpha must reproduce the search's score.
    refit = ExplicitMF(alpha=0.1, n_components=3, max_iter=10, random_state=0)
    refit.fit(X)
    assert_almost_equal(refit.score(X), search.score(X))
def test_matrix_fact_cd():
    """ExplicitMF reconstruction must agree with predict() and score()."""
    # Build a small low-rank dense matrix as toy data.
    rng = np.random.RandomState(0)
    left = rng.rand(50, 3)
    right = rng.rand(3, 20)
    data = np.dot(left, right)

    model = ExplicitMF(n_components=3, max_iter=10, alpha=1e-3, random_state=0,
                       verbose=0)
    model.fit(data)

    # predict() should return exactly the P_.Q_ reconstruction.
    reconstruction = np.dot(model.P_, model.Q_)
    predicted = model.predict(data).toarray()
    assert_array_almost_equal(reconstruction, predicted)

    # score() should report the RMSE of that reconstruction.
    expected_rmse = np.sqrt(np.mean((data - reconstruction) ** 2))
    assert_almost_equal(expected_rmse, model.score(data))
def test_cross_val_score():
    """cross_val_score should return one score per CV split."""
    # Build a small low-rank dense matrix as toy data.
    rng = np.random.RandomState(0)
    left = rng.rand(50, 3)
    right = rng.rand(3, 20)
    data = np.dot(left, right)

    splitter = ShuffleSplit(n_iter=10)
    model = ExplicitMF(n_components=3, max_iter=10, alpha=1e-3, random_state=0,
                       verbose=0)
    fold_scores = cross_val_score(model, data, splitter)
    assert_equal(len(fold_scores), splitter.n_iter)
def test_matrix_fact_cd():
    """Fitting ExplicitMF: predict() equals P_.Q_ and score() equals its RMSE."""
    # Toy data: product of two random factors gives an exactly rank-3 matrix.
    rng = np.random.RandomState(0)
    factors = (rng.rand(50, 3), rng.rand(3, 20))
    ratings = np.dot(*factors)

    solver = ExplicitMF(n_components=3, max_iter=10, alpha=1e-3, random_state=0,
                        verbose=0)
    solver.fit(ratings)

    dense_pred = np.dot(solver.P_, solver.Q_)
    sparse_pred = solver.predict(ratings).toarray()
    assert_array_almost_equal(dense_pred, sparse_pred)

    residual = ratings - dense_pred
    assert_almost_equal(np.sqrt(np.mean(residual ** 2)), solver.score(ratings))
# NOTE(review): flattened chunk. It begins with the tail of an unseen callback
# method (`self.times.append(...)` — presumably recording wall-clock minus time
# spent scoring; the enclosing `def` is outside this view, so it is left
# untouched), followed by a benchmark script: load a MovieLens matrix (version
# from argv, default "100k"), fit ExplicitMF on a 75% train split with a
# progress callback, then plot objective and RMSE against CPU time on log-x
# axes. Uses Python 2 syntax (`print X.shape`, bare `except:`) — TODO confirm
# target interpreter before modernizing.
self.times.append(time.clock() - self.start_time - self.test_time) try: version = sys.argv[1] except: version = "100k" X = load_movielens(version) print X.shape X_tr, X_te = train_test_split(X, train_size=0.75, random_state=0) X_tr = X_tr.tocsr() X_te = X_te.tocsr() cb = Callback(X_tr, X_te) mf = ExplicitMF(n_components=30, max_iter=50, alpha=0.1, verbose=1, callback=cb) mf.fit(X_tr) plt.figure() plt.plot(cb.times, cb.obj) plt.xlabel("CPU time") plt.xscale("log") plt.ylabel("Objective value") plt.figure() plt.plot(cb.times, cb.rmse) plt.xlabel("CPU time") plt.xscale("log") plt.ylabel("RMSE") plt.show()
def main(version='100k', n_jobs=1, random_state=0, cross_val=False):
    """Benchmark matrix-factorization solvers on a MovieLens/Netflix split.

    Parameters
    ----------
    version : str, one of '100k', '1m', '10m', 'netflix'
        Dataset to benchmark on.
    n_jobs : int
        Number of parallel workers for the alpha grid search.
    random_state : int
        Seed for the train/test split.
    cross_val : bool
        When True, select alpha by cross-validation on a grid instead of
        the hard-coded per-dataset values.
    """
    # Hand-tuned hyper-parameters for the online dictionary-learning solver.
    dl_params = {}
    dl_params['100k'] = dict(learning_rate=1, batch_size=10, offset=0, alpha=1)
    dl_params['1m'] = dict(learning_rate=.75, batch_size=60, offset=0, alpha=.8)
    dl_params['10m'] = dict(learning_rate=.75, batch_size=600, offset=0,
                            alpha=3)
    dl_params['netflix'] = dict(learning_rate=.8, batch_size=4000, offset=0,
                                alpha=0.16)
    # Hand-tuned regularization for the coordinate-descent solver.
    cd_params = {'100k': dict(alpha=.1), '1m': dict(alpha=.03),
                 '10m': dict(alpha=.04), 'netflix': dict(alpha=.1)}

    if version in ['100k', '1m', '10m']:
        X = load_movielens(version)
        X_tr, X_te = train_test_split(X, train_size=0.75,
                                      random_state=random_state)
        X_tr = X_tr.tocsr()
        X_te = X_te.tocsr()
    # FIX: was `version is 'netflix'` — identity comparison against a str
    # literal is implementation-defined; use equality.
    elif version == 'netflix':
        X_tr = load(expanduser('~/spira_data/nf_prize/X_tr.pkl'))
        X_te = load(expanduser('~/spira_data/nf_prize/X_te.pkl'))

    cd_mf = ExplicitMF(n_components=60, max_iter=50, alpha=.1, normalize=True,
                       verbose=1)
    dl_mf = DictMF(n_components=30, n_epochs=20, alpha=1.17, verbose=5,
                   batch_size=10000, normalize=True, fit_intercept=True,
                   random_state=0, learning_rate=.75, impute=False,
                   partial=False, backend='python')
    dl_mf_partial = DictMF(n_components=60, n_epochs=20, alpha=1.17, verbose=5,
                           batch_size=10000, normalize=True, fit_intercept=True,
                           random_state=0, learning_rate=.75, impute=False,
                           partial=True, backend='python')

    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    subdir = 'benches_ncv' if cross_val else 'benches'
    output_dir = expanduser(join('~/output/recommender/', timestamp, subdir))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    alphas = np.logspace(-2, 1, 10)
    mf_list = [dl_mf_partial]
    dict_id = {cd_mf: 'cd', dl_mf: 'dl', dl_mf_partial: 'dl_partial'}
    names = {'cd': 'Coordinate descent',
             'dl': 'Proposed online masked MF',
             'dl_partial': 'Proposed algorithm (with partial projection)'}

    # Resume from an existing results file if a previous run wrote one.
    results_file = join(output_dir,
                        'results_%s_%s.json' % (version, random_state))
    if os.path.exists(results_file):
        with open(results_file, 'r') as f:
            results = json.load(f)
    else:
        results = {}

    for mf in mf_list:
        results[dict_id[mf]] = {}
        if not cross_val:
            # Use the hard-coded, pre-tuned hyper-parameters.
            if isinstance(mf, DictMF):
                mf.set_params(
                    learning_rate=dl_params[version]['learning_rate'],
                    batch_size=dl_params[version]['batch_size'],
                    alpha=dl_params[version]['alpha'])
            else:
                mf.set_params(alpha=cd_params[version]['alpha'])
        else:
            # Select alpha by grid search, parallelized over candidates.
            if isinstance(mf, DictMF):
                mf.set_params(
                    learning_rate=dl_params[version]['learning_rate'],
                    batch_size=dl_params[version]['batch_size'])
            if version != 'netflix':
                cv = ShuffleSplit(n_iter=3, train_size=0.66, random_state=0)
                mf_scores = Parallel(n_jobs=n_jobs, verbose=10)(
                    delayed(single_fit)(mf, alpha, X_tr, cv)
                    for alpha in alphas)
            else:
                # Netflix is too large for nested CV: score each alpha on
                # the fixed held-out split instead.
                mf_scores = Parallel(n_jobs=n_jobs, verbose=10)(
                    delayed(single_fit)(mf, alpha, X_tr, X_te, nested=False)
                    for alpha in alphas)
            mf_scores = np.array(mf_scores).mean(axis=1)
            best_alpha_arg = mf_scores.argmin()  # lower score (RMSE) wins
            best_alpha = alphas[best_alpha_arg]
            mf.set_params(alpha=best_alpha)

        # Final fit, tracing time/RMSE through the callback.
        cb = Callback(X_tr, X_te, refit=False)
        mf.set_params(callback=cb)
        mf.fit(X_tr)
        results[dict_id[mf]] = dict(name=names[dict_id[mf]], time=cb.times,
                                    rmse=cb.rmse)
        if cross_val:
            results[dict_id[mf]]['alphas'] = alphas.tolist()
            results[dict_id[mf]]['cv_alpha'] = mf_scores.tolist()
            results[dict_id[mf]]['best_alpha'] = mf.alpha
        # Persist after each model so partial results survive a crash.
        with open(results_file, 'w+') as f:
            json.dump(results, f)
    print('Done')
# Sweep alpha for ExplicitMF by cross-validation and plot against two
# Dummy baselines (user-mean and movie-mean predictors).

# Dataset version may be passed as the first CLI argument; default to 100k.
try:
    version = sys.argv[1]
except IndexError:  # FIX: bare `except:` also swallowed SystemExit/KeyboardInterrupt
    version = "100k"

X = load_movielens(version)
print(X.shape)  # FIX: Python 2 print statement

alphas = np.logspace(-3, 0, 10)
mf_scores = []

cv = ShuffleSplit(n_iter=3, train_size=0.75, random_state=0)

for alpha in alphas:
    mf = ExplicitMF(n_components=30, max_iter=10, alpha=alpha)
    mf_scores.append(cross_val_score(mf, X, cv))

# Array of size n_alphas x n_folds.
mf_scores = np.array(mf_scores)

dummy = Dummy()
dummy_scores = cross_val_score(dummy, X, cv)

dummy = Dummy(axis=0)
dummy_scores2 = cross_val_score(dummy, X, cv)

plt.figure()
plt.plot(alphas, mf_scores.mean(axis=1), label="Matrix Factorization")
plt.plot(alphas, [dummy_scores.mean()] * len(alphas), label="User mean")
plt.plot(alphas, [dummy_scores2.mean()] * len(alphas), label="Movie mean")
import sys
import time

from spira.datasets import load_movielens
from spira.cross_validation import train_test_split
from spira.completion import ExplicitMF

# Time a single ExplicitMF fit on a MovieLens train split and report
# held-out RMSE. Dataset version comes from argv, defaulting to 100k.
try:
    version = sys.argv[1]
except IndexError:  # FIX: bare `except:` also swallowed SystemExit/KeyboardInterrupt
    version = "100k"

X = load_movielens(version)
print(X.shape)

X_tr, X_te = train_test_split(X, train_size=0.75, random_state=0)

start = time.time()
mf = ExplicitMF(n_components=30, max_iter=10, alpha=1e-1, random_state=0,
                verbose=1)
mf.fit(X_tr)
print("Time", time.time() - start)
print("RMSE", mf.score(X_te))
import sys
import time

from spira.datasets import load_movielens
from spira.cross_validation import train_test_split
from spira.completion import ExplicitMF

# Time a single ExplicitMF fit on a MovieLens train split and report
# held-out RMSE. Dataset version comes from argv, defaulting to 100k.
try:
    version = sys.argv[1]
except IndexError:  # FIX: bare `except:` also swallowed SystemExit/KeyboardInterrupt
    version = "100k"

X = load_movielens(version)
# FIX: Python 2 print statements converted to the print() function, matching
# the sibling timing script elsewhere in this codebase.
print(X.shape)

X_tr, X_te = train_test_split(X, train_size=0.75, random_state=0)

start = time.time()
mf = ExplicitMF(n_components=30, max_iter=10, alpha=1e-1, random_state=0,
                verbose=1)
mf.fit(X_tr)
print("Time", time.time() - start)
print("RMSE", mf.score(X_te))
# Fit ExplicitMF with a progress callback and plot objective value and RMSE
# against CPU time (log-x). Dataset version from argv, defaulting to 100k.
try:
    version = sys.argv[1]
except IndexError:  # FIX: bare `except:` also swallowed SystemExit/KeyboardInterrupt
    version = "100k"

X = load_movielens(version)
print(X.shape)  # FIX: Python 2 print statement

X_tr, X_te = train_test_split(X, train_size=0.75, random_state=0)
X_tr = X_tr.tocsr()
X_te = X_te.tocsr()

cb = Callback(X_tr, X_te)
mf = ExplicitMF(n_components=30, max_iter=50, alpha=0.1, verbose=1,
                callback=cb)
mf.fit(X_tr)

plt.figure()
plt.plot(cb.times, cb.obj)
plt.xlabel("CPU time")
plt.xscale("log")
plt.ylabel("Objective value")

plt.figure()
plt.plot(cb.times, cb.rmse)
plt.xlabel("CPU time")
plt.xscale("log")
plt.ylabel("RMSE")
# NOTE(review): flattened chunk, incomplete at BOTH edges. It begins with the
# tail of an unseen callback method (computes held-out RMSE from predicted vs
# test ratings, appends it, and books test time separately so recorded times
# exclude scoring — the enclosing `def` is outside this view) and ends with
# `for mf in [dl_mf]:` whose loop body is missing. Between the two: load
# pickled Netflix train/test matrices and construct ExplicitMF (CD) and DictMF
# (online, C backend) solvers. Left byte-identical; do not reformat until the
# surrounding class/loop body is in view.
rmse = np.sqrt(np.mean((X_pred.data - self.X_te.data)**2)) print(rmse) self.rmse.append(rmse) self.test_time += time.clock() - test_time self.times.append(time.clock() - self.start_time - self.test_time) X_tr = load(expanduser('~/spira_data/nf_prize/X_tr.pkl')) X_te = load(expanduser('~/spira_data/nf_prize/X_te.pkl')) # X_te = X_te.T.tocsr() cb = {} cd_mf = ExplicitMF( n_components=30, max_iter=50, alpha=0.1, verbose=1, ) dl_mf = DictMF(n_components=30, n_epochs=5, alpha=.3, verbose=10, batch_size=10000, normalize=True, impute=False, fit_intercept=True, random_state=0, learning_rate=.75, backend='c') for mf in [dl_mf]: