def runFM(self, X_train, y_train, X_test, y_test):
    """Fit an MCMC FM regressor incrementally (warm start) and compare it
    against retraining from scratch at every iteration count.

    Relies on module-level globals ``n_iter``, ``rank``, ``std_dev``,
    ``seed`` and ``step_size`` — TODO confirm they are defined at import
    time.

    :param X_train: training design matrix (sparse)
    :param y_train: training targets
    :param X_test: test design matrix
    :param y_test: test targets
    :return: tuple ``(rmse_test, hyper_param, rmse_test_re, hyper_param_re)``
    """
    # FIX: converted Python-2-only print statements to print() calls
    # (single argument, so behavior is identical on both interpreters)
    # and removed large blocks of dead commented-out code.
    print("params => iter: {} - rank: {} - std-dev: {} - seed: {}".format(
        n_iter, rank, std_dev, seed))
    start_time = time.time()

    # n_iter=0 only allocates and initializes the coefficients; actual
    # sampling happens below, one `step_size` chunk at a time.
    self.fm_wu = mcmc.FMRegression(n_iter=0, rank=rank, random_state=seed)
    self.fm_wu.fit_predict(X_train, y_train, X_test)

    rmse_test = []
    # hyper_param_ carries 3 global hyper-parameters plus 2 per factor.
    hyper_param = np.zeros((n_iter - 1, 3 + 2 * rank), dtype=np.float64)

    for nr, i in enumerate(range(1, n_iter)):
        # Vary the random state per step so the chain keeps moving.
        self.fm_wu.random_state = i * seed
        y_pred = self.fm_wu.fit_predict(X_train, y_train, X_test,
                                        n_more_iter=step_size)
        rmse_test.append(np.sqrt(mean_squared_error(y_pred, y_test)))
        hyper_param[nr, :] = self.fm_wu.hyper_param_

    print('------- restart ----------')

    # Baseline: retrain from scratch for each iteration count.
    values = np.arange(1, n_iter)
    rmse_test_re = []
    hyper_param_re = np.zeros((len(values), 3 + 2 * rank), dtype=np.float64)
    for nr, i in enumerate(values):
        self.fm = mcmc.FMRegression(n_iter=i, rank=rank, random_state=seed)
        y_pred = self.fm.fit_predict(X_train, y_train, X_test)
        rmse_test_re.append(np.sqrt(mean_squared_error(y_pred, y_test)))
        hyper_param_re[nr, :] = self.fm.hyper_param_

    print("Process finished in {} seconds".format(time.time() - start_time))
    print("Min RMSE on warmup model: {}".format(rmse_test[-1]))
    print("Min RMSE on retrained model: {}".format(rmse_test_re[-1]))
    return rmse_test, hyper_param, rmse_test_re, hyper_param_re
def fm(X_train, X_dev_test, y_train, y_dev_test, X_test, y_test):
    """Fit MCMC FM regressors on raw and on standardized targets and print
    the test RMSE of each run.

    :param X_train: training design matrix
    :param X_dev_test: development-test design matrix (unused here)
    :param y_train: training targets
    :param y_dev_test: development-test targets (unused here)
    :param X_test: test design matrix
    :param y_test: test targets
    """
    seed = 333

    # Raw targets.
    fm = mcmc.FMRegression(n_iter=300, rank=32, random_state=seed)
    y_pred = fm.fit_predict(X_train, y_train, X_test)
    # BUG FIX: this RMSE was computed but silently discarded; print it so
    # the raw-target run can be compared against the normalized run below.
    print(np.sqrt(mean_squared_error(y_pred, y_test)))

    # Standardized targets; predictions are mapped back before scoring.
    scaler = StandardScaler()
    y_train_norm = scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
    fm = mcmc.FMRegression(n_iter=300, rank=32, random_state=seed)
    y_pred = fm.fit_predict(X_train, y_train_norm, X_test)
    print(np.sqrt(mean_squared_error(scaler.inverse_transform(y_pred),
                                     y_test)))
def predict_fastfm(self):
    """Predict ratings for ``self.records_to_predict`` with fastFM.

    Builds a one-hot encoded design matrix from the union of training and
    prediction records and dispatches to the solver named by
    ``Constants.FASTFM_METHOD`` ('mcmc', 'als' or 'sgd'); the result is
    stored in ``self.predictions``.
    """
    if Constants.USE_CONTEXT:
        # Attach the context topic vector to every record to predict.
        for record in self.records_to_predict:
            review_id = record[Constants.REVIEW_ID_FIELD]
            record[Constants.CONTEXT_TOPICS_FIELD] = \
                self.context_topics_map[review_id]

    all_records = self.train_records + self.records_to_predict
    x_matrix, y_vector = fastfm_recommender.records_to_matrix(
        all_records, self.context_rich_topics)

    # Fit the encoder on the union so train and test share one feature
    # space; the first two columns are the categorical ids.
    encoder = OneHotEncoder(categorical_features=[0, 1], sparse=True)
    encoder.fit(x_matrix)

    num_train = len(self.train_records)
    x_train = encoder.transform(x_matrix[:num_train])
    y_train = y_vector[:num_train]
    x_test = encoder.transform(x_matrix[num_train:])

    if Constants.FASTFM_METHOD == 'mcmc':
        # MCMC has no separate fit/predict split.
        solver = mcmc.FMRegression(rank=Constants.FM_NUM_FACTORS)
        self.predictions = solver.fit_predict(x_train, y_train, x_test)
    elif Constants.FASTFM_METHOD == 'als':
        solver = als.FMRegression(rank=Constants.FM_NUM_FACTORS)
        solver.fit(x_train, y_train)
        self.predictions = solver.predict(x_test)
    elif Constants.FASTFM_METHOD == 'sgd':
        solver = sgd.FMRegression(rank=Constants.FM_NUM_FACTORS)
        solver.fit(x_train, y_train)
        self.predictions = solver.predict(x_test)
def test_fm_regression():
    """MCMC FM regression should nearly perfectly fit its own training data
    on the synthetic test problem."""
    w0, w, V, y, X = get_test_problem()

    fm = mcmc.FMRegression(n_iter=1000, rank=2, init_stdev=0.1)
    y_pred = fm.fit_predict(X, y, X)
    # BUG FIX: r2_score expects (y_true, y_pred); the arguments were
    # swapped. R^2 is not symmetric in its arguments.
    assert metrics.r2_score(y, y_pred) > 0.99
def test_find_init_stdev():
    """``mcmc.find_init_stdev`` should prefer a small init stdev over a huge
    one, and its validation-set variant should agree on the best value."""
    X, y, coef = make_user_item_regression(label_stdev=.5)
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=44)
    X_train = sp.csc_matrix(X_train)
    X_test = sp.csc_matrix(X_test)

    model = mcmc.FMRegression(n_iter=10, rank=5)
    candidate_stdevs = [0.2, 0.5, 1.0]

    best_init_stdev, mse = mcmc.find_init_stdev(
        model, X_train, y_train, stdev_range=candidate_stdevs)
    # A deliberately bad (huge) stdev for comparison.
    best_init_stdev_bad, _ = mcmc.find_init_stdev(
        model, X_train, y_train, stdev_range=[5.])
    print('--' * 30)
    # Same search, scored on a held-out validation set.
    best_init_stdev_vali, mse_vali = mcmc.find_init_stdev(
        model, X_train, y_train, X_test, y_test,
        stdev_range=candidate_stdevs)

    assert best_init_stdev < best_init_stdev_bad
    assert best_init_stdev_vali == best_init_stdev
    assert mse_vali > mse
def test_clone():
    """sklearn.base.clone must reproduce the parameters of both FM
    estimator types."""
    from sklearn.base import clone
    for estimator in (mcmc.FMRegression(), mcmc.FMClassification()):
        duplicate = clone(estimator)
        assert estimator.get_params() == duplicate.get_params()
def test_mcmc_warm_start():
    """Warm-starting 50+50 iterations must match a single 100-iteration run.

    FIX: imported ``train_test_split`` from ``sklearn.cross_validation``,
    which was removed in scikit-learn 0.20; use ``sklearn.model_selection``
    as the sibling test in this file already does.
    """
    X, y, coef = make_user_item_regression(label_stdev=0)
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=44)
    X_train = sp.csc_matrix(X_train)
    X_test = sp.csc_matrix(X_test)

    # Reference: one uninterrupted 100-iteration run.
    fm = mcmc.FMRegression(n_iter=100, rank=2)
    y_pred = fm.fit_predict(X_train, y_train, X_test)
    error_10_iter = mean_squared_error(y_pred, y_test)

    # 50 iterations, then 50 more via warm start.
    fm = mcmc.FMRegression(n_iter=50, rank=2)
    y_pred = fm.fit_predict(X_train, y_train, X_test)
    error_5_iter = mean_squared_error(y_pred, y_test)

    y_pred = fm.fit_predict(X_train, y_train, X_test, n_more_iter=50)
    error_5_iter_plus_5 = mean_squared_error(y_pred, y_test)

    print(error_5_iter, error_5_iter_plus_5, error_10_iter)
    print(fm.hyper_param_)
    assert_almost_equal(error_10_iter, error_5_iter_plus_5, decimal=2)
def fm_rank(X_train, X_dev_test, y_train, y_dev_test):
    """Sweep the FM rank and plot the dev-test RMSE for each value."""
    n_iter = 100
    seed = 333
    ranks = [4, 8, 16, 32, 64]
    rmse_test = []

    for rank in ranks:
        model = mcmc.FMRegression(n_iter=n_iter, rank=rank,
                                  random_state=seed)
        predictions = model.fit_predict(X_train, y_train, X_dev_test)
        rmse = np.sqrt(mean_squared_error(predictions, y_dev_test))
        rmse_test.append(rmse)
        print("rank:{}\trmse:{:.3f}".format(rank, rmse))

    plt.plot(ranks, rmse_test, label="dev test rmse", color="r")
    plt.legend()
    plt.show()
def fm_candidate_columns():
    """Compare feature-column subsets (A-E) of the MovieLens data by the
    test RMSE of an MCMC FM regressor, then plot the result as a bar chart."""
    lens, _, _, _ = lens_data.get_lens_data()
    lens['user_id'] = lens['user_id'].astype(str)
    lens['movie_id'] = lens['movie_id'].astype(str)
    # Derive year features from the date strings.
    lens['year'] = lens['date'].apply(str).str.split('-').str.get(0)
    lens['release_year'] = lens['release_date'].apply(str).str.split(
        '-').str.get(2)

    candidate_columns = [
        ['user_id', 'movie_id', 'release_year', 'age', 'sex', 'year',
         'rating'],                                               # A
        ['user_id', 'movie_id', 'age', 'sex', 'year', 'rating'],  # B
        ['user_id', 'movie_id', 'sex', 'year', 'rating'],         # C
        ['user_id', 'movie_id', 'age', 'sex', 'rating'],          # D
        ['user_id', 'movie_id', 'rating'],                        # E
    ]

    n_iter = 500
    seed = 123
    rank = 8
    rmse_test = []

    for columns in candidate_columns:
        subset = lens[columns].dropna()
        vectorizer = DictVectorizer()
        features = vectorizer.fit_transform(
            list(subset.drop('rating', axis=1).T.to_dict().values()))
        targets = subset['rating'].tolist()
        X_tr, X_te, y_tr, y_te = train_test_split(
            features, targets, test_size=0.1, random_state=42)

        # Standardize targets; predictions are inverse-transformed below.
        scaler = StandardScaler()
        y_tr_norm = scaler.fit_transform(
            np.array(y_tr).reshape(-1, 1)).ravel()

        model = mcmc.FMRegression(n_iter=n_iter, rank=rank,
                                  random_state=seed)
        # The first call allocates and initializes model/hyper-parameters.
        model.fit_predict(X_tr, y_tr_norm, X_te)
        predictions = model.fit_predict(X_tr, y_tr_norm, X_te)
        rmse_test.append(
            np.sqrt(mean_squared_error(
                scaler.inverse_transform(predictions.reshape(-1, 1)),
                y_te)))

    print(rmse_test)

    # Plot one RMSE bar per candidate column set.
    positions = np.arange(len(rmse_test))
    plt.bar(positions, height=rmse_test)
    plt.xticks(positions, ('A', 'B', 'C', 'D', 'E'))
    plt.ylim((0.88, 0.90))
    plt.show()
def fm_n_iter(X_train, X_dev_test, y_train, y_dev_test): """ fastFMで機械学習を行う :param X_train: 訓練データ :param X_dev_test: 検証データ :param y_train: 訓練データの評価値(ラベル?) :param y_dev_test: 検証データの評価値(ラベル?) :return: """ # fastFMに指定するパラメタ n_iter = 300 step_size = 1 seed = 123 rank = 4 # MCMCのFMモデルを初期化 fm = mcmc.FMRegression(n_iter=0, rank=rank, random_state=seed) fm.fit_predict(X_train, y_train, X_dev_test) # aaa rmse_dev_test = [] rmse_test = [] hyper_param = np.zeros((n_iter - 1, 3 + 2 * rank), dtype=np.float64) # TODO イテレーション回数を変化させて、予測結果の性能とハイパーパラメタを得る for nr, i in enumerate(range(1, n_iter)): fm.random_state = i * seed # (MCMCで)パラメタフィッティングおよび予測を行う y_pred = fm.fit_predict(X_train, y_train, X_dev_test, n_more_iter=step_size) rmse_test.append(np.sqrt(mean_squared_error(y_pred, y_dev_test))) hyper_param[nr, :] = fm.hyper_param_ # 最初の5回は値が落ち着いていないので無視する values = np.arange(1, n_iter) x = values * step_size burn_in = 5 x = x[burn_in:] # RMSEとハイパーパラメータをプロットする fig, axes = plt.subplots(nrows=2, ncols=2, sharex=True, figsize=(15, 8)) axes[0, 0].plot(x, rmse_test[burn_in:], label='dev test rmse', color="r") axes[0, 0].legend() axes[0, 1].plot(x, hyper_param[burn_in:, 0], label='alpha', color="b") axes[0, 1].legend() axes[1, 0].plot(x, hyper_param[burn_in:, 1], label='lambda_w', color="g") axes[1, 0].legend() axes[1, 1].plot(x, hyper_param[burn_in:, 3], label='mu_w', color="g") axes[1, 1].legend() # 検証データのラベル値の標準偏差を出力。予測値の標準偏差がこの値より小さければ print("np.std(y_dev_test) = {}".format(np.std(y_dev_test))) plt.show()
def fastFMJob(data_path, params, N, vectorizer, solver):
    """Cross-validate a fastFM model over the four train/val folds and
    return the mean RMSE.

    ``solver`` selects 'mcmc', 'als' or 'sgd'; ``params`` supplies the
    matching hyper-parameters ('mi', 'init_stdev', 'f', plus the
    regularization/step-size keys the solver needs).
    """
    logging.info("Evaluando con params: {0}".format(params))
    fold_rmses = []
    for fold in range(1, 4 + 1):
        train_data, y_tr, _ = loadData(
            'train/train_N' + str(N) + '.' + str(fold),
            data_path=data_path, with_timestamps=False, with_authors=False)
        val_data, y_va, _ = loadData(
            'val/val_N' + str(N) + '.' + str(fold),
            data_path=data_path, with_timestamps=False, with_authors=False)
        X_tr = vectorizer.transform(train_data)
        X_va = vectorizer.transform(val_data)

        preds = None
        if solver == "mcmc":
            model = mcmc.FMRegression(n_iter=params['mi'],
                                      init_stdev=params['init_stdev'],
                                      rank=params['f'],
                                      random_state=123,
                                      copy_X=True)
            preds = model.fit_predict(X_tr, y_tr, X_va)
        elif solver == "als":
            model = als.FMRegression(n_iter=params['mi'],
                                     init_stdev=params['init_stdev'],
                                     rank=params['f'],
                                     random_state=123,
                                     l2_reg_w=params['l2_reg_w'],
                                     l2_reg_V=params['l2_reg_V'],
                                     l2_reg=params['l2_reg'])
            model.fit(X_tr, y_tr)
            preds = model.predict(X_va)
        elif solver == "sgd":
            model = sgd.FMRegression(n_iter=params['mi'],
                                     init_stdev=params['init_stdev'],
                                     rank=params['f'],
                                     random_state=123,
                                     l2_reg_w=params['l2_reg_w'],
                                     l2_reg_V=params['l2_reg_V'],
                                     l2_reg=params['l2_reg'],
                                     step_size=params['step_size'])
            model.fit(X_tr, y_tr)
            preds = model.predict(X_va)

        # Score only when a known solver produced predictions — matches
        # the original's behavior of doing nothing for other values.
        if preds is not None:
            rmse = sqrt(mean_squared_error(y_va, preds))
            logging.info("FM RMSE: {0}. Solver: {1}".format(rmse, solver))
            fold_rmses.append(rmse)
    return mean(fold_rmses)
def predict(train_records, test_records):
    """
    Makes a prediction for the testing set based on the topic probability
    vector of each record and the rating. The topic model is built using
    the training set.

    This function uses the FastFM Factorization Machines Module for Python.
    An MCMC fit is run and printed for comparison; the ALS predictions are
    what gets returned.

    :param train_records: the training set
    :param test_records: the testing set
    :return: a list with the predictions for the testing set
    """
    records = train_records + test_records
    context_rich_topics = [(i, 1) for i in range(num_topics)]
    new_matrix, new_y = records_to_matrix(records, context_rich_topics)
    print(new_matrix)

    # One-hot encode the two id columns, fitting on train + test so both
    # halves share the same feature space.
    encoder = OneHotEncoder(categorical_features=[0, 1], sparse=True)
    encoder.fit(new_matrix)

    num_train = len(train_records)
    x_train = encoder.transform(new_matrix[:num_train])
    y_train = new_y[:num_train]
    x_test = encoder.transform(new_matrix[num_train:])

    mc_regressor = mcmc.FMRegression()
    y_pred = mc_regressor.fit_predict(x_train, y_train, x_test)
    print('********')
    print(x_test.todense())
    print(y_pred)

    als_fm = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=2,
                              l2_reg_w=0.1, l2_reg_V=0.5)
    als_fm.fit(x_train, y_train)
    y_pred = als_fm.predict(x_test)
    print(y_pred)
    return y_pred
def fastFM_tuning(data_path, N, solver):
    """Greedy, coordinate-wise hyper-parameter search for fastFM.

    Each hyper-parameter is swept in turn (holding the others at their
    current best), scored with :func:`fastFMJob`, and pinned at the value
    with the optimal RMSE before moving on. The tuned model is then fit on
    the full training set, evaluated on the test set, and both the tuned
    parameters and the per-value RMSEs are written to disk.

    FIX: the original text was garbled by extraction — an assignment and
    two ``"FM RMSE: {0}. Solver: {1}"`` string literals were broken across
    line boundaries; reconstructed from the intact copies elsewhere in the
    file. The repeated sweep/save boilerplate is factored into local
    helpers, and ``[...] + list(range(...))`` keeps the rank grid valid on
    both Python 2 and 3.

    :param data_path: base directory of the data files
    :param N: dataset-size suffix used in the file names
    :param solver: 'mcmc', 'als' or 'sgd'
    :return: dict with the tuned hyper-parameter values
    """
    all_data, y_all, _ = loadData("eval_all_N" + str(N) + ".data",
                                  data_path=data_path,
                                  with_timestamps=False,
                                  with_authors=False)
    v = DictVectorizer()
    # Fitting the vectorizer on all data is the needed side effect here.
    X_all = v.fit_transform(all_data)

    # Starting defaults; the dicts differ only in which regularization /
    # step-size knobs the solver exposes.
    if solver == "mcmc":
        defaults = {'mi': 100, 'init_stdev': 0.1, 'f': 8}
    elif solver == "als":
        defaults = {'mi': 100, 'init_stdev': 0.1, 'f': 8,
                    'l2_reg_w': 0.1, 'l2_reg_V': 0.1, 'l2_reg': 0}
    elif solver == "sgd":
        defaults = {'mi': 100, 'init_stdev': 0.1, 'f': 8,
                    'l2_reg_w': 0.1, 'l2_reg_V': 0.1, 'l2_reg': 0,
                    'step_size': 0.1}

    results = dict((param, {}) for param in defaults.keys())

    def _sweep(param, grid):
        # Score every candidate value, then pin the best one.
        for value in grid:
            defaults[param] = value
            results[param][value] = fastFMJob(data_path=data_path,
                                              params=defaults, N=N,
                                              vectorizer=v, solver=solver)
        defaults[param] = opt_value(results=results[param], metric='rmse')

    # Parameters common to all solvers.
    _sweep('mi', [1, 5, 10, 20, 50, 100, 150, 200])
    _sweep('f', [1, 5, 8, 10] + list(range(20, 2020, 20)))
    _sweep('init_stdev',
           [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0])

    # Regularization (and SGD step size) only exist for ALS / SGD.
    if solver != "mcmc":
        reg_grid = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]
        _sweep('l2_reg_w', reg_grid)
        _sweep('l2_reg_V', reg_grid)
        _sweep('l2_reg', [0.0, 0.001, 0.003, 0.005, 0.01, 0.02, 0.03,
                          0.04, 0.05, 0.07, 0.08, 0.1])
        if solver == "sgd":
            _sweep('step_size', [0.001, 0.002, 0.003, 0.004, 0.005, 0.006,
                                 0.007, 0.008, 0.009, 0.01, 0.02, 0.03,
                                 0.04, 0.05, 0.06, 0.07, 0.08, 0.09,
                                 0.1, 0.5])

    # Real testing with the tuned parameters.
    train_data, y_tr, _ = loadData('eval_train_N' + str(N) + '.data',
                                   data_path=data_path,
                                   with_timestamps=False,
                                   with_authors=False)
    test_data, y_te, _ = loadData('test/test_N' + str(N) + '.data',
                                  data_path=data_path,
                                  with_timestamps=False,
                                  with_authors=False)
    X_tr = v.transform(train_data)
    X_te = v.transform(test_data)

    def _save(out_dir):
        # Persist tuned params + final RMSE, plus every per-value RMSE.
        with open(out_dir + 'opt_params.txt', 'w') as f:
            for param in defaults:
                f.write("{param}:{value}\n".format(param=param,
                                                   value=defaults[param]))
            f.write("RMSE:{rmse}".format(rmse=rmse))
        with open(out_dir + 'params_rmses.txt', 'w') as f:
            for param in results:
                for value in results[param]:
                    f.write("{param}={value}\t : {RMSE}\n".format(
                        param=param, value=value,
                        RMSE=results[param][value]))

    if solver == "mcmc":
        fm = mcmc.FMRegression(n_iter=defaults['mi'],
                               init_stdev=defaults['init_stdev'],
                               rank=defaults['f'], random_state=123,
                               copy_X=True)
        preds = fm.fit_predict(X_tr, y_tr, X_te)
        rmse = sqrt(mean_squared_error(y_te, preds))
        logging.info("FM RMSE: {0}. Solver: {1}".format(rmse, solver))
        _save('TwitterRatings/fastFM/mcmc/clean/')
    elif solver == "als":
        fm = als.FMRegression(n_iter=defaults['mi'],
                              init_stdev=defaults['init_stdev'],
                              rank=defaults['f'], random_state=123,
                              l2_reg_w=defaults['l2_reg_w'],
                              l2_reg_V=defaults['l2_reg_V'],
                              l2_reg=defaults['l2_reg'])
        fm.fit(X_tr, y_tr)
        preds = fm.predict(X_te)
        rmse = sqrt(mean_squared_error(y_te, preds))
        logging.info("FM RMSE: {0}. Solver: {1}".format(rmse, solver))
        _save('TwitterRatings/fastFM/als/clean/')
    elif solver == "sgd":
        fm = sgd.FMRegression(n_iter=defaults['mi'],
                              init_stdev=defaults['init_stdev'],
                              rank=defaults['f'], random_state=123,
                              l2_reg_w=defaults['l2_reg_w'],
                              l2_reg_V=defaults['l2_reg_V'],
                              l2_reg=defaults['l2_reg'],
                              step_size=defaults['step_size'])
        fm.fit(X_tr, y_tr)
        preds = fm.predict(X_te)
        rmse = sqrt(mean_squared_error(y_te, preds))
        logging.info("FM RMSE: {0}. Solver: {1}".format(rmse, solver))
        _save('TwitterRatings/fastFM/sgd/clean/')
    return defaults
# Warm-start experiment (script level). Assumes X_train/X_test/y_train/
# y_test and n_iter are defined earlier in the file — TODO confirm.
# FIX: converted the Python-2-only print statement to a print() call
# (single argument, identical output on both interpreters) and removed a
# block of dead commented-out code.
rank = 4
seed = 333
step_size = 1

# n_iter=0 only allocates and initializes the coefficients; training
# happens incrementally in the loop below.
fm = mcmc.FMRegression(n_iter=0, rank=rank, random_state=seed)
fm.fit_predict(X_train, y_train, X_test)

rmse_test = []
rmse_new = []
# 3 global hyper-parameters plus 2 per latent factor, one row per step.
hyper_param = np.zeros((n_iter - 1, 3 + 2 * rank), dtype=np.float64)

for nr, i in enumerate(range(1, n_iter)):
    # Vary the random state per step so the chain keeps moving.
    fm.random_state = i * seed
    y_pred = fm.fit_predict(X_train, y_train, X_test,
                            n_more_iter=step_size)
    rmse_test.append(np.sqrt(mean_squared_error(y_pred, y_test)))
    hyper_param[nr, :] = fm.hyper_param_

print('------- restart ----------')

# Baseline: retrain from scratch for each iteration count.
values = np.arange(1, n_iter)
rmse_test_re = []
from sklearn.metrics import mean_squared_error
from fastFM import mcmc
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

n_iter = 100
seed = 333
rmse_test = []

# Ranks to evaluate: 4, 8, 16, 32, 64.
ranks = [4, 8, 16, 32, 64]

# Train and predict for each rank; collect the dev-test RMSE.
for rank in ranks:
    # BUG FIX: the keyword was misspelled `n_ter`, which raises TypeError
    # (unexpected keyword argument) — it must be `n_iter`.
    fm = mcmc.FMRegression(n_iter=n_iter, rank=rank, random_state=seed)
    y_pred = fm.fit_predict(X_train, y_train, X_dev_test)
    rmse = np.sqrt(mean_squared_error(y_pred, y_dev_test))
    rmse_test.append(rmse)
    print('rank:{}\trmse:{:.3f}'.format(rank, rmse))

# Plot the RMSE for each rank.
plt.plot(ranks, rmse_test, label='dev test rmse', color="r")
plt.legend()
def _build_mcmc_model(param):
    """Construct an MCMC FM regressor from a parameter mapping.

    ``param`` must provide 'n_iter', 'init_stdev', 'rank' and
    'random_state'.
    """
    kwargs = {
        'n_iter': param['n_iter'],
        'init_stdev': param['init_stdev'],
        'rank': param['rank'],
        'random_state': param['random_state'],
    }
    return mcmc.FMRegression(**kwargs)
# Split the merged one-hot matrix back into its train/test halves.
# NOTE(review): train_hot/test_hot are only used by the commented-out
# lines below — the live code uses the X_train/X_test split instead.
train_hot = X_merge_hot[0:train.shape[0]]
test_hot = X_merge_hot[train.shape[0]:X_merge_hot.shape[0]]
# Vectorize the raw features and take the 'score' column as the target.
X = v.fit_transform(X_origin)
y = np.array(train.loc[:, ['score']]).flatten()
X_train, X_test, y_train, y_test = train_test_split(X, y)
print "start fit"
#fm = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=2, l2_reg_w=0.1, l2_reg_V=0.5)
# Gradient-boosting classifier as a comparison baseline.
xgbc = XGBClassifier()
xgbc.fit(X_train, y_train)
print 'The accuracy of eXtreme Gradient Boosting Classifier on testing set:', xgbc.score(
    X_test, y_test)
# MCMC factorization machine: fit on the training split and predict the
# held-out split in one call.
fm = mcmc.FMRegression(n_iter=100,
                       init_stdev=0.1,
                       rank=8,
                       random_state=123,
                       copy_X=True)
#y_pred = fm.fit_predict(train_hot, y,test_hot)
y_pred = fm.fit_predict(X_train, y_train, X_test)
# Persist the fitted FM model.
joblib.dump(fm, "fast_fm_model_mcmc.m")
print "start predict"
#y_pred = fm.predict(test_hot)
# Save predictions to CSV. NOTE(review): these are predictions for the
# random held-out split (X_test), not for test_hot — confirm intent.
df_fm = pd.DataFrame(y_pred, columns=['score'])
df_fm.to_csv("fast_fm_result_mcmc.csv", index=False)
print y_pred.shape
from sklearn.metrics import mean_squared_error
print 'mse:', mean_squared_error(y_test, y_pred)
# Root mean squared error on the held-out split.
R_real = math.sqrt(mean_squared_error(y_test, y_pred))
from sklearn.metrics import mean_squared_error from fastFM import mcmc import numpy as np from sklearn.feature_extraction import DictVectorizer from sklearn.model_selection import train_test_split fm = mcmc.FMRegression(n_iter=300, rank=32, random_state=seed) y_pred = fm.fit_predict(X_train, y_train, X_test) np.sqrt(mean_squared_error(y_pred, y_test)) from sklearn.preprocessing import StandardScaler scaler = StandardScaler() y_train_norm = scaler.fit_transform(y_train.reshape(-1, 1)).ravel() fm = mcmc.FMRegression(n_iter=300, rank=32, random_state=seed) y_pred = fm.fit_predict(X_train, y_train_norm, X_test) np.sqrt(mean_squared_error(scaler.inverse_transform(y_pred), y_test)) lens['user_id'] = lens['user_id'].astype(str) lens['movie_id'] = lens['movie_id'].astype(str) lens['year'] = lens['date'].apply(str).str.split('-').str.get(0) lens['release_year'] = lens['release_date'].apply(str).str.split('-').str.get(2) lens['year'] = lens['date'].apply(str).str.split('-').str.get(0) lens['release_year'] = lens['release_date'].apply(str).str.split('-').str.get(2) candidate_columns = [ ['user_id', 'movie_id', 'release_year', 'age', 'sex', 'year', 'rating'], #A ['user_id', 'movie_id', 'age', 'sex', 'year', 'rating'], #B ['user_id', 'movie_id', 'sex', 'year', 'rating'], #C ['user_id', 'movie_id', 'age', 'sex', 'rating'], #D