def test_als_warm_start():
    """Fitting 5 iterations and then 5 more must equal a single 10-iteration fit."""
    X, y, coef = make_user_item_regression(label_stdev=0)
    from sklearn.cross_validation import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)

    X_train = sp.csc_matrix(X_train)
    X_test = sp.csc_matrix(X_test)

    # Reference run: 10 ALS iterations in one shot.
    fm = als.FMRegression(n_iter=10, l2_reg_w=0, l2_reg_V=0, rank=2)
    fm.fit(X_train, y_train)
    error_10_iter = mean_squared_error(fm.predict(X_test), y_test)

    # Warm-start run: 5 iterations, then 5 additional ones on the same model.
    fm = als.FMRegression(n_iter=5, l2_reg_w=0, l2_reg_V=0, rank=2)
    fm.fit(X_train, y_train)
    print(fm.iter_count)
    error_5_iter = mean_squared_error(fm.predict(X_test), y_test)

    fm.fit(sp.csc_matrix(X_train), y_train, n_more_iter=5)
    print(fm.iter_count)
    error_5_iter_plus_5 = mean_squared_error(fm.predict(X_test), y_test)

    print(error_5_iter, error_5_iter_plus_5, error_10_iter)
    assert error_10_iter == error_5_iter_plus_5
def run_FM_cv(X_learn=None, Y_learn=None, X_test=None, Y_test=None,
              dict_cv=None, score_func=None, scoring='mean_squared_error',
              columns=None, nb_trails=5, verbose=None):
    """Random-search FM hyper-parameters with K-fold CV, then refit on the best.

    Bug fixes:
    - the K-fold split referenced an undefined name ``nb_trials``
      (NameError at runtime); it now uses the ``nb_trails`` parameter that
      is actually declared;
    - the candidate model was built with *fresh* ``random.choice`` draws
      instead of the recorded (l2_reg_V, l2_reg_w, rank) triple, so the
      reported best parameters never matched the scored models; the
      recorded draw is now used for training;
    - the mutable default ``dict_cv`` dict is replaced by a ``None``
      sentinel (same effective default).

    Returns ``(y_pred, score_fm)``: test-set predictions of the refit model
    and ``score_func(Y_test, y_pred)``, or 0 when ``score_func`` is None.
    """
    if dict_cv is None:
        dict_cv = {
            "l2_reg_w": np.linspace(0.01, 15, 5),
            "l2_reg_V": np.linspace(0.01, 15, 5),
            "rank": [2, 4, 6, 8]
        }
    ss = []
    skf = list(cross_validation.KFold(X_learn.shape[0], nb_trails))
    for k, (train, test) in enumerate(skf):
        X_learn_cv, X_test_cv = X_learn[train], X_learn[test]
        Y_learn_cv, Y_test_cv = Y_learn[train], Y_learn[test]
        print("cv {}".format(k), end='\r')
        # Draw one random candidate and train with exactly that candidate.
        l2_reg_V = random.choice(dict_cv['l2_reg_V'])
        l2_reg_w = random.choice(dict_cv['l2_reg_w'])
        rank = random.choice(dict_cv['rank'])
        reg = als.FMRegression(l2_reg_V=l2_reg_V, l2_reg_w=l2_reg_w, rank=rank)
        reg.fit(X_learn_cv, Y_learn_cv)
        y_pred = reg.predict(X_test_cv)
        s = metrics.mean_squared_error(y_pred, Y_test_cv)
        ss.append((l2_reg_V, l2_reg_w, rank, s))
    # Lowest MSE wins.
    best = min(ss, key=operator.itemgetter(3))
    print(best)
    fm = als.FMRegression(l2_reg_V=best[0], l2_reg_w=best[1], rank=best[2])
    fm.fit(X_learn, Y_learn)
    y_pred = fm.predict(X_test)
    score_fm = 0
    if not (score_func is None):
        score_fm = score_func(Y_test, y_pred)
    return y_pred, score_fm
def test_warm_start_path():
    """One-iteration-at-a-time warm starts must trace the same RMSE path as cold restarts."""
    X, y, coef = make_user_item_regression(label_stdev=.4)
    from sklearn.cross_validation import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    X_train = sp.csc_matrix(X_train)
    X_test = sp.csc_matrix(X_test)

    n_iter = 10
    rank = 4
    seed = 333
    step_size = 1
    l2_reg_w = 0
    l2_reg_V = 0

    # Warm-start path: grow one model a single iteration at a time.
    fm = als.FMRegression(n_iter=0, l2_reg_w=l2_reg_w, l2_reg_V=l2_reg_V,
                          rank=rank, random_state=seed)
    fm.fit(X_train, y_train)  # initalize coefs

    rmse_train = []
    rmse_test = []
    for _ in range(1, n_iter):
        fm.fit(X_train, y_train, n_more_iter=step_size)
        rmse_train.append(
            np.sqrt(mean_squared_error(fm.predict(X_train), y_train)))
        rmse_test.append(
            np.sqrt(mean_squared_error(fm.predict(X_test), y_test)))

    print('------- restart ----------')
    # Cold-restart path: fit a fresh model for every iteration count.
    rmse_test_re = []
    rmse_train_re = []
    for i in np.arange(1, n_iter):
        fm = als.FMRegression(n_iter=i, l2_reg_w=l2_reg_w, l2_reg_V=l2_reg_V,
                              rank=rank, random_state=seed)
        fm.fit(X_train, y_train)
        rmse_test_re.append(
            np.sqrt(mean_squared_error(fm.predict(X_test), y_test)))
        rmse_train_re.append(
            np.sqrt(mean_squared_error(fm.predict(X_train), y_train)))

    assert_almost_equal(rmse_train, rmse_train_re)
    assert_almost_equal(rmse_test, rmse_test_re)
def test_fm_regression():
    """With enough ALS iterations the FM should reproduce the training targets."""
    w0, w, V, y, X = get_test_problem()

    fm = als.FMRegression(n_iter=1000, l2_reg_w=0, l2_reg_V=0, rank=2)
    fm.fit(X, y)
    assert_almost_equal(fm.predict(X), y, 3)

    # check different size
    fm = als.FMRegression(n_iter=1000, l2_reg_w=0, l2_reg_V=0, rank=5)
    X_big = sp.hstack([X, X])
    fm.fit(X_big, y)
    fm.predict(X_big[:2, ])
def _test_fm_regression_only_w0():
    """With w ignored and rank 0, only the intercept w0_ is touched by ALS."""
    X, y = get_small_data()

    # Zero iterations: the preset intercept must survive untouched.
    fm = als.FMRegression(n_iter=0, l2_reg_w=0, l2_reg_V=0, rank=0)
    fm.ignore_w = True
    fm.w0_ = 2
    fm.fit(X, y, warm_start=True)
    assert_almost_equal(fm.w0_, 2, 6)

    # One iteration: the intercept is updated to its expected value.
    fm = als.FMRegression(n_iter=1, l2_reg_w=0, l2_reg_V=0, rank=0)
    fm.ignore_w = True
    fm.w0_ = 2
    fm.fit(X, y, warm_start=True)
    assert_almost_equal(fm.w0_, 4466.6666666666661, 6)
def __init__(self, model_file="", n_iter=1000, init_stdev=0.1, rank=2,
             l2_reg_w=0.1, l2_reg_V=0.5):
    """Load a pickled FM model from *model_file* if it exists, else build a new one."""
    if not os.path.exists(model_file):
        # Positional order matches als.FMRegression(n_iter, init_stdev, rank, ...).
        self.fm = als.FMRegression(n_iter, init_stdev, rank, l2_reg_w, l2_reg_V)
        print('new')
    else:
        print('old')
        self.fm = joblib.load(model_file)
def predict_fastfm(self):
    """Predict ratings with fastFM using the solver named in Constants.FASTFM_METHOD.

    Stores the predictions in ``self.predictions`` as a side effect.
    """
    if Constants.USE_CONTEXT:
        # Attach the pre-computed context-topic vector to every record to score.
        for record in self.records_to_predict:
            important_record = record[Constants.REVIEW_ID_FIELD]
            record[Constants.CONTEXT_TOPICS_FIELD] = \
                self.context_topics_map[important_record]
    all_records = self.train_records + self.records_to_predict
    x_matrix, y_vector = fastfm_recommender.records_to_matrix(
        all_records, self.context_rich_topics)
    # One-hot encode the first two columns (presumably user and item ids —
    # TODO confirm against records_to_matrix).
    encoder = OneHotEncoder(categorical_features=[0, 1], sparse=True)
    encoder.fit(x_matrix)
    # Rows are ordered train-first, so slicing at len(train_records) splits them.
    x_train = encoder.transform(x_matrix[:len(self.train_records)])
    y_train = y_vector[:len(self.train_records)]
    x_test = encoder.transform(x_matrix[len(self.train_records):])
    if Constants.FASTFM_METHOD == 'mcmc':
        # solver = mcmc.FMRegression(n_iter=num_iters, rank=num_factors)
        solver = mcmc.FMRegression(rank=Constants.FM_NUM_FACTORS)
        # MCMC has no separate predict step: fit and predict in one call.
        self.predictions = solver.fit_predict(x_train, y_train, x_test)
    elif Constants.FASTFM_METHOD == 'als':
        solver = als.FMRegression(rank=Constants.FM_NUM_FACTORS)
        solver.fit(x_train, y_train)
        self.predictions = solver.predict(x_test)
    elif Constants.FASTFM_METHOD == 'sgd':
        solver = sgd.FMRegression(rank=Constants.FM_NUM_FACTORS)
        solver.fit(x_train, y_train)
        self.predictions = solver.predict(x_test)
def _build_als_model(param):
    """Construct an ALS-based FM regressor from a parameter dict."""
    keys = ('n_iter', 'init_stdev', 'rank', 'random_state',
            'l2_reg_w', 'l2_reg_V', 'l2_reg')
    return als.FMRegression(**{key: param[key] for key in keys})
def fmcv(rank, l2_reg_w, l2_reg_V):
    """Hyper-parameter search objective: negative test-set MSE (higher is better)."""
    model = als.FMRegression(n_iter=10, init_stdev=0.0001, rank=int(rank),
                             l2_reg_w=l2_reg_w, l2_reg_V=l2_reg_V)
    model.fit(X_train, y_train)
    # Negate so that maximising the score minimises the error.
    return -mean_squared_error(model.predict(X_test), y_test)
def fit(self, X, y):
    """Convert X to the sparse FM format, then fit an ALS FM regressor on it."""
    fm_input = self.prepare_fm(X)
    self.chrono.save('prepare data in sparse FM format')
    self.fm = als.FMRegression(n_iter=self.nb_iterations, rank=self.rank)
    self.fm.fit(fm_input, y)
    self.chrono.save('factor matrix')
def fm_reg(x_train, y_train, x_valid):
    """Produce stacked train/validation predictions from an ALS FM regressor."""
    from fastFM import als
    model = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=8,
                             l2_reg_w=0.2, l2_reg_V=0.5)
    fm_train, fm_test = stacking(model, x_train, y_train, x_valid, "fm")
    return fm_train, fm_test, 'fm'
def test_clone():
    """sklearn.base.clone must preserve hyper-parameters for both estimators."""
    from sklearn.base import clone
    for factory in (als.FMRegression, als.FMClassification):
        original = factory()
        copied = clone(original)
        assert original.get_params() == copied.get_params()
def fit(self, X, y, sample_weight=None, eval_set=None,
        sample_weight_eval_set=None, **kwargs):
    """Fit a fastFM model: classification when num_classes >= 2, else regression.

    Missing values are mean-imputed per column, features are standardised,
    and the design matrix is converted to CSR because fastFM requires
    sparse input.  Model and |w|-based importances are stored via
    ``set_model_properties``.
    """
    from fastFM import als
    X = dt.Frame(X)
    orig_cols = list(X.names)
    if self.num_classes >= 2:
        model = als.FMClassification(n_iter=self.params["n_iter"],
                                     init_stdev=self.params["init_stdev"],
                                     rank=self.params["rank"],
                                     l2_reg_w=self.params["l2_reg_w"],
                                     l2_reg_V=self.params["l2_reg_V"],
                                     random_state=self.random_state)
        lb = LabelEncoder()
        lb.fit(self.labels)
        y = lb.transform(y)
        # fastFM's classifier expects labels encoded as {-1, +1}.
        y[y != 1] = -1
    else:
        model = als.FMRegression(n_iter=self.params["n_iter"],
                                 init_stdev=self.params["init_stdev"],
                                 rank=self.params["rank"],
                                 l2_reg_w=self.params["l2_reg_w"],
                                 l2_reg_V=self.params["l2_reg_V"],
                                 random_state=self.random_state)
    # Mean-impute each column; an all-missing column (NaN mean) falls back to 0.
    self.means = dict()
    self.scaler = StandardScaler()
    for col in X.names:
        XX = X[:, col]
        self.means[col] = XX.mean1()
        if np.isnan(self.means[col]):
            self.means[col] = 0
        XX.replace(None, self.means[col])
        X[:, col] = XX
        # Sanity check: no missing values may remain after imputation.
        assert X[dt.isna(dt.f[col]), col].nrows == 0
    X = X.to_numpy()
    X = self.scaler.fit_transform(X)
    X = csr_matrix(X)  # requires sparse matrix
    model.fit(X, y)
    # Importances approximated by the magnitude of the first-order weights.
    importances = np.array(abs(model.w_))
    self.set_model_properties(
        model=model,
        features=orig_cols,
        importances=importances.tolist(),  # abs(model.coef_[0])
        iterations=0)
def main():
    """Train a TF-IDF + FM duplicate-question model and write a submission CSV."""
    vectorizer = build_vectorizer(binary=False)
    print('loading output...')
    train_data = pd.read_csv(TRAIN_FILE)
    test_data = pd.read_csv(TEST_FILE)
    #train_data['qpair'] = train_data.apply(lambda r: '{0} {1}'.format(str(r.question1), str(r.question2)), axis=1)
    #test_data['qpair'] = test_data.apply(lambda r: '{0} {1}'.format(str(r.question1), str(r.question2)), axis=1)
    # Fit the vectorizer on all questions (train + test) so they share one vocabulary.
    combined = pd.concat([
        train_data.question1, train_data.question2, test_data.question1,
        test_data.question2
    ], axis=0, ignore_index=True)
    combined = combined.fillna('na')
    print(combined.head())
    print('fitting tf_idf vectorizer...')
    features = vectorizer.fit_transform(combined)
    train_size = len(train_data.question1)
    test_size = len(test_data.question1)
    # Recover the per-question slices in the same order they were stacked above.
    f_train_q1 = features[0:train_size]
    f_train_q2 = features[train_size:train_size * 2]
    f_test_q1 = features[train_size * 2:train_size * 2 + test_size]
    f_test_q2 = features[train_size * 2 + test_size:]
    # A question pair is represented by concatenating its two TF-IDF vectors.
    f_train = sp.hstack([f_train_q1, f_train_q2])
    f_test = sp.hstack([f_test_q1, f_test_q2])
    X_train, X_cv, y_train, y_cv = train_test_split(f_train,
                                                    train_data.is_duplicate,
                                                    test_size=0.2,
                                                    random_state=1234)
    print('training FM model...')
    fm = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=4, l2_reg_w=0.1,
                          l2_reg_V=0.5)
    fm.fit(X_train, y_train)
    print('cross validation...')
    predictions = fm.predict(X_cv)
    print('cv log-loss: {0}'.format(log_loss(y_cv, predictions)))
    print('cv auc: {0}'.format(roc_auc_score(y_cv, predictions)))
    print('predicting {0} test samples...'.format(f_test.shape[0]))
    predictions = pd.DataFrame()
    predictions['test_id'] = range(0, f_test.shape[0])
    predictions['is_duplicate'] = fm.predict(f_test)
    # Any NaN predictions fall back to the global positive-class proportion.
    predictions = predictions.fillna(POS_PROP)
    predictions.to_csv(SUBMISSION_FILE, index=False)
def fastfm(self):
    """Train an ALS FM regressor and report (AUC, RMSE) on the test split.

    Dead code removed: precision, recall and the derived F-measure were
    computed but never used (only (auc, rmse) is returned), so those
    side-effect-free calls are dropped.
    """
    fm = als.FMRegression(n_iter=100, init_stdev=0.1, rank=4,
                          l2_reg_w=0.1, l2_reg_V=0.5)
    fm.fit(self.X_train, self.y_train)
    y_pred = fm.predict(self.X_test)
    # AUC is ranked on the raw continuous scores; RMSE measures regression error.
    auc = roc_auc_score(self.y_test, y_pred, average='macro')
    rmse = np.sqrt(mean_squared_error(self.y_test, y_pred))
    return (auc, rmse)
def train_model(x_train, y_train, n_iter, init_stdev=0.1, rank=2,
                l2_reg_w=0.1, l2_reg_V=0.5):
    """Fit an ALS FM regressor and return the trained model.

    Bug fix: the hyper-parameters ``init_stdev``, ``rank``, ``l2_reg_w`` and
    ``l2_reg_V`` were accepted but silently ignored — the model was always
    built with rank=1, l2_reg_w=1, l2_reg_V=2 and the library default
    init_stdev.  They are now forwarded to the estimator, so the defaults
    keep their documented meaning and callers' arguments take effect.
    """
    fm = als.FMRegression(n_iter=n_iter, init_stdev=init_stdev, rank=rank,
                          l2_reg_w=l2_reg_w, l2_reg_V=l2_reg_V)
    fm.fit(x_train, y_train)
    return fm
def test_second_order_sgd_vs_als_regression():
    """SGD and ALS solvers should agree (to 2 decimals) on R^2 for the same data."""
    X, y = make_regression(n_samples=100, n_features=50, random_state=123)
    X = sp.csc_matrix(X)

    fm_sgd = sgd.FMRegression(n_iter=50000, init_stdev=0.00, l2_reg_w=0.0,
                              l2_reg_V=50.5, rank=2, step_size=0.0002)
    fm_als = als.FMRegression(n_iter=10, l2_reg_w=0, l2_reg_V=0, rank=2)

    # Fit/predict ALS first, then SGD, matching the original evaluation order.
    score_als = metrics.r2_score(fm_als.fit(X, y).predict(X), y)
    score_sgd = metrics.r2_score(fm_sgd.fit(X, y).predict(X), y)

    assert_almost_equal(score_sgd, score_als, decimal=2)
def praRegression(model, dirName, X_train, y_train, X_test, y_test,
                  train_pair, test_pair):
    """Train the regressor selected by *model*, score the test pairs, and
    write PRA-style score files.

    Prints the ROC-AUC of the predictions against y_test as a side effect.
    Exits the process when *model* is not a recognised name.
    """
    if (model == "lr"):
        reg = LogisticRegression(penalty="l1", n_jobs=-1)
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)
    elif (model == "sgdregressor"):
        reg = SGDRegressor(penalty='elasticnet', n_iter=500, l1_ratio=0.6)
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)
    elif (model == "fastFM"):
        reg = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=2,
                               l2_reg_w=0.1, l2_reg_V=0.5)
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)
    elif (model == "gbr"):
        # Gradient boosting is fit on a dense copy of the (sparse) matrix.
        reg = GradientBoostingRegressor(n_estimators=100)
        reg.fit(X_train.toarray(), y_train)
        y_pred = reg.predict(X_test)
    elif (model == "xgb"):
        # XGBoost prefers CSC input here.
        X_train = X_train.tocsc()
        X_test = X_test.tocsc()
        reg = xgb.XGBRegressor(max_depth=7, objective="rank:pairwise",
                               learning_rate=0.08, subsample=0.8,
                               colsample_bytree=0.7, reg_alpha=0.6,
                               reg_lambda=0.1, n_estimators=1500)
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)
    elif (model == "mxnet"):
        # mxTrainer trains and predicts in one call; no reg object is created.
        y_pred = mxTrainer(dirName, X_train, y_train, X_test, y_test,
                           train_pair, test_pair)
    else:
        exit(1)
    #y_pred = reg.predict(X_test)
    test_pair['score'] = y_pred
    print(roc_auc_score(y_true=y_test, y_score=y_pred))
    writeScoresInPraStyle(test_pair, train_pair, dirName)
def train(self):
    """Fit an ALS FM regressor on the one-hot categorical columns of the
    training data, pickle the model to OutputDir, and fold the validation
    rows back into TrainData for later use."""
    print('size before truncated outliers is %d ' % len(self.TrainData))
    #TrainData = self.TrainData[(self.TrainData['logerror'] > -0.4) & (self.TrainData['logerror'] < 0.418)]
    TrainData = self.TrainData
    print('size after truncated outliers is %d ' % len(TrainData))
    print('train data size %d' % len(TrainData))
    #self.__ExtraEncode()
    X = TrainData.drop(self._l_drop_cols, axis=1)
    Y = TrainData['logerror']
    l_train_columns = X.columns
    # Keep only one-hot columns derived from the categorical features
    # (their names start with "<categorical_col>_").
    cols = []
    for col in l_train_columns:
        for cc in self._l_cate_cols:
            if (col.startswith('%s_' % cc)):
                cols.append(col)
                break
    tmp_cols = set(cols)
    if (len(tmp_cols) != len(cols)):
        print('!!!! cols duplicated .')
    self._l_train_columns = list(tmp_cols)
    # fastFM requires a sparse design matrix.
    X = scipy.sparse.csr_matrix(X[self._l_train_columns])
    self._model = als.FMRegression(n_iter=self._iter, init_stdev=0.1,
                                   rank=self._rank, l2_reg_w=self._reg_w,
                                   l2_reg_V=self._reg_v)
    self._model.fit(X, Y)
    print('training done.')
    # Timestamped pickle path so repeated runs never overwrite each other.
    self._f_eval_train_model = '{0}/{1}_{2}.pkl'.format(
        self.OutputDir, self.__class__.__name__,
        datetime.now().strftime('%Y%m%d-%H:%M:%S'))
    with open(self._f_eval_train_model, 'wb') as o_file:
        pickle.dump(self._model, o_file, -1)
    o_file.close()
    self.TrainData = pd.concat(
        [self.TrainData, self.ValidData[self.TrainData.columns]],
        ignore_index=True)  ## ignore_index will reset the index or index will be overlaped
    return
def fastFMJob(data_path, params, N, vectorizer, solver):
    """Mean validation RMSE of a fastFM model over 4 train/val folds.

    *solver* selects the fastFM backend ("mcmc", "als" or "sgd"); *params*
    holds its hyper-parameters ('mi' = n_iter, 'f' = rank, plus the
    solver-specific regularisation / step-size entries).
    """
    rmses = []
    logging.info("Evaluando con params: {0}".format(params))
    for i in range(1, 4 + 1):
        train_data, y_tr, _ = loadData('train/train_N' + str(N) + '.' + str(i),
                                       data_path=data_path,
                                       with_timestamps=False,
                                       with_authors=False)
        val_data, y_va, _ = loadData('val/val_N' + str(N) + '.' + str(i),
                                     data_path=data_path,
                                     with_timestamps=False,
                                     with_authors=False)
        X_tr = vectorizer.transform(train_data)
        X_va = vectorizer.transform(val_data)
        if solver == "mcmc":
            # MCMC fits and predicts in a single pass.
            fm = mcmc.FMRegression(n_iter=params['mi'],
                                   init_stdev=params['init_stdev'],
                                   rank=params['f'], random_state=123,
                                   copy_X=True)
            preds = fm.fit_predict(X_tr, y_tr, X_va)
            rmse = sqrt(mean_squared_error(y_va, preds))
            logging.info("FM RMSE: {0}. Solver: {1}".format(rmse, solver))
            rmses.append(rmse)
        elif solver == "als":
            fm = als.FMRegression(n_iter=params['mi'],
                                  init_stdev=params['init_stdev'],
                                  rank=params['f'], random_state=123,
                                  l2_reg_w=params['l2_reg_w'],
                                  l2_reg_V=params['l2_reg_V'],
                                  l2_reg=params['l2_reg'])
            fm.fit(X_tr, y_tr)
            preds = fm.predict(X_va)
            rmse = sqrt(mean_squared_error(y_va, preds))
            logging.info("FM RMSE: {0}. Solver: {1}".format(rmse, solver))
            rmses.append(rmse)
        elif solver == "sgd":
            fm = sgd.FMRegression(n_iter=params['mi'],
                                  init_stdev=params['init_stdev'],
                                  rank=params['f'], random_state=123,
                                  l2_reg_w=params['l2_reg_w'],
                                  l2_reg_V=params['l2_reg_V'],
                                  l2_reg=params['l2_reg'],
                                  step_size=params['step_size'])
            fm.fit(X_tr, y_tr)
            preds = fm.predict(X_va)
            rmse = sqrt(mean_squared_error(y_va, preds))
            logging.info("FM RMSE: {0}. Solver: {1}".format(rmse, solver))
            rmses.append(rmse)
    return mean(rmses)
def run_FM(X_learn=None, Y_learn=None, X_test=None, Y_test=None,
           l2_reg_w=0.01, l2_reg_V=0.01, rank=2, score_func=None,
           columns=None, verbose=None):
    """Fit a single ALS FM model and score its predictions on the test set.

    Returns (predictions, score) where score is score_func(Y_test, predictions),
    or 0 when no score_func is given.
    """
    model = als.FMRegression(l2_reg_V=l2_reg_V, l2_reg_w=l2_reg_w, rank=rank)
    model.fit(X_learn, Y_learn)
    predictions = model.predict(X_test)
    score_fm = 0 if score_func is None else score_func(Y_test, predictions)
    return predictions, score_fm
def __init__(self, params: dict):
    """Build an ALS FM recommender from a parameter dict.

    Recognised keys (defaults in parentheses): model_info_used ("None"),
    n_iter (100), l2_reg_w (0.1), l2_reg_V (0.5), rank (2).
    """
    super().__init__(params)
    # choose which model to use, {"model_info_used": "None", "User", "Item", "Both"}
    # Idiom fix: dict.get replaces the repeated `d[k] if k in d else default`
    # pattern — identical behavior, one lookup instead of two.
    self.model_info_used = self.params.get("model_info_used", "None")
    self.fm = als.FMRegression(
        n_iter=self.params.get("n_iter", 100),
        l2_reg_w=self.params.get("l2_reg_w", 0.1),
        l2_reg_V=self.params.get("l2_reg_V", 0.5),
        rank=self.params.get("rank", 2))
    self.model = None
    self.v = DictVectorizer()
    self.user_info = None
    self.item_info = None
    self.X_train = None
    self.y_train = None
    self.X_test = None
    self.y_test = None
    self.pred = None
    # Tag the model name with the side-information mode for bookkeeping.
    self.model_name += self.model_info_used
def predict(train_records, test_records):
    """
    Makes a prediction for the testing set based on the topic probability
    vector of each record and the rating. The topic model is built using the
    training set. This function uses the FastFM Factorization Machines Module
    for Python

    :param train_records: the training set
    :param test_records: the testing set
    :return: a list with the predictions for the testing set
    """
    records = train_records + test_records
    # Every topic gets weight 1, i.e. all topics are treated as context-rich.
    context_rich_topics = [(i, 1) for i in range(num_topics)]
    new_matrix, new_y = records_to_matrix(records, context_rich_topics)
    print(new_matrix)
    # One-hot encode the first two columns (presumably user and item ids —
    # TODO confirm against records_to_matrix).
    encoder = OneHotEncoder(categorical_features=[0, 1], sparse=True)
    encoder.fit(new_matrix)
    # Rows are ordered train-first, so slicing at len(train_records) splits them.
    new_x = encoder.transform(new_matrix[:len(train_records)])
    # print(new_x.todense())
    # x_train, x_test, y_train, y_test = train_test_split(new_x, new_y)
    x_train = new_x
    y_train = new_y[:len(train_records)]
    x_test = encoder.transform(new_matrix[len(train_records):])
    # NOTE(review): the MCMC predictions below are printed but immediately
    # overwritten — only the ALS predictions are returned.
    mc_regressor = mcmc.FMRegression()
    y_pred = mc_regressor.fit_predict(x_train, y_train, x_test)
    print('********')
    print(x_test.todense())
    print(y_pred)
    als_fm = als.FMRegression(
        n_iter=1000, init_stdev=0.1, rank=2, l2_reg_w=0.1, l2_reg_V=0.5)
    als_fm.fit(x_train, y_train)
    y_pred = als_fm.predict(x_test)
    print(y_pred)
    return y_pred
def explicit(args):
    """Grid-search the FM rank on explicit ratings, then report test-set RMSE."""
    enc = Encoder()
    enc.load_item_attributes(os.path.join(args.in_dir, 'u.item'))
    enc.load_user_attributes(os.path.join(args.in_dir, 'u.user'))

    X, y = enc.get_Xy(os.path.join(args.in_dir, 'ua.base'))
    print(X.shape)
    print(len(y))

    # cross-validation over the factorization rank only
    estimator = als.FMRegression(random_state=args.random_state)
    folds = KFold(n_splits=5, shuffle=True, random_state=args.random_state)
    search = GridSearchCV(estimator, {'rank': [2, 4, 8, 16]},
                          scoring='neg_mean_squared_error', cv=folds)
    search.fit(X, y)
    best_fm = search.best_estimator_

    # held-out evaluation
    X_test, y_test = enc.get_Xy(os.path.join(args.in_dir, 'ua.test'),
                                test=True)
    y_pred = best_fm.predict(X_test)
    print(np.c_[y_test, y_pred][:10])
    mse = mean_squared_error(y_test, y_pred)
    print(f'RMSE: {np.sqrt(mse)}')
def create_model(alg="als", type="regression", rank=5, n_iter=100,
                 init_stdev=0.1, l2_reg_w=0.1, l2_reg_V=0.1, step_size=0.01):
    """Factory for fastFM estimators keyed on (solver, task).

    Returns None when the (alg, type) combination is not recognised.
    """
    shared = dict(n_iter=n_iter, init_stdev=init_stdev, rank=rank,
                  l2_reg_w=l2_reg_w, l2_reg_V=l2_reg_V)
    if alg == "als":
        if type == "regression":
            return als.FMRegression(**shared)
        if type == "classification":
            return als.FMClassification(**shared)
    elif alg == "sgd":
        # Only the SGD solver takes a step size.
        if type == "regression":
            return sgd.FMRegression(step_size=step_size, **shared)
        if type == "classification":
            return sgd.FMClassification(step_size=step_size, **shared)
    return None
def __init__(self):
    """Create the underlying ALS FM regressor with fixed hyper-parameters."""
    self.model = als.FMRegression(
        n_iter=1000, init_stdev=0.1, rank=2, l2_reg_w=0.1, l2_reg_V=0.5)
def fastFM_tuning(data_path, N, solver):
    """Greedy coordinate-wise hyper-parameter tuning for fastFM.

    Each hyper-parameter is swept in turn (holding the others at their
    current best), scored with fastFMJob (mean validation RMSE), and fixed
    at its best value before the next sweep.  The tuned model is then refit
    on the full training set, evaluated on the test set, and the chosen
    parameters / per-value RMSEs are written under TwitterRatings/fastFM/.

    Returns the dict of tuned parameter values ('mi' = n_iter, 'f' = rank).
    """
    all_data, y_all, _ = loadData("eval_all_N" + str(N) + ".data",
                                  data_path=data_path,
                                  with_timestamps=False,
                                  with_authors=False)
    v = DictVectorizer()
    X_all = v.fit_transform(all_data)

    # Starting values for the sweep; solver-specific keys only exist where
    # the solver accepts them.
    if solver == "mcmc":
        defaults = {'mi': 100, 'init_stdev': 0.1, 'f': 8}
    elif solver == "als":
        defaults = {
            'mi': 100,
            'init_stdev': 0.1,
            'f': 8,
            'l2_reg_w': 0.1,
            'l2_reg_V': 0.1,
            'l2_reg': 0
        }
    elif solver == "sgd":
        defaults = {
            'mi': 100,
            'init_stdev': 0.1,
            'f': 8,
            'l2_reg_w': 0.1,
            'l2_reg_V': 0.1,
            'l2_reg': 0,
            'step_size': 0.1
        }

    # results[param][value] -> mean validation RMSE for that value.
    results = dict((param, {}) for param in defaults.keys())

    for param in ['mi', 'f', 'init_stdev']:
        if param == 'mi':
            for i in [1, 5, 10, 20, 50, 100, 150, 200]:
                defaults['mi'] = i
                results['mi'][i] = fastFMJob(data_path=data_path,
                                             params=defaults, N=N,
                                             vectorizer=v, solver=solver)
            defaults['mi'] = opt_value(results=results['mi'], metric='rmse')
        elif param == 'f':
            # NOTE(review): `list + range(...)` is Python 2 only; on Python 3
            # this needs list(range(20, 2020, 20)) — confirm target version.
            for i in [1, 5, 8, 10] + range(20, 2020, 20):
                defaults['f'] = i
                results['f'][i] = fastFMJob(data_path=data_path,
                                            params=defaults, N=N,
                                            vectorizer=v, solver=solver)
            defaults['f'] = opt_value(results=results['f'], metric='rmse')
        elif param == 'init_stdev':
            for i in [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]:
                defaults['init_stdev'] = i
                results['init_stdev'][i] = fastFMJob(data_path=data_path,
                                                     params=defaults, N=N,
                                                     vectorizer=v,
                                                     solver=solver)
            defaults['init_stdev'] = opt_value(results=results['init_stdev'],
                                               metric='rmse')

    # Regularisation terms only exist for the als and sgd solvers.
    if solver != "mcmc":
        for param in ['l2_reg_w', 'l2_reg_V', 'l2_reg']:
            if param == 'l2_reg_w':
                for i in [
                        0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0
                ]:
                    defaults['l2_reg_w'] = i
                    results['l2_reg_w'][i] = fastFMJob(data_path=data_path,
                                                       params=defaults, N=N,
                                                       vectorizer=v,
                                                       solver=solver)
                defaults['l2_reg_w'] = opt_value(results=results['l2_reg_w'],
                                                 metric='rmse')
            elif param == 'l2_reg_V':
                for i in [
                        0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0
                ]:
                    defaults['l2_reg_V'] = i
                    results['l2_reg_V'][i] = fastFMJob(data_path=data_path,
                                                       params=defaults, N=N,
                                                       vectorizer=v,
                                                       solver=solver)
                defaults['l2_reg_V'] = opt_value(results=results['l2_reg_V'],
                                                 metric='rmse')
            elif param == 'l2_reg':
                for i in [
                        0.0, 0.001, 0.003, 0.005, 0.01, 0.02, 0.03, 0.04,
                        0.05, 0.07, 0.08, 0.1
                ]:
                    defaults['l2_reg'] = i
                    results['l2_reg'][i] = fastFMJob(data_path=data_path,
                                                     params=defaults, N=N,
                                                     vectorizer=v,
                                                     solver=solver)
                defaults['l2_reg'] = opt_value(results=results['l2_reg'],
                                               metric='rmse')

    # Step size only exists for the sgd solver.
    if solver == "sgd":
        for i in [
                0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009,
                0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.5
        ]:
            defaults['step_size'] = i
            results['step_size'][i] = fastFMJob(data_path=data_path,
                                                params=defaults, N=N,
                                                vectorizer=v, solver=solver)
        defaults['step_size'] = opt_value(results=results['step_size'],
                                          metric='rmse')

    # Real testing
    train_data, y_tr, _ = loadData('eval_train_N' + str(N) + '.data',
                                   data_path=data_path,
                                   with_timestamps=False,
                                   with_authors=False)
    test_data, y_te, _ = loadData('test/test_N' + str(N) + '.data',
                                  data_path=data_path,
                                  with_timestamps=False,
                                  with_authors=False)
    X_tr = v.transform(train_data)
    X_te = v.transform(test_data)
    if solver == "mcmc":
        fm = mcmc.FMRegression(n_iter=defaults['mi'],
                               init_stdev=defaults['init_stdev'],
                               rank=defaults['f'], random_state=123,
                               copy_X=True)
        preds = fm.fit_predict(X_tr, y_tr, X_te)
        rmse = sqrt(mean_squared_error(y_te, preds))
        logging.info("FM RMSE: {0}. Solver: {1}".format(rmse, solver))
        with open('TwitterRatings/fastFM/mcmc/clean/opt_params.txt', 'w') as f:
            for param in defaults:
                f.write("{param}:{value}\n".format(param=param,
                                                   value=defaults[param]))
            f.write("RMSE:{rmse}".format(rmse=rmse))
        with open('TwitterRatings/fastFM/mcmc/clean/params_rmses.txt',
                  'w') as f:
            for param in results:
                for value in results[param]:
                    f.write("{param}={value}\t : {RMSE}\n".format(
                        param=param, value=value, RMSE=results[param][value]))
    elif solver == "als":
        fm = als.FMRegression(n_iter=defaults['mi'],
                              init_stdev=defaults['init_stdev'],
                              rank=defaults['f'], random_state=123,
                              l2_reg_w=defaults['l2_reg_w'],
                              l2_reg_V=defaults['l2_reg_V'],
                              l2_reg=defaults['l2_reg'])
        fm.fit(X_tr, y_tr)
        preds = fm.predict(X_te)
        rmse = sqrt(mean_squared_error(y_te, preds))
        logging.info("FM RMSE: {0}. Solver: {1}".format(rmse, solver))
        with open('TwitterRatings/fastFM/als/clean/opt_params.txt', 'w') as f:
            for param in defaults:
                f.write("{param}:{value}\n".format(param=param,
                                                   value=defaults[param]))
            f.write("RMSE:{rmse}".format(rmse=rmse))
        with open('TwitterRatings/fastFM/als/clean/params_rmses.txt',
                  'w') as f:
            for param in results:
                for value in results[param]:
                    f.write("{param}={value}\t : {RMSE}\n".format(
                        param=param, value=value, RMSE=results[param][value]))
    elif solver == "sgd":
        fm = sgd.FMRegression(n_iter=defaults['mi'],
                              init_stdev=defaults['init_stdev'],
                              rank=defaults['f'], random_state=123,
                              l2_reg_w=defaults['l2_reg_w'],
                              l2_reg_V=defaults['l2_reg_V'],
                              l2_reg=defaults['l2_reg'],
                              step_size=defaults['step_size'])
        fm.fit(X_tr, y_tr)
        preds = fm.predict(X_te)
        rmse = sqrt(mean_squared_error(y_te, preds))
        logging.info("FM RMSE: {0}. Solver: {1}".format(rmse, solver))
        with open('TwitterRatings/fastFM/sgd/clean/opt_params.txt', 'w') as f:
            for param in defaults:
                f.write("{param}:{value}\n".format(param=param,
                                                   value=defaults[param]))
            f.write("RMSE:{rmse}".format(rmse=rmse))
        with open('TwitterRatings/fastFM/sgd/clean/params_rmses.txt',
                  'w') as f:
            for param in results:
                for value in results[param]:
                    f.write("{param}={value}\t : {RMSE}\n".format(
                        param=param, value=value, RMSE=results[param][value]))
    return defaults
def test_fm_linear_regression():
    """rank=0 reduces the FM to a plain regularized linear regression fit."""
    X, y = get_small_data()
    model = als.FMRegression(n_iter=1, rank=0, l2_reg_w=1, l2_reg_V=1)
    model.fit(X, y)
#fastFM sandbox from fastFM import als from sklearn.metrics import mean_squared_error import scipy.sparse as sp xtrain = sp.csc_matrix(X_train) xvalid = sp.csc_matrix(X_valid) n_iter = 50 rank = 4 seed = 42 step_size = 1 l2_reg_w = 0 l2_reg_V = 0 fm = als.FMRegression(n_iter=0, l2_reg_w=l2_reg_w, l2_reg_V=l2_reg_V, rank=rank, random_state=seed) # initalize coefs fm.fit(xtrain, Y_train.values) rmse_train = [] rmse_test = [] for i in range(1, n_iter): fm.fit(xtrain, Y_train.values, n_more_iter=step_size) y_pred = fm.predict(xvalid) train_err = np.sqrt(mean_squared_error(fm.predict(xtrain), Y_train.values)) valid_err = np.sqrt(mean_squared_error(fm.predict(xvalid), Y_valid.values)) print("train-rmse=%.4f valid-rmse=%.4f" % (train_err, valid_err)) rmse_train.append(train_err) rmse_test.append(valid_err) train_err = np.sqrt(mean_squared_error(fm.predict(xtrain).clip(0, 20), Y_train.values))
def _test_raise_when_input_is_dense():
    """Dense (non-sparse) input should be rejected by the ALS solver."""
    model = als.FMRegression(n_iter=0, l2_reg_w=0, l2_reg_V=0, rank=0)
    dense_X = np.arange(3, 4, dtype=np.float64)
    targets = np.arange(3, dtype=np.float64)
    model.fit(dense_X, targets, warm_start=True)