示例#1
0
def test_als_warm_start():
    """Check that 5 + 5 warm-started ALS iterations equal one 10-iteration fit.

    A model trained for 5 iterations and then continued with
    ``n_more_iter=5`` must produce the same test error as a model trained
    for 10 iterations in one go.
    """
    # sklearn.cross_validation was removed in scikit-learn 0.20; prefer the
    # model_selection module and fall back only on very old installations.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    X, y, coef = make_user_item_regression(label_stdev=0)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    X_train = sp.csc_matrix(X_train)
    X_test = sp.csc_matrix(X_test)

    # Reference: a single uninterrupted run of 10 iterations.
    fm = als.FMRegression(n_iter=10, l2_reg_w=0, l2_reg_V=0, rank=2)
    fm.fit(X_train, y_train)
    y_pred = fm.predict(X_test)
    error_10_iter = mean_squared_error(y_pred, y_test)

    # The same model stopped after 5 iterations ...
    fm = als.FMRegression(n_iter=5, l2_reg_w=0, l2_reg_V=0, rank=2)
    fm.fit(X_train, y_train)
    print(fm.iter_count)
    y_pred = fm.predict(X_test)
    error_5_iter = mean_squared_error(y_pred, y_test)

    # ... and then continued for 5 more (X_train is already CSC, so it is
    # passed as-is instead of being converted a second time).
    fm.fit(X_train, y_train, n_more_iter=5)
    print(fm.iter_count)
    y_pred = fm.predict(X_test)
    error_5_iter_plus_5 = mean_squared_error(y_pred, y_test)

    print(error_5_iter, error_5_iter_plus_5, error_10_iter)

    assert error_10_iter == error_5_iter_plus_5
示例#2
0
def run_FM_cv(
        X_learn=None,
        Y_learn=None,
        X_test=None,
        Y_test=None,
        dict_cv={
            "l2_reg_w": np.linspace(0.01, 15, 5),
            "l2_reg_V": np.linspace(0.01, 15, 5),
            "rank": [2, 4, 6, 8]
        },
        score_func=None,
        scoring='mean_squared_error',
        columns=None,
        nb_trails=5,
        verbose=None):
    """Choose FM hyper-parameters by random search over K folds, then refit.

    For each of the ``nb_trails`` folds one random combination of
    ``l2_reg_V``, ``l2_reg_w`` and ``rank`` is drawn from ``dict_cv``, an ALS
    FMRegression is fitted on the training part of the fold and its mean
    squared error on the held-out part is recorded.  The combination with
    the lowest error is used to fit a final model on all of ``X_learn``.

    :param X_learn: training design matrix (row-indexable)
    :param Y_learn: training targets (row-indexable)
    :param X_test: design matrix to predict
    :param Y_test: targets for ``X_test``; only used by ``score_func``
    :param dict_cv: candidate values per hyper-parameter (read-only here)
    :param score_func: optional ``f(Y_test, y_pred)`` scoring callable
    :param scoring: unused in this function
    :param columns: unused in this function
    :param nb_trails: number of folds, i.e. number of random trials
    :param verbose: unused in this function
    :return: ``(y_pred, score_fm)``; ``score_fm`` is 0 without ``score_func``
    """
    trials = []

    # The public parameter is spelled ``nb_trails``; the body previously
    # referenced an undefined ``nb_trials`` and raised NameError.
    folds = list(cross_validation.KFold(X_learn.shape[0], nb_trails))

    for k, (train, test) in enumerate(folds):

        X_learn_cv, X_test_cv = X_learn[train], X_learn[test]
        Y_learn_cv, Y_test_cv = Y_learn[train], Y_learn[test]

        print("cv {}".format(k), end='\r')
        l2_reg_V = random.choice(dict_cv['l2_reg_V'])
        l2_reg_w = random.choice(dict_cv['l2_reg_w'])
        rank = random.choice(dict_cv['rank'])

        # Fit with exactly the values that get recorded below; drawing a
        # second, independent sample here would score one configuration
        # and report another.
        reg = als.FMRegression(l2_reg_V=l2_reg_V,
                               l2_reg_w=l2_reg_w,
                               rank=rank)

        reg.fit(X_learn_cv, Y_learn_cv)

        y_pred = reg.predict(X_test_cv)

        s = metrics.mean_squared_error(y_pred, Y_test_cv)
        trials.append((l2_reg_V, l2_reg_w, rank, s))

    # Lowest fold MSE wins.
    best = min(trials, key=operator.itemgetter(3))
    print(best)

    fm = als.FMRegression(l2_reg_V=best[0], l2_reg_w=best[1], rank=best[2])

    fm.fit(X_learn, Y_learn)

    y_pred = fm.predict(X_test)

    score_fm = 0
    if score_func is not None:
        score_fm = score_func(Y_test, y_pred)

    return y_pred, score_fm
示例#3
0
def test_warm_start_path():
    """Check that one-iteration warm starts trace the same RMSE path as refits.

    A model is advanced one ALS iteration at a time with ``n_more_iter``;
    its train/test RMSE after each step must match a model trained from
    scratch (same seed) for the same total number of iterations.
    """
    # sklearn.cross_validation was removed in scikit-learn 0.20; prefer the
    # model_selection module and fall back only on very old installations.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    X, y, coef = make_user_item_regression(label_stdev=.4)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)
    X_train = sp.csc_matrix(X_train)
    X_test = sp.csc_matrix(X_test)
    n_iter = 10

    rank = 4
    seed = 333
    step_size = 1
    l2_reg_w = 0
    l2_reg_V = 0

    fm = als.FMRegression(n_iter=0,
                          l2_reg_w=l2_reg_w,
                          l2_reg_V=l2_reg_V,
                          rank=rank,
                          random_state=seed)
    # initialize coefs (n_iter=0, so no ALS sweep is requested yet)
    fm.fit(X_train, y_train)

    rmse_train = []
    rmse_test = []
    for _ in range(1, n_iter):
        fm.fit(X_train, y_train, n_more_iter=step_size)
        rmse_train.append(
            np.sqrt(mean_squared_error(fm.predict(X_train), y_train)))
        rmse_test.append(
            np.sqrt(mean_squared_error(fm.predict(X_test), y_test)))

    print('------- restart ----------')
    values = np.arange(1, n_iter)
    rmse_test_re = []
    rmse_train_re = []
    for i in values:
        # Fresh model with the same seed, trained for i iterations at once.
        fm = als.FMRegression(n_iter=i,
                              l2_reg_w=l2_reg_w,
                              l2_reg_V=l2_reg_V,
                              rank=rank,
                              random_state=seed)
        fm.fit(X_train, y_train)
        rmse_test_re.append(
            np.sqrt(mean_squared_error(fm.predict(X_test), y_test)))
        rmse_train_re.append(
            np.sqrt(mean_squared_error(fm.predict(X_train), y_train)))

    assert_almost_equal(rmse_train, rmse_train_re)
    assert_almost_equal(rmse_test, rmse_test_re)
示例#4
0
def test_fm_regression():
    """Fit an unregularised ALS model on the test problem and compare to y.

    Afterwards a rank-5 model is fitted on a column-duplicated design matrix
    and asked to predict two rows, as a smoke test for a different size.
    """
    w0, w, V, y, X = get_test_problem()

    unregularised = dict(n_iter=1000, l2_reg_w=0, l2_reg_V=0)

    model = als.FMRegression(rank=2, **unregularised)
    model.fit(X, y)
    assert_almost_equal(model.predict(X), y, 3)

    # check different size
    wide_model = als.FMRegression(rank=5, **unregularised)
    X_wide = sp.hstack([X, X])
    wide_model.fit(X_wide, y)
    y_pred = wide_model.predict(X_wide[:2, ])
示例#5
0
def _test_fm_regression_only_w0():
    """Check the intercept after 0 and after 1 warm-started ALS iterations.

    Both runs start from ``w0_ = 2`` with ``ignore_w`` set and rank 0.
    """
    X, y = get_small_data()

    # (number of iterations, intercept expected after fitting)
    scenarios = ((0, 2), (1, 4466.6666666666661))

    for n_iter, expected_w0 in scenarios:
        model = als.FMRegression(n_iter=n_iter,
                                 l2_reg_w=0,
                                 l2_reg_V=0,
                                 rank=0)
        model.ignore_w = True
        model.w0_ = 2
        model.fit(X, y, warm_start=True)
        assert_almost_equal(model.w0_, expected_w0, 6)
示例#6
0
 def __init__(self,model_file = "", n_iter=1000, init_stdev=0.1, rank=2, l2_reg_w=0.1, l2_reg_V=0.5):
     """Load a persisted FM model if ``model_file`` exists, else create one.

     :param model_file: path of a joblib dump to reuse; when it does not
         exist a fresh, unfitted ALS FMRegression is created instead
     :param n_iter: number of ALS iterations for a new model
     :param init_stdev: init_stdev forwarded to a new model
     :param rank: rank forwarded to a new model
     :param l2_reg_w: l2_reg_w forwarded to a new model
     :param l2_reg_V: l2_reg_V forwarded to a new model
     """
     if os.path.exists(model_file):
         print('old')
         # NOTE(review): joblib.load unpickles arbitrary objects; only point
         # model_file at dumps from a trusted source.
         self.fm = joblib.load(model_file)
     else:
         # Pass by keyword: positionally, the 4th constructor argument of
         # als.FMRegression is random_state, so l2_reg_w/l2_reg_V would be
         # bound to the wrong parameters.
         self.fm = als.FMRegression(n_iter=n_iter,
                                    init_stdev=init_stdev,
                                    rank=rank,
                                    l2_reg_w=l2_reg_w,
                                    l2_reg_V=l2_reg_V)
         print('new')
示例#7
0
    def predict_fastfm(self):
        """Train a fastFM regressor on the train records and predict the rest.

        The solver is selected by ``Constants.FASTFM_METHOD`` ('mcmc', 'als'
        or 'sgd'); the result is stored in ``self.predictions``.  For any
        other method value nothing is stored.
        """
        if Constants.USE_CONTEXT:
            # Attach the context topics of each record, looked up by review id.
            for rec in self.records_to_predict:
                review_id = rec[Constants.REVIEW_ID_FIELD]
                rec[Constants.CONTEXT_TOPICS_FIELD] = \
                    self.context_topics_map[review_id]

        x_matrix, y_vector = fastfm_recommender.records_to_matrix(
            self.train_records + self.records_to_predict,
            self.context_rich_topics)

        # The encoder is fitted on train and test rows together.
        encoder = OneHotEncoder(categorical_features=[0, 1], sparse=True)
        encoder.fit(x_matrix)

        n_train = len(self.train_records)
        x_train = encoder.transform(x_matrix[:n_train])
        y_train = y_vector[:n_train]
        x_test = encoder.transform(x_matrix[n_train:])

        method = Constants.FASTFM_METHOD

        if method == 'mcmc':
            # solver = mcmc.FMRegression(n_iter=num_iters, rank=num_factors)
            sampler = mcmc.FMRegression(rank=Constants.FM_NUM_FACTORS)
            self.predictions = sampler.fit_predict(x_train, y_train, x_test)
            return

        if method == 'als':
            solver = als.FMRegression(rank=Constants.FM_NUM_FACTORS)
        elif method == 'sgd':
            solver = sgd.FMRegression(rank=Constants.FM_NUM_FACTORS)
        else:
            return

        solver.fit(x_train, y_train)
        self.predictions = solver.predict(x_test)
示例#8
0
def _build_als_model(param):
    """Build an unfitted ALS FMRegression from a hyper-parameter mapping.

    Each key listed below is read from ``param`` and forwarded as the
    keyword argument of the same name; a missing key raises ``KeyError``.
    """
    forwarded = ('n_iter', 'init_stdev', 'rank', 'random_state',
                 'l2_reg_w', 'l2_reg_V', 'l2_reg')
    kwargs = {name: param[name] for name in forwarded}
    return als.FMRegression(**kwargs)
示例#9
0
 def fmcv(rank, l2_reg_w, l2_reg_V):
     """Return the negated test MSE of a 10-iteration ALS FM.

     ``rank`` is truncated with ``int``.  The train/test data come from the
     enclosing scope.
     """
     model = als.FMRegression(
         n_iter=10,
         init_stdev=0.0001,
         rank=int(rank),
         l2_reg_w=l2_reg_w,
         l2_reg_V=l2_reg_V,
     )
     model.fit(X_train, y_train)
     predictions = model.predict(X_test)
     return -mean_squared_error(predictions, y_test)
示例#10
0
文件: cfm.py 项目: xuedong/mangaki
    def fit(self, X, y):
        """Convert X with ``prepare_fm`` and fit an ALS FMRegression on it.

        The fitted estimator is stored in ``self.fm``; both stages are
        recorded through ``self.chrono.save``.
        """
        design = self.prepare_fm(X)
        self.chrono.save('prepare data in sparse FM format')

        self.fm = als.FMRegression(
            n_iter=self.nb_iterations,
            rank=self.rank,
        )
        self.fm.fit(design, y)
        self.chrono.save('factor matrix')
示例#11
0
def fm_reg(x_train, y_train, x_valid):
    """Run ``stacking`` with an ALS factorization machine regressor.

    :return: the two values returned by ``stacking`` followed by the
        model tag ``'fm'``
    """
    from fastFM import als

    hyper_params = dict(n_iter=1000,
                        init_stdev=0.1,
                        rank=8,
                        l2_reg_w=0.2,
                        l2_reg_V=0.5)
    regressor = als.FMRegression(**hyper_params)
    stacked_train, stacked_test = stacking(regressor, x_train, y_train,
                                           x_valid, "fm")
    return stacked_train, stacked_test, 'fm'
示例#12
0
def test_clone():
    """sklearn ``clone`` must preserve the parameters of both ALS estimators."""
    from sklearn.base import clone

    for estimator_cls in (als.FMRegression, als.FMClassification):
        original = estimator_cls()
        duplicate = clone(original)
        assert original.get_params() == duplicate.get_params()
示例#13
0
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        """Fit a fastFM ALS model on X/y and register it via set_model_properties.

        An FMClassification is used when ``self.num_classes >= 2``, otherwise
        an FMRegression.  Missing values are replaced by the column mean, the
        features are standard-scaled and converted to a CSR matrix before
        fitting.  ``sample_weight``, ``eval_set``, ``sample_weight_eval_set``
        and ``kwargs`` are not used in this method.
        """

        from fastFM import als
        X = dt.Frame(X)

        orig_cols = list(X.names)

        if self.num_classes >= 2:
            model = als.FMClassification(n_iter=self.params["n_iter"],
                                         init_stdev=self.params["init_stdev"],
                                         rank=self.params["rank"],
                                         l2_reg_w=self.params["l2_reg_w"],
                                         l2_reg_V=self.params["l2_reg_V"],
                                         random_state=self.random_state)
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
            # Every encoded label other than 1 is mapped to -1.
            y[y != 1] = -1

        else:
            model = als.FMRegression(n_iter=self.params["n_iter"],
                                     init_stdev=self.params["init_stdev"],
                                     rank=self.params["rank"],
                                     l2_reg_w=self.params["l2_reg_w"],
                                     l2_reg_V=self.params["l2_reg_V"],
                                     random_state=self.random_state)

        # Per-column means are kept on self, as is the fitted scaler.
        self.means = dict()
        self.scaler = StandardScaler()
        for col in X.names:
            XX = X[:, col]
            self.means[col] = XX.mean1()
            # A NaN mean is replaced by 0 before it is used as fill value.
            if np.isnan(self.means[col]):
                self.means[col] = 0
            XX.replace(None, self.means[col])
            X[:, col] = XX
            # Sanity check: no missing values may remain in this column.
            assert X[dt.isna(dt.f[col]), col].nrows == 0
        X = X.to_numpy()
        X = self.scaler.fit_transform(X)
        X = csr_matrix(X)  # requires sparse matrix
        model.fit(X, y)
        # Absolute values of the fitted model's w_ attribute serve as importances.
        importances = np.array(abs(model.w_))

        self.set_model_properties(
            model=model,
            features=orig_cols,
            importances=importances.tolist(),  # abs(model.coef_[0])
            iterations=0)
示例#14
0
def main():
    """Train an ALS FM on vectorised question pairs and write a submission.

    Both questions of every train and test row are vectorised with one
    shared vectorizer, the two question vectors of a row are stacked side by
    side, an FMRegression is fitted on 80% of the training rows, log-loss and
    AUC are printed for the remaining 20%, and predictions for the test rows
    are written to SUBMISSION_FILE.
    """

    vectorizer = build_vectorizer(binary=False)

    print('loading output...')
    train_data = pd.read_csv(TRAIN_FILE)
    test_data = pd.read_csv(TEST_FILE)
    #train_data['qpair'] = train_data.apply(lambda r: '{0} {1}'.format(str(r.question1), str(r.question2)), axis=1)
    #test_data['qpair'] = test_data.apply(lambda r: '{0} {1}'.format(str(r.question1), str(r.question2)), axis=1)
    # Concatenation order matters: the slices below rely on it.
    combined = pd.concat([
        train_data.question1, train_data.question2, test_data.question1,
        test_data.question2
    ],
                         axis=0,
                         ignore_index=True)
    combined = combined.fillna('na')
    print(combined.head())

    print('fitting tf_idf vectorizer...')
    features = vectorizer.fit_transform(combined)
    train_size = len(train_data.question1)
    test_size = len(test_data.question1)
    # Split the feature rows back into the four blocks concatenated above.
    f_train_q1 = features[0:train_size]
    f_train_q2 = features[train_size:train_size * 2]
    f_test_q1 = features[train_size * 2:train_size * 2 + test_size]
    f_test_q2 = features[train_size * 2 + test_size:]

    # One row per question pair: [features of q1 | features of q2].
    f_train = sp.hstack([f_train_q1, f_train_q2])
    f_test = sp.hstack([f_test_q1, f_test_q2])

    X_train, X_cv, y_train, y_cv = train_test_split(f_train,
                                                    train_data.is_duplicate,
                                                    test_size=0.2,
                                                    random_state=1234)

    print('training FM model...')
    fm = als.FMRegression(n_iter=1000,
                          init_stdev=0.1,
                          rank=4,
                          l2_reg_w=0.1,
                          l2_reg_V=0.5)
    fm.fit(X_train, y_train)

    print('cross validation...')
    # NOTE(review): these are raw regression outputs passed to log_loss;
    # confirm they are valid probabilities for that metric.
    predictions = fm.predict(X_cv)
    print('cv log-loss: {0}'.format(log_loss(y_cv, predictions)))
    print('cv auc: {0}'.format(roc_auc_score(y_cv, predictions)))

    print('predicting {0} test samples...'.format(f_test.shape[0]))
    predictions = pd.DataFrame()
    predictions['test_id'] = range(0, f_test.shape[0])
    predictions['is_duplicate'] = fm.predict(f_test)
    # Missing predictions are filled with POS_PROP.
    predictions = predictions.fillna(POS_PROP)
    predictions.to_csv(SUBMISSION_FILE, index=False)
示例#15
0
    def fastfm(self):
        """Fit an ALS FMRegression on the stored split and score the test set.

        :return: ``(auc, rmse)`` computed on ``self.X_test`` / ``self.y_test``
        """
        model = als.FMRegression(
            n_iter=100,
            init_stdev=0.1,
            rank=4,
            l2_reg_w=0.1,
            l2_reg_V=0.5,
        )
        model.fit(self.X_train, self.y_train)
        scores = model.predict(self.X_test)

        # Precision, recall and their harmonic mean are computed on rounded
        # predictions but are not part of the returned tuple.
        precision = precision_score(self.y_test, scores.round(),
                                    average='weighted')
        recall = recall_score(self.y_test, scores.round(),
                              average='weighted')
        fmeasure = 2 * ((precision * recall) / (precision + recall))

        auc = roc_auc_score(self.y_test, scores, average='macro')
        rmse = np.sqrt(mean_squared_error(self.y_test, scores))
        return (auc, rmse)
示例#16
0
def train_model(x_train,
                y_train,
                n_iter,
                init_stdev=0.1,
                rank=2,
                l2_reg_w=0.1,
                l2_reg_V=0.5):
    """Fit and return an ALS FMRegression on the given training data.

    :param x_train: training design matrix
    :param y_train: training targets
    :param n_iter: number of ALS iterations
    :param init_stdev: init_stdev forwarded to the model
    :param rank: rank forwarded to the model
    :param l2_reg_w: l2_reg_w forwarded to the model
    :param l2_reg_V: l2_reg_V forwarded to the model
    :return: the fitted model
    """
    # Forward the caller's hyper-parameters; previously rank, l2_reg_w and
    # l2_reg_V were accepted but silently replaced by hard-coded 1, 1 and 2.
    fm = als.FMRegression(n_iter=n_iter,
                          init_stdev=init_stdev,
                          rank=rank,
                          l2_reg_w=l2_reg_w,
                          l2_reg_V=l2_reg_V)
    fm.fit(x_train, y_train)
    return fm
示例#17
0
def test_second_order_sgd_vs_als_regression():
    """SGD and ALS solvers must reach the same R^2 to two decimals."""
    features, target = make_regression(n_samples=100,
                                       n_features=50,
                                       random_state=123)
    features = sp.csc_matrix(features)

    sgd_model = sgd.FMRegression(n_iter=50000,
                                 init_stdev=0.00,
                                 l2_reg_w=0.0,
                                 l2_reg_V=50.5,
                                 rank=2,
                                 step_size=0.0002)
    als_model = als.FMRegression(n_iter=10, l2_reg_w=0, l2_reg_V=0, rank=2)

    # ALS is fitted first, then SGD, on the same data.
    als_model.fit(features, target)
    als_predictions = als_model.predict(features)
    sgd_model.fit(features, target)
    sgd_predictions = sgd_model.predict(features)

    als_r2 = metrics.r2_score(als_predictions, target)
    sgd_r2 = metrics.r2_score(sgd_predictions, target)

    assert_almost_equal(sgd_r2, als_r2, decimal=2)
示例#18
0
def praRegression(model, dirName, X_train, y_train, X_test, y_test, train_pair,
                  test_pair):
    """Train the model selected by ``model``, score X_test and write results.

    ``model`` is one of "lr", "sgdregressor", "fastFM", "gbr", "xgb" or
    "mxnet"; any other value terminates the process with exit status 1.
    The predictions are stored in ``test_pair['score']``, the ROC AUC
    against ``y_test`` is printed and the scores are written with
    ``writeScoresInPraStyle``.
    """
    if (model == "lr"):
        reg = LogisticRegression(penalty="l1", n_jobs=-1)
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)
    elif (model == "sgdregressor"):
        reg = SGDRegressor(penalty='elasticnet', n_iter=500, l1_ratio=0.6)
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)
    elif (model == "fastFM"):
        reg = als.FMRegression(n_iter=1000,
                               init_stdev=0.1,
                               rank=2,
                               l2_reg_w=0.1,
                               l2_reg_V=0.5)
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)
    elif (model == "gbr"):
        reg = GradientBoostingRegressor(n_estimators=100)
        # NOTE(review): fitted on a dense copy of X_train but predicts on
        # X_test as passed in -- confirm this asymmetry is intended.
        reg.fit(X_train.toarray(), y_train)
        y_pred = reg.predict(X_test)
    elif (model == "xgb"):
        # Both matrices are converted to CSC before being handed to xgboost.
        X_train = X_train.tocsc()
        X_test = X_test.tocsc()
        reg = xgb.XGBRegressor(max_depth=7,
                               objective="rank:pairwise",
                               learning_rate=0.08,
                               subsample=0.8,
                               colsample_bytree=0.7,
                               reg_alpha=0.6,
                               reg_lambda=0.1,
                               n_estimators=1500)
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)
    elif (model == "mxnet"):
        # The mxnet path trains and predicts inside mxTrainer.
        y_pred = mxTrainer(dirName, X_train, y_train, X_test, y_test,
                           train_pair, test_pair)
    else:
        # Unknown model name: abort the whole process.
        exit(1)
    #y_pred = reg.predict(X_test)
    test_pair['score'] = y_pred
    print(roc_auc_score(y_true=y_test, y_score=y_pred))
    writeScoresInPraStyle(test_pair, train_pair, dirName)
示例#19
0
    def train(self):
        """Fit an ALS FM on the categorical indicator columns and pickle it.

        The feature columns are those columns of ``self.TrainData`` (after
        dropping ``self._l_drop_cols``) whose name starts with ``'<cc>_'``
        for some ``cc`` in ``self._l_cate_cols``; the target is the
        ``'logerror'`` column.  The fitted model is stored in
        ``self._model`` and dumped to a timestamped ``.pkl`` file under
        ``self.OutputDir``.  Finally the validation rows are appended to
        ``self.TrainData``.
        """
        print('size before truncated outliers is %d ' % len(self.TrainData))
        # Outlier truncation on 'logerror' is currently disabled, so the
        # full training frame is used.
        TrainData = self.TrainData
        print('size after truncated outliers is %d ' % len(TrainData))
        print('train data size %d' % len(TrainData))

        X = TrainData.drop(self._l_drop_cols, axis=1)
        Y = TrainData['logerror']

        # Keep each column that carries one of the categorical prefixes.
        cols = [
            col for col in X.columns
            if any(col.startswith('%s_' % cc) for cc in self._l_cate_cols)
        ]

        tmp_cols = set(cols)
        if len(tmp_cols) != len(cols):
            print('!!!! cols duplicated .')

        self._l_train_columns = list(tmp_cols)

        X = scipy.sparse.csr_matrix(X[self._l_train_columns])
        self._model = als.FMRegression(n_iter=self._iter,
                                       init_stdev=0.1,
                                       rank=self._rank,
                                       l2_reg_w=self._reg_w,
                                       l2_reg_V=self._reg_v)
        self._model.fit(X, Y)

        print('training done.')

        self._f_eval_train_model = '{0}/{1}_{2}.pkl'.format(self.OutputDir, self.__class__.__name__,datetime.now().strftime('%Y%m%d-%H:%M:%S'))
        # The with-block closes the file; no explicit close() is needed.
        with open(self._f_eval_train_model, 'wb') as o_file:
            pickle.dump(self._model, o_file, -1)

        # ignore_index resets the index so that train and validation rows
        # do not end up with overlapping index values.
        self.TrainData = pd.concat([self.TrainData, self.ValidData[self.TrainData.columns]], ignore_index=True)
示例#20
0
def fastFMJob(data_path, params, N, vectorizer, solver):
    """Return the mean validation RMSE of one parameter set over four folds.

    For folds 1..4 the train/validation files for ``N`` are loaded,
    vectorised with the already fitted ``vectorizer`` and a fastFM
    regressor for ``solver`` ("mcmc", "als" or "sgd") is trained with the
    values in ``params``.  A fold is skipped when ``solver`` is none of
    these.
    """
    rmses = []
    logging.info("Evaluando con params: {0}".format(params))

    for fold in range(1, 4 + 1):
        fold_tag = 'N' + str(N) + '.' + str(fold)
        train_data, y_tr, _ = loadData('train/train_' + fold_tag,
                                       data_path=data_path,
                                       with_timestamps=False,
                                       with_authors=False)
        val_data, y_va, _ = loadData('val/val_' + fold_tag,
                                     data_path=data_path,
                                     with_timestamps=False,
                                     with_authors=False)
        X_tr = vectorizer.transform(train_data)
        X_va = vectorizer.transform(val_data)

        if solver == "mcmc":
            model = mcmc.FMRegression(n_iter=params['mi'],
                                      init_stdev=params['init_stdev'],
                                      rank=params['f'],
                                      random_state=123,
                                      copy_X=True)
            # MCMC fits and predicts in one call.
            preds = model.fit_predict(X_tr, y_tr, X_va)
        elif solver == "als":
            model = als.FMRegression(n_iter=params['mi'],
                                     init_stdev=params['init_stdev'],
                                     rank=params['f'],
                                     random_state=123,
                                     l2_reg_w=params['l2_reg_w'],
                                     l2_reg_V=params['l2_reg_V'],
                                     l2_reg=params['l2_reg'])
            model.fit(X_tr, y_tr)
            preds = model.predict(X_va)
        elif solver == "sgd":
            model = sgd.FMRegression(n_iter=params['mi'],
                                     init_stdev=params['init_stdev'],
                                     rank=params['f'],
                                     random_state=123,
                                     l2_reg_w=params['l2_reg_w'],
                                     l2_reg_V=params['l2_reg_V'],
                                     l2_reg=params['l2_reg'],
                                     step_size=params['step_size'])
            model.fit(X_tr, y_tr)
            preds = model.predict(X_va)
        else:
            continue

        fold_rmse = sqrt(mean_squared_error(y_va, preds))
        logging.info("FM RMSE: {0}. Solver: {1}".format(fold_rmse, solver))
        rmses.append(fold_rmse)

    return mean(rmses)
示例#21
0
def run_FM(X_learn=None,
           Y_learn=None,
           X_test=None,
           Y_test=None,
           l2_reg_w=0.01,
           l2_reg_V=0.01,
           rank=2,
           score_func=None,
           columns=None,
           verbose=None):
    """Fit an ALS FMRegression on the learning set and predict ``X_test``.

    :param score_func: optional ``f(Y_test, y_pred)`` scoring callable
    :param columns: unused in this function
    :param verbose: unused in this function
    :return: ``(y_pred, score)``; ``score`` is 0 when no ``score_func``
        is given
    """
    model = als.FMRegression(l2_reg_V=l2_reg_V, l2_reg_w=l2_reg_w, rank=rank)
    model.fit(X_learn, Y_learn)
    predictions = model.predict(X_test)

    if score_func is None:
        return predictions, 0
    return predictions, score_func(Y_test, predictions)
示例#22
0
 def __init__(self, params: dict):
     """Create the ALS FM and reset all data/prediction attributes.

     Optional keys read from ``self.params``: ``model_info_used`` (default
     ``"None"``), ``n_iter`` (100), ``l2_reg_w`` (0.1), ``l2_reg_V`` (0.5)
     and ``rank`` (2).
     """
     super().__init__(params)
     settings = self.params

     # choose which model to use, {"model_info_used": "None", "User", "Item","Both"}
     self.model_info_used = settings.get("model_info_used", "None")

     self.fm = als.FMRegression(
         n_iter=settings.get("n_iter", 100),
         l2_reg_w=settings.get("l2_reg_w", 0.1),
         l2_reg_V=settings.get("l2_reg_V", 0.5),
         rank=settings.get("rank", 2),
     )
     self.model = None
     self.v = DictVectorizer()

     # Side information and data splits start out unset.
     self.user_info = None
     self.item_info = None
     self.X_train = None
     self.y_train = None
     self.X_test = None
     self.y_test = None
     self.pred = None

     self.model_name += self.model_info_used
示例#23
0
def predict(train_records, test_records):
    """
    Predicts the testing set with the FastFM Factorization Machines module.

    The records are turned into a matrix, one-hot encoded, and used to run
    first an MCMC regressor (whose predictions are only printed) and then an
    ALS regressor, whose predictions are returned.

    :param train_records: the training set
    :param test_records: the testing set
    :return: the ALS predictions for the testing set
    """
    topics = [(i, 1) for i in range(num_topics)]
    full_matrix, full_y = records_to_matrix(train_records + test_records,
                                            topics)
    print(full_matrix)

    # The encoder is fitted on train and test rows together.
    encoder = OneHotEncoder(categorical_features=[0, 1], sparse=True)
    encoder.fit(full_matrix)

    n_train = len(train_records)
    x_train = encoder.transform(full_matrix[:n_train])
    y_train = full_y[:n_train]
    x_test = encoder.transform(full_matrix[n_train:])

    mcmc_pred = mcmc.FMRegression().fit_predict(x_train, y_train, x_test)
    print('********')
    print(x_test.todense())
    print(mcmc_pred)

    als_model = als.FMRegression(n_iter=1000,
                                 init_stdev=0.1,
                                 rank=2,
                                 l2_reg_w=0.1,
                                 l2_reg_V=0.5)
    als_model.fit(x_train, y_train)
    als_pred = als_model.predict(x_test)
    print(als_pred)

    return als_pred
def explicit(args):
    """Grid-search the FM rank on ``ua.base`` and report RMSE on ``ua.test``.

    Item and user attributes are loaded from ``u.item`` / ``u.user`` in
    ``args.in_dir``; ``args.random_state`` seeds both the model and the
    5-fold shuffle.
    """
    in_dir = args.in_dir

    encoder = Encoder()
    encoder.load_item_attributes(os.path.join(in_dir, 'u.item'))
    encoder.load_user_attributes(os.path.join(in_dir, 'u.user'))
    X, y = encoder.get_Xy(os.path.join(in_dir, 'ua.base'))
    print(X.shape)
    print(len(y))

    # cross-validation
    base_model = als.FMRegression(random_state=args.random_state)
    folds = KFold(n_splits=5, shuffle=True, random_state=args.random_state)
    search = GridSearchCV(base_model, {'rank': [2, 4, 8, 16]},
                          scoring='neg_mean_squared_error', cv=folds)
    search.fit(X, y)
    best_model = search.best_estimator_

    X_test, y_test = encoder.get_Xy(os.path.join(in_dir, 'ua.test'),
                                    test=True)
    y_pred = best_model.predict(X_test)
    print(np.c_[y_test, y_pred][:10])

    mse = mean_squared_error(y_test, y_pred)
    print(f'RMSE: {np.sqrt(mse)}')
示例#25
0
def create_model(alg="als",
                 type="regression",
                 rank=5,
                 n_iter=100,
                 init_stdev=0.1,
                 l2_reg_w=0.1,
                 l2_reg_V=0.1,
                 step_size=0.01):
    """Create an unfitted fastFM model for the given solver and task.

    :param alg: ``"als"`` or ``"sgd"``
    :param type: ``"regression"`` or ``"classification"``
    :param step_size: only forwarded to the SGD models
    :return: the model, or ``None`` for an unsupported combination
    """
    shared = dict(n_iter=n_iter,
                  init_stdev=init_stdev,
                  rank=rank,
                  l2_reg_w=l2_reg_w,
                  l2_reg_V=l2_reg_V)

    if alg == "als":
        if type == "regression":
            return als.FMRegression(**shared)
        if type == "classification":
            return als.FMClassification(**shared)
    elif alg == "sgd":
        if type == "regression":
            return sgd.FMRegression(step_size=step_size, **shared)
        if type == "classification":
            return sgd.FMClassification(step_size=step_size, **shared)

    return None
示例#26
0
 def __init__(self):
     """Create the ALS FMRegression held in ``self.model`` with fixed settings."""
     hyper_params = dict(
         n_iter=1000,
         init_stdev=0.1,
         rank=2,
         l2_reg_w=0.1,
         l2_reg_V=0.5,
     )
     self.model = als.FMRegression(**hyper_params)
示例#27
0
def fastFM_tuning(data_path, N, solver):
    """Tune fastFM hyper-parameters one at a time and evaluate on the test set.

    Each hyper-parameter is swept over a fixed list of candidates while the
    others keep their current value; after a sweep the parameter is set to
    the value chosen by ``opt_value``.  The sweep order is ``mi``, ``f``,
    ``init_stdev``, then (not for "mcmc") ``l2_reg_w``, ``l2_reg_V``,
    ``l2_reg``, then (only for "sgd") ``step_size``.  The tuned parameters
    and all intermediate RMSEs are written below
    ``TwitterRatings/fastFM/<solver>/clean/``.

    :param data_path: directory handed to ``loadData`` / ``fastFMJob``
    :param N: data-set size tag used in the file names
    :param solver: ``"mcmc"``, ``"als"`` or ``"sgd"``
    :return: dict with the tuned hyper-parameters
    :raises ValueError: if ``solver`` is not one of the supported names
    """
    if solver not in ("mcmc", "als", "sgd"):
        # Previously an unknown solver failed later with an unbound local.
        raise ValueError("unknown solver: {0!r}".format(solver))

    all_data, y_all, _ = loadData("eval_all_N" + str(N) + ".data",
                                  data_path=data_path,
                                  with_timestamps=False,
                                  with_authors=False)
    # The vectorizer is fitted on the full data set; the transformed matrix
    # itself is not needed here.
    v = DictVectorizer()
    v.fit_transform(all_data)

    defaults = {'mi': 100, 'init_stdev': 0.1, 'f': 8}
    if solver != "mcmc":
        defaults['l2_reg_w'] = 0.1
        defaults['l2_reg_V'] = 0.1
        defaults['l2_reg'] = 0
    if solver == "sgd":
        defaults['step_size'] = 0.1

    small_grid = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]
    sweeps = [
        ('mi', [1, 5, 10, 20, 50, 100, 150, 200]),
        # list(...) keeps this working on Python 3, where range() is lazy
        # and cannot be concatenated to a list.
        ('f', [1, 5, 8, 10] + list(range(20, 2020, 20))),
        ('init_stdev', small_grid),
    ]
    if solver != "mcmc":
        sweeps += [
            ('l2_reg_w', small_grid),
            ('l2_reg_V', small_grid),
            ('l2_reg', [
                0.0, 0.001, 0.003, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05,
                0.07, 0.08, 0.1
            ]),
        ]
    if solver == "sgd":
        sweeps.append(('step_size', [
            0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009,
            0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.5
        ]))

    results = dict((param, {}) for param in defaults.keys())

    for param, candidates in sweeps:
        for value in candidates:
            defaults[param] = value
            results[param][value] = fastFMJob(data_path=data_path,
                                              params=defaults,
                                              N=N,
                                              vectorizer=v,
                                              solver=solver)
        # Freeze the parameter at its selected value before the next sweep.
        defaults[param] = opt_value(results=results[param], metric='rmse')

    # Real testing
    train_data, y_tr, _ = loadData('eval_train_N' + str(N) + '.data',
                                   data_path=data_path,
                                   with_timestamps=False,
                                   with_authors=False)
    test_data, y_te, _ = loadData('test/test_N' + str(N) + '.data',
                                  data_path=data_path,
                                  with_timestamps=False,
                                  with_authors=False)
    X_tr = v.transform(train_data)
    X_te = v.transform(test_data)

    fm = _build_fm_regressor(solver, defaults)
    if solver == "mcmc":
        # MCMC fits and predicts in a single call.
        preds = fm.fit_predict(X_tr, y_tr, X_te)
    else:
        fm.fit(X_tr, y_tr)
        preds = fm.predict(X_te)

    rmse = sqrt(mean_squared_error(y_te, preds))
    logging.info("FM RMSE: {0}. Solver: {1}".format(rmse, solver))
    _write_tuning_report(solver, defaults, results, rmse)

    return defaults


def _build_fm_regressor(solver, params):
    """Return an unfitted fastFM regressor for ``solver`` configured from ``params``."""
    common = dict(n_iter=params['mi'],
                  init_stdev=params['init_stdev'],
                  rank=params['f'],
                  random_state=123)
    if solver == "mcmc":
        return mcmc.FMRegression(copy_X=True, **common)

    common.update(l2_reg_w=params['l2_reg_w'],
                  l2_reg_V=params['l2_reg_V'],
                  l2_reg=params['l2_reg'])
    if solver == "als":
        return als.FMRegression(**common)
    return sgd.FMRegression(step_size=params['step_size'], **common)


def _write_tuning_report(solver, defaults, results, rmse):
    """Write the tuned parameters and every swept RMSE to the solver's output files."""
    out_dir = 'TwitterRatings/fastFM/{0}/clean/'.format(solver)

    with open(out_dir + 'opt_params.txt', 'w') as f:
        for param in defaults:
            f.write("{param}:{value}\n".format(param=param,
                                               value=defaults[param]))
        f.write("RMSE:{rmse}".format(rmse=rmse))

    with open(out_dir + 'params_rmses.txt', 'w') as f:
        for param in results:
            for value in results[param]:
                f.write("{param}={value}\t : {RMSE}\n".format(
                    param=param, value=value, RMSE=results[param][value]))
示例#28
0
def test_fm_linear_regression():
    """Smoke test: a rank-0 ALS model fits the small data set without error."""
    features, target = get_small_data()

    linear_model = als.FMRegression(
        n_iter=1,
        l2_reg_w=1,
        l2_reg_V=1,
        rank=0,
    )
    linear_model.fit(features, target)
示例#29
0
# fastFM sandbox: train an ALS factorization machine one iteration at a time
# and print the train/validation RMSE after every step.
# NOTE(review): np, X_train, X_valid, Y_train and Y_valid are not defined in
# this snippet -- presumably provided by earlier code; confirm before running.
from fastFM import als
from sklearn.metrics import mean_squared_error
import scipy.sparse as sp

# The inputs are converted to CSC sparse matrices before fitting.
xtrain = sp.csc_matrix(X_train)
xvalid = sp.csc_matrix(X_valid)

n_iter = 50
rank = 4
seed = 42
step_size = 1  # number of additional ALS iterations per warm-start step
l2_reg_w = 0
l2_reg_V = 0

fm = als.FMRegression(n_iter=0, l2_reg_w=l2_reg_w,
        l2_reg_V=l2_reg_V, rank=rank, random_state=seed)
# initialize coefs (n_iter=0, so no ALS sweep is requested yet)
fm.fit(xtrain, Y_train.values)

rmse_train = []
rmse_test = []
# range(1, n_iter) yields n_iter - 1 warm-start steps of step_size iterations.
for i in range(1, n_iter):
    fm.fit(xtrain, Y_train.values, n_more_iter=step_size)
    # y_pred is not used below; the errors are recomputed from fresh predictions.
    y_pred = fm.predict(xvalid)
    train_err = np.sqrt(mean_squared_error(fm.predict(xtrain), Y_train.values))
    valid_err = np.sqrt(mean_squared_error(fm.predict(xvalid), Y_valid.values))
    print("train-rmse=%.4f valid-rmse=%.4f" % (train_err, valid_err))
    rmse_train.append(train_err)
    rmse_test.append(valid_err)

# Final train RMSE with the predictions clipped to the range [0, 20].
train_err = np.sqrt(mean_squared_error(fm.predict(xtrain).clip(0, 20), Y_train.values))
示例#30
0
def _test_raise_when_input_is_dense():
    """Call ``fit`` with dense numpy arrays instead of a sparse matrix."""
    dense_X = np.arange(3, 4, dtype=np.float64)
    dense_y = np.arange(3, dtype=np.float64)

    model = als.FMRegression(n_iter=0, l2_reg_w=0, l2_reg_V=0, rank=0)
    model.fit(dense_X, dense_y, warm_start=True)