def demo_xlearn_0():
    """Train a binary FM on the global train split and print test predictions."""
    # SGD-optimised FM; AUC metric with a 10-epoch early-stopping window.
    model = xl.FMModel(
        task='binary',
        init=0.1,
        epoch=100,
        k=16,
        lr=0.1,
        reg_lambda=0.01,
        opt='sgd',
        n_jobs=4,
        metric='auc',
        stop_window=10,
    )
    # Lock-free updates disabled for reproducible training.
    model.fit(X_tr, y_tr, eval_set=[X_te, y_te], is_lock_free=False)
    predictions = model.predict(X_te)
    print(predictions, type(predictions))
def train(X, y):
    """Fit a small binary FM on (X, y), print its weights, and return it."""
    fm = xl.FMModel(task='binary', init=0.1, epoch=10, k=4,
                    lr=0.1, reg_lambda=0.01, opt='sgd', metric='auc')
    fm.fit(X, y)
    # Inspect the learned model weights after fitting.
    print('====>>>> weights of FM-Model: {}'.format(fm.weights))
    return fm
def train_model(train_X, valid_X, train_y, valid_y):
    """Train an FM model and report the validation AUC.

    :param train_X: training features
    :param valid_X: validation features
    :param train_y: training labels
    :param valid_y: validation labels
    :return: the fitted FM model (previously trained and then discarded)
    """
    fm_model = xl.FMModel(lr=0.02, reg_lambda=0.001, k=18, epoch=10, stop_window=4)
    fm_model.fit(train_X, train_y)
    y_pred = fm_model.predict(valid_X)
    # roc_curve expects array-like scores; predict() may return a list.
    fpr, tpr, thresholds = roc_curve(valid_y, np.array(y_pred))
    aucs = auc(fpr, tpr)
    print(aucs)
    # FIX: return the model so callers can actually use what was trained.
    return fm_model
def create_fm_model(cls, train_x, train_y, dev_x=None, dev_y=None, model_output=None,
                    iterations=100, thread_count=4, task='binary', k=16, lr=0.1,
                    metric='auc', stop_window=100):
    '''
    Factorization machine model.
    :param train_x: training features
    :param train_y: training labels
    :param dev_x: validation features
    :param dev_y: validation labels
    :param iterations: number of training epochs
    :param model_output: model save path (NOTE(review): currently unused)
    :param thread_count: number of CPU threads
    :param task: task type
    :param k: latent-vector dimension
    :param lr: learning rate
    :param metric: evaluation metric
    :param stop_window: early-stopping window
    :return: the fitted FM model
    '''
    fm_model = xl.FMModel(task=task, init=0.1, k=k, lr=lr, reg_lambda=0.01,
                          epoch=iterations, opt='sgd', n_jobs=thread_count,
                          metric=metric, stop_window=stop_window)
    # BUG FIX: `if dev_x and dev_y` raises ValueError for numpy arrays /
    # DataFrames (ambiguous truth value) and wrongly skips falsy-but-present
    # eval sets -- test for None explicitly instead.
    if dev_x is not None and dev_y is not None:
        fm_model.fit(train_x, train_y, eval_set=[dev_x, dev_y], is_lock_free=False)
    else:
        fm_model.fit(train_x, train_y, is_lock_free=False)
    return fm_model
def FM(X_train, y_train, X_val, y_val):
    """Train a binary FM with adagrad and return validation predictions."""
    # k is the latent factor size; adagrad with a tiny L2 penalty.
    model = xl.FMModel(task='binary', epoch=100, lr=0.05,
                       reg_lambda=0.00002, k=40, opt='adagrad',
                       nthread=8, metric='auc')
    # Instance-wise normalisation during training, monitored on the holdout.
    model.fit(X_train, y_train, eval_set=[X_val, y_val], is_instance_norm=True)
    return model.predict(X_val)
ffm_model.setTrain(dataTrain)
ffm_model.setValidate(dataTest)
param={'task':'binary', 'lr':0.2, 'lambda':0.002, 'metric':'acc'}
ffm_model.fit(param,"./model.out")
ffm_model.setTest(dataTest)
ffm_model.predict("./model.out","model./output.txt")
'''
# NOTE(review): the triple-quote above appears to CLOSE a string opened
# before this chunk (the lines above look like dead legacy-API code kept
# inside a docstring) -- confirm against the full file before editing.
# Scikit-style FM trained on in-memory data with a validation eval set.
linear_model = xl.FMModel(task='binary', lr=0.2, k=10, init=0.1,
                          epoch=500, reg_lambda=0.002, metric='acc')
linear_model.fit(dataTrain, labelTrain, eval_set=[dataTest, labelTest])
y_pred = linear_model.predict(dataTest)
y_pred = list(y_pred)
print(y_pred)
# Count only false negatives: predicted score < 0.5 while the label is 1.
# NOTE(review): false positives are not counted -- verify this is intended.
error = 0
item = 0
for i in range(len(y_pred)):
    item += 1
    if y_pred[i] < 0.5 and labelTest[i] == 1:
        error += 1
X_val, \
y_train, \
y_val = train_test_split(X, y, test_size=0.2, random_state=0)
# NOTE(review): this statement begins before the visible chunk --
# presumably "X_train, \" precedes it; confirm against the full file.

# Standardise input features (scaler fitted on the training split only).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# FM hyper-parameters:
# 0. binary classification
# 1. model scale: 0.1
# 2. epoch number: 10 (auto early-stop)
# 3. number of latent factor: 4
# 4. learning rate: 0.1
# 5. regular lambda: 0.01
# 6. use sgd optimization method
# 7. evaluation metric: accuracy
fm_model = xl.FMModel(task='binary', init=0.1,
                      epoch=10, k=4, lr=0.1,
                      reg_lambda=0.01, opt='sgd', metric='acc')
# Start to train
fm_model.fit(X_train, y_train, eval_set=[X_val, y_val])
# Generate predictions
y_pred = fm_model.predict(X_val)
# Baseline FM: fit once to score the held-out split, then refit with an
# eval_set (early stopping) before predicting the submission rows.
model = xl.FMModel()
model.fit(X_train, y_train)
preds = model.predict(X_test)
print("FMModel ROC AUC:%.3f" % roc_auc_score(y_true=y_test, y_score=preds))
# BUG FIX: accuracy_score requires hard class labels, not continuous FM
# scores (mixing binary and continuous targets is an error) -- threshold
# the probabilities at 0.5 first.
print("FMModel accuracy_scorer:%.3f" % accuracy_score(
    y_true=y_test, y_pred=(preds > 0.5).astype(int)))

# Retrain with the test fold as an eval_set, then build the submission frame.
model.fit(X_train, y_train, eval_set=[X_test, y_test], is_lock_free=False)
y_submission = model.predict(X_submission)
y_pred = pd.DataFrame()
y_pred["PassengerId"] = data_test["PassengerId"]
y_pred["Survived"] = y_submission
# Binary FM trained with SGD; hyper-parameters spelled out one per line.
fm_model = xl.FMModel(
    model_type='fm',
    task='binary',
    metric='auc',
    block_size=500,
    lr=0.2,
    k=4,
    reg_lambda=0.1,
    init=0.1,
    fold=5,
    epoch=5,
    stop_window=2,
    opt='sgd',
    nthread=None,
    n_jobs=4,
    alpha=1,
    beta=1,
    lambda_1=1,
    lambda_2=1,
)
# Train against the validation fold, then score and print it.
fm_model.fit(X_train, y_train, eval_set=[X_val, y_val], is_lock_free=False)
y_pred = fm_model.predict(X_val)
print(y_pred)
import xlearn as xl
import pandas as pd

# Space-separated demo file: first column is the target, rest are features.
frame = pd.read_csv('test/support/data.txt', sep=' ',
                    names=['y', 'x0', 'x1', 'x2', 'x3'])
features = frame.drop(columns=['y'])
target = frame['y']

# Regression FM fitted from in-memory pandas objects.
model = xl.FMModel(task='reg', nthread=1, opt='adagrad')
model.fit(features, target)
print('weights', model.weights)
print('predict', model.predict(features)[0:6].tolist())

# The same model fitted directly from a text file path.
model = xl.FMModel(task='reg', nthread=1, opt='adagrad')
model.fit('test/support/data.txt')
print('predict txt', model.predict('test/support/data.txt')[0:6].tolist())

# And from a CSV path.
model = xl.FMModel(task='reg', nthread=1, opt='adagrad')
model.fit('test/support/data.csv')
print('predict csv', model.predict('test/support/data.csv')[0:6].tolist())
xTest = xTest.fillna(-999)

# Label-encode the categorical columns of the final test frame.
# NOTE(review): fit_transform refits the encoder per column on test data --
# verify this matches how the training columns were encoded.
for column_name in DataTestFin.columns:
    if DataTestFin[column_name].dtype.name == 'category':
        DataTestFin.loc[:, column_name] = le.fit_transform(
            DataTestFin[column_name].astype(str))
DataTestFin = DataTestFin.fillna(-999)

# Binary FM (despite the `ffm_model` name) with a small learning rate.
ffm_model = xl.FMModel(task='binary', lr=0.001, epoch=100,
                       reg_lambda=0.01, metric='auc')
print("Fitting Model...")
# Train with the held-out split as the evaluation set.
ffm_model.fit(xTrain, yTrain, eval_set=[xTest, yTest])
print("Generating predictions")
y_pred = ffm_model.predict(DataTestFin)
from KmmtML.Utils.Util import Util
import numpy as np
import pandas as pd

# Legacy xlearn API: field-aware FM reading its training data from disk.
ffm_model = xl.create_ffm()
ffm_model.setTrain("./small_train.txt")
fm_model = xl.create_fm()
fm_model.fit()

# Legacy-API hyper-parameters:
#   task   : binary classification
#   lr     : 0.2
#   lambda : 0.002
param = {'task': 'binary', 'lr': 0.2, 'lambda': 0.002}

path = "/Users/jianjun.yue/PycharmGItHub/data/titanic/train_pre.csv"
data = pd.read_csv(path)
print("--------------RandomForestClassifier---------------")
predictors = ["Survived", "Pclass", "Sex", "Age", "SibSp",
              "Parch", "Fare_scaler", "Embarked", "NameLength"]
train = data[predictors]
dt = Util.CSV2Libsvm(train)
print(dt)

# Scikit-style FM construction (the first call only builds an object and
# discards it, matching the original script).
xl.FMModel(task='binary', init=0.1, epoch=10, lr=0.1,
           reg_lambda=1.0, opt='sgd')
fm = xl.FMModel()
fm.fit()
import xlearn as xl
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load iris and cast it as a binary task: class 2 versus the rest.
iris_data = load_iris()
X = iris_data['data']
y = (iris_data['target'] == 2)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=0)

# Default FM hyper-parameters; the validation fold drives early stopping.
fm = xl.FMModel()
fm.fit(X_train, y_train, eval_set=[X_val, y_val], is_lock_free=False)

# Compare predictions with the true validation labels.
y_pred = fm.predict(X_val)
print(y_pred)
print("------------------------")
print(y_val)
elif sam_apr_name == "pop":
    # Popularity sampling: no extra setup required here.
    pass
elif sam_apr_name == 'top_dis_pop':
    pass

# Only two estimator names are supported.
# NOTE(review): assert is stripped under -O; consider raising ValueError.
assert args.algo['name'] in ["FM", "xlearn_FM"]
if args.algo['name'] == "FM":
    algo = factorization_machine.FMRegression(rank=8, n_iter=100,
                                              l2_reg_w=0.1, l2_reg_V=0.1)
if args.algo['name'] == "xlearn_FM":
    # Regression FM via xlearn's sklearn-style wrapper.
    algo = xl.FMModel(task='reg', init=0.1, epoch=10, k=4, lr=0.2,
                      reg_lambda=0.01, opt='sgd', metric='mae')

# Drop a stray carriage-return entry left by Windows line endings.
if '\r' in args.features:
    args.features.remove('\r')
features = args.features

print('------loading data------')
cd = ConstructData(dataset=args.dataset, features=features,
                   test=args.test_flag,
                   sampling_approach=args.sampling_approach,
                   negative_ratio=args.negative_ratio,
                   n_clusters=args.n_clusters)
def model(train_df=None, test_df=None, not_used_cols=None):
    """Fit a two-level StackNet regressor and predict the test set.

    :param train_df: training frame containing 'date' and
        'totals_transactionRevenue' columns
    :param test_df: test frame to score
    :param not_used_cols: columns to exclude from the feature matrices
    :return: predictions for ``test_df``
    """
    logger.info('Start prepare model')
    # Keep rows in chronological order before stacking.
    train_df = train_df.sort_values('date')

    X = train_df.drop(not_used_cols, axis=1)
    y = train_df['totals_transactionRevenue']
    # Only drop the excluded columns that actually exist in the test frame.
    X_test = test_df.drop(
        [col for col in not_used_cols if col in test_df.columns], axis=1)

    logger.info('Start tuning model')
    # Removed dead code: an unused KFold object, an unused LightGBM params
    # dict, and large commented-out tuning blocks.

    models = [
        # First level: a diverse set of base regressors.
        [
            RandomForestRegressor(n_estimators=500, random_state=1, n_jobs=-1),
            ExtraTreesRegressor(n_estimators=500, random_state=1, n_jobs=-1),
            LGBMRegressor(n_jobs=-1, random_state=56, objective='regression',
                          max_depth=8, min_child_samples=40, reg_alpha=0.4,
                          reg_lambda=0.1, num_leaves=290, learning_rate=0.01,
                          subsample=0.8, colsample_bytree=0.9,
                          n_estimators=520),
            xl.FMModel(task='reg', metric='rmse', block_size=800, lr=0.05,
                       k=12, reg_lambda=0.05, init=0.1, fold=1, epoch=50,
                       stop_window=5, opt='ftrl', nthread=0, n_jobs=-1,
                       alpha=0.05, beta=1, lambda_1=0.1, lambda_2=0.1),
            Ridge(random_state=1)
        ],
        # Second level: a single meta-regressor over first-level outputs.
        [RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)]
    ]
    model = StackNetRegressor(models, metric="rmse", folds=3,
                              restacking=True, use_retraining=True,
                              random_state=12345, n_jobs=1, verbose=1)
    model.fit(X, y)
    prediction = model.predict(X_test)
    return prediction