예제 #1
0
def ffm_model():
    """Train an FFM model on the module-level ``train`` frame and write a submission.

    Reads the module-level ``train`` and ``test`` objects, converts them with
    ``process``, fits an ``ffm.FFM`` model, stores the predicted probabilities
    in ``test['predicted_score']`` and writes ``instance_id`` /
    ``predicted_score`` pairs to ``result/result0422_ffm.txt``.

    Returns nothing; the results are the mutated ``test`` frame and the
    written file.
    """
    X = process(train)
    Y = train['is_trade']

    ffm_data = ffm.FFMData(X, Y)
    # NOTE(review): the validation set is a fixed slice of the very rows the
    # model is trained on, so the validation metric is not a held-out score.
    ffm_data_test = ffm.FFMData(X[418028:468028], Y[418028:468028])

    model = ffm.FFM(eta=0.1, lam=0.0001, k=4)
    # NOTE(review): metric='logloss' is combined with maximum=True; log loss
    # is normally minimised -- confirm against the ffm library's fit() docs.
    model.fit(ffm_data,
              num_iter=200,
              val_data=ffm_data_test,
              metric='logloss',
              early_stopping=6,
              maximum=True)

    t = process(test)
    ffm_test = ffm.FFMData(t)
    pred = model.predict_proba(ffm_test)

    test['predicted_score'] = pred
    sub1 = test[['instance_id', 'predicted_score']]
    # Raw string: "\s" in a plain string literal is an invalid escape sequence.
    sub = pd.read_csv("input/test.txt", sep=r"\s+")
    # Left join keeps every instance of the original test file; instances
    # without a prediction get a score of 0.
    sub = pd.merge(sub, sub1, on=['instance_id'], how='left')
    sub = sub.fillna(0)
    sub[['instance_id', 'predicted_score']].to_csv('result/result0422_ffm.txt',
                                                   sep=" ",
                                                   index=False)
예제 #2
0
def ffm(df_train, category_features):
    """Train an FFM model for a few iterations and print train/test log loss.

    :param df_train: DataFrame with a ``day`` column, an ``is_trade`` label
        column and the feature columns.
    :param category_features: column names that are cast to ``str`` before
        the FFM conversion.
    :return: None; the log loss of every iteration is printed.
    """
    # Local import: inside this function ``ffm`` names the library module,
    # not this function.
    import ffm

    train = df_train[(df_train['day'] >= 18) & (df_train['day'] <= 23)]
    excluded = (
        'is_trade', 'item_category_list', 'item_property_list',
        'predict_category_property', 'instance_id', 'realtime',
        'context_timestamp',
    )
    col = [c for c in train if c not in excluded]

    # Work on a copy so the caller's DataFrame keeps its original dtypes.
    raw_ffm_data = df_train.copy()
    for cols in category_features:
        raw_ffm_data[cols] = raw_ffm_data[cols].astype(str)
    data_ffm = FFMFormatPandas(raw_ffm_data[col])
    data_ffm_y = raw_ffm_data['is_trade'].tolist()

    # NOTE(review): the split is positional -- it assumes the rows with
    # 18 <= day <= 23 are the first ``train_num`` rows of df_train; confirm.
    train_num = train[col].shape[0]
    X_train_ffm = data_ffm[:train_num]
    X_test_ffm = data_ffm[train_num:]
    y_train_ffm = data_ffm_y[:train_num]
    y_test_ffm = data_ffm_y[train_num:]

    ffm_train = ffm.FFMData(X_train_ffm, y_train_ffm)
    ffm_test = ffm.FFMData(X_test_ffm, y_test_ffm)
    n_iter = 5
    ffmmodel = ffm.FFM(eta=0.02, lam=0.0001, k=6)
    ffmmodel.init_model(ffm_train)
    for i in range(n_iter):
        print('iteration %d : ' % i)
        ffmmodel.iteration(ffm_train)
        y_pred = ffmmodel.predict(ffm_test)
        t_pred = ffmmodel.predict(ffm_train)
        logloss = log_loss(y_test_ffm, y_pred)
        t_logloss = log_loss(y_train_ffm, t_pred)
        print('train log_loss %.4f' % (t_logloss), end='\t')
        print('test log_loss %.4f' % (logloss))
예제 #3
0
 def fit(self, X, y):
     '''
     Train a new FFM model for ``self.n_iter`` iterations.

     :param X: samples in (field, index, value) format
     :param y: labels, 0 or 1
     :return: the trained ``ffm.FFM`` model (also stored in ``FFM._model``)
     '''
     training_set = ffm.FFMData(X, y)
     trained = ffm.FFM(self.eta, self.l2, self.factor)
     trained.init_model(training_set)
     # tqdm only wraps the range to show progress; the index is not used.
     for _ in tqdm(range(self.n_iter)):
         trained.iteration(training_set)
     # The model is kept on the FFM name itself, not on ``self``.
     FFM._model = trained
     return trained
예제 #4
0
def build_model(train_X, train_y, test_X, test_y):
  """
    Build an FFM model from MODEL_PARAMETERS and train it for ITERATIONS
    rounds on the training data.

    Returns a (model, train_score, test_score) tuple; both scores are
    currently the placeholder value 1.
  """
  training_set = ffm.FFMData(train_X, train_y)
  # Built for the evaluation that is disabled below; not used otherwise.
  test_ffm_data = ffm.FFMData(test_X, test_y)

  model = ffm.FFM(**MODEL_PARAMETERS)
  model.init_model(training_set)

  for _ in range(ITERATIONS):
    model.iteration(training_set)

  # TODO temporary fix. replace the placeholder scores with the commented line
  # return model, roc_auc_score(train_y, model.predict(train_ffm_data)), roc_auc_score(test_y, model.predict(test_ffm_data))
  train_score = test_score = 1
  return model, train_score, test_score
예제 #5
0
def ffm_test(ffmdata, data):
    """Train an FFM model on a 70/30 split and print the log loss per iteration.

    :param ffmdata: samples passed to ``train_test_split`` and then to
        ``ffm.FFMData``.
    :param data: object whose ``data['is_trade'].values`` supplies the labels.
    :return: None; train and test log loss are printed after each iteration.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        ffmdata, data['is_trade'].values, test_size=0.3, random_state=888)
    n_iter = 20
    ffm_train = ffm.FFMData(X_train, y_train)
    # Named ffm_valid so the local does not shadow this function's own name.
    ffm_valid = ffm.FFMData(X_test, y_test)
    model = ffm.FFM(eta=0.05, lam=0.01, k=10)
    model.init_model(ffm_train)
    for i in range(n_iter):
        model.iteration(ffm_train)
        train_pred = model.predict(ffm_train)
        test_pred = model.predict(ffm_valid)
        train_log = log_loss(y_train, train_pred)
        test_log = log_loss(y_test, test_pred)
        # The reported metric is log loss, so the labels say so (not AUC).
        print('iteration_%d: ' % i, 'train_log_loss %.4f' % train_log,
              'test_log_loss %.4f' % test_log)
예제 #6
0
def ffm_0():
    """Train an FFM model on a tiny inline dataset and print the log loss.

    The three samples are in (field, index, value) format. The model is
    trained for ``n_iter`` iterations; after every iteration the log loss
    on the training data is printed.
    """
    X = [
        [(1, 2, 1), (2, 3, 1), (3, 5, 1)],
        [(1, 0, 1), (2, 3, 1), (3, 7, 1)],
        [(1, 1, 1), (2, 3, 1), (3, 7, 1), (3, 9, 1)],
    ]
    y = [1, 1, 0]
    ffm_data = ffm.FFMData(X, y)
    # train the model for 10 iterations
    n_iter = 10
    model = ffm.FFM(eta=0.1, lam=0.0001, k=4)
    model.init_model(ffm_data)
    for i in range(n_iter):
        print('iteration %d, ' % i, end='')
        # Training and evaluation belong inside the loop; otherwise only a
        # single iteration is ever run.
        model.iteration(ffm_data)
        y_pred = model.predict(ffm_data)
        loss = log_loss(y, y_pred)
        # The metric is log loss, so it is labelled as such (not AUC).
        print('train log_loss %.4f' % loss)
예제 #7
0
def main():
    """Load the data, build a formatted text column, then train and predict.

    Every row of the training set is turned into a space-joined, sorted list
    of ``id:column:value`` tokens stored in ``train_set['formatted_data']``.
    A libffm file is then read with ``read_libffm_file`` and used to fit an
    ``ffm.FFM`` model, whose predictions are printed.
    """
    # load data
    train_set, validation_set, test_set, features = load_data()

    print(train_set)

    # train_set.save('no-header.csv', format='csv')
    # train_set = gl.SFrame.read_csv('no-header.csv', delimiter=',', verbose=False, column_type_hints=str)

    train_set = train_set.add_row_number()

    def transform_row(row):
        """Return ``id:column:value`` tokens for every column except ``id``."""
        # ``v`` is joined as-is, so it has to be a string already.
        return [':'.join([str(row['id']), str(k), v]) for k, v in row.items() if k != 'id']

    train_set['formatted_data'] = train_set.apply(lambda row: ' '.join(sorted(transform_row(row))))

    print(train_set['formatted_data'])

    # train_set['answer'].save('output')
    # train_set['answer'].save('training_set.csv', format='csv')
    # train_set['answer'].export_csv('output.csv', delimiter=' ', header=False, line_terminator='\n')

    # trainfile = 'no-header.csv'

    print("converting")

    # NOTE(review): ``trainfile`` is not assigned in this function (the
    # assignment above is commented out); it must exist at module level or
    # this line raises NameError.
    train = read_libffm_file(trainfile)

    print(train)

    # Train a model
    m = ffm.FFM()
    # NOTE(review): ``features`` is passed both as the second positional
    # argument and as the ``features`` keyword, and predict() receives the
    # same object -- confirm which dataset was meant (validation_set is
    # loaded above but never used).
    m.fit(train, features, target='click', features=features, nr_iters=10)
    yhat = m.predict(features)
    # print() call instead of the Python 2 print statement, matching the
    # other print calls in this function.
    print(yhat)
예제 #8
0
 def fit(self, X_trn, y_trn, X_val, y_val, model_path=None):
     """Train an FFM model while the validation AUC keeps matching its maximum.

     One iteration is run per loop pass. The loop ends as soon as the
     validation AUC of a pass differs from the best value seen so far, or
     once ``self.nb_epochs_max`` passes have matched it. When ``model_path``
     is given, the model is saved after every pass that matches the best
     validation AUC.

     :param X_trn: training samples
     :param y_trn: training labels
     :param X_val: validation samples
     :param y_val: validation labels
     :param model_path: optional path handed to ``save_model``
     :return: (max training AUC, max validation AUC, number of counted epochs)
     """
     logger = logging.getLogger(str(self))
     model = ffmlib.FFM(eta=self.learning_rate, lam=self.reg, k=self.factor_size)
     trn_data = ffmlib.FFMData(X_trn, y_trn)
     val_data = ffmlib.FFMData(X_val, y_val)
     model.init_model(trn_data)

     # All trackers start as floats, so the epoch counter is returned as a float.
     auc_trn_max = 0.
     auc_val_max = 0.
     auc_val = 0.
     nb_epochs = 0.
     while auc_val == auc_val_max and nb_epochs < self.nb_epochs_max:
         started = time()
         model.iteration(trn_data)
         finished = time()
         auc_trn = roc_auc_score(y_trn, model.predict(trn_data))
         auc_val = roc_auc_score(y_val, model.predict(val_data))
         logger.info('AUC trn: %.3lf AUC val: %.3lf (%.3lf seconds)' % (auc_trn, auc_val, finished - started))
         auc_trn_max = max(auc_trn, auc_trn_max)
         auc_val_max = max(auc_val, auc_val_max)
         # True when this pass reached the best validation AUC so far.
         is_best = auc_val == auc_val_max
         nb_epochs += int(is_best)
         if is_best and model_path:
             logger.info('Saving %s' % model_path)
             model.save_model(model_path)
     del model, trn_data, val_data
     return auc_trn_max, auc_val_max, nb_epochs
예제 #9
0
def main():
    """Read libffm train/validation files, train an FFM model and print predictions.

    The files are loaded with ``read_libffm_file``. The ``y`` column of the
    training set is cast to int, the ``features.0`` column is dropped, and
    the validation set is restricted to the training columns. Both sets are
    saved under ``examples/`` before the model is fitted on every column
    except ``y``.
    """
    trainfile = 'lib/bigdata.tr.txt'
    validfile = 'lib/bigdata.te.txt'
    train = read_libffm_file(trainfile)
    valid = read_libffm_file(validfile)

    print(train)

    train['y'] = train['y'].astype(int)
    del train['features.0']
    # Keep exactly the columns of the training set, in the same order.
    valid = valid[train.column_names()]
    train.save('examples/small.tr.sframe')
    valid.save('examples/small.te.sframe')

    features = [c for c in train.column_names() if c != 'y']

    # Train a model
    m = ffm.FFM()
    m.fit(train, valid, target='y', features=features, nr_iters=15)
    yhat = m.predict(valid)
    # print() call instead of the Python 2 print statement, matching the
    # print call above.
    print(yhat)
예제 #10
0
파일: preProcess.py 프로젝트: ys0232/mycode
# Cast the categorical columns to str before converting the frame to FFM format.
for cols in category_features:
    raw_ffm_data[cols] = raw_ffm_data[cols].astype(str)

data_ffm = FFMFormatPandas(raw_ffm_data[col])
data_ffm_y = raw_ffm_data['is_trade'].tolist()

# Positional split: the first ``train_num`` rows are the training part,
# everything after them is the test part.
train_num = X.shape[0]
X_train_ffm, X_test_ffm = data_ffm[:train_num], data_ffm[train_num:]
y_train_ffm, y_test_ffm = data_ffm_y[:train_num], data_ffm_y[train_num:]

import ffm

ffm_train = ffm.FFMData(X_train_ffm, y_train_ffm)
ffm_test = ffm.FFMData(X_test_ffm, y_test_ffm)

n_iter = 5

ffmmodel = ffm.FFM(eta=0.02, lam=0.0001, k=6)
ffmmodel.init_model(ffm_train)

# One training pass per iteration, followed by log loss on both splits.
for i in range(n_iter):
    print('iteration %d : ' % i)
    ffmmodel.iteration(ffm_train)

    y_pred = ffmmodel.predict(ffm_test)
    t_pred = ffmmodel.predict(ffm_train)
    logloss = log_loss(y_test_ffm, y_pred)
    t_logloss = log_loss(y_train_ffm, t_pred)
    print('train log_loss %.4f' % (t_logloss), end='\t')
    print('test log_loss %.4f' % (logloss))
예제 #11
0
# Toy dataset: every sample is a list of (field, index, value) triples.
X = [
    [(1, 2, 1), (2, 3, 1), (3, 5, 1)],
    [(1, 0, 1), (2, 3, 1), (3, 7, 1)],
    [(1, 1, 1), (2, 3, 1), (3, 7, 1), (3, 9, 1)],
    [(1, 0, 1), (2, 3, 1), (3, 5, 1)],
]
y = [1, 1, 0, 1]

# The same samples serve as both training and validation data.
ffm_data = ffm.FFMData(X, y)
ffm_data_test = ffm.FFMData(X, y)

# Fit with AUC as the validation metric and early stopping enabled.
model = ffm.FFM(eta=0.1, lam=0.0001, k=4)
model.fit(
    ffm_data,
    num_iter=10,
    val_data=ffm_data_test,
    metric='auc',
    early_stopping=6,
    maximum=True,
)
print(model.predict_proba(ffm_data_test))

# Round trip: save the model, load it back and predict with the loaded copy.
MODEL_PATH = 'result/ololo.bin'
model.save_model(MODEL_PATH)
model = ffm.read_model(MODEL_PATH)
print(model.predict(ffm_data_test))
예제 #12
0
import ffm
import graphlab as gl
from convert import read_libffm_file

# Both SFrames are produced by examples/criteo_process.py.
train = gl.SFrame('criteo_train_transformed')
valid = gl.SFrame('criteo_valid_transformed')

# Only dictionary-typed columns are currently supported as features.
features = [
    name
    for name in train.column_names()
    if train[name].dtype() == dict
]

# Fit on the training frame, using the validation frame during training.
m = ffm.FFM()
m.fit(train, valid, target='X1', features=features, nr_iters=15)

# Score the validation frame.
yhat = m.predict(valid)