Exemplo n.º 1
0
def test_long_dense_vector():
    """Smoke-test DeepFM with a multi-column dense feature."""
    feature_columns = [
        SparseFeat('user_id', 4),
        SparseFeat('item_id', 5),
        DenseFeat("pic_vec", 5),
    ]
    fixlen_feature_names = get_feature_names(feature_columns)

    # Three toy samples: two id columns plus a 5-dim picture vector.
    user_id = np.array([[1], [0], [1]])
    item_id = np.array([[3], [2], [1]])
    pic_vec = np.array([[0.1, 0.5, 0.4, 0.3, 0.2]] * 3)
    label = np.array([1, 0, 1])

    input_dict = {'user_id': user_id, 'item_id': item_id, 'pic_vec': pic_vec}
    model_input = [input_dict[name] for name in fixlen_feature_names]

    # Linear part gets every column; DNN part drops the trailing DenseFeat.
    model = DeepFM(feature_columns, feature_columns[:-1])
    model.compile('adagrad', 'binary_crossentropy')
    model.fit(model_input, label)
Exemplo n.º 2
0
def test_long_dense_vector():
    """Smoke-test DeepFM with a multi-column dense feature."""
    # Build the feature columns.
    feature_columns = [
        SparseFeat('user_id', 4),
        SparseFeat('item_id', 5),
        DenseFeat("pic_vec", 5),
    ]
    fixlen_feature_names = get_feature_names(feature_columns)

    # Build the toy samples.
    user_id = np.array([[1], [0], [1]])
    item_id = np.array([[3], [2], [1]])
    pic_vec = np.array([[0.1, 0.5, 0.4, 0.3, 0.2]] * 3)
    label = np.array([1, 0, 1])
    input_dict = {'user_id': user_id, 'item_id': item_id, 'pic_vec': pic_vec}
    model_input = [input_dict[name] for name in fixlen_feature_names]

    # Create the model (linear part uses all columns, DNN part skips the
    # trailing dense feature).
    model = DeepFM(feature_columns, feature_columns[:-1])

    # model.summary()
    # tf.keras.utils.plot_model(model, "test_compu")

    # Train the model.
    model.compile('adagrad', 'binary_crossentropy')
    model.fit(model_input, label)
Exemplo n.º 3
0
def run_deepfm_model():
    """Train a binary DeepFM on the prepared data and report LogLoss/AUC.

    Returns (predictions, true labels, rounded AUC, model tag).
    """
    (train, test, train_model_input, test_model_input,
     dnn_feature_columns, linear_feature_columns,
     feature_names, target) = read_data_as_model()

    # Define model, train, predict and evaluate.
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    model.compile("adam", "binary_crossentropy",
                  metrics=['binary_crossentropy'])

    model.fit(train_model_input, train[target].values,
              batch_size=256, epochs=10, verbose=2, validation_split=0.2)

    pred_ans = model.predict(test_model_input, batch_size=256)
    true_y = test[target].values
    test_auc = round(roc_auc_score(true_y, pred_ans), 4)
    print("test LogLoss", round(log_loss(true_y, pred_ans), 4))
    print("test AUC", test_auc)
    return pred_ans, true_y, test_auc, 'deepfm'
Exemplo n.º 4
0
def test_DeepFM(use_fm, hidden_size, sparse_feature_num):
    """Train/save/load round-trip test for DeepFM on random data."""
    model_name = "DeepFM"
    sample_size = 64

    # The same count drives both the sparse and the dense field lists.
    feature_dim_dict = {"sparse": {}, 'dense': []}
    for kind, count in zip(["sparse", "dense"],
                           [sparse_feature_num, sparse_feature_num]):
        for i in range(count):
            field = kind + '_' + str(i)
            if kind == "sparse":
                feature_dim_dict[kind][field] = np.random.randint(1, 10)
            else:
                feature_dim_dict[kind].append(field)

    # One integer array per sparse field, one float array per dense field.
    sparse_input = [np.random.randint(0, dim, sample_size)
                    for dim in feature_dim_dict['sparse'].values()]
    dense_input = [np.random.random(sample_size)
                   for _ in feature_dim_dict['dense']]
    y = np.random.randint(0, 2, sample_size)
    x = sparse_input + dense_input

    model = DeepFM(feature_dim_dict, use_fm=use_fm,
                   hidden_size=hidden_size, keep_prob=0.5)
    model.compile('adam', 'binary_crossentropy',
                  metrics=['binary_crossentropy'])
    model.fit(x, y, batch_size=100, epochs=1, validation_split=0.5)
    print(model_name + " test train valid pass!")

    # Weight save/load round trip.
    model.save_weights(model_name + '_weights.h5')
    model.load_weights(model_name + '_weights.h5')
    print(model_name + " test save load weight pass!")

    # Full-model save/load round trip.
    save_model(model, model_name + '.h5')
    model = load_model(model_name + '.h5', custom_objects)
    print(model_name + " test save load model pass!")

    print(model_name + " test pass!")
Exemplo n.º 5
0
def train_deepFM():
    """Preprocess features, build deepctr input dicts and train a binary DeepFM.

    Reads raw data and feature-name lists from the module-level
    ``trainmodel``/``featureengineer`` objects and prints the test AUC.
    """
    k = featureengineer.k
    # Fill missing values and encode.
    data,appsnum, tags_nums = trainmodel.data,trainmodel.appsnum,trainmodel.tags_nums
    data[trainmodel.sparse_features] = data[trainmodel.sparse_features].fillna('-1', )
    for feat in trainmodel.dense_features:
        data[feat].fillna(data[feat].dropna().mean(), inplace=True)

    # Label-encode sparse features (stringify first so mixed types encode safely).
    for feat in trainmodel.sparse_features:
        data[feat] = data[feat].apply(lambda x:str(x))
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[trainmodel.dense_features] = mms.fit_transform(data[trainmodel.dense_features])


    # Convert to deepctr feature-column format.
    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max()+1, embedding_dim=8)
                              for i, feat in enumerate(trainmodel.sparse_features)] + \
                             [DenseFeat(feat, 1, ) for feat in trainmodel.dense_features]

    # LightGBM-output features get 1-dim embeddings and feed only the linear part.
    lgbOut_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max()+1, embedding_dim=1)
                              for i, feat in enumerate(trainmodel.lgbOut_Features)]

    key2index_len = {'applist': appsnum+1, 'new_tag': tags_nums}
    varlen_features = [VarLenSparseFeat('%s' % i, vocabulary_size=key2index_len[i], maxlen=k, embedding_dim=8, combiner='mean',weight_name=None) for i
                       in trainmodel.var_features]

    dnn_feature_columns = fixlen_feature_columns + varlen_features
    linear_feature_columns = fixlen_feature_columns + varlen_features + lgbOut_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    sparse_dense_features = trainmodel.sparse_features + trainmodel.dense_features + trainmodel.lgbOut_Features

    train, test = train_test_split(data, test_size=0.2)


    train_model_input = {name: train[name] for name in sparse_dense_features}
    test_model_input = {name: test[name] for name in sparse_dense_features}
    for x in trainmodel.var_features:
        if x == 'applist':
            train_model_input[x] = np.array(train[x].tolist())
            test_model_input[x] = np.array(test[x].tolist())
        if x == 'new_tag':
            # NOTE(review): tag ids appear to be offset by the app vocabulary
            # size upstream, hence the -appsnum shift — confirm against the
            # feature-engineering code.
            train_model_input[x] = np.array(train[x].tolist())-appsnum
            test_model_input[x] = np.array(test[x].tolist())-appsnum
    # Build, compile and train the model.
    model = DeepFM(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns,
                   dnn_hidden_units=(50, 30, 30), l2_reg_linear=0.001, l2_reg_embedding=0.001,
                   l2_reg_dnn=0, init_std=0.0001, seed=1024, dnn_dropout=0.1, dnn_activation='relu', dnn_use_bn=True,
                   task='binary')
    model.compile("adam", "binary_crossentropy",metrics=['AUC'], )

    history = model.fit(train_model_input, train['target'].values,
                        batch_size=256, epochs=1, verbose=2, validation_split=0.2, )

    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test AUC", round(roc_auc_score(test['target'].values, pred_ans), 4))
def main(dataPath, dataPath_val, batch_size):
    """Train a binary DeepFM from CSV shards using data generators.

    Parameters
    ----------
    dataPath : str
        Directory containing the training ``*.csv`` files.
    dataPath_val : str
        Directory containing the validation ``*.csv`` files.
    batch_size : int
        Batch size used by both generators.
    """
    # Every 5th file only — subsamples the shard list.
    files = glob.glob(dataPath + "/*.csv")[::5]
    files_val = glob.glob(dataPath_val + "/*.csv")[::5]

    # Count examples so steps-per-epoch can be derived.
    nexs = get_total_examples(files)
    print("Number of training examples: ", nexs)

    nexs_val = get_total_examples(files_val)
    print("Number of validation examples: ", nexs_val)

    # Create data generators.
    train_gen = DataGenerator(files, nexs, batch_size=batch_size)
    val_gen = DataGenerator(files_val, nexs_val, batch_size=batch_size)

    linear_feature_columns = train_gen.linear_feature_columns
    dnn_feature_columns = train_gen.dnn_feature_columns

    # Define model, train, and checkpoint on validation loss.
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    optimizer = keras.optimizers.Adam(lr=0.001,
                                      beta_1=0.9,
                                      beta_2=0.999,
                                      decay=0.0)
    model.compile(
        optimizer,
        "binary_crossentropy",
        metrics=['binary_crossentropy', auroc],
    )

    weights_file = "model-5-lr0p001.h5"
    model_checkpoint = ModelCheckpoint(weights_file,
                                       monitor="val_binary_crossentropy",
                                       save_best_only=True,
                                       save_weights_only=True,
                                       verbose=1)

    # Fix: validation_steps previously used the *training* example count
    # (nexs); it must be derived from the validation set size.
    history = model.fit_generator(train_gen,
                                  epochs=10,
                                  verbose=1,
                                  steps_per_epoch=nexs / batch_size,
                                  validation_data=val_gen,
                                  validation_steps=nexs_val / batch_size,
                                  callbacks=[model_checkpoint])
Exemplo n.º 7
0
def model_generate(train_X, train_y, val_X, val_y, linear_feature_columns,
                   dnn_feature_columns):
    """Build and fit a DeepFM; return the model and its fit history."""
    model = DeepFM(linear_feature_columns, dnn_feature_columns,
                   embedding_size=32)
    model.compile("adam", "binary_crossentropy",
                  metrics=[roc_auc_score_pyfunc, log_loss_pyfunc])

    # Early stopping watches the validation split passed explicitly.
    fit_history = model.fit(train_X, train_y,
                            validation_data=(val_X, val_y),
                            batch_size=4096, epochs=5,
                            callbacks=[EarlyStopping()])
    return model, fit_history
Exemplo n.º 8
0
def train_model(train, test, linear_feature, dnn_feature):
    """Fit a binary DeepFM on *train* and print LogLoss/AUC on *test*.

    *train* unpacks to (inputs, labels); *test* is indexed the same way.
    """
    model = DeepFM(linear_feature, dnn_feature, task='binary')
    model.compile("adam", "binary_crossentropy", metrics=['AUC'])

    history = model.fit(*train, batch_size=512, epochs=5,
                        verbose=2, validation_split=0.1)

    pred_ans = model.predict(test[0], batch_size=512)
    print("test LogLoss", round(log_loss(test[1], pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[1], pred_ans), 4))
Exemplo n.º 9
0
def test_DeepFM(use_fm, hidden_size):
    """Train, save and reload a DeepFM built from a fixed feature dict."""
    name = "DeepFM"
    sample_size = 64
    feature_dim_dict = {
        'sparse': {'sparse_1': 2, 'sparse_2': 5, 'sparse_3': 10},
        'dense': ['dense_1', 'dense_2', 'dense_3'],
    }

    # One random integer array per sparse field, one float array per
    # dense field.
    sparse_input = [np.random.randint(0, dim, sample_size)
                    for dim in feature_dim_dict['sparse'].values()]
    dense_input = [np.random.random(sample_size)
                   for _ in feature_dim_dict['dense']]
    y = np.random.randint(0, 2, sample_size)
    x = sparse_input + dense_input

    model = DeepFM(feature_dim_dict, use_fm=use_fm,
                   hidden_size=hidden_size, keep_prob=0.5)
    model.compile('adam', 'binary_crossentropy',
                  metrics=['binary_crossentropy'])
    model.fit(x, y, batch_size=100, epochs=1, validation_split=0.5)
    print(name + " test train valid pass!")

    # Weight save/load round trip.
    model.save_weights(name + '_weights.h5')
    model.load_weights(name + '_weights.h5')
    print(name + " test save load weight pass!")

    # Full-model save/load round trip.
    save_model(model, name + '.h5')
    model = load_model(name + '.h5', custom_objects)
    print(name + " test save load model pass!")

    print(name + " test pass!")
Exemplo n.º 10
0
def deepfm_model(linear_feature_columns, dnn_feature_columns,
                 train_model_input, train, test_model_input, test):
    """Train a DeepFM from ``config.deepfm_att`` settings and return a
    one-row DataFrame with RMSE/MAE/MSE/AUC on the test set.

    NOTE(review): relies on a module-level ``target`` (label column list)
    that is not defined in this function — confirm it exists at call time.
    """
    cols = ['model', 'RMSE', 'MAE', 'MSE', 'AUC', 'score']
    df_result = pd.DataFrame(columns=cols, index=range(1))
    # All architecture hyper-parameters come from the project config module.
    model = DeepFM(linear_feature_columns,
                   dnn_feature_columns,
                   dnn_hidden_units=config.deepfm_att["dnn_hidden_units"],
                   init_std=config.deepfm_att["init_std"],
                   seed=config.deepfm_att["seed"],
                   dnn_dropout=config.deepfm_att["dnn_dropout"],
                   dnn_activation=config.deepfm_att["dnn_activation"],
                   task=config.deepfm_att["task"],
                   fm_group=config.deepfm_att["fm_group"],
                   dnn_use_bn=config.deepfm_att["dnn_use_bn"])

    model.compile("adam", "mse", metrics=['mse'])

    history = model.fit(train_model_input,
                        train[target].values,
                        batch_size=256,
                        epochs=config.model_epoch['epoch'],
                        verbose=2,
                        validation_split=0.2)

    pred_ans = model.predict(test_model_input, batch_size=256)
    save_model(model, 'saved_deepfm.h5')  # persist the trained model
    auc = roc_auc_score(test[target].values, pred_ans)

    # Fill the single result row with rounded metrics.
    df_result.loc[0].model = "DeepFM"
    df_result.loc[0].RMSE = np.round(
        math.sqrt(mean_squared_error(test[target].values, pred_ans)), 3)
    df_result.loc[0].MAE = np.round(
        mean_absolute_error(test[target].values, pred_ans), 3)
    df_result.loc[0].MSE = np.round(
        mean_squared_error(test[target].values, pred_ans), 3)
    df_result.loc[0].AUC = np.round(auc, 3)
    #df_result.loc[0].score=(1/df_result.iloc[0]['RMSE'])*(1/df_result.iloc[0]['MAE'])*(2*df_result.iloc[0]['AUC'])
    return df_result
Exemplo n.º 11
0
# Binary label column.
target = ['transportation_issues']

# One SparseFeat per categorical column (embedding size chosen per field via
# size_dict/field_info), plus a 1-dim DenseFeat per numeric column.
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=train_data[feat].nunique(), embedding_dim=size_dict[field_info[feat]], dtype='int32', group_name=field_info[feat]) for feat in sparse_features] + [DenseFeat(feat, 1,) for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

train, test = train_test_split(train_data, test_size=0.2, random_state=2020)
train_model_input = {name:train[name] for name in feature_names}
test_model_input = {name:test[name] for name in feature_names}

# NOTE(review): dnn_hidden_units=(2, 256) puts the narrow layer first, which
# is unusual — confirm this ordering is intended.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary', dnn_hidden_units=(2, 256), dnn_dropout=0.0)
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss="binary_crossentropy", metrics=['binary_crossentropy'], optimizer=opt)

# Class 1 is weighted 3x (presumably to counter class imbalance).
history = model.fit(train_model_input, train[target].values, batch_size=64, epochs=10, verbose=1, validation_split=0.2, class_weight={0:1, 1:3})
pred_ans = model.predict(test_model_input, batch_size=64)
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

# Threshold probabilities at 0.5 to obtain hard labels.
ans = pd.Series(pred_ans.reshape((-1,)))
ans[ans>=0.5] = 1
ans[ans<0.5] = 0

pd.Series(test[target].transportation_issues).value_counts()

ans.value_counts()

print(classification_report(test[target], ans))
from deepctr.models import DeepFM

if __name__ == "__main__":

    data = pd.read_csv("./movielens_sample.txt")
    sparse_features = ["movie_id", "user_id",
                       "gender", "age", "occupation", "zip"]
    target = ['rating']

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    # 2.count #unique features for each sparse field
    sparse_feature_dim = {feat: data[feat].nunique()
                          for feat in sparse_features}
    # 3.generate input data for model
    train, test = train_test_split(data, test_size=0.2)
    train_model_input = [train[feat].values for feat in sparse_feature_dim]
    test_model_input = [test[feat].values for feat in sparse_feature_dim]
    # 4.Define Model,train,predict and evaluate
    model = DeepFM({"sparse": sparse_feature_dim, "dense": []},
                   final_activation='linear')
    model.compile("adam", "mse", metrics=['mse'],)

    history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=1, verbose=2, validation_split=0.2,)
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test MSE", round(mean_squared_error(
        test[target].values, pred_ans), 4))
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field,and record dense feature field name

    sparse_feature_dict = {feat: data[feat].nunique()
                           for feat in sparse_features}
    dense_feature_list = dense_features

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.2)
    train_model_input = [train[feat].values for feat in sparse_feature_dict] + \
        [train[feat].values for feat in dense_feature_list]
    test_model_input = [test[feat].values for feat in sparse_feature_dict] + \
        [test[feat].values for feat in dense_feature_list]

    # 4.Define Model,train,predict and evaluate
    model = DeepFM({"sparse": sparse_feature_dict,
                    "dense": dense_feature_list}, final_activation='sigmoid')
    model.compile("adam", "binary_crossentropy",
                  metrics=['binary_crossentropy'], )

    history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
Exemplo n.º 14
0
# Predict the movie Rating from every other column.
feats = [i for i in data.columns if i != 'Rating']
X = data[feats]
y = data['Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

sparse_features = [
    'UserID', 'MovieID', 'Gender', 'Occupation', 'day', 'weekday'
]
dense_features = ['hour', 'Age']

fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(),embedding_dim=4) for feat in sparse_features] + \
                         [DenseFeat(feat, 1) for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

# NOTE(review): task='multiclass' combined with an 'mse' loss looks
# inconsistent (deepctr's stock DeepFM accepts 'binary'/'regression') —
# confirm this runs against the installed deepctr version.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='multiclass')
model.compile('adam', 'mse', metrics=['accuracy'])

feature_names = get_feature_names(fixlen_feature_columns)

train_feed_dict = {name: X_train[name] for name in feature_names}
test_feed_dict = {name: X_test[name] for name in feature_names}

model.fit(train_feed_dict,
          y_train,
          batch_size=256,
          epochs=10,
          validation_split=0.2)
pred_ans = model.predict(test_feed_dict, batch_size=256)
Exemplo n.º 15
0
                l2_reg_dnn=NNconfig_dic["l2_reg_dnn"],
                l2_reg_embedding=NNconfig_dic["l2_reg_embedding"],
                l2_reg_linear=NNconfig_dic["l2_reg_linear"],
                dnn_dropout=NNconfig_dic["dnn_dropout"],
                dnn_use_bn=NNconfig_dic["dnn_use_bn"],
                dnn_activation=NNconfig_dic["dnn_activation"])
NNconfig_dic["model_name"] = "DeepFM"


# %%
opt = tf.keras.optimizers.Adam(learning_rate=NNconfig_dic["lr"])
NNconfig_dic["optimizer"] = "Adam"


# %%
model.compile(optimizer=opt, loss=tf.losses.BinaryCrossentropy(),
                metrics=[tf.keras.metrics.AUC()])

log_dir="logs"+ os.path.sep + NNconfig_dic["model_name"] + "_res" + os.path.sep \
              + datetime.now().strftime("%Y%m%d-%H%M%S")
NN_config_path = "logs" + os.path.sep + NNconfig_dic["model_name"] + "_res" + os.path.sep \
              + datetime.now().strftime("%Y%m%d-%H%M%S") + "_" + "NNconfig.json"


# if worker_index == 0:
#     if not os.path.exists("logs" + os.path.sep + NNconfig_dic["model_name"] + "_res"):
#         os.makedirs("logs" + os.path.sep + NNconfig_dic["model_name"] + "_res")
#     with open(NN_config_path, "w+") as conf:
#         json.dump(NNconfig_dic, conf)

callbacks = [tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, profile_batch = 3),]
Exemplo n.º 16
0
            dnn_feature_columns,
            task="binary",
            embedding_size=emb_dim,
            dnn_hidden_units=[1024, 1024],
            cross_num=6,
        )

    if opt == "adagrad":
        optimizer = Adagrad
    elif opt == "adam":
        optimizer = Adam
    else:
        raise ValueError("Invalid optimizer")

    model.compile(optimizer(learning_rate),
                  "binary_crossentropy",
                  metrics=["binary_crossentropy"])

    callbacks = []

    patience_counter = 0
    best_valid_loss = float("Inf")

    history_epoch = {}
    history_val = {}
    for epoch in range(epochs):
        breakout = False
        history_epoch[epoch] = {}
        history_val[epoch] = []
        train_generator = get_data_generator(
            base_path,
Exemplo n.º 17
0
    # 2.count #unique features for each sparse field,and record dense feature field name

    sparse_feature_list = [SingleFeat(feat, data[feat].nunique())
                           for feat in sparse_features]
    # Dense fields use dimension 0 in the legacy SingleFeat API.
    dense_feature_list = [SingleFeat(feat, 0,)
                          for feat in dense_features]

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.2)
    train_model_input = [train[feat.name].values for feat in sparse_feature_list] + \
                        [train[feat.name].values for feat in dense_feature_list]
    test_model_input = [test[feat.name].values for feat in sparse_feature_list] + \
                       [test[feat.name].values for feat in dense_feature_list]

    # 4.Define Model,train,predict and evaluate
    model = DeepFM({"sparse": sparse_feature_list,
                    "dense": dense_feature_list}, task='binary', embedding_size=4, dnn_hidden_units=(64, 64))
    # parallel_model = multi_gpu_model(model, gpus=2)
    model.compile(tf.keras.optimizers.Adam(1e-4), "binary_crossentropy",
                  metrics=['binary_crossentropy'], )

    # Log training to TensorBoard.
    tensorboard = TensorBoard(log_dir="logs/DeepFM")

    history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2, callbacks=[tensorboard])
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
class DeepModel:
    """Convenience wrapper that prepares data, builds, trains, evaluates
    and runs a deepctr model (only the DeepFM architecture is supported).
    """

    def __init__(self, model_name, model_architecture="DeepFM"):
        self.model_name = model_name
        self.model_architecture = model_architecture

        self.model = None      # set by build()
        self.history = None    # set by train()
        self.data = None       # set by prepare_data()
        self.callbacks = []

    # requires tf2
    # def set_notebook_mode(self):
    #    progress_bar_cb = tfa.callbacks.TQDMProgressBar() #TQDMNotebookCallback(leave_inner=True, leave_outer=True)
    #    self.callbacks.append(progress_bar_cb)

    def prepare_data(self,
                     data_source,
                     sparse_features,
                     target,
                     test_size=0.1):
        """Ingest *data_source* and run the deepctr preprocessing pipeline."""
        self.data = Data(sparse_features,
                         target,
                         data_format="deepctr",
                         test_size=test_size)
        self.data.ingest(data_source)
        self.data.prepare()

    def build(self, task):
        """Build and compile the model for *task* ('regression' or 'binary').

        Fix: removed the dead if/elif chain that duplicated ``task_attr``
        into unused ``loss``/``metrics`` locals.
        """
        assert task in ['regression', 'binary']
        if self.model_architecture == "DeepFM":
            self.model = DeepFM(
                self.data.linear_feature_columns,
                self.data.dnn_feature_columns,
                task=task,
            )
        else:
            raise NotImplementedError(
                'At the current stage of the development, only a DeepFM is supported'
            )

        # Loss/metric pair for each supported task.
        task_attr = {
            'regression': {
                'loss': 'mse',
                'metrics': 'mse'
            },
            'binary': {
                'loss': 'binary_crossentropy',
                'metrics': 'accuracy'
            }
        }
        self.model.compile(optimizer="adam",
                           loss=task_attr[task]['loss'],
                           metrics=task_attr[task]['metrics'])

    def train(self, batch_size=256, epochs=10, validation_split=0.1):
        """Fit on the prepared training split and keep the Keras history."""
        #class_weights = class_weight.compute_class_weight(
        #    "balanced", np.unique(self.data.y_train[:, 0]), self.data.y_train[:, 0]
        #)
        self.history = self.model.fit(
            self.data.X_train,
            self.data.y_train,
            batch_size=batch_size,
            epochs=epochs,
            validation_split=validation_split,
            verbose=2,
            #class_weight=class_weights,
            callbacks=self.callbacks,
        )

    def evaluate(self):
        """Evaluate the model on the held-out test split."""
        self.model.evaluate(self.data.X_test,
                            self.data.y_test,
                            batch_size=4096)

    def prepare_input(self, df):
        """Label-encode *df* with the fitted encoders and return the
        {feature_name: values} dict the model expects."""
        df = df.copy()
        for feat in self.data.sparse_features:
            lbe = self.data.encoders[feat]
            df[feat] = lbe.transform(df[feat])

        X = {name: df[name].values for name in self.data.feature_names}
        return X

    def predict(self, X, batch_size=256):
        """Return model predictions for prepared input *X*."""
        return self.model.predict(X, batch_size=batch_size)
Exemplo n.º 19
0
print(behavior_feature_list)
# model = DIN( dnn_feature_columns, behavior_feature_list,task='multiclass', dnn_hidden_units=(256, 128),
# l2_reg_embedding=0.00001, dnn_dropout=0.5, l2_reg_dnn=0.0001,nClass=4)
# model = LR(linear_feature_columns, l2_reg_linear=0.0001, init_std=0.0001, seed=1024, task='multiclass',nClass=4)
# NOTE(review): task='multiclass' and nClass are not stock deepctr DeepFM
# arguments — presumably a project fork; confirm against the local DeepFM.
model = DeepFM(linear_feature_columns,
               dnn_feature_columns,
               task='multiclass',
               dnn_hidden_units=(256, 128),
               l2_reg_embedding=0.0001,
               l2_reg_linear=0.0001,
               dnn_dropout=0.5,
               l2_reg_dnn=0.0001,
               nClass=4)

model.compile(optimizer=keras.optimizers.Adam(0.0001),
              loss="categorical_crossentropy",
              metrics=['acc'])
# Early-stop on training loss; checkpoint the best model by training loss.
history = model.fit(train_model_input,
                    y_true,
                    callbacks=[
                        keras.callbacks.EarlyStopping(monitor='loss',
                                                      patience=5),
                        keras.callbacks.ModelCheckpoint("MCDIN.h5",
                                                        monitor="loss",
                                                        verbose=1,
                                                        save_best_only=True)
                    ],
                    batch_size=1024,
                    epochs=200,
                    verbose=2)
pred_ans = model.predict(test_model_input, batch_size=256)
Exemplo n.º 20
0
def _fit_and_score_deepfm(linear_feature_columns, dnn_feature_columns,
                          train_model_input, train_y,
                          test_model_input, true_y, **deepfm_kwargs):
    """Train one binary DeepFM variant and return (auc, logloss, rmse)."""
    model = DeepFM(linear_feature_columns,
                   dnn_feature_columns,
                   task='binary',
                   **deepfm_kwargs)
    model.compile(
        "adam",
        "binary_crossentropy",
        metrics=['binary_crossentropy'],
    )
    model.fit(
        train_model_input,
        train_y,
        batch_size=256,
        epochs=10,
        verbose=0,
        validation_split=TEST_PROPORTION,
    )
    pred_y = model.predict(test_model_input, batch_size=256)
    return (compute_auc(true_y, pred_y),
            compute_log_loss(true_y, pred_y),
            compute_rmse(true_y, pred_y))


def test_DFM_avazu(data, train, test):
    """Sweep DeepFM hyper-parameters (activation, dropout, hidden units) on
    the avazu dataset, collecting AUC/logloss/RMSE per setting.

    Fix: the triplicated build/compile/fit/evaluate body was extracted into
    ``_fit_and_score_deepfm`` and the dead ``auc = logloss = rmse = 0``
    initializations were removed.
    """
    print("\nTesting DFM on avazu dataset...\n")

    results_activation_function = {"auc": [], "logloss": [], "rmse": []}
    results_dropout = {"auc": [], "logloss": [], "rmse": []}
    results_number_of_neurons = {"auc": [], "logloss": [], "rmse": []}

    # Column 0 is the target; columns 1..22 are the sparse features.
    features_labels = train.columns
    sparse_features_labels = features_labels[1:23]
    target_label = features_labels[0]

    # Linear and DNN parts use identical column definitions.
    dnn_feature_columns = [
        SparseFeat(
            feat,
            vocabulary_size=data[feat].nunique(),
            embedding_dim=4,
        ) for feat in sparse_features_labels
    ]
    linear_feature_columns = [
        SparseFeat(
            feat,
            vocabulary_size=data[feat].nunique(),
            embedding_dim=4,
        ) for feat in sparse_features_labels
    ]

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    train_y = train[target_label].values
    true_y = test[target_label].values

    print("\t\t-- ACTIVATION FUNCTIONS --\t\t")
    for dnn_activation in dnn_activation_list:
        print("\nTesting {dnn_activation}...".format(
            dnn_activation=dnn_activation))

        auc, logloss, rmse = _fit_and_score_deepfm(
            linear_feature_columns, dnn_feature_columns,
            train_model_input, train_y, test_model_input, true_y,
            dnn_activation=dnn_activation)

        results_activation_function["auc"].append(auc)
        results_activation_function["logloss"].append(logloss)
        results_activation_function["rmse"].append(rmse)

    print("\t\t-- DROPOUT RATES --\t\t")
    for dnn_dropout in dnn_dropout_list:
        print("\nTesting {dnn_dropout}...".format(dnn_dropout=dnn_dropout))

        auc, logloss, rmse = _fit_and_score_deepfm(
            linear_feature_columns, dnn_feature_columns,
            train_model_input, train_y, test_model_input, true_y,
            dnn_dropout=dnn_dropout)

        results_dropout["auc"].append(auc)
        results_dropout["logloss"].append(logloss)
        results_dropout["rmse"].append(rmse)

    print("\t\t-- HIDDEN UNITS --\t\t")
    for dnn_hidden_units in dnn_hidden_units_list:
        print("\nTesting {dnn_hidden_units}...".format(
            dnn_hidden_units=dnn_hidden_units))

        auc, logloss, rmse = _fit_and_score_deepfm(
            linear_feature_columns, dnn_feature_columns,
            train_model_input, train_y, test_model_input, true_y,
            dnn_hidden_units=dnn_hidden_units)

        results_number_of_neurons["auc"].append(auc)
        results_number_of_neurons["logloss"].append(logloss)
        results_number_of_neurons["rmse"].append(rmse)

    if PLOT:
        create_plots("DFM", "avazu", results_activation_function,
                     "Activation Function", "activation_func",
                     dnn_activation_list)
        create_plots("DFM", "avazu", results_dropout, "Dropout Rate",
                     "dropout", dnn_dropout_list)
        create_plots("DFM", "avazu", results_number_of_neurons,
                     "Number of Neurons per layer", "nr_neurons",
                     dnn_hidden_units_list)
Exemplo n.º 21
0
def deepctr_cv(X_train,
               y_train,
               folds,
               logger,
               cv_path,
               X_test=None,
               optional_data=None,
               prep=True,
               split_conf=None):
    """Cross-validated DeepFM training (legacy deepctr ``SingleFeat`` API).

    For each CV fold, fits a binary-classification DeepFM on the training
    split, saves the fold's weights, out-of-fold (OOF) predictions and AUC,
    and — when ``X_test`` is given — saves test-set predictions.

    Parameters
    ----------
    X_train, y_train : pandas objects; rows are selected with ``.iloc``.
    folds : iterable of ``(train_indices, test_indices)`` index pairs.
    logger : logger used for per-fold progress and AUC reporting.
    cv_path : base directory (``pathlib.Path``-like) for fold artifacts.
    X_test : optional test features; predictions are saved when provided.
    optional_data : unused in this implementation.
    prep : forwarded to ``prep_for_embedding`` when ``split_conf`` is None.
    split_conf : pre-computed feature configuration; skips ``prep_for_embedding``.

    Returns
    -------
    (scores, pred, meta) : per-fold AUCs as an ndarray, the rank-averaged
        test prediction, and the OOF prediction vector aligned with y_train.

    NOTE(review): relies on a module-level ``conf`` object (``conf.num_cols``)
    rather than a parameter — confirm it is in scope at call time.
    """

    scores = []
    preds = []

    # OOF prediction buffer, aligned row-for-row with y_train.
    meta = np.zeros_like(y_train).astype("float64")
    if split_conf is None:
        # Derive the embedding/feature configuration from the raw frames.
        X_tr, X_te, main_conf, _ = prep_for_embedding(X_train,
                                                      X_test,
                                                      conf,
                                                      prep=prep)
        X_train, X_test = X_tr, X_te
    else:
        main_conf = split_conf

    # main_conf[0] holds (name, dimension, ...) triples for categorical columns.
    cat_cols = [c for c, _, _ in main_conf[0]]
    cat_fs = [SingleFeat(c, d) for c, d, _ in main_conf[0]]
    num_fs = [SingleFeat(c, 0) for c in conf.num_cols]

    # NOTE(review): split_df is applied to X_test *before* the
    # `X_test is not None` guard below — verify split_df tolerates None.
    X_test = split_df(X_test, cat_cols, conf.num_cols)

    for num_fold, (tr_ind, tes_ind) in enumerate(folds):
        # NOTE(review): only fold 0 is ever trained; remove this break to run
        # the full cross-validation.
        if num_fold > 0:
            break
        logger.info(f"fold_{num_fold}")

        fold_path = cv_path / f"fold{num_fold}"
        seed_path = fold_path
        Path(fold_path).mkdir(exist_ok=True, parents=True)

        # Record per-epoch training metrics for this fold.
        callbacks = [CSVLogger(str(fold_path / 'epochs.csv'))]

        X_cv_train, X_cv_test = X_train.iloc[tr_ind], X_train.iloc[tes_ind]
        y_cv_train, y_cv_test = y_train.iloc[tr_ind], y_train.iloc[tes_ind]
        X_cv_train = split_df(X_cv_train, cat_cols, conf.num_cols)
        X_cv_test = split_df(X_cv_test, cat_cols, conf.num_cols)

        # Legacy deepctr constructor: feature dict + final activation.
        model = DeepFM({
            'sparse': cat_fs,
            'dense': num_fs
        },
                       final_activation='sigmoid')
        model.compile("adam", "binary_crossentropy", metrics=['accuracy'])
        model.fit(X_cv_train,
                  y_cv_train,
                  callbacks=callbacks,
                  batch_size=2048,
                  epochs=10,
                  verbose=1,
                  validation_data=(X_cv_test, y_cv_test))
        model.save_weights(str(seed_path / 'weights.h5'), save_format='hdf5')
        gc.collect()

        # Test-set prediction for this fold; column 0 is the positive class.
        if X_test is not None:
            pred = model.predict(X_test, batch_size=2048)
            pred = pred[:, 0]
            np.save(seed_path / f"pred.npy", pred)

        # Out-of-fold predictions for this fold's validation rows.
        train_oof = model.predict(X_cv_test, batch_size=2048)
        train_oof = train_oof[:, 0]
        auc = roc_auc_score(y_cv_test.values, train_oof)
        logger.info(f"{num_fold}: auc {auc}")
        np.save(seed_path / f"train_oof.npy", train_oof)

        # auc = roc_auc_score(y_cv_test, train_oof)
        # logger.info(f"seed_average: auc {auc}")
        scores.append(auc)
        np.save(fold_path / f"tes_ind.npy", tes_ind)
        meta[tes_ind] += train_oof
        del X_cv_train, y_cv_train, X_cv_test, y_cv_test

        if X_test is not None:
            preds.append(pred)

    scores = np.array(scores)
    preds = np.array(preds)
    # Rank-average the per-fold test predictions into a single vector.
    pred = rank_average(preds)
    logger.info(f"{scores.mean()}, {scores.std()}")
    return scores, pred, meta
Exemplo n.º 22
0
class DeepFMHelper:
    """Train/serve helper around a DeepFM binary classifier.

    Responsibilities:
      * scale dense features into [0, 1] with a fitted MinMaxScaler;
      * transliterate Cyrillic column names into ASCII identifiers the
        underlying Keras layers accept;
      * build SparseFeat/DenseFeat column definitions and the DeepFM model;
      * persist and restore the scaler, column metadata and model weights.
    """

    def __init__(self):
        # Scaler for all non-categorical columns; fitted in `fit`.
        self.min_max_scaler = MinMaxScaler(feature_range=(0, 1))
        # Categorical feature columns (original, pre-transliteration names).
        self.cat_features = [
            "user_Вид тура_last",
            "user_Звездность_last",
            "tour_Страна",
            "tour_Страна тура",
            "user_Тип заявки_last",
        ]
        # Populated during `fit` (or restored by `load_model`).
        self.dense_features = None
        self.fixlen_feature_columns = None
        self.feature_names = None
        self.model = None

    def fit(self, X, y):
        """Fit the scaler, build feature columns, and train the DeepFM model.

        `X` is a DataFrame containing both the columns listed in
        `cat_features` and the dense columns (everything else).
        """
        X_ = X.copy()
        # Everything that is not categorical is treated as dense.
        self.dense_features = list(X_.columns.difference(self.cat_features))

        logger.debug("MinMaxScaler")
        self.min_max_scaler.fit(X_[self.dense_features])
        X_[self.dense_features] = self.min_max_scaler.transform(
            X_[self.dense_features])

        # Rename all columns to ASCII-safe identifiers.
        self._column_mapping(X_)
        X_.columns = [self.columns_mapping[col] for col in X_.columns]

        self.fixlen_feature_columns = [
            SparseFeat(
                self.columns_mapping[feat],
                # assumes categories are label-encoded non-negative ints,
                # so max value + 1 is the vocabulary size — TODO confirm
                vocabulary_size=X_[self.columns_mapping[feat]].max() + 1,
                embedding_dim=4,
            ) for i, feat in enumerate(self.cat_features)
        ] + [
            DenseFeat(
                self.columns_mapping[feat],
                1,
            ) for feat in self.dense_features
        ]
        self.feature_names = get_feature_names(self.fixlen_feature_columns)

        logger.debug("Compile DeepFM model")
        # The same columns feed both the linear and the DNN parts.
        self.model = DeepFM(self.fixlen_feature_columns,
                            self.fixlen_feature_columns,
                            task="binary")
        self.model.compile(
            "adam",
            "binary_crossentropy",
            metrics=["binary_crossentropy"],
        )

        logger.debug("Fit DeepFM")
        # DeepFM expects a dict of name -> 1-D value array.
        train_model_input = {
            name: X_[name].values
            for name in self.feature_names
        }
        self.model.fit(
            train_model_input,
            y,
            batch_size=256,
            epochs=3,
            verbose=2,
            validation_split=0.2,
        )

    def predict_proba(self, X):
        """Return positive-class probabilities for rows of `X` as a 1-D array."""
        X_ = X.copy()
        # Apply the same scaling and column renaming used at fit time.
        X_[self.dense_features] = self.min_max_scaler.transform(
            X_[self.dense_features])
        X_.columns = [self.columns_mapping[col] for col in X_.columns]
        model_input = {name: X_[name].values for name in self.feature_names}
        pred = self.inference(model_input)
        # `inference` returns a (n, 1) TF tensor; flatten it to shape (n,).
        pred = pred[:, 0].numpy()
        return pred

    def _column_mapping(self, X):
        """Build `columns_mapping`: original -> ASCII-safe column names."""
        # Parallel alphabets for a char-for-char Cyrillic->Latin transliteration.
        symbols = (
            "абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
            "abvgdeejzijklmnoprstufhzcss_y_euaABVGDEEJZIJKLMNOPRSTUFHZCSS_Y_EUA",
        )

        tr = {ord(a): ord(b) for a, b in zip(*symbols)}

        # Also normalize spaces and '$', which are invalid in layer names.
        self.columns_mapping = dict(
            zip(
                X.columns,
                [
                    col.translate(tr).replace(" ", "_").replace("$", "dollar")
                    for col in X.columns
                ],
            ))

    @tf.function()
    def inference(self, test_model_input):
        # Graph-compiled forward pass; returns the raw model output tensor.
        return self.model(test_model_input)

    def save_model(self):
        """Persist model weights and preprocessing metadata.

        NOTE(review): weights/metadata are written under "backend/data/" but
        `load_model` reads from "data/" — confirm the working-directory
        convention, otherwise save/load are asymmetric.
        """
        self.model.save_weights("backend/data/DeepFM_w.h5")
        with open("backend/data/DeepFM_data.pkl", "wb") as f_out:
            pickle.dump(
                (
                    self.columns_mapping,
                    self.min_max_scaler,
                    self.dense_features,
                    self.fixlen_feature_columns,
                    self.feature_names,
                ),
                f_out,
            )

    def load_model(self):
        """Restore preprocessing metadata, rebuild the model, load weights."""
        with open("data/DeepFM_data.pkl", "rb") as f_in:
            (
                self.columns_mapping,
                self.min_max_scaler,
                self.dense_features,
                self.fixlen_feature_columns,
                self.feature_names,
            ) = pickle.load(f_in)
        # Rebuild the architecture, then load the trained weights into it.
        self.model = DeepFM(self.fixlen_feature_columns,
                            self.fixlen_feature_columns,
                            task="binary")
        self.model.compile(
            "adam",
            "binary_crossentropy",
            metrics=["binary_crossentropy"],
        )
        self.model.load_weights("data/DeepFM_w.h5")
Exemplo n.º 23
0
print(model_input)
# print(model_input.shape)
# 4. Define, compile and train the model (legacy deepctr feature-dict API).
model = DeepFM(
    {
        "sparse": sparse_feat_list,
        "dense": dense_feat_list,
        "sequence": sequence_feature
    },
    final_activation='linear',  # linear output: regression-style target
    embedding_size=8,
    use_fm=False,  # FM interaction term disabled; DNN component only
    hidden_size=(64, 64))

# MAPE loss/metric suggests a strictly-positive regression target.
model.compile(
    "adam",
    "mape",
    metrics=['mape'],
)
history = model.fit(
    model_input,
    df_train[target].values,
    batch_size=2048,
    epochs=200,
    verbose=2,
    validation_split=0.2,
)
# NOTE(review): prediction is done on the training input itself, so the
# SMAPE printed below is an in-sample score, not a test score.
pred = model.predict(model_input)
print(pred)
print(smape(df_train[target].values, pred))
Exemplo n.º 24
0
    SparseFeat(feature, data[feature].nunique()) for feature in sparse_features
]
# Reuse one set of feature columns for both the linear and DNN components.
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# Split the dataset into train and test sets (80/20).
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name: train[name].values for name in feature_names}
test_model_input = {name: test[name].values for name in feature_names}

# Train a DeepFM regressor.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile(
    "adam",
    "mse",
    metrics=['mse'],
)
history = model.fit(
    train_model_input,
    train[target].values,
    batch_size=256,
    epochs=1,
    verbose=True,
    validation_split=0.2,
)
# Predict on the held-out test set.
pred_ans = model.predict(test_model_input, batch_size=256)
# Report MSE / RMSE.
# BUG FIX: RMSE was previously the square root of an MSE that had already
# been rounded to 4 decimals, distorting the value; compute it from the
# unrounded MSE and keep `mse` rounded only for reporting.
test_mse = mean_squared_error(test[target].values, pred_ans)
mse = round(test_mse, 4)
rmse = test_mse**0.5
Exemplo n.º 25
0
    # model = DeepFM(linear_feature_columns=linear_feature_columns,
    #                dnn_feature_columns=dnn_feature_columns,
    #                dnn_dropout=0.1,
    #                dnn_hidden_units=(512, 128),
    #                task='binary')
    model = DeepFM(
        linear_feature_columns,
        dnn_feature_columns,
        task='binary',
        dnn_dropout=0.1,
        dnn_hidden_units=(512, 128),
    )
    model = multi_gpu_model(model, NUM_WORKERS)
# Compile with a small learning rate and a focal-style loss
# (`multi_category_focal_loss2`); plain binary cross-entropy is kept above
# as a commented-out alternative.
model.compile(
    optimizer=tf.keras.optimizers.Adam(3e-4),
    # loss="binary_crossentropy",
    loss=multi_category_focal_loss2(alpha=0.1),
    metrics=[auroc],  # custom AUC metric defined elsewhere in the file
)

# Start every run with a clean checkpoint directory.
dirpath = Path('checkpoint')
if dirpath.exists() and dirpath.is_dir():
    shutil.rmtree(dirpath)
os.mkdir('checkpoint')
hist = model.fit(online_train_model_input,
                 train_df['label'].values,
                 batch_size=BATCH_SIZE,
                 epochs=EPOCHS,
                 verbose=2,
                 validation_split=0.1,
                 shuffle=True,
Exemplo n.º 26
0
    dataset = dataset.repeat(repeat_count)  # Repeats dataset this # times
    return dataset


# Both the linear and the DNN parts see the same (varlen + fixed) features.
linear_feature_columns = varlen_feature_columns + fixed_feature_columns
dnn_feature_columns = varlen_feature_columns + fixed_feature_columns
callbacks = []
GPU = True  # toggle between multi-GPU and single-device training
if GPU:
    # Data-parallel training across four GPUs; the model must be built and
    # compiled inside the strategy scope so variables are mirrored.
    strategy = tf.distribute.MirroredStrategy(devices=['/gpu:0', '/gpu:1', '/gpu:2', '/gpu:3'])
    # strategy = tf.distribute.MirroredStrategy(devices=['/gpu:3'])
    with strategy.scope():
        model = DeepFM(linear_feature_columns, dnn_feature_columns, dnn_hidden_units=[1024, 512, 256],
                       task='binary',
                       dnn_dropout=0, dnn_activation='relu', dnn_use_bn=False)
        model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy', tf.keras.metrics.AUC()])
    # model.run_eagerly = True
    # NOTE(review): `fit_generator` is deprecated in TF2 — `model.fit`
    # accepts generators/datasets directly; confirm the TF version in use.
    model.fit_generator(generator=get_dataset(), steps_per_epoch=None, epochs=10, verbose=2, callbacks=callbacks,
                        validation_data=get_dataset(eval_data_path), validation_steps=None, validation_freq=1,
                        class_weight=None,
                        max_queue_size=100, workers=10, use_multiprocessing=False, shuffle=True, initial_epoch=0)
    tf.saved_model.save(model, "./models")
else:
    # NOTE(review): this branch duplicates the GPU branch except for eager
    # execution and the missing SavedModel export — consider factoring out.
    model = DeepFM(linear_feature_columns, dnn_feature_columns, dnn_hidden_units=[1024, 512, 256], task='binary',
                   dnn_dropout=0, dnn_activation='relu', dnn_use_bn=False)
    model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy', tf.keras.metrics.AUC()])
    model.run_eagerly = True
    model.fit_generator(generator=get_dataset(), steps_per_epoch=None, epochs=10, verbose=2, callbacks=callbacks,
                        validation_data=get_dataset(eval_data_path), validation_steps=None, validation_freq=1,
                        class_weight=None,
                        max_queue_size=100, workers=10, use_multiprocessing=False, shuffle=True, initial_epoch=0)
Exemplo n.º 27
0
]
# print(fix_feature_columns)

# The same fixed-length feature columns feed the linear and DNN components.
linear_feature_columns = fix_feature_columns
dnn_feature_columns = fix_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 4. Split the dataset into train and test sets (80/20).
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name: train[name].values for name in feature_names}
test_model_input = {name: test[name].values for name in feature_names}

# 5. Train a DeepFM regressor.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile('adam', 'mse', metrics=['mse'])
history = model.fit(train_model_input,
                    train[target].values,
                    batch_size=256,
                    epochs=1,
                    verbose=True,
                    validation_split=0.2)

# 6. Predict on the held-out test set.
pred_ans = model.predict(test_model_input, batch_size=256)

# 7. Report RMSE.
# BUG FIX: RMSE was previously derived from an MSE that was already rounded
# to 4 decimals, which distorts the reported value; take the square root of
# the unrounded MSE and keep `mse` rounded only for reporting.
test_mse = mean_squared_error(test[target].values, pred_ans)
mse = round(test_mse, 4)
rmse = test_mse**0.5
print('test RMSE', rmse)
Exemplo n.º 28
0
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.2)
    train_model_input = {name:train[name] for name in feature_names}
    test_model_input = {name:test[name] for name in feature_names}
    # print(type(train_model_input))

    # 4.Define Model,train,predict and evaluate
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    # compile(self, optimizer, loss, metrics=None, loss_weights=None, sample_weight_mode=None, weighted_metrics=None,
    #         target_tensors=None)

    model.compile("adam", "binary_crossentropy",
                  metrics=['accuracy'] )

    history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=50, verbose=2, validation_split=0.2)
    pred_ans = model.predict(test_model_input, batch_size=256)
    scores = []
    for index in range(0, len(pred_ans)):
        result = llfun(test[target].values, pred_ans,index)
        scores.append(result)

    print(sum(scores) / len(scores))

#     pred_ans = model.predict(test_model_input, batch_size=256)
#     # print(pred_ans)
#     # print(type(pred_ans))
#
Exemplo n.º 29
0
               l2_reg_linear=0.001,
               l2_reg_embedding=0.001,
               l2_reg_dnn=0,
               init_std=0.0001,
               seed=1024,
               dnn_dropout=0.5,
               dnn_activation='relu',
               dnn_use_bn=True,
               task='binary')
# Resume from an existing checkpoint when possible; a missing or
# incompatible checkpoint is non-fatal — training simply starts from scratch.
try:
    model.load_weights(checkpoint_path)
    print('load weights')
except Exception as e:
    # BUG FIX: previously a bare `except: pass`, which silently swallowed
    # *every* error (including KeyboardInterrupt/SystemExit) and hid why a
    # warm start failed. Catch only Exception and report the reason.
    print(f'could not load weights from {checkpoint_path}: {e}')
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=['accuracy', 'AUC'])
history = model.fit(train_model_input,
                    train[target],
                    batch_size=8192,
                    epochs=5,
                    verbose=1,
                    shuffle=True,
                    callbacks=[cp_callback],  # checkpoint callback defined elsewhere
                    validation_data=(val_model_input, val[target]))

# Write in-sample and validation predictions back onto the full frame.
data['predict'] = 0
data.loc[train_index, 'predict'] = model.predict(train_model_input,
                                                 batch_size=8192)
data.loc[val_index, 'predict'] = model.predict(val_model_input,
                                               batch_size=8192)
Exemplo n.º 30
0
    dnn_hidden_units=(128, 256),
    l2_reg_linear=0.01,
    l2_reg_embedding=0.01,
    init_std=0.0001,
    seed=1024,
    dnn_dropout=0.3,
    dnn_activation='selu',
    dnn_use_bn=True,
)

import tensorflow.keras as keras
import tensorflow as tf
# Very small learning rate — fine-tuning-style training over many epochs.
opt = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(
    opt,
    "binary_crossentropy",
    metrics=['accuracy'],
)

# Validate against a fixed hold-out set rather than a random split.
history = model.fit(train_model_input,
                    train[target].values,
                    batch_size=128,
                    epochs=30,
                    verbose=1,
                    validation_data=(valid_model_input, valid[target].values))

def plot_learning_curves(history):
    """Plot every metric recorded in a Keras ``History`` object.

    All curves from ``history.history`` are drawn on a single 8x5-inch axes
    with a grid, and the y-axis is clamped to [0, 1].
    """
    ax = pd.DataFrame(history.history).plot(figsize=(8, 5))
    ax.grid(True)
    ax.set_ylim(0, 1)
Exemplo n.º 31
0
                                      dnn_feature_columns)

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.2)

    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4.Define Model,train,predict and evaluate
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    model = multi_gpu_model(model, gpus=2)

    model.compile(
        "adam",
        "binary_crossentropy",
        metrics=['binary_crossentropy'],
    )

    history = model.fit(
        train_model_input,
        train[target].values,
        batch_size=256,
        epochs=10,
        verbose=2,
        validation_split=0.2,
    )
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
Exemplo n.º 32
0
# Build model inputs as lists ordered by feature_names (legacy list input).
train_model_input = [train[name] for name in feature_names]

test_model_input = [test[name] for name in feature_names]

#model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
# DNN-only variant: the FM interaction term is disabled via use_fm=False.
model = DeepFM(linear_feature_columns,
               dnn_feature_columns,
               task='binary',
               use_fm=False,
               dnn_hidden_units=(128, 128),
               dnn_dropout=0)
# model = DCN(dnn_feature_columns, embedding_size=8)
# NOTE(review): `lr=` is a deprecated alias of `learning_rate=` in TF2's
# Adam — confirm the Keras version in use still accepts it.
model.compile(
    Adam(lr=0.005),
    "binary_crossentropy",
    metrics=['binary_crossentropy'],
)
#es = EarlyStopping(monitor='val_binary_crossentropy')
# Stop early once validation loss fails to improve for 3 epochs.
history = model.fit(train_model_input,
                    train[target].values,
                    validation_split=0.3,
                    callbacks=[
                        EarlyStopping(monitor='val_loss',
                                      patience=3,
                                      verbose=0,
                                      mode='auto')
                    ],
                    batch_size=4096,
                    epochs=10,
                    verbose=1)