Example #1
def find_l2_reg_embedding(linear_feature_columns, dnn_feature_columns,
                          train_model_input, train, test_model_input, test):
    cols = ['l2_reg_embedding', 'RMSE', 'MAE', 'MSE', 'AUC']
    df_result = pd.DataFrame(columns=cols,
                             index=range(
                                 len(config.param_rand['l2_reg_embedding'])))
    for i, x in enumerate(config.param_rand['l2_reg_embedding']):

        # TODO: make dnn_hidden_units configurable instead of hard-coding it below
        model = WDL(
            linear_feature_columns,
            dnn_feature_columns,
            dnn_hidden_units=(2, 2),  # placeholder value; tune later
            l2_reg_linear=0.1,
            l2_reg_embedding=x,
            l2_reg_dnn=0,
            init_std=0.0001,
            seed=1024,
            task='binary')

        model.compile("adam", "mse", metrics=['mse'])
        history = model.fit(
            train_model_input,
            train[target].values,
            batch_size=256,
            epochs=config.model_epoch['epoch'],
            verbose=2,
            validation_split=0.2,
        )
        pred_ans = model.predict(test_model_input, batch_size=256)

        auc = roc_auc_score(test[target].values, pred_ans)
        # index with .loc[row, column] so the assignments actually write into df_result
        df_result.loc[i, 'l2_reg_embedding'] = x
        df_result.loc[i, 'RMSE'] = np.round(
            math.sqrt(mean_squared_error(test[target].values, pred_ans)), 3)
        df_result.loc[i, 'MAE'] = np.round(
            mean_absolute_error(test[target].values, pred_ans), 3)
        df_result.loc[i, 'MSE'] = np.round(
            mean_squared_error(test[target].values, pred_ans), 3)
        df_result.loc[i, 'AUC'] = np.round(auc, 3)
    return df_result
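
Example #1 reads its l2_reg_embedding candidates and epoch count from an external config module that is not shown here. A minimal sketch of hypothetical config contents and a call site, purely for illustration (the value lists below are assumptions):

# config.py -- hypothetical contents, for illustration only
param_rand = {'l2_reg_embedding': [1e-6, 1e-5, 1e-4, 1e-3]}
model_epoch = {'epoch': 10}

# calling the sweep and ranking the candidates by test AUC
df_reg = find_l2_reg_embedding(linear_feature_columns, dnn_feature_columns,
                               train_model_input, train,
                               test_model_input, test)
print(df_reg.sort_values('AUC', ascending=False))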
Example #2
def widendeep_model(linear_feature_columns, dnn_feature_columns,
                    train_model_input, train, test_model_input, test):
    cols = ['model', 'RMSE', 'MAE', 'MSE', 'AUC', 'score']
    df_result = pd.DataFrame(columns=cols, index=range(1))
    model = WDL(
        linear_feature_columns,
        dnn_feature_columns,
        dnn_hidden_units=config.widendeep_att["dnn_hidden_units"],
        # l2_reg_linear=config.widendeep_att["l2_reg_linear"],
        # l2_reg_embedding=config.widendeep_att["l2_reg_embedding"],
        # l2_reg_dnn=config.widendeep_att["l2_reg_dnn"],
        # init_std=config.widendeep_att["init_std"],
        dnn_dropout=config.widendeep_att['dnn_dropout'],
        dnn_activation=config.widendeep_att['dnn_activation'],
        seed=config.widendeep_att["seed"],
        task=config.widendeep_att["task"])

    model.compile("adam", "mse", metrics=['mse'])

    history = model.fit(train_model_input,
                        train[target].values,
                        batch_size=256,
                        epochs=config.model_epoch['epoch'],
                        verbose=2,
                        validation_split=0.2)

    pred_ans = model.predict(test_model_input, batch_size=256)
    save_model(model, 'saved_widendeep.h5')  # save_model
    auc = roc_auc_score(test[target].values, pred_ans)

    # index with .loc[row, column] so the assignments actually write into df_result
    df_result.loc[0, 'model'] = "Wide and Deep"
    df_result.loc[0, 'RMSE'] = np.round(
        math.sqrt(mean_squared_error(test[target].values, pred_ans)), 3)
    df_result.loc[0, 'MAE'] = np.round(
        mean_absolute_error(test[target].values, pred_ans), 3)
    df_result.loc[0, 'MSE'] = np.round(
        mean_squared_error(test[target].values, pred_ans), 3)
    df_result.loc[0, 'AUC'] = np.round(auc, 3)

    return df_result
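
Example #2 saves the fitted model to saved_widendeep.h5. DeepCTR models contain custom layers, so reloading them needs DeepCTR's custom_objects registry; a minimal sketch of reloading and re-scoring (the file name is taken from the example above):

from tensorflow.python.keras.models import load_model
from deepctr.layers import custom_objects

# reload the Wide & Deep model saved by widendeep_model()
model = load_model('saved_widendeep.h5', custom_objects)
pred_ans = model.predict(test_model_input, batch_size=256)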
Example #3
target = ['rating']

# Label-encode the sparse features
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])
# Count the number of distinct values in each sparse feature
fixlen_feature_columns = [SparseFeat(feature, data[feature].nunique()) for feature in sparse_features]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# Split the data into training and test sets
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}

# Train with WDL
model = WDL(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'])
history = model.fit(train_model_input, train[target].values, batch_size=256,
                    epochs=100, verbose=True, validation_split=0.2)
plt.figure()
x = range(len(history.history['loss']))
plt.plot(x, history.history['loss'])
plt.title('loss')
# Predict with WDL
pred_ans = model.predict(test_model_input, batch_size=256)
# Report RMSE (or MSE)
mse = round(mean_squared_error(test[target].values, pred_ans), 4)
rmse = mse ** 0.5
print("test RMSE", rmse)
Example #4
    fix_len_feature_columns = [
        SparseFeat(feature, data[feature].nunique())
        for feature in sparse_features
    ]
    linear_feature_columns = fix_len_feature_columns
    dnn_feature_columns = fix_len_feature_columns
    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # Split the data into training and test sets
    train, test = train_test_split(data, test_size=0.2)
    train_set = {name: train[name].values for name in feature_names}
    test_set = {name: test[name].values for name in feature_names}

    # Train with WDL
    model = WDL(linear_feature_columns, dnn_feature_columns, task='regression')
    model.compile('adam', 'mse', metrics=['mse'])
    history = model.fit(train_set,
                        train[target].values,
                        batch_size=256,
                        epochs=1,
                        verbose=True,
                        validation_split=0.2)

    # Predict with WDL
    pred_ans = model.predict(test_set, batch_size=256)

    # Report RMSE (or MSE)
    mse = round(mean_squared_error(test[target].values, pred_ans), 4)
    rmse = mse**0.5
    print(f'test rmse: {rmse}')
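
The indentation of Example #4 suggests it is the body of a function. A hedged sketch of a plausible enclosing definition follows; the name, signature, and label-encoding preamble are assumptions, and the body would continue with the fragment shown above:

def train_wdl_regression(data, sparse_features, target):
    # hypothetical wrapper; name and signature are assumptions
    for feature in sparse_features:
        lbe = LabelEncoder()
        data[feature] = lbe.fit_transform(data[feature])
    # ... followed by the Example #4 fragment: build fix_len_feature_columns,
    # train WDL, and print the test RMSE ...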
Example #5
        data = pd.read_csv("./data/sample/validation.txt")

        # 1. Label-encode the sparse features (and apply simple transformations to any dense features)
        for feat in sparse_features:
            lbe = LabelEncoder()
            data[feat] = lbe.fit_transform(data[feat])
        # 2. Count the number of unique values for each sparse field
        sparse_feature_dim = {
            feat: data[feat].nunique()
            for feat in sparse_features
        }
        # 3. Generate the model input
        model_input = [data[feat].values for feat in sparse_feature_dim]

        pred = model.predict(model_input, batch_size=batch_size, verbose=1)
        label = data[target].values.flatten().tolist()
        pred = pred.flatten().tolist()
        with open('data/pctr', 'w') as fw:
            for i in range(len(pred)):
                if i % 10000 == 0:
                    print('label: %f, pred: %f' % (label[i], pred[i]))
                to_write = str(i + 1) + ',' + str(label[i]) + ',' + str(
                    pred[i]) + '\n'
                fw.write(to_write)
        AUC = auc.auc(label, pred)
        print('auc: %f' % AUC)

    print("demo done")