예제 #1
0
파일: DIN_test.py 프로젝트: nwf5d/DeepCTR
def test_DIN_sum():
    """Smoke-test DIN with ``use_din=False``: train, then round-trip to disk.

    The test fits the model on the toy data from ``get_xy_fd``, saves and
    reloads the weights, and finally saves and reloads the whole model.
    Progress is reported with ``print``; files are written to the current
    working directory and are not removed afterwards.
    """
    model_name = "DIN_sum"
    weights_file = model_name + '_weights.h5'
    model_file = model_name + '.h5'

    x, y, feature_dim_dict, behavior_feature_list = get_xy_fd()

    din_options = dict(
        hist_len_max=4,
        embedding_size=8,
        use_din=False,
        hidden_size=[4, 4, 4],
        keep_prob=0.6,
        activation="sigmoid",
    )
    model = DIN(feature_dim_dict, behavior_feature_list, **din_options)

    # Stage 1: the model trains with a validation split.
    model.compile('adam', 'binary_crossentropy',
                  metrics=['binary_crossentropy'])
    model.fit(x, y, verbose=1, validation_split=0.5)
    print("{} test train valid pass!".format(model_name))

    # Stage 2: weights survive a save/load round trip.
    model.save_weights(weights_file)
    model.load_weights(weights_file)
    print("{} test save load weight pass!".format(model_name))

    # Stage 3: the full model survives a save/load round trip.
    save_model(model, model_file)
    model = load_model(model_file, custom_objects)
    print("{} test save load model pass!".format(model_name))

    print("{} test pass!".format(model_name))
예제 #2
0
def test_DIN_att():
    """Smoke-test DIN with ``use_din=True``: train and round-trip the weights.

    Full-model save/load is deliberately not exercised here: the original
    author left that step disabled with a note that saving a model that uses
    Dice is buggy.
    """
    model_name = "DIN_att"
    weights_file = model_name + '_weights.h5'

    x, y, feature_dim_dict, behavior_feature_list = get_xy_fd()

    model = DIN(feature_dim_dict, behavior_feature_list,
                hist_len_max=4, embedding_size=8, use_din=True,
                hidden_size=[4, 4, 4], keep_prob=0.6)

    model.compile('adam', 'binary_crossentropy',
                  metrics=['binary_crossentropy'])
    model.fit(x, y, verbose=1, validation_split=0.5)
    print("%s test train valid pass!" % model_name)

    model.save_weights(weights_file)
    model.load_weights(weights_file)
    print("%s test save load weight pass!" % model_name)

    print("%s test pass!" % model_name)
예제 #3
0
def test_DIN_att():
    """Fit a DIN model built with ``use_din=True`` and reload its weights.

    Only the weights are round-tripped. Whole-model persistence was left
    disabled by the original author, who recorded a bug when saving a model
    that uses Dice.
    """
    model_name = "DIN_att"

    x, y, feature_dim_dict, behavior_feature_list = get_xy_fd()

    model = DIN(
        feature_dim_dict,
        behavior_feature_list,
        hist_len_max=4,
        embedding_size=8,
        use_din=True,
        hidden_size=[4, 4, 4],
        keep_prob=0.6,
    )
    model.compile(
        'adam',
        'binary_crossentropy',
        metrics=['binary_crossentropy'],
    )
    model.fit(x, y, verbose=1, validation_split=0.5)
    print("{0} test train valid pass!".format(model_name))

    checkpoint = "{0}_weights.h5".format(model_name)
    model.save_weights(checkpoint)
    model.load_weights(checkpoint)
    print("{0} test save load weight pass!".format(model_name))

    print("{0} test pass!".format(model_name))
예제 #4
0
def test_DIN_model_io():
    """Check that a compiled (untrained) DIN model using Dice saves and reloads.

    The model is built with ``att_activation=Dice`` and ``use_din=True``,
    compiled, written to ``DIN_att.h5`` and loaded back with
    ``custom_objects``. No training step is run.
    """
    model_name = "DIN_att"
    model_file = model_name + '.h5'

    # Only the feature description is needed; the toy x/y data are unused.
    _, _, feature_dim_dict, behavior_feature_list = get_xy_fd()

    model = DIN(
        feature_dim_dict,
        behavior_feature_list,
        hist_len_max=4,
        embedding_size=8,
        att_activation=Dice,
        use_din=True,
        hidden_size=[4, 4, 4],
        keep_prob=0.6,
    )
    model.compile('adam', 'binary_crossentropy',
                  metrics=['binary_crossentropy'])

    save_model(model, model_file)
    model = load_model(model_file, custom_objects)
    print("{} test save load model pass!".format(model_name))
예제 #5
0
def test_DIN_model_io():
    """Check that a compiled (untrained) DIN model saves and reloads.

    The toy data are fetched but the model is never fitted; only the
    save/load round trip through ``DIN_att.h5`` is exercised.
    """
    name = "DIN_att"
    target = name + '.h5'

    x, y, feature_dim_dict, behavior_feature_list = get_xy_fd()

    model = DIN(feature_dim_dict, behavior_feature_list,
                hist_len_max=4, embedding_size=8, use_din=True,
                hidden_size=[4, 4, 4], keep_prob=0.6)
    model.compile('adam', 'binary_crossentropy',
                  metrics=['binary_crossentropy'])

    save_model(model, target)
    model = load_model(target, custom_objects)
    print("%s test save load model pass!" % name)
예제 #6
0
def example_din():
    """Train a DIN model on data loaded by ``get_xy_from_txt``.

    Original author's plan (translated):

    1. Generate the training data as txt with comma-separated fields.
    2. Convert it to tfrecord.
    3. Read the data, distinguishing dense, sparse, VarLenSparse and
       user-behavior-sequence features.
    4. Feed each of them to the model and see what happens.

    :return: None; the fit result is only printed.
    """
    # ``behavior_feature_list`` names the columns of the current item that
    # should attend over the historical behavior. The data are read as
    # before, and only the names of those columns are collected into a list.
    loaded = get_xy_from_txt()
    x, y, feature_columns, behavior_feature_list = loaded

    model = DIN(feature_columns, behavior_feature_list)

    # NOTE(review): categorical_accuracy is paired with a binary loss here;
    # confirm this metric is really the one intended.
    tracked_metrics = [keras.metrics.AUC(), keras.metrics.categorical_accuracy]
    model.compile('adam', keras.losses.binary_crossentropy,
                  metrics=tracked_metrics)

    history = model.fit(x, y, verbose=1, epochs=10, validation_split=0.5)
    print("history: ", history)
예제 #7
0
def test_DIN_sum():
    """Build DIN with ``use_din=False`` and hand it to ``check_model``."""
    model_name = "DIN_sum"
    x, y, feature_dim_dict, behavior_feature_list = get_xy_fd()

    din_options = {
        'hist_len_max': 4,
        'embedding_size': 8,
        'use_din': False,
        'hidden_size': [4, 4, 4],
        'keep_prob': 0.6,
        'activation': "sigmoid",
    }
    model = DIN(feature_dim_dict, behavior_feature_list, **din_options)

    check_model(model, model_name, x, y)
예제 #8
0
def get_xy_fd():
    """Build a three-sample toy dataset for the DIN examples.

    Returns:
        A tuple ``(x, y, feature_dim_dict, behavior_feature_list)`` where
        ``x`` is a list of arrays ordered as sparse features, then dense
        features, then the ``hist_``-prefixed behavior sequences; ``y`` is the
        label list; ``feature_dim_dict`` describes the sparse and dense
        features; and ``behavior_feature_list`` names the features that have a
        history sequence.
    """
    sparse_feats = [
        SingleFeat('user', 3),
        SingleFeat('gender', 2),
        SingleFeat('item', 3+1),
        SingleFeat('item_gender', 2+1),
    ]
    dense_feats = [SingleFeat('score', 0)]
    feature_dim_dict = {"sparse": sparse_feats, "dense": dense_feats}
    behavior_feature_list = ["item", "item_gender"]

    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        # 0 is the mask value for item and item_gender ids.
        'item': np.array([1, 2, 3]),
        'item_gender': np.array([1, 2, 1]),
        # History sequences, padded with the mask value 0.
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
        'score': np.array([0.1, 0.2, 0.3]),
    }

    # Input order: sparse features, dense features, then behavior histories.
    input_names = [feat.name for feat in feature_dim_dict["sparse"]]
    input_names += [feat.name for feat in feature_dim_dict["dense"]]
    input_names += ['hist_'+feat for feat in behavior_feature_list]
    x = [feature_dict[name] for name in input_names]

    y = [1, 0, 1]
    return x, y, feature_dim_dict, behavior_feature_list


if __name__ == "__main__":
    # Script entry point: fit DIN on the toy data for ten epochs.
    x, y, feature_dim_dict, behavior_feature_list = get_xy_fd()

    model = DIN(
        feature_dim_dict,
        behavior_feature_list,
        hist_len_max=4,
    )
    model.compile(
        'adam',
        'binary_crossentropy',
        metrics=['binary_crossentropy'],
    )
    history = model.fit(x, y, verbose=1, epochs=10, validation_split=0.5)
예제 #9
0
    hist_cate_id = np.array([[1, 2, 2, 0], [2, 2, 1, 0], [1, 2, 0, 0]])
    seq_length = np.array([3, 3,
                           2])  # the actual length of the behavior sequence

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item_id': iid,
        'cate_id': cate_id,
        'hist_item_id': hist_iid,
        'hist_cate_id': hist_cate_id,
        'pay_score': pay_score,
        'seq_length': seq_length
    }
    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list


if __name__ == "__main__":
    # Script entry point: fit DIN on the toy data for ten epochs.
    x, y, feature_columns, behavior_feature_list = get_xy_fd()

    # The original author also kept a disabled alternative here:
    # BST(feature_columns, behavior_feature_list, att_head_num=4)
    model = DIN(feature_columns, behavior_feature_list)
    model.compile('adam', 'binary_crossentropy',
                  metrics=['binary_crossentropy'])

    history = model.fit(x, y, verbose=1, epochs=10, validation_split=0.5)
예제 #10
0
        elif column == 'action_type':
            feature_columns += [SparseFeat(column, 4 + 1, embedding_dim=dim)]
        else:
            feature_columns += [DenseFeat(column, 1)]

# maxlen is the length of the history sequence; vocabulary_size is the one-hot size
feature_columns += [
    VarLenSparseFeat(sparsefeat=SparseFeat('hist_merchant_id', vocabulary_size=1993, embedding_dim=8,
                                           embedding_name='merchant_id'), maxlen=M),
    VarLenSparseFeat(sparsefeat=SparseFeat('hist_action_type', vocabulary_size=4, embedding_dim=4,
                                           embedding_name='action_type'), maxlen=M)]
history_features = ['merchant_id', 'action_type']
print(len(feature_columns))

# Use the DIN model
model = DIN(feature_columns, history_features)
# Use the Adam optimizer with binary cross-entropy
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
# model.compile(loss=[binary_focal_loss(alpha=.25, gamma=2)], metrics=["accuracy"])

# Assemble train_model_input: get the feature names and convert train_X to a dict
feature_names = list(train_X.columns)
train_model_input = {name: train_X[name].values for name in get_feature_names(feature_columns)}
print("########################################")

# histroy输入必须是二维数组
from tqdm import tqdm

for fea in ['hist_merchant_id', 'hist_action_type']:
    list = []
    for i in tqdm(train_model_input[fea]):
예제 #11
0
    sess_len_max = SESS_MAX_LEN
    BATCH_SIZE = 1024
    sess_feature = ['item_id']
    # def auc(y_true,y_pred):
    #   return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)

    EMBEDDING_SIZE = int(ebs)
    if EMBEDDING_SIZE == 0:
        EMBEDDING_SIZE = "auto"

    model = DIN(fd,
                sess_feature,
                embedding_size=EMBEDDING_SIZE,
                dnn_dropout=float(dnn_dropout),
                att_activation='dice',
                att_weight_normalization=False,
                hist_len_max=sess_len_max,
                dnn_hidden_units=(200, 80),
                att_hidden_size=(
                    64,
                    16,
                ))
    model.compile('adagrad',
                  'binary_crossentropy',
                  metrics=['binary_crossentropy'])
    model_dir = "../model_dir_" + str(EMBEDDING_SIZE)
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    if os.path.exists(model_dir + '/ckpt.h5'):
        model.load_weights(model_dir + '/ckpt.h5')
    """
    test_input_pos = pd.read_pickle(
예제 #12
0
    user_age = np.array([1, 2, 3])
    user_gender = np.array([0, 1, 0])
    item_id = np.array([0, 1, 2])
    item_gender = np.array([0, 1, 0])

    # multi-value feature input
    hist_item_id = np.array([[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 0]])
    hist_item_gender = np.array([[0, 1, 0, 1], [0, 1, 1, 1], [0, 0, 1, 0]])
    # valid length of behavior sequence of every sample
    hist_length = np.array([4, 4, 3])

    feature_dict = {'user_age': user_age, 'user_gender': user_gender, 'item_id': item_id, 'item_gender': item_gender,
                    'hist_item_id': hist_item_id, 'hist_item_gender': hist_item_gender, }

    x = [feature_dict[feat] for feat in feature_dim_dict["sparse"]] + \
        [feature_dict['hist_'+feat]
            for feat in behavior_feature_list] + [hist_length]
    # Notice the concatenation order: single feature + multi-value feature + length
    # Since the length of the historical sequences of different features in DIN are the same(they are all extended from item_id),only one length vector is enough.
    y = [1, 0, 1]

    return x, y, feature_dim_dict, behavior_feature_list


if __name__ == "__main__":
    # Script entry point: fit DIN once on the toy data with a 50% validation split.
    x, y, feature_dim_dict, behavior_feature_list = get_xy_fd()

    model = DIN(
        feature_dim_dict,
        behavior_feature_list,
        hist_len_max=4,
    )
    model.compile(
        'adam',
        'binary_crossentropy',
        metrics=['binary_crossentropy'],
    )
    history = model.fit(x, y, verbose=1, validation_split=0.5)
예제 #13
0
elif sys.argv[1] == 'PNN_UDG':
    model = PNN_UDG(dnn_feature_columns, untrainable_features_columns, (200, 80), uid_feature_name=udg_features, 
                    udg_embedding_size=int(sys.argv[5]))
elif sys.argv[1] == 'PNN':
    model = PNN(dnn_feature_columns, untrainable_features_columns, (200, 80))
elif sys.argv[1] == 'WDL':
    model = WDL(linear_feature_columns, dnn_feature_columns, [], (200, 80))
elif sys.argv[1] == 'WDL_UDG':
    model = WDL_UDG(linear_feature_columns, dnn_feature_columns, untrainable_features_columns, (200, 80), uid_feature_name=udg_features, udg_embedding_size=int(sys.argv[5]))
elif sys.argv[1] == 'DIEN':
    model = DIEN(fixlen_feature_columns, behavior_feature_list,
             dnn_hidden_units=[200, 80], dnn_dropout=0, gru_type="AUGRU", use_negsampling=True)
elif sys.argv[1] == 'DIEN_UDG':
    model = DIEN_UDG(fixlen_feature_columns, untrainable_features_columns, behavior_feature_list, dnn_hidden_units=[200, 80], dnn_dropout=0, gru_type="AUGRU", use_negsampling=True, uid_feature_name=udg_features, udg_embedding_size=int(sys.argv[5]))
elif sys.argv[1] == 'DIN':
    model = DIN(fixlen_feature_columns, behavior_feature_list, dnn_hidden_units=[200, 80], dnn_dropout=0)
elif sys.argv[1] == 'DIN_UDG':
    model = DIN_UDG(fixlen_feature_columns, untrainable_features_columns, behavior_feature_list, dnn_hidden_units=[200, 80], dnn_dropout=0, uid_feature_name=udg_features, udg_embedding_size=int(sys.argv[5]))
    
if sys.argv[4] == 'focal':
    model.compile("adam", loss=focal_loss, metrics=['binary_crossentropy'], )
else:
    model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'], )
init_lr = float(tf.keras.backend.get_value(model.optimizer.learning_rate))
lr = [init_lr, init_lr/2, init_lr/4]
history_all = {}
max_auc, min_log, min_rmse, max_rig = 0, 0, 0, 0
for x in range(epoch):
    tf.keras.backend.set_value(model.optimizer.lr, lr[x])
    history = CustomCallback()
    model.fit(train_model_input, train[target].values, batch_size=256, epochs=1, verbose=1, 
예제 #14
0
파일: train_din.py 프로젝트: zwcdp/DSIN-1
    test_label = label[test_idx]

    sess_len_max = SESS_MAX_LEN
    BATCH_SIZE = 4096

    sess_feature = ['cate_id', 'brand']
    TEST_BATCH_SIZE = 2**17
    REG = 1e-6

    model = DIN(fd,
                sess_feature,
                embedding_size=4,
                att_activation='dice',
                att_weight_normalization=False,
                hist_len_max=sess_len_max,
                dnn_hidden_units=(200, 80),
                att_hidden_size=(
                    64,
                    16,
                ),
                l2_reg_embedding=REG,
                seed=2019)

    model.compile('adagrad',
                  'binary_crossentropy',
                  metrics=[
                      'binary_crossentropy',
                  ])

    hist_ = model.fit(
        train_input[:],
예제 #15
0
 def buildModel(self):
     """Create a DIN model from the encoder's feature columns and compile it.

     The model is stored on ``self.model`` and compiled with the Adam
     optimizer and binary cross-entropy as both loss and metric.
     """
     encoder = self.encoder
     columns = encoder.getFeatureColumns()
     self.model = DIN(columns, encoder.behavior_list)

     tracked_metrics = ['binary_crossentropy']
     self.model.compile('adam', 'binary_crossentropy', metrics=tracked_metrics)
예제 #16
0
class Trainer:
    """Prepare click data, build and train a DIN model, and persist the results.

    The methods are meant to be called in this order: ``loadData``,
    ``preProcessData``, ``buildModel``, ``train``, then ``dump_model`` and/or
    ``dump_encoder``.
    """

    def __init__(self):
        self.data = None
        self.encoder = None
        self.model = None
        # number of positive samples
        self.num_pos = None
        self.recipeDomain = None

    def loadData(self, url: str):
        """Load the CSV at ``url`` and create and train a fresh encoder.

        The same table is used both as the positive samples (``self.data``)
        and as the candidate recipe domain (``self.recipeDomain``).
        """
        self.data = pd.read_csv(url)
        # One read is enough; an independent copy keeps the two frames
        # separate, as two reads did.
        self.recipeDomain = self.data.copy()
        self.num_pos = self.data.shape[0]
        self.encoder = ModelEncoder()
        self.encoder.train()

    def preProcessData(self):
        """Encode both frames, label the loaded rows positive, add negatives."""
        self.data = self.encoder.encode(self.data)
        self.data['result'] = [1] * self.num_pos

        self.recipeDomain = self.encoder.encode(self.recipeDomain)

        self.build_negative_data()

    def buildModel(self):
        """Create a DIN model from the encoder's feature columns and compile it."""
        feature_columns = self.encoder.getFeatureColumns()
        self.model = DIN(feature_columns, self.encoder.behavior_list)
        self.model.compile('adam',
                           'binary_crossentropy',
                           metrics=['binary_crossentropy'])

    def train(self):
        """Fit the model on ``self.data`` and return the fit result.

        Returns:
            Whatever ``self.model.fit`` returns (previously discarded).
        """
        feature_specs = (self.encoder.fixed_sparse_dict
                         + self.encoder.var_sparse_dict)
        model_input = {feat: self.data[feat] for feat, _ in feature_specs}
        # NOTE(review): the label is read from 'gt', while preProcessData and
        # build_negative_data write the label to 'result' - confirm which
        # column is the intended training target.
        history = self.model.fit(model_input,
                                 self.data['gt'].values,
                                 batch_size=256,
                                 epochs=10,
                                 verbose=2,
                                 validation_split=0.2,
                                 shuffle=True)
        return history

    def dump_model(self, path: str):
        """Save the trained model to ``path``."""
        save_model(self.model, path)

    def dump_encoder(self, path: str):
        """Pickle the encoder to ``path`` (protocol 4)."""
        # pickle.dump needs a writable binary file object, not the trainer.
        with open(path, 'wb') as fh:
            pickle.dump(self.encoder, fh, protocol=4)

    def update(self, url: str = ''):
        """Reload the recipe domain from ``url`` and re-encode it.

        Args:
            url: CSV location of the refreshed recipe domain. The default
                keeps the original empty placeholder, so callers are expected
                to supply a real location.
        """
        self.recipeDomain = pd.read_csv(url)
        # Keep the encoded frame, as preProcessData does.
        self.recipeDomain = self.encoder.encode(self.recipeDomain)

    def build_negative_data(self):
        """Append one negative sample per (positive row, unclicked recipe) pair.

        For each of the first ``num_pos`` rows, every recipe of
        ``self.recipeDomain`` that is neither the row's own ``recipe`` nor in
        its ``hist_recipe`` yields a copy of the row. In that copy, the columns
        shared with the recipe domain are overwritten by the candidate's values
        and ``result`` is set to 0. ``self.data`` is replaced by the positives
        followed by all negatives.
        """
        shared_feats = [feat for feat in self.data.columns
                        if feat in self.recipeDomain.columns]
        # 'records' keeps per-column value types (row-wise iteration upcasts).
        candidates = self.recipeDomain.to_dict('records')

        negatives = []
        for i in range(self.num_pos):
            positive = self.data.iloc[i].to_dict()
            # assumes hist_recipe is an iterable of recipe ids - TODO confirm
            clicked_set = set(positive['hist_recipe'])
            clicked_set.add(positive['recipe'])
            for candidate in candidates:
                if candidate['recipe'] in clicked_set:
                    continue
                # valid unclicked combination: start from a fresh copy so
                # negatives never share state with each other or the positive
                negative = dict(positive)
                for feat in shared_feats:
                    negative[feat] = candidate[feat]
                negative['result'] = 0
                negatives.append(negative)

        if negatives:
            # Concatenate once and store the result on self.data.
            negative_frame = pd.DataFrame(negatives, columns=self.data.columns)
            self.data = pd.concat([self.data, negative_frame],
                                  ignore_index=True)