def get_xy_fd(hash_flag=False):
    """Build a toy (x, y) dataset plus feature columns for a two-session DSIN model."""
    # Fixed-length features: three sparse id fields plus one dense score.
    feature_columns = [
        SparseFeat('user', 3, hash_flag),
        SparseFeat('gender', 2, hash_flag),
        SparseFeat('item', 3 + 1, hash_flag),
        SparseFeat('item_gender', 2 + 1, hash_flag),
        DenseFeat('score', 1),
    ]
    # Two behavior sessions; each shares its embedding with 'item' / 'item_gender'.
    for sess_prefix in ('sess_0', 'sess_1'):
        feature_columns += [
            VarLenSparseFeat(sess_prefix + '_item', 3 + 1, 4, use_hash=hash_flag,
                             embedding_name='item'),
            VarLenSparseFeat(sess_prefix + '_item_gender', 2 + 1, 4, use_hash=hash_flag,
                             embedding_name='item_gender'),
        ]
    behavior_feature_list = ["item", "item_gender"]

    # Three samples; 0 is the padding/mask value in every id array.
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),          # 0 is mask value
        'item_gender': np.array([1, 2, 1]),   # 0 is mask value
        'score': np.array([0.1, 0.2, 0.3]),
        'sess_0_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]]),
        'sess_0_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]]),
        'sess_1_item': np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]]),
        'sess_1_item_gender': np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]]),
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    x["sess_length"] = np.array([2, 1, 0])  # number of non-empty sessions per sample
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def get_test_data(sample_size=1000, embedding_size=4, sparse_feature_num=1,
                  dense_feature_num=1,
                  sequence_feature=['sum', 'mean', 'max', 'weight'],
                  classification=True, include_length=False, hash_flag=False,
                  prefix='', use_group=False):
    """Generate random feature columns and matching model inputs for tests.

    Args:
        sample_size: number of samples to generate.
        embedding_size: embedding dimension for sparse/sequence features.
        sparse_feature_num: number of random fixed-length sparse features.
        dense_feature_num: number of random dense features.
        sequence_feature: sequence combiners to generate; 'weight' adds a weighted sequence.
        classification: if True labels are {0,1}, else uniform floats.
        include_length: if True, also emit '<name>_seq_length' inputs.
        hash_flag: use_hash flag for the sparse features.
        prefix: prepended to every feature name.
        use_group: if True, spread sparse features across three group names.

    Returns:
        (model_input, y, feature_columns).
    """
    # BUG FIX: work on a copy. The original popped 'weight' out of the argument,
    # and because the default is a shared mutable list, the default silently lost
    # its 'weight' entry after the first call (mutable-default-argument pitfall).
    sequence_feature = list(sequence_feature)
    feature_columns = []
    model_input = {}

    if 'weight' in sequence_feature:
        # Weighted sequence feature plus its weight and length side-inputs.
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + "weighted_seq", vocabulary_size=2,
                                        embedding_dim=embedding_size),
                             maxlen=3, length_name=prefix + "weighted_seq" + "_seq_length",
                             weight_name=prefix + "weight"))
        s_input, s_len_input = gen_sequence(2, 3, sample_size)
        model_input[prefix + "weighted_seq"] = s_input
        model_input[prefix + 'weight'] = np.random.randn(sample_size, 3, 1)
        model_input[prefix + "weighted_seq" + "_seq_length"] = s_len_input
        sequence_feature.pop(sequence_feature.index('weight'))

    for i in range(sparse_feature_num):
        if use_group:
            group_name = str(i % 3)
        else:
            group_name = DEFAULT_GROUP_NAME
        dim = np.random.randint(1, 10)  # random vocabulary size
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, embedding_size,
                       use_hash=hash_flag, dtype=tf.int32, group_name=group_name))

    for i in range(dense_feature_num):
        feature_columns.append(DenseFeat(prefix + 'dense_feature_' + str(i), 1, dtype=tf.float32))

    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + 'sequence_' + mode, vocabulary_size=dim,
                                        embedding_dim=embedding_size),
                             maxlen=maxlen, combiner=mode))

    # Generate one random input array per declared feature column.
    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input[fc.name] = np.random.randint(0, fc.vocabulary_size, sample_size)
        elif isinstance(fc, DenseFeat):
            model_input[fc.name] = np.random.random(sample_size)
        else:
            s_input, s_len_input = gen_sequence(fc.vocabulary_size, fc.maxlen, sample_size)
            model_input[fc.name] = s_input
            if include_length:
                # NOTE(review): `i` here is the stale index from the loop above, so
                # every sequence feature reuses the same '_seq_length' name; also
                # VarLenSparseFeat may be immutable — confirm this branch is exercised.
                fc.length_name = prefix + "sequence_" + str(i) + '_seq_length'
                model_input[prefix + "sequence_" + str(i) + '_seq_length'] = s_len_input

    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)
    return model_input, y, feature_columns
def get_xy_fd(hash_flag=False):
    """Toy list-based inputs for a DIN-style model (old positional deepctr API)."""
    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        # NOTE(review): dimension 0 looks odd — sibling variants use 1; confirm old-API semantics.
        DenseFeat('score', 0),
    ]
    # NOTE(review): 'hist_item_gender' declares vocab 3+1 while 'item_gender' is 2+1
    # even though they share an embedding via embedding_name — verify this is intended.
    feature_columns += [
        VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
        VarLenSparseFeat('hist_item_gender', 3 + 1, maxlen=4, embedding_name='item_gender'),
    ]
    behavior_feature_list = ["item", "item_gender"]

    # Three samples; 0 is the padding/mask value in the id arrays.
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),         # 0 is mask value
        'item_gender': np.array([1, 2, 1]),  # 0 is mask value
        'score': np.array([0.1, 0.2, 0.3]),
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
    }
    # Old API: positional list — fixed-length inputs first, then var-length ones.
    fixlen_names = get_fixlen_feature_names(feature_columns)
    varlen_names = get_varlen_feature_names(feature_columns)
    x = [feature_dict[n] for n in fixlen_names] + [feature_dict[n] for n in varlen_names]
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def get_xy_fd():
    """Toy dict-keyed inputs for a DIN-style model (new nested deepctr API)."""
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item', 3 + 1, embedding_dim=8),
        SparseFeat('item_gender', 2 + 1, embedding_dim=4),
        DenseFeat('score', 1),
    ]
    # History sequences share their embedding tables with 'item' / 'item_gender'.
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item', vocabulary_size=3 + 1, embedding_dim=20,
                                    embedding_name='item'), maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_item_gender', 2 + 1, embedding_dim=4,
                                    embedding_name='item_gender'), maxlen=4),
    ]
    feature_columns += [DenseFeat('hist_len', 1, dtype="int64")]
    behavior_feature_list = ["item"]

    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),         # 0 is mask value
        'item_gender': np.array([1, 2, 1]),  # 0 is mask value
        'score': np.array([0.1, 0.2, 0.3]),
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
        'hist_len': np.array([3, 3, 2]),  # true (unpadded) history length per sample
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = [1, 1, 1]
    return x, y, feature_columns, behavior_feature_list
def get_xy_fd():
    """Build toy dict-keyed DIN inputs.

    Returns:
        x: dict mapping feature name -> np.ndarray input.
        y: list of binary labels.
        feature_columns: deepctr feature-column definitions.
        behavior_feature_list: base features that have matching 'hist_*' sequences.
    """
    # Fixed-length sparse features plus one dense score.
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item_id', 3 + 1, embedding_dim=8),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4),
        DenseFeat('pay_score', 1)
    ]
    # Variable-length behavior sequences, sharing embeddings with item_id / cate_id.
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item_id'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4)
    ]
    # BUG FIX: was ["item", "cate_id"], but no feature here is named 'item' — the
    # sparse feature is 'item_id' and its history is 'hist_item_id', so a lookup of
    # 'hist_' + behavior name could never match. Use the declared feature names
    # (consistent with the sibling variant that uses item_id/cate_id).
    behavior_feature_list = ["item_id", "cate_id"]

    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])      # 0 is mask value
    cate_id = np.array([1, 2, 1])  # 0 is mask value
    pay_score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item_id': iid,
        'cate_id': cate_id,
        'hist_item_id': hist_iid,
        'hist_cate_id': hist_cate_id,
        'pay_score': pay_score
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    print('x=', x)
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def get_xy_fd(use_neg=False, hash_flag=False):
    """Build toy DIEN-style list inputs (old positional deepctr API).

    Args:
        use_neg: when True, also add negative-sampled history sequences.
        hash_flag: hashing flag forwarded to the fixed-length SparseFeats.

    Returns:
        (x, y, feature_columns, behavior_feature_list) where x is a positional
        list: fixed-length inputs, then var-length inputs, then the history length.
    """
    feature_columns = [SparseFeat('user', 3, hash_flag), SparseFeat('gender', 2, hash_flag),
                       SparseFeat('item', 3 + 1, hash_flag), SparseFeat('item_gender', 2 + 1, hash_flag),
                       DenseFeat('score', 1)]
    # NOTE(review): 'hist_item_gender' declares vocabulary 3+1 while 'item_gender'
    # is 2+1, although they share an embedding via embedding_name — confirm intended.
    feature_columns += [VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
                        VarLenSparseFeat('hist_item_gender', 3 + 1, maxlen=4, embedding_name='item_gender')]
    behavior_feature_list = ["item", "item_gender"]

    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
    behavior_length = np.array([3, 3, 2])  # true (unpadded) history length per sample

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
                    'hist_item': hist_iid, 'hist_item_gender': hist_igender, 'score': score}

    if use_neg:
        # Negative-sampled history sequences share embeddings with the positives.
        feature_dict['neg_hist_item'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_item_gender'] = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
        feature_columns += [VarLenSparseFeat('neg_hist_item', 3 + 1, maxlen=4, embedding_name='item'),
                            VarLenSparseFeat('neg_hist_item_gender', 3 + 1, maxlen=4, embedding_name='item_gender')]

    feature_names = get_fixlen_feature_names(feature_columns)
    varlen_feature_names = get_varlen_feature_names(feature_columns)
    print(varlen_feature_names)
    # Old API: the positional input list must follow the feature-name order exactly.
    x = [feature_dict[name] for name in feature_names] + [feature_dict[name] for name in varlen_feature_names]
    x += [behavior_length]
    y = [1, 0, 1]
    print(len(x))
    return x, y, feature_columns, behavior_feature_list
def generate_din_feature_columns(data, sparse_features, dense_features):
    """Assemble DIN feature columns from the globally registered label encoders.

    Returns:
        (feature_names, linear_feature_columns, dnn_feature_columns).
    """
    feat_lbe_dict = get_glv('feat_lbe_dict')  # global: field name -> fitted LabelEncoder

    # One SparseFeat per non-time sparse field; +1 reserves an id for padding/unknown.
    sparse_cols = [
        SparseFeat(field,
                   vocabulary_size=len(feat_lbe_dict[field].classes_) + 1,
                   embedding_dim=EMBED_DIM)
        for field in sparse_features
        if field not in time_feat
    ]
    dense_cols = [DenseFeat(field, 1, ) for field in dense_features]

    # Behavior history shares its embedding table with 'item_id'.
    hist_col = VarLenSparseFeat(
        SparseFeat('hist_item_id',
                   vocabulary_size=len(feat_lbe_dict['item_id'].classes_) + 1,
                   embedding_dim=EMBED_DIM,
                   embedding_name='item_id'),
        maxlen=max_seq_len)

    # DNN side and FM/linear side use the same column set (built as separate lists).
    dnn_feature_columns = sparse_cols + dense_cols + [hist_col]
    linear_feature_columns = sparse_cols + dense_cols + [hist_col]

    # All input names the model will expect.
    feature_names = get_feature_names(dnn_feature_columns + linear_feature_columns)
    return feature_names, linear_feature_columns, dnn_feature_columns
def train_deepFM(): k = featureengineer.k #缺失值填充+编码处理 data,appsnum, tags_nums = trainmodel.data,trainmodel.appsnum,trainmodel.tags_nums data[trainmodel.sparse_features] = data[trainmodel.sparse_features].fillna('-1', ) for feat in trainmodel.dense_features: data[feat].fillna(data[feat].dropna().mean(), inplace=True) for feat in trainmodel.sparse_features: data[feat] = data[feat].apply(lambda x:str(x)) lbe = LabelEncoder() data[feat] = lbe.fit_transform(data[feat]) mms = MinMaxScaler(feature_range=(0, 1)) data[trainmodel.dense_features] = mms.fit_transform(data[trainmodel.dense_features]) #数据格式转换 fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max()+1, embedding_dim=8) for i, feat in enumerate(trainmodel.sparse_features)] + \ [DenseFeat(feat, 1, ) for feat in trainmodel.dense_features] lgbOut_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max()+1, embedding_dim=1) for i, feat in enumerate(trainmodel.lgbOut_Features)] key2index_len = {'applist': appsnum+1, 'new_tag': tags_nums} varlen_features = [VarLenSparseFeat('%s' % i, vocabulary_size=key2index_len[i], maxlen=k, embedding_dim=8, combiner='mean',weight_name=None) for i in trainmodel.var_features] dnn_feature_columns = fixlen_feature_columns + varlen_features linear_feature_columns = fixlen_feature_columns + varlen_features + lgbOut_feature_columns feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) sparse_dense_features = trainmodel.sparse_features + trainmodel.dense_features + trainmodel.lgbOut_Features train, test = train_test_split(data, test_size=0.2) train_model_input = {name: train[name] for name in sparse_dense_features} test_model_input = {name: test[name] for name in sparse_dense_features} for x in trainmodel.var_features: if x == 'applist': train_model_input[x] = np.array(train[x].tolist()) test_model_input[x] = np.array(test[x].tolist()) if x == 'new_tag': train_model_input[x] = np.array(train[x].tolist())-appsnum test_model_input[x] = 
np.array(test[x].tolist())-appsnum # 模型 model = DeepFM(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns, dnn_hidden_units=(50, 30, 30), l2_reg_linear=0.001, l2_reg_embedding=0.001, l2_reg_dnn=0, init_std=0.0001, seed=1024, dnn_dropout=0.1, dnn_activation='relu', dnn_use_bn=True, task='binary') model.compile("adam", "binary_crossentropy",metrics=['AUC'], ) history = model.fit(train_model_input, train['target'].values, batch_size=256, epochs=1, verbose=2, validation_split=0.2, ) pred_ans = model.predict(test_model_input, batch_size=256) print("test AUC", round(roc_auc_score(test['target'].values, pred_ans), 4))
def get_xy_fd(hash_flag=False):
    """Toy dict-keyed DIN inputs (new nested deepctr API)."""
    fixlen_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 1),
    ]
    # History sequences share their embedding tables with 'item' / 'item_gender'.
    varlen_columns = [
        VarLenSparseFeat(SparseFeat('hist_item', vocabulary_size=3 + 1, embedding_dim=8,
                                    embedding_name='item'), maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_item_gender', 2 + 1, embedding_dim=4,
                                    embedding_name='item_gender'), maxlen=4),
    ]
    feature_columns = fixlen_columns + varlen_columns
    behavior_feature_list = ["item", "item_gender"]

    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),         # 0 is mask value
        'item_gender': np.array([1, 2, 1]),  # 0 is mask value
        'score': np.array([0.1, 0.2, 0.3]),
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def get_test_data(sample_size=1000, sparse_feature_num=1, dense_feature_num=1,
                  sequence_feature=('sum', 'mean', 'max'), classification=True,
                  include_length=False, hash_flag=False, prefix=''):
    """Generate random feature columns and a positional input list (old deepctr API).

    Args:
        sample_size: number of rows to generate.
        sparse_feature_num: number of fixed-length sparse features.
        dense_feature_num: number of dense features.
        sequence_feature: combiner modes; one var-length feature per mode.
        classification: if True labels are {0,1}, else uniform floats.
        include_length: if True, append '_seq_length' columns and length inputs.
        hash_flag: hashing flag forwarded to SparseFeat.
        prefix: prepended to every feature name.

    Returns:
        (x, y, feature_columns) with x as a positional list of arrays.
    """
    feature_columns = []
    for i in range(sparse_feature_num):
        dim = np.random.randint(1, 10)  # random vocabulary size
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, hash_flag, tf.int32))
    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, tf.float32))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(prefix + 'sequence_' + str(i), dim, maxlen, mode))

    model_input = []
    sequence_input = []
    sequence_len_input = []
    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input.append(np.random.randint(0, fc.dimension, sample_size))
        elif isinstance(fc, DenseFeat):
            model_input.append(np.random.random(sample_size))
        else:
            s_input, s_len_input = gen_sequence(fc.dimension, fc.maxlen, sample_size)
            sequence_input.append(s_input)
            sequence_len_input.append(s_len_input)

    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    # Fixed-length inputs first, then the var-length sequences.
    x = model_input + sequence_input
    if include_length:
        for i, mode in enumerate(sequence_feature):
            # NOTE(review): these two draws are unused by the column appended below;
            # they only advance the global RNG state — confirm before removing.
            dim = np.random.randint(1, 10)
            maxlen = np.random.randint(1, 10)
            feature_columns.append(
                SparseFeat(prefix + 'sequence_' + str(i) + '_seq_length', 1, embedding=False))
        x += sequence_len_input
    return x, y, feature_columns
def get_xy_fd():
    """Toy list-based DIN inputs (old positional deepctr API)."""
    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 1),
    ]
    feature_columns += [
        VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
        VarLenSparseFeat('hist_item_gender', 3 + 1, maxlen=4, embedding_name='item_gender'),
    ]
    behavior_feature_list = ["item", "item_gender"]

    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),         # 0 is mask value
        'item_gender': np.array([1, 2, 1]),  # 0 is mask value
        'score': np.array([0.1, 0.2, 0.3]),
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
    }
    # Old API: positional list — fixed-length inputs first, then var-length ones.
    ordered_names = get_fixlen_feature_names(feature_columns) + get_varlen_feature_names(feature_columns)
    x = [feature_dict[name] for name in ordered_names]
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def varsparsefeature():
    """Build and print weighted VarLenSparseFeat columns for applist/tag/outertag."""
    # field name -> (vocabulary size without padding, padded sequence length)
    field_specs = {
        'applist': (25730, 91),
        'tag': (32539, 197),
        'outertag': (192, 2),
    }
    varlen_feature_columns = []
    for field, (vocab, seq_len) in field_specs.items():
        varlen_feature_columns.append(
            VarLenSparseFeat('%s_key' % field,
                             vocabulary_size=vocab + 1,  # +1 reserves id 0 for padding
                             maxlen=seq_len,
                             combiner='mean',
                             embedding_dim=8,
                             weight_name='%s_weight' % field))
    print(varlen_feature_columns)
def get_xy_fd(use_neg=False, hash_flag=False):
    """Toy dict-keyed DIEN-style inputs; optionally adds negative-sampled history."""
    feature_columns = [
        SparseFeat('user', 3, hash_flag),
        SparseFeat('gender', 2, hash_flag),
        SparseFeat('item', 3 + 1, hash_flag),
        SparseFeat('item_gender', 2 + 1, hash_flag),
        DenseFeat('score', 1),
    ]
    # NOTE(review): 'hist_item_gender' declares vocab 3+1 while 'item_gender' is 2+1
    # even though they share an embedding via embedding_name — verify this is intended.
    feature_columns += [
        VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
        VarLenSparseFeat('hist_item_gender', 3 + 1, maxlen=4, embedding_name='item_gender'),
    ]
    behavior_feature_list = ["item", "item_gender"]

    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),         # 0 is mask value
        'item_gender': np.array([1, 2, 1]),  # 0 is mask value
        'score': np.array([0.1, 0.2, 0.3]),
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
    }
    if use_neg:
        # Negative-sampled history sequences share embeddings with the positives.
        feature_dict['neg_hist_item'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_item_gender'] = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
        feature_columns += [
            VarLenSparseFeat('neg_hist_item', 3 + 1, maxlen=4, embedding_name='item'),
            VarLenSparseFeat('neg_hist_item_gender', 3 + 1, maxlen=4, embedding_name='item_gender'),
        ]

    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    x["seq_length"] = np.array([3, 3, 2])  # true (unpadded) history length per sample
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def getFeatureColumns(self):
    """Build deepctr feature columns from the configured sparse-feature dicts.

    Returns:
        list: fixed-length SparseFeats followed by VarLenSparseFeats, in the
        insertion order of the underlying dicts.
    """
    # fixed_sparse_dict: name -> (vocabulary_size, embedding_dim)
    fixed_cols = [
        SparseFeat(name, vocabulary_size=spec[0], embedding_dim=spec[1])
        for name, spec in self.fixed_sparse_dict.items()
    ]
    # var_sparse_dict: name -> (maxlen, vocabulary_size, embedding_dim, embedding_name)
    varlen_cols = [
        VarLenSparseFeat(name, maxlen=spec[0], vocabulary_size=spec[1],
                         embedding_dim=spec[2], embedding_name=spec[3])
        for name, spec in self.var_sparse_dict.items()
    ]
    return fixed_cols + varlen_cols
def get_xy_fd_sdm(hash_flag=False):
    """Toy SDM inputs with long-term ('prefer') and short-term ('short') sessions."""
    user_feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        VarLenSparseFeat(SparseFeat('prefer_item', vocabulary_size=100, embedding_dim=8,
                                    embedding_name='item'),
                         maxlen=6, length_name="prefer_sess_length"),
        VarLenSparseFeat(SparseFeat('prefer_cate', vocabulary_size=100, embedding_dim=8,
                                    embedding_name='cate'),
                         maxlen=6, length_name="prefer_sess_length"),
        VarLenSparseFeat(SparseFeat('short_item', vocabulary_size=100, embedding_dim=8,
                                    embedding_name='item'),
                         maxlen=4, length_name="short_sess_length"),
        VarLenSparseFeat(SparseFeat('short_cate', vocabulary_size=100, embedding_dim=8,
                                    embedding_name='cate'),
                         maxlen=4, length_name="short_sess_length"),
    ]
    item_feature_columns = [SparseFeat('item', 100, embedding_dim=8, )]

    # Four samples; 0 pads the sequence arrays.
    x = {
        'user': np.array([0, 1, 2, 1]),
        'gender': np.array([0, 1, 0, 1]),
        'item': np.array([1, 2, 3, 1]),  # 0 is mask value
        'prefer_item': np.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 0],
                                 [1, 2, 3, 3, 0, 0], [1, 2, 4, 0, 0, 0]]),
        'prefer_cate': np.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 0],
                                 [1, 2, 3, 3, 0, 0], [1, 2, 4, 0, 0, 0]]),
        'short_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0], [3, 0, 0, 0]]),
        'short_cate': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0], [3, 0, 0, 0]]),
        'prefer_sess_length': np.array([6, 5, 4, 3]),
        'short_sess_length': np.array([3, 3, 2, 1]),
    }
    y = np.array([1, 1, 1, 0])
    history_feature_list = ['item', 'cate']
    return x, y, user_feature_columns, item_feature_columns, history_feature_list
def __init__(self, uNum, iNum, dim, maxlen):
    """Wrap a compiled single-session DSIN model over user/item id spaces.

    Args:
        uNum: user vocabulary size.
        iNum: item vocabulary size.
        dim: embedding size (also reused as the attention head count and DNN width).
        maxlen: maximum session sequence length.
    """
    self.uNum = uNum
    self.iNum = iNum
    self.dim = dim
    self.maxlen = maxlen
    hash_flag = True
    # NOTE(review): the third positional argument of VarLenSparseFeat is passed
    # self.dim (the embedding size) here even though self.maxlen exists — confirm
    # whether self.maxlen was intended before changing anything.
    self.feature_columns = [SparseFeat('user', self.uNum, hash_flag),
                            SparseFeat('item', self.iNum, hash_flag),
                            VarLenSparseFeat('sess_0_item', self.iNum, self.dim,
                                             use_hash=hash_flag, embedding_name='item')]
    self.behavior_feature_list = ["item"]
    self.model = DSIN(self.feature_columns, self.behavior_feature_list, sess_max_count=1,
                      embedding_size=self.dim, att_head_num=self.dim,
                      dnn_hidden_units=[self.dim, self.dim, self.dim], dnn_dropout=0.5)
    self.model.compile('adam', 'binary_crossentropy', metrics=['acc'])
def get_xy_fd(hash_flag=False):
    """Toy user-tower / item-tower inputs for a matching (retrieval) model."""
    user_feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        VarLenSparseFeat(SparseFeat('hist_item', vocabulary_size=3 + 1, embedding_dim=4,
                                    embedding_name='item'),
                         maxlen=4, length_name="hist_len"),
    ]
    item_feature_columns = [SparseFeat('item', 3 + 1, embedding_dim=4, )]

    # Four samples; 0 is the padding/mask value in the id arrays.
    x = {
        'user': np.array([0, 1, 2, 1]),
        'gender': np.array([0, 1, 0, 1]),
        'item': np.array([1, 2, 3, 1]),  # 0 is mask value
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0], [3, 0, 0, 0]]),
        'hist_len': np.array([3, 3, 2, 1]),  # true history length per sample
    }
    y = np.array([1, 1, 1, 1])
    return x, y, user_feature_columns, item_feature_columns
data = reduce_mem_usage(data) # 2.count #unique features for each sparse field,and record dense feature field name fixlen_feature_columns = [ SparseFeat(feat, data[feat].nunique()) for feat in sparse_features ] + [DenseFeat( feat, 1, ) for feat in dense_features] if isVarlen: varlen_feature_columns = [ VarLenSparseFeat('%s_key' % i, maxlen=max_len[i], vocabulary_size=100, embedding_dim=4, combiner='mean', use_hash=True, dtype="string") for i in ['applist', 'tag', 'outertag'] ] # varlen_feature_columns = [VarLenSparseFeat('%s_key' % i, key2index_len[i] + 1, max_len[i], # 'mean', weight_name='%s_weight' % i) for i in # ['applist', 'tag', 'outertag']] else: varlen_feature_columns = [] # 通过hash来解决,避免内存爆炸 # varlen_feature_columns = [VarLenSparseFeat('%s_key' % col_name, vocabulary_size=1000, # maxlen=1000, embedding_dim=8,
def get_xy_fd(use_neg=False, hash_flag=False):
    """Toy dict-keyed DIEN inputs; optionally adds negative-sampled histories."""
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1),
    ]
    # Behavior histories share embeddings with item_id / cate_id; both sequences
    # report their true length through the shared "seq_length" input.
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id', vocabulary_size=3 + 1, embedding_dim=8,
                                    embedding_name='item_id'),
                         maxlen=4, length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4, length_name="seq_length"),
    ]
    behavior_feature_list = ["item_id", "cate_id"]

    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item_id': np.array([1, 2, 3]),  # 0 is mask value
        'cate_id': np.array([1, 2, 2]),  # 0 is mask value
        'pay_score': np.array([0.1, 0.2, 0.3]),
        'hist_item_id': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_cate_id': np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]]),
        "seq_length": np.array([3, 3, 2]),  # true (unpadded) history length
    }
    if use_neg:
        # Negative-sampled histories share embedding tables with the positives.
        feature_dict['neg_hist_item_id'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_cate_id'] = np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]])
        feature_columns += [
            VarLenSparseFeat(SparseFeat('neg_hist_item_id', vocabulary_size=3 + 1, embedding_dim=8,
                                        embedding_name='item_id'),
                             maxlen=4, length_name="seq_length"),
            VarLenSparseFeat(SparseFeat('neg_hist_cate_id', 2 + 1, embedding_dim=4,
                                        embedding_name='cate_id'),
                             maxlen=4, length_name="seq_length"),
        ]

    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
D_train = D_train_r.with_options(options) D_valid = D_valid_r.with_options(options) D_train = D_train.repeat().prefetch(buffer_size=num_para) D_valid = D_valid.repeat().prefetch(buffer_size=num_para) # %% embedding_size = NNconfig_dic["embedding_size"] sparse_feature_columns = [] varlen_feature_columns = [] sparse_feature_columns = [SparseFeat(feat, sparse_vcab_dic[feat] + 1, dtype=tf.int64, embedding_dim = embedding_size) for feat in sparse_f] varlen_feature_columns = [VarLenSparseFeat(SparseFeat(vfeat, vocabulary_size = varlen_vcab_dic[vfeat] + 1, dtype=tf.int64, embedding_dim = embedding_size), maxlen = varlen_maxlen_f[vfeat]) for vfeat in varlen_f] # %% linear_feature_columns, dnn_feature_columns = \ sparse_feature_columns + varlen_feature_columns, sparse_feature_columns + varlen_feature_columns # %% model = DeepFM(linear_feature_columns, dnn_feature_columns, dnn_hidden_units=NNconfig_dic["dnn_hidden_units"], l2_reg_dnn=NNconfig_dic["l2_reg_dnn"], l2_reg_embedding=NNconfig_dic["l2_reg_embedding"], l2_reg_linear=NNconfig_dic["l2_reg_linear"], dnn_dropout=NNconfig_dic["dnn_dropout"],
key2index_adids = {}
# Split each row's clicked-ad string into a list of ad ids.
adids_list = list(map(split_adis, data['click_adids'].values))
adids_length = np.array(list(map(len, adids_list)))
max_adids_len = max(adids_length)
# Pad every sequence at the end up to the longest one.
adids_list = pad_sequences(adids_list, maxlen=max_adids_len, padding='post', )
fixlen_feature_columns = [
    SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
]
varlen_feature_columns_pkgs = [
    VarLenSparseFeat('install_pkgs', len(key2index_pkgs) + 1, max_pkgs_len, 'mean')
]
# NOTE(review): key2index_adids is never populated above, so this vocabulary is
# len({}) + 1 == 1 — presumably split_adis (or code outside this view) was meant
# to fill it; verify before training.
varlen_feature_columns_adids = [
    VarLenSparseFeat('click_adids', len(key2index_adids) + 1, max_adids_len, 'mean')
]
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns_pkgs + varlen_feature_columns_adids
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns_pkgs + varlen_feature_columns_adids
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
# Hold out one day (pdt == 20191019) as the test split.
train_input = data[data["pdt"] != 20191019]
test_input = data[data["pdt"] == 20191019]
train_model_input = {name: train_input[name] for name in feature_names}
# train_pkgs = [pkgs_list[i] for i in train_input.index.values]
padding='post', ) # 2.count #unique features for each sparse field and generate feature config for sequence feature fixlen_feature_columns = [ SparseFeat(feat, data[feat].nunique(), embedding_dim=4) for feat in sparse_features ] use_weighted_sequence = False if use_weighted_sequence: varlen_feature_columns = [ VarLenSparseFeat('genres', maxlen=max_len, vocabulary_size=len(key2index) + 1, embedding_dim=4, combiner='mean', weight_name='genres_weight') ] # Notice : value 0 is for padding for sequence input feature else: varlen_feature_columns = [ VarLenSparseFeat('genres', maxlen=max_len, vocabulary_size=len(key2index) + 1, embedding_dim=4, combiner='mean', weight_name=None) ] # Notice : value 0 is for padding for sequence input feature linear_feature_columns = fixlen_feature_columns + varlen_feature_columns dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
] elif column == 'action_type': feature_columns += [SparseFeat(column, 4 + 1, embedding_dim=dim)] else: feature_columns += [DenseFeat(column, 1)] #print(train_X['hist_merchant_id'].shape) #M = len(train_X['hist_merchant_id']) print('M=', M) # maxlen为历史信息的长度,vocabulary_size为onehot的长度 feature_columns += [ VarLenSparseFeat('hist_merchant_id', maxlen=M, vocabulary_size=19111 + 1, embedding_dim=8, embedding_name='merchant_id'), VarLenSparseFeat('hist_action_type', maxlen=M, vocabulary_size=4 + 1, embedding_dim=4, embedding_name='action_type') ] hist_features = ['merchant_id', 'action_type'] print(feature_columns) # 使用DIN模型 model = DIN(feature_columns, hist_features) # 使用Adam优化器,二分类的交叉熵 model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
import numpy as np
from deepctr.models import DIN
from deepctr.inputs import SparseFeat, VarLenSparseFeat, DenseFeat, get_fixlen_feature_names, get_varlen_feature_names

# Toy DIN training script (old list-based deepctr API).

# Fixed-length features: sparse id fields plus one dense score.
feature_columns = [SparseFeat('user', 3), SparseFeat('gender', 2),
                   SparseFeat('item', 3 + 1), SparseFeat('item_gender', 2 + 1),
                   DenseFeat('score', 1)]
# Behavior histories share embeddings with 'item' / 'item_gender'.
# NOTE(review): 'hist_item_gender' declares vocab 3+1 while 'item_gender' is 2+1 — verify.
feature_columns += [VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
                    VarLenSparseFeat('hist_item_gender', 3 + 1, maxlen=4, embedding_name='item_gender')]

behavior_feature_list = ["item", "item_gender"]
uid = np.array([0, 1, 2])
ugender = np.array([0, 1, 0])
iid = np.array([1, 2, 3])  # 0 is mask value
igender = np.array([1, 2, 1])  # 0 is mask value
score = np.array([0.1, 0.2, 0.3])

hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
                'hist_item': hist_iid, 'hist_item_gender': hist_igender, 'score': score}
fixlen_feature_names = get_fixlen_feature_names(feature_columns)
varlen_feature_names = get_varlen_feature_names(feature_columns)
# Old API: positional input list — fixed-length inputs first, then var-length ones.
x = [feature_dict[name] for name in fixlen_feature_names] + [feature_dict[name] for name in varlen_feature_names]
y = [1, 0, 1]

model = DIN(feature_columns, behavior_feature_list, hist_len_max=4, )
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, y, verbose=1, epochs=10, validation_split=0.5)
def get_test_data(sample_size=1000, sparse_feature_num=1, dense_feature_num=1,
                  sequence_feature=['sum', 'mean', 'max', 'weight'], classification=True,
                  include_length=False, hash_flag=False, prefix=''):
    """Generate random model inputs plus matching feature-column definitions.

    Parameters mirror the legacy signature.  Returns ``(model_input, y,
    feature_columns)`` where ``model_input`` maps feature name -> ndarray,
    ``y`` is a random label vector (binary when ``classification``), and
    ``feature_columns`` lists SparseFeat/DenseFeat/VarLenSparseFeat configs.
    """
    # Work on a copy: the original mutated the (shared) mutable default list
    # via .pop(), so a second call silently lost the 'weight' feature.
    sequence_feature = list(sequence_feature)

    feature_columns = []
    model_input = {}

    if 'weight' in sequence_feature:
        feature_columns.append(
            VarLenSparseFeat(prefix + "weighted_seq", 2, 3, weight_name=prefix + "weight"))
        # Companion length column for the weighted sequence (no embedding).
        feature_columns.append(
            SparseFeat(prefix + "weighted_seq_seq_length", 1, embedding=False))
        s_input, s_len_input = gen_sequence(2, 3, sample_size)
        model_input[prefix + "weighted_seq"] = s_input
        model_input[prefix + 'weight'] = np.random.randn(sample_size, 3, 1)
        model_input[prefix + "weighted_seq" + "_seq_length"] = s_len_input
        sequence_feature.remove('weight')

    for i in range(sparse_feature_num):
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, hash_flag, tf.int32))
    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, tf.float32))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(prefix + 'sequence_' + str(i), dim, maxlen, mode))

    # Fill model_input.  Length columns are collected separately and appended
    # AFTER the loop: the original appended to feature_columns while iterating
    # it, so the appended length SparseFeats were revisited by this very loop
    # and their inputs overwritten with randint(0, 1) zeros.
    length_columns = []
    for fc in feature_columns:
        if fc.name in model_input:
            # Already populated above (weighted_seq and its length column);
            # do not clobber the real values with random ones.
            continue
        if isinstance(fc, SparseFeat):
            model_input[fc.name] = np.random.randint(0, fc.dimension, sample_size)
        elif isinstance(fc, DenseFeat):
            model_input[fc.name] = np.random.random(sample_size)
        else:
            s_input, s_len_input = gen_sequence(fc.dimension, fc.maxlen, sample_size)
            model_input[fc.name] = s_input
            if include_length:
                # Derive the length-column name from the feature itself; the
                # original reused the stale loop index `i`, so every sequence
                # feature wrote the same '<last i>_seq_length' key.
                length_name = fc.name + '_seq_length'
                if length_name not in model_input:
                    length_columns.append(SparseFeat(length_name, 1, embedding=False))
                    model_input[length_name] = s_len_input
    feature_columns += length_columns

    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)
    return model_input, y, feature_columns
# One SparseFeat per item-side categorical column.  "item_id" sizes its
# vocabulary from the global unique-id list; every other column from its own
# cardinality (+1 leaves room for the 0 padding index).
for feat in item_sparse_features:
    if feat == "item_id":
        vocab_size = len(item_id_unique) + 1
    else:
        vocab_size = items_feature[feat].nunique() + 1
    feature_list.append(SparseFeat(feat, vocab_size))

dense_feature_list = [DenseFeat(name, 1) for name in dense_features]
varLen_sparse_feature_list = [
    VarLenSparseFeat(name, 11, maxlen=10) for name in varLen_sparse_features
]
# Session/history sequences share the 'item_id' embedding table.
sess_sparse_feature_list = [
    VarLenSparseFeat(name, len(item_id_unique) + 1,
                     maxlen=DIN_SESS_MAX_LEN, embedding_name='item_id')
    for name in hist_feature
]
feature_list = feature_list + dense_feature_list + varLen_sparse_feature_list

# pred_stage = [ i for i in data["pred_stage"]]
pred_stage = []
# Hash-encoded sparse columns: a 5x-cardinality hashing space per field,
# each embedded into 4 dimensions.
sparse_feature_columns = [
    SparseFeat(col, data[col].nunique() * 5, embedding_dim=4,
               use_hash=True, dtype='string')
    for col in sparse_features
]
dense_feature_columns = [DenseFeat(col, 1, ) for col in dense_features]

# Notice : value 0 is for padding for sequence input feature
varlen_feature_columns = [
    VarLenSparseFeat(
        sparsefeat=SparseFeat('Genres', vocabulary_size=100, embedding_dim=4,
                              use_hash=True, dtype="string"),
        maxlen=max_len,
        combiner='mean')
]

fixlen_feature_columns = sparse_feature_columns + dense_feature_columns
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
# Alternative that also feeds the var-len 'Genres' column to the model:
# dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
# linear_feature_columns = fixlen_feature_columns + varlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
# NOTE(review): truncated fragment — the opening of the pad_sequences(...)
# call these keyword arguments belong to is above this chunk, and the final
# DeepFM(...) call is cut off below it.  Code left byte-identical.
padding='post', dtype=str, value=0)

# 2.set hashing space for each sparse field and generate feature config for sequence feature
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique() * 5, use_hash=True, dtype='string')
                          for feat in sparse_features]
varlen_feature_columns = [VarLenSparseFeat('genres', 100, max_len, 'mean', use_hash=True,
                                           dtype="string")]  # Notice : value 0 is for padding for sequence input feature
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3.generate input data for model
model_input = {name: data[name] for name in feature_names}
# The padded genres matrix replaces the raw pipe-separated column.
model_input['genres'] = genres_list

# 4.Define Model,compile and train
model = DeepFM(linear_feature_columns, dnn_feature_columns,
def _preprocess_movielens(df, **kw):
    """Preprocess a MovieLens frame into DeepCTR feature columns and inputs.

    Keyword options (via **kw):
        multiple_value: when truthy, treat 'genres' as a multi-valued
            sequence feature; otherwise only fixed-length sparse columns.
        hash_feature: (multi-value path only) use on-the-fly hashing instead
            of a vocabulary built from the data.

    Returns (df, linear_cols, dnn_cols, train, test, target, ytrue).
    """
    multiple_value = kw.get('multiple_value')
    sparse_col = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
    target = ['rating']

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_col:
        lbe = LabelEncoder()
        df[feat] = lbe.fit_transform(df[feat])

    if not multiple_value:
        # 2.count #unique features for each sparse field
        fixlen_cols = [SparseFeat(feat, df[feat].nunique(), embedding_dim=4)
                       for feat in sparse_col]
        linear_cols = fixlen_cols
        dnn_cols = fixlen_cols
        train, test = train_test_split(df, test_size=0.2)
        ytrue = test[target].values
    else:
        ytrue = df[target].values
        hash_feature = kw.get('hash_feature', False)
        if not hash_feature:
            # Build an explicit genre->index vocabulary from the data.
            def split(x):
                key_ans = x.split('|')
                for key in key_ans:
                    if key not in key2index:
                        # Notice : input value 0 is a special "padding",so we do not use 0 to encode valid feature for sequence input
                        key2index[key] = len(key2index) + 1
                return list(map(lambda x: key2index[x], key_ans))

            # preprocess the sequence feature
            key2index = {}
            genres_list = list(map(split, df['genres'].values))
            genres_length = np.array(list(map(len, genres_list)))
            max_len = max(genres_length)
            # Notice : padding=`post`
            genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )
            fixlen_cols = [SparseFeat(feat, df[feat].nunique(), embedding_dim=4)
                           for feat in sparse_col]
            use_weighted_sequence = False
            if use_weighted_sequence:
                varlen_cols = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
                    key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
                    weight_name='genres_weight')]  # Notice : value 0 is for padding for sequence input feature
            else:
                varlen_cols = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
                    key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
                    weight_name=None)]  # Notice : value 0 is for padding for sequence input feature
            linear_cols = fixlen_cols + varlen_cols
            dnn_cols = fixlen_cols + varlen_cols

            # generate input data for model
            model_input = {name: df[name] for name in sparse_col}
            # NOTE(review): 'genres' is in the feature columns but its input
            # is left commented out here — confirm this is intentional.
            # model_input["genres"] = genres_list
            model_input["genres_weight"] = np.random.randn(df.shape[0], max_len, 1)
        else:
            df[sparse_col] = df[sparse_col].astype(str)
            # 1.Use hashing encoding on the fly for sparse features,and process sequence features
            genres_list = list(map(lambda x: x.split('|'), df['genres'].values))
            genres_length = np.array(list(map(len, genres_list)))
            max_len = max(genres_length)
            # Notice : padding=`post`
            genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post',
                                        dtype=str, value=0)

            # 2.set hashing space for each sparse field and generate feature config for sequence feature
            fixlen_cols = [SparseFeat(feat, df[feat].nunique() * 5, embedding_dim=4,
                                      use_hash=True, dtype='string') for feat in sparse_col]
            varlen_cols = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=100,
                                                       embedding_dim=4, use_hash=True,
                                                       dtype="string"), maxlen=max_len,
                                            combiner='mean', )]  # Notice : value 0 is for padding for sequence input feature
            linear_cols = fixlen_cols + varlen_cols
            dnn_cols = fixlen_cols + varlen_cols
            feature_names = get_feature_names(linear_cols + dnn_cols)

            # 3.generate input data for model
            model_input = {name: df[name] for name in feature_names}
            model_input['genres'] = genres_list
        # NOTE(review): train/test alias the full model_input in the
        # multi-value path (no split) — assumed to apply to both sub-branches;
        # original indentation was ambiguous, confirm.
        train, test = model_input, model_input
    return df, linear_cols, dnn_cols, train, test, target, ytrue
# NOTE(review): truncated fragment — the list of user-side SparseFeat columns
# that this `] + [...]` concatenation continues lies above this chunk.
# Code left byte-identical.
] + [DenseFeat(feat, 1, ) for feat in user_dense_features]

# Item-side columns: hashed categorical features plus raw dense features.
item_feature_columns = [
    SparseFeat(feat, data[feat].nunique(), embedding_dim=4, use_hash=True)
    for i, feat in enumerate(item_sparse_features)
] + [DenseFeat(feat, 1, ) for feat in item_dense_features]

item_varlen_feature_columns = [
    VarLenSparseFeat(SparseFeat('genres', vocabulary_size=1000, embedding_dim=4),
                     maxlen=genres_maxlen, combiner='mean', weight_name=None)
]
user_varlen_feature_columns = [
    VarLenSparseFeat(SparseFeat('user_hist', vocabulary_size=1000, embedding_dim=4),
                     maxlen=user_maxlen, combiner='mean', weight_name=None)
]

# 3.generate input data for model
user_feature_columns += user_varlen_feature_columns