def _preprocess_criteo(df, **kw):
    """Preprocess a Criteo-format frame and build DeepCTR feature columns.

    Keyword args:
        hash_feature (bool): if truthy, rely on on-the-fly feature hashing for
            the sparse columns instead of label-encoding them.
        test_size (float): fraction passed to train_test_split (required).

    Returns:
        (df, linear_cols, dnn_cols, train, test, target, y_test_values)
    """
    hash_feature = kw.get('hash_feature')
    sparse_col = ['C' + str(i) for i in range(1, 27)]
    dense_col = ['I' + str(i) for i in range(1, 14)]
    df[sparse_col] = df[sparse_col].fillna('-1', )
    df[dense_col] = df[dense_col].fillna(0, )
    target = ["label"]
    # Dense features are min-max scaled in both branches; build the scaler once.
    mms = MinMaxScaler(feature_range=(0, 1))
    if hash_feature:
        df[dense_col] = mms.fit_transform(df[dense_col])
        # Hashing path: fixed vocabulary budget, raw string input
        # (removed redundant re-definition of sparse_col/dense_col here).
        fixlen_cols = [SparseFeat(feat, vocabulary_size=1000, embedding_dim=4,
                                  use_hash=True, dtype='string')  # input stays string
                       for feat in sparse_col] + \
                      [DenseFeat(feat, 1, ) for feat in dense_col]
    else:
        # Label-encode each sparse column, then size vocabularies from the data.
        for feat in sparse_col:
            lbe = LabelEncoder()
            df[feat] = lbe.fit_transform(df[feat])
        df[dense_col] = mms.fit_transform(df[dense_col])
        fixlen_cols = [SparseFeat(feat, vocabulary_size=df[feat].nunique(), embedding_dim=4)
                       for feat in sparse_col] + \
                      [DenseFeat(feat, 1, ) for feat in dense_col]
    # Linear and DNN sides share the same column set.
    linear_cols = fixlen_cols
    dnn_cols = fixlen_cols
    train, test = train_test_split(df, test_size=kw['test_size'])
    return df, linear_cols, dnn_cols, train, test, target, test[target].values
def get_xy_fd():
    """Build a tiny synthetic (x, y) sample plus feature columns for DIN-style tests."""
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item', 3 + 1, embedding_dim=8),
        SparseFeat('item_gender', 2 + 1, embedding_dim=4),
        DenseFeat('score', 1),
        VarLenSparseFeat(SparseFeat('hist_item', vocabulary_size=3 + 1, embedding_dim=20,
                                    embedding_name='item'), maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_item_gender', 2 + 1, embedding_dim=4,
                                    embedding_name='item_gender'), maxlen=4),
        DenseFeat('hist_len', 1, dtype="int64"),
    ]
    behavior_feature_list = ["item"]
    # 0 acts as the mask/padding value in the id arrays below.
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),
        'item_gender': np.array([1, 2, 1]),
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
        'hist_len': np.array([3, 3, 2]),
        'score': np.array([0.1, 0.2, 0.3]),
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = [1, 1, 1]
    return x, y, feature_columns, behavior_feature_list
def get_xy_fd(hash_flag=False):
    """Synthetic DSIN-style sample: two history sessions plus a session-count input."""
    feature_columns = [SparseFeat('user', 3, hash_flag),
                       SparseFeat('gender', 2, hash_flag),
                       SparseFeat('item', 3 + 1, hash_flag),
                       SparseFeat('item_gender', 2 + 1, hash_flag),
                       DenseFeat('score', 1)]
    # One pair of varlen features per session slot, sharing the base embeddings.
    for sess in ('sess_0', 'sess_1'):
        feature_columns.append(VarLenSparseFeat(sess + '_item', 3 + 1, 4,
                                                use_hash=hash_flag, embedding_name='item'))
        feature_columns.append(VarLenSparseFeat(sess + '_item_gender', 2 + 1, 4,
                                                use_hash=hash_flag, embedding_name='item_gender'))
    behavior_feature_list = ["item", "item_gender"]
    # 0 is the padding/mask value throughout.
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),
        'item_gender': np.array([1, 2, 1]),
        'score': np.array([0.1, 0.2, 0.3]),
        'sess_0_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]]),
        'sess_0_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]]),
        'sess_1_item': np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]]),
        'sess_1_item_gender': np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]]),
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    x["sess_length"] = np.array([2, 1, 0])  # number of valid sessions per example
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def generate_din_feature_columns(data, sparse_features, dense_features):
    """Assemble DIN feature columns and the combined feature-name list.

    Vocabulary sizes come from the fitted LabelEncoders stored in the global
    'feat_lbe_dict' (+1 reserves an id for padding/unknown). Time features
    (module-level `time_feat`) are excluded from the sparse columns.
    """
    feat_lbe_dict = get_glv('feat_lbe_dict')
    sparse_feature_columns = [
        SparseFeat(feat,
                   vocabulary_size=len(feat_lbe_dict[feat].classes_) + 1,
                   embedding_dim=EMBED_DIM)
        for feat in sparse_features if feat not in time_feat
    ]
    dense_feature_columns = [DenseFeat(feat, 1, ) for feat in dense_features]
    # History behavior sequence shares the item_id embedding table.
    hist_item = SparseFeat('hist_item_id',
                           vocabulary_size=len(feat_lbe_dict['item_id'].classes_) + 1,
                           embedding_dim=EMBED_DIM,
                           embedding_name='item_id')
    var_feature_columns = [VarLenSparseFeat(hist_item, maxlen=max_seq_len)]
    # DNN and linear (FM) sides share the same column set here.
    dnn_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns
    linear_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns
    feature_names = get_feature_names(dnn_feature_columns + linear_feature_columns)
    return feature_names, linear_feature_columns, dnn_feature_columns
def test_long_dense_vector():
    """Smoke-test DeepFM with a multi-dimensional DenseFeat input."""
    feature_columns = [
        SparseFeat('user_id', 4, ),
        SparseFeat('item_id', 5, ),
        DenseFeat("pic_vec", 5),
    ]
    fixlen_feature_names = get_feature_names(feature_columns)
    vec = [0.1, 0.5, 0.4, 0.3, 0.2]
    input_dict = {
        'user_id': np.array([[1], [0], [1]]),
        'item_id': np.array([[3], [2], [1]]),
        'pic_vec': np.array([vec, vec, vec]),
    }
    label = np.array([1, 0, 1])
    model_input = [input_dict[name] for name in fixlen_feature_names]
    # Linear side drops the dense pic_vec column (feature_columns[:-1]).
    model = DeepFM(feature_columns, feature_columns[:-1])
    model.compile('adagrad', 'binary_crossentropy')
    model.fit(model_input, label)
def prepare_data(cls, path, sparse_features, task='binary'):
    """Load a MovieLens-style ratings file and build one-hot model inputs.

    Args:
        cls: class to instantiate with the built feature columns.
        path: CSV path with columns user_id,movie_id,rating,timestamp (no header).
        sparse_features: column names to label-encode and one-hot encode.
        task: 'binary' keeps raw ratings as labels; 'multiclass' one-hot
            encodes them (ratings are 1..5, so column 0 is stripped).

    Returns:
        (cls(feature_columns), (train_inputs, train_labels), (test_inputs, test_labels))

    Raises:
        ValueError: if task is neither 'binary' nor 'multiclass'.
    """
    # Removed no-op assignments (`data_path = path`, `sparse_features = sparse_features`)
    # and the commented-out duplicate feature_columns line.
    dataframe = pd.read_csv(path, names='user_id,movie_id,rating,timestamp'.split(','))
    y = ['rating']
    for feat in sparse_features:
        lbe = LabelEncoder()
        dataframe[feat] = lbe.fit_transform(dataframe[feat])
    # DenseFeat with dimension == nunique() lets each encoded id be fed as a
    # one-hot dense vector via to_categorical below.
    feature_columns = [DenseFeat(feat, dataframe[feat].nunique()) for feat in sparse_features]
    trainset, testset = train_test_split(dataframe, test_size=0.2)
    train_model_input = [to_categorical(trainset[fc.name].values, num_classes=fc.dimension)
                         for fc in feature_columns]
    test_model_input = [to_categorical(testset[fc.name].values, num_classes=fc.dimension)
                        for fc in feature_columns]
    if task == 'binary':
        train_lbl = trainset[y]
        test_lbl = testset[y]
    elif task == 'multiclass':
        # Strip column 0: ratings are in [1, 5], so index 0 is never used.
        train_lbl = to_categorical(trainset[y])[:, 1:]
        test_lbl = to_categorical(testset[y])[:, 1:]
    else:
        raise ValueError("Enter task either 'binary' or 'multiclass'")
    return cls(feature_columns), (train_model_input, train_lbl), (test_model_input, test_lbl)
def train_deepFM(): k = featureengineer.k #缺失值填充+编码处理 data,appsnum, tags_nums = trainmodel.data,trainmodel.appsnum,trainmodel.tags_nums data[trainmodel.sparse_features] = data[trainmodel.sparse_features].fillna('-1', ) for feat in trainmodel.dense_features: data[feat].fillna(data[feat].dropna().mean(), inplace=True) for feat in trainmodel.sparse_features: data[feat] = data[feat].apply(lambda x:str(x)) lbe = LabelEncoder() data[feat] = lbe.fit_transform(data[feat]) mms = MinMaxScaler(feature_range=(0, 1)) data[trainmodel.dense_features] = mms.fit_transform(data[trainmodel.dense_features]) #数据格式转换 fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max()+1, embedding_dim=8) for i, feat in enumerate(trainmodel.sparse_features)] + \ [DenseFeat(feat, 1, ) for feat in trainmodel.dense_features] lgbOut_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max()+1, embedding_dim=1) for i, feat in enumerate(trainmodel.lgbOut_Features)] key2index_len = {'applist': appsnum+1, 'new_tag': tags_nums} varlen_features = [VarLenSparseFeat('%s' % i, vocabulary_size=key2index_len[i], maxlen=k, embedding_dim=8, combiner='mean',weight_name=None) for i in trainmodel.var_features] dnn_feature_columns = fixlen_feature_columns + varlen_features linear_feature_columns = fixlen_feature_columns + varlen_features + lgbOut_feature_columns feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) sparse_dense_features = trainmodel.sparse_features + trainmodel.dense_features + trainmodel.lgbOut_Features train, test = train_test_split(data, test_size=0.2) train_model_input = {name: train[name] for name in sparse_dense_features} test_model_input = {name: test[name] for name in sparse_dense_features} for x in trainmodel.var_features: if x == 'applist': train_model_input[x] = np.array(train[x].tolist()) test_model_input[x] = np.array(test[x].tolist()) if x == 'new_tag': train_model_input[x] = np.array(train[x].tolist())-appsnum test_model_input[x] = 
np.array(test[x].tolist())-appsnum # 模型 model = DeepFM(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns, dnn_hidden_units=(50, 30, 30), l2_reg_linear=0.001, l2_reg_embedding=0.001, l2_reg_dnn=0, init_std=0.0001, seed=1024, dnn_dropout=0.1, dnn_activation='relu', dnn_use_bn=True, task='binary') model.compile("adam", "binary_crossentropy",metrics=['AUC'], ) history = model.fit(train_model_input, train['target'].values, batch_size=256, epochs=1, verbose=2, validation_split=0.2, ) pred_ans = model.predict(test_model_input, batch_size=256) print("test AUC", round(roc_auc_score(test['target'].values, pred_ans), 4))
def load_stats(self):
    """Build fixed-length feature columns from cached stats and store them on self."""
    sparse_cols = [SparseFeat(feat, self.cat_meta[feat]) for feat in self.sparse_features]
    dense_cols = [DenseFeat(feat, 1, ) for feat in self.dense_features]
    columns = sparse_cols + dense_cols
    # Linear and DNN sides share the same column list.
    self.dnn_feature_columns = columns
    self.linear_feature_columns = columns
    self.fixlen_feature_names = get_fixlen_feature_names(
        self.linear_feature_columns + self.dnn_feature_columns)
def get_test_data(sample_size=1000, sparse_feature_num=1, dense_feature_num=1,
                  sequence_feature=('sum', 'mean', 'max'), classification=True,
                  include_length=False, hash_flag=False, prefix=''):
    """Generate random fixtures for model tests (legacy list-style inputs).

    Args:
        sample_size: number of rows in every generated array.
        sparse_feature_num / dense_feature_num: how many of each to create.
        sequence_feature: combiner modes; one varlen feature per mode.
        classification: binary labels if True, else uniform floats.
        include_length: also append sequence-length inputs and columns.
        hash_flag / prefix: forwarded to feature naming/hashing.

    Returns:
        (x, y, feature_columns): x lists fixed-length inputs first, then
        sequence inputs (then sequence lengths when include_length is True).
    """
    feature_columns = []
    for i in range(sparse_feature_num):
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, hash_flag, tf.int32))
    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, tf.float32))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(prefix + 'sequence_' + str(i), dim, maxlen, mode))
    model_input = []
    sequence_input = []
    sequence_len_input = []
    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input.append(np.random.randint(0, fc.dimension, sample_size))
        elif isinstance(fc, DenseFeat):
            model_input.append(np.random.random(sample_size))
        else:
            s_input, s_len_input = gen_sequence(fc.dimension, fc.maxlen, sample_size)
            sequence_input.append(s_input)
            sequence_len_input.append(s_len_input)
    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)
    x = model_input + sequence_input
    if include_length:
        for i, mode in enumerate(sequence_feature):
            # BUG FIX: removed two dead np.random.randint draws (`dim`, `maxlen`)
            # that were never used in this loop.
            # Length inputs are plain integer features; no embedding needed.
            feature_columns.append(
                SparseFeat(prefix + 'sequence_' + str(i) + '_seq_length', 1, embedding=False))
        x += sequence_len_input
    return x, y, feature_columns
def get_test_data(sample_size=1000, embedding_size=4, sparse_feature_num=1, dense_feature_num=1,
                  sequence_feature=['sum', 'mean', 'max', 'weight'], classification=True,
                  include_length=False, hash_flag=False, prefix='', use_group=False):
    """Generate random dict-style model inputs, labels and feature columns for tests.

    NOTE(review): `sequence_feature` has a mutable default and is mutated below
    (the 'weight' entry is popped), so the default list shrinks across calls in
    one process — confirm callers always pass a fresh list.
    """
    feature_columns = []
    model_input = {}

    if 'weight' in sequence_feature:
        # Weighted sequence feature: values, per-step weights, and a length input.
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + "weighted_seq", vocabulary_size=2,
                                        embedding_dim=embedding_size),
                             maxlen=3, length_name=prefix + "weighted_seq" + "_seq_length",
                             weight_name=prefix + "weight"))
        s_input, s_len_input = gen_sequence(
            2, 3, sample_size)
        model_input[prefix + "weighted_seq"] = s_input
        model_input[prefix + 'weight'] = np.random.randn(sample_size, 3, 1)
        model_input[prefix + "weighted_seq" + "_seq_length"] = s_len_input
        sequence_feature.pop(sequence_feature.index('weight'))

    for i in range(sparse_feature_num):
        # Optionally spread sparse features across three embedding groups.
        if use_group:
            group_name = str(i % 3)
        else:
            group_name = DEFAULT_GROUP_NAME
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, embedding_size,
                       use_hash=hash_flag, dtype=tf.int32, group_name=group_name))

    for i in range(dense_feature_num):
        feature_columns.append(DenseFeat(prefix + 'dense_feature_' + str(i), 1, dtype=tf.float32))

    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + 'sequence_' + mode, vocabulary_size=dim,
                                        embedding_dim=embedding_size),
                             maxlen=maxlen, combiner=mode))

    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input[fc.name] = np.random.randint(0, fc.vocabulary_size, sample_size)
        elif isinstance(fc, DenseFeat):
            model_input[fc.name] = np.random.random(sample_size)
        else:
            s_input, s_len_input = gen_sequence(
                fc.vocabulary_size, fc.maxlen, sample_size)
            model_input[fc.name] = s_input
            if include_length:
                # NOTE(review): `i` here is stale — it still holds the last index of
                # the sequence_feature loop above, so every varlen feature would get
                # the same '_seq_length' key. Also, assigning `fc.length_name` on a
                # namedtuple-based feature column may not take effect — confirm
                # against the deepctr version in use.
                fc.length_name = prefix + "sequence_" + str(i) + '_seq_length'
                model_input[prefix + "sequence_" + str(i) + '_seq_length'] = s_len_input

    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)
    return model_input, y, feature_columns
def get_xy_fd(hash_flag=False):
    """Synthetic DIN sample using the legacy positional VarLenSparseFeat API (list inputs)."""
    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 0),
        VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
        VarLenSparseFeat('hist_item_gender', 3 + 1, maxlen=4, embedding_name='item_gender'),
    ]
    behavior_feature_list = ["item", "item_gender"]
    # 0 acts as the mask/padding value in all id arrays.
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),
        'item_gender': np.array([1, 2, 1]),
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
        'score': np.array([0.1, 0.2, 0.3]),
    }
    fixed = [feature_dict[name] for name in get_fixlen_feature_names(feature_columns)]
    varlen = [feature_dict[name] for name in get_varlen_feature_names(feature_columns)]
    x = fixed + varlen
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def get_xy_fd():
    """Synthetic DIN sample (item_id / cate_id naming) with printed inputs."""
    # Fixed-length sparse/dense features
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item_id', 3 + 1, embedding_dim=8),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4),
        DenseFeat('pay_score', 1)
    ]
    # Variable-length (history) sparse features sharing the base embeddings
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id', vocabulary_size=3 + 1, embedding_dim=8,
                                    embedding_name='item_id'), maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                                    embedding_name='cate_id'), maxlen=4)
    ]
    # BUG FIX: the sparse feature is named 'item_id' (not 'item'); the behavior
    # list must match the declared feature names for the attention lookup.
    behavior_feature_list = ["item_id", "cate_id"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    cate_id = np.array([1, 2, 1])  # 0 is mask value
    pay_score = np.array([0.1, 0.2, 0.3])
    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item_id': iid,
        'cate_id': cate_id,
        'hist_item_id': hist_iid,
        'hist_cate_id': hist_cate_id,
        'pay_score': pay_score
    }
    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    print('x=', x)
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def get_xy_fd(use_neg=False, hash_flag=False):
    """Synthetic DIEN-style sample (legacy list inputs, optional negative history)."""
    feature_columns = [
        SparseFeat('user', 3, hash_flag),
        SparseFeat('gender', 2, hash_flag),
        SparseFeat('item', 3 + 1, hash_flag),
        SparseFeat('item_gender', 2 + 1, hash_flag),
        DenseFeat('score', 1),
        VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
        VarLenSparseFeat('hist_item_gender', 3 + 1, maxlen=4, embedding_name='item_gender'),
    ]
    behavior_feature_list = ["item", "item_gender"]
    # 0 is the mask/padding value in every id array below.
    behavior_length = np.array([3, 3, 2])
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),
        'item_gender': np.array([1, 2, 1]),
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
        'score': np.array([0.1, 0.2, 0.3]),
    }
    if use_neg:
        # Negative history mirrors the positive history shapes.
        feature_dict['neg_hist_item'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_item_gender'] = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
        feature_columns += [
            VarLenSparseFeat('neg_hist_item', 3 + 1, maxlen=4, embedding_name='item'),
            VarLenSparseFeat('neg_hist_item_gender', 3 + 1, maxlen=4, embedding_name='item_gender'),
        ]
    feature_names = get_fixlen_feature_names(feature_columns)
    varlen_feature_names = get_varlen_feature_names(feature_columns)
    print(varlen_feature_names)
    x = [feature_dict[name] for name in feature_names] + \
        [feature_dict[name] for name in varlen_feature_names]
    x += [behavior_length]
    y = [1, 0, 1]
    print(len(x))
    return x, y, feature_columns, behavior_feature_list
def main():
    """Label-encode sparse features, build DeepCTR columns, then train/evaluate one model per target.

    Relies on module-level `sparse_features`, `dense_features`, `targets`,
    `GetFeatures` and `model_generate`.
    """
    # BUG FIX: sys.argv[0] is the script path itself, so the original check
    # `sys.argv[0] == 'SF'` could never fire; the first real CLI argument is argv[1].
    Use_SF = len(sys.argv) > 1 and sys.argv[1] == 'SF'
    train, vali, test = GetFeatures(Use_SF)
    feature_count = []
    for feat in sparse_features:
        print("Fitting {}".format(feat))
        labels = {}
        for x in train[feat]:
            if x not in labels:
                labels[x] = len(labels) + 1  # ids start at 1; 0 is reserved for unseen values
        print("Transforming {}".format(feat))
        for df in [train, vali, test]:
            df[feat] = df[feat].map(lambda x: labels.get(x, 0))
        feature_count.append(len(labels) + 1)
    sparse_feature_columns = [
        SparseFeat(f, f_c) for f, f_c in zip(sparse_features, feature_count)
    ]
    dense_feature_columns = [DenseFeat(f, 1) for f in dense_features]
    fixlen_feature_columns = sparse_feature_columns + dense_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
    train_model_input = [train[name] for name in fixlen_feature_names]
    vali_model_input = [vali[name] for name in fixlen_feature_names]
    test_model_input = [test[name] for name in fixlen_feature_names]

    def eval(target):
        # Train on `target` with validation, then report test LogLoss / AUC.
        model, history = model_generate(train_model_input, train[[target]],
                                        vali_model_input, vali[[target]],
                                        linear_feature_columns, dnn_feature_columns)
        pred_ans = model.predict(test_model_input, batch_size=256)
        print(target + " test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
        print(target + " test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

    for target in targets:
        eval(target)
def get_xy_fd(hash_flag=False):
    """Synthetic DIN sample using dict-style inputs and the nested VarLenSparseFeat API."""
    base_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 1),
    ]
    history_columns = [
        VarLenSparseFeat(SparseFeat('hist_item', vocabulary_size=3 + 1, embedding_dim=8,
                                    embedding_name='item'), maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_item_gender', 2 + 1, embedding_dim=4,
                                    embedding_name='item_gender'), maxlen=4),
    ]
    feature_columns = base_columns + history_columns
    behavior_feature_list = ["item", "item_gender"]
    # 0 is the mask/padding value.
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),
        'item_gender': np.array([1, 2, 1]),
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
        'score': np.array([0.1, 0.2, 0.3]),
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def get_xy_fd():
    """Synthetic DIN sample returning legacy list-style inputs (fixlen then varlen)."""
    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 1),
        VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
        VarLenSparseFeat('hist_item_gender', 3 + 1, maxlen=4, embedding_name='item_gender'),
    ]
    behavior_feature_list = ["item", "item_gender"]
    # 0 is the mask/padding value.
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),
        'item_gender': np.array([1, 2, 1]),
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
        'score': np.array([0.1, 0.2, 0.3]),
    }
    x = [feature_dict[name] for name in get_fixlen_feature_names(feature_columns)] + \
        [feature_dict[name] for name in get_varlen_feature_names(feature_columns)]
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def get_xy_fd(use_neg=False, hash_flag=False):
    """Synthetic DIEN-style sample with dict inputs and an explicit "seq_length" entry."""
    feature_columns = [
        SparseFeat('user', 3, hash_flag),
        SparseFeat('gender', 2, hash_flag),
        SparseFeat('item', 3 + 1, hash_flag),
        SparseFeat('item_gender', 2 + 1, hash_flag),
        DenseFeat('score', 1),
        VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
        VarLenSparseFeat('hist_item_gender', 3 + 1, maxlen=4, embedding_name='item_gender'),
    ]
    behavior_feature_list = ["item", "item_gender"]
    # 0 is the mask/padding value in every id array.
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),
        'item_gender': np.array([1, 2, 1]),
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
        'score': np.array([0.1, 0.2, 0.3]),
    }
    if use_neg:
        feature_dict['neg_hist_item'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_item_gender'] = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
        feature_columns += [
            VarLenSparseFeat('neg_hist_item', 3 + 1, maxlen=4, embedding_name='item'),
            VarLenSparseFeat('neg_hist_item_gender', 3 + 1, maxlen=4, embedding_name='item_gender'),
        ]
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    x["seq_length"] = np.array([3, 3, 2])  # true history length per row
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def client_restful_criteo():
    """Send one preprocessed Criteo row to a TF-Serving REST endpoint and print the response."""
    data = pd.read_csv('./data/criteo_sample.txt')
    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]
    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']
    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        data[feat] = LabelEncoder().fit_transform(data[feat])
    data[dense_features] = MinMaxScaler(feature_range=(0, 1)).fit_transform(data[dense_features])
    # 2.count #unique features for each sparse field,and record dense feature field name
    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
                              for feat in sparse_features]
    fixlen_feature_columns += [DenseFeat(feat, 1, ) for feat in dense_features]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    # Single serving instance: one {feature name -> first-row value} mapping.
    model_input = [{name: data[name].iloc[0] for name in feature_names}]
    print(model_input)
    data = json.dumps({"signature_name": "serving_default", "instances": model_input}, cls=NpEncoder)
    headers = {"content-type": "application/json"}
    json_response = requests.post('http://localhost:8501/v1/models/criteo:predict',
                                  data=data, headers=headers)
    json_response = json.loads(json_response.text)
    print(json_response)
# --- Criteo preprocessing script fragment ---
# Expects `data`, `sparse_features`, `dense_features` to be defined earlier
# in the enclosing script.
data[dense_features] = data[dense_features].fillna(0, )
target = ['label']
# 1.Label Encoding for sparse features,and do simple Transformation for dense features
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])
# 2.count #unique features for each sparse field,and record dense feature field name
fixlen_feature_columns = [
    SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
] + [DenseFeat(
    feat,
    1,
) for feat in dense_features]
# Linear and DNN sides share the same columns.
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
# 3.generate input data for model
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}
def get_xy_fd(use_neg=False, hash_flag=False):
    """Synthetic DIEN/DIN sample with a shared 'seq_length' input and optional negative history."""
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1),
        VarLenSparseFeat(SparseFeat('hist_item_id', vocabulary_size=3 + 1, embedding_dim=8,
                                    embedding_name='item_id'),
                         maxlen=4, length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4, length_name="seq_length"),
    ]
    behavior_feature_list = ["item_id", "cate_id"]
    # 0 is the mask value; "seq_length" carries the true history length per row.
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item_id': np.array([1, 2, 3]),
        'cate_id': np.array([1, 2, 2]),
        'hist_item_id': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_cate_id': np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]]),
        'pay_score': np.array([0.1, 0.2, 0.3]),
        "seq_length": np.array([3, 3, 2]),
    }
    if use_neg:
        # Negative-sampled history mirrors the positive history shapes.
        feature_dict['neg_hist_item_id'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_cate_id'] = np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]])
        feature_columns += [
            VarLenSparseFeat(SparseFeat('neg_hist_item_id', vocabulary_size=3 + 1,
                                        embedding_dim=8, embedding_name='item_id'),
                             maxlen=4, length_name="seq_length"),
            VarLenSparseFeat(SparseFeat('neg_hist_cate_id', 2 + 1, embedding_dim=4,
                                        embedding_name='cate_id'),
                             maxlen=4, length_name="seq_length"),
        ]
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def main(args, local):
    """NSML entry point: build an xDeepFM model in train mode, or restore one in test mode.

    Side effects: reads dataset files under DATASET_PATH, mutates the global
    `fixlen_feature_names_global`, and drives nsml bind/load/save/paused.
    """
    if args.arch == 'xDeepFM' and args.mode == 'train':
        s = time.time()
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data', 'train_data')
        item = pd.read_csv(csv_file,
                           dtype={
                               'article_id': str,
                               'hh': int, 'gender': str,
                               'age_range': str,
                               'read_article_ids': str
                           }, sep='\t')
        label_data_path = os.path.join(DATASET_PATH, 'train',
                                       os.path.basename(os.path.normpath(csv_file)).split('_')[0] + '_label')
        label = pd.read_csv(label_data_path, dtype={'label': int}, sep='\t')
        # NOTE(review): assigns a one-column DataFrame into a column; pandas
        # aligns on index here — confirm the two files share row order.
        item['label'] = label
        sparse_features = ['article_id', 'hh', 'gender', 'age_range', 'len_bin']
        dense_features = ['image_feature']
        target = ['label']
        # Derive a history-length feature from the comma-separated read list
        # (NaN rows, which read back as float, get length 0).
        len_lis = []
        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)
        item['len'] = len_lis
        item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')
        id_to_artic = dict()
        artics = item['article_id'].tolist()
        with open(os.path.join(DATASET_PATH, 'train', 'train_data', 'train_image_features.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)
        for feat in sparse_features:
            lbe = LabelEncoder()
            item[feat] = lbe.fit_transform(item[feat])
        fixlen_feature_columns = [SparseFeat(feat, item[feat].nunique()) for feat in sparse_features]
        # Dense dim = length of one image-feature vector.
        fixlen_feature_columns += [DenseFeat(feat, len(image_feature_dict[artics[0]])) for feat in dense_features]
        # Map encoded article index -> original article id.
        idx_artics_all = item['article_id'].tolist()
        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]
                # image features can be fetched via image_feature_dict[article_id], so skip for now
        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns
        fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
        print('---model defined---')
        # TODO: also persist the generated artifacts, since this can't be rerun every time
        print(time.time() - s, 'seconds')
    if use_nsml and args.mode == 'train':
        bind_nsml(model, [], args.task)
    if args.mode == 'test':
        print('_infer root - : ', DATASET_PATH)
        print('test')
        model, fixlen_feature_names_global, item, image_feature_dict, id_to_artic = get_item(DATASET_PATH)
        bind_nsml(model, [], args.task)
        # Hard-coded NSML checkpoint/session to restore.
        checkpoint_session = ['401', 'team_62/airush2/176']
        nsml.load(checkpoint=str(checkpoint_session[0]), session=str(checkpoint_session[1]))
        print('successfully loaded')
    if (args.mode == 'train'):
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')
        # build everything up-front; no need for a generator here
        nsml.save('infer')
        print('end')
    print('end_main')
    if args.pause:
        nsml.paused(scope=local)
def get_item(root):
    # Rebuild the xDeepFM model and preprocessing artifacts from the TEST
    # split under `root`; mirrors the training-time preprocessing in main().
    #
    # Returns: (model, fixlen_feature_names, item DataFrame,
    #           image_feature_dict, id_to_artic mapping).
    print('load')
    csv_file = os.path.join(root, 'test', 'test_data', 'test_data')
    item = pd.read_csv(csv_file,
                       dtype={
                           'article_id': str,
                           'hh': int, 'gender': str,
                           'age_range': str,
                           'read_article_ids': str
                       }, sep='\t')
    print('loaded!!')
    sparse_features = ['article_id', 'hh', 'gender', 'age_range', 'len_bin']
    dense_features = ['image_feature']
    target = ['label']
    # Reading-history length per row; missing histories arrive as float NaN.
    len_lis = []
    read_article_ids_all = item['read_article_ids'].tolist()
    for i in range(len(item)):
        li = read_article_ids_all[i]
        if type(li) == float:
            len_lis.append(0)
            continue
        len_li = len(li.split(','))
        len_lis.append(len_li)
    item['len'] = len_lis
    item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')
    id_to_artic = dict()
    artics = item['article_id'].tolist()
    # NOTE(review): uses the global DATASET_PATH rather than `root` here.
    with open(os.path.join(DATASET_PATH, 'test', 'test_data', 'test_image_features.pkl'), 'rb') as handle:
        image_feature_dict = pickle.load(handle)
    print('image_feaeture_dict loaded..')
    # NOTE(review): fits fresh LabelEncoders on TEST data, so the integer
    # codes need not match the ones used at train time — confirm intended.
    for feat in sparse_features:
        lbe = LabelEncoder()
        item[feat] = lbe.fit_transform(item[feat])
    # could also build this from the test set, or from item ...
    fixlen_feature_columns = []
    for feat in sparse_features:
        if feat == 'article_id':
            # Hard-coded train-time vocabulary for article_id so the
            # embedding shape matches the saved checkpoint.
            fixlen_feature_columns.append(SparseFeat(feat, 1896))
        else:
            fixlen_feature_columns.append(SparseFeat(feat, item[feat].nunique()))
    # fixlen_feature_columns = [SparseFeat(feat, item[feat].nunique()) for feat in sparse_features]
    fixlen_feature_columns += [DenseFeat(feat, len(image_feature_dict[artics[0]]))
                               for feat in dense_features]
    print(fixlen_feature_columns)
    # Map encoded article id -> original article id string.
    idx_artics_all = item['article_id'].tolist()
    for i in range(len(artics)):
        idx_artic = idx_artics_all[i]
        if idx_artic not in id_to_artic.keys():
            id_to_artic[idx_artic] = artics[i]
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
    fixlen_feature_names_global = fixlen_feature_names
    model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    # bind_nsml(model, list(), args.task)
    return model, fixlen_feature_names_global, item, image_feature_dict, id_to_artic
# Date-based split: rows up to 2019-07-07 train the model,
# rows from 2019-07-08 evaluate it.
train = data[data['date'] <= 20190707]
test = data[data['date'] == 20190708]

# One label array per task, in target order.
train_labels = [train[target[0]].values, train[target[1]].values]
test_labels = [test[target[0]].values, test[target[1]].values]

# Categorical fields become SparseFeat(name, vocabulary size);
# numeric fields become scalar DenseFeat columns.
sparse_feature_columns = []
for feat in sparse_features:
    sparse_feature_columns.append(SparseFeat(feat, data[feat].nunique()))
dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]

# The linear tower and the DNN tower consume the same column set.
dnn_feature_columns = sparse_feature_columns + dense_feature_columns
linear_feature_columns = sparse_feature_columns + dense_feature_columns

feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
print(feature_names)

# Model inputs are ordered by the names DeepCTR reports.
train_model_input = [train[name] for name in feature_names]
def get_input(use_img=True, use_text=True, target='isclick'):
    """Load the training table and column metadata and build DeepCTR inputs.

    Args:
        use_img: append the 128-dim image embedding block ('img_vd0..127').
        use_text: append the 128-dim text embedding block ('text_vd0..127').
        target: name of the label column.

    Returns:
        (feature_columns, train_model_input, train_labels,
         test_model_input, test_labels) where the last 1000 rows of the
        loaded table form the held-out test split.
    """
    sequence_feature_list = []
    # Column-selection files: which categorical / numeric / embedding /
    # user / item columns to use.
    sparse_feature_df = pd.read_csv(feat_columns_path + "cat_cols_selected.csv")
    dense_feature_df = pd.read_csv(feat_columns_path + "num_cols_selected.csv")
    emb_feat_list = pd.read_csv(
        feat_columns_path +
        "embeding_cols_selected.csv")["embeding_feature"].values.tolist()
    user_cols_list = pd.read_csv(
        feat_columns_path +
        "user_cols_selected.csv")["user_feature"].values.tolist()
    item_cols_list = pd.read_csv(
        feat_columns_path +
        "item_cols_selected.csv")["item_feature"].values.tolist()
    # User profile table (loaded here but not used further in this function).
    train_user_feat_df = pd.read_csv(train_path_user)
    # Item profile table (loaded here but not used further in this function).
    train_item_feat_df = pd.read_csv(train_path_item, names=item_feat_cols)
    cat_feature_list = sparse_feature_df["cat_feature"].values.tolist()
    num_feature_list = list(
        set(dense_feature_df["num_feature"].values.tolist()))
    # Category clustering has not been run yet; hard-code the categorical
    # columns for now (remove this override once clustering is available).
    cat_feature_list = ["user_id", "item_id"]
    data = pd.read_csv(feat_columns_path + "train.csv").iloc[:-1000]
    data["isclick"] = 1
    # Embedding columns: NaN -> 0, and string placeholders "nan"/"null" -> 0.
    # FIX: the original built the 'text_vd' list twice, so 'img_vd' columns
    # were never filled; build the membership set once (covers both blocks)
    # and hoist it out of the loop.
    embedding_cols = {'text_vd' + str(i) for i in range(128)} | \
                     {'img_vd' + str(i) for i in range(128)}
    for missing_col in data.columns.tolist():
        if missing_col in num_feature_list:
            # Numeric columns: impute with the column median.
            data[missing_col].fillna(data[missing_col].median(), inplace=True)
        elif missing_col in embedding_cols:
            data[missing_col].fillna(0, inplace=True)
            data[missing_col] = data[missing_col].apply(
                lambda x: 0 if x == "nan" or x == "null" else x)
    # Integer-encode all categorical columns in place.
    data[cat_feature_list] = data[cat_feature_list].apply(
        LabelEncoder().fit_transform)
    sparse_feature_list = [
        SparseFeat(cat_col, data[cat_col].nunique(), embedding_dim=10)
        for cat_col in cat_feature_list
    ]
    dense_feature_list = [
        DenseFeat(colname, 1) for colname in num_feature_list
    ]
    feature_columns = sparse_feature_list + dense_feature_list + sequence_feature_list
    # Last 1000 rows of the loaded table are held out as the test split.
    test = data.iloc[-1000:]
    train = data.iloc[:-1000]
    train_size = len(train)
    train_model_input = [train[feat.name].values for feat in sparse_feature_list] + \
                        [train[feat.name].values for feat in dense_feature_list]
    test_model_input = [test[feat.name].values for feat in sparse_feature_list] + \
                       [test[feat.name].values for feat in dense_feature_list]
    if use_img:
        ad_cols = ['img_vd' + str(i) for i in range(128)]
        img_input = data[ad_cols].values
        train_model_input += [img_input[:train_size]]
        test_model_input += [img_input[train_size:]]
    if use_text:
        vd_cols = ['text_vd' + str(i) for i in range(128)]
        text_input = data[vd_cols].values
        train_model_input += [text_input[:train_size]]
        test_model_input += [text_input[train_size:]]
    train_labels, test_labels = train[target].values, test[target].values
    return feature_columns, train_model_input, train_labels, test_model_input, test_labels
target = ['finish', 'like']

# Integer-encode each categorical field in place (one encoder per column).
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

# Rescale the numeric fields into the [0, 1] range.
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

# SparseFeat(name, vocabulary): name is the feature name and the second
# argument is the number of distinct values of that feature.
sparse_feature_columns = []
for feat in sparse_features:
    sparse_feature_columns.append(SparseFeat(feat, data[feat].nunique()))

# DenseFeat(name, 1): one float value per numeric field.
dense_feature_columns = []
for feat in dense_features:
    dense_feature_columns.append(DenseFeat(feat, 1))

# Both the linear part and the DNN part see the same columns.
dnn_feature_columns = sparse_feature_columns + dense_feature_columns
linear_feature_columns = sparse_feature_columns + dense_feature_columns

# Ordered list of input names, e.g. ['feature1', 'feature2', ...].
feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)

# Hold out 10% of the rows for evaluation.
train, test = train_test_split(data, test_size=0.1)
train_model_input = [train[name] for name in feature_names]
test_model_input = [test[name] for name in feature_names]

features = build_input_features(linear_feature_columns + dnn_feature_columns)
# NOTE(review): truncated fragment — the loop header that binds `feat` and
# fits `lbe` on the train split is cut off above this line.
test[feat] = lbe.transform(test[feat])
mms = MinMaxScaler(feature_range=(0, 1))
# Fit the scaler on train only, then transform train in place
# (test-side scaling is not visible in this fragment).
mms.fit(train[dense_features])
train[dense_features] = mms.transform(train[dense_features])
# preprocess the sequence feature
genres_key2index, train_genres_list, genres_maxlen = get_var_feature(
    train, 'genres')
user_key2index, train_user_hist, user_maxlen = get_var_feature(
    train, 'user_hist')
# User tower: 4-dim embedding per categorical field plus scalar dense fields.
user_feature_columns = [
    SparseFeat(feat, data[feat].nunique(), embedding_dim=4)
    for i, feat in enumerate(user_sparse_features)
] + [DenseFeat(
    feat,
    1,
) for feat in user_dense_features]
# Item tower: same layout, but sparse fields use hashed lookup.
item_feature_columns = [
    SparseFeat(feat, data[feat].nunique(), embedding_dim=4, use_hash=True)
    for i, feat in enumerate(item_sparse_features)
] + [DenseFeat(
    feat,
    1,
) for feat in item_dense_features]
# Variable-length 'genres' feature, mean-pooled over the sequence.
# NOTE(review): the call below is truncated — its closing arguments and
# bracket are cut off at the end of this fragment.
item_varlen_feature_columns = [
    VarLenSparseFeat(SparseFeat('genres',
                                vocabulary_size=1000,
                                embedding_dim=4),
                     maxlen=genres_maxlen,
                     combiner='mean',
import numpy as np
from deepctr.models import DIN
from deepctr.inputs import SparseFeat, VarLenSparseFeat, DenseFeat, get_fixlen_feature_names, get_varlen_feature_names

# Fixed-length feature columns: (name, vocabulary size). The 'item' and
# 'item_gender' vocabularies reserve index 0 as the mask/padding value.
feature_columns = [SparseFeat('user', 3),
                   SparseFeat('gender', 2),
                   SparseFeat('item', 3 + 1),
                   SparseFeat('item_gender', 2 + 1),
                   DenseFeat('score', 1)]
# Behavior-history columns share the embedding tables of their base features
# via embedding_name. FIX: 'hist_item_gender' must declare the same
# vocabulary as the 'item_gender' table it shares (2 + 1, not 3 + 1) —
# gender ids only go up to 2, and the shared table has 2 + 1 rows.
feature_columns += [VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
                    VarLenSparseFeat('hist_item_gender', 2 + 1, maxlen=4, embedding_name='item_gender')]
behavior_feature_list = ["item", "item_gender"]

uid = np.array([0, 1, 2])
ugender = np.array([0, 1, 0])
iid = np.array([1, 2, 3])  # 0 is mask value
igender = np.array([1, 2, 1])  # 0 is mask value
score = np.array([0.1, 0.2, 0.3])
# Behavior histories, right-padded with 0 (the mask value).
hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

feature_dict = {'user': uid, 'gender': ugender, 'item': iid,
                'item_gender': igender, 'hist_item': hist_iid,
                'hist_item_gender': hist_igender, 'score': score}

# Assemble inputs in the order DeepCTR expects: fixed-length names first,
# then variable-length names.
fixlen_feature_names = get_fixlen_feature_names(feature_columns)
varlen_feature_names = get_varlen_feature_names(feature_columns)
x = [feature_dict[name] for name in fixlen_feature_names] + \
    [feature_dict[name] for name in varlen_feature_names]
y = [1, 0, 1]

# Train DIN on the toy batch; hist_len_max caps the attended history length.
model = DIN(feature_columns, behavior_feature_list, hist_len_max=4, )
model.compile('adam', 'binary_crossentropy',
              metrics=['binary_crossentropy'])
history = model.fit(x, y, verbose=1, epochs=10, validation_split=0.5)
# NOTE(review): truncated fragment — the `for feat in user_sparse_features:`
# loop header and its `if feat != "user_id":` test are cut off above.
        feature_list.append(
            SparseFeat(feat, users_feature[feat].nunique() + 1))
    else:
        # user_id itself: vocabulary is the number of unique user ids.
        # NOTE(review): no +1 here, unlike the item_id branch below — confirm
        # whether a padding/unknown slot was intended for user_id as well.
        feature_list.append(SparseFeat(feat, len(user_id_unique)))
# Item-side categorical features; +1 reserves an extra vocabulary slot
# (presumably for padding/unknown — confirm against the encoder).
for feat in item_sparse_features:
    if feat != "item_id":
        feature_list.append(
            SparseFeat(feat, items_feature[feat].nunique() + 1))
    else:
        feature_list.append(
            SparseFeat(feat, len(item_id_unique) + 1))
# Scalar dense columns.
dense_feature_list = [
    DenseFeat(feat, 1) for feat in dense_features
]
# Variable-length categorical features, capped at 10 entries each.
varLen_sparse_feature_list = [
    VarLenSparseFeat(feat, 11, maxlen=10) for feat in varLen_sparse_features
]
# Session/behavior-history columns share the item_id embedding table.
sess_sparse_feature_list = [
    VarLenSparseFeat(feat,
                     len(item_id_unique) + 1,
                     maxlen=DIN_SESS_MAX_LEN,
                     embedding_name='item_id') for feat in hist_feature
]
# Date-based split: rows up to 2019-07-07 train, 2019-07-08 tests.
train = data[data['date'] <= 20190707]
test = data[data['date'] == 20190708]

# Region id per row, kept alongside the task labels.
train_y_id = train['g_region_id'].values
test_y_id = test['g_region_id'].values

# One label array per task, in target order.
train_labels = [train[target[0]].values, train[target[1]].values]
test_labels = [test[target[0]].values, test[target[1]].values]

# Categorical -> SparseFeat(name, vocabulary size); numeric and gate
# fields -> scalar DenseFeat columns.
sparse_feature_columns = []
for feat in sparse_features:
    sparse_feature_columns.append(SparseFeat(feat, data[feat].nunique()))
dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]
gate_feature_columns = [DenseFeat(feat, 1) for feat in gate_features]

# Linear and DNN towers share the same columns; the gate columns are
# only appended when resolving the ordered input names.
dnn_feature_columns = sparse_feature_columns + dense_feature_columns
linear_feature_columns = sparse_feature_columns + dense_feature_columns
feature_names = get_fixlen_feature_names(linear_feature_columns +
                                         dnn_feature_columns +
                                         gate_feature_columns)
def run(data, ziel, line0, grid, loop):
    """Bin the target column `ziel` into classes, train every model from a
    grid search, and report averaged metrics.

    Args:
        data: input DataFrame; columns include the administrative-level
            names below plus the raw POI columns listed in the global
            `poi_feature`.
        ziel: name of the column to discretize and predict (e.g. income).
        line0: list of interior bin boundaries for the target.
        grid: hyper-parameter grid forwarded to model_gridsearch().
        loop: number of training repetitions to average per model.

    Returns:
        (logloss, auc1, xlabel): per-model mean log-loss, mean AUC, and
        the grid labels produced by model_gridsearch().
    """
    # Rename raw POI columns to poi_feature_0..N-1 so downstream code can
    # address them uniformly.
    poi_feature_transfer = []
    print('++++', '\n', grid)
    for a in range(len(poi_feature)):
        poi_feature_transfer.append('poi_feature_%d' % a)
        data = data.rename(columns={poi_feature[a]: 'poi_feature_%d' % a})
    features = [
        'provname', 'prefname', 'cntyname', 'townname', 'villname',
        'dispincm', 'urbcode_1', 'hauslvl'
    ] + poi_feature_transfer
    # Split columns into sparse vs dense using the global x_category map:
    # unknown columns, or columns with category count 1, count as dense.
    sparse_features = []
    dense_features = []
    for f in features:
        if f not in x_category or x_category[f] == 1:
            dense_features.append(f)
        else:
            sparse_features.append(f)
    data[sparse_features] = data[sparse_features].fillna(-1)
    data[dense_features] = data[dense_features].fillna(0)
    y = []
    # ziel = villmean / income: bucket the target using edges
    # (min-1, *line0, max]; the class label is the bin index.
    y_limit = [np.min(data[ziel]) - 1] + line0 + [np.max(data[ziel])]
    for index, row in data.iterrows():
        for i in range(1, len(y_limit)):
            if y_limit[i - 1] < row[ziel] <= y_limit[i]:
                y.append(i - 1)
                break
    # NOTE(review): a row matching no interval appends nothing, which would
    # desynchronize y from data; the min-1/max edges should prevent that.
    data['income_0'] = y
    target = ['income_0']
    # 1. Label Encoding for sparse features, and simple transformation for
    #    dense features.
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])
    # 2. Count unique values per sparse field and record dense field names.
    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique()) for feat in sparse_features] + \
                             [DenseFeat(feat, 1,) for feat in dense_features]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
    # 3. Generate input data for the model.
    train, test = train_test_split(data, test_size=0.2)
    # try to oversampling
    # (train_x,train_y)=over_sampling(train[features],train[ziel], 3)
    # train = (np.column_stack((train_x, train_y)))
    train_model_input = [train[name] for name in fixlen_feature_names]
    test_model_input = [test[name] for name in fixlen_feature_names]
    # 4. Define Model, train, predict and evaluate.
    ##############################################
    (models, model_names, xlabel) = model_gridsearch(linear_feature_columns,
                                                     dnn_feature_columns,
                                                     grid)
    # Only logloss and auc1 are filled below; acc1/pre1/recall1/f11 stay empty.
    logloss, auc1, acc1, pre1, recall1, f11 = [], [], [], [], [], []
    print(ziel, line0, len(data))
    for name, model in zip(model_names, models):
        ll_avg, auc_avg = [], []
        # Repeat training `loop` times and average the metrics.
        for i in range(loop):
            model.compile("adam",
                          'binary_crossentropy',
                          metrics=['binary_crossentropy'])
            history = model.fit(
                train_model_input,
                train[target].values,
                batch_size=256,
                epochs=10,
                verbose=0,
                validation_split=0.2,
            )
            pred_ans = model.predict(test_model_input, batch_size=256)
            true = test[target].values
            '''
            f = open("pred.csv", 'a', encoding='utf_8_sig')
            f.write('%s\n'%(ziel))
            for i in range(len(pred_ans)):
                f.write('%s, %s\n' % (pred_ans[i],true[i] ))
            f.close()'''
            ll = round(log_loss(test[target].values, pred_ans), 4)
            auc = round(roc_auc_score(test[target].values, pred_ans), 4)
            #acc = round(accuracy_score(test[target].values, pred_ans.round()), 4)
            #pre = round(precision_score(test[target].values, pred_ans.round()), 4)
            #recall = round(recall_score(test[target].values, pred_ans.round()), 4)
            #f1 = round(f1_score(test[target].values, pred_ans.round(), average='weighted'),4)
            #spec = round(specificity_score(test[target].values, pred_ans.round(), average='weighted'),4)
            #sens = round(sensitivity_score(test[target].values, pred_ans.round(), average='weighted'),4)
            ll_avg.append(ll), auc_avg.append(auc)
        # Mean over the repetitions for this model.
        logloss.append(np.mean(ll_avg)), auc1.append(
            np.mean(auc_avg)
        )  #, acc1.append(acc), pre1.append(pre), recall1.append(recall), f11.append(f1)
        '''
        cm = confusion_matrix(test[target].values, pred_ans.round())
        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        cm = []
        for m in range(len(line0)+1):
            cm.append([])
            for n in range(len(line0)+1):
                cm[m].append(round(cm_normalized[m][n],4))
        '''
        '''
        print(name)
        print("LogLoss", ll, end=' ')
        print("AUC", auc, end=' ')
        print("accuracy", acc, end=' ')
        #print("precision" , pre, end=' ')
        #print("recall", recall, end=' ')
        print("f1" , f1, end=' ')
        print("spec", spec, end=' ')
        print("sens" , sens, end=' ')
        print(cm)
        #f = open("DeepFM.csv", 'a', encoding='utf_8_sig')
        #f.write('%s,%s\n'%(ziel,line0))
        #f.write('%s, %s, %s, %s, %s, %s, %s,' % (name, ll, auc, acc, f1, spec, sens))
        #f.write('%s\n' % str(cm).replace(',',';'))
        #f.close()
        '''
    return (logloss, auc1, xlabel)