def test_long_dense_vector():
    # Build the feature columns
    feature_columns = [
        SparseFeat('user_id', 4),
        SparseFeat('item_id', 5),
        DenseFeat("pic_vec", 5)
    ]
    fixlen_feature_names = get_feature_names(feature_columns)

    # Build the samples
    user_id = np.array([[1], [0], [1]])
    item_id = np.array([[3], [2], [1]])
    pic_vec = np.array([[0.1, 0.5, 0.4, 0.3, 0.2],
                        [0.1, 0.5, 0.4, 0.3, 0.2],
                        [0.1, 0.5, 0.4, 0.3, 0.2]])
    label = np.array([1, 0, 1])

    input_dict = {'user_id': user_id, 'item_id': item_id, 'pic_vec': pic_vec}
    model_input = [input_dict[name] for name in fixlen_feature_names]

    # Create the model
    model = DeepFM(feature_columns, feature_columns[:-1])
    # model.summary()
    # tf.keras.utils.plot_model(model, "test_compu")

    # Train the model
    model.compile('adagrad', 'binary_crossentropy')
    model.fit(model_input, label)
def make_feature_cols(self, dataset, embedding_dim):
    '''Return DeepCTR feature columns.

    Parameters
    ----------
    dataset : A dataset instance.
    embedding_dim : Embedding dimension for the sparse features.

    Returns
    -------
    dnn_feature_columns : A list of feature_column instances for the DNN inputs.
    linear_feature_columns : A list of feature_column instances for the linear inputs.
    '''
    fixlen_feature_columns = [
        SparseFeat(feat, vocabulary_size=dataset.nunique[feat],
                   embedding_dim=embedding_dim)
        for feat in dataset.sparse_features
    ]
    fixlen_feature_columns += [
        DenseFeat(feat, 1) for feat in dataset.dense_features
    ]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    return dnn_feature_columns, linear_feature_columns
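# A minimal usage sketch for make_feature_cols. The names `builder` (an
# instance of the owning class) and `dataset` (any object exposing `nunique`,
# `sparse_features` and `dense_features` as consumed above) are hypothetical.
dnn_cols, linear_cols = builder.make_feature_cols(dataset, embedding_dim=8)
model = DeepFM(linear_cols, dnn_cols, task='binary')
model.compile('adam', 'binary_crossentropy')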
def get_xy_random2(X, y, cols_family={}):
    ## PREPROCESSING STEPS
    # Column names must be strings for the pandas DataFrame
    cols = [str(i) for i in range(X.shape[1])]
    # Convert into a DataFrame, following the steps from the documentation
    data = pd.DataFrame(X, columns=cols)

    # Define which feature columns are sparse and which are dense
    cols_sparse_features = cols_family['colsparse']
    cols_dense_features = cols_family['coldense']

    # Wrap each feature as a SparseFeat or DenseFeat, as the DeepCTR library expects
    sparse_feat_l = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in cols_sparse_features
    ]
    dense_feat_l = [DenseFeat(feat, dimension=1) for feat in cols_dense_features]
    feature_col = sparse_feat_l + dense_feat_l
    linear_feat_col = feature_col  # all features used by the linear part of the model
    dnn_feat_col = feature_col     # all features used by the deep part of the model
    feature_names = get_feature_names(linear_feat_col + dnn_feat_col)

    train_model_input = {name: data[name] for name in feature_names}
    X_train, y_train = train_model_input, y.values
    return X_train, y_train, linear_feat_col, dnn_feat_col
def get_xy_fd(hash_flag=False):
    feature_columns = [SparseFeat('user', 3, embedding_dim=10),
                       SparseFeat('gender', 2, embedding_dim=4),
                       SparseFeat('item_id', 3 + 1, embedding_dim=8),
                       SparseFeat('cate_id', 2 + 1, embedding_dim=4),
                       DenseFeat('pay_score', 1)]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id', vocabulary_size=3 + 1,
                                    embedding_dim=8, embedding_name='item_id'),
                         maxlen=4, length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4, length_name="seq_length")]

    # Notice: history behavior sequence feature names must start with "hist_".
    behavior_feature_list = ["item_id", "cate_id"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is the mask value
    cate_id = np.array([1, 2, 2])  # 0 is the mask value
    pay_score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [3, 2, 1, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 2, 2, 0], [2, 2, 1, 0], [1, 2, 0, 0]])
    seq_length = np.array([3, 3, 2])  # the actual length of each behavior sequence

    feature_dict = {'user': uid, 'gender': ugender, 'item_id': iid,
                    'cate_id': cate_id, 'hist_item_id': hist_iid,
                    'hist_cate_id': hist_cate_id, 'pay_score': pay_score,
                    'seq_length': seq_length}
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
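# Hedged usage sketch: the behavior_feature_list return value and the "hist_"
# naming convention suggest a sequence model such as DIN; this pairing is an
# assumption, not part of the original snippet.
from deepctr.models import DIN

x, y, feature_columns, behavior_feature_list = get_xy_fd()
model = DIN(feature_columns, behavior_feature_list)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
model.fit(x, y, verbose=1, epochs=1)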
def get_mtl_test_data(sample_size=10, embedding_size=4, sparse_feature_num=1,
                      dense_feature_num=1, task_types=('binary', 'binary'),
                      hash_flag=False, prefix='', use_group=False):
    feature_columns = []
    model_input = {}

    for i in range(sparse_feature_num):
        if use_group:
            group_name = str(i % 3)
        else:
            group_name = DEFAULT_GROUP_NAME
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, embedding_size,
                       use_hash=hash_flag, dtype=tf.int32, group_name=group_name))

    for i in range(dense_feature_num):
        def transform_fn(x):
            return (x - 0.0) / 1.0

        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, dtype=tf.float32,
                      transform_fn=transform_fn))

    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input[fc.name] = np.random.randint(0, fc.vocabulary_size, sample_size)
        elif isinstance(fc, DenseFeat):
            model_input[fc.name] = np.random.random(sample_size)

    y_list = []  # multi-label targets, one per task
    for task in task_types:
        if task == 'binary':
            y = np.random.randint(0, 2, sample_size)
        else:
            y = np.random.random(sample_size)
        y_list.append(y)

    return model_input, y_list, feature_columns
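# Hedged sketch: feeding the generated multi-task data into MMOE. MMOE is one
# possible consumer (an assumption); any DeepCTR multi-task model with matching
# task_types would be driven the same way. The import path may differ across
# deepctr versions (deepctr.models vs deepctr.models.multitask).
from deepctr.models import MMOE

model_input, y_list, feature_columns = get_mtl_test_data(sparse_feature_num=3,
                                                         dense_feature_num=3)
model = MMOE(feature_columns, task_types=('binary', 'binary'),
             task_names=('task1', 'task2'))
model.compile('adam', loss=['binary_crossentropy', 'binary_crossentropy'])
model.fit(model_input, y_list, epochs=1)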
def get_xy_random():
    X = np.random.rand(100, 30)
    y = np.random.binomial(n=1, p=0.5, size=[100])

    ## PREPROCESSING STEPS
    # Column names must be strings for the pandas DataFrame
    cols = [str(i) for i in range(X.shape[1])]
    # Convert into a DataFrame, following the steps from the documentation
    data = pd.DataFrame(X, columns=cols)
    data['y'] = y

    # Define which feature columns are sparse and which are dense.
    # Since all of our data consists of dense features, the sparse feature list is empty.
    cols_sparse_features = []
    cols_dense_features = [str(i) for i in range(X.shape[1])]

    # Wrap each feature as a SparseFeat or DenseFeat, as the DeepCTR library expects
    sparse_feat_l = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in cols_sparse_features
    ]
    dense_feat_l = [DenseFeat(feat, dimension=1) for feat in cols_dense_features]
    feature_col = sparse_feat_l + dense_feat_l
    linear_feat_col = feature_col  # all features used by the linear part of the model
    dnn_feat_col = feature_col     # all features used by the deep part of the model
    feature_names = get_feature_names(linear_feat_col + dnn_feat_col)

    train_full, test = train_test_split(data, random_state=2021, stratify=data['y'])
    train, val = train_test_split(train_full, random_state=2021,
                                  stratify=train_full['y'])
    train_model_input = {name: train[name] for name in feature_names}
    val_model_input = {name: val[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    target = 'y'
    ## END OF PREPROCESSING STEPS

    X_train, y_train = train_model_input, train[target].values
    X_val, y_val = val_model_input, val[target].values
    X_test, y_test = test_model_input, test[target].values
    return X_train, X_val, X_test, y_train, y_val, y_test, linear_feat_col, dnn_feat_col
def fit(self, X, y):
    X_ = X.copy()
    self.dense_features = list(X_.columns.difference(self.cat_features))

    logger.debug("MinMaxScaler")
    self.min_max_scaler.fit(X_[self.dense_features])
    X_[self.dense_features] = self.min_max_scaler.transform(X_[self.dense_features])

    self._column_mapping(X_)
    X_.columns = [self.columns_mapping[col] for col in X_.columns]

    self.fixlen_feature_columns = [
        SparseFeat(self.columns_mapping[feat],
                   vocabulary_size=X_[self.columns_mapping[feat]].max() + 1,
                   embedding_dim=4)
        for feat in self.cat_features
    ] + [
        DenseFeat(self.columns_mapping[feat], 1)
        for feat in self.dense_features
    ]
    self.feature_names = get_feature_names(self.fixlen_feature_columns)

    logger.debug("Compile DeepFM model")
    self.model = DeepFM(self.fixlen_feature_columns, self.fixlen_feature_columns,
                        task="binary")
    self.model.compile("adam", "binary_crossentropy",
                       metrics=["binary_crossentropy"])

    logger.debug("Fit DeepFM")
    train_model_input = {name: X_[name].values for name in self.feature_names}
    self.model.fit(train_model_input, y, batch_size=256, epochs=3, verbose=2,
                   validation_split=0.2)
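# Hedged sketch of the inference counterpart to fit() above. It assumes that
# prediction must mirror the training preprocessing (scaling plus column
# mapping); the method name and batch size are illustrative, not from the
# original class.
def predict_proba(self, X):
    X_ = X.copy()
    X_[self.dense_features] = self.min_max_scaler.transform(X_[self.dense_features])
    X_.columns = [self.columns_mapping[col] for col in X_.columns]
    model_input = {name: X_[name].values for name in self.feature_names}
    return self.model.predict(model_input, batch_size=256)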
def get_xy_fd():
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item_id', 3 + 1, embedding_dim=8),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4),
        DenseFeat('pay_score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id', vocabulary_size=3 + 1,
                                    embedding_dim=8, embedding_name='item_id'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4)
    ]

    # Base sparse features referenced by the variable-length features
    behavior_feature_list = ["item_id", "cate_id"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is the mask value
    cate_id = np.array([1, 2, 2])  # 0 is the mask value
    pay_score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [3, 2, 1, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 2, 2, 0], [2, 2, 1, 0], [1, 2, 0, 0]])

    # Map feature names to the input data
    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item_id': iid,
        'cate_id': cate_id,
        'hist_item_id': hist_iid,
        'hist_cate_id': hist_cate_id,
        'pay_score': pay_score
    }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
def run_base_experiment(data_path, dataset_type, model_params, model_type, opt):
    if dataset_type == 'critero':
        data_df, sparse_features, dense_features, target = load_citero_dataset(data_path)
    else:
        data_df, sparse_features, dense_features, target = load_taboola_dataset(data_path)

    data_df = prepare_data_for_train(data_df, sparse_features, dense_features)
    fixlen_feature_columns = [
        SparseFeat(feat, vocabulary_size=data_df[feat].nunique(), embedding_dim=10)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3. Generate input data for the model
    train, test = train_test_split(data_df, test_size=0.2)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    batch_size = 1024

    # 4. Define the model, then train, predict and evaluate
    model = model_type(linear_feature_columns, dnn_feature_columns, seed=1024,
                       **model_params)
    model.compile(optimizer=opt, loss="binary_crossentropy",
                  metrics=['binary_crossentropy', 'accuracy'])
    history = model.fit(train_model_input, train[target].values,
                        batch_size=batch_size, epochs=10, verbose=1,
                        validation_split=0.2)

    pred_ans = model.predict(test_model_input, batch_size=batch_size)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
def get_xy_fd(hash_flag=False):
    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item', vocabulary_size=3 + 1,
                                    embedding_dim=8, embedding_name='item'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_item_gender', 2 + 1, embedding_dim=4,
                                    embedding_name='item_gender'),
                         maxlen=4)
    ]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is the mask value
    igender = np.array([1, 2, 1])  # 0 is the mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid,
                    'item_gender': igender, 'hist_item': hist_iid,
                    'hist_item_gender': hist_igender, 'score': score}

    feature_names = get_feature_names(feature_columns)
    x = {name: feature_dict[name] for name in feature_names}
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def read_data_as_model():
    data = pd.read_csv('GiveMeSomeCredit/cs-training.csv')
    sparse_features = [
        'NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTimes90DaysLate',
        'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents'
    ]
    dense_features = [
        'RevolvingUtilizationOfUnsecuredLines', 'age', 'DebtRatio',
        'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans',
        'NumberRealEstateLoansOrLines'
    ]

    data[sparse_features] = data[sparse_features].fillna(-1)
    data[dense_features] = data[dense_features].fillna(-1)
    target = ['SeriousDlqin2yrs']

    # 1. Label-encode the sparse features and min-max scale the dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count the unique values of each sparse field and record the dense field names
    fixlen_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3. Generate input data for the model
    train, test = train_test_split(data, test_size=0.2, random_state=1234)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    return (train, test, train_model_input, test_model_input,
            dnn_feature_columns, linear_feature_columns, feature_names, target)
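# Hedged follow-up: wiring read_data_as_model() into the usual DeepFM
# train/evaluate loop seen elsewhere in this collection (batch size and epoch
# count are assumptions).
from deepctr.models import DeepFM
from sklearn.metrics import roc_auc_score

(train, test, train_model_input, test_model_input,
 dnn_feature_columns, linear_feature_columns, feature_names, target) = read_data_as_model()
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
model.fit(train_model_input, train[target].values, batch_size=256, epochs=3,
          validation_split=0.2)
pred_ans = model.predict(test_model_input, batch_size=256)
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))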
def train_youtube_model(train_model_input, train_label, embedding_dim,
                        feature_max_idx, his_seq_maxlen, batch_size, epochs,
                        verbose, validation_split):
    """Build the YoutubeDNN model and train it."""
    # Wrap the features
    user_feature_columns = [
        SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
        VarLenSparseFeat(
            SparseFeat('hist_doc_ids', feature_max_idx['article_id'],
                       embedding_dim, embedding_name="click_doc_id"),
            his_seq_maxlen, 'mean', 'hist_len'),
        SparseFeat('u_city', feature_max_idx['city'], embedding_dim),
        SparseFeat('u_age', feature_max_idx['age'], embedding_dim),
        SparseFeat('u_gender', feature_max_idx['gender'], embedding_dim),
        DenseFeat('u_example_age', 1),
    ]
    doc_feature_columns = [
        SparseFeat('click_doc_id', feature_max_idx['article_id'], embedding_dim)
        # Article category/profile features could also be added here later
    ]

    # Define the model
    model = YoutubeDNN(user_feature_columns, doc_feature_columns, num_sampled=5,
                       user_dnn_hidden_units=(64, embedding_dim))
    # Compile the model
    model.compile(optimizer="adam", loss=sampledsoftmaxloss)

    # Train the model; the validation split can be set here, and 0 means
    # training directly on the full data with no validation set.
    history = model.fit(train_model_input, train_label, batch_size=batch_size,
                        epochs=epochs, verbose=verbose,
                        validation_split=validation_split)

    return model
def custom_model():
    sparse_features = ["C" + str(i) for i in range(1, 27)]
    dense_features = ["I" + str(i) for i in range(1, 14)]

    fixlen_feature_columns = [
        SparseFeat(feat, vocabulary_size=10000, embedding_dim=4,
                   dtype="string", use_hash=True)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]

    model = WDL(fixlen_feature_columns, fixlen_feature_columns, task="binary")
    return model
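# Hedged usage sketch: because the sparse columns are declared with
# dtype="string" and use_hash=True, raw string values can be fed directly
# without label encoding. The toy data below is illustrative only.
import numpy as np

model = custom_model()
model.compile('adam', 'binary_crossentropy')
toy_input = {"C" + str(i): np.array(['a', 'b']) for i in range(1, 27)}
toy_input.update({"I" + str(i): np.array([0.1, 0.2]) for i in range(1, 14)})
model.fit(toy_input, np.array([1, 0]), epochs=1)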
def _build_model(self):
    to_drop = config.Keywords_Categories[self.params['category']]
    self._build_category_dict(drop_categories=to_drop)
    attrs_matrix, attrs_max_len = self._get_category_matrix(self.data)

    vars_fixlen = [SparseFeat(var, self.data[var].nunique(), embedding_dim=4)
                   for var in self.features_sparse]
    vars_fixlen += [DenseFeat(var, 1) for var in self.features_dense]
    vars_varlen = [VarLenSparseFeat(
        SparseFeat('categories', vocabulary_size=len(self.attr2index) + 1,
                   embedding_dim=4),
        maxlen=attrs_max_len, combiner='mean',
        weight_name='attrs_weight' if self.params['weight'] else None)]

    self.features_linear = vars_fixlen + vars_varlen
    self.features_dnn = vars_fixlen + vars_varlen
    self.model = DeepFM(self.features_linear, self.features_dnn,
                        task='regression', **self.params_deepfm)
    return attrs_matrix, attrs_max_len
def get_xy_fd(use_neg=False, hash_flag=False):
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id', vocabulary_size=3 + 1,
                                    embedding_dim=8, embedding_name='item_id'),
                         maxlen=4, length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4, length_name="seq_length")
    ]

    behavior_feature_list = ["item_id", "cate_id"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is the mask value
    cate_id = np.array([1, 2, 2])  # 0 is the mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]])
    behavior_length = np.array([3, 3, 2])

    feature_dict = {'user': uid, 'gender': ugender, 'item_id': iid,
                    'cate_id': cate_id, 'hist_item_id': hist_iid,
                    'hist_cate_id': hist_cate_id, 'pay_score': score,
                    "seq_length": behavior_length}

    if use_neg:
        feature_dict['neg_hist_item_id'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_cate_id'] = np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]])
        feature_columns += [
            VarLenSparseFeat(SparseFeat('neg_hist_item_id', vocabulary_size=3 + 1,
                                        embedding_dim=8, embedding_name='item_id'),
                             maxlen=4, length_name="seq_length"),
            VarLenSparseFeat(SparseFeat('neg_hist_cate_id', 2 + 1, embedding_dim=4,
                                        embedding_name='cate_id'),
                             maxlen=4, length_name="seq_length")
        ]

    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
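# Hedged usage sketch: the neg_hist_* features and seq_length produced with
# use_neg=True match DIEN's negative-sampling input format, so DIEN is assumed
# here; it is not named in the original snippet.
from deepctr.models import DIEN

x, y, feature_columns, behavior_feature_list = get_xy_fd(use_neg=True)
model = DIEN(feature_columns, behavior_feature_list,
             gru_type="AUGRU", use_negsampling=True)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
model.fit(x, y, verbose=1, epochs=1)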
size_dict = {'Medical Claims Features': 2,
             'Condition Related Features': 2,
             'Lab Claims Features': 2,
             'Pharmacy Claims Features': 2,
             'CMS Features': 2,
             'Demographics': 16,
             'Other features': 16}

dense_features = set(train_data.columns) - set(sparse_features) \
    - set(['transportation_issues', 'person_id_syn'])
dense_features = list(dense_features)
target = ['transportation_issues']

fixlen_feature_columns = [SparseFeat(feat,
                                     vocabulary_size=train_data[feat].nunique(),
                                     embedding_dim=size_dict[field_info[feat]],
                                     dtype='int32',
                                     group_name=field_info[feat])
                          for feat in sparse_features] \
    + [DenseFeat(feat, 1) for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

train, test = train_test_split(train_data, test_size=0.2, random_state=2020)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary',
               dnn_hidden_units=(2, 256), dnn_dropout=0.0)
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss="binary_crossentropy", metrics=['binary_crossentropy'],
              optimizer=opt)
history = model.fit(train_model_input, train[target].values, batch_size=64,
                    epochs=10, verbose=1, validation_split=0.2,
                    class_weight={0: 1, 1: 3})
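# Hedged evaluation sketch (not in the original script): scoring the held-out
# split with the log-loss/AUC pattern used elsewhere in this collection; the
# batch size is an assumption.
from sklearn.metrics import log_loss, roc_auc_score

pred_ans = model.predict(test_model_input, batch_size=256)
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))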
if num > 10000:
    dim = 10
elif num > 1000:
    dim = 8
else:
    dim = 4

if column == 'user_id':
    feature_columns += [SparseFeat(column, 212062 + 1, embedding_dim=dim)]
elif column == 'merchant_id':
    feature_columns += [SparseFeat(column, 1993 + 1, embedding_dim=dim)]
elif column == 'action_type':
    feature_columns += [SparseFeat(column, 4 + 1, embedding_dim=dim)]
else:
    feature_columns += [DenseFeat(column, 1)]

# maxlen is the length of the history sequence; vocabulary_size is the one-hot dimension
feature_columns += [
    VarLenSparseFeat(sparsefeat=SparseFeat('hist_merchant_id',
                                           vocabulary_size=1993,
                                           embedding_dim=8,
                                           embedding_name='merchant_id'),
                     maxlen=M),
    VarLenSparseFeat(sparsefeat=SparseFeat('hist_action_type',
                                           vocabulary_size=4,
                                           embedding_dim=4,
                                           embedding_name='action_type'),
                     maxlen=M)]
history_features = ['merchant_id', 'action_type']
print(len(feature_columns))

# Use the DIN model
model = DIN(feature_columns, history_features)
# Use the Adam optimizer with binary cross-entropy loss
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
# model.compile(loss=[binary_focal_loss(alpha=.25, gamma=2)], metrics=["accuracy"])
def get_xy_fd(hash_flag=False):
    feature_columns = [
        SparseFeat('user', 3, use_hash=hash_flag),
        SparseFeat('gender', 2, use_hash=hash_flag),
        SparseFeat('item', 3 + 1, use_hash=hash_flag),
        SparseFeat('item_gender', 2 + 1, use_hash=hash_flag),
        DenseFeat('score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('sess_0_item', 3 + 1, embedding_dim=4,
                                    use_hash=hash_flag, embedding_name='item'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('sess_0_item_gender', 2 + 1, embedding_dim=4,
                                    use_hash=hash_flag,
                                    embedding_name='item_gender'),
                         maxlen=4)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('sess_1_item', 3 + 1, embedding_dim=4,
                                    use_hash=hash_flag, embedding_name='item'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('sess_1_item_gender', 2 + 1, embedding_dim=4,
                                    use_hash=hash_flag,
                                    embedding_name='item_gender'),
                         maxlen=4)
    ]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is the mask value
    igender = np.array([1, 2, 1])  # 0 is the mask value
    score = np.array([0.1, 0.2, 0.3])

    sess1_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]])
    sess1_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]])
    sess2_iid = np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    sess2_igender = np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    sess_number = np.array([2, 1, 0])

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid,
                    'item_gender': igender,
                    'sess_0_item': sess1_iid, 'sess_0_item_gender': sess1_igender,
                    'sess_1_item': sess2_iid, 'sess_1_item_gender': sess2_igender,
                    'score': score}

    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    x["sess_length"] = sess_number
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def get_xy_dataset(data_sample=None):
    if data_sample == "avazu":
        df = pd.read_csv(
            'https://raw.githubusercontent.com/shenweichen/DeepCTR/master/examples/avazu_sample.txt')
        df['day'] = df['hour'].apply(lambda x: str(x)[4:6])
        df['hour'] = df['hour'].apply(lambda x: str(x)[6:])

        sparse_features = ['hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
                           'site_category', 'app_id', 'app_domain', 'app_category',
                           'device_id', 'device_model', 'device_type',
                           'device_conn_type',  # 'device_ip',
                           'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
        df[sparse_features] = df[sparse_features].fillna('-1')
        target = ['click']

        # 1. Label-encode the sparse features
        for feat in sparse_features:
            lbe = LabelEncoder()
            df[feat] = lbe.fit_transform(df[feat])

        # 2. Count the unique values of each sparse field and record the field groups
        field_info = dict(C14='user', C15='user', C16='user', C17='user',
                          C18='user', C19='user', C20='user', C21='user',
                          C1='user', banner_pos='context', site_id='context',
                          site_domain='context', site_category='context',
                          app_id='item', app_domain='item', app_category='item',
                          device_model='user', device_type='user',
                          device_conn_type='context', hour='context',
                          device_id='user')
        fixlen_feat_col = [
            SparseFeat(name, vocabulary_size=df[name].nunique(), embedding_dim=16,
                       use_hash=False, dtype='int32', group_name=field_info[name])
            for name in sparse_features
        ]
        dnn_feat_col = fixlen_feat_col
        linear_feat_col = fixlen_feat_col
        feature_names = get_feature_names(linear_feat_col + dnn_feat_col)

    elif data_sample == "criteo":
        df = pd.read_csv(
            'https://raw.githubusercontent.com/shenweichen/DeepCTR/master/examples/criteo_sample.txt')
        sparse_features = ['C' + str(i) for i in range(1, 27)]
        dense_features = ['I' + str(i) for i in range(1, 14)]
        df[sparse_features] = df[sparse_features].fillna('-1')
        df[dense_features] = df[dense_features].fillna(0)
        target = ['label']

        # 1. Label-encode the sparse features and min-max scale the dense features
        for feat in sparse_features:
            lbe = LabelEncoder()
            df[feat] = lbe.fit_transform(df[feat])
        mms = MinMaxScaler(feature_range=(0, 1))
        df[dense_features] = mms.fit_transform(df[dense_features])

        # 2. Count the unique values of each sparse field and record the dense field names
        fixlen_feat_col = [
            SparseFeat(feat, vocabulary_size=df[feat].nunique(), embedding_dim=4)
            for feat in sparse_features
        ] + [DenseFeat(feat, 1) for feat in dense_features]
        dnn_feat_col = fixlen_feat_col
        linear_feat_col = fixlen_feat_col
        feature_names = get_feature_names(linear_feat_col + dnn_feat_col)

    elif data_sample == "movielens":
        df = pd.read_csv(
            "https://raw.githubusercontent.com/shenweichen/DeepCTR/master/examples/movielens_sample.txt")
        sparse_features = ["movie_id", "user_id", "gender", "age",
                           "occupation", "zip"]
        target = ['rating']

        # 1. Label-encode the sparse features
        for feat in sparse_features:
            lbe = LabelEncoder()
            df[feat] = lbe.fit_transform(df[feat])

        # 2. Count the unique values of each sparse field
        fixlen_feat_col = [
            SparseFeat(feat, df[feat].nunique(), embedding_dim=4)
            for feat in sparse_features
        ]
        linear_feat_col = fixlen_feat_col
        dnn_feat_col = fixlen_feat_col
        feature_names = get_feature_names(linear_feat_col + dnn_feat_col)

    # 3. Generate input data for the model
    train_full, test = train_test_split(df, random_state=2021, stratify=df[target])
    train, val = train_test_split(train_full, random_state=2021,
                                  stratify=train_full[target])
    train_model_input = {name: train[name] for name in feature_names}
    val_model_input = {name: val[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    X_train, y_train = train_model_input, train[target].values
    X_val, y_val = val_model_input, val[target].values
    X_test, y_test = test_model_input, test[target].values
    return X_train, X_val, X_test, y_train, y_val, y_test, linear_feat_col, dnn_feat_col
# Label-encode the categorical features
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

fixlen_feature_columns = [
    SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
    for feat in sparse_features
] + [DenseFeat(feat, 1) for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

train, test = train_test_split(data, test_size=0.2)
train_model_input = {name: train[name].values for name in feature_names}
test_model_input = {name: test[name].values for name in feature_names}

model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile("adam", "binary_crossentropy",
              metrics=['binary_crossentropy'])
data[sparse_features] = data[sparse_features].fillna('-1')
data[dense_features] = data[dense_features].fillna(0)
target = ['label']

# 1. Label-encode the sparse features and min-max scale the dense features
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

# 2. Count the unique values of each sparse field and record the dense field names
fixlen_feature_columns = [SparseFeat(feat,
                                     vocabulary_size=data[feat].max() + 1,
                                     embedding_dim=4)
                          for feat in sparse_features] \
    + [DenseFeat(feat, 1) for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)  # list of strings

# 3. Generate input data for the model
train, test = train_test_split(data, test_size=0.2, random_state=2020)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

# 4. Define the model, then train, predict and evaluate
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'])
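# Hedged continuation: training and evaluating in the style of the other
# snippets in this collection (batch size, epoch count and the metric imports
# are assumptions).
from sklearn.metrics import log_loss, roc_auc_score

history = model.fit(train_model_input, train[target].values, batch_size=256,
                    epochs=10, verbose=2, validation_split=0.2)
pred_ans = model.predict(test_model_input, batch_size=256)
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))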
          eval_set=[(valid_X, valid_df[label])], eval_metric=['auc'],
          early_stopping_rounds=None)
model.save_model(MODEL_PATH_XGB)

print('LightGBM model')
model = lgb.LGBMClassifier(n_estimators=200, random_state=RANDOM_SEED)
model.fit(train_X, train_df[label],
          eval_set=[(valid_X, valid_df[label])], eval_metric=['auc'],
          early_stopping_rounds=None)
model.booster_.save_model(MODEL_PATH_LGB)

print('DCN model')
feature_columns = [DenseFeat(c, 1) for c in num_feat] + [
    SparseFeat(c, n + 1, 'auto') for c, n in train_X[cat_feat].max().items()
]
model = DCN(feature_columns, feature_columns, cross_num=4, dnn_use_bn=True,
            cross_parameterization='matrix', seed=RANDOM_SEED)
model.compile('adam', 'binary_crossentropy', metrics=['AUC'])
model.fit([x for _, x in train_X.items()], train_df[label],
          validation_data=([x for _, x in valid_X.items()], valid_df[label]),
          shuffle=False, epochs=1)
model.save(MODEL_PATH_DCN)
def main(model_dir, data_dir, train_steps, model_name):
    data = pd.read_csv(os.path.join(data_dir, 'criteo_sample.txt'))

    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]

    data[sparse_features] = data[sparse_features].fillna('-1')
    data[dense_features] = data[dense_features].fillna(0)
    target = ['label']

    # 1. Label-encode the sparse features and min-max scale the dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count the unique values of each sparse field and record the dense field names
    fixlen_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3. Generate input data for the model
    train, test = train_test_split(data, test_size=0.2, random_state=2020)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4. Define the model, then train, predict and evaluate
    model_classes = {'DeepFM': DeepFM, 'FNN': FNN, 'WDL': WDL, 'MLR': MLR,
                     'NFM': NFM, 'DIN': DIN, 'CCPM': CCPM, 'PNN': PNN,
                     'AFM': AFM, 'DCN': DCN, 'DIEN': DIEN, 'DSIN': DSIN,
                     'xDeepFM': xDeepFM, 'AutoInt': AutoInt, 'ONN': ONN,
                     'FGCNN': FGCNN, 'FiBiNET': FiBiNET, 'FLEN': FLEN}
    if model_name not in model_classes:
        print(model_name + ' is not supported now.')
        return
    model = model_classes[model_name](linear_feature_columns,
                                      dnn_feature_columns, task='binary')

    gpus = int(os.getenv('SM_NUM_GPUS', '0'))
    print('gpus:', gpus)
    if gpus > 1:
        from tensorflow.keras.utils import multi_gpu_model
        model = multi_gpu_model(model, gpus=gpus)

    model.compile("adam", "binary_crossentropy",
                  metrics=['binary_crossentropy'])

    history = model.fit(
        train_model_input,
        train[target].values,
        batch_size=256,
        epochs=train_steps,
        verbose=2,
        validation_split=0.2,
    )

    pred_ans = model.predict(test_model_input, batch_size=256)
    try:
        print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    except Exception as e:
        print(e)
    try:
        print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
    except Exception as e:
        print(e)

    model.save_weights(os.path.join(model_dir, 'DeepFM_w.h5'))
data[dense_features] = np.log(data[dense_features] + 1.0)
test[dense_features] = np.log(test[dense_features] + 1.0)
print('data.shape', data.shape)
print('data.columns', data.columns.tolist())
print('unique date_: ', data['date_'].unique())

train = data[data['date_'] < 14]
val = data[data['date_'] == 14]  # use the day-14 samples as the validation set

pretrained_feed_embedding_initializer = tf.initializers.identity(feed_embedding)

# 2. Count the unique values of each sparse field and record the dense field names
fixlen_feature_columns = [SparseFeat('feedid',
                                     vocabulary_size=data['feedid'].max() + 1,
                                     embedding_dim=512,
                                     embeddings_initializer=pretrained_feed_embedding_initializer)] \
    + [SparseFeat(feat, vocabulary_size=data[feat].max() + 1,
                  embedding_dim=embedding_dim)
       for feat in sparse_features if feat != 'feedid'] \
    + [DenseFeat(feat, 1) for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(dnn_feature_columns)

# 3. Generate input data for the model
train_model_input = {name: train[name] for name in feature_names}
val_model_input = {name: val[name] for name in feature_names}
userid_list = val['userid'].astype(str).tolist()
test_model_input = {name: test[name] for name in feature_names}

train_labels = [train[y].values for y in target]
val_labels = [val[y].values for y in target]

# 4. Define the model, then train, predict and evaluate
def get_test_data(sample_size=1000, embedding_size=4, sparse_feature_num=1,
                  dense_feature_num=1,
                  sequence_feature=('sum', 'mean', 'max', 'weight'),
                  classification=True, include_length=False, hash_flag=False,
                  prefix='', use_group=False):
    # Use a tuple default and copy it: the original list default was mutated by
    # sequence_feature.pop() below, which leaked state across calls.
    sequence_feature = list(sequence_feature)
    feature_columns = []
    model_input = {}

    if 'weight' in sequence_feature:
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + "weighted_seq",
                                        vocabulary_size=2,
                                        embedding_dim=embedding_size),
                             maxlen=3,
                             length_name=prefix + "weighted_seq" + "_seq_length",
                             weight_name=prefix + "weight"))
        s_input, s_len_input = gen_sequence(2, 3, sample_size)
        model_input[prefix + "weighted_seq"] = s_input
        model_input[prefix + 'weight'] = np.random.randn(sample_size, 3, 1)
        model_input[prefix + "weighted_seq" + "_seq_length"] = s_len_input
        sequence_feature.pop(sequence_feature.index('weight'))

    for i in range(sparse_feature_num):
        if use_group:
            group_name = str(i % 3)
        else:
            group_name = DEFAULT_GROUP_NAME
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, embedding_size,
                       use_hash=hash_flag, dtype=tf.int32, group_name=group_name))

    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, dtype=tf.float32))

    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + 'sequence_' + mode,
                                        vocabulary_size=dim,
                                        embedding_dim=embedding_size),
                             maxlen=maxlen, combiner=mode))

    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input[fc.name] = np.random.randint(0, fc.vocabulary_size, sample_size)
        elif isinstance(fc, DenseFeat):
            model_input[fc.name] = np.random.random(sample_size)
        else:
            s_input, s_len_input = gen_sequence(fc.vocabulary_size, fc.maxlen,
                                                sample_size)
            model_input[fc.name] = s_input
            if include_length:
                fc.length_name = prefix + "sequence_" + str(i) + '_seq_length'
                model_input[prefix + "sequence_" + str(i) + '_seq_length'] = s_len_input

    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)
    return model_input, y, feature_columns
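# Hedged smoke-test sketch: get_test_data() pairs naturally with a quick
# compile/fit check of a DeepCTR model; DeepFM is chosen here as an assumption.
from deepctr.models import DeepFM

model_input, y, feature_columns = get_test_data(sample_size=100,
                                                sparse_feature_num=2,
                                                dense_feature_num=2)
model = DeepFM(feature_columns, feature_columns)
model.compile('adam', 'binary_crossentropy')
model.fit(model_input, y, batch_size=32, epochs=1)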
            continue
        select_columns_name.append(feat_name)
        for key in vocabulary_size.keys():
            if key in feat_name:
                vocabulary_size_val = vocabulary_size[key]
                embedding_name = key
                break
        varlen_feature_columns.append(VarLenSparseFeat(
            SparseFeat(feat_name, vocabulary_size=vocabulary_size_val + 1,
                       embedding_dim=4, use_hash=False,
                       embedding_name=embedding_name),
            maxlen=1, combiner='mean',
            weight_name=feat_name + '_weight', weight_norm=False))
    else:  # it is a dense feature
        if feat_name[-6:] == 'weight':
            select_columns_name.append(feat_name)
            fixed_feature_columns.append(DenseFeat(feat_name, 1))  # dense feature
        else:
            continue

# if use_hour_features:  # reproduce the best result
#     for feat_name in all_columns:
#         if feat_name[-6:] == 'weight' or feat_name in ['ctr_label', 'cvr_label']:
#             select_columns_name.append(feat_name)
#             continue
#         for key in vocabulary_size.keys():
#             if key in feat_name:
#                 vocabulary_size_val = vocabulary_size[key]
#                 print("key:{0},size:{1},feature name:{2}".format(key, vocabulary_size_val, feat_name))
#                 break
#         print("size:{0},feature name:{1}".format(vocabulary_size_val, feat_name))
#         varlen_feature_columns.append(VarLenSparseFeat(
#             SparseFeat(feat_name, vocabulary_size=vocabulary_size_val + 1, embedding_dim=4, use_hash=False),
data[dense_features] = data[dense_features].fillna(0)
test[dense_features] = test[dense_features].fillna(0)
data[dense_features] = np.log(data[dense_features] + 1.0)
test[dense_features] = np.log(test[dense_features] + 1.0)
logging.info('data.shape: {}'.format(data.shape))
# logging.info('data.columns: {}'.format(data.columns.tolist()))
# logging.info('unique date_: {}'.format(data['date_'].unique()))

train = data[data['date_'] < 14]
val = data[data['date_'] == 14]  # use the day-14 samples as the validation set

# 2. Count the unique values of each sparse field and record the dense field names
fixlen_feature_columns = [SparseFeat(feat,
                                     vocabulary_size=data[feat].max() + 1,
                                     embedding_dim=embedding_dim)
                          for feat in sparse_features] \
    + [DenseFeat(feat, 1) for feat in dense_features]
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(dnn_feature_columns)

# 3. Generate input data for the model
if args.stage == 'offline':
    train_model_input = {name: train[name] for name in feature_names}
    train_labels = [train[y].values for y in target]
else:
    train_model_input = {name: data[name] for name in feature_names}
    train_labels = [data[y].values for y in target]
val_model_input = {name: val[name] for name in feature_names}
userid_list = val['userid'].astype(str).tolist()
test_model_input = {name: test[name] for name in feature_names}
user_item_list = data.groupby("user_id")['movie_id'].apply(list)

train_set, test_set = gen_data_set(data, 20)
train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)

# 2. Count the unique values of each sparse field and generate the feature
# config for the sequence feature
embedding_dim = 16

user_feature_columns = [
    SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
    # gender is fed as a dense value here; as a SparseFeat it would take
    # (feature_max_idx['gender'], embedding_dim) instead
    DenseFeat("gender", 1),
    SparseFeat("age", feature_max_idx['age'], embedding_dim),
    SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim),
    SparseFeat("zip", feature_max_idx['zip'], embedding_dim),
    VarLenSparseFeat(
        SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
                   embedding_name="movie_id"),
        SEQ_LEN, 'mean', 'hist_len'),
]

item_feature_columns = [
    SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)
]
def _model_fn(features, labels, mode, config):
    train_flag = (mode == tf.estimator.ModeKeys.TRAIN)

    with variable_scope(DNN_SCOPE_NAME):
        # Rebuild DeepCTR feature columns from the tf.feature_column inputs
        sparse_feature_columns = []
        dense_feature_columns = []
        varlen_sparse_feature_columns = []
        for feat in dnn_feature_columns:
            new_feat_name = list(feat.parse_example_spec.keys())[0]
            if new_feat_name in ['hist_price_id', 'hist_des_id']:
                varlen_sparse_feature_columns.append(
                    VarLenSparseFeat(SparseFeat(new_feat_name,
                                                vocabulary_size=100,
                                                embedding_dim=32,
                                                use_hash=False),
                                     maxlen=3))
            elif is_embedding(feat):
                sparse_feature_columns.append(
                    SparseFeat(new_feat_name,
                               vocabulary_size=feat[0]._num_buckets + 1,
                               embedding_dim=feat.dimension))
            else:
                dense_feature_columns.append(DenseFeat(new_feat_name))

        # Split the variable-length columns into history and other sequences
        history_feature_columns = []
        sparse_varlen_feature_columns = []
        history_fc_names = list(map(lambda x: "hist_" + x, history_feature_list))
        for fc in varlen_sparse_feature_columns:
            if fc.name in history_fc_names:
                history_feature_columns.append(fc)
            else:
                sparse_varlen_feature_columns.append(fc)

        my_feature_columns = (sparse_feature_columns + dense_feature_columns
                              + varlen_sparse_feature_columns)
        embedding_dict = create_embedding_matrix(my_feature_columns,
                                                 l2_reg_embedding, seed, prefix="")
        query_emb_list = embedding_lookup(embedding_dict, features,
                                          sparse_feature_columns,
                                          history_feature_list,
                                          history_feature_list, to_list=True)
        keys_emb_list = embedding_lookup(embedding_dict, features,
                                         history_feature_columns,
                                         history_fc_names, history_fc_names,
                                         to_list=True)
        dnn_input_emb_list = embedding_lookup(embedding_dict, features,
                                              sparse_feature_columns,
                                              mask_feat_list=history_feature_list,
                                              to_list=True)
        dense_value_list = get_dense_input(features, dense_feature_columns)

        sequence_embed_dict = varlen_embedding_lookup(
            embedding_dict, features, sparse_varlen_feature_columns)
        sequence_embed_list = get_varlen_pooling_list(
            sequence_embed_dict, features, sparse_varlen_feature_columns,
            to_list=True)
        dnn_input_emb_list += sequence_embed_list

        keys_emb = concat_func(keys_emb_list, mask=True)
        deep_input_emb = concat_func(dnn_input_emb_list)
        query_emb = concat_func(query_emb_list, mask=True)

        # Attention over the behavior sequence, keyed by the candidate item
        hist = AttentionSequencePoolingLayer(
            att_hidden_size, att_activation,
            weight_normalization=att_weight_normalization,
            supports_masking=True)([query_emb, keys_emb])

        deep_input_emb = tf.keras.layers.Concatenate()(
            [NoMask()(deep_input_emb), hist])
        deep_input_emb = tf.keras.layers.Flatten()(deep_input_emb)
        dnn_input = combined_dnn_input([deep_input_emb], dense_value_list)
        output = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
                     dnn_use_bn, seed=seed)(dnn_input)
        final_logit = tf.keras.layers.Dense(
            1, use_bias=False,
            kernel_initializer=tf.keras.initializers.glorot_normal(seed))(output)

    return deepctr_model_fn(features, mode, final_logit, labels, task,
                            linear_optimizer, dnn_optimizer,
                            training_chief_hooks=training_chief_hooks)