]

# 2. Encode the categorical feature labels
for feature in sparse_categorical_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])
# for feature in sparse_categorical_features:
#     print(data[feature].nunique())

# 3. Handle single-valued categorical features
feature_columns = []
for feature in sparse_categorical_features:
    feature_columns.append(
        SparseFeat(feature, data[feature].nunique(), embedding_dim=4, use_hash=False))

# 4. Handle multi-valued categorical features
# 4.1 Build the vocabulary
def get_table(data, feature_name, sep='|'):
    s = set()
    for line in data[feature_name]:
        s.update(str(line).split(sep))
    s.add("<pad>")
    return len(s), s

# note: the first return value is the vocabulary size, not a sequence length
max_len, table = get_table(data, 'genres', sep='|')

# 4.2 Build the token-to-index mapping
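# A minimal sketch of step 4.2 under stated assumptions: the mapping name
# genre2index and the choice to reserve index 0 for "<pad>" are illustrations,
# not part of the original code.
genre2index = {"<pad>": 0}
for token in sorted(table - {"<pad>"}):
    genre2index[token] = len(genre2index)

def encode_genres(line, sep='|'):
    # map a raw string such as "Action|Comedy" to a list of vocabulary indices
    return [genre2index[t] for t in str(line).split(sep)]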
        tf.data.experimental.prefetch_to_device('/gpu:0', buffer_size=num_para))
else:
    D_train = D_train_r.shard(
        num_workers, worker_index).repeat().prefetch(buffer_size=num_para)
    D_valid = D_valid_r.shard(
        num_workers, worker_index).repeat().prefetch(buffer_size=num_para)

# %%
embedding_size = NNconfig_dic["embedding_size"]
sparse_feature_columns = [
    SparseFeat(feat,
               sparse_vcab_dic[feat] + 1,
               dtype=tf.int64,
               embedding_dim=embedding_size) for feat in sparse_f
]
varlen_feature_columns = [
    VarLenSparseFeat(SparseFeat(vfeat,
                                vocabulary_size=varlen_vcab_dic[vfeat] + 1,
                                dtype=tf.int64,
                                embedding_dim=embedding_size),
                     maxlen=varlen_maxlen_f[vfeat]) for vfeat in varlen_f
]

# %%
linear_feature_columns, dnn_feature_columns = \
    sparse_feature_columns + varlen_feature_columns, \
    sparse_feature_columns + varlen_feature_columns

# %%
sparse_features = ['uid', 'user_city', 'item_id', 'author_id', 'item_city',
                   'channel', 'music_id', 'device']
dense_features = ['time', 'duration_time']

data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0, )

target = ['finish']
# target = ['like']

for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

# (feature name, number of distinct values) builds a SparseFeat object:
# name == feature name, dimension == number of distinct values, dtype == int32
sparse_feature_columns = [SparseFeat(feat, data[feat].nunique())
                          for feat in sparse_features]
# (feature name, dimension == 1), data dtype == float32
dense_feature_columns = [DenseFeat(feat, 1)
                         for feat in dense_features]

dnn_feature_columns = sparse_feature_columns + dense_feature_columns
linear_feature_columns = sparse_feature_columns + dense_feature_columns

# This step is somewhat redundant: the method builds an Input layer for each
# feature internally but does not return them, only the list of feature names,
# which could just as well be obtained by merging the two lists above.
feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)

train, test = train_test_split(data, test_size=0.1)
train_model_input = [train[name] for name in feature_names]
test_model_input = [test[name] for name in feature_names]

# model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
key2index = {}
genres_list = list(map(split, data['genres'].values))
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)
# Notice : padding=`post`
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )

# 2.count #unique features for each sparse field and generate feature config for sequence feature
fixlen_feature_columns = [
    SparseFeat(feat, data[feat].nunique(), embedding_dim=4)
    for feat in sparse_features
]

use_weighted_sequence = False
if use_weighted_sequence:
    varlen_feature_columns = [
        VarLenSparseFeat(SparseFeat('genres',
                                    vocabulary_size=len(key2index) + 1,
                                    embedding_dim=4),
                         maxlen=max_len,
                         combiner='mean',
                         weight_name='genres_weight')
    ]  # Notice : value 0 is for padding for sequence input feature
else:
    varlen_feature_columns = [
def get_xy_fd(hash_flag=False):
    feature_columns = [
        SparseFeat('user', 3, hash_flag),
        SparseFeat('gender', 2, hash_flag),
        SparseFeat('item', 3 + 1, hash_flag),
        SparseFeat('item_gender', 2 + 1, hash_flag),
        DenseFeat('score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat('sess_0_item', 3 + 1, 4, use_hash=hash_flag,
                         embedding_name='item'),
        VarLenSparseFeat('sess_0_item_gender', 2 + 1, 4, use_hash=hash_flag,
                         embedding_name='item_gender')
    ]
    feature_columns += [
        VarLenSparseFeat('sess_1_item', 3 + 1, 4, use_hash=hash_flag,
                         embedding_name='item'),
        VarLenSparseFeat('sess_1_item_gender', 2 + 1, 4, use_hash=hash_flag,
                         embedding_name='item_gender')
    ]
    behavior_feature_list = ["item", "item_gender"]

    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    sess1_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]])
    sess1_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]])
    sess2_iid = np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    sess2_igender = np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    sess_number = np.array([2, 1, 0])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'sess_0_item': sess1_iid,
        'sess_0_item_gender': sess1_igender,
        'score': score,
        'sess_1_item': sess2_iid,
        'sess_1_item_gender': sess2_igender,
    }

    fixlen_feature_names = get_fixlen_feature_names(feature_columns)
    varlen_feature_names = get_varlen_feature_names(feature_columns)
    x = [feature_dict[name] for name in fixlen_feature_names] + \
        [feature_dict[name] for name in varlen_feature_names]
    x += [sess_number]

    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def get_xy_fd(hash_flag=False):
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item', 3 + 1, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_gender', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('sess_0_item', 3 + 1, embedding_dim=4,
                                    use_hash=hash_flag, embedding_name='item'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('sess_0_item_gender', 2 + 1, embedding_dim=4,
                                    use_hash=hash_flag, embedding_name='item_gender'),
                         maxlen=4)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('sess_1_item', 3 + 1, embedding_dim=4,
                                    use_hash=hash_flag, embedding_name='item'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('sess_1_item_gender', 2 + 1, embedding_dim=4,
                                    use_hash=hash_flag, embedding_name='item_gender'),
                         maxlen=4)
    ]
    behavior_feature_list = ["item", "item_gender"]

    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    sess1_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]])
    sess1_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]])
    sess2_iid = np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    sess2_igender = np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    sess_number = np.array([2, 1, 0])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'sess_0_item': sess1_iid,
        'sess_0_item_gender': sess1_igender,
        'score': score,
        'sess_1_item': sess2_iid,
        'sess_1_item_gender': sess2_igender,
    }

    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    x["sess_length"] = sess_number

    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
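# A minimal sketch of how this fixture is typically consumed; the DSIN model
# from deepctr.models is an assumption based on the sess_0_/sess_1_ naming,
# and sess_max_count=2 matches the two sessions built above.
from deepctr.models import DSIN

x, y, feature_columns, behavior_feature_list = get_xy_fd()
model = DSIN(feature_columns, behavior_feature_list, sess_max_count=2)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, y, verbose=1, epochs=10, validation_split=0.5)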
                label=label)
csv_to_tfrecord(val_filename,
                output_filedir=os.path.join(save_dir, 'val_tfrecord'),
                dense_feature_names=dense_feature_names,
                sparse_feature_names=sparse_feature_names,
                label=label)
csv_to_tfrecord(test_filename,
                output_filedir=os.path.join(save_dir, 'test_tfrecord'),
                dense_feature_names=dense_feature_names,
                sparse_feature_names=sparse_feature_names,
                label=None)

dense_feature_columns = [DenseFeat(feat) for feat in dense_feature_names]
sparse_feature_columns = [
    SparseFeat(feat, vocab_dict[feat], embedding_dim=4)
    for feat in sparse_feature_names
]
linear_feature_columns = dense_feature_columns + sparse_feature_columns
dnn_feature_columns = dense_feature_columns + sparse_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

model = DeepFM(linear_feature_columns,
               dnn_feature_columns,
               dnn_hidden_units=[64, 64],
               task='binary')
model.compile(
def NCF(user_feature_columns,
        item_feature_columns,
        user_gmf_embedding_dim=20,
        item_gmf_embedding_dim=20,
        user_mlp_embedding_dim=20,
        item_mlp_embedding_dim=20,
        dnn_use_bn=False,
        dnn_hidden_units=(64, 32),
        dnn_activation='relu',
        l2_reg_dnn=0,
        l2_reg_embedding=1e-6,
        dnn_dropout=0,
        init_std=0.0001,
        seed=1024):
    """Instantiates the NCF Model architecture.

    :param user_feature_columns: A dict mapping each user feature to its dimension.
    :param item_feature_columns: A dict mapping each item feature to its dimension.
    :param user_gmf_embedding_dim: int.
    :param item_gmf_embedding_dim: int.
    :param user_mlp_embedding_dim: int.
    :param item_mlp_embedding_dim: int.
    :param dnn_use_bn: bool. Whether to use BatchNormalization before activation in the deep net.
    :param dnn_hidden_units: list of positive integers or empty list, the layer number and units in each layer of the deep net.
    :param dnn_activation: Activation function to use in the deep net.
    :param l2_reg_dnn: float. L2 regularizer strength applied to the DNN.
    :param l2_reg_embedding: float. L2 regularizer strength applied to the embedding vectors.
    :param dnn_dropout: float in [0,1), the probability of dropping a given DNN coordinate.
    :param init_std: float, standard deviation used to initialize the embedding vectors.
    :param seed: integer, used as the random seed.
    :return: A Keras model instance.
    """
    # Align the GMF embedding dims so that the flattened user and item GMF
    # vectors have equal length (their least common multiple).
    user_dim = len(user_feature_columns) * user_gmf_embedding_dim
    item_dim = len(item_feature_columns) * item_gmf_embedding_dim
    dim = (user_dim * item_dim) // math.gcd(user_dim, item_dim)
    user_gmf_embedding_dim = dim // len(user_feature_columns)
    item_gmf_embedding_dim = dim // len(item_feature_columns)

    # Generalized Matrix Factorization (GMF) part
    user_gmf_feature_columns = [
        SparseFeat(feat, vocabulary_size=size,
                   embedding_dim=user_gmf_embedding_dim)
        for feat, size in user_feature_columns.items()
    ]
    user_features = build_input_features(user_gmf_feature_columns)
    user_inputs_list = list(user_features.values())
    user_gmf_sparse_embedding_list, user_gmf_dense_value_list = input_from_feature_columns(
        user_features, user_gmf_feature_columns, l2_reg_embedding, init_std,
        seed, prefix='gmf_')
    user_gmf_input = combined_dnn_input(user_gmf_sparse_embedding_list, [])
    user_gmf_out = Lambda(lambda x: x, name="user_gmf_embedding")(user_gmf_input)

    item_gmf_feature_columns = [
        SparseFeat(feat, vocabulary_size=size,
                   embedding_dim=item_gmf_embedding_dim)
        for feat, size in item_feature_columns.items()
    ]
    item_features = build_input_features(item_gmf_feature_columns)
    item_inputs_list = list(item_features.values())
    item_gmf_sparse_embedding_list, item_gmf_dense_value_list = input_from_feature_columns(
        item_features, item_gmf_feature_columns, l2_reg_embedding, init_std,
        seed, prefix='gmf_')
    item_gmf_input = combined_dnn_input(item_gmf_sparse_embedding_list, [])
    item_gmf_out = Lambda(lambda x: x, name="item_gmf_embedding")(item_gmf_input)

    gmf_out = Multiply()([user_gmf_out, item_gmf_out])

    # Multi-Layer Perceptron (MLP) part
    user_mlp_feature_columns = [
        SparseFeat(feat, vocabulary_size=size,
                   embedding_dim=user_mlp_embedding_dim)
        for feat, size in user_feature_columns.items()
    ]
    user_mlp_sparse_embedding_list, user_mlp_dense_value_list = input_from_feature_columns(
        user_features, user_mlp_feature_columns, l2_reg_embedding, init_std,
        seed, prefix='mlp_')
    user_mlp_input = combined_dnn_input(user_mlp_sparse_embedding_list,
                                        user_mlp_dense_value_list)
    user_mlp_out = Lambda(lambda x: x, name="user_mlp_embedding")(user_mlp_input)

    item_mlp_feature_columns = [
        SparseFeat(feat, vocabulary_size=size,
                   embedding_dim=item_mlp_embedding_dim)
        for feat, size in item_feature_columns.items()
    ]
    item_mlp_sparse_embedding_list, item_mlp_dense_value_list = input_from_feature_columns(
        item_features, item_mlp_feature_columns, l2_reg_embedding, init_std,
        seed, prefix='mlp_')
    item_mlp_input = combined_dnn_input(item_mlp_sparse_embedding_list,
                                        item_mlp_dense_value_list)
    item_mlp_out = Lambda(lambda x: x, name="item_mlp_embedding")(item_mlp_input)

    mlp_input = Concatenate(axis=1)([user_mlp_out, item_mlp_out])
    mlp_out = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
                  dnn_use_bn, seed, name="mlp_embedding")(mlp_input)

    # Fusion of GMF and MLP
    neumf_input = Concatenate(axis=1)([gmf_out, mlp_out])
    neumf_out = DNN(hidden_units=[1], activation='sigmoid')(neumf_input)
    output = Lambda(lambda x: x, name='neumf_out')(neumf_out)
    # output = PredictionLayer(task, False)(neumf_out)

    model = Model(inputs=user_inputs_list + item_inputs_list, outputs=output)
    return model
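# A minimal usage sketch for the NCF builder above; the feature dicts map
# feature name -> vocabulary size, and all values below are made-up
# illustration data, not from the original.
import numpy as np

user_feature_columns = {"user_id": 100, "gender": 2}
item_feature_columns = {"movie_id": 200}
model = NCF(user_feature_columns, item_feature_columns, dnn_hidden_units=(64, 32))
model.compile("adam", "binary_crossentropy", metrics=["binary_crossentropy"])

x = {"user_id": np.array([1, 2, 3]),
     "gender": np.array([0, 1, 0]),
     "movie_id": np.array([10, 20, 30])}
y = np.array([1, 0, 1])
model.fit(x, y, batch_size=3, epochs=1, verbose=0)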
def structural_feature(train, test):
    test['label'] = -1
    data = pd.concat([train, test], axis=0)

    '''Feature engineering >>>>>'''
    # data['year'] = data['date'].dt.year
    # data['month'] = data['date'].dt.month
    # data['day'] = data['date'].dt.day
    data['hour'] = data['date'].dt.hour
    del data['date']

    data['D1+D2'] = data['D1'] + data['D2']
    data['D1-D2'] = data['D1'] - data['D2']
    data['D1/D2'] = data['D1'] / data['D2']

    # data['A_sum'] = data['A1'] + data['A2'] + data['A3']
    data['B_sum'] = data['B1'] + data['B2'] + data['B3']
    # data['C_sum'] = data['C1'] + data['C2'] + data['C3']
    data['A_*'] = data['A1'] * data['A2'] * data['A3']
    data['B_*'] = data['B1'] * data['B2'] * data['B3']
    # data['C_*'] = data['C1'] * data['C2'] * data['C3']
    data['A_+'] = data['A1'] + data['A2'] + data['A3']
    data['B_+'] = data['B1'] + data['B2'] + data['B3']
    data['C_+'] = data['C1'] + data['C2'] + data['C3']

    normalization_columns = [
        'A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'C1', 'C2', 'C3', 'E2', 'E3',
        'E5', 'E7', 'E9', 'E10', 'E13', 'E16', 'E17', 'E19', 'E21', 'E22'
    ]
    for column in normalization_columns:
        data[column] = (data[column] - data[column].min(axis=0)) / (
            data[column].max(axis=0) - data[column].min(axis=0))

    sparse_features = [
        'D1', 'D2', 'E4', 'E8', 'E11', 'E15', 'E18', 'E25', 'hour'
    ]
    dense_features = [
        'E1', 'E2', 'E3', 'E5', 'E6', 'E7', 'E9', 'E10', 'E12', 'E13', 'E14',
        'E16', 'E17', 'E19', 'E20', 'E21', 'E22', 'E23', 'E24', 'A1', 'A2',
        'A3', 'B1', 'B2', 'B3', 'C1', 'C2', 'C3'
    ]

    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field,and record dense feature field name
    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ] + [DenseFeat(feat, 1, ) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    '''End of feature engineering <<<<'''

    train = data[data.label != -1]
    test = data[data.label == -1]
    del test['label']

    '''Move the label column to the end'''
    l = train['label']
    del train['label']
    train['label'] = l

    return train, test, feature_names, linear_feature_columns, dnn_feature_columns
def main(args):
    if args.arch == 'xDeepFM':
        s = time.time()
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data', 'train_data')
        item = pd.read_csv(csv_file,
                           dtype={
                               'article_id': str,
                               'hh': int,
                               'gender': str,
                               'age_range': str,
                               'read_article_ids': str
                           },
                           sep='\t')
        label_data_path = os.path.join(
            DATASET_PATH, 'train',
            os.path.basename(os.path.normpath(csv_file)).split('_')[0] + '_label')
        label = pd.read_csv(label_data_path, dtype={'label': int}, sep='\t')
        item['label'] = label

        s = time.time()
        print(f'before test article preprocess : {len(item)}')

        sparse_features = [
            'article_id', 'hh', 'gender', 'age_range', 'len_bin'
        ]
        dense_features = ['image_feature', 'read_cnt_prob']
        target = ['label']

        ########################### make more features #############################
        ########## 1. read_article_ids length count -- user feature ################
        len_lis = []
        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:  # NaN entries are parsed as float
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)

        item['len'] = len_lis
        item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')

        id_to_artic = dict()
        artics = item['article_id'].tolist()

        ####### 2. read_cnt, total_cnt, prob_read_cnt --- article features #########
        read_cnt = item[item['label'] == 1].groupby('article_id').agg({'hh': 'count'})
        read_cnt = read_cnt.reset_index()
        read_cnt = read_cnt.rename(columns={'hh': 'read_cnt'})
        read_cnt_list = read_cnt['read_cnt'].tolist()
        read_cnt_artic_list = read_cnt['article_id'].tolist()
        print(f'len read_cnt : {len(read_cnt)}')
        print(read_cnt.head(3))

        total_cnt = item.groupby('article_id').agg({'hh': 'count'})
        total_cnt = total_cnt.reset_index()
        total_cnt = total_cnt.rename(columns={'hh': 'read_cnt'})
        total_cnt_list = total_cnt['read_cnt'].tolist()
        total_cnt_artic_list = total_cnt['article_id'].tolist()
        print(f'len total_cnt : {len(total_cnt)}')
        print(total_cnt.head(3))

        # lit : sorted unique article ids (test_article_ids list)
        lit_cnt = []
        lit_total_cnt = []
        lit_cnt_prob = []
        lit = list(set(artics))
        lit.sort()
        print(lit[:10])
        print(f'len(lit):{len(lit)}')
        for i in range(len(lit)):
            cur_artic = lit[i]
            # lit_cnt
            if cur_artic not in read_cnt_artic_list:
                lit_cnt.append(0)
            else:
                for j in range(len(read_cnt_artic_list)):
                    if cur_artic == read_cnt_artic_list[j]:
                        lit_cnt.append(read_cnt_list[j])
                        break
            # lit_total_cnt
            if cur_artic not in total_cnt_artic_list:
                lit_total_cnt.append(0)
            else:
                for j in range(len(total_cnt_artic_list)):
                    if cur_artic == total_cnt_artic_list[j]:
                        lit_total_cnt.append(total_cnt_list[j])
                        break
            # lit_cnt_prob
            if lit_total_cnt[i] == 0:
                lit_cnt_prob.append(0)
            else:
                lit_cnt_prob.append(lit_cnt[i] / lit_total_cnt[i])

        print('--- read_cnt article feature completed ---')
        print(f'lit_cnt {len(lit_cnt)}')
        print(f'lit_total_cnt {len(lit_total_cnt)}')
        print(f'lit_cnt_prob {len(lit_cnt_prob)}')

        print('feature dict generate')
        file_list1 = os.listdir(DATASET_PATH)
        file_list2 = os.listdir(DATASET_PATH + '/train')
        file_list3 = os.listdir(DATASET_PATH + '/train/train_data')
        print(file_list1)
        print(file_list2)
        print(file_list3)
        resnet_feature_extractor(args.mode)
        print(file_list1)
        print(file_list2)
        print(file_list3)

        # label encoding
        with open(os.path.join('train_image_features_50.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)
        print('check artic feature')
        print(f"757518f4a3da : {image_feature_dict['757518f4a3da']}")
        lbe = LabelEncoder()
        lbe.fit(lit)
        item['article_id' + '_onehot'] = lbe.transform(item['article_id'])
        print(lbe.classes_)
        for feat in sparse_features[1:]:
            lbe = LabelEncoder()
            # must also verify that the re-encoded labels match the original ones
            item[feat + '_onehot'] = lbe.fit_transform(item[feat])
        print(item.head(10))
        print('columns name : ', item.columns)

        fixlen_feature_columns = [SparseFeat('article_id', len(lit))]
        fixlen_feature_columns += [
            SparseFeat(feat, item[feat + '_onehot'].nunique())
            for feat in sparse_features[1:]
        ]
        fixlen_feature_columns += [
            DenseFeat('image_feature', len(image_feature_dict[artics[0]]))
        ]
        fixlen_feature_columns += [DenseFeat('read_cnt_prob', 1)]
        print(f'fixlen_feature_columns : {fixlen_feature_columns}')

        idx_artics_all = item['article_id' + '_onehot'].tolist()
        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]

        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns
        fixlen_feature_names = get_fixlen_feature_names(
            linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names

        model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
        print('---model defined---')
        print(time.time() - s, 'seconds')

        ##### debug prints
        for artic in lit:
            print(artic, end=',')
        print()
        print('new')
        print()
        print(len(lit_cnt_prob))
        for prob in lit_cnt_prob:
            prob = round(prob, 4)
            print(prob, end=',')
        print()
        print('end')
        print('--------------')

        optimizer = tf.keras.optimizers.Adam(args.lr)
        s = time.time()

        # negative sampling
        item_pos = item[item['label'] == 1]
        item_neg = item[item['label'] == 0]
        dn_1 = item_neg.sample(n=3 * len(item_pos), random_state=42)
        dn_2 = item_neg.sample(n=3 * len(item_pos), random_state=20)
        dn_3 = item_neg.sample(n=3 * len(item_pos), random_state=7)
        dn_4 = item_neg.sample(n=3 * len(item_pos), random_state=33)
        dn_5 = item_neg.sample(n=3 * len(item_pos), random_state=41)
        dn_1.reset_index()
        data_1 = pd.concat([dn_1, item_pos]).sample(frac=1, random_state=42).reset_index()
        data_1_article_idxs = data_1['article_id_onehot'].tolist()
        data_1_article = data_1['article_id'].tolist()
        print(f'len data_1 : {len(data_1)}')
        print(data_1.head(5))

        li1 = []
        li2 = []
        li3 = []
        for i in range(len(data_1_article)):
            for j in range(len(lit_cnt_prob)):
                if data_1_article[i] == lit[j]:
                    li3.append(lit_cnt_prob[j])
                    break
        data_1['read_cnt_prob'] = li3
        print('---read_cnt_prob end---')

        ## preprocess append
        data_2 = pd.concat([dn_2, item_pos]).sample(frac=1, random_state=42).reset_index()
        data_3 = pd.concat([dn_3, item_pos]).sample(frac=1, random_state=42).reset_index()
        data_4 = pd.concat([dn_4, item_pos]).sample(frac=1, random_state=42).reset_index()
        data_5 = pd.concat([dn_5, item_pos]).sample(frac=1, random_state=42).reset_index()

        li = []
        for i in range(len(data_1_article_idxs)):
            image_feature = image_feature_dict[id_to_artic[data_1_article_idxs[i]]]
            li.append(image_feature)
        print(f'article_id : {data_1_article[0]}')
        print(f'article_image_feature : {image_feature_dict[data_1_article[0]]}')
        data_1['image_feature'] = li
        li = []
        print(f'finished data_1_image_feature : {time.time() - s} sec')

        if use_nsml:
            bind_nsml(model, optimizer, args.task)
        if args.pause:
            nsml.paused(scope=locals())

        if (args.mode == 'train') or args.dry_run:
            best_loss = 1000
            if args.dry_run:
                print('start dry-running...!')
                args.num_epochs = 1
            else:
                print('start training...!')
            model.compile(
                tf.keras.optimizers.Adam(args.lr),
                'mse',
                metrics=['accuracy'],
            )
            train_generator = data_generator(data_1)
            lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)
            # drop the checkpoint callback when doing k-fold
            save_cbk = CustomModelCheckpoint()
            history = model.fit_generator(train_generator,
                                          epochs=100,
                                          verbose=2,
                                          workers=8,
                                          steps_per_epoch=np.ceil(len(data_1) / 2048),
                                          callbacks=[lr_scheduler, save_cbk])
            print('again')
    scaler,
    splits=1,
    feats=feats,
    batch_size=2048,
    shuffle=False,
    debug=False,
    use_cache=False)

sparse_features = ['C' + str(i) for i in range(1, 27)]
dense_features = ['I' + str(i) for i in range(1, 14)]

# 2.count #unique features for each sparse field,and record dense feature field name
fixlen_feature_columns = [
    SparseFeat(feat,
               vocabulary_size=len(encoder.get_labels(feat)),
               embedding_dim=4) for i, feat in enumerate(sparse_features)
] + [DenseFeat(feat, 1, ) for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3.generate input data for model
# train, test = train_test_split(data, test_size=0.2)
def get_xy_fd(use_neg=False, hash_flag=False):
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item_id'),
                         maxlen=4,
                         length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id',
                                    2 + 1,
                                    embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4,
                         length_name="seq_length")
    ]
    behavior_feature_list = ["item_id", "cate_id"]

    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    cate_id = np.array([1, 2, 2])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]])
    behavior_length = np.array([3, 3, 2])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item_id': iid,
        'cate_id': cate_id,
        'hist_item_id': hist_iid,
        'hist_cate_id': hist_cate_id,
        'pay_score': score,
        "seq_length": behavior_length
    }

    if use_neg:
        feature_dict['neg_hist_item_id'] = np.array(
            [[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_cate_id'] = np.array(
            [[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]])
        feature_columns += [
            VarLenSparseFeat(SparseFeat('neg_hist_item_id',
                                        vocabulary_size=3 + 1,
                                        embedding_dim=8,
                                        embedding_name='item_id'),
                             maxlen=4,
                             length_name="seq_length"),
            VarLenSparseFeat(SparseFeat('neg_hist_cate_id',
                                        2 + 1,
                                        embedding_dim=4,
                                        embedding_name='cate_id'),
                             maxlen=4,
                             length_name="seq_length")
        ]

    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
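# A minimal sketch of consuming this fixture with DIN from deepctr.models, an
# assumption based on the hist_* naming; the use_neg branch adds the
# neg_hist_* columns that DIEN's negative-sampling mode expects.
from deepctr.models import DIN

x, y, feature_columns, behavior_feature_list = get_xy_fd(use_neg=False)
model = DIN(feature_columns, behavior_feature_list)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, y, verbose=1, epochs=10, validation_split=0.5)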
item_profile = data[["movie_id"]].drop_duplicates('movie_id')
user_profile.set_index("user_id", inplace=True)
user_item_list = data.groupby("user_id")['movie_id'].apply(list)

train_set, test_set = gen_data_set(data, negsample)

train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)

# 2.count #unique features for each sparse field and generate feature config for sequence feature
embedding_dim = 16

user_feature_columns = [
    SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
    SparseFeat("gender", feature_max_idx['gender'], embedding_dim),
    SparseFeat("age", feature_max_idx['age'], embedding_dim),
    SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim),
    SparseFeat("zip", feature_max_idx['zip'], embedding_dim),
    VarLenSparseFeat(
        SparseFeat('hist_movie_id', feature_max_idx['movie_id'],
                   embedding_dim, embedding_name="movie_id"),
        SEQ_LEN, 'mean', 'hist_len'),
]

item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]

# 3.Define Model and train
model = DSSM(user_feature_columns, item_feature_columns)
# model = FM(user_feature_columns, item_feature_columns)

model.compile(optimizer='adagrad', loss="binary_crossentropy")
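# After fitting, the DeepMatch examples usually split the two towers into
# separate embedding models for retrieval. A minimal sketch, assuming the
# DeepMatch DSSM (which exposes user_input/user_embedding and
# item_input/item_embedding); all_item_model_input is a hypothetical name for
# the dict of candidate item ids.
from tensorflow.python.keras.models import Model

user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

user_embs = user_embedding_model.predict(test_model_input, batch_size=2 ** 12)
# all_item_model_input = {"movie_id": item_profile['movie_id'].values}
# item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)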
    'pay_score': np.array([0.1, 0.2, 0.3, 0.2] * n_copy),
    'context': np.array([0, 1, 0, 1] * n_copy),
    # 'seq_length': np.array([3, 4, 2, 2])
}
y_ctr = np.array([1, 1, 1, 0] * n_copy)
y_cvr = np.array([1, 0, 1, 0] * n_copy)
y_ctcvr = np.array([1, 0, 1, 0] * n_copy)

# user features
user_feature_columns = [
    DenseFeat('pay_score', dimension=1),
    SparseFeat('user',
               vocabulary_size=len(np.unique(X["user"])),
               embedding_dim=embedding_dim),
    SparseFeat('gender',
               vocabulary_size=len(np.unique(X["gender"])),
               embedding_dim=embedding_dim),
    VarLenSparseFeat(
        # value 0 is treated as padding and is filtered automatically
        SparseFeat('hist_item_id',
                   vocabulary_size=len(np.unique(X["hist_item_id"][0])),
                   embedding_dim=embedding_dim,
                   embedding_name='item_id'),
        maxlen=len(X["hist_item_id"][0]),
        combiner="max",  # "mean", "sum"
        length_name=None,  # length_name="seq_length"
    ),
    VarLenSparseFeat(
        SparseFeat('hist_cate_id',
                       num_iteration=gbm.best_iteration,
                       pred_leaf=True)

print('Writing transformed training data')
transformed_training_matrix = np.zeros(
    [len(lgb_pred), len(lgb_pred[0]) * num_leaves],
    dtype=np.int64)  # N * num_trees * num_leaves
for i in range(0, len(lgb_pred)):
    temp = np.arange(len(lgb_pred[0])) * num_leaves + np.array(lgb_pred[i])
    transformed_training_matrix[i][temp] += 1

print('deep training...')
lgb_feat = pd.DataFrame(transformed_training_matrix.tolist())
lgb_feat.columns = [str(i) for i in lgb_feat.columns]
fixlen_feature_columns = [
    SparseFeat(feat, lgb_feat[feat].nunique()) for feat in lgb_feat.columns
]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns +
                                                dnn_feature_columns)
train_model_input = [lgb_feat[name] for name in fixlen_feature_names]

model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile(
    "adam",
    loss=losses.mae,
    metrics=['accuracy', 'mse'],
)
history = model.fit(
    train_model_input,
    y_train.values,
def get_test_data(sample_size=1000,
                  embedding_size=4,
                  sparse_feature_num=1,
                  dense_feature_num=1,
                  sequence_feature=['sum', 'mean', 'max', 'weight'],
                  classification=True,
                  include_length=False,
                  hash_flag=False,
                  prefix='',
                  use_group=False):
    # copy the list so popping 'weight' below never mutates the shared default
    sequence_feature = list(sequence_feature)
    feature_columns = []
    model_input = {}

    if 'weight' in sequence_feature:
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + "weighted_seq",
                                        vocabulary_size=2,
                                        embedding_dim=embedding_size),
                             maxlen=3,
                             length_name=prefix + "weighted_seq" + "_seq_length",
                             weight_name=prefix + "weight"))
        s_input, s_len_input = gen_sequence(2, 3, sample_size)
        model_input[prefix + "weighted_seq"] = s_input
        model_input[prefix + 'weight'] = np.random.randn(sample_size, 3, 1)
        model_input[prefix + "weighted_seq" + "_seq_length"] = s_len_input
        sequence_feature.pop(sequence_feature.index('weight'))

    for i in range(sparse_feature_num):
        if use_group:
            group_name = str(i % 3)
        else:
            group_name = DEFAULT_GROUP_NAME
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i),
                       dim,
                       embedding_size,
                       use_hash=hash_flag,
                       dtype=tf.int32,
                       group_name=group_name))

    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, dtype=tf.float32))

    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + 'sequence_' + mode,
                                        vocabulary_size=dim,
                                        embedding_dim=embedding_size),
                             maxlen=maxlen,
                             combiner=mode))

    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input[fc.name] = np.random.randint(0, fc.vocabulary_size, sample_size)
        elif isinstance(fc, DenseFeat):
            model_input[fc.name] = np.random.random(sample_size)
        else:
            s_input, s_len_input = gen_sequence(fc.vocabulary_size, fc.maxlen, sample_size)
            model_input[fc.name] = s_input
            if include_length:
                fc.length_name = prefix + "sequence_" + str(i) + '_seq_length'
                model_input[prefix + "sequence_" + str(i) + '_seq_length'] = s_len_input

    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    return model_input, y, feature_columns
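# A minimal sketch of how a helper like this is typically used to smoke-test a
# model; DeepFM is an illustrative choice, not implied by the original code.
from deepctr.models import DeepFM

x, y, feature_columns = get_test_data(sample_size=100,
                                      sparse_feature_num=2,
                                      dense_feature_num=2,
                                      sequence_feature=['sum', 'mean', 'max'])
model = DeepFM(feature_columns, feature_columns, task='binary')
model.compile('adam', 'binary_crossentropy')
model.fit(x, y, batch_size=32, epochs=1, verbose=0)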
    'C17', 'C18', 'C19', 'C21'
]
train_features = feature1 + feature2 + feature3

for feature in train_features:
    encoder = LabelEncoder()
    train_data[feature] = encoder.fit_transform(train_data[feature])

target = ['click']

from deepctr.models import DeepFM
from deepctr.inputs import SparseFeat, get_feature_names

# Count the number of distinct values in each feature
fixlen_feature_column1 = [
    SparseFeat(name=feature,
               vocabulary_size=int(train_data[feature].nunique() * 0.01),
               embedding_dim=4,
               use_hash=True) for feature in feature1
]
fixlen_feature_column2 = [
    SparseFeat(name=feature,
               vocabulary_size=int(train_data[feature].nunique() * 0.05),
               embedding_dim=4,
               use_hash=True) for feature in feature2
]
fixlen_feature_column3 = [
    SparseFeat(name=feature,
               vocabulary_size=train_data[feature].nunique(),
               embedding_dim=4,
               use_hash=False) for feature in feature3
]
sparse_features = [
    'UserID', 'MovieID', 'Gender', 'Occupation', 'day', 'weekday'
]
dense_features = ['hour', 'Age']

for feat in sparse_features:
    label_enc = LabelEncoder()
    data[feat] = label_enc.fit_transform(data[feat])

feats = [i for i in data.columns if i != 'Rating']
X = data[feats]
y = data['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
                          for feat in sparse_features] + \
                         [DenseFeat(feat, 1) for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

model = DeepFM(linear_feature_columns, dnn_feature_columns, task='multiclass')
model.compile('adam', 'mse', metrics=['accuracy'])

feature_names = get_feature_names(fixlen_feature_columns)
train_feed_dict = {name: X_train[name] for name in feature_names}
test_feed_dict = {name: X_test[name] for name in feature_names}

model.fit(train_feed_dict, y_train,
user_item_list = data.groupby("user_id")['movie_id'].apply(list)

train_set, test_set = gen_data_set(data, negsample)

train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)

# 2.count #unique features for each sparse field and generate feature config for sequence feature
embedding_dim = 16

user_feature_columns = [
    SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
    SparseFeat("gender", feature_max_idx['gender'], embedding_dim),
    SparseFeat("age", feature_max_idx['age'], embedding_dim),
    SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim),
    SparseFeat("zip", feature_max_idx['zip'], embedding_dim),
    VarLenSparseFeat(
        SparseFeat('hist_movie_id', feature_max_idx['movie_id'],
                   embedding_dim, embedding_name="movie_id"),
        SEQ_LEN, 'mean', 'hist_len'),
]

item_feature_columns = [
    SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)
]
def loadData(trainFile, testFile, embedding_dim, multivalue_len, multiClass=False):
    train = pd.read_csv(trainFile)
    test = pd.read_csv(testFile)

    ## 1. feature type declaration
    sparse_features = [
        "BaseAdGroupId", "Criteria", 'placementType', 'Week', 'IsRestrict',
        'IsNegative', 'AccountTimeZone', 'AccountCurrencyCode',
        'BiddingStrategyType', 'CampaignId', 'Month'
    ]
    dense_features = [
        'adClicks', 'adConversions', 'adCtr', 'adConversionRate',
        'adActiveViewImpressions', 'adActiveViewMeasurability',
        'adActiveViewMeasurableCost', 'adActiveViewViewability',
        'adImpressions', 'adActiveViewCpm', 'adAverageCpc', 'adAverageCpe',
        'adCpcBid', 'adActiveViewMeasurableImpressions', 'adActiveViewCtr',
        'adAverageCpm', 'adAverageCpv', 'adCost', 'plaClicks',
        'plaConversions', 'plaCtr', 'plaConversionRate',
        'plaActiveViewImpressions', 'plaActiveViewMeasurability',
        'plaActiveViewMeasurableCost', 'plaActiveViewViewability',
        'plaImpressions', 'plaCpcBid', 'plaActiveViewMeasurableImpressions',
        'plaActiveViewCtr', 'plaActiveViewCpm', 'plaAverageCpc',
        'plaAverageCpe', 'plaAverageCpm', 'plaAverageCpv', 'plaCost',
        'histListLen'
    ]
    multivalue_features = [
        'locationName', 'languageCode', 'hist_BaseAdGroupId'
    ]
    # keep only the first three sparse features
    sparse_features = ["BaseAdGroupId", "Criteria", 'placementType']
    target = ['Ctr']

    # 2. Missing value processing.
    train[sparse_features + multivalue_features] = \
        train[sparse_features + multivalue_features].fillna('-1', )
    train[dense_features + target] = train[dense_features + target].fillna(0, )
    test[sparse_features + multivalue_features] = \
        test[sparse_features + multivalue_features].fillna('-1', )
    test[dense_features + target] = test[dense_features + target].fillna(0, )

    train["BaseAdGroupId"] = train["BaseAdGroupId"].apply(lambda x: str((int(x))))
    test["BaseAdGroupId"] = test["BaseAdGroupId"].apply(lambda x: str((int(x))))

    # 3. sparse features transformation
    # Fit each encoder on the union of train and test values so both splits
    # share a consistent integer mapping (fitting separately would assign
    # inconsistent codes).
    for feat in sparse_features:
        lbe = LabelEncoder()
        lbe.fit(pd.concat([train[feat], test[feat]], ignore_index=True))
        train[feat] = lbe.transform(train[feat])
        test[feat] = lbe.transform(test[feat])

    # 4. dense features transformation
    for numFeature in dense_features:
        train[numFeature] = train[numFeature].apply(
            lambda x: x if x < 2 else math.sqrt(math.log(x)))
        test[numFeature] = test[numFeature].apply(
            lambda x: x if x < 2 else math.sqrt(math.log(x)))

    # 5. multivalue features transformation
    for feat in multivalue_features:
        exec(
            '{}_train_list = list([split(x,{}Dict) for x in train[feat].values])'
            .format(feat, feat, feat))
        exec(
            '{}_test_list = list([split(x,{}Dict) for x in test[feat].values])'
            .format(feat, feat, feat))
        exec('{}_length = np.array(list(map(len, {}_train_list)))'.format(feat, feat))
        exec('{}_maxlen = max({}_length)'.format(feat, feat))
        exec(
            '{}_train_list = pad_sequences({}_train_list, maxlen=multivalue_len, padding="post",)'
            .format(feat, feat, feat))
        exec(
            '{}_test_list = pad_sequences({}_test_list, maxlen=multivalue_len, padding="post",)'
            .format(feat, feat, feat))

    # 6. generate feature columns
    fixlen_feature_columns = [
        SparseFeat(feat,
                   vocabulary_size=(train[feat].append(
                       test[feat], ignore_index=True)).nunique(),
                   embedding_dim=embedding_dim)
        for i, feat in enumerate(sparse_features)
    ] + [DenseFeat(feat, 1, ) for feat in dense_features]

    varlen_feature_columns = []
    for feat in multivalue_features:
        exec(
            'varlen_feature_columns.append(VarLenSparseFeat("{}", maxlen=multivalue_len, '
            'vocabulary_size=len({}Dict) + 1, embedding_dim=embedding_dim, '
            'combiner="mean", weight_name=None))'.format(str(feat), feat))

    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 7. generate input data for model
    train_model_input = {
        name: train[name]
        for name in sparse_features + dense_features
    }
    test_model_input = {
        name: test[name]
        for name in sparse_features + dense_features
    }
    for feat in multivalue_features:
        name = str(feat)
        exec('train_model_input["{}"] = {}_train_list'.format(name, feat))
        exec('test_model_input["{}"] = {}_test_list'.format(name, feat))

    behavior_feature_list = ["BaseAdGroupId"]

    return train_model_input, train, test_model_input, test, dnn_feature_columns, linear_feature_columns, behavior_feature_list
dense_features = [f for f in dense_features if f != 'target']

data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0, )
target = ['target']

# 1.Label Encoding for sparse features,and do simple Transformation for dense features
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

# 2.count #unique features for each sparse field,and record dense feature field name
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique())
                          for feat in sparse_features] + \
                         [DenseFeat(feat, 1, ) for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

fixlen_feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3.generate input data for model
train, test = train_test_split(data, test_size=0.33)
train_model_input = [train[name] for name in fixlen_feature_names]
test_model_input = [test[name] for name in fixlen_feature_names]

# 4.Define Model,train,predict and evaluate
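# A minimal sketch of step 4 following the usual DeepCTR example flow; DeepFM
# is an assumed model choice, and log_loss / roc_auc_score are assumed to come
# from sklearn.metrics.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'], )
history = model.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
pred_ans = model.predict(test_model_input, batch_size=256)
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))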
    site_id='context',
    site_domain='context',
    site_category='context',
    app_id='item',
    app_domain='item',
    app_category='item',
    device_model='user',
    device_type='user',
    device_conn_type='context',
    hour='context',
    device_id='user')

fixlen_feature_columns = [
    SparseFeat(name,
               vocabulary_size=data[name].nunique(),
               embedding_dim=16,
               use_hash=False,
               dtype='int32',
               group_name=field_info[name]) for name in sparse_features
]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3.generate input data for model
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}
label_number.append(label_feature_number1)
label_number.append(label_feature_number2)

Y1 = data[target1].values
Y2 = data[target2].values
encoder = LabelEncoder()
encoded_Y1 = encoder.fit_transform(Y1)
encoded_Y2 = encoder.fit_transform(Y2)
dummy_target1 = np_utils.to_categorical(encoded_Y1)
dummy_target2 = np_utils.to_categorical(encoded_Y2)

# 2.count #unique features for each sparse field,and record dense feature field name
fixlen_feature_columns = [
    SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
] + [DenseFeat(feat, 1, ) for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3.generate input data for model
train, test, target1_train, target1_test, target2_train, target2_test = train_test_split(
    data, dummy_target1, dummy_target2, test_size=0.4, random_state=0)
item_profile = data[["movie_id"]].drop_duplicates('movie_id')
user_profile.set_index("user_id", inplace=True)
# user_item_list = data.groupby("user_id")['movie_id'].apply(list)

train_set, test_set = gen_data_set_sdm(data,
                                       seq_short_len=SEQ_LEN_short,
                                       seq_prefer_len=SEQ_LEN_prefer)

train_model_input, train_label = gen_model_input_sdm(train_set, user_profile,
                                                     SEQ_LEN_short, SEQ_LEN_prefer)
test_model_input, test_label = gen_model_input_sdm(test_set, user_profile,
                                                   SEQ_LEN_short, SEQ_LEN_prefer)

# 2.count #unique features for each sparse field and generate feature config for sequence feature
embedding_dim = 32

# for SDM, we must provide `VarLenSparseFeat` with names "prefer_xxx" and
# "short_xxx" together with their lengths
user_feature_columns = [
    SparseFeat('user_id', feature_max_idx['user_id'], 16),
    SparseFeat("gender", feature_max_idx['gender'], 16),
    SparseFeat("age", feature_max_idx['age'], 16),
    SparseFeat("occupation", feature_max_idx['occupation'], 16),
    SparseFeat("zip", feature_max_idx['zip'], 16),
    VarLenSparseFeat(SparseFeat('short_movie_id', feature_max_idx['movie_id'],
                                embedding_dim, embedding_name="movie_id"),
                     SEQ_LEN_short, 'mean', 'short_sess_length'),
    VarLenSparseFeat(SparseFeat('prefer_movie_id', feature_max_idx['movie_id'],
                                embedding_dim, embedding_name="movie_id"),
                     SEQ_LEN_prefer, 'mean', 'prefer_sess_length'),
    VarLenSparseFeat(SparseFeat('short_genres', feature_max_idx['genres'],
                                embedding_dim, embedding_name="genres"),
                     SEQ_LEN_short, 'mean', 'short_sess_length'),
    VarLenSparseFeat(SparseFeat('prefer_genres', feature_max_idx['genres'],
                                embedding_dim, embedding_name="genres"),
                     SEQ_LEN_prefer, 'mean',
    'label': 't_location',
    'len': t_loc_len,
    'map': locmap,
    'weight': None
})
var_info.append({
    'label': 'rs_channel',
    'len': rs_channel_len,
    'map': rschannlemap,
    'weight': None
})

# define model
emb_size = 32
fixlen_feature_columns = [
    SparseFeat(feat, vocabulary_size=len(vocabs[feat]), embedding_dim=emb_size)
    for feat in sparse_features
] + [DenseFeat(feat, 1, ) for feat in dense_features]

varlen_feature_columns = [
    VarLenSparseFeat(SparseFeat(item['label'],
                                vocabulary_size=len(item['map']) + 1,
                                embedding_dim=emb_size),
                     maxlen=item['len'],
                     combiner='mean',
                     weight_name=item['weight']) for item in var_info
]

linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
data = pd.read_csv('./criteo_sample.txt')

sparse_features = ['C' + str(i) for i in range(1, 27)]
dense_features = ['I' + str(i) for i in range(1, 14)]

data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0, )
target = ['label']

# 1.do simple Transformation for dense features
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

# 2.set hashing space for each sparse field,and record dense feature field name
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=1000, embedding_dim=4,
                                     use_hash=True, dtype='string')  # since the input is string
                          for feat in sparse_features] + \
                         [DenseFeat(feat, 1, ) for feat in dense_features]

linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns, )

# 3.generate input data for model
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}
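# With use_hash=True the raw strings are hashed on the fly, so no LabelEncoder
# fit is needed before training. A minimal sketch of the next step, assuming
# DeepFM (any DeepCTR model accepting these columns would do).
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'], )
history = model.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=10, verbose=2, validation_split=0.2, )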
from deepctr.inputs import SparseFeat, get_feature_names

# Load the data
# data = pd.read_csv("movielens_sample.txt")
data = pd.read_csv("movielens_sample_my.csv")
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
target = ['rating']

# Encode the categorical feature labels
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])

# Count the number of distinct values in each feature
fixlen_feature_columns = [
    SparseFeat(feature, data[feature].nunique())
    for feature in sparse_features
]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# Split the dataset into train and test sets
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name: train[name].values for name in feature_names}
test_model_input = {name: test[name].values for name in feature_names}

# Train with DeepFM
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile(
    "adam",
    "mse",
def run(data, ziel, line0, grid, loop):
    poi_feature_transfer = []
    print('++++', '\n', grid)
    for a in range(len(poi_feature)):
        poi_feature_transfer.append('poi_feature_%d' % a)
        data = data.rename(columns={poi_feature[a]: 'poi_feature_%d' % a})

    features = ['provname', 'prefname', 'cntyname', 'townname', 'villname',
                'dispincm', 'urbcode_1', 'hauslvl'] + poi_feature_transfer
    sparse_features = []
    dense_features = []
    for f in features:
        if f not in x_category or x_category[f] == 1:
            dense_features.append(f)
        else:
            sparse_features.append(f)

    data[sparse_features] = data[sparse_features].fillna(-1)
    data[dense_features] = data[dense_features].fillna(0)

    # ziel is the continuous target column (e.g. villmean or income);
    # bucket it into classes using the cut points in line0
    y = []
    y_limit = [np.min(data[ziel]) - 1] + line0 + [np.max(data[ziel])]
    for index, row in data.iterrows():
        for i in range(1, len(y_limit)):
            if y_limit[i - 1] < row[ziel] <= y_limit[i]:
                y.append(i - 1)
                break
    data['income_0'] = y
    target = ['income_0']

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field,and record dense feature field name
    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique())
                              for feat in sparse_features] + \
                             [DenseFeat(feat, 1, ) for feat in dense_features]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    fixlen_feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3.generate input data for model
    train, test = train_test_split(data, test_size=0.2)
    # try to oversample the training set
    # (train_x, train_y) = over_sampling(train[features], train[ziel], 3)
    # train = (np.column_stack((train_x, train_y)))
    train_model_input = [train[name] for name in fixlen_feature_names]
    test_model_input = [test[name] for name in fixlen_feature_names]

    # 4.Define Model,train,predict and evaluate
    ##############################################
    (models, model_names, xlabel) = model_gridsearch(linear_feature_columns,
                                                     dnn_feature_columns, grid)
    logloss, auc1, acc1, pre1, recall1, f11 = [], [], [], [], [], []
    print(ziel, line0, len(data))
    for name, model in zip(model_names, models):
        ll_avg, auc_avg = [], []
        for i in range(loop):
            model.compile("adam", 'binary_crossentropy',
                          metrics=['binary_crossentropy'])
            history = model.fit(train_model_input, train[target].values,
                                batch_size=256, epochs=10, verbose=0,
                                validation_split=0.2, )
            pred_ans = model.predict(test_model_input, batch_size=256)
            true = test[target].values
            '''
            f = open("pred.csv", 'a', encoding='utf_8_sig')
            f.write('%s\n' % (ziel))
            for i in range(len(pred_ans)):
                f.write('%s, %s\n' % (pred_ans[i], true[i]))
            f.close()'''
            ll = round(log_loss(test[target].values, pred_ans), 4)
            auc = round(roc_auc_score(test[target].values, pred_ans), 4)
            # acc = round(accuracy_score(test[target].values, pred_ans.round()), 4)
            # pre = round(precision_score(test[target].values, pred_ans.round()), 4)
            # recall = round(recall_score(test[target].values, pred_ans.round()), 4)
            # f1 = round(f1_score(test[target].values, pred_ans.round(), average='weighted'), 4)
            # spec = round(specificity_score(test[target].values, pred_ans.round(), average='weighted'), 4)
            # sens = round(sensitivity_score(test[target].values, pred_ans.round(), average='weighted'), 4)
            print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
            print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
            ll_avg.append(ll)
            auc_avg.append(auc)
        logloss.append(np.mean(ll_avg))
        auc1.append(np.mean(auc_avg))
        # acc1.append(acc), pre1.append(pre), recall1.append(recall), f11.append(f1)
        '''
        cm = confusion_matrix(test[target].values, pred_ans.round())
        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        cm = []
        for m in range(len(line0) + 1):
            cm.append([])
            for n in range(len(line0) + 1):
                cm[m].append(round(cm_normalized[m][n], 4))
        '''
        '''
        print(name)
        print("LogLoss", ll, end=' ')
        print("AUC", auc, end=' ')
        print("accuracy", acc, end=' ')
        # print("precision", pre, end=' ')
        # print("recall", recall, end=' ')
        print("f1", f1, end=' ')
        print("spec", spec, end=' ')
        print("sens", sens, end=' ')
        print(cm)
        # f = open("DeepFM.csv", 'a', encoding='utf_8_sig')
        # f.write('%s,%s\n' % (ziel, line0))
        # f.write('%s, %s, %s, %s, %s, %s, %s,' % (name, ll, auc, acc, f1, spec, sens))
        # f.write('%s\n' % str(cm).replace(',', ';'))
        # f.close()
        '''
    return (logloss, auc1, xlabel)
def test_PNN_avazu(data, train, test):
    print("\nTesting PNN on avazu dataset...\n")

    results_activation_function = {"auc": [], "logloss": [], "rmse": []}
    results_dropout = {"auc": [], "logloss": [], "rmse": []}
    results_number_of_neurons = {"auc": [], "logloss": [], "rmse": []}

    auc = 0
    logloss = 0
    rmse = 0

    features_labels = train.columns
    sparse_features_labels = features_labels[1:23]
    target_label = features_labels[0]

    dnn_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4, )
        for feat in sparse_features_labels
    ]
    feature_names = get_feature_names(dnn_feature_columns)

    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    true_y = test[target_label].values

    print("\t\t-- ACTIVATION FUNCTIONS --\t\t")
    for dnn_activation in dnn_activation_list:
        print("\nTesting {dnn_activation}...".format(dnn_activation=dnn_activation))
        # model = PNN(dnn_feature_columns, use_inner=False, use_outter=True,
        #             dnn_activation=dnn_activation, task='binary')
        model = PNN(dnn_feature_columns, use_inner=True, use_outter=False,
                    dnn_activation=dnn_activation, task='binary')
        model.compile("adam", "binary_crossentropy",
                      metrics=['binary_crossentropy'], )
        model.fit(train_model_input, train[target_label].values,
                  batch_size=256, epochs=10, verbose=0,
                  validation_split=TEST_PROPORTION, )
        pred_y = model.predict(test_model_input, batch_size=256)

        auc = compute_auc(true_y, pred_y)
        logloss = compute_log_loss(true_y, pred_y)
        rmse = compute_rmse(true_y, pred_y)
        results_activation_function["auc"].append(auc)
        results_activation_function["logloss"].append(logloss)
        results_activation_function["rmse"].append(rmse)

    print("\t\t-- DROPOUT RATES --\t\t")
    for dnn_dropout in dnn_dropout_list:
        print("\nTesting {dnn_dropout}...".format(dnn_dropout=dnn_dropout))
        # model = PNN(dnn_feature_columns, use_inner=False, use_outter=True,
        #             dnn_dropout=dnn_dropout, task='binary')
        model = PNN(dnn_feature_columns, use_inner=True, use_outter=False,
                    dnn_dropout=dnn_dropout, task='binary')
        model.compile("adam", "binary_crossentropy",
                      metrics=['binary_crossentropy'], )
        model.fit(train_model_input, train[target_label].values,
                  batch_size=256, epochs=10, verbose=0,
                  validation_split=TEST_PROPORTION, )
        pred_y = model.predict(test_model_input, batch_size=256)

        auc = compute_auc(true_y, pred_y)
        logloss = compute_log_loss(true_y, pred_y)
        rmse = compute_rmse(true_y, pred_y)
        results_dropout["auc"].append(auc)
        results_dropout["logloss"].append(logloss)
        results_dropout["rmse"].append(rmse)

    print("\t\t-- HIDDEN UNITS --\t\t")
    for dnn_hidden_units in dnn_hidden_units_list:
        print("\nTesting {dnn_hidden_units}...".format(dnn_hidden_units=dnn_hidden_units))
        # model = PNN(dnn_feature_columns, use_inner=False, use_outter=True,
        #             dnn_hidden_units=dnn_hidden_units, task='binary')
        model = PNN(dnn_feature_columns, use_inner=True, use_outter=False,
                    dnn_hidden_units=dnn_hidden_units, task='binary')
        model.compile("adam", "binary_crossentropy",
                      metrics=['binary_crossentropy'], )
        model.fit(train_model_input, train[target_label].values,
                  batch_size=256, epochs=10, verbose=0,
                  validation_split=TEST_PROPORTION)
        pred_y = model.predict(test_model_input, batch_size=256)

        auc = compute_auc(true_y, pred_y)
        logloss = compute_log_loss(true_y, pred_y)
        rmse = compute_rmse(true_y, pred_y)
        results_number_of_neurons["auc"].append(auc)
        results_number_of_neurons["logloss"].append(logloss)
        results_number_of_neurons["rmse"].append(rmse)

    if PLOT:
        # create_plots("OPNN", "avazu", results_activation_function,
        #              "Activation Function", "activation_func", dnn_activation_list)
        # create_plots("OPNN", "avazu", results_dropout,
        #              "Dropout Rate", "dropout", dnn_dropout_list)
        # create_plots("OPNN", "avazu", results_number_of_neurons,
        #              "Number of Neurons per layer", "nr_neurons", dnn_hidden_units_list)
        create_plots("PNN", "avazu", results_activation_function,
                     "Activation Function", "activation_func", dnn_activation_list)
        create_plots("PNN", "avazu", results_dropout,
                     "Dropout Rate", "dropout", dnn_dropout_list)
        create_plots("PNN", "avazu", results_number_of_neurons,
                     "Number of Neurons per layer", "nr_neurons", dnn_hidden_units_list)
def _preprocess_movielens(df, **kw):
    multiple_value = kw.get('multiple_value')
    sparse_col = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
    target = ['rating']

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_col:
        lbe = LabelEncoder()
        df[feat] = lbe.fit_transform(df[feat])

    if not multiple_value:
        # 2.count #unique features for each sparse field
        fixlen_cols = [SparseFeat(feat, df[feat].nunique(), embedding_dim=4)
                       for feat in sparse_col]
        linear_cols = fixlen_cols
        dnn_cols = fixlen_cols
        train, test = train_test_split(df, test_size=0.2)
        ytrue = test[target].values
    else:
        ytrue = df[target].values
        hash_feature = kw.get('hash_feature', False)
        if not hash_feature:
            def split(x):
                key_ans = x.split('|')
                for key in key_ans:
                    if key not in key2index:
                        # Notice : input value 0 is a special "padding",
                        # so we do not use 0 to encode valid feature for sequence input
                        key2index[key] = len(key2index) + 1
                return list(map(lambda x: key2index[x], key_ans))

            # preprocess the sequence feature
            key2index = {}
            genres_list = list(map(split, df['genres'].values))
            genres_length = np.array(list(map(len, genres_list)))
            max_len = max(genres_length)
            # Notice : padding=`post`
            genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )

            fixlen_cols = [SparseFeat(feat, df[feat].nunique(), embedding_dim=4)
                           for feat in sparse_col]
            use_weighted_sequence = False
            if use_weighted_sequence:
                varlen_cols = [VarLenSparseFeat(
                    SparseFeat('genres', vocabulary_size=len(key2index) + 1,
                               embedding_dim=4),
                    maxlen=max_len, combiner='mean',
                    weight_name='genres_weight')]  # Notice : value 0 is for padding for sequence input feature
            else:
                varlen_cols = [VarLenSparseFeat(
                    SparseFeat('genres', vocabulary_size=len(key2index) + 1,
                               embedding_dim=4),
                    maxlen=max_len, combiner='mean',
                    weight_name=None)]  # Notice : value 0 is for padding for sequence input feature

            linear_cols = fixlen_cols + varlen_cols
            dnn_cols = fixlen_cols + varlen_cols

            # generate input data for model; the 'genres' sequence must be fed
            # alongside the sparse inputs, since the varlen column expects it
            model_input = {name: df[name] for name in sparse_col}
            model_input["genres"] = genres_list
            model_input["genres_weight"] = np.random.randn(df.shape[0], max_len, 1)
        else:
            df[sparse_col] = df[sparse_col].astype(str)
            # 1.Use hashing encoding on the fly for sparse features,and process sequence features
            genres_list = list(map(lambda x: x.split('|'), df['genres'].values))
            genres_length = np.array(list(map(len, genres_list)))
            max_len = max(genres_length)
            # Notice : padding=`post`
            genres_list = pad_sequences(genres_list, maxlen=max_len,
                                        padding='post', dtype=str, value=0)

            # 2.set hashing space for each sparse field and generate feature config for sequence feature
            fixlen_cols = [
                SparseFeat(feat, df[feat].nunique() * 5, embedding_dim=4,
                           use_hash=True, dtype='string') for feat in sparse_col]
            varlen_cols = [
                VarLenSparseFeat(SparseFeat('genres', vocabulary_size=100,
                                            embedding_dim=4, use_hash=True,
                                            dtype="string"),
                                 maxlen=max_len, combiner='mean', )]  # Notice : value 0 is for padding for sequence input feature
            linear_cols = fixlen_cols + varlen_cols
            dnn_cols = fixlen_cols + varlen_cols
            feature_names = get_feature_names(linear_cols + dnn_cols)

            # 3.generate input data for model
            model_input = {name: df[name] for name in feature_names}
            model_input['genres'] = genres_list

        train, test = model_input, model_input

    return df, linear_cols, dnn_cols, train, test, target, ytrue