def fit_test(self, train_X, train_Y, val_X, val_Y, test_X, test_Y, cat_cols):
    # Split the column indices into categorical (sparse) and numeric (dense) features.
    sparse_features = cat_cols
    dense_features = [idx for idx in range(train_X.shape[1]) if idx not in cat_cols]

    # Build deepctr-torch feature columns; vocabulary_size is padded by 1 for unseen values.
    sparse_feature_columns = [
        SparseFeat(str(feat),
                   vocabulary_size=len(set(train_X[:, feat])) + 1,
                   embedding_dim=4)
        for feat in sparse_features
    ]
    dense_feature_columns = [DenseFeat(str(feat), 1) for feat in dense_features]
    dnn_feature_columns = sparse_feature_columns + dense_feature_columns
    linear_feature_columns = sparse_feature_columns + dense_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # Feature names are the stringified column indices, so they map straight back to columns.
    train_model_input = {name: train_X[:, int(name)] for name in feature_names}
    val_model_input = {name: val_X[:, int(name)] for name in feature_names}
    test_model_input = {name: test_X[:, int(name)] for name in feature_names}

    self.device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        self.device = 'cuda:0'

    self.model = xDeepFM(linear_feature_columns, dnn_feature_columns,
                         task='binary', device=self.device)
    self.model.compile(Adam(self.model.parameters(), 0.0001),
                       "binary_crossentropy",
                       metrics=['binary_crossentropy'])
    es = EarlyStopping(monitor='val_binary_crossentropy', min_delta=0,
                       verbose=1, patience=30, mode='min')

    # Encode the labels once and reuse the same encoder for the validation split.
    lbe = LabelEncoder()
    self.model.fit(train_model_input, lbe.fit_transform(train_Y),
                   batch_size=512, epochs=21, verbose=2,
                   validation_data=(val_model_input, lbe.transform(val_Y)),
                   callbacks=[es])

    pred_ans = self.model.predict(test_model_input, batch_size=256)
    print(f'{log_loss(test_Y, pred_ans):.5f}')
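# A hedged usage sketch for fit_test, assuming it is defined at module level as above and is
# normally hosted by a small wrapper class. Everything below (the _Runner host class and the
# synthetic NumPy data) is illustrative, not part of the original code.
import numpy as np

class _Runner:
    """Hypothetical host object for the fit_test method above."""

_Runner.fit_test = fit_test

rng = np.random.default_rng(0)
cat_cols = [0, 1]                     # columns 0 and 1 are categorical

def _make_split(n, n_cols=5):
    X = rng.random((n, n_cols))
    X[:, cat_cols] = rng.integers(0, 10, size=(n, len(cat_cols)))
    y = rng.integers(0, 2, size=n)
    return X, y

train_X, train_Y = _make_split(800)
val_X, val_Y = _make_split(100)
test_X, test_Y = _make_split(100)

_Runner().fit_test(train_X, train_Y, val_X, val_Y, test_X, test_Y, cat_cols)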
def test_xDeepFM(dnn_hidden_units, cin_layer_size, cin_split_half, cin_activation,
                 sparse_feature_num, dense_feature_dim):
    model_name = 'xDeepFM'
    sample_size = SAMPLE_SIZE
    x, y, feature_columns = get_test_data(sample_size, sparse_feature_num,
                                          sparse_feature_num)
    model = xDeepFM(feature_columns, feature_columns,
                    dnn_hidden_units=dnn_hidden_units,
                    cin_layer_size=cin_layer_size,
                    cin_split_half=cin_split_half,
                    cin_activation=cin_activation,
                    dnn_dropout=0.5)
    check_model(model, model_name, x, y)
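# One concrete configuration the test can be exercised with, assuming the test utilities
# (SAMPLE_SIZE, get_test_data, check_model) are importable. The parameter values are
# illustrative assumptions; the suite would normally drive test_xDeepFM via
# @pytest.mark.parametrize rather than a direct call.
test_xDeepFM(dnn_hidden_units=(8,), cin_layer_size=(4, 8), cin_split_half=True,
             cin_activation='relu', sparse_feature_num=2, dense_feature_dim=2)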
def test_recommend_movies(pretrained, inputs, linear_feature_columns, dnn_feature_columns, DEVICE):
    # Rebuild the model with the same feature columns used for training, then restore the weights.
    model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='regression', device=DEVICE)
    model.load_state_dict(torch.load(pretrained))

    pred_ans = model.predict(inputs, batch_size=256)
    # print(f'Predict rating: {pred_ans}')

    # Rank all rows by predicted rating (descending) and keep the five best.
    # Note: relies on a module-level DataFrame `data` whose rows match `inputs`.
    pred_movie_list = []
    idx = np.argsort(pred_ans, axis=0)[::-1]
    for i, ans_idx in enumerate(idx[:5, 0]):
        print(f"Top {i + 1} predict rating: {pred_ans[ans_idx][0]:.3f}, "
              f"movie_id: {data.iloc[ans_idx]['movie_id']}, "
              f"gender: {data.iloc[ans_idx]['gender']}, "
              f"age: {data.iloc[ans_idx]['age']}, "
              f"user_id: {data.iloc[ans_idx]['user_id']}")
        pred_movie_list.append(data.iloc[ans_idx]['movie_id'])
        # print(inputs['movie_id'].iloc[i])
    # print('movie_max', data.loc[:, ['movie_id']].max(axis=0))  # 1682
    return pred_movie_list
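# A hedged sketch of how test_recommend_movies might be invoked, assuming `data` is the same
# label-encoded DataFrame used during training and that a checkpoint saved by
# train_recommend_movies (below) exists. The checkpoint filename is hypothetical.
sparse_features = ["movie_id", "gender", "age"]
feature_columns = [SparseFeat(feat, data[feat].nunique()) for feat in sparse_features]
inputs = {name: data[name] for name in get_feature_names(feature_columns)}

top5 = test_recommend_movies(
    pretrained='save_model/xDeepFM_MSE0.7891.h5',   # hypothetical checkpoint path
    inputs=inputs,
    linear_feature_columns=feature_columns,
    dnn_feature_columns=feature_columns,
    DEVICE='cuda:0' if torch.cuda.is_available() else 'cpu')
print('Recommended movie ids:', top5)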
def train_recommend_movies(csv_file, DEVICE):
    """
    Description:
        Train a recommender on:
            Model: "xDeepFM",
            Target: "rating",
            Input features: ["movie_id", "gender", "age"],
            Save model to: "save_model/xDeepFM_MSE{}.h5"
    Parameters:
        csv_file: path to a *.csv file
        DEVICE: e.g. "cuda:0"
    """
    data = pd.read_csv(csv_file)
    sparse_features = ["movie_id", "gender", "age"]
    target = ['rating']

    # 1. Label-encode the sparse features (dense features would only need a simple transformation).
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    # 2. Count unique values per sparse field and build the library's own feature-column type, e.g.:
    #    SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False,
    #               dtype='int32', embedding_name='movie_id', group_name='default_group')
    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique()) for feat in sparse_features]
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)  # ['movie_id', 'gender', 'age']

    # 3. Generate the input data for the model.
    train, test = train_test_split(data, test_size=0.2)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4. Define the model, train, predict and evaluate.
    device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = DEVICE
    # model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression', device=device)
    # model = FiBiNET(linear_feature_columns, dnn_feature_columns, task='regression', device=device)
    model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='regression', device=device)
    model.compile("adam", "mse", metrics=['mse'])
    history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2)

    pred_ans = model.predict(test_model_input, batch_size=256)
    test_mse = round(mean_squared_error(test[target].values, pred_ans), 4)
    print("test MSE", test_mse)
    torch.save(model.state_dict(), 'save_model/xDeepFM_MSE{}.h5'.format(test_mse))
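# Typical invocation of the training routine above. The CSV path is a placeholder, and the
# save_model/ directory is created up front because torch.save does not create missing directories.
import os

os.makedirs('save_model', exist_ok=True)
train_recommend_movies(csv_file='movielens_sample.csv',   # hypothetical path
                       DEVICE='cuda:0')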
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

# 4. Define the model, train, predict and evaluate.
device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

model = xDeepFM(linear_feature_columns=linear_feature_columns,
                dnn_feature_columns=dnn_feature_columns,
                dnn_hidden_units=(16,),   # assumption: the original passed 'mlet_dim=(16,)', which is not an xDeepFM argument
                task='binary',
                l2_reg_embedding=1e-5,
                device=device)
model.compile("adagrad", "binary_crossentropy",
              metrics=["binary_crossentropy", "auc"])
model.fit(train_model_input, train[target].values,
          batch_size=32, epochs=10, validation_split=0.0, verbose=2)
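# The excerpt above stops after fitting; a minimal follow-up sketch for evaluating on the
# held-out split, assuming scikit-learn is available (log_loss / roc_auc_score do not
# appear in the original excerpt):
from sklearn.metrics import log_loss, roc_auc_score

pred_ans = model.predict(test_model_input, batch_size=256)
print('test LogLoss', round(log_loss(test[target].values, pred_ans), 4))
print('test AUC', round(roc_auc_score(test[target].values, pred_ans), 4))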
X_train = model_feed_dict(train[feature_name])   # assumed to be built the same way as the other splits
X_valid = model_feed_dict(valid[feature_name])
X_test = model_feed_dict(test[feature_name])
Y = train['label'].values
valid_Y = valid['label'].values

torch.cuda.empty_cache()
device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'
# torch.autograd.set_detect_anomaly(True)

model = xDeepFM(linear_feature_columns, dnn_feature_columns, device=device)
model.compile("adam", 'binary_crossentropy', ["auc"])
model.fit(X_train, Y, batch_size=4096, epochs=1,
          validation_data=(X_valid, valid_Y), verbose=1)
# Fine-tune on the validation split as well before predicting on the test set.
model.fit(X_valid, valid_Y, batch_size=4096)

answer = model.predict(X_test, batch_size=8192)
submit = pd.DataFrame()
submit['id'] = test['id'].astype(int)
submit['probability'] = np.round(answer.flatten(), 6)
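# The excerpt builds the submission frame but never writes it out; a likely final step
# (the output filename is an assumption):
submit.to_csv('submission.csv', index=False)
print(submit.head())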