# Training script: full Criteo run (DeepFM + RAdam).
import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.utils.data import sampler

import radam
from model.DeepFM import DeepFM
from data.dataset import CriteoDataset

# fix random seeds for reproducibility
seed = 20170705
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

train_file = "train_large.txt"
feature_sizes_file = "feature_sizes_large.txt"
debug = False
#train_file = "train.txt"
#feature_sizes_file = "feature_sizes.txt"
#debug = True

# load data
train_data = CriteoDataset('./data', train=True, train_file=train_file)

# split train and valid set
train_idx, valid_idx = split_train_and_valid(train_data, debug)

# loaders
loader_train = DataLoader(train_data, batch_size=256,
                          sampler=sampler.SubsetRandomSampler(train_idx),
                          num_workers=0)
loader_val = DataLoader(train_data, batch_size=1000,
                        sampler=sampler.SubsetRandomSampler(valid_idx),
                        num_workers=0)

# read the size of each feature field
feature_sizes = np.loadtxt('./data/{}'.format(feature_sizes_file), delimiter=',')
feature_sizes = [int(x) for x in feature_sizes]
print(feature_sizes)

model = DeepFM(feature_sizes, use_cuda=True, overfitting=debug)
#optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=0.0)
optimizer = radam.RAdam(model.parameters(), lr=1e-3, weight_decay=0.0)
model.fit(loader_train, loader_val, optimizer, epochs=1000, verbose=True,
          print_every=1000, checkpoint_dir="./chkp")
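# `split_train_and_valid` is called above but not defined in this file. A
# minimal hypothetical sketch, assuming it shuffles the dataset indices and
# holds out a fraction for validation, shrinking the data in debug mode
# (the 10% ratio, the debug cap, and the exact semantics of the signature
# are assumptions, not the author's confirmed implementation):
def split_train_and_valid(dataset, debug, valid_ratio=0.1, seed=20170705):
    n = min(len(dataset), 10000) if debug else len(dataset)
    rng = np.random.RandomState(seed)
    idx = rng.permutation(n)
    n_valid = int(n * valid_ratio)
    return idx[n_valid:], idx[:n_valid]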
# Training script: 1,000,000-row Criteo sample with a fixed index split.
import numpy as np
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler

from model.DeepFM import DeepFM
from data.dataset import CriteoDataset

# 900000 items for training, 100000 items for validation, of all 1000000 items
Num_train = 900000

# load data
train_data = CriteoDataset('./data', train=True)
loader_train = DataLoader(train_data, batch_size=100,
                          sampler=sampler.SubsetRandomSampler(range(Num_train)))
val_data = CriteoDataset('./data', train=True)
loader_val = DataLoader(val_data, batch_size=100,
                        sampler=sampler.SubsetRandomSampler(range(Num_train, 1000000)))

feature_sizes = np.loadtxt('./data/feature_sizes.txt', delimiter=',')
feature_sizes = [int(x) for x in feature_sizes]
print(feature_sizes)

model = DeepFM(feature_sizes, use_cuda=False)
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=0.0)
model.fit(loader_train, loader_val, optimizer, epochs=5, verbose=True)
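# A minimal sketch of the Dataset contract the loaders above rely on: each
# item is an (Xi, Xv, y) triple of feature indices, feature values, and the
# click label. This toy stand-in is an assumption for illustration, not the
# repo's actual data/dataset.py.
from torch.utils.data import Dataset

class ToyCriteoDataset(Dataset):
    def __init__(self, Xi, Xv, y):
        self.Xi, self.Xv, self.y = Xi, Xv, y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.Xi[idx], self.Xv[idx], self.y[idx]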
# Training loop (MXNet Gluon): DeepFM / xDeepFM / MergeModel on the
# finish/like tasks. DeepFM, xDeepFM and MergeModel are this project's
# model classes; their imports are omitted here as in the original.
import time
from mxnet import init
from mxnet.gluon import Trainer
from mxnet.gluon import data as gdata


def train(args, feature_dict, feature_values, label, ctx,
          validation_feature, validation_label):
    # build the requested architecture
    if args.MODEL == 'deepfm':
        model = DeepFM(feature_dict, args, ctx, args.TASK)
    elif args.MODEL == 'xdeepfm':
        model = xDeepFM(feature_dict, args, ctx, args.TASK)
    else:
        model = MergeModel(feature_dict, args, ctx, args.TASK)

    # initialize parameters, optionally warm-starting from a checkpoint
    model.initialize(init=init.Xavier(), ctx=ctx)
    if args.TASK == 'finish':
        if args.FINISH_MODEL_PATH is not None:
            model.load_params(args.FINISH_MODEL_PATH)
    elif args.LIKE_MODEL_PATH is not None:
        model.load_params(args.LIKE_MODEL_PATH)

    # task-specific hyperparameters
    if args.TASK == 'finish':
        lr = args.FINISH_LEARNING_RATE
        epochs = args.FINISH_NUM_EPOCHS
        batch = args.FINISH_BATCH_SIZE
    else:
        lr = args.LIKE_LEARNING_RATE
        epochs = args.LIKE_NUM_EPOCHS
        batch = args.LIKE_BATCH_SIZE

    if args.OPTIMIZER == 'adam':
        model_trainer = Trainer(model.collect_params(), args.OPTIMIZER,
                                {'learning_rate': lr, 'wd': args.WEIGHT_DECAY})
    else:
        model_trainer = Trainer(model.collect_params(), args.OPTIMIZER,
                                {'learning_rate': lr})

    for epoch in range(epochs):
        # re-shuffle the training data each epoch
        # (when using pd.read_csv, drop the name column)
        train_iter = gdata.DataLoader(gdata.ArrayDataset(feature_values, label),
                                      batch_size=batch, shuffle=True)
        time_start = time.time()
        train_epoch_loss, train_acc = model.train_epoch(epoch, train_iter,
                                                        model_trainer)

        test_iter = gdata.DataLoader(
            gdata.ArrayDataset(validation_feature, validation_label),
            batch_size=batch, shuffle=False)
        epoch_loss, epoch_test_acc = model.eval_model(test_iter)

        train_num = len(feature_values)
        print('\n[%s] net_name:[%s], EPOCH FINISH [%d], time_cost [%d]s, '
              'average_loss:[%f], eval_model_loss:[%f], train_acc:[%f], '
              'test_acc:[%f]' % (
                  time.strftime("%Y-%m-%d %H:%M:%S"), model.task, epoch + 1,
                  int(time.time() - time_start),
                  train_epoch_loss / train_num,
                  epoch_loss / len(validation_feature),
                  train_acc, epoch_test_acc))

        # save a checkpoint after every epoch
        filename = (args.SAVE_PARAMS_PATH_PREFIX + '/net_' + args.MODEL + '_' +
                    model.task + '_' + args.CONFIG_NAME + '_' +
                    time.strftime("%Y%m%d_%H%M%S") + '.model')
        model.save_params(filename)
    return model
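# `model.train_epoch` above belongs to the project's model classes. A minimal
# hypothetical sketch of one Gluon training epoch, assuming sigmoid binary
# cross-entropy and a (total_loss, accuracy) return value, to show what the
# call does; the loss choice and return contract are assumptions:
from mxnet import autograd
from mxnet.gluon import loss as gloss

def train_epoch_sketch(net, train_iter, trainer, ctx):
    loss_fn = gloss.SigmoidBinaryCrossEntropyLoss()
    total_loss, n_correct, n_seen = 0.0, 0.0, 0
    for X, y in train_iter:
        X, y = X.as_in_context(ctx), y.as_in_context(ctx)
        with autograd.record():
            logits = net(X)
            loss = loss_fn(logits, y)
        loss.backward()
        trainer.step(X.shape[0])  # normalize gradients by the batch size
        total_loss += loss.sum().asscalar()
        pred = (logits.sigmoid() > 0.5).astype('float32')
        n_correct += (pred.reshape(y.shape) == y).sum().asscalar()
        n_seen += X.shape[0]
    return total_loss, n_correct / n_seen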
# Submission script: field-wise DeepFM on categorical + real-valued features.
# `select_features_by_corr`, `training_info`, `path_test` and `path_result`
# are defined elsewhere in this module, as is the DeepFM class.
import math
import numpy as np
import pandas as pd
import sklearn.metrics
from sklearn import preprocessing


def DeepFM_sub(train, test):

    def evaluate_by_batch():
        # predict the test set in batches to bound memory use
        pred = np.ndarray([n_instances_test])
        n_iter_pred = int(math.ceil(n_instances_test / batch_size))
        print('evaluating')
        for i_eval in range(n_iter_pred):
            print("{}/{}".format(i_eval, n_iter_pred))
            idx_start = i_eval * batch_size
            idx_end = min((i_eval + 1) * batch_size, n_instances_test)
            pred[idx_start:idx_end] = classifier.predict_proba(
                Xi_test[idx_start:idx_end], Xv_test[idx_start:idx_end])
        return pred

    def save_checkpoint(pred, postfix):
        # merge predictions back onto the raw test file and write a submission
        test['predicted_score'] = pred
        sub1 = test[['instance_id', 'predicted_score']]
        sub = pd.read_csv(path_test, sep=r"\s+")
        sub = pd.merge(sub, sub1, on=['instance_id'], how='left')
        sub = sub.fillna(0)
        path_save_result = (path_result + 'result_' + training_info + '_' +
                            postfix + '.txt')
        sub[['instance_id', 'predicted_score']].to_csv(path_save_result,
                                                       sep=" ", index=False)
        print(path_save_result + ' saved')

    batch_size = 16
    # (an earlier dtype-based column split was superseded by
    # select_features_by_corr; see the sketch after this function)
    col_cate, col_real = select_features_by_corr(train)
    n_fields_cate = len(col_cate)
    n_fields_real = len(col_real)
    n_fields = n_fields_cate + n_fields_real
    n_instances_train = len(train)
    n_instances_test = len(test)

    enc_lbl = preprocessing.LabelEncoder()
    feature_sizes_cate = np.ndarray((n_fields_cate,), dtype=int)
    feature_sizes_real = np.ndarray((n_fields_real,), dtype=int)

    Xi_train_cate = np.ndarray((n_fields_cate, n_instances_train), dtype=int)
    Xv_train_cate = np.ndarray((n_fields_cate, n_instances_train), dtype=int)
    Xi_train_real = np.ndarray((n_fields_real, n_instances_train))
    Xv_train_real = np.ndarray((n_fields_real, n_instances_train))

    y_train = train['is_trade'].values
    dist = np.bincount(y_train.astype(np.int64)).tolist()
    assert y_train.size == dist[0] + dist[1]
    print('label distribution')
    print('label 0 = ' + str(dist[0]))
    print('label 1 = ' + str(dist[1]))
    # inverse-frequency class weights for the imbalanced binary labels
    class_weight = [dist[1] / (dist[0] + dist[1]),
                    dist[0] / (dist[0] + dist[1])]
    print(class_weight)

    Xi_test_cate = np.ndarray((n_fields_cate, n_instances_test), dtype=int)
    Xv_test_cate = np.ndarray((n_fields_cate, n_instances_test), dtype=int)
    Xi_test_real = np.ndarray((n_fields_real, n_instances_test))
    Xv_test_real = np.ndarray((n_fields_real, n_instances_test))

    # categorical features: label-encode on train+test together so the index
    # spaces stay consistent (the original refit the encoder on test alone
    # via fit_transform, which breaks that consistency)
    for i, c in enumerate(col_cate):
        enc_lbl.fit(pd.concat([train[c], test[c]]))
        feature_sizes_cate[i] = len(enc_lbl.classes_)
        train[c] = enc_lbl.transform(train[c])
        test[c] = enc_lbl.transform(test[c])
        Xi_train_cate[i] = train[c].astype(int)
        Xv_train_cate[i] = np.ones((n_instances_train,))
        Xi_test_cate[i] = test[c].astype(int)
        Xv_test_cate[i] = np.ones((n_instances_test,))

    # real-valued features: one "feature" per field, index 0, raw value
    for i, c in enumerate(col_real):
        feature_sizes_real[i] = 1
        Xi_train_real[i] = np.zeros((n_instances_train,), dtype=int)
        Xv_train_real[i] = train[c]
        Xi_test_real[i] = np.zeros((n_instances_test,), dtype=int)
        Xv_test_real[i] = test[c]

    feature_sizes = np.concatenate((feature_sizes_cate,
                                    feature_sizes_real)).tolist()
    Xi_train = np.concatenate((Xi_train_cate, Xi_train_real)).swapaxes(0, 1)
    Xv_train = np.concatenate((Xv_train_cate, Xv_train_real)).swapaxes(0, 1)
    Xi_test = np.concatenate((Xi_test_cate, Xi_test_real)).swapaxes(0, 1)
    Xv_test = np.concatenate((Xv_test_cate, Xv_test_real)).swapaxes(0, 1)
    Xi_valid = Xv_valid = y_valid = None  # no separate validation split here

    print('n_fields : ' + str(n_fields))
    print('len(feature_sizes) : ' + str(len(feature_sizes)))
    print('max(feature_sizes) : ' + str(max(feature_sizes)))

    print('create ' + training_info + ' classifier')
    classifier = DeepFM(
        n_fields, feature_sizes,
        embedding_size=20,
        h_depth=2, deep_layers=[32, 32],
        is_deep_dropout=True, dropout_deep=[0.0, 0.2, 0.2],
        is_batch_norm=True,
        verbose=True,
        weight_decay=0.002,
        n_epochs=1, batch_size=batch_size,
        eval_metric=sklearn.metrics.average_precision_score,
        use_fm=True, use_ffm=False, use_cuda=False,
        class_weight=class_weight,
    )

    print(training_info + ' classifier fitting')
    for i in range(64):
        # `ealry_stopping` is the keyword as spelled in this DeepFM
        # implementation's fit signature
        classifier.fit(Xi_train, Xv_train, y_train, ealry_stopping=True)
        pred = evaluate_by_batch()
        save_checkpoint(pred, '{:02d}'.format(i))
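# `select_features_by_corr` is defined elsewhere in the module. A minimal,
# hypothetical sketch: split columns into integer-typed (categorical) and
# real-valued fields, keeping real columns whose absolute correlation with
# the label clears a threshold. The 0.01 cutoff and the exclusion list are
# assumptions, not the author's confirmed logic.
import numpy as np

def select_features_by_corr_sketch(train, threshold=0.01):
    exclude = ['is_trade', 'instance_id']
    col = [c for c in train.columns if c not in exclude]
    col_cate = [c for c in col if np.issubdtype(train[c].dtype, np.integer)]
    col_real = [c for c in col if c not in col_cate]
    col_real = [c for c in col_real
                if abs(train[c].corr(train['is_trade'])) >= threshold]
    return col_cate, col_real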
# Inference script: train on the loaders built earlier, then predict the
# test set and write the submission (sparse_features, train_raw, loader_train,
# loader_val and result_process are defined earlier in this script).
import time
import numpy as np
import torch.optim as optim
from torch.utils.data import DataLoader

# test set
test_data = CriteoDataset(test_data, train=True)
loader_test = DataLoader(test_data)

# read the size of each feature
feature_sizes = np.loadtxt('./data/feature_sizes.txt', delimiter=',')
feature_sizes = [int(x) for x in feature_sizes]
print(feature_sizes)

# numbers of dense and sparse features
n_sparse = len(sparse_features)
n_dense = len(train_raw.columns.tolist()) - n_sparse - 1  # minus the label column

start = time.process_time()

# model training
model = DeepFM(feature_sizes, n_dense, n_sparse, use_cuda=False)
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=0.0)
model.fit(loader_train, loader_val, optimizer, epochs=50, verbose=True)

# model prediction
y_pred = model.predict(loader_test)
print(y_pred)
results = result_process(np.array(y_pred))
print(results)
results.to_csv('../../code result/label/label0803.csv', index=False)

end = time.process_time()
print('Running time: %d seconds' % (end - start))
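# `result_process` is defined earlier in the script. A hypothetical sketch,
# assuming it thresholds predicted click probabilities into 0/1 labels and
# wraps them in a DataFrame for the CSV above; the column name and the 0.5
# threshold are assumptions:
import pandas as pd

def result_process_sketch(y_pred, threshold=0.5):
    labels = (y_pred >= threshold).astype(int)
    return pd.DataFrame({'label': labels})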