Example #1
import numpy as np
import torch
import torch.optim as optim  # needed if the Adam alternative below is used
from torch.utils.data import DataLoader
from torch.utils.data import sampler

from model.DeepFM import DeepFM
from data.dataset import CriteoDataset
import radam  # third-party RAdam optimizer, assumed importable as `radam`

seed = 20170705
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
train_file = "train_large.txt"
feature_sizes_file = "feature_sizes_large.txt"
debug = False
#train_file = "train.txt"
#feature_sizes_file = "feature_sizes.txt"
#debug = True

# load data
train_data = CriteoDataset('./data', train=True, train_file=train_file)

# split train and valid sets
train_idx, valid_idx = split_train_and_valid(train_data, debug)

# loader
loader_train = DataLoader(train_data, batch_size=256, sampler=sampler.SubsetRandomSampler(train_idx), num_workers=0)
loader_val = DataLoader(train_data, batch_size=1000, sampler=sampler.SubsetRandomSampler(valid_idx), num_workers=0)

feature_sizes = np.loadtxt('./data/{}'.format(feature_sizes_file), delimiter=',')
feature_sizes = [int(x) for x in feature_sizes]
print(feature_sizes)

model = DeepFM(feature_sizes, use_cuda=True, overfitting=debug)
#optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=0.0)
optimizer = radam.RAdam(model.parameters(), lr=1e-3, weight_decay=0.0)
model.fit(loader_train, loader_val, optimizer, epochs=1000, verbose=True, print_every=1000, checkpoint_dir="./chkp")
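
split_train_and_valid is not defined in this snippet; a minimal sketch of a compatible helper, assuming it shuffles the dataset indices and holds out a fraction for validation (the real implementation may differ, e.g. in how debug mode shrinks the split):

def split_train_and_valid(dataset, debug, valid_ratio=0.1):
    # Hypothetical helper: shuffle all indices and hold out a slice for validation.
    n = len(dataset) if not debug else min(len(dataset), 10000)
    idx = np.random.permutation(n)
    n_valid = int(n * valid_ratio)
    return idx[n_valid:].tolist(), idx[:n_valid].tolist()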
Example #2
import numpy as np
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler

from model.DeepFM import DeepFM
from data.dataset import CriteoDataset

# 900000 items for training, 100000 items for validation, out of 1000000 items in total
Num_train = 900000

# load data
train_data = CriteoDataset('./data', train=True)
loader_train = DataLoader(train_data,
                          batch_size=100,
                          sampler=sampler.SubsetRandomSampler(
                              range(Num_train)))
val_data = CriteoDataset('./data', train=True)
loader_val = DataLoader(val_data,
                        batch_size=100,
                        sampler=sampler.SubsetRandomSampler(
                            range(Num_train, 1000000)))

feature_sizes = np.loadtxt('./data/feature_sizes.txt', delimiter=',')
feature_sizes = [int(x) for x in feature_sizes]
print(feature_sizes)

model = DeepFM(feature_sizes, use_cuda=False)
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=0.0)
model.fit(loader_train, loader_val, optimizer, epochs=5, verbose=True)
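
Since train_data and val_data above read the same file, a single dataset shared by two samplers (as in Example #1) avoids loading the data twice; a minimal equivalent sketch:

data = CriteoDataset('./data', train=True)
loader_train = DataLoader(data, batch_size=100,
                          sampler=sampler.SubsetRandomSampler(range(Num_train)))
loader_val = DataLoader(data, batch_size=100,
                        sampler=sampler.SubsetRandomSampler(range(Num_train, 1000000)))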
Example #3
import time
import numpy as np
from mxnet import init
from mxnet.gluon import Trainer
from mxnet.gluon import data as gdata

# DeepFM, xDeepFM and MergeModel are this project's model classes, imported elsewhere.
def train(args, feature_dict, feature_values, label, ctx, validation_feature,
          validation_label):
    if args.MODEL == 'deepfm':
        model = DeepFM(feature_dict, args, ctx, args.TASK)
    elif args.MODEL == 'xdeepfm':
        model = xDeepFM(feature_dict, args, ctx, args.TASK)
    else:
        model = MergeModel(feature_dict, args, ctx, args.TASK)
    # initialize weights, then optionally warm-start from a saved checkpoint
    model.initialize(init=init.Xavier(), ctx=ctx)
    model_path = (args.FINISH_MODEL_PATH
                  if args.TASK == 'finish' else args.LIKE_MODEL_PATH)
    if model_path is not None:
        model.load_params(model_path)
    # print(model.collect_params())
    # train_iter = gdata.DataLoader(gdata.ArrayDataset(feature_values, label),
    #                               batch_size=args.BATCH_SIZE, shuffle=True)  # pd.read_csv with the name column removed
    if args.TASK == 'finish':
        lr = args.FINISH_LEARNING_RATE
    else:
        lr = args.LIKE_LEARNING_RATE
    if args.OPTIMIZER == 'adam':
        model_trainer = Trainer(model.collect_params(), args.OPTIMIZER, {
            'learning_rate': lr,
            'wd': args.WEIGHT_DECAY
        })
    else:
        model_trainer = Trainer(model.collect_params(), args.OPTIMIZER,
                                {'learning_rate': lr})
    if args.TASK == 'finish':
        epochs = args.FINISH_NUM_EPOCHS
        batch = args.FINISH_BATCH_SIZE

    else:
        epochs = args.LIKE_NUM_EPOCHS
        batch = args.LIKE_BATCH_SIZE

    for epoch in range(epochs):
        train_iter = gdata.DataLoader(gdata.ArrayDataset(
            feature_values, label),
                                      batch_size=batch,
                                      shuffle=True)  # pd.read_csv with the name column removed
        time_start = time.time()
        train_epoch_loss, train_acc = model.train_epoch(
            epoch, train_iter, model_trainer)

        test_iter = gdata.DataLoader(gdata.ArrayDataset(
            validation_feature, validation_label),
                                     batch_size=batch,
                                     shuffle=False)
        epoch_loss, epoch_test_acc = model.eval_model(test_iter)
        # epoch_test_acc = evaluate_accuracy(test_iter,model)
        # epoch_loss = 0.0
        train_num = len(feature_values)
        # print(train_num)
        print(
            '\n[%s] net_name:[%s], EPOCH FINISH [%d], time_cost [%d]s, average_loss:[%f], eval_model_loss:[%f], train_acc:[%f], test_acc:[%f]'
            % (
                time.strftime("%Y-%m-%d %H:%M:%S"), model.task, epoch + 1,
                int(time.time() - time_start),  # np.int was removed in NumPy 1.24
                train_epoch_loss / train_num,
                epoch_loss / len(validation_feature), train_acc, epoch_test_acc
            ))
        if epoch % 1 == 0:  # always true: a checkpoint is saved every epoch
            filename = args.SAVE_PARAMS_PATH_PREFIX + '/net_' + args.MODEL + '_' + model.task + '_' + args.CONFIG_NAME + '_' + time.strftime(
                "%Y%m%d_%H%M%S") + '.model'
            model.save_params(filename)
    return model
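
train() pulls all of its configuration off the args object; a minimal sketch of just the attributes the function actually reads, with placeholder values (the real config presumably comes from argparse or a config file):

from types import SimpleNamespace

args = SimpleNamespace(
    MODEL='deepfm',               # 'deepfm' | 'xdeepfm' | anything else -> MergeModel
    TASK='finish',                # 'finish' or 'like'
    FINISH_MODEL_PATH=None,       # optional checkpoint to warm-start from
    LIKE_MODEL_PATH=None,
    OPTIMIZER='adam',             # weight decay is only passed for adam
    WEIGHT_DECAY=1e-5,            # placeholder value
    FINISH_LEARNING_RATE=1e-3,    # placeholder value
    LIKE_LEARNING_RATE=1e-3,
    FINISH_NUM_EPOCHS=5,
    LIKE_NUM_EPOCHS=5,
    FINISH_BATCH_SIZE=256,
    LIKE_BATCH_SIZE=256,
    SAVE_PARAMS_PATH_PREFIX='./checkpoints',
    CONFIG_NAME='default',
)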
Example #4
import math
import numpy as np
import pandas as pd
import sklearn.metrics
from sklearn import preprocessing

# path_test, path_result, training_info, select_features_by_corr and DeepFM
# are module-level names defined elsewhere in this project.
def DeepFM_sub(train, test):
    def evaluate_by_batch():
        pred = np.ndarray([n_instances_test])
        n_iter_pred = int(math.ceil(n_instances_test / batch_size))
        print('evaluating')
        for i_eval in range(n_iter_pred):
            print("{}/{}".format(i_eval, n_iter_pred))
            idx_start, idx_end = i_eval * batch_size, min(
                (i_eval + 1) * batch_size, n_instances_test)
            pred[idx_start:idx_end] = classifier.predict_proba(
                Xi_test[idx_start:idx_end], Xv_test[idx_start:idx_end])
        return pred

    def save_checkpoint(pred, postfix):
        test['predicted_score'] = pred
        sub1 = test[['instance_id', 'predicted_score']]
        sub = pd.read_csv(path_test, sep="\s+")
        sub = pd.merge(sub, sub1, on=['instance_id'], how='left')
        sub = sub.fillna(0)
        path_save_result = path_result + 'result_' + training_info + '_' + postfix + '.txt'
        sub[['instance_id', 'predicted_score']].to_csv(path_save_result,
                                                       sep=" ",
                                                       index=False)
        print(path_save_result + ' saved')

    '''
    col = [c for c in train if
           c not in ['is_trade', 'item_category_list', 'item_property_list', 'predict_category_property', 'instance_id',
                     'context_id', 'realtime', 'context_timestamp',
                     'user_cnt1', 'item_cnt1', 'shop_cnt1', 'user_cntx',	'item_cntx', 'shop_cntx',
                     ]]
    col_cate = [c for c in col if issubclass(np.dtype('intp').type, type(train[c][0]))]
    col_real = [c for c in col if not issubclass(np.dtype('intp').type, type(train[c][0]))]
    '''
    batch_size = 16
    col_cate, col_real = select_features_by_corr(train)

    #n_fields = len(col)
    n_fields_cate = len(col_cate)
    n_fields_real = len(col_real)
    n_fields = n_fields_cate + n_fields_real
    #assert n_fields == n_fields_cate + n_fields_real

    n_instances_train = len(train)
    n_instances_test = len(test)

    enc_lbl = preprocessing.LabelEncoder()

    feature_sizes_cate = np.ndarray((n_fields_cate, ), dtype=np.dtype('int'))
    feature_sizes_real = np.ndarray((n_fields_real, ), dtype=np.dtype('int'))

    Xi_train_cate = np.ndarray((n_fields_cate, n_instances_train),
                               dtype=np.dtype('int'))
    Xv_train_cate = np.ndarray((n_fields_cate, n_instances_train),
                               dtype=np.dtype('int'))
    Xi_train_real = np.ndarray((n_fields_real, n_instances_train))
    Xv_train_real = np.ndarray((n_fields_real, n_instances_train))

    y_train = train['is_trade'].values
    dist = np.bincount(y_train.astype(np.dtype('int64'))).tolist()
    assert y_train.size == dist[0] + dist[1]
    print('label distribution')
    print('label 0 = ' + str(dist[0]))
    print('label 1 = ' + str(dist[1]))
    # inverse-frequency class weights: each class is weighted by the other's share
    class_weight = [
        dist[1] / (dist[0] + dist[1]),  # weight for class 0
        dist[0] / (dist[0] + dist[1]),  # weight for class 1
    ]
    print(class_weight)
    print(type(class_weight))

    Xi_test_cate = np.ndarray((n_fields_cate, n_instances_test),
                              dtype=np.dtype('int'))
    Xv_test_cate = np.ndarray((n_fields_cate, n_instances_test),
                              dtype=np.dtype('int'))
    Xi_test_real = np.ndarray((n_fields_real, n_instances_test))
    Xv_test_real = np.ndarray((n_fields_real, n_instances_test))

    # category features
    for i, c in enumerate(col_cate):
        enc_lbl.fit(pd.concat([train[c], test[c]]))
        n_features = len(enc_lbl.classes_)
        feature_sizes_cate[i] = n_features

        train[c] = enc_lbl.transform(train[c])
        # transform (not fit_transform): keep test ids consistent with the
        # encoder fitted on the concatenated train+test column above
        test[c] = enc_lbl.transform(test[c])

        Xi_train_cate[i] = train[c].astype(int)
        Xv_train_cate[i] = np.ones((n_instances_train, ))
        Xi_test_cate[i] = test[c].astype(int)
        Xv_test_cate[i] = np.ones((n_instances_test, ))

    # real number features
    for i, c in enumerate(col_real):
        n_features = 1
        feature_sizes_real[i] = n_features

        #train[c] = enc_lbl.transform(train[c])
        #test[c] = enc_lbl.fit_transform(test[c])

        Xi_train_real[i] = np.zeros((n_instances_train), dtype=np.dtype('int'))
        Xv_train_real[i] = train[c]
        '''
        print('===================================real value feature : ' + c + '===================================')
        print(Xv_train_real.shape)
        print(Xv_train_real[i][:10])
        '''
        Xi_test_real[i] = np.zeros((n_instances_test), dtype=np.dtype('int'))
        Xv_test_real[i] = test[c]

    feature_sizes = np.concatenate(
        (feature_sizes_cate, feature_sizes_real)).tolist()
    Xi_train = np.concatenate((Xi_train_cate, Xi_train_real))
    Xv_train = np.concatenate((Xv_train_cate, Xv_train_real))
    Xi_test = np.concatenate((Xi_test_cate, Xi_test_real))
    Xv_test = np.concatenate((Xv_test_cate, Xv_test_real))

    Xi_train = Xi_train.swapaxes(0, 1)
    Xv_train = Xv_train.swapaxes(0, 1)
    Xi_test = Xi_test.swapaxes(0, 1)
    Xv_test = Xv_test.swapaxes(0, 1)

    Xi_valid = None
    Xv_valid = None
    y_valid = None

    print('n_fields : ' + str(n_fields))
    print('len(feature_sizes) : ' + str(len(feature_sizes)))
    print('max(feature_sizes) : ' + str(max(feature_sizes)))
    print('create ' + training_info + ' classifier')
    classifier = DeepFM(
        n_fields,
        feature_sizes,
        embedding_size=20,
        h_depth=2,
        deep_layers=[32, 32],
        is_deep_dropout=True,
        dropout_deep=[0.0, 0.2, 0.2],
        is_batch_norm=True,
        verbose=True,
        weight_decay=0.002,
        n_epochs=1,
        batch_size=batch_size,
        eval_metric=sklearn.metrics.average_precision_score,
        use_fm=True,
        use_ffm=False,
        use_cuda=False,
        class_weight=class_weight,
    )
    """
    print("Xi_train.shape : "+str(Xi_train.shape))
    print("Xv_train.shape : "+str(Xv_train.shape))
    print("y_train.shape : "+str(y_train.shape))
    """
    print(training_info + ' classifier fitting')

    #pred = evaluate_by_batch()
    #save_checkpoint(pred, 'pre')
    for i in range(64):
        # 'ealry_stopping' follows the (misspelled) keyword in this DeepFM
        # implementation's fit() signature
        classifier.fit(Xi_train, Xv_train, y_train, ealry_stopping=True)
        pred = evaluate_by_batch()
        save_checkpoint(pred, '{:02d}'.format(i))
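
The Xi/Xv layout built above gives every field an (index, value) pair: a categorical field carries its label-encoded id with value 1, while a real-valued field carries index 0 with the raw number as its value. A tiny worked example for one row with two categorical fields and one real field (names are illustrative):

# fields = [city, gender, price]; feature_sizes = [n_cities, 2, 1]
xi = np.array([[7, 1, 0]])        # label-encoded ids; the real field always uses index 0
xv = np.array([[1.0, 1.0, 3.5]])  # categorical values are 1; the real field keeps its raw value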
Example #5
# continuation of a larger script: loader_train, loader_val, sparse_features,
# train_raw and test_data are defined earlier
# test set
test_data = CriteoDataset(test_data, train=True)
loader_test = DataLoader(test_data)

# load the size of each feature
feature_sizes = np.loadtxt('./data/feature_sizes.txt', delimiter=',')
feature_sizes = [int(x) for x in feature_sizes]
print(feature_sizes)

# numbers of dense and sparse features
n_sparse = len(sparse_features)
n_dense = len(train_raw.columns.tolist()) - n_sparse - 1  # the -1 presumably excludes the label column

start = time.process_time()

# train the model
model = DeepFM(feature_sizes, n_dense, n_sparse, use_cuda=False)
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=0.0)
model.fit(loader_train, loader_val, optimizer, epochs=50, verbose=True)

# model prediction
y_pred = model.predict(loader_test)
print(y_pred)
results = result_process(np.array(y_pred))
print(results)
results.to_csv('../../code result/label/label0803.csv', index=False)

end = time.process_time()
print('Running time: %d seconds' % (end - start))
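
Note that time.process_time() counts CPU time only, so time spent waiting on I/O or a GPU is excluded; if wall-clock duration is wanted instead, time.perf_counter() is the usual choice:

start = time.perf_counter()
# ... fit and predict as above ...
print('Running time: %d seconds' % (time.perf_counter() - start))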