def train_s2s_ch9(model, data_iter, lr, num_epochs, ctx):
    model.initialize(init.Xavier(), force_reinit=True, ctx=ctx)
    trainer = gluon.Trainer(model.collect_params(), 'adam',
                            {'learning_rate': lr})
    loss = MaskedSoftmaxCELoss()
    animator = d2l.Animator(xlabel='epoch', ylabel='loss',
                            xlim=[1, num_epochs], ylim=[0, 0.25])
    animator.fig.subplots_adjust(hspace=0.3)  # FIXME - TEST
    for epoch in range(1, num_epochs + 1):
        timer = d2l.Timer()
        metric = d2l.Accumulator(2)  # loss_sum, num_tokens
        for batch in data_iter:
            X, X_vlen, Y, Y_vlen = [x.as_in_context(ctx) for x in batch]
            Y_input, Y_label, Y_vlen = Y[:, :-1], Y[:, 1:], Y_vlen - 1
            with autograd.record():
                Y_hat, _ = model(X, Y_input, X_vlen, Y_vlen)
                l = loss(Y_hat, Y_label, Y_vlen)
            l.backward()
            d2l.grad_clipping(model, 1)
            num_tokens = Y_vlen.sum()
            trainer.step(num_tokens)
            metric.add(l.sum(), num_tokens)
        if epoch % 10 == 0:
            animator.add(epoch, (metric[0] / metric[1],))
    print('loss %.3f, %d tokens/sec on %s' % (
        metric[0] / metric[1], metric[1] / timer.stop(), ctx))
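
# A minimal usage sketch for train_s2s_ch9, assuming a d2l-style (MXNet)
# encoder-decoder model and an NMT data iterator that yields
# (X, X_vlen, Y, Y_vlen) batches. The names `encoder`, `decoder`, and
# `train_iter` are placeholders supplied by the caller, and the
# hyperparameters are illustrative only.
# model = d2l.EncoderDecoder(encoder, decoder)
# train_s2s_ch9(model, train_iter, lr=0.005, num_epochs=300, ctx=d2l.try_gpu())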
def train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs,
               ctx_list=d2l.try_all_gpus(), split_f=d2l.split_batch):
    num_batches, timer = len(train_iter), d2l.Timer()
    animator = d2l.Animator(xlabel='epoch', xlim=[0, num_epochs], ylim=[0, 1],
                            legend=['train loss', 'train acc', 'test acc'])
    for epoch in range(num_epochs):
        # Store training_loss, training_accuracy, num_examples, num_features
        metric = d2l.Accumulator(4)
        for i, (features, labels) in enumerate(train_iter):
            timer.start()
            l, acc = train_batch_ch13(net, features, labels, loss, trainer,
                                      ctx_list, split_f)
            metric.add(l, acc, labels.shape[0], labels.size)
            timer.stop()
            if (i + 1) % (num_batches // 5) == 0:
                animator.add(epoch + i / num_batches,
                             (metric[0] / metric[2], metric[1] / metric[3],
                              None))
        test_acc = d2l.evaluate_accuracy_gpus(net, test_iter, split_f)
        animator.add(epoch + 1, (None, None, test_acc))
    print('loss %.3f, train acc %.3f, test acc %.3f' % (
        metric[0] / metric[2], metric[1] / metric[3], test_acc))
    print('%.1f examples/sec on %s' % (
        metric[2] * num_epochs / timer.sum(), ctx_list))
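
# Hedged usage sketch for train_ch13: `net`, `train_iter`, and `test_iter`
# are placeholders provided by the caller, and the hyperparameters are
# illustrative only.
# loss = gluon.loss.SoftmaxCrossEntropyLoss()
# trainer = gluon.Trainer(net.collect_params(), 'adam',
#                         {'learning_rate': 0.001})
# train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs=10)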
def train_recsys_rating(net, train_iter, test_iter, loss, trainer, num_epochs,
                        ctx_list=d2l.try_all_gpus(), evaluator=None,
                        **kwargs):
    timer = d2l.Timer()
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 2],
                            legend=['train loss', 'test RMSE'])
    for epoch in range(num_epochs):
        metric, l = d2l.Accumulator(3), 0.
        for i, values in enumerate(train_iter):
            timer.start()
            input_data = []
            values = values if isinstance(values, list) else [values]
            for v in values:
                input_data.append(gluon.utils.split_and_load(v, ctx_list))
            train_feat = input_data[0:-1] if len(values) > 1 else input_data
            train_label = input_data[-1]
            with autograd.record():
                preds = [net(*t) for t in zip(*train_feat)]
                ls = [loss(p, s) for p, s in zip(preds, train_label)]
            [l.backward() for l in ls]
            l += sum([l.asnumpy() for l in ls]).mean() / len(ctx_list)
            trainer.step(values[0].shape[0])
            metric.add(l, values[0].shape[0], values[0].size)
            timer.stop()
        if len(kwargs) > 0:  # it will be used in section AutoRec
            test_rmse = evaluator(net, test_iter, kwargs['inter_mat'],
                                  ctx_list)
        else:
            test_rmse = evaluator(net, test_iter, ctx_list)
        train_l = l / (i + 1)
        animator.add(epoch + 1, (train_l, test_rmse))
    print('train loss %.3f, test RMSE %.3f' % (
        metric[0] / metric[1], test_rmse))
    print('%.1f examples/sec on %s' % (
        metric[2] * num_epochs / timer.sum(), ctx_list))
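
# Hedged usage sketch for train_recsys_rating (e.g. matrix factorization on a
# rating dataset): `net`, the data iterators, and `evaluator` are placeholders
# provided by the caller, and the optimizer settings are illustrative only.
# loss = gluon.loss.L2Loss()
# trainer = gluon.Trainer(net.collect_params(), 'adam',
#                         {'learning_rate': 0.002, 'wd': 1e-5})
# train_recsys_rating(net, train_iter, test_iter, loss, trainer,
#                     num_epochs=20, evaluator=evaluator)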
def train_ranking(net, train_iter, test_iter, loss, trainer, test_seq_iter,
                  num_users, num_items, num_epochs, ctx_list, evaluator,
                  candidates, eval_step=1):
    timer, hit_rate, auc = d2l.Timer(), 0, 0
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 1],
                            legend=['test hit rate', 'test AUC'])
    for epoch in range(num_epochs):
        metric, l = d2l.Accumulator(3), 0.
        for i, values in enumerate(train_iter):
            input_data = []
            for v in values:
                input_data.append(gluon.utils.split_and_load(v, ctx_list))
            with autograd.record():
                p_pos = [net(*t) for t in zip(*input_data[0:-1])]
                p_neg = [net(*t) for t in zip(*input_data[0:-2],
                                              input_data[-1])]
                ls = [loss(p, n) for p, n in zip(p_pos, p_neg)]
            [l.backward(retain_graph=False) for l in ls]
            l += sum([l.asnumpy() for l in ls]).mean() / len(ctx_list)
            trainer.step(values[0].shape[0])
            metric.add(l, values[0].shape[0], values[0].size)
            timer.stop()
        with autograd.predict_mode():
            if (epoch + 1) % eval_step == 0:
                hit_rate, auc = evaluator(net, test_iter, test_seq_iter,
                                          candidates, num_users, num_items,
                                          ctx_list)
                animator.add(epoch + 1, (hit_rate, auc))
    print('train loss %.3f, test hit rate %.3f, test AUC %.3f' % (
        metric[0] / metric[1], hit_rate, auc))
    print('%.1f examples/sec on %s' % (
        metric[2] * num_epochs / timer.sum(), ctx_list))
# (Truncated in the source: the options below close a gluon.Trainer(...)
# call whose opening is missing.)
    'learning_rate': lr, 'wd': wd})
num_batches, timer = len(DataLoader_Single_train), d2l.Timer()
animator = d2l.Animator(
    xlabel='epoch', xlim=[0, num_epochs], ylim=[0, 1],
    legend=['train loss', 'train acc', 'val loss', 'val acc'])
ctx = d2l.try_all_gpus()
for epoch in range(num_epochs):
    # Training loop
    # Store training_loss, training_accuracy, num_examples, num_features
    metric = d2l.Accumulator(4)
    for i, (Xs_in, ys_in) in enumerate(DataLoader_Single_train):
        print("Training iteration: " + str(i))
        timer.start()
        Xs = gluon.utils.split_and_load(Xs_in.astype("float32"), ctx)
        ys = gluon.utils.split_and_load(ys_in.astype("float32"), ctx)
        with autograd.record():
            # Inputs arrive channels-last (NHWC); transpose to NCHW for the net
            pys = [net(X.transpose(axes=(0, 3, 1, 2))) for X in Xs]
            ls = [loss(py, y) for py, y in zip(pys, ys)]
        for l in ls:
            l.backward()
        trainer.step(ys_in.shape[0])
        train_loss_sum = sum([float(l.sum().asnumpy()[0]) for l in ls])
        train_acc_sum = sum(d2l.accuracy(py.asnumpy(), y.asnumpy())
                            for py, y in zip(pys, ys))
        l, acc = train_loss_sum, train_acc_sum
def train_model(args, reporter=None):
    """Training process."""
    logger.enable_log_file('valid.log')
    ctx = d2l.try_all_gpus()

    log.info('Loading data from %s...', args.data_file)
    (train_iter, pos_weight, validate_feats, validate_valids, test_feats,
     test_valids) = load_data(args.data_file, args.batch_size)
    validate_feats = np.array(validate_feats, ctx=ctx[0])
    validate_valids = np.array(validate_valids, ctx=ctx[0])
    test_feats = np.array(test_feats, ctx=ctx[0])
    test_valids = np.array(test_valids, ctx=ctx[0])

    # Initialize loss function.
    log.info('Positive weight for CELoss: %.2f', pos_weight)
    valid_loss = gluon.loss.SoftmaxCrossEntropyLoss()

    fnet = [
        'Valid Hiddens: {}'.format(args.cls_hiddens),
        'Dropout: {}'.format(args.dropout),
    ]
    log.info('Network\n\t%s', '\n\t'.join(fnet))
    fparams = [
        'Batch Size: {}'.format(args.batch_size),
        'Epochs: {}'.format(args.epochs),
        'Learning Rate: {}'.format(args.lr),
        'Weight Decay: {}'.format(args.wd)
    ]
    log.info('Hyper-Parameters:\n\t%s', '\n\t'.join(fparams))

    net = ValidNet(args.cls_hiddens, args.dropout)
    net.hybridize()
    net.initialize(init.Xavier(), ctx=ctx)
    log.info('Model initialized on %s', str(ctx))

    # Initialize trainer.
    trainer = gluon.Trainer(net.collect_params(), 'adam', {
        'learning_rate': args.lr,
        'wd': args.wd
    })

    log.info('Training...')
    metric = d2l.Accumulator(2)
    for epoch in range(args.epochs):
        # if epoch % 10 == 0:
        #     args.lr *= 0.1
        #     trainer.set_learning_rate(args.lr)
        #     log.info('Reset learning rate to %e', args.lr)
        if reporter is None:
            log.info('Epoch %d', epoch)
            progress = tqdm.tqdm(
                train_iter,
                bar_format='{desc} {percentage:3.0f}%|{bar:50}{r_bar}')
        else:
            progress = train_iter
        for iter_idx, (batch_feat, batch_valids) in enumerate(progress):
            np_feat = split_and_load(batch_feat, ctx, even_split=False)[0]
            np_valid = split_and_load(batch_valids, ctx, even_split=False)[0]
            with autograd.record():
                ls = get_batch_loss(net, valid_loss, np_feat, np_valid)
            ls.backward()
            l_mean = float(ls)
            trainer.step(1)
            metric.add(l_mean, 1)
            if reporter is None and iter_idx % 30 == 0:
                progress.set_description_str(
                    desc='Loss {:.3f}'.format(l_mean), refresh=True)
        # val_acc = test_acc(net, validate_feats, validate_valids,
        #                    print_log=False)
        # if reporter is None:
        #     log.info('Epoch %d: Loss %.3f, Valid Error %.2f%%', epoch,
        #              metric[0] / metric[1], val_acc)
        # else:
        #     reporter(epoch=epoch, accuracy=val_acc)
    # if reporter is None:
    #     log.info('Final loss %.3f', metric[0] / metric[1])

    log.info('Testing...')
    test_acc(net, test_feats, test_valids)
    return net
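
# Illustrative call sketch for train_model: the function only reads the
# fields used above from `args`, so an argparse.Namespace with placeholder
# values (shown below) is sufficient.
# import argparse
# args = argparse.Namespace(data_file='data.csv', batch_size=128, epochs=40,
#                           lr=1e-3, wd=1e-4, cls_hiddens=512, dropout=0.1)
# net = train_model(args)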
def train_bert(args, reporter=None):
    """Training process."""
    logger.enable_log_file('train-{}'.format(
        os.path.basename(args.data_file).replace('.csv', '.log')))
    ctx = d2l.try_all_gpus()

    log.info('Loading data from %s...', args.data_file)
    (train_iter, pos_weight, validate_feats, validate_thrpts, test_feats,
     test_thrpts, train_thrpt_avg, train_thrpt_std) = load_data(
         args.data_file, args.batch_size, args.center_hiddens)
    validate_feats = np.array(validate_feats, ctx=ctx[0])
    validate_thrpts = np.array(validate_thrpts, ctx=ctx[0])
    test_feats = np.array(test_feats, ctx=ctx[0])
    test_thrpts = np.array(test_thrpts, ctx=ctx[0])
    # log.info('Positive weight for CELoss: %.2f', pos_weight)

    fnet = [
        'Center Heads: {}'.format(args.center_heads),
        'Center FFN Hiddens: {}'.format(args.center_ffn_hiddens),
        'Center Layers: {}'.format(args.center_layers),
        'Reg Encoder FFN Hiddens: {}'.format(args.reg_encode_ffn_hiddens),
        'Reg Encoder Layers: {}'.format(args.reg_encode_layers),
        'Regression Hiddens: {}'.format(args.reg_hiddens),
        'Valid Hiddens: {}'.format(args.cls_hiddens),
    ]
    log.info('Network\n\t%s', '\n\t'.join(fnet))
    fparams = [
        'Batch Size: {}'.format(args.batch_size),
        'Dropout: {}'.format(args.dropout),
        'Epochs: {}'.format(args.epochs),
        'Learning Rate: {}'.format(args.lr),
        'Weight Decay: {}'.format(args.wd)
    ]
    log.info('Hyper-Parameters:\n\t%s', '\n\t'.join(fparams))

    net = BERTModel(args.center_hiddens, args.center_ffn_hiddens,
                    args.center_heads, args.center_layers,
                    args.reg_encode_ffn_hiddens, args.reg_encode_layers,
                    args.reg_hiddens, args.cls_hiddens, args.dropout)
    net.initialize(init.Xavier(), ctx=ctx)
    log.info('Model initialized on %s', str(ctx))

    # Initialize trainer.
    trainer = gluon.Trainer(net.collect_params(), 'adam', {
        'learning_rate': args.lr,
        'wd': args.wd
    })

    # Initialize loss functions.
    valid_loss = gluon.loss.SoftmaxCELoss()  # weight=pos_weight,
    thrpt_loss = gluon.loss.L2Loss()

    log.info('Training...')
    metric = d2l.Accumulator(2)
    for epoch in range(args.epochs):
        # if epoch % 10 == 0:
        #     args.lr *= 0.1
        #     trainer.set_learning_rate(args.lr)
        #     log.info('Reset learning rate to %e', args.lr)
        if reporter is None:
            log.info('Epoch %d', epoch)
            progress = tqdm.tqdm(
                train_iter,
                bar_format='{desc} {percentage:3.0f}%|{bar:50}{r_bar}')
        else:
            progress = train_iter
        for iter_idx, (batch_feat, batch_thrpt,
                       batch_label) in enumerate(progress):
            np_feat = split_and_load(batch_feat, ctx, even_split=False)[0]
            np_thrpt = split_and_load(batch_thrpt, ctx, even_split=False)[0]
            np_label = split_and_load(batch_label, ctx, even_split=False)[0]
            with autograd.record():
                ls = get_batch_loss(net, valid_loss, thrpt_loss, np_feat,
                                    np_thrpt, np_label)
            ls.backward()
            trainer.step(1)
            l_mean = float(ls)
            metric.add(l_mean, 1)
            if reporter is None and iter_idx % 30 == 0:
                progress.set_description_str(
                    desc='Loss {:.3f}'.format(l_mean), refresh=True)
        val_acc = test_acc(net, validate_feats, validate_thrpts,
                           train_thrpt_avg, train_thrpt_std, print_log=False)
        if reporter is None:
            log.info(
                'Epoch %d: Loss %.3f, Valid Error %.2f%%, '
                'Thrpt Error %.3f (std %.3f)', epoch, metric[0] / metric[1],
                val_acc[0], val_acc[1][0], val_acc[1][1])
        else:
            # FIXME: Not working now
            reporter(epoch=epoch, accuracy=val_acc)
    if reporter is None:
        log.info('Final loss %.3f', metric[0] / metric[1])

    log.info('Testing...')
    test_acc(net, test_feats, test_thrpts, train_thrpt_avg, train_thrpt_std)
    return net
def train_bert(args, reporter=None):
    """Training process."""
    logger.enable_log_file('train-{}'.format(
        os.path.basename(args.data_file).replace('.csv', '.log')))
    ctx = d2l.try_all_gpus()

    log.info('Loading data from %s...', args.data_file)
    (train_iter, validate_feats, validate_thrpts, test_feats, test_thrpts,
     _) = load_data(args.data_file, args.batch_size, args.num_cls)
    validate_feats = np.array(validate_feats, ctx=ctx[0])
    validate_thrpts = np.array(validate_thrpts, ctx=ctx[0])
    test_feats = np.array(test_feats, ctx=ctx[0])
    test_thrpts = np.array(test_thrpts, ctx=ctx[0])

    fnet = [
        'Regression Hiddens: {}'.format(args.reg_hiddens),
        'Output Classes: {}'.format(args.num_cls)
    ]
    log.info('Network\n\t%s', '\n\t'.join(fnet))
    fparams = [
        'Batch Size: {}'.format(args.batch_size),
        'Dropout: {}'.format(args.dropout),
        'Epochs: {}'.format(args.epochs),
        'Learning Rate: {}'.format(args.lr),
        'Weight Decay: {}'.format(args.wd)
    ]
    log.info('Hyper-Parameters:\n\t%s', '\n\t'.join(fparams))

    net = ThrptPred(args.reg_hiddens, args.num_cls, args.dropout)
    net.initialize(init.Xavier(), ctx=ctx)
    log.info('Model initialized on %s', str(ctx))

    # Initialize trainer.
    trainer = gluon.Trainer(net.collect_params(), 'adam', {
        'learning_rate': args.lr,
        'wd': args.wd
    })

    # Initialize loss functions.
    thrpt_loss = gluon.loss.SoftmaxCrossEntropyLoss()
    sample_weight = np.array(list(range(1, 2 * args.num_cls, 2)), ctx=ctx[0])

    log.info('Training...')
    metric = d2l.Accumulator(2)
    for epoch in range(args.epochs):
        # if epoch % 10 == 0:
        #     args.lr *= 0.1
        #     trainer.set_learning_rate(args.lr)
        #     log.info('Reset learning rate to %e', args.lr)
        if reporter is None:
            log.info('Epoch %d', epoch)
            progress = tqdm.tqdm(
                train_iter,
                bar_format='{desc} {percentage:3.0f}%|{bar:50}{r_bar}')
        else:
            progress = train_iter
        for iter_idx, (batch_feat, batch_thrpt) in enumerate(progress):
            np_feat = split_and_load(batch_feat, ctx, even_split=False)[0]
            np_thrpt = split_and_load(batch_thrpt, ctx, even_split=False)[0]
            with autograd.record():
                ls = get_batch_loss(net, thrpt_loss, np_feat, np_thrpt,
                                    sample_weight)
            ls.backward()
            trainer.step(1)
            l_mean = float(ls)
            metric.add(l_mean, 1)
            if reporter is None and iter_idx % 30 == 0:
                progress.set_description_str(
                    desc='Loss {:.3f}'.format(l_mean), refresh=True)
        val_acc = test_acc(net, validate_feats, validate_thrpts,
                           print_log=False)
        if reporter is None:
            log.info('Epoch %d: Loss %.3f, Accuracy %.2f%%', epoch,
                     metric[0] / metric[1], val_acc)
        else:
            reporter(epoch=epoch, accuracy=val_acc)
    if reporter is None:
        log.info('Final loss %.3f', metric[0] / metric[1])

    log.info('Testing...')
    test_acc(net, test_feats, test_thrpts)
    return net