def train():
    best_val = None
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size,
                                   ctx=context)
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1, args.bptt)):
            data, target = get_batch(train_data, i)
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                L = loss(output, target)
                L.backward()

            grads = [p.grad(context) for p in model.collect_params().values()]
            # Here the gradient is not divided by batch_size yet,
            # so we multiply max_norm by batch_size to balance it.
            gluon.utils.clip_global_norm(grads, args.clip * args.batch_size)

            trainer.step(args.batch_size)
            total_L += mx.nd.sum(L).asscalar()

            if ibatch % args.log_interval == 0 and ibatch > 0:
                cur_L = total_L / args.batch_size / args.bptt / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f' % (
                    epoch, ibatch, cur_L, math.exp(cur_L)))
                total_L = 0.0

        val_L = eval(val_data)
        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f' % (
            epoch, time.time() - start_time, val_L, math.exp(val_L)))
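# NOTE: the snippets in this section call detach() and get_batch() without
# defining them. Below is a minimal sketch of those helpers, assuming the usual
# (num_steps, batch_size) token layout of the MXNet word_language_model
# example; the exact signatures and shapes here are assumptions, not taken
# from this file.
def detach(hidden):
    # Recursively detach the hidden state(s) from the computation graph so
    # that truncated backprop-through-time stops at the batch boundary.
    if isinstance(hidden, (tuple, list)):
        return [detach(h) for h in hidden]
    return hidden.detach()

def get_batch(source, i):
    # Slice out a bptt-long chunk of inputs and the targets shifted by one step.
    seq_len = min(args.bptt, source.shape[0] - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len]
    return data, target.reshape((-1,))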
def train():
    total_loss = 0.0
    start_time = time.time()
    hidden = model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size,
                               ctx=context)
    for batch, i in enumerate(range(0, train_data.shape[0] - 1, args.bptt)):
        data, target = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was
        # previously produced. If we didn't, the model would try backpropagating
        # all the way to the start of the dataset.
        hidden = detach(hidden)
        with autograd.record():
            output, hidden = model(data, hidden)
            loss = criterion(output, target)
            loss.backward()

        grads = [p.grad(context) for p in model.collect_params().values()]
        # Gradient clipping helps prevent the exploding gradient problem in RNNs / LSTMs.
        gluon.utils.clip_global_norm(grads, args.clip * args.bptt * args.batch_size)

        trainer.step(args.bptt * args.batch_size)
        total_loss += mx.nd.sum(loss).asscalar() / loss.shape[0]

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} '
                  '| ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch + 1, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval,
                      cur_loss, math.exp(cur_loss)))
            total_loss = 0.0
            start_time = time.time()
def train():
    best_val = float("Inf")
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size,
                                   ctx=context)
        # add time checkpoint for logging
        before_time = start_time

        for i, (data, target) in enumerate(train_data):
            data = data.as_in_context(context).T
            target = target.as_in_context(context).T.reshape((-1, 1))
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                # Here L is a vector of size batch_size * bptt
                L = loss(output, target)
                L = L / (args.bptt * args.batch_size)
                L.backward()

            grads = [p.grad(context) for p in model.collect_params().values()]
            gluon.utils.clip_global_norm(grads, args.clip)

            trainer.step(1)
            total_L += mx.nd.sum(L).asscalar()

            if i % args.log_interval == 0 and i > 0:
                # log-interval latency print
                batch_latency = time.time() - before_time
                before_time = time.time()
                cur_L = total_L / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f, batch_latency %.4f' % (
                    epoch, i, cur_L, math.exp(cur_L), batch_latency))
                total_L = 0.0

        if args.export_model:
            model.export('model')
            return

        val_L = eval(val_data)
        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f' % (
            epoch, time.time() - start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.save_parameters(args.save)
            print('test loss %.2f, test ppl %.2f' % (test_L, math.exp(test_L)))
        else:
            args.lr = args.lr * 0.25
            trainer.set_learning_rate(args.lr)
def eval(data_source):
    total_L = 0.0
    ntotal = 0
    hidden = model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size,
                               ctx=context)
    for i in range(0, data_source.shape[0] - 1, args.bptt):
        data, target = get_batch(data_source, i)
        output, hidden = model(data, hidden)
        L = loss(output, target)
        total_L += mx.nd.sum(L).asscalar()
        ntotal += L.size
    return total_L / ntotal
def eval(data_source):
    total_L = 0.0
    ntotal = 0
    hidden = model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size,
                               ctx=context)
    for i, (data, target) in enumerate(data_source):
        data = data.as_in_context(context).T
        target = target.as_in_context(context).T.reshape((-1, 1))
        output, hidden = model(data, hidden)
        L = loss(output, target)
        total_L += mx.nd.sum(L).asscalar()
        ntotal += L.size
    return total_L / ntotal
def train():
    best_val = float("Inf")
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size,
                                   ctx=context)
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1, args.bptt)):
            data, target = get_batch(train_data, i)
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                L = loss(output, target)
                L.backward()

            grads = [p.grad(context) for p in model.collect_params().values()]
            # Here the gradient is for the whole batch,
            # so we multiply max_norm by batch_size and bptt to balance it.
            gluon.utils.clip_global_norm(grads, args.clip * args.bptt * args.batch_size)

            trainer.step(args.batch_size)
            total_L += mx.nd.sum(L).asscalar()

            if ibatch % args.log_interval == 0 and ibatch > 0:
                cur_L = total_L / args.bptt / args.batch_size / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f' % (
                    epoch, ibatch, cur_L, math.exp(cur_L)))
                total_L = 0.0

        val_L = eval(val_data)
        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f' % (
            epoch, time.time() - start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.collect_params().save(args.save)
            print('test loss %.2f, test ppl %.2f' % (test_L, math.exp(test_L)))
        else:
            args.lr = args.lr * 0.25
            trainer._init_optimizer('sgd', {'learning_rate': args.lr,
                                            'momentum': 0,
                                            'wd': 0})
            model.collect_params().load(args.save, context)
def train():
    best_val = float("Inf")
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size,
                                   ctx=context)
        for i, (data, target) in enumerate(train_data):
            data = data.as_in_context(context).T
            target = target.as_in_context(context).T.reshape((-1, 1))
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                L = loss(output, target)
                L.backward()

            grads = [p.grad(context) for p in model.collect_params().values()]
            # Here the gradient is for the whole batch,
            # so we multiply max_norm by batch_size and bptt to balance it.
            gluon.utils.clip_global_norm(grads, args.clip * args.bptt * args.batch_size)

            trainer.step(args.batch_size)
            total_L += mx.nd.sum(L).asscalar()

            if i % args.log_interval == 0 and i > 0:
                cur_L = total_L / args.bptt / args.batch_size / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f' % (
                    epoch, i, cur_L, math.exp(cur_L)))
                total_L = 0.0

        val_L = eval(val_data)
        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f' % (
            epoch, time.time() - start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.collect_params().save(args.save)
            print('test loss %.2f, test ppl %.2f' % (test_L, math.exp(test_L)))
        else:
            args.lr = args.lr * 0.25
            trainer._init_optimizer('sgd', {'learning_rate': args.lr,
                                            'momentum': 0,
                                            'wd': 0})
            model.collect_params().load(args.save, context)
def train():
    best_val = float("Inf")
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size,
                                   ctx=context)
        for i, (data, target) in enumerate(train_data):
            data = data.as_in_context(context).T
            target = target.as_in_context(context).T.reshape((-1, 1))
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                # Here L is a vector of size batch_size * bptt
                L = loss(output, target)
                L = L / (args.bptt * args.batch_size)
                L.backward()

            grads = [p.grad(context) for p in model.collect_params().values()]
            gluon.utils.clip_global_norm(grads, args.clip)

            trainer.step(1)
            total_L += mx.nd.sum(L).asscalar()

            if i % args.log_interval == 0 and i > 0:
                cur_L = total_L / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f' % (
                    epoch, i, cur_L, math.exp(cur_L)))
                total_L = 0.0

        if args.export_model:
            model.export('model')
            return

        val_L = eval(val_data)
        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f' % (
            epoch, time.time() - start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.save_parameters(args.save)
            print('test loss %.2f, test ppl %.2f' % (test_L, math.exp(test_L)))
        else:
            args.lr = args.lr * 0.25
            trainer.set_learning_rate(args.lr)
def evaluate(data_source, batch_size):
    '''https://mxnet.incubator.apache.org/api/python/autograd/autograd.html#train-mode-and-predict-mode'''
    tic = time.time()
    total_loss = 0
    N = 0
    states = model.begin_state(batch_size, ctx=ctxs[0])
    for cursor in range(0, data_source.shape[0] - 1, args.bptt):
        Xs, Ys = get_batch(data_source, cursor, args)
        # By default, MXNet is in predict_mode
        output, states, _, _ = model(Xs, states)  # state(num_layers, bsz, hidden_size)
        states = detach(states)
        total_loss += nd.sum(batch_size * loss(output, Ys)).asscalar()  # loss (seq_len,)
        N += batch_size * len(output)
    return (total_loss / N), time.time() - tic
def eval(data_source, ctx):
    total_L = 0.0
    ntotal = 0
    # Integer division so begin_state receives an int batch size per device.
    hidden_states = [model.begin_state(func=mx.nd.zeros,
                                       batch_size=args.batch_size // len(ctx),
                                       ctx=ctx[i])
                     for i in range(len(ctx))]
    for i in range(0, data_source.shape[0] - 1, args.bptt):
        data_batch, target_batch = get_batch(data_source, i)
        data = gluon.utils.split_and_load(data_batch, ctx_list=ctx, batch_axis=1)
        target = gluon.utils.split_and_load(target_batch, ctx_list=ctx, batch_axis=1)
        for (d, t) in zip(data, target):
            hidden = hidden_states[d.context.device_id]
            output, hidden = model(d, hidden)
            L = loss(output, t.reshape((-1,)))
            total_L += mx.nd.sum(L).asscalar()
            ntotal += L.size
    return total_L / ntotal
def eval(data_source, ctx):
    total_L = 0.0
    ntotal = 0
    hidden_states = [model.begin_state(func=mx.nd.zeros,
                                       batch_size=int(args.batch_size / len(ctx)),
                                       ctx=ctx[i])
                     for i in range(len(ctx))]
    for i in range(0, data_source.shape[0] - 1, args.bptt):
        data_batch, target_batch = get_batch(data_source, i)
        data = gluon.utils.split_and_load(data_batch, ctx_list=ctx, batch_axis=1)
        target = gluon.utils.split_and_load(target_batch, ctx_list=ctx, batch_axis=1)
        for (d, t) in zip(data, target):
            hidden = hidden_states[d.context.device_id]
            output, hidden = model(d, hidden)
            L = loss(output, t.reshape((-1,)))
            total_L += mx.nd.sum(L).asscalar()
            ntotal += L.size
    return total_L / ntotal
def train(epochs, ctx):
    best_val = float("Inf")
    for epoch in range(epochs):
        total_L = 0.0
        cur_L = 0.0
        tic = time.time()
        hidden_states = [model.begin_state(func=mx.nd.zeros,
                                           batch_size=args.batch_size // len(ctx),
                                           ctx=ctx[i])
                         for i in range(len(ctx))]
        btic = time.time()
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1, args.bptt)):
            # get a data batch from the training data
            data_batch, target_batch = get_batch(train_data, i)
            # For RNNs we can do within-batch multi-device parallelization
            data = gluon.utils.split_and_load(data_batch, ctx_list=ctx, batch_axis=1)
            target = gluon.utils.split_and_load(target_batch, ctx_list=ctx, batch_axis=1)
            Ls = []
            for (d, t) in zip(data, target):
                # get the corresponding hidden state, then update it
                hidden = detach(hidden_states[d.context.device_id])
                with autograd.record():
                    output, hidden = model(d, hidden)
                    L = loss(output, t.reshape((-1,)))
                    L.backward()
                Ls.append(L)
                # write the updated state back
                hidden_states[d.context.device_id] = hidden

            for c in ctx:
                grads = [p.grad(c) for p in model.collect_params().values()]
                # Here the gradient is for the whole batch, so we multiply
                # max_norm by batch_size and bptt to balance it. This utility
                # function must also be applied within a single context.
                gluon.utils.clip_global_norm(
                    grads, args.clip * args.bptt * args.batch_size / len(ctx))

            trainer.step(args.batch_size)
            for L in Ls:
                total_L += mx.nd.sum(L).asscalar()

            if ibatch % args.log_interval == 0 and ibatch > 0:
                cur_L = total_L / args.bptt / args.batch_size / args.log_interval
                logging.info('[Epoch %d Batch %d] Speed: %f samples/sec loss %.2f, ppl %.2f' % (
                    epoch, ibatch, args.batch_size / (time.time() - btic),
                    cur_L, math.exp(cur_L)))
                total_L = 0.0
            btic = time.time()

        logging.info('[Epoch %d] train loss %.2f, train ppl %.2f' % (
            epoch, cur_L, math.exp(cur_L)))
        logging.info('[Epoch %d] time cost %.2f' % (epoch, time.time() - tic))
        val_L = eval(val_data, ctx)
        logging.info('[Epoch %d] valid loss %.2f, valid ppl %.2f' % (
            epoch, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            # test_L = eval(test_data, ctx)
            model.collect_params().save('model.params')
            # logging.info('test loss %.2f, test ppl %.2f' % (test_L, math.exp(test_L)))
        else:
            args.lr = args.lr * 0.25
            trainer._init_optimizer('sgd', {'learning_rate': args.lr,
                                            'momentum': 0,
                                            'wd': 0})
            model.collect_params().load('model.params', ctx)
def train(epochs, ctx):
    best_val = float("Inf")
    for epoch in range(epochs):
        total_L = 0.0
        start_time = time.time()
        hidden_states = [model.begin_state(func=mx.nd.zeros,
                                           batch_size=args.batch_size // len(ctx),
                                           ctx=ctx[i])
                         for i in range(len(ctx))]
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1, args.bptt)):
            # get a data batch from the training data
            data_batch, target_batch = get_batch(train_data, i)
            # For RNNs we can do within-batch multi-device parallelization
            data = gluon.utils.split_and_load(data_batch, ctx_list=ctx, batch_axis=1)
            target = gluon.utils.split_and_load(target_batch, ctx_list=ctx, batch_axis=1)
            Ls = []
            for (d, t) in zip(data, target):
                # get the corresponding hidden state, then update it
                hidden = detach(hidden_states[d.context.device_id])
                with autograd.record():
                    output, hidden = model(d, hidden)
                    L = loss(output, t.reshape((-1,)))
                    L.backward()
                Ls.append(L)
                # write the updated state back
                hidden_states[d.context.device_id] = hidden

            for c in ctx:
                grads = [p.grad(c) for p in model.collect_params().values()]
                # Here the gradient is for the whole batch, so we multiply
                # max_norm by batch_size and bptt to balance it. This utility
                # function must also be applied within a single context.
                gluon.utils.clip_global_norm(
                    grads, args.clip * args.bptt * args.batch_size / len(ctx))

            trainer.step(args.batch_size)
            for L in Ls:
                total_L += mx.nd.sum(L).asscalar()

            if ibatch % args.log_interval == 0 and ibatch > 0:
                cur_L = total_L / args.bptt / args.batch_size / args.log_interval
                logging.info('[Epoch %d Batch %d] loss %.2f, ppl %.2f' % (
                    epoch, ibatch, cur_L, math.exp(cur_L)))
                total_L = 0.0

        val_L = eval(val_data, ctx)
        logging.info('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f' % (
            epoch, time.time() - start_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data, ctx)
            model.collect_params().save('model.params')
            logging.info('test loss %.2f, test ppl %.2f' % (test_L, math.exp(test_L)))
        else:
            args.lr = args.lr * 0.25
            trainer._init_optimizer('sgd', {'learning_rate': args.lr,
                                            'momentum': 0,
                                            'wd': 0})
            model.collect_params().load('model.params', ctx)
def train_one_epoch(epoch, cur_lr):
    '''Train all the batches within one epoch.'''
    total_loss = 0
    # One hidden state per device; m is the number of samples per device,
    # divided along the batch_size axis.
    states = [model.begin_state(batch_size=m, ctx=ctx) for ctx in ctxs]
    # Loop over all batches
    batch, cursor = 0, 0
    tic_log_interval = time.time()
    while cursor < train_data.shape[0] - 1 - 1:
        #######################################################################
        # Control seq_len as cited from the original paper
        random_bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Normal distribution (mean, variance): prevent extreme sequence lengths
        seq_len = max(5, int(np.random.normal(random_bptt, 5)))
        # There's a very small chance of selecting a very long sequence length,
        # resulting in OOM
        seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)
        # Rescale the learning rate depending on the variable length w.r.t. bptt
        trainer.set_learning_rate(cur_lr * seq_len / args.bptt)
        #######################################################################

        # Each batch has shape (seq_len, batch_size); split the data across devices.
        Xs, Ys = get_batch(train_data, cursor, args, seq_len=seq_len)
        assert args.batch_size == Xs.shape[1], 'data shape[1] should be batch_size'
        Xs = gluon.utils.split_and_load(Xs, ctxs, 1)
        Ys = gluon.utils.split_and_load(Ys, ctxs, 1)

        tic_b = time.time()
        # Starting each batch, we detach the hidden state from how it was
        # previously produced. If we didn't, the model would try backpropagating
        # all the way to the start of the dataset.
        states = detach(states)
        loss_list = []
        with autograd.record():  # train_mode
            for i, X in enumerate(Xs):
                output, states[i], encoded_raw, encoded_dropped = model(
                    X, states[i])  # state(num_layers, bsz, hidden_size)
                device_loss = joint_loss(output, Ys[i], encoded_raw, encoded_dropped)
                loss_list.append(device_loss.as_in_context(ctxs[0]) / X.size)
        for l in loss_list:
            l.backward()

        # trainer.allreduce_grads(): for each parameter, reduce the gradients
        # from different contexts. It should be called after autograd.backward(),
        # outside of the record() scope, and before trainer.update(). For normal
        # parameter updates, step() should be used, which internally calls
        # allreduce_grads() and then update(). For gradient clipping, however,
        # call allreduce_grads() and update() separately.
        # trainer.allreduce_grads()
        # grads = [p.grad(ctxs[0]) for p in parameters]
        grads = [p.grad(ctx) for ctx in ctxs for p in parameters]
        gluon.utils.clip_global_norm(grads, args.clipping_theta)
        trainer.step(1)  # trainer.update(1)

        batch_loss = sum([nd.sum(l).asscalar() for l in loss_list]) / len(ctxs)
        toc_b = time.time()
        batch_info.append([
            epoch, batch, trainer.learning_rate, seq_len,
            (toc_b - tic_b) * 1000,
            args.batch_size * seq_len // (toc_b - tic_b),
            batch_loss, math.exp(batch_loss)
        ])
        total_loss += batch_loss

        if batch % args.log_interval == 0 and batch > 0:
            utils.save_info(batch_info, batch_file)
            toc_log_interval = time.time()
            total_loss = total_loss / args.log_interval
            logging.info(
                '| epoch {:4d} ({:5.2f}%)| batch {:4d} | lr {:7.4f} | seq_len {:2d} | '
                '{:4.0f} ms/batch | {:5d} tokens/s | loss {:6.3f} | ppl {:5.2f}'.format(
                    epoch, cursor / train_data.shape[0] * 100, batch,
                    trainer.learning_rate, seq_len,
                    (toc_log_interval - tic_log_interval) * 1000 / args.log_interval,
                    int(args.batch_size * args.log_interval * seq_len /
                        (toc_log_interval - tic_log_interval)),
                    total_loss, math.exp(total_loss)))
            total_loss = 0
            tic_log_interval = time.time()

        batch += 1
        cursor += seq_len

        global parameters_count
        if not parameters_count:
            logging.info('Parameters (except embedding): {}'.format(
                sum(p.data(ctxs[0]).size for p in parameters)))
            parameters_count = 1
        nd.waitall()  # synchronize batch data