def run(args, local_rank):
    """Train a BIGLM model synchronously, on one process or several.

    Builds the vocabulary and model, optionally restores model and optimizer
    state from ``args.start_from``, then loops over the training data forever,
    logging every ``args.print_every`` batches and writing a checkpoint every
    ``args.save_every`` batches. Logging and checkpointing happen only when
    ``args.world_size == 1`` or on the process whose ``dist.get_rank()`` is 0.

    Args:
        args: Configuration namespace. Attributes read here: ``vocab``,
            ``min_occur_cnt``, ``world_size``, ``embed_dim``, ``ff_embed_dim``,
            ``num_heads``, ``dropout``, ``layers``, ``smoothing``, ``approx``,
            ``start_from``, ``lr``, ``warmup_steps``, ``train_data``,
            ``batch_size``, ``max_len``, ``min_len``, ``print_every``,
            ``save_every`` and ``save_dir``.
        local_rank: Passed to ``BIGLM`` and used as the CUDA device for the
            model and for every batch tensor.

    This function does not return: the outer loop has no exit condition.
    """
    torch.manual_seed(1234)
    vocab = Vocab(args.vocab, min_occur_cnt=args.min_occur_cnt, specials=[])

    # Evaluated once instead of on every batch. The short-circuit keeps
    # dist.get_rank() from being called in single-process runs.
    is_master = args.world_size == 1 or dist.get_rank() == 0
    if is_master:
        print("vocab.size = %d" % vocab.size, flush=True)

    model = BIGLM(local_rank, vocab, args.embed_dim, args.ff_embed_dim,
                  args.num_heads, args.dropout, args.layers, args.smoothing,
                  args.approx)
    if args.start_from is not None:
        # NOTE: torch.load unpickles the file; only resume from checkpoints
        # that come from a trusted source.
        ckpt = torch.load(args.start_from, map_location='cpu')
        model.load_state_dict(ckpt['model'])
    model = model.cuda(local_rank)

    if args.world_size > 1:
        # Re-seed with a rank-dependent value so each process draws a
        # different random stream from here on.
        torch.manual_seed(1234 + dist.get_rank())
        random.seed(5678 + dist.get_rank())

    optimizer = Optim(model.embed_dim, args.lr, args.warmup_steps,
                      torch.optim.Adam(model.parameters(), lr=0,
                                       betas=(0.9, 0.998), eps=1e-9))
    if args.start_from is not None:
        optimizer.load_state_dict(ckpt['optimizer'])

    # Per-rank shard variant, currently disabled:
    # train_data = DataLoader(vocab, args.train_data+"0"+str(local_rank), args.batch_size, args.max_len, args.min_len)
    train_data = DataLoader(vocab, args.train_data, args.batch_size,
                            args.max_len, args.min_len)

    batch_acm = 0
    acc_acm, nll_acm, ppl_acm, ntokens_acm, nxs, npairs_acm, loss_acm = \
        0., 0., 0., 0., 0., 0., 0.
    while True:
        model.train()
        for truth, inp, msk in train_data:
            batch_acm += 1
            truth = truth.cuda(local_rank)
            inp = inp.cuda(local_rank)
            msk = msk.cuda(local_rank)

            model.zero_grad()
            res, loss, acc, nll, ppl, ntokens, npairs = model(truth, inp, msk)
            loss_acm += loss.item()
            acc_acm += acc
            nll_acm += nll
            ppl_acm += ppl
            ntokens_acm += ntokens
            npairs_acm += npairs
            nxs += npairs

            loss.backward()
            if args.world_size > 1:
                average_gradients(model)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # `-1 % k` is `k - 1` in Python, so each condition below fires on
            # the last batch of every run of k batches.
            if is_master and batch_acm % args.print_every == -1 % args.print_every:
                print('batch_acm %d, loss %.3f, acc %.3f, nll %.3f, ppl %.3f, x_acm %d, lr %.6f'
                      % (batch_acm, loss_acm / args.print_every,
                         acc_acm / ntokens_acm, nll_acm / nxs, ppl_acm / nxs,
                         npairs_acm, optimizer._rate), flush=True)
                # npairs_acm is not reset, so x_acm is a running total.
                acc_acm, nll_acm, ppl_acm, ntokens_acm, loss_acm, nxs = \
                    0., 0., 0., 0., 0., 0.

            if is_master and batch_acm % args.save_every == -1 % args.save_every:
                # makedirs(exist_ok=True) also creates missing parents and
                # has no gap between the existence check and the creation.
                os.makedirs(args.save_dir, exist_ok=True)
                torch.save({'args': args,
                            'model': model.state_dict(),
                            'optimizer': optimizer.state_dict()},
                           '%s/epoch%d_batch_%d' % (args.save_dir,
                                                    train_data.epoch_id,
                                                    batch_acm))
def run(args, local_rank):
    """Train a BIGLM model synchronously until ``args.max_epoch`` is passed.

    Builds the vocabulary and model, optionally restores model and optimizer
    state from ``args.start_from``, then repeatedly iterates the training
    data. Every ``args.print_every`` batches a timestamp and the running
    metrics are printed; every ``args.save_every`` batches ``eval_epoch`` is
    run and a checkpoint is written. Both happen only when
    ``args.world_size == 1`` or on the process whose ``dist.get_rank()`` is 0.

    Args:
        args: Configuration namespace. Attributes read here: ``vocab``,
            ``min_occur_cnt``, ``world_size``, ``embed_dim``, ``ff_embed_dim``,
            ``num_heads``, ``dropout``, ``layers``, ``smoothing``,
            ``start_from``, ``lr``, ``warmup_steps``, ``train_data``,
            ``batch_size``, ``max_len``, ``min_len``, ``max_epoch``,
            ``print_every``, ``save_every`` and ``save_dir``.
        local_rank: Passed to ``BIGLM`` and ``eval_epoch`` and used as the
            CUDA device for the model and for every batch tensor.

    Returns:
        None, once ``train_data.epoch_id`` exceeds ``args.max_epoch`` at the
        start of a pass over ``train_data``.
    """
    torch.manual_seed(1234)
    vocab = Vocab(args.vocab, min_occur_cnt=args.min_occur_cnt, specials=[])

    # Evaluated once instead of on every batch. The short-circuit keeps
    # dist.get_rank() from being called in single-process runs.
    is_master = args.world_size == 1 or dist.get_rank() == 0
    if is_master:
        print("vocab.size = " + str(vocab.size), flush=True)

    model = BIGLM(local_rank, vocab, args.embed_dim, args.ff_embed_dim,
                  args.num_heads, args.dropout, args.layers, args.smoothing)
    if args.start_from is not None:
        # NOTE: torch.load unpickles the file; only resume from checkpoints
        # that come from a trusted source.
        ckpt = torch.load(args.start_from, map_location='cpu')
        model.load_state_dict(ckpt['model'])
    model = model.cuda(local_rank)

    optimizer = Optim(
        model.embed_dim, args.lr, args.warmup_steps,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.998),
                         eps=1e-9))
    if args.start_from is not None:
        optimizer.load_state_dict(ckpt['optimizer'])

    train_data = DataLoader(vocab, args.train_data, args.batch_size,
                            args.max_len, args.min_len)

    batch_acm = 0
    acc_acm, nll_acm, ppl_acm, ntokens_acm, nxs, npairs_acm, loss_acm = \
        0., 0., 0., 0., 0., 0., 0.
    while True:
        model.train()
        # The epoch limit is only checked between passes over train_data.
        if train_data.epoch_id > args.max_epoch:
            break
        for (xs_tpl, xs_seg, xs_pos, ys_truth, ys_inp, ys_tpl, ys_seg,
             ys_pos, msk) in train_data:
            batch_acm += 1
            xs_tpl = xs_tpl.cuda(local_rank)
            xs_seg = xs_seg.cuda(local_rank)
            xs_pos = xs_pos.cuda(local_rank)
            ys_truth = ys_truth.cuda(local_rank)
            ys_inp = ys_inp.cuda(local_rank)
            ys_tpl = ys_tpl.cuda(local_rank)
            ys_seg = ys_seg.cuda(local_rank)
            ys_pos = ys_pos.cuda(local_rank)
            msk = msk.cuda(local_rank)

            model.zero_grad()
            res, loss, acc, nll, ppl, ntokens, npairs = model(
                xs_tpl, xs_seg, xs_pos, ys_truth, ys_inp, ys_tpl, ys_seg,
                ys_pos, msk)

            # Author's study notes (translated; not verified against the code):
            # Reference: http://www.myzaker.com/article/5f3747a28e9f096c723a65e0/
            # Besides the usual text-generation metrics PPL and Distinct, the
            # paper also designs metrics for format accuracy, rhyme accuracy
            # and sentence integrity.
            # Format accuracy: precision p, recall r and F1 score
            # -> Macro-F1 and Micro-F1.
            # Integrity involves an odd-looking log value.
            # Traditional BLEU and ROUGE are not used at all in SongNet;
            # creative writing calls for diversity.
            loss_acm += loss.item()  # loss
            acc_acm += acc  # accuracy
            nll_acm += nll
            # Sum of -log, i.e. the probability of the sentence; the smaller
            # it is, the higher the perplexity.
            # New metric: perplexity, compares how well two models predict
            # samples; lower perplexity is better?? How is it defined?
            ppl_acm += ppl
            ntokens_acm += ntokens  # number of characters
            npairs_acm += npairs  # sentences?
            nxs += npairs  # TODO(review): unclear why npairs is counted twice

            loss.backward()
            # The value returned by average_gradients is used as an
            # "ok to apply this step" flag; single-process runs always apply.
            if args.world_size > 1:
                is_normal = average_gradients(model)
            else:
                is_normal = True
            if is_normal:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
            else:
                # Skip only the parameter update. Execution still reaches the
                # logging and checkpoint blocks below, so a skipped step that
                # lands on a boundary batch does not drop that interval's
                # report or checkpoint, and the running sums keep covering
                # exactly print_every batches.
                print("gradient: none, gpu: " + str(local_rank), flush=True)

            # `-1 % k` is `k - 1` in Python, so each condition below fires on
            # the last batch of every run of k batches.
            if is_master and batch_acm % args.print_every == -1 % args.print_every:
                today = datetime.datetime.now()
                print(today)
                print(
                    'batch_acm %d, loss %.3f, acc %.3f, nll %.3f, ppl %.3f, x_acm %d, lr %.6f'
                    % (batch_acm, loss_acm / args.print_every,
                       acc_acm / ntokens_acm, nll_acm / nxs, ppl_acm / nxs,
                       npairs_acm, optimizer._rate),
                    flush=True)
                # npairs_acm is not reset, so x_acm is a running total.
                acc_acm, nll_acm, ppl_acm, ntokens_acm, loss_acm, nxs = \
                    0., 0., 0., 0., 0., 0.

            if is_master and batch_acm % args.save_every == -1 % args.save_every:
                # makedirs(exist_ok=True) also creates missing parents and
                # has no gap between the existence check and the creation.
                os.makedirs(args.save_dir, exist_ok=True)
                # Evaluate in eval mode, then switch back before training
                # continues.
                model.eval()
                eval_epoch(
                    args, model, vocab, local_rank,
                    "epoch-" + str(train_data.epoch_id) + "-acm-" +
                    str(batch_acm))
                model.train()
                torch.save(
                    {
                        'args': args,
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict()
                    }, '%s/epoch%d_batch_%d' %
                    (args.save_dir, train_data.epoch_id, batch_acm))
def run(args, local_rank):
    """Train a BIGLM model synchronously with weight decay and optional fp16.

    Builds the vocabulary and model, optionally restores model and optimizer
    state from ``args.start_from``, splits the parameters into a weight-decay
    group and a no-decay group, and then loops over the training data
    forever. The learning rate is ramped linearly over the first
    ``args.warmup_steps`` batches. Logging (every ``args.print_every``
    batches) and checkpointing (every ``args.save_every`` batches) happen
    only when ``args.world_size == 1`` or on the process whose
    ``dist.get_rank()`` is 0.

    Args:
        args: Configuration namespace. Attributes read here: ``vocab``,
            ``min_occur_cnt``, ``world_size``, ``embed_dim``, ``ff_embed_dim``,
            ``num_heads``, ``dropout``, ``layers``, ``approx``,
            ``start_from``, ``fp16``, ``lr``, ``warmup_steps``,
            ``train_data``, ``batch_size``, ``max_len``, ``print_every``,
            ``save_every`` and ``save_dir``.
        local_rank: Passed to ``BIGLM`` and used as the CUDA device for the
            model and for every batch tensor.

    Raises:
        ImportError: If ``args.fp16`` is set and apex cannot be imported.

    This function does not return: the outer loop has no exit condition.
    """
    torch.manual_seed(1234)
    vocab = Vocab(args.vocab, min_occur_cnt=args.min_occur_cnt, specials=[])

    # Evaluated once instead of on every batch. The short-circuit keeps
    # dist.get_rank() from being called in single-process runs.
    is_master = args.world_size == 1 or dist.get_rank() == 0
    if is_master:
        print(vocab.size)

    model = BIGLM(local_rank, vocab, args.embed_dim, args.ff_embed_dim,
                  args.num_heads, args.dropout, args.layers, args.approx)
    if args.start_from is not None:
        # NOTE: torch.load unpickles the file; only resume from checkpoints
        # that come from a trusted source.
        ckpt = torch.load(args.start_from, map_location='cpu')
        model.load_state_dict(ckpt['model'])
    model = model.cuda(local_rank)

    # Parameters whose name ends in 'bias' or contains 'layer_norm' get no
    # weight decay; everything else decays at 0.01.
    weight_decay_params = []
    no_weight_decay_params = []
    for name, param in model.named_parameters():
        if name.endswith('bias') or 'layer_norm' in name:
            no_weight_decay_params.append(param)
        else:
            weight_decay_params.append(param)
    grouped_params = [
        {'params': weight_decay_params, 'weight_decay': 0.01},
        {'params': no_weight_decay_params, 'weight_decay': 0.},
    ]

    if args.world_size > 1:
        # Re-seed with a rank-dependent value so each process draws a
        # different random stream from here on.
        torch.manual_seed(1234 + dist.get_rank())
        random.seed(5678 + dist.get_rank())

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError as err:
            # Chain the original error so the real import failure stays
            # visible in the traceback.
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") from err
        optimizer = FusedAdam(grouped_params, lr=args.lr, betas=(0.9, 0.999),
                              eps=1e-6, bias_correction=False,
                              max_grad_norm=1.0)
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
    else:
        optimizer = AdamWeightDecayOptimizer(grouped_params, lr=args.lr,
                                             betas=(0.9, 0.999), eps=1e-6)
    if args.start_from is not None:
        optimizer.load_state_dict(ckpt['optimizer'])

    train_data = DataLoader(vocab, args.train_data, args.batch_size,
                            args.max_len)

    batch_acm = 0
    acc_acm, ntokens_acm, npairs_acm, loss_acm = 0., 0., 0., 0.
    while True:
        model.train()
        for truth, inp, msk in train_data:
            batch_acm += 1
            # Linear warm-up: scale the rate by the fraction of warm-up
            # batches seen so far. After warm-up the rate is left untouched.
            if batch_acm <= args.warmup_steps:
                update_lr(optimizer, args.lr * batch_acm / args.warmup_steps)
            truth = truth.cuda(local_rank)
            inp = inp.cuda(local_rank)
            msk = msk.cuda(local_rank)

            optimizer.zero_grad()
            res, loss, acc, ntokens, npairs = model(truth, inp, msk)
            loss_acm += loss.item()
            acc_acm += acc
            ntokens_acm += ntokens
            npairs_acm += npairs

            # The fp16 wrapper drives the backward pass itself.
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            if args.world_size > 1:
                average_gradients(model)
            # NOTE(review): this clip also runs in fp16 mode, where FusedAdam
            # is additionally given max_grad_norm=1.0 -- confirm the two are
            # meant to be combined.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # `-1 % k` is `k - 1` in Python, so each condition below fires on
            # the last batch of every run of k batches.
            if is_master and batch_acm % args.print_every == -1 % args.print_every:
                print('batch_acm %d, loss %.3f, acc %.3f, x_acm %d'
                      % (batch_acm, loss_acm / args.print_every,
                         acc_acm / ntokens_acm, npairs_acm))
                # npairs_acm is not reset, so x_acm is a running total.
                acc_acm, ntokens_acm, loss_acm = 0., 0., 0.

            if is_master and batch_acm % args.save_every == -1 % args.save_every:
                # makedirs(exist_ok=True) also creates missing parents and
                # has no gap between the existence check and the creation.
                os.makedirs(args.save_dir, exist_ok=True)
                torch.save({'args': args,
                            'model': model.state_dict(),
                            'optimizer': optimizer.state_dict()},
                           '%s/epoch%d_batch_%d' % (args.save_dir,
                                                    train_data.epoch_id,
                                                    batch_acm))
def run(args, local_rank):
    """Train a BIGLM model synchronously for a bounded number of epochs.

    Builds the vocabulary and model, optionally restores model and optimizer
    state from ``args.start_from``, then repeatedly iterates the training
    data. Every ``args.print_every`` batches the running metrics are printed;
    every ``args.save_every`` batches ``eval_epoch`` is run and a checkpoint
    is written. Both happen only when ``args.world_size == 1`` or on the
    process whose ``dist.get_rank()`` is 0.

    Args:
        args: Configuration namespace. Attributes read here: ``vocab``,
            ``min_occur_cnt``, ``world_size``, ``embed_dim``, ``ff_embed_dim``,
            ``num_heads``, ``dropout``, ``layers``, ``smoothing``,
            ``start_from``, ``lr``, ``warmup_steps``, ``train_data``,
            ``batch_size``, ``max_len``, ``min_len``, ``print_every``,
            ``save_every`` and ``save_dir``. ``max_epoch`` is optional and
            defaults to 30 when the attribute is absent.
        local_rank: Passed to ``BIGLM`` and ``eval_epoch`` and used as the
            CUDA device for the model and for every batch tensor.

    Returns:
        None, once ``train_data.epoch_id`` exceeds the epoch limit at the
        start of a pass over ``train_data``.
    """
    torch.manual_seed(1234)
    vocab = Vocab(args.vocab, min_occur_cnt=args.min_occur_cnt, specials=[])

    # Evaluated once instead of on every batch. The short-circuit keeps
    # dist.get_rank() from being called in single-process runs.
    is_master = args.world_size == 1 or dist.get_rank() == 0
    if is_master:
        print("vocab.size = " + str(vocab.size), flush=True)

    model = BIGLM(local_rank, vocab, args.embed_dim, args.ff_embed_dim,
                  args.num_heads, args.dropout, args.layers, args.smoothing)
    if args.start_from is not None:
        # NOTE: torch.load unpickles the file; only resume from checkpoints
        # that come from a trusted source.
        ckpt = torch.load(args.start_from, map_location='cpu')
        model.load_state_dict(ckpt['model'])
    model = model.cuda(local_rank)

    optimizer = Optim(
        model.embed_dim, args.lr, args.warmup_steps,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.998),
                         eps=1e-9))
    if args.start_from is not None:
        optimizer.load_state_dict(ckpt['optimizer'])

    train_data = DataLoader(vocab, args.train_data, args.batch_size,
                            args.max_len, args.min_len)

    # The limit can be overridden through args; 30 is kept as the fallback
    # so existing configurations behave as before.
    max_epoch = getattr(args, 'max_epoch', 30)

    batch_acm = 0
    acc_acm, nll_acm, ppl_acm, ntokens_acm, nxs, npairs_acm, loss_acm = \
        0., 0., 0., 0., 0., 0., 0.
    while True:
        model.train()
        # The epoch limit is only checked between passes over train_data.
        if train_data.epoch_id > max_epoch:
            break
        for (xs_tpl, xs_seg, xs_pos, ys_truth, ys_inp, ys_tpl, ys_seg,
             ys_pos, msk) in train_data:
            batch_acm += 1
            xs_tpl = xs_tpl.cuda(local_rank)
            xs_seg = xs_seg.cuda(local_rank)
            xs_pos = xs_pos.cuda(local_rank)
            ys_truth = ys_truth.cuda(local_rank)
            ys_inp = ys_inp.cuda(local_rank)
            ys_tpl = ys_tpl.cuda(local_rank)
            ys_seg = ys_seg.cuda(local_rank)
            ys_pos = ys_pos.cuda(local_rank)
            msk = msk.cuda(local_rank)

            model.zero_grad()
            res, loss, acc, nll, ppl, ntokens, npairs = model(
                xs_tpl, xs_seg, xs_pos, ys_truth, ys_inp, ys_tpl, ys_seg,
                ys_pos, msk)
            loss_acm += loss.item()
            acc_acm += acc
            nll_acm += nll
            ppl_acm += ppl
            ntokens_acm += ntokens
            npairs_acm += npairs
            nxs += npairs

            loss.backward()
            # The value returned by average_gradients is used as an
            # "ok to apply this step" flag; single-process runs always apply.
            if args.world_size > 1:
                is_normal = average_gradients(model)
            else:
                is_normal = True
            if is_normal:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
            else:
                # Skip only the parameter update. Execution still reaches the
                # logging and checkpoint blocks below, so a skipped step that
                # lands on a boundary batch does not drop that interval's
                # report or checkpoint, and the running sums keep covering
                # exactly print_every batches.
                print("gradient: none, gpu: " + str(local_rank), flush=True)

            # `-1 % k` is `k - 1` in Python, so each condition below fires on
            # the last batch of every run of k batches.
            if is_master and batch_acm % args.print_every == -1 % args.print_every:
                print('batch_acm %d, loss %.3f, acc %.3f, nll %.3f, ppl %.3f, x_acm %d, lr %.6f'
                      % (batch_acm, loss_acm / args.print_every,
                         acc_acm / ntokens_acm, nll_acm / nxs, ppl_acm / nxs,
                         npairs_acm, optimizer._rate), flush=True)
                # npairs_acm is not reset, so x_acm is a running total.
                acc_acm, nll_acm, ppl_acm, ntokens_acm, loss_acm, nxs = \
                    0., 0., 0., 0., 0., 0.

            if is_master and batch_acm % args.save_every == -1 % args.save_every:
                # makedirs(exist_ok=True) also creates missing parents and
                # has no gap between the existence check and the creation.
                os.makedirs(args.save_dir, exist_ok=True)
                # Evaluate in eval mode, then switch back before training
                # continues.
                model.eval()
                eval_epoch(
                    args, model, vocab, local_rank,
                    "epoch-" + str(train_data.epoch_id) + "-acm-" +
                    str(batch_acm))
                model.train()
                torch.save(
                    {
                        'args': args,
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict()
                    }, '%s/epoch%d_batch_%d' %
                    (args.save_dir, train_data.epoch_id, batch_acm))