def train_entry(config):
    from models import QANet

    with open(config.word_emb_file, "rb") as fh:
        word_mat = np.array(pickle.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "rb") as fh:
        char_mat = np.array(pickle.load(fh), dtype=np.float32)
    with open(config.dev_eval_file, "r") as fh:
        dev_eval_file = json.load(fh)

    print("Building model...")
    train_dataset = SQuADDataset(config.train_record_file, config.num_steps,
                                 config.batch_size)
    dev_dataset = SQuADDataset(config.dev_record_file, config.val_num_batches,
                               config.batch_size)

    lr = config.learning_rate
    base_lr = 1
    lr_warm_up_num = config.lr_warm_up_num

    model = QANet(word_mat, char_mat).to(device)
    parameters = filter(lambda param: param.requires_grad, model.parameters())
    # optimizer = optim.Adam(lr=base_lr, betas=(0.8, 0.999), eps=1e-7,
    #                        weight_decay=3e-7, params=parameters)
    # optimizer = optim.SparseAdam(lr=lr, betas=(0.8, 0.999), eps=1e-7,
    #                              params=parameters)
    optimizer = optim.Adam(lr=lr, params=parameters)
    # cr = lr / math.log2(lr_warm_up_num)
    # scheduler = optim.lr_scheduler.LambdaLR(
    #     optimizer,
    #     lr_lambda=lambda ee: cr * math.log2(ee + 1) if ee < lr_warm_up_num else lr)
    scheduler = None  # no warm-up schedule; see the `unused` flag below

    L = config.checkpoint
    N = config.num_steps
    best_f1 = 0
    best_em = 0
    patience = 0
    # With `unused = False` the ExponentialLR branch below never fires,
    # so this variant trains on a constant learning rate.
    unused = False
    for iter in range(0, N, L):
        train(model, optimizer, scheduler, train_dataset, iter, L)
        metrics = test(model, dev_dataset, dev_eval_file)
        if iter + L >= lr_warm_up_num - 1 and unused:
            optimizer.param_groups[0]['initial_lr'] = lr
            scheduler = optim.lr_scheduler.ExponentialLR(optimizer, 0.99997)
            unused = False
        if config.print_weight:
            print_weight(model, 5, iter + L)
        # print("Learning rate: {}".format(scheduler.get_lr()))
        dev_f1 = metrics["f1"]
        dev_em = metrics["exact_match"]
        if dev_f1 < best_f1 and dev_em < best_em:
            # both metrics got worse: count toward early stopping
            patience += 1
            if patience > config.early_stop:
                break
        else:
            patience = 0
            best_f1 = max(best_f1, dev_f1)
            best_em = max(best_em, dev_em)
    fn = os.path.join(config.save_dir, "model.pt")
    torch.save(model, fn)
def train_entry(config):
    from models import QANet

    with open(config.word_emb_file, "r") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "r") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.train_eval_file, "r") as fh:
        train_eval_file = json.load(fh)
    with open(config.dev_eval_file, "r") as fh:
        dev_eval_file = json.load(fh)

    print("Building model...")
    train_dataset = SQuADDataset(config.train_record_file, config.num_steps,
                                 config.batch_size)
    dev_dataset = SQuADDataset(config.dev_record_file, config.test_num_batches,
                               config.batch_size)

    lr = config.learning_rate
    base_lr = 1.0
    warm_up = config.lr_warm_up_num

    model = QANet(word_mat, char_mat).to(device)
    ema = EMA(config.ema_decay)
    for name, p in model.named_parameters():
        if p.requires_grad:
            ema.set(name, p)
    params = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = optim.Adam(lr=base_lr, betas=(config.beta1, config.beta2),
                           eps=1e-7, weight_decay=3e-7, params=params)
    cr = lr / math.log2(warm_up)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log2(ee + 1) if ee < warm_up else lr)

    L = config.checkpoint
    N = config.num_steps
    best_f1 = best_em = patience = 0
    for iter in range(0, N, L):
        train(model, optimizer, scheduler, ema, train_dataset, iter, L)
        valid(model, train_dataset, train_eval_file)
        metrics = test(model, dev_dataset, dev_eval_file)
        print("Learning rate: {}".format(scheduler.get_lr()))
        dev_f1 = metrics["f1"]
        dev_em = metrics["exact_match"]
        if dev_f1 < best_f1 and dev_em < best_em:
            patience += 1
            if patience > config.early_stop:
                break
        else:
            patience = 0
            best_f1 = max(best_f1, dev_f1)
            best_em = max(best_em, dev_em)
    fn = os.path.join(config.save_dir,
                      f"model_iter={iter}_best_f1={best_f1}.pt")
    torch.save(model, fn)
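# The snippets above and below rely on an EMA helper (exponential moving
# average of the weights) that is not defined in this section. A minimal
# sketch of what such a class could look like, assuming the set/register,
# __call__, assign, and resume API used at the call sites; names and the
# update rule are assumptions, not the original implementation:
class EMA:
    """Keeps an exponential moving average of registered parameters."""

    def __init__(self, decay):
        self.decay = decay
        self.shadow = {}   # name -> averaged tensor
        self.backup = {}   # name -> original tensor, saved during assign()

    def set(self, name, param):
        # alias used by some of the snippets above
        self.register(name, param.data)

    def register(self, name, data):
        self.shadow[name] = data.clone()

    def __call__(self, model, step=None):
        # update the shadow weights after an optimizer step
        for name, param in model.named_parameters():
            if param.requires_grad and name in self.shadow:
                new_avg = ((1.0 - self.decay) * param.data
                           + self.decay * self.shadow[name])
                self.shadow[name] = new_avg.clone()

    def assign(self, model):
        # swap the averaged weights in for evaluation
        for name, param in model.named_parameters():
            if param.requires_grad and name in self.shadow:
                self.backup[name] = param.data.clone()
                param.data.copy_(self.shadow[name])

    def resume(self, model):
        # restore the original training weights
        for name, param in model.named_parameters():
            if name in self.backup:
                param.data.copy_(self.backup[name])
        self.backup = {}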
def train_entry(config):
    from models import QANet

    with open(config.word_emb_file, "rb") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "rb") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.dev_eval_file, "r") as fh:
        dev_eval_file = json.load(fh)

    print("Building model...")
    train_dataset = get_loader(config.train_record_file, config.batch_size)
    dev_dataset = get_loader(config.dev_record_file, config.batch_size)

    lr = config.learning_rate
    base_lr = 1
    lr_warm_up_num = config.lr_warm_up_num

    model = QANet(word_mat, char_mat).to(device)
    parameters = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = optim.Adam(lr=base_lr, betas=(0.8, 0.999), eps=1e-6,
                           weight_decay=3e-7, params=parameters)
    cr = lr / math.log2(lr_warm_up_num)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log2(ee + 1) if ee < lr_warm_up_num else lr)

    best_f1 = 0
    best_em = 0
    patience = 0
    for iter in range(config.num_epoch):
        train(model, optimizer, scheduler, train_dataset, dev_dataset,
              dev_eval_file, iter)
        metrics = test(model, dev_dataset, dev_eval_file,
                       (iter + 1) * len(train_dataset))
        dev_f1 = metrics["f1"]
        dev_em = metrics["exact_match"]
        if dev_f1 < best_f1 and dev_em < best_em:
            patience += 1
            if patience > config.early_stop:
                break
        else:
            patience = 0
            best_f1 = max(best_f1, dev_f1)
            best_em = max(best_em, dev_em)
    fn = os.path.join(config.save_dir, "model.pt")
    torch.save(model, fn)
def train(epoch, data, model=None):
    # `lr`, `batch_size`, `checkpoint`, `model_dir`, `model_fn`, `writer`
    # and `device` are module-level globals in this script.
    if model is None:
        model = QANet(data).to(device)
    parameters = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = optim.Adam(betas=(0.8, 0.999), eps=1e-7, weight_decay=3e-7,
                           params=parameters)
    crit = lr / math.log2(1000)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: crit * math.log2(ee + 1) if ee + 1 <= 1000 else lr)
    packs = trunk(data.train.packs, batch_size)
    f_log = open("log/model.log", "w")
    # test(model, data, ep=0, iter=0, test_num=10, test_size=50, f_log=f_log)
    try:
        for ep in range(epoch):
            print("EPOCH {:02d}: ".format(ep))
            l = len(packs)
            for i in tqdm(range(l)):
                pack = packs[i]
                Cw, Cc, Qw, Qc, a = to_batch(pack, data, data.train)
                optimizer.zero_grad()
                out1, out2 = model(Cw, Cc, Qw, Qc)
                loss1 = F.cross_entropy(out1, a[:, 0])
                loss2 = F.cross_entropy(out2, a[:, 1])
                loss = loss1 + loss2
                writer.add_scalar("data/loss", float(loss), ep * l + i)
                loss.backward()
                optimizer.step()  # was missing: without it the weights never update
                scheduler.step()
                if (i + 1) % checkpoint == 0:
                    torch.save(
                        model,
                        os.path.join(
                            model_dir,
                            "model-tmp-{:02d}-{}.pt".format(ep, i + 1)))
                    # test(model, data, ep, i, 10, 50, f_log)
                    # test(model, data, ep, i, 1, -1, f_log)
            random.shuffle(packs)
        torch.save(model, os.path.join(model_dir, model_fn))
        writer.close()
    except Exception as e:
        torch.save(
            model,
            os.path.join(model_dir, "model-{:02d}-{}.pt".format(ep, i + 1)))
        raise e
    except KeyboardInterrupt:
        # save a resumable snapshot when training is interrupted by hand
        torch.save(
            model,
            os.path.join(model_dir, "model-{:02d}-{}.pt".format(ep, i + 1)))
    return model
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    # word_vectors: word vector tensor of dimension [vocab_size, wemb_dim]
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building Model...')
    model = QANet(word_vectors,
                  char_vectors,
                  args.para_limit,
                  args.ques_limit,
                  args.f_model,
                  num_head=args.num_head,
                  train_cemb=(not args.pretrained_char))
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adam(params=parameters,
                           lr=args.lr,
                           betas=(args.beta1, args.beta2),
                           eps=1e-8,
                           weight_decay=3e-7)
    cr = 1.0 / math.log(args.lr_warm_up_num)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log(ee + 1)
        if ee < args.lr_warm_up_num else 1)
    loss_f = torch.nn.CrossEntropyLoss()

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                # CrossEntropyLoss already averages over the batch
                loss = loss_f(log_p1, y1) + loss_f(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)
                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
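# Several of these scripts build the QANet warm-up schedule with LambdaLR:
# the multiplier rises log-linearly from 0 over the first `warm_up` steps
# and then stays flat. A self-contained sketch of the multiplier it
# produces; `lr` and `warm_up` values here are illustrative assumptions:
import math

import torch
from torch import optim

lr, warm_up = 0.001, 1000
params = [torch.nn.Parameter(torch.zeros(1))]
opt = optim.Adam(params, lr=1.0)  # base lr of 1, so the lambda *is* the lr
cr = lr / math.log(warm_up)
sched = optim.lr_scheduler.LambdaLR(
    opt, lr_lambda=lambda ee: cr * math.log(ee + 1) if ee < warm_up else lr)
for step in (0, 10, 100, 999, 1000):
    # epoch-style call, mirroring the snippets above; newer PyTorch
    # versions warn about passing an explicit epoch to step()
    sched.step(step)
    print(step, opt.param_groups[0]['lr'])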
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    # model = BiDAF(word_vectors=word_vectors,
    #               hidden_size=args.hidden_size,
    #               drop_prob=args.drop_prob)
    # model = charBiDAF(word_vectors=word_vectors,
    #                   char_vectors=char_vectors,
    #                   emb_size=char_vectors.size(1),
    #                   hidden_size=args.hidden_size,
    #                   drop_prob=args.drop_prob)
    model = QANet(word_vectors=word_vectors,
                  char_vectors=char_vectors,
                  emb_size=char_vectors.size(1),
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    log.info('Entering QANet model training...')
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    # torch.utils.data._utils.MP_STATUS_CHECK_INTERVAL = 300
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=0,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)
                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
def train(config):
    from models import QANet

    with open(config.word_emb_file, "r") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "r") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.train_eval_file, "r") as fh:
        train_eval_file = json.load(fh)
    with open(config.dev_eval_file, "r") as fh:
        dev_eval_file = json.load(fh)
    with open(config.dev_meta, "r") as fh:
        meta = json.load(fh)
    train_log = open(config.train_log, "w")
    dev_total = meta["total"]

    print("Building model...")
    train_dataset = SQuADDataset(config.train_record_file, config.num_steps,
                                 config.batch_size)
    dev_dataset = SQuADDataset(config.dev_record_file, config.val_num_batches,
                               config.batch_size)

    lr = config.learning_rate
    model = QANet(word_mat, char_mat).to(device)
    parameters = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = optim.Adam(betas=(0.8, 0.999), eps=1e-7, weight_decay=3e-7,
                           params=parameters)
    crit = lr / math.log2(1000)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: crit * math.log2(ee + 1) if ee + 1 <= 1000 else lr)

    best_f1 = 0   # was unset: used below before assignment
    best_em = 0
    patience = 0
    for ep in tqdm(range(config.num_steps), total=config.num_steps):
        (Cwid, Ccid, Qwid, Qcid, y1, y2, ids) = train_dataset[ep]
        Cwid, Ccid, Qwid, Qcid = (Cwid.to(device), Ccid.to(device),
                                  Qwid.to(device), Qcid.to(device))
        p1, p2 = model(Cwid, Ccid, Qwid, Qcid)
        y1, y2 = y1.to(device), y2.to(device)
        loss1 = F.cross_entropy(p1, y1)
        loss2 = F.cross_entropy(p2, y2)  # was F.cross_entropy(p2, y1)
        loss = loss1 + loss2
        loss.backward()  # retain_graph=True was unnecessary here
        optimizer.step()  # was missing: without it the weights never update
        scheduler.step()
        model.zero_grad()
        if (ep + 1) % config.checkpoint == 0:
            del Cwid, Ccid, Qwid, Qcid, y1, y2, p1, p2, loss
            torch.cuda.empty_cache()
            metric = evaluate_batch(model, dev_eval_file, dev_dataset)
            log_ = "EPOCH {:8d} loss {:8f} F1 {:8f} EM {:8f}\n".format(
                ep, metric["loss"], metric["f1"], metric["exact_match"])
            train_log.write(log_)
            train_log.flush()
            dev_f1 = metric["f1"]
            dev_em = metric["exact_match"]
            if dev_f1 < best_f1 and dev_em < best_em:
                patience += 1
                if patience > config.early_stop:
                    break
            else:
                patience = 0
                best_em = max(best_em, dev_em)
                best_f1 = max(best_f1, dev_f1)
                fn = os.path.join(config.save_dir, "model_{}.ckpt".format(ep))
                # pickle_protocol=False was invalid; use the default protocol
                torch.save(model, fn)
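# The span loss used throughout these scripts is the sum of two
# cross-entropies, one over start positions and one over end positions
# of the answer. A tiny self-contained check with dummy logits; the
# shapes and names here are illustrative assumptions:
import torch
import torch.nn.functional as F

batch, ctx_len = 4, 50
p1 = torch.randn(batch, ctx_len)          # start-position logits
p2 = torch.randn(batch, ctx_len)          # end-position logits
y1 = torch.randint(0, ctx_len, (batch,))  # gold start indices
y2 = torch.randint(0, ctx_len, (batch,))  # gold end indices
loss = F.cross_entropy(p1, y1) + F.cross_entropy(p2, y2)
print(float(loss))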
def train_entry(config):
    from models import QANet

    with open(config.word_emb_file, "rb") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "rb") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.dev_eval_file, "r") as fh:
        dev_eval_file = json.load(fh)

    print("Building model...")
    train_dataset = get_loader(config.train_record_file, config.batch_size)
    dev_dataset = get_loader(config.dev_record_file, config.batch_size)

    lr = config.learning_rate
    base_lr = 1
    lr_warm_up_num = config.lr_warm_up_num

    model = QANet(word_mat, char_mat).to(device)
    if torch.cuda.device_count() > 1:
        print('Using {} GPUs'.format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model, device_ids=[0, 1])
    model.load_state_dict(
        torch.load('/home/cn/AI/QANet-pytorch-/model_state_dict.pt'))

    ema = EMA(config.decay)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)

    parameters = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = optim.Adam(lr=base_lr, betas=(0.9, 0.999), eps=1e-7,
                           weight_decay=5e-8, params=parameters)
    cr = lr / math.log2(lr_warm_up_num)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log2(ee + 1) if ee < lr_warm_up_num else lr)

    best_f1 = 0
    best_em = 0
    patience = 0
    for iter in range(config.num_epoch):
        train(model, optimizer, scheduler, train_dataset, dev_dataset,
              dev_eval_file, iter, ema)
        print(iter)
        ema.assign(model)  # evaluate with the averaged weights
        metrics = test(model, dev_dataset, dev_eval_file,
                       (iter + 1) * len(train_dataset))
        dev_f1 = metrics["f1"]
        dev_em = metrics["exact_match"]
        if dev_f1 < best_f1 and dev_em < best_em:
            patience += 1
            if patience > config.early_stop:
                break
        else:
            patience = 0
            best_f1 = max(best_f1, dev_f1)
            best_em = max(best_em, dev_em)
        fn = os.path.join(config.save_dir, "model.pt")
        torch.save(model, fn)
        torch.save(model.state_dict(), 'model_state_dict.pt')
        ema.resume(model)  # restore training weights before the next epoch
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    # setup_args = get_setup_args()
    with open(args.char2idx_file, "r") as f:
        char2idx = json_load(f)

    # Get model
    log.info('Building model...')
    model = QANet(word_vectors=word_vectors, char2idx=char2idx)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    # optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.8, 0.999))
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader: slice the first n_row examples into a lighter
    # training file to speed up experiments
    log.info('Building dataset...')
    dataset1 = np.load(args.train_record_file)
    outfile = '/content/dataset/train_light.npz'
    n_row = 75000
    np.savez(outfile,
             context_idxs=dataset1['context_idxs'][:n_row],
             context_char_idxs=dataset1['context_char_idxs'][:n_row],
             ques_idxs=dataset1['ques_idxs'][:n_row],
             ques_char_idxs=dataset1['ques_char_idxs'][:n_row],
             y1s=dataset1['y1s'][:n_row],
             y2s=dataset1['y2s'][:n_row],
             ids=dataset1['ids'][:n_row])
    # train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_dataset = SQuAD(outfile, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)  # was missing: the model consumes
                qc_idxs = qc_idxs.to(device)  # char indices on the same device
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()
                if step % 10000 == 0:
                    print('loss val: {}'.format(loss_val))

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)
                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
def train(config):
    from models import QANet

    with open(config.word_emb_file, "r") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "r") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.train_eval_file, "r") as fh:
        train_eval_file = json.load(fh)
    with open(config.dev_eval_file, "r") as fh:
        dev_eval_file = json.load(fh)

    print("Building model...")
    model = QANet(word_mat, char_mat).to(device)
    train_dataset = SQuADDataset(config.train_record_file, config.num_steps,
                                 config.batch_size)
    dev_dataset = SQuADDataset(config.dev_record_file, -1, config.batch_size,
                               name='dev')

    lr = config.learning_rate
    base_lr = 1.0
    lr_warm_up_num = config.lr_warm_up_num

    parameters = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = optim.Adam(lr=base_lr, betas=(config.beta1, config.beta2),
                           eps=1e-7, weight_decay=3e-7, params=parameters)
    cr = lr / math.log2(lr_warm_up_num)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log2(ee + 1) if ee < lr_warm_up_num else lr)

    L = config.checkpoint
    N = config.num_steps
    best_f1 = 0
    best_em = 0
    patience = 0
    unused = True  # one-shot flag: switch to ExponentialLR after warm-up
    for iter in tqdm(range(0, N)):
        if iter % L == 0:
            valid_train_loss, valid_train_metrics = valid(model,
                                                          train_dataset,
                                                          train_eval_file,
                                                          num_ex=1000)
            valid_dev_loss, valid_dev_metrics = valid(model, dev_dataset,
                                                      dev_eval_file,
                                                      num_ex=1000)
            if config.use_tensorboard:
                writer.add_scalar('data/valid_train_loss', valid_train_loss,
                                  iter / L)
                writer.add_scalar('data/valid_dev_loss', valid_dev_loss,
                                  iter / L)
                writer.add_scalar('data/valid_train_em',
                                  valid_train_metrics['exact_match'], iter / L)
                writer.add_scalar('data/valid_dev_em',
                                  valid_dev_metrics['exact_match'], iter / L)
                writer.add_scalar('data/valid_train_f1',
                                  valid_train_metrics['f1'], iter / L)
                writer.add_scalar('data/valid_dev_f1',
                                  valid_dev_metrics['f1'], iter / L)
        train_loss = update(model, optimizer, scheduler, train_dataset[iter])
        if config.use_tensorboard:
            writer.add_scalar('data/train_loss', train_loss, iter)
        if iter + L >= lr_warm_up_num - 1 and unused:
            optimizer.param_groups[0]['initial_lr'] = lr
            scheduler = optim.lr_scheduler.ExponentialLR(optimizer,
                                                         config.decay)
            unused = False
        # print("Learning rate: {}".format(scheduler.get_lr()))
        # Early stopping on dev metrics, currently disabled:
        # dev_f1 = metrics["f1"]
        # dev_em = metrics["exact_match"]
        # print('after {} steps , f1={} em={}'.format(iter, dev_f1, dev_em))
        # if dev_em < best_em:
        #     patience += 1
        #     print('does not beat best model, patience={}'.format(patience))
        #     if patience > config.early_stop:
        #         break
        # else:
        #     fn = os.path.join(config.save_dir, "model.pt")
        #     result = os.path.join(config.save_dir, "best_result.txt")
        #     print('beat best model, now best em={}, f1={}'.format(dev_em, dev_f1))
        #     with open(result, "w") as f:
        #         json.dump(metrics, f)
        #     best_em = dev_em
        #     best_f1 = max(best_f1, dev_f1)
        #     torch.save(model, fn)
        #     patience = 0
    writer.close()
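# The first and the preceding snippet both hand off from the warm-up
# schedule to exponential decay once warm-up ends, using a one-shot
# `unused` flag. A minimal self-contained sketch of that handoff; the
# lr, warm-up length, and decay values are illustrative assumptions:
import torch
from torch import optim

lr, warm_up_num, decay = 0.001, 1000, 0.9999
params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = optim.Adam(params, lr=lr)
scheduler = None
unused = True
for step in range(2000):
    # ... forward/backward and optimizer.step() would happen here ...
    if step >= warm_up_num - 1 and unused:
        # re-seed initial_lr so ExponentialLR starts decaying from lr
        optimizer.param_groups[0]['initial_lr'] = lr
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, decay)
        unused = False
    if scheduler is not None:
        scheduler.step()
print(optimizer.param_groups[0]['lr'])  # < lr after ~1000 decay steps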
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get word embeddings
    log.info('Loading word embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    ### start our code:
    # Get char-embeddings
    log.info('Loading char-embeddings...')
    char_vectors = util.torch_from_json(args.char_emb_file)
    ### end our code

    # Get model
    log.info('Building model...')
    # model = BiDAF(word_vectors=word_vectors,
    #               hidden_size=args.hidden_size,
    #               char_vectors=char_vectors,
    #               drop_prob=args.drop_prob)
    ### start our code: (QANet)
    model = QANet(word_vectors=word_vectors,
                  char_vectors=char_vectors,
                  hidden_size=args.hidden_size,
                  kernel_size=7,
                  filters=128,
                  drop_prob=args.drop_prob)
    ### end our code
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    # https://pytorch.org/docs/stable/optim.html
    # Original:
    # optimizer = optim.Adadelta(model.parameters(), args.lr,
    #                            weight_decay=args.l2_wd)
    # # default: lr=0.5, rho=0.9, eps=1e-06, weight_decay=0
    # scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR
    # LambdaLR takes a function which computes a multiplicative factor given
    # an integer epoch, or a list of such functions, one per param group.
    ### start our code:
    optimizer = optim.Adam(model.parameters(),
                           lr=1e-3,
                           betas=(0.8, 0.999),
                           eps=1e-7,
                           weight_decay=3e-7,
                           amsgrad=False)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR
    # AdaBound:
    # optimizer = adabound.AdaBound(model.parameters(), lr=1e-3, final_lr=0.1)
    # scheduler = sched.LambdaLR(optimizer, lambda s: 1.)
    ### end our code

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                ### start our code:
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                ### end our code
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                # log_p1, log_p2 = model(cw_idxs, qw_idxs)  # original
                ### start our code:
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                ### end our code
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                ### start our code:
                # optimizer = LR_decay(optimizer, epoch, lr=0.8)  # initial learning rate 0.8
                if step < 1000:
                    optimizer = LR_warmup(optimizer, step, lr=0.)
                ### end our code
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)
                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
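# The last snippet calls an LR_warmup helper that is not defined in this
# section. A plausible sketch of such a helper, assuming a linear ramp over
# the first `warm_up` steps; the name and signature mirror the call site,
# but the ramp shape, `target_lr`, and `warm_up` default are assumptions:
def LR_warmup(optimizer, step, lr=0.0, target_lr=1e-3, warm_up=1000):
    """Linearly ramp the learning rate from `lr` to `target_lr`."""
    new_lr = lr + (target_lr - lr) * min(step, warm_up) / warm_up
    for group in optimizer.param_groups:
        group['lr'] = new_lr
    return optimizer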