def train(args, data): device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu") model = BiDAF(args, data.WORD.vocab.vectors).to(device) ema = EMA(args.exp_decay_rate) for name, param in model.named_parameters(): if param.requires_grad: ema.register(name, param.data) parameters = filter(lambda p: p.requires_grad, model.parameters()) optimizer = optim.Adadelta(parameters, lr=args.learning_rate) criterion = nn.CrossEntropyLoss() model.train() loss, last_epoch = 0, -1 max_dev_exact, max_dev_f1 = -1, -1 iterator = data.train_iter for i, batch in enumerate(iterator): present_epoch = int(iterator.epoch) if present_epoch == args.epoch: break if present_epoch > last_epoch: print('epoch:', present_epoch + 1) last_epoch = present_epoch p1, p2 = model(batch) optimizer.zero_grad() batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx) loss += batch_loss.item() batch_loss.backward() optimizer.step() for name, param in model.named_parameters(): if param.requires_grad: ema.update(name, param.data) if (i + 1) % args.print_freq == 0: dev_loss, dev_exact, dev_f1 = test(model, ema, args, data) c = (i + 1) // args.print_freq print(f'train loss: {loss:.3f} / dev loss: {dev_loss:.3f}' f' / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}') if dev_f1 > max_dev_f1: max_dev_f1 = dev_f1 max_dev_exact = dev_exact best_model = copy.deepcopy(model) loss = 0 model.train() print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}') return best_model
def train(args, data):
    device = torch.device("cuda:{}".format(args.gpu) if torch.cuda.is_available() else "cpu")
    model = BiDAF(args, data.WORD.vocab.vectors).to(device)

    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    # Log training and dev metrics to TensorBoard.
    writer = SummaryWriter(log_dir='runs/' + args.model_time)

    model.train()
    loss, last_epoch = 0, -1
    max_dev_exact, max_dev_f1 = -1, -1
    best_model = model  # fallback in case no dev evaluation runs before training stops

    iterator = data.train_iter
    for i, batch in enumerate(iterator):
        present_epoch = int(iterator.epoch)
        if present_epoch == args.epoch:
            break
        if present_epoch > last_epoch:
            print('epoch:', present_epoch + 1)
        last_epoch = present_epoch

        p1, p2 = model(batch)

        optimizer.zero_grad()
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()

        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.update(name, param.data)

        if (i + 1) % args.print_freq == 0:
            dev_loss, dev_exact, dev_f1 = test(model, ema, args, data)
            c = (i + 1) // args.print_freq

            writer.add_scalar('loss/train', loss, c)
            writer.add_scalar('loss/dev', dev_loss, c)
            writer.add_scalar('exact_match/dev', dev_exact, c)
            writer.add_scalar('f1/dev', dev_f1, c)
            print('train loss: {} / dev loss: {}'.format(loss, dev_loss)
                  + ' / dev EM: {} / dev F1: {}'.format(dev_exact, dev_f1))

            if dev_f1 > max_dev_f1:
                max_dev_f1 = dev_f1
                max_dev_exact = dev_exact
                best_model = copy.deepcopy(model)

            loss = 0
            model.train()

    writer.close()
    print('max dev EM: {} / max dev F1: {}'.format(max_dev_exact, max_dev_f1))
    return best_model
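# test() is not defined in this file. In this family of training loops it
# typically swaps the EMA shadow weights into the model for evaluation and then
# restores the live training weights. A hedged sketch of that pattern (the
# actual dev-set loop and metric computation are elided as placeholders):
def test_with_ema(model, ema, args, data):
    backup = {}
    for name, param in model.named_parameters():
        if param.requires_grad:
            backup[name] = param.data.clone()
            param.data.copy_(ema.get(name))   # evaluate with the averaged weights

    model.eval()
    with torch.no_grad():
        # placeholder: run the model over data.dev_iter and score loss / EM / F1 here
        dev_loss, dev_exact, dev_f1 = 0.0, 0.0, 0.0

    for name, param in model.named_parameters():
        if param.requires_grad:
            param.data.copy_(backup[name])    # restore the raw training weights
    return dev_loss, dev_exact, dev_f1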
def train(args, data):
    # Optionally resume from a saved checkpoint.
    if args.load_model != "":
        model = BiDAF(args, data.WORD.vocab.vectors)
        model.load_state_dict(torch.load(args.load_model))
    else:
        model = BiDAF(args, data.WORD.vocab.vectors)
    device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)

    # Sanity check: report any parameters that are not leaf tensors.
    for name, p in model.named_parameters():
        if not p.is_leaf:
            print(name, p)

    writer = SummaryWriter(log_dir='runs/' + args.model_name)
    best_model = None

    # Train sequentially on each dataset, with per-dataset epochs, print
    # frequency and learning rate.
    for iterator, dev_iter, dev_file_name, index, print_freq, lr in zip(
            data.train_iter, data.dev_iter, args.dev_files, range(len(data.train)),
            args.print_freq, args.learning_rate):
        optimizer = optim.Adadelta(model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()

        model.train()
        loss, last_epoch = 0, 0
        max_dev_exact, max_dev_f1 = -1, -1

        print(f"Training with {dev_file_name}")
        print()
        for i, batch in tqdm(enumerate(iterator),
                             total=len(iterator) * args.epoch[index], ncols=100):
            present_epoch = int(iterator.epoch)
            eva = False
            if present_epoch == args.epoch[index]:
                break
            if present_epoch > last_epoch:
                print('epoch:', present_epoch + 1)
                eva = True  # also evaluate at every epoch boundary
            last_epoch = present_epoch

            p1, p2 = model(batch)

            optimizer.zero_grad()
            batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
            loss += batch_loss.item()
            batch_loss.backward()
            optimizer.step()

            for name, param in model.named_parameters():
                if param.requires_grad:
                    ema.update(name, param.data)
            torch.cuda.empty_cache()

            if (i + 1) % print_freq == 0 or eva:
                dev_loss, dev_exact, dev_f1 = test(model, ema, args, data,
                                                   dev_iter, dev_file_name)
                c = (i + 1) // print_freq

                writer.add_scalar('loss/train', loss, c)
                writer.add_scalar('loss/dev', dev_loss, c)
                writer.add_scalar('exact_match/dev', dev_exact, c)
                writer.add_scalar('f1/dev', dev_f1, c)
                print()
                print(f'train loss: {loss:.3f} / dev loss: {dev_loss:.3f}'
                      f' / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}')

                if dev_f1 > max_dev_f1:
                    max_dev_f1 = dev_f1
                    max_dev_exact = dev_exact
                    best_model = copy.deepcopy(model)

                loss = 0
                model.train()

    writer.close()
    print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}')

    print("testing with test batch on best model")
    test_loss, test_exact, test_f1 = test(best_model, ema, args, data,
                                          list(data.test_iter)[-1], args.test_files[-1])
    print(f'test loss: {test_loss:.3f}'
          f' / test EM: {test_exact:.3f} / test F1: {test_f1:.3f}')
    return best_model
def train(args, data): device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu") model = BiDAF(args, data.CONTEXT_WORD.vocab.vectors).to(device) num = count_parameters(model) print(f'paramter {num}') if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") model = nn.DataParallel(model) ema = EMA(args.exp_decay_rate) for name, param in model.named_parameters(): if param.requires_grad: ema.register(name, param.data) parameters = filter(lambda p: p.requires_grad, model.parameters()) optimizer = optim.Adadelta(parameters, lr=args.learning_rate) criterion = nn.CrossEntropyLoss() writer = SummaryWriter(log_dir='runs/' + args.model_time) model.train() loss, last_epoch = 0, -1 max_dev_exact, max_dev_f1 = -1, -1 print('totally {} epoch'.format(args.epoch)) sys.stdout.flush() iterator = data.train_iter iterator.repeat = True for i, batch in enumerate(iterator): present_epoch = int(iterator.epoch) if present_epoch == args.epoch: print('present_epoch value:',present_epoch) break if present_epoch > last_epoch: print('epoch:', present_epoch + 1) last_epoch = present_epoch p1, p2 = model(batch.c_char,batch.q_char,batch.c_word[0],batch.q_word[0],batch.c_word[1],batch.q_word[1]) optimizer.zero_grad() batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx) loss += batch_loss.item() batch_loss.backward() optimizer.step() for name, param in model.named_parameters(): if param.requires_grad: ema.update(name, param.data) if (i + 1) % args.print_freq == 0: dev_loss, dev_exact, dev_f1, dev_hasans_exact, dev_hasans_f1, dev_noans_exact,dev_noans_f1 = test(model, ema, args, data) c = (i + 1) // args.print_freq writer.add_scalar('loss/train', loss, c) writer.add_scalar('loss/dev', dev_loss, c) writer.add_scalar('exact_match/dev', dev_exact, c) writer.add_scalar('f1/dev', dev_f1, c) print(f'train loss: {loss:.3f} / dev loss: {dev_loss:.3f}' f' / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}' f' / dev hasans EM: {dev_hasans_exact} / dev hasans F1: {dev_hasans_f1}' f' / dev noans EM: {dev_noans_exact} / dev noans F1: {dev_noans_f1}') if dev_f1 > max_dev_f1: max_dev_f1 = dev_f1 max_dev_exact = dev_exact best_model = copy.deepcopy(model) loss = 0 model.train() sys.stdout.flush() writer.close() args.max_f1 = max_dev_f1 print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}') return best_model
def train(args, data):
    device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    model = BiDAF(args, data.WORD.vocab.vectors).to(device)

    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    writer = SummaryWriter(log_dir='runs/' + args.model_time)

    model.train()
    loss, last_epoch = 0, -1
    max_dev_exact, max_dev_f1 = -1, -1

    iterator = data.train_iter
    num_batch = len(iterator)
    # Explicit epoch loop; this variant does not rely on iterator.epoch bookkeeping.
    for present_epoch in range(args.epoch):
        print('epoch', present_epoch + 1)
        for i, batch in enumerate(iterator):
            p1, p2 = model(batch)
            optimizer.zero_grad()

            # A single-example batch can come back as a 1-D tensor; lift it to
            # shape (1, seq_len) so CrossEntropyLoss sees a batch dimension.
            if len(p1.size()) == 1:
                p1 = p1.reshape(1, -1)
            if len(p2.size()) == 1:
                p2 = p2.reshape(1, -1)

            batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
            loss += batch_loss.item()
            batch_loss.backward()
            optimizer.step()

            for name, param in model.named_parameters():
                if param.requires_grad:
                    ema.update(name, param.data)
            # Keeps best_model defined, at the cost of a deep copy every batch.
            best_model = copy.deepcopy(model)

            # Evaluate once per epoch, on the last batch.
            if i + 1 == num_batch:
                dev_loss, dev_exact, dev_f1 = test(model, ema, args, data)
                c = (i + 1) // args.print_freq

                writer.add_scalar('loss/train', loss / num_batch, c)
                writer.add_scalar('loss/dev', dev_loss, c)
                writer.add_scalar('exact_match/dev', dev_exact, c)
                writer.add_scalar('f1/dev', dev_f1, c)
                print(f'train loss: {loss / num_batch:.3f} / dev loss: {dev_loss:.3f}'
                      f' / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}')

                if dev_f1 > max_dev_f1:
                    max_dev_f1 = dev_f1
                    max_dev_exact = dev_exact
                    best_model = copy.deepcopy(model)

                loss = 0
                model.train()

    writer.close()
    print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}')
    return best_model
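# Why the reshape above is needed: nn.CrossEntropyLoss expects logits shaped
# (batch, num_classes), so a 1-D output for a single example has to be lifted to
# (1, num_classes) to pair with a 1-element index target. A self-contained
# illustration with dummy tensors (the sizes are arbitrary):
def _reshape_demo():
    criterion = nn.CrossEntropyLoss()
    logits = torch.randn(50)               # 1-D scores over 50 token positions
    target = torch.tensor([3])             # gold start (or end) index, shape (1,)
    return criterion(logits.reshape(1, -1), target)   # (1, 50) logits vs (1,) target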
def train(args, data):
    device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    model = BiDAF(args).to(device)
    D_batch = args.train_batch_size

    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()
    # writer = SummaryWriter(log_dir='runs/' + args.model_time)

    model.train()
    loss, last_epoch = 0, -1
    max_dev_exact, max_dev_f1 = -1, -1

    # This variant batches the dataset manually by index window instead of using
    # data.train_iter; epoch tracking and the periodic dev evaluation / TensorBoard
    # logging of the other variants are commented out here.
    i = 0
    while i + D_batch < len(data.data):
        b_id = i
        e_id = i + D_batch

        p1, p2 = model(data, b_id, e_id)

        optimizer.zero_grad()
        s_idx, e_idx = data.get_targ(b_id, e_id)
        batch_loss = criterion(p1, s_idx) + criterion(p2, e_idx)
        loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()

        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.update(name, param.data)

        i += D_batch

    # With dev evaluation disabled, max_dev_exact / max_dev_f1 keep their initial
    # values and no best model is tracked, so return the final model instead.
    print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}')
    best_model = model
    return best_model
def train(args, data): device = torch.device("cuda:{}".format(args.gpu) if torch.cuda.is_available() else "cpu") model = BiDAF(args).to(device) # 如果载入的这些参数中,有些参数不要求被更新,即固定不变,不参与训练,需要手动设置这些参数的梯度属性为Fasle, # 并且在optimizer传参时筛选掉这些参数: # ema = EMA(args.exp_decay_rate) # for name, param in model.named_parameters(): # if param.requires_grad: # ema.register(name, param.data) parameters = filter(lambda p: p.requires_grad, model.parameters()) optimizer = optim.Adam(parameters, lr=args.learning_rate) criterion = nn.CrossEntropyLoss() writer = SummaryWriter(logdir='runs/' + args.model_time) model.train() loss, last_epoch = 0, -1 max_dev_exact, max_dev_f1 = -1, -1 iterator = data.train_iter for i, batch in enumerate(iterator): present_epoch = int(iterator.epoch) if present_epoch == args.epoch: break if present_epoch > last_epoch: print('epoch:', present_epoch + 1) last_epoch = present_epoch try: p1, p2 = model(batch) except OSError: pass optimizer.zero_grad() batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx) loss = batch_loss.item() batch_loss.backward() optimizer.step() print("loss", loss) # for name, param in model.named_parameters(): # if param.requires_grad: # ema.update(name, param.data) # if (i + 1) % args.print_freq == 0: # dev_loss, dev_exact, dev_f1 = test(model, args, data) # c = (i + 1) // args.print_freq # writer.add_scalar('loss/train', loss, c) # writer.add_scalar('loss/dev', dev_loss, c) # writer.add_scalar('exact_match/dev', dev_exact, c) # writer.add_scalar('f1/dev', dev_f1, c) # print('train loss: {:.3f} / dev loss: {:.3f} dev EM: {:.3f} dev F1: {:.3f}'.format(loss, dev_loss, dev_exact, dev_f1)) # if dev_f1 > max_dev_f1: # max_dev_f1 = dev_f1 # max_dev_exact = dev_exact # best_model = copy.deepcopy(model) # loss = 0 # model.train() # writer.close() # print('max dev EM: {:.3f} / max dev F1: {:.3f}'.format(max_dev_exact, max_dev_f1)) return best_model