def main(): print("Generating data...", end="") voc_size = args.vocab_sz inp = np.arange(2, voc_size, 2) tgt = np.arange(3, voc_size, 2) data_x, data_y = get_numbers(inp, tgt) train_len = int(len(data_x) * 0.9) train_x, val_x = data_x[:train_len], data_x[train_len:] train_y, val_y = data_y[:train_len], data_y[train_len:] print("Done") print("Setting model...", end="") model = TransformerModel( input_sz=voc_size, output_sz=voc_size, d_model=args.d_model, nhead=args.n_head, num_encoder_layers=args.n_encoder_layers, num_decoder_layers=args.n_decoder_layers, dim_feedforward=args.dim_feedforward, dropout=args.dropout, ) if args.load_dir != ".": model.load_state_dict(flow.load(args.load_dir)) model = to_cuda(model) criterion = to_cuda(nn.CrossEntropyLoss()) optimizer = flow.optim.Adam(model.parameters(), lr=args.lr) print("Done") print("Training...") min_loss = 100 for i in range(1, args.n_epochs + 1): epoch_loss = train(model, criterion, optimizer, train_x, train_y) epoch_loss_val = validation(model, criterion, val_x, val_y) print("epoch: {} train loss: {}".format(i, epoch_loss)) print("epoch: {} val loss: {}".format(i, epoch_loss_val)) if epoch_loss < min_loss: if not os.path.exists(args.save_dir): os.mkdir(args.save_dir) else: shutil.rmtree(args.save_dir) assert not os.path.exists(args.save_dir) os.mkdir(args.save_dir) flow.save(model.state_dict(), args.save_dir) if i % 3 == 2: print(test(model, test_times=10))
def train(opt, train_data, eval_data=None):
    logger.info("start training task")
    dim_input = 6
    dim_emb = 64
    num_class = train_data.num_class
    transformer_nhead = 2
    transformer_nlayers = 1
    model = TransformerModel(dim_input, dim_emb, transformer_nhead, num_class, transformer_nlayers)
    if model.cuda:  # model.cuda is a config flag on this custom model class
        model = move_to_gpu(model)
    summary(model, train_data[0]['x'].shape)

    try:
        dataloader = DataLoader(train_data, batch_size=opt.batch_size, shuffle=False, num_workers=4)
        logger.info("created training dataloader")
    except Exception as e:
        logger.error("failed to create dataloader: %s", e)
        raise  # re-raise: the loop below cannot run without a dataloader

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer=model.optimizer, milestones=[5, 10], gamma=0.1)
    model_path = os.path.join(opt.model_dir, opt.model_name + ".pth")
    global_steps = 0
    best = 0
    for epoch in tqdm(list(range(opt.epoch)), desc='epoch'):
        for step, batch in enumerate(dataloader):
            global_steps += 1
            metrics = model.train(batch)  # custom train step that returns metrics
            if global_steps % opt.log_steps == 0:
                logger.debug(f"global steps={global_steps}, {metrics}")
            if global_steps % opt.save_steps == 0:
                val_metrics, eval_result = eval(opt, model, eval_data)
                logger.info(f"global steps={global_steps}, current={val_metrics}, best={best}, result={eval_result}")
                if val_metrics > best:
                    best = val_metrics
                    torch.save(model.state_dict(), model_path)
                    logger.info(f"global steps={global_steps}, saved model: {model_path}")
        lr_scheduler.step()
def main(model_name=None, hidden=64, nlayers=1):
    voc_size = 10000
    inp = arange(2, voc_size, 2)
    tgt = arange(3, voc_size, 2)
    batch_size = 128
    epochs = 30
    dataset = NumberLoader(inp, tgt)
    train_len = int(len(dataset) * 0.9)
    val_len = len(dataset) - train_len
    train_set, val_set = random_split(dataset, [train_len, val_len])
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=1)
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=True, num_workers=1)
    model = TransformerModel(voc_size, voc_size, hidden=hidden, nlayers=nlayers)
    if model_name is not None:
        model.load_state_dict(load(model_name))
    model = model.cuda()
    # optimizer = optim.SGD(model.parameters(), lr=0.5)
    optimizer = optim.Adam(model.parameters())
    # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
    criterion = nn.CrossEntropyLoss()
    best_loss = 100
    for i in range(epochs):
        epoch_loss = train(model, criterion, optimizer, train_loader)
        epoch_loss_val = validation(model, criterion, val_loader)
        # scheduler.step()
        print("epoch: {} train loss: {}".format(i, epoch_loss))
        print("epoch: {} val loss: {}".format(i, epoch_loss_val))
        if epoch_loss_val < best_loss:
            best_loss = epoch_loss_val
            model_name = "model/model_{0:.5f}.pt".format(epoch_loss_val)
            save(model.state_dict(), model_name)
    return model_name
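# A minimal sketch, not from the original source, of the NumberLoader dataset used
# by main() above. The pairing of each even input with the next odd number as its
# target, and the tensor dtypes, are assumptions for illustration.
import torch
from torch.utils.data import Dataset

class NumberLoader(Dataset):
    def __init__(self, inp, tgt):
        assert len(inp) == len(tgt)
        # token ids must be integral to index an embedding layer
        self.inp = torch.as_tensor(inp, dtype=torch.long)
        self.tgt = torch.as_tensor(tgt, dtype=torch.long)

    def __len__(self):
        return len(self.inp)

    def __getitem__(self, idx):
        return self.inp[idx], self.tgt[idx]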
for batch in pb:
    record_loss, perplexity = train_one_iter(batch, fp16=True)
    update_count += 1

    if update_count % num_gradients_accumulation == num_gradients_accumulation - 1:
        # apply the accumulated gradients; optimizer.step() must precede scheduler.step() in PyTorch >= 1.1
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # speed measure
        end = time.time()
        speed = batch_size * num_gradients_accumulation / (end - start)
        start = end

        pb.set_postfix(loss=record_loss, perplexity=perplexity, speed=speed)

# evaluation
encoder.eval()
decoder.eval()
ppl = validate(val_dataloader)
checkpointer.save_checkpoint(
    str(ep),
    {"encoder": encoder.state_dict(), "decoder": decoder.state_dict()},
    {"empty": None},
    is_best_so_far=True,
)
logger.info(f"a={a} b={b} Epoch {ep} Validation perplexity: {ppl}")

logger.info(f"Finished training with alpha={a} beta={b}")
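# A minimal sketch, not from the original source, of a train_one_iter compatible
# with the gradient-accumulation loop above: the loss is divided by
# num_gradients_accumulation so the gradients summed over micro-batches match one
# large-batch update. The encoder/decoder forward, criterion, and apex-style fp16
# handling are all assumptions.
import math

def train_one_iter(batch, fp16=False):
    inputs, targets = batch
    logits = decoder(encoder(inputs))  # assumed forward pass
    loss = criterion(logits.view(-1, logits.size(-1)), targets.view(-1))
    loss = loss / num_gradients_accumulation  # scale for accumulation
    if fp16:
        from apex import amp  # assumed fp16 backend; torch.cuda.amp would also work
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    record_loss = loss.item() * num_gradients_accumulation
    return record_loss, math.exp(record_loss)  # perplexity = exp(cross-entropy)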
saved_loss = checkpoint['loss']
if saved_loss < best_val_loss:
    best_val_loss = saved_loss

while epoch < epochs + 1:
    epoch_start_time = time.time()
    train(model, bptt, device, train_data, optimizer, criterion, ntokens, scheduler, epoch)
    val_loss = evaluate(model, val_data, bptt, device, ntokens, criterion)
    epoch_losses = np.append(epoch_losses, val_loss)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': val_loss,
            },
            MODEL_SAVE_NAME)
    epoch = epoch + 1
    scheduler.step()
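# A minimal sketch, not from the original source, of the evaluate helper called in
# the loop above, following the PyTorch word-language-model recipe: average
# cross-entropy over bptt-sized chunks with gradients disabled. get_batch is an
# assumed helper that slices (data, targets) pairs out of the token stream.
def evaluate(model, data_source, bptt, device, ntokens, criterion):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i, bptt)
            output = model(data.to(device))
            loss = criterion(output.view(-1, ntokens), targets.to(device).view(-1))
            total_loss += len(data) * loss.item()
    return total_loss / (len(data_source) - 1)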
def main():
    # settings
    args = set_args()
    save_path = args.save_path
    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    logger.info(args)

    # prepare the data
    train_dataset = COCOMultiLabel(args, train=True, image_path=args.image_path)
    test_dataset = COCOMultiLabel(args, train=False, image_path=args.image_path)
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers,
                              pin_memory=True, shuffle=True, drop_last=True, collate_fn=my_collate)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers,
                             pin_memory=True, shuffle=False, drop_last=False, collate_fn=my_collate)

    # prepare the models
    encoder = CNN_Encoder().cuda()
    decoder = TransformerModel(args).cuda()

    # evaluation-only mode restores the full checkpoint; training mode only loads pretrained encoder weights
    if args.use_eval:
        weights_dic = torch.load(args.use_model)
        encoder.load_state_dict(convert_weights(weights_dic['encoder_state_dict']))
        decoder.load_state_dict(convert_weights(weights_dic['decoder_state_dict']))
    else:
        encoder.load_state_dict(convert_weights(torch.load(args.encoder_weights)))
    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=args.encoder_lr)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=args.decoder_lr)

    # wrap in DataParallel when multiple GPUs are available
    if torch.cuda.device_count() > 1:
        encoder = nn.DataParallel(encoder)
        decoder = nn.DataParallel(decoder)

    # set the hinge loss function
    loss_hinge = torch.nn.HingeEmbeddingLoss(margin=args.C, reduction='mean')

    # if only evaluating, run the test once and return
    if args.use_eval:
        f1 = test(args, encoder, decoder, test_loader, args.threshold, 1)
        return

    # training stage
    highest_f1 = 0
    epochs_without_improve = 0
    for epoch in range(args.epochs):
        # train and test
        train(args, encoder, decoder, train_loader, encoder_optimizer, decoder_optimizer, epoch, loss_hinge)
        f1 = test(args, encoder, decoder, test_loader, args.threshold, epoch)

        # save parameters
        save_dict = {
            'encoder_state_dict': encoder.state_dict(),
            'decoder_state_dict': decoder.state_dict(),
            'epoch': epoch,
            'f1': f1,
            'decoder_optimizer_state_dict': decoder_optimizer.state_dict(),
            'encoder_optimizer_state_dict': encoder_optimizer.state_dict(),
            'epochs_without_improve': epochs_without_improve,
        }
        torch.save(save_dict, os.path.join(args.save_path, "checkpoint_" + timestr + '.pt.tar'))
        if f1 > highest_f1:
            torch.save(save_dict, os.path.join(args.save_path, "BEST_checkpoint_" + timestr + '.pt.tar'))
            logger.info("Now the highest f1 is {}, it was {}".format(100 * f1, 100 * highest_f1))
            highest_f1 = f1
            epochs_without_improve = 0
        else:
            epochs_without_improve += 1
            if epochs_without_improve == 3:
                # decay both learning rates and reset the patience counter
                adjust_learning_rate(decoder_optimizer, args.coeff)
                adjust_learning_rate(encoder_optimizer, args.coeff)
                epochs_without_improve = 0
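# A minimal sketch, not from the original source, of the adjust_learning_rate
# helper used above: shrink every parameter group's learning rate by a fixed
# coefficient when F1 has not improved for several epochs.
def adjust_learning_rate(optimizer, coeff):
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * coeff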
    input_pad_at='start',
    num_workers=8)

logging.info(str(model))
logging.info(str(print_hparams(hp)))
logging.info('Data loaded!')
logging.info('Data size: ' + str(len(training_data)))
logging.info('Total model parameters: ' +
             str(sum(p.numel() for p in model.parameters() if p.requires_grad)))

epoch = 1
if hp.mode == 'train':
    while epoch < hp.training_epochs + 1:
        epoch_start_time = time.time()
        train()
        torch.save(model.state_dict(),
                   '{log_dir}/{num_epoch}.pt'.format(log_dir=log_dir, num_epoch=epoch))
        scheduler.step(epoch)
        eval_all(evaluation_beam, word_dict_pickle_path=word_dict_pickle_path)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9,
                       word_dict_pickle_path=word_dict_pickle_path, beam_size=2)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9,
                       word_dict_pickle_path=word_dict_pickle_path, beam_size=3)
        eval_with_beam(evaluation_beam, max_len=30, eos_ind=9,
                       word_dict_pickle_path=word_dict_pickle_path, beam_size=4)
        epoch += 1

if hp.mode == 'eval':
    # evaluate the saved best model
    model.load_state_dict(torch.load("./models/best.pt"))
    eval_all(evaluation_beam, word_dict_pickle_path=word_dict_pickle_path)
    eval_with_beam(evaluation_beam, max_len=30, eos_ind=9,
                   word_dict_pickle_path=word_dict_pickle_path,
# keep track of the best model
if val_loss < best_val_loss:
    best_val_loss = val_loss
    best_model = copy.deepcopy(model)  # snapshot the weights (requires `import copy`); a plain reference would keep changing as training continues

# adjust the learning rate
scheduler.step()

# # save the model
# if not os.path.exists('datasets/models'):
#     os.makedirs('datasets/models')
# torch.save({'state_dict': model.state_dict()}, 'datasets/models/best_model.pth.tar')

# save the model
if not os.path.exists('temp/models'):
    os.makedirs('temp/models')
torch.save({'state_dict': best_model.state_dict()}, 'temp/models/best_model.pth.tar')

print('training finished')

# test: compute the cross-entropy loss on the test set
test_loss = evaluate(best_model, test_data)
# perplexity is the exponential of the cross-entropy loss
ppl = math.exp(test_loss)
print('=' * 40)
print('| End of training | test ppl {:8.2f}'.format(ppl))
print('=' * 40)