import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from time import time

# Assumed local modules; adjust the import paths to this repository's layout.
from model import AttentionModel
from baseline import RolloutBaseline
from data import Generator


def train(cfg, log_path=None):
    torch.backends.cudnn.benchmark = True

    def rein_loss(model, inputs, bs, t, device):
        inputs = list(map(lambda x: x.to(device), inputs))
        L, ll = model(inputs, decode_type='sampling')
        b = bs[t] if bs is not None else baseline.eval(inputs, L)
        # REINFORCE loss: advantage (L - b) weighted by the log-likelihood of the sampled tour
        return ((L - b.to(device)) * ll).mean(), L.mean()

    model = AttentionModel(cfg.embed_dim, cfg.n_encode_layers, cfg.n_heads, cfg.tanh_clipping)
    model.train()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    baseline = RolloutBaseline(model, cfg.task, cfg.weight_dir, cfg.n_rollout_samples,
                               cfg.embed_dim, cfg.n_customer, cfg.warmup_beta, cfg.wp_epochs, device)
    optimizer = optim.Adam(model.parameters(), lr=cfg.lr)

    t1 = time()
    for epoch in range(cfg.epochs):
        ave_loss, ave_L = 0., 0.
        dataset = Generator(cfg.batch * cfg.batch_steps, cfg.n_customer)

        bs = baseline.eval_all(dataset)
        # bs: (cfg.batch_steps, cfg.batch) or None
        bs = bs.view(-1, cfg.batch) if bs is not None else None

        # note: bs rows are precomputed in dataset order, so with shuffle=True
        # bs[t] may not correspond to the t-th shuffled batch
        dataloader = DataLoader(dataset, batch_size=cfg.batch, shuffle=True)
        for t, inputs in enumerate(dataloader):
            loss, L_mean = rein_loss(model, inputs, bs, t, device)
            optimizer.zero_grad()
            loss.backward()
            # print('grad: ', model.Decoder.Wk1.weight.grad[0][0])
            # https://github.com/wouterkool/attention-learn-to-route/blob/master/train.py
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0, norm_type=2)
            optimizer.step()

            ave_loss += loss.item()
            ave_L += L_mean.item()

            if t % cfg.batch_verbose == 0:
                t2 = time()
                print('Epoch %d (batch = %d): Loss: %1.3f, L: %1.3f, %dmin%dsec' % (
                    epoch, t, ave_loss / (t + 1), ave_L / (t + 1), (t2 - t1) // 60, (t2 - t1) % 60))
                if cfg.islogger:
                    if log_path is None:
                        log_path = '%s%s_%s.csv' % (cfg.log_dir, cfg.task, cfg.dump_date)  # cfg.log_dir = ./Csv/
                        with open(log_path, 'w') as f:
                            f.write('time,epoch,batch,loss,cost\n')
                    with open(log_path, 'a') as f:
                        f.write('%dmin%dsec,%d,%d,%1.3f,%1.3f\n' % (
                            (t2 - t1) // 60, (t2 - t1) % 60, epoch, t, ave_loss / (t + 1), ave_L / (t + 1)))
                t1 = time()

        baseline.epoch_callback(model, epoch)
        torch.save(model.state_dict(), '%s%s_epoch%s.pt' % (cfg.weight_dir, cfg.task, epoch))
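# --- Usage sketch (illustrative) ---
# A minimal way to invoke train(). The fields mirror the cfg.* attributes read
# above; the values here are assumptions, not the repository's actual defaults.
if __name__ == '__main__':
    from types import SimpleNamespace
    cfg = SimpleNamespace(
        embed_dim=128, n_encode_layers=3, n_heads=8, tanh_clipping=10.,
        task='VRP20', weight_dir='./Weights/', n_rollout_samples=10000,
        n_customer=20, warmup_beta=0.8, wp_epochs=1, lr=1e-4,
        epochs=20, batch=512, batch_steps=2500, batch_verbose=10,
        islogger=True, log_dir='./Csv/', dump_date='0101_00_00')
    train(cfg)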
import torch.optim as optim
from torch.utils.data import DataLoader

# Assumed local modules; adjust the import paths to this repository's layout.
import data
from dataset import myDataSet
from model import baseModel, AttentionModel
from trainer import trainer

# add padding (`max`, the maximum sequence length, is assumed to be computed earlier)
train_tokens = data.add_padding(train_tokens, max)
test_tokens = data.add_padding(test_tokens, max)

# convert tokens and tags to index vectors
train_tokens, train_tags = data.convert2vec(train_tokens, train_tags, word2idx=word2idx, tag2idx=tag2idx)
test_tokens, test_tags = data.convert2vec(test_tokens, test_tags, word2idx=word2idx, tag2idx=tag2idx)

# dataset
train_dataset = myDataSet(train_tokens, train_tags, train_seqlen)
test_dataset = myDataSet(test_tokens, test_tags, test_seqlen)

# dataloader
train_data = DataLoader(train_dataset, batch_size=args.batch_size)
test_data = DataLoader(test_dataset, batch_size=args.batch_size)

# model: instantiate only the architecture selected by args.mode
if args.mode == 'base':
    model = baseModel(vocab_size=vocab_size, embedding_dim=args.embedding_dim,
                      hidden_dim=args.hidden_dim, tag2idx=tag2idx, batch_size=args.batch_size,
                      use_gpu=use_gpu, idx2word=idx2word, emb_path=emb_path)
elif args.mode == 'attention':
    model = AttentionModel(vocab_size=vocab_size, embedding_dim=args.embedding_dim,
                           hidden_dim=args.hidden_dim, tag2idx=tag2idx, batch_size=args.batch_size,
                           use_gpu=use_gpu, idx2word=idx2word, emb_path=emb_path)
else:
    raise ValueError("unknown mode: %r (expected 'base' or 'attention')" % args.mode)

# optimizer on the selected model's parameters
optimizer = optim.Adam(model.parameters(), lr=args.lr)

# trainer
myTrainer = trainer(model=model, train_dataloader=train_data, test_dataloader=test_data,
                    optimizer=optimizer, epochs=args.epochs, word2idx=word2idx, tag2idx=tag2idx,
                    idx2word=idx2word, idx2tag=idx2tag, use_gpu=use_gpu)
myTrainer.train()
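# --- Argument sketch (illustrative) ---
# The script reads the args.* fields used above; a minimal argparse setup under
# assumed defaults could look like this, placed before the code above runs.
import argparse

parser = argparse.ArgumentParser(description='train a base or attention tagging model')
parser.add_argument('--mode', choices=['base', 'attention'], default='attention')
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--embedding_dim', type=int, default=100)
parser.add_argument('--hidden_dim', type=int, default=256)
parser.add_argument('--lr', type=float, default=1e-3)
parser.add_argument('--epochs', type=int, default=10)
args = parser.parse_args()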