class Translation(object):
    def __init__(self, args):
        super(Translation, self).__init__()
        self.args = args  # keep args so build_optimizer can read lr/betas later
        self.datasets = {}
        self.data_dir = args.data_dir
        self.src_lang, self.trg_lang = dataset_utils.infer_language_pair(args.data_dir)
        src_dict_path = os.path.join(args.data_dir, dict_path.format(self.src_lang))
        trg_dict_path = os.path.join(args.data_dir, dict_path.format(self.trg_lang))
        self.src_dict = Dictionary.build_from_dict_file(src_dict_path)
        self.trg_dict = Dictionary.build_from_dict_file(trg_dict_path)
        self.model = None
        self.criterion = None
        self.optimizer = None

    def load_dataset(self, split):
        # Resolve the bitext paths for this split.
        src_split_path = os.path.join(
            self.data_dir,
            subset_path.format(split, self.src_lang, self.trg_lang, self.src_lang))
        trg_split_path = os.path.join(
            self.data_dir,
            subset_path.format(split, self.src_lang, self.trg_lang, self.trg_lang))
        src_dataset = SingleDataset(src_split_path)
        trg_dataset = SingleDataset(trg_split_path)
        pair_dataset = PairDataset(src_dataset, trg_dataset)
        self.datasets[split] = pair_dataset

    def build_model(self, args):
        encoder_embed_tokens = nn.Embedding(
            self.src_dict.token_num,
            args.encoder_embed_dim,
            padding_idx=self.src_dict.padding_idx)
        if args.share_all_embeddings:
            decoder_embed_tokens = encoder_embed_tokens
        else:
            decoder_embed_tokens = nn.Embedding(
                self.trg_dict.token_num,
                args.decoder_embed_dim,
                padding_idx=self.trg_dict.padding_idx)
        # NOTE: the embedding tables built above are not passed into Transformer here;
        # the model is expected to build or receive its own embeddings.
        self.model = Transformer(args, self.src_dict, self.trg_dict)

    def build_criterion(self, label_smooth):
        self.criterion = LabelSmoothedCrossEntropyCriterion(label_smooth)

    def build_optimizer(self):
        if self.model is None:
            print("should build model first!")
        else:
            self.optimizer = CustomAdam(self.model.parameters(),
                                        lr=self.args.lr,
                                        betas=self.args.betas)
def train():
    inputs, src_vocab_size, tgt_vocab_size, idx2word = create_data()
    enc_inputs, dec_inputs, dec_outputs = make_data(*inputs)
    data_loader = Data.DataLoader(dataset=MyDataSet(enc_inputs, dec_inputs, dec_outputs),
                                  batch_size=2, shuffle=True)

    model = Transformer(src_vocab_size, tgt_vocab_size).cuda()
    # PAD carries no meaning and its index is 0, so ignore_index=0 skips the loss on PAD positions.
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.09)

    for epoch in range(30):
        for enc_inputs, dec_inputs, dec_outputs in data_loader:
            """
            enc_inputs: [batch_size, src_len]
            dec_inputs: [batch_size, tgt_len]
            dec_outputs: [batch_size, tgt_len]
            """
            enc_inputs, dec_inputs, dec_outputs = enc_inputs.cuda(), dec_inputs.cuda(), dec_outputs.cuda()
            outputs, enc_self_attns, dec_self_attns, dec_enc_attns = model(enc_inputs, dec_inputs)
            loss = criterion(outputs, dec_outputs.view(-1))
            print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
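# Once train() has run, the model is typically used with step-by-step decoding. The sketch
# below is an assumption, not part of the original snippet: it presumes the model exposes
# encoder / decoder / projection submodules as in the common demo layout, and that
# `start_symbol` is the target-side start-of-sentence index.
def greedy_decode(model, enc_input, start_symbol, max_len=20):
    enc_outputs, _ = model.encoder(enc_input)
    dec_input = torch.tensor([[start_symbol]], dtype=torch.long, device=enc_input.device)
    for _ in range(max_len):
        dec_outputs, _, _ = model.decoder(dec_input, enc_input, enc_outputs)
        logits = model.projection(dec_outputs)              # [1, cur_len, tgt_vocab_size]
        next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
        dec_input = torch.cat([dec_input, next_token], dim=1)
    return dec_input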
def instantiate_model(self, english_vocab_size, norwegian_vocab_size, embedding_dim=256,
                      num_heads=8, num_encoders=6, ff_dim=256):
    model = Transformer(english_vocab_size, norwegian_vocab_size, embedding_dim,
                        num_heads, num_encoders, ff_dim, self.cuda).to(self.cuda)
    # Xavier-initialise every weight matrix (use the in-place, non-deprecated variant).
    for p in model.parameters():
        if p.dim() > 1:
            torch.nn.init.xavier_uniform_(p)
    return model
def main(args): torch.manual_seed(args.seed) train_loader, test_loader = data_generator(args.data_dir, args.batch_size) for m in range(len(models)): if(models[m]=="Transformer"): model = Transformer(args.NumFeatures,args.NumTimeSteps,args.n_layers, args.heads, args.dropout,args.n_classes,time=args.NumTimeSteps) elif(models[m]=="TCN"): channel_sizes = [args.nhid] * args.levels model = TCN(args.NumFeatures, args.n_classes, channel_sizes, kernel_size=args.ksize, dropout=args.dropout) elif(models[m]=="LSTMWithInputCellAttention"): model = LSTMWithInputCellAttention(args.NumFeatures, args.nhid,args.n_classes,args.dropout,args.attention_hops,args.d_a) elif(models[m]=="LSTM"): model = LSTM(args.NumFeatures, args.nhid, args.n_classes,args.dropout) model.to(device) model_name = "model_{}_NumFeatures_{}".format(models[m],args.NumFeatures) model_filename = args.model_dir + 'm_' + model_name + '.pt' lr=args.lr optimizer = getattr(optim, args.optim)(model.parameters(), lr=lr) best_test_loss=100 for epoch in range(1, args.epochs+1): model,optimizer = train(args,epoch,model,train_loader,optimizer) test_loss,test_acc = test(args,model,test_loader) if(test_loss<best_test_loss): best_test_loss = test_loss save(model, model_filename) if(test_acc>=99): break if epoch % 10 == 0: lr /= 10 for param_group in optimizer.param_groups: param_group['lr'] = lr
def main(): parser = argparse.ArgumentParser(description="Train the model") parser.add_argument('-data', required=True) parser.add_argument('-epoch', type=int, default=10) parser.add_argument('-batch_size', type=int, default=64) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-no_cuda', action='store_true') opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model # Load data data = torch.load(opt.data) opt.max_token_seq_len = data['settings'].max_word_seq_len + 2 training_data, validation_data = prepare_dataloaders(data, opt) opt.src_vocab_size = training_data.dataset.src_vocab_size opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size print(opt) # opt.cuda = True device = torch.device('cuda' if opt.cuda else 'cpu') # TODO: Fill the code transformer = Transformer(d_word_embedding=opt.d_word_vec, d_h=opt.d_model, d_s=opt.d_model, src_vocab_size=opt.src_vocab_size, tgt_vocab_size=opt.tgt_vocab_size, max_sent_len=opt.max_token_seq_len).to(device) optimizer = optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()), betas=(0.9, 0.98), eps=1e-09) train(transformer, training_data, validation_data, optimizer, device, opt)
def main(gpu_id=None): dataset = Dataset(transform=transform, n_datas=10000) pad_vec = np.zeros(len(dataset.human_vocab)) pad_vec[dataset.human_vocab['<pad>']] = 1 dataloader = torch.utils.data.DataLoader(dataset=dataset, batch_size=6, shuffle=True, num_workers=6, collate_fn=partial( collate_fn, pad_vec)) model = Transformer(n_head=2) if gpu_id is not None: print('use gpu') os.environ["CUDA_VISIBLE_DEVICES"] = gpu_id n_gpus = torch.cuda.device_count() # print('use %d gpu [%s]' % (n_gpus, gpu_id)) model = model.cuda() # model = torch.nn.DataParallel(model, device_ids=[i for i in range(n_gpus)]) # loss_fn = torch.nn.CrossEntropyLoss() loss_fn = torch.nn.MSELoss() optimizer = torch.optim.Adam(model.parameters()) model = sl.load_model('./checkpoint', -1, model) optimizer = sl.load_optimizer('./checkpoint', -1, optimizer) try: trained_epoch = sl.find_last_checkpoint('./checkpoint') print('train form epoch %d' % (trained_epoch + 1)) except Exception as e: print('train from the very begining, {}'.format(e)) trained_epoch = -1 for epoch in range(trained_epoch + 1, 20): train(model, loss_fn, optimizer, dataloader, epoch, use_gpu=True if gpu_id is not None else False)
def main(args): # 0. initial setting # set environmet cudnn.benchmark = True if not os.path.isdir('./ckpt'): os.mkdir('./ckpt') if not os.path.isdir('./results'): os.mkdir('./results') if not os.path.isdir(os.path.join('./ckpt', args.name)): os.mkdir(os.path.join('./ckpt', args.name)) if not os.path.isdir(os.path.join('./results', args.name)): os.mkdir(os.path.join('./results', args.name)) if not os.path.isdir(os.path.join('./results', args.name, "log")): os.mkdir(os.path.join('./results', args.name, "log")) # set logger logger = logging.getLogger() logger.setLevel(logging.INFO) formatter = logging.Formatter('%(asctime)s - %(message)s') handler = logging.FileHandler("results/{}/log/{}.log".format( args.name, time.strftime('%c', time.localtime(time.time())))) handler.setFormatter(formatter) logger.addHandler(handler) logger.addHandler(logging.StreamHandler()) args.logger = logger # set cuda if torch.cuda.is_available(): args.logger.info("running on cuda") args.device = torch.device("cuda") args.use_cuda = True else: args.logger.info("running on cpu") args.device = torch.device("cpu") args.use_cuda = False args.logger.info("[{}] starts".format(args.name)) # 1. load data args.logger.info("loading data...") src, tgt = load_data(args.path) src_vocab = Vocab(init_token='<sos>', eos_token='<eos>', pad_token='<pad>', unk_token='<unk>') src_vocab.load(os.path.join(args.path, 'vocab.en')) tgt_vocab = Vocab(init_token='<sos>', eos_token='<eos>', pad_token='<pad>', unk_token='<unk>') tgt_vocab.load(os.path.join(args.path, 'vocab.de')) # 2. setup args.logger.info("setting up...") sos_idx = 0 eos_idx = 1 pad_idx = 2 max_length = 50 src_vocab_size = len(src_vocab) tgt_vocab_size = len(tgt_vocab) # transformer config d_e = 512 # embedding size d_q = 64 # query size (= key, value size) d_h = 2048 # hidden layer size in feed forward network num_heads = 8 num_layers = 6 # number of encoder/decoder layers in encoder/decoder args.sos_idx = sos_idx args.eos_idx = eos_idx args.pad_idx = pad_idx args.max_length = max_length args.src_vocab_size = src_vocab_size args.tgt_vocab_size = tgt_vocab_size args.d_e = d_e args.d_q = d_q args.d_h = d_h args.num_heads = num_heads args.num_layers = num_layers model = Transformer(args) model.to(args.device) loss_fn = nn.CrossEntropyLoss(ignore_index=pad_idx) optimizer = optim.Adam(model.parameters(), lr=1e-5) if args.load: model.load_state_dict(load(args, args.ckpt)) # 3. 
train / test if not args.test: # train args.logger.info("starting training") acc_val_meter = AverageMeter(name="Acc-Val (%)", save_all=True, save_dir=os.path.join( 'results', args.name)) train_loss_meter = AverageMeter(name="Loss", save_all=True, save_dir=os.path.join( 'results', args.name)) train_loader = get_loader(src['train'], tgt['train'], src_vocab, tgt_vocab, batch_size=args.batch_size, shuffle=True) valid_loader = get_loader(src['valid'], tgt['valid'], src_vocab, tgt_vocab, batch_size=args.batch_size) for epoch in range(1, 1 + args.epochs): spent_time = time.time() model.train() train_loss_tmp_meter = AverageMeter() for src_batch, tgt_batch in tqdm(train_loader): # src_batch: (batch x source_length), tgt_batch: (batch x target_length) optimizer.zero_grad() src_batch, tgt_batch = torch.LongTensor(src_batch).to( args.device), torch.LongTensor(tgt_batch).to(args.device) batch = src_batch.shape[0] # split target batch into input and output tgt_batch_i = tgt_batch[:, :-1] tgt_batch_o = tgt_batch[:, 1:] pred = model(src_batch.to(args.device), tgt_batch_i.to(args.device)) loss = loss_fn(pred.contiguous().view(-1, tgt_vocab_size), tgt_batch_o.contiguous().view(-1)) loss.backward() optimizer.step() train_loss_tmp_meter.update(loss / batch, weight=batch) train_loss_meter.update(train_loss_tmp_meter.avg) spent_time = time.time() - spent_time args.logger.info( "[{}] train loss: {:.3f} took {:.1f} seconds".format( epoch, train_loss_tmp_meter.avg, spent_time)) # validation model.eval() acc_val_tmp_meter = AverageMeter() spent_time = time.time() for src_batch, tgt_batch in tqdm(valid_loader): src_batch, tgt_batch = torch.LongTensor( src_batch), torch.LongTensor(tgt_batch) tgt_batch_i = tgt_batch[:, :-1] tgt_batch_o = tgt_batch[:, 1:] with torch.no_grad(): pred = model(src_batch.to(args.device), tgt_batch_i.to(args.device)) corrects, total = val_check( pred.max(dim=-1)[1].cpu(), tgt_batch_o) acc_val_tmp_meter.update(100 * corrects / total, total) spent_time = time.time() - spent_time args.logger.info( "[{}] validation accuracy: {:.1f} %, took {} seconds".format( epoch, acc_val_tmp_meter.avg, spent_time)) acc_val_meter.update(acc_val_tmp_meter.avg) if epoch % args.save_period == 0: save(args, "epoch_{}".format(epoch), model.state_dict()) acc_val_meter.save() train_loss_meter.save() else: # test args.logger.info("starting test") test_loader = get_loader(src['test'], tgt['test'], src_vocab, tgt_vocab, batch_size=args.batch_size) pred_list = [] model.eval() for src_batch, tgt_batch in test_loader: #src_batch: (batch x source_length) src_batch = torch.Tensor(src_batch).long().to(args.device) batch = src_batch.shape[0] pred_batch = torch.zeros(batch, 1).long().to(args.device) pred_mask = torch.zeros(batch, 1).bool().to( args.device) # mask whether each sentece ended up with torch.no_grad(): for _ in range(args.max_length): pred = model( src_batch, pred_batch) # (batch x length x tgt_vocab_size) pred[:, :, pad_idx] = -1 # ignore <pad> pred = pred.max(dim=-1)[1][:, -1].unsqueeze( -1) # next word prediction: (batch x 1) pred = pred.masked_fill( pred_mask, 2).long() # fill out <pad> for ended sentences pred_mask = torch.gt(pred.eq(1) + pred.eq(2), 0) pred_batch = torch.cat([pred_batch, pred], dim=1) if torch.prod(pred_mask) == 1: break pred_batch = torch.cat([ pred_batch, torch.ones(batch, 1).long().to(args.device) + pred_mask.long() ], dim=1) # close all sentences pred_list += seq2sen(pred_batch.cpu().numpy().tolist(), tgt_vocab) with open('results/pred.txt', 'w', encoding='utf-8') as f: for line in 
pred_list: f.write('{}\n'.format(line)) os.system( 'bash scripts/bleu.sh results/pred.txt multi30k/test.de.atok')
def main(): parser = argparse.ArgumentParser() parser.add_argument( '-data', type=str, default='./data/data.pt', help= 'Path to the source data. The default is ./data/data.pt, which is the output of preprocessing.' ) parser.add_argument('-epoch', default=10000) parser.add_argument('-log_step', default=5) parser.add_argument('-save_model_epoch', default=1) parser.add_argument('-save_model_path', default='./saved_model/') args = parser.parse_args() dataset = torch.load(args.data) batch_size = 4 src_vocab = dataset['dict']['src'] tgt_vocab = dataset['dict']['tgt'] print("\n\nBatch Size = %d" % batch_size) print("Source Vocab Size = %d" % len(src_vocab)) print("Target Vocab Size = %d" % len(tgt_vocab)) print("\nLoading Training Data ... ") training_batches = get_loader(src=dataset['train']['src'], tgt=dataset['train']['tgt'], src_vocabs=dataset['dict']['src'], tgt_vocabs=dataset['dict']['tgt'], batch_size=batch_size, use_cuda=True, shuffle=True) # print("\nLoading Validation Data ... ") # validation_data = get_loader( # src=dataset['valid']['src'], # tgt=dataset['valid']['tgt'], # src_vocabs=dataset['dict']['src'], # tgt_vocabs=dataset['dict']['tgt'], # batch_size=batch_size, # use_cuda=False, # shuffle=False # ) # For python 2 transformer_config = [ 6, 512, 512, 8, batch_size, len(src_vocab), len(tgt_vocab), 100, 0.1, True ] # For python 3 # transformer_config = { # 'N': 6, # 'd_model': int(512), # 'd_ff': 512, # 'H': 8, # 'batch_size': batch_size, # 'src_vocab_size': int(len(src_vocab)), # 'tgt_vocab_size': int(len(tgt_vocab)), # 'max_seq': 100, # 'dropout': 0.1, # 'use_cuda': True # } transformer = Transformer(transformer_config) if torch.cuda.is_available(): print("CUDA enabled.") transformer.cuda() optimizer = optim.Adam( transformer.parameters(), lr=0.001, # betas=(0.9, 0.98), # eps=1e-09 ) criterion = nn.CrossEntropyLoss() # Prepare a txt file to print training log if not os.path.exists(args.save_model_path): print( "\nCreated a directory (%s) for saving model since it does not exist.\n" % args.save_model_path) os.makedirs(args.save_model_path) f = open('%s/train_log.txt' % args.save_model_path, 'w') # Train the model for e in range(args.epoch): for i, batch in enumerate( tqdm(training_batches, mininterval=2, desc=' Training ', leave=False)): # print ("BATCH") # print(batch[0][0]) # exit() sources = to_var(batch[0]) targets = to_var(batch[1]) src_seq_len = targets.size()[1] tgt_seq_len = targets.size()[1] if torch.cuda.is_available(): sources = sources.cuda() targets = targets.cuda() optimizer.zero_grad() outputs = transformer(sources, targets) # print("\n\n\n########### OUTPUT ###########") # print(len(outputs)) # print(outputs.max(1)[1].data.tolist() ) # exit() # # print("\n\n\n########### TARGET ###########") # print(len(targets)) # print(targets) # print(" \n\n TARGETS %d " %i) # print(targets) # print(targets.contiguous().view(-1).long()) # exit() targets = targets.contiguous().view(-1).long() loss = criterion(outputs, targets) # backprop loss.backward() # optimize params optimizer.step() # Print log info to both console and file if i % args.log_step == 0: print( "\n\n\n\n#################################################################################" ) log = ( 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f\n' % (e, args.epoch, i, len(training_batches), loss.data[0], np.exp(loss.data[0]))) print(log) f.write("{}".format(log)) # Print the first sentence of the batch (The first sentence of the batch) src_indices = sources.data.tolist( )[0][:src_seq_len] # Variable -> 
Tensor -> List src_sentence = convert2text(src_indices, src_vocab) # Get sentence pred_indices = outputs.max( 1)[1].data.tolist() # Variable -> Tensor -> List pred_indices = [ i[0] for i in pred_indices[:tgt_seq_len] ] # Get data of index until the max_seq_length of target (i.e. first sentence of the batch). pred_sentence = convert2text(pred_indices, tgt_vocab) # Get sentence tgt_indices = targets.data.tolist( )[:tgt_seq_len] # Variable -> Tensor -> List tgt_sentence = convert2text(tgt_indices, tgt_vocab) # Get sentence original = ("ORIGINAL: {}\n".format(src_sentence)) predicted = ("PREDICTED: {}\n".format(pred_sentence)) truth = ("TRUTH: {}\n\n".format(tgt_sentence)) print(original) print(predicted) print(truth) f.write("{}".format(original)) f.write("{}".format(predicted)) f.write("{}".format(truth)) # Save the models if (e) % args.save_model_epoch == 0: torch.save( transformer.state_dict(), os.path.join(args.save_model_path, 'transformer-%d-%d.pkl' % (e + 1, i + 1)))
def transformer(dataloader, EPOCH, k, frequency, path_to_save_model, path_to_save_loss, path_to_save_predictions, device): device = torch.device(device) model = Transformer().double().to(device) optimizer = torch.optim.Adam(model.parameters()) # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=200) criterion = torch.nn.MSELoss() best_model = "" min_train_loss = float('inf') for epoch in range(EPOCH + 1): train_loss = 0 val_loss = 0 ## TRAIN -- TEACHER FORCING model.train() for index_in, index_tar, _input, target, sensor_number in dataloader: # Shape of _input : [batch, input_length, feature] # Desired input for model: [input_length, batch, feature] optimizer.zero_grad() src = _input.permute(1,0,2).double().to(device)[:-1,:,:] # torch.Size([24, 1, 7]) target = _input.permute(1,0,2).double().to(device)[1:,:,:] # src shifted by 1. sampled_src = src[:1, :, :] #t0 torch.Size([1, 1, 7]) for i in range(len(target)-1): prediction = model(sampled_src, device) # torch.Size([1xw, 1, 1]) # for p1, p2 in zip(params, model.parameters()): # if p1.data.ne(p2.data).sum() > 0: # ic(False) # ic(True) # ic(i, sampled_src[:,:,0], prediction) # time.sleep(1) """ # to update model at every step # loss = criterion(prediction, target[:i+1,:,:1]) # loss.backward() # optimizer.step() """ if i < 24: # One day, enough data to make inferences about cycles prob_true_val = True else: ## coin flip v = k/(k+math.exp(epoch/k)) # probability of heads/tails depends on the epoch, evolves with time. prob_true_val = flip_from_probability(v) # starts with over 95 % probability of true val for each flip in epoch 0. ## if using true value as new value if prob_true_val: # Using true value as next value sampled_src = torch.cat((sampled_src.detach(), src[i+1, :, :].unsqueeze(0).detach())) else: ## using prediction as new value positional_encodings_new_val = src[i+1,:,1:].unsqueeze(0) predicted_humidity = torch.cat((prediction[-1,:,:].unsqueeze(0), positional_encodings_new_val), dim=2) sampled_src = torch.cat((sampled_src.detach(), predicted_humidity.detach())) """To update model after each sequence""" loss = criterion(target[:-1,:,0].unsqueeze(-1), prediction) loss.backward() optimizer.step() train_loss += loss.detach().item() if train_loss < min_train_loss: torch.save(model.state_dict(), path_to_save_model + f"best_train_{epoch}.pth") torch.save(optimizer.state_dict(), path_to_save_model + f"optimizer_{epoch}.pth") min_train_loss = train_loss best_model = f"best_train_{epoch}.pth" if epoch % 10 == 0: # Plot 1-Step Predictions logger.info(f"Epoch: {epoch}, Training loss: {train_loss}") scaler = load('scalar_item.joblib') sampled_src_humidity = scaler.inverse_transform(sampled_src[:,:,0].cpu()) #torch.Size([35, 1, 7]) src_humidity = scaler.inverse_transform(src[:,:,0].cpu()) #torch.Size([35, 1, 7]) target_humidity = scaler.inverse_transform(target[:,:,0].cpu()) #torch.Size([35, 1, 7]) prediction_humidity = scaler.inverse_transform(prediction[:,:,0].detach().cpu().numpy()) #torch.Size([35, 1, 7]) plot_training_3(epoch, path_to_save_predictions, src_humidity, sampled_src_humidity, prediction_humidity, sensor_number, index_in, index_tar) train_loss /= len(dataloader) log_loss(train_loss, path_to_save_loss, train=True) plot_loss(path_to_save_loss, train=True) return best_model
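# flip_from_probability is called above but not defined in this excerpt. A minimal sketch
# (an assumption about its behaviour): a single Bernoulli draw that returns True with
# probability p, used to decide between teacher forcing and the model's own prediction at
# each step of the scheduled-sampling loop.
import random

def flip_from_probability(p):
    return random.random() < p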
def main(args): src, tgt = load_data(args.path) src_vocab = Vocab(init_token='<sos>', eos_token='<eos>', pad_token='<pad>', unk_token='<unk>') src_vocab.load(os.path.join(args.path, 'vocab.en')) tgt_vocab = Vocab(init_token='<sos>', eos_token='<eos>', pad_token='<pad>', unk_token='<unk>') tgt_vocab.load(os.path.join(args.path, 'vocab.de')) vsize_src = len(src_vocab) vsize_tar = len(tgt_vocab) net = Transformer(vsize_src, vsize_tar) if not args.test: train_loader = get_loader(src['train'], tgt['train'], src_vocab, tgt_vocab, batch_size=args.batch_size, shuffle=True) valid_loader = get_loader(src['valid'], tgt['valid'], src_vocab, tgt_vocab, batch_size=args.batch_size) net.to(device) optimizer = optim.Adam(net.parameters(), lr=args.lr) best_valid_loss = 10.0 for epoch in range(args.epochs): print("Epoch {0}".format(epoch)) net.train() train_loss = run_epoch(net, train_loader, optimizer) print("train loss: {0}".format(train_loss)) net.eval() valid_loss = run_epoch(net, valid_loader, None) print("valid loss: {0}".format(valid_loss)) torch.save(net, 'data/ckpt/last_model') if valid_loss < best_valid_loss: best_valid_loss = valid_loss torch.save(net, 'data/ckpt/best_model') else: # test net = torch.load('data/ckpt/best_model') net.to(device) net.eval() test_loader = get_loader(src['test'], tgt['test'], src_vocab, tgt_vocab, batch_size=args.batch_size) pred = [] iter_cnt = 0 for src_batch, tgt_batch in test_loader: source, src_mask = make_tensor(src_batch) source = source.to(device) src_mask = src_mask.to(device) res = net.decode(source, src_mask) pred_batch = res.tolist() # every sentences in pred_batch should start with <sos> token (index: 0) and end with <eos> token (index: 1). # every <pad> token (index: 2) should be located after <eos> token (index: 1). # example of pred_batch: # [[0, 5, 6, 7, 1], # [0, 4, 9, 1, 2], # [0, 6, 1, 2, 2]] pred += seq2sen(pred_batch, tgt_vocab) iter_cnt += 1 #print(pred_batch) with open('data/results/pred.txt', 'w') as f: for line in pred: f.write('{}\n'.format(line)) os.system( 'bash scripts/bleu.sh data/results/pred.txt data/multi30k/test.de.atok' )
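# make_tensor is used in the test branch above but not shown. A minimal sketch under the
# indexing noted in the comments (<sos>=0, <eos>=1, <pad>=2): pad every source sentence in
# the batch to a common length and return the index tensor together with a boolean mask
# that is True on real tokens.
def make_tensor(src_batch, pad_idx=2):
    max_len = max(len(sent) for sent in src_batch)
    padded = [sent + [pad_idx] * (max_len - len(sent)) for sent in src_batch]
    source = torch.LongTensor(padded)
    src_mask = source.ne(pad_idx)
    return source, src_mask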
model = Transformer(device=device, d_feature=train_data.sig_len, d_model=d_model, d_inner=d_inner, n_layers=num_layers, n_head=num_heads, d_k=64, d_v=64, dropout=dropout, class_num=class_num) model = model.to(device) optimizer = ScheduledOptim( Adam(filter(lambda x: x.requires_grad, model.parameters()), betas=(0.9, 0.98), eps=1e-09), d_model, warm_steps) train_accs = [] valid_accs = [] eva_indis = [] train_losses = [] valid_losses = [] for epoch_i in range(epoch): print('[ Epoch', epoch_i, ']') start = time.time() train_loss, train_acc, cnt = train_epoch(train_loader, device, model, optimizer, train_data.__len__()) print( ' - (Training) loss: {loss: 8.5f}, accuracy: {accu:3.3f} %, '
train_x, train_y, test_x, test_y = build_data(series, min_len=3, max_len=max_len)
max_index = int(max(train_x.max(), test_x.max()))

args = {
    'emb_dim': 32,               # Embedding vector dimension
    'n_att_heads': 16,           # Number of attention heads for each transformer block
    'n_transformers': 4,         # Depth of the network (nr. of self-attention layers)
    'seq_length': max_len,       # Sequence length
    'num_tokens': max_index + 1, # Vocabulary size (highest index found in dataset)
    'device': device,            # Device: cuda/cpu
    'wide': False                # Narrow or wide self-attention
}

stats = {'loss': [], 'perplexity': []}  # we accumulate and save training statistics here

model = Transformer(**args).to(device)
opt = torch.optim.Adam(lr=learning_rate, params=model.parameters())

for i in range(epochs):
    model.train()
    opt.zero_grad()

    # Sample a random batch of size `batch_size` from the train dataset
    idxs = torch.randint(size=(batch_size,), low=0, high=len(train_x))
    output, (emb_mean, emb_max) = model(train_x[idxs])
    loss = F.nll_loss(output, train_y[idxs], reduction='mean')
    loss.backward()
    # Clip gradients only after backward() has populated them.
    nn.utils.clip_grad_norm_(model.parameters(), 1)
    opt.step()

    # Calculate perplexity on the test-set
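    # A minimal sketch of the perplexity step announced by the comment above (an assumption
    # about the intended bookkeeping, mirroring the training forward pass):
    model.eval()
    with torch.no_grad():
        test_output, _ = model(test_x)
        test_loss = F.nll_loss(test_output, test_y, reduction='mean')
    stats['loss'].append(loss.item())
    stats['perplexity'].append(torch.exp(test_loss).item())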
def main(): """Entry point. """ if torch.cuda.is_available(): device = torch.device(torch.cuda.current_device()) print(f"Using CUDA device {device}") else: device = None # Load data vocab = Vocab(config_data.vocab_file) data_hparams = { # "batch_size" is ignored for train since we use dynamic batching "batch_size": config_data.test_batch_size, "bos_id": vocab.bos_token_id, "eos_id": vocab.eos_token_id, } datasets = { split: data_utils.Seq2SeqData(os.path.join( config_data.input_dir, f"{config_data.filename_prefix}{split}.npy"), hparams=data_hparams, device=device) for split in ["train", "valid", "test"] } print(f"Training data size: {len(datasets['train'])}") beam_width = config_model.beam_width # Create logging tx.utils.maybe_create_dir(args.output_dir) logging_file = os.path.join(args.output_dir, "logging.txt") logger = utils.get_logger(logging_file) print(f"logging file is saved in: {logging_file}") # Create model and optimizer model = Transformer(config_model, config_data, vocab).to(device) best_results = {"score": 0, "epoch": -1} lr_config = config_model.lr_config if lr_config["learning_rate_schedule"] == "static": init_lr = lr_config["static_lr"] scheduler_lambda = lambda x: 1.0 else: init_lr = lr_config["lr_constant"] scheduler_lambda = functools.partial( utils.get_lr_multiplier, warmup_steps=lr_config["warmup_steps"]) optim = torch.optim.Adam(model.parameters(), lr=init_lr, betas=(0.9, 0.997), eps=1e-9) scheduler = torch.optim.lr_scheduler.LambdaLR(optim, scheduler_lambda) @torch.no_grad() def _eval_epoch(epoch, mode, print_fn=None): if print_fn is None: print_fn = print tqdm_leave = True else: tqdm_leave = False model.eval() eval_data = datasets[mode] eval_iter = tx.data.DataIterator(eval_data) references, hypotheses = [], [] for batch in tqdm.tqdm(eval_iter, ncols=80, leave=tqdm_leave, desc=f"Eval on {mode} set"): predictions = model( encoder_input=batch.source, beam_width=beam_width, ) if beam_width == 1: decoded_ids = predictions[0].sample_id else: decoded_ids = predictions["sample_id"][:, :, 0] hypotheses.extend(h.tolist() for h in decoded_ids) references.extend(r.tolist() for r in batch.target_output) hypotheses = utils.list_strip_eos(hypotheses, vocab.eos_token_id) references = utils.list_strip_eos(references, vocab.eos_token_id) if mode == "valid": # Writes results to files to evaluate BLEU # For 'eval' mode, the BLEU is based on token ids (rather than # text tokens) and serves only as a surrogate metric to monitor # the training process fname = os.path.join(args.output_dir, "tmp.eval") hwords, rwords = [], [] for hyp, ref in zip(hypotheses, references): hwords.append([str(y) for y in hyp]) rwords.append([str(y) for y in ref]) hwords = tx.utils.str_join(hwords) rwords = tx.utils.str_join(rwords) hyp_file, ref_file = tx.utils.write_paired_text( hwords, rwords, fname, mode="s", src_fname_suffix="hyp", tgt_fname_suffix="ref", ) eval_bleu = tx.evals.file_bleu(ref_file, hyp_file, case_sensitive=True) logger.info("epoch: %d, eval_bleu %.4f", epoch, eval_bleu) print_fn(f"epoch: {epoch:d}, eval_bleu {eval_bleu:.4f}") if eval_bleu > best_results["score"]: logger.info("epoch: %d, best bleu: %.4f", epoch, eval_bleu) best_results["score"] = eval_bleu best_results["epoch"] = epoch model_path = os.path.join(args.output_dir, args.output_filename) logger.info("Saving model to %s", model_path) print_fn(f"Saving model to {model_path}") states = { "model": model.state_dict(), "optimizer": optim.state_dict(), "scheduler": scheduler.state_dict(), } torch.save(states, model_path) elif mode == 
"test": # For 'test' mode, together with the commands in README.md, BLEU # is evaluated based on text tokens, which is the standard metric. fname = os.path.join(args.output_dir, "test.output") hwords, rwords = [], [] for hyp, ref in zip(hypotheses, references): hwords.append(vocab.map_ids_to_tokens_py(hyp)) rwords.append(vocab.map_ids_to_tokens_py(ref)) hwords = tx.utils.str_join(hwords) rwords = tx.utils.str_join(rwords) hyp_file, ref_file = tx.utils.write_paired_text( hwords, rwords, fname, mode="s", src_fname_suffix="hyp", tgt_fname_suffix="ref", ) logger.info("Test output written to file: %s", hyp_file) print_fn(f"Test output written to file: {hyp_file}") def _train_epoch(epoch: int): model.train() train_iter = tx.data.DataIterator( datasets["train"], data_utils.CustomBatchingStrategy(config_data.max_batch_tokens)) progress = tqdm.tqdm( train_iter, ncols=80, desc=f"Training epoch {epoch}", ) for train_batch in progress: optim.zero_grad() loss = model( encoder_input=train_batch.source, decoder_input=train_batch.target_input, labels=train_batch.target_output, ) loss.backward() optim.step() scheduler.step() step = scheduler.last_epoch if step % config_data.display_steps == 0: logger.info("step: %d, loss: %.4f", step, loss) lr = optim.param_groups[0]["lr"] progress.write(f"lr: {lr:.4e} step: {step}, loss: {loss:.4}") if step and step % config_data.eval_steps == 0: _eval_epoch(epoch, mode="valid", print_fn=progress.write) progress.close() model_path = os.path.join(args.output_dir, args.output_filename) if args.run_mode == "train_and_evaluate": logger.info("Begin running with train_and_evaluate mode") if os.path.exists(model_path): logger.info("Restore latest checkpoint in %s", model_path) ckpt = torch.load(model_path) model.load_state_dict(ckpt["model"]) optim.load_state_dict(ckpt["optimizer"]) scheduler.load_state_dict(ckpt["scheduler"]) _eval_epoch(0, mode="valid") for epoch in range(config_data.max_train_epoch): _train_epoch(epoch) _eval_epoch(epoch, mode="valid") elif args.run_mode in ["evaluate", "test"]: logger.info("Begin running with %s mode", args.run_mode) logger.info("Restore latest checkpoint in %s", model_path) ckpt = torch.load(model_path) model.load_state_dict(ckpt["model"]) _eval_epoch(0, mode=("test" if args.run_mode == "test" else "valid")) else: raise ValueError(f"Unknown mode: {args.run_mode}")
# Getting the vocabulary size for the embedding matrix
vocab_size = len(vocab_dict)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Setting up the transformer
transformer = Transformer(d_model=config.d_model,
                          heads=config.heads,
                          num_layers=config.num_layers,
                          vocab_size=vocab_size)
## Sending the transformer to device
transformer = transformer.to(device)

## Hack no. 1 setting the parameters of layer to xavier_uniform
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

## Want to train the loaded model
# checkpoint = torch.load('checkpoint.pth.tar')
# transformer = checkpoint['transformer']

## Hack no. 2 Got from pytorch transformer implementation
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(transformer.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

for epoch in range(config.epochs):
    tot_loss = 0
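    # A minimal sketch of the rest of the epoch loop. Assumptions not shown in the excerpt
    # above: a `train_loader` yielding (src, tgt) index batches, a padding-aware `criterion`
    # such as nn.CrossEntropyLoss(ignore_index=pad_idx), and teacher forcing with the target
    # shifted by one position.
    for src, tgt in train_loader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = transformer(src, tgt[:, :-1])
        loss = criterion(output.reshape(-1, vocab_size), tgt[:, 1:].reshape(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(transformer.parameters(), 0.5)
        optimizer.step()
        tot_loss += loss.item()
    scheduler.step()
    print(f'Epoch {epoch}: total loss {tot_loss:.4f}')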
def main(TEXT, LABEL, train_loader, test_loader):
    # For sentiment analysis: load the .pt checkpoint.
    from KoBERT.Bert_model import BERTClassifier
    from kobert.pytorch_kobert import get_pytorch_kobert_model
    bertmodel, vocab = get_pytorch_kobert_model()
    sa_model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
    sa_model.load_state_dict(torch.load('bert_SA-model.pt'))

    # print argparse
    for idx, (key, value) in enumerate(args.__dict__.items()):
        if idx == 0:
            print("\nargparse{\n", "\t", key, ":", value)
        elif idx == len(args.__dict__) - 1:
            print("\t", key, ":", value, "\n}")
        else:
            print("\t", key, ":", value)

    from model import Transformer, GradualWarmupScheduler

    # Transformer model init
    model = Transformer(args, TEXT, LABEL)
    if args.per_soft:
        sorted_path = 'sorted_model-soft.pth'
    else:
        sorted_path = 'sorted_model-rough.pth'

    # Exclude <pad> tokens when computing the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=LABEL.vocab.stoi['<pad>'])
    optimizer = torch.optim.Adam(params=model.parameters(), lr=args.lr)
    scheduler = GradualWarmupScheduler(optimizer, multiplier=8, total_epoch=args.num_epochs)

    # Load the pre-trained vectors.
    model.src_embedding.weight.data.copy_(TEXT.vocab.vectors)
    model.trg_embedding.weight.data.copy_(LABEL.vocab.vectors)

    model.to(device)
    criterion.to(device)

    # Guard against overfitting: track the best validation loss.
    best_valid_loss = float('inf')

    # train
    if args.train:
        for epoch in range(args.num_epochs):
            torch.manual_seed(SEED)
            scheduler.step(epoch)
            start_time = time.time()

            # train, validation
            train_loss, train_acc = train(model, train_loader, optimizer, criterion)
            valid_loss, valid_acc = test(model, test_loader, criterion)

            # time calculation
            end_time = time.time()
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            #torch.save(model.state_dict(), sorted_path) # for some overfitting
            # Save the model whenever the current validation loss beats the best so far.
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': valid_loss
                }, sorted_path)
                print(f'\t## SAVE valid_loss: {valid_loss:.3f} | valid_acc: {valid_acc:.3f} ##')

            # print loss and acc
            print(f'\n\t==Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s==')
            print(f'\t==Train Loss: {train_loss:.3f} | Train_acc: {train_acc:.3f}==')
            print(f'\t==Valid Loss: {valid_loss:.3f} | Valid_acc: {valid_acc:.3f}==\n')

    # inference
    print("\t----------Evaluation----------")
    checkpoint = torch.load(sorted_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    test_loss, test_acc = test(model, test_loader, criterion)
    print(f'==test_loss : {test_loss:.3f} | test_acc: {test_acc:.3f}==')
    print("\t-----------------------------")

    while True:
        inference(device, args, TEXT, LABEL, model, sa_model)
        print("\n")
def ed_train(train_iter, val_iter, TEXT, LABEL): global D_MODEL, N_LAYERS, N_HEADS, DROPOUT, N_EPOCHS, LR SRC_V_SIZE = len(TEXT.vocab) TGT_V_SIZE = len(LABEL.vocab) model = Transformer(SRC_V_SIZE, TGT_V_SIZE, D_MODEL, N_LAYERS, N_HEADS, dropout=DROPOUT).to(device) optim = torch.optim.SGD(model.parameters(), lr=LR) scheduler = torch.optim.lr_scheduler.StepLR(optim, step_size=1, gamma=0.9) criterion = nn.CrossEntropyLoss() print( f'Encoder/Decoder Model Hyperparameters\n---------------------\nModel Hidden Dimension: {D_MODEL}' f'\nNum Layers: {N_LAYERS}\nNum Attention Heads: {N_HEADS}\nDropout: {DROPOUT}' f'\nLearning Rate: {LR}\nNum Epochs: {N_EPOCHS}\nBatch Size: {B_SIZE}' f'\nSource Vocab Size: {SRC_V_SIZE}\nTarget Vocab Size: {TGT_V_SIZE}\n' ) loss_interval = 128 loss_values = [] val_loss_values = [] val_acc_values = [] model.train() for epoch in range(1, N_EPOCHS + 1): running_loss = 0. loss_values_sum = 0. print(f'Epoch {epoch}/{N_EPOCHS}') for b_num, batch in enumerate(train_iter): torch.cuda.empty_cache() start_time = time.time() true_batch_num = (len(train_iter) * epoch - 1) + b_num src_input, tgt_input, row_src, row_tgt = parse_batch(batch) if epoch == 1 and b_num == 0: print('src_input shape:', src_input.shape) print('tgt_input shape:', tgt_input.shape) print('row_src:', row_src.shape) print('row_tgt:', row_tgt.shape) SRC_SEQ_LEN = row_src.size(-1) TGT_SEQ_LEN = row_tgt.size(-1) src_mask, src_key_padding_mask, memory_key_padding_mask = create_src_masks( row_src, SRC_SEQ_LEN, TEXT, use_srcmask=args.srcmask) tgt_mask, tgt_key_padding_mask = create_tgt_masks( row_tgt, TGT_SEQ_LEN, LABEL) output = model(src_input, tgt_input, src_mask=src_mask, tgt_mask=tgt_mask, src_key_padding_mask=src_key_padding_mask, tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=memory_key_padding_mask) loss = criterion(output.view(-1, TGT_V_SIZE), row_tgt.contiguous().view(-1)) optim.zero_grad() loss.backward() nn.utils.clip_grad_norm_(model.parameters(), 0.5) # prevent exploding gradient optim.step() loss_values_sum += loss.item() running_loss += loss.item() el_time = time.time() - start_time if b_num % loss_interval == 0 and b_num > 0: loss_values.append( (true_batch_num, loss_values_sum / loss_interval)) loss_values_sum = 0. 
if b_num % 128 == 0: print(f'\tBatch {b_num}/{len(train_iter)} | secs/batch: ' f'{round(el_time, 4)} | loss: {loss} | ' f'lr: {scheduler.get_last_lr()}') if b_num % (len(train_iter) // 5) == 0 and b_num > 0: val_loss, val_acc = ed_evaluate(model, val_iter, TEXT, LABEL) model.train() val_loss_values.append((true_batch_num, val_loss)) val_acc_values.append((true_batch_num, val_acc)) if len(val_loss_values) > 1: plt.plot(*zip(*loss_values), label='Train Loss') plt.plot(*zip(*val_loss_values), label='Validation Loss') plt.xlabel('Batch') plt.ylabel('Loss') plt.legend() plt.show() if len(val_acc_values) > 1: plt.plot(*zip(*val_acc_values), label='Validation Accuracy') plt.xlabel('Batch') plt.ylabel('Accuracy') plt.ylim(0, 1) plt.legend() plt.show() scheduler.step() print(f'Epoch {epoch}/{N_EPOCHS} | loss: {running_loss}') if epoch != N_EPOCHS: save_path = f'{args.savepath}train{epoch}.pth' torch.save(model.state_dict(), save_path) if epoch > 1: # save the previous model save_path = f'{args.savepath}train{epoch-1}.pth' try: files.download(save_path) except: print(f'Unable to download {save_path}') print(f'Expected output shape {row_tgt.shape}\nTargets:{row_tgt}') print( f'output raw shape: {output.shape}\nargmax:\n{format_preds(output, TGT_SEQ_LEN)}' ) save_path = f'{args.savepath}goldtrain.pth' torch.save(model.state_dict(), save_path)
                     batch_size=BATCH_SIZE, shuffle=True, sort=False)
test_iter = Iterator(test_data, batch_size=BATCH_SIZE, shuffle=False, sort=False)

SRC_PAD_IDX = SRC.vocab.stoi['<pad>']
TRG_PAD_IDX = TRG.vocab.stoi['<pad>']

model = Transformer(len(SRC.vocab), len(TRG.vocab), MAX_LEN, MODEL_SIZE, FF_SIZE,
                    KEY_SIZE, VALUE_SIZE, NUM_HEADS, NUM_LAYERS, DROPOUT,
                    SRC_PAD_IDX, TRG_PAD_IDX).to(DEVICE)
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
opt = AdamWrapper(model.parameters(), MODEL_SIZE, WARMUP)

if args.train or args.continue_training:
    if args.train:
        best_val_loss = float('inf')
        with open(LOG_PATH, 'w') as f:
            f.write('')
    else:
        model.load_state_dict(torch.load(MODEL_PATH))
        with open(LOG_PATH, 'r') as f:
            val_losses = [float(line.split()[-1]) for line in f]
        best_val_loss = min(val_losses)
        print(f'best_val_loss: {best_val_loss}')

    for epoch in range(NUM_EPOCHS):
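        # Illustrative epoch body (an assumption; the original loop body is not in this
        # excerpt). It reuses names that do appear above (model, criterion, opt,
        # best_val_loss, LOG_PATH, MODEL_PATH, DEVICE) plus hypothetical train_iter /
        # val_iter iterators yielding batches with batch-first .src and .trg index tensors.
        model.train()
        for batch in train_iter:
            src, trg = batch.src.to(DEVICE), batch.trg.to(DEVICE)
            opt.zero_grad()
            output = model(src, trg[:, :-1])
            loss = criterion(output.reshape(-1, output.size(-1)), trg[:, 1:].reshape(-1))
            loss.backward()
            opt.step()

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_iter:
                src, trg = batch.src.to(DEVICE), batch.trg.to(DEVICE)
                output = model(src, trg[:, :-1])
                val_loss += criterion(output.reshape(-1, output.size(-1)),
                                      trg[:, 1:].reshape(-1)).item()
        val_loss /= len(val_iter)

        with open(LOG_PATH, 'a') as f:
            f.write(f'epoch {epoch} val_loss {val_loss}\n')
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), MODEL_PATH)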
                    shuffle=True)

model = Transformer()
# Run on multiple GPUs when available.
if torch.cuda.is_available():
    model.cuda()
if torch.cuda.device_count() > 1:
    args.n_gpu = torch.cuda.device_count()
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # This single line enables data parallelism.
    model = nn.DataParallel(model)

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.99)

for epoch in range(30):  # train for thirty epochs
    for enc_inputs, dec_inputs, dec_outputs in loader:
        '''
        enc_inputs: [batch_size, src_len]
        dec_inputs: [batch_size, tgt_len]
        dec_outputs: [batch_size, tgt_len]
        '''
        if torch.cuda.is_available():
            enc_inputs, dec_inputs, dec_outputs = enc_inputs.cuda(), dec_inputs.cuda(), dec_outputs.cuda()
        # outputs: [batch_size * tgt_len, tgt_vocab_size]
        outputs, enc_self_attns, dec_self_attns, dec_enc_attns = model(enc_inputs, dec_inputs)
model = Transformer(
    args.embedding_size,
    args.src_vocab_size,
    args.trg_vocab_size,
    src_pad_idx,
    args.num_heads,
    args.num_encoder_layers,
    args.num_decoder_layers,
    args.forward_expansion,
    args.dropout,
    args.max_len,
    device,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       factor=0.1,
                                                       patience=10,
                                                       verbose=True)

pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Trainer module
trainer = Trainer(model=model,
                  device=device,
                  loss_fn=criterion,
                  optimizer=optimizer,
                  scheduler=None)
def do_train(args): if args.use_cuda: trainer_count = fluid.dygraph.parallel.Env().nranks place = fluid.CUDAPlace(fluid.dygraph.parallel.Env( ).dev_id) if trainer_count > 1 else fluid.CUDAPlace(0) else: trainer_count = 1 place = fluid.CPUPlace() # define the data generator processor = reader.DataProcessor( fpattern=args.training_file, src_vocab_fpath=args.src_vocab_fpath, trg_vocab_fpath=args.trg_vocab_fpath, token_delimiter=args.token_delimiter, use_token_batch=args.use_token_batch, batch_size=args.batch_size, device_count=trainer_count, pool_size=args.pool_size, sort_type=args.sort_type, shuffle=args.shuffle, shuffle_batch=args.shuffle_batch, start_mark=args.special_token[0], end_mark=args.special_token[1], unk_mark=args.special_token[2], max_length=args.max_length, n_head=args.n_head) batch_generator = processor.data_generator(phase="train") if args.validation_file: val_processor = reader.DataProcessor( fpattern=args.validation_file, src_vocab_fpath=args.src_vocab_fpath, trg_vocab_fpath=args.trg_vocab_fpath, token_delimiter=args.token_delimiter, use_token_batch=args.use_token_batch, batch_size=args.batch_size, device_count=trainer_count, pool_size=args.pool_size, sort_type=args.sort_type, shuffle=False, shuffle_batch=False, start_mark=args.special_token[0], end_mark=args.special_token[1], unk_mark=args.special_token[2], max_length=args.max_length, n_head=args.n_head) val_batch_generator = val_processor.data_generator(phase="train") if trainer_count > 1: # for multi-process gpu training batch_generator = fluid.contrib.reader.distributed_batch_reader( batch_generator) args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \ args.unk_idx = processor.get_vocab_summary() with fluid.dygraph.guard(place): # set seed for CE random_seed = eval(str(args.random_seed)) if random_seed is not None: fluid.default_main_program().random_seed = random_seed fluid.default_startup_program().random_seed = random_seed # define data loader train_loader = fluid.io.DataLoader.from_generator(capacity=10) train_loader.set_batch_generator(batch_generator, places=place) if args.validation_file: val_loader = fluid.io.DataLoader.from_generator(capacity=10) val_loader.set_batch_generator(val_batch_generator, places=place) # define model transformer = Transformer( args.src_vocab_size, args.trg_vocab_size, args.max_length + 1, args.n_layer, args.n_head, args.d_key, args.d_value, args.d_model, args.d_inner_hid, args.prepostprocess_dropout, args.attention_dropout, args.relu_dropout, args.preprocess_cmd, args.postprocess_cmd, args.weight_sharing, args.bos_idx, args.eos_idx) # define loss criterion = CrossEntropyCriterion(args.label_smooth_eps) # define optimizer optimizer = fluid.optimizer.Adam( learning_rate=NoamDecay(args.d_model, args.warmup_steps, args.learning_rate), beta1=args.beta1, beta2=args.beta2, epsilon=float(args.eps), parameter_list=transformer.parameters()) ## init from some checkpoint, to resume the previous training if args.init_from_checkpoint: model_dict, opt_dict = fluid.load_dygraph( os.path.join(args.init_from_checkpoint, "transformer")) transformer.load_dict(model_dict) optimizer.set_dict(opt_dict) ## init from some pretrain models, to better solve the current task if args.init_from_pretrain_model: model_dict, _ = fluid.load_dygraph( os.path.join(args.init_from_pretrain_model, "transformer")) transformer.load_dict(model_dict) if trainer_count > 1: strategy = fluid.dygraph.parallel.prepare_context() transformer = fluid.dygraph.parallel.DataParallel(transformer, strategy) # the best 
cross-entropy value with label smoothing loss_normalizer = -( (1. - args.label_smooth_eps) * np.log( (1. - args.label_smooth_eps)) + args.label_smooth_eps * np.log(args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20)) ce_time = [] ce_ppl = [] step_idx = 0 # train loop for pass_id in range(args.epoch): epoch_start = time.time() batch_id = 0 batch_start = time.time() interval_word_num = 0.0 for input_data in train_loader(): if args.max_iter and step_idx == args.max_iter: #NOTE: used for benchmark return batch_reader_end = time.time() (src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight) = input_data logits = transformer(src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias) sum_cost, avg_cost, token_num = criterion(logits, lbl_word, lbl_weight) if trainer_count > 1: avg_cost = transformer.scale_loss(avg_cost) avg_cost.backward() transformer.apply_collective_grads() else: avg_cost.backward() optimizer.minimize(avg_cost) transformer.clear_gradients() interval_word_num += np.prod(src_word.shape) if step_idx % args.print_step == 0: total_avg_cost = avg_cost.numpy() * trainer_count if step_idx == 0: logger.info( "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, " "normalized loss: %f, ppl: %f" % (step_idx, pass_id, batch_id, total_avg_cost, total_avg_cost - loss_normalizer, np.exp([min(total_avg_cost, 100)]))) else: train_avg_batch_cost = args.print_step / ( time.time() - batch_start) word_speed = interval_word_num / ( time.time() - batch_start) logger.info( "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, " "normalized loss: %f, ppl: %f, avg_speed: %.2f step/s, " "words speed: %0.2f words/s" % (step_idx, pass_id, batch_id, total_avg_cost, total_avg_cost - loss_normalizer, np.exp([min(total_avg_cost, 100)]), train_avg_batch_cost, word_speed)) batch_start = time.time() interval_word_num = 0.0 if step_idx % args.save_step == 0 and step_idx != 0: # validation if args.validation_file: transformer.eval() total_sum_cost = 0 total_token_num = 0 for input_data in val_loader(): (src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight) = input_data logits = transformer( src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias) sum_cost, avg_cost, token_num = criterion( logits, lbl_word, lbl_weight) total_sum_cost += sum_cost.numpy() total_token_num += token_num.numpy() total_avg_cost = total_sum_cost / total_token_num logger.info("validation, step_idx: %d, avg loss: %f, " "normalized loss: %f, ppl: %f" % (step_idx, total_avg_cost, total_avg_cost - loss_normalizer, np.exp([min(total_avg_cost, 100)]))) transformer.train() if args.save_model and ( trainer_count == 1 or fluid.dygraph.parallel.Env().dev_id == 0): model_dir = os.path.join(args.save_model, "step_" + str(step_idx)) if not os.path.exists(model_dir): os.makedirs(model_dir) fluid.save_dygraph( transformer.state_dict(), os.path.join(model_dir, "transformer")) fluid.save_dygraph( optimizer.state_dict(), os.path.join(model_dir, "transformer")) batch_id += 1 step_idx += 1 train_epoch_cost = time.time() - epoch_start ce_time.append(train_epoch_cost) logger.info("train epoch: %d, epoch_cost: %.5f s" % (pass_id, train_epoch_cost)) if args.save_model: model_dir = os.path.join(args.save_model, "step_final") if not os.path.exists(model_dir): os.makedirs(model_dir) fluid.save_dygraph(transformer.state_dict(), os.path.join(model_dir, "transformer")) 
fluid.save_dygraph(optimizer.state_dict(), os.path.join(model_dir, "transformer")) if args.enable_ce: _ppl = 0 _time = 0 try: _time = ce_time[-1] _ppl = ce_ppl[-1] except: print("ce info error") print("kpis\ttrain_duration_card%s\t%s" % (trainer_count, _time)) print("kpis\ttrain_ppl_card%s\t%f" % (trainer_count, _ppl))
class Trainer: def __init__(self, args, train_loader, test_loader, tokenizer_src, tokenizer_tgt): self.args = args self.train_loader = train_loader self.test_loader = test_loader self.src_vocab_size = tokenizer_src.vocab_size self.tgt_vocab_size = tokenizer_tgt.vocab_size self.pad_id = tokenizer_src.pad_token_id # pad_token_id in tokenizer_tgt.vocab should be the same with this. self.device = 'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu' self.model = Transformer(src_vocab_size = self.src_vocab_size, tgt_vocab_size = self.tgt_vocab_size, seq_len = args.max_seq_len, d_model = args.hidden, n_layers = args.n_layers, n_heads = args.n_attn_heads, p_drop = args.dropout, d_ff = args.ffn_hidden, pad_id = self.pad_id) if args.multi_gpu: self.model = nn.DataParallel(self.model) self.model.to(self.device) self.optimizer = ScheduledOptim(optim.Adam(self.model.parameters(), betas=(0.9, 0.98), eps=1e-9), init_lr=2.0, d_model=args.hidden) self.criterion = nn.CrossEntropyLoss(ignore_index=self.pad_id) def train(self, epoch): losses = 0 n_batches, n_samples = len(self.train_loader), len(self.train_loader.dataset) self.model.train() for i, batch in enumerate(self.train_loader): encoder_inputs, decoder_inputs, decoder_outputs = map(lambda x: x.to(self.device), batch) # |encoder_inputs| : (batch_size, seq_len), |decoder_inputs| : (batch_size, seq_len-1), |decoder_outputs| : (batch_size, seq_len-1) outputs, encoder_attns, decoder_attns, enc_dec_attns = self.model(encoder_inputs, decoder_inputs) # |outputs| : (batch_size, seq_len-1, tgt_vocab_size) # |encoder_attns| : [(batch_size, n_heads, seq_len, seq_len)] * n_layers # |decoder_attns| : [(batch_size, n_heads, seq_len-1, seq_len-1)] * n_layers # |enc_dec_attns| : [(batch_size, n_heads, seq_len-1, seq_len)] * n_layers loss = self.criterion(outputs.view(-1, self.tgt_vocab_size), decoder_outputs.view(-1)) losses += loss.item() self.optimizer.zero_grad() loss.backward() self.optimizer.update_learning_rate() self.optimizer.step() if i % (n_batches//5) == 0 and i != 0: print('Iteration {} ({}/{})\tLoss: {:.4f}\tlr: {:.4f}'.format(i, i, n_batches, losses/i, self.optimizer.get_current_lr)) print('Train Epoch: {}\t>\tLoss: {:.4f}'.format(epoch, losses/n_batches)) def validate(self, epoch): losses = 0 n_batches, n_samples = len(self.test_loader), len(self.test_loader.dataset) self.model.eval() with torch.no_grad(): for i, batch in enumerate(self.test_loader): encoder_inputs, decoder_inputs, decoder_outputs = map(lambda x: x.to(self.device), batch) # |encoder_inputs| : (batch_size, seq_len), |decoder_inputs| : (batch_size, seq_len-1), |decoder_outputs| : (batch_size, seq_len-1) outputs, encoder_attns, decoder_attns, enc_dec_attns = self.model(encoder_inputs, decoder_inputs) # |outputs| : (batch_size, seq_len-1, tgt_vocab_size) # |encoder_attns| : [(batch_size, n_heads, seq_len, seq_len)] * n_layers # |decoder_attns| : [(batch_size, n_heads, seq_len-1, seq_len-1)] * n_layers # |enc_dec_attns| : [(batch_size, n_heads, seq_len-1, seq_len)] * n_layers loss = self.criterion(outputs.view(-1, self.tgt_vocab_size), decoder_outputs.view(-1)) losses += loss.item() print('Valid Epoch: {}\t>\tLoss: {:.4f}'.format(epoch, losses/n_batches)) def save(self, epoch, model_prefix='model', root='.model'): path = Path(root) / (model_prefix + '.ep%d' % epoch) if not path.parent.exists(): path.parent.mkdir() torch.save(self.model, path)
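# ScheduledOptim is used above but not shown. A minimal sketch of a Noam-style wrapper (an
# assumption about its internals, following the inverse-square-root schedule from
# "Attention Is All You Need"; warmup_steps is illustrative). It provides the methods the
# Trainer calls: zero_grad, step, update_learning_rate, and the get_current_lr property.
class ScheduledOptim:
    def __init__(self, optimizer, init_lr, d_model, warmup_steps=4000):
        self.optimizer = optimizer
        self.init_lr = init_lr
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.step_num = 0

    def zero_grad(self):
        self.optimizer.zero_grad()

    def step(self):
        self.optimizer.step()

    def update_learning_rate(self):
        self.step_num += 1
        lr = self.init_lr * (self.d_model ** -0.5) * min(
            self.step_num ** -0.5, self.step_num * self.warmup_steps ** -1.5)
        for group in self.optimizer.param_groups:
            group['lr'] = lr

    @property
    def get_current_lr(self):
        return self.optimizer.param_groups[0]['lr']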
def main(tokenizer, src_tok_file, tgt_tok_file, train_file, val_file, test_file, num_epochs, batch_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout, learning_rate, data_path, checkpoint_file, do_train): logging.info('Using tokenizer: {}'.format(tokenizer)) src_tokenizer = TokenizerWrapper(tokenizer, BLANK_WORD, SEP_TOKEN, CLS_TOKEN, PAD_TOKEN, MASK_TOKEN) src_tokenizer.train(src_tok_file, 20000, SPECIAL_TOKENS) tgt_tokenizer = TokenizerWrapper(tokenizer, BLANK_WORD, SEP_TOKEN, CLS_TOKEN, PAD_TOKEN, MASK_TOKEN) tgt_tokenizer.train(tgt_tok_file, 20000, SPECIAL_TOKENS) SRC = ttdata.Field(tokenize=src_tokenizer.tokenize, pad_token=BLANK_WORD) TGT = ttdata.Field(tokenize=tgt_tokenizer.tokenize, init_token=BOS_WORD, eos_token=EOS_WORD, pad_token=BLANK_WORD) logging.info('Loading training data...') train_ds, val_ds, test_ds = ttdata.TabularDataset.splits( path=data_path, format='tsv', train=train_file, validation=val_file, test=test_file, fields=[('src', SRC), ('tgt', TGT)]) test_src_sentence = val_ds[0].src test_tgt_sentence = val_ds[0].tgt MIN_FREQ = 2 SRC.build_vocab(train_ds.src, min_freq=MIN_FREQ) TGT.build_vocab(train_ds.tgt, min_freq=MIN_FREQ) logging.info(f'''SRC vocab size: {len(SRC.vocab)}''') logging.info(f'''TGT vocab size: {len(TGT.vocab)}''') train_iter = ttdata.BucketIterator(train_ds, batch_size=batch_size, repeat=False, sort_key=lambda x: len(x.src)) val_iter = ttdata.BucketIterator(val_ds, batch_size=1, repeat=False, sort_key=lambda x: len(x.src)) test_iter = ttdata.BucketIterator(test_ds, batch_size=1, repeat=False, sort_key=lambda x: len(x.src)) source_vocab_length = len(SRC.vocab) target_vocab_length = len(TGT.vocab) model = Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout, source_vocab_length=source_vocab_length, target_vocab_length=target_vocab_length) optim = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9) model = model.cuda() if do_train: train_losses, valid_losses = train(train_iter, val_iter, model, optim, num_epochs, batch_size, test_src_sentence, test_tgt_sentence, SRC, TGT, src_tokenizer, tgt_tokenizer, checkpoint_file) else: logging.info('Skipped training.') # Load best model and score test set logging.info('Loading best model.') model.load_state_dict(torch.load(checkpoint_file)) model.eval() logging.info('Scoring the test set...') score_start = time.time() test_bleu, test_chrf = score(test_iter, model, tgt_tokenizer, SRC, TGT) score_time = time.time() - score_start logging.info(f'''Scoring complete in {score_time/60:.3f} minutes.''') logging.info(f'''BLEU : {test_bleu}''') logging.info(f'''CHRF : {test_chrf}''')
bidir=True, char_vocab_size=len(c2idx), char_embed_dim=50, dropout1=0.5, dropout2=0, dropout3=0.1) Transformer_model = Transformer(emb=300 + 1024 + 250 + 30, k=300, heads=1, depth=1, num_classes=2, char_vocab_size=len(c2idx), char_embed_dim=50) transformer_parameters = sum(p.numel() for p in Transformer_model.parameters() if p.requires_grad) rnn_parameters = sum(p.numel() for p in RNNseq_model.parameters() if p.requires_grad) total_parameters = transformer_parameters + rnn_parameters print(f'Number of parameters: {total_parameters}') # Move the model to the GPU if available if using_GPU: RNNseq_model = RNNseq_model.cuda() Transformer_model = Transformer_model.cuda() # Set up criterion for calculating loss weight_tensor = torch.Tensor([1.0, 2.0]).cuda() loss_criterion = nn.NLLLoss(weight=weight_tensor)
model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       factor=0.1,
                                                       patience=10,
                                                       verbose=True)

pad_idx = input_text.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

sentence = "Emma Woodhouse, handsome, clever, and rich, with a comfortable home"
# Output should be: and happy disposition, seemed to unite some of the best blessings

if graph:
def main() -> None: """Entry point. """ # Load data vocab = tx.data.Vocab(config_data.vocab_file) data_hparams = { # "batch_size" is ignored for train since we use dynamic batching. "batch_size": config_data.test_batch_size, "pad_id": vocab.pad_token_id, "bos_id": vocab.bos_token_id, "eos_id": vocab.eos_token_id, } datasets = { split: data_utils.Seq2SeqData( os.path.join(config_data.input_dir, f"{config_data.filename_prefix}{split}.npy"), # Only shuffle during training. hparams={ **data_hparams, "shuffle": split == "train" }, ) for split in ["train", "valid", "test"] } print(f"Training data size: {len(datasets['train'])}") batching_strategy = data_utils.CustomBatchingStrategy( config_data.max_batch_tokens) # Create model and optimizer model = Transformer(config_model, config_data, vocab) model = ModelWrapper(model, config_model.beam_width) lr_config = config_model.lr_config if lr_config["learning_rate_schedule"] == "static": init_lr = lr_config["static_lr"] scheduler_lambda = lambda x: 1.0 else: init_lr = lr_config["lr_constant"] scheduler_lambda = functools.partial( utils.get_lr_multiplier, warmup_steps=lr_config["warmup_steps"]) optim = torch.optim.Adam(model.parameters(), lr=init_lr, betas=(0.9, 0.997), eps=1e-9) scheduler = torch.optim.lr_scheduler.LambdaLR(optim, scheduler_lambda) output_dir = Path(args.output_dir) encoding = getattr(config_data, 'encoding', None) executor = Executor( model=model, train_data=datasets["train"], valid_data=datasets["valid"], test_data=datasets["test"], batching_strategy=batching_strategy, optimizer=optim, lr_scheduler=scheduler, log_destination=[sys.stdout, output_dir / "log.txt"], log_every=cond.iteration(config_data.display_steps), validate_every=[cond.iteration(config_data.eval_steps), cond.epoch(1)], stop_training_on=cond.epoch(config_data.max_train_epoch), train_metrics=[ ("loss", metric.RunningAverage(1)), # only show current loss ("lr", metric.LR(optim)) ], log_format="{time} : Epoch {epoch:2d} @ {iteration:6d}it " "({progress}%, {speed}), lr = {lr:.3e}, loss = {loss:.3f}", valid_metrics=BLEUWrapper(vocab, encoding=encoding), test_metrics=[ FileBLEU(vocab, output_dir / "test.output", encoding=encoding), ("unofficial_bleu", BLEUWrapper(vocab, decode=True, encoding=encoding)) ], valid_log_format="{time} : Epoch {epoch}, " "{split} BLEU = {BLEU:.3f}", test_progress_log_format=( "{time} : Evaluating on test ({progress}%, {speed}), " "unofficial BLEU = {unofficial_bleu:.2f}"), validate_mode='predict', checkpoint_dir=args.output_dir, save_every=cond.validation(better=True), max_to_keep=1, show_live_progress=True, ) if args.run_mode == "train_and_evaluate": executor.write_log("Begin running with train_and_evaluate mode") if args.load_checkpoint: load_path = executor.load(allow_failure=True) if load_path is not None: executor.test({"valid": datasets["valid"]}) executor.train() elif args.run_mode in ["evaluate", "test"]: executor.write_log(f"Begin running with {args.run_mode} mode") executor.load(load_training_state=False) split = "test" if args.run_mode == "test" else "valid" executor.test({split: datasets[split]}) elif args.run_mode == 'infer': print("it's being developed.") else: raise ValueError(f"Unknown mode: {args.run_mode}")
def transformer(dataloader, EPOCH, frequency, path_to_save_model,
                path_to_save_loss, path_to_save_predictions, device):
    device = torch.device(device)
    model = Transformer().double().to(device)
    optimizer = torch.optim.Adam(model.parameters())
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=200)
    criterion = torch.nn.MSELoss()
    best_model = ""
    min_train_loss = float('inf')

    for epoch in range(EPOCH + 1):
        train_loss = 0
        val_loss = 0

        ## TRAIN -- TEACHER FORCING
        model.train()
        for index_in, index_tar, _input, target, sensor_number in dataloader:
            # for each data set
            optimizer.zero_grad()

            # Shape of _input : [batch, input_length, feature]
            # Desired input for model: [input_length, batch, feature]
            src = _input.permute(1, 0, 2).double().to(device)[:-1, :, :]     # torch.Size([24, 1, 7])
            target = _input.permute(1, 0, 2).double().to(device)[1:, :, :]   # src shifted by 1.

            prediction = model(src, device)  # torch.Size([24, 1, 7])
            loss = criterion(prediction, target[:, :, 0].unsqueeze(-1))
            loss.backward()
            optimizer.step()
            # scheduler.step(loss.detach().item())
            train_loss += loss.detach().item()

        if train_loss < min_train_loss:
            torch.save(model.state_dict(), path_to_save_model + f"best_train_{epoch}.pth")
            torch.save(optimizer.state_dict(), path_to_save_model + f"optimizer_{epoch}.pth")
            min_train_loss = train_loss
            best_model = f"best_train_{epoch}.pth"

        if epoch % 100 == 0:  # Plot 1-Step Predictions
            logger.info(f"Epoch: {epoch}, Training loss: {train_loss}")
            scaler = load('scalar_item.joblib')
            src_humidity = scaler.inverse_transform(src[:, :, 0].cpu())        # torch.Size([35, 1, 7])
            target_humidity = scaler.inverse_transform(target[:, :, 0].cpu())  # torch.Size([35, 1, 7])
            prediction_humidity = scaler.inverse_transform(
                prediction[:, :, 0].detach().cpu().numpy())                    # torch.Size([35, 1, 7])
            plot_training(epoch, path_to_save_predictions, src_humidity,
                          prediction_humidity, sensor_number, index_in, index_tar)

        train_loss /= len(dataloader)
        log_loss(train_loss, path_to_save_loss, train=True)
        plot_loss(path_to_save_loss, train=True)

    return best_model
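`val_loss` is initialized in the loop above but never computed. A hedged sketch of a matching one-step-ahead validation pass (same tensor layout and `model(src, device)` signature as the training loop, but with gradients disabled) could look like this; the function name and a separate validation dataloader are assumptions:

def evaluate(model, dataloader, criterion, device):
    # One-step-ahead validation loss, mirroring the teacher-forced training objective.
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for index_in, index_tar, _input, target, sensor_number in dataloader:
            src = _input.permute(1, 0, 2).double().to(device)[:-1, :, :]
            tgt = _input.permute(1, 0, 2).double().to(device)[1:, :, :]
            prediction = model(src, device)
            val_loss += criterion(prediction, tgt[:, :, 0].unsqueeze(-1)).item()
    return val_loss / len(dataloader)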
if args.load_from is not None:
    with torch.cuda.device(args.gpu):
        model.load_state_dict(
            torch.load('./models/' + args.load_from + '.pt',
                       map_location=lambda storage, loc: storage.cuda()))  # load the pretrained model

# if using a teacher
teacher_model = None
if args.teacher is not None:
    teacher_model = Transformer(SRC, TRG, teacher_args)
    with torch.cuda.device(args.gpu):
        teacher_model.load_state_dict(
            torch.load('./models/' + args.teacher + '.pt',
                       map_location=lambda storage, loc: storage.cuda()))
    for params in teacher_model.parameters():
        params.requires_grad = False

    if (args.share_encoder) and (args.load_from is None):
        model.encoder = copy.deepcopy(teacher_model.encoder)
        for params in model.encoder.parameters():
            params.requires_grad = True

# use cuda
if args.gpu > -1:
    model.cuda(args.gpu)

if align_table is not None:
    align_table = torch.LongTensor(align_table).cuda(args.gpu)
    align_table = Variable(align_table)
    model.alignment = align_table
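A frozen teacher like the one loaded above is typically consumed by a knowledge-distillation loss. The following word-level distillation sketch is hedged: the function name, logit shapes, and temperature handling are illustrative and not part of the original code, which may instead use sequence-level distillation.

import torch.nn.functional as F

def word_level_kd_loss(student_logits, teacher_logits, temperature=1.0):
    # KL divergence from the (softened) teacher distribution to the student's,
    # assuming logits of shape [batch, trg_len, vocab] and a no_grad teacher pass upstream.
    t = temperature
    teacher_probs = F.softmax(teacher_logits / t, dim=-1)
    student_log_probs = F.log_softmax(student_logits / t, dim=-1)
    return F.kl_div(student_log_probs, teacher_probs, reduction='batchmean') * (t * t)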
def main(conf):
    conf.distributed = dist.get_world_size() > 1
    device = "cuda"

    if dist.is_primary():
        from pprint import pprint
        pprint(conf.dict())

    if dist.is_primary() and conf.evaluate.wandb:
        wandb = load_wandb()
        wandb.init(project="asr")
    else:
        wandb = None

    with open("trainval_indices.pkl", "rb") as f:
        split_indices = pickle.load(f)

    train_set = ASRDataset(
        conf.dataset.path,
        indices=split_indices["train"],
        alignment=conf.dataset.alignment,
    )
    valid_set = ASRDataset(conf.dataset.path, indices=split_indices["val"])

    train_sampler = dist.data_sampler(train_set, shuffle=True, distributed=conf.distributed)
    valid_sampler = dist.data_sampler(valid_set, shuffle=False, distributed=conf.distributed)

    if conf.training.batch_sampler is not None:
        train_lens = []
        for i in split_indices["train"]:
            train_lens.append(train_set.mel_lengths[i])

        opts = conf.training.batch_sampler
        bins = ((opts.base ** np.linspace(opts.start, 1, 2 * opts.k + 1)) * 1000).tolist()
        groups, bins, n_samples = create_groups(train_lens, bins)
        batch_sampler = GroupedBatchSampler(
            train_sampler, groups, conf.training.dataloader.batch_size)
        conf.training.dataloader.batch_size = 1
        train_loader = conf.training.dataloader.make(
            train_set, batch_sampler=batch_sampler, collate_fn=collate_data_imputer)
    else:
        train_loader = conf.training.dataloader.make(
            train_set, collate_fn=collate_data_imputer)

    valid_loader = conf.training.dataloader.make(
        valid_set, sampler=valid_sampler, collate_fn=collate_data)

    model = Transformer(
        conf.dataset.n_vocab,
        conf.model.delta,
        conf.dataset.n_mels,
        conf.model.feature_channel,
        conf.model.dim,
        conf.model.dim_ff,
        conf.model.n_layer,
        conf.model.n_head,
        conf.model.dropout,
    ).to(device)

    if conf.distributed:
        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[dist.get_local_rank()],
            output_device=dist.get_local_rank(),
        )

    optimizer = conf.training.optimizer.make(model.parameters())
    scheduler = conf.training.scheduler.make(optimizer)

    if conf.ckpt is not None:
        ckpt = torch.load(conf.ckpt, map_location=lambda storage, loc: storage)
        model_p = model
        if conf.distributed:
            model_p = model.module
        model_p.load_state_dict(ckpt["model"])
        # scheduler.load_state_dict(ckpt["scheduler"])
        model_p.copy_embed(1)

    model_training = ModelTraining(
        model,
        optimizer,
        scheduler,
        train_set,
        train_loader,
        valid_loader,
        device,
        wandb,
    )

    train(conf, model_training)
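`create_groups` is not shown above. A hedged, purely illustrative version of what such a helper can do — map each utterance length to a length-bin index so that `GroupedBatchSampler` batches similar-length examples and keeps padding small — is sketched below; the original code base's implementation and return values may differ.

import numpy as np

def create_groups(lengths, bins):
    # Assign every example the index of the bin its length falls into.
    bins = sorted(bins)
    groups = np.digitize(lengths, bins).tolist()
    n_samples = np.bincount(groups, minlength=len(bins) + 1).tolist()
    return groups, bins, n_samples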
    train_dataset.cn_vocab_size,
    config.max_output_len,
    num_layers=config.n_layers,
    model_dim=config.model_dim,
    num_heads=config.num_heads,
    ffn_dim=config.ffn_dim,
    dropout=config.dropout,
).to(config.device)
print("Model in use:")
print(transformer_model)

total_steps = 0
if config.load_model:
    transformer_model.load_state_dict(torch.load(config.load_model_path))
    total_steps = int(re.split('[_/.]', config.model_file)[1])

optimizer = torch.optim.Adam(transformer_model.parameters(), lr=config.learning_rate)
loss_function = CrossEntropyLoss(ignore_index=0)

train_losses, val_losses, bleu_scores = [], [], []
while total_steps < config.num_steps:
    # Train the model
    transformer_model.train()
    transformer_model.zero_grad()
    losses = []
    loss_sum = 0.0
    for step in range(config.summary_steps):
        source, target = next(train_iter)  # source, target: [batch_size, max_output_len]
        source, target = source.to(config.device), target.to(config.device)
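        # The loop body is cut off in the original snippet; what follows is a hedged
        # continuation sketch, not the original code. It assumes the usual
        # teacher-forcing convention (feed target[:, :-1], score against target[:, 1:])
        # and that transformer_model(source, decoder_input) returns logits of shape
        # [batch_size, len - 1, cn_vocab_size].
        logits = transformer_model(source, target[:, :-1])
        loss = loss_function(logits.reshape(-1, logits.size(-1)),
                             target[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        losses.append(loss.item())
        loss_sum += loss.item()
    # Assumed bookkeeping so the outer while loop terminates.
    total_steps += config.summary_steps
    train_losses.append(loss_sum / config.summary_steps)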
lang_vocab_file = join(args.data_dir, 'lang.vocab')
lang_vocab, _ = ut.init_vocab(lang_vocab_file)
args.lang_vocab_size = len(lang_vocab)

# Since args is passed to many modules, keep the logger on it instead of re-initializing every time.
log_file = join(dump_dir, 'DEBUG.log')
logger = args.logger = ut.get_logger(log_file)

# Log args for future reference
logger.info(args)

model = Transformer(args)
# TODO: nicer formatting?
logger.info(model)
param_count = sum([np.prod(p.size()) for p in model.parameters()])
logger.info('Model has {:,} parameters'.format(param_count))

# Controller
data_manager = DataManager(args)
controller = Controller(args, model, data_manager)

if args.mode == 'train':
    controller.train()
elif args.mode == 'translate':
    controller.model.load_state_dict(torch.load(args.model_file))
    files_langs = args.files_langs
    for fl in files_langs:
        input_file, src_lang, tgt_lang = fl.split(',')
        controller.translate(input_file, src_lang, tgt_lang)
else:
    raise ValueError('Unknown mode. Only train/translate are supported.')
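For reference, the translate branch above expects each entry of `args.files_langs` to be a comma-separated `path,src_lang,tgt_lang` triple; the paths and language codes below are made up for illustration only.

# e.g. --mode translate --model_file <checkpoint> --files_langs data/test.en,en,de data/test.en,en,fr
files_langs = ["data/test.en,en,de", "data/test.en,en,fr"]
for fl in files_langs:
    input_file, src_lang, tgt_lang = fl.split(',')
    # -> ("data/test.en", "en", "de"), then ("data/test.en", "en", "fr")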