def train(engine, mini_batch):
    for language_model, model, optimizer in zip(engine.language_models,
                                                engine.models,
                                                engine.optimizers):
        language_model.eval()
        model.train()
        if engine.state.iteration % engine.config.iteration_per_update == 1 or \
           engine.config.iteration_per_update == 1:
            if engine.state.iteration > 1:
                optimizer.zero_grad()

    device = next(engine.models[0].parameters()).device
    mini_batch.src = (mini_batch.src[0].to(device), mini_batch.src[1].to(device))
    mini_batch.tgt = (mini_batch.tgt[0].to(device), mini_batch.tgt[1].to(device))

    with autocast(not engine.config.off_autocast):
        # X2Y
        x, y = (mini_batch.src[0][:, 1:-1], mini_batch.src[1] - 2), mini_batch.tgt[0][:, :-1]
        x_hat_lm, y_hat_lm = None, None
        # |x| = (batch_size, n)
        # |y| = (batch_size, m)
        y_hat = engine.models[X2Y](x, y)
        # |y_hat| = (batch_size, m, y_vocab_size)
        if engine.state.epoch > engine.config.dsl_n_warmup_epochs:
            with torch.no_grad():
                y_hat_lm = engine.language_models[X2Y](y)
                # |y_hat_lm| = |y_hat|

        # Y2X
        # Since the encoder in seq2seq takes a PackedSequence instance,
        # we need to re-sort if we use reversed src and tgt.
        x, y, restore_indice = DualSupervisedTrainingEngine._reorder(
            mini_batch.src[0][:, :-1],
            mini_batch.tgt[0][:, 1:-1],
            mini_batch.tgt[1] - 2,
        )
        # |x| = (batch_size, n)
        # |y| = (batch_size, m)
        x_hat = DualSupervisedTrainingEngine._restore_order(
            engine.models[Y2X](y, x),
            restore_indice=restore_indice,
        )
        # |x_hat| = (batch_size, n, x_vocab_size)
        if engine.state.epoch > engine.config.dsl_n_warmup_epochs:
            with torch.no_grad():
                x_hat_lm = DualSupervisedTrainingEngine._restore_order(
                    engine.language_models[Y2X](x),
                    restore_indice=restore_indice,
                )
                # |x_hat_lm| = |x_hat|

        x, y = mini_batch.src[0][:, 1:], mini_batch.tgt[0][:, 1:]
        loss_x2y, loss_y2x, dual_loss = DualSupervisedTrainingEngine._get_loss(
            x, y,
            x_hat, y_hat,
            engine.crits,
            x_hat_lm, y_hat_lm,
            # According to the paper, DSL should be warm-started.
            # Thus, we turn off the regularization term at the beginning.
            lagrange=engine.config.dsl_lambda if engine.state.epoch > engine.config.dsl_n_warmup_epochs else .0
        )

        backward_targets = [
            loss_x2y.div(y.size(0)).div(engine.config.iteration_per_update),
            loss_y2x.div(x.size(0)).div(engine.config.iteration_per_update),
        ]

    for scaler, backward_target in zip(engine.scalers, backward_targets):
        if engine.config.gpu_id >= 0 and not engine.config.off_autocast:
            scaler.scale(backward_target).backward()
        else:
            backward_target.backward()

    x_word_count = int(mini_batch.src[1].sum())
    y_word_count = int(mini_batch.tgt[1].sum())
    p_norm = float(
        get_parameter_norm(
            list(engine.models[X2Y].parameters()) +
            list(engine.models[Y2X].parameters())
        )
    )
    g_norm = float(
        get_grad_norm(
            list(engine.models[X2Y].parameters()) +
            list(engine.models[Y2X].parameters())
        )
    )

    if engine.state.iteration % engine.config.iteration_per_update == 0 and \
       engine.state.iteration > 0:
        for model, optimizer, scaler in zip(engine.models,
                                            engine.optimizers,
                                            engine.scalers):
            torch_utils.clip_grad_norm_(
                model.parameters(),
                engine.config.max_grad_norm,
            )
            # Take a step of gradient descent.
            if engine.config.gpu_id >= 0 and not engine.config.off_autocast:
                # Use the scaler instead of optimizer.step().
                scaler.step(optimizer)
                scaler.update()
            else:
                optimizer.step()

    return {
        'x2y': float(loss_x2y / y_word_count),
        'y2x': float(loss_y2x / x_word_count),
        'reg': float(dual_loss / x.size(0)),
        '|param|': p_norm if not np.isnan(p_norm) and not np.isinf(p_norm) else 0.,
        '|g_param|': g_norm if not np.isnan(g_norm) and not np.isinf(g_norm) else 0.,
    }
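# The X2Y/Y2X constants and the _reorder/_restore_order class helpers used above are
# defined elsewhere in the project. As a hedged sketch only (not necessarily the
# author's implementation): since the encoder consumes a PackedSequence, the samples
# must be sorted by descending length, and the sort has to be undone after the
# forward pass. The _sketch names below are illustrative.
def _reorder_sketch(x, y, lengths):
    # Sort by descending length of the new source (y) and remember how to restore
    # the original mini-batch order afterwards.
    sorted_lengths, sort_indice = lengths.sort(descending=True)
    restore_indice = sort_indice.argsort()

    x = x.index_select(dim=0, index=sort_indice).contiguous()
    y = y.index_select(dim=0, index=sort_indice).contiguous()

    return x, (y, sorted_lengths), restore_indice


def _restore_order_sketch(x_hat, restore_indice):
    # Undo the sorting so the outputs line up with the original mini-batch order.
    return x_hat.index_select(dim=0, index=restore_indice)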
def train(engine, mini_batch):
    # You have to reset the gradients of all model parameters
    # before taking another step of gradient descent.
    engine.model.train()
    if engine.state.iteration % engine.config.iteration_per_update == 1:
        engine.optimizer.zero_grad()

    device = next(engine.model.parameters()).device
    mini_batch.src = (mini_batch.src[0].to(device), mini_batch.src[1])
    mini_batch.tgt = (mini_batch.tgt[0].to(device), mini_batch.tgt[1])

    # The raw target variable has both BOS and EOS tokens.
    # The output of the sequence-to-sequence model does not have a BOS token.
    # Thus, remove the BOS token from the reference.
    x, y = mini_batch.src, mini_batch.tgt[0][:, 1:]
    # |x| = (batch_size, length)
    # |y| = (batch_size, length)

    with autocast():
        # Take the feed-forward pass.
        # As before, the decoder input should not contain the EOS token.
        # Thus, remove the EOS token from the decoder input.
        y_hat = engine.model(x, mini_batch.tgt[0][:, :-1])
        # |y_hat| = (batch_size, length, output_size)

        loss = engine.crit(
            y_hat.contiguous().view(-1, y_hat.size(-1)),
            y.contiguous().view(-1)
        )
        backward_target = loss.div(y.size(0)).div(engine.config.iteration_per_update)

    engine.scaler.scale(backward_target).backward()

    word_count = int(mini_batch.tgt[1].sum())
    p_norm = float(get_parameter_norm(engine.model.parameters()))
    g_norm = float(get_grad_norm(engine.model.parameters()))

    if engine.state.iteration % engine.config.iteration_per_update == 0:
        # In order to avoid gradient explosion, we apply gradient clipping.
        torch_utils.clip_grad_norm_(
            engine.model.parameters(),
            engine.config.max_grad_norm,
        )
        # Take a step of gradient descent.
        # Use the scaler instead of engine.optimizer.step().
        engine.scaler.step(engine.optimizer)
        engine.scaler.update()

        if engine.config.use_noam_decay and engine.lr_scheduler is not None:
            engine.lr_scheduler.step()

    loss = float(loss / word_count)
    ppl = np.exp(loss)

    return {
        'loss': loss,
        'ppl': ppl,
        '|param|': p_norm if not np.isnan(p_norm) and not np.isinf(p_norm) else 0.,
        '|g_param|': g_norm if not np.isnan(g_norm) and not np.isinf(g_norm) else 0.,
    }
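# The 'use_noam_decay' branch above steps engine.lr_scheduler once per parameter
# update. A minimal sketch, assuming the inverse-square-root ("Noam") schedule from
# the Transformer paper implemented with LambdaLR; the project's actual scheduler
# and hyper-parameter names may differ.
import torch


def get_noam_scheduler_sketch(optimizer, d_model=512, warmup_steps=4000):
    def noam_factor(step):
        step = max(step, 1)  # avoid raising 0 to a negative power on the first call
        return (d_model ** -0.5) * min(step ** -0.5, step * warmup_steps ** -1.5)

    # The optimizer's base learning rate acts as a multiplier on top of this factor.
    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=noam_factor)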
def train(engine, mini_batch):
    # You have to reset the gradients of all model parameters
    # before taking another step of gradient descent.
    engine.model.train()
    engine.optimizer.zero_grad()

    device = next(engine.model.parameters()).device
    mini_batch.src = (mini_batch.src[0].to(device), mini_batch.src[1])
    mini_batch.tgt = (mini_batch.tgt[0].to(device), mini_batch.tgt[1])

    # If 'is_src_target' is True, the trainer trains a language model for the source language.
    # For the DSL case, both src and tgt have BOS and EOS tokens.
    # Thus, drop the EOS token from the input and the BOS token from the target before training.
    x = mini_batch.src[0][:, :-1] if engine.is_src_target else mini_batch.tgt[0][:, :-1]
    y = mini_batch.src[0][:, 1:] if engine.is_src_target else mini_batch.tgt[0][:, 1:]
    # |x| = |y| = (batch_size, length)

    with autocast(not engine.config.off_autocast):
        y_hat = engine.model(x)
        # |y_hat| = (batch_size, length, output_size)

        loss = engine.crit(
            y_hat.contiguous().view(-1, y_hat.size(-1)),
            y.contiguous().view(-1),
        ).sum()
        backward_target = loss.div(y.size(0))

    if engine.config.gpu_id >= 0 and not engine.config.off_autocast:
        engine.scaler.scale(backward_target).backward()
    else:
        backward_target.backward()

    word_count = int(mini_batch.src[1].sum()) if engine.is_src_target else int(mini_batch.tgt[1].sum())
    p_norm = float(get_parameter_norm(engine.model.parameters()))
    g_norm = float(get_grad_norm(engine.model.parameters()))

    # In order to avoid gradient explosion, we apply gradient clipping.
    torch_utils.clip_grad_norm_(
        engine.model.parameters(),
        engine.config.max_grad_norm,
    )
    # Take a step of gradient descent.
    if engine.config.gpu_id >= 0 and not engine.config.off_autocast:
        # Use the scaler instead of engine.optimizer.step() when using a GPU.
        engine.scaler.step(engine.optimizer)
        engine.scaler.update()
    else:
        engine.optimizer.step()

    loss = float(loss / word_count)
    ppl = np.exp(loss)

    return {
        'loss': loss,
        'ppl': ppl,
        '|param|': p_norm if not np.isnan(p_norm) and not np.isinf(p_norm) else 0.,
        '|g_param|': g_norm if not np.isnan(g_norm) and not np.isinf(g_norm) else 0.,
    }
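# get_parameter_norm and get_grad_norm are small utilities imported from elsewhere
# in the project; they are only logged, never back-propagated. A plausible minimal
# sketch (an L2 norm over all parameters / gradients; the project's version may
# differ in details such as norm type or error handling):
def get_parameter_norm(parameters, norm_type=2.):
    # Treat all parameter tensors as one long vector and return its norm.
    total_norm = 0.
    for p in parameters:
        total_norm += float(p.data.norm(norm_type) ** norm_type)
    return total_norm ** (1. / norm_type)


def get_grad_norm(parameters, norm_type=2.):
    # Same as above, but over gradients; parameters without gradients are skipped.
    total_norm = 0.
    for p in parameters:
        if p.grad is not None:
            total_norm += float(p.grad.data.norm(norm_type) ** norm_type)
    return total_norm ** (1. / norm_type)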
def train(engine, mini_batch):
    # You have to reset the gradients of all model parameters
    # before taking another step of gradient descent.
    engine.model.train()
    if engine.state.iteration % engine.config.iteration_per_update == 1 or \
       engine.config.iteration_per_update == 1:
        engine.optimizer.zero_grad()

    device = next(engine.model.parameters()).device
    mini_batch.src = (mini_batch.src[0].to(device), mini_batch.src[1])
    mini_batch.tgt = (mini_batch.tgt[0].to(device), mini_batch.tgt[1])

    # The raw target variable has both BOS and EOS tokens.
    # The output of the sequence-to-sequence model does not have a BOS token.
    # Thus, remove the BOS token from the reference.
    x, y = mini_batch.src, mini_batch.tgt[0][:, 1:]
    # |x| = (batch_size, length)
    # |y| = (batch_size, length)

    # Take a sampling process, since 'is_greedy' is set to False.
    y_hat, indice = engine.model.search(
        x,
        is_greedy=False,
        max_length=engine.config.max_length
    )

    with torch.no_grad():
        # Based on the result of sampling, get the reward.
        actor_reward = MinimumRiskTrainingEngine._get_reward(
            indice,
            y,
            n_gram=engine.config.rl_n_gram
        )
        # |y_hat| = (batch_size, length, output_size)
        # |indice| = (batch_size, length)
        # |actor_reward| = (batch_size)

        # Take as many samples as n_samples and average their rewards.
        # In practice, n_samples = 1 turned out to be enough.
        baseline = []
        for _ in range(engine.config.rl_n_samples):
            _, sampled_indice = engine.model.search(
                x,
                is_greedy=False,
                max_length=engine.config.max_length,
            )
            baseline += [
                MinimumRiskTrainingEngine._get_reward(
                    sampled_indice,
                    y,
                    n_gram=engine.config.rl_n_gram,
                )
            ]
        baseline = torch.stack(baseline).mean(dim=0)
        # |baseline| = (n_samples, batch_size) --> (batch_size)

        # Now we have a relative expected cumulative reward:
        # the risk is the baseline reward minus the actor's reward.
        risk = (-actor_reward) - (-baseline)
        # |risk| = (batch_size)

    # Calculate gradients with back-propagation.
    loss = MinimumRiskTrainingEngine._get_loss(y_hat, indice, risk=risk)
    backward_target = loss.div(y.size(0)).div(engine.config.iteration_per_update)
    backward_target.backward()

    p_norm = float(get_parameter_norm(engine.model.parameters()))
    g_norm = float(get_grad_norm(engine.model.parameters()))

    if engine.state.iteration % engine.config.iteration_per_update == 0:
        # In order to avoid gradient explosion, we apply gradient clipping.
        torch_utils.clip_grad_norm_(
            engine.model.parameters(),
            engine.config.max_grad_norm,
        )
        # Take a step of gradient descent.
        engine.optimizer.step()

    return {
        'actor': float(actor_reward.mean()),
        'baseline': float(baseline.mean()),
        'risk': float(risk.mean()),
        '|param|': p_norm if not np.isnan(p_norm) and not np.isinf(p_norm) else 0.,
        '|g_param|': g_norm if not np.isnan(g_norm) and not np.isinf(g_norm) else 0.,
    }
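# _get_reward and _get_loss are class helpers that are not shown here. As a hedged
# sketch of how such a minimum-risk (REINFORCE-style) loss is commonly formed --
# scale each sample's summed log-probability by its risk -- assuming y_hat holds
# log-probabilities and ignoring padding for brevity. The _sketch name is illustrative.
def _get_loss_sketch(y_hat, indice, risk):
    # |y_hat|  = (batch_size, length, output_size), log-probabilities over the vocabulary
    # |indice| = (batch_size, length), sampled token indices
    # |risk|   = (batch_size,)
    log_prob = y_hat.gather(dim=-1, index=indice.unsqueeze(-1)).squeeze(-1)
    # |log_prob| = (batch_size, length)

    # Weight each sample's log-probability by its (detached) risk, so that
    # minimizing the loss lowers the probability of high-risk samples and raises
    # the probability of samples that beat the baseline.
    loss = (log_prob.sum(dim=-1) * risk.detach()).sum()
    # |loss| = ()
    return loss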
def train(engine, mini_batch):
    # You have to reset the gradients of all model parameters
    # before taking another step of gradient descent.
    engine.model.train()
    if engine.state.iteration % engine.config.iteration_per_update == 1 or \
       engine.config.iteration_per_update == 1:
        if engine.state.iteration > 1:
            engine.optimizer.zero_grad()

    device = next(engine.model.parameters()).device
    mini_batch.src = (mini_batch.src[0].to(device), mini_batch.src[1])
    mini_batch.tgt = (mini_batch.tgt[0].to(device), mini_batch.tgt[1])

    # The raw target variable has both BOS and EOS tokens.
    # The output of the sequence-to-sequence model does not have a BOS token.
    # Thus, remove the BOS token from the reference.
    x, y = mini_batch.src, mini_batch.tgt[0][:, 1:]
    # |x| = (batch_size, length)
    # |y| = (batch_size, length)

    # Run the forward pass under autocast for memory-efficient mixed-precision training.
    with autocast(not engine.config.off_autocast):
        # |y_hat| = (batch_size, length_m, output_size)
        # For the decoder input (tgt), remove the EOS token at the end.
        y_hat = engine.model(x, mini_batch.tgt[0][:, :-1])
        # |y_hat| = (batch_size, length, output_size)

        '''
        Reshape the tensors as follows to compute the loss.
        Think of it as laying out every word of every sentence in order.
        Before (3D): y_hat = (batch_size, length_m, output_size)
                     y     = (batch_size, length_m)
        After  (2D): y_hat = (batch_size * length_m, output_size)
                     y     = (batch_size * length_m)
        '''
        loss = engine.crit(
            y_hat.contiguous().view(-1, y_hat.size(-1)),
            y.contiguous().view(-1)
        )
        '''
        div(y.size(0)): after computing the loss, divide it by the batch size.
        div(engine.config.iteration_per_update): divide in advance for gradient accumulation.
        In other words, backward_target is the loss value that is actually back-propagated.
        '''
        backward_target = loss.div(y.size(0)).div(engine.config.iteration_per_update)

    if engine.config.gpu_id >= 0 and not engine.config.off_autocast:
        engine.scaler.scale(backward_target).backward()
    else:
        backward_target.backward()

    word_count = int(mini_batch.tgt[1].sum())
    p_norm = float(get_parameter_norm(engine.model.parameters()))
    g_norm = float(get_grad_norm(engine.model.parameters()))

    if engine.state.iteration % engine.config.iteration_per_update == 0 and \
       engine.state.iteration > 0:
        '''
        Gradient clipping:
        the more time steps a sequence has, the larger the gradient can become.
        Clipping keeps an overly large g_norm from moving the parameters too far.
        (With Adam, this is said to be less necessary.)
        '''
        torch_utils.clip_grad_norm_(
            engine.model.parameters(),
            engine.config.max_grad_norm,
        )
        # Take a step of gradient descent.
        if engine.config.gpu_id >= 0 and not engine.config.off_autocast:
            # When using a GPU, step through the scaler instead of the usual optimizer.step().
            engine.scaler.step(engine.optimizer)
            engine.scaler.update()
        else:
            engine.optimizer.step()

    loss = float(loss / word_count)
    ppl = np.exp(loss)

    return {
        'loss': loss,
        'ppl': ppl,
        '|param|': p_norm if not np.isnan(p_norm) and not np.isinf(p_norm) else 0.,
        '|g_param|': g_norm if not np.isnan(g_norm) and not np.isinf(g_norm) else 0.,
    }
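# Every train() above follows ignite's process-function signature: it receives the
# engine and one mini-batch and returns a dict of metrics. A hedged sketch of how
# such a function might be wired into a trainer (the project uses its own Engine
# subclasses that carry model, crit, optimizer, scaler and config as attributes;
# build_trainer_sketch and its arguments are illustrative only):
from ignite.engine import Engine
from ignite.metrics import RunningAverage


def build_trainer_sketch(process_fn, train_loader, n_epochs=1):
    trainer = Engine(process_fn)

    # Track a running average of the per-iteration 'loss' value returned by train().
    RunningAverage(output_transform=lambda out: out['loss']).attach(trainer, 'loss')

    trainer.run(train_loader, max_epochs=n_epochs)
    return trainer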