def validate_step(model, valid_dl, criterion):
    """Run one full validation pass over ``valid_dl``.

    Returns a ``(mean_loss, mean_accuracy)`` tuple, both averaged over the
    number of labels actually evaluated.
    """
    model.eval()
    # Fresh recurrent state for the validation pass.
    model.hidden = model.init_hidden()

    total_acc = 0.0
    total_loss = 0.0
    total = 0.0
    for i, (test_inputs, test_labels) in tqdm_notebook(enumerate(valid_dl),
                                                       desc='Validation',
                                                       total=len(valid_dl)):
        test_inputs, test_labels = to_var(test_inputs, True), to_var(test_labels, True)
        # Skip a ragged final batch: the hidden state is sized for a full batch.
        if len(test_labels) < valid_dl.batch_size:
            continue

        output = model(test_inputs.t())
        loss = criterion(output, test_labels)

        # Accumulate accuracy / loss for this batch.
        _, predicted = torch.max(output.data, 1)
        total_acc += (predicted == test_labels.data).sum()
        total_loss += loss.data[0]
        total += len(test_labels)

        # Cut the autograd history between batches (truncated BPTT).
        model.hidden = detach(model.hidden)

    return total_loss / total, total_acc / total
def validate_epoch(epoch, model, val_ids, criterion, num_epochs, batch_size, seq_length):
    """Evaluate a language model on ``val_ids`` for one epoch.

    Args:
        epoch: current epoch index (0-based, used only for the progress line).
        model: recurrent model returning ``(outputs, states)``.
        val_ids: id tensor of shape ``(batch_size, total_len)`` — TODO confirm.
        criterion: loss over flattened targets.
        num_epochs, batch_size, seq_length: schedule / shape parameters.

    Returns:
        The running-mean validation loss.
    """
    model.eval()
    states = model.init_hidden(batch_size)
    num_batches = val_ids.size(1) // seq_length
    val_loss = 0.0
    val_acc = 0.0
    for i in range(0, val_ids.size(1) - seq_length, seq_length):
        inputs = to_var(val_ids[:, i:i + seq_length], volatile=True)
        targets = to_var(val_ids[:, (i + 1):(i + 1) + seq_length].contiguous())
        # Forward; detach truncates the BPTT graph between windows.
        states = detach(states)
        outputs, states = model(inputs, states)
        # BUG FIX: the incremental mean previously used ``i`` (which advances
        # by seq_length) as the sample count, over-weighting early batches.
        # Weight by the batch index instead.
        step = i // seq_length
        # accuracy
        _, predictions = torch.max(outputs, dim=1)
        acc = torch.mean((predictions == targets.view(-1)).float())
        val_acc = (val_acc * step + acc.data[0]) / (step + 1)
        # loss
        loss = criterion(outputs, targets.view(-1))
        val_loss = (val_loss * step + loss.data[0]) / (step + 1)
        # report
        sys.stdout.flush()
        sys.stdout.write(
            '\rValidation: Epoch [%d/%d], Step [%d/%d], Loss: %.3f, Perp: %.2f, Acc: %-15.2f' %
            (epoch + 1, num_epochs, step + 1, num_batches, val_loss, np.exp(val_loss), val_acc))
    return val_loss
def evaluate(self, source, dag, name, batch_size=1, max_num=None):
    """Evaluate on the validation set.

    NOTE(brendan): We should not be using the test set to develop the
    algorithm (basic machine learning good practices).

    Args:
        source: token id sequence to evaluate on.
        dag: architecture sampled from the controller.
        name: tag used for the TensorBoard summaries.
        batch_size: evaluation batch size.
        max_num: optional cap on the number of BPTT windows taken from
            ``source``; ``None`` means use all of it.
    """
    self.shared.eval()
    self.controller.eval()

    # BUG FIX: the default ``max_num=None`` previously raised TypeError on
    # ``None * self.max_length``; treat None as "use the whole source".
    data = source if max_num is None else source[:max_num * self.max_length]

    total_loss = 0
    hidden = self.shared.init_hidden(batch_size)

    pbar = range(0, data.size(0) - 1, self.max_length)
    for count, idx in enumerate(pbar):
        inputs, targets = self.get_batch(data, idx, volatile=True)
        output, hidden, _ = self.shared(inputs, dag, hidden=hidden, is_train=False)
        output_flat = output.view(-1, self.dataset.num_tokens)
        total_loss += len(inputs) * self.ce(output_flat, targets).data
        # Truncate BPTT between windows.
        hidden = utils.detach(hidden)
        # (dead in-loop perplexity estimate removed; it was always
        # overwritten by the final computation below)

    val_loss = utils.to_item(total_loss) / len(data)
    ppl = math.exp(val_loss)

    self.tb.scalar_summary(f'eval/{name}_loss', val_loss, self.epoch)
    self.tb.scalar_summary(f'eval/{name}_ppl', ppl, self.epoch)
    logger.info(f'eval | loss: {val_loss:8.2f} | ppl: {ppl:8.2f}')
def evaluate(self, source, dag, name, batch_size=1, max_num=None):
    """Evaluate on the validation set.

    Args:
        source: data source (image dataset or token id sequence).
        dag: architecture sampled from the controller.
        name: tag used for the TensorBoard summaries.
        batch_size: evaluation batch size.
        max_num: optional cap on the number of BPTT windows taken from a
            sequential ``source``; ``None`` means use all of it (ignored
            for image datasets).
    """
    self.shared.eval()
    self.controller.eval()

    if self.image_dataset or max_num is None:
        # BUG FIX: the default ``max_num=None`` previously raised TypeError
        # on ``None * self.max_length`` for sequential data.
        data = source
    else:
        data = source[:max_num * self.max_length]

    total_loss = 0
    # NOTE(review): the sibling evaluate() uses init_hidden here — confirm
    # init_training is intentional for this variant.
    hidden = self.shared.init_training(batch_size)

    pbar = range(0, self.valid_data_size - 1, self.max_length)
    for count, idx in enumerate(pbar):
        inputs, targets = self.get_batch(data, idx, volatile=True)
        output, hidden, _ = self.shared(inputs, dag, hidden=hidden, is_train=False)
        output_flat = output.view(-1, self.dataset.num_classes)
        total_loss += len(inputs) * self.ce(output_flat, targets).data
        # Truncate BPTT between windows.
        hidden = utils.detach(hidden)
        # (dead in-loop perplexity estimate removed; it was always
        # overwritten by the final computation below)

    val_loss = utils.to_item(total_loss) / len(data)
    ppl = math.exp(val_loss)

    self.tb.scalar_summary(f'eval/{name}_loss', val_loss, self.epoch)
    self.tb.scalar_summary(f'eval/{name}_ppl', ppl, self.epoch)
    logger.info(f'eval | loss: {val_loss:8.2f} | ppl: {ppl:8.2f}')
def evaluate(dataloader: DataLoader, model: RNN,
             loss_function: Union[SplitCrossEntropyLoss, CrossEntropyLoss],
             only_l: Union[torch.Tensor, int] = None,
             device: Union[torch.device, str] = 'cpu', **kwargs):
    """Evaluate ``model`` over every batch of ``dataloader``.

    Accumulates a per-language loss (length-weighted), then averages each by
    its language's dataset size. Returns ``(mean_over_languages, avg_loss)``
    where ``avg_loss`` maps language id -> average loss.
    """
    model.eval()

    languages = dataloader.dataset.data.keys()
    if only_l:
        if only_l not in languages:
            raise ValueError(f'Language {only_l} does not exist in the dataset')
        local_losses = {only_l: 0}
    else:
        local_losses = {lang: 0 for lang in languages}

    batch = 0
    prev_lang = ""
    with tqdm(dataloader, total=len(dataloader)) as pbar:
        for data, targets, seq_len, lang in pbar:
            data = data.squeeze(0).to(device)
            targets = targets.squeeze(0).to(device)
            lang = lang.to(device)

            # When restricted to a single language, skip every other batch.
            if only_l and only_l != lang:
                continue

            if prev_lang != lang:
                # New language: start from a fresh hidden state.
                prev_lang = lang
                hidden = model.init_hidden(batchsize=data.size(-1))
            else:
                # NOTE(review): the return value of detach() is discarded
                # here; if detach is not in-place this is a no-op — confirm.
                detach(hidden)

            with torch.no_grad():
                output, hidden = model(data, hidden, lang)
                if isinstance(loss_function, SplitCrossEntropyLoss):
                    loss = loss_function(model.decoder.weight, model.decoder.bias, output, targets)
                else:
                    loss = loss_function(output, targets)

            # Weight the batch loss by its token count.
            local_losses[lang.item()] += len(data) * loss.data

            batch += 1
            pbar.set_description('Evaluation, finished batch {} | loss {}'.format(batch, loss.data))

    if only_l is None:
        avg_loss = {lang: local_losses[lang].item() / len(dataloader.dataset.data[lang])
                    for lang in languages}
    else:
        avg_loss = {only_l: local_losses[only_l].item() / len(dataloader.dataset.data[only_l])}
    total_loss = sum(avg_loss.values())

    return total_loss / len(languages), avg_loss
def _run_shared_one_batch(inputs, targets, hidden, dags, raw_total_loss):
    """Forward one shared-model batch for the sampled ``dags``.

    Returns ``(loss, sample_loss, rest_loss, hidden, extra_out,
    raw_total_loss)`` with the hidden state detached and the penalty terms
    folded into both ``loss`` and ``rest_loss``.
    """
    loss, sample_loss, rest_loss, hidden, extra_out = self.get_loss(
        inputs, targets, dags, hidden=hidden)

    # The hidden state came from the previous window — detach it so
    # gradients do not flow back across window boundaries.
    hidden = utils.detach(hidden)

    # Track the raw (pre-penalty) loss, averaged over the iteration.
    raw_total_loss += sample_loss.data / self.args.num_batch_per_iter

    penalty_loss = _apply_penalties(extra_out, self.args)
    loss += penalty_loss
    rest_loss += penalty_loss

    return loss, sample_loss, rest_loss, hidden, extra_out, raw_total_loss
def evaluate(data_source, batch_size):
    """Compute the mean per-token loss over ``data_source`` (predict mode).

    See https://mxnet.incubator.apache.org/api/python/autograd/autograd.html#train-mode-and-predict-mode
    Returns ``(mean_loss, elapsed_seconds)``.
    """
    tic = time.time()
    loss_total = 0
    token_count = 0
    states = model.begin_state(batch_size, ctx=ctxs[0])
    for cursor in range(0, data_source.shape[0] - 1, args.bptt):
        Xs, Ys = get_batch(data_source, cursor, args)
        # MXNet runs in predict_mode by default outside autograd.record().
        # state shape: (num_layers, bsz, hidden_size)
        output, states, _, _ = model(Xs, states)
        states = detach(states)
        # loss has shape (seq_len,)
        loss_total += nd.sum(batch_size * loss(output, Ys)).asscalar()
        token_count += batch_size * len(output)
    return (loss_total / token_count), time.time() - tic
def train_epoch(epoch, model, trn_ids, criterion, optimizer, scheduler, num_epochs, batch_size, seq_length):
    """Train a language model on ``trn_ids`` for one epoch.

    Args:
        epoch: current epoch index (0-based, used only for the progress line).
        model: recurrent model returning ``(outputs, states)``.
        trn_ids: id tensor of shape ``(batch_size, total_len)`` — TODO confirm.
        criterion / optimizer / scheduler: the usual training triple.
        num_epochs, batch_size, seq_length: schedule / shape parameters.

    Returns:
        The running-mean training loss.
    """
    model.train()
    scheduler.step()
    states = model.init_hidden(batch_size)
    num_batches = trn_ids.size(1) // seq_length
    trn_loss = 0.0
    trn_acc = 0.0
    for i in range(0, trn_ids.size(1) - seq_length, seq_length):
        inputs = to_var(trn_ids[:, i:i + seq_length])
        targets = to_var(trn_ids[:, (i + 1):(i + 1) + seq_length].contiguous())
        # Forward; detach truncates the BPTT graph between windows.
        states = detach(states)
        outputs, states = model(inputs, states)
        # BUG FIX: the incremental mean previously used ``i`` (which advances
        # by seq_length) as the sample count, over-weighting early batches.
        # Weight by the batch index instead.
        step = i // seq_length
        # accuracy
        _, predictions = torch.max(outputs, dim=1)
        acc = torch.mean((predictions == targets.view(-1)).float())
        trn_acc = (trn_acc * step + acc.data[0]) / (step + 1)
        # loss
        loss = criterion(outputs, targets.view(-1))
        trn_loss = (trn_loss * step + loss.data[0]) / (step + 1)
        # backward with gradient clipping
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 0.3)
        optimizer.step()
        # report
        sys.stdout.flush()
        sys.stdout.write(
            '\rTraining: Epoch [%d/%d], Step [%d/%d], Loss: %.3f, Perp: %.2f, Acc: %-15.2f' %
            (epoch + 1, num_epochs, step + 1, num_batches, trn_loss, np.exp(trn_loss), trn_acc))
    return trn_loss
def train_step(model, train_dl, criterion, optimizer, scheduler):
    """Run one training epoch over ``train_dl``.

    Returns a ``(mean_loss, mean_accuracy)`` tuple, both averaged over the
    number of labels actually trained on.
    """
    model.train()
    scheduler.step()
    # Fresh recurrent state for the epoch.
    model.hidden = model.init_hidden()

    total_acc = 0.0
    total_loss = 0.0
    total = 0.0
    for i, (train_inputs, train_labels) in tqdm_notebook(enumerate(train_dl),
                                                         desc='Training',
                                                         total=len(train_dl)):
        train_inputs, train_labels = to_var(train_inputs), to_var(train_labels)
        # Skip a ragged final batch: the hidden state is sized for a full batch.
        if len(train_labels) < train_dl.batch_size:
            continue

        # Starting each batch, we detach the hidden state from how it was
        # previously produced. If we didn't, the model would try
        # backpropagating all the way to the start of the dataset.
        model.hidden = detach(model.hidden)
        model.zero_grad()

        output = model(train_inputs.t())
        loss = criterion(output, train_labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 0.3)
        optimizer.step()

        # Accumulate accuracy / loss for this batch.
        _, predicted = torch.max(output.data, 1)
        total_acc += (predicted == train_labels.data).sum()
        total_loss += loss.data[0]
        total += len(train_labels)

    return total_loss / total, total_acc / total
def train_shared(self, max_step=None):
    """Train the language model for 400 steps of minibatches of 64
    examples.

    Args:
        max_step: Used to run extra training steps as a warm-up.

    BPTT is truncated at 35 timesteps.

    For each weight update, gradients are estimated by sampling M models
    from the fixed controller policy, and averaging their gradients
    computed on a batch of training data.
    """
    model = self.shared
    model.train()
    self.controller.eval()

    hidden = self.shared.init_hidden(self.args.batch_size)

    if max_step is None:
        max_step = self.args.shared_max_step
    else:
        max_step = min(self.args.shared_max_step, max_step)

    abs_max_grad = 0
    abs_max_hidden_norm = 0
    step = 0
    raw_total_loss = 0
    total_loss = 0
    train_idx = 0
    # Iterate the dataloader directly; the old index-based while loop and
    # its leftover debug print were removed (dead/debug code).
    for _, (inputs, targets) in enumerate(self.train_data):
        if step > max_step:
            break

        dags = self.controller.sample(self.args.shared_num_sample)
        inputs, targets = prep_batch(inputs, targets)

        loss, hidden, extra_out = self.get_loss(inputs, targets, hidden, dags)
        # Truncate BPTT: stop gradients at the window boundary.
        hidden = utils.detach(hidden)
        raw_total_loss += loss.data

        # should only be for RNNs
        loss += _apply_penalties(extra_out, self.args)

        # update
        self.shared_optim.zero_grad()
        loss.backward()

        # Track the largest hidden-state norm seen so far (diagnostic).
        h1tohT = extra_out['hiddens']
        new_abs_max_hidden_norm = utils.to_item(h1tohT.norm(dim=-1).data.max())
        if new_abs_max_hidden_norm > abs_max_hidden_norm:
            abs_max_hidden_norm = new_abs_max_hidden_norm
            logger.info(f'max hidden {abs_max_hidden_norm}')

        abs_max_grad = _check_abs_max_grad(abs_max_grad, model)
        torch.nn.utils.clip_grad_norm(model.parameters(), self.args.shared_grad_clip)
        self.shared_optim.step()

        total_loss += loss.data

        if ((step % self.args.log_step) == 0) and (step > 0):
            self._summarize_shared_train(total_loss, raw_total_loss)
            raw_total_loss = 0
            total_loss = 0

        step += 1
        self.shared_step += 1
        train_idx += self.max_length
def train_shared(self, max_step=None):
    """Train the language model for 400 steps of minibatches of 64
    examples.

    Args:
        max_step: Used to run extra training steps as a warm-up.

    BPTT is truncated at 35 timesteps.

    For each weight update, gradients are estimated by sampling M models
    from the fixed controller policy, and averaging their gradients
    computed on a batch of training data.
    """
    model = self.shared
    model.train()
    self.controller.eval()

    hidden = self.shared.init_hidden(self.args.batch_size)

    if max_step is None:
        max_step = self.args.shared_max_step
    else:
        max_step = min(self.args.shared_max_step, max_step)

    abs_max_grad = 0
    abs_max_hidden_norm = 0
    step = 0
    raw_total_loss = 0
    total_loss = 0
    train_idx = 0
    # NOTE(brendan): The - 1 - 1 here is because each example should
    # include at least one (x_t, y_{t + 1}) sequence, since y_{t + 1} is
    # predicted from x_t.
    while train_idx < self.train_data.size(0) - 1 - 1:
        # Occasionally halve the BPTT window, then jitter the sequence
        # length around it (never below 5 timesteps).
        bptt = self.max_length
        if np.random.random() >= 0.95:
            bptt /= 2.
        seq_len = max(5, int(np.random.normal(bptt, 5)))

        # Rescale the learning rate in proportion to the sampled length,
        # restoring it after the update.
        saved_lr = self.shared_optim.param_groups[0]['lr']
        self.shared_optim.param_groups[0]['lr'] = saved_lr * seq_len / bptt

        dags = self.controller.sample(self.args.shared_num_sample)
        inputs, targets = self.get_batch(self.train_data, train_idx, seq_len)

        loss, hidden, extra_out = self.get_loss(inputs, targets, hidden, dags)
        # Truncate BPTT: stop gradients at the window boundary.
        hidden = utils.detach(hidden)
        raw_total_loss += loss.data.squeeze()

        loss += _apply_penalties(extra_out, self.args)

        # update
        self.shared_optim.zero_grad()
        loss.backward()

        abs_max_hidden_norm = _check_max_hidden(abs_max_hidden_norm,
                                                extra_out['hiddens'])
        abs_max_grad = _check_abs_max_grad(abs_max_grad, model)
        torch.nn.utils.clip_grad_norm(model.parameters(),
                                      self.args.shared_grad_clip)
        self.shared_optim.step()

        total_loss += loss.data.squeeze()
        self.shared_optim.param_groups[0]['lr'] = saved_lr

        if ((step % self.args.log_step) == 0) and (step > 0):
            self._summarize_shared_train(total_loss, raw_total_loss)
            raw_total_loss = 0
            total_loss = 0

        step += 1
        self.shared_step += 1
        train_idx += seq_len
def train(dataloader: DataLoader, model: RNN, optimizer: torch.optim.Optimizer,
          loss_function: Union[SplitCrossEntropyLoss, CrossEntropyLoss], use_apex=False,
          amp=None, lr_weights: dict = None, prior: str = 'ninf', scaling: str = None,
          total_steps: int = 0, steps: int = 0, bptt: int = 125, alpha: float = 0.,
          beta: float = 0., log_interval: int = 200, n_samples: int = 4,
          device: Union[torch.device, str] = 'cpu', tb_writer=None, **kwargs):
    """Run one training epoch over ``dataloader``.

    Supports an optional variational prior (multi-sample forward + KL term
    with several annealing schedules), AR/TAR regularization, apex mixed
    precision, and per-language learning-rate weighting. Returns the
    updated global step counter.
    """
    total_loss = 0
    batch = 0
    tr_kl = 0.
    logging_kl = 0.
    tr_loss = 0.
    logging_loss = 0.

    model.train()
    log.info('Starting training loop')
    start_time = time.time()
    with tqdm(dataloader, total=len(dataloader)) as pbar:
        for data, targets, seq_len, lang in pbar:
            data = data.squeeze(0).to(device)
            targets = targets.squeeze(0).to(device)
            lang = lang.to(device)

            hidden = model.init_hidden(batchsize=data.size(-1))

            # Scale the LR by the actual sequence length (and an optional
            # per-language weight); the base value is restored below.
            lr2 = optimizer.param_groups[0]['lr']
            if lr_weights is not None:
                optimizer.param_groups[0]['lr'] = lr2 * seq_len.item() / bptt * lr_weights[lang.item()]
            else:
                optimizer.param_groups[0]['lr'] = lr2 * seq_len.item() / bptt

            hidden = detach(hidden)
            optimizer.zero_grad()

            loss = 0
            # Multi-sample forward only makes sense with a variational prior.
            if not isinstance(prior, VIPrior):
                n_samples = 1
            for _ in range(n_samples):
                output, hidden, rnn_hs, dropped_rnn_hs = model(data, hidden, lang, return_h=True)
                if isinstance(loss_function, SplitCrossEntropyLoss):
                    raw_loss = loss_function(model.decoder.weight, model.decoder.bias, output, targets)
                else:
                    raw_loss = loss_function(output, targets)

                # Activation Regularization (AR)
                if alpha:
                    raw_loss = raw_loss + sum(
                        alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
                # Temporal Activation Regularization (slowness)
                if beta:
                    raw_loss = raw_loss + sum(
                        beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
                loss += raw_loss
            loss /= n_samples

            log_loss = loss

            if isinstance(prior, VIPrior):
                # Add the (annealed) KL divergence of the variational prior.
                kl_term = prior.kl_div()
                if scaling == "uniform":
                    scale = 1. / total_steps
                elif scaling == "linear_annealing":
                    scale = ((total_steps - steps - 1) * 2. + 1.) / total_steps ** 2
                elif scaling == "logistic_annealing":
                    steepness = 0.0025
                    scale = 1. / (1 + np.exp(-steepness * (steps - total_steps / 2.)))
                else:
                    scale = 1.
                loss = loss + scale * kl_term
                tr_kl += kl_term.item()

            if use_apex:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if tb_writer is not None:
                tb_writer.add_scalar('train/loss', log_loss.item(), steps)
                if isinstance(prior, VIPrior):
                    tb_writer.add_scalar('train/kl', kl_term.item(), steps)
                    tb_writer.add_scalar('train/loss+kl', loss.item(), steps)

            logging_kl += tr_kl
            logging_loss += tr_loss

            optimizer.step()

            total_loss += raw_loss.data
            batch += 1
            steps += 1

            # reset lr to optimiser default
            optimizer.param_groups[0]['lr'] = lr2

            if batch % log_interval == 0 and batch > 0:
                cur_loss = total_loss.item() / log_interval
                elapsed = time.time() - start_time
                log.debug(
                    '| {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                        batch, len(dataloader), optimizer.param_groups[0]['lr'],
                        elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss),
                        cur_loss / math.log(2)))
                total_loss = 0
                start_time = time.time()

            pbar.set_description('Training, end of batch {} | Loss {}'.format(batch, loss.data))

    return steps
def train_one_epoch(epoch, cur_lr):
    """Train all the batches within one epoch.

    ``cur_lr`` is the base learning rate; it is rescaled per batch to
    account for the randomly sampled BPTT length.
    """
    total_loss = 0
    states = [model.begin_state(batch_size=m, ctx=ctx) for ctx in ctxs]

    batch, cursor = 0, 0
    tic_log_interval = time.time()
    while cursor < train_data.shape[0] - 1 - 1:
        #######################################################################
        # Variable-length BPTT, as in the original paper: occasionally halve
        # the window, then draw the actual length from a normal distribution.
        random_bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # (mean, std): prevent extreme sequence lengths.
        seq_len = max(5, int(np.random.normal(random_bptt, 5)))
        # Guard against a rare very long sample (OOM risk).
        seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)
        # Rescale learning rate in proportion to the sampled length vs bptt.
        trainer.set_learning_rate(cur_lr * seq_len / args.bptt)
        ########################################################################

        # Each batch has shape (seq_len, batch_size); split along the
        # batch axis so every device gets m samples.
        Xs, Ys = get_batch(train_data, cursor, args, seq_len=seq_len)
        assert args.batch_size == Xs.shape[
            1], 'data shape[1] should be batch_size'
        Xs = gluon.utils.split_and_load(Xs, ctxs, 1)
        Ys = gluon.utils.split_and_load(Ys, ctxs, 1)

        tic_b = time.time()
        # Starting each batch, we detach the hidden state from how it was
        # previously produced. If we didn't, the model would try
        # backpropagating all the way to start of the dataset.
        states = detach(states)

        loss_list = []
        with autograd.record():  # train_mode
            for i, X in enumerate(Xs):
                # state shape: (num_layers, bsz, hidden_size)
                output, states[i], encoded_raw, encoded_dropped = model(X, states[i])
                device_loss = joint_loss(output, Ys[i], encoded_raw, encoded_dropped)
                loss_list.append(device_loss.as_in_context(ctxs[0]) / X.size)
        for l in loss_list:
            l.backward()

        # Gradient clipping requires the manual allreduce/update split, so
        # we gather per-device grads, clip globally, then step.
        grads = [p.grad(ctx) for ctx in ctxs for p in parameters]
        gluon.utils.clip_global_norm(grads, args.clipping_theta)
        trainer.step(1)

        batch_loss = sum([nd.sum(l).asscalar() for l in loss_list]) / len(ctxs)
        toc_b = time.time()
        batch_info.append([
            epoch, batch, trainer.learning_rate, seq_len,
            (toc_b - tic_b) * 1000,
            args.batch_size * seq_len // (toc_b - tic_b), batch_loss,
            math.exp(batch_loss)
        ])
        total_loss += batch_loss

        if batch % args.log_interval == 0 and batch > 0:
            utils.save_info(batch_info, batch_file)
            toc_log_interval = time.time()
            total_loss = total_loss / args.log_interval
            logging.info(
                '| epoch {:4d} ({:5.2f}%)| batch {:4d} | lr {:7.4f} | seq_len {:2d} | {:4.0f} ms/batch | '
                '{:5d} tokens/s | loss {:6.3f} | ppl {:5.2f}'.format(
                    epoch, cursor / train_data.shape[0] * 100, batch,
                    trainer.learning_rate, seq_len,
                    (toc_log_interval - tic_log_interval) * 1000 / args.log_interval,
                    int(args.batch_size * args.log_interval * seq_len /
                        (toc_log_interval - tic_log_interval)), total_loss,
                    math.exp(total_loss)))
            total_loss = 0
            tic_log_interval = time.time()

        batch += 1
        cursor += seq_len

        # Log the parameter count once, after the first forward pass has
        # materialized the deferred-initialized parameters.
        global parameters_count
        if not parameters_count:
            logging.info('Parameters (except embeding): {}'.format(
                sum(p.data(ctxs[0]).size for p in parameters)))
            parameters_count = 1
        nd.waitall()  # synchronize batch data