def batch_loss(self, batch_group: List[TensorDict], for_training: bool) -> torch.Tensor:
    """
    Does a forward pass on the given batches and returns the ``loss`` value in the result.
    If ``for_training`` is `True` also applies regularization penalty.

    This function has been modified for gradient accumulation and half precision training.
    The ability to do multi-GPU training has also been taken out since I never use it.

    Returns the loss accumulated over all mini-batches (or ``None`` when the model
    produced no ``"loss"`` key and we are not training).
    """
    # Single-GPU / single-batch only: multi-GPU support was removed, so the
    # group must contain exactly one batch.
    assert len(batch_group) == 1
    batch = batch_group[0]
    batch_size = training_util.get_batch_size(batch)
    accumulated_loss = 0
    # Walk the batch in slices of ``self._accumulation_steps`` instances,
    # calling backward() per slice so gradients accumulate before the caller
    # steps the optimizer.
    for start_idx in range(0, batch_size, self._accumulation_steps):
        mini_batch = self.get_mini_batch(batch, start_idx, self._accumulation_steps)
        mini_batch = nn_util.move_to_device(mini_batch, self._cuda_devices[0])
        mini_batch_size = training_util.get_batch_size(mini_batch)
        output_dict = self.model(**mini_batch)
        try:
            loss = output_dict["loss"]
            if for_training:
                # NOTE(review): ``+=`` on a tensor is in-place, so this also
                # mutates ``output_dict["loss"]`` — presumably harmless here,
                # but confirm no caller reads the un-penalized value.
                loss += self.model.get_regularization_penalty()
            # For Gradient Accumulation: scales loss by multiplying the mini_batch_size
            # and scaling down by the total batch size, so that the sum of the per-slice
            # gradients equals the gradient of the full-batch mean loss.
            gradient_accumulation_scaling_factor = mini_batch_size / batch_size
            loss *= gradient_accumulation_scaling_factor
            # NOTE(review): accumulated with the autograd graph still attached
            # and before the NaN check; callers appear to only read its value.
            accumulated_loss += loss
            if torch.isnan(loss):
                raise ValueError("nan loss encountered")
            if for_training:
                if self._half_precision:
                    # Apex AMP: scale the loss to avoid fp16 gradient underflow.
                    with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
        except KeyError:
            # The model produced no "loss" key: fatal when training, otherwise
            # signal "no loss" to the caller.
            if for_training:
                raise RuntimeError(
                    "The model you are trying to optimize does not contain a"
                    " 'loss' key in the output of model.forward(inputs).")
            return None
    return accumulated_loss
def log_batch(
    self,
    model: Model,
    optimizer: Optimizer,
    batch_grad_norm: Optional[float],
    metrics: Dict[str, float],
    batch_group: List[List[TensorDict]],
    param_updates: Optional[Dict[str, torch.Tensor]],
) -> None:
    """Perform whatever per-batch tensorboard logging is currently due."""
    if self.should_log_this_batch():
        # Scalar statistics: parameter/gradient stats, learning rates, loss,
        # and every metric under an "epoch_metrics/" prefix.
        self.log_parameter_and_gradient_statistics(model, batch_grad_norm)
        self.log_learning_rates(model, optimizer)
        self.add_train_scalar("loss/loss_train", metrics["loss"])
        prefixed = {"epoch_metrics/" + key: value for key, value in metrics.items()}
        self.log_metrics(prefixed)

    if self.should_log_histograms_this_batch():
        self.log_histograms(model)
        self.log_gradient_updates(model, param_updates)

    if not self._batch_size_interval:
        return

    # We're assuming here that `log_batch` will get called every batch, and only every
    # batch. This is true with our current usage of this code (version 1.0); if that
    # assumption becomes wrong, this code will break.
    batch_group_size = 0
    for batch in batch_group:
        batch_group_size += training_util.get_batch_size(batch)
    self._batches_this_epoch += 1
    self._cumulative_batch_group_size += batch_group_size
    if (self._batches_this_epoch - 1) % self._batch_size_interval == 0:
        average = self._cumulative_batch_group_size / self._batches_this_epoch
        logger.info(f"current batch size: {batch_group_size} mean batch size: {average}")
        self.add_train_scalar("current_batch_size", batch_group_size)
        self.add_train_scalar("mean_batch_size", average)
def log_batch(
    self,
    batch_grad_norm: Optional[float],
    metrics: Dict[str, float],
    batch_group: List[TensorDict],
    param_updates: Optional[Dict[str, torch.Tensor]],
    batch_number: int,
) -> None:
    """
    Called every batch to perform all of the logging that is due.
    """
    # batch_number is usually 1-indexed, so this resets the running total at
    # the start of each epoch.
    if batch_number <= 1:
        self._cumulative_batch_group_size = 0

    self.log_inputs(batch_group)

    if self._should_log_this_batch():
        if self._should_log_parameter_statistics:
            self._log_parameter_and_gradient_statistics(batch_grad_norm)
        if self._should_log_learning_rate:
            self._log_learning_rates()

        # Now collect per-batch metrics to log.
        metrics_to_log: Dict[str, float] = {}
        for key in ("batch_loss", "batch_reg_loss"):
            try:
                value = metrics[key]
            except KeyError:
                continue
            metrics_to_log[key] = value
            # Update and add moving average over the last
            # ``self._batch_loss_moving_average_count`` values.
            moving_sum = self._batch_loss_moving_sum
            moving_items = self._batch_loss_moving_items
            moving_sum[key] += value
            moving_items[key].append(value)
            if len(moving_items[key]) > self._batch_loss_moving_average_count:
                moving_sum[key] -= moving_items[key].popleft()
            metrics_to_log[f"{key}_mov_avg"] = moving_sum[key] / len(moving_items[key])

        self.log_scalars(metrics_to_log, log_prefix="train")

    if self._should_log_distributions_this_batch():
        assert param_updates is not None
        self._log_distributions()
        self._log_gradient_updates(param_updates)

    if self._batch_size_interval:
        # We're assuming here that `log_batch` will get called every batch, and only every
        # batch. This is true with our current usage of this code (version 1.0); if that
        # assumption becomes wrong, this code will break.
        batch_group_size = sum(get_batch_size(batch) for batch in batch_group)  # type: ignore
        self._cumulative_batch_group_size += batch_group_size
        if batch_number % self._batch_size_interval == 0:
            average = self._cumulative_batch_group_size / batch_number
            self.log_scalars(
                {"batch_size": batch_group_size, "mean_batch_size": average}, log_prefix="train"
            )
def forward(self,  # type: ignore
            task_index: torch.IntTensor,
            reverse: torch.ByteTensor,
            epoch_trained: torch.IntTensor,
            for_training: torch.ByteTensor,
            tokens: Dict[str, torch.LongTensor],
            label: torch.IntTensor = None,
            text_id: torch.IntTensor = None) -> Dict[str, torch.Tensor]:
    """
    Run the shared/private encoder, then the sentiment and domain
    discriminators, and combine their losses when ``label`` is given.
    When not training, appends per-instance softmax probabilities, line ids,
    and labels to ``class_probabilities.txt`` for offline inspection.
    """
    embeddeds = self._encoder(task_index, tokens, epoch_trained,
                              self._valid_discriminator, reverse, for_training, text_id)
    batch_size = get_batch_size(embeddeds["embedded_text"])
    sentiment_logits = self._sentiment_discriminator(embeddeds["embedded_text"])
    # Private-feature domain classifier (no gradient reversal here).
    p_domain_logits = self._p_domain_discriminator(embeddeds["private_embedding"])
    # TODO set reverse = true
    # Shared-feature domain classifier; ``reverse`` controls gradient reversal.
    s_domain_logits = self._s_domain_discriminator(embeddeds["share_embedding"], reverse=reverse)
    logits = [sentiment_logits, p_domain_logits, s_domain_logits]
    # domain_logits = self._domain_discriminator(embedded_text)
    output_dict = {'logits': sentiment_logits}
    if label is not None:
        loss = self._loss(sentiment_logits, label)
        # task_index = task_index.unsqueeze(0)
        # Broadcast the scalar task id to one target per instance.
        task_index = task_index.expand(batch_size)
        # targets = [label, label, label, task_index, task_index]
        # print(p_domain_logits.shape, task_index, task_index.shape)
        p_domain_loss = self._domain_loss(p_domain_logits, task_index)
        s_domain_loss = self._domain_loss(s_domain_logits, task_index)
        # logger.info("Share domain logits standard variation is {}",
        #             torch.mean(torch.std(F.softmax(s_domain_logits), dim=-1)))
        output_dict["tokens"] = tokens
        output_dict['stm_loss'] = loss
        output_dict['p_d_loss'] = p_domain_loss
        output_dict['s_d_loss'] = s_domain_loss
        # TODO add share domain logits std loss
        # Weighted sum: sentiment loss plus small adversarial domain terms.
        output_dict['loss'] = loss + 0.06 * p_domain_loss + 0.04 * s_domain_loss
        for (metric_name, metric) in zip(self.metrics.keys(), self.metrics.values()):
            # AUC-style metrics take decoded label predictions; the rest take raw logits.
            if "auc" in metric_name:
                metric(self.decode(output_dict)["label"], label)
                continue
            metric(sentiment_logits, label)
    # NOTE(review): leftover debug print — consider removing or demoting to logger.debug.
    print("for training", for_training)
    if not for_training:
        # NOTE(review): this dump assumes ``label`` and ``text_id`` are provided
        # at evaluation time — it will raise if either is None. Confirm callers.
        with open("class_probabilities.txt", "a", encoding="utf8") as f:
            f.write(f"Task: {TASKS_NAME[task_index[0].detach()]}\nLine ID: ")
            f.write(" ".join(list(map(str, text_id.cpu().detach().numpy()))))
            f.write("\nProb: ")
            f.write(" ".join(list(map(str, F.softmax(sentiment_logits, dim=-1).cpu().detach().numpy()))))
            f.write("\nLabel: " + " ".join(list(map(str, label.cpu().detach().numpy()))) + "\n")
            f.write("\n\n\n")
    return output_dict
def forward(self,
            task_index: torch.IntTensor,
            tokens: Dict[str, torch.LongTensor],
            epoch_trained: torch.IntTensor,
            valid_discriminator: Discriminator,
            reverse: torch.ByteTensor,
            for_training: torch.ByteTensor) -> Dict[str, torch.Tensor]:
    """
    Embed ``tokens``, optionally prepend the task's domain embedding as an
    extra first "token", and encode with both the shared and the private
    encoder. Returns the final (bidirectional) states of each encoder plus
    their concatenation under ``"embedded_text"``.
    """
    embedded_text_input = self._text_field_embedder(tokens)
    tokens_mask = util.get_text_field_mask(tokens)
    batch_size = get_batch_size(tokens)
    # TODO
    # NOTE(review): ``np.random.rand() < -1`` is always False (rand() is in
    # [0, 1)), so the perturbation branch below is currently dead code —
    # presumably disabled on purpose; confirm before re-enabling.
    if np.random.rand() < -1 and for_training.all():
        logger.info("Domain Embedding with Perturbation")
        domain_embeddings = self._domain_embeddings(
            torch.arange(0, len(TASKS_NAME)).cuda())
        domain_embedding = get_perturbation_domain_embedding(
            domain_embeddings, task_index, epoch_trained)
        # domain_embedding = FGSM(self._domain_embeddings, task_index, valid_discriminator)
        # "valid" flags whether the (unperturbed) embedding was used: 0 = perturbed.
        output_dict = {"valid": torch.tensor(0)}
    else:
        logger.info("Domain Embedding without Perturbation")
        domain_embedding = self._domain_embeddings(task_index)
        output_dict = {"valid": torch.tensor(1)}
    output_dict["domain_embedding"] = domain_embedding
    embedded_text_input = self._input_dropout(embedded_text_input)
    if self._with_domain_embedding:
        # Prepend the domain embedding as a synthetic first token and extend
        # the mask accordingly.
        domain_embedding = domain_embedding.expand(batch_size, 1, -1)
        embedded_text_input = torch.cat(
            (domain_embedding, embedded_text_input), 1)
        tokens_mask = torch.cat(
            [tokens_mask.new_ones(batch_size, 1), tokens_mask], 1)
    shared_encoded_text = self._shared_encoder(embedded_text_input, tokens_mask)
    # shared_encoded_text = self._seq2vec(shared_encoded_text, tokens_mask)
    shared_encoded_text = get_final_encoder_states(shared_encoded_text,
                                                   tokens_mask,
                                                   bidirectional=True)
    output_dict["share_embedding"] = shared_encoded_text
    private_encoded_text = self._private_encoder(embedded_text_input, tokens_mask)
    # private_encoded_text = self._seq2vec(private_encoded_text)
    private_encoded_text = get_final_encoder_states(private_encoded_text,
                                                    tokens_mask,
                                                    bidirectional=True)
    output_dict["private_embedding"] = private_encoded_text
    embedded_text = torch.cat([shared_encoded_text, private_encoded_text], -1)
    output_dict["embedded_text"] = embedded_text
    return output_dict
def forward(self,
            task_index: torch.IntTensor,
            tokens: Dict[str, torch.LongTensor]) -> Dict[str, torch.Tensor]:
    """
    Embed ``tokens`` and encode with both the shared and the private encoder.
    Returns both encodings and their concatenation under ``"embedded_text"``.
    """
    embedded = self._text_field_embedder(tokens)
    mask = util.get_text_field_mask(tokens)
    # Kept for parity with the sibling encoders, even though unused here.
    batch_size = get_batch_size(tokens)
    embedded = self._input_dropout(embedded)
    shared = self._shared_encoder(embedded, mask)
    private = self._private_encoder(embedded, mask)
    return {
        "share_embedding": shared,
        "private_embedding": private,
        "embedded_text": torch.cat([shared, private], -1),
    }
def log_iterable(iterable, assume_multiprocess_types):
    """
    Drain ``iterable`` while periodically printing throughput statistics
    (seconds per batch, mean batch size, total token count). When
    ``assume_multiprocess_types`` is true, also spawn a process that logs the
    iterable's internal queues.
    """
    start = time.perf_counter()
    last = start
    periodic_logging_process = None
    have_started_periodic_process = False

    batch_count = 0
    cumulative_batch_size = 0
    cumulative_token_count = 0

    for batch in iterable:
        batch_count += 1
        cumulative_batch_size += get_batch_size(batch)
        tokens_size = batch['source']['tokens'].size()
        cumulative_token_count += tokens_size[0] * tokens_size[1]

        if assume_multiprocess_types and not have_started_periodic_process:
            have_started_periodic_process = True
            # Pass the queues directly. Passing the iterable naively
            # won't work because the forked process (in contrast with
            # threads) has an entirely separate address space.
            # Presumably this could be worked around with
            # multiprocessing.managers or similar.
            generator_locals = iterable.gi_frame.f_locals
            periodic_logging_process = Process(
                target=run_periodically,
                args=(generator_locals['qiterable'].output_queue,
                      generator_locals['output_queue']))
            periodic_logging_process.start()

        if batch_count % BATCH_INTERVAL == 0:
            end = time.perf_counter()
            msg = (f"s/b total: {(end - start) / batch_count:.3f} " +
                   f"s/b last: {(end - last) / BATCH_INTERVAL:.3f} " +
                   f"batch count: {batch_count} " +
                   f"batch size: {cumulative_batch_size / batch_count:.1f} " +
                   f"total tokens {cumulative_token_count}")
            print(msg)
            last = end

    if periodic_logging_process:
        periodic_logging_process.terminate()
def batch_end_logging(self, trainer: "CallbackTrainer"):
    """
    Callback run at the end of every batch: writes scalar statistics,
    batch-size running averages, and (periodically) parameter histograms and
    update magnitudes to tensorboard.
    """
    # Log parameter values to tensorboard
    if self.tensorboard.should_log_this_batch():
        self.tensorboard.log_parameter_and_gradient_statistics(
            trainer.model, trainer.batch_grad_norm)
        self.tensorboard.log_learning_rates(trainer.model, trainer.optimizer)
        self.tensorboard.add_train_scalar("loss/loss_train",
                                          trainer.train_metrics["loss"])
        self.tensorboard.log_metrics({
            "epoch_metrics/" + k: v
            for k, v in trainer.train_metrics.items()
        })
    if self.log_batch_size_period:
        cur_batch = sum([
            training_util.get_batch_size(batch)
            for batch in trainer.batch_group
        ])
        self.cumulative_batch_size += cur_batch
        if (trainer.batches_this_epoch - 1) % self.log_batch_size_period == 0:
            average = self.cumulative_batch_size / trainer.batches_this_epoch
            logger.debug(
                f"current batch size: {cur_batch} mean batch size: {average}"
            )
            self.tensorboard.add_train_scalar("current_batch_size", cur_batch)
            self.tensorboard.add_train_scalar("mean_batch_size", average)
    if self.tensorboard.should_log_histograms_this_batch():
        # ``self.param_updates`` holds pre-update parameter snapshots (taken
        # elsewhere); subtracting the current values in-place yields the update
        # delta, whose norm is logged relative to the parameter norm.
        for name, param in trainer.model.named_parameters():
            self.param_updates[name].sub_(param.detach().cpu())
            update_norm = torch.norm(self.param_updates[name].view(-1))
            param_norm = torch.norm(param.view(-1)).cpu()
            self.tensorboard.add_train_scalar(
                "gradient_update/" + name,
                update_norm / (param_norm + 1e-7))  # epsilon guards div-by-zero
        self.param_updates.clear()
        self.tensorboard.log_histograms(trainer.model, self.histogram_parameters)
def forward(
        self, task_index: torch.IntTensor,
        tokens: Dict[str, torch.LongTensor]) -> Dict[str, torch.Tensor]:
    """
    Embed ``tokens``, encode with both the shared and the private encoder, and
    (optionally) prepend the task's domain embedding to each encoded vector.

    Returns a dict with ``"domain_embedding"``, ``"share_embedding"``,
    ``"private_embedding"``, and their concatenation ``"embedded_text"``.
    """
    embedded_text_input = self._text_field_embedder(tokens)
    tokens_mask = util.get_text_field_mask(tokens)
    batch_size = get_batch_size(tokens)
    domain_embedding = self._domain_embeddings(task_index)
    output_dict = {"domain_embedding": domain_embedding}
    embedded_text_input = self._input_dropout(embedded_text_input)
    shared_encoded_text = self._shared_encoder(embedded_text_input,
                                               tokens_mask)
    private_encoded_text = self._private_encoder(embedded_text_input,
                                                 tokens_mask)
    if self._with_domain_embedding:
        # Broadcast the single domain vector across the batch, then prepend it
        # to each encoded representation.
        domain_embedding = domain_embedding.expand(batch_size, -1)
        shared_encoded_text = torch.cat(
            [domain_embedding, shared_encoded_text], -1)
        # BUG FIX: this previously concatenated ``shared_encoded_text`` again,
        # so the private representation silently duplicated the shared one.
        private_encoded_text = torch.cat(
            [domain_embedding, private_encoded_text], -1)
    output_dict["share_embedding"] = shared_encoded_text
    output_dict["private_embedding"] = private_encoded_text
    embedded_text = torch.cat([shared_encoded_text, private_encoded_text], -1)
    output_dict["embedded_text"] = embedded_text
    return output_dict
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.

    For each batch this builds an image/text matching task on the fly: one
    positive instance plus ``self.num_negative_samples`` negatives sampled from
    the training databases, with the positive's position as the label.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")
    train_loss = 0.0
    # Set the model to "train" mode.
    self.model.train()
    num_gpus = len(self._cuda_devices)
    # Get tqdm for the training batches
    raw_train_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
    train_generator = lazy_groups_of(raw_train_generator, num_gpus)
    num_training_batches = math.ceil(
        self.iterator.get_num_batches(self.train_data) / num_gpus)
    self._last_log = time.time()
    last_save_time = time.time()
    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0
    histogram_parameters = set(
        self.model.get_parameters_for_histogram_tensorboard_logging())
    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(train_generator,
                                     total=num_training_batches)
    cumulative_batch_size = 0
    for batch_group in train_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total
        self.optimizer.zero_grad()
        # --- Build the negative-sampling matching batch in-place. ---
        images = []
        text = []
        segment_ids = []
        labels = []
        for i in range(len(batch_group[0]['images'])):
            # The positive candidate's slot among num_negative_samples + 1
            # candidates; this index is also the classification label.
            positive_index = random.randint(0, self.num_negative_samples)
            labels.append(positive_index)
            if self.retrieve_text:
                # Text-retrieval mode: candidates are token sequences.
                instance_text = []
                instance_segment_ids = []
                for j in range(self.num_negative_samples + 1):
                    if j == positive_index:
                        instance_text.append(batch_group[0]['token_ids']
                                             ['tokens'][i, :].tolist())
                        instance_segment_ids.append(
                            batch_group[0]['segment_ids'][i].tolist())
                    else:
                        # Negative: a random training sentence, tensorized
                        # through the same vocabulary/indexers.
                        negative_sample_index = random.choice(
                            self.train_indices)
                        text_field = TextField(
                            self.train_text_db[negative_sample_index],
                            self.train_token_indexers)
                        text_field.index(self.model.vocab)
                        padding_lengths = text_field.get_padding_lengths()
                        instance_text.append(
                            text_field.as_tensor(
                                padding_lengths=padding_lengths)
                            ['tokens'].tolist())
                        instance_segment_ids.append(
                            self.train_segment_ids_db[
                                negative_sample_index].tolist())
                text += instance_text
                segment_ids += instance_segment_ids
            else:
                # Image-retrieval mode: candidates are images.
                instance_images = [
                    None for _ in range(self.num_negative_samples + 1)
                ]
                for j in range(self.num_negative_samples + 1):
                    if j == positive_index:
                        instance_images[j] = np.expand_dims(
                            batch_group[0]['images'][i].numpy(), 0)
                    else:
                        instance_images[j] = np.expand_dims(
                            random.choice(self.train_image_db), 0)
                images += instance_images
        matching_label_field_name = "labels"
        if self.retrieve_text:
            # Right-pad every candidate sequence to the batch maximum with 0s.
            max_text_len = max([len(sequence) for sequence in text])
            text = [
                sequence + [0 for _ in range(max_text_len - len(sequence))]
                for sequence in text
            ]
            batch_group[0]['token_ids'] = {
                'tokens': torch.LongTensor(text)
            }
            segment_ids = [
                sequence + [0 for _ in range(max_text_len - len(sequence))]
                for sequence in segment_ids
            ]
            batch_group[0]['segment_ids'] = torch.from_numpy(
                np.array(segment_ids, dtype=np.int64))
        else:
            batch_group[0]['images'] = torch.from_numpy(np.vstack(images))
        batch_group[0][matching_label_field_name] = torch.from_numpy(
            np.array(labels, dtype=np.int64))
        # --- Standard AllenNLP-style optimization step. ---
        loss = self.batch_loss(batch_group, for_training=True)
        if torch.isnan(loss):
            raise ValueError("nan loss encountered")
        loss.backward()
        train_loss += loss.item()
        batch_grad_norm = self.rescale_gradients()
        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)
        if self._tensorboard.should_log_histograms_this_batch():
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            self.optimizer.step()
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
                update_norm = torch.norm(param_updates[name].view(-1, ))
                param_norm = torch.norm(param.view(-1, )).cpu()
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name,
                    update_norm / (param_norm + 1e-7))
        else:
            self.optimizer.step()
        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)
        # Update the description with the latest metrics
        metrics = training_util.get_metrics(self.model, train_loss,
                                            batches_this_epoch)
        description = training_util.description_from_metrics(metrics)
        train_generator_tqdm.set_description(description, refresh=False)
        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(
                self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)
            self._tensorboard.add_train_scalar("loss/loss_train",
                                               metrics["loss"])
            self._tensorboard.log_metrics(
                {"epoch_metrics/" + k: v for k, v in metrics.items()})
        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model, histogram_parameters)
        if self._log_batch_size_period:
            cur_batch = sum([
                training_util.get_batch_size(batch) for batch in batch_group
            ])
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(
                    f"current batch size: {cur_batch} mean batch size: {average}"
                )
                self._tensorboard.add_train_scalar("current_batch_size",
                                                   cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size", average)
        # Save model if needed.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint('{0}.{1}'.format(
                epoch, training_util.time_to_str(int(last_save_time))))
    metrics = training_util.get_metrics(self.model,
                                        train_loss,
                                        batches_this_epoch,
                                        reset=True)
    metrics['cpu_memory_MB'] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
    return metrics
def forward(
        self,  # type: ignore
        task_index: torch.IntTensor,
        reverse: torch.ByteTensor,
        epoch_trained: torch.IntTensor,
        for_training: torch.ByteTensor,
        tokens: Dict[str, torch.LongTensor],
        label: torch.IntTensor = None) -> Dict[str, torch.Tensor]:
    """
    Run the shared/private encoder, then the sentiment, private-domain,
    shared-domain, and validity discriminators; combine their losses when
    ``label`` is provided and update all metrics.
    """
    embeddeds = self._encoder(task_index, tokens, epoch_trained,
                              self._valid_discriminator, reverse, for_training)
    batch_size = get_batch_size(embeddeds["embedded_text"])
    sentiment_logits = self._sentiment_discriminator(
        embeddeds["embedded_text"])
    p_domain_logits = self._p_domain_discriminator(
        embeddeds["private_embedding"])
    # TODO set reverse = true
    s_domain_logits = self._s_domain_discriminator(
        embeddeds["share_embedding"], reverse=reverse)
    # TODO set reverse = true
    # TODO use share_embedding instead of domain_embedding
    valid_logits = self._valid_discriminator(embeddeds["domain_embedding"],
                                             reverse=reverse)
    # 1 if the encoder used the clean domain embedding, 0 if perturbed.
    valid_label = embeddeds['valid']
    logits = [
        sentiment_logits, p_domain_logits, s_domain_logits, valid_logits
    ]
    # domain_logits = self._domain_discriminator(embedded_text)
    output_dict = {'logits': sentiment_logits}
    if label is not None:
        loss = self._loss(sentiment_logits, label)
        # task_index = task_index.unsqueeze(0)
        # Broadcast the scalar task id to one target per instance.
        task_index = task_index.expand(batch_size)
        # Targets aligned one-to-one with ``logits`` for the metric loop below.
        targets = [label, task_index, task_index, valid_label]
        # print(p_domain_logits.shape, task_index, task_index.shape)
        p_domain_loss = self._domain_loss(p_domain_logits, task_index)
        s_domain_loss = self._domain_loss(s_domain_logits, task_index)
        # NOTE(review): brace-style placeholder suggests a loguru-style logger;
        # stdlib logging would not interpolate "{}" — confirm which is in use.
        # Also F.softmax is called without an explicit ``dim`` here.
        logger.info(
            "Share domain logits standard variation is {}",
            torch.mean(torch.std(F.softmax(s_domain_logits), dim=-1)))
        if self._label_smoothing is not None and self._label_smoothing > 0.0:
            valid_loss = sequence_cross_entropy_with_logits(
                valid_logits,
                valid_label.unsqueeze(0).cuda(),
                torch.tensor(1).unsqueeze(0).cuda(),
                average="token",
                label_smoothing=self._label_smoothing)
        else:
            # One-hot encode valid_label over 2 classes for the BCE-style loss.
            valid_loss = self._valid_loss(
                valid_logits,
                torch.zeros(2).scatter_(0, valid_label,
                                        torch.tensor(1.0)).cuda())
        output_dict['stm_loss'] = loss
        output_dict['p_d_loss'] = p_domain_loss
        output_dict['s_d_loss'] = s_domain_loss
        output_dict['valid_loss'] = valid_loss
        # TODO add share domain logits std loss
        # valid_loss is intentionally excluded from the total (see commented terms).
        output_dict['loss'] = loss + p_domain_loss + 0.005 * s_domain_loss
        # + 0.005 * valid_loss
        # + torch.mean(torch.std(s_domain_logits, dim=1))
        # output_dict['loss'] = loss + p_domain_loss + 0.005 * s_domain_loss
        for (metric, logit, target) in zip(self.metrics.values(), logits,
                                           targets):
            metric(logit, target)
    return output_dict
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.

    Each batch yields four losses (overall, code-mixed, lang1, lang2); the
    lang1, lang2, and cm losses are back-propagated and stepped with their own
    optimizers/schedulers, in that order.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")
    train_loss = 0.0
    train_loss_lang1 = 0.0
    train_loss_lang2 = 0.0
    train_loss_cm = 0.0
    # Set the model to "train" mode.
    self.model.train()
    num_gpus = len(self._cuda_devices)
    # Get tqdm for the training batches
    raw_train_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
    train_generator = lazy_groups_of(raw_train_generator, num_gpus)
    num_training_batches = math.ceil(
        self.iterator.get_num_batches(self.train_data) / num_gpus)
    self._last_log = time.time()
    last_save_time = time.time()
    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0
    histogram_parameters = set(
        self.model.get_parameters_for_histogram_tensorboard_logging())
    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(train_generator,
                                     total=num_training_batches)
    cumulative_batch_size = 0
    for batch_group in train_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total
        self.optimizer.zero_grad()
        self.optimizer_lang1.zero_grad()
        self.optimizer_lang2.zero_grad()
        self.optimizer_cm.zero_grad()
        loss, loss_cm, loss_lang1, loss_lang2 = self.batch_loss(
            batch_group, for_training=True)
        if torch.isnan(loss):
            # if either one of loss_%s is nan, loss will be nan
            raise ValueError("nan loss encountered")
        #######
        # lang1
        #######
        loss_lang1.backward()
        train_loss_lang1 += loss_lang1.item()
        self.rescale_gradients()
        if self._learning_rate_scheduler_lang1:
            self._learning_rate_scheduler_lang1.step_batch(batch_num_total)
        if self._momentum_scheduler_lang1:
            self._momentum_scheduler_lang1.step_batch(batch_num_total)
        self.optimizer_lang1.step()
        self.optimizer_lang1.zero_grad()
        #######
        # lang2  (header fixed: previously mislabeled "cm")
        #######
        loss_lang2.backward()
        train_loss_lang2 += loss_lang2.item()
        # Only this middle rescale's norm is kept for tensorboard logging below.
        batch_grad_norm = self.rescale_gradients()
        if self._learning_rate_scheduler_lang2:
            self._learning_rate_scheduler_lang2.step_batch(batch_num_total)
        if self._momentum_scheduler_lang2:
            self._momentum_scheduler_lang2.step_batch(batch_num_total)
        self.optimizer_lang2.step()
        self.optimizer_lang2.zero_grad()
        #######
        # cm  (header fixed: previously mislabeled "lang2")
        #######
        loss_cm.backward()
        train_loss_cm += loss_cm.item()
        self.rescale_gradients()
        if self._learning_rate_scheduler_cm:
            self._learning_rate_scheduler_cm.step_batch(batch_num_total)
        if self._momentum_scheduler_cm:
            self._momentum_scheduler_cm.step_batch(batch_num_total)
        self.optimizer_cm.step()
        self.optimizer_cm.zero_grad()
        train_loss += loss.item()
        # Update the description with the latest metrics
        # metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch)
        metrics = self.model.get_metrics(False)
        metrics["loss"] = float(
            train_loss /
            batches_this_epoch) if batches_this_epoch > 0 else 0.0
        metrics["cm_loss"] = float(
            train_loss_cm /
            batches_this_epoch) if batches_this_epoch > 0 else 0.0
        metrics["lang1_loss"] = float(
            train_loss_lang1 /
            batches_this_epoch) if batches_this_epoch > 0 else 0.0
        metrics["lang2_loss"] = float(
            train_loss_lang2 /
            batches_this_epoch) if batches_this_epoch > 0 else 0.0
        description = training_util.description_from_metrics(metrics)
        train_generator_tqdm.set_description(description, refresh=False)
        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(
                self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model,
                                                 self.optimizer_lang1)
            self._tensorboard.log_learning_rates(self.model,
                                                 self.optimizer_lang2)
            self._tensorboard.log_learning_rates(self.model,
                                                 self.optimizer_cm)
            self._tensorboard.add_train_scalar("loss/loss_train",
                                               metrics["loss"])
            self._tensorboard.add_train_scalar("loss/cm_loss_train",
                                               metrics["cm_loss"])
            self._tensorboard.add_train_scalar("loss/lang1_loss_train",
                                               metrics["lang1_loss"])
            self._tensorboard.add_train_scalar("loss/lang2_loss_train",
                                               metrics["lang2_loss"])
            self._tensorboard.log_metrics(
                {"epoch_metrics/" + k: v for k, v in metrics.items()})
        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model, histogram_parameters)
        if self._log_batch_size_period:
            cur_batch = sum([
                training_util.get_batch_size(batch) for batch in batch_group
            ])
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(
                    f"current batch size: {cur_batch} mean batch size: {average}"
                )
                self._tensorboard.add_train_scalar("current_batch_size",
                                                   cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size", average)
        # Save model if needed.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint("{0}.{1}".format(
                epoch, training_util.time_to_str(int(last_save_time))))
    # metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True)
    metrics = self.model.get_metrics(reset=True)
    metrics["loss"] = float(
        train_loss / batches_this_epoch) if batches_this_epoch > 0 else 0.0
    metrics["cm_loss"] = float(
        train_loss_cm / batches_this_epoch) if batches_this_epoch > 0 else 0.0
    metrics["lang1_loss"] = float(
        train_loss_lang1 /
        batches_this_epoch) if batches_this_epoch > 0 else 0.0
    metrics["lang2_loss"] = float(
        train_loss_lang2 /
        batches_this_epoch) if batches_this_epoch > 0 else 0.0
    metrics["cpu_memory_MB"] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.

    Supports distributed training (only the master process draws the progress
    bar and writes tensorboard/checkpoints) and gradient accumulation over
    ``self._num_gradient_accumulation_steps`` batches per optimizer step.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")
    train_loss = 0.0
    # Set the model to "train" mode.
    self._pytorch_model.train()
    # Get tqdm for the training batches
    batch_generator = self.iterator(self.train_data,
                                    num_epochs=1,
                                    shuffle=self.shuffle)
    # Each group is one "effective batch" for gradient accumulation.
    batch_group_generator = lazy_groups_of(
        batch_generator, self._num_gradient_accumulation_steps)
    num_training_batches = math.ceil(
        self.iterator.get_num_batches(self.train_data) /
        self._num_gradient_accumulation_steps)
    # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
    # progress is shown
    if self._master:
        batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                               total=num_training_batches)
    else:
        batch_group_generator_tqdm = batch_group_generator
    self._last_log = time.time()
    last_save_time = time.time()
    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0
    histogram_parameters = set(
        self.model.get_parameters_for_histogram_tensorboard_logging())
    logger.info("Training")
    cumulative_batch_group_size = 0
    for batch_group in batch_group_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total
        self.optimizer.zero_grad()
        # Accumulate gradients over the group; dividing by the group size makes
        # the accumulated gradient the mean over its batches.
        for batch in batch_group:
            loss = self.batch_loss(batch, for_training=True)
            if torch.isnan(loss):
                raise ValueError("nan loss encountered")
            loss = loss / len(batch_group)
            loss.backward()
            train_loss += loss.item()
        batch_grad_norm = self.rescale_gradients()
        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)
        if self._tensorboard.should_log_histograms_this_batch(
        ) and self._master:
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            self.optimizer.step()
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
                update_norm = torch.norm(param_updates[name].view(-1))
                param_norm = torch.norm(param.view(-1)).cpu()
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name,
                    update_norm / (param_norm + 1e-7))
        else:
            self.optimizer.step()
        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)
        # Update the description with the latest metrics
        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            batches_this_epoch,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )
        # Updating tqdm only for the master as the trainers wouldn't have one
        if self._master:
            description = training_util.description_from_metrics(metrics)
            batch_group_generator_tqdm.set_description(description,
                                                       refresh=False)
        # Log parameter values to Tensorboard (only from the master)
        if self._tensorboard.should_log_this_batch() and self._master:
            self._tensorboard.log_parameter_and_gradient_statistics(
                self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)
            self._tensorboard.add_train_scalar("loss/loss_train",
                                               metrics["loss"])
            self._tensorboard.log_metrics(
                {"epoch_metrics/" + k: v for k, v in metrics.items()})
        if self._tensorboard.should_log_histograms_this_batch(
        ) and self._master:
            self._tensorboard.log_histograms(self.model, histogram_parameters)
        if self._log_batch_size_period:
            batch_group_size = sum(
                training_util.get_batch_size(batch) for batch in batch_group)
            cumulative_batch_group_size += batch_group_size
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_group_size / batches_this_epoch
                logger.info(
                    f"current batch size: {batch_group_size} mean batch size: {average}"
                )
                self._tensorboard.add_train_scalar("current_batch_size",
                                                   batch_group_size)
                self._tensorboard.add_train_scalar("mean_batch_size", average)
        # Save model if needed.
        if (self._model_save_interval is not None and
                (time.time() - last_save_time > self._model_save_interval)
                and self._master):
            last_save_time = time.time()
            self._save_checkpoint("{0}.{1}".format(
                epoch, training_util.time_to_str(int(last_save_time))))
    # Let all workers finish their epoch before computing
    # the final statistics for the epoch.
    if self._distributed:
        dist.barrier()
    metrics = training_util.get_metrics(
        self.model,
        train_loss,
        batches_this_epoch,
        reset=True,
        world_size=self._world_size,
        cuda_device=[self.cuda_device],
    )
    metrics["cpu_memory_MB"] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return metrics
def semi_train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one semi-supervised epoch and returns metrics.

    Labelled and unlabelled batches are interleaved by ``get_mixer``.  Losses
    are accumulated over ``self.backprop_after_xbatches`` batches before a
    single optimizer step (``self.step``).  Once ``self.dd_warmup_iters``
    supervised iterations have passed, the constraint model's multipliers are
    periodically refreshed via ``self.lambda_update`` (dual decomposition).
    """
    logger.info("Epoch %d/%d", epoch, self.trainer._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.trainer.model.train()

    num_gpus = len(self.trainer._cuda_devices)

    self.trainer._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self.trainer._batch_num_total is None:
        self.trainer._batch_num_total = 0

    histogram_parameters = set(
        self.trainer.model.get_parameters_for_histogram_tensorboard_logging())

    # Generator interleaving labelled and unlabelled batches according to
    # self.which_mixer / self.min_pct_of_unlabelled.
    mixed_generator, num_training_batches = get_mixer(
        self.trainer.iterator, self.trainer.train_data, self.trainer.iterator,
        self.unlabelled_dataset, num_gpus, self.labelled_id, self.which_mixer,
        self.min_pct_of_unlabelled)

    # Separate generator used only for the lambda (dual) updates:
    # 'cm' mixing over 100% of the unlabelled data.
    mixed_generator_for_lambda, _ = get_mixer(
        self.trainer.iterator, self.trainer.train_data, self.trainer.iterator,
        self.unlabelled_dataset, num_gpus, self.labelled_id, 'cm', 1.0)

    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(mixed_generator,
                                     total=num_training_batches)

    cumulative_batch_size = 0
    unlabelled_loss = 0
    unlabelled_batches_this_epoch = 0
    batches_since_last_step = 0
    agg_loss = 0.0
    # flag is True while agg_loss holds losses that have not been stepped yet.
    flag = False
    batch_grad_norm = None
    for batch_group, group_id in train_generator_tqdm:
        # Skip unlabelled batches until enough supervised iterations have run.
        if self.total_supervised_iters < self.dd_semi_warmup_iters and group_id != self.labelled_id:
            continue
        output_dict = self.batch_loss(
            batch_group,
            for_training=True,
            eval_metric=(group_id == self.labelled_id))

        # Constraint penalties; defaultdict(float) makes them 0 when there is
        # no constraints model.
        penalties = defaultdict(float)
        if self.constraints_model is not None:
            penalties = self.constraints_model(output_dict['task1_tag_logits'],
                                               output_dict['task2_tag_logits'],
                                               output_dict['mask'])

        loss = 0.0
        if 'loss' in output_dict:
            loss = output_dict['loss']
            train_loss += loss.item()
        loss += output_dict.get('regularization_penalty', 0.0)
        loss += self.constraints_wt * penalties['loss']
        unlabelled_loss += penalties['loss'].item() if torch.is_tensor(
            penalties['loss']) else penalties['loss']

        # Accumulate; only backprop/step every `backprop_after_xbatches` batches.
        agg_loss += loss
        batches_since_last_step += 1
        if batches_since_last_step == self.backprop_after_xbatches:
            batch_grad_norm = self.step(agg_loss)
            batches_since_last_step = 0
            agg_loss = 0.0
            flag = False
        else:
            flag = True

        if (group_id != self.labelled_id):
            unlabelled_batches_this_epoch += 1
        else:
            # Labelled batch: bookkeeping, LR scheduling, logging, checkpointing.
            self.total_supervised_iters += 1.0
            batches_this_epoch += 1
            self.trainer._batch_num_total += 1
            batch_num_total = self.trainer._batch_num_total

            # This does nothing if you are using an LRScheduler which doesn't
            # update per batch.
            if self.trainer._learning_rate_scheduler:
                self.trainer._learning_rate_scheduler.step_batch(
                    batch_num_total)

            if self.trainer._tensorboard.should_log_histograms_this_batch():
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                # NOTE(review): no optimizer.step() happens between the snapshot
                # and the subtraction below (the original had it commented out),
                # so the logged update magnitude is zero unless `self.step` ran
                # earlier in this same iteration -- confirm intent.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.trainer.model.named_parameters()
                }
                for name, param in self.trainer.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self.trainer._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                pass

            # Update moving averages
            if self.trainer._moving_average is not None:
                self.trainer._moving_average.apply(batch_num_total)

            metrics = training_util.get_metrics(self.trainer.model, train_loss,
                                                batches_this_epoch)
            metrics["uloss"] = float(
                unlabelled_loss /
                (batches_this_epoch + unlabelled_batches_this_epoch))
            # Update the description with the latest metrics
            description = training_util.description_from_metrics(metrics)
            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard (only after a real step, so
            # batch_grad_norm is defined).
            if self.trainer._tensorboard.should_log_this_batch(
            ) and batch_grad_norm is not None:
                self.trainer._tensorboard.log_parameter_and_gradient_statistics(
                    self.trainer.model, batch_grad_norm)
                self.trainer._tensorboard.log_learning_rates(
                    self.trainer.model, self.trainer.optimizer)
                self.trainer._tensorboard.add_train_scalar(
                    "loss/loss_train", metrics["loss"])
                self.trainer._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v for k, v in metrics.items()})

            if self.trainer._tensorboard.should_log_histograms_this_batch():
                self.trainer._tensorboard.log_histograms(
                    self.trainer.model, histogram_parameters)

            if self.trainer._log_batch_size_period:
                cur_batch = sum([
                    training_util.get_batch_size(batch)
                    for batch in batch_group
                ])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self.trainer._log_batch_size_period == 0:
                    average = cumulative_batch_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {cur_batch} mean batch size: {average}"
                    )
                    self.trainer._tensorboard.add_train_scalar(
                        "current_batch_size", cur_batch)
                    self.trainer._tensorboard.add_train_scalar(
                        "mean_batch_size", average)

            # Save model if needed.
            if self.trainer._model_save_interval is not None and (
                    time.time() - last_save_time >
                    self.trainer._model_save_interval):
                last_save_time = time.time()
                self.trainer._save_checkpoint('{0}.{1}'.format(
                    epoch, training_util.time_to_str(int(last_save_time))))

            # Lambda update: after warmup, refresh the constraint multipliers
            # on one mixed batch every `dd_update_freq` supervised iterations.
            if (self.constraints_model is not None) and (self.dd_optimizer is not None) and (
                    self.total_supervised_iters >= self.dd_warmup_iters
            ) and (self.total_supervised_iters - self.last_lambda_update >=
                   self.dd_update_freq):
                for batch_group, group_id in mixed_generator_for_lambda:
                    self.lambda_update(batch_group)
                    self.last_lambda_update = self.total_supervised_iters
                    break  # only one batch per update
                self.count_lambda_updates += 1
                # Optionally make lambda updates progressively rarer.
                if (self.dd_increase_freq_after is not None) and (
                        self.count_lambda_updates % self.dd_increase_freq_after == 0):
                    self.dd_update_freq += self.dd_increase_freq_by

    # Flush any loss still accumulated when the epoch ended mid-window.
    if flag:
        batch_grad_norm = self.step(agg_loss)
        batches_since_last_step = 0
        agg_loss = 0.0
        flag = False

    # Final lambda-update opportunity at the end of the epoch.
    if (self.constraints_model is not None) and (self.dd_optimizer is not None) and (
            self.total_supervised_iters >= self.dd_warmup_iters) and (
                self.total_supervised_iters - self.last_lambda_update >=
                self.dd_update_freq):
        for batch_group, group_id in mixed_generator_for_lambda:
            self.lambda_update(batch_group)
            self.last_lambda_update = self.total_supervised_iters
            break
        self.count_lambda_updates += 1
        if (self.dd_increase_freq_after is not None) and (
                self.count_lambda_updates % self.dd_increase_freq_after == 0):
            self.dd_update_freq += self.dd_increase_freq_by

    metrics = training_util.get_metrics(self.trainer.model,
                                        train_loss,
                                        batches_this_epoch,
                                        reset=True)
    metrics['cpu_memory_MB'] = peak_cpu_usage
    # lb/ub: labelled and unlabelled batch counts for this epoch.
    metrics['lb'] = batches_this_epoch
    metrics['ub'] = unlabelled_batches_this_epoch
    metrics["uloss"] = float(
        unlabelled_loss / (batches_this_epoch + unlabelled_batches_this_epoch))
    if self.constraints_model is not None:
        lambda_stats_dict = self.constraints_model.lambda_stats()
        metrics.update(lambda_stats_dict)
    for (gpu_num, memory) in gpu_usage:
        metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
    return metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.

    Each item drawn from the data generator is a group of
    ``num_gpus * self._num_gradient_accumulation_steps`` batches.  The group
    is split into ``num_batch`` sub-groups; each sub-group's loss is scaled by
    ``1 / num_batch`` and back-propagated, and a single optimizer step is
    taken per group (gradient accumulation).
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.model.train()

    num_gpus = len(self._cuda_devices)

    # Get tqdm for the training batches
    raw_train_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
    train_generator = lazy_groups_of(
        raw_train_generator, num_gpus * self._num_gradient_accumulation_steps)
    num_training_batches = math.ceil(
        self.iterator.get_num_batches(self.train_data) /
        (num_gpus * self._num_gradient_accumulation_steps))
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(
        self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(train_generator,
                                     total=num_training_batches)
    cumulative_batch_size = 0
    for batch_group in train_generator_tqdm:
        # Add the model graph to tensorboard once, built from the first batch.
        if not self._graph_added and self._require_graph:
            model_copy = deepcopy(self.model)
            model_copy.log_graph()
            wrapped_model = ModelWrapper(model_copy)
            graph_inputs = wrapped_model.process_inputs(batch_group[0])
            self._tensorboard.add_graph(wrapped_model, [graph_inputs])
            self._graph_added = True

        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self.optimizer.zero_grad()

        # Gradient accumulation: backprop each sub-group's scaled loss
        # separately; the single optimizer step happens below.
        num_batch = len(batch_group) // num_gpus
        for i in range(num_batch):
            if (i + 1) * num_gpus > len(batch_group):
                batch_i = batch_group[i * num_gpus:]
            else:
                batch_i = batch_group[i * num_gpus:(i + 1) * num_gpus]
            loss = self.batch_loss(batch_i, for_training=True)
            # NOTE(review): NaN/None losses are skipped (not raised) here,
            # deliberately diverging from the base trainer's ValueError.
            if loss is None or torch.isnan(loss):
                print("nan loss")
                continue
            loss = loss / num_batch
            loss.backward()
            train_loss += loss.item()

        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        if self._tensorboard.should_log_histograms_this_batch():
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            self.optimizer.step()
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
                update_norm = torch.norm(param_updates[name].view(-1, ))
                param_norm = torch.norm(param.view(-1, )).cpu()
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name,
                    update_norm / (param_norm + 1e-7))
        else:
            self.optimizer.step()

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(self.model, train_loss,
                                            batches_this_epoch)
        description = training_util.description_from_metrics(metrics)
        train_generator_tqdm.set_description(description, refresh=False)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(
                self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)
            self._tensorboard.add_train_scalar("loss/loss_train",
                                               metrics["loss"])
            self._tensorboard.log_metrics(
                {"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        if self._log_batch_size_period:
            cur_batch = sum([
                training_util.get_batch_size(batch) for batch in batch_group
            ])
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(
                    f"current batch size: {cur_batch} mean batch size: {average}"
                )
                self._tensorboard.add_train_scalar("current_batch_size",
                                                   cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size", average)

        # Save model if needed.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint('{0}.{1}'.format(
                epoch, training_util.time_to_str(int(last_save_time))))

    metrics = training_util.get_metrics(self.model,
                                        train_loss,
                                        batches_this_epoch,
                                        reset=True)
    metrics['cpu_memory_MB'] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
    return metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one meta-learning epoch and returns metrics.

    Differs from the base trainer in that each "batch" of the outer loop is
    one ``self.reptile_outer_update`` call over freshly-built generators for
    every task in ``self.train_data``.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.model.train()

    num_gpus = len(self._cuda_devices)
    raw_generators = []

    # fix max number of batches
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(
        self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    cumulative_batch_size = 0
    for i in range(0, self.meta_batches):
        # Build one fresh generator per task.
        # NOTE(review): the inner `for i, train_info ...` shadows the outer
        # meta-batch index `i`, so `reptile_outer_update` receives the index
        # of the *last task* instead of the meta-batch index -- confirm intent.
        train_generators = []
        for i, train_info in enumerate(self.train_data):
            raw_train_generator = self.iterator(train_info,
                                                num_epochs=1,
                                                shuffle=self.shuffle)
            train_generators.append(
                lazy_groups_of(raw_train_generator, num_gpus))
        loss_batch = self.reptile_outer_update(train_generators, i, num_gpus)
        # TODO figure out if is important
        train_loss = loss_batch
        print('[info] train_loss is:{}'.format(train_loss))
        # TODO figure out BATCH NORM MAML https://openreview.net/pdf?id=HygBZnRctX
        if self.batch_norm:
            batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        # TODO investigate learning rate scheduling for meta learning
        #if self._learning_rate_scheduler:
        #self._learning_rate_scheduler.step_batch(batch_num_total)
        #if self._momentum_scheduler:
        #self._momentum_scheduler.step_batch(batch_num_total)

        if self._tensorboard.should_log_histograms_this_batch():
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            self.optimizer.step()
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
                update_norm = torch.norm(param_updates[name].view(-1, ))
                param_norm = torch.norm(param.view(-1, )).cpu()
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name,
                    update_norm / (param_norm + 1e-7))
        else:
            self.optimizer.step()

        # Update moving averages
        # NOTE(review): `batch_num_total` is never assigned in this method;
        # this raises NameError if a moving average is configured.
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        # NOTE(review): `batches_this_epoch` is never incremented here, so
        # get_metrics is called with 0 -- verify it tolerates that.
        metrics = training_util.get_metrics(self.model, train_loss,
                                            batches_this_epoch)
        description = training_util.description_from_metrics(metrics)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            # NOTE(review): `batch_grad_norm` is unbound unless
            # `self.batch_norm` was true above -- confirm.
            self._tensorboard.log_parameter_and_gradient_statistics(
                self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)
            self._tensorboard.add_train_scalar("loss/loss_train",
                                               metrics["loss"])
            self._tensorboard.log_metrics(
                {"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        if self._log_batch_size_period:
            # NOTE(review): `batch_group` is not defined anywhere in this
            # method; this branch raises NameError if the period is set.
            cur_batch = sum([
                training_util.get_batch_size(batch) for batch in batch_group
            ])
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(
                    f"current batch size: {cur_batch} mean batch size: {average}"
                )
                self._tensorboard.add_train_scalar("current_batch_size",
                                                   cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size", average)

        # Save model if needed.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint('{0}.{1}'.format(
                epoch, training_util.time_to_str(int(last_save_time))))

    metrics = training_util.get_metrics(self.model,
                                        train_loss,
                                        batches_this_epoch,
                                        reset=True)
    metrics['cpu_memory_MB'] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
    return metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.

    Uses gradient accumulation: the optimizer steps once every
    ``self.accumulated_batch_count`` batches, with one extra step on the
    final (possibly shorter) window of ``residue`` batches.  Optional CUDA
    memory tracing is printed every ``self.cuda_verbose_step`` batches.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.model.train()

    num_gpus = len(self._cuda_devices)

    # Get tqdm for the training batches.
    # Make the training data iterable.
    raw_train_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
    # Group the stream of single batches into lists of one batch per GPU.
    train_generator = lazy_groups_of(raw_train_generator, num_gpus)
    # Ceiling division: number of batch groups in the epoch.
    num_training_batches = math.ceil(
        self.iterator.get_num_batches(self.train_data) / num_gpus)
    # Leftover batches that do not fill a whole accumulation window.
    residue = num_training_batches % self.accumulated_batch_count
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(
        self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    # Training progress bar.
    train_generator_tqdm = Tqdm.tqdm(train_generator,
                                     total=num_training_batches)
    cumulative_batch_size = 0
    # Zero gradients once up front; afterwards they are only zeroed after
    # each accumulated optimizer step.
    self.optimizer.zero_grad()
    for batch_group in train_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        # Length of the current accumulation window: full windows use
        # `accumulated_batch_count`; the trailing partial window uses `residue`.
        iter_len = self.accumulated_batch_count \
            if batches_this_epoch <= (num_training_batches - residue) else residue

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(
                f'Before forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
            )
            print(
                f'Before forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
            )
        try:
            # Loss scaled down so the accumulated gradient averages the window.
            loss = self.batch_loss(batch_group, for_training=True) / iter_len
        except RuntimeError as e:
            # Dump batch shapes and value ranges to help diagnose CUDA errors,
            # then re-raise.
            print(e)
            for x in batch_group:
                all_words = [len(y['words']) for y in x['metadata']]
                print(f"Total sents: {len(all_words)}. "
                      f"Min {min(all_words)}. Max {max(all_words)}")
                for elem in ['labels', 'd_tags']:
                    tt = x[elem]
                    print(
                        f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}"
                    )
                for elem in ["bert", "mask", "bert-offsets"]:
                    tt = x['tokens'][elem]
                    print(
                        f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}"
                    )
            raise e

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(
                f'After forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
            )
            print(
                f'After forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
            )

        if torch.isnan(loss):
            raise ValueError("nan loss encountered")

        # Backward pass; gradients accumulate across the window.
        loss.backward()

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(
                f'After backprop - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
            )
            print(
                f'After backprop - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
            )

        # Undo the window scaling so train_loss tracks the unscaled loss.
        train_loss += loss.item() * iter_len

        # Drop per-batch temporaries and release cached GPU memory; otherwise
        # unused temporaries pile up and can cause out-of-memory errors.
        del batch_group, loss
        torch.cuda.empty_cache()

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(
                f'After collecting garbage - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
            )
            print(
                f'After collecting garbage - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
            )

        # Rescale (clip) gradients.
        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        if self._tensorboard.should_log_histograms_this_batch():
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            # Step only at the end of an accumulation window (or at epoch end).
            if batches_this_epoch % self.accumulated_batch_count == 0 or \
                    batches_this_epoch == num_training_batches:
                self.optimizer.step()
                self.optimizer.zero_grad()
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
                # Norm of the parameter update (torch.norm defaults to L2).
                update_norm = torch.norm(param_updates[name].view(-1))
                param_norm = torch.norm(param.view(-1)).cpu()
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name,
                    update_norm / (param_norm + 1e-7))
        else:
            if batches_this_epoch % self.accumulated_batch_count == 0 or \
                    batches_this_epoch == num_training_batches:
                self.optimizer.step()
                self.optimizer.zero_grad()

        # Update moving averages (parameter smoothing for more robust eval).
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(self.model, train_loss,
                                            batches_this_epoch)
        description = training_util.description_from_metrics(metrics)
        train_generator_tqdm.set_description(description, refresh=False)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(
                self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)
            self._tensorboard.add_train_scalar("loss/loss_train",
                                               metrics["loss"])
            self._tensorboard.log_metrics(
                {"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        if self._log_batch_size_period:
            # NOTE(review): `batch_group` was deleted above (`del batch_group`),
            # so this branch raises NameError when _log_batch_size_period is
            # set -- confirm whether this period is ever enabled.
            cur_batch = sum([
                training_util.get_batch_size(batch) for batch in batch_group
            ])
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(
                    f"current batch size: {cur_batch} mean batch size: {average}"
                )
                self._tensorboard.add_train_scalar("current_batch_size",
                                                   cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size", average)

        # Save model if needed.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint("{0}.{1}".format(
                epoch, training_util.time_to_str(int(last_save_time))))

    metrics = training_util.get_metrics(self.model,
                                        train_loss,
                                        batches_this_epoch,
                                        reset=True)
    metrics["cpu_memory_MB"] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.

    Uses gradient accumulation: gradients from ``self._accumulation_steps``
    consecutive batches are accumulated, and the optimizer steps (and zeroes
    gradients) once per window.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.model.train()

    # Get tqdm for the training batches
    train_generator = self.iterator(self.train_data,
                                    num_epochs=1,
                                    shuffle=self.shuffle)
    num_training_batches = self.iterator.get_num_batches(self.train_data)
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(
        self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(train_generator,
                                     total=num_training_batches)
    cumulative_batch_size = 0
    # Zero once up front; afterwards gradients are zeroed only after each
    # accumulated optimizer step.
    self.optimizer.zero_grad()
    for batch_id, batch in enumerate(train_generator_tqdm):
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        loss = self.batch_loss(batch, for_training=True)
        if torch.isnan(loss):
            raise ValueError("nan loss encountered")

        # Gradients accumulate across batches until the window boundary below.
        loss.backward()
        train_loss += loss.item()
        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using an
        # LRScheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)

        if self._tensorboard.should_log_histograms_this_batch():
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            # Step once per accumulation window.
            # NOTE(review): a final partial window (batch count not divisible
            # by _accumulation_steps) is never stepped, so those gradients are
            # silently dropped at epoch end -- confirm this is intended.
            if (batch_id + 1) % self._accumulation_steps == 0:
                self.optimizer.step()
                self.optimizer.zero_grad()
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
                update_norm = torch.norm(param_updates[name].view(-1, ))
                param_norm = torch.norm(param.view(-1, )).cpu()
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name,
                    update_norm / (param_norm + 1e-7))
        else:
            if (batch_id + 1) % self._accumulation_steps == 0:
                self.optimizer.step()
                self.optimizer.zero_grad()

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(self.model, train_loss,
                                            batches_this_epoch)
        description = training_util.description_from_metrics(metrics)
        train_generator_tqdm.set_description(description, refresh=False)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(
                self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)
            self._tensorboard.add_train_scalar("loss/loss_train",
                                               metrics["loss"])
            self._tensorboard.log_metrics(
                {"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        if self._log_batch_size_period:
            cur_batch = training_util.get_batch_size(batch)
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(
                    f"current batch size: {cur_batch} mean batch size: {average}"
                )
                self._tensorboard.add_train_scalar("current_batch_size",
                                                   cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size", average)

        # Save model if needed.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint('{0}.{1}'.format(
                epoch, training_util.time_to_str(int(last_save_time))))

    metrics = training_util.get_metrics(self.model,
                                        train_loss,
                                        batches_this_epoch,
                                        reset=True)
    metrics['cpu_memory_MB'] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
    return metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.

    This variant supports gradient accumulation: the optimizer only steps
    every ``self.accumulated_batch_count`` batches (and once more at the end
    of the epoch for the residue), and each batch loss is divided by
    ``iter_len`` so the accumulated gradient approximates one large batch.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.model.train()

    num_gpus = len(self._cuda_devices)  # returns 1 even when there is no GPU

    # Get tqdm for the training batches
    raw_train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle)
    train_generator = lazy_groups_of(raw_train_generator, num_gpus)
    num_training_batches = math.ceil(
        self.iterator.get_num_batches(self.train_data) / num_gpus)
    # Number of batches left over after full accumulation groups; the last
    # (partial) group is scaled by this residue instead of the full count.
    residue = num_training_batches % self.accumulated_batch_count
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(
        self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(
        train_generator, total=num_training_batches)  # progress bar only
    cumulative_batch_size = 0
    # Gradients accumulate across iterations, so clear them once up front.
    self.optimizer.zero_grad()
    for batch_group in train_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        # Full accumulation-group size, except for the trailing residue group.
        iter_len = self.accumulated_batch_count \
            if batches_this_epoch <= (num_training_batches - residue) else residue

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(
                f'Before forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
            )
            print(
                f'Before forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
            )
        try:
            # Scale the loss so gradients sum to an average over the group.
            # (Per original author: inputs have had the all-KEEP cases removed.)
            loss = self.batch_loss(
                batch_group, for_training=True) / iter_len
        except RuntimeError as e:
            # Dump tensor shapes/ranges to help diagnose OOM or indexing errors,
            # then re-raise.
            print(e)
            for x in batch_group:
                all_words = [len(y['words']) for y in x['metadata']]
                print(f"Total sents: {len(all_words)}. "
                      f"Min {min(all_words)}. Max {max(all_words)}")
                for elem in ['labels', 'd_tags']:
                    tt = x[elem]
                    print(
                        f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}"
                    )
                for elem in ["bert", "mask", "bert-offsets"]:
                    tt = x['tokens'][elem]
                    print(
                        f"{elem} shape {list(tt.shape)} and min {tt.min().item()} and {tt.max().item()}"
                    )
            raise e

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(
                f'After forward pass - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
            )
            print(
                f'After forward pass - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
            )

        if torch.isnan(loss):
            raise ValueError("nan loss encountered")
        loss.backward()
        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(
                f'After backprop - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
            )
            print(
                f'After backprop - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
            )

        # Undo the iter_len scaling so train_loss tracks the unscaled sum.
        train_loss += loss.item() * iter_len

        # Free references early to reduce RAM / GPU memory pressure.
        # NOTE(review): batch_group is referenced again below in the
        # _log_batch_size_period block; with that option enabled this `del`
        # causes a NameError — confirm and fix ordering.
        del batch_group, loss
        torch.cuda.empty_cache()

        if self.cuda_verbose_step is not None and batch_num_total % self.cuda_verbose_step == 0:
            print(
                f'After collecting garbage - Cuda memory allocated: {torch.cuda.memory_allocated() / 1e9}'
            )
            print(
                f'After collecting garbage - Cuda memory cached: {torch.cuda.memory_cached() / 1e9}'
            )

        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        if self._tensorboard.should_log_histograms_this_batch():
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            # Step only at the end of an accumulation group (or the epoch end).
            if batches_this_epoch % self.accumulated_batch_count == 0 or \
                    batches_this_epoch == num_training_batches:
                self.optimizer.step()
                self.optimizer.zero_grad()
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
                update_norm = torch.norm(param_updates[name].view(-1))
                param_norm = torch.norm(param.view(-1)).cpu()
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name,
                    update_norm / (param_norm + 1e-7))
        else:
            # Same accumulation-aware step without update-magnitude logging.
            if batches_this_epoch % self.accumulated_batch_count == 0 or \
                    batches_this_epoch == num_training_batches:
                self.optimizer.step()
                self.optimizer.zero_grad()

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(self.model, train_loss,
                                            batches_this_epoch)
        description = training_util.description_from_metrics(metrics)
        train_generator_tqdm.set_description(description, refresh=False)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(
                self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)

            self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"])
            self._tensorboard.log_metrics(
                {"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        if self._log_batch_size_period:
            cur_batch = sum([
                training_util.get_batch_size(batch) for batch in batch_group
            ])
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(
                    f"current batch size: {cur_batch} mean batch size: {average}"
                )
                self._tensorboard.add_train_scalar("current_batch_size", cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size", average)

        # Save model if needed (checkpoints on a wall-clock interval).
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint("{0}.{1}".format(
                epoch, training_util.time_to_str(int(last_save_time))))

    # Final epoch metrics: reset=True clears the model's accumulated metrics.
    metrics = training_util.get_metrics(self.model, train_loss,
                                        batches_this_epoch, reset=True)
    metrics["cpu_memory_MB"] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.

    Distributed-aware variant: only the master process shows the tqdm bar,
    writes tensorboard logs and saves checkpoints; workers coordinate an
    early stop via an all-reduced flag when their data runs out at
    different times.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = common_util.peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in common_util.gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self._pytorch_model.train()

    # Get tqdm for the training batches; batches are grouped so each group
    # forms one gradient-accumulation step.
    batch_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle)
    batch_group_generator = common_util.lazy_groups_of(
        batch_generator, self._num_gradient_accumulation_steps)
    num_training_batches = math.ceil(
        self.iterator.get_num_batches(self.train_data) /
        self._num_gradient_accumulation_steps)
    # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
    # progress is shown
    if self._master:
        batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                               total=num_training_batches)
    else:
        batch_group_generator_tqdm = batch_group_generator

    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(
        self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")

    cumulative_batch_group_size = 0
    done_early = False
    for batch_group in batch_group_generator_tqdm:
        if self._distributed:
            # Check whether the other workers have stopped already (due to differing amounts of
            # data in each). If so, we can't proceed because we would hang when we hit the
            # barrier implicit in Model.forward. We use a IntTensor instead a BoolTensor
            # here because NCCL process groups apparently don't support BoolTensor.
            done = torch.tensor(0, device=self.cuda_device)
            torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
            if done.item() > 0:
                done_early = True
                logger.warning(
                    f"Worker {torch.distributed.get_rank()} finishing training early! "
                    "This implies that there is an imbalance in your training "
                    "data across the workers and that some amount of it will be "
                    "ignored. A small amount of this is fine, but a major imbalance "
                    "should be avoided. Note: This warning will appear unless your "
                    "data is perfectly balanced.")
                break

        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self.optimizer.zero_grad()

        # Gradient accumulation: average the per-batch losses over the group
        # so the summed gradient matches one large batch.
        for batch in batch_group:
            loss = self.batch_loss(batch, for_training=True)
            if torch.isnan(loss):
                raise ValueError("nan loss encountered")
            loss = loss / len(batch_group)
            loss.backward()
            train_loss += loss.item()

        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        if self._tensorboard.should_log_histograms_this_batch(
        ) and self._master:
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {
                name: param.detach().cpu().clone()
                for name, param in self.model.named_parameters()
            }
            self.optimizer.step()
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
                update_norm = torch.norm(param_updates[name].view(-1))
                param_norm = torch.norm(param.view(-1)).cpu()
                self._tensorboard.add_train_scalar(
                    "gradient_update/" + name,
                    update_norm / (param_norm + 1e-7))
        else:
            self.optimizer.step()

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            batches_this_epoch,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )

        # Updating tqdm only for the master as the trainers wouldn't have one
        if self._master:
            description = training_util.description_from_metrics(metrics)
            batch_group_generator_tqdm.set_description(description,
                                                       refresh=False)

        # Log parameter values to Tensorboard (only from the master)
        if self._tensorboard.should_log_this_batch() and self._master:
            self._tensorboard.log_parameter_and_gradient_statistics(
                self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)

            self._tensorboard.add_train_scalar("loss/loss_train",
                                               metrics["loss"])
            self._tensorboard.log_metrics(
                {"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch(
        ) and self._master:
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        if self._log_batch_size_period:
            batch_group_size = sum(
                training_util.get_batch_size(batch) for batch in batch_group)
            cumulative_batch_group_size += batch_group_size
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_group_size / batches_this_epoch
                logger.info(
                    f"current batch size: {batch_group_size} mean batch size: {average}"
                )
                self._tensorboard.add_train_scalar("current_batch_size",
                                                   batch_group_size)
                self._tensorboard.add_train_scalar("mean_batch_size", average)

        # Save model if needed (wall-clock interval, master only).
        if (self._model_save_interval is not None and
                (time.time() - last_save_time > self._model_save_interval)
                and self._master):
            last_save_time = time.time()
            self._save_checkpoint("{0}.{1}".format(
                epoch, training_util.time_to_str(int(last_save_time))))

    if self._distributed and not done_early:
        logger.warning(
            f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)."
        )
        # Indicate that we're done so that any workers that have remaining data stop the epoch early.
        done = torch.tensor(1, device=self.cuda_device)
        torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
        assert done.item()

    # Let all workers finish their epoch before computing
    # the final statistics for the epoch.
    if self._distributed:
        dist.barrier()

    # Final epoch metrics; reset=True clears the model's accumulated metrics.
    metrics = training_util.get_metrics(
        self.model,
        train_loss,
        batches_this_epoch,
        reset=True,
        world_size=self._world_size,
        cuda_device=[self.cuda_device],
    )
    metrics["cpu_memory_MB"] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
    return metrics
def forward(self,  # type: ignore
            task_index: torch.IntTensor,
            reverse: torch.ByteTensor,
            for_training: torch.ByteTensor,
            train_stage: torch.IntTensor,
            tokens: Dict[str, torch.LongTensor],
            label: torch.IntTensor = None) -> Dict[str, torch.Tensor]:
    """
    Multi-task forward pass: depending on ``train_stage`` it computes the
    sentiment loss (stage 0), the shared domain-classification loss
    (stage 1), or the domain-embedding validity loss (stage 2). When not
    training, all three branches run.

    :param task_index: index of the current task/domain.
    :param reverse: flag forwarded to the gradient-reversal discriminators.
    :param for_training: truthy during training; falsy runs every branch.
    :param train_stage: ["share_senti", "share_classify", "share_classify_adversarial",
    "domain_valid", "domain_valid_adversarial"]
    :param tokens: token tensors produced by the text-field embedder.
    :param label: gold sentiment labels (may be None at inference).
    :return: dict with a single "loss" entry.

    NOTE(review): ``loss`` is only assigned inside the stage branches. If
    training at stage 0 with ``label is None`` (or with an unexpected
    stage value), the final ``{"loss": loss}`` raises NameError — confirm
    callers always provide a label in stage 0. Also, when several branches
    run (eval mode) each later branch overwrites ``loss`` instead of
    accumulating; verify this is intended.
    """
    embedded_text = self._text_field_embedder(tokens)
    mask = get_text_field_mask(tokens).float()
    embed_tokens = self._encoder(embedded_text, mask)
    batch_size = get_batch_size(embed_tokens)

    # bs * (25*4)
    seq_vec = self._seq_vec(embed_tokens, mask)
    # TODO add linear layer

    # Attention scores of each instance over the learned domain embeddings.
    # NOTE(review): hard-coded .cuda() assumes GPU execution — confirm.
    domain_embeddings = self._domain_embeddings(torch.arange(self._de_dim).cuda())
    de_scores = F.softmax(
        self._de_attention(seq_vec,
                           domain_embeddings.expand(batch_size, *domain_embeddings.size())),
        dim=1)
    de_valid = False
    # With probability 0.3, perturb the attention scores with Gaussian
    # noise and mark the batch as "invalid" for the validity discriminator.
    if np.random.rand() < 0.3:
        de_valid = True
        noise = 0.01 * torch.normal(mean=0.5,
                                    # std=torch.std(domain_embeddings).sign_())
                                    std=torch.empty(*de_scores.size()).fill_(1.0))
        de_scores = de_scores + noise.cuda()
    domain_embedding = torch.matmul(de_scores, domain_embeddings)
    domain_embedding = self._de_feedforward(domain_embedding)

    # Stage 0: train sentiment classifier (also runs at inference).
    if train_stage.cpu() == torch.tensor(0) or not for_training:
        de_representation = torch.tanh(torch.add(domain_embedding, seq_vec))

        sentiment_logits = self._sentiment_discriminator(de_representation)
        if label is not None:
            loss = self._loss(sentiment_logits, label)
            self.metrics["{}_stm_acc".format(TASKS_NAME[task_index.cpu()])](sentiment_logits, label)

    # Stage 1: shared domain classification (adversarial via `reverse`).
    if train_stage.cpu() == torch.tensor(1) or not for_training:
        s_domain_logits = self._s_domain_discriminator(seq_vec, reverse=reverse)
        # Rebinds the parameter to a batch-sized index tensor for the loss.
        task_index = task_index.expand(batch_size)
        loss = self._domain_loss(s_domain_logits, task_index)
        self.metrics["s_domain_acc"](s_domain_logits, task_index)

    # Stage 2: discriminate noisy vs clean domain embeddings.
    if train_stage.cpu() == torch.tensor(2) or not for_training:
        valid_logits = self._valid_discriminator(domain_embedding, reverse=reverse)
        valid_label = torch.ones(batch_size).cuda()
        if de_valid:
            valid_label = torch.zeros(batch_size).cuda()
        if self._label_smoothing is not None and self._label_smoothing > 0.0:
            loss = sequence_cross_entropy_with_logits(valid_logits,
                                                      valid_label.unsqueeze(0).cuda(),
                                                      torch.tensor(1).unsqueeze(0).cuda(),
                                                      average="token",
                                                      label_smoothing=self._label_smoothing)
        else:
            # NOTE(review): scatter_ expects a LongTensor index, but
            # valid_label is float and sized batch_size while the target is
            # zeros(2) — this branch likely raises at runtime; confirm.
            loss = self._valid_loss(valid_logits,
                                    torch.zeros(2).scatter_(0, valid_label, torch.tensor(1.0)).cuda())
        self.metrics["valid_acc"](valid_logits, valid_label)

    # TODO add orthogonal loss
    output_dict = {"loss": loss}
    return output_dict
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.

    This variant additionally supports batch-level early stopping: every 10
    total batches it runs a validation pass, tracks the validation metric,
    checkpoints, and invokes any registered batch-end callbacks.
    """
    logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.model.train()

    num_gpus = len(self._cuda_devices)

    # Get tqdm for the training batches
    raw_train_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
    train_generator = lazy_groups_of(raw_train_generator, num_gpus)
    num_training_batches = math.ceil(self.iterator.get_num_batches(self.train_data)/num_gpus)
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(train_generator,
                                     total=num_training_batches)
    cumulative_batch_size = 0
    for batch_group in train_generator_tqdm:
        # Re-enter train mode every batch: the mid-epoch validation pass
        # below switches the model to eval mode.
        self.model.train()
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self.optimizer.zero_grad()

        loss = self.batch_loss(batch_group, for_training=True)

        if torch.isnan(loss):
            raise ValueError("nan loss encountered")

        loss.backward()

        train_loss += loss.item()

        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        if self._tensorboard.should_log_histograms_this_batch():
            # get the magnitude of parameter updates for logging
            # We need a copy of current parameters to compute magnitude of updates,
            # and copy them to CPU so large models won't go OOM on the GPU.
            param_updates = {name: param.detach().cpu().clone()
                             for name, param in self.model.named_parameters()}
            self.optimizer.step()
            for name, param in self.model.named_parameters():
                param_updates[name].sub_(param.detach().cpu())
                update_norm = torch.norm(param_updates[name].view(-1, ))
                param_norm = torch.norm(param.view(-1, )).cpu()
                self._tensorboard.add_train_scalar("gradient_update/" + name,
                                                   update_norm / (param_norm + 1e-7))
        else:
            self.optimizer.step()

        # Update moving averages
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Update the description with the latest metrics
        metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch)
        description = training_util.description_from_metrics(metrics)

        train_generator_tqdm.set_description(description, refresh=False)

        # Log parameter values to Tensorboard
        if self._tensorboard.should_log_this_batch():
            self._tensorboard.log_parameter_and_gradient_statistics(self.model, batch_grad_norm)
            self._tensorboard.log_learning_rates(self.model, self.optimizer)

            self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"])
            self._tensorboard.log_metrics({"epoch_metrics/" + k: v for k, v in metrics.items()})

        if self._tensorboard.should_log_histograms_this_batch():
            self._tensorboard.log_histograms(self.model, histogram_parameters)

        if self._log_batch_size_period:
            cur_batch = sum([training_util.get_batch_size(batch) for batch in batch_group])
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size/batches_this_epoch
                logger.info(f"current batch size: {cur_batch} mean batch size: {average}")
                self._tensorboard.add_train_scalar("current_batch_size", cur_batch)
                self._tensorboard.add_train_scalar("mean_batch_size", average)

        # Save model if needed (wall-clock interval).
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval
        ):
            last_save_time = time.time()
            self._save_checkpoint(
                    '{0}.{1}'.format(epoch, training_util.time_to_str(int(last_save_time)))
            )

        # Batch-level early stopping: validate every 10 total batches.
        if self._early_stopping_by_batch and self._batch_num_total % 10 == 0:
            if self._validation_data is not None:
                with torch.no_grad():
                    # We have a validation set, so compute all the metrics on it.
                    val_loss, num_batches = self._validation_loss()
                    val_metrics = training_util.get_metrics(self.model, val_loss, num_batches, reset=True)

                    # Check validation metric for early stopping
                    this_epoch_val_metric = val_metrics[self._validation_metric]
                    self._metric_tracker.add_metric(this_epoch_val_metric)

                    if self._metric_tracker.is_best_so_far():
                        metrics['best_batch'] = self._batch_num_total
                        for key, value in val_metrics.items():
                            metrics["best_validation_" + key] = value
                        self._metric_tracker.best_epoch_metrics = val_metrics
                    self._save_checkpoint(self._batch_num_total)

            if self.callbacks is not None:
                for callback in self.callbacks:
                    callback.on_batch_end(self._batch_num_total)

    # Final epoch metrics; reset=True clears the model's accumulated metrics.
    metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True)
    metrics['cpu_memory_MB'] = peak_cpu_usage
    for (gpu_num, memory) in gpu_usage:
        metrics['gpu_'+str(gpu_num)+'_memory_MB'] = memory
    return metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch and returns metrics.
    only report system utils when we are local rank 0 at each machine.

    Distributed variant: metrics are all-reduced across ``self._worldsize``
    via the standalone ``get_metrics`` helper; only the chief process
    (``self._is_chief``) writes tensorboard logs and checkpoints.
    """
    logger.info("Rank %d: Epoch %d/%d", self._rank, epoch, self._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    if self._is_chief:
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.model.train()

    # should be 1 anyway, because we are only dealing with nprocess_with_ngpus
    num_gpus = len(self._cuda_device)

    # TODO: Implementation of whether the generator should take into account of worldsize.
    # Get tqdm for the training batches
    raw_train_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
    train_generator = lazy_groups_of(raw_train_generator, num_gpus)
    num_training_batches = math.ceil(
        self.iterator.get_num_batches(self.train_data) / num_gpus)
    self._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    histogram_parameters = set(
        self.model.get_parameters_for_histogram_tensorboard_logging())

    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(train_generator,
                                     total=num_training_batches)
    cumulative_batch_size = 0
    # NOTE: only work in nprocess_ngpus
    device = torch.device("cuda:%d" % self._cuda_device[0])
    for batch_group in train_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self.optimizer.zero_grad()

        loss = self.batch_loss(batch_group, for_training=True)

        if torch.isnan(loss):
            raise ValueError("nan loss encountered")

        loss.backward()

        train_loss += loss.item()

        batch_grad_norm = self.rescale_gradients()

        # This does nothing if batch_num_total is None or you are using a
        # scheduler which doesn't update per batch.
        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)
        if self._momentum_scheduler:
            self._momentum_scheduler.step_batch(batch_num_total)

        if self._is_chief:
            # only chief do tensorboard
            if self._tensorboard.should_log_histograms_this_batch():
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(
                        -1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()
        else:
            self.optimizer.step()

        # Update moving averages
        # NOTE: not sure whether this need to be average
        if self._moving_average is not None:
            self._moving_average.apply(batch_num_total)

        # Metrics are all-reduced across workers by the helper.
        metrics = get_metrics(self.model, device, self._worldsize, train_loss,
                              batches_this_epoch)
        description = training_util.description_from_metrics(metrics)
        train_generator_tqdm.set_description(
            ("Rank %d: " % self._rank) + description, refresh=False)

        if self._is_chief:
            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(
                    self.model, self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

        if self._log_batch_size_period:
            cur_batch = sum([
                training_util.get_batch_size(batch) for batch in batch_group
            ])
            cumulative_batch_size += cur_batch
            if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                average = cumulative_batch_size / batches_this_epoch
                logger.info(
                    f"rank {self._rank}, current batch size: {cur_batch} mean batch size: {average}"
                )
                if self._is_chief:
                    self._tensorboard.add_train_scalar(
                        "current_batch_size", cur_batch)
                    self._tensorboard.add_train_scalar(
                        "mean_batch_size", average)

        if self._is_chief:
            # Save model if needed (wall-clock interval, chief only).
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint('{0}.{1}'.format(
                    epoch, training_util.time_to_str(int(last_save_time))))

    # NOTE(review): unlike the sibling implementations, this final call has
    # no reset=True, so the model's accumulated metrics carry over into the
    # next epoch — confirm whether that is intentional.
    metrics = get_metrics(self.model, device, self._worldsize, train_loss,
                          batches_this_epoch)
    metrics['cpu_memory_MB'] = peak_cpu_usage
    return metrics