def forward_step(data_iterator, model):
    """Forward step."""
    timers = get_timers()

    # Get the batch.
    timers('batch generator').start()
    tokens, types, sentence_order, loss_mask, lm_labels, padding_mask \
        = get_batch(data_iterator)
    timers('batch generator').stop()

    # Forward model.
    lm_logits, sop_logits = model(tokens, padding_mask, tokentype_ids=types)

    sop_loss = F.cross_entropy(sop_logits.view(-1, 2).contiguous().float(),
                               sentence_order.view(-1).contiguous(),
                               ignore_index=-1)

    lm_loss_ = mpu.vocab_parallel_cross_entropy(lm_logits.contiguous().float(),
                                                lm_labels.contiguous())
    lm_loss = torch.sum(
        lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()

    loss = lm_loss + sop_loss

    reduced_losses = reduce_losses([lm_loss, sop_loss])

    return loss, {'lm loss': reduced_losses[0],
                  'sop loss': reduced_losses[1]}
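The `reduce_losses` helper used above (and throughout this section) is not shown. A minimal sketch of what it does, assuming a plain `torch.distributed` setup; the real helper also accounts for the data-parallel process group:

import torch

def reduce_losses(losses):
    """Average a list of scalar loss tensors across all ranks.
    Minimal sketch: detach each loss, concatenate, all-reduce, then
    divide by the world size so every rank logs the same value."""
    reduced = torch.cat([loss.clone().detach().view(1) for loss in losses])
    torch.distributed.all_reduce(reduced)
    reduced /= torch.distributed.get_world_size()
    return reduced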
def forward_step(data_iterator, model):
    """Forward step."""
    args = get_args()
    timers = get_timers()

    # Get the batch.
    timers('batch generator').start()
    query_tokens, query_pad_mask, \
        block_tokens, block_pad_mask, block_indices = get_ict_batch(data_iterator)
    timers('batch generator').stop()

    # Forward model.
    query_logits, block_logits = model(query_tokens, query_pad_mask,
                                       block_tokens, block_pad_mask)

    local_batch_size = query_logits.shape[0]
    # Recall we assert that model_parallel_size == 1, so the data-parallel
    # world size is the full world size.
    global_batch_size = dist.get_world_size() * local_batch_size

    all_query_logits = AllgatherFromDataParallelRegion.apply(query_logits)
    all_block_logits = AllgatherFromDataParallelRegion.apply(block_logits)

    # Scores are inner products between query and block embeddings.
    retrieval_scores = all_query_logits.float().matmul(
        torch.transpose(all_block_logits, 0, 1).float())

    softmaxed = F.softmax(retrieval_scores, dim=1)
    sorted_vals, sorted_indices = torch.topk(softmaxed,
                                             k=softmaxed.shape[1],
                                             sorted=True)

    def topk_accuracy(k):
        return torch.cuda.FloatTensor([
            sum([int(i in sorted_indices[i, :k])
                 for i in range(global_batch_size)]) / global_batch_size
        ])

    topk_accs = [topk_accuracy(int(k)) for k in args.report_topk_accuracies]

    retrieval_loss = torch.nn.CrossEntropyLoss()(
        retrieval_scores, torch.arange(global_batch_size).long().cuda())

    reduced_losses = reduce_losses([retrieval_loss, *topk_accs])

    # Create stats_dict with retrieval loss and all specified top-k accuracies.
    topk_acc_dict = {'top{}_acc'.format(k): v for k, v in
                     zip(args.report_topk_accuracies, reduced_losses[1:])}
    stats_dict = dict(retrieval_loss=reduced_losses[0], **topk_acc_dict)
    return retrieval_loss, stats_dict
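`AllgatherFromDataParallelRegion` gathers embeddings from every data-parallel rank while keeping gradients flowing back to the local slice. A sketch of such an autograd function, under the section's own assumption that model_parallel_size == 1 (so the data-parallel group is the whole world and all local batches have equal size):

import torch

class AllgatherFromDataParallelRegion(torch.autograd.Function):
    """Forward: all-gather 2-D embeddings along the batch dimension.
    Backward: return only this rank's slice of the gradient."""

    @staticmethod
    def forward(ctx, input_):
        assert input_.dim() == 2
        world_size = torch.distributed.get_world_size()
        tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
        torch.distributed.all_gather(tensor_list, input_.contiguous())
        return torch.cat(tensor_list, dim=0).contiguous()

    @staticmethod
    def backward(ctx, grad_output):
        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()
        # Equal-size chunks per rank, matching the forward concatenation.
        return grad_output.chunk(world_size, dim=0)[rank].contiguous()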
def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler):
    """Single training step."""

    # Pipeline parallelism schedules forward/backward/step itself.
    if neox_args.is_pipe_parallel:
        reduced_loss = train_step_pipe(neox_args=neox_args,
                                       timers=timers,
                                       model=model,
                                       data_iterator=data_iterator)
    else:
        losses = []
        for _ in range(neox_args.gradient_accumulation_steps):
            # Forward model for one step.
            timers("forward").start()
            loss = forward_step(
                neox_args=neox_args,
                timers=timers,
                data_iterator=data_iterator,
                model=model,
            )
            timers("forward").stop()
            losses.append(loss)

            # Calculate gradients, reduce across processes, and clip.
            timers("backward").start()
            backward_step(
                neox_args=neox_args,
                timers=timers,
                optimizer=optimizer,
                model=model,
                loss=loss,
            )
            timers("backward").stop()

            # Update parameters.
            timers("optimizer").start()
            if neox_args.deepspeed:
                model.step()
            else:
                raise ValueError("Must be using deepspeed to run neox")
            timers("optimizer").stop()

        # Reduce losses across machines for logging.
        reduced_loss = {"lm_loss": reduce_losses(losses).mean()}

    if neox_args.precision == "fp16" and model.optimizer.overflow:
        skipped_iter = 1
    else:
        skipped_iter = 0

    return reduced_loss, skipped_iter
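`backward_step` is referenced but not shown. Under DeepSpeed the engine owns loss scaling, gradient all-reduce, and clipping, so the step mostly delegates to `model.backward`. A minimal sketch:

def backward_step(neox_args, timers, optimizer, model, loss):
    """Backward step. Minimal sketch: DeepSpeed's engine API
    (model.backward) handles scaling, reduction, and clipping."""
    if neox_args.deepspeed:
        model.backward(loss)
    else:
        raise ValueError("Must be using deepspeed to run neox")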
def evaluate(neox_args, forward_step_fn, data_iterator, model,
             verbose=False, timers=None):
    """Evaluation.

    neox_args: NeoX arguments.
    forward_step_fn: function with args `neox_args, timers, data_iterator,
                     model` that runs a forward pass on the model.
    data_iterator: iterator over batches of data. Should return data in the
                   form {'text': np.array([tokens], dtype=np.int64)} where
                   the size of the array is the model's context size + 1
                   (`get_batch` transforms it into inputs / labels).
    """
    # Turn on evaluation mode, which disables dropout.
    model.eval()
    losses = []
    with torch.no_grad():
        iteration = 0
        while iteration < neox_args.eval_iters:
            iteration += 1
            if verbose and iteration % neox_args.log_interval == 0:
                print_rank_0('Evaluating iter {}/{}'.format(
                    iteration, neox_args.eval_iters))
            # Although we are not accumulating gradients here, we count one
            # iteration as train_batch_size_per_gpu * gradient_accumulation_steps
            # samples, to be consistent with DeepSpeed's pipe-parallel engine.
            for _ in range(neox_args.gradient_accumulation_steps):
                # Forward evaluation.
                loss = forward_step_fn(model=model,
                                       data_iterator=data_iterator,
                                       neox_args=neox_args,
                                       timers=timers)
                losses.append(loss)

            # When contiguous memory optimizations are enabled, the buffers
            # allocated by the optimizations are deallocated during the
            # backward pass. In the absence of a backward pass, the buffers
            # should be reset after each forward pass.
            if neox_args.deepspeed and neox_args.deepspeed_activation_checkpointing:
                deepspeed.checkpointing.reset()

    # Reduce losses across processes for logging.
    reduced_loss = {"lm_loss": reduce_losses(losses).mean()}

    # Move the model back to train mode.
    model.train()
    return reduced_loss
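A hypothetical call site for the function above (`valid_data_iterator` is an assumption; any iterator yielding batches in the documented format works):

# Run the eval loop on a validation iterator and log the result.
eval_loss = evaluate(
    neox_args=neox_args,
    forward_step_fn=forward_step,
    data_iterator=valid_data_iterator,
    model=model,
    verbose=True,
    timers=timers,
)
print_rank_0("validation lm_loss: {:.4f}".format(eval_loss["lm_loss"]))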
def forward_step(data_iterator, model):
    """Forward step."""
    args = get_args()
    timers = get_timers()

    # Get the batch.
    timers('batch generator').start()
    tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
        data_iterator)
    timers('batch generator').stop()

    # Forward model. Passing labels makes the model return per-token losses.
    losses = model(tokens, position_ids, attention_mask, labels=labels)
    loss_mask = loss_mask.view(-1)
    loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()

    # Reduce loss for logging.
    reduced_loss = reduce_losses([loss])

    return loss, {'lm loss': reduced_loss[0]}
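A toy illustration of the masked average above: padded positions (mask 0) contribute nothing, so the result is the mean loss over real tokens only.

import torch

losses = torch.tensor([2.0, 3.0, 4.0, 9.0])     # per-token losses
loss_mask = torch.tensor([1.0, 1.0, 1.0, 0.0])  # last token is padding
loss = torch.sum(losses * loss_mask) / loss_mask.sum()
print(loss)  # tensor(3.) == (2 + 3 + 4) / 3; the padded 9.0 is ignored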
def forward_step(data_iterator, model):
    """Forward step."""
    timers = get_timers()

    # Get the batch.
    timers('batch generator').start()
    tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
        data_iterator)
    timers('batch generator').stop()

    # Forward model.
    output = model(tokens, position_ids, attention_mask)
    losses = mpu.vocab_parallel_cross_entropy(output.contiguous().float(),
                                              labels)
    loss_mask = loss_mask.view(-1)
    loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()

    # Reduce loss for logging.
    reduced_loss = reduce_losses([loss])

    return loss, {'lm loss': reduced_loss[0]}
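For intuition, `mpu.vocab_parallel_cross_entropy` computes per-token cross-entropy while the vocabulary dimension stays sharded across tensor-parallel ranks. A single-GPU reference for the quantity it returns (a sketch, not the parallel implementation):

import torch.nn.functional as F

def vocab_parallel_cross_entropy_reference(logits, labels):
    """Single-GPU reference: per-token cross-entropy, no reduction.
    The parallel version produces the same values using local max/sum
    statistics plus all-reduces over the sharded vocab dimension."""
    return F.cross_entropy(logits.view(-1, logits.size(-1)),
                           labels.view(-1),
                           reduction='none').view_as(labels)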
def _cross_entropy_forward_step(batch, model):
    """Simple forward step with cross-entropy loss."""
    timers = get_timers()

    # Get the batch. `batch` may be an iterator or an already-materialized
    # batch; calling next() on the latter raises, in which case the object
    # itself is used.
    timers('batch generator').start()
    try:
        batch_ = next(batch)
    except BaseException:
        batch_ = batch
    tokens, types, labels, attention_mask = process_batch(batch_)
    timers('batch generator').stop()

    # Forward model.
    logits = model(tokens, attention_mask, types)

    # Cross-entropy loss.
    loss_func = torch.nn.CrossEntropyLoss()
    loss = loss_func(logits.contiguous().float(), labels)

    # Reduce loss for logging.
    reduced_loss = reduce_losses([loss])

    return loss, {'lm loss': reduced_loss[0]}
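A hypothetical illustration of the iterator-or-batch pattern above (`train_dataloader` is an assumption):

batch_iter = iter(train_dataloader)  # next(batch_iter) yields a batch
one_batch = next(batch_iter)         # next(one_batch) raises TypeError,
                                     # so the except branch uses it as-is
loss1, _ = _cross_entropy_forward_step(batch_iter, model)  # iterator path
loss2, _ = _cross_entropy_forward_step(one_batch, model)   # batch path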
def train_batch(self, data_iterator, epoch_idx, batch_idx):
    if self.neox_args.is_pipe_parallel:
        reduced_loss = megatron_train.train_step_pipe(
            neox_args=self.neox_args,
            timers=self.timers,
            model=self.model,
            data_iterator=data_iterator,
        )
    else:
        losses = []
        for _ in range(self.neox_args.gradient_accumulation_steps):
            self.timers("forward").start()
            loss = megatron_train.forward_step(
                neox_args=self.neox_args,
                timers=self.timers,
                data_iterator=data_iterator,
                model=self.model,
            )
            self.timers("forward").stop()
            losses.append(loss)

            # Calculate gradients, reduce across processes, and clip.
            self.timers("backward").start()
            megatron_train.backward_step(
                neox_args=self.neox_args,
                timers=self.timers,
                optimizer=self.optimizer,
                model=self.model,
                loss=loss,
            )
            self.timers("backward").stop()

            # Update parameters.
            self.timers("optimizer").start()
            if self.neox_args.deepspeed:
                self.model.step()
            else:
                raise ValueError("Must be using deepspeed to run neox")
            self.timers("optimizer").stop()

        reduced_loss = {
            "lm_loss": megatron_utils.reduce_losses(losses).mean()
        }

    if self.neox_args.precision == "fp16" and self.model.optimizer.overflow:
        skipped_iter = 1
    else:
        skipped_iter = 0

    self.neox_args.iteration += 1

    self.overflow_monitor.check(skipped_iter)  # check for repeated overflow
    if self.neox_args.log_gradient_noise_scale:  # log noise scale if applicable
        self.noise_scale_logger.update()

    # Get the learning rate (if present). When doing soft prompt tuning with
    # pipe parallelism, a given rank may have no tunable parameters.
    if self.optimizer.param_groups:
        lr = self.optimizer.param_groups[0].get("lr", 0)
    else:
        lr = 0

    # Logging.
    self.report_memory_flag, additional_metrics = megatron_train.training_log(
        neox_args=self.neox_args,
        timers=self.timers,
        loss_dict=reduced_loss,
        total_loss_dict=self.total_train_loss_dict,
        learning_rate=lr,
        iteration=self.neox_args.iteration,
        loss_scale=self.optimizer.cur_scale
        if self.neox_args.precision == "fp16" else None,
        report_memory_flag=self.report_memory_flag,
        skipped_iter=skipped_iter,
        model=self.model,
        optimizer=self.optimizer,
        noise_scale_logger=self.noise_scale_logger,
        return_metrics=True,
    )
    if (additional_metrics is not None
            and additional_metrics["num_nans"] == 0
            and additional_metrics["num_skipped"] == 0):
        self.tflops = additional_metrics["flops_per_sec_per_gpu"] / 10**12

    if (self.neox_args.exit_interval
            and self.neox_args.iteration % self.neox_args.exit_interval == 0):
        torch.distributed.barrier()
        time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        megatron_utils.print_rank_0(
            "time: {} | exiting the program at iteration {}".format(
                time_str, self.neox_args.iteration))
        self.context.set_stop_requested(True)
    return reduced_loss
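`self.overflow_monitor.check(skipped_iter)` guards against runs stuck in fp16 loss-scale overflow. A minimal sketch of such a monitor; the constructor signature and limit here are assumptions, not the real helper's API:

class OverflowMonitor:
    """Abort when the fp16 loss scale overflows too many steps in a row.
    Minimal sketch; `limit` and the constructor are assumptions."""

    def __init__(self, limit=50):
        self.limit = limit
        self.consecutive = 0

    def check(self, skipped_iter):
        # Count consecutive skipped (overflowed) iterations.
        self.consecutive = self.consecutive + 1 if skipped_iter else 0
        if self.consecutive >= self.limit:
            raise Exception(
                f"{self.limit} consecutive loss-scale overflows - aborting")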
def evaluate(neox_args, forward_step_fn, data_iterator, model,
             verbose=False, timers=None):
    """Evaluation.

    neox_args: NeoX arguments.
    forward_step_fn: function with args `neox_args, timers, data_iterator,
                     model` that runs a forward pass on the model.
    data_iterator: iterator over batches of data. Should return data in the
                   form {'text': np.array([tokens], dtype=np.int64)} where
                   the size of the array is the model's context size + 1
                   (`get_batch` transforms it into inputs / labels).
    """
    # Turn on evaluation mode, which disables dropout.
    model.eval()
    losses = []
    if neox_args.char_level_ppl:
        data_iterator = CharCounter(data_iterator, neox_args.tokenizer)

    with torch.no_grad():
        iteration = 0
        while iteration < neox_args.eval_iters:
            iteration += 1
            if verbose and iteration % neox_args.log_interval == 0:
                print_rank_0("Evaluating iter {}/{}".format(
                    iteration, neox_args.eval_iters))

            # Although we are not accumulating gradients here, we count one
            # iteration as train_batch_size_per_gpu * gradient_accumulation_steps
            # samples, to be consistent with DeepSpeed's pipe-parallel engine.
            # Since pipe parallelism already takes gradient accumulation into
            # account, default to one pass here if pipe parallelism is enabled.
            for _ in range(1 if neox_args.is_pipe_parallel
                           else neox_args.gradient_accumulation_steps):
                # Forward evaluation.
                loss = forward_step_fn(
                    model=model,
                    data_iterator=data_iterator,
                    neox_args=neox_args,
                    timers=timers,
                )
                losses.append(loss)

            # When contiguous memory optimizations are enabled, the buffers
            # allocated by the optimizations are deallocated during the
            # backward pass. In the absence of a backward pass, the buffers
            # should be reset after each forward pass.
            if neox_args.deepspeed and neox_args.deepspeed_activation_checkpointing:
                deepspeed.checkpointing.reset()

    # Reduce losses across processes for logging and run eval-harness tasks.
    eval_results = {"lm_loss": reduce_losses(losses).mean().item()}
    eval_results["lm_loss_ppl"] = math.exp(eval_results["lm_loss"])

    if neox_args.char_level_ppl:
        # Calculate character-level perplexity: unwrap the CharCounter
        # around the data iterator and rescale the loss by tokens per char.
        tokens_per_char = data_iterator.tokens_per_char()
        print_rank_0(f"Counting chars took {data_iterator.total_time} seconds")
        data_iterator = data_iterator.data_iterator
        eval_results["lm_loss_char_lvl_ppl"] = math.exp(
            eval_results["lm_loss"] * tokens_per_char)

    if neox_args.eval_tasks:
        eval_results.update(
            run_eval_harness(model, forward_step_fn, neox_args,
                             eval_tasks=neox_args.eval_tasks).get("results"))

    # Move the model back to train mode.
    model.train()
    return eval_results
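A worked example of the character-level perplexity conversion above: multiplying the token-level loss by tokens-per-char rescales it to nats per character before exponentiating.

import math

lm_loss = 2.0           # token-level loss (nats per token)
tokens_per_char = 0.25  # e.g. ~4 characters per token on average
print(math.exp(lm_loss))                    # token-level ppl ~= 7.389
print(math.exp(lm_loss * tokens_per_char))  # char-level ppl  ~= 1.649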