Example #1
def forward_step(data_iterator, model):
    """Forward step."""
    timers = get_timers()

    # Get the batch.
    timers('batch generator').start()
    tokens, types, sentence_order, loss_mask, lm_labels, padding_mask \
        = get_batch(data_iterator)
    timers('batch generator').stop()

    # Forward model.
    lm_logits, sop_logits = model(tokens, padding_mask, tokentype_ids=types)

    sop_loss = F.cross_entropy(sop_logits.view(-1, 2).contiguous().float(),
                               sentence_order.view(-1).contiguous(),
                               ignore_index=-1)

    lm_loss_ = mpu.vocab_parallel_cross_entropy(lm_logits.contiguous().float(),
                                                lm_labels.contiguous())
    lm_loss = torch.sum(
        lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()

    loss = lm_loss + sop_loss

    reduced_losses = reduce_losses([lm_loss, sop_loss])

    return loss, {'lm loss': reduced_losses[0], 'sop loss': reduced_losses[1]}
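Example #1 returns its per-component losses through reduce_losses so they can be logged consistently across data-parallel ranks; the helper itself does not appear in these snippets. A minimal sketch of what such a reduction typically does (assuming torch.distributed is already initialized; the actual Megatron helper may differ in detail):

import torch
import torch.distributed as dist

def reduce_losses(losses):
    # Sketch only: stack the detached scalar losses into one tensor so a
    # single all_reduce covers all of them, then average over ranks.
    reduced = torch.cat([loss.clone().detach().view(1) for loss in losses])
    dist.all_reduce(reduced)
    return reduced / dist.get_world_size()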
Example #2
def forward_step(data_iterator, model):
    """Forward step."""
    args = get_args()
    timers = get_timers()

    # Get the batch.
    timers('batch generator').start()
    query_tokens, query_pad_mask, \
    block_tokens, block_pad_mask, block_indices = get_ict_batch(data_iterator)
    timers('batch generator').stop()

    # Forward model.
    query_logits, block_logits = model(query_tokens, query_pad_mask,
                                       block_tokens, block_pad_mask)
    local_batch_size = query_logits.shape[0]
    # recall we assert that model_parallel_size == 1
    global_batch_size = dist.get_world_size() * local_batch_size

    all_query_logits = AllgatherFromDataParallelRegion.apply(query_logits)
    all_block_logits = AllgatherFromDataParallelRegion.apply(block_logits)

    # scores are inner products between query and block embeddings
    retrieval_scores = all_query_logits.float().matmul(
        torch.transpose(all_block_logits, 0, 1).float())
    softmaxed = F.softmax(retrieval_scores, dim=1)
    sorted_vals, sorted_indices = torch.topk(softmaxed,
                                             k=softmaxed.shape[1],
                                             sorted=True)

    def topk_accuracy(k):
        return torch.cuda.FloatTensor([
            sum([
                int(i in sorted_indices[i, :k])
                for i in range(global_batch_size)
            ]) / global_batch_size
        ])

    topk_accs = [topk_accuracy(int(k)) for k in args.report_topk_accuracies]
    retrieval_loss = torch.nn.CrossEntropyLoss()(
        retrieval_scores, torch.arange(global_batch_size).long().cuda())
    reduced_losses = reduce_losses([retrieval_loss, *topk_accs])

    # create stats_dict with retrieval loss and all specified top-k accuracies
    topk_acc_dict = {
        'top{}_acc'.format(k): v
        for k, v in zip(args.report_topk_accuracies, reduced_losses[1:])
    }
    stats_dict = dict(retrieval_loss=reduced_losses[0], **topk_acc_dict)

    return retrieval_loss, stats_dict
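The retrieval loss in Example #2 uses in-batch negatives: after gathering embeddings across data-parallel ranks, the correct block for query i sits at column i of retrieval_scores, so the cross-entropy targets are simply torch.arange(global_batch_size). A self-contained toy version of the same objective (shapes and values invented for illustration; no model or distributed gather):

import torch
import torch.nn.functional as F

batch_size, dim = 4, 8
query_embeds = torch.randn(batch_size, dim)
block_embeds = torch.randn(batch_size, dim)

# Score every query against every block; the matching block shares the row index.
scores = query_embeds @ block_embeds.t()           # [batch_size, batch_size]
targets = torch.arange(batch_size)                 # correct block for query i is i
loss = F.cross_entropy(scores, targets)

# Top-1 retrieval accuracy for the toy batch.
top1_acc = (scores.argmax(dim=1) == targets).float().mean()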
Example #3
def train_step(neox_args, timers, data_iterator, model, optimizer,
               lr_scheduler):
    """Single training step."""

    # Pipeline parallelism schedules forward/backward/step
    if neox_args.is_pipe_parallel:
        reduced_loss = train_step_pipe(neox_args=neox_args,
                                       timers=timers,
                                       model=model,
                                       data_iterator=data_iterator)
    else:
        losses = []
        for _ in range(neox_args.gradient_accumulation_steps):
            # Forward model for one step.
            timers("forward").start()
            loss = forward_step(
                neox_args=neox_args,
                timers=timers,
                data_iterator=data_iterator,
                model=model,
            )
            timers("forward").stop()
            losses.append(loss)
            # Calculate gradients, reduce across processes, and clip.
            timers("backward").start()
            backward_step(
                neox_args=neox_args,
                timers=timers,
                optimizer=optimizer,
                model=model,
                loss=loss,
            )
            timers("backward").stop()
            # Update parameters.
            timers("optimizer").start()
            if neox_args.deepspeed:
                model.step()
            else:
                raise ValueError("Must be using deepspeed to run neox")
            timers("optimizer").stop()
        reduced_loss = {
            "lm_loss": reduce_losses(losses).mean()
        }  # reduces losses across machines for logging

    if neox_args.precision == "fp16" and model.optimizer.overflow:
        skipped_iter = 1
    else:
        skipped_iter = 0

    return reduced_loss, skipped_iter
Example #4
def evaluate(neox_args,
             forward_step_fn,
             data_iterator,
             model,
             verbose=False,
             timers=None):
    """Evaluation.
    neox_args: NeoX Arguments
    forward_step_fn: function with args `neox_args, timers,
                    data_iterator, model` that runs a forward pass on the model
    data_iterator: Iterator that iterates over batches of data. Should return data in the form:
                    {'text': np.array([tokens], dtype=np.int64)}
                    where the size of the array is the model's context size + 1
                    (`get_batch` transforms it into inputs / labels)
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    losses = []
    with torch.no_grad():
        iteration = 0
        while iteration < neox_args.eval_iters:
            iteration += 1
            if verbose and iteration % neox_args.log_interval == 0:
                print_rank_0('Evaluating iter {}/{}'.format(
                    iteration, neox_args.eval_iters))

            # although we're not accumulating gradients here, we count one iter as
            # train_batch_size_per_gpu * gradient_accumulation_steps to be consistent
            # with deepspeed's pipe parallel engine
            for _ in range(neox_args.gradient_accumulation_steps):
                # Forward evaluation
                loss = forward_step_fn(model=model,
                                       data_iterator=data_iterator,
                                       neox_args=neox_args,
                                       timers=timers)
                losses.append(loss)

            # When contiguous memory optimizations are enabled, the buffers
            # allocated by the optimizations are deallocated during backward pass
            # in the absence of backward pass the buffers should be reset after each
            # forward pass
            if neox_args.deepspeed and neox_args.deepspeed_activation_checkpointing:
                deepspeed.checkpointing.reset()
    # reduces losses across processes for logging
    reduced_loss = {"lm_loss": reduce_losses(losses).mean()}
    # Move model back to the train mode.
    model.train()
    return reduced_loss
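The docstring of Example #4's evaluate specifies the batch format the data iterator must yield. A toy iterator matching that contract (token values and sizes invented purely for illustration; real loaders typically also add a batch dimension before get_batch sees the data):

import numpy as np

def toy_eval_iterator(num_batches=2, context_size=8, vocab_size=50257):
    # Each batch carries context_size + 1 tokens so that get_batch can split
    # the sequence into inputs (tokens[:-1]) and shifted labels (tokens[1:]).
    for _ in range(num_batches):
        tokens = np.random.randint(0, vocab_size, size=context_size + 1, dtype=np.int64)
        yield {'text': tokens}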
Example #5
def forward_step(data_iterator, model):
    """Forward step."""
    args = get_args()
    timers = get_timers()

    # Get the batch.
    timers('batch generator').start()
    tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
        data_iterator)
    timers('batch generator').stop()
    # Forward model.
    losses = model(tokens, position_ids, attention_mask, labels=labels)
    loss_mask = loss_mask.view(-1)
    loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()

    # Reduce loss for logging.
    reduced_loss = reduce_losses([loss])

    return loss, {'lm loss': reduced_loss[0]}
Example #6
def forward_step(data_iterator, model):
    """Forward step."""
    timers = get_timers()

    # Get the batch.
    timers('batch generator').start()
    tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
        data_iterator)
    timers('batch generator').stop()

    # Forward model.
    output = model(tokens, position_ids, attention_mask)
    losses = mpu.vocab_parallel_cross_entropy(output.contiguous().float(),
                                              labels)
    loss_mask = loss_mask.view(-1)
    loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()

    # Reduce loss for logging.
    reduced_loss = reduce_losses([loss])

    return loss, {'lm loss': reduced_loss[0]}
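Examples #5 and #6 both average the per-token cross-entropy with a loss mask so that padded (or otherwise ignored) positions contribute nothing to the loss. A tiny numeric illustration of that masked mean (values invented):

import torch

# Per-token losses for a flattened batch of 4 positions; the last one is padding.
losses = torch.tensor([2.0, 1.0, 3.0, 5.0])
loss_mask = torch.tensor([1.0, 1.0, 1.0, 0.0])

# Masked positions are zeroed out and the denominator counts only real tokens:
# (2.0 + 1.0 + 3.0) / 3 = 2.0
loss = torch.sum(losses * loss_mask) / loss_mask.sum()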
Example #7
def _cross_entropy_forward_step(batch, model):
    """Simple forward step with cross-entropy loss."""
    timers = get_timers()

    # Get the batch.
    timers('batch generator').start()
    # The caller may pass either a data iterator or an already-materialized batch.
    try:
        batch_ = next(batch)
    except BaseException:
        batch_ = batch
    tokens, types, labels, attention_mask = process_batch(batch_)
    timers('batch generator').stop()

    # Forward model.
    logits = model(tokens, attention_mask, types)

    # Cross-entropy loss.
    loss_func = torch.nn.CrossEntropyLoss()
    loss = loss_func(logits.contiguous().float(), labels)

    # Reduce loss for logging.
    reduced_loss = reduce_losses([loss])

    return loss, {'lm loss': reduced_loss[0]}
Example #8
    def train_batch(self, data_iterator, epoch_idx, batch_idx):
        if self.neox_args.is_pipe_parallel:
            reduced_loss = megatron_train.train_step_pipe(
                neox_args=self.neox_args,
                timers=self.timers,
                model=self.model,
                data_iterator=data_iterator,
            )
        else:
            losses = []
            for _ in range(self.neox_args.gradient_accumulation_steps):
                self.timers("forward").start()
                loss = megatron_train.forward_step(
                    neox_args=self.neox_args,
                    timers=self.timers,
                    data_iterator=data_iterator,
                    model=self.model,
                )
                self.timers("forward").stop()
                losses.append(loss)
                # Calculate gradients, reduce across processes, and clip.
                self.timers("backward").start()
                megatron_train.backward_step(
                    neox_args=self.neox_args,
                    timers=self.timers,
                    optimizer=self.optimizer,
                    model=self.model,
                    loss=loss,
                )
                self.timers("backward").stop()
                # Update parameters.
                self.timers("optimizer").start()
                if self.neox_args.deepspeed:
                    self.model.step()
                else:
                    raise ValueError("Must be using deepspeed to run neox")
                self.timers("optimizer").stop()
            reduced_loss = {
                "lm_loss": megatron_utils.reduce_losses(losses).mean()
            }

        if self.neox_args.precision == "fp16" and self.model.optimizer.overflow:
            skipped_iter = 1
        else:
            skipped_iter = 0
        self.neox_args.iteration += 1

        # check for repeated overflow
        self.overflow_monitor.check(skipped_iter)
        if self.neox_args.log_gradient_noise_scale:  # log noise scale if applicable
            self.noise_scale_logger.update()

        # get learning rate (if present) - if doing soft prompt tuning + pipe parallel, you
        # may have no tunable parameters on a specific rank
        if self.optimizer.param_groups:
            lr = self.optimizer.param_groups[0].get("lr", 0)
        else:
            lr = 0

        # Logging.
        self.report_memory_flag, additional_metrics = megatron_train.training_log(
            neox_args=self.neox_args,
            timers=self.timers,
            loss_dict=reduced_loss,
            total_loss_dict=self.total_train_loss_dict,
            learning_rate=lr,
            iteration=self.neox_args.iteration,
            loss_scale=self.optimizer.cur_scale
            if self.neox_args.precision == "fp16" else None,
            report_memory_flag=self.report_memory_flag,
            skipped_iter=skipped_iter,
            model=self.model,
            optimizer=self.optimizer,
            noise_scale_logger=self.noise_scale_logger,
            return_metrics=True,
        )
        if (additional_metrics is not None
                and additional_metrics["num_nans"] == 0
                and additional_metrics["num_skipped"] == 0):
            self.tflops = additional_metrics["flops_per_sec_per_gpu"] / 10**12

        if (self.neox_args.exit_interval and
                self.neox_args.iteration % self.neox_args.exit_interval == 0):
            torch.distributed.barrier()
            time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            rank = torch.distributed.get_rank()
            megatron_utils.print_rank_0(
                "time: {} | exiting the program at iteration {}".format(
                    time_str, self.neox_args.iteration))
            self.context.set_stop_requested(True)
        return reduced_loss
Example #9
def evaluate(neox_args,
             forward_step_fn,
             data_iterator,
             model,
             verbose=False,
             timers=None):
    """Evaluation.
    neox_args: NeoX Arguments
    forward_step_fn: function with args `neox_args, timers,
                    data_iterator, model` that runs a forward pass on the model
    data_iterator: Iterator that iterates over batches of data. Should return data in the form:
                    {'text': np.array([tokens], dtype=np.int64)}
                    where the size of the array is the model's context size + 1
                    (`get_batch` transforms it into inputs / labels)
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    losses = []
    if neox_args.char_level_ppl:
        data_iterator = CharCounter(data_iterator, neox_args.tokenizer)

    with torch.no_grad():
        iteration = 0
        while iteration < neox_args.eval_iters:
            iteration += 1
            if verbose and iteration % neox_args.log_interval == 0:
                print_rank_0("Evaluating iter {}/{}".format(
                    iteration, neox_args.eval_iters))

            # although we're not accumulating gradients here, we count one iter as
            # train_batch_size_per_gpu * gradient_accumulation_steps to be consistent
            # with deepspeed's pipe parallel engine; pipe parallel already accounts
            # for gradient accumulation, so default to 1 step when it is enabled
            for _ in range(1 if neox_args.is_pipe_parallel
                           else neox_args.gradient_accumulation_steps):
                # Forward evaluation
                loss = forward_step_fn(
                    model=model,
                    data_iterator=data_iterator,
                    neox_args=neox_args,
                    timers=timers,
                )
                losses.append(loss)

            # When contiguous memory optimizations are enabled, the buffers
            # allocated by the optimizations are deallocated during backward pass
            # in the absence of backward pass the buffers should be reset after each
            # forward pass
            if neox_args.deepspeed and neox_args.deepspeed_activation_checkpointing:
                deepspeed.checkpointing.reset()

    # reduces losses across processes for logging & run eval harness tasks
    eval_results = {"lm_loss": reduce_losses(losses).mean().item()}
    eval_results["lm_loss_ppl"] = math.exp(eval_results["lm_loss"])

    if neox_args.char_level_ppl:
        # calculate character-level perplexity, if specified, then unwrap the
        # CharCounter to recover the original data_iterator
        tokens_per_char = data_iterator.tokens_per_char()
        print_rank_0(f"Counting chars took {data_iterator.total_time} seconds")

        data_iterator = data_iterator.data_iterator
        eval_results["lm_loss_char_lvl_ppl"] = math.exp(
            eval_results["lm_loss"] * tokens_per_char)

    if neox_args.eval_tasks:
        eval_results.update(
            run_eval_harness(model,
                             forward_step_fn,
                             neox_args,
                             eval_tasks=neox_args.eval_tasks).get("results"))
    # Move model back to the train mode.
    model.train()
    return eval_results
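The character-level perplexity in Example #9 follows from rescaling the per-token loss: if each character corresponds on average to tokens_per_char tokens, the per-character loss is lm_loss * tokens_per_char, and exponentiating it gives the character-level perplexity. A small worked example with invented numbers:

import math

lm_loss = 2.0           # mean per-token cross-entropy from evaluation
tokens_per_char = 0.25  # e.g. roughly 4 characters per token on average

token_level_ppl = math.exp(lm_loss)                    # ~7.39
char_level_ppl = math.exp(lm_loss * tokens_per_char)   # ~1.65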