Example #1
def train_epoch(train_dloader, model, optimizer, cur_epoch, cfg):
    model.train()
    train_tqdm = tqdm(train_dloader, ncols=80)
    data_size = len(train_dloader)
    for cur_iter, (inputs, labels, _, extra_data) in enumerate(train_tqdm):
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in extra_data.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    extra_data[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
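        # The schedule is queried at a fractional epoch (cur_epoch plus progress
        # through the epoch), so the learning rate can change every iteration.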
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        if cfg.DETECTION.ENABLE:
            preds = model(inputs, extra_data["boxes"])
        else:
            preds = model(inputs)

        # Explicitly declare reduction to mean.
        loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

        # Compute the loss.
        loss = loss_fun(preds, labels)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        train_tqdm.set_description("Train_loss: %.4f" % loss.cpu().item())
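
# Usage sketch (not part of the original example): a minimal driver that builds
# the loader, model, and optimizer and calls the train_epoch above once per
# epoch. The imports and helper names below assume the SlowFast package layout
# (build_model, loader.construct_loader, optim.construct_optimizer,
# cfg.SOLVER.MAX_EPOCH); adapt them if your project differs.
# import slowfast.datasets.loader as loader
# import slowfast.models.optimizer as optim
# from slowfast.models import build_model
def train(cfg):
    model = build_model(cfg)
    optimizer = optim.construct_optimizer(model, cfg)
    train_dloader = loader.construct_loader(cfg, "train")
    for cur_epoch in range(cfg.SOLVER.MAX_EPOCH):
        train_epoch(train_dloader, model, optimizer, cur_epoch, cfg)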
Example #2
def train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, cfg, cnt):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        cnt (int): running global iteration counter; passed to the meter for
            logging, incremented each iteration, and returned.
    """
    # Enable train mode.
    model.train()
    if cfg.BN.FREEZE:
        model.freeze_fn('bn_statistics')

    train_meter.iter_tic()
    data_size = len(train_loader)

    #for cur_iter, (inputs, bboxs, masks, labels, _, meta) in enumerate(train_loader):
    for cur_iter, output_dict in enumerate(train_loader):
        if cfg.EPICKITCHENS.USE_BBOX:
            inputs = output_dict['inputs']
            bboxs = output_dict['bboxs']
            masks = output_dict['masks']
            labels = output_dict['label'] 
            # output_dict['index'] 
            meta = output_dict['metadata'] 
        else:
            inputs = output_dict['inputs']
            labels = output_dict['label'] 
            meta = output_dict['metadata'] 
        

        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list,)):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)
        if isinstance(labels, (dict,)):
            labels = {k: v.cuda() for k, v in labels.items()}
        else:
            labels = labels.cuda()
        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"])

        else:
            # Perform the forward pass.
            if cfg.EPICKITCHENS.USE_BBOX:
                if isinstance(bboxs, (list,)):
                    for i in range(len(bboxs)):
                        bboxs[i] = bboxs[i].cuda(non_blocking=True)
                        masks[i] = masks[i].cuda(non_blocking=True)
                else:
                    bboxs = bboxs.cuda(non_blocking=True)
                    masks = masks.cuda(non_blocking=True)
                
                preds = model(inputs, bboxes=bboxs, masks=masks)
            else:
                preds = model(inputs)

        if isinstance(labels, (dict,)):
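            # EPIC-Kitchens-style multi-task output: preds[0] holds the verb
            # logits and preds[1] the noun logits; the two cross-entropy terms
            # are averaged with equal weight.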
            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")
            # Compute the loss.
            loss_verb = loss_fun(preds[0], labels['verb'])
            loss_noun = loss_fun(preds[1], labels['noun'])
            loss = 0.5 * (loss_verb + loss_noun)
            # check Nan Loss.
            misc.check_nan_losses(loss)
        else:
            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")
            # Compute the loss.
            loss = loss_fun(preds, labels)
            # check Nan Loss.
            misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
        else:
            if isinstance(labels, (dict,)):
                # Compute the verb accuracies.
                verb_top1_acc, verb_top5_acc = metrics.topk_accuracies(preds[0], labels['verb'], (1, 5))
                
                # predicted_answer_softmax = torch.nn.Softmax(dim=1)(preds[0])
                # predicted_answer_max = torch.max(predicted_answer_softmax.data, 1).indices
                # print(cnt, predicted_answer_max, labels['verb'])

                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss_verb, verb_top1_acc, verb_top5_acc = du.all_reduce(
                        [loss_verb, verb_top1_acc, verb_top5_acc]
                    )

                # Copy the stats from GPU to CPU (sync point).
                loss_verb, verb_top1_acc, verb_top5_acc = (
                    loss_verb.item(),
                    verb_top1_acc.item(),
                    verb_top5_acc.item(),
                )

                # Compute the noun accuracies.
                noun_top1_acc, noun_top5_acc = metrics.topk_accuracies(preds[1], labels['noun'], (1, 5))

                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss_noun, noun_top1_acc, noun_top5_acc = du.all_reduce(
                        [loss_noun, noun_top1_acc, noun_top5_acc]
                    )

                # Copy the stats from GPU to CPU (sync point).
                loss_noun, noun_top1_acc, noun_top5_acc = (
                    loss_noun.item(),
                    noun_top1_acc.item(),
                    noun_top5_acc.item(),
                )

                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss = du.all_reduce(
                        [loss]
                    )
                    if isinstance(loss, (list,)):
                        loss = loss[0]

                # Copy the stats from GPU to CPU (sync point).
                loss = loss.item()

                train_meter.iter_toc()
                # Update and log stats.
                train_meter.update_stats(
                    (verb_top1_acc, noun_top1_acc),
                    (verb_top5_acc, noun_top5_acc),
                    (loss_verb, loss_noun, loss),
                    lr, inputs[0].size(0) * cfg.NUM_GPUS
                )
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(preds, labels, (1, 5))
                top1_err, top5_err = [
                    (1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct
                ]

                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err]
                    )

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

                train_meter.iter_toc()
                # Update and log stats.
                train_meter.update_stats(
                    top1_err, top5_err, loss, lr, inputs[0].size(0) * cfg.NUM_GPUS
                )
        train_meter.log_iter_stats(cur_epoch, cur_iter, cnt)
        train_meter.iter_tic()
        cnt += 1
    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
    return cnt
Example #3
def train_epoch(train_loader,
                model,
                optimizer,
                scaler,
                train_meter,
                cur_epoch,
                cfg,
                writer=None):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        scaler (GradScaler): gradient scaler used for mixed-precision training.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard logs.
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    if cfg.MIXUP.ENABLE:
        mixup_fn = MixUp(
            mixup_alpha=cfg.MIXUP.ALPHA,
            cutmix_alpha=cfg.MIXUP.CUTMIX_ALPHA,
            mix_prob=cfg.MIXUP.PROB,
            switch_prob=cfg.MIXUP.SWITCH_PROB,
            label_smoothing=cfg.MIXUP.LABEL_SMOOTH_VALUE,
            num_classes=cfg.MODEL.NUM_CLASSES,
        )

    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()
        if cfg.MIXUP.ENABLE:
            samples, labels = mixup_fn(inputs[0], labels)
            inputs[0] = samples

        with torch.cuda.amp.autocast(enabled=cfg.TRAIN.MIXED_PRECISION):
            if cfg.DETECTION.ENABLE:
                preds = model(inputs, meta["boxes"])
            else:
                preds = model(inputs)
            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(
                cfg.MODEL.LOSS_FUNC)(reduction="mean")

            # Compute the loss.
            loss = loss_fun(preds, labels)

        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        scaler.scale(loss).backward()
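        # The gradients above carry the loss-scale factor; they are unscaled
        # before clipping so the thresholds apply to true gradient magnitudes.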
        # Unscales the gradients of optimizer's assigned params in-place
        scaler.unscale_(optimizer)
        # Clip gradients if necessary
        if cfg.SOLVER.CLIP_GRAD_VAL:
            torch.nn.utils.clip_grad_value_(model.parameters(),
                                            cfg.SOLVER.CLIP_GRAD_VAL)
        elif cfg.SOLVER.CLIP_GRAD_L2NORM:
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           cfg.SOLVER.CLIP_GRAD_L2NORM)
        # Update the parameters.
        scaler.step(optimizer)
        scaler.update()

        if cfg.MIXUP.ENABLE:
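            # MixUp produced soft two-hot labels; to report top-k accuracy,
            # merge the prediction mass of the two mixed classes and fall back
            # to the dominant class index as the hard label.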
            _top_max_k_vals, top_max_k_inds = torch.topk(labels,
                                                         2,
                                                         dim=1,
                                                         largest=True,
                                                         sorted=True)
            idx_top1 = torch.arange(labels.shape[0]), top_max_k_inds[:, 0]
            idx_top2 = torch.arange(labels.shape[0]), top_max_k_inds[:, 1]
            preds = preds.detach()
            preds[idx_top1] += preds[idx_top2]
            preds[idx_top2] = 0.0
            labels = top_max_k_inds[:, 0]

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            # Update and log stats.
            train_meter.update_stats(
                top1_err,
                top5_err,
                loss,
                lr,
                inputs[0].size(0) * max(
                    cfg.NUM_GPUS, 1
                ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
Example #4
def train_epoch(
        train_loader, model, optimizer, train_meter, cur_epoch, cfg, writer=None
):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard logs.
    """
    # Enable train mode.
    model.train()
    # Check if the correct params are set to requires_grad = True
    assert_requires_grad_correctness(model, du.is_master_proc(), cfg)
    train_meter.iter_tic()
    data_size = len(train_loader)
    np.set_printoptions(suppress=True)

    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list,)):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)
        labels = labels.cuda()
        for key, val in meta.items():
            if isinstance(val, (list,)):
                for i in range(len(val)):
                    val[i] = val[i].cuda(non_blocking=True)
            else:
                meta[key] = val.cuda(non_blocking=True)

        if cfg.MODEL.HEAD_ACT == "softmax" and cfg.TRAIN.DATASET == "custom":
            # We have to change our labels to long tensor
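            # (cross-entropy-style losses expect integer class indices)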
            labels = labels.type(torch.LongTensor)
            labels = labels.cuda()

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr, cfg)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"], is_train=True)

        else:
            # Perform the forward pass.
            preds = model(inputs)

        # Explicitly declare reduction to mean.
        loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

        # Compute the loss.
        loss = loss_fun(preds, labels)

        """
        if cur_iter % 70 == 0:
            softmax = torch.nn.Softmax(dim=1)
            probabilities = softmax(preds)
            loss_prob = loss_fun(probabilities, labels)
            preds_numpy = probabilities.cpu().detach().numpy()
            preds_numpy = np.round(preds_numpy, 4)
            labels_numpy = labels.cpu().detach().numpy()
            print("--------------------------")
            for label, pred in zip (labels_numpy, preds_numpy):
                print(str(label) + "---->", end= "")
                print(pred[label])
        """


        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        # Todo: adjust accordingly
        if cfg.DETECTION.ENABLE:  #and not (cfg.MODEL.HEAD_ACT == "softmax"):
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {"Train/loss": loss, "Train/lr": lr},
                    global_step=data_size * cur_epoch + cur_iter,
                )

        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(preds, labels, (1, 5))
                top1_err, top5_err = [
                    (1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct
                ]

                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err]
                    )

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(
                top1_err, top5_err, loss, lr, inputs[0].size(0) * cfg.NUM_GPUS
            )
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
Example #5
def train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, cfg):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)
    regr_list = []
    num_list = []
    top_list = []
    for cur_iter, (inputs, labels, _) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list, )):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)

        if isinstance(labels, (list, )):
            for i in range(len(labels)):
                labels[i] = labels[i].cuda(non_blocking=True)
            labels = torch.stack(labels)
        else:
            labels = labels.cuda(non_blocking=True)

        if cfg.MODEL.LOSS_FUNC == 'mse':
            labels = labels.float()

        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        # Perform the forward pass.
        preds = model(inputs)
        # Explicitly declare reduction to mean.
        loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")
        # Compute the loss.

        loss = loss_fun(preds, labels)
        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        top1_err = None
        # Compute the errors.
        num_classes = cfg.MODEL.NUM_CLASSES
        if cfg.DATA.LABELS_TYPE == 'regression':
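            # Label layout implied by the indexing below: column 0 is a
            # regressed size target, columns 1..ln-1 are regressed numbers, and
            # the remaining columns are per-slot class targets scored against
            # 5-way prediction groups.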
            ln = (labels.size(1) - 1) // 2 + 1
            pr = preds[:, ln:].reshape(-1, 5)
            lb = labels[:, ln:].reshape(-1)
            num_topks_correct = metrics.topks_correct(pr, lb, (1, ))
            top1_err = (1.0 - num_topks_correct[0] / len(lb)) * 100.0
            regr = ((preds[:, 0] - labels[:, 0])**2).mean()
            numbers = ((preds[:, 1:ln] - labels[:, 1:ln])**2).mean()
            if cfg.NUM_GPUS > 1:
                regr, numbers = du.all_reduce([regr, numbers])
            regr_list.append(regr.item())
            num_list.append(numbers.item())
        elif cfg.DATA.LABELS_TYPE == 'length':
            regr = ((preds[:, 0] - labels[:, 0])**2).mean()
            numbers = ((preds[:, 1:] - labels[:, 1:])**2).mean()
            if cfg.NUM_GPUS > 1:
                regr, numbers = du.all_reduce([regr, numbers])
            regr_list.append(regr.item())
            num_list.append(numbers.item())
            num_topks_correct = metrics.topks_correct(preds, labels, (1, ))
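            # Top-1 error is kept as a zero placeholder here so the all_reduce
            # and meter update below stay uniform across label types.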
            top1_err = num_topks_correct[0] * 0.0
        elif cfg.DATA.LABELS_TYPE == 'stend':
            top1_err = loss.clone()
            # sigm = torch.nn.Sigmoid()
            # start = sigm(preds[:, 0]).cpu().detach().numpy()
            # end = sigm(preds[:, 1]).cpu().detach().numpy()

        else:
            num_topks_correct = metrics.topks_correct(preds, labels, (1, ))
            preds_ix = preds.size(2) * preds.size(
                0) if cfg.DATA.LABELS_TYPE == 'mask' else preds.size(1)
            top1_err = (1.0 - num_topks_correct[0] / preds_ix) * 100.0

        # Gather all the predictions across all the devices.
        if cfg.NUM_GPUS > 1:
            loss, top1_err = du.all_reduce([loss, top1_err])

        # Copy the stats from GPU to CPU (sync point).
        loss, top1_err = (loss.item(), top1_err.item())
        top_list.append(top1_err)
        train_meter.iter_toc()
        # Update and log stats.
        train_meter.update_stats(top1_err, loss, lr,
                                 inputs[0].size(0) * cfg.NUM_GPUS)

        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()
    if cfg.DATA.LABELS_TYPE == 'regression' or cfg.DATA.LABELS_TYPE == 'length':
        print('---------------------')
        print(
            f'LOSS VALUES!!: SIZE_LOSS:{np.mean(regr_list)} NUM_LOSS:{np.mean(num_list)} CLASS_LOSS:{np.mean(top_list)}'
        )
        print('---------------------')
    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
Example #6
def train_epoch(train_loader,
                model,
                optimizer,
                train_meter,
                cur_epoch,
                cfg,
                writer=None):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard logs.
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()

        # _________________________ save model test __________________________________________
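        # Debug hook: dump a checkpoint every 100 iterations; note that cur_iter
        # is passed where save_checkpoint normally receives the epoch index
        # (hence the "# cur_epoch" note below).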
        if cur_iter % 100 == 1:
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_iter,
                               cfg)  # cur_epoch
            print("----------------------- save done ")
            # exit(0)
            # _________________________________________________________________________________________

        if cfg.DETECTION.ENABLE:
            # inputs: [4, 3, 8, 224, 224], preds: [32, 2048, 7, 7]
            # reshape [1, 3, 8, 224, 224] -> [8, 3, 224, 224]
            ##################################################################################
            inputs0 = inputs[0].squeeze(0).permute(1, 0, 2, 3)
            inputs1 = inputs[1].squeeze(0).permute(1, 0, 2, 3)
            meta["boxes"] = meta["boxes"].unsqueeze(0).unsqueeze(0)
            inputs = [inputs0, inputs1]
            preds = model(inputs, meta["boxes"])
            # #################################################################################################################################
            # import os
            # weights = 'checkpoints/checkpoint_epoch_00007.pyth'
            # os.environ['CUDA_VISIBLE_DEVICES'] = '0'
            # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            # chkpt = torch.load(weights, map_location=device)

            # try:
            #     model_dict = model.module.state_dict()
            # except AttributeError:
            #     model_dict = model.state_dict()  # read the original state and parameters; multi-GPU training prefixes the keys of the saved model
            #     # drop the keys in the pretrained dict that are not in model_dict
            #     chkpt = {k: v for k, v in chkpt.items() if k in model_dict}
            # print("load pretrain model")
            # model_dict.update(chkpt)
            # model.load_state_dict(model_dict)

            # model.to(device)
            # # inputs = [inputs.to(device)]
            # model.eval()
            # input_tensor = (inputs, meta["boxes"].to(device))
            # traced_script_module = torch.jit.trace(model, input_tensor)
            # traced_script_module.save("weights/sf_pytorch.pt")
            # print("************************* out put save **********************************")
            # exit(0)


##############################################################################################
        else:
            preds = model(inputs)
        # Explicitly declare reduction to mean.
        loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

        # Compute the loss.
        loss = loss_fun(preds, labels)

        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            # Update and log stats.
            train_meter.update_stats(
                top1_err,
                top5_err,
                loss,
                lr,
                inputs[0].size(0) * max(
                    cfg.NUM_GPUS, 1
                ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
Example #7
def train_epoch(
    train_loader,
    model,
    optimizer,
    scaler,
    train_meter,
    cur_epoch,
    cfg,
    writer=None,
):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        scaler (GradScaler): gradient scaler used for mixed-precision training.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard logs.
    """
    print(model)
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    if cfg.MIXUP.ENABLE:
        mixup_fn = MixUp(
            mixup_alpha=cfg.MIXUP.ALPHA,
            cutmix_alpha=cfg.MIXUP.CUTMIX_ALPHA,
            mix_prob=cfg.MIXUP.PROB,
            switch_prob=cfg.MIXUP.SWITCH_PROB,
            label_smoothing=cfg.MIXUP.LABEL_SMOOTH_VALUE,
            num_classes=cfg.MODEL.NUM_CLASSES,
        )

    # print(model.patch_embed.proj.weight.device)
    # if cfg.NUM_GPUS >= 2 and not cfg.MODEL.DDP:
    #     blk_size = int(16/cfg.NUM_GPUS)
    #     start = blk_size
    #     for g in range(cfg.NUM_GPUS-1):
    #         dev = f"cuda:{g+1}"
    #         for i in range(start, start + blk_size):
    #             model.blocks[i] = model.blocks[i].to(dev)
    #         start += blk_size
    #     model.norm = model.norm.to(dev)
    #     model.head = model.head.to(dev)

    profiler.log_tic("loop_time")
    # extra_model = Mlp(400, 1000000, 400)
    # print(extra_model)
    # extra_model = extra_model.to("cuda:4")

    if cfg.MODEL.MODEL_NAME == "MViTHybridP1":
        cfg.MODEL.MODEL_NAME = "MViTHybridP2"
        original_ddp = cfg.MODEL.DDP
        cfg.MODEL.DDP = False
        model_p2 = build_model(cfg)
        model_p2 = model_p2.to("cuda:2")  # cuda()
        # because the rest of the logic is about the P1 model
        cfg.MODEL.MODEL_NAME = "MViTHybridP1"
        cfg.MODEL.DDP = original_ddp

    for cur_iter, (inputs, labels, index, time,
                   meta) in enumerate(train_loader):
        print(f"Iteration: {cur_iter}, {inputs.shape}")
        # print(inputs.shape)
        # batchsize = 18
        # inputs = [
        #     torch.rand((batchsize, 3, 16, 224, 224)),
        # ]
        # labels = torch.zeros(batchsize)
        # meta =
        # Transfer the data to the current GPU device.
        if cfg.MODEL.MODEL_NAME in ["MViT", "MViTHybridP1"] and cfg.NUM_GPUS:

            print("in MViT model if statement")

            # if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            # for key, val in meta.items():
            #     if isinstance(val, (list,)):
            #         for i in range(len(val)):
            #             val[i] = val[i].cuda(non_blocking=True)
            #     else:
            #         meta[key] = val.cuda(non_blocking=True)
        # else:
        # inputs[0] = inputs[0].to("cuda:0")
        # inputs = inputs.to("cuda:0")
        # labels = labels.to("cuda:0")
        # print(inputs.shape)
        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()
        if cfg.MIXUP.ENABLE:
            samples, labels = mixup_fn(inputs[0], labels)
            inputs[0] = samples

        with torch.cuda.amp.autocast(enabled=cfg.TRAIN.MIXED_PRECISION):
            # if cfg.DETECTION.ENABLE:
            #     preds = model(inputs, meta["boxes"])
            # else:

            profiler.log_tic("model_time")

            if cfg.MODEL.MODEL_NAME == "MViTHybridP1":
                preds, thw = model(inputs)
                preds = preds.to("cuda:2")
                # import ipdb; ipdb.set_trace()
                preds = model_p2(preds, thw)

            else:
                preds = model(inputs)

            # preds = preds.to("cuda:4")
            # pred = extra_model(preds)

            profiler.log_toc("model_time", shape=inputs.shape)
            # Explicitly declare reduction to mean.
            # loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

            # Compute the loss.
            # loss = loss_fun(preds, labels)
            loss = preds.norm()
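            # NOTE: preds.norm() is a stand-in objective (the real loss above is
            # commented out), apparently used only to time forward/backward here.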
            # loss = loss_fun(preds, labels)

        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        profiler.log_tic("backward_time")
        scaler.scale(loss).backward()

        # Unscales the gradients of optimizer's assigned params in-place
        scaler.unscale_(optimizer)
        # Clip gradients if necessary
        if cfg.SOLVER.CLIP_GRAD_VAL:
            torch.nn.utils.clip_grad_value_(model.parameters(),
                                            cfg.SOLVER.CLIP_GRAD_VAL)
        elif cfg.SOLVER.CLIP_GRAD_L2NORM:
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           cfg.SOLVER.CLIP_GRAD_L2NORM)
        # Update the parameters.
        scaler.step(optimizer)
        scaler.update()
        profiler.log_toc("backward_time", shape=inputs.shape)

        if cfg.MIXUP.ENABLE:
            _top_max_k_vals, top_max_k_inds = torch.topk(labels,
                                                         2,
                                                         dim=1,
                                                         largest=True,
                                                         sorted=True)
            idx_top1 = torch.arange(labels.shape[0]), top_max_k_inds[:, 0]
            idx_top2 = torch.arange(labels.shape[0]), top_max_k_inds[:, 1]
            preds = preds.detach()
            preds[idx_top1] += preds[idx_top2]
            preds[idx_top2] = 0.0
            labels = top_max_k_inds[:, 0]

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        # else:
        #     top1_err, top5_err = None, None
        #     if cfg.DATA.MULTI_LABEL:
        #         # Gather all the predictions across all the devices.
        #         if cfg.NUM_GPUS > 1:
        #             [loss] = du.all_reduce([loss])
        #         loss = loss.item()
        #     else:
        #         Compute the errors.
        #         num_topks_correct = metrics.topks_correct(preds, labels, (1, 5))
        #         top1_err, top5_err = [
        #             (1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct
        #         ]
        #         Gather all the predictions across all the devices.
        #         if cfg.NUM_GPUS > 1:
        #             loss, top1_err, top5_err = du.all_reduce([loss, top1_err, top5_err])

        #         # Copy the stats from GPU to CPU (sync point).
        #         loss, top1_err, top5_err = (
        #             loss.item(),
        #             top1_err.item(),
        #             top5_err.item(),
        #         )

        #     # Update and log stats.
        #     train_meter.update_stats(
        #         top1_err,
        #         top5_err,
        #         loss,
        #         lr,
        #         inputs[0].size(0)
        #         * max(
        #             cfg.NUM_GPUS, 1
        #         ),  # If running  on CPU (cfg.NUM_GPUS == 1), use 1 to represent 1 CPU.
        #     )
        #     write to tensorboard format if available.
        #     if writer is not None:
        #         writer.add_scalars(
        #             {
        #                 "Train/loss": loss,
        #                 "Train/lr": lr,
        #                 "Train/Top1_err": top1_err,
        #                 "Train/Top5_err": top5_err,
        #             },
        #             global_step=data_size * cur_epoch + cur_iter,
        #         )

        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()
        profiler.log_toc("loop_time", shape=inputs.shape)
        profiler.log_tic("loop_time")

        profiler.report(25)
    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
Example #8
def train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, cfg):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list, )):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)
        labels = labels.cuda()
        for key, val in meta.items():
            if isinstance(val, (list, )):
                for i in range(len(val)):
                    val[i] = val[i].cuda(non_blocking=True)
            else:
                meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"])

        else:
            # Perform the forward pass.
            preds = model(inputs)

        # Explicitly declare reduction to mean.
        loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

        # Compute the loss.
        loss = loss_fun(preds, labels)

        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
        else:
            # Compute the errors.
            num_topks_correct = metrics.topks_correct(preds, labels, (1, 5))
            top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                  for x in num_topks_correct]

            # Gather all the predictions across all the devices.
            if cfg.NUM_GPUS > 1:
                loss, top1_err, top5_err = du.all_reduce(
                    [loss, top1_err, top5_err])

            # Copy the stats from GPU to CPU (sync point).
            loss, top1_err, top5_err = (
                loss.item(),
                top1_err.item(),
                top5_err.item(),
            )

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(top1_err, top5_err, loss, lr,
                                     inputs[0].size(0) * cfg.NUM_GPUS)

        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
Example #9
def train_epoch(train_loader,
                model,
                optimizer,
                train_meter,
                cur_epoch,
                cfg,
                writer=None,
                wandb_log=False):
    """
    Perform the audio training for one epoch.
    Args:
        train_loader (loader): audio training loader.
        model (model): the audio model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard logs.
        wandb_log (bool): if True, log training stats to Weights & Biases
            instead of the Tensorboard writer.
    """
    # Enable train mode.
    model.train()
    if cfg.BN.FREEZE:
        model.module.freeze_fn(
            'bn_statistics') if cfg.NUM_GPUS > 1 else model.freeze_fn(
                'bn_statistics')

    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            if isinstance(labels, (dict, )):
                labels = {k: v.cuda() for k, v in labels.items()}
            else:
                labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()

        # model.forward() returns (last-layer output, linear-layer output);
        # call it once and unpack both so the forward pass is not run twice.
        outputs = model(inputs)
        preds = outputs[0]  # the original output of the last layer
        linear_layer_output = outputs[1]

        if isinstance(labels, (dict, )):
            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(
                cfg.MODEL.LOSS_FUNC)(reduction="mean")

            # Compute the loss.
            loss_verb = loss_fun(preds[0], labels['verb'])
            loss_noun = loss_fun(preds[1], labels['noun'])
            loss = 0.5 * (loss_verb + loss_noun)

            # check Nan Loss.
            misc.check_nan_losses(loss)
        else:
            #I believe this is the VGG loss part, as the labels are not split into nouns and verbs

            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(
                cfg.MODEL.LOSS_FUNC)(reduction="mean")

            # Embedding loss function.
            emb_loss_fun = losses.get_loss_func(
                cfg.MODEL.EMB_LOSS_FUNC)(reduction="mean")

            # Compute the loss for the main model.
            loss = loss_fun(preds, labels)

            # Compute the loss for the embeddings.
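            # NOTE: word_embedding is not defined in this snippet; it is assumed
            # to be the target word-embedding tensor supplied by the caller.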
            emb_loss = emb_loss_fun(linear_layer_output, word_embedding)

            # Add the losses together- use embeddings to fine tune the model's objective
            loss = loss + emb_loss

            # check Nan Loss.
            misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if isinstance(labels, (dict, )):
            # Compute the verb accuracies.
            verb_top1_acc, verb_top5_acc = metrics.topk_accuracies(
                preds[0], labels['verb'], (1, 5))

            # Gather all the predictions across all the devices.
            if cfg.NUM_GPUS > 1:
                loss_verb, verb_top1_acc, verb_top5_acc = du.all_reduce(
                    [loss_verb, verb_top1_acc, verb_top5_acc])

            # Copy the stats from GPU to CPU (sync point).
            loss_verb, verb_top1_acc, verb_top5_acc = (
                loss_verb.item(),
                verb_top1_acc.item(),
                verb_top5_acc.item(),
            )

            # Compute the noun accuracies.
            noun_top1_acc, noun_top5_acc = metrics.topk_accuracies(
                preds[1], labels['noun'], (1, 5))

            # Gather all the predictions across all the devices.
            if cfg.NUM_GPUS > 1:
                loss_noun, noun_top1_acc, noun_top5_acc = du.all_reduce(
                    [loss_noun, noun_top1_acc, noun_top5_acc])

            # Copy the stats from GPU to CPU (sync point).
            loss_noun, noun_top1_acc, noun_top5_acc = (
                loss_noun.item(),
                noun_top1_acc.item(),
                noun_top5_acc.item(),
            )

            # Compute the action accuracies.
            action_top1_acc, action_top5_acc = metrics.multitask_topk_accuracies(
                (preds[0], preds[1]), (labels['verb'], labels['noun']), (1, 5))
            # Gather all the predictions across all the devices.
            if cfg.NUM_GPUS > 1:
                loss, action_top1_acc, action_top5_acc = du.all_reduce(
                    [loss, action_top1_acc, action_top5_acc])

            # Copy the stats from GPU to CPU (sync point).
            loss, action_top1_acc, action_top5_acc = (
                loss.item(),
                action_top1_acc.item(),
                action_top5_acc.item(),
            )

            # Update and log stats.
            train_meter.update_stats(
                (verb_top1_acc, noun_top1_acc, action_top1_acc),
                (verb_top5_acc, noun_top5_acc, action_top5_acc),
                (loss_verb, loss_noun, loss),
                lr,
                inputs[0].size(0) * max(
                    cfg.NUM_GPUS, 1
                ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None and not wandb_log:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_acc": action_top1_acc,
                        "Train/Top5_acc": action_top5_acc,
                        "Train/verb/loss": loss_verb,
                        "Train/noun/loss": loss_noun,
                        "Train/verb/Top1_acc": verb_top1_acc,
                        "Train/verb/Top5_acc": verb_top5_acc,
                        "Train/noun/Top1_acc": noun_top1_acc,
                        "Train/noun/Top5_acc": noun_top5_acc,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

            if wandb_log:
                wandb.log(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_acc": action_top1_acc,
                        "Train/Top5_acc": action_top5_acc,
                        "Train/verb/loss": loss_verb,
                        "Train/noun/loss": loss_noun,
                        "Train/verb/Top1_acc": verb_top1_acc,
                        "Train/verb/Top5_acc": verb_top5_acc,
                        "Train/noun/Top1_acc": noun_top1_acc,
                        "Train/noun/Top5_acc": noun_top5_acc,
                        "train_step": data_size * cur_epoch + cur_iter,
                    }, )
        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            # Update and log stats.
            train_meter.update_stats(
                top1_err,
                top5_err,
                loss,
                lr,
                inputs[0].size(0) * max(
                    cfg.NUM_GPUS, 1
                ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None and not wandb_log:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

            if wandb_log:
                wandb.log(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                        "train_step": data_size * cur_epoch + cur_iter,
                    }, )

        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
Example #10
def train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, writer,
                nep, cfg):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    global_iters = data_size * cur_epoch
    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):

        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list, )):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)

        else:
            inputs = inputs.cuda(non_blocking=True)

        # If the clips carry an extra leading dimension (e.g. multiple clips
        # per sample), repeat the labels accordingly and fold that dimension
        # into the batch dimension.
        if isinstance(inputs, (list, )) and len(inputs[0].shape) > 5:
            labels = torch.repeat_interleave(labels, inputs[0].size(1), 0)

        for i in range(len(inputs)):
            if len(inputs[i].shape) > 5:
                inputs[i] = inputs[i].view((-1, ) + inputs[i].shape[2:])

        labels = labels.cuda()
        for key, val in meta.items():
            if isinstance(val, (list, )):
                for i in range(len(val)):
                    val[i] = val[i].cuda(non_blocking=True)
            else:
                meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size,
                                global_iters, cfg)
        optim.set_lr(optimizer, lr)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"])

        else:
            # Perform the forward pass.
            if 'masks' in meta:
                preds = model((inputs, meta['masks']))
            else:
                preds = model(inputs)

        ####################################################################################################################################
        # check activations
        ####################################################################################################################################
        # if writer is not None and global_iters%cfg.SUMMARY_PERIOD==0:

        #     bu_errors = preds['bu_errors']#.cpu()#.data.numpy().squeeze()

        #     for layer in range(len(bu_errors)):
        #         images = bu_errors[layer].transpose(1,2).transpose(0,1)
        #         images = (images-images.min())
        #         images = images/images.max()
        #         images = images.reshape((-1,) + images.shape[2:])

        #         # grid = tv.utils.make_grid(images, nrow=18, normalize=True)
        #         # writer.add_image('activations/bu_error_l%d'%layer, grid, global_iters)

        #         tv.utils.save_image(images, os.path.join(cfg.OUTPUT_DIR, 'preds_%d_bu_errors_l%d.jpg'%(global_iters,layer)), nrow=18, normalize=True)

        #     mix_out = preds['mix_layer']#.cpu().data.numpy().squeeze()
        #     for layer in range(len(mix_out)):

        #         images = mix_out[layer].transpose(1,2).transpose(0,1)
        #         images = images.reshape((-1,) + images.shape[2:])
        #         images = (images-images.min())
        #         images = images/images.max()
        #         # grid = tv.utils.make_grid(images, nrow=18, normalize=True)
        #         # writer.add_image('activations/mix_layer_l%d'%layer, grid, global_iters)
        #         # tv.utils.save_image(images, os.path.join(cfg.OUTPUT_DIR, 'example_%d_mix_layer_l%d.jpg'%(i,layer)), nrow=18, normalize=True)

        #         tv.utils.save_image(images, os.path.join(cfg.OUTPUT_DIR, 'preds_%d_mix_layer_l%d.jpg'%(global_iters,layer)), nrow=18, normalize=True)

        #     inhibition = preds['H_inh']#.cpu()#.data.numpy().squeeze()
        #     for layer in range(len(inhibition)):
        #         images = inhibition[layer].transpose(1,2).transpose(0,1)
        #         images = (images-images.min())
        #         images = images/images.max()
        #         images = images.reshape((-1,) + images.shape[2:])
        #         # grid = tv.utils.make_grid(images, nrow=18, normalize=True)
        #         # writer.add_image('activations/H_inh_l%d'%layer, grid, global_iters)
        #         tv.utils.save_image(images, os.path.join(cfg.OUTPUT_DIR, 'preds_%d_H_inh_l%d.jpg'%(global_iters,layer)), nrow=18, normalize=True)

        #     hidden = preds['hidden']#.cpu()#.data.numpy().squeeze()
        #     for layer in range(len(hidden)):
        #         images = hidden[layer].transpose(1,2).transpose(0,1)
        #         images = (images-images.min())
        #         images = images/images.max()
        #         images = images.reshape((-1,) + images.shape[2:])
        #         # grid = tv.utils.make_grid(images, nrow=18, normalize=True)
        #         # writer.add_image('activations/hidden_l%d'%layer, grid, global_iters)
        #         tv.utils.save_image(images, os.path.join(cfg.OUTPUT_DIR, 'preds_%d_hidden_l%d.jpg'%(global_iters,layer)), nrow=18, normalize=True)

        out_keys = preds.keys()
        total_loss = 0

        if cfg.PREDICTIVE.ENABLE:

            errors = preds['pred_errors']
            if 'frame_errors' in preds:
                frame_errors = preds['frame_errors']

            if 'IoU' in preds:
                iou = preds['IoU']
            if 'Acc' in preds:
                acc = preds['Acc']

            pred_loss = errors.mean()
            total_loss += pred_loss

            # if 'frame_errors' in out_keys:
            #     total_loss += frame_errors
            # copy_baseline = F.smooth_l1_loss(inputs[i][:,:,1:] - inputs[i][:,:,:-1], torch.zeros_like(inputs[i][:,:,1:]))
            # copy_baseline = F.l1_loss(inputs[i][:,:,1:] - inputs[i][:,:,:-1], torch.zeros_like(inputs[i][:,:,1:]))

        if cfg.PREDICTIVE.CPC:
            cpc_loss = preds['cpc_loss']
            total_loss += cpc_loss

        if 'cbp_penalty' in preds:
            penalty = preds['cbp_penalty']
            total_loss += penalty

        if cfg.SUPERVISED:
            preds = preds['logits']

            if cfg.MODEL.LOSS_FUNC != '':
                # Explicitly declare reduction to mean.
                loss_fun = losses.get_loss_func(
                    cfg.MODEL.LOSS_FUNC)(reduction="mean")

                # Compute the loss.
                loss = loss_fun(preds, labels)

                total_loss += loss

        # check Nan Loss.
        misc.check_nan_losses(total_loss)

        # Perform the backward pass.
        optimizer.zero_grad()

        total_loss.backward()

        ####################################################################################################################################
        # check gradients
        if writer is not None and global_iters % cfg.SUMMARY_PERIOD == 0:
            n_p = model.module.named_parameters() if hasattr(
                model, 'module') else model.named_parameters()
            fig = viz_helpers.plot_grad_flow_v2(n_p)
            writer.add_figure('grad_flow/grad_flow', fig, global_iters)
        ####################################################################################################################################

        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(lr,
                                     inputs[0].size(0) * cfg.NUM_GPUS,
                                     loss=loss)
        else:
            if cfg.SUPERVISED:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]

            # Gather all the predictions across all the devices.
            if cfg.NUM_GPUS > 1:

                if cfg.PREDICTIVE.ENABLE:
                    pred_loss = du.all_reduce([pred_loss])
                    pred_loss = pred_loss[0]
                    if 'frame_errors' in out_keys:
                        frame_errors = du.all_reduce([frame_errors])[0]
                    if 'IoU' in preds:
                        iou = du.all_reduce([iou])[0]
                    if 'Acc' in preds:
                        acc = du.all_reduce([acc])[0]
                    # copy_baseline = du.all_reduce([copy_baseline])
                    # copy_baseline = copy_baseline[0]

                if cfg.PREDICTIVE.CPC:
                    cpc_loss = du.all_reduce([cpc_loss])
                    cpc_loss = cpc_loss[0]
                if cfg.SUPERVISED:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

                if 'cbp_penalty' in out_keys:
                    penalty = du.all_reduce([penalty])[0]

            loss_logs = {}
            if cfg.PREDICTIVE.ENABLE:
                pred_loss = pred_loss.item()
                loss_logs['loss_pred'] = pred_loss
                if 'frame_errors' in out_keys:
                    frame_errors = frame_errors.item()
                    loss_logs['frame_errors'] = frame_errors

                if 'IoU' in preds:
                    loss_logs['IoU'] = iou.item()
                if 'Acc' in preds:
                    loss_logs['Acc'] = acc.item()
                # copy_baseline = copy_baseline.item()
                # loss_logs['copy_comp'] = copy_baseline
            if cfg.PREDICTIVE.CPC:
                cpc_loss = cpc_loss.item()
                loss_logs['loss_cpc'] = cpc_loss

            if cfg.SUPERVISED:
                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

                loss_logs['loss_class'] = loss
                loss_logs['top5_err'] = top5_err
                loss_logs['top1_err'] = top1_err

            if 'cbp_penalty' in out_keys:
                loss_logs['cbp_penalty'] = penalty.item()

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(lr, inputs[0].size(0) * cfg.NUM_GPUS,
                                     **loss_logs)

            if writer is not None and global_iters % cfg.LOG_PERIOD == 0:
                for k, v in loss_logs.items():
                    # Note: str.strip() removes characters, not a prefix, and
                    # would mangle some names; drop the 'loss_' prefix instead.
                    writer.add_scalar('loss/' + k.replace('loss_', '', 1),
                                      train_meter.stats[k].get_win_median(),
                                      global_iters)
            if nep is not None and global_iters % cfg.LOG_PERIOD == 0:
                for k, v in loss_logs.items():
                    nep.log_metric(k.replace('loss_', '', 1),
                                   train_meter.stats[k].get_win_median())

                nep.log_metric('global_iters', global_iters)

                # writer.add_scalar('loss/top1_err', train_meter.mb_top1_err.get_win_median(), global_iters)
                # writer.add_scalar('loss/top5_err', train_meter.mb_top5_err.get_win_median(), global_iters)
                # writer.add_scalar('loss/loss', train_meter.loss.get_win_median(), global_iters)
            if global_iters % cfg.SUMMARY_PERIOD == 0 and du.get_rank(
            ) == 0 and du.is_master_proc(num_gpus=cfg.NUM_GPUS):

                with torch.no_grad():
                    # logger.info(inputs[i].shape)
                    # sys.stdout.flush()
                    inputs[0] = inputs[0][:min(3, len(inputs[0]))]
                    if 'masks' in meta:
                        frames = model(
                            (inputs, meta['masks'][:min(3, len(inputs[0]))]),
                            extra=['frames'])['frames']
                    else:
                        frames = model(inputs, extra=['frames'])['frames']

                    n_rows = inputs[0].size(2) - 1

                    inputs = inputs[0].transpose(1, 2)[:, -n_rows:]
                    frames = frames.transpose(1, 2)[:, -n_rows:]

                    inputs = inputs * inputs.new(
                        cfg.DATA.STD)[None, None, :, None, None] + inputs.new(
                            cfg.DATA.MEAN)[None, None, :, None, None]
                    frames = frames * frames.new(
                        cfg.DATA.STD)[None, None, :, None, None] + frames.new(
                            cfg.DATA.MEAN)[None, None, :, None, None]
                    images = torch.cat([inputs, frames],
                                       1).reshape((-1, ) + inputs.shape[2:])

                # grid = tv.utils.make_grid(images, nrow=8, normalize=True)
                # writer.add_image('predictions', images, global_iters)

                tv.utils.save_image(
                    images,
                    os.path.join(cfg.OUTPUT_DIR,
                                 'preds_%d.jpg' % global_iters),
                    nrow=n_rows,
                    normalize=True)

                # del images
                # del frames
                # del inputs

        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

        global_iters += 1

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
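
metrics.topks_correct, used above to derive the top-1/top-5 errors, is defined elsewhere in the codebase; the sketch below is an assumed implementation of the usual top-k counting logic that returns one correct-count per requested k, which is what the error formulas in these examples expect.

import torch


def topks_correct(preds, labels, ks):
    """Sketch: count predictions whose top-k scores contain the true label,
    returning one count per k."""
    max_k = max(ks)
    # Indices of the max_k highest-scoring classes per sample: (batch, max_k).
    _, top_idx = preds.topk(max_k, dim=1, largest=True, sorted=True)
    # Compare against the labels broadcast along the k dimension.
    correct = top_idx.eq(labels.view(-1, 1).expand_as(top_idx))
    return [correct[:, :k].reshape(-1).float().sum() for k in ks]


# Usage as in the examples:
# num_topks_correct = topks_correct(preds, labels, (1, 5))
# top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct]
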
Example #11
def train_epoch(train_loader,
                model,
                optimizer,
                train_meter,
                cur_epoch,
                cfg,
                writer=None):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard logs.
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    #Unsupervised learning => No labels
    for cur_iter, inputs in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()

        # if cfg.DETECTION.ENABLE:
        #     preds = model(inputs, meta["boxes"])
        # else:
        #     preds = model(inputs)

        # Loss already computed in model
        # logger.info("Size: {}".format(inputs.Size()))
        output = model(inputs)
        loss = torch.mean(output['loss'])

        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )
        else:
            if cfg.NUM_GPUS > 1:
                [loss] = du.all_reduce([loss])
            loss = loss.item()
            # Update and log stats.
            train_meter.update_stats(
                1,
                1,
                loss,
                lr,
                inputs[0].size(0) * max(
                    cfg.NUM_GPUS, 1
                ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
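
optim.get_epoch_lr and optim.set_lr belong to the surrounding codebase (some of the examples also pass a global iteration counter); the sketch below assumes the two-argument form and a simple cosine schedule, and shows how such helpers are usually wired to the optimizer's parameter groups. The cfg field names are assumptions.

import math


def get_epoch_lr(cur_epoch, cfg):
    """Sketch: cosine-decayed learning rate at a fractional epoch."""
    return (
        cfg.SOLVER.BASE_LR
        * 0.5
        * (1.0 + math.cos(math.pi * cur_epoch / cfg.SOLVER.MAX_EPOCH))
    )


def set_lr(optimizer, new_lr):
    """Sketch: apply the new learning rate to every parameter group."""
    for param_group in optimizer.param_groups:
        param_group["lr"] = new_lr
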
Example #12
def train_epoch(
    train_loader, student_model, teacher_model, optimizer, train_meter, cur_epoch, cfg, writer=None
):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        student_model (model): the student video model to train.
        teacher_model (model): the frozen teacher model used for distillation.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard logs.
    """
    # Enable train mode.
    teacher_model.eval()
    student_model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, (inputs, labels, _, meta, _) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list,)):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list,)):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            student_preds, student_features = student_model(inputs.copy(), meta["boxes"])
            with torch.no_grad():
                teacher_preds, teacher_features = teacher_model(inputs.copy(), meta["boxes"])
        else:
            # Perform the forward pass.
            student_preds, student_features = student_model(inputs.copy())
            with torch.no_grad():
                teacher_preds, teacher_features = teacher_model(inputs.copy())
        # Explicitly declare reduction to mean.
        # L2 loss for featuremap difference
        loss_mse_func = losses.get_loss_func('mse')(reduction="mean")
        # Cross entropy loss for prediction
        loss_pred_func = losses.get_loss_func('cross_entropy')(reduction="mean")
        # kl-divergence loss
        loss_kl_func = losses.get_loss_func('kl_divergence')(reduction="batchmean")
        
        T = cfg.KD.TEMPERATURE
        alpha = cfg.KD.ALPHA
        
        loss_pred = loss_pred_func(student_preds, labels) * (1. - alpha)
        loss_mse = []
        loss_kl = []
        for s_features, t_features in zip(student_features, teacher_features):
            for i in range(2):
                #mse loss
                loss_mse.append(loss_mse_func(s_features[i], t_features[i]) * (alpha * T * T))

                #kl divergence loss 
                b, c, t, h, w = s_features[i].shape
                s_feature = s_features[i].permute(0, 2, 3, 4, 1).contiguous().view(b*t*h*w, c)
                t_feature = t_features[i].permute(0, 2, 3, 4, 1).contiguous().view(b*t*h*w, c)
                loss_kl.append(loss_kl_func(F.log_softmax(s_feature/T, dim = 0), F.softmax(t_feature/T, dim = 0)) * (alpha * T * T))
            
        #TOTAL LOSS = sum of all losses
        loss = loss_pred + sum(loss_mse) + sum(loss_kl)

        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {"Train/loss": loss, "Train/lr": lr, "Train/mse": sum(loss_mse), "Train/loss_kl": sum(loss_kl), "Train/loss_pred": loss_pred},
                    global_step=data_size * cur_epoch + cur_iter,
                )

        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(student_preds, labels, (1, 5))
                top1_err, top5_err = [
                    (1.0 - x / student_preds.size(0)) * 100.0 for x in num_topks_correct
                ]

                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err]
                    )

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(
                top1_err,
                top5_err,
                loss,
                lr,
                inputs[0].size(0)
                * max(
                    cfg.NUM_GPUS, 1
                ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                        "Train/mse": sum(loss_mse),
                        "Train/loss_kl": sum(loss_kl),
                        "Train/loss_pred": loss_pred,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
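
Example #12 distills intermediate feature maps from a teacher into a student; for comparison, the sketch below shows the more common logit-level distillation loss (temperature-scaled KL divergence blended with cross-entropy, in the style of Hinton et al.). The weighting convention mirrors the alpha/T usage above but is otherwise an assumption.

import torch.nn.functional as F


def kd_loss(student_logits, teacher_logits, labels, T=4.0, alpha=0.9):
    """Sketch: soft-target KL divergence at temperature T plus hard-label
    cross-entropy."""
    soft = F.kl_div(
        F.log_softmax(student_logits / T, dim=1),
        F.softmax(teacher_logits / T, dim=1),
        reduction="batchmean",
    ) * (alpha * T * T)
    hard = F.cross_entropy(student_logits, labels) * (1.0 - alpha)
    return soft + hard
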
Example #13
def train_epoch(
    train_loader,
    model,
    optimizer,
    scaler,
    train_meter,
    cur_epoch,
    cfg,
    writer=None,
):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        scaler (GradScaler): gradient scaler used for mixed-precision training.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard logs.
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    if cfg.MIXUP.ENABLE:
        mixup_fn = MixUp(
            mixup_alpha=cfg.MIXUP.ALPHA,
            cutmix_alpha=cfg.MIXUP.CUTMIX_ALPHA,
            mix_prob=cfg.MIXUP.PROB,
            switch_prob=cfg.MIXUP.SWITCH_PROB,
            label_smoothing=cfg.MIXUP.LABEL_SMOOTH_VALUE,
            num_classes=cfg.MODEL.NUM_CLASSES,
        )

    iters_noupdate = 0
    if cfg.MODEL.MODEL_NAME == "ContrastiveModel" and cfg.CONTRASTIVE.TYPE == "moco":
        assert cfg.CONTRASTIVE.QUEUE_LEN % (cfg.TRAIN.BATCH_SIZE *
                                            cfg.NUM_SHARDS) == 0
        iters_noupdate = (cfg.CONTRASTIVE.QUEUE_LEN // cfg.TRAIN.BATCH_SIZE //
                          cfg.NUM_SHARDS)
    if cfg.MODEL.FROZEN_BN:
        misc.frozen_bn_stats(model)
    # Explicitly declare reduction to mean.
    loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

    profiler.log_tic("loop_time")
    for cur_iter, (inputs, labels, index, time,
                   meta) in enumerate(train_loader):
        if not isinstance(inputs, list):
            inputs = [inputs]
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    if isinstance(inputs[i], (list, )):
                        for j in range(len(inputs[i])):
                            # inputs[i][j] = inputs[i][j].cuda(non_blocking=True)
                            inputs[i][j] = inputs[i][j].to("cuda:0")
                    else:
                        # inputs[i] = inputs[i].cuda(non_blocking=True)
                        inputs[i] = inputs[i].to("cuda:0")
            else:
                # inputs = inputs.cuda(non_blocking=True)
                inputs = inputs.to("cuda:0")

            # labels = labels.cuda()
            labels = labels.to("cuda:0")

            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)
            index = index.to("cuda:0")
            time = time.to("cuda:0")

        batch_size = (inputs[0][0].size(0)
                      if isinstance(inputs[0], list) else inputs[0].size(0))
        # Update the learning rate.
        epoch_exact = cur_epoch + float(cur_iter) / data_size
        lr = optim.get_epoch_lr(epoch_exact, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()
        if cfg.MIXUP.ENABLE:
            samples, labels = mixup_fn(inputs[0], labels)
            inputs[0] = samples

        with torch.cuda.amp.autocast(enabled=cfg.TRAIN.MIXED_PRECISION):

            # Explicitly declare reduction to mean.
            perform_backward = True
            optimizer.zero_grad()

            if cfg.MODEL.MODEL_NAME == "ContrastiveModel":
                (
                    model,
                    preds,
                    partial_loss,
                    perform_backward,
                ) = contrastive_forward(model, cfg, inputs, index, time,
                                        epoch_exact, scaler)
            elif cfg.DETECTION.ENABLE:
                # Compute the predictions.
                preds = model(inputs, meta["boxes"])
            else:
                profiler.log_tic("model_time")
                preds = model(inputs)
                profiler.log_toc("model_time", shape=inputs[0].shape)

        if cfg.TASK == "ssl" and cfg.MODEL.MODEL_NAME == "ContrastiveModel":
            labels = torch.zeros(preds.size(0),
                                 dtype=labels.dtype,
                                 device=labels.device)

        if cfg.MODEL.MODEL_NAME == "ContrastiveModel" and partial_loss:
            loss = partial_loss
        else:
            # Compute the loss.
            loss = loss_fun(preds, labels)

        # check Nan Loss.
        misc.check_nan_losses(loss)

        if perform_backward:
            # print("Running backward!")
            scaler.scale(loss).backward()

        # Unscales the gradients of optimizer's assigned params in-place
        scaler.unscale_(optimizer)
        # Clip gradients if necessary
        if cfg.SOLVER.CLIP_GRAD_VAL:
            torch.nn.utils.clip_grad_value_(model.parameters(),
                                            cfg.SOLVER.CLIP_GRAD_VAL)
        elif cfg.SOLVER.CLIP_GRAD_L2NORM:
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           cfg.SOLVER.CLIP_GRAD_L2NORM)

        model = cancel_swav_gradients(model, cfg, epoch_exact)
        if cur_iter < iters_noupdate and cur_epoch == 0:  #  for e.g. MoCo
            logger.info("Not updating parameters {}/{}".format(
                cur_iter, iters_noupdate))
        else:
            # print("Updating optimizer!")
            # Update the parameters.
            scaler.step(optimizer)
        scaler.update()

        if cfg.MIXUP.ENABLE:
            _top_max_k_vals, top_max_k_inds = torch.topk(labels,
                                                         2,
                                                         dim=1,
                                                         largest=True,
                                                         sorted=True)
            idx_top1 = torch.arange(labels.shape[0]), top_max_k_inds[:, 0]
            idx_top2 = torch.arange(labels.shape[0]), top_max_k_inds[:, 1]
            preds = preds.detach()
            preds[idx_top1] += preds[idx_top2]
            preds[idx_top2] = 0.0
            labels = top_max_k_inds[:, 0]

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss.detach(), top1_err, top5_err])

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            # Update and log stats.
            train_meter.update_stats(
                top1_err,
                top5_err,
                loss,
                lr,
                batch_size * max(
                    cfg.NUM_GPUS, 1
                ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        torch.cuda.synchronize()
        train_meter.iter_toc()  # do measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        torch.cuda.synchronize()
        train_meter.iter_tic()

        profiler.log_toc("loop_time", shape=inputs[0].shape)
        profiler.log_tic("loop_time")

        profiler.report(25)

    del inputs
    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
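
Example #13 mixes autocast, a GradScaler, and optional gradient clipping; the stripped-down sketch below isolates that update pattern with standard PyTorch AMP calls (the helper name and arguments are assumptions).

import torch


def amp_step(model, inputs, labels, loss_fun, optimizer, scaler, clip_norm=None):
    """Sketch: one optimization step with autocast + GradScaler, following the
    ordering used in example #13."""
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        preds = model(inputs)
        loss = loss_fun(preds, labels)
    scaler.scale(loss).backward()
    # Unscale before clipping so the threshold applies to the true gradients.
    scaler.unscale_(optimizer)
    if clip_norm is not None:
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_norm)
    scaler.step(optimizer)
    scaler.update()
    return loss.detach()
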
Example #14
def train_epoch(
    train_loader, model, optimizer, train_meter, cur_epoch, cfg, test_imp=False
):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (ClevrerTrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        test_imp (bool): if True, print the predictions of one batch for a quick
            implementation check and stop after a single iteration.
    """
    test_counter = 0
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)
    
    for cur_iter, sampled_batch in enumerate(train_loader): 
        frames = sampled_batch['frames']
        des_q = sampled_batch['question_dict']['question']
        des_ans = sampled_batch['question_dict']['ans']
        # des_len = sampled_batch['question_dict']['len']
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(frames, (list,)):
                for i in range(len(frames)):
                    frames[i] = frames[i].cuda(non_blocking=True)
            else:
                frames = frames.cuda(non_blocking=True)
            des_q = des_q.cuda(non_blocking=True)
            des_ans = des_ans.cuda()
            # des_len = des_len.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()

        #Separated batches
        #Des
        pred_des_ans = model(frames, des_q, True)
        des_loss_fun = losses.get_loss_func('cross_entropy')(reduction="mean")
        loss = des_loss_fun(pred_des_ans, des_ans)
        # check Nan Loss.
        misc.check_nan_losses(loss)
        #Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #Save for stats
        loss_des_val = loss

        top1_err, top5_err = None, None
        # Compute the errors.
        num_topks_correct = metrics.topks_correct(pred_des_ans, des_ans, (1, 5))
        top1_err, top5_err = [
            (1.0 - x / pred_des_ans.size(0)) * 100.0 for x in num_topks_correct
        ]
        mc_opt_err, mc_q_err = None, None
        mb_size_mc = None
        loss_des_val, top1_err, top5_err = (
            loss_des_val.item(),
            top1_err.item(),
            top5_err.item()
        )
        #top1_err, top5_err, mc_opt_err, mc_q_err, loss_des, loss_mc, lr, mb_size
        # Update and log stats.
        train_meter.update_stats(
            top1_err,
            top5_err,
            mc_opt_err,
            mc_q_err,
            loss_des_val,
            None,
            lr,
            des_q.size()[0],
            mb_size_mc
        )
        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()


        #For testing implementation
        if test_imp:
            print(" --- Descriptive questions results --- ")
            # print("Des_q")
            # print(des_q)
            print("Des_ans")
            print(des_ans)
            #print("Des_ans_pred")
            #print(pred_des_ans)
            print("Argmax => prediction")
            print(torch.argmax(pred_des_ans, dim=1, keepdim=False))
            print("Top1_err and Top5err")
            print(top1_err, top5_err)
            print("Loss_des_val = {}".format(loss_des_val))
            test_counter += 1
            if test_counter == 1: 
                break

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
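
losses.get_loss_func is called throughout these examples to look up a criterion by name and instantiate it with a reduction; a minimal sketch of such a registry is shown below. The exact mapping is an assumption based on the names used in the examples ('cross_entropy', 'bce_logit', 'mse', 'kl_divergence').

import torch.nn as nn

_LOSSES = {
    "cross_entropy": nn.CrossEntropyLoss,
    "bce_logit": nn.BCEWithLogitsLoss,
    "mse": nn.MSELoss,
    "kl_divergence": nn.KLDivLoss,
}


def get_loss_func(loss_name):
    """Sketch: return the loss class registered under loss_name so callers can
    instantiate it, e.g. get_loss_func("cross_entropy")(reduction="mean")."""
    if loss_name not in _LOSSES:
        raise NotImplementedError("Loss {} is not supported".format(loss_name))
    return _LOSSES[loss_name]
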
Example #15
    def train_epoch(self,
                    train_loader,
                    model,
                    optimizer,
                    train_meter,
                    cur_epoch,
                    cfg,
                    writer=None):
        """
        Perform the video training for one epoch.
        Args:
            train_loader (loader): video training loader.
            model (model): the video model to train.
            optimizer (optim): the optimizer to perform optimization on the model's
                parameters.
            train_meter (TrainMeter): training meters to log the training performance.
            cur_epoch (int): current epoch of training.
            cfg (CfgNode): configs. Details can be found in
                slowfast/config/defaults.py
            writer (TensorboardWriter, optional): TensorboardWriter object
                to write Tensorboard logs.
        """
        # Enable train mode.
        model.train()
        train_meter.iter_tic()
        data_size = len(train_loader)
        start = time.time()
        btch = cfg.TRAIN.BATCH_SIZE * self.cfg.NUM_SHARDS
        rankE = os.environ.get("RANK", None)
        worldE = os.environ.get("WORLD_SIZE", None)
        dSize = data_size * btch
        self.logger.info(
            "Train Epoch {} dLen {} Batch {} dSize {} localRank {} rank {} {} world {} {}"
            .format(cur_epoch, data_size, btch, dSize, du.get_local_rank(),
                    du.get_rank(), rankE, du.get_world_size(), worldE))
        tot = 0
        first = True
        predsAll = []
        labelsAll = []

        for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
            # Transfer the data to the current GPU device.
            tot += len(labels)
            if isinstance(inputs, (list, )):
                if first:
                    self.logger.info(
                        "rank {} LEN {}  {} shape Slow {} Fast {} {} tot {}".
                        format(du.get_rank(), len(labels), len(inputs),
                               inputs[0].shape, inputs[1].shape,
                               labels[0].shape, tot))
                    first = False
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                if first:
                    self.logger.info(
                        "rank {} LEN {} shape {} {} tot {}".format(
                            du.get_rank(), len(labels), inputs.shape,
                            labels[0].shape, tot))
                    first = False
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()

            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

            # Update the learning rate.
            lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size,
                                    cfg)
            optim.set_lr(optimizer, lr)
            if cfg.DETECTION.ENABLE:
                # Compute the predictions.
                preds = model(inputs, meta["boxes"])

            else:
                # Perform the forward pass.
                preds = model(inputs)
            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(
                cfg.MODEL.LOSS_FUNC)(reduction="mean")

            # Compute the loss.
            loss = loss_fun(preds, labels)

            # check Nan Loss.
            misc.check_nan_losses(loss)

            # Perform the backward pass.
            optimizer.zero_grad()
            loss.backward()
            # Update the parameters.
            optimizer.step()

            if cfg.DETECTION.ENABLE:
                if cfg.NUM_GPUS > 1:
                    loss = du.all_reduce([loss])[0]
                loss = loss.item()

                train_meter.iter_toc()
                # Update and log stats.
                train_meter.update_stats(None, None, None, loss, lr)
                # write to tensorboard format if available.
                if writer is not None:
                    writer.add_scalars(
                        {
                            "Train/loss": loss,
                            "Train/lr": lr
                        },
                        global_step=data_size * cur_epoch + cur_iter,
                    )
                ite = data_size * cur_epoch + cur_iter
                if du.is_master_proc():
                    self.logger.log_row(name='TrainLoss',
                                        iter=ite,
                                        loss=loss,
                                        description="train loss")
                    self.logger.log_row(name='TrainLr',
                                        iter=ite,
                                        lr=lr,
                                        description="train learn rate")

            else:
                top1_err, top5_err = None, None
                if cfg.DATA.MULTI_LABEL:
                    # Gather all the predictions across all the devices.
                    if cfg.NUM_GPUS > 1:
                        [loss] = du.all_reduce([loss])
                    loss = loss.item()
                else:
                    # Binary classifier - save preds / labels for metrics
                    if cfg.MODEL.NUM_CLASSES == 2:
                        predsAll.extend(preds.detach().cpu().numpy()[:, -1])
                        labelsAll.extend(labels.detach().cpu().numpy())
                    # Compute the errors.
                    num_topks_correct = metrics.topks_correct(
                        preds, labels, (1, min(5, cfg.MODEL.NUM_CLASSES)))
                    top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                          for x in num_topks_correct]

                    # Gather all the predictions across all the devices.
                    if cfg.NUM_GPUS > 1:
                        loss, top1_err, top5_err = du.all_reduce(
                            [loss, top1_err, top5_err])

                    # Copy the stats from GPU to CPU (sync point).
                    loss, top1_err, top5_err = (
                        loss.item(),
                        top1_err.item(),
                        top5_err.item(),
                    )

                train_meter.iter_toc()
                # Update and log stats.
                # self.logger.info("UPDATING stat {} {} {}".format(inputs[0].size(0), cfg.NUM_GPUS, inputs[0].size(0) * cfg.NUM_GPUS))
                train_meter.update_stats(top1_err, top5_err, loss, lr,
                                         inputs[0].size(0) * cfg.NUM_GPUS)
                # write to tensorboard format if available.
                if writer is not None:
                    writer.add_scalars(
                        {
                            "Train/loss": loss,
                            "Train/lr": lr,
                            "Train/Top1_err": top1_err,
                            "Train/Top5_err": top5_err,
                        },
                        global_step=data_size * cur_epoch + cur_iter,
                    )

            stats = train_meter.log_iter_stats(cur_epoch, cur_iter, predsAll,
                                               labelsAll)
            ite = dSize * cur_epoch + btch * (cur_iter + 1)
            self.plotStats(stats, ite, 'TrainIter')
            train_meter.iter_tic()

        if du.is_master_proc() and cfg.LOG_MODEL_INFO:
            misc.log_model_info(model, cfg, use_train_input=True)
        # Log epoch stats.
        gathered = du.all_gather([
            torch.tensor(predsAll).to(torch.device("cuda")),
            torch.tensor(labelsAll).to(torch.device("cuda"))
        ])
        stats = train_meter.log_epoch_stats(cur_epoch,
                                            gathered[0].detach().cpu().numpy(),
                                            gathered[1].detach().cpu().numpy())
        ite = (cur_epoch + 1) * dSize
        self.plotStats(stats, ite, 'TrainEpoch')
        train_meter.reset()
        end = time.time()
        el = end - start
        totAll = du.all_reduce([torch.tensor(tot).cuda()], average=False)
        tSum = totAll[0].item()
        elT = torch.tensor(el).cuda()
        elMax = du.all_reduce([elT], op=dist.ReduceOp.MAX,
                              average=False)[0].item()
        jobRate = tSum / elMax
        self.logger.info(
            "totSampCnt {} workerSampCnt {}  eTimeMax {} eTimeWorker {}  SampPerSecJob {:.1f} SampPerSecWorker {:.1f}"
            .format(tSum, tot, elMax, el, jobRate, tot / el))
        return jobRate
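
Example #15 reports throughput as the total sample count across workers divided by the slowest worker's elapsed time; the sketch below reproduces that aggregation directly with torch.distributed (assuming an initialized process group), independent of the du helpers.

import torch
import torch.distributed as dist


def job_throughput(local_samples, local_elapsed):
    """Sketch: sum per-worker sample counts, take the slowest worker's elapsed
    time, and return samples per second for the whole job."""
    tot = torch.tensor(float(local_samples)).cuda()
    elapsed = torch.tensor(float(local_elapsed)).cuda()
    dist.all_reduce(tot, op=dist.ReduceOp.SUM)
    dist.all_reduce(elapsed, op=dist.ReduceOp.MAX)
    return (tot / elapsed).item()
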
Example #16
def train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, writer, nep, cfg):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    global_iters = data_size*cur_epoch
    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):

        # Transfer the data to the current GPU device.
        inputs = inputs.cuda(non_blocking=True)
        # labels = torch.repeat_interleave(labels,inputs[i].size(1),0)

        # for i in range(len(inputs)):
        #     if len(inputs[i].shape) > 5:
        #         inputs[i] = inputs[i].view((-1,)+inputs[i].shape[2:])

        # labels = labels.cuda()
        # for key, val in meta.items():
        #     if isinstance(val, (list,)):
        #         for i in range(len(val)):
        #             val[i] = val[i].cuda(non_blocking=True)
        #     else:
        #         meta[key] = val.cuda(non_blocking=True)


        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, global_iters, cfg)
        optim.set_lr(optimizer, lr)

        preds = model(inputs)

        out_keys = list(preds.keys())

        total_loss = preds['total_loss']

        # check Nan Loss.
        misc.check_nan_losses(total_loss)

        # Perform the backward pass.
        optimizer.zero_grad()

        total_loss.backward()

        ####################################################################################################################################
        # check gradients
        # if writer is not None and global_iters%cfg.SUMMARY_PERIOD==0:
        #     n_p = model.module.named_parameters() if hasattr(model,'module') else model.named_parameters()
        #     fig = viz_helpers.plot_grad_flow_v2(n_p)
        #     writer.add_figure('grad_flow/grad_flow', fig, global_iters)
        ####################################################################################################################################

        # Update the parameters.
        optimizer.step()
        
        losses = [preds[k] for k in out_keys]

        # Gather all the predictions across all the devices.
        if cfg.NUM_GPUS > 1:    
            losses = du.all_reduce(losses)
        
        losses = [l.item() for l in losses]

        loss_logs = {}
        for i in range(len(losses)):
            loss_logs[out_keys[i]] = losses[i]
        
        train_meter.iter_toc()
        # Update and log stats.
        train_meter.update_stats(
            lr, inputs[0].size(0) * cfg.NUM_GPUS, **loss_logs
        )

        if writer is not None and global_iters%cfg.LOG_PERIOD==0:
            logger.info(model.conv0[2].weight)
            logger.info(model.conv0[2].bias)
            for k,v in loss_logs.items():
                # Note: str.strip() removes characters, not a prefix; drop the 'loss_' prefix instead.
                writer.add_scalar('loss/'+k.replace('loss_', '', 1), train_meter.stats[k].get_win_median(), global_iters)
        if nep is not None and global_iters%cfg.LOG_PERIOD==0:
            for k,v in loss_logs.items():
                nep.log_metric(k.replace('loss_', '', 1), train_meter.stats[k].get_win_median())

            nep.log_metric('global_iters', global_iters)


        if global_iters%cfg.SUMMARY_PERIOD==0 and du.get_rank()==0 and du.is_master_proc(num_gpus=cfg.NUM_GPUS):

            with torch.no_grad():
                # logger.info(inputs[i].shape)
                # sys.stdout.flush()
                inputs = inputs[:min(3,len(inputs))]
                if 'masks' in meta:
                    frames = model((inputs, meta['masks'][:min(3,len(inputs))]), extra=['frames'])['frames']
                else:
                    frames = model(inputs, extra=['frames'])['frames']

                n_rows = inputs.size(2)-1

                inputs = inputs.transpose(1,2)[:, -n_rows:]
                frames = frames.transpose(1,2)[:, -n_rows:]
                frames = torch.cat([(frames!=inputs)*frames, (frames==inputs)*inputs, torch.zeros_like(frames)], 2)
                inputs = torch.cat([inputs]*3, 2)
                # inputs = inputs*inputs.new(cfg.DATA.STD)[None,None,:,None,None]+inputs.new(cfg.DATA.MEAN)[None,None,:,None,None]
                # frames = frames*frames.new(cfg.DATA.STD)[None,None,:,None,None]+frames.new(cfg.DATA.MEAN)[None,None,:,None,None]
                images = torch.cat([inputs, frames], 1).reshape((-1,) + inputs.shape[2:])

            # grid = tv.utils.make_grid(images, nrow=8, normalize=True)
            # writer.add_image('predictions', images, global_iters)

            tv.utils.save_image(images, os.path.join(cfg.OUTPUT_DIR, 'preds_%d.jpg'%global_iters), nrow=n_rows, normalize=True)

            # del images
            # del frames
            # del inputs

        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

        global_iters+=1

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
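
The visualization step in example #16 writes a grid of ground-truth and predicted clips with torchvision; the self-contained sketch below shows just that save step (the (B, T, C, H, W) layout and the helper name are assumptions).

import os
import torch
from torchvision import utils as tv_utils


def save_prediction_grid(inputs, frames, output_dir, step, n_rows=8):
    """Sketch: interleave ground-truth and predicted clips along dim 1 and save
    them as one normalized image grid, as in example #16."""
    # inputs/frames assumed to be (B, T, C, H, W); flatten to (B*2T, C, H, W).
    images = torch.cat([inputs, frames], 1).reshape((-1, ) + inputs.shape[2:])
    tv_utils.save_image(
        images,
        os.path.join(output_dir, "preds_%d.jpg" % step),
        nrow=n_rows,
        normalize=True,
    )
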
Example #17
def train_epoch(train_loader,
                model,
                optimizer,
                train_meter,
                cur_epoch,
                cfg,
                writer=None):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (ClevrerTrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard logs.
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, sampled_batch in enumerate(train_loader):
        frames = sampled_batch['frames']
        des_q = sampled_batch['question_dict']['des_q']
        des_ans = sampled_batch['question_dict']['des_ans']
        mc_q = sampled_batch['question_dict']['mc_q']
        mc_ans = sampled_batch['question_dict']['mc_ans']
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            frames = frames.cuda(non_blocking=True)
            des_q = des_q.cuda(non_blocking=True)
            des_ans = des_ans.cuda()
            mc_q = mc_q.cuda(non_blocking=True)
            mc_ans = mc_ans.cuda()

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()

        pred_des_ans = model(frames, des_q, True)
        pred_mc_ans = model(frames, mc_q, False)
        # Explicitly declare reduction to mean.
        des_loss_fun = losses.get_loss_func('cross_entropy')(reduction="mean")
        mc_loss_fun = losses.get_loss_func('bce_logit')(reduction="mean")
        # Compute the loss.
        loss = des_loss_fun(pred_des_ans, des_ans)
        loss += mc_loss_fun(pred_mc_ans, mc_ans)
        # check Nan Loss.
        misc.check_nan_losses(loss)
        #Check if plateau

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        top1_err, top5_err = None, None
        # Compute the errors.
        num_topks_correct = metrics.topks_correct(pred_des_ans, des_ans,
                                                  (1, 5))
        top1_err, top5_err = [(1.0 - x / pred_des_ans.size(0)) * 100.0
                              for x in num_topks_correct]
        diff_mc_ans = torch.abs(
            mc_ans - (torch.sigmoid(pred_mc_ans) >= 0.5).float())  #Errors
        mc_opt_err = 100 * torch.true_divide(diff_mc_ans.sum(),
                                             (4 * des_q.size()[0]))
        mc_q_err = 100 * torch.true_divide(
            (diff_mc_ans.sum(dim=1, keepdim=True) != 0).float().sum(),
            des_q.size()[0])
        # Gather all the predictions across all the devices.
        if cfg.NUM_GPUS > 1:
            loss, top1_err, top5_err, mc_opt_err, mc_q_err = du.all_reduce(
                [loss, top1_err, top5_err, mc_opt_err, mc_q_err])
        # Copy the stats from GPU to CPU (sync point).
        loss, top1_err, top5_err, mc_opt_err, mc_q_err = (loss.item(),
                                                          top1_err.item(),
                                                          top5_err.item(),
                                                          mc_opt_err.item(),
                                                          mc_q_err.item())

        # Update and log stats.
        train_meter.update_stats(
            top1_err,
            top5_err,
            mc_opt_err,
            mc_q_err,
            loss,
            lr,
            frames.size()[0] * max(
                cfg.NUM_GPUS, 1
            ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
        )
        # write to tensorboard format if available.
        if writer is not None:
            writer.add_scalars(
                {
                    "Train/loss": loss,
                    "Train/lr": lr,
                    "Train/Top1_err": top1_err,
                    "Train/Top5_err": top5_err,
                    "Train/mc_opt_err": mc_opt_err,
                    "Train/mc_q_err": mc_q_err,
                },
                global_step=data_size * cur_epoch + cur_iter,
            )

        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
Example #18
def train_epoch(train_loader,
                model,
                optimizer,
                train_meter,
                cur_epoch,
                cfg,
                writer=None):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to write Tensorboard logs.
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, (inputs, labels, _, meta, boxes,
                   b_indices) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()

        if cfg.DETECTION.ENABLE:
            preds = model(inputs, meta["boxes"])
        else:
            preds = model(inputs)
        # Explicitly declare reduction to mean.
        loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

        # Compute the loss.
        loss = loss_fun(preds, labels)

        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            # Update and log stats.
            train_meter.update_stats(
                top1_err,
                top5_err,
                loss,
                lr,
                inputs[0].size(0) * max(
                    cfg.NUM_GPUS, 1
                ),  # If running on CPU (cfg.NUM_GPUS == 0), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
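All of these loops share the same learning-rate pattern: optim.get_epoch_lr is called with a fractional epoch, cur_epoch + cur_iter / data_size, and optim.set_lr writes the result into the optimizer before the forward pass. A rough sketch of what those two helpers amount to, assuming a cosine policy for illustration (in the repo the schedule and its parameters come from the solver section of cfg):

import math
import torch

def get_epoch_lr(fractional_epoch, base_lr=0.1, max_epoch=100):
    # Hypothetical cosine schedule; the repo's optim.get_epoch_lr reads the
    # policy and its parameters from cfg instead of these keyword arguments.
    return 0.5 * base_lr * (1.0 + math.cos(math.pi * fractional_epoch / max_epoch))

def set_lr(optimizer, new_lr):
    # Mirrors the role of optim.set_lr: write the rate into every param group.
    for param_group in optimizer.param_groups:
        param_group["lr"] = new_lr

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
data_size, cur_epoch, cur_iter = 250, 3, 125
lr = get_epoch_lr(cur_epoch + float(cur_iter) / data_size)
set_lr(optimizer, lr)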
Example #19
def train_epoch(train_loader,
                model,
                optimizer,
                train_meter,
                cur_epoch,
                cfg,
                test_imp=False):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (ClevrerTrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        test_imp (bool): if True, print a few batches of predictions for
            inspection and stop after a handful of iterations.
    """
    test_counter = 0
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, sampled_batch in enumerate(train_loader):
        # Each sampled batch yields two sub-batches: one of descriptive (des) questions
        # and one of multiple-choice (mc) questions. Descriptive questions are far more
        # numerous, so some batches contain only descriptive ones.
        des_batch = sampled_batch['des']
        des_q = des_batch['question_dict']['question']
        des_ans = des_batch['question_dict']['ans']
        des_len = des_batch['question_dict']['len']
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            des_q = des_q.cuda(non_blocking=True)
            des_ans = des_ans.cuda()
            des_len = des_len.cuda(non_blocking=True)

        has_mc = sampled_batch['has_mc'][0]
        if has_mc:
            mc_batch = sampled_batch['mc']
            mc_q = mc_batch['question_dict']['question']
            mc_ans = mc_batch['question_dict']['ans']
            mc_len = mc_batch['question_dict']['len']
            if cfg.NUM_GPUS:
                mc_q = mc_q.cuda(non_blocking=True)
                mc_ans = mc_ans.cuda()
                mc_len = mc_len.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()

        #Separated batches
        #Des
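        # The second positional argument to model() presumably switches between the
        # descriptive head (True) and the multiple-choice head (False).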
        pred_des_ans = model(des_q, True)
        des_loss_fun = losses.get_loss_func('cross_entropy')(reduction="mean")
        loss = des_loss_fun(pred_des_ans, des_ans)
        # check Nan Loss.
        misc.check_nan_losses(loss)
        #Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #Save for stats
        loss_des_val = loss

        #MC
        loss_mc_val = None
        if has_mc:
            pred_mc_ans = model(mc_q, False)
            mc_loss_fun = losses.get_loss_func('bce_logit')(reduction="mean")
            loss = mc_loss_fun(pred_mc_ans, mc_ans)  #Multiply by 4
            # check Nan Loss.
            misc.check_nan_losses(loss)
            #Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            #Save for stats
            loss_mc_val = loss

        # Non-separated alternative (not kept up to date): both question types from the
        # same batch handled in a single forward/backward pass:
        # pred_des_ans = model(des_q, True)
        # pred_mc_ans = model(mc_q, False)
        # # Explicitly declare reduction to mean.
        # des_loss_fun = losses.get_loss_func('cross_entropy')(reduction="mean")
        # mc_loss_fun = losses.get_loss_func('bce_logit')(reduction="mean")
        # # Compute the loss.
        # loss_des_val = des_loss_fun(pred_des_ans, des_ans)
        # loss_mc_val = mc_loss_fun(pred_mc_ans, mc_ans)
        # loss = loss_mc_val + loss_des_val
        # # check Nan Loss.
        # misc.check_nan_losses(loss)
        # # Perform the backward pass.
        # optimizer.zero_grad()
        # loss.backward()
        # # Update the parameters.
        # optimizer.step()

        top1_err, top5_err = None, None
        # Compute the errors.
        num_topks_correct = metrics.topks_correct(pred_des_ans, des_ans,
                                                  (1, 5))
        top1_err, top5_err = [(1.0 - x / pred_des_ans.size(0)) * 100.0
                              for x in num_topks_correct]
        if has_mc:
            diff_mc_ans = torch.abs(
                mc_ans - (torch.sigmoid(pred_mc_ans) >= 0.5).float())  #Errors
            mc_opt_err = 100 * torch.true_divide(diff_mc_ans.sum(),
                                                 (4 * mc_q.size()[0]))
            mc_q_err = 100 * torch.true_divide(
                (diff_mc_ans.sum(dim=1, keepdim=True) != 0).float().sum(),
                mc_q.size()[0])
            # Copy the stats from GPU to CPU (sync point).
            loss_des_val, loss_mc_val, top1_err, top5_err, mc_opt_err, mc_q_err = (
                loss_des_val.item(), loss_mc_val.item(), top1_err.item(),
                top5_err.item(), mc_opt_err.item(), mc_q_err.item())
            mb_size_mc = mc_q.size()[0]
        else:
            mc_opt_err, mc_q_err = None, None
            mb_size_mc = None
            loss_des_val, top1_err, top5_err = (loss_des_val.item(),
                                                top1_err.item(),
                                                top5_err.item())
        # update_stats arguments: top1_err, top5_err, mc_opt_err, mc_q_err,
        # loss_des, loss_mc, lr, mb_size
        # Update and log stats.
        train_meter.update_stats(top1_err, top5_err, mc_opt_err, mc_q_err,
                                 loss_des_val, loss_mc_val, lr,
                                 des_q.size()[0], mb_size_mc)
        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

        # For testing the implementation: print a few predictions and stop early.
        if test_imp:
            print(" --- Descriptive questions results --- ")
            # print("Des_q")
            # print(des_q)
            print("Des_ans")
            print(des_ans)
            #print("Des_ans_pred")
            #print(pred_des_ans)
            print("Argmax => prediction")
            print(torch.argmax(pred_des_ans, dim=1, keepdim=False))
            print("Top1_err and Top5err")
            print(top1_err, top5_err)
            print("Loss_des_val = {}".format(loss_des_val))
            if has_mc:
                print(" --- Multiple Choice questions results --- ")
                # print("Mc_q")
                # print(mc_q)
                # print("Mc errors pred x ans")
                # print(torch.abs(mc_ans - (torch.sigmoid(pred_mc_ans) >= 0.5).float()))
                print("mc_opt_err = {} \nmc_q_err = {}".format(
                    mc_opt_err, mc_q_err))
                print("Loss_mc_val = {}".format(loss_mc_val))
            test_counter += 1
            if test_counter == 4:
                break

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
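For reference, the two multiple-choice error metrics used above, mc_opt_err (percentage of wrongly predicted options, four options per question) and mc_q_err (percentage of questions with at least one wrong option), can be reproduced in isolation. A small sketch with hypothetical tensors:

import torch

# Hypothetical batch: 8 multiple-choice questions, 4 binary options each.
pred_mc_ans = torch.randn(8, 4)               # raw logits from the model
mc_ans = torch.randint(0, 2, (8, 4)).float()  # ground-truth 0/1 per option

# An option counts as wrong when the thresholded sigmoid disagrees with the label.
diff_mc_ans = torch.abs(mc_ans - (torch.sigmoid(pred_mc_ans) >= 0.5).float())

# Percentage of wrong options over all 4 * batch_size options.
mc_opt_err = 100 * torch.true_divide(diff_mc_ans.sum(), 4 * mc_ans.size(0))
# Percentage of questions with at least one wrongly predicted option.
mc_q_err = 100 * torch.true_divide(
    (diff_mc_ans.sum(dim=1, keepdim=True) != 0).float().sum(), mc_ans.size(0))
print(mc_opt_err.item(), mc_q_err.item())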