Python get_loss_func примеры использования

Язык программирования: Python

Пространство имен/Пакет: slowfast.models.losses

Метод/Функция: get_loss_func

Примеров на hotexamples.com: 25

Python get_loss_func - 25 примеров найдено. Это лучшие примеры Python кода для slowfast.models.losses.get_loss_func, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Пример #1

Показать файл

def eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, test_imp=False):
    """
    Evaluate the model on the val set.
    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ClevrerValMeter): meter instance to record and calculate the metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    for cur_iter, sampled_batch in enumerate(val_loader):
        video_ft = sampled_batch['res_ft']
        des_q = sampled_batch['question_dict']['question']
        attn_masks = sampled_batch['question_dict']['attention_mask']
        des_ans = sampled_batch['question_dict']['ans']
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            video_ft = video_ft.cuda(non_blocking=True)
            des_q = des_q.cuda(non_blocking=True)
            attn_masks = attn_masks.cuda(non_blocking=True)
            des_ans = des_ans.cuda()

        val_meter.data_toc()

        # Explicitly declare reduction to mean.
        des_loss_fun = losses.get_loss_func('cross_entropy')(reduction="mean")
        pred_des_ans = model(video_ft, des_q, attn_masks)
        loss_des_val = des_loss_fun(pred_des_ans, des_ans)

        # Compute the errors.
        num_topks_correct = metrics.topks_correct(pred_des_ans, des_ans,
                                                  (1, 5))
        # Combine the errors across the GPUs.
        top1_err, top5_err = [(1.0 - x / pred_des_ans.size(0)) * 100.0
                              for x in num_topks_correct]
        loss_mc_val = None
        mc_opt_err, mc_q_err = None, None
        mb_size_mc = None
        loss_des_val, top1_err, top5_err = (loss_des_val.item(),
                                            top1_err.item(), top5_err.item())

        val_meter.iter_toc()
        #top1_err, top5_err, mc_opt_err, mc_q_err, loss_des, loss_mc, mb_size_des, mb_size_mc
        # Update and log stats.
        val_meter.update_stats(top1_err, top5_err, mc_opt_err,
                               mc_q_err, loss_des_val, loss_mc_val,
                               des_ans.size(0), mb_size_mc)
        val_meter.log_iter_stats(cur_epoch, cur_iter)
        val_meter.iter_tic()

    # Log epoch stats.
    val_meter.log_epoch_stats(cur_epoch)
    val_meter.reset()

Пример #2

Показать файл

def train_epoch(train_dloader, model, optimizer, cur_epoch, cfg):
    model.train()
    train_tqdm = tqdm(train_dloader, ncols=80)
    data_size = len(train_dloader)
    for cur_iter, (inputs, labels, _, extra_data) in enumerate(train_tqdm):
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in extra_data.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    extra_data[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        if cfg.DETECTION.ENABLE:
            preds = model(inputs, extra_data["boxes"])
        else:
            preds = model(inputs)

        # Explicitly declare reduction to mean.
        loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

        # Compute the loss.
        loss = loss_fun(preds, labels)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        train_tqdm.set_description("Train_loss: %.4f" % loss.cpu().item())

Пример #3

Показать файл

Файл: train_net.py Проект: mrevow/SlowFast

    def train_epoch(self,
                    train_loader,
                    model,
                    optimizer,
                    train_meter,
                    cur_epoch,
                    cfg,
                    writer=None):
        """
        Perform the video training for one epoch.
        Args:
            train_loader (loader): video training loader.
            model (model): the video model to train.
            optimizer (optim): the optimizer to perform optimization on the model's
                parameters.
            train_meter (TrainMeter): training meters to log the training performance.
            cur_epoch (int): current epoch of training.
            cfg (CfgNode): configs. Details can be found in
                slowfast/config/defaults.py
            writer (TensorboardWriter, optional): TensorboardWriter object
                to writer Tensorboard log.
        """
        # Enable train mode.
        model.train()
        train_meter.iter_tic()
        data_size = len(train_loader)
        start = time.time()
        btch = cfg.TRAIN.BATCH_SIZE * self.cfg.NUM_SHARDS
        rankE = os.environ.get("RANK", None)
        worldE = os.environ.get("WORLD_SIZE", None)
        dSize = data_size * btch
        self.logger.info(
            "Train Epoch {} dLen {} Batch {} dSize {} localRank {} rank {} {} world {} {}"
            .format(cur_epoch, data_size, btch, dSize, du.get_local_rank(),
                    du.get_rank(), rankE, du.get_world_size(), worldE))
        tot = 0
        first = True
        predsAll = []
        labelsAll = []

        for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
            # Transfer the data to the current GPU device.
            tot += len(labels)
            if isinstance(inputs, (list, )):
                if first:
                    self.logger.info(
                        "rank {} LEN {}  {} shape Slow {} Fast {} {} tot {}".
                        format(du.get_rank(), len(labels), len(inputs),
                               inputs[0].shape, inputs[1].shape,
                               labels[0].shape, tot))
                    first = False
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                if first:
                    self.logger.info(
                        "rank {} LEN {} shape {} {} tot {}".format(
                            du.get_rank(), len(labels), inputs.shape,
                            labels[0].shape, tot))
                    first = False
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()

            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

            # Update the learning rate.
            lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size,
                                    cfg)
            optim.set_lr(optimizer, lr)
            if cfg.DETECTION.ENABLE:
                # Compute the predictions.
                preds = model(inputs, meta["boxes"])

            else:
                # Perform the forward pass.
                preds = model(inputs)
            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(
                cfg.MODEL.LOSS_FUNC)(reduction="mean")

            # Compute the loss.
            loss = loss_fun(preds, labels)

            # check Nan Loss.
            misc.check_nan_losses(loss)

            # Perform the backward pass.
            optimizer.zero_grad()
            loss.backward()
            # Update the parameters.
            optimizer.step()

            if cfg.DETECTION.ENABLE:
                if cfg.NUM_GPUS > 1:
                    loss = du.all_reduce([loss])[0]
                loss = loss.item()

                train_meter.iter_toc()
                # Update and log stats.
                train_meter.update_stats(None, None, None, loss, lr)
                # write to tensorboard format if available.
                if writer is not None:
                    writer.add_scalars(
                        {
                            "Train/loss": loss,
                            "Train/lr": lr
                        },
                        global_step=data_size * cur_epoch + cur_iter,
                    )
                ite = data_size * cur_epoch + cur_iter
                if du.is_master_proc():
                    self.logger.log_row(name='TrainLoss',
                                        iter=ite,
                                        loss=loss,
                                        description="train loss")
                    self.logger.log_row(name='TrainLr',
                                        iter=ite,
                                        lr=lr,
                                        description="train learn rate")

            else:
                top1_err, top5_err = None, None
                if cfg.DATA.MULTI_LABEL:
                    # Gather all the predictions across all the devices.
                    if cfg.NUM_GPUS > 1:
                        [loss] = du.all_reduce([loss])
                    loss = loss.item()
                else:
                    # Binary classifier - save preds / labels for metrics
                    if cfg.MODEL.NUM_CLASSES == 2:
                        predsAll.extend(preds.detach().cpu().numpy()[:, -1])
                        labelsAll.extend(labels.detach().cpu().numpy())
                    # Compute the errors.
                    num_topks_correct = metrics.topks_correct(
                        preds, labels, (1, min(5, cfg.MODEL.NUM_CLASSES)))
                    top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                          for x in num_topks_correct]

                    # Gather all the predictions across all the devices.
                    if cfg.NUM_GPUS > 1:
                        loss, top1_err, top5_err = du.all_reduce(
                            [loss, top1_err, top5_err])

                    # Copy the stats from GPU to CPU (sync point).
                    loss, top1_err, top5_err = (
                        loss.item(),
                        top1_err.item(),
                        top5_err.item(),
                    )

                train_meter.iter_toc()
                # Update and log stats.
                # self.logger.info("UPDATING stat {} {} {}".format(inputs[0].size(0), cfg.NUM_GPUS, inputs[0].size(0) * cfg.NUM_GPUS))
                train_meter.update_stats(top1_err, top5_err, loss, lr,
                                         inputs[0].size(0) * cfg.NUM_GPUS)
                # write to tensorboard format if available.
                if writer is not None:
                    writer.add_scalars(
                        {
                            "Train/loss": loss,
                            "Train/lr": lr,
                            "Train/Top1_err": top1_err,
                            "Train/Top5_err": top5_err,
                        },
                        global_step=data_size * cur_epoch + cur_iter,
                    )

            stats = train_meter.log_iter_stats(cur_epoch, cur_iter, predsAll,
                                               labelsAll)
            ite = dSize * cur_epoch + btch * (cur_iter + 1)
            self.plotStats(stats, ite, 'TrainIter')
            train_meter.iter_tic()

        if du.is_master_proc() and cfg.LOG_MODEL_INFO:
            misc.log_model_info(model, cfg, use_train_input=True)
        # Log epoch stats.
        gathered = du.all_gather([
            torch.tensor(predsAll).to(torch.device("cuda")),
            torch.tensor(labelsAll).to(torch.device("cuda"))
        ])
        stats = train_meter.log_epoch_stats(cur_epoch,
                                            gathered[0].detach().cpu().numpy(),
                                            gathered[1].detach().cpu().numpy())
        ite = (cur_epoch + 1) * dSize
        self.plotStats(stats, ite, 'TrainEpoch')
        train_meter.reset()
        end = time.time()
        el = end - start
        totAll = du.all_reduce([torch.tensor(tot).cuda()], average=False)
        tSum = totAll[0].item()
        elT = torch.tensor(el).cuda()
        elMax = du.all_reduce([elT], op=dist.ReduceOp.MAX,
                              average=False)[0].item()
        jobRate = tSum / elMax
        self.logger.info(
            "totSampCnt {} workerSampCnt {}  eTimeMax {} eTimeWorker {}  SampPerSecJob {:.1f} SampPerSecWorker {:.1f}"
            .format(tSum, tot, elMax, el, jobRate, tot / el))
        return jobRate

Пример #4

Показать файл

    question = sample_batched['question_dict']['question']
    ans = sample_batched['question_dict']['ans']
    break

print("Model")
vocab_len = dataset.get_vocab_len()
ans_vocab_len = dataset.get_ans_vocab_len()
vocab = dataset.get_vocab()
name = cfg.MODEL.MODEL_NAME
model = MODEL_REGISTRY.get(name)(cfg, vocab_len, ans_vocab_len, vocab)

print("Embedding layer: ")
print(model.embed_layer.weight)

print("Pass through model")
print("Question = {}".format(question))
if is_des:
    pred_des_ans = model(question, True)
    print("Model output = {}".format(pred_des_ans))
    des_loss_fun = losses.get_loss_func('cross_entropy')(reduction="mean")
    loss = des_loss_fun(pred_des_ans, ans)
else:
    pred_mc_ans = model(question, False)
    print("Model output = {}".format(pred_mc_ans))
    mc_loss_fun = losses.get_loss_func('bce_logit')(reduction="mean")
    loss = mc_loss_fun(pred_mc_ans, ans)

print("Loss = {}".format(loss))
loss.backward()
print("Embed Grad 0:5:")
print(model.embed_layer.weight.grad[0:5])

Пример #5

Показать файл

Файл: train_net.py Проект: gabrielsluz/SlowFast

def train_epoch(train_loader,
                model,
                optimizer,
                train_meter,
                cur_epoch,
                cfg,
                test_imp=False):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (ClevrerTrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    test_counter = 0
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, sampled_batch in enumerate(train_loader):
        #Samples 2 batches. One for des and one for mc
        #There are much more des, then some batches are only des
        des_batch = sampled_batch['des']
        des_q = des_batch['question_dict']['question']
        des_ans = des_batch['question_dict']['ans']
        des_len = des_batch['question_dict']['len']
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            des_q = des_q.cuda(non_blocking=True)
            des_ans = des_ans.cuda()
            des_len = des_len.cuda(non_blocking=True)

        has_mc = sampled_batch['has_mc'][0]
        if has_mc:
            mc_batch = sampled_batch['mc']
            mc_q = mc_batch['question_dict']['question']
            mc_ans = mc_batch['question_dict']['ans']
            mc_len = mc_batch['question_dict']['len']
            if cfg.NUM_GPUS:
                mc_q = mc_q.cuda(non_blocking=True)
                mc_ans = mc_ans.cuda()
                mc_len = mc_len.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()

        #Separated batches
        #Des
        pred_des_ans = model(des_q, True)
        des_loss_fun = losses.get_loss_func('cross_entropy')(reduction="mean")
        loss = des_loss_fun(pred_des_ans, des_ans)
        # check Nan Loss.
        misc.check_nan_losses(loss)
        #Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #Save for stats
        loss_des_val = loss

        #MC
        loss_mc_val = None
        if has_mc:
            pred_mc_ans = model(mc_q, False)
            mc_loss_fun = losses.get_loss_func('bce_logit')(reduction="mean")
            loss = mc_loss_fun(pred_mc_ans, mc_ans)  #Multiply by 4
            # check Nan Loss.
            misc.check_nan_losses(loss)
            #Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            #Save for stats
            loss_mc_val = loss

        # #Non separated Not updated for same batch 2 questions:
        # pred_des_ans = model(des_q, True)
        # pred_mc_ans = model(mc_q, False)
        # # Explicitly declare reduction to mean.
        # des_loss_fun = losses.get_loss_func('cross_entropy')(reduction="mean")
        # mc_loss_fun = losses.get_loss_func('bce_logit')(reduction="mean")
        # # Compute the loss.
        # loss_des_val = des_loss_fun(pred_des_ans, des_ans)
        # loss_mc_val = mc_loss_fun(pred_mc_ans, mc_ans)
        # loss = loss_mc_val + loss_des_val
        # # check Nan Loss.
        # misc.check_nan_losses(loss)
        # # Perform the backward pass.
        # optimizer.zero_grad()
        # loss.backward()
        # # Update the parameters.
        # optimizer.step()

        top1_err, top5_err = None, None
        # Compute the errors.
        num_topks_correct = metrics.topks_correct(pred_des_ans, des_ans,
                                                  (1, 5))
        top1_err, top5_err = [(1.0 - x / pred_des_ans.size(0)) * 100.0
                              for x in num_topks_correct]
        if has_mc:
            diff_mc_ans = torch.abs(
                mc_ans - (torch.sigmoid(pred_mc_ans) >= 0.5).float())  #Errors
            mc_opt_err = 100 * torch.true_divide(diff_mc_ans.sum(),
                                                 (4 * mc_q.size()[0]))
            mc_q_err = 100 * torch.true_divide(
                (diff_mc_ans.sum(dim=1, keepdim=True) != 0).float().sum(),
                mc_q.size()[0])
            # Copy the stats from GPU to CPU (sync point).
            loss_des_val, loss_mc_val, top1_err, top5_err, mc_opt_err, mc_q_err = (
                loss_des_val.item(), loss_mc_val.item(), top1_err.item(),
                top5_err.item(), mc_opt_err.item(), mc_q_err.item())
            mb_size_mc = mc_q.size()[0]
        else:
            mc_opt_err, mc_q_err = None, None
            mb_size_mc = None
            loss_des_val, top1_err, top5_err = (loss_des_val.item(),
                                                top1_err.item(),
                                                top5_err.item())
        #top1_err, top5_err, mc_opt_err, mc_q_err, loss_des, loss_mc, lr, mb_size
        # Update and log stats.
        train_meter.update_stats(top1_err, top5_err, mc_opt_err, mc_q_err,
                                 loss_des_val, loss_mc_val, lr,
                                 des_q.size()[0], mb_size_mc)
        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

        #For testing implementation
        if test_imp:
            print(" --- Descriptive questions results --- ")
            # print("Des_q")
            # print(des_q)
            print("Des_ans")
            print(des_ans)
            #print("Des_ans_pred")
            #print(pred_des_ans)
            print("Argmax => prediction")
            print(torch.argmax(pred_des_ans, dim=1, keepdim=False))
            print("Top1_err and Top5err")
            print(top1_err, top5_err)
            print("Loss_des_val = {}".format(loss_des_val))
            if has_mc:
                print(" --- Multiple Choice questions results --- ")
                # print("Mc_q")
                # print(mc_q)
                # print("Mc errors pred x ans")
                # print(torch.abs(mc_ans - (torch.sigmoid(pred_mc_ans) >= 0.5).float()))
                print("mc_opt_err = {} \nmc_q_err = {}".format(
                    mc_opt_err, mc_q_err))
                print("Loss_mc_val = {}".format(loss_mc_val))
            test_counter += 1
            if test_counter == 4:
                break

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()

Пример #6

Показать файл

def train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, cfg):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list, )):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)
        labels = labels.cuda()
        for key, val in meta.items():
            if isinstance(val, (list, )):
                for i in range(len(val)):
                    val[i] = val[i].cuda(non_blocking=True)
            else:
                meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"])

        else:
            # Perform the forward pass.
            preds = model(inputs)

        # Explicitly declare reduction to mean.
        loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

        # Compute the loss.
        loss = loss_fun(preds, labels)

        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
        else:
            # Compute the errors.
            num_topks_correct = metrics.topks_correct(preds, labels, (1, 5))
            top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                  for x in num_topks_correct]

            # Gather all the predictions across all the devices.
            if cfg.NUM_GPUS > 1:
                loss, top1_err, top5_err = du.all_reduce(
                    [loss, top1_err, top5_err])

            # Copy the stats from GPU to CPU (sync point).
            loss, top1_err, top5_err = (
                loss.item(),
                top1_err.item(),
                top5_err.item(),
            )

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(top1_err, top5_err, loss, lr,
                                     inputs[0].size(0) * cfg.NUM_GPUS)

        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()

Пример #7

Показать файл

Файл: train_net_des.py Проект: gabrielsluz/SlowFast

def eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, test_imp=False):
    """
    Evaluate the model on the val set.
    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ClevrerValMeter): meter instance to record and calculate the metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    test_counter = 0
    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    for cur_iter, sampled_batch in enumerate(val_loader):
        frames = sampled_batch['frames']
        des_q = sampled_batch['question_dict']['question']
        des_ans = sampled_batch['question_dict']['ans']
        # des_len = sampled_batch['question_dict']['len']
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(frames, (list,)):
                for i in range(len(frames)):
                    frames[i] = frames[i].cuda(non_blocking=True)
            else:
                frames = frames.cuda(non_blocking=True)
            des_q = des_q.cuda(non_blocking=True)
            des_ans = des_ans.cuda()
            # des_len = des_len.cuda(non_blocking=True)

        val_meter.data_toc()

        # Explicitly declare reduction to mean.
        des_loss_fun = losses.get_loss_func('cross_entropy')(reduction="mean")
        pred_des_ans = model(frames, des_q, True)
        loss_des_val = des_loss_fun(pred_des_ans, des_ans)

        # Compute the errors.
        num_topks_correct = metrics.topks_correct(pred_des_ans, des_ans, (1, 5))
        # Combine the errors across the GPUs.
        top1_err, top5_err = [
            (1.0 - x / pred_des_ans.size(0)) * 100.0 for x in num_topks_correct
        ]
        loss_mc_val = None
        mc_opt_err, mc_q_err = None, None
        mb_size_mc = None
        loss_des_val, top1_err, top5_err = (
            loss_des_val.item(),
            top1_err.item(),
            top5_err.item()
        )

        val_meter.iter_toc()
        #top1_err, top5_err, mc_opt_err, mc_q_err, loss_des, loss_mc, mb_size_des, mb_size_mc
        # Update and log stats.
        val_meter.update_stats(
            top1_err,
            top5_err,
            mc_opt_err,
            mc_q_err,
            loss_des_val,
            loss_mc_val,
            des_q.size()[0],
            mb_size_mc
        )
        val_meter.log_iter_stats(cur_epoch, cur_iter)
        val_meter.iter_tic()

        #For testing implementation
        if test_imp:
            print(" --- Descriptive questions results --- ")
            # print("Des_q")
            # print(des_q)
            print("Des_ans")
            print(des_ans)
            #print("Des_ans_pred")
            #print(pred_des_ans)
            print("Argmax => prediction")
            print(torch.argmax(pred_des_ans, dim=1, keepdim=False))
            print("Top1_err and Top5err")
            print(top1_err, top5_err)
            print("Loss_des_val = {}".format(loss_des_val))
            test_counter += 1
            if test_counter == 1: 
                break

    # Log epoch stats.
    val_meter.log_epoch_stats(cur_epoch)
    val_meter.reset()

Пример #8

Показать файл

def train_epoch(train_loader,
                model,
                optimizer,
                scaler,
                train_meter,
                cur_epoch,
                cfg,
                writer=None):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to writer Tensorboard log.
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    if cfg.MIXUP.ENABLE:
        mixup_fn = MixUp(
            mixup_alpha=cfg.MIXUP.ALPHA,
            cutmix_alpha=cfg.MIXUP.CUTMIX_ALPHA,
            mix_prob=cfg.MIXUP.PROB,
            switch_prob=cfg.MIXUP.SWITCH_PROB,
            label_smoothing=cfg.MIXUP.LABEL_SMOOTH_VALUE,
            num_classes=cfg.MODEL.NUM_CLASSES,
        )

    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()
        if cfg.MIXUP.ENABLE:
            samples, labels = mixup_fn(inputs[0], labels)
            inputs[0] = samples

        with torch.cuda.amp.autocast(enabled=cfg.TRAIN.MIXED_PRECISION):
            if cfg.DETECTION.ENABLE:
                preds = model(inputs, meta["boxes"])
            else:
                preds = model(inputs)
            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(
                cfg.MODEL.LOSS_FUNC)(reduction="mean")

            # Compute the loss.
            loss = loss_fun(preds, labels)

        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        # Unscales the gradients of optimizer's assigned params in-place
        scaler.unscale_(optimizer)
        # Clip gradients if necessary
        if cfg.SOLVER.CLIP_GRAD_VAL:
            torch.nn.utils.clip_grad_value_(model.parameters(),
                                            cfg.SOLVER.CLIP_GRAD_VAL)
        elif cfg.SOLVER.CLIP_GRAD_L2NORM:
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           cfg.SOLVER.CLIP_GRAD_L2NORM)
        # Update the parameters.
        scaler.step(optimizer)
        scaler.update()

        if cfg.MIXUP.ENABLE:
            _top_max_k_vals, top_max_k_inds = torch.topk(labels,
                                                         2,
                                                         dim=1,
                                                         largest=True,
                                                         sorted=True)
            idx_top1 = torch.arange(labels.shape[0]), top_max_k_inds[:, 0]
            idx_top2 = torch.arange(labels.shape[0]), top_max_k_inds[:, 1]
            preds = preds.detach()
            preds[idx_top1] += preds[idx_top2]
            preds[idx_top2] = 0.0
            labels = top_max_k_inds[:, 0]

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            # Update and log stats.
            train_meter.update_stats(
                top1_err,
                top5_err,
                loss,
                lr,
                inputs[0].size(0) * max(
                    cfg.NUM_GPUS, 1
                ),  # If running  on CPU (cfg.NUM_GPUS == 1), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()

Пример #9

Показать файл

def train_epoch(
        train_loader, model, optimizer, train_meter, cur_epoch, cfg, writer=None
):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to writer Tensorboard log.
    """
    # Enable train mode.
    model.train()
    # Check if the correct params are set to requires_grad = True
    assert_requires_grad_correctness(model, du.is_master_proc(), cfg)
    train_meter.iter_tic()
    data_size = len(train_loader)
    np.set_printoptions(suppress=True)

    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list,)):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)
        labels = labels.cuda()
        for key, val in meta.items():
            if isinstance(val, (list,)):
                for i in range(len(val)):
                    val[i] = val[i].cuda(non_blocking=True)
            else:
                meta[key] = val.cuda(non_blocking=True)

        if cfg.MODEL.HEAD_ACT == "softmax" and cfg.TRAIN.DATASET == "custom":
            # We have to change our labels to long tensor
            labels = labels.type(torch.LongTensor)
            labels = labels.cuda()

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr, cfg)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"], is_train=True)

        else:
            # Perform the forward pass.
            preds = model(inputs)

        # Explicitly declare reduction to mean.
        loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

        # Compute the loss.
        loss = loss_fun(preds, labels)

        """
        if cur_iter % 70 == 0:
            softmax = torch.nn.Softmax(dim=1)
            probabilities = softmax(preds)
            loss_prob = loss_fun(probabilities, labels)
            preds_numpy = probabilities.cpu().detach().numpy()
            preds_numpy = np.round(preds_numpy, 4)
            labels_numpy = labels.cpu().detach().numpy()
            print("--------------------------")
            for label, pred in zip (labels_numpy, preds_numpy):
                print(str(label) + "---->", end= "")
                print(pred[label])
        """


        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        # Todo: adjust accordingly
        if cfg.DETECTION.ENABLE:  #and not (cfg.MODEL.HEAD_ACT == "softmax"):
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {"Train/loss": loss, "Train/lr": lr},
                    global_step=data_size * cur_epoch + cur_iter,
                )

        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(preds, labels, (1, 5))
                top1_err, top5_err = [
                    (1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct
                ]

                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err]
                    )

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(
                top1_err, top5_err, loss, lr, inputs[0].size(0) * cfg.NUM_GPUS
            )
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()

Пример #10

Показать файл

Файл: train_KD.py Проект: bqhuyy/SlowFast-clean

def train_epoch(
    train_loader, student_model, teacher_model, optimizer, train_meter, cur_epoch, cfg, writer=None
):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to writer Tensorboard log.
    """
    # Enable train mode.
    teacher_model.eval()
    student_model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, (inputs, labels, _, meta, _) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list,)):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list,)):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            student_preds, student_features = student_model(inputs.copy(), meta["boxes"])
            with torch.no_grad():
                teacher_preds, teacher_features = teacher_model(inputs.copy(), meta["boxes"])
        else:
            # Perform the forward pass.
            student_preds, student_features = student_model(inputs.copy())
            with torch.no_grad():
                teacher_preds, teacher_features = teacher_model(inputs.copy())
        # Explicitly declare reduction to mean.
        # L2 loss for featuremap difference
        loss_mse_func = losses.get_loss_func('mse')(reduction="mean")
        # Cross entropy loss for prediction
        loss_pred_func = losses.get_loss_func('cross_entropy')(reduction="mean")
        # kl-divergence loss
        loss_kl_func = losses.get_loss_func('kl_divergence')(reduction="batchmean")
        
        T = cfg.KD.TEMPERATURE
        alpha = cfg.KD.ALPHA
        
        loss_pred = loss_pred_func(student_preds, labels) * (1. - alpha)
        loss_mse = []
        loss_kl = []
        for s_features, t_features in zip(student_features, teacher_features):
            for i in range(2):
                #mse loss
                loss_mse.append(loss_mse_func(s_features[i], t_features[i]) * (alpha * T * T))

                #kl divergence loss 
                b, c, t, h, w = s_features[i].shape
                s_feature = s_features[i].permute(0, 2, 3, 4, 1).contiguous().view(b*t*h*w, c)
                t_feature = t_features[i].permute(0, 2, 3, 4, 1).contiguous().view(b*t*h*w, c)
                loss_kl.append(loss_kl_func(F.log_softmax(s_feature/T, dim = 0), F.softmax(t_feature/T, dim = 0)) * (alpha * T * T))
            
        #TOTAL LOSS = sum of all losses
        loss = loss_pred + sum(loss_mse) + sum(loss_kl)

        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # write to tensorboard format if available.T
            if writer is not None:
                writer.add_scalars(
                    {"Train/loss": loss, "Train/lr": lr, "Train/mse": sum(loss_mse), "Train/loss_kl": sum(loss_kl), "Train/loss_pred": loss_pred},
                    global_step=data_size * cur_epoch + cur_iter,
                )

        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(student_preds, labels, (1, 5))
                top1_err, top5_err = [
                    (1.0 - x / student_preds.size(0)) * 100.0 for x in num_topks_correct
                ]

                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err]
                    )

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(
                top1_err,
                top5_err,
                loss,
                lr,
                inputs[0].size(0)
                * max(
                    cfg.NUM_GPUS, 1
                ),  # If running  on CPU (cfg.NUM_GPUS == 1), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                        "Train/mse": sum(loss_mse),
                        "Train/loss_kl": sum(loss_kl),
                        "Train/loss_pred": loss_pred,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()

Пример #11

Показать файл

def train_epoch(
    train_loader,
    model,
    optimizer,
    scaler,
    train_meter,
    cur_epoch,
    cfg,
    writer=None,
):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to writer Tensorboard log.
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    if cfg.MIXUP.ENABLE:
        mixup_fn = MixUp(
            mixup_alpha=cfg.MIXUP.ALPHA,
            cutmix_alpha=cfg.MIXUP.CUTMIX_ALPHA,
            mix_prob=cfg.MIXUP.PROB,
            switch_prob=cfg.MIXUP.SWITCH_PROB,
            label_smoothing=cfg.MIXUP.LABEL_SMOOTH_VALUE,
            num_classes=cfg.MODEL.NUM_CLASSES,
        )

    iters_noupdate = 0
    if cfg.MODEL.MODEL_NAME == "ContrastiveModel" and cfg.CONTRASTIVE.TYPE == "moco":
        assert cfg.CONTRASTIVE.QUEUE_LEN % (cfg.TRAIN.BATCH_SIZE *
                                            cfg.NUM_SHARDS) == 0
        iters_noupdate = (cfg.CONTRASTIVE.QUEUE_LEN // cfg.TRAIN.BATCH_SIZE //
                          cfg.NUM_SHARDS)
    if cfg.MODEL.FROZEN_BN:
        misc.frozen_bn_stats(model)
    # Explicitly declare reduction to mean.
    loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

    # import ipdb; ipdb.set_trace()

    profiler.log_tic("loop_time")
    for cur_iter, (inputs, labels, index, time,
                   meta) in enumerate(train_loader):
        if not isinstance(inputs, list):
            inputs = [inputs]
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    if isinstance(inputs[i], (list, )):
                        for j in range(len(inputs[i])):
                            # inputs[i][j] = inputs[i][j].cuda(non_blocking=True)
                            inputs[i][j] = inputs[i][j].to("cuda:0")
                    else:
                        # inputs[i] = inputs[i].cuda(non_blocking=True)
                        inputs[i] = inputs[i].to("cuda:0")
            else:
                # inputs = inputs.cuda(non_blocking=True)
                inputs = inputs.to("cuda:0")

            # labels = labels.cuda()
            labels = labels.to("cuda:0")

            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)
            index = index.to("cuda:0")
            time = time.to("cuda:0")

        batch_size = (inputs[0][0].size(0)
                      if isinstance(inputs[0], list) else inputs[0].size(0))
        # Update the learning rate.
        epoch_exact = cur_epoch + float(cur_iter) / data_size
        lr = optim.get_epoch_lr(epoch_exact, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()
        if cfg.MIXUP.ENABLE:
            samples, labels = mixup_fn(inputs[0], labels)
            inputs[0] = samples

        with torch.cuda.amp.autocast(enabled=cfg.TRAIN.MIXED_PRECISION):

            # Explicitly declare reduction to mean.
            perform_backward = True
            optimizer.zero_grad()

            if cfg.MODEL.MODEL_NAME == "ContrastiveModel":
                (
                    model,
                    preds,
                    partial_loss,
                    perform_backward,
                ) = contrastive_forward(model, cfg, inputs, index, time,
                                        epoch_exact, scaler)
            elif cfg.DETECTION.ENABLE:
                # Compute the predictions.
                preds = model(inputs, meta["boxes"])
            else:
                profiler.log_tic("model_time")
                preds = model(inputs)
                profiler.log_toc("model_time", shape=inputs[0].shape)

        if cfg.TASK == "ssl" and cfg.MODEL.MODEL_NAME == "ContrastiveModel":
            labels = torch.zeros(preds.size(0),
                                 dtype=labels.dtype,
                                 device=labels.device)

        if cfg.MODEL.MODEL_NAME == "ContrastiveModel" and partial_loss:
            loss = partial_loss
        else:
            # Compute the loss.
            loss = loss_fun(preds, labels)

        # check Nan Loss.
        misc.check_nan_losses(loss)

        if perform_backward:
            # print("Running backward!")
            scaler.scale(loss).backward()

        # Unscales the gradients of optimizer's assigned params in-place
        scaler.unscale_(optimizer)
        # Clip gradients if necessary
        if cfg.SOLVER.CLIP_GRAD_VAL:
            torch.nn.utils.clip_grad_value_(model.parameters(),
                                            cfg.SOLVER.CLIP_GRAD_VAL)
        elif cfg.SOLVER.CLIP_GRAD_L2NORM:
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           cfg.SOLVER.CLIP_GRAD_L2NORM)

        model = cancel_swav_gradients(model, cfg, epoch_exact)
        if cur_iter < iters_noupdate and cur_epoch == 0:  #  for e.g. MoCo
            logger.info("Not updating parameters {}/{}".format(
                cur_iter, iters_noupdate))
        else:
            # print("Updating optimizer!")
            # Update the parameters.
            scaler.step(optimizer)
        scaler.update()

        if cfg.MIXUP.ENABLE:
            _top_max_k_vals, top_max_k_inds = torch.topk(labels,
                                                         2,
                                                         dim=1,
                                                         largest=True,
                                                         sorted=True)
            idx_top1 = torch.arange(labels.shape[0]), top_max_k_inds[:, 0]
            idx_top2 = torch.arange(labels.shape[0]), top_max_k_inds[:, 1]
            preds = preds.detach()
            preds[idx_top1] += preds[idx_top2]
            preds[idx_top2] = 0.0
            labels = top_max_k_inds[:, 0]

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss.detach(), top1_err, top5_err])

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            # Update and log stats.
            train_meter.update_stats(
                top1_err,
                top5_err,
                loss,
                lr,
                batch_size * max(
                    cfg.NUM_GPUS, 1
                ),  # If running  on CPU (cfg.NUM_GPUS == 1), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        torch.cuda.synchronize()
        train_meter.iter_toc()  # do measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        torch.cuda.synchronize()
        train_meter.iter_tic()

        profiler.log_toc("loop_time", shape=inputs[0].shape)
        profiler.log_tic("loop_time")

        profiler.report(25)
        # profiler.blahblah

    del inputs
    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()

Пример #12

Показать файл

Файл: train_net.py Проект: anton-br/SlowFast

def train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, cfg):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)
    regr_list = []
    num_list = []
    top_list = []
    for cur_iter, (inputs, labels, _) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list, )):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)

        if isinstance(labels, (list, )):
            for i in range(len(labels)):
                labels[i] = labels[i].cuda(non_blocking=True)
            labels = torch.stack((labels))
        else:
            labels = labels.cuda(non_blocking=True)

        if cfg.MODEL.LOSS_FUNC == 'mse':
            labels = labels.float()

        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        # Perform the forward pass.
        preds = model(inputs)
        # Explicitly declare reduction to mean.
        loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")
        # Compute the loss.

        loss = loss_fun(preds, labels)
        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        top1_err = None
        # Compute the errors.
        num_classes = cfg.MODEL.NUM_CLASSES
        if cfg.DATA.LABELS_TYPE == 'regression':
            ln = (labels.size(1) - 1) // 2 + 1
            pr = preds[:, ln:].reshape(-1, 5)
            lb = labels[:, ln:].reshape(-1)
            num_topks_correct = metrics.topks_correct(pr, lb, (1, ))
            top1_err = (1.0 - num_topks_correct[0] / len(lb)) * 100.0
            regr = ((preds[:, 0] - labels[:, 0])**2).mean()
            numbers = ((preds[:, 1:ln] - labels[:, 1:ln])**2).mean()
            if cfg.NUM_GPUS > 1:
                regr, numbers = du.all_reduce([regr, numbers])
            regr_list.append(regr.item())
            num_list.append(numbers.item())
        elif cfg.DATA.LABELS_TYPE == 'length':
            regr = ((preds[:, 0] - labels[:, 0])**2).mean()
            numbers = ((preds[:, 1:] - labels[:, 1:])**2).mean()
            if cfg.NUM_GPUS > 1:
                regr, numbers = du.all_reduce([regr, numbers])
            regr_list.append(regr.item())
            num_list.append(numbers.item())
            num_topks_correct = metrics.topks_correct(preds, labels, (1, ))
            top1_err = num_topks_correct[0] * 0.0
        elif cfg.DATA.LABELS_TYPE == 'stend':
            top1_err = loss.clone()
            # sigm = torch.nn.Sigmoid()
            # start = sigm(preds[:, 0]).cpu().detach().numpy()
            # end = sigm(preds[:, 1]).cpu().detach().numpy()

        else:
            num_topks_correct = metrics.topks_correct(preds, labels, (1, ))
            preds_ix = preds.size(2) * preds.size(
                0) if cfg.DATA.LABELS_TYPE == 'mask' else preds.size(1)
            top1_err = (1.0 - num_topks_correct[0] / preds_size) * 100.0

        # Gather all the predictions across all the devices.
        if cfg.NUM_GPUS > 1:
            loss, top1_err = du.all_reduce([loss, top1_err])

        # Copy the stats from GPU to CPU (sync point).
        loss, top1_err = (loss.item(), top1_err.item())
        top_list.append(top1_err)
        train_meter.iter_toc()
        # Update and log stats.
        train_meter.update_stats(top1_err, loss, lr,
                                 inputs[0].size(0) * cfg.NUM_GPUS)

        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()
    if cfg.DATA.LABELS_TYPE == 'regression' or cfg.DATA.LABELS_TYPE == 'length':
        print('---------------------')
        print(
            f'LOSS VALUES!!: SIZE_LOSS:{np.mean(regr_list)} NUM_LOSS:{np.mean(num_list)} CLASS_LOSS:{np.mean(top_list)}'
        )
        print('---------------------')
    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()

Пример #13

Показать файл

Файл: train_net.py Проект: anton-br/SlowFast

def eval_epoch(val_loader, model, val_meter, cur_epoch, cfg):
    """
    Evaluate the model on the val set.
    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ValMeter): meter instance to record and calculate the metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """

    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()
    regr_list = []
    num_list = []
    top_list = []
    for cur_iter, (inputs, labels, _) in enumerate(val_loader):
        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list, )):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)

        if isinstance(labels, (list, )):
            for i in range(len(labels)):
                labels[i] = labels[i].cuda(non_blocking=True)
            labels = torch.stack((labels))
        else:
            labels = labels.cuda(non_blocking=True)

        preds = model(inputs)

        if cfg.DATA.LABELS_TYPE == 'regression':
            pr = preds[:, ln:].reshape(-1, 5)
            lb = labels[:, ln:].reshape(-1)
            num_topks_correct = metrics.topks_correct(pr, lb, (1, ))
            top1_err = (1.0 - num_topks_correct[0] / len(lb)) * 100.0
            regr = ((preds[:, 0] - labels[:, 0])**2).mean()
            numbers = ((preds[:, 1:ln] - labels[:, 1:ln])**2).mean()
            if cfg.NUM_GPUS > 1:
                regr, numbers = du.all_reduce([regr, numbers])
            regr_list.append(regr.item())
            num_list.append(numbers.item())
        elif cfg.DATA.LABELS_TYPE == 'length':
            regr = ((preds[:, 0] - labels[:, 0])**2).mean()
            numbers = ((preds[:, 1:] - labels[:, 1:])**2).mean()
            if cfg.NUM_GPUS > 1:
                regr, numbers = du.all_reduce([regr, numbers])
            regr_list.append(regr.item())
            num_list.append(numbers.item())
            num_topks_correct = metrics.topks_correct(preds, labels, (1, ))
            top1_err = num_topks_correct[0] * 0.0
        elif cfg.DATA.LABELS_TYPE == 'stend':
            loss_fun = losses.get_loss_func(
                cfg.MODEL.LOSS_FUNC)(reduction="mean")
            loss = loss_fun(preds, labels)
            top1_err = loss.clone()
        else:
            num_topks_correct = metrics.topks_correct(preds, labels, (1, ))
            preds_ix = preds.size(2) * preds.size(
                0) if cfg.DATA.LABELS_TYPE == 'mask' else preds.size(1)
            top1_err = (1.0 - num_topks_correct[0] / preds_size) * 100.0

        # num_topks_correct = metrics.topks_correct(preds, labels, (1, ))
        # # Combine the errors across the GPUs.
        # preds_ix = 2 if cfg.DATA.LABELS_TYPE == 'mask' else 1
        # top1_err = (1.0 - num_topks_correct[0] / preds.size(preds_ix)) * 100.0

        if cfg.NUM_GPUS > 1:
            top1_err = du.all_reduce([top1_err])[0]
        # Copy the errors from GPU to CPU (sync point).
        top1_err = top1_err.item()
        top_list.append(top1_err)
        val_meter.iter_toc()
        # Update and log stats.
        val_meter.update_stats(top1_err, inputs[0].size(0) * cfg.NUM_GPUS)

        val_meter.log_iter_stats(cur_epoch, cur_iter)
        val_meter.iter_tic()
    if cfg.DATA.LABELS_TYPE == 'regression' or cfg.DATA.LABELS_TYPE == 'length':
        print('---------------------')
        print(
            f'VALIDATE LOSS!!: SIZE_LOSS:{np.mean(regr_list):.5} NUM_LOSS:{np.mean(num_list):.5} CLASS_LOSS:{np.mean(top_list):.5}'
        )
        print('---------------------')
    # Log epoch stats.
    val_meter.log_epoch_stats(cur_epoch)
    val_meter.reset()

Пример #14

Показать файл

Файл: contrastive.py Проект: karttikeya/SlowFast

    def __init__(self, cfg):
        super(ContrastiveModel, self).__init__()
        # Construct the model.
        self.backbone = _MODEL_TYPES[cfg.MODEL.ARCH](cfg)
        self.type = cfg.CONTRASTIVE.TYPE
        self.T = cfg.CONTRASTIVE.T
        self.dim = cfg.CONTRASTIVE.DIM
        self.length = cfg.CONTRASTIVE.LENGTH
        self.k = cfg.CONTRASTIVE.QUEUE_LEN
        self.mmt = cfg.CONTRASTIVE.MOMENTUM
        self.momentum_annealing = cfg.CONTRASTIVE.MOMENTUM_ANNEALING
        self.duration = 1
        self.cfg = cfg
        self.num_gpus = cfg.NUM_GPUS
        self.l2_norm = Normalize()
        self.knn_num_imgs = 0
        self.knn_on = cfg.CONTRASTIVE.KNN_ON
        self.train_labels = np.zeros((0, ), dtype=np.int32)
        self.num_pos = 2
        self.num_crops = (self.cfg.DATA.TRAIN_CROP_NUM_TEMPORAL *
                          self.cfg.DATA.TRAIN_CROP_NUM_SPATIAL)
        self.nce_loss_fun = losses.get_loss_func("contrastive_loss")(
            reduction="mean")
        assert self.cfg.MODEL.LOSS_FUNC == "contrastive_loss"
        self.softmax = nn.Softmax(dim=1).cuda()

        if self.type == "mem":
            self.mem_type = cfg.CONTRASTIVE.MEM_TYPE
            if self.mem_type == "1d":
                self.memory = Memory1D(self.length, self.duration, self.dim,
                                       cfg)
            else:
                self.memory = Memory(self.length, self.duration, self.dim, cfg)
            self.examplar_type = "video"
            self.interp = cfg.CONTRASTIVE.INTERP_MEMORY
        elif self.type == "self":
            pass
        elif self.type == "moco" or self.type == "byol":
            # MoCo components
            self.backbone_hist = _MODEL_TYPES[cfg.MODEL.ARCH](cfg)
            for p in self.backbone_hist.parameters():
                p.requires_grad = False
            self.register_buffer("ptr", torch.tensor([0]))
            self.ptr.requires_grad = False
            stdv = 1.0 / math.sqrt(self.dim / 3)
            self.register_buffer(
                "queue_x",
                torch.rand(self.k, self.dim).mul_(2 * stdv).add_(-stdv),
            )
            self.register_buffer("iter", torch.zeros([1], dtype=torch.long))
            self._batch_shuffle_on = (False if
                                      ("sync" in cfg.BN.NORM_TYPE and
                                       cfg.BN.NUM_SYNC_DEVICES == cfg.NUM_GPUS)
                                      or self.type == "byol" else True)
        elif self.type == "swav":
            self.swav_use_public_code = True
            if self.swav_use_public_code:
                self.swav_prototypes = nn.Linear(
                    self.dim, 1000, bias=False)  # for orig implementation
            else:
                self.swav_prototypes = nn.Parameter(
                    torch.randn((self.dim, 1000), dtype=torch.float))
            self.swav_eps_sinkhorn = 0.05
            self.swav_use_the_queue = False
            # optionally starts a queue
            if self.cfg.CONTRASTIVE.SWAV_QEUE_LEN > 0:
                self.register_buffer(
                    "queue_swav",
                    torch.zeros(
                        2,  # = args.crops_for_assign
                        self.cfg.CONTRASTIVE.SWAV_QEUE_LEN //
                        du.get_world_size(),
                        self.dim,
                    ),
                )
        elif self.type == "simclr":
            self._simclr_precompute_pos_neg_mask_multi()
        self.simclr_dist_on = cfg.CONTRASTIVE.SIMCLR_DIST_ON

        # self.knn_mem = Memory1D(self.length, 1, self.dim, cfg) #  does not work
        if self.knn_on:
            self.knn_mem = Memory(self.length, 1, self.dim, cfg)

Пример #15

Показать файл

def train_epoch(train_loader,
                model,
                optimizer,
                train_meter,
                cur_epoch,
                cfg,
                writer=None):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (ClevrerTrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to writer Tensorboard log.
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, sampled_batch in enumerate(train_loader):
        frames = sampled_batch['frames']
        des_q = sampled_batch['question_dict']['des_q']
        des_ans = sampled_batch['question_dict']['des_ans']
        mc_q = sampled_batch['question_dict']['mc_q']
        mc_ans = sampled_batch['question_dict']['mc_ans']
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            frames = frames.cuda(non_blocking=True)
            des_q = des_q.cuda(non_blocking=True)
            des_ans = des_ans.cuda()
            mc_q = mc_q.cuda(non_blocking=True)
            mc_ans = mc_ans.cuda()

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()

        pred_des_ans = model(frames, des_q, True)
        pred_mc_ans = model(frames, mc_q, False)
        # Explicitly declare reduction to mean.
        des_loss_fun = losses.get_loss_func('cross_entropy')(reduction="mean")
        mc_loss_fun = losses.get_loss_func('bce_logit')(reduction="mean")
        # Compute the loss.
        loss = des_loss_fun(pred_des_ans, des_ans)
        loss += mc_loss_fun(pred_mc_ans, mc_ans)
        # check Nan Loss.
        misc.check_nan_losses(loss)
        #Check if plateau

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        top1_err, top5_err = None, None
        # Compute the errors.
        num_topks_correct = metrics.topks_correct(pred_des_ans, des_ans,
                                                  (1, 5))
        top1_err, top5_err = [(1.0 - x / pred_des_ans.size(0)) * 100.0
                              for x in num_topks_correct]
        diff_mc_ans = torch.abs(
            mc_ans - (torch.sigmoid(pred_mc_ans) >= 0.5).float())  #Errors
        mc_opt_err = 100 * torch.true_divide(diff_mc_ans.sum(),
                                             (4 * des_q.size()[0]))
        mc_q_err = 100 * torch.true_divide(
            (diff_mc_ans.sum(dim=1, keepdim=True) != 0).float().sum(),
            des_q.size()[0])
        # Gather all the predictions across all the devices.
        if cfg.NUM_GPUS > 1:
            loss, top1_err, top5_err, mc_opt_err, mc_q_err = du.all_reduce(
                [loss, top1_err, top5_err, mc_opt_err, mc_q_err])
        # Copy the stats from GPU to CPU (sync point).
        loss, top1_err, top5_err, mc_opt_err, mc_q_err = (loss.item(),
                                                          top1_err.item(),
                                                          top5_err.item(),
                                                          mc_opt_err.item(),
                                                          mc_q_err.item())

        # Update and log stats.
        train_meter.update_stats(
            top1_err,
            top5_err,
            mc_opt_err,
            mc_q_err,
            loss,
            lr,
            frames.size()[0] * max(
                cfg.NUM_GPUS, 1
            ),  # If running  on CPU (cfg.NUM_GPUS == 1), use 1 to represent 1 CPU.
        )
        # write to tensorboard format if available.
        if writer is not None:
            writer.add_scalars(
                {
                    "Train/loss": loss,
                    "Train/lr": lr,
                    "Train/Top1_err": top1_err,
                    "Train/Top5_err": top5_err,
                    "Train/mc_opt_err": mc_opt_err,
                    "Train/mc_q_err": mc_q_err,
                },
                global_step=data_size * cur_epoch + cur_iter,
            )

        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()

Пример #16

Показать файл

def train_epoch(train_loader,
                model,
                optimizer,
                train_meter,
                cur_epoch,
                cfg,
                writer=None):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to writer Tensorboard log.
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()

        # _________________________ save model test __________________________________________
        if cur_iter % 100 == 1:
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_iter,
                               cfg)  # cur_epoch
            print("----------------------- save done ")
            # exit(0)
            # _________________________________________________________________________________________

        if cfg.DETECTION.ENABLE:
            # inputs[4,3,8,224,224], preds[32,2048,7,7]
            # change {1,3,8,224,224]  ->  [8,3,224,224]
            ##################################################################################
            inputs0 = inputs[0].squeeze(0).permute(1, 0, 2, 3)
            inputs1 = inputs[1].squeeze(0).permute(1, 0, 2, 3)
            meta["boxes"] = meta["boxes"].unsqueeze(0).unsqueeze(0)
            inputs = [inputs0, inputs1]
            preds = model(inputs, meta["boxes"])
            # #################################################################################################################################
            # import os
            # weights = 'checkpoints/checkpoint_epoch_00007.pyth'
            # os.environ['CUDA_VISIBLE_DEVICES'] = '0'
            # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            # chkpt = torch.load(weights, map_location=device)

            # try:
            #     model_dict = model.module.state_dict()
            # except AttributeError:
            #     model_dict = model.state_dict()  # 读取原始状态及参数，                                         ## 多GPU训练，导致训练存储的模型时key会加上model
            #     # 将pretrained_dict里不属于model_dict的键剔除掉
            #     chkpt = {k: v for k, v in chkpt.items() if k in model_dict}
            # print("load pretrain model")
            # model_dict.update(chkpt)
            # model.load_state_dict(model_dict)

            # model.to(device)
            # # inputs = [inputs.to(device)]
            # model.eval()
            # input_tensor = (inputs, meta["boxes"].to(device))
            # traced_script_module = torch.jit.trace(model, input_tensor)
            # traced_script_module.save("weights/sf_pytorch.pt")
            # print("************************* out put save **********************************")
            # exit(0)


##############################################################################################
        else:
            preds = model(inputs)
        # Explicitly declare reduction to mean.
        loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

        # Compute the loss.
        loss = loss_fun(preds, labels)

        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            # Update and log stats.
            train_meter.update_stats(
                top1_err,
                top5_err,
                loss,
                lr,
                inputs[0].size(0) * max(
                    cfg.NUM_GPUS, 1
                ),  # If running  on CPU (cfg.NUM_GPUS == 1), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()

Пример #17

Показать файл

Файл: train_net.py Проект: porcelluscavia/auditory-slow-fast-thesis

def train_epoch(train_loader,
                model,
                optimizer,
                train_meter,
                cur_epoch,
                cfg,
                writer=None,
                wandb_log=False):
    """
    Perform the audio training for one epoch.
    Args:
        train_loader (loader): audio training loader.
        model (model): the audio model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to writer Tensorboard log.
    """
    # Enable train mode.
    model.train()
    if cfg.BN.FREEZE:
        model.module.freeze_fn(
            'bn_statistics') if cfg.NUM_GPUS > 1 else model.freeze_fn(
                'bn_statistics')

    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            if isinstance(labels, (dict, )):
                labels = {k: v.cuda() for k, v in labels.items()}
            else:
                labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()

        # preds = model(inputs) #this is how model.forward() is called
        preds = model(inputs)[
            0]  #this is the original output, the output of the last layer
        linear_layer_output = model(inputs)[1]

        if isinstance(labels, (dict, )):
            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(
                cfg.MODEL.LOSS_FUNC)(reduction="mean")

            # Compute the loss.
            loss_verb = loss_fun(preds[0], labels['verb'])
            loss_noun = loss_fun(preds[1], labels['noun'])
            loss = 0.5 * (loss_verb + loss_noun)

            # check Nan Loss.
            misc.check_nan_losses(loss)
        else:
            #I believe this is the VGG loss part, as the labels are not split into nouns and verbs

            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(
                cfg.MODEL.LOSS_FUNC)(reduction="mean")

            # Embedding loss function.
            emb_loss_fun = losses.get_loss_func(
                cfg.MODEL.EMB_LOSS_FUNC)(reduction="mean")

            # Compute the loss for the main model.
            loss = loss_fun(preds, labels)

            # Compute the loss for the embeddings.
            emb_loss = emb_loss_fun(linear_layer_output, word_embedding)

            # Add the losses together- use embeddings to fine tune the model's objective
            loss = loss + emb_loss

            # check Nan Loss.
            misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if isinstance(labels, (dict, )):
            # Compute the verb accuracies.
            verb_top1_acc, verb_top5_acc = metrics.topk_accuracies(
                preds[0], labels['verb'], (1, 5))

            # Gather all the predictions across all the devices.
            if cfg.NUM_GPUS > 1:
                loss_verb, verb_top1_acc, verb_top5_acc = du.all_reduce(
                    [loss_verb, verb_top1_acc, verb_top5_acc])

            # Copy the stats from GPU to CPU (sync point).
            loss_verb, verb_top1_acc, verb_top5_acc = (
                loss_verb.item(),
                verb_top1_acc.item(),
                verb_top5_acc.item(),
            )

            # Compute the noun accuracies.
            noun_top1_acc, noun_top5_acc = metrics.topk_accuracies(
                preds[1], labels['noun'], (1, 5))

            # Gather all the predictions across all the devices.
            if cfg.NUM_GPUS > 1:
                loss_noun, noun_top1_acc, noun_top5_acc = du.all_reduce(
                    [loss_noun, noun_top1_acc, noun_top5_acc])

            # Copy the stats from GPU to CPU (sync point).
            loss_noun, noun_top1_acc, noun_top5_acc = (
                loss_noun.item(),
                noun_top1_acc.item(),
                noun_top5_acc.item(),
            )

            # Compute the action accuracies.
            action_top1_acc, action_top5_acc = metrics.multitask_topk_accuracies(
                (preds[0], preds[1]), (labels['verb'], labels['noun']), (1, 5))
            # Gather all the predictions across all the devices.
            if cfg.NUM_GPUS > 1:
                loss, action_top1_acc, action_top5_acc = du.all_reduce(
                    [loss, action_top1_acc, action_top5_acc])

            # Copy the stats from GPU to CPU (sync point).
            loss, action_top1_acc, action_top5_acc = (
                loss.item(),
                action_top1_acc.item(),
                action_top5_acc.item(),
            )

            # Update and log stats.
            train_meter.update_stats(
                (verb_top1_acc, noun_top1_acc, action_top1_acc),
                (verb_top5_acc, noun_top5_acc, action_top5_acc),
                (loss_verb, loss_noun, loss),
                lr,
                inputs[0].size(0) * max(
                    cfg.NUM_GPUS, 1
                ),  # If running  on CPU (cfg.NUM_GPUS == 1), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None and not wandb_log:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_acc": action_top1_acc,
                        "Train/Top5_acc": action_top5_acc,
                        "Train/verb/loss": loss_verb,
                        "Train/noun/loss": loss_noun,
                        "Train/verb/Top1_acc": verb_top1_acc,
                        "Train/verb/Top5_acc": verb_top5_acc,
                        "Train/noun/Top1_acc": noun_top1_acc,
                        "Train/noun/Top5_acc": noun_top5_acc,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

            if wandb_log:
                wandb.log(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_acc": action_top1_acc,
                        "Train/Top5_acc": action_top5_acc,
                        "Train/verb/loss": loss_verb,
                        "Train/noun/loss": loss_noun,
                        "Train/verb/Top1_acc": verb_top1_acc,
                        "Train/verb/Top5_acc": verb_top5_acc,
                        "Train/noun/Top1_acc": noun_top1_acc,
                        "Train/noun/Top5_acc": noun_top5_acc,
                        "train_step": data_size * cur_epoch + cur_iter,
                    }, )
        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            # Update and log stats.
            train_meter.update_stats(
                top1_err,
                top5_err,
                loss,
                lr,
                inputs[0].size(0) * max(
                    cfg.NUM_GPUS, 1
                ),  # If running  on CPU (cfg.NUM_GPUS == 1), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None and not wandb_log:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

            if wandb_log:
                wandb.log(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                        "train_step": data_size * cur_epoch + cur_iter,
                    }, )

        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()

Пример #18

Показать файл

Файл: train_net.py Проект: porcelluscavia/auditory-slow-fast-thesis

def eval_epoch(val_loader,
               model,
               val_meter,
               cur_epoch,
               cfg,
               writer=None,
               wandb_log=False):
    """
    Evaluate the model on the val set.
    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ValMeter): meter instance to record and calculate the metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to writer Tensorboard log.
    """
    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    for cur_iter, (inputs, labels, _, meta) in enumerate(val_loader):
        if cfg.NUM_GPUS:
            # Transferthe data to the current GPU device.
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            if isinstance(labels, (dict, )):
                labels = {k: v.cuda() for k, v in labels.items()}
            else:
                labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)
        val_meter.data_toc()

        preds = model(inputs)

        if isinstance(labels, (dict, )):
            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(
                cfg.MODEL.LOSS_FUNC)(reduction="mean")

            # Compute the loss.
            loss_verb = loss_fun(preds[0], labels['verb'])
            loss_noun = loss_fun(preds[1], labels['noun'])
            loss = 0.5 * (loss_verb + loss_noun)

            # Compute the verb accuracies.
            verb_top1_acc, verb_top5_acc = metrics.topk_accuracies(
                preds[0], labels['verb'], (1, 5))

            # Combine the errors across the GPUs.
            if cfg.NUM_GPUS > 1:
                loss_verb, verb_top1_acc, verb_top5_acc = du.all_reduce(
                    [loss_verb, verb_top1_acc, verb_top5_acc])

            # Copy the errors from GPU to CPU (sync point).
            loss_verb, verb_top1_acc, verb_top5_acc = (
                loss_verb.item(),
                verb_top1_acc.item(),
                verb_top5_acc.item(),
            )

            # Compute the noun accuracies.
            noun_top1_acc, noun_top5_acc = metrics.topk_accuracies(
                preds[1], labels['noun'], (1, 5))

            # Combine the errors across the GPUs.
            if cfg.NUM_GPUS > 1:
                loss_noun, noun_top1_acc, noun_top5_acc = du.all_reduce(
                    [loss_noun, noun_top1_acc, noun_top5_acc])

            # Copy the errors from GPU to CPU (sync point).
            loss_noun, noun_top1_acc, noun_top5_acc = (
                loss_noun.item(),
                noun_top1_acc.item(),
                noun_top5_acc.item(),
            )

            # Compute the action accuracies.
            action_top1_acc, action_top5_acc = metrics.multitask_topk_accuracies(
                (preds[0], preds[1]), (labels['verb'], labels['noun']), (1, 5))
            # Combine the errors across the GPUs.
            if cfg.NUM_GPUS > 1:
                loss, action_top1_acc, action_top5_acc = du.all_reduce(
                    [loss, action_top1_acc, action_top5_acc])

            # Copy the errors from GPU to CPU (sync point).
            loss, action_top1_acc, action_top5_acc = (
                loss.item(),
                action_top1_acc.item(),
                action_top5_acc.item(),
            )

            val_meter.iter_toc()
            # Update and log stats.
            val_meter.update_stats(
                (verb_top1_acc, noun_top1_acc, action_top1_acc),
                (verb_top5_acc, noun_top5_acc, action_top5_acc),
                inputs[0].size(0) * max(
                    cfg.NUM_GPUS, 1
                ),  # If running  on CPU (cfg.NUM_GPUS == 1), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None and not wandb_log:
                writer.add_scalars(
                    {
                        "Val/loss": loss,
                        "Val/Top1_acc": action_top1_acc,
                        "Val/Top5_acc": action_top5_acc,
                        "Val/verb/loss": loss_verb,
                        "Val/verb/Top1_acc": verb_top1_acc,
                        "Val/verb/Top5_acc": verb_top5_acc,
                        "Val/noun/loss": loss_noun,
                        "Val/noun/Top1_acc": noun_top1_acc,
                        "Val/noun/Top5_acc": noun_top5_acc,
                    },
                    global_step=len(val_loader) * cur_epoch + cur_iter,
                )

            if wandb_log:
                wandb.log(
                    {
                        "Val/loss": loss,
                        "Val/Top1_acc": action_top1_acc,
                        "Val/Top5_acc": action_top5_acc,
                        "Val/verb/loss": loss_verb,
                        "Val/verb/Top1_acc": verb_top1_acc,
                        "Val/verb/Top5_acc": verb_top5_acc,
                        "Val/noun/loss": loss_noun,
                        "Val/noun/Top1_acc": noun_top1_acc,
                        "Val/noun/Top5_acc": noun_top5_acc,
                        "val_step": len(val_loader) * cur_epoch + cur_iter,
                    }, )

            val_meter.update_predictions((preds[0], preds[1]),
                                         (labels['verb'], labels['noun']))

        else:
            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(
                cfg.MODEL.LOSS_FUNC)(reduction="mean")

            # Compute the loss.
            loss = loss_fun(preds, labels)

            if cfg.DATA.MULTI_LABEL:
                if cfg.NUM_GPUS > 1:
                    preds, labels = du.all_gather([preds, labels])

            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))

                # Combine the errors across the GPUs.
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

                # Copy the errors from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

                val_meter.iter_toc()
                # Update and log stats.
                val_meter.update_stats(
                    top1_err,
                    top5_err,
                    inputs[0].size(0) * max(
                        cfg.NUM_GPUS, 1
                    ),  # If running  on CPU (cfg.NUM_GPUS == 1), use 1 to represent 1 CPU.
                )
                # write to tensorboard format if available.
                if writer is not None and not wandb_log:
                    writer.add_scalars(
                        {
                            "Val/loss": loss,
                            "Val/Top1_err": top1_err,
                            "Val/Top5_err": top5_err,
                        },
                        global_step=len(val_loader) * cur_epoch + cur_iter,
                    )

                if wandb_log:
                    wandb.log(
                        {
                            "Val/loss": loss,
                            "Val/Top1_err": top1_err,
                            "Val/Top5_err": top5_err,
                            "val_step": len(val_loader) * cur_epoch + cur_iter,
                        }, )

            val_meter.update_predictions(preds, labels)

        val_meter.log_iter_stats(cur_epoch, cur_iter)
        val_meter.iter_tic()

    # Log epoch stats.
    is_best_epoch, top1_dict = val_meter.log_epoch_stats(cur_epoch)
    # write to tensorboard format if available.
    if writer is not None:
        all_preds = [pred.clone().detach() for pred in val_meter.all_preds]
        all_labels = [label.clone().detach() for label in val_meter.all_labels]
        if cfg.NUM_GPUS:
            all_preds = [pred.cpu() for pred in all_preds]
            all_labels = [label.cpu() for label in all_labels]
        writer.plot_eval(preds=all_preds,
                         labels=all_labels,
                         global_step=cur_epoch)

    if writer is not None and not wandb_log:
        if "top1_acc" in top1_dict.keys():
            writer.add_scalars(
                {
                    "Val/epoch/Top1_acc": top1_dict["top1_acc"],
                    "Val/epoch/verb/Top1_acc": top1_dict["verb_top1_acc"],
                    "Val/epoch/noun/Top1_acc": top1_dict["noun_top1_acc"],
                },
                global_step=cur_epoch,
            )

        else:
            writer.add_scalars(
                {"Val/epoch/Top1_err": top1_dict["top1_err"]},
                global_step=cur_epoch,
            )

    if wandb_log:
        if "top1_acc" in top1_dict.keys():
            wandb.log(
                {
                    "Val/epoch/Top1_acc": top1_dict["top1_acc"],
                    "Val/epoch/verb/Top1_acc": top1_dict["verb_top1_acc"],
                    "Val/epoch/noun/Top1_acc": top1_dict["noun_top1_acc"],
                    "epoch": cur_epoch,
                }, )

        else:
            wandb.log({
                "Val/epoch/Top1_err": top1_dict["top1_err"],
                "epoch": cur_epoch
            })

    top1 = top1_dict["top1_acc"] if "top1_acc" in top1_dict.keys(
    ) else top1_dict["top1_err"]
    val_meter.reset()
    return is_best_epoch, top1

Пример #19

Показать файл

def train_epoch(train_loader,
                model,
                optimizer,
                scheduler,
                train_meter,
                cur_epoch,
                cfg,
                test_imp=False):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (ClevrerTrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, sampled_batch in enumerate(train_loader):
        frames = sampled_batch['res_ft']
        des_q = sampled_batch['question_dict']['question']
        des_ans = sampled_batch['question_dict']['ans']
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            frames = frames.cuda(non_blocking=True)
            des_q = des_q.cuda(non_blocking=True)
            des_ans = des_ans.cuda()

        train_meter.data_toc()
        #Pass through
        model.zero_grad()
        pred_des_ans = model(frames, des_q, True)
        des_loss_fun = losses.get_loss_func('cross_entropy')(reduction="mean")
        loss = des_loss_fun(pred_des_ans, des_ans)
        # check Nan Loss.
        misc.check_nan_losses(loss)
        #Backward pass
        loss.backward()
        optimizer.step()
        scheduler.step()
        #Save for stats
        loss_des_val = loss

        top1_err, top5_err = None, None
        # Compute the errors.
        num_topks_correct = metrics.topks_correct(pred_des_ans, des_ans,
                                                  (1, 5))
        top1_err, top5_err = [(1.0 - x / pred_des_ans.size(0)) * 100.0
                              for x in num_topks_correct]
        mc_opt_err, mc_q_err = None, None
        mb_size_mc = None
        loss_des_val, top1_err, top5_err = (loss_des_val.item(),
                                            top1_err.item(), top5_err.item())
        #top1_err, top5_err, mc_opt_err, mc_q_err, loss_des, loss_mc, lr, mb_size
        # Update and log stats.
        train_meter.update_stats(top1_err, top5_err, mc_opt_err, mc_q_err,
                                 loss_des_val, None, scheduler.get_last_lr(),
                                 des_ans.size(0), mb_size_mc)
        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()

Пример #20

Показать файл

def train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, cfg, cnt):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Enable train mode.
    model.train()
    if cfg.BN.FREEZE:
        model.freeze_fn('bn_statistics')

    train_meter.iter_tic()
    data_size = len(train_loader)

    #for cur_iter, (inputs, bboxs, masks, labels, _, meta) in enumerate(train_loader):
    for cur_iter, output_dict in enumerate(train_loader):
        if cfg.EPICKITCHENS.USE_BBOX:
            inputs = output_dict['inputs']
            bboxs = output_dict['bboxs']
            masks = output_dict['masks']
            labels = output_dict['label'] 
            # output_dict['index'] 
            meta = output_dict['metadata'] 
        else:
            inputs = output_dict['inputs']
            labels = output_dict['label'] 
            meta = output_dict['metadata'] 
        

        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list,)):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)
        if isinstance(labels, (dict,)):
            labels = {k: v.cuda() for k, v in labels.items()}
        else:
            labels = labels.cuda()
        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"])

        else:
            # Perform the forward pass.
            if cfg.EPICKITCHENS.USE_BBOX:
                if isinstance(bboxs, (list,)):
                    for i in range(len(bboxs)):
                        bboxs[i] = bboxs[i].cuda(non_blocking=True)
                        masks[i] = masks[i].cuda(non_blocking=True)
                else:
                    bboxs = bboxs.cuda(non_blocking=True)
                    masks = masks.cuda(non_blocking=True)
                
                preds = model(inputs, bboxes=bboxs, masks=masks)
            else:
                preds = model(inputs)

        if isinstance(labels, (dict,)):
            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")
            # Compute the loss.
            loss_verb = loss_fun(preds[0], labels['verb'])
            loss_noun = loss_fun(preds[1], labels['noun'])
            loss = 0.5 * (loss_verb + loss_noun)
            # check Nan Loss.
            misc.check_nan_losses(loss)
        else:
            # Explicitly declare reduction to mean.
            loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")
            # Compute the loss.
            loss = loss_fun(preds, labels)
            # check Nan Loss.
            misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
        else:
            if isinstance(labels, (dict,)):
                # Compute the verb accuracies.
                verb_top1_acc, verb_top5_acc = metrics.topk_accuracies(preds[0], labels['verb'], (1, 5))
                
                # predicted_answer_softmax = torch.nn.Softmax(dim=1)(preds[0])
                # predicted_answer_max = torch.max(predicted_answer_softmax.data, 1).indices
                # print(cnt, predicted_answer_max, labels['verb'])

                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss_verb, verb_top1_acc, verb_top5_acc = du.all_reduce(
                        [loss_verb, verb_top1_acc, verb_top5_acc]
                    )

                # Copy the stats from GPU to CPU (sync point).
                loss_verb, verb_top1_acc, verb_top5_acc = (
                    loss_verb.item(),
                    verb_top1_acc.item(),
                    verb_top5_acc.item(),
                )

                # Compute the noun accuracies.
                noun_top1_acc, noun_top5_acc = metrics.topk_accuracies(preds[1], labels['noun'], (1, 5))

                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss_noun, noun_top1_acc, noun_top5_acc = du.all_reduce(
                        [loss_noun, noun_top1_acc, noun_top5_acc]
                    )

                # Copy the stats from GPU to CPU (sync point).
                loss_noun, noun_top1_acc, noun_top5_acc = (
                    loss_noun.item(),
                    noun_top1_acc.item(),
                    noun_top5_acc.item(),
                )

                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss = du.all_reduce(
                        [loss]
                    )
                    if isinstance(loss, (list,)):
                        loss = loss[0]

                # Copy the stats from GPU to CPU (sync point).
                loss = loss.item()

                train_meter.iter_toc()
                # Update and log stats.
                train_meter.update_stats(
                    (verb_top1_acc, noun_top1_acc),
                    (verb_top5_acc, noun_top5_acc),
                    (loss_verb, loss_noun, loss),
                    lr, inputs[0].size(0) * cfg.NUM_GPUS
                )
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(preds, labels, (1, 5))
                top1_err, top5_err = [
                    (1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct
                ]

                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err]
                    )

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

                train_meter.iter_toc()
                # Update and log stats.
                train_meter.update_stats(
                    top1_err, top5_err, loss, lr, inputs[0].size(0) * cfg.NUM_GPUS
                )
        train_meter.log_iter_stats(cur_epoch, cur_iter, cnt)
        train_meter.iter_tic()
        cnt += 1
    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()
    return cnt

Пример #21

Показать файл

Файл: train_net.py Проект: serre-lab/pred_gn

def train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, writer,
                nep, cfg):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    global_iters = data_size * cur_epoch
    for cur_iter, (inputs, labels, _, meta) in enumerate(train_loader):

        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list, )):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)

        else:
            inputs = inputs.cuda(non_blocking=True)

        if len(inputs[i].shape) > 5:

            labels = torch.repeat_interleave(labels, inputs[i].size(1), 0)

        for i in range(len(inputs)):
            if len(inputs[i].shape) > 5:

                inputs[i] = inputs[i].view((-1, ) + inputs[i].shape[2:])

        labels = labels.cuda()
        for key, val in meta.items():
            if isinstance(val, (list, )):
                for i in range(len(val)):
                    val[i] = val[i].cuda(non_blocking=True)
            else:
                meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size,
                                global_iters, cfg)
        optim.set_lr(optimizer, lr)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"])

        else:
            # Perform the forward pass.
            if 'masks' in meta:
                preds = model((inputs, meta['masks']))
            else:
                preds = model(inputs)

        ####################################################################################################################################
        # check activations
        ####################################################################################################################################
        # if writer is not None and global_iters%cfg.SUMMARY_PERIOD==0:

        #     bu_errors = preds['bu_errors']#.cpu()#.data.numpy().squeeze()

        #     for layer in range(len(bu_errors)):
        #         images = bu_errors[layer].transpose(1,2).transpose(0,1)
        #         images = (images-images.min())
        #         images = images/images.max()
        #         images = images.reshape((-1,) + images.shape[2:])

        #         # grid = tv.utils.make_grid(images, nrow=18, normalize=True)
        #         # writer.add_image('activations/bu_error_l%d'%layer, grid, global_iters)

        #         tv.utils.save_image(images, os.path.join(cfg.OUTPUT_DIR, 'preds_%d_bu_errors_l%d.jpg'%(global_iters,layer)), nrow=18, normalize=True)

        #     mix_out = preds['mix_layer']#.cpu().data.numpy().squeeze()
        #     for layer in range(len(mix_out)):

        #         images = mix_out[layer].transpose(1,2).transpose(0,1)
        #         images = images.reshape((-1,) + images.shape[2:])
        #         images = (images-images.min())
        #         images = images/images.max()
        #         # grid = tv.utils.make_grid(images, nrow=18, normalize=True)
        #         # writer.add_image('activations/mix_layer_l%d'%layer, grid, global_iters)
        #         # tv.utils.save_image(images, os.path.join(cfg.OUTPUT_DIR, 'example_%d_mix_layer_l%d.jpg'%(i,layer)), nrow=18, normalize=True)

        #         tv.utils.save_image(images, os.path.join(cfg.OUTPUT_DIR, 'preds_%d_mix_layer_l%d.jpg'%(global_iters,layer)), nrow=18, normalize=True)

        #     inhibition = preds['H_inh']#.cpu()#.data.numpy().squeeze()
        #     for layer in range(len(inhibition)):
        #         images = inhibition[layer].transpose(1,2).transpose(0,1)
        #         images = (images-images.min())
        #         images = images/images.max()
        #         images = images.reshape((-1,) + images.shape[2:])
        #         # grid = tv.utils.make_grid(images, nrow=18, normalize=True)
        #         # writer.add_image('activations/H_inh_l%d'%layer, grid, global_iters)
        #         tv.utils.save_image(images, os.path.join(cfg.OUTPUT_DIR, 'preds_%d_H_inh_l%d.jpg'%(global_iters,layer)), nrow=18, normalize=True)

        #     hidden = preds['hidden']#.cpu()#.data.numpy().squeeze()
        #     for layer in range(len(hidden)):
        #         images = hidden[layer].transpose(1,2).transpose(0,1)
        #         images = (images-images.min())
        #         images = images/images.max()
        #         images = images.reshape((-1,) + images.shape[2:])
        #         # grid = tv.utils.make_grid(images, nrow=18, normalize=True)
        #         # writer.add_image('activations/hidden_l%d'%layer, grid, global_iters)
        #         tv.utils.save_image(images, os.path.join(cfg.OUTPUT_DIR, 'preds_%d_hidden_l%d.jpg'%(global_iters,layer)), nrow=18, normalize=True)

        out_keys = preds.keys()
        total_loss = 0

        if cfg.PREDICTIVE.ENABLE:

            errors = preds['pred_errors']
            if 'frame_errors' in preds:
                frame_errors = preds['frame_errors']

            if 'IoU' in preds:
                iou = preds['IoU']
            if 'Acc' in preds:
                acc = preds['Acc']

            pred_loss = errors.mean()
            total_loss += pred_loss

            # if 'frame_errors' in out_keys:
            #     total_loss += frame_errors
            # copy_baseline = F.smooth_l1_loss(inputs[i][:,:,1:] - inputs[i][:,:,:-1], torch.zeros_like(inputs[i][:,:,1:]))
            # copy_baseline = F.l1_loss(inputs[i][:,:,1:] - inputs[i][:,:,:-1], torch.zeros_like(inputs[i][:,:,1:]))

        if cfg.PREDICTIVE.CPC:
            cpc_loss = preds['cpc_loss']
            total_loss += cpc_loss

        if 'cbp_penalty' in preds:
            penalty = preds['cbp_penalty']
            total_loss += penalty

        if cfg.SUPERVISED:
            preds = preds['logits']

            if cfg.MODEL.LOSS_FUNC != '':
                # Explicitly declare reduction to mean.
                loss_fun = losses.get_loss_func(
                    cfg.MODEL.LOSS_FUNC)(reduction="mean")

                # Compute the loss.
                loss = loss_fun(preds, labels)

                total_loss += loss

        # check Nan Loss.
        misc.check_nan_losses(total_loss)

        # Perform the backward pass.
        optimizer.zero_grad()

        total_loss.backward()

        ####################################################################################################################################
        # check gradients
        if writer is not None and global_iters % cfg.SUMMARY_PERIOD == 0:
            n_p = model.module.named_parameters() if hasattr(
                model, 'module') else model.named_parameters()
            fig = viz_helpers.plot_grad_flow_v2(n_p)
            writer.add_figure('grad_flow/grad_flow', fig, global_iters)
        ####################################################################################################################################

        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(lr,
                                     inputs[0].size(0) * cfg.NUM_GPUS,
                                     loss=loss)
        else:
            if cfg.SUPERVISED:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]

            # Gather all the predictions across all the devices.
            if cfg.NUM_GPUS > 1:

                if cfg.PREDICTIVE.ENABLE:
                    pred_loss = du.all_reduce([pred_loss])
                    pred_loss = pred_loss[0]
                    if 'frame_errors' in out_keys:
                        frame_errors = du.all_reduce([frame_errors])[0]
                    if 'IoU' in preds:
                        iou = du.all_reduce([iou])[0]
                    if 'Acc' in preds:
                        acc = du.all_reduce([acc])[0]
                    # copy_baseline = du.all_reduce([copy_baseline])
                    # copy_baseline = copy_baseline[0]

                if cfg.PREDICTIVE.CPC:
                    cpc_loss = du.all_reduce([cpc_loss])
                    cpc_loss = cpc_loss[0]
                if cfg.SUPERVISED:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

                if 'cbp_penalty' in out_keys:
                    penalty = du.all_reduce([penalty])[0]

            loss_logs = {}
            if cfg.PREDICTIVE.ENABLE:
                pred_loss = pred_loss.item()
                loss_logs['loss_pred'] = pred_loss
                if 'frame_errors' in out_keys:
                    frame_errors = frame_errors.item()
                    loss_logs['frame_errors'] = frame_errors

                if 'IoU' in preds:
                    loss_logs['IoU'] = iou.item()
                if 'Acc' in preds:
                    loss_logs['Acc'] = acc.item()
                # copy_baseline = copy_baseline.item()
                # loss_logs['copy_comp'] = copy_baseline
            if cfg.PREDICTIVE.CPC:
                cpc_loss = cpc_loss.item()
                loss_logs['loss_cpc'] = cpc_loss

            if cfg.SUPERVISED:
                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

                loss_logs['loss_class'] = loss
                loss_logs['top5_err'] = top5_err
                loss_logs['top1_err'] = top1_err

            if 'cbp_penalty' in out_keys:
                loss_logs['cbp_penalty'] = penalty.item()

            train_meter.iter_toc()
            # Update and log stats.
            train_meter.update_stats(lr, inputs[0].size(0) * cfg.NUM_GPUS,
                                     **loss_logs)

            if writer is not None and global_iters % cfg.LOG_PERIOD == 0:
                for k, v in loss_logs.items():
                    writer.add_scalar('loss/' + k.strip('loss_'),
                                      train_meter.stats[k].get_win_median(),
                                      global_iters)
            if nep is not None and global_iters % cfg.LOG_PERIOD == 0:
                for k, v in loss_logs.items():
                    nep.log_metric(k.strip('loss_'),
                                   train_meter.stats[k].get_win_median())

                nep.log_metric('global_iters', global_iters)

                # writer.add_scalar('loss/top1_err', train_meter.mb_top1_err.get_win_median(), global_iters)
                # writer.add_scalar('loss/top5_err', train_meter.mb_top5_err.get_win_median(), global_iters)
                # writer.add_scalar('loss/loss', train_meter.loss.get_win_median(), global_iters)
            if global_iters % cfg.SUMMARY_PERIOD == 0 and du.get_rank(
            ) == 0 and du.is_master_proc(num_gpus=cfg.NUM_GPUS):

                with torch.no_grad():
                    # logger.info(inputs[i].shape)
                    # sys.stdout.flush()
                    inputs[0] = inputs[0][:min(3, len(inputs[0]))]
                    if 'masks' in meta:
                        frames = model(
                            (inputs, meta['masks'][:min(3, len(inputs[0]))]),
                            extra=['frames'])['frames']
                    else:
                        frames = model(inputs, extra=['frames'])['frames']

                    n_rows = inputs[0].size(2) - 1

                    inputs = inputs[0].transpose(1, 2)[:, -n_rows:]
                    frames = frames.transpose(1, 2)[:, -n_rows:]

                    inputs = inputs * inputs.new(
                        cfg.DATA.STD)[None, None, :, None, None] + inputs.new(
                            cfg.DATA.MEAN)[None, None, :, None, None]
                    frames = frames * frames.new(
                        cfg.DATA.STD)[None, None, :, None, None] + frames.new(
                            cfg.DATA.MEAN)[None, None, :, None, None]
                    images = torch.cat([inputs, frames],
                                       1).reshape((-1, ) + inputs.shape[2:])

                # grid = tv.utils.make_grid(images, nrow=8, normalize=True)
                # writer.add_image('predictions', images, global_iters)

                tv.utils.save_image(
                    images,
                    os.path.join(cfg.OUTPUT_DIR,
                                 'preds_%d.jpg' % global_iters),
                    nrow=n_rows,
                    normalize=True)

                # del images
                # del frames
                # del inputs

        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

        global_iters += 1

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()

Пример #22

Показать файл

Файл: train_net_des.py Проект: gabrielsluz/SlowFast

def train_epoch(
    train_loader, model, optimizer, train_meter, cur_epoch, cfg, test_imp=False
):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (ClevrerTrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    test_counter = 0
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)
    
    for cur_iter, sampled_batch in enumerate(train_loader): 
        frames = sampled_batch['frames']
        des_q = sampled_batch['question_dict']['question']
        des_ans = sampled_batch['question_dict']['ans']
        # des_len = sampled_batch['question_dict']['len']
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(frames, (list,)):
                for i in range(len(frames)):
                    frames[i] = frames[i].cuda(non_blocking=True)
            else:
                frames = frames.cuda(non_blocking=True)
            des_q = des_q.cuda(non_blocking=True)
            des_ans = des_ans.cuda()
            # des_len = des_len.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()

        #Separated batches
        #Des
        pred_des_ans = model(frames, des_q, True)
        des_loss_fun = losses.get_loss_func('cross_entropy')(reduction="mean")
        loss = des_loss_fun(pred_des_ans, des_ans)
        # check Nan Loss.
        misc.check_nan_losses(loss)
        #Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #Save for stats
        loss_des_val = loss

        top1_err, top5_err = None, None
        # Compute the errors.
        num_topks_correct = metrics.topks_correct(pred_des_ans, des_ans, (1, 5))
        top1_err, top5_err = [
            (1.0 - x / pred_des_ans.size(0)) * 100.0 for x in num_topks_correct
        ]
        mc_opt_err, mc_q_err = None, None
        mb_size_mc = None
        loss_des_val, top1_err, top5_err = (
            loss_des_val.item(),
            top1_err.item(),
            top5_err.item()
        )
        #top1_err, top5_err, mc_opt_err, mc_q_err, loss_des, loss_mc, lr, mb_size
        # Update and log stats.
        train_meter.update_stats(
            top1_err,
            top5_err,
            mc_opt_err,
            mc_q_err,
            loss_des_val,
            None,
            lr,
            des_q.size()[0],
            mb_size_mc
        )
        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()


        #For testing implementation
        if test_imp:
            print(" --- Descriptive questions results --- ")
            # print("Des_q")
            # print(des_q)
            print("Des_ans")
            print(des_ans)
            #print("Des_ans_pred")
            #print(pred_des_ans)
            print("Argmax => prediction")
            print(torch.argmax(pred_des_ans, dim=1, keepdim=False))
            print("Top1_err and Top5err")
            print(top1_err, top5_err)
            print("Loss_des_val = {}".format(loss_des_val))
            test_counter += 1
            if test_counter == 1: 
                break

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()

Пример #23

Показать файл

Файл: train_net.py Проект: serre-lab/pred_gn

def eval_epoch(val_loader, model, val_meter, cur_epoch, nep, cfg):
    """
    Evaluate the model on the val set.
    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ValMeter): meter instance to record and calculate the metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """

    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    for cur_iter, (inputs, labels, _, meta) in enumerate(val_loader):
        # Transferthe data to the current GPU device.
        if isinstance(inputs, (list, )):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda(non_blocking=True)
        else:
            inputs = inputs.cuda(non_blocking=True)
        labels = labels.cuda()
        for key, val in meta.items():
            if isinstance(val, (list, )):
                for i in range(len(val)):
                    val[i] = val[i].cuda(non_blocking=True)
            else:
                meta[key] = val.cuda(non_blocking=True)

        if cfg.DETECTION.ENABLE:
            # Compute the predictions.
            preds = model(inputs, meta["boxes"])

            preds = preds.cpu()
            ori_boxes = meta["ori_boxes"].cpu()
            metadata = meta["metadata"].cpu()

            if cfg.NUM_GPUS > 1:
                preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
                ori_boxes = torch.cat(du.all_gather_unaligned(ori_boxes),
                                      dim=0)
                metadata = torch.cat(du.all_gather_unaligned(metadata), dim=0)

            val_meter.iter_toc()
            # Update and log stats.
            val_meter.update_stats(preds.cpu(), ori_boxes.cpu(),
                                   metadata.cpu())

        else:

            preds = model(inputs)
            aux_loss_keys = []
            if cfg.PREDICTIVE.ENABLE:
                aux_loss_keys.append('pred_errors')
                errors = preds['pred_errors']
                pred_loss = errors.mean()

            if 'frame_errors' in preds:
                aux_loss_keys.append('frame_errors')
                frame_errors = preds['frame_errors']

            if cfg.PREDICTIVE.CPC:
                aux_loss_keys.append('cpc_loss')
                cpc_loss = preds['cpc_loss']

            if cfg.SUPERVISED:
                preds = preds['logits']

            # Explicitly declare reduction to mean.
            if cfg.MODEL.LOSS_FUNC != '' and cfg.SUPERVISED:
                loss_fun = losses.get_loss_func(
                    cfg.MODEL.LOSS_FUNC)(reduction="mean")

                # Compute the loss.
                loss = loss_fun(preds, labels)
                # total_loss = total_loss + loss

                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))

                # Combine the errors across the GPUs.
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]

            # Gather all the predictions across all the devices.
            if cfg.NUM_GPUS > 1:
                if cfg.PREDICTIVE.ENABLE:
                    pred_loss = du.all_reduce([pred_loss])[0]

                if cfg.PREDICTIVE.CPC:
                    cpc_loss = du.all_reduce([cpc_loss])[0]

                if cfg.SUPERVISED:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

            # # Copy the stats from GPU to CPU (sync point).
            # loss, top1_err, top5_err = (
            #     loss.item(),
            #     top1_err.item(),
            #     top5_err.item(),
            # )

            # if cfg.NUM_GPUS > 1:
            #     top1_err, top5_err = du.all_reduce([top1_err, top5_err])

            # Copy the errors from GPU to CPU (sync point).
            loss_logs = {}
            if 'loss_pred' in aux_loss_keys:
                loss_logs['loss_pred'] = pred_loss.item()

            if 'frame_errors' in aux_loss_keys:
                loss_logs['frame_errors'] = frame_errors.item()

            if 'loss_cpc' in aux_loss_keys:
                loss_logs['loss_cpc'] = cpc_loss.item()

            if cfg.SUPERVISED:
                loss_logs['loss_class'] = loss.item()
                loss_logs['top1_err'] = top1_err.item()
                loss_logs['top5_err'] = top5_err.item()

            val_meter.iter_toc()
            # Update and log stats.
            val_meter.update_stats(inputs[0].size(0) * cfg.NUM_GPUS,
                                   **loss_logs)

        val_meter.log_iter_stats(cur_epoch, cur_iter)
        val_meter.iter_tic()

    # neptune update
    if nep is not None:
        for k, v in loss_logs.items():
            nep.log_metric('val_' + k.strip('loss_'),
                           val_meter.stats[k].get_global_avg())

    # Log epoch stats.
    val_meter.log_epoch_stats(cur_epoch)
    val_meter.reset()

Пример #24

Показать файл

Файл: train_net.py Проект: bqhuyy/SlowFast

def train_epoch(train_loader,
                model,
                optimizer,
                train_meter,
                cur_epoch,
                cfg,
                writer=None):
    """
    Perform the video training for one epoch.
    Args:
        train_loader (loader): video training loader.
        model (model): the video model to train.
        optimizer (optim): the optimizer to perform optimization on the model's
            parameters.
        train_meter (TrainMeter): training meters to log the training performance.
        cur_epoch (int): current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        writer (TensorboardWriter, optional): TensorboardWriter object
            to writer Tensorboard log.
    """
    # Enable train mode.
    model.train()
    train_meter.iter_tic()
    data_size = len(train_loader)

    for cur_iter, (inputs, labels, _, meta, boxes,
                   b_indices) in enumerate(train_loader):
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            labels = labels.cuda()
            for key, val in meta.items():
                if isinstance(val, (list, )):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(non_blocking=True)
                else:
                    meta[key] = val.cuda(non_blocking=True)

        # Update the learning rate.
        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)
        optim.set_lr(optimizer, lr)

        train_meter.data_toc()

        if cfg.DETECTION.ENABLE:
            preds = model(inputs, meta["boxes"])
        else:
            preds = model(inputs)
        # Explicitly declare reduction to mean.
        loss_fun = losses.get_loss_func(cfg.MODEL.LOSS_FUNC)(reduction="mean")

        # Compute the loss.
        loss = loss_fun(preds, labels)

        # check Nan Loss.
        misc.check_nan_losses(loss)

        # Perform the backward pass.
        optimizer.zero_grad()
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if cfg.DETECTION.ENABLE:
            if cfg.NUM_GPUS > 1:
                loss = du.all_reduce([loss])[0]
            loss = loss.item()

            # Update and log stats.
            train_meter.update_stats(None, None, None, loss, lr)
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        else:
            top1_err, top5_err = None, None
            if cfg.DATA.MULTI_LABEL:
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    [loss] = du.all_reduce([loss])
                loss = loss.item()
            else:
                # Compute the errors.
                num_topks_correct = metrics.topks_correct(
                    preds, labels, (1, 5))
                top1_err, top5_err = [(1.0 - x / preds.size(0)) * 100.0
                                      for x in num_topks_correct]
                # Gather all the predictions across all the devices.
                if cfg.NUM_GPUS > 1:
                    loss, top1_err, top5_err = du.all_reduce(
                        [loss, top1_err, top5_err])

                # Copy the stats from GPU to CPU (sync point).
                loss, top1_err, top5_err = (
                    loss.item(),
                    top1_err.item(),
                    top5_err.item(),
                )

            # Update and log stats.
            train_meter.update_stats(
                top1_err,
                top5_err,
                loss,
                lr,
                inputs[0].size(0) * max(
                    cfg.NUM_GPUS, 1
                ),  # If running  on CPU (cfg.NUM_GPUS == 1), use 1 to represent 1 CPU.
            )
            # write to tensorboard format if available.
            if writer is not None:
                writer.add_scalars(
                    {
                        "Train/loss": loss,
                        "Train/lr": lr,
                        "Train/Top1_err": top1_err,
                        "Train/Top5_err": top5_err,
                    },
                    global_step=data_size * cur_epoch + cur_iter,
                )

        train_meter.iter_toc()  # measure allreduce for this meter
        train_meter.log_iter_stats(cur_epoch, cur_iter)
        train_meter.iter_tic()

    # Log epoch stats.
    train_meter.log_epoch_stats(cur_epoch)
    train_meter.reset()

Пример #25

Показать файл

Файл: train_net.py Проект: gabrielsluz/SlowFast

def eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, test_imp=False):
    """
    Evaluate the model on the val set.
    Args:
        val_loader (loader): data loader to provide validation data.
        model (model): model to evaluate the performance.
        val_meter (ClevrerValMeter): meter instance to record and calculate the metrics.
        cur_epoch (int): number of the current epoch of training.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    test_counter = 0
    # Evaluation mode enabled. The running stats would not be updated.
    model.eval()
    val_meter.iter_tic()

    for cur_iter, sampled_batch in enumerate(val_loader):
        #Samples 2 batches. One for des and one for mc
        #There are much more des, then some batches are only des
        des_batch = sampled_batch['des']
        des_q = des_batch['question_dict']['question']
        des_ans = des_batch['question_dict']['ans']
        des_len = des_batch['question_dict']['len']
        # Transfer the data to the current GPU device.
        if cfg.NUM_GPUS:
            des_q = des_q.cuda(non_blocking=True)
            des_ans = des_ans.cuda()
            des_len = des_len.cuda(non_blocking=True)

        has_mc = sampled_batch['has_mc'][0]
        if has_mc:
            mc_batch = sampled_batch['mc']
            mc_q = mc_batch['question_dict']['question']
            mc_ans = mc_batch['question_dict']['ans']
            mc_len = mc_batch['question_dict']['len']
            if cfg.NUM_GPUS:
                mc_q = mc_q.cuda(non_blocking=True)
                mc_ans = mc_ans.cuda()
                mc_len = mc_len.cuda(non_blocking=True)

        val_meter.data_toc()

        # Explicitly declare reduction to mean.
        des_loss_fun = losses.get_loss_func('cross_entropy')(reduction="mean")
        mc_loss_fun = losses.get_loss_func('bce_logit')(reduction="mean")

        pred_des_ans = model(des_q, True)
        loss_des_val = des_loss_fun(pred_des_ans, des_ans)

        loss_mc_val = None
        if has_mc:
            pred_mc_ans = model(mc_q, False)
            loss_mc_val = mc_loss_fun(pred_mc_ans, mc_ans)

        # Compute the errors.
        num_topks_correct = metrics.topks_correct(pred_des_ans, des_ans,
                                                  (1, 5))
        # Combine the errors across the GPUs.
        top1_err, top5_err = [(1.0 - x / pred_des_ans.size(0)) * 100.0
                              for x in num_topks_correct]
        if has_mc:
            diff_mc_ans = torch.abs(
                mc_ans - (torch.sigmoid(pred_mc_ans) >= 0.5).float())  #Errors
            mc_opt_err = 100 * torch.true_divide(diff_mc_ans.sum(),
                                                 (4 * mc_q.size()[0]))
            mc_q_err = 100 * torch.true_divide(
                (diff_mc_ans.sum(dim=1, keepdim=True) != 0).float().sum(),
                mc_q.size()[0])
            # Copy the stats from GPU to CPU (sync point).
            loss_des_val, loss_mc_val, top1_err, top5_err, mc_opt_err, mc_q_err = (
                loss_des_val.item(), loss_mc_val.item(), top1_err.item(),
                top5_err.item(), mc_opt_err.item(), mc_q_err.item())
            mb_size_mc = mc_q.size()[0]
        else:
            mc_opt_err, mc_q_err = None, None
            mb_size_mc = None
            loss_des_val, top1_err, top5_err = (loss_des_val.item(),
                                                top1_err.item(),
                                                top5_err.item())

        val_meter.iter_toc()
        #top1_err, top5_err, mc_opt_err, mc_q_err, loss_des, loss_mc, mb_size_des, mb_size_mc
        # Update and log stats.
        val_meter.update_stats(top1_err, top5_err, mc_opt_err, mc_q_err,
                               loss_des_val, loss_mc_val,
                               des_q.size()[0], mb_size_mc)
        val_meter.log_iter_stats(cur_epoch, cur_iter)
        val_meter.iter_tic()

        #For testing implementation
        if test_imp:
            print(" --- Descriptive questions results --- ")
            # print("Des_q")
            # print(des_q)
            print("Des_ans")
            print(des_ans)
            #print("Des_ans_pred")
            #print(pred_des_ans)
            print("Argmax => prediction")
            print(torch.argmax(pred_des_ans, dim=1, keepdim=False))
            print("Top1_err and Top5err")
            print(top1_err, top5_err)
            print("Loss_des_val = {}".format(loss_des_val))
            if has_mc:
                print(" --- Multiple Choice questions results --- ")
                # print("Mc_q")
                # print(mc_q)
                # print("Mc errors pred x ans")
                # print(torch.abs(mc_ans - (torch.sigmoid(pred_mc_ans) >= 0.5).float()))
                print("mc_opt_err = {} \nmc_q_err = {}".format(
                    mc_opt_err, mc_q_err))
                print("Loss_mc_val = {}".format(loss_mc_val))
            test_counter += 1
            if test_counter == 4:
                break

    # Log epoch stats.
    val_meter.log_epoch_stats(cur_epoch)
    val_meter.reset()