Example #1
    def val_epoch(self, epoch):
        model_with_loss = self.model_with_loss
        model_with_loss.eval()
        data_time, batch_time = AverageMeter(), AverageMeter()
        avg_loss_stats = {l: AverageMeter() for l in self.loss_stats}
        end = time.time()
        for iter_id, batch in enumerate(self.val_loader):
            show_str = '[%d/%d/%d] ' % (epoch + 1, iter_id + 1,
                                        self.num_val_iter)
            data_time.update(time.time() - end)
            with torch.no_grad():
                for k in batch:
                    batch[k] = batch[k].to(device=self.config.TRAIN['DEVICE'],
                                           non_blocking=True)
                loss, loss_stats = model_with_loss(batch)
            batch_time.update(time.time() - end)
            end = time.time()
            for l in avg_loss_stats:
                avg_loss_stats[l].update(loss_stats[l].mean().item(),
                                         batch['input'].size(0))
                self.writer.add_scalar('val/' + l, avg_loss_stats[l].avg,
                                       epoch * self.num_val_iter + iter_id)
                show_str += ' {}:{:0.4}   '.format(l, avg_loss_stats[l].avg)
            print(show_str)
        save_checkpoint(
            model_with_loss.model,
            self.config.TRAIN['CHECKPOINT'] + '/model_%d.pth' % epoch)
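This save_checkpoint receives only the bare model and a target path, so it presumably just serializes the weights. A minimal sketch under that assumption (the helper body below is an assumption, not this project's code):

import torch

def save_checkpoint(model, path):
    # Assumed implementation: persist only the weights. Saving the
    # state_dict keeps the file loadable without the model class
    # importable under the same module path.
    torch.save(model.state_dict(), path)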
Example #2
    def run(self):
        # rebuild the LR scheduler, resuming from the last checkpointed epoch
        self.scheduler = get_scheduler(self.config, self.optimizer,
                                       self.last_epoch)
        self.model.train()
        postfix_dic = {
            'lr': 0.0,
            'acc': 0.0,
            'loss': 0.0,
        }

        if self.config.data.sampler == "weight":
            self.train_weigh()
        else:
            for epoch in range(self.last_epoch, self.num_epochs):

                self.train_single_epoch(epoch)

                if epoch % 200 == 199:
                    save_checkpoint(self.config, self.model, self.optimizer,
                                    self.optimizer_center, epoch, self.step)

                self.scheduler.step()
                if epoch > self.config.train.num_epochs:
                    break
Example #3
def fit_model(
    model,
    n_epoch,
    dev_dataloader,
    optimizer,
    criterion,
    loss_fn,
    metric_fn,
    val_dataloader=None,
    checkpoint=False,
    model_fn="pytorch",
):
    n_dev_obs, dev_batch_size, dev_batch_per_epoch = get_batch_info(
        dev_dataloader)
    for idx_epoch in tqdm(range(n_epoch), total=n_epoch):
        t = tqdm(enumerate(dev_dataloader), total=dev_batch_per_epoch)
        for idx_batch, data in t:
            model = model.train()
            loss = loss_fn(model, criterion, data)
            train_step(optimizer, loss)
            with torch.no_grad():
                model = model.eval()
                metric = metric_fn(model, data)
            t.set_postfix({"loss": loss.item(), "metric": metric.item()})
        if val_dataloader is not None:
            val_loss, val_metric = validate_model(model, criterion, loss_fn,
                                                  metric_fn, val_dataloader)
            print(" val_loss : {}, val_metric : {}".format(
                val_loss, val_metric))
        if checkpoint:
            model_filename = "{}_{}".format(model_fn, idx_epoch)
            save_checkpoint(model, optimizer, model_filename)
    return model
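fit_model delegates the actual update to train_step(optimizer, loss), which is not shown. A plausible sketch, assuming the conventional zero-grad/backward/step sequence:

def train_step(optimizer, loss):
    # Assumed helper: one standard PyTorch parameter update.
    optimizer.zero_grad()  # clear gradients left over from the last step
    loss.backward()        # backpropagate the current batch loss
    optimizer.step()       # apply the update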
Example #4
    def postEpoch(self, epoch, optimizer, trainData: EpochData, validData: EpochData):
        logger = self.getLogger()
        model = self.getModel()

        trainDataRow = trainData.summaryDataRow()
        validDataRow = validData.summaryDataRow()

        # add epoch number
        trainDataRow[self.epochNumKey] = epoch
        # add learning rate
        trainDataRow[self.lrKey] = self.formats[self.lrKey](optimizer.param_groups[0]['lr'])
        # add flops ratio
        trainDataRow[self.flopsRatioKey] = self.formats[self.flopsRatioKey](model.flopsRatio())

        # merge trainDataRow with validDataRow
        for k, v in validDataRow.items():
            trainDataRow[k] = v

        # save model checkpoint
        save_checkpoint(self.getTrainFolderPath(), model, optimizer, validData.accDict())

        # add data to main logger table
        logger.addDataRow(trainDataRow)

        # select new path for next epoch
        self._selectNewPath()
Example #5
    def train_weigh(self):
        acc_sample = 0
        count_all = 0
        all_loss = 0
        all_center_loss = 0
        total_num = len(self.dataset)
        batch_size = self.config.train.batch_size.batch1 * self.config.train.batch_size.batch2

        step_num = math.ceil(total_num / batch_size)

        epoch = self.last_epoch
        iteration = epoch * step_num
        # print("step number is ", step_num)
        for seq, vID, label, _ in self.data_loader:
            iteration += 1
            count_all += len(label)
            acc_i, loss, loss_center = self.train_sigle_iteration(seq, label)
            all_loss += loss
            all_center_loss += loss_center
            acc_sample += acc_i

            if iteration % step_num == step_num - 1:
                self.scheduler.step()
                if self.scheduler_center is not None:
                    self.scheduler_center.step()
                epoch += 1

                if (epoch % self.config.train.save_step) == (
                        self.config.train.save_step - 1):
                    print("save loss log image")
                    self.plot_loss()
                    save_checkpoint(self.config, self.model, self.optimizer,
                                    self.center_model, self.optimizer_center,
                                    epoch, self.step)

                if self.writer is not None:
                    self.writer.add_scalar("train_loss", all_loss, epoch)

                acc_epoch = acc_sample * 1.0 / count_all
                if self.center_model is not None:
                    print(
                        "training epoch {}: the acc is {}%,\n"
                        "the cross-entropy loss is {}, the center loss is {}"
                        .format(epoch, acc_epoch * 100, all_loss,
                                all_center_loss))
                    self.loss_center_data.append(all_center_loss)
                else:
                    print(
                        "training epoch {}: the acc is {}%,\n the loss is {}"
                        .format(epoch, acc_epoch * 100, all_loss))

                self.loss_data.append(all_loss)

                print("learning rate: ", self.optimizer.param_groups[0]['lr'])
                acc_sample = 0
                count_all = 0
                all_loss = 0
                all_center_loss = 0

            if epoch > self.config.train.num_epochs:
                break
Example #6
    def train(self):
        args = self.args
        model = self.model
        logger = self.logger
        epochRange = self._getEpochRange(self.nEpochs)

        # init optimizer
        optimizer = SGD(model.alphas(),
                        args.search_learning_rate,
                        momentum=args.search_momentum,
                        weight_decay=args.search_weight_decay)
        # init scheduler
        scheduler = ReduceLROnPlateau(optimizer,
                                      mode='min',
                                      factor=0.95,
                                      patience=args.search_patience,
                                      min_lr=args.search_learning_rate_min)

        for epoch in epochRange:
            print('========== Epoch:[{}/{}] =============='.format(
                epoch, self.nEpochs))
            # init epoch train logger
            trainLogger = HtmlLogger(self.trainFolderPath, epoch)
            # set loggers dictionary
            loggersDict = {self.trainLoggerKey: trainLogger}

            # create epoch jobs
            epochDataRows = self._createEpochJobs(epoch)
            # add epoch data rows
            for jobDataRow in epochDataRows:
                logger.addDataRow(jobDataRow, trType='<tr bgcolor="#2CBDD6">')

            # train alphas
            # epochLossDict, alphasDataRow = self.trainAlphas(self._getNextSearchQueueDataLoader(), optimizer, epoch, loggersDict)
            epochLossDict, alphasDataRow = self.trainAlphas(
                self.valid_queue, optimizer, epoch, loggersDict)
            # update scheduler
            scheduler.step(epochLossDict.get(self.flopsLoss.totalKey()))

            # calc model choosePathAlphasAsPartition flops ratio
            model.choosePathAlphasAsPartition()
            # add values to alphas data row
            additionalData = {
                self.epochNumKey: epoch,
                self.lrKey: optimizer.param_groups[0]['lr'],
                self.validFlopsRatioKey: model.flopsRatio()
            }
            self._applyFormats(additionalData)
            # add alphas data row
            alphasDataRow.update(additionalData)
            logger.addDataRow(alphasDataRow)

            # save checkpoint
            save_checkpoint(self.trainFolderPath, model, optimizer,
                            epochLossDict)
Example #7
def train(cfg):
    train_loader = construct_loader(cfg, train=True)
    val_loader = construct_loader(cfg, train=False)

    model = build_model(cfg)
    optimizer = construct_optimizer(model, cfg)
    for epoch in range(cfg.TRAIN.MAX_EPOCH):
        shuffle_dataset(train_loader, epoch)
        train_epoch(train_loader, model, optimizer, epoch, cfg)
        eval_epoch(val_loader, model, epoch, cfg)
        save_checkpoint(model, optimizer, epoch, cfg)
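Here save_checkpoint also receives the optimizer and config, which suggests it stores a fully resumable training state. A minimal sketch under that assumption (the cfg.OUTPUT_DIR attribute and file naming are hypothetical):

import os
import torch

def save_checkpoint(model, optimizer, epoch, cfg):
    # Assumed implementation: bundle everything needed to resume.
    state = {
        'epoch': epoch,
        'model_state': model.state_dict(),
        'optimizer_state': optimizer.state_dict(),
    }
    path = os.path.join(cfg.OUTPUT_DIR, 'checkpoint_{:05d}.pth'.format(epoch))
    torch.save(state, path)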
Example #8
    def train(self, epochs, validate_every, start_epoch):
        """
        Runs the model on training dataset.
        
        Args:
            epochs (int): Total epochs.
            validate_every (int): Run validation after every validate_every no of epochs.
            start_epoch (int): Starting epoch if using the stored checkpoint.
        """
        #self.validation(epoch = 0)
        #batch_size = Config.get("training_batch_size")
        for epoch in range(start_epoch, epochs + 1):
            training_batch_losses = []
            for _, data in tqdm(enumerate(self.training_loader, 0)):
                images, captions, lengths, _ = data
                self.optimizer.zero_grad()
                images = images.to(Config.get("device"))
                captions = captions.to(Config.get("device"))
                #setting up training mode
                self.encoder = self.encoder.train()
                self.decoder = self.decoder.train()
                #image features
                image_features = self.encoder(images)
                #predicted captions
                predicted_captions = self.decoder.teacher_forcing(
                    image_features, captions, lengths,
                    self.pretrained_embeddings)
                #max_length, _ = lengths.max(0)
                #ref_captions_mask = torch.ones(batch_size, max_length).to(Config.get("device"))
                #loss function
                loss = self.criterion(predicted_captions, captions)
                #calculating the gradients
                loss.backward()
                #updating the parameters
                self.optimizer.step()
                training_batch_losses.append(loss.item())

            self.stat.record(training_losses=np.mean(training_batch_losses))
            self.stat.push_tensorboard_losses(epoch)
            self.stat.log_losses(epoch)
            if (epoch - 1) % validate_every == 0:
                self.validation(epoch=epoch)
                save_checkpoint(epoch=epoch,
                                outdir=self.output_dir,
                                encoder=self.encoder,
                                decoder=self.decoder,
                                optimizer=self.optimizer,
                                criterion=self.criterion)
Example #9
    def postEpoch(self, epoch, optimizer, trainData: EpochData,
                  validData: EpochData):
        logger = self.getLogger()
        model = self.getModel()
        # init data row
        dataRow = trainData.summaryDataRow()
        # add epoch number
        dataRow[self.epochNumKey] = epoch
        # add learning rate
        dataRow[self.lrKey] = self.formats[self.lrKey](
            optimizer.param_groups[0]['lr'])

        # merge trainData with validData
        for k, v in validData.summaryDataRow().items():
            dataRow[k] = v

        # get valid acc dict & loss dict
        validAccDict = validData.accDict()
        validLossDict = validData.lossDict()
        # update optimum values according to current epoch values and get optimum table for logger
        optimumTable = self.trainOptimum.update(validAccDict, epoch)
        # add update time to optimum table
        optimumTable.append(['Update time', logger.getTimeStr()])
        # update nEpochsOptimum table
        logger.addInfoTable('Optimum', optimumTable)

        # update best precision only after switching stage is complete
        is_best = self.trainOptimum.is_best(epoch)
        if is_best:
            # update optimal epoch data
            self.optimalEpochData = (validAccDict, validLossDict)
            # found new optimum, reset nEpochsOptimum
            self.nEpochsOptimum = 0
        else:
            # optimum hasn't changed
            self.nEpochsOptimum += 1

        # save model checkpoint
        save_checkpoint(self.getTrainFolderPath(), model, optimizer,
                        validAccDict, is_best)

        # add data to main logger table
        logger.addDataRow(dataRow)
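The is_best flag passed above points at the common pattern of overwriting the latest checkpoint every epoch and copying it aside when validation improves. A sketch of that pattern, offered as an assumption about this save_checkpoint:

import os
import shutil
import torch

def save_checkpoint(folder, model, optimizer, acc_dict, is_best=False):
    # Assumed implementation: always write the latest state, and keep
    # a separate copy whenever this epoch is the best seen so far.
    path = os.path.join(folder, 'checkpoint.pth.tar')
    torch.save({'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'acc': acc_dict}, path)
    if is_best:
        shutil.copyfile(path, os.path.join(folder, 'model_best.pth.tar'))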
Example #10
def run(args):
    df = pd.read_csv(args.df_path)
    df_train = df[df['fold'] != args.fold]

    model = get_model(args).cuda()
    dataloader = get_dataloader(args.data_dir, df_train, 'train',
                                args.pretrain, args.batch_size)
    checkpoints = get_checkpoints(args)

    checkpoint.load_checkpoint(
        args, model, None, checkpoint=checkpoints[0]
    )  # args, model, ckpt_name, checkpoint=None, optimizer=None
    for i, ckpt in enumerate(checkpoints[1:]):
        print(i, ckpt)
        model2 = get_model(args).cuda()
        last_epoch, _ = checkpoint.load_checkpoint(args,
                                                   model2,
                                                   None,
                                                   checkpoint=ckpt)
        if args.ema is None:
            swa.moving_average(model, model2, 1. / (i + 2))
        else:
            swa.moving_average(model, model2, args.ema)

    with torch.no_grad():
        swa.bn_update(dataloader, model)

    if args.ema is not None:
        output_name = f'model_ema_{len(checkpoints)}'
    else:
        output_name = f'model_swa_{len(checkpoints)}'

    print('save {}'.format(output_name))

    checkpoint.save_checkpoint(args,
                               model,
                               None,
                               0,
                               0,
                               name=output_name,
                               weights_dict={'state_dict': model.state_dict()})
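The weight 1. / (i + 2) makes model the running mean of all checkpoints averaged so far (the i-th call folds in the (i + 2)-th set of weights). A sketch of the moving_average helper in its canonical SWA form, assumed to match this swa module:

import torch

def moving_average(net1, net2, alpha):
    # net1 <- (1 - alpha) * net1 + alpha * net2, parameter by parameter.
    # With alpha = 1 / (i + 2), net1 stays the mean of all models seen.
    with torch.no_grad():
        for p1, p2 in zip(net1.parameters(), net2.parameters()):
            p1.mul_(1.0 - alpha).add_(p2, alpha=alpha)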
Example #11
def fit_model(model,
              n_epoch,
              dev_dataloader,
              optimizer,
              criterion,
              loss_fn,
              metric_fn,
              val_dataloader=None,
              checkpoint=False,
              model_filename="checkpoint",
              **kwargs):
    cur_time = datetime.datetime.now().strftime('%Y%m%d-%H%M')
    if not os.path.exists(os.path.join(model_cp_path, cur_time)):
        os.mkdir(os.path.join(model_cp_path, cur_time))
    save_metadata(cur_time, model, n_epoch, dev_dataloader, optimizer,
                  criterion, val_dataloader)
    n_dev_obs, dev_batch_size, dev_batch_per_epoch = get_batch_info(
        dev_dataloader)
    for idx_epoch in tqdm(range(n_epoch), total=n_epoch):
        t = tqdm(enumerate(dev_dataloader), total=dev_batch_per_epoch)
        for idx_batch, data in t:
            model = model.train()
            loss = loss_fn(model, criterion, data)
            train_step(optimizer, loss)
            with torch.no_grad():
                model = model.eval()
                metric = metric_fn(model, data)
            t.set_postfix({"loss": loss.item(), "metric": metric.item()})
        if val_dataloader is not None:
            val_loss, val_metric = validate_model(model, criterion, loss_fn,
                                                  metric_fn, val_dataloader)
            print(" val_loss : {}, val_metric : {}".format(
                val_loss, val_metric))
        if checkpoint:
            filename = "{}_{}".format(model_filename, idx_epoch)
            save_checkpoint(model, optimizer, cur_time, filename)
    return model
Example #12
        if epoch % 10 == 0:
            # Don't want to save all test-stats
            test.validate(q_network,
                          epoch,
                          test_loader,
                          args,
                          ReinforcementLearning,
                          statistics,
                          TEXT,
                          still_training=True)
            # Save best checkpoint
            if training_status_handler.update_best(
                    statistics.statistics['training_test_reward']):
                statistics.update_state(q_network.state_dict())
                save_checkpoint(statistics.statistics,
                                args.name,
                                filename="best.pth.tar")

        # Save checkpoint
        if epoch % training_status_handler.SAVE == 0:
            statistics.update_state(q_network.state_dict())
            save_checkpoint(statistics.statistics, args.name)

        # Save backup checkpoint
        if epoch % training_status_handler.BACKUP == 0:
            statistics.update_state(q_network.state_dict())
            save_checkpoint(statistics.statistics,
                            args.name,
                            filename="backup.pth.tar")

    # Final checkpoint
Example #13
def main():

    args = get_arguments()

    # configuration
    CONFIG = Dict(yaml.safe_load(open(args.config)))

    # writer
    if CONFIG.writer_flag:
        writer = SummaryWriter(CONFIG.result_path)
    else:
        writer = None

    # DataLoaders
    train_data = PASCALVOC(
        CONFIG,
        mode="train",
        transform=Compose([
            RandomCrop(CONFIG),
            Resize(CONFIG),
            RandomFlip(),
            ToTensor(),
            Normalize(mean=get_mean(), std=get_std()),
        ])
    )

    val_data = PASCALVOC(
        CONFIG,
        mode="val",
        transform=Compose([
            RandomCrop(CONFIG),
            Resize(CONFIG),
            ToTensor(),
            Normalize(mean=get_mean(), std=get_std()),
        ])
    )

    train_loader = DataLoader(
        train_data,
        batch_size=CONFIG.batch_size,
        shuffle=True,
        num_workers=CONFIG.num_workers,
        drop_last=True
    )

    val_loader = DataLoader(
        val_data,
        batch_size=CONFIG.batch_size,
        shuffle=False,
        num_workers=CONFIG.num_workers
    )

    # load model
    print('\n------------------------Loading Model------------------------\n')

    if CONFIG.attention == 'dual':
        model = DANet(CONFIG)
        print('Dual Attention modules will be added to this base model')
    elif CONFIG.attention == 'channel':
        model = CANet(CONFIG)
        print('Channel Attention modules will be added to this base model')
    else:
        if CONFIG.model == 'drn_d_22':
            print(
                'Dilated ResNet D 22 w/o Dual Attention modules will be used as a model.')
            model = drn_d_22(pretrained=True, num_classes=CONFIG.n_classes)
        elif CONFIG.model == 'drn_d_38':
            print(
                'Dilated ResNet D 38 w/o Dual Attention modules will be used as a model.')
            model = drn_d_38(pretrained=True, num_classes=CONFIG.n_classes)
        else:
            print('The model you chose is not available.')
            print(
                'Therefore, Dilated ResNet D 22 w/o Dual Attention modules will be used as a model.')
            model = drn_d_22(pretrained=True, num_classes=CONFIG.n_classes)

    # set optimizer, lr_scheduler
    if CONFIG.optimizer == 'Adam':
        print(CONFIG.optimizer + ' will be used as an optimizer.')
        optimizer = optim.Adam(model.parameters(), lr=CONFIG.learning_rate)
    elif CONFIG.optimizer == 'SGD':
        print(CONFIG.optimizer + ' will be used as an optimizer.')
        optimizer = optim.SGD(
            model.parameters(),
            lr=CONFIG.learning_rate,
            momentum=CONFIG.momentum,
            dampening=CONFIG.dampening,
            weight_decay=CONFIG.weight_decay,
            nesterov=CONFIG.nesterov)
    elif CONFIG.optimizer == 'AdaBound':
        print(CONFIG.optimizer + ' will be used as an optimizer.')
        optimizer = adabound.AdaBound(
            model.parameters(),
            lr=CONFIG.learning_rate,
            final_lr=CONFIG.final_lr,
            weight_decay=CONFIG.weight_decay)
    else:
        print('No optimizer matches your option. '
              'SGD will be used instead.')
        optimizer = optim.SGD(
            model.parameters(),
            lr=CONFIG.learning_rate,
            momentum=CONFIG.momentum,
            dampening=CONFIG.dampening,
            weight_decay=CONFIG.weight_decay,
            nesterov=CONFIG.nesterov)

    # learning rate scheduler
    if CONFIG.optimizer == 'SGD':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, 'min', patience=CONFIG.lr_patience)
    else:
        scheduler = None

    # send the model to cuda/cpu
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    if device == 'cuda':
        model = torch.nn.DataParallel(model)  # make parallel
        torch.backends.cudnn.benchmark = True

    # resume if you want
    begin_epoch = 0
    if args.resume:
        if os.path.exists(os.path.join(CONFIG.result_path, 'checkpoint.pth')):
            print('loading the checkpoint...')
            begin_epoch, model, optimizer, scheduler = \
                resume(CONFIG, model, optimizer, scheduler)
            print('training will start from epoch {}'.format(begin_epoch))

    # criterion for loss
    if CONFIG.class_weight:
        criterion = nn.CrossEntropyLoss(
            weight=get_class_weight().to(device),
            ignore_index=255
        )
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=255)

    # train and validate model
    print('\n------------------------Start training------------------------\n')
    losses_train = []
    losses_val = []
    val_ious = []
    mean_ious = []
    mean_ious_without_bg = []
    best_mean_iou = 0.0

    for epoch in range(begin_epoch, CONFIG.max_epoch):
        # training
        loss_train = train(
            model, train_loader, criterion, optimizer, CONFIG, device)
        losses_train.append(loss_train)

        # validation
        val_iou, loss_val = validation(
            model, val_loader, criterion, CONFIG, device)
        val_ious.append(val_iou)
        losses_val.append(loss_val)
        if CONFIG.optimizer == 'SGD':
            scheduler.step(loss_val)

        mean_ious.append(val_ious[-1].mean().item())
        mean_ious_without_bg.append(val_ious[-1][1:].mean().item())

        # save a checkpoint every 5 epochs
        if epoch % 5 == 0 and epoch != 0:
            save_checkpoint(CONFIG, epoch, model, optimizer, scheduler)

        # save a model every 50 epochs
        if epoch % 50 == 0 and epoch != 0:
            torch.save(
                model.state_dict(), os.path.join(CONFIG.result_path, 'epoch_{}_model.prm'.format(epoch)))

        if best_mean_iou < mean_ious[-1]:
            best_mean_iou = mean_ious[-1]
            torch.save(
                model.state_dict(), os.path.join(CONFIG.result_path, 'best_mean_iou_model.prm'))

        # tensorboardx
        if writer:
            writer.add_scalars(
                "loss", {
                    'loss_train': losses_train[-1],
                    'loss_val': losses_val[-1]}, epoch)
            writer.add_scalar(
                "mean_iou", mean_ious[-1], epoch)
            writer.add_scalar(
                "mean_iou_w/o_bg", mean_ious_without_bg[-1], epoch)

        print(
            'epoch: {}\tloss_train: {:.5f}\tloss_val: {:.5f}\tmean IOU: {:.3f}\tmean IOU w/o bg: {:.3f}'.format(
                epoch, losses_train[-1], losses_val[-1], mean_ious[-1], mean_ious_without_bg[-1])
        )

    torch.save(
        model.state_dict(), os.path.join(CONFIG.result_path, 'final_model.prm'))
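The resume(CONFIG, model, optimizer, scheduler) helper used above is not shown; since save_checkpoint receives the same objects, a plausible sketch is the inverse operation (file name and dictionary keys are assumptions):

import os
import torch

def resume(CONFIG, model, optimizer, scheduler):
    # Assumed inverse of save_checkpoint: restore the training state
    # from <result_path>/checkpoint.pth and report where to restart.
    checkpoint = torch.load(
        os.path.join(CONFIG.result_path, 'checkpoint.pth'),
        map_location='cpu')
    begin_epoch = checkpoint['epoch'] + 1
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    if scheduler is not None and checkpoint.get('scheduler') is not None:
        scheduler.load_state_dict(checkpoint['scheduler'])
    return begin_epoch, model, optimizer, scheduler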
Example #14
def train(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set up environment.
    du.init_distributed_training(cfg)
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)

    # Init multigrid.
    multigrid = None
    if cfg.MULTIGRID.LONG_CYCLE or cfg.MULTIGRID.SHORT_CYCLE:
        multigrid = MultigridSchedule()
        cfg = multigrid.init_multigrid(cfg)
        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, _ = multigrid.update_long_cycle(cfg, cur_epoch=0)
    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = build_model(cfg)
    # model = x3d.MyModel()
    if du.is_master_proc() and cfg.LOG_MODEL_INFO:
        misc.log_model_info(model, cfg, is_train=True)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Load a checkpoint to resume training if applicable.
    if cfg.TRAIN.AUTO_RESUME and cu.has_checkpoint(cfg.OUTPUT_DIR):
        last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
        logger.info("Load from last checkpoint, {}.".format(last_checkpoint))
        checkpoint_epoch = cu.load_checkpoint(
            last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer
        )
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        logger.info("Load from given checkpoint file.")
        checkpoint_epoch = cu.load_checkpoint(
            cfg.TRAIN.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            optimizer,
            inflation=cfg.TRAIN.CHECKPOINT_INFLATE,
            convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2",
        )
        start_epoch = checkpoint_epoch + 1
    else:
        start_epoch = 0

    # Create the video train and val loaders.
    train_loader = loader.construct_loader(cfg, "train")
    val_loader = loader.construct_loader(cfg, "val")
    precise_bn_loader = loader.construct_loader(
        cfg, "train", is_precise_bn=True
    )

    # Create meters.
    if cfg.DETECTION.ENABLE:
        train_meter = AVAMeter(len(train_loader), cfg, mode="train")
        val_meter = AVAMeter(len(val_loader), cfg, mode="val")
    else:
        train_meter = TrainMeter(len(train_loader), cfg)
        val_meter = ValMeter(len(val_loader), cfg)

    # set up writer for logging to Tensorboard format.
    if cfg.TENSORBOARD.ENABLE and du.is_master_proc(
        cfg.NUM_GPUS * cfg.NUM_SHARDS
    ):
        writer = tb.TensorboardWriter(cfg)
    else:
        writer = None

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))

    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, changed = multigrid.update_long_cycle(cfg, cur_epoch)
            if changed:
                (
                    model,
                    optimizer,
                    train_loader,
                    val_loader,
                    precise_bn_loader,
                    train_meter,
                    val_meter,
                ) = build_trainer(cfg)

                # Load checkpoint.
                if cu.has_checkpoint(cfg.OUTPUT_DIR):
                    last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
                    assert "{:05d}.pyth".format(cur_epoch) in last_checkpoint
                else:
                    last_checkpoint = cfg.TRAIN.CHECKPOINT_FILE_PATH
                logger.info("Load from {}".format(last_checkpoint))
                cu.load_checkpoint(
                    last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer
                )

        # Shuffle the dataset.
        loader.shuffle_dataset(train_loader, cur_epoch)
        # Train for one epoch.
        train_epoch(
            train_loader, model, optimizer, train_meter, cur_epoch, cfg, writer
        )

        # Compute precise BN stats.
        if cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0:
            calculate_and_update_precise_bn(
                precise_bn_loader,
                model,
                min(cfg.BN.NUM_BATCHES_PRECISE, len(precise_bn_loader)),
            )
        _ = misc.aggregate_sub_bn_stats(model)

        # Save a checkpoint.
        if cu.is_checkpoint_epoch(
            cfg, cur_epoch, None if multigrid is None else multigrid.schedule
        ):
            cu.save_checkpoint(cfg.OUTPUT_DIR, model,
                               optimizer, cur_epoch, cfg)
        # Evaluate the model on validation set.
        if misc.is_eval_epoch(
            cfg, cur_epoch, None if multigrid is None else multigrid.schedule
        ):
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, writer)

    if writer is not None:
        writer.close()
Example #15
def main():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    bb_df = pd.read_csv(bb_repo)
    train_idx = np.arange(len(bb_df))
    dev_idx, val_idx = train_test_split(train_idx, test_size=0.20)
    dev_df = bb_df.iloc[dev_idx, :].reset_index(drop=True)
    val_df = bb_df.iloc[val_idx, :].reset_index(drop=True)

    bb_train_dataset = BBDataset(True, device, dev_df)
    bb_dev_dataset = BBDataset(True, device, dev_df)
    bb_val_dataset = BBDataset(True, device, val_df)
    bb_test_dataset = BBDataset(False, device)
    train_dataloader = DataLoader(bb_train_dataset, batch_size=32)
    dev_dataloader = DataLoader(bb_dev_dataset, batch_size=32, shuffle=True)
    val_dataloader = DataLoader(bb_val_dataset, batch_size=32)
    test_dataloader = DataLoader(bb_test_dataset, batch_size=32)

    preload_model = torchvision.models.resnet50(pretrained=True).to(device)
    header_model = Res50BBHead([1000], 0.5).to(device)
    model = ResPneuNet(preload_model, header_model)

    n_epoch = 5
    optimizer = optim.Adam(
        [
            {
                "params": model.preload_backbone.parameters(),
                "lr": 0.0001
            },
            {
                "params": model.header.parameters(),
                "lr": 0.001
            },
        ],
        betas=(0.9, 0.999),
        eps=1e-08,
        weight_decay=0,
        amsgrad=False,
    )
    criterion = nn.L1Loss().to(device)

    n_obs, batch_size, n_batch_per_epoch = get_batch_info(dev_dataloader)
    clr = CLR(n_epoch, n_batch_per_epoch, 0.1, 1., 0.95, 0.85, 2)
    callbacks = [clr]

    model = fit_model(
        model,
        n_epoch,
        dev_dataloader,
        optimizer,
        criterion,
        loss_fn,
        metric_fn,
        val_dataloader,
        checkpoint=True,
        model_fn="bb",
    )

    prediction = predict_model(model, test_dataloader, pred_fn)
    string_prediction = [
        "{} {} {} {}".format(x[0], x[1], x[2], x[3]) for x in prediction
    ]
    patientid = test_dataloader.dataset.patientId
    pneu_bb = string_prediction
    bb_pred_df = pd.DataFrame({"name": patientid, "label": pneu_bb})
    bb_pred_df.to_csv(bb_predict_repo, index=False)
    save_checkpoint(model, optimizer, fname="bb")
Example #16
def main():
    parser = argparse.ArgumentParser(description='Dataloader test')
    parser.add_argument('--gpu', default='0', help='gpu id')
    parser.add_argument('--workers',
                        default=16,
                        type=int,
                        help='num workers for data loading')
    parser.add_argument('--nb_epoch',
                        default=100,
                        type=int,
                        help='training epoch')
    parser.add_argument('--lr', default=1e-4, type=float, help='learning rate')
    parser.add_argument('--power',
                        default=0,
                        type=float,
                        help='lr poly power; 0 indicates step decay by half')
    parser.add_argument('--batch_size', default=8, type=int, help='batch size')
    parser.add_argument('--size', default=256, type=int, help='image size')
    parser.add_argument(
        '--anchor_imsize',
        default=416,
        type=int,
        help='scale used to calculate anchors defined in model cfg file')
    parser.add_argument('--data_root',
                        type=str,
                        default='./ln_data/DMS/',
                        help='path to ReferIt splits data folder')
    parser.add_argument('--split_root',
                        type=str,
                        default='data',
                        help='location of pre-parsed dataset info')
    parser.add_argument('--dataset',
                        default='referit',
                        type=str,
                        help='referit/flickr/unc/unc+/gref')
    parser.add_argument('--time',
                        default=20,
                        type=int,
                        help='maximum time steps (lang length) per batch')
    parser.add_argument('--emb_size',
                        default=512,
                        type=int,
                        help='fusion module embedding dimensions')
    parser.add_argument('--resume',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument(
        '--pretrain',
        default='',
        type=str,
        metavar='PATH',
        help='pretrained weights: unlike --resume, allows loading a '
        'state_dict that does not match the model exactly, and restores '
        'no saved loss')
    parser.add_argument('--print_freq',
                        '-p',
                        default=2000,
                        type=int,
                        metavar='N',
                        help='print frequency (default: 2000)')
    parser.add_argument('--savename',
                        default='default',
                        type=str,
                        help='Name head for saved model')
    parser.add_argument('--seed', default=13, type=int, help='random seed')
    parser.add_argument('--bert_model',
                        default='bert-base-uncased',
                        type=str,
                        help='bert model')
    parser.add_argument('--test',
                        dest='test',
                        default=False,
                        action='store_true',
                        help='test')
    parser.add_argument('--nflim', default=3, type=int, help='nflim')
    parser.add_argument('--mstage',
                        dest='mstage',
                        default=False,
                        action='store_true',
                        help='if mstage')
    parser.add_argument('--mstack',
                        dest='mstack',
                        default=False,
                        action='store_true',
                        help='if mstack')
    parser.add_argument('--w_div',
                        default=0.125,
                        type=float,
                        help='weight of the diverge loss')
    parser.add_argument('--fusion', default='prod', type=str, help='prod/cat')
    parser.add_argument('--tunebert',
                        dest='tunebert',
                        default=False,
                        action='store_true',
                        help='if tunebert')
    parser.add_argument('--large',
                        dest='large',
                        default=False,
                        action='store_true',
                        help='if large mode: fpn16, convlstm out, size 512')

    global args, anchors_full
    args = parser.parse_args()
    if args.large:
        args.gsize = 16
        args.size = 512
    else:
        args.gsize = 8
    print(
        '----------------------------------------------------------------------'
    )
    print(sys.argv[0])
    print(args)
    print(
        '----------------------------------------------------------------------'
    )
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    ## fix seed
    cudnn.benchmark = False
    cudnn.deterministic = True
    random.seed(args.seed)
    np.random.seed(args.seed + 1)
    torch.manual_seed(args.seed + 2)
    torch.cuda.manual_seed_all(args.seed + 3)

    eps = 1e-10
    ## following anchor sizes calculated by kmeans under args.anchor_imsize=416
    if args.dataset == 'referit':
        anchors = '30,36,  78,46,  48,86,  149,79,  82,148,  331,93,  156,207,  381,163,  329,285'
    elif args.dataset == 'flickr':
        anchors = '29,26,  55,58,  137,71,  82,121,  124,205,  204,132,  209,263,  369,169,  352,294'
    else:
        anchors = '10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326'
    anchors = [float(x) for x in anchors.split(',')]
    anchors_full = [(anchors[i], anchors[i + 1])
                    for i in range(0, len(anchors), 2)][::-1]

    ## save logs
    if args.savename == 'default':
        args.savename = 'filmconv_nofpn32_%s_batch%d' % (args.dataset,
                                                         args.batch_size)
    if not os.path.exists('./logs'):
        os.mkdir('logs')
    logging.basicConfig(level=logging.INFO,
                        filename="./logs/%s" % args.savename,
                        filemode="a+",
                        format="%(asctime)-15s %(levelname)-8s %(message)s")
    logging.info(str(sys.argv))
    logging.info(str(args))

    input_transform = Compose([
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    train_dataset = ReferDataset(data_root=args.data_root,
                                 split_root=args.split_root,
                                 dataset=args.dataset,
                                 split='train',
                                 imsize=args.size,
                                 transform=input_transform,
                                 max_query_len=args.time,
                                 augment=True)
    val_dataset = ReferDataset(data_root=args.data_root,
                               split_root=args.split_root,
                               dataset=args.dataset,
                               split='val',
                               imsize=args.size,
                               transform=input_transform,
                               max_query_len=args.time)
    ## note certain dataset does not have 'test' set:
    ## 'unc': {'train', 'val', 'trainval', 'testA', 'testB'}
    test_dataset = ReferDataset(data_root=args.data_root,
                                split_root=args.split_root,
                                dataset=args.dataset,
                                testmode=True,
                                split='val',
                                imsize=args.size,
                                transform=input_transform,
                                max_query_len=args.time)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              pin_memory=True,
                              drop_last=True,
                              num_workers=args.workers)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            pin_memory=True,
                            drop_last=True,
                            num_workers=args.workers)
    test_loader = DataLoader(test_dataset,
                             batch_size=1,
                             shuffle=False,
                             pin_memory=True,
                             drop_last=True,
                             num_workers=0)

    ## Model
    model = grounding_model_multihop(NFilm=args.nflim, fusion=args.fusion, intmd=args.mstack, mstage=args.mstage, \
        emb_size=args.emb_size, coordmap=True, convlstm=args.large, \
        bert_model=args.bert_model, dataset=args.dataset, tunebert=args.tunebert)
    model = torch.nn.DataParallel(model).cuda()

    if args.pretrain:
        model = load_pretrain(model, args, logging)
    if args.resume:
        model = load_resume(model, args, logging)

    print('Num of parameters:',
          sum([param.nelement() for param in model.parameters()]))
    logging.info('Num of parameters:%d' %
                 int(sum([param.nelement() for param in model.parameters()])))

    if args.tunebert:
        visu_param = list(model.module.visumodel.parameters())
        text_param = list(model.module.textmodel.parameters())
        # membership tests on the generators returned by .parameters()
        # would consume them, so collect ids and compare by identity
        visu_ids = {id(p) for p in visu_param}
        text_ids = {id(p) for p in text_param}
        rest_param = [
            param for param in model.parameters()
            if id(param) not in visu_ids and id(param) not in text_ids
        ]
        sum_visu = sum([param.nelement() for param in visu_param])
        sum_text = sum([param.nelement() for param in text_param])
        sum_fusion = sum([param.nelement() for param in rest_param])
        print('visu, text, fusion module parameters:', sum_visu, sum_text,
              sum_fusion)
    else:
        visu_param = list(model.module.visumodel.parameters())
        visu_ids = {id(p) for p in visu_param}
        rest_param = [
            param for param in model.parameters() if id(param) not in visu_ids
        ]
        sum_visu = sum([param.nelement() for param in visu_param])
        sum_text = sum([
            param.nelement() for param in model.module.textmodel.parameters()
        ])
        sum_fusion = sum([param.nelement() for param in rest_param]) - sum_text
        print('visu, text, fusion module parameters:', sum_visu, sum_text,
              sum_fusion)

    ## optimizer; rmsprop default
    if args.tunebert:
        optimizer = torch.optim.RMSprop([{
            'params': rest_param
        }, {
            'params': visu_param,
            'lr': args.lr / 10.
        }, {
            'params': text_param,
            'lr': args.lr / 10.
        }],
                                        lr=args.lr,
                                        weight_decay=0.0005)
    else:
        optimizer = torch.optim.RMSprop([{
            'params': rest_param
        }, {
            'params': visu_param,
            'lr': args.lr / 10.
        }],
                                        lr=args.lr,
                                        weight_decay=0.0005)

    ## training and testing
    best_accu = -float('Inf')
    if args.test:
        _ = test_epoch(test_loader, model)
    else:
        for epoch in range(args.nb_epoch):
            adjust_learning_rate(args, optimizer, epoch)
            train_epoch(train_loader, model, optimizer, epoch)
            accu_new = validate_epoch(val_loader, model)
            ## remember best accu and save checkpoint
            is_best = accu_new > best_accu
            best_accu = max(accu_new, best_accu)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_loss': accu_new,
                    'optimizer': optimizer.state_dict(),
                },
                is_best,
                args,
                filename=args.savename)
        print('\nBest Accu: %f\n' % best_accu)
        logging.info('\nBest Accu: %f\n' % best_accu)
Example #17
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # configuration
    CONFIG = Dict(yaml.safe_load(open(args.config)))

    # writer
    if CONFIG.writer_flag:
        writer = SummaryWriter(CONFIG.result_path)
    else:
        writer = None

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    print('\n------------------------Loading Model------------------------\n')

    if CONFIG.model == 'resnet18':
        print('ResNet18 will be used as a model.')
        model = resnet.generate_model(18, n_classes=CONFIG.n_classes)
    elif CONFIG.model == 'resnet50':
        print('ResNet50 will be used as a model.')
        model = resnet.generate_model(50, n_classes=CONFIG.n_classes)
    else:
        print('The model you chose is not available. '
              'ResNet18 will be used instead.')
        model = resnet.generate_model(18, n_classes=CONFIG.n_classes)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    if CONFIG.class_weight:
        criterion = nn.CrossEntropyLoss(
            weight=get_class_weight(CONFIG.n_classes).cuda(args.gpu)).cuda(
                args.gpu)
    else:
        criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    if CONFIG.optimizer == 'Adam':
        print(CONFIG.optimizer + ' will be used as an optimizer.')
        optimizer = optim.Adam(model.parameters(), lr=CONFIG.learning_rate)
    elif CONFIG.optimizer == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=CONFIG.learning_rate,
                                    momentum=CONFIG.momentum,
                                    dampening=CONFIG.dampening,
                                    weight_decay=CONFIG.weight_decay,
                                    nesterov=CONFIG.nesterov)

    # learning rate scheduler
    if CONFIG.optimizer == 'SGD':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, 'min', patience=CONFIG.lr_patience)
    else:
        scheduler = None

    # resume if you want
    begin_epoch = 0
    # initialize the log unconditionally so it exists even without --resume
    log = pd.DataFrame(columns=[
        'epoch', 'lr', 'train_loss', 'val_loss', 'train_acc@1',
        'train_acc@5', 'val_acc@1', 'val_acc@5'
    ])
    if args.resume:
        if os.path.exists(os.path.join(CONFIG.result_path, 'checkpoint.pth')):
            print('loading the checkpoint...')
            begin_epoch, model, optimizer, best_acc1, scheduler = resume(
                CONFIG, model, optimizer, scheduler)
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            print('training will start from epoch {}'.format(begin_epoch))
        else:
            print("there is no checkpoint at the result folder")
        if os.path.exists(os.path.join(CONFIG.result_path, 'log.csv')):
            print('loading the log file...')
            log = pd.read_csv(os.path.join(CONFIG.result_path, 'log.csv'))
        else:
            print("there is no log file at the result folder.")
            print('Making a log file...')
            log = pd.DataFrame(columns=[
                'epoch', 'lr', 'train_loss', 'val_loss', 'train_acc@1',
                'train_acc@5', 'val_acc@1', 'val_acc@5'
            ])

    # DataLoaders
    normalize = Normalize(mean=get_mean(), std=get_std())

    train_data = Kinetics(CONFIG,
                          transform=Compose([
                              RandomCrop((CONFIG.height, CONFIG.width)),
                              ToTensor(),
                              normalize,
                          ]))

    val_data = Kinetics(CONFIG,
                        transform=Compose([
                            RandomCrop((CONFIG.height, CONFIG.width)),
                            ToTensor(),
                            normalize,
                        ]),
                        mode='validation')

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_data)
    else:
        train_sampler = None

    train_loader = DataLoader(train_data,
                              batch_size=CONFIG.batch_size,
                              shuffle=(train_sampler is None),
                              num_workers=CONFIG.num_workers,
                              pin_memory=True,
                              sampler=train_sampler,
                              drop_last=True)

    val_loader = DataLoader(val_data,
                            batch_size=CONFIG.batch_size,
                            shuffle=False,
                            num_workers=CONFIG.num_workers,
                            pin_memory=True)

    # train and validate model
    print('\n------------------------Start training------------------------\n')
    train_losses = []
    val_losses = []
    train_top1_accuracy = []
    train_top5_accuracy = []
    val_top1_accuracy = []
    val_top5_accuracy = []

    for epoch in range(begin_epoch, CONFIG.max_epoch):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train_loss, train_acc1, train_acc5 = train(train_loader, model,
                                                   criterion, optimizer, epoch,
                                                   args, CONFIG)
        train_losses.append(train_loss)
        train_top1_accuracy.append(train_acc1)
        train_top5_accuracy.append(train_acc5)

        # validation on validation set
        val_loss, val_acc1, val_acc5 = validate(val_loader, model, criterion,
                                                args, CONFIG)
        val_losses.append(val_loss)
        val_top1_accuracy.append(val_acc1)
        val_top5_accuracy.append(val_acc5)

        # scheduler
        if CONFIG.optimizer == 'SGD':
            scheduler.step(val_loss)

        # save a model if top1 acc is higher than ever
        if best_acc1 < val_acc1:
            best_acc1 = val_acc1
            torch.save(model.state_dict(),
                       os.path.join(CONFIG.result_path, 'best_acc1_model.prm'))

        # save checkpoint every epoch
        save_checkpoint(CONFIG, epoch, model, optimizer, best_acc1, scheduler)

        # save a model every 10 epochs
        # save base models, NOT DataParallel-wrapped models
        if epoch % 10 == 0 and epoch != 0:
            torch.save(
                model.state_dict(),
                os.path.join(CONFIG.result_path,
                             'epoch_{}_model.prm'.format(epoch)))

        # tensorboardx
        if writer is not None:
            writer.add_scalars("loss", {
                'train': train_losses[-1],
                'val': val_losses[-1]
            }, epoch)
            writer.add_scalars("train_acc", {
                'top1': train_top1_accuracy[-1],
                'top5': train_top5_accuracy[-1]
            }, epoch)
            writer.add_scalars("val_acc", {
                'top1': val_top1_accuracy[-1],
                'top5': val_top5_accuracy[-1]
            }, epoch)

        # write logs to dataframe and csv file
        # (ReduceLROnPlateau has no get_lr(); read the lr from the optimizer)
        tmp = pd.Series([
            epoch,
            optimizer.param_groups[0]['lr'],
            train_losses[-1],
            val_losses[-1],
            train_top1_accuracy[-1],
            train_top5_accuracy[-1],
            val_top1_accuracy[-1],
            val_top5_accuracy[-1],
        ],
                        index=log.columns)

        log = pd.concat([log, tmp.to_frame().T], ignore_index=True)
        log.to_csv(os.path.join(CONFIG.result_path, 'log.csv'), index=False)

        print(
            'epoch: {}\tlr: {}\tloss train: {:.4f}\tloss val: {:.4f}\tval_acc1: {:.5f}\tval_acc5: {:.4f}'
            .format(epoch, optimizer.param_groups[0]['lr'],
                    train_losses[-1], val_losses[-1],
                    val_top1_accuracy[-1], val_top5_accuracy[-1]))

    # save base models, NOT DataParallel-wrapped models
    torch.save(model.module.state_dict(),
               os.path.join(CONFIG.result_path, 'final_model.prm'))
Example #18
def main():
    """
    Training and validation.
    """

    global best_bleu4, epochs_since_improvement, checkpoint, tagger_checkpoint, start_epoch, fine_tune_encoder, data_name, word_map

    print('Running on device {}\n'.format(device))

    # Read word map
    word_map_file = os.path.join(data_folder, 'WORDMAP_' + data_name + '.json')
    with open(word_map_file, 'r') as j:
        word_map = json.load(j)

    # Initialize / load checkpoint
    tagger_checkpoint = torch.load(tagger_checkpoint)
    encoder_tagger = tagger_checkpoint['encoder']
    encoder_tagger.fine_tune(False)

    if checkpoint is None:
        decoder = PureSCN(embed_dim=emb_dim,
                          decoder_dim=decoder_dim,
                          factored_dim=factored_dim,
                          semantic_dim=semantic_dim,
                          vocab_size=len(word_map),
                          dropout=dropout)
        decoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, decoder.parameters()),
                                             lr=decoder_lr)
        encoder = EncoderCaption()
        encoder.fine_tune(fine_tune_encoder)
        encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                             lr=encoder_lr) if fine_tune_encoder else None

    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        best_bleu4 = checkpoint['bleu-4']
        decoder = checkpoint['decoder']
        decoder_optimizer = checkpoint['decoder_optimizer']
        encoder = checkpoint['encoder']
        encoder_optimizer = checkpoint['encoder_optimizer']
        if fine_tune_encoder is True and encoder_optimizer is None:
            encoder.fine_tune(fine_tune_encoder)
            encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                                 lr=encoder_lr)

    # Move to GPU, if available
    decoder = decoder.to(device)
    encoder = encoder.to(device)
    encoder_tagger = encoder_tagger.to(device)

    # Loss function
    criterion = nn.CrossEntropyLoss().to(device)

    # Custom dataloaders
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_loader = torch.utils.data.DataLoader(
        CaptionDataset(data_folder, data_name, 'TRAIN',
                       transform=transforms.Compose([normalize])),
        batch_size=batch_size, shuffle=True, num_workers=workers, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(
        CaptionDataset(data_folder, data_name, 'VAL',
                       transform=transforms.Compose([normalize])),
        batch_size=batch_size, shuffle=True, num_workers=workers, pin_memory=True)

    # Epochs
    for epoch in range(start_epoch, epochs):
        print('Current epoch {}\n'.format(epoch + 1))

        # Decay learning rate if there is no improvement for 8 consecutive epochs, and terminate training after 20
        if epochs_since_improvement == 20:
            break
        if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0:
            adjust_learning_rate(decoder_optimizer, 0.8)
            if fine_tune_encoder:
                adjust_learning_rate(encoder_optimizer, 0.8)

        # One epoch's training
        train(train_loader=train_loader,
              encoder=encoder,
              encoder_tagger=encoder_tagger,
              decoder=decoder,
              criterion=criterion,
              encoder_optimizer=encoder_optimizer,
              decoder_optimizer=decoder_optimizer,
              epoch=epoch)

        # One epoch's validation
        recent_bleu4 = validate(val_loader=val_loader,
                                encoder=encoder,
                                encoder_tagger=encoder_tagger,
                                decoder=decoder,
                                criterion=criterion)

        # Check if there was an improvement
        is_best = recent_bleu4 > best_bleu4
        best_bleu4 = max(recent_bleu4, best_bleu4)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" %
                  (epochs_since_improvement,))
        else:
            epochs_since_improvement = 0

        print('Saving checkpoint for epoch {}\n'.format(epoch + 1))

        # Save checkpoint
        save_checkpoint('scn', data_name, epoch, epochs_since_improvement, encoder, decoder, encoder_optimizer,
                        decoder_optimizer, recent_bleu4, is_best)
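The resume branch above reads whole modules back out of the checkpoint (checkpoint['decoder'], checkpoint['encoder_optimizer'], ...), so a compatible writer has to serialize the objects themselves rather than their state_dicts. A minimal sketch matching the call signature; the filename scheme is an assumption:

import torch

def save_checkpoint(prefix, data_name, epoch, epochs_since_improvement,
                    encoder, decoder, encoder_optimizer, decoder_optimizer,
                    bleu4, is_best):
    # keys mirror what the resume branch above reads back
    state = {'epoch': epoch,
             'epochs_since_improvement': epochs_since_improvement,
             'bleu-4': bleu4,
             'encoder': encoder,
             'decoder': decoder,
             'encoder_optimizer': encoder_optimizer,
             'decoder_optimizer': decoder_optimizer}
    filename = '{}_checkpoint_{}.pth.tar'.format(prefix, data_name)
    torch.save(state, filename)
    if is_best:
        # keep a separate copy of the best-so-far checkpoint
        torch.save(state, 'BEST_' + filename)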
Example no. 19
def train(train_data_loader,
          eval_data_loader,
          model,
          reconstruction_loss,
          vocoder,
          mel_stat,
          optimizer,
          scheduler,
          global_step,
          writer=None,
          DEVICE=None):

    model.train()

    while global_step < args.max_training_step:

        for step, (mels, _) in tqdm(enumerate(train_data_loader),
                                    total=len(train_data_loader),
                                    unit='B',
                                    ncols=70,
                                    leave=False):
            mels = mels.float().to(DEVICE)
            optimizer.zero_grad()

            mels_hat, commitment_loss, perplexity = model(mels.detach())

            commitment_loss = args.commitment_cost * commitment_loss
            recon_loss = reconstruction_loss(mels_hat, mels)

            loss = commitment_loss + recon_loss
            loss.backward()

            nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip_thresh)
            optimizer.step()

            if global_step % args.save_checkpoint_step == 0:
                save_checkpoint(checkpoint_path=args.model_checkpoint_path,
                                model=model,
                                optimizer=optimizer,
                                scheduler=scheduler,
                                global_step=global_step)

            if global_step % args.eval_step == 0:
                evaluate(model=model,
                         vocoder=vocoder,
                         eval_data_loader=eval_data_loader,
                         criterion=reconstruction_loss,
                         mel_stat=mel_stat,
                         global_step=global_step,
                         writer=writer,
                         DEVICE=DEVICE)
                model.train()

            if args.log_tensorboard:
                writer.add_scalars(mode="train_recon_loss",
                                   global_step=global_step,
                                   loss=recon_loss)
                writer.add_scalars(mode="train_commitment_loss",
                                   global_step=global_step,
                                   loss=commitment_loss)
                writer.add_scalars(mode="train_perplexity",
                                   global_step=global_step,
                                   loss=perplexity)
                writer.add_scalars(mode="train_total_loss",
                                   global_step=global_step,
                                   loss=loss)

            global_step += 1

        scheduler.step()
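Here save_checkpoint is called with keyword arguments for the model, optimizer, scheduler, and global step. A minimal sketch of a compatible writer, assuming checkpoint_path is a directory; the dict keys and the step-stamped filename are assumptions:

import os
import torch

def save_checkpoint(checkpoint_path, model, optimizer, scheduler, global_step):
    # bundle everything needed to resume training at this step
    state = {'model': model.state_dict(),
             'optimizer': optimizer.state_dict(),
             'scheduler': scheduler.state_dict(),
             'global_step': global_step}
    torch.save(state, os.path.join(checkpoint_path,
                                   'checkpoint_{:08d}.pth'.format(global_step)))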
Example no. 20
    def save_training(self, out_dir):
        meta = dict(c_epoch=self.c_epoch, c_iter=self.c_iter)
        # os.path.join avoids a missing-slash bug when out_dir has no trailing '/'
        filename = os.path.join(out_dir, 'epoch_{}.pth'.format(self.c_epoch + 1))
        save_checkpoint(filename, self.model, self.optimizer, meta)
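This variant passes the filename first and a free-form meta dict last. A minimal sketch of a matching writer; the dict layout is an assumption:

import torch

def save_checkpoint(filename, model, optimizer=None, meta=None):
    checkpoint = {'meta': meta or {}, 'state_dict': model.state_dict()}
    if optimizer is not None:
        checkpoint['optimizer'] = optimizer.state_dict()
    torch.save(checkpoint, filename)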
Example no. 21
def main():
    best_result = math.inf if TASK == 'count' else 0.0
    best_type_meters = dict()

    train_loader, test_loader = get_dataloader(config, logger)

    num_classes = 1
    if TASK == 'frameqa':
        answer_dict = utils.load_answer_dict()
        num_classes = len(answer_dict)
    if TASK == 'youtube2text':
        if config.get_bool('abc.is_multiple_choice'):
            num_classes = 1
        else:
            num_classes = 1000
    logger.info(f'Num classes: {num_classes}')

    vocab_size = utils.get_vocab_size(config, TASK, level='word')
    char_vocab_size = utils.get_vocab_size(config, TASK, level='char')

    model = get_model(vocab_size, char_vocab_size, num_classes)
    model = model.cuda()

    if TASK in MULTIPLE_CHOICE_TASKS:
        criterion = nn.CrossEntropyLoss(reduction='sum')
    elif TASK == 'count':
        inner_criterion = nn.MSELoss()

        def criterion(pred, target):
            # rescale the raw count target before computing MSE
            target = (target - 1.) / 10.
            return inner_criterion(pred, target)

        # criterion = nn.SmoothL1Loss()
    elif TASK in ['frameqa']:
        criterion = nn.CrossEntropyLoss()

    elif TASK == 'youtube2text':
        if config.get_bool('abc.is_multiple_choice'):
            criterion = nn.CrossEntropyLoss(reduction='sum')
        else:
            criterion = nn.CrossEntropyLoss()

    optimizer_type = config.get_string('optimizer')

    if optimizer_type == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=config.get_float('adam.lr'))
    else:
        raise Exception(f'Unknown optimizer: {optimizer_type}')

    start_epoch = 1
    end_epoch = config.get_int('num_epochs')

    for epoch in range(start_epoch, end_epoch + 1):
        logger.info(f'Epoch [{epoch}/{end_epoch}] start')

        train(model, train_loader, criterion, optimizer, epoch)
        current_result, current_type_meters = test(model, test_loader,
                                                   criterion, epoch)

        logger.info(f'Epoch [{epoch}/{end_epoch}] end')

        if args.debug:
            break

        is_best = False
        if TASK == 'count':
            if current_result < best_result:
                is_best = True
                best_result = current_result

        else:
            if current_result > best_result:
                is_best = True
                best_result = current_result
                best_type_meters = current_type_meters

        logger.info(
            colored(
                "Current best result: {:.2f}, Exp path: {}".format(
                    best_result, args.experiment_path), "red"))
        logger.info(best_type_meters)
        save_checkpoint(
            {
                'arch': config.get_string('arch'),
                'task': TASK,
                'state_dict': model.state_dict(),
                'epoch': epoch + 1,
                'best_result': best_result,
                'optimizer': optimizer.state_dict(),
                'best_type_meters': best_type_meters,
            },
            is_best=is_best,
            folder=args.experiment_path)

    if TASK == 'count':
        logger.info(f'Best MSE: {best_result}')
    else:
        logger.info(f'Best Acc: {best_result}')
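The call above hands save_checkpoint a fully assembled state dict plus an is_best flag and a target folder. A minimal sketch in the classic ImageNet-example style; both filenames are assumptions:

import os
import shutil
import torch

def save_checkpoint(state, is_best, folder):
    path = os.path.join(folder, 'checkpoint.pth.tar')
    torch.save(state, path)
    if is_best:
        # keep a stable copy of the best checkpoint alongside the latest one
        shutil.copyfile(path, os.path.join(folder, 'model_best.pth.tar'))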
Example no. 22
def train_model():
    """Trains the model."""
    # Build the model (before the loaders to speed up debugging)
    model = model_builder.build_model()
    log_model_info(model)

    # Define the loss function
    loss_fun = losses.get_loss_fun()
    # Construct the optimizer
    optimizer = optim.construct_optimizer(model)

    start_epoch = 0
    min_val_loss = np.inf
    cur_patience = 0

    # Create data loaders
    # train_data, val_data, test_data = loader.load_and_prepare_data()
    train_loader = loader.construct_train_loader(root=cfg.PATHS.DATAPATH)
    val_loader = loader.construct_val_loader(root=cfg.PATHS.DATAPATH)
    test_loader = loader.construct_test_loader(root=cfg.PATHS.DATAPATH)

    # Create meters
    train_meter = Meter(len(train_loader), cfg.TRAIN.BATCH_SIZE, mode="train")
    val_meter = Meter(len(val_loader), cfg.TEST.BATCH_SIZE, mode="valid")
    test_meter = Meter(len(test_loader), cfg.TEST.BATCH_SIZE, mode="test")

    # setup tb logging
    tb = None
    if cfg.IS_TB_LOG:
        tb = TensorboardLogger(log_dir=cfg.PATHS.TB_OUT_DIR, flush_secs=30)

    # Perform the training loop
    logger.info("Start epoch: {}".format(start_epoch + 1))

    for cur_epoch in range(start_epoch, cfg.OPTIM.MAX_EPOCH):
        # Train for one epoch
        train_epoch(
            train_loader,
            model,
            loss_fun,
            optimizer,
            train_meter,
            cur_epoch,
            mode="train",
            tb=tb,
        )
        # Compute precise BN stats
        if cfg.BN.USE_PRECISE_STATS:
            nu.compute_precise_bn_stats(model, train_loader)
        # Save a checkpoint
        if cu.is_checkpoint_epoch(cur_epoch):
            checkpoint_file = cu.save_checkpoint(model, optimizer, cur_epoch)
            logger.info("Wrote checkpoint to: {}".format(checkpoint_file))
        # Evaluate the model
        if is_eval_epoch(cur_epoch):
            val_loss = test_epoch(val_loader,
                                  model,
                                  loss_fun,
                                  val_meter,
                                  cur_epoch,
                                  mode="valid",
                                  tb=tb)
            # Save the best model based on val score
            if val_loss < min_val_loss:
                min_val_loss = val_loss
                cur_patience = 0
                checkpoint_file = cu.save_best_loss_checkpoint(
                    model, optimizer, cur_epoch, val_loss)
                print(f"Wrote best score checkpoint to: {checkpoint_file}")
            # Handle early stopping based on val score
            elif val_loss - cfg.TRAIN.ES_THRESHOLD > min_val_loss:
                cur_patience += 1
                print(
                    f"Val loss larger than min value, patience at: {cur_patience} (max {cfg.TRAIN.ES_PATIENCE})"
                )
                if cur_patience > cfg.TRAIN.ES_PATIENCE:
                    logger.info(
                        f"ES patience hit at {cur_epoch} epochs, quitting")
                    break

    best_checkpoint = cu.get_best_score_checkpoint()
    best_epoch = cu.load_checkpoint(best_checkpoint, model, optimizer)
    print(f"Loaded checkpoint from epoch: {best_epoch+1}")

    print("=" * 100)
    test_epoch(train_loader,
               model,
               loss_fun,
               train_meter,
               cur_epoch,
               mode="train",
               tb=None)
    test_epoch(test_loader,
               model,
               loss_fun,
               test_meter,
               cur_epoch,
               mode="test",
               tb=None)

    if tb is not None:
        tb.close()
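The loop above interleaves three concerns: periodic checkpoints, best-loss checkpoints, and patience-based early stopping. A sketch of the patience logic factored into a small helper, mirroring the ES_THRESHOLD / ES_PATIENCE semantics used above:

class EarlyStopper:
    def __init__(self, patience, threshold):
        self.patience = patience
        self.threshold = threshold
        self.best = float('inf')
        self.count = 0

    def step(self, val_loss):
        """Update with the latest val loss; return True when training should stop."""
        if val_loss < self.best:
            self.best = val_loss
            self.count = 0
        elif val_loss - self.threshold > self.best:
            self.count += 1
        return self.count > self.patience

# usage inside the epoch loop:
#     if stopper.step(val_loss):
#         break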
Example no. 23
def main(cfg):
    # basic settings
    loss_F = torch.nn.CrossEntropyLoss()
    gpu_nums = int(cfg['NUM_GPUS'])
    if gpu_nums == 0:
        use_cuda = False
    else:
        use_cuda = True

    # load model
    model = AnyNet(cfg)
    if use_cuda:
        model = torch.nn.DataParallel(model, device_ids=[0])
        model = model.cuda()

    # load_dataset
    Trainpath   = cfg['TRAIN']['PATH']
    RESIZE_SIZE = cfg['TRAIN']['IM_SIZE']
    train_data  = SingleDataset(Trainpath, split='train', resize_size=RESIZE_SIZE)
    train_loader= DataLoader(dataset=train_data, batch_size=cfg['TRAIN']['BATCH_SIZE'],
                             shuffle=True, num_workers=cfg['DATA_LOADER']['NUM_WORKERS'], pin_memory=True)

    Testpath    = cfg['TEST']['PATH']
    RESIZE_SIZE_val = cfg['TEST']['IM_SIZE']
    test_data   = SingleDataset(Testpath, split='val', resize_size=RESIZE_SIZE_val)
    test_loader = DataLoader(dataset=test_data, batch_size=cfg['TEST']['BATCH_SIZE'],
                             shuffle=False, num_workers=cfg['DATA_LOADER']['NUM_WORKERS'], pin_memory=True)

    # optimizer and loss function and evaluator
    if cfg['OPTIM']['OPTIMIZER'] == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=cfg['OPTIM']['BASE_LR'], weight_decay=1e-4)
    else:
        optimizer = torch.optim.SGD(model.parameters(), lr=cfg['OPTIM']['BASE_LR'], momentum=0.9, weight_decay=5e-4)

    # load checkpoint or initial weights
    start_epoch = 0
    if cfg['TRAIN']['RESUME'] is not None:
        resume = cfg['TRAIN']['RESUME']
        if not os.path.isfile(resume):
            raise RuntimeError("=> no checkpoint found at '{}'".format(resume))
        checkpoint_epoch = cp.load_checkpoint(resume, gpu_num=gpu_nums, model=model, optimizer=optimizer)
        start_epoch = checkpoint_epoch + 1
    elif cfg['TRAIN']['WEIGHTS']:
        cp.load_checkpoint(cfg['TRAIN']['WEIGHTS'], gpu_nums, model)
    else:
        init_weights(model, zero_init_gamma=cfg['BN']['ZERO_INIT_FINAL_GAMMA'])

    # save training process
    log_file = log_g.get_log_filename(os.path.join(cfg['OUT_DIR'], 'log/'))
    log = open(log_file, 'w+')

    # start training
    max_epoch   = cfg['OPTIM']['MAX_EPOCH']
    batch_size  = cfg['TRAIN']['BATCH_SIZE']
    eval_period = cfg['TRAIN']['EVAL_PERIOD']
    batch_count = 0
    total_step  = len(train_loader)
    num_class   = cfg['MODEL']['NUM_CLASSES']
    # correct_all = list(0. for i in range(cfg['MODEL']['NUM_CLASSES']))
    # total_all   = list(0. for i in range(cfg['MODEL']['NUM_CLASSES']))
    for epoch in range(start_epoch, max_epoch):
        print('**************train --%d-- **************' % (epoch))
        log.write('**************train --%d-- **************\n' % (epoch))

        # update learning rate
        lr = optim.get_epoch_lr(epoch_i=epoch, cfg=cfg)
        optim.set_lr(optimizer, lr)

        #############################################################################
        # start training an epoch
        #############################################################################
        model.train()
        c_train = 0
        t_train = 0
        for i, (img, lbl) in enumerate(train_loader):
            batch_count += 1

            # use cuda
            if use_cuda:
                img, lbl = img.cuda(), lbl.cuda()

            # forward
            preds = model(img)
            loss = loss_F(preds, lbl)

            # backward: gradients accumulate across mini-batches
            loss.backward()

            # gradient accumulation: step only every `batch_size` mini-batches,
            # clipping the accumulated gradient right before the update
            if (batch_count % batch_size) == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
                optimizer.step()
                optimizer.zero_grad()
                batch_count = 0

            _, predicted = preds.max(1)
            c_train += predicted.eq(lbl).sum().item()
            t_train += lbl.size(0)

            # print epoch, step, loss, lr
            print('[%s]--train: %d/%d\tstep:%d/%d----lr:%.5f---loss:%.4f---Acc:%.3f' % (
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                (epoch + 1), max_epoch, (i + 1), total_step, lr, loss.item(), 100*(c_train/t_train)))
            log.write('[%s]--train: [%d/%d]\tstep: [%d/%d]\t----lr:%.5f---loss:%.4f---Acc:%.3f\n' %(
                      datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                      (epoch + 1), max_epoch, (i + 1), total_step, lr, loss.item(), 100*(c_train/t_train)))


        #############################################################################
        # start validation
        #############################################################################
        if ((epoch+1) % eval_period == 0):
            print('**************validation --%d-- **************' % ((epoch + 1) // eval_period))
            model.eval()
            mean_loss_val = 0
            correct = np.zeros((num_class))
            total = np.zeros((num_class))
            top1_acc_sum = []
            with torch.no_grad():
                for val_epoch, (img_val, lbl_val) in enumerate(test_loader):
                    if use_cuda:
                        img_val, lbl_val = img_val.cuda(), lbl_val.cuda()

                    # predict
                    preds_val = model(img_val)

                    # calculate loss
                    loss_val  = loss_F(preds_val, lbl_val)
                    mean_loss_val += loss_val.item()

                    # evaluation
                    top1_acc, top2_acc = Evaluator.accuracy(preds_val, lbl_val, [1,2])
                    correct_i, total_i = Evaluator.accuracy_perclass(preds_val, lbl_val, num_class)
                    correct += correct_i
                    total   += total_i
                    top1_acc_sum.append(top1_acc.item())  # store floats so np.mean returns a scalar

                    print('[%s]--valid: [%d/%d]\tloss: %.4f---top1_acc: %.3f' % (
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        val_epoch, len(test_loader), loss_val.item(), top1_acc.item()))
                print('[{}]--valid: [{}]\tmean_loss: {}\ttop1_acc: {}\tper_class_acc: {}'.format(
                    datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    (epoch + 1), (mean_loss_val / len(test_loader)), np.mean(top1_acc_sum), 100*(correct/total)))
                # save log
                log.write('[{}]--valid: [{}]\tmean_loss: {}\ttop1_acc: {}\tper_class_acc: {}\n'.format(
                    datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    (epoch + 1), (mean_loss_val / len(test_loader)), np.mean(top1_acc_sum), 100*(correct/total)))


        #############################################################################
        # save model
        #############################################################################
        if ((epoch+1)%5==0):
            checkpoint_file = os.path.join(cfg['OUT_DIR'], 'checkpoint/')
            checkpoint_filename = cp.save_checkpoint(model, optimizer, epoch, gpu_nums, checkpoint_file)
            log.write('[{}]--save checkpoint: {}\n'.format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                checkpoint_filename
            ))

    log.close()
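The inner loop above uses gradient accumulation: backward() is called on every mini-batch but the optimizer steps only every `batch_size` iterations. A distilled sketch of the pattern; unlike the code above it also scales the loss by 1/accum_steps so the accumulated gradient matches one large batch:

import torch

accum_steps = 4  # assumption; the loop above reuses TRAIN.BATCH_SIZE for this
optimizer.zero_grad()
for i, (img, lbl) in enumerate(train_loader):
    loss = loss_F(model(img.cuda()), lbl.cuda())
    (loss / accum_steps).backward()  # gradients accumulate in .grad
    if (i + 1) % accum_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()
        optimizer.zero_grad()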
Example no. 24
def train_net(args, logger, seed):
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
    logger.info('seed={}'.format(seed))

    # init seed
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    # cudnn.benchmark = True
    cudnn.benchmark = False
    cudnn.deterministic = True  # cudnn

    writer = SummaryWriter(args.outpath)
    start_epoch = 0
    val_best_acc = 0
    val_best_acc_index = 0

    # data_loader
    train_loader, val_loader, target_class_num, dataset_sizes = \
        get_target_dataloader(args.target_dataset, args.batch_size, args.num_workers, args.target_data_dir,
                              image_size=args.image_size, data_aug=args.data_aug, logger=logger)

    # model setting
    model_source, model_target = get_model(args.base_model_name,
                                           args.base_task, logger, args)

    # target_model split: (feature, classifier)
    model_feature, model_source_classifier, model_target_classifier = \
        model_split(args.base_model_name, model_target, target_class_num, logger, args)

    if len(args.gpu_id) > 1:
        model_source = nn.DataParallel(model_source)
        model_feature = nn.DataParallel(model_feature)
        model_source_classifier = nn.DataParallel(model_source_classifier)
        model_target_classifier = nn.DataParallel(model_target_classifier)
        model_source = model_source.cuda()
        model_feature = model_feature.cuda()
        model_target_classifier = model_target_classifier.cuda()
        model_source_classifier = model_source_classifier.cuda()
        logger.info("push all model to dataparallel and then gpu")
    else:
        model_source = model_source.cuda()
        model_feature = model_feature.cuda()
        model_target_classifier = model_target_classifier.cuda()
        model_source_classifier = model_source_classifier.cuda()
        logger.info("push all model to gpu")

    # iterations -> epochs
    num_epochs = int(np.round(args.max_iter * args.batch_size / dataset_sizes))
    step = [int(0.67 * num_epochs)]
    logger.info('num_epochs={}, step={}'.format(num_epochs, step))

    # loss
    loss_fn = get_loss_type(loss_type=args.loss_type, logger=logger)

    # get feature_criterions (default to None so the name is always defined)
    feature_criterions = None
    if args.reg_type in ['channel_att_fea_map_learn', 'fea_loss']:
        feature_criterions = get_reg_criterions(args, logger)

    # optimizer and lr_scheduler
    optimizer, lr_scheduler = get_optimier_and_scheduler(
        args, model_feature, model_target_classifier, feature_criterions, step,
        logger)

    # init framework
    framework = TransferFramework(args,
                                  train_loader,
                                  val_loader,
                                  target_class_num,
                                  args.data_aug,
                                  args.base_model_name,
                                  model_source,
                                  model_feature,
                                  model_source_classifier,
                                  model_target_classifier,
                                  feature_criterions,
                                  loss_fn,
                                  num_epochs,
                                  optimizer,
                                  lr_scheduler,
                                  writer,
                                  logger,
                                  print_freq=args.print_freq)

    # epochs
    for epoch in range(start_epoch, num_epochs):
        # train epoch
        clc_loss, kl_loss, fea_loss, train_total_loss, train_top1_acc = framework.train(
            epoch)
        # val epoch
        val_loss, val_top1_acc = framework.val(epoch)
        # record into txt
        ours_record_epoch_data(args.outpath, epoch, clc_loss, kl_loss,
                               fea_loss, train_total_loss, train_top1_acc,
                               val_loss, val_top1_acc)

        if val_top1_acc >= val_best_acc:
            val_best_acc = val_top1_acc
            val_best_acc_index = epoch
            # save_checkpoint
            save_checkpoint(args.outpath, epoch, model_feature,
                            model_source_classifier, model_target_classifier,
                            optimizer, lr_scheduler, val_best_acc)

        logger.info(
            '||==>Val Epoch: Val_best_acc_index={}\tVal_best_acc={:.4f}\n'.
            format(val_best_acc_index, val_best_acc))
        # break
    return val_best_acc
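save_checkpoint here has to persist three sub-modules plus the optimizer and scheduler in one shot. A minimal sketch matching the call; the key names and filename are assumptions:

import os
import torch

def save_checkpoint(outpath, epoch, model_feature, model_source_classifier,
                    model_target_classifier, optimizer, lr_scheduler, best_acc):
    # one dict bundling every piece of training state
    torch.save({'epoch': epoch,
                'feature': model_feature.state_dict(),
                'source_classifier': model_source_classifier.state_dict(),
                'target_classifier': model_target_classifier.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'best_acc': best_acc},
               os.path.join(outpath, 'checkpoint.pth'))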
Example no. 25
def main():
    args = get_arguments()

    # configuration
    with open(args.config) as f:
        CONFIG = Dict(yaml.safe_load(f))

    # writer
    if CONFIG.writer_flag:
        writer = SummaryWriter(CONFIG.result_path)
    else:
        writer = None

    # DataLoaders
    normalize = Normalize(mean=get_mean(), std=get_std())

    train_data = Kinetics(CONFIG,
                          transform=Compose([
                              RandomCrop((CONFIG.height, CONFIG.width)),
                              ToTensor(),
                              normalize,
                          ]))

    val_data = Kinetics(CONFIG,
                        transform=Compose([
                            RandomCrop((CONFIG.height, CONFIG.width)),
                            ToTensor(),
                            normalize,
                        ]),
                        mode='validation')

    train_loader = DataLoader(train_data,
                              batch_size=CONFIG.batch_size,
                              shuffle=True,
                              num_workers=CONFIG.num_workers,
                              drop_last=True)

    val_loader = DataLoader(val_data,
                            batch_size=CONFIG.batch_size,
                            shuffle=False,
                            num_workers=CONFIG.num_workers)

    # load model
    print('\n------------------------Loading Model------------------------\n')

    if CONFIG.model == 'resnet18':
        print(CONFIG.model + ' will be used as a model.')
        model = resnet.generate_model(18, n_classes=CONFIG.n_classes)
    elif CONFIG.model == 'resnext':
        print('ResNext101 will be used as a model.')
        model = resnext.generate_model(101, n_classes=CONFIG.n_classes)
    elif CONFIG.model == 'slowfast':
        print('slowfast will be used as a model.')
        model = slowfast.resnet152(class_num=CONFIG.n_classes)
    elif CONFIG.model == 'slowfast101_nl':
        print('slowfast101 with non local network will be used as a model.')
        model = slowfast.resnet101_NL(class_num=CONFIG.n_classes)
    elif CONFIG.model == 'slowfast_nl':
        if CONFIG.dual_attention:
            print('slowfast_nl w/ dual attention will be used as a model.')
            model = slowfast.resnet152_NL(class_num=CONFIG.n_classes,
                                          dual_attention=True)
        else:
            print('slowfast_nl w/o dual attention will be used as a model.')
            model = slowfast.resnet152_NL(class_num=CONFIG.n_classes)
    else:
        print('resnet18 will be used as a model.')
        model = resnet.generate_model(18, n_classes=CONFIG.n_classes)

    # metric
    if CONFIG.metric == 'L2constrain':
        print('L2constrain metric will be used.')
        model.fc = L2ConstrainedLinear(model.fc.in_features,
                                       model.fc.out_features)

    # multi-scale input
    if CONFIG.msc == 'Temporal':
        print('Temporal multi-scale input will be used')
        model = TemporalMSC(model)
    elif CONFIG.msc == 'Spatial':
        print('Spatial multi-scale input will be used')
        model = SpatialMSC(model)
    elif CONFIG.msc == 'SpatioTemporal':
        print('SpatioTemporal multi-scale input will be used')
        model = SpatioTemporalMSC(model)

    # set optimizer, lr_scheduler
    if CONFIG.optimizer == 'Adam':
        print(CONFIG.optimizer + ' will be used as an optimizer.')
        optimizer = optim.Adam(model.parameters(), lr=CONFIG.learning_rate)
    elif CONFIG.optimizer == 'SGD':
        print(CONFIG.optimizer + ' will be used as an optimizer.')
        optimizer = optim.SGD(model.parameters(),
                              lr=CONFIG.learning_rate,
                              momentum=CONFIG.momentum,
                              dampening=CONFIG.dampening,
                              weight_decay=CONFIG.weight_decay,
                              nesterov=CONFIG.nesterov)
    elif CONFIG.optimizer == 'AdaBound':
        print(CONFIG.optimizer + ' will be used as an optimizer.')
        optimizer = adabound.AdaBound(model.parameters(),
                                      lr=CONFIG.learning_rate,
                                      final_lr=CONFIG.final_lr,
                                      weight_decay=CONFIG.weight_decay)
    else:
        print('No optimizer matches your option. '
              'SGD will be used as an optimizer instead.')
        optimizer = optim.SGD(model.parameters(),
                              lr=CONFIG.learning_rate,
                              momentum=CONFIG.momentum,
                              dampening=CONFIG.dampening,
                              weight_decay=CONFIG.weight_decay,
                              nesterov=CONFIG.nesterov)

    # learning rate scheduler
    if CONFIG.optimizer == 'SGD':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, 'min', patience=CONFIG.lr_patience)
    else:
        scheduler = None

    # send the model to cuda/cpu
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    if device == 'cuda':
        model = torch.nn.DataParallel(model)  # make parallel
        torch.backends.cudnn.benchmark = True
    else:
        print(
            'You have to use GPUs because training 3DCNN is computationally expensive.'
        )
        sys.exit(1)

    # resume if you want
    begin_epoch = 0
    log = None
    if args.resume:
        if os.path.exists(os.path.join(CONFIG.result_path, 'checkpoint.pth')):
            print('loading the checkpoint...')
            begin_epoch, model, optimizer, scheduler = resume(
                CONFIG, model, optimizer, scheduler)
            print('training will start from {} epoch'.format(begin_epoch))
        if os.path.exists(os.path.join(CONFIG.result_path, 'log.csv')):
            log = pd.read_csv(os.path.join(CONFIG.result_path, 'log.csv'))

    # generate log when you start training from scratch
    if log is None:
        log = pd.DataFrame(columns=[
            'epoch', 'lr', 'train_loss', 'val_loss', 'acc@1', 'acc@5'
        ])

    # criterion for loss
    if CONFIG.class_weight:
        criterion = nn.CrossEntropyLoss(weight=get_class_weight().to(device))
    else:
        criterion = nn.CrossEntropyLoss()

    # train and validate model
    print('\n------------------------Start training------------------------\n')
    losses_train = []
    losses_val = []
    top1_accuracy = []
    top5_accuracy = []
    best_top1_accuracy = 0.0
    best_top5_accuracy = 0.0

    for epoch in range(begin_epoch, CONFIG.max_epoch):

        # training
        loss_train = train(model, train_loader, criterion, optimizer, CONFIG,
                           device)
        losses_train.append(loss_train)

        # validation
        loss_val, top1, top5 = validation(model, val_loader, criterion, CONFIG,
                                          device)

        if CONFIG.optimizer == 'SGD':
            scheduler.step(loss_val)

        losses_val.append(loss_val)
        top1_accuracy.append(top1)
        top5_accuracy.append(top5)

        # save a model if topk accuracy is higher than ever
        # save base models, NOT DataParallel models
        if best_top1_accuracy < top1_accuracy[-1]:
            best_top1_accuracy = top1_accuracy[-1]
            torch.save(
                model.module.state_dict(),
                os.path.join(CONFIG.result_path,
                             'best_top1_accuracy_model.prm'))

        if best_top5_accuracy < top5_accuracy[-1]:
            best_top5_accuracy = top5_accuracy[-1]
            torch.save(
                model.module.state_dict(),
                os.path.join(CONFIG.result_path,
                             'best_top5_accuracy_model.prm'))

        # save checkpoint every epoch
        save_checkpoint(CONFIG, epoch, model, optimizer, scheduler)

        # save a model every 10 epoch
        # save base models, NOT DataParallel models
        if epoch % 10 == 0 and epoch != 0:
            torch.save(
                model.module.state_dict(),
                os.path.join(CONFIG.result_path,
                             'epoch_{}_model.prm'.format(epoch)))

        # tensorboardx
        if writer is not None:
            writer.add_scalar("loss_train", losses_train[-1], epoch)
            writer.add_scalar('loss_val', losses_val[-1], epoch)
            writer.add_scalars(
                "iou", {
                    'top1_accuracy': top1_accuracy[-1],
                    'top5_accuracy': top5_accuracy[-1]
                }, epoch)

        # write logs to dataframe and csv file
        tmp = pd.Series([
            epoch,
            optimizer.param_groups[0]['lr'],  # scheduler may be None here, and ReduceLROnPlateau has no get_lr()
            losses_train[-1],
            losses_val[-1],
            top1_accuracy[-1],
            top5_accuracy[-1],
        ],
                        index=log.columns)

        log = pd.concat([log, tmp.to_frame().T], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
        log.to_csv(os.path.join(CONFIG.result_path, 'log.csv'), index=False)

        print(
            'epoch: {}\tloss train: {:.5f}\tloss val: {:.5f}\ttop1_accuracy: {:.5f}\ttop5_accuracy: {:.5f}'
            .format(epoch, losses_train[-1], losses_val[-1], top1_accuracy[-1],
                    top5_accuracy[-1]))

    # save base models, NOT DataParallel models
    torch.save(model.module.state_dict(),
               os.path.join(CONFIG.result_path, 'final_model.prm'))
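A sketch of the resume() helper used above, assuming save_checkpoint(CONFIG, epoch, model, optimizer, scheduler) writes these keys into checkpoint.pth under CONFIG.result_path; the key names are assumptions:

import os
import torch

def resume(CONFIG, model, optimizer, scheduler):
    checkpoint = torch.load(os.path.join(CONFIG.result_path, 'checkpoint.pth'),
                            map_location='cpu')
    begin_epoch = checkpoint['epoch'] + 1
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    # scheduler is None for optimizers other than SGD in the script above
    if scheduler is not None and checkpoint.get('scheduler') is not None:
        scheduler.load_state_dict(checkpoint['scheduler'])
    return begin_epoch, model, optimizer, scheduler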
Example no. 26
def train(cfg):
    # logger
    logger = logging.getLogger(name="merlin.baseline.train")
    logger.info("training...")

    # transform
    transform_train_list = [
        # transforms.RandomResizedCrop(size=128, scale=(0.75,1.0), ratio=(0.75,1.3333), interpolation=3), #Image.BICUBIC)
        transforms.Resize(size=cfg.INPUT.SIZE_TRAIN, interpolation=1),
        transforms.Pad(32),
        transforms.RandomCrop(cfg.INPUT.SIZE_TRAIN),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]

    transform_val_list = [
        transforms.Resize(size=cfg.INPUT.SIZE_TEST, interpolation=3),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]

    # prepare dataset
    train_dataset = MyDataset(root=cfg.DATA.ROOT, transform=transforms.Compose(transform_train_list), type='train')
    val_dataset = MyDataset(root=cfg.DATA.ROOT, transform=transforms.Compose(transform_val_list), type='val')
    train_loader = DataLoader(train_dataset,
                              batch_size=cfg.SOLVER.BATCH_SIZE,
                              shuffle=True,
                              num_workers=8,
                              pin_memory=False)
    val_loader = DataLoader(val_dataset,
                            batch_size=cfg.SOLVER.BATCH_SIZE,
                            shuffle=True,
                            num_workers=8,
                            pin_memory=False)
    num_classes = cfg.MODEL.HEADS.NUM_CLASSES

    # prepare model
    model = build_model(cfg, num_classes)
    model = model.cuda()
    model = nn.DataParallel(model)

    # prepare solver
    optimizer = make_optimizer(cfg, model)
    scheduler = WarmupMultiStepLR(optimizer, cfg.SOLVER.STEPS, cfg.SOLVER.GAMMA, cfg.SOLVER.WARMUP_FACTOR,
                                  cfg.SOLVER.WARMUP_ITERS, cfg.SOLVER.WARMUP_METHOD)

    start_epoch = 0

    # Train and val
    since = time.time()
    for epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCHS):
        model.train(True)
        logger.info("Epoch {}/{}".format(epoch, cfg.SOLVER.MAX_EPOCHS - 1))
        logger.info('-' * 10)

        running_loss = 0.0
        # Iterate over data
        it = 0
        running_acc = 0
        for data in train_loader:
            it += 1
            # get the inputs
            inputs, labels = data
            now_batch_size, c, h, w = inputs.shape
            if now_batch_size < cfg.SOLVER.BATCH_SIZE:  # skip the last batch
                continue

            # move the batch onto the GPU (Variable is deprecated since PyTorch 0.4)
            inputs = inputs.cuda()
            labels = labels.cuda()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward
            out = model(inputs)
            loss_dict = get_loss(cfg, outs=out, label=labels)
            loss = sum(loss_dict.values())

            loss.backward()
            optimizer.step()
            scheduler.step()

            # statistics
            with torch.no_grad():
                _, preds = torch.max(out['pred_class_logits'], 1)
                running_loss += loss.item()  # accumulate a plain float so autograd graphs are not retained
                running_acc += torch.sum(preds == labels.data).float().item() / cfg.SOLVER.BATCH_SIZE

            if it % 50 == 0:
                logger.info(
                    'epoch {}, iter {}, loss: {:.3f}, acc: {:.3f}, lr: {:.5f}'.format(
                        epoch, it, running_loss / it, running_acc / it,
                        optimizer.param_groups[0]['lr']))

        epoch_loss = running_loss / it
        epoch_acc = running_acc / it

        logger.info('epoch {} loss: {:.4f} Acc: {:.4f}'.format(epoch, epoch_loss, epoch_acc))

        # save checkpoint
        if epoch % cfg.SOLVER.CHECKPOINT_PERIOD == 0:
            checkpoint = {'epoch': epoch + 1,
                          # model is always wrapped in DataParallel above; hasattr keeps this safe either way
                          'model': model.module.state_dict() if hasattr(model, 'module') else model.state_dict(),
                          'optimizer': optimizer.state_dict()
                          }
            save_checkpoint(checkpoint, epoch, cfg)

        # evaluate
        if epoch % cfg.SOLVER.EVAL_PERIOD == 0:
            logger.info('evaluate...')
            model.train(False)

            total = 0.0
            correct = 0.0
            for data in val_loader:
                inputs, labels = data
                inputs = inputs.cuda()
                labels = labels.cuda()
                with torch.no_grad():
                    out = model(inputs)
                    _, preds = torch.max(out['pred_class_logits'], 1)
                    c = (preds == labels).squeeze()
                    total += c.size(0)
                    correct += c.float().sum().item()
            acc = correct / total
            logger.info('eval acc:{:.4f}'.format(acc))

        time_elapsed = time.time() - since
        logger.info('Time elapsed so far: {:.0f}m {:.0f}s\n'.format(
            time_elapsed // 60, time_elapsed % 60))

    return model
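The checkpoint dict above is rebuilt inline every period. A minimal sketch of the save_checkpoint(checkpoint, epoch, cfg) helper it calls; the output-directory field is an assumption:

import os
import torch

def save_checkpoint(checkpoint, epoch, cfg):
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)  # assumed config field
    torch.save(checkpoint,
               os.path.join(cfg.OUTPUT_DIR, 'checkpoint_epoch_{}.pth'.format(epoch)))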
Example no. 27
def train(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Setup logging format.
    logging.setup_logging(logger, cfg)

    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = model_builder.build_model(cfg)
    if du.is_master_proc():
        misc.log_model_info(model)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Record global step
    gs = 0

    # Load a checkpoint to resume training if applicable.
    if cfg.TRAIN.AUTO_RESUME and cu.has_checkpoint(cfg.OUTPUT_DIR):
        logger.info("Load from last checkpoint.")
        last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
        gs, checkpoint_epoch = cu.load_checkpoint(last_checkpoint, model,
                                                  cfg.NUM_GPUS > 1, optimizer)
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        logger.info("Load from given checkpoint file.")
        if cfg.TRAIN.LOAD_PART_OF_CHECKPOINT:
            gs, checkpoint_epoch = cu.load_part_of_checkpoint(
                cfg.TRAIN.CHECKPOINT_FILE_PATH,
                model,
                cfg.NUM_GPUS > 1,
                optimizer=None)
        else:
            gs, checkpoint_epoch = cu.load_checkpoint(
                cfg.TRAIN.CHECKPOINT_FILE_PATH,
                model,
                cfg.NUM_GPUS > 1,
                optimizer=None,
                inflation=False,
                convert_from_caffe2=False)
        start_epoch = checkpoint_epoch + 1
    else:
        gs = 0
        start_epoch = 0

    # Create the video train and val loaders.
    train_loader = loader.construct_loader(cfg, "train")
    val_loader = loader.construct_loader(cfg, "val")

    # Create meters.
    train_meter = TrainMeter(len(train_loader), cfg)
    val_meter = ValMeter(cfg)

    # Perform the training loop.
    logger.info("Start epoch: {} gs {}".format(start_epoch + 1, gs + 1))

    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        # Shuffle the dataset.
        loader.shuffle_dataset(train_loader, cur_epoch)

        # Evaluate the model on validation set.
        if misc.is_eval_epoch(cfg, cur_epoch):
            if cfg.TRAIN.USE_CENTER_VALIDATION:
                validation_epoch_center(val_loader, model, val_meter,
                                        cur_epoch, cfg)
            else:
                validation_epoch(val_loader, model, val_meter, cur_epoch, cfg)
        # Train for one epoch.
        gs = train_epoch(train_loader, model, optimizer, train_meter,
                         cur_epoch, gs, cfg)

        # Compute precise BN stats.
        # if cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0:
        #     calculate_and_update_precise_bn(
        #         train_loader, model, cfg.BN.NUM_BATCHES_PRECISE
        #     )
        # Save a checkpoint.
        if cu.is_checkpoint_epoch(cur_epoch, cfg.TRAIN.CHECKPOINT_PERIOD):
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch, gs,
                               cfg)